{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 804, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.138745129108429, "epoch": 0.0037313432835820895, "grad_norm": 1.7020611763000488, "learning_rate": 0.0002, "loss": 2.4721007347106934, "mean_token_accuracy": 0.5372578650712967, "num_tokens": 16325.0, "step": 1 }, { "entropy": 1.2262731790542603, "epoch": 0.007462686567164179, "grad_norm": 1.5422499179840088, "learning_rate": 0.0002, "loss": 2.1402571201324463, "mean_token_accuracy": 0.5742411762475967, "num_tokens": 32666.0, "step": 2 }, { "entropy": 1.409499078989029, "epoch": 0.011194029850746268, "grad_norm": 1.1927348375320435, "learning_rate": 0.0002, "loss": 1.7202329635620117, "mean_token_accuracy": 0.5956366509199142, "num_tokens": 48877.0, "step": 3 }, { "entropy": 1.3392578959465027, "epoch": 0.014925373134328358, "grad_norm": 0.9159098863601685, "learning_rate": 0.0002, "loss": 1.3790637254714966, "mean_token_accuracy": 0.6494399756193161, "num_tokens": 65097.0, "step": 4 }, { "entropy": 1.329741895198822, "epoch": 0.018656716417910446, "grad_norm": 0.9530413150787354, "learning_rate": 0.0002, "loss": 1.2827703952789307, "mean_token_accuracy": 0.649653822183609, "num_tokens": 81423.0, "step": 5 }, { "entropy": 1.2239453792572021, "epoch": 0.022388059701492536, "grad_norm": 0.45381543040275574, "learning_rate": 0.0002, "loss": 1.1552369594573975, "mean_token_accuracy": 0.6654698848724365, "num_tokens": 97674.0, "step": 6 }, { "entropy": 1.1408285796642303, "epoch": 0.026119402985074626, "grad_norm": 0.40323638916015625, "learning_rate": 0.0002, "loss": 1.063366413116455, "mean_token_accuracy": 0.6731287389993668, "num_tokens": 114207.0, "step": 7 }, { "entropy": 1.0467455089092255, "epoch": 0.029850746268656716, "grad_norm": 0.4862216114997864, "learning_rate": 0.0002, "loss": 0.9919917583465576, "mean_token_accuracy": 0.6862917095422745, "num_tokens": 130364.0, "step": 8 }, { "entropy": 0.9914536625146866, "epoch": 0.033582089552238806, "grad_norm": 0.563399612903595, "learning_rate": 0.0002, "loss": 0.9576236605644226, "mean_token_accuracy": 0.6916692554950714, "num_tokens": 146675.0, "step": 9 }, { "entropy": 0.9863343089818954, "epoch": 0.03731343283582089, "grad_norm": 0.4532151520252228, "learning_rate": 0.0002, "loss": 0.8861619234085083, "mean_token_accuracy": 0.7066572606563568, "num_tokens": 162793.0, "step": 10 }, { "entropy": 0.9439148157835007, "epoch": 0.041044776119402986, "grad_norm": 0.4917202889919281, "learning_rate": 0.0002, "loss": 0.8438840508460999, "mean_token_accuracy": 0.7115702927112579, "num_tokens": 178972.0, "step": 11 }, { "entropy": 0.86412213742733, "epoch": 0.04477611940298507, "grad_norm": 0.4633786678314209, "learning_rate": 0.0002, "loss": 0.8079400658607483, "mean_token_accuracy": 0.7117275148630142, "num_tokens": 195446.0, "step": 12 }, { "entropy": 0.7569762617349625, "epoch": 0.048507462686567165, "grad_norm": 0.4152548909187317, "learning_rate": 0.0002, "loss": 0.7479823231697083, "mean_token_accuracy": 0.7288273125886917, "num_tokens": 211604.0, "step": 13 }, { "entropy": 0.7370023280382156, "epoch": 0.05223880597014925, "grad_norm": 0.38331395387649536, "learning_rate": 0.0002, "loss": 0.7293781638145447, "mean_token_accuracy": 0.7328485548496246, "num_tokens": 228114.0, "step": 14 }, { "entropy": 0.6818548142910004, "epoch": 0.055970149253731345, "grad_norm": 0.4065186679363251, "learning_rate": 0.0002, "loss": 0.6948679685592651, "mean_token_accuracy": 0.7417702227830887, "num_tokens": 244615.0, "step": 15 }, { "entropy": 0.6801213175058365, "epoch": 0.05970149253731343, "grad_norm": 0.3765408992767334, "learning_rate": 0.0002, "loss": 0.6942192316055298, "mean_token_accuracy": 0.7383946776390076, "num_tokens": 260940.0, "step": 16 }, { "entropy": 0.6828830540180206, "epoch": 0.06343283582089553, "grad_norm": 0.31789109110832214, "learning_rate": 0.0002, "loss": 0.6663458347320557, "mean_token_accuracy": 0.7480802536010742, "num_tokens": 277198.0, "step": 17 }, { "entropy": 0.6609166115522385, "epoch": 0.06716417910447761, "grad_norm": 0.3814696669578552, "learning_rate": 0.0002, "loss": 0.6373794078826904, "mean_token_accuracy": 0.7566290199756622, "num_tokens": 293415.0, "step": 18 }, { "entropy": 0.6822013854980469, "epoch": 0.0708955223880597, "grad_norm": 0.3390759527683258, "learning_rate": 0.0002, "loss": 0.6543835997581482, "mean_token_accuracy": 0.7451244294643402, "num_tokens": 309815.0, "step": 19 }, { "entropy": 0.632593423128128, "epoch": 0.07462686567164178, "grad_norm": 0.41862595081329346, "learning_rate": 0.0002, "loss": 0.6299830675125122, "mean_token_accuracy": 0.7534051537513733, "num_tokens": 326057.0, "step": 20 }, { "entropy": 0.6358507871627808, "epoch": 0.07835820895522388, "grad_norm": 0.30084753036499023, "learning_rate": 0.0002, "loss": 0.62652587890625, "mean_token_accuracy": 0.7561640441417694, "num_tokens": 342366.0, "step": 21 }, { "entropy": 0.601889356970787, "epoch": 0.08208955223880597, "grad_norm": 0.30453744530677795, "learning_rate": 0.0002, "loss": 0.5936654210090637, "mean_token_accuracy": 0.7655821740627289, "num_tokens": 358935.0, "step": 22 }, { "entropy": 0.5926243662834167, "epoch": 0.08582089552238806, "grad_norm": 0.24678799510002136, "learning_rate": 0.0002, "loss": 0.5894668698310852, "mean_token_accuracy": 0.7695567756891251, "num_tokens": 375125.0, "step": 23 }, { "entropy": 0.5948957055807114, "epoch": 0.08955223880597014, "grad_norm": 0.26838821172714233, "learning_rate": 0.0002, "loss": 0.5975726246833801, "mean_token_accuracy": 0.766963854432106, "num_tokens": 391519.0, "step": 24 }, { "entropy": 0.5925572067499161, "epoch": 0.09328358208955224, "grad_norm": 0.24850629270076752, "learning_rate": 0.0002, "loss": 0.5895435214042664, "mean_token_accuracy": 0.7683891654014587, "num_tokens": 408003.0, "step": 25 }, { "entropy": 0.579643040895462, "epoch": 0.09701492537313433, "grad_norm": 0.24649304151535034, "learning_rate": 0.0002, "loss": 0.5773741006851196, "mean_token_accuracy": 0.7704576104879379, "num_tokens": 424170.0, "step": 26 }, { "entropy": 0.579850047826767, "epoch": 0.10074626865671642, "grad_norm": 0.24893403053283691, "learning_rate": 0.0002, "loss": 0.5705626010894775, "mean_token_accuracy": 0.7733898609876633, "num_tokens": 440584.0, "step": 27 }, { "entropy": 0.5937480330467224, "epoch": 0.1044776119402985, "grad_norm": 0.222214013338089, "learning_rate": 0.0002, "loss": 0.584485650062561, "mean_token_accuracy": 0.7649911344051361, "num_tokens": 456887.0, "step": 28 }, { "entropy": 0.5631287395954132, "epoch": 0.10820895522388059, "grad_norm": 0.26287850737571716, "learning_rate": 0.0002, "loss": 0.559370219707489, "mean_token_accuracy": 0.7786488234996796, "num_tokens": 473285.0, "step": 29 }, { "entropy": 0.5510498583316803, "epoch": 0.11194029850746269, "grad_norm": 0.2989422380924225, "learning_rate": 0.0002, "loss": 0.5596640110015869, "mean_token_accuracy": 0.7761659324169159, "num_tokens": 489394.0, "step": 30 }, { "entropy": 0.5780725926160812, "epoch": 0.11567164179104478, "grad_norm": 0.23725202679634094, "learning_rate": 0.0002, "loss": 0.5835093259811401, "mean_token_accuracy": 0.7684815227985382, "num_tokens": 505756.0, "step": 31 }, { "entropy": 0.5761191546916962, "epoch": 0.11940298507462686, "grad_norm": 0.2031526267528534, "learning_rate": 0.0002, "loss": 0.5835364460945129, "mean_token_accuracy": 0.7682848125696182, "num_tokens": 522094.0, "step": 32 }, { "entropy": 0.5485773086547852, "epoch": 0.12313432835820895, "grad_norm": 0.20444567501544952, "learning_rate": 0.0002, "loss": 0.5546419620513916, "mean_token_accuracy": 0.777488186955452, "num_tokens": 538415.0, "step": 33 }, { "entropy": 0.5861198753118515, "epoch": 0.12686567164179105, "grad_norm": 0.21942971646785736, "learning_rate": 0.0002, "loss": 0.5825690031051636, "mean_token_accuracy": 0.7697215527296066, "num_tokens": 554886.0, "step": 34 }, { "entropy": 0.5715848505496979, "epoch": 0.13059701492537312, "grad_norm": 0.20764704048633575, "learning_rate": 0.0002, "loss": 0.570915162563324, "mean_token_accuracy": 0.7720184922218323, "num_tokens": 571367.0, "step": 35 }, { "entropy": 0.5560943633317947, "epoch": 0.13432835820895522, "grad_norm": 0.20819340646266937, "learning_rate": 0.0002, "loss": 0.5549942851066589, "mean_token_accuracy": 0.7778844088315964, "num_tokens": 587594.0, "step": 36 }, { "entropy": 0.556964784860611, "epoch": 0.13805970149253732, "grad_norm": 0.17859336733818054, "learning_rate": 0.0002, "loss": 0.5563804507255554, "mean_token_accuracy": 0.7767369300127029, "num_tokens": 604052.0, "step": 37 }, { "entropy": 0.5532324761152267, "epoch": 0.1417910447761194, "grad_norm": 0.18194721639156342, "learning_rate": 0.0002, "loss": 0.5552038550376892, "mean_token_accuracy": 0.7764725238084793, "num_tokens": 620200.0, "step": 38 }, { "entropy": 0.5707972347736359, "epoch": 0.1455223880597015, "grad_norm": 0.17879748344421387, "learning_rate": 0.0002, "loss": 0.568923830986023, "mean_token_accuracy": 0.7714048773050308, "num_tokens": 636528.0, "step": 39 }, { "entropy": 0.5603279024362564, "epoch": 0.14925373134328357, "grad_norm": 0.19374136626720428, "learning_rate": 0.0002, "loss": 0.5574957728385925, "mean_token_accuracy": 0.7773427516222, "num_tokens": 652629.0, "step": 40 }, { "entropy": 0.5546282231807709, "epoch": 0.15298507462686567, "grad_norm": 0.19636894762516022, "learning_rate": 0.0002, "loss": 0.5532153844833374, "mean_token_accuracy": 0.7793182134628296, "num_tokens": 668683.0, "step": 41 }, { "entropy": 0.5812623649835587, "epoch": 0.15671641791044777, "grad_norm": 0.17162267863750458, "learning_rate": 0.0002, "loss": 0.5755793452262878, "mean_token_accuracy": 0.7692758589982986, "num_tokens": 685277.0, "step": 42 }, { "entropy": 0.5617634505033493, "epoch": 0.16044776119402984, "grad_norm": 0.16276565194129944, "learning_rate": 0.0002, "loss": 0.5628421306610107, "mean_token_accuracy": 0.7769913524389267, "num_tokens": 701728.0, "step": 43 }, { "entropy": 0.5570202618837357, "epoch": 0.16417910447761194, "grad_norm": 0.16841551661491394, "learning_rate": 0.0002, "loss": 0.5597431659698486, "mean_token_accuracy": 0.7756171226501465, "num_tokens": 718323.0, "step": 44 }, { "entropy": 0.5491841286420822, "epoch": 0.16791044776119404, "grad_norm": 0.14662496745586395, "learning_rate": 0.0002, "loss": 0.5556524991989136, "mean_token_accuracy": 0.7775459736585617, "num_tokens": 734628.0, "step": 45 }, { "entropy": 0.5427970439195633, "epoch": 0.17164179104477612, "grad_norm": 0.13948297500610352, "learning_rate": 0.0002, "loss": 0.5476619601249695, "mean_token_accuracy": 0.7795768678188324, "num_tokens": 750996.0, "step": 46 }, { "entropy": 0.5452166348695755, "epoch": 0.17537313432835822, "grad_norm": 0.17319753766059875, "learning_rate": 0.0002, "loss": 0.5554689168930054, "mean_token_accuracy": 0.7776593416929245, "num_tokens": 767284.0, "step": 47 }, { "entropy": 0.5613571405410767, "epoch": 0.1791044776119403, "grad_norm": 0.15226703882217407, "learning_rate": 0.0002, "loss": 0.5640038847923279, "mean_token_accuracy": 0.7746699303388596, "num_tokens": 783601.0, "step": 48 }, { "entropy": 0.5535127073526382, "epoch": 0.1828358208955224, "grad_norm": 0.166432186961174, "learning_rate": 0.0002, "loss": 0.5462499856948853, "mean_token_accuracy": 0.7813286185264587, "num_tokens": 799773.0, "step": 49 }, { "entropy": 0.5604032725095749, "epoch": 0.1865671641791045, "grad_norm": 0.17004649341106415, "learning_rate": 0.0002, "loss": 0.5530112981796265, "mean_token_accuracy": 0.7776568233966827, "num_tokens": 816032.0, "step": 50 }, { "entropy": 0.5409559532999992, "epoch": 0.19029850746268656, "grad_norm": 0.14887484908103943, "learning_rate": 0.0002, "loss": 0.5343962907791138, "mean_token_accuracy": 0.7841377556324005, "num_tokens": 832227.0, "step": 51 }, { "entropy": 0.5414481312036514, "epoch": 0.19402985074626866, "grad_norm": 0.20319198071956635, "learning_rate": 0.0002, "loss": 0.5386375188827515, "mean_token_accuracy": 0.7845792174339294, "num_tokens": 848643.0, "step": 52 }, { "entropy": 0.5497538298368454, "epoch": 0.19776119402985073, "grad_norm": 0.16608890891075134, "learning_rate": 0.0002, "loss": 0.5512281656265259, "mean_token_accuracy": 0.7805987298488617, "num_tokens": 865199.0, "step": 53 }, { "entropy": 0.545375257730484, "epoch": 0.20149253731343283, "grad_norm": 0.17525805532932281, "learning_rate": 0.0002, "loss": 0.5542587637901306, "mean_token_accuracy": 0.7773701697587967, "num_tokens": 881379.0, "step": 54 }, { "entropy": 0.5477564036846161, "epoch": 0.20522388059701493, "grad_norm": 0.19050806760787964, "learning_rate": 0.0002, "loss": 0.5655733942985535, "mean_token_accuracy": 0.7745383828878403, "num_tokens": 897934.0, "step": 55 }, { "entropy": 0.5568059235811234, "epoch": 0.208955223880597, "grad_norm": 0.16148774325847626, "learning_rate": 0.0002, "loss": 0.5592997074127197, "mean_token_accuracy": 0.7772074788808823, "num_tokens": 914308.0, "step": 56 }, { "entropy": 0.5678450167179108, "epoch": 0.2126865671641791, "grad_norm": 0.16515380144119263, "learning_rate": 0.0002, "loss": 0.569266676902771, "mean_token_accuracy": 0.7714356333017349, "num_tokens": 930508.0, "step": 57 }, { "entropy": 0.580150917172432, "epoch": 0.21641791044776118, "grad_norm": 0.17066031694412231, "learning_rate": 0.0002, "loss": 0.5749757289886475, "mean_token_accuracy": 0.7655356675386429, "num_tokens": 946877.0, "step": 58 }, { "entropy": 0.5695585310459137, "epoch": 0.22014925373134328, "grad_norm": 0.16599293053150177, "learning_rate": 0.0002, "loss": 0.5635928511619568, "mean_token_accuracy": 0.7739954739809036, "num_tokens": 963218.0, "step": 59 }, { "entropy": 0.5330293923616409, "epoch": 0.22388059701492538, "grad_norm": 0.14891624450683594, "learning_rate": 0.0002, "loss": 0.5344960689544678, "mean_token_accuracy": 0.7841218858957291, "num_tokens": 979460.0, "step": 60 }, { "entropy": 0.5383697599172592, "epoch": 0.22761194029850745, "grad_norm": 0.16252915561199188, "learning_rate": 0.0002, "loss": 0.5413715243339539, "mean_token_accuracy": 0.7826660871505737, "num_tokens": 995619.0, "step": 61 }, { "entropy": 0.5535406023263931, "epoch": 0.23134328358208955, "grad_norm": 0.15229789912700653, "learning_rate": 0.0002, "loss": 0.558712899684906, "mean_token_accuracy": 0.7769492119550705, "num_tokens": 1011885.0, "step": 62 }, { "entropy": 0.5603247284889221, "epoch": 0.23507462686567165, "grad_norm": 0.14967045187950134, "learning_rate": 0.0002, "loss": 0.5645769834518433, "mean_token_accuracy": 0.771862581372261, "num_tokens": 1028352.0, "step": 63 }, { "entropy": 0.563384547829628, "epoch": 0.23880597014925373, "grad_norm": 0.15884719789028168, "learning_rate": 0.0002, "loss": 0.5637681484222412, "mean_token_accuracy": 0.7742781639099121, "num_tokens": 1044550.0, "step": 64 }, { "entropy": 0.5692009180784225, "epoch": 0.24253731343283583, "grad_norm": 0.16877400875091553, "learning_rate": 0.0002, "loss": 0.5609120726585388, "mean_token_accuracy": 0.7724380940198898, "num_tokens": 1060869.0, "step": 65 }, { "entropy": 0.5652668327093124, "epoch": 0.2462686567164179, "grad_norm": 0.14263105392456055, "learning_rate": 0.0002, "loss": 0.5577319264411926, "mean_token_accuracy": 0.7767308205366135, "num_tokens": 1077318.0, "step": 66 }, { "entropy": 0.5624865591526031, "epoch": 0.25, "grad_norm": 0.1326468139886856, "learning_rate": 0.0002, "loss": 0.5610349774360657, "mean_token_accuracy": 0.7767885029315948, "num_tokens": 1093946.0, "step": 67 }, { "entropy": 0.5453900694847107, "epoch": 0.2537313432835821, "grad_norm": 0.15602754056453705, "learning_rate": 0.0002, "loss": 0.5474068522453308, "mean_token_accuracy": 0.7804547101259232, "num_tokens": 1110166.0, "step": 68 }, { "entropy": 0.5495888441801071, "epoch": 0.2574626865671642, "grad_norm": 0.16421914100646973, "learning_rate": 0.0002, "loss": 0.5586546063423157, "mean_token_accuracy": 0.7761986404657364, "num_tokens": 1126524.0, "step": 69 }, { "entropy": 0.5564677566289902, "epoch": 0.26119402985074625, "grad_norm": 0.17955079674720764, "learning_rate": 0.0002, "loss": 0.570371687412262, "mean_token_accuracy": 0.7711490094661713, "num_tokens": 1142935.0, "step": 70 }, { "entropy": 0.5473903864622116, "epoch": 0.26492537313432835, "grad_norm": 0.14180611073970795, "learning_rate": 0.0002, "loss": 0.549370527267456, "mean_token_accuracy": 0.7789817303419113, "num_tokens": 1159182.0, "step": 71 }, { "entropy": 0.5544993579387665, "epoch": 0.26865671641791045, "grad_norm": 0.1569361388683319, "learning_rate": 0.0002, "loss": 0.5507487058639526, "mean_token_accuracy": 0.7766937166452408, "num_tokens": 1175525.0, "step": 72 }, { "entropy": 0.5662118345499039, "epoch": 0.27238805970149255, "grad_norm": 0.15652883052825928, "learning_rate": 0.0002, "loss": 0.5632150173187256, "mean_token_accuracy": 0.7702545374631882, "num_tokens": 1191955.0, "step": 73 }, { "entropy": 0.5581929385662079, "epoch": 0.27611940298507465, "grad_norm": 0.1360681653022766, "learning_rate": 0.0002, "loss": 0.5503684878349304, "mean_token_accuracy": 0.7764260619878769, "num_tokens": 1208034.0, "step": 74 }, { "entropy": 0.5687559396028519, "epoch": 0.2798507462686567, "grad_norm": 0.13728748261928558, "learning_rate": 0.0002, "loss": 0.5678715109825134, "mean_token_accuracy": 0.7728003114461899, "num_tokens": 1224533.0, "step": 75 }, { "entropy": 0.5481379926204681, "epoch": 0.2835820895522388, "grad_norm": 0.16217739880084991, "learning_rate": 0.0002, "loss": 0.5537081956863403, "mean_token_accuracy": 0.7751952260732651, "num_tokens": 1240962.0, "step": 76 }, { "entropy": 0.5639017820358276, "epoch": 0.2873134328358209, "grad_norm": 0.1611357033252716, "learning_rate": 0.0002, "loss": 0.5741861462593079, "mean_token_accuracy": 0.7681055814027786, "num_tokens": 1257195.0, "step": 77 }, { "entropy": 0.5481198877096176, "epoch": 0.291044776119403, "grad_norm": 0.12783770263195038, "learning_rate": 0.0002, "loss": 0.5473082065582275, "mean_token_accuracy": 0.777423769235611, "num_tokens": 1273603.0, "step": 78 }, { "entropy": 0.539246067404747, "epoch": 0.2947761194029851, "grad_norm": 0.1314576119184494, "learning_rate": 0.0002, "loss": 0.5311948657035828, "mean_token_accuracy": 0.7861492037773132, "num_tokens": 1289837.0, "step": 79 }, { "entropy": 0.554696649312973, "epoch": 0.29850746268656714, "grad_norm": 0.1476278305053711, "learning_rate": 0.0002, "loss": 0.5538964867591858, "mean_token_accuracy": 0.7750344574451447, "num_tokens": 1306338.0, "step": 80 }, { "entropy": 0.5469587296247482, "epoch": 0.30223880597014924, "grad_norm": 0.16194719076156616, "learning_rate": 0.0002, "loss": 0.554472804069519, "mean_token_accuracy": 0.7799090445041656, "num_tokens": 1322825.0, "step": 81 }, { "entropy": 0.5433253645896912, "epoch": 0.30597014925373134, "grad_norm": 0.16987131536006927, "learning_rate": 0.0002, "loss": 0.5523664355278015, "mean_token_accuracy": 0.776031419634819, "num_tokens": 1338865.0, "step": 82 }, { "entropy": 0.5386127680540085, "epoch": 0.30970149253731344, "grad_norm": 0.14176225662231445, "learning_rate": 0.0002, "loss": 0.5489001870155334, "mean_token_accuracy": 0.7799653261899948, "num_tokens": 1355248.0, "step": 83 }, { "entropy": 0.5415250957012177, "epoch": 0.31343283582089554, "grad_norm": 0.17086099088191986, "learning_rate": 0.0002, "loss": 0.545318067073822, "mean_token_accuracy": 0.7825302183628082, "num_tokens": 1371746.0, "step": 84 }, { "entropy": 0.5727111548185349, "epoch": 0.31716417910447764, "grad_norm": 0.15196099877357483, "learning_rate": 0.0002, "loss": 0.5717822909355164, "mean_token_accuracy": 0.769862562417984, "num_tokens": 1388201.0, "step": 85 }, { "entropy": 0.5487467050552368, "epoch": 0.3208955223880597, "grad_norm": 0.12406057119369507, "learning_rate": 0.0002, "loss": 0.5426313877105713, "mean_token_accuracy": 0.7817563712596893, "num_tokens": 1404461.0, "step": 86 }, { "entropy": 0.5417477786540985, "epoch": 0.3246268656716418, "grad_norm": 0.1868571937084198, "learning_rate": 0.0002, "loss": 0.5441780090332031, "mean_token_accuracy": 0.7824695259332657, "num_tokens": 1420484.0, "step": 87 }, { "entropy": 0.552739754319191, "epoch": 0.3283582089552239, "grad_norm": 0.12260660529136658, "learning_rate": 0.0002, "loss": 0.5459186434745789, "mean_token_accuracy": 0.7800513356924057, "num_tokens": 1436981.0, "step": 88 }, { "entropy": 0.5539838075637817, "epoch": 0.332089552238806, "grad_norm": 0.19637417793273926, "learning_rate": 0.0002, "loss": 0.5502506494522095, "mean_token_accuracy": 0.779677152633667, "num_tokens": 1453360.0, "step": 89 }, { "entropy": 0.5393257439136505, "epoch": 0.3358208955223881, "grad_norm": 0.14825744926929474, "learning_rate": 0.0002, "loss": 0.5465800762176514, "mean_token_accuracy": 0.7785906046628952, "num_tokens": 1469575.0, "step": 90 }, { "entropy": 0.5319312065839767, "epoch": 0.33955223880597013, "grad_norm": 0.1817854791879654, "learning_rate": 0.0002, "loss": 0.5348737835884094, "mean_token_accuracy": 0.7835152447223663, "num_tokens": 1485763.0, "step": 91 }, { "entropy": 0.5510641485452652, "epoch": 0.34328358208955223, "grad_norm": 0.1455191969871521, "learning_rate": 0.0002, "loss": 0.5464341044425964, "mean_token_accuracy": 0.7820889949798584, "num_tokens": 1502105.0, "step": 92 }, { "entropy": 0.5406191498041153, "epoch": 0.34701492537313433, "grad_norm": 0.1273794025182724, "learning_rate": 0.0002, "loss": 0.5421090722084045, "mean_token_accuracy": 0.7849924713373184, "num_tokens": 1518477.0, "step": 93 }, { "entropy": 0.5232429951429367, "epoch": 0.35074626865671643, "grad_norm": 0.14684391021728516, "learning_rate": 0.0002, "loss": 0.5232130289077759, "mean_token_accuracy": 0.7893925607204437, "num_tokens": 1534743.0, "step": 94 }, { "entropy": 0.5493894517421722, "epoch": 0.35447761194029853, "grad_norm": 0.12976326048374176, "learning_rate": 0.0002, "loss": 0.5556308627128601, "mean_token_accuracy": 0.7738792598247528, "num_tokens": 1551015.0, "step": 95 }, { "entropy": 0.5568605363368988, "epoch": 0.3582089552238806, "grad_norm": 0.15545816719532013, "learning_rate": 0.0002, "loss": 0.5611149668693542, "mean_token_accuracy": 0.7729773372411728, "num_tokens": 1567597.0, "step": 96 }, { "entropy": 0.554488942027092, "epoch": 0.3619402985074627, "grad_norm": 0.1307706981897354, "learning_rate": 0.0002, "loss": 0.5501843094825745, "mean_token_accuracy": 0.7798233777284622, "num_tokens": 1583851.0, "step": 97 }, { "entropy": 0.5296479314565659, "epoch": 0.3656716417910448, "grad_norm": 0.1413222700357437, "learning_rate": 0.0002, "loss": 0.5348843932151794, "mean_token_accuracy": 0.7847397029399872, "num_tokens": 1599880.0, "step": 98 }, { "entropy": 0.5739381164312363, "epoch": 0.3694029850746269, "grad_norm": 0.14992888271808624, "learning_rate": 0.0002, "loss": 0.5711988210678101, "mean_token_accuracy": 0.769414946436882, "num_tokens": 1616161.0, "step": 99 }, { "entropy": 0.5500659346580505, "epoch": 0.373134328358209, "grad_norm": 0.13987883925437927, "learning_rate": 0.0002, "loss": 0.5535774230957031, "mean_token_accuracy": 0.7796037644147873, "num_tokens": 1632650.0, "step": 100 }, { "entropy": 0.5421769469976425, "epoch": 0.376865671641791, "grad_norm": 0.14819589257240295, "learning_rate": 0.0002, "loss": 0.5429503917694092, "mean_token_accuracy": 0.7809022516012192, "num_tokens": 1649147.0, "step": 101 }, { "entropy": 0.5444748848676682, "epoch": 0.3805970149253731, "grad_norm": 0.15763095021247864, "learning_rate": 0.0002, "loss": 0.5527257919311523, "mean_token_accuracy": 0.7789772897958755, "num_tokens": 1665434.0, "step": 102 }, { "entropy": 0.5364149361848831, "epoch": 0.3843283582089552, "grad_norm": 0.12937362492084503, "learning_rate": 0.0002, "loss": 0.5445730090141296, "mean_token_accuracy": 0.7801977097988129, "num_tokens": 1681628.0, "step": 103 }, { "entropy": 0.5520685017108917, "epoch": 0.3880597014925373, "grad_norm": 0.13224048912525177, "learning_rate": 0.0002, "loss": 0.5565529465675354, "mean_token_accuracy": 0.7761769741773605, "num_tokens": 1698024.0, "step": 104 }, { "entropy": 0.5505486279726028, "epoch": 0.3917910447761194, "grad_norm": 0.12523634731769562, "learning_rate": 0.0002, "loss": 0.5501624345779419, "mean_token_accuracy": 0.776427686214447, "num_tokens": 1714432.0, "step": 105 }, { "entropy": 0.5415863394737244, "epoch": 0.39552238805970147, "grad_norm": 0.12370901554822922, "learning_rate": 0.0002, "loss": 0.5389205813407898, "mean_token_accuracy": 0.7835447043180466, "num_tokens": 1730701.0, "step": 106 }, { "entropy": 0.535835400223732, "epoch": 0.39925373134328357, "grad_norm": 0.12875092029571533, "learning_rate": 0.0002, "loss": 0.5339052081108093, "mean_token_accuracy": 0.7833075076341629, "num_tokens": 1747039.0, "step": 107 }, { "entropy": 0.5391292423009872, "epoch": 0.40298507462686567, "grad_norm": 0.13361512124538422, "learning_rate": 0.0002, "loss": 0.5480363368988037, "mean_token_accuracy": 0.778292641043663, "num_tokens": 1763231.0, "step": 108 }, { "entropy": 0.5451123267412186, "epoch": 0.40671641791044777, "grad_norm": 0.12270035594701767, "learning_rate": 0.0002, "loss": 0.544527530670166, "mean_token_accuracy": 0.7805600017309189, "num_tokens": 1779643.0, "step": 109 }, { "entropy": 0.5353200137615204, "epoch": 0.41044776119402987, "grad_norm": 0.15249699354171753, "learning_rate": 0.0002, "loss": 0.540695309638977, "mean_token_accuracy": 0.7809852063655853, "num_tokens": 1795799.0, "step": 110 }, { "entropy": 0.5517745912075043, "epoch": 0.4141791044776119, "grad_norm": 0.13048961758613586, "learning_rate": 0.0002, "loss": 0.5428080558776855, "mean_token_accuracy": 0.7799961864948273, "num_tokens": 1812372.0, "step": 111 }, { "entropy": 0.5553679913282394, "epoch": 0.417910447761194, "grad_norm": 0.135862797498703, "learning_rate": 0.0002, "loss": 0.5515741109848022, "mean_token_accuracy": 0.7762576192617416, "num_tokens": 1828663.0, "step": 112 }, { "entropy": 0.5415378957986832, "epoch": 0.4216417910447761, "grad_norm": 0.17365720868110657, "learning_rate": 0.0002, "loss": 0.5439163446426392, "mean_token_accuracy": 0.7816168814897537, "num_tokens": 1845046.0, "step": 113 }, { "entropy": 0.5443854928016663, "epoch": 0.4253731343283582, "grad_norm": 0.13225306570529938, "learning_rate": 0.0002, "loss": 0.5523333549499512, "mean_token_accuracy": 0.7754887640476227, "num_tokens": 1861463.0, "step": 114 }, { "entropy": 0.536818191409111, "epoch": 0.4291044776119403, "grad_norm": 0.18661700189113617, "learning_rate": 0.0002, "loss": 0.5445066094398499, "mean_token_accuracy": 0.7783756703138351, "num_tokens": 1877488.0, "step": 115 }, { "entropy": 0.5401700437068939, "epoch": 0.43283582089552236, "grad_norm": 0.1313197761774063, "learning_rate": 0.0002, "loss": 0.5441405773162842, "mean_token_accuracy": 0.779263436794281, "num_tokens": 1893953.0, "step": 116 }, { "entropy": 0.5655902773141861, "epoch": 0.43656716417910446, "grad_norm": 0.14134129881858826, "learning_rate": 0.0002, "loss": 0.5561054944992065, "mean_token_accuracy": 0.7760706096887589, "num_tokens": 1910559.0, "step": 117 }, { "entropy": 0.5377545058727264, "epoch": 0.44029850746268656, "grad_norm": 0.1476624757051468, "learning_rate": 0.0002, "loss": 0.5377650260925293, "mean_token_accuracy": 0.784254401922226, "num_tokens": 1926798.0, "step": 118 }, { "entropy": 0.5710994154214859, "epoch": 0.44402985074626866, "grad_norm": 0.12695498764514923, "learning_rate": 0.0002, "loss": 0.5705847144126892, "mean_token_accuracy": 0.7709101736545563, "num_tokens": 1943309.0, "step": 119 }, { "entropy": 0.5473001599311829, "epoch": 0.44776119402985076, "grad_norm": 0.13190272450447083, "learning_rate": 0.0002, "loss": 0.5527402758598328, "mean_token_accuracy": 0.7776251584291458, "num_tokens": 1959914.0, "step": 120 }, { "entropy": 0.5332797467708588, "epoch": 0.45149253731343286, "grad_norm": 0.1538720279932022, "learning_rate": 0.0002, "loss": 0.541407585144043, "mean_token_accuracy": 0.7805240601301193, "num_tokens": 1976350.0, "step": 121 }, { "entropy": 0.5485477149486542, "epoch": 0.4552238805970149, "grad_norm": 0.1464855819940567, "learning_rate": 0.0002, "loss": 0.5562998056411743, "mean_token_accuracy": 0.7745071202516556, "num_tokens": 1992575.0, "step": 122 }, { "entropy": 0.5465153902769089, "epoch": 0.458955223880597, "grad_norm": 0.1392602175474167, "learning_rate": 0.0002, "loss": 0.5450125932693481, "mean_token_accuracy": 0.7803204655647278, "num_tokens": 2008818.0, "step": 123 }, { "entropy": 0.5216257721185684, "epoch": 0.4626865671641791, "grad_norm": 0.16500917077064514, "learning_rate": 0.0002, "loss": 0.5204989314079285, "mean_token_accuracy": 0.7916441410779953, "num_tokens": 2024909.0, "step": 124 }, { "entropy": 0.5582488030195236, "epoch": 0.4664179104477612, "grad_norm": 0.12797319889068604, "learning_rate": 0.0002, "loss": 0.5522317290306091, "mean_token_accuracy": 0.7782706022262573, "num_tokens": 2041274.0, "step": 125 }, { "entropy": 0.5451529324054718, "epoch": 0.4701492537313433, "grad_norm": 0.136440709233284, "learning_rate": 0.0002, "loss": 0.5448014736175537, "mean_token_accuracy": 0.7787207514047623, "num_tokens": 2057665.0, "step": 126 }, { "entropy": 0.5657823532819748, "epoch": 0.47388059701492535, "grad_norm": 0.13369601964950562, "learning_rate": 0.0002, "loss": 0.5634066462516785, "mean_token_accuracy": 0.7729785293340683, "num_tokens": 2074159.0, "step": 127 }, { "entropy": 0.52435402572155, "epoch": 0.47761194029850745, "grad_norm": 0.13124150037765503, "learning_rate": 0.0002, "loss": 0.5261214971542358, "mean_token_accuracy": 0.787582278251648, "num_tokens": 2090388.0, "step": 128 }, { "entropy": 0.5388573259115219, "epoch": 0.48134328358208955, "grad_norm": 0.1402949094772339, "learning_rate": 0.0002, "loss": 0.5444526672363281, "mean_token_accuracy": 0.780138373374939, "num_tokens": 2106895.0, "step": 129 }, { "entropy": 0.5594224631786346, "epoch": 0.48507462686567165, "grad_norm": 0.12214766442775726, "learning_rate": 0.0002, "loss": 0.5680845379829407, "mean_token_accuracy": 0.7693810015916824, "num_tokens": 2122936.0, "step": 130 }, { "entropy": 0.5598264634609222, "epoch": 0.48880597014925375, "grad_norm": 0.11836589127779007, "learning_rate": 0.0002, "loss": 0.5608173608779907, "mean_token_accuracy": 0.7735486477613449, "num_tokens": 2139356.0, "step": 131 }, { "entropy": 0.5484192073345184, "epoch": 0.4925373134328358, "grad_norm": 0.11776985228061676, "learning_rate": 0.0002, "loss": 0.5445444583892822, "mean_token_accuracy": 0.7797606885433197, "num_tokens": 2155868.0, "step": 132 }, { "entropy": 0.5602923631668091, "epoch": 0.4962686567164179, "grad_norm": 0.12020131945610046, "learning_rate": 0.0002, "loss": 0.5522936582565308, "mean_token_accuracy": 0.7776170521974564, "num_tokens": 2172336.0, "step": 133 }, { "entropy": 0.5583924055099487, "epoch": 0.5, "grad_norm": 0.1295275241136551, "learning_rate": 0.0002, "loss": 0.5662660002708435, "mean_token_accuracy": 0.7716575860977173, "num_tokens": 2188518.0, "step": 134 }, { "entropy": 0.5514810979366302, "epoch": 0.503731343283582, "grad_norm": 0.1089273989200592, "learning_rate": 0.0002, "loss": 0.5514034032821655, "mean_token_accuracy": 0.7769223898649216, "num_tokens": 2205142.0, "step": 135 }, { "entropy": 0.5440865606069565, "epoch": 0.5074626865671642, "grad_norm": 0.13056722283363342, "learning_rate": 0.0002, "loss": 0.5475744009017944, "mean_token_accuracy": 0.7764044553041458, "num_tokens": 2221743.0, "step": 136 }, { "entropy": 0.5476541817188263, "epoch": 0.5111940298507462, "grad_norm": 0.13166996836662292, "learning_rate": 0.0002, "loss": 0.5477900505065918, "mean_token_accuracy": 0.7784378528594971, "num_tokens": 2238142.0, "step": 137 }, { "entropy": 0.5558486729860306, "epoch": 0.5149253731343284, "grad_norm": 0.12133946269750595, "learning_rate": 0.0002, "loss": 0.5609108209609985, "mean_token_accuracy": 0.7736046612262726, "num_tokens": 2254456.0, "step": 138 }, { "entropy": 0.5566332340240479, "epoch": 0.5186567164179104, "grad_norm": 0.12148908525705338, "learning_rate": 0.0002, "loss": 0.5561110973358154, "mean_token_accuracy": 0.7756631374359131, "num_tokens": 2270696.0, "step": 139 }, { "entropy": 0.5462600067257881, "epoch": 0.5223880597014925, "grad_norm": 0.1129021942615509, "learning_rate": 0.0002, "loss": 0.5448604822158813, "mean_token_accuracy": 0.7795793265104294, "num_tokens": 2287025.0, "step": 140 }, { "entropy": 0.5399314314126968, "epoch": 0.5261194029850746, "grad_norm": 0.1251847892999649, "learning_rate": 0.0002, "loss": 0.5481414794921875, "mean_token_accuracy": 0.778893768787384, "num_tokens": 2303399.0, "step": 141 }, { "entropy": 0.5469618439674377, "epoch": 0.5298507462686567, "grad_norm": 0.11956755071878433, "learning_rate": 0.0002, "loss": 0.5474820137023926, "mean_token_accuracy": 0.7784739285707474, "num_tokens": 2319818.0, "step": 142 }, { "entropy": 0.5447351336479187, "epoch": 0.5335820895522388, "grad_norm": 0.14881564676761627, "learning_rate": 0.0002, "loss": 0.5410581827163696, "mean_token_accuracy": 0.781320258975029, "num_tokens": 2335949.0, "step": 143 }, { "entropy": 0.5449966341257095, "epoch": 0.5373134328358209, "grad_norm": 0.12103825062513351, "learning_rate": 0.0002, "loss": 0.5471005439758301, "mean_token_accuracy": 0.7796377539634705, "num_tokens": 2352269.0, "step": 144 }, { "entropy": 0.5632765144109726, "epoch": 0.5410447761194029, "grad_norm": 0.12277977168560028, "learning_rate": 0.0002, "loss": 0.5630727410316467, "mean_token_accuracy": 0.7703763097524643, "num_tokens": 2368674.0, "step": 145 }, { "entropy": 0.5339089632034302, "epoch": 0.5447761194029851, "grad_norm": 0.14498627185821533, "learning_rate": 0.0002, "loss": 0.5364416241645813, "mean_token_accuracy": 0.7819968014955521, "num_tokens": 2384936.0, "step": 146 }, { "entropy": 0.5429459661245346, "epoch": 0.5485074626865671, "grad_norm": 0.12051384150981903, "learning_rate": 0.0002, "loss": 0.5456188917160034, "mean_token_accuracy": 0.7803860902786255, "num_tokens": 2401292.0, "step": 147 }, { "entropy": 0.5626052618026733, "epoch": 0.5522388059701493, "grad_norm": 0.1412496566772461, "learning_rate": 0.0002, "loss": 0.5596410036087036, "mean_token_accuracy": 0.7737385481595993, "num_tokens": 2417925.0, "step": 148 }, { "entropy": 0.5565475225448608, "epoch": 0.5559701492537313, "grad_norm": 0.1441730409860611, "learning_rate": 0.0002, "loss": 0.5520785450935364, "mean_token_accuracy": 0.775386318564415, "num_tokens": 2434621.0, "step": 149 }, { "entropy": 0.5197634100914001, "epoch": 0.5597014925373134, "grad_norm": 0.12098351866006851, "learning_rate": 0.0002, "loss": 0.5222008228302002, "mean_token_accuracy": 0.7903124392032623, "num_tokens": 2450903.0, "step": 150 }, { "entropy": 0.5489796698093414, "epoch": 0.5634328358208955, "grad_norm": 0.14946326613426208, "learning_rate": 0.0002, "loss": 0.5559377074241638, "mean_token_accuracy": 0.775105893611908, "num_tokens": 2467105.0, "step": 151 }, { "entropy": 0.5400301665067673, "epoch": 0.5671641791044776, "grad_norm": 0.12906025350093842, "learning_rate": 0.0002, "loss": 0.5420807600021362, "mean_token_accuracy": 0.7795381844043732, "num_tokens": 2483456.0, "step": 152 }, { "entropy": 0.5474328249692917, "epoch": 0.5708955223880597, "grad_norm": 0.12021685391664505, "learning_rate": 0.0002, "loss": 0.5509780049324036, "mean_token_accuracy": 0.7768895477056503, "num_tokens": 2500011.0, "step": 153 }, { "entropy": 0.5380930155515671, "epoch": 0.5746268656716418, "grad_norm": 0.11843080073595047, "learning_rate": 0.0002, "loss": 0.5308334827423096, "mean_token_accuracy": 0.7881843447685242, "num_tokens": 2516780.0, "step": 154 }, { "entropy": 0.5460693091154099, "epoch": 0.5783582089552238, "grad_norm": 0.16729064285755157, "learning_rate": 0.0002, "loss": 0.5530881285667419, "mean_token_accuracy": 0.7742334753274918, "num_tokens": 2532837.0, "step": 155 }, { "entropy": 0.5500553995370865, "epoch": 0.582089552238806, "grad_norm": 0.1366872787475586, "learning_rate": 0.0002, "loss": 0.5533830523490906, "mean_token_accuracy": 0.7750078588724136, "num_tokens": 2549157.0, "step": 156 }, { "entropy": 0.5497538447380066, "epoch": 0.585820895522388, "grad_norm": 0.12214312702417374, "learning_rate": 0.0002, "loss": 0.5549652576446533, "mean_token_accuracy": 0.7742869108915329, "num_tokens": 2565745.0, "step": 157 }, { "entropy": 0.5520212799310684, "epoch": 0.5895522388059702, "grad_norm": 0.13198687136173248, "learning_rate": 0.0002, "loss": 0.5503985285758972, "mean_token_accuracy": 0.7776314318180084, "num_tokens": 2582172.0, "step": 158 }, { "entropy": 0.5420894026756287, "epoch": 0.5932835820895522, "grad_norm": 0.1303817480802536, "learning_rate": 0.0002, "loss": 0.545700192451477, "mean_token_accuracy": 0.7790375500917435, "num_tokens": 2598785.0, "step": 159 }, { "entropy": 0.5361281335353851, "epoch": 0.5970149253731343, "grad_norm": 0.13537634909152985, "learning_rate": 0.0002, "loss": 0.5409078598022461, "mean_token_accuracy": 0.779214471578598, "num_tokens": 2615324.0, "step": 160 }, { "entropy": 0.5633385479450226, "epoch": 0.6007462686567164, "grad_norm": 0.11204258352518082, "learning_rate": 0.0002, "loss": 0.5624291896820068, "mean_token_accuracy": 0.7730776518583298, "num_tokens": 2631612.0, "step": 161 }, { "entropy": 0.5239899605512619, "epoch": 0.6044776119402985, "grad_norm": 0.14660899341106415, "learning_rate": 0.0002, "loss": 0.5244404673576355, "mean_token_accuracy": 0.7870436310768127, "num_tokens": 2648098.0, "step": 162 }, { "entropy": 0.5414755046367645, "epoch": 0.6082089552238806, "grad_norm": 0.11887400597333908, "learning_rate": 0.0002, "loss": 0.5397330522537231, "mean_token_accuracy": 0.7847625911235809, "num_tokens": 2664285.0, "step": 163 }, { "entropy": 0.5442674309015274, "epoch": 0.6119402985074627, "grad_norm": 0.11572780460119247, "learning_rate": 0.0002, "loss": 0.5454840660095215, "mean_token_accuracy": 0.7809286564588547, "num_tokens": 2680551.0, "step": 164 }, { "entropy": 0.5371343344449997, "epoch": 0.6156716417910447, "grad_norm": 0.1056356355547905, "learning_rate": 0.0002, "loss": 0.5348964929580688, "mean_token_accuracy": 0.7857467532157898, "num_tokens": 2697071.0, "step": 165 }, { "entropy": 0.5399870425462723, "epoch": 0.6194029850746269, "grad_norm": 0.13278594613075256, "learning_rate": 0.0002, "loss": 0.5447728633880615, "mean_token_accuracy": 0.7792245298624039, "num_tokens": 2713461.0, "step": 166 }, { "entropy": 0.5350475907325745, "epoch": 0.6231343283582089, "grad_norm": 0.1305065155029297, "learning_rate": 0.0002, "loss": 0.5362796783447266, "mean_token_accuracy": 0.7812380343675613, "num_tokens": 2729505.0, "step": 167 }, { "entropy": 0.5582499951124191, "epoch": 0.6268656716417911, "grad_norm": 0.12587526440620422, "learning_rate": 0.0002, "loss": 0.5559293627738953, "mean_token_accuracy": 0.7746618837118149, "num_tokens": 2746287.0, "step": 168 }, { "entropy": 0.5586439073085785, "epoch": 0.6305970149253731, "grad_norm": 0.12845800817012787, "learning_rate": 0.0002, "loss": 0.5518544912338257, "mean_token_accuracy": 0.7751341164112091, "num_tokens": 2762818.0, "step": 169 }, { "entropy": 0.5343242138624191, "epoch": 0.6343283582089553, "grad_norm": 0.15256647765636444, "learning_rate": 0.0002, "loss": 0.5386060476303101, "mean_token_accuracy": 0.7807702422142029, "num_tokens": 2779199.0, "step": 170 }, { "entropy": 0.5373098105192184, "epoch": 0.6380597014925373, "grad_norm": 0.13263238966464996, "learning_rate": 0.0002, "loss": 0.5466636419296265, "mean_token_accuracy": 0.7765426337718964, "num_tokens": 2795330.0, "step": 171 }, { "entropy": 0.5298089534044266, "epoch": 0.6417910447761194, "grad_norm": 0.12450744211673737, "learning_rate": 0.0002, "loss": 0.5325064659118652, "mean_token_accuracy": 0.7838508486747742, "num_tokens": 2811566.0, "step": 172 }, { "entropy": 0.5550331622362137, "epoch": 0.6455223880597015, "grad_norm": 0.111052505671978, "learning_rate": 0.0002, "loss": 0.5552961230278015, "mean_token_accuracy": 0.7752347737550735, "num_tokens": 2827783.0, "step": 173 }, { "entropy": 0.5506296455860138, "epoch": 0.6492537313432836, "grad_norm": 0.13255524635314941, "learning_rate": 0.0002, "loss": 0.5490573048591614, "mean_token_accuracy": 0.7767810970544815, "num_tokens": 2844210.0, "step": 174 }, { "entropy": 0.5634674280881882, "epoch": 0.6529850746268657, "grad_norm": 0.11786694079637527, "learning_rate": 0.0002, "loss": 0.5620654225349426, "mean_token_accuracy": 0.7710569798946381, "num_tokens": 2860606.0, "step": 175 }, { "entropy": 0.5491903871297836, "epoch": 0.6567164179104478, "grad_norm": 0.1378813087940216, "learning_rate": 0.0002, "loss": 0.5544133186340332, "mean_token_accuracy": 0.7742699533700943, "num_tokens": 2876978.0, "step": 176 }, { "entropy": 0.5418348163366318, "epoch": 0.6604477611940298, "grad_norm": 0.1719319373369217, "learning_rate": 0.0002, "loss": 0.5509114265441895, "mean_token_accuracy": 0.7738531082868576, "num_tokens": 2893436.0, "step": 177 }, { "entropy": 0.5621145367622375, "epoch": 0.664179104477612, "grad_norm": 0.13473528623580933, "learning_rate": 0.0002, "loss": 0.5569881796836853, "mean_token_accuracy": 0.7752742022275925, "num_tokens": 2909714.0, "step": 178 }, { "entropy": 0.5567401647567749, "epoch": 0.667910447761194, "grad_norm": 0.15127326548099518, "learning_rate": 0.0002, "loss": 0.5531461238861084, "mean_token_accuracy": 0.7789575010538101, "num_tokens": 2926148.0, "step": 179 }, { "entropy": 0.5276759713888168, "epoch": 0.6716417910447762, "grad_norm": 0.1254606693983078, "learning_rate": 0.0002, "loss": 0.5301634669303894, "mean_token_accuracy": 0.7837289869785309, "num_tokens": 2942739.0, "step": 180 }, { "entropy": 0.5278603881597519, "epoch": 0.6753731343283582, "grad_norm": 0.128974050283432, "learning_rate": 0.0002, "loss": 0.5369632244110107, "mean_token_accuracy": 0.7825482338666916, "num_tokens": 2958977.0, "step": 181 }, { "entropy": 0.5377722084522247, "epoch": 0.6791044776119403, "grad_norm": 0.13316886126995087, "learning_rate": 0.0002, "loss": 0.5483193397521973, "mean_token_accuracy": 0.7763564735651016, "num_tokens": 2975274.0, "step": 182 }, { "entropy": 0.5507437884807587, "epoch": 0.6828358208955224, "grad_norm": 0.12445816397666931, "learning_rate": 0.0002, "loss": 0.5532326698303223, "mean_token_accuracy": 0.7756502628326416, "num_tokens": 2991599.0, "step": 183 }, { "entropy": 0.5495483875274658, "epoch": 0.6865671641791045, "grad_norm": 0.11616785079240799, "learning_rate": 0.0002, "loss": 0.5388738512992859, "mean_token_accuracy": 0.780926913022995, "num_tokens": 3008127.0, "step": 184 }, { "entropy": 0.5604113638401031, "epoch": 0.6902985074626866, "grad_norm": 0.10933785885572433, "learning_rate": 0.0002, "loss": 0.5567720532417297, "mean_token_accuracy": 0.7762922942638397, "num_tokens": 3024360.0, "step": 185 }, { "entropy": 0.5393257141113281, "epoch": 0.6940298507462687, "grad_norm": 0.13075008988380432, "learning_rate": 0.0002, "loss": 0.5377945303916931, "mean_token_accuracy": 0.7826398611068726, "num_tokens": 3040880.0, "step": 186 }, { "entropy": 0.534931406378746, "epoch": 0.6977611940298507, "grad_norm": 0.11783911287784576, "learning_rate": 0.0002, "loss": 0.5384173393249512, "mean_token_accuracy": 0.7814484983682632, "num_tokens": 3057215.0, "step": 187 }, { "entropy": 0.5480581521987915, "epoch": 0.7014925373134329, "grad_norm": 0.11767826229333878, "learning_rate": 0.0002, "loss": 0.5535053610801697, "mean_token_accuracy": 0.7753477245569229, "num_tokens": 3073526.0, "step": 188 }, { "entropy": 0.5417313128709793, "epoch": 0.7052238805970149, "grad_norm": 0.1221914142370224, "learning_rate": 0.0002, "loss": 0.5454643368721008, "mean_token_accuracy": 0.7766887843608856, "num_tokens": 3089677.0, "step": 189 }, { "entropy": 0.5625078678131104, "epoch": 0.7089552238805971, "grad_norm": 0.11974587291479111, "learning_rate": 0.0002, "loss": 0.5611926913261414, "mean_token_accuracy": 0.7717815935611725, "num_tokens": 3105979.0, "step": 190 }, { "entropy": 0.5516901463270187, "epoch": 0.7126865671641791, "grad_norm": 0.11311069130897522, "learning_rate": 0.0002, "loss": 0.5487813949584961, "mean_token_accuracy": 0.7764030396938324, "num_tokens": 3122320.0, "step": 191 }, { "entropy": 0.5541231781244278, "epoch": 0.7164179104477612, "grad_norm": 0.12345684319734573, "learning_rate": 0.0002, "loss": 0.5585082173347473, "mean_token_accuracy": 0.774434968829155, "num_tokens": 3138647.0, "step": 192 }, { "entropy": 0.5558422803878784, "epoch": 0.7201492537313433, "grad_norm": 0.13054387271404266, "learning_rate": 0.0002, "loss": 0.5540096163749695, "mean_token_accuracy": 0.7756641954183578, "num_tokens": 3154847.0, "step": 193 }, { "entropy": 0.55143603682518, "epoch": 0.7238805970149254, "grad_norm": 0.14231973886489868, "learning_rate": 0.0002, "loss": 0.5643096566200256, "mean_token_accuracy": 0.7717767059803009, "num_tokens": 3171336.0, "step": 194 }, { "entropy": 0.5277590304613113, "epoch": 0.7276119402985075, "grad_norm": 0.12328840047121048, "learning_rate": 0.0002, "loss": 0.5327441096305847, "mean_token_accuracy": 0.7853522598743439, "num_tokens": 3187829.0, "step": 195 }, { "entropy": 0.5539046078920364, "epoch": 0.7313432835820896, "grad_norm": 0.12686993181705475, "learning_rate": 0.0002, "loss": 0.5454736948013306, "mean_token_accuracy": 0.7813247591257095, "num_tokens": 3204100.0, "step": 196 }, { "entropy": 0.5553427636623383, "epoch": 0.7350746268656716, "grad_norm": 0.14084763824939728, "learning_rate": 0.0002, "loss": 0.5538918972015381, "mean_token_accuracy": 0.7761572599411011, "num_tokens": 3220526.0, "step": 197 }, { "entropy": 0.5553955286741257, "epoch": 0.7388059701492538, "grad_norm": 0.15137532353401184, "learning_rate": 0.0002, "loss": 0.5569421648979187, "mean_token_accuracy": 0.7751066386699677, "num_tokens": 3237005.0, "step": 198 }, { "entropy": 0.5306164473295212, "epoch": 0.7425373134328358, "grad_norm": 0.14029283821582794, "learning_rate": 0.0002, "loss": 0.5325392484664917, "mean_token_accuracy": 0.7821047902107239, "num_tokens": 3253191.0, "step": 199 }, { "entropy": 0.5289445072412491, "epoch": 0.746268656716418, "grad_norm": 0.1625203937292099, "learning_rate": 0.0002, "loss": 0.530889093875885, "mean_token_accuracy": 0.7839524000883102, "num_tokens": 3269303.0, "step": 200 }, { "entropy": 0.5537738502025604, "epoch": 0.75, "grad_norm": 0.12837141752243042, "learning_rate": 0.0002, "loss": 0.5496644377708435, "mean_token_accuracy": 0.7775348573923111, "num_tokens": 3285861.0, "step": 201 }, { "entropy": 0.5437710881233215, "epoch": 0.753731343283582, "grad_norm": 0.15969154238700867, "learning_rate": 0.0002, "loss": 0.5445458889007568, "mean_token_accuracy": 0.7779001444578171, "num_tokens": 3302531.0, "step": 202 }, { "entropy": 0.5435174703598022, "epoch": 0.7574626865671642, "grad_norm": 0.1447206735610962, "learning_rate": 0.0002, "loss": 0.5419492125511169, "mean_token_accuracy": 0.782675564289093, "num_tokens": 3318918.0, "step": 203 }, { "entropy": 0.5337730944156647, "epoch": 0.7611940298507462, "grad_norm": 0.13017146289348602, "learning_rate": 0.0002, "loss": 0.5400105714797974, "mean_token_accuracy": 0.7810544222593307, "num_tokens": 3335348.0, "step": 204 }, { "entropy": 0.534254178404808, "epoch": 0.7649253731343284, "grad_norm": 0.11939690262079239, "learning_rate": 0.0002, "loss": 0.5358497500419617, "mean_token_accuracy": 0.7831085026264191, "num_tokens": 3351607.0, "step": 205 }, { "entropy": 0.5295046716928482, "epoch": 0.7686567164179104, "grad_norm": 0.17022010684013367, "learning_rate": 0.0002, "loss": 0.5389232635498047, "mean_token_accuracy": 0.7811893969774246, "num_tokens": 3368046.0, "step": 206 }, { "entropy": 0.5532102882862091, "epoch": 0.7723880597014925, "grad_norm": 0.13207128643989563, "learning_rate": 0.0002, "loss": 0.556742787361145, "mean_token_accuracy": 0.7772794514894485, "num_tokens": 3384496.0, "step": 207 }, { "entropy": 0.5532752573490143, "epoch": 0.7761194029850746, "grad_norm": 0.16495952010154724, "learning_rate": 0.0002, "loss": 0.5545544624328613, "mean_token_accuracy": 0.777538612484932, "num_tokens": 3400918.0, "step": 208 }, { "entropy": 0.534032866358757, "epoch": 0.7798507462686567, "grad_norm": 0.1333177387714386, "learning_rate": 0.0002, "loss": 0.533141553401947, "mean_token_accuracy": 0.7848780155181885, "num_tokens": 3417300.0, "step": 209 }, { "entropy": 0.5450873523950577, "epoch": 0.7835820895522388, "grad_norm": 0.12406419962644577, "learning_rate": 0.0002, "loss": 0.5425257086753845, "mean_token_accuracy": 0.781457707285881, "num_tokens": 3433516.0, "step": 210 }, { "entropy": 0.5520957857370377, "epoch": 0.7873134328358209, "grad_norm": 0.16319960355758667, "learning_rate": 0.0002, "loss": 0.5528780817985535, "mean_token_accuracy": 0.7751211673021317, "num_tokens": 3449854.0, "step": 211 }, { "entropy": 0.5411545261740685, "epoch": 0.7910447761194029, "grad_norm": 0.11995123326778412, "learning_rate": 0.0002, "loss": 0.5378537178039551, "mean_token_accuracy": 0.7797028720378876, "num_tokens": 3466138.0, "step": 212 }, { "entropy": 0.5522632747888565, "epoch": 0.7947761194029851, "grad_norm": 0.14674413204193115, "learning_rate": 0.0002, "loss": 0.5561342239379883, "mean_token_accuracy": 0.7742671966552734, "num_tokens": 3482443.0, "step": 213 }, { "entropy": 0.5423247516155243, "epoch": 0.7985074626865671, "grad_norm": 0.1413860321044922, "learning_rate": 0.0002, "loss": 0.5450446605682373, "mean_token_accuracy": 0.7770555764436722, "num_tokens": 3498627.0, "step": 214 }, { "entropy": 0.5330623686313629, "epoch": 0.8022388059701493, "grad_norm": 0.1323142796754837, "learning_rate": 0.0002, "loss": 0.5411436557769775, "mean_token_accuracy": 0.7801088243722916, "num_tokens": 3515028.0, "step": 215 }, { "entropy": 0.5561616569757462, "epoch": 0.8059701492537313, "grad_norm": 0.14549626410007477, "learning_rate": 0.0002, "loss": 0.5557980537414551, "mean_token_accuracy": 0.774229571223259, "num_tokens": 3531502.0, "step": 216 }, { "entropy": 0.5611517131328583, "epoch": 0.8097014925373134, "grad_norm": 0.13433797657489777, "learning_rate": 0.0002, "loss": 0.5634274482727051, "mean_token_accuracy": 0.7715686410665512, "num_tokens": 3547519.0, "step": 217 }, { "entropy": 0.5514582842588425, "epoch": 0.8134328358208955, "grad_norm": 0.11890087276697159, "learning_rate": 0.0002, "loss": 0.5433245897293091, "mean_token_accuracy": 0.7793933302164078, "num_tokens": 3563773.0, "step": 218 }, { "entropy": 0.534797728061676, "epoch": 0.8171641791044776, "grad_norm": 0.1360422521829605, "learning_rate": 0.0002, "loss": 0.5381568670272827, "mean_token_accuracy": 0.7809459120035172, "num_tokens": 3580120.0, "step": 219 }, { "entropy": 0.5429193377494812, "epoch": 0.8208955223880597, "grad_norm": 0.13077932596206665, "learning_rate": 0.0002, "loss": 0.5535344481468201, "mean_token_accuracy": 0.7765921354293823, "num_tokens": 3596382.0, "step": 220 }, { "entropy": 0.5237333700060844, "epoch": 0.8246268656716418, "grad_norm": 0.1276118904352188, "learning_rate": 0.0002, "loss": 0.5291868448257446, "mean_token_accuracy": 0.7849691659212112, "num_tokens": 3612537.0, "step": 221 }, { "entropy": 0.5639058351516724, "epoch": 0.8283582089552238, "grad_norm": 0.1108359843492508, "learning_rate": 0.0002, "loss": 0.5600181221961975, "mean_token_accuracy": 0.7725061029195786, "num_tokens": 3629049.0, "step": 222 }, { "entropy": 0.5387094169855118, "epoch": 0.832089552238806, "grad_norm": 0.14372611045837402, "learning_rate": 0.0002, "loss": 0.5452870726585388, "mean_token_accuracy": 0.7791440933942795, "num_tokens": 3645497.0, "step": 223 }, { "entropy": 0.5521352589130402, "epoch": 0.835820895522388, "grad_norm": 0.1448589414358139, "learning_rate": 0.0002, "loss": 0.5500624775886536, "mean_token_accuracy": 0.7766592055559158, "num_tokens": 3661916.0, "step": 224 }, { "entropy": 0.5495995134115219, "epoch": 0.8395522388059702, "grad_norm": 0.11583460122346878, "learning_rate": 0.0002, "loss": 0.5486539006233215, "mean_token_accuracy": 0.77958944439888, "num_tokens": 3678385.0, "step": 225 }, { "entropy": 0.5483616590499878, "epoch": 0.8432835820895522, "grad_norm": 0.12950138747692108, "learning_rate": 0.0002, "loss": 0.550271213054657, "mean_token_accuracy": 0.7755987495183945, "num_tokens": 3694915.0, "step": 226 }, { "entropy": 0.5614336878061295, "epoch": 0.8470149253731343, "grad_norm": 0.1335671842098236, "learning_rate": 0.0002, "loss": 0.5636512041091919, "mean_token_accuracy": 0.7719693928956985, "num_tokens": 3710911.0, "step": 227 }, { "entropy": 0.5516408532857895, "epoch": 0.8507462686567164, "grad_norm": 0.11091525852680206, "learning_rate": 0.0002, "loss": 0.5478500127792358, "mean_token_accuracy": 0.7780372649431229, "num_tokens": 3727387.0, "step": 228 }, { "entropy": 0.5346055030822754, "epoch": 0.8544776119402985, "grad_norm": 0.1468094438314438, "learning_rate": 0.0002, "loss": 0.5368551015853882, "mean_token_accuracy": 0.7816846072673798, "num_tokens": 3743610.0, "step": 229 }, { "entropy": 0.5556191802024841, "epoch": 0.8582089552238806, "grad_norm": 0.12531019747257233, "learning_rate": 0.0002, "loss": 0.554017961025238, "mean_token_accuracy": 0.775733008980751, "num_tokens": 3759900.0, "step": 230 }, { "entropy": 0.5382195562124252, "epoch": 0.8619402985074627, "grad_norm": 0.12708726525306702, "learning_rate": 0.0002, "loss": 0.5370462536811829, "mean_token_accuracy": 0.7824227660894394, "num_tokens": 3776209.0, "step": 231 }, { "entropy": 0.5437551140785217, "epoch": 0.8656716417910447, "grad_norm": 0.14250780642032623, "learning_rate": 0.0002, "loss": 0.5482578277587891, "mean_token_accuracy": 0.7775947004556656, "num_tokens": 3792690.0, "step": 232 }, { "entropy": 0.5299069508910179, "epoch": 0.8694029850746269, "grad_norm": 0.09997344017028809, "learning_rate": 0.0002, "loss": 0.5321590900421143, "mean_token_accuracy": 0.7849525660276413, "num_tokens": 3808996.0, "step": 233 }, { "entropy": 0.5415566265583038, "epoch": 0.8731343283582089, "grad_norm": 0.14475880563259125, "learning_rate": 0.0002, "loss": 0.5407425165176392, "mean_token_accuracy": 0.7812676578760147, "num_tokens": 3825184.0, "step": 234 }, { "entropy": 0.5459320992231369, "epoch": 0.8768656716417911, "grad_norm": 0.1116221696138382, "learning_rate": 0.0002, "loss": 0.546471118927002, "mean_token_accuracy": 0.779377743601799, "num_tokens": 3841452.0, "step": 235 }, { "entropy": 0.5291514843702316, "epoch": 0.8805970149253731, "grad_norm": 0.12996730208396912, "learning_rate": 0.0002, "loss": 0.5327478647232056, "mean_token_accuracy": 0.7848521023988724, "num_tokens": 3858017.0, "step": 236 }, { "entropy": 0.5208889245986938, "epoch": 0.8843283582089553, "grad_norm": 0.16807906329631805, "learning_rate": 0.0002, "loss": 0.5301882028579712, "mean_token_accuracy": 0.786228597164154, "num_tokens": 3874064.0, "step": 237 }, { "entropy": 0.5617295503616333, "epoch": 0.8880597014925373, "grad_norm": 0.10751146823167801, "learning_rate": 0.0002, "loss": 0.5591222047805786, "mean_token_accuracy": 0.7737416923046112, "num_tokens": 3890590.0, "step": 238 }, { "entropy": 0.5473610609769821, "epoch": 0.8917910447761194, "grad_norm": 0.156968355178833, "learning_rate": 0.0002, "loss": 0.5408577919006348, "mean_token_accuracy": 0.7787807583808899, "num_tokens": 3906796.0, "step": 239 }, { "entropy": 0.5521116256713867, "epoch": 0.8955223880597015, "grad_norm": 0.1288469135761261, "learning_rate": 0.0002, "loss": 0.549975574016571, "mean_token_accuracy": 0.7787336856126785, "num_tokens": 3923243.0, "step": 240 }, { "entropy": 0.5367736220359802, "epoch": 0.8992537313432836, "grad_norm": 0.15267081558704376, "learning_rate": 0.0002, "loss": 0.5406203269958496, "mean_token_accuracy": 0.7823334783315659, "num_tokens": 3939802.0, "step": 241 }, { "entropy": 0.5384350121021271, "epoch": 0.9029850746268657, "grad_norm": 0.12661150097846985, "learning_rate": 0.0002, "loss": 0.5470013618469238, "mean_token_accuracy": 0.7777878791093826, "num_tokens": 3956169.0, "step": 242 }, { "entropy": 0.534332662820816, "epoch": 0.9067164179104478, "grad_norm": 0.1578921526670456, "learning_rate": 0.0002, "loss": 0.5447706580162048, "mean_token_accuracy": 0.7791011482477188, "num_tokens": 3972588.0, "step": 243 }, { "entropy": 0.5489266514778137, "epoch": 0.9104477611940298, "grad_norm": 0.12818928062915802, "learning_rate": 0.0002, "loss": 0.5481740236282349, "mean_token_accuracy": 0.7786219567060471, "num_tokens": 3988829.0, "step": 244 }, { "entropy": 0.5603043735027313, "epoch": 0.914179104477612, "grad_norm": 0.12620778381824493, "learning_rate": 0.0002, "loss": 0.5473756194114685, "mean_token_accuracy": 0.7766416519880295, "num_tokens": 4005147.0, "step": 245 }, { "entropy": 0.5429242998361588, "epoch": 0.917910447761194, "grad_norm": 0.12476211786270142, "learning_rate": 0.0002, "loss": 0.5349637269973755, "mean_token_accuracy": 0.7825885117053986, "num_tokens": 4021414.0, "step": 246 }, { "entropy": 0.5483033657073975, "epoch": 0.9216417910447762, "grad_norm": 0.12620662152767181, "learning_rate": 0.0002, "loss": 0.5528666973114014, "mean_token_accuracy": 0.7761824727058411, "num_tokens": 4038127.0, "step": 247 }, { "entropy": 0.5366939753293991, "epoch": 0.9253731343283582, "grad_norm": 0.14575915038585663, "learning_rate": 0.0002, "loss": 0.5463760495185852, "mean_token_accuracy": 0.7789819538593292, "num_tokens": 4054823.0, "step": 248 }, { "entropy": 0.5289286822080612, "epoch": 0.9291044776119403, "grad_norm": 0.13227254152297974, "learning_rate": 0.0002, "loss": 0.5342484712600708, "mean_token_accuracy": 0.7823342829942703, "num_tokens": 4071168.0, "step": 249 }, { "entropy": 0.5574782639741898, "epoch": 0.9328358208955224, "grad_norm": 0.11694958060979843, "learning_rate": 0.0002, "loss": 0.555205225944519, "mean_token_accuracy": 0.7752824872732162, "num_tokens": 4087486.0, "step": 250 }, { "entropy": 0.5487115234136581, "epoch": 0.9365671641791045, "grad_norm": 0.12190678715705872, "learning_rate": 0.0002, "loss": 0.5393535494804382, "mean_token_accuracy": 0.7831632941961288, "num_tokens": 4103816.0, "step": 251 }, { "entropy": 0.559577152132988, "epoch": 0.9402985074626866, "grad_norm": 0.17028383910655975, "learning_rate": 0.0002, "loss": 0.5525080561637878, "mean_token_accuracy": 0.7758573293685913, "num_tokens": 4120222.0, "step": 252 }, { "entropy": 0.5650424063205719, "epoch": 0.9440298507462687, "grad_norm": 0.11132688075304031, "learning_rate": 0.0002, "loss": 0.5637966394424438, "mean_token_accuracy": 0.7707894593477249, "num_tokens": 4136652.0, "step": 253 }, { "entropy": 0.5160737410187721, "epoch": 0.9477611940298507, "grad_norm": 0.15931887924671173, "learning_rate": 0.0002, "loss": 0.5282326936721802, "mean_token_accuracy": 0.7854665815830231, "num_tokens": 4152947.0, "step": 254 }, { "entropy": 0.537076398730278, "epoch": 0.9514925373134329, "grad_norm": 0.12814630568027496, "learning_rate": 0.0002, "loss": 0.5451772809028625, "mean_token_accuracy": 0.7802058607339859, "num_tokens": 4169503.0, "step": 255 }, { "entropy": 0.5342639088630676, "epoch": 0.9552238805970149, "grad_norm": 0.1517118364572525, "learning_rate": 0.0002, "loss": 0.5411078333854675, "mean_token_accuracy": 0.7786644250154495, "num_tokens": 4185621.0, "step": 256 }, { "entropy": 0.5415196269750595, "epoch": 0.9589552238805971, "grad_norm": 0.1379823535680771, "learning_rate": 0.0002, "loss": 0.5376235842704773, "mean_token_accuracy": 0.782574325799942, "num_tokens": 4201870.0, "step": 257 }, { "entropy": 0.5464203655719757, "epoch": 0.9626865671641791, "grad_norm": 0.11068425327539444, "learning_rate": 0.0002, "loss": 0.5408488512039185, "mean_token_accuracy": 0.780770868062973, "num_tokens": 4218151.0, "step": 258 }, { "entropy": 0.5458406358957291, "epoch": 0.9664179104477612, "grad_norm": 0.12213952839374542, "learning_rate": 0.0002, "loss": 0.5443609952926636, "mean_token_accuracy": 0.7778299003839493, "num_tokens": 4234366.0, "step": 259 }, { "entropy": 0.5463070273399353, "epoch": 0.9701492537313433, "grad_norm": 0.13273894786834717, "learning_rate": 0.0002, "loss": 0.5463058948516846, "mean_token_accuracy": 0.7797796875238419, "num_tokens": 4250736.0, "step": 260 }, { "entropy": 0.5530222281813622, "epoch": 0.9738805970149254, "grad_norm": 0.1269286721944809, "learning_rate": 0.0002, "loss": 0.5598427057266235, "mean_token_accuracy": 0.7720119059085846, "num_tokens": 4267145.0, "step": 261 }, { "entropy": 0.5307595282793045, "epoch": 0.9776119402985075, "grad_norm": 0.15041397511959076, "learning_rate": 0.0002, "loss": 0.5379044413566589, "mean_token_accuracy": 0.7826298028230667, "num_tokens": 4283482.0, "step": 262 }, { "entropy": 0.5570843815803528, "epoch": 0.9813432835820896, "grad_norm": 0.11555695533752441, "learning_rate": 0.0002, "loss": 0.5584969520568848, "mean_token_accuracy": 0.7722631692886353, "num_tokens": 4300006.0, "step": 263 }, { "entropy": 0.5427989065647125, "epoch": 0.9850746268656716, "grad_norm": 0.11381992697715759, "learning_rate": 0.0002, "loss": 0.5401906967163086, "mean_token_accuracy": 0.7819131314754486, "num_tokens": 4316285.0, "step": 264 }, { "entropy": 0.5244657546281815, "epoch": 0.9888059701492538, "grad_norm": 0.12954184412956238, "learning_rate": 0.0002, "loss": 0.5230352282524109, "mean_token_accuracy": 0.7875886708498001, "num_tokens": 4332644.0, "step": 265 }, { "entropy": 0.5411987751722336, "epoch": 0.9925373134328358, "grad_norm": 0.12008430063724518, "learning_rate": 0.0002, "loss": 0.5408762097358704, "mean_token_accuracy": 0.7805971801280975, "num_tokens": 4349014.0, "step": 266 }, { "entropy": 0.537212684750557, "epoch": 0.996268656716418, "grad_norm": 0.13956718146800995, "learning_rate": 0.0002, "loss": 0.5449704527854919, "mean_token_accuracy": 0.7769150733947754, "num_tokens": 4365397.0, "step": 267 }, { "entropy": 0.5412362664937973, "epoch": 1.0, "grad_norm": 0.11382853984832764, "learning_rate": 0.0002, "loss": 0.5392265319824219, "mean_token_accuracy": 0.7833839505910873, "num_tokens": 4381834.0, "step": 268 }, { "entropy": 0.5496137291193008, "epoch": 1.0037313432835822, "grad_norm": 0.14231012761592865, "learning_rate": 0.0002, "loss": 0.5489864945411682, "mean_token_accuracy": 0.7766753733158112, "num_tokens": 4398074.0, "step": 269 }, { "entropy": 0.5562388151884079, "epoch": 1.007462686567164, "grad_norm": 0.14497025310993195, "learning_rate": 0.0002, "loss": 0.5603899359703064, "mean_token_accuracy": 0.7735977172851562, "num_tokens": 4414424.0, "step": 270 }, { "entropy": 0.5293630063533783, "epoch": 1.0111940298507462, "grad_norm": 0.12251973897218704, "learning_rate": 0.0002, "loss": 0.5230416059494019, "mean_token_accuracy": 0.7859042882919312, "num_tokens": 4430738.0, "step": 271 }, { "entropy": 0.5297266095876694, "epoch": 1.0149253731343284, "grad_norm": 0.12865795195102692, "learning_rate": 0.0002, "loss": 0.5318350195884705, "mean_token_accuracy": 0.7831861972808838, "num_tokens": 4446854.0, "step": 272 }, { "entropy": 0.5223220437765121, "epoch": 1.0186567164179103, "grad_norm": 0.1494293063879013, "learning_rate": 0.0002, "loss": 0.5327814221382141, "mean_token_accuracy": 0.7832103371620178, "num_tokens": 4463067.0, "step": 273 }, { "entropy": 0.5113897025585175, "epoch": 1.0223880597014925, "grad_norm": 0.11985855549573898, "learning_rate": 0.0002, "loss": 0.5085136890411377, "mean_token_accuracy": 0.7943005859851837, "num_tokens": 4479208.0, "step": 274 }, { "entropy": 0.5331714898347855, "epoch": 1.0261194029850746, "grad_norm": 0.11615335196256638, "learning_rate": 0.0002, "loss": 0.5315767526626587, "mean_token_accuracy": 0.7823154479265213, "num_tokens": 4495400.0, "step": 275 }, { "entropy": 0.5418258756399155, "epoch": 1.0298507462686568, "grad_norm": 0.12503200769424438, "learning_rate": 0.0002, "loss": 0.5371681451797485, "mean_token_accuracy": 0.7810330092906952, "num_tokens": 4511712.0, "step": 276 }, { "entropy": 0.5291843414306641, "epoch": 1.0335820895522387, "grad_norm": 0.12552055716514587, "learning_rate": 0.0002, "loss": 0.5229098796844482, "mean_token_accuracy": 0.7861831933259964, "num_tokens": 4527757.0, "step": 277 }, { "entropy": 0.5402754694223404, "epoch": 1.037313432835821, "grad_norm": 0.12993621826171875, "learning_rate": 0.0002, "loss": 0.5389543771743774, "mean_token_accuracy": 0.782686859369278, "num_tokens": 4544172.0, "step": 278 }, { "entropy": 0.5249762684106827, "epoch": 1.041044776119403, "grad_norm": 0.1478368192911148, "learning_rate": 0.0002, "loss": 0.5288144946098328, "mean_token_accuracy": 0.7870309799909592, "num_tokens": 4560317.0, "step": 279 }, { "entropy": 0.5261744558811188, "epoch": 1.044776119402985, "grad_norm": 0.12392111867666245, "learning_rate": 0.0002, "loss": 0.5337116122245789, "mean_token_accuracy": 0.7859398722648621, "num_tokens": 4576552.0, "step": 280 }, { "entropy": 0.5196933448314667, "epoch": 1.0485074626865671, "grad_norm": 0.13088668882846832, "learning_rate": 0.0002, "loss": 0.5231020450592041, "mean_token_accuracy": 0.7892478257417679, "num_tokens": 4592581.0, "step": 281 }, { "entropy": 0.530863881111145, "epoch": 1.0522388059701493, "grad_norm": 0.12411776930093765, "learning_rate": 0.0002, "loss": 0.521477460861206, "mean_token_accuracy": 0.7883302420377731, "num_tokens": 4609148.0, "step": 282 }, { "entropy": 0.5147035792469978, "epoch": 1.0559701492537314, "grad_norm": 0.11664963513612747, "learning_rate": 0.0002, "loss": 0.5152803063392639, "mean_token_accuracy": 0.7897714674472809, "num_tokens": 4625339.0, "step": 283 }, { "entropy": 0.5241324007511139, "epoch": 1.0597014925373134, "grad_norm": 0.12206321954727173, "learning_rate": 0.0002, "loss": 0.5279011726379395, "mean_token_accuracy": 0.7872984111309052, "num_tokens": 4641602.0, "step": 284 }, { "entropy": 0.5386586785316467, "epoch": 1.0634328358208955, "grad_norm": 0.15844044089317322, "learning_rate": 0.0002, "loss": 0.5462183356285095, "mean_token_accuracy": 0.7776554077863693, "num_tokens": 4657935.0, "step": 285 }, { "entropy": 0.5212236195802689, "epoch": 1.0671641791044777, "grad_norm": 0.12227971851825714, "learning_rate": 0.0002, "loss": 0.524368941783905, "mean_token_accuracy": 0.7889244109392166, "num_tokens": 4674375.0, "step": 286 }, { "entropy": 0.5298297703266144, "epoch": 1.0708955223880596, "grad_norm": 0.11141645163297653, "learning_rate": 0.0002, "loss": 0.5300790667533875, "mean_token_accuracy": 0.7826484590768814, "num_tokens": 4690771.0, "step": 287 }, { "entropy": 0.5472451746463776, "epoch": 1.0746268656716418, "grad_norm": 0.12320703268051147, "learning_rate": 0.0002, "loss": 0.5423391461372375, "mean_token_accuracy": 0.780271515250206, "num_tokens": 4707429.0, "step": 288 }, { "entropy": 0.5120319426059723, "epoch": 1.078358208955224, "grad_norm": 0.12205273658037186, "learning_rate": 0.0002, "loss": 0.5049785375595093, "mean_token_accuracy": 0.7964775711297989, "num_tokens": 4723707.0, "step": 289 }, { "entropy": 0.5228906571865082, "epoch": 1.0820895522388059, "grad_norm": 0.14154046773910522, "learning_rate": 0.0002, "loss": 0.5207747220993042, "mean_token_accuracy": 0.7898598164319992, "num_tokens": 4739904.0, "step": 290 }, { "entropy": 0.522852934896946, "epoch": 1.085820895522388, "grad_norm": 0.12813158333301544, "learning_rate": 0.0002, "loss": 0.5229586958885193, "mean_token_accuracy": 0.7879058122634888, "num_tokens": 4756146.0, "step": 291 }, { "entropy": 0.5175448060035706, "epoch": 1.0895522388059702, "grad_norm": 0.1693999171257019, "learning_rate": 0.0002, "loss": 0.5333408713340759, "mean_token_accuracy": 0.7839324027299881, "num_tokens": 4772324.0, "step": 292 }, { "entropy": 0.5461927354335785, "epoch": 1.0932835820895523, "grad_norm": 0.1190054640173912, "learning_rate": 0.0002, "loss": 0.545452892780304, "mean_token_accuracy": 0.7791879326105118, "num_tokens": 4788838.0, "step": 293 }, { "entropy": 0.5367765128612518, "epoch": 1.0970149253731343, "grad_norm": 0.160573810338974, "learning_rate": 0.0002, "loss": 0.5323442816734314, "mean_token_accuracy": 0.7844058275222778, "num_tokens": 4805260.0, "step": 294 }, { "entropy": 0.5449754297733307, "epoch": 1.1007462686567164, "grad_norm": 0.13656781613826752, "learning_rate": 0.0002, "loss": 0.5343316793441772, "mean_token_accuracy": 0.786631390452385, "num_tokens": 4821651.0, "step": 295 }, { "entropy": 0.539639413356781, "epoch": 1.1044776119402986, "grad_norm": 0.15722377598285675, "learning_rate": 0.0002, "loss": 0.537823498249054, "mean_token_accuracy": 0.7838342785835266, "num_tokens": 4838086.0, "step": 296 }, { "entropy": 0.5071177557110786, "epoch": 1.1082089552238805, "grad_norm": 0.13242004811763763, "learning_rate": 0.0002, "loss": 0.519379198551178, "mean_token_accuracy": 0.790022000670433, "num_tokens": 4854421.0, "step": 297 }, { "entropy": 0.5327034294605255, "epoch": 1.1119402985074627, "grad_norm": 0.21717894077301025, "learning_rate": 0.0002, "loss": 0.5451952815055847, "mean_token_accuracy": 0.7793966829776764, "num_tokens": 4870862.0, "step": 298 }, { "entropy": 0.5120953842997551, "epoch": 1.1156716417910448, "grad_norm": 0.11570360511541367, "learning_rate": 0.0002, "loss": 0.5137699246406555, "mean_token_accuracy": 0.7910549491643906, "num_tokens": 4887047.0, "step": 299 }, { "entropy": 0.5416189283132553, "epoch": 1.1194029850746268, "grad_norm": 0.15835031867027283, "learning_rate": 0.0002, "loss": 0.5377160310745239, "mean_token_accuracy": 0.7817842811346054, "num_tokens": 4903770.0, "step": 300 }, { "entropy": 0.5600537657737732, "epoch": 1.123134328358209, "grad_norm": 0.16074593365192413, "learning_rate": 0.0002, "loss": 0.5558266043663025, "mean_token_accuracy": 0.7756943106651306, "num_tokens": 4920314.0, "step": 301 }, { "entropy": 0.5424332320690155, "epoch": 1.126865671641791, "grad_norm": 0.13547547161579132, "learning_rate": 0.0002, "loss": 0.5412736535072327, "mean_token_accuracy": 0.7802875488996506, "num_tokens": 4936795.0, "step": 302 }, { "entropy": 0.5479728579521179, "epoch": 1.1305970149253732, "grad_norm": 0.17388752102851868, "learning_rate": 0.0002, "loss": 0.5473156571388245, "mean_token_accuracy": 0.7779090404510498, "num_tokens": 4953215.0, "step": 303 }, { "entropy": 0.5354913771152496, "epoch": 1.1343283582089552, "grad_norm": 0.12070244550704956, "learning_rate": 0.0002, "loss": 0.5346955060958862, "mean_token_accuracy": 0.7821491658687592, "num_tokens": 4969473.0, "step": 304 }, { "entropy": 0.5357395708560944, "epoch": 1.1380597014925373, "grad_norm": 0.1695796698331833, "learning_rate": 0.0002, "loss": 0.5382478833198547, "mean_token_accuracy": 0.7825665175914764, "num_tokens": 4985892.0, "step": 305 }, { "entropy": 0.5406463518738747, "epoch": 1.1417910447761195, "grad_norm": 0.13278549909591675, "learning_rate": 0.0002, "loss": 0.5439954996109009, "mean_token_accuracy": 0.781127467751503, "num_tokens": 5002244.0, "step": 306 }, { "entropy": 0.5423679053783417, "epoch": 1.1455223880597014, "grad_norm": 0.1525002419948578, "learning_rate": 0.0002, "loss": 0.5506120324134827, "mean_token_accuracy": 0.7751760631799698, "num_tokens": 5018518.0, "step": 307 }, { "entropy": 0.5409325361251831, "epoch": 1.1492537313432836, "grad_norm": 0.1641884595155716, "learning_rate": 0.0002, "loss": 0.5398315787315369, "mean_token_accuracy": 0.7811702787876129, "num_tokens": 5034880.0, "step": 308 }, { "entropy": 0.527726948261261, "epoch": 1.1529850746268657, "grad_norm": 0.13098926842212677, "learning_rate": 0.0002, "loss": 0.5239942669868469, "mean_token_accuracy": 0.7863958179950714, "num_tokens": 5051492.0, "step": 309 }, { "entropy": 0.5603475868701935, "epoch": 1.1567164179104479, "grad_norm": 0.17059364914894104, "learning_rate": 0.0002, "loss": 0.5537184476852417, "mean_token_accuracy": 0.7751886546611786, "num_tokens": 5067902.0, "step": 310 }, { "entropy": 0.522188276052475, "epoch": 1.1604477611940298, "grad_norm": 0.14454245567321777, "learning_rate": 0.0002, "loss": 0.5286940932273865, "mean_token_accuracy": 0.7850693166255951, "num_tokens": 5084221.0, "step": 311 }, { "entropy": 0.5343948155641556, "epoch": 1.164179104477612, "grad_norm": 0.13227348029613495, "learning_rate": 0.0002, "loss": 0.5384489297866821, "mean_token_accuracy": 0.7807275205850601, "num_tokens": 5100663.0, "step": 312 }, { "entropy": 0.5275873988866806, "epoch": 1.1679104477611941, "grad_norm": 0.1753464788198471, "learning_rate": 0.0002, "loss": 0.5382294058799744, "mean_token_accuracy": 0.7828755527734756, "num_tokens": 5117302.0, "step": 313 }, { "entropy": 0.5497360378503799, "epoch": 1.171641791044776, "grad_norm": 0.13286371529102325, "learning_rate": 0.0002, "loss": 0.5496618151664734, "mean_token_accuracy": 0.7774941623210907, "num_tokens": 5133769.0, "step": 314 }, { "entropy": 0.532920241355896, "epoch": 1.1753731343283582, "grad_norm": 0.15036581456661224, "learning_rate": 0.0002, "loss": 0.5245468020439148, "mean_token_accuracy": 0.7888032495975494, "num_tokens": 5150119.0, "step": 315 }, { "entropy": 0.5440064817667007, "epoch": 1.1791044776119404, "grad_norm": 0.13510671257972717, "learning_rate": 0.0002, "loss": 0.5358728170394897, "mean_token_accuracy": 0.7828054130077362, "num_tokens": 5166721.0, "step": 316 }, { "entropy": 0.5312670171260834, "epoch": 1.1828358208955223, "grad_norm": 0.11371396481990814, "learning_rate": 0.0002, "loss": 0.5337090492248535, "mean_token_accuracy": 0.7806256115436554, "num_tokens": 5182960.0, "step": 317 }, { "entropy": 0.5359569638967514, "epoch": 1.1865671641791045, "grad_norm": 0.1442011594772339, "learning_rate": 0.0002, "loss": 0.5444678068161011, "mean_token_accuracy": 0.7807507514953613, "num_tokens": 5199188.0, "step": 318 }, { "entropy": 0.5328075140714645, "epoch": 1.1902985074626866, "grad_norm": 0.14832444489002228, "learning_rate": 0.0002, "loss": 0.5382975339889526, "mean_token_accuracy": 0.7805762439966202, "num_tokens": 5215650.0, "step": 319 }, { "entropy": 0.5216325521469116, "epoch": 1.1940298507462686, "grad_norm": 0.14424221217632294, "learning_rate": 0.0002, "loss": 0.5250576734542847, "mean_token_accuracy": 0.7859031856060028, "num_tokens": 5231820.0, "step": 320 }, { "entropy": 0.5351075977087021, "epoch": 1.1977611940298507, "grad_norm": 0.14221367239952087, "learning_rate": 0.0002, "loss": 0.5295757055282593, "mean_token_accuracy": 0.7862369567155838, "num_tokens": 5248279.0, "step": 321 }, { "entropy": 0.5397693365812302, "epoch": 1.2014925373134329, "grad_norm": 0.13292263448238373, "learning_rate": 0.0002, "loss": 0.5341707468032837, "mean_token_accuracy": 0.7843815088272095, "num_tokens": 5264712.0, "step": 322 }, { "entropy": 0.5192128270864487, "epoch": 1.205223880597015, "grad_norm": 0.14713309705257416, "learning_rate": 0.0002, "loss": 0.5247495770454407, "mean_token_accuracy": 0.7879969924688339, "num_tokens": 5280975.0, "step": 323 }, { "entropy": 0.542580246925354, "epoch": 1.208955223880597, "grad_norm": 0.1425526738166809, "learning_rate": 0.0002, "loss": 0.5457293391227722, "mean_token_accuracy": 0.7779300808906555, "num_tokens": 5297373.0, "step": 324 }, { "entropy": 0.51340202242136, "epoch": 1.212686567164179, "grad_norm": 0.13574931025505066, "learning_rate": 0.0002, "loss": 0.5158831477165222, "mean_token_accuracy": 0.7899662852287292, "num_tokens": 5313524.0, "step": 325 }, { "entropy": 0.5239507853984833, "epoch": 1.2164179104477613, "grad_norm": 0.1242108941078186, "learning_rate": 0.0002, "loss": 0.5264536142349243, "mean_token_accuracy": 0.7876432240009308, "num_tokens": 5330035.0, "step": 326 }, { "entropy": 0.5461296737194061, "epoch": 1.2201492537313432, "grad_norm": 0.13526761531829834, "learning_rate": 0.0002, "loss": 0.5456458330154419, "mean_token_accuracy": 0.7787662595510483, "num_tokens": 5346713.0, "step": 327 }, { "entropy": 0.5285127460956573, "epoch": 1.2238805970149254, "grad_norm": 0.1288863569498062, "learning_rate": 0.0002, "loss": 0.5286239385604858, "mean_token_accuracy": 0.7839469760656357, "num_tokens": 5362892.0, "step": 328 }, { "entropy": 0.5281976014375687, "epoch": 1.2276119402985075, "grad_norm": 0.15830843150615692, "learning_rate": 0.0002, "loss": 0.5338830351829529, "mean_token_accuracy": 0.7864977121353149, "num_tokens": 5379105.0, "step": 329 }, { "entropy": 0.537989154458046, "epoch": 1.2313432835820897, "grad_norm": 0.14264224469661713, "learning_rate": 0.0002, "loss": 0.5378222465515137, "mean_token_accuracy": 0.7845461368560791, "num_tokens": 5395557.0, "step": 330 }, { "entropy": 0.5446864664554596, "epoch": 1.2350746268656716, "grad_norm": 0.15385743975639343, "learning_rate": 0.0002, "loss": 0.5452708005905151, "mean_token_accuracy": 0.7787858992815018, "num_tokens": 5411870.0, "step": 331 }, { "entropy": 0.5162093490362167, "epoch": 1.2388059701492538, "grad_norm": 0.13330549001693726, "learning_rate": 0.0002, "loss": 0.5179134607315063, "mean_token_accuracy": 0.7886767089366913, "num_tokens": 5428174.0, "step": 332 }, { "entropy": 0.5166965126991272, "epoch": 1.242537313432836, "grad_norm": 0.13044792413711548, "learning_rate": 0.0002, "loss": 0.5149925947189331, "mean_token_accuracy": 0.7877358198165894, "num_tokens": 5444504.0, "step": 333 }, { "entropy": 0.5293487906455994, "epoch": 1.2462686567164178, "grad_norm": 0.15583756566047668, "learning_rate": 0.0002, "loss": 0.5320658087730408, "mean_token_accuracy": 0.7861583828926086, "num_tokens": 5460813.0, "step": 334 }, { "entropy": 0.5320923030376434, "epoch": 1.25, "grad_norm": 0.12959426641464233, "learning_rate": 0.0002, "loss": 0.5345736145973206, "mean_token_accuracy": 0.7825423777103424, "num_tokens": 5477333.0, "step": 335 }, { "entropy": 0.5326530635356903, "epoch": 1.2537313432835822, "grad_norm": 0.15951137244701385, "learning_rate": 0.0002, "loss": 0.5311124920845032, "mean_token_accuracy": 0.7841883301734924, "num_tokens": 5493735.0, "step": 336 }, { "entropy": 0.544501468539238, "epoch": 1.2574626865671643, "grad_norm": 0.12288819998502731, "learning_rate": 0.0002, "loss": 0.5451238751411438, "mean_token_accuracy": 0.7775899171829224, "num_tokens": 5510068.0, "step": 337 }, { "entropy": 0.5330418646335602, "epoch": 1.2611940298507462, "grad_norm": 0.13410672545433044, "learning_rate": 0.0002, "loss": 0.535346269607544, "mean_token_accuracy": 0.7835884392261505, "num_tokens": 5526452.0, "step": 338 }, { "entropy": 0.5434266775846481, "epoch": 1.2649253731343284, "grad_norm": 0.13076815009117126, "learning_rate": 0.0002, "loss": 0.5440234541893005, "mean_token_accuracy": 0.7821687757968903, "num_tokens": 5542951.0, "step": 339 }, { "entropy": 0.5151484906673431, "epoch": 1.2686567164179103, "grad_norm": 0.12828661501407623, "learning_rate": 0.0002, "loss": 0.5160608887672424, "mean_token_accuracy": 0.791755273938179, "num_tokens": 5559086.0, "step": 340 }, { "entropy": 0.5275644734501839, "epoch": 1.2723880597014925, "grad_norm": 0.13408422470092773, "learning_rate": 0.0002, "loss": 0.5317025184631348, "mean_token_accuracy": 0.7861050963401794, "num_tokens": 5575521.0, "step": 341 }, { "entropy": 0.5177630484104156, "epoch": 1.2761194029850746, "grad_norm": 0.12419670075178146, "learning_rate": 0.0002, "loss": 0.5191144347190857, "mean_token_accuracy": 0.7892575412988663, "num_tokens": 5591947.0, "step": 342 }, { "entropy": 0.5407169461250305, "epoch": 1.2798507462686568, "grad_norm": 0.1364241987466812, "learning_rate": 0.0002, "loss": 0.5430530309677124, "mean_token_accuracy": 0.779339611530304, "num_tokens": 5608447.0, "step": 343 }, { "entropy": 0.5262736082077026, "epoch": 1.2835820895522387, "grad_norm": 0.15587468445301056, "learning_rate": 0.0002, "loss": 0.5301055312156677, "mean_token_accuracy": 0.7836160659790039, "num_tokens": 5625044.0, "step": 344 }, { "entropy": 0.5458462238311768, "epoch": 1.287313432835821, "grad_norm": 0.13173708319664001, "learning_rate": 0.0002, "loss": 0.5517262816429138, "mean_token_accuracy": 0.7764803022146225, "num_tokens": 5641335.0, "step": 345 }, { "entropy": 0.5216450393199921, "epoch": 1.291044776119403, "grad_norm": 0.17484262585639954, "learning_rate": 0.0002, "loss": 0.5218112468719482, "mean_token_accuracy": 0.7843209207057953, "num_tokens": 5657347.0, "step": 346 }, { "entropy": 0.5498285889625549, "epoch": 1.294776119402985, "grad_norm": 0.12871748208999634, "learning_rate": 0.0002, "loss": 0.5382349491119385, "mean_token_accuracy": 0.7812492400407791, "num_tokens": 5673588.0, "step": 347 }, { "entropy": 0.5317611545324326, "epoch": 1.2985074626865671, "grad_norm": 0.15342608094215393, "learning_rate": 0.0002, "loss": 0.5276378989219666, "mean_token_accuracy": 0.7836941033601761, "num_tokens": 5689687.0, "step": 348 }, { "entropy": 0.5218729674816132, "epoch": 1.3022388059701493, "grad_norm": 0.1535658985376358, "learning_rate": 0.0002, "loss": 0.5265159606933594, "mean_token_accuracy": 0.7863410115242004, "num_tokens": 5705883.0, "step": 349 }, { "entropy": 0.5283405184745789, "epoch": 1.3059701492537314, "grad_norm": 0.1400662213563919, "learning_rate": 0.0002, "loss": 0.5348565578460693, "mean_token_accuracy": 0.7835897505283356, "num_tokens": 5722396.0, "step": 350 }, { "entropy": 0.5465448051691055, "epoch": 1.3097014925373134, "grad_norm": 0.1789598912000656, "learning_rate": 0.0002, "loss": 0.5508973002433777, "mean_token_accuracy": 0.7770535051822662, "num_tokens": 5738946.0, "step": 351 }, { "entropy": 0.5288202613592148, "epoch": 1.3134328358208955, "grad_norm": 0.12526051700115204, "learning_rate": 0.0002, "loss": 0.5298986434936523, "mean_token_accuracy": 0.7855530083179474, "num_tokens": 5755207.0, "step": 352 }, { "entropy": 0.5429712533950806, "epoch": 1.3171641791044777, "grad_norm": 0.12195583432912827, "learning_rate": 0.0002, "loss": 0.5387951731681824, "mean_token_accuracy": 0.7802612334489822, "num_tokens": 5771582.0, "step": 353 }, { "entropy": 0.5358787178993225, "epoch": 1.3208955223880596, "grad_norm": 0.15126559138298035, "learning_rate": 0.0002, "loss": 0.5349993705749512, "mean_token_accuracy": 0.7822433114051819, "num_tokens": 5787967.0, "step": 354 }, { "entropy": 0.5424338132143021, "epoch": 1.3246268656716418, "grad_norm": 0.1308310180902481, "learning_rate": 0.0002, "loss": 0.5434916615486145, "mean_token_accuracy": 0.7826928794384003, "num_tokens": 5804528.0, "step": 355 }, { "entropy": 0.5337295234203339, "epoch": 1.328358208955224, "grad_norm": 0.16843028366565704, "learning_rate": 0.0002, "loss": 0.5465773344039917, "mean_token_accuracy": 0.777764692902565, "num_tokens": 5820684.0, "step": 356 }, { "entropy": 0.504702128469944, "epoch": 1.332089552238806, "grad_norm": 0.1529076248407364, "learning_rate": 0.0002, "loss": 0.5113453269004822, "mean_token_accuracy": 0.791937530040741, "num_tokens": 5836988.0, "step": 357 }, { "entropy": 0.536053940653801, "epoch": 1.335820895522388, "grad_norm": 0.1379069983959198, "learning_rate": 0.0002, "loss": 0.5389484763145447, "mean_token_accuracy": 0.7813952714204788, "num_tokens": 5853542.0, "step": 358 }, { "entropy": 0.5438119322061539, "epoch": 1.3395522388059702, "grad_norm": 0.12008243054151535, "learning_rate": 0.0002, "loss": 0.5360631346702576, "mean_token_accuracy": 0.7817373275756836, "num_tokens": 5870213.0, "step": 359 }, { "entropy": 0.550885871052742, "epoch": 1.3432835820895521, "grad_norm": 0.13378706574440002, "learning_rate": 0.0002, "loss": 0.54970383644104, "mean_token_accuracy": 0.7768265455961227, "num_tokens": 5886513.0, "step": 360 }, { "entropy": 0.5400225073099136, "epoch": 1.3470149253731343, "grad_norm": 0.13530388474464417, "learning_rate": 0.0002, "loss": 0.5343542098999023, "mean_token_accuracy": 0.782709077000618, "num_tokens": 5903049.0, "step": 361 }, { "entropy": 0.5389147847890854, "epoch": 1.3507462686567164, "grad_norm": 0.12446677684783936, "learning_rate": 0.0002, "loss": 0.5388710498809814, "mean_token_accuracy": 0.781377524137497, "num_tokens": 5919403.0, "step": 362 }, { "entropy": 0.537296935915947, "epoch": 1.3544776119402986, "grad_norm": 0.13781245052814484, "learning_rate": 0.0002, "loss": 0.5438515543937683, "mean_token_accuracy": 0.7785618007183075, "num_tokens": 5935511.0, "step": 363 }, { "entropy": 0.5429168194532394, "epoch": 1.3582089552238805, "grad_norm": 0.13629309833049774, "learning_rate": 0.0002, "loss": 0.5453547239303589, "mean_token_accuracy": 0.7784431874752045, "num_tokens": 5951972.0, "step": 364 }, { "entropy": 0.5427183359861374, "epoch": 1.3619402985074627, "grad_norm": 0.1370571255683899, "learning_rate": 0.0002, "loss": 0.545956552028656, "mean_token_accuracy": 0.7787607759237289, "num_tokens": 5968229.0, "step": 365 }, { "entropy": 0.5378859043121338, "epoch": 1.3656716417910448, "grad_norm": 0.12471959739923477, "learning_rate": 0.0002, "loss": 0.5353823900222778, "mean_token_accuracy": 0.7809005975723267, "num_tokens": 5984669.0, "step": 366 }, { "entropy": 0.5365873426198959, "epoch": 1.3694029850746268, "grad_norm": 0.16501657664775848, "learning_rate": 0.0002, "loss": 0.5319327712059021, "mean_token_accuracy": 0.7824555039405823, "num_tokens": 6001027.0, "step": 367 }, { "entropy": 0.5265276953577995, "epoch": 1.373134328358209, "grad_norm": 0.12363235652446747, "learning_rate": 0.0002, "loss": 0.5210375785827637, "mean_token_accuracy": 0.7883688807487488, "num_tokens": 6017125.0, "step": 368 }, { "entropy": 0.5277390778064728, "epoch": 1.376865671641791, "grad_norm": 0.1423310935497284, "learning_rate": 0.0002, "loss": 0.5316471457481384, "mean_token_accuracy": 0.7828662693500519, "num_tokens": 6033508.0, "step": 369 }, { "entropy": 0.5263610854744911, "epoch": 1.3805970149253732, "grad_norm": 0.1381843090057373, "learning_rate": 0.0002, "loss": 0.5311442613601685, "mean_token_accuracy": 0.7821517586708069, "num_tokens": 6049886.0, "step": 370 }, { "entropy": 0.5286078453063965, "epoch": 1.3843283582089552, "grad_norm": 0.18003322184085846, "learning_rate": 0.0002, "loss": 0.5398144721984863, "mean_token_accuracy": 0.7803981304168701, "num_tokens": 6066120.0, "step": 371 }, { "entropy": 0.5356258824467659, "epoch": 1.3880597014925373, "grad_norm": 0.11802922934293747, "learning_rate": 0.0002, "loss": 0.53504878282547, "mean_token_accuracy": 0.7814585119485855, "num_tokens": 6082732.0, "step": 372 }, { "entropy": 0.5387788712978363, "epoch": 1.3917910447761195, "grad_norm": 0.13874171674251556, "learning_rate": 0.0002, "loss": 0.5358333587646484, "mean_token_accuracy": 0.7825580388307571, "num_tokens": 6099018.0, "step": 373 }, { "entropy": 0.5342960059642792, "epoch": 1.3955223880597014, "grad_norm": 0.1402461677789688, "learning_rate": 0.0002, "loss": 0.5348989963531494, "mean_token_accuracy": 0.7847650349140167, "num_tokens": 6115279.0, "step": 374 }, { "entropy": 0.5361053943634033, "epoch": 1.3992537313432836, "grad_norm": 0.11853493005037308, "learning_rate": 0.0002, "loss": 0.5328879356384277, "mean_token_accuracy": 0.7853472977876663, "num_tokens": 6131854.0, "step": 375 }, { "entropy": 0.5300562530755997, "epoch": 1.4029850746268657, "grad_norm": 0.1642550826072693, "learning_rate": 0.0002, "loss": 0.5330582857131958, "mean_token_accuracy": 0.7824369519948959, "num_tokens": 6148329.0, "step": 376 }, { "entropy": 0.5351111143827438, "epoch": 1.4067164179104479, "grad_norm": 0.13296250998973846, "learning_rate": 0.0002, "loss": 0.5308345556259155, "mean_token_accuracy": 0.7840287983417511, "num_tokens": 6164520.0, "step": 377 }, { "entropy": 0.549595445394516, "epoch": 1.4104477611940298, "grad_norm": 0.11937810480594635, "learning_rate": 0.0002, "loss": 0.5439208745956421, "mean_token_accuracy": 0.7801520526409149, "num_tokens": 6180840.0, "step": 378 }, { "entropy": 0.5249980017542839, "epoch": 1.414179104477612, "grad_norm": 0.14947783946990967, "learning_rate": 0.0002, "loss": 0.5214130878448486, "mean_token_accuracy": 0.7883247882127762, "num_tokens": 6197072.0, "step": 379 }, { "entropy": 0.5341014862060547, "epoch": 1.417910447761194, "grad_norm": 0.14708726108074188, "learning_rate": 0.0002, "loss": 0.5437160730361938, "mean_token_accuracy": 0.7790101766586304, "num_tokens": 6213410.0, "step": 380 }, { "entropy": 0.5305748581886292, "epoch": 1.421641791044776, "grad_norm": 0.15660500526428223, "learning_rate": 0.0002, "loss": 0.538860559463501, "mean_token_accuracy": 0.7808915078639984, "num_tokens": 6229812.0, "step": 381 }, { "entropy": 0.5335244983434677, "epoch": 1.4253731343283582, "grad_norm": 0.14013393223285675, "learning_rate": 0.0002, "loss": 0.5405108332633972, "mean_token_accuracy": 0.7806441932916641, "num_tokens": 6246122.0, "step": 382 }, { "entropy": 0.5370550155639648, "epoch": 1.4291044776119404, "grad_norm": 0.15498457849025726, "learning_rate": 0.0002, "loss": 0.5275038480758667, "mean_token_accuracy": 0.7845180481672287, "num_tokens": 6262400.0, "step": 383 }, { "entropy": 0.5388240739703178, "epoch": 1.4328358208955223, "grad_norm": 0.13547126948833466, "learning_rate": 0.0002, "loss": 0.5339113473892212, "mean_token_accuracy": 0.7817906439304352, "num_tokens": 6278433.0, "step": 384 }, { "entropy": 0.5327373743057251, "epoch": 1.4365671641791045, "grad_norm": 0.15488973259925842, "learning_rate": 0.0002, "loss": 0.536837637424469, "mean_token_accuracy": 0.7805320471525192, "num_tokens": 6294780.0, "step": 385 }, { "entropy": 0.5164054483175278, "epoch": 1.4402985074626866, "grad_norm": 0.13659167289733887, "learning_rate": 0.0002, "loss": 0.5196657180786133, "mean_token_accuracy": 0.7893420159816742, "num_tokens": 6310926.0, "step": 386 }, { "entropy": 0.5441898256540298, "epoch": 1.4440298507462686, "grad_norm": 0.30239349603652954, "learning_rate": 0.0002, "loss": 0.5498929023742676, "mean_token_accuracy": 0.7768156677484512, "num_tokens": 6327465.0, "step": 387 }, { "entropy": 0.5278986096382141, "epoch": 1.4477611940298507, "grad_norm": 0.16996067762374878, "learning_rate": 0.0002, "loss": 0.5285515785217285, "mean_token_accuracy": 0.786761000752449, "num_tokens": 6343503.0, "step": 388 }, { "entropy": 0.508112832903862, "epoch": 1.4514925373134329, "grad_norm": 0.14852264523506165, "learning_rate": 0.0002, "loss": 0.5129667520523071, "mean_token_accuracy": 0.7919276505708694, "num_tokens": 6359667.0, "step": 389 }, { "entropy": 0.5249242335557938, "epoch": 1.455223880597015, "grad_norm": 0.17182905972003937, "learning_rate": 0.0002, "loss": 0.5207914113998413, "mean_token_accuracy": 0.7878070920705795, "num_tokens": 6376114.0, "step": 390 }, { "entropy": 0.5415022522211075, "epoch": 1.458955223880597, "grad_norm": 0.14497698843479156, "learning_rate": 0.0002, "loss": 0.5450653433799744, "mean_token_accuracy": 0.7796677798032761, "num_tokens": 6392417.0, "step": 391 }, { "entropy": 0.5454135686159134, "epoch": 1.462686567164179, "grad_norm": 0.14885719120502472, "learning_rate": 0.0002, "loss": 0.5476389527320862, "mean_token_accuracy": 0.7781424224376678, "num_tokens": 6408701.0, "step": 392 }, { "entropy": 0.5305422842502594, "epoch": 1.4664179104477613, "grad_norm": 0.13111279904842377, "learning_rate": 0.0002, "loss": 0.5283982753753662, "mean_token_accuracy": 0.786282405257225, "num_tokens": 6425186.0, "step": 393 }, { "entropy": 0.519924134016037, "epoch": 1.4701492537313432, "grad_norm": 0.15385456383228302, "learning_rate": 0.0002, "loss": 0.5183860659599304, "mean_token_accuracy": 0.7890526354312897, "num_tokens": 6441474.0, "step": 394 }, { "entropy": 0.5419893115758896, "epoch": 1.4738805970149254, "grad_norm": 0.12959027290344238, "learning_rate": 0.0002, "loss": 0.5391095876693726, "mean_token_accuracy": 0.7845679074525833, "num_tokens": 6458137.0, "step": 395 }, { "entropy": 0.5297622233629227, "epoch": 1.4776119402985075, "grad_norm": 0.12876980006694794, "learning_rate": 0.0002, "loss": 0.5316991209983826, "mean_token_accuracy": 0.783607617020607, "num_tokens": 6474605.0, "step": 396 }, { "entropy": 0.5133326500654221, "epoch": 1.4813432835820897, "grad_norm": 0.23840782046318054, "learning_rate": 0.0002, "loss": 0.5223475098609924, "mean_token_accuracy": 0.7896056026220322, "num_tokens": 6490747.0, "step": 397 }, { "entropy": 0.540631890296936, "epoch": 1.4850746268656716, "grad_norm": 0.18176521360874176, "learning_rate": 0.0002, "loss": 0.5429366230964661, "mean_token_accuracy": 0.7787415534257889, "num_tokens": 6507149.0, "step": 398 }, { "entropy": 0.5534960627555847, "epoch": 1.4888059701492538, "grad_norm": 0.38266992568969727, "learning_rate": 0.0002, "loss": 0.5652564764022827, "mean_token_accuracy": 0.7736776769161224, "num_tokens": 6523502.0, "step": 399 }, { "entropy": 0.5438710153102875, "epoch": 1.4925373134328357, "grad_norm": 0.15845677256584167, "learning_rate": 0.0002, "loss": 0.5439051985740662, "mean_token_accuracy": 0.7816531956195831, "num_tokens": 6539815.0, "step": 400 }, { "entropy": 0.5452860891819, "epoch": 1.4962686567164178, "grad_norm": 0.19755159318447113, "learning_rate": 0.0002, "loss": 0.5404053926467896, "mean_token_accuracy": 0.7815948128700256, "num_tokens": 6555976.0, "step": 401 }, { "entropy": 0.5241969153285027, "epoch": 1.5, "grad_norm": 0.14966075122356415, "learning_rate": 0.0002, "loss": 0.5205419063568115, "mean_token_accuracy": 0.7888282835483551, "num_tokens": 6572116.0, "step": 402 }, { "entropy": 0.5179315954446793, "epoch": 1.5037313432835822, "grad_norm": 0.15208128094673157, "learning_rate": 0.0002, "loss": 0.5195380449295044, "mean_token_accuracy": 0.7901398837566376, "num_tokens": 6588360.0, "step": 403 }, { "entropy": 0.5443613976240158, "epoch": 1.5074626865671643, "grad_norm": 0.15764807164669037, "learning_rate": 0.0002, "loss": 0.5409551858901978, "mean_token_accuracy": 0.7817244678735733, "num_tokens": 6604909.0, "step": 404 }, { "entropy": 0.5555933266878128, "epoch": 1.5111940298507462, "grad_norm": 0.15518265962600708, "learning_rate": 0.0002, "loss": 0.5575823187828064, "mean_token_accuracy": 0.7727370858192444, "num_tokens": 6621271.0, "step": 405 }, { "entropy": 0.5448516458272934, "epoch": 1.5149253731343284, "grad_norm": 0.13999900221824646, "learning_rate": 0.0002, "loss": 0.5443175435066223, "mean_token_accuracy": 0.7797447293996811, "num_tokens": 6637394.0, "step": 406 }, { "entropy": 0.5633855164051056, "epoch": 1.5186567164179103, "grad_norm": 0.12512464821338654, "learning_rate": 0.0002, "loss": 0.5552009344100952, "mean_token_accuracy": 0.7740202099084854, "num_tokens": 6653670.0, "step": 407 }, { "entropy": 0.5442499816417694, "epoch": 1.5223880597014925, "grad_norm": 0.13073165714740753, "learning_rate": 0.0002, "loss": 0.5353500843048096, "mean_token_accuracy": 0.7859338223934174, "num_tokens": 6670329.0, "step": 408 }, { "entropy": 0.5133479535579681, "epoch": 1.5261194029850746, "grad_norm": 0.1424253284931183, "learning_rate": 0.0002, "loss": 0.5181159377098083, "mean_token_accuracy": 0.791978657245636, "num_tokens": 6686590.0, "step": 409 }, { "entropy": 0.5216629430651665, "epoch": 1.5298507462686568, "grad_norm": 0.15952785313129425, "learning_rate": 0.0002, "loss": 0.5411725640296936, "mean_token_accuracy": 0.7812029272317886, "num_tokens": 6702970.0, "step": 410 }, { "entropy": 0.5392735451459885, "epoch": 1.533582089552239, "grad_norm": 0.13047060370445251, "learning_rate": 0.0002, "loss": 0.5485432147979736, "mean_token_accuracy": 0.7774497866630554, "num_tokens": 6719627.0, "step": 411 }, { "entropy": 0.5269859135150909, "epoch": 1.537313432835821, "grad_norm": 0.13100764155387878, "learning_rate": 0.0002, "loss": 0.5288376212120056, "mean_token_accuracy": 0.7857958972454071, "num_tokens": 6735951.0, "step": 412 }, { "entropy": 0.546154260635376, "epoch": 1.5410447761194028, "grad_norm": 0.13160941004753113, "learning_rate": 0.0002, "loss": 0.5382481813430786, "mean_token_accuracy": 0.7786583751440048, "num_tokens": 6752564.0, "step": 413 }, { "entropy": 0.552439495921135, "epoch": 1.544776119402985, "grad_norm": 0.13911442458629608, "learning_rate": 0.0002, "loss": 0.5381487011909485, "mean_token_accuracy": 0.782607913017273, "num_tokens": 6768993.0, "step": 414 }, { "entropy": 0.5463637262582779, "epoch": 1.5485074626865671, "grad_norm": 0.12377088516950607, "learning_rate": 0.0002, "loss": 0.5482580661773682, "mean_token_accuracy": 0.7775403410196304, "num_tokens": 6785304.0, "step": 415 }, { "entropy": 0.5309856235980988, "epoch": 1.5522388059701493, "grad_norm": 0.14743956923484802, "learning_rate": 0.0002, "loss": 0.5372556447982788, "mean_token_accuracy": 0.7811425626277924, "num_tokens": 6801545.0, "step": 416 }, { "entropy": 0.5256488621234894, "epoch": 1.5559701492537314, "grad_norm": 0.13745813071727753, "learning_rate": 0.0002, "loss": 0.5335954427719116, "mean_token_accuracy": 0.7857853770256042, "num_tokens": 6817793.0, "step": 417 }, { "entropy": 0.5426470339298248, "epoch": 1.5597014925373134, "grad_norm": 0.15729817748069763, "learning_rate": 0.0002, "loss": 0.5557945966720581, "mean_token_accuracy": 0.7755606323480606, "num_tokens": 6834171.0, "step": 418 }, { "entropy": 0.5429180264472961, "epoch": 1.5634328358208955, "grad_norm": 0.1530143916606903, "learning_rate": 0.0002, "loss": 0.5445144176483154, "mean_token_accuracy": 0.7793177515268326, "num_tokens": 6850298.0, "step": 419 }, { "entropy": 0.5458863228559494, "epoch": 1.5671641791044775, "grad_norm": 0.1244051530957222, "learning_rate": 0.0002, "loss": 0.5383530855178833, "mean_token_accuracy": 0.7812670916318893, "num_tokens": 6866891.0, "step": 420 }, { "entropy": 0.564603790640831, "epoch": 1.5708955223880596, "grad_norm": 0.14283782243728638, "learning_rate": 0.0002, "loss": 0.5600205659866333, "mean_token_accuracy": 0.7725525945425034, "num_tokens": 6883247.0, "step": 421 }, { "entropy": 0.5389530211687088, "epoch": 1.5746268656716418, "grad_norm": 0.13312764465808868, "learning_rate": 0.0002, "loss": 0.5395158529281616, "mean_token_accuracy": 0.7833812385797501, "num_tokens": 6899801.0, "step": 422 }, { "entropy": 0.5225178450345993, "epoch": 1.578358208955224, "grad_norm": 0.12671785056591034, "learning_rate": 0.0002, "loss": 0.530681312084198, "mean_token_accuracy": 0.7860707342624664, "num_tokens": 6916126.0, "step": 423 }, { "entropy": 0.5225076675415039, "epoch": 1.582089552238806, "grad_norm": 0.1846325844526291, "learning_rate": 0.0002, "loss": 0.5287823677062988, "mean_token_accuracy": 0.7858179211616516, "num_tokens": 6932572.0, "step": 424 }, { "entropy": 0.5322756171226501, "epoch": 1.585820895522388, "grad_norm": 0.1279527246952057, "learning_rate": 0.0002, "loss": 0.5314757823944092, "mean_token_accuracy": 0.7839424312114716, "num_tokens": 6948915.0, "step": 425 }, { "entropy": 0.5399055480957031, "epoch": 1.5895522388059702, "grad_norm": 0.14472827315330505, "learning_rate": 0.0002, "loss": 0.5389757752418518, "mean_token_accuracy": 0.781254380941391, "num_tokens": 6965311.0, "step": 426 }, { "entropy": 0.543253481388092, "epoch": 1.5932835820895521, "grad_norm": 0.1291203647851944, "learning_rate": 0.0002, "loss": 0.542615532875061, "mean_token_accuracy": 0.7801599353551865, "num_tokens": 6981751.0, "step": 427 }, { "entropy": 0.5258511453866959, "epoch": 1.5970149253731343, "grad_norm": 0.14912551641464233, "learning_rate": 0.0002, "loss": 0.5212829113006592, "mean_token_accuracy": 0.7879799157381058, "num_tokens": 6997999.0, "step": 428 }, { "entropy": 0.5359253436326981, "epoch": 1.6007462686567164, "grad_norm": 0.13902713358402252, "learning_rate": 0.0002, "loss": 0.5354318618774414, "mean_token_accuracy": 0.7819556444883347, "num_tokens": 7014251.0, "step": 429 }, { "entropy": 0.5399288833141327, "epoch": 1.6044776119402986, "grad_norm": 0.15356454253196716, "learning_rate": 0.0002, "loss": 0.5459235906600952, "mean_token_accuracy": 0.7798596769571304, "num_tokens": 7030929.0, "step": 430 }, { "entropy": 0.5489939600229263, "epoch": 1.6082089552238807, "grad_norm": 0.16724750399589539, "learning_rate": 0.0002, "loss": 0.5488972663879395, "mean_token_accuracy": 0.7782986462116241, "num_tokens": 7047344.0, "step": 431 }, { "entropy": 0.5521660596132278, "epoch": 1.6119402985074627, "grad_norm": 0.1370435506105423, "learning_rate": 0.0002, "loss": 0.5541171431541443, "mean_token_accuracy": 0.775096669793129, "num_tokens": 7063772.0, "step": 432 }, { "entropy": 0.5448116213083267, "epoch": 1.6156716417910446, "grad_norm": 0.16458411514759064, "learning_rate": 0.0002, "loss": 0.5444625616073608, "mean_token_accuracy": 0.7808038741350174, "num_tokens": 7080008.0, "step": 433 }, { "entropy": 0.5336454659700394, "epoch": 1.6194029850746268, "grad_norm": 0.13929054141044617, "learning_rate": 0.0002, "loss": 0.5374733805656433, "mean_token_accuracy": 0.7845250517129898, "num_tokens": 7096322.0, "step": 434 }, { "entropy": 0.5490863621234894, "epoch": 1.623134328358209, "grad_norm": 0.17425119876861572, "learning_rate": 0.0002, "loss": 0.5510268211364746, "mean_token_accuracy": 0.7752214223146439, "num_tokens": 7112627.0, "step": 435 }, { "entropy": 0.5409643575549126, "epoch": 1.626865671641791, "grad_norm": 0.1438315510749817, "learning_rate": 0.0002, "loss": 0.5421441197395325, "mean_token_accuracy": 0.7772217243909836, "num_tokens": 7128753.0, "step": 436 }, { "entropy": 0.5132558643817902, "epoch": 1.6305970149253732, "grad_norm": 0.19491760432720184, "learning_rate": 0.0002, "loss": 0.5172038674354553, "mean_token_accuracy": 0.7922582030296326, "num_tokens": 7145005.0, "step": 437 }, { "entropy": 0.5373466610908508, "epoch": 1.6343283582089554, "grad_norm": 0.1514309048652649, "learning_rate": 0.0002, "loss": 0.5405304431915283, "mean_token_accuracy": 0.7827999889850616, "num_tokens": 7161264.0, "step": 438 }, { "entropy": 0.5462755262851715, "epoch": 1.6380597014925373, "grad_norm": 0.1856052726507187, "learning_rate": 0.0002, "loss": 0.5462319254875183, "mean_token_accuracy": 0.7752426117658615, "num_tokens": 7177601.0, "step": 439 }, { "entropy": 0.5239088907837868, "epoch": 1.6417910447761193, "grad_norm": 0.15442201495170593, "learning_rate": 0.0002, "loss": 0.5269871950149536, "mean_token_accuracy": 0.7881719172000885, "num_tokens": 7194088.0, "step": 440 }, { "entropy": 0.5473000258207321, "epoch": 1.6455223880597014, "grad_norm": 0.1733047217130661, "learning_rate": 0.0002, "loss": 0.5468770861625671, "mean_token_accuracy": 0.7766072303056717, "num_tokens": 7210540.0, "step": 441 }, { "entropy": 0.5247174948453903, "epoch": 1.6492537313432836, "grad_norm": 0.15060853958129883, "learning_rate": 0.0002, "loss": 0.5271586179733276, "mean_token_accuracy": 0.7868671417236328, "num_tokens": 7226800.0, "step": 442 }, { "entropy": 0.5296545326709747, "epoch": 1.6529850746268657, "grad_norm": 0.14210547506809235, "learning_rate": 0.0002, "loss": 0.5233073830604553, "mean_token_accuracy": 0.7905395030975342, "num_tokens": 7242933.0, "step": 443 }, { "entropy": 0.5275071337819099, "epoch": 1.6567164179104479, "grad_norm": 0.16420303285121918, "learning_rate": 0.0002, "loss": 0.5262512564659119, "mean_token_accuracy": 0.7878832370042801, "num_tokens": 7259229.0, "step": 444 }, { "entropy": 0.5286994576454163, "epoch": 1.6604477611940298, "grad_norm": 0.16218696534633636, "learning_rate": 0.0002, "loss": 0.5356262922286987, "mean_token_accuracy": 0.781034916639328, "num_tokens": 7275629.0, "step": 445 }, { "entropy": 0.5249519050121307, "epoch": 1.664179104477612, "grad_norm": 0.13650326430797577, "learning_rate": 0.0002, "loss": 0.5306994915008545, "mean_token_accuracy": 0.7849638760089874, "num_tokens": 7291780.0, "step": 446 }, { "entropy": 0.5290274769067764, "epoch": 1.667910447761194, "grad_norm": 0.13130812346935272, "learning_rate": 0.0002, "loss": 0.5366880297660828, "mean_token_accuracy": 0.7813905030488968, "num_tokens": 7308207.0, "step": 447 }, { "entropy": 0.549896240234375, "epoch": 1.671641791044776, "grad_norm": 0.13799095153808594, "learning_rate": 0.0002, "loss": 0.542113721370697, "mean_token_accuracy": 0.7787055224180222, "num_tokens": 7324630.0, "step": 448 }, { "entropy": 0.5595291256904602, "epoch": 1.6753731343283582, "grad_norm": 0.12968024611473083, "learning_rate": 0.0002, "loss": 0.5542213320732117, "mean_token_accuracy": 0.7749587148427963, "num_tokens": 7340980.0, "step": 449 }, { "entropy": 0.5328024327754974, "epoch": 1.6791044776119404, "grad_norm": 0.15673688054084778, "learning_rate": 0.0002, "loss": 0.5303700566291809, "mean_token_accuracy": 0.7840248346328735, "num_tokens": 7357233.0, "step": 450 }, { "entropy": 0.527419738471508, "epoch": 1.6828358208955225, "grad_norm": 0.15271416306495667, "learning_rate": 0.0002, "loss": 0.5339101552963257, "mean_token_accuracy": 0.7841878533363342, "num_tokens": 7373557.0, "step": 451 }, { "entropy": 0.5285895839333534, "epoch": 1.6865671641791045, "grad_norm": 0.1619284600019455, "learning_rate": 0.0002, "loss": 0.5426527261734009, "mean_token_accuracy": 0.7801112830638885, "num_tokens": 7389775.0, "step": 452 }, { "entropy": 0.5160977020859718, "epoch": 1.6902985074626866, "grad_norm": 0.14479905366897583, "learning_rate": 0.0002, "loss": 0.5143705606460571, "mean_token_accuracy": 0.792098343372345, "num_tokens": 7406142.0, "step": 453 }, { "entropy": 0.5246409177780151, "epoch": 1.6940298507462686, "grad_norm": 0.13829895853996277, "learning_rate": 0.0002, "loss": 0.5241281986236572, "mean_token_accuracy": 0.7888348549604416, "num_tokens": 7422123.0, "step": 454 }, { "entropy": 0.539468988776207, "epoch": 1.6977611940298507, "grad_norm": 0.14040212333202362, "learning_rate": 0.0002, "loss": 0.538709282875061, "mean_token_accuracy": 0.7814967185258865, "num_tokens": 7438449.0, "step": 455 }, { "entropy": 0.5327620357275009, "epoch": 1.7014925373134329, "grad_norm": 0.13067209720611572, "learning_rate": 0.0002, "loss": 0.531409740447998, "mean_token_accuracy": 0.7817434817552567, "num_tokens": 7454843.0, "step": 456 }, { "entropy": 0.5428982973098755, "epoch": 1.705223880597015, "grad_norm": 0.13850897550582886, "learning_rate": 0.0002, "loss": 0.5404822826385498, "mean_token_accuracy": 0.7804021388292313, "num_tokens": 7471239.0, "step": 457 }, { "entropy": 0.5262090265750885, "epoch": 1.7089552238805972, "grad_norm": 0.1596522480249405, "learning_rate": 0.0002, "loss": 0.5263737440109253, "mean_token_accuracy": 0.7867833971977234, "num_tokens": 7487626.0, "step": 458 }, { "entropy": 0.5263974219560623, "epoch": 1.712686567164179, "grad_norm": 0.1885124146938324, "learning_rate": 0.0002, "loss": 0.5317808985710144, "mean_token_accuracy": 0.7860947102308273, "num_tokens": 7504295.0, "step": 459 }, { "entropy": 0.5409001708030701, "epoch": 1.716417910447761, "grad_norm": 0.18569619953632355, "learning_rate": 0.0002, "loss": 0.5502086877822876, "mean_token_accuracy": 0.7780454903841019, "num_tokens": 7520700.0, "step": 460 }, { "entropy": 0.5375530123710632, "epoch": 1.7201492537313432, "grad_norm": 0.1682044118642807, "learning_rate": 0.0002, "loss": 0.5417311787605286, "mean_token_accuracy": 0.7811579406261444, "num_tokens": 7537296.0, "step": 461 }, { "entropy": 0.5499445050954819, "epoch": 1.7238805970149254, "grad_norm": 0.13629741966724396, "learning_rate": 0.0002, "loss": 0.5424147248268127, "mean_token_accuracy": 0.7811519056558609, "num_tokens": 7553751.0, "step": 462 }, { "entropy": 0.5344928205013275, "epoch": 1.7276119402985075, "grad_norm": 0.15897303819656372, "learning_rate": 0.0002, "loss": 0.5359247326850891, "mean_token_accuracy": 0.7842150777578354, "num_tokens": 7569929.0, "step": 463 }, { "entropy": 0.5554052591323853, "epoch": 1.7313432835820897, "grad_norm": 0.1417708843946457, "learning_rate": 0.0002, "loss": 0.5565856099128723, "mean_token_accuracy": 0.7738053798675537, "num_tokens": 7586469.0, "step": 464 }, { "entropy": 0.5416853874921799, "epoch": 1.7350746268656716, "grad_norm": 0.13722717761993408, "learning_rate": 0.0002, "loss": 0.5358468294143677, "mean_token_accuracy": 0.7817960679531097, "num_tokens": 7602590.0, "step": 465 }, { "entropy": 0.5408632606267929, "epoch": 1.7388059701492538, "grad_norm": 0.157133087515831, "learning_rate": 0.0002, "loss": 0.5427348017692566, "mean_token_accuracy": 0.7806098312139511, "num_tokens": 7618775.0, "step": 466 }, { "entropy": 0.5247721523046494, "epoch": 1.7425373134328357, "grad_norm": 0.14061616361141205, "learning_rate": 0.0002, "loss": 0.5321290493011475, "mean_token_accuracy": 0.7820450663566589, "num_tokens": 7635093.0, "step": 467 }, { "entropy": 0.5205557495355606, "epoch": 1.7462686567164178, "grad_norm": 0.16123539209365845, "learning_rate": 0.0002, "loss": 0.5302354097366333, "mean_token_accuracy": 0.7854211032390594, "num_tokens": 7651685.0, "step": 468 }, { "entropy": 0.5282921940088272, "epoch": 1.75, "grad_norm": 0.15153366327285767, "learning_rate": 0.0002, "loss": 0.5328198671340942, "mean_token_accuracy": 0.7865671813488007, "num_tokens": 7667959.0, "step": 469 }, { "entropy": 0.5481950640678406, "epoch": 1.7537313432835822, "grad_norm": 0.12894481420516968, "learning_rate": 0.0002, "loss": 0.5497183799743652, "mean_token_accuracy": 0.778036966919899, "num_tokens": 7684463.0, "step": 470 }, { "entropy": 0.5427480936050415, "epoch": 1.7574626865671643, "grad_norm": 0.13647432625293732, "learning_rate": 0.0002, "loss": 0.53739994764328, "mean_token_accuracy": 0.7818431705236435, "num_tokens": 7700823.0, "step": 471 }, { "entropy": 0.5326214283704758, "epoch": 1.7611940298507462, "grad_norm": 0.13095979392528534, "learning_rate": 0.0002, "loss": 0.5291880369186401, "mean_token_accuracy": 0.7828460037708282, "num_tokens": 7717112.0, "step": 472 }, { "entropy": 0.5443256497383118, "epoch": 1.7649253731343284, "grad_norm": 0.15335077047348022, "learning_rate": 0.0002, "loss": 0.5414584875106812, "mean_token_accuracy": 0.7815631777048111, "num_tokens": 7733478.0, "step": 473 }, { "entropy": 0.5510082393884659, "epoch": 1.7686567164179103, "grad_norm": 0.12999047338962555, "learning_rate": 0.0002, "loss": 0.5502053499221802, "mean_token_accuracy": 0.7768876850605011, "num_tokens": 7749733.0, "step": 474 }, { "entropy": 0.5287549048662186, "epoch": 1.7723880597014925, "grad_norm": 0.14021116495132446, "learning_rate": 0.0002, "loss": 0.5351616740226746, "mean_token_accuracy": 0.7807136327028275, "num_tokens": 7766232.0, "step": 475 }, { "entropy": 0.5237460732460022, "epoch": 1.7761194029850746, "grad_norm": 0.13716712594032288, "learning_rate": 0.0002, "loss": 0.5256913900375366, "mean_token_accuracy": 0.7864228338003159, "num_tokens": 7782399.0, "step": 476 }, { "entropy": 0.5354984253644943, "epoch": 1.7798507462686568, "grad_norm": 0.1459989845752716, "learning_rate": 0.0002, "loss": 0.544316291809082, "mean_token_accuracy": 0.7797362506389618, "num_tokens": 7798787.0, "step": 477 }, { "entropy": 0.5540675520896912, "epoch": 1.783582089552239, "grad_norm": 0.12925799190998077, "learning_rate": 0.0002, "loss": 0.5467855334281921, "mean_token_accuracy": 0.7801081091165543, "num_tokens": 7815176.0, "step": 478 }, { "entropy": 0.5433181077241898, "epoch": 1.787313432835821, "grad_norm": 0.14298273622989655, "learning_rate": 0.0002, "loss": 0.5409605503082275, "mean_token_accuracy": 0.7799843400716782, "num_tokens": 7831722.0, "step": 479 }, { "entropy": 0.5389926880598068, "epoch": 1.7910447761194028, "grad_norm": 0.13404588401317596, "learning_rate": 0.0002, "loss": 0.5434566736221313, "mean_token_accuracy": 0.7795996069908142, "num_tokens": 7847789.0, "step": 480 }, { "entropy": 0.5389460772275925, "epoch": 1.794776119402985, "grad_norm": 0.14891406893730164, "learning_rate": 0.0002, "loss": 0.5478703379631042, "mean_token_accuracy": 0.7784013152122498, "num_tokens": 7864039.0, "step": 481 }, { "entropy": 0.5258179157972336, "epoch": 1.7985074626865671, "grad_norm": 0.1405036896467209, "learning_rate": 0.0002, "loss": 0.5313145518302917, "mean_token_accuracy": 0.7844405174255371, "num_tokens": 7880600.0, "step": 482 }, { "entropy": 0.546451672911644, "epoch": 1.8022388059701493, "grad_norm": 0.12789376080036163, "learning_rate": 0.0002, "loss": 0.5392628908157349, "mean_token_accuracy": 0.780185878276825, "num_tokens": 7897113.0, "step": 483 }, { "entropy": 0.5462568253278732, "epoch": 1.8059701492537314, "grad_norm": 0.15970084071159363, "learning_rate": 0.0002, "loss": 0.5421247482299805, "mean_token_accuracy": 0.7790002077817917, "num_tokens": 7913715.0, "step": 484 }, { "entropy": 0.5245223939418793, "epoch": 1.8097014925373134, "grad_norm": 0.12480644881725311, "learning_rate": 0.0002, "loss": 0.5243803858757019, "mean_token_accuracy": 0.7877090722322464, "num_tokens": 7930253.0, "step": 485 }, { "entropy": 0.5543881952762604, "epoch": 1.8134328358208955, "grad_norm": 0.17440125346183777, "learning_rate": 0.0002, "loss": 0.5611490607261658, "mean_token_accuracy": 0.773423507809639, "num_tokens": 7946773.0, "step": 486 }, { "entropy": 0.5448231846094131, "epoch": 1.8171641791044775, "grad_norm": 0.1254844069480896, "learning_rate": 0.0002, "loss": 0.5429874658584595, "mean_token_accuracy": 0.7773167043924332, "num_tokens": 7963214.0, "step": 487 }, { "entropy": 0.5249373018741608, "epoch": 1.8208955223880596, "grad_norm": 0.13412347435951233, "learning_rate": 0.0002, "loss": 0.5265883803367615, "mean_token_accuracy": 0.7875321507453918, "num_tokens": 7979611.0, "step": 488 }, { "entropy": 0.5382010042667389, "epoch": 1.8246268656716418, "grad_norm": 0.16182008385658264, "learning_rate": 0.0002, "loss": 0.5412148237228394, "mean_token_accuracy": 0.7788311392068863, "num_tokens": 7996094.0, "step": 489 }, { "entropy": 0.5332826524972916, "epoch": 1.828358208955224, "grad_norm": 0.1427432894706726, "learning_rate": 0.0002, "loss": 0.5368761420249939, "mean_token_accuracy": 0.7825220227241516, "num_tokens": 8012432.0, "step": 490 }, { "entropy": 0.5108669325709343, "epoch": 1.832089552238806, "grad_norm": 0.1509285867214203, "learning_rate": 0.0002, "loss": 0.5119490623474121, "mean_token_accuracy": 0.7906075417995453, "num_tokens": 8028665.0, "step": 491 }, { "entropy": 0.5145807713270187, "epoch": 1.835820895522388, "grad_norm": 0.1396896094083786, "learning_rate": 0.0002, "loss": 0.5196783542633057, "mean_token_accuracy": 0.7873106449842453, "num_tokens": 8044855.0, "step": 492 }, { "entropy": 0.5123258233070374, "epoch": 1.8395522388059702, "grad_norm": 0.14697767794132233, "learning_rate": 0.0002, "loss": 0.5223352909088135, "mean_token_accuracy": 0.7885845303535461, "num_tokens": 8061121.0, "step": 493 }, { "entropy": 0.5335386842489243, "epoch": 1.8432835820895521, "grad_norm": 0.14804190397262573, "learning_rate": 0.0002, "loss": 0.534782350063324, "mean_token_accuracy": 0.7838051915168762, "num_tokens": 8077519.0, "step": 494 }, { "entropy": 0.5602670460939407, "epoch": 1.8470149253731343, "grad_norm": 0.13603031635284424, "learning_rate": 0.0002, "loss": 0.5542025566101074, "mean_token_accuracy": 0.7756092548370361, "num_tokens": 8093937.0, "step": 495 }, { "entropy": 0.5355454534292221, "epoch": 1.8507462686567164, "grad_norm": 0.11670524626970291, "learning_rate": 0.0002, "loss": 0.5269724130630493, "mean_token_accuracy": 0.7864131927490234, "num_tokens": 8110383.0, "step": 496 }, { "entropy": 0.5373311340808868, "epoch": 1.8544776119402986, "grad_norm": 0.13412456214427948, "learning_rate": 0.0002, "loss": 0.5295535326004028, "mean_token_accuracy": 0.7874404042959213, "num_tokens": 8126795.0, "step": 497 }, { "entropy": 0.5373153984546661, "epoch": 1.8582089552238807, "grad_norm": 0.1485511064529419, "learning_rate": 0.0002, "loss": 0.5427818894386292, "mean_token_accuracy": 0.7803584039211273, "num_tokens": 8143234.0, "step": 498 }, { "entropy": 0.522105023264885, "epoch": 1.8619402985074627, "grad_norm": 0.1580716073513031, "learning_rate": 0.0002, "loss": 0.5267635583877563, "mean_token_accuracy": 0.7869967371225357, "num_tokens": 8159687.0, "step": 499 }, { "entropy": 0.5215406715869904, "epoch": 1.8656716417910446, "grad_norm": 0.1573050171136856, "learning_rate": 0.0002, "loss": 0.5285288691520691, "mean_token_accuracy": 0.7851908951997757, "num_tokens": 8176020.0, "step": 500 }, { "entropy": 0.5404719114303589, "epoch": 1.8694029850746268, "grad_norm": 0.1411486119031906, "learning_rate": 0.0002, "loss": 0.5365728735923767, "mean_token_accuracy": 0.7837002873420715, "num_tokens": 8192551.0, "step": 501 }, { "entropy": 0.5438470244407654, "epoch": 1.873134328358209, "grad_norm": 0.130998432636261, "learning_rate": 0.0002, "loss": 0.5430339574813843, "mean_token_accuracy": 0.7819307893514633, "num_tokens": 8209082.0, "step": 502 }, { "entropy": 0.5403178930282593, "epoch": 1.876865671641791, "grad_norm": 0.1385144740343094, "learning_rate": 0.0002, "loss": 0.5460789203643799, "mean_token_accuracy": 0.7790951728820801, "num_tokens": 8225744.0, "step": 503 }, { "entropy": 0.5280100554227829, "epoch": 1.8805970149253732, "grad_norm": 0.14330939948558807, "learning_rate": 0.0002, "loss": 0.5235118269920349, "mean_token_accuracy": 0.7890605628490448, "num_tokens": 8242208.0, "step": 504 }, { "entropy": 0.5532096922397614, "epoch": 1.8843283582089554, "grad_norm": 0.1357594132423401, "learning_rate": 0.0002, "loss": 0.5498918890953064, "mean_token_accuracy": 0.7760927677154541, "num_tokens": 8258496.0, "step": 505 }, { "entropy": 0.5294792056083679, "epoch": 1.8880597014925373, "grad_norm": 0.13375437259674072, "learning_rate": 0.0002, "loss": 0.5297701358795166, "mean_token_accuracy": 0.7845475971698761, "num_tokens": 8274536.0, "step": 506 }, { "entropy": 0.5456722378730774, "epoch": 1.8917910447761193, "grad_norm": 0.14889481663703918, "learning_rate": 0.0002, "loss": 0.5517223477363586, "mean_token_accuracy": 0.7756078243255615, "num_tokens": 8290986.0, "step": 507 }, { "entropy": 0.5214451998472214, "epoch": 1.8955223880597014, "grad_norm": 0.13305895030498505, "learning_rate": 0.0002, "loss": 0.5249897837638855, "mean_token_accuracy": 0.7870367765426636, "num_tokens": 8307117.0, "step": 508 }, { "entropy": 0.5336883068084717, "epoch": 1.8992537313432836, "grad_norm": 0.13193877041339874, "learning_rate": 0.0002, "loss": 0.5352887511253357, "mean_token_accuracy": 0.7798391133546829, "num_tokens": 8323273.0, "step": 509 }, { "entropy": 0.5336564183235168, "epoch": 1.9029850746268657, "grad_norm": 0.12489310652017593, "learning_rate": 0.0002, "loss": 0.5302382111549377, "mean_token_accuracy": 0.7845423817634583, "num_tokens": 8339385.0, "step": 510 }, { "entropy": 0.5382219552993774, "epoch": 1.9067164179104479, "grad_norm": 0.1456049680709839, "learning_rate": 0.0002, "loss": 0.5372790694236755, "mean_token_accuracy": 0.782544881105423, "num_tokens": 8355706.0, "step": 511 }, { "entropy": 0.5403454750776291, "epoch": 1.9104477611940298, "grad_norm": 0.12694604694843292, "learning_rate": 0.0002, "loss": 0.5402185320854187, "mean_token_accuracy": 0.7826471477746964, "num_tokens": 8372132.0, "step": 512 }, { "entropy": 0.5318908393383026, "epoch": 1.914179104477612, "grad_norm": 0.1555122435092926, "learning_rate": 0.0002, "loss": 0.541782796382904, "mean_token_accuracy": 0.7774071842432022, "num_tokens": 8388306.0, "step": 513 }, { "entropy": 0.5221689939498901, "epoch": 1.917910447761194, "grad_norm": 0.1543516367673874, "learning_rate": 0.0002, "loss": 0.5357338190078735, "mean_token_accuracy": 0.7826261073350906, "num_tokens": 8404876.0, "step": 514 }, { "entropy": 0.5229770094156265, "epoch": 1.921641791044776, "grad_norm": 0.13613452017307281, "learning_rate": 0.0002, "loss": 0.5244792699813843, "mean_token_accuracy": 0.7872123420238495, "num_tokens": 8421349.0, "step": 515 }, { "entropy": 0.5398612320423126, "epoch": 1.9253731343283582, "grad_norm": 0.14049243927001953, "learning_rate": 0.0002, "loss": 0.5422282218933105, "mean_token_accuracy": 0.7783734500408173, "num_tokens": 8437774.0, "step": 516 }, { "entropy": 0.5401616841554642, "epoch": 1.9291044776119404, "grad_norm": 0.13164237141609192, "learning_rate": 0.0002, "loss": 0.5331213474273682, "mean_token_accuracy": 0.7848468571901321, "num_tokens": 8454123.0, "step": 517 }, { "entropy": 0.5214215666055679, "epoch": 1.9328358208955225, "grad_norm": 0.13749226927757263, "learning_rate": 0.0002, "loss": 0.5158907175064087, "mean_token_accuracy": 0.7904626429080963, "num_tokens": 8470320.0, "step": 518 }, { "entropy": 0.5412722826004028, "epoch": 1.9365671641791045, "grad_norm": 0.127340629696846, "learning_rate": 0.0002, "loss": 0.5443693995475769, "mean_token_accuracy": 0.7785214781761169, "num_tokens": 8486754.0, "step": 519 }, { "entropy": 0.5276665389537811, "epoch": 1.9402985074626866, "grad_norm": 0.13310599327087402, "learning_rate": 0.0002, "loss": 0.5311852693557739, "mean_token_accuracy": 0.7849074453115463, "num_tokens": 8503273.0, "step": 520 }, { "entropy": 0.5346188247203827, "epoch": 1.9440298507462686, "grad_norm": 0.12909531593322754, "learning_rate": 0.0002, "loss": 0.5408310890197754, "mean_token_accuracy": 0.779103621840477, "num_tokens": 8519520.0, "step": 521 }, { "entropy": 0.5392955094575882, "epoch": 1.9477611940298507, "grad_norm": 0.12654371559619904, "learning_rate": 0.0002, "loss": 0.5376543998718262, "mean_token_accuracy": 0.7810464203357697, "num_tokens": 8535688.0, "step": 522 }, { "entropy": 0.526744157075882, "epoch": 1.9514925373134329, "grad_norm": 0.11877280473709106, "learning_rate": 0.0002, "loss": 0.5258936882019043, "mean_token_accuracy": 0.7875306010246277, "num_tokens": 8551996.0, "step": 523 }, { "entropy": 0.5467166006565094, "epoch": 1.955223880597015, "grad_norm": 0.1407010555267334, "learning_rate": 0.0002, "loss": 0.5389098525047302, "mean_token_accuracy": 0.7805493026971817, "num_tokens": 8568202.0, "step": 524 }, { "entropy": 0.5553875267505646, "epoch": 1.9589552238805972, "grad_norm": 0.13490191102027893, "learning_rate": 0.0002, "loss": 0.5481207370758057, "mean_token_accuracy": 0.7784747332334518, "num_tokens": 8584625.0, "step": 525 }, { "entropy": 0.5178312584757805, "epoch": 1.962686567164179, "grad_norm": 0.14236751198768616, "learning_rate": 0.0002, "loss": 0.5226012468338013, "mean_token_accuracy": 0.7866991758346558, "num_tokens": 8600683.0, "step": 526 }, { "entropy": 0.5227778926491737, "epoch": 1.966417910447761, "grad_norm": 0.16303445398807526, "learning_rate": 0.0002, "loss": 0.5365378856658936, "mean_token_accuracy": 0.7807085812091827, "num_tokens": 8616685.0, "step": 527 }, { "entropy": 0.5410575568675995, "epoch": 1.9701492537313432, "grad_norm": 0.16557544469833374, "learning_rate": 0.0002, "loss": 0.5510291457176208, "mean_token_accuracy": 0.7770103365182877, "num_tokens": 8633088.0, "step": 528 }, { "entropy": 0.531767264008522, "epoch": 1.9738805970149254, "grad_norm": 0.16024784743785858, "learning_rate": 0.0002, "loss": 0.5305666327476501, "mean_token_accuracy": 0.7834270149469376, "num_tokens": 8649322.0, "step": 529 }, { "entropy": 0.5423388332128525, "epoch": 1.9776119402985075, "grad_norm": 0.1314675360918045, "learning_rate": 0.0002, "loss": 0.5316357016563416, "mean_token_accuracy": 0.7857660055160522, "num_tokens": 8665670.0, "step": 530 }, { "entropy": 0.5405716001987457, "epoch": 1.9813432835820897, "grad_norm": 0.1407650113105774, "learning_rate": 0.0002, "loss": 0.5429906845092773, "mean_token_accuracy": 0.7817323058843613, "num_tokens": 8681998.0, "step": 531 }, { "entropy": 0.5365249365568161, "epoch": 1.9850746268656716, "grad_norm": 0.14180989563465118, "learning_rate": 0.0002, "loss": 0.5345437526702881, "mean_token_accuracy": 0.7865561246871948, "num_tokens": 8698483.0, "step": 532 }, { "entropy": 0.5290075689554214, "epoch": 1.9888059701492538, "grad_norm": 0.1477176696062088, "learning_rate": 0.0002, "loss": 0.5337146520614624, "mean_token_accuracy": 0.7824839055538177, "num_tokens": 8714640.0, "step": 533 }, { "entropy": 0.5333692282438278, "epoch": 1.9925373134328357, "grad_norm": 0.17112773656845093, "learning_rate": 0.0002, "loss": 0.5424102544784546, "mean_token_accuracy": 0.779076337814331, "num_tokens": 8730887.0, "step": 534 }, { "entropy": 0.5415492355823517, "epoch": 1.9962686567164178, "grad_norm": 0.14943642914295197, "learning_rate": 0.0002, "loss": 0.5476213693618774, "mean_token_accuracy": 0.7769679576158524, "num_tokens": 8747309.0, "step": 535 }, { "entropy": 0.5581045299768448, "epoch": 2.0, "grad_norm": 0.15832063555717468, "learning_rate": 0.0002, "loss": 0.5548263788223267, "mean_token_accuracy": 0.776277557015419, "num_tokens": 8763550.0, "step": 536 }, { "entropy": 0.5369964390993118, "epoch": 2.003731343283582, "grad_norm": 0.15130668878555298, "learning_rate": 0.0002, "loss": 0.5179107189178467, "mean_token_accuracy": 0.7907675057649612, "num_tokens": 8779922.0, "step": 537 }, { "entropy": 0.5117110908031464, "epoch": 2.0074626865671643, "grad_norm": 0.16026535630226135, "learning_rate": 0.0002, "loss": 0.5020841956138611, "mean_token_accuracy": 0.7973873615264893, "num_tokens": 8795988.0, "step": 538 }, { "entropy": 0.5028296113014221, "epoch": 2.0111940298507465, "grad_norm": 0.1676231324672699, "learning_rate": 0.0002, "loss": 0.51214998960495, "mean_token_accuracy": 0.7921472936868668, "num_tokens": 8812261.0, "step": 539 }, { "entropy": 0.5081141889095306, "epoch": 2.014925373134328, "grad_norm": 0.21105162799358368, "learning_rate": 0.0002, "loss": 0.5206259489059448, "mean_token_accuracy": 0.7869252115488052, "num_tokens": 8828964.0, "step": 540 }, { "entropy": 0.5053770169615746, "epoch": 2.0186567164179103, "grad_norm": 0.1996072232723236, "learning_rate": 0.0002, "loss": 0.5146310329437256, "mean_token_accuracy": 0.7916830629110336, "num_tokens": 8845583.0, "step": 541 }, { "entropy": 0.5284380093216896, "epoch": 2.0223880597014925, "grad_norm": 0.14588730037212372, "learning_rate": 0.0002, "loss": 0.5199918150901794, "mean_token_accuracy": 0.7893239259719849, "num_tokens": 8861873.0, "step": 542 }, { "entropy": 0.5435770899057388, "epoch": 2.0261194029850746, "grad_norm": 0.14907799661159515, "learning_rate": 0.0002, "loss": 0.536811113357544, "mean_token_accuracy": 0.7802763283252716, "num_tokens": 8878456.0, "step": 543 }, { "entropy": 0.5174986571073532, "epoch": 2.029850746268657, "grad_norm": 0.14996512234210968, "learning_rate": 0.0002, "loss": 0.5144167542457581, "mean_token_accuracy": 0.7930785864591599, "num_tokens": 8894797.0, "step": 544 }, { "entropy": 0.5272421538829803, "epoch": 2.033582089552239, "grad_norm": 0.16765476763248444, "learning_rate": 0.0002, "loss": 0.5306269526481628, "mean_token_accuracy": 0.7856330573558807, "num_tokens": 8911217.0, "step": 545 }, { "entropy": 0.49972501397132874, "epoch": 2.0373134328358207, "grad_norm": 0.1322057694196701, "learning_rate": 0.0002, "loss": 0.5012874603271484, "mean_token_accuracy": 0.7979290634393692, "num_tokens": 8927511.0, "step": 546 }, { "entropy": 0.5031155720353127, "epoch": 2.041044776119403, "grad_norm": 0.16402538120746613, "learning_rate": 0.0002, "loss": 0.5100584626197815, "mean_token_accuracy": 0.7926298826932907, "num_tokens": 8943509.0, "step": 547 }, { "entropy": 0.5090021565556526, "epoch": 2.044776119402985, "grad_norm": 0.1516626924276352, "learning_rate": 0.0002, "loss": 0.51352858543396, "mean_token_accuracy": 0.7925879657268524, "num_tokens": 8959744.0, "step": 548 }, { "entropy": 0.4990556240081787, "epoch": 2.048507462686567, "grad_norm": 0.14189165830612183, "learning_rate": 0.0002, "loss": 0.5032692551612854, "mean_token_accuracy": 0.7943097651004791, "num_tokens": 8976001.0, "step": 549 }, { "entropy": 0.5276429355144501, "epoch": 2.0522388059701493, "grad_norm": 0.13545501232147217, "learning_rate": 0.0002, "loss": 0.5224078893661499, "mean_token_accuracy": 0.7892052680253983, "num_tokens": 8992265.0, "step": 550 }, { "entropy": 0.5246792286634445, "epoch": 2.0559701492537314, "grad_norm": 0.15987011790275574, "learning_rate": 0.0002, "loss": 0.5220500230789185, "mean_token_accuracy": 0.7897221744060516, "num_tokens": 9008612.0, "step": 551 }, { "entropy": 0.5142855197191238, "epoch": 2.0597014925373136, "grad_norm": 0.17870153486728668, "learning_rate": 0.0002, "loss": 0.5103524923324585, "mean_token_accuracy": 0.7925411611795425, "num_tokens": 9025112.0, "step": 552 }, { "entropy": 0.5080101564526558, "epoch": 2.0634328358208953, "grad_norm": 0.19365249574184418, "learning_rate": 0.0002, "loss": 0.5135321617126465, "mean_token_accuracy": 0.792420819401741, "num_tokens": 9041825.0, "step": 553 }, { "entropy": 0.5249690413475037, "epoch": 2.0671641791044775, "grad_norm": 0.17408262193202972, "learning_rate": 0.0002, "loss": 0.527820348739624, "mean_token_accuracy": 0.7850991487503052, "num_tokens": 9058218.0, "step": 554 }, { "entropy": 0.5355798751115799, "epoch": 2.0708955223880596, "grad_norm": 0.17400678992271423, "learning_rate": 0.0002, "loss": 0.5327027440071106, "mean_token_accuracy": 0.7834015786647797, "num_tokens": 9074538.0, "step": 555 }, { "entropy": 0.5193932577967644, "epoch": 2.074626865671642, "grad_norm": 0.19260965287685394, "learning_rate": 0.0002, "loss": 0.5203508138656616, "mean_token_accuracy": 0.7900512516498566, "num_tokens": 9090645.0, "step": 556 }, { "entropy": 0.5282454341650009, "epoch": 2.078358208955224, "grad_norm": 0.17010283470153809, "learning_rate": 0.0002, "loss": 0.5296856760978699, "mean_token_accuracy": 0.7844990193843842, "num_tokens": 9107205.0, "step": 557 }, { "entropy": 0.5335307121276855, "epoch": 2.082089552238806, "grad_norm": 0.18085786700248718, "learning_rate": 0.0002, "loss": 0.5380091667175293, "mean_token_accuracy": 0.7830383628606796, "num_tokens": 9123633.0, "step": 558 }, { "entropy": 0.5050861239433289, "epoch": 2.0858208955223883, "grad_norm": 0.1828233301639557, "learning_rate": 0.0002, "loss": 0.5116996169090271, "mean_token_accuracy": 0.7909363359212875, "num_tokens": 9139672.0, "step": 559 }, { "entropy": 0.5233924090862274, "epoch": 2.08955223880597, "grad_norm": 0.1721849888563156, "learning_rate": 0.0002, "loss": 0.5234174728393555, "mean_token_accuracy": 0.7887046784162521, "num_tokens": 9156329.0, "step": 560 }, { "entropy": 0.5096859857439995, "epoch": 2.093283582089552, "grad_norm": 0.13895049691200256, "learning_rate": 0.0002, "loss": 0.5016306638717651, "mean_token_accuracy": 0.7958591133356094, "num_tokens": 9172549.0, "step": 561 }, { "entropy": 0.5022074803709984, "epoch": 2.0970149253731343, "grad_norm": 0.18107853829860687, "learning_rate": 0.0002, "loss": 0.49785315990448, "mean_token_accuracy": 0.7988625317811966, "num_tokens": 9188916.0, "step": 562 }, { "entropy": 0.49919093400239944, "epoch": 2.1007462686567164, "grad_norm": 0.18361544609069824, "learning_rate": 0.0002, "loss": 0.5069372057914734, "mean_token_accuracy": 0.7953463643789291, "num_tokens": 9205116.0, "step": 563 }, { "entropy": 0.5179380178451538, "epoch": 2.1044776119402986, "grad_norm": 0.17814478278160095, "learning_rate": 0.0002, "loss": 0.5233405232429504, "mean_token_accuracy": 0.7879672199487686, "num_tokens": 9221422.0, "step": 564 }, { "entropy": 0.5209343507885933, "epoch": 2.1082089552238807, "grad_norm": 0.16368801891803741, "learning_rate": 0.0002, "loss": 0.5220014452934265, "mean_token_accuracy": 0.7900985032320023, "num_tokens": 9237878.0, "step": 565 }, { "entropy": 0.5203168541193008, "epoch": 2.111940298507463, "grad_norm": 0.18038009107112885, "learning_rate": 0.0002, "loss": 0.5181905627250671, "mean_token_accuracy": 0.7902995347976685, "num_tokens": 9254207.0, "step": 566 }, { "entropy": 0.5203139036893845, "epoch": 2.1156716417910446, "grad_norm": 0.15972773730754852, "learning_rate": 0.0002, "loss": 0.5092154145240784, "mean_token_accuracy": 0.793173611164093, "num_tokens": 9270204.0, "step": 567 }, { "entropy": 0.5298740118741989, "epoch": 2.1194029850746268, "grad_norm": 0.16917745769023895, "learning_rate": 0.0002, "loss": 0.521593451499939, "mean_token_accuracy": 0.789896160364151, "num_tokens": 9286472.0, "step": 568 }, { "entropy": 0.5120234042406082, "epoch": 2.123134328358209, "grad_norm": 0.1817537248134613, "learning_rate": 0.0002, "loss": 0.5180550813674927, "mean_token_accuracy": 0.7886006981134415, "num_tokens": 9302801.0, "step": 569 }, { "entropy": 0.5053592845797539, "epoch": 2.126865671641791, "grad_norm": 0.17402999103069305, "learning_rate": 0.0002, "loss": 0.5133467316627502, "mean_token_accuracy": 0.7945185601711273, "num_tokens": 9318994.0, "step": 570 }, { "entropy": 0.5077695101499557, "epoch": 2.1305970149253732, "grad_norm": 0.1826324611902237, "learning_rate": 0.0002, "loss": 0.5111861228942871, "mean_token_accuracy": 0.7935459464788437, "num_tokens": 9335440.0, "step": 571 }, { "entropy": 0.5085733756422997, "epoch": 2.1343283582089554, "grad_norm": 0.20258648693561554, "learning_rate": 0.0002, "loss": 0.5162274837493896, "mean_token_accuracy": 0.7936873137950897, "num_tokens": 9351752.0, "step": 572 }, { "entropy": 0.5466553270816803, "epoch": 2.138059701492537, "grad_norm": 0.21011336147785187, "learning_rate": 0.0002, "loss": 0.5393267273902893, "mean_token_accuracy": 0.7812587320804596, "num_tokens": 9368219.0, "step": 573 }, { "entropy": 0.5103291645646095, "epoch": 2.1417910447761193, "grad_norm": 0.16960836946964264, "learning_rate": 0.0002, "loss": 0.5084283351898193, "mean_token_accuracy": 0.7936739772558212, "num_tokens": 9384590.0, "step": 574 }, { "entropy": 0.5131630301475525, "epoch": 2.1455223880597014, "grad_norm": 0.17001323401927948, "learning_rate": 0.0002, "loss": 0.5123889446258545, "mean_token_accuracy": 0.7904325425624847, "num_tokens": 9400768.0, "step": 575 }, { "entropy": 0.5091337114572525, "epoch": 2.1492537313432836, "grad_norm": 0.19518889486789703, "learning_rate": 0.0002, "loss": 0.512664794921875, "mean_token_accuracy": 0.7909765988588333, "num_tokens": 9416962.0, "step": 576 }, { "entropy": 0.506959430873394, "epoch": 2.1529850746268657, "grad_norm": 0.19361013174057007, "learning_rate": 0.0002, "loss": 0.5145208835601807, "mean_token_accuracy": 0.7909970581531525, "num_tokens": 9433273.0, "step": 577 }, { "entropy": 0.5075285658240318, "epoch": 2.156716417910448, "grad_norm": 0.20014171302318573, "learning_rate": 0.0002, "loss": 0.5108210444450378, "mean_token_accuracy": 0.795252114534378, "num_tokens": 9449764.0, "step": 578 }, { "entropy": 0.5293942838907242, "epoch": 2.16044776119403, "grad_norm": 0.1974441111087799, "learning_rate": 0.0002, "loss": 0.5285412669181824, "mean_token_accuracy": 0.7868294268846512, "num_tokens": 9466170.0, "step": 579 }, { "entropy": 0.5336958318948746, "epoch": 2.1641791044776117, "grad_norm": 0.16498853266239166, "learning_rate": 0.0002, "loss": 0.5246227383613586, "mean_token_accuracy": 0.7904203087091446, "num_tokens": 9482671.0, "step": 580 }, { "entropy": 0.5340626388788223, "epoch": 2.167910447761194, "grad_norm": 0.16569171845912933, "learning_rate": 0.0002, "loss": 0.5292053818702698, "mean_token_accuracy": 0.7861965000629425, "num_tokens": 9499134.0, "step": 581 }, { "entropy": 0.5213732421398163, "epoch": 2.171641791044776, "grad_norm": 0.191435769200325, "learning_rate": 0.0002, "loss": 0.527378499507904, "mean_token_accuracy": 0.7864173054695129, "num_tokens": 9515505.0, "step": 582 }, { "entropy": 0.5035439431667328, "epoch": 2.175373134328358, "grad_norm": 0.1665230244398117, "learning_rate": 0.0002, "loss": 0.5038704872131348, "mean_token_accuracy": 0.7968962043523788, "num_tokens": 9532118.0, "step": 583 }, { "entropy": 0.5060234367847443, "epoch": 2.1791044776119404, "grad_norm": 0.16969595849514008, "learning_rate": 0.0002, "loss": 0.5113446712493896, "mean_token_accuracy": 0.7920107841491699, "num_tokens": 9548351.0, "step": 584 }, { "entropy": 0.5291168391704559, "epoch": 2.1828358208955225, "grad_norm": 0.16809239983558655, "learning_rate": 0.0002, "loss": 0.5360448360443115, "mean_token_accuracy": 0.7811578214168549, "num_tokens": 9564913.0, "step": 585 }, { "entropy": 0.5199222788214684, "epoch": 2.1865671641791047, "grad_norm": 0.15394440293312073, "learning_rate": 0.0002, "loss": 0.5177597403526306, "mean_token_accuracy": 0.7905119061470032, "num_tokens": 9581583.0, "step": 586 }, { "entropy": 0.5282980501651764, "epoch": 2.1902985074626864, "grad_norm": 0.17473557591438293, "learning_rate": 0.0002, "loss": 0.527908980846405, "mean_token_accuracy": 0.7872945964336395, "num_tokens": 9598262.0, "step": 587 }, { "entropy": 0.5268830358982086, "epoch": 2.1940298507462686, "grad_norm": 0.16386888921260834, "learning_rate": 0.0002, "loss": 0.5233091711997986, "mean_token_accuracy": 0.788049191236496, "num_tokens": 9614535.0, "step": 588 }, { "entropy": 0.5275766104459763, "epoch": 2.1977611940298507, "grad_norm": 0.17853675782680511, "learning_rate": 0.0002, "loss": 0.5314985513687134, "mean_token_accuracy": 0.7853439450263977, "num_tokens": 9630730.0, "step": 589 }, { "entropy": 0.5230407416820526, "epoch": 2.201492537313433, "grad_norm": 0.18614573776721954, "learning_rate": 0.0002, "loss": 0.5324023365974426, "mean_token_accuracy": 0.7870204299688339, "num_tokens": 9647367.0, "step": 590 }, { "entropy": 0.5045590102672577, "epoch": 2.205223880597015, "grad_norm": 0.16460436582565308, "learning_rate": 0.0002, "loss": 0.5095564723014832, "mean_token_accuracy": 0.7933550179004669, "num_tokens": 9663807.0, "step": 591 }, { "entropy": 0.5061227604746819, "epoch": 2.208955223880597, "grad_norm": 0.1727134734392166, "learning_rate": 0.0002, "loss": 0.50539630651474, "mean_token_accuracy": 0.79543037712574, "num_tokens": 9679957.0, "step": 592 }, { "entropy": 0.5444381237030029, "epoch": 2.2126865671641793, "grad_norm": 0.1631772667169571, "learning_rate": 0.0002, "loss": 0.5421435832977295, "mean_token_accuracy": 0.7804461270570755, "num_tokens": 9696269.0, "step": 593 }, { "entropy": 0.5140876695513725, "epoch": 2.216417910447761, "grad_norm": 0.14234963059425354, "learning_rate": 0.0002, "loss": 0.5083339214324951, "mean_token_accuracy": 0.7940346747636795, "num_tokens": 9712614.0, "step": 594 }, { "entropy": 0.5227879285812378, "epoch": 2.220149253731343, "grad_norm": 0.1700550764799118, "learning_rate": 0.0002, "loss": 0.5256499648094177, "mean_token_accuracy": 0.788642093539238, "num_tokens": 9729090.0, "step": 595 }, { "entropy": 0.5193727314472198, "epoch": 2.2238805970149254, "grad_norm": 0.16189917922019958, "learning_rate": 0.0002, "loss": 0.515200674533844, "mean_token_accuracy": 0.7933167964220047, "num_tokens": 9745602.0, "step": 596 }, { "entropy": 0.5037901103496552, "epoch": 2.2276119402985075, "grad_norm": 0.15295493602752686, "learning_rate": 0.0002, "loss": 0.5038392543792725, "mean_token_accuracy": 0.7972543388605118, "num_tokens": 9761880.0, "step": 597 }, { "entropy": 0.5051177442073822, "epoch": 2.2313432835820897, "grad_norm": 0.18619783222675323, "learning_rate": 0.0002, "loss": 0.5126343369483948, "mean_token_accuracy": 0.794564738869667, "num_tokens": 9778073.0, "step": 598 }, { "entropy": 0.5051270872354507, "epoch": 2.235074626865672, "grad_norm": 0.1611267328262329, "learning_rate": 0.0002, "loss": 0.5092532634735107, "mean_token_accuracy": 0.7946549952030182, "num_tokens": 9794345.0, "step": 599 }, { "entropy": 0.5325346812605858, "epoch": 2.2388059701492535, "grad_norm": 0.20552673935890198, "learning_rate": 0.0002, "loss": 0.5378585457801819, "mean_token_accuracy": 0.7835244834423065, "num_tokens": 9810716.0, "step": 600 }, { "entropy": 0.5362858921289444, "epoch": 2.2425373134328357, "grad_norm": 0.1832580715417862, "learning_rate": 0.0002, "loss": 0.5247851014137268, "mean_token_accuracy": 0.7862047404050827, "num_tokens": 9826899.0, "step": 601 }, { "entropy": 0.515026330947876, "epoch": 2.246268656716418, "grad_norm": 0.1738833785057068, "learning_rate": 0.0002, "loss": 0.5104220509529114, "mean_token_accuracy": 0.7956585586071014, "num_tokens": 9843201.0, "step": 602 }, { "entropy": 0.5326243042945862, "epoch": 2.25, "grad_norm": 0.19789133965969086, "learning_rate": 0.0002, "loss": 0.5377206206321716, "mean_token_accuracy": 0.7844580560922623, "num_tokens": 9859428.0, "step": 603 }, { "entropy": 0.5045425221323967, "epoch": 2.253731343283582, "grad_norm": 0.22017110884189606, "learning_rate": 0.0002, "loss": 0.5142727494239807, "mean_token_accuracy": 0.7916774153709412, "num_tokens": 9875509.0, "step": 604 }, { "entropy": 0.5083225071430206, "epoch": 2.2574626865671643, "grad_norm": 0.20720691978931427, "learning_rate": 0.0002, "loss": 0.5168294906616211, "mean_token_accuracy": 0.7916733622550964, "num_tokens": 9891513.0, "step": 605 }, { "entropy": 0.5038861483335495, "epoch": 2.2611940298507465, "grad_norm": 0.22461913526058197, "learning_rate": 0.0002, "loss": 0.5155696868896484, "mean_token_accuracy": 0.7936981916427612, "num_tokens": 9907970.0, "step": 606 }, { "entropy": 0.544201672077179, "epoch": 2.264925373134328, "grad_norm": 0.22078122198581696, "learning_rate": 0.0002, "loss": 0.5377649664878845, "mean_token_accuracy": 0.7846001982688904, "num_tokens": 9924358.0, "step": 607 }, { "entropy": 0.5319496989250183, "epoch": 2.2686567164179103, "grad_norm": 0.15865834057331085, "learning_rate": 0.0002, "loss": 0.5269988775253296, "mean_token_accuracy": 0.7889304012060165, "num_tokens": 9940613.0, "step": 608 }, { "entropy": 0.5121538639068604, "epoch": 2.2723880597014925, "grad_norm": 0.19707661867141724, "learning_rate": 0.0002, "loss": 0.5115834474563599, "mean_token_accuracy": 0.7899812310934067, "num_tokens": 9956900.0, "step": 609 }, { "entropy": 0.5339771807193756, "epoch": 2.2761194029850746, "grad_norm": 0.15257956087589264, "learning_rate": 0.0002, "loss": 0.5300955772399902, "mean_token_accuracy": 0.785103976726532, "num_tokens": 9973499.0, "step": 610 }, { "entropy": 0.5281384140253067, "epoch": 2.279850746268657, "grad_norm": 0.16553470492362976, "learning_rate": 0.0002, "loss": 0.5257382392883301, "mean_token_accuracy": 0.7875041514635086, "num_tokens": 9989801.0, "step": 611 }, { "entropy": 0.5170317441225052, "epoch": 2.283582089552239, "grad_norm": 0.1715046465396881, "learning_rate": 0.0002, "loss": 0.5181665420532227, "mean_token_accuracy": 0.7884780019521713, "num_tokens": 10006078.0, "step": 612 }, { "entropy": 0.5153259709477425, "epoch": 2.2873134328358207, "grad_norm": 0.1548839956521988, "learning_rate": 0.0002, "loss": 0.514171302318573, "mean_token_accuracy": 0.7930748611688614, "num_tokens": 10022246.0, "step": 613 }, { "entropy": 0.5224331915378571, "epoch": 2.291044776119403, "grad_norm": 0.1681355983018875, "learning_rate": 0.0002, "loss": 0.5221542119979858, "mean_token_accuracy": 0.7877352833747864, "num_tokens": 10038788.0, "step": 614 }, { "entropy": 0.5205291956663132, "epoch": 2.294776119402985, "grad_norm": 0.16179999709129333, "learning_rate": 0.0002, "loss": 0.5216364860534668, "mean_token_accuracy": 0.7894330769777298, "num_tokens": 10055226.0, "step": 615 }, { "entropy": 0.5362520515918732, "epoch": 2.298507462686567, "grad_norm": 0.19491799175739288, "learning_rate": 0.0002, "loss": 0.5382164716720581, "mean_token_accuracy": 0.7841734141111374, "num_tokens": 10071636.0, "step": 616 }, { "entropy": 0.5122754499316216, "epoch": 2.3022388059701493, "grad_norm": 0.15888278186321259, "learning_rate": 0.0002, "loss": 0.5128467082977295, "mean_token_accuracy": 0.7957093715667725, "num_tokens": 10087915.0, "step": 617 }, { "entropy": 0.530030369758606, "epoch": 2.3059701492537314, "grad_norm": 0.20173799991607666, "learning_rate": 0.0002, "loss": 0.5327577590942383, "mean_token_accuracy": 0.7822887450456619, "num_tokens": 10104328.0, "step": 618 }, { "entropy": 0.511964850127697, "epoch": 2.3097014925373136, "grad_norm": 0.22716699540615082, "learning_rate": 0.0002, "loss": 0.5194392800331116, "mean_token_accuracy": 0.7923955619335175, "num_tokens": 10120902.0, "step": 619 }, { "entropy": 0.5184068530797958, "epoch": 2.3134328358208958, "grad_norm": 0.1653965413570404, "learning_rate": 0.0002, "loss": 0.5168477892875671, "mean_token_accuracy": 0.7927787899971008, "num_tokens": 10137330.0, "step": 620 }, { "entropy": 0.5173092186450958, "epoch": 2.3171641791044775, "grad_norm": 0.1853804737329483, "learning_rate": 0.0002, "loss": 0.5189480781555176, "mean_token_accuracy": 0.7897288352251053, "num_tokens": 10153802.0, "step": 621 }, { "entropy": 0.5215531587600708, "epoch": 2.3208955223880596, "grad_norm": 0.1907532960176468, "learning_rate": 0.0002, "loss": 0.5235369801521301, "mean_token_accuracy": 0.7906839698553085, "num_tokens": 10170052.0, "step": 622 }, { "entropy": 0.5299772173166275, "epoch": 2.324626865671642, "grad_norm": 0.17518973350524902, "learning_rate": 0.0002, "loss": 0.5251893401145935, "mean_token_accuracy": 0.7905509769916534, "num_tokens": 10186299.0, "step": 623 }, { "entropy": 0.5111118629574776, "epoch": 2.328358208955224, "grad_norm": 0.162562295794487, "learning_rate": 0.0002, "loss": 0.5044469237327576, "mean_token_accuracy": 0.793881356716156, "num_tokens": 10202479.0, "step": 624 }, { "entropy": 0.5176884084939957, "epoch": 2.332089552238806, "grad_norm": 0.15817266702651978, "learning_rate": 0.0002, "loss": 0.5189487934112549, "mean_token_accuracy": 0.7899019569158554, "num_tokens": 10218755.0, "step": 625 }, { "entropy": 0.5375020056962967, "epoch": 2.3358208955223883, "grad_norm": 0.16503086686134338, "learning_rate": 0.0002, "loss": 0.5378777980804443, "mean_token_accuracy": 0.7797044813632965, "num_tokens": 10235308.0, "step": 626 }, { "entropy": 0.5069606155157089, "epoch": 2.33955223880597, "grad_norm": 0.19356752932071686, "learning_rate": 0.0002, "loss": 0.5149304866790771, "mean_token_accuracy": 0.790899932384491, "num_tokens": 10251410.0, "step": 627 }, { "entropy": 0.5025136545300484, "epoch": 2.343283582089552, "grad_norm": 0.1775875836610794, "learning_rate": 0.0002, "loss": 0.5070807933807373, "mean_token_accuracy": 0.7955823987722397, "num_tokens": 10267499.0, "step": 628 }, { "entropy": 0.5052608847618103, "epoch": 2.3470149253731343, "grad_norm": 0.21965590119361877, "learning_rate": 0.0002, "loss": 0.5101135969161987, "mean_token_accuracy": 0.7949910014867783, "num_tokens": 10283791.0, "step": 629 }, { "entropy": 0.5179193317890167, "epoch": 2.3507462686567164, "grad_norm": 0.19963982701301575, "learning_rate": 0.0002, "loss": 0.5215207934379578, "mean_token_accuracy": 0.7893756926059723, "num_tokens": 10299845.0, "step": 630 }, { "entropy": 0.5158931389451027, "epoch": 2.3544776119402986, "grad_norm": 0.160457581281662, "learning_rate": 0.0002, "loss": 0.5119190216064453, "mean_token_accuracy": 0.7945539355278015, "num_tokens": 10316272.0, "step": 631 }, { "entropy": 0.5080019608139992, "epoch": 2.3582089552238807, "grad_norm": 0.1729355752468109, "learning_rate": 0.0002, "loss": 0.5050552487373352, "mean_token_accuracy": 0.7989319264888763, "num_tokens": 10332919.0, "step": 632 }, { "entropy": 0.5174911320209503, "epoch": 2.361940298507463, "grad_norm": 0.1741209179162979, "learning_rate": 0.0002, "loss": 0.5234130024909973, "mean_token_accuracy": 0.7888159304857254, "num_tokens": 10349259.0, "step": 633 }, { "entropy": 0.5265702903270721, "epoch": 2.3656716417910446, "grad_norm": 0.19182217121124268, "learning_rate": 0.0002, "loss": 0.5293515920639038, "mean_token_accuracy": 0.7829533070325851, "num_tokens": 10365491.0, "step": 634 }, { "entropy": 0.5425137877464294, "epoch": 2.3694029850746268, "grad_norm": 0.16463470458984375, "learning_rate": 0.0002, "loss": 0.542192280292511, "mean_token_accuracy": 0.7816719859838486, "num_tokens": 10381847.0, "step": 635 }, { "entropy": 0.5144196897745132, "epoch": 2.373134328358209, "grad_norm": 0.16132977604866028, "learning_rate": 0.0002, "loss": 0.5131939053535461, "mean_token_accuracy": 0.7919805645942688, "num_tokens": 10398171.0, "step": 636 }, { "entropy": 0.5415032058954239, "epoch": 2.376865671641791, "grad_norm": 0.16324372589588165, "learning_rate": 0.0002, "loss": 0.5371772050857544, "mean_token_accuracy": 0.7831342816352844, "num_tokens": 10414686.0, "step": 637 }, { "entropy": 0.5282690078020096, "epoch": 2.3805970149253732, "grad_norm": 0.17967335879802704, "learning_rate": 0.0002, "loss": 0.5203690528869629, "mean_token_accuracy": 0.7885807305574417, "num_tokens": 10431126.0, "step": 638 }, { "entropy": 0.5216360539197922, "epoch": 2.3843283582089554, "grad_norm": 0.16235722601413727, "learning_rate": 0.0002, "loss": 0.5236966013908386, "mean_token_accuracy": 0.7884224951267242, "num_tokens": 10447324.0, "step": 639 }, { "entropy": 0.5296328365802765, "epoch": 2.388059701492537, "grad_norm": 0.1916787028312683, "learning_rate": 0.0002, "loss": 0.5376251339912415, "mean_token_accuracy": 0.7802027314901352, "num_tokens": 10463603.0, "step": 640 }, { "entropy": 0.5012985095381737, "epoch": 2.3917910447761193, "grad_norm": 0.19376890361309052, "learning_rate": 0.0002, "loss": 0.5101221203804016, "mean_token_accuracy": 0.7951995581388474, "num_tokens": 10479993.0, "step": 641 }, { "entropy": 0.5038901194930077, "epoch": 2.3955223880597014, "grad_norm": 0.17371249198913574, "learning_rate": 0.0002, "loss": 0.5146278738975525, "mean_token_accuracy": 0.7905002534389496, "num_tokens": 10496023.0, "step": 642 }, { "entropy": 0.5509473532438278, "epoch": 2.3992537313432836, "grad_norm": 0.15395016968250275, "learning_rate": 0.0002, "loss": 0.546664834022522, "mean_token_accuracy": 0.7777733653783798, "num_tokens": 10512527.0, "step": 643 }, { "entropy": 0.5174002125859261, "epoch": 2.4029850746268657, "grad_norm": 0.1537095606327057, "learning_rate": 0.0002, "loss": 0.5125638842582703, "mean_token_accuracy": 0.7953683733940125, "num_tokens": 10529050.0, "step": 644 }, { "entropy": 0.5259301066398621, "epoch": 2.406716417910448, "grad_norm": 0.19275200366973877, "learning_rate": 0.0002, "loss": 0.534030556678772, "mean_token_accuracy": 0.7856698781251907, "num_tokens": 10545403.0, "step": 645 }, { "entropy": 0.5141283497214317, "epoch": 2.41044776119403, "grad_norm": 0.2044205218553543, "learning_rate": 0.0002, "loss": 0.5202509760856628, "mean_token_accuracy": 0.7915003001689911, "num_tokens": 10561404.0, "step": 646 }, { "entropy": 0.5140255615115166, "epoch": 2.4141791044776117, "grad_norm": 0.17939844727516174, "learning_rate": 0.0002, "loss": 0.5115104913711548, "mean_token_accuracy": 0.7907571196556091, "num_tokens": 10577588.0, "step": 647 }, { "entropy": 0.5283705443143845, "epoch": 2.417910447761194, "grad_norm": 0.19888189435005188, "learning_rate": 0.0002, "loss": 0.5198178291320801, "mean_token_accuracy": 0.7891141772270203, "num_tokens": 10593859.0, "step": 648 }, { "entropy": 0.5462386906147003, "epoch": 2.421641791044776, "grad_norm": 0.1922907531261444, "learning_rate": 0.0002, "loss": 0.5396484732627869, "mean_token_accuracy": 0.7813579887151718, "num_tokens": 10610303.0, "step": 649 }, { "entropy": 0.5058758109807968, "epoch": 2.425373134328358, "grad_norm": 0.21254123747348785, "learning_rate": 0.0002, "loss": 0.5134891271591187, "mean_token_accuracy": 0.7951326668262482, "num_tokens": 10626628.0, "step": 650 }, { "entropy": 0.5051485821604729, "epoch": 2.4291044776119404, "grad_norm": 0.17681139707565308, "learning_rate": 0.0002, "loss": 0.5095136761665344, "mean_token_accuracy": 0.7927682101726532, "num_tokens": 10642872.0, "step": 651 }, { "entropy": 0.5098261535167694, "epoch": 2.4328358208955225, "grad_norm": 0.1644936203956604, "learning_rate": 0.0002, "loss": 0.5163934230804443, "mean_token_accuracy": 0.7900458127260208, "num_tokens": 10659143.0, "step": 652 }, { "entropy": 0.5026194378733635, "epoch": 2.4365671641791042, "grad_norm": 0.1890725940465927, "learning_rate": 0.0002, "loss": 0.511451244354248, "mean_token_accuracy": 0.7927152365446091, "num_tokens": 10675503.0, "step": 653 }, { "entropy": 0.5148562490940094, "epoch": 2.4402985074626864, "grad_norm": 0.1650211215019226, "learning_rate": 0.0002, "loss": 0.5156391263008118, "mean_token_accuracy": 0.7906764894723892, "num_tokens": 10691795.0, "step": 654 }, { "entropy": 0.5057827532291412, "epoch": 2.4440298507462686, "grad_norm": 0.1589452177286148, "learning_rate": 0.0002, "loss": 0.5033491849899292, "mean_token_accuracy": 0.7994053959846497, "num_tokens": 10707762.0, "step": 655 }, { "entropy": 0.5219250470399857, "epoch": 2.4477611940298507, "grad_norm": 0.18478544056415558, "learning_rate": 0.0002, "loss": 0.5219628810882568, "mean_token_accuracy": 0.7873866856098175, "num_tokens": 10724063.0, "step": 656 }, { "entropy": 0.5177232921123505, "epoch": 2.451492537313433, "grad_norm": 0.17303429543972015, "learning_rate": 0.0002, "loss": 0.5200316309928894, "mean_token_accuracy": 0.7885988503694534, "num_tokens": 10740399.0, "step": 657 }, { "entropy": 0.5319043695926666, "epoch": 2.455223880597015, "grad_norm": 0.18429186940193176, "learning_rate": 0.0002, "loss": 0.5326516032218933, "mean_token_accuracy": 0.7862447798252106, "num_tokens": 10756986.0, "step": 658 }, { "entropy": 0.5453691333532333, "epoch": 2.458955223880597, "grad_norm": 0.16711914539337158, "learning_rate": 0.0002, "loss": 0.5386096239089966, "mean_token_accuracy": 0.7812793850898743, "num_tokens": 10773458.0, "step": 659 }, { "entropy": 0.5214618891477585, "epoch": 2.4626865671641793, "grad_norm": 0.1909995675086975, "learning_rate": 0.0002, "loss": 0.518884003162384, "mean_token_accuracy": 0.7878068089485168, "num_tokens": 10789818.0, "step": 660 }, { "entropy": 0.523200586438179, "epoch": 2.466417910447761, "grad_norm": 0.17626361548900604, "learning_rate": 0.0002, "loss": 0.5212401151657104, "mean_token_accuracy": 0.7900760471820831, "num_tokens": 10806143.0, "step": 661 }, { "entropy": 0.5310025811195374, "epoch": 2.470149253731343, "grad_norm": 0.24172359704971313, "learning_rate": 0.0002, "loss": 0.5338881611824036, "mean_token_accuracy": 0.7858817130327225, "num_tokens": 10822437.0, "step": 662 }, { "entropy": 0.5151319652795792, "epoch": 2.4738805970149254, "grad_norm": 0.19658994674682617, "learning_rate": 0.0002, "loss": 0.5139521956443787, "mean_token_accuracy": 0.7917647659778595, "num_tokens": 10838442.0, "step": 663 }, { "entropy": 0.5117574036121368, "epoch": 2.4776119402985075, "grad_norm": 0.2189301699399948, "learning_rate": 0.0002, "loss": 0.513599693775177, "mean_token_accuracy": 0.7897299826145172, "num_tokens": 10854797.0, "step": 664 }, { "entropy": 0.5397205054759979, "epoch": 2.4813432835820897, "grad_norm": 0.2076101452112198, "learning_rate": 0.0002, "loss": 0.5459029078483582, "mean_token_accuracy": 0.7777052521705627, "num_tokens": 10871117.0, "step": 665 }, { "entropy": 0.525243952870369, "epoch": 2.485074626865672, "grad_norm": 0.1969526708126068, "learning_rate": 0.0002, "loss": 0.5259374380111694, "mean_token_accuracy": 0.7870301008224487, "num_tokens": 10887285.0, "step": 666 }, { "entropy": 0.521914929151535, "epoch": 2.4888059701492535, "grad_norm": 0.1793866604566574, "learning_rate": 0.0002, "loss": 0.523249626159668, "mean_token_accuracy": 0.7908923327922821, "num_tokens": 10903583.0, "step": 667 }, { "entropy": 0.5157094374299049, "epoch": 2.4925373134328357, "grad_norm": 0.1676340252161026, "learning_rate": 0.0002, "loss": 0.5196658372879028, "mean_token_accuracy": 0.7936161011457443, "num_tokens": 10919876.0, "step": 668 }, { "entropy": 0.49876970052719116, "epoch": 2.496268656716418, "grad_norm": 0.18448136746883392, "learning_rate": 0.0002, "loss": 0.49738743901252747, "mean_token_accuracy": 0.8003499060869217, "num_tokens": 10936091.0, "step": 669 }, { "entropy": 0.5243137031793594, "epoch": 2.5, "grad_norm": 0.1985243260860443, "learning_rate": 0.0002, "loss": 0.526336133480072, "mean_token_accuracy": 0.7861499488353729, "num_tokens": 10952522.0, "step": 670 }, { "entropy": 0.5277926176786423, "epoch": 2.503731343283582, "grad_norm": 0.15664395689964294, "learning_rate": 0.0002, "loss": 0.5211771726608276, "mean_token_accuracy": 0.7905664294958115, "num_tokens": 10968886.0, "step": 671 }, { "entropy": 0.5109870582818985, "epoch": 2.5074626865671643, "grad_norm": 0.17840486764907837, "learning_rate": 0.0002, "loss": 0.5104790925979614, "mean_token_accuracy": 0.7953955680131912, "num_tokens": 10985258.0, "step": 672 }, { "entropy": 0.4981943815946579, "epoch": 2.5111940298507465, "grad_norm": 0.15788039565086365, "learning_rate": 0.0002, "loss": 0.5019396543502808, "mean_token_accuracy": 0.7957722395658493, "num_tokens": 11001537.0, "step": 673 }, { "entropy": 0.4992476552724838, "epoch": 2.5149253731343286, "grad_norm": 0.20122262835502625, "learning_rate": 0.0002, "loss": 0.5123214721679688, "mean_token_accuracy": 0.7936280071735382, "num_tokens": 11017858.0, "step": 674 }, { "entropy": 0.5326351076364517, "epoch": 2.5186567164179103, "grad_norm": 0.15370923280715942, "learning_rate": 0.0002, "loss": 0.5299698114395142, "mean_token_accuracy": 0.7864175289869308, "num_tokens": 11034251.0, "step": 675 }, { "entropy": 0.5276974588632584, "epoch": 2.5223880597014925, "grad_norm": 0.16408182680606842, "learning_rate": 0.0002, "loss": 0.5256198644638062, "mean_token_accuracy": 0.7864832729101181, "num_tokens": 11050538.0, "step": 676 }, { "entropy": 0.5174605995416641, "epoch": 2.5261194029850746, "grad_norm": 0.1726282238960266, "learning_rate": 0.0002, "loss": 0.5166889429092407, "mean_token_accuracy": 0.7903372198343277, "num_tokens": 11066909.0, "step": 677 }, { "entropy": 0.5096773952245712, "epoch": 2.529850746268657, "grad_norm": 0.18736550211906433, "learning_rate": 0.0002, "loss": 0.5147178173065186, "mean_token_accuracy": 0.7915707528591156, "num_tokens": 11083296.0, "step": 678 }, { "entropy": 0.5143576934933662, "epoch": 2.533582089552239, "grad_norm": 0.18496522307395935, "learning_rate": 0.0002, "loss": 0.5202215909957886, "mean_token_accuracy": 0.7876331657171249, "num_tokens": 11099735.0, "step": 679 }, { "entropy": 0.5062269270420074, "epoch": 2.5373134328358207, "grad_norm": 0.18014365434646606, "learning_rate": 0.0002, "loss": 0.5091406106948853, "mean_token_accuracy": 0.7964621633291245, "num_tokens": 11116208.0, "step": 680 }, { "entropy": 0.5146580412983894, "epoch": 2.541044776119403, "grad_norm": 0.15533168613910675, "learning_rate": 0.0002, "loss": 0.5158394575119019, "mean_token_accuracy": 0.7913824915885925, "num_tokens": 11132744.0, "step": 681 }, { "entropy": 0.5299884453415871, "epoch": 2.544776119402985, "grad_norm": 0.19397816061973572, "learning_rate": 0.0002, "loss": 0.5282403826713562, "mean_token_accuracy": 0.7865999937057495, "num_tokens": 11149385.0, "step": 682 }, { "entropy": 0.5197403728961945, "epoch": 2.548507462686567, "grad_norm": 0.1893748939037323, "learning_rate": 0.0002, "loss": 0.5172282457351685, "mean_token_accuracy": 0.7889421880245209, "num_tokens": 11165536.0, "step": 683 }, { "entropy": 0.5483877509832382, "epoch": 2.5522388059701493, "grad_norm": 0.1692439764738083, "learning_rate": 0.0002, "loss": 0.5408689975738525, "mean_token_accuracy": 0.7819931209087372, "num_tokens": 11182199.0, "step": 684 }, { "entropy": 0.5187435150146484, "epoch": 2.5559701492537314, "grad_norm": 0.16838251054286957, "learning_rate": 0.0002, "loss": 0.5220701098442078, "mean_token_accuracy": 0.7913226187229156, "num_tokens": 11198351.0, "step": 685 }, { "entropy": 0.5129819363355637, "epoch": 2.5597014925373136, "grad_norm": 0.18473690748214722, "learning_rate": 0.0002, "loss": 0.5199850797653198, "mean_token_accuracy": 0.7907718271017075, "num_tokens": 11214899.0, "step": 686 }, { "entropy": 0.5174092352390289, "epoch": 2.5634328358208958, "grad_norm": 0.18355096876621246, "learning_rate": 0.0002, "loss": 0.5231988430023193, "mean_token_accuracy": 0.7854581624269485, "num_tokens": 11231316.0, "step": 687 }, { "entropy": 0.5146564170718193, "epoch": 2.5671641791044775, "grad_norm": 0.20094642043113708, "learning_rate": 0.0002, "loss": 0.5167846083641052, "mean_token_accuracy": 0.7892555296421051, "num_tokens": 11247525.0, "step": 688 }, { "entropy": 0.5073134675621986, "epoch": 2.5708955223880596, "grad_norm": 0.17776694893836975, "learning_rate": 0.0002, "loss": 0.5059224963188171, "mean_token_accuracy": 0.7938186377286911, "num_tokens": 11263630.0, "step": 689 }, { "entropy": 0.51164161413908, "epoch": 2.574626865671642, "grad_norm": 0.23441171646118164, "learning_rate": 0.0002, "loss": 0.5132524371147156, "mean_token_accuracy": 0.7924985736608505, "num_tokens": 11279891.0, "step": 690 }, { "entropy": 0.5324152410030365, "epoch": 2.578358208955224, "grad_norm": 0.1964472234249115, "learning_rate": 0.0002, "loss": 0.5321142673492432, "mean_token_accuracy": 0.7884731590747833, "num_tokens": 11296194.0, "step": 691 }, { "entropy": 0.5136373415589333, "epoch": 2.582089552238806, "grad_norm": 0.23449179530143738, "learning_rate": 0.0002, "loss": 0.5196998715400696, "mean_token_accuracy": 0.7908406853675842, "num_tokens": 11312615.0, "step": 692 }, { "entropy": 0.5276090502738953, "epoch": 2.585820895522388, "grad_norm": 0.16686299443244934, "learning_rate": 0.0002, "loss": 0.5247229337692261, "mean_token_accuracy": 0.7879517525434494, "num_tokens": 11329158.0, "step": 693 }, { "entropy": 0.5419809222221375, "epoch": 2.58955223880597, "grad_norm": 0.19849538803100586, "learning_rate": 0.0002, "loss": 0.5328899621963501, "mean_token_accuracy": 0.7848672121763229, "num_tokens": 11345724.0, "step": 694 }, { "entropy": 0.5273312255740166, "epoch": 2.593283582089552, "grad_norm": 0.15091370046138763, "learning_rate": 0.0002, "loss": 0.5279825925827026, "mean_token_accuracy": 0.7853807210922241, "num_tokens": 11362189.0, "step": 695 }, { "entropy": 0.5198656767606735, "epoch": 2.5970149253731343, "grad_norm": 0.23191620409488678, "learning_rate": 0.0002, "loss": 0.5321477651596069, "mean_token_accuracy": 0.7849823385477066, "num_tokens": 11378807.0, "step": 696 }, { "entropy": 0.5051373466849327, "epoch": 2.6007462686567164, "grad_norm": 0.16530166566371918, "learning_rate": 0.0002, "loss": 0.5118955373764038, "mean_token_accuracy": 0.7921792417764664, "num_tokens": 11395066.0, "step": 697 }, { "entropy": 0.5375550240278244, "epoch": 2.6044776119402986, "grad_norm": 0.16651837527751923, "learning_rate": 0.0002, "loss": 0.5333649516105652, "mean_token_accuracy": 0.7834018468856812, "num_tokens": 11411502.0, "step": 698 }, { "entropy": 0.509097121655941, "epoch": 2.6082089552238807, "grad_norm": 0.19326747953891754, "learning_rate": 0.0002, "loss": 0.5079880952835083, "mean_token_accuracy": 0.7902690321207047, "num_tokens": 11427527.0, "step": 699 }, { "entropy": 0.5243344008922577, "epoch": 2.611940298507463, "grad_norm": 0.17708131670951843, "learning_rate": 0.0002, "loss": 0.527232825756073, "mean_token_accuracy": 0.78766830265522, "num_tokens": 11443934.0, "step": 700 }, { "entropy": 0.5099955424666405, "epoch": 2.6156716417910446, "grad_norm": 0.22393395006656647, "learning_rate": 0.0002, "loss": 0.5181647539138794, "mean_token_accuracy": 0.7911688387393951, "num_tokens": 11460041.0, "step": 701 }, { "entropy": 0.5081977397203445, "epoch": 2.6194029850746268, "grad_norm": 0.19041450321674347, "learning_rate": 0.0002, "loss": 0.5169417262077332, "mean_token_accuracy": 0.7914475202560425, "num_tokens": 11476118.0, "step": 702 }, { "entropy": 0.531707689166069, "epoch": 2.623134328358209, "grad_norm": 0.1838483214378357, "learning_rate": 0.0002, "loss": 0.5199188590049744, "mean_token_accuracy": 0.7899897545576096, "num_tokens": 11492660.0, "step": 703 }, { "entropy": 0.5364825427532196, "epoch": 2.626865671641791, "grad_norm": 0.1751444786787033, "learning_rate": 0.0002, "loss": 0.5356893539428711, "mean_token_accuracy": 0.7835856378078461, "num_tokens": 11509081.0, "step": 704 }, { "entropy": 0.5187056511640549, "epoch": 2.6305970149253732, "grad_norm": 0.17921118438243866, "learning_rate": 0.0002, "loss": 0.5232405066490173, "mean_token_accuracy": 0.7884531170129776, "num_tokens": 11525499.0, "step": 705 }, { "entropy": 0.5242651104927063, "epoch": 2.6343283582089554, "grad_norm": 0.18693575263023376, "learning_rate": 0.0002, "loss": 0.5285453796386719, "mean_token_accuracy": 0.786514088511467, "num_tokens": 11541734.0, "step": 706 }, { "entropy": 0.516477108001709, "epoch": 2.638059701492537, "grad_norm": 0.1994662582874298, "learning_rate": 0.0002, "loss": 0.5184328556060791, "mean_token_accuracy": 0.79111048579216, "num_tokens": 11558204.0, "step": 707 }, { "entropy": 0.5288708806037903, "epoch": 2.6417910447761193, "grad_norm": 0.16373923420906067, "learning_rate": 0.0002, "loss": 0.5213331580162048, "mean_token_accuracy": 0.7881525307893753, "num_tokens": 11574434.0, "step": 708 }, { "entropy": 0.5072719901800156, "epoch": 2.6455223880597014, "grad_norm": 0.1917801946401596, "learning_rate": 0.0002, "loss": 0.509112536907196, "mean_token_accuracy": 0.7960505336523056, "num_tokens": 11590646.0, "step": 709 }, { "entropy": 0.5356978923082352, "epoch": 2.6492537313432836, "grad_norm": 0.19294337928295135, "learning_rate": 0.0002, "loss": 0.5388337969779968, "mean_token_accuracy": 0.7824567407369614, "num_tokens": 11606979.0, "step": 710 }, { "entropy": 0.5163687542080879, "epoch": 2.6529850746268657, "grad_norm": 0.1852083057165146, "learning_rate": 0.0002, "loss": 0.5158357620239258, "mean_token_accuracy": 0.7907344847917557, "num_tokens": 11623404.0, "step": 711 }, { "entropy": 0.5283653736114502, "epoch": 2.656716417910448, "grad_norm": 0.17565470933914185, "learning_rate": 0.0002, "loss": 0.5322569608688354, "mean_token_accuracy": 0.7860839515924454, "num_tokens": 11639756.0, "step": 712 }, { "entropy": 0.5301189422607422, "epoch": 2.66044776119403, "grad_norm": 0.18470223248004913, "learning_rate": 0.0002, "loss": 0.5344855785369873, "mean_token_accuracy": 0.7831524461507797, "num_tokens": 11656115.0, "step": 713 }, { "entropy": 0.5131835639476776, "epoch": 2.664179104477612, "grad_norm": 0.14412830770015717, "learning_rate": 0.0002, "loss": 0.5086023211479187, "mean_token_accuracy": 0.7938779592514038, "num_tokens": 11672197.0, "step": 714 }, { "entropy": 0.5248347520828247, "epoch": 2.667910447761194, "grad_norm": 0.1623944342136383, "learning_rate": 0.0002, "loss": 0.5236642360687256, "mean_token_accuracy": 0.78847536444664, "num_tokens": 11688778.0, "step": 715 }, { "entropy": 0.5317736268043518, "epoch": 2.671641791044776, "grad_norm": 0.17043523490428925, "learning_rate": 0.0002, "loss": 0.5294151306152344, "mean_token_accuracy": 0.7867350727319717, "num_tokens": 11704972.0, "step": 716 }, { "entropy": 0.5292799472808838, "epoch": 2.675373134328358, "grad_norm": 0.21420958638191223, "learning_rate": 0.0002, "loss": 0.5348944664001465, "mean_token_accuracy": 0.784217044711113, "num_tokens": 11721357.0, "step": 717 }, { "entropy": 0.513471245765686, "epoch": 2.6791044776119404, "grad_norm": 0.18216556310653687, "learning_rate": 0.0002, "loss": 0.5178148746490479, "mean_token_accuracy": 0.7881872206926346, "num_tokens": 11737640.0, "step": 718 }, { "entropy": 0.5091867446899414, "epoch": 2.6828358208955225, "grad_norm": 0.18353325128555298, "learning_rate": 0.0002, "loss": 0.509505033493042, "mean_token_accuracy": 0.7933301627635956, "num_tokens": 11753743.0, "step": 719 }, { "entropy": 0.4985937625169754, "epoch": 2.6865671641791042, "grad_norm": 0.17763254046440125, "learning_rate": 0.0002, "loss": 0.5041629076004028, "mean_token_accuracy": 0.7961723208427429, "num_tokens": 11769941.0, "step": 720 }, { "entropy": 0.5326617211103439, "epoch": 2.6902985074626864, "grad_norm": 0.17128810286521912, "learning_rate": 0.0002, "loss": 0.5273231863975525, "mean_token_accuracy": 0.7882279455661774, "num_tokens": 11786468.0, "step": 721 }, { "entropy": 0.5309469103813171, "epoch": 2.6940298507462686, "grad_norm": 0.16436029970645905, "learning_rate": 0.0002, "loss": 0.5328190326690674, "mean_token_accuracy": 0.7852970659732819, "num_tokens": 11802907.0, "step": 722 }, { "entropy": 0.5232216566801071, "epoch": 2.6977611940298507, "grad_norm": 0.16719315946102142, "learning_rate": 0.0002, "loss": 0.5230921506881714, "mean_token_accuracy": 0.7876270413398743, "num_tokens": 11819317.0, "step": 723 }, { "entropy": 0.5203052535653114, "epoch": 2.701492537313433, "grad_norm": 0.19284284114837646, "learning_rate": 0.0002, "loss": 0.5245278477668762, "mean_token_accuracy": 0.7879077643156052, "num_tokens": 11835688.0, "step": 724 }, { "entropy": 0.5309562981128693, "epoch": 2.705223880597015, "grad_norm": 0.237013041973114, "learning_rate": 0.0002, "loss": 0.5299087166786194, "mean_token_accuracy": 0.7888383269309998, "num_tokens": 11851919.0, "step": 725 }, { "entropy": 0.5239868611097336, "epoch": 2.708955223880597, "grad_norm": 0.1684781163930893, "learning_rate": 0.0002, "loss": 0.5212418437004089, "mean_token_accuracy": 0.7896943688392639, "num_tokens": 11868352.0, "step": 726 }, { "entropy": 0.5078758075833321, "epoch": 2.7126865671641793, "grad_norm": 0.18132759630680084, "learning_rate": 0.0002, "loss": 0.5123098492622375, "mean_token_accuracy": 0.7928104400634766, "num_tokens": 11884504.0, "step": 727 }, { "entropy": 0.5257874876260757, "epoch": 2.716417910447761, "grad_norm": 0.18958209455013275, "learning_rate": 0.0002, "loss": 0.5350735783576965, "mean_token_accuracy": 0.7816809117794037, "num_tokens": 11900762.0, "step": 728 }, { "entropy": 0.5237897783517838, "epoch": 2.720149253731343, "grad_norm": 0.17628394067287445, "learning_rate": 0.0002, "loss": 0.5271024107933044, "mean_token_accuracy": 0.7875955998897552, "num_tokens": 11917096.0, "step": 729 }, { "entropy": 0.5278095304965973, "epoch": 2.7238805970149254, "grad_norm": 0.1737760603427887, "learning_rate": 0.0002, "loss": 0.5236294865608215, "mean_token_accuracy": 0.7871440947055817, "num_tokens": 11933442.0, "step": 730 }, { "entropy": 0.5360710769891739, "epoch": 2.7276119402985075, "grad_norm": 0.17106162011623383, "learning_rate": 0.0002, "loss": 0.5306381583213806, "mean_token_accuracy": 0.7830738425254822, "num_tokens": 11949977.0, "step": 731 }, { "entropy": 0.5101736485958099, "epoch": 2.7313432835820897, "grad_norm": 0.17468304932117462, "learning_rate": 0.0002, "loss": 0.5146869421005249, "mean_token_accuracy": 0.7935636639595032, "num_tokens": 11966192.0, "step": 732 }, { "entropy": 0.5177389085292816, "epoch": 2.7350746268656714, "grad_norm": 0.18631240725517273, "learning_rate": 0.0002, "loss": 0.5224716663360596, "mean_token_accuracy": 0.78856061398983, "num_tokens": 11982767.0, "step": 733 }, { "entropy": 0.5130163431167603, "epoch": 2.7388059701492535, "grad_norm": 0.18318809568881989, "learning_rate": 0.0002, "loss": 0.5186882019042969, "mean_token_accuracy": 0.7916167229413986, "num_tokens": 11998980.0, "step": 734 }, { "entropy": 0.5177224427461624, "epoch": 2.7425373134328357, "grad_norm": 0.15900187194347382, "learning_rate": 0.0002, "loss": 0.5131608843803406, "mean_token_accuracy": 0.7938690781593323, "num_tokens": 12015535.0, "step": 735 }, { "entropy": 0.526519387960434, "epoch": 2.746268656716418, "grad_norm": 0.174263134598732, "learning_rate": 0.0002, "loss": 0.5261813402175903, "mean_token_accuracy": 0.7892861515283585, "num_tokens": 12031788.0, "step": 736 }, { "entropy": 0.5191493332386017, "epoch": 2.75, "grad_norm": 0.18909449875354767, "learning_rate": 0.0002, "loss": 0.5240525007247925, "mean_token_accuracy": 0.7878368943929672, "num_tokens": 12047980.0, "step": 737 }, { "entropy": 0.5201373547315598, "epoch": 2.753731343283582, "grad_norm": 0.18388764560222626, "learning_rate": 0.0002, "loss": 0.5292187929153442, "mean_token_accuracy": 0.7905917465686798, "num_tokens": 12064314.0, "step": 738 }, { "entropy": 0.5199328809976578, "epoch": 2.7574626865671643, "grad_norm": 0.19509336352348328, "learning_rate": 0.0002, "loss": 0.5188801884651184, "mean_token_accuracy": 0.7895538657903671, "num_tokens": 12080751.0, "step": 739 }, { "entropy": 0.5277723222970963, "epoch": 2.7611940298507465, "grad_norm": 0.16337504982948303, "learning_rate": 0.0002, "loss": 0.5206757187843323, "mean_token_accuracy": 0.7895227074623108, "num_tokens": 12097014.0, "step": 740 }, { "entropy": 0.5113491863012314, "epoch": 2.7649253731343286, "grad_norm": 0.17909789085388184, "learning_rate": 0.0002, "loss": 0.5122904777526855, "mean_token_accuracy": 0.7908981740474701, "num_tokens": 12113252.0, "step": 741 }, { "entropy": 0.5200309902429581, "epoch": 2.7686567164179103, "grad_norm": 0.17350299656391144, "learning_rate": 0.0002, "loss": 0.5194863677024841, "mean_token_accuracy": 0.7900390475988388, "num_tokens": 12129709.0, "step": 742 }, { "entropy": 0.5226462483406067, "epoch": 2.7723880597014925, "grad_norm": 0.21633893251419067, "learning_rate": 0.0002, "loss": 0.5241018533706665, "mean_token_accuracy": 0.7901509553194046, "num_tokens": 12146084.0, "step": 743 }, { "entropy": 0.5130392387509346, "epoch": 2.7761194029850746, "grad_norm": 0.19013682007789612, "learning_rate": 0.0002, "loss": 0.5189740061759949, "mean_token_accuracy": 0.7909031510353088, "num_tokens": 12162307.0, "step": 744 }, { "entropy": 0.5150926038622856, "epoch": 2.779850746268657, "grad_norm": 0.2071346938610077, "learning_rate": 0.0002, "loss": 0.5166252255439758, "mean_token_accuracy": 0.7929645031690598, "num_tokens": 12178654.0, "step": 745 }, { "entropy": 0.5175644010305405, "epoch": 2.783582089552239, "grad_norm": 0.1927538812160492, "learning_rate": 0.0002, "loss": 0.5234126448631287, "mean_token_accuracy": 0.7895888537168503, "num_tokens": 12194657.0, "step": 746 }, { "entropy": 0.5124155282974243, "epoch": 2.7873134328358207, "grad_norm": 0.20746196806430817, "learning_rate": 0.0002, "loss": 0.5111269950866699, "mean_token_accuracy": 0.7925330102443695, "num_tokens": 12211150.0, "step": 747 }, { "entropy": 0.5269140601158142, "epoch": 2.791044776119403, "grad_norm": 0.16280147433280945, "learning_rate": 0.0002, "loss": 0.5249094367027283, "mean_token_accuracy": 0.7845876812934875, "num_tokens": 12227551.0, "step": 748 }, { "entropy": 0.5178611427545547, "epoch": 2.794776119402985, "grad_norm": 0.23840144276618958, "learning_rate": 0.0002, "loss": 0.5257112383842468, "mean_token_accuracy": 0.7894743531942368, "num_tokens": 12243876.0, "step": 749 }, { "entropy": 0.5116888880729675, "epoch": 2.798507462686567, "grad_norm": 0.18411816656589508, "learning_rate": 0.0002, "loss": 0.5144840478897095, "mean_token_accuracy": 0.7931785434484482, "num_tokens": 12260217.0, "step": 750 }, { "entropy": 0.5289624482393265, "epoch": 2.8022388059701493, "grad_norm": 0.22270359098911285, "learning_rate": 0.0002, "loss": 0.5311276316642761, "mean_token_accuracy": 0.7855756431818008, "num_tokens": 12276532.0, "step": 751 }, { "entropy": 0.547882929444313, "epoch": 2.8059701492537314, "grad_norm": 0.15829682350158691, "learning_rate": 0.0002, "loss": 0.5395496487617493, "mean_token_accuracy": 0.7822854816913605, "num_tokens": 12292809.0, "step": 752 }, { "entropy": 0.5366968065500259, "epoch": 2.8097014925373136, "grad_norm": 0.17022006213665009, "learning_rate": 0.0002, "loss": 0.5253041982650757, "mean_token_accuracy": 0.7889240682125092, "num_tokens": 12309272.0, "step": 753 }, { "entropy": 0.5104647874832153, "epoch": 2.8134328358208958, "grad_norm": 0.20047977566719055, "learning_rate": 0.0002, "loss": 0.5114369988441467, "mean_token_accuracy": 0.7932160943746567, "num_tokens": 12325725.0, "step": 754 }, { "entropy": 0.530600056052208, "epoch": 2.8171641791044775, "grad_norm": 0.18938857316970825, "learning_rate": 0.0002, "loss": 0.5256994366645813, "mean_token_accuracy": 0.787563219666481, "num_tokens": 12341933.0, "step": 755 }, { "entropy": 0.5128819495439529, "epoch": 2.8208955223880596, "grad_norm": 0.19077159464359283, "learning_rate": 0.0002, "loss": 0.5233974456787109, "mean_token_accuracy": 0.7869286239147186, "num_tokens": 12358445.0, "step": 756 }, { "entropy": 0.5205030888319016, "epoch": 2.824626865671642, "grad_norm": 0.2066243290901184, "learning_rate": 0.0002, "loss": 0.527535617351532, "mean_token_accuracy": 0.7873703986406326, "num_tokens": 12374542.0, "step": 757 }, { "entropy": 0.5135227516293526, "epoch": 2.828358208955224, "grad_norm": 0.20685350894927979, "learning_rate": 0.0002, "loss": 0.5181005597114563, "mean_token_accuracy": 0.7896196097135544, "num_tokens": 12390788.0, "step": 758 }, { "entropy": 0.5336467772722244, "epoch": 2.832089552238806, "grad_norm": 0.1939532607793808, "learning_rate": 0.0002, "loss": 0.5294384956359863, "mean_token_accuracy": 0.7889339476823807, "num_tokens": 12407229.0, "step": 759 }, { "entropy": 0.5257266908884048, "epoch": 2.835820895522388, "grad_norm": 0.1771981567144394, "learning_rate": 0.0002, "loss": 0.5216140151023865, "mean_token_accuracy": 0.7899226099252701, "num_tokens": 12423846.0, "step": 760 }, { "entropy": 0.5299984812736511, "epoch": 2.83955223880597, "grad_norm": 0.20455680787563324, "learning_rate": 0.0002, "loss": 0.5296297073364258, "mean_token_accuracy": 0.7862879633903503, "num_tokens": 12440158.0, "step": 761 }, { "entropy": 0.5143841132521629, "epoch": 2.843283582089552, "grad_norm": 0.2076958268880844, "learning_rate": 0.0002, "loss": 0.5176342725753784, "mean_token_accuracy": 0.7894581258296967, "num_tokens": 12456654.0, "step": 762 }, { "entropy": 0.4974513649940491, "epoch": 2.8470149253731343, "grad_norm": 0.193134143948555, "learning_rate": 0.0002, "loss": 0.5035260319709778, "mean_token_accuracy": 0.7979147285223007, "num_tokens": 12472987.0, "step": 763 }, { "entropy": 0.516231395304203, "epoch": 2.8507462686567164, "grad_norm": 0.19579733908176422, "learning_rate": 0.0002, "loss": 0.523535430431366, "mean_token_accuracy": 0.7885937541723251, "num_tokens": 12489201.0, "step": 764 }, { "entropy": 0.5090928375720978, "epoch": 2.8544776119402986, "grad_norm": 0.1745532602071762, "learning_rate": 0.0002, "loss": 0.5120922327041626, "mean_token_accuracy": 0.7926068156957626, "num_tokens": 12505297.0, "step": 765 }, { "entropy": 0.5212984532117844, "epoch": 2.8582089552238807, "grad_norm": 0.1687193065881729, "learning_rate": 0.0002, "loss": 0.5186242461204529, "mean_token_accuracy": 0.7898098975419998, "num_tokens": 12521805.0, "step": 766 }, { "entropy": 0.5455201715230942, "epoch": 2.861940298507463, "grad_norm": 0.14300285279750824, "learning_rate": 0.0002, "loss": 0.5431771278381348, "mean_token_accuracy": 0.7779514342546463, "num_tokens": 12538465.0, "step": 767 }, { "entropy": 0.5209106504917145, "epoch": 2.8656716417910446, "grad_norm": 0.16800960898399353, "learning_rate": 0.0002, "loss": 0.5184243321418762, "mean_token_accuracy": 0.7890264838933945, "num_tokens": 12554886.0, "step": 768 }, { "entropy": 0.5088474899530411, "epoch": 2.8694029850746268, "grad_norm": 0.1462314873933792, "learning_rate": 0.0002, "loss": 0.5083324909210205, "mean_token_accuracy": 0.7934228926897049, "num_tokens": 12571276.0, "step": 769 }, { "entropy": 0.5271053463220596, "epoch": 2.873134328358209, "grad_norm": 0.16391947865486145, "learning_rate": 0.0002, "loss": 0.5293073654174805, "mean_token_accuracy": 0.7859203815460205, "num_tokens": 12587621.0, "step": 770 }, { "entropy": 0.5014189630746841, "epoch": 2.876865671641791, "grad_norm": 0.16328679025173187, "learning_rate": 0.0002, "loss": 0.5073498487472534, "mean_token_accuracy": 0.7924041301012039, "num_tokens": 12604113.0, "step": 771 }, { "entropy": 0.5268891751766205, "epoch": 2.8805970149253732, "grad_norm": 0.21644122898578644, "learning_rate": 0.0002, "loss": 0.5315952301025391, "mean_token_accuracy": 0.7878720760345459, "num_tokens": 12620599.0, "step": 772 }, { "entropy": 0.5303193777799606, "epoch": 2.8843283582089554, "grad_norm": 0.16348110139369965, "learning_rate": 0.0002, "loss": 0.5203503966331482, "mean_token_accuracy": 0.7895929515361786, "num_tokens": 12636920.0, "step": 773 }, { "entropy": 0.5373167991638184, "epoch": 2.888059701492537, "grad_norm": 0.1674329936504364, "learning_rate": 0.0002, "loss": 0.5308367609977722, "mean_token_accuracy": 0.7839034348726273, "num_tokens": 12653507.0, "step": 774 }, { "entropy": 0.5245395004749298, "epoch": 2.8917910447761193, "grad_norm": 0.16798977553844452, "learning_rate": 0.0002, "loss": 0.525133490562439, "mean_token_accuracy": 0.7879597991704941, "num_tokens": 12669748.0, "step": 775 }, { "entropy": 0.4995606988668442, "epoch": 2.8955223880597014, "grad_norm": 0.16923899948596954, "learning_rate": 0.0002, "loss": 0.5072147250175476, "mean_token_accuracy": 0.7954233735799789, "num_tokens": 12686075.0, "step": 776 }, { "entropy": 0.5168571919202805, "epoch": 2.8992537313432836, "grad_norm": 0.19585320353507996, "learning_rate": 0.0002, "loss": 0.531486988067627, "mean_token_accuracy": 0.786114364862442, "num_tokens": 12702228.0, "step": 777 }, { "entropy": 0.5194735378026962, "epoch": 2.9029850746268657, "grad_norm": 0.17308996617794037, "learning_rate": 0.0002, "loss": 0.5222083926200867, "mean_token_accuracy": 0.7887429147958755, "num_tokens": 12718513.0, "step": 778 }, { "entropy": 0.5187652111053467, "epoch": 2.906716417910448, "grad_norm": 0.18012917041778564, "learning_rate": 0.0002, "loss": 0.5144599676132202, "mean_token_accuracy": 0.7928689271211624, "num_tokens": 12734912.0, "step": 779 }, { "entropy": 0.5175924748182297, "epoch": 2.91044776119403, "grad_norm": 0.15708911418914795, "learning_rate": 0.0002, "loss": 0.5127027034759521, "mean_token_accuracy": 0.7910457104444504, "num_tokens": 12751312.0, "step": 780 }, { "entropy": 0.5184929892420769, "epoch": 2.914179104477612, "grad_norm": 0.17460955679416656, "learning_rate": 0.0002, "loss": 0.5223311185836792, "mean_token_accuracy": 0.7881267666816711, "num_tokens": 12767906.0, "step": 781 }, { "entropy": 0.5162710845470428, "epoch": 2.917910447761194, "grad_norm": 0.1744503378868103, "learning_rate": 0.0002, "loss": 0.5184698104858398, "mean_token_accuracy": 0.7896480411291122, "num_tokens": 12784363.0, "step": 782 }, { "entropy": 0.5054134130477905, "epoch": 2.921641791044776, "grad_norm": 0.16419187188148499, "learning_rate": 0.0002, "loss": 0.5100088715553284, "mean_token_accuracy": 0.7937912940979004, "num_tokens": 12800729.0, "step": 783 }, { "entropy": 0.5267587229609489, "epoch": 2.925373134328358, "grad_norm": 0.15712794661521912, "learning_rate": 0.0002, "loss": 0.5234281420707703, "mean_token_accuracy": 0.7873355746269226, "num_tokens": 12817275.0, "step": 784 }, { "entropy": 0.5252643376588821, "epoch": 2.9291044776119404, "grad_norm": 0.17461742460727692, "learning_rate": 0.0002, "loss": 0.5149291753768921, "mean_token_accuracy": 0.792007714509964, "num_tokens": 12833722.0, "step": 785 }, { "entropy": 0.5310375690460205, "epoch": 2.9328358208955225, "grad_norm": 0.16197697818279266, "learning_rate": 0.0002, "loss": 0.5280002355575562, "mean_token_accuracy": 0.7869867831468582, "num_tokens": 12850311.0, "step": 786 }, { "entropy": 0.5165882706642151, "epoch": 2.9365671641791042, "grad_norm": 0.18169313669204712, "learning_rate": 0.0002, "loss": 0.5169544219970703, "mean_token_accuracy": 0.7926650643348694, "num_tokens": 12866551.0, "step": 787 }, { "entropy": 0.506410725414753, "epoch": 2.9402985074626864, "grad_norm": 0.16465988755226135, "learning_rate": 0.0002, "loss": 0.5119289755821228, "mean_token_accuracy": 0.7941572368144989, "num_tokens": 12882861.0, "step": 788 }, { "entropy": 0.5014762431383133, "epoch": 2.9440298507462686, "grad_norm": 0.18377594649791718, "learning_rate": 0.0002, "loss": 0.5110628008842468, "mean_token_accuracy": 0.7946459800004959, "num_tokens": 12899241.0, "step": 789 }, { "entropy": 0.5248052775859833, "epoch": 2.9477611940298507, "grad_norm": 0.20053857564926147, "learning_rate": 0.0002, "loss": 0.5319278240203857, "mean_token_accuracy": 0.7844424396753311, "num_tokens": 12915385.0, "step": 790 }, { "entropy": 0.53006511926651, "epoch": 2.951492537313433, "grad_norm": 0.17584678530693054, "learning_rate": 0.0002, "loss": 0.5255709886550903, "mean_token_accuracy": 0.7863388210535049, "num_tokens": 12931592.0, "step": 791 }, { "entropy": 0.5275840014219284, "epoch": 2.955223880597015, "grad_norm": 0.17536833882331848, "learning_rate": 0.0002, "loss": 0.5213799476623535, "mean_token_accuracy": 0.7920176684856415, "num_tokens": 12948004.0, "step": 792 }, { "entropy": 0.5442412495613098, "epoch": 2.958955223880597, "grad_norm": 0.17195221781730652, "learning_rate": 0.0002, "loss": 0.5382991433143616, "mean_token_accuracy": 0.7807125151157379, "num_tokens": 12964350.0, "step": 793 }, { "entropy": 0.514294296503067, "epoch": 2.9626865671641793, "grad_norm": 0.1958279013633728, "learning_rate": 0.0002, "loss": 0.5191056728363037, "mean_token_accuracy": 0.7889736741781235, "num_tokens": 12980870.0, "step": 794 }, { "entropy": 0.516971156001091, "epoch": 2.966417910447761, "grad_norm": 0.17031143605709076, "learning_rate": 0.0002, "loss": 0.5235239863395691, "mean_token_accuracy": 0.7902554422616959, "num_tokens": 12997265.0, "step": 795 }, { "entropy": 0.519709937274456, "epoch": 2.970149253731343, "grad_norm": 0.19241590797901154, "learning_rate": 0.0002, "loss": 0.5290430188179016, "mean_token_accuracy": 0.786635085940361, "num_tokens": 13013641.0, "step": 796 }, { "entropy": 0.5278842747211456, "epoch": 2.9738805970149254, "grad_norm": 0.1847175806760788, "learning_rate": 0.0002, "loss": 0.5301830768585205, "mean_token_accuracy": 0.7861872166395187, "num_tokens": 13030089.0, "step": 797 }, { "entropy": 0.543852686882019, "epoch": 2.9776119402985075, "grad_norm": 0.1565551459789276, "learning_rate": 0.0002, "loss": 0.5390616655349731, "mean_token_accuracy": 0.7804800420999527, "num_tokens": 13046782.0, "step": 798 }, { "entropy": 0.5507520437240601, "epoch": 2.9813432835820897, "grad_norm": 0.19360534846782684, "learning_rate": 0.0002, "loss": 0.5457417964935303, "mean_token_accuracy": 0.7808282524347305, "num_tokens": 13063260.0, "step": 799 }, { "entropy": 0.5130215361714363, "epoch": 2.9850746268656714, "grad_norm": 0.17565752565860748, "learning_rate": 0.0002, "loss": 0.5124551057815552, "mean_token_accuracy": 0.7940163463354111, "num_tokens": 13079496.0, "step": 800 }, { "entropy": 0.5296107679605484, "epoch": 2.9888059701492535, "grad_norm": 0.18528884649276733, "learning_rate": 0.0002, "loss": 0.5258690714836121, "mean_token_accuracy": 0.7890074849128723, "num_tokens": 13095995.0, "step": 801 }, { "entropy": 0.5083938241004944, "epoch": 2.9925373134328357, "grad_norm": 0.17645564675331116, "learning_rate": 0.0002, "loss": 0.5169539451599121, "mean_token_accuracy": 0.7913031429052353, "num_tokens": 13112668.0, "step": 802 }, { "entropy": 0.5120368450880051, "epoch": 2.996268656716418, "grad_norm": 0.1844874620437622, "learning_rate": 0.0002, "loss": 0.5195419192314148, "mean_token_accuracy": 0.7927880436182022, "num_tokens": 13128901.0, "step": 803 }, { "entropy": 0.5261139273643494, "epoch": 3.0, "grad_norm": 0.19706764817237854, "learning_rate": 0.0002, "loss": 0.5334464311599731, "mean_token_accuracy": 0.7812356650829315, "num_tokens": 13145317.0, "step": 804 } ], "logging_steps": 1, "max_steps": 804, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2252935644732457e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }