{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 804, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.1336015462875366, "epoch": 0.003738317757009346, "grad_norm": 0.4115395247936249, "learning_rate": 0.0002, "loss": 2.4710798263549805, "mean_token_accuracy": 0.5324664115905762, "num_tokens": 16496.0, "step": 1 }, { "entropy": 1.2463930547237396, "epoch": 0.007476635514018692, "grad_norm": 0.3692863881587982, "learning_rate": 0.0002, "loss": 2.165541648864746, "mean_token_accuracy": 0.5610552132129669, "num_tokens": 32901.0, "step": 2 }, { "entropy": 1.4113854467868805, "epoch": 0.011214953271028037, "grad_norm": 0.2915845811367035, "learning_rate": 0.0002, "loss": 1.7357215881347656, "mean_token_accuracy": 0.5886629670858383, "num_tokens": 49245.0, "step": 3 }, { "entropy": 1.379658043384552, "epoch": 0.014953271028037384, "grad_norm": 0.23361942172050476, "learning_rate": 0.0002, "loss": 1.410735011100769, "mean_token_accuracy": 0.6355755776166916, "num_tokens": 65811.0, "step": 4 }, { "entropy": 1.3623565435409546, "epoch": 0.018691588785046728, "grad_norm": 0.26191750168800354, "learning_rate": 0.0002, "loss": 1.2986161708831787, "mean_token_accuracy": 0.6415031999349594, "num_tokens": 82189.0, "step": 5 }, { "entropy": 1.2727859914302826, "epoch": 0.022429906542056073, "grad_norm": 0.1533316969871521, "learning_rate": 0.0002, "loss": 1.1948474645614624, "mean_token_accuracy": 0.6546026170253754, "num_tokens": 98489.0, "step": 6 }, { "entropy": 1.2184827625751495, "epoch": 0.026168224299065422, "grad_norm": 0.10424298793077469, "learning_rate": 0.0002, "loss": 1.1188591718673706, "mean_token_accuracy": 0.6631771177053452, "num_tokens": 114851.0, "step": 7 }, { "entropy": 1.1237380504608154, "epoch": 0.029906542056074768, "grad_norm": 0.10689449310302734, "learning_rate": 0.0002, "loss": 1.0371830463409424, "mean_token_accuracy": 0.6718492060899734, "num_tokens": 131220.0, "step": 8 }, { "entropy": 1.0455615520477295, "epoch": 0.03364485981308411, "grad_norm": 0.12944048643112183, "learning_rate": 0.0002, "loss": 0.9913585782051086, "mean_token_accuracy": 0.6828599572181702, "num_tokens": 147616.0, "step": 9 }, { "entropy": 0.9801072925329208, "epoch": 0.037383177570093455, "grad_norm": 0.1291113793849945, "learning_rate": 0.0002, "loss": 0.9284825325012207, "mean_token_accuracy": 0.7001921981573105, "num_tokens": 164002.0, "step": 10 }, { "entropy": 0.953565314412117, "epoch": 0.041121495327102804, "grad_norm": 0.10645624995231628, "learning_rate": 0.0002, "loss": 0.8795915842056274, "mean_token_accuracy": 0.7043117135763168, "num_tokens": 180220.0, "step": 11 }, { "entropy": 0.9155157953500748, "epoch": 0.044859813084112146, "grad_norm": 0.11287244409322739, "learning_rate": 0.0002, "loss": 0.8326205015182495, "mean_token_accuracy": 0.7109687179327011, "num_tokens": 196521.0, "step": 12 }, { "entropy": 0.8468948155641556, "epoch": 0.048598130841121495, "grad_norm": 0.10245727747678757, "learning_rate": 0.0002, "loss": 0.8009377121925354, "mean_token_accuracy": 0.7149728387594223, "num_tokens": 212778.0, "step": 13 }, { "entropy": 0.7708506435155869, "epoch": 0.052336448598130844, "grad_norm": 0.09908365458250046, "learning_rate": 0.0002, "loss": 0.7473602890968323, "mean_token_accuracy": 0.7281823754310608, "num_tokens": 228942.0, "step": 14 }, { "entropy": 0.7574831545352936, "epoch": 0.056074766355140186, "grad_norm": 0.10171845555305481, "learning_rate": 0.0002, "loss": 0.7353494167327881, "mean_token_accuracy": 0.7308090776205063, "num_tokens": 245256.0, "step": 15 }, { "entropy": 0.6849008500576019, "epoch": 0.059813084112149535, "grad_norm": 0.08664627373218536, "learning_rate": 0.0002, "loss": 0.6817273497581482, "mean_token_accuracy": 0.7445196211338043, "num_tokens": 261288.0, "step": 16 }, { "entropy": 0.6784532964229584, "epoch": 0.06355140186915888, "grad_norm": 0.08904161304235458, "learning_rate": 0.0002, "loss": 0.6835237741470337, "mean_token_accuracy": 0.7402277588844299, "num_tokens": 277473.0, "step": 17 }, { "entropy": 0.6737232953310013, "epoch": 0.06728971962616823, "grad_norm": 0.08908089250326157, "learning_rate": 0.0002, "loss": 0.6696494817733765, "mean_token_accuracy": 0.7452213168144226, "num_tokens": 293986.0, "step": 18 }, { "entropy": 0.676809772849083, "epoch": 0.07102803738317758, "grad_norm": 0.08826066553592682, "learning_rate": 0.0002, "loss": 0.6623877286911011, "mean_token_accuracy": 0.747529536485672, "num_tokens": 310269.0, "step": 19 }, { "entropy": 0.6532965898513794, "epoch": 0.07476635514018691, "grad_norm": 0.08917281031608582, "learning_rate": 0.0002, "loss": 0.6443736553192139, "mean_token_accuracy": 0.7480695396661758, "num_tokens": 326491.0, "step": 20 }, { "entropy": 0.6552709937095642, "epoch": 0.07850467289719626, "grad_norm": 0.08073496073484421, "learning_rate": 0.0002, "loss": 0.6399368643760681, "mean_token_accuracy": 0.7507821917533875, "num_tokens": 342841.0, "step": 21 }, { "entropy": 0.6378396600484848, "epoch": 0.08224299065420561, "grad_norm": 0.063417449593544, "learning_rate": 0.0002, "loss": 0.6258761882781982, "mean_token_accuracy": 0.7539727091789246, "num_tokens": 359584.0, "step": 22 }, { "entropy": 0.6046861261129379, "epoch": 0.08598130841121496, "grad_norm": 0.06905008107423782, "learning_rate": 0.0002, "loss": 0.6049938201904297, "mean_token_accuracy": 0.7625735104084015, "num_tokens": 375502.0, "step": 23 }, { "entropy": 0.6043607741594315, "epoch": 0.08971962616822429, "grad_norm": 0.0712490975856781, "learning_rate": 0.0002, "loss": 0.6081230640411377, "mean_token_accuracy": 0.761991336941719, "num_tokens": 391668.0, "step": 24 }, { "entropy": 0.5921229273080826, "epoch": 0.09345794392523364, "grad_norm": 0.06059383973479271, "learning_rate": 0.0002, "loss": 0.5966373682022095, "mean_token_accuracy": 0.7640610188245773, "num_tokens": 408064.0, "step": 25 }, { "entropy": 0.6013955473899841, "epoch": 0.09719626168224299, "grad_norm": 0.05800875276327133, "learning_rate": 0.0002, "loss": 0.6032594442367554, "mean_token_accuracy": 0.7606146037578583, "num_tokens": 424308.0, "step": 26 }, { "entropy": 0.6059402525424957, "epoch": 0.10093457943925234, "grad_norm": 0.05799295753240585, "learning_rate": 0.0002, "loss": 0.6014454960823059, "mean_token_accuracy": 0.7633127868175507, "num_tokens": 440626.0, "step": 27 }, { "entropy": 0.6059208810329437, "epoch": 0.10467289719626169, "grad_norm": 0.06835797429084778, "learning_rate": 0.0002, "loss": 0.5960400104522705, "mean_token_accuracy": 0.7644040137529373, "num_tokens": 457127.0, "step": 28 }, { "entropy": 0.6063490360975266, "epoch": 0.10841121495327102, "grad_norm": 0.08442196249961853, "learning_rate": 0.0002, "loss": 0.5988196730613708, "mean_token_accuracy": 0.7642622292041779, "num_tokens": 473449.0, "step": 29 }, { "entropy": 0.6044150143861771, "epoch": 0.11214953271028037, "grad_norm": 0.05611753463745117, "learning_rate": 0.0002, "loss": 0.5849661231040955, "mean_token_accuracy": 0.7694830596446991, "num_tokens": 489953.0, "step": 30 }, { "entropy": 0.5886638015508652, "epoch": 0.11588785046728972, "grad_norm": 0.055090922862291336, "learning_rate": 0.0002, "loss": 0.5829939842224121, "mean_token_accuracy": 0.769635483622551, "num_tokens": 506414.0, "step": 31 }, { "entropy": 0.5746142864227295, "epoch": 0.11962616822429907, "grad_norm": 0.049661796540021896, "learning_rate": 0.0002, "loss": 0.5790735483169556, "mean_token_accuracy": 0.7714909315109253, "num_tokens": 522742.0, "step": 32 }, { "entropy": 0.5767629146575928, "epoch": 0.1233644859813084, "grad_norm": 0.04847181588411331, "learning_rate": 0.0002, "loss": 0.580193281173706, "mean_token_accuracy": 0.7714395672082901, "num_tokens": 539199.0, "step": 33 }, { "entropy": 0.5745265781879425, "epoch": 0.12710280373831775, "grad_norm": 0.05860326439142227, "learning_rate": 0.0002, "loss": 0.5901641845703125, "mean_token_accuracy": 0.7679091691970825, "num_tokens": 555326.0, "step": 34 }, { "entropy": 0.567798376083374, "epoch": 0.1308411214953271, "grad_norm": 0.05234525725245476, "learning_rate": 0.0002, "loss": 0.5799325704574585, "mean_token_accuracy": 0.766155481338501, "num_tokens": 571808.0, "step": 35 }, { "entropy": 0.5698586851358414, "epoch": 0.13457943925233645, "grad_norm": 0.041219986975193024, "learning_rate": 0.0002, "loss": 0.573387086391449, "mean_token_accuracy": 0.769883319735527, "num_tokens": 588161.0, "step": 36 }, { "entropy": 0.5851186513900757, "epoch": 0.1383177570093458, "grad_norm": 0.04337616264820099, "learning_rate": 0.0002, "loss": 0.5821909308433533, "mean_token_accuracy": 0.7661230564117432, "num_tokens": 604598.0, "step": 37 }, { "entropy": 0.5961429327726364, "epoch": 0.14205607476635515, "grad_norm": 0.05468963831663132, "learning_rate": 0.0002, "loss": 0.5940048098564148, "mean_token_accuracy": 0.7601669579744339, "num_tokens": 620746.0, "step": 38 }, { "entropy": 0.5826456397771835, "epoch": 0.14579439252336449, "grad_norm": 0.047812167555093765, "learning_rate": 0.0002, "loss": 0.5687558054924011, "mean_token_accuracy": 0.771986335515976, "num_tokens": 637151.0, "step": 39 }, { "entropy": 0.5903666168451309, "epoch": 0.14953271028037382, "grad_norm": 0.044994354248046875, "learning_rate": 0.0002, "loss": 0.5762028098106384, "mean_token_accuracy": 0.7677688300609589, "num_tokens": 653530.0, "step": 40 }, { "entropy": 0.5751803368330002, "epoch": 0.15327102803738318, "grad_norm": 0.04342395439743996, "learning_rate": 0.0002, "loss": 0.5721427798271179, "mean_token_accuracy": 0.7731492966413498, "num_tokens": 669957.0, "step": 41 }, { "entropy": 0.5582813173532486, "epoch": 0.15700934579439252, "grad_norm": 0.05154528096318245, "learning_rate": 0.0002, "loss": 0.5713383555412292, "mean_token_accuracy": 0.7701951861381531, "num_tokens": 685933.0, "step": 42 }, { "entropy": 0.5747530311346054, "epoch": 0.16074766355140188, "grad_norm": 0.05052989348769188, "learning_rate": 0.0002, "loss": 0.5861970782279968, "mean_token_accuracy": 0.7652492970228195, "num_tokens": 702131.0, "step": 43 }, { "entropy": 0.5861315429210663, "epoch": 0.16448598130841122, "grad_norm": 0.043960776180028915, "learning_rate": 0.0002, "loss": 0.5891501903533936, "mean_token_accuracy": 0.7628277689218521, "num_tokens": 718330.0, "step": 44 }, { "entropy": 0.5868926346302032, "epoch": 0.16822429906542055, "grad_norm": 0.035861797630786896, "learning_rate": 0.0002, "loss": 0.5814363360404968, "mean_token_accuracy": 0.7670950144529343, "num_tokens": 734754.0, "step": 45 }, { "entropy": 0.5696061849594116, "epoch": 0.17196261682242991, "grad_norm": 0.03567943349480629, "learning_rate": 0.0002, "loss": 0.5582084655761719, "mean_token_accuracy": 0.7754767388105392, "num_tokens": 750952.0, "step": 46 }, { "entropy": 0.5884592086076736, "epoch": 0.17570093457943925, "grad_norm": 0.04051043465733528, "learning_rate": 0.0002, "loss": 0.5837826132774353, "mean_token_accuracy": 0.7652305215597153, "num_tokens": 767136.0, "step": 47 }, { "entropy": 0.568819597363472, "epoch": 0.17943925233644858, "grad_norm": 0.04234869405627251, "learning_rate": 0.0002, "loss": 0.5664035081863403, "mean_token_accuracy": 0.7719341665506363, "num_tokens": 783513.0, "step": 48 }, { "entropy": 0.553595632314682, "epoch": 0.18317757009345795, "grad_norm": 0.04170480743050575, "learning_rate": 0.0002, "loss": 0.564354658126831, "mean_token_accuracy": 0.7749540507793427, "num_tokens": 799703.0, "step": 49 }, { "entropy": 0.5621031820774078, "epoch": 0.18691588785046728, "grad_norm": 0.042460180819034576, "learning_rate": 0.0002, "loss": 0.576507568359375, "mean_token_accuracy": 0.7702780216932297, "num_tokens": 815979.0, "step": 50 }, { "entropy": 0.5803797841072083, "epoch": 0.19065420560747665, "grad_norm": 0.036130718886852264, "learning_rate": 0.0002, "loss": 0.5826534628868103, "mean_token_accuracy": 0.767243430018425, "num_tokens": 832435.0, "step": 51 }, { "entropy": 0.5492766499519348, "epoch": 0.19439252336448598, "grad_norm": 0.04120517149567604, "learning_rate": 0.0002, "loss": 0.5535300374031067, "mean_token_accuracy": 0.7766350656747818, "num_tokens": 848601.0, "step": 52 }, { "entropy": 0.5690171420574188, "epoch": 0.19813084112149532, "grad_norm": 0.03631429374217987, "learning_rate": 0.0002, "loss": 0.5688353776931763, "mean_token_accuracy": 0.7699357271194458, "num_tokens": 864779.0, "step": 53 }, { "entropy": 0.5830478370189667, "epoch": 0.20186915887850468, "grad_norm": 0.03915117308497429, "learning_rate": 0.0002, "loss": 0.5719392895698547, "mean_token_accuracy": 0.7702472358942032, "num_tokens": 881366.0, "step": 54 }, { "entropy": 0.5905578434467316, "epoch": 0.205607476635514, "grad_norm": 0.038457099348306656, "learning_rate": 0.0002, "loss": 0.5855496525764465, "mean_token_accuracy": 0.7646182626485825, "num_tokens": 897955.0, "step": 55 }, { "entropy": 0.5837848633527756, "epoch": 0.20934579439252338, "grad_norm": 0.04033343121409416, "learning_rate": 0.0002, "loss": 0.5784925222396851, "mean_token_accuracy": 0.7649644762277603, "num_tokens": 914164.0, "step": 56 }, { "entropy": 0.5470199286937714, "epoch": 0.2130841121495327, "grad_norm": 0.036680735647678375, "learning_rate": 0.0002, "loss": 0.5427253246307373, "mean_token_accuracy": 0.7822186052799225, "num_tokens": 930444.0, "step": 57 }, { "entropy": 0.5544598549604416, "epoch": 0.21682242990654205, "grad_norm": 0.04701124131679535, "learning_rate": 0.0002, "loss": 0.569618821144104, "mean_token_accuracy": 0.771122008562088, "num_tokens": 946567.0, "step": 58 }, { "entropy": 0.5725786834955215, "epoch": 0.2205607476635514, "grad_norm": 0.04193125665187836, "learning_rate": 0.0002, "loss": 0.5894483923912048, "mean_token_accuracy": 0.7642552405595779, "num_tokens": 962894.0, "step": 59 }, { "entropy": 0.5668687969446182, "epoch": 0.22429906542056074, "grad_norm": 0.033951517194509506, "learning_rate": 0.0002, "loss": 0.5699459314346313, "mean_token_accuracy": 0.7729462385177612, "num_tokens": 979210.0, "step": 60 }, { "entropy": 0.5792391896247864, "epoch": 0.22803738317757008, "grad_norm": 0.041912537068128586, "learning_rate": 0.0002, "loss": 0.5683349370956421, "mean_token_accuracy": 0.7706285119056702, "num_tokens": 995540.0, "step": 61 }, { "entropy": 0.5809753388166428, "epoch": 0.23177570093457944, "grad_norm": 0.036393389105796814, "learning_rate": 0.0002, "loss": 0.5727679133415222, "mean_token_accuracy": 0.7684315294027328, "num_tokens": 1011805.0, "step": 62 }, { "entropy": 0.5670438855886459, "epoch": 0.23551401869158878, "grad_norm": 0.03674926608800888, "learning_rate": 0.0002, "loss": 0.5604680776596069, "mean_token_accuracy": 0.7723257541656494, "num_tokens": 1028009.0, "step": 63 }, { "entropy": 0.5653442144393921, "epoch": 0.23925233644859814, "grad_norm": 0.03534647822380066, "learning_rate": 0.0002, "loss": 0.5580601096153259, "mean_token_accuracy": 0.7755836397409439, "num_tokens": 1044521.0, "step": 64 }, { "entropy": 0.5762730091810226, "epoch": 0.24299065420560748, "grad_norm": 0.03369547426700592, "learning_rate": 0.0002, "loss": 0.5709710121154785, "mean_token_accuracy": 0.7710799872875214, "num_tokens": 1060984.0, "step": 65 }, { "entropy": 0.56136754155159, "epoch": 0.2467289719626168, "grad_norm": 0.050162531435489655, "learning_rate": 0.0002, "loss": 0.5662704706192017, "mean_token_accuracy": 0.7702763229608536, "num_tokens": 1077512.0, "step": 66 }, { "entropy": 0.5493937730789185, "epoch": 0.2504672897196262, "grad_norm": 0.0446079783141613, "learning_rate": 0.0002, "loss": 0.563389778137207, "mean_token_accuracy": 0.7724475711584091, "num_tokens": 1093860.0, "step": 67 }, { "entropy": 0.5527212023735046, "epoch": 0.2542056074766355, "grad_norm": 0.04445589333772659, "learning_rate": 0.0002, "loss": 0.553238034248352, "mean_token_accuracy": 0.777790442109108, "num_tokens": 1109927.0, "step": 68 }, { "entropy": 0.5742960721254349, "epoch": 0.25794392523364484, "grad_norm": 0.03155473247170448, "learning_rate": 0.0002, "loss": 0.5755714774131775, "mean_token_accuracy": 0.7682003676891327, "num_tokens": 1126507.0, "step": 69 }, { "entropy": 0.570902407169342, "epoch": 0.2616822429906542, "grad_norm": 0.03776158019900322, "learning_rate": 0.0002, "loss": 0.5687341094017029, "mean_token_accuracy": 0.7690709233283997, "num_tokens": 1142690.0, "step": 70 }, { "entropy": 0.5869749188423157, "epoch": 0.26542056074766357, "grad_norm": 0.03637450933456421, "learning_rate": 0.0002, "loss": 0.5745267271995544, "mean_token_accuracy": 0.7675913572311401, "num_tokens": 1158998.0, "step": 71 }, { "entropy": 0.5770464688539505, "epoch": 0.2691588785046729, "grad_norm": 0.03824329748749733, "learning_rate": 0.0002, "loss": 0.5806713104248047, "mean_token_accuracy": 0.765295684337616, "num_tokens": 1175369.0, "step": 72 }, { "entropy": 0.5496443659067154, "epoch": 0.27289719626168224, "grad_norm": 0.03833479806780815, "learning_rate": 0.0002, "loss": 0.552317202091217, "mean_token_accuracy": 0.7775600254535675, "num_tokens": 1191776.0, "step": 73 }, { "entropy": 0.5672993659973145, "epoch": 0.2766355140186916, "grad_norm": 0.035141605883836746, "learning_rate": 0.0002, "loss": 0.5738911032676697, "mean_token_accuracy": 0.769673228263855, "num_tokens": 1208289.0, "step": 74 }, { "entropy": 0.5747457444667816, "epoch": 0.2803738317757009, "grad_norm": 0.03779706731438637, "learning_rate": 0.0002, "loss": 0.580111026763916, "mean_token_accuracy": 0.7651933431625366, "num_tokens": 1224804.0, "step": 75 }, { "entropy": 0.5685230642557144, "epoch": 0.2841121495327103, "grad_norm": 0.03369152173399925, "learning_rate": 0.0002, "loss": 0.571203351020813, "mean_token_accuracy": 0.7706969380378723, "num_tokens": 1240994.0, "step": 76 }, { "entropy": 0.5724664479494095, "epoch": 0.28785046728971964, "grad_norm": 0.03279148414731026, "learning_rate": 0.0002, "loss": 0.5703553557395935, "mean_token_accuracy": 0.7710930705070496, "num_tokens": 1257180.0, "step": 77 }, { "entropy": 0.570750430226326, "epoch": 0.29158878504672897, "grad_norm": 0.035474326461553574, "learning_rate": 0.0002, "loss": 0.57155442237854, "mean_token_accuracy": 0.7676969021558762, "num_tokens": 1273176.0, "step": 78 }, { "entropy": 0.5746997892856598, "epoch": 0.2953271028037383, "grad_norm": 0.03326554223895073, "learning_rate": 0.0002, "loss": 0.5764865279197693, "mean_token_accuracy": 0.7667145133018494, "num_tokens": 1289572.0, "step": 79 }, { "entropy": 0.5560239851474762, "epoch": 0.29906542056074764, "grad_norm": 0.033652499318122864, "learning_rate": 0.0002, "loss": 0.5541852712631226, "mean_token_accuracy": 0.7752721756696701, "num_tokens": 1305646.0, "step": 80 }, { "entropy": 0.5700062215328217, "epoch": 0.30280373831775703, "grad_norm": 0.036336466670036316, "learning_rate": 0.0002, "loss": 0.5715289115905762, "mean_token_accuracy": 0.7702216506004333, "num_tokens": 1322328.0, "step": 81 }, { "entropy": 0.5599597245454788, "epoch": 0.30654205607476637, "grad_norm": 0.032290052622556686, "learning_rate": 0.0002, "loss": 0.5614467859268188, "mean_token_accuracy": 0.7732760310173035, "num_tokens": 1338359.0, "step": 82 }, { "entropy": 0.5446556061506271, "epoch": 0.3102803738317757, "grad_norm": 0.03226450830698013, "learning_rate": 0.0002, "loss": 0.5512461066246033, "mean_token_accuracy": 0.7779420912265778, "num_tokens": 1354321.0, "step": 83 }, { "entropy": 0.5505060404539108, "epoch": 0.31401869158878504, "grad_norm": 0.035315077751874924, "learning_rate": 0.0002, "loss": 0.5553967952728271, "mean_token_accuracy": 0.7761841863393784, "num_tokens": 1370409.0, "step": 84 }, { "entropy": 0.5602358281612396, "epoch": 0.3177570093457944, "grad_norm": 0.031360018998384476, "learning_rate": 0.0002, "loss": 0.5553810596466064, "mean_token_accuracy": 0.7750610113143921, "num_tokens": 1386951.0, "step": 85 }, { "entropy": 0.5592145472764969, "epoch": 0.32149532710280376, "grad_norm": 0.03307170048356056, "learning_rate": 0.0002, "loss": 0.5547728538513184, "mean_token_accuracy": 0.7769513875246048, "num_tokens": 1403318.0, "step": 86 }, { "entropy": 0.5478426665067673, "epoch": 0.3252336448598131, "grad_norm": 0.03468095511198044, "learning_rate": 0.0002, "loss": 0.5475176572799683, "mean_token_accuracy": 0.7787642478942871, "num_tokens": 1419588.0, "step": 87 }, { "entropy": 0.5575945675373077, "epoch": 0.32897196261682243, "grad_norm": 0.0372730977833271, "learning_rate": 0.0002, "loss": 0.5592425465583801, "mean_token_accuracy": 0.7753143310546875, "num_tokens": 1435879.0, "step": 88 }, { "entropy": 0.5516618192195892, "epoch": 0.33271028037383177, "grad_norm": 0.03459680825471878, "learning_rate": 0.0002, "loss": 0.5590015649795532, "mean_token_accuracy": 0.7763092070817947, "num_tokens": 1452255.0, "step": 89 }, { "entropy": 0.5537828356027603, "epoch": 0.3364485981308411, "grad_norm": 0.037478331476449966, "learning_rate": 0.0002, "loss": 0.5628093481063843, "mean_token_accuracy": 0.7731254547834396, "num_tokens": 1468440.0, "step": 90 }, { "entropy": 0.5597833395004272, "epoch": 0.3401869158878505, "grad_norm": 0.03566694259643555, "learning_rate": 0.0002, "loss": 0.5576118230819702, "mean_token_accuracy": 0.7733734101057053, "num_tokens": 1484803.0, "step": 91 }, { "entropy": 0.5624473690986633, "epoch": 0.34392523364485983, "grad_norm": 0.038208235055208206, "learning_rate": 0.0002, "loss": 0.5643529891967773, "mean_token_accuracy": 0.773946151137352, "num_tokens": 1500849.0, "step": 92 }, { "entropy": 0.5809104889631271, "epoch": 0.34766355140186916, "grad_norm": 0.03173667564988136, "learning_rate": 0.0002, "loss": 0.5739686489105225, "mean_token_accuracy": 0.7694463729858398, "num_tokens": 1517263.0, "step": 93 }, { "entropy": 0.5697960555553436, "epoch": 0.3514018691588785, "grad_norm": 0.03167756646871567, "learning_rate": 0.0002, "loss": 0.5665271878242493, "mean_token_accuracy": 0.7699908316135406, "num_tokens": 1533648.0, "step": 94 }, { "entropy": 0.5966296941041946, "epoch": 0.35514018691588783, "grad_norm": 0.036720361560583115, "learning_rate": 0.0002, "loss": 0.5901257395744324, "mean_token_accuracy": 0.7647226899862289, "num_tokens": 1550084.0, "step": 95 }, { "entropy": 0.5599866956472397, "epoch": 0.35887850467289717, "grad_norm": 0.03618223965167999, "learning_rate": 0.0002, "loss": 0.5656697750091553, "mean_token_accuracy": 0.7732058614492416, "num_tokens": 1566526.0, "step": 96 }, { "entropy": 0.5660023838281631, "epoch": 0.36261682242990656, "grad_norm": 0.037616875022649765, "learning_rate": 0.0002, "loss": 0.5731638669967651, "mean_token_accuracy": 0.7681225687265396, "num_tokens": 1582887.0, "step": 97 }, { "entropy": 0.5692461878061295, "epoch": 0.3663551401869159, "grad_norm": 0.04291412979364395, "learning_rate": 0.0002, "loss": 0.5790476202964783, "mean_token_accuracy": 0.7658884823322296, "num_tokens": 1599367.0, "step": 98 }, { "entropy": 0.5626956224441528, "epoch": 0.37009345794392523, "grad_norm": 0.03269932419061661, "learning_rate": 0.0002, "loss": 0.5623303651809692, "mean_token_accuracy": 0.7726950198411942, "num_tokens": 1615716.0, "step": 99 }, { "entropy": 0.5417574644088745, "epoch": 0.37383177570093457, "grad_norm": 0.029643645510077477, "learning_rate": 0.0002, "loss": 0.5503037571907043, "mean_token_accuracy": 0.7786638289690018, "num_tokens": 1631985.0, "step": 100 }, { "entropy": 0.5644317716360092, "epoch": 0.3775700934579439, "grad_norm": 0.03810103237628937, "learning_rate": 0.0002, "loss": 0.5641601085662842, "mean_token_accuracy": 0.7715529501438141, "num_tokens": 1648148.0, "step": 101 }, { "entropy": 0.5648799985647202, "epoch": 0.3813084112149533, "grad_norm": 0.02914907969534397, "learning_rate": 0.0002, "loss": 0.5619527101516724, "mean_token_accuracy": 0.7744928747415543, "num_tokens": 1664554.0, "step": 102 }, { "entropy": 0.5753660798072815, "epoch": 0.3850467289719626, "grad_norm": 0.02887723594903946, "learning_rate": 0.0002, "loss": 0.5688785314559937, "mean_token_accuracy": 0.7692504674196243, "num_tokens": 1680782.0, "step": 103 }, { "entropy": 0.561363086104393, "epoch": 0.38878504672897196, "grad_norm": 0.028774583712220192, "learning_rate": 0.0002, "loss": 0.560323178768158, "mean_token_accuracy": 0.7716943174600601, "num_tokens": 1696855.0, "step": 104 }, { "entropy": 0.5558189004659653, "epoch": 0.3925233644859813, "grad_norm": 0.030897047370672226, "learning_rate": 0.0002, "loss": 0.5627227425575256, "mean_token_accuracy": 0.7728832811117172, "num_tokens": 1713092.0, "step": 105 }, { "entropy": 0.5579479783773422, "epoch": 0.39626168224299063, "grad_norm": 0.03168272599577904, "learning_rate": 0.0002, "loss": 0.5611063241958618, "mean_token_accuracy": 0.7737848162651062, "num_tokens": 1729174.0, "step": 106 }, { "entropy": 0.5593132227659225, "epoch": 0.4, "grad_norm": 0.030001681298017502, "learning_rate": 0.0002, "loss": 0.5634371638298035, "mean_token_accuracy": 0.7737011611461639, "num_tokens": 1745387.0, "step": 107 }, { "entropy": 0.5454982221126556, "epoch": 0.40373831775700936, "grad_norm": 0.033263012766838074, "learning_rate": 0.0002, "loss": 0.5490332841873169, "mean_token_accuracy": 0.7772792428731918, "num_tokens": 1761446.0, "step": 108 }, { "entropy": 0.5551732182502747, "epoch": 0.4074766355140187, "grad_norm": 0.030698338523507118, "learning_rate": 0.0002, "loss": 0.5535954236984253, "mean_token_accuracy": 0.773947462439537, "num_tokens": 1778105.0, "step": 109 }, { "entropy": 0.5650522261857986, "epoch": 0.411214953271028, "grad_norm": 0.02939177118241787, "learning_rate": 0.0002, "loss": 0.5615048408508301, "mean_token_accuracy": 0.7712746411561966, "num_tokens": 1794562.0, "step": 110 }, { "entropy": 0.5696343183517456, "epoch": 0.41495327102803736, "grad_norm": 0.03011537715792656, "learning_rate": 0.0002, "loss": 0.5706506967544556, "mean_token_accuracy": 0.7699969708919525, "num_tokens": 1810779.0, "step": 111 }, { "entropy": 0.5387005656957626, "epoch": 0.41869158878504675, "grad_norm": 0.033464495092630386, "learning_rate": 0.0002, "loss": 0.5423218607902527, "mean_token_accuracy": 0.7795679718255997, "num_tokens": 1827208.0, "step": 112 }, { "entropy": 0.5597733706235886, "epoch": 0.4224299065420561, "grad_norm": 0.029017142951488495, "learning_rate": 0.0002, "loss": 0.5561181306838989, "mean_token_accuracy": 0.7743376046419144, "num_tokens": 1843649.0, "step": 113 }, { "entropy": 0.5541809946298599, "epoch": 0.4261682242990654, "grad_norm": 0.030042298138141632, "learning_rate": 0.0002, "loss": 0.5544824600219727, "mean_token_accuracy": 0.7773302495479584, "num_tokens": 1859919.0, "step": 114 }, { "entropy": 0.5697837471961975, "epoch": 0.42990654205607476, "grad_norm": 0.029710182920098305, "learning_rate": 0.0002, "loss": 0.5684210658073425, "mean_token_accuracy": 0.7717447876930237, "num_tokens": 1876288.0, "step": 115 }, { "entropy": 0.5591758489608765, "epoch": 0.4336448598130841, "grad_norm": 0.031515248119831085, "learning_rate": 0.0002, "loss": 0.5618751645088196, "mean_token_accuracy": 0.77419513463974, "num_tokens": 1892685.0, "step": 116 }, { "entropy": 0.5360209345817566, "epoch": 0.4373831775700935, "grad_norm": 0.036333996802568436, "learning_rate": 0.0002, "loss": 0.5519132614135742, "mean_token_accuracy": 0.77690489590168, "num_tokens": 1908983.0, "step": 117 }, { "entropy": 0.5584719926118851, "epoch": 0.4411214953271028, "grad_norm": 0.03057498298585415, "learning_rate": 0.0002, "loss": 0.5668904185295105, "mean_token_accuracy": 0.7719320356845856, "num_tokens": 1925134.0, "step": 118 }, { "entropy": 0.5634136199951172, "epoch": 0.44485981308411215, "grad_norm": 0.038503021001815796, "learning_rate": 0.0002, "loss": 0.5522302389144897, "mean_token_accuracy": 0.7777165621519089, "num_tokens": 1941319.0, "step": 119 }, { "entropy": 0.5695697367191315, "epoch": 0.4485981308411215, "grad_norm": 0.02690051682293415, "learning_rate": 0.0002, "loss": 0.5623375773429871, "mean_token_accuracy": 0.7749422192573547, "num_tokens": 1957576.0, "step": 120 }, { "entropy": 0.5670370161533356, "epoch": 0.4523364485981308, "grad_norm": 0.030103027820587158, "learning_rate": 0.0002, "loss": 0.5645368695259094, "mean_token_accuracy": 0.7715286463499069, "num_tokens": 1973598.0, "step": 121 }, { "entropy": 0.5673844367265701, "epoch": 0.45607476635514016, "grad_norm": 0.03927698731422424, "learning_rate": 0.0002, "loss": 0.5738642811775208, "mean_token_accuracy": 0.7676763832569122, "num_tokens": 1989896.0, "step": 122 }, { "entropy": 0.5642601549625397, "epoch": 0.45981308411214955, "grad_norm": 0.040063194930553436, "learning_rate": 0.0002, "loss": 0.5772222280502319, "mean_token_accuracy": 0.7651336938142776, "num_tokens": 2006217.0, "step": 123 }, { "entropy": 0.5646145790815353, "epoch": 0.4635514018691589, "grad_norm": 0.02972179837524891, "learning_rate": 0.0002, "loss": 0.5596722960472107, "mean_token_accuracy": 0.7738584727048874, "num_tokens": 2022407.0, "step": 124 }, { "entropy": 0.5680184960365295, "epoch": 0.4672897196261682, "grad_norm": 0.03161488473415375, "learning_rate": 0.0002, "loss": 0.5569790601730347, "mean_token_accuracy": 0.7752905040979385, "num_tokens": 2038990.0, "step": 125 }, { "entropy": 0.5721628367900848, "epoch": 0.47102803738317756, "grad_norm": 0.03150559216737747, "learning_rate": 0.0002, "loss": 0.56056147813797, "mean_token_accuracy": 0.7753510475158691, "num_tokens": 2055485.0, "step": 126 }, { "entropy": 0.5526139587163925, "epoch": 0.4747663551401869, "grad_norm": 0.02876976877450943, "learning_rate": 0.0002, "loss": 0.555187463760376, "mean_token_accuracy": 0.7740543335676193, "num_tokens": 2071792.0, "step": 127 }, { "entropy": 0.542378157377243, "epoch": 0.4785046728971963, "grad_norm": 0.03460092097520828, "learning_rate": 0.0002, "loss": 0.5530366897583008, "mean_token_accuracy": 0.7747022658586502, "num_tokens": 2087874.0, "step": 128 }, { "entropy": 0.5451681464910507, "epoch": 0.4822429906542056, "grad_norm": 0.02991570346057415, "learning_rate": 0.0002, "loss": 0.549987256526947, "mean_token_accuracy": 0.7774564474821091, "num_tokens": 2104238.0, "step": 129 }, { "entropy": 0.5554285645484924, "epoch": 0.48598130841121495, "grad_norm": 0.0326702855527401, "learning_rate": 0.0002, "loss": 0.5605641603469849, "mean_token_accuracy": 0.7726142853498459, "num_tokens": 2120477.0, "step": 130 }, { "entropy": 0.555129811167717, "epoch": 0.4897196261682243, "grad_norm": 0.031020283699035645, "learning_rate": 0.0002, "loss": 0.5525497198104858, "mean_token_accuracy": 0.7749627828598022, "num_tokens": 2136857.0, "step": 131 }, { "entropy": 0.5660799294710159, "epoch": 0.4934579439252336, "grad_norm": 0.03083673305809498, "learning_rate": 0.0002, "loss": 0.5555440187454224, "mean_token_accuracy": 0.7719593346118927, "num_tokens": 2153526.0, "step": 132 }, { "entropy": 0.5561708807945251, "epoch": 0.497196261682243, "grad_norm": 0.031476520001888275, "learning_rate": 0.0002, "loss": 0.555605411529541, "mean_token_accuracy": 0.7762354910373688, "num_tokens": 2169651.0, "step": 133 }, { "entropy": 0.525283932685852, "epoch": 0.5009345794392523, "grad_norm": 0.03160262852907181, "learning_rate": 0.0002, "loss": 0.5320227742195129, "mean_token_accuracy": 0.7818241119384766, "num_tokens": 2185700.0, "step": 134 }, { "entropy": 0.5597178190946579, "epoch": 0.5046728971962616, "grad_norm": 0.03169814869761467, "learning_rate": 0.0002, "loss": 0.5603609681129456, "mean_token_accuracy": 0.7734936475753784, "num_tokens": 2201832.0, "step": 135 }, { "entropy": 0.5660498142242432, "epoch": 0.508411214953271, "grad_norm": 0.03322802484035492, "learning_rate": 0.0002, "loss": 0.570435643196106, "mean_token_accuracy": 0.7702528983354568, "num_tokens": 2218197.0, "step": 136 }, { "entropy": 0.5471976101398468, "epoch": 0.5121495327102804, "grad_norm": 0.031250759959220886, "learning_rate": 0.0002, "loss": 0.5555264353752136, "mean_token_accuracy": 0.7744151949882507, "num_tokens": 2234366.0, "step": 137 }, { "entropy": 0.5514054894447327, "epoch": 0.5158878504672897, "grad_norm": 0.026281429454684258, "learning_rate": 0.0002, "loss": 0.5531660318374634, "mean_token_accuracy": 0.7755394726991653, "num_tokens": 2250665.0, "step": 138 }, { "entropy": 0.5651220381259918, "epoch": 0.5196261682242991, "grad_norm": 0.031022025272250175, "learning_rate": 0.0002, "loss": 0.564669132232666, "mean_token_accuracy": 0.773309201002121, "num_tokens": 2266978.0, "step": 139 }, { "entropy": 0.5677877366542816, "epoch": 0.5233644859813084, "grad_norm": 0.030657587572932243, "learning_rate": 0.0002, "loss": 0.564283013343811, "mean_token_accuracy": 0.7711436003446579, "num_tokens": 2283321.0, "step": 140 }, { "entropy": 0.5454884767532349, "epoch": 0.5271028037383177, "grad_norm": 0.029621724039316177, "learning_rate": 0.0002, "loss": 0.5448048710823059, "mean_token_accuracy": 0.7774412333965302, "num_tokens": 2299654.0, "step": 141 }, { "entropy": 0.5593066215515137, "epoch": 0.5308411214953271, "grad_norm": 0.03370071202516556, "learning_rate": 0.0002, "loss": 0.5656630992889404, "mean_token_accuracy": 0.7700357884168625, "num_tokens": 2315917.0, "step": 142 }, { "entropy": 0.5630017071962357, "epoch": 0.5345794392523364, "grad_norm": 0.03445977345108986, "learning_rate": 0.0002, "loss": 0.5749462842941284, "mean_token_accuracy": 0.7682285755872726, "num_tokens": 2332053.0, "step": 143 }, { "entropy": 0.5692644715309143, "epoch": 0.5383177570093458, "grad_norm": 0.034105394035577774, "learning_rate": 0.0002, "loss": 0.5713233351707458, "mean_token_accuracy": 0.7670455425977707, "num_tokens": 2348321.0, "step": 144 }, { "entropy": 0.5742600113153458, "epoch": 0.5420560747663551, "grad_norm": 0.031007220968604088, "learning_rate": 0.0002, "loss": 0.571353554725647, "mean_token_accuracy": 0.76962810754776, "num_tokens": 2364386.0, "step": 145 }, { "entropy": 0.5725259482860565, "epoch": 0.5457943925233645, "grad_norm": 0.030071116983890533, "learning_rate": 0.0002, "loss": 0.5640747547149658, "mean_token_accuracy": 0.7740518748760223, "num_tokens": 2380815.0, "step": 146 }, { "entropy": 0.5748542249202728, "epoch": 0.5495327102803739, "grad_norm": 0.03353971987962723, "learning_rate": 0.0002, "loss": 0.5691145062446594, "mean_token_accuracy": 0.7703811824321747, "num_tokens": 2396915.0, "step": 147 }, { "entropy": 0.5501144975423813, "epoch": 0.5532710280373832, "grad_norm": 0.029002781957387924, "learning_rate": 0.0002, "loss": 0.5473450422286987, "mean_token_accuracy": 0.7768280953168869, "num_tokens": 2412894.0, "step": 148 }, { "entropy": 0.5640593320131302, "epoch": 0.5570093457943925, "grad_norm": 0.0339277982711792, "learning_rate": 0.0002, "loss": 0.568105936050415, "mean_token_accuracy": 0.7686444222927094, "num_tokens": 2429333.0, "step": 149 }, { "entropy": 0.5358926355838776, "epoch": 0.5607476635514018, "grad_norm": 0.03321727365255356, "learning_rate": 0.0002, "loss": 0.5451691150665283, "mean_token_accuracy": 0.7813747376203537, "num_tokens": 2445547.0, "step": 150 }, { "entropy": 0.575822114944458, "epoch": 0.5644859813084112, "grad_norm": 0.028913335874676704, "learning_rate": 0.0002, "loss": 0.5796110033988953, "mean_token_accuracy": 0.7663715481758118, "num_tokens": 2461739.0, "step": 151 }, { "entropy": 0.5666410624980927, "epoch": 0.5682242990654206, "grad_norm": 0.030346350744366646, "learning_rate": 0.0002, "loss": 0.5563742518424988, "mean_token_accuracy": 0.7750760018825531, "num_tokens": 2478290.0, "step": 152 }, { "entropy": 0.5700524747371674, "epoch": 0.5719626168224299, "grad_norm": 0.03455440327525139, "learning_rate": 0.0002, "loss": 0.5611424446105957, "mean_token_accuracy": 0.7719277888536453, "num_tokens": 2494845.0, "step": 153 }, { "entropy": 0.561910405755043, "epoch": 0.5757009345794393, "grad_norm": 0.029596278443932533, "learning_rate": 0.0002, "loss": 0.5637333393096924, "mean_token_accuracy": 0.771451935172081, "num_tokens": 2511497.0, "step": 154 }, { "entropy": 0.5496856719255447, "epoch": 0.5794392523364486, "grad_norm": 0.02896132506430149, "learning_rate": 0.0002, "loss": 0.5627070665359497, "mean_token_accuracy": 0.7726458758115768, "num_tokens": 2527582.0, "step": 155 }, { "entropy": 0.5563309341669083, "epoch": 0.5831775700934579, "grad_norm": 0.04145891219377518, "learning_rate": 0.0002, "loss": 0.5785839557647705, "mean_token_accuracy": 0.7629837244749069, "num_tokens": 2543948.0, "step": 156 }, { "entropy": 0.5635025650262833, "epoch": 0.5869158878504673, "grad_norm": 0.028125908225774765, "learning_rate": 0.0002, "loss": 0.5688048005104065, "mean_token_accuracy": 0.7708674967288971, "num_tokens": 2560174.0, "step": 157 }, { "entropy": 0.5650362074375153, "epoch": 0.5906542056074766, "grad_norm": 0.031838495284318924, "learning_rate": 0.0002, "loss": 0.5594847798347473, "mean_token_accuracy": 0.7728245556354523, "num_tokens": 2576418.0, "step": 158 }, { "entropy": 0.5560010820627213, "epoch": 0.594392523364486, "grad_norm": 0.03514372557401657, "learning_rate": 0.0002, "loss": 0.5445454120635986, "mean_token_accuracy": 0.7787751257419586, "num_tokens": 2592454.0, "step": 159 }, { "entropy": 0.552829384803772, "epoch": 0.5981308411214953, "grad_norm": 0.028390226885676384, "learning_rate": 0.0002, "loss": 0.5493785738945007, "mean_token_accuracy": 0.7761707901954651, "num_tokens": 2608586.0, "step": 160 }, { "entropy": 0.5553926527500153, "epoch": 0.6018691588785047, "grad_norm": 0.02847958728671074, "learning_rate": 0.0002, "loss": 0.5555365681648254, "mean_token_accuracy": 0.7766669541597366, "num_tokens": 2624962.0, "step": 161 }, { "entropy": 0.551996037364006, "epoch": 0.6056074766355141, "grad_norm": 0.03402937948703766, "learning_rate": 0.0002, "loss": 0.557694673538208, "mean_token_accuracy": 0.7744593769311905, "num_tokens": 2641382.0, "step": 162 }, { "entropy": 0.5671762228012085, "epoch": 0.6093457943925233, "grad_norm": 0.03495490923523903, "learning_rate": 0.0002, "loss": 0.5758394002914429, "mean_token_accuracy": 0.7660740315914154, "num_tokens": 2657986.0, "step": 163 }, { "entropy": 0.5575901418924332, "epoch": 0.6130841121495327, "grad_norm": 0.03418085724115372, "learning_rate": 0.0002, "loss": 0.5583428740501404, "mean_token_accuracy": 0.7739714235067368, "num_tokens": 2673995.0, "step": 164 }, { "entropy": 0.5644998699426651, "epoch": 0.616822429906542, "grad_norm": 0.028694115579128265, "learning_rate": 0.0002, "loss": 0.5556347370147705, "mean_token_accuracy": 0.775534600019455, "num_tokens": 2690249.0, "step": 165 }, { "entropy": 0.5767987668514252, "epoch": 0.6205607476635514, "grad_norm": 0.03323300555348396, "learning_rate": 0.0002, "loss": 0.5688591003417969, "mean_token_accuracy": 0.7711433321237564, "num_tokens": 2706818.0, "step": 166 }, { "entropy": 0.5557750165462494, "epoch": 0.6242990654205608, "grad_norm": 0.030084028840065002, "learning_rate": 0.0002, "loss": 0.5595380067825317, "mean_token_accuracy": 0.7722294181585312, "num_tokens": 2722820.0, "step": 167 }, { "entropy": 0.562026247382164, "epoch": 0.6280373831775701, "grad_norm": 0.03125706687569618, "learning_rate": 0.0002, "loss": 0.5637321472167969, "mean_token_accuracy": 0.7692414969205856, "num_tokens": 2739398.0, "step": 168 }, { "entropy": 0.5448627471923828, "epoch": 0.6317757009345795, "grad_norm": 0.03390555456280708, "learning_rate": 0.0002, "loss": 0.5494401454925537, "mean_token_accuracy": 0.7776045203208923, "num_tokens": 2755453.0, "step": 169 }, { "entropy": 0.5523964762687683, "epoch": 0.6355140186915887, "grad_norm": 0.03687772527337074, "learning_rate": 0.0002, "loss": 0.5620272159576416, "mean_token_accuracy": 0.7718589901924133, "num_tokens": 2771533.0, "step": 170 }, { "entropy": 0.5672519207000732, "epoch": 0.6392523364485981, "grad_norm": 0.035152945667505264, "learning_rate": 0.0002, "loss": 0.5725542306900024, "mean_token_accuracy": 0.768815353512764, "num_tokens": 2787816.0, "step": 171 }, { "entropy": 0.5715326368808746, "epoch": 0.6429906542056075, "grad_norm": 0.032671887427568436, "learning_rate": 0.0002, "loss": 0.5690709352493286, "mean_token_accuracy": 0.7705206274986267, "num_tokens": 2804253.0, "step": 172 }, { "entropy": 0.5771492570638657, "epoch": 0.6467289719626168, "grad_norm": 0.03344012424349785, "learning_rate": 0.0002, "loss": 0.5672138929367065, "mean_token_accuracy": 0.7719729393720627, "num_tokens": 2820473.0, "step": 173 }, { "entropy": 0.5444837659597397, "epoch": 0.6504672897196262, "grad_norm": 0.029676884412765503, "learning_rate": 0.0002, "loss": 0.5400466322898865, "mean_token_accuracy": 0.7845920622348785, "num_tokens": 2836738.0, "step": 174 }, { "entropy": 0.5679149776697159, "epoch": 0.6542056074766355, "grad_norm": 0.03190155327320099, "learning_rate": 0.0002, "loss": 0.5703109502792358, "mean_token_accuracy": 0.7677883356809616, "num_tokens": 2853015.0, "step": 175 }, { "entropy": 0.5386882424354553, "epoch": 0.6579439252336449, "grad_norm": 0.03156553953886032, "learning_rate": 0.0002, "loss": 0.5451309680938721, "mean_token_accuracy": 0.7785861194133759, "num_tokens": 2869326.0, "step": 176 }, { "entropy": 0.5546389669179916, "epoch": 0.6616822429906543, "grad_norm": 0.03298742696642876, "learning_rate": 0.0002, "loss": 0.5598126649856567, "mean_token_accuracy": 0.7714642137289047, "num_tokens": 2885638.0, "step": 177 }, { "entropy": 0.5554563403129578, "epoch": 0.6654205607476635, "grad_norm": 0.034988123923540115, "learning_rate": 0.0002, "loss": 0.5639896392822266, "mean_token_accuracy": 0.7712263017892838, "num_tokens": 2902116.0, "step": 178 }, { "entropy": 0.5492645055055618, "epoch": 0.6691588785046729, "grad_norm": 0.03213873505592346, "learning_rate": 0.0002, "loss": 0.5490330457687378, "mean_token_accuracy": 0.7778918445110321, "num_tokens": 2918514.0, "step": 179 }, { "entropy": 0.5809471905231476, "epoch": 0.6728971962616822, "grad_norm": 0.02829456329345703, "learning_rate": 0.0002, "loss": 0.5780236721038818, "mean_token_accuracy": 0.7631959617137909, "num_tokens": 2935180.0, "step": 180 }, { "entropy": 0.5545472204685211, "epoch": 0.6766355140186916, "grad_norm": 0.026784643530845642, "learning_rate": 0.0002, "loss": 0.5539122819900513, "mean_token_accuracy": 0.7744273245334625, "num_tokens": 2951485.0, "step": 181 }, { "entropy": 0.5583300441503525, "epoch": 0.680373831775701, "grad_norm": 0.028181226924061775, "learning_rate": 0.0002, "loss": 0.5567899942398071, "mean_token_accuracy": 0.7753158956766129, "num_tokens": 2967799.0, "step": 182 }, { "entropy": 0.5597800463438034, "epoch": 0.6841121495327103, "grad_norm": 0.027700597420334816, "learning_rate": 0.0002, "loss": 0.559861958026886, "mean_token_accuracy": 0.772071048617363, "num_tokens": 2984240.0, "step": 183 }, { "entropy": 0.5409596711397171, "epoch": 0.6878504672897197, "grad_norm": 0.030223077163100243, "learning_rate": 0.0002, "loss": 0.5486294031143188, "mean_token_accuracy": 0.7773659527301788, "num_tokens": 3000681.0, "step": 184 }, { "entropy": 0.5551634728908539, "epoch": 0.6915887850467289, "grad_norm": 0.02896454744040966, "learning_rate": 0.0002, "loss": 0.5600041151046753, "mean_token_accuracy": 0.7721187323331833, "num_tokens": 3017042.0, "step": 185 }, { "entropy": 0.5551397949457169, "epoch": 0.6953271028037383, "grad_norm": 0.02665393240749836, "learning_rate": 0.0002, "loss": 0.556494414806366, "mean_token_accuracy": 0.7747326493263245, "num_tokens": 3033356.0, "step": 186 }, { "entropy": 0.5497598797082901, "epoch": 0.6990654205607477, "grad_norm": 0.026862069964408875, "learning_rate": 0.0002, "loss": 0.5495949983596802, "mean_token_accuracy": 0.7788131833076477, "num_tokens": 3049609.0, "step": 187 }, { "entropy": 0.5756572186946869, "epoch": 0.702803738317757, "grad_norm": 0.028672486543655396, "learning_rate": 0.0002, "loss": 0.5735815763473511, "mean_token_accuracy": 0.7667711675167084, "num_tokens": 3065873.0, "step": 188 }, { "entropy": 0.560253381729126, "epoch": 0.7065420560747664, "grad_norm": 0.029232166707515717, "learning_rate": 0.0002, "loss": 0.5650488138198853, "mean_token_accuracy": 0.768238291144371, "num_tokens": 3081904.0, "step": 189 }, { "entropy": 0.5659812092781067, "epoch": 0.7102803738317757, "grad_norm": 0.028001444414258003, "learning_rate": 0.0002, "loss": 0.563786506652832, "mean_token_accuracy": 0.7705834209918976, "num_tokens": 3098208.0, "step": 190 }, { "entropy": 0.5397079735994339, "epoch": 0.7140186915887851, "grad_norm": 0.030035637319087982, "learning_rate": 0.0002, "loss": 0.5431380271911621, "mean_token_accuracy": 0.7773479521274567, "num_tokens": 3114448.0, "step": 191 }, { "entropy": 0.5607352703809738, "epoch": 0.7177570093457943, "grad_norm": 0.026054881513118744, "learning_rate": 0.0002, "loss": 0.5583080649375916, "mean_token_accuracy": 0.7758101969957352, "num_tokens": 3130755.0, "step": 192 }, { "entropy": 0.551689624786377, "epoch": 0.7214953271028037, "grad_norm": 0.02845809981226921, "learning_rate": 0.0002, "loss": 0.5481313467025757, "mean_token_accuracy": 0.7777986079454422, "num_tokens": 3147133.0, "step": 193 }, { "entropy": 0.5639677792787552, "epoch": 0.7252336448598131, "grad_norm": 0.029969094321131706, "learning_rate": 0.0002, "loss": 0.5681430697441101, "mean_token_accuracy": 0.7705964744091034, "num_tokens": 3163582.0, "step": 194 }, { "entropy": 0.5548544675111771, "epoch": 0.7289719626168224, "grad_norm": 0.026430293917655945, "learning_rate": 0.0002, "loss": 0.5528862476348877, "mean_token_accuracy": 0.7741632461547852, "num_tokens": 3180102.0, "step": 195 }, { "entropy": 0.5530348271131516, "epoch": 0.7327102803738318, "grad_norm": 0.026484189555048943, "learning_rate": 0.0002, "loss": 0.5540847778320312, "mean_token_accuracy": 0.7735424339771271, "num_tokens": 3196312.0, "step": 196 }, { "entropy": 0.5409010052680969, "epoch": 0.7364485981308411, "grad_norm": 0.030766047537326813, "learning_rate": 0.0002, "loss": 0.5487144589424133, "mean_token_accuracy": 0.7778207361698151, "num_tokens": 3212408.0, "step": 197 }, { "entropy": 0.5607801675796509, "epoch": 0.7401869158878505, "grad_norm": 0.029135972261428833, "learning_rate": 0.0002, "loss": 0.5579065680503845, "mean_token_accuracy": 0.7756243348121643, "num_tokens": 3228688.0, "step": 198 }, { "entropy": 0.5638224929571152, "epoch": 0.7439252336448599, "grad_norm": 0.028466643765568733, "learning_rate": 0.0002, "loss": 0.5634393095970154, "mean_token_accuracy": 0.770130917429924, "num_tokens": 3244856.0, "step": 199 }, { "entropy": 0.5390120446681976, "epoch": 0.7476635514018691, "grad_norm": 0.029409240931272507, "learning_rate": 0.0002, "loss": 0.5443782210350037, "mean_token_accuracy": 0.7796739190816879, "num_tokens": 3261004.0, "step": 200 }, { "entropy": 0.5513757616281509, "epoch": 0.7514018691588785, "grad_norm": 0.032466452568769455, "learning_rate": 0.0002, "loss": 0.5502808690071106, "mean_token_accuracy": 0.7751527577638626, "num_tokens": 3277310.0, "step": 201 }, { "entropy": 0.5808768719434738, "epoch": 0.7551401869158878, "grad_norm": 0.02947174198925495, "learning_rate": 0.0002, "loss": 0.5795295238494873, "mean_token_accuracy": 0.7640405744314194, "num_tokens": 3293719.0, "step": 202 }, { "entropy": 0.5713460445404053, "epoch": 0.7588785046728972, "grad_norm": 0.02874363400042057, "learning_rate": 0.0002, "loss": 0.5726850032806396, "mean_token_accuracy": 0.7662371546030045, "num_tokens": 3310262.0, "step": 203 }, { "entropy": 0.5619738698005676, "epoch": 0.7626168224299066, "grad_norm": 0.028361184522509575, "learning_rate": 0.0002, "loss": 0.5660584568977356, "mean_token_accuracy": 0.7703312337398529, "num_tokens": 3326670.0, "step": 204 }, { "entropy": 0.5531926304101944, "epoch": 0.7663551401869159, "grad_norm": 0.029734794050455093, "learning_rate": 0.0002, "loss": 0.5551853775978088, "mean_token_accuracy": 0.7757412046194077, "num_tokens": 3343182.0, "step": 205 }, { "entropy": 0.5436140149831772, "epoch": 0.7700934579439253, "grad_norm": 0.027612119913101196, "learning_rate": 0.0002, "loss": 0.5460025668144226, "mean_token_accuracy": 0.7787571996450424, "num_tokens": 3359734.0, "step": 206 }, { "entropy": 0.5484267920255661, "epoch": 0.7738317757009345, "grad_norm": 0.0273665152490139, "learning_rate": 0.0002, "loss": 0.5512120723724365, "mean_token_accuracy": 0.7762885689735413, "num_tokens": 3375965.0, "step": 207 }, { "entropy": 0.5604408234357834, "epoch": 0.7775700934579439, "grad_norm": 0.03310655429959297, "learning_rate": 0.0002, "loss": 0.5644571185112, "mean_token_accuracy": 0.7733126729726791, "num_tokens": 3392102.0, "step": 208 }, { "entropy": 0.5418381690979004, "epoch": 0.7813084112149533, "grad_norm": 0.03232184052467346, "learning_rate": 0.0002, "loss": 0.5521958470344543, "mean_token_accuracy": 0.7741148620843887, "num_tokens": 3408306.0, "step": 209 }, { "entropy": 0.5678922086954117, "epoch": 0.7850467289719626, "grad_norm": 0.02696731500327587, "learning_rate": 0.0002, "loss": 0.5638433694839478, "mean_token_accuracy": 0.7702384293079376, "num_tokens": 3424846.0, "step": 210 }, { "entropy": 0.5885234028100967, "epoch": 0.788785046728972, "grad_norm": 0.032732248306274414, "learning_rate": 0.0002, "loss": 0.5857526659965515, "mean_token_accuracy": 0.7618716955184937, "num_tokens": 3441315.0, "step": 211 }, { "entropy": 0.5481836199760437, "epoch": 0.7925233644859813, "grad_norm": 0.03158198669552803, "learning_rate": 0.0002, "loss": 0.5456998348236084, "mean_token_accuracy": 0.7771993726491928, "num_tokens": 3457579.0, "step": 212 }, { "entropy": 0.5607763081789017, "epoch": 0.7962616822429907, "grad_norm": 0.03416353091597557, "learning_rate": 0.0002, "loss": 0.5663735270500183, "mean_token_accuracy": 0.7718233168125153, "num_tokens": 3474205.0, "step": 213 }, { "entropy": 0.5533930957317352, "epoch": 0.8, "grad_norm": 0.02877282351255417, "learning_rate": 0.0002, "loss": 0.5556164383888245, "mean_token_accuracy": 0.7742215096950531, "num_tokens": 3490438.0, "step": 214 }, { "entropy": 0.5604168176651001, "epoch": 0.8037383177570093, "grad_norm": 0.026928121224045753, "learning_rate": 0.0002, "loss": 0.5551791191101074, "mean_token_accuracy": 0.77230204641819, "num_tokens": 3506851.0, "step": 215 }, { "entropy": 0.5647037774324417, "epoch": 0.8074766355140187, "grad_norm": 0.03445446118712425, "learning_rate": 0.0002, "loss": 0.5678783655166626, "mean_token_accuracy": 0.7699416279792786, "num_tokens": 3523043.0, "step": 216 }, { "entropy": 0.571955680847168, "epoch": 0.811214953271028, "grad_norm": 0.028322864323854446, "learning_rate": 0.0002, "loss": 0.5738518238067627, "mean_token_accuracy": 0.7654245793819427, "num_tokens": 3539365.0, "step": 217 }, { "entropy": 0.5523362904787064, "epoch": 0.8149532710280374, "grad_norm": 0.033752068877220154, "learning_rate": 0.0002, "loss": 0.5535821914672852, "mean_token_accuracy": 0.7761557102203369, "num_tokens": 3555412.0, "step": 218 }, { "entropy": 0.5571073293685913, "epoch": 0.8186915887850468, "grad_norm": 0.03274444863200188, "learning_rate": 0.0002, "loss": 0.5591251850128174, "mean_token_accuracy": 0.7738742381334305, "num_tokens": 3571607.0, "step": 219 }, { "entropy": 0.5460310876369476, "epoch": 0.822429906542056, "grad_norm": 0.03267780691385269, "learning_rate": 0.0002, "loss": 0.5483282208442688, "mean_token_accuracy": 0.774459958076477, "num_tokens": 3588112.0, "step": 220 }, { "entropy": 0.5458645969629288, "epoch": 0.8261682242990654, "grad_norm": 0.029655037447810173, "learning_rate": 0.0002, "loss": 0.553710401058197, "mean_token_accuracy": 0.7749865502119064, "num_tokens": 3604422.0, "step": 221 }, { "entropy": 0.5589277297258377, "epoch": 0.8299065420560747, "grad_norm": 0.0299095269292593, "learning_rate": 0.0002, "loss": 0.5621532201766968, "mean_token_accuracy": 0.7721328884363174, "num_tokens": 3620586.0, "step": 222 }, { "entropy": 0.5576933324337006, "epoch": 0.8336448598130841, "grad_norm": 0.031302373856306076, "learning_rate": 0.0002, "loss": 0.5637439489364624, "mean_token_accuracy": 0.7706159353256226, "num_tokens": 3636859.0, "step": 223 }, { "entropy": 0.5583267956972122, "epoch": 0.8373831775700935, "grad_norm": 0.02684536948800087, "learning_rate": 0.0002, "loss": 0.5605804920196533, "mean_token_accuracy": 0.7703929096460342, "num_tokens": 3653154.0, "step": 224 }, { "entropy": 0.5555603951215744, "epoch": 0.8411214953271028, "grad_norm": 0.025324055925011635, "learning_rate": 0.0002, "loss": 0.5553929805755615, "mean_token_accuracy": 0.773400217294693, "num_tokens": 3669474.0, "step": 225 }, { "entropy": 0.5502129048109055, "epoch": 0.8448598130841122, "grad_norm": 0.03151983022689819, "learning_rate": 0.0002, "loss": 0.5402862429618835, "mean_token_accuracy": 0.7839637249708176, "num_tokens": 3685885.0, "step": 226 }, { "entropy": 0.5631079375743866, "epoch": 0.8485981308411215, "grad_norm": 0.026639366522431374, "learning_rate": 0.0002, "loss": 0.5603518486022949, "mean_token_accuracy": 0.7707885354757309, "num_tokens": 3702475.0, "step": 227 }, { "entropy": 0.5576464682817459, "epoch": 0.8523364485981308, "grad_norm": 0.028526777401566505, "learning_rate": 0.0002, "loss": 0.5615932941436768, "mean_token_accuracy": 0.7698924392461777, "num_tokens": 3718675.0, "step": 228 }, { "entropy": 0.5553766041994095, "epoch": 0.8560747663551402, "grad_norm": 0.028387868776917458, "learning_rate": 0.0002, "loss": 0.5598117709159851, "mean_token_accuracy": 0.7748202681541443, "num_tokens": 3734973.0, "step": 229 }, { "entropy": 0.5636192113161087, "epoch": 0.8598130841121495, "grad_norm": 0.029663704335689545, "learning_rate": 0.0002, "loss": 0.5619429349899292, "mean_token_accuracy": 0.7697723060846329, "num_tokens": 3751197.0, "step": 230 }, { "entropy": 0.5656130164861679, "epoch": 0.8635514018691589, "grad_norm": 0.027196481823921204, "learning_rate": 0.0002, "loss": 0.559482753276825, "mean_token_accuracy": 0.7736194878816605, "num_tokens": 3767681.0, "step": 231 }, { "entropy": 0.5610507130622864, "epoch": 0.8672897196261682, "grad_norm": 0.02665848098695278, "learning_rate": 0.0002, "loss": 0.5574455857276917, "mean_token_accuracy": 0.7723447382450104, "num_tokens": 3784223.0, "step": 232 }, { "entropy": 0.5565789192914963, "epoch": 0.8710280373831776, "grad_norm": 0.029676776379346848, "learning_rate": 0.0002, "loss": 0.5581963062286377, "mean_token_accuracy": 0.7723328024148941, "num_tokens": 3800606.0, "step": 233 }, { "entropy": 0.5488535314798355, "epoch": 0.874766355140187, "grad_norm": 0.026432445272803307, "learning_rate": 0.0002, "loss": 0.5548264384269714, "mean_token_accuracy": 0.776095449924469, "num_tokens": 3817211.0, "step": 234 }, { "entropy": 0.5432089567184448, "epoch": 0.8785046728971962, "grad_norm": 0.028454309329390526, "learning_rate": 0.0002, "loss": 0.5551573038101196, "mean_token_accuracy": 0.7737965285778046, "num_tokens": 3833562.0, "step": 235 }, { "entropy": 0.5564523041248322, "epoch": 0.8822429906542056, "grad_norm": 0.03045317530632019, "learning_rate": 0.0002, "loss": 0.5593273043632507, "mean_token_accuracy": 0.7728880196809769, "num_tokens": 3849716.0, "step": 236 }, { "entropy": 0.5449672043323517, "epoch": 0.8859813084112149, "grad_norm": 0.026425793766975403, "learning_rate": 0.0002, "loss": 0.5469970107078552, "mean_token_accuracy": 0.777935191988945, "num_tokens": 3865915.0, "step": 237 }, { "entropy": 0.5773142129182816, "epoch": 0.8897196261682243, "grad_norm": 0.024763669818639755, "learning_rate": 0.0002, "loss": 0.5751665830612183, "mean_token_accuracy": 0.7665848284959793, "num_tokens": 3882374.0, "step": 238 }, { "entropy": 0.5337313264608383, "epoch": 0.8934579439252337, "grad_norm": 0.027221228927373886, "learning_rate": 0.0002, "loss": 0.5295661687850952, "mean_token_accuracy": 0.7860913276672363, "num_tokens": 3898501.0, "step": 239 }, { "entropy": 0.5395989120006561, "epoch": 0.897196261682243, "grad_norm": 0.026916388422250748, "learning_rate": 0.0002, "loss": 0.5377291440963745, "mean_token_accuracy": 0.7827803045511246, "num_tokens": 3914802.0, "step": 240 }, { "entropy": 0.56096251308918, "epoch": 0.9009345794392524, "grad_norm": 0.03178329020738602, "learning_rate": 0.0002, "loss": 0.5572348237037659, "mean_token_accuracy": 0.774958074092865, "num_tokens": 3931307.0, "step": 241 }, { "entropy": 0.5351977944374084, "epoch": 0.9046728971962616, "grad_norm": 0.027758494019508362, "learning_rate": 0.0002, "loss": 0.5389144420623779, "mean_token_accuracy": 0.7842132151126862, "num_tokens": 3947818.0, "step": 242 }, { "entropy": 0.5689495801925659, "epoch": 0.908411214953271, "grad_norm": 0.028313076123595238, "learning_rate": 0.0002, "loss": 0.5732687711715698, "mean_token_accuracy": 0.7685291916131973, "num_tokens": 3964238.0, "step": 243 }, { "entropy": 0.5562418401241302, "epoch": 0.9121495327102803, "grad_norm": 0.028738385066390038, "learning_rate": 0.0002, "loss": 0.5559317469596863, "mean_token_accuracy": 0.7747041881084442, "num_tokens": 3980625.0, "step": 244 }, { "entropy": 0.5630334913730621, "epoch": 0.9158878504672897, "grad_norm": 0.024547314271330833, "learning_rate": 0.0002, "loss": 0.560680627822876, "mean_token_accuracy": 0.7717334777116776, "num_tokens": 3997248.0, "step": 245 }, { "entropy": 0.5409311354160309, "epoch": 0.9196261682242991, "grad_norm": 0.029392484575510025, "learning_rate": 0.0002, "loss": 0.5488813519477844, "mean_token_accuracy": 0.7771373838186264, "num_tokens": 4013356.0, "step": 246 }, { "entropy": 0.5529599785804749, "epoch": 0.9233644859813084, "grad_norm": 0.024964116513729095, "learning_rate": 0.0002, "loss": 0.5492331385612488, "mean_token_accuracy": 0.778782069683075, "num_tokens": 4029521.0, "step": 247 }, { "entropy": 0.5397895872592926, "epoch": 0.9271028037383178, "grad_norm": 0.026621561497449875, "learning_rate": 0.0002, "loss": 0.5443588495254517, "mean_token_accuracy": 0.7782554626464844, "num_tokens": 4045913.0, "step": 248 }, { "entropy": 0.5582248121500015, "epoch": 0.930841121495327, "grad_norm": 0.02803446725010872, "learning_rate": 0.0002, "loss": 0.5627061128616333, "mean_token_accuracy": 0.7742072343826294, "num_tokens": 4062448.0, "step": 249 }, { "entropy": 0.5673990696668625, "epoch": 0.9345794392523364, "grad_norm": 0.03014424815773964, "learning_rate": 0.0002, "loss": 0.5727946162223816, "mean_token_accuracy": 0.7685662358999252, "num_tokens": 4078711.0, "step": 250 }, { "entropy": 0.566023588180542, "epoch": 0.9383177570093458, "grad_norm": 0.030524935573339462, "learning_rate": 0.0002, "loss": 0.5595183372497559, "mean_token_accuracy": 0.7738057672977448, "num_tokens": 4095240.0, "step": 251 }, { "entropy": 0.5499134510755539, "epoch": 0.9420560747663551, "grad_norm": 0.02502668835222721, "learning_rate": 0.0002, "loss": 0.5446998476982117, "mean_token_accuracy": 0.7789950519800186, "num_tokens": 4111687.0, "step": 252 }, { "entropy": 0.5639411062002182, "epoch": 0.9457943925233645, "grad_norm": 0.03420841693878174, "learning_rate": 0.0002, "loss": 0.5659236311912537, "mean_token_accuracy": 0.7703807950019836, "num_tokens": 4128093.0, "step": 253 }, { "entropy": 0.5703455805778503, "epoch": 0.9495327102803738, "grad_norm": 0.0303607527166605, "learning_rate": 0.0002, "loss": 0.5696687698364258, "mean_token_accuracy": 0.7690610140562057, "num_tokens": 4144612.0, "step": 254 }, { "entropy": 0.558226928114891, "epoch": 0.9532710280373832, "grad_norm": 0.03168858587741852, "learning_rate": 0.0002, "loss": 0.5676078200340271, "mean_token_accuracy": 0.7693912833929062, "num_tokens": 4161169.0, "step": 255 }, { "entropy": 0.5530082136392593, "epoch": 0.9570093457943926, "grad_norm": 0.027083205059170723, "learning_rate": 0.0002, "loss": 0.5579201579093933, "mean_token_accuracy": 0.772939920425415, "num_tokens": 4177454.0, "step": 256 }, { "entropy": 0.5732781291007996, "epoch": 0.9607476635514018, "grad_norm": 0.025865184143185616, "learning_rate": 0.0002, "loss": 0.5745596289634705, "mean_token_accuracy": 0.7667286545038223, "num_tokens": 4193733.0, "step": 257 }, { "entropy": 0.5650701373815536, "epoch": 0.9644859813084112, "grad_norm": 0.03244631364941597, "learning_rate": 0.0002, "loss": 0.5617667436599731, "mean_token_accuracy": 0.7715478390455246, "num_tokens": 4209843.0, "step": 258 }, { "entropy": 0.5724828094244003, "epoch": 0.9682242990654205, "grad_norm": 0.02807115763425827, "learning_rate": 0.0002, "loss": 0.5692450404167175, "mean_token_accuracy": 0.76779405772686, "num_tokens": 4226262.0, "step": 259 }, { "entropy": 0.5677514672279358, "epoch": 0.9719626168224299, "grad_norm": 0.024189095944166183, "learning_rate": 0.0002, "loss": 0.5623309016227722, "mean_token_accuracy": 0.7734705060720444, "num_tokens": 4242877.0, "step": 260 }, { "entropy": 0.56018927693367, "epoch": 0.9757009345794393, "grad_norm": 0.030152512714266777, "learning_rate": 0.0002, "loss": 0.5675455927848816, "mean_token_accuracy": 0.7673967182636261, "num_tokens": 4259432.0, "step": 261 }, { "entropy": 0.5601605176925659, "epoch": 0.9794392523364486, "grad_norm": 0.0288025364279747, "learning_rate": 0.0002, "loss": 0.5698415040969849, "mean_token_accuracy": 0.7686598151922226, "num_tokens": 4275917.0, "step": 262 }, { "entropy": 0.5593424290418625, "epoch": 0.983177570093458, "grad_norm": 0.024790652096271515, "learning_rate": 0.0002, "loss": 0.5574150085449219, "mean_token_accuracy": 0.7770240753889084, "num_tokens": 4292310.0, "step": 263 }, { "entropy": 0.5394274890422821, "epoch": 0.9869158878504672, "grad_norm": 0.02477172389626503, "learning_rate": 0.0002, "loss": 0.5407758951187134, "mean_token_accuracy": 0.780282586812973, "num_tokens": 4308380.0, "step": 264 }, { "entropy": 0.5651121735572815, "epoch": 0.9906542056074766, "grad_norm": 0.028029976412653923, "learning_rate": 0.0002, "loss": 0.5648099184036255, "mean_token_accuracy": 0.7703951746225357, "num_tokens": 4324834.0, "step": 265 }, { "entropy": 0.5426322817802429, "epoch": 0.994392523364486, "grad_norm": 0.025631116703152657, "learning_rate": 0.0002, "loss": 0.5393193364143372, "mean_token_accuracy": 0.7813181281089783, "num_tokens": 4341233.0, "step": 266 }, { "entropy": 0.5464787781238556, "epoch": 0.9981308411214953, "grad_norm": 0.029863541945815086, "learning_rate": 0.0002, "loss": 0.5550025701522827, "mean_token_accuracy": 0.7747247219085693, "num_tokens": 4357682.0, "step": 267 }, { "entropy": 0.5607179999351501, "epoch": 1.0, "grad_norm": 0.03738218545913696, "learning_rate": 0.0002, "loss": 0.5586302876472473, "mean_token_accuracy": 0.7706243097782135, "num_tokens": 4364958.0, "step": 268 }, { "entropy": 0.5429188311100006, "epoch": 1.0037383177570094, "grad_norm": 0.031045127660036087, "learning_rate": 0.0002, "loss": 0.5379543900489807, "mean_token_accuracy": 0.7818119078874588, "num_tokens": 4381160.0, "step": 269 }, { "entropy": 0.5693697482347488, "epoch": 1.0074766355140188, "grad_norm": 0.034702617675065994, "learning_rate": 0.0002, "loss": 0.5631182789802551, "mean_token_accuracy": 0.7740933299064636, "num_tokens": 4397580.0, "step": 270 }, { "entropy": 0.5556007027626038, "epoch": 1.011214953271028, "grad_norm": 0.029613088816404343, "learning_rate": 0.0002, "loss": 0.5564326643943787, "mean_token_accuracy": 0.7747503072023392, "num_tokens": 4413970.0, "step": 271 }, { "entropy": 0.5529852658510208, "epoch": 1.0149532710280373, "grad_norm": 0.028977181762456894, "learning_rate": 0.0002, "loss": 0.5552069544792175, "mean_token_accuracy": 0.7720492333173752, "num_tokens": 4430293.0, "step": 272 }, { "entropy": 0.5520482361316681, "epoch": 1.0186915887850467, "grad_norm": 0.03374192863702774, "learning_rate": 0.0002, "loss": 0.5517052412033081, "mean_token_accuracy": 0.7761924266815186, "num_tokens": 4446900.0, "step": 273 }, { "entropy": 0.5477887243032455, "epoch": 1.0224299065420561, "grad_norm": 0.02954636886715889, "learning_rate": 0.0002, "loss": 0.5459023714065552, "mean_token_accuracy": 0.7766608893871307, "num_tokens": 4463329.0, "step": 274 }, { "entropy": 0.5484108775854111, "epoch": 1.0261682242990655, "grad_norm": 0.029792649671435356, "learning_rate": 0.0002, "loss": 0.553299069404602, "mean_token_accuracy": 0.7751943320035934, "num_tokens": 4479679.0, "step": 275 }, { "entropy": 0.5480824410915375, "epoch": 1.0299065420560747, "grad_norm": 0.03428385779261589, "learning_rate": 0.0002, "loss": 0.54673171043396, "mean_token_accuracy": 0.7777809202671051, "num_tokens": 4496261.0, "step": 276 }, { "entropy": 0.5371964275836945, "epoch": 1.033644859813084, "grad_norm": 0.027453402057290077, "learning_rate": 0.0002, "loss": 0.5412828922271729, "mean_token_accuracy": 0.7782962769269943, "num_tokens": 4512363.0, "step": 277 }, { "entropy": 0.5626021921634674, "epoch": 1.0373831775700935, "grad_norm": 0.03147402033209801, "learning_rate": 0.0002, "loss": 0.5639899373054504, "mean_token_accuracy": 0.772662416100502, "num_tokens": 4528687.0, "step": 278 }, { "entropy": 0.5309132784605026, "epoch": 1.0411214953271029, "grad_norm": 0.03592999279499054, "learning_rate": 0.0002, "loss": 0.5408714413642883, "mean_token_accuracy": 0.7803217619657516, "num_tokens": 4544861.0, "step": 279 }, { "entropy": 0.5621335506439209, "epoch": 1.0448598130841122, "grad_norm": 0.027180444449186325, "learning_rate": 0.0002, "loss": 0.5557287931442261, "mean_token_accuracy": 0.7766296565532684, "num_tokens": 4561446.0, "step": 280 }, { "entropy": 0.5597621351480484, "epoch": 1.0485981308411214, "grad_norm": 0.030723722651600838, "learning_rate": 0.0002, "loss": 0.5488376617431641, "mean_token_accuracy": 0.7752789407968521, "num_tokens": 4577902.0, "step": 281 }, { "entropy": 0.5447895377874374, "epoch": 1.0523364485981308, "grad_norm": 0.03346191346645355, "learning_rate": 0.0002, "loss": 0.54459547996521, "mean_token_accuracy": 0.7764092832803726, "num_tokens": 4593907.0, "step": 282 }, { "entropy": 0.5376723855733871, "epoch": 1.0560747663551402, "grad_norm": 0.029941193759441376, "learning_rate": 0.0002, "loss": 0.5396949052810669, "mean_token_accuracy": 0.7800134569406509, "num_tokens": 4610281.0, "step": 283 }, { "entropy": 0.532968744635582, "epoch": 1.0598130841121496, "grad_norm": 0.03566444665193558, "learning_rate": 0.0002, "loss": 0.5449310541152954, "mean_token_accuracy": 0.7814425081014633, "num_tokens": 4626569.0, "step": 284 }, { "entropy": 0.5349016040563583, "epoch": 1.063551401869159, "grad_norm": 0.03160771727561951, "learning_rate": 0.0002, "loss": 0.5422961115837097, "mean_token_accuracy": 0.7798893004655838, "num_tokens": 4643058.0, "step": 285 }, { "entropy": 0.533850871026516, "epoch": 1.0672897196261681, "grad_norm": 0.036520425230264664, "learning_rate": 0.0002, "loss": 0.5418434739112854, "mean_token_accuracy": 0.7801807075738907, "num_tokens": 4659171.0, "step": 286 }, { "entropy": 0.5512394160032272, "epoch": 1.0710280373831775, "grad_norm": 0.030453668907284737, "learning_rate": 0.0002, "loss": 0.547731339931488, "mean_token_accuracy": 0.77372145652771, "num_tokens": 4675372.0, "step": 287 }, { "entropy": 0.5371382534503937, "epoch": 1.074766355140187, "grad_norm": 0.031432170420885086, "learning_rate": 0.0002, "loss": 0.5252817869186401, "mean_token_accuracy": 0.7852388918399811, "num_tokens": 4691895.0, "step": 288 }, { "entropy": 0.5536183714866638, "epoch": 1.0785046728971963, "grad_norm": 0.036878716200590134, "learning_rate": 0.0002, "loss": 0.5542073249816895, "mean_token_accuracy": 0.7766832113265991, "num_tokens": 4708579.0, "step": 289 }, { "entropy": 0.5479064285755157, "epoch": 1.0822429906542057, "grad_norm": 0.031178997829556465, "learning_rate": 0.0002, "loss": 0.5539444088935852, "mean_token_accuracy": 0.7733383923768997, "num_tokens": 4725006.0, "step": 290 }, { "entropy": 0.5490889102220535, "epoch": 1.0859813084112149, "grad_norm": 0.03600861504673958, "learning_rate": 0.0002, "loss": 0.5477103590965271, "mean_token_accuracy": 0.7760229259729385, "num_tokens": 4741146.0, "step": 291 }, { "entropy": 0.5331408083438873, "epoch": 1.0897196261682243, "grad_norm": 0.029067492112517357, "learning_rate": 0.0002, "loss": 0.5310513377189636, "mean_token_accuracy": 0.7808917611837387, "num_tokens": 4757405.0, "step": 292 }, { "entropy": 0.5732952356338501, "epoch": 1.0934579439252337, "grad_norm": 0.027897845953702927, "learning_rate": 0.0002, "loss": 0.5689205527305603, "mean_token_accuracy": 0.7669987231492996, "num_tokens": 4773935.0, "step": 293 }, { "entropy": 0.5514747202396393, "epoch": 1.097196261682243, "grad_norm": 0.03678213432431221, "learning_rate": 0.0002, "loss": 0.5475887060165405, "mean_token_accuracy": 0.7782610803842545, "num_tokens": 4790197.0, "step": 294 }, { "entropy": 0.5528618544340134, "epoch": 1.1009345794392524, "grad_norm": 0.03136972337961197, "learning_rate": 0.0002, "loss": 0.5539395213127136, "mean_token_accuracy": 0.7734730243682861, "num_tokens": 4806625.0, "step": 295 }, { "entropy": 0.5395589917898178, "epoch": 1.1046728971962616, "grad_norm": 0.030648380517959595, "learning_rate": 0.0002, "loss": 0.5440752506256104, "mean_token_accuracy": 0.7809486091136932, "num_tokens": 4823046.0, "step": 296 }, { "entropy": 0.5670987218618393, "epoch": 1.108411214953271, "grad_norm": 0.028722837567329407, "learning_rate": 0.0002, "loss": 0.5669575929641724, "mean_token_accuracy": 0.7682226747274399, "num_tokens": 4839449.0, "step": 297 }, { "entropy": 0.5453528463840485, "epoch": 1.1121495327102804, "grad_norm": 0.03358433395624161, "learning_rate": 0.0002, "loss": 0.5394450426101685, "mean_token_accuracy": 0.7793479263782501, "num_tokens": 4855702.0, "step": 298 }, { "entropy": 0.5313688218593597, "epoch": 1.1158878504672898, "grad_norm": 0.031751058995723724, "learning_rate": 0.0002, "loss": 0.5339279174804688, "mean_token_accuracy": 0.7852170914411545, "num_tokens": 4872035.0, "step": 299 }, { "entropy": 0.5542233884334564, "epoch": 1.1196261682242992, "grad_norm": 0.030381185933947563, "learning_rate": 0.0002, "loss": 0.5629603862762451, "mean_token_accuracy": 0.76924729347229, "num_tokens": 4888405.0, "step": 300 }, { "entropy": 0.5514146685600281, "epoch": 1.1233644859813083, "grad_norm": 0.028884021565318108, "learning_rate": 0.0002, "loss": 0.550013542175293, "mean_token_accuracy": 0.7766973823308945, "num_tokens": 4904871.0, "step": 301 }, { "entropy": 0.5544252693653107, "epoch": 1.1271028037383177, "grad_norm": 0.03688167408108711, "learning_rate": 0.0002, "loss": 0.5589375495910645, "mean_token_accuracy": 0.7750934660434723, "num_tokens": 4921370.0, "step": 302 }, { "entropy": 0.5409253090620041, "epoch": 1.1308411214953271, "grad_norm": 0.026449156925082207, "learning_rate": 0.0002, "loss": 0.5402511358261108, "mean_token_accuracy": 0.7794521301984787, "num_tokens": 4937635.0, "step": 303 }, { "entropy": 0.5496914833784103, "epoch": 1.1345794392523365, "grad_norm": 0.030888745561242104, "learning_rate": 0.0002, "loss": 0.5520302653312683, "mean_token_accuracy": 0.7741389274597168, "num_tokens": 4953795.0, "step": 304 }, { "entropy": 0.5356033593416214, "epoch": 1.1383177570093457, "grad_norm": 0.030453680083155632, "learning_rate": 0.0002, "loss": 0.5415939092636108, "mean_token_accuracy": 0.7807344794273376, "num_tokens": 4970296.0, "step": 305 }, { "entropy": 0.53813037276268, "epoch": 1.142056074766355, "grad_norm": 0.03046366199851036, "learning_rate": 0.0002, "loss": 0.5416396856307983, "mean_token_accuracy": 0.7764643579721451, "num_tokens": 4986502.0, "step": 306 }, { "entropy": 0.5428405404090881, "epoch": 1.1457943925233645, "grad_norm": 0.03174874931573868, "learning_rate": 0.0002, "loss": 0.5486522912979126, "mean_token_accuracy": 0.7775285989046097, "num_tokens": 5002702.0, "step": 307 }, { "entropy": 0.5566747784614563, "epoch": 1.1495327102803738, "grad_norm": 0.028818320482969284, "learning_rate": 0.0002, "loss": 0.5562471151351929, "mean_token_accuracy": 0.77483069896698, "num_tokens": 5019050.0, "step": 308 }, { "entropy": 0.5498685240745544, "epoch": 1.1532710280373832, "grad_norm": 0.028088422492146492, "learning_rate": 0.0002, "loss": 0.5427108407020569, "mean_token_accuracy": 0.7781059741973877, "num_tokens": 5035367.0, "step": 309 }, { "entropy": 0.5676623731851578, "epoch": 1.1570093457943926, "grad_norm": 0.02635916881263256, "learning_rate": 0.0002, "loss": 0.5621261596679688, "mean_token_accuracy": 0.7690412253141403, "num_tokens": 5051623.0, "step": 310 }, { "entropy": 0.5571839809417725, "epoch": 1.1607476635514018, "grad_norm": 0.030562767758965492, "learning_rate": 0.0002, "loss": 0.5547442436218262, "mean_token_accuracy": 0.773685023188591, "num_tokens": 5067784.0, "step": 311 }, { "entropy": 0.5521961599588394, "epoch": 1.1644859813084112, "grad_norm": 0.02953186444938183, "learning_rate": 0.0002, "loss": 0.5498039722442627, "mean_token_accuracy": 0.7766331732273102, "num_tokens": 5084198.0, "step": 312 }, { "entropy": 0.5448037981987, "epoch": 1.1682242990654206, "grad_norm": 0.04071420431137085, "learning_rate": 0.0002, "loss": 0.5559482574462891, "mean_token_accuracy": 0.7727169245481491, "num_tokens": 5100585.0, "step": 313 }, { "entropy": 0.5439905822277069, "epoch": 1.17196261682243, "grad_norm": 0.031825143843889236, "learning_rate": 0.0002, "loss": 0.5438477396965027, "mean_token_accuracy": 0.7780765742063522, "num_tokens": 5116856.0, "step": 314 }, { "entropy": 0.5614278465509415, "epoch": 1.1757009345794391, "grad_norm": 0.03391456976532936, "learning_rate": 0.0002, "loss": 0.5585231781005859, "mean_token_accuracy": 0.774724468588829, "num_tokens": 5133123.0, "step": 315 }, { "entropy": 0.5348840728402138, "epoch": 1.1794392523364485, "grad_norm": 0.030404910445213318, "learning_rate": 0.0002, "loss": 0.5299553275108337, "mean_token_accuracy": 0.7871359586715698, "num_tokens": 5149505.0, "step": 316 }, { "entropy": 0.5417611449956894, "epoch": 1.183177570093458, "grad_norm": 0.03005358763039112, "learning_rate": 0.0002, "loss": 0.5521109700202942, "mean_token_accuracy": 0.7752534449100494, "num_tokens": 5165665.0, "step": 317 }, { "entropy": 0.5467934459447861, "epoch": 1.1869158878504673, "grad_norm": 0.030464891344308853, "learning_rate": 0.0002, "loss": 0.5535311698913574, "mean_token_accuracy": 0.7757606655359268, "num_tokens": 5182312.0, "step": 318 }, { "entropy": 0.55706687271595, "epoch": 1.1906542056074767, "grad_norm": 0.03402930125594139, "learning_rate": 0.0002, "loss": 0.56557697057724, "mean_token_accuracy": 0.773482084274292, "num_tokens": 5198753.0, "step": 319 }, { "entropy": 0.5285287350416183, "epoch": 1.194392523364486, "grad_norm": 0.03398562967777252, "learning_rate": 0.0002, "loss": 0.5356812477111816, "mean_token_accuracy": 0.781065508723259, "num_tokens": 5214716.0, "step": 320 }, { "entropy": 0.5561061501502991, "epoch": 1.1981308411214953, "grad_norm": 0.04313025251030922, "learning_rate": 0.0002, "loss": 0.5472796559333801, "mean_token_accuracy": 0.7778294533491135, "num_tokens": 5230933.0, "step": 321 }, { "entropy": 0.556538999080658, "epoch": 1.2018691588785047, "grad_norm": 0.03227441757917404, "learning_rate": 0.0002, "loss": 0.5438181161880493, "mean_token_accuracy": 0.7791680693626404, "num_tokens": 5247202.0, "step": 322 }, { "entropy": 0.5609522461891174, "epoch": 1.205607476635514, "grad_norm": 0.03183369338512421, "learning_rate": 0.0002, "loss": 0.5561162829399109, "mean_token_accuracy": 0.7751743495464325, "num_tokens": 5263696.0, "step": 323 }, { "entropy": 0.5427358001470566, "epoch": 1.2093457943925234, "grad_norm": 0.03253727778792381, "learning_rate": 0.0002, "loss": 0.5515695214271545, "mean_token_accuracy": 0.7756281793117523, "num_tokens": 5280141.0, "step": 324 }, { "entropy": 0.5160750597715378, "epoch": 1.2130841121495326, "grad_norm": 0.03668288141489029, "learning_rate": 0.0002, "loss": 0.526226282119751, "mean_token_accuracy": 0.7851300984621048, "num_tokens": 5296198.0, "step": 325 }, { "entropy": 0.5500008910894394, "epoch": 1.216822429906542, "grad_norm": 0.03275466337800026, "learning_rate": 0.0002, "loss": 0.5556660890579224, "mean_token_accuracy": 0.7739221006631851, "num_tokens": 5312653.0, "step": 326 }, { "entropy": 0.5459257364273071, "epoch": 1.2205607476635514, "grad_norm": 0.02891591377556324, "learning_rate": 0.0002, "loss": 0.5413340330123901, "mean_token_accuracy": 0.781257688999176, "num_tokens": 5328926.0, "step": 327 }, { "entropy": 0.5695579349994659, "epoch": 1.2242990654205608, "grad_norm": 0.0299241840839386, "learning_rate": 0.0002, "loss": 0.5636513233184814, "mean_token_accuracy": 0.7732590138912201, "num_tokens": 5345213.0, "step": 328 }, { "entropy": 0.5591664463281631, "epoch": 1.2280373831775702, "grad_norm": 0.034591834992170334, "learning_rate": 0.0002, "loss": 0.5587798953056335, "mean_token_accuracy": 0.7725549340248108, "num_tokens": 5361493.0, "step": 329 }, { "entropy": 0.5631786286830902, "epoch": 1.2317757009345796, "grad_norm": 0.03143571689724922, "learning_rate": 0.0002, "loss": 0.5540720224380493, "mean_token_accuracy": 0.7765887975692749, "num_tokens": 5378085.0, "step": 330 }, { "entropy": 0.5508914291858673, "epoch": 1.2355140186915887, "grad_norm": 0.032595690339803696, "learning_rate": 0.0002, "loss": 0.5526955723762512, "mean_token_accuracy": 0.7747674286365509, "num_tokens": 5394458.0, "step": 331 }, { "entropy": 0.536909781396389, "epoch": 1.2392523364485981, "grad_norm": 0.033028744161129, "learning_rate": 0.0002, "loss": 0.5481626987457275, "mean_token_accuracy": 0.7782605588436127, "num_tokens": 5410880.0, "step": 332 }, { "entropy": 0.5499342679977417, "epoch": 1.2429906542056075, "grad_norm": 0.03855755180120468, "learning_rate": 0.0002, "loss": 0.5627814531326294, "mean_token_accuracy": 0.7700037658214569, "num_tokens": 5426885.0, "step": 333 }, { "entropy": 0.5494136810302734, "epoch": 1.246728971962617, "grad_norm": 0.03397782891988754, "learning_rate": 0.0002, "loss": 0.5508397817611694, "mean_token_accuracy": 0.7756514847278595, "num_tokens": 5443330.0, "step": 334 }, { "entropy": 0.5679187029600143, "epoch": 1.250467289719626, "grad_norm": 0.03217748925089836, "learning_rate": 0.0002, "loss": 0.5683805346488953, "mean_token_accuracy": 0.770328551530838, "num_tokens": 5459602.0, "step": 335 }, { "entropy": 0.5620801448822021, "epoch": 1.2542056074766355, "grad_norm": 0.03699919581413269, "learning_rate": 0.0002, "loss": 0.556020200252533, "mean_token_accuracy": 0.7749847769737244, "num_tokens": 5475920.0, "step": 336 }, { "entropy": 0.5483541190624237, "epoch": 1.2579439252336448, "grad_norm": 0.027093922719359398, "learning_rate": 0.0002, "loss": 0.5420067310333252, "mean_token_accuracy": 0.7774698734283447, "num_tokens": 5492418.0, "step": 337 }, { "entropy": 0.5432356148958206, "epoch": 1.2616822429906542, "grad_norm": 0.029740024358034134, "learning_rate": 0.0002, "loss": 0.5436828136444092, "mean_token_accuracy": 0.7754241824150085, "num_tokens": 5508720.0, "step": 338 }, { "entropy": 0.5282722562551498, "epoch": 1.2654205607476636, "grad_norm": 0.02825041115283966, "learning_rate": 0.0002, "loss": 0.5287445783615112, "mean_token_accuracy": 0.785777822136879, "num_tokens": 5524810.0, "step": 339 }, { "entropy": 0.5574855506420135, "epoch": 1.269158878504673, "grad_norm": 0.03507409617304802, "learning_rate": 0.0002, "loss": 0.5642590522766113, "mean_token_accuracy": 0.7694929391145706, "num_tokens": 5541154.0, "step": 340 }, { "entropy": 0.5311331301927567, "epoch": 1.2728971962616822, "grad_norm": 0.029530638828873634, "learning_rate": 0.0002, "loss": 0.5375971794128418, "mean_token_accuracy": 0.7804928719997406, "num_tokens": 5557415.0, "step": 341 }, { "entropy": 0.5492513477802277, "epoch": 1.2766355140186916, "grad_norm": 0.03299937769770622, "learning_rate": 0.0002, "loss": 0.5487713813781738, "mean_token_accuracy": 0.7776053845882416, "num_tokens": 5573593.0, "step": 342 }, { "entropy": 0.5501092821359634, "epoch": 1.280373831775701, "grad_norm": 0.03342421352863312, "learning_rate": 0.0002, "loss": 0.5497907996177673, "mean_token_accuracy": 0.7747702449560165, "num_tokens": 5590001.0, "step": 343 }, { "entropy": 0.5520797073841095, "epoch": 1.2841121495327104, "grad_norm": 0.029625268653035164, "learning_rate": 0.0002, "loss": 0.5493736267089844, "mean_token_accuracy": 0.7800589352846146, "num_tokens": 5606174.0, "step": 344 }, { "entropy": 0.5360356196761131, "epoch": 1.2878504672897195, "grad_norm": 0.03089168108999729, "learning_rate": 0.0002, "loss": 0.5362368226051331, "mean_token_accuracy": 0.7833685129880905, "num_tokens": 5622436.0, "step": 345 }, { "entropy": 0.5267095118761063, "epoch": 1.291588785046729, "grad_norm": 0.03297918289899826, "learning_rate": 0.0002, "loss": 0.5281186699867249, "mean_token_accuracy": 0.7881515920162201, "num_tokens": 5638451.0, "step": 346 }, { "entropy": 0.5502850115299225, "epoch": 1.2953271028037383, "grad_norm": 0.047267865389585495, "learning_rate": 0.0002, "loss": 0.5505760312080383, "mean_token_accuracy": 0.7761109918355942, "num_tokens": 5655041.0, "step": 347 }, { "entropy": 0.5508257895708084, "epoch": 1.2990654205607477, "grad_norm": 0.028140036389231682, "learning_rate": 0.0002, "loss": 0.5515832304954529, "mean_token_accuracy": 0.7750399112701416, "num_tokens": 5671677.0, "step": 348 }, { "entropy": 0.5565541088581085, "epoch": 1.302803738317757, "grad_norm": 0.032449062913656235, "learning_rate": 0.0002, "loss": 0.5538536310195923, "mean_token_accuracy": 0.7736092507839203, "num_tokens": 5688187.0, "step": 349 }, { "entropy": 0.5361721217632294, "epoch": 1.3065420560747665, "grad_norm": 0.029190748929977417, "learning_rate": 0.0002, "loss": 0.5377737879753113, "mean_token_accuracy": 0.7808200567960739, "num_tokens": 5704636.0, "step": 350 }, { "entropy": 0.5346792191267014, "epoch": 1.3102803738317756, "grad_norm": 0.03473074361681938, "learning_rate": 0.0002, "loss": 0.5417028665542603, "mean_token_accuracy": 0.778437003493309, "num_tokens": 5721160.0, "step": 351 }, { "entropy": 0.5305602103471756, "epoch": 1.314018691588785, "grad_norm": 0.03426121547818184, "learning_rate": 0.0002, "loss": 0.5302631258964539, "mean_token_accuracy": 0.7822723984718323, "num_tokens": 5737508.0, "step": 352 }, { "entropy": 0.5443065613508224, "epoch": 1.3177570093457944, "grad_norm": 0.031232863664627075, "learning_rate": 0.0002, "loss": 0.5438801050186157, "mean_token_accuracy": 0.7807773351669312, "num_tokens": 5753931.0, "step": 353 }, { "entropy": 0.5547338575124741, "epoch": 1.3214953271028038, "grad_norm": 0.03515113145112991, "learning_rate": 0.0002, "loss": 0.5590701103210449, "mean_token_accuracy": 0.7718778848648071, "num_tokens": 5770396.0, "step": 354 }, { "entropy": 0.5776932686567307, "epoch": 1.325233644859813, "grad_norm": 0.031292639672756195, "learning_rate": 0.0002, "loss": 0.5758817791938782, "mean_token_accuracy": 0.76340052485466, "num_tokens": 5786743.0, "step": 355 }, { "entropy": 0.5471627116203308, "epoch": 1.3289719626168224, "grad_norm": 0.02935577929019928, "learning_rate": 0.0002, "loss": 0.5406426787376404, "mean_token_accuracy": 0.7801960557699203, "num_tokens": 5803296.0, "step": 356 }, { "entropy": 0.5335498154163361, "epoch": 1.3327102803738318, "grad_norm": 0.029476149007678032, "learning_rate": 0.0002, "loss": 0.5379401445388794, "mean_token_accuracy": 0.7807924002408981, "num_tokens": 5819523.0, "step": 357 }, { "entropy": 0.571747362613678, "epoch": 1.3364485981308412, "grad_norm": 0.030969126150012016, "learning_rate": 0.0002, "loss": 0.5734298825263977, "mean_token_accuracy": 0.7665233165025711, "num_tokens": 5835904.0, "step": 358 }, { "entropy": 0.5278273224830627, "epoch": 1.3401869158878505, "grad_norm": 0.035017624497413635, "learning_rate": 0.0002, "loss": 0.5390288233757019, "mean_token_accuracy": 0.7818515002727509, "num_tokens": 5852087.0, "step": 359 }, { "entropy": 0.5494511723518372, "epoch": 1.34392523364486, "grad_norm": 0.0332498699426651, "learning_rate": 0.0002, "loss": 0.5546149611473083, "mean_token_accuracy": 0.7754078060388565, "num_tokens": 5868313.0, "step": 360 }, { "entropy": 0.5656353235244751, "epoch": 1.347663551401869, "grad_norm": 0.029156476259231567, "learning_rate": 0.0002, "loss": 0.5639902353286743, "mean_token_accuracy": 0.7691005319356918, "num_tokens": 5884673.0, "step": 361 }, { "entropy": 0.5517591834068298, "epoch": 1.3514018691588785, "grad_norm": 0.033162813633680344, "learning_rate": 0.0002, "loss": 0.5487698316574097, "mean_token_accuracy": 0.7762563526630402, "num_tokens": 5901026.0, "step": 362 }, { "entropy": 0.5693054497241974, "epoch": 1.355140186915888, "grad_norm": 0.03303493186831474, "learning_rate": 0.0002, "loss": 0.5636650323867798, "mean_token_accuracy": 0.7702258229255676, "num_tokens": 5917299.0, "step": 363 }, { "entropy": 0.5485306680202484, "epoch": 1.358878504672897, "grad_norm": 0.028174106031656265, "learning_rate": 0.0002, "loss": 0.5443013310432434, "mean_token_accuracy": 0.7785944491624832, "num_tokens": 5933711.0, "step": 364 }, { "entropy": 0.5455866008996964, "epoch": 1.3626168224299064, "grad_norm": 0.03680690750479698, "learning_rate": 0.0002, "loss": 0.5549443364143372, "mean_token_accuracy": 0.7760016471147537, "num_tokens": 5949851.0, "step": 365 }, { "entropy": 0.5625369846820831, "epoch": 1.3663551401869158, "grad_norm": 0.03274211287498474, "learning_rate": 0.0002, "loss": 0.5614032745361328, "mean_token_accuracy": 0.7710064649581909, "num_tokens": 5966219.0, "step": 366 }, { "entropy": 0.5512880086898804, "epoch": 1.3700934579439252, "grad_norm": 0.029914218932390213, "learning_rate": 0.0002, "loss": 0.5541912317276001, "mean_token_accuracy": 0.7744521200656891, "num_tokens": 5982685.0, "step": 367 }, { "entropy": 0.5462228506803513, "epoch": 1.3738317757009346, "grad_norm": 0.03740010783076286, "learning_rate": 0.0002, "loss": 0.542587161064148, "mean_token_accuracy": 0.7833080589771271, "num_tokens": 5999012.0, "step": 368 }, { "entropy": 0.5561699420213699, "epoch": 1.377570093457944, "grad_norm": 0.03154682740569115, "learning_rate": 0.0002, "loss": 0.5543806552886963, "mean_token_accuracy": 0.7729498744010925, "num_tokens": 6015418.0, "step": 369 }, { "entropy": 0.5295282006263733, "epoch": 1.3813084112149534, "grad_norm": 0.029992269352078438, "learning_rate": 0.0002, "loss": 0.5347234010696411, "mean_token_accuracy": 0.7826734483242035, "num_tokens": 6031664.0, "step": 370 }, { "entropy": 0.5307233035564423, "epoch": 1.3850467289719626, "grad_norm": 0.0387556329369545, "learning_rate": 0.0002, "loss": 0.5442472696304321, "mean_token_accuracy": 0.7788428515195847, "num_tokens": 6047789.0, "step": 371 }, { "entropy": 0.5666087120771408, "epoch": 1.388785046728972, "grad_norm": 0.03485598787665367, "learning_rate": 0.0002, "loss": 0.5701879858970642, "mean_token_accuracy": 0.7664644569158554, "num_tokens": 6064072.0, "step": 372 }, { "entropy": 0.5600801408290863, "epoch": 1.3925233644859814, "grad_norm": 0.030468204990029335, "learning_rate": 0.0002, "loss": 0.557839035987854, "mean_token_accuracy": 0.7774783074855804, "num_tokens": 6080233.0, "step": 373 }, { "entropy": 0.5573039948940277, "epoch": 1.3962616822429905, "grad_norm": 0.03327672928571701, "learning_rate": 0.0002, "loss": 0.5551377534866333, "mean_token_accuracy": 0.7740774154663086, "num_tokens": 6096552.0, "step": 374 }, { "entropy": 0.5559895187616348, "epoch": 1.4, "grad_norm": 0.029464859515428543, "learning_rate": 0.0002, "loss": 0.5499491691589355, "mean_token_accuracy": 0.778936430811882, "num_tokens": 6112721.0, "step": 375 }, { "entropy": 0.5373993217945099, "epoch": 1.4037383177570093, "grad_norm": 0.033405598253011703, "learning_rate": 0.0002, "loss": 0.5378676652908325, "mean_token_accuracy": 0.78409743309021, "num_tokens": 6128876.0, "step": 376 }, { "entropy": 0.5293000936508179, "epoch": 1.4074766355140187, "grad_norm": 0.03749069571495056, "learning_rate": 0.0002, "loss": 0.5442302823066711, "mean_token_accuracy": 0.7793403714895248, "num_tokens": 6145070.0, "step": 377 }, { "entropy": 0.5288459360599518, "epoch": 1.411214953271028, "grad_norm": 0.0304460097104311, "learning_rate": 0.0002, "loss": 0.5322169661521912, "mean_token_accuracy": 0.7845710813999176, "num_tokens": 6161358.0, "step": 378 }, { "entropy": 0.5396905541419983, "epoch": 1.4149532710280375, "grad_norm": 0.0334291011095047, "learning_rate": 0.0002, "loss": 0.536848783493042, "mean_token_accuracy": 0.7786440551280975, "num_tokens": 6177744.0, "step": 379 }, { "entropy": 0.5749261528253555, "epoch": 1.4186915887850469, "grad_norm": 0.03149184212088585, "learning_rate": 0.0002, "loss": 0.5657936334609985, "mean_token_accuracy": 0.7711158096790314, "num_tokens": 6194294.0, "step": 380 }, { "entropy": 0.5584524124860764, "epoch": 1.422429906542056, "grad_norm": 0.03502335026860237, "learning_rate": 0.0002, "loss": 0.5578019618988037, "mean_token_accuracy": 0.7754084765911102, "num_tokens": 6210591.0, "step": 381 }, { "entropy": 0.5385516434907913, "epoch": 1.4261682242990654, "grad_norm": 0.029922619462013245, "learning_rate": 0.0002, "loss": 0.5379009246826172, "mean_token_accuracy": 0.7822572886943817, "num_tokens": 6226836.0, "step": 382 }, { "entropy": 0.5303553491830826, "epoch": 1.4299065420560748, "grad_norm": 0.03207620605826378, "learning_rate": 0.0002, "loss": 0.5399402379989624, "mean_token_accuracy": 0.7848275154829025, "num_tokens": 6243140.0, "step": 383 }, { "entropy": 0.5435499548912048, "epoch": 1.433644859813084, "grad_norm": 0.034929681569337845, "learning_rate": 0.0002, "loss": 0.5510104298591614, "mean_token_accuracy": 0.7754337340593338, "num_tokens": 6259135.0, "step": 384 }, { "entropy": 0.5495016276836395, "epoch": 1.4373831775700934, "grad_norm": 0.02961392141878605, "learning_rate": 0.0002, "loss": 0.5518282651901245, "mean_token_accuracy": 0.7770158797502518, "num_tokens": 6275478.0, "step": 385 }, { "entropy": 0.5597821772098541, "epoch": 1.4411214953271028, "grad_norm": 0.03038998879492283, "learning_rate": 0.0002, "loss": 0.5598548650741577, "mean_token_accuracy": 0.7717087864875793, "num_tokens": 6292022.0, "step": 386 }, { "entropy": 0.5554857552051544, "epoch": 1.4448598130841122, "grad_norm": 0.034831635653972626, "learning_rate": 0.0002, "loss": 0.5589088201522827, "mean_token_accuracy": 0.7742104977369308, "num_tokens": 6308395.0, "step": 387 }, { "entropy": 0.5330976247787476, "epoch": 1.4485981308411215, "grad_norm": 0.03864655643701553, "learning_rate": 0.0002, "loss": 0.5340397357940674, "mean_token_accuracy": 0.7843937277793884, "num_tokens": 6324443.0, "step": 388 }, { "entropy": 0.5459477603435516, "epoch": 1.452336448598131, "grad_norm": 0.03552354499697685, "learning_rate": 0.0002, "loss": 0.546898603439331, "mean_token_accuracy": 0.7767336070537567, "num_tokens": 6340452.0, "step": 389 }, { "entropy": 0.555869922041893, "epoch": 1.45607476635514, "grad_norm": 0.042999885976314545, "learning_rate": 0.0002, "loss": 0.5562218427658081, "mean_token_accuracy": 0.772677481174469, "num_tokens": 6356737.0, "step": 390 }, { "entropy": 0.5476373881101608, "epoch": 1.4598130841121495, "grad_norm": 0.034353937953710556, "learning_rate": 0.0002, "loss": 0.5502485632896423, "mean_token_accuracy": 0.7757505625486374, "num_tokens": 6372959.0, "step": 391 }, { "entropy": 0.5542000085115433, "epoch": 1.4635514018691589, "grad_norm": 0.030675135552883148, "learning_rate": 0.0002, "loss": 0.5507063865661621, "mean_token_accuracy": 0.7746506035327911, "num_tokens": 6389285.0, "step": 392 }, { "entropy": 0.5308681577444077, "epoch": 1.4672897196261683, "grad_norm": 0.03328751027584076, "learning_rate": 0.0002, "loss": 0.5308902263641357, "mean_token_accuracy": 0.7832993865013123, "num_tokens": 6405473.0, "step": 393 }, { "entropy": 0.5490089803934097, "epoch": 1.4710280373831774, "grad_norm": 0.03258799389004707, "learning_rate": 0.0002, "loss": 0.5524098873138428, "mean_token_accuracy": 0.7753634303808212, "num_tokens": 6421682.0, "step": 394 }, { "entropy": 0.5617490261793137, "epoch": 1.4747663551401868, "grad_norm": 0.03237268701195717, "learning_rate": 0.0002, "loss": 0.5609363913536072, "mean_token_accuracy": 0.7727462351322174, "num_tokens": 6438225.0, "step": 395 }, { "entropy": 0.5548438280820847, "epoch": 1.4785046728971962, "grad_norm": 0.0355081707239151, "learning_rate": 0.0002, "loss": 0.5486972332000732, "mean_token_accuracy": 0.7752490490674973, "num_tokens": 6454558.0, "step": 396 }, { "entropy": 0.539698138833046, "epoch": 1.4822429906542056, "grad_norm": 0.03101828694343567, "learning_rate": 0.0002, "loss": 0.5438753366470337, "mean_token_accuracy": 0.776269868016243, "num_tokens": 6470673.0, "step": 397 }, { "entropy": 0.5318429321050644, "epoch": 1.485981308411215, "grad_norm": 0.040831487625837326, "learning_rate": 0.0002, "loss": 0.5361422300338745, "mean_token_accuracy": 0.7855317145586014, "num_tokens": 6486739.0, "step": 398 }, { "entropy": 0.5382596254348755, "epoch": 1.4897196261682244, "grad_norm": 0.03325575962662697, "learning_rate": 0.0002, "loss": 0.5401434302330017, "mean_token_accuracy": 0.7797534018754959, "num_tokens": 6502900.0, "step": 399 }, { "entropy": 0.5596988648176193, "epoch": 1.4934579439252336, "grad_norm": 0.028764478862285614, "learning_rate": 0.0002, "loss": 0.5577390193939209, "mean_token_accuracy": 0.7748348712921143, "num_tokens": 6519408.0, "step": 400 }, { "entropy": 0.5493527054786682, "epoch": 1.497196261682243, "grad_norm": 0.028892861679196358, "learning_rate": 0.0002, "loss": 0.5473135709762573, "mean_token_accuracy": 0.777830645442009, "num_tokens": 6535811.0, "step": 401 }, { "entropy": 0.5402602255344391, "epoch": 1.5009345794392523, "grad_norm": 0.03191126883029938, "learning_rate": 0.0002, "loss": 0.5474570989608765, "mean_token_accuracy": 0.7774458974599838, "num_tokens": 6552173.0, "step": 402 }, { "entropy": 0.540817379951477, "epoch": 1.5046728971962615, "grad_norm": 0.03177822753787041, "learning_rate": 0.0002, "loss": 0.548837423324585, "mean_token_accuracy": 0.7776143550872803, "num_tokens": 6568527.0, "step": 403 }, { "entropy": 0.5428208336234093, "epoch": 1.508411214953271, "grad_norm": 0.030568130314350128, "learning_rate": 0.0002, "loss": 0.5432289242744446, "mean_token_accuracy": 0.7798717468976974, "num_tokens": 6584756.0, "step": 404 }, { "entropy": 0.5466499626636505, "epoch": 1.5121495327102803, "grad_norm": 0.032929882407188416, "learning_rate": 0.0002, "loss": 0.5407195687294006, "mean_token_accuracy": 0.7786379009485245, "num_tokens": 6601082.0, "step": 405 }, { "entropy": 0.5593132823705673, "epoch": 1.5158878504672897, "grad_norm": 0.03837394341826439, "learning_rate": 0.0002, "loss": 0.5646262168884277, "mean_token_accuracy": 0.771564781665802, "num_tokens": 6617429.0, "step": 406 }, { "entropy": 0.5453289300203323, "epoch": 1.519626168224299, "grad_norm": 0.03576509654521942, "learning_rate": 0.0002, "loss": 0.5487722158432007, "mean_token_accuracy": 0.7768426388502121, "num_tokens": 6633826.0, "step": 407 }, { "entropy": 0.53939288854599, "epoch": 1.5233644859813085, "grad_norm": 0.032857585698366165, "learning_rate": 0.0002, "loss": 0.5385522246360779, "mean_token_accuracy": 0.7790959179401398, "num_tokens": 6650240.0, "step": 408 }, { "entropy": 0.5520011931657791, "epoch": 1.5271028037383179, "grad_norm": 0.030627621337771416, "learning_rate": 0.0002, "loss": 0.5516581535339355, "mean_token_accuracy": 0.7760986834764481, "num_tokens": 6666454.0, "step": 409 }, { "entropy": 0.5406108945608139, "epoch": 1.5308411214953273, "grad_norm": 0.036952704191207886, "learning_rate": 0.0002, "loss": 0.545346736907959, "mean_token_accuracy": 0.7765967845916748, "num_tokens": 6682741.0, "step": 410 }, { "entropy": 0.5551878213882446, "epoch": 1.5345794392523364, "grad_norm": 0.02871653437614441, "learning_rate": 0.0002, "loss": 0.54979008436203, "mean_token_accuracy": 0.7789790332317352, "num_tokens": 6699160.0, "step": 411 }, { "entropy": 0.5512814819812775, "epoch": 1.5383177570093458, "grad_norm": 0.03201194107532501, "learning_rate": 0.0002, "loss": 0.5527634620666504, "mean_token_accuracy": 0.7734574526548386, "num_tokens": 6715511.0, "step": 412 }, { "entropy": 0.5432283580303192, "epoch": 1.542056074766355, "grad_norm": 0.040297310799360275, "learning_rate": 0.0002, "loss": 0.5455228686332703, "mean_token_accuracy": 0.7767939269542694, "num_tokens": 6731688.0, "step": 413 }, { "entropy": 0.5464504212141037, "epoch": 1.5457943925233644, "grad_norm": 0.03343544527888298, "learning_rate": 0.0002, "loss": 0.543891191482544, "mean_token_accuracy": 0.7797385454177856, "num_tokens": 6747995.0, "step": 414 }, { "entropy": 0.5669636428356171, "epoch": 1.5495327102803738, "grad_norm": 0.03769576549530029, "learning_rate": 0.0002, "loss": 0.5689972639083862, "mean_token_accuracy": 0.7693852484226227, "num_tokens": 6764353.0, "step": 415 }, { "entropy": 0.5392922759056091, "epoch": 1.5532710280373832, "grad_norm": 0.03238385543227196, "learning_rate": 0.0002, "loss": 0.5441082715988159, "mean_token_accuracy": 0.779180720448494, "num_tokens": 6780896.0, "step": 416 }, { "entropy": 0.530147522687912, "epoch": 1.5570093457943925, "grad_norm": 0.040036849677562714, "learning_rate": 0.0002, "loss": 0.5422973036766052, "mean_token_accuracy": 0.7789286226034164, "num_tokens": 6797151.0, "step": 417 }, { "entropy": 0.5386764258146286, "epoch": 1.560747663551402, "grad_norm": 0.03689395636320114, "learning_rate": 0.0002, "loss": 0.5467624068260193, "mean_token_accuracy": 0.7778990417718887, "num_tokens": 6813386.0, "step": 418 }, { "entropy": 0.5509621798992157, "epoch": 1.5644859813084113, "grad_norm": 0.029403693974018097, "learning_rate": 0.0002, "loss": 0.5459365248680115, "mean_token_accuracy": 0.7784391641616821, "num_tokens": 6829627.0, "step": 419 }, { "entropy": 0.5576108992099762, "epoch": 1.5682242990654207, "grad_norm": 0.03426877036690712, "learning_rate": 0.0002, "loss": 0.5519037246704102, "mean_token_accuracy": 0.7766879051923752, "num_tokens": 6845675.0, "step": 420 }, { "entropy": 0.5511836111545563, "epoch": 1.5719626168224299, "grad_norm": 0.03294205665588379, "learning_rate": 0.0002, "loss": 0.5434479117393494, "mean_token_accuracy": 0.7805502861738205, "num_tokens": 6861921.0, "step": 421 }, { "entropy": 0.5404133796691895, "epoch": 1.5757009345794393, "grad_norm": 0.032488446682691574, "learning_rate": 0.0002, "loss": 0.5410423278808594, "mean_token_accuracy": 0.7808396965265274, "num_tokens": 6877883.0, "step": 422 }, { "entropy": 0.5403463542461395, "epoch": 1.5794392523364484, "grad_norm": 0.03610778972506523, "learning_rate": 0.0002, "loss": 0.5484398603439331, "mean_token_accuracy": 0.775899812579155, "num_tokens": 6894361.0, "step": 423 }, { "entropy": 0.5344756990671158, "epoch": 1.5831775700934578, "grad_norm": 0.040382951498031616, "learning_rate": 0.0002, "loss": 0.5388015508651733, "mean_token_accuracy": 0.7805848121643066, "num_tokens": 6910715.0, "step": 424 }, { "entropy": 0.5353002026677132, "epoch": 1.5869158878504672, "grad_norm": 0.03316662460565567, "learning_rate": 0.0002, "loss": 0.5393432974815369, "mean_token_accuracy": 0.7816650718450546, "num_tokens": 6927150.0, "step": 425 }, { "entropy": 0.5770704746246338, "epoch": 1.5906542056074766, "grad_norm": 0.034545231610536575, "learning_rate": 0.0002, "loss": 0.579833984375, "mean_token_accuracy": 0.7628369480371475, "num_tokens": 6943549.0, "step": 426 }, { "entropy": 0.5552347898483276, "epoch": 1.594392523364486, "grad_norm": 0.03268204629421234, "learning_rate": 0.0002, "loss": 0.5537080764770508, "mean_token_accuracy": 0.7791409194469452, "num_tokens": 6959832.0, "step": 427 }, { "entropy": 0.5671118795871735, "epoch": 1.5981308411214954, "grad_norm": 0.025902021676301956, "learning_rate": 0.0002, "loss": 0.5616373419761658, "mean_token_accuracy": 0.771975114941597, "num_tokens": 6976368.0, "step": 428 }, { "entropy": 0.5544670224189758, "epoch": 1.6018691588785048, "grad_norm": 0.0315086655318737, "learning_rate": 0.0002, "loss": 0.5545330047607422, "mean_token_accuracy": 0.7738883197307587, "num_tokens": 6992718.0, "step": 429 }, { "entropy": 0.5558904558420181, "epoch": 1.6056074766355142, "grad_norm": 0.033460259437561035, "learning_rate": 0.0002, "loss": 0.5574325323104858, "mean_token_accuracy": 0.772273600101471, "num_tokens": 7009062.0, "step": 430 }, { "entropy": 0.5590114444494247, "epoch": 1.6093457943925233, "grad_norm": 0.029064292088150978, "learning_rate": 0.0002, "loss": 0.5580740571022034, "mean_token_accuracy": 0.7744424343109131, "num_tokens": 7025645.0, "step": 431 }, { "entropy": 0.5402631610631943, "epoch": 1.6130841121495327, "grad_norm": 0.04296636953949928, "learning_rate": 0.0002, "loss": 0.5493630170822144, "mean_token_accuracy": 0.7780915945768356, "num_tokens": 7041830.0, "step": 432 }, { "entropy": 0.5555061250925064, "epoch": 1.616822429906542, "grad_norm": 0.03312353044748306, "learning_rate": 0.0002, "loss": 0.5578774809837341, "mean_token_accuracy": 0.7739899456501007, "num_tokens": 7058231.0, "step": 433 }, { "entropy": 0.5563363283872604, "epoch": 1.6205607476635513, "grad_norm": 0.03301616013050079, "learning_rate": 0.0002, "loss": 0.5517432689666748, "mean_token_accuracy": 0.7788877487182617, "num_tokens": 7074655.0, "step": 434 }, { "entropy": 0.5507991462945938, "epoch": 1.6242990654205607, "grad_norm": 0.03195936232805252, "learning_rate": 0.0002, "loss": 0.5476133227348328, "mean_token_accuracy": 0.7775176912546158, "num_tokens": 7090766.0, "step": 435 }, { "entropy": 0.5565993189811707, "epoch": 1.62803738317757, "grad_norm": 0.03229626268148422, "learning_rate": 0.0002, "loss": 0.5532009601593018, "mean_token_accuracy": 0.7752693891525269, "num_tokens": 7106963.0, "step": 436 }, { "entropy": 0.5465118885040283, "epoch": 1.6317757009345795, "grad_norm": 0.034706246107816696, "learning_rate": 0.0002, "loss": 0.551576554775238, "mean_token_accuracy": 0.7718321233987808, "num_tokens": 7122926.0, "step": 437 }, { "entropy": 0.5443113446235657, "epoch": 1.6355140186915889, "grad_norm": 0.04082060605287552, "learning_rate": 0.0002, "loss": 0.5574634671211243, "mean_token_accuracy": 0.7741082310676575, "num_tokens": 7139165.0, "step": 438 }, { "entropy": 0.5489460676908493, "epoch": 1.6392523364485982, "grad_norm": 0.03261584788560867, "learning_rate": 0.0002, "loss": 0.5546178817749023, "mean_token_accuracy": 0.7754340916872025, "num_tokens": 7155500.0, "step": 439 }, { "entropy": 0.5663624107837677, "epoch": 1.6429906542056076, "grad_norm": 0.030861368402838707, "learning_rate": 0.0002, "loss": 0.564441442489624, "mean_token_accuracy": 0.7708708792924881, "num_tokens": 7171927.0, "step": 440 }, { "entropy": 0.5702053755521774, "epoch": 1.6467289719626168, "grad_norm": 0.03468736633658409, "learning_rate": 0.0002, "loss": 0.5645827651023865, "mean_token_accuracy": 0.768431767821312, "num_tokens": 7188341.0, "step": 441 }, { "entropy": 0.5505633056163788, "epoch": 1.6504672897196262, "grad_norm": 0.03153201565146446, "learning_rate": 0.0002, "loss": 0.5395671725273132, "mean_token_accuracy": 0.7812985777854919, "num_tokens": 7204527.0, "step": 442 }, { "entropy": 0.5565541088581085, "epoch": 1.6542056074766354, "grad_norm": 0.033020708709955215, "learning_rate": 0.0002, "loss": 0.557956874370575, "mean_token_accuracy": 0.7709688693284988, "num_tokens": 7220831.0, "step": 443 }, { "entropy": 0.5384746044874191, "epoch": 1.6579439252336448, "grad_norm": 0.0418318547308445, "learning_rate": 0.0002, "loss": 0.5513378977775574, "mean_token_accuracy": 0.7791547626256943, "num_tokens": 7236949.0, "step": 444 }, { "entropy": 0.5353372693061829, "epoch": 1.6616822429906541, "grad_norm": 0.03820660710334778, "learning_rate": 0.0002, "loss": 0.5490580201148987, "mean_token_accuracy": 0.7749721854925156, "num_tokens": 7253242.0, "step": 445 }, { "entropy": 0.5484792143106461, "epoch": 1.6654205607476635, "grad_norm": 0.03215263411402702, "learning_rate": 0.0002, "loss": 0.5497522354125977, "mean_token_accuracy": 0.7769928872585297, "num_tokens": 7269457.0, "step": 446 }, { "entropy": 0.5664080828428268, "epoch": 1.669158878504673, "grad_norm": 0.02815551683306694, "learning_rate": 0.0002, "loss": 0.5563632249832153, "mean_token_accuracy": 0.7749156504869461, "num_tokens": 7285879.0, "step": 447 }, { "entropy": 0.5464235991239548, "epoch": 1.6728971962616823, "grad_norm": 0.02781211957335472, "learning_rate": 0.0002, "loss": 0.5405099391937256, "mean_token_accuracy": 0.781552255153656, "num_tokens": 7302263.0, "step": 448 }, { "entropy": 0.5339583903551102, "epoch": 1.6766355140186917, "grad_norm": 0.02980860136449337, "learning_rate": 0.0002, "loss": 0.5369037985801697, "mean_token_accuracy": 0.7814508825540543, "num_tokens": 7318270.0, "step": 449 }, { "entropy": 0.5407254248857498, "epoch": 1.680373831775701, "grad_norm": 0.03138496354222298, "learning_rate": 0.0002, "loss": 0.5460474491119385, "mean_token_accuracy": 0.7780201584100723, "num_tokens": 7334492.0, "step": 450 }, { "entropy": 0.5503694117069244, "epoch": 1.6841121495327103, "grad_norm": 0.033992450684309006, "learning_rate": 0.0002, "loss": 0.5556005239486694, "mean_token_accuracy": 0.7745715081691742, "num_tokens": 7350627.0, "step": 451 }, { "entropy": 0.5451936274766922, "epoch": 1.6878504672897197, "grad_norm": 0.03251323476433754, "learning_rate": 0.0002, "loss": 0.5443669557571411, "mean_token_accuracy": 0.7780810743570328, "num_tokens": 7367005.0, "step": 452 }, { "entropy": 0.5657957345247269, "epoch": 1.6915887850467288, "grad_norm": 0.034646324813365936, "learning_rate": 0.0002, "loss": 0.5615976452827454, "mean_token_accuracy": 0.7718859612941742, "num_tokens": 7383262.0, "step": 453 }, { "entropy": 0.5525887459516525, "epoch": 1.6953271028037382, "grad_norm": 0.04024709016084671, "learning_rate": 0.0002, "loss": 0.5542372465133667, "mean_token_accuracy": 0.7756317108869553, "num_tokens": 7399750.0, "step": 454 }, { "entropy": 0.5493184924125671, "epoch": 1.6990654205607476, "grad_norm": 0.030978472903370857, "learning_rate": 0.0002, "loss": 0.5475279688835144, "mean_token_accuracy": 0.7762274444103241, "num_tokens": 7415800.0, "step": 455 }, { "entropy": 0.5400003641843796, "epoch": 1.702803738317757, "grad_norm": 0.03376868739724159, "learning_rate": 0.0002, "loss": 0.5407789349555969, "mean_token_accuracy": 0.7818103283643723, "num_tokens": 7431961.0, "step": 456 }, { "entropy": 0.535884216427803, "epoch": 1.7065420560747664, "grad_norm": 0.031221890822052956, "learning_rate": 0.0002, "loss": 0.5440670847892761, "mean_token_accuracy": 0.7796338200569153, "num_tokens": 7448202.0, "step": 457 }, { "entropy": 0.5389861762523651, "epoch": 1.7102803738317758, "grad_norm": 0.035680338740348816, "learning_rate": 0.0002, "loss": 0.5449787974357605, "mean_token_accuracy": 0.7797497361898422, "num_tokens": 7464671.0, "step": 458 }, { "entropy": 0.5451969653367996, "epoch": 1.7140186915887852, "grad_norm": 0.03255719691514969, "learning_rate": 0.0002, "loss": 0.5538266897201538, "mean_token_accuracy": 0.776149570941925, "num_tokens": 7480992.0, "step": 459 }, { "entropy": 0.5643452405929565, "epoch": 1.7177570093457943, "grad_norm": 0.03378691151738167, "learning_rate": 0.0002, "loss": 0.5571281313896179, "mean_token_accuracy": 0.7731311619281769, "num_tokens": 7497232.0, "step": 460 }, { "entropy": 0.5346335917711258, "epoch": 1.7214953271028037, "grad_norm": 0.03035924583673477, "learning_rate": 0.0002, "loss": 0.5269172191619873, "mean_token_accuracy": 0.7836929112672806, "num_tokens": 7513644.0, "step": 461 }, { "entropy": 0.5628820955753326, "epoch": 1.7252336448598131, "grad_norm": 0.03539309278130531, "learning_rate": 0.0002, "loss": 0.5605576634407043, "mean_token_accuracy": 0.7706831097602844, "num_tokens": 7529830.0, "step": 462 }, { "entropy": 0.5182670503854752, "epoch": 1.7289719626168223, "grad_norm": 0.036859650164842606, "learning_rate": 0.0002, "loss": 0.5209002494812012, "mean_token_accuracy": 0.7879375368356705, "num_tokens": 7545846.0, "step": 463 }, { "entropy": 0.5474621504545212, "epoch": 1.7327102803738317, "grad_norm": 0.037796422839164734, "learning_rate": 0.0002, "loss": 0.5536765456199646, "mean_token_accuracy": 0.7753565907478333, "num_tokens": 7562267.0, "step": 464 }, { "entropy": 0.5636439174413681, "epoch": 1.736448598130841, "grad_norm": 0.037271831184625626, "learning_rate": 0.0002, "loss": 0.5606362223625183, "mean_token_accuracy": 0.7704486697912216, "num_tokens": 7578670.0, "step": 465 }, { "entropy": 0.5483116805553436, "epoch": 1.7401869158878505, "grad_norm": 0.031047314405441284, "learning_rate": 0.0002, "loss": 0.5489611029624939, "mean_token_accuracy": 0.7756731957197189, "num_tokens": 7595113.0, "step": 466 }, { "entropy": 0.5289314538240433, "epoch": 1.7439252336448599, "grad_norm": 0.035078927874565125, "learning_rate": 0.0002, "loss": 0.5344489216804504, "mean_token_accuracy": 0.7853281199932098, "num_tokens": 7611153.0, "step": 467 }, { "entropy": 0.541694313287735, "epoch": 1.7476635514018692, "grad_norm": 0.030235178768634796, "learning_rate": 0.0002, "loss": 0.5412616729736328, "mean_token_accuracy": 0.7781483829021454, "num_tokens": 7627712.0, "step": 468 }, { "entropy": 0.5554275363683701, "epoch": 1.7514018691588786, "grad_norm": 0.036943912506103516, "learning_rate": 0.0002, "loss": 0.5531514286994934, "mean_token_accuracy": 0.7756786197423935, "num_tokens": 7643922.0, "step": 469 }, { "entropy": 0.5472631007432938, "epoch": 1.7551401869158878, "grad_norm": 0.030970100313425064, "learning_rate": 0.0002, "loss": 0.5467809438705444, "mean_token_accuracy": 0.780939131975174, "num_tokens": 7660096.0, "step": 470 }, { "entropy": 0.525331124663353, "epoch": 1.7588785046728972, "grad_norm": 0.04763743281364441, "learning_rate": 0.0002, "loss": 0.5361969470977783, "mean_token_accuracy": 0.782649889588356, "num_tokens": 7676237.0, "step": 471 }, { "entropy": 0.5514428466558456, "epoch": 1.7626168224299066, "grad_norm": 0.02942316047847271, "learning_rate": 0.0002, "loss": 0.5563341975212097, "mean_token_accuracy": 0.773899495601654, "num_tokens": 7692848.0, "step": 472 }, { "entropy": 0.5428648442029953, "epoch": 1.7663551401869158, "grad_norm": 0.038572002202272415, "learning_rate": 0.0002, "loss": 0.5449008941650391, "mean_token_accuracy": 0.7810295820236206, "num_tokens": 7708895.0, "step": 473 }, { "entropy": 0.5526584386825562, "epoch": 1.7700934579439251, "grad_norm": 0.03303026407957077, "learning_rate": 0.0002, "loss": 0.5465356111526489, "mean_token_accuracy": 0.7774733603000641, "num_tokens": 7725206.0, "step": 474 }, { "entropy": 0.5638225227594376, "epoch": 1.7738317757009345, "grad_norm": 0.029633166268467903, "learning_rate": 0.0002, "loss": 0.5624324083328247, "mean_token_accuracy": 0.7697116434574127, "num_tokens": 7741838.0, "step": 475 }, { "entropy": 0.5561016201972961, "epoch": 1.777570093457944, "grad_norm": 0.0328570231795311, "learning_rate": 0.0002, "loss": 0.5563735961914062, "mean_token_accuracy": 0.7721449285745621, "num_tokens": 7758049.0, "step": 476 }, { "entropy": 0.5516675412654877, "epoch": 1.7813084112149533, "grad_norm": 0.03453238308429718, "learning_rate": 0.0002, "loss": 0.5518988370895386, "mean_token_accuracy": 0.7777107656002045, "num_tokens": 7774257.0, "step": 477 }, { "entropy": 0.5394668728113174, "epoch": 1.7850467289719627, "grad_norm": 0.03409087657928467, "learning_rate": 0.0002, "loss": 0.5432859659194946, "mean_token_accuracy": 0.7796248197555542, "num_tokens": 7790837.0, "step": 478 }, { "entropy": 0.5491889864206314, "epoch": 1.788785046728972, "grad_norm": 0.03139546513557434, "learning_rate": 0.0002, "loss": 0.5477681159973145, "mean_token_accuracy": 0.7775027453899384, "num_tokens": 7807302.0, "step": 479 }, { "entropy": 0.5528343021869659, "epoch": 1.7925233644859813, "grad_norm": 0.031248709186911583, "learning_rate": 0.0002, "loss": 0.5557167530059814, "mean_token_accuracy": 0.7744993418455124, "num_tokens": 7823635.0, "step": 480 }, { "entropy": 0.5458249896764755, "epoch": 1.7962616822429907, "grad_norm": 0.03402215987443924, "learning_rate": 0.0002, "loss": 0.5505017042160034, "mean_token_accuracy": 0.7759317308664322, "num_tokens": 7839914.0, "step": 481 }, { "entropy": 0.552555724978447, "epoch": 1.8, "grad_norm": 0.030951669439673424, "learning_rate": 0.0002, "loss": 0.560877799987793, "mean_token_accuracy": 0.77203568816185, "num_tokens": 7856194.0, "step": 482 }, { "entropy": 0.5391200333833694, "epoch": 1.8037383177570092, "grad_norm": 0.04003436490893364, "learning_rate": 0.0002, "loss": 0.5390163660049438, "mean_token_accuracy": 0.7827838510274887, "num_tokens": 7872434.0, "step": 483 }, { "entropy": 0.5392342656850815, "epoch": 1.8074766355140186, "grad_norm": 0.03150493651628494, "learning_rate": 0.0002, "loss": 0.5406180620193481, "mean_token_accuracy": 0.7828439474105835, "num_tokens": 7888751.0, "step": 484 }, { "entropy": 0.5622579157352448, "epoch": 1.811214953271028, "grad_norm": 0.03376127406954765, "learning_rate": 0.0002, "loss": 0.5644164681434631, "mean_token_accuracy": 0.7707268595695496, "num_tokens": 7905072.0, "step": 485 }, { "entropy": 0.5327235907316208, "epoch": 1.8149532710280374, "grad_norm": 0.028277890756726265, "learning_rate": 0.0002, "loss": 0.5303685069084167, "mean_token_accuracy": 0.7862435132265091, "num_tokens": 7921459.0, "step": 486 }, { "entropy": 0.5588890165090561, "epoch": 1.8186915887850468, "grad_norm": 0.03095029853284359, "learning_rate": 0.0002, "loss": 0.5525569915771484, "mean_token_accuracy": 0.7770346105098724, "num_tokens": 7937961.0, "step": 487 }, { "entropy": 0.5573548376560211, "epoch": 1.8224299065420562, "grad_norm": 0.03045843541622162, "learning_rate": 0.0002, "loss": 0.5535331964492798, "mean_token_accuracy": 0.7766827940940857, "num_tokens": 7954609.0, "step": 488 }, { "entropy": 0.5567604452371597, "epoch": 1.8261682242990656, "grad_norm": 0.029482809826731682, "learning_rate": 0.0002, "loss": 0.5576134324073792, "mean_token_accuracy": 0.772316038608551, "num_tokens": 7971097.0, "step": 489 }, { "entropy": 0.5545413047075272, "epoch": 1.8299065420560747, "grad_norm": 0.03891676291823387, "learning_rate": 0.0002, "loss": 0.5648533701896667, "mean_token_accuracy": 0.7718105167150497, "num_tokens": 7987377.0, "step": 490 }, { "entropy": 0.5786599218845367, "epoch": 1.8336448598130841, "grad_norm": 0.030758248642086983, "learning_rate": 0.0002, "loss": 0.5835361480712891, "mean_token_accuracy": 0.762917771935463, "num_tokens": 8003799.0, "step": 491 }, { "entropy": 0.5397150218486786, "epoch": 1.8373831775700935, "grad_norm": 0.03965795785188675, "learning_rate": 0.0002, "loss": 0.538779616355896, "mean_token_accuracy": 0.7839108556509018, "num_tokens": 8020279.0, "step": 492 }, { "entropy": 0.5535183995962143, "epoch": 1.8411214953271027, "grad_norm": 0.03004513867199421, "learning_rate": 0.0002, "loss": 0.5507811903953552, "mean_token_accuracy": 0.7755124121904373, "num_tokens": 8036491.0, "step": 493 }, { "entropy": 0.5442592799663544, "epoch": 1.844859813084112, "grad_norm": 0.03522132337093353, "learning_rate": 0.0002, "loss": 0.5478004217147827, "mean_token_accuracy": 0.7766154408454895, "num_tokens": 8052807.0, "step": 494 }, { "entropy": 0.5266854241490364, "epoch": 1.8485981308411215, "grad_norm": 0.030206192284822464, "learning_rate": 0.0002, "loss": 0.529688835144043, "mean_token_accuracy": 0.7819836139678955, "num_tokens": 8068712.0, "step": 495 }, { "entropy": 0.5283671095967293, "epoch": 1.8523364485981308, "grad_norm": 0.03329138457775116, "learning_rate": 0.0002, "loss": 0.5376101136207581, "mean_token_accuracy": 0.7793748378753662, "num_tokens": 8085084.0, "step": 496 }, { "entropy": 0.5712718665599823, "epoch": 1.8560747663551402, "grad_norm": 0.0325874425470829, "learning_rate": 0.0002, "loss": 0.5709162950515747, "mean_token_accuracy": 0.7662056684494019, "num_tokens": 8101731.0, "step": 497 }, { "entropy": 0.5663121491670609, "epoch": 1.8598130841121496, "grad_norm": 0.03357568010687828, "learning_rate": 0.0002, "loss": 0.5650657415390015, "mean_token_accuracy": 0.7691219747066498, "num_tokens": 8118244.0, "step": 498 }, { "entropy": 0.5427432358264923, "epoch": 1.863551401869159, "grad_norm": 0.03203551098704338, "learning_rate": 0.0002, "loss": 0.5398803949356079, "mean_token_accuracy": 0.7808598130941391, "num_tokens": 8134657.0, "step": 499 }, { "entropy": 0.5573120266199112, "epoch": 1.8672897196261682, "grad_norm": 0.029932986944913864, "learning_rate": 0.0002, "loss": 0.5522656440734863, "mean_token_accuracy": 0.7727643102407455, "num_tokens": 8151058.0, "step": 500 }, { "entropy": 0.5573428720235825, "epoch": 1.8710280373831776, "grad_norm": 0.02661440148949623, "learning_rate": 0.0002, "loss": 0.5512294173240662, "mean_token_accuracy": 0.7765780538320541, "num_tokens": 8167736.0, "step": 501 }, { "entropy": 0.5472890585660934, "epoch": 1.874766355140187, "grad_norm": 0.028882022947072983, "learning_rate": 0.0002, "loss": 0.5479044318199158, "mean_token_accuracy": 0.777178093791008, "num_tokens": 8183857.0, "step": 502 }, { "entropy": 0.5511818528175354, "epoch": 1.8785046728971961, "grad_norm": 0.032389186322689056, "learning_rate": 0.0002, "loss": 0.5552236437797546, "mean_token_accuracy": 0.7762337774038315, "num_tokens": 8199955.0, "step": 503 }, { "entropy": 0.546854555606842, "epoch": 1.8822429906542055, "grad_norm": 0.0336172878742218, "learning_rate": 0.0002, "loss": 0.55290687084198, "mean_token_accuracy": 0.7735693603754044, "num_tokens": 8216221.0, "step": 504 }, { "entropy": 0.5447833836078644, "epoch": 1.885981308411215, "grad_norm": 0.0326668806374073, "learning_rate": 0.0002, "loss": 0.5433166027069092, "mean_token_accuracy": 0.7759248912334442, "num_tokens": 8232519.0, "step": 505 }, { "entropy": 0.5311590135097504, "epoch": 1.8897196261682243, "grad_norm": 0.0328470915555954, "learning_rate": 0.0002, "loss": 0.5332115888595581, "mean_token_accuracy": 0.7827264666557312, "num_tokens": 8248973.0, "step": 506 }, { "entropy": 0.5405398160219193, "epoch": 1.8934579439252337, "grad_norm": 0.03319946303963661, "learning_rate": 0.0002, "loss": 0.5498695969581604, "mean_token_accuracy": 0.7756136506795883, "num_tokens": 8265054.0, "step": 507 }, { "entropy": 0.5590761750936508, "epoch": 1.897196261682243, "grad_norm": 0.03323895111680031, "learning_rate": 0.0002, "loss": 0.5674346685409546, "mean_token_accuracy": 0.7680935710668564, "num_tokens": 8281659.0, "step": 508 }, { "entropy": 0.5502993315458298, "epoch": 1.9009345794392525, "grad_norm": 0.036393504589796066, "learning_rate": 0.0002, "loss": 0.5518926382064819, "mean_token_accuracy": 0.7772549986839294, "num_tokens": 8298120.0, "step": 509 }, { "entropy": 0.5434653609991074, "epoch": 1.9046728971962616, "grad_norm": 0.030826875939965248, "learning_rate": 0.0002, "loss": 0.5373662710189819, "mean_token_accuracy": 0.7814789414405823, "num_tokens": 8314165.0, "step": 510 }, { "entropy": 0.5616354942321777, "epoch": 1.908411214953271, "grad_norm": 0.03320663422346115, "learning_rate": 0.0002, "loss": 0.5573338866233826, "mean_token_accuracy": 0.7744273245334625, "num_tokens": 8330561.0, "step": 511 }, { "entropy": 0.5629893988370895, "epoch": 1.9121495327102802, "grad_norm": 0.03727097064256668, "learning_rate": 0.0002, "loss": 0.5611152648925781, "mean_token_accuracy": 0.773328885436058, "num_tokens": 8346708.0, "step": 512 }, { "entropy": 0.5592319965362549, "epoch": 1.9158878504672896, "grad_norm": 0.03037538379430771, "learning_rate": 0.0002, "loss": 0.5616269111633301, "mean_token_accuracy": 0.7723426669836044, "num_tokens": 8362957.0, "step": 513 }, { "entropy": 0.549030601978302, "epoch": 1.919626168224299, "grad_norm": 0.03563016280531883, "learning_rate": 0.0002, "loss": 0.5529686808586121, "mean_token_accuracy": 0.7743269205093384, "num_tokens": 8379387.0, "step": 514 }, { "entropy": 0.5441324412822723, "epoch": 1.9233644859813084, "grad_norm": 0.031737376004457474, "learning_rate": 0.0002, "loss": 0.5500344038009644, "mean_token_accuracy": 0.7763906866312027, "num_tokens": 8395747.0, "step": 515 }, { "entropy": 0.5507270097732544, "epoch": 1.9271028037383178, "grad_norm": 0.03285627067089081, "learning_rate": 0.0002, "loss": 0.5587583780288696, "mean_token_accuracy": 0.7742376923561096, "num_tokens": 8412181.0, "step": 516 }, { "entropy": 0.5456591248512268, "epoch": 1.9308411214953272, "grad_norm": 0.03147684410214424, "learning_rate": 0.0002, "loss": 0.5484343767166138, "mean_token_accuracy": 0.7780278623104095, "num_tokens": 8428664.0, "step": 517 }, { "entropy": 0.5484454035758972, "epoch": 1.9345794392523366, "grad_norm": 0.036278773099184036, "learning_rate": 0.0002, "loss": 0.5547294616699219, "mean_token_accuracy": 0.7715467214584351, "num_tokens": 8444942.0, "step": 518 }, { "entropy": 0.5404845178127289, "epoch": 1.938317757009346, "grad_norm": 0.032282162457704544, "learning_rate": 0.0002, "loss": 0.5409780740737915, "mean_token_accuracy": 0.779809907078743, "num_tokens": 8461035.0, "step": 519 }, { "entropy": 0.5527531504631042, "epoch": 1.9420560747663551, "grad_norm": 0.03141535073518753, "learning_rate": 0.0002, "loss": 0.5439110994338989, "mean_token_accuracy": 0.7789405584335327, "num_tokens": 8477333.0, "step": 520 }, { "entropy": 0.5531508475542068, "epoch": 1.9457943925233645, "grad_norm": 0.032372504472732544, "learning_rate": 0.0002, "loss": 0.5456727147102356, "mean_token_accuracy": 0.7779283076524734, "num_tokens": 8493646.0, "step": 521 }, { "entropy": 0.558539628982544, "epoch": 1.9495327102803737, "grad_norm": 0.03805968537926674, "learning_rate": 0.0002, "loss": 0.5575815439224243, "mean_token_accuracy": 0.7742009460926056, "num_tokens": 8510069.0, "step": 522 }, { "entropy": 0.5543098747730255, "epoch": 1.953271028037383, "grad_norm": 0.03495538979768753, "learning_rate": 0.0002, "loss": 0.558309018611908, "mean_token_accuracy": 0.7752062678337097, "num_tokens": 8526687.0, "step": 523 }, { "entropy": 0.5394291281700134, "epoch": 1.9570093457943925, "grad_norm": 0.060034435242414474, "learning_rate": 0.0002, "loss": 0.5499407649040222, "mean_token_accuracy": 0.7747859209775925, "num_tokens": 8543194.0, "step": 524 }, { "entropy": 0.5493269860744476, "epoch": 1.9607476635514018, "grad_norm": 0.03242463245987892, "learning_rate": 0.0002, "loss": 0.5581871271133423, "mean_token_accuracy": 0.7717736065387726, "num_tokens": 8559635.0, "step": 525 }, { "entropy": 0.5709338933229446, "epoch": 1.9644859813084112, "grad_norm": 0.09612691402435303, "learning_rate": 0.0002, "loss": 0.5857856273651123, "mean_token_accuracy": 0.7716985046863556, "num_tokens": 8575682.0, "step": 526 }, { "entropy": 0.5535681843757629, "epoch": 1.9682242990654206, "grad_norm": 0.03228386864066124, "learning_rate": 0.0002, "loss": 0.5427148342132568, "mean_token_accuracy": 0.7775698453187943, "num_tokens": 8591993.0, "step": 527 }, { "entropy": 0.5595205128192902, "epoch": 1.97196261682243, "grad_norm": 0.05833456665277481, "learning_rate": 0.0002, "loss": 0.5632327795028687, "mean_token_accuracy": 0.7714700251817703, "num_tokens": 8608390.0, "step": 528 }, { "entropy": 0.5412962287664413, "epoch": 1.9757009345794394, "grad_norm": 0.04238782078027725, "learning_rate": 0.0002, "loss": 0.5416378378868103, "mean_token_accuracy": 0.7781312763690948, "num_tokens": 8624553.0, "step": 529 }, { "entropy": 0.5466502904891968, "epoch": 1.9794392523364486, "grad_norm": 0.038432635366916656, "learning_rate": 0.0002, "loss": 0.5546246767044067, "mean_token_accuracy": 0.7747474908828735, "num_tokens": 8640859.0, "step": 530 }, { "entropy": 0.5358689278364182, "epoch": 1.983177570093458, "grad_norm": 0.03189871460199356, "learning_rate": 0.0002, "loss": 0.5437639355659485, "mean_token_accuracy": 0.7790816277265549, "num_tokens": 8657164.0, "step": 531 }, { "entropy": 0.5428293794393539, "epoch": 1.9869158878504671, "grad_norm": 0.031927406787872314, "learning_rate": 0.0002, "loss": 0.5386630892753601, "mean_token_accuracy": 0.7813318967819214, "num_tokens": 8673653.0, "step": 532 }, { "entropy": 0.5520585179328918, "epoch": 1.9906542056074765, "grad_norm": 0.036430567502975464, "learning_rate": 0.0002, "loss": 0.5499424338340759, "mean_token_accuracy": 0.7754785418510437, "num_tokens": 8689872.0, "step": 533 }, { "entropy": 0.5492618307471275, "epoch": 1.994392523364486, "grad_norm": 0.03422766923904419, "learning_rate": 0.0002, "loss": 0.5523169040679932, "mean_token_accuracy": 0.7751457393169403, "num_tokens": 8706316.0, "step": 534 }, { "entropy": 0.5318035036325455, "epoch": 1.9981308411214953, "grad_norm": 0.029748188331723213, "learning_rate": 0.0002, "loss": 0.5351126790046692, "mean_token_accuracy": 0.7828892469406128, "num_tokens": 8722797.0, "step": 535 }, { "entropy": 0.5385511517524719, "epoch": 2.0, "grad_norm": 0.05353870987892151, "learning_rate": 0.0002, "loss": 0.5426778197288513, "mean_token_accuracy": 0.7800059914588928, "num_tokens": 8729632.0, "step": 536 }, { "entropy": 0.5657109320163727, "epoch": 2.0037383177570094, "grad_norm": 0.03845514729619026, "learning_rate": 0.0002, "loss": 0.5532518029212952, "mean_token_accuracy": 0.7752802222967148, "num_tokens": 8746094.0, "step": 537 }, { "entropy": 0.5414439141750336, "epoch": 2.007476635514019, "grad_norm": 0.030992809683084488, "learning_rate": 0.0002, "loss": 0.5374770164489746, "mean_token_accuracy": 0.7807809114456177, "num_tokens": 8762553.0, "step": 538 }, { "entropy": 0.520616352558136, "epoch": 2.011214953271028, "grad_norm": 0.03543594852089882, "learning_rate": 0.0002, "loss": 0.5239126086235046, "mean_token_accuracy": 0.7860341370105743, "num_tokens": 8778649.0, "step": 539 }, { "entropy": 0.5175309702754021, "epoch": 2.0149532710280376, "grad_norm": 0.03473593294620514, "learning_rate": 0.0002, "loss": 0.5261198282241821, "mean_token_accuracy": 0.7881468534469604, "num_tokens": 8794905.0, "step": 540 }, { "entropy": 0.5151422992348671, "epoch": 2.0186915887850465, "grad_norm": 0.038792964071035385, "learning_rate": 0.0002, "loss": 0.5288342833518982, "mean_token_accuracy": 0.7841326147317886, "num_tokens": 8811277.0, "step": 541 }, { "entropy": 0.5424266159534454, "epoch": 2.022429906542056, "grad_norm": 0.03833077475428581, "learning_rate": 0.0002, "loss": 0.5454620718955994, "mean_token_accuracy": 0.7795733213424683, "num_tokens": 8827670.0, "step": 542 }, { "entropy": 0.533804714679718, "epoch": 2.0261682242990653, "grad_norm": 0.03583015128970146, "learning_rate": 0.0002, "loss": 0.5267578959465027, "mean_token_accuracy": 0.7867784053087234, "num_tokens": 8843733.0, "step": 543 }, { "entropy": 0.5466929823160172, "epoch": 2.0299065420560747, "grad_norm": 0.03870777040719986, "learning_rate": 0.0002, "loss": 0.5435931086540222, "mean_token_accuracy": 0.7770351320505142, "num_tokens": 8860036.0, "step": 544 }, { "entropy": 0.5408391207456589, "epoch": 2.033644859813084, "grad_norm": 0.03353007137775421, "learning_rate": 0.0002, "loss": 0.5323169827461243, "mean_token_accuracy": 0.7834465950727463, "num_tokens": 8876470.0, "step": 545 }, { "entropy": 0.5217868834733963, "epoch": 2.0373831775700935, "grad_norm": 0.036939021199941635, "learning_rate": 0.0002, "loss": 0.5216724276542664, "mean_token_accuracy": 0.7884602099657059, "num_tokens": 8892628.0, "step": 546 }, { "entropy": 0.5368964821100235, "epoch": 2.041121495327103, "grad_norm": 0.043159015476703644, "learning_rate": 0.0002, "loss": 0.5444939136505127, "mean_token_accuracy": 0.778968021273613, "num_tokens": 8909028.0, "step": 547 }, { "entropy": 0.5433569848537445, "epoch": 2.0448598130841122, "grad_norm": 0.03701786324381828, "learning_rate": 0.0002, "loss": 0.5439752340316772, "mean_token_accuracy": 0.7791613191366196, "num_tokens": 8925310.0, "step": 548 }, { "entropy": 0.5270129442214966, "epoch": 2.0485981308411216, "grad_norm": 0.04250190034508705, "learning_rate": 0.0002, "loss": 0.5210642218589783, "mean_token_accuracy": 0.7867415547370911, "num_tokens": 8941225.0, "step": 549 }, { "entropy": 0.5519801378250122, "epoch": 2.052336448598131, "grad_norm": 0.03549535945057869, "learning_rate": 0.0002, "loss": 0.550297200679779, "mean_token_accuracy": 0.7756542861461639, "num_tokens": 8957662.0, "step": 550 }, { "entropy": 0.5188534706830978, "epoch": 2.05607476635514, "grad_norm": 0.03532535210251808, "learning_rate": 0.0002, "loss": 0.5225726962089539, "mean_token_accuracy": 0.7875347584486008, "num_tokens": 8973986.0, "step": 551 }, { "entropy": 0.5331487953662872, "epoch": 2.0598130841121494, "grad_norm": 0.0401851125061512, "learning_rate": 0.0002, "loss": 0.5345657467842102, "mean_token_accuracy": 0.7807552814483643, "num_tokens": 8990453.0, "step": 552 }, { "entropy": 0.5301813259720802, "epoch": 2.0635514018691588, "grad_norm": 0.04093443974852562, "learning_rate": 0.0002, "loss": 0.536128580570221, "mean_token_accuracy": 0.781855434179306, "num_tokens": 9006810.0, "step": 553 }, { "entropy": 0.5511504411697388, "epoch": 2.067289719626168, "grad_norm": 0.04108293727040291, "learning_rate": 0.0002, "loss": 0.547398567199707, "mean_token_accuracy": 0.7787968963384628, "num_tokens": 9023044.0, "step": 554 }, { "entropy": 0.5452945232391357, "epoch": 2.0710280373831775, "grad_norm": 0.04133358225226402, "learning_rate": 0.0002, "loss": 0.5406649112701416, "mean_token_accuracy": 0.7804151326417923, "num_tokens": 9039300.0, "step": 555 }, { "entropy": 0.5133676081895828, "epoch": 2.074766355140187, "grad_norm": 0.0368187241256237, "learning_rate": 0.0002, "loss": 0.510840892791748, "mean_token_accuracy": 0.7948838770389557, "num_tokens": 9055408.0, "step": 556 }, { "entropy": 0.5286162942647934, "epoch": 2.0785046728971963, "grad_norm": 0.037287503480911255, "learning_rate": 0.0002, "loss": 0.5286219120025635, "mean_token_accuracy": 0.7867581397294998, "num_tokens": 9071847.0, "step": 557 }, { "entropy": 0.5187130272388458, "epoch": 2.0822429906542057, "grad_norm": 0.03932078555226326, "learning_rate": 0.0002, "loss": 0.5252044200897217, "mean_token_accuracy": 0.788768544793129, "num_tokens": 9088062.0, "step": 558 }, { "entropy": 0.5239534676074982, "epoch": 2.085981308411215, "grad_norm": 0.04231242835521698, "learning_rate": 0.0002, "loss": 0.535202145576477, "mean_token_accuracy": 0.7852179259061813, "num_tokens": 9104468.0, "step": 559 }, { "entropy": 0.528278037905693, "epoch": 2.0897196261682245, "grad_norm": 0.03444297984242439, "learning_rate": 0.0002, "loss": 0.5238081812858582, "mean_token_accuracy": 0.7863867878913879, "num_tokens": 9120622.0, "step": 560 }, { "entropy": 0.5545478612184525, "epoch": 2.0934579439252334, "grad_norm": 0.04182487353682518, "learning_rate": 0.0002, "loss": 0.5527917742729187, "mean_token_accuracy": 0.7766451835632324, "num_tokens": 9137031.0, "step": 561 }, { "entropy": 0.521744892001152, "epoch": 2.097196261682243, "grad_norm": 0.03438956290483475, "learning_rate": 0.0002, "loss": 0.5255383849143982, "mean_token_accuracy": 0.7855681478977203, "num_tokens": 9153374.0, "step": 562 }, { "entropy": 0.5317307189106941, "epoch": 2.100934579439252, "grad_norm": 0.04259387031197548, "learning_rate": 0.0002, "loss": 0.530976414680481, "mean_token_accuracy": 0.7861284911632538, "num_tokens": 9169379.0, "step": 563 }, { "entropy": 0.5382358431816101, "epoch": 2.1046728971962616, "grad_norm": 0.03778582066297531, "learning_rate": 0.0002, "loss": 0.5446516871452332, "mean_token_accuracy": 0.7786799967288971, "num_tokens": 9185673.0, "step": 564 }, { "entropy": 0.5174337849020958, "epoch": 2.108411214953271, "grad_norm": 0.03816930949687958, "learning_rate": 0.0002, "loss": 0.5179592967033386, "mean_token_accuracy": 0.7912393063306808, "num_tokens": 9201995.0, "step": 565 }, { "entropy": 0.5279374569654465, "epoch": 2.1121495327102804, "grad_norm": 0.038216955959796906, "learning_rate": 0.0002, "loss": 0.5243582129478455, "mean_token_accuracy": 0.7866894006729126, "num_tokens": 9218133.0, "step": 566 }, { "entropy": 0.5245715379714966, "epoch": 2.1158878504672898, "grad_norm": 0.03613874316215515, "learning_rate": 0.0002, "loss": 0.5249512791633606, "mean_token_accuracy": 0.7851840853691101, "num_tokens": 9234342.0, "step": 567 }, { "entropy": 0.5192612558603287, "epoch": 2.119626168224299, "grad_norm": 0.04042578116059303, "learning_rate": 0.0002, "loss": 0.5259383320808411, "mean_token_accuracy": 0.7858112007379532, "num_tokens": 9250696.0, "step": 568 }, { "entropy": 0.5262997299432755, "epoch": 2.1233644859813086, "grad_norm": 0.04460779204964638, "learning_rate": 0.0002, "loss": 0.5308440923690796, "mean_token_accuracy": 0.7877162247896194, "num_tokens": 9266979.0, "step": 569 }, { "entropy": 0.5224001705646515, "epoch": 2.127102803738318, "grad_norm": 0.03817397728562355, "learning_rate": 0.0002, "loss": 0.5229726433753967, "mean_token_accuracy": 0.7861741036176682, "num_tokens": 9283280.0, "step": 570 }, { "entropy": 0.5274494737386703, "epoch": 2.130841121495327, "grad_norm": 0.04161069914698601, "learning_rate": 0.0002, "loss": 0.5270024538040161, "mean_token_accuracy": 0.7860408127307892, "num_tokens": 9299630.0, "step": 571 }, { "entropy": 0.5552078932523727, "epoch": 2.1345794392523363, "grad_norm": 0.04526656121015549, "learning_rate": 0.0002, "loss": 0.547661542892456, "mean_token_accuracy": 0.77776238322258, "num_tokens": 9316114.0, "step": 572 }, { "entropy": 0.5352555364370346, "epoch": 2.1383177570093457, "grad_norm": 0.037117403000593185, "learning_rate": 0.0002, "loss": 0.5322074294090271, "mean_token_accuracy": 0.7845579087734222, "num_tokens": 9332486.0, "step": 573 }, { "entropy": 0.5299685597419739, "epoch": 2.142056074766355, "grad_norm": 0.04335174337029457, "learning_rate": 0.0002, "loss": 0.5333051085472107, "mean_token_accuracy": 0.7831422835588455, "num_tokens": 9348999.0, "step": 574 }, { "entropy": 0.5251427963376045, "epoch": 2.1457943925233645, "grad_norm": 0.04729305952787399, "learning_rate": 0.0002, "loss": 0.5304993987083435, "mean_token_accuracy": 0.7857193797826767, "num_tokens": 9365291.0, "step": 575 }, { "entropy": 0.5248839110136032, "epoch": 2.149532710280374, "grad_norm": 0.04293828830122948, "learning_rate": 0.0002, "loss": 0.5300874710083008, "mean_token_accuracy": 0.784340038895607, "num_tokens": 9381734.0, "step": 576 }, { "entropy": 0.5214874297380447, "epoch": 2.1532710280373832, "grad_norm": 0.04350607469677925, "learning_rate": 0.0002, "loss": 0.5177597403526306, "mean_token_accuracy": 0.7909844070672989, "num_tokens": 9397955.0, "step": 577 }, { "entropy": 0.5421570688486099, "epoch": 2.1570093457943926, "grad_norm": 0.042496006935834885, "learning_rate": 0.0002, "loss": 0.5425592660903931, "mean_token_accuracy": 0.7795795798301697, "num_tokens": 9414143.0, "step": 578 }, { "entropy": 0.535075232386589, "epoch": 2.160747663551402, "grad_norm": 0.049906548112630844, "learning_rate": 0.0002, "loss": 0.5370741486549377, "mean_token_accuracy": 0.7806216180324554, "num_tokens": 9430295.0, "step": 579 }, { "entropy": 0.535729855298996, "epoch": 2.1644859813084114, "grad_norm": 0.04840796813368797, "learning_rate": 0.0002, "loss": 0.5347393155097961, "mean_token_accuracy": 0.7850737869739532, "num_tokens": 9446633.0, "step": 580 }, { "entropy": 0.5312991067767143, "epoch": 2.1682242990654204, "grad_norm": 0.04839569702744484, "learning_rate": 0.0002, "loss": 0.5378549098968506, "mean_token_accuracy": 0.7815908044576645, "num_tokens": 9462924.0, "step": 581 }, { "entropy": 0.5284993052482605, "epoch": 2.1719626168224297, "grad_norm": 0.04563288018107414, "learning_rate": 0.0002, "loss": 0.5385716557502747, "mean_token_accuracy": 0.7814656347036362, "num_tokens": 9479222.0, "step": 582 }, { "entropy": 0.535816490650177, "epoch": 2.175700934579439, "grad_norm": 0.05489310622215271, "learning_rate": 0.0002, "loss": 0.5382475256919861, "mean_token_accuracy": 0.7812406271696091, "num_tokens": 9495589.0, "step": 583 }, { "entropy": 0.549729734659195, "epoch": 2.1794392523364485, "grad_norm": 0.0424075648188591, "learning_rate": 0.0002, "loss": 0.539716899394989, "mean_token_accuracy": 0.7819323092699051, "num_tokens": 9511725.0, "step": 584 }, { "entropy": 0.5317162126302719, "epoch": 2.183177570093458, "grad_norm": 0.03563420847058296, "learning_rate": 0.0002, "loss": 0.5235797166824341, "mean_token_accuracy": 0.7905198931694031, "num_tokens": 9527971.0, "step": 585 }, { "entropy": 0.5211209952831268, "epoch": 2.1869158878504673, "grad_norm": 0.048658616840839386, "learning_rate": 0.0002, "loss": 0.5268206000328064, "mean_token_accuracy": 0.7845446020364761, "num_tokens": 9544253.0, "step": 586 }, { "entropy": 0.5116122514009476, "epoch": 2.1906542056074767, "grad_norm": 0.04198598116636276, "learning_rate": 0.0002, "loss": 0.5190539360046387, "mean_token_accuracy": 0.7874016612768173, "num_tokens": 9560518.0, "step": 587 }, { "entropy": 0.5246260613203049, "epoch": 2.194392523364486, "grad_norm": 0.03876075521111488, "learning_rate": 0.0002, "loss": 0.5228715538978577, "mean_token_accuracy": 0.7850266695022583, "num_tokens": 9576775.0, "step": 588 }, { "entropy": 0.5278798937797546, "epoch": 2.1981308411214955, "grad_norm": 0.04761234670877457, "learning_rate": 0.0002, "loss": 0.5265949964523315, "mean_token_accuracy": 0.7893748730421066, "num_tokens": 9593040.0, "step": 589 }, { "entropy": 0.548830658197403, "epoch": 2.201869158878505, "grad_norm": 0.04078621417284012, "learning_rate": 0.0002, "loss": 0.5517237186431885, "mean_token_accuracy": 0.778541699051857, "num_tokens": 9609499.0, "step": 590 }, { "entropy": 0.5336392223834991, "epoch": 2.205607476635514, "grad_norm": 0.04143911972641945, "learning_rate": 0.0002, "loss": 0.5296382308006287, "mean_token_accuracy": 0.7824793457984924, "num_tokens": 9625911.0, "step": 591 }, { "entropy": 0.5379772335290909, "epoch": 2.209345794392523, "grad_norm": 0.03608503192663193, "learning_rate": 0.0002, "loss": 0.5343111753463745, "mean_token_accuracy": 0.7822979539632797, "num_tokens": 9642395.0, "step": 592 }, { "entropy": 0.5172793120145798, "epoch": 2.2130841121495326, "grad_norm": 0.034696269780397415, "learning_rate": 0.0002, "loss": 0.5195714235305786, "mean_token_accuracy": 0.7902600318193436, "num_tokens": 9658662.0, "step": 593 }, { "entropy": 0.5202511548995972, "epoch": 2.216822429906542, "grad_norm": 0.0416097529232502, "learning_rate": 0.0002, "loss": 0.5290377736091614, "mean_token_accuracy": 0.7843390554189682, "num_tokens": 9674880.0, "step": 594 }, { "entropy": 0.5413576662540436, "epoch": 2.2205607476635514, "grad_norm": 0.0419846810400486, "learning_rate": 0.0002, "loss": 0.5517836809158325, "mean_token_accuracy": 0.7757999449968338, "num_tokens": 9691443.0, "step": 595 }, { "entropy": 0.5511815398931503, "epoch": 2.2242990654205608, "grad_norm": 0.042311880737543106, "learning_rate": 0.0002, "loss": 0.5441216230392456, "mean_token_accuracy": 0.7797399759292603, "num_tokens": 9707667.0, "step": 596 }, { "entropy": 0.5390328615903854, "epoch": 2.22803738317757, "grad_norm": 0.04130427911877632, "learning_rate": 0.0002, "loss": 0.5381530523300171, "mean_token_accuracy": 0.7850432395935059, "num_tokens": 9723670.0, "step": 597 }, { "entropy": 0.5145308524370193, "epoch": 2.2317757009345796, "grad_norm": 0.04054151102900505, "learning_rate": 0.0002, "loss": 0.5153539776802063, "mean_token_accuracy": 0.7911680340766907, "num_tokens": 9740111.0, "step": 598 }, { "entropy": 0.5264055132865906, "epoch": 2.235514018691589, "grad_norm": 0.04768845811486244, "learning_rate": 0.0002, "loss": 0.5321245193481445, "mean_token_accuracy": 0.7862783521413803, "num_tokens": 9756445.0, "step": 599 }, { "entropy": 0.5161085873842239, "epoch": 2.2392523364485983, "grad_norm": 0.047890279442071915, "learning_rate": 0.0002, "loss": 0.5329167246818542, "mean_token_accuracy": 0.7836614698171616, "num_tokens": 9772513.0, "step": 600 }, { "entropy": 0.5542461574077606, "epoch": 2.2429906542056073, "grad_norm": 0.04093446582555771, "learning_rate": 0.0002, "loss": 0.555320680141449, "mean_token_accuracy": 0.7749381363391876, "num_tokens": 9789085.0, "step": 601 }, { "entropy": 0.5521011054515839, "epoch": 2.2467289719626167, "grad_norm": 0.0422159768640995, "learning_rate": 0.0002, "loss": 0.5415031313896179, "mean_token_accuracy": 0.7801210135221481, "num_tokens": 9805542.0, "step": 602 }, { "entropy": 0.5508425533771515, "epoch": 2.250467289719626, "grad_norm": 0.04688411206007004, "learning_rate": 0.0002, "loss": 0.5387436151504517, "mean_token_accuracy": 0.7821325659751892, "num_tokens": 9821923.0, "step": 603 }, { "entropy": 0.5507242232561111, "epoch": 2.2542056074766355, "grad_norm": 0.035407017916440964, "learning_rate": 0.0002, "loss": 0.5444649457931519, "mean_token_accuracy": 0.7809951901435852, "num_tokens": 9838298.0, "step": 604 }, { "entropy": 0.5216517895460129, "epoch": 2.257943925233645, "grad_norm": 0.041920073330402374, "learning_rate": 0.0002, "loss": 0.5264837741851807, "mean_token_accuracy": 0.7897377163171768, "num_tokens": 9854659.0, "step": 605 }, { "entropy": 0.5258049964904785, "epoch": 2.2616822429906542, "grad_norm": 0.0534173846244812, "learning_rate": 0.0002, "loss": 0.5415172576904297, "mean_token_accuracy": 0.7817163467407227, "num_tokens": 9870877.0, "step": 606 }, { "entropy": 0.5240575075149536, "epoch": 2.2654205607476636, "grad_norm": 0.03395333141088486, "learning_rate": 0.0002, "loss": 0.5256165862083435, "mean_token_accuracy": 0.7837403416633606, "num_tokens": 9887224.0, "step": 607 }, { "entropy": 0.5454617738723755, "epoch": 2.269158878504673, "grad_norm": 0.034148454666137695, "learning_rate": 0.0002, "loss": 0.5424824953079224, "mean_token_accuracy": 0.7791529148817062, "num_tokens": 9903786.0, "step": 608 }, { "entropy": 0.5350487977266312, "epoch": 2.2728971962616824, "grad_norm": 0.042522136121988297, "learning_rate": 0.0002, "loss": 0.5272009372711182, "mean_token_accuracy": 0.7874994874000549, "num_tokens": 9920053.0, "step": 609 }, { "entropy": 0.5338039100170135, "epoch": 2.2766355140186914, "grad_norm": 0.036921191960573196, "learning_rate": 0.0002, "loss": 0.5227792859077454, "mean_token_accuracy": 0.7891070544719696, "num_tokens": 9936211.0, "step": 610 }, { "entropy": 0.5317139476537704, "epoch": 2.2803738317757007, "grad_norm": 0.038269490003585815, "learning_rate": 0.0002, "loss": 0.5253998637199402, "mean_token_accuracy": 0.7870776653289795, "num_tokens": 9952725.0, "step": 611 }, { "entropy": 0.5196784734725952, "epoch": 2.28411214953271, "grad_norm": 0.03972024843096733, "learning_rate": 0.0002, "loss": 0.5251049995422363, "mean_token_accuracy": 0.7839716672897339, "num_tokens": 9969316.0, "step": 612 }, { "entropy": 0.5095352083444595, "epoch": 2.2878504672897195, "grad_norm": 0.0507940798997879, "learning_rate": 0.0002, "loss": 0.5290789008140564, "mean_token_accuracy": 0.7861248552799225, "num_tokens": 9985447.0, "step": 613 }, { "entropy": 0.5270750820636749, "epoch": 2.291588785046729, "grad_norm": 0.04321181774139404, "learning_rate": 0.0002, "loss": 0.5311838984489441, "mean_token_accuracy": 0.7838535755872726, "num_tokens": 10001725.0, "step": 614 }, { "entropy": 0.5379711389541626, "epoch": 2.2953271028037383, "grad_norm": 0.040656980127096176, "learning_rate": 0.0002, "loss": 0.5385247468948364, "mean_token_accuracy": 0.7803602814674377, "num_tokens": 10018134.0, "step": 615 }, { "entropy": 0.5364449620246887, "epoch": 2.2990654205607477, "grad_norm": 0.044270358979701996, "learning_rate": 0.0002, "loss": 0.5303220748901367, "mean_token_accuracy": 0.7875775545835495, "num_tokens": 10034256.0, "step": 616 }, { "entropy": 0.5223758369684219, "epoch": 2.302803738317757, "grad_norm": 0.04040619730949402, "learning_rate": 0.0002, "loss": 0.5194275379180908, "mean_token_accuracy": 0.7908173054456711, "num_tokens": 10050260.0, "step": 617 }, { "entropy": 0.5754473656415939, "epoch": 2.3065420560747665, "grad_norm": 0.0413733534514904, "learning_rate": 0.0002, "loss": 0.5673513412475586, "mean_token_accuracy": 0.7693175226449966, "num_tokens": 10066439.0, "step": 618 }, { "entropy": 0.5494302958250046, "epoch": 2.310280373831776, "grad_norm": 0.04788622632622719, "learning_rate": 0.0002, "loss": 0.5560729503631592, "mean_token_accuracy": 0.7737975120544434, "num_tokens": 10082592.0, "step": 619 }, { "entropy": 0.5400004386901855, "epoch": 2.3140186915887853, "grad_norm": 0.04467733949422836, "learning_rate": 0.0002, "loss": 0.5475805997848511, "mean_token_accuracy": 0.7767456918954849, "num_tokens": 10098902.0, "step": 620 }, { "entropy": 0.5090039819478989, "epoch": 2.317757009345794, "grad_norm": 0.04413570463657379, "learning_rate": 0.0002, "loss": 0.5152875781059265, "mean_token_accuracy": 0.792495995759964, "num_tokens": 10115273.0, "step": 621 }, { "entropy": 0.5372920483350754, "epoch": 2.3214953271028036, "grad_norm": 0.037302058190107346, "learning_rate": 0.0002, "loss": 0.5321361422538757, "mean_token_accuracy": 0.7862480282783508, "num_tokens": 10131501.0, "step": 622 }, { "entropy": 0.5543005019426346, "epoch": 2.325233644859813, "grad_norm": 0.03829365596175194, "learning_rate": 0.0002, "loss": 0.5508820414543152, "mean_token_accuracy": 0.7745321840047836, "num_tokens": 10147998.0, "step": 623 }, { "entropy": 0.5153163969516754, "epoch": 2.3289719626168224, "grad_norm": 0.045321445912122726, "learning_rate": 0.0002, "loss": 0.5118069052696228, "mean_token_accuracy": 0.7935506701469421, "num_tokens": 10164126.0, "step": 624 }, { "entropy": 0.5008471608161926, "epoch": 2.3327102803738318, "grad_norm": 0.04449000954627991, "learning_rate": 0.0002, "loss": 0.5082967877388, "mean_token_accuracy": 0.7942900061607361, "num_tokens": 10180274.0, "step": 625 }, { "entropy": 0.532206118106842, "epoch": 2.336448598130841, "grad_norm": 0.05191594734787941, "learning_rate": 0.0002, "loss": 0.5367388129234314, "mean_token_accuracy": 0.7808051854372025, "num_tokens": 10196609.0, "step": 626 }, { "entropy": 0.5258989185094833, "epoch": 2.3401869158878505, "grad_norm": 0.044721271842718124, "learning_rate": 0.0002, "loss": 0.5331224203109741, "mean_token_accuracy": 0.7829412668943405, "num_tokens": 10212895.0, "step": 627 }, { "entropy": 0.5370120704174042, "epoch": 2.34392523364486, "grad_norm": 0.041769906878471375, "learning_rate": 0.0002, "loss": 0.5412429571151733, "mean_token_accuracy": 0.7827376574277878, "num_tokens": 10229237.0, "step": 628 }, { "entropy": 0.5400294661521912, "epoch": 2.3476635514018693, "grad_norm": 0.040269553661346436, "learning_rate": 0.0002, "loss": 0.5357171893119812, "mean_token_accuracy": 0.7816246598958969, "num_tokens": 10245453.0, "step": 629 }, { "entropy": 0.5325844436883926, "epoch": 2.3514018691588783, "grad_norm": 0.04499928280711174, "learning_rate": 0.0002, "loss": 0.5283193588256836, "mean_token_accuracy": 0.7859142124652863, "num_tokens": 10261777.0, "step": 630 }, { "entropy": 0.5282296687364578, "epoch": 2.3551401869158877, "grad_norm": 0.04336896538734436, "learning_rate": 0.0002, "loss": 0.5254157781600952, "mean_token_accuracy": 0.789379209280014, "num_tokens": 10278007.0, "step": 631 }, { "entropy": 0.5453646928071976, "epoch": 2.358878504672897, "grad_norm": 0.05249177664518356, "learning_rate": 0.0002, "loss": 0.5468531250953674, "mean_token_accuracy": 0.7771991342306137, "num_tokens": 10294331.0, "step": 632 }, { "entropy": 0.543931856751442, "epoch": 2.3626168224299064, "grad_norm": 0.037500377744436264, "learning_rate": 0.0002, "loss": 0.5477216839790344, "mean_token_accuracy": 0.7776368409395218, "num_tokens": 10310976.0, "step": 633 }, { "entropy": 0.5300342440605164, "epoch": 2.366355140186916, "grad_norm": 0.04039130359888077, "learning_rate": 0.0002, "loss": 0.5305655002593994, "mean_token_accuracy": 0.7832176089286804, "num_tokens": 10327256.0, "step": 634 }, { "entropy": 0.5378967821598053, "epoch": 2.3700934579439252, "grad_norm": 0.04444447159767151, "learning_rate": 0.0002, "loss": 0.5362187027931213, "mean_token_accuracy": 0.7842839509248734, "num_tokens": 10343608.0, "step": 635 }, { "entropy": 0.5510306656360626, "epoch": 2.3738317757009346, "grad_norm": 0.04542792961001396, "learning_rate": 0.0002, "loss": 0.5493132472038269, "mean_token_accuracy": 0.7786229699850082, "num_tokens": 10359923.0, "step": 636 }, { "entropy": 0.5210727900266647, "epoch": 2.377570093457944, "grad_norm": 0.043661415576934814, "learning_rate": 0.0002, "loss": 0.5236334800720215, "mean_token_accuracy": 0.7890983521938324, "num_tokens": 10376100.0, "step": 637 }, { "entropy": 0.5260880589485168, "epoch": 2.3813084112149534, "grad_norm": 0.04262132570147514, "learning_rate": 0.0002, "loss": 0.5248558521270752, "mean_token_accuracy": 0.7902341783046722, "num_tokens": 10392698.0, "step": 638 }, { "entropy": 0.5457091331481934, "epoch": 2.385046728971963, "grad_norm": 0.04899441823363304, "learning_rate": 0.0002, "loss": 0.5536708235740662, "mean_token_accuracy": 0.7760955542325974, "num_tokens": 10409076.0, "step": 639 }, { "entropy": 0.5321961939334869, "epoch": 2.388785046728972, "grad_norm": 0.045906826853752136, "learning_rate": 0.0002, "loss": 0.5316425561904907, "mean_token_accuracy": 0.7848930060863495, "num_tokens": 10425501.0, "step": 640 }, { "entropy": 0.5476334244012833, "epoch": 2.392523364485981, "grad_norm": 0.038592927157878876, "learning_rate": 0.0002, "loss": 0.5469234585762024, "mean_token_accuracy": 0.7766659259796143, "num_tokens": 10441907.0, "step": 641 }, { "entropy": 0.514763131737709, "epoch": 2.3962616822429905, "grad_norm": 0.04247188940644264, "learning_rate": 0.0002, "loss": 0.5191242098808289, "mean_token_accuracy": 0.7888349145650864, "num_tokens": 10458019.0, "step": 642 }, { "entropy": 0.5377763360738754, "epoch": 2.4, "grad_norm": 0.037420280277729034, "learning_rate": 0.0002, "loss": 0.5363115072250366, "mean_token_accuracy": 0.7803380340337753, "num_tokens": 10474412.0, "step": 643 }, { "entropy": 0.5383724719285965, "epoch": 2.4037383177570093, "grad_norm": 0.038523126393556595, "learning_rate": 0.0002, "loss": 0.5415539145469666, "mean_token_accuracy": 0.7787618041038513, "num_tokens": 10490995.0, "step": 644 }, { "entropy": 0.5374136418104172, "epoch": 2.4074766355140187, "grad_norm": 0.03964264318346977, "learning_rate": 0.0002, "loss": 0.5468027591705322, "mean_token_accuracy": 0.779059037566185, "num_tokens": 10507482.0, "step": 645 }, { "entropy": 0.5512133836746216, "epoch": 2.411214953271028, "grad_norm": 0.0391349270939827, "learning_rate": 0.0002, "loss": 0.5508245825767517, "mean_token_accuracy": 0.7754583358764648, "num_tokens": 10523993.0, "step": 646 }, { "entropy": 0.5193808674812317, "epoch": 2.4149532710280375, "grad_norm": 0.03556473180651665, "learning_rate": 0.0002, "loss": 0.5196793675422668, "mean_token_accuracy": 0.78975510597229, "num_tokens": 10540005.0, "step": 647 }, { "entropy": 0.5471558570861816, "epoch": 2.418691588785047, "grad_norm": 0.04553184658288956, "learning_rate": 0.0002, "loss": 0.547728419303894, "mean_token_accuracy": 0.7780675292015076, "num_tokens": 10555891.0, "step": 648 }, { "entropy": 0.519458457827568, "epoch": 2.4224299065420563, "grad_norm": 0.045790717005729675, "learning_rate": 0.0002, "loss": 0.5232809782028198, "mean_token_accuracy": 0.7882662564516068, "num_tokens": 10572109.0, "step": 649 }, { "entropy": 0.5270252674818039, "epoch": 2.426168224299065, "grad_norm": 0.04227881506085396, "learning_rate": 0.0002, "loss": 0.5288085341453552, "mean_token_accuracy": 0.7866526395082474, "num_tokens": 10588192.0, "step": 650 }, { "entropy": 0.548214852809906, "epoch": 2.4299065420560746, "grad_norm": 0.04126811400055885, "learning_rate": 0.0002, "loss": 0.5440689325332642, "mean_token_accuracy": 0.779522180557251, "num_tokens": 10604498.0, "step": 651 }, { "entropy": 0.5452295988798141, "epoch": 2.433644859813084, "grad_norm": 0.044819604605436325, "learning_rate": 0.0002, "loss": 0.547234833240509, "mean_token_accuracy": 0.7796365767717361, "num_tokens": 10620949.0, "step": 652 }, { "entropy": 0.5525990724563599, "epoch": 2.4373831775700934, "grad_norm": 0.042418453842401505, "learning_rate": 0.0002, "loss": 0.5493718385696411, "mean_token_accuracy": 0.7783072590827942, "num_tokens": 10637398.0, "step": 653 }, { "entropy": 0.5338578671216965, "epoch": 2.4411214953271028, "grad_norm": 0.048241496086120605, "learning_rate": 0.0002, "loss": 0.5348434448242188, "mean_token_accuracy": 0.7853177338838577, "num_tokens": 10653827.0, "step": 654 }, { "entropy": 0.5247549116611481, "epoch": 2.444859813084112, "grad_norm": 0.03876890614628792, "learning_rate": 0.0002, "loss": 0.5283288359642029, "mean_token_accuracy": 0.7865240424871445, "num_tokens": 10670227.0, "step": 655 }, { "entropy": 0.5525484532117844, "epoch": 2.4485981308411215, "grad_norm": 0.04079402610659599, "learning_rate": 0.0002, "loss": 0.5510199069976807, "mean_token_accuracy": 0.7765209227800369, "num_tokens": 10686514.0, "step": 656 }, { "entropy": 0.5248308256268501, "epoch": 2.452336448598131, "grad_norm": 0.03220357000827789, "learning_rate": 0.0002, "loss": 0.5197701454162598, "mean_token_accuracy": 0.7878830432891846, "num_tokens": 10702613.0, "step": 657 }, { "entropy": 0.5264022424817085, "epoch": 2.4560747663551403, "grad_norm": 0.038926877081394196, "learning_rate": 0.0002, "loss": 0.5227438807487488, "mean_token_accuracy": 0.7853628695011139, "num_tokens": 10718690.0, "step": 658 }, { "entropy": 0.5430135428905487, "epoch": 2.4598130841121497, "grad_norm": 0.04270581528544426, "learning_rate": 0.0002, "loss": 0.5455408096313477, "mean_token_accuracy": 0.7791119664907455, "num_tokens": 10735135.0, "step": 659 }, { "entropy": 0.5284547656774521, "epoch": 2.463551401869159, "grad_norm": 0.04039589315652847, "learning_rate": 0.0002, "loss": 0.5309383273124695, "mean_token_accuracy": 0.784732460975647, "num_tokens": 10751298.0, "step": 660 }, { "entropy": 0.5267135500907898, "epoch": 2.467289719626168, "grad_norm": 0.042588524520397186, "learning_rate": 0.0002, "loss": 0.5272895097732544, "mean_token_accuracy": 0.7885420620441437, "num_tokens": 10767947.0, "step": 661 }, { "entropy": 0.5294100195169449, "epoch": 2.4710280373831774, "grad_norm": 0.04541191831231117, "learning_rate": 0.0002, "loss": 0.5415511727333069, "mean_token_accuracy": 0.7802952826023102, "num_tokens": 10784155.0, "step": 662 }, { "entropy": 0.5230477377772331, "epoch": 2.474766355140187, "grad_norm": 0.04615366831421852, "learning_rate": 0.0002, "loss": 0.5295774936676025, "mean_token_accuracy": 0.7873392999172211, "num_tokens": 10800552.0, "step": 663 }, { "entropy": 0.5188637897372246, "epoch": 2.4785046728971962, "grad_norm": 0.03992808610200882, "learning_rate": 0.0002, "loss": 0.5195883512496948, "mean_token_accuracy": 0.7883334010839462, "num_tokens": 10816926.0, "step": 664 }, { "entropy": 0.5323937982320786, "epoch": 2.4822429906542056, "grad_norm": 0.04497828707098961, "learning_rate": 0.0002, "loss": 0.5278034210205078, "mean_token_accuracy": 0.7848539501428604, "num_tokens": 10833159.0, "step": 665 }, { "entropy": 0.5480016022920609, "epoch": 2.485981308411215, "grad_norm": 0.0394604429602623, "learning_rate": 0.0002, "loss": 0.5437833070755005, "mean_token_accuracy": 0.7807918637990952, "num_tokens": 10849417.0, "step": 666 }, { "entropy": 0.5170062035322189, "epoch": 2.4897196261682244, "grad_norm": 0.041445329785346985, "learning_rate": 0.0002, "loss": 0.517329216003418, "mean_token_accuracy": 0.7887666076421738, "num_tokens": 10865715.0, "step": 667 }, { "entropy": 0.5371522009372711, "epoch": 2.493457943925234, "grad_norm": 0.042152535170316696, "learning_rate": 0.0002, "loss": 0.5461167693138123, "mean_token_accuracy": 0.7759047448635101, "num_tokens": 10881891.0, "step": 668 }, { "entropy": 0.522216372191906, "epoch": 2.497196261682243, "grad_norm": 0.04944324120879173, "learning_rate": 0.0002, "loss": 0.5293608903884888, "mean_token_accuracy": 0.7865939140319824, "num_tokens": 10898086.0, "step": 669 }, { "entropy": 0.5419133603572845, "epoch": 2.500934579439252, "grad_norm": 0.03869049996137619, "learning_rate": 0.0002, "loss": 0.5435135364532471, "mean_token_accuracy": 0.7788238078355789, "num_tokens": 10914630.0, "step": 670 }, { "entropy": 0.543552428483963, "epoch": 2.5046728971962615, "grad_norm": 0.040104418992996216, "learning_rate": 0.0002, "loss": 0.5451544523239136, "mean_token_accuracy": 0.7762735784053802, "num_tokens": 10931142.0, "step": 671 }, { "entropy": 0.5488818436861038, "epoch": 2.508411214953271, "grad_norm": 0.03650939092040062, "learning_rate": 0.0002, "loss": 0.5461534857749939, "mean_token_accuracy": 0.7810324132442474, "num_tokens": 10947432.0, "step": 672 }, { "entropy": 0.5514579713344574, "epoch": 2.5121495327102803, "grad_norm": 0.035640496760606766, "learning_rate": 0.0002, "loss": 0.5461341142654419, "mean_token_accuracy": 0.7758427411317825, "num_tokens": 10963793.0, "step": 673 }, { "entropy": 0.5298633724451065, "epoch": 2.5158878504672897, "grad_norm": 0.036869630217552185, "learning_rate": 0.0002, "loss": 0.5271415710449219, "mean_token_accuracy": 0.7874128669500351, "num_tokens": 10980238.0, "step": 674 }, { "entropy": 0.5178606957197189, "epoch": 2.519626168224299, "grad_norm": 0.04496290162205696, "learning_rate": 0.0002, "loss": 0.5193417072296143, "mean_token_accuracy": 0.7885989248752594, "num_tokens": 10996365.0, "step": 675 }, { "entropy": 0.5270267352461815, "epoch": 2.5233644859813085, "grad_norm": 0.04544811695814133, "learning_rate": 0.0002, "loss": 0.5387653112411499, "mean_token_accuracy": 0.7800068855285645, "num_tokens": 11012575.0, "step": 676 }, { "entropy": 0.527735561132431, "epoch": 2.527102803738318, "grad_norm": 0.04031702131032944, "learning_rate": 0.0002, "loss": 0.5367462635040283, "mean_token_accuracy": 0.7821540981531143, "num_tokens": 11028942.0, "step": 677 }, { "entropy": 0.5479142069816589, "epoch": 2.5308411214953273, "grad_norm": 0.042728912085294724, "learning_rate": 0.0002, "loss": 0.5432093739509583, "mean_token_accuracy": 0.7799795567989349, "num_tokens": 11045296.0, "step": 678 }, { "entropy": 0.5360302478075027, "epoch": 2.5345794392523366, "grad_norm": 0.040872231125831604, "learning_rate": 0.0002, "loss": 0.5265986323356628, "mean_token_accuracy": 0.7887827455997467, "num_tokens": 11061450.0, "step": 679 }, { "entropy": 0.5468751043081284, "epoch": 2.538317757009346, "grad_norm": 0.0408024825155735, "learning_rate": 0.0002, "loss": 0.5442636609077454, "mean_token_accuracy": 0.7790944874286652, "num_tokens": 11077540.0, "step": 680 }, { "entropy": 0.530633345246315, "epoch": 2.542056074766355, "grad_norm": 0.04209808632731438, "learning_rate": 0.0002, "loss": 0.5363141894340515, "mean_token_accuracy": 0.7819496542215347, "num_tokens": 11093632.0, "step": 681 }, { "entropy": 0.5098425000905991, "epoch": 2.5457943925233644, "grad_norm": 0.04276811331510544, "learning_rate": 0.0002, "loss": 0.5222542881965637, "mean_token_accuracy": 0.7871226519346237, "num_tokens": 11110142.0, "step": 682 }, { "entropy": 0.5203486457467079, "epoch": 2.5495327102803738, "grad_norm": 0.04667636379599571, "learning_rate": 0.0002, "loss": 0.52687668800354, "mean_token_accuracy": 0.7876535356044769, "num_tokens": 11126405.0, "step": 683 }, { "entropy": 0.5424248725175858, "epoch": 2.553271028037383, "grad_norm": 0.03960704430937767, "learning_rate": 0.0002, "loss": 0.5351195335388184, "mean_token_accuracy": 0.7820920497179031, "num_tokens": 11142681.0, "step": 684 }, { "entropy": 0.5479930490255356, "epoch": 2.5570093457943925, "grad_norm": 0.03865355625748634, "learning_rate": 0.0002, "loss": 0.5381141901016235, "mean_token_accuracy": 0.7842580229043961, "num_tokens": 11158981.0, "step": 685 }, { "entropy": 0.5378328114748001, "epoch": 2.560747663551402, "grad_norm": 0.0406392477452755, "learning_rate": 0.0002, "loss": 0.5395403504371643, "mean_token_accuracy": 0.7812999784946442, "num_tokens": 11175185.0, "step": 686 }, { "entropy": 0.5591647922992706, "epoch": 2.5644859813084113, "grad_norm": 0.042679473757743835, "learning_rate": 0.0002, "loss": 0.5618141889572144, "mean_token_accuracy": 0.7730479836463928, "num_tokens": 11191516.0, "step": 687 }, { "entropy": 0.540540523827076, "epoch": 2.5682242990654207, "grad_norm": 0.0401788055896759, "learning_rate": 0.0002, "loss": 0.5431095957756042, "mean_token_accuracy": 0.7800974696874619, "num_tokens": 11207897.0, "step": 688 }, { "entropy": 0.5273384600877762, "epoch": 2.5719626168224297, "grad_norm": 0.04009004309773445, "learning_rate": 0.0002, "loss": 0.5236154794692993, "mean_token_accuracy": 0.7862724959850311, "num_tokens": 11224233.0, "step": 689 }, { "entropy": 0.5341546684503555, "epoch": 2.575700934579439, "grad_norm": 0.045469239354133606, "learning_rate": 0.0002, "loss": 0.5359405875205994, "mean_token_accuracy": 0.7828920185565948, "num_tokens": 11240583.0, "step": 690 }, { "entropy": 0.516716443002224, "epoch": 2.5794392523364484, "grad_norm": 0.03841989487409592, "learning_rate": 0.0002, "loss": 0.5178863406181335, "mean_token_accuracy": 0.7926649451255798, "num_tokens": 11256814.0, "step": 691 }, { "entropy": 0.5300464928150177, "epoch": 2.583177570093458, "grad_norm": 0.043383657932281494, "learning_rate": 0.0002, "loss": 0.534642219543457, "mean_token_accuracy": 0.7844998836517334, "num_tokens": 11273092.0, "step": 692 }, { "entropy": 0.5270805209875107, "epoch": 2.586915887850467, "grad_norm": 0.042948167771101, "learning_rate": 0.0002, "loss": 0.5318405628204346, "mean_token_accuracy": 0.7814630717039108, "num_tokens": 11289382.0, "step": 693 }, { "entropy": 0.5576307624578476, "epoch": 2.5906542056074766, "grad_norm": 0.04289550706744194, "learning_rate": 0.0002, "loss": 0.5595361590385437, "mean_token_accuracy": 0.77448670566082, "num_tokens": 11305822.0, "step": 694 }, { "entropy": 0.5350489318370819, "epoch": 2.594392523364486, "grad_norm": 0.036010973155498505, "learning_rate": 0.0002, "loss": 0.5320281982421875, "mean_token_accuracy": 0.7841717451810837, "num_tokens": 11322116.0, "step": 695 }, { "entropy": 0.5389258116483688, "epoch": 2.5981308411214954, "grad_norm": 0.036538656800985336, "learning_rate": 0.0002, "loss": 0.5332745313644409, "mean_token_accuracy": 0.7836548089981079, "num_tokens": 11338486.0, "step": 696 }, { "entropy": 0.5357422530651093, "epoch": 2.601869158878505, "grad_norm": 0.03977203741669655, "learning_rate": 0.0002, "loss": 0.5403972864151001, "mean_token_accuracy": 0.7783884555101395, "num_tokens": 11355126.0, "step": 697 }, { "entropy": 0.5224239528179169, "epoch": 2.605607476635514, "grad_norm": 0.03854282945394516, "learning_rate": 0.0002, "loss": 0.5209836363792419, "mean_token_accuracy": 0.7890230715274811, "num_tokens": 11371642.0, "step": 698 }, { "entropy": 0.527114674448967, "epoch": 2.6093457943925236, "grad_norm": 0.03806879743933678, "learning_rate": 0.0002, "loss": 0.5328760743141174, "mean_token_accuracy": 0.7834767252206802, "num_tokens": 11388018.0, "step": 699 }, { "entropy": 0.5207114219665527, "epoch": 2.613084112149533, "grad_norm": 0.04797474667429924, "learning_rate": 0.0002, "loss": 0.5281696915626526, "mean_token_accuracy": 0.7842787057161331, "num_tokens": 11404304.0, "step": 700 }, { "entropy": 0.5329904109239578, "epoch": 2.616822429906542, "grad_norm": 0.04143727570772171, "learning_rate": 0.0002, "loss": 0.5371139645576477, "mean_token_accuracy": 0.7831498682498932, "num_tokens": 11420561.0, "step": 701 }, { "entropy": 0.5422161221504211, "epoch": 2.6205607476635513, "grad_norm": 0.04683515056967735, "learning_rate": 0.0002, "loss": 0.5436529517173767, "mean_token_accuracy": 0.7796959728002548, "num_tokens": 11436820.0, "step": 702 }, { "entropy": 0.5309348404407501, "epoch": 2.6242990654205607, "grad_norm": 0.036559656262397766, "learning_rate": 0.0002, "loss": 0.5223227143287659, "mean_token_accuracy": 0.7849199175834656, "num_tokens": 11453134.0, "step": 703 }, { "entropy": 0.5515079498291016, "epoch": 2.62803738317757, "grad_norm": 0.047568727284669876, "learning_rate": 0.0002, "loss": 0.5509875416755676, "mean_token_accuracy": 0.7774451673030853, "num_tokens": 11469442.0, "step": 704 }, { "entropy": 0.5654275268316269, "epoch": 2.6317757009345795, "grad_norm": 0.03854409605264664, "learning_rate": 0.0002, "loss": 0.559022068977356, "mean_token_accuracy": 0.7747441530227661, "num_tokens": 11485880.0, "step": 705 }, { "entropy": 0.5369984805583954, "epoch": 2.635514018691589, "grad_norm": 0.04869009181857109, "learning_rate": 0.0002, "loss": 0.5361051559448242, "mean_token_accuracy": 0.780804455280304, "num_tokens": 11502359.0, "step": 706 }, { "entropy": 0.542375922203064, "epoch": 2.6392523364485982, "grad_norm": 0.045840587466955185, "learning_rate": 0.0002, "loss": 0.5502850413322449, "mean_token_accuracy": 0.7759635299444199, "num_tokens": 11518813.0, "step": 707 }, { "entropy": 0.5237139612436295, "epoch": 2.6429906542056076, "grad_norm": 0.043406110256910324, "learning_rate": 0.0002, "loss": 0.5281059741973877, "mean_token_accuracy": 0.7859614938497543, "num_tokens": 11535188.0, "step": 708 }, { "entropy": 0.5367631316184998, "epoch": 2.6467289719626166, "grad_norm": 0.04024430736899376, "learning_rate": 0.0002, "loss": 0.5387470126152039, "mean_token_accuracy": 0.7812274694442749, "num_tokens": 11551645.0, "step": 709 }, { "entropy": 0.5330280810594559, "epoch": 2.650467289719626, "grad_norm": 0.0389426052570343, "learning_rate": 0.0002, "loss": 0.5361229181289673, "mean_token_accuracy": 0.7837622314691544, "num_tokens": 11567892.0, "step": 710 }, { "entropy": 0.5259372144937515, "epoch": 2.6542056074766354, "grad_norm": 0.03997652605175972, "learning_rate": 0.0002, "loss": 0.5267660617828369, "mean_token_accuracy": 0.7850897163152695, "num_tokens": 11584153.0, "step": 711 }, { "entropy": 0.5390958487987518, "epoch": 2.6579439252336448, "grad_norm": 0.04180564358830452, "learning_rate": 0.0002, "loss": 0.5372406244277954, "mean_token_accuracy": 0.7838725447654724, "num_tokens": 11600597.0, "step": 712 }, { "entropy": 0.5279987677931786, "epoch": 2.661682242990654, "grad_norm": 0.03591061756014824, "learning_rate": 0.0002, "loss": 0.5308532118797302, "mean_token_accuracy": 0.785730242729187, "num_tokens": 11616881.0, "step": 713 }, { "entropy": 0.5563876032829285, "epoch": 2.6654205607476635, "grad_norm": 0.03892669454216957, "learning_rate": 0.0002, "loss": 0.5556321144104004, "mean_token_accuracy": 0.7758439630270004, "num_tokens": 11633329.0, "step": 714 }, { "entropy": 0.5373513847589493, "epoch": 2.669158878504673, "grad_norm": 0.03863142430782318, "learning_rate": 0.0002, "loss": 0.5352209806442261, "mean_token_accuracy": 0.7836543023586273, "num_tokens": 11649751.0, "step": 715 }, { "entropy": 0.5123810023069382, "epoch": 2.6728971962616823, "grad_norm": 0.04038078337907791, "learning_rate": 0.0002, "loss": 0.5158439874649048, "mean_token_accuracy": 0.7905206978321075, "num_tokens": 11665928.0, "step": 716 }, { "entropy": 0.5479727983474731, "epoch": 2.6766355140186917, "grad_norm": 0.04204852879047394, "learning_rate": 0.0002, "loss": 0.5506036281585693, "mean_token_accuracy": 0.7781369537115097, "num_tokens": 11682349.0, "step": 717 }, { "entropy": 0.5410658866167068, "epoch": 2.680373831775701, "grad_norm": 0.04252674803137779, "learning_rate": 0.0002, "loss": 0.5433157086372375, "mean_token_accuracy": 0.776948869228363, "num_tokens": 11698941.0, "step": 718 }, { "entropy": 0.5443103611469269, "epoch": 2.6841121495327105, "grad_norm": 0.044883646070957184, "learning_rate": 0.0002, "loss": 0.5470229983329773, "mean_token_accuracy": 0.7803091257810593, "num_tokens": 11715434.0, "step": 719 }, { "entropy": 0.5390113294124603, "epoch": 2.68785046728972, "grad_norm": 0.04012865573167801, "learning_rate": 0.0002, "loss": 0.5320149660110474, "mean_token_accuracy": 0.7860948741436005, "num_tokens": 11731697.0, "step": 720 }, { "entropy": 0.5281476825475693, "epoch": 2.691588785046729, "grad_norm": 0.04816235229372978, "learning_rate": 0.0002, "loss": 0.5312087535858154, "mean_token_accuracy": 0.7858725935220718, "num_tokens": 11747788.0, "step": 721 }, { "entropy": 0.5142519026994705, "epoch": 2.695327102803738, "grad_norm": 0.0394207127392292, "learning_rate": 0.0002, "loss": 0.5175022482872009, "mean_token_accuracy": 0.7914264351129532, "num_tokens": 11763802.0, "step": 722 }, { "entropy": 0.5183316618204117, "epoch": 2.6990654205607476, "grad_norm": 0.04731175675988197, "learning_rate": 0.0002, "loss": 0.5275416374206543, "mean_token_accuracy": 0.7866149395704269, "num_tokens": 11779759.0, "step": 723 }, { "entropy": 0.5322978273034096, "epoch": 2.702803738317757, "grad_norm": 0.045594654977321625, "learning_rate": 0.0002, "loss": 0.5377396941184998, "mean_token_accuracy": 0.7802564948797226, "num_tokens": 11795656.0, "step": 724 }, { "entropy": 0.5265089273452759, "epoch": 2.7065420560747664, "grad_norm": 0.04707048460841179, "learning_rate": 0.0002, "loss": 0.5340720415115356, "mean_token_accuracy": 0.7816154807806015, "num_tokens": 11811757.0, "step": 725 }, { "entropy": 0.5486596673727036, "epoch": 2.710280373831776, "grad_norm": 0.04378875717520714, "learning_rate": 0.0002, "loss": 0.5447016358375549, "mean_token_accuracy": 0.7777462303638458, "num_tokens": 11828249.0, "step": 726 }, { "entropy": 0.5557577461004257, "epoch": 2.714018691588785, "grad_norm": 0.044526614248752594, "learning_rate": 0.0002, "loss": 0.5464760661125183, "mean_token_accuracy": 0.7786324173212051, "num_tokens": 11844645.0, "step": 727 }, { "entropy": 0.5483285784721375, "epoch": 2.717757009345794, "grad_norm": 0.05415434390306473, "learning_rate": 0.0002, "loss": 0.5537320971488953, "mean_token_accuracy": 0.774675577878952, "num_tokens": 11860972.0, "step": 728 }, { "entropy": 0.5311020910739899, "epoch": 2.7214953271028035, "grad_norm": 0.043242573738098145, "learning_rate": 0.0002, "loss": 0.5344421863555908, "mean_token_accuracy": 0.7838677763938904, "num_tokens": 11876848.0, "step": 729 }, { "entropy": 0.5571545660495758, "epoch": 2.725233644859813, "grad_norm": 0.04775959998369217, "learning_rate": 0.0002, "loss": 0.5543075799942017, "mean_token_accuracy": 0.7767691016197205, "num_tokens": 11893101.0, "step": 730 }, { "entropy": 0.5632807910442352, "epoch": 2.7289719626168223, "grad_norm": 0.040951792150735855, "learning_rate": 0.0002, "loss": 0.556804895401001, "mean_token_accuracy": 0.7738458663225174, "num_tokens": 11909248.0, "step": 731 }, { "entropy": 0.5437204986810684, "epoch": 2.7327102803738317, "grad_norm": 0.041280943900346756, "learning_rate": 0.0002, "loss": 0.5405519604682922, "mean_token_accuracy": 0.7808393985033035, "num_tokens": 11925644.0, "step": 732 }, { "entropy": 0.5410651564598083, "epoch": 2.736448598130841, "grad_norm": 0.04410838708281517, "learning_rate": 0.0002, "loss": 0.5487910509109497, "mean_token_accuracy": 0.7771375328302383, "num_tokens": 11941579.0, "step": 733 }, { "entropy": 0.543538823723793, "epoch": 2.7401869158878505, "grad_norm": 0.04985618218779564, "learning_rate": 0.0002, "loss": 0.5518176555633545, "mean_token_accuracy": 0.775468647480011, "num_tokens": 11957981.0, "step": 734 }, { "entropy": 0.5253164023160934, "epoch": 2.74392523364486, "grad_norm": 0.04087154567241669, "learning_rate": 0.0002, "loss": 0.5267685651779175, "mean_token_accuracy": 0.7876032888889313, "num_tokens": 11974282.0, "step": 735 }, { "entropy": 0.5454862713813782, "epoch": 2.7476635514018692, "grad_norm": 0.04045165702700615, "learning_rate": 0.0002, "loss": 0.5382283926010132, "mean_token_accuracy": 0.7811629176139832, "num_tokens": 11990945.0, "step": 736 }, { "entropy": 0.5417391657829285, "epoch": 2.7514018691588786, "grad_norm": 0.042311448603868484, "learning_rate": 0.0002, "loss": 0.540289044380188, "mean_token_accuracy": 0.7793714255094528, "num_tokens": 12007392.0, "step": 737 }, { "entropy": 0.5214735865592957, "epoch": 2.755140186915888, "grad_norm": 0.04158855974674225, "learning_rate": 0.0002, "loss": 0.5217651128768921, "mean_token_accuracy": 0.7852792292833328, "num_tokens": 12023581.0, "step": 738 }, { "entropy": 0.5328553915023804, "epoch": 2.7588785046728974, "grad_norm": 0.038325536996126175, "learning_rate": 0.0002, "loss": 0.5344902873039246, "mean_token_accuracy": 0.7842058092355728, "num_tokens": 12039885.0, "step": 739 }, { "entropy": 0.5496254563331604, "epoch": 2.762616822429907, "grad_norm": 0.04375292733311653, "learning_rate": 0.0002, "loss": 0.55174720287323, "mean_token_accuracy": 0.7766779661178589, "num_tokens": 12056371.0, "step": 740 }, { "entropy": 0.558516189455986, "epoch": 2.7663551401869158, "grad_norm": 0.049271486699581146, "learning_rate": 0.0002, "loss": 0.561238169670105, "mean_token_accuracy": 0.77435702085495, "num_tokens": 12072839.0, "step": 741 }, { "entropy": 0.5472046732902527, "epoch": 2.770093457943925, "grad_norm": 0.04255034402012825, "learning_rate": 0.0002, "loss": 0.5455073714256287, "mean_token_accuracy": 0.7776911556720734, "num_tokens": 12089121.0, "step": 742 }, { "entropy": 0.5307886898517609, "epoch": 2.7738317757009345, "grad_norm": 0.04008355364203453, "learning_rate": 0.0002, "loss": 0.5308167934417725, "mean_token_accuracy": 0.785127267241478, "num_tokens": 12105321.0, "step": 743 }, { "entropy": 0.5314194560050964, "epoch": 2.777570093457944, "grad_norm": 0.043235525488853455, "learning_rate": 0.0002, "loss": 0.5316693186759949, "mean_token_accuracy": 0.7851164489984512, "num_tokens": 12121581.0, "step": 744 }, { "entropy": 0.5243879109621048, "epoch": 2.7813084112149533, "grad_norm": 0.0358644537627697, "learning_rate": 0.0002, "loss": 0.5208507776260376, "mean_token_accuracy": 0.7896229773759842, "num_tokens": 12138064.0, "step": 745 }, { "entropy": 0.5349021703004837, "epoch": 2.7850467289719627, "grad_norm": 0.04395059868693352, "learning_rate": 0.0002, "loss": 0.541559100151062, "mean_token_accuracy": 0.7818141132593155, "num_tokens": 12154580.0, "step": 746 }, { "entropy": 0.5464755445718765, "epoch": 2.788785046728972, "grad_norm": 0.03772180154919624, "learning_rate": 0.0002, "loss": 0.5500795245170593, "mean_token_accuracy": 0.7745375484228134, "num_tokens": 12170944.0, "step": 747 }, { "entropy": 0.5316334664821625, "epoch": 2.792523364485981, "grad_norm": 0.042537569999694824, "learning_rate": 0.0002, "loss": 0.5385891795158386, "mean_token_accuracy": 0.7813721299171448, "num_tokens": 12187183.0, "step": 748 }, { "entropy": 0.5325866043567657, "epoch": 2.7962616822429904, "grad_norm": 0.03928552195429802, "learning_rate": 0.0002, "loss": 0.5372824668884277, "mean_token_accuracy": 0.782025933265686, "num_tokens": 12203656.0, "step": 749 }, { "entropy": 0.5230025053024292, "epoch": 2.8, "grad_norm": 0.045356832444667816, "learning_rate": 0.0002, "loss": 0.5221288204193115, "mean_token_accuracy": 0.7879509478807449, "num_tokens": 12220217.0, "step": 750 }, { "entropy": 0.5552905946969986, "epoch": 2.803738317757009, "grad_norm": 0.03520367294549942, "learning_rate": 0.0002, "loss": 0.5458053350448608, "mean_token_accuracy": 0.7801086604595184, "num_tokens": 12236926.0, "step": 751 }, { "entropy": 0.5284090638160706, "epoch": 2.8074766355140186, "grad_norm": 0.04301855340600014, "learning_rate": 0.0002, "loss": 0.5322295427322388, "mean_token_accuracy": 0.7865041345357895, "num_tokens": 12253231.0, "step": 752 }, { "entropy": 0.5464428961277008, "epoch": 2.811214953271028, "grad_norm": 0.04177437350153923, "learning_rate": 0.0002, "loss": 0.5503079295158386, "mean_token_accuracy": 0.7759024053812027, "num_tokens": 12269564.0, "step": 753 }, { "entropy": 0.5288181900978088, "epoch": 2.8149532710280374, "grad_norm": 0.04611227661371231, "learning_rate": 0.0002, "loss": 0.5422286987304688, "mean_token_accuracy": 0.7793826460838318, "num_tokens": 12285764.0, "step": 754 }, { "entropy": 0.538264587521553, "epoch": 2.8186915887850468, "grad_norm": 0.039094604551792145, "learning_rate": 0.0002, "loss": 0.5421559810638428, "mean_token_accuracy": 0.7824651896953583, "num_tokens": 12301975.0, "step": 755 }, { "entropy": 0.5448143184185028, "epoch": 2.822429906542056, "grad_norm": 0.03843825310468674, "learning_rate": 0.0002, "loss": 0.5424494743347168, "mean_token_accuracy": 0.7786366790533066, "num_tokens": 12318265.0, "step": 756 }, { "entropy": 0.5362522453069687, "epoch": 2.8261682242990656, "grad_norm": 0.037981439381837845, "learning_rate": 0.0002, "loss": 0.5347139835357666, "mean_token_accuracy": 0.7820651233196259, "num_tokens": 12334596.0, "step": 757 }, { "entropy": 0.5419719219207764, "epoch": 2.829906542056075, "grad_norm": 0.03768031671643257, "learning_rate": 0.0002, "loss": 0.540343701839447, "mean_token_accuracy": 0.779738038778305, "num_tokens": 12351022.0, "step": 758 }, { "entropy": 0.5576566010713577, "epoch": 2.8336448598130843, "grad_norm": 0.03845515102148056, "learning_rate": 0.0002, "loss": 0.556204617023468, "mean_token_accuracy": 0.7719219624996185, "num_tokens": 12367469.0, "step": 759 }, { "entropy": 0.5245185047388077, "epoch": 2.8373831775700937, "grad_norm": 0.04210665449500084, "learning_rate": 0.0002, "loss": 0.5240767598152161, "mean_token_accuracy": 0.7867787629365921, "num_tokens": 12383664.0, "step": 760 }, { "entropy": 0.5366124212741852, "epoch": 2.8411214953271027, "grad_norm": 0.039727386087179184, "learning_rate": 0.0002, "loss": 0.5391771197319031, "mean_token_accuracy": 0.7799243628978729, "num_tokens": 12399816.0, "step": 761 }, { "entropy": 0.5430543571710587, "epoch": 2.844859813084112, "grad_norm": 0.04284166544675827, "learning_rate": 0.0002, "loss": 0.555898129940033, "mean_token_accuracy": 0.7769357264041901, "num_tokens": 12416232.0, "step": 762 }, { "entropy": 0.5447599291801453, "epoch": 2.8485981308411215, "grad_norm": 0.04133335128426552, "learning_rate": 0.0002, "loss": 0.5458224415779114, "mean_token_accuracy": 0.7791205793619156, "num_tokens": 12432772.0, "step": 763 }, { "entropy": 0.5463473051786423, "epoch": 2.852336448598131, "grad_norm": 0.04293463006615639, "learning_rate": 0.0002, "loss": 0.5410310626029968, "mean_token_accuracy": 0.7824665307998657, "num_tokens": 12449390.0, "step": 764 }, { "entropy": 0.5433794260025024, "epoch": 2.8560747663551402, "grad_norm": 0.0383763313293457, "learning_rate": 0.0002, "loss": 0.5330025553703308, "mean_token_accuracy": 0.786294624209404, "num_tokens": 12465761.0, "step": 765 }, { "entropy": 0.5348140597343445, "epoch": 2.8598130841121496, "grad_norm": 0.038813136518001556, "learning_rate": 0.0002, "loss": 0.5356075167655945, "mean_token_accuracy": 0.7799220532178879, "num_tokens": 12481995.0, "step": 766 }, { "entropy": 0.5310825854539871, "epoch": 2.863551401869159, "grad_norm": 0.04623069986701012, "learning_rate": 0.0002, "loss": 0.5389203429222107, "mean_token_accuracy": 0.7763766050338745, "num_tokens": 12498209.0, "step": 767 }, { "entropy": 0.5357654541730881, "epoch": 2.867289719626168, "grad_norm": 0.03819035738706589, "learning_rate": 0.0002, "loss": 0.5394827723503113, "mean_token_accuracy": 0.7809223681688309, "num_tokens": 12514712.0, "step": 768 }, { "entropy": 0.543551579117775, "epoch": 2.8710280373831774, "grad_norm": 0.043649353086948395, "learning_rate": 0.0002, "loss": 0.5464720129966736, "mean_token_accuracy": 0.7787970453500748, "num_tokens": 12531249.0, "step": 769 }, { "entropy": 0.5389954522252083, "epoch": 2.8747663551401867, "grad_norm": 0.036311469972133636, "learning_rate": 0.0002, "loss": 0.5379980206489563, "mean_token_accuracy": 0.7832965403795242, "num_tokens": 12547833.0, "step": 770 }, { "entropy": 0.5408525764942169, "epoch": 2.878504672897196, "grad_norm": 0.03780903294682503, "learning_rate": 0.0002, "loss": 0.539055585861206, "mean_token_accuracy": 0.7843980342149734, "num_tokens": 12564468.0, "step": 771 }, { "entropy": 0.5521610230207443, "epoch": 2.8822429906542055, "grad_norm": 0.042727869004011154, "learning_rate": 0.0002, "loss": 0.5518633723258972, "mean_token_accuracy": 0.7730461955070496, "num_tokens": 12580822.0, "step": 772 }, { "entropy": 0.5392657667398453, "epoch": 2.885981308411215, "grad_norm": 0.042652204632759094, "learning_rate": 0.0002, "loss": 0.5403409004211426, "mean_token_accuracy": 0.7833160161972046, "num_tokens": 12597306.0, "step": 773 }, { "entropy": 0.5409767031669617, "epoch": 2.8897196261682243, "grad_norm": 0.04756668955087662, "learning_rate": 0.0002, "loss": 0.5477514266967773, "mean_token_accuracy": 0.7775042653083801, "num_tokens": 12613430.0, "step": 774 }, { "entropy": 0.529184103012085, "epoch": 2.8934579439252337, "grad_norm": 0.040852271020412445, "learning_rate": 0.0002, "loss": 0.5368978381156921, "mean_token_accuracy": 0.7799389064311981, "num_tokens": 12629734.0, "step": 775 }, { "entropy": 0.5528028011322021, "epoch": 2.897196261682243, "grad_norm": 0.04610953480005264, "learning_rate": 0.0002, "loss": 0.5489134788513184, "mean_token_accuracy": 0.7778203934431076, "num_tokens": 12646051.0, "step": 776 }, { "entropy": 0.5398439168930054, "epoch": 2.9009345794392525, "grad_norm": 0.03999875858426094, "learning_rate": 0.0002, "loss": 0.5301113128662109, "mean_token_accuracy": 0.786536455154419, "num_tokens": 12662398.0, "step": 777 }, { "entropy": 0.5450849235057831, "epoch": 2.904672897196262, "grad_norm": 0.04052022844552994, "learning_rate": 0.0002, "loss": 0.5446597933769226, "mean_token_accuracy": 0.7773038446903229, "num_tokens": 12679053.0, "step": 778 }, { "entropy": 0.5272800028324127, "epoch": 2.9084112149532713, "grad_norm": 0.041017524898052216, "learning_rate": 0.0002, "loss": 0.5308842062950134, "mean_token_accuracy": 0.7858325839042664, "num_tokens": 12695608.0, "step": 779 }, { "entropy": 0.5401904284954071, "epoch": 2.91214953271028, "grad_norm": 0.04053664207458496, "learning_rate": 0.0002, "loss": 0.5450324416160583, "mean_token_accuracy": 0.7785527408123016, "num_tokens": 12712035.0, "step": 780 }, { "entropy": 0.5284470915794373, "epoch": 2.9158878504672896, "grad_norm": 0.04656258225440979, "learning_rate": 0.0002, "loss": 0.5301587581634521, "mean_token_accuracy": 0.781079113483429, "num_tokens": 12728285.0, "step": 781 }, { "entropy": 0.5552389323711395, "epoch": 2.919626168224299, "grad_norm": 0.043133046478033066, "learning_rate": 0.0002, "loss": 0.5493855476379395, "mean_token_accuracy": 0.7788817882537842, "num_tokens": 12744626.0, "step": 782 }, { "entropy": 0.536635085940361, "epoch": 2.9233644859813084, "grad_norm": 0.04232388734817505, "learning_rate": 0.0002, "loss": 0.5350582599639893, "mean_token_accuracy": 0.784316211938858, "num_tokens": 12760817.0, "step": 783 }, { "entropy": 0.5175309851765633, "epoch": 2.9271028037383178, "grad_norm": 0.05120910704135895, "learning_rate": 0.0002, "loss": 0.5239328742027283, "mean_token_accuracy": 0.7904608845710754, "num_tokens": 12777129.0, "step": 784 }, { "entropy": 0.5613889098167419, "epoch": 2.930841121495327, "grad_norm": 0.04064096510410309, "learning_rate": 0.0002, "loss": 0.5573512315750122, "mean_token_accuracy": 0.7735461741685867, "num_tokens": 12793633.0, "step": 785 }, { "entropy": 0.540812149643898, "epoch": 2.9345794392523366, "grad_norm": 0.04686618968844414, "learning_rate": 0.0002, "loss": 0.5428805947303772, "mean_token_accuracy": 0.7786334455013275, "num_tokens": 12809886.0, "step": 786 }, { "entropy": 0.5354818254709244, "epoch": 2.938317757009346, "grad_norm": 0.04068305343389511, "learning_rate": 0.0002, "loss": 0.5409020185470581, "mean_token_accuracy": 0.781467393040657, "num_tokens": 12826079.0, "step": 787 }, { "entropy": 0.5340152084827423, "epoch": 2.942056074766355, "grad_norm": 0.04302098974585533, "learning_rate": 0.0002, "loss": 0.5352627038955688, "mean_token_accuracy": 0.7827621698379517, "num_tokens": 12842255.0, "step": 788 }, { "entropy": 0.5471729636192322, "epoch": 2.9457943925233643, "grad_norm": 0.03707803413271904, "learning_rate": 0.0002, "loss": 0.5461200475692749, "mean_token_accuracy": 0.7784449309110641, "num_tokens": 12859013.0, "step": 789 }, { "entropy": 0.5401621907949448, "epoch": 2.9495327102803737, "grad_norm": 0.044071633368730545, "learning_rate": 0.0002, "loss": 0.5385332107543945, "mean_token_accuracy": 0.783258393406868, "num_tokens": 12875373.0, "step": 790 }, { "entropy": 0.5508020371198654, "epoch": 2.953271028037383, "grad_norm": 0.03822047635912895, "learning_rate": 0.0002, "loss": 0.5456752181053162, "mean_token_accuracy": 0.7771204560995102, "num_tokens": 12891653.0, "step": 791 }, { "entropy": 0.5405401140451431, "epoch": 2.9570093457943925, "grad_norm": 0.05170199275016785, "learning_rate": 0.0002, "loss": 0.5398849248886108, "mean_token_accuracy": 0.7820375263690948, "num_tokens": 12908131.0, "step": 792 }, { "entropy": 0.5514362305402756, "epoch": 2.960747663551402, "grad_norm": 0.036166463047266006, "learning_rate": 0.0002, "loss": 0.5504743456840515, "mean_token_accuracy": 0.7789987325668335, "num_tokens": 12924376.0, "step": 793 }, { "entropy": 0.5308372974395752, "epoch": 2.9644859813084112, "grad_norm": 0.04786797612905502, "learning_rate": 0.0002, "loss": 0.5306717753410339, "mean_token_accuracy": 0.7853545248508453, "num_tokens": 12940776.0, "step": 794 }, { "entropy": 0.532660722732544, "epoch": 2.9682242990654206, "grad_norm": 0.045564983040094376, "learning_rate": 0.0002, "loss": 0.5463993549346924, "mean_token_accuracy": 0.777183935046196, "num_tokens": 12957326.0, "step": 795 }, { "entropy": 0.5434572845697403, "epoch": 2.97196261682243, "grad_norm": 0.04280655458569527, "learning_rate": 0.0002, "loss": 0.5493361353874207, "mean_token_accuracy": 0.776650920510292, "num_tokens": 12973820.0, "step": 796 }, { "entropy": 0.5530060529708862, "epoch": 2.9757009345794394, "grad_norm": 0.04003579169511795, "learning_rate": 0.0002, "loss": 0.5533372759819031, "mean_token_accuracy": 0.7766715437173843, "num_tokens": 12990177.0, "step": 797 }, { "entropy": 0.5516588985919952, "epoch": 2.979439252336449, "grad_norm": 0.0351371206343174, "learning_rate": 0.0002, "loss": 0.5491815209388733, "mean_token_accuracy": 0.7761321365833282, "num_tokens": 13006638.0, "step": 798 }, { "entropy": 0.5496395230293274, "epoch": 2.983177570093458, "grad_norm": 0.03455950319766998, "learning_rate": 0.0002, "loss": 0.5390848517417908, "mean_token_accuracy": 0.7827516794204712, "num_tokens": 13022895.0, "step": 799 }, { "entropy": 0.5255894213914871, "epoch": 2.986915887850467, "grad_norm": 0.0403040274977684, "learning_rate": 0.0002, "loss": 0.5258710980415344, "mean_token_accuracy": 0.7874301820993423, "num_tokens": 13039127.0, "step": 800 }, { "entropy": 0.5152293890714645, "epoch": 2.9906542056074765, "grad_norm": 0.04018184915184975, "learning_rate": 0.0002, "loss": 0.5248207449913025, "mean_token_accuracy": 0.789091631770134, "num_tokens": 13055038.0, "step": 801 }, { "entropy": 0.5260308086872101, "epoch": 2.994392523364486, "grad_norm": 0.04690062627196312, "learning_rate": 0.0002, "loss": 0.5380572080612183, "mean_token_accuracy": 0.7809655517339706, "num_tokens": 13070955.0, "step": 802 }, { "entropy": 0.5523715615272522, "epoch": 2.9981308411214953, "grad_norm": 0.040551379323005676, "learning_rate": 0.0002, "loss": 0.5491956472396851, "mean_token_accuracy": 0.7785847187042236, "num_tokens": 13087325.0, "step": 803 }, { "entropy": 0.5784902274608612, "epoch": 3.0, "grad_norm": 0.04703172296285629, "learning_rate": 0.0002, "loss": 0.5652958750724792, "mean_token_accuracy": 0.7655995786190033, "num_tokens": 13094423.0, "step": 804 } ], "logging_steps": 1, "max_steps": 804, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2209408416111657e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }