{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 5360, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.238032042980194, "epoch": 0.0009332711152589828, "grad_norm": 1.1639747619628906, "learning_rate": 0.0002, "loss": 2.624, "mean_token_accuracy": 0.5043214038014412, "num_tokens": 3632.0, "step": 1 }, { "entropy": 1.254976749420166, "epoch": 0.0018665422305179655, "grad_norm": 1.8520084619522095, "learning_rate": 0.0002, "loss": 2.3931, "mean_token_accuracy": 0.5251746326684952, "num_tokens": 7149.0, "step": 2 }, { "entropy": 1.3129678666591644, "epoch": 0.0027998133457769483, "grad_norm": 0.7353417873382568, "learning_rate": 0.0002, "loss": 2.2249, "mean_token_accuracy": 0.5356370210647583, "num_tokens": 10857.0, "step": 3 }, { "entropy": 1.3454727530479431, "epoch": 0.003733084461035931, "grad_norm": 0.7795115113258362, "learning_rate": 0.0002, "loss": 2.0534, "mean_token_accuracy": 0.5570452511310577, "num_tokens": 14449.0, "step": 4 }, { "entropy": 1.398919403553009, "epoch": 0.004666355576294913, "grad_norm": 0.6844388246536255, "learning_rate": 0.0002, "loss": 1.8331, "mean_token_accuracy": 0.5790584981441498, "num_tokens": 18119.0, "step": 5 }, { "entropy": 1.485109955072403, "epoch": 0.0055996266915538965, "grad_norm": 0.5946104526519775, "learning_rate": 0.0002, "loss": 1.7256, "mean_token_accuracy": 0.5777529925107956, "num_tokens": 21698.0, "step": 6 }, { "entropy": 1.3979853391647339, "epoch": 0.006532897806812879, "grad_norm": 0.45341312885284424, "learning_rate": 0.0002, "loss": 1.6167, "mean_token_accuracy": 0.6037390828132629, "num_tokens": 25324.0, "step": 7 }, { "entropy": 1.4071973264217377, "epoch": 0.007466168922071862, "grad_norm": 0.6072640419006348, "learning_rate": 0.0002, "loss": 1.5238, "mean_token_accuracy": 0.6087846159934998, "num_tokens": 29042.0, "step": 8 }, { "entropy": 1.3779878616333008, "epoch": 0.008399440037330844, "grad_norm": 0.6349203586578369, "learning_rate": 0.0002, "loss": 1.4428, "mean_token_accuracy": 0.6112671196460724, "num_tokens": 32663.0, "step": 9 }, { "entropy": 1.3956011533737183, "epoch": 0.009332711152589827, "grad_norm": 0.5268082618713379, "learning_rate": 0.0002, "loss": 1.4315, "mean_token_accuracy": 0.6079887300729752, "num_tokens": 36305.0, "step": 10 }, { "entropy": 1.3843933045864105, "epoch": 0.01026598226784881, "grad_norm": 0.5158156156539917, "learning_rate": 0.0002, "loss": 1.3667, "mean_token_accuracy": 0.6084621995687485, "num_tokens": 39999.0, "step": 11 }, { "entropy": 1.3331674337387085, "epoch": 0.011199253383107793, "grad_norm": 0.5412152409553528, "learning_rate": 0.0002, "loss": 1.284, "mean_token_accuracy": 0.6266739219427109, "num_tokens": 43597.0, "step": 12 }, { "entropy": 1.315556913614273, "epoch": 0.012132524498366775, "grad_norm": 0.5877755880355835, "learning_rate": 0.0002, "loss": 1.225, "mean_token_accuracy": 0.6337851583957672, "num_tokens": 47237.0, "step": 13 }, { "entropy": 1.284736156463623, "epoch": 0.013065795613625758, "grad_norm": 0.6147598624229431, "learning_rate": 0.0002, "loss": 1.1598, "mean_token_accuracy": 0.634219154715538, "num_tokens": 50942.0, "step": 14 }, { "entropy": 1.2489074766635895, "epoch": 0.013999066728884742, "grad_norm": 0.5499444007873535, "learning_rate": 0.0002, "loss": 1.0963, "mean_token_accuracy": 0.6550355404615402, "num_tokens": 54445.0, "step": 15 }, { "entropy": 1.1893448531627655, "epoch": 0.014932337844143724, "grad_norm": 0.5230265259742737, "learning_rate": 0.0002, "loss": 1.0661, "mean_token_accuracy": 0.6572947651147842, "num_tokens": 58292.0, "step": 16 }, { "entropy": 1.1328391134738922, "epoch": 0.015865608959402706, "grad_norm": 0.5632361769676208, "learning_rate": 0.0002, "loss": 1.0354, "mean_token_accuracy": 0.6614871472120285, "num_tokens": 61905.0, "step": 17 }, { "entropy": 1.073226124048233, "epoch": 0.01679888007466169, "grad_norm": 0.5865839123725891, "learning_rate": 0.0002, "loss": 0.9943, "mean_token_accuracy": 0.6801784336566925, "num_tokens": 65483.0, "step": 18 }, { "entropy": 1.0235921889543533, "epoch": 0.01773215118992067, "grad_norm": 0.6260231137275696, "learning_rate": 0.0002, "loss": 0.9902, "mean_token_accuracy": 0.6652950942516327, "num_tokens": 69237.0, "step": 19 }, { "entropy": 0.9650732576847076, "epoch": 0.018665422305179653, "grad_norm": 0.5481460094451904, "learning_rate": 0.0002, "loss": 0.9309, "mean_token_accuracy": 0.680676743388176, "num_tokens": 72744.0, "step": 20 }, { "entropy": 0.9337102919816971, "epoch": 0.01959869342043864, "grad_norm": 0.46371644735336304, "learning_rate": 0.0002, "loss": 0.9052, "mean_token_accuracy": 0.67674121260643, "num_tokens": 76484.0, "step": 21 }, { "entropy": 0.8476392328739166, "epoch": 0.02053196453569762, "grad_norm": 0.5473657846450806, "learning_rate": 0.0002, "loss": 0.869, "mean_token_accuracy": 0.6932830214500427, "num_tokens": 80148.0, "step": 22 }, { "entropy": 0.8816152513027191, "epoch": 0.021465235650956604, "grad_norm": 0.5542680025100708, "learning_rate": 0.0002, "loss": 0.917, "mean_token_accuracy": 0.6831570267677307, "num_tokens": 83664.0, "step": 23 }, { "entropy": 0.9102343320846558, "epoch": 0.022398506766215586, "grad_norm": 0.5326656699180603, "learning_rate": 0.0002, "loss": 0.9073, "mean_token_accuracy": 0.6671889275312424, "num_tokens": 87297.0, "step": 24 }, { "entropy": 0.861797958612442, "epoch": 0.02333177788147457, "grad_norm": 0.5176796317100525, "learning_rate": 0.0002, "loss": 0.827, "mean_token_accuracy": 0.6997370719909668, "num_tokens": 90817.0, "step": 25 }, { "entropy": 0.8544954359531403, "epoch": 0.02426504899673355, "grad_norm": 0.5992906093597412, "learning_rate": 0.0002, "loss": 0.8319, "mean_token_accuracy": 0.6909503787755966, "num_tokens": 94469.0, "step": 26 }, { "entropy": 0.8183820247650146, "epoch": 0.025198320111992533, "grad_norm": 0.5394362807273865, "learning_rate": 0.0002, "loss": 0.7619, "mean_token_accuracy": 0.7170816212892532, "num_tokens": 98033.0, "step": 27 }, { "entropy": 0.8231970071792603, "epoch": 0.026131591227251515, "grad_norm": 0.5084280371665955, "learning_rate": 0.0002, "loss": 0.7952, "mean_token_accuracy": 0.7002080380916595, "num_tokens": 101694.0, "step": 28 }, { "entropy": 0.8220045417547226, "epoch": 0.027064862342510498, "grad_norm": 0.517466127872467, "learning_rate": 0.0002, "loss": 0.7959, "mean_token_accuracy": 0.6903281956911087, "num_tokens": 105451.0, "step": 29 }, { "entropy": 0.7855145186185837, "epoch": 0.027998133457769483, "grad_norm": 0.5042310953140259, "learning_rate": 0.0002, "loss": 0.7733, "mean_token_accuracy": 0.7006007134914398, "num_tokens": 109093.0, "step": 30 }, { "entropy": 0.7849300503730774, "epoch": 0.028931404573028466, "grad_norm": 0.5362587571144104, "learning_rate": 0.0002, "loss": 0.769, "mean_token_accuracy": 0.7084459662437439, "num_tokens": 112805.0, "step": 31 }, { "entropy": 0.7677375823259354, "epoch": 0.029864675688287448, "grad_norm": 0.4908425211906433, "learning_rate": 0.0002, "loss": 0.746, "mean_token_accuracy": 0.7144936919212341, "num_tokens": 116434.0, "step": 32 }, { "entropy": 0.7124157100915909, "epoch": 0.03079794680354643, "grad_norm": 0.586961567401886, "learning_rate": 0.0002, "loss": 0.7351, "mean_token_accuracy": 0.7201817482709885, "num_tokens": 120059.0, "step": 33 }, { "entropy": 0.6895398050546646, "epoch": 0.03173121791880541, "grad_norm": 0.5112178921699524, "learning_rate": 0.0002, "loss": 0.7072, "mean_token_accuracy": 0.7333744019269943, "num_tokens": 123743.0, "step": 34 }, { "entropy": 0.7315993458032608, "epoch": 0.032664489034064395, "grad_norm": 0.5525657534599304, "learning_rate": 0.0002, "loss": 0.7322, "mean_token_accuracy": 0.7168141305446625, "num_tokens": 127428.0, "step": 35 }, { "entropy": 0.7172030061483383, "epoch": 0.03359776014932338, "grad_norm": 0.4799160063266754, "learning_rate": 0.0002, "loss": 0.6899, "mean_token_accuracy": 0.7394338846206665, "num_tokens": 130950.0, "step": 36 }, { "entropy": 0.7183454483747482, "epoch": 0.03453103126458236, "grad_norm": 0.4821692109107971, "learning_rate": 0.0002, "loss": 0.672, "mean_token_accuracy": 0.7415967881679535, "num_tokens": 134631.0, "step": 37 }, { "entropy": 0.7597247064113617, "epoch": 0.03546430237984134, "grad_norm": 0.4842160642147064, "learning_rate": 0.0002, "loss": 0.7288, "mean_token_accuracy": 0.709240049123764, "num_tokens": 138330.0, "step": 38 }, { "entropy": 0.7114877551794052, "epoch": 0.036397573495100324, "grad_norm": 0.4979349970817566, "learning_rate": 0.0002, "loss": 0.7052, "mean_token_accuracy": 0.7197959125041962, "num_tokens": 141843.0, "step": 39 }, { "entropy": 0.6953291296958923, "epoch": 0.03733084461035931, "grad_norm": 0.5867581963539124, "learning_rate": 0.0002, "loss": 0.7135, "mean_token_accuracy": 0.7272190600633621, "num_tokens": 145448.0, "step": 40 }, { "entropy": 0.6256467252969742, "epoch": 0.03826411572561829, "grad_norm": 0.5248032212257385, "learning_rate": 0.0002, "loss": 0.6542, "mean_token_accuracy": 0.7413808554410934, "num_tokens": 149008.0, "step": 41 }, { "entropy": 0.703561395406723, "epoch": 0.03919738684087728, "grad_norm": 0.5016943216323853, "learning_rate": 0.0002, "loss": 0.7082, "mean_token_accuracy": 0.7199033200740814, "num_tokens": 152621.0, "step": 42 }, { "entropy": 0.6874178647994995, "epoch": 0.04013065795613626, "grad_norm": 0.4511376619338989, "learning_rate": 0.0002, "loss": 0.6816, "mean_token_accuracy": 0.737816259264946, "num_tokens": 156363.0, "step": 43 }, { "entropy": 0.6529847383499146, "epoch": 0.04106392907139524, "grad_norm": 0.40076661109924316, "learning_rate": 0.0002, "loss": 0.6502, "mean_token_accuracy": 0.7464438825845718, "num_tokens": 159930.0, "step": 44 }, { "entropy": 0.7194813191890717, "epoch": 0.041997200186654225, "grad_norm": 0.48623719811439514, "learning_rate": 0.0002, "loss": 0.6995, "mean_token_accuracy": 0.7277878373861313, "num_tokens": 163536.0, "step": 45 }, { "entropy": 0.6917680501937866, "epoch": 0.04293047130191321, "grad_norm": 0.44339001178741455, "learning_rate": 0.0002, "loss": 0.6977, "mean_token_accuracy": 0.7297296226024628, "num_tokens": 167254.0, "step": 46 }, { "entropy": 0.67356076836586, "epoch": 0.04386374241717219, "grad_norm": 0.46861252188682556, "learning_rate": 0.0002, "loss": 0.6771, "mean_token_accuracy": 0.7343941032886505, "num_tokens": 170930.0, "step": 47 }, { "entropy": 0.6744781732559204, "epoch": 0.04479701353243117, "grad_norm": 0.4226837456226349, "learning_rate": 0.0002, "loss": 0.6877, "mean_token_accuracy": 0.7297642081975937, "num_tokens": 174682.0, "step": 48 }, { "entropy": 0.6464174687862396, "epoch": 0.045730284647690154, "grad_norm": 0.4155750870704651, "learning_rate": 0.0002, "loss": 0.6616, "mean_token_accuracy": 0.7466181069612503, "num_tokens": 178269.0, "step": 49 }, { "entropy": 0.6686215549707413, "epoch": 0.04666355576294914, "grad_norm": 0.4517916142940521, "learning_rate": 0.0002, "loss": 0.6632, "mean_token_accuracy": 0.7471886277198792, "num_tokens": 181898.0, "step": 50 }, { "entropy": 0.6312864124774933, "epoch": 0.04759682687820812, "grad_norm": 0.3972981572151184, "learning_rate": 0.0002, "loss": 0.64, "mean_token_accuracy": 0.7463629394769669, "num_tokens": 185591.0, "step": 51 }, { "entropy": 0.6921845227479935, "epoch": 0.0485300979934671, "grad_norm": 0.40510231256484985, "learning_rate": 0.0002, "loss": 0.6851, "mean_token_accuracy": 0.7320987284183502, "num_tokens": 189265.0, "step": 52 }, { "entropy": 0.650676965713501, "epoch": 0.049463369108726084, "grad_norm": 0.4098157286643982, "learning_rate": 0.0002, "loss": 0.6548, "mean_token_accuracy": 0.7398397326469421, "num_tokens": 192848.0, "step": 53 }, { "entropy": 0.6953828185796738, "epoch": 0.050396640223985066, "grad_norm": 0.3829086422920227, "learning_rate": 0.0002, "loss": 0.692, "mean_token_accuracy": 0.7253122478723526, "num_tokens": 196577.0, "step": 54 }, { "entropy": 0.6860110610723495, "epoch": 0.05132991133924405, "grad_norm": 0.3626587986946106, "learning_rate": 0.0002, "loss": 0.6669, "mean_token_accuracy": 0.7356055825948715, "num_tokens": 200326.0, "step": 55 }, { "entropy": 0.6235200464725494, "epoch": 0.05226318245450303, "grad_norm": 0.3610437512397766, "learning_rate": 0.0002, "loss": 0.6298, "mean_token_accuracy": 0.7490793317556381, "num_tokens": 203841.0, "step": 56 }, { "entropy": 0.6269147247076035, "epoch": 0.05319645356976201, "grad_norm": 0.5107990503311157, "learning_rate": 0.0002, "loss": 0.653, "mean_token_accuracy": 0.7439539283514023, "num_tokens": 207392.0, "step": 57 }, { "entropy": 0.6445672363042831, "epoch": 0.054129724685020995, "grad_norm": 0.40620699524879456, "learning_rate": 0.0002, "loss": 0.6443, "mean_token_accuracy": 0.7480349391698837, "num_tokens": 211049.0, "step": 58 }, { "entropy": 0.6269707232713699, "epoch": 0.055062995800279985, "grad_norm": 0.36476364731788635, "learning_rate": 0.0002, "loss": 0.6156, "mean_token_accuracy": 0.7550042867660522, "num_tokens": 214750.0, "step": 59 }, { "entropy": 0.6417617946863174, "epoch": 0.05599626691553897, "grad_norm": 0.3832146227359772, "learning_rate": 0.0002, "loss": 0.6405, "mean_token_accuracy": 0.7529201209545135, "num_tokens": 218292.0, "step": 60 }, { "entropy": 0.620335727930069, "epoch": 0.05692953803079795, "grad_norm": 0.3496507406234741, "learning_rate": 0.0002, "loss": 0.636, "mean_token_accuracy": 0.7485546469688416, "num_tokens": 221935.0, "step": 61 }, { "entropy": 0.6347710490226746, "epoch": 0.05786280914605693, "grad_norm": 0.4580725431442261, "learning_rate": 0.0002, "loss": 0.6445, "mean_token_accuracy": 0.7460792809724808, "num_tokens": 225553.0, "step": 62 }, { "entropy": 0.619176521897316, "epoch": 0.058796080261315914, "grad_norm": 0.40347322821617126, "learning_rate": 0.0002, "loss": 0.6335, "mean_token_accuracy": 0.7486258745193481, "num_tokens": 229263.0, "step": 63 }, { "entropy": 0.615541085600853, "epoch": 0.059729351376574896, "grad_norm": 0.42285582423210144, "learning_rate": 0.0002, "loss": 0.6366, "mean_token_accuracy": 0.7447081506252289, "num_tokens": 232788.0, "step": 64 }, { "entropy": 0.6286216080188751, "epoch": 0.06066262249183388, "grad_norm": 0.35210269689559937, "learning_rate": 0.0002, "loss": 0.6353, "mean_token_accuracy": 0.7488521337509155, "num_tokens": 236301.0, "step": 65 }, { "entropy": 0.6392722874879837, "epoch": 0.06159589360709286, "grad_norm": 0.3131972551345825, "learning_rate": 0.0002, "loss": 0.6238, "mean_token_accuracy": 0.7503565549850464, "num_tokens": 239804.0, "step": 66 }, { "entropy": 0.6616053581237793, "epoch": 0.06252916472235184, "grad_norm": 0.29930952191352844, "learning_rate": 0.0002, "loss": 0.6362, "mean_token_accuracy": 0.7482137084007263, "num_tokens": 243488.0, "step": 67 }, { "entropy": 0.6621737331151962, "epoch": 0.06346243583761083, "grad_norm": 0.30890125036239624, "learning_rate": 0.0002, "loss": 0.6464, "mean_token_accuracy": 0.7441992163658142, "num_tokens": 247195.0, "step": 68 }, { "entropy": 0.6485453695058823, "epoch": 0.06439570695286981, "grad_norm": 0.3032291829586029, "learning_rate": 0.0002, "loss": 0.6355, "mean_token_accuracy": 0.7434397339820862, "num_tokens": 250680.0, "step": 69 }, { "entropy": 0.62110935151577, "epoch": 0.06532897806812879, "grad_norm": 0.3070567548274994, "learning_rate": 0.0002, "loss": 0.6277, "mean_token_accuracy": 0.750543400645256, "num_tokens": 254337.0, "step": 70 }, { "entropy": 0.6060507446527481, "epoch": 0.06626224918338777, "grad_norm": 0.34801799058914185, "learning_rate": 0.0002, "loss": 0.6306, "mean_token_accuracy": 0.7519558519124985, "num_tokens": 257830.0, "step": 71 }, { "entropy": 0.6199661791324615, "epoch": 0.06719552029864675, "grad_norm": 0.40068989992141724, "learning_rate": 0.0002, "loss": 0.6427, "mean_token_accuracy": 0.7509061545133591, "num_tokens": 261437.0, "step": 72 }, { "entropy": 0.6210741698741913, "epoch": 0.06812879141390574, "grad_norm": 0.3350900113582611, "learning_rate": 0.0002, "loss": 0.6292, "mean_token_accuracy": 0.7479204833507538, "num_tokens": 264949.0, "step": 73 }, { "entropy": 0.6343181431293488, "epoch": 0.06906206252916472, "grad_norm": 0.27748245000839233, "learning_rate": 0.0002, "loss": 0.6188, "mean_token_accuracy": 0.7565944939851761, "num_tokens": 268572.0, "step": 74 }, { "entropy": 0.6399798840284348, "epoch": 0.0699953336444237, "grad_norm": 0.31898033618927, "learning_rate": 0.0002, "loss": 0.6328, "mean_token_accuracy": 0.7538827806711197, "num_tokens": 272212.0, "step": 75 }, { "entropy": 0.6789921522140503, "epoch": 0.07092860475968268, "grad_norm": 0.30240291357040405, "learning_rate": 0.0002, "loss": 0.6751, "mean_token_accuracy": 0.7330592274665833, "num_tokens": 275878.0, "step": 76 }, { "entropy": 0.6180820167064667, "epoch": 0.07186187587494167, "grad_norm": 0.30339327454566956, "learning_rate": 0.0002, "loss": 0.6181, "mean_token_accuracy": 0.7600033134222031, "num_tokens": 279536.0, "step": 77 }, { "entropy": 0.6338329017162323, "epoch": 0.07279514699020065, "grad_norm": 0.3109918534755707, "learning_rate": 0.0002, "loss": 0.6335, "mean_token_accuracy": 0.7417657375335693, "num_tokens": 283125.0, "step": 78 }, { "entropy": 0.6542191803455353, "epoch": 0.07372841810545963, "grad_norm": 0.41286158561706543, "learning_rate": 0.0002, "loss": 0.6559, "mean_token_accuracy": 0.7373685389757156, "num_tokens": 286814.0, "step": 79 }, { "entropy": 0.6399752050638199, "epoch": 0.07466168922071861, "grad_norm": 0.3763265311717987, "learning_rate": 0.0002, "loss": 0.6485, "mean_token_accuracy": 0.7414896786212921, "num_tokens": 290351.0, "step": 80 }, { "entropy": 0.6115842163562775, "epoch": 0.0755949603359776, "grad_norm": 0.29847657680511475, "learning_rate": 0.0002, "loss": 0.6168, "mean_token_accuracy": 0.7517617344856262, "num_tokens": 293877.0, "step": 81 }, { "entropy": 0.6345693618059158, "epoch": 0.07652823145123658, "grad_norm": 0.32319849729537964, "learning_rate": 0.0002, "loss": 0.6462, "mean_token_accuracy": 0.7409350723028183, "num_tokens": 297425.0, "step": 82 }, { "entropy": 0.6228432208299637, "epoch": 0.07746150256649556, "grad_norm": 0.34619563817977905, "learning_rate": 0.0002, "loss": 0.6058, "mean_token_accuracy": 0.7604348957538605, "num_tokens": 301120.0, "step": 83 }, { "entropy": 0.6245952993631363, "epoch": 0.07839477368175456, "grad_norm": 0.3065875768661499, "learning_rate": 0.0002, "loss": 0.6221, "mean_token_accuracy": 0.7510392963886261, "num_tokens": 304724.0, "step": 84 }, { "entropy": 0.676215410232544, "epoch": 0.07932804479701354, "grad_norm": 0.3571251332759857, "learning_rate": 0.0002, "loss": 0.6673, "mean_token_accuracy": 0.7368158251047134, "num_tokens": 308517.0, "step": 85 }, { "entropy": 0.6332674771547318, "epoch": 0.08026131591227252, "grad_norm": 0.31473830342292786, "learning_rate": 0.0002, "loss": 0.6449, "mean_token_accuracy": 0.7450313717126846, "num_tokens": 312171.0, "step": 86 }, { "entropy": 0.5897831320762634, "epoch": 0.0811945870275315, "grad_norm": 0.3016223907470703, "learning_rate": 0.0002, "loss": 0.5902, "mean_token_accuracy": 0.7631714791059494, "num_tokens": 315750.0, "step": 87 }, { "entropy": 0.6480985134840012, "epoch": 0.08212785814279049, "grad_norm": 0.3238847255706787, "learning_rate": 0.0002, "loss": 0.6587, "mean_token_accuracy": 0.7352481633424759, "num_tokens": 319542.0, "step": 88 }, { "entropy": 0.6226422935724258, "epoch": 0.08306112925804947, "grad_norm": 0.31977325677871704, "learning_rate": 0.0002, "loss": 0.6369, "mean_token_accuracy": 0.7467455565929413, "num_tokens": 323129.0, "step": 89 }, { "entropy": 0.6313899904489517, "epoch": 0.08399440037330845, "grad_norm": 0.3356255292892456, "learning_rate": 0.0002, "loss": 0.6466, "mean_token_accuracy": 0.7420767992734909, "num_tokens": 326761.0, "step": 90 }, { "entropy": 0.6244661808013916, "epoch": 0.08492767148856743, "grad_norm": 0.30021145939826965, "learning_rate": 0.0002, "loss": 0.6245, "mean_token_accuracy": 0.7539113312959671, "num_tokens": 330343.0, "step": 91 }, { "entropy": 0.6334827989339828, "epoch": 0.08586094260382642, "grad_norm": 0.2890295684337616, "learning_rate": 0.0002, "loss": 0.6333, "mean_token_accuracy": 0.7481255978345871, "num_tokens": 334021.0, "step": 92 }, { "entropy": 0.661640852689743, "epoch": 0.0867942137190854, "grad_norm": 0.29577574133872986, "learning_rate": 0.0002, "loss": 0.6393, "mean_token_accuracy": 0.7419004589319229, "num_tokens": 337667.0, "step": 93 }, { "entropy": 0.6721207052469254, "epoch": 0.08772748483434438, "grad_norm": 0.32148876786231995, "learning_rate": 0.0002, "loss": 0.6759, "mean_token_accuracy": 0.7289273142814636, "num_tokens": 341202.0, "step": 94 }, { "entropy": 0.6466003805398941, "epoch": 0.08866075594960336, "grad_norm": 0.2904382646083832, "learning_rate": 0.0002, "loss": 0.6461, "mean_token_accuracy": 0.7441334873437881, "num_tokens": 344914.0, "step": 95 }, { "entropy": 0.6464525461196899, "epoch": 0.08959402706486234, "grad_norm": 0.29067525267601013, "learning_rate": 0.0002, "loss": 0.6458, "mean_token_accuracy": 0.7413109391927719, "num_tokens": 348631.0, "step": 96 }, { "entropy": 0.649706557393074, "epoch": 0.09052729818012133, "grad_norm": 0.2835700213909149, "learning_rate": 0.0002, "loss": 0.6544, "mean_token_accuracy": 0.7392159849405289, "num_tokens": 352349.0, "step": 97 }, { "entropy": 0.6099058240652084, "epoch": 0.09146056929538031, "grad_norm": 0.3315757215023041, "learning_rate": 0.0002, "loss": 0.6154, "mean_token_accuracy": 0.7557008117437363, "num_tokens": 356014.0, "step": 98 }, { "entropy": 0.637185275554657, "epoch": 0.09239384041063929, "grad_norm": 0.3097708523273468, "learning_rate": 0.0002, "loss": 0.6396, "mean_token_accuracy": 0.7370903789997101, "num_tokens": 359776.0, "step": 99 }, { "entropy": 0.6069396436214447, "epoch": 0.09332711152589827, "grad_norm": 0.3533344268798828, "learning_rate": 0.0002, "loss": 0.6252, "mean_token_accuracy": 0.743950605392456, "num_tokens": 363323.0, "step": 100 }, { "entropy": 0.6273242831230164, "epoch": 0.09426038264115726, "grad_norm": 0.27062469720840454, "learning_rate": 0.0002, "loss": 0.6372, "mean_token_accuracy": 0.7454263120889664, "num_tokens": 366890.0, "step": 101 }, { "entropy": 0.642305314540863, "epoch": 0.09519365375641624, "grad_norm": 0.2941336929798126, "learning_rate": 0.0002, "loss": 0.6449, "mean_token_accuracy": 0.7372082471847534, "num_tokens": 370457.0, "step": 102 }, { "entropy": 0.6035270243883133, "epoch": 0.09612692487167522, "grad_norm": 0.31498217582702637, "learning_rate": 0.0002, "loss": 0.6112, "mean_token_accuracy": 0.7575544714927673, "num_tokens": 373967.0, "step": 103 }, { "entropy": 0.644278272986412, "epoch": 0.0970601959869342, "grad_norm": 0.2929423153400421, "learning_rate": 0.0002, "loss": 0.6669, "mean_token_accuracy": 0.7349564582109451, "num_tokens": 377590.0, "step": 104 }, { "entropy": 0.635860800743103, "epoch": 0.09799346710219319, "grad_norm": 0.3369288742542267, "learning_rate": 0.0002, "loss": 0.6349, "mean_token_accuracy": 0.7402680218219757, "num_tokens": 381269.0, "step": 105 }, { "entropy": 0.5979665964841843, "epoch": 0.09892673821745217, "grad_norm": 0.27370232343673706, "learning_rate": 0.0002, "loss": 0.6035, "mean_token_accuracy": 0.7542353719472885, "num_tokens": 384749.0, "step": 106 }, { "entropy": 0.6489188969135284, "epoch": 0.09986000933271115, "grad_norm": 0.2635843753814697, "learning_rate": 0.0002, "loss": 0.6466, "mean_token_accuracy": 0.7418016493320465, "num_tokens": 388350.0, "step": 107 }, { "entropy": 0.6361844539642334, "epoch": 0.10079328044797013, "grad_norm": 0.2754812240600586, "learning_rate": 0.0002, "loss": 0.6211, "mean_token_accuracy": 0.7591836601495743, "num_tokens": 392123.0, "step": 108 }, { "entropy": 0.6321296095848083, "epoch": 0.10172655156322911, "grad_norm": 0.26241248846054077, "learning_rate": 0.0002, "loss": 0.6241, "mean_token_accuracy": 0.752978652715683, "num_tokens": 395854.0, "step": 109 }, { "entropy": 0.6196689009666443, "epoch": 0.1026598226784881, "grad_norm": 0.24862346053123474, "learning_rate": 0.0002, "loss": 0.614, "mean_token_accuracy": 0.7576078772544861, "num_tokens": 399448.0, "step": 110 }, { "entropy": 0.6512951552867889, "epoch": 0.10359309379374708, "grad_norm": 0.2643628418445587, "learning_rate": 0.0002, "loss": 0.6418, "mean_token_accuracy": 0.7444390952587128, "num_tokens": 403088.0, "step": 111 }, { "entropy": 0.6427737176418304, "epoch": 0.10452636490900606, "grad_norm": 0.28680023550987244, "learning_rate": 0.0002, "loss": 0.6597, "mean_token_accuracy": 0.7350614219903946, "num_tokens": 406749.0, "step": 112 }, { "entropy": 0.5904862433671951, "epoch": 0.10545963602426504, "grad_norm": 0.27169057726860046, "learning_rate": 0.0002, "loss": 0.5952, "mean_token_accuracy": 0.7622718065977097, "num_tokens": 410422.0, "step": 113 }, { "entropy": 0.596582904458046, "epoch": 0.10639290713952403, "grad_norm": 0.31019309163093567, "learning_rate": 0.0002, "loss": 0.6079, "mean_token_accuracy": 0.7589492648839951, "num_tokens": 413852.0, "step": 114 }, { "entropy": 0.6331626623868942, "epoch": 0.10732617825478301, "grad_norm": 0.2573491036891937, "learning_rate": 0.0002, "loss": 0.6421, "mean_token_accuracy": 0.7447192370891571, "num_tokens": 417429.0, "step": 115 }, { "entropy": 0.634491965174675, "epoch": 0.10825944937004199, "grad_norm": 0.3038738965988159, "learning_rate": 0.0002, "loss": 0.6376, "mean_token_accuracy": 0.7472241222858429, "num_tokens": 421011.0, "step": 116 }, { "entropy": 0.6338039338588715, "epoch": 0.10919272048530097, "grad_norm": 0.2924957573413849, "learning_rate": 0.0002, "loss": 0.6299, "mean_token_accuracy": 0.7504428029060364, "num_tokens": 424683.0, "step": 117 }, { "entropy": 0.6111537963151932, "epoch": 0.11012599160055997, "grad_norm": 0.3005410432815552, "learning_rate": 0.0002, "loss": 0.6203, "mean_token_accuracy": 0.7448770701885223, "num_tokens": 428226.0, "step": 118 }, { "entropy": 0.6281677335500717, "epoch": 0.11105926271581895, "grad_norm": 0.2710913121700287, "learning_rate": 0.0002, "loss": 0.6281, "mean_token_accuracy": 0.7463484704494476, "num_tokens": 431782.0, "step": 119 }, { "entropy": 0.6376543343067169, "epoch": 0.11199253383107793, "grad_norm": 0.2510261535644531, "learning_rate": 0.0002, "loss": 0.6391, "mean_token_accuracy": 0.7453751415014267, "num_tokens": 435472.0, "step": 120 }, { "entropy": 0.5869117081165314, "epoch": 0.11292580494633692, "grad_norm": 0.2757682800292969, "learning_rate": 0.0002, "loss": 0.5877, "mean_token_accuracy": 0.7652303874492645, "num_tokens": 439124.0, "step": 121 }, { "entropy": 0.6032629609107971, "epoch": 0.1138590760615959, "grad_norm": 0.3368721008300781, "learning_rate": 0.0002, "loss": 0.6131, "mean_token_accuracy": 0.7516492158174515, "num_tokens": 442668.0, "step": 122 }, { "entropy": 0.6482813656330109, "epoch": 0.11479234717685488, "grad_norm": 0.3557293117046356, "learning_rate": 0.0002, "loss": 0.6598, "mean_token_accuracy": 0.738071084022522, "num_tokens": 446260.0, "step": 123 }, { "entropy": 0.6116222888231277, "epoch": 0.11572561829211386, "grad_norm": 0.27834174036979675, "learning_rate": 0.0002, "loss": 0.623, "mean_token_accuracy": 0.7490458488464355, "num_tokens": 449930.0, "step": 124 }, { "entropy": 0.6129489541053772, "epoch": 0.11665888940737285, "grad_norm": 0.2581518590450287, "learning_rate": 0.0002, "loss": 0.6107, "mean_token_accuracy": 0.7534941583871841, "num_tokens": 453575.0, "step": 125 }, { "entropy": 0.617837131023407, "epoch": 0.11759216052263183, "grad_norm": 0.2822326123714447, "learning_rate": 0.0002, "loss": 0.6138, "mean_token_accuracy": 0.7555846124887466, "num_tokens": 457316.0, "step": 126 }, { "entropy": 0.642390251159668, "epoch": 0.11852543163789081, "grad_norm": 0.27645590901374817, "learning_rate": 0.0002, "loss": 0.6527, "mean_token_accuracy": 0.742468386888504, "num_tokens": 461129.0, "step": 127 }, { "entropy": 0.6243897676467896, "epoch": 0.11945870275314979, "grad_norm": 0.23544079065322876, "learning_rate": 0.0002, "loss": 0.6276, "mean_token_accuracy": 0.7492697685956955, "num_tokens": 464683.0, "step": 128 }, { "entropy": 0.6339560598134995, "epoch": 0.12039197386840877, "grad_norm": 0.26311081647872925, "learning_rate": 0.0002, "loss": 0.6477, "mean_token_accuracy": 0.7440393567085266, "num_tokens": 468293.0, "step": 129 }, { "entropy": 0.6453955620527267, "epoch": 0.12132524498366776, "grad_norm": 0.27588823437690735, "learning_rate": 0.0002, "loss": 0.6502, "mean_token_accuracy": 0.7402883321046829, "num_tokens": 471929.0, "step": 130 }, { "entropy": 0.6028461456298828, "epoch": 0.12225851609892674, "grad_norm": 0.2882139980792999, "learning_rate": 0.0002, "loss": 0.6111, "mean_token_accuracy": 0.7530756145715714, "num_tokens": 475492.0, "step": 131 }, { "entropy": 0.6240412443876266, "epoch": 0.12319178721418572, "grad_norm": 0.2454262375831604, "learning_rate": 0.0002, "loss": 0.6231, "mean_token_accuracy": 0.7511583417654037, "num_tokens": 479187.0, "step": 132 }, { "entropy": 0.6349049657583237, "epoch": 0.1241250583294447, "grad_norm": 0.26981422305107117, "learning_rate": 0.0002, "loss": 0.6364, "mean_token_accuracy": 0.7374670058488846, "num_tokens": 482877.0, "step": 133 }, { "entropy": 0.5848357230424881, "epoch": 0.1250583294447037, "grad_norm": 0.28297799825668335, "learning_rate": 0.0002, "loss": 0.5839, "mean_token_accuracy": 0.7660666555166245, "num_tokens": 486374.0, "step": 134 }, { "entropy": 0.5504437834024429, "epoch": 0.12599160055996267, "grad_norm": 0.35001659393310547, "learning_rate": 0.0002, "loss": 0.5772, "mean_token_accuracy": 0.7675510495901108, "num_tokens": 489970.0, "step": 135 }, { "entropy": 0.607095405459404, "epoch": 0.12692487167522165, "grad_norm": 0.30750665068626404, "learning_rate": 0.0002, "loss": 0.6215, "mean_token_accuracy": 0.7515251487493515, "num_tokens": 493639.0, "step": 136 }, { "entropy": 0.642394408583641, "epoch": 0.12785814279048063, "grad_norm": 0.28031566739082336, "learning_rate": 0.0002, "loss": 0.655, "mean_token_accuracy": 0.7418509721755981, "num_tokens": 497297.0, "step": 137 }, { "entropy": 0.6579112410545349, "epoch": 0.12879141390573962, "grad_norm": 0.27005183696746826, "learning_rate": 0.0002, "loss": 0.6663, "mean_token_accuracy": 0.7352576404809952, "num_tokens": 500975.0, "step": 138 }, { "entropy": 0.6708986014127731, "epoch": 0.1297246850209986, "grad_norm": 0.23322969675064087, "learning_rate": 0.0002, "loss": 0.6658, "mean_token_accuracy": 0.7298001646995544, "num_tokens": 504665.0, "step": 139 }, { "entropy": 0.6241953074932098, "epoch": 0.13065795613625758, "grad_norm": 0.2642923891544342, "learning_rate": 0.0002, "loss": 0.6099, "mean_token_accuracy": 0.7563126236200333, "num_tokens": 508255.0, "step": 140 }, { "entropy": 0.6216610819101334, "epoch": 0.13159122725151656, "grad_norm": 0.24610836803913116, "learning_rate": 0.0002, "loss": 0.5976, "mean_token_accuracy": 0.7632915079593658, "num_tokens": 511879.0, "step": 141 }, { "entropy": 0.6326995342969894, "epoch": 0.13252449836677554, "grad_norm": 0.23786967992782593, "learning_rate": 0.0002, "loss": 0.6248, "mean_token_accuracy": 0.7541671991348267, "num_tokens": 515531.0, "step": 142 }, { "entropy": 0.6339477747678757, "epoch": 0.13345776948203453, "grad_norm": 0.2203858196735382, "learning_rate": 0.0002, "loss": 0.6178, "mean_token_accuracy": 0.7523991763591766, "num_tokens": 519160.0, "step": 143 }, { "entropy": 0.6281312853097916, "epoch": 0.1343910405972935, "grad_norm": 0.2529226839542389, "learning_rate": 0.0002, "loss": 0.6326, "mean_token_accuracy": 0.7482670098543167, "num_tokens": 522690.0, "step": 144 }, { "entropy": 0.6087938547134399, "epoch": 0.1353243117125525, "grad_norm": 0.2891392409801483, "learning_rate": 0.0002, "loss": 0.6295, "mean_token_accuracy": 0.7440899014472961, "num_tokens": 526197.0, "step": 145 }, { "entropy": 0.5905243009328842, "epoch": 0.13625758282781147, "grad_norm": 0.28046396374702454, "learning_rate": 0.0002, "loss": 0.6097, "mean_token_accuracy": 0.7535561472177505, "num_tokens": 529737.0, "step": 146 }, { "entropy": 0.6189921200275421, "epoch": 0.13719085394307046, "grad_norm": 0.2389392852783203, "learning_rate": 0.0002, "loss": 0.6242, "mean_token_accuracy": 0.7502854466438293, "num_tokens": 533426.0, "step": 147 }, { "entropy": 0.620497465133667, "epoch": 0.13812412505832944, "grad_norm": 0.24434681236743927, "learning_rate": 0.0002, "loss": 0.62, "mean_token_accuracy": 0.7557744085788727, "num_tokens": 537113.0, "step": 148 }, { "entropy": 0.5958927273750305, "epoch": 0.13905739617358842, "grad_norm": 0.2396402359008789, "learning_rate": 0.0002, "loss": 0.6093, "mean_token_accuracy": 0.7515599131584167, "num_tokens": 540735.0, "step": 149 }, { "entropy": 0.6292619705200195, "epoch": 0.1399906672888474, "grad_norm": 0.21932756900787354, "learning_rate": 0.0002, "loss": 0.6281, "mean_token_accuracy": 0.7477551996707916, "num_tokens": 544485.0, "step": 150 }, { "entropy": 0.6558041572570801, "epoch": 0.14092393840410639, "grad_norm": 0.24288199841976166, "learning_rate": 0.0002, "loss": 0.6689, "mean_token_accuracy": 0.7322881817817688, "num_tokens": 548124.0, "step": 151 }, { "entropy": 0.608700230717659, "epoch": 0.14185720951936537, "grad_norm": 0.254705011844635, "learning_rate": 0.0002, "loss": 0.603, "mean_token_accuracy": 0.7611811757087708, "num_tokens": 551829.0, "step": 152 }, { "entropy": 0.6570206880569458, "epoch": 0.14279048063462435, "grad_norm": 0.2351575344800949, "learning_rate": 0.0002, "loss": 0.6426, "mean_token_accuracy": 0.7406206279993057, "num_tokens": 555453.0, "step": 153 }, { "entropy": 0.6156815737485886, "epoch": 0.14372375174988333, "grad_norm": 0.24893145263195038, "learning_rate": 0.0002, "loss": 0.6055, "mean_token_accuracy": 0.7618319243192673, "num_tokens": 559090.0, "step": 154 }, { "entropy": 0.6017518192529678, "epoch": 0.14465702286514232, "grad_norm": 0.2725159227848053, "learning_rate": 0.0002, "loss": 0.6101, "mean_token_accuracy": 0.7531151175498962, "num_tokens": 562744.0, "step": 155 }, { "entropy": 0.5966658145189285, "epoch": 0.1455902939804013, "grad_norm": 0.2736251652240753, "learning_rate": 0.0002, "loss": 0.6137, "mean_token_accuracy": 0.7496384531259537, "num_tokens": 566253.0, "step": 156 }, { "entropy": 0.5750633627176285, "epoch": 0.14652356509566028, "grad_norm": 0.26680371165275574, "learning_rate": 0.0002, "loss": 0.5854, "mean_token_accuracy": 0.7635132372379303, "num_tokens": 569828.0, "step": 157 }, { "entropy": 0.6051732301712036, "epoch": 0.14745683621091926, "grad_norm": 0.2606704831123352, "learning_rate": 0.0002, "loss": 0.6276, "mean_token_accuracy": 0.7434609830379486, "num_tokens": 573506.0, "step": 158 }, { "entropy": 0.5991878360509872, "epoch": 0.14839010732617824, "grad_norm": 0.24064388871192932, "learning_rate": 0.0002, "loss": 0.5914, "mean_token_accuracy": 0.759908139705658, "num_tokens": 577170.0, "step": 159 }, { "entropy": 0.6270691156387329, "epoch": 0.14932337844143723, "grad_norm": 0.25130823254585266, "learning_rate": 0.0002, "loss": 0.6215, "mean_token_accuracy": 0.7464575469493866, "num_tokens": 580824.0, "step": 160 }, { "entropy": 0.632448211312294, "epoch": 0.1502566495566962, "grad_norm": 0.23973777890205383, "learning_rate": 0.0002, "loss": 0.6252, "mean_token_accuracy": 0.7483935952186584, "num_tokens": 584451.0, "step": 161 }, { "entropy": 0.5970974564552307, "epoch": 0.1511899206719552, "grad_norm": 0.2403346747159958, "learning_rate": 0.0002, "loss": 0.5962, "mean_token_accuracy": 0.7546212524175644, "num_tokens": 588024.0, "step": 162 }, { "entropy": 0.6266964375972748, "epoch": 0.15212319178721417, "grad_norm": 0.2669350802898407, "learning_rate": 0.0002, "loss": 0.6335, "mean_token_accuracy": 0.7475338876247406, "num_tokens": 591575.0, "step": 163 }, { "entropy": 0.6330577582120895, "epoch": 0.15305646290247316, "grad_norm": 0.314880907535553, "learning_rate": 0.0002, "loss": 0.6452, "mean_token_accuracy": 0.738884374499321, "num_tokens": 595218.0, "step": 164 }, { "entropy": 0.5537855625152588, "epoch": 0.15398973401773214, "grad_norm": 0.287954181432724, "learning_rate": 0.0002, "loss": 0.5731, "mean_token_accuracy": 0.765826553106308, "num_tokens": 598789.0, "step": 165 }, { "entropy": 0.608406126499176, "epoch": 0.15492300513299112, "grad_norm": 0.3315788507461548, "learning_rate": 0.0002, "loss": 0.6315, "mean_token_accuracy": 0.7440959960222244, "num_tokens": 602255.0, "step": 166 }, { "entropy": 0.5972480028867722, "epoch": 0.1558562762482501, "grad_norm": 0.24809187650680542, "learning_rate": 0.0002, "loss": 0.5911, "mean_token_accuracy": 0.7597341686487198, "num_tokens": 605844.0, "step": 167 }, { "entropy": 0.6412074118852615, "epoch": 0.1567895473635091, "grad_norm": 0.4750399589538574, "learning_rate": 0.0002, "loss": 0.6363, "mean_token_accuracy": 0.7476449608802795, "num_tokens": 609488.0, "step": 168 }, { "entropy": 0.5999118238687515, "epoch": 0.1577228184787681, "grad_norm": 0.20879052579402924, "learning_rate": 0.0002, "loss": 0.5895, "mean_token_accuracy": 0.7608277797698975, "num_tokens": 613072.0, "step": 169 }, { "entropy": 0.6202736347913742, "epoch": 0.15865608959402708, "grad_norm": 0.23107048869132996, "learning_rate": 0.0002, "loss": 0.6196, "mean_token_accuracy": 0.7531519383192062, "num_tokens": 616661.0, "step": 170 }, { "entropy": 0.6640915721654892, "epoch": 0.15958936070928606, "grad_norm": 0.27002063393592834, "learning_rate": 0.0002, "loss": 0.6578, "mean_token_accuracy": 0.7361062467098236, "num_tokens": 620325.0, "step": 171 }, { "entropy": 0.5750575661659241, "epoch": 0.16052263182454504, "grad_norm": 0.2354954183101654, "learning_rate": 0.0002, "loss": 0.5831, "mean_token_accuracy": 0.7591314166784286, "num_tokens": 623839.0, "step": 172 }, { "entropy": 0.6241327226161957, "epoch": 0.16145590293980402, "grad_norm": 0.28900352120399475, "learning_rate": 0.0002, "loss": 0.6336, "mean_token_accuracy": 0.746707409620285, "num_tokens": 627670.0, "step": 173 }, { "entropy": 0.6232179701328278, "epoch": 0.162389174055063, "grad_norm": 0.25896087288856506, "learning_rate": 0.0002, "loss": 0.6302, "mean_token_accuracy": 0.7448995262384415, "num_tokens": 631225.0, "step": 174 }, { "entropy": 0.6067274212837219, "epoch": 0.163322445170322, "grad_norm": 0.27432236075401306, "learning_rate": 0.0002, "loss": 0.6097, "mean_token_accuracy": 0.7555064409971237, "num_tokens": 634852.0, "step": 175 }, { "entropy": 0.621697261929512, "epoch": 0.16425571628558097, "grad_norm": 0.2743587791919708, "learning_rate": 0.0002, "loss": 0.6312, "mean_token_accuracy": 0.7479459494352341, "num_tokens": 638542.0, "step": 176 }, { "entropy": 0.6179335713386536, "epoch": 0.16518898740083995, "grad_norm": 0.29290083050727844, "learning_rate": 0.0002, "loss": 0.6186, "mean_token_accuracy": 0.744606077671051, "num_tokens": 642193.0, "step": 177 }, { "entropy": 0.6250169426202774, "epoch": 0.16612225851609894, "grad_norm": 0.25477907061576843, "learning_rate": 0.0002, "loss": 0.6156, "mean_token_accuracy": 0.7539174258708954, "num_tokens": 645817.0, "step": 178 }, { "entropy": 0.595383808016777, "epoch": 0.16705552963135792, "grad_norm": 0.25349563360214233, "learning_rate": 0.0002, "loss": 0.6005, "mean_token_accuracy": 0.758468896150589, "num_tokens": 649542.0, "step": 179 }, { "entropy": 0.6317081302404404, "epoch": 0.1679888007466169, "grad_norm": 0.25695207715034485, "learning_rate": 0.0002, "loss": 0.6371, "mean_token_accuracy": 0.7463939338922501, "num_tokens": 653145.0, "step": 180 }, { "entropy": 0.5857246667146683, "epoch": 0.16892207186187588, "grad_norm": 0.28805211186408997, "learning_rate": 0.0002, "loss": 0.5921, "mean_token_accuracy": 0.7651209086179733, "num_tokens": 656582.0, "step": 181 }, { "entropy": 0.6236688047647476, "epoch": 0.16985534297713487, "grad_norm": 0.22554291784763336, "learning_rate": 0.0002, "loss": 0.6223, "mean_token_accuracy": 0.7513447403907776, "num_tokens": 660179.0, "step": 182 }, { "entropy": 0.5727080702781677, "epoch": 0.17078861409239385, "grad_norm": 0.34345850348472595, "learning_rate": 0.0002, "loss": 0.588, "mean_token_accuracy": 0.7636747062206268, "num_tokens": 663748.0, "step": 183 }, { "entropy": 0.5918214470148087, "epoch": 0.17172188520765283, "grad_norm": 0.23382219672203064, "learning_rate": 0.0002, "loss": 0.5939, "mean_token_accuracy": 0.7576747685670853, "num_tokens": 667525.0, "step": 184 }, { "entropy": 0.6308113783597946, "epoch": 0.1726551563229118, "grad_norm": 0.24184122681617737, "learning_rate": 0.0002, "loss": 0.6277, "mean_token_accuracy": 0.7453972846269608, "num_tokens": 671196.0, "step": 185 }, { "entropy": 0.605795681476593, "epoch": 0.1735884274381708, "grad_norm": 0.2509753406047821, "learning_rate": 0.0002, "loss": 0.606, "mean_token_accuracy": 0.759684681892395, "num_tokens": 674915.0, "step": 186 }, { "entropy": 0.5998933613300323, "epoch": 0.17452169855342978, "grad_norm": 0.3412162661552429, "learning_rate": 0.0002, "loss": 0.5964, "mean_token_accuracy": 0.7630914598703384, "num_tokens": 678586.0, "step": 187 }, { "entropy": 0.6238373965024948, "epoch": 0.17545496966868876, "grad_norm": 0.3031356930732727, "learning_rate": 0.0002, "loss": 0.6335, "mean_token_accuracy": 0.7459771633148193, "num_tokens": 682256.0, "step": 188 }, { "entropy": 0.6020677983760834, "epoch": 0.17638824078394774, "grad_norm": 0.25279659032821655, "learning_rate": 0.0002, "loss": 0.5924, "mean_token_accuracy": 0.7614628970623016, "num_tokens": 685849.0, "step": 189 }, { "entropy": 0.6398890763521194, "epoch": 0.17732151189920672, "grad_norm": 0.23759107291698456, "learning_rate": 0.0002, "loss": 0.6427, "mean_token_accuracy": 0.7463666498661041, "num_tokens": 689541.0, "step": 190 }, { "entropy": 0.6532247066497803, "epoch": 0.1782547830144657, "grad_norm": 0.2566443085670471, "learning_rate": 0.0002, "loss": 0.6496, "mean_token_accuracy": 0.7365892678499222, "num_tokens": 693127.0, "step": 191 }, { "entropy": 0.6099744290113449, "epoch": 0.1791880541297247, "grad_norm": 0.2897506058216095, "learning_rate": 0.0002, "loss": 0.6265, "mean_token_accuracy": 0.7450228035449982, "num_tokens": 696671.0, "step": 192 }, { "entropy": 0.6053965091705322, "epoch": 0.18012132524498367, "grad_norm": 0.24541716277599335, "learning_rate": 0.0002, "loss": 0.6135, "mean_token_accuracy": 0.7502627670764923, "num_tokens": 700293.0, "step": 193 }, { "entropy": 0.6190284192562103, "epoch": 0.18105459636024265, "grad_norm": 0.26916369795799255, "learning_rate": 0.0002, "loss": 0.6338, "mean_token_accuracy": 0.7439676970243454, "num_tokens": 703844.0, "step": 194 }, { "entropy": 0.6275667548179626, "epoch": 0.18198786747550164, "grad_norm": 0.2828158438205719, "learning_rate": 0.0002, "loss": 0.6306, "mean_token_accuracy": 0.7448041886091232, "num_tokens": 707545.0, "step": 195 }, { "entropy": 0.6403878182172775, "epoch": 0.18292113859076062, "grad_norm": 0.24170711636543274, "learning_rate": 0.0002, "loss": 0.6327, "mean_token_accuracy": 0.7435235232114792, "num_tokens": 711245.0, "step": 196 }, { "entropy": 0.6383012980222702, "epoch": 0.1838544097060196, "grad_norm": 0.3352697491645813, "learning_rate": 0.0002, "loss": 0.6507, "mean_token_accuracy": 0.7408519983291626, "num_tokens": 714910.0, "step": 197 }, { "entropy": 0.6249253004789352, "epoch": 0.18478768082127858, "grad_norm": 0.2284345030784607, "learning_rate": 0.0002, "loss": 0.6403, "mean_token_accuracy": 0.7444155812263489, "num_tokens": 718483.0, "step": 198 }, { "entropy": 0.6026186943054199, "epoch": 0.18572095193653756, "grad_norm": 0.217362180352211, "learning_rate": 0.0002, "loss": 0.6007, "mean_token_accuracy": 0.7555657923221588, "num_tokens": 722170.0, "step": 199 }, { "entropy": 0.6890466958284378, "epoch": 0.18665422305179655, "grad_norm": 0.2601906359195709, "learning_rate": 0.0002, "loss": 0.6761, "mean_token_accuracy": 0.7332215905189514, "num_tokens": 725840.0, "step": 200 }, { "entropy": 0.5560615658760071, "epoch": 0.18758749416705553, "grad_norm": 0.26223263144493103, "learning_rate": 0.0002, "loss": 0.5436, "mean_token_accuracy": 0.7823840826749802, "num_tokens": 729348.0, "step": 201 }, { "entropy": 0.5996962785720825, "epoch": 0.1885207652823145, "grad_norm": 0.2748759984970093, "learning_rate": 0.0002, "loss": 0.601, "mean_token_accuracy": 0.7604239583015442, "num_tokens": 732871.0, "step": 202 }, { "entropy": 0.5800734907388687, "epoch": 0.1894540363975735, "grad_norm": 0.26087355613708496, "learning_rate": 0.0002, "loss": 0.5963, "mean_token_accuracy": 0.7525783330202103, "num_tokens": 736449.0, "step": 203 }, { "entropy": 0.5842571556568146, "epoch": 0.19038730751283248, "grad_norm": 0.23526206612586975, "learning_rate": 0.0002, "loss": 0.5945, "mean_token_accuracy": 0.7587614506483078, "num_tokens": 740180.0, "step": 204 }, { "entropy": 0.5629585683345795, "epoch": 0.19132057862809146, "grad_norm": 0.2398541271686554, "learning_rate": 0.0002, "loss": 0.5734, "mean_token_accuracy": 0.76799176633358, "num_tokens": 743664.0, "step": 205 }, { "entropy": 0.6320621073246002, "epoch": 0.19225384974335044, "grad_norm": 0.25731390714645386, "learning_rate": 0.0002, "loss": 0.6525, "mean_token_accuracy": 0.7370477020740509, "num_tokens": 747280.0, "step": 206 }, { "entropy": 0.6035778969526291, "epoch": 0.19318712085860942, "grad_norm": 0.23380134999752045, "learning_rate": 0.0002, "loss": 0.5889, "mean_token_accuracy": 0.7672587931156158, "num_tokens": 751001.0, "step": 207 }, { "entropy": 0.6217583566904068, "epoch": 0.1941203919738684, "grad_norm": 0.2564546763896942, "learning_rate": 0.0002, "loss": 0.6112, "mean_token_accuracy": 0.752321258187294, "num_tokens": 754584.0, "step": 208 }, { "entropy": 0.6264652758836746, "epoch": 0.1950536630891274, "grad_norm": 0.23776431381702423, "learning_rate": 0.0002, "loss": 0.6286, "mean_token_accuracy": 0.744939774274826, "num_tokens": 758289.0, "step": 209 }, { "entropy": 0.6006923615932465, "epoch": 0.19598693420438637, "grad_norm": 0.26134875416755676, "learning_rate": 0.0002, "loss": 0.6002, "mean_token_accuracy": 0.7589907646179199, "num_tokens": 761955.0, "step": 210 }, { "entropy": 0.610225111246109, "epoch": 0.19692020531964535, "grad_norm": 0.23414131999015808, "learning_rate": 0.0002, "loss": 0.6119, "mean_token_accuracy": 0.7500801384449005, "num_tokens": 765636.0, "step": 211 }, { "entropy": 0.6199745684862137, "epoch": 0.19785347643490434, "grad_norm": 0.272738516330719, "learning_rate": 0.0002, "loss": 0.6194, "mean_token_accuracy": 0.7472145706415176, "num_tokens": 769400.0, "step": 212 }, { "entropy": 0.5927752554416656, "epoch": 0.19878674755016332, "grad_norm": 0.24875399470329285, "learning_rate": 0.0002, "loss": 0.6173, "mean_token_accuracy": 0.7445481568574905, "num_tokens": 772976.0, "step": 213 }, { "entropy": 0.6127741485834122, "epoch": 0.1997200186654223, "grad_norm": 0.2387901395559311, "learning_rate": 0.0002, "loss": 0.6255, "mean_token_accuracy": 0.7477727830410004, "num_tokens": 776647.0, "step": 214 }, { "entropy": 0.6165394634008408, "epoch": 0.20065328978068128, "grad_norm": 0.2639712691307068, "learning_rate": 0.0002, "loss": 0.6131, "mean_token_accuracy": 0.754319503903389, "num_tokens": 780217.0, "step": 215 }, { "entropy": 0.6155958026647568, "epoch": 0.20158656089594026, "grad_norm": 0.2703871428966522, "learning_rate": 0.0002, "loss": 0.6141, "mean_token_accuracy": 0.7532576471567154, "num_tokens": 783968.0, "step": 216 }, { "entropy": 0.61528280377388, "epoch": 0.20251983201119925, "grad_norm": 0.2133951336145401, "learning_rate": 0.0002, "loss": 0.6158, "mean_token_accuracy": 0.7481294125318527, "num_tokens": 787621.0, "step": 217 }, { "entropy": 0.6158957928419113, "epoch": 0.20345310312645823, "grad_norm": 0.258661687374115, "learning_rate": 0.0002, "loss": 0.6319, "mean_token_accuracy": 0.7471871078014374, "num_tokens": 791102.0, "step": 218 }, { "entropy": 0.6168512552976608, "epoch": 0.2043863742417172, "grad_norm": 0.3034660518169403, "learning_rate": 0.0002, "loss": 0.6429, "mean_token_accuracy": 0.7398579716682434, "num_tokens": 794724.0, "step": 219 }, { "entropy": 0.6110662370920181, "epoch": 0.2053196453569762, "grad_norm": 0.23590125143527985, "learning_rate": 0.0002, "loss": 0.6127, "mean_token_accuracy": 0.7508528977632523, "num_tokens": 798469.0, "step": 220 }, { "entropy": 0.6560942381620407, "epoch": 0.20625291647223518, "grad_norm": 0.22595880925655365, "learning_rate": 0.0002, "loss": 0.6507, "mean_token_accuracy": 0.735445186495781, "num_tokens": 802103.0, "step": 221 }, { "entropy": 0.6149120777845383, "epoch": 0.20718618758749416, "grad_norm": 0.21946866810321808, "learning_rate": 0.0002, "loss": 0.6179, "mean_token_accuracy": 0.7493060231208801, "num_tokens": 805728.0, "step": 222 }, { "entropy": 0.5718913376331329, "epoch": 0.20811945870275314, "grad_norm": 0.23979522287845612, "learning_rate": 0.0002, "loss": 0.5765, "mean_token_accuracy": 0.7686641812324524, "num_tokens": 809384.0, "step": 223 }, { "entropy": 0.6254722625017166, "epoch": 0.20905272981801212, "grad_norm": 0.2721552848815918, "learning_rate": 0.0002, "loss": 0.6317, "mean_token_accuracy": 0.7463520169258118, "num_tokens": 812892.0, "step": 224 }, { "entropy": 0.5527019053697586, "epoch": 0.2099860009332711, "grad_norm": 0.2151440978050232, "learning_rate": 0.0002, "loss": 0.5547, "mean_token_accuracy": 0.7704993486404419, "num_tokens": 816480.0, "step": 225 }, { "entropy": 0.5949896275997162, "epoch": 0.2109192720485301, "grad_norm": 0.25440722703933716, "learning_rate": 0.0002, "loss": 0.6007, "mean_token_accuracy": 0.753014400601387, "num_tokens": 820002.0, "step": 226 }, { "entropy": 0.6562753319740295, "epoch": 0.21185254316378907, "grad_norm": 0.254067987203598, "learning_rate": 0.0002, "loss": 0.6621, "mean_token_accuracy": 0.7353549897670746, "num_tokens": 823690.0, "step": 227 }, { "entropy": 0.6105601638555527, "epoch": 0.21278581427904805, "grad_norm": 0.2414735108613968, "learning_rate": 0.0002, "loss": 0.6119, "mean_token_accuracy": 0.7579648047685623, "num_tokens": 827206.0, "step": 228 }, { "entropy": 0.5988323241472244, "epoch": 0.21371908539430703, "grad_norm": 0.2429419755935669, "learning_rate": 0.0002, "loss": 0.6116, "mean_token_accuracy": 0.7545661926269531, "num_tokens": 830704.0, "step": 229 }, { "entropy": 0.6313779950141907, "epoch": 0.21465235650956602, "grad_norm": 0.2469571828842163, "learning_rate": 0.0002, "loss": 0.6326, "mean_token_accuracy": 0.7444997429847717, "num_tokens": 834379.0, "step": 230 }, { "entropy": 0.6383220702409744, "epoch": 0.215585627624825, "grad_norm": 0.24318064749240875, "learning_rate": 0.0002, "loss": 0.6478, "mean_token_accuracy": 0.7371481508016586, "num_tokens": 838018.0, "step": 231 }, { "entropy": 0.6246862262487411, "epoch": 0.21651889874008398, "grad_norm": 0.21833717823028564, "learning_rate": 0.0002, "loss": 0.6209, "mean_token_accuracy": 0.7488425821065903, "num_tokens": 841624.0, "step": 232 }, { "entropy": 0.6057994961738586, "epoch": 0.21745216985534296, "grad_norm": 0.25634661316871643, "learning_rate": 0.0002, "loss": 0.6086, "mean_token_accuracy": 0.7531262934207916, "num_tokens": 845157.0, "step": 233 }, { "entropy": 0.6662411689758301, "epoch": 0.21838544097060195, "grad_norm": 0.21819192171096802, "learning_rate": 0.0002, "loss": 0.6514, "mean_token_accuracy": 0.7362586110830307, "num_tokens": 848870.0, "step": 234 }, { "entropy": 0.6228697299957275, "epoch": 0.21931871208586096, "grad_norm": 0.23247107863426208, "learning_rate": 0.0002, "loss": 0.6315, "mean_token_accuracy": 0.7459995001554489, "num_tokens": 852411.0, "step": 235 }, { "entropy": 0.6096363663673401, "epoch": 0.22025198320111994, "grad_norm": 0.25261467695236206, "learning_rate": 0.0002, "loss": 0.6203, "mean_token_accuracy": 0.7479837387800217, "num_tokens": 856023.0, "step": 236 }, { "entropy": 0.6098160743713379, "epoch": 0.22118525431637892, "grad_norm": 0.22388289868831635, "learning_rate": 0.0002, "loss": 0.624, "mean_token_accuracy": 0.7445860207080841, "num_tokens": 859703.0, "step": 237 }, { "entropy": 0.6158948987722397, "epoch": 0.2221185254316379, "grad_norm": 0.27309468388557434, "learning_rate": 0.0002, "loss": 0.6252, "mean_token_accuracy": 0.749657616019249, "num_tokens": 863349.0, "step": 238 }, { "entropy": 0.5758228674530983, "epoch": 0.22305179654689689, "grad_norm": 0.2549325227737427, "learning_rate": 0.0002, "loss": 0.5944, "mean_token_accuracy": 0.7596073001623154, "num_tokens": 866929.0, "step": 239 }, { "entropy": 0.6322315633296967, "epoch": 0.22398506766215587, "grad_norm": 0.24724318087100983, "learning_rate": 0.0002, "loss": 0.6357, "mean_token_accuracy": 0.7433344572782516, "num_tokens": 870496.0, "step": 240 }, { "entropy": 0.6228243857622147, "epoch": 0.22491833877741485, "grad_norm": 0.21758802235126495, "learning_rate": 0.0002, "loss": 0.6283, "mean_token_accuracy": 0.7459953725337982, "num_tokens": 874047.0, "step": 241 }, { "entropy": 0.6174435317516327, "epoch": 0.22585160989267383, "grad_norm": 0.2307950109243393, "learning_rate": 0.0002, "loss": 0.6187, "mean_token_accuracy": 0.749544769525528, "num_tokens": 877800.0, "step": 242 }, { "entropy": 0.6227843910455704, "epoch": 0.22678488100793281, "grad_norm": 0.19341588020324707, "learning_rate": 0.0002, "loss": 0.6117, "mean_token_accuracy": 0.7530659288167953, "num_tokens": 881441.0, "step": 243 }, { "entropy": 0.6062646061182022, "epoch": 0.2277181521231918, "grad_norm": 0.2877410054206848, "learning_rate": 0.0002, "loss": 0.6262, "mean_token_accuracy": 0.7488056868314743, "num_tokens": 884982.0, "step": 244 }, { "entropy": 0.625371515750885, "epoch": 0.22865142323845078, "grad_norm": 0.2561280131340027, "learning_rate": 0.0002, "loss": 0.6289, "mean_token_accuracy": 0.7483764290809631, "num_tokens": 888656.0, "step": 245 }, { "entropy": 0.6210257411003113, "epoch": 0.22958469435370976, "grad_norm": 0.23064196109771729, "learning_rate": 0.0002, "loss": 0.6177, "mean_token_accuracy": 0.7475909292697906, "num_tokens": 892456.0, "step": 246 }, { "entropy": 0.6544372290372849, "epoch": 0.23051796546896874, "grad_norm": 0.2555047273635864, "learning_rate": 0.0002, "loss": 0.6507, "mean_token_accuracy": 0.7375649213790894, "num_tokens": 896119.0, "step": 247 }, { "entropy": 0.5840075016021729, "epoch": 0.23145123658422773, "grad_norm": 0.21877087652683258, "learning_rate": 0.0002, "loss": 0.5955, "mean_token_accuracy": 0.759378045797348, "num_tokens": 899612.0, "step": 248 }, { "entropy": 0.5986610352993011, "epoch": 0.2323845076994867, "grad_norm": 0.23976625502109528, "learning_rate": 0.0002, "loss": 0.6014, "mean_token_accuracy": 0.7492552846670151, "num_tokens": 903292.0, "step": 249 }, { "entropy": 0.6365804076194763, "epoch": 0.2333177788147457, "grad_norm": 0.2232261300086975, "learning_rate": 0.0002, "loss": 0.6318, "mean_token_accuracy": 0.7459896355867386, "num_tokens": 906870.0, "step": 250 }, { "entropy": 0.647284597158432, "epoch": 0.23425104993000467, "grad_norm": 0.2258215695619583, "learning_rate": 0.0002, "loss": 0.655, "mean_token_accuracy": 0.7368626147508621, "num_tokens": 910544.0, "step": 251 }, { "entropy": 0.6097860038280487, "epoch": 0.23518432104526366, "grad_norm": 0.21181917190551758, "learning_rate": 0.0002, "loss": 0.6131, "mean_token_accuracy": 0.7547638267278671, "num_tokens": 914198.0, "step": 252 }, { "entropy": 0.6026428490877151, "epoch": 0.23611759216052264, "grad_norm": 0.24616965651512146, "learning_rate": 0.0002, "loss": 0.6092, "mean_token_accuracy": 0.7534868121147156, "num_tokens": 917750.0, "step": 253 }, { "entropy": 0.6503418236970901, "epoch": 0.23705086327578162, "grad_norm": 0.23387610912322998, "learning_rate": 0.0002, "loss": 0.6548, "mean_token_accuracy": 0.741888552904129, "num_tokens": 921366.0, "step": 254 }, { "entropy": 0.6411082595586777, "epoch": 0.2379841343910406, "grad_norm": 0.3027603030204773, "learning_rate": 0.0002, "loss": 0.6404, "mean_token_accuracy": 0.7426335513591766, "num_tokens": 925107.0, "step": 255 }, { "entropy": 0.6108865439891815, "epoch": 0.23891740550629958, "grad_norm": 0.2605670392513275, "learning_rate": 0.0002, "loss": 0.6273, "mean_token_accuracy": 0.7455543726682663, "num_tokens": 928753.0, "step": 256 }, { "entropy": 0.6077248454093933, "epoch": 0.23985067662155857, "grad_norm": 0.27511587738990784, "learning_rate": 0.0002, "loss": 0.6162, "mean_token_accuracy": 0.7506539970636368, "num_tokens": 932335.0, "step": 257 }, { "entropy": 0.626903623342514, "epoch": 0.24078394773681755, "grad_norm": 0.24599343538284302, "learning_rate": 0.0002, "loss": 0.6279, "mean_token_accuracy": 0.7511112838983536, "num_tokens": 935967.0, "step": 258 }, { "entropy": 0.618951827287674, "epoch": 0.24171721885207653, "grad_norm": 0.2827540338039398, "learning_rate": 0.0002, "loss": 0.6385, "mean_token_accuracy": 0.7451323866844177, "num_tokens": 939607.0, "step": 259 }, { "entropy": 0.6324618458747864, "epoch": 0.24265048996733551, "grad_norm": 0.2715925872325897, "learning_rate": 0.0002, "loss": 0.6425, "mean_token_accuracy": 0.738193467259407, "num_tokens": 943151.0, "step": 260 }, { "entropy": 0.6031533479690552, "epoch": 0.2435837610825945, "grad_norm": 0.230172261595726, "learning_rate": 0.0002, "loss": 0.6152, "mean_token_accuracy": 0.7527964115142822, "num_tokens": 946907.0, "step": 261 }, { "entropy": 0.6206395179033279, "epoch": 0.24451703219785348, "grad_norm": 0.25124189257621765, "learning_rate": 0.0002, "loss": 0.6231, "mean_token_accuracy": 0.7486433386802673, "num_tokens": 950521.0, "step": 262 }, { "entropy": 0.6440339237451553, "epoch": 0.24545030331311246, "grad_norm": 0.20642443001270294, "learning_rate": 0.0002, "loss": 0.6347, "mean_token_accuracy": 0.7459895610809326, "num_tokens": 954261.0, "step": 263 }, { "entropy": 0.6292641460895538, "epoch": 0.24638357442837144, "grad_norm": 0.2169097661972046, "learning_rate": 0.0002, "loss": 0.6263, "mean_token_accuracy": 0.7421683818101883, "num_tokens": 957843.0, "step": 264 }, { "entropy": 0.6146288514137268, "epoch": 0.24731684554363043, "grad_norm": 0.2507776618003845, "learning_rate": 0.0002, "loss": 0.6036, "mean_token_accuracy": 0.7589435428380966, "num_tokens": 961571.0, "step": 265 }, { "entropy": 0.574441522359848, "epoch": 0.2482501166588894, "grad_norm": 0.2976577877998352, "learning_rate": 0.0002, "loss": 0.5686, "mean_token_accuracy": 0.7695203721523285, "num_tokens": 965282.0, "step": 266 }, { "entropy": 0.6483120620250702, "epoch": 0.2491833877741484, "grad_norm": 0.20985764265060425, "learning_rate": 0.0002, "loss": 0.636, "mean_token_accuracy": 0.7457327991724014, "num_tokens": 969013.0, "step": 267 }, { "entropy": 0.6461483538150787, "epoch": 0.2501166588894074, "grad_norm": 0.2911612391471863, "learning_rate": 0.0002, "loss": 0.6459, "mean_token_accuracy": 0.7426430284976959, "num_tokens": 972592.0, "step": 268 }, { "entropy": 0.5818840116262436, "epoch": 0.25104993000466636, "grad_norm": 0.23449541628360748, "learning_rate": 0.0002, "loss": 0.5882, "mean_token_accuracy": 0.7681708335876465, "num_tokens": 976295.0, "step": 269 }, { "entropy": 0.5908921509981155, "epoch": 0.25198320111992534, "grad_norm": 0.26701241731643677, "learning_rate": 0.0002, "loss": 0.6221, "mean_token_accuracy": 0.7512622177600861, "num_tokens": 979984.0, "step": 270 }, { "entropy": 0.6007975488901138, "epoch": 0.2529164722351843, "grad_norm": 0.24850456416606903, "learning_rate": 0.0002, "loss": 0.6202, "mean_token_accuracy": 0.7483939528465271, "num_tokens": 983731.0, "step": 271 }, { "entropy": 0.5573849380016327, "epoch": 0.2538497433504433, "grad_norm": 0.275779128074646, "learning_rate": 0.0002, "loss": 0.5745, "mean_token_accuracy": 0.7668437957763672, "num_tokens": 987321.0, "step": 272 }, { "entropy": 0.6206032335758209, "epoch": 0.2547830144657023, "grad_norm": 0.3776557445526123, "learning_rate": 0.0002, "loss": 0.6233, "mean_token_accuracy": 0.7501537650823593, "num_tokens": 990986.0, "step": 273 }, { "entropy": 0.6222099363803864, "epoch": 0.25571628558096127, "grad_norm": 0.23503698408603668, "learning_rate": 0.0002, "loss": 0.6184, "mean_token_accuracy": 0.7554141581058502, "num_tokens": 994623.0, "step": 274 }, { "entropy": 0.6178972274065018, "epoch": 0.25664955669622025, "grad_norm": 0.22250908613204956, "learning_rate": 0.0002, "loss": 0.6226, "mean_token_accuracy": 0.7465093731880188, "num_tokens": 998329.0, "step": 275 }, { "entropy": 0.6592730283737183, "epoch": 0.25758282781147923, "grad_norm": 0.21317912638187408, "learning_rate": 0.0002, "loss": 0.6386, "mean_token_accuracy": 0.7463464140892029, "num_tokens": 1002015.0, "step": 276 }, { "entropy": 0.6304679065942764, "epoch": 0.2585160989267382, "grad_norm": 0.20819774270057678, "learning_rate": 0.0002, "loss": 0.6171, "mean_token_accuracy": 0.7546389847993851, "num_tokens": 1005589.0, "step": 277 }, { "entropy": 0.5945038199424744, "epoch": 0.2594493700419972, "grad_norm": 0.21087612211704254, "learning_rate": 0.0002, "loss": 0.5937, "mean_token_accuracy": 0.7676447927951813, "num_tokens": 1009223.0, "step": 278 }, { "entropy": 0.5728593170642853, "epoch": 0.2603826411572562, "grad_norm": 0.26314777135849, "learning_rate": 0.0002, "loss": 0.5759, "mean_token_accuracy": 0.7703249156475067, "num_tokens": 1012847.0, "step": 279 }, { "entropy": 0.6195310354232788, "epoch": 0.26131591227251516, "grad_norm": 0.29450416564941406, "learning_rate": 0.0002, "loss": 0.6421, "mean_token_accuracy": 0.7434626221656799, "num_tokens": 1016384.0, "step": 280 }, { "entropy": 0.567408561706543, "epoch": 0.26224918338777414, "grad_norm": 0.2618149220943451, "learning_rate": 0.0002, "loss": 0.5906, "mean_token_accuracy": 0.7650034427642822, "num_tokens": 1019964.0, "step": 281 }, { "entropy": 0.6377571821212769, "epoch": 0.2631824545030331, "grad_norm": 0.2697064280509949, "learning_rate": 0.0002, "loss": 0.6555, "mean_token_accuracy": 0.7346802353858948, "num_tokens": 1023660.0, "step": 282 }, { "entropy": 0.5862014442682266, "epoch": 0.2641157256182921, "grad_norm": 0.2469477355480194, "learning_rate": 0.0002, "loss": 0.6067, "mean_token_accuracy": 0.7541156858205795, "num_tokens": 1027208.0, "step": 283 }, { "entropy": 0.6236500144004822, "epoch": 0.2650489967335511, "grad_norm": 0.23605135083198547, "learning_rate": 0.0002, "loss": 0.6146, "mean_token_accuracy": 0.7528289258480072, "num_tokens": 1030828.0, "step": 284 }, { "entropy": 0.6237832754850388, "epoch": 0.26598226784881007, "grad_norm": 0.2188662588596344, "learning_rate": 0.0002, "loss": 0.6167, "mean_token_accuracy": 0.7499840408563614, "num_tokens": 1034346.0, "step": 285 }, { "entropy": 0.6626402884721756, "epoch": 0.26691553896406905, "grad_norm": 0.20738428831100464, "learning_rate": 0.0002, "loss": 0.6502, "mean_token_accuracy": 0.7376929223537445, "num_tokens": 1038082.0, "step": 286 }, { "entropy": 0.6518249809741974, "epoch": 0.26784881007932804, "grad_norm": 0.237564817070961, "learning_rate": 0.0002, "loss": 0.657, "mean_token_accuracy": 0.7363761961460114, "num_tokens": 1041597.0, "step": 287 }, { "entropy": 0.6569457948207855, "epoch": 0.268782081194587, "grad_norm": 0.2046477049589157, "learning_rate": 0.0002, "loss": 0.653, "mean_token_accuracy": 0.7392484843730927, "num_tokens": 1045250.0, "step": 288 }, { "entropy": 0.6010307669639587, "epoch": 0.269715352309846, "grad_norm": 0.24642395973205566, "learning_rate": 0.0002, "loss": 0.6161, "mean_token_accuracy": 0.7493630796670914, "num_tokens": 1048898.0, "step": 289 }, { "entropy": 0.5858932882547379, "epoch": 0.270648623425105, "grad_norm": 0.21390631794929504, "learning_rate": 0.0002, "loss": 0.5838, "mean_token_accuracy": 0.7679879814386368, "num_tokens": 1052563.0, "step": 290 }, { "entropy": 0.6008359342813492, "epoch": 0.27158189454036397, "grad_norm": 0.23739905655384064, "learning_rate": 0.0002, "loss": 0.6191, "mean_token_accuracy": 0.7466578781604767, "num_tokens": 1056257.0, "step": 291 }, { "entropy": 0.6130369603633881, "epoch": 0.27251516565562295, "grad_norm": 0.27225300669670105, "learning_rate": 0.0002, "loss": 0.6216, "mean_token_accuracy": 0.7470966130495071, "num_tokens": 1059900.0, "step": 292 }, { "entropy": 0.6194625049829483, "epoch": 0.27344843677088193, "grad_norm": 0.21914733946323395, "learning_rate": 0.0002, "loss": 0.6179, "mean_token_accuracy": 0.7487413734197617, "num_tokens": 1063606.0, "step": 293 }, { "entropy": 0.5879460871219635, "epoch": 0.2743817078861409, "grad_norm": 0.23188331723213196, "learning_rate": 0.0002, "loss": 0.5937, "mean_token_accuracy": 0.7624316215515137, "num_tokens": 1067242.0, "step": 294 }, { "entropy": 0.592158243060112, "epoch": 0.2753149790013999, "grad_norm": 0.2917449176311493, "learning_rate": 0.0002, "loss": 0.5999, "mean_token_accuracy": 0.7608749568462372, "num_tokens": 1070821.0, "step": 295 }, { "entropy": 0.5934346914291382, "epoch": 0.2762482501166589, "grad_norm": 0.24298319220542908, "learning_rate": 0.0002, "loss": 0.5979, "mean_token_accuracy": 0.7567747384309769, "num_tokens": 1074428.0, "step": 296 }, { "entropy": 0.5949414819478989, "epoch": 0.27718152123191786, "grad_norm": 0.2491264045238495, "learning_rate": 0.0002, "loss": 0.5904, "mean_token_accuracy": 0.7606220841407776, "num_tokens": 1078032.0, "step": 297 }, { "entropy": 0.6307313442230225, "epoch": 0.27811479234717684, "grad_norm": 0.23083384335041046, "learning_rate": 0.0002, "loss": 0.6334, "mean_token_accuracy": 0.7362936437129974, "num_tokens": 1081589.0, "step": 298 }, { "entropy": 0.6157567650079727, "epoch": 0.2790480634624358, "grad_norm": 0.2656978964805603, "learning_rate": 0.0002, "loss": 0.6235, "mean_token_accuracy": 0.749842569231987, "num_tokens": 1085228.0, "step": 299 }, { "entropy": 0.5669458508491516, "epoch": 0.2799813345776948, "grad_norm": 0.24704335629940033, "learning_rate": 0.0002, "loss": 0.5801, "mean_token_accuracy": 0.7641920894384384, "num_tokens": 1088880.0, "step": 300 }, { "entropy": 0.6344714313745499, "epoch": 0.2809146056929538, "grad_norm": 0.24963775277137756, "learning_rate": 0.0002, "loss": 0.6406, "mean_token_accuracy": 0.746170163154602, "num_tokens": 1092641.0, "step": 301 }, { "entropy": 0.6174575835466385, "epoch": 0.28184787680821277, "grad_norm": 0.23457922041416168, "learning_rate": 0.0002, "loss": 0.6169, "mean_token_accuracy": 0.7502936720848083, "num_tokens": 1096144.0, "step": 302 }, { "entropy": 0.6172576397657394, "epoch": 0.28278114792347175, "grad_norm": 0.2192540466785431, "learning_rate": 0.0002, "loss": 0.618, "mean_token_accuracy": 0.750115692615509, "num_tokens": 1099677.0, "step": 303 }, { "entropy": 0.6074609905481339, "epoch": 0.28371441903873074, "grad_norm": 0.2816256284713745, "learning_rate": 0.0002, "loss": 0.6197, "mean_token_accuracy": 0.7507822215557098, "num_tokens": 1103214.0, "step": 304 }, { "entropy": 0.6212022751569748, "epoch": 0.2846476901539897, "grad_norm": 0.21151141822338104, "learning_rate": 0.0002, "loss": 0.6183, "mean_token_accuracy": 0.7523869872093201, "num_tokens": 1106804.0, "step": 305 }, { "entropy": 0.5748876482248306, "epoch": 0.2855809612692487, "grad_norm": 0.22084778547286987, "learning_rate": 0.0002, "loss": 0.5744, "mean_token_accuracy": 0.7668226510286331, "num_tokens": 1110324.0, "step": 306 }, { "entropy": 0.5899158269166946, "epoch": 0.2865142323845077, "grad_norm": 0.19880327582359314, "learning_rate": 0.0002, "loss": 0.5965, "mean_token_accuracy": 0.7537516504526138, "num_tokens": 1113831.0, "step": 307 }, { "entropy": 0.6471531987190247, "epoch": 0.28744750349976667, "grad_norm": 0.24146264791488647, "learning_rate": 0.0002, "loss": 0.6598, "mean_token_accuracy": 0.7329321056604385, "num_tokens": 1117514.0, "step": 308 }, { "entropy": 0.6253788471221924, "epoch": 0.28838077461502565, "grad_norm": 0.21777553856372833, "learning_rate": 0.0002, "loss": 0.6208, "mean_token_accuracy": 0.749675452709198, "num_tokens": 1121172.0, "step": 309 }, { "entropy": 0.6081382483243942, "epoch": 0.28931404573028463, "grad_norm": 0.25095707178115845, "learning_rate": 0.0002, "loss": 0.6179, "mean_token_accuracy": 0.7478205114603043, "num_tokens": 1124765.0, "step": 310 }, { "entropy": 0.6202685683965683, "epoch": 0.2902473168455436, "grad_norm": 0.2301933914422989, "learning_rate": 0.0002, "loss": 0.6372, "mean_token_accuracy": 0.7454356700181961, "num_tokens": 1128345.0, "step": 311 }, { "entropy": 0.6180685758590698, "epoch": 0.2911805879608026, "grad_norm": 0.25074467062950134, "learning_rate": 0.0002, "loss": 0.6338, "mean_token_accuracy": 0.7469735592603683, "num_tokens": 1131911.0, "step": 312 }, { "entropy": 0.5987216979265213, "epoch": 0.2921138590760616, "grad_norm": 0.2729342579841614, "learning_rate": 0.0002, "loss": 0.6197, "mean_token_accuracy": 0.751179963350296, "num_tokens": 1135397.0, "step": 313 }, { "entropy": 0.6078919023275375, "epoch": 0.29304713019132056, "grad_norm": 0.2441406399011612, "learning_rate": 0.0002, "loss": 0.6033, "mean_token_accuracy": 0.7593741714954376, "num_tokens": 1138980.0, "step": 314 }, { "entropy": 0.6372731477022171, "epoch": 0.29398040130657954, "grad_norm": 0.2276860922574997, "learning_rate": 0.0002, "loss": 0.6334, "mean_token_accuracy": 0.7389883399009705, "num_tokens": 1142586.0, "step": 315 }, { "entropy": 0.6201260983943939, "epoch": 0.2949136724218385, "grad_norm": 0.2690199911594391, "learning_rate": 0.0002, "loss": 0.619, "mean_token_accuracy": 0.7519151121377945, "num_tokens": 1146121.0, "step": 316 }, { "entropy": 0.5924785882234573, "epoch": 0.2958469435370975, "grad_norm": 0.2467505931854248, "learning_rate": 0.0002, "loss": 0.5805, "mean_token_accuracy": 0.7632642537355423, "num_tokens": 1149708.0, "step": 317 }, { "entropy": 0.6103527694940567, "epoch": 0.2967802146523565, "grad_norm": 0.19957681000232697, "learning_rate": 0.0002, "loss": 0.61, "mean_token_accuracy": 0.7506077289581299, "num_tokens": 1153325.0, "step": 318 }, { "entropy": 0.6278337389230728, "epoch": 0.29771348576761547, "grad_norm": 0.27817314863204956, "learning_rate": 0.0002, "loss": 0.6436, "mean_token_accuracy": 0.7401367425918579, "num_tokens": 1156957.0, "step": 319 }, { "entropy": 0.6169297099113464, "epoch": 0.29864675688287445, "grad_norm": 0.24306392669677734, "learning_rate": 0.0002, "loss": 0.6369, "mean_token_accuracy": 0.74124476313591, "num_tokens": 1160463.0, "step": 320 }, { "entropy": 0.6195003390312195, "epoch": 0.29958002799813344, "grad_norm": 0.21362222731113434, "learning_rate": 0.0002, "loss": 0.6112, "mean_token_accuracy": 0.7584252953529358, "num_tokens": 1164046.0, "step": 321 }, { "entropy": 0.5986830592155457, "epoch": 0.3005132991133924, "grad_norm": 0.22763696312904358, "learning_rate": 0.0002, "loss": 0.6032, "mean_token_accuracy": 0.7592554837465286, "num_tokens": 1167625.0, "step": 322 }, { "entropy": 0.6200439631938934, "epoch": 0.3014465702286514, "grad_norm": 0.24690859019756317, "learning_rate": 0.0002, "loss": 0.6261, "mean_token_accuracy": 0.7414265722036362, "num_tokens": 1171149.0, "step": 323 }, { "entropy": 0.6427660286426544, "epoch": 0.3023798413439104, "grad_norm": 0.2356821894645691, "learning_rate": 0.0002, "loss": 0.6498, "mean_token_accuracy": 0.7429594844579697, "num_tokens": 1174783.0, "step": 324 }, { "entropy": 0.640150398015976, "epoch": 0.30331311245916937, "grad_norm": 0.25030872225761414, "learning_rate": 0.0002, "loss": 0.6398, "mean_token_accuracy": 0.7366082519292831, "num_tokens": 1178391.0, "step": 325 }, { "entropy": 0.5915670543909073, "epoch": 0.30424638357442835, "grad_norm": 0.22520935535430908, "learning_rate": 0.0002, "loss": 0.5839, "mean_token_accuracy": 0.7640996873378754, "num_tokens": 1181888.0, "step": 326 }, { "entropy": 0.6268191039562225, "epoch": 0.30517965468968733, "grad_norm": 0.21728338301181793, "learning_rate": 0.0002, "loss": 0.6193, "mean_token_accuracy": 0.7510809898376465, "num_tokens": 1185576.0, "step": 327 }, { "entropy": 0.5616789907217026, "epoch": 0.3061129258049463, "grad_norm": 0.21547482907772064, "learning_rate": 0.0002, "loss": 0.5787, "mean_token_accuracy": 0.7668380290269852, "num_tokens": 1189107.0, "step": 328 }, { "entropy": 0.6214643865823746, "epoch": 0.3070461969202053, "grad_norm": 0.25943729281425476, "learning_rate": 0.0002, "loss": 0.6444, "mean_token_accuracy": 0.7369241267442703, "num_tokens": 1192656.0, "step": 329 }, { "entropy": 0.6243578940629959, "epoch": 0.3079794680354643, "grad_norm": 0.22083476185798645, "learning_rate": 0.0002, "loss": 0.6351, "mean_token_accuracy": 0.7450756132602692, "num_tokens": 1196264.0, "step": 330 }, { "entropy": 0.579920306801796, "epoch": 0.30891273915072326, "grad_norm": 0.20630919933319092, "learning_rate": 0.0002, "loss": 0.5827, "mean_token_accuracy": 0.7668572217226028, "num_tokens": 1199793.0, "step": 331 }, { "entropy": 0.5741087794303894, "epoch": 0.30984601026598224, "grad_norm": 0.21569089591503143, "learning_rate": 0.0002, "loss": 0.5808, "mean_token_accuracy": 0.7638017237186432, "num_tokens": 1203423.0, "step": 332 }, { "entropy": 0.5836642533540726, "epoch": 0.3107792813812412, "grad_norm": 0.2118726521730423, "learning_rate": 0.0002, "loss": 0.5895, "mean_token_accuracy": 0.7563299983739853, "num_tokens": 1206904.0, "step": 333 }, { "entropy": 0.6266708523035049, "epoch": 0.3117125524965002, "grad_norm": 0.22088375687599182, "learning_rate": 0.0002, "loss": 0.6305, "mean_token_accuracy": 0.7499650120735168, "num_tokens": 1210484.0, "step": 334 }, { "entropy": 0.5870046764612198, "epoch": 0.31264582361175924, "grad_norm": 0.23060370981693268, "learning_rate": 0.0002, "loss": 0.5819, "mean_token_accuracy": 0.7613692134618759, "num_tokens": 1214190.0, "step": 335 }, { "entropy": 0.6101571023464203, "epoch": 0.3135790947270182, "grad_norm": 0.2165846973657608, "learning_rate": 0.0002, "loss": 0.6064, "mean_token_accuracy": 0.7515923231840134, "num_tokens": 1217626.0, "step": 336 }, { "entropy": 0.6078171133995056, "epoch": 0.3145123658422772, "grad_norm": 0.22835344076156616, "learning_rate": 0.0002, "loss": 0.6095, "mean_token_accuracy": 0.7536688148975372, "num_tokens": 1221231.0, "step": 337 }, { "entropy": 0.5780431926250458, "epoch": 0.3154456369575362, "grad_norm": 0.21286775171756744, "learning_rate": 0.0002, "loss": 0.588, "mean_token_accuracy": 0.7593173384666443, "num_tokens": 1224768.0, "step": 338 }, { "entropy": 0.6263787895441055, "epoch": 0.3163789080727952, "grad_norm": 0.2597062885761261, "learning_rate": 0.0002, "loss": 0.6269, "mean_token_accuracy": 0.7447996884584427, "num_tokens": 1228515.0, "step": 339 }, { "entropy": 0.5898604542016983, "epoch": 0.31731217918805416, "grad_norm": 0.24986936151981354, "learning_rate": 0.0002, "loss": 0.6056, "mean_token_accuracy": 0.7549707740545273, "num_tokens": 1232049.0, "step": 340 }, { "entropy": 0.5990090221166611, "epoch": 0.31824545030331314, "grad_norm": 0.2359207421541214, "learning_rate": 0.0002, "loss": 0.6098, "mean_token_accuracy": 0.7505118697881699, "num_tokens": 1235644.0, "step": 341 }, { "entropy": 0.5914523005485535, "epoch": 0.3191787214185721, "grad_norm": 0.232805535197258, "learning_rate": 0.0002, "loss": 0.5985, "mean_token_accuracy": 0.7616169154644012, "num_tokens": 1239329.0, "step": 342 }, { "entropy": 0.613632321357727, "epoch": 0.3201119925338311, "grad_norm": 0.2661004364490509, "learning_rate": 0.0002, "loss": 0.6091, "mean_token_accuracy": 0.7557211667299271, "num_tokens": 1242916.0, "step": 343 }, { "entropy": 0.614156186580658, "epoch": 0.3210452636490901, "grad_norm": 0.1851717084646225, "learning_rate": 0.0002, "loss": 0.621, "mean_token_accuracy": 0.7508998513221741, "num_tokens": 1246618.0, "step": 344 }, { "entropy": 0.5903591960668564, "epoch": 0.32197853476434907, "grad_norm": 0.21408240497112274, "learning_rate": 0.0002, "loss": 0.5976, "mean_token_accuracy": 0.7538702040910721, "num_tokens": 1250091.0, "step": 345 }, { "entropy": 0.6021784543991089, "epoch": 0.32291180587960805, "grad_norm": 0.21963505446910858, "learning_rate": 0.0002, "loss": 0.6102, "mean_token_accuracy": 0.7517879009246826, "num_tokens": 1253679.0, "step": 346 }, { "entropy": 0.6390803754329681, "epoch": 0.32384507699486703, "grad_norm": 0.20665013790130615, "learning_rate": 0.0002, "loss": 0.6294, "mean_token_accuracy": 0.7460967153310776, "num_tokens": 1257276.0, "step": 347 }, { "entropy": 0.627433717250824, "epoch": 0.324778348110126, "grad_norm": 0.2606278955936432, "learning_rate": 0.0002, "loss": 0.6305, "mean_token_accuracy": 0.7425560653209686, "num_tokens": 1260694.0, "step": 348 }, { "entropy": 0.5854783207178116, "epoch": 0.325711619225385, "grad_norm": 0.19505207240581512, "learning_rate": 0.0002, "loss": 0.595, "mean_token_accuracy": 0.7572886198759079, "num_tokens": 1264270.0, "step": 349 }, { "entropy": 0.6104092448949814, "epoch": 0.326644890340644, "grad_norm": 0.27850067615509033, "learning_rate": 0.0002, "loss": 0.6159, "mean_token_accuracy": 0.753401979804039, "num_tokens": 1267926.0, "step": 350 }, { "entropy": 0.5945405513048172, "epoch": 0.32757816145590296, "grad_norm": 0.20189014077186584, "learning_rate": 0.0002, "loss": 0.5992, "mean_token_accuracy": 0.758490577340126, "num_tokens": 1271462.0, "step": 351 }, { "entropy": 0.6336624324321747, "epoch": 0.32851143257116194, "grad_norm": 0.26650872826576233, "learning_rate": 0.0002, "loss": 0.6396, "mean_token_accuracy": 0.7401424944400787, "num_tokens": 1275102.0, "step": 352 }, { "entropy": 0.6115435063838959, "epoch": 0.3294447036864209, "grad_norm": 0.22786034643650055, "learning_rate": 0.0002, "loss": 0.6205, "mean_token_accuracy": 0.7429598271846771, "num_tokens": 1278642.0, "step": 353 }, { "entropy": 0.5812320560216904, "epoch": 0.3303779748016799, "grad_norm": 0.183819979429245, "learning_rate": 0.0002, "loss": 0.5779, "mean_token_accuracy": 0.7689248770475388, "num_tokens": 1282165.0, "step": 354 }, { "entropy": 0.5847655087709427, "epoch": 0.3313112459169389, "grad_norm": 0.19989608228206635, "learning_rate": 0.0002, "loss": 0.589, "mean_token_accuracy": 0.7661341726779938, "num_tokens": 1285759.0, "step": 355 }, { "entropy": 0.5984356701374054, "epoch": 0.3322445170321979, "grad_norm": 0.2546808421611786, "learning_rate": 0.0002, "loss": 0.6098, "mean_token_accuracy": 0.7550733387470245, "num_tokens": 1289316.0, "step": 356 }, { "entropy": 0.6160804480314255, "epoch": 0.33317778814745685, "grad_norm": 0.22378405928611755, "learning_rate": 0.0002, "loss": 0.6181, "mean_token_accuracy": 0.7508740574121475, "num_tokens": 1292910.0, "step": 357 }, { "entropy": 0.5899059921503067, "epoch": 0.33411105926271584, "grad_norm": 0.22932404279708862, "learning_rate": 0.0002, "loss": 0.5861, "mean_token_accuracy": 0.7649360597133636, "num_tokens": 1296578.0, "step": 358 }, { "entropy": 0.6710889488458633, "epoch": 0.3350443303779748, "grad_norm": 0.24526171386241913, "learning_rate": 0.0002, "loss": 0.6679, "mean_token_accuracy": 0.7290333360433578, "num_tokens": 1300239.0, "step": 359 }, { "entropy": 0.6154076009988785, "epoch": 0.3359776014932338, "grad_norm": 0.2306792140007019, "learning_rate": 0.0002, "loss": 0.6104, "mean_token_accuracy": 0.7568444609642029, "num_tokens": 1303776.0, "step": 360 }, { "entropy": 0.5984497219324112, "epoch": 0.3369108726084928, "grad_norm": 0.23140272498130798, "learning_rate": 0.0002, "loss": 0.6006, "mean_token_accuracy": 0.7509630173444748, "num_tokens": 1307554.0, "step": 361 }, { "entropy": 0.5946592539548874, "epoch": 0.33784414372375177, "grad_norm": 0.29730385541915894, "learning_rate": 0.0002, "loss": 0.6162, "mean_token_accuracy": 0.7548128515481949, "num_tokens": 1311276.0, "step": 362 }, { "entropy": 0.6066769063472748, "epoch": 0.33877741483901075, "grad_norm": 0.2370462566614151, "learning_rate": 0.0002, "loss": 0.6265, "mean_token_accuracy": 0.743898794054985, "num_tokens": 1314817.0, "step": 363 }, { "entropy": 0.6237184703350067, "epoch": 0.33971068595426973, "grad_norm": 0.22379326820373535, "learning_rate": 0.0002, "loss": 0.6255, "mean_token_accuracy": 0.7474159598350525, "num_tokens": 1318522.0, "step": 364 }, { "entropy": 0.6195416897535324, "epoch": 0.3406439570695287, "grad_norm": 0.2198534458875656, "learning_rate": 0.0002, "loss": 0.6136, "mean_token_accuracy": 0.7533635348081589, "num_tokens": 1322162.0, "step": 365 }, { "entropy": 0.6308283656835556, "epoch": 0.3415772281847877, "grad_norm": 0.22901062667369843, "learning_rate": 0.0002, "loss": 0.6401, "mean_token_accuracy": 0.7436907142400742, "num_tokens": 1325672.0, "step": 366 }, { "entropy": 0.593099907040596, "epoch": 0.3425104993000467, "grad_norm": 0.21790964901447296, "learning_rate": 0.0002, "loss": 0.5998, "mean_token_accuracy": 0.7577232420444489, "num_tokens": 1329399.0, "step": 367 }, { "entropy": 0.6479755938053131, "epoch": 0.34344377041530566, "grad_norm": 0.21687085926532745, "learning_rate": 0.0002, "loss": 0.6399, "mean_token_accuracy": 0.7407496571540833, "num_tokens": 1332982.0, "step": 368 }, { "entropy": 0.6347972005605698, "epoch": 0.34437704153056464, "grad_norm": 0.26061534881591797, "learning_rate": 0.0002, "loss": 0.6384, "mean_token_accuracy": 0.7446777373552322, "num_tokens": 1336567.0, "step": 369 }, { "entropy": 0.6241190731525421, "epoch": 0.3453103126458236, "grad_norm": 0.24576880037784576, "learning_rate": 0.0002, "loss": 0.6369, "mean_token_accuracy": 0.7468370646238327, "num_tokens": 1340064.0, "step": 370 }, { "entropy": 0.6380767524242401, "epoch": 0.3462435837610826, "grad_norm": 0.2171047478914261, "learning_rate": 0.0002, "loss": 0.6254, "mean_token_accuracy": 0.7482398450374603, "num_tokens": 1343720.0, "step": 371 }, { "entropy": 0.6216683834791183, "epoch": 0.3471768548763416, "grad_norm": 0.280464768409729, "learning_rate": 0.0002, "loss": 0.6458, "mean_token_accuracy": 0.7344055622816086, "num_tokens": 1347301.0, "step": 372 }, { "entropy": 0.6090810596942902, "epoch": 0.34811012599160057, "grad_norm": 0.23577742278575897, "learning_rate": 0.0002, "loss": 0.6087, "mean_token_accuracy": 0.7505800127983093, "num_tokens": 1350885.0, "step": 373 }, { "entropy": 0.6314007639884949, "epoch": 0.34904339710685955, "grad_norm": 0.2249661237001419, "learning_rate": 0.0002, "loss": 0.6339, "mean_token_accuracy": 0.7498449087142944, "num_tokens": 1354455.0, "step": 374 }, { "entropy": 0.6458257734775543, "epoch": 0.34997666822211854, "grad_norm": 0.22789525985717773, "learning_rate": 0.0002, "loss": 0.6485, "mean_token_accuracy": 0.7356987744569778, "num_tokens": 1358082.0, "step": 375 }, { "entropy": 0.5810594856739044, "epoch": 0.3509099393373775, "grad_norm": 0.22420641779899597, "learning_rate": 0.0002, "loss": 0.5843, "mean_token_accuracy": 0.7633730918169022, "num_tokens": 1361716.0, "step": 376 }, { "entropy": 0.5878690481185913, "epoch": 0.3518432104526365, "grad_norm": 0.23166850209236145, "learning_rate": 0.0002, "loss": 0.5968, "mean_token_accuracy": 0.7610799223184586, "num_tokens": 1365197.0, "step": 377 }, { "entropy": 0.6096364110708237, "epoch": 0.3527764815678955, "grad_norm": 0.21607016026973724, "learning_rate": 0.0002, "loss": 0.6223, "mean_token_accuracy": 0.7519436925649643, "num_tokens": 1368819.0, "step": 378 }, { "entropy": 0.6249891370534897, "epoch": 0.35370975268315447, "grad_norm": 0.4773690104484558, "learning_rate": 0.0002, "loss": 0.6581, "mean_token_accuracy": 0.7373586744070053, "num_tokens": 1372475.0, "step": 379 }, { "entropy": 0.6183346509933472, "epoch": 0.35464302379841345, "grad_norm": 0.23047888278961182, "learning_rate": 0.0002, "loss": 0.6153, "mean_token_accuracy": 0.7543335407972336, "num_tokens": 1376156.0, "step": 380 }, { "entropy": 0.5943261384963989, "epoch": 0.35557629491367243, "grad_norm": 0.2624111771583557, "learning_rate": 0.0002, "loss": 0.5951, "mean_token_accuracy": 0.7639524340629578, "num_tokens": 1379819.0, "step": 381 }, { "entropy": 0.6645634472370148, "epoch": 0.3565095660289314, "grad_norm": 0.2375471442937851, "learning_rate": 0.0002, "loss": 0.6619, "mean_token_accuracy": 0.7374669909477234, "num_tokens": 1383432.0, "step": 382 }, { "entropy": 0.630711704492569, "epoch": 0.3574428371441904, "grad_norm": 0.2724907100200653, "learning_rate": 0.0002, "loss": 0.6295, "mean_token_accuracy": 0.7522044479846954, "num_tokens": 1387086.0, "step": 383 }, { "entropy": 0.579196497797966, "epoch": 0.3583761082594494, "grad_norm": 0.2126770317554474, "learning_rate": 0.0002, "loss": 0.5718, "mean_token_accuracy": 0.7684217989444733, "num_tokens": 1390709.0, "step": 384 }, { "entropy": 0.6369570791721344, "epoch": 0.35930937937470836, "grad_norm": 0.20885342359542847, "learning_rate": 0.0002, "loss": 0.6232, "mean_token_accuracy": 0.7492280751466751, "num_tokens": 1394348.0, "step": 385 }, { "entropy": 0.5800481140613556, "epoch": 0.36024265048996734, "grad_norm": 0.22694551944732666, "learning_rate": 0.0002, "loss": 0.5833, "mean_token_accuracy": 0.7665986716747284, "num_tokens": 1398020.0, "step": 386 }, { "entropy": 0.5827400237321854, "epoch": 0.3611759216052263, "grad_norm": 0.30266207456588745, "learning_rate": 0.0002, "loss": 0.5923, "mean_token_accuracy": 0.7627837806940079, "num_tokens": 1401581.0, "step": 387 }, { "entropy": 0.6121315211057663, "epoch": 0.3621091927204853, "grad_norm": 0.3236875534057617, "learning_rate": 0.0002, "loss": 0.6176, "mean_token_accuracy": 0.7581509798765182, "num_tokens": 1405251.0, "step": 388 }, { "entropy": 0.5997529625892639, "epoch": 0.3630424638357443, "grad_norm": 0.27346500754356384, "learning_rate": 0.0002, "loss": 0.6157, "mean_token_accuracy": 0.7418871521949768, "num_tokens": 1408886.0, "step": 389 }, { "entropy": 0.5911572277545929, "epoch": 0.36397573495100327, "grad_norm": 0.26092949509620667, "learning_rate": 0.0002, "loss": 0.6036, "mean_token_accuracy": 0.7553617656230927, "num_tokens": 1412580.0, "step": 390 }, { "entropy": 0.5824232399463654, "epoch": 0.36490900606626225, "grad_norm": 0.23115184903144836, "learning_rate": 0.0002, "loss": 0.5887, "mean_token_accuracy": 0.7586528062820435, "num_tokens": 1416196.0, "step": 391 }, { "entropy": 0.6483248174190521, "epoch": 0.36584227718152124, "grad_norm": 0.22439347207546234, "learning_rate": 0.0002, "loss": 0.649, "mean_token_accuracy": 0.7427305430173874, "num_tokens": 1419855.0, "step": 392 }, { "entropy": 0.5965899974107742, "epoch": 0.3667755482967802, "grad_norm": 0.2648840546607971, "learning_rate": 0.0002, "loss": 0.6104, "mean_token_accuracy": 0.7543503940105438, "num_tokens": 1423524.0, "step": 393 }, { "entropy": 0.6175757348537445, "epoch": 0.3677088194120392, "grad_norm": 0.2837810218334198, "learning_rate": 0.0002, "loss": 0.6207, "mean_token_accuracy": 0.7507263720035553, "num_tokens": 1427014.0, "step": 394 }, { "entropy": 0.5793129652738571, "epoch": 0.3686420905272982, "grad_norm": 0.19554108381271362, "learning_rate": 0.0002, "loss": 0.5756, "mean_token_accuracy": 0.7649319171905518, "num_tokens": 1430650.0, "step": 395 }, { "entropy": 0.5920907557010651, "epoch": 0.36957536164255717, "grad_norm": 0.19599369168281555, "learning_rate": 0.0002, "loss": 0.5861, "mean_token_accuracy": 0.7611645311117172, "num_tokens": 1434291.0, "step": 396 }, { "entropy": 0.5972599387168884, "epoch": 0.37050863275781615, "grad_norm": 0.2146742343902588, "learning_rate": 0.0002, "loss": 0.6038, "mean_token_accuracy": 0.7556688338518143, "num_tokens": 1437835.0, "step": 397 }, { "entropy": 0.6373176723718643, "epoch": 0.37144190387307513, "grad_norm": 0.25356748700141907, "learning_rate": 0.0002, "loss": 0.6477, "mean_token_accuracy": 0.7385593056678772, "num_tokens": 1441416.0, "step": 398 }, { "entropy": 0.591210201382637, "epoch": 0.3723751749883341, "grad_norm": 0.21997016668319702, "learning_rate": 0.0002, "loss": 0.5969, "mean_token_accuracy": 0.76143379509449, "num_tokens": 1444923.0, "step": 399 }, { "entropy": 0.5790425166487694, "epoch": 0.3733084461035931, "grad_norm": 0.20937463641166687, "learning_rate": 0.0002, "loss": 0.5846, "mean_token_accuracy": 0.7649457454681396, "num_tokens": 1448595.0, "step": 400 }, { "entropy": 0.6249474138021469, "epoch": 0.3742417172188521, "grad_norm": 0.23173649609088898, "learning_rate": 0.0002, "loss": 0.629, "mean_token_accuracy": 0.7441944926977158, "num_tokens": 1452163.0, "step": 401 }, { "entropy": 0.6181075423955917, "epoch": 0.37517498833411106, "grad_norm": 0.22161071002483368, "learning_rate": 0.0002, "loss": 0.6236, "mean_token_accuracy": 0.7455354183912277, "num_tokens": 1455726.0, "step": 402 }, { "entropy": 0.6109613627195358, "epoch": 0.37610825944937004, "grad_norm": 0.25769132375717163, "learning_rate": 0.0002, "loss": 0.622, "mean_token_accuracy": 0.749205157160759, "num_tokens": 1459254.0, "step": 403 }, { "entropy": 0.612871915102005, "epoch": 0.377041530564629, "grad_norm": 0.22754663228988647, "learning_rate": 0.0002, "loss": 0.6063, "mean_token_accuracy": 0.757092222571373, "num_tokens": 1462905.0, "step": 404 }, { "entropy": 0.5554372072219849, "epoch": 0.377974801679888, "grad_norm": 0.23889710009098053, "learning_rate": 0.0002, "loss": 0.5606, "mean_token_accuracy": 0.7775957882404327, "num_tokens": 1466362.0, "step": 405 }, { "entropy": 0.6151146590709686, "epoch": 0.378908072795147, "grad_norm": 0.24351723492145538, "learning_rate": 0.0002, "loss": 0.6177, "mean_token_accuracy": 0.7467886358499527, "num_tokens": 1469892.0, "step": 406 }, { "entropy": 0.5948413759469986, "epoch": 0.37984134391040597, "grad_norm": 0.2489289492368698, "learning_rate": 0.0002, "loss": 0.6042, "mean_token_accuracy": 0.7552263885736465, "num_tokens": 1473456.0, "step": 407 }, { "entropy": 0.6514520347118378, "epoch": 0.38077461502566495, "grad_norm": 0.21675430238246918, "learning_rate": 0.0002, "loss": 0.6536, "mean_token_accuracy": 0.7320958226919174, "num_tokens": 1477204.0, "step": 408 }, { "entropy": 0.6002169996500015, "epoch": 0.38170788614092394, "grad_norm": 0.20564278960227966, "learning_rate": 0.0002, "loss": 0.5947, "mean_token_accuracy": 0.7515855133533478, "num_tokens": 1480917.0, "step": 409 }, { "entropy": 0.6056803911924362, "epoch": 0.3826411572561829, "grad_norm": 0.2332601696252823, "learning_rate": 0.0002, "loss": 0.6094, "mean_token_accuracy": 0.7587864995002747, "num_tokens": 1484618.0, "step": 410 }, { "entropy": 0.6287792176008224, "epoch": 0.3835744283714419, "grad_norm": 0.2355736643075943, "learning_rate": 0.0002, "loss": 0.6354, "mean_token_accuracy": 0.7448130995035172, "num_tokens": 1488236.0, "step": 411 }, { "entropy": 0.6484764218330383, "epoch": 0.3845076994867009, "grad_norm": 0.2035057097673416, "learning_rate": 0.0002, "loss": 0.6527, "mean_token_accuracy": 0.7360820472240448, "num_tokens": 1491869.0, "step": 412 }, { "entropy": 0.6360791027545929, "epoch": 0.38544097060195986, "grad_norm": 0.22852467000484467, "learning_rate": 0.0002, "loss": 0.6422, "mean_token_accuracy": 0.7392690777778625, "num_tokens": 1495633.0, "step": 413 }, { "entropy": 0.618283674120903, "epoch": 0.38637424171721885, "grad_norm": 0.19870463013648987, "learning_rate": 0.0002, "loss": 0.6149, "mean_token_accuracy": 0.7523349821567535, "num_tokens": 1499407.0, "step": 414 }, { "entropy": 0.5462178736925125, "epoch": 0.38730751283247783, "grad_norm": 0.2444918304681778, "learning_rate": 0.0002, "loss": 0.5613, "mean_token_accuracy": 0.7686880975961685, "num_tokens": 1502890.0, "step": 415 }, { "entropy": 0.6087295860052109, "epoch": 0.3882407839477368, "grad_norm": 0.26696375012397766, "learning_rate": 0.0002, "loss": 0.6106, "mean_token_accuracy": 0.755668118596077, "num_tokens": 1506414.0, "step": 416 }, { "entropy": 0.6091526746749878, "epoch": 0.3891740550629958, "grad_norm": 0.207493856549263, "learning_rate": 0.0002, "loss": 0.6195, "mean_token_accuracy": 0.749883234500885, "num_tokens": 1510131.0, "step": 417 }, { "entropy": 0.548588216304779, "epoch": 0.3901073261782548, "grad_norm": 0.18546663224697113, "learning_rate": 0.0002, "loss": 0.5493, "mean_token_accuracy": 0.776019886136055, "num_tokens": 1513874.0, "step": 418 }, { "entropy": 0.5989990383386612, "epoch": 0.39104059729351376, "grad_norm": 0.2156958132982254, "learning_rate": 0.0002, "loss": 0.599, "mean_token_accuracy": 0.7582882195711136, "num_tokens": 1517435.0, "step": 419 }, { "entropy": 0.583926260471344, "epoch": 0.39197386840877274, "grad_norm": 0.21883417665958405, "learning_rate": 0.0002, "loss": 0.5835, "mean_token_accuracy": 0.7627844363451004, "num_tokens": 1521143.0, "step": 420 }, { "entropy": 0.6015520393848419, "epoch": 0.3929071395240317, "grad_norm": 0.2115660309791565, "learning_rate": 0.0002, "loss": 0.6099, "mean_token_accuracy": 0.753062516450882, "num_tokens": 1524755.0, "step": 421 }, { "entropy": 0.6090584397315979, "epoch": 0.3938404106392907, "grad_norm": 0.23086337745189667, "learning_rate": 0.0002, "loss": 0.6244, "mean_token_accuracy": 0.7485830336809158, "num_tokens": 1528373.0, "step": 422 }, { "entropy": 0.6048530638217926, "epoch": 0.3947736817545497, "grad_norm": 0.26202768087387085, "learning_rate": 0.0002, "loss": 0.6205, "mean_token_accuracy": 0.7442813217639923, "num_tokens": 1531849.0, "step": 423 }, { "entropy": 0.6436657458543777, "epoch": 0.39570695286980867, "grad_norm": 0.22831924259662628, "learning_rate": 0.0002, "loss": 0.6393, "mean_token_accuracy": 0.7431166023015976, "num_tokens": 1535568.0, "step": 424 }, { "entropy": 0.6121799349784851, "epoch": 0.39664022398506765, "grad_norm": 0.23078174889087677, "learning_rate": 0.0002, "loss": 0.6102, "mean_token_accuracy": 0.7544636875391006, "num_tokens": 1539189.0, "step": 425 }, { "entropy": 0.6414619237184525, "epoch": 0.39757349510032663, "grad_norm": 0.19844016432762146, "learning_rate": 0.0002, "loss": 0.6398, "mean_token_accuracy": 0.7412266284227371, "num_tokens": 1542863.0, "step": 426 }, { "entropy": 0.6200491487979889, "epoch": 0.3985067662155856, "grad_norm": 0.21976591646671295, "learning_rate": 0.0002, "loss": 0.6147, "mean_token_accuracy": 0.7503442168235779, "num_tokens": 1546447.0, "step": 427 }, { "entropy": 0.5753028392791748, "epoch": 0.3994400373308446, "grad_norm": 0.20993764698505402, "learning_rate": 0.0002, "loss": 0.5757, "mean_token_accuracy": 0.7716755419969559, "num_tokens": 1550164.0, "step": 428 }, { "entropy": 0.6134968101978302, "epoch": 0.4003733084461036, "grad_norm": 0.28896018862724304, "learning_rate": 0.0002, "loss": 0.6304, "mean_token_accuracy": 0.7445253133773804, "num_tokens": 1553773.0, "step": 429 }, { "entropy": 0.5808843672275543, "epoch": 0.40130657956136256, "grad_norm": 0.21802420914173126, "learning_rate": 0.0002, "loss": 0.5975, "mean_token_accuracy": 0.7565340548753738, "num_tokens": 1557294.0, "step": 430 }, { "entropy": 0.6004755645990372, "epoch": 0.40223985067662155, "grad_norm": 0.2013148069381714, "learning_rate": 0.0002, "loss": 0.6078, "mean_token_accuracy": 0.7476448863744736, "num_tokens": 1561013.0, "step": 431 }, { "entropy": 0.6116288751363754, "epoch": 0.40317312179188053, "grad_norm": 0.20840208232402802, "learning_rate": 0.0002, "loss": 0.6072, "mean_token_accuracy": 0.7532025575637817, "num_tokens": 1564679.0, "step": 432 }, { "entropy": 0.5568309426307678, "epoch": 0.4041063929071395, "grad_norm": 0.2794225811958313, "learning_rate": 0.0002, "loss": 0.5508, "mean_token_accuracy": 0.7789036929607391, "num_tokens": 1568315.0, "step": 433 }, { "entropy": 0.6213642954826355, "epoch": 0.4050396640223985, "grad_norm": 0.20629864931106567, "learning_rate": 0.0002, "loss": 0.6156, "mean_token_accuracy": 0.7531854659318924, "num_tokens": 1572078.0, "step": 434 }, { "entropy": 0.6264741569757462, "epoch": 0.4059729351376575, "grad_norm": 0.2116464227437973, "learning_rate": 0.0002, "loss": 0.6336, "mean_token_accuracy": 0.7461398839950562, "num_tokens": 1575668.0, "step": 435 }, { "entropy": 0.5888563394546509, "epoch": 0.40690620625291646, "grad_norm": 0.21166282892227173, "learning_rate": 0.0002, "loss": 0.5858, "mean_token_accuracy": 0.7616350054740906, "num_tokens": 1579296.0, "step": 436 }, { "entropy": 0.6168862730264664, "epoch": 0.40783947736817544, "grad_norm": 0.23338682949543, "learning_rate": 0.0002, "loss": 0.6211, "mean_token_accuracy": 0.7514671385288239, "num_tokens": 1582998.0, "step": 437 }, { "entropy": 0.5854594111442566, "epoch": 0.4087727484834344, "grad_norm": 0.2398950606584549, "learning_rate": 0.0002, "loss": 0.5966, "mean_token_accuracy": 0.754451185464859, "num_tokens": 1586552.0, "step": 438 }, { "entropy": 0.5729570388793945, "epoch": 0.4097060195986934, "grad_norm": 0.23153294622898102, "learning_rate": 0.0002, "loss": 0.5933, "mean_token_accuracy": 0.7614750117063522, "num_tokens": 1590237.0, "step": 439 }, { "entropy": 0.5578116625547409, "epoch": 0.4106392907139524, "grad_norm": 0.25818148255348206, "learning_rate": 0.0002, "loss": 0.5775, "mean_token_accuracy": 0.7652178704738617, "num_tokens": 1593677.0, "step": 440 }, { "entropy": 0.6341408938169479, "epoch": 0.41157256182921137, "grad_norm": 0.26176732778549194, "learning_rate": 0.0002, "loss": 0.6578, "mean_token_accuracy": 0.7298594415187836, "num_tokens": 1597367.0, "step": 441 }, { "entropy": 0.6089527010917664, "epoch": 0.41250583294447035, "grad_norm": 0.20524422824382782, "learning_rate": 0.0002, "loss": 0.6135, "mean_token_accuracy": 0.748669445514679, "num_tokens": 1600980.0, "step": 442 }, { "entropy": 0.6088770627975464, "epoch": 0.41343910405972933, "grad_norm": 0.18685582280158997, "learning_rate": 0.0002, "loss": 0.5978, "mean_token_accuracy": 0.7551285624504089, "num_tokens": 1604716.0, "step": 443 }, { "entropy": 0.6358718276023865, "epoch": 0.4143723751749883, "grad_norm": 0.20167051255702972, "learning_rate": 0.0002, "loss": 0.6283, "mean_token_accuracy": 0.7495709210634232, "num_tokens": 1608404.0, "step": 444 }, { "entropy": 0.6588656902313232, "epoch": 0.4153056462902473, "grad_norm": 0.18997955322265625, "learning_rate": 0.0002, "loss": 0.6541, "mean_token_accuracy": 0.7358649969100952, "num_tokens": 1612112.0, "step": 445 }, { "entropy": 0.6643947660923004, "epoch": 0.4162389174055063, "grad_norm": 0.20268765091896057, "learning_rate": 0.0002, "loss": 0.6602, "mean_token_accuracy": 0.7282259166240692, "num_tokens": 1615780.0, "step": 446 }, { "entropy": 0.6287236958742142, "epoch": 0.41717218852076526, "grad_norm": 0.20473895967006683, "learning_rate": 0.0002, "loss": 0.6247, "mean_token_accuracy": 0.7462587356567383, "num_tokens": 1619318.0, "step": 447 }, { "entropy": 0.5777546912431717, "epoch": 0.41810545963602425, "grad_norm": 0.20942486822605133, "learning_rate": 0.0002, "loss": 0.5723, "mean_token_accuracy": 0.7674697041511536, "num_tokens": 1622883.0, "step": 448 }, { "entropy": 0.5664989203214645, "epoch": 0.41903873075128323, "grad_norm": 0.23915570974349976, "learning_rate": 0.0002, "loss": 0.5706, "mean_token_accuracy": 0.7707549780607224, "num_tokens": 1626485.0, "step": 449 }, { "entropy": 0.5722636282444, "epoch": 0.4199720018665422, "grad_norm": 0.23664964735507965, "learning_rate": 0.0002, "loss": 0.5857, "mean_token_accuracy": 0.7642188966274261, "num_tokens": 1630162.0, "step": 450 }, { "entropy": 0.5782869011163712, "epoch": 0.4209052729818012, "grad_norm": 0.2639448940753937, "learning_rate": 0.0002, "loss": 0.5893, "mean_token_accuracy": 0.7626081854104996, "num_tokens": 1633779.0, "step": 451 }, { "entropy": 0.6273588538169861, "epoch": 0.4218385440970602, "grad_norm": 0.2569105625152588, "learning_rate": 0.0002, "loss": 0.6544, "mean_token_accuracy": 0.7371693849563599, "num_tokens": 1637341.0, "step": 452 }, { "entropy": 0.603368416428566, "epoch": 0.42277181521231916, "grad_norm": 0.23220407962799072, "learning_rate": 0.0002, "loss": 0.611, "mean_token_accuracy": 0.756953775882721, "num_tokens": 1641004.0, "step": 453 }, { "entropy": 0.5992353856563568, "epoch": 0.42370508632757814, "grad_norm": 0.1963675320148468, "learning_rate": 0.0002, "loss": 0.6094, "mean_token_accuracy": 0.7551756352186203, "num_tokens": 1644623.0, "step": 454 }, { "entropy": 0.6181684583425522, "epoch": 0.4246383574428371, "grad_norm": 0.3011968433856964, "learning_rate": 0.0002, "loss": 0.6284, "mean_token_accuracy": 0.7383728474378586, "num_tokens": 1648250.0, "step": 455 }, { "entropy": 0.5498093739151955, "epoch": 0.4255716285580961, "grad_norm": 0.2413301020860672, "learning_rate": 0.0002, "loss": 0.5589, "mean_token_accuracy": 0.772691935300827, "num_tokens": 1651767.0, "step": 456 }, { "entropy": 0.6557972878217697, "epoch": 0.4265048996733551, "grad_norm": 0.24576060473918915, "learning_rate": 0.0002, "loss": 0.6478, "mean_token_accuracy": 0.7400306165218353, "num_tokens": 1655396.0, "step": 457 }, { "entropy": 0.6262096613645554, "epoch": 0.42743817078861407, "grad_norm": 0.2525562644004822, "learning_rate": 0.0002, "loss": 0.6321, "mean_token_accuracy": 0.7500291913747787, "num_tokens": 1658806.0, "step": 458 }, { "entropy": 0.6344262808561325, "epoch": 0.42837144190387305, "grad_norm": 0.2177077978849411, "learning_rate": 0.0002, "loss": 0.6248, "mean_token_accuracy": 0.7468637973070145, "num_tokens": 1662454.0, "step": 459 }, { "entropy": 0.6489483565092087, "epoch": 0.42930471301913203, "grad_norm": 0.18358604609966278, "learning_rate": 0.0002, "loss": 0.6454, "mean_token_accuracy": 0.7407630234956741, "num_tokens": 1666108.0, "step": 460 }, { "entropy": 0.6369544863700867, "epoch": 0.430237984134391, "grad_norm": 0.24405793845653534, "learning_rate": 0.0002, "loss": 0.6293, "mean_token_accuracy": 0.7408265024423599, "num_tokens": 1669691.0, "step": 461 }, { "entropy": 0.5755114704370499, "epoch": 0.43117125524965, "grad_norm": 0.20880405604839325, "learning_rate": 0.0002, "loss": 0.5818, "mean_token_accuracy": 0.7664613276720047, "num_tokens": 1673307.0, "step": 462 }, { "entropy": 0.6328095942735672, "epoch": 0.432104526364909, "grad_norm": 0.21288110315799713, "learning_rate": 0.0002, "loss": 0.6295, "mean_token_accuracy": 0.7417377382516861, "num_tokens": 1676991.0, "step": 463 }, { "entropy": 0.6561430096626282, "epoch": 0.43303779748016796, "grad_norm": 0.26392465829849243, "learning_rate": 0.0002, "loss": 0.6663, "mean_token_accuracy": 0.7314817905426025, "num_tokens": 1680631.0, "step": 464 }, { "entropy": 0.582016110420227, "epoch": 0.43397106859542695, "grad_norm": 0.24724449217319489, "learning_rate": 0.0002, "loss": 0.5971, "mean_token_accuracy": 0.7569096833467484, "num_tokens": 1684274.0, "step": 465 }, { "entropy": 0.6108385771512985, "epoch": 0.4349043397106859, "grad_norm": 0.25080010294914246, "learning_rate": 0.0002, "loss": 0.6194, "mean_token_accuracy": 0.7474347800016403, "num_tokens": 1687818.0, "step": 466 }, { "entropy": 0.5554463490843773, "epoch": 0.4358376108259449, "grad_norm": 0.23783200979232788, "learning_rate": 0.0002, "loss": 0.5646, "mean_token_accuracy": 0.7702253013849258, "num_tokens": 1691418.0, "step": 467 }, { "entropy": 0.6007915735244751, "epoch": 0.4367708819412039, "grad_norm": 0.24824760854244232, "learning_rate": 0.0002, "loss": 0.6124, "mean_token_accuracy": 0.7542315125465393, "num_tokens": 1695080.0, "step": 468 }, { "entropy": 0.6305365711450577, "epoch": 0.43770415305646293, "grad_norm": 0.2279318869113922, "learning_rate": 0.0002, "loss": 0.6215, "mean_token_accuracy": 0.7407190501689911, "num_tokens": 1698639.0, "step": 469 }, { "entropy": 0.6181277334690094, "epoch": 0.4386374241717219, "grad_norm": 0.2178512066602707, "learning_rate": 0.0002, "loss": 0.6111, "mean_token_accuracy": 0.7535411715507507, "num_tokens": 1702207.0, "step": 470 }, { "entropy": 0.5776286274194717, "epoch": 0.4395706952869809, "grad_norm": 0.22589510679244995, "learning_rate": 0.0002, "loss": 0.5767, "mean_token_accuracy": 0.767062708735466, "num_tokens": 1705748.0, "step": 471 }, { "entropy": 0.6302258223295212, "epoch": 0.4405039664022399, "grad_norm": 0.24440187215805054, "learning_rate": 0.0002, "loss": 0.6179, "mean_token_accuracy": 0.7504953294992447, "num_tokens": 1709422.0, "step": 472 }, { "entropy": 0.6013066172599792, "epoch": 0.44143723751749886, "grad_norm": 0.2161339819431305, "learning_rate": 0.0002, "loss": 0.5976, "mean_token_accuracy": 0.7565919011831284, "num_tokens": 1713046.0, "step": 473 }, { "entropy": 0.6447531282901764, "epoch": 0.44237050863275784, "grad_norm": 0.24048173427581787, "learning_rate": 0.0002, "loss": 0.6552, "mean_token_accuracy": 0.7334858626127243, "num_tokens": 1716617.0, "step": 474 }, { "entropy": 0.6011638641357422, "epoch": 0.4433037797480168, "grad_norm": 0.2128731906414032, "learning_rate": 0.0002, "loss": 0.6101, "mean_token_accuracy": 0.7511695325374603, "num_tokens": 1720294.0, "step": 475 }, { "entropy": 0.593930572271347, "epoch": 0.4442370508632758, "grad_norm": 0.2826727628707886, "learning_rate": 0.0002, "loss": 0.6113, "mean_token_accuracy": 0.7486419230699539, "num_tokens": 1723801.0, "step": 476 }, { "entropy": 0.581388384103775, "epoch": 0.4451703219785348, "grad_norm": 0.2758433520793915, "learning_rate": 0.0002, "loss": 0.6061, "mean_token_accuracy": 0.7605137974023819, "num_tokens": 1727396.0, "step": 477 }, { "entropy": 0.6107244491577148, "epoch": 0.44610359309379377, "grad_norm": 0.24574656784534454, "learning_rate": 0.0002, "loss": 0.6315, "mean_token_accuracy": 0.7443448156118393, "num_tokens": 1730949.0, "step": 478 }, { "entropy": 0.5942057520151138, "epoch": 0.44703686420905275, "grad_norm": 0.2629700303077698, "learning_rate": 0.0002, "loss": 0.603, "mean_token_accuracy": 0.7564302086830139, "num_tokens": 1734559.0, "step": 479 }, { "entropy": 0.5794282853603363, "epoch": 0.44797013532431174, "grad_norm": 0.23752184212207794, "learning_rate": 0.0002, "loss": 0.5939, "mean_token_accuracy": 0.7582080662250519, "num_tokens": 1738109.0, "step": 480 }, { "entropy": 0.593425527215004, "epoch": 0.4489034064395707, "grad_norm": 0.22867943346500397, "learning_rate": 0.0002, "loss": 0.6043, "mean_token_accuracy": 0.7566874623298645, "num_tokens": 1741538.0, "step": 481 }, { "entropy": 0.6190366894006729, "epoch": 0.4498366775548297, "grad_norm": 0.22431443631649017, "learning_rate": 0.0002, "loss": 0.6064, "mean_token_accuracy": 0.756907731294632, "num_tokens": 1745121.0, "step": 482 }, { "entropy": 0.6499160677194595, "epoch": 0.4507699486700887, "grad_norm": 0.22753562033176422, "learning_rate": 0.0002, "loss": 0.6201, "mean_token_accuracy": 0.7509006857872009, "num_tokens": 1748731.0, "step": 483 }, { "entropy": 0.6070314049720764, "epoch": 0.45170321978534766, "grad_norm": 0.19506925344467163, "learning_rate": 0.0002, "loss": 0.5891, "mean_token_accuracy": 0.7620608955621719, "num_tokens": 1752347.0, "step": 484 }, { "entropy": 0.639227420091629, "epoch": 0.45263649090060665, "grad_norm": 0.20678700506687164, "learning_rate": 0.0002, "loss": 0.6132, "mean_token_accuracy": 0.7561115622520447, "num_tokens": 1756106.0, "step": 485 }, { "entropy": 0.64481121301651, "epoch": 0.45356976201586563, "grad_norm": 0.19412213563919067, "learning_rate": 0.0002, "loss": 0.6357, "mean_token_accuracy": 0.7352472841739655, "num_tokens": 1759814.0, "step": 486 }, { "entropy": 0.6285559386014938, "epoch": 0.4545030331311246, "grad_norm": 0.23103299736976624, "learning_rate": 0.0002, "loss": 0.6435, "mean_token_accuracy": 0.7330741733312607, "num_tokens": 1763357.0, "step": 487 }, { "entropy": 0.5692185014486313, "epoch": 0.4554363042463836, "grad_norm": 0.232151597738266, "learning_rate": 0.0002, "loss": 0.5837, "mean_token_accuracy": 0.764122948050499, "num_tokens": 1766908.0, "step": 488 }, { "entropy": 0.6087078899145126, "epoch": 0.4563695753616426, "grad_norm": 0.29596665501594543, "learning_rate": 0.0002, "loss": 0.6231, "mean_token_accuracy": 0.7542658895254135, "num_tokens": 1770362.0, "step": 489 }, { "entropy": 0.5964542776346207, "epoch": 0.45730284647690156, "grad_norm": 0.23678913712501526, "learning_rate": 0.0002, "loss": 0.6178, "mean_token_accuracy": 0.7495933175086975, "num_tokens": 1773938.0, "step": 490 }, { "entropy": 0.642185315489769, "epoch": 0.45823611759216054, "grad_norm": 0.19446046650409698, "learning_rate": 0.0002, "loss": 0.6356, "mean_token_accuracy": 0.7412782460451126, "num_tokens": 1777535.0, "step": 491 }, { "entropy": 0.5841348767280579, "epoch": 0.4591693887074195, "grad_norm": 0.22333481907844543, "learning_rate": 0.0002, "loss": 0.58, "mean_token_accuracy": 0.7654763758182526, "num_tokens": 1781161.0, "step": 492 }, { "entropy": 0.6212467104196548, "epoch": 0.4601026598226785, "grad_norm": 0.2813858091831207, "learning_rate": 0.0002, "loss": 0.6299, "mean_token_accuracy": 0.7433548122644424, "num_tokens": 1784749.0, "step": 493 }, { "entropy": 0.6306095570325851, "epoch": 0.4610359309379375, "grad_norm": 0.20597881078720093, "learning_rate": 0.0002, "loss": 0.6287, "mean_token_accuracy": 0.7461930215358734, "num_tokens": 1788450.0, "step": 494 }, { "entropy": 0.5940341800451279, "epoch": 0.46196920205319647, "grad_norm": 0.1942528635263443, "learning_rate": 0.0002, "loss": 0.5845, "mean_token_accuracy": 0.7609553933143616, "num_tokens": 1792129.0, "step": 495 }, { "entropy": 0.5897771716117859, "epoch": 0.46290247316845545, "grad_norm": 0.22309216856956482, "learning_rate": 0.0002, "loss": 0.5961, "mean_token_accuracy": 0.7583287209272385, "num_tokens": 1795671.0, "step": 496 }, { "entropy": 0.6222110390663147, "epoch": 0.46383574428371444, "grad_norm": 0.19774983823299408, "learning_rate": 0.0002, "loss": 0.6253, "mean_token_accuracy": 0.7421960681676865, "num_tokens": 1799209.0, "step": 497 }, { "entropy": 0.5948785990476608, "epoch": 0.4647690153989734, "grad_norm": 0.23217858374118805, "learning_rate": 0.0002, "loss": 0.6127, "mean_token_accuracy": 0.7445811331272125, "num_tokens": 1802751.0, "step": 498 }, { "entropy": 0.6586802750825882, "epoch": 0.4657022865142324, "grad_norm": 0.2724892199039459, "learning_rate": 0.0002, "loss": 0.6668, "mean_token_accuracy": 0.7286554425954819, "num_tokens": 1806360.0, "step": 499 }, { "entropy": 0.5885701924562454, "epoch": 0.4666355576294914, "grad_norm": 0.6046621203422546, "learning_rate": 0.0002, "loss": 0.5936, "mean_token_accuracy": 0.7626414597034454, "num_tokens": 1810021.0, "step": 500 }, { "entropy": 0.6085949391126633, "epoch": 0.46756882874475036, "grad_norm": 0.24251966178417206, "learning_rate": 0.0002, "loss": 0.5949, "mean_token_accuracy": 0.7582648545503616, "num_tokens": 1813659.0, "step": 501 }, { "entropy": 0.6009816527366638, "epoch": 0.46850209986000935, "grad_norm": 0.3091435134410858, "learning_rate": 0.0002, "loss": 0.6016, "mean_token_accuracy": 0.7582293003797531, "num_tokens": 1817297.0, "step": 502 }, { "entropy": 0.607995867729187, "epoch": 0.46943537097526833, "grad_norm": 0.302284836769104, "learning_rate": 0.0002, "loss": 0.6136, "mean_token_accuracy": 0.7495416402816772, "num_tokens": 1820793.0, "step": 503 }, { "entropy": 0.5860798209905624, "epoch": 0.4703686420905273, "grad_norm": 0.281692773103714, "learning_rate": 0.0002, "loss": 0.6105, "mean_token_accuracy": 0.7575143426656723, "num_tokens": 1824443.0, "step": 504 }, { "entropy": 0.5959719717502594, "epoch": 0.4713019132057863, "grad_norm": 0.30019766092300415, "learning_rate": 0.0002, "loss": 0.6116, "mean_token_accuracy": 0.7507368624210358, "num_tokens": 1828032.0, "step": 505 }, { "entropy": 0.6059419512748718, "epoch": 0.4722351843210453, "grad_norm": 0.27972251176834106, "learning_rate": 0.0002, "loss": 0.6296, "mean_token_accuracy": 0.7432912737131119, "num_tokens": 1831590.0, "step": 506 }, { "entropy": 0.6119865328073502, "epoch": 0.47316845543630426, "grad_norm": 0.2650858461856842, "learning_rate": 0.0002, "loss": 0.6179, "mean_token_accuracy": 0.7524244785308838, "num_tokens": 1835080.0, "step": 507 }, { "entropy": 0.6203949451446533, "epoch": 0.47410172655156324, "grad_norm": 0.19326429069042206, "learning_rate": 0.0002, "loss": 0.6062, "mean_token_accuracy": 0.7561132609844208, "num_tokens": 1838817.0, "step": 508 }, { "entropy": 0.6557294726371765, "epoch": 0.4750349976668222, "grad_norm": 0.26020365953445435, "learning_rate": 0.0002, "loss": 0.6474, "mean_token_accuracy": 0.7374190837144852, "num_tokens": 1842503.0, "step": 509 }, { "entropy": 0.6445588022470474, "epoch": 0.4759682687820812, "grad_norm": 0.22767597436904907, "learning_rate": 0.0002, "loss": 0.6435, "mean_token_accuracy": 0.7376889288425446, "num_tokens": 1846139.0, "step": 510 }, { "entropy": 0.6249513626098633, "epoch": 0.4769015398973402, "grad_norm": 0.24564391374588013, "learning_rate": 0.0002, "loss": 0.6064, "mean_token_accuracy": 0.7533416897058487, "num_tokens": 1849663.0, "step": 511 }, { "entropy": 0.6182565540075302, "epoch": 0.47783481101259917, "grad_norm": 0.2080013006925583, "learning_rate": 0.0002, "loss": 0.5966, "mean_token_accuracy": 0.758715957403183, "num_tokens": 1853309.0, "step": 512 }, { "entropy": 0.6050814986228943, "epoch": 0.47876808212785815, "grad_norm": 0.22791148722171783, "learning_rate": 0.0002, "loss": 0.6038, "mean_token_accuracy": 0.7479860484600067, "num_tokens": 1856860.0, "step": 513 }, { "entropy": 0.5921116918325424, "epoch": 0.47970135324311713, "grad_norm": 0.18685010075569153, "learning_rate": 0.0002, "loss": 0.5872, "mean_token_accuracy": 0.7639712542295456, "num_tokens": 1860558.0, "step": 514 }, { "entropy": 0.5540153980255127, "epoch": 0.4806346243583761, "grad_norm": 0.22777342796325684, "learning_rate": 0.0002, "loss": 0.57, "mean_token_accuracy": 0.7709729671478271, "num_tokens": 1864231.0, "step": 515 }, { "entropy": 0.6377909630537033, "epoch": 0.4815678954736351, "grad_norm": 0.26171520352363586, "learning_rate": 0.0002, "loss": 0.6652, "mean_token_accuracy": 0.7346604615449905, "num_tokens": 1867982.0, "step": 516 }, { "entropy": 0.5937848389148712, "epoch": 0.4825011665888941, "grad_norm": 0.23369687795639038, "learning_rate": 0.0002, "loss": 0.6102, "mean_token_accuracy": 0.7548262923955917, "num_tokens": 1871586.0, "step": 517 }, { "entropy": 0.5866114050149918, "epoch": 0.48343443770415306, "grad_norm": 0.25140082836151123, "learning_rate": 0.0002, "loss": 0.6034, "mean_token_accuracy": 0.7549565732479095, "num_tokens": 1875114.0, "step": 518 }, { "entropy": 0.6016811728477478, "epoch": 0.48436770881941205, "grad_norm": 0.2129909247159958, "learning_rate": 0.0002, "loss": 0.595, "mean_token_accuracy": 0.7586175948381424, "num_tokens": 1878772.0, "step": 519 }, { "entropy": 0.5678152963519096, "epoch": 0.48530097993467103, "grad_norm": 0.24592013657093048, "learning_rate": 0.0002, "loss": 0.576, "mean_token_accuracy": 0.7682454586029053, "num_tokens": 1882298.0, "step": 520 }, { "entropy": 0.6545820385217667, "epoch": 0.48623425104993, "grad_norm": 0.22277647256851196, "learning_rate": 0.0002, "loss": 0.6451, "mean_token_accuracy": 0.7418922781944275, "num_tokens": 1885970.0, "step": 521 }, { "entropy": 0.6246357411146164, "epoch": 0.487167522165189, "grad_norm": 0.21170637011528015, "learning_rate": 0.0002, "loss": 0.6224, "mean_token_accuracy": 0.7413342744112015, "num_tokens": 1889584.0, "step": 522 }, { "entropy": 0.6219459176063538, "epoch": 0.488100793280448, "grad_norm": 0.27718207240104675, "learning_rate": 0.0002, "loss": 0.6428, "mean_token_accuracy": 0.7339842766523361, "num_tokens": 1893122.0, "step": 523 }, { "entropy": 0.5849523991346359, "epoch": 0.48903406439570696, "grad_norm": 0.22737884521484375, "learning_rate": 0.0002, "loss": 0.592, "mean_token_accuracy": 0.7552362531423569, "num_tokens": 1896618.0, "step": 524 }, { "entropy": 0.6161646693944931, "epoch": 0.48996733551096594, "grad_norm": 0.24336765706539154, "learning_rate": 0.0002, "loss": 0.6033, "mean_token_accuracy": 0.7531228512525558, "num_tokens": 1900297.0, "step": 525 }, { "entropy": 0.6193034499883652, "epoch": 0.4909006066262249, "grad_norm": 0.23012636601924896, "learning_rate": 0.0002, "loss": 0.6214, "mean_token_accuracy": 0.751009151339531, "num_tokens": 1903911.0, "step": 526 }, { "entropy": 0.606223851442337, "epoch": 0.4918338777414839, "grad_norm": 0.24781034886837006, "learning_rate": 0.0002, "loss": 0.6051, "mean_token_accuracy": 0.7546146959066391, "num_tokens": 1907509.0, "step": 527 }, { "entropy": 0.632103368639946, "epoch": 0.4927671488567429, "grad_norm": 0.2623005211353302, "learning_rate": 0.0002, "loss": 0.6325, "mean_token_accuracy": 0.742919310927391, "num_tokens": 1911188.0, "step": 528 }, { "entropy": 0.6316887140274048, "epoch": 0.49370041997200187, "grad_norm": 0.27283912897109985, "learning_rate": 0.0002, "loss": 0.6314, "mean_token_accuracy": 0.7417255342006683, "num_tokens": 1914795.0, "step": 529 }, { "entropy": 0.6190870404243469, "epoch": 0.49463369108726085, "grad_norm": 0.22906313836574554, "learning_rate": 0.0002, "loss": 0.6252, "mean_token_accuracy": 0.748499795794487, "num_tokens": 1918489.0, "step": 530 }, { "entropy": 0.6059915870428085, "epoch": 0.49556696220251983, "grad_norm": 0.21479454636573792, "learning_rate": 0.0002, "loss": 0.605, "mean_token_accuracy": 0.7561027705669403, "num_tokens": 1922078.0, "step": 531 }, { "entropy": 0.548756331205368, "epoch": 0.4965002333177788, "grad_norm": 0.2242330014705658, "learning_rate": 0.0002, "loss": 0.5541, "mean_token_accuracy": 0.7784936428070068, "num_tokens": 1925683.0, "step": 532 }, { "entropy": 0.589206725358963, "epoch": 0.4974335044330378, "grad_norm": 0.3264258801937103, "learning_rate": 0.0002, "loss": 0.6154, "mean_token_accuracy": 0.748665064573288, "num_tokens": 1929240.0, "step": 533 }, { "entropy": 0.6473591774702072, "epoch": 0.4983667755482968, "grad_norm": 0.25774556398391724, "learning_rate": 0.0002, "loss": 0.66, "mean_token_accuracy": 0.730330303311348, "num_tokens": 1932939.0, "step": 534 }, { "entropy": 0.6295206099748611, "epoch": 0.49930004666355576, "grad_norm": 0.1974191665649414, "learning_rate": 0.0002, "loss": 0.6317, "mean_token_accuracy": 0.7406523078680038, "num_tokens": 1936537.0, "step": 535 }, { "entropy": 0.6022147685289383, "epoch": 0.5002333177788147, "grad_norm": 0.2428286224603653, "learning_rate": 0.0002, "loss": 0.5974, "mean_token_accuracy": 0.7557090222835541, "num_tokens": 1940150.0, "step": 536 }, { "entropy": 0.6322564631700516, "epoch": 0.5011665888940737, "grad_norm": 0.28705525398254395, "learning_rate": 0.0002, "loss": 0.6291, "mean_token_accuracy": 0.7497996091842651, "num_tokens": 1943693.0, "step": 537 }, { "entropy": 0.644588828086853, "epoch": 0.5020998600093327, "grad_norm": 0.243915393948555, "learning_rate": 0.0002, "loss": 0.6462, "mean_token_accuracy": 0.7416273951530457, "num_tokens": 1947239.0, "step": 538 }, { "entropy": 0.5565096363425255, "epoch": 0.5030331311245917, "grad_norm": 0.22264209389686584, "learning_rate": 0.0002, "loss": 0.5562, "mean_token_accuracy": 0.7696363776922226, "num_tokens": 1950784.0, "step": 539 }, { "entropy": 0.6096963733434677, "epoch": 0.5039664022398507, "grad_norm": 0.24413947761058807, "learning_rate": 0.0002, "loss": 0.6073, "mean_token_accuracy": 0.7582883387804031, "num_tokens": 1954361.0, "step": 540 }, { "entropy": 0.6190861612558365, "epoch": 0.5048996733551097, "grad_norm": 0.233244851231575, "learning_rate": 0.0002, "loss": 0.6306, "mean_token_accuracy": 0.7435539066791534, "num_tokens": 1958010.0, "step": 541 }, { "entropy": 0.6074635237455368, "epoch": 0.5058329444703686, "grad_norm": 0.3824629783630371, "learning_rate": 0.0002, "loss": 0.6261, "mean_token_accuracy": 0.7517760694026947, "num_tokens": 1961603.0, "step": 542 }, { "entropy": 0.6014325171709061, "epoch": 0.5067662155856276, "grad_norm": 0.2410052865743637, "learning_rate": 0.0002, "loss": 0.613, "mean_token_accuracy": 0.7551465630531311, "num_tokens": 1965264.0, "step": 543 }, { "entropy": 0.5701528638601303, "epoch": 0.5076994867008866, "grad_norm": 0.2524022161960602, "learning_rate": 0.0002, "loss": 0.5805, "mean_token_accuracy": 0.7609460651874542, "num_tokens": 1968823.0, "step": 544 }, { "entropy": 0.584552451968193, "epoch": 0.5086327578161456, "grad_norm": 0.24330593645572662, "learning_rate": 0.0002, "loss": 0.6081, "mean_token_accuracy": 0.75290647149086, "num_tokens": 1972543.0, "step": 545 }, { "entropy": 0.5987952649593353, "epoch": 0.5095660289314046, "grad_norm": 0.4852622151374817, "learning_rate": 0.0002, "loss": 0.6019, "mean_token_accuracy": 0.7621091306209564, "num_tokens": 1976168.0, "step": 546 }, { "entropy": 0.621493011713028, "epoch": 0.5104993000466636, "grad_norm": 0.2389807254076004, "learning_rate": 0.0002, "loss": 0.6399, "mean_token_accuracy": 0.7420757710933685, "num_tokens": 1979771.0, "step": 547 }, { "entropy": 0.6213755458593369, "epoch": 0.5114325711619225, "grad_norm": 0.3466503620147705, "learning_rate": 0.0002, "loss": 0.6174, "mean_token_accuracy": 0.7460227906703949, "num_tokens": 1983313.0, "step": 548 }, { "entropy": 0.6321664601564407, "epoch": 0.5123658422771815, "grad_norm": 0.2572785019874573, "learning_rate": 0.0002, "loss": 0.6259, "mean_token_accuracy": 0.7455581277608871, "num_tokens": 1987060.0, "step": 549 }, { "entropy": 0.5688913762569427, "epoch": 0.5132991133924405, "grad_norm": 0.19045941531658173, "learning_rate": 0.0002, "loss": 0.5668, "mean_token_accuracy": 0.774849534034729, "num_tokens": 1990596.0, "step": 550 }, { "entropy": 0.5920332223176956, "epoch": 0.5142323845076995, "grad_norm": 0.23980528116226196, "learning_rate": 0.0002, "loss": 0.5866, "mean_token_accuracy": 0.761077269911766, "num_tokens": 1994183.0, "step": 551 }, { "entropy": 0.5656691938638687, "epoch": 0.5151656556229585, "grad_norm": 0.24367259442806244, "learning_rate": 0.0002, "loss": 0.5641, "mean_token_accuracy": 0.7764574289321899, "num_tokens": 1997755.0, "step": 552 }, { "entropy": 0.6111474484205246, "epoch": 0.5160989267382174, "grad_norm": 0.24117279052734375, "learning_rate": 0.0002, "loss": 0.6117, "mean_token_accuracy": 0.7547838389873505, "num_tokens": 2001331.0, "step": 553 }, { "entropy": 0.5802249163389206, "epoch": 0.5170321978534764, "grad_norm": 0.23052208125591278, "learning_rate": 0.0002, "loss": 0.5868, "mean_token_accuracy": 0.7593701779842377, "num_tokens": 2004916.0, "step": 554 }, { "entropy": 0.6315817534923553, "epoch": 0.5179654689687354, "grad_norm": 0.2725941836833954, "learning_rate": 0.0002, "loss": 0.6323, "mean_token_accuracy": 0.7440078407526016, "num_tokens": 2008540.0, "step": 555 }, { "entropy": 0.6077404320240021, "epoch": 0.5188987400839944, "grad_norm": 0.23686137795448303, "learning_rate": 0.0002, "loss": 0.599, "mean_token_accuracy": 0.7630778849124908, "num_tokens": 2012205.0, "step": 556 }, { "entropy": 0.6188667863607407, "epoch": 0.5198320111992534, "grad_norm": 0.23134677112102509, "learning_rate": 0.0002, "loss": 0.628, "mean_token_accuracy": 0.7463242411613464, "num_tokens": 2015912.0, "step": 557 }, { "entropy": 0.6022833734750748, "epoch": 0.5207652823145124, "grad_norm": 0.25757449865341187, "learning_rate": 0.0002, "loss": 0.6132, "mean_token_accuracy": 0.7498631626367569, "num_tokens": 2019598.0, "step": 558 }, { "entropy": 0.6121849864721298, "epoch": 0.5216985534297713, "grad_norm": 0.22658950090408325, "learning_rate": 0.0002, "loss": 0.6207, "mean_token_accuracy": 0.7508550584316254, "num_tokens": 2023215.0, "step": 559 }, { "entropy": 0.5977055728435516, "epoch": 0.5226318245450303, "grad_norm": 0.24915415048599243, "learning_rate": 0.0002, "loss": 0.6082, "mean_token_accuracy": 0.7504899352788925, "num_tokens": 2026804.0, "step": 560 }, { "entropy": 0.5835391134023666, "epoch": 0.5235650956602893, "grad_norm": 0.2853584289550781, "learning_rate": 0.0002, "loss": 0.5936, "mean_token_accuracy": 0.7529281079769135, "num_tokens": 2030417.0, "step": 561 }, { "entropy": 0.6320301592350006, "epoch": 0.5244983667755483, "grad_norm": 0.24018581211566925, "learning_rate": 0.0002, "loss": 0.6345, "mean_token_accuracy": 0.7510936558246613, "num_tokens": 2034030.0, "step": 562 }, { "entropy": 0.6083960682153702, "epoch": 0.5254316378908073, "grad_norm": 0.2608059346675873, "learning_rate": 0.0002, "loss": 0.6178, "mean_token_accuracy": 0.7557867467403412, "num_tokens": 2037647.0, "step": 563 }, { "entropy": 0.6189428716897964, "epoch": 0.5263649090060663, "grad_norm": 0.2676878869533539, "learning_rate": 0.0002, "loss": 0.6244, "mean_token_accuracy": 0.7490536570549011, "num_tokens": 2041211.0, "step": 564 }, { "entropy": 0.6251794844865799, "epoch": 0.5272981801213252, "grad_norm": 0.25728639960289, "learning_rate": 0.0002, "loss": 0.6275, "mean_token_accuracy": 0.7479203194379807, "num_tokens": 2044884.0, "step": 565 }, { "entropy": 0.5767540037631989, "epoch": 0.5282314512365842, "grad_norm": 0.2348065972328186, "learning_rate": 0.0002, "loss": 0.5816, "mean_token_accuracy": 0.7646252512931824, "num_tokens": 2048498.0, "step": 566 }, { "entropy": 0.5888216942548752, "epoch": 0.5291647223518432, "grad_norm": 0.258383184671402, "learning_rate": 0.0002, "loss": 0.5937, "mean_token_accuracy": 0.7674905508756638, "num_tokens": 2052112.0, "step": 567 }, { "entropy": 0.6130224168300629, "epoch": 0.5300979934671022, "grad_norm": 0.2268688976764679, "learning_rate": 0.0002, "loss": 0.6084, "mean_token_accuracy": 0.753316193819046, "num_tokens": 2055715.0, "step": 568 }, { "entropy": 0.6233886480331421, "epoch": 0.5310312645823612, "grad_norm": 0.2236703783273697, "learning_rate": 0.0002, "loss": 0.6195, "mean_token_accuracy": 0.74983249604702, "num_tokens": 2059360.0, "step": 569 }, { "entropy": 0.6275019347667694, "epoch": 0.5319645356976201, "grad_norm": 0.2512408494949341, "learning_rate": 0.0002, "loss": 0.6289, "mean_token_accuracy": 0.7459133416414261, "num_tokens": 2062904.0, "step": 570 }, { "entropy": 0.5741313844919205, "epoch": 0.5328978068128791, "grad_norm": 0.21832828223705292, "learning_rate": 0.0002, "loss": 0.5773, "mean_token_accuracy": 0.7606221437454224, "num_tokens": 2066497.0, "step": 571 }, { "entropy": 0.5867547988891602, "epoch": 0.5338310779281381, "grad_norm": 0.22333365678787231, "learning_rate": 0.0002, "loss": 0.5958, "mean_token_accuracy": 0.754267692565918, "num_tokens": 2070163.0, "step": 572 }, { "entropy": 0.577679768204689, "epoch": 0.5347643490433971, "grad_norm": 0.2535644471645355, "learning_rate": 0.0002, "loss": 0.6032, "mean_token_accuracy": 0.755240336060524, "num_tokens": 2073825.0, "step": 573 }, { "entropy": 0.5978310704231262, "epoch": 0.5356976201586561, "grad_norm": 0.2550305724143982, "learning_rate": 0.0002, "loss": 0.5985, "mean_token_accuracy": 0.7622738629579544, "num_tokens": 2077411.0, "step": 574 }, { "entropy": 0.6140522658824921, "epoch": 0.5366308912739151, "grad_norm": 0.2330036461353302, "learning_rate": 0.0002, "loss": 0.6202, "mean_token_accuracy": 0.7533890604972839, "num_tokens": 2081052.0, "step": 575 }, { "entropy": 0.6308914422988892, "epoch": 0.537564162389174, "grad_norm": 0.2643299698829651, "learning_rate": 0.0002, "loss": 0.6261, "mean_token_accuracy": 0.7457119822502136, "num_tokens": 2084728.0, "step": 576 }, { "entropy": 0.6212409138679504, "epoch": 0.538497433504433, "grad_norm": 0.22142568230628967, "learning_rate": 0.0002, "loss": 0.612, "mean_token_accuracy": 0.7562910318374634, "num_tokens": 2088251.0, "step": 577 }, { "entropy": 0.6316259950399399, "epoch": 0.539430704619692, "grad_norm": 0.21009226143360138, "learning_rate": 0.0002, "loss": 0.6155, "mean_token_accuracy": 0.7527958303689957, "num_tokens": 2091969.0, "step": 578 }, { "entropy": 0.6191743612289429, "epoch": 0.540363975734951, "grad_norm": 0.28116920590400696, "learning_rate": 0.0002, "loss": 0.6261, "mean_token_accuracy": 0.7457010447978973, "num_tokens": 2095595.0, "step": 579 }, { "entropy": 0.5883292704820633, "epoch": 0.54129724685021, "grad_norm": 0.18404874205589294, "learning_rate": 0.0002, "loss": 0.5988, "mean_token_accuracy": 0.7594925165176392, "num_tokens": 2099380.0, "step": 580 }, { "entropy": 0.6318969428539276, "epoch": 0.542230517965469, "grad_norm": 0.2788572609424591, "learning_rate": 0.0002, "loss": 0.6504, "mean_token_accuracy": 0.7358361184597015, "num_tokens": 2102937.0, "step": 581 }, { "entropy": 0.5608608424663544, "epoch": 0.5431637890807279, "grad_norm": 0.2443041056394577, "learning_rate": 0.0002, "loss": 0.5874, "mean_token_accuracy": 0.7577566355466843, "num_tokens": 2106489.0, "step": 582 }, { "entropy": 0.6297293603420258, "epoch": 0.5440970601959869, "grad_norm": 0.30691906809806824, "learning_rate": 0.0002, "loss": 0.634, "mean_token_accuracy": 0.7464357614517212, "num_tokens": 2110276.0, "step": 583 }, { "entropy": 0.6315603256225586, "epoch": 0.5450303313112459, "grad_norm": 0.20992031693458557, "learning_rate": 0.0002, "loss": 0.6301, "mean_token_accuracy": 0.7482829391956329, "num_tokens": 2113863.0, "step": 584 }, { "entropy": 0.6156140118837357, "epoch": 0.5459636024265049, "grad_norm": 0.26182013750076294, "learning_rate": 0.0002, "loss": 0.6114, "mean_token_accuracy": 0.7576695382595062, "num_tokens": 2117533.0, "step": 585 }, { "entropy": 0.6022928357124329, "epoch": 0.5468968735417639, "grad_norm": 0.20355640351772308, "learning_rate": 0.0002, "loss": 0.6092, "mean_token_accuracy": 0.7502982318401337, "num_tokens": 2121111.0, "step": 586 }, { "entropy": 0.6206756830215454, "epoch": 0.5478301446570228, "grad_norm": 0.1877318024635315, "learning_rate": 0.0002, "loss": 0.6195, "mean_token_accuracy": 0.7506903111934662, "num_tokens": 2124768.0, "step": 587 }, { "entropy": 0.5922321081161499, "epoch": 0.5487634157722818, "grad_norm": 0.23859453201293945, "learning_rate": 0.0002, "loss": 0.5936, "mean_token_accuracy": 0.7614504098892212, "num_tokens": 2128443.0, "step": 588 }, { "entropy": 0.6385919898748398, "epoch": 0.5496966868875408, "grad_norm": 0.18892315030097961, "learning_rate": 0.0002, "loss": 0.6468, "mean_token_accuracy": 0.7388083189725876, "num_tokens": 2132176.0, "step": 589 }, { "entropy": 0.6088386923074722, "epoch": 0.5506299580027998, "grad_norm": 0.24854958057403564, "learning_rate": 0.0002, "loss": 0.6348, "mean_token_accuracy": 0.7383077293634415, "num_tokens": 2135730.0, "step": 590 }, { "entropy": 0.5928901135921478, "epoch": 0.5515632291180588, "grad_norm": 0.2225223332643509, "learning_rate": 0.0002, "loss": 0.6066, "mean_token_accuracy": 0.7553098797798157, "num_tokens": 2139402.0, "step": 591 }, { "entropy": 0.6465137004852295, "epoch": 0.5524965002333178, "grad_norm": 0.25852739810943604, "learning_rate": 0.0002, "loss": 0.6472, "mean_token_accuracy": 0.7347801625728607, "num_tokens": 2143046.0, "step": 592 }, { "entropy": 0.5912451148033142, "epoch": 0.5534297713485767, "grad_norm": 0.19989070296287537, "learning_rate": 0.0002, "loss": 0.5844, "mean_token_accuracy": 0.7646196633577347, "num_tokens": 2146796.0, "step": 593 }, { "entropy": 0.5814379453659058, "epoch": 0.5543630424638357, "grad_norm": 0.20489796996116638, "learning_rate": 0.0002, "loss": 0.5771, "mean_token_accuracy": 0.7681462317705154, "num_tokens": 2150450.0, "step": 594 }, { "entropy": 0.5983973741531372, "epoch": 0.5552963135790947, "grad_norm": 0.2529965043067932, "learning_rate": 0.0002, "loss": 0.5978, "mean_token_accuracy": 0.7602561265230179, "num_tokens": 2153955.0, "step": 595 }, { "entropy": 0.599485233426094, "epoch": 0.5562295846943537, "grad_norm": 0.22376041114330292, "learning_rate": 0.0002, "loss": 0.5973, "mean_token_accuracy": 0.7585104554891586, "num_tokens": 2157493.0, "step": 596 }, { "entropy": 0.6147795468568802, "epoch": 0.5571628558096127, "grad_norm": 0.2337314337491989, "learning_rate": 0.0002, "loss": 0.6216, "mean_token_accuracy": 0.7496368288993835, "num_tokens": 2161223.0, "step": 597 }, { "entropy": 0.6050089448690414, "epoch": 0.5580961269248716, "grad_norm": 0.26658883690834045, "learning_rate": 0.0002, "loss": 0.6202, "mean_token_accuracy": 0.7451014816761017, "num_tokens": 2164825.0, "step": 598 }, { "entropy": 0.6206872463226318, "epoch": 0.5590293980401306, "grad_norm": 0.25212785601615906, "learning_rate": 0.0002, "loss": 0.6409, "mean_token_accuracy": 0.7462498992681503, "num_tokens": 2168491.0, "step": 599 }, { "entropy": 0.6103123277425766, "epoch": 0.5599626691553896, "grad_norm": 0.2313571274280548, "learning_rate": 0.0002, "loss": 0.6129, "mean_token_accuracy": 0.7546882778406143, "num_tokens": 2172182.0, "step": 600 }, { "entropy": 0.5950224548578262, "epoch": 0.5608959402706486, "grad_norm": 0.18902678787708282, "learning_rate": 0.0002, "loss": 0.5871, "mean_token_accuracy": 0.760997474193573, "num_tokens": 2175851.0, "step": 601 }, { "entropy": 0.6605288982391357, "epoch": 0.5618292113859076, "grad_norm": 0.21955937147140503, "learning_rate": 0.0002, "loss": 0.6677, "mean_token_accuracy": 0.7286249101161957, "num_tokens": 2179475.0, "step": 602 }, { "entropy": 0.603619709610939, "epoch": 0.5627624825011666, "grad_norm": 0.1958063840866089, "learning_rate": 0.0002, "loss": 0.6023, "mean_token_accuracy": 0.756118655204773, "num_tokens": 2183154.0, "step": 603 }, { "entropy": 0.6418652236461639, "epoch": 0.5636957536164255, "grad_norm": 0.1904730498790741, "learning_rate": 0.0002, "loss": 0.624, "mean_token_accuracy": 0.7448119968175888, "num_tokens": 2186904.0, "step": 604 }, { "entropy": 0.6223194450139999, "epoch": 0.5646290247316845, "grad_norm": 0.20739367604255676, "learning_rate": 0.0002, "loss": 0.6134, "mean_token_accuracy": 0.7567580193281174, "num_tokens": 2190726.0, "step": 605 }, { "entropy": 0.6238417625427246, "epoch": 0.5655622958469435, "grad_norm": 0.2288621962070465, "learning_rate": 0.0002, "loss": 0.6362, "mean_token_accuracy": 0.7442229837179184, "num_tokens": 2194346.0, "step": 606 }, { "entropy": 0.5806388854980469, "epoch": 0.5664955669622025, "grad_norm": 0.24216139316558838, "learning_rate": 0.0002, "loss": 0.5911, "mean_token_accuracy": 0.7583803981542587, "num_tokens": 2197887.0, "step": 607 }, { "entropy": 0.5859744250774384, "epoch": 0.5674288380774615, "grad_norm": 0.3106839954853058, "learning_rate": 0.0002, "loss": 0.6142, "mean_token_accuracy": 0.7510814666748047, "num_tokens": 2201345.0, "step": 608 }, { "entropy": 0.628120481967926, "epoch": 0.5683621091927205, "grad_norm": 0.218292698264122, "learning_rate": 0.0002, "loss": 0.6396, "mean_token_accuracy": 0.7395099699497223, "num_tokens": 2204937.0, "step": 609 }, { "entropy": 0.5705380737781525, "epoch": 0.5692953803079794, "grad_norm": 0.19273996353149414, "learning_rate": 0.0002, "loss": 0.5698, "mean_token_accuracy": 0.771387368440628, "num_tokens": 2208650.0, "step": 610 }, { "entropy": 0.6788849532604218, "epoch": 0.5702286514232384, "grad_norm": 0.31241506338119507, "learning_rate": 0.0002, "loss": 0.6874, "mean_token_accuracy": 0.7216528058052063, "num_tokens": 2212325.0, "step": 611 }, { "entropy": 0.6160916388034821, "epoch": 0.5711619225384974, "grad_norm": 0.21813081204891205, "learning_rate": 0.0002, "loss": 0.6253, "mean_token_accuracy": 0.7494376599788666, "num_tokens": 2215733.0, "step": 612 }, { "entropy": 0.593697801232338, "epoch": 0.5720951936537564, "grad_norm": 0.21896892786026, "learning_rate": 0.0002, "loss": 0.5978, "mean_token_accuracy": 0.765286922454834, "num_tokens": 2219370.0, "step": 613 }, { "entropy": 0.6374763697385788, "epoch": 0.5730284647690154, "grad_norm": 0.20519328117370605, "learning_rate": 0.0002, "loss": 0.6461, "mean_token_accuracy": 0.7326332777738571, "num_tokens": 2222996.0, "step": 614 }, { "entropy": 0.6102698147296906, "epoch": 0.5739617358842743, "grad_norm": 0.22523924708366394, "learning_rate": 0.0002, "loss": 0.6065, "mean_token_accuracy": 0.7542233169078827, "num_tokens": 2226640.0, "step": 615 }, { "entropy": 0.5955345034599304, "epoch": 0.5748950069995333, "grad_norm": 0.24114324152469635, "learning_rate": 0.0002, "loss": 0.5971, "mean_token_accuracy": 0.7608686685562134, "num_tokens": 2230277.0, "step": 616 }, { "entropy": 0.5928029417991638, "epoch": 0.5758282781147923, "grad_norm": 0.24148298799991608, "learning_rate": 0.0002, "loss": 0.6067, "mean_token_accuracy": 0.7535218596458435, "num_tokens": 2233802.0, "step": 617 }, { "entropy": 0.6550627052783966, "epoch": 0.5767615492300513, "grad_norm": 0.2273973822593689, "learning_rate": 0.0002, "loss": 0.6595, "mean_token_accuracy": 0.7361426800489426, "num_tokens": 2237564.0, "step": 618 }, { "entropy": 0.5986487716436386, "epoch": 0.5776948203453103, "grad_norm": 0.20957091450691223, "learning_rate": 0.0002, "loss": 0.601, "mean_token_accuracy": 0.7564622610807419, "num_tokens": 2241304.0, "step": 619 }, { "entropy": 0.6047791391611099, "epoch": 0.5786280914605693, "grad_norm": 0.30775532126426697, "learning_rate": 0.0002, "loss": 0.6081, "mean_token_accuracy": 0.7528978884220123, "num_tokens": 2245093.0, "step": 620 }, { "entropy": 0.6091954708099365, "epoch": 0.5795613625758282, "grad_norm": 0.27155765891075134, "learning_rate": 0.0002, "loss": 0.6191, "mean_token_accuracy": 0.7483118921518326, "num_tokens": 2248660.0, "step": 621 }, { "entropy": 0.5952047109603882, "epoch": 0.5804946336910872, "grad_norm": 0.22638940811157227, "learning_rate": 0.0002, "loss": 0.596, "mean_token_accuracy": 0.7623997330665588, "num_tokens": 2252322.0, "step": 622 }, { "entropy": 0.6382639855146408, "epoch": 0.5814279048063462, "grad_norm": 0.22293159365653992, "learning_rate": 0.0002, "loss": 0.6355, "mean_token_accuracy": 0.7425176352262497, "num_tokens": 2256013.0, "step": 623 }, { "entropy": 0.6116872578859329, "epoch": 0.5823611759216052, "grad_norm": 0.2432248592376709, "learning_rate": 0.0002, "loss": 0.6101, "mean_token_accuracy": 0.7474506199359894, "num_tokens": 2259688.0, "step": 624 }, { "entropy": 0.6094284206628799, "epoch": 0.5832944470368642, "grad_norm": 0.2592344880104065, "learning_rate": 0.0002, "loss": 0.6122, "mean_token_accuracy": 0.7498786598443985, "num_tokens": 2263219.0, "step": 625 }, { "entropy": 0.5933926999568939, "epoch": 0.5842277181521232, "grad_norm": 0.22340580821037292, "learning_rate": 0.0002, "loss": 0.597, "mean_token_accuracy": 0.7583221644163132, "num_tokens": 2266842.0, "step": 626 }, { "entropy": 0.6025984138250351, "epoch": 0.5851609892673821, "grad_norm": 0.21044649183750153, "learning_rate": 0.0002, "loss": 0.6034, "mean_token_accuracy": 0.7560776174068451, "num_tokens": 2270464.0, "step": 627 }, { "entropy": 0.5880196392536163, "epoch": 0.5860942603826411, "grad_norm": 0.18335139751434326, "learning_rate": 0.0002, "loss": 0.5862, "mean_token_accuracy": 0.7619286924600601, "num_tokens": 2274006.0, "step": 628 }, { "entropy": 0.568743109703064, "epoch": 0.5870275314979001, "grad_norm": 0.21740184724330902, "learning_rate": 0.0002, "loss": 0.5808, "mean_token_accuracy": 0.7630722522735596, "num_tokens": 2277616.0, "step": 629 }, { "entropy": 0.6060753762722015, "epoch": 0.5879608026131591, "grad_norm": 0.26494601368904114, "learning_rate": 0.0002, "loss": 0.617, "mean_token_accuracy": 0.7538398951292038, "num_tokens": 2281228.0, "step": 630 }, { "entropy": 0.5894195288419724, "epoch": 0.5888940737284181, "grad_norm": 0.21015943586826324, "learning_rate": 0.0002, "loss": 0.6037, "mean_token_accuracy": 0.757981151342392, "num_tokens": 2284826.0, "step": 631 }, { "entropy": 0.6342244446277618, "epoch": 0.589827344843677, "grad_norm": 0.23458042740821838, "learning_rate": 0.0002, "loss": 0.637, "mean_token_accuracy": 0.7408255487680435, "num_tokens": 2288471.0, "step": 632 }, { "entropy": 0.6442506015300751, "epoch": 0.590760615958936, "grad_norm": 0.2215435653924942, "learning_rate": 0.0002, "loss": 0.6506, "mean_token_accuracy": 0.7320486307144165, "num_tokens": 2292202.0, "step": 633 }, { "entropy": 0.6192289441823959, "epoch": 0.591693887074195, "grad_norm": 0.19500331580638885, "learning_rate": 0.0002, "loss": 0.6042, "mean_token_accuracy": 0.7601695209741592, "num_tokens": 2295865.0, "step": 634 }, { "entropy": 0.6158197820186615, "epoch": 0.592627158189454, "grad_norm": 0.21311883628368378, "learning_rate": 0.0002, "loss": 0.6166, "mean_token_accuracy": 0.7507375776767731, "num_tokens": 2299438.0, "step": 635 }, { "entropy": 0.6307994723320007, "epoch": 0.593560429304713, "grad_norm": 0.23045286536216736, "learning_rate": 0.0002, "loss": 0.6269, "mean_token_accuracy": 0.7477785348892212, "num_tokens": 2303129.0, "step": 636 }, { "entropy": 0.6251413524150848, "epoch": 0.594493700419972, "grad_norm": 0.2460610270500183, "learning_rate": 0.0002, "loss": 0.6263, "mean_token_accuracy": 0.7434120327234268, "num_tokens": 2306758.0, "step": 637 }, { "entropy": 0.6368845701217651, "epoch": 0.5954269715352309, "grad_norm": 0.22236786782741547, "learning_rate": 0.0002, "loss": 0.627, "mean_token_accuracy": 0.7471970617771149, "num_tokens": 2310338.0, "step": 638 }, { "entropy": 0.6354386806488037, "epoch": 0.5963602426504899, "grad_norm": 0.29677289724349976, "learning_rate": 0.0002, "loss": 0.6494, "mean_token_accuracy": 0.7283449172973633, "num_tokens": 2313886.0, "step": 639 }, { "entropy": 0.621285617351532, "epoch": 0.5972935137657489, "grad_norm": 0.20131072402000427, "learning_rate": 0.0002, "loss": 0.6132, "mean_token_accuracy": 0.7514660805463791, "num_tokens": 2317543.0, "step": 640 }, { "entropy": 0.6110919266939163, "epoch": 0.5982267848810079, "grad_norm": 0.23639345169067383, "learning_rate": 0.0002, "loss": 0.6091, "mean_token_accuracy": 0.7542045265436172, "num_tokens": 2321159.0, "step": 641 }, { "entropy": 0.6094369292259216, "epoch": 0.5991600559962669, "grad_norm": 0.1897607445716858, "learning_rate": 0.0002, "loss": 0.6186, "mean_token_accuracy": 0.7425357699394226, "num_tokens": 2324832.0, "step": 642 }, { "entropy": 0.5925014317035675, "epoch": 0.6000933271115259, "grad_norm": 0.30368682742118835, "learning_rate": 0.0002, "loss": 0.6106, "mean_token_accuracy": 0.7499009072780609, "num_tokens": 2328400.0, "step": 643 }, { "entropy": 0.6370650082826614, "epoch": 0.6010265982267848, "grad_norm": 0.24284061789512634, "learning_rate": 0.0002, "loss": 0.6489, "mean_token_accuracy": 0.7344943284988403, "num_tokens": 2332102.0, "step": 644 }, { "entropy": 0.5983480960130692, "epoch": 0.6019598693420438, "grad_norm": 0.19705864787101746, "learning_rate": 0.0002, "loss": 0.5995, "mean_token_accuracy": 0.7562436014413834, "num_tokens": 2335756.0, "step": 645 }, { "entropy": 0.5497676581144333, "epoch": 0.6028931404573028, "grad_norm": 0.27064457535743713, "learning_rate": 0.0002, "loss": 0.5483, "mean_token_accuracy": 0.7784012258052826, "num_tokens": 2339351.0, "step": 646 }, { "entropy": 0.609081357717514, "epoch": 0.6038264115725618, "grad_norm": 0.23605570197105408, "learning_rate": 0.0002, "loss": 0.619, "mean_token_accuracy": 0.7466789931058884, "num_tokens": 2343029.0, "step": 647 }, { "entropy": 0.5885906964540482, "epoch": 0.6047596826878208, "grad_norm": 0.24827244877815247, "learning_rate": 0.0002, "loss": 0.5941, "mean_token_accuracy": 0.7540981769561768, "num_tokens": 2346680.0, "step": 648 }, { "entropy": 0.6115444898605347, "epoch": 0.6056929538030797, "grad_norm": 0.20172670483589172, "learning_rate": 0.0002, "loss": 0.6113, "mean_token_accuracy": 0.7486587464809418, "num_tokens": 2350250.0, "step": 649 }, { "entropy": 0.646283283829689, "epoch": 0.6066262249183387, "grad_norm": 0.23828136920928955, "learning_rate": 0.0002, "loss": 0.6467, "mean_token_accuracy": 0.7398516237735748, "num_tokens": 2353903.0, "step": 650 }, { "entropy": 0.6468851566314697, "epoch": 0.6075594960335977, "grad_norm": 0.28586727380752563, "learning_rate": 0.0002, "loss": 0.645, "mean_token_accuracy": 0.7380722314119339, "num_tokens": 2357500.0, "step": 651 }, { "entropy": 0.5815537869930267, "epoch": 0.6084927671488567, "grad_norm": 0.22093114256858826, "learning_rate": 0.0002, "loss": 0.5931, "mean_token_accuracy": 0.7588611543178558, "num_tokens": 2360973.0, "step": 652 }, { "entropy": 0.6157013326883316, "epoch": 0.6094260382641157, "grad_norm": 0.26912441849708557, "learning_rate": 0.0002, "loss": 0.6238, "mean_token_accuracy": 0.7495449632406235, "num_tokens": 2364569.0, "step": 653 }, { "entropy": 0.5990114957094193, "epoch": 0.6103593093793747, "grad_norm": 0.2411167323589325, "learning_rate": 0.0002, "loss": 0.5998, "mean_token_accuracy": 0.7571083307266235, "num_tokens": 2368274.0, "step": 654 }, { "entropy": 0.5809359848499298, "epoch": 0.6112925804946336, "grad_norm": 0.19493885338306427, "learning_rate": 0.0002, "loss": 0.5829, "mean_token_accuracy": 0.7607181221246719, "num_tokens": 2371983.0, "step": 655 }, { "entropy": 0.6119157373905182, "epoch": 0.6122258516098926, "grad_norm": 0.22109173238277435, "learning_rate": 0.0002, "loss": 0.6267, "mean_token_accuracy": 0.7471954077482224, "num_tokens": 2375553.0, "step": 656 }, { "entropy": 0.6157381385564804, "epoch": 0.6131591227251516, "grad_norm": 0.21985799074172974, "learning_rate": 0.0002, "loss": 0.629, "mean_token_accuracy": 0.7413774281740189, "num_tokens": 2379198.0, "step": 657 }, { "entropy": 0.6478002518415451, "epoch": 0.6140923938404106, "grad_norm": 0.20938728749752045, "learning_rate": 0.0002, "loss": 0.6407, "mean_token_accuracy": 0.7399473041296005, "num_tokens": 2382883.0, "step": 658 }, { "entropy": 0.6286050975322723, "epoch": 0.6150256649556696, "grad_norm": 0.1887018382549286, "learning_rate": 0.0002, "loss": 0.6363, "mean_token_accuracy": 0.7378429919481277, "num_tokens": 2386541.0, "step": 659 }, { "entropy": 0.5645611733198166, "epoch": 0.6159589360709286, "grad_norm": 0.31708139181137085, "learning_rate": 0.0002, "loss": 0.5655, "mean_token_accuracy": 0.7734876126050949, "num_tokens": 2390084.0, "step": 660 }, { "entropy": 0.613273099064827, "epoch": 0.6168922071861875, "grad_norm": 0.28384557366371155, "learning_rate": 0.0002, "loss": 0.6077, "mean_token_accuracy": 0.7596790790557861, "num_tokens": 2393709.0, "step": 661 }, { "entropy": 0.6195472925901413, "epoch": 0.6178254783014465, "grad_norm": 0.22314175963401794, "learning_rate": 0.0002, "loss": 0.6219, "mean_token_accuracy": 0.751022607088089, "num_tokens": 2397424.0, "step": 662 }, { "entropy": 0.5720948725938797, "epoch": 0.6187587494167055, "grad_norm": 0.2165556252002716, "learning_rate": 0.0002, "loss": 0.58, "mean_token_accuracy": 0.7641439586877823, "num_tokens": 2401109.0, "step": 663 }, { "entropy": 0.5590943843126297, "epoch": 0.6196920205319645, "grad_norm": 0.23308061063289642, "learning_rate": 0.0002, "loss": 0.5738, "mean_token_accuracy": 0.7659985572099686, "num_tokens": 2404762.0, "step": 664 }, { "entropy": 0.6140776723623276, "epoch": 0.6206252916472235, "grad_norm": 0.32725295424461365, "learning_rate": 0.0002, "loss": 0.6334, "mean_token_accuracy": 0.745684802532196, "num_tokens": 2408450.0, "step": 665 }, { "entropy": 0.5905702710151672, "epoch": 0.6215585627624824, "grad_norm": 0.2581106722354889, "learning_rate": 0.0002, "loss": 0.5925, "mean_token_accuracy": 0.7611397802829742, "num_tokens": 2412047.0, "step": 666 }, { "entropy": 0.589358851313591, "epoch": 0.6224918338777414, "grad_norm": 0.23935237526893616, "learning_rate": 0.0002, "loss": 0.5833, "mean_token_accuracy": 0.7644881457090378, "num_tokens": 2415617.0, "step": 667 }, { "entropy": 0.6031628847122192, "epoch": 0.6234251049930004, "grad_norm": 0.2905412018299103, "learning_rate": 0.0002, "loss": 0.6073, "mean_token_accuracy": 0.7539803087711334, "num_tokens": 2419118.0, "step": 668 }, { "entropy": 0.6366724818944931, "epoch": 0.6243583761082594, "grad_norm": 0.23619675636291504, "learning_rate": 0.0002, "loss": 0.6381, "mean_token_accuracy": 0.7457192242145538, "num_tokens": 2422920.0, "step": 669 }, { "entropy": 0.6264664530754089, "epoch": 0.6252916472235185, "grad_norm": 0.24389860033988953, "learning_rate": 0.0002, "loss": 0.6296, "mean_token_accuracy": 0.7466477006673813, "num_tokens": 2426584.0, "step": 670 }, { "entropy": 0.6143964380025864, "epoch": 0.6262249183387775, "grad_norm": 0.20042724907398224, "learning_rate": 0.0002, "loss": 0.6209, "mean_token_accuracy": 0.7501610368490219, "num_tokens": 2430274.0, "step": 671 }, { "entropy": 0.5861894339323044, "epoch": 0.6271581894540365, "grad_norm": 0.2558136284351349, "learning_rate": 0.0002, "loss": 0.6035, "mean_token_accuracy": 0.7569135129451752, "num_tokens": 2433755.0, "step": 672 }, { "entropy": 0.6133661270141602, "epoch": 0.6280914605692954, "grad_norm": 0.26754269003868103, "learning_rate": 0.0002, "loss": 0.6227, "mean_token_accuracy": 0.7464321851730347, "num_tokens": 2437353.0, "step": 673 }, { "entropy": 0.6073099821805954, "epoch": 0.6290247316845544, "grad_norm": 0.22355039417743683, "learning_rate": 0.0002, "loss": 0.6042, "mean_token_accuracy": 0.7551224231719971, "num_tokens": 2441026.0, "step": 674 }, { "entropy": 0.6019643247127533, "epoch": 0.6299580027998134, "grad_norm": 0.25388360023498535, "learning_rate": 0.0002, "loss": 0.5958, "mean_token_accuracy": 0.7601454854011536, "num_tokens": 2444706.0, "step": 675 }, { "entropy": 0.6205550283193588, "epoch": 0.6308912739150724, "grad_norm": 0.35187047719955444, "learning_rate": 0.0002, "loss": 0.6314, "mean_token_accuracy": 0.747304767370224, "num_tokens": 2448337.0, "step": 676 }, { "entropy": 0.5971081852912903, "epoch": 0.6318245450303314, "grad_norm": 0.2167615294456482, "learning_rate": 0.0002, "loss": 0.5939, "mean_token_accuracy": 0.7570333331823349, "num_tokens": 2451912.0, "step": 677 }, { "entropy": 0.6111015677452087, "epoch": 0.6327578161455903, "grad_norm": 0.24324844777584076, "learning_rate": 0.0002, "loss": 0.6066, "mean_token_accuracy": 0.7469299882650375, "num_tokens": 2455654.0, "step": 678 }, { "entropy": 0.6344803422689438, "epoch": 0.6336910872608493, "grad_norm": 0.2246641218662262, "learning_rate": 0.0002, "loss": 0.6294, "mean_token_accuracy": 0.7513847798109055, "num_tokens": 2459294.0, "step": 679 }, { "entropy": 0.6234834492206573, "epoch": 0.6346243583761083, "grad_norm": 0.2139703333377838, "learning_rate": 0.0002, "loss": 0.6247, "mean_token_accuracy": 0.7474209368228912, "num_tokens": 2462796.0, "step": 680 }, { "entropy": 0.6179849207401276, "epoch": 0.6355576294913673, "grad_norm": 0.3953574299812317, "learning_rate": 0.0002, "loss": 0.6252, "mean_token_accuracy": 0.7448330521583557, "num_tokens": 2466624.0, "step": 681 }, { "entropy": 0.6191472411155701, "epoch": 0.6364909006066263, "grad_norm": 0.2590923607349396, "learning_rate": 0.0002, "loss": 0.6309, "mean_token_accuracy": 0.7436940222978592, "num_tokens": 2470203.0, "step": 682 }, { "entropy": 0.576295793056488, "epoch": 0.6374241717218853, "grad_norm": 0.25879356265068054, "learning_rate": 0.0002, "loss": 0.5871, "mean_token_accuracy": 0.7600140571594238, "num_tokens": 2473779.0, "step": 683 }, { "entropy": 0.6275065690279007, "epoch": 0.6383574428371442, "grad_norm": 0.2714138925075531, "learning_rate": 0.0002, "loss": 0.6366, "mean_token_accuracy": 0.7441595643758774, "num_tokens": 2477432.0, "step": 684 }, { "entropy": 0.6261069178581238, "epoch": 0.6392907139524032, "grad_norm": 0.24463024735450745, "learning_rate": 0.0002, "loss": 0.6308, "mean_token_accuracy": 0.7463856041431427, "num_tokens": 2480944.0, "step": 685 }, { "entropy": 0.6282495409250259, "epoch": 0.6402239850676622, "grad_norm": 0.2406713366508484, "learning_rate": 0.0002, "loss": 0.6261, "mean_token_accuracy": 0.7425217032432556, "num_tokens": 2484582.0, "step": 686 }, { "entropy": 0.6366893500089645, "epoch": 0.6411572561829212, "grad_norm": 0.2172444462776184, "learning_rate": 0.0002, "loss": 0.6342, "mean_token_accuracy": 0.7403228431940079, "num_tokens": 2488155.0, "step": 687 }, { "entropy": 0.6011794358491898, "epoch": 0.6420905272981802, "grad_norm": 0.19785374402999878, "learning_rate": 0.0002, "loss": 0.606, "mean_token_accuracy": 0.7523338347673416, "num_tokens": 2491789.0, "step": 688 }, { "entropy": 0.6019739657640457, "epoch": 0.6430237984134392, "grad_norm": 0.2109215408563614, "learning_rate": 0.0002, "loss": 0.6037, "mean_token_accuracy": 0.7530991733074188, "num_tokens": 2495393.0, "step": 689 }, { "entropy": 0.5906093865633011, "epoch": 0.6439570695286981, "grad_norm": 0.2213364690542221, "learning_rate": 0.0002, "loss": 0.5969, "mean_token_accuracy": 0.7598689645528793, "num_tokens": 2498848.0, "step": 690 }, { "entropy": 0.5982709228992462, "epoch": 0.6448903406439571, "grad_norm": 0.19759494066238403, "learning_rate": 0.0002, "loss": 0.5934, "mean_token_accuracy": 0.7604715675115585, "num_tokens": 2502638.0, "step": 691 }, { "entropy": 0.5572324171662331, "epoch": 0.6458236117592161, "grad_norm": 0.18897955119609833, "learning_rate": 0.0002, "loss": 0.5567, "mean_token_accuracy": 0.7753035426139832, "num_tokens": 2506203.0, "step": 692 }, { "entropy": 0.6309312433004379, "epoch": 0.6467568828744751, "grad_norm": 0.22637969255447388, "learning_rate": 0.0002, "loss": 0.6276, "mean_token_accuracy": 0.7458716481924057, "num_tokens": 2509859.0, "step": 693 }, { "entropy": 0.6184558272361755, "epoch": 0.6476901539897341, "grad_norm": 0.2231743335723877, "learning_rate": 0.0002, "loss": 0.6257, "mean_token_accuracy": 0.7423657178878784, "num_tokens": 2513417.0, "step": 694 }, { "entropy": 0.6361770331859589, "epoch": 0.648623425104993, "grad_norm": 0.21766285598278046, "learning_rate": 0.0002, "loss": 0.6433, "mean_token_accuracy": 0.7414954453706741, "num_tokens": 2517189.0, "step": 695 }, { "entropy": 0.6137528121471405, "epoch": 0.649556696220252, "grad_norm": 0.24917960166931152, "learning_rate": 0.0002, "loss": 0.6288, "mean_token_accuracy": 0.7475361675024033, "num_tokens": 2520809.0, "step": 696 }, { "entropy": 0.6121505349874496, "epoch": 0.650489967335511, "grad_norm": 0.24639782309532166, "learning_rate": 0.0002, "loss": 0.6145, "mean_token_accuracy": 0.7467421293258667, "num_tokens": 2524499.0, "step": 697 }, { "entropy": 0.5644374489784241, "epoch": 0.65142323845077, "grad_norm": 0.19823738932609558, "learning_rate": 0.0002, "loss": 0.5719, "mean_token_accuracy": 0.7662626057863235, "num_tokens": 2528043.0, "step": 698 }, { "entropy": 0.5904609113931656, "epoch": 0.652356509566029, "grad_norm": 0.20159132778644562, "learning_rate": 0.0002, "loss": 0.5927, "mean_token_accuracy": 0.760688066482544, "num_tokens": 2531677.0, "step": 699 }, { "entropy": 0.648699015378952, "epoch": 0.653289780681288, "grad_norm": 0.20596207678318024, "learning_rate": 0.0002, "loss": 0.6444, "mean_token_accuracy": 0.7384260147809982, "num_tokens": 2535494.0, "step": 700 }, { "entropy": 0.5941232293844223, "epoch": 0.6542230517965469, "grad_norm": 0.2672054171562195, "learning_rate": 0.0002, "loss": 0.6025, "mean_token_accuracy": 0.7573621720075607, "num_tokens": 2539006.0, "step": 701 }, { "entropy": 0.5692880004644394, "epoch": 0.6551563229118059, "grad_norm": 0.25068607926368713, "learning_rate": 0.0002, "loss": 0.5669, "mean_token_accuracy": 0.776006668806076, "num_tokens": 2542775.0, "step": 702 }, { "entropy": 0.6404178738594055, "epoch": 0.6560895940270649, "grad_norm": 0.24959731101989746, "learning_rate": 0.0002, "loss": 0.6586, "mean_token_accuracy": 0.7338258922100067, "num_tokens": 2546422.0, "step": 703 }, { "entropy": 0.6223199963569641, "epoch": 0.6570228651423239, "grad_norm": 0.21638977527618408, "learning_rate": 0.0002, "loss": 0.6264, "mean_token_accuracy": 0.7452749460935593, "num_tokens": 2550086.0, "step": 704 }, { "entropy": 0.6482072323560715, "epoch": 0.6579561362575829, "grad_norm": 0.23552963137626648, "learning_rate": 0.0002, "loss": 0.66, "mean_token_accuracy": 0.7322734594345093, "num_tokens": 2553857.0, "step": 705 }, { "entropy": 0.6450587213039398, "epoch": 0.6588894073728419, "grad_norm": 0.21287326514720917, "learning_rate": 0.0002, "loss": 0.6468, "mean_token_accuracy": 0.7421576976776123, "num_tokens": 2557593.0, "step": 706 }, { "entropy": 0.6133073270320892, "epoch": 0.6598226784881008, "grad_norm": 0.23523929715156555, "learning_rate": 0.0002, "loss": 0.618, "mean_token_accuracy": 0.7495991885662079, "num_tokens": 2561226.0, "step": 707 }, { "entropy": 0.6570499837398529, "epoch": 0.6607559496033598, "grad_norm": 0.21513301134109497, "learning_rate": 0.0002, "loss": 0.6654, "mean_token_accuracy": 0.7308409959077835, "num_tokens": 2564846.0, "step": 708 }, { "entropy": 0.5902947038412094, "epoch": 0.6616892207186188, "grad_norm": 0.23836642503738403, "learning_rate": 0.0002, "loss": 0.6019, "mean_token_accuracy": 0.7566909939050674, "num_tokens": 2568381.0, "step": 709 }, { "entropy": 0.5878032818436623, "epoch": 0.6626224918338778, "grad_norm": 0.2134702503681183, "learning_rate": 0.0002, "loss": 0.5994, "mean_token_accuracy": 0.754427433013916, "num_tokens": 2572032.0, "step": 710 }, { "entropy": 0.6335567384958267, "epoch": 0.6635557629491368, "grad_norm": 0.1973092257976532, "learning_rate": 0.0002, "loss": 0.6241, "mean_token_accuracy": 0.7452935576438904, "num_tokens": 2575789.0, "step": 711 }, { "entropy": 0.6317075490951538, "epoch": 0.6644890340643957, "grad_norm": 0.18483944237232208, "learning_rate": 0.0002, "loss": 0.6345, "mean_token_accuracy": 0.7461477518081665, "num_tokens": 2579504.0, "step": 712 }, { "entropy": 0.568916067481041, "epoch": 0.6654223051796547, "grad_norm": 0.22887572646141052, "learning_rate": 0.0002, "loss": 0.5638, "mean_token_accuracy": 0.7729592174291611, "num_tokens": 2583258.0, "step": 713 }, { "entropy": 0.614827573299408, "epoch": 0.6663555762949137, "grad_norm": 0.24430212378501892, "learning_rate": 0.0002, "loss": 0.6138, "mean_token_accuracy": 0.7459622621536255, "num_tokens": 2586825.0, "step": 714 }, { "entropy": 0.6056788116693497, "epoch": 0.6672888474101727, "grad_norm": 0.21105118095874786, "learning_rate": 0.0002, "loss": 0.6135, "mean_token_accuracy": 0.7518429160118103, "num_tokens": 2590362.0, "step": 715 }, { "entropy": 0.6302312463521957, "epoch": 0.6682221185254317, "grad_norm": 0.22846098244190216, "learning_rate": 0.0002, "loss": 0.638, "mean_token_accuracy": 0.7401278167963028, "num_tokens": 2593925.0, "step": 716 }, { "entropy": 0.6464699357748032, "epoch": 0.6691553896406907, "grad_norm": 0.2274424433708191, "learning_rate": 0.0002, "loss": 0.6472, "mean_token_accuracy": 0.7377369403839111, "num_tokens": 2597500.0, "step": 717 }, { "entropy": 0.6142957359552383, "epoch": 0.6700886607559496, "grad_norm": 0.2468862682580948, "learning_rate": 0.0002, "loss": 0.6174, "mean_token_accuracy": 0.751122236251831, "num_tokens": 2601151.0, "step": 718 }, { "entropy": 0.6069158613681793, "epoch": 0.6710219318712086, "grad_norm": 0.3589651882648468, "learning_rate": 0.0002, "loss": 0.6234, "mean_token_accuracy": 0.7438846528530121, "num_tokens": 2604713.0, "step": 719 }, { "entropy": 0.5838519036769867, "epoch": 0.6719552029864676, "grad_norm": 0.22281858325004578, "learning_rate": 0.0002, "loss": 0.5975, "mean_token_accuracy": 0.756470188498497, "num_tokens": 2608212.0, "step": 720 }, { "entropy": 0.5735815763473511, "epoch": 0.6728884741017266, "grad_norm": 0.23652467131614685, "learning_rate": 0.0002, "loss": 0.5799, "mean_token_accuracy": 0.7622499912977219, "num_tokens": 2611919.0, "step": 721 }, { "entropy": 0.5835630148649216, "epoch": 0.6738217452169856, "grad_norm": 0.2399749457836151, "learning_rate": 0.0002, "loss": 0.597, "mean_token_accuracy": 0.7605995684862137, "num_tokens": 2615413.0, "step": 722 }, { "entropy": 0.643311083316803, "epoch": 0.6747550163322446, "grad_norm": 0.20864297449588776, "learning_rate": 0.0002, "loss": 0.6424, "mean_token_accuracy": 0.7390967756509781, "num_tokens": 2619033.0, "step": 723 }, { "entropy": 0.6258364766836166, "epoch": 0.6756882874475035, "grad_norm": 0.21454203128814697, "learning_rate": 0.0002, "loss": 0.6232, "mean_token_accuracy": 0.7499982863664627, "num_tokens": 2622720.0, "step": 724 }, { "entropy": 0.6384877115488052, "epoch": 0.6766215585627625, "grad_norm": 0.24696020781993866, "learning_rate": 0.0002, "loss": 0.6321, "mean_token_accuracy": 0.7450248450040817, "num_tokens": 2626385.0, "step": 725 }, { "entropy": 0.6402577608823776, "epoch": 0.6775548296780215, "grad_norm": 0.1911025494337082, "learning_rate": 0.0002, "loss": 0.6453, "mean_token_accuracy": 0.7347097098827362, "num_tokens": 2630028.0, "step": 726 }, { "entropy": 0.6799449026584625, "epoch": 0.6784881007932805, "grad_norm": 0.20395633578300476, "learning_rate": 0.0002, "loss": 0.6699, "mean_token_accuracy": 0.734823077917099, "num_tokens": 2633788.0, "step": 727 }, { "entropy": 0.6058418899774551, "epoch": 0.6794213719085395, "grad_norm": 0.22931045293807983, "learning_rate": 0.0002, "loss": 0.606, "mean_token_accuracy": 0.756956622004509, "num_tokens": 2637362.0, "step": 728 }, { "entropy": 0.6129723489284515, "epoch": 0.6803546430237984, "grad_norm": 0.20009322464466095, "learning_rate": 0.0002, "loss": 0.617, "mean_token_accuracy": 0.7480470687150955, "num_tokens": 2640880.0, "step": 729 }, { "entropy": 0.61861951649189, "epoch": 0.6812879141390574, "grad_norm": 0.22056905925273895, "learning_rate": 0.0002, "loss": 0.612, "mean_token_accuracy": 0.7526113986968994, "num_tokens": 2644509.0, "step": 730 }, { "entropy": 0.6012367308139801, "epoch": 0.6822211852543164, "grad_norm": 0.22392946481704712, "learning_rate": 0.0002, "loss": 0.6097, "mean_token_accuracy": 0.7599274218082428, "num_tokens": 2648089.0, "step": 731 }, { "entropy": 0.5948634445667267, "epoch": 0.6831544563695754, "grad_norm": 0.22807234525680542, "learning_rate": 0.0002, "loss": 0.6047, "mean_token_accuracy": 0.7582634091377258, "num_tokens": 2651619.0, "step": 732 }, { "entropy": 0.6397681683301926, "epoch": 0.6840877274848344, "grad_norm": 0.21628515422344208, "learning_rate": 0.0002, "loss": 0.6476, "mean_token_accuracy": 0.738898977637291, "num_tokens": 2655259.0, "step": 733 }, { "entropy": 0.5851717740297318, "epoch": 0.6850209986000934, "grad_norm": 0.21289727091789246, "learning_rate": 0.0002, "loss": 0.6058, "mean_token_accuracy": 0.7568319141864777, "num_tokens": 2659045.0, "step": 734 }, { "entropy": 0.6276812106370926, "epoch": 0.6859542697153523, "grad_norm": 0.24025756120681763, "learning_rate": 0.0002, "loss": 0.6401, "mean_token_accuracy": 0.7474641650915146, "num_tokens": 2662561.0, "step": 735 }, { "entropy": 0.615345686674118, "epoch": 0.6868875408306113, "grad_norm": 0.22318992018699646, "learning_rate": 0.0002, "loss": 0.6178, "mean_token_accuracy": 0.7470707297325134, "num_tokens": 2666197.0, "step": 736 }, { "entropy": 0.6041459292173386, "epoch": 0.6878208119458703, "grad_norm": 0.21701817214488983, "learning_rate": 0.0002, "loss": 0.6018, "mean_token_accuracy": 0.7545884400606155, "num_tokens": 2669778.0, "step": 737 }, { "entropy": 0.6169263273477554, "epoch": 0.6887540830611293, "grad_norm": 0.2152700573205948, "learning_rate": 0.0002, "loss": 0.617, "mean_token_accuracy": 0.7519784867763519, "num_tokens": 2673458.0, "step": 738 }, { "entropy": 0.5772988498210907, "epoch": 0.6896873541763883, "grad_norm": 0.2399342805147171, "learning_rate": 0.0002, "loss": 0.5806, "mean_token_accuracy": 0.7698366045951843, "num_tokens": 2677044.0, "step": 739 }, { "entropy": 0.6136231422424316, "epoch": 0.6906206252916472, "grad_norm": 0.298306405544281, "learning_rate": 0.0002, "loss": 0.6335, "mean_token_accuracy": 0.749444305896759, "num_tokens": 2680644.0, "step": 740 }, { "entropy": 0.6270999610424042, "epoch": 0.6915538964069062, "grad_norm": 0.27888187766075134, "learning_rate": 0.0002, "loss": 0.6397, "mean_token_accuracy": 0.7419653683900833, "num_tokens": 2684184.0, "step": 741 }, { "entropy": 0.6137462109327316, "epoch": 0.6924871675221652, "grad_norm": 0.26204341650009155, "learning_rate": 0.0002, "loss": 0.6205, "mean_token_accuracy": 0.7491353750228882, "num_tokens": 2687976.0, "step": 742 }, { "entropy": 0.5677110254764557, "epoch": 0.6934204386374242, "grad_norm": 0.2243829071521759, "learning_rate": 0.0002, "loss": 0.5769, "mean_token_accuracy": 0.7662224918603897, "num_tokens": 2691616.0, "step": 743 }, { "entropy": 0.6506608128547668, "epoch": 0.6943537097526832, "grad_norm": 0.21948722004890442, "learning_rate": 0.0002, "loss": 0.6395, "mean_token_accuracy": 0.7395874112844467, "num_tokens": 2695288.0, "step": 744 }, { "entropy": 0.6195942163467407, "epoch": 0.6952869808679422, "grad_norm": 0.22717221081256866, "learning_rate": 0.0002, "loss": 0.6166, "mean_token_accuracy": 0.7498127818107605, "num_tokens": 2698917.0, "step": 745 }, { "entropy": 0.6251284182071686, "epoch": 0.6962202519832011, "grad_norm": 0.24616023898124695, "learning_rate": 0.0002, "loss": 0.6292, "mean_token_accuracy": 0.7421327829360962, "num_tokens": 2702590.0, "step": 746 }, { "entropy": 0.5931864529848099, "epoch": 0.6971535230984601, "grad_norm": 0.2340129166841507, "learning_rate": 0.0002, "loss": 0.6081, "mean_token_accuracy": 0.7521718591451645, "num_tokens": 2706168.0, "step": 747 }, { "entropy": 0.5955173671245575, "epoch": 0.6980867942137191, "grad_norm": 0.2234364002943039, "learning_rate": 0.0002, "loss": 0.5972, "mean_token_accuracy": 0.7606653720140457, "num_tokens": 2709730.0, "step": 748 }, { "entropy": 0.6409309357404709, "epoch": 0.6990200653289781, "grad_norm": 0.21460972726345062, "learning_rate": 0.0002, "loss": 0.6416, "mean_token_accuracy": 0.7394376993179321, "num_tokens": 2713353.0, "step": 749 }, { "entropy": 0.6467620432376862, "epoch": 0.6999533364442371, "grad_norm": 0.21031540632247925, "learning_rate": 0.0002, "loss": 0.6456, "mean_token_accuracy": 0.7365802824497223, "num_tokens": 2717024.0, "step": 750 }, { "entropy": 0.653753325343132, "epoch": 0.700886607559496, "grad_norm": 0.20517279207706451, "learning_rate": 0.0002, "loss": 0.6561, "mean_token_accuracy": 0.7355674058198929, "num_tokens": 2720670.0, "step": 751 }, { "entropy": 0.6247546970844269, "epoch": 0.701819878674755, "grad_norm": 0.22577428817749023, "learning_rate": 0.0002, "loss": 0.6259, "mean_token_accuracy": 0.7466522306203842, "num_tokens": 2724293.0, "step": 752 }, { "entropy": 0.606750562787056, "epoch": 0.702753149790014, "grad_norm": 0.21522288024425507, "learning_rate": 0.0002, "loss": 0.5988, "mean_token_accuracy": 0.7520420998334885, "num_tokens": 2728042.0, "step": 753 }, { "entropy": 0.6001033931970596, "epoch": 0.703686420905273, "grad_norm": 0.23636871576309204, "learning_rate": 0.0002, "loss": 0.609, "mean_token_accuracy": 0.7513014823198318, "num_tokens": 2731508.0, "step": 754 }, { "entropy": 0.6229653507471085, "epoch": 0.704619692020532, "grad_norm": 0.21418774127960205, "learning_rate": 0.0002, "loss": 0.6263, "mean_token_accuracy": 0.7503755837678909, "num_tokens": 2735241.0, "step": 755 }, { "entropy": 0.5490423291921616, "epoch": 0.705552963135791, "grad_norm": 0.2583906650543213, "learning_rate": 0.0002, "loss": 0.5625, "mean_token_accuracy": 0.7707652598619461, "num_tokens": 2738893.0, "step": 756 }, { "entropy": 0.631389930844307, "epoch": 0.70648623425105, "grad_norm": 0.22889924049377441, "learning_rate": 0.0002, "loss": 0.6292, "mean_token_accuracy": 0.7380217909812927, "num_tokens": 2742520.0, "step": 757 }, { "entropy": 0.625264048576355, "epoch": 0.7074195053663089, "grad_norm": 0.2556902766227722, "learning_rate": 0.0002, "loss": 0.6359, "mean_token_accuracy": 0.7418669909238815, "num_tokens": 2746044.0, "step": 758 }, { "entropy": 0.6469711810350418, "epoch": 0.7083527764815679, "grad_norm": 0.20394133031368256, "learning_rate": 0.0002, "loss": 0.6457, "mean_token_accuracy": 0.7393891364336014, "num_tokens": 2749706.0, "step": 759 }, { "entropy": 0.6507914811372757, "epoch": 0.7092860475968269, "grad_norm": 0.2516878843307495, "learning_rate": 0.0002, "loss": 0.6437, "mean_token_accuracy": 0.7384824007749557, "num_tokens": 2753418.0, "step": 760 }, { "entropy": 0.6525522470474243, "epoch": 0.7102193187120859, "grad_norm": 0.21053099632263184, "learning_rate": 0.0002, "loss": 0.6507, "mean_token_accuracy": 0.7348183989524841, "num_tokens": 2756961.0, "step": 761 }, { "entropy": 0.6174996048212051, "epoch": 0.7111525898273449, "grad_norm": 0.22050683200359344, "learning_rate": 0.0002, "loss": 0.6092, "mean_token_accuracy": 0.7504381537437439, "num_tokens": 2760569.0, "step": 762 }, { "entropy": 0.5941135436296463, "epoch": 0.7120858609426038, "grad_norm": 0.2966843545436859, "learning_rate": 0.0002, "loss": 0.6012, "mean_token_accuracy": 0.7547492384910583, "num_tokens": 2764272.0, "step": 763 }, { "entropy": 0.5710463672876358, "epoch": 0.7130191320578628, "grad_norm": 0.2636772692203522, "learning_rate": 0.0002, "loss": 0.5961, "mean_token_accuracy": 0.7620075941085815, "num_tokens": 2767896.0, "step": 764 }, { "entropy": 0.6023706942796707, "epoch": 0.7139524031731218, "grad_norm": 0.23153091967105865, "learning_rate": 0.0002, "loss": 0.6092, "mean_token_accuracy": 0.7467928528785706, "num_tokens": 2771519.0, "step": 765 }, { "entropy": 0.6218724846839905, "epoch": 0.7148856742883808, "grad_norm": 0.2603501081466675, "learning_rate": 0.0002, "loss": 0.6453, "mean_token_accuracy": 0.7354018837213516, "num_tokens": 2775054.0, "step": 766 }, { "entropy": 0.5883719772100449, "epoch": 0.7158189454036398, "grad_norm": 0.26542627811431885, "learning_rate": 0.0002, "loss": 0.6041, "mean_token_accuracy": 0.7542552649974823, "num_tokens": 2778467.0, "step": 767 }, { "entropy": 0.633525013923645, "epoch": 0.7167522165188988, "grad_norm": 0.23230710625648499, "learning_rate": 0.0002, "loss": 0.629, "mean_token_accuracy": 0.7426392883062363, "num_tokens": 2782110.0, "step": 768 }, { "entropy": 0.591704249382019, "epoch": 0.7176854876341577, "grad_norm": 0.19367174804210663, "learning_rate": 0.0002, "loss": 0.5903, "mean_token_accuracy": 0.7518965005874634, "num_tokens": 2785724.0, "step": 769 }, { "entropy": 0.6485718041658401, "epoch": 0.7186187587494167, "grad_norm": 0.21032461524009705, "learning_rate": 0.0002, "loss": 0.6307, "mean_token_accuracy": 0.7461829781532288, "num_tokens": 2789388.0, "step": 770 }, { "entropy": 0.6418161392211914, "epoch": 0.7195520298646757, "grad_norm": 0.2326827198266983, "learning_rate": 0.0002, "loss": 0.6305, "mean_token_accuracy": 0.7345519959926605, "num_tokens": 2793049.0, "step": 771 }, { "entropy": 0.6367700546979904, "epoch": 0.7204853009799347, "grad_norm": 0.2723037600517273, "learning_rate": 0.0002, "loss": 0.6375, "mean_token_accuracy": 0.740404024720192, "num_tokens": 2796627.0, "step": 772 }, { "entropy": 0.6344336122274399, "epoch": 0.7214185720951937, "grad_norm": 0.22182683646678925, "learning_rate": 0.0002, "loss": 0.6428, "mean_token_accuracy": 0.7349346280097961, "num_tokens": 2800268.0, "step": 773 }, { "entropy": 0.5889290869235992, "epoch": 0.7223518432104526, "grad_norm": 0.1868208348751068, "learning_rate": 0.0002, "loss": 0.584, "mean_token_accuracy": 0.764473482966423, "num_tokens": 2804002.0, "step": 774 }, { "entropy": 0.6165845096111298, "epoch": 0.7232851143257116, "grad_norm": 0.24451763927936554, "learning_rate": 0.0002, "loss": 0.6311, "mean_token_accuracy": 0.7440432608127594, "num_tokens": 2807559.0, "step": 775 }, { "entropy": 0.5660706609487534, "epoch": 0.7242183854409706, "grad_norm": 0.23919141292572021, "learning_rate": 0.0002, "loss": 0.5766, "mean_token_accuracy": 0.7642731368541718, "num_tokens": 2811219.0, "step": 776 }, { "entropy": 0.6118446290493011, "epoch": 0.7251516565562296, "grad_norm": 0.23280787467956543, "learning_rate": 0.0002, "loss": 0.6245, "mean_token_accuracy": 0.7447889596223831, "num_tokens": 2814812.0, "step": 777 }, { "entropy": 0.5897985249757767, "epoch": 0.7260849276714886, "grad_norm": 0.25252386927604675, "learning_rate": 0.0002, "loss": 0.6111, "mean_token_accuracy": 0.7480250746011734, "num_tokens": 2818399.0, "step": 778 }, { "entropy": 0.6056823581457138, "epoch": 0.7270181987867476, "grad_norm": 0.25293776392936707, "learning_rate": 0.0002, "loss": 0.6118, "mean_token_accuracy": 0.7510677129030228, "num_tokens": 2821978.0, "step": 779 }, { "entropy": 0.663695365190506, "epoch": 0.7279514699020065, "grad_norm": 0.23030032217502594, "learning_rate": 0.0002, "loss": 0.6651, "mean_token_accuracy": 0.7345515191555023, "num_tokens": 2825645.0, "step": 780 }, { "entropy": 0.6349863111972809, "epoch": 0.7288847410172655, "grad_norm": 0.1996770203113556, "learning_rate": 0.0002, "loss": 0.6269, "mean_token_accuracy": 0.7460969835519791, "num_tokens": 2829334.0, "step": 781 }, { "entropy": 0.6184824407100677, "epoch": 0.7298180121325245, "grad_norm": 0.18571926653385162, "learning_rate": 0.0002, "loss": 0.6058, "mean_token_accuracy": 0.7572721093893051, "num_tokens": 2833006.0, "step": 782 }, { "entropy": 0.5942963808774948, "epoch": 0.7307512832477835, "grad_norm": 0.21931974589824677, "learning_rate": 0.0002, "loss": 0.599, "mean_token_accuracy": 0.7596588283777237, "num_tokens": 2836678.0, "step": 783 }, { "entropy": 0.5949952006340027, "epoch": 0.7316845543630425, "grad_norm": 0.1998152732849121, "learning_rate": 0.0002, "loss": 0.5875, "mean_token_accuracy": 0.7599774450063705, "num_tokens": 2840299.0, "step": 784 }, { "entropy": 0.6336467266082764, "epoch": 0.7326178254783015, "grad_norm": 0.20147491991519928, "learning_rate": 0.0002, "loss": 0.6337, "mean_token_accuracy": 0.7462684363126755, "num_tokens": 2843944.0, "step": 785 }, { "entropy": 0.6184998601675034, "epoch": 0.7335510965935604, "grad_norm": 0.25427234172821045, "learning_rate": 0.0002, "loss": 0.6077, "mean_token_accuracy": 0.7566153258085251, "num_tokens": 2847524.0, "step": 786 }, { "entropy": 0.6319364011287689, "epoch": 0.7344843677088194, "grad_norm": 0.2255890965461731, "learning_rate": 0.0002, "loss": 0.6327, "mean_token_accuracy": 0.7465481162071228, "num_tokens": 2851205.0, "step": 787 }, { "entropy": 0.5993489027023315, "epoch": 0.7354176388240784, "grad_norm": 0.24607178568840027, "learning_rate": 0.0002, "loss": 0.6283, "mean_token_accuracy": 0.7453311234712601, "num_tokens": 2854805.0, "step": 788 }, { "entropy": 0.5859900712966919, "epoch": 0.7363509099393374, "grad_norm": 0.2805096507072449, "learning_rate": 0.0002, "loss": 0.6118, "mean_token_accuracy": 0.7541529089212418, "num_tokens": 2858362.0, "step": 789 }, { "entropy": 0.620645061135292, "epoch": 0.7372841810545964, "grad_norm": 0.22093228995800018, "learning_rate": 0.0002, "loss": 0.6297, "mean_token_accuracy": 0.7456287443637848, "num_tokens": 2861998.0, "step": 790 }, { "entropy": 0.6493832021951675, "epoch": 0.7382174521698553, "grad_norm": 0.26154443621635437, "learning_rate": 0.0002, "loss": 0.6431, "mean_token_accuracy": 0.7350076884031296, "num_tokens": 2865674.0, "step": 791 }, { "entropy": 0.6113140881061554, "epoch": 0.7391507232851143, "grad_norm": 0.1949017494916916, "learning_rate": 0.0002, "loss": 0.6036, "mean_token_accuracy": 0.7514502853155136, "num_tokens": 2869309.0, "step": 792 }, { "entropy": 0.6077594012022018, "epoch": 0.7400839944003733, "grad_norm": 0.2004355639219284, "learning_rate": 0.0002, "loss": 0.5994, "mean_token_accuracy": 0.7598496377468109, "num_tokens": 2872930.0, "step": 793 }, { "entropy": 0.6466297209262848, "epoch": 0.7410172655156323, "grad_norm": 0.2365228533744812, "learning_rate": 0.0002, "loss": 0.6391, "mean_token_accuracy": 0.7396881729364395, "num_tokens": 2876542.0, "step": 794 }, { "entropy": 0.6420071125030518, "epoch": 0.7419505366308913, "grad_norm": 0.2110598236322403, "learning_rate": 0.0002, "loss": 0.6421, "mean_token_accuracy": 0.7380658835172653, "num_tokens": 2880280.0, "step": 795 }, { "entropy": 0.6309136226773262, "epoch": 0.7428838077461503, "grad_norm": 0.20147623121738434, "learning_rate": 0.0002, "loss": 0.6349, "mean_token_accuracy": 0.7507063150405884, "num_tokens": 2884031.0, "step": 796 }, { "entropy": 0.5841285437345505, "epoch": 0.7438170788614092, "grad_norm": 0.2902391254901886, "learning_rate": 0.0002, "loss": 0.6116, "mean_token_accuracy": 0.7494097203016281, "num_tokens": 2887611.0, "step": 797 }, { "entropy": 0.6205988377332687, "epoch": 0.7447503499766682, "grad_norm": 0.2268381267786026, "learning_rate": 0.0002, "loss": 0.6324, "mean_token_accuracy": 0.7434268891811371, "num_tokens": 2891202.0, "step": 798 }, { "entropy": 0.583517462015152, "epoch": 0.7456836210919272, "grad_norm": 0.23363865911960602, "learning_rate": 0.0002, "loss": 0.6028, "mean_token_accuracy": 0.7548172473907471, "num_tokens": 2894813.0, "step": 799 }, { "entropy": 0.6290982812643051, "epoch": 0.7466168922071862, "grad_norm": 0.21497133374214172, "learning_rate": 0.0002, "loss": 0.6244, "mean_token_accuracy": 0.7498426139354706, "num_tokens": 2898523.0, "step": 800 }, { "entropy": 0.5929540991783142, "epoch": 0.7475501633224452, "grad_norm": 0.21653927862644196, "learning_rate": 0.0002, "loss": 0.5986, "mean_token_accuracy": 0.7555231302976608, "num_tokens": 2902102.0, "step": 801 }, { "entropy": 0.6338641941547394, "epoch": 0.7484834344377042, "grad_norm": 0.19927088916301727, "learning_rate": 0.0002, "loss": 0.624, "mean_token_accuracy": 0.7489189952611923, "num_tokens": 2905665.0, "step": 802 }, { "entropy": 0.5895919054746628, "epoch": 0.7494167055529631, "grad_norm": 0.2229759246110916, "learning_rate": 0.0002, "loss": 0.5886, "mean_token_accuracy": 0.7574741393327713, "num_tokens": 2909278.0, "step": 803 }, { "entropy": 0.6327093988656998, "epoch": 0.7503499766682221, "grad_norm": 0.2039036750793457, "learning_rate": 0.0002, "loss": 0.6352, "mean_token_accuracy": 0.7441875487565994, "num_tokens": 2912821.0, "step": 804 }, { "entropy": 0.5895678699016571, "epoch": 0.7512832477834811, "grad_norm": 0.2439955323934555, "learning_rate": 0.0002, "loss": 0.5993, "mean_token_accuracy": 0.7520964592695236, "num_tokens": 2916348.0, "step": 805 }, { "entropy": 0.628684937953949, "epoch": 0.7522165188987401, "grad_norm": 0.2508292496204376, "learning_rate": 0.0002, "loss": 0.6348, "mean_token_accuracy": 0.747513622045517, "num_tokens": 2919993.0, "step": 806 }, { "entropy": 0.6395972222089767, "epoch": 0.7531497900139991, "grad_norm": 0.25552865862846375, "learning_rate": 0.0002, "loss": 0.6353, "mean_token_accuracy": 0.7409296184778214, "num_tokens": 2923693.0, "step": 807 }, { "entropy": 0.6365974396467209, "epoch": 0.754083061129258, "grad_norm": 0.2445273995399475, "learning_rate": 0.0002, "loss": 0.6369, "mean_token_accuracy": 0.7352539300918579, "num_tokens": 2927272.0, "step": 808 }, { "entropy": 0.6141413003206253, "epoch": 0.755016332244517, "grad_norm": 0.20222562551498413, "learning_rate": 0.0002, "loss": 0.6165, "mean_token_accuracy": 0.7490103840827942, "num_tokens": 2931004.0, "step": 809 }, { "entropy": 0.5749493986368179, "epoch": 0.755949603359776, "grad_norm": 0.20349445939064026, "learning_rate": 0.0002, "loss": 0.5763, "mean_token_accuracy": 0.7633761763572693, "num_tokens": 2934794.0, "step": 810 }, { "entropy": 0.6065438836812973, "epoch": 0.756882874475035, "grad_norm": 0.2862829267978668, "learning_rate": 0.0002, "loss": 0.6185, "mean_token_accuracy": 0.7509150207042694, "num_tokens": 2938395.0, "step": 811 }, { "entropy": 0.6008400768041611, "epoch": 0.757816145590294, "grad_norm": 0.24169835448265076, "learning_rate": 0.0002, "loss": 0.6099, "mean_token_accuracy": 0.7591991126537323, "num_tokens": 2941943.0, "step": 812 }, { "entropy": 0.6168337166309357, "epoch": 0.758749416705553, "grad_norm": 0.2295503169298172, "learning_rate": 0.0002, "loss": 0.6192, "mean_token_accuracy": 0.7503000795841217, "num_tokens": 2945613.0, "step": 813 }, { "entropy": 0.6295826137065887, "epoch": 0.7596826878208119, "grad_norm": 0.21229860186576843, "learning_rate": 0.0002, "loss": 0.6308, "mean_token_accuracy": 0.7504187822341919, "num_tokens": 2949339.0, "step": 814 }, { "entropy": 0.6341367810964584, "epoch": 0.7606159589360709, "grad_norm": 0.3820662200450897, "learning_rate": 0.0002, "loss": 0.6497, "mean_token_accuracy": 0.7358937859535217, "num_tokens": 2952940.0, "step": 815 }, { "entropy": 0.5960005223751068, "epoch": 0.7615492300513299, "grad_norm": 0.22333920001983643, "learning_rate": 0.0002, "loss": 0.5957, "mean_token_accuracy": 0.7615475952625275, "num_tokens": 2956533.0, "step": 816 }, { "entropy": 0.6173315197229385, "epoch": 0.7624825011665889, "grad_norm": 0.25695475935935974, "learning_rate": 0.0002, "loss": 0.6267, "mean_token_accuracy": 0.7450265437364578, "num_tokens": 2960171.0, "step": 817 }, { "entropy": 0.607050895690918, "epoch": 0.7634157722818479, "grad_norm": 0.24567240476608276, "learning_rate": 0.0002, "loss": 0.6128, "mean_token_accuracy": 0.7517410814762115, "num_tokens": 2963668.0, "step": 818 }, { "entropy": 0.6338638961315155, "epoch": 0.7643490433971069, "grad_norm": 0.2520834505558014, "learning_rate": 0.0002, "loss": 0.6491, "mean_token_accuracy": 0.7383094131946564, "num_tokens": 2967430.0, "step": 819 }, { "entropy": 0.6049632132053375, "epoch": 0.7652823145123658, "grad_norm": 0.2329198718070984, "learning_rate": 0.0002, "loss": 0.5999, "mean_token_accuracy": 0.7547138035297394, "num_tokens": 2971023.0, "step": 820 }, { "entropy": 0.6150783151388168, "epoch": 0.7662155856276248, "grad_norm": 0.2571249306201935, "learning_rate": 0.0002, "loss": 0.6162, "mean_token_accuracy": 0.7541193068027496, "num_tokens": 2974547.0, "step": 821 }, { "entropy": 0.5554257929325104, "epoch": 0.7671488567428838, "grad_norm": 0.21770606935024261, "learning_rate": 0.0002, "loss": 0.5642, "mean_token_accuracy": 0.7715229988098145, "num_tokens": 2978235.0, "step": 822 }, { "entropy": 0.5639173239469528, "epoch": 0.7680821278581428, "grad_norm": 0.20327089726924896, "learning_rate": 0.0002, "loss": 0.5651, "mean_token_accuracy": 0.7725966721773148, "num_tokens": 2981795.0, "step": 823 }, { "entropy": 0.6197311729192734, "epoch": 0.7690153989734018, "grad_norm": 0.22860993444919586, "learning_rate": 0.0002, "loss": 0.6273, "mean_token_accuracy": 0.7490777671337128, "num_tokens": 2985455.0, "step": 824 }, { "entropy": 0.6236830949783325, "epoch": 0.7699486700886607, "grad_norm": 0.2224522978067398, "learning_rate": 0.0002, "loss": 0.6345, "mean_token_accuracy": 0.7415605783462524, "num_tokens": 2989120.0, "step": 825 }, { "entropy": 0.5998463481664658, "epoch": 0.7708819412039197, "grad_norm": 0.2201024442911148, "learning_rate": 0.0002, "loss": 0.5991, "mean_token_accuracy": 0.757529690861702, "num_tokens": 2992725.0, "step": 826 }, { "entropy": 0.5898505598306656, "epoch": 0.7718152123191787, "grad_norm": 0.2112148404121399, "learning_rate": 0.0002, "loss": 0.6011, "mean_token_accuracy": 0.7531297355890274, "num_tokens": 2996341.0, "step": 827 }, { "entropy": 0.6053341627120972, "epoch": 0.7727484834344377, "grad_norm": 0.18392470479011536, "learning_rate": 0.0002, "loss": 0.6121, "mean_token_accuracy": 0.750179648399353, "num_tokens": 3000058.0, "step": 828 }, { "entropy": 0.5880081355571747, "epoch": 0.7736817545496967, "grad_norm": 0.23983046412467957, "learning_rate": 0.0002, "loss": 0.5879, "mean_token_accuracy": 0.7629110515117645, "num_tokens": 3003674.0, "step": 829 }, { "entropy": 0.6144276410341263, "epoch": 0.7746150256649557, "grad_norm": 0.21617083251476288, "learning_rate": 0.0002, "loss": 0.6205, "mean_token_accuracy": 0.7507240027189255, "num_tokens": 3007241.0, "step": 830 }, { "entropy": 0.6114107966423035, "epoch": 0.7755482967802146, "grad_norm": 0.28308725357055664, "learning_rate": 0.0002, "loss": 0.621, "mean_token_accuracy": 0.7472745180130005, "num_tokens": 3011034.0, "step": 831 }, { "entropy": 0.5891468822956085, "epoch": 0.7764815678954736, "grad_norm": 0.2733062505722046, "learning_rate": 0.0002, "loss": 0.5871, "mean_token_accuracy": 0.7633031606674194, "num_tokens": 3014593.0, "step": 832 }, { "entropy": 0.5954455137252808, "epoch": 0.7774148390107326, "grad_norm": 0.21611051261425018, "learning_rate": 0.0002, "loss": 0.6007, "mean_token_accuracy": 0.7576207220554352, "num_tokens": 3018092.0, "step": 833 }, { "entropy": 0.6075674444437027, "epoch": 0.7783481101259916, "grad_norm": 0.26744627952575684, "learning_rate": 0.0002, "loss": 0.6119, "mean_token_accuracy": 0.7553070187568665, "num_tokens": 3021579.0, "step": 834 }, { "entropy": 0.6428234577178955, "epoch": 0.7792813812412506, "grad_norm": 0.22622142732143402, "learning_rate": 0.0002, "loss": 0.6415, "mean_token_accuracy": 0.739069327712059, "num_tokens": 3025251.0, "step": 835 }, { "entropy": 0.6473434567451477, "epoch": 0.7802146523565096, "grad_norm": 0.21322830021381378, "learning_rate": 0.0002, "loss": 0.6441, "mean_token_accuracy": 0.7423209846019745, "num_tokens": 3028843.0, "step": 836 }, { "entropy": 0.6190459281206131, "epoch": 0.7811479234717685, "grad_norm": 0.21388107538223267, "learning_rate": 0.0002, "loss": 0.6134, "mean_token_accuracy": 0.751074880361557, "num_tokens": 3032567.0, "step": 837 }, { "entropy": 0.6468786597251892, "epoch": 0.7820811945870275, "grad_norm": 0.21758686006069183, "learning_rate": 0.0002, "loss": 0.6418, "mean_token_accuracy": 0.7424664199352264, "num_tokens": 3036224.0, "step": 838 }, { "entropy": 0.5654818341135979, "epoch": 0.7830144657022865, "grad_norm": 0.17668989300727844, "learning_rate": 0.0002, "loss": 0.5742, "mean_token_accuracy": 0.7662543505430222, "num_tokens": 3039750.0, "step": 839 }, { "entropy": 0.5952781140804291, "epoch": 0.7839477368175455, "grad_norm": 0.21546433866024017, "learning_rate": 0.0002, "loss": 0.6042, "mean_token_accuracy": 0.7589846700429916, "num_tokens": 3043328.0, "step": 840 }, { "entropy": 0.5765703171491623, "epoch": 0.7848810079328045, "grad_norm": 0.20405177772045135, "learning_rate": 0.0002, "loss": 0.5774, "mean_token_accuracy": 0.7581525892019272, "num_tokens": 3046954.0, "step": 841 }, { "entropy": 0.5834700465202332, "epoch": 0.7858142790480634, "grad_norm": 0.21407265961170197, "learning_rate": 0.0002, "loss": 0.5966, "mean_token_accuracy": 0.7586976140737534, "num_tokens": 3050570.0, "step": 842 }, { "entropy": 0.6121360659599304, "epoch": 0.7867475501633224, "grad_norm": 0.30305036902427673, "learning_rate": 0.0002, "loss": 0.6255, "mean_token_accuracy": 0.7438352108001709, "num_tokens": 3054130.0, "step": 843 }, { "entropy": 0.6392316669225693, "epoch": 0.7876808212785814, "grad_norm": 0.22084417939186096, "learning_rate": 0.0002, "loss": 0.6375, "mean_token_accuracy": 0.7484669238328934, "num_tokens": 3057880.0, "step": 844 }, { "entropy": 0.5887109041213989, "epoch": 0.7886140923938404, "grad_norm": 0.2783225476741791, "learning_rate": 0.0002, "loss": 0.5929, "mean_token_accuracy": 0.7576820403337479, "num_tokens": 3061491.0, "step": 845 }, { "entropy": 0.5568461120128632, "epoch": 0.7895473635090994, "grad_norm": 0.20779460668563843, "learning_rate": 0.0002, "loss": 0.5596, "mean_token_accuracy": 0.7746846377849579, "num_tokens": 3065194.0, "step": 846 }, { "entropy": 0.6076231002807617, "epoch": 0.7904806346243584, "grad_norm": 0.2635418474674225, "learning_rate": 0.0002, "loss": 0.6047, "mean_token_accuracy": 0.760390892624855, "num_tokens": 3068785.0, "step": 847 }, { "entropy": 0.626486137509346, "epoch": 0.7914139057396173, "grad_norm": 0.21745650470256805, "learning_rate": 0.0002, "loss": 0.6244, "mean_token_accuracy": 0.7553741037845612, "num_tokens": 3072434.0, "step": 848 }, { "entropy": 0.5968287587165833, "epoch": 0.7923471768548763, "grad_norm": 0.2624586820602417, "learning_rate": 0.0002, "loss": 0.608, "mean_token_accuracy": 0.7576701194047928, "num_tokens": 3076028.0, "step": 849 }, { "entropy": 0.6264362335205078, "epoch": 0.7932804479701353, "grad_norm": 0.22909067571163177, "learning_rate": 0.0002, "loss": 0.6382, "mean_token_accuracy": 0.738393560051918, "num_tokens": 3079759.0, "step": 850 }, { "entropy": 0.6235905140638351, "epoch": 0.7942137190853943, "grad_norm": 0.24237309396266937, "learning_rate": 0.0002, "loss": 0.6334, "mean_token_accuracy": 0.7477540224790573, "num_tokens": 3083285.0, "step": 851 }, { "entropy": 0.6238450706005096, "epoch": 0.7951469902006533, "grad_norm": 0.21118971705436707, "learning_rate": 0.0002, "loss": 0.6354, "mean_token_accuracy": 0.745773509144783, "num_tokens": 3086889.0, "step": 852 }, { "entropy": 0.6531502157449722, "epoch": 0.7960802613159123, "grad_norm": 0.21507324278354645, "learning_rate": 0.0002, "loss": 0.6598, "mean_token_accuracy": 0.732495978474617, "num_tokens": 3090472.0, "step": 853 }, { "entropy": 0.5605291351675987, "epoch": 0.7970135324311712, "grad_norm": 0.23387844860553741, "learning_rate": 0.0002, "loss": 0.5587, "mean_token_accuracy": 0.7719616591930389, "num_tokens": 3094031.0, "step": 854 }, { "entropy": 0.5812034010887146, "epoch": 0.7979468035464302, "grad_norm": 0.2201613336801529, "learning_rate": 0.0002, "loss": 0.5932, "mean_token_accuracy": 0.7563750296831131, "num_tokens": 3097553.0, "step": 855 }, { "entropy": 0.6254999935626984, "epoch": 0.7988800746616892, "grad_norm": 0.21238970756530762, "learning_rate": 0.0002, "loss": 0.6304, "mean_token_accuracy": 0.7459723055362701, "num_tokens": 3101153.0, "step": 856 }, { "entropy": 0.6370571702718735, "epoch": 0.7998133457769482, "grad_norm": 0.21427470445632935, "learning_rate": 0.0002, "loss": 0.6266, "mean_token_accuracy": 0.7501794248819351, "num_tokens": 3104811.0, "step": 857 }, { "entropy": 0.6013757735490799, "epoch": 0.8007466168922072, "grad_norm": 0.19768455624580383, "learning_rate": 0.0002, "loss": 0.6025, "mean_token_accuracy": 0.754877582192421, "num_tokens": 3108501.0, "step": 858 }, { "entropy": 0.5956685841083527, "epoch": 0.8016798880074661, "grad_norm": 0.21123406291007996, "learning_rate": 0.0002, "loss": 0.5962, "mean_token_accuracy": 0.7554425299167633, "num_tokens": 3112177.0, "step": 859 }, { "entropy": 0.6588978320360184, "epoch": 0.8026131591227251, "grad_norm": 0.23048549890518188, "learning_rate": 0.0002, "loss": 0.6734, "mean_token_accuracy": 0.7298877686262131, "num_tokens": 3115835.0, "step": 860 }, { "entropy": 0.6192128211259842, "epoch": 0.8035464302379841, "grad_norm": 0.20475855469703674, "learning_rate": 0.0002, "loss": 0.6137, "mean_token_accuracy": 0.7555033266544342, "num_tokens": 3119509.0, "step": 861 }, { "entropy": 0.6092359870672226, "epoch": 0.8044797013532431, "grad_norm": 0.23329445719718933, "learning_rate": 0.0002, "loss": 0.6201, "mean_token_accuracy": 0.7425690144300461, "num_tokens": 3123166.0, "step": 862 }, { "entropy": 0.6245740950107574, "epoch": 0.8054129724685021, "grad_norm": 0.21097496151924133, "learning_rate": 0.0002, "loss": 0.6291, "mean_token_accuracy": 0.7440453469753265, "num_tokens": 3126814.0, "step": 863 }, { "entropy": 0.6368090808391571, "epoch": 0.8063462435837611, "grad_norm": 0.2420298457145691, "learning_rate": 0.0002, "loss": 0.6258, "mean_token_accuracy": 0.7487890273332596, "num_tokens": 3130409.0, "step": 864 }, { "entropy": 0.6046779006719589, "epoch": 0.80727951469902, "grad_norm": 0.2202235609292984, "learning_rate": 0.0002, "loss": 0.6055, "mean_token_accuracy": 0.7597577571868896, "num_tokens": 3134046.0, "step": 865 }, { "entropy": 0.6033747643232346, "epoch": 0.808212785814279, "grad_norm": 0.21388396620750427, "learning_rate": 0.0002, "loss": 0.6105, "mean_token_accuracy": 0.7511253505945206, "num_tokens": 3137677.0, "step": 866 }, { "entropy": 0.619894340634346, "epoch": 0.809146056929538, "grad_norm": 0.22200167179107666, "learning_rate": 0.0002, "loss": 0.623, "mean_token_accuracy": 0.7523390352725983, "num_tokens": 3141297.0, "step": 867 }, { "entropy": 0.6186093688011169, "epoch": 0.810079328044797, "grad_norm": 0.21584483981132507, "learning_rate": 0.0002, "loss": 0.6261, "mean_token_accuracy": 0.7416420131921768, "num_tokens": 3144937.0, "step": 868 }, { "entropy": 0.672151118516922, "epoch": 0.811012599160056, "grad_norm": 0.2084781676530838, "learning_rate": 0.0002, "loss": 0.6688, "mean_token_accuracy": 0.7300055027008057, "num_tokens": 3148725.0, "step": 869 }, { "entropy": 0.6078613698482513, "epoch": 0.811945870275315, "grad_norm": 0.23573707044124603, "learning_rate": 0.0002, "loss": 0.6181, "mean_token_accuracy": 0.7460915446281433, "num_tokens": 3152411.0, "step": 870 }, { "entropy": 0.6117004752159119, "epoch": 0.8128791413905739, "grad_norm": 0.2143416553735733, "learning_rate": 0.0002, "loss": 0.5992, "mean_token_accuracy": 0.7615141719579697, "num_tokens": 3156154.0, "step": 871 }, { "entropy": 0.6065992712974548, "epoch": 0.8138124125058329, "grad_norm": 0.22896108031272888, "learning_rate": 0.0002, "loss": 0.6144, "mean_token_accuracy": 0.7511822432279587, "num_tokens": 3159764.0, "step": 872 }, { "entropy": 0.6134763658046722, "epoch": 0.8147456836210919, "grad_norm": 0.23022955656051636, "learning_rate": 0.0002, "loss": 0.6269, "mean_token_accuracy": 0.7463437914848328, "num_tokens": 3163334.0, "step": 873 }, { "entropy": 0.6142333596944809, "epoch": 0.8156789547363509, "grad_norm": 0.2223431020975113, "learning_rate": 0.0002, "loss": 0.6218, "mean_token_accuracy": 0.7524621337652206, "num_tokens": 3166998.0, "step": 874 }, { "entropy": 0.6369864493608475, "epoch": 0.8166122258516099, "grad_norm": 0.24583816528320312, "learning_rate": 0.0002, "loss": 0.6358, "mean_token_accuracy": 0.7433925420045853, "num_tokens": 3170655.0, "step": 875 }, { "entropy": 0.5785895884037018, "epoch": 0.8175454969668688, "grad_norm": 0.2310793250799179, "learning_rate": 0.0002, "loss": 0.5924, "mean_token_accuracy": 0.7569928765296936, "num_tokens": 3174190.0, "step": 876 }, { "entropy": 0.6240631341934204, "epoch": 0.8184787680821278, "grad_norm": 0.3066695034503937, "learning_rate": 0.0002, "loss": 0.6422, "mean_token_accuracy": 0.7392638474702835, "num_tokens": 3177760.0, "step": 877 }, { "entropy": 0.5753375738859177, "epoch": 0.8194120391973868, "grad_norm": 0.2648378014564514, "learning_rate": 0.0002, "loss": 0.5828, "mean_token_accuracy": 0.7658298462629318, "num_tokens": 3181487.0, "step": 878 }, { "entropy": 0.6130495667457581, "epoch": 0.8203453103126458, "grad_norm": 0.2837807536125183, "learning_rate": 0.0002, "loss": 0.6227, "mean_token_accuracy": 0.7435403019189835, "num_tokens": 3185016.0, "step": 879 }, { "entropy": 0.6409981697797775, "epoch": 0.8212785814279048, "grad_norm": 0.2111208289861679, "learning_rate": 0.0002, "loss": 0.6284, "mean_token_accuracy": 0.7473866790533066, "num_tokens": 3188677.0, "step": 880 }, { "entropy": 0.6302047222852707, "epoch": 0.8222118525431638, "grad_norm": 0.21225272119045258, "learning_rate": 0.0002, "loss": 0.6286, "mean_token_accuracy": 0.750467523932457, "num_tokens": 3192253.0, "step": 881 }, { "entropy": 0.5882159322500229, "epoch": 0.8231451236584227, "grad_norm": 0.1993175745010376, "learning_rate": 0.0002, "loss": 0.5865, "mean_token_accuracy": 0.7596279382705688, "num_tokens": 3195883.0, "step": 882 }, { "entropy": 0.5927509516477585, "epoch": 0.8240783947736817, "grad_norm": 0.2415497899055481, "learning_rate": 0.0002, "loss": 0.6001, "mean_token_accuracy": 0.7544320076704025, "num_tokens": 3199371.0, "step": 883 }, { "entropy": 0.6583038419485092, "epoch": 0.8250116658889407, "grad_norm": 0.22216640412807465, "learning_rate": 0.0002, "loss": 0.6563, "mean_token_accuracy": 0.7356101125478745, "num_tokens": 3203050.0, "step": 884 }, { "entropy": 0.6138497591018677, "epoch": 0.8259449370041997, "grad_norm": 0.23635026812553406, "learning_rate": 0.0002, "loss": 0.6184, "mean_token_accuracy": 0.7476901262998581, "num_tokens": 3206695.0, "step": 885 }, { "entropy": 0.6183085292577744, "epoch": 0.8268782081194587, "grad_norm": 0.21343523263931274, "learning_rate": 0.0002, "loss": 0.6169, "mean_token_accuracy": 0.7440039962530136, "num_tokens": 3210406.0, "step": 886 }, { "entropy": 0.650519534945488, "epoch": 0.8278114792347177, "grad_norm": 0.2571674883365631, "learning_rate": 0.0002, "loss": 0.6468, "mean_token_accuracy": 0.7440283447504044, "num_tokens": 3214075.0, "step": 887 }, { "entropy": 0.6200706511735916, "epoch": 0.8287447503499766, "grad_norm": 0.2412182092666626, "learning_rate": 0.0002, "loss": 0.629, "mean_token_accuracy": 0.7467334419488907, "num_tokens": 3217657.0, "step": 888 }, { "entropy": 0.6572431474924088, "epoch": 0.8296780214652356, "grad_norm": 0.20618022978305817, "learning_rate": 0.0002, "loss": 0.6643, "mean_token_accuracy": 0.7345285266637802, "num_tokens": 3221314.0, "step": 889 }, { "entropy": 0.6018436849117279, "epoch": 0.8306112925804946, "grad_norm": 0.2594725787639618, "learning_rate": 0.0002, "loss": 0.6054, "mean_token_accuracy": 0.7538476586341858, "num_tokens": 3224920.0, "step": 890 }, { "entropy": 0.6363339126110077, "epoch": 0.8315445636957536, "grad_norm": 0.2397744357585907, "learning_rate": 0.0002, "loss": 0.6402, "mean_token_accuracy": 0.7396020591259003, "num_tokens": 3228541.0, "step": 891 }, { "entropy": 0.5881905406713486, "epoch": 0.8324778348110126, "grad_norm": 0.23779617249965668, "learning_rate": 0.0002, "loss": 0.5969, "mean_token_accuracy": 0.7587847709655762, "num_tokens": 3232130.0, "step": 892 }, { "entropy": 0.5474623218178749, "epoch": 0.8334111059262715, "grad_norm": 0.2332586944103241, "learning_rate": 0.0002, "loss": 0.5534, "mean_token_accuracy": 0.7732291370630264, "num_tokens": 3235674.0, "step": 893 }, { "entropy": 0.6592623889446259, "epoch": 0.8343443770415305, "grad_norm": 0.22350478172302246, "learning_rate": 0.0002, "loss": 0.6704, "mean_token_accuracy": 0.7268105000257492, "num_tokens": 3239364.0, "step": 894 }, { "entropy": 0.5948190093040466, "epoch": 0.8352776481567895, "grad_norm": 0.24925962090492249, "learning_rate": 0.0002, "loss": 0.6033, "mean_token_accuracy": 0.754833772778511, "num_tokens": 3243020.0, "step": 895 }, { "entropy": 0.6122484356164932, "epoch": 0.8362109192720485, "grad_norm": 0.2253568321466446, "learning_rate": 0.0002, "loss": 0.6183, "mean_token_accuracy": 0.7445878237485886, "num_tokens": 3246660.0, "step": 896 }, { "entropy": 0.6073036789894104, "epoch": 0.8371441903873075, "grad_norm": 0.20878222584724426, "learning_rate": 0.0002, "loss": 0.6094, "mean_token_accuracy": 0.7520214021205902, "num_tokens": 3250229.0, "step": 897 }, { "entropy": 0.6259484589099884, "epoch": 0.8380774615025665, "grad_norm": 0.3382832407951355, "learning_rate": 0.0002, "loss": 0.621, "mean_token_accuracy": 0.7495731860399246, "num_tokens": 3253888.0, "step": 898 }, { "entropy": 0.6335137784481049, "epoch": 0.8390107326178254, "grad_norm": 0.248164102435112, "learning_rate": 0.0002, "loss": 0.6284, "mean_token_accuracy": 0.7469904869794846, "num_tokens": 3257549.0, "step": 899 }, { "entropy": 0.5950460284948349, "epoch": 0.8399440037330844, "grad_norm": 0.20783692598342896, "learning_rate": 0.0002, "loss": 0.6038, "mean_token_accuracy": 0.7474508434534073, "num_tokens": 3261160.0, "step": 900 }, { "entropy": 0.6101006865501404, "epoch": 0.8408772748483434, "grad_norm": 0.23862408101558685, "learning_rate": 0.0002, "loss": 0.6212, "mean_token_accuracy": 0.7455872744321823, "num_tokens": 3264728.0, "step": 901 }, { "entropy": 0.6251267045736313, "epoch": 0.8418105459636024, "grad_norm": 0.2256557047367096, "learning_rate": 0.0002, "loss": 0.6314, "mean_token_accuracy": 0.745031476020813, "num_tokens": 3268449.0, "step": 902 }, { "entropy": 0.5973101705312729, "epoch": 0.8427438170788614, "grad_norm": 0.1985136866569519, "learning_rate": 0.0002, "loss": 0.591, "mean_token_accuracy": 0.7546463310718536, "num_tokens": 3272151.0, "step": 903 }, { "entropy": 0.6084209978580475, "epoch": 0.8436770881941204, "grad_norm": 0.2124672830104828, "learning_rate": 0.0002, "loss": 0.6208, "mean_token_accuracy": 0.7433728277683258, "num_tokens": 3275751.0, "step": 904 }, { "entropy": 0.6007262021303177, "epoch": 0.8446103593093793, "grad_norm": 0.21438398957252502, "learning_rate": 0.0002, "loss": 0.6045, "mean_token_accuracy": 0.755223423242569, "num_tokens": 3279279.0, "step": 905 }, { "entropy": 0.6438082158565521, "epoch": 0.8455436304246383, "grad_norm": 0.2641831636428833, "learning_rate": 0.0002, "loss": 0.6529, "mean_token_accuracy": 0.7332585155963898, "num_tokens": 3282805.0, "step": 906 }, { "entropy": 0.5672825574874878, "epoch": 0.8464769015398973, "grad_norm": 0.2301226109266281, "learning_rate": 0.0002, "loss": 0.5692, "mean_token_accuracy": 0.7618565857410431, "num_tokens": 3286300.0, "step": 907 }, { "entropy": 0.6232046782970428, "epoch": 0.8474101726551563, "grad_norm": 0.20028528571128845, "learning_rate": 0.0002, "loss": 0.6072, "mean_token_accuracy": 0.7521807700395584, "num_tokens": 3290075.0, "step": 908 }, { "entropy": 0.594790980219841, "epoch": 0.8483434437704153, "grad_norm": 0.2079406976699829, "learning_rate": 0.0002, "loss": 0.5976, "mean_token_accuracy": 0.7544907182455063, "num_tokens": 3293702.0, "step": 909 }, { "entropy": 0.6350180208683014, "epoch": 0.8492767148856742, "grad_norm": 0.1958298236131668, "learning_rate": 0.0002, "loss": 0.6254, "mean_token_accuracy": 0.747057631611824, "num_tokens": 3297330.0, "step": 910 }, { "entropy": 0.5676863193511963, "epoch": 0.8502099860009332, "grad_norm": 0.21050065755844116, "learning_rate": 0.0002, "loss": 0.5841, "mean_token_accuracy": 0.7671588659286499, "num_tokens": 3300952.0, "step": 911 }, { "entropy": 0.6510196328163147, "epoch": 0.8511432571161922, "grad_norm": 0.23856960237026215, "learning_rate": 0.0002, "loss": 0.6616, "mean_token_accuracy": 0.7351560443639755, "num_tokens": 3304643.0, "step": 912 }, { "entropy": 0.585195317864418, "epoch": 0.8520765282314512, "grad_norm": 0.2820931077003479, "learning_rate": 0.0002, "loss": 0.6006, "mean_token_accuracy": 0.7542054057121277, "num_tokens": 3308298.0, "step": 913 }, { "entropy": 0.606810137629509, "epoch": 0.8530097993467102, "grad_norm": 0.2109670788049698, "learning_rate": 0.0002, "loss": 0.6029, "mean_token_accuracy": 0.757291629910469, "num_tokens": 3311958.0, "step": 914 }, { "entropy": 0.5844490677118301, "epoch": 0.8539430704619692, "grad_norm": 0.2785903513431549, "learning_rate": 0.0002, "loss": 0.5912, "mean_token_accuracy": 0.7576685845851898, "num_tokens": 3315666.0, "step": 915 }, { "entropy": 0.5999371558427811, "epoch": 0.8548763415772281, "grad_norm": 0.22029085457324982, "learning_rate": 0.0002, "loss": 0.5966, "mean_token_accuracy": 0.7515857666730881, "num_tokens": 3319315.0, "step": 916 }, { "entropy": 0.6114688515663147, "epoch": 0.8558096126924871, "grad_norm": 0.2351505607366562, "learning_rate": 0.0002, "loss": 0.6151, "mean_token_accuracy": 0.7532511800527573, "num_tokens": 3322906.0, "step": 917 }, { "entropy": 0.633906289935112, "epoch": 0.8567428838077461, "grad_norm": 0.22190052270889282, "learning_rate": 0.0002, "loss": 0.6223, "mean_token_accuracy": 0.751401424407959, "num_tokens": 3326438.0, "step": 918 }, { "entropy": 0.5990193784236908, "epoch": 0.8576761549230051, "grad_norm": 0.21474699676036835, "learning_rate": 0.0002, "loss": 0.5988, "mean_token_accuracy": 0.752531498670578, "num_tokens": 3330042.0, "step": 919 }, { "entropy": 0.6110025346279144, "epoch": 0.8586094260382641, "grad_norm": 0.22764766216278076, "learning_rate": 0.0002, "loss": 0.6195, "mean_token_accuracy": 0.7478052526712418, "num_tokens": 3333633.0, "step": 920 }, { "entropy": 0.583649218082428, "epoch": 0.859542697153523, "grad_norm": 0.22290736436843872, "learning_rate": 0.0002, "loss": 0.5948, "mean_token_accuracy": 0.7581894546747208, "num_tokens": 3337174.0, "step": 921 }, { "entropy": 0.5623084902763367, "epoch": 0.860475968268782, "grad_norm": 0.22021567821502686, "learning_rate": 0.0002, "loss": 0.5633, "mean_token_accuracy": 0.7710089683532715, "num_tokens": 3340755.0, "step": 922 }, { "entropy": 0.617858424782753, "epoch": 0.861409239384041, "grad_norm": 0.31273552775382996, "learning_rate": 0.0002, "loss": 0.6335, "mean_token_accuracy": 0.7433298975229263, "num_tokens": 3344337.0, "step": 923 }, { "entropy": 0.6033164262771606, "epoch": 0.8623425104993, "grad_norm": 0.24230466783046722, "learning_rate": 0.0002, "loss": 0.6077, "mean_token_accuracy": 0.746968001127243, "num_tokens": 3347930.0, "step": 924 }, { "entropy": 0.6215299963951111, "epoch": 0.863275781614559, "grad_norm": 0.24856917560100555, "learning_rate": 0.0002, "loss": 0.6306, "mean_token_accuracy": 0.7432773858308792, "num_tokens": 3351496.0, "step": 925 }, { "entropy": 0.607501283288002, "epoch": 0.864209052729818, "grad_norm": 0.24738387763500214, "learning_rate": 0.0002, "loss": 0.6237, "mean_token_accuracy": 0.7474801689386368, "num_tokens": 3354958.0, "step": 926 }, { "entropy": 0.658294290304184, "epoch": 0.8651423238450769, "grad_norm": 0.18645112216472626, "learning_rate": 0.0002, "loss": 0.6531, "mean_token_accuracy": 0.7299483567476273, "num_tokens": 3358625.0, "step": 927 }, { "entropy": 0.6248582452535629, "epoch": 0.8660755949603359, "grad_norm": 0.25789472460746765, "learning_rate": 0.0002, "loss": 0.6203, "mean_token_accuracy": 0.754611998796463, "num_tokens": 3362211.0, "step": 928 }, { "entropy": 0.6055457592010498, "epoch": 0.8670088660755949, "grad_norm": 0.23710237443447113, "learning_rate": 0.0002, "loss": 0.5997, "mean_token_accuracy": 0.7591741681098938, "num_tokens": 3365813.0, "step": 929 }, { "entropy": 0.6255745738744736, "epoch": 0.8679421371908539, "grad_norm": 0.2847382724285126, "learning_rate": 0.0002, "loss": 0.6199, "mean_token_accuracy": 0.7499403208494186, "num_tokens": 3369364.0, "step": 930 }, { "entropy": 0.6394165605306625, "epoch": 0.8688754083061129, "grad_norm": 0.20917560160160065, "learning_rate": 0.0002, "loss": 0.6308, "mean_token_accuracy": 0.7463392019271851, "num_tokens": 3373128.0, "step": 931 }, { "entropy": 0.5763626098632812, "epoch": 0.8698086794213719, "grad_norm": 0.22507937252521515, "learning_rate": 0.0002, "loss": 0.5807, "mean_token_accuracy": 0.7734289020299911, "num_tokens": 3376752.0, "step": 932 }, { "entropy": 0.6476683616638184, "epoch": 0.8707419505366308, "grad_norm": 0.23516401648521423, "learning_rate": 0.0002, "loss": 0.6599, "mean_token_accuracy": 0.7339435964822769, "num_tokens": 3380348.0, "step": 933 }, { "entropy": 0.6007173508405685, "epoch": 0.8716752216518898, "grad_norm": 0.2794935405254364, "learning_rate": 0.0002, "loss": 0.6145, "mean_token_accuracy": 0.7481650412082672, "num_tokens": 3383914.0, "step": 934 }, { "entropy": 0.6219699084758759, "epoch": 0.8726084927671488, "grad_norm": 0.22412657737731934, "learning_rate": 0.0002, "loss": 0.6275, "mean_token_accuracy": 0.740465372800827, "num_tokens": 3387710.0, "step": 935 }, { "entropy": 0.620863065123558, "epoch": 0.8735417638824078, "grad_norm": 0.2756800353527069, "learning_rate": 0.0002, "loss": 0.6482, "mean_token_accuracy": 0.7353069484233856, "num_tokens": 3391227.0, "step": 936 }, { "entropy": 0.5448387265205383, "epoch": 0.8744750349976668, "grad_norm": 0.23953542113304138, "learning_rate": 0.0002, "loss": 0.5513, "mean_token_accuracy": 0.7786146402359009, "num_tokens": 3394849.0, "step": 937 }, { "entropy": 0.5955428332090378, "epoch": 0.8754083061129259, "grad_norm": 0.25031381845474243, "learning_rate": 0.0002, "loss": 0.6197, "mean_token_accuracy": 0.7469810992479324, "num_tokens": 3398443.0, "step": 938 }, { "entropy": 0.6854472905397415, "epoch": 0.8763415772281848, "grad_norm": 0.23227758705615997, "learning_rate": 0.0002, "loss": 0.6669, "mean_token_accuracy": 0.727220356464386, "num_tokens": 3402217.0, "step": 939 }, { "entropy": 0.5883684158325195, "epoch": 0.8772748483434438, "grad_norm": 0.2305300384759903, "learning_rate": 0.0002, "loss": 0.5873, "mean_token_accuracy": 0.7656785547733307, "num_tokens": 3405717.0, "step": 940 }, { "entropy": 0.6299880146980286, "epoch": 0.8782081194587028, "grad_norm": 0.2228046953678131, "learning_rate": 0.0002, "loss": 0.6169, "mean_token_accuracy": 0.7562084645032883, "num_tokens": 3409326.0, "step": 941 }, { "entropy": 0.6360105574131012, "epoch": 0.8791413905739618, "grad_norm": 0.191792830824852, "learning_rate": 0.0002, "loss": 0.6245, "mean_token_accuracy": 0.7447732537984848, "num_tokens": 3413024.0, "step": 942 }, { "entropy": 0.6672945767641068, "epoch": 0.8800746616892208, "grad_norm": 0.23308902978897095, "learning_rate": 0.0002, "loss": 0.6733, "mean_token_accuracy": 0.7326675951480865, "num_tokens": 3416638.0, "step": 943 }, { "entropy": 0.6073235720396042, "epoch": 0.8810079328044798, "grad_norm": 0.23260174691677094, "learning_rate": 0.0002, "loss": 0.6166, "mean_token_accuracy": 0.7480104267597198, "num_tokens": 3420451.0, "step": 944 }, { "entropy": 0.6286613196134567, "epoch": 0.8819412039197387, "grad_norm": 0.23256874084472656, "learning_rate": 0.0002, "loss": 0.6385, "mean_token_accuracy": 0.7425329387187958, "num_tokens": 3424131.0, "step": 945 }, { "entropy": 0.6265650689601898, "epoch": 0.8828744750349977, "grad_norm": 0.22301113605499268, "learning_rate": 0.0002, "loss": 0.6302, "mean_token_accuracy": 0.7455740869045258, "num_tokens": 3427786.0, "step": 946 }, { "entropy": 0.5793304145336151, "epoch": 0.8838077461502567, "grad_norm": 0.1947644054889679, "learning_rate": 0.0002, "loss": 0.5825, "mean_token_accuracy": 0.7650973051786423, "num_tokens": 3431382.0, "step": 947 }, { "entropy": 0.6070086658000946, "epoch": 0.8847410172655157, "grad_norm": 0.23896171152591705, "learning_rate": 0.0002, "loss": 0.6268, "mean_token_accuracy": 0.7414264529943466, "num_tokens": 3434973.0, "step": 948 }, { "entropy": 0.6315445452928543, "epoch": 0.8856742883807747, "grad_norm": 0.24040305614471436, "learning_rate": 0.0002, "loss": 0.6426, "mean_token_accuracy": 0.7355186939239502, "num_tokens": 3438542.0, "step": 949 }, { "entropy": 0.6139037907123566, "epoch": 0.8866075594960336, "grad_norm": 0.18944673240184784, "learning_rate": 0.0002, "loss": 0.6126, "mean_token_accuracy": 0.748910129070282, "num_tokens": 3442281.0, "step": 950 }, { "entropy": 0.6239611655473709, "epoch": 0.8875408306112926, "grad_norm": 0.19498221576213837, "learning_rate": 0.0002, "loss": 0.6138, "mean_token_accuracy": 0.756896585226059, "num_tokens": 3445734.0, "step": 951 }, { "entropy": 0.5997806638479233, "epoch": 0.8884741017265516, "grad_norm": 0.23204007744789124, "learning_rate": 0.0002, "loss": 0.6033, "mean_token_accuracy": 0.7595545202493668, "num_tokens": 3449379.0, "step": 952 }, { "entropy": 0.6257578134536743, "epoch": 0.8894073728418106, "grad_norm": 0.23791232705116272, "learning_rate": 0.0002, "loss": 0.6432, "mean_token_accuracy": 0.746998518705368, "num_tokens": 3452909.0, "step": 953 }, { "entropy": 0.5922590792179108, "epoch": 0.8903406439570696, "grad_norm": 0.18362577259540558, "learning_rate": 0.0002, "loss": 0.5995, "mean_token_accuracy": 0.7545074224472046, "num_tokens": 3456648.0, "step": 954 }, { "entropy": 0.5453580170869827, "epoch": 0.8912739150723286, "grad_norm": 0.2475186437368393, "learning_rate": 0.0002, "loss": 0.5438, "mean_token_accuracy": 0.780629888176918, "num_tokens": 3460312.0, "step": 955 }, { "entropy": 0.6053523123264313, "epoch": 0.8922071861875875, "grad_norm": 0.22628776729106903, "learning_rate": 0.0002, "loss": 0.6169, "mean_token_accuracy": 0.7488850802183151, "num_tokens": 3463974.0, "step": 956 }, { "entropy": 0.5655805468559265, "epoch": 0.8931404573028465, "grad_norm": 0.20568735897541046, "learning_rate": 0.0002, "loss": 0.5659, "mean_token_accuracy": 0.7698361724615097, "num_tokens": 3467567.0, "step": 957 }, { "entropy": 0.6054786294698715, "epoch": 0.8940737284181055, "grad_norm": 0.19612133502960205, "learning_rate": 0.0002, "loss": 0.6067, "mean_token_accuracy": 0.751382514834404, "num_tokens": 3471257.0, "step": 958 }, { "entropy": 0.6107126772403717, "epoch": 0.8950069995333645, "grad_norm": 0.2208346277475357, "learning_rate": 0.0002, "loss": 0.6164, "mean_token_accuracy": 0.7497208118438721, "num_tokens": 3474913.0, "step": 959 }, { "entropy": 0.5834950655698776, "epoch": 0.8959402706486235, "grad_norm": 0.2471582293510437, "learning_rate": 0.0002, "loss": 0.5888, "mean_token_accuracy": 0.7627946436405182, "num_tokens": 3478526.0, "step": 960 }, { "entropy": 0.6059368848800659, "epoch": 0.8968735417638825, "grad_norm": 0.25016549229621887, "learning_rate": 0.0002, "loss": 0.6104, "mean_token_accuracy": 0.7533753216266632, "num_tokens": 3482034.0, "step": 961 }, { "entropy": 0.6112779080867767, "epoch": 0.8978068128791414, "grad_norm": 0.2501973509788513, "learning_rate": 0.0002, "loss": 0.6088, "mean_token_accuracy": 0.7548162639141083, "num_tokens": 3485638.0, "step": 962 }, { "entropy": 0.5563293695449829, "epoch": 0.8987400839944004, "grad_norm": 0.2613650858402252, "learning_rate": 0.0002, "loss": 0.5674, "mean_token_accuracy": 0.7657275646924973, "num_tokens": 3489192.0, "step": 963 }, { "entropy": 0.6393873244524002, "epoch": 0.8996733551096594, "grad_norm": 0.27151018381118774, "learning_rate": 0.0002, "loss": 0.6587, "mean_token_accuracy": 0.7284048050642014, "num_tokens": 3492741.0, "step": 964 }, { "entropy": 0.6240639686584473, "epoch": 0.9006066262249184, "grad_norm": 0.19929707050323486, "learning_rate": 0.0002, "loss": 0.631, "mean_token_accuracy": 0.7465076297521591, "num_tokens": 3496400.0, "step": 965 }, { "entropy": 0.5937365740537643, "epoch": 0.9015398973401774, "grad_norm": 0.20557649433612823, "learning_rate": 0.0002, "loss": 0.5969, "mean_token_accuracy": 0.7639545798301697, "num_tokens": 3500024.0, "step": 966 }, { "entropy": 0.5921351760625839, "epoch": 0.9024731684554363, "grad_norm": 0.21049924194812775, "learning_rate": 0.0002, "loss": 0.5998, "mean_token_accuracy": 0.7566007077693939, "num_tokens": 3503695.0, "step": 967 }, { "entropy": 0.6321775764226913, "epoch": 0.9034064395706953, "grad_norm": 0.2182459980249405, "learning_rate": 0.0002, "loss": 0.6206, "mean_token_accuracy": 0.7488838881254196, "num_tokens": 3507418.0, "step": 968 }, { "entropy": 0.6441711932420731, "epoch": 0.9043397106859543, "grad_norm": 0.22217866778373718, "learning_rate": 0.0002, "loss": 0.6354, "mean_token_accuracy": 0.7490884214639664, "num_tokens": 3511158.0, "step": 969 }, { "entropy": 0.6153720766305923, "epoch": 0.9052729818012133, "grad_norm": 0.21121646463871002, "learning_rate": 0.0002, "loss": 0.6302, "mean_token_accuracy": 0.7400148957967758, "num_tokens": 3514756.0, "step": 970 }, { "entropy": 0.6227361112833023, "epoch": 0.9062062529164723, "grad_norm": 0.1881455034017563, "learning_rate": 0.0002, "loss": 0.6263, "mean_token_accuracy": 0.7475137412548065, "num_tokens": 3518256.0, "step": 971 }, { "entropy": 0.5963353961706161, "epoch": 0.9071395240317313, "grad_norm": 0.2091335952281952, "learning_rate": 0.0002, "loss": 0.5979, "mean_token_accuracy": 0.7599248290061951, "num_tokens": 3521921.0, "step": 972 }, { "entropy": 0.6227505505084991, "epoch": 0.9080727951469902, "grad_norm": 0.21721823513507843, "learning_rate": 0.0002, "loss": 0.6228, "mean_token_accuracy": 0.7501185834407806, "num_tokens": 3525456.0, "step": 973 }, { "entropy": 0.602179691195488, "epoch": 0.9090060662622492, "grad_norm": 0.24995474517345428, "learning_rate": 0.0002, "loss": 0.6141, "mean_token_accuracy": 0.7469927072525024, "num_tokens": 3529047.0, "step": 974 }, { "entropy": 0.5744163691997528, "epoch": 0.9099393373775082, "grad_norm": 0.2325841635465622, "learning_rate": 0.0002, "loss": 0.5877, "mean_token_accuracy": 0.7569796293973923, "num_tokens": 3532586.0, "step": 975 }, { "entropy": 0.6242080330848694, "epoch": 0.9108726084927672, "grad_norm": 0.21930736303329468, "learning_rate": 0.0002, "loss": 0.6243, "mean_token_accuracy": 0.7479029446840286, "num_tokens": 3536262.0, "step": 976 }, { "entropy": 0.6381217241287231, "epoch": 0.9118058796080262, "grad_norm": 0.32795703411102295, "learning_rate": 0.0002, "loss": 0.6335, "mean_token_accuracy": 0.7432620376348495, "num_tokens": 3540034.0, "step": 977 }, { "entropy": 0.6044614017009735, "epoch": 0.9127391507232852, "grad_norm": 0.28263869881629944, "learning_rate": 0.0002, "loss": 0.6081, "mean_token_accuracy": 0.7476668804883957, "num_tokens": 3543640.0, "step": 978 }, { "entropy": 0.6111284047365189, "epoch": 0.9136724218385441, "grad_norm": 0.22875556349754333, "learning_rate": 0.0002, "loss": 0.6142, "mean_token_accuracy": 0.7538595348596573, "num_tokens": 3547265.0, "step": 979 }, { "entropy": 0.5681032240390778, "epoch": 0.9146056929538031, "grad_norm": 0.27223706245422363, "learning_rate": 0.0002, "loss": 0.5769, "mean_token_accuracy": 0.7685117721557617, "num_tokens": 3550884.0, "step": 980 }, { "entropy": 0.6101363599300385, "epoch": 0.9155389640690621, "grad_norm": 0.21420127153396606, "learning_rate": 0.0002, "loss": 0.6127, "mean_token_accuracy": 0.7550997138023376, "num_tokens": 3554471.0, "step": 981 }, { "entropy": 0.6219411194324493, "epoch": 0.9164722351843211, "grad_norm": 0.27448058128356934, "learning_rate": 0.0002, "loss": 0.6259, "mean_token_accuracy": 0.7465732395648956, "num_tokens": 3558127.0, "step": 982 }, { "entropy": 0.6517399847507477, "epoch": 0.9174055062995801, "grad_norm": 0.22220584750175476, "learning_rate": 0.0002, "loss": 0.6565, "mean_token_accuracy": 0.7376943528652191, "num_tokens": 3561770.0, "step": 983 }, { "entropy": 0.6295941025018692, "epoch": 0.918338777414839, "grad_norm": 0.19717414677143097, "learning_rate": 0.0002, "loss": 0.6348, "mean_token_accuracy": 0.738877534866333, "num_tokens": 3565417.0, "step": 984 }, { "entropy": 0.6105553060770035, "epoch": 0.919272048530098, "grad_norm": 0.19412840902805328, "learning_rate": 0.0002, "loss": 0.6167, "mean_token_accuracy": 0.7480553090572357, "num_tokens": 3569107.0, "step": 985 }, { "entropy": 0.6381321549415588, "epoch": 0.920205319645357, "grad_norm": 0.23561422526836395, "learning_rate": 0.0002, "loss": 0.6532, "mean_token_accuracy": 0.7335329204797745, "num_tokens": 3572739.0, "step": 986 }, { "entropy": 0.5975278913974762, "epoch": 0.921138590760616, "grad_norm": 0.2248794585466385, "learning_rate": 0.0002, "loss": 0.5926, "mean_token_accuracy": 0.7620813399553299, "num_tokens": 3576157.0, "step": 987 }, { "entropy": 0.6427773982286453, "epoch": 0.922071861875875, "grad_norm": 0.20993822813034058, "learning_rate": 0.0002, "loss": 0.635, "mean_token_accuracy": 0.7402443885803223, "num_tokens": 3579783.0, "step": 988 }, { "entropy": 0.6169949024915695, "epoch": 0.923005132991134, "grad_norm": 0.19316455721855164, "learning_rate": 0.0002, "loss": 0.6091, "mean_token_accuracy": 0.7518258541822433, "num_tokens": 3583613.0, "step": 989 }, { "entropy": 0.6099706292152405, "epoch": 0.9239384041063929, "grad_norm": 0.22860963642597198, "learning_rate": 0.0002, "loss": 0.6202, "mean_token_accuracy": 0.7524517476558685, "num_tokens": 3587223.0, "step": 990 }, { "entropy": 0.5830223262310028, "epoch": 0.9248716752216519, "grad_norm": 0.24718163907527924, "learning_rate": 0.0002, "loss": 0.5836, "mean_token_accuracy": 0.7667772024869919, "num_tokens": 3590849.0, "step": 991 }, { "entropy": 0.5912155508995056, "epoch": 0.9258049463369109, "grad_norm": 0.20853932201862335, "learning_rate": 0.0002, "loss": 0.5921, "mean_token_accuracy": 0.7606669515371323, "num_tokens": 3594413.0, "step": 992 }, { "entropy": 0.5883132070302963, "epoch": 0.9267382174521699, "grad_norm": 0.2505350708961487, "learning_rate": 0.0002, "loss": 0.6046, "mean_token_accuracy": 0.7506591975688934, "num_tokens": 3598006.0, "step": 993 }, { "entropy": 0.6139202564954758, "epoch": 0.9276714885674289, "grad_norm": 0.27735140919685364, "learning_rate": 0.0002, "loss": 0.6368, "mean_token_accuracy": 0.7381778359413147, "num_tokens": 3601606.0, "step": 994 }, { "entropy": 0.6061946600675583, "epoch": 0.9286047596826879, "grad_norm": 0.23414352536201477, "learning_rate": 0.0002, "loss": 0.6277, "mean_token_accuracy": 0.7410517036914825, "num_tokens": 3605269.0, "step": 995 }, { "entropy": 0.5800118893384933, "epoch": 0.9295380307979468, "grad_norm": 0.23610126972198486, "learning_rate": 0.0002, "loss": 0.5788, "mean_token_accuracy": 0.7569734156131744, "num_tokens": 3608820.0, "step": 996 }, { "entropy": 0.6529659479856491, "epoch": 0.9304713019132058, "grad_norm": 0.21582600474357605, "learning_rate": 0.0002, "loss": 0.6482, "mean_token_accuracy": 0.7379873692989349, "num_tokens": 3612625.0, "step": 997 }, { "entropy": 0.6310292035341263, "epoch": 0.9314045730284648, "grad_norm": 0.21507640182971954, "learning_rate": 0.0002, "loss": 0.6351, "mean_token_accuracy": 0.7405186593532562, "num_tokens": 3616219.0, "step": 998 }, { "entropy": 0.6173887848854065, "epoch": 0.9323378441437238, "grad_norm": 0.21989129483699799, "learning_rate": 0.0002, "loss": 0.6113, "mean_token_accuracy": 0.7568347603082657, "num_tokens": 3619909.0, "step": 999 }, { "entropy": 0.5821108222007751, "epoch": 0.9332711152589828, "grad_norm": 0.2445746213197708, "learning_rate": 0.0002, "loss": 0.5893, "mean_token_accuracy": 0.7599096149206161, "num_tokens": 3623596.0, "step": 1000 }, { "entropy": 0.6436759829521179, "epoch": 0.9342043863742417, "grad_norm": 0.20567400753498077, "learning_rate": 0.0002, "loss": 0.6472, "mean_token_accuracy": 0.7311262786388397, "num_tokens": 3627190.0, "step": 1001 }, { "entropy": 0.6359108090400696, "epoch": 0.9351376574895007, "grad_norm": 0.20319697260856628, "learning_rate": 0.0002, "loss": 0.6296, "mean_token_accuracy": 0.7489369362592697, "num_tokens": 3630834.0, "step": 1002 }, { "entropy": 0.5948545336723328, "epoch": 0.9360709286047597, "grad_norm": 0.18844455480575562, "learning_rate": 0.0002, "loss": 0.5935, "mean_token_accuracy": 0.7587515264749527, "num_tokens": 3634474.0, "step": 1003 }, { "entropy": 0.6025988608598709, "epoch": 0.9370041997200187, "grad_norm": 0.18969789147377014, "learning_rate": 0.0002, "loss": 0.603, "mean_token_accuracy": 0.7549912631511688, "num_tokens": 3638025.0, "step": 1004 }, { "entropy": 0.5519662201404572, "epoch": 0.9379374708352777, "grad_norm": 0.2219277322292328, "learning_rate": 0.0002, "loss": 0.5543, "mean_token_accuracy": 0.7778527587652206, "num_tokens": 3641663.0, "step": 1005 }, { "entropy": 0.6052159070968628, "epoch": 0.9388707419505367, "grad_norm": 0.27419403195381165, "learning_rate": 0.0002, "loss": 0.6301, "mean_token_accuracy": 0.7398725003004074, "num_tokens": 3645178.0, "step": 1006 }, { "entropy": 0.6275213807821274, "epoch": 0.9398040130657956, "grad_norm": 0.22871708869934082, "learning_rate": 0.0002, "loss": 0.6359, "mean_token_accuracy": 0.744513064622879, "num_tokens": 3648889.0, "step": 1007 }, { "entropy": 0.5775939673185349, "epoch": 0.9407372841810546, "grad_norm": 0.22834119200706482, "learning_rate": 0.0002, "loss": 0.5784, "mean_token_accuracy": 0.766847163438797, "num_tokens": 3652456.0, "step": 1008 }, { "entropy": 0.5740585923194885, "epoch": 0.9416705552963136, "grad_norm": 0.20079553127288818, "learning_rate": 0.0002, "loss": 0.5843, "mean_token_accuracy": 0.7613830417394638, "num_tokens": 3656101.0, "step": 1009 }, { "entropy": 0.6040508151054382, "epoch": 0.9426038264115726, "grad_norm": 0.2601279616355896, "learning_rate": 0.0002, "loss": 0.6166, "mean_token_accuracy": 0.7478747069835663, "num_tokens": 3659711.0, "step": 1010 }, { "entropy": 0.6051161140203476, "epoch": 0.9435370975268316, "grad_norm": 0.25471964478492737, "learning_rate": 0.0002, "loss": 0.6063, "mean_token_accuracy": 0.7538603693246841, "num_tokens": 3663355.0, "step": 1011 }, { "entropy": 0.6207922995090485, "epoch": 0.9444703686420906, "grad_norm": 0.21078643202781677, "learning_rate": 0.0002, "loss": 0.6223, "mean_token_accuracy": 0.7469834834337234, "num_tokens": 3666938.0, "step": 1012 }, { "entropy": 0.5888893008232117, "epoch": 0.9454036397573495, "grad_norm": 0.2349822223186493, "learning_rate": 0.0002, "loss": 0.6015, "mean_token_accuracy": 0.755613848567009, "num_tokens": 3670598.0, "step": 1013 }, { "entropy": 0.6344944983720779, "epoch": 0.9463369108726085, "grad_norm": 0.26445794105529785, "learning_rate": 0.0002, "loss": 0.6425, "mean_token_accuracy": 0.7351108491420746, "num_tokens": 3674229.0, "step": 1014 }, { "entropy": 0.6092076003551483, "epoch": 0.9472701819878675, "grad_norm": 0.23596763610839844, "learning_rate": 0.0002, "loss": 0.6198, "mean_token_accuracy": 0.7496247440576553, "num_tokens": 3677846.0, "step": 1015 }, { "entropy": 0.6320943236351013, "epoch": 0.9482034531031265, "grad_norm": 0.19506685435771942, "learning_rate": 0.0002, "loss": 0.6247, "mean_token_accuracy": 0.7500215619802475, "num_tokens": 3681484.0, "step": 1016 }, { "entropy": 0.6151046901941299, "epoch": 0.9491367242183855, "grad_norm": 0.22481916844844818, "learning_rate": 0.0002, "loss": 0.6124, "mean_token_accuracy": 0.7487077116966248, "num_tokens": 3685134.0, "step": 1017 }, { "entropy": 0.6071106493473053, "epoch": 0.9500699953336444, "grad_norm": 0.21051813662052155, "learning_rate": 0.0002, "loss": 0.6007, "mean_token_accuracy": 0.7508251816034317, "num_tokens": 3688678.0, "step": 1018 }, { "entropy": 0.6471702456474304, "epoch": 0.9510032664489034, "grad_norm": 0.20764118432998657, "learning_rate": 0.0002, "loss": 0.6463, "mean_token_accuracy": 0.7376591116189957, "num_tokens": 3692235.0, "step": 1019 }, { "entropy": 0.5850642770528793, "epoch": 0.9519365375641624, "grad_norm": 0.21367858350276947, "learning_rate": 0.0002, "loss": 0.5772, "mean_token_accuracy": 0.7650772929191589, "num_tokens": 3695843.0, "step": 1020 }, { "entropy": 0.631848081946373, "epoch": 0.9528698086794214, "grad_norm": 0.20553775131702423, "learning_rate": 0.0002, "loss": 0.6285, "mean_token_accuracy": 0.7425847500562668, "num_tokens": 3699459.0, "step": 1021 }, { "entropy": 0.5731992125511169, "epoch": 0.9538030797946804, "grad_norm": 0.2111768126487732, "learning_rate": 0.0002, "loss": 0.5781, "mean_token_accuracy": 0.75692218542099, "num_tokens": 3703012.0, "step": 1022 }, { "entropy": 0.6055203825235367, "epoch": 0.9547363509099394, "grad_norm": 0.22126390039920807, "learning_rate": 0.0002, "loss": 0.6152, "mean_token_accuracy": 0.7550532966852188, "num_tokens": 3706606.0, "step": 1023 }, { "entropy": 0.5944153517484665, "epoch": 0.9556696220251983, "grad_norm": 0.2696364223957062, "learning_rate": 0.0002, "loss": 0.6127, "mean_token_accuracy": 0.7489088624715805, "num_tokens": 3710233.0, "step": 1024 }, { "entropy": 0.555782213807106, "epoch": 0.9566028931404573, "grad_norm": 0.2368595153093338, "learning_rate": 0.0002, "loss": 0.567, "mean_token_accuracy": 0.7656756043434143, "num_tokens": 3713843.0, "step": 1025 }, { "entropy": 0.5961475074291229, "epoch": 0.9575361642557163, "grad_norm": 0.2504209578037262, "learning_rate": 0.0002, "loss": 0.6089, "mean_token_accuracy": 0.7483637779951096, "num_tokens": 3717410.0, "step": 1026 }, { "entropy": 0.5809739679098129, "epoch": 0.9584694353709753, "grad_norm": 0.23941470682621002, "learning_rate": 0.0002, "loss": 0.5919, "mean_token_accuracy": 0.7622173726558685, "num_tokens": 3720934.0, "step": 1027 }, { "entropy": 0.6349577903747559, "epoch": 0.9594027064862343, "grad_norm": 0.24047353863716125, "learning_rate": 0.0002, "loss": 0.6394, "mean_token_accuracy": 0.746508851647377, "num_tokens": 3724550.0, "step": 1028 }, { "entropy": 0.5847130119800568, "epoch": 0.9603359776014933, "grad_norm": 0.1921546310186386, "learning_rate": 0.0002, "loss": 0.5759, "mean_token_accuracy": 0.7637797147035599, "num_tokens": 3728068.0, "step": 1029 }, { "entropy": 0.6508066207170486, "epoch": 0.9612692487167522, "grad_norm": 0.2222174108028412, "learning_rate": 0.0002, "loss": 0.66, "mean_token_accuracy": 0.7321474999189377, "num_tokens": 3731733.0, "step": 1030 }, { "entropy": 0.62265844643116, "epoch": 0.9622025198320112, "grad_norm": 0.22416797280311584, "learning_rate": 0.0002, "loss": 0.6197, "mean_token_accuracy": 0.7407685816287994, "num_tokens": 3735319.0, "step": 1031 }, { "entropy": 0.6220069825649261, "epoch": 0.9631357909472702, "grad_norm": 0.2163158506155014, "learning_rate": 0.0002, "loss": 0.6133, "mean_token_accuracy": 0.7486557960510254, "num_tokens": 3739006.0, "step": 1032 }, { "entropy": 0.6061500012874603, "epoch": 0.9640690620625292, "grad_norm": 0.20930062234401703, "learning_rate": 0.0002, "loss": 0.6102, "mean_token_accuracy": 0.752839058637619, "num_tokens": 3742634.0, "step": 1033 }, { "entropy": 0.6437918394804001, "epoch": 0.9650023331777882, "grad_norm": 0.24093204736709595, "learning_rate": 0.0002, "loss": 0.6413, "mean_token_accuracy": 0.7390135079622269, "num_tokens": 3746325.0, "step": 1034 }, { "entropy": 0.6251623034477234, "epoch": 0.9659356042930471, "grad_norm": 0.2248561680316925, "learning_rate": 0.0002, "loss": 0.626, "mean_token_accuracy": 0.7454505562782288, "num_tokens": 3750055.0, "step": 1035 }, { "entropy": 0.5987447053194046, "epoch": 0.9668688754083061, "grad_norm": 0.23127806186676025, "learning_rate": 0.0002, "loss": 0.5923, "mean_token_accuracy": 0.7590354979038239, "num_tokens": 3753644.0, "step": 1036 }, { "entropy": 0.6203121095895767, "epoch": 0.9678021465235651, "grad_norm": 0.27094903588294983, "learning_rate": 0.0002, "loss": 0.638, "mean_token_accuracy": 0.7383685261011124, "num_tokens": 3757222.0, "step": 1037 }, { "entropy": 0.5736724883317947, "epoch": 0.9687354176388241, "grad_norm": 0.21608519554138184, "learning_rate": 0.0002, "loss": 0.565, "mean_token_accuracy": 0.773889034986496, "num_tokens": 3760842.0, "step": 1038 }, { "entropy": 0.6081874966621399, "epoch": 0.9696686887540831, "grad_norm": 0.20478583872318268, "learning_rate": 0.0002, "loss": 0.6069, "mean_token_accuracy": 0.7500341534614563, "num_tokens": 3764550.0, "step": 1039 }, { "entropy": 0.6289982944726944, "epoch": 0.9706019598693421, "grad_norm": 0.24504274129867554, "learning_rate": 0.0002, "loss": 0.6386, "mean_token_accuracy": 0.7403787225484848, "num_tokens": 3768092.0, "step": 1040 }, { "entropy": 0.6383838057518005, "epoch": 0.971535230984601, "grad_norm": 0.23302961885929108, "learning_rate": 0.0002, "loss": 0.6433, "mean_token_accuracy": 0.7365994900465012, "num_tokens": 3771725.0, "step": 1041 }, { "entropy": 0.609977051615715, "epoch": 0.97246850209986, "grad_norm": 0.21986858546733856, "learning_rate": 0.0002, "loss": 0.6121, "mean_token_accuracy": 0.7571266442537308, "num_tokens": 3775342.0, "step": 1042 }, { "entropy": 0.6017493158578873, "epoch": 0.973401773215119, "grad_norm": 0.2333524525165558, "learning_rate": 0.0002, "loss": 0.5992, "mean_token_accuracy": 0.755514070391655, "num_tokens": 3778906.0, "step": 1043 }, { "entropy": 0.5970482528209686, "epoch": 0.974335044330378, "grad_norm": 0.23878833651542664, "learning_rate": 0.0002, "loss": 0.6094, "mean_token_accuracy": 0.7499648928642273, "num_tokens": 3782567.0, "step": 1044 }, { "entropy": 0.5905427932739258, "epoch": 0.975268315445637, "grad_norm": 0.21460957825183868, "learning_rate": 0.0002, "loss": 0.5982, "mean_token_accuracy": 0.7599915415048599, "num_tokens": 3786230.0, "step": 1045 }, { "entropy": 0.6221023350954056, "epoch": 0.976201586560896, "grad_norm": 0.30250853300094604, "learning_rate": 0.0002, "loss": 0.6503, "mean_token_accuracy": 0.738779827952385, "num_tokens": 3789862.0, "step": 1046 }, { "entropy": 0.5930271297693253, "epoch": 0.9771348576761549, "grad_norm": 0.259878009557724, "learning_rate": 0.0002, "loss": 0.614, "mean_token_accuracy": 0.7464452683925629, "num_tokens": 3793497.0, "step": 1047 }, { "entropy": 0.5933587104082108, "epoch": 0.9780681287914139, "grad_norm": 0.18905402719974518, "learning_rate": 0.0002, "loss": 0.5948, "mean_token_accuracy": 0.7592352628707886, "num_tokens": 3797080.0, "step": 1048 }, { "entropy": 0.6226524710655212, "epoch": 0.9790013999066729, "grad_norm": 0.3190253973007202, "learning_rate": 0.0002, "loss": 0.6271, "mean_token_accuracy": 0.736584946513176, "num_tokens": 3800790.0, "step": 1049 }, { "entropy": 0.6229960024356842, "epoch": 0.9799346710219319, "grad_norm": 0.2778717577457428, "learning_rate": 0.0002, "loss": 0.6212, "mean_token_accuracy": 0.7494606822729111, "num_tokens": 3804385.0, "step": 1050 }, { "entropy": 0.6545553356409073, "epoch": 0.9808679421371909, "grad_norm": 0.18269945681095123, "learning_rate": 0.0002, "loss": 0.6466, "mean_token_accuracy": 0.7362277209758759, "num_tokens": 3808105.0, "step": 1051 }, { "entropy": 0.6096540838479996, "epoch": 0.9818012132524498, "grad_norm": 0.18127375841140747, "learning_rate": 0.0002, "loss": 0.5999, "mean_token_accuracy": 0.7601791173219681, "num_tokens": 3811702.0, "step": 1052 }, { "entropy": 0.622975155711174, "epoch": 0.9827344843677088, "grad_norm": 0.2339409440755844, "learning_rate": 0.0002, "loss": 0.6247, "mean_token_accuracy": 0.7463676631450653, "num_tokens": 3815297.0, "step": 1053 }, { "entropy": 0.6008087545633316, "epoch": 0.9836677554829678, "grad_norm": 0.25017422437667847, "learning_rate": 0.0002, "loss": 0.6043, "mean_token_accuracy": 0.7579670995473862, "num_tokens": 3818913.0, "step": 1054 }, { "entropy": 0.6096787601709366, "epoch": 0.9846010265982268, "grad_norm": 0.2527852952480316, "learning_rate": 0.0002, "loss": 0.6182, "mean_token_accuracy": 0.7514237761497498, "num_tokens": 3822575.0, "step": 1055 }, { "entropy": 0.6003454327583313, "epoch": 0.9855342977134858, "grad_norm": 0.22633641958236694, "learning_rate": 0.0002, "loss": 0.6105, "mean_token_accuracy": 0.7535508871078491, "num_tokens": 3826142.0, "step": 1056 }, { "entropy": 0.5919899493455887, "epoch": 0.9864675688287448, "grad_norm": 0.2817830443382263, "learning_rate": 0.0002, "loss": 0.6006, "mean_token_accuracy": 0.7602093666791916, "num_tokens": 3829691.0, "step": 1057 }, { "entropy": 0.6166488230228424, "epoch": 0.9874008399440037, "grad_norm": 0.2950601279735565, "learning_rate": 0.0002, "loss": 0.6216, "mean_token_accuracy": 0.7473626732826233, "num_tokens": 3833469.0, "step": 1058 }, { "entropy": 0.6159386187791824, "epoch": 0.9883341110592627, "grad_norm": 0.2738671600818634, "learning_rate": 0.0002, "loss": 0.6316, "mean_token_accuracy": 0.7476357519626617, "num_tokens": 3837032.0, "step": 1059 }, { "entropy": 0.6421031504869461, "epoch": 0.9892673821745217, "grad_norm": 0.22127465903759003, "learning_rate": 0.0002, "loss": 0.6401, "mean_token_accuracy": 0.7405971735715866, "num_tokens": 3840723.0, "step": 1060 }, { "entropy": 0.6422024816274643, "epoch": 0.9902006532897807, "grad_norm": 0.24415437877178192, "learning_rate": 0.0002, "loss": 0.6468, "mean_token_accuracy": 0.7365008145570755, "num_tokens": 3844400.0, "step": 1061 }, { "entropy": 0.6339176595211029, "epoch": 0.9911339244050397, "grad_norm": 0.2546505630016327, "learning_rate": 0.0002, "loss": 0.6488, "mean_token_accuracy": 0.7397857010364532, "num_tokens": 3848123.0, "step": 1062 }, { "entropy": 0.6366727501153946, "epoch": 0.9920671955202987, "grad_norm": 0.21031653881072998, "learning_rate": 0.0002, "loss": 0.6336, "mean_token_accuracy": 0.7477778494358063, "num_tokens": 3851676.0, "step": 1063 }, { "entropy": 0.6018293648958206, "epoch": 0.9930004666355576, "grad_norm": 0.2394326627254486, "learning_rate": 0.0002, "loss": 0.6008, "mean_token_accuracy": 0.754316508769989, "num_tokens": 3855319.0, "step": 1064 }, { "entropy": 0.5934012979269028, "epoch": 0.9939337377508166, "grad_norm": 0.2086758315563202, "learning_rate": 0.0002, "loss": 0.599, "mean_token_accuracy": 0.7603261172771454, "num_tokens": 3859022.0, "step": 1065 }, { "entropy": 0.5954241454601288, "epoch": 0.9948670088660756, "grad_norm": 0.21972031891345978, "learning_rate": 0.0002, "loss": 0.5906, "mean_token_accuracy": 0.7597841769456863, "num_tokens": 3862748.0, "step": 1066 }, { "entropy": 0.6061007529497147, "epoch": 0.9958002799813346, "grad_norm": 0.2268170863389969, "learning_rate": 0.0002, "loss": 0.593, "mean_token_accuracy": 0.7630028575658798, "num_tokens": 3866337.0, "step": 1067 }, { "entropy": 0.5771865546703339, "epoch": 0.9967335510965936, "grad_norm": 0.21258656680583954, "learning_rate": 0.0002, "loss": 0.5796, "mean_token_accuracy": 0.7668052464723587, "num_tokens": 3869934.0, "step": 1068 }, { "entropy": 0.5848489850759506, "epoch": 0.9976668222118525, "grad_norm": 0.20795488357543945, "learning_rate": 0.0002, "loss": 0.5924, "mean_token_accuracy": 0.7590954005718231, "num_tokens": 3873528.0, "step": 1069 }, { "entropy": 0.6063309609889984, "epoch": 0.9986000933271115, "grad_norm": 0.20123635232448578, "learning_rate": 0.0002, "loss": 0.606, "mean_token_accuracy": 0.7548137307167053, "num_tokens": 3877162.0, "step": 1070 }, { "entropy": 0.5663825273513794, "epoch": 0.9995333644423705, "grad_norm": 0.30025047063827515, "learning_rate": 0.0002, "loss": 0.5842, "mean_token_accuracy": 0.7641247510910034, "num_tokens": 3880801.0, "step": 1071 }, { "entropy": 0.554339587688446, "epoch": 1.0, "grad_norm": 0.3434506356716156, "learning_rate": 0.0002, "loss": 0.6214, "mean_token_accuracy": 0.7616103291511536, "num_tokens": 3881841.0, "step": 1072 }, { "entropy": 0.6196876168251038, "epoch": 1.000933271115259, "grad_norm": 0.20071667432785034, "learning_rate": 0.0002, "loss": 0.6257, "mean_token_accuracy": 0.7510274648666382, "num_tokens": 3885395.0, "step": 1073 }, { "entropy": 0.5885817557573318, "epoch": 1.001866542230518, "grad_norm": 0.1912836730480194, "learning_rate": 0.0002, "loss": 0.588, "mean_token_accuracy": 0.7623266279697418, "num_tokens": 3889042.0, "step": 1074 }, { "entropy": 0.613819882273674, "epoch": 1.002799813345777, "grad_norm": 0.23407568037509918, "learning_rate": 0.0002, "loss": 0.6008, "mean_token_accuracy": 0.7580405324697495, "num_tokens": 3892663.0, "step": 1075 }, { "entropy": 0.5607831925153732, "epoch": 1.003733084461036, "grad_norm": 0.22295290231704712, "learning_rate": 0.0002, "loss": 0.5574, "mean_token_accuracy": 0.7725434452295303, "num_tokens": 3896171.0, "step": 1076 }, { "entropy": 0.6133506745100021, "epoch": 1.004666355576295, "grad_norm": 0.255754292011261, "learning_rate": 0.0002, "loss": 0.6079, "mean_token_accuracy": 0.7507913261651993, "num_tokens": 3899828.0, "step": 1077 }, { "entropy": 0.6095616817474365, "epoch": 1.005599626691554, "grad_norm": 0.23289723694324493, "learning_rate": 0.0002, "loss": 0.6139, "mean_token_accuracy": 0.7467008084058762, "num_tokens": 3903431.0, "step": 1078 }, { "entropy": 0.5613607615232468, "epoch": 1.0065328978068129, "grad_norm": 0.28178268671035767, "learning_rate": 0.0002, "loss": 0.5793, "mean_token_accuracy": 0.7652135789394379, "num_tokens": 3906915.0, "step": 1079 }, { "entropy": 0.54168301820755, "epoch": 1.0074661689220719, "grad_norm": 0.24211814999580383, "learning_rate": 0.0002, "loss": 0.5467, "mean_token_accuracy": 0.7790049165487289, "num_tokens": 3910526.0, "step": 1080 }, { "entropy": 0.5929768532514572, "epoch": 1.0083994400373308, "grad_norm": 0.22489678859710693, "learning_rate": 0.0002, "loss": 0.5987, "mean_token_accuracy": 0.7630016654729843, "num_tokens": 3914199.0, "step": 1081 }, { "entropy": 0.6102780699729919, "epoch": 1.0093327111525898, "grad_norm": 0.24968476593494415, "learning_rate": 0.0002, "loss": 0.6089, "mean_token_accuracy": 0.7529278993606567, "num_tokens": 3917883.0, "step": 1082 }, { "entropy": 0.5695993155241013, "epoch": 1.0102659822678488, "grad_norm": 0.23349279165267944, "learning_rate": 0.0002, "loss": 0.5714, "mean_token_accuracy": 0.7687487006187439, "num_tokens": 3921508.0, "step": 1083 }, { "entropy": 0.5766718983650208, "epoch": 1.0111992533831078, "grad_norm": 0.23010191321372986, "learning_rate": 0.0002, "loss": 0.5877, "mean_token_accuracy": 0.7616383880376816, "num_tokens": 3925119.0, "step": 1084 }, { "entropy": 0.5681562423706055, "epoch": 1.0121325244983668, "grad_norm": 0.23847483098506927, "learning_rate": 0.0002, "loss": 0.5657, "mean_token_accuracy": 0.7646367996931076, "num_tokens": 3928730.0, "step": 1085 }, { "entropy": 0.6237134486436844, "epoch": 1.0130657956136258, "grad_norm": 0.24740830063819885, "learning_rate": 0.0002, "loss": 0.6359, "mean_token_accuracy": 0.7400389462709427, "num_tokens": 3932326.0, "step": 1086 }, { "entropy": 0.5983414947986603, "epoch": 1.0139990667288847, "grad_norm": 0.23711003363132477, "learning_rate": 0.0002, "loss": 0.5975, "mean_token_accuracy": 0.7575321346521378, "num_tokens": 3935961.0, "step": 1087 }, { "entropy": 0.5707122087478638, "epoch": 1.0149323378441437, "grad_norm": 0.2311970740556717, "learning_rate": 0.0002, "loss": 0.5678, "mean_token_accuracy": 0.7720106244087219, "num_tokens": 3939534.0, "step": 1088 }, { "entropy": 0.5749760270118713, "epoch": 1.0158656089594027, "grad_norm": 0.23789827525615692, "learning_rate": 0.0002, "loss": 0.5771, "mean_token_accuracy": 0.7673846781253815, "num_tokens": 3943117.0, "step": 1089 }, { "entropy": 0.6165488511323929, "epoch": 1.0167988800746617, "grad_norm": 0.18943564593791962, "learning_rate": 0.0002, "loss": 0.6083, "mean_token_accuracy": 0.7505463063716888, "num_tokens": 3946796.0, "step": 1090 }, { "entropy": 0.5819817334413528, "epoch": 1.0177321511899207, "grad_norm": 0.2242528647184372, "learning_rate": 0.0002, "loss": 0.5785, "mean_token_accuracy": 0.7641334682703018, "num_tokens": 3950403.0, "step": 1091 }, { "entropy": 0.5996922552585602, "epoch": 1.0186654223051796, "grad_norm": 0.22629879415035248, "learning_rate": 0.0002, "loss": 0.5997, "mean_token_accuracy": 0.7579053342342377, "num_tokens": 3953993.0, "step": 1092 }, { "entropy": 0.6013685315847397, "epoch": 1.0195986934204386, "grad_norm": 0.2915244698524475, "learning_rate": 0.0002, "loss": 0.6178, "mean_token_accuracy": 0.7509530335664749, "num_tokens": 3957721.0, "step": 1093 }, { "entropy": 0.5868679732084274, "epoch": 1.0205319645356976, "grad_norm": 0.24181976914405823, "learning_rate": 0.0002, "loss": 0.6039, "mean_token_accuracy": 0.7587897032499313, "num_tokens": 3961294.0, "step": 1094 }, { "entropy": 0.5609594136476517, "epoch": 1.0214652356509566, "grad_norm": 0.24649503827095032, "learning_rate": 0.0002, "loss": 0.5813, "mean_token_accuracy": 0.7635461986064911, "num_tokens": 3964844.0, "step": 1095 }, { "entropy": 0.5967434644699097, "epoch": 1.0223985067662156, "grad_norm": 0.2321065068244934, "learning_rate": 0.0002, "loss": 0.6073, "mean_token_accuracy": 0.7534425109624863, "num_tokens": 3968575.0, "step": 1096 }, { "entropy": 0.5452862530946732, "epoch": 1.0233317778814746, "grad_norm": 0.2135581225156784, "learning_rate": 0.0002, "loss": 0.5458, "mean_token_accuracy": 0.781183272600174, "num_tokens": 3972122.0, "step": 1097 }, { "entropy": 0.5967236757278442, "epoch": 1.0242650489967335, "grad_norm": 0.2764890491962433, "learning_rate": 0.0002, "loss": 0.5892, "mean_token_accuracy": 0.7664670199155807, "num_tokens": 3975742.0, "step": 1098 }, { "entropy": 0.5861189216375351, "epoch": 1.0251983201119925, "grad_norm": 0.25599405169487, "learning_rate": 0.0002, "loss": 0.5806, "mean_token_accuracy": 0.7682262808084488, "num_tokens": 3979221.0, "step": 1099 }, { "entropy": 0.6168408840894699, "epoch": 1.0261315912272515, "grad_norm": 0.22579482197761536, "learning_rate": 0.0002, "loss": 0.6191, "mean_token_accuracy": 0.7584555000066757, "num_tokens": 3982932.0, "step": 1100 }, { "entropy": 0.604411169886589, "epoch": 1.0270648623425105, "grad_norm": 0.2111763060092926, "learning_rate": 0.0002, "loss": 0.6089, "mean_token_accuracy": 0.7555941343307495, "num_tokens": 3986514.0, "step": 1101 }, { "entropy": 0.6072782427072525, "epoch": 1.0279981334577695, "grad_norm": 0.2753923237323761, "learning_rate": 0.0002, "loss": 0.5982, "mean_token_accuracy": 0.7539841681718826, "num_tokens": 3990057.0, "step": 1102 }, { "entropy": 0.5990683734416962, "epoch": 1.0289314045730285, "grad_norm": 0.2414902150630951, "learning_rate": 0.0002, "loss": 0.6008, "mean_token_accuracy": 0.7548021674156189, "num_tokens": 3993658.0, "step": 1103 }, { "entropy": 0.5963222831487656, "epoch": 1.0298646756882874, "grad_norm": 0.30048489570617676, "learning_rate": 0.0002, "loss": 0.6064, "mean_token_accuracy": 0.7477454394102097, "num_tokens": 3997185.0, "step": 1104 }, { "entropy": 0.5732065290212631, "epoch": 1.0307979468035464, "grad_norm": 0.2685425281524658, "learning_rate": 0.0002, "loss": 0.5864, "mean_token_accuracy": 0.7605966776609421, "num_tokens": 4000825.0, "step": 1105 }, { "entropy": 0.6236933767795563, "epoch": 1.0317312179188054, "grad_norm": 0.36170750856399536, "learning_rate": 0.0002, "loss": 0.6469, "mean_token_accuracy": 0.7405789196491241, "num_tokens": 4004370.0, "step": 1106 }, { "entropy": 0.6045770943164825, "epoch": 1.0326644890340644, "grad_norm": 0.25409021973609924, "learning_rate": 0.0002, "loss": 0.6078, "mean_token_accuracy": 0.7531943917274475, "num_tokens": 4008004.0, "step": 1107 }, { "entropy": 0.5972187966108322, "epoch": 1.0335977601493234, "grad_norm": 0.21179328858852386, "learning_rate": 0.0002, "loss": 0.5844, "mean_token_accuracy": 0.7626983523368835, "num_tokens": 4011710.0, "step": 1108 }, { "entropy": 0.5720956325531006, "epoch": 1.0345310312645823, "grad_norm": 0.22205647826194763, "learning_rate": 0.0002, "loss": 0.565, "mean_token_accuracy": 0.7717267870903015, "num_tokens": 4015480.0, "step": 1109 }, { "entropy": 0.5737668871879578, "epoch": 1.0354643023798413, "grad_norm": 0.1952465921640396, "learning_rate": 0.0002, "loss": 0.567, "mean_token_accuracy": 0.7681820094585419, "num_tokens": 4019085.0, "step": 1110 }, { "entropy": 0.644490122795105, "epoch": 1.0363975734951003, "grad_norm": 0.25606557726860046, "learning_rate": 0.0002, "loss": 0.6324, "mean_token_accuracy": 0.7501639127731323, "num_tokens": 4022651.0, "step": 1111 }, { "entropy": 0.5867243409156799, "epoch": 1.0373308446103593, "grad_norm": 0.2205377072095871, "learning_rate": 0.0002, "loss": 0.5917, "mean_token_accuracy": 0.7571871876716614, "num_tokens": 4026377.0, "step": 1112 }, { "entropy": 0.5699877291917801, "epoch": 1.0382641157256183, "grad_norm": 0.23570841550827026, "learning_rate": 0.0002, "loss": 0.589, "mean_token_accuracy": 0.758704423904419, "num_tokens": 4029972.0, "step": 1113 }, { "entropy": 0.6078994870185852, "epoch": 1.0391973868408773, "grad_norm": 0.2656703293323517, "learning_rate": 0.0002, "loss": 0.6044, "mean_token_accuracy": 0.7594700753688812, "num_tokens": 4033677.0, "step": 1114 }, { "entropy": 0.5898154526948929, "epoch": 1.0401306579561362, "grad_norm": 0.33408045768737793, "learning_rate": 0.0002, "loss": 0.6104, "mean_token_accuracy": 0.7562896609306335, "num_tokens": 4037229.0, "step": 1115 }, { "entropy": 0.5738260746002197, "epoch": 1.0410639290713952, "grad_norm": 0.23784619569778442, "learning_rate": 0.0002, "loss": 0.5775, "mean_token_accuracy": 0.7698379755020142, "num_tokens": 4040740.0, "step": 1116 }, { "entropy": 0.612351581454277, "epoch": 1.0419972001866542, "grad_norm": 0.3253702223300934, "learning_rate": 0.0002, "loss": 0.6187, "mean_token_accuracy": 0.7500055879354477, "num_tokens": 4044452.0, "step": 1117 }, { "entropy": 0.6397425085306168, "epoch": 1.0429304713019132, "grad_norm": 0.3223883807659149, "learning_rate": 0.0002, "loss": 0.6511, "mean_token_accuracy": 0.7367266863584518, "num_tokens": 4048021.0, "step": 1118 }, { "entropy": 0.6021498441696167, "epoch": 1.0438637424171722, "grad_norm": 0.22905658185482025, "learning_rate": 0.0002, "loss": 0.6132, "mean_token_accuracy": 0.7562064677476883, "num_tokens": 4051626.0, "step": 1119 }, { "entropy": 0.5832901895046234, "epoch": 1.0447970135324312, "grad_norm": 0.20937184989452362, "learning_rate": 0.0002, "loss": 0.5773, "mean_token_accuracy": 0.7596762478351593, "num_tokens": 4055257.0, "step": 1120 }, { "entropy": 0.5740011632442474, "epoch": 1.0457302846476901, "grad_norm": 0.25523680448532104, "learning_rate": 0.0002, "loss": 0.5685, "mean_token_accuracy": 0.772971898317337, "num_tokens": 4058958.0, "step": 1121 }, { "entropy": 0.641285315155983, "epoch": 1.0466635557629491, "grad_norm": 0.3533610701560974, "learning_rate": 0.0002, "loss": 0.6353, "mean_token_accuracy": 0.74741992354393, "num_tokens": 4062605.0, "step": 1122 }, { "entropy": 0.640784502029419, "epoch": 1.047596826878208, "grad_norm": 0.2478134036064148, "learning_rate": 0.0002, "loss": 0.6236, "mean_token_accuracy": 0.7472634613513947, "num_tokens": 4066290.0, "step": 1123 }, { "entropy": 0.5779941976070404, "epoch": 1.048530097993467, "grad_norm": 0.2672136425971985, "learning_rate": 0.0002, "loss": 0.5817, "mean_token_accuracy": 0.7600543946027756, "num_tokens": 4069805.0, "step": 1124 }, { "entropy": 0.6039032191038132, "epoch": 1.049463369108726, "grad_norm": 0.24231916666030884, "learning_rate": 0.0002, "loss": 0.605, "mean_token_accuracy": 0.7560063302516937, "num_tokens": 4073382.0, "step": 1125 }, { "entropy": 0.5956085175275803, "epoch": 1.050396640223985, "grad_norm": 0.2685554027557373, "learning_rate": 0.0002, "loss": 0.5965, "mean_token_accuracy": 0.7561258673667908, "num_tokens": 4076981.0, "step": 1126 }, { "entropy": 0.5841371566057205, "epoch": 1.051329911339244, "grad_norm": 0.2500322461128235, "learning_rate": 0.0002, "loss": 0.6035, "mean_token_accuracy": 0.7554816007614136, "num_tokens": 4080760.0, "step": 1127 }, { "entropy": 0.5864620804786682, "epoch": 1.052263182454503, "grad_norm": 0.2479894459247589, "learning_rate": 0.0002, "loss": 0.5986, "mean_token_accuracy": 0.7476089745759964, "num_tokens": 4084367.0, "step": 1128 }, { "entropy": 0.587070569396019, "epoch": 1.053196453569762, "grad_norm": 0.24735517799854279, "learning_rate": 0.0002, "loss": 0.5828, "mean_token_accuracy": 0.7636759281158447, "num_tokens": 4087988.0, "step": 1129 }, { "entropy": 0.582788422703743, "epoch": 1.054129724685021, "grad_norm": 0.22572675347328186, "learning_rate": 0.0002, "loss": 0.5988, "mean_token_accuracy": 0.756134420633316, "num_tokens": 4091485.0, "step": 1130 }, { "entropy": 0.598074734210968, "epoch": 1.05506299580028, "grad_norm": 0.2230987548828125, "learning_rate": 0.0002, "loss": 0.601, "mean_token_accuracy": 0.7569568604230881, "num_tokens": 4095082.0, "step": 1131 }, { "entropy": 0.5896729230880737, "epoch": 1.055996266915539, "grad_norm": 0.22499051690101624, "learning_rate": 0.0002, "loss": 0.5938, "mean_token_accuracy": 0.7567136138677597, "num_tokens": 4098785.0, "step": 1132 }, { "entropy": 0.5635609030723572, "epoch": 1.056929538030798, "grad_norm": 0.2337937355041504, "learning_rate": 0.0002, "loss": 0.5736, "mean_token_accuracy": 0.7681312561035156, "num_tokens": 4102425.0, "step": 1133 }, { "entropy": 0.5862243622541428, "epoch": 1.057862809146057, "grad_norm": 0.23709647357463837, "learning_rate": 0.0002, "loss": 0.5867, "mean_token_accuracy": 0.7586928755044937, "num_tokens": 4106096.0, "step": 1134 }, { "entropy": 0.5754620730876923, "epoch": 1.0587960802613159, "grad_norm": 0.21770831942558289, "learning_rate": 0.0002, "loss": 0.5714, "mean_token_accuracy": 0.7665442377328873, "num_tokens": 4109851.0, "step": 1135 }, { "entropy": 0.5592700839042664, "epoch": 1.0597293513765749, "grad_norm": 0.2438599169254303, "learning_rate": 0.0002, "loss": 0.5641, "mean_token_accuracy": 0.776313915848732, "num_tokens": 4113514.0, "step": 1136 }, { "entropy": 0.5790890753269196, "epoch": 1.0606626224918339, "grad_norm": 0.25690758228302, "learning_rate": 0.0002, "loss": 0.5881, "mean_token_accuracy": 0.7611184418201447, "num_tokens": 4117117.0, "step": 1137 }, { "entropy": 0.5517263114452362, "epoch": 1.0615958936070928, "grad_norm": 0.25708916783332825, "learning_rate": 0.0002, "loss": 0.5542, "mean_token_accuracy": 0.7794785797595978, "num_tokens": 4120666.0, "step": 1138 }, { "entropy": 0.6028632372617722, "epoch": 1.0625291647223518, "grad_norm": 0.2288038283586502, "learning_rate": 0.0002, "loss": 0.6106, "mean_token_accuracy": 0.7473186254501343, "num_tokens": 4124264.0, "step": 1139 }, { "entropy": 0.5923435837030411, "epoch": 1.0634624358376108, "grad_norm": 0.2501973509788513, "learning_rate": 0.0002, "loss": 0.5898, "mean_token_accuracy": 0.7615616023540497, "num_tokens": 4127890.0, "step": 1140 }, { "entropy": 0.6162766814231873, "epoch": 1.0643957069528698, "grad_norm": 0.24172000586986542, "learning_rate": 0.0002, "loss": 0.6221, "mean_token_accuracy": 0.7485285699367523, "num_tokens": 4131544.0, "step": 1141 }, { "entropy": 0.5264947488903999, "epoch": 1.0653289780681288, "grad_norm": 0.2752936780452728, "learning_rate": 0.0002, "loss": 0.5361, "mean_token_accuracy": 0.7841377258300781, "num_tokens": 4135110.0, "step": 1142 }, { "entropy": 0.5886484757065773, "epoch": 1.0662622491833877, "grad_norm": 0.2418910712003708, "learning_rate": 0.0002, "loss": 0.5939, "mean_token_accuracy": 0.7641568183898926, "num_tokens": 4138728.0, "step": 1143 }, { "entropy": 0.5947759449481964, "epoch": 1.0671955202986467, "grad_norm": 0.23581650853157043, "learning_rate": 0.0002, "loss": 0.5975, "mean_token_accuracy": 0.7614518702030182, "num_tokens": 4142277.0, "step": 1144 }, { "entropy": 0.6008647680282593, "epoch": 1.0681287914139057, "grad_norm": 0.25615864992141724, "learning_rate": 0.0002, "loss": 0.6078, "mean_token_accuracy": 0.7585248053073883, "num_tokens": 4145854.0, "step": 1145 }, { "entropy": 0.5731842070817947, "epoch": 1.0690620625291647, "grad_norm": 0.23907384276390076, "learning_rate": 0.0002, "loss": 0.5636, "mean_token_accuracy": 0.773061141371727, "num_tokens": 4149406.0, "step": 1146 }, { "entropy": 0.5816588550806046, "epoch": 1.0699953336444237, "grad_norm": 0.2512907385826111, "learning_rate": 0.0002, "loss": 0.5722, "mean_token_accuracy": 0.7715330570936203, "num_tokens": 4153025.0, "step": 1147 }, { "entropy": 0.589260146021843, "epoch": 1.0709286047596827, "grad_norm": 0.24153655767440796, "learning_rate": 0.0002, "loss": 0.591, "mean_token_accuracy": 0.7602186799049377, "num_tokens": 4156623.0, "step": 1148 }, { "entropy": 0.5966326892375946, "epoch": 1.0718618758749416, "grad_norm": 0.3344581425189972, "learning_rate": 0.0002, "loss": 0.613, "mean_token_accuracy": 0.7491293251514435, "num_tokens": 4160153.0, "step": 1149 }, { "entropy": 0.6240898072719574, "epoch": 1.0727951469902006, "grad_norm": 0.2442578375339508, "learning_rate": 0.0002, "loss": 0.6185, "mean_token_accuracy": 0.747322678565979, "num_tokens": 4163798.0, "step": 1150 }, { "entropy": 0.6107754409313202, "epoch": 1.0737284181054596, "grad_norm": 0.24257411062717438, "learning_rate": 0.0002, "loss": 0.6116, "mean_token_accuracy": 0.7470063716173172, "num_tokens": 4167457.0, "step": 1151 }, { "entropy": 0.5795059204101562, "epoch": 1.0746616892207186, "grad_norm": 0.23712590336799622, "learning_rate": 0.0002, "loss": 0.5902, "mean_token_accuracy": 0.7629116028547287, "num_tokens": 4171010.0, "step": 1152 }, { "entropy": 0.5959185659885406, "epoch": 1.0755949603359776, "grad_norm": 0.27526646852493286, "learning_rate": 0.0002, "loss": 0.6057, "mean_token_accuracy": 0.7526157647371292, "num_tokens": 4174615.0, "step": 1153 }, { "entropy": 0.5743700712919235, "epoch": 1.0765282314512366, "grad_norm": 0.2640918493270874, "learning_rate": 0.0002, "loss": 0.5851, "mean_token_accuracy": 0.7643108069896698, "num_tokens": 4178086.0, "step": 1154 }, { "entropy": 0.6026676446199417, "epoch": 1.0774615025664955, "grad_norm": 0.23615190386772156, "learning_rate": 0.0002, "loss": 0.6095, "mean_token_accuracy": 0.7554437220096588, "num_tokens": 4181758.0, "step": 1155 }, { "entropy": 0.5782712250947952, "epoch": 1.0783947736817545, "grad_norm": 0.24181759357452393, "learning_rate": 0.0002, "loss": 0.5918, "mean_token_accuracy": 0.7554184943437576, "num_tokens": 4185254.0, "step": 1156 }, { "entropy": 0.6011746376752853, "epoch": 1.0793280447970135, "grad_norm": 0.29363587498664856, "learning_rate": 0.0002, "loss": 0.6014, "mean_token_accuracy": 0.7554003447294235, "num_tokens": 4188836.0, "step": 1157 }, { "entropy": 0.6032052636146545, "epoch": 1.0802613159122725, "grad_norm": 0.2624664306640625, "learning_rate": 0.0002, "loss": 0.6069, "mean_token_accuracy": 0.751806765794754, "num_tokens": 4192494.0, "step": 1158 }, { "entropy": 0.6324547827243805, "epoch": 1.0811945870275315, "grad_norm": 0.2547738552093506, "learning_rate": 0.0002, "loss": 0.641, "mean_token_accuracy": 0.7410971373319626, "num_tokens": 4196093.0, "step": 1159 }, { "entropy": 0.6311033517122269, "epoch": 1.0821278581427904, "grad_norm": 0.30315619707107544, "learning_rate": 0.0002, "loss": 0.6381, "mean_token_accuracy": 0.7453387379646301, "num_tokens": 4199659.0, "step": 1160 }, { "entropy": 0.6441246718168259, "epoch": 1.0830611292580494, "grad_norm": 0.2667006850242615, "learning_rate": 0.0002, "loss": 0.6454, "mean_token_accuracy": 0.7383474409580231, "num_tokens": 4203389.0, "step": 1161 }, { "entropy": 0.6132991313934326, "epoch": 1.0839944003733084, "grad_norm": 0.37008944153785706, "learning_rate": 0.0002, "loss": 0.6279, "mean_token_accuracy": 0.750207930803299, "num_tokens": 4206908.0, "step": 1162 }, { "entropy": 0.6350075751543045, "epoch": 1.0849276714885674, "grad_norm": 0.3072596788406372, "learning_rate": 0.0002, "loss": 0.6323, "mean_token_accuracy": 0.7425640821456909, "num_tokens": 4210647.0, "step": 1163 }, { "entropy": 0.6430349797010422, "epoch": 1.0858609426038264, "grad_norm": 0.3440340459346771, "learning_rate": 0.0002, "loss": 0.623, "mean_token_accuracy": 0.7502025365829468, "num_tokens": 4214316.0, "step": 1164 }, { "entropy": 0.5548781454563141, "epoch": 1.0867942137190854, "grad_norm": 0.20711055397987366, "learning_rate": 0.0002, "loss": 0.552, "mean_token_accuracy": 0.7704902291297913, "num_tokens": 4217819.0, "step": 1165 }, { "entropy": 0.6207278370857239, "epoch": 1.0877274848343443, "grad_norm": 0.2708820700645447, "learning_rate": 0.0002, "loss": 0.62, "mean_token_accuracy": 0.7473660260438919, "num_tokens": 4221511.0, "step": 1166 }, { "entropy": 0.6122444272041321, "epoch": 1.0886607559496033, "grad_norm": 0.269202321767807, "learning_rate": 0.0002, "loss": 0.6248, "mean_token_accuracy": 0.7493595480918884, "num_tokens": 4225219.0, "step": 1167 }, { "entropy": 0.5787427872419357, "epoch": 1.0895940270648623, "grad_norm": 0.30925384163856506, "learning_rate": 0.0002, "loss": 0.5864, "mean_token_accuracy": 0.7691614776849747, "num_tokens": 4228933.0, "step": 1168 }, { "entropy": 0.569681778550148, "epoch": 1.0905272981801213, "grad_norm": 0.24758674204349518, "learning_rate": 0.0002, "loss": 0.5805, "mean_token_accuracy": 0.7642954140901566, "num_tokens": 4232591.0, "step": 1169 }, { "entropy": 0.5785565674304962, "epoch": 1.0914605692953803, "grad_norm": 0.24864482879638672, "learning_rate": 0.0002, "loss": 0.5837, "mean_token_accuracy": 0.7686823457479477, "num_tokens": 4236263.0, "step": 1170 }, { "entropy": 0.6027144640684128, "epoch": 1.0923938404106392, "grad_norm": 0.26291942596435547, "learning_rate": 0.0002, "loss": 0.604, "mean_token_accuracy": 0.7660268694162369, "num_tokens": 4239814.0, "step": 1171 }, { "entropy": 0.6569175571203232, "epoch": 1.0933271115258982, "grad_norm": 0.23418675363063812, "learning_rate": 0.0002, "loss": 0.6517, "mean_token_accuracy": 0.7383657991886139, "num_tokens": 4243542.0, "step": 1172 }, { "entropy": 0.60727459192276, "epoch": 1.0942603826411572, "grad_norm": 0.34153351187705994, "learning_rate": 0.0002, "loss": 0.6211, "mean_token_accuracy": 0.7495335936546326, "num_tokens": 4247081.0, "step": 1173 }, { "entropy": 0.6033570468425751, "epoch": 1.0951936537564162, "grad_norm": 0.2686994969844818, "learning_rate": 0.0002, "loss": 0.6031, "mean_token_accuracy": 0.7590598464012146, "num_tokens": 4250889.0, "step": 1174 }, { "entropy": 0.6123284995555878, "epoch": 1.0961269248716752, "grad_norm": 0.2793242037296295, "learning_rate": 0.0002, "loss": 0.6159, "mean_token_accuracy": 0.7517882436513901, "num_tokens": 4254535.0, "step": 1175 }, { "entropy": 0.5875754952430725, "epoch": 1.0970601959869342, "grad_norm": 0.2094554752111435, "learning_rate": 0.0002, "loss": 0.5849, "mean_token_accuracy": 0.7631491124629974, "num_tokens": 4258043.0, "step": 1176 }, { "entropy": 0.5772421509027481, "epoch": 1.0979934671021931, "grad_norm": 0.22839365899562836, "learning_rate": 0.0002, "loss": 0.5752, "mean_token_accuracy": 0.7617000192403793, "num_tokens": 4261746.0, "step": 1177 }, { "entropy": 0.5733971297740936, "epoch": 1.0989267382174521, "grad_norm": 0.2564690113067627, "learning_rate": 0.0002, "loss": 0.5753, "mean_token_accuracy": 0.7674972116947174, "num_tokens": 4265407.0, "step": 1178 }, { "entropy": 0.6072157770395279, "epoch": 1.099860009332711, "grad_norm": 0.2840120792388916, "learning_rate": 0.0002, "loss": 0.6145, "mean_token_accuracy": 0.7443269640207291, "num_tokens": 4268970.0, "step": 1179 }, { "entropy": 0.5985493212938309, "epoch": 1.10079328044797, "grad_norm": 0.26341280341148376, "learning_rate": 0.0002, "loss": 0.6023, "mean_token_accuracy": 0.7549693733453751, "num_tokens": 4272632.0, "step": 1180 }, { "entropy": 0.5944692045450211, "epoch": 1.101726551563229, "grad_norm": 0.26915258169174194, "learning_rate": 0.0002, "loss": 0.5939, "mean_token_accuracy": 0.7573886662721634, "num_tokens": 4276389.0, "step": 1181 }, { "entropy": 0.5892274379730225, "epoch": 1.102659822678488, "grad_norm": 0.29591578245162964, "learning_rate": 0.0002, "loss": 0.5966, "mean_token_accuracy": 0.7608792334794998, "num_tokens": 4280018.0, "step": 1182 }, { "entropy": 0.6051250100135803, "epoch": 1.103593093793747, "grad_norm": 0.2370976060628891, "learning_rate": 0.0002, "loss": 0.6051, "mean_token_accuracy": 0.7540702074766159, "num_tokens": 4283652.0, "step": 1183 }, { "entropy": 0.5938491076231003, "epoch": 1.104526364909006, "grad_norm": 0.2985800504684448, "learning_rate": 0.0002, "loss": 0.5891, "mean_token_accuracy": 0.7682900875806808, "num_tokens": 4287240.0, "step": 1184 }, { "entropy": 0.6068399101495743, "epoch": 1.105459636024265, "grad_norm": 0.23221342265605927, "learning_rate": 0.0002, "loss": 0.6163, "mean_token_accuracy": 0.7556649297475815, "num_tokens": 4290868.0, "step": 1185 }, { "entropy": 0.6331735104322433, "epoch": 1.106392907139524, "grad_norm": 0.24381299316883087, "learning_rate": 0.0002, "loss": 0.6331, "mean_token_accuracy": 0.7460650056600571, "num_tokens": 4294579.0, "step": 1186 }, { "entropy": 0.6117393672466278, "epoch": 1.107326178254783, "grad_norm": 0.23209865391254425, "learning_rate": 0.0002, "loss": 0.6163, "mean_token_accuracy": 0.7549971640110016, "num_tokens": 4298223.0, "step": 1187 }, { "entropy": 0.6011423766613007, "epoch": 1.108259449370042, "grad_norm": 0.246726855635643, "learning_rate": 0.0002, "loss": 0.5988, "mean_token_accuracy": 0.7524861991405487, "num_tokens": 4301730.0, "step": 1188 }, { "entropy": 0.607647180557251, "epoch": 1.109192720485301, "grad_norm": 0.23154184222221375, "learning_rate": 0.0002, "loss": 0.6045, "mean_token_accuracy": 0.7544190883636475, "num_tokens": 4305396.0, "step": 1189 }, { "entropy": 0.6149439960718155, "epoch": 1.11012599160056, "grad_norm": 0.24387352168560028, "learning_rate": 0.0002, "loss": 0.6277, "mean_token_accuracy": 0.7439283281564713, "num_tokens": 4309064.0, "step": 1190 }, { "entropy": 0.5930565893650055, "epoch": 1.111059262715819, "grad_norm": 0.24471460282802582, "learning_rate": 0.0002, "loss": 0.5991, "mean_token_accuracy": 0.7566158026456833, "num_tokens": 4312788.0, "step": 1191 }, { "entropy": 0.5890672951936722, "epoch": 1.1119925338310779, "grad_norm": 0.2569296360015869, "learning_rate": 0.0002, "loss": 0.5878, "mean_token_accuracy": 0.7639264166355133, "num_tokens": 4316495.0, "step": 1192 }, { "entropy": 0.5656487345695496, "epoch": 1.1129258049463369, "grad_norm": 0.23919855058193207, "learning_rate": 0.0002, "loss": 0.5757, "mean_token_accuracy": 0.7656695246696472, "num_tokens": 4320148.0, "step": 1193 }, { "entropy": 0.6132683306932449, "epoch": 1.1138590760615958, "grad_norm": 0.2555757462978363, "learning_rate": 0.0002, "loss": 0.6158, "mean_token_accuracy": 0.7533993870019913, "num_tokens": 4323777.0, "step": 1194 }, { "entropy": 0.587346151471138, "epoch": 1.1147923471768548, "grad_norm": 0.26703810691833496, "learning_rate": 0.0002, "loss": 0.5997, "mean_token_accuracy": 0.7558653354644775, "num_tokens": 4327237.0, "step": 1195 }, { "entropy": 0.5937947332859039, "epoch": 1.1157256182921138, "grad_norm": 0.25903961062431335, "learning_rate": 0.0002, "loss": 0.6022, "mean_token_accuracy": 0.7513121068477631, "num_tokens": 4331053.0, "step": 1196 }, { "entropy": 0.6112654060125351, "epoch": 1.1166588894073728, "grad_norm": 0.2694290280342102, "learning_rate": 0.0002, "loss": 0.6097, "mean_token_accuracy": 0.7491072565317154, "num_tokens": 4334725.0, "step": 1197 }, { "entropy": 0.6162684857845306, "epoch": 1.1175921605226318, "grad_norm": 0.5101764798164368, "learning_rate": 0.0002, "loss": 0.6186, "mean_token_accuracy": 0.7528711557388306, "num_tokens": 4338174.0, "step": 1198 }, { "entropy": 0.6252504885196686, "epoch": 1.1185254316378908, "grad_norm": 0.2834681570529938, "learning_rate": 0.0002, "loss": 0.6378, "mean_token_accuracy": 0.738287091255188, "num_tokens": 4341776.0, "step": 1199 }, { "entropy": 0.5828897804021835, "epoch": 1.1194587027531497, "grad_norm": 0.26412123441696167, "learning_rate": 0.0002, "loss": 0.5866, "mean_token_accuracy": 0.7664359211921692, "num_tokens": 4345378.0, "step": 1200 }, { "entropy": 0.5789649337530136, "epoch": 1.1203919738684087, "grad_norm": 0.349213570356369, "learning_rate": 0.0002, "loss": 0.5874, "mean_token_accuracy": 0.7617710679769516, "num_tokens": 4348977.0, "step": 1201 }, { "entropy": 0.6366727203130722, "epoch": 1.1213252449836677, "grad_norm": 0.39819931983947754, "learning_rate": 0.0002, "loss": 0.644, "mean_token_accuracy": 0.7355251163244247, "num_tokens": 4352650.0, "step": 1202 }, { "entropy": 0.5928388386964798, "epoch": 1.1222585160989267, "grad_norm": 0.2784482538700104, "learning_rate": 0.0002, "loss": 0.5947, "mean_token_accuracy": 0.7671993523836136, "num_tokens": 4356205.0, "step": 1203 }, { "entropy": 0.6235855668783188, "epoch": 1.1231917872141857, "grad_norm": 0.33394819498062134, "learning_rate": 0.0002, "loss": 0.6201, "mean_token_accuracy": 0.7540284097194672, "num_tokens": 4359875.0, "step": 1204 }, { "entropy": 0.6198291927576065, "epoch": 1.1241250583294446, "grad_norm": 0.2754463851451874, "learning_rate": 0.0002, "loss": 0.6159, "mean_token_accuracy": 0.7490982115268707, "num_tokens": 4363491.0, "step": 1205 }, { "entropy": 0.6078521013259888, "epoch": 1.1250583294447036, "grad_norm": 0.3236178159713745, "learning_rate": 0.0002, "loss": 0.6162, "mean_token_accuracy": 0.7530374526977539, "num_tokens": 4367177.0, "step": 1206 }, { "entropy": 0.5424081832170486, "epoch": 1.1259916005599626, "grad_norm": 0.3986901640892029, "learning_rate": 0.0002, "loss": 0.5692, "mean_token_accuracy": 0.7697962820529938, "num_tokens": 4370851.0, "step": 1207 }, { "entropy": 0.6053501665592194, "epoch": 1.1269248716752216, "grad_norm": 0.3303784132003784, "learning_rate": 0.0002, "loss": 0.622, "mean_token_accuracy": 0.7490976750850677, "num_tokens": 4374449.0, "step": 1208 }, { "entropy": 0.5833835601806641, "epoch": 1.1278581427904806, "grad_norm": 0.2646164894104004, "learning_rate": 0.0002, "loss": 0.5908, "mean_token_accuracy": 0.7621305137872696, "num_tokens": 4378133.0, "step": 1209 }, { "entropy": 0.6136528551578522, "epoch": 1.1287914139057396, "grad_norm": 0.2569977343082428, "learning_rate": 0.0002, "loss": 0.6194, "mean_token_accuracy": 0.7473931908607483, "num_tokens": 4381831.0, "step": 1210 }, { "entropy": 0.5897126793861389, "epoch": 1.1297246850209985, "grad_norm": 0.21203143894672394, "learning_rate": 0.0002, "loss": 0.597, "mean_token_accuracy": 0.75443334877491, "num_tokens": 4385392.0, "step": 1211 }, { "entropy": 0.5773434489965439, "epoch": 1.1306579561362575, "grad_norm": 0.2757454216480255, "learning_rate": 0.0002, "loss": 0.5822, "mean_token_accuracy": 0.7632689028978348, "num_tokens": 4388995.0, "step": 1212 }, { "entropy": 0.6299498975276947, "epoch": 1.1315912272515165, "grad_norm": 0.2603781521320343, "learning_rate": 0.0002, "loss": 0.63, "mean_token_accuracy": 0.7461886405944824, "num_tokens": 4392601.0, "step": 1213 }, { "entropy": 0.5857987701892853, "epoch": 1.1325244983667755, "grad_norm": 0.2547178864479065, "learning_rate": 0.0002, "loss": 0.5809, "mean_token_accuracy": 0.7647912055253983, "num_tokens": 4396087.0, "step": 1214 }, { "entropy": 0.6124046444892883, "epoch": 1.1334577694820345, "grad_norm": 0.24504396319389343, "learning_rate": 0.0002, "loss": 0.6128, "mean_token_accuracy": 0.7556785643100739, "num_tokens": 4399742.0, "step": 1215 }, { "entropy": 0.6174919605255127, "epoch": 1.1343910405972935, "grad_norm": 0.2754562199115753, "learning_rate": 0.0002, "loss": 0.6341, "mean_token_accuracy": 0.748796284198761, "num_tokens": 4403327.0, "step": 1216 }, { "entropy": 0.5783385038375854, "epoch": 1.1353243117125524, "grad_norm": 0.2842562198638916, "learning_rate": 0.0002, "loss": 0.5839, "mean_token_accuracy": 0.7605612725019455, "num_tokens": 4406886.0, "step": 1217 }, { "entropy": 0.6225049942731857, "epoch": 1.1362575828278114, "grad_norm": 0.2692372500896454, "learning_rate": 0.0002, "loss": 0.6192, "mean_token_accuracy": 0.7417872995138168, "num_tokens": 4410555.0, "step": 1218 }, { "entropy": 0.5643046945333481, "epoch": 1.1371908539430704, "grad_norm": 0.2385636270046234, "learning_rate": 0.0002, "loss": 0.5711, "mean_token_accuracy": 0.7677874863147736, "num_tokens": 4414137.0, "step": 1219 }, { "entropy": 0.594844326376915, "epoch": 1.1381241250583294, "grad_norm": 0.23388004302978516, "learning_rate": 0.0002, "loss": 0.594, "mean_token_accuracy": 0.7580701559782028, "num_tokens": 4417778.0, "step": 1220 }, { "entropy": 0.5765068084001541, "epoch": 1.1390573961735884, "grad_norm": 0.22006511688232422, "learning_rate": 0.0002, "loss": 0.5796, "mean_token_accuracy": 0.7644301503896713, "num_tokens": 4421478.0, "step": 1221 }, { "entropy": 0.5905089676380157, "epoch": 1.1399906672888473, "grad_norm": 0.23555630445480347, "learning_rate": 0.0002, "loss": 0.5916, "mean_token_accuracy": 0.7595047503709793, "num_tokens": 4425214.0, "step": 1222 }, { "entropy": 0.6000205725431442, "epoch": 1.1409239384041063, "grad_norm": 0.27142801880836487, "learning_rate": 0.0002, "loss": 0.601, "mean_token_accuracy": 0.7592317312955856, "num_tokens": 4428828.0, "step": 1223 }, { "entropy": 0.592579111456871, "epoch": 1.1418572095193653, "grad_norm": 0.27291470766067505, "learning_rate": 0.0002, "loss": 0.5998, "mean_token_accuracy": 0.7624055594205856, "num_tokens": 4432390.0, "step": 1224 }, { "entropy": 0.6204696595668793, "epoch": 1.1427904806346243, "grad_norm": 0.3029044270515442, "learning_rate": 0.0002, "loss": 0.6366, "mean_token_accuracy": 0.7396967262029648, "num_tokens": 4436000.0, "step": 1225 }, { "entropy": 0.5445071905851364, "epoch": 1.1437237517498833, "grad_norm": 0.2707897424697876, "learning_rate": 0.0002, "loss": 0.5504, "mean_token_accuracy": 0.7786100655794144, "num_tokens": 4439496.0, "step": 1226 }, { "entropy": 0.5861040204763412, "epoch": 1.1446570228651423, "grad_norm": 0.2305113673210144, "learning_rate": 0.0002, "loss": 0.6045, "mean_token_accuracy": 0.754399910569191, "num_tokens": 4443156.0, "step": 1227 }, { "entropy": 0.5639618188142776, "epoch": 1.1455902939804012, "grad_norm": 0.22435829043388367, "learning_rate": 0.0002, "loss": 0.564, "mean_token_accuracy": 0.7689625024795532, "num_tokens": 4446759.0, "step": 1228 }, { "entropy": 0.5875468701124191, "epoch": 1.1465235650956602, "grad_norm": 0.2936243414878845, "learning_rate": 0.0002, "loss": 0.603, "mean_token_accuracy": 0.7542673200368881, "num_tokens": 4450305.0, "step": 1229 }, { "entropy": 0.6033259183168411, "epoch": 1.1474568362109192, "grad_norm": 0.2175438553094864, "learning_rate": 0.0002, "loss": 0.6061, "mean_token_accuracy": 0.7558301091194153, "num_tokens": 4453956.0, "step": 1230 }, { "entropy": 0.6540074199438095, "epoch": 1.1483901073261782, "grad_norm": 0.2724934220314026, "learning_rate": 0.0002, "loss": 0.6552, "mean_token_accuracy": 0.7300193607807159, "num_tokens": 4457469.0, "step": 1231 }, { "entropy": 0.6359201222658157, "epoch": 1.1493233784414372, "grad_norm": 0.22295457124710083, "learning_rate": 0.0002, "loss": 0.6358, "mean_token_accuracy": 0.7422886490821838, "num_tokens": 4461116.0, "step": 1232 }, { "entropy": 0.595000609755516, "epoch": 1.1502566495566962, "grad_norm": 0.24836499989032745, "learning_rate": 0.0002, "loss": 0.5918, "mean_token_accuracy": 0.7612973749637604, "num_tokens": 4464762.0, "step": 1233 }, { "entropy": 0.6260000467300415, "epoch": 1.1511899206719551, "grad_norm": 0.28300777077674866, "learning_rate": 0.0002, "loss": 0.6167, "mean_token_accuracy": 0.746180847287178, "num_tokens": 4468274.0, "step": 1234 }, { "entropy": 0.598585307598114, "epoch": 1.1521231917872141, "grad_norm": 0.2436583787202835, "learning_rate": 0.0002, "loss": 0.5951, "mean_token_accuracy": 0.7569845765829086, "num_tokens": 4471904.0, "step": 1235 }, { "entropy": 0.6158511638641357, "epoch": 1.153056462902473, "grad_norm": 0.24836213886737823, "learning_rate": 0.0002, "loss": 0.6211, "mean_token_accuracy": 0.747326597571373, "num_tokens": 4475642.0, "step": 1236 }, { "entropy": 0.60733462870121, "epoch": 1.153989734017732, "grad_norm": 0.2428881973028183, "learning_rate": 0.0002, "loss": 0.6199, "mean_token_accuracy": 0.7501186430454254, "num_tokens": 4479212.0, "step": 1237 }, { "entropy": 0.5970664471387863, "epoch": 1.154923005132991, "grad_norm": 0.2818988859653473, "learning_rate": 0.0002, "loss": 0.6141, "mean_token_accuracy": 0.7534895092248917, "num_tokens": 4482816.0, "step": 1238 }, { "entropy": 0.6405521929264069, "epoch": 1.15585627624825, "grad_norm": 0.22499719262123108, "learning_rate": 0.0002, "loss": 0.6399, "mean_token_accuracy": 0.7468511760234833, "num_tokens": 4486474.0, "step": 1239 }, { "entropy": 0.6492093056440353, "epoch": 1.156789547363509, "grad_norm": 0.23098193109035492, "learning_rate": 0.0002, "loss": 0.6522, "mean_token_accuracy": 0.7387702763080597, "num_tokens": 4490191.0, "step": 1240 }, { "entropy": 0.6099929213523865, "epoch": 1.157722818478768, "grad_norm": 0.2817385196685791, "learning_rate": 0.0002, "loss": 0.6109, "mean_token_accuracy": 0.7527201473712921, "num_tokens": 4493840.0, "step": 1241 }, { "entropy": 0.6168241947889328, "epoch": 1.158656089594027, "grad_norm": 0.2618480920791626, "learning_rate": 0.0002, "loss": 0.627, "mean_token_accuracy": 0.7412067651748657, "num_tokens": 4497476.0, "step": 1242 }, { "entropy": 0.6164257228374481, "epoch": 1.159589360709286, "grad_norm": 0.2302348017692566, "learning_rate": 0.0002, "loss": 0.6322, "mean_token_accuracy": 0.7481765151023865, "num_tokens": 4501079.0, "step": 1243 }, { "entropy": 0.6133585870265961, "epoch": 1.160522631824545, "grad_norm": 0.20954573154449463, "learning_rate": 0.0002, "loss": 0.6158, "mean_token_accuracy": 0.7527308911085129, "num_tokens": 4504701.0, "step": 1244 }, { "entropy": 0.6110677272081375, "epoch": 1.161455902939804, "grad_norm": 0.28969037532806396, "learning_rate": 0.0002, "loss": 0.6269, "mean_token_accuracy": 0.7444902509450912, "num_tokens": 4508224.0, "step": 1245 }, { "entropy": 0.6428615599870682, "epoch": 1.162389174055063, "grad_norm": 0.2537444233894348, "learning_rate": 0.0002, "loss": 0.6423, "mean_token_accuracy": 0.7338407337665558, "num_tokens": 4511940.0, "step": 1246 }, { "entropy": 0.611653208732605, "epoch": 1.163322445170322, "grad_norm": 0.21702557802200317, "learning_rate": 0.0002, "loss": 0.6026, "mean_token_accuracy": 0.7541915327310562, "num_tokens": 4515505.0, "step": 1247 }, { "entropy": 0.561603382229805, "epoch": 1.1642557162855809, "grad_norm": 0.22502735257148743, "learning_rate": 0.0002, "loss": 0.5703, "mean_token_accuracy": 0.768964409828186, "num_tokens": 4519165.0, "step": 1248 }, { "entropy": 0.575377494096756, "epoch": 1.1651889874008399, "grad_norm": 0.3331286311149597, "learning_rate": 0.0002, "loss": 0.5853, "mean_token_accuracy": 0.7610722631216049, "num_tokens": 4522734.0, "step": 1249 }, { "entropy": 0.5868176370859146, "epoch": 1.1661222585160989, "grad_norm": 0.2591674029827118, "learning_rate": 0.0002, "loss": 0.5864, "mean_token_accuracy": 0.7685189992189407, "num_tokens": 4526320.0, "step": 1250 }, { "entropy": 0.6042935997247696, "epoch": 1.1670555296313578, "grad_norm": 0.22394676506519318, "learning_rate": 0.0002, "loss": 0.6008, "mean_token_accuracy": 0.7590226382017136, "num_tokens": 4530047.0, "step": 1251 }, { "entropy": 0.6039951592683792, "epoch": 1.1679888007466168, "grad_norm": 0.24054843187332153, "learning_rate": 0.0002, "loss": 0.5963, "mean_token_accuracy": 0.7612219154834747, "num_tokens": 4533700.0, "step": 1252 }, { "entropy": 0.6220381408929825, "epoch": 1.1689220718618758, "grad_norm": 0.25530582666397095, "learning_rate": 0.0002, "loss": 0.6221, "mean_token_accuracy": 0.7483276724815369, "num_tokens": 4537436.0, "step": 1253 }, { "entropy": 0.6193282157182693, "epoch": 1.1698553429771348, "grad_norm": 0.27515363693237305, "learning_rate": 0.0002, "loss": 0.6322, "mean_token_accuracy": 0.7441795021295547, "num_tokens": 4540993.0, "step": 1254 }, { "entropy": 0.5813523009419441, "epoch": 1.1707886140923938, "grad_norm": 0.23564274609088898, "learning_rate": 0.0002, "loss": 0.5859, "mean_token_accuracy": 0.7721471190452576, "num_tokens": 4544588.0, "step": 1255 }, { "entropy": 0.5856733471155167, "epoch": 1.1717218852076527, "grad_norm": 0.25300657749176025, "learning_rate": 0.0002, "loss": 0.5966, "mean_token_accuracy": 0.759991779923439, "num_tokens": 4548289.0, "step": 1256 }, { "entropy": 0.5516929253935814, "epoch": 1.1726551563229117, "grad_norm": 0.22552679479122162, "learning_rate": 0.0002, "loss": 0.5587, "mean_token_accuracy": 0.7766859829425812, "num_tokens": 4551865.0, "step": 1257 }, { "entropy": 0.6129982024431229, "epoch": 1.1735884274381707, "grad_norm": 0.23444119095802307, "learning_rate": 0.0002, "loss": 0.6148, "mean_token_accuracy": 0.7562789618968964, "num_tokens": 4555555.0, "step": 1258 }, { "entropy": 0.6307830959558487, "epoch": 1.1745216985534297, "grad_norm": 0.25556129217147827, "learning_rate": 0.0002, "loss": 0.631, "mean_token_accuracy": 0.7463335692882538, "num_tokens": 4559382.0, "step": 1259 }, { "entropy": 0.5948701649904251, "epoch": 1.1754549696686887, "grad_norm": 0.2484498918056488, "learning_rate": 0.0002, "loss": 0.6007, "mean_token_accuracy": 0.7578098624944687, "num_tokens": 4562868.0, "step": 1260 }, { "entropy": 0.5887980908155441, "epoch": 1.1763882407839477, "grad_norm": 0.24880972504615784, "learning_rate": 0.0002, "loss": 0.5974, "mean_token_accuracy": 0.756829708814621, "num_tokens": 4566448.0, "step": 1261 }, { "entropy": 0.5903638154268265, "epoch": 1.1773215118992066, "grad_norm": 0.24388641119003296, "learning_rate": 0.0002, "loss": 0.5909, "mean_token_accuracy": 0.7618903815746307, "num_tokens": 4570065.0, "step": 1262 }, { "entropy": 0.5964146703481674, "epoch": 1.1782547830144656, "grad_norm": 0.2209826558828354, "learning_rate": 0.0002, "loss": 0.5898, "mean_token_accuracy": 0.7585657238960266, "num_tokens": 4573689.0, "step": 1263 }, { "entropy": 0.611288994550705, "epoch": 1.1791880541297246, "grad_norm": 0.22196796536445618, "learning_rate": 0.0002, "loss": 0.6164, "mean_token_accuracy": 0.7512204498052597, "num_tokens": 4577358.0, "step": 1264 }, { "entropy": 0.5737494975328445, "epoch": 1.1801213252449836, "grad_norm": 0.23946338891983032, "learning_rate": 0.0002, "loss": 0.5869, "mean_token_accuracy": 0.762876033782959, "num_tokens": 4580854.0, "step": 1265 }, { "entropy": 0.592563807964325, "epoch": 1.1810545963602426, "grad_norm": 0.21104271709918976, "learning_rate": 0.0002, "loss": 0.5938, "mean_token_accuracy": 0.7544963210821152, "num_tokens": 4584553.0, "step": 1266 }, { "entropy": 0.6493550539016724, "epoch": 1.1819878674755016, "grad_norm": 0.251607209444046, "learning_rate": 0.0002, "loss": 0.6515, "mean_token_accuracy": 0.7370408624410629, "num_tokens": 4588301.0, "step": 1267 }, { "entropy": 0.6119677424430847, "epoch": 1.1829211385907605, "grad_norm": 0.25489330291748047, "learning_rate": 0.0002, "loss": 0.6262, "mean_token_accuracy": 0.7474663108587265, "num_tokens": 4591901.0, "step": 1268 }, { "entropy": 0.6227080225944519, "epoch": 1.1838544097060195, "grad_norm": 0.27349942922592163, "learning_rate": 0.0002, "loss": 0.6239, "mean_token_accuracy": 0.7521779239177704, "num_tokens": 4595400.0, "step": 1269 }, { "entropy": 0.6066797524690628, "epoch": 1.1847876808212785, "grad_norm": 0.24355503916740417, "learning_rate": 0.0002, "loss": 0.6158, "mean_token_accuracy": 0.7427815347909927, "num_tokens": 4599069.0, "step": 1270 }, { "entropy": 0.6303318440914154, "epoch": 1.1857209519365375, "grad_norm": 0.22682812809944153, "learning_rate": 0.0002, "loss": 0.6331, "mean_token_accuracy": 0.7427325695753098, "num_tokens": 4602667.0, "step": 1271 }, { "entropy": 0.6137190461158752, "epoch": 1.1866542230517965, "grad_norm": 0.2294629067182541, "learning_rate": 0.0002, "loss": 0.6147, "mean_token_accuracy": 0.7477195560932159, "num_tokens": 4606264.0, "step": 1272 }, { "entropy": 0.6122864335775375, "epoch": 1.1875874941670554, "grad_norm": 0.2336626797914505, "learning_rate": 0.0002, "loss": 0.6045, "mean_token_accuracy": 0.7617202997207642, "num_tokens": 4609922.0, "step": 1273 }, { "entropy": 0.5612748265266418, "epoch": 1.1885207652823144, "grad_norm": 0.22371861338615417, "learning_rate": 0.0002, "loss": 0.5541, "mean_token_accuracy": 0.7792564034461975, "num_tokens": 4613622.0, "step": 1274 }, { "entropy": 0.625264585018158, "epoch": 1.1894540363975734, "grad_norm": 0.25951874256134033, "learning_rate": 0.0002, "loss": 0.6273, "mean_token_accuracy": 0.7421546429395676, "num_tokens": 4617149.0, "step": 1275 }, { "entropy": 0.5715575367212296, "epoch": 1.1903873075128324, "grad_norm": 0.2651207149028778, "learning_rate": 0.0002, "loss": 0.5816, "mean_token_accuracy": 0.7680575102567673, "num_tokens": 4620788.0, "step": 1276 }, { "entropy": 0.5909120589494705, "epoch": 1.1913205786280914, "grad_norm": 0.29783156514167786, "learning_rate": 0.0002, "loss": 0.6044, "mean_token_accuracy": 0.7569748014211655, "num_tokens": 4624427.0, "step": 1277 }, { "entropy": 0.6155180335044861, "epoch": 1.1922538497433504, "grad_norm": 0.23460648953914642, "learning_rate": 0.0002, "loss": 0.6249, "mean_token_accuracy": 0.7496028542518616, "num_tokens": 4628155.0, "step": 1278 }, { "entropy": 0.5642882287502289, "epoch": 1.1931871208586093, "grad_norm": 0.27352190017700195, "learning_rate": 0.0002, "loss": 0.5651, "mean_token_accuracy": 0.7708800882101059, "num_tokens": 4631771.0, "step": 1279 }, { "entropy": 0.6172695904970169, "epoch": 1.1941203919738683, "grad_norm": 0.25526896119117737, "learning_rate": 0.0002, "loss": 0.6094, "mean_token_accuracy": 0.7517412900924683, "num_tokens": 4635420.0, "step": 1280 }, { "entropy": 0.62244813144207, "epoch": 1.1950536630891273, "grad_norm": 0.2217046022415161, "learning_rate": 0.0002, "loss": 0.6179, "mean_token_accuracy": 0.7496538609266281, "num_tokens": 4639086.0, "step": 1281 }, { "entropy": 0.6513757705688477, "epoch": 1.1959869342043863, "grad_norm": 0.22331054508686066, "learning_rate": 0.0002, "loss": 0.6345, "mean_token_accuracy": 0.7409411668777466, "num_tokens": 4642848.0, "step": 1282 }, { "entropy": 0.6008409112691879, "epoch": 1.1969202053196453, "grad_norm": 0.2301636040210724, "learning_rate": 0.0002, "loss": 0.5818, "mean_token_accuracy": 0.7636523693799973, "num_tokens": 4646646.0, "step": 1283 }, { "entropy": 0.5765864551067352, "epoch": 1.1978534764349043, "grad_norm": 0.3287019431591034, "learning_rate": 0.0002, "loss": 0.5863, "mean_token_accuracy": 0.7623507380485535, "num_tokens": 4650195.0, "step": 1284 }, { "entropy": 0.5732782185077667, "epoch": 1.1987867475501632, "grad_norm": 0.33838585019111633, "learning_rate": 0.0002, "loss": 0.6068, "mean_token_accuracy": 0.7561032474040985, "num_tokens": 4653774.0, "step": 1285 }, { "entropy": 0.5909520089626312, "epoch": 1.1997200186654222, "grad_norm": 0.29115816950798035, "learning_rate": 0.0002, "loss": 0.6262, "mean_token_accuracy": 0.7417499870061874, "num_tokens": 4657457.0, "step": 1286 }, { "entropy": 0.5921488106250763, "epoch": 1.2006532897806812, "grad_norm": 0.2805023193359375, "learning_rate": 0.0002, "loss": 0.6131, "mean_token_accuracy": 0.7507372051477432, "num_tokens": 4660961.0, "step": 1287 }, { "entropy": 0.611193835735321, "epoch": 1.2015865608959402, "grad_norm": 0.24528081715106964, "learning_rate": 0.0002, "loss": 0.6079, "mean_token_accuracy": 0.7580326497554779, "num_tokens": 4664609.0, "step": 1288 }, { "entropy": 0.6044455915689468, "epoch": 1.2025198320111992, "grad_norm": 0.2232203632593155, "learning_rate": 0.0002, "loss": 0.5871, "mean_token_accuracy": 0.7652961164712906, "num_tokens": 4668344.0, "step": 1289 }, { "entropy": 0.6349958330392838, "epoch": 1.2034531031264581, "grad_norm": 0.27380385994911194, "learning_rate": 0.0002, "loss": 0.6268, "mean_token_accuracy": 0.7494397312402725, "num_tokens": 4672014.0, "step": 1290 }, { "entropy": 0.6143328994512558, "epoch": 1.2043863742417171, "grad_norm": 0.276798814535141, "learning_rate": 0.0002, "loss": 0.6195, "mean_token_accuracy": 0.7478311508893967, "num_tokens": 4675611.0, "step": 1291 }, { "entropy": 0.611118495464325, "epoch": 1.205319645356976, "grad_norm": 0.27434712648391724, "learning_rate": 0.0002, "loss": 0.6052, "mean_token_accuracy": 0.7577848434448242, "num_tokens": 4679288.0, "step": 1292 }, { "entropy": 0.5910409837961197, "epoch": 1.206252916472235, "grad_norm": 0.2743571996688843, "learning_rate": 0.0002, "loss": 0.6104, "mean_token_accuracy": 0.7493457496166229, "num_tokens": 4682862.0, "step": 1293 }, { "entropy": 0.5784618705511093, "epoch": 1.207186187587494, "grad_norm": 0.21157340705394745, "learning_rate": 0.0002, "loss": 0.5798, "mean_token_accuracy": 0.7685439586639404, "num_tokens": 4686579.0, "step": 1294 }, { "entropy": 0.6184923052787781, "epoch": 1.208119458702753, "grad_norm": 0.22861535847187042, "learning_rate": 0.0002, "loss": 0.6179, "mean_token_accuracy": 0.7456774860620499, "num_tokens": 4690210.0, "step": 1295 }, { "entropy": 0.5942887961864471, "epoch": 1.209052729818012, "grad_norm": 0.2835206985473633, "learning_rate": 0.0002, "loss": 0.6021, "mean_token_accuracy": 0.7511053085327148, "num_tokens": 4693771.0, "step": 1296 }, { "entropy": 0.5911070257425308, "epoch": 1.209986000933271, "grad_norm": 0.24853025376796722, "learning_rate": 0.0002, "loss": 0.5887, "mean_token_accuracy": 0.758936196565628, "num_tokens": 4697533.0, "step": 1297 }, { "entropy": 0.5923463851213455, "epoch": 1.21091927204853, "grad_norm": 0.2603748142719269, "learning_rate": 0.0002, "loss": 0.602, "mean_token_accuracy": 0.7539932876825333, "num_tokens": 4701163.0, "step": 1298 }, { "entropy": 0.5674715936183929, "epoch": 1.211852543163789, "grad_norm": 0.28158238530158997, "learning_rate": 0.0002, "loss": 0.5851, "mean_token_accuracy": 0.7637924551963806, "num_tokens": 4704673.0, "step": 1299 }, { "entropy": 0.6039640009403229, "epoch": 1.212785814279048, "grad_norm": 0.31835657358169556, "learning_rate": 0.0002, "loss": 0.6236, "mean_token_accuracy": 0.7437068372964859, "num_tokens": 4708209.0, "step": 1300 }, { "entropy": 0.6296544373035431, "epoch": 1.213719085394307, "grad_norm": 0.29044243693351746, "learning_rate": 0.0002, "loss": 0.6281, "mean_token_accuracy": 0.7449681758880615, "num_tokens": 4711857.0, "step": 1301 }, { "entropy": 0.609091728925705, "epoch": 1.214652356509566, "grad_norm": 0.23809657990932465, "learning_rate": 0.0002, "loss": 0.6112, "mean_token_accuracy": 0.7488805055618286, "num_tokens": 4715410.0, "step": 1302 }, { "entropy": 0.619204506278038, "epoch": 1.215585627624825, "grad_norm": 0.22200249135494232, "learning_rate": 0.0002, "loss": 0.6184, "mean_token_accuracy": 0.7454660385847092, "num_tokens": 4719011.0, "step": 1303 }, { "entropy": 0.6150100082159042, "epoch": 1.216518898740084, "grad_norm": 0.222891166806221, "learning_rate": 0.0002, "loss": 0.6066, "mean_token_accuracy": 0.7495020776987076, "num_tokens": 4722561.0, "step": 1304 }, { "entropy": 0.5950970351696014, "epoch": 1.2174521698553429, "grad_norm": 0.22319164872169495, "learning_rate": 0.0002, "loss": 0.5872, "mean_token_accuracy": 0.7598423659801483, "num_tokens": 4726108.0, "step": 1305 }, { "entropy": 0.6266640722751617, "epoch": 1.2183854409706019, "grad_norm": 0.23471617698669434, "learning_rate": 0.0002, "loss": 0.6194, "mean_token_accuracy": 0.7426328361034393, "num_tokens": 4729656.0, "step": 1306 }, { "entropy": 0.6176575571298599, "epoch": 1.2193187120858608, "grad_norm": 0.2589946985244751, "learning_rate": 0.0002, "loss": 0.6236, "mean_token_accuracy": 0.7520790249109268, "num_tokens": 4733290.0, "step": 1307 }, { "entropy": 0.5862959772348404, "epoch": 1.2202519832011198, "grad_norm": 0.2882247865200043, "learning_rate": 0.0002, "loss": 0.5776, "mean_token_accuracy": 0.7659910768270493, "num_tokens": 4736895.0, "step": 1308 }, { "entropy": 0.6325369328260422, "epoch": 1.2211852543163788, "grad_norm": 0.2692652642726898, "learning_rate": 0.0002, "loss": 0.6415, "mean_token_accuracy": 0.7401176542043686, "num_tokens": 4740520.0, "step": 1309 }, { "entropy": 0.5870164632797241, "epoch": 1.2221185254316378, "grad_norm": 0.29304608702659607, "learning_rate": 0.0002, "loss": 0.6156, "mean_token_accuracy": 0.7431651502847672, "num_tokens": 4744167.0, "step": 1310 }, { "entropy": 0.5817097723484039, "epoch": 1.2230517965468968, "grad_norm": 0.27071142196655273, "learning_rate": 0.0002, "loss": 0.5882, "mean_token_accuracy": 0.7645485699176788, "num_tokens": 4747856.0, "step": 1311 }, { "entropy": 0.5896659791469574, "epoch": 1.2239850676621558, "grad_norm": 0.25581127405166626, "learning_rate": 0.0002, "loss": 0.6018, "mean_token_accuracy": 0.7603296190500259, "num_tokens": 4751528.0, "step": 1312 }, { "entropy": 0.6110672056674957, "epoch": 1.2249183387774147, "grad_norm": 0.3004114627838135, "learning_rate": 0.0002, "loss": 0.626, "mean_token_accuracy": 0.7431618273258209, "num_tokens": 4755182.0, "step": 1313 }, { "entropy": 0.607249304652214, "epoch": 1.2258516098926737, "grad_norm": 0.272165983915329, "learning_rate": 0.0002, "loss": 0.6065, "mean_token_accuracy": 0.7512993812561035, "num_tokens": 4758814.0, "step": 1314 }, { "entropy": 0.6503890007734299, "epoch": 1.2267848810079327, "grad_norm": 0.22312253713607788, "learning_rate": 0.0002, "loss": 0.6449, "mean_token_accuracy": 0.7459966987371445, "num_tokens": 4762464.0, "step": 1315 }, { "entropy": 0.5676947981119156, "epoch": 1.2277181521231917, "grad_norm": 0.21129071712493896, "learning_rate": 0.0002, "loss": 0.5591, "mean_token_accuracy": 0.7689218372106552, "num_tokens": 4766142.0, "step": 1316 }, { "entropy": 0.6079443097114563, "epoch": 1.2286514232384507, "grad_norm": 0.21704770624637604, "learning_rate": 0.0002, "loss": 0.6061, "mean_token_accuracy": 0.7506141364574432, "num_tokens": 4769694.0, "step": 1317 }, { "entropy": 0.594678059220314, "epoch": 1.2295846943537097, "grad_norm": 0.24844208359718323, "learning_rate": 0.0002, "loss": 0.5833, "mean_token_accuracy": 0.7627069056034088, "num_tokens": 4773348.0, "step": 1318 }, { "entropy": 0.6064060032367706, "epoch": 1.2305179654689686, "grad_norm": 0.25089362263679504, "learning_rate": 0.0002, "loss": 0.62, "mean_token_accuracy": 0.7476353049278259, "num_tokens": 4777028.0, "step": 1319 }, { "entropy": 0.6194854378700256, "epoch": 1.2314512365842276, "grad_norm": 0.26658815145492554, "learning_rate": 0.0002, "loss": 0.6262, "mean_token_accuracy": 0.7477138489484787, "num_tokens": 4780535.0, "step": 1320 }, { "entropy": 0.5668972581624985, "epoch": 1.2323845076994866, "grad_norm": 0.22662629187107086, "learning_rate": 0.0002, "loss": 0.5744, "mean_token_accuracy": 0.7668307572603226, "num_tokens": 4784144.0, "step": 1321 }, { "entropy": 0.645233765244484, "epoch": 1.2333177788147456, "grad_norm": 0.28345733880996704, "learning_rate": 0.0002, "loss": 0.6557, "mean_token_accuracy": 0.7382364124059677, "num_tokens": 4787801.0, "step": 1322 }, { "entropy": 0.5722016841173172, "epoch": 1.2342510499300046, "grad_norm": 0.2994857132434845, "learning_rate": 0.0002, "loss": 0.5821, "mean_token_accuracy": 0.767656996846199, "num_tokens": 4791335.0, "step": 1323 }, { "entropy": 0.6057450473308563, "epoch": 1.2351843210452635, "grad_norm": 0.228093221783638, "learning_rate": 0.0002, "loss": 0.6121, "mean_token_accuracy": 0.7474103569984436, "num_tokens": 4794862.0, "step": 1324 }, { "entropy": 0.6220769286155701, "epoch": 1.2361175921605225, "grad_norm": 0.2479103058576584, "learning_rate": 0.0002, "loss": 0.6273, "mean_token_accuracy": 0.7499846518039703, "num_tokens": 4798520.0, "step": 1325 }, { "entropy": 0.6286314427852631, "epoch": 1.2370508632757815, "grad_norm": 0.3039076626300812, "learning_rate": 0.0002, "loss": 0.6438, "mean_token_accuracy": 0.7400738745927811, "num_tokens": 4802113.0, "step": 1326 }, { "entropy": 0.6036764085292816, "epoch": 1.2379841343910405, "grad_norm": 0.2783754765987396, "learning_rate": 0.0002, "loss": 0.609, "mean_token_accuracy": 0.7548053860664368, "num_tokens": 4805778.0, "step": 1327 }, { "entropy": 0.6156470328569412, "epoch": 1.2389174055062995, "grad_norm": 0.2923991084098816, "learning_rate": 0.0002, "loss": 0.6102, "mean_token_accuracy": 0.7565134763717651, "num_tokens": 4809408.0, "step": 1328 }, { "entropy": 0.6054894179105759, "epoch": 1.2398506766215585, "grad_norm": 0.2671886682510376, "learning_rate": 0.0002, "loss": 0.5958, "mean_token_accuracy": 0.7612636834383011, "num_tokens": 4812960.0, "step": 1329 }, { "entropy": 0.5931719243526459, "epoch": 1.2407839477368174, "grad_norm": 0.2715870141983032, "learning_rate": 0.0002, "loss": 0.5853, "mean_token_accuracy": 0.7588178962469101, "num_tokens": 4816520.0, "step": 1330 }, { "entropy": 0.6369378715753555, "epoch": 1.2417172188520764, "grad_norm": 0.25670501589775085, "learning_rate": 0.0002, "loss": 0.63, "mean_token_accuracy": 0.7487699389457703, "num_tokens": 4820065.0, "step": 1331 }, { "entropy": 0.6412648856639862, "epoch": 1.2426504899673354, "grad_norm": 0.27845698595046997, "learning_rate": 0.0002, "loss": 0.6413, "mean_token_accuracy": 0.7435834556818008, "num_tokens": 4823726.0, "step": 1332 }, { "entropy": 0.6154259592294693, "epoch": 1.2435837610825944, "grad_norm": 0.24896745383739471, "learning_rate": 0.0002, "loss": 0.6268, "mean_token_accuracy": 0.7448258250951767, "num_tokens": 4827355.0, "step": 1333 }, { "entropy": 0.6310400366783142, "epoch": 1.2445170321978534, "grad_norm": 0.24893765151500702, "learning_rate": 0.0002, "loss": 0.6336, "mean_token_accuracy": 0.7391531020402908, "num_tokens": 4831062.0, "step": 1334 }, { "entropy": 0.6260269731283188, "epoch": 1.2454503033131124, "grad_norm": 0.22365838289260864, "learning_rate": 0.0002, "loss": 0.6314, "mean_token_accuracy": 0.7458150684833527, "num_tokens": 4834675.0, "step": 1335 }, { "entropy": 0.5873889923095703, "epoch": 1.2463835744283713, "grad_norm": 0.2525661289691925, "learning_rate": 0.0002, "loss": 0.6065, "mean_token_accuracy": 0.7507454752922058, "num_tokens": 4838211.0, "step": 1336 }, { "entropy": 0.6106625348329544, "epoch": 1.2473168455436303, "grad_norm": 0.24998076260089874, "learning_rate": 0.0002, "loss": 0.6172, "mean_token_accuracy": 0.7500563263893127, "num_tokens": 4841873.0, "step": 1337 }, { "entropy": 0.6069313138723373, "epoch": 1.2482501166588893, "grad_norm": 0.24551032483577728, "learning_rate": 0.0002, "loss": 0.6122, "mean_token_accuracy": 0.746624693274498, "num_tokens": 4845637.0, "step": 1338 }, { "entropy": 0.6212575137615204, "epoch": 1.2491833877741483, "grad_norm": 0.22101083397865295, "learning_rate": 0.0002, "loss": 0.6231, "mean_token_accuracy": 0.7440236210823059, "num_tokens": 4849223.0, "step": 1339 }, { "entropy": 0.6495173275470734, "epoch": 1.2501166588894073, "grad_norm": 0.22471143305301666, "learning_rate": 0.0002, "loss": 0.6428, "mean_token_accuracy": 0.736904576420784, "num_tokens": 4852944.0, "step": 1340 }, { "entropy": 0.6000730097293854, "epoch": 1.2510499300046662, "grad_norm": 0.2840003967285156, "learning_rate": 0.0002, "loss": 0.6012, "mean_token_accuracy": 0.7575267106294632, "num_tokens": 4856559.0, "step": 1341 }, { "entropy": 0.6053216457366943, "epoch": 1.2519832011199252, "grad_norm": 0.28440287709236145, "learning_rate": 0.0002, "loss": 0.6024, "mean_token_accuracy": 0.7506470829248428, "num_tokens": 4860104.0, "step": 1342 }, { "entropy": 0.5951776653528214, "epoch": 1.2529164722351842, "grad_norm": 0.2982814311981201, "learning_rate": 0.0002, "loss": 0.6055, "mean_token_accuracy": 0.7534665167331696, "num_tokens": 4863804.0, "step": 1343 }, { "entropy": 0.5796527862548828, "epoch": 1.2538497433504432, "grad_norm": 0.2869037091732025, "learning_rate": 0.0002, "loss": 0.596, "mean_token_accuracy": 0.7586392611265182, "num_tokens": 4867376.0, "step": 1344 }, { "entropy": 0.5736997723579407, "epoch": 1.2547830144657022, "grad_norm": 0.23507355153560638, "learning_rate": 0.0002, "loss": 0.5844, "mean_token_accuracy": 0.7607221454381943, "num_tokens": 4870946.0, "step": 1345 }, { "entropy": 0.6236578226089478, "epoch": 1.2557162855809612, "grad_norm": 0.3164258301258087, "learning_rate": 0.0002, "loss": 0.6364, "mean_token_accuracy": 0.7431653141975403, "num_tokens": 4874502.0, "step": 1346 }, { "entropy": 0.6033287644386292, "epoch": 1.2566495566962201, "grad_norm": 0.2663683593273163, "learning_rate": 0.0002, "loss": 0.6093, "mean_token_accuracy": 0.7456085681915283, "num_tokens": 4878107.0, "step": 1347 }, { "entropy": 0.6193195879459381, "epoch": 1.2575828278114791, "grad_norm": 0.2988167703151703, "learning_rate": 0.0002, "loss": 0.6213, "mean_token_accuracy": 0.7481429427862167, "num_tokens": 4881644.0, "step": 1348 }, { "entropy": 0.6217253059148788, "epoch": 1.258516098926738, "grad_norm": 0.2297036498785019, "learning_rate": 0.0002, "loss": 0.6086, "mean_token_accuracy": 0.7466186285018921, "num_tokens": 4885321.0, "step": 1349 }, { "entropy": 0.5956785082817078, "epoch": 1.259449370041997, "grad_norm": 0.23433038592338562, "learning_rate": 0.0002, "loss": 0.5886, "mean_token_accuracy": 0.7595550417900085, "num_tokens": 4888971.0, "step": 1350 }, { "entropy": 0.6147083044052124, "epoch": 1.260382641157256, "grad_norm": 0.2729690670967102, "learning_rate": 0.0002, "loss": 0.6012, "mean_token_accuracy": 0.7523398101329803, "num_tokens": 4892638.0, "step": 1351 }, { "entropy": 0.6334916055202484, "epoch": 1.261315912272515, "grad_norm": 0.2767595648765564, "learning_rate": 0.0002, "loss": 0.6341, "mean_token_accuracy": 0.7416566759347916, "num_tokens": 4896293.0, "step": 1352 }, { "entropy": 0.5799263417720795, "epoch": 1.262249183387774, "grad_norm": 0.21896971762180328, "learning_rate": 0.0002, "loss": 0.5802, "mean_token_accuracy": 0.7635500729084015, "num_tokens": 4900038.0, "step": 1353 }, { "entropy": 0.6005065590143204, "epoch": 1.263182454503033, "grad_norm": 0.2969277501106262, "learning_rate": 0.0002, "loss": 0.6312, "mean_token_accuracy": 0.7446716278791428, "num_tokens": 4903634.0, "step": 1354 }, { "entropy": 0.5703061372041702, "epoch": 1.264115725618292, "grad_norm": 0.25475412607192993, "learning_rate": 0.0002, "loss": 0.5846, "mean_token_accuracy": 0.7622727602720261, "num_tokens": 4907221.0, "step": 1355 }, { "entropy": 0.5927886068820953, "epoch": 1.265048996733551, "grad_norm": 0.2643025815486908, "learning_rate": 0.0002, "loss": 0.599, "mean_token_accuracy": 0.7564459443092346, "num_tokens": 4910816.0, "step": 1356 }, { "entropy": 0.5999107658863068, "epoch": 1.26598226784881, "grad_norm": 0.2427753210067749, "learning_rate": 0.0002, "loss": 0.6138, "mean_token_accuracy": 0.75819331407547, "num_tokens": 4914499.0, "step": 1357 }, { "entropy": 0.5902251154184341, "epoch": 1.266915538964069, "grad_norm": 0.2308676689863205, "learning_rate": 0.0002, "loss": 0.5979, "mean_token_accuracy": 0.7622543573379517, "num_tokens": 4918105.0, "step": 1358 }, { "entropy": 0.5628859102725983, "epoch": 1.267848810079328, "grad_norm": 0.2297695130109787, "learning_rate": 0.0002, "loss": 0.5631, "mean_token_accuracy": 0.7666881084442139, "num_tokens": 4921686.0, "step": 1359 }, { "entropy": 0.6248321384191513, "epoch": 1.268782081194587, "grad_norm": 0.29966050386428833, "learning_rate": 0.0002, "loss": 0.6362, "mean_token_accuracy": 0.747243344783783, "num_tokens": 4925236.0, "step": 1360 }, { "entropy": 0.6359751224517822, "epoch": 1.269715352309846, "grad_norm": 0.22786080837249756, "learning_rate": 0.0002, "loss": 0.6298, "mean_token_accuracy": 0.7385902553796768, "num_tokens": 4929003.0, "step": 1361 }, { "entropy": 0.5921815484762192, "epoch": 1.2706486234251049, "grad_norm": 0.23958641290664673, "learning_rate": 0.0002, "loss": 0.5974, "mean_token_accuracy": 0.7519514709711075, "num_tokens": 4932679.0, "step": 1362 }, { "entropy": 0.5986232459545135, "epoch": 1.2715818945403639, "grad_norm": 0.2308715283870697, "learning_rate": 0.0002, "loss": 0.5903, "mean_token_accuracy": 0.7667548805475235, "num_tokens": 4936272.0, "step": 1363 }, { "entropy": 0.6233547627925873, "epoch": 1.2725151656556228, "grad_norm": 0.25378692150115967, "learning_rate": 0.0002, "loss": 0.6312, "mean_token_accuracy": 0.7450079470872879, "num_tokens": 4939931.0, "step": 1364 }, { "entropy": 0.618148997426033, "epoch": 1.2734484367708818, "grad_norm": 0.2468647062778473, "learning_rate": 0.0002, "loss": 0.6228, "mean_token_accuracy": 0.7520188093185425, "num_tokens": 4943494.0, "step": 1365 }, { "entropy": 0.6139204949140549, "epoch": 1.2743817078861408, "grad_norm": 0.26592856645584106, "learning_rate": 0.0002, "loss": 0.6302, "mean_token_accuracy": 0.7468750476837158, "num_tokens": 4947018.0, "step": 1366 }, { "entropy": 0.604757621884346, "epoch": 1.2753149790013998, "grad_norm": 0.285571426153183, "learning_rate": 0.0002, "loss": 0.6246, "mean_token_accuracy": 0.7460086047649384, "num_tokens": 4950496.0, "step": 1367 }, { "entropy": 0.6003442406654358, "epoch": 1.2762482501166588, "grad_norm": 0.23818092048168182, "learning_rate": 0.0002, "loss": 0.6018, "mean_token_accuracy": 0.7574896663427353, "num_tokens": 4954129.0, "step": 1368 }, { "entropy": 0.5853187441825867, "epoch": 1.2771815212319177, "grad_norm": 0.2196350395679474, "learning_rate": 0.0002, "loss": 0.5773, "mean_token_accuracy": 0.7699357569217682, "num_tokens": 4957703.0, "step": 1369 }, { "entropy": 0.6243634968996048, "epoch": 1.2781147923471767, "grad_norm": 0.29537707567214966, "learning_rate": 0.0002, "loss": 0.6287, "mean_token_accuracy": 0.7471366077661514, "num_tokens": 4961318.0, "step": 1370 }, { "entropy": 0.5994166284799576, "epoch": 1.2790480634624357, "grad_norm": 0.2685776948928833, "learning_rate": 0.0002, "loss": 0.604, "mean_token_accuracy": 0.758295089006424, "num_tokens": 4964981.0, "step": 1371 }, { "entropy": 0.6062564700841904, "epoch": 1.2799813345776947, "grad_norm": 0.2556458115577698, "learning_rate": 0.0002, "loss": 0.6053, "mean_token_accuracy": 0.7583215683698654, "num_tokens": 4968537.0, "step": 1372 }, { "entropy": 0.5860258638858795, "epoch": 1.2809146056929537, "grad_norm": 0.25509268045425415, "learning_rate": 0.0002, "loss": 0.5895, "mean_token_accuracy": 0.7592420130968094, "num_tokens": 4972098.0, "step": 1373 }, { "entropy": 0.572629913687706, "epoch": 1.2818478768082127, "grad_norm": 0.25292086601257324, "learning_rate": 0.0002, "loss": 0.5737, "mean_token_accuracy": 0.7649634778499603, "num_tokens": 4975617.0, "step": 1374 }, { "entropy": 0.5924038887023926, "epoch": 1.2827811479234716, "grad_norm": 0.3233717083930969, "learning_rate": 0.0002, "loss": 0.5996, "mean_token_accuracy": 0.7548478543758392, "num_tokens": 4979289.0, "step": 1375 }, { "entropy": 0.6208571493625641, "epoch": 1.2837144190387306, "grad_norm": 0.27141520380973816, "learning_rate": 0.0002, "loss": 0.6201, "mean_token_accuracy": 0.7506929486989975, "num_tokens": 4982970.0, "step": 1376 }, { "entropy": 0.5597628653049469, "epoch": 1.2846476901539896, "grad_norm": 0.26949891448020935, "learning_rate": 0.0002, "loss": 0.5838, "mean_token_accuracy": 0.7600020170211792, "num_tokens": 4986514.0, "step": 1377 }, { "entropy": 0.5989705175161362, "epoch": 1.2855809612692486, "grad_norm": 0.24762292206287384, "learning_rate": 0.0002, "loss": 0.6085, "mean_token_accuracy": 0.7548587620258331, "num_tokens": 4990104.0, "step": 1378 }, { "entropy": 0.6194029003381729, "epoch": 1.2865142323845076, "grad_norm": 0.26729294657707214, "learning_rate": 0.0002, "loss": 0.6294, "mean_token_accuracy": 0.7421314567327499, "num_tokens": 4993720.0, "step": 1379 }, { "entropy": 0.6613375097513199, "epoch": 1.2874475034997666, "grad_norm": 0.24260541796684265, "learning_rate": 0.0002, "loss": 0.6602, "mean_token_accuracy": 0.7292233407497406, "num_tokens": 4997464.0, "step": 1380 }, { "entropy": 0.6011819988489151, "epoch": 1.2883807746150255, "grad_norm": 0.22152219712734222, "learning_rate": 0.0002, "loss": 0.5978, "mean_token_accuracy": 0.7617766559123993, "num_tokens": 5001141.0, "step": 1381 }, { "entropy": 0.555761307477951, "epoch": 1.2893140457302845, "grad_norm": 0.23667804896831512, "learning_rate": 0.0002, "loss": 0.5557, "mean_token_accuracy": 0.7709829807281494, "num_tokens": 5004795.0, "step": 1382 }, { "entropy": 0.6279473453760147, "epoch": 1.2902473168455435, "grad_norm": 0.2657395005226135, "learning_rate": 0.0002, "loss": 0.6179, "mean_token_accuracy": 0.751440703868866, "num_tokens": 5008562.0, "step": 1383 }, { "entropy": 0.601669505238533, "epoch": 1.2911805879608025, "grad_norm": 0.2425934374332428, "learning_rate": 0.0002, "loss": 0.5913, "mean_token_accuracy": 0.7603422552347183, "num_tokens": 5012169.0, "step": 1384 }, { "entropy": 0.6276982575654984, "epoch": 1.2921138590760615, "grad_norm": 0.34705591201782227, "learning_rate": 0.0002, "loss": 0.6242, "mean_token_accuracy": 0.747181162238121, "num_tokens": 5015622.0, "step": 1385 }, { "entropy": 0.6385592818260193, "epoch": 1.2930471301913204, "grad_norm": 0.3052337169647217, "learning_rate": 0.0002, "loss": 0.6305, "mean_token_accuracy": 0.7457159906625748, "num_tokens": 5019371.0, "step": 1386 }, { "entropy": 0.601155012845993, "epoch": 1.2939804013065794, "grad_norm": 0.30880364775657654, "learning_rate": 0.0002, "loss": 0.6172, "mean_token_accuracy": 0.7504104524850845, "num_tokens": 5023002.0, "step": 1387 }, { "entropy": 0.6027387231588364, "epoch": 1.2949136724218384, "grad_norm": 0.26143956184387207, "learning_rate": 0.0002, "loss": 0.6118, "mean_token_accuracy": 0.7537680715322495, "num_tokens": 5026724.0, "step": 1388 }, { "entropy": 0.574920579791069, "epoch": 1.2958469435370974, "grad_norm": 0.310926228761673, "learning_rate": 0.0002, "loss": 0.5938, "mean_token_accuracy": 0.7603752017021179, "num_tokens": 5030338.0, "step": 1389 }, { "entropy": 0.6435239166021347, "epoch": 1.2967802146523564, "grad_norm": 0.2570893168449402, "learning_rate": 0.0002, "loss": 0.6312, "mean_token_accuracy": 0.7413957267999649, "num_tokens": 5033983.0, "step": 1390 }, { "entropy": 0.6456697434186935, "epoch": 1.2977134857676154, "grad_norm": 0.23716478049755096, "learning_rate": 0.0002, "loss": 0.6374, "mean_token_accuracy": 0.7411017268896103, "num_tokens": 5037778.0, "step": 1391 }, { "entropy": 0.5990906059741974, "epoch": 1.2986467568828743, "grad_norm": 0.2658984065055847, "learning_rate": 0.0002, "loss": 0.6001, "mean_token_accuracy": 0.7559030652046204, "num_tokens": 5041519.0, "step": 1392 }, { "entropy": 0.6044503301382065, "epoch": 1.2995800279981333, "grad_norm": 0.2531687617301941, "learning_rate": 0.0002, "loss": 0.6056, "mean_token_accuracy": 0.7553191781044006, "num_tokens": 5045020.0, "step": 1393 }, { "entropy": 0.5913060754537582, "epoch": 1.3005132991133923, "grad_norm": 0.24744874238967896, "learning_rate": 0.0002, "loss": 0.5939, "mean_token_accuracy": 0.7646677494049072, "num_tokens": 5048684.0, "step": 1394 }, { "entropy": 0.5821192413568497, "epoch": 1.3014465702286513, "grad_norm": 0.22306878864765167, "learning_rate": 0.0002, "loss": 0.5984, "mean_token_accuracy": 0.7604885846376419, "num_tokens": 5052426.0, "step": 1395 }, { "entropy": 0.5831283628940582, "epoch": 1.3023798413439103, "grad_norm": 0.25978440046310425, "learning_rate": 0.0002, "loss": 0.5896, "mean_token_accuracy": 0.7660894095897675, "num_tokens": 5056111.0, "step": 1396 }, { "entropy": 0.5709592252969742, "epoch": 1.3033131124591693, "grad_norm": 0.34233543276786804, "learning_rate": 0.0002, "loss": 0.5932, "mean_token_accuracy": 0.7620095014572144, "num_tokens": 5059770.0, "step": 1397 }, { "entropy": 0.6205911338329315, "epoch": 1.3042463835744282, "grad_norm": 0.25353845953941345, "learning_rate": 0.0002, "loss": 0.6308, "mean_token_accuracy": 0.7493605017662048, "num_tokens": 5063446.0, "step": 1398 }, { "entropy": 0.5895390212535858, "epoch": 1.3051796546896872, "grad_norm": 0.2285623550415039, "learning_rate": 0.0002, "loss": 0.592, "mean_token_accuracy": 0.7636985629796982, "num_tokens": 5067057.0, "step": 1399 }, { "entropy": 0.6059560775756836, "epoch": 1.3061129258049462, "grad_norm": 0.3448876440525055, "learning_rate": 0.0002, "loss": 0.6134, "mean_token_accuracy": 0.7522378116846085, "num_tokens": 5070740.0, "step": 1400 }, { "entropy": 0.5649417340755463, "epoch": 1.3070461969202052, "grad_norm": 0.2511598467826843, "learning_rate": 0.0002, "loss": 0.575, "mean_token_accuracy": 0.7646537125110626, "num_tokens": 5074269.0, "step": 1401 }, { "entropy": 0.60374516248703, "epoch": 1.3079794680354642, "grad_norm": 0.24250651895999908, "learning_rate": 0.0002, "loss": 0.6012, "mean_token_accuracy": 0.7534209191799164, "num_tokens": 5077908.0, "step": 1402 }, { "entropy": 0.5969599485397339, "epoch": 1.3089127391507231, "grad_norm": 0.23148909211158752, "learning_rate": 0.0002, "loss": 0.5936, "mean_token_accuracy": 0.7617790997028351, "num_tokens": 5081616.0, "step": 1403 }, { "entropy": 0.5904203951358795, "epoch": 1.3098460102659821, "grad_norm": 0.23871611058712006, "learning_rate": 0.0002, "loss": 0.5889, "mean_token_accuracy": 0.7563346773386002, "num_tokens": 5085107.0, "step": 1404 }, { "entropy": 0.5755064487457275, "epoch": 1.3107792813812411, "grad_norm": 0.2858751714229584, "learning_rate": 0.0002, "loss": 0.5836, "mean_token_accuracy": 0.7647160887718201, "num_tokens": 5088606.0, "step": 1405 }, { "entropy": 0.5888769775629044, "epoch": 1.3117125524965, "grad_norm": 0.21165603399276733, "learning_rate": 0.0002, "loss": 0.5759, "mean_token_accuracy": 0.7671177983283997, "num_tokens": 5092334.0, "step": 1406 }, { "entropy": 0.5861976146697998, "epoch": 1.3126458236117593, "grad_norm": 0.23040351271629333, "learning_rate": 0.0002, "loss": 0.584, "mean_token_accuracy": 0.76004658639431, "num_tokens": 5096021.0, "step": 1407 }, { "entropy": 0.5818957984447479, "epoch": 1.3135790947270183, "grad_norm": 0.24434952437877655, "learning_rate": 0.0002, "loss": 0.5913, "mean_token_accuracy": 0.7591098248958588, "num_tokens": 5099788.0, "step": 1408 }, { "entropy": 0.5918351262807846, "epoch": 1.3145123658422773, "grad_norm": 0.2918086349964142, "learning_rate": 0.0002, "loss": 0.6075, "mean_token_accuracy": 0.7585365027189255, "num_tokens": 5103417.0, "step": 1409 }, { "entropy": 0.6434693485498428, "epoch": 1.3154456369575362, "grad_norm": 0.24945560097694397, "learning_rate": 0.0002, "loss": 0.6555, "mean_token_accuracy": 0.7377463579177856, "num_tokens": 5107115.0, "step": 1410 }, { "entropy": 0.5695724338293076, "epoch": 1.3163789080727952, "grad_norm": 0.2717817723751068, "learning_rate": 0.0002, "loss": 0.5708, "mean_token_accuracy": 0.7692449390888214, "num_tokens": 5110688.0, "step": 1411 }, { "entropy": 0.5985060632228851, "epoch": 1.3173121791880542, "grad_norm": 0.2239627242088318, "learning_rate": 0.0002, "loss": 0.5866, "mean_token_accuracy": 0.7633930444717407, "num_tokens": 5114383.0, "step": 1412 }, { "entropy": 0.6374081969261169, "epoch": 1.3182454503033132, "grad_norm": 0.23924748599529266, "learning_rate": 0.0002, "loss": 0.6422, "mean_token_accuracy": 0.7446242719888687, "num_tokens": 5118077.0, "step": 1413 }, { "entropy": 0.5933541655540466, "epoch": 1.3191787214185722, "grad_norm": 0.21878893673419952, "learning_rate": 0.0002, "loss": 0.598, "mean_token_accuracy": 0.7563155144453049, "num_tokens": 5121730.0, "step": 1414 }, { "entropy": 0.5779164880514145, "epoch": 1.3201119925338312, "grad_norm": 0.2414209395647049, "learning_rate": 0.0002, "loss": 0.5739, "mean_token_accuracy": 0.7689913362264633, "num_tokens": 5125285.0, "step": 1415 }, { "entropy": 0.6063515990972519, "epoch": 1.3210452636490901, "grad_norm": 0.2754276394844055, "learning_rate": 0.0002, "loss": 0.604, "mean_token_accuracy": 0.7570479959249496, "num_tokens": 5128919.0, "step": 1416 }, { "entropy": 0.6388894319534302, "epoch": 1.3219785347643491, "grad_norm": 0.2784821689128876, "learning_rate": 0.0002, "loss": 0.6247, "mean_token_accuracy": 0.7461093068122864, "num_tokens": 5132572.0, "step": 1417 }, { "entropy": 0.6184647977352142, "epoch": 1.322911805879608, "grad_norm": 0.2652323544025421, "learning_rate": 0.0002, "loss": 0.6287, "mean_token_accuracy": 0.7442047148942947, "num_tokens": 5136189.0, "step": 1418 }, { "entropy": 0.5876024812459946, "epoch": 1.323845076994867, "grad_norm": 0.296478807926178, "learning_rate": 0.0002, "loss": 0.6088, "mean_token_accuracy": 0.7533602118492126, "num_tokens": 5139826.0, "step": 1419 }, { "entropy": 0.5869695991277695, "epoch": 1.324778348110126, "grad_norm": 0.2705935835838318, "learning_rate": 0.0002, "loss": 0.6009, "mean_token_accuracy": 0.7592797428369522, "num_tokens": 5143448.0, "step": 1420 }, { "entropy": 0.5763687640428543, "epoch": 1.325711619225385, "grad_norm": 0.21471527218818665, "learning_rate": 0.0002, "loss": 0.5754, "mean_token_accuracy": 0.7693696171045303, "num_tokens": 5147131.0, "step": 1421 }, { "entropy": 0.5773242861032486, "epoch": 1.326644890340644, "grad_norm": 0.2844087779521942, "learning_rate": 0.0002, "loss": 0.5841, "mean_token_accuracy": 0.7684360444545746, "num_tokens": 5150778.0, "step": 1422 }, { "entropy": 0.5934134274721146, "epoch": 1.327578161455903, "grad_norm": 0.25244200229644775, "learning_rate": 0.0002, "loss": 0.5936, "mean_token_accuracy": 0.753212109208107, "num_tokens": 5154463.0, "step": 1423 }, { "entropy": 0.642511174082756, "epoch": 1.328511432571162, "grad_norm": 0.28064513206481934, "learning_rate": 0.0002, "loss": 0.6526, "mean_token_accuracy": 0.7313572019338608, "num_tokens": 5157932.0, "step": 1424 }, { "entropy": 0.625408798456192, "epoch": 1.329444703686421, "grad_norm": 0.26198554039001465, "learning_rate": 0.0002, "loss": 0.6362, "mean_token_accuracy": 0.7387708127498627, "num_tokens": 5161595.0, "step": 1425 }, { "entropy": 0.6392551958560944, "epoch": 1.33037797480168, "grad_norm": 0.21588504314422607, "learning_rate": 0.0002, "loss": 0.6443, "mean_token_accuracy": 0.7360593527555466, "num_tokens": 5165421.0, "step": 1426 }, { "entropy": 0.584230937063694, "epoch": 1.331311245916939, "grad_norm": 0.22138391435146332, "learning_rate": 0.0002, "loss": 0.5819, "mean_token_accuracy": 0.7692983448505402, "num_tokens": 5169028.0, "step": 1427 }, { "entropy": 0.5510176718235016, "epoch": 1.332244517032198, "grad_norm": 0.21426406502723694, "learning_rate": 0.0002, "loss": 0.5493, "mean_token_accuracy": 0.7829608768224716, "num_tokens": 5172669.0, "step": 1428 }, { "entropy": 0.6426789909601212, "epoch": 1.333177788147457, "grad_norm": 0.3282562494277954, "learning_rate": 0.0002, "loss": 0.6536, "mean_token_accuracy": 0.7353210002183914, "num_tokens": 5176268.0, "step": 1429 }, { "entropy": 0.6044910699129105, "epoch": 1.334111059262716, "grad_norm": 0.2576674520969391, "learning_rate": 0.0002, "loss": 0.6186, "mean_token_accuracy": 0.748555064201355, "num_tokens": 5179849.0, "step": 1430 }, { "entropy": 0.6557688117027283, "epoch": 1.3350443303779749, "grad_norm": 0.24494852125644684, "learning_rate": 0.0002, "loss": 0.6562, "mean_token_accuracy": 0.7324310392141342, "num_tokens": 5183529.0, "step": 1431 }, { "entropy": 0.61728136241436, "epoch": 1.3359776014932339, "grad_norm": 0.2511427104473114, "learning_rate": 0.0002, "loss": 0.6188, "mean_token_accuracy": 0.7544118314981461, "num_tokens": 5187311.0, "step": 1432 }, { "entropy": 0.6030793786048889, "epoch": 1.3369108726084928, "grad_norm": 0.2919917702674866, "learning_rate": 0.0002, "loss": 0.6123, "mean_token_accuracy": 0.7537975460290909, "num_tokens": 5190934.0, "step": 1433 }, { "entropy": 0.6047617048025131, "epoch": 1.3378441437237518, "grad_norm": 0.2595498263835907, "learning_rate": 0.0002, "loss": 0.6146, "mean_token_accuracy": 0.7474298179149628, "num_tokens": 5194520.0, "step": 1434 }, { "entropy": 0.6034767031669617, "epoch": 1.3387774148390108, "grad_norm": 0.2888951897621155, "learning_rate": 0.0002, "loss": 0.6073, "mean_token_accuracy": 0.7585840076208115, "num_tokens": 5197982.0, "step": 1435 }, { "entropy": 0.5979668200016022, "epoch": 1.3397106859542698, "grad_norm": 0.27349111437797546, "learning_rate": 0.0002, "loss": 0.6185, "mean_token_accuracy": 0.7479134202003479, "num_tokens": 5201581.0, "step": 1436 }, { "entropy": 0.591351717710495, "epoch": 1.3406439570695288, "grad_norm": 0.34969383478164673, "learning_rate": 0.0002, "loss": 0.6026, "mean_token_accuracy": 0.7589731067419052, "num_tokens": 5205283.0, "step": 1437 }, { "entropy": 0.6121120005846024, "epoch": 1.3415772281847878, "grad_norm": 0.312870591878891, "learning_rate": 0.0002, "loss": 0.6145, "mean_token_accuracy": 0.7537252902984619, "num_tokens": 5208872.0, "step": 1438 }, { "entropy": 0.5880540907382965, "epoch": 1.3425104993000467, "grad_norm": 0.23502816259860992, "learning_rate": 0.0002, "loss": 0.5803, "mean_token_accuracy": 0.7696573734283447, "num_tokens": 5212509.0, "step": 1439 }, { "entropy": 0.6042034476995468, "epoch": 1.3434437704153057, "grad_norm": 0.23632743954658508, "learning_rate": 0.0002, "loss": 0.5918, "mean_token_accuracy": 0.7657436728477478, "num_tokens": 5216185.0, "step": 1440 }, { "entropy": 0.5776687413454056, "epoch": 1.3443770415305647, "grad_norm": 0.25354403257369995, "learning_rate": 0.0002, "loss": 0.5861, "mean_token_accuracy": 0.7652843445539474, "num_tokens": 5219682.0, "step": 1441 }, { "entropy": 0.5943739116191864, "epoch": 1.3453103126458237, "grad_norm": 0.2503204941749573, "learning_rate": 0.0002, "loss": 0.5973, "mean_token_accuracy": 0.7551778554916382, "num_tokens": 5223271.0, "step": 1442 }, { "entropy": 0.584469735622406, "epoch": 1.3462435837610827, "grad_norm": 0.2513621151447296, "learning_rate": 0.0002, "loss": 0.584, "mean_token_accuracy": 0.7622344791889191, "num_tokens": 5226872.0, "step": 1443 }, { "entropy": 0.6099960952997208, "epoch": 1.3471768548763416, "grad_norm": 0.22191612422466278, "learning_rate": 0.0002, "loss": 0.6069, "mean_token_accuracy": 0.751566931605339, "num_tokens": 5230489.0, "step": 1444 }, { "entropy": 0.5890656560659409, "epoch": 1.3481101259916006, "grad_norm": 0.27430546283721924, "learning_rate": 0.0002, "loss": 0.5909, "mean_token_accuracy": 0.7648108601570129, "num_tokens": 5234086.0, "step": 1445 }, { "entropy": 0.612069696187973, "epoch": 1.3490433971068596, "grad_norm": 0.24297268688678741, "learning_rate": 0.0002, "loss": 0.6199, "mean_token_accuracy": 0.7477314323186874, "num_tokens": 5237648.0, "step": 1446 }, { "entropy": 0.6192381083965302, "epoch": 1.3499766682221186, "grad_norm": 0.22182545065879822, "learning_rate": 0.0002, "loss": 0.6083, "mean_token_accuracy": 0.7520227432250977, "num_tokens": 5241340.0, "step": 1447 }, { "entropy": 0.5824158191680908, "epoch": 1.3509099393373776, "grad_norm": 0.2882135510444641, "learning_rate": 0.0002, "loss": 0.5975, "mean_token_accuracy": 0.7563153803348541, "num_tokens": 5244784.0, "step": 1448 }, { "entropy": 0.6345638036727905, "epoch": 1.3518432104526366, "grad_norm": 0.2594049870967865, "learning_rate": 0.0002, "loss": 0.6482, "mean_token_accuracy": 0.7347896546125412, "num_tokens": 5248475.0, "step": 1449 }, { "entropy": 0.5962653011083603, "epoch": 1.3527764815678955, "grad_norm": 0.2821407616138458, "learning_rate": 0.0002, "loss": 0.6179, "mean_token_accuracy": 0.7481476664543152, "num_tokens": 5252113.0, "step": 1450 }, { "entropy": 0.6634441763162613, "epoch": 1.3537097526831545, "grad_norm": 0.2604718804359436, "learning_rate": 0.0002, "loss": 0.6743, "mean_token_accuracy": 0.7250696420669556, "num_tokens": 5255735.0, "step": 1451 }, { "entropy": 0.6040287911891937, "epoch": 1.3546430237984135, "grad_norm": 0.34811466932296753, "learning_rate": 0.0002, "loss": 0.6102, "mean_token_accuracy": 0.7514883875846863, "num_tokens": 5259282.0, "step": 1452 }, { "entropy": 0.5774891078472137, "epoch": 1.3555762949136725, "grad_norm": 0.3228495419025421, "learning_rate": 0.0002, "loss": 0.5813, "mean_token_accuracy": 0.7610880881547928, "num_tokens": 5262702.0, "step": 1453 }, { "entropy": 0.5963091254234314, "epoch": 1.3565095660289315, "grad_norm": 0.2352723777294159, "learning_rate": 0.0002, "loss": 0.5975, "mean_token_accuracy": 0.7500572800636292, "num_tokens": 5266187.0, "step": 1454 }, { "entropy": 0.6176682561635971, "epoch": 1.3574428371441905, "grad_norm": 0.22866302728652954, "learning_rate": 0.0002, "loss": 0.5988, "mean_token_accuracy": 0.7533967792987823, "num_tokens": 5269772.0, "step": 1455 }, { "entropy": 0.6366072446107864, "epoch": 1.3583761082594494, "grad_norm": 0.27662256360054016, "learning_rate": 0.0002, "loss": 0.6295, "mean_token_accuracy": 0.7421107143163681, "num_tokens": 5273448.0, "step": 1456 }, { "entropy": 0.6482416987419128, "epoch": 1.3593093793747084, "grad_norm": 0.2546951174736023, "learning_rate": 0.0002, "loss": 0.6355, "mean_token_accuracy": 0.7473448514938354, "num_tokens": 5277068.0, "step": 1457 }, { "entropy": 0.5900776237249374, "epoch": 1.3602426504899674, "grad_norm": 0.23108838498592377, "learning_rate": 0.0002, "loss": 0.5886, "mean_token_accuracy": 0.7617122083902359, "num_tokens": 5280751.0, "step": 1458 }, { "entropy": 0.5972720682621002, "epoch": 1.3611759216052264, "grad_norm": 0.29372474551200867, "learning_rate": 0.0002, "loss": 0.598, "mean_token_accuracy": 0.7545118927955627, "num_tokens": 5284393.0, "step": 1459 }, { "entropy": 0.6018877476453781, "epoch": 1.3621091927204854, "grad_norm": 0.24987730383872986, "learning_rate": 0.0002, "loss": 0.5958, "mean_token_accuracy": 0.7576934993267059, "num_tokens": 5287990.0, "step": 1460 }, { "entropy": 0.5861396640539169, "epoch": 1.3630424638357443, "grad_norm": 0.25788864493370056, "learning_rate": 0.0002, "loss": 0.6021, "mean_token_accuracy": 0.7592917382717133, "num_tokens": 5291645.0, "step": 1461 }, { "entropy": 0.5729429572820663, "epoch": 1.3639757349510033, "grad_norm": 0.26545992493629456, "learning_rate": 0.0002, "loss": 0.5882, "mean_token_accuracy": 0.7612981796264648, "num_tokens": 5295200.0, "step": 1462 }, { "entropy": 0.6032336950302124, "epoch": 1.3649090060662623, "grad_norm": 0.32446086406707764, "learning_rate": 0.0002, "loss": 0.623, "mean_token_accuracy": 0.7496871650218964, "num_tokens": 5298688.0, "step": 1463 }, { "entropy": 0.5897957235574722, "epoch": 1.3658422771815213, "grad_norm": 0.22444744408130646, "learning_rate": 0.0002, "loss": 0.5837, "mean_token_accuracy": 0.7628576308488846, "num_tokens": 5302295.0, "step": 1464 }, { "entropy": 0.5844330489635468, "epoch": 1.3667755482967803, "grad_norm": 0.2236071527004242, "learning_rate": 0.0002, "loss": 0.5775, "mean_token_accuracy": 0.7656562924385071, "num_tokens": 5305953.0, "step": 1465 }, { "entropy": 0.6072567403316498, "epoch": 1.3677088194120393, "grad_norm": 0.23292139172554016, "learning_rate": 0.0002, "loss": 0.6, "mean_token_accuracy": 0.7563328593969345, "num_tokens": 5309724.0, "step": 1466 }, { "entropy": 0.6150154322385788, "epoch": 1.3686420905272982, "grad_norm": 0.24033494293689728, "learning_rate": 0.0002, "loss": 0.6172, "mean_token_accuracy": 0.7487253546714783, "num_tokens": 5313361.0, "step": 1467 }, { "entropy": 0.6093757003545761, "epoch": 1.3695753616425572, "grad_norm": 0.20583224296569824, "learning_rate": 0.0002, "loss": 0.6092, "mean_token_accuracy": 0.7481553554534912, "num_tokens": 5316981.0, "step": 1468 }, { "entropy": 0.580867737531662, "epoch": 1.3705086327578162, "grad_norm": 0.2614976167678833, "learning_rate": 0.0002, "loss": 0.5974, "mean_token_accuracy": 0.7589725404977798, "num_tokens": 5320575.0, "step": 1469 }, { "entropy": 0.6345946043729782, "epoch": 1.3714419038730752, "grad_norm": 0.23670613765716553, "learning_rate": 0.0002, "loss": 0.6414, "mean_token_accuracy": 0.7334587574005127, "num_tokens": 5324270.0, "step": 1470 }, { "entropy": 0.6069620698690414, "epoch": 1.3723751749883342, "grad_norm": 0.24435243010520935, "learning_rate": 0.0002, "loss": 0.6053, "mean_token_accuracy": 0.7558817863464355, "num_tokens": 5327773.0, "step": 1471 }, { "entropy": 0.614675298333168, "epoch": 1.3733084461035932, "grad_norm": 0.3189481794834137, "learning_rate": 0.0002, "loss": 0.6234, "mean_token_accuracy": 0.746653363108635, "num_tokens": 5331335.0, "step": 1472 }, { "entropy": 0.5996317714452744, "epoch": 1.3742417172188521, "grad_norm": 0.2141815572977066, "learning_rate": 0.0002, "loss": 0.5988, "mean_token_accuracy": 0.7567559033632278, "num_tokens": 5334970.0, "step": 1473 }, { "entropy": 0.5635630786418915, "epoch": 1.3751749883341111, "grad_norm": 0.2527802884578705, "learning_rate": 0.0002, "loss": 0.5726, "mean_token_accuracy": 0.7638080716133118, "num_tokens": 5338534.0, "step": 1474 }, { "entropy": 0.5985124260187149, "epoch": 1.37610825944937, "grad_norm": 0.27651578187942505, "learning_rate": 0.0002, "loss": 0.6117, "mean_token_accuracy": 0.7518664002418518, "num_tokens": 5342122.0, "step": 1475 }, { "entropy": 0.6131134331226349, "epoch": 1.377041530564629, "grad_norm": 0.2576986849308014, "learning_rate": 0.0002, "loss": 0.6291, "mean_token_accuracy": 0.7406666278839111, "num_tokens": 5345729.0, "step": 1476 }, { "entropy": 0.6338272094726562, "epoch": 1.377974801679888, "grad_norm": 0.22394312918186188, "learning_rate": 0.0002, "loss": 0.6276, "mean_token_accuracy": 0.7470467537641525, "num_tokens": 5349346.0, "step": 1477 }, { "entropy": 0.6214082092046738, "epoch": 1.378908072795147, "grad_norm": 0.21135403215885162, "learning_rate": 0.0002, "loss": 0.6153, "mean_token_accuracy": 0.7553903311491013, "num_tokens": 5353015.0, "step": 1478 }, { "entropy": 0.6119960844516754, "epoch": 1.379841343910406, "grad_norm": 0.3050529658794403, "learning_rate": 0.0002, "loss": 0.6133, "mean_token_accuracy": 0.7534425556659698, "num_tokens": 5356632.0, "step": 1479 }, { "entropy": 0.5786918699741364, "epoch": 1.380774615025665, "grad_norm": 0.23211075365543365, "learning_rate": 0.0002, "loss": 0.5827, "mean_token_accuracy": 0.7572827488183975, "num_tokens": 5360212.0, "step": 1480 }, { "entropy": 0.5696492344141006, "epoch": 1.381707886140924, "grad_norm": 0.25324690341949463, "learning_rate": 0.0002, "loss": 0.5699, "mean_token_accuracy": 0.7687506079673767, "num_tokens": 5363892.0, "step": 1481 }, { "entropy": 0.6471274048089981, "epoch": 1.382641157256183, "grad_norm": 0.2746274173259735, "learning_rate": 0.0002, "loss": 0.6513, "mean_token_accuracy": 0.7358112931251526, "num_tokens": 5367555.0, "step": 1482 }, { "entropy": 0.5948372781276703, "epoch": 1.383574428371442, "grad_norm": 0.24527744948863983, "learning_rate": 0.0002, "loss": 0.605, "mean_token_accuracy": 0.7536090016365051, "num_tokens": 5371348.0, "step": 1483 }, { "entropy": 0.57411028444767, "epoch": 1.384507699486701, "grad_norm": 0.3363211452960968, "learning_rate": 0.0002, "loss": 0.5881, "mean_token_accuracy": 0.760308712720871, "num_tokens": 5374891.0, "step": 1484 }, { "entropy": 0.6206023991107941, "epoch": 1.38544097060196, "grad_norm": 0.25268274545669556, "learning_rate": 0.0002, "loss": 0.6345, "mean_token_accuracy": 0.7414100468158722, "num_tokens": 5378542.0, "step": 1485 }, { "entropy": 0.6227917075157166, "epoch": 1.386374241717219, "grad_norm": 0.23966084420681, "learning_rate": 0.0002, "loss": 0.6217, "mean_token_accuracy": 0.7541338950395584, "num_tokens": 5382139.0, "step": 1486 }, { "entropy": 0.5805439203977585, "epoch": 1.3873075128324779, "grad_norm": 0.26723015308380127, "learning_rate": 0.0002, "loss": 0.596, "mean_token_accuracy": 0.7563973814249039, "num_tokens": 5385768.0, "step": 1487 }, { "entropy": 0.6423561722040176, "epoch": 1.3882407839477369, "grad_norm": 0.28425589203834534, "learning_rate": 0.0002, "loss": 0.6473, "mean_token_accuracy": 0.7331940829753876, "num_tokens": 5389514.0, "step": 1488 }, { "entropy": 0.6032439321279526, "epoch": 1.3891740550629958, "grad_norm": 0.2581166625022888, "learning_rate": 0.0002, "loss": 0.609, "mean_token_accuracy": 0.7586871087551117, "num_tokens": 5393139.0, "step": 1489 }, { "entropy": 0.5989155620336533, "epoch": 1.3901073261782548, "grad_norm": 0.24677658081054688, "learning_rate": 0.0002, "loss": 0.6001, "mean_token_accuracy": 0.7555549442768097, "num_tokens": 5396700.0, "step": 1490 }, { "entropy": 0.5906091779470444, "epoch": 1.3910405972935138, "grad_norm": 0.22982022166252136, "learning_rate": 0.0002, "loss": 0.5839, "mean_token_accuracy": 0.7636927217245102, "num_tokens": 5400274.0, "step": 1491 }, { "entropy": 0.6626580059528351, "epoch": 1.3919738684087728, "grad_norm": 0.27451133728027344, "learning_rate": 0.0002, "loss": 0.6659, "mean_token_accuracy": 0.727195143699646, "num_tokens": 5403880.0, "step": 1492 }, { "entropy": 0.6387646049261093, "epoch": 1.3929071395240318, "grad_norm": 0.2101600617170334, "learning_rate": 0.0002, "loss": 0.6257, "mean_token_accuracy": 0.7472149431705475, "num_tokens": 5407600.0, "step": 1493 }, { "entropy": 0.603633388876915, "epoch": 1.3938404106392908, "grad_norm": 0.21933919191360474, "learning_rate": 0.0002, "loss": 0.5967, "mean_token_accuracy": 0.7537377923727036, "num_tokens": 5411303.0, "step": 1494 }, { "entropy": 0.6383317857980728, "epoch": 1.3947736817545497, "grad_norm": 0.24432896077632904, "learning_rate": 0.0002, "loss": 0.6369, "mean_token_accuracy": 0.7393133044242859, "num_tokens": 5415095.0, "step": 1495 }, { "entropy": 0.6301003396511078, "epoch": 1.3957069528698087, "grad_norm": 0.23809149861335754, "learning_rate": 0.0002, "loss": 0.6382, "mean_token_accuracy": 0.7408500462770462, "num_tokens": 5418718.0, "step": 1496 }, { "entropy": 0.5611502230167389, "epoch": 1.3966402239850677, "grad_norm": 0.3036459982395172, "learning_rate": 0.0002, "loss": 0.5738, "mean_token_accuracy": 0.7660821229219437, "num_tokens": 5422296.0, "step": 1497 }, { "entropy": 0.6097014397382736, "epoch": 1.3975734951003267, "grad_norm": 0.3111914098262787, "learning_rate": 0.0002, "loss": 0.6267, "mean_token_accuracy": 0.7478307783603668, "num_tokens": 5425924.0, "step": 1498 }, { "entropy": 0.5568419173359871, "epoch": 1.3985067662155857, "grad_norm": 0.2252274602651596, "learning_rate": 0.0002, "loss": 0.5611, "mean_token_accuracy": 0.7718268781900406, "num_tokens": 5429685.0, "step": 1499 }, { "entropy": 0.5755603015422821, "epoch": 1.3994400373308447, "grad_norm": 0.2576828896999359, "learning_rate": 0.0002, "loss": 0.5892, "mean_token_accuracy": 0.7586881369352341, "num_tokens": 5433114.0, "step": 1500 }, { "entropy": 0.5912192165851593, "epoch": 1.4003733084461036, "grad_norm": 0.32065895199775696, "learning_rate": 0.0002, "loss": 0.5906, "mean_token_accuracy": 0.7569649070501328, "num_tokens": 5436671.0, "step": 1501 }, { "entropy": 0.6233372390270233, "epoch": 1.4013065795613626, "grad_norm": 0.35306423902511597, "learning_rate": 0.0002, "loss": 0.6222, "mean_token_accuracy": 0.7486381828784943, "num_tokens": 5440170.0, "step": 1502 }, { "entropy": 0.6053595393896103, "epoch": 1.4022398506766216, "grad_norm": 0.2347007840871811, "learning_rate": 0.0002, "loss": 0.6073, "mean_token_accuracy": 0.7537746876478195, "num_tokens": 5443906.0, "step": 1503 }, { "entropy": 0.5963535010814667, "epoch": 1.4031731217918806, "grad_norm": 0.32124683260917664, "learning_rate": 0.0002, "loss": 0.608, "mean_token_accuracy": 0.7487811148166656, "num_tokens": 5447477.0, "step": 1504 }, { "entropy": 0.5719363987445831, "epoch": 1.4041063929071396, "grad_norm": 0.27641627192497253, "learning_rate": 0.0002, "loss": 0.5848, "mean_token_accuracy": 0.7678062468767166, "num_tokens": 5450943.0, "step": 1505 }, { "entropy": 0.6293062418699265, "epoch": 1.4050396640223985, "grad_norm": 0.2873194217681885, "learning_rate": 0.0002, "loss": 0.6328, "mean_token_accuracy": 0.741648405790329, "num_tokens": 5454633.0, "step": 1506 }, { "entropy": 0.642852857708931, "epoch": 1.4059729351376575, "grad_norm": 0.3736438453197479, "learning_rate": 0.0002, "loss": 0.6418, "mean_token_accuracy": 0.7399380952119827, "num_tokens": 5458200.0, "step": 1507 }, { "entropy": 0.5959714651107788, "epoch": 1.4069062062529165, "grad_norm": 0.27121785283088684, "learning_rate": 0.0002, "loss": 0.6008, "mean_token_accuracy": 0.7525013536214828, "num_tokens": 5461763.0, "step": 1508 }, { "entropy": 0.6054220199584961, "epoch": 1.4078394773681755, "grad_norm": 0.4071211516857147, "learning_rate": 0.0002, "loss": 0.6043, "mean_token_accuracy": 0.7600380480289459, "num_tokens": 5465295.0, "step": 1509 }, { "entropy": 0.575075164437294, "epoch": 1.4087727484834345, "grad_norm": 0.25122731924057007, "learning_rate": 0.0002, "loss": 0.5854, "mean_token_accuracy": 0.7662502825260162, "num_tokens": 5468911.0, "step": 1510 }, { "entropy": 0.6440918147563934, "epoch": 1.4097060195986935, "grad_norm": 0.23065577447414398, "learning_rate": 0.0002, "loss": 0.6439, "mean_token_accuracy": 0.7391496300697327, "num_tokens": 5472564.0, "step": 1511 }, { "entropy": 0.6141751557588577, "epoch": 1.4106392907139524, "grad_norm": 0.3168572783470154, "learning_rate": 0.0002, "loss": 0.6281, "mean_token_accuracy": 0.7512077540159225, "num_tokens": 5476228.0, "step": 1512 }, { "entropy": 0.6187660545110703, "epoch": 1.4115725618292114, "grad_norm": 0.3200725317001343, "learning_rate": 0.0002, "loss": 0.634, "mean_token_accuracy": 0.7475645840167999, "num_tokens": 5479988.0, "step": 1513 }, { "entropy": 0.6217849403619766, "epoch": 1.4125058329444704, "grad_norm": 0.22921589016914368, "learning_rate": 0.0002, "loss": 0.6123, "mean_token_accuracy": 0.7505638599395752, "num_tokens": 5483535.0, "step": 1514 }, { "entropy": 0.5874466001987457, "epoch": 1.4134391040597294, "grad_norm": 0.2583721876144409, "learning_rate": 0.0002, "loss": 0.5847, "mean_token_accuracy": 0.7657193243503571, "num_tokens": 5487235.0, "step": 1515 }, { "entropy": 0.5972344279289246, "epoch": 1.4143723751749884, "grad_norm": 0.2565325200557709, "learning_rate": 0.0002, "loss": 0.5936, "mean_token_accuracy": 0.7580980360507965, "num_tokens": 5490869.0, "step": 1516 }, { "entropy": 0.6640100628137589, "epoch": 1.4153056462902474, "grad_norm": 0.284916490316391, "learning_rate": 0.0002, "loss": 0.6627, "mean_token_accuracy": 0.7304757833480835, "num_tokens": 5494449.0, "step": 1517 }, { "entropy": 0.6086358726024628, "epoch": 1.4162389174055063, "grad_norm": 0.29202160239219666, "learning_rate": 0.0002, "loss": 0.6062, "mean_token_accuracy": 0.7593079209327698, "num_tokens": 5498098.0, "step": 1518 }, { "entropy": 0.6038800925016403, "epoch": 1.4171721885207653, "grad_norm": 0.2796822786331177, "learning_rate": 0.0002, "loss": 0.6045, "mean_token_accuracy": 0.7550300657749176, "num_tokens": 5501761.0, "step": 1519 }, { "entropy": 0.6267329007387161, "epoch": 1.4181054596360243, "grad_norm": 0.22389787435531616, "learning_rate": 0.0002, "loss": 0.6323, "mean_token_accuracy": 0.745242714881897, "num_tokens": 5505453.0, "step": 1520 }, { "entropy": 0.5948386043310165, "epoch": 1.4190387307512833, "grad_norm": 0.2594214677810669, "learning_rate": 0.0002, "loss": 0.5948, "mean_token_accuracy": 0.7601706832647324, "num_tokens": 5509006.0, "step": 1521 }, { "entropy": 0.6084579229354858, "epoch": 1.4199720018665423, "grad_norm": 0.2843722999095917, "learning_rate": 0.0002, "loss": 0.6104, "mean_token_accuracy": 0.754422977566719, "num_tokens": 5512706.0, "step": 1522 }, { "entropy": 0.5858398079872131, "epoch": 1.4209052729818012, "grad_norm": 0.28771767020225525, "learning_rate": 0.0002, "loss": 0.588, "mean_token_accuracy": 0.7571233063936234, "num_tokens": 5516316.0, "step": 1523 }, { "entropy": 0.5639893785119057, "epoch": 1.4218385440970602, "grad_norm": 0.22662384808063507, "learning_rate": 0.0002, "loss": 0.5644, "mean_token_accuracy": 0.7682643979787827, "num_tokens": 5519941.0, "step": 1524 }, { "entropy": 0.5855547040700912, "epoch": 1.4227718152123192, "grad_norm": 0.24827685952186584, "learning_rate": 0.0002, "loss": 0.5941, "mean_token_accuracy": 0.7561760544776917, "num_tokens": 5523547.0, "step": 1525 }, { "entropy": 0.6011104732751846, "epoch": 1.4237050863275782, "grad_norm": 0.3361165225505829, "learning_rate": 0.0002, "loss": 0.6031, "mean_token_accuracy": 0.7507306486368179, "num_tokens": 5527127.0, "step": 1526 }, { "entropy": 0.6280468255281448, "epoch": 1.4246383574428372, "grad_norm": 0.22328466176986694, "learning_rate": 0.0002, "loss": 0.6279, "mean_token_accuracy": 0.7536209672689438, "num_tokens": 5530700.0, "step": 1527 }, { "entropy": 0.6149221062660217, "epoch": 1.4255716285580962, "grad_norm": 0.22472411394119263, "learning_rate": 0.0002, "loss": 0.6085, "mean_token_accuracy": 0.7529192864894867, "num_tokens": 5534318.0, "step": 1528 }, { "entropy": 0.5963147580623627, "epoch": 1.4265048996733551, "grad_norm": 0.22917044162750244, "learning_rate": 0.0002, "loss": 0.5954, "mean_token_accuracy": 0.7583369165658951, "num_tokens": 5537954.0, "step": 1529 }, { "entropy": 0.6075241267681122, "epoch": 1.4274381707886141, "grad_norm": 0.2550565004348755, "learning_rate": 0.0002, "loss": 0.6209, "mean_token_accuracy": 0.7512475848197937, "num_tokens": 5541613.0, "step": 1530 }, { "entropy": 0.5934523046016693, "epoch": 1.428371441903873, "grad_norm": 0.2658859193325043, "learning_rate": 0.0002, "loss": 0.5997, "mean_token_accuracy": 0.7558578252792358, "num_tokens": 5545299.0, "step": 1531 }, { "entropy": 0.6146905720233917, "epoch": 1.429304713019132, "grad_norm": 0.22941264510154724, "learning_rate": 0.0002, "loss": 0.6248, "mean_token_accuracy": 0.7444760650396347, "num_tokens": 5549065.0, "step": 1532 }, { "entropy": 0.5962013006210327, "epoch": 1.430237984134391, "grad_norm": 0.23732928931713104, "learning_rate": 0.0002, "loss": 0.6072, "mean_token_accuracy": 0.7529346495866776, "num_tokens": 5552562.0, "step": 1533 }, { "entropy": 0.5588367730379105, "epoch": 1.43117125524965, "grad_norm": 0.2790442407131195, "learning_rate": 0.0002, "loss": 0.5623, "mean_token_accuracy": 0.7725836783647537, "num_tokens": 5556070.0, "step": 1534 }, { "entropy": 0.6063140630722046, "epoch": 1.432104526364909, "grad_norm": 0.20497584342956543, "learning_rate": 0.0002, "loss": 0.6002, "mean_token_accuracy": 0.7617446631193161, "num_tokens": 5559841.0, "step": 1535 }, { "entropy": 0.5695931762456894, "epoch": 1.433037797480168, "grad_norm": 0.2336513102054596, "learning_rate": 0.0002, "loss": 0.5759, "mean_token_accuracy": 0.7676617354154587, "num_tokens": 5563390.0, "step": 1536 }, { "entropy": 0.6001177430152893, "epoch": 1.433971068595427, "grad_norm": 0.23905576765537262, "learning_rate": 0.0002, "loss": 0.6075, "mean_token_accuracy": 0.7495283037424088, "num_tokens": 5566937.0, "step": 1537 }, { "entropy": 0.6123689711093903, "epoch": 1.434904339710686, "grad_norm": 0.24106895923614502, "learning_rate": 0.0002, "loss": 0.6138, "mean_token_accuracy": 0.7535965293645859, "num_tokens": 5570629.0, "step": 1538 }, { "entropy": 0.6090688705444336, "epoch": 1.435837610825945, "grad_norm": 0.3039942681789398, "learning_rate": 0.0002, "loss": 0.6126, "mean_token_accuracy": 0.7472166121006012, "num_tokens": 5574340.0, "step": 1539 }, { "entropy": 0.6204144209623337, "epoch": 1.436770881941204, "grad_norm": 0.2923363745212555, "learning_rate": 0.0002, "loss": 0.6348, "mean_token_accuracy": 0.7426729947328568, "num_tokens": 5578081.0, "step": 1540 }, { "entropy": 0.6051127314567566, "epoch": 1.437704153056463, "grad_norm": 0.22805306315422058, "learning_rate": 0.0002, "loss": 0.6115, "mean_token_accuracy": 0.7512811422348022, "num_tokens": 5581695.0, "step": 1541 }, { "entropy": 0.6456767916679382, "epoch": 1.438637424171722, "grad_norm": 0.257696270942688, "learning_rate": 0.0002, "loss": 0.6332, "mean_token_accuracy": 0.7443371713161469, "num_tokens": 5585352.0, "step": 1542 }, { "entropy": 0.5889737457036972, "epoch": 1.439570695286981, "grad_norm": 0.2643260061740875, "learning_rate": 0.0002, "loss": 0.5899, "mean_token_accuracy": 0.7656252533197403, "num_tokens": 5589005.0, "step": 1543 }, { "entropy": 0.612905889749527, "epoch": 1.4405039664022399, "grad_norm": 0.24774803221225739, "learning_rate": 0.0002, "loss": 0.6084, "mean_token_accuracy": 0.7576755881309509, "num_tokens": 5592619.0, "step": 1544 }, { "entropy": 0.6308027654886246, "epoch": 1.4414372375174989, "grad_norm": 0.2869466245174408, "learning_rate": 0.0002, "loss": 0.6239, "mean_token_accuracy": 0.7493539303541183, "num_tokens": 5596325.0, "step": 1545 }, { "entropy": 0.6318563222885132, "epoch": 1.4423705086327578, "grad_norm": 0.22593405842781067, "learning_rate": 0.0002, "loss": 0.6299, "mean_token_accuracy": 0.744680717587471, "num_tokens": 5599966.0, "step": 1546 }, { "entropy": 0.574545830488205, "epoch": 1.4433037797480168, "grad_norm": 0.2683560848236084, "learning_rate": 0.0002, "loss": 0.5904, "mean_token_accuracy": 0.7574695497751236, "num_tokens": 5603515.0, "step": 1547 }, { "entropy": 0.5655121505260468, "epoch": 1.4442370508632758, "grad_norm": 0.23946626484394073, "learning_rate": 0.0002, "loss": 0.5711, "mean_token_accuracy": 0.7707047462463379, "num_tokens": 5607143.0, "step": 1548 }, { "entropy": 0.609978049993515, "epoch": 1.4451703219785348, "grad_norm": 0.27589893341064453, "learning_rate": 0.0002, "loss": 0.6331, "mean_token_accuracy": 0.7438595145940781, "num_tokens": 5610773.0, "step": 1549 }, { "entropy": 0.5812713354825974, "epoch": 1.4461035930937938, "grad_norm": 0.3069015443325043, "learning_rate": 0.0002, "loss": 0.5935, "mean_token_accuracy": 0.7591842263936996, "num_tokens": 5614345.0, "step": 1550 }, { "entropy": 0.5564647763967514, "epoch": 1.4470368642090528, "grad_norm": 0.233759805560112, "learning_rate": 0.0002, "loss": 0.5648, "mean_token_accuracy": 0.7746418118476868, "num_tokens": 5617946.0, "step": 1551 }, { "entropy": 0.6491913497447968, "epoch": 1.4479701353243117, "grad_norm": 0.24727869033813477, "learning_rate": 0.0002, "loss": 0.6604, "mean_token_accuracy": 0.7305073887109756, "num_tokens": 5621673.0, "step": 1552 }, { "entropy": 0.5833785384893417, "epoch": 1.4489034064395707, "grad_norm": 0.24159370362758636, "learning_rate": 0.0002, "loss": 0.588, "mean_token_accuracy": 0.770511120557785, "num_tokens": 5625226.0, "step": 1553 }, { "entropy": 0.5963957756757736, "epoch": 1.4498366775548297, "grad_norm": 0.2278643101453781, "learning_rate": 0.0002, "loss": 0.5949, "mean_token_accuracy": 0.7622931152582169, "num_tokens": 5628886.0, "step": 1554 }, { "entropy": 0.6392792761325836, "epoch": 1.4507699486700887, "grad_norm": 0.24378611147403717, "learning_rate": 0.0002, "loss": 0.6362, "mean_token_accuracy": 0.7446253299713135, "num_tokens": 5632591.0, "step": 1555 }, { "entropy": 0.5993954539299011, "epoch": 1.4517032197853477, "grad_norm": 0.261692613363266, "learning_rate": 0.0002, "loss": 0.5946, "mean_token_accuracy": 0.7656216621398926, "num_tokens": 5636205.0, "step": 1556 }, { "entropy": 0.5858766883611679, "epoch": 1.4526364909006066, "grad_norm": 0.2751373052597046, "learning_rate": 0.0002, "loss": 0.5978, "mean_token_accuracy": 0.7529970854520798, "num_tokens": 5639704.0, "step": 1557 }, { "entropy": 0.6276889443397522, "epoch": 1.4535697620158656, "grad_norm": 0.24428017437458038, "learning_rate": 0.0002, "loss": 0.6234, "mean_token_accuracy": 0.7466541081666946, "num_tokens": 5643282.0, "step": 1558 }, { "entropy": 0.6064343005418777, "epoch": 1.4545030331311246, "grad_norm": 0.2502093017101288, "learning_rate": 0.0002, "loss": 0.6053, "mean_token_accuracy": 0.7600931525230408, "num_tokens": 5646909.0, "step": 1559 }, { "entropy": 0.6530828475952148, "epoch": 1.4554363042463836, "grad_norm": 0.2461569756269455, "learning_rate": 0.0002, "loss": 0.6494, "mean_token_accuracy": 0.7430914342403412, "num_tokens": 5650552.0, "step": 1560 }, { "entropy": 0.5929452776908875, "epoch": 1.4563695753616426, "grad_norm": 0.24917824566364288, "learning_rate": 0.0002, "loss": 0.6029, "mean_token_accuracy": 0.7554769665002823, "num_tokens": 5654111.0, "step": 1561 }, { "entropy": 0.6173983812332153, "epoch": 1.4573028464769016, "grad_norm": 0.2971239686012268, "learning_rate": 0.0002, "loss": 0.6284, "mean_token_accuracy": 0.7443678230047226, "num_tokens": 5657770.0, "step": 1562 }, { "entropy": 0.5860996395349503, "epoch": 1.4582361175921605, "grad_norm": 0.26804208755493164, "learning_rate": 0.0002, "loss": 0.6018, "mean_token_accuracy": 0.7531860619783401, "num_tokens": 5661422.0, "step": 1563 }, { "entropy": 0.6182676404714584, "epoch": 1.4591693887074195, "grad_norm": 0.23700189590454102, "learning_rate": 0.0002, "loss": 0.6174, "mean_token_accuracy": 0.7488352060317993, "num_tokens": 5665102.0, "step": 1564 }, { "entropy": 0.644167110323906, "epoch": 1.4601026598226785, "grad_norm": 0.2853315472602844, "learning_rate": 0.0002, "loss": 0.6498, "mean_token_accuracy": 0.7332803457975388, "num_tokens": 5668734.0, "step": 1565 }, { "entropy": 0.5835207402706146, "epoch": 1.4610359309379375, "grad_norm": 0.23726992309093475, "learning_rate": 0.0002, "loss": 0.5871, "mean_token_accuracy": 0.7592703104019165, "num_tokens": 5672310.0, "step": 1566 }, { "entropy": 0.6102059781551361, "epoch": 1.4619692020531965, "grad_norm": 0.21532419323921204, "learning_rate": 0.0002, "loss": 0.6041, "mean_token_accuracy": 0.7595967054367065, "num_tokens": 5676021.0, "step": 1567 }, { "entropy": 0.6113739162683487, "epoch": 1.4629024731684555, "grad_norm": 0.22930973768234253, "learning_rate": 0.0002, "loss": 0.6077, "mean_token_accuracy": 0.7554019242525101, "num_tokens": 5679598.0, "step": 1568 }, { "entropy": 0.5866664946079254, "epoch": 1.4638357442837144, "grad_norm": 0.23355405032634735, "learning_rate": 0.0002, "loss": 0.5852, "mean_token_accuracy": 0.7587731182575226, "num_tokens": 5683156.0, "step": 1569 }, { "entropy": 0.6308477073907852, "epoch": 1.4647690153989734, "grad_norm": 0.22734558582305908, "learning_rate": 0.0002, "loss": 0.6438, "mean_token_accuracy": 0.7365361601114273, "num_tokens": 5686728.0, "step": 1570 }, { "entropy": 0.6434645652770996, "epoch": 1.4657022865142324, "grad_norm": 0.2797907888889313, "learning_rate": 0.0002, "loss": 0.6384, "mean_token_accuracy": 0.7386491596698761, "num_tokens": 5690376.0, "step": 1571 }, { "entropy": 0.6122748106718063, "epoch": 1.4666355576294914, "grad_norm": 0.224885031580925, "learning_rate": 0.0002, "loss": 0.6142, "mean_token_accuracy": 0.7478459179401398, "num_tokens": 5693991.0, "step": 1572 }, { "entropy": 0.597405880689621, "epoch": 1.4675688287447504, "grad_norm": 0.30020689964294434, "learning_rate": 0.0002, "loss": 0.6024, "mean_token_accuracy": 0.7569256722927094, "num_tokens": 5697483.0, "step": 1573 }, { "entropy": 0.6085799783468246, "epoch": 1.4685020998600093, "grad_norm": 0.3261772096157074, "learning_rate": 0.0002, "loss": 0.6295, "mean_token_accuracy": 0.7439093142747879, "num_tokens": 5701047.0, "step": 1574 }, { "entropy": 0.636113315820694, "epoch": 1.4694353709752683, "grad_norm": 0.29233798384666443, "learning_rate": 0.0002, "loss": 0.6492, "mean_token_accuracy": 0.7355693727731705, "num_tokens": 5704677.0, "step": 1575 }, { "entropy": 0.60234235227108, "epoch": 1.4703686420905273, "grad_norm": 0.28490880131721497, "learning_rate": 0.0002, "loss": 0.5988, "mean_token_accuracy": 0.7587972581386566, "num_tokens": 5708322.0, "step": 1576 }, { "entropy": 0.6066195964813232, "epoch": 1.4713019132057863, "grad_norm": 0.31988680362701416, "learning_rate": 0.0002, "loss": 0.6145, "mean_token_accuracy": 0.7450576722621918, "num_tokens": 5712043.0, "step": 1577 }, { "entropy": 0.5948356986045837, "epoch": 1.4722351843210453, "grad_norm": 0.24304728209972382, "learning_rate": 0.0002, "loss": 0.5895, "mean_token_accuracy": 0.7597002238035202, "num_tokens": 5715718.0, "step": 1578 }, { "entropy": 0.5953975021839142, "epoch": 1.4731684554363043, "grad_norm": 0.2848148047924042, "learning_rate": 0.0002, "loss": 0.608, "mean_token_accuracy": 0.7489166110754013, "num_tokens": 5719276.0, "step": 1579 }, { "entropy": 0.5852679461240768, "epoch": 1.4741017265515632, "grad_norm": 0.29567644000053406, "learning_rate": 0.0002, "loss": 0.5937, "mean_token_accuracy": 0.7604146897792816, "num_tokens": 5722826.0, "step": 1580 }, { "entropy": 0.6241034716367722, "epoch": 1.4750349976668222, "grad_norm": 0.2702333927154541, "learning_rate": 0.0002, "loss": 0.6357, "mean_token_accuracy": 0.742174431681633, "num_tokens": 5726411.0, "step": 1581 }, { "entropy": 0.5958732813596725, "epoch": 1.4759682687820812, "grad_norm": 0.27434810996055603, "learning_rate": 0.0002, "loss": 0.5895, "mean_token_accuracy": 0.7632386833429337, "num_tokens": 5730017.0, "step": 1582 }, { "entropy": 0.6229503154754639, "epoch": 1.4769015398973402, "grad_norm": 0.23403291404247284, "learning_rate": 0.0002, "loss": 0.6238, "mean_token_accuracy": 0.748606726527214, "num_tokens": 5733607.0, "step": 1583 }, { "entropy": 0.5963354408740997, "epoch": 1.4778348110125992, "grad_norm": 0.274787038564682, "learning_rate": 0.0002, "loss": 0.6072, "mean_token_accuracy": 0.7529251575469971, "num_tokens": 5737272.0, "step": 1584 }, { "entropy": 0.6107395589351654, "epoch": 1.4787680821278582, "grad_norm": 0.22734153270721436, "learning_rate": 0.0002, "loss": 0.6082, "mean_token_accuracy": 0.7521265298128128, "num_tokens": 5740969.0, "step": 1585 }, { "entropy": 0.5843441933393478, "epoch": 1.4797013532431171, "grad_norm": 0.31463590264320374, "learning_rate": 0.0002, "loss": 0.597, "mean_token_accuracy": 0.7538265138864517, "num_tokens": 5744563.0, "step": 1586 }, { "entropy": 0.6044463366270065, "epoch": 1.4806346243583761, "grad_norm": 0.24023355543613434, "learning_rate": 0.0002, "loss": 0.6044, "mean_token_accuracy": 0.7568319439888, "num_tokens": 5748090.0, "step": 1587 }, { "entropy": 0.56961490213871, "epoch": 1.481567895473635, "grad_norm": 0.26607266068458557, "learning_rate": 0.0002, "loss": 0.5734, "mean_token_accuracy": 0.7762671411037445, "num_tokens": 5751737.0, "step": 1588 }, { "entropy": 0.6109072268009186, "epoch": 1.482501166588894, "grad_norm": 0.33024054765701294, "learning_rate": 0.0002, "loss": 0.6127, "mean_token_accuracy": 0.747230589389801, "num_tokens": 5755403.0, "step": 1589 }, { "entropy": 0.6034787744283676, "epoch": 1.483434437704153, "grad_norm": 0.22890187799930573, "learning_rate": 0.0002, "loss": 0.6065, "mean_token_accuracy": 0.7501373142004013, "num_tokens": 5759060.0, "step": 1590 }, { "entropy": 0.5778242945671082, "epoch": 1.484367708819412, "grad_norm": 0.20285333693027496, "learning_rate": 0.0002, "loss": 0.5766, "mean_token_accuracy": 0.7675112634897232, "num_tokens": 5762649.0, "step": 1591 }, { "entropy": 0.6014440804719925, "epoch": 1.485300979934671, "grad_norm": 0.3051675856113434, "learning_rate": 0.0002, "loss": 0.6039, "mean_token_accuracy": 0.7484031319618225, "num_tokens": 5766226.0, "step": 1592 }, { "entropy": 0.6092644780874252, "epoch": 1.48623425104993, "grad_norm": 0.2611527740955353, "learning_rate": 0.0002, "loss": 0.6124, "mean_token_accuracy": 0.7522686123847961, "num_tokens": 5769862.0, "step": 1593 }, { "entropy": 0.6156227886676788, "epoch": 1.487167522165189, "grad_norm": 0.2720547318458557, "learning_rate": 0.0002, "loss": 0.6152, "mean_token_accuracy": 0.7480451613664627, "num_tokens": 5773435.0, "step": 1594 }, { "entropy": 0.6123298555612564, "epoch": 1.488100793280448, "grad_norm": 0.2928479015827179, "learning_rate": 0.0002, "loss": 0.6138, "mean_token_accuracy": 0.7586937993764877, "num_tokens": 5777047.0, "step": 1595 }, { "entropy": 0.6032314449548721, "epoch": 1.489034064395707, "grad_norm": 0.2416948527097702, "learning_rate": 0.0002, "loss": 0.6027, "mean_token_accuracy": 0.7515234500169754, "num_tokens": 5780751.0, "step": 1596 }, { "entropy": 0.64577516913414, "epoch": 1.489967335510966, "grad_norm": 0.2766801714897156, "learning_rate": 0.0002, "loss": 0.6376, "mean_token_accuracy": 0.7449992299079895, "num_tokens": 5784433.0, "step": 1597 }, { "entropy": 0.5561575144529343, "epoch": 1.490900606626225, "grad_norm": 0.29296883940696716, "learning_rate": 0.0002, "loss": 0.5693, "mean_token_accuracy": 0.7708267718553543, "num_tokens": 5788026.0, "step": 1598 }, { "entropy": 0.581879198551178, "epoch": 1.491833877741484, "grad_norm": 0.3401438891887665, "learning_rate": 0.0002, "loss": 0.5942, "mean_token_accuracy": 0.7649365067481995, "num_tokens": 5791757.0, "step": 1599 }, { "entropy": 0.6101785451173782, "epoch": 1.4927671488567429, "grad_norm": 0.2704155147075653, "learning_rate": 0.0002, "loss": 0.6271, "mean_token_accuracy": 0.7427130788564682, "num_tokens": 5795341.0, "step": 1600 }, { "entropy": 0.5853835195302963, "epoch": 1.4937004199720019, "grad_norm": 0.24244360625743866, "learning_rate": 0.0002, "loss": 0.5846, "mean_token_accuracy": 0.7645356059074402, "num_tokens": 5798984.0, "step": 1601 }, { "entropy": 0.5721931308507919, "epoch": 1.4946336910872609, "grad_norm": 0.2648351490497589, "learning_rate": 0.0002, "loss": 0.5621, "mean_token_accuracy": 0.7732928693294525, "num_tokens": 5802555.0, "step": 1602 }, { "entropy": 0.557863175868988, "epoch": 1.4955669622025198, "grad_norm": 0.23101264238357544, "learning_rate": 0.0002, "loss": 0.5581, "mean_token_accuracy": 0.7685393542051315, "num_tokens": 5806165.0, "step": 1603 }, { "entropy": 0.5661223232746124, "epoch": 1.4965002333177788, "grad_norm": 0.2479429841041565, "learning_rate": 0.0002, "loss": 0.5693, "mean_token_accuracy": 0.7703033685684204, "num_tokens": 5809637.0, "step": 1604 }, { "entropy": 0.6224995106458664, "epoch": 1.4974335044330378, "grad_norm": 0.25041183829307556, "learning_rate": 0.0002, "loss": 0.619, "mean_token_accuracy": 0.7531776279211044, "num_tokens": 5813313.0, "step": 1605 }, { "entropy": 0.5820220038294792, "epoch": 1.4983667755482968, "grad_norm": 0.2493238002061844, "learning_rate": 0.0002, "loss": 0.5864, "mean_token_accuracy": 0.7650808095932007, "num_tokens": 5816901.0, "step": 1606 }, { "entropy": 0.5664663687348366, "epoch": 1.4993000466635558, "grad_norm": 0.25024423003196716, "learning_rate": 0.0002, "loss": 0.5765, "mean_token_accuracy": 0.7660568654537201, "num_tokens": 5820551.0, "step": 1607 }, { "entropy": 0.6350041478872299, "epoch": 1.5002333177788147, "grad_norm": 0.26999759674072266, "learning_rate": 0.0002, "loss": 0.642, "mean_token_accuracy": 0.741390272974968, "num_tokens": 5824189.0, "step": 1608 }, { "entropy": 0.5911365300416946, "epoch": 1.5011665888940737, "grad_norm": 0.24426895380020142, "learning_rate": 0.0002, "loss": 0.5962, "mean_token_accuracy": 0.7568325102329254, "num_tokens": 5827800.0, "step": 1609 }, { "entropy": 0.5666453689336777, "epoch": 1.5020998600093327, "grad_norm": 0.2075652927160263, "learning_rate": 0.0002, "loss": 0.5624, "mean_token_accuracy": 0.7663795799016953, "num_tokens": 5831438.0, "step": 1610 }, { "entropy": 0.6104936301708221, "epoch": 1.5030331311245917, "grad_norm": 0.23286724090576172, "learning_rate": 0.0002, "loss": 0.6049, "mean_token_accuracy": 0.7527004182338715, "num_tokens": 5835113.0, "step": 1611 }, { "entropy": 0.6569793373346329, "epoch": 1.5039664022398507, "grad_norm": 0.2988138794898987, "learning_rate": 0.0002, "loss": 0.664, "mean_token_accuracy": 0.7290067821741104, "num_tokens": 5838894.0, "step": 1612 }, { "entropy": 0.6153888702392578, "epoch": 1.5048996733551097, "grad_norm": 0.27263399958610535, "learning_rate": 0.0002, "loss": 0.6182, "mean_token_accuracy": 0.7472150474786758, "num_tokens": 5842560.0, "step": 1613 }, { "entropy": 0.5875637829303741, "epoch": 1.5058329444703686, "grad_norm": 0.26663288474082947, "learning_rate": 0.0002, "loss": 0.5942, "mean_token_accuracy": 0.7632701992988586, "num_tokens": 5846136.0, "step": 1614 }, { "entropy": 0.6492624282836914, "epoch": 1.5067662155856276, "grad_norm": 0.24559682607650757, "learning_rate": 0.0002, "loss": 0.6413, "mean_token_accuracy": 0.7399996966123581, "num_tokens": 5849805.0, "step": 1615 }, { "entropy": 0.570728212594986, "epoch": 1.5076994867008866, "grad_norm": 0.23697279393672943, "learning_rate": 0.0002, "loss": 0.5753, "mean_token_accuracy": 0.7709172815084457, "num_tokens": 5853303.0, "step": 1616 }, { "entropy": 0.629894033074379, "epoch": 1.5086327578161456, "grad_norm": 0.2690035104751587, "learning_rate": 0.0002, "loss": 0.6338, "mean_token_accuracy": 0.7399402856826782, "num_tokens": 5857011.0, "step": 1617 }, { "entropy": 0.5939649045467377, "epoch": 1.5095660289314046, "grad_norm": 0.2758556306362152, "learning_rate": 0.0002, "loss": 0.6121, "mean_token_accuracy": 0.7510074973106384, "num_tokens": 5860634.0, "step": 1618 }, { "entropy": 0.6163745224475861, "epoch": 1.5104993000466636, "grad_norm": 0.2962110638618469, "learning_rate": 0.0002, "loss": 0.6218, "mean_token_accuracy": 0.7470788210630417, "num_tokens": 5864236.0, "step": 1619 }, { "entropy": 0.6301936209201813, "epoch": 1.5114325711619225, "grad_norm": 0.27924036979675293, "learning_rate": 0.0002, "loss": 0.6345, "mean_token_accuracy": 0.7387940734624863, "num_tokens": 5867984.0, "step": 1620 }, { "entropy": 0.587147980928421, "epoch": 1.5123658422771815, "grad_norm": 0.266087144613266, "learning_rate": 0.0002, "loss": 0.5925, "mean_token_accuracy": 0.7595897167921066, "num_tokens": 5871556.0, "step": 1621 }, { "entropy": 0.5919239372014999, "epoch": 1.5132991133924405, "grad_norm": 0.2696959972381592, "learning_rate": 0.0002, "loss": 0.6045, "mean_token_accuracy": 0.7545257061719894, "num_tokens": 5875162.0, "step": 1622 }, { "entropy": 0.5833220034837723, "epoch": 1.5142323845076995, "grad_norm": 0.3023752272129059, "learning_rate": 0.0002, "loss": 0.5901, "mean_token_accuracy": 0.7640846073627472, "num_tokens": 5878697.0, "step": 1623 }, { "entropy": 0.5840954333543777, "epoch": 1.5151656556229585, "grad_norm": 0.2736456096172333, "learning_rate": 0.0002, "loss": 0.5892, "mean_token_accuracy": 0.7681104689836502, "num_tokens": 5882541.0, "step": 1624 }, { "entropy": 0.5942938178777695, "epoch": 1.5160989267382174, "grad_norm": 0.2389795333147049, "learning_rate": 0.0002, "loss": 0.5934, "mean_token_accuracy": 0.763122633099556, "num_tokens": 5886180.0, "step": 1625 }, { "entropy": 0.603192538022995, "epoch": 1.5170321978534764, "grad_norm": 0.27674105763435364, "learning_rate": 0.0002, "loss": 0.622, "mean_token_accuracy": 0.7502360939979553, "num_tokens": 5889698.0, "step": 1626 }, { "entropy": 0.6247055530548096, "epoch": 1.5179654689687354, "grad_norm": 0.2561008334159851, "learning_rate": 0.0002, "loss": 0.6085, "mean_token_accuracy": 0.757381409406662, "num_tokens": 5893380.0, "step": 1627 }, { "entropy": 0.5544449985027313, "epoch": 1.5188987400839944, "grad_norm": 0.24802163243293762, "learning_rate": 0.0002, "loss": 0.5567, "mean_token_accuracy": 0.7792487740516663, "num_tokens": 5896976.0, "step": 1628 }, { "entropy": 0.5665250420570374, "epoch": 1.5198320111992534, "grad_norm": 0.2617007791996002, "learning_rate": 0.0002, "loss": 0.5741, "mean_token_accuracy": 0.7638097256422043, "num_tokens": 5900559.0, "step": 1629 }, { "entropy": 0.6005188524723053, "epoch": 1.5207652823145124, "grad_norm": 0.25969353318214417, "learning_rate": 0.0002, "loss": 0.6042, "mean_token_accuracy": 0.7544921040534973, "num_tokens": 5904167.0, "step": 1630 }, { "entropy": 0.601777583360672, "epoch": 1.5216985534297713, "grad_norm": 0.39422935247421265, "learning_rate": 0.0002, "loss": 0.627, "mean_token_accuracy": 0.749951958656311, "num_tokens": 5907759.0, "step": 1631 }, { "entropy": 0.5796855837106705, "epoch": 1.5226318245450303, "grad_norm": 0.2233860194683075, "learning_rate": 0.0002, "loss": 0.5847, "mean_token_accuracy": 0.765801340341568, "num_tokens": 5911449.0, "step": 1632 }, { "entropy": 0.6105727702379227, "epoch": 1.5235650956602893, "grad_norm": 0.2864755094051361, "learning_rate": 0.0002, "loss": 0.6122, "mean_token_accuracy": 0.7560862153768539, "num_tokens": 5915094.0, "step": 1633 }, { "entropy": 0.6314799636602402, "epoch": 1.5244983667755483, "grad_norm": 0.28365767002105713, "learning_rate": 0.0002, "loss": 0.6208, "mean_token_accuracy": 0.7530845105648041, "num_tokens": 5918739.0, "step": 1634 }, { "entropy": 0.6021855920553207, "epoch": 1.5254316378908073, "grad_norm": 0.29506587982177734, "learning_rate": 0.0002, "loss": 0.5993, "mean_token_accuracy": 0.7541551440954208, "num_tokens": 5922310.0, "step": 1635 }, { "entropy": 0.5494497567415237, "epoch": 1.5263649090060663, "grad_norm": 0.24806778132915497, "learning_rate": 0.0002, "loss": 0.5624, "mean_token_accuracy": 0.7682270407676697, "num_tokens": 5925831.0, "step": 1636 }, { "entropy": 0.5976577401161194, "epoch": 1.5272981801213252, "grad_norm": 0.27186763286590576, "learning_rate": 0.0002, "loss": 0.6082, "mean_token_accuracy": 0.751509353518486, "num_tokens": 5929348.0, "step": 1637 }, { "entropy": 0.6019576638936996, "epoch": 1.5282314512365842, "grad_norm": 0.3061120808124542, "learning_rate": 0.0002, "loss": 0.6131, "mean_token_accuracy": 0.7549310922622681, "num_tokens": 5933063.0, "step": 1638 }, { "entropy": 0.5779614746570587, "epoch": 1.5291647223518432, "grad_norm": 0.2803496718406677, "learning_rate": 0.0002, "loss": 0.5856, "mean_token_accuracy": 0.7662913203239441, "num_tokens": 5936673.0, "step": 1639 }, { "entropy": 0.6047740876674652, "epoch": 1.5300979934671022, "grad_norm": 0.2620490491390228, "learning_rate": 0.0002, "loss": 0.6038, "mean_token_accuracy": 0.7572859674692154, "num_tokens": 5940397.0, "step": 1640 }, { "entropy": 0.581456184387207, "epoch": 1.5310312645823612, "grad_norm": 0.2289317399263382, "learning_rate": 0.0002, "loss": 0.578, "mean_token_accuracy": 0.7632913142442703, "num_tokens": 5944100.0, "step": 1641 }, { "entropy": 0.604337602853775, "epoch": 1.5319645356976201, "grad_norm": 0.30274128913879395, "learning_rate": 0.0002, "loss": 0.6199, "mean_token_accuracy": 0.7452493607997894, "num_tokens": 5947625.0, "step": 1642 }, { "entropy": 0.5793634802103043, "epoch": 1.5328978068128791, "grad_norm": 0.2828543782234192, "learning_rate": 0.0002, "loss": 0.582, "mean_token_accuracy": 0.7706554532051086, "num_tokens": 5951299.0, "step": 1643 }, { "entropy": 0.6026080995798111, "epoch": 1.533831077928138, "grad_norm": 0.2552560269832611, "learning_rate": 0.0002, "loss": 0.5989, "mean_token_accuracy": 0.7606441378593445, "num_tokens": 5955074.0, "step": 1644 }, { "entropy": 0.5820570141077042, "epoch": 1.534764349043397, "grad_norm": 0.2395731806755066, "learning_rate": 0.0002, "loss": 0.5937, "mean_token_accuracy": 0.7644011676311493, "num_tokens": 5958682.0, "step": 1645 }, { "entropy": 0.6448573023080826, "epoch": 1.535697620158656, "grad_norm": 0.3230825960636139, "learning_rate": 0.0002, "loss": 0.643, "mean_token_accuracy": 0.733604371547699, "num_tokens": 5962417.0, "step": 1646 }, { "entropy": 0.6007218062877655, "epoch": 1.536630891273915, "grad_norm": 0.26162680983543396, "learning_rate": 0.0002, "loss": 0.5988, "mean_token_accuracy": 0.7570241540670395, "num_tokens": 5965951.0, "step": 1647 }, { "entropy": 0.6320108622312546, "epoch": 1.537564162389174, "grad_norm": 0.22075927257537842, "learning_rate": 0.0002, "loss": 0.6291, "mean_token_accuracy": 0.744466632604599, "num_tokens": 5969655.0, "step": 1648 }, { "entropy": 0.5747384428977966, "epoch": 1.538497433504433, "grad_norm": 0.27262240648269653, "learning_rate": 0.0002, "loss": 0.5768, "mean_token_accuracy": 0.7662787735462189, "num_tokens": 5973138.0, "step": 1649 }, { "entropy": 0.5922460705041885, "epoch": 1.539430704619692, "grad_norm": 0.23825518786907196, "learning_rate": 0.0002, "loss": 0.5893, "mean_token_accuracy": 0.760946661233902, "num_tokens": 5976711.0, "step": 1650 }, { "entropy": 0.642812505364418, "epoch": 1.540363975734951, "grad_norm": 0.28633812069892883, "learning_rate": 0.0002, "loss": 0.6432, "mean_token_accuracy": 0.7407468110322952, "num_tokens": 5980349.0, "step": 1651 }, { "entropy": 0.5726866275072098, "epoch": 1.54129724685021, "grad_norm": 0.3273249864578247, "learning_rate": 0.0002, "loss": 0.5719, "mean_token_accuracy": 0.76613949239254, "num_tokens": 5983958.0, "step": 1652 }, { "entropy": 0.5754923075437546, "epoch": 1.542230517965469, "grad_norm": 0.3079545199871063, "learning_rate": 0.0002, "loss": 0.5775, "mean_token_accuracy": 0.7656200528144836, "num_tokens": 5987754.0, "step": 1653 }, { "entropy": 0.6098000407218933, "epoch": 1.543163789080728, "grad_norm": 0.2903008759021759, "learning_rate": 0.0002, "loss": 0.625, "mean_token_accuracy": 0.7516267597675323, "num_tokens": 5991232.0, "step": 1654 }, { "entropy": 0.5923157334327698, "epoch": 1.544097060195987, "grad_norm": 0.31721508502960205, "learning_rate": 0.0002, "loss": 0.606, "mean_token_accuracy": 0.7589886635541916, "num_tokens": 5994916.0, "step": 1655 }, { "entropy": 0.5641031563282013, "epoch": 1.545030331311246, "grad_norm": 0.30900833010673523, "learning_rate": 0.0002, "loss": 0.5742, "mean_token_accuracy": 0.7679266035556793, "num_tokens": 5998499.0, "step": 1656 }, { "entropy": 0.5923379063606262, "epoch": 1.5459636024265049, "grad_norm": 0.25438740849494934, "learning_rate": 0.0002, "loss": 0.598, "mean_token_accuracy": 0.7577186226844788, "num_tokens": 6002254.0, "step": 1657 }, { "entropy": 0.5999187380075455, "epoch": 1.5468968735417639, "grad_norm": 0.3051535189151764, "learning_rate": 0.0002, "loss": 0.6092, "mean_token_accuracy": 0.7535951435565948, "num_tokens": 6005852.0, "step": 1658 }, { "entropy": 0.614093005657196, "epoch": 1.5478301446570228, "grad_norm": 0.20679669082164764, "learning_rate": 0.0002, "loss": 0.6037, "mean_token_accuracy": 0.7615060061216354, "num_tokens": 6009591.0, "step": 1659 }, { "entropy": 0.6415398567914963, "epoch": 1.5487634157722818, "grad_norm": 0.2113180160522461, "learning_rate": 0.0002, "loss": 0.6339, "mean_token_accuracy": 0.7444204837083817, "num_tokens": 6013352.0, "step": 1660 }, { "entropy": 0.5928166061639786, "epoch": 1.5496966868875408, "grad_norm": 0.2510918974876404, "learning_rate": 0.0002, "loss": 0.5953, "mean_token_accuracy": 0.7543600052595139, "num_tokens": 6016980.0, "step": 1661 }, { "entropy": 0.6050298064947128, "epoch": 1.5506299580027998, "grad_norm": 0.2638547420501709, "learning_rate": 0.0002, "loss": 0.6021, "mean_token_accuracy": 0.7541382908821106, "num_tokens": 6020609.0, "step": 1662 }, { "entropy": 0.5983502715826035, "epoch": 1.5515632291180588, "grad_norm": 0.22314782440662384, "learning_rate": 0.0002, "loss": 0.5957, "mean_token_accuracy": 0.7589359134435654, "num_tokens": 6024358.0, "step": 1663 }, { "entropy": 0.5897650048136711, "epoch": 1.5524965002333178, "grad_norm": 0.2390764057636261, "learning_rate": 0.0002, "loss": 0.5934, "mean_token_accuracy": 0.763454020023346, "num_tokens": 6027975.0, "step": 1664 }, { "entropy": 0.5613812804222107, "epoch": 1.5534297713485767, "grad_norm": 0.29529309272766113, "learning_rate": 0.0002, "loss": 0.5778, "mean_token_accuracy": 0.7650555372238159, "num_tokens": 6031418.0, "step": 1665 }, { "entropy": 0.5957460701465607, "epoch": 1.5543630424638357, "grad_norm": 0.2350597232580185, "learning_rate": 0.0002, "loss": 0.5913, "mean_token_accuracy": 0.764057457447052, "num_tokens": 6035084.0, "step": 1666 }, { "entropy": 0.6304851770401001, "epoch": 1.5552963135790947, "grad_norm": 0.2451062947511673, "learning_rate": 0.0002, "loss": 0.6353, "mean_token_accuracy": 0.7440118044614792, "num_tokens": 6038775.0, "step": 1667 }, { "entropy": 0.5927499830722809, "epoch": 1.5562295846943537, "grad_norm": 0.27219709753990173, "learning_rate": 0.0002, "loss": 0.593, "mean_token_accuracy": 0.7599086463451385, "num_tokens": 6042333.0, "step": 1668 }, { "entropy": 0.6019692569971085, "epoch": 1.5571628558096127, "grad_norm": 0.27187758684158325, "learning_rate": 0.0002, "loss": 0.6119, "mean_token_accuracy": 0.7516009360551834, "num_tokens": 6045850.0, "step": 1669 }, { "entropy": 0.5916900634765625, "epoch": 1.5580961269248716, "grad_norm": 0.2896479666233063, "learning_rate": 0.0002, "loss": 0.5946, "mean_token_accuracy": 0.7611449509859085, "num_tokens": 6049357.0, "step": 1670 }, { "entropy": 0.5800489336252213, "epoch": 1.5590293980401306, "grad_norm": 0.24173320829868317, "learning_rate": 0.0002, "loss": 0.5808, "mean_token_accuracy": 0.7618200480937958, "num_tokens": 6052947.0, "step": 1671 }, { "entropy": 0.6647946685552597, "epoch": 1.5599626691553896, "grad_norm": 0.2717168927192688, "learning_rate": 0.0002, "loss": 0.6709, "mean_token_accuracy": 0.7246333807706833, "num_tokens": 6056607.0, "step": 1672 }, { "entropy": 0.5972121208906174, "epoch": 1.5608959402706486, "grad_norm": 0.2502278685569763, "learning_rate": 0.0002, "loss": 0.5979, "mean_token_accuracy": 0.753385066986084, "num_tokens": 6060297.0, "step": 1673 }, { "entropy": 0.6021622270345688, "epoch": 1.5618292113859076, "grad_norm": 0.2296857088804245, "learning_rate": 0.0002, "loss": 0.6091, "mean_token_accuracy": 0.7510497272014618, "num_tokens": 6064001.0, "step": 1674 }, { "entropy": 0.621107667684555, "epoch": 1.5627624825011666, "grad_norm": 0.24265959858894348, "learning_rate": 0.0002, "loss": 0.6147, "mean_token_accuracy": 0.7515273839235306, "num_tokens": 6067575.0, "step": 1675 }, { "entropy": 0.6097716838121414, "epoch": 1.5636957536164255, "grad_norm": 0.24178645014762878, "learning_rate": 0.0002, "loss": 0.606, "mean_token_accuracy": 0.7542490214109421, "num_tokens": 6071129.0, "step": 1676 }, { "entropy": 0.6154325902462006, "epoch": 1.5646290247316845, "grad_norm": 0.2752450406551361, "learning_rate": 0.0002, "loss": 0.6242, "mean_token_accuracy": 0.7477815896272659, "num_tokens": 6074726.0, "step": 1677 }, { "entropy": 0.6108643859624863, "epoch": 1.5655622958469435, "grad_norm": 0.26475751399993896, "learning_rate": 0.0002, "loss": 0.6066, "mean_token_accuracy": 0.7520577907562256, "num_tokens": 6078305.0, "step": 1678 }, { "entropy": 0.6433001011610031, "epoch": 1.5664955669622025, "grad_norm": 0.3201035261154175, "learning_rate": 0.0002, "loss": 0.6538, "mean_token_accuracy": 0.738977387547493, "num_tokens": 6081859.0, "step": 1679 }, { "entropy": 0.6171122789382935, "epoch": 1.5674288380774615, "grad_norm": 0.2926573157310486, "learning_rate": 0.0002, "loss": 0.6229, "mean_token_accuracy": 0.7535630017518997, "num_tokens": 6085447.0, "step": 1680 }, { "entropy": 0.6280564367771149, "epoch": 1.5683621091927205, "grad_norm": 0.32677537202835083, "learning_rate": 0.0002, "loss": 0.6542, "mean_token_accuracy": 0.7344930917024612, "num_tokens": 6089047.0, "step": 1681 }, { "entropy": 0.622661218047142, "epoch": 1.5692953803079794, "grad_norm": 0.28301936388015747, "learning_rate": 0.0002, "loss": 0.6271, "mean_token_accuracy": 0.7472170144319534, "num_tokens": 6092739.0, "step": 1682 }, { "entropy": 0.5978922843933105, "epoch": 1.5702286514232384, "grad_norm": 0.24383443593978882, "learning_rate": 0.0002, "loss": 0.6055, "mean_token_accuracy": 0.7574408799409866, "num_tokens": 6096295.0, "step": 1683 }, { "entropy": 0.6465901583433151, "epoch": 1.5711619225384974, "grad_norm": 0.24766261875629425, "learning_rate": 0.0002, "loss": 0.6384, "mean_token_accuracy": 0.7466094344854355, "num_tokens": 6099860.0, "step": 1684 }, { "entropy": 0.6193002462387085, "epoch": 1.5720951936537564, "grad_norm": 0.23924513161182404, "learning_rate": 0.0002, "loss": 0.6145, "mean_token_accuracy": 0.7478423714637756, "num_tokens": 6103469.0, "step": 1685 }, { "entropy": 0.5835680067539215, "epoch": 1.5730284647690154, "grad_norm": 0.236978679895401, "learning_rate": 0.0002, "loss": 0.5755, "mean_token_accuracy": 0.7631948590278625, "num_tokens": 6107130.0, "step": 1686 }, { "entropy": 0.5577538460493088, "epoch": 1.5739617358842743, "grad_norm": 0.2633891999721527, "learning_rate": 0.0002, "loss": 0.5654, "mean_token_accuracy": 0.7696224600076675, "num_tokens": 6110613.0, "step": 1687 }, { "entropy": 0.6361117660999298, "epoch": 1.5748950069995333, "grad_norm": 0.23691223561763763, "learning_rate": 0.0002, "loss": 0.6265, "mean_token_accuracy": 0.7497578710317612, "num_tokens": 6114400.0, "step": 1688 }, { "entropy": 0.6091517508029938, "epoch": 1.5758282781147923, "grad_norm": 0.2697770893573761, "learning_rate": 0.0002, "loss": 0.6142, "mean_token_accuracy": 0.7483248710632324, "num_tokens": 6118017.0, "step": 1689 }, { "entropy": 0.5867620259523392, "epoch": 1.5767615492300513, "grad_norm": 0.23496975004673004, "learning_rate": 0.0002, "loss": 0.5982, "mean_token_accuracy": 0.757259264588356, "num_tokens": 6121604.0, "step": 1690 }, { "entropy": 0.5941403061151505, "epoch": 1.5776948203453103, "grad_norm": 0.24476923048496246, "learning_rate": 0.0002, "loss": 0.6129, "mean_token_accuracy": 0.7543016076087952, "num_tokens": 6125285.0, "step": 1691 }, { "entropy": 0.6317189782857895, "epoch": 1.5786280914605693, "grad_norm": 0.2561674416065216, "learning_rate": 0.0002, "loss": 0.6442, "mean_token_accuracy": 0.7441034764051437, "num_tokens": 6128934.0, "step": 1692 }, { "entropy": 0.6116095334291458, "epoch": 1.5795613625758282, "grad_norm": 0.25203216075897217, "learning_rate": 0.0002, "loss": 0.6187, "mean_token_accuracy": 0.752489447593689, "num_tokens": 6132627.0, "step": 1693 }, { "entropy": 0.6591275334358215, "epoch": 1.5804946336910872, "grad_norm": 0.25046417117118835, "learning_rate": 0.0002, "loss": 0.6547, "mean_token_accuracy": 0.7363362163305283, "num_tokens": 6136237.0, "step": 1694 }, { "entropy": 0.6074287295341492, "epoch": 1.5814279048063462, "grad_norm": 0.2586587369441986, "learning_rate": 0.0002, "loss": 0.6138, "mean_token_accuracy": 0.7546404898166656, "num_tokens": 6139750.0, "step": 1695 }, { "entropy": 0.5756990313529968, "epoch": 1.5823611759216052, "grad_norm": 0.22564104199409485, "learning_rate": 0.0002, "loss": 0.5729, "mean_token_accuracy": 0.7710815966129303, "num_tokens": 6143333.0, "step": 1696 }, { "entropy": 0.5603280514478683, "epoch": 1.5832944470368642, "grad_norm": 0.28835761547088623, "learning_rate": 0.0002, "loss": 0.5747, "mean_token_accuracy": 0.762704387307167, "num_tokens": 6146928.0, "step": 1697 }, { "entropy": 0.6106321960687637, "epoch": 1.5842277181521232, "grad_norm": 0.26158589124679565, "learning_rate": 0.0002, "loss": 0.6179, "mean_token_accuracy": 0.7477263063192368, "num_tokens": 6150542.0, "step": 1698 }, { "entropy": 0.59872767329216, "epoch": 1.5851609892673821, "grad_norm": 0.25911495089530945, "learning_rate": 0.0002, "loss": 0.6088, "mean_token_accuracy": 0.750586986541748, "num_tokens": 6154220.0, "step": 1699 }, { "entropy": 0.6275942027568817, "epoch": 1.5860942603826411, "grad_norm": 0.24690091609954834, "learning_rate": 0.0002, "loss": 0.6276, "mean_token_accuracy": 0.7401276230812073, "num_tokens": 6157886.0, "step": 1700 }, { "entropy": 0.6189593970775604, "epoch": 1.5870275314979, "grad_norm": 0.2762717008590698, "learning_rate": 0.0002, "loss": 0.6391, "mean_token_accuracy": 0.7388720810413361, "num_tokens": 6161489.0, "step": 1701 }, { "entropy": 0.5810252279043198, "epoch": 1.587960802613159, "grad_norm": 0.2783573865890503, "learning_rate": 0.0002, "loss": 0.5854, "mean_token_accuracy": 0.765655905008316, "num_tokens": 6165064.0, "step": 1702 }, { "entropy": 0.5952746868133545, "epoch": 1.588894073728418, "grad_norm": 0.32045885920524597, "learning_rate": 0.0002, "loss": 0.6106, "mean_token_accuracy": 0.7497838139533997, "num_tokens": 6168659.0, "step": 1703 }, { "entropy": 0.5995337218046188, "epoch": 1.589827344843677, "grad_norm": 0.23550806939601898, "learning_rate": 0.0002, "loss": 0.5918, "mean_token_accuracy": 0.7605825960636139, "num_tokens": 6172203.0, "step": 1704 }, { "entropy": 0.5872446745634079, "epoch": 1.590760615958936, "grad_norm": 0.2391948252916336, "learning_rate": 0.0002, "loss": 0.5873, "mean_token_accuracy": 0.7629214078187943, "num_tokens": 6175850.0, "step": 1705 }, { "entropy": 0.6539479941129684, "epoch": 1.591693887074195, "grad_norm": 0.3270823359489441, "learning_rate": 0.0002, "loss": 0.6545, "mean_token_accuracy": 0.7346877157688141, "num_tokens": 6179455.0, "step": 1706 }, { "entropy": 0.6042264401912689, "epoch": 1.592627158189454, "grad_norm": 0.26954859495162964, "learning_rate": 0.0002, "loss": 0.6142, "mean_token_accuracy": 0.7515758872032166, "num_tokens": 6183097.0, "step": 1707 }, { "entropy": 0.6061564087867737, "epoch": 1.593560429304713, "grad_norm": 0.24980756640434265, "learning_rate": 0.0002, "loss": 0.6102, "mean_token_accuracy": 0.7511590421199799, "num_tokens": 6186820.0, "step": 1708 }, { "entropy": 0.6131314337253571, "epoch": 1.594493700419972, "grad_norm": 0.23693996667861938, "learning_rate": 0.0002, "loss": 0.6247, "mean_token_accuracy": 0.7392751127481461, "num_tokens": 6190526.0, "step": 1709 }, { "entropy": 0.5996253788471222, "epoch": 1.595426971535231, "grad_norm": 0.2373073697090149, "learning_rate": 0.0002, "loss": 0.6057, "mean_token_accuracy": 0.7503921538591385, "num_tokens": 6194087.0, "step": 1710 }, { "entropy": 0.5851544141769409, "epoch": 1.59636024265049, "grad_norm": 0.24698381125926971, "learning_rate": 0.0002, "loss": 0.5903, "mean_token_accuracy": 0.7610025852918625, "num_tokens": 6197688.0, "step": 1711 }, { "entropy": 0.6314685195684433, "epoch": 1.597293513765749, "grad_norm": 0.2132321149110794, "learning_rate": 0.0002, "loss": 0.6284, "mean_token_accuracy": 0.743475466966629, "num_tokens": 6201323.0, "step": 1712 }, { "entropy": 0.6053592264652252, "epoch": 1.598226784881008, "grad_norm": 0.24593231081962585, "learning_rate": 0.0002, "loss": 0.6064, "mean_token_accuracy": 0.7537787854671478, "num_tokens": 6204855.0, "step": 1713 }, { "entropy": 0.6250728368759155, "epoch": 1.5991600559962669, "grad_norm": 0.24569877982139587, "learning_rate": 0.0002, "loss": 0.616, "mean_token_accuracy": 0.7519030719995499, "num_tokens": 6208526.0, "step": 1714 }, { "entropy": 0.5792568027973175, "epoch": 1.6000933271115259, "grad_norm": 0.24873052537441254, "learning_rate": 0.0002, "loss": 0.5752, "mean_token_accuracy": 0.7648060321807861, "num_tokens": 6212124.0, "step": 1715 }, { "entropy": 0.6202835142612457, "epoch": 1.6010265982267848, "grad_norm": 0.2686322033405304, "learning_rate": 0.0002, "loss": 0.6217, "mean_token_accuracy": 0.7502094358205795, "num_tokens": 6215840.0, "step": 1716 }, { "entropy": 0.5745758563280106, "epoch": 1.6019598693420438, "grad_norm": 0.24344250559806824, "learning_rate": 0.0002, "loss": 0.5765, "mean_token_accuracy": 0.7606489658355713, "num_tokens": 6219361.0, "step": 1717 }, { "entropy": 0.6185169517993927, "epoch": 1.6028931404573028, "grad_norm": 0.3099324107170105, "learning_rate": 0.0002, "loss": 0.63, "mean_token_accuracy": 0.7449633628129959, "num_tokens": 6222926.0, "step": 1718 }, { "entropy": 0.5925685167312622, "epoch": 1.6038264115725618, "grad_norm": 0.2755502760410309, "learning_rate": 0.0002, "loss": 0.6022, "mean_token_accuracy": 0.7539825141429901, "num_tokens": 6226545.0, "step": 1719 }, { "entropy": 0.5883228182792664, "epoch": 1.6047596826878208, "grad_norm": 0.26388269662857056, "learning_rate": 0.0002, "loss": 0.588, "mean_token_accuracy": 0.7627731710672379, "num_tokens": 6230148.0, "step": 1720 }, { "entropy": 0.5929269343614578, "epoch": 1.6056929538030797, "grad_norm": 0.25653451681137085, "learning_rate": 0.0002, "loss": 0.5965, "mean_token_accuracy": 0.7568662613630295, "num_tokens": 6233754.0, "step": 1721 }, { "entropy": 0.5936732590198517, "epoch": 1.6066262249183387, "grad_norm": 0.22710835933685303, "learning_rate": 0.0002, "loss": 0.5989, "mean_token_accuracy": 0.7575790286064148, "num_tokens": 6237442.0, "step": 1722 }, { "entropy": 0.6006222143769264, "epoch": 1.6075594960335977, "grad_norm": 0.27951937913894653, "learning_rate": 0.0002, "loss": 0.6107, "mean_token_accuracy": 0.754315197467804, "num_tokens": 6240958.0, "step": 1723 }, { "entropy": 0.5923704206943512, "epoch": 1.6084927671488567, "grad_norm": 0.2718023657798767, "learning_rate": 0.0002, "loss": 0.5915, "mean_token_accuracy": 0.7625268250703812, "num_tokens": 6244680.0, "step": 1724 }, { "entropy": 0.6281224638223648, "epoch": 1.6094260382641157, "grad_norm": 0.28808867931365967, "learning_rate": 0.0002, "loss": 0.6212, "mean_token_accuracy": 0.7522484958171844, "num_tokens": 6248254.0, "step": 1725 }, { "entropy": 0.6251838207244873, "epoch": 1.6103593093793747, "grad_norm": 0.21878591179847717, "learning_rate": 0.0002, "loss": 0.6286, "mean_token_accuracy": 0.7470429241657257, "num_tokens": 6251885.0, "step": 1726 }, { "entropy": 0.6343297511339188, "epoch": 1.6112925804946336, "grad_norm": 0.2489483505487442, "learning_rate": 0.0002, "loss": 0.6393, "mean_token_accuracy": 0.7410907000303268, "num_tokens": 6255592.0, "step": 1727 }, { "entropy": 0.5840195268392563, "epoch": 1.6122258516098926, "grad_norm": 0.32606732845306396, "learning_rate": 0.0002, "loss": 0.5903, "mean_token_accuracy": 0.7631366103887558, "num_tokens": 6259364.0, "step": 1728 }, { "entropy": 0.5991330146789551, "epoch": 1.6131591227251516, "grad_norm": 0.29756960272789, "learning_rate": 0.0002, "loss": 0.6102, "mean_token_accuracy": 0.7514402866363525, "num_tokens": 6263072.0, "step": 1729 }, { "entropy": 0.6044631600379944, "epoch": 1.6140923938404106, "grad_norm": 0.24143941700458527, "learning_rate": 0.0002, "loss": 0.5946, "mean_token_accuracy": 0.7556382119655609, "num_tokens": 6266614.0, "step": 1730 }, { "entropy": 0.6140667051076889, "epoch": 1.6150256649556696, "grad_norm": 0.24858692288398743, "learning_rate": 0.0002, "loss": 0.6134, "mean_token_accuracy": 0.7540757358074188, "num_tokens": 6270247.0, "step": 1731 }, { "entropy": 0.6453464776277542, "epoch": 1.6159589360709286, "grad_norm": 0.28917837142944336, "learning_rate": 0.0002, "loss": 0.6192, "mean_token_accuracy": 0.7446357756853104, "num_tokens": 6273850.0, "step": 1732 }, { "entropy": 0.6098129451274872, "epoch": 1.6168922071861875, "grad_norm": 0.2742806375026703, "learning_rate": 0.0002, "loss": 0.6174, "mean_token_accuracy": 0.7560674697160721, "num_tokens": 6277537.0, "step": 1733 }, { "entropy": 0.5848876386880875, "epoch": 1.6178254783014465, "grad_norm": 0.2570408880710602, "learning_rate": 0.0002, "loss": 0.5831, "mean_token_accuracy": 0.7675201147794724, "num_tokens": 6281106.0, "step": 1734 }, { "entropy": 0.6411146819591522, "epoch": 1.6187587494167055, "grad_norm": 0.3059960901737213, "learning_rate": 0.0002, "loss": 0.6548, "mean_token_accuracy": 0.7346130162477493, "num_tokens": 6284801.0, "step": 1735 }, { "entropy": 0.637573316693306, "epoch": 1.6196920205319645, "grad_norm": 0.28350746631622314, "learning_rate": 0.0002, "loss": 0.6484, "mean_token_accuracy": 0.7373316586017609, "num_tokens": 6288491.0, "step": 1736 }, { "entropy": 0.6215126067399979, "epoch": 1.6206252916472235, "grad_norm": 0.24615082144737244, "learning_rate": 0.0002, "loss": 0.6174, "mean_token_accuracy": 0.7548659294843674, "num_tokens": 6292086.0, "step": 1737 }, { "entropy": 0.6113915145397186, "epoch": 1.6215585627624824, "grad_norm": 0.22485463321208954, "learning_rate": 0.0002, "loss": 0.6198, "mean_token_accuracy": 0.7499384433031082, "num_tokens": 6295569.0, "step": 1738 }, { "entropy": 0.5971938818693161, "epoch": 1.6224918338777414, "grad_norm": 0.2375795990228653, "learning_rate": 0.0002, "loss": 0.6016, "mean_token_accuracy": 0.754962295293808, "num_tokens": 6299208.0, "step": 1739 }, { "entropy": 0.5807953476905823, "epoch": 1.6234251049930004, "grad_norm": 0.2996141016483307, "learning_rate": 0.0002, "loss": 0.5893, "mean_token_accuracy": 0.7530704736709595, "num_tokens": 6302790.0, "step": 1740 }, { "entropy": 0.6100825816392899, "epoch": 1.6243583761082594, "grad_norm": 0.22682331502437592, "learning_rate": 0.0002, "loss": 0.611, "mean_token_accuracy": 0.7575991302728653, "num_tokens": 6306456.0, "step": 1741 }, { "entropy": 0.5723076313734055, "epoch": 1.6252916472235186, "grad_norm": 0.26062726974487305, "learning_rate": 0.0002, "loss": 0.5944, "mean_token_accuracy": 0.7593650817871094, "num_tokens": 6310028.0, "step": 1742 }, { "entropy": 0.5701606720685959, "epoch": 1.6262249183387776, "grad_norm": 0.2824462056159973, "learning_rate": 0.0002, "loss": 0.5777, "mean_token_accuracy": 0.7599933296442032, "num_tokens": 6313582.0, "step": 1743 }, { "entropy": 0.5635017156600952, "epoch": 1.6271581894540366, "grad_norm": 0.2360965609550476, "learning_rate": 0.0002, "loss": 0.5725, "mean_token_accuracy": 0.7717981338500977, "num_tokens": 6317161.0, "step": 1744 }, { "entropy": 0.5951766669750214, "epoch": 1.6280914605692955, "grad_norm": 0.24986495077610016, "learning_rate": 0.0002, "loss": 0.6031, "mean_token_accuracy": 0.7572273164987564, "num_tokens": 6320782.0, "step": 1745 }, { "entropy": 0.5889915972948074, "epoch": 1.6290247316845545, "grad_norm": 0.21625325083732605, "learning_rate": 0.0002, "loss": 0.5858, "mean_token_accuracy": 0.7617516666650772, "num_tokens": 6324400.0, "step": 1746 }, { "entropy": 0.5941626280546188, "epoch": 1.6299580027998135, "grad_norm": 0.2176760733127594, "learning_rate": 0.0002, "loss": 0.5897, "mean_token_accuracy": 0.7707582265138626, "num_tokens": 6328100.0, "step": 1747 }, { "entropy": 0.5952302813529968, "epoch": 1.6308912739150725, "grad_norm": 0.2337324619293213, "learning_rate": 0.0002, "loss": 0.6121, "mean_token_accuracy": 0.7599412947893143, "num_tokens": 6331719.0, "step": 1748 }, { "entropy": 0.5968585461378098, "epoch": 1.6318245450303315, "grad_norm": 0.2398844212293625, "learning_rate": 0.0002, "loss": 0.5914, "mean_token_accuracy": 0.7556185722351074, "num_tokens": 6335367.0, "step": 1749 }, { "entropy": 0.605326920747757, "epoch": 1.6327578161455905, "grad_norm": 0.2566312253475189, "learning_rate": 0.0002, "loss": 0.6056, "mean_token_accuracy": 0.7528323382139206, "num_tokens": 6338995.0, "step": 1750 }, { "entropy": 0.6066140234470367, "epoch": 1.6336910872608494, "grad_norm": 0.23008188605308533, "learning_rate": 0.0002, "loss": 0.6027, "mean_token_accuracy": 0.757899597287178, "num_tokens": 6342629.0, "step": 1751 }, { "entropy": 0.633355975151062, "epoch": 1.6346243583761084, "grad_norm": 0.26606595516204834, "learning_rate": 0.0002, "loss": 0.644, "mean_token_accuracy": 0.7378411293029785, "num_tokens": 6346145.0, "step": 1752 }, { "entropy": 0.6250825077295303, "epoch": 1.6355576294913674, "grad_norm": 0.24602600932121277, "learning_rate": 0.0002, "loss": 0.6208, "mean_token_accuracy": 0.7491239756345749, "num_tokens": 6349837.0, "step": 1753 }, { "entropy": 0.5783654004335403, "epoch": 1.6364909006066264, "grad_norm": 0.3184244632720947, "learning_rate": 0.0002, "loss": 0.5969, "mean_token_accuracy": 0.7575833797454834, "num_tokens": 6353260.0, "step": 1754 }, { "entropy": 0.6189661622047424, "epoch": 1.6374241717218854, "grad_norm": 0.24905796349048615, "learning_rate": 0.0002, "loss": 0.6241, "mean_token_accuracy": 0.749834269285202, "num_tokens": 6356885.0, "step": 1755 }, { "entropy": 0.5982859581708908, "epoch": 1.6383574428371444, "grad_norm": 0.2681504786014557, "learning_rate": 0.0002, "loss": 0.6017, "mean_token_accuracy": 0.7502151727676392, "num_tokens": 6360505.0, "step": 1756 }, { "entropy": 0.613476350903511, "epoch": 1.6392907139524033, "grad_norm": 0.25587013363838196, "learning_rate": 0.0002, "loss": 0.6153, "mean_token_accuracy": 0.7446714043617249, "num_tokens": 6364109.0, "step": 1757 }, { "entropy": 0.6326131075620651, "epoch": 1.6402239850676623, "grad_norm": 0.22397997975349426, "learning_rate": 0.0002, "loss": 0.6316, "mean_token_accuracy": 0.7421345263719559, "num_tokens": 6367634.0, "step": 1758 }, { "entropy": 0.5775399059057236, "epoch": 1.6411572561829213, "grad_norm": 0.25395098328590393, "learning_rate": 0.0002, "loss": 0.587, "mean_token_accuracy": 0.7613372057676315, "num_tokens": 6371237.0, "step": 1759 }, { "entropy": 0.595443919301033, "epoch": 1.6420905272981803, "grad_norm": 0.2573327124118805, "learning_rate": 0.0002, "loss": 0.6003, "mean_token_accuracy": 0.7547490745782852, "num_tokens": 6374898.0, "step": 1760 }, { "entropy": 0.5906455963850021, "epoch": 1.6430237984134393, "grad_norm": 0.23302462697029114, "learning_rate": 0.0002, "loss": 0.5908, "mean_token_accuracy": 0.7669836133718491, "num_tokens": 6378530.0, "step": 1761 }, { "entropy": 0.5975710600614548, "epoch": 1.6439570695286982, "grad_norm": 0.3103943467140198, "learning_rate": 0.0002, "loss": 0.6029, "mean_token_accuracy": 0.7472264468669891, "num_tokens": 6382098.0, "step": 1762 }, { "entropy": 0.5871939063072205, "epoch": 1.6448903406439572, "grad_norm": 0.24226640164852142, "learning_rate": 0.0002, "loss": 0.5988, "mean_token_accuracy": 0.754821851849556, "num_tokens": 6385604.0, "step": 1763 }, { "entropy": 0.6073741465806961, "epoch": 1.6458236117592162, "grad_norm": 0.2547711431980133, "learning_rate": 0.0002, "loss": 0.6146, "mean_token_accuracy": 0.7491205930709839, "num_tokens": 6389264.0, "step": 1764 }, { "entropy": 0.5800888687372208, "epoch": 1.6467568828744752, "grad_norm": 0.20783933997154236, "learning_rate": 0.0002, "loss": 0.588, "mean_token_accuracy": 0.7572503685951233, "num_tokens": 6392757.0, "step": 1765 }, { "entropy": 0.6428406238555908, "epoch": 1.6476901539897342, "grad_norm": 0.2455923706293106, "learning_rate": 0.0002, "loss": 0.6463, "mean_token_accuracy": 0.7392764389514923, "num_tokens": 6396265.0, "step": 1766 }, { "entropy": 0.5872590839862823, "epoch": 1.6486234251049932, "grad_norm": 0.20542994141578674, "learning_rate": 0.0002, "loss": 0.5856, "mean_token_accuracy": 0.7597682625055313, "num_tokens": 6400016.0, "step": 1767 }, { "entropy": 0.5926589518785477, "epoch": 1.6495566962202521, "grad_norm": 0.20780988037586212, "learning_rate": 0.0002, "loss": 0.5747, "mean_token_accuracy": 0.762549489736557, "num_tokens": 6403785.0, "step": 1768 }, { "entropy": 0.577176608145237, "epoch": 1.6504899673355111, "grad_norm": 0.2582305371761322, "learning_rate": 0.0002, "loss": 0.59, "mean_token_accuracy": 0.767647311091423, "num_tokens": 6407245.0, "step": 1769 }, { "entropy": 0.6025865077972412, "epoch": 1.65142323845077, "grad_norm": 0.23107746243476868, "learning_rate": 0.0002, "loss": 0.5948, "mean_token_accuracy": 0.7528401911258698, "num_tokens": 6410922.0, "step": 1770 }, { "entropy": 0.5911220237612724, "epoch": 1.652356509566029, "grad_norm": 0.2478981465101242, "learning_rate": 0.0002, "loss": 0.6008, "mean_token_accuracy": 0.7560052871704102, "num_tokens": 6414496.0, "step": 1771 }, { "entropy": 0.5814266800880432, "epoch": 1.653289780681288, "grad_norm": 0.304634690284729, "learning_rate": 0.0002, "loss": 0.6027, "mean_token_accuracy": 0.7507881671190262, "num_tokens": 6417992.0, "step": 1772 }, { "entropy": 0.5833895057439804, "epoch": 1.654223051796547, "grad_norm": 0.2823333740234375, "learning_rate": 0.0002, "loss": 0.5938, "mean_token_accuracy": 0.760659709572792, "num_tokens": 6421570.0, "step": 1773 }, { "entropy": 0.5963637679815292, "epoch": 1.655156322911806, "grad_norm": 0.24856410920619965, "learning_rate": 0.0002, "loss": 0.5973, "mean_token_accuracy": 0.7601036131381989, "num_tokens": 6425229.0, "step": 1774 }, { "entropy": 0.5837786793708801, "epoch": 1.656089594027065, "grad_norm": 0.24376200139522552, "learning_rate": 0.0002, "loss": 0.5856, "mean_token_accuracy": 0.7622852176427841, "num_tokens": 6428861.0, "step": 1775 }, { "entropy": 0.6148248165845871, "epoch": 1.657022865142324, "grad_norm": 0.220432311296463, "learning_rate": 0.0002, "loss": 0.6086, "mean_token_accuracy": 0.753641277551651, "num_tokens": 6432464.0, "step": 1776 }, { "entropy": 0.6097028404474258, "epoch": 1.657956136257583, "grad_norm": 0.30683302879333496, "learning_rate": 0.0002, "loss": 0.611, "mean_token_accuracy": 0.7533155530691147, "num_tokens": 6436067.0, "step": 1777 }, { "entropy": 0.6341149657964706, "epoch": 1.658889407372842, "grad_norm": 0.2861899733543396, "learning_rate": 0.0002, "loss": 0.6419, "mean_token_accuracy": 0.7448538541793823, "num_tokens": 6439627.0, "step": 1778 }, { "entropy": 0.5908276736736298, "epoch": 1.659822678488101, "grad_norm": 0.26360970735549927, "learning_rate": 0.0002, "loss": 0.585, "mean_token_accuracy": 0.7634565234184265, "num_tokens": 6443268.0, "step": 1779 }, { "entropy": 0.6241216957569122, "epoch": 1.66075594960336, "grad_norm": 0.2302301824092865, "learning_rate": 0.0002, "loss": 0.6138, "mean_token_accuracy": 0.7526360750198364, "num_tokens": 6446971.0, "step": 1780 }, { "entropy": 0.5982050150632858, "epoch": 1.661689220718619, "grad_norm": 0.21418635547161102, "learning_rate": 0.0002, "loss": 0.5828, "mean_token_accuracy": 0.7668180018663406, "num_tokens": 6450639.0, "step": 1781 }, { "entropy": 0.5950793474912643, "epoch": 1.662622491833878, "grad_norm": 0.23948627710342407, "learning_rate": 0.0002, "loss": 0.6022, "mean_token_accuracy": 0.753530278801918, "num_tokens": 6454140.0, "step": 1782 }, { "entropy": 0.6468114852905273, "epoch": 1.6635557629491369, "grad_norm": 0.2485283762216568, "learning_rate": 0.0002, "loss": 0.6605, "mean_token_accuracy": 0.7326049357652664, "num_tokens": 6457639.0, "step": 1783 }, { "entropy": 0.583422839641571, "epoch": 1.6644890340643959, "grad_norm": 0.23541788756847382, "learning_rate": 0.0002, "loss": 0.5789, "mean_token_accuracy": 0.7672300487756729, "num_tokens": 6461335.0, "step": 1784 }, { "entropy": 0.6238056868314743, "epoch": 1.6654223051796548, "grad_norm": 0.2652822434902191, "learning_rate": 0.0002, "loss": 0.631, "mean_token_accuracy": 0.7421389669179916, "num_tokens": 6464996.0, "step": 1785 }, { "entropy": 0.6144469678401947, "epoch": 1.6663555762949138, "grad_norm": 0.25274503231048584, "learning_rate": 0.0002, "loss": 0.611, "mean_token_accuracy": 0.7531907111406326, "num_tokens": 6468692.0, "step": 1786 }, { "entropy": 0.5780823454260826, "epoch": 1.6672888474101728, "grad_norm": 0.2438121736049652, "learning_rate": 0.0002, "loss": 0.5765, "mean_token_accuracy": 0.7678893655538559, "num_tokens": 6472353.0, "step": 1787 }, { "entropy": 0.5673408210277557, "epoch": 1.6682221185254318, "grad_norm": 0.2161622941493988, "learning_rate": 0.0002, "loss": 0.58, "mean_token_accuracy": 0.7680238783359528, "num_tokens": 6475942.0, "step": 1788 }, { "entropy": 0.6039825677871704, "epoch": 1.6691553896406908, "grad_norm": 0.28444457054138184, "learning_rate": 0.0002, "loss": 0.6133, "mean_token_accuracy": 0.7502839118242264, "num_tokens": 6479532.0, "step": 1789 }, { "entropy": 0.572153702378273, "epoch": 1.6700886607559497, "grad_norm": 0.2526184320449829, "learning_rate": 0.0002, "loss": 0.5771, "mean_token_accuracy": 0.7701733559370041, "num_tokens": 6483174.0, "step": 1790 }, { "entropy": 0.5920291990041733, "epoch": 1.6710219318712087, "grad_norm": 0.2389027625322342, "learning_rate": 0.0002, "loss": 0.6024, "mean_token_accuracy": 0.7565938979387283, "num_tokens": 6486776.0, "step": 1791 }, { "entropy": 0.5660016238689423, "epoch": 1.6719552029864677, "grad_norm": 0.2920759320259094, "learning_rate": 0.0002, "loss": 0.5834, "mean_token_accuracy": 0.7656805366277695, "num_tokens": 6490362.0, "step": 1792 }, { "entropy": 0.5935862213373184, "epoch": 1.6728884741017267, "grad_norm": 0.3178803622722626, "learning_rate": 0.0002, "loss": 0.6023, "mean_token_accuracy": 0.758570060133934, "num_tokens": 6493857.0, "step": 1793 }, { "entropy": 0.6351021379232407, "epoch": 1.6738217452169857, "grad_norm": 0.24798421561717987, "learning_rate": 0.0002, "loss": 0.6336, "mean_token_accuracy": 0.7357656061649323, "num_tokens": 6497428.0, "step": 1794 }, { "entropy": 0.6257569938898087, "epoch": 1.6747550163322447, "grad_norm": 0.31804099678993225, "learning_rate": 0.0002, "loss": 0.6308, "mean_token_accuracy": 0.7452487796545029, "num_tokens": 6501106.0, "step": 1795 }, { "entropy": 0.6082371175289154, "epoch": 1.6756882874475036, "grad_norm": 0.25298261642456055, "learning_rate": 0.0002, "loss": 0.6152, "mean_token_accuracy": 0.7511005848646164, "num_tokens": 6504845.0, "step": 1796 }, { "entropy": 0.6106375455856323, "epoch": 1.6766215585627626, "grad_norm": 0.292245090007782, "learning_rate": 0.0002, "loss": 0.6126, "mean_token_accuracy": 0.747566431760788, "num_tokens": 6508487.0, "step": 1797 }, { "entropy": 0.6214048117399216, "epoch": 1.6775548296780216, "grad_norm": 0.23478800058364868, "learning_rate": 0.0002, "loss": 0.6177, "mean_token_accuracy": 0.7452792525291443, "num_tokens": 6512020.0, "step": 1798 }, { "entropy": 0.6249949783086777, "epoch": 1.6784881007932806, "grad_norm": 0.2670957148075104, "learning_rate": 0.0002, "loss": 0.6237, "mean_token_accuracy": 0.7481392472982407, "num_tokens": 6515535.0, "step": 1799 }, { "entropy": 0.6065542995929718, "epoch": 1.6794213719085396, "grad_norm": 0.21602845191955566, "learning_rate": 0.0002, "loss": 0.6057, "mean_token_accuracy": 0.7508950382471085, "num_tokens": 6519250.0, "step": 1800 }, { "entropy": 0.5997198224067688, "epoch": 1.6803546430237986, "grad_norm": 0.23398078978061676, "learning_rate": 0.0002, "loss": 0.6037, "mean_token_accuracy": 0.7537074089050293, "num_tokens": 6522896.0, "step": 1801 }, { "entropy": 0.6073427498340607, "epoch": 1.6812879141390575, "grad_norm": 0.22241519391536713, "learning_rate": 0.0002, "loss": 0.6033, "mean_token_accuracy": 0.7558527886867523, "num_tokens": 6526438.0, "step": 1802 }, { "entropy": 0.5800670385360718, "epoch": 1.6822211852543165, "grad_norm": 0.2431839555501938, "learning_rate": 0.0002, "loss": 0.5815, "mean_token_accuracy": 0.7650197446346283, "num_tokens": 6530059.0, "step": 1803 }, { "entropy": 0.6328914910554886, "epoch": 1.6831544563695755, "grad_norm": 0.2696601450443268, "learning_rate": 0.0002, "loss": 0.6358, "mean_token_accuracy": 0.7360139489173889, "num_tokens": 6533748.0, "step": 1804 }, { "entropy": 0.6433763355016708, "epoch": 1.6840877274848345, "grad_norm": 0.2184409648180008, "learning_rate": 0.0002, "loss": 0.6452, "mean_token_accuracy": 0.736011728644371, "num_tokens": 6537441.0, "step": 1805 }, { "entropy": 0.6146339476108551, "epoch": 1.6850209986000935, "grad_norm": 0.26649579405784607, "learning_rate": 0.0002, "loss": 0.625, "mean_token_accuracy": 0.7480151653289795, "num_tokens": 6541045.0, "step": 1806 }, { "entropy": 0.6095980405807495, "epoch": 1.6859542697153524, "grad_norm": 0.22114725410938263, "learning_rate": 0.0002, "loss": 0.6025, "mean_token_accuracy": 0.7593050599098206, "num_tokens": 6544798.0, "step": 1807 }, { "entropy": 0.6232210844755173, "epoch": 1.6868875408306114, "grad_norm": 0.22797703742980957, "learning_rate": 0.0002, "loss": 0.6203, "mean_token_accuracy": 0.743002861738205, "num_tokens": 6548305.0, "step": 1808 }, { "entropy": 0.6260858774185181, "epoch": 1.6878208119458704, "grad_norm": 0.26598915457725525, "learning_rate": 0.0002, "loss": 0.6315, "mean_token_accuracy": 0.743966355919838, "num_tokens": 6551780.0, "step": 1809 }, { "entropy": 0.57163305580616, "epoch": 1.6887540830611294, "grad_norm": 0.2347581684589386, "learning_rate": 0.0002, "loss": 0.5602, "mean_token_accuracy": 0.7694391757249832, "num_tokens": 6555332.0, "step": 1810 }, { "entropy": 0.5835154801607132, "epoch": 1.6896873541763884, "grad_norm": 0.23247112333774567, "learning_rate": 0.0002, "loss": 0.5955, "mean_token_accuracy": 0.7604409456253052, "num_tokens": 6558911.0, "step": 1811 }, { "entropy": 0.5800936818122864, "epoch": 1.6906206252916474, "grad_norm": 0.2605375349521637, "learning_rate": 0.0002, "loss": 0.5979, "mean_token_accuracy": 0.7516767233610153, "num_tokens": 6562473.0, "step": 1812 }, { "entropy": 0.5976196080446243, "epoch": 1.6915538964069063, "grad_norm": 0.24105796217918396, "learning_rate": 0.0002, "loss": 0.6138, "mean_token_accuracy": 0.7476336359977722, "num_tokens": 6566117.0, "step": 1813 }, { "entropy": 0.5865929573774338, "epoch": 1.6924871675221653, "grad_norm": 0.2131316214799881, "learning_rate": 0.0002, "loss": 0.5864, "mean_token_accuracy": 0.7617488354444504, "num_tokens": 6569786.0, "step": 1814 }, { "entropy": 0.6131063401699066, "epoch": 1.6934204386374243, "grad_norm": 0.2355186939239502, "learning_rate": 0.0002, "loss": 0.6136, "mean_token_accuracy": 0.7498774081468582, "num_tokens": 6573473.0, "step": 1815 }, { "entropy": 0.6271011978387833, "epoch": 1.6943537097526833, "grad_norm": 0.2859914004802704, "learning_rate": 0.0002, "loss": 0.6232, "mean_token_accuracy": 0.7437841147184372, "num_tokens": 6577088.0, "step": 1816 }, { "entropy": 0.6486406028270721, "epoch": 1.6952869808679423, "grad_norm": 0.28336888551712036, "learning_rate": 0.0002, "loss": 0.6535, "mean_token_accuracy": 0.7317063510417938, "num_tokens": 6580663.0, "step": 1817 }, { "entropy": 0.5922882109880447, "epoch": 1.6962202519832013, "grad_norm": 0.28702640533447266, "learning_rate": 0.0002, "loss": 0.5824, "mean_token_accuracy": 0.7603912949562073, "num_tokens": 6584402.0, "step": 1818 }, { "entropy": 0.6485768258571625, "epoch": 1.6971535230984602, "grad_norm": 0.25988319516181946, "learning_rate": 0.0002, "loss": 0.6573, "mean_token_accuracy": 0.7350210398435593, "num_tokens": 6588124.0, "step": 1819 }, { "entropy": 0.5600825995206833, "epoch": 1.6980867942137192, "grad_norm": 0.24007155001163483, "learning_rate": 0.0002, "loss": 0.5727, "mean_token_accuracy": 0.772437110543251, "num_tokens": 6591749.0, "step": 1820 }, { "entropy": 0.6161256581544876, "epoch": 1.6990200653289782, "grad_norm": 0.2989840805530548, "learning_rate": 0.0002, "loss": 0.6363, "mean_token_accuracy": 0.7406417280435562, "num_tokens": 6595309.0, "step": 1821 }, { "entropy": 0.6401363611221313, "epoch": 1.6999533364442372, "grad_norm": 0.2573033273220062, "learning_rate": 0.0002, "loss": 0.6391, "mean_token_accuracy": 0.7430246323347092, "num_tokens": 6598927.0, "step": 1822 }, { "entropy": 0.6025333404541016, "epoch": 1.7008866075594962, "grad_norm": 0.23534491658210754, "learning_rate": 0.0002, "loss": 0.6064, "mean_token_accuracy": 0.7559124082326889, "num_tokens": 6602479.0, "step": 1823 }, { "entropy": 0.5948107689619064, "epoch": 1.7018198786747551, "grad_norm": 0.21305185556411743, "learning_rate": 0.0002, "loss": 0.5905, "mean_token_accuracy": 0.762471467256546, "num_tokens": 6606092.0, "step": 1824 }, { "entropy": 0.6052270829677582, "epoch": 1.7027531497900141, "grad_norm": 0.22671648859977722, "learning_rate": 0.0002, "loss": 0.6122, "mean_token_accuracy": 0.7513532489538193, "num_tokens": 6609621.0, "step": 1825 }, { "entropy": 0.6045996695756912, "epoch": 1.7036864209052731, "grad_norm": 0.2591613531112671, "learning_rate": 0.0002, "loss": 0.6085, "mean_token_accuracy": 0.7582155913114548, "num_tokens": 6613211.0, "step": 1826 }, { "entropy": 0.6124996095895767, "epoch": 1.704619692020532, "grad_norm": 0.26866644620895386, "learning_rate": 0.0002, "loss": 0.6274, "mean_token_accuracy": 0.7425211071968079, "num_tokens": 6616814.0, "step": 1827 }, { "entropy": 0.6201707124710083, "epoch": 1.705552963135791, "grad_norm": 0.2614215016365051, "learning_rate": 0.0002, "loss": 0.6287, "mean_token_accuracy": 0.7451774328947067, "num_tokens": 6620457.0, "step": 1828 }, { "entropy": 0.6234747767448425, "epoch": 1.70648623425105, "grad_norm": 0.3328664004802704, "learning_rate": 0.0002, "loss": 0.6357, "mean_token_accuracy": 0.7413674294948578, "num_tokens": 6624126.0, "step": 1829 }, { "entropy": 0.6409410238265991, "epoch": 1.707419505366309, "grad_norm": 0.25544413924217224, "learning_rate": 0.0002, "loss": 0.6271, "mean_token_accuracy": 0.7407330125570297, "num_tokens": 6627704.0, "step": 1830 }, { "entropy": 0.6378980576992035, "epoch": 1.708352776481568, "grad_norm": 0.26018431782722473, "learning_rate": 0.0002, "loss": 0.6433, "mean_token_accuracy": 0.7368504554033279, "num_tokens": 6631348.0, "step": 1831 }, { "entropy": 0.6111174821853638, "epoch": 1.709286047596827, "grad_norm": 0.39988234639167786, "learning_rate": 0.0002, "loss": 0.6157, "mean_token_accuracy": 0.7508596181869507, "num_tokens": 6634983.0, "step": 1832 }, { "entropy": 0.5543322265148163, "epoch": 1.710219318712086, "grad_norm": 0.2785995900630951, "learning_rate": 0.0002, "loss": 0.5582, "mean_token_accuracy": 0.7724382132291794, "num_tokens": 6638550.0, "step": 1833 }, { "entropy": 0.5883592814207077, "epoch": 1.711152589827345, "grad_norm": 0.2631172239780426, "learning_rate": 0.0002, "loss": 0.5999, "mean_token_accuracy": 0.7575053870677948, "num_tokens": 6642227.0, "step": 1834 }, { "entropy": 0.6321828812360764, "epoch": 1.712085860942604, "grad_norm": 0.28069692850112915, "learning_rate": 0.0002, "loss": 0.626, "mean_token_accuracy": 0.7501056641340256, "num_tokens": 6645792.0, "step": 1835 }, { "entropy": 0.5813716799020767, "epoch": 1.713019132057863, "grad_norm": 0.33019664883613586, "learning_rate": 0.0002, "loss": 0.5924, "mean_token_accuracy": 0.7609377205371857, "num_tokens": 6649324.0, "step": 1836 }, { "entropy": 0.6619121432304382, "epoch": 1.713952403173122, "grad_norm": 0.3592051863670349, "learning_rate": 0.0002, "loss": 0.6601, "mean_token_accuracy": 0.7313006967306137, "num_tokens": 6653089.0, "step": 1837 }, { "entropy": 0.6091229915618896, "epoch": 1.714885674288381, "grad_norm": 0.251117467880249, "learning_rate": 0.0002, "loss": 0.6339, "mean_token_accuracy": 0.7381568402051926, "num_tokens": 6656626.0, "step": 1838 }, { "entropy": 0.6219113618135452, "epoch": 1.7158189454036399, "grad_norm": 0.28820738196372986, "learning_rate": 0.0002, "loss": 0.6352, "mean_token_accuracy": 0.7423661202192307, "num_tokens": 6660357.0, "step": 1839 }, { "entropy": 0.5974901765584946, "epoch": 1.7167522165188989, "grad_norm": 0.2675193250179291, "learning_rate": 0.0002, "loss": 0.592, "mean_token_accuracy": 0.7599681913852692, "num_tokens": 6664129.0, "step": 1840 }, { "entropy": 0.6080421358346939, "epoch": 1.7176854876341578, "grad_norm": 0.2864704132080078, "learning_rate": 0.0002, "loss": 0.6148, "mean_token_accuracy": 0.750658854842186, "num_tokens": 6667765.0, "step": 1841 }, { "entropy": 0.5806869715452194, "epoch": 1.7186187587494168, "grad_norm": 0.2643788158893585, "learning_rate": 0.0002, "loss": 0.5917, "mean_token_accuracy": 0.7607394605875015, "num_tokens": 6671368.0, "step": 1842 }, { "entropy": 0.6091624796390533, "epoch": 1.7195520298646758, "grad_norm": 0.280703067779541, "learning_rate": 0.0002, "loss": 0.6053, "mean_token_accuracy": 0.7522078603506088, "num_tokens": 6675010.0, "step": 1843 }, { "entropy": 0.5736302137374878, "epoch": 1.7204853009799348, "grad_norm": 0.24694018065929413, "learning_rate": 0.0002, "loss": 0.5796, "mean_token_accuracy": 0.7736953794956207, "num_tokens": 6678586.0, "step": 1844 }, { "entropy": 0.5869442820549011, "epoch": 1.7214185720951938, "grad_norm": 0.29455217719078064, "learning_rate": 0.0002, "loss": 0.5858, "mean_token_accuracy": 0.7657293379306793, "num_tokens": 6682265.0, "step": 1845 }, { "entropy": 0.6179107278585434, "epoch": 1.7223518432104528, "grad_norm": 0.2669997215270996, "learning_rate": 0.0002, "loss": 0.6203, "mean_token_accuracy": 0.7512550204992294, "num_tokens": 6685836.0, "step": 1846 }, { "entropy": 0.6113640069961548, "epoch": 1.7232851143257117, "grad_norm": 0.27163198590278625, "learning_rate": 0.0002, "loss": 0.6153, "mean_token_accuracy": 0.7569455951452255, "num_tokens": 6689381.0, "step": 1847 }, { "entropy": 0.5793052911758423, "epoch": 1.7242183854409707, "grad_norm": 0.28595370054244995, "learning_rate": 0.0002, "loss": 0.5763, "mean_token_accuracy": 0.7652895301580429, "num_tokens": 6692895.0, "step": 1848 }, { "entropy": 0.6229353547096252, "epoch": 1.7251516565562297, "grad_norm": 0.24048267304897308, "learning_rate": 0.0002, "loss": 0.6159, "mean_token_accuracy": 0.7462664097547531, "num_tokens": 6696465.0, "step": 1849 }, { "entropy": 0.6038338243961334, "epoch": 1.7260849276714887, "grad_norm": 0.2553085684776306, "learning_rate": 0.0002, "loss": 0.5954, "mean_token_accuracy": 0.7555287778377533, "num_tokens": 6700177.0, "step": 1850 }, { "entropy": 0.58666130900383, "epoch": 1.7270181987867477, "grad_norm": 0.2295467108488083, "learning_rate": 0.0002, "loss": 0.5817, "mean_token_accuracy": 0.7626078128814697, "num_tokens": 6703667.0, "step": 1851 }, { "entropy": 0.5942671447992325, "epoch": 1.7279514699020067, "grad_norm": 0.2131567746400833, "learning_rate": 0.0002, "loss": 0.5864, "mean_token_accuracy": 0.7669167816638947, "num_tokens": 6707350.0, "step": 1852 }, { "entropy": 0.6420735120773315, "epoch": 1.7288847410172656, "grad_norm": 0.24791944026947021, "learning_rate": 0.0002, "loss": 0.6505, "mean_token_accuracy": 0.7395756542682648, "num_tokens": 6711020.0, "step": 1853 }, { "entropy": 0.6225130707025528, "epoch": 1.7298180121325246, "grad_norm": 0.2435688078403473, "learning_rate": 0.0002, "loss": 0.631, "mean_token_accuracy": 0.7393501698970795, "num_tokens": 6714694.0, "step": 1854 }, { "entropy": 0.5468217432498932, "epoch": 1.7307512832477836, "grad_norm": 0.30479493737220764, "learning_rate": 0.0002, "loss": 0.5543, "mean_token_accuracy": 0.7724295258522034, "num_tokens": 6718269.0, "step": 1855 }, { "entropy": 0.5752130895853043, "epoch": 1.7316845543630426, "grad_norm": 0.2701444923877716, "learning_rate": 0.0002, "loss": 0.5809, "mean_token_accuracy": 0.7613906562328339, "num_tokens": 6721823.0, "step": 1856 }, { "entropy": 0.6204609274864197, "epoch": 1.7326178254783016, "grad_norm": 0.22718462347984314, "learning_rate": 0.0002, "loss": 0.6201, "mean_token_accuracy": 0.7462497055530548, "num_tokens": 6725460.0, "step": 1857 }, { "entropy": 0.6105624735355377, "epoch": 1.7335510965935605, "grad_norm": 0.23477213084697723, "learning_rate": 0.0002, "loss": 0.624, "mean_token_accuracy": 0.7499202936887741, "num_tokens": 6728960.0, "step": 1858 }, { "entropy": 0.6179772764444351, "epoch": 1.7344843677088195, "grad_norm": 0.22191952168941498, "learning_rate": 0.0002, "loss": 0.6311, "mean_token_accuracy": 0.7436550259590149, "num_tokens": 6732495.0, "step": 1859 }, { "entropy": 0.6180326491594315, "epoch": 1.7354176388240785, "grad_norm": 0.28237661719322205, "learning_rate": 0.0002, "loss": 0.6184, "mean_token_accuracy": 0.7434940040111542, "num_tokens": 6736128.0, "step": 1860 }, { "entropy": 0.6282762438058853, "epoch": 1.7363509099393375, "grad_norm": 0.25809457898139954, "learning_rate": 0.0002, "loss": 0.6283, "mean_token_accuracy": 0.7391121536493301, "num_tokens": 6739819.0, "step": 1861 }, { "entropy": 0.6595683693885803, "epoch": 1.7372841810545965, "grad_norm": 0.24446061253547668, "learning_rate": 0.0002, "loss": 0.6474, "mean_token_accuracy": 0.73812435567379, "num_tokens": 6743583.0, "step": 1862 }, { "entropy": 0.6434293985366821, "epoch": 1.7382174521698555, "grad_norm": 0.26437970995903015, "learning_rate": 0.0002, "loss": 0.657, "mean_token_accuracy": 0.737125039100647, "num_tokens": 6747163.0, "step": 1863 }, { "entropy": 0.5933619290590286, "epoch": 1.7391507232851144, "grad_norm": 0.22710825502872467, "learning_rate": 0.0002, "loss": 0.5984, "mean_token_accuracy": 0.7542142421007156, "num_tokens": 6750832.0, "step": 1864 }, { "entropy": 0.5989790707826614, "epoch": 1.7400839944003734, "grad_norm": 0.30075839161872864, "learning_rate": 0.0002, "loss": 0.6132, "mean_token_accuracy": 0.7527662068605423, "num_tokens": 6754519.0, "step": 1865 }, { "entropy": 0.5830256938934326, "epoch": 1.7410172655156324, "grad_norm": 0.3107200264930725, "learning_rate": 0.0002, "loss": 0.5974, "mean_token_accuracy": 0.7521071881055832, "num_tokens": 6758054.0, "step": 1866 }, { "entropy": 0.5961964577436447, "epoch": 1.7419505366308914, "grad_norm": 0.2646557092666626, "learning_rate": 0.0002, "loss": 0.5933, "mean_token_accuracy": 0.7581335008144379, "num_tokens": 6761799.0, "step": 1867 }, { "entropy": 0.6032601892948151, "epoch": 1.7428838077461504, "grad_norm": 0.2247680127620697, "learning_rate": 0.0002, "loss": 0.6056, "mean_token_accuracy": 0.7534383684396744, "num_tokens": 6765495.0, "step": 1868 }, { "entropy": 0.5995467454195023, "epoch": 1.7438170788614094, "grad_norm": 0.21218864619731903, "learning_rate": 0.0002, "loss": 0.6069, "mean_token_accuracy": 0.753336101770401, "num_tokens": 6769333.0, "step": 1869 }, { "entropy": 0.5704792439937592, "epoch": 1.7447503499766683, "grad_norm": 0.2482548952102661, "learning_rate": 0.0002, "loss": 0.5751, "mean_token_accuracy": 0.7687889486551285, "num_tokens": 6772990.0, "step": 1870 }, { "entropy": 0.5505224242806435, "epoch": 1.7456836210919273, "grad_norm": 0.24394384026527405, "learning_rate": 0.0002, "loss": 0.5554, "mean_token_accuracy": 0.7737231701612473, "num_tokens": 6776457.0, "step": 1871 }, { "entropy": 0.6277417540550232, "epoch": 1.7466168922071863, "grad_norm": 0.25724661350250244, "learning_rate": 0.0002, "loss": 0.6358, "mean_token_accuracy": 0.7429577708244324, "num_tokens": 6780081.0, "step": 1872 }, { "entropy": 0.5862596035003662, "epoch": 1.7475501633224453, "grad_norm": 0.2664036154747009, "learning_rate": 0.0002, "loss": 0.5926, "mean_token_accuracy": 0.7588876038789749, "num_tokens": 6783665.0, "step": 1873 }, { "entropy": 0.6568341553211212, "epoch": 1.7484834344377043, "grad_norm": 0.21809439361095428, "learning_rate": 0.0002, "loss": 0.6502, "mean_token_accuracy": 0.7286118417978287, "num_tokens": 6787277.0, "step": 1874 }, { "entropy": 0.5799648463726044, "epoch": 1.7494167055529632, "grad_norm": 0.2830334007740021, "learning_rate": 0.0002, "loss": 0.6004, "mean_token_accuracy": 0.7617077976465225, "num_tokens": 6790875.0, "step": 1875 }, { "entropy": 0.5868234187364578, "epoch": 1.7503499766682222, "grad_norm": 0.23919756710529327, "learning_rate": 0.0002, "loss": 0.5958, "mean_token_accuracy": 0.7659401297569275, "num_tokens": 6794569.0, "step": 1876 }, { "entropy": 0.6267431229352951, "epoch": 1.7512832477834812, "grad_norm": 0.24333910644054413, "learning_rate": 0.0002, "loss": 0.6339, "mean_token_accuracy": 0.7423048466444016, "num_tokens": 6798231.0, "step": 1877 }, { "entropy": 0.6074234694242477, "epoch": 1.7522165188987402, "grad_norm": 0.2177412360906601, "learning_rate": 0.0002, "loss": 0.6041, "mean_token_accuracy": 0.7541158944368362, "num_tokens": 6801879.0, "step": 1878 }, { "entropy": 0.6124870330095291, "epoch": 1.7531497900139992, "grad_norm": 0.2208767980337143, "learning_rate": 0.0002, "loss": 0.6072, "mean_token_accuracy": 0.7535650432109833, "num_tokens": 6805488.0, "step": 1879 }, { "entropy": 0.6004997938871384, "epoch": 1.7540830611292582, "grad_norm": 0.21080617606639862, "learning_rate": 0.0002, "loss": 0.607, "mean_token_accuracy": 0.7555433958768845, "num_tokens": 6809152.0, "step": 1880 }, { "entropy": 0.6187247484922409, "epoch": 1.7550163322445171, "grad_norm": 0.2412988394498825, "learning_rate": 0.0002, "loss": 0.6178, "mean_token_accuracy": 0.745167151093483, "num_tokens": 6812721.0, "step": 1881 }, { "entropy": 0.6283550411462784, "epoch": 1.7559496033597761, "grad_norm": 0.2555260956287384, "learning_rate": 0.0002, "loss": 0.6211, "mean_token_accuracy": 0.7466004639863968, "num_tokens": 6816316.0, "step": 1882 }, { "entropy": 0.6269446015357971, "epoch": 1.756882874475035, "grad_norm": 0.25438228249549866, "learning_rate": 0.0002, "loss": 0.6298, "mean_token_accuracy": 0.7387303411960602, "num_tokens": 6819979.0, "step": 1883 }, { "entropy": 0.5716069787740707, "epoch": 1.757816145590294, "grad_norm": 0.23292206227779388, "learning_rate": 0.0002, "loss": 0.5767, "mean_token_accuracy": 0.7636187374591827, "num_tokens": 6823659.0, "step": 1884 }, { "entropy": 0.5942920595407486, "epoch": 1.758749416705553, "grad_norm": 0.2831074297428131, "learning_rate": 0.0002, "loss": 0.6082, "mean_token_accuracy": 0.7510235011577606, "num_tokens": 6827176.0, "step": 1885 }, { "entropy": 0.5611752271652222, "epoch": 1.759682687820812, "grad_norm": 0.28625839948654175, "learning_rate": 0.0002, "loss": 0.5729, "mean_token_accuracy": 0.7702569961547852, "num_tokens": 6830768.0, "step": 1886 }, { "entropy": 0.6397388875484467, "epoch": 1.760615958936071, "grad_norm": 0.29148006439208984, "learning_rate": 0.0002, "loss": 0.6346, "mean_token_accuracy": 0.7492653131484985, "num_tokens": 6834399.0, "step": 1887 }, { "entropy": 0.5799808949232101, "epoch": 1.76154923005133, "grad_norm": 0.2349727898836136, "learning_rate": 0.0002, "loss": 0.5877, "mean_token_accuracy": 0.7572033703327179, "num_tokens": 6837969.0, "step": 1888 }, { "entropy": 0.6266836822032928, "epoch": 1.762482501166589, "grad_norm": 0.22418397665023804, "learning_rate": 0.0002, "loss": 0.6274, "mean_token_accuracy": 0.7459535598754883, "num_tokens": 6841563.0, "step": 1889 }, { "entropy": 0.5934327989816666, "epoch": 1.763415772281848, "grad_norm": 0.2220500409603119, "learning_rate": 0.0002, "loss": 0.5867, "mean_token_accuracy": 0.7607349753379822, "num_tokens": 6845207.0, "step": 1890 }, { "entropy": 0.6241078525781631, "epoch": 1.764349043397107, "grad_norm": 0.2397388368844986, "learning_rate": 0.0002, "loss": 0.6197, "mean_token_accuracy": 0.7539225220680237, "num_tokens": 6848971.0, "step": 1891 }, { "entropy": 0.5814435482025146, "epoch": 1.765282314512366, "grad_norm": 0.24946732819080353, "learning_rate": 0.0002, "loss": 0.585, "mean_token_accuracy": 0.7622857391834259, "num_tokens": 6852675.0, "step": 1892 }, { "entropy": 0.5771313011646271, "epoch": 1.766215585627625, "grad_norm": 0.28432339429855347, "learning_rate": 0.0002, "loss": 0.5946, "mean_token_accuracy": 0.7601511776447296, "num_tokens": 6856062.0, "step": 1893 }, { "entropy": 0.5717050284147263, "epoch": 1.767148856742884, "grad_norm": 0.23432819545269012, "learning_rate": 0.0002, "loss": 0.5805, "mean_token_accuracy": 0.7606582790613174, "num_tokens": 6859659.0, "step": 1894 }, { "entropy": 0.60822394490242, "epoch": 1.768082127858143, "grad_norm": 0.26894810795783997, "learning_rate": 0.0002, "loss": 0.6195, "mean_token_accuracy": 0.7488020807504654, "num_tokens": 6863295.0, "step": 1895 }, { "entropy": 0.606867104768753, "epoch": 1.7690153989734019, "grad_norm": 0.2962116003036499, "learning_rate": 0.0002, "loss": 0.6272, "mean_token_accuracy": 0.7476552873849869, "num_tokens": 6866815.0, "step": 1896 }, { "entropy": 0.5575737208127975, "epoch": 1.7699486700886609, "grad_norm": 0.2367931753396988, "learning_rate": 0.0002, "loss": 0.5577, "mean_token_accuracy": 0.7771807461977005, "num_tokens": 6870294.0, "step": 1897 }, { "entropy": 0.6115903705358505, "epoch": 1.7708819412039198, "grad_norm": 0.2435760796070099, "learning_rate": 0.0002, "loss": 0.612, "mean_token_accuracy": 0.7506041079759598, "num_tokens": 6873887.0, "step": 1898 }, { "entropy": 0.5962492376565933, "epoch": 1.7718152123191788, "grad_norm": 0.24322813749313354, "learning_rate": 0.0002, "loss": 0.5865, "mean_token_accuracy": 0.7605177164077759, "num_tokens": 6877516.0, "step": 1899 }, { "entropy": 0.5730057209730148, "epoch": 1.7727484834344378, "grad_norm": 0.23254013061523438, "learning_rate": 0.0002, "loss": 0.5751, "mean_token_accuracy": 0.7650284916162491, "num_tokens": 6881085.0, "step": 1900 }, { "entropy": 0.6271042674779892, "epoch": 1.7736817545496968, "grad_norm": 0.24476471543312073, "learning_rate": 0.0002, "loss": 0.6291, "mean_token_accuracy": 0.7400743067264557, "num_tokens": 6884675.0, "step": 1901 }, { "entropy": 0.5729819387197495, "epoch": 1.7746150256649558, "grad_norm": 0.2525876462459564, "learning_rate": 0.0002, "loss": 0.5758, "mean_token_accuracy": 0.7604411542415619, "num_tokens": 6888293.0, "step": 1902 }, { "entropy": 0.5855732858181, "epoch": 1.7755482967802148, "grad_norm": 0.2425987422466278, "learning_rate": 0.0002, "loss": 0.5834, "mean_token_accuracy": 0.7673011124134064, "num_tokens": 6891987.0, "step": 1903 }, { "entropy": 0.6134186238050461, "epoch": 1.7764815678954737, "grad_norm": 0.24483056366443634, "learning_rate": 0.0002, "loss": 0.6232, "mean_token_accuracy": 0.7450037002563477, "num_tokens": 6895639.0, "step": 1904 }, { "entropy": 0.636281818151474, "epoch": 1.7774148390107327, "grad_norm": 0.3051453232765198, "learning_rate": 0.0002, "loss": 0.6363, "mean_token_accuracy": 0.7373410314321518, "num_tokens": 6899309.0, "step": 1905 }, { "entropy": 0.6120012253522873, "epoch": 1.7783481101259917, "grad_norm": 0.2823890447616577, "learning_rate": 0.0002, "loss": 0.6087, "mean_token_accuracy": 0.7549251616001129, "num_tokens": 6903011.0, "step": 1906 }, { "entropy": 0.6130100637674332, "epoch": 1.7792813812412507, "grad_norm": 0.2824368476867676, "learning_rate": 0.0002, "loss": 0.6066, "mean_token_accuracy": 0.7530791759490967, "num_tokens": 6906678.0, "step": 1907 }, { "entropy": 0.5929142087697983, "epoch": 1.7802146523565097, "grad_norm": 0.24861885607242584, "learning_rate": 0.0002, "loss": 0.599, "mean_token_accuracy": 0.7557525783777237, "num_tokens": 6910232.0, "step": 1908 }, { "entropy": 0.6330219805240631, "epoch": 1.7811479234717686, "grad_norm": 0.26956385374069214, "learning_rate": 0.0002, "loss": 0.6253, "mean_token_accuracy": 0.746796578168869, "num_tokens": 6913889.0, "step": 1909 }, { "entropy": 0.6074580699205399, "epoch": 1.7820811945870276, "grad_norm": 0.2575360834598541, "learning_rate": 0.0002, "loss": 0.6179, "mean_token_accuracy": 0.7495936006307602, "num_tokens": 6917456.0, "step": 1910 }, { "entropy": 0.5960415899753571, "epoch": 1.7830144657022866, "grad_norm": 0.24021373689174652, "learning_rate": 0.0002, "loss": 0.6092, "mean_token_accuracy": 0.7519774734973907, "num_tokens": 6921055.0, "step": 1911 }, { "entropy": 0.6229307502508163, "epoch": 1.7839477368175456, "grad_norm": 0.2269347906112671, "learning_rate": 0.0002, "loss": 0.6276, "mean_token_accuracy": 0.7479539662599564, "num_tokens": 6924704.0, "step": 1912 }, { "entropy": 0.6328893303871155, "epoch": 1.7848810079328046, "grad_norm": 0.28894516825675964, "learning_rate": 0.0002, "loss": 0.651, "mean_token_accuracy": 0.7330974340438843, "num_tokens": 6928354.0, "step": 1913 }, { "entropy": 0.6328782439231873, "epoch": 1.7858142790480636, "grad_norm": 0.2185562402009964, "learning_rate": 0.0002, "loss": 0.6438, "mean_token_accuracy": 0.7377255707979202, "num_tokens": 6932038.0, "step": 1914 }, { "entropy": 0.5483825355768204, "epoch": 1.7867475501633225, "grad_norm": 0.23131421208381653, "learning_rate": 0.0002, "loss": 0.5467, "mean_token_accuracy": 0.7811415046453476, "num_tokens": 6935524.0, "step": 1915 }, { "entropy": 0.5954821109771729, "epoch": 1.7876808212785815, "grad_norm": 0.2436859905719757, "learning_rate": 0.0002, "loss": 0.5883, "mean_token_accuracy": 0.7569498717784882, "num_tokens": 6939174.0, "step": 1916 }, { "entropy": 0.6053332388401031, "epoch": 1.7886140923938405, "grad_norm": 0.2537092864513397, "learning_rate": 0.0002, "loss": 0.6015, "mean_token_accuracy": 0.7586090713739395, "num_tokens": 6942784.0, "step": 1917 }, { "entropy": 0.5992056280374527, "epoch": 1.7895473635090995, "grad_norm": 0.22237791121006012, "learning_rate": 0.0002, "loss": 0.6066, "mean_token_accuracy": 0.753287062048912, "num_tokens": 6946393.0, "step": 1918 }, { "entropy": 0.559271514415741, "epoch": 1.7904806346243585, "grad_norm": 0.2609305679798126, "learning_rate": 0.0002, "loss": 0.5646, "mean_token_accuracy": 0.7688850611448288, "num_tokens": 6949883.0, "step": 1919 }, { "entropy": 0.5704145133495331, "epoch": 1.7914139057396175, "grad_norm": 0.2871963083744049, "learning_rate": 0.0002, "loss": 0.5945, "mean_token_accuracy": 0.7565074265003204, "num_tokens": 6953501.0, "step": 1920 }, { "entropy": 0.5966026037931442, "epoch": 1.7923471768548764, "grad_norm": 0.30087438225746155, "learning_rate": 0.0002, "loss": 0.6281, "mean_token_accuracy": 0.7476854026317596, "num_tokens": 6957114.0, "step": 1921 }, { "entropy": 0.5632449686527252, "epoch": 1.7932804479701354, "grad_norm": 0.3369523286819458, "learning_rate": 0.0002, "loss": 0.5938, "mean_token_accuracy": 0.754164531826973, "num_tokens": 6960623.0, "step": 1922 }, { "entropy": 0.5936648994684219, "epoch": 1.7942137190853944, "grad_norm": 0.2645880877971649, "learning_rate": 0.0002, "loss": 0.6096, "mean_token_accuracy": 0.7495139390230179, "num_tokens": 6964160.0, "step": 1923 }, { "entropy": 0.6126968264579773, "epoch": 1.7951469902006534, "grad_norm": 0.2915595769882202, "learning_rate": 0.0002, "loss": 0.618, "mean_token_accuracy": 0.7470590621232986, "num_tokens": 6967793.0, "step": 1924 }, { "entropy": 0.6154760867357254, "epoch": 1.7960802613159124, "grad_norm": 0.24545781314373016, "learning_rate": 0.0002, "loss": 0.6014, "mean_token_accuracy": 0.7546386420726776, "num_tokens": 6971413.0, "step": 1925 }, { "entropy": 0.6634869575500488, "epoch": 1.7970135324311713, "grad_norm": 0.2266920655965805, "learning_rate": 0.0002, "loss": 0.6373, "mean_token_accuracy": 0.7416670173406601, "num_tokens": 6975050.0, "step": 1926 }, { "entropy": 0.6234626173973083, "epoch": 1.7979468035464303, "grad_norm": 0.24588559567928314, "learning_rate": 0.0002, "loss": 0.5998, "mean_token_accuracy": 0.7515081167221069, "num_tokens": 6978850.0, "step": 1927 }, { "entropy": 0.6509423702955246, "epoch": 1.7988800746616893, "grad_norm": 0.1971644014120102, "learning_rate": 0.0002, "loss": 0.6349, "mean_token_accuracy": 0.7438052445650101, "num_tokens": 6982528.0, "step": 1928 }, { "entropy": 0.6209890842437744, "epoch": 1.7998133457769483, "grad_norm": 0.2558201253414154, "learning_rate": 0.0002, "loss": 0.6204, "mean_token_accuracy": 0.7549275606870651, "num_tokens": 6986155.0, "step": 1929 }, { "entropy": 0.565154567360878, "epoch": 1.8007466168922073, "grad_norm": 0.30951324105262756, "learning_rate": 0.0002, "loss": 0.5667, "mean_token_accuracy": 0.7699216455221176, "num_tokens": 6989809.0, "step": 1930 }, { "entropy": 0.5802939832210541, "epoch": 1.8016798880074663, "grad_norm": 0.2443498969078064, "learning_rate": 0.0002, "loss": 0.5886, "mean_token_accuracy": 0.762464627623558, "num_tokens": 6993420.0, "step": 1931 }, { "entropy": 0.6429274827241898, "epoch": 1.8026131591227252, "grad_norm": 0.2911980450153351, "learning_rate": 0.0002, "loss": 0.6425, "mean_token_accuracy": 0.7405750751495361, "num_tokens": 6997178.0, "step": 1932 }, { "entropy": 0.614846259355545, "epoch": 1.8035464302379842, "grad_norm": 0.33124685287475586, "learning_rate": 0.0002, "loss": 0.6475, "mean_token_accuracy": 0.7407257556915283, "num_tokens": 7000883.0, "step": 1933 }, { "entropy": 0.5968641638755798, "epoch": 1.8044797013532432, "grad_norm": 0.2394046187400818, "learning_rate": 0.0002, "loss": 0.5972, "mean_token_accuracy": 0.7563975900411606, "num_tokens": 7004713.0, "step": 1934 }, { "entropy": 0.6226436793804169, "epoch": 1.8054129724685022, "grad_norm": 0.2320629060268402, "learning_rate": 0.0002, "loss": 0.6134, "mean_token_accuracy": 0.749428391456604, "num_tokens": 7008399.0, "step": 1935 }, { "entropy": 0.6128804236650467, "epoch": 1.8063462435837612, "grad_norm": 0.2654680609703064, "learning_rate": 0.0002, "loss": 0.6213, "mean_token_accuracy": 0.7494582682847977, "num_tokens": 7012076.0, "step": 1936 }, { "entropy": 0.6320488154888153, "epoch": 1.8072795146990202, "grad_norm": 0.2505747377872467, "learning_rate": 0.0002, "loss": 0.6307, "mean_token_accuracy": 0.7387755960226059, "num_tokens": 7015797.0, "step": 1937 }, { "entropy": 0.6187125444412231, "epoch": 1.8082127858142791, "grad_norm": 0.220687597990036, "learning_rate": 0.0002, "loss": 0.6168, "mean_token_accuracy": 0.7464780807495117, "num_tokens": 7019439.0, "step": 1938 }, { "entropy": 0.6031928658485413, "epoch": 1.8091460569295381, "grad_norm": 0.21982234716415405, "learning_rate": 0.0002, "loss": 0.6096, "mean_token_accuracy": 0.7521413713693619, "num_tokens": 7023036.0, "step": 1939 }, { "entropy": 0.6145918071269989, "epoch": 1.810079328044797, "grad_norm": 0.24062712490558624, "learning_rate": 0.0002, "loss": 0.6163, "mean_token_accuracy": 0.7527556866407394, "num_tokens": 7026801.0, "step": 1940 }, { "entropy": 0.5885131061077118, "epoch": 1.811012599160056, "grad_norm": 0.2565814256668091, "learning_rate": 0.0002, "loss": 0.5993, "mean_token_accuracy": 0.7610924690961838, "num_tokens": 7030408.0, "step": 1941 }, { "entropy": 0.6068475097417831, "epoch": 1.811945870275315, "grad_norm": 0.2876201272010803, "learning_rate": 0.0002, "loss": 0.616, "mean_token_accuracy": 0.7521682679653168, "num_tokens": 7033937.0, "step": 1942 }, { "entropy": 0.6086252927780151, "epoch": 1.812879141390574, "grad_norm": 0.28313687443733215, "learning_rate": 0.0002, "loss": 0.6153, "mean_token_accuracy": 0.7480164617300034, "num_tokens": 7037567.0, "step": 1943 }, { "entropy": 0.6286919414997101, "epoch": 1.813812412505833, "grad_norm": 0.24489790201187134, "learning_rate": 0.0002, "loss": 0.6332, "mean_token_accuracy": 0.7422590404748917, "num_tokens": 7041193.0, "step": 1944 }, { "entropy": 0.6038795411586761, "epoch": 1.814745683621092, "grad_norm": 0.27399879693984985, "learning_rate": 0.0002, "loss": 0.6051, "mean_token_accuracy": 0.7518823742866516, "num_tokens": 7044673.0, "step": 1945 }, { "entropy": 0.5729928910732269, "epoch": 1.815678954736351, "grad_norm": 0.22174663841724396, "learning_rate": 0.0002, "loss": 0.57, "mean_token_accuracy": 0.7672777622938156, "num_tokens": 7048346.0, "step": 1946 }, { "entropy": 0.5914383083581924, "epoch": 1.81661222585161, "grad_norm": 0.2339327335357666, "learning_rate": 0.0002, "loss": 0.5969, "mean_token_accuracy": 0.7608778178691864, "num_tokens": 7051963.0, "step": 1947 }, { "entropy": 0.5939162150025368, "epoch": 1.817545496966869, "grad_norm": 0.25145283341407776, "learning_rate": 0.0002, "loss": 0.6112, "mean_token_accuracy": 0.7484289258718491, "num_tokens": 7055582.0, "step": 1948 }, { "entropy": 0.585673451423645, "epoch": 1.818478768082128, "grad_norm": 0.25709816813468933, "learning_rate": 0.0002, "loss": 0.5854, "mean_token_accuracy": 0.7593719214200974, "num_tokens": 7059221.0, "step": 1949 }, { "entropy": 0.6432221531867981, "epoch": 1.819412039197387, "grad_norm": 0.308527410030365, "learning_rate": 0.0002, "loss": 0.6726, "mean_token_accuracy": 0.7290161848068237, "num_tokens": 7062816.0, "step": 1950 }, { "entropy": 0.6025533229112625, "epoch": 1.820345310312646, "grad_norm": 0.25094640254974365, "learning_rate": 0.0002, "loss": 0.5988, "mean_token_accuracy": 0.7605886161327362, "num_tokens": 7066437.0, "step": 1951 }, { "entropy": 0.6232088357210159, "epoch": 1.8212785814279049, "grad_norm": 0.27212920784950256, "learning_rate": 0.0002, "loss": 0.6266, "mean_token_accuracy": 0.7473696023225784, "num_tokens": 7070122.0, "step": 1952 }, { "entropy": 0.6351532638072968, "epoch": 1.8222118525431639, "grad_norm": 0.24538342654705048, "learning_rate": 0.0002, "loss": 0.6258, "mean_token_accuracy": 0.7413685172796249, "num_tokens": 7073671.0, "step": 1953 }, { "entropy": 0.6165336519479752, "epoch": 1.8231451236584229, "grad_norm": 0.25642234086990356, "learning_rate": 0.0002, "loss": 0.6104, "mean_token_accuracy": 0.7558563202619553, "num_tokens": 7077370.0, "step": 1954 }, { "entropy": 0.5976254343986511, "epoch": 1.8240783947736818, "grad_norm": 0.25838765501976013, "learning_rate": 0.0002, "loss": 0.5932, "mean_token_accuracy": 0.7565771341323853, "num_tokens": 7081053.0, "step": 1955 }, { "entropy": 0.612096443772316, "epoch": 1.8250116658889408, "grad_norm": 0.2566034197807312, "learning_rate": 0.0002, "loss": 0.6194, "mean_token_accuracy": 0.7501742988824844, "num_tokens": 7084685.0, "step": 1956 }, { "entropy": 0.6062662601470947, "epoch": 1.8259449370041998, "grad_norm": 0.26184460520744324, "learning_rate": 0.0002, "loss": 0.6143, "mean_token_accuracy": 0.7451974004507065, "num_tokens": 7088297.0, "step": 1957 }, { "entropy": 0.5797479003667831, "epoch": 1.8268782081194588, "grad_norm": 0.2513458728790283, "learning_rate": 0.0002, "loss": 0.584, "mean_token_accuracy": 0.7633444219827652, "num_tokens": 7091954.0, "step": 1958 }, { "entropy": 0.5913399457931519, "epoch": 1.8278114792347178, "grad_norm": 0.24045544862747192, "learning_rate": 0.0002, "loss": 0.5871, "mean_token_accuracy": 0.7632881253957748, "num_tokens": 7095605.0, "step": 1959 }, { "entropy": 0.6290463954210281, "epoch": 1.8287447503499767, "grad_norm": 0.26717299222946167, "learning_rate": 0.0002, "loss": 0.6329, "mean_token_accuracy": 0.7421909123659134, "num_tokens": 7099353.0, "step": 1960 }, { "entropy": 0.6093256622552872, "epoch": 1.8296780214652357, "grad_norm": 0.25146013498306274, "learning_rate": 0.0002, "loss": 0.6095, "mean_token_accuracy": 0.7581947147846222, "num_tokens": 7103076.0, "step": 1961 }, { "entropy": 0.5748381316661835, "epoch": 1.8306112925804947, "grad_norm": 0.24930116534233093, "learning_rate": 0.0002, "loss": 0.5739, "mean_token_accuracy": 0.7604848444461823, "num_tokens": 7106705.0, "step": 1962 }, { "entropy": 0.6197525858879089, "epoch": 1.8315445636957537, "grad_norm": 0.34103459119796753, "learning_rate": 0.0002, "loss": 0.6281, "mean_token_accuracy": 0.7454806566238403, "num_tokens": 7110331.0, "step": 1963 }, { "entropy": 0.6166300028562546, "epoch": 1.8324778348110127, "grad_norm": 0.23121799528598785, "learning_rate": 0.0002, "loss": 0.6111, "mean_token_accuracy": 0.7504102885723114, "num_tokens": 7114058.0, "step": 1964 }, { "entropy": 0.5971930027008057, "epoch": 1.8334111059262717, "grad_norm": 0.2666342258453369, "learning_rate": 0.0002, "loss": 0.5915, "mean_token_accuracy": 0.7591194212436676, "num_tokens": 7117624.0, "step": 1965 }, { "entropy": 0.626311406493187, "epoch": 1.8343443770415306, "grad_norm": 0.23591136932373047, "learning_rate": 0.0002, "loss": 0.6285, "mean_token_accuracy": 0.7399208396673203, "num_tokens": 7121143.0, "step": 1966 }, { "entropy": 0.6225715577602386, "epoch": 1.8352776481567896, "grad_norm": 0.24526509642601013, "learning_rate": 0.0002, "loss": 0.6136, "mean_token_accuracy": 0.7501428872346878, "num_tokens": 7124770.0, "step": 1967 }, { "entropy": 0.6244671791791916, "epoch": 1.8362109192720486, "grad_norm": 0.2503562271595001, "learning_rate": 0.0002, "loss": 0.6125, "mean_token_accuracy": 0.7538400292396545, "num_tokens": 7128454.0, "step": 1968 }, { "entropy": 0.6328994035720825, "epoch": 1.8371441903873076, "grad_norm": 0.22043581306934357, "learning_rate": 0.0002, "loss": 0.6292, "mean_token_accuracy": 0.7427585124969482, "num_tokens": 7132013.0, "step": 1969 }, { "entropy": 0.6022193878889084, "epoch": 1.8380774615025666, "grad_norm": 0.2499414086341858, "learning_rate": 0.0002, "loss": 0.6113, "mean_token_accuracy": 0.7493800818920135, "num_tokens": 7135601.0, "step": 1970 }, { "entropy": 0.6283420026302338, "epoch": 1.8390107326178255, "grad_norm": 0.3167988955974579, "learning_rate": 0.0002, "loss": 0.6393, "mean_token_accuracy": 0.7343988418579102, "num_tokens": 7139343.0, "step": 1971 }, { "entropy": 0.6179436445236206, "epoch": 1.8399440037330845, "grad_norm": 0.28848782181739807, "learning_rate": 0.0002, "loss": 0.6246, "mean_token_accuracy": 0.7481627017259598, "num_tokens": 7142981.0, "step": 1972 }, { "entropy": 0.6152937859296799, "epoch": 1.8408772748483435, "grad_norm": 0.31180259585380554, "learning_rate": 0.0002, "loss": 0.6294, "mean_token_accuracy": 0.7509432435035706, "num_tokens": 7146746.0, "step": 1973 }, { "entropy": 0.5915137231349945, "epoch": 1.8418105459636025, "grad_norm": 0.29026415944099426, "learning_rate": 0.0002, "loss": 0.6081, "mean_token_accuracy": 0.7534918785095215, "num_tokens": 7150337.0, "step": 1974 }, { "entropy": 0.5983506292104721, "epoch": 1.8427438170788615, "grad_norm": 0.2531638443470001, "learning_rate": 0.0002, "loss": 0.6106, "mean_token_accuracy": 0.7484514117240906, "num_tokens": 7153947.0, "step": 1975 }, { "entropy": 0.6130942404270172, "epoch": 1.8436770881941205, "grad_norm": 0.29404041171073914, "learning_rate": 0.0002, "loss": 0.6199, "mean_token_accuracy": 0.7511842846870422, "num_tokens": 7157556.0, "step": 1976 }, { "entropy": 0.5874306559562683, "epoch": 1.8446103593093794, "grad_norm": 0.219297394156456, "learning_rate": 0.0002, "loss": 0.5847, "mean_token_accuracy": 0.759384885430336, "num_tokens": 7161154.0, "step": 1977 }, { "entropy": 0.6222779303789139, "epoch": 1.8455436304246384, "grad_norm": 0.2828183174133301, "learning_rate": 0.0002, "loss": 0.6239, "mean_token_accuracy": 0.7499133348464966, "num_tokens": 7164767.0, "step": 1978 }, { "entropy": 0.5567679703235626, "epoch": 1.8464769015398974, "grad_norm": 0.21722052991390228, "learning_rate": 0.0002, "loss": 0.5535, "mean_token_accuracy": 0.7754718512296677, "num_tokens": 7168392.0, "step": 1979 }, { "entropy": 0.6289063543081284, "epoch": 1.8474101726551564, "grad_norm": 0.2408675104379654, "learning_rate": 0.0002, "loss": 0.6269, "mean_token_accuracy": 0.7405707091093063, "num_tokens": 7171970.0, "step": 1980 }, { "entropy": 0.6249770373106003, "epoch": 1.8483434437704154, "grad_norm": 0.2535587549209595, "learning_rate": 0.0002, "loss": 0.6156, "mean_token_accuracy": 0.7494661211967468, "num_tokens": 7175619.0, "step": 1981 }, { "entropy": 0.6038305312395096, "epoch": 1.8492767148856744, "grad_norm": 0.2372436374425888, "learning_rate": 0.0002, "loss": 0.6016, "mean_token_accuracy": 0.7596606314182281, "num_tokens": 7179184.0, "step": 1982 }, { "entropy": 0.5850977003574371, "epoch": 1.8502099860009333, "grad_norm": 0.23801682889461517, "learning_rate": 0.0002, "loss": 0.5808, "mean_token_accuracy": 0.7656396478414536, "num_tokens": 7182818.0, "step": 1983 }, { "entropy": 0.5774066895246506, "epoch": 1.8511432571161923, "grad_norm": 0.29614511132240295, "learning_rate": 0.0002, "loss": 0.5984, "mean_token_accuracy": 0.7580457627773285, "num_tokens": 7186333.0, "step": 1984 }, { "entropy": 0.5980075150728226, "epoch": 1.8520765282314513, "grad_norm": 0.23474453389644623, "learning_rate": 0.0002, "loss": 0.5954, "mean_token_accuracy": 0.7604599148035049, "num_tokens": 7190012.0, "step": 1985 }, { "entropy": 0.5898908823728561, "epoch": 1.8530097993467103, "grad_norm": 0.30417412519454956, "learning_rate": 0.0002, "loss": 0.5987, "mean_token_accuracy": 0.7601276934146881, "num_tokens": 7193609.0, "step": 1986 }, { "entropy": 0.5947508215904236, "epoch": 1.8539430704619693, "grad_norm": 0.23262359201908112, "learning_rate": 0.0002, "loss": 0.5998, "mean_token_accuracy": 0.7605807334184647, "num_tokens": 7197150.0, "step": 1987 }, { "entropy": 0.5752701312303543, "epoch": 1.8548763415772282, "grad_norm": 0.2836894392967224, "learning_rate": 0.0002, "loss": 0.5823, "mean_token_accuracy": 0.7629403322935104, "num_tokens": 7200651.0, "step": 1988 }, { "entropy": 0.6290169954299927, "epoch": 1.8558096126924872, "grad_norm": 0.292520672082901, "learning_rate": 0.0002, "loss": 0.6339, "mean_token_accuracy": 0.7380457818508148, "num_tokens": 7204165.0, "step": 1989 }, { "entropy": 0.6039870232343674, "epoch": 1.8567428838077462, "grad_norm": 0.2738334536552429, "learning_rate": 0.0002, "loss": 0.5993, "mean_token_accuracy": 0.758189246058464, "num_tokens": 7207808.0, "step": 1990 }, { "entropy": 0.6206848919391632, "epoch": 1.8576761549230052, "grad_norm": 0.2924184203147888, "learning_rate": 0.0002, "loss": 0.6137, "mean_token_accuracy": 0.7529062330722809, "num_tokens": 7211383.0, "step": 1991 }, { "entropy": 0.5878458619117737, "epoch": 1.8586094260382642, "grad_norm": 0.24235372245311737, "learning_rate": 0.0002, "loss": 0.5932, "mean_token_accuracy": 0.7617557644844055, "num_tokens": 7215109.0, "step": 1992 }, { "entropy": 0.6094233244657516, "epoch": 1.8595426971535232, "grad_norm": 0.24443745613098145, "learning_rate": 0.0002, "loss": 0.6068, "mean_token_accuracy": 0.7498938292264938, "num_tokens": 7218827.0, "step": 1993 }, { "entropy": 0.5904861986637115, "epoch": 1.8604759682687821, "grad_norm": 0.31043168902397156, "learning_rate": 0.0002, "loss": 0.6164, "mean_token_accuracy": 0.7548473030328751, "num_tokens": 7222345.0, "step": 1994 }, { "entropy": 0.5960548371076584, "epoch": 1.8614092393840411, "grad_norm": 0.25524163246154785, "learning_rate": 0.0002, "loss": 0.6009, "mean_token_accuracy": 0.7609274387359619, "num_tokens": 7225933.0, "step": 1995 }, { "entropy": 0.5640939772129059, "epoch": 1.8623425104993, "grad_norm": 0.24383054673671722, "learning_rate": 0.0002, "loss": 0.575, "mean_token_accuracy": 0.7703619599342346, "num_tokens": 7229483.0, "step": 1996 }, { "entropy": 0.5675372332334518, "epoch": 1.863275781614559, "grad_norm": 0.26458045840263367, "learning_rate": 0.0002, "loss": 0.5762, "mean_token_accuracy": 0.765500396490097, "num_tokens": 7233079.0, "step": 1997 }, { "entropy": 0.6185908317565918, "epoch": 1.864209052729818, "grad_norm": 0.26297667622566223, "learning_rate": 0.0002, "loss": 0.6141, "mean_token_accuracy": 0.7520388960838318, "num_tokens": 7236727.0, "step": 1998 }, { "entropy": 0.6068966686725616, "epoch": 1.865142323845077, "grad_norm": 0.22249019145965576, "learning_rate": 0.0002, "loss": 0.6055, "mean_token_accuracy": 0.7566292136907578, "num_tokens": 7240502.0, "step": 1999 }, { "entropy": 0.5931725203990936, "epoch": 1.866075594960336, "grad_norm": 0.25924354791641235, "learning_rate": 0.0002, "loss": 0.5904, "mean_token_accuracy": 0.7605213522911072, "num_tokens": 7244153.0, "step": 2000 }, { "entropy": 0.6252978593111038, "epoch": 1.867008866075595, "grad_norm": 0.26000499725341797, "learning_rate": 0.0002, "loss": 0.6233, "mean_token_accuracy": 0.7470914572477341, "num_tokens": 7247769.0, "step": 2001 }, { "entropy": 0.6062953025102615, "epoch": 1.867942137190854, "grad_norm": 0.25536781549453735, "learning_rate": 0.0002, "loss": 0.6147, "mean_token_accuracy": 0.7526402473449707, "num_tokens": 7251302.0, "step": 2002 }, { "entropy": 0.5889302492141724, "epoch": 1.868875408306113, "grad_norm": 0.2770785987377167, "learning_rate": 0.0002, "loss": 0.5924, "mean_token_accuracy": 0.7583634704351425, "num_tokens": 7254817.0, "step": 2003 }, { "entropy": 0.5779958367347717, "epoch": 1.869808679421372, "grad_norm": 0.26691779494285583, "learning_rate": 0.0002, "loss": 0.5842, "mean_token_accuracy": 0.7628279328346252, "num_tokens": 7258273.0, "step": 2004 }, { "entropy": 0.6172583550214767, "epoch": 1.870741950536631, "grad_norm": 0.2429082989692688, "learning_rate": 0.0002, "loss": 0.6202, "mean_token_accuracy": 0.7482197880744934, "num_tokens": 7261797.0, "step": 2005 }, { "entropy": 0.6077843904495239, "epoch": 1.87167522165189, "grad_norm": 0.24083615839481354, "learning_rate": 0.0002, "loss": 0.6078, "mean_token_accuracy": 0.7540624886751175, "num_tokens": 7265321.0, "step": 2006 }, { "entropy": 0.6291968822479248, "epoch": 1.872608492767149, "grad_norm": 0.2639492154121399, "learning_rate": 0.0002, "loss": 0.6386, "mean_token_accuracy": 0.7456155866384506, "num_tokens": 7269008.0, "step": 2007 }, { "entropy": 0.6129645258188248, "epoch": 1.873541763882408, "grad_norm": 0.2842775881290436, "learning_rate": 0.0002, "loss": 0.6224, "mean_token_accuracy": 0.7483361810445786, "num_tokens": 7272645.0, "step": 2008 }, { "entropy": 0.6035104095935822, "epoch": 1.8744750349976669, "grad_norm": 0.2391897439956665, "learning_rate": 0.0002, "loss": 0.5996, "mean_token_accuracy": 0.7568454891443253, "num_tokens": 7276349.0, "step": 2009 }, { "entropy": 0.6121442019939423, "epoch": 1.8754083061129259, "grad_norm": 0.2422976791858673, "learning_rate": 0.0002, "loss": 0.6092, "mean_token_accuracy": 0.7510596662759781, "num_tokens": 7279887.0, "step": 2010 }, { "entropy": 0.6086160093545914, "epoch": 1.8763415772281848, "grad_norm": 0.24211114645004272, "learning_rate": 0.0002, "loss": 0.6098, "mean_token_accuracy": 0.7540599703788757, "num_tokens": 7283572.0, "step": 2011 }, { "entropy": 0.5919157415628433, "epoch": 1.8772748483434438, "grad_norm": 0.2636239230632782, "learning_rate": 0.0002, "loss": 0.5867, "mean_token_accuracy": 0.7655450999736786, "num_tokens": 7287190.0, "step": 2012 }, { "entropy": 0.642204150557518, "epoch": 1.8782081194587028, "grad_norm": 0.24388901889324188, "learning_rate": 0.0002, "loss": 0.645, "mean_token_accuracy": 0.7360018044710159, "num_tokens": 7290817.0, "step": 2013 }, { "entropy": 0.6044538468122482, "epoch": 1.8791413905739618, "grad_norm": 0.21075361967086792, "learning_rate": 0.0002, "loss": 0.5939, "mean_token_accuracy": 0.7640554159879684, "num_tokens": 7294509.0, "step": 2014 }, { "entropy": 0.5779598951339722, "epoch": 1.8800746616892208, "grad_norm": 0.25905779004096985, "learning_rate": 0.0002, "loss": 0.5842, "mean_token_accuracy": 0.7671894878149033, "num_tokens": 7298096.0, "step": 2015 }, { "entropy": 0.5848302245140076, "epoch": 1.8810079328044798, "grad_norm": 0.31864550709724426, "learning_rate": 0.0002, "loss": 0.614, "mean_token_accuracy": 0.7506037652492523, "num_tokens": 7301763.0, "step": 2016 }, { "entropy": 0.5931458324193954, "epoch": 1.8819412039197387, "grad_norm": 0.2715812921524048, "learning_rate": 0.0002, "loss": 0.6057, "mean_token_accuracy": 0.7460930347442627, "num_tokens": 7305344.0, "step": 2017 }, { "entropy": 0.5957524329423904, "epoch": 1.8828744750349977, "grad_norm": 0.2541942894458771, "learning_rate": 0.0002, "loss": 0.6016, "mean_token_accuracy": 0.7575068473815918, "num_tokens": 7308870.0, "step": 2018 }, { "entropy": 0.6350279301404953, "epoch": 1.8838077461502567, "grad_norm": 0.23298798501491547, "learning_rate": 0.0002, "loss": 0.6318, "mean_token_accuracy": 0.7428231239318848, "num_tokens": 7312630.0, "step": 2019 }, { "entropy": 0.6335053741931915, "epoch": 1.8847410172655157, "grad_norm": 0.250649631023407, "learning_rate": 0.0002, "loss": 0.6385, "mean_token_accuracy": 0.7458794862031937, "num_tokens": 7316288.0, "step": 2020 }, { "entropy": 0.5872355103492737, "epoch": 1.8856742883807747, "grad_norm": 0.25591525435447693, "learning_rate": 0.0002, "loss": 0.5833, "mean_token_accuracy": 0.7633209526538849, "num_tokens": 7320016.0, "step": 2021 }, { "entropy": 0.6094226241111755, "epoch": 1.8866075594960336, "grad_norm": 0.22515518963336945, "learning_rate": 0.0002, "loss": 0.6038, "mean_token_accuracy": 0.7598665952682495, "num_tokens": 7323658.0, "step": 2022 }, { "entropy": 0.6431400775909424, "epoch": 1.8875408306112926, "grad_norm": 0.23789775371551514, "learning_rate": 0.0002, "loss": 0.6424, "mean_token_accuracy": 0.7396348565816879, "num_tokens": 7327442.0, "step": 2023 }, { "entropy": 0.6406620442867279, "epoch": 1.8884741017265516, "grad_norm": 0.2895805239677429, "learning_rate": 0.0002, "loss": 0.6361, "mean_token_accuracy": 0.7460184693336487, "num_tokens": 7331205.0, "step": 2024 }, { "entropy": 0.5653202086687088, "epoch": 1.8894073728418106, "grad_norm": 0.2363799661397934, "learning_rate": 0.0002, "loss": 0.5717, "mean_token_accuracy": 0.7703899890184402, "num_tokens": 7334816.0, "step": 2025 }, { "entropy": 0.5904927402734756, "epoch": 1.8903406439570696, "grad_norm": 0.2507525086402893, "learning_rate": 0.0002, "loss": 0.5988, "mean_token_accuracy": 0.7576494514942169, "num_tokens": 7338473.0, "step": 2026 }, { "entropy": 0.562930092215538, "epoch": 1.8912739150723286, "grad_norm": 0.2720288336277008, "learning_rate": 0.0002, "loss": 0.5849, "mean_token_accuracy": 0.76626355946064, "num_tokens": 7342019.0, "step": 2027 }, { "entropy": 0.6070714443922043, "epoch": 1.8922071861875875, "grad_norm": 0.24561059474945068, "learning_rate": 0.0002, "loss": 0.6182, "mean_token_accuracy": 0.7477030158042908, "num_tokens": 7345787.0, "step": 2028 }, { "entropy": 0.6161806136369705, "epoch": 1.8931404573028465, "grad_norm": 0.2317928820848465, "learning_rate": 0.0002, "loss": 0.6113, "mean_token_accuracy": 0.7554292231798172, "num_tokens": 7349515.0, "step": 2029 }, { "entropy": 0.6150674968957901, "epoch": 1.8940737284181055, "grad_norm": 0.23594875633716583, "learning_rate": 0.0002, "loss": 0.6218, "mean_token_accuracy": 0.7443321347236633, "num_tokens": 7353157.0, "step": 2030 }, { "entropy": 0.64931820333004, "epoch": 1.8950069995333645, "grad_norm": 0.2273443043231964, "learning_rate": 0.0002, "loss": 0.638, "mean_token_accuracy": 0.7438555955886841, "num_tokens": 7356703.0, "step": 2031 }, { "entropy": 0.6101210415363312, "epoch": 1.8959402706486235, "grad_norm": 0.22833624482154846, "learning_rate": 0.0002, "loss": 0.6033, "mean_token_accuracy": 0.7550871670246124, "num_tokens": 7360404.0, "step": 2032 }, { "entropy": 0.641112744808197, "epoch": 1.8968735417638825, "grad_norm": 0.21557500958442688, "learning_rate": 0.0002, "loss": 0.6363, "mean_token_accuracy": 0.7499409019947052, "num_tokens": 7363997.0, "step": 2033 }, { "entropy": 0.5816243439912796, "epoch": 1.8978068128791414, "grad_norm": 0.24280306696891785, "learning_rate": 0.0002, "loss": 0.5838, "mean_token_accuracy": 0.7621595710515976, "num_tokens": 7367649.0, "step": 2034 }, { "entropy": 0.6296760588884354, "epoch": 1.8987400839944004, "grad_norm": 0.2710602283477783, "learning_rate": 0.0002, "loss": 0.6259, "mean_token_accuracy": 0.7432963997125626, "num_tokens": 7371097.0, "step": 2035 }, { "entropy": 0.5795957744121552, "epoch": 1.8996733551096594, "grad_norm": 0.2566348910331726, "learning_rate": 0.0002, "loss": 0.593, "mean_token_accuracy": 0.7605890035629272, "num_tokens": 7374731.0, "step": 2036 }, { "entropy": 0.6302494406700134, "epoch": 1.9006066262249184, "grad_norm": 0.26794466376304626, "learning_rate": 0.0002, "loss": 0.631, "mean_token_accuracy": 0.7440385967493057, "num_tokens": 7378527.0, "step": 2037 }, { "entropy": 0.6344163119792938, "epoch": 1.9015398973401774, "grad_norm": 0.27875566482543945, "learning_rate": 0.0002, "loss": 0.651, "mean_token_accuracy": 0.7366444319486618, "num_tokens": 7382107.0, "step": 2038 }, { "entropy": 0.6047942489385605, "epoch": 1.9024731684554363, "grad_norm": 0.23873180150985718, "learning_rate": 0.0002, "loss": 0.6166, "mean_token_accuracy": 0.7532882988452911, "num_tokens": 7385692.0, "step": 2039 }, { "entropy": 0.6077268272638321, "epoch": 1.9034064395706953, "grad_norm": 0.27937817573547363, "learning_rate": 0.0002, "loss": 0.6168, "mean_token_accuracy": 0.7529356330633163, "num_tokens": 7389403.0, "step": 2040 }, { "entropy": 0.6127370297908783, "epoch": 1.9043397106859543, "grad_norm": 0.23206917941570282, "learning_rate": 0.0002, "loss": 0.6117, "mean_token_accuracy": 0.7498744428157806, "num_tokens": 7393005.0, "step": 2041 }, { "entropy": 0.5967395901679993, "epoch": 1.9052729818012133, "grad_norm": 0.24639619886875153, "learning_rate": 0.0002, "loss": 0.5935, "mean_token_accuracy": 0.7603207379579544, "num_tokens": 7396668.0, "step": 2042 }, { "entropy": 0.6060031205415726, "epoch": 1.9062062529164723, "grad_norm": 0.23611749708652496, "learning_rate": 0.0002, "loss": 0.6029, "mean_token_accuracy": 0.7515426427125931, "num_tokens": 7400347.0, "step": 2043 }, { "entropy": 0.5773215293884277, "epoch": 1.9071395240317313, "grad_norm": 0.2406165599822998, "learning_rate": 0.0002, "loss": 0.5738, "mean_token_accuracy": 0.7639746367931366, "num_tokens": 7403927.0, "step": 2044 }, { "entropy": 0.591107115149498, "epoch": 1.9080727951469902, "grad_norm": 0.233232781291008, "learning_rate": 0.0002, "loss": 0.5864, "mean_token_accuracy": 0.7611332982778549, "num_tokens": 7407508.0, "step": 2045 }, { "entropy": 0.6328333914279938, "epoch": 1.9090060662622492, "grad_norm": 0.22637170553207397, "learning_rate": 0.0002, "loss": 0.638, "mean_token_accuracy": 0.7434579730033875, "num_tokens": 7411094.0, "step": 2046 }, { "entropy": 0.5876439809799194, "epoch": 1.9099393373775082, "grad_norm": 0.3478338420391083, "learning_rate": 0.0002, "loss": 0.5951, "mean_token_accuracy": 0.751057431101799, "num_tokens": 7414620.0, "step": 2047 }, { "entropy": 0.638778567314148, "epoch": 1.9108726084927672, "grad_norm": 0.27005526423454285, "learning_rate": 0.0002, "loss": 0.6357, "mean_token_accuracy": 0.7426710724830627, "num_tokens": 7418207.0, "step": 2048 }, { "entropy": 0.6284086108207703, "epoch": 1.9118058796080262, "grad_norm": 0.2721993327140808, "learning_rate": 0.0002, "loss": 0.6354, "mean_token_accuracy": 0.7421156167984009, "num_tokens": 7421894.0, "step": 2049 }, { "entropy": 0.6060706377029419, "epoch": 1.9127391507232852, "grad_norm": 0.28801068663597107, "learning_rate": 0.0002, "loss": 0.6043, "mean_token_accuracy": 0.7539441287517548, "num_tokens": 7425456.0, "step": 2050 }, { "entropy": 0.5912741422653198, "epoch": 1.9136724218385441, "grad_norm": 0.2517850399017334, "learning_rate": 0.0002, "loss": 0.5994, "mean_token_accuracy": 0.7638732492923737, "num_tokens": 7429188.0, "step": 2051 }, { "entropy": 0.5695986151695251, "epoch": 1.9146056929538031, "grad_norm": 0.24211151897907257, "learning_rate": 0.0002, "loss": 0.5746, "mean_token_accuracy": 0.7653989493846893, "num_tokens": 7432751.0, "step": 2052 }, { "entropy": 0.5946926325559616, "epoch": 1.915538964069062, "grad_norm": 0.24545879662036896, "learning_rate": 0.0002, "loss": 0.605, "mean_token_accuracy": 0.7490897923707962, "num_tokens": 7436452.0, "step": 2053 }, { "entropy": 0.6252778619527817, "epoch": 1.916472235184321, "grad_norm": 0.2376643568277359, "learning_rate": 0.0002, "loss": 0.6309, "mean_token_accuracy": 0.7494227290153503, "num_tokens": 7440027.0, "step": 2054 }, { "entropy": 0.6363604366779327, "epoch": 1.91740550629958, "grad_norm": 0.23966732621192932, "learning_rate": 0.0002, "loss": 0.6321, "mean_token_accuracy": 0.7503713220357895, "num_tokens": 7443734.0, "step": 2055 }, { "entropy": 0.5640689879655838, "epoch": 1.918338777414839, "grad_norm": 0.2274339199066162, "learning_rate": 0.0002, "loss": 0.57, "mean_token_accuracy": 0.77304807305336, "num_tokens": 7447313.0, "step": 2056 }, { "entropy": 0.6277337670326233, "epoch": 1.919272048530098, "grad_norm": 0.20711080729961395, "learning_rate": 0.0002, "loss": 0.6192, "mean_token_accuracy": 0.7477781921625137, "num_tokens": 7451011.0, "step": 2057 }, { "entropy": 0.5997487306594849, "epoch": 1.920205319645357, "grad_norm": 0.2606486976146698, "learning_rate": 0.0002, "loss": 0.621, "mean_token_accuracy": 0.7525602132081985, "num_tokens": 7454594.0, "step": 2058 }, { "entropy": 0.6396210789680481, "epoch": 1.921138590760616, "grad_norm": 0.23420479893684387, "learning_rate": 0.0002, "loss": 0.6374, "mean_token_accuracy": 0.7438270449638367, "num_tokens": 7458312.0, "step": 2059 }, { "entropy": 0.6027058064937592, "epoch": 1.922071861875875, "grad_norm": 0.23834705352783203, "learning_rate": 0.0002, "loss": 0.6165, "mean_token_accuracy": 0.7484287470579147, "num_tokens": 7461969.0, "step": 2060 }, { "entropy": 0.5637620985507965, "epoch": 1.923005132991134, "grad_norm": 0.24106742441654205, "learning_rate": 0.0002, "loss": 0.5733, "mean_token_accuracy": 0.7665562480688095, "num_tokens": 7465505.0, "step": 2061 }, { "entropy": 0.610404446721077, "epoch": 1.923938404106393, "grad_norm": 0.23880760371685028, "learning_rate": 0.0002, "loss": 0.6082, "mean_token_accuracy": 0.7490327954292297, "num_tokens": 7469230.0, "step": 2062 }, { "entropy": 0.5869418233633041, "epoch": 1.924871675221652, "grad_norm": 0.26225170493125916, "learning_rate": 0.0002, "loss": 0.5985, "mean_token_accuracy": 0.7540386766195297, "num_tokens": 7472862.0, "step": 2063 }, { "entropy": 0.5971108376979828, "epoch": 1.925804946336911, "grad_norm": 0.2530252933502197, "learning_rate": 0.0002, "loss": 0.5908, "mean_token_accuracy": 0.7616571933031082, "num_tokens": 7476440.0, "step": 2064 }, { "entropy": 0.6302479952573776, "epoch": 1.9267382174521699, "grad_norm": 0.30869153141975403, "learning_rate": 0.0002, "loss": 0.6329, "mean_token_accuracy": 0.7479139715433121, "num_tokens": 7480145.0, "step": 2065 }, { "entropy": 0.6098366230726242, "epoch": 1.9276714885674289, "grad_norm": 0.2294439971446991, "learning_rate": 0.0002, "loss": 0.6084, "mean_token_accuracy": 0.7595492601394653, "num_tokens": 7483778.0, "step": 2066 }, { "entropy": 0.6019464582204819, "epoch": 1.9286047596826879, "grad_norm": 0.2612704336643219, "learning_rate": 0.0002, "loss": 0.6009, "mean_token_accuracy": 0.759915292263031, "num_tokens": 7487516.0, "step": 2067 }, { "entropy": 0.5867633521556854, "epoch": 1.9295380307979468, "grad_norm": 0.36795103549957275, "learning_rate": 0.0002, "loss": 0.5885, "mean_token_accuracy": 0.7670353353023529, "num_tokens": 7491102.0, "step": 2068 }, { "entropy": 0.6024768650531769, "epoch": 1.9304713019132058, "grad_norm": 0.2930009067058563, "learning_rate": 0.0002, "loss": 0.6097, "mean_token_accuracy": 0.7536996155977249, "num_tokens": 7494659.0, "step": 2069 }, { "entropy": 0.6163506805896759, "epoch": 1.9314045730284648, "grad_norm": 0.3537689447402954, "learning_rate": 0.0002, "loss": 0.6318, "mean_token_accuracy": 0.7438501864671707, "num_tokens": 7498378.0, "step": 2070 }, { "entropy": 0.6012271195650101, "epoch": 1.9323378441437238, "grad_norm": 0.22428661584854126, "learning_rate": 0.0002, "loss": 0.6049, "mean_token_accuracy": 0.7529854029417038, "num_tokens": 7502204.0, "step": 2071 }, { "entropy": 0.5912560075521469, "epoch": 1.9332711152589828, "grad_norm": 0.3184375762939453, "learning_rate": 0.0002, "loss": 0.5863, "mean_token_accuracy": 0.7605176717042923, "num_tokens": 7505839.0, "step": 2072 }, { "entropy": 0.6024796962738037, "epoch": 1.9342043863742417, "grad_norm": 0.2048654854297638, "learning_rate": 0.0002, "loss": 0.5899, "mean_token_accuracy": 0.7619328498840332, "num_tokens": 7509586.0, "step": 2073 }, { "entropy": 0.6197866648435593, "epoch": 1.9351376574895007, "grad_norm": 0.27053532004356384, "learning_rate": 0.0002, "loss": 0.6201, "mean_token_accuracy": 0.7525589019060135, "num_tokens": 7513208.0, "step": 2074 }, { "entropy": 0.5511162430047989, "epoch": 1.9360709286047597, "grad_norm": 0.256078839302063, "learning_rate": 0.0002, "loss": 0.5607, "mean_token_accuracy": 0.7726705074310303, "num_tokens": 7516802.0, "step": 2075 }, { "entropy": 0.6269324272871017, "epoch": 1.9370041997200187, "grad_norm": 0.24790972471237183, "learning_rate": 0.0002, "loss": 0.6314, "mean_token_accuracy": 0.7458849847316742, "num_tokens": 7520369.0, "step": 2076 }, { "entropy": 0.5655101239681244, "epoch": 1.9379374708352777, "grad_norm": 0.23728680610656738, "learning_rate": 0.0002, "loss": 0.573, "mean_token_accuracy": 0.7694834619760513, "num_tokens": 7523937.0, "step": 2077 }, { "entropy": 0.6145260185003281, "epoch": 1.9388707419505367, "grad_norm": 0.2537592649459839, "learning_rate": 0.0002, "loss": 0.6162, "mean_token_accuracy": 0.7511931508779526, "num_tokens": 7527576.0, "step": 2078 }, { "entropy": 0.6097839623689651, "epoch": 1.9398040130657956, "grad_norm": 0.24606378376483917, "learning_rate": 0.0002, "loss": 0.6194, "mean_token_accuracy": 0.7442684769630432, "num_tokens": 7531212.0, "step": 2079 }, { "entropy": 0.5775715559720993, "epoch": 1.9407372841810546, "grad_norm": 0.2653619349002838, "learning_rate": 0.0002, "loss": 0.5846, "mean_token_accuracy": 0.7644858062267303, "num_tokens": 7534849.0, "step": 2080 }, { "entropy": 0.5819500982761383, "epoch": 1.9416705552963136, "grad_norm": 0.25800126791000366, "learning_rate": 0.0002, "loss": 0.5953, "mean_token_accuracy": 0.7570944279432297, "num_tokens": 7538526.0, "step": 2081 }, { "entropy": 0.607459232211113, "epoch": 1.9426038264115726, "grad_norm": 0.278037965297699, "learning_rate": 0.0002, "loss": 0.6132, "mean_token_accuracy": 0.7554880231618881, "num_tokens": 7542122.0, "step": 2082 }, { "entropy": 0.6065463125705719, "epoch": 1.9435370975268316, "grad_norm": 0.3310930132865906, "learning_rate": 0.0002, "loss": 0.6162, "mean_token_accuracy": 0.7509988099336624, "num_tokens": 7545671.0, "step": 2083 }, { "entropy": 0.6007460355758667, "epoch": 1.9444703686420906, "grad_norm": 0.2668308615684509, "learning_rate": 0.0002, "loss": 0.6034, "mean_token_accuracy": 0.7619257569313049, "num_tokens": 7549317.0, "step": 2084 }, { "entropy": 0.6030623763799667, "epoch": 1.9454036397573495, "grad_norm": 0.25632354617118835, "learning_rate": 0.0002, "loss": 0.5917, "mean_token_accuracy": 0.7622346431016922, "num_tokens": 7552978.0, "step": 2085 }, { "entropy": 0.578212708234787, "epoch": 1.9463369108726085, "grad_norm": 0.291157603263855, "learning_rate": 0.0002, "loss": 0.5854, "mean_token_accuracy": 0.7566135972738266, "num_tokens": 7556534.0, "step": 2086 }, { "entropy": 0.6325452476739883, "epoch": 1.9472701819878675, "grad_norm": 0.2920370399951935, "learning_rate": 0.0002, "loss": 0.6316, "mean_token_accuracy": 0.7403034567832947, "num_tokens": 7560168.0, "step": 2087 }, { "entropy": 0.6615372151136398, "epoch": 1.9482034531031265, "grad_norm": 0.2165130376815796, "learning_rate": 0.0002, "loss": 0.6554, "mean_token_accuracy": 0.7430559992790222, "num_tokens": 7563932.0, "step": 2088 }, { "entropy": 0.589093491435051, "epoch": 1.9491367242183855, "grad_norm": 0.25083768367767334, "learning_rate": 0.0002, "loss": 0.5947, "mean_token_accuracy": 0.7595001012086868, "num_tokens": 7567598.0, "step": 2089 }, { "entropy": 0.6385217010974884, "epoch": 1.9500699953336444, "grad_norm": 0.30037635564804077, "learning_rate": 0.0002, "loss": 0.6474, "mean_token_accuracy": 0.7460437417030334, "num_tokens": 7571130.0, "step": 2090 }, { "entropy": 0.6067562699317932, "epoch": 1.9510032664489034, "grad_norm": 0.28244295716285706, "learning_rate": 0.0002, "loss": 0.6217, "mean_token_accuracy": 0.7426514625549316, "num_tokens": 7574754.0, "step": 2091 }, { "entropy": 0.6328784227371216, "epoch": 1.9519365375641624, "grad_norm": 0.2939643859863281, "learning_rate": 0.0002, "loss": 0.6455, "mean_token_accuracy": 0.7361503541469574, "num_tokens": 7578314.0, "step": 2092 }, { "entropy": 0.6025311052799225, "epoch": 1.9528698086794214, "grad_norm": 0.3000953793525696, "learning_rate": 0.0002, "loss": 0.6169, "mean_token_accuracy": 0.7526258528232574, "num_tokens": 7581936.0, "step": 2093 }, { "entropy": 0.5580358654260635, "epoch": 1.9538030797946804, "grad_norm": 0.24413834512233734, "learning_rate": 0.0002, "loss": 0.5591, "mean_token_accuracy": 0.7765924036502838, "num_tokens": 7585475.0, "step": 2094 }, { "entropy": 0.6099698841571808, "epoch": 1.9547363509099394, "grad_norm": 0.26002517342567444, "learning_rate": 0.0002, "loss": 0.5996, "mean_token_accuracy": 0.7524811625480652, "num_tokens": 7589041.0, "step": 2095 }, { "entropy": 0.673944503068924, "epoch": 1.9556696220251983, "grad_norm": 0.28904852271080017, "learning_rate": 0.0002, "loss": 0.6785, "mean_token_accuracy": 0.7280202209949493, "num_tokens": 7592723.0, "step": 2096 }, { "entropy": 0.5941649079322815, "epoch": 1.9566028931404573, "grad_norm": 0.23090529441833496, "learning_rate": 0.0002, "loss": 0.5916, "mean_token_accuracy": 0.7580156624317169, "num_tokens": 7596220.0, "step": 2097 }, { "entropy": 0.6132930815219879, "epoch": 1.9575361642557163, "grad_norm": 0.3013401925563812, "learning_rate": 0.0002, "loss": 0.6159, "mean_token_accuracy": 0.7471934705972672, "num_tokens": 7599952.0, "step": 2098 }, { "entropy": 0.620486319065094, "epoch": 1.9584694353709753, "grad_norm": 0.23247049748897552, "learning_rate": 0.0002, "loss": 0.623, "mean_token_accuracy": 0.7419543713331223, "num_tokens": 7603559.0, "step": 2099 }, { "entropy": 0.6144340187311172, "epoch": 1.9594027064862343, "grad_norm": 0.2588485777378082, "learning_rate": 0.0002, "loss": 0.6103, "mean_token_accuracy": 0.7588571310043335, "num_tokens": 7607182.0, "step": 2100 }, { "entropy": 0.5894313603639603, "epoch": 1.9603359776014933, "grad_norm": 0.25192609429359436, "learning_rate": 0.0002, "loss": 0.605, "mean_token_accuracy": 0.7571156620979309, "num_tokens": 7610845.0, "step": 2101 }, { "entropy": 0.5818270593881607, "epoch": 1.9612692487167522, "grad_norm": 0.29857587814331055, "learning_rate": 0.0002, "loss": 0.5972, "mean_token_accuracy": 0.7627667635679245, "num_tokens": 7614430.0, "step": 2102 }, { "entropy": 0.5914144068956375, "epoch": 1.9622025198320112, "grad_norm": 0.29205599427223206, "learning_rate": 0.0002, "loss": 0.6106, "mean_token_accuracy": 0.7477659285068512, "num_tokens": 7617960.0, "step": 2103 }, { "entropy": 0.6044926196336746, "epoch": 1.9631357909472702, "grad_norm": 0.25927019119262695, "learning_rate": 0.0002, "loss": 0.6038, "mean_token_accuracy": 0.7533396631479263, "num_tokens": 7621564.0, "step": 2104 }, { "entropy": 0.5719156786799431, "epoch": 1.9640690620625292, "grad_norm": 0.22124840319156647, "learning_rate": 0.0002, "loss": 0.5699, "mean_token_accuracy": 0.76653653383255, "num_tokens": 7625192.0, "step": 2105 }, { "entropy": 0.6302364766597748, "epoch": 1.9650023331777882, "grad_norm": 0.2575875222682953, "learning_rate": 0.0002, "loss": 0.622, "mean_token_accuracy": 0.7495551854372025, "num_tokens": 7628900.0, "step": 2106 }, { "entropy": 0.5948629528284073, "epoch": 1.9659356042930471, "grad_norm": 0.23977693915367126, "learning_rate": 0.0002, "loss": 0.5885, "mean_token_accuracy": 0.758589580655098, "num_tokens": 7632580.0, "step": 2107 }, { "entropy": 0.5922930091619492, "epoch": 1.9668688754083061, "grad_norm": 0.24683184921741486, "learning_rate": 0.0002, "loss": 0.5892, "mean_token_accuracy": 0.76258684694767, "num_tokens": 7636167.0, "step": 2108 }, { "entropy": 0.5865319818258286, "epoch": 1.967802146523565, "grad_norm": 0.2717609405517578, "learning_rate": 0.0002, "loss": 0.5873, "mean_token_accuracy": 0.7521674782037735, "num_tokens": 7639806.0, "step": 2109 }, { "entropy": 0.5741292834281921, "epoch": 1.968735417638824, "grad_norm": 0.28939974308013916, "learning_rate": 0.0002, "loss": 0.5929, "mean_token_accuracy": 0.7568727731704712, "num_tokens": 7643270.0, "step": 2110 }, { "entropy": 0.5946325808763504, "epoch": 1.969668688754083, "grad_norm": 0.2629062533378601, "learning_rate": 0.0002, "loss": 0.6059, "mean_token_accuracy": 0.7520578950643539, "num_tokens": 7647008.0, "step": 2111 }, { "entropy": 0.5906870514154434, "epoch": 1.970601959869342, "grad_norm": 0.34241172671318054, "learning_rate": 0.0002, "loss": 0.6132, "mean_token_accuracy": 0.7498970478773117, "num_tokens": 7650503.0, "step": 2112 }, { "entropy": 0.5888873487710953, "epoch": 1.971535230984601, "grad_norm": 0.28352537751197815, "learning_rate": 0.0002, "loss": 0.6077, "mean_token_accuracy": 0.7575471252202988, "num_tokens": 7654067.0, "step": 2113 }, { "entropy": 0.5902065485715866, "epoch": 1.97246850209986, "grad_norm": 0.2916381359100342, "learning_rate": 0.0002, "loss": 0.6062, "mean_token_accuracy": 0.7536128163337708, "num_tokens": 7657703.0, "step": 2114 }, { "entropy": 0.6127436757087708, "epoch": 1.973401773215119, "grad_norm": 0.25943824648857117, "learning_rate": 0.0002, "loss": 0.6279, "mean_token_accuracy": 0.7485750764608383, "num_tokens": 7661321.0, "step": 2115 }, { "entropy": 0.608333557844162, "epoch": 1.974335044330378, "grad_norm": 0.2639859914779663, "learning_rate": 0.0002, "loss": 0.6171, "mean_token_accuracy": 0.7464666813611984, "num_tokens": 7664935.0, "step": 2116 }, { "entropy": 0.6413038820028305, "epoch": 1.975268315445637, "grad_norm": 0.3050020933151245, "learning_rate": 0.0002, "loss": 0.6471, "mean_token_accuracy": 0.7368225902318954, "num_tokens": 7668468.0, "step": 2117 }, { "entropy": 0.5923921167850494, "epoch": 1.976201586560896, "grad_norm": 0.2186446338891983, "learning_rate": 0.0002, "loss": 0.5878, "mean_token_accuracy": 0.7624161392450333, "num_tokens": 7671982.0, "step": 2118 }, { "entropy": 0.6086378544569016, "epoch": 1.977134857676155, "grad_norm": 0.2677394151687622, "learning_rate": 0.0002, "loss": 0.5894, "mean_token_accuracy": 0.764879122376442, "num_tokens": 7675483.0, "step": 2119 }, { "entropy": 0.5943811386823654, "epoch": 1.978068128791414, "grad_norm": 0.23047545552253723, "learning_rate": 0.0002, "loss": 0.5882, "mean_token_accuracy": 0.7609171718358994, "num_tokens": 7679135.0, "step": 2120 }, { "entropy": 0.5858715623617172, "epoch": 1.979001399906673, "grad_norm": 0.23963360488414764, "learning_rate": 0.0002, "loss": 0.5808, "mean_token_accuracy": 0.7752134054899216, "num_tokens": 7682703.0, "step": 2121 }, { "entropy": 0.5696269497275352, "epoch": 1.9799346710219319, "grad_norm": 0.21831606328487396, "learning_rate": 0.0002, "loss": 0.5697, "mean_token_accuracy": 0.7700311094522476, "num_tokens": 7686362.0, "step": 2122 }, { "entropy": 0.6023120284080505, "epoch": 1.9808679421371909, "grad_norm": 0.22868101298809052, "learning_rate": 0.0002, "loss": 0.6075, "mean_token_accuracy": 0.7495027333498001, "num_tokens": 7689955.0, "step": 2123 }, { "entropy": 0.6027294099330902, "epoch": 1.9818012132524498, "grad_norm": 0.28199195861816406, "learning_rate": 0.0002, "loss": 0.6176, "mean_token_accuracy": 0.7506663054227829, "num_tokens": 7693620.0, "step": 2124 }, { "entropy": 0.592132031917572, "epoch": 1.9827344843677088, "grad_norm": 0.2627236545085907, "learning_rate": 0.0002, "loss": 0.6064, "mean_token_accuracy": 0.7507821172475815, "num_tokens": 7697252.0, "step": 2125 }, { "entropy": 0.6088712513446808, "epoch": 1.9836677554829678, "grad_norm": 0.29981765151023865, "learning_rate": 0.0002, "loss": 0.634, "mean_token_accuracy": 0.7428309917449951, "num_tokens": 7700825.0, "step": 2126 }, { "entropy": 0.5979858338832855, "epoch": 1.9846010265982268, "grad_norm": 0.25335535407066345, "learning_rate": 0.0002, "loss": 0.6164, "mean_token_accuracy": 0.7548377960920334, "num_tokens": 7704404.0, "step": 2127 }, { "entropy": 0.5695979595184326, "epoch": 1.9855342977134858, "grad_norm": 0.21707795560359955, "learning_rate": 0.0002, "loss": 0.5678, "mean_token_accuracy": 0.7703558802604675, "num_tokens": 7708146.0, "step": 2128 }, { "entropy": 0.6146382540464401, "epoch": 1.9864675688287448, "grad_norm": 0.24224674701690674, "learning_rate": 0.0002, "loss": 0.6104, "mean_token_accuracy": 0.7610587775707245, "num_tokens": 7711677.0, "step": 2129 }, { "entropy": 0.6339966058731079, "epoch": 1.9874008399440037, "grad_norm": 0.2493220418691635, "learning_rate": 0.0002, "loss": 0.6304, "mean_token_accuracy": 0.7493502646684647, "num_tokens": 7715205.0, "step": 2130 }, { "entropy": 0.6169112622737885, "epoch": 1.9883341110592627, "grad_norm": 0.22175373136997223, "learning_rate": 0.0002, "loss": 0.6132, "mean_token_accuracy": 0.7505882978439331, "num_tokens": 7718969.0, "step": 2131 }, { "entropy": 0.6017252057790756, "epoch": 1.9892673821745217, "grad_norm": 0.2242012321949005, "learning_rate": 0.0002, "loss": 0.5895, "mean_token_accuracy": 0.7610363960266113, "num_tokens": 7722602.0, "step": 2132 }, { "entropy": 0.6260471791028976, "epoch": 1.9902006532897807, "grad_norm": 0.2461249977350235, "learning_rate": 0.0002, "loss": 0.6347, "mean_token_accuracy": 0.740030512213707, "num_tokens": 7726263.0, "step": 2133 }, { "entropy": 0.5836127400398254, "epoch": 1.9911339244050397, "grad_norm": 0.26902276277542114, "learning_rate": 0.0002, "loss": 0.594, "mean_token_accuracy": 0.7571311593055725, "num_tokens": 7729841.0, "step": 2134 }, { "entropy": 0.6184303611516953, "epoch": 1.9920671955202987, "grad_norm": 0.24514979124069214, "learning_rate": 0.0002, "loss": 0.6217, "mean_token_accuracy": 0.7507387101650238, "num_tokens": 7733550.0, "step": 2135 }, { "entropy": 0.5837832093238831, "epoch": 1.9930004666355576, "grad_norm": 0.2136295586824417, "learning_rate": 0.0002, "loss": 0.5873, "mean_token_accuracy": 0.7647462785243988, "num_tokens": 7737161.0, "step": 2136 }, { "entropy": 0.6360878050327301, "epoch": 1.9939337377508166, "grad_norm": 0.22855179011821747, "learning_rate": 0.0002, "loss": 0.6287, "mean_token_accuracy": 0.7428700625896454, "num_tokens": 7740832.0, "step": 2137 }, { "entropy": 0.5926887691020966, "epoch": 1.9948670088660756, "grad_norm": 0.21738605201244354, "learning_rate": 0.0002, "loss": 0.5841, "mean_token_accuracy": 0.7605743855237961, "num_tokens": 7744402.0, "step": 2138 }, { "entropy": 0.6238280683755875, "epoch": 1.9958002799813346, "grad_norm": 0.2775142788887024, "learning_rate": 0.0002, "loss": 0.6437, "mean_token_accuracy": 0.741531178355217, "num_tokens": 7747984.0, "step": 2139 }, { "entropy": 0.6088924407958984, "epoch": 1.9967335510965936, "grad_norm": 0.24017179012298584, "learning_rate": 0.0002, "loss": 0.6223, "mean_token_accuracy": 0.7475108653306961, "num_tokens": 7751659.0, "step": 2140 }, { "entropy": 0.5967195332050323, "epoch": 1.9976668222118525, "grad_norm": 0.2536953389644623, "learning_rate": 0.0002, "loss": 0.6098, "mean_token_accuracy": 0.7538667917251587, "num_tokens": 7755243.0, "step": 2141 }, { "entropy": 0.5922544598579407, "epoch": 1.9986000933271115, "grad_norm": 0.2830933630466461, "learning_rate": 0.0002, "loss": 0.6165, "mean_token_accuracy": 0.7490468919277191, "num_tokens": 7759012.0, "step": 2142 }, { "entropy": 0.5904329419136047, "epoch": 1.9995333644423705, "grad_norm": 0.27976199984550476, "learning_rate": 0.0002, "loss": 0.5926, "mean_token_accuracy": 0.76920185983181, "num_tokens": 7762625.0, "step": 2143 }, { "entropy": 0.6247565150260925, "epoch": 2.0, "grad_norm": 0.33120593428611755, "learning_rate": 0.0002, "loss": 0.6291, "mean_token_accuracy": 0.7429797947406769, "num_tokens": 7763645.0, "step": 2144 }, { "entropy": 0.5988006889820099, "epoch": 2.000933271115259, "grad_norm": 0.2317148894071579, "learning_rate": 0.0002, "loss": 0.5803, "mean_token_accuracy": 0.7652952522039413, "num_tokens": 7767244.0, "step": 2145 }, { "entropy": 0.5801083743572235, "epoch": 2.001866542230518, "grad_norm": 0.3246651887893677, "learning_rate": 0.0002, "loss": 0.5845, "mean_token_accuracy": 0.7604587823152542, "num_tokens": 7770664.0, "step": 2146 }, { "entropy": 0.6324456483125687, "epoch": 2.002799813345777, "grad_norm": 0.2846151292324066, "learning_rate": 0.0002, "loss": 0.637, "mean_token_accuracy": 0.7410003244876862, "num_tokens": 7774305.0, "step": 2147 }, { "entropy": 0.5805274993181229, "epoch": 2.003733084461036, "grad_norm": 0.25029227137565613, "learning_rate": 0.0002, "loss": 0.5732, "mean_token_accuracy": 0.7666499316692352, "num_tokens": 7777905.0, "step": 2148 }, { "entropy": 0.5766306668519974, "epoch": 2.004666355576295, "grad_norm": 0.27270790934562683, "learning_rate": 0.0002, "loss": 0.5761, "mean_token_accuracy": 0.7654426991939545, "num_tokens": 7781502.0, "step": 2149 }, { "entropy": 0.5762638002634048, "epoch": 2.005599626691554, "grad_norm": 0.26406553387641907, "learning_rate": 0.0002, "loss": 0.5819, "mean_token_accuracy": 0.7654205560684204, "num_tokens": 7784957.0, "step": 2150 }, { "entropy": 0.5641797780990601, "epoch": 2.006532897806813, "grad_norm": 0.3157612085342407, "learning_rate": 0.0002, "loss": 0.5662, "mean_token_accuracy": 0.7700645178556442, "num_tokens": 7788538.0, "step": 2151 }, { "entropy": 0.578358843922615, "epoch": 2.007466168922072, "grad_norm": 0.32803118228912354, "learning_rate": 0.0002, "loss": 0.5923, "mean_token_accuracy": 0.7638294547796249, "num_tokens": 7792111.0, "step": 2152 }, { "entropy": 0.5861729085445404, "epoch": 2.008399440037331, "grad_norm": 0.3013969361782074, "learning_rate": 0.0002, "loss": 0.6073, "mean_token_accuracy": 0.7562432736158371, "num_tokens": 7795707.0, "step": 2153 }, { "entropy": 0.5783582627773285, "epoch": 2.00933271115259, "grad_norm": 0.2736397981643677, "learning_rate": 0.0002, "loss": 0.5832, "mean_token_accuracy": 0.7663216292858124, "num_tokens": 7799243.0, "step": 2154 }, { "entropy": 0.6134651601314545, "epoch": 2.010265982267849, "grad_norm": 0.27869266271591187, "learning_rate": 0.0002, "loss": 0.609, "mean_token_accuracy": 0.7468147724866867, "num_tokens": 7802847.0, "step": 2155 }, { "entropy": 0.614054873585701, "epoch": 2.011199253383108, "grad_norm": 0.27841225266456604, "learning_rate": 0.0002, "loss": 0.6037, "mean_token_accuracy": 0.7570309191942215, "num_tokens": 7806588.0, "step": 2156 }, { "entropy": 0.6101917326450348, "epoch": 2.0121325244983668, "grad_norm": 0.2388913780450821, "learning_rate": 0.0002, "loss": 0.5997, "mean_token_accuracy": 0.757154256105423, "num_tokens": 7810335.0, "step": 2157 }, { "entropy": 0.5939789265394211, "epoch": 2.0130657956136258, "grad_norm": 0.35543057322502136, "learning_rate": 0.0002, "loss": 0.5889, "mean_token_accuracy": 0.7555249929428101, "num_tokens": 7813892.0, "step": 2158 }, { "entropy": 0.5807772427797318, "epoch": 2.0139990667288847, "grad_norm": 0.28590771555900574, "learning_rate": 0.0002, "loss": 0.5783, "mean_token_accuracy": 0.771584615111351, "num_tokens": 7817561.0, "step": 2159 }, { "entropy": 0.5711186528205872, "epoch": 2.0149323378441437, "grad_norm": 0.25821250677108765, "learning_rate": 0.0002, "loss": 0.5682, "mean_token_accuracy": 0.7684255689382553, "num_tokens": 7821113.0, "step": 2160 }, { "entropy": 0.5486085116863251, "epoch": 2.0158656089594027, "grad_norm": 0.3047255575656891, "learning_rate": 0.0002, "loss": 0.5605, "mean_token_accuracy": 0.7704424113035202, "num_tokens": 7824668.0, "step": 2161 }, { "entropy": 0.5936267375946045, "epoch": 2.0167988800746617, "grad_norm": 0.32381367683410645, "learning_rate": 0.0002, "loss": 0.6075, "mean_token_accuracy": 0.754013329744339, "num_tokens": 7828318.0, "step": 2162 }, { "entropy": 0.5775647014379501, "epoch": 2.0177321511899207, "grad_norm": 0.28833192586898804, "learning_rate": 0.0002, "loss": 0.5831, "mean_token_accuracy": 0.7706641554832458, "num_tokens": 7831911.0, "step": 2163 }, { "entropy": 0.5835361778736115, "epoch": 2.0186654223051796, "grad_norm": 0.2991505563259125, "learning_rate": 0.0002, "loss": 0.5986, "mean_token_accuracy": 0.7622695714235306, "num_tokens": 7835541.0, "step": 2164 }, { "entropy": 0.6475862264633179, "epoch": 2.0195986934204386, "grad_norm": 0.25383561849594116, "learning_rate": 0.0002, "loss": 0.6382, "mean_token_accuracy": 0.7416275292634964, "num_tokens": 7839355.0, "step": 2165 }, { "entropy": 0.5572551488876343, "epoch": 2.0205319645356976, "grad_norm": 0.3387506902217865, "learning_rate": 0.0002, "loss": 0.5704, "mean_token_accuracy": 0.7700397819280624, "num_tokens": 7842878.0, "step": 2166 }, { "entropy": 0.6003604084253311, "epoch": 2.0214652356509566, "grad_norm": 0.28075510263442993, "learning_rate": 0.0002, "loss": 0.6033, "mean_token_accuracy": 0.7499705404043198, "num_tokens": 7846675.0, "step": 2167 }, { "entropy": 0.6225280910730362, "epoch": 2.0223985067662156, "grad_norm": 0.26130667328834534, "learning_rate": 0.0002, "loss": 0.6184, "mean_token_accuracy": 0.7504464238882065, "num_tokens": 7850235.0, "step": 2168 }, { "entropy": 0.6151465475559235, "epoch": 2.0233317778814746, "grad_norm": 0.2484264373779297, "learning_rate": 0.0002, "loss": 0.6032, "mean_token_accuracy": 0.7497126311063766, "num_tokens": 7853959.0, "step": 2169 }, { "entropy": 0.5650428980588913, "epoch": 2.0242650489967335, "grad_norm": 0.25451451539993286, "learning_rate": 0.0002, "loss": 0.5799, "mean_token_accuracy": 0.7656832784414291, "num_tokens": 7857408.0, "step": 2170 }, { "entropy": 0.5623536705970764, "epoch": 2.0251983201119925, "grad_norm": 0.2937193810939789, "learning_rate": 0.0002, "loss": 0.5637, "mean_token_accuracy": 0.7651733458042145, "num_tokens": 7861000.0, "step": 2171 }, { "entropy": 0.5792637318372726, "epoch": 2.0261315912272515, "grad_norm": 0.2640951871871948, "learning_rate": 0.0002, "loss": 0.5823, "mean_token_accuracy": 0.7622305303812027, "num_tokens": 7864784.0, "step": 2172 }, { "entropy": 0.5827578455209732, "epoch": 2.0270648623425105, "grad_norm": 0.27920404076576233, "learning_rate": 0.0002, "loss": 0.591, "mean_token_accuracy": 0.7617113292217255, "num_tokens": 7868429.0, "step": 2173 }, { "entropy": 0.550785094499588, "epoch": 2.0279981334577695, "grad_norm": 0.2667945325374603, "learning_rate": 0.0002, "loss": 0.5637, "mean_token_accuracy": 0.7700108736753464, "num_tokens": 7871872.0, "step": 2174 }, { "entropy": 0.5704996585845947, "epoch": 2.0289314045730285, "grad_norm": 0.28157463669776917, "learning_rate": 0.0002, "loss": 0.5696, "mean_token_accuracy": 0.7717513591051102, "num_tokens": 7875518.0, "step": 2175 }, { "entropy": 0.5742013156414032, "epoch": 2.0298646756882874, "grad_norm": 0.3596973121166229, "learning_rate": 0.0002, "loss": 0.5855, "mean_token_accuracy": 0.7666895091533661, "num_tokens": 7879205.0, "step": 2176 }, { "entropy": 0.5759735107421875, "epoch": 2.0307979468035464, "grad_norm": 0.28866323828697205, "learning_rate": 0.0002, "loss": 0.5782, "mean_token_accuracy": 0.7629864066839218, "num_tokens": 7882794.0, "step": 2177 }, { "entropy": 0.5816966742277145, "epoch": 2.0317312179188054, "grad_norm": 0.24525269865989685, "learning_rate": 0.0002, "loss": 0.5722, "mean_token_accuracy": 0.7721831351518631, "num_tokens": 7886477.0, "step": 2178 }, { "entropy": 0.5580491721630096, "epoch": 2.0326644890340644, "grad_norm": 0.28435376286506653, "learning_rate": 0.0002, "loss": 0.5585, "mean_token_accuracy": 0.7727787047624588, "num_tokens": 7890038.0, "step": 2179 }, { "entropy": 0.6112171411514282, "epoch": 2.0335977601493234, "grad_norm": 0.2827962636947632, "learning_rate": 0.0002, "loss": 0.6105, "mean_token_accuracy": 0.7513738572597504, "num_tokens": 7893644.0, "step": 2180 }, { "entropy": 0.5897201895713806, "epoch": 2.0345310312645823, "grad_norm": 0.25195619463920593, "learning_rate": 0.0002, "loss": 0.579, "mean_token_accuracy": 0.770047053694725, "num_tokens": 7897367.0, "step": 2181 }, { "entropy": 0.5640669912099838, "epoch": 2.0354643023798413, "grad_norm": 0.23775054514408112, "learning_rate": 0.0002, "loss": 0.5618, "mean_token_accuracy": 0.7724369913339615, "num_tokens": 7901001.0, "step": 2182 }, { "entropy": 0.6263616681098938, "epoch": 2.0363975734951003, "grad_norm": 0.32800325751304626, "learning_rate": 0.0002, "loss": 0.6385, "mean_token_accuracy": 0.7480600327253342, "num_tokens": 7904628.0, "step": 2183 }, { "entropy": 0.5754449814558029, "epoch": 2.0373308446103593, "grad_norm": 0.2775791883468628, "learning_rate": 0.0002, "loss": 0.5853, "mean_token_accuracy": 0.7656309902667999, "num_tokens": 7908165.0, "step": 2184 }, { "entropy": 0.6141359210014343, "epoch": 2.0382641157256183, "grad_norm": 0.27541419863700867, "learning_rate": 0.0002, "loss": 0.6128, "mean_token_accuracy": 0.7482814192771912, "num_tokens": 7911846.0, "step": 2185 }, { "entropy": 0.5829830020666122, "epoch": 2.0391973868408773, "grad_norm": 0.3363085091114044, "learning_rate": 0.0002, "loss": 0.5963, "mean_token_accuracy": 0.7540279924869537, "num_tokens": 7915488.0, "step": 2186 }, { "entropy": 0.5837907046079636, "epoch": 2.0401306579561362, "grad_norm": 0.2782089114189148, "learning_rate": 0.0002, "loss": 0.5842, "mean_token_accuracy": 0.7588837891817093, "num_tokens": 7919236.0, "step": 2187 }, { "entropy": 0.5753042101860046, "epoch": 2.0410639290713952, "grad_norm": 0.2514640688896179, "learning_rate": 0.0002, "loss": 0.57, "mean_token_accuracy": 0.7686426937580109, "num_tokens": 7922946.0, "step": 2188 }, { "entropy": 0.6113276481628418, "epoch": 2.041997200186654, "grad_norm": 0.28849366307258606, "learning_rate": 0.0002, "loss": 0.6096, "mean_token_accuracy": 0.755901649594307, "num_tokens": 7926705.0, "step": 2189 }, { "entropy": 0.6072168201208115, "epoch": 2.042930471301913, "grad_norm": 0.451267808675766, "learning_rate": 0.0002, "loss": 0.6217, "mean_token_accuracy": 0.744663655757904, "num_tokens": 7930348.0, "step": 2190 }, { "entropy": 0.5920100212097168, "epoch": 2.043863742417172, "grad_norm": 0.32122334837913513, "learning_rate": 0.0002, "loss": 0.5933, "mean_token_accuracy": 0.7610814273357391, "num_tokens": 7934031.0, "step": 2191 }, { "entropy": 0.5827621072530746, "epoch": 2.044797013532431, "grad_norm": 0.30841463804244995, "learning_rate": 0.0002, "loss": 0.5809, "mean_token_accuracy": 0.767289787530899, "num_tokens": 7937630.0, "step": 2192 }, { "entropy": 0.6063332855701447, "epoch": 2.04573028464769, "grad_norm": 0.27827927470207214, "learning_rate": 0.0002, "loss": 0.6109, "mean_token_accuracy": 0.750719279050827, "num_tokens": 7941091.0, "step": 2193 }, { "entropy": 0.5943696200847626, "epoch": 2.046663555762949, "grad_norm": 0.40351152420043945, "learning_rate": 0.0002, "loss": 0.6071, "mean_token_accuracy": 0.7565318197011948, "num_tokens": 7944722.0, "step": 2194 }, { "entropy": 0.5925899296998978, "epoch": 2.047596826878208, "grad_norm": 0.2571002244949341, "learning_rate": 0.0002, "loss": 0.5945, "mean_token_accuracy": 0.7553196400403976, "num_tokens": 7948370.0, "step": 2195 }, { "entropy": 0.5997728109359741, "epoch": 2.048530097993467, "grad_norm": 0.3077201545238495, "learning_rate": 0.0002, "loss": 0.5968, "mean_token_accuracy": 0.7608203589916229, "num_tokens": 7951952.0, "step": 2196 }, { "entropy": 0.5939337760210037, "epoch": 2.049463369108726, "grad_norm": 0.3280780613422394, "learning_rate": 0.0002, "loss": 0.6081, "mean_token_accuracy": 0.7571506351232529, "num_tokens": 7955493.0, "step": 2197 }, { "entropy": 0.607332780957222, "epoch": 2.050396640223985, "grad_norm": 0.26918482780456543, "learning_rate": 0.0002, "loss": 0.6094, "mean_token_accuracy": 0.7536357343196869, "num_tokens": 7959093.0, "step": 2198 }, { "entropy": 0.5748914778232574, "epoch": 2.051329911339244, "grad_norm": 0.2818826138973236, "learning_rate": 0.0002, "loss": 0.5762, "mean_token_accuracy": 0.7671241462230682, "num_tokens": 7962784.0, "step": 2199 }, { "entropy": 0.5687174201011658, "epoch": 2.052263182454503, "grad_norm": 0.3027034103870392, "learning_rate": 0.0002, "loss": 0.5741, "mean_token_accuracy": 0.7656547427177429, "num_tokens": 7966453.0, "step": 2200 }, { "entropy": 0.6047182530164719, "epoch": 2.053196453569762, "grad_norm": 0.2719506323337555, "learning_rate": 0.0002, "loss": 0.599, "mean_token_accuracy": 0.756078228354454, "num_tokens": 7970103.0, "step": 2201 }, { "entropy": 0.600930243730545, "epoch": 2.054129724685021, "grad_norm": 0.2709527611732483, "learning_rate": 0.0002, "loss": 0.5989, "mean_token_accuracy": 0.7511443942785263, "num_tokens": 7973740.0, "step": 2202 }, { "entropy": 0.5919905006885529, "epoch": 2.05506299580028, "grad_norm": 0.26474300026893616, "learning_rate": 0.0002, "loss": 0.6014, "mean_token_accuracy": 0.7575397342443466, "num_tokens": 7977351.0, "step": 2203 }, { "entropy": 0.5818803906440735, "epoch": 2.055996266915539, "grad_norm": 0.2515166103839874, "learning_rate": 0.0002, "loss": 0.5753, "mean_token_accuracy": 0.7699039578437805, "num_tokens": 7981090.0, "step": 2204 }, { "entropy": 0.5726082175970078, "epoch": 2.056929538030798, "grad_norm": 0.27204132080078125, "learning_rate": 0.0002, "loss": 0.5719, "mean_token_accuracy": 0.7656097114086151, "num_tokens": 7984692.0, "step": 2205 }, { "entropy": 0.5866244286298752, "epoch": 2.057862809146057, "grad_norm": 0.3112364411354065, "learning_rate": 0.0002, "loss": 0.5997, "mean_token_accuracy": 0.7593788355588913, "num_tokens": 7988184.0, "step": 2206 }, { "entropy": 0.5797349959611893, "epoch": 2.058796080261316, "grad_norm": 0.2568266689777374, "learning_rate": 0.0002, "loss": 0.5861, "mean_token_accuracy": 0.7605071812868118, "num_tokens": 7991795.0, "step": 2207 }, { "entropy": 0.591879203915596, "epoch": 2.059729351376575, "grad_norm": 0.277788370847702, "learning_rate": 0.0002, "loss": 0.5894, "mean_token_accuracy": 0.7653345316648483, "num_tokens": 7995501.0, "step": 2208 }, { "entropy": 0.596518948674202, "epoch": 2.060662622491834, "grad_norm": 0.3965107202529907, "learning_rate": 0.0002, "loss": 0.6037, "mean_token_accuracy": 0.7519219815731049, "num_tokens": 7999187.0, "step": 2209 }, { "entropy": 0.5787267982959747, "epoch": 2.061595893607093, "grad_norm": 0.3335218131542206, "learning_rate": 0.0002, "loss": 0.5915, "mean_token_accuracy": 0.7561336308717728, "num_tokens": 8002743.0, "step": 2210 }, { "entropy": 0.5526891052722931, "epoch": 2.062529164722352, "grad_norm": 0.2750367522239685, "learning_rate": 0.0002, "loss": 0.5528, "mean_token_accuracy": 0.7760433703660965, "num_tokens": 8006398.0, "step": 2211 }, { "entropy": 0.6097678691148758, "epoch": 2.063462435837611, "grad_norm": 0.2695549428462982, "learning_rate": 0.0002, "loss": 0.6049, "mean_token_accuracy": 0.7503645122051239, "num_tokens": 8010134.0, "step": 2212 }, { "entropy": 0.6073281317949295, "epoch": 2.06439570695287, "grad_norm": 0.31386569142341614, "learning_rate": 0.0002, "loss": 0.6077, "mean_token_accuracy": 0.7513862997293472, "num_tokens": 8013855.0, "step": 2213 }, { "entropy": 0.5953258872032166, "epoch": 2.0653289780681288, "grad_norm": 0.2763494551181793, "learning_rate": 0.0002, "loss": 0.5992, "mean_token_accuracy": 0.7650687545537949, "num_tokens": 8017533.0, "step": 2214 }, { "entropy": 0.5705434381961823, "epoch": 2.0662622491833877, "grad_norm": 0.31284666061401367, "learning_rate": 0.0002, "loss": 0.574, "mean_token_accuracy": 0.7667861878871918, "num_tokens": 8021179.0, "step": 2215 }, { "entropy": 0.6049347817897797, "epoch": 2.0671955202986467, "grad_norm": 0.32985860109329224, "learning_rate": 0.0002, "loss": 0.6169, "mean_token_accuracy": 0.743881955742836, "num_tokens": 8024826.0, "step": 2216 }, { "entropy": 0.5382913500070572, "epoch": 2.0681287914139057, "grad_norm": 0.2808414697647095, "learning_rate": 0.0002, "loss": 0.5468, "mean_token_accuracy": 0.7797303795814514, "num_tokens": 8028447.0, "step": 2217 }, { "entropy": 0.610811784863472, "epoch": 2.0690620625291647, "grad_norm": 0.2875843644142151, "learning_rate": 0.0002, "loss": 0.6056, "mean_token_accuracy": 0.7587168663740158, "num_tokens": 8032211.0, "step": 2218 }, { "entropy": 0.5850614160299301, "epoch": 2.0699953336444237, "grad_norm": 0.2720795273780823, "learning_rate": 0.0002, "loss": 0.5802, "mean_token_accuracy": 0.7637057602405548, "num_tokens": 8035778.0, "step": 2219 }, { "entropy": 0.5927619487047195, "epoch": 2.0709286047596827, "grad_norm": 0.27470964193344116, "learning_rate": 0.0002, "loss": 0.5866, "mean_token_accuracy": 0.7641609013080597, "num_tokens": 8039400.0, "step": 2220 }, { "entropy": 0.5922029465436935, "epoch": 2.0718618758749416, "grad_norm": 0.24046461284160614, "learning_rate": 0.0002, "loss": 0.5959, "mean_token_accuracy": 0.7616100460290909, "num_tokens": 8043104.0, "step": 2221 }, { "entropy": 0.5798885375261307, "epoch": 2.0727951469902006, "grad_norm": 0.2689226269721985, "learning_rate": 0.0002, "loss": 0.5684, "mean_token_accuracy": 0.7719147354364395, "num_tokens": 8046878.0, "step": 2222 }, { "entropy": 0.6018268465995789, "epoch": 2.0737284181054596, "grad_norm": 0.3127143681049347, "learning_rate": 0.0002, "loss": 0.6032, "mean_token_accuracy": 0.7538418024778366, "num_tokens": 8050535.0, "step": 2223 }, { "entropy": 0.5660705417394638, "epoch": 2.0746616892207186, "grad_norm": 0.317127525806427, "learning_rate": 0.0002, "loss": 0.5835, "mean_token_accuracy": 0.7653170078992844, "num_tokens": 8054060.0, "step": 2224 }, { "entropy": 0.5911367386579514, "epoch": 2.0755949603359776, "grad_norm": 0.2967802882194519, "learning_rate": 0.0002, "loss": 0.5976, "mean_token_accuracy": 0.75965815782547, "num_tokens": 8057631.0, "step": 2225 }, { "entropy": 0.5983722358942032, "epoch": 2.0765282314512366, "grad_norm": 0.3424811065196991, "learning_rate": 0.0002, "loss": 0.6111, "mean_token_accuracy": 0.7581979632377625, "num_tokens": 8061317.0, "step": 2226 }, { "entropy": 0.5902890861034393, "epoch": 2.0774615025664955, "grad_norm": 0.37410831451416016, "learning_rate": 0.0002, "loss": 0.5993, "mean_token_accuracy": 0.7616977691650391, "num_tokens": 8064844.0, "step": 2227 }, { "entropy": 0.5762847065925598, "epoch": 2.0783947736817545, "grad_norm": 0.24927577376365662, "learning_rate": 0.0002, "loss": 0.5699, "mean_token_accuracy": 0.7720105350017548, "num_tokens": 8068562.0, "step": 2228 }, { "entropy": 0.5790136754512787, "epoch": 2.0793280447970135, "grad_norm": 0.2594166696071625, "learning_rate": 0.0002, "loss": 0.5911, "mean_token_accuracy": 0.7543931007385254, "num_tokens": 8072129.0, "step": 2229 }, { "entropy": 0.560194343328476, "epoch": 2.0802613159122725, "grad_norm": 0.24498505890369415, "learning_rate": 0.0002, "loss": 0.5659, "mean_token_accuracy": 0.7820642590522766, "num_tokens": 8075773.0, "step": 2230 }, { "entropy": 0.6111870557069778, "epoch": 2.0811945870275315, "grad_norm": 0.32250210642814636, "learning_rate": 0.0002, "loss": 0.6048, "mean_token_accuracy": 0.7579309940338135, "num_tokens": 8079480.0, "step": 2231 }, { "entropy": 0.6232975572347641, "epoch": 2.0821278581427904, "grad_norm": 0.266794353723526, "learning_rate": 0.0002, "loss": 0.6383, "mean_token_accuracy": 0.7441994100809097, "num_tokens": 8083136.0, "step": 2232 }, { "entropy": 0.6012145727872849, "epoch": 2.0830611292580494, "grad_norm": 0.26696786284446716, "learning_rate": 0.0002, "loss": 0.6003, "mean_token_accuracy": 0.7563326060771942, "num_tokens": 8086827.0, "step": 2233 }, { "entropy": 0.5438442155718803, "epoch": 2.0839944003733084, "grad_norm": 0.3357878625392914, "learning_rate": 0.0002, "loss": 0.5443, "mean_token_accuracy": 0.7825492471456528, "num_tokens": 8090377.0, "step": 2234 }, { "entropy": 0.5744924694299698, "epoch": 2.0849276714885674, "grad_norm": 0.26607635617256165, "learning_rate": 0.0002, "loss": 0.5588, "mean_token_accuracy": 0.7768044024705887, "num_tokens": 8093957.0, "step": 2235 }, { "entropy": 0.5961675941944122, "epoch": 2.0858609426038264, "grad_norm": 0.30871474742889404, "learning_rate": 0.0002, "loss": 0.5987, "mean_token_accuracy": 0.7557289451360703, "num_tokens": 8097469.0, "step": 2236 }, { "entropy": 0.6000724881887436, "epoch": 2.0867942137190854, "grad_norm": 0.2784532308578491, "learning_rate": 0.0002, "loss": 0.6027, "mean_token_accuracy": 0.757633164525032, "num_tokens": 8101122.0, "step": 2237 }, { "entropy": 0.5993941873311996, "epoch": 2.0877274848343443, "grad_norm": 0.2463454157114029, "learning_rate": 0.0002, "loss": 0.5995, "mean_token_accuracy": 0.7570018619298935, "num_tokens": 8104813.0, "step": 2238 }, { "entropy": 0.5520271509885788, "epoch": 2.0886607559496033, "grad_norm": 0.35935178399086, "learning_rate": 0.0002, "loss": 0.5687, "mean_token_accuracy": 0.7650067508220673, "num_tokens": 8108423.0, "step": 2239 }, { "entropy": 0.5777813196182251, "epoch": 2.0895940270648623, "grad_norm": 0.2784895598888397, "learning_rate": 0.0002, "loss": 0.5874, "mean_token_accuracy": 0.7630660831928253, "num_tokens": 8112003.0, "step": 2240 }, { "entropy": 0.5871253460645676, "epoch": 2.0905272981801213, "grad_norm": 0.25822001695632935, "learning_rate": 0.0002, "loss": 0.5929, "mean_token_accuracy": 0.7591975182294846, "num_tokens": 8115430.0, "step": 2241 }, { "entropy": 0.6070327162742615, "epoch": 2.0914605692953803, "grad_norm": 0.2786569595336914, "learning_rate": 0.0002, "loss": 0.6108, "mean_token_accuracy": 0.7533381581306458, "num_tokens": 8119022.0, "step": 2242 }, { "entropy": 0.5779010206460953, "epoch": 2.0923938404106392, "grad_norm": 0.3004540503025055, "learning_rate": 0.0002, "loss": 0.5749, "mean_token_accuracy": 0.7688222974538803, "num_tokens": 8122667.0, "step": 2243 }, { "entropy": 0.6029519140720367, "epoch": 2.0933271115258982, "grad_norm": 0.264016717672348, "learning_rate": 0.0002, "loss": 0.6008, "mean_token_accuracy": 0.7602716088294983, "num_tokens": 8126357.0, "step": 2244 }, { "entropy": 0.6221707910299301, "epoch": 2.094260382641157, "grad_norm": 0.34708499908447266, "learning_rate": 0.0002, "loss": 0.6269, "mean_token_accuracy": 0.7499993294477463, "num_tokens": 8129920.0, "step": 2245 }, { "entropy": 0.5664742588996887, "epoch": 2.095193653756416, "grad_norm": 0.28561004996299744, "learning_rate": 0.0002, "loss": 0.5804, "mean_token_accuracy": 0.760911300778389, "num_tokens": 8133403.0, "step": 2246 }, { "entropy": 0.593341052532196, "epoch": 2.096126924871675, "grad_norm": 0.2839479148387909, "learning_rate": 0.0002, "loss": 0.6033, "mean_token_accuracy": 0.7536005079746246, "num_tokens": 8136956.0, "step": 2247 }, { "entropy": 0.5577875003218651, "epoch": 2.097060195986934, "grad_norm": 0.32225894927978516, "learning_rate": 0.0002, "loss": 0.568, "mean_token_accuracy": 0.7652128785848618, "num_tokens": 8140502.0, "step": 2248 }, { "entropy": 0.5789213180541992, "epoch": 2.097993467102193, "grad_norm": 0.22498537600040436, "learning_rate": 0.0002, "loss": 0.5647, "mean_token_accuracy": 0.7765842378139496, "num_tokens": 8144116.0, "step": 2249 }, { "entropy": 0.6142255663871765, "epoch": 2.098926738217452, "grad_norm": 0.3071654140949249, "learning_rate": 0.0002, "loss": 0.6096, "mean_token_accuracy": 0.7565809786319733, "num_tokens": 8147684.0, "step": 2250 }, { "entropy": 0.5922154784202576, "epoch": 2.099860009332711, "grad_norm": 0.301449179649353, "learning_rate": 0.0002, "loss": 0.5958, "mean_token_accuracy": 0.7571930885314941, "num_tokens": 8151375.0, "step": 2251 }, { "entropy": 0.6122957170009613, "epoch": 2.10079328044797, "grad_norm": 0.3119675815105438, "learning_rate": 0.0002, "loss": 0.6141, "mean_token_accuracy": 0.7545228004455566, "num_tokens": 8155037.0, "step": 2252 }, { "entropy": 0.5643935352563858, "epoch": 2.101726551563229, "grad_norm": 0.3057679831981659, "learning_rate": 0.0002, "loss": 0.5762, "mean_token_accuracy": 0.7675812095403671, "num_tokens": 8158703.0, "step": 2253 }, { "entropy": 0.5726190358400345, "epoch": 2.102659822678488, "grad_norm": 0.2738810181617737, "learning_rate": 0.0002, "loss": 0.582, "mean_token_accuracy": 0.764410212635994, "num_tokens": 8162279.0, "step": 2254 }, { "entropy": 0.6205900460481644, "epoch": 2.103593093793747, "grad_norm": 0.28648748993873596, "learning_rate": 0.0002, "loss": 0.6347, "mean_token_accuracy": 0.7401082068681717, "num_tokens": 8165854.0, "step": 2255 }, { "entropy": 0.5682659894227982, "epoch": 2.104526364909006, "grad_norm": 0.29879918694496155, "learning_rate": 0.0002, "loss": 0.5892, "mean_token_accuracy": 0.7565762847661972, "num_tokens": 8169346.0, "step": 2256 }, { "entropy": 0.5712814182043076, "epoch": 2.105459636024265, "grad_norm": 0.2651250958442688, "learning_rate": 0.0002, "loss": 0.5735, "mean_token_accuracy": 0.7639885097742081, "num_tokens": 8173103.0, "step": 2257 }, { "entropy": 0.6142923384904861, "epoch": 2.106392907139524, "grad_norm": 0.29025790095329285, "learning_rate": 0.0002, "loss": 0.6177, "mean_token_accuracy": 0.752712070941925, "num_tokens": 8176706.0, "step": 2258 }, { "entropy": 0.5884431302547455, "epoch": 2.107326178254783, "grad_norm": 0.2718122601509094, "learning_rate": 0.0002, "loss": 0.5808, "mean_token_accuracy": 0.7598102241754532, "num_tokens": 8180341.0, "step": 2259 }, { "entropy": 0.6127192080020905, "epoch": 2.108259449370042, "grad_norm": 0.2501174807548523, "learning_rate": 0.0002, "loss": 0.6096, "mean_token_accuracy": 0.7557329386472702, "num_tokens": 8183990.0, "step": 2260 }, { "entropy": 0.6128068268299103, "epoch": 2.109192720485301, "grad_norm": 0.2384764701128006, "learning_rate": 0.0002, "loss": 0.6068, "mean_token_accuracy": 0.7606671303510666, "num_tokens": 8187585.0, "step": 2261 }, { "entropy": 0.5737184435129166, "epoch": 2.11012599160056, "grad_norm": 0.2819713056087494, "learning_rate": 0.0002, "loss": 0.578, "mean_token_accuracy": 0.7629826664924622, "num_tokens": 8191125.0, "step": 2262 }, { "entropy": 0.5898037254810333, "epoch": 2.111059262715819, "grad_norm": 0.261159747838974, "learning_rate": 0.0002, "loss": 0.5969, "mean_token_accuracy": 0.7589449137449265, "num_tokens": 8194705.0, "step": 2263 }, { "entropy": 0.5810644179582596, "epoch": 2.111992533831078, "grad_norm": 0.3166103661060333, "learning_rate": 0.0002, "loss": 0.5985, "mean_token_accuracy": 0.7610558122396469, "num_tokens": 8198360.0, "step": 2264 }, { "entropy": 0.5584096908569336, "epoch": 2.112925804946337, "grad_norm": 0.27144110202789307, "learning_rate": 0.0002, "loss": 0.5667, "mean_token_accuracy": 0.7724540829658508, "num_tokens": 8202075.0, "step": 2265 }, { "entropy": 0.5606291145086288, "epoch": 2.113859076061596, "grad_norm": 0.2776046395301819, "learning_rate": 0.0002, "loss": 0.5685, "mean_token_accuracy": 0.767621785402298, "num_tokens": 8205779.0, "step": 2266 }, { "entropy": 0.6084107160568237, "epoch": 2.114792347176855, "grad_norm": 0.32538270950317383, "learning_rate": 0.0002, "loss": 0.6269, "mean_token_accuracy": 0.7374046444892883, "num_tokens": 8209436.0, "step": 2267 }, { "entropy": 0.5876586437225342, "epoch": 2.115725618292114, "grad_norm": 0.30438151955604553, "learning_rate": 0.0002, "loss": 0.5823, "mean_token_accuracy": 0.758630245923996, "num_tokens": 8213085.0, "step": 2268 }, { "entropy": 0.6078964620828629, "epoch": 2.116658889407373, "grad_norm": 0.26936689019203186, "learning_rate": 0.0002, "loss": 0.6063, "mean_token_accuracy": 0.751944050192833, "num_tokens": 8216683.0, "step": 2269 }, { "entropy": 0.6075794696807861, "epoch": 2.1175921605226318, "grad_norm": 0.2876438498497009, "learning_rate": 0.0002, "loss": 0.6053, "mean_token_accuracy": 0.7566319555044174, "num_tokens": 8220248.0, "step": 2270 }, { "entropy": 0.6186230480670929, "epoch": 2.1185254316378908, "grad_norm": 0.24538174271583557, "learning_rate": 0.0002, "loss": 0.6076, "mean_token_accuracy": 0.7565308958292007, "num_tokens": 8223918.0, "step": 2271 }, { "entropy": 0.5940633565187454, "epoch": 2.1194587027531497, "grad_norm": 0.23326216638088226, "learning_rate": 0.0002, "loss": 0.5797, "mean_token_accuracy": 0.7642742246389389, "num_tokens": 8227610.0, "step": 2272 }, { "entropy": 0.6107978373765945, "epoch": 2.1203919738684087, "grad_norm": 0.25912144780158997, "learning_rate": 0.0002, "loss": 0.6105, "mean_token_accuracy": 0.7534486055374146, "num_tokens": 8231234.0, "step": 2273 }, { "entropy": 0.57998026907444, "epoch": 2.1213252449836677, "grad_norm": 0.2826509177684784, "learning_rate": 0.0002, "loss": 0.5784, "mean_token_accuracy": 0.767267718911171, "num_tokens": 8234911.0, "step": 2274 }, { "entropy": 0.58481764793396, "epoch": 2.1222585160989267, "grad_norm": 0.34978631138801575, "learning_rate": 0.0002, "loss": 0.5943, "mean_token_accuracy": 0.7630192190408707, "num_tokens": 8238534.0, "step": 2275 }, { "entropy": 0.6124267578125, "epoch": 2.1231917872141857, "grad_norm": 0.27584439516067505, "learning_rate": 0.0002, "loss": 0.6189, "mean_token_accuracy": 0.7488401085138321, "num_tokens": 8242200.0, "step": 2276 }, { "entropy": 0.5871164053678513, "epoch": 2.1241250583294446, "grad_norm": 0.27020061016082764, "learning_rate": 0.0002, "loss": 0.5931, "mean_token_accuracy": 0.7588271200656891, "num_tokens": 8245797.0, "step": 2277 }, { "entropy": 0.5470297187566757, "epoch": 2.1250583294447036, "grad_norm": 0.2864471971988678, "learning_rate": 0.0002, "loss": 0.5598, "mean_token_accuracy": 0.7761086672544479, "num_tokens": 8249303.0, "step": 2278 }, { "entropy": 0.5669999867677689, "epoch": 2.1259916005599626, "grad_norm": 0.3044615685939789, "learning_rate": 0.0002, "loss": 0.5742, "mean_token_accuracy": 0.7691217660903931, "num_tokens": 8252801.0, "step": 2279 }, { "entropy": 0.5782038569450378, "epoch": 2.1269248716752216, "grad_norm": 0.3251620829105377, "learning_rate": 0.0002, "loss": 0.5991, "mean_token_accuracy": 0.757232740521431, "num_tokens": 8256346.0, "step": 2280 }, { "entropy": 0.6038341224193573, "epoch": 2.1278581427904806, "grad_norm": 0.29008278250694275, "learning_rate": 0.0002, "loss": 0.6053, "mean_token_accuracy": 0.7545742988586426, "num_tokens": 8259974.0, "step": 2281 }, { "entropy": 0.6182076036930084, "epoch": 2.1287914139057396, "grad_norm": 0.3125447630882263, "learning_rate": 0.0002, "loss": 0.6249, "mean_token_accuracy": 0.7435896247625351, "num_tokens": 8263570.0, "step": 2282 }, { "entropy": 0.6024322956800461, "epoch": 2.1297246850209985, "grad_norm": 0.2695988714694977, "learning_rate": 0.0002, "loss": 0.5938, "mean_token_accuracy": 0.751977264881134, "num_tokens": 8267210.0, "step": 2283 }, { "entropy": 0.5371197462081909, "epoch": 2.1306579561362575, "grad_norm": 0.28607895970344543, "learning_rate": 0.0002, "loss": 0.5361, "mean_token_accuracy": 0.7833854407072067, "num_tokens": 8270766.0, "step": 2284 }, { "entropy": 0.6054855585098267, "epoch": 2.1315912272515165, "grad_norm": 0.27645421028137207, "learning_rate": 0.0002, "loss": 0.6159, "mean_token_accuracy": 0.7571483850479126, "num_tokens": 8274429.0, "step": 2285 }, { "entropy": 0.552215501666069, "epoch": 2.1325244983667755, "grad_norm": 0.3037779927253723, "learning_rate": 0.0002, "loss": 0.5619, "mean_token_accuracy": 0.7674799114465714, "num_tokens": 8277779.0, "step": 2286 }, { "entropy": 0.5568560808897018, "epoch": 2.1334577694820345, "grad_norm": 0.2816643714904785, "learning_rate": 0.0002, "loss": 0.5573, "mean_token_accuracy": 0.7691613286733627, "num_tokens": 8281382.0, "step": 2287 }, { "entropy": 0.6453960537910461, "epoch": 2.1343910405972935, "grad_norm": 0.28451183438301086, "learning_rate": 0.0002, "loss": 0.6435, "mean_token_accuracy": 0.7458890974521637, "num_tokens": 8284947.0, "step": 2288 }, { "entropy": 0.5984252542257309, "epoch": 2.1353243117125524, "grad_norm": 0.3108806014060974, "learning_rate": 0.0002, "loss": 0.6021, "mean_token_accuracy": 0.7506808787584305, "num_tokens": 8288645.0, "step": 2289 }, { "entropy": 0.6185686588287354, "epoch": 2.1362575828278114, "grad_norm": 0.30804863572120667, "learning_rate": 0.0002, "loss": 0.6287, "mean_token_accuracy": 0.747748538851738, "num_tokens": 8292330.0, "step": 2290 }, { "entropy": 0.595654621720314, "epoch": 2.1371908539430704, "grad_norm": 0.26756751537323, "learning_rate": 0.0002, "loss": 0.5936, "mean_token_accuracy": 0.7597111165523529, "num_tokens": 8296000.0, "step": 2291 }, { "entropy": 0.6294244974851608, "epoch": 2.1381241250583294, "grad_norm": 0.29089978337287903, "learning_rate": 0.0002, "loss": 0.6248, "mean_token_accuracy": 0.7455633878707886, "num_tokens": 8299684.0, "step": 2292 }, { "entropy": 0.615734651684761, "epoch": 2.1390573961735884, "grad_norm": 0.31525376439094543, "learning_rate": 0.0002, "loss": 0.6126, "mean_token_accuracy": 0.7496893405914307, "num_tokens": 8303203.0, "step": 2293 }, { "entropy": 0.5758403912186623, "epoch": 2.1399906672888473, "grad_norm": 0.3292861878871918, "learning_rate": 0.0002, "loss": 0.5826, "mean_token_accuracy": 0.7674686461687088, "num_tokens": 8306799.0, "step": 2294 }, { "entropy": 0.5664787441492081, "epoch": 2.1409239384041063, "grad_norm": 0.3505602777004242, "learning_rate": 0.0002, "loss": 0.5768, "mean_token_accuracy": 0.7577099800109863, "num_tokens": 8310448.0, "step": 2295 }, { "entropy": 0.5592279210686684, "epoch": 2.1418572095193653, "grad_norm": 0.2871904671192169, "learning_rate": 0.0002, "loss": 0.5709, "mean_token_accuracy": 0.7663271874189377, "num_tokens": 8313989.0, "step": 2296 }, { "entropy": 0.6170869022607803, "epoch": 2.1427904806346243, "grad_norm": 0.29245635867118835, "learning_rate": 0.0002, "loss": 0.6147, "mean_token_accuracy": 0.752310037612915, "num_tokens": 8317652.0, "step": 2297 }, { "entropy": 0.6170367002487183, "epoch": 2.1437237517498833, "grad_norm": 0.3450734317302704, "learning_rate": 0.0002, "loss": 0.6257, "mean_token_accuracy": 0.752133309841156, "num_tokens": 8321292.0, "step": 2298 }, { "entropy": 0.5917398780584335, "epoch": 2.1446570228651423, "grad_norm": 0.38344430923461914, "learning_rate": 0.0002, "loss": 0.6041, "mean_token_accuracy": 0.7611623555421829, "num_tokens": 8325078.0, "step": 2299 }, { "entropy": 0.5689868181943893, "epoch": 2.1455902939804012, "grad_norm": 0.37523353099823, "learning_rate": 0.0002, "loss": 0.5844, "mean_token_accuracy": 0.7618496417999268, "num_tokens": 8328642.0, "step": 2300 }, { "entropy": 0.6109979897737503, "epoch": 2.1465235650956602, "grad_norm": 0.2717086970806122, "learning_rate": 0.0002, "loss": 0.6055, "mean_token_accuracy": 0.7582127302885056, "num_tokens": 8332329.0, "step": 2301 }, { "entropy": 0.5760907381772995, "epoch": 2.147456836210919, "grad_norm": 0.3253421187400818, "learning_rate": 0.0002, "loss": 0.5877, "mean_token_accuracy": 0.760476142168045, "num_tokens": 8335858.0, "step": 2302 }, { "entropy": 0.5933248996734619, "epoch": 2.148390107326178, "grad_norm": 0.26201021671295166, "learning_rate": 0.0002, "loss": 0.5844, "mean_token_accuracy": 0.7626147866249084, "num_tokens": 8339525.0, "step": 2303 }, { "entropy": 0.6269838958978653, "epoch": 2.149323378441437, "grad_norm": 0.3186527490615845, "learning_rate": 0.0002, "loss": 0.6194, "mean_token_accuracy": 0.7493605017662048, "num_tokens": 8343185.0, "step": 2304 }, { "entropy": 0.6130197197198868, "epoch": 2.150256649556696, "grad_norm": 0.3630645275115967, "learning_rate": 0.0002, "loss": 0.6226, "mean_token_accuracy": 0.7562390714883804, "num_tokens": 8346903.0, "step": 2305 }, { "entropy": 0.5764106661081314, "epoch": 2.151189920671955, "grad_norm": 0.2890404760837555, "learning_rate": 0.0002, "loss": 0.5841, "mean_token_accuracy": 0.7617520838975906, "num_tokens": 8350510.0, "step": 2306 }, { "entropy": 0.6148927509784698, "epoch": 2.152123191787214, "grad_norm": 0.27749326825141907, "learning_rate": 0.0002, "loss": 0.6193, "mean_token_accuracy": 0.7477836906909943, "num_tokens": 8354268.0, "step": 2307 }, { "entropy": 0.5648437589406967, "epoch": 2.153056462902473, "grad_norm": 0.28256461024284363, "learning_rate": 0.0002, "loss": 0.5662, "mean_token_accuracy": 0.7736520171165466, "num_tokens": 8357908.0, "step": 2308 }, { "entropy": 0.5966114550828934, "epoch": 2.153989734017732, "grad_norm": 0.2550146281719208, "learning_rate": 0.0002, "loss": 0.5991, "mean_token_accuracy": 0.7631022334098816, "num_tokens": 8361650.0, "step": 2309 }, { "entropy": 0.5865554213523865, "epoch": 2.154923005132991, "grad_norm": 0.27810025215148926, "learning_rate": 0.0002, "loss": 0.5834, "mean_token_accuracy": 0.7593575119972229, "num_tokens": 8365195.0, "step": 2310 }, { "entropy": 0.5745296478271484, "epoch": 2.15585627624825, "grad_norm": 0.2689867615699768, "learning_rate": 0.0002, "loss": 0.5721, "mean_token_accuracy": 0.7710607200860977, "num_tokens": 8368828.0, "step": 2311 }, { "entropy": 0.5901143550872803, "epoch": 2.156789547363509, "grad_norm": 0.2747341990470886, "learning_rate": 0.0002, "loss": 0.5867, "mean_token_accuracy": 0.7607621550559998, "num_tokens": 8372409.0, "step": 2312 }, { "entropy": 0.5956049412488937, "epoch": 2.157722818478768, "grad_norm": 0.28184929490089417, "learning_rate": 0.0002, "loss": 0.5961, "mean_token_accuracy": 0.7586119472980499, "num_tokens": 8376098.0, "step": 2313 }, { "entropy": 0.5889282524585724, "epoch": 2.158656089594027, "grad_norm": 0.3198992609977722, "learning_rate": 0.0002, "loss": 0.592, "mean_token_accuracy": 0.759218618273735, "num_tokens": 8379672.0, "step": 2314 }, { "entropy": 0.5541287511587143, "epoch": 2.159589360709286, "grad_norm": 0.3090055584907532, "learning_rate": 0.0002, "loss": 0.5676, "mean_token_accuracy": 0.7726028561592102, "num_tokens": 8383196.0, "step": 2315 }, { "entropy": 0.5832509845495224, "epoch": 2.160522631824545, "grad_norm": 0.3231296241283417, "learning_rate": 0.0002, "loss": 0.5977, "mean_token_accuracy": 0.7570188194513321, "num_tokens": 8386726.0, "step": 2316 }, { "entropy": 0.6300223022699356, "epoch": 2.161455902939804, "grad_norm": 0.33409595489501953, "learning_rate": 0.0002, "loss": 0.65, "mean_token_accuracy": 0.7415565997362137, "num_tokens": 8390355.0, "step": 2317 }, { "entropy": 0.5766201019287109, "epoch": 2.162389174055063, "grad_norm": 0.2858700752258301, "learning_rate": 0.0002, "loss": 0.5873, "mean_token_accuracy": 0.7668737024068832, "num_tokens": 8393958.0, "step": 2318 }, { "entropy": 0.5894267559051514, "epoch": 2.163322445170322, "grad_norm": 0.2647208869457245, "learning_rate": 0.0002, "loss": 0.5915, "mean_token_accuracy": 0.7698917537927628, "num_tokens": 8397731.0, "step": 2319 }, { "entropy": 0.6216875314712524, "epoch": 2.164255716285581, "grad_norm": 0.2774721384048462, "learning_rate": 0.0002, "loss": 0.6252, "mean_token_accuracy": 0.7421796321868896, "num_tokens": 8401256.0, "step": 2320 }, { "entropy": 0.5857991427183151, "epoch": 2.16518898740084, "grad_norm": 0.2693770229816437, "learning_rate": 0.0002, "loss": 0.5807, "mean_token_accuracy": 0.7624678015708923, "num_tokens": 8404855.0, "step": 2321 }, { "entropy": 0.5848852545022964, "epoch": 2.166122258516099, "grad_norm": 0.27287423610687256, "learning_rate": 0.0002, "loss": 0.5717, "mean_token_accuracy": 0.768419548869133, "num_tokens": 8408651.0, "step": 2322 }, { "entropy": 0.5934577286243439, "epoch": 2.167055529631358, "grad_norm": 0.31089162826538086, "learning_rate": 0.0002, "loss": 0.5952, "mean_token_accuracy": 0.7644048184156418, "num_tokens": 8412279.0, "step": 2323 }, { "entropy": 0.5895729660987854, "epoch": 2.167988800746617, "grad_norm": 0.32973867654800415, "learning_rate": 0.0002, "loss": 0.5944, "mean_token_accuracy": 0.7627944946289062, "num_tokens": 8415810.0, "step": 2324 }, { "entropy": 0.5489595830440521, "epoch": 2.168922071861876, "grad_norm": 0.292237788438797, "learning_rate": 0.0002, "loss": 0.5514, "mean_token_accuracy": 0.7779190093278885, "num_tokens": 8419454.0, "step": 2325 }, { "entropy": 0.5679409205913544, "epoch": 2.169855342977135, "grad_norm": 0.38407647609710693, "learning_rate": 0.0002, "loss": 0.5831, "mean_token_accuracy": 0.7625149637460709, "num_tokens": 8422887.0, "step": 2326 }, { "entropy": 0.564596489071846, "epoch": 2.1707886140923938, "grad_norm": 0.36137521266937256, "learning_rate": 0.0002, "loss": 0.5768, "mean_token_accuracy": 0.7640045583248138, "num_tokens": 8426395.0, "step": 2327 }, { "entropy": 0.5978202074766159, "epoch": 2.1717218852076527, "grad_norm": 0.23268276453018188, "learning_rate": 0.0002, "loss": 0.5915, "mean_token_accuracy": 0.7561551481485367, "num_tokens": 8430309.0, "step": 2328 }, { "entropy": 0.568810909986496, "epoch": 2.1726551563229117, "grad_norm": 0.2700110673904419, "learning_rate": 0.0002, "loss": 0.577, "mean_token_accuracy": 0.7665290236473083, "num_tokens": 8433882.0, "step": 2329 }, { "entropy": 0.610126867890358, "epoch": 2.1735884274381707, "grad_norm": 0.27902284264564514, "learning_rate": 0.0002, "loss": 0.6232, "mean_token_accuracy": 0.7507376968860626, "num_tokens": 8437592.0, "step": 2330 }, { "entropy": 0.5720477849245071, "epoch": 2.1745216985534297, "grad_norm": 0.3829280436038971, "learning_rate": 0.0002, "loss": 0.5842, "mean_token_accuracy": 0.7563038170337677, "num_tokens": 8441123.0, "step": 2331 }, { "entropy": 0.6215395629405975, "epoch": 2.1754549696686887, "grad_norm": 0.2724095582962036, "learning_rate": 0.0002, "loss": 0.6126, "mean_token_accuracy": 0.754868820309639, "num_tokens": 8444718.0, "step": 2332 }, { "entropy": 0.5932022333145142, "epoch": 2.1763882407839477, "grad_norm": 0.25356370210647583, "learning_rate": 0.0002, "loss": 0.5833, "mean_token_accuracy": 0.7598023116588593, "num_tokens": 8448482.0, "step": 2333 }, { "entropy": 0.6110702455043793, "epoch": 2.1773215118992066, "grad_norm": 0.34206464886665344, "learning_rate": 0.0002, "loss": 0.6123, "mean_token_accuracy": 0.7426034957170486, "num_tokens": 8451999.0, "step": 2334 }, { "entropy": 0.5546213686466217, "epoch": 2.1782547830144656, "grad_norm": 0.2809150218963623, "learning_rate": 0.0002, "loss": 0.5571, "mean_token_accuracy": 0.7712195813655853, "num_tokens": 8455505.0, "step": 2335 }, { "entropy": 0.5877770036458969, "epoch": 2.1791880541297246, "grad_norm": 0.29239317774772644, "learning_rate": 0.0002, "loss": 0.5909, "mean_token_accuracy": 0.7599957138299942, "num_tokens": 8459140.0, "step": 2336 }, { "entropy": 0.5550197660923004, "epoch": 2.1801213252449836, "grad_norm": 0.27376651763916016, "learning_rate": 0.0002, "loss": 0.5705, "mean_token_accuracy": 0.7643794566392899, "num_tokens": 8462753.0, "step": 2337 }, { "entropy": 0.5657875463366508, "epoch": 2.1810545963602426, "grad_norm": 0.3186740577220917, "learning_rate": 0.0002, "loss": 0.5652, "mean_token_accuracy": 0.7679927945137024, "num_tokens": 8466288.0, "step": 2338 }, { "entropy": 0.6079001128673553, "epoch": 2.1819878674755016, "grad_norm": 0.3901255130767822, "learning_rate": 0.0002, "loss": 0.6142, "mean_token_accuracy": 0.7500372678041458, "num_tokens": 8469893.0, "step": 2339 }, { "entropy": 0.6011014580726624, "epoch": 2.1829211385907605, "grad_norm": 0.2960359752178192, "learning_rate": 0.0002, "loss": 0.6103, "mean_token_accuracy": 0.7494435608386993, "num_tokens": 8473590.0, "step": 2340 }, { "entropy": 0.5819852352142334, "epoch": 2.1838544097060195, "grad_norm": 0.29967939853668213, "learning_rate": 0.0002, "loss": 0.5961, "mean_token_accuracy": 0.7552744597196579, "num_tokens": 8477229.0, "step": 2341 }, { "entropy": 0.5640967637300491, "epoch": 2.1847876808212785, "grad_norm": 0.325507253408432, "learning_rate": 0.0002, "loss": 0.5713, "mean_token_accuracy": 0.7707193493843079, "num_tokens": 8480754.0, "step": 2342 }, { "entropy": 0.5917907208204269, "epoch": 2.1857209519365375, "grad_norm": 0.28986138105392456, "learning_rate": 0.0002, "loss": 0.6075, "mean_token_accuracy": 0.7501442432403564, "num_tokens": 8484404.0, "step": 2343 }, { "entropy": 0.5793856233358383, "epoch": 2.1866542230517965, "grad_norm": 0.28094467520713806, "learning_rate": 0.0002, "loss": 0.5718, "mean_token_accuracy": 0.7672288566827774, "num_tokens": 8487933.0, "step": 2344 }, { "entropy": 0.6215274184942245, "epoch": 2.1875874941670554, "grad_norm": 0.2607601583003998, "learning_rate": 0.0002, "loss": 0.6238, "mean_token_accuracy": 0.7493434846401215, "num_tokens": 8491562.0, "step": 2345 }, { "entropy": 0.6208483278751373, "epoch": 2.1885207652823144, "grad_norm": 0.29296958446502686, "learning_rate": 0.0002, "loss": 0.6172, "mean_token_accuracy": 0.7484779953956604, "num_tokens": 8495321.0, "step": 2346 }, { "entropy": 0.6079575717449188, "epoch": 2.1894540363975734, "grad_norm": 0.29936087131500244, "learning_rate": 0.0002, "loss": 0.6176, "mean_token_accuracy": 0.7481423616409302, "num_tokens": 8498941.0, "step": 2347 }, { "entropy": 0.5971207767724991, "epoch": 2.1903873075128324, "grad_norm": 0.3007107675075531, "learning_rate": 0.0002, "loss": 0.6012, "mean_token_accuracy": 0.7625744342803955, "num_tokens": 8502485.0, "step": 2348 }, { "entropy": 0.5886639952659607, "epoch": 2.1913205786280914, "grad_norm": 0.2951311767101288, "learning_rate": 0.0002, "loss": 0.5855, "mean_token_accuracy": 0.7665237188339233, "num_tokens": 8506041.0, "step": 2349 }, { "entropy": 0.589599996805191, "epoch": 2.1922538497433504, "grad_norm": 0.29476651549339294, "learning_rate": 0.0002, "loss": 0.5954, "mean_token_accuracy": 0.7576693147420883, "num_tokens": 8509555.0, "step": 2350 }, { "entropy": 0.5989267975091934, "epoch": 2.1931871208586093, "grad_norm": 0.2732846140861511, "learning_rate": 0.0002, "loss": 0.6037, "mean_token_accuracy": 0.7561537176370621, "num_tokens": 8513243.0, "step": 2351 }, { "entropy": 0.5845438688993454, "epoch": 2.1941203919738683, "grad_norm": 0.2797333300113678, "learning_rate": 0.0002, "loss": 0.5914, "mean_token_accuracy": 0.765125572681427, "num_tokens": 8516860.0, "step": 2352 }, { "entropy": 0.5707387626171112, "epoch": 2.1950536630891273, "grad_norm": 0.26184073090553284, "learning_rate": 0.0002, "loss": 0.5798, "mean_token_accuracy": 0.7649408578872681, "num_tokens": 8520499.0, "step": 2353 }, { "entropy": 0.6104388535022736, "epoch": 2.1959869342043863, "grad_norm": 0.3420996367931366, "learning_rate": 0.0002, "loss": 0.6245, "mean_token_accuracy": 0.7479348480701447, "num_tokens": 8524144.0, "step": 2354 }, { "entropy": 0.5688076466321945, "epoch": 2.1969202053196453, "grad_norm": 0.2603188455104828, "learning_rate": 0.0002, "loss": 0.5772, "mean_token_accuracy": 0.7604003101587296, "num_tokens": 8527796.0, "step": 2355 }, { "entropy": 0.5667518824338913, "epoch": 2.1978534764349043, "grad_norm": 0.28417181968688965, "learning_rate": 0.0002, "loss": 0.5708, "mean_token_accuracy": 0.7685412019491196, "num_tokens": 8531469.0, "step": 2356 }, { "entropy": 0.5934633165597916, "epoch": 2.1987867475501632, "grad_norm": 0.31418463587760925, "learning_rate": 0.0002, "loss": 0.592, "mean_token_accuracy": 0.7568591088056564, "num_tokens": 8535273.0, "step": 2357 }, { "entropy": 0.5902548581361771, "epoch": 2.199720018665422, "grad_norm": 0.26681554317474365, "learning_rate": 0.0002, "loss": 0.5819, "mean_token_accuracy": 0.7647341787815094, "num_tokens": 8538919.0, "step": 2358 }, { "entropy": 0.5891347527503967, "epoch": 2.200653289780681, "grad_norm": 0.3127342760562897, "learning_rate": 0.0002, "loss": 0.5927, "mean_token_accuracy": 0.7592729926109314, "num_tokens": 8542544.0, "step": 2359 }, { "entropy": 0.6021530330181122, "epoch": 2.20158656089594, "grad_norm": 0.31843137741088867, "learning_rate": 0.0002, "loss": 0.6058, "mean_token_accuracy": 0.7551460117101669, "num_tokens": 8546305.0, "step": 2360 }, { "entropy": 0.5926598459482193, "epoch": 2.202519832011199, "grad_norm": 0.31639575958251953, "learning_rate": 0.0002, "loss": 0.603, "mean_token_accuracy": 0.7615261077880859, "num_tokens": 8549859.0, "step": 2361 }, { "entropy": 0.5963807702064514, "epoch": 2.203453103126458, "grad_norm": 0.2565319836139679, "learning_rate": 0.0002, "loss": 0.597, "mean_token_accuracy": 0.7592038959264755, "num_tokens": 8553398.0, "step": 2362 }, { "entropy": 0.5719095468521118, "epoch": 2.204386374241717, "grad_norm": 0.30380719900131226, "learning_rate": 0.0002, "loss": 0.5726, "mean_token_accuracy": 0.7704861015081406, "num_tokens": 8557122.0, "step": 2363 }, { "entropy": 0.5990542620420456, "epoch": 2.205319645356976, "grad_norm": 0.29104602336883545, "learning_rate": 0.0002, "loss": 0.5882, "mean_token_accuracy": 0.7577747404575348, "num_tokens": 8560873.0, "step": 2364 }, { "entropy": 0.5861097872257233, "epoch": 2.206252916472235, "grad_norm": 0.3022252023220062, "learning_rate": 0.0002, "loss": 0.5891, "mean_token_accuracy": 0.7610445320606232, "num_tokens": 8564692.0, "step": 2365 }, { "entropy": 0.6104065626859665, "epoch": 2.207186187587494, "grad_norm": 0.2837691903114319, "learning_rate": 0.0002, "loss": 0.6181, "mean_token_accuracy": 0.7449212670326233, "num_tokens": 8568238.0, "step": 2366 }, { "entropy": 0.6277349144220352, "epoch": 2.208119458702753, "grad_norm": 0.27214211225509644, "learning_rate": 0.0002, "loss": 0.6291, "mean_token_accuracy": 0.7480335831642151, "num_tokens": 8571984.0, "step": 2367 }, { "entropy": 0.5833575576543808, "epoch": 2.209052729818012, "grad_norm": 0.24983038008213043, "learning_rate": 0.0002, "loss": 0.5837, "mean_token_accuracy": 0.763738602399826, "num_tokens": 8575686.0, "step": 2368 }, { "entropy": 0.5752873420715332, "epoch": 2.209986000933271, "grad_norm": 0.333661288022995, "learning_rate": 0.0002, "loss": 0.5954, "mean_token_accuracy": 0.7561593800783157, "num_tokens": 8579244.0, "step": 2369 }, { "entropy": 0.5627732425928116, "epoch": 2.21091927204853, "grad_norm": 0.30257493257522583, "learning_rate": 0.0002, "loss": 0.5664, "mean_token_accuracy": 0.7695437967777252, "num_tokens": 8582844.0, "step": 2370 }, { "entropy": 0.6063052117824554, "epoch": 2.211852543163789, "grad_norm": 0.28276365995407104, "learning_rate": 0.0002, "loss": 0.6088, "mean_token_accuracy": 0.7576028853654861, "num_tokens": 8586473.0, "step": 2371 }, { "entropy": 0.5084924697875977, "epoch": 2.212785814279048, "grad_norm": 0.3096979558467865, "learning_rate": 0.0002, "loss": 0.5038, "mean_token_accuracy": 0.7941272556781769, "num_tokens": 8590057.0, "step": 2372 }, { "entropy": 0.6126297861337662, "epoch": 2.213719085394307, "grad_norm": 0.2751995325088501, "learning_rate": 0.0002, "loss": 0.6148, "mean_token_accuracy": 0.7530572414398193, "num_tokens": 8593692.0, "step": 2373 }, { "entropy": 0.6150067150592804, "epoch": 2.214652356509566, "grad_norm": 0.27048206329345703, "learning_rate": 0.0002, "loss": 0.6126, "mean_token_accuracy": 0.7568836659193039, "num_tokens": 8597241.0, "step": 2374 }, { "entropy": 0.6284038871526718, "epoch": 2.215585627624825, "grad_norm": 0.33619558811187744, "learning_rate": 0.0002, "loss": 0.6298, "mean_token_accuracy": 0.7440925389528275, "num_tokens": 8600868.0, "step": 2375 }, { "entropy": 0.606640949845314, "epoch": 2.216518898740084, "grad_norm": 0.2912035286426544, "learning_rate": 0.0002, "loss": 0.6041, "mean_token_accuracy": 0.7606271207332611, "num_tokens": 8604608.0, "step": 2376 }, { "entropy": 0.597091019153595, "epoch": 2.217452169855343, "grad_norm": 0.3309215307235718, "learning_rate": 0.0002, "loss": 0.6032, "mean_token_accuracy": 0.7568850070238113, "num_tokens": 8608172.0, "step": 2377 }, { "entropy": 0.5824596136808395, "epoch": 2.218385440970602, "grad_norm": 0.32744404673576355, "learning_rate": 0.0002, "loss": 0.5981, "mean_token_accuracy": 0.7578031122684479, "num_tokens": 8611763.0, "step": 2378 }, { "entropy": 0.5887960642576218, "epoch": 2.219318712085861, "grad_norm": 0.31700119376182556, "learning_rate": 0.0002, "loss": 0.5999, "mean_token_accuracy": 0.760401651263237, "num_tokens": 8615311.0, "step": 2379 }, { "entropy": 0.6017894595861435, "epoch": 2.22025198320112, "grad_norm": 0.3057185709476471, "learning_rate": 0.0002, "loss": 0.6101, "mean_token_accuracy": 0.7495683282613754, "num_tokens": 8618825.0, "step": 2380 }, { "entropy": 0.6314666420221329, "epoch": 2.221185254316379, "grad_norm": 0.2663949131965637, "learning_rate": 0.0002, "loss": 0.6204, "mean_token_accuracy": 0.7509291917085648, "num_tokens": 8622445.0, "step": 2381 }, { "entropy": 0.5903994143009186, "epoch": 2.222118525431638, "grad_norm": 0.3037620186805725, "learning_rate": 0.0002, "loss": 0.5978, "mean_token_accuracy": 0.7535839825868607, "num_tokens": 8625947.0, "step": 2382 }, { "entropy": 0.5576892346143723, "epoch": 2.2230517965468968, "grad_norm": 0.2942694127559662, "learning_rate": 0.0002, "loss": 0.5589, "mean_token_accuracy": 0.7703699618577957, "num_tokens": 8629533.0, "step": 2383 }, { "entropy": 0.5749717652797699, "epoch": 2.2239850676621558, "grad_norm": 0.2861919403076172, "learning_rate": 0.0002, "loss": 0.5788, "mean_token_accuracy": 0.7621338069438934, "num_tokens": 8633172.0, "step": 2384 }, { "entropy": 0.6051831543445587, "epoch": 2.2249183387774147, "grad_norm": 0.2631887197494507, "learning_rate": 0.0002, "loss": 0.6069, "mean_token_accuracy": 0.7553282976150513, "num_tokens": 8636798.0, "step": 2385 }, { "entropy": 0.5960742384195328, "epoch": 2.2258516098926737, "grad_norm": 0.2791377007961273, "learning_rate": 0.0002, "loss": 0.6105, "mean_token_accuracy": 0.7530831098556519, "num_tokens": 8640388.0, "step": 2386 }, { "entropy": 0.5787006616592407, "epoch": 2.2267848810079327, "grad_norm": 0.2780774235725403, "learning_rate": 0.0002, "loss": 0.5794, "mean_token_accuracy": 0.7605503499507904, "num_tokens": 8644013.0, "step": 2387 }, { "entropy": 0.5782035291194916, "epoch": 2.2277181521231917, "grad_norm": 0.3460947275161743, "learning_rate": 0.0002, "loss": 0.5903, "mean_token_accuracy": 0.7617191970348358, "num_tokens": 8647575.0, "step": 2388 }, { "entropy": 0.5520891919732094, "epoch": 2.2286514232384507, "grad_norm": 0.288711816072464, "learning_rate": 0.0002, "loss": 0.5568, "mean_token_accuracy": 0.7772928774356842, "num_tokens": 8651175.0, "step": 2389 }, { "entropy": 0.6054556369781494, "epoch": 2.2295846943537097, "grad_norm": 0.2656196653842926, "learning_rate": 0.0002, "loss": 0.6003, "mean_token_accuracy": 0.7616490870714188, "num_tokens": 8654787.0, "step": 2390 }, { "entropy": 0.6012173444032669, "epoch": 2.2305179654689686, "grad_norm": 0.3416280448436737, "learning_rate": 0.0002, "loss": 0.6015, "mean_token_accuracy": 0.7521859705448151, "num_tokens": 8658422.0, "step": 2391 }, { "entropy": 0.5872707217931747, "epoch": 2.2314512365842276, "grad_norm": 0.6270185708999634, "learning_rate": 0.0002, "loss": 0.5873, "mean_token_accuracy": 0.7626411616802216, "num_tokens": 8662084.0, "step": 2392 }, { "entropy": 0.6232167929410934, "epoch": 2.2323845076994866, "grad_norm": 0.2563695013523102, "learning_rate": 0.0002, "loss": 0.6154, "mean_token_accuracy": 0.7522178590297699, "num_tokens": 8665816.0, "step": 2393 }, { "entropy": 0.5702142268419266, "epoch": 2.2333177788147456, "grad_norm": 0.26106902956962585, "learning_rate": 0.0002, "loss": 0.5681, "mean_token_accuracy": 0.7722164988517761, "num_tokens": 8669471.0, "step": 2394 }, { "entropy": 0.5697062015533447, "epoch": 2.2342510499300046, "grad_norm": 0.2760434150695801, "learning_rate": 0.0002, "loss": 0.5836, "mean_token_accuracy": 0.7606301605701447, "num_tokens": 8673123.0, "step": 2395 }, { "entropy": 0.5707233399152756, "epoch": 2.2351843210452635, "grad_norm": 0.32027724385261536, "learning_rate": 0.0002, "loss": 0.5712, "mean_token_accuracy": 0.7742370963096619, "num_tokens": 8676724.0, "step": 2396 }, { "entropy": 0.548112541437149, "epoch": 2.2361175921605225, "grad_norm": 0.2927444875240326, "learning_rate": 0.0002, "loss": 0.5582, "mean_token_accuracy": 0.7683015018701553, "num_tokens": 8680331.0, "step": 2397 }, { "entropy": 0.599630281329155, "epoch": 2.2370508632757815, "grad_norm": 0.2967711389064789, "learning_rate": 0.0002, "loss": 0.6109, "mean_token_accuracy": 0.7470305114984512, "num_tokens": 8683987.0, "step": 2398 }, { "entropy": 0.5906425565481186, "epoch": 2.2379841343910405, "grad_norm": 0.3032587766647339, "learning_rate": 0.0002, "loss": 0.5953, "mean_token_accuracy": 0.7645378112792969, "num_tokens": 8687534.0, "step": 2399 }, { "entropy": 0.5803706645965576, "epoch": 2.2389174055062995, "grad_norm": 0.24496936798095703, "learning_rate": 0.0002, "loss": 0.5799, "mean_token_accuracy": 0.7654324024915695, "num_tokens": 8691161.0, "step": 2400 }, { "entropy": 0.5909268110990524, "epoch": 2.2398506766215585, "grad_norm": 0.3254065215587616, "learning_rate": 0.0002, "loss": 0.5957, "mean_token_accuracy": 0.7588689029216766, "num_tokens": 8694765.0, "step": 2401 }, { "entropy": 0.5711219161748886, "epoch": 2.2407839477368174, "grad_norm": 0.35934826731681824, "learning_rate": 0.0002, "loss": 0.589, "mean_token_accuracy": 0.7623362988233566, "num_tokens": 8698299.0, "step": 2402 }, { "entropy": 0.596192479133606, "epoch": 2.2417172188520764, "grad_norm": 0.3652099072933197, "learning_rate": 0.0002, "loss": 0.6071, "mean_token_accuracy": 0.7614441066980362, "num_tokens": 8701982.0, "step": 2403 }, { "entropy": 0.5821194052696228, "epoch": 2.2426504899673354, "grad_norm": 0.30006468296051025, "learning_rate": 0.0002, "loss": 0.5913, "mean_token_accuracy": 0.7601598501205444, "num_tokens": 8705742.0, "step": 2404 }, { "entropy": 0.6043014973402023, "epoch": 2.2435837610825944, "grad_norm": 0.2818129062652588, "learning_rate": 0.0002, "loss": 0.5933, "mean_token_accuracy": 0.7643305659294128, "num_tokens": 8709336.0, "step": 2405 }, { "entropy": 0.5935932099819183, "epoch": 2.2445170321978534, "grad_norm": 0.38057941198349, "learning_rate": 0.0002, "loss": 0.604, "mean_token_accuracy": 0.7541938573122025, "num_tokens": 8712886.0, "step": 2406 }, { "entropy": 0.567374050617218, "epoch": 2.2454503033131124, "grad_norm": 0.3198612630367279, "learning_rate": 0.0002, "loss": 0.5695, "mean_token_accuracy": 0.7671309113502502, "num_tokens": 8716502.0, "step": 2407 }, { "entropy": 0.560381256043911, "epoch": 2.2463835744283713, "grad_norm": 0.3220510184764862, "learning_rate": 0.0002, "loss": 0.5527, "mean_token_accuracy": 0.7796934396028519, "num_tokens": 8720071.0, "step": 2408 }, { "entropy": 0.57096928358078, "epoch": 2.2473168455436303, "grad_norm": 0.2763105034828186, "learning_rate": 0.0002, "loss": 0.5637, "mean_token_accuracy": 0.7784620523452759, "num_tokens": 8723745.0, "step": 2409 }, { "entropy": 0.618706464767456, "epoch": 2.2482501166588893, "grad_norm": 0.33517491817474365, "learning_rate": 0.0002, "loss": 0.6278, "mean_token_accuracy": 0.75254325568676, "num_tokens": 8727428.0, "step": 2410 }, { "entropy": 0.5931539982557297, "epoch": 2.2491833877741483, "grad_norm": 0.29769837856292725, "learning_rate": 0.0002, "loss": 0.5945, "mean_token_accuracy": 0.7606174498796463, "num_tokens": 8731012.0, "step": 2411 }, { "entropy": 0.5563886910676956, "epoch": 2.2501166588894073, "grad_norm": 0.3468764126300812, "learning_rate": 0.0002, "loss": 0.5762, "mean_token_accuracy": 0.7705220878124237, "num_tokens": 8734429.0, "step": 2412 }, { "entropy": 0.5931205600500107, "epoch": 2.2510499300046662, "grad_norm": 0.35277047753334045, "learning_rate": 0.0002, "loss": 0.6104, "mean_token_accuracy": 0.753533199429512, "num_tokens": 8738007.0, "step": 2413 }, { "entropy": 0.5816841274499893, "epoch": 2.2519832011199252, "grad_norm": 0.29700765013694763, "learning_rate": 0.0002, "loss": 0.596, "mean_token_accuracy": 0.754295602440834, "num_tokens": 8741612.0, "step": 2414 }, { "entropy": 0.5941565781831741, "epoch": 2.252916472235184, "grad_norm": 0.2920217514038086, "learning_rate": 0.0002, "loss": 0.5907, "mean_token_accuracy": 0.7561419606208801, "num_tokens": 8745231.0, "step": 2415 }, { "entropy": 0.6096569895744324, "epoch": 2.253849743350443, "grad_norm": 0.2775484025478363, "learning_rate": 0.0002, "loss": 0.6004, "mean_token_accuracy": 0.7520378828048706, "num_tokens": 8748809.0, "step": 2416 }, { "entropy": 0.5850409716367722, "epoch": 2.254783014465702, "grad_norm": 0.24937494099140167, "learning_rate": 0.0002, "loss": 0.5872, "mean_token_accuracy": 0.7570434957742691, "num_tokens": 8752451.0, "step": 2417 }, { "entropy": 0.579177588224411, "epoch": 2.255716285580961, "grad_norm": 0.31068143248558044, "learning_rate": 0.0002, "loss": 0.5823, "mean_token_accuracy": 0.7618287801742554, "num_tokens": 8756070.0, "step": 2418 }, { "entropy": 0.5757591128349304, "epoch": 2.25664955669622, "grad_norm": 0.27199822664260864, "learning_rate": 0.0002, "loss": 0.5712, "mean_token_accuracy": 0.7652783542871475, "num_tokens": 8759743.0, "step": 2419 }, { "entropy": 0.5885882526636124, "epoch": 2.257582827811479, "grad_norm": 0.26846224069595337, "learning_rate": 0.0002, "loss": 0.5931, "mean_token_accuracy": 0.7581339180469513, "num_tokens": 8763359.0, "step": 2420 }, { "entropy": 0.5574115365743637, "epoch": 2.258516098926738, "grad_norm": 0.25927427411079407, "learning_rate": 0.0002, "loss": 0.5614, "mean_token_accuracy": 0.767599031329155, "num_tokens": 8766885.0, "step": 2421 }, { "entropy": 0.5752872973680496, "epoch": 2.259449370041997, "grad_norm": 0.35603776574134827, "learning_rate": 0.0002, "loss": 0.5814, "mean_token_accuracy": 0.7652915120124817, "num_tokens": 8770399.0, "step": 2422 }, { "entropy": 0.5906836986541748, "epoch": 2.260382641157256, "grad_norm": 0.34193524718284607, "learning_rate": 0.0002, "loss": 0.5943, "mean_token_accuracy": 0.7604731321334839, "num_tokens": 8774058.0, "step": 2423 }, { "entropy": 0.5904323011636734, "epoch": 2.261315912272515, "grad_norm": 0.3334072232246399, "learning_rate": 0.0002, "loss": 0.5992, "mean_token_accuracy": 0.7622794955968857, "num_tokens": 8777581.0, "step": 2424 }, { "entropy": 0.5713632702827454, "epoch": 2.262249183387774, "grad_norm": 0.2777933180332184, "learning_rate": 0.0002, "loss": 0.5785, "mean_token_accuracy": 0.7662398666143417, "num_tokens": 8781257.0, "step": 2425 }, { "entropy": 0.55107182264328, "epoch": 2.263182454503033, "grad_norm": 0.24971358478069305, "learning_rate": 0.0002, "loss": 0.5561, "mean_token_accuracy": 0.7755829840898514, "num_tokens": 8784930.0, "step": 2426 }, { "entropy": 0.55837582051754, "epoch": 2.264115725618292, "grad_norm": 0.2433808594942093, "learning_rate": 0.0002, "loss": 0.5509, "mean_token_accuracy": 0.7768456339836121, "num_tokens": 8788525.0, "step": 2427 }, { "entropy": 0.62226901948452, "epoch": 2.265048996733551, "grad_norm": 0.2670277953147888, "learning_rate": 0.0002, "loss": 0.6204, "mean_token_accuracy": 0.7522156536579132, "num_tokens": 8792219.0, "step": 2428 }, { "entropy": 0.5490000173449516, "epoch": 2.26598226784881, "grad_norm": 0.2816324532032013, "learning_rate": 0.0002, "loss": 0.5636, "mean_token_accuracy": 0.7764046341180801, "num_tokens": 8795819.0, "step": 2429 }, { "entropy": 0.5988800227642059, "epoch": 2.266915538964069, "grad_norm": 0.28991881012916565, "learning_rate": 0.0002, "loss": 0.6132, "mean_token_accuracy": 0.750813290476799, "num_tokens": 8799410.0, "step": 2430 }, { "entropy": 0.5707672983407974, "epoch": 2.267848810079328, "grad_norm": 0.32226836681365967, "learning_rate": 0.0002, "loss": 0.5871, "mean_token_accuracy": 0.7658718079328537, "num_tokens": 8802954.0, "step": 2431 }, { "entropy": 0.6042834222316742, "epoch": 2.268782081194587, "grad_norm": 0.2765134572982788, "learning_rate": 0.0002, "loss": 0.6068, "mean_token_accuracy": 0.7578894048929214, "num_tokens": 8806563.0, "step": 2432 }, { "entropy": 0.5545040667057037, "epoch": 2.269715352309846, "grad_norm": 0.26694706082344055, "learning_rate": 0.0002, "loss": 0.5575, "mean_token_accuracy": 0.7743913382291794, "num_tokens": 8810107.0, "step": 2433 }, { "entropy": 0.5801795572042465, "epoch": 2.270648623425105, "grad_norm": 0.30860596895217896, "learning_rate": 0.0002, "loss": 0.5882, "mean_token_accuracy": 0.7600752860307693, "num_tokens": 8813723.0, "step": 2434 }, { "entropy": 0.6274842321872711, "epoch": 2.271581894540364, "grad_norm": 0.2847239077091217, "learning_rate": 0.0002, "loss": 0.616, "mean_token_accuracy": 0.7543263882398605, "num_tokens": 8817446.0, "step": 2435 }, { "entropy": 0.6281452924013138, "epoch": 2.272515165655623, "grad_norm": 0.2911607027053833, "learning_rate": 0.0002, "loss": 0.6374, "mean_token_accuracy": 0.7413156479597092, "num_tokens": 8821165.0, "step": 2436 }, { "entropy": 0.5999100059270859, "epoch": 2.273448436770882, "grad_norm": 0.3215242922306061, "learning_rate": 0.0002, "loss": 0.5935, "mean_token_accuracy": 0.7633186727762222, "num_tokens": 8824736.0, "step": 2437 }, { "entropy": 0.5536673069000244, "epoch": 2.274381707886141, "grad_norm": 0.31624835729599, "learning_rate": 0.0002, "loss": 0.5657, "mean_token_accuracy": 0.7665057480335236, "num_tokens": 8828351.0, "step": 2438 }, { "entropy": 0.5903618186712265, "epoch": 2.2753149790014, "grad_norm": 0.41006579995155334, "learning_rate": 0.0002, "loss": 0.6098, "mean_token_accuracy": 0.7585706412792206, "num_tokens": 8831948.0, "step": 2439 }, { "entropy": 0.6315478831529617, "epoch": 2.2762482501166588, "grad_norm": 0.3059978187084198, "learning_rate": 0.0002, "loss": 0.6346, "mean_token_accuracy": 0.7427516877651215, "num_tokens": 8835656.0, "step": 2440 }, { "entropy": 0.588816687464714, "epoch": 2.2771815212319177, "grad_norm": 0.327660471200943, "learning_rate": 0.0002, "loss": 0.5851, "mean_token_accuracy": 0.7654533535242081, "num_tokens": 8839335.0, "step": 2441 }, { "entropy": 0.61586694419384, "epoch": 2.2781147923471767, "grad_norm": 0.34916868805885315, "learning_rate": 0.0002, "loss": 0.6212, "mean_token_accuracy": 0.7466961592435837, "num_tokens": 8842894.0, "step": 2442 }, { "entropy": 0.5700227618217468, "epoch": 2.2790480634624357, "grad_norm": 0.3286486566066742, "learning_rate": 0.0002, "loss": 0.5684, "mean_token_accuracy": 0.7700330018997192, "num_tokens": 8846364.0, "step": 2443 }, { "entropy": 0.5897790640592575, "epoch": 2.2799813345776947, "grad_norm": 0.31103551387786865, "learning_rate": 0.0002, "loss": 0.5862, "mean_token_accuracy": 0.762577474117279, "num_tokens": 8849920.0, "step": 2444 }, { "entropy": 0.6110757291316986, "epoch": 2.2809146056929537, "grad_norm": 0.2325456440448761, "learning_rate": 0.0002, "loss": 0.6089, "mean_token_accuracy": 0.7533215135335922, "num_tokens": 8853587.0, "step": 2445 }, { "entropy": 0.580785259604454, "epoch": 2.2818478768082127, "grad_norm": 0.3094017207622528, "learning_rate": 0.0002, "loss": 0.5932, "mean_token_accuracy": 0.7609402686357498, "num_tokens": 8857165.0, "step": 2446 }, { "entropy": 0.5950721353292465, "epoch": 2.2827811479234716, "grad_norm": 0.2535117268562317, "learning_rate": 0.0002, "loss": 0.5877, "mean_token_accuracy": 0.7628819644451141, "num_tokens": 8860922.0, "step": 2447 }, { "entropy": 0.5825671851634979, "epoch": 2.2837144190387306, "grad_norm": 0.31300657987594604, "learning_rate": 0.0002, "loss": 0.6003, "mean_token_accuracy": 0.7590464055538177, "num_tokens": 8864461.0, "step": 2448 }, { "entropy": 0.609311655163765, "epoch": 2.2846476901539896, "grad_norm": 0.2971428632736206, "learning_rate": 0.0002, "loss": 0.6043, "mean_token_accuracy": 0.755622074007988, "num_tokens": 8868064.0, "step": 2449 }, { "entropy": 0.5924998968839645, "epoch": 2.2855809612692486, "grad_norm": 0.2576356828212738, "learning_rate": 0.0002, "loss": 0.5986, "mean_token_accuracy": 0.7580444812774658, "num_tokens": 8871658.0, "step": 2450 }, { "entropy": 0.5686187595129013, "epoch": 2.2865142323845076, "grad_norm": 0.2748410999774933, "learning_rate": 0.0002, "loss": 0.5632, "mean_token_accuracy": 0.769321084022522, "num_tokens": 8875172.0, "step": 2451 }, { "entropy": 0.5883881896734238, "epoch": 2.2874475034997666, "grad_norm": 0.3147693872451782, "learning_rate": 0.0002, "loss": 0.6028, "mean_token_accuracy": 0.750324010848999, "num_tokens": 8878706.0, "step": 2452 }, { "entropy": 0.6036804169416428, "epoch": 2.2883807746150255, "grad_norm": 0.29653409123420715, "learning_rate": 0.0002, "loss": 0.6019, "mean_token_accuracy": 0.7568544745445251, "num_tokens": 8882373.0, "step": 2453 }, { "entropy": 0.6186504513025284, "epoch": 2.2893140457302845, "grad_norm": 0.25972291827201843, "learning_rate": 0.0002, "loss": 0.609, "mean_token_accuracy": 0.7520485371351242, "num_tokens": 8886089.0, "step": 2454 }, { "entropy": 0.5837692767381668, "epoch": 2.2902473168455435, "grad_norm": 0.2987351417541504, "learning_rate": 0.0002, "loss": 0.5879, "mean_token_accuracy": 0.7639967203140259, "num_tokens": 8889680.0, "step": 2455 }, { "entropy": 0.5788727700710297, "epoch": 2.2911805879608025, "grad_norm": 0.33266058564186096, "learning_rate": 0.0002, "loss": 0.5887, "mean_token_accuracy": 0.7649902701377869, "num_tokens": 8893309.0, "step": 2456 }, { "entropy": 0.5669997334480286, "epoch": 2.2921138590760615, "grad_norm": 0.3057156801223755, "learning_rate": 0.0002, "loss": 0.5703, "mean_token_accuracy": 0.7677110284566879, "num_tokens": 8896997.0, "step": 2457 }, { "entropy": 0.5712777972221375, "epoch": 2.2930471301913204, "grad_norm": 0.31062835454940796, "learning_rate": 0.0002, "loss": 0.5947, "mean_token_accuracy": 0.7630815207958221, "num_tokens": 8900497.0, "step": 2458 }, { "entropy": 0.6290003508329391, "epoch": 2.2939804013065794, "grad_norm": 0.38012242317199707, "learning_rate": 0.0002, "loss": 0.6447, "mean_token_accuracy": 0.7384316325187683, "num_tokens": 8904244.0, "step": 2459 }, { "entropy": 0.562022939324379, "epoch": 2.2949136724218384, "grad_norm": 0.2869296371936798, "learning_rate": 0.0002, "loss": 0.5522, "mean_token_accuracy": 0.7771805673837662, "num_tokens": 8907838.0, "step": 2460 }, { "entropy": 0.5628091096878052, "epoch": 2.2958469435370974, "grad_norm": 0.3334134817123413, "learning_rate": 0.0002, "loss": 0.5638, "mean_token_accuracy": 0.7631844282150269, "num_tokens": 8911465.0, "step": 2461 }, { "entropy": 0.5717372000217438, "epoch": 2.2967802146523564, "grad_norm": 0.2776024639606476, "learning_rate": 0.0002, "loss": 0.5676, "mean_token_accuracy": 0.7727221250534058, "num_tokens": 8915122.0, "step": 2462 }, { "entropy": 0.611750602722168, "epoch": 2.2977134857676154, "grad_norm": 0.3320581316947937, "learning_rate": 0.0002, "loss": 0.599, "mean_token_accuracy": 0.7601931691169739, "num_tokens": 8918747.0, "step": 2463 }, { "entropy": 0.5866394937038422, "epoch": 2.2986467568828743, "grad_norm": 0.3225594162940979, "learning_rate": 0.0002, "loss": 0.5877, "mean_token_accuracy": 0.7636179774999619, "num_tokens": 8922391.0, "step": 2464 }, { "entropy": 0.6331189274787903, "epoch": 2.2995800279981333, "grad_norm": 0.2647189497947693, "learning_rate": 0.0002, "loss": 0.6302, "mean_token_accuracy": 0.7407966256141663, "num_tokens": 8926132.0, "step": 2465 }, { "entropy": 0.567930743098259, "epoch": 2.3005132991133923, "grad_norm": 0.34711384773254395, "learning_rate": 0.0002, "loss": 0.5779, "mean_token_accuracy": 0.7653175741434097, "num_tokens": 8929733.0, "step": 2466 }, { "entropy": 0.5955432057380676, "epoch": 2.3014465702286513, "grad_norm": 0.3446289300918579, "learning_rate": 0.0002, "loss": 0.6053, "mean_token_accuracy": 0.7558828145265579, "num_tokens": 8933328.0, "step": 2467 }, { "entropy": 0.5855847150087357, "epoch": 2.3023798413439103, "grad_norm": 0.34118103981018066, "learning_rate": 0.0002, "loss": 0.5986, "mean_token_accuracy": 0.7583454251289368, "num_tokens": 8936938.0, "step": 2468 }, { "entropy": 0.5620356202125549, "epoch": 2.3033131124591693, "grad_norm": 0.29968059062957764, "learning_rate": 0.0002, "loss": 0.5735, "mean_token_accuracy": 0.7716311365365982, "num_tokens": 8940417.0, "step": 2469 }, { "entropy": 0.6205061376094818, "epoch": 2.3042463835744282, "grad_norm": 0.3577302396297455, "learning_rate": 0.0002, "loss": 0.6213, "mean_token_accuracy": 0.7490758895874023, "num_tokens": 8944047.0, "step": 2470 }, { "entropy": 0.6311260163784027, "epoch": 2.305179654689687, "grad_norm": 0.3039022386074066, "learning_rate": 0.0002, "loss": 0.6351, "mean_token_accuracy": 0.7430185824632645, "num_tokens": 8947580.0, "step": 2471 }, { "entropy": 0.6073903739452362, "epoch": 2.306112925804946, "grad_norm": 0.2801710069179535, "learning_rate": 0.0002, "loss": 0.5907, "mean_token_accuracy": 0.7594807296991348, "num_tokens": 8951217.0, "step": 2472 }, { "entropy": 0.6138640195131302, "epoch": 2.307046196920205, "grad_norm": 0.24995014071464539, "learning_rate": 0.0002, "loss": 0.6115, "mean_token_accuracy": 0.7502348124980927, "num_tokens": 8954880.0, "step": 2473 }, { "entropy": 0.5881125181913376, "epoch": 2.307979468035464, "grad_norm": 0.3120088279247284, "learning_rate": 0.0002, "loss": 0.6057, "mean_token_accuracy": 0.757205456495285, "num_tokens": 8958658.0, "step": 2474 }, { "entropy": 0.6119008660316467, "epoch": 2.308912739150723, "grad_norm": 0.3292243480682373, "learning_rate": 0.0002, "loss": 0.6226, "mean_token_accuracy": 0.7545085549354553, "num_tokens": 8962231.0, "step": 2475 }, { "entropy": 0.5541795790195465, "epoch": 2.309846010265982, "grad_norm": 0.30222442746162415, "learning_rate": 0.0002, "loss": 0.5614, "mean_token_accuracy": 0.7722873240709305, "num_tokens": 8965872.0, "step": 2476 }, { "entropy": 0.6084976047277451, "epoch": 2.310779281381241, "grad_norm": 0.2928650379180908, "learning_rate": 0.0002, "loss": 0.6093, "mean_token_accuracy": 0.7537739425897598, "num_tokens": 8969667.0, "step": 2477 }, { "entropy": 0.5966882109642029, "epoch": 2.3117125524965, "grad_norm": 0.25600939989089966, "learning_rate": 0.0002, "loss": 0.5986, "mean_token_accuracy": 0.7598296999931335, "num_tokens": 8973314.0, "step": 2478 }, { "entropy": 0.6095393300056458, "epoch": 2.312645823611759, "grad_norm": 0.26674020290374756, "learning_rate": 0.0002, "loss": 0.6045, "mean_token_accuracy": 0.757060319185257, "num_tokens": 8976927.0, "step": 2479 }, { "entropy": 0.6230290681123734, "epoch": 2.313579094727018, "grad_norm": 0.25754472613334656, "learning_rate": 0.0002, "loss": 0.6276, "mean_token_accuracy": 0.7463133931159973, "num_tokens": 8980654.0, "step": 2480 }, { "entropy": 0.5728166699409485, "epoch": 2.314512365842277, "grad_norm": 0.2595420181751251, "learning_rate": 0.0002, "loss": 0.569, "mean_token_accuracy": 0.7663120627403259, "num_tokens": 8984274.0, "step": 2481 }, { "entropy": 0.603045329451561, "epoch": 2.315445636957536, "grad_norm": 0.28609901666641235, "learning_rate": 0.0002, "loss": 0.6071, "mean_token_accuracy": 0.7572083920240402, "num_tokens": 8987854.0, "step": 2482 }, { "entropy": 0.5860645622014999, "epoch": 2.316378908072795, "grad_norm": 0.29366666078567505, "learning_rate": 0.0002, "loss": 0.6009, "mean_token_accuracy": 0.7573997676372528, "num_tokens": 8991429.0, "step": 2483 }, { "entropy": 0.6192111670970917, "epoch": 2.317312179188054, "grad_norm": 0.34540989995002747, "learning_rate": 0.0002, "loss": 0.6195, "mean_token_accuracy": 0.7527506798505783, "num_tokens": 8995021.0, "step": 2484 }, { "entropy": 0.566005066037178, "epoch": 2.318245450303313, "grad_norm": 0.2696877717971802, "learning_rate": 0.0002, "loss": 0.5614, "mean_token_accuracy": 0.7796875089406967, "num_tokens": 8998570.0, "step": 2485 }, { "entropy": 0.6202348321676254, "epoch": 2.319178721418572, "grad_norm": 0.3402385413646698, "learning_rate": 0.0002, "loss": 0.6313, "mean_token_accuracy": 0.7460505664348602, "num_tokens": 9002226.0, "step": 2486 }, { "entropy": 0.5713826790452003, "epoch": 2.320111992533831, "grad_norm": 0.334403395652771, "learning_rate": 0.0002, "loss": 0.5829, "mean_token_accuracy": 0.7712581157684326, "num_tokens": 9005750.0, "step": 2487 }, { "entropy": 0.5684665143489838, "epoch": 2.32104526364909, "grad_norm": 0.260640412569046, "learning_rate": 0.0002, "loss": 0.5732, "mean_token_accuracy": 0.765202209353447, "num_tokens": 9009353.0, "step": 2488 }, { "entropy": 0.5431803911924362, "epoch": 2.321978534764349, "grad_norm": 0.33397167921066284, "learning_rate": 0.0002, "loss": 0.5549, "mean_token_accuracy": 0.7765693962574005, "num_tokens": 9012916.0, "step": 2489 }, { "entropy": 0.5749883502721786, "epoch": 2.322911805879608, "grad_norm": 0.28098371624946594, "learning_rate": 0.0002, "loss": 0.5769, "mean_token_accuracy": 0.766113206744194, "num_tokens": 9016626.0, "step": 2490 }, { "entropy": 0.5872912853956223, "epoch": 2.323845076994867, "grad_norm": 0.2577754855155945, "learning_rate": 0.0002, "loss": 0.5933, "mean_token_accuracy": 0.764400064945221, "num_tokens": 9020228.0, "step": 2491 }, { "entropy": 0.6081327944993973, "epoch": 2.324778348110126, "grad_norm": 0.27423807978630066, "learning_rate": 0.0002, "loss": 0.605, "mean_token_accuracy": 0.7539316117763519, "num_tokens": 9023957.0, "step": 2492 }, { "entropy": 0.5593881756067276, "epoch": 2.325711619225385, "grad_norm": 0.25664687156677246, "learning_rate": 0.0002, "loss": 0.5546, "mean_token_accuracy": 0.7769625633955002, "num_tokens": 9027571.0, "step": 2493 }, { "entropy": 0.5978922843933105, "epoch": 2.326644890340644, "grad_norm": 0.3110886812210083, "learning_rate": 0.0002, "loss": 0.5912, "mean_token_accuracy": 0.7614462673664093, "num_tokens": 9031163.0, "step": 2494 }, { "entropy": 0.6003994941711426, "epoch": 2.327578161455903, "grad_norm": 0.28274962306022644, "learning_rate": 0.0002, "loss": 0.6012, "mean_token_accuracy": 0.7621183097362518, "num_tokens": 9034703.0, "step": 2495 }, { "entropy": 0.6112218499183655, "epoch": 2.3285114325711618, "grad_norm": 0.30579259991645813, "learning_rate": 0.0002, "loss": 0.6098, "mean_token_accuracy": 0.7600456029176712, "num_tokens": 9038371.0, "step": 2496 }, { "entropy": 0.6339983940124512, "epoch": 2.3294447036864208, "grad_norm": 0.3267880082130432, "learning_rate": 0.0002, "loss": 0.6405, "mean_token_accuracy": 0.7490425407886505, "num_tokens": 9042034.0, "step": 2497 }, { "entropy": 0.5680512338876724, "epoch": 2.3303779748016797, "grad_norm": 0.34162411093711853, "learning_rate": 0.0002, "loss": 0.5757, "mean_token_accuracy": 0.7696889042854309, "num_tokens": 9045586.0, "step": 2498 }, { "entropy": 0.6122363656759262, "epoch": 2.3313112459169387, "grad_norm": 0.32041099667549133, "learning_rate": 0.0002, "loss": 0.6299, "mean_token_accuracy": 0.7427248507738113, "num_tokens": 9049130.0, "step": 2499 }, { "entropy": 0.5883697420358658, "epoch": 2.3322445170321977, "grad_norm": 0.31473273038864136, "learning_rate": 0.0002, "loss": 0.5944, "mean_token_accuracy": 0.762309193611145, "num_tokens": 9052808.0, "step": 2500 }, { "entropy": 0.5923911035060883, "epoch": 2.3331777881474567, "grad_norm": 0.29519784450531006, "learning_rate": 0.0002, "loss": 0.5956, "mean_token_accuracy": 0.7538354098796844, "num_tokens": 9056487.0, "step": 2501 }, { "entropy": 0.5801538974046707, "epoch": 2.3341110592627157, "grad_norm": 0.3242976665496826, "learning_rate": 0.0002, "loss": 0.5826, "mean_token_accuracy": 0.7652087509632111, "num_tokens": 9060030.0, "step": 2502 }, { "entropy": 0.5795683115720749, "epoch": 2.3350443303779747, "grad_norm": 0.2889755368232727, "learning_rate": 0.0002, "loss": 0.5783, "mean_token_accuracy": 0.7631170898675919, "num_tokens": 9063709.0, "step": 2503 }, { "entropy": 0.5861436873674393, "epoch": 2.3359776014932336, "grad_norm": 0.2635176479816437, "learning_rate": 0.0002, "loss": 0.5832, "mean_token_accuracy": 0.7619268000125885, "num_tokens": 9067344.0, "step": 2504 }, { "entropy": 0.6092338263988495, "epoch": 2.3369108726084926, "grad_norm": 0.26501232385635376, "learning_rate": 0.0002, "loss": 0.6092, "mean_token_accuracy": 0.753710612654686, "num_tokens": 9071089.0, "step": 2505 }, { "entropy": 0.6131794452667236, "epoch": 2.3378441437237516, "grad_norm": 0.25359922647476196, "learning_rate": 0.0002, "loss": 0.614, "mean_token_accuracy": 0.751650258898735, "num_tokens": 9074706.0, "step": 2506 }, { "entropy": 0.5891621559858322, "epoch": 2.3387774148390106, "grad_norm": 0.30741292238235474, "learning_rate": 0.0002, "loss": 0.5918, "mean_token_accuracy": 0.7598507702350616, "num_tokens": 9078303.0, "step": 2507 }, { "entropy": 0.5679450333118439, "epoch": 2.3397106859542696, "grad_norm": 0.2673749625682831, "learning_rate": 0.0002, "loss": 0.5704, "mean_token_accuracy": 0.7627439945936203, "num_tokens": 9081878.0, "step": 2508 }, { "entropy": 0.5560851693153381, "epoch": 2.3406439570695285, "grad_norm": 0.31030702590942383, "learning_rate": 0.0002, "loss": 0.5587, "mean_token_accuracy": 0.7682564407587051, "num_tokens": 9085476.0, "step": 2509 }, { "entropy": 0.6462197452783585, "epoch": 2.3415772281847875, "grad_norm": 0.32137447595596313, "learning_rate": 0.0002, "loss": 0.6369, "mean_token_accuracy": 0.7366510480642319, "num_tokens": 9089038.0, "step": 2510 }, { "entropy": 0.5844675451517105, "epoch": 2.3425104993000465, "grad_norm": 0.29787537455558777, "learning_rate": 0.0002, "loss": 0.5843, "mean_token_accuracy": 0.7598829865455627, "num_tokens": 9092627.0, "step": 2511 }, { "entropy": 0.6004132032394409, "epoch": 2.3434437704153055, "grad_norm": 0.33569326996803284, "learning_rate": 0.0002, "loss": 0.5993, "mean_token_accuracy": 0.7580690085887909, "num_tokens": 9096202.0, "step": 2512 }, { "entropy": 0.5982140004634857, "epoch": 2.3443770415305645, "grad_norm": 0.3784014582633972, "learning_rate": 0.0002, "loss": 0.6091, "mean_token_accuracy": 0.7456049770116806, "num_tokens": 9099822.0, "step": 2513 }, { "entropy": 0.5650241523981094, "epoch": 2.3453103126458235, "grad_norm": 0.30553844571113586, "learning_rate": 0.0002, "loss": 0.5674, "mean_token_accuracy": 0.7731841206550598, "num_tokens": 9103462.0, "step": 2514 }, { "entropy": 0.6145084053277969, "epoch": 2.3462435837610824, "grad_norm": 0.3670195937156677, "learning_rate": 0.0002, "loss": 0.6311, "mean_token_accuracy": 0.7386458367109299, "num_tokens": 9107193.0, "step": 2515 }, { "entropy": 0.5833791345357895, "epoch": 2.3471768548763414, "grad_norm": 0.3553655445575714, "learning_rate": 0.0002, "loss": 0.6065, "mean_token_accuracy": 0.7586190849542618, "num_tokens": 9110891.0, "step": 2516 }, { "entropy": 0.6191007196903229, "epoch": 2.3481101259916004, "grad_norm": 0.2823416590690613, "learning_rate": 0.0002, "loss": 0.6262, "mean_token_accuracy": 0.7421327382326126, "num_tokens": 9114472.0, "step": 2517 }, { "entropy": 0.5860526263713837, "epoch": 2.3490433971068594, "grad_norm": 0.25715529918670654, "learning_rate": 0.0002, "loss": 0.5725, "mean_token_accuracy": 0.7717370837926865, "num_tokens": 9118166.0, "step": 2518 }, { "entropy": 0.6126139163970947, "epoch": 2.3499766682221184, "grad_norm": 0.2673700153827667, "learning_rate": 0.0002, "loss": 0.5995, "mean_token_accuracy": 0.7587346136569977, "num_tokens": 9121791.0, "step": 2519 }, { "entropy": 0.6296816021203995, "epoch": 2.3509099393373774, "grad_norm": 0.26972007751464844, "learning_rate": 0.0002, "loss": 0.617, "mean_token_accuracy": 0.7513338774442673, "num_tokens": 9125555.0, "step": 2520 }, { "entropy": 0.6214188486337662, "epoch": 2.3518432104526363, "grad_norm": 0.3072670102119446, "learning_rate": 0.0002, "loss": 0.6174, "mean_token_accuracy": 0.7502973973751068, "num_tokens": 9129143.0, "step": 2521 }, { "entropy": 0.6220842450857162, "epoch": 2.3527764815678953, "grad_norm": 0.2993640899658203, "learning_rate": 0.0002, "loss": 0.6279, "mean_token_accuracy": 0.7459362298250198, "num_tokens": 9132791.0, "step": 2522 }, { "entropy": 0.6037693917751312, "epoch": 2.3537097526831543, "grad_norm": 0.27492567896842957, "learning_rate": 0.0002, "loss": 0.606, "mean_token_accuracy": 0.7538044899702072, "num_tokens": 9136371.0, "step": 2523 }, { "entropy": 0.5611698478460312, "epoch": 2.3546430237984133, "grad_norm": 0.32844001054763794, "learning_rate": 0.0002, "loss": 0.5768, "mean_token_accuracy": 0.76678267121315, "num_tokens": 9139918.0, "step": 2524 }, { "entropy": 0.5915271490812302, "epoch": 2.3555762949136723, "grad_norm": 0.3908126652240753, "learning_rate": 0.0002, "loss": 0.6104, "mean_token_accuracy": 0.7595698535442352, "num_tokens": 9143488.0, "step": 2525 }, { "entropy": 0.6121900677680969, "epoch": 2.3565095660289312, "grad_norm": 0.2963161766529083, "learning_rate": 0.0002, "loss": 0.6238, "mean_token_accuracy": 0.7511444538831711, "num_tokens": 9147139.0, "step": 2526 }, { "entropy": 0.5916421562433243, "epoch": 2.3574428371441902, "grad_norm": 0.2943776845932007, "learning_rate": 0.0002, "loss": 0.5889, "mean_token_accuracy": 0.76236192882061, "num_tokens": 9150815.0, "step": 2527 }, { "entropy": 0.6053974628448486, "epoch": 2.358376108259449, "grad_norm": 0.2989824712276459, "learning_rate": 0.0002, "loss": 0.5997, "mean_token_accuracy": 0.7582410871982574, "num_tokens": 9154318.0, "step": 2528 }, { "entropy": 0.6004398316144943, "epoch": 2.359309379374708, "grad_norm": 0.3728059232234955, "learning_rate": 0.0002, "loss": 0.612, "mean_token_accuracy": 0.7559059709310532, "num_tokens": 9157887.0, "step": 2529 }, { "entropy": 0.5330690890550613, "epoch": 2.360242650489967, "grad_norm": 0.2690599262714386, "learning_rate": 0.0002, "loss": 0.5309, "mean_token_accuracy": 0.7835265845060349, "num_tokens": 9161413.0, "step": 2530 }, { "entropy": 0.6078470349311829, "epoch": 2.361175921605226, "grad_norm": 0.29644522070884705, "learning_rate": 0.0002, "loss": 0.6132, "mean_token_accuracy": 0.7508005350828171, "num_tokens": 9164846.0, "step": 2531 }, { "entropy": 0.5714403688907623, "epoch": 2.362109192720485, "grad_norm": 0.29532718658447266, "learning_rate": 0.0002, "loss": 0.5768, "mean_token_accuracy": 0.7675605863332748, "num_tokens": 9168350.0, "step": 2532 }, { "entropy": 0.5650284290313721, "epoch": 2.363042463835744, "grad_norm": 0.26354509592056274, "learning_rate": 0.0002, "loss": 0.5685, "mean_token_accuracy": 0.7688814848661423, "num_tokens": 9171975.0, "step": 2533 }, { "entropy": 0.5731722563505173, "epoch": 2.363975734951003, "grad_norm": 0.3021261394023895, "learning_rate": 0.0002, "loss": 0.5792, "mean_token_accuracy": 0.7678463757038116, "num_tokens": 9175503.0, "step": 2534 }, { "entropy": 0.567136749625206, "epoch": 2.364909006066262, "grad_norm": 0.3385653793811798, "learning_rate": 0.0002, "loss": 0.5699, "mean_token_accuracy": 0.7744714617729187, "num_tokens": 9179138.0, "step": 2535 }, { "entropy": 0.6130668520927429, "epoch": 2.365842277181521, "grad_norm": 0.33342158794403076, "learning_rate": 0.0002, "loss": 0.6273, "mean_token_accuracy": 0.7422554939985275, "num_tokens": 9182710.0, "step": 2536 }, { "entropy": 0.6083517074584961, "epoch": 2.36677554829678, "grad_norm": 0.27584657073020935, "learning_rate": 0.0002, "loss": 0.5998, "mean_token_accuracy": 0.7535440176725388, "num_tokens": 9186291.0, "step": 2537 }, { "entropy": 0.5653240233659744, "epoch": 2.367708819412039, "grad_norm": 0.3132845461368561, "learning_rate": 0.0002, "loss": 0.5586, "mean_token_accuracy": 0.7814210057258606, "num_tokens": 9189907.0, "step": 2538 }, { "entropy": 0.5849036276340485, "epoch": 2.368642090527298, "grad_norm": 0.2685871720314026, "learning_rate": 0.0002, "loss": 0.5842, "mean_token_accuracy": 0.7611515372991562, "num_tokens": 9193528.0, "step": 2539 }, { "entropy": 0.6319975852966309, "epoch": 2.369575361642557, "grad_norm": 0.293454647064209, "learning_rate": 0.0002, "loss": 0.6296, "mean_token_accuracy": 0.7426915764808655, "num_tokens": 9197199.0, "step": 2540 }, { "entropy": 0.6041073948144913, "epoch": 2.370508632757816, "grad_norm": 0.3482208251953125, "learning_rate": 0.0002, "loss": 0.603, "mean_token_accuracy": 0.7550292611122131, "num_tokens": 9200819.0, "step": 2541 }, { "entropy": 0.6244876980781555, "epoch": 2.371441903873075, "grad_norm": 0.3037407696247101, "learning_rate": 0.0002, "loss": 0.6329, "mean_token_accuracy": 0.7433865517377853, "num_tokens": 9204471.0, "step": 2542 }, { "entropy": 0.5531594753265381, "epoch": 2.372375174988334, "grad_norm": 0.30719155073165894, "learning_rate": 0.0002, "loss": 0.5588, "mean_token_accuracy": 0.7738870531320572, "num_tokens": 9208051.0, "step": 2543 }, { "entropy": 0.5590465366840363, "epoch": 2.373308446103593, "grad_norm": 0.31187307834625244, "learning_rate": 0.0002, "loss": 0.5736, "mean_token_accuracy": 0.7710327953100204, "num_tokens": 9211673.0, "step": 2544 }, { "entropy": 0.5361663326621056, "epoch": 2.374241717218852, "grad_norm": 0.3176564574241638, "learning_rate": 0.0002, "loss": 0.5563, "mean_token_accuracy": 0.7794159948825836, "num_tokens": 9215159.0, "step": 2545 }, { "entropy": 0.5876544117927551, "epoch": 2.375174988334111, "grad_norm": 0.29842808842658997, "learning_rate": 0.0002, "loss": 0.5944, "mean_token_accuracy": 0.7586546093225479, "num_tokens": 9218589.0, "step": 2546 }, { "entropy": 0.5865459889173508, "epoch": 2.37610825944937, "grad_norm": 0.3105543553829193, "learning_rate": 0.0002, "loss": 0.5899, "mean_token_accuracy": 0.764804944396019, "num_tokens": 9222187.0, "step": 2547 }, { "entropy": 0.5869432091712952, "epoch": 2.377041530564629, "grad_norm": 0.2627582550048828, "learning_rate": 0.0002, "loss": 0.5901, "mean_token_accuracy": 0.7622543275356293, "num_tokens": 9225888.0, "step": 2548 }, { "entropy": 0.6560425460338593, "epoch": 2.377974801679888, "grad_norm": 0.294074684381485, "learning_rate": 0.0002, "loss": 0.6491, "mean_token_accuracy": 0.7308540791273117, "num_tokens": 9229447.0, "step": 2549 }, { "entropy": 0.6099931001663208, "epoch": 2.378908072795147, "grad_norm": 0.29543930292129517, "learning_rate": 0.0002, "loss": 0.6093, "mean_token_accuracy": 0.7492469102144241, "num_tokens": 9233036.0, "step": 2550 }, { "entropy": 0.5951214730739594, "epoch": 2.379841343910406, "grad_norm": 0.30387362837791443, "learning_rate": 0.0002, "loss": 0.594, "mean_token_accuracy": 0.7606094479560852, "num_tokens": 9236510.0, "step": 2551 }, { "entropy": 0.6294455081224442, "epoch": 2.380774615025665, "grad_norm": 0.2697882056236267, "learning_rate": 0.0002, "loss": 0.6211, "mean_token_accuracy": 0.7538290470838547, "num_tokens": 9240129.0, "step": 2552 }, { "entropy": 0.624773234128952, "epoch": 2.3817078861409238, "grad_norm": 0.27788853645324707, "learning_rate": 0.0002, "loss": 0.6198, "mean_token_accuracy": 0.7493225038051605, "num_tokens": 9243774.0, "step": 2553 }, { "entropy": 0.5943918004631996, "epoch": 2.3826411572561828, "grad_norm": 0.3039112389087677, "learning_rate": 0.0002, "loss": 0.603, "mean_token_accuracy": 0.7577492743730545, "num_tokens": 9247269.0, "step": 2554 }, { "entropy": 0.5959509611129761, "epoch": 2.3835744283714417, "grad_norm": 0.3299744427204132, "learning_rate": 0.0002, "loss": 0.6125, "mean_token_accuracy": 0.758970707654953, "num_tokens": 9250891.0, "step": 2555 }, { "entropy": 0.6057161837816238, "epoch": 2.3845076994867007, "grad_norm": 0.34155356884002686, "learning_rate": 0.0002, "loss": 0.6226, "mean_token_accuracy": 0.7480152398347855, "num_tokens": 9254652.0, "step": 2556 }, { "entropy": 0.6384190171957016, "epoch": 2.3854409706019597, "grad_norm": 0.37231922149658203, "learning_rate": 0.0002, "loss": 0.6479, "mean_token_accuracy": 0.7419425398111343, "num_tokens": 9258287.0, "step": 2557 }, { "entropy": 0.6095262169837952, "epoch": 2.3863742417172187, "grad_norm": 0.30435192584991455, "learning_rate": 0.0002, "loss": 0.6152, "mean_token_accuracy": 0.7562910914421082, "num_tokens": 9261853.0, "step": 2558 }, { "entropy": 0.6166697591543198, "epoch": 2.3873075128324777, "grad_norm": 0.30147379636764526, "learning_rate": 0.0002, "loss": 0.6179, "mean_token_accuracy": 0.7512940466403961, "num_tokens": 9265487.0, "step": 2559 }, { "entropy": 0.5883072465658188, "epoch": 2.3882407839477366, "grad_norm": 0.3056500256061554, "learning_rate": 0.0002, "loss": 0.5843, "mean_token_accuracy": 0.7599428594112396, "num_tokens": 9269091.0, "step": 2560 }, { "entropy": 0.6304004788398743, "epoch": 2.3891740550629956, "grad_norm": 0.2971253991127014, "learning_rate": 0.0002, "loss": 0.616, "mean_token_accuracy": 0.7501646280288696, "num_tokens": 9272715.0, "step": 2561 }, { "entropy": 0.5749754756689072, "epoch": 2.3901073261782546, "grad_norm": 0.2298806607723236, "learning_rate": 0.0002, "loss": 0.5722, "mean_token_accuracy": 0.7637462466955185, "num_tokens": 9276296.0, "step": 2562 }, { "entropy": 0.6169842928647995, "epoch": 2.3910405972935136, "grad_norm": 0.24139605462551117, "learning_rate": 0.0002, "loss": 0.6156, "mean_token_accuracy": 0.7474232316017151, "num_tokens": 9279955.0, "step": 2563 }, { "entropy": 0.6082211136817932, "epoch": 2.3919738684087726, "grad_norm": 0.3358626663684845, "learning_rate": 0.0002, "loss": 0.6149, "mean_token_accuracy": 0.7525102943181992, "num_tokens": 9283640.0, "step": 2564 }, { "entropy": 0.6034368276596069, "epoch": 2.3929071395240316, "grad_norm": 0.35686221718788147, "learning_rate": 0.0002, "loss": 0.6308, "mean_token_accuracy": 0.7519455850124359, "num_tokens": 9287213.0, "step": 2565 }, { "entropy": 0.5740625113248825, "epoch": 2.3938404106392905, "grad_norm": 0.2878904938697815, "learning_rate": 0.0002, "loss": 0.5856, "mean_token_accuracy": 0.7678066492080688, "num_tokens": 9290855.0, "step": 2566 }, { "entropy": 0.5667293667793274, "epoch": 2.3947736817545495, "grad_norm": 0.45161136984825134, "learning_rate": 0.0002, "loss": 0.5886, "mean_token_accuracy": 0.763294130563736, "num_tokens": 9294351.0, "step": 2567 }, { "entropy": 0.5688539147377014, "epoch": 2.3957069528698085, "grad_norm": 0.42621609568595886, "learning_rate": 0.0002, "loss": 0.5858, "mean_token_accuracy": 0.7639860063791275, "num_tokens": 9297899.0, "step": 2568 }, { "entropy": 0.6039485484361649, "epoch": 2.3966402239850675, "grad_norm": 0.3137003183364868, "learning_rate": 0.0002, "loss": 0.6027, "mean_token_accuracy": 0.7559025436639786, "num_tokens": 9301639.0, "step": 2569 }, { "entropy": 0.6027548909187317, "epoch": 2.3975734951003265, "grad_norm": 0.3110453486442566, "learning_rate": 0.0002, "loss": 0.5982, "mean_token_accuracy": 0.7553947120904922, "num_tokens": 9305421.0, "step": 2570 }, { "entropy": 0.5842466205358505, "epoch": 2.3985067662155855, "grad_norm": 0.2945517897605896, "learning_rate": 0.0002, "loss": 0.581, "mean_token_accuracy": 0.7679035365581512, "num_tokens": 9308998.0, "step": 2571 }, { "entropy": 0.6331669986248016, "epoch": 2.3994400373308444, "grad_norm": 0.26731064915657043, "learning_rate": 0.0002, "loss": 0.6216, "mean_token_accuracy": 0.7535108774900436, "num_tokens": 9312719.0, "step": 2572 }, { "entropy": 0.6067015379667282, "epoch": 2.4003733084461034, "grad_norm": 0.3188585340976715, "learning_rate": 0.0002, "loss": 0.5971, "mean_token_accuracy": 0.7580841481685638, "num_tokens": 9316328.0, "step": 2573 }, { "entropy": 0.6370298266410828, "epoch": 2.4013065795613624, "grad_norm": 0.3667255640029907, "learning_rate": 0.0002, "loss": 0.6452, "mean_token_accuracy": 0.7423640340566635, "num_tokens": 9320010.0, "step": 2574 }, { "entropy": 0.5760375112295151, "epoch": 2.4022398506766214, "grad_norm": 0.2687532305717468, "learning_rate": 0.0002, "loss": 0.5832, "mean_token_accuracy": 0.7610410004854202, "num_tokens": 9323729.0, "step": 2575 }, { "entropy": 0.6006486862897873, "epoch": 2.4031731217918804, "grad_norm": 0.29913872480392456, "learning_rate": 0.0002, "loss": 0.6, "mean_token_accuracy": 0.7542074471712112, "num_tokens": 9327398.0, "step": 2576 }, { "entropy": 0.5664404034614563, "epoch": 2.4041063929071393, "grad_norm": 0.3112587630748749, "learning_rate": 0.0002, "loss": 0.5859, "mean_token_accuracy": 0.7645968049764633, "num_tokens": 9330966.0, "step": 2577 }, { "entropy": 0.5528246313333511, "epoch": 2.4050396640223983, "grad_norm": 0.3641952872276306, "learning_rate": 0.0002, "loss": 0.5598, "mean_token_accuracy": 0.7799306213855743, "num_tokens": 9334392.0, "step": 2578 }, { "entropy": 0.6257403790950775, "epoch": 2.4059729351376573, "grad_norm": 0.33556538820266724, "learning_rate": 0.0002, "loss": 0.6268, "mean_token_accuracy": 0.7520910501480103, "num_tokens": 9337932.0, "step": 2579 }, { "entropy": 0.5922639667987823, "epoch": 2.4069062062529163, "grad_norm": 0.34974604845046997, "learning_rate": 0.0002, "loss": 0.6007, "mean_token_accuracy": 0.7526819854974747, "num_tokens": 9341553.0, "step": 2580 }, { "entropy": 0.6079245507717133, "epoch": 2.4078394773681753, "grad_norm": 0.3748161196708679, "learning_rate": 0.0002, "loss": 0.6273, "mean_token_accuracy": 0.7429895848035812, "num_tokens": 9345159.0, "step": 2581 }, { "entropy": 0.5722736120223999, "epoch": 2.4087727484834343, "grad_norm": 0.29851609468460083, "learning_rate": 0.0002, "loss": 0.5702, "mean_token_accuracy": 0.7723444253206253, "num_tokens": 9348815.0, "step": 2582 }, { "entropy": 0.5810171216726303, "epoch": 2.4097060195986932, "grad_norm": 0.2784387767314911, "learning_rate": 0.0002, "loss": 0.5703, "mean_token_accuracy": 0.7671367526054382, "num_tokens": 9352355.0, "step": 2583 }, { "entropy": 0.5603510290384293, "epoch": 2.410639290713952, "grad_norm": 0.340796560049057, "learning_rate": 0.0002, "loss": 0.5625, "mean_token_accuracy": 0.768856093287468, "num_tokens": 9355813.0, "step": 2584 }, { "entropy": 0.5919639021158218, "epoch": 2.411572561829211, "grad_norm": 0.2695472836494446, "learning_rate": 0.0002, "loss": 0.5849, "mean_token_accuracy": 0.7622827291488647, "num_tokens": 9359497.0, "step": 2585 }, { "entropy": 0.6013524383306503, "epoch": 2.41250583294447, "grad_norm": 0.34539055824279785, "learning_rate": 0.0002, "loss": 0.6035, "mean_token_accuracy": 0.7497496604919434, "num_tokens": 9363106.0, "step": 2586 }, { "entropy": 0.591686874628067, "epoch": 2.413439104059729, "grad_norm": 0.33606278896331787, "learning_rate": 0.0002, "loss": 0.5991, "mean_token_accuracy": 0.7544471919536591, "num_tokens": 9366760.0, "step": 2587 }, { "entropy": 0.5985051989555359, "epoch": 2.414372375174988, "grad_norm": 0.32337257266044617, "learning_rate": 0.0002, "loss": 0.6007, "mean_token_accuracy": 0.7586445212364197, "num_tokens": 9370359.0, "step": 2588 }, { "entropy": 0.5667757242918015, "epoch": 2.415305646290247, "grad_norm": 0.293844997882843, "learning_rate": 0.0002, "loss": 0.5768, "mean_token_accuracy": 0.7697674334049225, "num_tokens": 9374060.0, "step": 2589 }, { "entropy": 0.5992635488510132, "epoch": 2.416238917405506, "grad_norm": 0.38404959440231323, "learning_rate": 0.0002, "loss": 0.6327, "mean_token_accuracy": 0.7474267035722733, "num_tokens": 9377735.0, "step": 2590 }, { "entropy": 0.5987850576639175, "epoch": 2.417172188520765, "grad_norm": 0.34148189425468445, "learning_rate": 0.0002, "loss": 0.6096, "mean_token_accuracy": 0.7545128017663956, "num_tokens": 9381367.0, "step": 2591 }, { "entropy": 0.6376546323299408, "epoch": 2.418105459636024, "grad_norm": 0.3500854969024658, "learning_rate": 0.0002, "loss": 0.6352, "mean_token_accuracy": 0.746341660618782, "num_tokens": 9384993.0, "step": 2592 }, { "entropy": 0.6104214787483215, "epoch": 2.419038730751283, "grad_norm": 0.2949490249156952, "learning_rate": 0.0002, "loss": 0.6065, "mean_token_accuracy": 0.7546838819980621, "num_tokens": 9388578.0, "step": 2593 }, { "entropy": 0.5970562547445297, "epoch": 2.419972001866542, "grad_norm": 0.36592337489128113, "learning_rate": 0.0002, "loss": 0.5974, "mean_token_accuracy": 0.7613036036491394, "num_tokens": 9392183.0, "step": 2594 }, { "entropy": 0.610404297709465, "epoch": 2.420905272981801, "grad_norm": 0.3095610439777374, "learning_rate": 0.0002, "loss": 0.6075, "mean_token_accuracy": 0.7546307444572449, "num_tokens": 9395833.0, "step": 2595 }, { "entropy": 0.6145349442958832, "epoch": 2.42183854409706, "grad_norm": 0.2929363250732422, "learning_rate": 0.0002, "loss": 0.6178, "mean_token_accuracy": 0.7483824044466019, "num_tokens": 9399463.0, "step": 2596 }, { "entropy": 0.6067209392786026, "epoch": 2.422771815212319, "grad_norm": 0.28092947602272034, "learning_rate": 0.0002, "loss": 0.6049, "mean_token_accuracy": 0.7520033717155457, "num_tokens": 9403055.0, "step": 2597 }, { "entropy": 0.5946817547082901, "epoch": 2.423705086327578, "grad_norm": 0.29551443457603455, "learning_rate": 0.0002, "loss": 0.6038, "mean_token_accuracy": 0.7540600448846817, "num_tokens": 9406596.0, "step": 2598 }, { "entropy": 0.5943239033222198, "epoch": 2.424638357442837, "grad_norm": 0.23412026464939117, "learning_rate": 0.0002, "loss": 0.588, "mean_token_accuracy": 0.7647775411605835, "num_tokens": 9410292.0, "step": 2599 }, { "entropy": 0.5781925320625305, "epoch": 2.425571628558096, "grad_norm": 0.29205408692359924, "learning_rate": 0.0002, "loss": 0.5783, "mean_token_accuracy": 0.7662081569433212, "num_tokens": 9414006.0, "step": 2600 }, { "entropy": 0.6031884253025055, "epoch": 2.426504899673355, "grad_norm": 0.2707125246524811, "learning_rate": 0.0002, "loss": 0.6069, "mean_token_accuracy": 0.7489841729402542, "num_tokens": 9417634.0, "step": 2601 }, { "entropy": 0.6140481382608414, "epoch": 2.427438170788614, "grad_norm": 0.3825131356716156, "learning_rate": 0.0002, "loss": 0.6122, "mean_token_accuracy": 0.7530184984207153, "num_tokens": 9421233.0, "step": 2602 }, { "entropy": 0.5774415731430054, "epoch": 2.428371441903873, "grad_norm": 0.31967049837112427, "learning_rate": 0.0002, "loss": 0.5959, "mean_token_accuracy": 0.75945083796978, "num_tokens": 9424841.0, "step": 2603 }, { "entropy": 0.609619528055191, "epoch": 2.429304713019132, "grad_norm": 0.29764580726623535, "learning_rate": 0.0002, "loss": 0.6178, "mean_token_accuracy": 0.7509591579437256, "num_tokens": 9428595.0, "step": 2604 }, { "entropy": 0.577502578496933, "epoch": 2.430237984134391, "grad_norm": 0.29193219542503357, "learning_rate": 0.0002, "loss": 0.5925, "mean_token_accuracy": 0.7564556002616882, "num_tokens": 9432122.0, "step": 2605 }, { "entropy": 0.6113668233156204, "epoch": 2.43117125524965, "grad_norm": 0.2751844823360443, "learning_rate": 0.0002, "loss": 0.6117, "mean_token_accuracy": 0.750765934586525, "num_tokens": 9435683.0, "step": 2606 }, { "entropy": 0.5885588526725769, "epoch": 2.432104526364909, "grad_norm": 0.32234007120132446, "learning_rate": 0.0002, "loss": 0.6068, "mean_token_accuracy": 0.7490733861923218, "num_tokens": 9439117.0, "step": 2607 }, { "entropy": 0.5430319607257843, "epoch": 2.433037797480168, "grad_norm": 0.31094521284103394, "learning_rate": 0.0002, "loss": 0.5456, "mean_token_accuracy": 0.7814468145370483, "num_tokens": 9442645.0, "step": 2608 }, { "entropy": 0.5815542489290237, "epoch": 2.433971068595427, "grad_norm": 0.258327841758728, "learning_rate": 0.0002, "loss": 0.5719, "mean_token_accuracy": 0.7653660923242569, "num_tokens": 9446308.0, "step": 2609 }, { "entropy": 0.6008356362581253, "epoch": 2.4349043397106858, "grad_norm": 0.2923174202442169, "learning_rate": 0.0002, "loss": 0.6004, "mean_token_accuracy": 0.761135533452034, "num_tokens": 9449913.0, "step": 2610 }, { "entropy": 0.559976652264595, "epoch": 2.4358376108259447, "grad_norm": 0.32292047142982483, "learning_rate": 0.0002, "loss": 0.5665, "mean_token_accuracy": 0.7729199528694153, "num_tokens": 9453452.0, "step": 2611 }, { "entropy": 0.6299623548984528, "epoch": 2.4367708819412037, "grad_norm": 0.26324760913848877, "learning_rate": 0.0002, "loss": 0.6321, "mean_token_accuracy": 0.7450360357761383, "num_tokens": 9457021.0, "step": 2612 }, { "entropy": 0.6178702265024185, "epoch": 2.4377041530564627, "grad_norm": 0.27589547634124756, "learning_rate": 0.0002, "loss": 0.62, "mean_token_accuracy": 0.7530135214328766, "num_tokens": 9460764.0, "step": 2613 }, { "entropy": 0.629514291882515, "epoch": 2.4386374241717217, "grad_norm": 0.31923043727874756, "learning_rate": 0.0002, "loss": 0.6444, "mean_token_accuracy": 0.7422147840261459, "num_tokens": 9464389.0, "step": 2614 }, { "entropy": 0.618286743760109, "epoch": 2.4395706952869807, "grad_norm": 0.3066919147968292, "learning_rate": 0.0002, "loss": 0.6232, "mean_token_accuracy": 0.7454600185155869, "num_tokens": 9468063.0, "step": 2615 }, { "entropy": 0.6458250731229782, "epoch": 2.4405039664022397, "grad_norm": 0.2826118469238281, "learning_rate": 0.0002, "loss": 0.6435, "mean_token_accuracy": 0.7496414035558701, "num_tokens": 9471687.0, "step": 2616 }, { "entropy": 0.5934179574251175, "epoch": 2.4414372375174986, "grad_norm": 0.2699531614780426, "learning_rate": 0.0002, "loss": 0.5995, "mean_token_accuracy": 0.7541202753782272, "num_tokens": 9475223.0, "step": 2617 }, { "entropy": 0.5981999933719635, "epoch": 2.4423705086327576, "grad_norm": 0.29383018612861633, "learning_rate": 0.0002, "loss": 0.6048, "mean_token_accuracy": 0.7600056082010269, "num_tokens": 9478776.0, "step": 2618 }, { "entropy": 0.6108338981866837, "epoch": 2.4433037797480166, "grad_norm": 0.24552978575229645, "learning_rate": 0.0002, "loss": 0.6089, "mean_token_accuracy": 0.7530884593725204, "num_tokens": 9482479.0, "step": 2619 }, { "entropy": 0.6118176132440567, "epoch": 2.4442370508632756, "grad_norm": 0.2933366000652313, "learning_rate": 0.0002, "loss": 0.6201, "mean_token_accuracy": 0.746880829334259, "num_tokens": 9486162.0, "step": 2620 }, { "entropy": 0.5884489566087723, "epoch": 2.4451703219785346, "grad_norm": 0.3463361859321594, "learning_rate": 0.0002, "loss": 0.5957, "mean_token_accuracy": 0.7535686492919922, "num_tokens": 9489706.0, "step": 2621 }, { "entropy": 0.5595777332782745, "epoch": 2.4461035930937935, "grad_norm": 0.2986154854297638, "learning_rate": 0.0002, "loss": 0.5739, "mean_token_accuracy": 0.7629955559968948, "num_tokens": 9493331.0, "step": 2622 }, { "entropy": 0.5807541161775589, "epoch": 2.4470368642090525, "grad_norm": 0.27797773480415344, "learning_rate": 0.0002, "loss": 0.59, "mean_token_accuracy": 0.7563483715057373, "num_tokens": 9496941.0, "step": 2623 }, { "entropy": 0.583545058965683, "epoch": 2.4479701353243115, "grad_norm": 0.33167344331741333, "learning_rate": 0.0002, "loss": 0.59, "mean_token_accuracy": 0.7592030763626099, "num_tokens": 9500594.0, "step": 2624 }, { "entropy": 0.5919197052717209, "epoch": 2.4489034064395705, "grad_norm": 0.3240535259246826, "learning_rate": 0.0002, "loss": 0.6026, "mean_token_accuracy": 0.7504412829875946, "num_tokens": 9504229.0, "step": 2625 }, { "entropy": 0.5955404192209244, "epoch": 2.4498366775548295, "grad_norm": 0.30288854241371155, "learning_rate": 0.0002, "loss": 0.5941, "mean_token_accuracy": 0.7579644173383713, "num_tokens": 9507821.0, "step": 2626 }, { "entropy": 0.5505084320902824, "epoch": 2.4507699486700885, "grad_norm": 0.24607349932193756, "learning_rate": 0.0002, "loss": 0.5504, "mean_token_accuracy": 0.7810537219047546, "num_tokens": 9511367.0, "step": 2627 }, { "entropy": 0.573561891913414, "epoch": 2.4517032197853474, "grad_norm": 0.26076796650886536, "learning_rate": 0.0002, "loss": 0.5651, "mean_token_accuracy": 0.7746884673833847, "num_tokens": 9514969.0, "step": 2628 }, { "entropy": 0.6542039215564728, "epoch": 2.4526364909006064, "grad_norm": 0.2562912404537201, "learning_rate": 0.0002, "loss": 0.645, "mean_token_accuracy": 0.739281639456749, "num_tokens": 9518685.0, "step": 2629 }, { "entropy": 0.5562879592180252, "epoch": 2.4535697620158654, "grad_norm": 0.3844725787639618, "learning_rate": 0.0002, "loss": 0.5672, "mean_token_accuracy": 0.7699924856424332, "num_tokens": 9522082.0, "step": 2630 }, { "entropy": 0.6098922938108444, "epoch": 2.4545030331311244, "grad_norm": 0.2562113404273987, "learning_rate": 0.0002, "loss": 0.6099, "mean_token_accuracy": 0.7511230707168579, "num_tokens": 9525683.0, "step": 2631 }, { "entropy": 0.5639205425977707, "epoch": 2.4554363042463834, "grad_norm": 0.28749310970306396, "learning_rate": 0.0002, "loss": 0.5637, "mean_token_accuracy": 0.772371917963028, "num_tokens": 9529300.0, "step": 2632 }, { "entropy": 0.5985614210367203, "epoch": 2.4563695753616424, "grad_norm": 0.2960498332977295, "learning_rate": 0.0002, "loss": 0.6201, "mean_token_accuracy": 0.7533997595310211, "num_tokens": 9532883.0, "step": 2633 }, { "entropy": 0.571635439991951, "epoch": 2.4573028464769013, "grad_norm": 0.31166109442710876, "learning_rate": 0.0002, "loss": 0.5854, "mean_token_accuracy": 0.7607947140932083, "num_tokens": 9536437.0, "step": 2634 }, { "entropy": 0.5534617751836777, "epoch": 2.4582361175921603, "grad_norm": 0.2915644943714142, "learning_rate": 0.0002, "loss": 0.572, "mean_token_accuracy": 0.7654829919338226, "num_tokens": 9540018.0, "step": 2635 }, { "entropy": 0.5343586131930351, "epoch": 2.4591693887074193, "grad_norm": 0.3132517635822296, "learning_rate": 0.0002, "loss": 0.5319, "mean_token_accuracy": 0.7829297930002213, "num_tokens": 9543585.0, "step": 2636 }, { "entropy": 0.5823009163141251, "epoch": 2.4601026598226783, "grad_norm": 0.29371607303619385, "learning_rate": 0.0002, "loss": 0.5777, "mean_token_accuracy": 0.7625466883182526, "num_tokens": 9547331.0, "step": 2637 }, { "entropy": 0.5724675208330154, "epoch": 2.4610359309379373, "grad_norm": 0.27862030267715454, "learning_rate": 0.0002, "loss": 0.5767, "mean_token_accuracy": 0.7681071162223816, "num_tokens": 9551068.0, "step": 2638 }, { "entropy": 0.5873464792966843, "epoch": 2.4619692020531962, "grad_norm": 0.28167542815208435, "learning_rate": 0.0002, "loss": 0.581, "mean_token_accuracy": 0.7669133543968201, "num_tokens": 9554605.0, "step": 2639 }, { "entropy": 0.5903975814580917, "epoch": 2.4629024731684552, "grad_norm": 0.2571181356906891, "learning_rate": 0.0002, "loss": 0.5797, "mean_token_accuracy": 0.7682637870311737, "num_tokens": 9558240.0, "step": 2640 }, { "entropy": 0.5978406816720963, "epoch": 2.463835744283714, "grad_norm": 0.2527775764465332, "learning_rate": 0.0002, "loss": 0.5953, "mean_token_accuracy": 0.7632445991039276, "num_tokens": 9561868.0, "step": 2641 }, { "entropy": 0.6058532297611237, "epoch": 2.464769015398973, "grad_norm": 0.3171558082103729, "learning_rate": 0.0002, "loss": 0.6042, "mean_token_accuracy": 0.7502654790878296, "num_tokens": 9565396.0, "step": 2642 }, { "entropy": 0.5871748477220535, "epoch": 2.465702286514232, "grad_norm": 0.26127102971076965, "learning_rate": 0.0002, "loss": 0.5918, "mean_token_accuracy": 0.7599902004003525, "num_tokens": 9569036.0, "step": 2643 }, { "entropy": 0.5616967380046844, "epoch": 2.466635557629491, "grad_norm": 0.2566419541835785, "learning_rate": 0.0002, "loss": 0.569, "mean_token_accuracy": 0.7664889693260193, "num_tokens": 9572655.0, "step": 2644 }, { "entropy": 0.5949001312255859, "epoch": 2.46756882874475, "grad_norm": 0.2788521349430084, "learning_rate": 0.0002, "loss": 0.6068, "mean_token_accuracy": 0.7529981136322021, "num_tokens": 9576275.0, "step": 2645 }, { "entropy": 0.5791649520397186, "epoch": 2.468502099860009, "grad_norm": 0.32289692759513855, "learning_rate": 0.0002, "loss": 0.5844, "mean_token_accuracy": 0.7599729001522064, "num_tokens": 9579939.0, "step": 2646 }, { "entropy": 0.6038092523813248, "epoch": 2.469435370975268, "grad_norm": 0.30458492040634155, "learning_rate": 0.0002, "loss": 0.6026, "mean_token_accuracy": 0.7615799456834793, "num_tokens": 9583639.0, "step": 2647 }, { "entropy": 0.6283626705408096, "epoch": 2.470368642090527, "grad_norm": 0.3112182021141052, "learning_rate": 0.0002, "loss": 0.6406, "mean_token_accuracy": 0.7407815754413605, "num_tokens": 9587179.0, "step": 2648 }, { "entropy": 0.6298587024211884, "epoch": 2.471301913205786, "grad_norm": 0.3343373239040375, "learning_rate": 0.0002, "loss": 0.6426, "mean_token_accuracy": 0.7309030890464783, "num_tokens": 9590771.0, "step": 2649 }, { "entropy": 0.6157993823289871, "epoch": 2.472235184321045, "grad_norm": 0.3028816282749176, "learning_rate": 0.0002, "loss": 0.6166, "mean_token_accuracy": 0.7547494322061539, "num_tokens": 9594392.0, "step": 2650 }, { "entropy": 0.5390750914812088, "epoch": 2.473168455436304, "grad_norm": 0.303640216588974, "learning_rate": 0.0002, "loss": 0.5396, "mean_token_accuracy": 0.7799815833568573, "num_tokens": 9598042.0, "step": 2651 }, { "entropy": 0.6397410780191422, "epoch": 2.474101726551563, "grad_norm": 0.29868143796920776, "learning_rate": 0.0002, "loss": 0.6394, "mean_token_accuracy": 0.7374908924102783, "num_tokens": 9601600.0, "step": 2652 }, { "entropy": 0.651474118232727, "epoch": 2.475034997666822, "grad_norm": 0.26398158073425293, "learning_rate": 0.0002, "loss": 0.6485, "mean_token_accuracy": 0.7362889498472214, "num_tokens": 9605239.0, "step": 2653 }, { "entropy": 0.571497231721878, "epoch": 2.475968268782081, "grad_norm": 0.28389105200767517, "learning_rate": 0.0002, "loss": 0.5774, "mean_token_accuracy": 0.7678800374269485, "num_tokens": 9608830.0, "step": 2654 }, { "entropy": 0.6169499605894089, "epoch": 2.47690153989734, "grad_norm": 0.29709678888320923, "learning_rate": 0.0002, "loss": 0.6161, "mean_token_accuracy": 0.7511942088603973, "num_tokens": 9612380.0, "step": 2655 }, { "entropy": 0.6058224439620972, "epoch": 2.477834811012599, "grad_norm": 0.29210302233695984, "learning_rate": 0.0002, "loss": 0.6005, "mean_token_accuracy": 0.7516107857227325, "num_tokens": 9616080.0, "step": 2656 }, { "entropy": 0.6127380430698395, "epoch": 2.478768082127858, "grad_norm": 0.28571969270706177, "learning_rate": 0.0002, "loss": 0.617, "mean_token_accuracy": 0.7527180016040802, "num_tokens": 9619752.0, "step": 2657 }, { "entropy": 0.5259988158941269, "epoch": 2.479701353243117, "grad_norm": 0.35491856932640076, "learning_rate": 0.0002, "loss": 0.5516, "mean_token_accuracy": 0.7779395431280136, "num_tokens": 9623288.0, "step": 2658 }, { "entropy": 0.5878065973520279, "epoch": 2.480634624358376, "grad_norm": 0.3735830783843994, "learning_rate": 0.0002, "loss": 0.6028, "mean_token_accuracy": 0.7536479979753494, "num_tokens": 9626869.0, "step": 2659 }, { "entropy": 0.5583927631378174, "epoch": 2.481567895473635, "grad_norm": 0.2950437366962433, "learning_rate": 0.0002, "loss": 0.5626, "mean_token_accuracy": 0.7736140042543411, "num_tokens": 9630480.0, "step": 2660 }, { "entropy": 0.5819810330867767, "epoch": 2.482501166588894, "grad_norm": 0.3187326192855835, "learning_rate": 0.0002, "loss": 0.5785, "mean_token_accuracy": 0.7674976289272308, "num_tokens": 9634248.0, "step": 2661 }, { "entropy": 0.5814832597970963, "epoch": 2.483434437704153, "grad_norm": 0.3072148859500885, "learning_rate": 0.0002, "loss": 0.5897, "mean_token_accuracy": 0.7666526436805725, "num_tokens": 9637847.0, "step": 2662 }, { "entropy": 0.6216253787279129, "epoch": 2.484367708819412, "grad_norm": 0.28909048438072205, "learning_rate": 0.0002, "loss": 0.6304, "mean_token_accuracy": 0.7453842610120773, "num_tokens": 9641537.0, "step": 2663 }, { "entropy": 0.5943068265914917, "epoch": 2.485300979934671, "grad_norm": 0.30725449323654175, "learning_rate": 0.0002, "loss": 0.6011, "mean_token_accuracy": 0.756819099187851, "num_tokens": 9645139.0, "step": 2664 }, { "entropy": 0.5969606935977936, "epoch": 2.48623425104993, "grad_norm": 0.29603299498558044, "learning_rate": 0.0002, "loss": 0.59, "mean_token_accuracy": 0.7658668011426926, "num_tokens": 9648638.0, "step": 2665 }, { "entropy": 0.6015838831663132, "epoch": 2.4871675221651888, "grad_norm": 0.301362544298172, "learning_rate": 0.0002, "loss": 0.6054, "mean_token_accuracy": 0.7633672058582306, "num_tokens": 9652228.0, "step": 2666 }, { "entropy": 0.6293639242649078, "epoch": 2.4881007932804478, "grad_norm": 0.28427138924598694, "learning_rate": 0.0002, "loss": 0.6298, "mean_token_accuracy": 0.7454374581575394, "num_tokens": 9655816.0, "step": 2667 }, { "entropy": 0.5731652826070786, "epoch": 2.4890340643957067, "grad_norm": 0.3067905008792877, "learning_rate": 0.0002, "loss": 0.5823, "mean_token_accuracy": 0.7677087336778641, "num_tokens": 9659409.0, "step": 2668 }, { "entropy": 0.6084983348846436, "epoch": 2.4899673355109657, "grad_norm": 0.300523579120636, "learning_rate": 0.0002, "loss": 0.6198, "mean_token_accuracy": 0.7485455870628357, "num_tokens": 9662986.0, "step": 2669 }, { "entropy": 0.6558130085468292, "epoch": 2.4909006066262247, "grad_norm": 0.2923228442668915, "learning_rate": 0.0002, "loss": 0.6525, "mean_token_accuracy": 0.7330092191696167, "num_tokens": 9666708.0, "step": 2670 }, { "entropy": 0.6079660505056381, "epoch": 2.4918338777414837, "grad_norm": 0.2635451853275299, "learning_rate": 0.0002, "loss": 0.6033, "mean_token_accuracy": 0.7558840066194534, "num_tokens": 9670369.0, "step": 2671 }, { "entropy": 0.5946465134620667, "epoch": 2.4927671488567427, "grad_norm": 0.2790412902832031, "learning_rate": 0.0002, "loss": 0.585, "mean_token_accuracy": 0.7668284326791763, "num_tokens": 9674035.0, "step": 2672 }, { "entropy": 0.6077221184968948, "epoch": 2.4937004199720016, "grad_norm": 0.28883665800094604, "learning_rate": 0.0002, "loss": 0.6106, "mean_token_accuracy": 0.7531954050064087, "num_tokens": 9677723.0, "step": 2673 }, { "entropy": 0.5590933561325073, "epoch": 2.4946336910872606, "grad_norm": 0.3220762312412262, "learning_rate": 0.0002, "loss": 0.5547, "mean_token_accuracy": 0.7728161811828613, "num_tokens": 9681234.0, "step": 2674 }, { "entropy": 0.5939715057611465, "epoch": 2.4955669622025196, "grad_norm": 0.30765557289123535, "learning_rate": 0.0002, "loss": 0.602, "mean_token_accuracy": 0.7618031352758408, "num_tokens": 9684755.0, "step": 2675 }, { "entropy": 0.6036807745695114, "epoch": 2.4965002333177786, "grad_norm": 0.34909647703170776, "learning_rate": 0.0002, "loss": 0.6255, "mean_token_accuracy": 0.7411252856254578, "num_tokens": 9688424.0, "step": 2676 }, { "entropy": 0.598258763551712, "epoch": 2.4974335044330376, "grad_norm": 0.31488364934921265, "learning_rate": 0.0002, "loss": 0.612, "mean_token_accuracy": 0.7495478689670563, "num_tokens": 9692124.0, "step": 2677 }, { "entropy": 0.5768322497606277, "epoch": 2.4983667755482966, "grad_norm": 0.30930790305137634, "learning_rate": 0.0002, "loss": 0.595, "mean_token_accuracy": 0.7573429346084595, "num_tokens": 9695784.0, "step": 2678 }, { "entropy": 0.6049512028694153, "epoch": 2.4993000466635555, "grad_norm": 0.2913684546947479, "learning_rate": 0.0002, "loss": 0.6137, "mean_token_accuracy": 0.7520669549703598, "num_tokens": 9699408.0, "step": 2679 }, { "entropy": 0.6050483584403992, "epoch": 2.5002333177788145, "grad_norm": 0.25435054302215576, "learning_rate": 0.0002, "loss": 0.5976, "mean_token_accuracy": 0.7581223398447037, "num_tokens": 9703079.0, "step": 2680 }, { "entropy": 0.6073437035083771, "epoch": 2.5011665888940735, "grad_norm": 0.34802278876304626, "learning_rate": 0.0002, "loss": 0.6044, "mean_token_accuracy": 0.7508739084005356, "num_tokens": 9706738.0, "step": 2681 }, { "entropy": 0.5943589061498642, "epoch": 2.5020998600093325, "grad_norm": 0.2628326117992401, "learning_rate": 0.0002, "loss": 0.5845, "mean_token_accuracy": 0.7590198218822479, "num_tokens": 9710405.0, "step": 2682 }, { "entropy": 0.6074687242507935, "epoch": 2.5030331311245915, "grad_norm": 0.2546836733818054, "learning_rate": 0.0002, "loss": 0.5968, "mean_token_accuracy": 0.7579157650470734, "num_tokens": 9714119.0, "step": 2683 }, { "entropy": 0.6316413879394531, "epoch": 2.5039664022398505, "grad_norm": 0.30363863706588745, "learning_rate": 0.0002, "loss": 0.6274, "mean_token_accuracy": 0.7418618202209473, "num_tokens": 9717816.0, "step": 2684 }, { "entropy": 0.591644287109375, "epoch": 2.5048996733551094, "grad_norm": 0.27643123269081116, "learning_rate": 0.0002, "loss": 0.6079, "mean_token_accuracy": 0.7575828433036804, "num_tokens": 9721450.0, "step": 2685 }, { "entropy": 0.5777681916952133, "epoch": 2.5058329444703684, "grad_norm": 0.3376903533935547, "learning_rate": 0.0002, "loss": 0.5894, "mean_token_accuracy": 0.7660348117351532, "num_tokens": 9725019.0, "step": 2686 }, { "entropy": 0.5875403136014938, "epoch": 2.5067662155856274, "grad_norm": 0.2974793314933777, "learning_rate": 0.0002, "loss": 0.5923, "mean_token_accuracy": 0.7632418572902679, "num_tokens": 9728687.0, "step": 2687 }, { "entropy": 0.5747693628072739, "epoch": 2.5076994867008864, "grad_norm": 0.2684188783168793, "learning_rate": 0.0002, "loss": 0.5704, "mean_token_accuracy": 0.7720061093568802, "num_tokens": 9732444.0, "step": 2688 }, { "entropy": 0.5517778694629669, "epoch": 2.5086327578161454, "grad_norm": 0.3028607964515686, "learning_rate": 0.0002, "loss": 0.5612, "mean_token_accuracy": 0.770093709230423, "num_tokens": 9736104.0, "step": 2689 }, { "entropy": 0.5972714722156525, "epoch": 2.5095660289314043, "grad_norm": 0.2896239459514618, "learning_rate": 0.0002, "loss": 0.608, "mean_token_accuracy": 0.7593598961830139, "num_tokens": 9739754.0, "step": 2690 }, { "entropy": 0.5981622338294983, "epoch": 2.5104993000466633, "grad_norm": 0.3779999017715454, "learning_rate": 0.0002, "loss": 0.6113, "mean_token_accuracy": 0.7476693987846375, "num_tokens": 9743413.0, "step": 2691 }, { "entropy": 0.5845213234424591, "epoch": 2.5114325711619223, "grad_norm": 0.299356073141098, "learning_rate": 0.0002, "loss": 0.593, "mean_token_accuracy": 0.7592864781618118, "num_tokens": 9747059.0, "step": 2692 }, { "entropy": 0.6189575344324112, "epoch": 2.5123658422771813, "grad_norm": 0.2886812388896942, "learning_rate": 0.0002, "loss": 0.6261, "mean_token_accuracy": 0.7474416494369507, "num_tokens": 9750764.0, "step": 2693 }, { "entropy": 0.550584003329277, "epoch": 2.5132991133924403, "grad_norm": 0.3452458381652832, "learning_rate": 0.0002, "loss": 0.5592, "mean_token_accuracy": 0.7750629931688309, "num_tokens": 9754440.0, "step": 2694 }, { "entropy": 0.5965986549854279, "epoch": 2.5142323845076993, "grad_norm": 0.3272583484649658, "learning_rate": 0.0002, "loss": 0.598, "mean_token_accuracy": 0.7607519179582596, "num_tokens": 9758078.0, "step": 2695 }, { "entropy": 0.5644586980342865, "epoch": 2.5151656556229582, "grad_norm": 0.23047874867916107, "learning_rate": 0.0002, "loss": 0.561, "mean_token_accuracy": 0.7740487158298492, "num_tokens": 9761798.0, "step": 2696 }, { "entropy": 0.6167726963758469, "epoch": 2.5160989267382172, "grad_norm": 0.27547982335090637, "learning_rate": 0.0002, "loss": 0.603, "mean_token_accuracy": 0.7596319615840912, "num_tokens": 9765390.0, "step": 2697 }, { "entropy": 0.5640893429517746, "epoch": 2.517032197853476, "grad_norm": 0.3672252893447876, "learning_rate": 0.0002, "loss": 0.5685, "mean_token_accuracy": 0.765369638800621, "num_tokens": 9768869.0, "step": 2698 }, { "entropy": 0.5546280741691589, "epoch": 2.517965468968735, "grad_norm": 0.3367132842540741, "learning_rate": 0.0002, "loss": 0.5476, "mean_token_accuracy": 0.7805480659008026, "num_tokens": 9772491.0, "step": 2699 }, { "entropy": 0.5787492394447327, "epoch": 2.518898740083994, "grad_norm": 0.26655641198158264, "learning_rate": 0.0002, "loss": 0.5795, "mean_token_accuracy": 0.7729964852333069, "num_tokens": 9776137.0, "step": 2700 }, { "entropy": 0.5505237728357315, "epoch": 2.519832011199253, "grad_norm": 0.2947348356246948, "learning_rate": 0.0002, "loss": 0.5617, "mean_token_accuracy": 0.7715700268745422, "num_tokens": 9779689.0, "step": 2701 }, { "entropy": 0.6406308710575104, "epoch": 2.520765282314512, "grad_norm": 0.3427746891975403, "learning_rate": 0.0002, "loss": 0.6493, "mean_token_accuracy": 0.7327671349048615, "num_tokens": 9783431.0, "step": 2702 }, { "entropy": 0.5900155603885651, "epoch": 2.521698553429771, "grad_norm": 0.28592878580093384, "learning_rate": 0.0002, "loss": 0.6019, "mean_token_accuracy": 0.7572310566902161, "num_tokens": 9787041.0, "step": 2703 }, { "entropy": 0.5808587446808815, "epoch": 2.52263182454503, "grad_norm": 0.2600191533565521, "learning_rate": 0.0002, "loss": 0.5839, "mean_token_accuracy": 0.7705361098051071, "num_tokens": 9790705.0, "step": 2704 }, { "entropy": 0.6261202841997147, "epoch": 2.523565095660289, "grad_norm": 0.34073182940483093, "learning_rate": 0.0002, "loss": 0.6332, "mean_token_accuracy": 0.7447866499423981, "num_tokens": 9794174.0, "step": 2705 }, { "entropy": 0.5947665497660637, "epoch": 2.524498366775548, "grad_norm": 0.32567405700683594, "learning_rate": 0.0002, "loss": 0.5978, "mean_token_accuracy": 0.7600498199462891, "num_tokens": 9797891.0, "step": 2706 }, { "entropy": 0.5437153875827789, "epoch": 2.525431637890807, "grad_norm": 0.24478410184383392, "learning_rate": 0.0002, "loss": 0.5508, "mean_token_accuracy": 0.7685441225767136, "num_tokens": 9801405.0, "step": 2707 }, { "entropy": 0.5992899686098099, "epoch": 2.526364909006066, "grad_norm": 0.3176611065864563, "learning_rate": 0.0002, "loss": 0.6116, "mean_token_accuracy": 0.7560915648937225, "num_tokens": 9805057.0, "step": 2708 }, { "entropy": 0.5730193853378296, "epoch": 2.527298180121325, "grad_norm": 0.32538312673568726, "learning_rate": 0.0002, "loss": 0.5846, "mean_token_accuracy": 0.768812358379364, "num_tokens": 9808763.0, "step": 2709 }, { "entropy": 0.5383094996213913, "epoch": 2.528231451236584, "grad_norm": 0.2412121742963791, "learning_rate": 0.0002, "loss": 0.5541, "mean_token_accuracy": 0.7740368545055389, "num_tokens": 9812386.0, "step": 2710 }, { "entropy": 0.5945821702480316, "epoch": 2.529164722351843, "grad_norm": 0.30601146817207336, "learning_rate": 0.0002, "loss": 0.5936, "mean_token_accuracy": 0.7601138949394226, "num_tokens": 9815880.0, "step": 2711 }, { "entropy": 0.6252660900354385, "epoch": 2.530097993467102, "grad_norm": 0.2821194529533386, "learning_rate": 0.0002, "loss": 0.6177, "mean_token_accuracy": 0.7483199238777161, "num_tokens": 9819614.0, "step": 2712 }, { "entropy": 0.6429148018360138, "epoch": 2.531031264582361, "grad_norm": 0.2897984981536865, "learning_rate": 0.0002, "loss": 0.6408, "mean_token_accuracy": 0.7476805746555328, "num_tokens": 9823216.0, "step": 2713 }, { "entropy": 0.6150541603565216, "epoch": 2.53196453569762, "grad_norm": 0.31955665349960327, "learning_rate": 0.0002, "loss": 0.6137, "mean_token_accuracy": 0.7530700713396072, "num_tokens": 9826862.0, "step": 2714 }, { "entropy": 0.5890474766492844, "epoch": 2.532897806812879, "grad_norm": 0.3133856952190399, "learning_rate": 0.0002, "loss": 0.5826, "mean_token_accuracy": 0.7634277641773224, "num_tokens": 9830353.0, "step": 2715 }, { "entropy": 0.5896966904401779, "epoch": 2.533831077928138, "grad_norm": 0.3071066737174988, "learning_rate": 0.0002, "loss": 0.59, "mean_token_accuracy": 0.7625811994075775, "num_tokens": 9833899.0, "step": 2716 }, { "entropy": 0.6044367253780365, "epoch": 2.534764349043397, "grad_norm": 0.31456854939460754, "learning_rate": 0.0002, "loss": 0.6084, "mean_token_accuracy": 0.754907950758934, "num_tokens": 9837554.0, "step": 2717 }, { "entropy": 0.6165612787008286, "epoch": 2.535697620158656, "grad_norm": 0.3229001462459564, "learning_rate": 0.0002, "loss": 0.6255, "mean_token_accuracy": 0.7413779199123383, "num_tokens": 9841152.0, "step": 2718 }, { "entropy": 0.5654467940330505, "epoch": 2.536630891273915, "grad_norm": 0.26667746901512146, "learning_rate": 0.0002, "loss": 0.5726, "mean_token_accuracy": 0.7653248310089111, "num_tokens": 9844778.0, "step": 2719 }, { "entropy": 0.5491684675216675, "epoch": 2.537564162389174, "grad_norm": 0.2890278100967407, "learning_rate": 0.0002, "loss": 0.5588, "mean_token_accuracy": 0.7740936577320099, "num_tokens": 9848307.0, "step": 2720 }, { "entropy": 0.5797063410282135, "epoch": 2.538497433504433, "grad_norm": 0.2763175666332245, "learning_rate": 0.0002, "loss": 0.5816, "mean_token_accuracy": 0.7683936655521393, "num_tokens": 9852037.0, "step": 2721 }, { "entropy": 0.6220067739486694, "epoch": 2.539430704619692, "grad_norm": 0.2604435980319977, "learning_rate": 0.0002, "loss": 0.6244, "mean_token_accuracy": 0.7519726306200027, "num_tokens": 9855641.0, "step": 2722 }, { "entropy": 0.5699381977319717, "epoch": 2.5403639757349508, "grad_norm": 0.2533974051475525, "learning_rate": 0.0002, "loss": 0.5741, "mean_token_accuracy": 0.7754561305046082, "num_tokens": 9859480.0, "step": 2723 }, { "entropy": 0.5996859669685364, "epoch": 2.5412972468502097, "grad_norm": 0.28795236349105835, "learning_rate": 0.0002, "loss": 0.6014, "mean_token_accuracy": 0.754556804895401, "num_tokens": 9863055.0, "step": 2724 }, { "entropy": 0.5803292095661163, "epoch": 2.5422305179654687, "grad_norm": 0.28584539890289307, "learning_rate": 0.0002, "loss": 0.5896, "mean_token_accuracy": 0.7678126692771912, "num_tokens": 9866632.0, "step": 2725 }, { "entropy": 0.5841351449489594, "epoch": 2.5431637890807277, "grad_norm": 0.30520257353782654, "learning_rate": 0.0002, "loss": 0.6043, "mean_token_accuracy": 0.7540667355060577, "num_tokens": 9870229.0, "step": 2726 }, { "entropy": 0.5648975223302841, "epoch": 2.5440970601959867, "grad_norm": 0.342704176902771, "learning_rate": 0.0002, "loss": 0.5764, "mean_token_accuracy": 0.7693791687488556, "num_tokens": 9873731.0, "step": 2727 }, { "entropy": 0.6057059913873672, "epoch": 2.5450303313112457, "grad_norm": 0.3066652715206146, "learning_rate": 0.0002, "loss": 0.6126, "mean_token_accuracy": 0.7469928413629532, "num_tokens": 9877398.0, "step": 2728 }, { "entropy": 0.6214585155248642, "epoch": 2.5459636024265047, "grad_norm": 0.3911408483982086, "learning_rate": 0.0002, "loss": 0.6242, "mean_token_accuracy": 0.751716673374176, "num_tokens": 9880982.0, "step": 2729 }, { "entropy": 0.5753865092992783, "epoch": 2.5468968735417636, "grad_norm": 0.2408490777015686, "learning_rate": 0.0002, "loss": 0.5676, "mean_token_accuracy": 0.7674327343702316, "num_tokens": 9884648.0, "step": 2730 }, { "entropy": 0.6067681908607483, "epoch": 2.5478301446570226, "grad_norm": 0.2714061439037323, "learning_rate": 0.0002, "loss": 0.5979, "mean_token_accuracy": 0.7530554383993149, "num_tokens": 9888393.0, "step": 2731 }, { "entropy": 0.5797049552202225, "epoch": 2.5487634157722816, "grad_norm": 0.3350869119167328, "learning_rate": 0.0002, "loss": 0.5852, "mean_token_accuracy": 0.7614769786596298, "num_tokens": 9891999.0, "step": 2732 }, { "entropy": 0.6015781909227371, "epoch": 2.5496966868875406, "grad_norm": 0.29272663593292236, "learning_rate": 0.0002, "loss": 0.5974, "mean_token_accuracy": 0.7617468237876892, "num_tokens": 9895574.0, "step": 2733 }, { "entropy": 0.6036890894174576, "epoch": 2.5506299580027996, "grad_norm": 0.3154201805591583, "learning_rate": 0.0002, "loss": 0.6052, "mean_token_accuracy": 0.7560755759477615, "num_tokens": 9899246.0, "step": 2734 }, { "entropy": 0.567592665553093, "epoch": 2.5515632291180586, "grad_norm": 0.32427307963371277, "learning_rate": 0.0002, "loss": 0.5886, "mean_token_accuracy": 0.7595757842063904, "num_tokens": 9902860.0, "step": 2735 }, { "entropy": 0.5899058729410172, "epoch": 2.5524965002333175, "grad_norm": 0.3181326687335968, "learning_rate": 0.0002, "loss": 0.6023, "mean_token_accuracy": 0.7570235580205917, "num_tokens": 9906592.0, "step": 2736 }, { "entropy": 0.5960281491279602, "epoch": 2.5534297713485765, "grad_norm": 0.35899847745895386, "learning_rate": 0.0002, "loss": 0.6158, "mean_token_accuracy": 0.7498795539140701, "num_tokens": 9910139.0, "step": 2737 }, { "entropy": 0.5710704773664474, "epoch": 2.5543630424638355, "grad_norm": 0.28341734409332275, "learning_rate": 0.0002, "loss": 0.5779, "mean_token_accuracy": 0.7599730789661407, "num_tokens": 9913741.0, "step": 2738 }, { "entropy": 0.5761062353849411, "epoch": 2.5552963135790945, "grad_norm": 0.32294806838035583, "learning_rate": 0.0002, "loss": 0.5823, "mean_token_accuracy": 0.7655332237482071, "num_tokens": 9917456.0, "step": 2739 }, { "entropy": 0.5960856229066849, "epoch": 2.5562295846943535, "grad_norm": 0.296495646238327, "learning_rate": 0.0002, "loss": 0.599, "mean_token_accuracy": 0.7519427686929703, "num_tokens": 9920981.0, "step": 2740 }, { "entropy": 0.5993579179048538, "epoch": 2.5571628558096124, "grad_norm": 0.2746134400367737, "learning_rate": 0.0002, "loss": 0.5961, "mean_token_accuracy": 0.7595819681882858, "num_tokens": 9924582.0, "step": 2741 }, { "entropy": 0.6373406201601028, "epoch": 2.5580961269248714, "grad_norm": 0.28308236598968506, "learning_rate": 0.0002, "loss": 0.6253, "mean_token_accuracy": 0.7446233332157135, "num_tokens": 9928224.0, "step": 2742 }, { "entropy": 0.6430558413267136, "epoch": 2.5590293980401304, "grad_norm": 0.2547254264354706, "learning_rate": 0.0002, "loss": 0.6394, "mean_token_accuracy": 0.7425792813301086, "num_tokens": 9932005.0, "step": 2743 }, { "entropy": 0.610855832695961, "epoch": 2.5599626691553894, "grad_norm": 0.28122225403785706, "learning_rate": 0.0002, "loss": 0.608, "mean_token_accuracy": 0.7523365467786789, "num_tokens": 9935677.0, "step": 2744 }, { "entropy": 0.6022045612335205, "epoch": 2.5608959402706484, "grad_norm": 0.28221386671066284, "learning_rate": 0.0002, "loss": 0.6143, "mean_token_accuracy": 0.7514488101005554, "num_tokens": 9939373.0, "step": 2745 }, { "entropy": 0.6046989113092422, "epoch": 2.5618292113859074, "grad_norm": 0.26462727785110474, "learning_rate": 0.0002, "loss": 0.5955, "mean_token_accuracy": 0.7577460557222366, "num_tokens": 9942841.0, "step": 2746 }, { "entropy": 0.5899644047021866, "epoch": 2.5627624825011663, "grad_norm": 0.27945494651794434, "learning_rate": 0.0002, "loss": 0.5924, "mean_token_accuracy": 0.7631041407585144, "num_tokens": 9946464.0, "step": 2747 }, { "entropy": 0.5953328013420105, "epoch": 2.5636957536164253, "grad_norm": 0.2790399193763733, "learning_rate": 0.0002, "loss": 0.6021, "mean_token_accuracy": 0.7597660571336746, "num_tokens": 9950158.0, "step": 2748 }, { "entropy": 0.5677313655614853, "epoch": 2.5646290247316843, "grad_norm": 0.245552659034729, "learning_rate": 0.0002, "loss": 0.5614, "mean_token_accuracy": 0.7770144641399384, "num_tokens": 9953736.0, "step": 2749 }, { "entropy": 0.606298416852951, "epoch": 2.5655622958469433, "grad_norm": 0.27024397253990173, "learning_rate": 0.0002, "loss": 0.606, "mean_token_accuracy": 0.753109484910965, "num_tokens": 9957440.0, "step": 2750 }, { "entropy": 0.5854782462120056, "epoch": 2.5664955669622023, "grad_norm": 0.29741281270980835, "learning_rate": 0.0002, "loss": 0.5919, "mean_token_accuracy": 0.7558440119028091, "num_tokens": 9961059.0, "step": 2751 }, { "entropy": 0.6098271757364273, "epoch": 2.5674288380774613, "grad_norm": 0.31123191118240356, "learning_rate": 0.0002, "loss": 0.6287, "mean_token_accuracy": 0.7483432441949844, "num_tokens": 9964721.0, "step": 2752 }, { "entropy": 0.5538851767778397, "epoch": 2.5683621091927202, "grad_norm": 0.3032993972301483, "learning_rate": 0.0002, "loss": 0.5696, "mean_token_accuracy": 0.7699971497058868, "num_tokens": 9968346.0, "step": 2753 }, { "entropy": 0.5760697573423386, "epoch": 2.569295380307979, "grad_norm": 0.3189980685710907, "learning_rate": 0.0002, "loss": 0.5856, "mean_token_accuracy": 0.7629860043525696, "num_tokens": 9972211.0, "step": 2754 }, { "entropy": 0.5586536675691605, "epoch": 2.570228651423238, "grad_norm": 0.24426110088825226, "learning_rate": 0.0002, "loss": 0.5618, "mean_token_accuracy": 0.774279773235321, "num_tokens": 9975769.0, "step": 2755 }, { "entropy": 0.5738093703985214, "epoch": 2.571161922538497, "grad_norm": 0.36865657567977905, "learning_rate": 0.0002, "loss": 0.5809, "mean_token_accuracy": 0.7651463598012924, "num_tokens": 9979300.0, "step": 2756 }, { "entropy": 0.6076614409685135, "epoch": 2.572095193653756, "grad_norm": 0.2821599841117859, "learning_rate": 0.0002, "loss": 0.6021, "mean_token_accuracy": 0.7581081688404083, "num_tokens": 9982835.0, "step": 2757 }, { "entropy": 0.6332030445337296, "epoch": 2.573028464769015, "grad_norm": 0.26384997367858887, "learning_rate": 0.0002, "loss": 0.6326, "mean_token_accuracy": 0.7388061285018921, "num_tokens": 9986587.0, "step": 2758 }, { "entropy": 0.5642060488462448, "epoch": 2.573961735884274, "grad_norm": 0.2731363773345947, "learning_rate": 0.0002, "loss": 0.5658, "mean_token_accuracy": 0.7691419720649719, "num_tokens": 9990146.0, "step": 2759 }, { "entropy": 0.6222523301839828, "epoch": 2.574895006999533, "grad_norm": 0.25727298855781555, "learning_rate": 0.0002, "loss": 0.6249, "mean_token_accuracy": 0.7472957521677017, "num_tokens": 9993737.0, "step": 2760 }, { "entropy": 0.5978803262114525, "epoch": 2.575828278114792, "grad_norm": 0.300643652677536, "learning_rate": 0.0002, "loss": 0.6016, "mean_token_accuracy": 0.753381758928299, "num_tokens": 9997433.0, "step": 2761 }, { "entropy": 0.588722437620163, "epoch": 2.576761549230051, "grad_norm": 0.2858535647392273, "learning_rate": 0.0002, "loss": 0.5996, "mean_token_accuracy": 0.7540170103311539, "num_tokens": 10001090.0, "step": 2762 }, { "entropy": 0.6083834171295166, "epoch": 2.57769482034531, "grad_norm": 0.34758323431015015, "learning_rate": 0.0002, "loss": 0.6239, "mean_token_accuracy": 0.7496663331985474, "num_tokens": 10004818.0, "step": 2763 }, { "entropy": 0.583373412489891, "epoch": 2.578628091460569, "grad_norm": 0.32454052567481995, "learning_rate": 0.0002, "loss": 0.6041, "mean_token_accuracy": 0.7545308619737625, "num_tokens": 10008501.0, "step": 2764 }, { "entropy": 0.5705965608358383, "epoch": 2.579561362575828, "grad_norm": 0.28886088728904724, "learning_rate": 0.0002, "loss": 0.5758, "mean_token_accuracy": 0.7606883645057678, "num_tokens": 10012113.0, "step": 2765 }, { "entropy": 0.5699885189533234, "epoch": 2.580494633691087, "grad_norm": 0.30049988627433777, "learning_rate": 0.0002, "loss": 0.5787, "mean_token_accuracy": 0.7648070752620697, "num_tokens": 10015731.0, "step": 2766 }, { "entropy": 0.5935674011707306, "epoch": 2.581427904806346, "grad_norm": 0.26237753033638, "learning_rate": 0.0002, "loss": 0.5945, "mean_token_accuracy": 0.7594339996576309, "num_tokens": 10019303.0, "step": 2767 }, { "entropy": 0.6044932752847672, "epoch": 2.582361175921605, "grad_norm": 0.2464599758386612, "learning_rate": 0.0002, "loss": 0.5923, "mean_token_accuracy": 0.7615322321653366, "num_tokens": 10022858.0, "step": 2768 }, { "entropy": 0.6309289634227753, "epoch": 2.583294447036864, "grad_norm": 0.2589556574821472, "learning_rate": 0.0002, "loss": 0.6182, "mean_token_accuracy": 0.750581830739975, "num_tokens": 10026558.0, "step": 2769 }, { "entropy": 0.5717357993125916, "epoch": 2.584227718152123, "grad_norm": 0.2701955735683441, "learning_rate": 0.0002, "loss": 0.5792, "mean_token_accuracy": 0.7657465785741806, "num_tokens": 10030148.0, "step": 2770 }, { "entropy": 0.5970119088888168, "epoch": 2.585160989267382, "grad_norm": 0.3099971115589142, "learning_rate": 0.0002, "loss": 0.5962, "mean_token_accuracy": 0.7603666335344315, "num_tokens": 10033976.0, "step": 2771 }, { "entropy": 0.566252663731575, "epoch": 2.586094260382641, "grad_norm": 0.3031105399131775, "learning_rate": 0.0002, "loss": 0.5682, "mean_token_accuracy": 0.7734898179769516, "num_tokens": 10037493.0, "step": 2772 }, { "entropy": 0.5590258538722992, "epoch": 2.5870275314979, "grad_norm": 0.28125467896461487, "learning_rate": 0.0002, "loss": 0.5741, "mean_token_accuracy": 0.7646832913160324, "num_tokens": 10041100.0, "step": 2773 }, { "entropy": 0.5912938266992569, "epoch": 2.587960802613159, "grad_norm": 0.3414657711982727, "learning_rate": 0.0002, "loss": 0.6032, "mean_token_accuracy": 0.7554241120815277, "num_tokens": 10044671.0, "step": 2774 }, { "entropy": 0.5560954362154007, "epoch": 2.588894073728418, "grad_norm": 0.28786149621009827, "learning_rate": 0.0002, "loss": 0.5679, "mean_token_accuracy": 0.7695580124855042, "num_tokens": 10048281.0, "step": 2775 }, { "entropy": 0.5784274935722351, "epoch": 2.589827344843677, "grad_norm": 0.3645786643028259, "learning_rate": 0.0002, "loss": 0.5841, "mean_token_accuracy": 0.7674322128295898, "num_tokens": 10051946.0, "step": 2776 }, { "entropy": 0.5969065576791763, "epoch": 2.590760615958936, "grad_norm": 0.31576865911483765, "learning_rate": 0.0002, "loss": 0.6013, "mean_token_accuracy": 0.7551237642765045, "num_tokens": 10055507.0, "step": 2777 }, { "entropy": 0.5821502804756165, "epoch": 2.591693887074195, "grad_norm": 0.29046857357025146, "learning_rate": 0.0002, "loss": 0.5954, "mean_token_accuracy": 0.7541697025299072, "num_tokens": 10058984.0, "step": 2778 }, { "entropy": 0.5814065486192703, "epoch": 2.5926271581894538, "grad_norm": 0.3300810158252716, "learning_rate": 0.0002, "loss": 0.5944, "mean_token_accuracy": 0.75612011551857, "num_tokens": 10062573.0, "step": 2779 }, { "entropy": 0.5495246797800064, "epoch": 2.5935604293047128, "grad_norm": 0.33302822709083557, "learning_rate": 0.0002, "loss": 0.559, "mean_token_accuracy": 0.7731181234121323, "num_tokens": 10066152.0, "step": 2780 }, { "entropy": 0.6002669036388397, "epoch": 2.5944937004199717, "grad_norm": 0.24384935200214386, "learning_rate": 0.0002, "loss": 0.5908, "mean_token_accuracy": 0.7613951414823532, "num_tokens": 10069840.0, "step": 2781 }, { "entropy": 0.5636795163154602, "epoch": 2.5954269715352307, "grad_norm": 0.30140912532806396, "learning_rate": 0.0002, "loss": 0.5594, "mean_token_accuracy": 0.779959037899971, "num_tokens": 10073538.0, "step": 2782 }, { "entropy": 0.5958980768918991, "epoch": 2.5963602426504897, "grad_norm": 0.2938953638076782, "learning_rate": 0.0002, "loss": 0.6025, "mean_token_accuracy": 0.75311578810215, "num_tokens": 10077068.0, "step": 2783 }, { "entropy": 0.5920533686876297, "epoch": 2.5972935137657487, "grad_norm": 0.3036532700061798, "learning_rate": 0.0002, "loss": 0.5884, "mean_token_accuracy": 0.7594675868749619, "num_tokens": 10080643.0, "step": 2784 }, { "entropy": 0.5627211183309555, "epoch": 2.5982267848810077, "grad_norm": 0.3273390829563141, "learning_rate": 0.0002, "loss": 0.5614, "mean_token_accuracy": 0.770118311047554, "num_tokens": 10084323.0, "step": 2785 }, { "entropy": 0.5821723341941833, "epoch": 2.5991600559962666, "grad_norm": 0.3328598141670227, "learning_rate": 0.0002, "loss": 0.5921, "mean_token_accuracy": 0.7605109512805939, "num_tokens": 10087982.0, "step": 2786 }, { "entropy": 0.5681204348802567, "epoch": 2.6000933271115256, "grad_norm": 0.3638099730014801, "learning_rate": 0.0002, "loss": 0.5796, "mean_token_accuracy": 0.7712569832801819, "num_tokens": 10091563.0, "step": 2787 }, { "entropy": 0.5762875899672508, "epoch": 2.6010265982267846, "grad_norm": 0.3003784418106079, "learning_rate": 0.0002, "loss": 0.5936, "mean_token_accuracy": 0.7664307355880737, "num_tokens": 10095124.0, "step": 2788 }, { "entropy": 0.6337495595216751, "epoch": 2.6019598693420436, "grad_norm": 0.2836015224456787, "learning_rate": 0.0002, "loss": 0.6442, "mean_token_accuracy": 0.738272100687027, "num_tokens": 10098809.0, "step": 2789 }, { "entropy": 0.5807363912463188, "epoch": 2.6028931404573026, "grad_norm": 0.3367491066455841, "learning_rate": 0.0002, "loss": 0.5979, "mean_token_accuracy": 0.7565274089574814, "num_tokens": 10102437.0, "step": 2790 }, { "entropy": 0.5910967290401459, "epoch": 2.6038264115725616, "grad_norm": 0.28740406036376953, "learning_rate": 0.0002, "loss": 0.596, "mean_token_accuracy": 0.7538609355688095, "num_tokens": 10106060.0, "step": 2791 }, { "entropy": 0.5851007252931595, "epoch": 2.6047596826878205, "grad_norm": 0.27602875232696533, "learning_rate": 0.0002, "loss": 0.5807, "mean_token_accuracy": 0.7662334144115448, "num_tokens": 10109750.0, "step": 2792 }, { "entropy": 0.5868482887744904, "epoch": 2.6056929538030795, "grad_norm": 0.2630710005760193, "learning_rate": 0.0002, "loss": 0.5867, "mean_token_accuracy": 0.7557904422283173, "num_tokens": 10113300.0, "step": 2793 }, { "entropy": 0.6057766824960709, "epoch": 2.6066262249183385, "grad_norm": 0.298073947429657, "learning_rate": 0.0002, "loss": 0.61, "mean_token_accuracy": 0.751077339053154, "num_tokens": 10116997.0, "step": 2794 }, { "entropy": 0.5540283769369125, "epoch": 2.6075594960335975, "grad_norm": 0.26228833198547363, "learning_rate": 0.0002, "loss": 0.5465, "mean_token_accuracy": 0.7786097675561905, "num_tokens": 10120587.0, "step": 2795 }, { "entropy": 0.5893142521381378, "epoch": 2.6084927671488565, "grad_norm": 0.27356573939323425, "learning_rate": 0.0002, "loss": 0.5968, "mean_token_accuracy": 0.7591630667448044, "num_tokens": 10124221.0, "step": 2796 }, { "entropy": 0.6550378352403641, "epoch": 2.6094260382641155, "grad_norm": 0.2963027358055115, "learning_rate": 0.0002, "loss": 0.6452, "mean_token_accuracy": 0.7416638731956482, "num_tokens": 10127910.0, "step": 2797 }, { "entropy": 0.6322608292102814, "epoch": 2.6103593093793744, "grad_norm": 0.3467712104320526, "learning_rate": 0.0002, "loss": 0.6316, "mean_token_accuracy": 0.7427549064159393, "num_tokens": 10131450.0, "step": 2798 }, { "entropy": 0.5989229083061218, "epoch": 2.6112925804946334, "grad_norm": 0.3208020031452179, "learning_rate": 0.0002, "loss": 0.6033, "mean_token_accuracy": 0.7559259086847305, "num_tokens": 10135067.0, "step": 2799 }, { "entropy": 0.5691123753786087, "epoch": 2.6122258516098924, "grad_norm": 0.26566702127456665, "learning_rate": 0.0002, "loss": 0.5802, "mean_token_accuracy": 0.7613796144723892, "num_tokens": 10138759.0, "step": 2800 }, { "entropy": 0.6027870178222656, "epoch": 2.6131591227251514, "grad_norm": 0.2705887258052826, "learning_rate": 0.0002, "loss": 0.5961, "mean_token_accuracy": 0.761613517999649, "num_tokens": 10142392.0, "step": 2801 }, { "entropy": 0.5914319753646851, "epoch": 2.6140923938404104, "grad_norm": 0.2945248782634735, "learning_rate": 0.0002, "loss": 0.5976, "mean_token_accuracy": 0.7592689692974091, "num_tokens": 10145932.0, "step": 2802 }, { "entropy": 0.5696438848972321, "epoch": 2.6150256649556693, "grad_norm": 0.22157520055770874, "learning_rate": 0.0002, "loss": 0.5656, "mean_token_accuracy": 0.773918628692627, "num_tokens": 10149575.0, "step": 2803 }, { "entropy": 0.5919068455696106, "epoch": 2.6159589360709283, "grad_norm": 0.25367802381515503, "learning_rate": 0.0002, "loss": 0.597, "mean_token_accuracy": 0.757166713476181, "num_tokens": 10153273.0, "step": 2804 }, { "entropy": 0.5428730174899101, "epoch": 2.6168922071861873, "grad_norm": 0.2599395513534546, "learning_rate": 0.0002, "loss": 0.5548, "mean_token_accuracy": 0.7772492170333862, "num_tokens": 10156848.0, "step": 2805 }, { "entropy": 0.5971904993057251, "epoch": 2.6178254783014463, "grad_norm": 0.3099898397922516, "learning_rate": 0.0002, "loss": 0.6035, "mean_token_accuracy": 0.7587191760540009, "num_tokens": 10160593.0, "step": 2806 }, { "entropy": 0.597881019115448, "epoch": 2.6187587494167053, "grad_norm": 0.31703245639801025, "learning_rate": 0.0002, "loss": 0.617, "mean_token_accuracy": 0.7497971206903458, "num_tokens": 10164166.0, "step": 2807 }, { "entropy": 0.605304129421711, "epoch": 2.6196920205319643, "grad_norm": 0.2769455313682556, "learning_rate": 0.0002, "loss": 0.609, "mean_token_accuracy": 0.7540264874696732, "num_tokens": 10167859.0, "step": 2808 }, { "entropy": 0.5895132571458817, "epoch": 2.6206252916472232, "grad_norm": 0.24655169248580933, "learning_rate": 0.0002, "loss": 0.5896, "mean_token_accuracy": 0.7605806887149811, "num_tokens": 10171622.0, "step": 2809 }, { "entropy": 0.6188045889139175, "epoch": 2.6215585627624822, "grad_norm": 0.26643067598342896, "learning_rate": 0.0002, "loss": 0.603, "mean_token_accuracy": 0.7572979480028152, "num_tokens": 10175308.0, "step": 2810 }, { "entropy": 0.6120922267436981, "epoch": 2.622491833877741, "grad_norm": 0.24811865389347076, "learning_rate": 0.0002, "loss": 0.6122, "mean_token_accuracy": 0.7538174986839294, "num_tokens": 10178959.0, "step": 2811 }, { "entropy": 0.5786515474319458, "epoch": 2.623425104993, "grad_norm": 0.27626726031303406, "learning_rate": 0.0002, "loss": 0.5831, "mean_token_accuracy": 0.765509307384491, "num_tokens": 10182623.0, "step": 2812 }, { "entropy": 0.6119775176048279, "epoch": 2.624358376108259, "grad_norm": 0.2704806327819824, "learning_rate": 0.0002, "loss": 0.6041, "mean_token_accuracy": 0.7614017277956009, "num_tokens": 10186318.0, "step": 2813 }, { "entropy": 0.5806817710399628, "epoch": 2.6252916472235186, "grad_norm": 0.2696544826030731, "learning_rate": 0.0002, "loss": 0.5778, "mean_token_accuracy": 0.770472913980484, "num_tokens": 10189926.0, "step": 2814 }, { "entropy": 0.6312392055988312, "epoch": 2.6262249183387776, "grad_norm": 0.3323095738887787, "learning_rate": 0.0002, "loss": 0.6375, "mean_token_accuracy": 0.7389508485794067, "num_tokens": 10193570.0, "step": 2815 }, { "entropy": 0.6082043200731277, "epoch": 2.6271581894540366, "grad_norm": 0.25839152932167053, "learning_rate": 0.0002, "loss": 0.6043, "mean_token_accuracy": 0.7495358437299728, "num_tokens": 10197194.0, "step": 2816 }, { "entropy": 0.6079178601503372, "epoch": 2.6280914605692955, "grad_norm": 0.30761951208114624, "learning_rate": 0.0002, "loss": 0.6228, "mean_token_accuracy": 0.7430620938539505, "num_tokens": 10200732.0, "step": 2817 }, { "entropy": 0.5760051608085632, "epoch": 2.6290247316845545, "grad_norm": 0.27221494913101196, "learning_rate": 0.0002, "loss": 0.5835, "mean_token_accuracy": 0.764682337641716, "num_tokens": 10204347.0, "step": 2818 }, { "entropy": 0.563058614730835, "epoch": 2.6299580027998135, "grad_norm": 0.2869252860546112, "learning_rate": 0.0002, "loss": 0.5721, "mean_token_accuracy": 0.7700645923614502, "num_tokens": 10208001.0, "step": 2819 }, { "entropy": 0.6032182425260544, "epoch": 2.6308912739150725, "grad_norm": 0.2758370637893677, "learning_rate": 0.0002, "loss": 0.6131, "mean_token_accuracy": 0.7556792497634888, "num_tokens": 10211617.0, "step": 2820 }, { "entropy": 0.5678400099277496, "epoch": 2.6318245450303315, "grad_norm": 0.2571010887622833, "learning_rate": 0.0002, "loss": 0.5686, "mean_token_accuracy": 0.7702120542526245, "num_tokens": 10215264.0, "step": 2821 }, { "entropy": 0.6303253918886185, "epoch": 2.6327578161455905, "grad_norm": 0.2713262438774109, "learning_rate": 0.0002, "loss": 0.6313, "mean_token_accuracy": 0.7515621930360794, "num_tokens": 10218961.0, "step": 2822 }, { "entropy": 0.5633492171764374, "epoch": 2.6336910872608494, "grad_norm": 0.29997625946998596, "learning_rate": 0.0002, "loss": 0.5757, "mean_token_accuracy": 0.7609906792640686, "num_tokens": 10222488.0, "step": 2823 }, { "entropy": 0.5681850910186768, "epoch": 2.6346243583761084, "grad_norm": 0.29000407457351685, "learning_rate": 0.0002, "loss": 0.5767, "mean_token_accuracy": 0.7615512013435364, "num_tokens": 10226100.0, "step": 2824 }, { "entropy": 0.6093781292438507, "epoch": 2.6355576294913674, "grad_norm": 0.271348774433136, "learning_rate": 0.0002, "loss": 0.607, "mean_token_accuracy": 0.7461941838264465, "num_tokens": 10229745.0, "step": 2825 }, { "entropy": 0.5899776220321655, "epoch": 2.6364909006066264, "grad_norm": 0.28390565514564514, "learning_rate": 0.0002, "loss": 0.5898, "mean_token_accuracy": 0.7626245766878128, "num_tokens": 10233264.0, "step": 2826 }, { "entropy": 0.5287318825721741, "epoch": 2.6374241717218854, "grad_norm": 0.33004555106163025, "learning_rate": 0.0002, "loss": 0.539, "mean_token_accuracy": 0.7830114364624023, "num_tokens": 10236677.0, "step": 2827 }, { "entropy": 0.5641269981861115, "epoch": 2.6383574428371444, "grad_norm": 0.27551573514938354, "learning_rate": 0.0002, "loss": 0.5745, "mean_token_accuracy": 0.7680020928382874, "num_tokens": 10240266.0, "step": 2828 }, { "entropy": 0.5590282827615738, "epoch": 2.6392907139524033, "grad_norm": 0.35233891010284424, "learning_rate": 0.0002, "loss": 0.5696, "mean_token_accuracy": 0.7703631371259689, "num_tokens": 10243807.0, "step": 2829 }, { "entropy": 0.5915062427520752, "epoch": 2.6402239850676623, "grad_norm": 0.3035694658756256, "learning_rate": 0.0002, "loss": 0.5842, "mean_token_accuracy": 0.7656960189342499, "num_tokens": 10247557.0, "step": 2830 }, { "entropy": 0.6351398974657059, "epoch": 2.6411572561829213, "grad_norm": 0.3307667672634125, "learning_rate": 0.0002, "loss": 0.6385, "mean_token_accuracy": 0.7342944890260696, "num_tokens": 10251206.0, "step": 2831 }, { "entropy": 0.5847917050123215, "epoch": 2.6420905272981803, "grad_norm": 0.3452144265174866, "learning_rate": 0.0002, "loss": 0.5998, "mean_token_accuracy": 0.7537728101015091, "num_tokens": 10254787.0, "step": 2832 }, { "entropy": 0.5644850730895996, "epoch": 2.6430237984134393, "grad_norm": 0.3436676859855652, "learning_rate": 0.0002, "loss": 0.5736, "mean_token_accuracy": 0.7645125538110733, "num_tokens": 10258393.0, "step": 2833 }, { "entropy": 0.5867429077625275, "epoch": 2.6439570695286982, "grad_norm": 0.31385964155197144, "learning_rate": 0.0002, "loss": 0.5992, "mean_token_accuracy": 0.7550681382417679, "num_tokens": 10262011.0, "step": 2834 }, { "entropy": 0.5885627865791321, "epoch": 2.6448903406439572, "grad_norm": 0.2827264368534088, "learning_rate": 0.0002, "loss": 0.5953, "mean_token_accuracy": 0.7630571573972702, "num_tokens": 10265653.0, "step": 2835 }, { "entropy": 0.5993053168058395, "epoch": 2.645823611759216, "grad_norm": 0.3216567039489746, "learning_rate": 0.0002, "loss": 0.6017, "mean_token_accuracy": 0.754731684923172, "num_tokens": 10269234.0, "step": 2836 }, { "entropy": 0.5573609471321106, "epoch": 2.646756882874475, "grad_norm": 0.31958502531051636, "learning_rate": 0.0002, "loss": 0.5638, "mean_token_accuracy": 0.7698836922645569, "num_tokens": 10272852.0, "step": 2837 }, { "entropy": 0.6110302805900574, "epoch": 2.647690153989734, "grad_norm": 0.25315603613853455, "learning_rate": 0.0002, "loss": 0.6042, "mean_token_accuracy": 0.7559208869934082, "num_tokens": 10276593.0, "step": 2838 }, { "entropy": 0.57917919754982, "epoch": 2.648623425104993, "grad_norm": 0.3436906337738037, "learning_rate": 0.0002, "loss": 0.5911, "mean_token_accuracy": 0.7656681686639786, "num_tokens": 10280187.0, "step": 2839 }, { "entropy": 0.6101285070180893, "epoch": 2.649556696220252, "grad_norm": 0.279163658618927, "learning_rate": 0.0002, "loss": 0.6085, "mean_token_accuracy": 0.7544515132904053, "num_tokens": 10283774.0, "step": 2840 }, { "entropy": 0.6236784607172012, "epoch": 2.650489967335511, "grad_norm": 0.2861846387386322, "learning_rate": 0.0002, "loss": 0.6288, "mean_token_accuracy": 0.7547957450151443, "num_tokens": 10287483.0, "step": 2841 }, { "entropy": 0.5766654461622238, "epoch": 2.65142323845077, "grad_norm": 0.26332393288612366, "learning_rate": 0.0002, "loss": 0.5669, "mean_token_accuracy": 0.773125946521759, "num_tokens": 10291176.0, "step": 2842 }, { "entropy": 0.6172192990779877, "epoch": 2.652356509566029, "grad_norm": 0.2768195867538452, "learning_rate": 0.0002, "loss": 0.6286, "mean_token_accuracy": 0.744415670633316, "num_tokens": 10294893.0, "step": 2843 }, { "entropy": 0.5633749067783356, "epoch": 2.653289780681288, "grad_norm": 0.2926238775253296, "learning_rate": 0.0002, "loss": 0.5735, "mean_token_accuracy": 0.7680182904005051, "num_tokens": 10298488.0, "step": 2844 }, { "entropy": 0.6089549213647842, "epoch": 2.654223051796547, "grad_norm": 0.24787257611751556, "learning_rate": 0.0002, "loss": 0.6134, "mean_token_accuracy": 0.7513040453195572, "num_tokens": 10302326.0, "step": 2845 }, { "entropy": 0.5755298733711243, "epoch": 2.655156322911806, "grad_norm": 0.3587159812450409, "learning_rate": 0.0002, "loss": 0.59, "mean_token_accuracy": 0.7625543922185898, "num_tokens": 10305938.0, "step": 2846 }, { "entropy": 0.5649407356977463, "epoch": 2.656089594027065, "grad_norm": 0.28654634952545166, "learning_rate": 0.0002, "loss": 0.5602, "mean_token_accuracy": 0.7698401361703873, "num_tokens": 10309653.0, "step": 2847 }, { "entropy": 0.5961479991674423, "epoch": 2.657022865142324, "grad_norm": 0.2729703485965729, "learning_rate": 0.0002, "loss": 0.5929, "mean_token_accuracy": 0.7582693994045258, "num_tokens": 10313310.0, "step": 2848 }, { "entropy": 0.6311646699905396, "epoch": 2.657956136257583, "grad_norm": 0.3255838453769684, "learning_rate": 0.0002, "loss": 0.6362, "mean_token_accuracy": 0.7453893721103668, "num_tokens": 10316909.0, "step": 2849 }, { "entropy": 0.6040830761194229, "epoch": 2.658889407372842, "grad_norm": 0.3410155177116394, "learning_rate": 0.0002, "loss": 0.5982, "mean_token_accuracy": 0.7589414417743683, "num_tokens": 10320536.0, "step": 2850 }, { "entropy": 0.5833234041929245, "epoch": 2.659822678488101, "grad_norm": 0.3104991018772125, "learning_rate": 0.0002, "loss": 0.5932, "mean_token_accuracy": 0.7571243196725845, "num_tokens": 10324173.0, "step": 2851 }, { "entropy": 0.5974277853965759, "epoch": 2.66075594960336, "grad_norm": 0.28374406695365906, "learning_rate": 0.0002, "loss": 0.5897, "mean_token_accuracy": 0.7638079226016998, "num_tokens": 10327818.0, "step": 2852 }, { "entropy": 0.5679856091737747, "epoch": 2.661689220718619, "grad_norm": 0.3131098747253418, "learning_rate": 0.0002, "loss": 0.577, "mean_token_accuracy": 0.7639501541852951, "num_tokens": 10331366.0, "step": 2853 }, { "entropy": 0.5970869809389114, "epoch": 2.662622491833878, "grad_norm": 0.3432559072971344, "learning_rate": 0.0002, "loss": 0.6031, "mean_token_accuracy": 0.762735590338707, "num_tokens": 10335129.0, "step": 2854 }, { "entropy": 0.5963824391365051, "epoch": 2.663555762949137, "grad_norm": 0.264800101518631, "learning_rate": 0.0002, "loss": 0.6033, "mean_token_accuracy": 0.75605908036232, "num_tokens": 10338761.0, "step": 2855 }, { "entropy": 0.5724810063838959, "epoch": 2.664489034064396, "grad_norm": 0.2560005187988281, "learning_rate": 0.0002, "loss": 0.5899, "mean_token_accuracy": 0.7592030763626099, "num_tokens": 10342337.0, "step": 2856 }, { "entropy": 0.5827980935573578, "epoch": 2.665422305179655, "grad_norm": 0.3199585974216461, "learning_rate": 0.0002, "loss": 0.5929, "mean_token_accuracy": 0.7636239975690842, "num_tokens": 10345998.0, "step": 2857 }, { "entropy": 0.5979607105255127, "epoch": 2.666355576294914, "grad_norm": 0.270867258310318, "learning_rate": 0.0002, "loss": 0.5943, "mean_token_accuracy": 0.7645296156406403, "num_tokens": 10349656.0, "step": 2858 }, { "entropy": 0.5952378213405609, "epoch": 2.667288847410173, "grad_norm": 0.30551427602767944, "learning_rate": 0.0002, "loss": 0.593, "mean_token_accuracy": 0.7543524354696274, "num_tokens": 10353281.0, "step": 2859 }, { "entropy": 0.62692591547966, "epoch": 2.668222118525432, "grad_norm": 0.3088206946849823, "learning_rate": 0.0002, "loss": 0.6308, "mean_token_accuracy": 0.7457846254110336, "num_tokens": 10356920.0, "step": 2860 }, { "entropy": 0.5488396286964417, "epoch": 2.6691553896406908, "grad_norm": 0.30285128951072693, "learning_rate": 0.0002, "loss": 0.5756, "mean_token_accuracy": 0.7665066421031952, "num_tokens": 10360511.0, "step": 2861 }, { "entropy": 0.5633780211210251, "epoch": 2.6700886607559497, "grad_norm": 0.313071608543396, "learning_rate": 0.0002, "loss": 0.5845, "mean_token_accuracy": 0.7598731070756912, "num_tokens": 10364034.0, "step": 2862 }, { "entropy": 0.5646597445011139, "epoch": 2.6710219318712087, "grad_norm": 0.3032713532447815, "learning_rate": 0.0002, "loss": 0.5797, "mean_token_accuracy": 0.7629126310348511, "num_tokens": 10367552.0, "step": 2863 }, { "entropy": 0.5733852088451385, "epoch": 2.6719552029864677, "grad_norm": 0.3007439076900482, "learning_rate": 0.0002, "loss": 0.59, "mean_token_accuracy": 0.7619200646877289, "num_tokens": 10371242.0, "step": 2864 }, { "entropy": 0.5978994518518448, "epoch": 2.6728884741017267, "grad_norm": 0.3172339200973511, "learning_rate": 0.0002, "loss": 0.5999, "mean_token_accuracy": 0.7569737285375595, "num_tokens": 10374913.0, "step": 2865 }, { "entropy": 0.6182045340538025, "epoch": 2.6738217452169857, "grad_norm": 0.2789122462272644, "learning_rate": 0.0002, "loss": 0.6216, "mean_token_accuracy": 0.7479248046875, "num_tokens": 10378498.0, "step": 2866 }, { "entropy": 0.5675746202468872, "epoch": 2.6747550163322447, "grad_norm": 0.24698254466056824, "learning_rate": 0.0002, "loss": 0.5551, "mean_token_accuracy": 0.7776653617620468, "num_tokens": 10382078.0, "step": 2867 }, { "entropy": 0.6024395376443863, "epoch": 2.6756882874475036, "grad_norm": 0.2596673369407654, "learning_rate": 0.0002, "loss": 0.592, "mean_token_accuracy": 0.7594407945871353, "num_tokens": 10385715.0, "step": 2868 }, { "entropy": 0.6216818690299988, "epoch": 2.6766215585627626, "grad_norm": 0.29837992787361145, "learning_rate": 0.0002, "loss": 0.6106, "mean_token_accuracy": 0.7461343705654144, "num_tokens": 10389315.0, "step": 2869 }, { "entropy": 0.6157044619321823, "epoch": 2.6775548296780216, "grad_norm": 0.25938570499420166, "learning_rate": 0.0002, "loss": 0.6112, "mean_token_accuracy": 0.7552199363708496, "num_tokens": 10393031.0, "step": 2870 }, { "entropy": 0.5945307314395905, "epoch": 2.6784881007932806, "grad_norm": 0.2771987318992615, "learning_rate": 0.0002, "loss": 0.5936, "mean_token_accuracy": 0.7578018456697464, "num_tokens": 10396537.0, "step": 2871 }, { "entropy": 0.5953018218278885, "epoch": 2.6794213719085396, "grad_norm": 0.2927113175392151, "learning_rate": 0.0002, "loss": 0.6053, "mean_token_accuracy": 0.7580855190753937, "num_tokens": 10400167.0, "step": 2872 }, { "entropy": 0.5680172219872475, "epoch": 2.6803546430237986, "grad_norm": 0.3458040654659271, "learning_rate": 0.0002, "loss": 0.5706, "mean_token_accuracy": 0.7713437229394913, "num_tokens": 10403730.0, "step": 2873 }, { "entropy": 0.560113437473774, "epoch": 2.6812879141390575, "grad_norm": 0.29805541038513184, "learning_rate": 0.0002, "loss": 0.5774, "mean_token_accuracy": 0.7714994847774506, "num_tokens": 10407326.0, "step": 2874 }, { "entropy": 0.5968486368656158, "epoch": 2.6822211852543165, "grad_norm": 0.3150286078453064, "learning_rate": 0.0002, "loss": 0.5996, "mean_token_accuracy": 0.7537056654691696, "num_tokens": 10410930.0, "step": 2875 }, { "entropy": 0.5694253742694855, "epoch": 2.6831544563695755, "grad_norm": 0.31886523962020874, "learning_rate": 0.0002, "loss": 0.5804, "mean_token_accuracy": 0.766162246465683, "num_tokens": 10414478.0, "step": 2876 }, { "entropy": 0.5993807166814804, "epoch": 2.6840877274848345, "grad_norm": 0.36062124371528625, "learning_rate": 0.0002, "loss": 0.603, "mean_token_accuracy": 0.7660034000873566, "num_tokens": 10418180.0, "step": 2877 }, { "entropy": 0.5855659693479538, "epoch": 2.6850209986000935, "grad_norm": 0.30899205803871155, "learning_rate": 0.0002, "loss": 0.6013, "mean_token_accuracy": 0.755257323384285, "num_tokens": 10421754.0, "step": 2878 }, { "entropy": 0.605323314666748, "epoch": 2.6859542697153524, "grad_norm": 0.2815881371498108, "learning_rate": 0.0002, "loss": 0.5937, "mean_token_accuracy": 0.7609332203865051, "num_tokens": 10425345.0, "step": 2879 }, { "entropy": 0.6132051944732666, "epoch": 2.6868875408306114, "grad_norm": 0.25951164960861206, "learning_rate": 0.0002, "loss": 0.6084, "mean_token_accuracy": 0.7479953318834305, "num_tokens": 10428905.0, "step": 2880 }, { "entropy": 0.6100850999355316, "epoch": 2.6878208119458704, "grad_norm": 0.2927425801753998, "learning_rate": 0.0002, "loss": 0.6033, "mean_token_accuracy": 0.7596034109592438, "num_tokens": 10432641.0, "step": 2881 }, { "entropy": 0.6064320802688599, "epoch": 2.6887540830611294, "grad_norm": 0.2953912019729614, "learning_rate": 0.0002, "loss": 0.6124, "mean_token_accuracy": 0.7517426908016205, "num_tokens": 10436235.0, "step": 2882 }, { "entropy": 0.6161717623472214, "epoch": 2.6896873541763884, "grad_norm": 0.2624815106391907, "learning_rate": 0.0002, "loss": 0.6133, "mean_token_accuracy": 0.7494934499263763, "num_tokens": 10439987.0, "step": 2883 }, { "entropy": 0.5995090305805206, "epoch": 2.6906206252916474, "grad_norm": 0.3041272461414337, "learning_rate": 0.0002, "loss": 0.6069, "mean_token_accuracy": 0.750388965010643, "num_tokens": 10443588.0, "step": 2884 }, { "entropy": 0.6038102805614471, "epoch": 2.6915538964069063, "grad_norm": 0.32551416754722595, "learning_rate": 0.0002, "loss": 0.6161, "mean_token_accuracy": 0.7478233128786087, "num_tokens": 10447248.0, "step": 2885 }, { "entropy": 0.5852050930261612, "epoch": 2.6924871675221653, "grad_norm": 0.2586779296398163, "learning_rate": 0.0002, "loss": 0.5888, "mean_token_accuracy": 0.7643430680036545, "num_tokens": 10450867.0, "step": 2886 }, { "entropy": 0.606071874499321, "epoch": 2.6934204386374243, "grad_norm": 0.2516472637653351, "learning_rate": 0.0002, "loss": 0.6026, "mean_token_accuracy": 0.7562364190816879, "num_tokens": 10454593.0, "step": 2887 }, { "entropy": 0.5784239172935486, "epoch": 2.6943537097526833, "grad_norm": 0.2900892198085785, "learning_rate": 0.0002, "loss": 0.5786, "mean_token_accuracy": 0.7627390176057816, "num_tokens": 10458080.0, "step": 2888 }, { "entropy": 0.5875871777534485, "epoch": 2.6952869808679423, "grad_norm": 0.2971639633178711, "learning_rate": 0.0002, "loss": 0.5885, "mean_token_accuracy": 0.7595333904027939, "num_tokens": 10461670.0, "step": 2889 }, { "entropy": 0.6040884256362915, "epoch": 2.6962202519832013, "grad_norm": 0.28326091170310974, "learning_rate": 0.0002, "loss": 0.6133, "mean_token_accuracy": 0.7479574531316757, "num_tokens": 10465276.0, "step": 2890 }, { "entropy": 0.5868431776762009, "epoch": 2.6971535230984602, "grad_norm": 0.29593291878700256, "learning_rate": 0.0002, "loss": 0.5981, "mean_token_accuracy": 0.7533126473426819, "num_tokens": 10468810.0, "step": 2891 }, { "entropy": 0.6047821193933487, "epoch": 2.698086794213719, "grad_norm": 0.2939571142196655, "learning_rate": 0.0002, "loss": 0.6099, "mean_token_accuracy": 0.7527885138988495, "num_tokens": 10472393.0, "step": 2892 }, { "entropy": 0.5825858861207962, "epoch": 2.699020065328978, "grad_norm": 0.32170581817626953, "learning_rate": 0.0002, "loss": 0.602, "mean_token_accuracy": 0.7530898898839951, "num_tokens": 10475945.0, "step": 2893 }, { "entropy": 0.6181543320417404, "epoch": 2.699953336444237, "grad_norm": 0.3502485752105713, "learning_rate": 0.0002, "loss": 0.6401, "mean_token_accuracy": 0.738127201795578, "num_tokens": 10479475.0, "step": 2894 }, { "entropy": 0.5812874734401703, "epoch": 2.700886607559496, "grad_norm": 0.28191253542900085, "learning_rate": 0.0002, "loss": 0.5829, "mean_token_accuracy": 0.7658970952033997, "num_tokens": 10483100.0, "step": 2895 }, { "entropy": 0.6026106029748917, "epoch": 2.701819878674755, "grad_norm": 0.2362523227930069, "learning_rate": 0.0002, "loss": 0.5954, "mean_token_accuracy": 0.7574286460876465, "num_tokens": 10486743.0, "step": 2896 }, { "entropy": 0.5785082876682281, "epoch": 2.702753149790014, "grad_norm": 0.2574014663696289, "learning_rate": 0.0002, "loss": 0.566, "mean_token_accuracy": 0.7770560383796692, "num_tokens": 10490195.0, "step": 2897 }, { "entropy": 0.6241272687911987, "epoch": 2.703686420905273, "grad_norm": 0.2386685162782669, "learning_rate": 0.0002, "loss": 0.6148, "mean_token_accuracy": 0.7518585920333862, "num_tokens": 10493862.0, "step": 2898 }, { "entropy": 0.6010641306638718, "epoch": 2.704619692020532, "grad_norm": 0.3298874497413635, "learning_rate": 0.0002, "loss": 0.602, "mean_token_accuracy": 0.7561163604259491, "num_tokens": 10497537.0, "step": 2899 }, { "entropy": 0.5843217223882675, "epoch": 2.705552963135791, "grad_norm": 0.27196282148361206, "learning_rate": 0.0002, "loss": 0.5898, "mean_token_accuracy": 0.7665914595127106, "num_tokens": 10501093.0, "step": 2900 }, { "entropy": 0.5856528431177139, "epoch": 2.70648623425105, "grad_norm": 0.27931198477745056, "learning_rate": 0.0002, "loss": 0.5951, "mean_token_accuracy": 0.7573636621236801, "num_tokens": 10504678.0, "step": 2901 }, { "entropy": 0.5579489022493362, "epoch": 2.707419505366309, "grad_norm": 0.32525068521499634, "learning_rate": 0.0002, "loss": 0.5618, "mean_token_accuracy": 0.7698354423046112, "num_tokens": 10508282.0, "step": 2902 }, { "entropy": 0.5635915845632553, "epoch": 2.708352776481568, "grad_norm": 0.288144588470459, "learning_rate": 0.0002, "loss": 0.5703, "mean_token_accuracy": 0.770320475101471, "num_tokens": 10512091.0, "step": 2903 }, { "entropy": 0.5777433663606644, "epoch": 2.709286047596827, "grad_norm": 0.24926602840423584, "learning_rate": 0.0002, "loss": 0.5793, "mean_token_accuracy": 0.7711969465017319, "num_tokens": 10515792.0, "step": 2904 }, { "entropy": 0.5955610573291779, "epoch": 2.710219318712086, "grad_norm": 0.33033567667007446, "learning_rate": 0.0002, "loss": 0.6153, "mean_token_accuracy": 0.751881942152977, "num_tokens": 10519452.0, "step": 2905 }, { "entropy": 0.5917357057332993, "epoch": 2.711152589827345, "grad_norm": 0.29397618770599365, "learning_rate": 0.0002, "loss": 0.605, "mean_token_accuracy": 0.7554068118333817, "num_tokens": 10523025.0, "step": 2906 }, { "entropy": 0.5803124457597733, "epoch": 2.712085860942604, "grad_norm": 0.30005812644958496, "learning_rate": 0.0002, "loss": 0.5681, "mean_token_accuracy": 0.7750290781259537, "num_tokens": 10526654.0, "step": 2907 }, { "entropy": 0.5863052755594254, "epoch": 2.713019132057863, "grad_norm": 0.34019115567207336, "learning_rate": 0.0002, "loss": 0.5959, "mean_token_accuracy": 0.7554391175508499, "num_tokens": 10530316.0, "step": 2908 }, { "entropy": 0.5837038457393646, "epoch": 2.713952403173122, "grad_norm": 0.29114413261413574, "learning_rate": 0.0002, "loss": 0.587, "mean_token_accuracy": 0.765812948346138, "num_tokens": 10534027.0, "step": 2909 }, { "entropy": 0.6037418246269226, "epoch": 2.714885674288381, "grad_norm": 0.329889178276062, "learning_rate": 0.0002, "loss": 0.6116, "mean_token_accuracy": 0.7550014555454254, "num_tokens": 10537643.0, "step": 2910 }, { "entropy": 0.5827833116054535, "epoch": 2.71581894540364, "grad_norm": 0.28896641731262207, "learning_rate": 0.0002, "loss": 0.5815, "mean_token_accuracy": 0.7624285519123077, "num_tokens": 10541151.0, "step": 2911 }, { "entropy": 0.6367050856351852, "epoch": 2.716752216518899, "grad_norm": 0.30167537927627563, "learning_rate": 0.0002, "loss": 0.6376, "mean_token_accuracy": 0.7431788295507431, "num_tokens": 10544838.0, "step": 2912 }, { "entropy": 0.5877486169338226, "epoch": 2.717685487634158, "grad_norm": 0.27358439564704895, "learning_rate": 0.0002, "loss": 0.5886, "mean_token_accuracy": 0.759221613407135, "num_tokens": 10548502.0, "step": 2913 }, { "entropy": 0.5658725649118423, "epoch": 2.718618758749417, "grad_norm": 0.24378331005573273, "learning_rate": 0.0002, "loss": 0.5657, "mean_token_accuracy": 0.7681344300508499, "num_tokens": 10552218.0, "step": 2914 }, { "entropy": 0.5605893433094025, "epoch": 2.719552029864676, "grad_norm": 0.4195540249347687, "learning_rate": 0.0002, "loss": 0.5711, "mean_token_accuracy": 0.7692812979221344, "num_tokens": 10555828.0, "step": 2915 }, { "entropy": 0.6098630130290985, "epoch": 2.720485300979935, "grad_norm": 0.32178741693496704, "learning_rate": 0.0002, "loss": 0.6184, "mean_token_accuracy": 0.7516336739063263, "num_tokens": 10559443.0, "step": 2916 }, { "entropy": 0.6262217909097672, "epoch": 2.7214185720951938, "grad_norm": 0.3739950656890869, "learning_rate": 0.0002, "loss": 0.6271, "mean_token_accuracy": 0.7503859102725983, "num_tokens": 10563222.0, "step": 2917 }, { "entropy": 0.6106446832418442, "epoch": 2.7223518432104528, "grad_norm": 0.2909451723098755, "learning_rate": 0.0002, "loss": 0.6229, "mean_token_accuracy": 0.7516937255859375, "num_tokens": 10566964.0, "step": 2918 }, { "entropy": 0.6023547798395157, "epoch": 2.7232851143257117, "grad_norm": 0.306405633687973, "learning_rate": 0.0002, "loss": 0.6026, "mean_token_accuracy": 0.7593949437141418, "num_tokens": 10570616.0, "step": 2919 }, { "entropy": 0.5785899311304092, "epoch": 2.7242183854409707, "grad_norm": 0.2683722972869873, "learning_rate": 0.0002, "loss": 0.5759, "mean_token_accuracy": 0.7654569298028946, "num_tokens": 10574408.0, "step": 2920 }, { "entropy": 0.616819828748703, "epoch": 2.7251516565562297, "grad_norm": 0.26981666684150696, "learning_rate": 0.0002, "loss": 0.609, "mean_token_accuracy": 0.7519167959690094, "num_tokens": 10578093.0, "step": 2921 }, { "entropy": 0.5975365489721298, "epoch": 2.7260849276714887, "grad_norm": 0.26279106736183167, "learning_rate": 0.0002, "loss": 0.5963, "mean_token_accuracy": 0.7563955932855606, "num_tokens": 10581758.0, "step": 2922 }, { "entropy": 0.6042553782463074, "epoch": 2.7270181987867477, "grad_norm": 0.3000021278858185, "learning_rate": 0.0002, "loss": 0.6025, "mean_token_accuracy": 0.7621610164642334, "num_tokens": 10585296.0, "step": 2923 }, { "entropy": 0.5889676064252853, "epoch": 2.7279514699020067, "grad_norm": 0.29016634821891785, "learning_rate": 0.0002, "loss": 0.5915, "mean_token_accuracy": 0.7548023760318756, "num_tokens": 10588987.0, "step": 2924 }, { "entropy": 0.6118484139442444, "epoch": 2.7288847410172656, "grad_norm": 0.32010239362716675, "learning_rate": 0.0002, "loss": 0.6323, "mean_token_accuracy": 0.7486202120780945, "num_tokens": 10592645.0, "step": 2925 }, { "entropy": 0.5839240401983261, "epoch": 2.7298180121325246, "grad_norm": 0.32657620310783386, "learning_rate": 0.0002, "loss": 0.6076, "mean_token_accuracy": 0.7493649274110794, "num_tokens": 10596308.0, "step": 2926 }, { "entropy": 0.6039933860301971, "epoch": 2.7307512832477836, "grad_norm": 0.39051195979118347, "learning_rate": 0.0002, "loss": 0.619, "mean_token_accuracy": 0.7506439834833145, "num_tokens": 10599973.0, "step": 2927 }, { "entropy": 0.5860694199800491, "epoch": 2.7316845543630426, "grad_norm": 0.2892131507396698, "learning_rate": 0.0002, "loss": 0.6108, "mean_token_accuracy": 0.751648560166359, "num_tokens": 10603591.0, "step": 2928 }, { "entropy": 0.6056053042411804, "epoch": 2.7326178254783016, "grad_norm": 0.3070290982723236, "learning_rate": 0.0002, "loss": 0.6122, "mean_token_accuracy": 0.7498381435871124, "num_tokens": 10607205.0, "step": 2929 }, { "entropy": 0.5966543406248093, "epoch": 2.7335510965935605, "grad_norm": 0.3304140865802765, "learning_rate": 0.0002, "loss": 0.5979, "mean_token_accuracy": 0.7601138055324554, "num_tokens": 10610853.0, "step": 2930 }, { "entropy": 0.5859653055667877, "epoch": 2.7344843677088195, "grad_norm": 0.27733227610588074, "learning_rate": 0.0002, "loss": 0.58, "mean_token_accuracy": 0.7691010683774948, "num_tokens": 10614534.0, "step": 2931 }, { "entropy": 0.6315150260925293, "epoch": 2.7354176388240785, "grad_norm": 0.334257036447525, "learning_rate": 0.0002, "loss": 0.6282, "mean_token_accuracy": 0.740838810801506, "num_tokens": 10618073.0, "step": 2932 }, { "entropy": 0.618027463555336, "epoch": 2.7363509099393375, "grad_norm": 0.2828821539878845, "learning_rate": 0.0002, "loss": 0.6259, "mean_token_accuracy": 0.7406304329633713, "num_tokens": 10621717.0, "step": 2933 }, { "entropy": 0.6097659021615982, "epoch": 2.7372841810545965, "grad_norm": 0.2637903094291687, "learning_rate": 0.0002, "loss": 0.6156, "mean_token_accuracy": 0.7479549646377563, "num_tokens": 10625295.0, "step": 2934 }, { "entropy": 0.6298611462116241, "epoch": 2.7382174521698555, "grad_norm": 0.3338701128959656, "learning_rate": 0.0002, "loss": 0.6369, "mean_token_accuracy": 0.7407689839601517, "num_tokens": 10628965.0, "step": 2935 }, { "entropy": 0.5930904448032379, "epoch": 2.7391507232851144, "grad_norm": 0.26684457063674927, "learning_rate": 0.0002, "loss": 0.5888, "mean_token_accuracy": 0.76059889793396, "num_tokens": 10632682.0, "step": 2936 }, { "entropy": 0.6065693199634552, "epoch": 2.7400839944003734, "grad_norm": 0.31224575638771057, "learning_rate": 0.0002, "loss": 0.6108, "mean_token_accuracy": 0.755589172244072, "num_tokens": 10636251.0, "step": 2937 }, { "entropy": 0.5806271433830261, "epoch": 2.7410172655156324, "grad_norm": 0.28885170817375183, "learning_rate": 0.0002, "loss": 0.5861, "mean_token_accuracy": 0.7581702023744583, "num_tokens": 10639877.0, "step": 2938 }, { "entropy": 0.6308843940496445, "epoch": 2.7419505366308914, "grad_norm": 0.27065643668174744, "learning_rate": 0.0002, "loss": 0.6268, "mean_token_accuracy": 0.7483752369880676, "num_tokens": 10643632.0, "step": 2939 }, { "entropy": 0.6228572130203247, "epoch": 2.7428838077461504, "grad_norm": 0.28785815834999084, "learning_rate": 0.0002, "loss": 0.6387, "mean_token_accuracy": 0.7369548082351685, "num_tokens": 10647385.0, "step": 2940 }, { "entropy": 0.5849888622760773, "epoch": 2.7438170788614094, "grad_norm": 0.2576548159122467, "learning_rate": 0.0002, "loss": 0.583, "mean_token_accuracy": 0.761383593082428, "num_tokens": 10650980.0, "step": 2941 }, { "entropy": 0.6224680989980698, "epoch": 2.7447503499766683, "grad_norm": 0.27243372797966003, "learning_rate": 0.0002, "loss": 0.6274, "mean_token_accuracy": 0.7481716424226761, "num_tokens": 10654595.0, "step": 2942 }, { "entropy": 0.5671621859073639, "epoch": 2.7456836210919273, "grad_norm": 0.2873936891555786, "learning_rate": 0.0002, "loss": 0.5815, "mean_token_accuracy": 0.7632164806127548, "num_tokens": 10658213.0, "step": 2943 }, { "entropy": 0.5991893708705902, "epoch": 2.7466168922071863, "grad_norm": 0.27787327766418457, "learning_rate": 0.0002, "loss": 0.6152, "mean_token_accuracy": 0.753093808889389, "num_tokens": 10661847.0, "step": 2944 }, { "entropy": 0.6020065546035767, "epoch": 2.7475501633224453, "grad_norm": 0.27484145760536194, "learning_rate": 0.0002, "loss": 0.6025, "mean_token_accuracy": 0.7526761889457703, "num_tokens": 10665602.0, "step": 2945 }, { "entropy": 0.5944593399763107, "epoch": 2.7484834344377043, "grad_norm": 0.29162275791168213, "learning_rate": 0.0002, "loss": 0.5997, "mean_token_accuracy": 0.7575049102306366, "num_tokens": 10669291.0, "step": 2946 }, { "entropy": 0.5943544209003448, "epoch": 2.7494167055529632, "grad_norm": 0.24354708194732666, "learning_rate": 0.0002, "loss": 0.5969, "mean_token_accuracy": 0.7584688514471054, "num_tokens": 10672994.0, "step": 2947 }, { "entropy": 0.6097064018249512, "epoch": 2.7503499766682222, "grad_norm": 0.2739247977733612, "learning_rate": 0.0002, "loss": 0.6037, "mean_token_accuracy": 0.7543578743934631, "num_tokens": 10676645.0, "step": 2948 }, { "entropy": 0.6075806319713593, "epoch": 2.751283247783481, "grad_norm": 0.284956157207489, "learning_rate": 0.0002, "loss": 0.606, "mean_token_accuracy": 0.7508084326982498, "num_tokens": 10680271.0, "step": 2949 }, { "entropy": 0.6073503941297531, "epoch": 2.75221651889874, "grad_norm": 0.2952117621898651, "learning_rate": 0.0002, "loss": 0.6063, "mean_token_accuracy": 0.7552280575037003, "num_tokens": 10683781.0, "step": 2950 }, { "entropy": 0.5418258011341095, "epoch": 2.753149790013999, "grad_norm": 0.27438753843307495, "learning_rate": 0.0002, "loss": 0.551, "mean_token_accuracy": 0.7752265483140945, "num_tokens": 10687211.0, "step": 2951 }, { "entropy": 0.6081000119447708, "epoch": 2.754083061129258, "grad_norm": 0.30617859959602356, "learning_rate": 0.0002, "loss": 0.6128, "mean_token_accuracy": 0.7573294341564178, "num_tokens": 10690788.0, "step": 2952 }, { "entropy": 0.5462359040975571, "epoch": 2.755016332244517, "grad_norm": 0.3775222599506378, "learning_rate": 0.0002, "loss": 0.5598, "mean_token_accuracy": 0.7731146663427353, "num_tokens": 10694410.0, "step": 2953 }, { "entropy": 0.6360747814178467, "epoch": 2.755949603359776, "grad_norm": 0.33963698148727417, "learning_rate": 0.0002, "loss": 0.6535, "mean_token_accuracy": 0.7397075146436691, "num_tokens": 10698030.0, "step": 2954 }, { "entropy": 0.5817694067955017, "epoch": 2.756882874475035, "grad_norm": 0.2698473632335663, "learning_rate": 0.0002, "loss": 0.5899, "mean_token_accuracy": 0.7527592033147812, "num_tokens": 10701675.0, "step": 2955 }, { "entropy": 0.6118921488523483, "epoch": 2.757816145590294, "grad_norm": 0.3705582022666931, "learning_rate": 0.0002, "loss": 0.62, "mean_token_accuracy": 0.7487671822309494, "num_tokens": 10705344.0, "step": 2956 }, { "entropy": 0.6371321231126785, "epoch": 2.758749416705553, "grad_norm": 0.3526270389556885, "learning_rate": 0.0002, "loss": 0.6472, "mean_token_accuracy": 0.739277184009552, "num_tokens": 10708896.0, "step": 2957 }, { "entropy": 0.5942284315824509, "epoch": 2.759682687820812, "grad_norm": 0.312301903963089, "learning_rate": 0.0002, "loss": 0.5978, "mean_token_accuracy": 0.757733628153801, "num_tokens": 10712468.0, "step": 2958 }, { "entropy": 0.5653786957263947, "epoch": 2.760615958936071, "grad_norm": 0.25120967626571655, "learning_rate": 0.0002, "loss": 0.5656, "mean_token_accuracy": 0.7663618475198746, "num_tokens": 10715994.0, "step": 2959 }, { "entropy": 0.6317610442638397, "epoch": 2.76154923005133, "grad_norm": 0.2885478436946869, "learning_rate": 0.0002, "loss": 0.6242, "mean_token_accuracy": 0.7406086623668671, "num_tokens": 10719652.0, "step": 2960 }, { "entropy": 0.6086512953042984, "epoch": 2.762482501166589, "grad_norm": 0.31343623995780945, "learning_rate": 0.0002, "loss": 0.6084, "mean_token_accuracy": 0.7524770200252533, "num_tokens": 10723336.0, "step": 2961 }, { "entropy": 0.62699294090271, "epoch": 2.763415772281848, "grad_norm": 0.2678343951702118, "learning_rate": 0.0002, "loss": 0.6235, "mean_token_accuracy": 0.7460043728351593, "num_tokens": 10727035.0, "step": 2962 }, { "entropy": 0.589777797460556, "epoch": 2.764349043397107, "grad_norm": 0.2840663194656372, "learning_rate": 0.0002, "loss": 0.5871, "mean_token_accuracy": 0.7614950686693192, "num_tokens": 10730695.0, "step": 2963 }, { "entropy": 0.5731484591960907, "epoch": 2.765282314512366, "grad_norm": 0.2616406977176666, "learning_rate": 0.0002, "loss": 0.5721, "mean_token_accuracy": 0.7625556588172913, "num_tokens": 10734243.0, "step": 2964 }, { "entropy": 0.5888113975524902, "epoch": 2.766215585627625, "grad_norm": 0.3112415671348572, "learning_rate": 0.0002, "loss": 0.5919, "mean_token_accuracy": 0.7621708512306213, "num_tokens": 10737981.0, "step": 2965 }, { "entropy": 0.6290577799081802, "epoch": 2.767148856742884, "grad_norm": 0.27939915657043457, "learning_rate": 0.0002, "loss": 0.6285, "mean_token_accuracy": 0.7459504753351212, "num_tokens": 10741809.0, "step": 2966 }, { "entropy": 0.6158834397792816, "epoch": 2.768082127858143, "grad_norm": 0.2763204276561737, "learning_rate": 0.0002, "loss": 0.6207, "mean_token_accuracy": 0.7523501664400101, "num_tokens": 10745469.0, "step": 2967 }, { "entropy": 0.5768440961837769, "epoch": 2.769015398973402, "grad_norm": 0.3156189024448395, "learning_rate": 0.0002, "loss": 0.5888, "mean_token_accuracy": 0.7581243962049484, "num_tokens": 10749090.0, "step": 2968 }, { "entropy": 0.5885700434446335, "epoch": 2.769948670088661, "grad_norm": 0.2857017517089844, "learning_rate": 0.0002, "loss": 0.5927, "mean_token_accuracy": 0.7630103379487991, "num_tokens": 10752641.0, "step": 2969 }, { "entropy": 0.5868904292583466, "epoch": 2.77088194120392, "grad_norm": 0.2861466407775879, "learning_rate": 0.0002, "loss": 0.5835, "mean_token_accuracy": 0.7670445591211319, "num_tokens": 10756268.0, "step": 2970 }, { "entropy": 0.6281893998384476, "epoch": 2.771815212319179, "grad_norm": 0.3037635385990143, "learning_rate": 0.0002, "loss": 0.624, "mean_token_accuracy": 0.7526396661996841, "num_tokens": 10760037.0, "step": 2971 }, { "entropy": 0.5693120062351227, "epoch": 2.772748483434438, "grad_norm": 0.27967557311058044, "learning_rate": 0.0002, "loss": 0.5749, "mean_token_accuracy": 0.7655276656150818, "num_tokens": 10763680.0, "step": 2972 }, { "entropy": 0.6388155221939087, "epoch": 2.773681754549697, "grad_norm": 0.3254161477088928, "learning_rate": 0.0002, "loss": 0.6415, "mean_token_accuracy": 0.7386961579322815, "num_tokens": 10767371.0, "step": 2973 }, { "entropy": 0.602486327290535, "epoch": 2.7746150256649558, "grad_norm": 0.26905447244644165, "learning_rate": 0.0002, "loss": 0.597, "mean_token_accuracy": 0.7635650783777237, "num_tokens": 10771072.0, "step": 2974 }, { "entropy": 0.5866997390985489, "epoch": 2.7755482967802148, "grad_norm": 0.3131217658519745, "learning_rate": 0.0002, "loss": 0.6077, "mean_token_accuracy": 0.7582214325666428, "num_tokens": 10774752.0, "step": 2975 }, { "entropy": 0.5853118747472763, "epoch": 2.7764815678954737, "grad_norm": 0.2787522077560425, "learning_rate": 0.0002, "loss": 0.5966, "mean_token_accuracy": 0.7650163471698761, "num_tokens": 10778421.0, "step": 2976 }, { "entropy": 0.5818798393011093, "epoch": 2.7774148390107327, "grad_norm": 0.2941856384277344, "learning_rate": 0.0002, "loss": 0.588, "mean_token_accuracy": 0.7583233118057251, "num_tokens": 10782061.0, "step": 2977 }, { "entropy": 0.5919815301895142, "epoch": 2.7783481101259917, "grad_norm": 0.2744908630847931, "learning_rate": 0.0002, "loss": 0.5961, "mean_token_accuracy": 0.758191779255867, "num_tokens": 10785725.0, "step": 2978 }, { "entropy": 0.659652516245842, "epoch": 2.7792813812412507, "grad_norm": 0.24751022458076477, "learning_rate": 0.0002, "loss": 0.6531, "mean_token_accuracy": 0.7355669289827347, "num_tokens": 10789414.0, "step": 2979 }, { "entropy": 0.5855045020580292, "epoch": 2.7802146523565097, "grad_norm": 0.31181931495666504, "learning_rate": 0.0002, "loss": 0.5916, "mean_token_accuracy": 0.7535517364740372, "num_tokens": 10792928.0, "step": 2980 }, { "entropy": 0.5920361876487732, "epoch": 2.7811479234717686, "grad_norm": 0.29160523414611816, "learning_rate": 0.0002, "loss": 0.5908, "mean_token_accuracy": 0.7594924867153168, "num_tokens": 10796583.0, "step": 2981 }, { "entropy": 0.624497577548027, "epoch": 2.7820811945870276, "grad_norm": 0.2895045578479767, "learning_rate": 0.0002, "loss": 0.6191, "mean_token_accuracy": 0.7464143633842468, "num_tokens": 10800161.0, "step": 2982 }, { "entropy": 0.5761651396751404, "epoch": 2.7830144657022866, "grad_norm": 0.22196923196315765, "learning_rate": 0.0002, "loss": 0.5682, "mean_token_accuracy": 0.7715012282133102, "num_tokens": 10803890.0, "step": 2983 }, { "entropy": 0.5708905905485153, "epoch": 2.7839477368175456, "grad_norm": 0.27918311953544617, "learning_rate": 0.0002, "loss": 0.5702, "mean_token_accuracy": 0.7752271592617035, "num_tokens": 10807519.0, "step": 2984 }, { "entropy": 0.5538100898265839, "epoch": 2.7848810079328046, "grad_norm": 0.31265389919281006, "learning_rate": 0.0002, "loss": 0.5637, "mean_token_accuracy": 0.7703826725482941, "num_tokens": 10811081.0, "step": 2985 }, { "entropy": 0.5846253335475922, "epoch": 2.7858142790480636, "grad_norm": 0.28502726554870605, "learning_rate": 0.0002, "loss": 0.5922, "mean_token_accuracy": 0.7598684877157211, "num_tokens": 10814754.0, "step": 2986 }, { "entropy": 0.5778238922357559, "epoch": 2.7867475501633225, "grad_norm": 0.37309730052948, "learning_rate": 0.0002, "loss": 0.5931, "mean_token_accuracy": 0.7582359313964844, "num_tokens": 10818467.0, "step": 2987 }, { "entropy": 0.5857134908437729, "epoch": 2.7876808212785815, "grad_norm": 0.31000396609306335, "learning_rate": 0.0002, "loss": 0.61, "mean_token_accuracy": 0.7531865537166595, "num_tokens": 10821955.0, "step": 2988 }, { "entropy": 0.6222389042377472, "epoch": 2.7886140923938405, "grad_norm": 0.3169272840023041, "learning_rate": 0.0002, "loss": 0.6284, "mean_token_accuracy": 0.7481012344360352, "num_tokens": 10825589.0, "step": 2989 }, { "entropy": 0.5992244631052017, "epoch": 2.7895473635090995, "grad_norm": 0.29916319251060486, "learning_rate": 0.0002, "loss": 0.6032, "mean_token_accuracy": 0.7592369019985199, "num_tokens": 10829371.0, "step": 2990 }, { "entropy": 0.5603584051132202, "epoch": 2.7904806346243585, "grad_norm": 0.28008902072906494, "learning_rate": 0.0002, "loss": 0.573, "mean_token_accuracy": 0.7706271260976791, "num_tokens": 10832970.0, "step": 2991 }, { "entropy": 0.6236664652824402, "epoch": 2.7914139057396175, "grad_norm": 0.2600453794002533, "learning_rate": 0.0002, "loss": 0.604, "mean_token_accuracy": 0.7587350308895111, "num_tokens": 10836634.0, "step": 2992 }, { "entropy": 0.6171578764915466, "epoch": 2.7923471768548764, "grad_norm": 0.292451947927475, "learning_rate": 0.0002, "loss": 0.6076, "mean_token_accuracy": 0.7526761144399643, "num_tokens": 10840275.0, "step": 2993 }, { "entropy": 0.6213969439268112, "epoch": 2.7932804479701354, "grad_norm": 0.43007370829582214, "learning_rate": 0.0002, "loss": 0.6452, "mean_token_accuracy": 0.7424769550561905, "num_tokens": 10843865.0, "step": 2994 }, { "entropy": 0.6172437965869904, "epoch": 2.7942137190853944, "grad_norm": 0.2701164186000824, "learning_rate": 0.0002, "loss": 0.622, "mean_token_accuracy": 0.7445826977491379, "num_tokens": 10847344.0, "step": 2995 }, { "entropy": 0.5982016623020172, "epoch": 2.7951469902006534, "grad_norm": 0.22846569120883942, "learning_rate": 0.0002, "loss": 0.5859, "mean_token_accuracy": 0.7678139507770538, "num_tokens": 10850912.0, "step": 2996 }, { "entropy": 0.609807476401329, "epoch": 2.7960802613159124, "grad_norm": 0.34953078627586365, "learning_rate": 0.0002, "loss": 0.6127, "mean_token_accuracy": 0.757586270570755, "num_tokens": 10854651.0, "step": 2997 }, { "entropy": 0.59034164249897, "epoch": 2.7970135324311713, "grad_norm": 0.2683144211769104, "learning_rate": 0.0002, "loss": 0.5906, "mean_token_accuracy": 0.7609061300754547, "num_tokens": 10858224.0, "step": 2998 }, { "entropy": 0.6149503439664841, "epoch": 2.7979468035464303, "grad_norm": 0.2436961680650711, "learning_rate": 0.0002, "loss": 0.6159, "mean_token_accuracy": 0.7480558454990387, "num_tokens": 10861878.0, "step": 2999 }, { "entropy": 0.569657027721405, "epoch": 2.7988800746616893, "grad_norm": 0.3095519542694092, "learning_rate": 0.0002, "loss": 0.5723, "mean_token_accuracy": 0.7612144500017166, "num_tokens": 10865527.0, "step": 3000 }, { "entropy": 0.5959553569555283, "epoch": 2.7998133457769483, "grad_norm": 0.33972373604774475, "learning_rate": 0.0002, "loss": 0.6146, "mean_token_accuracy": 0.7478853166103363, "num_tokens": 10869092.0, "step": 3001 }, { "entropy": 0.6036671996116638, "epoch": 2.8007466168922073, "grad_norm": 0.30624276399612427, "learning_rate": 0.0002, "loss": 0.6022, "mean_token_accuracy": 0.7543076872825623, "num_tokens": 10872705.0, "step": 3002 }, { "entropy": 0.5952155888080597, "epoch": 2.8016798880074663, "grad_norm": 0.380785197019577, "learning_rate": 0.0002, "loss": 0.6073, "mean_token_accuracy": 0.7541253566741943, "num_tokens": 10876270.0, "step": 3003 }, { "entropy": 0.5990408360958099, "epoch": 2.8026131591227252, "grad_norm": 0.28005507588386536, "learning_rate": 0.0002, "loss": 0.5987, "mean_token_accuracy": 0.7504477947950363, "num_tokens": 10879816.0, "step": 3004 }, { "entropy": 0.5701583921909332, "epoch": 2.803546430237984, "grad_norm": 0.3437856435775757, "learning_rate": 0.0002, "loss": 0.5855, "mean_token_accuracy": 0.7656089067459106, "num_tokens": 10883344.0, "step": 3005 }, { "entropy": 0.6003424227237701, "epoch": 2.804479701353243, "grad_norm": 0.32988038659095764, "learning_rate": 0.0002, "loss": 0.6024, "mean_token_accuracy": 0.7579247802495956, "num_tokens": 10886951.0, "step": 3006 }, { "entropy": 0.6179942339658737, "epoch": 2.805412972468502, "grad_norm": 0.3693266808986664, "learning_rate": 0.0002, "loss": 0.6267, "mean_token_accuracy": 0.7477086931467056, "num_tokens": 10890605.0, "step": 3007 }, { "entropy": 0.5977237075567245, "epoch": 2.806346243583761, "grad_norm": 0.33087897300720215, "learning_rate": 0.0002, "loss": 0.6061, "mean_token_accuracy": 0.7496107816696167, "num_tokens": 10894121.0, "step": 3008 }, { "entropy": 0.5956491231918335, "epoch": 2.80727951469902, "grad_norm": 0.2709832489490509, "learning_rate": 0.0002, "loss": 0.5955, "mean_token_accuracy": 0.7606271356344223, "num_tokens": 10897637.0, "step": 3009 }, { "entropy": 0.5521785393357277, "epoch": 2.808212785814279, "grad_norm": 0.330720454454422, "learning_rate": 0.0002, "loss": 0.5703, "mean_token_accuracy": 0.7710676044225693, "num_tokens": 10901277.0, "step": 3010 }, { "entropy": 0.6137790679931641, "epoch": 2.809146056929538, "grad_norm": 0.30881884694099426, "learning_rate": 0.0002, "loss": 0.6179, "mean_token_accuracy": 0.7462268024682999, "num_tokens": 10904997.0, "step": 3011 }, { "entropy": 0.6221829354763031, "epoch": 2.810079328044797, "grad_norm": 0.2928849458694458, "learning_rate": 0.0002, "loss": 0.6216, "mean_token_accuracy": 0.748352900147438, "num_tokens": 10908717.0, "step": 3012 }, { "entropy": 0.5521133989095688, "epoch": 2.811012599160056, "grad_norm": 0.2634420394897461, "learning_rate": 0.0002, "loss": 0.5451, "mean_token_accuracy": 0.7805008888244629, "num_tokens": 10912401.0, "step": 3013 }, { "entropy": 0.5794743001461029, "epoch": 2.811945870275315, "grad_norm": 0.28334248065948486, "learning_rate": 0.0002, "loss": 0.5783, "mean_token_accuracy": 0.7645858228206635, "num_tokens": 10915915.0, "step": 3014 }, { "entropy": 0.5917574763298035, "epoch": 2.812879141390574, "grad_norm": 0.25822895765304565, "learning_rate": 0.0002, "loss": 0.5873, "mean_token_accuracy": 0.7652510553598404, "num_tokens": 10919713.0, "step": 3015 }, { "entropy": 0.6387159377336502, "epoch": 2.813812412505833, "grad_norm": 0.2981826364994049, "learning_rate": 0.0002, "loss": 0.6382, "mean_token_accuracy": 0.7386875450611115, "num_tokens": 10923281.0, "step": 3016 }, { "entropy": 0.6396109461784363, "epoch": 2.814745683621092, "grad_norm": 0.3590371310710907, "learning_rate": 0.0002, "loss": 0.6508, "mean_token_accuracy": 0.7325965464115143, "num_tokens": 10926890.0, "step": 3017 }, { "entropy": 0.5827488601207733, "epoch": 2.815678954736351, "grad_norm": 0.27098560333251953, "learning_rate": 0.0002, "loss": 0.5869, "mean_token_accuracy": 0.761214017868042, "num_tokens": 10930506.0, "step": 3018 }, { "entropy": 0.5996869951486588, "epoch": 2.81661222585161, "grad_norm": 0.31620877981185913, "learning_rate": 0.0002, "loss": 0.6154, "mean_token_accuracy": 0.752584382891655, "num_tokens": 10934085.0, "step": 3019 }, { "entropy": 0.5946385264396667, "epoch": 2.817545496966869, "grad_norm": 0.3576335310935974, "learning_rate": 0.0002, "loss": 0.6148, "mean_token_accuracy": 0.7491792589426041, "num_tokens": 10937642.0, "step": 3020 }, { "entropy": 0.5822563022375107, "epoch": 2.818478768082128, "grad_norm": 0.33394455909729004, "learning_rate": 0.0002, "loss": 0.6022, "mean_token_accuracy": 0.7632328569889069, "num_tokens": 10941159.0, "step": 3021 }, { "entropy": 0.6090145111083984, "epoch": 2.819412039197387, "grad_norm": 0.32203930616378784, "learning_rate": 0.0002, "loss": 0.6177, "mean_token_accuracy": 0.7522079199552536, "num_tokens": 10944745.0, "step": 3022 }, { "entropy": 0.6276083588600159, "epoch": 2.820345310312646, "grad_norm": 0.26853078603744507, "learning_rate": 0.0002, "loss": 0.623, "mean_token_accuracy": 0.7508935034275055, "num_tokens": 10948389.0, "step": 3023 }, { "entropy": 0.618749737739563, "epoch": 2.821278581427905, "grad_norm": 0.24535082280635834, "learning_rate": 0.0002, "loss": 0.6179, "mean_token_accuracy": 0.7423107624053955, "num_tokens": 10952059.0, "step": 3024 }, { "entropy": 0.5896616280078888, "epoch": 2.822211852543164, "grad_norm": 0.2745663821697235, "learning_rate": 0.0002, "loss": 0.5869, "mean_token_accuracy": 0.7665961235761642, "num_tokens": 10955653.0, "step": 3025 }, { "entropy": 0.6081703901290894, "epoch": 2.823145123658423, "grad_norm": 0.2776983082294464, "learning_rate": 0.0002, "loss": 0.603, "mean_token_accuracy": 0.7537357956171036, "num_tokens": 10959197.0, "step": 3026 }, { "entropy": 0.6011470705270767, "epoch": 2.824078394773682, "grad_norm": 0.3054050803184509, "learning_rate": 0.0002, "loss": 0.613, "mean_token_accuracy": 0.7446547597646713, "num_tokens": 10962713.0, "step": 3027 }, { "entropy": 0.6303839385509491, "epoch": 2.825011665888941, "grad_norm": 0.29333779215812683, "learning_rate": 0.0002, "loss": 0.6292, "mean_token_accuracy": 0.7441856563091278, "num_tokens": 10966355.0, "step": 3028 }, { "entropy": 0.6078919619321823, "epoch": 2.8259449370042, "grad_norm": 0.2660228908061981, "learning_rate": 0.0002, "loss": 0.6054, "mean_token_accuracy": 0.7515535950660706, "num_tokens": 10970030.0, "step": 3029 }, { "entropy": 0.6134039908647537, "epoch": 2.826878208119459, "grad_norm": 0.3243391215801239, "learning_rate": 0.0002, "loss": 0.6181, "mean_token_accuracy": 0.7524002343416214, "num_tokens": 10973635.0, "step": 3030 }, { "entropy": 0.6012022346258163, "epoch": 2.8278114792347178, "grad_norm": 0.3261015713214874, "learning_rate": 0.0002, "loss": 0.6097, "mean_token_accuracy": 0.7540189027786255, "num_tokens": 10977246.0, "step": 3031 }, { "entropy": 0.5637366622686386, "epoch": 2.8287447503499767, "grad_norm": 0.35526609420776367, "learning_rate": 0.0002, "loss": 0.5649, "mean_token_accuracy": 0.7749943733215332, "num_tokens": 10980760.0, "step": 3032 }, { "entropy": 0.5802885890007019, "epoch": 2.8296780214652357, "grad_norm": 0.2770848870277405, "learning_rate": 0.0002, "loss": 0.582, "mean_token_accuracy": 0.76914943754673, "num_tokens": 10984332.0, "step": 3033 }, { "entropy": 0.6522209644317627, "epoch": 2.8306112925804947, "grad_norm": 0.2479674071073532, "learning_rate": 0.0002, "loss": 0.6443, "mean_token_accuracy": 0.7426959425210953, "num_tokens": 10988051.0, "step": 3034 }, { "entropy": 0.6159544736146927, "epoch": 2.8315445636957537, "grad_norm": 0.24436251819133759, "learning_rate": 0.0002, "loss": 0.6122, "mean_token_accuracy": 0.7463350147008896, "num_tokens": 10991631.0, "step": 3035 }, { "entropy": 0.6387726068496704, "epoch": 2.8324778348110127, "grad_norm": 0.2999413311481476, "learning_rate": 0.0002, "loss": 0.6546, "mean_token_accuracy": 0.7351182997226715, "num_tokens": 10995334.0, "step": 3036 }, { "entropy": 0.6088640242815018, "epoch": 2.8334111059262717, "grad_norm": 0.3186960220336914, "learning_rate": 0.0002, "loss": 0.6219, "mean_token_accuracy": 0.7545024454593658, "num_tokens": 10999005.0, "step": 3037 }, { "entropy": 0.6099793910980225, "epoch": 2.8343443770415306, "grad_norm": 0.2785789370536804, "learning_rate": 0.0002, "loss": 0.6199, "mean_token_accuracy": 0.7516246736049652, "num_tokens": 11002659.0, "step": 3038 }, { "entropy": 0.6111045330762863, "epoch": 2.8352776481567896, "grad_norm": 0.3087356984615326, "learning_rate": 0.0002, "loss": 0.6159, "mean_token_accuracy": 0.7518241703510284, "num_tokens": 11006185.0, "step": 3039 }, { "entropy": 0.5812557637691498, "epoch": 2.8362109192720486, "grad_norm": 0.2726166248321533, "learning_rate": 0.0002, "loss": 0.5824, "mean_token_accuracy": 0.7623242884874344, "num_tokens": 11009646.0, "step": 3040 }, { "entropy": 0.6010236293077469, "epoch": 2.8371441903873076, "grad_norm": 0.25601038336753845, "learning_rate": 0.0002, "loss": 0.6035, "mean_token_accuracy": 0.7541338950395584, "num_tokens": 11013356.0, "step": 3041 }, { "entropy": 0.5974919646978378, "epoch": 2.8380774615025666, "grad_norm": 0.3042868673801422, "learning_rate": 0.0002, "loss": 0.6015, "mean_token_accuracy": 0.7568031698465347, "num_tokens": 11016988.0, "step": 3042 }, { "entropy": 0.6054881364107132, "epoch": 2.8390107326178255, "grad_norm": 0.29635265469551086, "learning_rate": 0.0002, "loss": 0.6225, "mean_token_accuracy": 0.7510102391242981, "num_tokens": 11020633.0, "step": 3043 }, { "entropy": 0.5745657682418823, "epoch": 2.8399440037330845, "grad_norm": 0.31981948018074036, "learning_rate": 0.0002, "loss": 0.5738, "mean_token_accuracy": 0.7669635564088821, "num_tokens": 11024142.0, "step": 3044 }, { "entropy": 0.6573443859815598, "epoch": 2.8408772748483435, "grad_norm": 0.27270859479904175, "learning_rate": 0.0002, "loss": 0.6457, "mean_token_accuracy": 0.7435896843671799, "num_tokens": 11027938.0, "step": 3045 }, { "entropy": 0.5716906785964966, "epoch": 2.8418105459636025, "grad_norm": 0.21885700523853302, "learning_rate": 0.0002, "loss": 0.5667, "mean_token_accuracy": 0.7771565616130829, "num_tokens": 11031519.0, "step": 3046 }, { "entropy": 0.5991765558719635, "epoch": 2.8427438170788615, "grad_norm": 0.2715460956096649, "learning_rate": 0.0002, "loss": 0.5964, "mean_token_accuracy": 0.7582710087299347, "num_tokens": 11035283.0, "step": 3047 }, { "entropy": 0.5692508071660995, "epoch": 2.8436770881941205, "grad_norm": 0.3098483383655548, "learning_rate": 0.0002, "loss": 0.5803, "mean_token_accuracy": 0.7625603079795837, "num_tokens": 11038900.0, "step": 3048 }, { "entropy": 0.6233002841472626, "epoch": 2.8446103593093794, "grad_norm": 0.27842554450035095, "learning_rate": 0.0002, "loss": 0.6355, "mean_token_accuracy": 0.7472833842039108, "num_tokens": 11042521.0, "step": 3049 }, { "entropy": 0.5932607501745224, "epoch": 2.8455436304246384, "grad_norm": 0.33292368054389954, "learning_rate": 0.0002, "loss": 0.5911, "mean_token_accuracy": 0.7715035974979401, "num_tokens": 11046120.0, "step": 3050 }, { "entropy": 0.6061807125806808, "epoch": 2.8464769015398974, "grad_norm": 0.3082412779331207, "learning_rate": 0.0002, "loss": 0.6137, "mean_token_accuracy": 0.749010443687439, "num_tokens": 11049822.0, "step": 3051 }, { "entropy": 0.5543680489063263, "epoch": 2.8474101726551564, "grad_norm": 0.29380524158477783, "learning_rate": 0.0002, "loss": 0.5701, "mean_token_accuracy": 0.7683513462543488, "num_tokens": 11053372.0, "step": 3052 }, { "entropy": 0.572511687874794, "epoch": 2.8483434437704154, "grad_norm": 0.26173171401023865, "learning_rate": 0.0002, "loss": 0.5753, "mean_token_accuracy": 0.766379177570343, "num_tokens": 11057023.0, "step": 3053 }, { "entropy": 0.574095219373703, "epoch": 2.8492767148856744, "grad_norm": 0.2991386353969574, "learning_rate": 0.0002, "loss": 0.5811, "mean_token_accuracy": 0.7581657767295837, "num_tokens": 11060626.0, "step": 3054 }, { "entropy": 0.604459673166275, "epoch": 2.8502099860009333, "grad_norm": 0.2966041564941406, "learning_rate": 0.0002, "loss": 0.6161, "mean_token_accuracy": 0.7445806562900543, "num_tokens": 11064322.0, "step": 3055 }, { "entropy": 0.5833980441093445, "epoch": 2.8511432571161923, "grad_norm": 0.31823626160621643, "learning_rate": 0.0002, "loss": 0.5856, "mean_token_accuracy": 0.7622593939304352, "num_tokens": 11067949.0, "step": 3056 }, { "entropy": 0.5452151596546173, "epoch": 2.8520765282314513, "grad_norm": 0.260606050491333, "learning_rate": 0.0002, "loss": 0.546, "mean_token_accuracy": 0.7797724604606628, "num_tokens": 11071528.0, "step": 3057 }, { "entropy": 0.575703501701355, "epoch": 2.8530097993467103, "grad_norm": 0.30173808336257935, "learning_rate": 0.0002, "loss": 0.5839, "mean_token_accuracy": 0.7698642760515213, "num_tokens": 11075237.0, "step": 3058 }, { "entropy": 0.592119961977005, "epoch": 2.8539430704619693, "grad_norm": 0.27009323239326477, "learning_rate": 0.0002, "loss": 0.5983, "mean_token_accuracy": 0.757276862859726, "num_tokens": 11078792.0, "step": 3059 }, { "entropy": 0.5924768000841141, "epoch": 2.8548763415772282, "grad_norm": 0.30659160017967224, "learning_rate": 0.0002, "loss": 0.61, "mean_token_accuracy": 0.746891662478447, "num_tokens": 11082400.0, "step": 3060 }, { "entropy": 0.5920791774988174, "epoch": 2.8558096126924872, "grad_norm": 0.2921883761882782, "learning_rate": 0.0002, "loss": 0.6032, "mean_token_accuracy": 0.7552139759063721, "num_tokens": 11086161.0, "step": 3061 }, { "entropy": 0.5754950195550919, "epoch": 2.856742883807746, "grad_norm": 0.3075581192970276, "learning_rate": 0.0002, "loss": 0.5767, "mean_token_accuracy": 0.7668950408697128, "num_tokens": 11089806.0, "step": 3062 }, { "entropy": 0.6149885505437851, "epoch": 2.857676154923005, "grad_norm": 0.2783588469028473, "learning_rate": 0.0002, "loss": 0.6201, "mean_token_accuracy": 0.7519170939922333, "num_tokens": 11093488.0, "step": 3063 }, { "entropy": 0.5903946310281754, "epoch": 2.858609426038264, "grad_norm": 0.33340874314308167, "learning_rate": 0.0002, "loss": 0.5802, "mean_token_accuracy": 0.7636396735906601, "num_tokens": 11097138.0, "step": 3064 }, { "entropy": 0.632124125957489, "epoch": 2.859542697153523, "grad_norm": 0.25024130940437317, "learning_rate": 0.0002, "loss": 0.6222, "mean_token_accuracy": 0.753422275185585, "num_tokens": 11100777.0, "step": 3065 }, { "entropy": 0.6177300214767456, "epoch": 2.860475968268782, "grad_norm": 0.3089832663536072, "learning_rate": 0.0002, "loss": 0.6233, "mean_token_accuracy": 0.7522821724414825, "num_tokens": 11104381.0, "step": 3066 }, { "entropy": 0.610651358962059, "epoch": 2.861409239384041, "grad_norm": 0.31180068850517273, "learning_rate": 0.0002, "loss": 0.6136, "mean_token_accuracy": 0.7558139711618423, "num_tokens": 11107985.0, "step": 3067 }, { "entropy": 0.5891657620668411, "epoch": 2.8623425104993, "grad_norm": 0.3330582082271576, "learning_rate": 0.0002, "loss": 0.5825, "mean_token_accuracy": 0.7685088515281677, "num_tokens": 11111626.0, "step": 3068 }, { "entropy": 0.5968683809041977, "epoch": 2.863275781614559, "grad_norm": 0.3410806953907013, "learning_rate": 0.0002, "loss": 0.6154, "mean_token_accuracy": 0.747265487909317, "num_tokens": 11115154.0, "step": 3069 }, { "entropy": 0.5873818397521973, "epoch": 2.864209052729818, "grad_norm": 0.29796937108039856, "learning_rate": 0.0002, "loss": 0.5963, "mean_token_accuracy": 0.7552398443222046, "num_tokens": 11118763.0, "step": 3070 }, { "entropy": 0.6264586746692657, "epoch": 2.865142323845077, "grad_norm": 0.377530962228775, "learning_rate": 0.0002, "loss": 0.6221, "mean_token_accuracy": 0.7487123906612396, "num_tokens": 11122370.0, "step": 3071 }, { "entropy": 0.5891697853803635, "epoch": 2.866075594960336, "grad_norm": 0.3186618983745575, "learning_rate": 0.0002, "loss": 0.591, "mean_token_accuracy": 0.7606510668992996, "num_tokens": 11125996.0, "step": 3072 }, { "entropy": 0.5800516307353973, "epoch": 2.867008866075595, "grad_norm": 0.2629801332950592, "learning_rate": 0.0002, "loss": 0.5859, "mean_token_accuracy": 0.7602849751710892, "num_tokens": 11129589.0, "step": 3073 }, { "entropy": 0.5803913921117783, "epoch": 2.867942137190854, "grad_norm": 0.2828027009963989, "learning_rate": 0.0002, "loss": 0.5726, "mean_token_accuracy": 0.7688085436820984, "num_tokens": 11133331.0, "step": 3074 }, { "entropy": 0.5919537097215652, "epoch": 2.868875408306113, "grad_norm": 0.29938146471977234, "learning_rate": 0.0002, "loss": 0.5978, "mean_token_accuracy": 0.7582693994045258, "num_tokens": 11136938.0, "step": 3075 }, { "entropy": 0.6007039844989777, "epoch": 2.869808679421372, "grad_norm": 0.2924206852912903, "learning_rate": 0.0002, "loss": 0.5958, "mean_token_accuracy": 0.7554949074983597, "num_tokens": 11140647.0, "step": 3076 }, { "entropy": 0.6072003841400146, "epoch": 2.870741950536631, "grad_norm": 0.2739294469356537, "learning_rate": 0.0002, "loss": 0.6106, "mean_token_accuracy": 0.7509594708681107, "num_tokens": 11144247.0, "step": 3077 }, { "entropy": 0.5964792817831039, "epoch": 2.87167522165189, "grad_norm": 0.2938867509365082, "learning_rate": 0.0002, "loss": 0.6101, "mean_token_accuracy": 0.7543106824159622, "num_tokens": 11147782.0, "step": 3078 }, { "entropy": 0.6118725687265396, "epoch": 2.872608492767149, "grad_norm": 0.2957371175289154, "learning_rate": 0.0002, "loss": 0.625, "mean_token_accuracy": 0.7469840198755264, "num_tokens": 11151417.0, "step": 3079 }, { "entropy": 0.5998136103153229, "epoch": 2.873541763882408, "grad_norm": 0.2700604498386383, "learning_rate": 0.0002, "loss": 0.6071, "mean_token_accuracy": 0.7571037113666534, "num_tokens": 11155020.0, "step": 3080 }, { "entropy": 0.6030322015285492, "epoch": 2.874475034997667, "grad_norm": 0.34245166182518005, "learning_rate": 0.0002, "loss": 0.6122, "mean_token_accuracy": 0.7552971094846725, "num_tokens": 11158638.0, "step": 3081 }, { "entropy": 0.609783336520195, "epoch": 2.875408306112926, "grad_norm": 0.2322557121515274, "learning_rate": 0.0002, "loss": 0.5988, "mean_token_accuracy": 0.7585130780935287, "num_tokens": 11162371.0, "step": 3082 }, { "entropy": 0.6090105026960373, "epoch": 2.876341577228185, "grad_norm": 0.3073332607746124, "learning_rate": 0.0002, "loss": 0.6133, "mean_token_accuracy": 0.7507927417755127, "num_tokens": 11166014.0, "step": 3083 }, { "entropy": 0.5677677541971207, "epoch": 2.877274848343444, "grad_norm": 0.2911318838596344, "learning_rate": 0.0002, "loss": 0.5683, "mean_token_accuracy": 0.7684277147054672, "num_tokens": 11169531.0, "step": 3084 }, { "entropy": 0.5928406268358231, "epoch": 2.878208119458703, "grad_norm": 0.345211386680603, "learning_rate": 0.0002, "loss": 0.5966, "mean_token_accuracy": 0.753378301858902, "num_tokens": 11173148.0, "step": 3085 }, { "entropy": 0.5887275040149689, "epoch": 2.879141390573962, "grad_norm": 0.3250561058521271, "learning_rate": 0.0002, "loss": 0.5912, "mean_token_accuracy": 0.7640737593173981, "num_tokens": 11176697.0, "step": 3086 }, { "entropy": 0.5772612988948822, "epoch": 2.8800746616892208, "grad_norm": 0.33793532848358154, "learning_rate": 0.0002, "loss": 0.594, "mean_token_accuracy": 0.7613471746444702, "num_tokens": 11180243.0, "step": 3087 }, { "entropy": 0.5643090009689331, "epoch": 2.8810079328044798, "grad_norm": 0.29627668857574463, "learning_rate": 0.0002, "loss": 0.5745, "mean_token_accuracy": 0.763971671462059, "num_tokens": 11183883.0, "step": 3088 }, { "entropy": 0.612529531121254, "epoch": 2.8819412039197387, "grad_norm": 0.3398737907409668, "learning_rate": 0.0002, "loss": 0.6116, "mean_token_accuracy": 0.7528883367776871, "num_tokens": 11187549.0, "step": 3089 }, { "entropy": 0.5779610872268677, "epoch": 2.8828744750349977, "grad_norm": 0.26051074266433716, "learning_rate": 0.0002, "loss": 0.565, "mean_token_accuracy": 0.766671285033226, "num_tokens": 11191086.0, "step": 3090 }, { "entropy": 0.5684132874011993, "epoch": 2.8838077461502567, "grad_norm": 0.32148799300193787, "learning_rate": 0.0002, "loss": 0.5695, "mean_token_accuracy": 0.7696121633052826, "num_tokens": 11194588.0, "step": 3091 }, { "entropy": 0.5868300944566727, "epoch": 2.8847410172655157, "grad_norm": 0.26020166277885437, "learning_rate": 0.0002, "loss": 0.5823, "mean_token_accuracy": 0.7617156058549881, "num_tokens": 11198265.0, "step": 3092 }, { "entropy": 0.5812435299158096, "epoch": 2.8856742883807747, "grad_norm": 0.2609700858592987, "learning_rate": 0.0002, "loss": 0.5888, "mean_token_accuracy": 0.7642641514539719, "num_tokens": 11201944.0, "step": 3093 }, { "entropy": 0.6171357482671738, "epoch": 2.8866075594960336, "grad_norm": 0.26448482275009155, "learning_rate": 0.0002, "loss": 0.6328, "mean_token_accuracy": 0.7483329027891159, "num_tokens": 11205624.0, "step": 3094 }, { "entropy": 0.5836154222488403, "epoch": 2.8875408306112926, "grad_norm": 0.4766055941581726, "learning_rate": 0.0002, "loss": 0.6209, "mean_token_accuracy": 0.7553527504205704, "num_tokens": 11209136.0, "step": 3095 }, { "entropy": 0.6088007539510727, "epoch": 2.8884741017265516, "grad_norm": 0.24405017495155334, "learning_rate": 0.0002, "loss": 0.6028, "mean_token_accuracy": 0.7567862868309021, "num_tokens": 11212908.0, "step": 3096 }, { "entropy": 0.6156401038169861, "epoch": 2.8894073728418106, "grad_norm": 0.2428264617919922, "learning_rate": 0.0002, "loss": 0.6121, "mean_token_accuracy": 0.7496951818466187, "num_tokens": 11216553.0, "step": 3097 }, { "entropy": 0.6248343884944916, "epoch": 2.8903406439570696, "grad_norm": 0.36595460772514343, "learning_rate": 0.0002, "loss": 0.6304, "mean_token_accuracy": 0.7452883869409561, "num_tokens": 11220025.0, "step": 3098 }, { "entropy": 0.6150981783866882, "epoch": 2.8912739150723286, "grad_norm": 0.31891772150993347, "learning_rate": 0.0002, "loss": 0.6095, "mean_token_accuracy": 0.7556407451629639, "num_tokens": 11223644.0, "step": 3099 }, { "entropy": 0.620049774646759, "epoch": 2.8922071861875875, "grad_norm": 0.2644253373146057, "learning_rate": 0.0002, "loss": 0.6144, "mean_token_accuracy": 0.747463122010231, "num_tokens": 11227332.0, "step": 3100 }, { "entropy": 0.566635474562645, "epoch": 2.8931404573028465, "grad_norm": 0.23025617003440857, "learning_rate": 0.0002, "loss": 0.556, "mean_token_accuracy": 0.7752638161182404, "num_tokens": 11231024.0, "step": 3101 }, { "entropy": 0.5321856886148453, "epoch": 2.8940737284181055, "grad_norm": 0.30736881494522095, "learning_rate": 0.0002, "loss": 0.5361, "mean_token_accuracy": 0.7844803780317307, "num_tokens": 11234509.0, "step": 3102 }, { "entropy": 0.5960945188999176, "epoch": 2.8950069995333645, "grad_norm": 0.31102320551872253, "learning_rate": 0.0002, "loss": 0.6124, "mean_token_accuracy": 0.7554732412099838, "num_tokens": 11238127.0, "step": 3103 }, { "entropy": 0.5995414704084396, "epoch": 2.8959402706486235, "grad_norm": 0.29775604605674744, "learning_rate": 0.0002, "loss": 0.6014, "mean_token_accuracy": 0.7529190480709076, "num_tokens": 11241806.0, "step": 3104 }, { "entropy": 0.6027424186468124, "epoch": 2.8968735417638825, "grad_norm": 0.29638543725013733, "learning_rate": 0.0002, "loss": 0.6191, "mean_token_accuracy": 0.7500474750995636, "num_tokens": 11245461.0, "step": 3105 }, { "entropy": 0.5535172745585442, "epoch": 2.8978068128791414, "grad_norm": 0.2661828100681305, "learning_rate": 0.0002, "loss": 0.5597, "mean_token_accuracy": 0.777563214302063, "num_tokens": 11249001.0, "step": 3106 }, { "entropy": 0.5932348519563675, "epoch": 2.8987400839944004, "grad_norm": 0.2910860776901245, "learning_rate": 0.0002, "loss": 0.5919, "mean_token_accuracy": 0.7593528032302856, "num_tokens": 11252682.0, "step": 3107 }, { "entropy": 0.5730276703834534, "epoch": 2.8996733551096594, "grad_norm": 0.3160713016986847, "learning_rate": 0.0002, "loss": 0.5796, "mean_token_accuracy": 0.7656454741954803, "num_tokens": 11256302.0, "step": 3108 }, { "entropy": 0.5974744260311127, "epoch": 2.9006066262249184, "grad_norm": 0.3023775517940521, "learning_rate": 0.0002, "loss": 0.6077, "mean_token_accuracy": 0.7512742727994919, "num_tokens": 11259811.0, "step": 3109 }, { "entropy": 0.5957473963499069, "epoch": 2.9015398973401774, "grad_norm": 0.26540854573249817, "learning_rate": 0.0002, "loss": 0.6016, "mean_token_accuracy": 0.7561625391244888, "num_tokens": 11263411.0, "step": 3110 }, { "entropy": 0.5723201930522919, "epoch": 2.9024731684554363, "grad_norm": 0.2618273198604584, "learning_rate": 0.0002, "loss": 0.5805, "mean_token_accuracy": 0.763622909784317, "num_tokens": 11266958.0, "step": 3111 }, { "entropy": 0.5970823615789413, "epoch": 2.9034064395706953, "grad_norm": 0.2689523696899414, "learning_rate": 0.0002, "loss": 0.5974, "mean_token_accuracy": 0.7589943408966064, "num_tokens": 11270479.0, "step": 3112 }, { "entropy": 0.5965802818536758, "epoch": 2.9043397106859543, "grad_norm": 0.27808839082717896, "learning_rate": 0.0002, "loss": 0.5934, "mean_token_accuracy": 0.7569893300533295, "num_tokens": 11273995.0, "step": 3113 }, { "entropy": 0.6079405248165131, "epoch": 2.9052729818012133, "grad_norm": 0.29429492354393005, "learning_rate": 0.0002, "loss": 0.6043, "mean_token_accuracy": 0.7530756592750549, "num_tokens": 11277630.0, "step": 3114 }, { "entropy": 0.5914899408817291, "epoch": 2.9062062529164723, "grad_norm": 0.33129996061325073, "learning_rate": 0.0002, "loss": 0.6013, "mean_token_accuracy": 0.7554599046707153, "num_tokens": 11281366.0, "step": 3115 }, { "entropy": 0.59227254986763, "epoch": 2.9071395240317313, "grad_norm": 0.2918965220451355, "learning_rate": 0.0002, "loss": 0.5957, "mean_token_accuracy": 0.7572817355394363, "num_tokens": 11285012.0, "step": 3116 }, { "entropy": 0.6294339001178741, "epoch": 2.9080727951469902, "grad_norm": 0.2581447660923004, "learning_rate": 0.0002, "loss": 0.6261, "mean_token_accuracy": 0.749846026301384, "num_tokens": 11288677.0, "step": 3117 }, { "entropy": 0.6171252280473709, "epoch": 2.9090060662622492, "grad_norm": 0.3953474760055542, "learning_rate": 0.0002, "loss": 0.6416, "mean_token_accuracy": 0.7445995509624481, "num_tokens": 11292366.0, "step": 3118 }, { "entropy": 0.5634105652570724, "epoch": 2.909939337377508, "grad_norm": 0.3078080415725708, "learning_rate": 0.0002, "loss": 0.5655, "mean_token_accuracy": 0.7741966247558594, "num_tokens": 11295914.0, "step": 3119 }, { "entropy": 0.5908727794885635, "epoch": 2.910872608492767, "grad_norm": 0.3318139910697937, "learning_rate": 0.0002, "loss": 0.5945, "mean_token_accuracy": 0.7629107385873795, "num_tokens": 11299609.0, "step": 3120 }, { "entropy": 0.5893458425998688, "epoch": 2.911805879608026, "grad_norm": 0.3178554177284241, "learning_rate": 0.0002, "loss": 0.6205, "mean_token_accuracy": 0.7471740692853928, "num_tokens": 11303160.0, "step": 3121 }, { "entropy": 0.5886469483375549, "epoch": 2.912739150723285, "grad_norm": 0.3169299066066742, "learning_rate": 0.0002, "loss": 0.6013, "mean_token_accuracy": 0.7522707730531693, "num_tokens": 11306860.0, "step": 3122 }, { "entropy": 0.6067028492689133, "epoch": 2.913672421838544, "grad_norm": 0.271229088306427, "learning_rate": 0.0002, "loss": 0.6039, "mean_token_accuracy": 0.7595750987529755, "num_tokens": 11310480.0, "step": 3123 }, { "entropy": 0.5784197002649307, "epoch": 2.914605692953803, "grad_norm": 0.2585243284702301, "learning_rate": 0.0002, "loss": 0.5727, "mean_token_accuracy": 0.7666608691215515, "num_tokens": 11314119.0, "step": 3124 }, { "entropy": 0.5893753319978714, "epoch": 2.915538964069062, "grad_norm": 0.2231559008359909, "learning_rate": 0.0002, "loss": 0.5832, "mean_token_accuracy": 0.7616947740316391, "num_tokens": 11317742.0, "step": 3125 }, { "entropy": 0.6486422568559647, "epoch": 2.916472235184321, "grad_norm": 0.262136846780777, "learning_rate": 0.0002, "loss": 0.6357, "mean_token_accuracy": 0.7398717850446701, "num_tokens": 11321296.0, "step": 3126 }, { "entropy": 0.6051342189311981, "epoch": 2.91740550629958, "grad_norm": 0.27932292222976685, "learning_rate": 0.0002, "loss": 0.605, "mean_token_accuracy": 0.760748565196991, "num_tokens": 11324956.0, "step": 3127 }, { "entropy": 0.5512849539518356, "epoch": 2.918338777414839, "grad_norm": 0.28285104036331177, "learning_rate": 0.0002, "loss": 0.5527, "mean_token_accuracy": 0.7773767709732056, "num_tokens": 11328592.0, "step": 3128 }, { "entropy": 0.5986577570438385, "epoch": 2.919272048530098, "grad_norm": 0.2638506591320038, "learning_rate": 0.0002, "loss": 0.5893, "mean_token_accuracy": 0.7582873851060867, "num_tokens": 11332208.0, "step": 3129 }, { "entropy": 0.5725806653499603, "epoch": 2.920205319645357, "grad_norm": 0.26647841930389404, "learning_rate": 0.0002, "loss": 0.5718, "mean_token_accuracy": 0.7650887370109558, "num_tokens": 11336003.0, "step": 3130 }, { "entropy": 0.5972597450017929, "epoch": 2.921138590760616, "grad_norm": 0.3524705767631531, "learning_rate": 0.0002, "loss": 0.6282, "mean_token_accuracy": 0.7494337409734726, "num_tokens": 11339673.0, "step": 3131 }, { "entropy": 0.5752980783581734, "epoch": 2.922071861875875, "grad_norm": 0.2818494737148285, "learning_rate": 0.0002, "loss": 0.5792, "mean_token_accuracy": 0.7663270682096481, "num_tokens": 11343306.0, "step": 3132 }, { "entropy": 0.5748731046915054, "epoch": 2.923005132991134, "grad_norm": 0.283351868391037, "learning_rate": 0.0002, "loss": 0.5865, "mean_token_accuracy": 0.7581116408109665, "num_tokens": 11346964.0, "step": 3133 }, { "entropy": 0.5403507351875305, "epoch": 2.923938404106393, "grad_norm": 0.3090425729751587, "learning_rate": 0.0002, "loss": 0.559, "mean_token_accuracy": 0.7735445946455002, "num_tokens": 11350526.0, "step": 3134 }, { "entropy": 0.5851901322603226, "epoch": 2.924871675221652, "grad_norm": 0.32583755254745483, "learning_rate": 0.0002, "loss": 0.591, "mean_token_accuracy": 0.7642992436885834, "num_tokens": 11354046.0, "step": 3135 }, { "entropy": 0.5998115986585617, "epoch": 2.925804946336911, "grad_norm": 0.2528308033943176, "learning_rate": 0.0002, "loss": 0.5961, "mean_token_accuracy": 0.7610236555337906, "num_tokens": 11357717.0, "step": 3136 }, { "entropy": 0.6092266589403152, "epoch": 2.92673821745217, "grad_norm": 0.34722113609313965, "learning_rate": 0.0002, "loss": 0.6169, "mean_token_accuracy": 0.754142165184021, "num_tokens": 11361270.0, "step": 3137 }, { "entropy": 0.5999947190284729, "epoch": 2.927671488567429, "grad_norm": 0.279729962348938, "learning_rate": 0.0002, "loss": 0.5987, "mean_token_accuracy": 0.7617057114839554, "num_tokens": 11365004.0, "step": 3138 }, { "entropy": 0.5495341643691063, "epoch": 2.928604759682688, "grad_norm": 0.25759223103523254, "learning_rate": 0.0002, "loss": 0.5561, "mean_token_accuracy": 0.7718123197555542, "num_tokens": 11368673.0, "step": 3139 }, { "entropy": 0.5977368801832199, "epoch": 2.929538030797947, "grad_norm": 0.32109174132347107, "learning_rate": 0.0002, "loss": 0.6063, "mean_token_accuracy": 0.754258468747139, "num_tokens": 11372225.0, "step": 3140 }, { "entropy": 0.5795502215623856, "epoch": 2.930471301913206, "grad_norm": 0.2743336260318756, "learning_rate": 0.0002, "loss": 0.588, "mean_token_accuracy": 0.7642609775066376, "num_tokens": 11375768.0, "step": 3141 }, { "entropy": 0.5854646116495132, "epoch": 2.931404573028465, "grad_norm": 0.29528355598449707, "learning_rate": 0.0002, "loss": 0.5853, "mean_token_accuracy": 0.7625527083873749, "num_tokens": 11379545.0, "step": 3142 }, { "entropy": 0.6046826839447021, "epoch": 2.932337844143724, "grad_norm": 0.27066895365715027, "learning_rate": 0.0002, "loss": 0.5973, "mean_token_accuracy": 0.7549967169761658, "num_tokens": 11383162.0, "step": 3143 }, { "entropy": 0.6226438134908676, "epoch": 2.9332711152589828, "grad_norm": 0.24129630625247955, "learning_rate": 0.0002, "loss": 0.5937, "mean_token_accuracy": 0.7618516534566879, "num_tokens": 11386940.0, "step": 3144 }, { "entropy": 0.5902719348669052, "epoch": 2.9342043863742417, "grad_norm": 0.29643768072128296, "learning_rate": 0.0002, "loss": 0.5865, "mean_token_accuracy": 0.7579989433288574, "num_tokens": 11390477.0, "step": 3145 }, { "entropy": 0.5872322767972946, "epoch": 2.9351376574895007, "grad_norm": 0.3153306543827057, "learning_rate": 0.0002, "loss": 0.5917, "mean_token_accuracy": 0.7618313133716583, "num_tokens": 11394127.0, "step": 3146 }, { "entropy": 0.5785763263702393, "epoch": 2.9360709286047597, "grad_norm": 0.31968897581100464, "learning_rate": 0.0002, "loss": 0.5959, "mean_token_accuracy": 0.7563066929578781, "num_tokens": 11397787.0, "step": 3147 }, { "entropy": 0.5967634171247482, "epoch": 2.9370041997200187, "grad_norm": 0.2854063808917999, "learning_rate": 0.0002, "loss": 0.6179, "mean_token_accuracy": 0.7480222582817078, "num_tokens": 11401241.0, "step": 3148 }, { "entropy": 0.6280641555786133, "epoch": 2.9379374708352777, "grad_norm": 0.3214808404445648, "learning_rate": 0.0002, "loss": 0.6394, "mean_token_accuracy": 0.7404205352067947, "num_tokens": 11404793.0, "step": 3149 }, { "entropy": 0.5797632485628128, "epoch": 2.9388707419505367, "grad_norm": 0.2740463614463806, "learning_rate": 0.0002, "loss": 0.5856, "mean_token_accuracy": 0.7595625668764114, "num_tokens": 11408517.0, "step": 3150 }, { "entropy": 0.5944087505340576, "epoch": 2.9398040130657956, "grad_norm": 0.2670361399650574, "learning_rate": 0.0002, "loss": 0.6068, "mean_token_accuracy": 0.7541628777980804, "num_tokens": 11412061.0, "step": 3151 }, { "entropy": 0.5861486792564392, "epoch": 2.9407372841810546, "grad_norm": 0.24845556914806366, "learning_rate": 0.0002, "loss": 0.5901, "mean_token_accuracy": 0.7635263353586197, "num_tokens": 11415703.0, "step": 3152 }, { "entropy": 0.5896187126636505, "epoch": 2.9416705552963136, "grad_norm": 0.2544192671775818, "learning_rate": 0.0002, "loss": 0.594, "mean_token_accuracy": 0.7605195045471191, "num_tokens": 11419401.0, "step": 3153 }, { "entropy": 0.5699638277292252, "epoch": 2.9426038264115726, "grad_norm": 0.27563896775245667, "learning_rate": 0.0002, "loss": 0.5678, "mean_token_accuracy": 0.7727461159229279, "num_tokens": 11423008.0, "step": 3154 }, { "entropy": 0.5890413373708725, "epoch": 2.9435370975268316, "grad_norm": 0.3077503442764282, "learning_rate": 0.0002, "loss": 0.5996, "mean_token_accuracy": 0.7543983906507492, "num_tokens": 11426662.0, "step": 3155 }, { "entropy": 0.5872341096401215, "epoch": 2.9444703686420906, "grad_norm": 0.2681085765361786, "learning_rate": 0.0002, "loss": 0.586, "mean_token_accuracy": 0.7676787376403809, "num_tokens": 11430344.0, "step": 3156 }, { "entropy": 0.6076451539993286, "epoch": 2.9454036397573495, "grad_norm": 0.313919335603714, "learning_rate": 0.0002, "loss": 0.613, "mean_token_accuracy": 0.7505831569433212, "num_tokens": 11433912.0, "step": 3157 }, { "entropy": 0.5850380063056946, "epoch": 2.9463369108726085, "grad_norm": 0.296995609998703, "learning_rate": 0.0002, "loss": 0.5943, "mean_token_accuracy": 0.7572672367095947, "num_tokens": 11437367.0, "step": 3158 }, { "entropy": 0.6296968460083008, "epoch": 2.9472701819878675, "grad_norm": 0.27755481004714966, "learning_rate": 0.0002, "loss": 0.6223, "mean_token_accuracy": 0.7475418001413345, "num_tokens": 11441083.0, "step": 3159 }, { "entropy": 0.603736400604248, "epoch": 2.9482034531031265, "grad_norm": 0.29653188586235046, "learning_rate": 0.0002, "loss": 0.5973, "mean_token_accuracy": 0.7569259405136108, "num_tokens": 11444822.0, "step": 3160 }, { "entropy": 0.5899407863616943, "epoch": 2.9491367242183855, "grad_norm": 0.2855703830718994, "learning_rate": 0.0002, "loss": 0.588, "mean_token_accuracy": 0.7611601054668427, "num_tokens": 11448534.0, "step": 3161 }, { "entropy": 0.5808372050523758, "epoch": 2.9500699953336444, "grad_norm": 0.3030730187892914, "learning_rate": 0.0002, "loss": 0.5875, "mean_token_accuracy": 0.7644793540239334, "num_tokens": 11452237.0, "step": 3162 }, { "entropy": 0.5696911960840225, "epoch": 2.9510032664489034, "grad_norm": 0.35269129276275635, "learning_rate": 0.0002, "loss": 0.5969, "mean_token_accuracy": 0.7609989047050476, "num_tokens": 11455768.0, "step": 3163 }, { "entropy": 0.5772757530212402, "epoch": 2.9519365375641624, "grad_norm": 0.2900140881538391, "learning_rate": 0.0002, "loss": 0.5864, "mean_token_accuracy": 0.7662184983491898, "num_tokens": 11459428.0, "step": 3164 }, { "entropy": 0.6002923399209976, "epoch": 2.9528698086794214, "grad_norm": 0.2703345715999603, "learning_rate": 0.0002, "loss": 0.5963, "mean_token_accuracy": 0.7531653195619583, "num_tokens": 11463188.0, "step": 3165 }, { "entropy": 0.6179348230361938, "epoch": 2.9538030797946804, "grad_norm": 0.25778988003730774, "learning_rate": 0.0002, "loss": 0.6081, "mean_token_accuracy": 0.7512529790401459, "num_tokens": 11466950.0, "step": 3166 }, { "entropy": 0.6260432153940201, "epoch": 2.9547363509099394, "grad_norm": 0.28053849935531616, "learning_rate": 0.0002, "loss": 0.6228, "mean_token_accuracy": 0.747724860906601, "num_tokens": 11470770.0, "step": 3167 }, { "entropy": 0.618918314576149, "epoch": 2.9556696220251983, "grad_norm": 0.28162455558776855, "learning_rate": 0.0002, "loss": 0.6209, "mean_token_accuracy": 0.7475159466266632, "num_tokens": 11474387.0, "step": 3168 }, { "entropy": 0.6150233000516891, "epoch": 2.9566028931404573, "grad_norm": 0.2799813747406006, "learning_rate": 0.0002, "loss": 0.6101, "mean_token_accuracy": 0.7539042830467224, "num_tokens": 11478020.0, "step": 3169 }, { "entropy": 0.6075970083475113, "epoch": 2.9575361642557163, "grad_norm": 0.29789936542510986, "learning_rate": 0.0002, "loss": 0.6069, "mean_token_accuracy": 0.7594657242298126, "num_tokens": 11481734.0, "step": 3170 }, { "entropy": 0.569659873843193, "epoch": 2.9584694353709753, "grad_norm": 0.2742854058742523, "learning_rate": 0.0002, "loss": 0.5733, "mean_token_accuracy": 0.7681124061346054, "num_tokens": 11485449.0, "step": 3171 }, { "entropy": 0.593114897608757, "epoch": 2.9594027064862343, "grad_norm": 0.3341306746006012, "learning_rate": 0.0002, "loss": 0.6005, "mean_token_accuracy": 0.7557294964790344, "num_tokens": 11489137.0, "step": 3172 }, { "entropy": 0.5755763649940491, "epoch": 2.9603359776014933, "grad_norm": 0.34675225615501404, "learning_rate": 0.0002, "loss": 0.5915, "mean_token_accuracy": 0.7606999576091766, "num_tokens": 11492891.0, "step": 3173 }, { "entropy": 0.6137986183166504, "epoch": 2.9612692487167522, "grad_norm": 0.2861494719982147, "learning_rate": 0.0002, "loss": 0.6259, "mean_token_accuracy": 0.7500059306621552, "num_tokens": 11496564.0, "step": 3174 }, { "entropy": 0.5691146850585938, "epoch": 2.962202519832011, "grad_norm": 0.2946808338165283, "learning_rate": 0.0002, "loss": 0.5768, "mean_token_accuracy": 0.7696904540061951, "num_tokens": 11500258.0, "step": 3175 }, { "entropy": 0.5819707065820694, "epoch": 2.96313579094727, "grad_norm": 0.3918161690235138, "learning_rate": 0.0002, "loss": 0.5963, "mean_token_accuracy": 0.7612521052360535, "num_tokens": 11503911.0, "step": 3176 }, { "entropy": 0.573918342590332, "epoch": 2.964069062062529, "grad_norm": 0.31533899903297424, "learning_rate": 0.0002, "loss": 0.583, "mean_token_accuracy": 0.7546318918466568, "num_tokens": 11507506.0, "step": 3177 }, { "entropy": 0.6281457841396332, "epoch": 2.965002333177788, "grad_norm": 0.2856009602546692, "learning_rate": 0.0002, "loss": 0.6364, "mean_token_accuracy": 0.7415702491998672, "num_tokens": 11511043.0, "step": 3178 }, { "entropy": 0.5915785580873489, "epoch": 2.965935604293047, "grad_norm": 0.2858209013938904, "learning_rate": 0.0002, "loss": 0.5882, "mean_token_accuracy": 0.7555264830589294, "num_tokens": 11514723.0, "step": 3179 }, { "entropy": 0.6385788917541504, "epoch": 2.966868875408306, "grad_norm": 0.38346031308174133, "learning_rate": 0.0002, "loss": 0.6376, "mean_token_accuracy": 0.7439766526222229, "num_tokens": 11518332.0, "step": 3180 }, { "entropy": 0.6332580745220184, "epoch": 2.967802146523565, "grad_norm": 0.3222842514514923, "learning_rate": 0.0002, "loss": 0.6379, "mean_token_accuracy": 0.7417911738157272, "num_tokens": 11521984.0, "step": 3181 }, { "entropy": 0.5363463014364243, "epoch": 2.968735417638824, "grad_norm": 0.22501178085803986, "learning_rate": 0.0002, "loss": 0.5322, "mean_token_accuracy": 0.7824167162179947, "num_tokens": 11525593.0, "step": 3182 }, { "entropy": 0.5925339013338089, "epoch": 2.969668688754083, "grad_norm": 0.6059805750846863, "learning_rate": 0.0002, "loss": 0.6018, "mean_token_accuracy": 0.7565067261457443, "num_tokens": 11529184.0, "step": 3183 }, { "entropy": 0.6138273179531097, "epoch": 2.970601959869342, "grad_norm": 0.36484822630882263, "learning_rate": 0.0002, "loss": 0.6166, "mean_token_accuracy": 0.7521388977766037, "num_tokens": 11532764.0, "step": 3184 }, { "entropy": 0.6071785390377045, "epoch": 2.971535230984601, "grad_norm": 0.3101447820663452, "learning_rate": 0.0002, "loss": 0.6159, "mean_token_accuracy": 0.7549857050180435, "num_tokens": 11536353.0, "step": 3185 }, { "entropy": 0.5552823543548584, "epoch": 2.97246850209986, "grad_norm": 0.4875289797782898, "learning_rate": 0.0002, "loss": 0.5633, "mean_token_accuracy": 0.7711167931556702, "num_tokens": 11539952.0, "step": 3186 }, { "entropy": 0.5853101909160614, "epoch": 2.973401773215119, "grad_norm": 0.3224151134490967, "learning_rate": 0.0002, "loss": 0.5943, "mean_token_accuracy": 0.7549132704734802, "num_tokens": 11543525.0, "step": 3187 }, { "entropy": 0.6031962335109711, "epoch": 2.974335044330378, "grad_norm": 0.3717763423919678, "learning_rate": 0.0002, "loss": 0.6033, "mean_token_accuracy": 0.7564367353916168, "num_tokens": 11547172.0, "step": 3188 }, { "entropy": 0.62708380818367, "epoch": 2.975268315445637, "grad_norm": 0.29766905307769775, "learning_rate": 0.0002, "loss": 0.6298, "mean_token_accuracy": 0.7381460815668106, "num_tokens": 11550861.0, "step": 3189 }, { "entropy": 0.5578789710998535, "epoch": 2.976201586560896, "grad_norm": 0.26360777020454407, "learning_rate": 0.0002, "loss": 0.5605, "mean_token_accuracy": 0.7690404951572418, "num_tokens": 11554572.0, "step": 3190 }, { "entropy": 0.5930578857660294, "epoch": 2.977134857676155, "grad_norm": 0.37247276306152344, "learning_rate": 0.0002, "loss": 0.6052, "mean_token_accuracy": 0.75057552754879, "num_tokens": 11558033.0, "step": 3191 }, { "entropy": 0.6046136021614075, "epoch": 2.978068128791414, "grad_norm": 0.31877219676971436, "learning_rate": 0.0002, "loss": 0.6068, "mean_token_accuracy": 0.7505595088005066, "num_tokens": 11561683.0, "step": 3192 }, { "entropy": 0.6361041069030762, "epoch": 2.979001399906673, "grad_norm": 0.3297255039215088, "learning_rate": 0.0002, "loss": 0.6361, "mean_token_accuracy": 0.7404361814260483, "num_tokens": 11565251.0, "step": 3193 }, { "entropy": 0.5609379410743713, "epoch": 2.979934671021932, "grad_norm": 0.26834970712661743, "learning_rate": 0.0002, "loss": 0.5606, "mean_token_accuracy": 0.7717452645301819, "num_tokens": 11568764.0, "step": 3194 }, { "entropy": 0.5965733826160431, "epoch": 2.980867942137191, "grad_norm": 0.28759047389030457, "learning_rate": 0.0002, "loss": 0.5948, "mean_token_accuracy": 0.7615046501159668, "num_tokens": 11572386.0, "step": 3195 }, { "entropy": 0.6090290397405624, "epoch": 2.98180121325245, "grad_norm": 0.36456361413002014, "learning_rate": 0.0002, "loss": 0.6131, "mean_token_accuracy": 0.7544472813606262, "num_tokens": 11576102.0, "step": 3196 }, { "entropy": 0.5905314385890961, "epoch": 2.982734484367709, "grad_norm": 0.32524415850639343, "learning_rate": 0.0002, "loss": 0.5846, "mean_token_accuracy": 0.7612465769052505, "num_tokens": 11579586.0, "step": 3197 }, { "entropy": 0.5935803204774857, "epoch": 2.983667755482968, "grad_norm": 0.34983181953430176, "learning_rate": 0.0002, "loss": 0.6001, "mean_token_accuracy": 0.7598332315683365, "num_tokens": 11583207.0, "step": 3198 }, { "entropy": 0.5868535488843918, "epoch": 2.984601026598227, "grad_norm": 0.2864875793457031, "learning_rate": 0.0002, "loss": 0.5964, "mean_token_accuracy": 0.7568349391222, "num_tokens": 11586761.0, "step": 3199 }, { "entropy": 0.5939600765705109, "epoch": 2.9855342977134858, "grad_norm": 0.2880533039569855, "learning_rate": 0.0002, "loss": 0.5923, "mean_token_accuracy": 0.7639029175043106, "num_tokens": 11590434.0, "step": 3200 }, { "entropy": 0.558985061943531, "epoch": 2.9864675688287448, "grad_norm": 0.329680472612381, "learning_rate": 0.0002, "loss": 0.5886, "mean_token_accuracy": 0.766192838549614, "num_tokens": 11594086.0, "step": 3201 }, { "entropy": 0.581333190202713, "epoch": 2.9874008399440037, "grad_norm": 0.341679185628891, "learning_rate": 0.0002, "loss": 0.5954, "mean_token_accuracy": 0.7536371648311615, "num_tokens": 11597603.0, "step": 3202 }, { "entropy": 0.6173575520515442, "epoch": 2.9883341110592627, "grad_norm": 0.34349435567855835, "learning_rate": 0.0002, "loss": 0.6311, "mean_token_accuracy": 0.744461327791214, "num_tokens": 11601233.0, "step": 3203 }, { "entropy": 0.5964512079954147, "epoch": 2.9892673821745217, "grad_norm": 0.28613749146461487, "learning_rate": 0.0002, "loss": 0.5982, "mean_token_accuracy": 0.7553053498268127, "num_tokens": 11604808.0, "step": 3204 }, { "entropy": 0.5750188827514648, "epoch": 2.9902006532897807, "grad_norm": 0.3107970058917999, "learning_rate": 0.0002, "loss": 0.5688, "mean_token_accuracy": 0.7668399065732956, "num_tokens": 11608244.0, "step": 3205 }, { "entropy": 0.6383098363876343, "epoch": 2.9911339244050397, "grad_norm": 0.28356412053108215, "learning_rate": 0.0002, "loss": 0.6202, "mean_token_accuracy": 0.7536584585905075, "num_tokens": 11611975.0, "step": 3206 }, { "entropy": 0.6121927201747894, "epoch": 2.9920671955202987, "grad_norm": 0.26669979095458984, "learning_rate": 0.0002, "loss": 0.6057, "mean_token_accuracy": 0.7627904117107391, "num_tokens": 11615546.0, "step": 3207 }, { "entropy": 0.6115109026432037, "epoch": 2.9930004666355576, "grad_norm": 0.3095892667770386, "learning_rate": 0.0002, "loss": 0.6062, "mean_token_accuracy": 0.7572216093540192, "num_tokens": 11619262.0, "step": 3208 }, { "entropy": 0.5956231504678726, "epoch": 2.9939337377508166, "grad_norm": 0.26268434524536133, "learning_rate": 0.0002, "loss": 0.5888, "mean_token_accuracy": 0.7631510347127914, "num_tokens": 11622957.0, "step": 3209 }, { "entropy": 0.6042822003364563, "epoch": 2.9948670088660756, "grad_norm": 0.2788606882095337, "learning_rate": 0.0002, "loss": 0.6076, "mean_token_accuracy": 0.7500470876693726, "num_tokens": 11626633.0, "step": 3210 }, { "entropy": 0.5690905749797821, "epoch": 2.9958002799813346, "grad_norm": 0.32607007026672363, "learning_rate": 0.0002, "loss": 0.5765, "mean_token_accuracy": 0.7597797065973282, "num_tokens": 11630165.0, "step": 3211 }, { "entropy": 0.5784741044044495, "epoch": 2.9967335510965936, "grad_norm": 0.35699549317359924, "learning_rate": 0.0002, "loss": 0.5877, "mean_token_accuracy": 0.7649131715297699, "num_tokens": 11633774.0, "step": 3212 }, { "entropy": 0.5792493373155594, "epoch": 2.9976668222118525, "grad_norm": 0.3105754852294922, "learning_rate": 0.0002, "loss": 0.6025, "mean_token_accuracy": 0.7593958377838135, "num_tokens": 11637335.0, "step": 3213 }, { "entropy": 0.6175146996974945, "epoch": 2.9986000933271115, "grad_norm": 0.32855460047721863, "learning_rate": 0.0002, "loss": 0.6401, "mean_token_accuracy": 0.7427545040845871, "num_tokens": 11640973.0, "step": 3214 }, { "entropy": 0.5826921463012695, "epoch": 2.9995333644423705, "grad_norm": 0.3404068648815155, "learning_rate": 0.0002, "loss": 0.6077, "mean_token_accuracy": 0.7534625977277756, "num_tokens": 11644498.0, "step": 3215 }, { "entropy": 0.6371209025382996, "epoch": 3.0, "grad_norm": 0.40314748883247375, "learning_rate": 0.0002, "loss": 0.6281, "mean_token_accuracy": 0.7521000802516937, "num_tokens": 11645478.0, "step": 3216 }, { "entropy": 0.5453531593084335, "epoch": 3.000933271115259, "grad_norm": 0.28808748722076416, "learning_rate": 0.0002, "loss": 0.5371, "mean_token_accuracy": 0.7884436547756195, "num_tokens": 11649126.0, "step": 3217 }, { "entropy": 0.5829551517963409, "epoch": 3.001866542230518, "grad_norm": 0.27326512336730957, "learning_rate": 0.0002, "loss": 0.5771, "mean_token_accuracy": 0.7625979632139206, "num_tokens": 11652664.0, "step": 3218 }, { "entropy": 0.5925953835248947, "epoch": 3.002799813345777, "grad_norm": 0.2833689749240875, "learning_rate": 0.0002, "loss": 0.5831, "mean_token_accuracy": 0.7682607769966125, "num_tokens": 11656274.0, "step": 3219 }, { "entropy": 0.6053134948015213, "epoch": 3.003733084461036, "grad_norm": 0.2784832715988159, "learning_rate": 0.0002, "loss": 0.6016, "mean_token_accuracy": 0.7557684183120728, "num_tokens": 11659943.0, "step": 3220 }, { "entropy": 0.5776787996292114, "epoch": 3.004666355576295, "grad_norm": 0.38788679242134094, "learning_rate": 0.0002, "loss": 0.5834, "mean_token_accuracy": 0.7731528878211975, "num_tokens": 11663621.0, "step": 3221 }, { "entropy": 0.5691636949777603, "epoch": 3.005599626691554, "grad_norm": 0.33379316329956055, "learning_rate": 0.0002, "loss": 0.5732, "mean_token_accuracy": 0.7660046368837357, "num_tokens": 11667208.0, "step": 3222 }, { "entropy": 0.5843886286020279, "epoch": 3.006532897806813, "grad_norm": 0.30741944909095764, "learning_rate": 0.0002, "loss": 0.5836, "mean_token_accuracy": 0.7637011706829071, "num_tokens": 11670972.0, "step": 3223 }, { "entropy": 0.5686820298433304, "epoch": 3.007466168922072, "grad_norm": 0.3207147717475891, "learning_rate": 0.0002, "loss": 0.5723, "mean_token_accuracy": 0.7695431858301163, "num_tokens": 11674598.0, "step": 3224 }, { "entropy": 0.5770242810249329, "epoch": 3.008399440037331, "grad_norm": 0.34974348545074463, "learning_rate": 0.0002, "loss": 0.5978, "mean_token_accuracy": 0.7571841180324554, "num_tokens": 11678190.0, "step": 3225 }, { "entropy": 0.5749538838863373, "epoch": 3.00933271115259, "grad_norm": 0.27713674306869507, "learning_rate": 0.0002, "loss": 0.5694, "mean_token_accuracy": 0.774519145488739, "num_tokens": 11681851.0, "step": 3226 }, { "entropy": 0.5634574368596077, "epoch": 3.010265982267849, "grad_norm": 0.302423894405365, "learning_rate": 0.0002, "loss": 0.561, "mean_token_accuracy": 0.7777755856513977, "num_tokens": 11685521.0, "step": 3227 }, { "entropy": 0.5514305382966995, "epoch": 3.011199253383108, "grad_norm": 0.34591227769851685, "learning_rate": 0.0002, "loss": 0.556, "mean_token_accuracy": 0.7725898623466492, "num_tokens": 11689138.0, "step": 3228 }, { "entropy": 0.5734123289585114, "epoch": 3.0121325244983668, "grad_norm": 0.28703245520591736, "learning_rate": 0.0002, "loss": 0.5725, "mean_token_accuracy": 0.7733337432146072, "num_tokens": 11692818.0, "step": 3229 }, { "entropy": 0.5385692790150642, "epoch": 3.0130657956136258, "grad_norm": 0.3136560022830963, "learning_rate": 0.0002, "loss": 0.5528, "mean_token_accuracy": 0.7773199081420898, "num_tokens": 11696328.0, "step": 3230 }, { "entropy": 0.5746417045593262, "epoch": 3.0139990667288847, "grad_norm": 0.3703383803367615, "learning_rate": 0.0002, "loss": 0.5812, "mean_token_accuracy": 0.7655432820320129, "num_tokens": 11699942.0, "step": 3231 }, { "entropy": 0.5360496342182159, "epoch": 3.0149323378441437, "grad_norm": 0.2852562367916107, "learning_rate": 0.0002, "loss": 0.5379, "mean_token_accuracy": 0.7786687016487122, "num_tokens": 11703608.0, "step": 3232 }, { "entropy": 0.5993359386920929, "epoch": 3.0158656089594027, "grad_norm": 0.27799466252326965, "learning_rate": 0.0002, "loss": 0.588, "mean_token_accuracy": 0.7663296610116959, "num_tokens": 11707352.0, "step": 3233 }, { "entropy": 0.5540611743927002, "epoch": 3.0167988800746617, "grad_norm": 0.3180372416973114, "learning_rate": 0.0002, "loss": 0.5612, "mean_token_accuracy": 0.7770187854766846, "num_tokens": 11710980.0, "step": 3234 }, { "entropy": 0.540851429104805, "epoch": 3.0177321511899207, "grad_norm": 0.39450013637542725, "learning_rate": 0.0002, "loss": 0.5558, "mean_token_accuracy": 0.77458855509758, "num_tokens": 11714523.0, "step": 3235 }, { "entropy": 0.5687976479530334, "epoch": 3.0186654223051796, "grad_norm": 0.2600560784339905, "learning_rate": 0.0002, "loss": 0.5646, "mean_token_accuracy": 0.7684583067893982, "num_tokens": 11718156.0, "step": 3236 }, { "entropy": 0.5876971781253815, "epoch": 3.0195986934204386, "grad_norm": 0.31229060888290405, "learning_rate": 0.0002, "loss": 0.5919, "mean_token_accuracy": 0.7591981589794159, "num_tokens": 11721716.0, "step": 3237 }, { "entropy": 0.523548312485218, "epoch": 3.0205319645356976, "grad_norm": 0.33309081196784973, "learning_rate": 0.0002, "loss": 0.5244, "mean_token_accuracy": 0.7939031571149826, "num_tokens": 11725244.0, "step": 3238 }, { "entropy": 0.5499828532338142, "epoch": 3.0214652356509566, "grad_norm": 0.3564458191394806, "learning_rate": 0.0002, "loss": 0.559, "mean_token_accuracy": 0.7761405259370804, "num_tokens": 11728790.0, "step": 3239 }, { "entropy": 0.5716783404350281, "epoch": 3.0223985067662156, "grad_norm": 0.3710835576057434, "learning_rate": 0.0002, "loss": 0.5757, "mean_token_accuracy": 0.7679315954446793, "num_tokens": 11732395.0, "step": 3240 }, { "entropy": 0.5511093437671661, "epoch": 3.0233317778814746, "grad_norm": 0.3625626564025879, "learning_rate": 0.0002, "loss": 0.5719, "mean_token_accuracy": 0.7698272615671158, "num_tokens": 11735821.0, "step": 3241 }, { "entropy": 0.5648081004619598, "epoch": 3.0242650489967335, "grad_norm": 0.34622812271118164, "learning_rate": 0.0002, "loss": 0.5723, "mean_token_accuracy": 0.7662564367055893, "num_tokens": 11739332.0, "step": 3242 }, { "entropy": 0.5708575248718262, "epoch": 3.0251983201119925, "grad_norm": 0.35663092136383057, "learning_rate": 0.0002, "loss": 0.58, "mean_token_accuracy": 0.7615257948637009, "num_tokens": 11742905.0, "step": 3243 }, { "entropy": 0.5572772920131683, "epoch": 3.0261315912272515, "grad_norm": 0.30051368474960327, "learning_rate": 0.0002, "loss": 0.5547, "mean_token_accuracy": 0.7696043401956558, "num_tokens": 11746568.0, "step": 3244 }, { "entropy": 0.5519414991140366, "epoch": 3.0270648623425105, "grad_norm": 0.3289499878883362, "learning_rate": 0.0002, "loss": 0.5635, "mean_token_accuracy": 0.7662785053253174, "num_tokens": 11750174.0, "step": 3245 }, { "entropy": 0.5550941079854965, "epoch": 3.0279981334577695, "grad_norm": 0.33135557174682617, "learning_rate": 0.0002, "loss": 0.5563, "mean_token_accuracy": 0.7786406129598618, "num_tokens": 11753649.0, "step": 3246 }, { "entropy": 0.550404280424118, "epoch": 3.0289314045730285, "grad_norm": 0.3164787292480469, "learning_rate": 0.0002, "loss": 0.5473, "mean_token_accuracy": 0.7804099321365356, "num_tokens": 11757213.0, "step": 3247 }, { "entropy": 0.56070177257061, "epoch": 3.0298646756882874, "grad_norm": 0.30318740010261536, "learning_rate": 0.0002, "loss": 0.5615, "mean_token_accuracy": 0.7719786018133163, "num_tokens": 11760816.0, "step": 3248 }, { "entropy": 0.5611201897263527, "epoch": 3.0307979468035464, "grad_norm": 0.35784927010536194, "learning_rate": 0.0002, "loss": 0.5642, "mean_token_accuracy": 0.7732638865709305, "num_tokens": 11764360.0, "step": 3249 }, { "entropy": 0.6005063951015472, "epoch": 3.0317312179188054, "grad_norm": 0.3469320237636566, "learning_rate": 0.0002, "loss": 0.5948, "mean_token_accuracy": 0.7630321979522705, "num_tokens": 11767929.0, "step": 3250 }, { "entropy": 0.5837763547897339, "epoch": 3.0326644890340644, "grad_norm": 0.33131837844848633, "learning_rate": 0.0002, "loss": 0.5864, "mean_token_accuracy": 0.7570205330848694, "num_tokens": 11771534.0, "step": 3251 }, { "entropy": 0.587557390332222, "epoch": 3.0335977601493234, "grad_norm": 0.3498901426792145, "learning_rate": 0.0002, "loss": 0.591, "mean_token_accuracy": 0.756621241569519, "num_tokens": 11775205.0, "step": 3252 }, { "entropy": 0.5715064257383347, "epoch": 3.0345310312645823, "grad_norm": 0.3758942484855652, "learning_rate": 0.0002, "loss": 0.5864, "mean_token_accuracy": 0.7631188184022903, "num_tokens": 11778764.0, "step": 3253 }, { "entropy": 0.5859584212303162, "epoch": 3.0354643023798413, "grad_norm": 0.33297356963157654, "learning_rate": 0.0002, "loss": 0.5805, "mean_token_accuracy": 0.7623724788427353, "num_tokens": 11782500.0, "step": 3254 }, { "entropy": 0.5617497116327286, "epoch": 3.0363975734951003, "grad_norm": 0.41717690229415894, "learning_rate": 0.0002, "loss": 0.5604, "mean_token_accuracy": 0.7724545449018478, "num_tokens": 11786136.0, "step": 3255 }, { "entropy": 0.5481326133012772, "epoch": 3.0373308446103593, "grad_norm": 0.295685350894928, "learning_rate": 0.0002, "loss": 0.549, "mean_token_accuracy": 0.7785678058862686, "num_tokens": 11789798.0, "step": 3256 }, { "entropy": 0.5697991698980331, "epoch": 3.0382641157256183, "grad_norm": 0.35467907786369324, "learning_rate": 0.0002, "loss": 0.5676, "mean_token_accuracy": 0.7745958417654037, "num_tokens": 11793372.0, "step": 3257 }, { "entropy": 0.5659648776054382, "epoch": 3.0391973868408773, "grad_norm": 0.3338589668273926, "learning_rate": 0.0002, "loss": 0.5719, "mean_token_accuracy": 0.7702318578958511, "num_tokens": 11796948.0, "step": 3258 }, { "entropy": 0.5373198240995407, "epoch": 3.0401306579561362, "grad_norm": 0.40949541330337524, "learning_rate": 0.0002, "loss": 0.5389, "mean_token_accuracy": 0.7808077931404114, "num_tokens": 11800661.0, "step": 3259 }, { "entropy": 0.5581737160682678, "epoch": 3.0410639290713952, "grad_norm": 0.3567778468132019, "learning_rate": 0.0002, "loss": 0.5709, "mean_token_accuracy": 0.7715121209621429, "num_tokens": 11804180.0, "step": 3260 }, { "entropy": 0.606160506606102, "epoch": 3.041997200186654, "grad_norm": 0.3963301479816437, "learning_rate": 0.0002, "loss": 0.6054, "mean_token_accuracy": 0.752405971288681, "num_tokens": 11807837.0, "step": 3261 }, { "entropy": 0.526047520339489, "epoch": 3.042930471301913, "grad_norm": 0.36864909529685974, "learning_rate": 0.0002, "loss": 0.5318, "mean_token_accuracy": 0.7899544686079025, "num_tokens": 11811437.0, "step": 3262 }, { "entropy": 0.5860097408294678, "epoch": 3.043863742417172, "grad_norm": 0.44236719608306885, "learning_rate": 0.0002, "loss": 0.6134, "mean_token_accuracy": 0.757451519370079, "num_tokens": 11814983.0, "step": 3263 }, { "entropy": 0.5852121710777283, "epoch": 3.044797013532431, "grad_norm": 0.3412539064884186, "learning_rate": 0.0002, "loss": 0.5947, "mean_token_accuracy": 0.7567246109247208, "num_tokens": 11818589.0, "step": 3264 }, { "entropy": 0.5473352372646332, "epoch": 3.04573028464769, "grad_norm": 0.3548423945903778, "learning_rate": 0.0002, "loss": 0.5565, "mean_token_accuracy": 0.771997794508934, "num_tokens": 11822247.0, "step": 3265 }, { "entropy": 0.5881932526826859, "epoch": 3.046663555762949, "grad_norm": 0.35427436232566833, "learning_rate": 0.0002, "loss": 0.5852, "mean_token_accuracy": 0.7672399580478668, "num_tokens": 11825877.0, "step": 3266 }, { "entropy": 0.5866481959819794, "epoch": 3.047596826878208, "grad_norm": 0.35724857449531555, "learning_rate": 0.0002, "loss": 0.593, "mean_token_accuracy": 0.764951080083847, "num_tokens": 11829479.0, "step": 3267 }, { "entropy": 0.592366486787796, "epoch": 3.048530097993467, "grad_norm": 0.27809959650039673, "learning_rate": 0.0002, "loss": 0.5839, "mean_token_accuracy": 0.7647206336259842, "num_tokens": 11833157.0, "step": 3268 }, { "entropy": 0.599411204457283, "epoch": 3.049463369108726, "grad_norm": 0.31310388445854187, "learning_rate": 0.0002, "loss": 0.5786, "mean_token_accuracy": 0.7637552171945572, "num_tokens": 11836810.0, "step": 3269 }, { "entropy": 0.5920996963977814, "epoch": 3.050396640223985, "grad_norm": 0.3733389973640442, "learning_rate": 0.0002, "loss": 0.5855, "mean_token_accuracy": 0.7578735053539276, "num_tokens": 11840410.0, "step": 3270 }, { "entropy": 0.5538582652807236, "epoch": 3.051329911339244, "grad_norm": 0.35337910056114197, "learning_rate": 0.0002, "loss": 0.5558, "mean_token_accuracy": 0.7750561833381653, "num_tokens": 11843976.0, "step": 3271 }, { "entropy": 0.5900582075119019, "epoch": 3.052263182454503, "grad_norm": 0.3265371024608612, "learning_rate": 0.0002, "loss": 0.5907, "mean_token_accuracy": 0.761514738202095, "num_tokens": 11847714.0, "step": 3272 }, { "entropy": 0.5678048878908157, "epoch": 3.053196453569762, "grad_norm": 0.34549349546432495, "learning_rate": 0.0002, "loss": 0.5745, "mean_token_accuracy": 0.763996809720993, "num_tokens": 11851420.0, "step": 3273 }, { "entropy": 0.532403402030468, "epoch": 3.054129724685021, "grad_norm": 0.40814802050590515, "learning_rate": 0.0002, "loss": 0.549, "mean_token_accuracy": 0.7747573256492615, "num_tokens": 11855105.0, "step": 3274 }, { "entropy": 0.5521239042282104, "epoch": 3.05506299580028, "grad_norm": 0.3746466636657715, "learning_rate": 0.0002, "loss": 0.5806, "mean_token_accuracy": 0.7628461867570877, "num_tokens": 11858737.0, "step": 3275 }, { "entropy": 0.5592659264802933, "epoch": 3.055996266915539, "grad_norm": 0.3358624279499054, "learning_rate": 0.0002, "loss": 0.568, "mean_token_accuracy": 0.7734821289777756, "num_tokens": 11862377.0, "step": 3276 }, { "entropy": 0.5753948539495468, "epoch": 3.056929538030798, "grad_norm": 0.4084530174732208, "learning_rate": 0.0002, "loss": 0.5841, "mean_token_accuracy": 0.7620803862810135, "num_tokens": 11865996.0, "step": 3277 }, { "entropy": 0.5935320258140564, "epoch": 3.057862809146057, "grad_norm": 0.4440910816192627, "learning_rate": 0.0002, "loss": 0.5979, "mean_token_accuracy": 0.7615256458520889, "num_tokens": 11869587.0, "step": 3278 }, { "entropy": 0.5215371176600456, "epoch": 3.058796080261316, "grad_norm": 0.298142671585083, "learning_rate": 0.0002, "loss": 0.511, "mean_token_accuracy": 0.7888702005147934, "num_tokens": 11873082.0, "step": 3279 }, { "entropy": 0.5601858049631119, "epoch": 3.059729351376575, "grad_norm": 0.2879664897918701, "learning_rate": 0.0002, "loss": 0.5492, "mean_token_accuracy": 0.7827082872390747, "num_tokens": 11876719.0, "step": 3280 }, { "entropy": 0.5844492167234421, "epoch": 3.060662622491834, "grad_norm": 0.27805009484291077, "learning_rate": 0.0002, "loss": 0.5838, "mean_token_accuracy": 0.7643211036920547, "num_tokens": 11880416.0, "step": 3281 }, { "entropy": 0.5687485039234161, "epoch": 3.061595893607093, "grad_norm": 0.36844339966773987, "learning_rate": 0.0002, "loss": 0.586, "mean_token_accuracy": 0.7640807628631592, "num_tokens": 11884089.0, "step": 3282 }, { "entropy": 0.5778980404138565, "epoch": 3.062529164722352, "grad_norm": 0.34963950514793396, "learning_rate": 0.0002, "loss": 0.5872, "mean_token_accuracy": 0.7602520883083344, "num_tokens": 11887758.0, "step": 3283 }, { "entropy": 0.5459486991167068, "epoch": 3.063462435837611, "grad_norm": 0.33760759234428406, "learning_rate": 0.0002, "loss": 0.5586, "mean_token_accuracy": 0.7700901478528976, "num_tokens": 11891314.0, "step": 3284 }, { "entropy": 0.5556304007768631, "epoch": 3.06439570695287, "grad_norm": 0.4066332280635834, "learning_rate": 0.0002, "loss": 0.5659, "mean_token_accuracy": 0.7706780284643173, "num_tokens": 11894933.0, "step": 3285 }, { "entropy": 0.5996490269899368, "epoch": 3.0653289780681288, "grad_norm": 0.36480799317359924, "learning_rate": 0.0002, "loss": 0.6118, "mean_token_accuracy": 0.7555499970912933, "num_tokens": 11898485.0, "step": 3286 }, { "entropy": 0.5251405164599419, "epoch": 3.0662622491833877, "grad_norm": 0.4667423367500305, "learning_rate": 0.0002, "loss": 0.5374, "mean_token_accuracy": 0.7830724120140076, "num_tokens": 11902031.0, "step": 3287 }, { "entropy": 0.5868809819221497, "epoch": 3.0671955202986467, "grad_norm": 0.3532901108264923, "learning_rate": 0.0002, "loss": 0.5766, "mean_token_accuracy": 0.7682751566171646, "num_tokens": 11905603.0, "step": 3288 }, { "entropy": 0.573658749461174, "epoch": 3.0681287914139057, "grad_norm": 0.3821570575237274, "learning_rate": 0.0002, "loss": 0.5702, "mean_token_accuracy": 0.7734875082969666, "num_tokens": 11909065.0, "step": 3289 }, { "entropy": 0.5686099976301193, "epoch": 3.0690620625291647, "grad_norm": 0.3876951336860657, "learning_rate": 0.0002, "loss": 0.5697, "mean_token_accuracy": 0.7702282965183258, "num_tokens": 11912605.0, "step": 3290 }, { "entropy": 0.5377917438745499, "epoch": 3.0699953336444237, "grad_norm": 0.2982039451599121, "learning_rate": 0.0002, "loss": 0.537, "mean_token_accuracy": 0.7791972756385803, "num_tokens": 11916203.0, "step": 3291 }, { "entropy": 0.5961387008428574, "epoch": 3.0709286047596827, "grad_norm": 0.3387278616428375, "learning_rate": 0.0002, "loss": 0.6023, "mean_token_accuracy": 0.7553795874118805, "num_tokens": 11919843.0, "step": 3292 }, { "entropy": 0.5738270506262779, "epoch": 3.0718618758749416, "grad_norm": 0.3442314863204956, "learning_rate": 0.0002, "loss": 0.5781, "mean_token_accuracy": 0.7676345556974411, "num_tokens": 11923525.0, "step": 3293 }, { "entropy": 0.598867729306221, "epoch": 3.0727951469902006, "grad_norm": 0.33009782433509827, "learning_rate": 0.0002, "loss": 0.6083, "mean_token_accuracy": 0.7517653107643127, "num_tokens": 11927233.0, "step": 3294 }, { "entropy": 0.5458163470029831, "epoch": 3.0737284181054596, "grad_norm": 0.3257681727409363, "learning_rate": 0.0002, "loss": 0.5445, "mean_token_accuracy": 0.779736191034317, "num_tokens": 11930920.0, "step": 3295 }, { "entropy": 0.5551645308732986, "epoch": 3.0746616892207186, "grad_norm": 0.31428879499435425, "learning_rate": 0.0002, "loss": 0.569, "mean_token_accuracy": 0.7693517357110977, "num_tokens": 11934507.0, "step": 3296 }, { "entropy": 0.5767552703619003, "epoch": 3.0755949603359776, "grad_norm": 0.4190780222415924, "learning_rate": 0.0002, "loss": 0.5752, "mean_token_accuracy": 0.7699346095323563, "num_tokens": 11938103.0, "step": 3297 }, { "entropy": 0.5587742328643799, "epoch": 3.0765282314512366, "grad_norm": 0.2939176559448242, "learning_rate": 0.0002, "loss": 0.5568, "mean_token_accuracy": 0.7731087356805801, "num_tokens": 11941670.0, "step": 3298 }, { "entropy": 0.594311997294426, "epoch": 3.0774615025664955, "grad_norm": 0.4080130159854889, "learning_rate": 0.0002, "loss": 0.6092, "mean_token_accuracy": 0.7533360123634338, "num_tokens": 11945295.0, "step": 3299 }, { "entropy": 0.6153740137815475, "epoch": 3.0783947736817545, "grad_norm": 0.3202863335609436, "learning_rate": 0.0002, "loss": 0.6217, "mean_token_accuracy": 0.751051664352417, "num_tokens": 11948933.0, "step": 3300 }, { "entropy": 0.5442058891057968, "epoch": 3.0793280447970135, "grad_norm": 0.3038877844810486, "learning_rate": 0.0002, "loss": 0.5453, "mean_token_accuracy": 0.7798260599374771, "num_tokens": 11952620.0, "step": 3301 }, { "entropy": 0.5661356300115585, "epoch": 3.0802613159122725, "grad_norm": 0.3022439777851105, "learning_rate": 0.0002, "loss": 0.559, "mean_token_accuracy": 0.773269847035408, "num_tokens": 11956317.0, "step": 3302 }, { "entropy": 0.5638733208179474, "epoch": 3.0811945870275315, "grad_norm": 0.3236598074436188, "learning_rate": 0.0002, "loss": 0.5702, "mean_token_accuracy": 0.7732623517513275, "num_tokens": 11959975.0, "step": 3303 }, { "entropy": 0.5529799461364746, "epoch": 3.0821278581427904, "grad_norm": 0.3134319484233856, "learning_rate": 0.0002, "loss": 0.5525, "mean_token_accuracy": 0.7804372012615204, "num_tokens": 11963628.0, "step": 3304 }, { "entropy": 0.5852284878492355, "epoch": 3.0830611292580494, "grad_norm": 0.2917464077472687, "learning_rate": 0.0002, "loss": 0.5892, "mean_token_accuracy": 0.7623164057731628, "num_tokens": 11967256.0, "step": 3305 }, { "entropy": 0.576140969991684, "epoch": 3.0839944003733084, "grad_norm": 0.3620412349700928, "learning_rate": 0.0002, "loss": 0.5755, "mean_token_accuracy": 0.7694461643695831, "num_tokens": 11970918.0, "step": 3306 }, { "entropy": 0.5952577143907547, "epoch": 3.0849276714885674, "grad_norm": 0.31598371267318726, "learning_rate": 0.0002, "loss": 0.609, "mean_token_accuracy": 0.7557248771190643, "num_tokens": 11974501.0, "step": 3307 }, { "entropy": 0.5866020917892456, "epoch": 3.0858609426038264, "grad_norm": 0.44857820868492126, "learning_rate": 0.0002, "loss": 0.6047, "mean_token_accuracy": 0.7559608370065689, "num_tokens": 11977963.0, "step": 3308 }, { "entropy": 0.6037546396255493, "epoch": 3.0867942137190854, "grad_norm": 0.3215252757072449, "learning_rate": 0.0002, "loss": 0.6084, "mean_token_accuracy": 0.7525443434715271, "num_tokens": 11981581.0, "step": 3309 }, { "entropy": 0.5766941010951996, "epoch": 3.0877274848343443, "grad_norm": 0.2990507185459137, "learning_rate": 0.0002, "loss": 0.5841, "mean_token_accuracy": 0.7662306576967239, "num_tokens": 11985251.0, "step": 3310 }, { "entropy": 0.5654806792736053, "epoch": 3.0886607559496033, "grad_norm": 0.2806839942932129, "learning_rate": 0.0002, "loss": 0.5573, "mean_token_accuracy": 0.7754451185464859, "num_tokens": 11988905.0, "step": 3311 }, { "entropy": 0.5827172696590424, "epoch": 3.0895940270648623, "grad_norm": 0.3348761796951294, "learning_rate": 0.0002, "loss": 0.5814, "mean_token_accuracy": 0.7659704834222794, "num_tokens": 11992607.0, "step": 3312 }, { "entropy": 0.5855301320552826, "epoch": 3.0905272981801213, "grad_norm": 0.34078899025917053, "learning_rate": 0.0002, "loss": 0.5758, "mean_token_accuracy": 0.769530177116394, "num_tokens": 11996331.0, "step": 3313 }, { "entropy": 0.5438722223043442, "epoch": 3.0914605692953803, "grad_norm": 0.31183359026908875, "learning_rate": 0.0002, "loss": 0.5311, "mean_token_accuracy": 0.784532755613327, "num_tokens": 11999868.0, "step": 3314 }, { "entropy": 0.6327761858701706, "epoch": 3.0923938404106392, "grad_norm": 0.3287194073200226, "learning_rate": 0.0002, "loss": 0.6254, "mean_token_accuracy": 0.7480560392141342, "num_tokens": 12003527.0, "step": 3315 }, { "entropy": 0.5418529585003853, "epoch": 3.0933271115258982, "grad_norm": 0.3219356834888458, "learning_rate": 0.0002, "loss": 0.5427, "mean_token_accuracy": 0.7810071259737015, "num_tokens": 12007215.0, "step": 3316 }, { "entropy": 0.5729929283261299, "epoch": 3.094260382641157, "grad_norm": 0.3315852880477905, "learning_rate": 0.0002, "loss": 0.5704, "mean_token_accuracy": 0.7724975198507309, "num_tokens": 12010778.0, "step": 3317 }, { "entropy": 0.5787313133478165, "epoch": 3.095193653756416, "grad_norm": 0.3164331018924713, "learning_rate": 0.0002, "loss": 0.5849, "mean_token_accuracy": 0.7666173726320267, "num_tokens": 12014389.0, "step": 3318 }, { "entropy": 0.5768770128488541, "epoch": 3.096126924871675, "grad_norm": 0.4338223934173584, "learning_rate": 0.0002, "loss": 0.6008, "mean_token_accuracy": 0.7538031935691833, "num_tokens": 12017930.0, "step": 3319 }, { "entropy": 0.5483536869287491, "epoch": 3.097060195986934, "grad_norm": 0.33743834495544434, "learning_rate": 0.0002, "loss": 0.5631, "mean_token_accuracy": 0.7724947482347488, "num_tokens": 12021491.0, "step": 3320 }, { "entropy": 0.5713081955909729, "epoch": 3.097993467102193, "grad_norm": 0.31765761971473694, "learning_rate": 0.0002, "loss": 0.581, "mean_token_accuracy": 0.7674655169248581, "num_tokens": 12025211.0, "step": 3321 }, { "entropy": 0.5092109143733978, "epoch": 3.098926738217452, "grad_norm": 0.3376293182373047, "learning_rate": 0.0002, "loss": 0.526, "mean_token_accuracy": 0.7809778451919556, "num_tokens": 12028742.0, "step": 3322 }, { "entropy": 0.5921120643615723, "epoch": 3.099860009332711, "grad_norm": 0.291277676820755, "learning_rate": 0.0002, "loss": 0.5974, "mean_token_accuracy": 0.7573261559009552, "num_tokens": 12032305.0, "step": 3323 }, { "entropy": 0.589724987745285, "epoch": 3.10079328044797, "grad_norm": 0.2981966733932495, "learning_rate": 0.0002, "loss": 0.59, "mean_token_accuracy": 0.7603082209825516, "num_tokens": 12035917.0, "step": 3324 }, { "entropy": 0.6154551804065704, "epoch": 3.101726551563229, "grad_norm": 0.2892036437988281, "learning_rate": 0.0002, "loss": 0.6161, "mean_token_accuracy": 0.7541266828775406, "num_tokens": 12039437.0, "step": 3325 }, { "entropy": 0.594393864274025, "epoch": 3.102659822678488, "grad_norm": 0.40066057443618774, "learning_rate": 0.0002, "loss": 0.6026, "mean_token_accuracy": 0.7630078494548798, "num_tokens": 12042990.0, "step": 3326 }, { "entropy": 0.5861915796995163, "epoch": 3.103593093793747, "grad_norm": 0.32117459177970886, "learning_rate": 0.0002, "loss": 0.5827, "mean_token_accuracy": 0.7671522796154022, "num_tokens": 12046651.0, "step": 3327 }, { "entropy": 0.5419702529907227, "epoch": 3.104526364909006, "grad_norm": 0.40027308464050293, "learning_rate": 0.0002, "loss": 0.5398, "mean_token_accuracy": 0.775492861866951, "num_tokens": 12050305.0, "step": 3328 }, { "entropy": 0.5848996639251709, "epoch": 3.105459636024265, "grad_norm": 0.3733519911766052, "learning_rate": 0.0002, "loss": 0.5816, "mean_token_accuracy": 0.7659524828195572, "num_tokens": 12053825.0, "step": 3329 }, { "entropy": 0.5628032237291336, "epoch": 3.106392907139524, "grad_norm": 0.3824514150619507, "learning_rate": 0.0002, "loss": 0.5676, "mean_token_accuracy": 0.774803027510643, "num_tokens": 12057439.0, "step": 3330 }, { "entropy": 0.5952902436256409, "epoch": 3.107326178254783, "grad_norm": 0.3307967483997345, "learning_rate": 0.0002, "loss": 0.5933, "mean_token_accuracy": 0.7633701115846634, "num_tokens": 12061054.0, "step": 3331 }, { "entropy": 0.5690867304801941, "epoch": 3.108259449370042, "grad_norm": 0.39863789081573486, "learning_rate": 0.0002, "loss": 0.5723, "mean_token_accuracy": 0.7703157067298889, "num_tokens": 12064824.0, "step": 3332 }, { "entropy": 0.5981793105602264, "epoch": 3.109192720485301, "grad_norm": 0.32348570227622986, "learning_rate": 0.0002, "loss": 0.6079, "mean_token_accuracy": 0.7566349655389786, "num_tokens": 12068521.0, "step": 3333 }, { "entropy": 0.570891872048378, "epoch": 3.11012599160056, "grad_norm": 0.30705374479293823, "learning_rate": 0.0002, "loss": 0.5792, "mean_token_accuracy": 0.769045501947403, "num_tokens": 12072168.0, "step": 3334 }, { "entropy": 0.6009804308414459, "epoch": 3.111059262715819, "grad_norm": 0.3004703223705292, "learning_rate": 0.0002, "loss": 0.6112, "mean_token_accuracy": 0.7536547780036926, "num_tokens": 12075784.0, "step": 3335 }, { "entropy": 0.5795655995607376, "epoch": 3.111992533831078, "grad_norm": 0.31888478994369507, "learning_rate": 0.0002, "loss": 0.5801, "mean_token_accuracy": 0.762880802154541, "num_tokens": 12079554.0, "step": 3336 }, { "entropy": 0.5231815278530121, "epoch": 3.112925804946337, "grad_norm": 0.44172418117523193, "learning_rate": 0.0002, "loss": 0.5342, "mean_token_accuracy": 0.7819075286388397, "num_tokens": 12082985.0, "step": 3337 }, { "entropy": 0.5904891788959503, "epoch": 3.113859076061596, "grad_norm": 0.3179642856121063, "learning_rate": 0.0002, "loss": 0.603, "mean_token_accuracy": 0.7523630261421204, "num_tokens": 12086563.0, "step": 3338 }, { "entropy": 0.5695329159498215, "epoch": 3.114792347176855, "grad_norm": 0.35177674889564514, "learning_rate": 0.0002, "loss": 0.5644, "mean_token_accuracy": 0.7699542939662933, "num_tokens": 12090097.0, "step": 3339 }, { "entropy": 0.5893609523773193, "epoch": 3.115725618292114, "grad_norm": 0.34715989232063293, "learning_rate": 0.0002, "loss": 0.5874, "mean_token_accuracy": 0.7648748308420181, "num_tokens": 12093761.0, "step": 3340 }, { "entropy": 0.5394696444272995, "epoch": 3.116658889407373, "grad_norm": 0.36084210872650146, "learning_rate": 0.0002, "loss": 0.5349, "mean_token_accuracy": 0.7903193831443787, "num_tokens": 12097321.0, "step": 3341 }, { "entropy": 0.5581321865320206, "epoch": 3.1175921605226318, "grad_norm": 0.2733387053012848, "learning_rate": 0.0002, "loss": 0.5479, "mean_token_accuracy": 0.7804133743047714, "num_tokens": 12100950.0, "step": 3342 }, { "entropy": 0.5997810065746307, "epoch": 3.1185254316378908, "grad_norm": 0.3144678473472595, "learning_rate": 0.0002, "loss": 0.5958, "mean_token_accuracy": 0.7628905922174454, "num_tokens": 12104720.0, "step": 3343 }, { "entropy": 0.5383901074528694, "epoch": 3.1194587027531497, "grad_norm": 0.3221658170223236, "learning_rate": 0.0002, "loss": 0.5442, "mean_token_accuracy": 0.7838472872972488, "num_tokens": 12108381.0, "step": 3344 }, { "entropy": 0.5481646060943604, "epoch": 3.1203919738684087, "grad_norm": 0.3319031000137329, "learning_rate": 0.0002, "loss": 0.5435, "mean_token_accuracy": 0.7794167995452881, "num_tokens": 12112064.0, "step": 3345 }, { "entropy": 0.5809528678655624, "epoch": 3.1213252449836677, "grad_norm": 0.4528380036354065, "learning_rate": 0.0002, "loss": 0.6019, "mean_token_accuracy": 0.7545249611139297, "num_tokens": 12115772.0, "step": 3346 }, { "entropy": 0.4841812551021576, "epoch": 3.1222585160989267, "grad_norm": 0.36695238947868347, "learning_rate": 0.0002, "loss": 0.5024, "mean_token_accuracy": 0.802045926451683, "num_tokens": 12119221.0, "step": 3347 }, { "entropy": 0.5605773329734802, "epoch": 3.1231917872141857, "grad_norm": 0.3415738642215729, "learning_rate": 0.0002, "loss": 0.5683, "mean_token_accuracy": 0.7664918601512909, "num_tokens": 12122711.0, "step": 3348 }, { "entropy": 0.5638378858566284, "epoch": 3.1241250583294446, "grad_norm": 0.3774103820323944, "learning_rate": 0.0002, "loss": 0.5774, "mean_token_accuracy": 0.7654460668563843, "num_tokens": 12126379.0, "step": 3349 }, { "entropy": 0.610402300953865, "epoch": 3.1250583294447036, "grad_norm": 0.2994999289512634, "learning_rate": 0.0002, "loss": 0.6086, "mean_token_accuracy": 0.7525516450405121, "num_tokens": 12130056.0, "step": 3350 }, { "entropy": 0.5964173972606659, "epoch": 3.1259916005599626, "grad_norm": 0.326014906167984, "learning_rate": 0.0002, "loss": 0.5837, "mean_token_accuracy": 0.7667053639888763, "num_tokens": 12133681.0, "step": 3351 }, { "entropy": 0.5857187360525131, "epoch": 3.1269248716752216, "grad_norm": 0.3217678368091583, "learning_rate": 0.0002, "loss": 0.5771, "mean_token_accuracy": 0.7636246681213379, "num_tokens": 12137286.0, "step": 3352 }, { "entropy": 0.5664801001548767, "epoch": 3.1278581427904806, "grad_norm": 0.3151302933692932, "learning_rate": 0.0002, "loss": 0.5766, "mean_token_accuracy": 0.7652677595615387, "num_tokens": 12140785.0, "step": 3353 }, { "entropy": 0.5572614222764969, "epoch": 3.1287914139057396, "grad_norm": 0.3276812732219696, "learning_rate": 0.0002, "loss": 0.5656, "mean_token_accuracy": 0.7665868848562241, "num_tokens": 12144435.0, "step": 3354 }, { "entropy": 0.597048819065094, "epoch": 3.1297246850209985, "grad_norm": 0.3750784993171692, "learning_rate": 0.0002, "loss": 0.6083, "mean_token_accuracy": 0.7567569464445114, "num_tokens": 12148048.0, "step": 3355 }, { "entropy": 0.6021869331598282, "epoch": 3.1306579561362575, "grad_norm": 0.3276108503341675, "learning_rate": 0.0002, "loss": 0.6167, "mean_token_accuracy": 0.74915412068367, "num_tokens": 12151622.0, "step": 3356 }, { "entropy": 0.5846520215272903, "epoch": 3.1315912272515165, "grad_norm": 0.29064106941223145, "learning_rate": 0.0002, "loss": 0.5863, "mean_token_accuracy": 0.7638949602842331, "num_tokens": 12155307.0, "step": 3357 }, { "entropy": 0.5460968911647797, "epoch": 3.1325244983667755, "grad_norm": 0.44339731335639954, "learning_rate": 0.0002, "loss": 0.5589, "mean_token_accuracy": 0.7742795944213867, "num_tokens": 12158787.0, "step": 3358 }, { "entropy": 0.5251542925834656, "epoch": 3.1334577694820345, "grad_norm": 0.3431839048862457, "learning_rate": 0.0002, "loss": 0.533, "mean_token_accuracy": 0.7863042801618576, "num_tokens": 12162326.0, "step": 3359 }, { "entropy": 0.5826641917228699, "epoch": 3.1343910405972935, "grad_norm": 0.3227311968803406, "learning_rate": 0.0002, "loss": 0.5764, "mean_token_accuracy": 0.7695860415697098, "num_tokens": 12165960.0, "step": 3360 }, { "entropy": 0.5592413991689682, "epoch": 3.1353243117125524, "grad_norm": 0.28728553652763367, "learning_rate": 0.0002, "loss": 0.5614, "mean_token_accuracy": 0.7718911617994308, "num_tokens": 12169567.0, "step": 3361 }, { "entropy": 0.5720224529504776, "epoch": 3.1362575828278114, "grad_norm": 0.49239709973335266, "learning_rate": 0.0002, "loss": 0.5894, "mean_token_accuracy": 0.7706671208143234, "num_tokens": 12173231.0, "step": 3362 }, { "entropy": 0.5583447962999344, "epoch": 3.1371908539430704, "grad_norm": 0.3408893942832947, "learning_rate": 0.0002, "loss": 0.5748, "mean_token_accuracy": 0.7684617191553116, "num_tokens": 12176800.0, "step": 3363 }, { "entropy": 0.5488391220569611, "epoch": 3.1381241250583294, "grad_norm": 0.4131360650062561, "learning_rate": 0.0002, "loss": 0.5551, "mean_token_accuracy": 0.7759237289428711, "num_tokens": 12180343.0, "step": 3364 }, { "entropy": 0.5670682638883591, "epoch": 3.1390573961735884, "grad_norm": 0.36809635162353516, "learning_rate": 0.0002, "loss": 0.5752, "mean_token_accuracy": 0.7636937946081161, "num_tokens": 12184023.0, "step": 3365 }, { "entropy": 0.5823393762111664, "epoch": 3.1399906672888473, "grad_norm": 0.35286539793014526, "learning_rate": 0.0002, "loss": 0.5727, "mean_token_accuracy": 0.7727678567171097, "num_tokens": 12187503.0, "step": 3366 }, { "entropy": 0.5651304200291634, "epoch": 3.1409239384041063, "grad_norm": 0.34140801429748535, "learning_rate": 0.0002, "loss": 0.5564, "mean_token_accuracy": 0.7743328213691711, "num_tokens": 12191110.0, "step": 3367 }, { "entropy": 0.5390618294477463, "epoch": 3.1418572095193653, "grad_norm": 0.36929118633270264, "learning_rate": 0.0002, "loss": 0.5377, "mean_token_accuracy": 0.7837007641792297, "num_tokens": 12194632.0, "step": 3368 }, { "entropy": 0.623888224363327, "epoch": 3.1427904806346243, "grad_norm": 0.2900649309158325, "learning_rate": 0.0002, "loss": 0.6115, "mean_token_accuracy": 0.7512860596179962, "num_tokens": 12198259.0, "step": 3369 }, { "entropy": 0.5820949375629425, "epoch": 3.1437237517498833, "grad_norm": 0.3427778482437134, "learning_rate": 0.0002, "loss": 0.59, "mean_token_accuracy": 0.760360985994339, "num_tokens": 12201768.0, "step": 3370 }, { "entropy": 0.5602949410676956, "epoch": 3.1446570228651423, "grad_norm": 0.2969643175601959, "learning_rate": 0.0002, "loss": 0.5541, "mean_token_accuracy": 0.7766399383544922, "num_tokens": 12205402.0, "step": 3371 }, { "entropy": 0.6117941588163376, "epoch": 3.1455902939804012, "grad_norm": 0.3381527066230774, "learning_rate": 0.0002, "loss": 0.6062, "mean_token_accuracy": 0.7578441351652145, "num_tokens": 12209068.0, "step": 3372 }, { "entropy": 0.5883471965789795, "epoch": 3.1465235650956602, "grad_norm": 0.31809744238853455, "learning_rate": 0.0002, "loss": 0.597, "mean_token_accuracy": 0.7585402578115463, "num_tokens": 12212714.0, "step": 3373 }, { "entropy": 0.5741936191916466, "epoch": 3.147456836210919, "grad_norm": 0.349567174911499, "learning_rate": 0.0002, "loss": 0.5692, "mean_token_accuracy": 0.7691198885440826, "num_tokens": 12216358.0, "step": 3374 }, { "entropy": 0.5529589504003525, "epoch": 3.148390107326178, "grad_norm": 0.3505329191684723, "learning_rate": 0.0002, "loss": 0.5635, "mean_token_accuracy": 0.7728633433580399, "num_tokens": 12219906.0, "step": 3375 }, { "entropy": 0.5704671144485474, "epoch": 3.149323378441437, "grad_norm": 0.40056225657463074, "learning_rate": 0.0002, "loss": 0.5802, "mean_token_accuracy": 0.7635512053966522, "num_tokens": 12223516.0, "step": 3376 }, { "entropy": 0.550372339785099, "epoch": 3.150256649556696, "grad_norm": 0.37145236134529114, "learning_rate": 0.0002, "loss": 0.5626, "mean_token_accuracy": 0.772272452712059, "num_tokens": 12227166.0, "step": 3377 }, { "entropy": 0.5387049466371536, "epoch": 3.151189920671955, "grad_norm": 0.3277270495891571, "learning_rate": 0.0002, "loss": 0.5453, "mean_token_accuracy": 0.7799247354269028, "num_tokens": 12230736.0, "step": 3378 }, { "entropy": 0.5438738316297531, "epoch": 3.152123191787214, "grad_norm": 0.29702305793762207, "learning_rate": 0.0002, "loss": 0.5476, "mean_token_accuracy": 0.7795989662408829, "num_tokens": 12234352.0, "step": 3379 }, { "entropy": 0.5473948866128922, "epoch": 3.153056462902473, "grad_norm": 0.30543622374534607, "learning_rate": 0.0002, "loss": 0.542, "mean_token_accuracy": 0.7822788804769516, "num_tokens": 12237904.0, "step": 3380 }, { "entropy": 0.5915519297122955, "epoch": 3.153989734017732, "grad_norm": 0.3034839928150177, "learning_rate": 0.0002, "loss": 0.6018, "mean_token_accuracy": 0.7649374753236771, "num_tokens": 12241556.0, "step": 3381 }, { "entropy": 0.5717664808034897, "epoch": 3.154923005132991, "grad_norm": 0.41371721029281616, "learning_rate": 0.0002, "loss": 0.5918, "mean_token_accuracy": 0.7658505737781525, "num_tokens": 12245255.0, "step": 3382 }, { "entropy": 0.5726034343242645, "epoch": 3.15585627624825, "grad_norm": 0.3210294544696808, "learning_rate": 0.0002, "loss": 0.5825, "mean_token_accuracy": 0.7636376023292542, "num_tokens": 12248928.0, "step": 3383 }, { "entropy": 0.5715530961751938, "epoch": 3.156789547363509, "grad_norm": 0.3909350037574768, "learning_rate": 0.0002, "loss": 0.5841, "mean_token_accuracy": 0.7617567181587219, "num_tokens": 12252377.0, "step": 3384 }, { "entropy": 0.5559177026152611, "epoch": 3.157722818478768, "grad_norm": 0.38430261611938477, "learning_rate": 0.0002, "loss": 0.5535, "mean_token_accuracy": 0.7791155427694321, "num_tokens": 12255894.0, "step": 3385 }, { "entropy": 0.5883194953203201, "epoch": 3.158656089594027, "grad_norm": 0.3437056243419647, "learning_rate": 0.0002, "loss": 0.5833, "mean_token_accuracy": 0.7619248181581497, "num_tokens": 12259348.0, "step": 3386 }, { "entropy": 0.632411003112793, "epoch": 3.159589360709286, "grad_norm": 0.32679489254951477, "learning_rate": 0.0002, "loss": 0.6243, "mean_token_accuracy": 0.7468205392360687, "num_tokens": 12263102.0, "step": 3387 }, { "entropy": 0.5420273318886757, "epoch": 3.160522631824545, "grad_norm": 0.30968132615089417, "learning_rate": 0.0002, "loss": 0.5421, "mean_token_accuracy": 0.7814225107431412, "num_tokens": 12266629.0, "step": 3388 }, { "entropy": 0.618849441409111, "epoch": 3.161455902939804, "grad_norm": 0.3715273141860962, "learning_rate": 0.0002, "loss": 0.6383, "mean_token_accuracy": 0.7474346309900284, "num_tokens": 12270222.0, "step": 3389 }, { "entropy": 0.5983093529939651, "epoch": 3.162389174055063, "grad_norm": 0.3742247521877289, "learning_rate": 0.0002, "loss": 0.5968, "mean_token_accuracy": 0.760837733745575, "num_tokens": 12273744.0, "step": 3390 }, { "entropy": 0.5598663538694382, "epoch": 3.163322445170322, "grad_norm": 0.3515024781227112, "learning_rate": 0.0002, "loss": 0.5576, "mean_token_accuracy": 0.7767027467489243, "num_tokens": 12277507.0, "step": 3391 }, { "entropy": 0.5900998115539551, "epoch": 3.164255716285581, "grad_norm": 0.37571224570274353, "learning_rate": 0.0002, "loss": 0.5979, "mean_token_accuracy": 0.7596219629049301, "num_tokens": 12281086.0, "step": 3392 }, { "entropy": 0.5411986857652664, "epoch": 3.16518898740084, "grad_norm": 0.30989012122154236, "learning_rate": 0.0002, "loss": 0.551, "mean_token_accuracy": 0.7756365388631821, "num_tokens": 12284737.0, "step": 3393 }, { "entropy": 0.5979555398225784, "epoch": 3.166122258516099, "grad_norm": 0.3449721932411194, "learning_rate": 0.0002, "loss": 0.5912, "mean_token_accuracy": 0.7602803260087967, "num_tokens": 12288305.0, "step": 3394 }, { "entropy": 0.5923656523227692, "epoch": 3.167055529631358, "grad_norm": 0.3591870665550232, "learning_rate": 0.0002, "loss": 0.6017, "mean_token_accuracy": 0.756271243095398, "num_tokens": 12291859.0, "step": 3395 }, { "entropy": 0.5819128155708313, "epoch": 3.167988800746617, "grad_norm": 0.34853461384773254, "learning_rate": 0.0002, "loss": 0.5879, "mean_token_accuracy": 0.7639522105455399, "num_tokens": 12295525.0, "step": 3396 }, { "entropy": 0.5683513134717941, "epoch": 3.168922071861876, "grad_norm": 0.3129333257675171, "learning_rate": 0.0002, "loss": 0.5656, "mean_token_accuracy": 0.7694780677556992, "num_tokens": 12299083.0, "step": 3397 }, { "entropy": 0.5801762640476227, "epoch": 3.169855342977135, "grad_norm": 0.30494046211242676, "learning_rate": 0.0002, "loss": 0.5725, "mean_token_accuracy": 0.7738424688577652, "num_tokens": 12302741.0, "step": 3398 }, { "entropy": 0.6146019548177719, "epoch": 3.1707886140923938, "grad_norm": 0.35233259201049805, "learning_rate": 0.0002, "loss": 0.614, "mean_token_accuracy": 0.7487485706806183, "num_tokens": 12306424.0, "step": 3399 }, { "entropy": 0.586580902338028, "epoch": 3.1717218852076527, "grad_norm": 0.39888036251068115, "learning_rate": 0.0002, "loss": 0.6009, "mean_token_accuracy": 0.7605153471231461, "num_tokens": 12310048.0, "step": 3400 }, { "entropy": 0.595913901925087, "epoch": 3.1726551563229117, "grad_norm": 0.3397449553012848, "learning_rate": 0.0002, "loss": 0.6034, "mean_token_accuracy": 0.7580761909484863, "num_tokens": 12313687.0, "step": 3401 }, { "entropy": 0.5699460059404373, "epoch": 3.1735884274381707, "grad_norm": 0.36288443207740784, "learning_rate": 0.0002, "loss": 0.569, "mean_token_accuracy": 0.7700593322515488, "num_tokens": 12317301.0, "step": 3402 }, { "entropy": 0.5789060294628143, "epoch": 3.1745216985534297, "grad_norm": 0.3058337867259979, "learning_rate": 0.0002, "loss": 0.5852, "mean_token_accuracy": 0.7643412947654724, "num_tokens": 12320905.0, "step": 3403 }, { "entropy": 0.583456963300705, "epoch": 3.1754549696686887, "grad_norm": 0.28052428364753723, "learning_rate": 0.0002, "loss": 0.5915, "mean_token_accuracy": 0.7630862146615982, "num_tokens": 12324572.0, "step": 3404 }, { "entropy": 0.5844687521457672, "epoch": 3.1763882407839477, "grad_norm": 0.3664693832397461, "learning_rate": 0.0002, "loss": 0.6049, "mean_token_accuracy": 0.7485065311193466, "num_tokens": 12328264.0, "step": 3405 }, { "entropy": 0.611202746629715, "epoch": 3.1773215118992066, "grad_norm": 0.3052636682987213, "learning_rate": 0.0002, "loss": 0.6125, "mean_token_accuracy": 0.7579861730337143, "num_tokens": 12331948.0, "step": 3406 }, { "entropy": 0.5692786872386932, "epoch": 3.1782547830144656, "grad_norm": 0.2975655794143677, "learning_rate": 0.0002, "loss": 0.5668, "mean_token_accuracy": 0.7653402090072632, "num_tokens": 12335555.0, "step": 3407 }, { "entropy": 0.5876613557338715, "epoch": 3.1791880541297246, "grad_norm": 0.31324201822280884, "learning_rate": 0.0002, "loss": 0.5867, "mean_token_accuracy": 0.764846682548523, "num_tokens": 12339191.0, "step": 3408 }, { "entropy": 0.5787064284086227, "epoch": 3.1801213252449836, "grad_norm": 0.2950741946697235, "learning_rate": 0.0002, "loss": 0.5797, "mean_token_accuracy": 0.7656221687793732, "num_tokens": 12342823.0, "step": 3409 }, { "entropy": 0.5467731058597565, "epoch": 3.1810545963602426, "grad_norm": 0.3355877995491028, "learning_rate": 0.0002, "loss": 0.5533, "mean_token_accuracy": 0.7831338793039322, "num_tokens": 12346372.0, "step": 3410 }, { "entropy": 0.5589385032653809, "epoch": 3.1819878674755016, "grad_norm": 0.33012330532073975, "learning_rate": 0.0002, "loss": 0.5481, "mean_token_accuracy": 0.7786241918802261, "num_tokens": 12349975.0, "step": 3411 }, { "entropy": 0.5856512039899826, "epoch": 3.1829211385907605, "grad_norm": 0.3924480378627777, "learning_rate": 0.0002, "loss": 0.6008, "mean_token_accuracy": 0.7611373960971832, "num_tokens": 12353615.0, "step": 3412 }, { "entropy": 0.5686290413141251, "epoch": 3.1838544097060195, "grad_norm": 0.4560908377170563, "learning_rate": 0.0002, "loss": 0.5744, "mean_token_accuracy": 0.7694695740938187, "num_tokens": 12357291.0, "step": 3413 }, { "entropy": 0.5716057270765305, "epoch": 3.1847876808212785, "grad_norm": 0.35197505354881287, "learning_rate": 0.0002, "loss": 0.5886, "mean_token_accuracy": 0.7636765688657761, "num_tokens": 12360929.0, "step": 3414 }, { "entropy": 0.5599038600921631, "epoch": 3.1857209519365375, "grad_norm": 0.34638476371765137, "learning_rate": 0.0002, "loss": 0.5709, "mean_token_accuracy": 0.762939378619194, "num_tokens": 12364577.0, "step": 3415 }, { "entropy": 0.5335824489593506, "epoch": 3.1866542230517965, "grad_norm": 0.4069618880748749, "learning_rate": 0.0002, "loss": 0.5543, "mean_token_accuracy": 0.779619887471199, "num_tokens": 12368234.0, "step": 3416 }, { "entropy": 0.567078486084938, "epoch": 3.1875874941670554, "grad_norm": 0.30963781476020813, "learning_rate": 0.0002, "loss": 0.5575, "mean_token_accuracy": 0.7719650566577911, "num_tokens": 12371846.0, "step": 3417 }, { "entropy": 0.5844715982675552, "epoch": 3.1885207652823144, "grad_norm": 0.3128688931465149, "learning_rate": 0.0002, "loss": 0.5819, "mean_token_accuracy": 0.7622516006231308, "num_tokens": 12375411.0, "step": 3418 }, { "entropy": 0.6152746230363846, "epoch": 3.1894540363975734, "grad_norm": 0.400124728679657, "learning_rate": 0.0002, "loss": 0.6184, "mean_token_accuracy": 0.7460779547691345, "num_tokens": 12378995.0, "step": 3419 }, { "entropy": 0.5576332658529282, "epoch": 3.1903873075128324, "grad_norm": 0.2916274666786194, "learning_rate": 0.0002, "loss": 0.5478, "mean_token_accuracy": 0.7760219573974609, "num_tokens": 12382660.0, "step": 3420 }, { "entropy": 0.5471925586462021, "epoch": 3.1913205786280914, "grad_norm": 0.34106624126434326, "learning_rate": 0.0002, "loss": 0.5502, "mean_token_accuracy": 0.7767651677131653, "num_tokens": 12386388.0, "step": 3421 }, { "entropy": 0.5923531204462051, "epoch": 3.1922538497433504, "grad_norm": 0.2914336025714874, "learning_rate": 0.0002, "loss": 0.5941, "mean_token_accuracy": 0.7613362073898315, "num_tokens": 12390025.0, "step": 3422 }, { "entropy": 0.5737205147743225, "epoch": 3.1931871208586093, "grad_norm": 0.30275145173072815, "learning_rate": 0.0002, "loss": 0.577, "mean_token_accuracy": 0.765151634812355, "num_tokens": 12393659.0, "step": 3423 }, { "entropy": 0.6145357340574265, "epoch": 3.1941203919738683, "grad_norm": 0.33335164189338684, "learning_rate": 0.0002, "loss": 0.6116, "mean_token_accuracy": 0.7517257630825043, "num_tokens": 12397339.0, "step": 3424 }, { "entropy": 0.5534588843584061, "epoch": 3.1950536630891273, "grad_norm": 0.35679641366004944, "learning_rate": 0.0002, "loss": 0.5711, "mean_token_accuracy": 0.7693688720464706, "num_tokens": 12400907.0, "step": 3425 }, { "entropy": 0.6216437667608261, "epoch": 3.1959869342043863, "grad_norm": 0.3443339467048645, "learning_rate": 0.0002, "loss": 0.6226, "mean_token_accuracy": 0.7460353672504425, "num_tokens": 12404544.0, "step": 3426 }, { "entropy": 0.5600467324256897, "epoch": 3.1969202053196453, "grad_norm": 0.30500322580337524, "learning_rate": 0.0002, "loss": 0.5688, "mean_token_accuracy": 0.7704520672559738, "num_tokens": 12408206.0, "step": 3427 }, { "entropy": 0.5741176605224609, "epoch": 3.1978534764349043, "grad_norm": 0.3589382469654083, "learning_rate": 0.0002, "loss": 0.5726, "mean_token_accuracy": 0.7676559239625931, "num_tokens": 12411870.0, "step": 3428 }, { "entropy": 0.5743464827537537, "epoch": 3.1987867475501632, "grad_norm": 0.37221306562423706, "learning_rate": 0.0002, "loss": 0.5809, "mean_token_accuracy": 0.7638313621282578, "num_tokens": 12415541.0, "step": 3429 }, { "entropy": 0.5775733590126038, "epoch": 3.199720018665422, "grad_norm": 0.36187925934791565, "learning_rate": 0.0002, "loss": 0.5817, "mean_token_accuracy": 0.7680332511663437, "num_tokens": 12419274.0, "step": 3430 }, { "entropy": 0.5541295409202576, "epoch": 3.200653289780681, "grad_norm": 0.3525640368461609, "learning_rate": 0.0002, "loss": 0.5579, "mean_token_accuracy": 0.7727144658565521, "num_tokens": 12422984.0, "step": 3431 }, { "entropy": 0.5825414657592773, "epoch": 3.20158656089594, "grad_norm": 0.3804261386394501, "learning_rate": 0.0002, "loss": 0.581, "mean_token_accuracy": 0.7598031610250473, "num_tokens": 12426681.0, "step": 3432 }, { "entropy": 0.5395058840513229, "epoch": 3.202519832011199, "grad_norm": 0.3227798044681549, "learning_rate": 0.0002, "loss": 0.5498, "mean_token_accuracy": 0.779127910733223, "num_tokens": 12430192.0, "step": 3433 }, { "entropy": 0.5564335137605667, "epoch": 3.203453103126458, "grad_norm": 0.31674495339393616, "learning_rate": 0.0002, "loss": 0.5561, "mean_token_accuracy": 0.7755589634180069, "num_tokens": 12433964.0, "step": 3434 }, { "entropy": 0.6005851477384567, "epoch": 3.204386374241717, "grad_norm": 0.3386434316635132, "learning_rate": 0.0002, "loss": 0.6153, "mean_token_accuracy": 0.7498380243778229, "num_tokens": 12437628.0, "step": 3435 }, { "entropy": 0.5213591754436493, "epoch": 3.205319645356976, "grad_norm": 0.34046828746795654, "learning_rate": 0.0002, "loss": 0.532, "mean_token_accuracy": 0.7867457270622253, "num_tokens": 12441214.0, "step": 3436 }, { "entropy": 0.6039704531431198, "epoch": 3.206252916472235, "grad_norm": 0.44604358077049255, "learning_rate": 0.0002, "loss": 0.6276, "mean_token_accuracy": 0.751088336110115, "num_tokens": 12444922.0, "step": 3437 }, { "entropy": 0.573437824845314, "epoch": 3.207186187587494, "grad_norm": 0.3696539103984833, "learning_rate": 0.0002, "loss": 0.5674, "mean_token_accuracy": 0.7702459245920181, "num_tokens": 12448598.0, "step": 3438 }, { "entropy": 0.5717646181583405, "epoch": 3.208119458702753, "grad_norm": 0.333663672208786, "learning_rate": 0.0002, "loss": 0.5787, "mean_token_accuracy": 0.761628195643425, "num_tokens": 12452186.0, "step": 3439 }, { "entropy": 0.5591313987970352, "epoch": 3.209052729818012, "grad_norm": 0.34934428334236145, "learning_rate": 0.0002, "loss": 0.573, "mean_token_accuracy": 0.767824575304985, "num_tokens": 12455735.0, "step": 3440 }, { "entropy": 0.6136508733034134, "epoch": 3.209986000933271, "grad_norm": 0.29287680983543396, "learning_rate": 0.0002, "loss": 0.6029, "mean_token_accuracy": 0.7535134106874466, "num_tokens": 12459447.0, "step": 3441 }, { "entropy": 0.5783073008060455, "epoch": 3.21091927204853, "grad_norm": 0.3153122365474701, "learning_rate": 0.0002, "loss": 0.5803, "mean_token_accuracy": 0.7660355716943741, "num_tokens": 12463028.0, "step": 3442 }, { "entropy": 0.5742153972387314, "epoch": 3.211852543163789, "grad_norm": 0.33298128843307495, "learning_rate": 0.0002, "loss": 0.5749, "mean_token_accuracy": 0.7641850709915161, "num_tokens": 12466571.0, "step": 3443 }, { "entropy": 0.6258575767278671, "epoch": 3.212785814279048, "grad_norm": 0.3247137665748596, "learning_rate": 0.0002, "loss": 0.6121, "mean_token_accuracy": 0.7574439942836761, "num_tokens": 12470335.0, "step": 3444 }, { "entropy": 0.6017990857362747, "epoch": 3.213719085394307, "grad_norm": 0.34292906522750854, "learning_rate": 0.0002, "loss": 0.5998, "mean_token_accuracy": 0.7564824372529984, "num_tokens": 12473870.0, "step": 3445 }, { "entropy": 0.5595622807741165, "epoch": 3.214652356509566, "grad_norm": 0.33859387040138245, "learning_rate": 0.0002, "loss": 0.5584, "mean_token_accuracy": 0.7762850522994995, "num_tokens": 12477521.0, "step": 3446 }, { "entropy": 0.5984771400690079, "epoch": 3.215585627624825, "grad_norm": 0.33000946044921875, "learning_rate": 0.0002, "loss": 0.5973, "mean_token_accuracy": 0.7618459612131119, "num_tokens": 12481256.0, "step": 3447 }, { "entropy": 0.5456172823905945, "epoch": 3.216518898740084, "grad_norm": 0.34852534532546997, "learning_rate": 0.0002, "loss": 0.5614, "mean_token_accuracy": 0.7752137184143066, "num_tokens": 12484803.0, "step": 3448 }, { "entropy": 0.5771496891975403, "epoch": 3.217452169855343, "grad_norm": 0.37711918354034424, "learning_rate": 0.0002, "loss": 0.5875, "mean_token_accuracy": 0.7587802559137344, "num_tokens": 12488462.0, "step": 3449 }, { "entropy": 0.5823307782411575, "epoch": 3.218385440970602, "grad_norm": 0.3310795724391937, "learning_rate": 0.0002, "loss": 0.5913, "mean_token_accuracy": 0.7638469189405441, "num_tokens": 12492214.0, "step": 3450 }, { "entropy": 0.5701731592416763, "epoch": 3.219318712085861, "grad_norm": 0.36559590697288513, "learning_rate": 0.0002, "loss": 0.5781, "mean_token_accuracy": 0.760685607790947, "num_tokens": 12495773.0, "step": 3451 }, { "entropy": 0.5747452825307846, "epoch": 3.22025198320112, "grad_norm": 0.3255156874656677, "learning_rate": 0.0002, "loss": 0.5793, "mean_token_accuracy": 0.7674467414617538, "num_tokens": 12499241.0, "step": 3452 }, { "entropy": 0.6369077265262604, "epoch": 3.221185254316379, "grad_norm": 0.37815532088279724, "learning_rate": 0.0002, "loss": 0.6322, "mean_token_accuracy": 0.7447565197944641, "num_tokens": 12502825.0, "step": 3453 }, { "entropy": 0.6143908202648163, "epoch": 3.222118525431638, "grad_norm": 0.31781038641929626, "learning_rate": 0.0002, "loss": 0.6079, "mean_token_accuracy": 0.7557294815778732, "num_tokens": 12506582.0, "step": 3454 }, { "entropy": 0.5908450931310654, "epoch": 3.2230517965468968, "grad_norm": 0.37773048877716064, "learning_rate": 0.0002, "loss": 0.5903, "mean_token_accuracy": 0.7596397697925568, "num_tokens": 12510246.0, "step": 3455 }, { "entropy": 0.59175905585289, "epoch": 3.2239850676621558, "grad_norm": 0.37593111395835876, "learning_rate": 0.0002, "loss": 0.5956, "mean_token_accuracy": 0.7562637776136398, "num_tokens": 12513890.0, "step": 3456 }, { "entropy": 0.5713547170162201, "epoch": 3.2249183387774147, "grad_norm": 0.2906079590320587, "learning_rate": 0.0002, "loss": 0.5797, "mean_token_accuracy": 0.7709006369113922, "num_tokens": 12517544.0, "step": 3457 }, { "entropy": 0.5915031582117081, "epoch": 3.2258516098926737, "grad_norm": 0.35446691513061523, "learning_rate": 0.0002, "loss": 0.5997, "mean_token_accuracy": 0.7605699598789215, "num_tokens": 12521174.0, "step": 3458 }, { "entropy": 0.6099496781826019, "epoch": 3.2267848810079327, "grad_norm": 0.32972252368927, "learning_rate": 0.0002, "loss": 0.6087, "mean_token_accuracy": 0.759606122970581, "num_tokens": 12524885.0, "step": 3459 }, { "entropy": 0.548615962266922, "epoch": 3.2277181521231917, "grad_norm": 0.36443597078323364, "learning_rate": 0.0002, "loss": 0.5536, "mean_token_accuracy": 0.7733213156461716, "num_tokens": 12528438.0, "step": 3460 }, { "entropy": 0.5634920001029968, "epoch": 3.2286514232384507, "grad_norm": 0.34587159752845764, "learning_rate": 0.0002, "loss": 0.5667, "mean_token_accuracy": 0.7715646028518677, "num_tokens": 12532085.0, "step": 3461 }, { "entropy": 0.56559918820858, "epoch": 3.2295846943537097, "grad_norm": 0.34410980343818665, "learning_rate": 0.0002, "loss": 0.5761, "mean_token_accuracy": 0.7675856351852417, "num_tokens": 12535725.0, "step": 3462 }, { "entropy": 0.5981874167919159, "epoch": 3.2305179654689686, "grad_norm": 0.36183369159698486, "learning_rate": 0.0002, "loss": 0.5983, "mean_token_accuracy": 0.7614538967609406, "num_tokens": 12539397.0, "step": 3463 }, { "entropy": 0.5864405781030655, "epoch": 3.2314512365842276, "grad_norm": 0.3231503963470459, "learning_rate": 0.0002, "loss": 0.5847, "mean_token_accuracy": 0.7591737061738968, "num_tokens": 12543193.0, "step": 3464 }, { "entropy": 0.5538864433765411, "epoch": 3.2323845076994866, "grad_norm": 0.29956066608428955, "learning_rate": 0.0002, "loss": 0.5594, "mean_token_accuracy": 0.7795847952365875, "num_tokens": 12546867.0, "step": 3465 }, { "entropy": 0.5689100176095963, "epoch": 3.2333177788147456, "grad_norm": 0.3432539701461792, "learning_rate": 0.0002, "loss": 0.5806, "mean_token_accuracy": 0.7673849910497665, "num_tokens": 12550363.0, "step": 3466 }, { "entropy": 0.51860311627388, "epoch": 3.2342510499300046, "grad_norm": 0.3058547377586365, "learning_rate": 0.0002, "loss": 0.5265, "mean_token_accuracy": 0.7906652390956879, "num_tokens": 12553835.0, "step": 3467 }, { "entropy": 0.5708577781915665, "epoch": 3.2351843210452635, "grad_norm": 0.3375129699707031, "learning_rate": 0.0002, "loss": 0.5772, "mean_token_accuracy": 0.7698636502027512, "num_tokens": 12557477.0, "step": 3468 }, { "entropy": 0.5227376446127892, "epoch": 3.2361175921605225, "grad_norm": 0.3014373481273651, "learning_rate": 0.0002, "loss": 0.5199, "mean_token_accuracy": 0.7831443250179291, "num_tokens": 12561039.0, "step": 3469 }, { "entropy": 0.5856218636035919, "epoch": 3.2370508632757815, "grad_norm": 0.33715179562568665, "learning_rate": 0.0002, "loss": 0.5906, "mean_token_accuracy": 0.7599817216396332, "num_tokens": 12564682.0, "step": 3470 }, { "entropy": 0.6026125401258469, "epoch": 3.2379841343910405, "grad_norm": 0.3037037253379822, "learning_rate": 0.0002, "loss": 0.6, "mean_token_accuracy": 0.7581905871629715, "num_tokens": 12568397.0, "step": 3471 }, { "entropy": 0.6053171902894974, "epoch": 3.2389174055062995, "grad_norm": 0.33656924962997437, "learning_rate": 0.0002, "loss": 0.5958, "mean_token_accuracy": 0.7542498111724854, "num_tokens": 12572146.0, "step": 3472 }, { "entropy": 0.5713017582893372, "epoch": 3.2398506766215585, "grad_norm": 0.3326230049133301, "learning_rate": 0.0002, "loss": 0.5611, "mean_token_accuracy": 0.7721697986125946, "num_tokens": 12575763.0, "step": 3473 }, { "entropy": 0.566580206155777, "epoch": 3.2407839477368174, "grad_norm": 0.33456963300704956, "learning_rate": 0.0002, "loss": 0.5637, "mean_token_accuracy": 0.7730893194675446, "num_tokens": 12579434.0, "step": 3474 }, { "entropy": 0.5884917080402374, "epoch": 3.2417172188520764, "grad_norm": 0.33694303035736084, "learning_rate": 0.0002, "loss": 0.5987, "mean_token_accuracy": 0.7623604387044907, "num_tokens": 12583188.0, "step": 3475 }, { "entropy": 0.5500610545277596, "epoch": 3.2426504899673354, "grad_norm": 0.3554735779762268, "learning_rate": 0.0002, "loss": 0.5527, "mean_token_accuracy": 0.7756444364786148, "num_tokens": 12586771.0, "step": 3476 }, { "entropy": 0.5790189802646637, "epoch": 3.2435837610825944, "grad_norm": 0.33379584550857544, "learning_rate": 0.0002, "loss": 0.5783, "mean_token_accuracy": 0.7666683197021484, "num_tokens": 12590441.0, "step": 3477 }, { "entropy": 0.5844347327947617, "epoch": 3.2445170321978534, "grad_norm": 0.40332674980163574, "learning_rate": 0.0002, "loss": 0.5931, "mean_token_accuracy": 0.7623559981584549, "num_tokens": 12594190.0, "step": 3478 }, { "entropy": 0.5559370219707489, "epoch": 3.2454503033131124, "grad_norm": 0.38174179196357727, "learning_rate": 0.0002, "loss": 0.567, "mean_token_accuracy": 0.7698310762643814, "num_tokens": 12597820.0, "step": 3479 }, { "entropy": 0.5972052365541458, "epoch": 3.2463835744283713, "grad_norm": 0.35780656337738037, "learning_rate": 0.0002, "loss": 0.5996, "mean_token_accuracy": 0.7584429532289505, "num_tokens": 12601585.0, "step": 3480 }, { "entropy": 0.5374492555856705, "epoch": 3.2473168455436303, "grad_norm": 0.3223605751991272, "learning_rate": 0.0002, "loss": 0.545, "mean_token_accuracy": 0.7779808640480042, "num_tokens": 12605250.0, "step": 3481 }, { "entropy": 0.5636101216077805, "epoch": 3.2482501166588893, "grad_norm": 0.27408647537231445, "learning_rate": 0.0002, "loss": 0.5581, "mean_token_accuracy": 0.7769683301448822, "num_tokens": 12609006.0, "step": 3482 }, { "entropy": 0.5453791618347168, "epoch": 3.2491833877741483, "grad_norm": 0.3149451017379761, "learning_rate": 0.0002, "loss": 0.5522, "mean_token_accuracy": 0.7745856493711472, "num_tokens": 12612475.0, "step": 3483 }, { "entropy": 0.5685854703187943, "epoch": 3.2501166588894073, "grad_norm": 0.36625605821609497, "learning_rate": 0.0002, "loss": 0.5803, "mean_token_accuracy": 0.7680483460426331, "num_tokens": 12616073.0, "step": 3484 }, { "entropy": 0.5980734676122665, "epoch": 3.2510499300046662, "grad_norm": 0.26225635409355164, "learning_rate": 0.0002, "loss": 0.594, "mean_token_accuracy": 0.7624857574701309, "num_tokens": 12619781.0, "step": 3485 }, { "entropy": 0.5570379197597504, "epoch": 3.2519832011199252, "grad_norm": 0.36406949162483215, "learning_rate": 0.0002, "loss": 0.5686, "mean_token_accuracy": 0.7741573601961136, "num_tokens": 12623400.0, "step": 3486 }, { "entropy": 0.5601896420121193, "epoch": 3.252916472235184, "grad_norm": 0.3807485103607178, "learning_rate": 0.0002, "loss": 0.5784, "mean_token_accuracy": 0.7709416002035141, "num_tokens": 12626998.0, "step": 3487 }, { "entropy": 0.5459508523344994, "epoch": 3.253849743350443, "grad_norm": 0.39441898465156555, "learning_rate": 0.0002, "loss": 0.5745, "mean_token_accuracy": 0.7653300017118454, "num_tokens": 12630605.0, "step": 3488 }, { "entropy": 0.5530619025230408, "epoch": 3.254783014465702, "grad_norm": 0.3671073317527771, "learning_rate": 0.0002, "loss": 0.5683, "mean_token_accuracy": 0.7740761935710907, "num_tokens": 12634199.0, "step": 3489 }, { "entropy": 0.6091555655002594, "epoch": 3.255716285580961, "grad_norm": 0.3235841393470764, "learning_rate": 0.0002, "loss": 0.6073, "mean_token_accuracy": 0.7556083053350449, "num_tokens": 12637911.0, "step": 3490 }, { "entropy": 0.5656271278858185, "epoch": 3.25664955669622, "grad_norm": 0.46451476216316223, "learning_rate": 0.0002, "loss": 0.5924, "mean_token_accuracy": 0.7654076516628265, "num_tokens": 12641327.0, "step": 3491 }, { "entropy": 0.6169862002134323, "epoch": 3.257582827811479, "grad_norm": 0.2911243736743927, "learning_rate": 0.0002, "loss": 0.6097, "mean_token_accuracy": 0.7592959553003311, "num_tokens": 12645094.0, "step": 3492 }, { "entropy": 0.591512605547905, "epoch": 3.258516098926738, "grad_norm": 0.32244452834129333, "learning_rate": 0.0002, "loss": 0.5767, "mean_token_accuracy": 0.7684789299964905, "num_tokens": 12648667.0, "step": 3493 }, { "entropy": 0.5570146292448044, "epoch": 3.259449370041997, "grad_norm": 0.2885509133338928, "learning_rate": 0.0002, "loss": 0.5485, "mean_token_accuracy": 0.7805760055780411, "num_tokens": 12652187.0, "step": 3494 }, { "entropy": 0.5568027272820473, "epoch": 3.260382641157256, "grad_norm": 0.31292805075645447, "learning_rate": 0.0002, "loss": 0.5603, "mean_token_accuracy": 0.7733695358037949, "num_tokens": 12655792.0, "step": 3495 }, { "entropy": 0.575826421380043, "epoch": 3.261315912272515, "grad_norm": 0.36820223927497864, "learning_rate": 0.0002, "loss": 0.5746, "mean_token_accuracy": 0.7667249739170074, "num_tokens": 12659386.0, "step": 3496 }, { "entropy": 0.5592332780361176, "epoch": 3.262249183387774, "grad_norm": 0.3488422632217407, "learning_rate": 0.0002, "loss": 0.5583, "mean_token_accuracy": 0.7728912532329559, "num_tokens": 12662963.0, "step": 3497 }, { "entropy": 0.586890920996666, "epoch": 3.263182454503033, "grad_norm": 0.3300977349281311, "learning_rate": 0.0002, "loss": 0.5937, "mean_token_accuracy": 0.7645197212696075, "num_tokens": 12666680.0, "step": 3498 }, { "entropy": 0.5768714994192123, "epoch": 3.264115725618292, "grad_norm": 0.34342294931411743, "learning_rate": 0.0002, "loss": 0.5818, "mean_token_accuracy": 0.7619966566562653, "num_tokens": 12670407.0, "step": 3499 }, { "entropy": 0.5485050231218338, "epoch": 3.265048996733551, "grad_norm": 0.38880160450935364, "learning_rate": 0.0002, "loss": 0.5616, "mean_token_accuracy": 0.7713954597711563, "num_tokens": 12673991.0, "step": 3500 }, { "entropy": 0.5748550593852997, "epoch": 3.26598226784881, "grad_norm": 0.34434378147125244, "learning_rate": 0.0002, "loss": 0.5751, "mean_token_accuracy": 0.7664964348077774, "num_tokens": 12677543.0, "step": 3501 }, { "entropy": 0.6118647307157516, "epoch": 3.266915538964069, "grad_norm": 0.33986330032348633, "learning_rate": 0.0002, "loss": 0.6217, "mean_token_accuracy": 0.7435556352138519, "num_tokens": 12681236.0, "step": 3502 }, { "entropy": 0.5555671900510788, "epoch": 3.267848810079328, "grad_norm": 0.349298357963562, "learning_rate": 0.0002, "loss": 0.5703, "mean_token_accuracy": 0.7685307264328003, "num_tokens": 12684774.0, "step": 3503 }, { "entropy": 0.5829960256814957, "epoch": 3.268782081194587, "grad_norm": 0.2918517589569092, "learning_rate": 0.0002, "loss": 0.5894, "mean_token_accuracy": 0.7596587240695953, "num_tokens": 12688545.0, "step": 3504 }, { "entropy": 0.5631788522005081, "epoch": 3.269715352309846, "grad_norm": 0.3631798326969147, "learning_rate": 0.0002, "loss": 0.5665, "mean_token_accuracy": 0.7718207389116287, "num_tokens": 12692040.0, "step": 3505 }, { "entropy": 0.58211649954319, "epoch": 3.270648623425105, "grad_norm": 0.32736751437187195, "learning_rate": 0.0002, "loss": 0.5936, "mean_token_accuracy": 0.7693795412778854, "num_tokens": 12695706.0, "step": 3506 }, { "entropy": 0.55414879322052, "epoch": 3.271581894540364, "grad_norm": 0.3284951150417328, "learning_rate": 0.0002, "loss": 0.5552, "mean_token_accuracy": 0.7728336900472641, "num_tokens": 12699239.0, "step": 3507 }, { "entropy": 0.5851959139108658, "epoch": 3.272515165655623, "grad_norm": 0.2971942722797394, "learning_rate": 0.0002, "loss": 0.5851, "mean_token_accuracy": 0.7690276354551315, "num_tokens": 12702804.0, "step": 3508 }, { "entropy": 0.5685626119375229, "epoch": 3.273448436770882, "grad_norm": 0.3023169934749603, "learning_rate": 0.0002, "loss": 0.5618, "mean_token_accuracy": 0.7766228318214417, "num_tokens": 12706477.0, "step": 3509 }, { "entropy": 0.5528089106082916, "epoch": 3.274381707886141, "grad_norm": 0.34755614399909973, "learning_rate": 0.0002, "loss": 0.5518, "mean_token_accuracy": 0.7746007144451141, "num_tokens": 12709998.0, "step": 3510 }, { "entropy": 0.5879451483488083, "epoch": 3.2753149790014, "grad_norm": 0.29311156272888184, "learning_rate": 0.0002, "loss": 0.5959, "mean_token_accuracy": 0.7581737488508224, "num_tokens": 12713785.0, "step": 3511 }, { "entropy": 0.5895410925149918, "epoch": 3.2762482501166588, "grad_norm": 0.3286229074001312, "learning_rate": 0.0002, "loss": 0.5985, "mean_token_accuracy": 0.7600379437208176, "num_tokens": 12717462.0, "step": 3512 }, { "entropy": 0.6002267003059387, "epoch": 3.2771815212319177, "grad_norm": 0.35827428102493286, "learning_rate": 0.0002, "loss": 0.6092, "mean_token_accuracy": 0.7534103542566299, "num_tokens": 12721102.0, "step": 3513 }, { "entropy": 0.5767338424921036, "epoch": 3.2781147923471767, "grad_norm": 0.33748695254325867, "learning_rate": 0.0002, "loss": 0.5768, "mean_token_accuracy": 0.7686408013105392, "num_tokens": 12724822.0, "step": 3514 }, { "entropy": 0.559476375579834, "epoch": 3.2790480634624357, "grad_norm": 0.34347042441368103, "learning_rate": 0.0002, "loss": 0.5686, "mean_token_accuracy": 0.7702213674783707, "num_tokens": 12728474.0, "step": 3515 }, { "entropy": 0.5635605379939079, "epoch": 3.2799813345776947, "grad_norm": 0.38503214716911316, "learning_rate": 0.0002, "loss": 0.5651, "mean_token_accuracy": 0.7725612074136734, "num_tokens": 12732053.0, "step": 3516 }, { "entropy": 0.5951201319694519, "epoch": 3.2809146056929537, "grad_norm": 0.3708593249320984, "learning_rate": 0.0002, "loss": 0.5956, "mean_token_accuracy": 0.7557997703552246, "num_tokens": 12735808.0, "step": 3517 }, { "entropy": 0.5899913758039474, "epoch": 3.2818478768082127, "grad_norm": 0.332207590341568, "learning_rate": 0.0002, "loss": 0.5904, "mean_token_accuracy": 0.7624277025461197, "num_tokens": 12739465.0, "step": 3518 }, { "entropy": 0.5369748994708061, "epoch": 3.2827811479234716, "grad_norm": 0.3289025127887726, "learning_rate": 0.0002, "loss": 0.5405, "mean_token_accuracy": 0.7836969494819641, "num_tokens": 12743033.0, "step": 3519 }, { "entropy": 0.5656716227531433, "epoch": 3.2837144190387306, "grad_norm": 0.31128084659576416, "learning_rate": 0.0002, "loss": 0.5662, "mean_token_accuracy": 0.7700157314538956, "num_tokens": 12746650.0, "step": 3520 }, { "entropy": 0.5669850558042526, "epoch": 3.2846476901539896, "grad_norm": 0.3420146405696869, "learning_rate": 0.0002, "loss": 0.5809, "mean_token_accuracy": 0.7641702741384506, "num_tokens": 12750149.0, "step": 3521 }, { "entropy": 0.5844371020793915, "epoch": 3.2855809612692486, "grad_norm": 0.3672175705432892, "learning_rate": 0.0002, "loss": 0.5967, "mean_token_accuracy": 0.7543726712465286, "num_tokens": 12753776.0, "step": 3522 }, { "entropy": 0.5934655517339706, "epoch": 3.2865142323845076, "grad_norm": 0.3790332078933716, "learning_rate": 0.0002, "loss": 0.5834, "mean_token_accuracy": 0.7617530822753906, "num_tokens": 12757365.0, "step": 3523 }, { "entropy": 0.5811368674039841, "epoch": 3.2874475034997666, "grad_norm": 0.3162591755390167, "learning_rate": 0.0002, "loss": 0.58, "mean_token_accuracy": 0.7631847262382507, "num_tokens": 12761045.0, "step": 3524 }, { "entropy": 0.5838279128074646, "epoch": 3.2883807746150255, "grad_norm": 0.35948389768600464, "learning_rate": 0.0002, "loss": 0.585, "mean_token_accuracy": 0.766775369644165, "num_tokens": 12764806.0, "step": 3525 }, { "entropy": 0.583508312702179, "epoch": 3.2893140457302845, "grad_norm": 0.3965013325214386, "learning_rate": 0.0002, "loss": 0.5907, "mean_token_accuracy": 0.7686806470155716, "num_tokens": 12768426.0, "step": 3526 }, { "entropy": 0.5966184884309769, "epoch": 3.2902473168455435, "grad_norm": 0.3357316851615906, "learning_rate": 0.0002, "loss": 0.5959, "mean_token_accuracy": 0.7566381096839905, "num_tokens": 12772274.0, "step": 3527 }, { "entropy": 0.5739154517650604, "epoch": 3.2911805879608025, "grad_norm": 0.4197036921977997, "learning_rate": 0.0002, "loss": 0.5734, "mean_token_accuracy": 0.766748920083046, "num_tokens": 12775987.0, "step": 3528 }, { "entropy": 0.5915751159191132, "epoch": 3.2921138590760615, "grad_norm": 0.4114255905151367, "learning_rate": 0.0002, "loss": 0.6005, "mean_token_accuracy": 0.7552734464406967, "num_tokens": 12779556.0, "step": 3529 }, { "entropy": 0.5786140859127045, "epoch": 3.2930471301913204, "grad_norm": 0.32731112837791443, "learning_rate": 0.0002, "loss": 0.5892, "mean_token_accuracy": 0.7613353580236435, "num_tokens": 12783271.0, "step": 3530 }, { "entropy": 0.587250143289566, "epoch": 3.2939804013065794, "grad_norm": 0.32460924983024597, "learning_rate": 0.0002, "loss": 0.5862, "mean_token_accuracy": 0.7624487578868866, "num_tokens": 12786782.0, "step": 3531 }, { "entropy": 0.5553157776594162, "epoch": 3.2949136724218384, "grad_norm": 0.33747851848602295, "learning_rate": 0.0002, "loss": 0.5476, "mean_token_accuracy": 0.7820670455694199, "num_tokens": 12790402.0, "step": 3532 }, { "entropy": 0.5718697011470795, "epoch": 3.2958469435370974, "grad_norm": 0.35412323474884033, "learning_rate": 0.0002, "loss": 0.5832, "mean_token_accuracy": 0.7645402550697327, "num_tokens": 12793950.0, "step": 3533 }, { "entropy": 0.5426996797323227, "epoch": 3.2967802146523564, "grad_norm": 0.311614453792572, "learning_rate": 0.0002, "loss": 0.5511, "mean_token_accuracy": 0.7743299007415771, "num_tokens": 12797534.0, "step": 3534 }, { "entropy": 0.5600429028272629, "epoch": 3.2977134857676154, "grad_norm": 0.3192189931869507, "learning_rate": 0.0002, "loss": 0.5585, "mean_token_accuracy": 0.7707420289516449, "num_tokens": 12801106.0, "step": 3535 }, { "entropy": 0.5745146721601486, "epoch": 3.2986467568828743, "grad_norm": 0.3741686940193176, "learning_rate": 0.0002, "loss": 0.5862, "mean_token_accuracy": 0.7592151761054993, "num_tokens": 12804723.0, "step": 3536 }, { "entropy": 0.5652812719345093, "epoch": 3.2995800279981333, "grad_norm": 0.32827624678611755, "learning_rate": 0.0002, "loss": 0.5755, "mean_token_accuracy": 0.763958215713501, "num_tokens": 12808367.0, "step": 3537 }, { "entropy": 0.5801624953746796, "epoch": 3.3005132991133923, "grad_norm": 0.34260594844818115, "learning_rate": 0.0002, "loss": 0.5914, "mean_token_accuracy": 0.7666043639183044, "num_tokens": 12811992.0, "step": 3538 }, { "entropy": 0.6020073443651199, "epoch": 3.3014465702286513, "grad_norm": 0.32217085361480713, "learning_rate": 0.0002, "loss": 0.5881, "mean_token_accuracy": 0.7668110132217407, "num_tokens": 12815602.0, "step": 3539 }, { "entropy": 0.5741595029830933, "epoch": 3.3023798413439103, "grad_norm": 0.34010839462280273, "learning_rate": 0.0002, "loss": 0.5848, "mean_token_accuracy": 0.7603214234113693, "num_tokens": 12819201.0, "step": 3540 }, { "entropy": 0.5945567339658737, "epoch": 3.3033131124591693, "grad_norm": 0.3444623649120331, "learning_rate": 0.0002, "loss": 0.5987, "mean_token_accuracy": 0.7586384415626526, "num_tokens": 12823017.0, "step": 3541 }, { "entropy": 0.5824175626039505, "epoch": 3.3042463835744282, "grad_norm": 0.37789392471313477, "learning_rate": 0.0002, "loss": 0.5992, "mean_token_accuracy": 0.7567434310913086, "num_tokens": 12826713.0, "step": 3542 }, { "entropy": 0.57651287317276, "epoch": 3.305179654689687, "grad_norm": 0.31921178102493286, "learning_rate": 0.0002, "loss": 0.5803, "mean_token_accuracy": 0.7691146284341812, "num_tokens": 12830296.0, "step": 3543 }, { "entropy": 0.5783520191907883, "epoch": 3.306112925804946, "grad_norm": 0.3478955924510956, "learning_rate": 0.0002, "loss": 0.5981, "mean_token_accuracy": 0.7561089843511581, "num_tokens": 12833805.0, "step": 3544 }, { "entropy": 0.6162155717611313, "epoch": 3.307046196920205, "grad_norm": 0.30626383423805237, "learning_rate": 0.0002, "loss": 0.5996, "mean_token_accuracy": 0.7578680515289307, "num_tokens": 12837449.0, "step": 3545 }, { "entropy": 0.6050973683595657, "epoch": 3.307979468035464, "grad_norm": 0.3328168988227844, "learning_rate": 0.0002, "loss": 0.6087, "mean_token_accuracy": 0.7542469501495361, "num_tokens": 12841121.0, "step": 3546 }, { "entropy": 0.6172062009572983, "epoch": 3.308912739150723, "grad_norm": 0.39308860898017883, "learning_rate": 0.0002, "loss": 0.6183, "mean_token_accuracy": 0.7523662596940994, "num_tokens": 12844817.0, "step": 3547 }, { "entropy": 0.5594506561756134, "epoch": 3.309846010265982, "grad_norm": 0.2812630236148834, "learning_rate": 0.0002, "loss": 0.5526, "mean_token_accuracy": 0.778497040271759, "num_tokens": 12848402.0, "step": 3548 }, { "entropy": 0.5994167774915695, "epoch": 3.310779281381241, "grad_norm": 0.3792758584022522, "learning_rate": 0.0002, "loss": 0.5971, "mean_token_accuracy": 0.7616264373064041, "num_tokens": 12852006.0, "step": 3549 }, { "entropy": 0.58873550593853, "epoch": 3.3117125524965, "grad_norm": 0.32484713196754456, "learning_rate": 0.0002, "loss": 0.5973, "mean_token_accuracy": 0.7541863918304443, "num_tokens": 12855740.0, "step": 3550 }, { "entropy": 0.6171249002218246, "epoch": 3.312645823611759, "grad_norm": 0.34987491369247437, "learning_rate": 0.0002, "loss": 0.6163, "mean_token_accuracy": 0.7550085484981537, "num_tokens": 12859302.0, "step": 3551 }, { "entropy": 0.5827347636222839, "epoch": 3.313579094727018, "grad_norm": 0.323936402797699, "learning_rate": 0.0002, "loss": 0.5857, "mean_token_accuracy": 0.7600400447845459, "num_tokens": 12862936.0, "step": 3552 }, { "entropy": 0.6123741269111633, "epoch": 3.314512365842277, "grad_norm": 0.37786322832107544, "learning_rate": 0.0002, "loss": 0.6306, "mean_token_accuracy": 0.7475491911172867, "num_tokens": 12866539.0, "step": 3553 }, { "entropy": 0.5787962824106216, "epoch": 3.315445636957536, "grad_norm": 0.40689319372177124, "learning_rate": 0.0002, "loss": 0.5994, "mean_token_accuracy": 0.7565918564796448, "num_tokens": 12870075.0, "step": 3554 }, { "entropy": 0.5630239099264145, "epoch": 3.316378908072795, "grad_norm": 0.3072739541530609, "learning_rate": 0.0002, "loss": 0.5641, "mean_token_accuracy": 0.7723774015903473, "num_tokens": 12873730.0, "step": 3555 }, { "entropy": 0.5595102906227112, "epoch": 3.317312179188054, "grad_norm": 0.31859734654426575, "learning_rate": 0.0002, "loss": 0.5624, "mean_token_accuracy": 0.7746931910514832, "num_tokens": 12877322.0, "step": 3556 }, { "entropy": 0.6063546687364578, "epoch": 3.318245450303313, "grad_norm": 0.3166681230068207, "learning_rate": 0.0002, "loss": 0.6037, "mean_token_accuracy": 0.7586980164051056, "num_tokens": 12881037.0, "step": 3557 }, { "entropy": 0.5610704571008682, "epoch": 3.319178721418572, "grad_norm": 0.37489545345306396, "learning_rate": 0.0002, "loss": 0.5693, "mean_token_accuracy": 0.7677082717418671, "num_tokens": 12884601.0, "step": 3558 }, { "entropy": 0.5642856359481812, "epoch": 3.320111992533831, "grad_norm": 0.30490759015083313, "learning_rate": 0.0002, "loss": 0.5624, "mean_token_accuracy": 0.7755732834339142, "num_tokens": 12888438.0, "step": 3559 }, { "entropy": 0.5616464391350746, "epoch": 3.32104526364909, "grad_norm": 0.34926897287368774, "learning_rate": 0.0002, "loss": 0.5706, "mean_token_accuracy": 0.7686270028352737, "num_tokens": 12892121.0, "step": 3560 }, { "entropy": 0.5802826285362244, "epoch": 3.321978534764349, "grad_norm": 0.30415961146354675, "learning_rate": 0.0002, "loss": 0.5751, "mean_token_accuracy": 0.7673206627368927, "num_tokens": 12895834.0, "step": 3561 }, { "entropy": 0.5788423418998718, "epoch": 3.322911805879608, "grad_norm": 0.3539387285709381, "learning_rate": 0.0002, "loss": 0.5849, "mean_token_accuracy": 0.7729215174913406, "num_tokens": 12899460.0, "step": 3562 }, { "entropy": 0.571048304438591, "epoch": 3.323845076994867, "grad_norm": 0.34180065989494324, "learning_rate": 0.0002, "loss": 0.5784, "mean_token_accuracy": 0.7671468555927277, "num_tokens": 12903169.0, "step": 3563 }, { "entropy": 0.5455076023936272, "epoch": 3.324778348110126, "grad_norm": 0.3767808973789215, "learning_rate": 0.0002, "loss": 0.5598, "mean_token_accuracy": 0.7702785283327103, "num_tokens": 12906624.0, "step": 3564 }, { "entropy": 0.5806710571050644, "epoch": 3.325711619225385, "grad_norm": 0.3247108459472656, "learning_rate": 0.0002, "loss": 0.5831, "mean_token_accuracy": 0.7653170526027679, "num_tokens": 12910240.0, "step": 3565 }, { "entropy": 0.5977975726127625, "epoch": 3.326644890340644, "grad_norm": 0.39513304829597473, "learning_rate": 0.0002, "loss": 0.6001, "mean_token_accuracy": 0.7630519717931747, "num_tokens": 12913884.0, "step": 3566 }, { "entropy": 0.5725344121456146, "epoch": 3.327578161455903, "grad_norm": 0.32677510380744934, "learning_rate": 0.0002, "loss": 0.5828, "mean_token_accuracy": 0.770171731710434, "num_tokens": 12917459.0, "step": 3567 }, { "entropy": 0.6075931787490845, "epoch": 3.3285114325711618, "grad_norm": 0.3278302252292633, "learning_rate": 0.0002, "loss": 0.6055, "mean_token_accuracy": 0.7533722519874573, "num_tokens": 12921153.0, "step": 3568 }, { "entropy": 0.6008288562297821, "epoch": 3.3294447036864208, "grad_norm": 0.33340251445770264, "learning_rate": 0.0002, "loss": 0.6032, "mean_token_accuracy": 0.7494803071022034, "num_tokens": 12924774.0, "step": 3569 }, { "entropy": 0.5895597636699677, "epoch": 3.3303779748016797, "grad_norm": 0.36855435371398926, "learning_rate": 0.0002, "loss": 0.5913, "mean_token_accuracy": 0.762884184718132, "num_tokens": 12928397.0, "step": 3570 }, { "entropy": 0.5506584346294403, "epoch": 3.3313112459169387, "grad_norm": 0.2721220850944519, "learning_rate": 0.0002, "loss": 0.5505, "mean_token_accuracy": 0.7795367389917374, "num_tokens": 12931938.0, "step": 3571 }, { "entropy": 0.5397593975067139, "epoch": 3.3322445170321977, "grad_norm": 0.4066859185695648, "learning_rate": 0.0002, "loss": 0.5548, "mean_token_accuracy": 0.776322066783905, "num_tokens": 12935482.0, "step": 3572 }, { "entropy": 0.5511198192834854, "epoch": 3.3331777881474567, "grad_norm": 0.388356477022171, "learning_rate": 0.0002, "loss": 0.5392, "mean_token_accuracy": 0.7830780297517776, "num_tokens": 12939125.0, "step": 3573 }, { "entropy": 0.5851507186889648, "epoch": 3.3341110592627157, "grad_norm": 0.33682945370674133, "learning_rate": 0.0002, "loss": 0.5817, "mean_token_accuracy": 0.7641492933034897, "num_tokens": 12942814.0, "step": 3574 }, { "entropy": 0.5955893397331238, "epoch": 3.3350443303779747, "grad_norm": 0.34697553515434265, "learning_rate": 0.0002, "loss": 0.6051, "mean_token_accuracy": 0.7573183178901672, "num_tokens": 12946451.0, "step": 3575 }, { "entropy": 0.5808471888303757, "epoch": 3.3359776014932336, "grad_norm": 0.4537312090396881, "learning_rate": 0.0002, "loss": 0.5935, "mean_token_accuracy": 0.7643401771783829, "num_tokens": 12950037.0, "step": 3576 }, { "entropy": 0.6055961549282074, "epoch": 3.3369108726084926, "grad_norm": 0.3827846348285675, "learning_rate": 0.0002, "loss": 0.617, "mean_token_accuracy": 0.7509162127971649, "num_tokens": 12953666.0, "step": 3577 }, { "entropy": 0.635187417268753, "epoch": 3.3378441437237516, "grad_norm": 0.31342050433158875, "learning_rate": 0.0002, "loss": 0.6196, "mean_token_accuracy": 0.7541668266057968, "num_tokens": 12957345.0, "step": 3578 }, { "entropy": 0.592944398522377, "epoch": 3.3387774148390106, "grad_norm": 0.2909313142299652, "learning_rate": 0.0002, "loss": 0.5808, "mean_token_accuracy": 0.7646371424198151, "num_tokens": 12961088.0, "step": 3579 }, { "entropy": 0.5872069597244263, "epoch": 3.3397106859542696, "grad_norm": 0.3263028562068939, "learning_rate": 0.0002, "loss": 0.5786, "mean_token_accuracy": 0.7666190713644028, "num_tokens": 12964674.0, "step": 3580 }, { "entropy": 0.5740813612937927, "epoch": 3.3406439570695285, "grad_norm": 0.34482529759407043, "learning_rate": 0.0002, "loss": 0.5786, "mean_token_accuracy": 0.7617161124944687, "num_tokens": 12968343.0, "step": 3581 }, { "entropy": 0.5498665794730186, "epoch": 3.3415772281847875, "grad_norm": 0.38168081641197205, "learning_rate": 0.0002, "loss": 0.568, "mean_token_accuracy": 0.768210157752037, "num_tokens": 12971941.0, "step": 3582 }, { "entropy": 0.575865626335144, "epoch": 3.3425104993000465, "grad_norm": 0.4360131025314331, "learning_rate": 0.0002, "loss": 0.5954, "mean_token_accuracy": 0.7616235911846161, "num_tokens": 12975692.0, "step": 3583 }, { "entropy": 0.5597129017114639, "epoch": 3.3434437704153055, "grad_norm": 0.35423871874809265, "learning_rate": 0.0002, "loss": 0.567, "mean_token_accuracy": 0.7707638740539551, "num_tokens": 12979222.0, "step": 3584 }, { "entropy": 0.573897585272789, "epoch": 3.3443770415305645, "grad_norm": 0.31674280762672424, "learning_rate": 0.0002, "loss": 0.5802, "mean_token_accuracy": 0.7701295912265778, "num_tokens": 12982822.0, "step": 3585 }, { "entropy": 0.5922565162181854, "epoch": 3.3453103126458235, "grad_norm": 0.3443075120449066, "learning_rate": 0.0002, "loss": 0.5873, "mean_token_accuracy": 0.7623215466737747, "num_tokens": 12986519.0, "step": 3586 }, { "entropy": 0.5962289869785309, "epoch": 3.3462435837610824, "grad_norm": 0.3244750499725342, "learning_rate": 0.0002, "loss": 0.5947, "mean_token_accuracy": 0.7571402937173843, "num_tokens": 12990077.0, "step": 3587 }, { "entropy": 0.5703475177288055, "epoch": 3.3471768548763414, "grad_norm": 0.3706181049346924, "learning_rate": 0.0002, "loss": 0.5766, "mean_token_accuracy": 0.7638845890760422, "num_tokens": 12993670.0, "step": 3588 }, { "entropy": 0.5339751541614532, "epoch": 3.3481101259916004, "grad_norm": 0.37254056334495544, "learning_rate": 0.0002, "loss": 0.5408, "mean_token_accuracy": 0.784169390797615, "num_tokens": 12997172.0, "step": 3589 }, { "entropy": 0.5726533383131027, "epoch": 3.3490433971068594, "grad_norm": 0.39158308506011963, "learning_rate": 0.0002, "loss": 0.5852, "mean_token_accuracy": 0.772881418466568, "num_tokens": 13000829.0, "step": 3590 }, { "entropy": 0.5826108604669571, "epoch": 3.3499766682221184, "grad_norm": 0.35533785820007324, "learning_rate": 0.0002, "loss": 0.584, "mean_token_accuracy": 0.762790709733963, "num_tokens": 13004464.0, "step": 3591 }, { "entropy": 0.5647110044956207, "epoch": 3.3509099393373774, "grad_norm": 0.30600062012672424, "learning_rate": 0.0002, "loss": 0.5623, "mean_token_accuracy": 0.7717772424221039, "num_tokens": 13008162.0, "step": 3592 }, { "entropy": 0.5883168429136276, "epoch": 3.3518432104526363, "grad_norm": 0.3327505886554718, "learning_rate": 0.0002, "loss": 0.595, "mean_token_accuracy": 0.7633303105831146, "num_tokens": 13011870.0, "step": 3593 }, { "entropy": 0.5990070998668671, "epoch": 3.3527764815678953, "grad_norm": 0.3360869288444519, "learning_rate": 0.0002, "loss": 0.6033, "mean_token_accuracy": 0.7605664134025574, "num_tokens": 13015615.0, "step": 3594 }, { "entropy": 0.5663905963301659, "epoch": 3.3537097526831543, "grad_norm": 0.3351825177669525, "learning_rate": 0.0002, "loss": 0.5742, "mean_token_accuracy": 0.7655612528324127, "num_tokens": 13019343.0, "step": 3595 }, { "entropy": 0.5792697072029114, "epoch": 3.3546430237984133, "grad_norm": 0.3182855546474457, "learning_rate": 0.0002, "loss": 0.5829, "mean_token_accuracy": 0.7686283439397812, "num_tokens": 13023099.0, "step": 3596 }, { "entropy": 0.5567439571022987, "epoch": 3.3555762949136723, "grad_norm": 0.3218185603618622, "learning_rate": 0.0002, "loss": 0.5681, "mean_token_accuracy": 0.766289010643959, "num_tokens": 13026740.0, "step": 3597 }, { "entropy": 0.5722562223672867, "epoch": 3.3565095660289312, "grad_norm": 0.3621424734592438, "learning_rate": 0.0002, "loss": 0.5718, "mean_token_accuracy": 0.7657070457935333, "num_tokens": 13030390.0, "step": 3598 }, { "entropy": 0.6001022458076477, "epoch": 3.3574428371441902, "grad_norm": 0.37881457805633545, "learning_rate": 0.0002, "loss": 0.6177, "mean_token_accuracy": 0.7528184205293655, "num_tokens": 13034088.0, "step": 3599 }, { "entropy": 0.5458583682775497, "epoch": 3.358376108259449, "grad_norm": 0.37181878089904785, "learning_rate": 0.0002, "loss": 0.5578, "mean_token_accuracy": 0.7732039391994476, "num_tokens": 13037596.0, "step": 3600 }, { "entropy": 0.5689570605754852, "epoch": 3.359309379374708, "grad_norm": 0.3353498876094818, "learning_rate": 0.0002, "loss": 0.5618, "mean_token_accuracy": 0.7779487371444702, "num_tokens": 13041264.0, "step": 3601 }, { "entropy": 0.5498959273099899, "epoch": 3.360242650489967, "grad_norm": 0.36290866136550903, "learning_rate": 0.0002, "loss": 0.5555, "mean_token_accuracy": 0.7760282456874847, "num_tokens": 13044822.0, "step": 3602 }, { "entropy": 0.5513225942850113, "epoch": 3.361175921605226, "grad_norm": 0.3451271057128906, "learning_rate": 0.0002, "loss": 0.5571, "mean_token_accuracy": 0.7751489728689194, "num_tokens": 13048476.0, "step": 3603 }, { "entropy": 0.590503141283989, "epoch": 3.362109192720485, "grad_norm": 0.35818931460380554, "learning_rate": 0.0002, "loss": 0.5826, "mean_token_accuracy": 0.7608870416879654, "num_tokens": 13052027.0, "step": 3604 }, { "entropy": 0.6068126410245895, "epoch": 3.363042463835744, "grad_norm": 0.3045908212661743, "learning_rate": 0.0002, "loss": 0.5933, "mean_token_accuracy": 0.7589861303567886, "num_tokens": 13055706.0, "step": 3605 }, { "entropy": 0.5993064045906067, "epoch": 3.363975734951003, "grad_norm": 0.3069894313812256, "learning_rate": 0.0002, "loss": 0.5938, "mean_token_accuracy": 0.7635009437799454, "num_tokens": 13059321.0, "step": 3606 }, { "entropy": 0.5473637580871582, "epoch": 3.364909006066262, "grad_norm": 0.3486545979976654, "learning_rate": 0.0002, "loss": 0.5614, "mean_token_accuracy": 0.7783321440219879, "num_tokens": 13063017.0, "step": 3607 }, { "entropy": 0.5824806392192841, "epoch": 3.365842277181521, "grad_norm": 0.331520676612854, "learning_rate": 0.0002, "loss": 0.596, "mean_token_accuracy": 0.7597516477108002, "num_tokens": 13066660.0, "step": 3608 }, { "entropy": 0.5745731592178345, "epoch": 3.36677554829678, "grad_norm": 0.3743458390235901, "learning_rate": 0.0002, "loss": 0.5962, "mean_token_accuracy": 0.7558245956897736, "num_tokens": 13070310.0, "step": 3609 }, { "entropy": 0.5783993005752563, "epoch": 3.367708819412039, "grad_norm": 0.3893824517726898, "learning_rate": 0.0002, "loss": 0.6056, "mean_token_accuracy": 0.7601224333047867, "num_tokens": 13073991.0, "step": 3610 }, { "entropy": 0.6012451350688934, "epoch": 3.368642090527298, "grad_norm": 0.31736651062965393, "learning_rate": 0.0002, "loss": 0.5952, "mean_token_accuracy": 0.7616562247276306, "num_tokens": 13077538.0, "step": 3611 }, { "entropy": 0.6113009601831436, "epoch": 3.369575361642557, "grad_norm": 0.3132448196411133, "learning_rate": 0.0002, "loss": 0.5998, "mean_token_accuracy": 0.7577767670154572, "num_tokens": 13081258.0, "step": 3612 }, { "entropy": 0.6031929850578308, "epoch": 3.370508632757816, "grad_norm": 0.3217904567718506, "learning_rate": 0.0002, "loss": 0.5891, "mean_token_accuracy": 0.7622328549623489, "num_tokens": 13084914.0, "step": 3613 }, { "entropy": 0.5933402180671692, "epoch": 3.371441903873075, "grad_norm": 0.30672332644462585, "learning_rate": 0.0002, "loss": 0.5858, "mean_token_accuracy": 0.770642951130867, "num_tokens": 13088471.0, "step": 3614 }, { "entropy": 0.596218466758728, "epoch": 3.372375174988334, "grad_norm": 0.3231268525123596, "learning_rate": 0.0002, "loss": 0.5899, "mean_token_accuracy": 0.7597440481185913, "num_tokens": 13092040.0, "step": 3615 }, { "entropy": 0.5841282904148102, "epoch": 3.373308446103593, "grad_norm": 0.3262060284614563, "learning_rate": 0.0002, "loss": 0.5844, "mean_token_accuracy": 0.7644140273332596, "num_tokens": 13095575.0, "step": 3616 }, { "entropy": 0.5517860874533653, "epoch": 3.374241717218852, "grad_norm": 0.34085431694984436, "learning_rate": 0.0002, "loss": 0.5582, "mean_token_accuracy": 0.7775106579065323, "num_tokens": 13099178.0, "step": 3617 }, { "entropy": 0.5890819132328033, "epoch": 3.375174988334111, "grad_norm": 0.3746381103992462, "learning_rate": 0.0002, "loss": 0.5964, "mean_token_accuracy": 0.7641165554523468, "num_tokens": 13102780.0, "step": 3618 }, { "entropy": 0.5785878151655197, "epoch": 3.37610825944937, "grad_norm": 0.38708579540252686, "learning_rate": 0.0002, "loss": 0.5957, "mean_token_accuracy": 0.7537835389375687, "num_tokens": 13106578.0, "step": 3619 }, { "entropy": 0.5576973408460617, "epoch": 3.377041530564629, "grad_norm": 0.32540175318717957, "learning_rate": 0.0002, "loss": 0.5556, "mean_token_accuracy": 0.774636760354042, "num_tokens": 13110158.0, "step": 3620 }, { "entropy": 0.5751238018274307, "epoch": 3.377974801679888, "grad_norm": 0.33178284764289856, "learning_rate": 0.0002, "loss": 0.5748, "mean_token_accuracy": 0.767518550157547, "num_tokens": 13113868.0, "step": 3621 }, { "entropy": 0.5721110999584198, "epoch": 3.378908072795147, "grad_norm": 0.3570699393749237, "learning_rate": 0.0002, "loss": 0.576, "mean_token_accuracy": 0.7719866931438446, "num_tokens": 13117406.0, "step": 3622 }, { "entropy": 0.5799495726823807, "epoch": 3.379841343910406, "grad_norm": 0.3060358166694641, "learning_rate": 0.0002, "loss": 0.5772, "mean_token_accuracy": 0.7671172767877579, "num_tokens": 13121196.0, "step": 3623 }, { "entropy": 0.5785647630691528, "epoch": 3.380774615025665, "grad_norm": 0.3584585189819336, "learning_rate": 0.0002, "loss": 0.5905, "mean_token_accuracy": 0.7675395458936691, "num_tokens": 13124747.0, "step": 3624 }, { "entropy": 0.5519731864333153, "epoch": 3.3817078861409238, "grad_norm": 0.3220421075820923, "learning_rate": 0.0002, "loss": 0.5575, "mean_token_accuracy": 0.7767481356859207, "num_tokens": 13128272.0, "step": 3625 }, { "entropy": 0.5987811237573624, "epoch": 3.3826411572561828, "grad_norm": 0.31473004817962646, "learning_rate": 0.0002, "loss": 0.6046, "mean_token_accuracy": 0.7602509111166, "num_tokens": 13131870.0, "step": 3626 }, { "entropy": 0.6262630373239517, "epoch": 3.3835744283714417, "grad_norm": 0.2979796230792999, "learning_rate": 0.0002, "loss": 0.6203, "mean_token_accuracy": 0.7464405596256256, "num_tokens": 13135602.0, "step": 3627 }, { "entropy": 0.5768579989671707, "epoch": 3.3845076994867007, "grad_norm": 0.3301537334918976, "learning_rate": 0.0002, "loss": 0.5713, "mean_token_accuracy": 0.7713715881109238, "num_tokens": 13139271.0, "step": 3628 }, { "entropy": 0.5499501824378967, "epoch": 3.3854409706019597, "grad_norm": 0.3559218645095825, "learning_rate": 0.0002, "loss": 0.5442, "mean_token_accuracy": 0.7856462001800537, "num_tokens": 13142786.0, "step": 3629 }, { "entropy": 0.5757618546485901, "epoch": 3.3863742417172187, "grad_norm": 0.32815125584602356, "learning_rate": 0.0002, "loss": 0.5693, "mean_token_accuracy": 0.762263834476471, "num_tokens": 13146365.0, "step": 3630 }, { "entropy": 0.5842954814434052, "epoch": 3.3873075128324777, "grad_norm": 0.3003830313682556, "learning_rate": 0.0002, "loss": 0.5805, "mean_token_accuracy": 0.7656714022159576, "num_tokens": 13149944.0, "step": 3631 }, { "entropy": 0.5783694535493851, "epoch": 3.3882407839477366, "grad_norm": 0.35322079062461853, "learning_rate": 0.0002, "loss": 0.5738, "mean_token_accuracy": 0.7669763565063477, "num_tokens": 13153693.0, "step": 3632 }, { "entropy": 0.5993991494178772, "epoch": 3.3891740550629956, "grad_norm": 0.34917131066322327, "learning_rate": 0.0002, "loss": 0.6162, "mean_token_accuracy": 0.7542227655649185, "num_tokens": 13157437.0, "step": 3633 }, { "entropy": 0.5786245614290237, "epoch": 3.3901073261782546, "grad_norm": 0.3667726516723633, "learning_rate": 0.0002, "loss": 0.5864, "mean_token_accuracy": 0.7609338611364365, "num_tokens": 13161077.0, "step": 3634 }, { "entropy": 0.6016099601984024, "epoch": 3.3910405972935136, "grad_norm": 0.30477073788642883, "learning_rate": 0.0002, "loss": 0.5978, "mean_token_accuracy": 0.7570227086544037, "num_tokens": 13164762.0, "step": 3635 }, { "entropy": 0.558053195476532, "epoch": 3.3919738684087726, "grad_norm": 0.3070068955421448, "learning_rate": 0.0002, "loss": 0.571, "mean_token_accuracy": 0.7653021216392517, "num_tokens": 13168362.0, "step": 3636 }, { "entropy": 0.5776077955961227, "epoch": 3.3929071395240316, "grad_norm": 0.29722604155540466, "learning_rate": 0.0002, "loss": 0.5881, "mean_token_accuracy": 0.7568008899688721, "num_tokens": 13171918.0, "step": 3637 }, { "entropy": 0.5644113570451736, "epoch": 3.3938404106392905, "grad_norm": 0.3663692772388458, "learning_rate": 0.0002, "loss": 0.5751, "mean_token_accuracy": 0.7701656818389893, "num_tokens": 13175479.0, "step": 3638 }, { "entropy": 0.5478582829236984, "epoch": 3.3947736817545495, "grad_norm": 0.2930006980895996, "learning_rate": 0.0002, "loss": 0.5493, "mean_token_accuracy": 0.7825065106153488, "num_tokens": 13179234.0, "step": 3639 }, { "entropy": 0.5784141421318054, "epoch": 3.3957069528698085, "grad_norm": 0.4094732999801636, "learning_rate": 0.0002, "loss": 0.5958, "mean_token_accuracy": 0.7503646463155746, "num_tokens": 13182774.0, "step": 3640 }, { "entropy": 0.5823723673820496, "epoch": 3.3966402239850675, "grad_norm": 0.3216119408607483, "learning_rate": 0.0002, "loss": 0.5832, "mean_token_accuracy": 0.7574580609798431, "num_tokens": 13186474.0, "step": 3641 }, { "entropy": 0.5428922325372696, "epoch": 3.3975734951003265, "grad_norm": 0.3348865509033203, "learning_rate": 0.0002, "loss": 0.5446, "mean_token_accuracy": 0.7747505307197571, "num_tokens": 13190197.0, "step": 3642 }, { "entropy": 0.5751352608203888, "epoch": 3.3985067662155855, "grad_norm": 0.33113932609558105, "learning_rate": 0.0002, "loss": 0.5624, "mean_token_accuracy": 0.7735869288444519, "num_tokens": 13193801.0, "step": 3643 }, { "entropy": 0.5740882083773613, "epoch": 3.3994400373308444, "grad_norm": 0.3057151138782501, "learning_rate": 0.0002, "loss": 0.5795, "mean_token_accuracy": 0.770337775349617, "num_tokens": 13197464.0, "step": 3644 }, { "entropy": 0.5596960037946701, "epoch": 3.4003733084461034, "grad_norm": 0.3481717109680176, "learning_rate": 0.0002, "loss": 0.5645, "mean_token_accuracy": 0.7651912271976471, "num_tokens": 13201193.0, "step": 3645 }, { "entropy": 0.5754128098487854, "epoch": 3.4013065795613624, "grad_norm": 0.3156900107860565, "learning_rate": 0.0002, "loss": 0.5773, "mean_token_accuracy": 0.7682941555976868, "num_tokens": 13204902.0, "step": 3646 }, { "entropy": 0.5651202946901321, "epoch": 3.4022398506766214, "grad_norm": 0.31365153193473816, "learning_rate": 0.0002, "loss": 0.5719, "mean_token_accuracy": 0.77081598341465, "num_tokens": 13208607.0, "step": 3647 }, { "entropy": 0.5751762241125107, "epoch": 3.4031731217918804, "grad_norm": 0.39329707622528076, "learning_rate": 0.0002, "loss": 0.5822, "mean_token_accuracy": 0.7664200216531754, "num_tokens": 13212259.0, "step": 3648 }, { "entropy": 0.5820844769477844, "epoch": 3.4041063929071393, "grad_norm": 0.3077203333377838, "learning_rate": 0.0002, "loss": 0.5918, "mean_token_accuracy": 0.7585452347993851, "num_tokens": 13215867.0, "step": 3649 }, { "entropy": 0.5815353989601135, "epoch": 3.4050396640223983, "grad_norm": 0.34416094422340393, "learning_rate": 0.0002, "loss": 0.5888, "mean_token_accuracy": 0.7615126669406891, "num_tokens": 13219439.0, "step": 3650 }, { "entropy": 0.5802690237760544, "epoch": 3.4059729351376573, "grad_norm": 0.37074974179267883, "learning_rate": 0.0002, "loss": 0.5874, "mean_token_accuracy": 0.7609994560480118, "num_tokens": 13223179.0, "step": 3651 }, { "entropy": 0.5830641984939575, "epoch": 3.4069062062529163, "grad_norm": 0.3006986081600189, "learning_rate": 0.0002, "loss": 0.5807, "mean_token_accuracy": 0.7671185880899429, "num_tokens": 13226849.0, "step": 3652 }, { "entropy": 0.6045270711183548, "epoch": 3.4078394773681753, "grad_norm": 0.3419649004936218, "learning_rate": 0.0002, "loss": 0.6081, "mean_token_accuracy": 0.7522582560777664, "num_tokens": 13230475.0, "step": 3653 }, { "entropy": 0.6012050062417984, "epoch": 3.4087727484834343, "grad_norm": 0.36375269293785095, "learning_rate": 0.0002, "loss": 0.6032, "mean_token_accuracy": 0.7515381425619125, "num_tokens": 13234073.0, "step": 3654 }, { "entropy": 0.5978149771690369, "epoch": 3.4097060195986932, "grad_norm": 0.39873215556144714, "learning_rate": 0.0002, "loss": 0.6052, "mean_token_accuracy": 0.7546112090349197, "num_tokens": 13237598.0, "step": 3655 }, { "entropy": 0.5817397385835648, "epoch": 3.410639290713952, "grad_norm": 0.3812029957771301, "learning_rate": 0.0002, "loss": 0.5942, "mean_token_accuracy": 0.7577191889286041, "num_tokens": 13241162.0, "step": 3656 }, { "entropy": 0.5918416976928711, "epoch": 3.411572561829211, "grad_norm": 0.4018363654613495, "learning_rate": 0.0002, "loss": 0.5969, "mean_token_accuracy": 0.7529495507478714, "num_tokens": 13244948.0, "step": 3657 }, { "entropy": 0.5379917174577713, "epoch": 3.41250583294447, "grad_norm": 0.3856666684150696, "learning_rate": 0.0002, "loss": 0.5516, "mean_token_accuracy": 0.7809030562639236, "num_tokens": 13248598.0, "step": 3658 }, { "entropy": 0.563093289732933, "epoch": 3.413439104059729, "grad_norm": 0.3791152536869049, "learning_rate": 0.0002, "loss": 0.5722, "mean_token_accuracy": 0.7664156407117844, "num_tokens": 13252217.0, "step": 3659 }, { "entropy": 0.5492370426654816, "epoch": 3.414372375174988, "grad_norm": 0.34378698468208313, "learning_rate": 0.0002, "loss": 0.5639, "mean_token_accuracy": 0.7764579206705093, "num_tokens": 13255789.0, "step": 3660 }, { "entropy": 0.5879540741443634, "epoch": 3.415305646290247, "grad_norm": 0.3402460515499115, "learning_rate": 0.0002, "loss": 0.5901, "mean_token_accuracy": 0.757973313331604, "num_tokens": 13259398.0, "step": 3661 }, { "entropy": 0.5815284103155136, "epoch": 3.416238917405506, "grad_norm": 0.3031552731990814, "learning_rate": 0.0002, "loss": 0.5847, "mean_token_accuracy": 0.7635083496570587, "num_tokens": 13262987.0, "step": 3662 }, { "entropy": 0.5907090082764626, "epoch": 3.417172188520765, "grad_norm": 0.3029532730579376, "learning_rate": 0.0002, "loss": 0.5882, "mean_token_accuracy": 0.7607073038816452, "num_tokens": 13266679.0, "step": 3663 }, { "entropy": 0.5584889054298401, "epoch": 3.418105459636024, "grad_norm": 0.31317824125289917, "learning_rate": 0.0002, "loss": 0.5481, "mean_token_accuracy": 0.778710350394249, "num_tokens": 13270592.0, "step": 3664 }, { "entropy": 0.5915802121162415, "epoch": 3.419038730751283, "grad_norm": 0.2813847064971924, "learning_rate": 0.0002, "loss": 0.5886, "mean_token_accuracy": 0.7630463391542435, "num_tokens": 13274229.0, "step": 3665 }, { "entropy": 0.6000353991985321, "epoch": 3.419972001866542, "grad_norm": 0.33996814489364624, "learning_rate": 0.0002, "loss": 0.6014, "mean_token_accuracy": 0.7580121010541916, "num_tokens": 13277854.0, "step": 3666 }, { "entropy": 0.5970087200403214, "epoch": 3.420905272981801, "grad_norm": 0.3113202750682831, "learning_rate": 0.0002, "loss": 0.5869, "mean_token_accuracy": 0.7635833621025085, "num_tokens": 13281425.0, "step": 3667 }, { "entropy": 0.5745975524187088, "epoch": 3.42183854409706, "grad_norm": 0.3817340135574341, "learning_rate": 0.0002, "loss": 0.584, "mean_token_accuracy": 0.765855997800827, "num_tokens": 13285032.0, "step": 3668 }, { "entropy": 0.5648555010557175, "epoch": 3.422771815212319, "grad_norm": 0.35015755891799927, "learning_rate": 0.0002, "loss": 0.5702, "mean_token_accuracy": 0.7704421281814575, "num_tokens": 13288625.0, "step": 3669 }, { "entropy": 0.5404407829046249, "epoch": 3.423705086327578, "grad_norm": 0.4381895065307617, "learning_rate": 0.0002, "loss": 0.5665, "mean_token_accuracy": 0.775938093662262, "num_tokens": 13292169.0, "step": 3670 }, { "entropy": 0.5704846531152725, "epoch": 3.424638357442837, "grad_norm": 0.37614983320236206, "learning_rate": 0.0002, "loss": 0.5755, "mean_token_accuracy": 0.7664020508527756, "num_tokens": 13296021.0, "step": 3671 }, { "entropy": 0.5647004246711731, "epoch": 3.425571628558096, "grad_norm": 0.37026238441467285, "learning_rate": 0.0002, "loss": 0.5851, "mean_token_accuracy": 0.7649362981319427, "num_tokens": 13299561.0, "step": 3672 }, { "entropy": 0.5763756483793259, "epoch": 3.426504899673355, "grad_norm": 0.3351760506629944, "learning_rate": 0.0002, "loss": 0.5963, "mean_token_accuracy": 0.7592351883649826, "num_tokens": 13303205.0, "step": 3673 }, { "entropy": 0.6244111061096191, "epoch": 3.427438170788614, "grad_norm": 0.3198356628417969, "learning_rate": 0.0002, "loss": 0.6287, "mean_token_accuracy": 0.7469025105237961, "num_tokens": 13306850.0, "step": 3674 }, { "entropy": 0.5992836356163025, "epoch": 3.428371441903873, "grad_norm": 0.3307931423187256, "learning_rate": 0.0002, "loss": 0.582, "mean_token_accuracy": 0.7665457725524902, "num_tokens": 13310481.0, "step": 3675 }, { "entropy": 0.5509048700332642, "epoch": 3.429304713019132, "grad_norm": 0.3131285607814789, "learning_rate": 0.0002, "loss": 0.5451, "mean_token_accuracy": 0.780194953083992, "num_tokens": 13314042.0, "step": 3676 }, { "entropy": 0.6307092905044556, "epoch": 3.430237984134391, "grad_norm": 0.3179020881652832, "learning_rate": 0.0002, "loss": 0.6289, "mean_token_accuracy": 0.7499159723520279, "num_tokens": 13317688.0, "step": 3677 }, { "entropy": 0.6013831794261932, "epoch": 3.43117125524965, "grad_norm": 0.322466641664505, "learning_rate": 0.0002, "loss": 0.602, "mean_token_accuracy": 0.752927765250206, "num_tokens": 13321304.0, "step": 3678 }, { "entropy": 0.6002855598926544, "epoch": 3.432104526364909, "grad_norm": 0.33395886421203613, "learning_rate": 0.0002, "loss": 0.5937, "mean_token_accuracy": 0.7586990743875504, "num_tokens": 13324968.0, "step": 3679 }, { "entropy": 0.5759229958057404, "epoch": 3.433037797480168, "grad_norm": 0.41082972288131714, "learning_rate": 0.0002, "loss": 0.5951, "mean_token_accuracy": 0.7568250298500061, "num_tokens": 13328489.0, "step": 3680 }, { "entropy": 0.5969287753105164, "epoch": 3.433971068595427, "grad_norm": 0.3515359163284302, "learning_rate": 0.0002, "loss": 0.601, "mean_token_accuracy": 0.7543045431375504, "num_tokens": 13332245.0, "step": 3681 }, { "entropy": 0.5672413259744644, "epoch": 3.4349043397106858, "grad_norm": 0.3349680006504059, "learning_rate": 0.0002, "loss": 0.5767, "mean_token_accuracy": 0.7660504728555679, "num_tokens": 13335857.0, "step": 3682 }, { "entropy": 0.5678838640451431, "epoch": 3.4358376108259447, "grad_norm": 0.3279564380645752, "learning_rate": 0.0002, "loss": 0.5564, "mean_token_accuracy": 0.7798249870538712, "num_tokens": 13339556.0, "step": 3683 }, { "entropy": 0.5631003826856613, "epoch": 3.4367708819412037, "grad_norm": 0.368327796459198, "learning_rate": 0.0002, "loss": 0.5849, "mean_token_accuracy": 0.7671055495738983, "num_tokens": 13343125.0, "step": 3684 }, { "entropy": 0.5795350074768066, "epoch": 3.4377041530564627, "grad_norm": 0.3175252377986908, "learning_rate": 0.0002, "loss": 0.5901, "mean_token_accuracy": 0.760164424777031, "num_tokens": 13346756.0, "step": 3685 }, { "entropy": 0.574462041258812, "epoch": 3.4386374241717217, "grad_norm": 0.33823108673095703, "learning_rate": 0.0002, "loss": 0.5842, "mean_token_accuracy": 0.7638093680143356, "num_tokens": 13350444.0, "step": 3686 }, { "entropy": 0.559341311454773, "epoch": 3.4395706952869807, "grad_norm": 0.36842644214630127, "learning_rate": 0.0002, "loss": 0.5615, "mean_token_accuracy": 0.7710844576358795, "num_tokens": 13354047.0, "step": 3687 }, { "entropy": 0.5753481984138489, "epoch": 3.4405039664022397, "grad_norm": 0.3465222418308258, "learning_rate": 0.0002, "loss": 0.5924, "mean_token_accuracy": 0.764994814991951, "num_tokens": 13357831.0, "step": 3688 }, { "entropy": 0.5730218887329102, "epoch": 3.4414372375174986, "grad_norm": 0.37939855456352234, "learning_rate": 0.0002, "loss": 0.5939, "mean_token_accuracy": 0.7563752681016922, "num_tokens": 13361336.0, "step": 3689 }, { "entropy": 0.5714315474033356, "epoch": 3.4423705086327576, "grad_norm": 0.3315083682537079, "learning_rate": 0.0002, "loss": 0.5671, "mean_token_accuracy": 0.7738872468471527, "num_tokens": 13364951.0, "step": 3690 }, { "entropy": 0.5899413675069809, "epoch": 3.4433037797480166, "grad_norm": 0.29734715819358826, "learning_rate": 0.0002, "loss": 0.5718, "mean_token_accuracy": 0.7728480249643326, "num_tokens": 13368620.0, "step": 3691 }, { "entropy": 0.5885497629642487, "epoch": 3.4442370508632756, "grad_norm": 0.26522523164749146, "learning_rate": 0.0002, "loss": 0.5793, "mean_token_accuracy": 0.763216182589531, "num_tokens": 13372213.0, "step": 3692 }, { "entropy": 0.5654432624578476, "epoch": 3.4451703219785346, "grad_norm": 0.37462425231933594, "learning_rate": 0.0002, "loss": 0.5646, "mean_token_accuracy": 0.7674428224563599, "num_tokens": 13375798.0, "step": 3693 }, { "entropy": 0.5978517979383469, "epoch": 3.4461035930937935, "grad_norm": 0.33536961674690247, "learning_rate": 0.0002, "loss": 0.5888, "mean_token_accuracy": 0.7612755000591278, "num_tokens": 13379383.0, "step": 3694 }, { "entropy": 0.5915549546480179, "epoch": 3.4470368642090525, "grad_norm": 0.3515778183937073, "learning_rate": 0.0002, "loss": 0.5903, "mean_token_accuracy": 0.763075515627861, "num_tokens": 13382971.0, "step": 3695 }, { "entropy": 0.566206082701683, "epoch": 3.4479701353243115, "grad_norm": 0.35763922333717346, "learning_rate": 0.0002, "loss": 0.5719, "mean_token_accuracy": 0.770512580871582, "num_tokens": 13386525.0, "step": 3696 }, { "entropy": 0.5974565148353577, "epoch": 3.4489034064395705, "grad_norm": 0.37655091285705566, "learning_rate": 0.0002, "loss": 0.6006, "mean_token_accuracy": 0.7623469233512878, "num_tokens": 13390181.0, "step": 3697 }, { "entropy": 0.5701934397220612, "epoch": 3.4498366775548295, "grad_norm": 0.338606059551239, "learning_rate": 0.0002, "loss": 0.574, "mean_token_accuracy": 0.7690251469612122, "num_tokens": 13393855.0, "step": 3698 }, { "entropy": 0.5459783226251602, "epoch": 3.4507699486700885, "grad_norm": 0.3602162301540375, "learning_rate": 0.0002, "loss": 0.5514, "mean_token_accuracy": 0.782463937997818, "num_tokens": 13397367.0, "step": 3699 }, { "entropy": 0.5806498527526855, "epoch": 3.4517032197853474, "grad_norm": 0.35794520378112793, "learning_rate": 0.0002, "loss": 0.6043, "mean_token_accuracy": 0.758947029709816, "num_tokens": 13401088.0, "step": 3700 }, { "entropy": 0.5655510425567627, "epoch": 3.4526364909006064, "grad_norm": 0.36087164282798767, "learning_rate": 0.0002, "loss": 0.5797, "mean_token_accuracy": 0.7664229720830917, "num_tokens": 13404730.0, "step": 3701 }, { "entropy": 0.5523159354925156, "epoch": 3.4535697620158654, "grad_norm": 0.3886668086051941, "learning_rate": 0.0002, "loss": 0.5669, "mean_token_accuracy": 0.7690217345952988, "num_tokens": 13408322.0, "step": 3702 }, { "entropy": 0.518091082572937, "epoch": 3.4545030331311244, "grad_norm": 0.36739975214004517, "learning_rate": 0.0002, "loss": 0.5358, "mean_token_accuracy": 0.7905401587486267, "num_tokens": 13411943.0, "step": 3703 }, { "entropy": 0.5926903337240219, "epoch": 3.4554363042463834, "grad_norm": 0.32350289821624756, "learning_rate": 0.0002, "loss": 0.5925, "mean_token_accuracy": 0.7585449367761612, "num_tokens": 13415528.0, "step": 3704 }, { "entropy": 0.5924101024866104, "epoch": 3.4563695753616424, "grad_norm": 0.29395872354507446, "learning_rate": 0.0002, "loss": 0.5915, "mean_token_accuracy": 0.7611747235059738, "num_tokens": 13419005.0, "step": 3705 }, { "entropy": 0.6285646259784698, "epoch": 3.4573028464769013, "grad_norm": 0.3191835880279541, "learning_rate": 0.0002, "loss": 0.6175, "mean_token_accuracy": 0.7475217431783676, "num_tokens": 13422654.0, "step": 3706 }, { "entropy": 0.5878799557685852, "epoch": 3.4582361175921603, "grad_norm": 0.2897433042526245, "learning_rate": 0.0002, "loss": 0.5799, "mean_token_accuracy": 0.7657844722270966, "num_tokens": 13426186.0, "step": 3707 }, { "entropy": 0.6133871227502823, "epoch": 3.4591693887074193, "grad_norm": 0.4153243899345398, "learning_rate": 0.0002, "loss": 0.6092, "mean_token_accuracy": 0.7551271468400955, "num_tokens": 13429792.0, "step": 3708 }, { "entropy": 0.58256696164608, "epoch": 3.4601026598226783, "grad_norm": 0.3052923381328583, "learning_rate": 0.0002, "loss": 0.5858, "mean_token_accuracy": 0.7529771029949188, "num_tokens": 13433376.0, "step": 3709 }, { "entropy": 0.5612822771072388, "epoch": 3.4610359309379373, "grad_norm": 0.3256652057170868, "learning_rate": 0.0002, "loss": 0.5687, "mean_token_accuracy": 0.766850933432579, "num_tokens": 13436971.0, "step": 3710 }, { "entropy": 0.566651239991188, "epoch": 3.4619692020531962, "grad_norm": 0.31297966837882996, "learning_rate": 0.0002, "loss": 0.5669, "mean_token_accuracy": 0.7679001092910767, "num_tokens": 13440526.0, "step": 3711 }, { "entropy": 0.5801733583211899, "epoch": 3.4629024731684552, "grad_norm": 0.38936150074005127, "learning_rate": 0.0002, "loss": 0.5867, "mean_token_accuracy": 0.7645063400268555, "num_tokens": 13444030.0, "step": 3712 }, { "entropy": 0.5477310568094254, "epoch": 3.463835744283714, "grad_norm": 0.38585591316223145, "learning_rate": 0.0002, "loss": 0.5715, "mean_token_accuracy": 0.7720755338668823, "num_tokens": 13447643.0, "step": 3713 }, { "entropy": 0.5531274378299713, "epoch": 3.464769015398973, "grad_norm": 0.37163063883781433, "learning_rate": 0.0002, "loss": 0.5677, "mean_token_accuracy": 0.7756104320287704, "num_tokens": 13451195.0, "step": 3714 }, { "entropy": 0.5523652657866478, "epoch": 3.465702286514232, "grad_norm": 0.36661073565483093, "learning_rate": 0.0002, "loss": 0.5617, "mean_token_accuracy": 0.7691337615251541, "num_tokens": 13454858.0, "step": 3715 }, { "entropy": 0.5688285976648331, "epoch": 3.466635557629491, "grad_norm": 0.32825466990470886, "learning_rate": 0.0002, "loss": 0.5659, "mean_token_accuracy": 0.7722684144973755, "num_tokens": 13458461.0, "step": 3716 }, { "entropy": 0.5741965174674988, "epoch": 3.46756882874475, "grad_norm": 0.3066367506980896, "learning_rate": 0.0002, "loss": 0.5703, "mean_token_accuracy": 0.7679432481527328, "num_tokens": 13462208.0, "step": 3717 }, { "entropy": 0.5620297342538834, "epoch": 3.468502099860009, "grad_norm": 0.35086891055107117, "learning_rate": 0.0002, "loss": 0.5728, "mean_token_accuracy": 0.7677515149116516, "num_tokens": 13465876.0, "step": 3718 }, { "entropy": 0.5850566476583481, "epoch": 3.469435370975268, "grad_norm": 0.42363426089286804, "learning_rate": 0.0002, "loss": 0.5836, "mean_token_accuracy": 0.765438511967659, "num_tokens": 13469465.0, "step": 3719 }, { "entropy": 0.5812923312187195, "epoch": 3.470368642090527, "grad_norm": 0.32053208351135254, "learning_rate": 0.0002, "loss": 0.5792, "mean_token_accuracy": 0.7670563906431198, "num_tokens": 13473156.0, "step": 3720 }, { "entropy": 0.583085224032402, "epoch": 3.471301913205786, "grad_norm": 0.3716863691806793, "learning_rate": 0.0002, "loss": 0.5793, "mean_token_accuracy": 0.7679958641529083, "num_tokens": 13476861.0, "step": 3721 }, { "entropy": 0.6072978228330612, "epoch": 3.472235184321045, "grad_norm": 0.277120977640152, "learning_rate": 0.0002, "loss": 0.5956, "mean_token_accuracy": 0.765503391623497, "num_tokens": 13480605.0, "step": 3722 }, { "entropy": 0.5555035769939423, "epoch": 3.473168455436304, "grad_norm": 0.28921741247177124, "learning_rate": 0.0002, "loss": 0.5578, "mean_token_accuracy": 0.7701535820960999, "num_tokens": 13484040.0, "step": 3723 }, { "entropy": 0.5680551528930664, "epoch": 3.474101726551563, "grad_norm": 0.36709776520729065, "learning_rate": 0.0002, "loss": 0.5845, "mean_token_accuracy": 0.7675759047269821, "num_tokens": 13487646.0, "step": 3724 }, { "entropy": 0.5379194021224976, "epoch": 3.475034997666822, "grad_norm": 0.3714510500431061, "learning_rate": 0.0002, "loss": 0.5496, "mean_token_accuracy": 0.7810903638601303, "num_tokens": 13491243.0, "step": 3725 }, { "entropy": 0.5547040104866028, "epoch": 3.475968268782081, "grad_norm": 0.34346917271614075, "learning_rate": 0.0002, "loss": 0.5663, "mean_token_accuracy": 0.7716636210680008, "num_tokens": 13494855.0, "step": 3726 }, { "entropy": 0.6102897226810455, "epoch": 3.47690153989734, "grad_norm": 0.3465892970561981, "learning_rate": 0.0002, "loss": 0.6186, "mean_token_accuracy": 0.7535250782966614, "num_tokens": 13498569.0, "step": 3727 }, { "entropy": 0.5702423602342606, "epoch": 3.477834811012599, "grad_norm": 0.3065604865550995, "learning_rate": 0.0002, "loss": 0.5757, "mean_token_accuracy": 0.7722277045249939, "num_tokens": 13502089.0, "step": 3728 }, { "entropy": 0.576222687959671, "epoch": 3.478768082127858, "grad_norm": 0.2975893020629883, "learning_rate": 0.0002, "loss": 0.5844, "mean_token_accuracy": 0.7617056667804718, "num_tokens": 13505790.0, "step": 3729 }, { "entropy": 0.5571857243776321, "epoch": 3.479701353243117, "grad_norm": 0.3116614520549774, "learning_rate": 0.0002, "loss": 0.5669, "mean_token_accuracy": 0.7711467742919922, "num_tokens": 13509460.0, "step": 3730 }, { "entropy": 0.5878018587827682, "epoch": 3.480634624358376, "grad_norm": 0.3718899190425873, "learning_rate": 0.0002, "loss": 0.5865, "mean_token_accuracy": 0.7631841897964478, "num_tokens": 13513154.0, "step": 3731 }, { "entropy": 0.572625920176506, "epoch": 3.481567895473635, "grad_norm": 0.30493223667144775, "learning_rate": 0.0002, "loss": 0.572, "mean_token_accuracy": 0.7732439935207367, "num_tokens": 13516728.0, "step": 3732 }, { "entropy": 0.5934887677431107, "epoch": 3.482501166588894, "grad_norm": 0.3783116340637207, "learning_rate": 0.0002, "loss": 0.5982, "mean_token_accuracy": 0.7590289860963821, "num_tokens": 13520334.0, "step": 3733 }, { "entropy": 0.5847637131810188, "epoch": 3.483434437704153, "grad_norm": 0.30588045716285706, "learning_rate": 0.0002, "loss": 0.5851, "mean_token_accuracy": 0.7626622319221497, "num_tokens": 13524031.0, "step": 3734 }, { "entropy": 0.557684913277626, "epoch": 3.484367708819412, "grad_norm": 0.3445364534854889, "learning_rate": 0.0002, "loss": 0.5611, "mean_token_accuracy": 0.7729349583387375, "num_tokens": 13527732.0, "step": 3735 }, { "entropy": 0.5953482836484909, "epoch": 3.485300979934671, "grad_norm": 0.31490758061408997, "learning_rate": 0.0002, "loss": 0.5977, "mean_token_accuracy": 0.7602405697107315, "num_tokens": 13531381.0, "step": 3736 }, { "entropy": 0.5918342620134354, "epoch": 3.48623425104993, "grad_norm": 0.3139866888523102, "learning_rate": 0.0002, "loss": 0.5985, "mean_token_accuracy": 0.7574724853038788, "num_tokens": 13535114.0, "step": 3737 }, { "entropy": 0.565702274441719, "epoch": 3.4871675221651888, "grad_norm": 0.3312624394893646, "learning_rate": 0.0002, "loss": 0.5628, "mean_token_accuracy": 0.7762036919593811, "num_tokens": 13538783.0, "step": 3738 }, { "entropy": 0.5589347183704376, "epoch": 3.4881007932804478, "grad_norm": 0.3818490505218506, "learning_rate": 0.0002, "loss": 0.5661, "mean_token_accuracy": 0.7651479095220566, "num_tokens": 13542276.0, "step": 3739 }, { "entropy": 0.5887464731931686, "epoch": 3.4890340643957067, "grad_norm": 0.35473906993865967, "learning_rate": 0.0002, "loss": 0.601, "mean_token_accuracy": 0.7528282105922699, "num_tokens": 13545933.0, "step": 3740 }, { "entropy": 0.6297465711832047, "epoch": 3.4899673355109657, "grad_norm": 0.38739174604415894, "learning_rate": 0.0002, "loss": 0.6303, "mean_token_accuracy": 0.7454826533794403, "num_tokens": 13549580.0, "step": 3741 }, { "entropy": 0.529657281935215, "epoch": 3.4909006066262247, "grad_norm": 0.36351844668388367, "learning_rate": 0.0002, "loss": 0.5343, "mean_token_accuracy": 0.7838511317968369, "num_tokens": 13553256.0, "step": 3742 }, { "entropy": 0.5538813546299934, "epoch": 3.4918338777414837, "grad_norm": 0.3025018870830536, "learning_rate": 0.0002, "loss": 0.5577, "mean_token_accuracy": 0.7759316861629486, "num_tokens": 13556872.0, "step": 3743 }, { "entropy": 0.5828116238117218, "epoch": 3.4927671488567427, "grad_norm": 0.36222535371780396, "learning_rate": 0.0002, "loss": 0.597, "mean_token_accuracy": 0.7612337023019791, "num_tokens": 13560470.0, "step": 3744 }, { "entropy": 0.5738154202699661, "epoch": 3.4937004199720016, "grad_norm": 0.33381345868110657, "learning_rate": 0.0002, "loss": 0.5775, "mean_token_accuracy": 0.7646171748638153, "num_tokens": 13564061.0, "step": 3745 }, { "entropy": 0.5817979425191879, "epoch": 3.4946336910872606, "grad_norm": 0.33583906292915344, "learning_rate": 0.0002, "loss": 0.5778, "mean_token_accuracy": 0.7674531042575836, "num_tokens": 13567702.0, "step": 3746 }, { "entropy": 0.5735438391566277, "epoch": 3.4955669622025196, "grad_norm": 0.3877462148666382, "learning_rate": 0.0002, "loss": 0.5811, "mean_token_accuracy": 0.7709633260965347, "num_tokens": 13571441.0, "step": 3747 }, { "entropy": 0.5590199828147888, "epoch": 3.4965002333177786, "grad_norm": 0.34920769929885864, "learning_rate": 0.0002, "loss": 0.5669, "mean_token_accuracy": 0.7670224606990814, "num_tokens": 13574881.0, "step": 3748 }, { "entropy": 0.5722408890724182, "epoch": 3.4974335044330376, "grad_norm": 0.37936827540397644, "learning_rate": 0.0002, "loss": 0.578, "mean_token_accuracy": 0.7646621316671371, "num_tokens": 13578481.0, "step": 3749 }, { "entropy": 0.6149341911077499, "epoch": 3.4983667755482966, "grad_norm": 0.31104210019111633, "learning_rate": 0.0002, "loss": 0.6077, "mean_token_accuracy": 0.7523884475231171, "num_tokens": 13582104.0, "step": 3750 }, { "entropy": 0.6258626729249954, "epoch": 3.4993000466635555, "grad_norm": 0.3125160336494446, "learning_rate": 0.0002, "loss": 0.6104, "mean_token_accuracy": 0.7505354136228561, "num_tokens": 13585776.0, "step": 3751 }, { "entropy": 0.576611340045929, "epoch": 3.5002333177788145, "grad_norm": 0.4567732810974121, "learning_rate": 0.0002, "loss": 0.5899, "mean_token_accuracy": 0.760782927274704, "num_tokens": 13589375.0, "step": 3752 }, { "entropy": 0.601644828915596, "epoch": 3.5011665888940735, "grad_norm": 0.28519967198371887, "learning_rate": 0.0002, "loss": 0.596, "mean_token_accuracy": 0.7578742355108261, "num_tokens": 13593130.0, "step": 3753 }, { "entropy": 0.5683354735374451, "epoch": 3.5020998600093325, "grad_norm": 0.3810276687145233, "learning_rate": 0.0002, "loss": 0.5782, "mean_token_accuracy": 0.767480880022049, "num_tokens": 13596828.0, "step": 3754 }, { "entropy": 0.5764371752738953, "epoch": 3.5030331311245915, "grad_norm": 0.31598493456840515, "learning_rate": 0.0002, "loss": 0.5878, "mean_token_accuracy": 0.7610608637332916, "num_tokens": 13600372.0, "step": 3755 }, { "entropy": 0.5978749841451645, "epoch": 3.5039664022398505, "grad_norm": 0.3191881477832794, "learning_rate": 0.0002, "loss": 0.5957, "mean_token_accuracy": 0.760095939040184, "num_tokens": 13603989.0, "step": 3756 }, { "entropy": 0.5588468760251999, "epoch": 3.5048996733551094, "grad_norm": 0.34674742817878723, "learning_rate": 0.0002, "loss": 0.5656, "mean_token_accuracy": 0.7679957747459412, "num_tokens": 13607610.0, "step": 3757 }, { "entropy": 0.5468575060367584, "epoch": 3.5058329444703684, "grad_norm": 0.35512086749076843, "learning_rate": 0.0002, "loss": 0.5452, "mean_token_accuracy": 0.7791291177272797, "num_tokens": 13611149.0, "step": 3758 }, { "entropy": 0.5562029927968979, "epoch": 3.5067662155856274, "grad_norm": 0.4180397689342499, "learning_rate": 0.0002, "loss": 0.5709, "mean_token_accuracy": 0.7671919763088226, "num_tokens": 13614636.0, "step": 3759 }, { "entropy": 0.5617521554231644, "epoch": 3.5076994867008864, "grad_norm": 0.3368414342403412, "learning_rate": 0.0002, "loss": 0.5603, "mean_token_accuracy": 0.7685167342424393, "num_tokens": 13618136.0, "step": 3760 }, { "entropy": 0.6083158999681473, "epoch": 3.5086327578161454, "grad_norm": 0.3509185314178467, "learning_rate": 0.0002, "loss": 0.6179, "mean_token_accuracy": 0.7555030882358551, "num_tokens": 13621826.0, "step": 3761 }, { "entropy": 0.6086844503879547, "epoch": 3.5095660289314043, "grad_norm": 0.29819798469543457, "learning_rate": 0.0002, "loss": 0.6112, "mean_token_accuracy": 0.7560089826583862, "num_tokens": 13625346.0, "step": 3762 }, { "entropy": 0.5740047991275787, "epoch": 3.5104993000466633, "grad_norm": 0.35518065094947815, "learning_rate": 0.0002, "loss": 0.5702, "mean_token_accuracy": 0.7690262645483017, "num_tokens": 13628988.0, "step": 3763 }, { "entropy": 0.5710824131965637, "epoch": 3.5114325711619223, "grad_norm": 0.3736708164215088, "learning_rate": 0.0002, "loss": 0.5719, "mean_token_accuracy": 0.7683528065681458, "num_tokens": 13632597.0, "step": 3764 }, { "entropy": 0.5335356593132019, "epoch": 3.5123658422771813, "grad_norm": 0.33851325511932373, "learning_rate": 0.0002, "loss": 0.5391, "mean_token_accuracy": 0.7771494537591934, "num_tokens": 13636209.0, "step": 3765 }, { "entropy": 0.6164332628250122, "epoch": 3.5132991133924403, "grad_norm": 0.2970331311225891, "learning_rate": 0.0002, "loss": 0.6146, "mean_token_accuracy": 0.747039720416069, "num_tokens": 13639988.0, "step": 3766 }, { "entropy": 0.5843295753002167, "epoch": 3.5142323845076993, "grad_norm": 0.30492526292800903, "learning_rate": 0.0002, "loss": 0.5762, "mean_token_accuracy": 0.7636783421039581, "num_tokens": 13643600.0, "step": 3767 }, { "entropy": 0.5734693259000778, "epoch": 3.5151656556229582, "grad_norm": 0.3458095192909241, "learning_rate": 0.0002, "loss": 0.572, "mean_token_accuracy": 0.7663268595933914, "num_tokens": 13647193.0, "step": 3768 }, { "entropy": 0.5789221823215485, "epoch": 3.5160989267382172, "grad_norm": 0.39524057507514954, "learning_rate": 0.0002, "loss": 0.5951, "mean_token_accuracy": 0.757603257894516, "num_tokens": 13650894.0, "step": 3769 }, { "entropy": 0.6040351539850235, "epoch": 3.517032197853476, "grad_norm": 0.37974318861961365, "learning_rate": 0.0002, "loss": 0.6159, "mean_token_accuracy": 0.7491610050201416, "num_tokens": 13654531.0, "step": 3770 }, { "entropy": 0.5597633570432663, "epoch": 3.517965468968735, "grad_norm": 0.3439669907093048, "learning_rate": 0.0002, "loss": 0.5741, "mean_token_accuracy": 0.7715885639190674, "num_tokens": 13658054.0, "step": 3771 }, { "entropy": 0.5762329250574112, "epoch": 3.518898740083994, "grad_norm": 0.32373860478401184, "learning_rate": 0.0002, "loss": 0.5789, "mean_token_accuracy": 0.7660953402519226, "num_tokens": 13661719.0, "step": 3772 }, { "entropy": 0.5673515647649765, "epoch": 3.519832011199253, "grad_norm": 0.3007700443267822, "learning_rate": 0.0002, "loss": 0.573, "mean_token_accuracy": 0.768086850643158, "num_tokens": 13665246.0, "step": 3773 }, { "entropy": 0.5817809551954269, "epoch": 3.520765282314512, "grad_norm": 0.33829265832901, "learning_rate": 0.0002, "loss": 0.5802, "mean_token_accuracy": 0.7588630318641663, "num_tokens": 13668981.0, "step": 3774 }, { "entropy": 0.5992788672447205, "epoch": 3.521698553429771, "grad_norm": 0.3124954402446747, "learning_rate": 0.0002, "loss": 0.5967, "mean_token_accuracy": 0.759375661611557, "num_tokens": 13672704.0, "step": 3775 }, { "entropy": 0.5661397576332092, "epoch": 3.52263182454503, "grad_norm": 0.38659605383872986, "learning_rate": 0.0002, "loss": 0.5906, "mean_token_accuracy": 0.758668065071106, "num_tokens": 13676265.0, "step": 3776 }, { "entropy": 0.6264830678701401, "epoch": 3.523565095660289, "grad_norm": 0.3430008888244629, "learning_rate": 0.0002, "loss": 0.6206, "mean_token_accuracy": 0.7528432309627533, "num_tokens": 13679960.0, "step": 3777 }, { "entropy": 0.5643893182277679, "epoch": 3.524498366775548, "grad_norm": 0.3279637098312378, "learning_rate": 0.0002, "loss": 0.5653, "mean_token_accuracy": 0.7774769812822342, "num_tokens": 13683559.0, "step": 3778 }, { "entropy": 0.5761358588933945, "epoch": 3.525431637890807, "grad_norm": 0.3472989797592163, "learning_rate": 0.0002, "loss": 0.571, "mean_token_accuracy": 0.7618918418884277, "num_tokens": 13687159.0, "step": 3779 }, { "entropy": 0.623761311173439, "epoch": 3.526364909006066, "grad_norm": 0.3138110935688019, "learning_rate": 0.0002, "loss": 0.6161, "mean_token_accuracy": 0.7531154602766037, "num_tokens": 13690819.0, "step": 3780 }, { "entropy": 0.5548340529203415, "epoch": 3.527298180121325, "grad_norm": 0.31852108240127563, "learning_rate": 0.0002, "loss": 0.5593, "mean_token_accuracy": 0.771216407418251, "num_tokens": 13694379.0, "step": 3781 }, { "entropy": 0.5880695879459381, "epoch": 3.528231451236584, "grad_norm": 0.3252304494380951, "learning_rate": 0.0002, "loss": 0.5888, "mean_token_accuracy": 0.7576576918363571, "num_tokens": 13698022.0, "step": 3782 }, { "entropy": 0.5641152411699295, "epoch": 3.529164722351843, "grad_norm": 0.36701372265815735, "learning_rate": 0.0002, "loss": 0.5816, "mean_token_accuracy": 0.7623481899499893, "num_tokens": 13701592.0, "step": 3783 }, { "entropy": 0.5546195954084396, "epoch": 3.530097993467102, "grad_norm": 0.3593199849128723, "learning_rate": 0.0002, "loss": 0.5667, "mean_token_accuracy": 0.7716110944747925, "num_tokens": 13705052.0, "step": 3784 }, { "entropy": 0.562559962272644, "epoch": 3.531031264582361, "grad_norm": 0.3549519181251526, "learning_rate": 0.0002, "loss": 0.5719, "mean_token_accuracy": 0.7757324278354645, "num_tokens": 13708626.0, "step": 3785 }, { "entropy": 0.5959880948066711, "epoch": 3.53196453569762, "grad_norm": 0.37917545437812805, "learning_rate": 0.0002, "loss": 0.6071, "mean_token_accuracy": 0.7572960257530212, "num_tokens": 13712210.0, "step": 3786 }, { "entropy": 0.5399685502052307, "epoch": 3.532897806812879, "grad_norm": 0.33415576815605164, "learning_rate": 0.0002, "loss": 0.5528, "mean_token_accuracy": 0.7760212421417236, "num_tokens": 13715792.0, "step": 3787 }, { "entropy": 0.5981897264719009, "epoch": 3.533831077928138, "grad_norm": 0.3900342583656311, "learning_rate": 0.0002, "loss": 0.6153, "mean_token_accuracy": 0.746522068977356, "num_tokens": 13719340.0, "step": 3788 }, { "entropy": 0.613459050655365, "epoch": 3.534764349043397, "grad_norm": 0.37826043367385864, "learning_rate": 0.0002, "loss": 0.6187, "mean_token_accuracy": 0.7554235607385635, "num_tokens": 13723118.0, "step": 3789 }, { "entropy": 0.5789056196808815, "epoch": 3.535697620158656, "grad_norm": 0.3085501194000244, "learning_rate": 0.0002, "loss": 0.5757, "mean_token_accuracy": 0.7675461769104004, "num_tokens": 13726720.0, "step": 3790 }, { "entropy": 0.5786017179489136, "epoch": 3.536630891273915, "grad_norm": 0.3256521224975586, "learning_rate": 0.0002, "loss": 0.5693, "mean_token_accuracy": 0.7664621025323868, "num_tokens": 13730290.0, "step": 3791 }, { "entropy": 0.5871846526861191, "epoch": 3.537564162389174, "grad_norm": 0.4005200266838074, "learning_rate": 0.0002, "loss": 0.5793, "mean_token_accuracy": 0.7721613794565201, "num_tokens": 13733750.0, "step": 3792 }, { "entropy": 0.616417407989502, "epoch": 3.538497433504433, "grad_norm": 0.297344833612442, "learning_rate": 0.0002, "loss": 0.6162, "mean_token_accuracy": 0.7577711641788483, "num_tokens": 13737452.0, "step": 3793 }, { "entropy": 0.5777567774057388, "epoch": 3.539430704619692, "grad_norm": 0.32522809505462646, "learning_rate": 0.0002, "loss": 0.5694, "mean_token_accuracy": 0.7678745239973068, "num_tokens": 13741105.0, "step": 3794 }, { "entropy": 0.5770379602909088, "epoch": 3.5403639757349508, "grad_norm": 0.3349016010761261, "learning_rate": 0.0002, "loss": 0.5805, "mean_token_accuracy": 0.7726330757141113, "num_tokens": 13744789.0, "step": 3795 }, { "entropy": 0.5910525918006897, "epoch": 3.5412972468502097, "grad_norm": 0.37415218353271484, "learning_rate": 0.0002, "loss": 0.6062, "mean_token_accuracy": 0.7506991177797318, "num_tokens": 13748428.0, "step": 3796 }, { "entropy": 0.5654714852571487, "epoch": 3.5422305179654687, "grad_norm": 0.30709248781204224, "learning_rate": 0.0002, "loss": 0.5675, "mean_token_accuracy": 0.7733902484178543, "num_tokens": 13752004.0, "step": 3797 }, { "entropy": 0.5719783157110214, "epoch": 3.5431637890807277, "grad_norm": 0.40230774879455566, "learning_rate": 0.0002, "loss": 0.5756, "mean_token_accuracy": 0.7712215781211853, "num_tokens": 13755730.0, "step": 3798 }, { "entropy": 0.5453578531742096, "epoch": 3.5440970601959867, "grad_norm": 0.3802404999732971, "learning_rate": 0.0002, "loss": 0.5494, "mean_token_accuracy": 0.7720907926559448, "num_tokens": 13759267.0, "step": 3799 }, { "entropy": 0.6140803098678589, "epoch": 3.5450303313112457, "grad_norm": 0.3380722403526306, "learning_rate": 0.0002, "loss": 0.6236, "mean_token_accuracy": 0.7525492310523987, "num_tokens": 13762915.0, "step": 3800 }, { "entropy": 0.6039578318595886, "epoch": 3.5459636024265047, "grad_norm": 0.3798513412475586, "learning_rate": 0.0002, "loss": 0.6135, "mean_token_accuracy": 0.7503634989261627, "num_tokens": 13766512.0, "step": 3801 }, { "entropy": 0.558135986328125, "epoch": 3.5468968735417636, "grad_norm": 0.4099818766117096, "learning_rate": 0.0002, "loss": 0.5728, "mean_token_accuracy": 0.7697967737913132, "num_tokens": 13770070.0, "step": 3802 }, { "entropy": 0.5856420546770096, "epoch": 3.5478301446570226, "grad_norm": 0.362480103969574, "learning_rate": 0.0002, "loss": 0.5927, "mean_token_accuracy": 0.7652021646499634, "num_tokens": 13773669.0, "step": 3803 }, { "entropy": 0.5855385065078735, "epoch": 3.5487634157722816, "grad_norm": 0.391690731048584, "learning_rate": 0.0002, "loss": 0.6017, "mean_token_accuracy": 0.7603690773248672, "num_tokens": 13777334.0, "step": 3804 }, { "entropy": 0.5994816422462463, "epoch": 3.5496966868875406, "grad_norm": 0.3257186710834503, "learning_rate": 0.0002, "loss": 0.6037, "mean_token_accuracy": 0.7529147416353226, "num_tokens": 13780891.0, "step": 3805 }, { "entropy": 0.5417252779006958, "epoch": 3.5506299580027996, "grad_norm": 0.31231406331062317, "learning_rate": 0.0002, "loss": 0.5378, "mean_token_accuracy": 0.7810786217451096, "num_tokens": 13784485.0, "step": 3806 }, { "entropy": 0.5732999071478844, "epoch": 3.5515632291180586, "grad_norm": 0.3371269106864929, "learning_rate": 0.0002, "loss": 0.5781, "mean_token_accuracy": 0.7644158601760864, "num_tokens": 13788115.0, "step": 3807 }, { "entropy": 0.5751039087772369, "epoch": 3.5524965002333175, "grad_norm": 0.4041590690612793, "learning_rate": 0.0002, "loss": 0.5801, "mean_token_accuracy": 0.7641141265630722, "num_tokens": 13791655.0, "step": 3808 }, { "entropy": 0.5622642487287521, "epoch": 3.5534297713485765, "grad_norm": 0.3726871907711029, "learning_rate": 0.0002, "loss": 0.5643, "mean_token_accuracy": 0.7703521698713303, "num_tokens": 13795208.0, "step": 3809 }, { "entropy": 0.5803488194942474, "epoch": 3.5543630424638355, "grad_norm": 0.3132140338420868, "learning_rate": 0.0002, "loss": 0.5759, "mean_token_accuracy": 0.7664266377687454, "num_tokens": 13798909.0, "step": 3810 }, { "entropy": 0.5784748941659927, "epoch": 3.5552963135790945, "grad_norm": 0.3265935480594635, "learning_rate": 0.0002, "loss": 0.58, "mean_token_accuracy": 0.764917179942131, "num_tokens": 13802547.0, "step": 3811 }, { "entropy": 0.6029090285301208, "epoch": 3.5562295846943535, "grad_norm": 0.3116160035133362, "learning_rate": 0.0002, "loss": 0.6075, "mean_token_accuracy": 0.7509216517210007, "num_tokens": 13806171.0, "step": 3812 }, { "entropy": 0.5616150349378586, "epoch": 3.5571628558096124, "grad_norm": 0.45276394486427307, "learning_rate": 0.0002, "loss": 0.5685, "mean_token_accuracy": 0.7768421471118927, "num_tokens": 13809698.0, "step": 3813 }, { "entropy": 0.5714215040206909, "epoch": 3.5580961269248714, "grad_norm": 0.37346646189689636, "learning_rate": 0.0002, "loss": 0.5758, "mean_token_accuracy": 0.7675218135118484, "num_tokens": 13813285.0, "step": 3814 }, { "entropy": 0.5951714962720871, "epoch": 3.5590293980401304, "grad_norm": 0.34978342056274414, "learning_rate": 0.0002, "loss": 0.5986, "mean_token_accuracy": 0.7582394331693649, "num_tokens": 13816860.0, "step": 3815 }, { "entropy": 0.5997519046068192, "epoch": 3.5599626691553894, "grad_norm": 0.3367670178413391, "learning_rate": 0.0002, "loss": 0.5887, "mean_token_accuracy": 0.7634527683258057, "num_tokens": 13820547.0, "step": 3816 }, { "entropy": 0.5876623690128326, "epoch": 3.5608959402706484, "grad_norm": 0.32354792952537537, "learning_rate": 0.0002, "loss": 0.5958, "mean_token_accuracy": 0.7608576118946075, "num_tokens": 13824188.0, "step": 3817 }, { "entropy": 0.5690113231539726, "epoch": 3.5618292113859074, "grad_norm": 0.34308692812919617, "learning_rate": 0.0002, "loss": 0.5723, "mean_token_accuracy": 0.7704053521156311, "num_tokens": 13827767.0, "step": 3818 }, { "entropy": 0.5715198218822479, "epoch": 3.5627624825011663, "grad_norm": 0.41024449467658997, "learning_rate": 0.0002, "loss": 0.5916, "mean_token_accuracy": 0.76175357401371, "num_tokens": 13831257.0, "step": 3819 }, { "entropy": 0.5866518467664719, "epoch": 3.5636957536164253, "grad_norm": 0.3273560702800751, "learning_rate": 0.0002, "loss": 0.5903, "mean_token_accuracy": 0.7593400925397873, "num_tokens": 13834766.0, "step": 3820 }, { "entropy": 0.5787445157766342, "epoch": 3.5646290247316843, "grad_norm": 0.31795328855514526, "learning_rate": 0.0002, "loss": 0.5825, "mean_token_accuracy": 0.7645241618156433, "num_tokens": 13838316.0, "step": 3821 }, { "entropy": 0.5877916216850281, "epoch": 3.5655622958469433, "grad_norm": 0.3083827495574951, "learning_rate": 0.0002, "loss": 0.5934, "mean_token_accuracy": 0.7654929161071777, "num_tokens": 13841847.0, "step": 3822 }, { "entropy": 0.6031261682510376, "epoch": 3.5664955669622023, "grad_norm": 0.31256571412086487, "learning_rate": 0.0002, "loss": 0.6027, "mean_token_accuracy": 0.7615248560905457, "num_tokens": 13845519.0, "step": 3823 }, { "entropy": 0.6071941256523132, "epoch": 3.5674288380774613, "grad_norm": 0.29121851921081543, "learning_rate": 0.0002, "loss": 0.5946, "mean_token_accuracy": 0.7537767738103867, "num_tokens": 13849282.0, "step": 3824 }, { "entropy": 0.6222513616085052, "epoch": 3.5683621091927202, "grad_norm": 0.32070499658584595, "learning_rate": 0.0002, "loss": 0.6221, "mean_token_accuracy": 0.7551289349794388, "num_tokens": 13852888.0, "step": 3825 }, { "entropy": 0.599595695734024, "epoch": 3.569295380307979, "grad_norm": 0.31602755188941956, "learning_rate": 0.0002, "loss": 0.6036, "mean_token_accuracy": 0.7569671422243118, "num_tokens": 13856485.0, "step": 3826 }, { "entropy": 0.6091476529836655, "epoch": 3.570228651423238, "grad_norm": 0.3351421654224396, "learning_rate": 0.0002, "loss": 0.6162, "mean_token_accuracy": 0.7524739652872086, "num_tokens": 13860104.0, "step": 3827 }, { "entropy": 0.6093484610319138, "epoch": 3.571161922538497, "grad_norm": 0.3462320864200592, "learning_rate": 0.0002, "loss": 0.6195, "mean_token_accuracy": 0.7518720626831055, "num_tokens": 13863688.0, "step": 3828 }, { "entropy": 0.5649409294128418, "epoch": 3.572095193653756, "grad_norm": 0.3933051526546478, "learning_rate": 0.0002, "loss": 0.5767, "mean_token_accuracy": 0.7665114253759384, "num_tokens": 13867246.0, "step": 3829 }, { "entropy": 0.5961765050888062, "epoch": 3.573028464769015, "grad_norm": 0.35490772128105164, "learning_rate": 0.0002, "loss": 0.6052, "mean_token_accuracy": 0.7581880837678909, "num_tokens": 13870975.0, "step": 3830 }, { "entropy": 0.5821540504693985, "epoch": 3.573961735884274, "grad_norm": 0.33734193444252014, "learning_rate": 0.0002, "loss": 0.5859, "mean_token_accuracy": 0.7612928450107574, "num_tokens": 13874724.0, "step": 3831 }, { "entropy": 0.5711414068937302, "epoch": 3.574895006999533, "grad_norm": 0.31861311197280884, "learning_rate": 0.0002, "loss": 0.5698, "mean_token_accuracy": 0.7758100926876068, "num_tokens": 13878309.0, "step": 3832 }, { "entropy": 0.6232967525720596, "epoch": 3.575828278114792, "grad_norm": 0.34907016158103943, "learning_rate": 0.0002, "loss": 0.6277, "mean_token_accuracy": 0.7407291084527969, "num_tokens": 13881953.0, "step": 3833 }, { "entropy": 0.5606085732579231, "epoch": 3.576761549230051, "grad_norm": 0.3244101107120514, "learning_rate": 0.0002, "loss": 0.5622, "mean_token_accuracy": 0.7711174637079239, "num_tokens": 13885626.0, "step": 3834 }, { "entropy": 0.5512956008315086, "epoch": 3.57769482034531, "grad_norm": 0.3263333737850189, "learning_rate": 0.0002, "loss": 0.5565, "mean_token_accuracy": 0.7754239290952682, "num_tokens": 13889198.0, "step": 3835 }, { "entropy": 0.544885165989399, "epoch": 3.578628091460569, "grad_norm": 0.31176674365997314, "learning_rate": 0.0002, "loss": 0.5445, "mean_token_accuracy": 0.7829092741012573, "num_tokens": 13892957.0, "step": 3836 }, { "entropy": 0.567774161696434, "epoch": 3.579561362575828, "grad_norm": 0.38554760813713074, "learning_rate": 0.0002, "loss": 0.5898, "mean_token_accuracy": 0.764853909611702, "num_tokens": 13896508.0, "step": 3837 }, { "entropy": 0.5988309532403946, "epoch": 3.580494633691087, "grad_norm": 0.36599200963974, "learning_rate": 0.0002, "loss": 0.6132, "mean_token_accuracy": 0.7529443949460983, "num_tokens": 13900141.0, "step": 3838 }, { "entropy": 0.5653048604726791, "epoch": 3.581427904806346, "grad_norm": 0.382627934217453, "learning_rate": 0.0002, "loss": 0.5794, "mean_token_accuracy": 0.7692445814609528, "num_tokens": 13903610.0, "step": 3839 }, { "entropy": 0.5932115167379379, "epoch": 3.582361175921605, "grad_norm": 0.29232025146484375, "learning_rate": 0.0002, "loss": 0.5869, "mean_token_accuracy": 0.7654600888490677, "num_tokens": 13907213.0, "step": 3840 }, { "entropy": 0.6206163913011551, "epoch": 3.583294447036864, "grad_norm": 0.3013647198677063, "learning_rate": 0.0002, "loss": 0.6102, "mean_token_accuracy": 0.7530961185693741, "num_tokens": 13910885.0, "step": 3841 }, { "entropy": 0.6130441576242447, "epoch": 3.584227718152123, "grad_norm": 0.29914507269859314, "learning_rate": 0.0002, "loss": 0.6068, "mean_token_accuracy": 0.75961834192276, "num_tokens": 13914524.0, "step": 3842 }, { "entropy": 0.5669876784086227, "epoch": 3.585160989267382, "grad_norm": 0.26992204785346985, "learning_rate": 0.0002, "loss": 0.5539, "mean_token_accuracy": 0.7763294279575348, "num_tokens": 13918154.0, "step": 3843 }, { "entropy": 0.6281183511018753, "epoch": 3.586094260382641, "grad_norm": 0.3016006350517273, "learning_rate": 0.0002, "loss": 0.6245, "mean_token_accuracy": 0.7530871778726578, "num_tokens": 13921758.0, "step": 3844 }, { "entropy": 0.5901928395032883, "epoch": 3.5870275314979, "grad_norm": 0.28257331252098083, "learning_rate": 0.0002, "loss": 0.5897, "mean_token_accuracy": 0.7622300833463669, "num_tokens": 13925424.0, "step": 3845 }, { "entropy": 0.5878738164901733, "epoch": 3.587960802613159, "grad_norm": 0.3707005977630615, "learning_rate": 0.0002, "loss": 0.5953, "mean_token_accuracy": 0.7529503703117371, "num_tokens": 13929022.0, "step": 3846 }, { "entropy": 0.5971107482910156, "epoch": 3.588894073728418, "grad_norm": 0.3813655972480774, "learning_rate": 0.0002, "loss": 0.6179, "mean_token_accuracy": 0.7522550970315933, "num_tokens": 13932719.0, "step": 3847 }, { "entropy": 0.5646380037069321, "epoch": 3.589827344843677, "grad_norm": 0.3389035761356354, "learning_rate": 0.0002, "loss": 0.5711, "mean_token_accuracy": 0.7675033658742905, "num_tokens": 13936416.0, "step": 3848 }, { "entropy": 0.5679559260606766, "epoch": 3.590760615958936, "grad_norm": 0.38123106956481934, "learning_rate": 0.0002, "loss": 0.576, "mean_token_accuracy": 0.7660284489393234, "num_tokens": 13939991.0, "step": 3849 }, { "entropy": 0.593162328004837, "epoch": 3.591693887074195, "grad_norm": 0.3807743787765503, "learning_rate": 0.0002, "loss": 0.5947, "mean_token_accuracy": 0.7634227871894836, "num_tokens": 13943692.0, "step": 3850 }, { "entropy": 0.5433150082826614, "epoch": 3.5926271581894538, "grad_norm": 0.3373897969722748, "learning_rate": 0.0002, "loss": 0.5631, "mean_token_accuracy": 0.7697895169258118, "num_tokens": 13947310.0, "step": 3851 }, { "entropy": 0.5781200975179672, "epoch": 3.5935604293047128, "grad_norm": 0.41849687695503235, "learning_rate": 0.0002, "loss": 0.5826, "mean_token_accuracy": 0.765534371137619, "num_tokens": 13950947.0, "step": 3852 }, { "entropy": 0.6022240519523621, "epoch": 3.5944937004199717, "grad_norm": 0.3326480984687805, "learning_rate": 0.0002, "loss": 0.6122, "mean_token_accuracy": 0.7485272884368896, "num_tokens": 13954699.0, "step": 3853 }, { "entropy": 0.5728299021720886, "epoch": 3.5954269715352307, "grad_norm": 0.38981255888938904, "learning_rate": 0.0002, "loss": 0.5779, "mean_token_accuracy": 0.7652591019868851, "num_tokens": 13958268.0, "step": 3854 }, { "entropy": 0.5848111361265182, "epoch": 3.5963602426504897, "grad_norm": 0.35166677832603455, "learning_rate": 0.0002, "loss": 0.5924, "mean_token_accuracy": 0.7658181488513947, "num_tokens": 13961741.0, "step": 3855 }, { "entropy": 0.5643909946084023, "epoch": 3.5972935137657487, "grad_norm": 0.3199177086353302, "learning_rate": 0.0002, "loss": 0.5711, "mean_token_accuracy": 0.7636389285326004, "num_tokens": 13965338.0, "step": 3856 }, { "entropy": 0.5678524821996689, "epoch": 3.5982267848810077, "grad_norm": 0.3333631753921509, "learning_rate": 0.0002, "loss": 0.5669, "mean_token_accuracy": 0.7688505202531815, "num_tokens": 13968888.0, "step": 3857 }, { "entropy": 0.5923017263412476, "epoch": 3.5991600559962666, "grad_norm": 0.2928493320941925, "learning_rate": 0.0002, "loss": 0.5879, "mean_token_accuracy": 0.7603162080049515, "num_tokens": 13972610.0, "step": 3858 }, { "entropy": 0.5572316944599152, "epoch": 3.6000933271115256, "grad_norm": 0.29275310039520264, "learning_rate": 0.0002, "loss": 0.5521, "mean_token_accuracy": 0.7711005210876465, "num_tokens": 13976137.0, "step": 3859 }, { "entropy": 0.587244525551796, "epoch": 3.6010265982267846, "grad_norm": 0.3508412539958954, "learning_rate": 0.0002, "loss": 0.598, "mean_token_accuracy": 0.7620662748813629, "num_tokens": 13979807.0, "step": 3860 }, { "entropy": 0.5747798979282379, "epoch": 3.6019598693420436, "grad_norm": 0.35471683740615845, "learning_rate": 0.0002, "loss": 0.5735, "mean_token_accuracy": 0.7636239677667618, "num_tokens": 13983408.0, "step": 3861 }, { "entropy": 0.5657021999359131, "epoch": 3.6028931404573026, "grad_norm": 0.3181654214859009, "learning_rate": 0.0002, "loss": 0.5681, "mean_token_accuracy": 0.7733299881219864, "num_tokens": 13986991.0, "step": 3862 }, { "entropy": 0.5950556546449661, "epoch": 3.6038264115725616, "grad_norm": 0.34353193640708923, "learning_rate": 0.0002, "loss": 0.6054, "mean_token_accuracy": 0.7611847221851349, "num_tokens": 13990558.0, "step": 3863 }, { "entropy": 0.5915162414312363, "epoch": 3.6047596826878205, "grad_norm": 0.3269050419330597, "learning_rate": 0.0002, "loss": 0.5981, "mean_token_accuracy": 0.7574534118175507, "num_tokens": 13994277.0, "step": 3864 }, { "entropy": 0.528988741338253, "epoch": 3.6056929538030795, "grad_norm": 0.3856528401374817, "learning_rate": 0.0002, "loss": 0.5359, "mean_token_accuracy": 0.7826804965734482, "num_tokens": 13997771.0, "step": 3865 }, { "entropy": 0.6219935417175293, "epoch": 3.6066262249183385, "grad_norm": 0.31206217408180237, "learning_rate": 0.0002, "loss": 0.6226, "mean_token_accuracy": 0.7493187040090561, "num_tokens": 14001357.0, "step": 3866 }, { "entropy": 0.5722871124744415, "epoch": 3.6075594960335975, "grad_norm": 0.32044848799705505, "learning_rate": 0.0002, "loss": 0.5744, "mean_token_accuracy": 0.7728626579046249, "num_tokens": 14004962.0, "step": 3867 }, { "entropy": 0.5880558490753174, "epoch": 3.6084927671488565, "grad_norm": 0.3141503930091858, "learning_rate": 0.0002, "loss": 0.5778, "mean_token_accuracy": 0.7686361223459244, "num_tokens": 14008512.0, "step": 3868 }, { "entropy": 0.6326761841773987, "epoch": 3.6094260382641155, "grad_norm": 0.34118661284446716, "learning_rate": 0.0002, "loss": 0.627, "mean_token_accuracy": 0.7538288086652756, "num_tokens": 14012291.0, "step": 3869 }, { "entropy": 0.6017717570066452, "epoch": 3.6103593093793744, "grad_norm": 0.36372077465057373, "learning_rate": 0.0002, "loss": 0.6019, "mean_token_accuracy": 0.7576287984848022, "num_tokens": 14015783.0, "step": 3870 }, { "entropy": 0.5731663182377815, "epoch": 3.6112925804946334, "grad_norm": 0.33554455637931824, "learning_rate": 0.0002, "loss": 0.5774, "mean_token_accuracy": 0.7635979950428009, "num_tokens": 14019370.0, "step": 3871 }, { "entropy": 0.5488988608121872, "epoch": 3.6122258516098924, "grad_norm": 0.4144861698150635, "learning_rate": 0.0002, "loss": 0.5655, "mean_token_accuracy": 0.7678020894527435, "num_tokens": 14022944.0, "step": 3872 }, { "entropy": 0.5960760563611984, "epoch": 3.6131591227251514, "grad_norm": 0.36085325479507446, "learning_rate": 0.0002, "loss": 0.6025, "mean_token_accuracy": 0.7500066161155701, "num_tokens": 14026627.0, "step": 3873 }, { "entropy": 0.5549457967281342, "epoch": 3.6140923938404104, "grad_norm": 0.3173762261867523, "learning_rate": 0.0002, "loss": 0.5625, "mean_token_accuracy": 0.7768613547086716, "num_tokens": 14030363.0, "step": 3874 }, { "entropy": 0.5755786895751953, "epoch": 3.6150256649556693, "grad_norm": 0.3447907269001007, "learning_rate": 0.0002, "loss": 0.5781, "mean_token_accuracy": 0.7699871361255646, "num_tokens": 14033965.0, "step": 3875 }, { "entropy": 0.5712272375822067, "epoch": 3.6159589360709283, "grad_norm": 0.31379443407058716, "learning_rate": 0.0002, "loss": 0.5769, "mean_token_accuracy": 0.7693324536085129, "num_tokens": 14037759.0, "step": 3876 }, { "entropy": 0.6040422022342682, "epoch": 3.6168922071861873, "grad_norm": 0.36927035450935364, "learning_rate": 0.0002, "loss": 0.6096, "mean_token_accuracy": 0.7539640963077545, "num_tokens": 14041546.0, "step": 3877 }, { "entropy": 0.5736714601516724, "epoch": 3.6178254783014463, "grad_norm": 0.33893129229545593, "learning_rate": 0.0002, "loss": 0.575, "mean_token_accuracy": 0.7703326046466827, "num_tokens": 14045224.0, "step": 3878 }, { "entropy": 0.5907787084579468, "epoch": 3.6187587494167053, "grad_norm": 0.34340840578079224, "learning_rate": 0.0002, "loss": 0.5898, "mean_token_accuracy": 0.7617567181587219, "num_tokens": 14048885.0, "step": 3879 }, { "entropy": 0.5562929213047028, "epoch": 3.6196920205319643, "grad_norm": 0.3202238082885742, "learning_rate": 0.0002, "loss": 0.5644, "mean_token_accuracy": 0.769898533821106, "num_tokens": 14052486.0, "step": 3880 }, { "entropy": 0.5866236835718155, "epoch": 3.6206252916472232, "grad_norm": 0.3035411834716797, "learning_rate": 0.0002, "loss": 0.5901, "mean_token_accuracy": 0.767398789525032, "num_tokens": 14056079.0, "step": 3881 }, { "entropy": 0.6131669282913208, "epoch": 3.6215585627624822, "grad_norm": 0.395805686712265, "learning_rate": 0.0002, "loss": 0.6142, "mean_token_accuracy": 0.7451876550912857, "num_tokens": 14059674.0, "step": 3882 }, { "entropy": 0.6009152233600616, "epoch": 3.622491833877741, "grad_norm": 0.3708978295326233, "learning_rate": 0.0002, "loss": 0.5971, "mean_token_accuracy": 0.7561370581388474, "num_tokens": 14063311.0, "step": 3883 }, { "entropy": 0.556886151432991, "epoch": 3.623425104993, "grad_norm": 0.35550177097320557, "learning_rate": 0.0002, "loss": 0.5631, "mean_token_accuracy": 0.7742457985877991, "num_tokens": 14066850.0, "step": 3884 }, { "entropy": 0.568698063492775, "epoch": 3.624358376108259, "grad_norm": 0.395001620054245, "learning_rate": 0.0002, "loss": 0.5718, "mean_token_accuracy": 0.7700649052858353, "num_tokens": 14070508.0, "step": 3885 }, { "entropy": 0.5594694912433624, "epoch": 3.6252916472235186, "grad_norm": 0.3587895631790161, "learning_rate": 0.0002, "loss": 0.5671, "mean_token_accuracy": 0.7743656039237976, "num_tokens": 14074107.0, "step": 3886 }, { "entropy": 0.5791210830211639, "epoch": 3.6262249183387776, "grad_norm": 0.3241833746433258, "learning_rate": 0.0002, "loss": 0.5809, "mean_token_accuracy": 0.7651449739933014, "num_tokens": 14077752.0, "step": 3887 }, { "entropy": 0.5449633598327637, "epoch": 3.6271581894540366, "grad_norm": 0.30027061700820923, "learning_rate": 0.0002, "loss": 0.5502, "mean_token_accuracy": 0.7736046612262726, "num_tokens": 14081515.0, "step": 3888 }, { "entropy": 0.5515652447938919, "epoch": 3.6280914605692955, "grad_norm": 0.40578922629356384, "learning_rate": 0.0002, "loss": 0.5676, "mean_token_accuracy": 0.7684683948755264, "num_tokens": 14085066.0, "step": 3889 }, { "entropy": 0.5894753783941269, "epoch": 3.6290247316845545, "grad_norm": 0.3037429451942444, "learning_rate": 0.0002, "loss": 0.5876, "mean_token_accuracy": 0.7638644278049469, "num_tokens": 14088879.0, "step": 3890 }, { "entropy": 0.5250414311885834, "epoch": 3.6299580027998135, "grad_norm": 0.34336885809898376, "learning_rate": 0.0002, "loss": 0.5301, "mean_token_accuracy": 0.7811094224452972, "num_tokens": 14092488.0, "step": 3891 }, { "entropy": 0.565603606402874, "epoch": 3.6308912739150725, "grad_norm": 0.37788382172584534, "learning_rate": 0.0002, "loss": 0.5663, "mean_token_accuracy": 0.7739093005657196, "num_tokens": 14096159.0, "step": 3892 }, { "entropy": 0.5680104345083237, "epoch": 3.6318245450303315, "grad_norm": 0.3981499969959259, "learning_rate": 0.0002, "loss": 0.5802, "mean_token_accuracy": 0.7666442841291428, "num_tokens": 14099869.0, "step": 3893 }, { "entropy": 0.5705828219652176, "epoch": 3.6327578161455905, "grad_norm": 0.3224697411060333, "learning_rate": 0.0002, "loss": 0.577, "mean_token_accuracy": 0.7704690247774124, "num_tokens": 14103384.0, "step": 3894 }, { "entropy": 0.5865432471036911, "epoch": 3.6336910872608494, "grad_norm": 0.413298636674881, "learning_rate": 0.0002, "loss": 0.5932, "mean_token_accuracy": 0.7656527310609818, "num_tokens": 14107029.0, "step": 3895 }, { "entropy": 0.5695411860942841, "epoch": 3.6346243583761084, "grad_norm": 0.3050275146961212, "learning_rate": 0.0002, "loss": 0.5673, "mean_token_accuracy": 0.7708216309547424, "num_tokens": 14110553.0, "step": 3896 }, { "entropy": 0.568225547671318, "epoch": 3.6355576294913674, "grad_norm": 0.29818302392959595, "learning_rate": 0.0002, "loss": 0.5638, "mean_token_accuracy": 0.7730000913143158, "num_tokens": 14114267.0, "step": 3897 }, { "entropy": 0.6012596786022186, "epoch": 3.6364909006066264, "grad_norm": 0.336907297372818, "learning_rate": 0.0002, "loss": 0.6025, "mean_token_accuracy": 0.7574862539768219, "num_tokens": 14117889.0, "step": 3898 }, { "entropy": 0.5769800394773483, "epoch": 3.6374241717218854, "grad_norm": 0.3768845796585083, "learning_rate": 0.0002, "loss": 0.5764, "mean_token_accuracy": 0.7671528607606888, "num_tokens": 14121399.0, "step": 3899 }, { "entropy": 0.5842826217412949, "epoch": 3.6383574428371444, "grad_norm": 0.34831398725509644, "learning_rate": 0.0002, "loss": 0.5814, "mean_token_accuracy": 0.7648147642612457, "num_tokens": 14125076.0, "step": 3900 }, { "entropy": 0.6316385120153427, "epoch": 3.6392907139524033, "grad_norm": 0.3616248369216919, "learning_rate": 0.0002, "loss": 0.6387, "mean_token_accuracy": 0.7419647425413132, "num_tokens": 14128796.0, "step": 3901 }, { "entropy": 0.6120327562093735, "epoch": 3.6402239850676623, "grad_norm": 0.3610549867153168, "learning_rate": 0.0002, "loss": 0.6209, "mean_token_accuracy": 0.7509381324052811, "num_tokens": 14132517.0, "step": 3902 }, { "entropy": 0.56941719353199, "epoch": 3.6411572561829213, "grad_norm": 0.35996338725090027, "learning_rate": 0.0002, "loss": 0.5715, "mean_token_accuracy": 0.7710943222045898, "num_tokens": 14136200.0, "step": 3903 }, { "entropy": 0.5830513387918472, "epoch": 3.6420905272981803, "grad_norm": 0.33035799860954285, "learning_rate": 0.0002, "loss": 0.583, "mean_token_accuracy": 0.7698493152856827, "num_tokens": 14139946.0, "step": 3904 }, { "entropy": 0.5898331552743912, "epoch": 3.6430237984134393, "grad_norm": 0.3557892143726349, "learning_rate": 0.0002, "loss": 0.5901, "mean_token_accuracy": 0.7602048218250275, "num_tokens": 14143490.0, "step": 3905 }, { "entropy": 0.5792873799800873, "epoch": 3.6439570695286982, "grad_norm": 0.3428036868572235, "learning_rate": 0.0002, "loss": 0.5822, "mean_token_accuracy": 0.7610692828893661, "num_tokens": 14147183.0, "step": 3906 }, { "entropy": 0.5607921928167343, "epoch": 3.6448903406439572, "grad_norm": 0.3794654309749603, "learning_rate": 0.0002, "loss": 0.5668, "mean_token_accuracy": 0.7706628143787384, "num_tokens": 14150802.0, "step": 3907 }, { "entropy": 0.5924274623394012, "epoch": 3.645823611759216, "grad_norm": 0.3258565664291382, "learning_rate": 0.0002, "loss": 0.5881, "mean_token_accuracy": 0.7637478411197662, "num_tokens": 14154491.0, "step": 3908 }, { "entropy": 0.569896325469017, "epoch": 3.646756882874475, "grad_norm": 0.3314167261123657, "learning_rate": 0.0002, "loss": 0.5728, "mean_token_accuracy": 0.7638538032770157, "num_tokens": 14158113.0, "step": 3909 }, { "entropy": 0.5845869481563568, "epoch": 3.647690153989734, "grad_norm": 0.3280129134654999, "learning_rate": 0.0002, "loss": 0.5871, "mean_token_accuracy": 0.7620397806167603, "num_tokens": 14161758.0, "step": 3910 }, { "entropy": 0.600479856133461, "epoch": 3.648623425104993, "grad_norm": 0.32310131192207336, "learning_rate": 0.0002, "loss": 0.604, "mean_token_accuracy": 0.7599018961191177, "num_tokens": 14165418.0, "step": 3911 }, { "entropy": 0.567915603518486, "epoch": 3.649556696220252, "grad_norm": 0.36821696162223816, "learning_rate": 0.0002, "loss": 0.5831, "mean_token_accuracy": 0.7682351917028427, "num_tokens": 14169065.0, "step": 3912 }, { "entropy": 0.5452623441815376, "epoch": 3.650489967335511, "grad_norm": 0.36052727699279785, "learning_rate": 0.0002, "loss": 0.5533, "mean_token_accuracy": 0.7766226679086685, "num_tokens": 14172632.0, "step": 3913 }, { "entropy": 0.6176996380090714, "epoch": 3.65142323845077, "grad_norm": 0.27620434761047363, "learning_rate": 0.0002, "loss": 0.6187, "mean_token_accuracy": 0.7488767802715302, "num_tokens": 14176164.0, "step": 3914 }, { "entropy": 0.5840997397899628, "epoch": 3.652356509566029, "grad_norm": 0.3048302233219147, "learning_rate": 0.0002, "loss": 0.5877, "mean_token_accuracy": 0.7590353786945343, "num_tokens": 14179843.0, "step": 3915 }, { "entropy": 0.6183319836854935, "epoch": 3.653289780681288, "grad_norm": 0.3021836578845978, "learning_rate": 0.0002, "loss": 0.618, "mean_token_accuracy": 0.7467764467000961, "num_tokens": 14183504.0, "step": 3916 }, { "entropy": 0.6188158094882965, "epoch": 3.654223051796547, "grad_norm": 0.32962939143180847, "learning_rate": 0.0002, "loss": 0.6145, "mean_token_accuracy": 0.7554430812597275, "num_tokens": 14187213.0, "step": 3917 }, { "entropy": 0.5639399290084839, "epoch": 3.655156322911806, "grad_norm": 0.3323192894458771, "learning_rate": 0.0002, "loss": 0.5617, "mean_token_accuracy": 0.7728131711483002, "num_tokens": 14190946.0, "step": 3918 }, { "entropy": 0.5708101540803909, "epoch": 3.656089594027065, "grad_norm": 0.2950708866119385, "learning_rate": 0.0002, "loss": 0.5693, "mean_token_accuracy": 0.7697159647941589, "num_tokens": 14194612.0, "step": 3919 }, { "entropy": 0.5824941694736481, "epoch": 3.657022865142324, "grad_norm": 0.3235052227973938, "learning_rate": 0.0002, "loss": 0.5703, "mean_token_accuracy": 0.7702392190694809, "num_tokens": 14198236.0, "step": 3920 }, { "entropy": 0.5795156359672546, "epoch": 3.657956136257583, "grad_norm": 0.3464623689651489, "learning_rate": 0.0002, "loss": 0.5859, "mean_token_accuracy": 0.7594168931245804, "num_tokens": 14201941.0, "step": 3921 }, { "entropy": 0.5639664530754089, "epoch": 3.658889407372842, "grad_norm": 0.348721981048584, "learning_rate": 0.0002, "loss": 0.5809, "mean_token_accuracy": 0.7594483196735382, "num_tokens": 14205517.0, "step": 3922 }, { "entropy": 0.5458170026540756, "epoch": 3.659822678488101, "grad_norm": 0.32144662737846375, "learning_rate": 0.0002, "loss": 0.5611, "mean_token_accuracy": 0.7716976255178452, "num_tokens": 14209190.0, "step": 3923 }, { "entropy": 0.5594873577356339, "epoch": 3.66075594960336, "grad_norm": 0.38897791504859924, "learning_rate": 0.0002, "loss": 0.5751, "mean_token_accuracy": 0.7618945389986038, "num_tokens": 14212797.0, "step": 3924 }, { "entropy": 0.5679596960544586, "epoch": 3.661689220718619, "grad_norm": 0.3742331266403198, "learning_rate": 0.0002, "loss": 0.5747, "mean_token_accuracy": 0.7699204236268997, "num_tokens": 14216419.0, "step": 3925 }, { "entropy": 0.5768562704324722, "epoch": 3.662622491833878, "grad_norm": 0.3515186905860901, "learning_rate": 0.0002, "loss": 0.58, "mean_token_accuracy": 0.7660610675811768, "num_tokens": 14219978.0, "step": 3926 }, { "entropy": 0.6063810735940933, "epoch": 3.663555762949137, "grad_norm": 0.34188568592071533, "learning_rate": 0.0002, "loss": 0.5993, "mean_token_accuracy": 0.7592268288135529, "num_tokens": 14223630.0, "step": 3927 }, { "entropy": 0.5970328599214554, "epoch": 3.664489034064396, "grad_norm": 0.3415139317512512, "learning_rate": 0.0002, "loss": 0.5976, "mean_token_accuracy": 0.7541011869907379, "num_tokens": 14227252.0, "step": 3928 }, { "entropy": 0.5547977611422539, "epoch": 3.665422305179655, "grad_norm": 0.45866838097572327, "learning_rate": 0.0002, "loss": 0.5556, "mean_token_accuracy": 0.7719889879226685, "num_tokens": 14230768.0, "step": 3929 }, { "entropy": 0.592997819185257, "epoch": 3.666355576294914, "grad_norm": 0.336673378944397, "learning_rate": 0.0002, "loss": 0.5852, "mean_token_accuracy": 0.7589982450008392, "num_tokens": 14234467.0, "step": 3930 }, { "entropy": 0.608492061495781, "epoch": 3.667288847410173, "grad_norm": 0.410719633102417, "learning_rate": 0.0002, "loss": 0.6132, "mean_token_accuracy": 0.756444662809372, "num_tokens": 14238234.0, "step": 3931 }, { "entropy": 0.5941027849912643, "epoch": 3.668222118525432, "grad_norm": 0.3508427143096924, "learning_rate": 0.0002, "loss": 0.6017, "mean_token_accuracy": 0.7652078121900558, "num_tokens": 14241888.0, "step": 3932 }, { "entropy": 0.549273319542408, "epoch": 3.6691553896406908, "grad_norm": 0.4272778034210205, "learning_rate": 0.0002, "loss": 0.5748, "mean_token_accuracy": 0.7692470550537109, "num_tokens": 14245449.0, "step": 3933 }, { "entropy": 0.548586055636406, "epoch": 3.6700886607559497, "grad_norm": 0.3381420075893402, "learning_rate": 0.0002, "loss": 0.5612, "mean_token_accuracy": 0.776843935251236, "num_tokens": 14249138.0, "step": 3934 }, { "entropy": 0.5587465763092041, "epoch": 3.6710219318712087, "grad_norm": 0.3933650255203247, "learning_rate": 0.0002, "loss": 0.5697, "mean_token_accuracy": 0.7676128149032593, "num_tokens": 14252771.0, "step": 3935 }, { "entropy": 0.597853809595108, "epoch": 3.6719552029864677, "grad_norm": 0.3760383129119873, "learning_rate": 0.0002, "loss": 0.6003, "mean_token_accuracy": 0.7598902881145477, "num_tokens": 14256508.0, "step": 3936 }, { "entropy": 0.6222196519374847, "epoch": 3.6728884741017267, "grad_norm": 0.28753212094306946, "learning_rate": 0.0002, "loss": 0.6114, "mean_token_accuracy": 0.7520410567522049, "num_tokens": 14260147.0, "step": 3937 }, { "entropy": 0.606382429599762, "epoch": 3.6738217452169857, "grad_norm": 0.3875804543495178, "learning_rate": 0.0002, "loss": 0.6103, "mean_token_accuracy": 0.761890321969986, "num_tokens": 14263780.0, "step": 3938 }, { "entropy": 0.6280186027288437, "epoch": 3.6747550163322447, "grad_norm": 0.3432680666446686, "learning_rate": 0.0002, "loss": 0.6194, "mean_token_accuracy": 0.7509109526872635, "num_tokens": 14267379.0, "step": 3939 }, { "entropy": 0.6019525527954102, "epoch": 3.6756882874475036, "grad_norm": 0.3475853204727173, "learning_rate": 0.0002, "loss": 0.5997, "mean_token_accuracy": 0.7565563470125198, "num_tokens": 14270974.0, "step": 3940 }, { "entropy": 0.5781262069940567, "epoch": 3.6766215585627626, "grad_norm": 0.3214326798915863, "learning_rate": 0.0002, "loss": 0.5811, "mean_token_accuracy": 0.7681198418140411, "num_tokens": 14274653.0, "step": 3941 }, { "entropy": 0.5985192060470581, "epoch": 3.6775548296780216, "grad_norm": 0.35484814643859863, "learning_rate": 0.0002, "loss": 0.6155, "mean_token_accuracy": 0.7476750761270523, "num_tokens": 14278320.0, "step": 3942 }, { "entropy": 0.5817293077707291, "epoch": 3.6784881007932806, "grad_norm": 0.32916775345802307, "learning_rate": 0.0002, "loss": 0.5907, "mean_token_accuracy": 0.7662037163972855, "num_tokens": 14281987.0, "step": 3943 }, { "entropy": 0.5792255252599716, "epoch": 3.6794213719085396, "grad_norm": 0.35134002566337585, "learning_rate": 0.0002, "loss": 0.577, "mean_token_accuracy": 0.7734541893005371, "num_tokens": 14285579.0, "step": 3944 }, { "entropy": 0.6091095209121704, "epoch": 3.6803546430237986, "grad_norm": 0.3411770761013031, "learning_rate": 0.0002, "loss": 0.6229, "mean_token_accuracy": 0.7407828718423843, "num_tokens": 14289176.0, "step": 3945 }, { "entropy": 0.6013078838586807, "epoch": 3.6812879141390575, "grad_norm": 0.3345896899700165, "learning_rate": 0.0002, "loss": 0.6139, "mean_token_accuracy": 0.7549757957458496, "num_tokens": 14292809.0, "step": 3946 }, { "entropy": 0.6232054233551025, "epoch": 3.6822211852543165, "grad_norm": 0.3038768470287323, "learning_rate": 0.0002, "loss": 0.6092, "mean_token_accuracy": 0.7555011212825775, "num_tokens": 14296579.0, "step": 3947 }, { "entropy": 0.5919431447982788, "epoch": 3.6831544563695755, "grad_norm": 0.3342461884021759, "learning_rate": 0.0002, "loss": 0.5834, "mean_token_accuracy": 0.7629011422395706, "num_tokens": 14300120.0, "step": 3948 }, { "entropy": 0.5473599582910538, "epoch": 3.6840877274848345, "grad_norm": 0.3818954527378082, "learning_rate": 0.0002, "loss": 0.5554, "mean_token_accuracy": 0.7750977873802185, "num_tokens": 14303747.0, "step": 3949 }, { "entropy": 0.5696288794279099, "epoch": 3.6850209986000935, "grad_norm": 0.35929977893829346, "learning_rate": 0.0002, "loss": 0.5824, "mean_token_accuracy": 0.7619118243455887, "num_tokens": 14307335.0, "step": 3950 }, { "entropy": 0.6010849177837372, "epoch": 3.6859542697153524, "grad_norm": 0.3447357714176178, "learning_rate": 0.0002, "loss": 0.6079, "mean_token_accuracy": 0.7522968500852585, "num_tokens": 14310816.0, "step": 3951 }, { "entropy": 0.5812274068593979, "epoch": 3.6868875408306114, "grad_norm": 0.3745281398296356, "learning_rate": 0.0002, "loss": 0.596, "mean_token_accuracy": 0.7575566172599792, "num_tokens": 14314406.0, "step": 3952 }, { "entropy": 0.6032616198062897, "epoch": 3.6878208119458704, "grad_norm": 0.36050495505332947, "learning_rate": 0.0002, "loss": 0.605, "mean_token_accuracy": 0.7600051760673523, "num_tokens": 14318117.0, "step": 3953 }, { "entropy": 0.5888346582651138, "epoch": 3.6887540830611294, "grad_norm": 0.3341602683067322, "learning_rate": 0.0002, "loss": 0.5989, "mean_token_accuracy": 0.756238579750061, "num_tokens": 14321727.0, "step": 3954 }, { "entropy": 0.6035739481449127, "epoch": 3.6896873541763884, "grad_norm": 0.29744064807891846, "learning_rate": 0.0002, "loss": 0.5977, "mean_token_accuracy": 0.7597524225711823, "num_tokens": 14325356.0, "step": 3955 }, { "entropy": 0.6107344776391983, "epoch": 3.6906206252916474, "grad_norm": 0.3534705936908722, "learning_rate": 0.0002, "loss": 0.6136, "mean_token_accuracy": 0.7496775537729263, "num_tokens": 14328911.0, "step": 3956 }, { "entropy": 0.5192564427852631, "epoch": 3.6915538964069063, "grad_norm": 0.39007601141929626, "learning_rate": 0.0002, "loss": 0.5365, "mean_token_accuracy": 0.7809058278799057, "num_tokens": 14332284.0, "step": 3957 }, { "entropy": 0.5696321278810501, "epoch": 3.6924871675221653, "grad_norm": 0.30133309960365295, "learning_rate": 0.0002, "loss": 0.5667, "mean_token_accuracy": 0.7672118246555328, "num_tokens": 14335859.0, "step": 3958 }, { "entropy": 0.5883957147598267, "epoch": 3.6934204386374243, "grad_norm": 0.3440927267074585, "learning_rate": 0.0002, "loss": 0.5867, "mean_token_accuracy": 0.7583846151828766, "num_tokens": 14339507.0, "step": 3959 }, { "entropy": 0.5845747739076614, "epoch": 3.6943537097526833, "grad_norm": 0.3479245901107788, "learning_rate": 0.0002, "loss": 0.5865, "mean_token_accuracy": 0.7594495564699173, "num_tokens": 14343133.0, "step": 3960 }, { "entropy": 0.5712944492697716, "epoch": 3.6952869808679423, "grad_norm": 0.316556841135025, "learning_rate": 0.0002, "loss": 0.5722, "mean_token_accuracy": 0.7622150629758835, "num_tokens": 14346814.0, "step": 3961 }, { "entropy": 0.5768572986125946, "epoch": 3.6962202519832013, "grad_norm": 0.34972527623176575, "learning_rate": 0.0002, "loss": 0.5782, "mean_token_accuracy": 0.7635985910892487, "num_tokens": 14350478.0, "step": 3962 }, { "entropy": 0.5833560675382614, "epoch": 3.6971535230984602, "grad_norm": 0.2991499900817871, "learning_rate": 0.0002, "loss": 0.5784, "mean_token_accuracy": 0.7648169994354248, "num_tokens": 14354158.0, "step": 3963 }, { "entropy": 0.5732728987932205, "epoch": 3.698086794213719, "grad_norm": 0.37632954120635986, "learning_rate": 0.0002, "loss": 0.5751, "mean_token_accuracy": 0.7716990411281586, "num_tokens": 14357758.0, "step": 3964 }, { "entropy": 0.5657035559415817, "epoch": 3.699020065328978, "grad_norm": 0.35340332984924316, "learning_rate": 0.0002, "loss": 0.5662, "mean_token_accuracy": 0.775088295340538, "num_tokens": 14361410.0, "step": 3965 }, { "entropy": 0.5552204847335815, "epoch": 3.699953336444237, "grad_norm": 0.3751367926597595, "learning_rate": 0.0002, "loss": 0.5576, "mean_token_accuracy": 0.772477924823761, "num_tokens": 14365027.0, "step": 3966 }, { "entropy": 0.5819824934005737, "epoch": 3.700886607559496, "grad_norm": 0.3699405789375305, "learning_rate": 0.0002, "loss": 0.5942, "mean_token_accuracy": 0.7609040439128876, "num_tokens": 14368797.0, "step": 3967 }, { "entropy": 0.5454636365175247, "epoch": 3.701819878674755, "grad_norm": 0.34118038415908813, "learning_rate": 0.0002, "loss": 0.5557, "mean_token_accuracy": 0.7736933380365372, "num_tokens": 14372389.0, "step": 3968 }, { "entropy": 0.5882205963134766, "epoch": 3.702753149790014, "grad_norm": 0.36108776926994324, "learning_rate": 0.0002, "loss": 0.592, "mean_token_accuracy": 0.7634865939617157, "num_tokens": 14375963.0, "step": 3969 }, { "entropy": 0.5660656839609146, "epoch": 3.703686420905273, "grad_norm": 0.31421536207199097, "learning_rate": 0.0002, "loss": 0.5702, "mean_token_accuracy": 0.7675003856420517, "num_tokens": 14379470.0, "step": 3970 }, { "entropy": 0.6053193509578705, "epoch": 3.704619692020532, "grad_norm": 0.31547626852989197, "learning_rate": 0.0002, "loss": 0.6013, "mean_token_accuracy": 0.7556703984737396, "num_tokens": 14383047.0, "step": 3971 }, { "entropy": 0.5946832299232483, "epoch": 3.705552963135791, "grad_norm": 0.3597579300403595, "learning_rate": 0.0002, "loss": 0.5855, "mean_token_accuracy": 0.7660495489835739, "num_tokens": 14386684.0, "step": 3972 }, { "entropy": 0.5882548987865448, "epoch": 3.70648623425105, "grad_norm": 0.34936800599098206, "learning_rate": 0.0002, "loss": 0.5912, "mean_token_accuracy": 0.7640232294797897, "num_tokens": 14390322.0, "step": 3973 }, { "entropy": 0.5827976763248444, "epoch": 3.707419505366309, "grad_norm": 0.3156088888645172, "learning_rate": 0.0002, "loss": 0.5763, "mean_token_accuracy": 0.7684086859226227, "num_tokens": 14394081.0, "step": 3974 }, { "entropy": 0.5568510890007019, "epoch": 3.708352776481568, "grad_norm": 0.43490931391716003, "learning_rate": 0.0002, "loss": 0.5804, "mean_token_accuracy": 0.7689134478569031, "num_tokens": 14397739.0, "step": 3975 }, { "entropy": 0.5623335689306259, "epoch": 3.709286047596827, "grad_norm": 0.381727397441864, "learning_rate": 0.0002, "loss": 0.5759, "mean_token_accuracy": 0.7696806788444519, "num_tokens": 14401353.0, "step": 3976 }, { "entropy": 0.6091882884502411, "epoch": 3.710219318712086, "grad_norm": 0.38858747482299805, "learning_rate": 0.0002, "loss": 0.6234, "mean_token_accuracy": 0.7487282007932663, "num_tokens": 14405034.0, "step": 3977 }, { "entropy": 0.5872656852006912, "epoch": 3.711152589827345, "grad_norm": 0.4290648400783539, "learning_rate": 0.0002, "loss": 0.6004, "mean_token_accuracy": 0.7653700262308121, "num_tokens": 14408528.0, "step": 3978 }, { "entropy": 0.6043431162834167, "epoch": 3.712085860942604, "grad_norm": 0.35856562852859497, "learning_rate": 0.0002, "loss": 0.6083, "mean_token_accuracy": 0.749419629573822, "num_tokens": 14412070.0, "step": 3979 }, { "entropy": 0.5680277198553085, "epoch": 3.713019132057863, "grad_norm": 0.3415631949901581, "learning_rate": 0.0002, "loss": 0.573, "mean_token_accuracy": 0.7700846195220947, "num_tokens": 14415760.0, "step": 3980 }, { "entropy": 0.5710401386022568, "epoch": 3.713952403173122, "grad_norm": 0.35566768050193787, "learning_rate": 0.0002, "loss": 0.5712, "mean_token_accuracy": 0.7714456617832184, "num_tokens": 14419396.0, "step": 3981 }, { "entropy": 0.6315086185932159, "epoch": 3.714885674288381, "grad_norm": 0.3620493412017822, "learning_rate": 0.0002, "loss": 0.6314, "mean_token_accuracy": 0.7484363168478012, "num_tokens": 14423050.0, "step": 3982 }, { "entropy": 0.5846328884363174, "epoch": 3.71581894540364, "grad_norm": 0.3326133191585541, "learning_rate": 0.0002, "loss": 0.5827, "mean_token_accuracy": 0.7668561339378357, "num_tokens": 14426708.0, "step": 3983 }, { "entropy": 0.5739267021417618, "epoch": 3.716752216518899, "grad_norm": 0.38399457931518555, "learning_rate": 0.0002, "loss": 0.5742, "mean_token_accuracy": 0.7689763307571411, "num_tokens": 14430365.0, "step": 3984 }, { "entropy": 0.5906878560781479, "epoch": 3.717685487634158, "grad_norm": 0.37404555082321167, "learning_rate": 0.0002, "loss": 0.6036, "mean_token_accuracy": 0.7482836097478867, "num_tokens": 14433928.0, "step": 3985 }, { "entropy": 0.6306973844766617, "epoch": 3.718618758749417, "grad_norm": 0.33447521924972534, "learning_rate": 0.0002, "loss": 0.6302, "mean_token_accuracy": 0.748205840587616, "num_tokens": 14437698.0, "step": 3986 }, { "entropy": 0.591068908572197, "epoch": 3.719552029864676, "grad_norm": 0.34279242157936096, "learning_rate": 0.0002, "loss": 0.5905, "mean_token_accuracy": 0.7556974291801453, "num_tokens": 14441261.0, "step": 3987 }, { "entropy": 0.5979397743940353, "epoch": 3.720485300979935, "grad_norm": 0.3966573178768158, "learning_rate": 0.0002, "loss": 0.6055, "mean_token_accuracy": 0.7595078647136688, "num_tokens": 14444896.0, "step": 3988 }, { "entropy": 0.5770508497953415, "epoch": 3.7214185720951938, "grad_norm": 0.3139202296733856, "learning_rate": 0.0002, "loss": 0.5882, "mean_token_accuracy": 0.7648147493600845, "num_tokens": 14448348.0, "step": 3989 }, { "entropy": 0.5743088573217392, "epoch": 3.7223518432104528, "grad_norm": 0.3046020269393921, "learning_rate": 0.0002, "loss": 0.5648, "mean_token_accuracy": 0.7745606303215027, "num_tokens": 14452009.0, "step": 3990 }, { "entropy": 0.5735203772783279, "epoch": 3.7232851143257117, "grad_norm": 0.32074153423309326, "learning_rate": 0.0002, "loss": 0.5718, "mean_token_accuracy": 0.7711732685565948, "num_tokens": 14455677.0, "step": 3991 }, { "entropy": 0.5815156698226929, "epoch": 3.7242183854409707, "grad_norm": 0.34204304218292236, "learning_rate": 0.0002, "loss": 0.5778, "mean_token_accuracy": 0.7699456810951233, "num_tokens": 14459206.0, "step": 3992 }, { "entropy": 0.5692028403282166, "epoch": 3.7251516565562297, "grad_norm": 0.3061155378818512, "learning_rate": 0.0002, "loss": 0.5733, "mean_token_accuracy": 0.7693729251623154, "num_tokens": 14462891.0, "step": 3993 }, { "entropy": 0.5764668732881546, "epoch": 3.7260849276714887, "grad_norm": 0.33081454038619995, "learning_rate": 0.0002, "loss": 0.5937, "mean_token_accuracy": 0.7620121538639069, "num_tokens": 14466439.0, "step": 3994 }, { "entropy": 0.5793246775865555, "epoch": 3.7270181987867477, "grad_norm": 0.4115839898586273, "learning_rate": 0.0002, "loss": 0.5976, "mean_token_accuracy": 0.7595193535089493, "num_tokens": 14469910.0, "step": 3995 }, { "entropy": 0.5170691162347794, "epoch": 3.7279514699020067, "grad_norm": 0.3682936728000641, "learning_rate": 0.0002, "loss": 0.5237, "mean_token_accuracy": 0.7939295023679733, "num_tokens": 14473513.0, "step": 3996 }, { "entropy": 0.6005477160215378, "epoch": 3.7288847410172656, "grad_norm": 0.31517061591148376, "learning_rate": 0.0002, "loss": 0.6019, "mean_token_accuracy": 0.7565909028053284, "num_tokens": 14477116.0, "step": 3997 }, { "entropy": 0.5723210275173187, "epoch": 3.7298180121325246, "grad_norm": 0.2900746464729309, "learning_rate": 0.0002, "loss": 0.5685, "mean_token_accuracy": 0.7699611932039261, "num_tokens": 14480618.0, "step": 3998 }, { "entropy": 0.5569671392440796, "epoch": 3.7307512832477836, "grad_norm": 0.305835098028183, "learning_rate": 0.0002, "loss": 0.5558, "mean_token_accuracy": 0.7721687257289886, "num_tokens": 14484306.0, "step": 3999 }, { "entropy": 0.5954722911119461, "epoch": 3.7316845543630426, "grad_norm": 0.4028019607067108, "learning_rate": 0.0002, "loss": 0.6053, "mean_token_accuracy": 0.7564237415790558, "num_tokens": 14487933.0, "step": 4000 }, { "entropy": 0.5705859661102295, "epoch": 3.7326178254783016, "grad_norm": 0.33902186155319214, "learning_rate": 0.0002, "loss": 0.5739, "mean_token_accuracy": 0.7672070562839508, "num_tokens": 14491589.0, "step": 4001 }, { "entropy": 0.5917544066905975, "epoch": 3.7335510965935605, "grad_norm": 0.37106117606163025, "learning_rate": 0.0002, "loss": 0.6072, "mean_token_accuracy": 0.7489840686321259, "num_tokens": 14495129.0, "step": 4002 }, { "entropy": 0.5871383547782898, "epoch": 3.7344843677088195, "grad_norm": 0.32036879658699036, "learning_rate": 0.0002, "loss": 0.5964, "mean_token_accuracy": 0.7578097432851791, "num_tokens": 14498770.0, "step": 4003 }, { "entropy": 0.5726061314344406, "epoch": 3.7354176388240785, "grad_norm": 0.37192049622535706, "learning_rate": 0.0002, "loss": 0.5759, "mean_token_accuracy": 0.7722023278474808, "num_tokens": 14502397.0, "step": 4004 }, { "entropy": 0.5778686553239822, "epoch": 3.7363509099393375, "grad_norm": 0.31737419962882996, "learning_rate": 0.0002, "loss": 0.5776, "mean_token_accuracy": 0.7625727355480194, "num_tokens": 14506033.0, "step": 4005 }, { "entropy": 0.5695561319589615, "epoch": 3.7372841810545965, "grad_norm": 0.35127022862434387, "learning_rate": 0.0002, "loss": 0.561, "mean_token_accuracy": 0.7730983346700668, "num_tokens": 14509548.0, "step": 4006 }, { "entropy": 0.5815680027008057, "epoch": 3.7382174521698555, "grad_norm": 0.2950612008571625, "learning_rate": 0.0002, "loss": 0.5697, "mean_token_accuracy": 0.7691705524921417, "num_tokens": 14513263.0, "step": 4007 }, { "entropy": 0.6254215836524963, "epoch": 3.7391507232851144, "grad_norm": 0.3660946786403656, "learning_rate": 0.0002, "loss": 0.6186, "mean_token_accuracy": 0.7513529807329178, "num_tokens": 14517006.0, "step": 4008 }, { "entropy": 0.5714662969112396, "epoch": 3.7400839944003734, "grad_norm": 0.38694703578948975, "learning_rate": 0.0002, "loss": 0.5829, "mean_token_accuracy": 0.7668000906705856, "num_tokens": 14520552.0, "step": 4009 }, { "entropy": 0.5833796113729477, "epoch": 3.7410172655156324, "grad_norm": 0.34303152561187744, "learning_rate": 0.0002, "loss": 0.5825, "mean_token_accuracy": 0.7626402974128723, "num_tokens": 14524218.0, "step": 4010 }, { "entropy": 0.6043826937675476, "epoch": 3.7419505366308914, "grad_norm": 0.29866811633110046, "learning_rate": 0.0002, "loss": 0.6124, "mean_token_accuracy": 0.7548088878393173, "num_tokens": 14527957.0, "step": 4011 }, { "entropy": 0.5650615096092224, "epoch": 3.7428838077461504, "grad_norm": 0.4060649871826172, "learning_rate": 0.0002, "loss": 0.5761, "mean_token_accuracy": 0.7639009356498718, "num_tokens": 14531589.0, "step": 4012 }, { "entropy": 0.5595886930823326, "epoch": 3.7438170788614094, "grad_norm": 0.33777090907096863, "learning_rate": 0.0002, "loss": 0.5833, "mean_token_accuracy": 0.7615832984447479, "num_tokens": 14535115.0, "step": 4013 }, { "entropy": 0.5696531683206558, "epoch": 3.7447503499766683, "grad_norm": 0.3404829800128937, "learning_rate": 0.0002, "loss": 0.5729, "mean_token_accuracy": 0.7662468254566193, "num_tokens": 14538668.0, "step": 4014 }, { "entropy": 0.5712669938802719, "epoch": 3.7456836210919273, "grad_norm": 0.4129889905452728, "learning_rate": 0.0002, "loss": 0.5806, "mean_token_accuracy": 0.7716949880123138, "num_tokens": 14542202.0, "step": 4015 }, { "entropy": 0.5587000101804733, "epoch": 3.7466168922071863, "grad_norm": 0.29060137271881104, "learning_rate": 0.0002, "loss": 0.5534, "mean_token_accuracy": 0.7726069688796997, "num_tokens": 14545725.0, "step": 4016 }, { "entropy": 0.5979063361883163, "epoch": 3.7475501633224453, "grad_norm": 0.29507309198379517, "learning_rate": 0.0002, "loss": 0.5925, "mean_token_accuracy": 0.7608896940946579, "num_tokens": 14549447.0, "step": 4017 }, { "entropy": 0.5921023339033127, "epoch": 3.7484834344377043, "grad_norm": 0.3719669282436371, "learning_rate": 0.0002, "loss": 0.5933, "mean_token_accuracy": 0.7593921571969986, "num_tokens": 14552929.0, "step": 4018 }, { "entropy": 0.5834296941757202, "epoch": 3.7494167055529632, "grad_norm": 0.31040552258491516, "learning_rate": 0.0002, "loss": 0.5874, "mean_token_accuracy": 0.761771023273468, "num_tokens": 14556535.0, "step": 4019 }, { "entropy": 0.5774352550506592, "epoch": 3.7503499766682222, "grad_norm": 0.3337206244468689, "learning_rate": 0.0002, "loss": 0.5881, "mean_token_accuracy": 0.7656038850545883, "num_tokens": 14560243.0, "step": 4020 }, { "entropy": 0.5444851890206337, "epoch": 3.751283247783481, "grad_norm": 0.3869938254356384, "learning_rate": 0.0002, "loss": 0.5579, "mean_token_accuracy": 0.775994211435318, "num_tokens": 14563720.0, "step": 4021 }, { "entropy": 0.5539983510971069, "epoch": 3.75221651889874, "grad_norm": 0.3796078562736511, "learning_rate": 0.0002, "loss": 0.5714, "mean_token_accuracy": 0.77196304500103, "num_tokens": 14567330.0, "step": 4022 }, { "entropy": 0.5784531757235527, "epoch": 3.753149790013999, "grad_norm": 0.28997471928596497, "learning_rate": 0.0002, "loss": 0.5702, "mean_token_accuracy": 0.7672252058982849, "num_tokens": 14570921.0, "step": 4023 }, { "entropy": 0.6017835140228271, "epoch": 3.754083061129258, "grad_norm": 0.3298400640487671, "learning_rate": 0.0002, "loss": 0.6048, "mean_token_accuracy": 0.7489300966262817, "num_tokens": 14574474.0, "step": 4024 }, { "entropy": 0.5803540050983429, "epoch": 3.755016332244517, "grad_norm": 0.32622745633125305, "learning_rate": 0.0002, "loss": 0.58, "mean_token_accuracy": 0.769756093621254, "num_tokens": 14578169.0, "step": 4025 }, { "entropy": 0.5501172095537186, "epoch": 3.755949603359776, "grad_norm": 0.3827906548976898, "learning_rate": 0.0002, "loss": 0.5604, "mean_token_accuracy": 0.7667605727910995, "num_tokens": 14581616.0, "step": 4026 }, { "entropy": 0.5782746076583862, "epoch": 3.756882874475035, "grad_norm": 0.38510215282440186, "learning_rate": 0.0002, "loss": 0.5862, "mean_token_accuracy": 0.7584515511989594, "num_tokens": 14585170.0, "step": 4027 }, { "entropy": 0.5807192623615265, "epoch": 3.757816145590294, "grad_norm": 0.3442402482032776, "learning_rate": 0.0002, "loss": 0.5967, "mean_token_accuracy": 0.7587011605501175, "num_tokens": 14588847.0, "step": 4028 }, { "entropy": 0.5972743034362793, "epoch": 3.758749416705553, "grad_norm": 0.2997773587703705, "learning_rate": 0.0002, "loss": 0.5908, "mean_token_accuracy": 0.756899505853653, "num_tokens": 14592416.0, "step": 4029 }, { "entropy": 0.5680601745843887, "epoch": 3.759682687820812, "grad_norm": 0.39949482679367065, "learning_rate": 0.0002, "loss": 0.5614, "mean_token_accuracy": 0.7724571079015732, "num_tokens": 14595871.0, "step": 4030 }, { "entropy": 0.5775506645441055, "epoch": 3.760615958936071, "grad_norm": 0.3184582591056824, "learning_rate": 0.0002, "loss": 0.5752, "mean_token_accuracy": 0.7661296725273132, "num_tokens": 14599486.0, "step": 4031 }, { "entropy": 0.5709557086229324, "epoch": 3.76154923005133, "grad_norm": 0.35792890191078186, "learning_rate": 0.0002, "loss": 0.583, "mean_token_accuracy": 0.7640889137983322, "num_tokens": 14603166.0, "step": 4032 }, { "entropy": 0.5847548246383667, "epoch": 3.762482501166589, "grad_norm": 0.35616615414619446, "learning_rate": 0.0002, "loss": 0.5832, "mean_token_accuracy": 0.7574669122695923, "num_tokens": 14606843.0, "step": 4033 }, { "entropy": 0.5811756178736687, "epoch": 3.763415772281848, "grad_norm": 0.36563944816589355, "learning_rate": 0.0002, "loss": 0.585, "mean_token_accuracy": 0.7629151195287704, "num_tokens": 14610435.0, "step": 4034 }, { "entropy": 0.5796699225902557, "epoch": 3.764349043397107, "grad_norm": 0.31135398149490356, "learning_rate": 0.0002, "loss": 0.5856, "mean_token_accuracy": 0.7627735137939453, "num_tokens": 14614145.0, "step": 4035 }, { "entropy": 0.5964766293764114, "epoch": 3.765282314512366, "grad_norm": 0.5123640894889832, "learning_rate": 0.0002, "loss": 0.6161, "mean_token_accuracy": 0.7507667988538742, "num_tokens": 14617732.0, "step": 4036 }, { "entropy": 0.5775121003389359, "epoch": 3.766215585627625, "grad_norm": 0.3408767580986023, "learning_rate": 0.0002, "loss": 0.5826, "mean_token_accuracy": 0.765381783246994, "num_tokens": 14621425.0, "step": 4037 }, { "entropy": 0.5969468802213669, "epoch": 3.767148856742884, "grad_norm": 0.34183210134506226, "learning_rate": 0.0002, "loss": 0.5988, "mean_token_accuracy": 0.754180371761322, "num_tokens": 14625037.0, "step": 4038 }, { "entropy": 0.5794012695550919, "epoch": 3.768082127858143, "grad_norm": 0.34756866097450256, "learning_rate": 0.0002, "loss": 0.5732, "mean_token_accuracy": 0.7678795903921127, "num_tokens": 14628759.0, "step": 4039 }, { "entropy": 0.5804010033607483, "epoch": 3.769015398973402, "grad_norm": 0.3164600729942322, "learning_rate": 0.0002, "loss": 0.5744, "mean_token_accuracy": 0.7673238962888718, "num_tokens": 14632289.0, "step": 4040 }, { "entropy": 0.576421931385994, "epoch": 3.769948670088661, "grad_norm": 0.3698291480541229, "learning_rate": 0.0002, "loss": 0.5789, "mean_token_accuracy": 0.7642203718423843, "num_tokens": 14635794.0, "step": 4041 }, { "entropy": 0.572257786989212, "epoch": 3.77088194120392, "grad_norm": 0.3161434829235077, "learning_rate": 0.0002, "loss": 0.562, "mean_token_accuracy": 0.7722362726926804, "num_tokens": 14639405.0, "step": 4042 }, { "entropy": 0.5769276469945908, "epoch": 3.771815212319179, "grad_norm": 0.3810981810092926, "learning_rate": 0.0002, "loss": 0.5851, "mean_token_accuracy": 0.7682319283485413, "num_tokens": 14642940.0, "step": 4043 }, { "entropy": 0.5943052172660828, "epoch": 3.772748483434438, "grad_norm": 0.39821797609329224, "learning_rate": 0.0002, "loss": 0.6065, "mean_token_accuracy": 0.7556962221860886, "num_tokens": 14646621.0, "step": 4044 }, { "entropy": 0.5628055483102798, "epoch": 3.773681754549697, "grad_norm": 0.3872489631175995, "learning_rate": 0.0002, "loss": 0.5821, "mean_token_accuracy": 0.7692348957061768, "num_tokens": 14650250.0, "step": 4045 }, { "entropy": 0.5604529231786728, "epoch": 3.7746150256649558, "grad_norm": 0.3493027091026306, "learning_rate": 0.0002, "loss": 0.5638, "mean_token_accuracy": 0.777432382106781, "num_tokens": 14653912.0, "step": 4046 }, { "entropy": 0.5683268159627914, "epoch": 3.7755482967802148, "grad_norm": 0.3735898435115814, "learning_rate": 0.0002, "loss": 0.5739, "mean_token_accuracy": 0.7700846642255783, "num_tokens": 14657556.0, "step": 4047 }, { "entropy": 0.6233088672161102, "epoch": 3.7764815678954737, "grad_norm": 0.46428725123405457, "learning_rate": 0.0002, "loss": 0.6331, "mean_token_accuracy": 0.7468466460704803, "num_tokens": 14661206.0, "step": 4048 }, { "entropy": 0.5803075879812241, "epoch": 3.7774148390107327, "grad_norm": 0.35695838928222656, "learning_rate": 0.0002, "loss": 0.5851, "mean_token_accuracy": 0.7672312408685684, "num_tokens": 14664835.0, "step": 4049 }, { "entropy": 0.6088456809520721, "epoch": 3.7783481101259917, "grad_norm": 0.3064866364002228, "learning_rate": 0.0002, "loss": 0.6024, "mean_token_accuracy": 0.7580238431692123, "num_tokens": 14668485.0, "step": 4050 }, { "entropy": 0.5957548767328262, "epoch": 3.7792813812412507, "grad_norm": 0.29909783601760864, "learning_rate": 0.0002, "loss": 0.5912, "mean_token_accuracy": 0.7569977194070816, "num_tokens": 14672249.0, "step": 4051 }, { "entropy": 0.5614382475614548, "epoch": 3.7802146523565097, "grad_norm": 0.34233444929122925, "learning_rate": 0.0002, "loss": 0.5567, "mean_token_accuracy": 0.7746280878782272, "num_tokens": 14675825.0, "step": 4052 }, { "entropy": 0.5926666706800461, "epoch": 3.7811479234717686, "grad_norm": 0.3477368950843811, "learning_rate": 0.0002, "loss": 0.6036, "mean_token_accuracy": 0.7609052360057831, "num_tokens": 14679445.0, "step": 4053 }, { "entropy": 0.5819146186113358, "epoch": 3.7820811945870276, "grad_norm": 0.28524795174598694, "learning_rate": 0.0002, "loss": 0.5805, "mean_token_accuracy": 0.7643128931522369, "num_tokens": 14682970.0, "step": 4054 }, { "entropy": 0.6094800233840942, "epoch": 3.7830144657022866, "grad_norm": 0.3252614736557007, "learning_rate": 0.0002, "loss": 0.6029, "mean_token_accuracy": 0.7553646266460419, "num_tokens": 14686673.0, "step": 4055 }, { "entropy": 0.5854882150888443, "epoch": 3.7839477368175456, "grad_norm": 0.37494269013404846, "learning_rate": 0.0002, "loss": 0.5958, "mean_token_accuracy": 0.7557459473609924, "num_tokens": 14690260.0, "step": 4056 }, { "entropy": 0.5987108200788498, "epoch": 3.7848810079328046, "grad_norm": 0.344169020652771, "learning_rate": 0.0002, "loss": 0.6017, "mean_token_accuracy": 0.7638267874717712, "num_tokens": 14693976.0, "step": 4057 }, { "entropy": 0.5655094087123871, "epoch": 3.7858142790480636, "grad_norm": 0.3680334985256195, "learning_rate": 0.0002, "loss": 0.5807, "mean_token_accuracy": 0.7653078436851501, "num_tokens": 14697540.0, "step": 4058 }, { "entropy": 0.5840834230184555, "epoch": 3.7867475501633225, "grad_norm": 0.34128302335739136, "learning_rate": 0.0002, "loss": 0.6105, "mean_token_accuracy": 0.7574243247509003, "num_tokens": 14701182.0, "step": 4059 }, { "entropy": 0.5672972053289413, "epoch": 3.7876808212785815, "grad_norm": 0.35274648666381836, "learning_rate": 0.0002, "loss": 0.5744, "mean_token_accuracy": 0.7699091583490372, "num_tokens": 14704791.0, "step": 4060 }, { "entropy": 0.5477928072214127, "epoch": 3.7886140923938405, "grad_norm": 0.40023207664489746, "learning_rate": 0.0002, "loss": 0.5642, "mean_token_accuracy": 0.7653950899839401, "num_tokens": 14708337.0, "step": 4061 }, { "entropy": 0.5777410417795181, "epoch": 3.7895473635090995, "grad_norm": 0.3645648658275604, "learning_rate": 0.0002, "loss": 0.5834, "mean_token_accuracy": 0.7644944339990616, "num_tokens": 14711932.0, "step": 4062 }, { "entropy": 0.6238599270582199, "epoch": 3.7904806346243585, "grad_norm": 0.33629801869392395, "learning_rate": 0.0002, "loss": 0.6212, "mean_token_accuracy": 0.7556043118238449, "num_tokens": 14715525.0, "step": 4063 }, { "entropy": 0.5511471033096313, "epoch": 3.7914139057396175, "grad_norm": 0.3315882384777069, "learning_rate": 0.0002, "loss": 0.5506, "mean_token_accuracy": 0.7743772715330124, "num_tokens": 14719017.0, "step": 4064 }, { "entropy": 0.6167783141136169, "epoch": 3.7923471768548764, "grad_norm": 0.3166919946670532, "learning_rate": 0.0002, "loss": 0.607, "mean_token_accuracy": 0.7574209272861481, "num_tokens": 14722604.0, "step": 4065 }, { "entropy": 0.6340991109609604, "epoch": 3.7932804479701354, "grad_norm": 0.3299112617969513, "learning_rate": 0.0002, "loss": 0.6165, "mean_token_accuracy": 0.7527620494365692, "num_tokens": 14726378.0, "step": 4066 }, { "entropy": 0.5433174073696136, "epoch": 3.7942137190853944, "grad_norm": 0.25898340344429016, "learning_rate": 0.0002, "loss": 0.5409, "mean_token_accuracy": 0.7834035158157349, "num_tokens": 14730118.0, "step": 4067 }, { "entropy": 0.5709274709224701, "epoch": 3.7951469902006534, "grad_norm": 0.37802571058273315, "learning_rate": 0.0002, "loss": 0.5754, "mean_token_accuracy": 0.7682987749576569, "num_tokens": 14733685.0, "step": 4068 }, { "entropy": 0.5617736577987671, "epoch": 3.7960802613159124, "grad_norm": 0.3631581962108612, "learning_rate": 0.0002, "loss": 0.5761, "mean_token_accuracy": 0.7648698091506958, "num_tokens": 14737254.0, "step": 4069 }, { "entropy": 0.5838818848133087, "epoch": 3.7970135324311713, "grad_norm": 0.33635908365249634, "learning_rate": 0.0002, "loss": 0.5943, "mean_token_accuracy": 0.7600345611572266, "num_tokens": 14740887.0, "step": 4070 }, { "entropy": 0.5803457498550415, "epoch": 3.7979468035464303, "grad_norm": 0.4098087251186371, "learning_rate": 0.0002, "loss": 0.5897, "mean_token_accuracy": 0.7600945830345154, "num_tokens": 14744444.0, "step": 4071 }, { "entropy": 0.6008616238832474, "epoch": 3.7988800746616893, "grad_norm": 0.3984472155570984, "learning_rate": 0.0002, "loss": 0.6108, "mean_token_accuracy": 0.7514693439006805, "num_tokens": 14748252.0, "step": 4072 }, { "entropy": 0.5792517066001892, "epoch": 3.7998133457769483, "grad_norm": 0.3495924770832062, "learning_rate": 0.0002, "loss": 0.574, "mean_token_accuracy": 0.769722580909729, "num_tokens": 14751956.0, "step": 4073 }, { "entropy": 0.5613935813307762, "epoch": 3.8007466168922073, "grad_norm": 0.2729974389076233, "learning_rate": 0.0002, "loss": 0.5622, "mean_token_accuracy": 0.7699548155069351, "num_tokens": 14755717.0, "step": 4074 }, { "entropy": 0.5948241204023361, "epoch": 3.8016798880074663, "grad_norm": 0.440076619386673, "learning_rate": 0.0002, "loss": 0.5999, "mean_token_accuracy": 0.7606780678033829, "num_tokens": 14759383.0, "step": 4075 }, { "entropy": 0.6108729392290115, "epoch": 3.8026131591227252, "grad_norm": 0.3577395975589752, "learning_rate": 0.0002, "loss": 0.6015, "mean_token_accuracy": 0.7562578022480011, "num_tokens": 14763099.0, "step": 4076 }, { "entropy": 0.5552002042531967, "epoch": 3.803546430237984, "grad_norm": 0.35825473070144653, "learning_rate": 0.0002, "loss": 0.562, "mean_token_accuracy": 0.7669911086559296, "num_tokens": 14766709.0, "step": 4077 }, { "entropy": 0.5932480543851852, "epoch": 3.804479701353243, "grad_norm": 0.33458730578422546, "learning_rate": 0.0002, "loss": 0.5934, "mean_token_accuracy": 0.7566622495651245, "num_tokens": 14770245.0, "step": 4078 }, { "entropy": 0.5695033818483353, "epoch": 3.805412972468502, "grad_norm": 0.34568044543266296, "learning_rate": 0.0002, "loss": 0.5694, "mean_token_accuracy": 0.7684505879878998, "num_tokens": 14773724.0, "step": 4079 }, { "entropy": 0.6206734329462051, "epoch": 3.806346243583761, "grad_norm": 0.41628795862197876, "learning_rate": 0.0002, "loss": 0.6316, "mean_token_accuracy": 0.7505240142345428, "num_tokens": 14777426.0, "step": 4080 }, { "entropy": 0.5762177854776382, "epoch": 3.80727951469902, "grad_norm": 0.42529913783073425, "learning_rate": 0.0002, "loss": 0.5848, "mean_token_accuracy": 0.7675616145133972, "num_tokens": 14781103.0, "step": 4081 }, { "entropy": 0.6001671254634857, "epoch": 3.808212785814279, "grad_norm": 0.4630299508571625, "learning_rate": 0.0002, "loss": 0.6148, "mean_token_accuracy": 0.7539022862911224, "num_tokens": 14784679.0, "step": 4082 }, { "entropy": 0.571804866194725, "epoch": 3.809146056929538, "grad_norm": 0.3361251950263977, "learning_rate": 0.0002, "loss": 0.5747, "mean_token_accuracy": 0.7646249085664749, "num_tokens": 14788264.0, "step": 4083 }, { "entropy": 0.5800113081932068, "epoch": 3.810079328044797, "grad_norm": 0.33688685297966003, "learning_rate": 0.0002, "loss": 0.581, "mean_token_accuracy": 0.7679096311330795, "num_tokens": 14791923.0, "step": 4084 }, { "entropy": 0.5694443136453629, "epoch": 3.811012599160056, "grad_norm": 0.3451668620109558, "learning_rate": 0.0002, "loss": 0.5642, "mean_token_accuracy": 0.7688590586185455, "num_tokens": 14795593.0, "step": 4085 }, { "entropy": 0.6103632152080536, "epoch": 3.811945870275315, "grad_norm": 0.3355207145214081, "learning_rate": 0.0002, "loss": 0.6052, "mean_token_accuracy": 0.7569137513637543, "num_tokens": 14799177.0, "step": 4086 }, { "entropy": 0.6124494522809982, "epoch": 3.812879141390574, "grad_norm": 0.32976776361465454, "learning_rate": 0.0002, "loss": 0.6124, "mean_token_accuracy": 0.7581565529108047, "num_tokens": 14802822.0, "step": 4087 }, { "entropy": 0.5715620592236519, "epoch": 3.813812412505833, "grad_norm": 0.3102185130119324, "learning_rate": 0.0002, "loss": 0.5674, "mean_token_accuracy": 0.7782368063926697, "num_tokens": 14806460.0, "step": 4088 }, { "entropy": 0.540314793586731, "epoch": 3.814745683621092, "grad_norm": 0.3058367073535919, "learning_rate": 0.0002, "loss": 0.5441, "mean_token_accuracy": 0.7811140865087509, "num_tokens": 14809950.0, "step": 4089 }, { "entropy": 0.6001714766025543, "epoch": 3.815678954736351, "grad_norm": 0.32235103845596313, "learning_rate": 0.0002, "loss": 0.6033, "mean_token_accuracy": 0.7524544894695282, "num_tokens": 14813660.0, "step": 4090 }, { "entropy": 0.6226836442947388, "epoch": 3.81661222585161, "grad_norm": 0.35856935381889343, "learning_rate": 0.0002, "loss": 0.636, "mean_token_accuracy": 0.7401200830936432, "num_tokens": 14817306.0, "step": 4091 }, { "entropy": 0.5771284252405167, "epoch": 3.817545496966869, "grad_norm": 0.34253162145614624, "learning_rate": 0.0002, "loss": 0.5855, "mean_token_accuracy": 0.7612587362527847, "num_tokens": 14820877.0, "step": 4092 }, { "entropy": 0.5621858686208725, "epoch": 3.818478768082128, "grad_norm": 0.3636613190174103, "learning_rate": 0.0002, "loss": 0.5787, "mean_token_accuracy": 0.7680647373199463, "num_tokens": 14824506.0, "step": 4093 }, { "entropy": 0.5627840012311935, "epoch": 3.819412039197387, "grad_norm": 0.33842653036117554, "learning_rate": 0.0002, "loss": 0.5719, "mean_token_accuracy": 0.7706868499517441, "num_tokens": 14828152.0, "step": 4094 }, { "entropy": 0.6052551567554474, "epoch": 3.820345310312646, "grad_norm": 0.37052327394485474, "learning_rate": 0.0002, "loss": 0.6224, "mean_token_accuracy": 0.7467842251062393, "num_tokens": 14831843.0, "step": 4095 }, { "entropy": 0.6045450419187546, "epoch": 3.821278581427905, "grad_norm": 0.3221755623817444, "learning_rate": 0.0002, "loss": 0.6128, "mean_token_accuracy": 0.7538949251174927, "num_tokens": 14835410.0, "step": 4096 }, { "entropy": 0.5959164500236511, "epoch": 3.822211852543164, "grad_norm": 0.3274252712726593, "learning_rate": 0.0002, "loss": 0.5826, "mean_token_accuracy": 0.7708679288625717, "num_tokens": 14839025.0, "step": 4097 }, { "entropy": 0.6140446215867996, "epoch": 3.823145123658423, "grad_norm": 0.3185279965400696, "learning_rate": 0.0002, "loss": 0.6084, "mean_token_accuracy": 0.7553866654634476, "num_tokens": 14842617.0, "step": 4098 }, { "entropy": 0.5423814952373505, "epoch": 3.824078394773682, "grad_norm": 0.31900373101234436, "learning_rate": 0.0002, "loss": 0.5359, "mean_token_accuracy": 0.780858039855957, "num_tokens": 14846207.0, "step": 4099 }, { "entropy": 0.5724018067121506, "epoch": 3.825011665888941, "grad_norm": 0.3199213147163391, "learning_rate": 0.0002, "loss": 0.5651, "mean_token_accuracy": 0.7726239114999771, "num_tokens": 14849821.0, "step": 4100 }, { "entropy": 0.5926460176706314, "epoch": 3.8259449370042, "grad_norm": 0.46325090527534485, "learning_rate": 0.0002, "loss": 0.5994, "mean_token_accuracy": 0.7613843977451324, "num_tokens": 14853268.0, "step": 4101 }, { "entropy": 0.6293675750494003, "epoch": 3.826878208119459, "grad_norm": 0.3104851245880127, "learning_rate": 0.0002, "loss": 0.6323, "mean_token_accuracy": 0.7448960691690445, "num_tokens": 14857037.0, "step": 4102 }, { "entropy": 0.5732813030481339, "epoch": 3.8278114792347178, "grad_norm": 0.3613507151603699, "learning_rate": 0.0002, "loss": 0.5863, "mean_token_accuracy": 0.7616142481565475, "num_tokens": 14860645.0, "step": 4103 }, { "entropy": 0.5642653033137321, "epoch": 3.8287447503499767, "grad_norm": 0.3770686686038971, "learning_rate": 0.0002, "loss": 0.5779, "mean_token_accuracy": 0.7728495597839355, "num_tokens": 14864208.0, "step": 4104 }, { "entropy": 0.5531920194625854, "epoch": 3.8296780214652357, "grad_norm": 0.3443024158477783, "learning_rate": 0.0002, "loss": 0.5721, "mean_token_accuracy": 0.7699717730283737, "num_tokens": 14867821.0, "step": 4105 }, { "entropy": 0.5959101617336273, "epoch": 3.8306112925804947, "grad_norm": 0.3726497292518616, "learning_rate": 0.0002, "loss": 0.6157, "mean_token_accuracy": 0.7492837011814117, "num_tokens": 14871468.0, "step": 4106 }, { "entropy": 0.550059862434864, "epoch": 3.8315445636957537, "grad_norm": 0.3308662474155426, "learning_rate": 0.0002, "loss": 0.5525, "mean_token_accuracy": 0.7785928845405579, "num_tokens": 14875147.0, "step": 4107 }, { "entropy": 0.5862789452075958, "epoch": 3.8324778348110127, "grad_norm": 0.4026295244693756, "learning_rate": 0.0002, "loss": 0.5891, "mean_token_accuracy": 0.7639085352420807, "num_tokens": 14878652.0, "step": 4108 }, { "entropy": 0.5928833335638046, "epoch": 3.8334111059262717, "grad_norm": 0.31790632009506226, "learning_rate": 0.0002, "loss": 0.5869, "mean_token_accuracy": 0.7657425999641418, "num_tokens": 14882258.0, "step": 4109 }, { "entropy": 0.5977003127336502, "epoch": 3.8343443770415306, "grad_norm": 0.34242525696754456, "learning_rate": 0.0002, "loss": 0.5937, "mean_token_accuracy": 0.7588139474391937, "num_tokens": 14885755.0, "step": 4110 }, { "entropy": 0.6031154096126556, "epoch": 3.8352776481567896, "grad_norm": 0.35556232929229736, "learning_rate": 0.0002, "loss": 0.6034, "mean_token_accuracy": 0.7542014569044113, "num_tokens": 14889278.0, "step": 4111 }, { "entropy": 0.5997671335935593, "epoch": 3.8362109192720486, "grad_norm": 0.3111479580402374, "learning_rate": 0.0002, "loss": 0.5843, "mean_token_accuracy": 0.7568405419588089, "num_tokens": 14892975.0, "step": 4112 }, { "entropy": 0.5471038967370987, "epoch": 3.8371441903873076, "grad_norm": 0.2689560055732727, "learning_rate": 0.0002, "loss": 0.54, "mean_token_accuracy": 0.783947303891182, "num_tokens": 14896671.0, "step": 4113 }, { "entropy": 0.5766041278839111, "epoch": 3.8380774615025666, "grad_norm": 0.32285672426223755, "learning_rate": 0.0002, "loss": 0.579, "mean_token_accuracy": 0.7718513160943985, "num_tokens": 14900128.0, "step": 4114 }, { "entropy": 0.5824575275182724, "epoch": 3.8390107326178255, "grad_norm": 0.3500012755393982, "learning_rate": 0.0002, "loss": 0.5936, "mean_token_accuracy": 0.7596343159675598, "num_tokens": 14903725.0, "step": 4115 }, { "entropy": 0.5576942563056946, "epoch": 3.8399440037330845, "grad_norm": 0.3487767279148102, "learning_rate": 0.0002, "loss": 0.5601, "mean_token_accuracy": 0.7741268426179886, "num_tokens": 14907294.0, "step": 4116 }, { "entropy": 0.5371532514691353, "epoch": 3.8408772748483435, "grad_norm": 0.33387741446495056, "learning_rate": 0.0002, "loss": 0.5353, "mean_token_accuracy": 0.7829751521348953, "num_tokens": 14910878.0, "step": 4117 }, { "entropy": 0.5757485330104828, "epoch": 3.8418105459636025, "grad_norm": 0.3874761760234833, "learning_rate": 0.0002, "loss": 0.5975, "mean_token_accuracy": 0.7550171315670013, "num_tokens": 14914428.0, "step": 4118 }, { "entropy": 0.6051407158374786, "epoch": 3.8427438170788615, "grad_norm": 0.3767818212509155, "learning_rate": 0.0002, "loss": 0.6187, "mean_token_accuracy": 0.7519586235284805, "num_tokens": 14918058.0, "step": 4119 }, { "entropy": 0.5648772865533829, "epoch": 3.8436770881941205, "grad_norm": 0.33364006876945496, "learning_rate": 0.0002, "loss": 0.5724, "mean_token_accuracy": 0.7648481726646423, "num_tokens": 14921751.0, "step": 4120 }, { "entropy": 0.5756044685840607, "epoch": 3.8446103593093794, "grad_norm": 0.36959919333457947, "learning_rate": 0.0002, "loss": 0.5826, "mean_token_accuracy": 0.7632772773504257, "num_tokens": 14925345.0, "step": 4121 }, { "entropy": 0.5883417278528214, "epoch": 3.8455436304246384, "grad_norm": 0.30951714515686035, "learning_rate": 0.0002, "loss": 0.5847, "mean_token_accuracy": 0.7690014988183975, "num_tokens": 14928963.0, "step": 4122 }, { "entropy": 0.5754809975624084, "epoch": 3.8464769015398974, "grad_norm": 0.3519700765609741, "learning_rate": 0.0002, "loss": 0.5768, "mean_token_accuracy": 0.7675562649965286, "num_tokens": 14932536.0, "step": 4123 }, { "entropy": 0.5671976953744888, "epoch": 3.8474101726551564, "grad_norm": 0.36404091119766235, "learning_rate": 0.0002, "loss": 0.5795, "mean_token_accuracy": 0.7645154744386673, "num_tokens": 14936114.0, "step": 4124 }, { "entropy": 0.6233913451433182, "epoch": 3.8483434437704154, "grad_norm": 0.365436851978302, "learning_rate": 0.0002, "loss": 0.6277, "mean_token_accuracy": 0.7567429691553116, "num_tokens": 14939761.0, "step": 4125 }, { "entropy": 0.6083231568336487, "epoch": 3.8492767148856744, "grad_norm": 0.2847233712673187, "learning_rate": 0.0002, "loss": 0.6025, "mean_token_accuracy": 0.7556406706571579, "num_tokens": 14943473.0, "step": 4126 }, { "entropy": 0.5550829470157623, "epoch": 3.8502099860009333, "grad_norm": 0.2724111080169678, "learning_rate": 0.0002, "loss": 0.5546, "mean_token_accuracy": 0.7755290120840073, "num_tokens": 14947115.0, "step": 4127 }, { "entropy": 0.59567691385746, "epoch": 3.8511432571161923, "grad_norm": 0.32850730419158936, "learning_rate": 0.0002, "loss": 0.5911, "mean_token_accuracy": 0.7593120336532593, "num_tokens": 14950762.0, "step": 4128 }, { "entropy": 0.5984815806150436, "epoch": 3.8520765282314513, "grad_norm": 0.33364883065223694, "learning_rate": 0.0002, "loss": 0.6009, "mean_token_accuracy": 0.753802016377449, "num_tokens": 14954448.0, "step": 4129 }, { "entropy": 0.5895789712667465, "epoch": 3.8530097993467103, "grad_norm": 0.32979294657707214, "learning_rate": 0.0002, "loss": 0.5842, "mean_token_accuracy": 0.7606745213270187, "num_tokens": 14958018.0, "step": 4130 }, { "entropy": 0.6145337820053101, "epoch": 3.8539430704619693, "grad_norm": 0.3377334475517273, "learning_rate": 0.0002, "loss": 0.608, "mean_token_accuracy": 0.7528519779443741, "num_tokens": 14961693.0, "step": 4131 }, { "entropy": 0.6051406115293503, "epoch": 3.8548763415772282, "grad_norm": 0.310056209564209, "learning_rate": 0.0002, "loss": 0.6012, "mean_token_accuracy": 0.7553309351205826, "num_tokens": 14965427.0, "step": 4132 }, { "entropy": 0.5774888247251511, "epoch": 3.8558096126924872, "grad_norm": 0.3452215790748596, "learning_rate": 0.0002, "loss": 0.5833, "mean_token_accuracy": 0.76765076816082, "num_tokens": 14969112.0, "step": 4133 }, { "entropy": 0.5914076864719391, "epoch": 3.856742883807746, "grad_norm": 0.31338438391685486, "learning_rate": 0.0002, "loss": 0.586, "mean_token_accuracy": 0.7632959187030792, "num_tokens": 14972798.0, "step": 4134 }, { "entropy": 0.572572335600853, "epoch": 3.857676154923005, "grad_norm": 0.3621913492679596, "learning_rate": 0.0002, "loss": 0.5844, "mean_token_accuracy": 0.7674552500247955, "num_tokens": 14976464.0, "step": 4135 }, { "entropy": 0.5268637835979462, "epoch": 3.858609426038264, "grad_norm": 0.30436915159225464, "learning_rate": 0.0002, "loss": 0.5344, "mean_token_accuracy": 0.7817871570587158, "num_tokens": 14979975.0, "step": 4136 }, { "entropy": 0.5847120732069016, "epoch": 3.859542697153523, "grad_norm": 0.3711087107658386, "learning_rate": 0.0002, "loss": 0.595, "mean_token_accuracy": 0.7563194930553436, "num_tokens": 14983562.0, "step": 4137 }, { "entropy": 0.5560777187347412, "epoch": 3.860475968268782, "grad_norm": 0.39908644556999207, "learning_rate": 0.0002, "loss": 0.5735, "mean_token_accuracy": 0.7648957222700119, "num_tokens": 14987172.0, "step": 4138 }, { "entropy": 0.5899339914321899, "epoch": 3.861409239384041, "grad_norm": 0.38072508573532104, "learning_rate": 0.0002, "loss": 0.6042, "mean_token_accuracy": 0.7550695091485977, "num_tokens": 14990817.0, "step": 4139 }, { "entropy": 0.5653451681137085, "epoch": 3.8623425104993, "grad_norm": 0.3842276334762573, "learning_rate": 0.0002, "loss": 0.5855, "mean_token_accuracy": 0.7638968825340271, "num_tokens": 14994580.0, "step": 4140 }, { "entropy": 0.6027566492557526, "epoch": 3.863275781614559, "grad_norm": 0.33037370443344116, "learning_rate": 0.0002, "loss": 0.6134, "mean_token_accuracy": 0.7500583082437515, "num_tokens": 14998176.0, "step": 4141 }, { "entropy": 0.5784429758787155, "epoch": 3.864209052729818, "grad_norm": 0.3995838761329651, "learning_rate": 0.0002, "loss": 0.5973, "mean_token_accuracy": 0.7572029232978821, "num_tokens": 15001776.0, "step": 4142 }, { "entropy": 0.5859054774045944, "epoch": 3.865142323845077, "grad_norm": 0.3498653769493103, "learning_rate": 0.0002, "loss": 0.6038, "mean_token_accuracy": 0.7536239773035049, "num_tokens": 15005306.0, "step": 4143 }, { "entropy": 0.6067303717136383, "epoch": 3.866075594960336, "grad_norm": 0.3970213830471039, "learning_rate": 0.0002, "loss": 0.5946, "mean_token_accuracy": 0.7593587785959244, "num_tokens": 15008930.0, "step": 4144 }, { "entropy": 0.5617355555295944, "epoch": 3.867008866075595, "grad_norm": 0.3121927082538605, "learning_rate": 0.0002, "loss": 0.5524, "mean_token_accuracy": 0.7799769639968872, "num_tokens": 15012588.0, "step": 4145 }, { "entropy": 0.5760528892278671, "epoch": 3.867942137190854, "grad_norm": 0.31297603249549866, "learning_rate": 0.0002, "loss": 0.5683, "mean_token_accuracy": 0.7737076133489609, "num_tokens": 15016185.0, "step": 4146 }, { "entropy": 0.6035601943731308, "epoch": 3.868875408306113, "grad_norm": 0.2450142502784729, "learning_rate": 0.0002, "loss": 0.5804, "mean_token_accuracy": 0.7641141563653946, "num_tokens": 15019806.0, "step": 4147 }, { "entropy": 0.5810652673244476, "epoch": 3.869808679421372, "grad_norm": 0.2757302522659302, "learning_rate": 0.0002, "loss": 0.5718, "mean_token_accuracy": 0.7665317505598068, "num_tokens": 15023384.0, "step": 4148 }, { "entropy": 0.5638958364725113, "epoch": 3.870741950536631, "grad_norm": 0.3688391149044037, "learning_rate": 0.0002, "loss": 0.5818, "mean_token_accuracy": 0.7640593945980072, "num_tokens": 15027062.0, "step": 4149 }, { "entropy": 0.5520376414060593, "epoch": 3.87167522165189, "grad_norm": 0.29369598627090454, "learning_rate": 0.0002, "loss": 0.556, "mean_token_accuracy": 0.7741729170084, "num_tokens": 15030691.0, "step": 4150 }, { "entropy": 0.5577495098114014, "epoch": 3.872608492767149, "grad_norm": 0.35497331619262695, "learning_rate": 0.0002, "loss": 0.5764, "mean_token_accuracy": 0.7657085657119751, "num_tokens": 15034299.0, "step": 4151 }, { "entropy": 0.5725068151950836, "epoch": 3.873541763882408, "grad_norm": 0.3555956184864044, "learning_rate": 0.0002, "loss": 0.589, "mean_token_accuracy": 0.7603972554206848, "num_tokens": 15037930.0, "step": 4152 }, { "entropy": 0.5771743208169937, "epoch": 3.874475034997667, "grad_norm": 0.3498767912387848, "learning_rate": 0.0002, "loss": 0.5793, "mean_token_accuracy": 0.7610989511013031, "num_tokens": 15041523.0, "step": 4153 }, { "entropy": 0.5782325118780136, "epoch": 3.875408306112926, "grad_norm": 0.3206714689731598, "learning_rate": 0.0002, "loss": 0.593, "mean_token_accuracy": 0.7641359865665436, "num_tokens": 15045165.0, "step": 4154 }, { "entropy": 0.5856272131204605, "epoch": 3.876341577228185, "grad_norm": 0.3600999414920807, "learning_rate": 0.0002, "loss": 0.5996, "mean_token_accuracy": 0.7636566013097763, "num_tokens": 15048717.0, "step": 4155 }, { "entropy": 0.6014769524335861, "epoch": 3.877274848343444, "grad_norm": 0.3055244982242584, "learning_rate": 0.0002, "loss": 0.6065, "mean_token_accuracy": 0.7517028152942657, "num_tokens": 15052438.0, "step": 4156 }, { "entropy": 0.5926368236541748, "epoch": 3.878208119458703, "grad_norm": 0.3615627884864807, "learning_rate": 0.0002, "loss": 0.5986, "mean_token_accuracy": 0.7583994716405869, "num_tokens": 15055930.0, "step": 4157 }, { "entropy": 0.556729793548584, "epoch": 3.879141390573962, "grad_norm": 0.3272119462490082, "learning_rate": 0.0002, "loss": 0.5595, "mean_token_accuracy": 0.7703679949045181, "num_tokens": 15059609.0, "step": 4158 }, { "entropy": 0.5737362205982208, "epoch": 3.8800746616892208, "grad_norm": 0.319805771112442, "learning_rate": 0.0002, "loss": 0.5642, "mean_token_accuracy": 0.7713331431150436, "num_tokens": 15063324.0, "step": 4159 }, { "entropy": 0.6059785485267639, "epoch": 3.8810079328044798, "grad_norm": 0.30276939272880554, "learning_rate": 0.0002, "loss": 0.5965, "mean_token_accuracy": 0.7615683674812317, "num_tokens": 15066872.0, "step": 4160 }, { "entropy": 0.5627863854169846, "epoch": 3.8819412039197387, "grad_norm": 0.29058748483657837, "learning_rate": 0.0002, "loss": 0.5632, "mean_token_accuracy": 0.7768182158470154, "num_tokens": 15070426.0, "step": 4161 }, { "entropy": 0.5624689310789108, "epoch": 3.8828744750349977, "grad_norm": 0.3194344937801361, "learning_rate": 0.0002, "loss": 0.5695, "mean_token_accuracy": 0.7653584182262421, "num_tokens": 15073924.0, "step": 4162 }, { "entropy": 0.5831165462732315, "epoch": 3.8838077461502567, "grad_norm": 0.336547315120697, "learning_rate": 0.0002, "loss": 0.594, "mean_token_accuracy": 0.760330855846405, "num_tokens": 15077475.0, "step": 4163 }, { "entropy": 0.605524092912674, "epoch": 3.8847410172655157, "grad_norm": 0.30529534816741943, "learning_rate": 0.0002, "loss": 0.6032, "mean_token_accuracy": 0.7546844631433487, "num_tokens": 15081166.0, "step": 4164 }, { "entropy": 0.5825307220220566, "epoch": 3.8856742883807747, "grad_norm": 0.43801769614219666, "learning_rate": 0.0002, "loss": 0.5935, "mean_token_accuracy": 0.7584783285856247, "num_tokens": 15084691.0, "step": 4165 }, { "entropy": 0.579651802778244, "epoch": 3.8866075594960336, "grad_norm": 0.3549387454986572, "learning_rate": 0.0002, "loss": 0.5886, "mean_token_accuracy": 0.7581342160701752, "num_tokens": 15088281.0, "step": 4166 }, { "entropy": 0.5764124989509583, "epoch": 3.8875408306112926, "grad_norm": 0.3384336829185486, "learning_rate": 0.0002, "loss": 0.5882, "mean_token_accuracy": 0.7600256204605103, "num_tokens": 15091837.0, "step": 4167 }, { "entropy": 0.5497073084115982, "epoch": 3.8884741017265516, "grad_norm": 0.39309680461883545, "learning_rate": 0.0002, "loss": 0.5692, "mean_token_accuracy": 0.7723076045513153, "num_tokens": 15095272.0, "step": 4168 }, { "entropy": 0.5653053671121597, "epoch": 3.8894073728418106, "grad_norm": 0.3152865171432495, "learning_rate": 0.0002, "loss": 0.5727, "mean_token_accuracy": 0.7665012627840042, "num_tokens": 15098872.0, "step": 4169 }, { "entropy": 0.6181367933750153, "epoch": 3.8903406439570696, "grad_norm": 0.31344595551490784, "learning_rate": 0.0002, "loss": 0.6146, "mean_token_accuracy": 0.7465729117393494, "num_tokens": 15102508.0, "step": 4170 }, { "entropy": 0.5847389996051788, "epoch": 3.8912739150723286, "grad_norm": 0.3125249445438385, "learning_rate": 0.0002, "loss": 0.5859, "mean_token_accuracy": 0.7641590237617493, "num_tokens": 15106125.0, "step": 4171 }, { "entropy": 0.5881438851356506, "epoch": 3.8922071861875875, "grad_norm": 0.29654860496520996, "learning_rate": 0.0002, "loss": 0.5963, "mean_token_accuracy": 0.7627315074205399, "num_tokens": 15109746.0, "step": 4172 }, { "entropy": 0.6088776588439941, "epoch": 3.8931404573028465, "grad_norm": 0.335882306098938, "learning_rate": 0.0002, "loss": 0.6225, "mean_token_accuracy": 0.7496844679117203, "num_tokens": 15113305.0, "step": 4173 }, { "entropy": 0.5847539305686951, "epoch": 3.8940737284181055, "grad_norm": 0.30307406187057495, "learning_rate": 0.0002, "loss": 0.5804, "mean_token_accuracy": 0.7630544751882553, "num_tokens": 15116845.0, "step": 4174 }, { "entropy": 0.5802493244409561, "epoch": 3.8950069995333645, "grad_norm": 0.3739353120326996, "learning_rate": 0.0002, "loss": 0.5823, "mean_token_accuracy": 0.7642449140548706, "num_tokens": 15120440.0, "step": 4175 }, { "entropy": 0.5807182043790817, "epoch": 3.8959402706486235, "grad_norm": 0.26229044795036316, "learning_rate": 0.0002, "loss": 0.5769, "mean_token_accuracy": 0.7630961239337921, "num_tokens": 15123992.0, "step": 4176 }, { "entropy": 0.5903589278459549, "epoch": 3.8968735417638825, "grad_norm": 0.2892991006374359, "learning_rate": 0.0002, "loss": 0.5839, "mean_token_accuracy": 0.7644101828336716, "num_tokens": 15127636.0, "step": 4177 }, { "entropy": 0.6297985911369324, "epoch": 3.8978068128791414, "grad_norm": 0.28750160336494446, "learning_rate": 0.0002, "loss": 0.616, "mean_token_accuracy": 0.7527358084917068, "num_tokens": 15131462.0, "step": 4178 }, { "entropy": 0.5640276670455933, "epoch": 3.8987400839944004, "grad_norm": 0.30421164631843567, "learning_rate": 0.0002, "loss": 0.5622, "mean_token_accuracy": 0.7698156088590622, "num_tokens": 15134973.0, "step": 4179 }, { "entropy": 0.5737184584140778, "epoch": 3.8996733551096594, "grad_norm": 0.36322513222694397, "learning_rate": 0.0002, "loss": 0.5763, "mean_token_accuracy": 0.7567091882228851, "num_tokens": 15138618.0, "step": 4180 }, { "entropy": 0.5733828395605087, "epoch": 3.9006066262249184, "grad_norm": 0.30200275778770447, "learning_rate": 0.0002, "loss": 0.575, "mean_token_accuracy": 0.7684834003448486, "num_tokens": 15142416.0, "step": 4181 }, { "entropy": 0.5754028707742691, "epoch": 3.9015398973401774, "grad_norm": 0.419649600982666, "learning_rate": 0.0002, "loss": 0.5953, "mean_token_accuracy": 0.7587804794311523, "num_tokens": 15146040.0, "step": 4182 }, { "entropy": 0.5838166028261185, "epoch": 3.9024731684554363, "grad_norm": 0.3891884684562683, "learning_rate": 0.0002, "loss": 0.6066, "mean_token_accuracy": 0.7610355764627457, "num_tokens": 15149657.0, "step": 4183 }, { "entropy": 0.5696303844451904, "epoch": 3.9034064395706953, "grad_norm": 0.34077927470207214, "learning_rate": 0.0002, "loss": 0.5772, "mean_token_accuracy": 0.7606174051761627, "num_tokens": 15153376.0, "step": 4184 }, { "entropy": 0.5746899247169495, "epoch": 3.9043397106859543, "grad_norm": 0.37285366654396057, "learning_rate": 0.0002, "loss": 0.5852, "mean_token_accuracy": 0.7575471848249435, "num_tokens": 15156902.0, "step": 4185 }, { "entropy": 0.6022009551525116, "epoch": 3.9052729818012133, "grad_norm": 0.37776070833206177, "learning_rate": 0.0002, "loss": 0.601, "mean_token_accuracy": 0.7523915022611618, "num_tokens": 15160629.0, "step": 4186 }, { "entropy": 0.5755694806575775, "epoch": 3.9062062529164723, "grad_norm": 0.37240326404571533, "learning_rate": 0.0002, "loss": 0.5731, "mean_token_accuracy": 0.765335276722908, "num_tokens": 15164281.0, "step": 4187 }, { "entropy": 0.5871454626321793, "epoch": 3.9071395240317313, "grad_norm": 0.31200966238975525, "learning_rate": 0.0002, "loss": 0.5875, "mean_token_accuracy": 0.7592903673648834, "num_tokens": 15167918.0, "step": 4188 }, { "entropy": 0.6159591972827911, "epoch": 3.9080727951469902, "grad_norm": 0.31079983711242676, "learning_rate": 0.0002, "loss": 0.6111, "mean_token_accuracy": 0.7573417872190475, "num_tokens": 15171642.0, "step": 4189 }, { "entropy": 0.6023615300655365, "epoch": 3.9090060662622492, "grad_norm": 0.31263917684555054, "learning_rate": 0.0002, "loss": 0.6062, "mean_token_accuracy": 0.7526303976774216, "num_tokens": 15175231.0, "step": 4190 }, { "entropy": 0.5823858231306076, "epoch": 3.909939337377508, "grad_norm": 0.3292902410030365, "learning_rate": 0.0002, "loss": 0.5797, "mean_token_accuracy": 0.7649051994085312, "num_tokens": 15178964.0, "step": 4191 }, { "entropy": 0.5462386906147003, "epoch": 3.910872608492767, "grad_norm": 0.3830333650112152, "learning_rate": 0.0002, "loss": 0.5661, "mean_token_accuracy": 0.767536997795105, "num_tokens": 15182505.0, "step": 4192 }, { "entropy": 0.5864111483097076, "epoch": 3.911805879608026, "grad_norm": 0.40206584334373474, "learning_rate": 0.0002, "loss": 0.5845, "mean_token_accuracy": 0.7640255093574524, "num_tokens": 15186072.0, "step": 4193 }, { "entropy": 0.5486739575862885, "epoch": 3.912739150723285, "grad_norm": 0.40634670853614807, "learning_rate": 0.0002, "loss": 0.563, "mean_token_accuracy": 0.773191824555397, "num_tokens": 15189621.0, "step": 4194 }, { "entropy": 0.572454959154129, "epoch": 3.913672421838544, "grad_norm": 0.36097413301467896, "learning_rate": 0.0002, "loss": 0.5814, "mean_token_accuracy": 0.7656194120645523, "num_tokens": 15193237.0, "step": 4195 }, { "entropy": 0.6240314543247223, "epoch": 3.914605692953803, "grad_norm": 0.3194260895252228, "learning_rate": 0.0002, "loss": 0.6245, "mean_token_accuracy": 0.7415618449449539, "num_tokens": 15197009.0, "step": 4196 }, { "entropy": 0.5402249097824097, "epoch": 3.915538964069062, "grad_norm": 0.3465651571750641, "learning_rate": 0.0002, "loss": 0.5421, "mean_token_accuracy": 0.7775312066078186, "num_tokens": 15200541.0, "step": 4197 }, { "entropy": 0.5662882179021835, "epoch": 3.916472235184321, "grad_norm": 0.3668631315231323, "learning_rate": 0.0002, "loss": 0.5858, "mean_token_accuracy": 0.7632578909397125, "num_tokens": 15204107.0, "step": 4198 }, { "entropy": 0.6102010309696198, "epoch": 3.91740550629958, "grad_norm": 0.3175731599330902, "learning_rate": 0.0002, "loss": 0.6165, "mean_token_accuracy": 0.7526694387197495, "num_tokens": 15207729.0, "step": 4199 }, { "entropy": 0.587971568107605, "epoch": 3.918338777414839, "grad_norm": 0.40686434507369995, "learning_rate": 0.0002, "loss": 0.5977, "mean_token_accuracy": 0.7627031952142715, "num_tokens": 15211311.0, "step": 4200 }, { "entropy": 0.5887168347835541, "epoch": 3.919272048530098, "grad_norm": 0.3899231255054474, "learning_rate": 0.0002, "loss": 0.6052, "mean_token_accuracy": 0.7531741708517075, "num_tokens": 15214983.0, "step": 4201 }, { "entropy": 0.5743265450000763, "epoch": 3.920205319645357, "grad_norm": 0.3825027346611023, "learning_rate": 0.0002, "loss": 0.5799, "mean_token_accuracy": 0.7670669108629227, "num_tokens": 15218453.0, "step": 4202 }, { "entropy": 0.5798163563013077, "epoch": 3.921138590760616, "grad_norm": 0.3294221758842468, "learning_rate": 0.0002, "loss": 0.5718, "mean_token_accuracy": 0.7653143554925919, "num_tokens": 15221995.0, "step": 4203 }, { "entropy": 0.6117687970399857, "epoch": 3.922071861875875, "grad_norm": 0.359053373336792, "learning_rate": 0.0002, "loss": 0.606, "mean_token_accuracy": 0.7561585605144501, "num_tokens": 15225630.0, "step": 4204 }, { "entropy": 0.5693279057741165, "epoch": 3.923005132991134, "grad_norm": 0.32044947147369385, "learning_rate": 0.0002, "loss": 0.5617, "mean_token_accuracy": 0.7691264301538467, "num_tokens": 15229303.0, "step": 4205 }, { "entropy": 0.5755826234817505, "epoch": 3.923938404106393, "grad_norm": 0.3259136378765106, "learning_rate": 0.0002, "loss": 0.5857, "mean_token_accuracy": 0.7615996599197388, "num_tokens": 15232971.0, "step": 4206 }, { "entropy": 0.6478686183691025, "epoch": 3.924871675221652, "grad_norm": 0.30744460225105286, "learning_rate": 0.0002, "loss": 0.6545, "mean_token_accuracy": 0.7415248304605484, "num_tokens": 15236637.0, "step": 4207 }, { "entropy": 0.6036820411682129, "epoch": 3.925804946336911, "grad_norm": 0.30493631958961487, "learning_rate": 0.0002, "loss": 0.5877, "mean_token_accuracy": 0.7626097351312637, "num_tokens": 15240459.0, "step": 4208 }, { "entropy": 0.5694770961999893, "epoch": 3.92673821745217, "grad_norm": 0.3365185856819153, "learning_rate": 0.0002, "loss": 0.5682, "mean_token_accuracy": 0.7739541828632355, "num_tokens": 15244033.0, "step": 4209 }, { "entropy": 0.5577616393566132, "epoch": 3.927671488567429, "grad_norm": 0.3074789047241211, "learning_rate": 0.0002, "loss": 0.556, "mean_token_accuracy": 0.7723812162876129, "num_tokens": 15247613.0, "step": 4210 }, { "entropy": 0.5868463516235352, "epoch": 3.928604759682688, "grad_norm": 0.34878310561180115, "learning_rate": 0.0002, "loss": 0.5916, "mean_token_accuracy": 0.7602426260709763, "num_tokens": 15251242.0, "step": 4211 }, { "entropy": 0.5819192975759506, "epoch": 3.929538030797947, "grad_norm": 0.3518075942993164, "learning_rate": 0.0002, "loss": 0.5979, "mean_token_accuracy": 0.7522498667240143, "num_tokens": 15254933.0, "step": 4212 }, { "entropy": 0.5454027205705643, "epoch": 3.930471301913206, "grad_norm": 0.35528630018234253, "learning_rate": 0.0002, "loss": 0.5652, "mean_token_accuracy": 0.7737051993608475, "num_tokens": 15258603.0, "step": 4213 }, { "entropy": 0.5732483118772507, "epoch": 3.931404573028465, "grad_norm": 0.37779319286346436, "learning_rate": 0.0002, "loss": 0.5775, "mean_token_accuracy": 0.7636108994483948, "num_tokens": 15262250.0, "step": 4214 }, { "entropy": 0.6421754062175751, "epoch": 3.932337844143724, "grad_norm": 0.3288324177265167, "learning_rate": 0.0002, "loss": 0.6427, "mean_token_accuracy": 0.7419284284114838, "num_tokens": 15265864.0, "step": 4215 }, { "entropy": 0.526521347463131, "epoch": 3.9332711152589828, "grad_norm": 0.31366100907325745, "learning_rate": 0.0002, "loss": 0.54, "mean_token_accuracy": 0.7779451012611389, "num_tokens": 15269403.0, "step": 4216 }, { "entropy": 0.5750417560338974, "epoch": 3.9342043863742417, "grad_norm": 0.31731197237968445, "learning_rate": 0.0002, "loss": 0.5702, "mean_token_accuracy": 0.771765723824501, "num_tokens": 15273041.0, "step": 4217 }, { "entropy": 0.5507422760128975, "epoch": 3.9351376574895007, "grad_norm": 0.3171045780181885, "learning_rate": 0.0002, "loss": 0.5499, "mean_token_accuracy": 0.7737328857183456, "num_tokens": 15276628.0, "step": 4218 }, { "entropy": 0.594827190041542, "epoch": 3.9360709286047597, "grad_norm": 0.30406856536865234, "learning_rate": 0.0002, "loss": 0.6014, "mean_token_accuracy": 0.7575125247240067, "num_tokens": 15280353.0, "step": 4219 }, { "entropy": 0.5590098649263382, "epoch": 3.9370041997200187, "grad_norm": 0.34618639945983887, "learning_rate": 0.0002, "loss": 0.5635, "mean_token_accuracy": 0.7697655558586121, "num_tokens": 15283906.0, "step": 4220 }, { "entropy": 0.5653934180736542, "epoch": 3.9379374708352777, "grad_norm": 0.3280852138996124, "learning_rate": 0.0002, "loss": 0.5599, "mean_token_accuracy": 0.7807044684886932, "num_tokens": 15287449.0, "step": 4221 }, { "entropy": 0.6107084006071091, "epoch": 3.9388707419505367, "grad_norm": 0.2891669273376465, "learning_rate": 0.0002, "loss": 0.5981, "mean_token_accuracy": 0.7583427727222443, "num_tokens": 15291127.0, "step": 4222 }, { "entropy": 0.6073260307312012, "epoch": 3.9398040130657956, "grad_norm": 0.34186795353889465, "learning_rate": 0.0002, "loss": 0.5906, "mean_token_accuracy": 0.7599710822105408, "num_tokens": 15294761.0, "step": 4223 }, { "entropy": 0.5647527277469635, "epoch": 3.9407372841810546, "grad_norm": 0.3648265600204468, "learning_rate": 0.0002, "loss": 0.5692, "mean_token_accuracy": 0.7724986523389816, "num_tokens": 15298344.0, "step": 4224 }, { "entropy": 0.5756465047597885, "epoch": 3.9416705552963136, "grad_norm": 0.33396294713020325, "learning_rate": 0.0002, "loss": 0.5758, "mean_token_accuracy": 0.7682419866323471, "num_tokens": 15302021.0, "step": 4225 }, { "entropy": 0.5913558453321457, "epoch": 3.9426038264115726, "grad_norm": 0.31047695875167847, "learning_rate": 0.0002, "loss": 0.5982, "mean_token_accuracy": 0.76102514564991, "num_tokens": 15305655.0, "step": 4226 }, { "entropy": 0.5771314054727554, "epoch": 3.9435370975268316, "grad_norm": 0.3928256928920746, "learning_rate": 0.0002, "loss": 0.5875, "mean_token_accuracy": 0.7666014730930328, "num_tokens": 15309383.0, "step": 4227 }, { "entropy": 0.526108056306839, "epoch": 3.9444703686420906, "grad_norm": 0.4109482169151306, "learning_rate": 0.0002, "loss": 0.5406, "mean_token_accuracy": 0.7794772684574127, "num_tokens": 15312837.0, "step": 4228 }, { "entropy": 0.583714634180069, "epoch": 3.9454036397573495, "grad_norm": 0.3547687232494354, "learning_rate": 0.0002, "loss": 0.5881, "mean_token_accuracy": 0.7604759782552719, "num_tokens": 15316504.0, "step": 4229 }, { "entropy": 0.633979856967926, "epoch": 3.9463369108726085, "grad_norm": 0.38939645886421204, "learning_rate": 0.0002, "loss": 0.6428, "mean_token_accuracy": 0.7428986877202988, "num_tokens": 15320099.0, "step": 4230 }, { "entropy": 0.575963705778122, "epoch": 3.9472701819878675, "grad_norm": 0.3574615716934204, "learning_rate": 0.0002, "loss": 0.5857, "mean_token_accuracy": 0.7572726458311081, "num_tokens": 15323634.0, "step": 4231 }, { "entropy": 0.6015319228172302, "epoch": 3.9482034531031265, "grad_norm": 0.38963329792022705, "learning_rate": 0.0002, "loss": 0.6178, "mean_token_accuracy": 0.7535549849271774, "num_tokens": 15327322.0, "step": 4232 }, { "entropy": 0.5305717810988426, "epoch": 3.9491367242183855, "grad_norm": 0.3393436074256897, "learning_rate": 0.0002, "loss": 0.5296, "mean_token_accuracy": 0.7859223484992981, "num_tokens": 15330766.0, "step": 4233 }, { "entropy": 0.5996994972229004, "epoch": 3.9500699953336444, "grad_norm": 0.36729803681373596, "learning_rate": 0.0002, "loss": 0.6008, "mean_token_accuracy": 0.7586476057767868, "num_tokens": 15334374.0, "step": 4234 }, { "entropy": 0.6213994473218918, "epoch": 3.9510032664489034, "grad_norm": 0.26852211356163025, "learning_rate": 0.0002, "loss": 0.61, "mean_token_accuracy": 0.7496281117200851, "num_tokens": 15338021.0, "step": 4235 }, { "entropy": 0.5813051164150238, "epoch": 3.9519365375641624, "grad_norm": 0.3230738639831543, "learning_rate": 0.0002, "loss": 0.5881, "mean_token_accuracy": 0.7587844580411911, "num_tokens": 15341569.0, "step": 4236 }, { "entropy": 0.5371773689985275, "epoch": 3.9528698086794214, "grad_norm": 0.3796621859073639, "learning_rate": 0.0002, "loss": 0.5507, "mean_token_accuracy": 0.7831907868385315, "num_tokens": 15345031.0, "step": 4237 }, { "entropy": 0.5914702117443085, "epoch": 3.9538030797946804, "grad_norm": 0.30336931347846985, "learning_rate": 0.0002, "loss": 0.5858, "mean_token_accuracy": 0.7610509842634201, "num_tokens": 15348738.0, "step": 4238 }, { "entropy": 0.5946296453475952, "epoch": 3.9547363509099394, "grad_norm": 0.3807680308818817, "learning_rate": 0.0002, "loss": 0.6009, "mean_token_accuracy": 0.766339585185051, "num_tokens": 15352239.0, "step": 4239 }, { "entropy": 0.5550496727228165, "epoch": 3.9556696220251983, "grad_norm": 0.3530040681362152, "learning_rate": 0.0002, "loss": 0.5598, "mean_token_accuracy": 0.7701387107372284, "num_tokens": 15355751.0, "step": 4240 }, { "entropy": 0.5636443048715591, "epoch": 3.9566028931404573, "grad_norm": 0.37433648109436035, "learning_rate": 0.0002, "loss": 0.5678, "mean_token_accuracy": 0.7669436037540436, "num_tokens": 15359413.0, "step": 4241 }, { "entropy": 0.6321748793125153, "epoch": 3.9575361642557163, "grad_norm": 0.3373590409755707, "learning_rate": 0.0002, "loss": 0.6354, "mean_token_accuracy": 0.7465044111013412, "num_tokens": 15362966.0, "step": 4242 }, { "entropy": 0.5505669116973877, "epoch": 3.9584694353709753, "grad_norm": 0.3045439124107361, "learning_rate": 0.0002, "loss": 0.5562, "mean_token_accuracy": 0.7851799130439758, "num_tokens": 15366515.0, "step": 4243 }, { "entropy": 0.6173630654811859, "epoch": 3.9594027064862343, "grad_norm": 0.30895695090293884, "learning_rate": 0.0002, "loss": 0.6106, "mean_token_accuracy": 0.7555558532476425, "num_tokens": 15370120.0, "step": 4244 }, { "entropy": 0.5563984811306, "epoch": 3.9603359776014933, "grad_norm": 0.3169419765472412, "learning_rate": 0.0002, "loss": 0.5558, "mean_token_accuracy": 0.7783038020133972, "num_tokens": 15373757.0, "step": 4245 }, { "entropy": 0.5949655175209045, "epoch": 3.9612692487167522, "grad_norm": 0.38486170768737793, "learning_rate": 0.0002, "loss": 0.5987, "mean_token_accuracy": 0.7534978240728378, "num_tokens": 15377263.0, "step": 4246 }, { "entropy": 0.6234951019287109, "epoch": 3.962202519832011, "grad_norm": 0.38190776109695435, "learning_rate": 0.0002, "loss": 0.6251, "mean_token_accuracy": 0.7429997771978378, "num_tokens": 15381011.0, "step": 4247 }, { "entropy": 0.566067062318325, "epoch": 3.96313579094727, "grad_norm": 0.3388822078704834, "learning_rate": 0.0002, "loss": 0.5565, "mean_token_accuracy": 0.771737203001976, "num_tokens": 15384619.0, "step": 4248 }, { "entropy": 0.5830581486225128, "epoch": 3.964069062062529, "grad_norm": 0.3237406313419342, "learning_rate": 0.0002, "loss": 0.5839, "mean_token_accuracy": 0.7615788578987122, "num_tokens": 15388279.0, "step": 4249 }, { "entropy": 0.6351239681243896, "epoch": 3.965002333177788, "grad_norm": 0.3792057931423187, "learning_rate": 0.0002, "loss": 0.6514, "mean_token_accuracy": 0.7351933866739273, "num_tokens": 15391907.0, "step": 4250 }, { "entropy": 0.5577774941921234, "epoch": 3.965935604293047, "grad_norm": 0.4206329584121704, "learning_rate": 0.0002, "loss": 0.5827, "mean_token_accuracy": 0.7697530835866928, "num_tokens": 15395452.0, "step": 4251 }, { "entropy": 0.5917369276285172, "epoch": 3.966868875408306, "grad_norm": 0.331928014755249, "learning_rate": 0.0002, "loss": 0.6027, "mean_token_accuracy": 0.7565092742443085, "num_tokens": 15399101.0, "step": 4252 }, { "entropy": 0.5858493894338608, "epoch": 3.967802146523565, "grad_norm": 0.3628694713115692, "learning_rate": 0.0002, "loss": 0.5918, "mean_token_accuracy": 0.7591953128576279, "num_tokens": 15402692.0, "step": 4253 }, { "entropy": 0.5683287084102631, "epoch": 3.968735417638824, "grad_norm": 0.4067605137825012, "learning_rate": 0.0002, "loss": 0.5741, "mean_token_accuracy": 0.7659985572099686, "num_tokens": 15406361.0, "step": 4254 }, { "entropy": 0.5969200730323792, "epoch": 3.969668688754083, "grad_norm": 0.38037827610969543, "learning_rate": 0.0002, "loss": 0.599, "mean_token_accuracy": 0.7591191530227661, "num_tokens": 15410039.0, "step": 4255 }, { "entropy": 0.5541892796754837, "epoch": 3.970601959869342, "grad_norm": 0.37842822074890137, "learning_rate": 0.0002, "loss": 0.5559, "mean_token_accuracy": 0.7763052433729172, "num_tokens": 15413697.0, "step": 4256 }, { "entropy": 0.6053934693336487, "epoch": 3.971535230984601, "grad_norm": 0.30580493807792664, "learning_rate": 0.0002, "loss": 0.6006, "mean_token_accuracy": 0.7555515915155411, "num_tokens": 15417308.0, "step": 4257 }, { "entropy": 0.606128141283989, "epoch": 3.97246850209986, "grad_norm": 0.35421034693717957, "learning_rate": 0.0002, "loss": 0.6053, "mean_token_accuracy": 0.7549224346876144, "num_tokens": 15420862.0, "step": 4258 }, { "entropy": 0.5827612727880478, "epoch": 3.973401773215119, "grad_norm": 0.36693766713142395, "learning_rate": 0.0002, "loss": 0.5741, "mean_token_accuracy": 0.7725186794996262, "num_tokens": 15424510.0, "step": 4259 }, { "entropy": 0.630018800497055, "epoch": 3.974335044330378, "grad_norm": 0.41921156644821167, "learning_rate": 0.0002, "loss": 0.6299, "mean_token_accuracy": 0.7498626708984375, "num_tokens": 15428147.0, "step": 4260 }, { "entropy": 0.5797700434923172, "epoch": 3.975268315445637, "grad_norm": 0.3128143846988678, "learning_rate": 0.0002, "loss": 0.5828, "mean_token_accuracy": 0.7663234770298004, "num_tokens": 15431906.0, "step": 4261 }, { "entropy": 0.5567100793123245, "epoch": 3.976201586560896, "grad_norm": 0.3700143098831177, "learning_rate": 0.0002, "loss": 0.5622, "mean_token_accuracy": 0.7720667868852615, "num_tokens": 15435461.0, "step": 4262 }, { "entropy": 0.5405890196561813, "epoch": 3.977134857676155, "grad_norm": 0.3949667513370514, "learning_rate": 0.0002, "loss": 0.551, "mean_token_accuracy": 0.7786464989185333, "num_tokens": 15438885.0, "step": 4263 }, { "entropy": 0.5700049251317978, "epoch": 3.978068128791414, "grad_norm": 0.3023259937763214, "learning_rate": 0.0002, "loss": 0.5741, "mean_token_accuracy": 0.7654095888137817, "num_tokens": 15442555.0, "step": 4264 }, { "entropy": 0.5805918574333191, "epoch": 3.979001399906673, "grad_norm": 0.3856750726699829, "learning_rate": 0.0002, "loss": 0.6065, "mean_token_accuracy": 0.7560684829950333, "num_tokens": 15446220.0, "step": 4265 }, { "entropy": 0.5776357203722, "epoch": 3.979934671021932, "grad_norm": 0.39441752433776855, "learning_rate": 0.0002, "loss": 0.5883, "mean_token_accuracy": 0.7596040815114975, "num_tokens": 15449685.0, "step": 4266 }, { "entropy": 0.5420563668012619, "epoch": 3.980867942137191, "grad_norm": 0.3310878574848175, "learning_rate": 0.0002, "loss": 0.5496, "mean_token_accuracy": 0.7786393165588379, "num_tokens": 15453227.0, "step": 4267 }, { "entropy": 0.6152137070894241, "epoch": 3.98180121325245, "grad_norm": 0.30426400899887085, "learning_rate": 0.0002, "loss": 0.6192, "mean_token_accuracy": 0.7486023753881454, "num_tokens": 15456927.0, "step": 4268 }, { "entropy": 0.6334832906723022, "epoch": 3.982734484367709, "grad_norm": 0.3374471962451935, "learning_rate": 0.0002, "loss": 0.6299, "mean_token_accuracy": 0.7522028833627701, "num_tokens": 15460594.0, "step": 4269 }, { "entropy": 0.5915023535490036, "epoch": 3.983667755482968, "grad_norm": 0.31177881360054016, "learning_rate": 0.0002, "loss": 0.5792, "mean_token_accuracy": 0.7696114182472229, "num_tokens": 15464315.0, "step": 4270 }, { "entropy": 0.5715983957052231, "epoch": 3.984601026598227, "grad_norm": 0.30525630712509155, "learning_rate": 0.0002, "loss": 0.558, "mean_token_accuracy": 0.77445949614048, "num_tokens": 15467952.0, "step": 4271 }, { "entropy": 0.5917341560125351, "epoch": 3.9855342977134858, "grad_norm": 0.30785471200942993, "learning_rate": 0.0002, "loss": 0.5818, "mean_token_accuracy": 0.7645906656980515, "num_tokens": 15471546.0, "step": 4272 }, { "entropy": 0.6253741979598999, "epoch": 3.9864675688287448, "grad_norm": 0.3260955810546875, "learning_rate": 0.0002, "loss": 0.6159, "mean_token_accuracy": 0.7474178969860077, "num_tokens": 15475219.0, "step": 4273 }, { "entropy": 0.5992993116378784, "epoch": 3.9874008399440037, "grad_norm": 0.373039573431015, "learning_rate": 0.0002, "loss": 0.6033, "mean_token_accuracy": 0.7566805928945541, "num_tokens": 15479038.0, "step": 4274 }, { "entropy": 0.588234469294548, "epoch": 3.9883341110592627, "grad_norm": 0.3254663646221161, "learning_rate": 0.0002, "loss": 0.5899, "mean_token_accuracy": 0.7628826647996902, "num_tokens": 15482695.0, "step": 4275 }, { "entropy": 0.6156070381402969, "epoch": 3.9892673821745217, "grad_norm": 0.33952006697654724, "learning_rate": 0.0002, "loss": 0.6267, "mean_token_accuracy": 0.7478067874908447, "num_tokens": 15486349.0, "step": 4276 }, { "entropy": 0.5325841754674911, "epoch": 3.9902006532897807, "grad_norm": 0.34973496198654175, "learning_rate": 0.0002, "loss": 0.5391, "mean_token_accuracy": 0.7787259072065353, "num_tokens": 15490020.0, "step": 4277 }, { "entropy": 0.5925266891717911, "epoch": 3.9911339244050397, "grad_norm": 0.3881760537624359, "learning_rate": 0.0002, "loss": 0.5997, "mean_token_accuracy": 0.7563602030277252, "num_tokens": 15493588.0, "step": 4278 }, { "entropy": 0.621590331196785, "epoch": 3.9920671955202987, "grad_norm": 0.33721238374710083, "learning_rate": 0.0002, "loss": 0.6272, "mean_token_accuracy": 0.7496249824762344, "num_tokens": 15497226.0, "step": 4279 }, { "entropy": 0.562803253531456, "epoch": 3.9930004666355576, "grad_norm": 0.3315730690956116, "learning_rate": 0.0002, "loss": 0.5663, "mean_token_accuracy": 0.7774157226085663, "num_tokens": 15500842.0, "step": 4280 }, { "entropy": 0.5553891286253929, "epoch": 3.9939337377508166, "grad_norm": 0.3006991147994995, "learning_rate": 0.0002, "loss": 0.5664, "mean_token_accuracy": 0.7713907957077026, "num_tokens": 15504502.0, "step": 4281 }, { "entropy": 0.5986599624156952, "epoch": 3.9948670088660756, "grad_norm": 0.33401304483413696, "learning_rate": 0.0002, "loss": 0.6159, "mean_token_accuracy": 0.7537712305784225, "num_tokens": 15508099.0, "step": 4282 }, { "entropy": 0.5899293422698975, "epoch": 3.9958002799813346, "grad_norm": 0.3097676932811737, "learning_rate": 0.0002, "loss": 0.5862, "mean_token_accuracy": 0.76250359416008, "num_tokens": 15511744.0, "step": 4283 }, { "entropy": 0.5913249403238297, "epoch": 3.9967335510965936, "grad_norm": 0.31897011399269104, "learning_rate": 0.0002, "loss": 0.5905, "mean_token_accuracy": 0.76234170794487, "num_tokens": 15515321.0, "step": 4284 }, { "entropy": 0.5844963192939758, "epoch": 3.9976668222118525, "grad_norm": 0.3224645256996155, "learning_rate": 0.0002, "loss": 0.5833, "mean_token_accuracy": 0.7649446129798889, "num_tokens": 15518923.0, "step": 4285 }, { "entropy": 0.6093281507492065, "epoch": 3.9986000933271115, "grad_norm": 0.33128511905670166, "learning_rate": 0.0002, "loss": 0.6006, "mean_token_accuracy": 0.755438819527626, "num_tokens": 15522505.0, "step": 4286 }, { "entropy": 0.6153004318475723, "epoch": 3.9995333644423705, "grad_norm": 0.29912886023521423, "learning_rate": 0.0002, "loss": 0.6068, "mean_token_accuracy": 0.752925232052803, "num_tokens": 15526196.0, "step": 4287 }, { "entropy": 0.6291969418525696, "epoch": 4.0, "grad_norm": 0.4319230914115906, "learning_rate": 0.0002, "loss": 0.5854, "mean_token_accuracy": 0.7423315942287445, "num_tokens": 15527291.0, "step": 4288 }, { "entropy": 0.5718091875314713, "epoch": 4.000933271115259, "grad_norm": 0.2986222505569458, "learning_rate": 0.0002, "loss": 0.5535, "mean_token_accuracy": 0.776356041431427, "num_tokens": 15530911.0, "step": 4289 }, { "entropy": 0.5418230071663857, "epoch": 4.001866542230518, "grad_norm": 0.34845879673957825, "learning_rate": 0.0002, "loss": 0.5425, "mean_token_accuracy": 0.7761613428592682, "num_tokens": 15534476.0, "step": 4290 }, { "entropy": 0.566920205950737, "epoch": 4.002799813345777, "grad_norm": 0.3357422649860382, "learning_rate": 0.0002, "loss": 0.5639, "mean_token_accuracy": 0.7777633816003799, "num_tokens": 15538094.0, "step": 4291 }, { "entropy": 0.5329610109329224, "epoch": 4.003733084461036, "grad_norm": 0.34260323643684387, "learning_rate": 0.0002, "loss": 0.5331, "mean_token_accuracy": 0.7861761748790741, "num_tokens": 15541719.0, "step": 4292 }, { "entropy": 0.5503586977720261, "epoch": 4.004666355576295, "grad_norm": 0.39415207505226135, "learning_rate": 0.0002, "loss": 0.5615, "mean_token_accuracy": 0.772543653845787, "num_tokens": 15545229.0, "step": 4293 }, { "entropy": 0.4928669184446335, "epoch": 4.005599626691554, "grad_norm": 0.5114783644676208, "learning_rate": 0.0002, "loss": 0.5054, "mean_token_accuracy": 0.7980858832597733, "num_tokens": 15548710.0, "step": 4294 }, { "entropy": 0.5423515290021896, "epoch": 4.006532897806813, "grad_norm": 0.43724900484085083, "learning_rate": 0.0002, "loss": 0.5576, "mean_token_accuracy": 0.773228108882904, "num_tokens": 15552304.0, "step": 4295 }, { "entropy": 0.5396513938903809, "epoch": 4.007466168922072, "grad_norm": 0.34760168194770813, "learning_rate": 0.0002, "loss": 0.5456, "mean_token_accuracy": 0.7790286540985107, "num_tokens": 15555999.0, "step": 4296 }, { "entropy": 0.5100599080324173, "epoch": 4.008399440037331, "grad_norm": 0.5378729104995728, "learning_rate": 0.0002, "loss": 0.5357, "mean_token_accuracy": 0.7865834832191467, "num_tokens": 15559536.0, "step": 4297 }, { "entropy": 0.5288470089435577, "epoch": 4.00933271115259, "grad_norm": 0.4428754448890686, "learning_rate": 0.0002, "loss": 0.5417, "mean_token_accuracy": 0.7854844629764557, "num_tokens": 15563140.0, "step": 4298 }, { "entropy": 0.5650734007358551, "epoch": 4.010265982267849, "grad_norm": 0.4411764442920685, "learning_rate": 0.0002, "loss": 0.5772, "mean_token_accuracy": 0.7722245156764984, "num_tokens": 15566692.0, "step": 4299 }, { "entropy": 0.5532489269971848, "epoch": 4.011199253383108, "grad_norm": 0.3938041627407074, "learning_rate": 0.0002, "loss": 0.5555, "mean_token_accuracy": 0.7721171975135803, "num_tokens": 15570367.0, "step": 4300 }, { "entropy": 0.5949858725070953, "epoch": 4.012132524498367, "grad_norm": 0.372881680727005, "learning_rate": 0.0002, "loss": 0.5879, "mean_token_accuracy": 0.7601189613342285, "num_tokens": 15574129.0, "step": 4301 }, { "entropy": 0.572471871972084, "epoch": 4.013065795613626, "grad_norm": 0.3633120059967041, "learning_rate": 0.0002, "loss": 0.5652, "mean_token_accuracy": 0.7711039483547211, "num_tokens": 15577693.0, "step": 4302 }, { "entropy": 0.525070384144783, "epoch": 4.013999066728885, "grad_norm": 0.4642201066017151, "learning_rate": 0.0002, "loss": 0.5273, "mean_token_accuracy": 0.7913294285535812, "num_tokens": 15581275.0, "step": 4303 }, { "entropy": 0.571530818939209, "epoch": 4.014932337844144, "grad_norm": 0.3459571301937103, "learning_rate": 0.0002, "loss": 0.5672, "mean_token_accuracy": 0.7744784206151962, "num_tokens": 15584939.0, "step": 4304 }, { "entropy": 0.5560689717531204, "epoch": 4.015865608959403, "grad_norm": 0.33735018968582153, "learning_rate": 0.0002, "loss": 0.5562, "mean_token_accuracy": 0.7748935222625732, "num_tokens": 15588587.0, "step": 4305 }, { "entropy": 0.5884099751710892, "epoch": 4.016798880074662, "grad_norm": 0.39736706018447876, "learning_rate": 0.0002, "loss": 0.5909, "mean_token_accuracy": 0.7711355984210968, "num_tokens": 15592251.0, "step": 4306 }, { "entropy": 0.5796013027429581, "epoch": 4.017732151189921, "grad_norm": 0.401770681142807, "learning_rate": 0.0002, "loss": 0.5836, "mean_token_accuracy": 0.7693349272012711, "num_tokens": 15595851.0, "step": 4307 }, { "entropy": 0.529483862221241, "epoch": 4.01866542230518, "grad_norm": 0.3481132388114929, "learning_rate": 0.0002, "loss": 0.5288, "mean_token_accuracy": 0.786321610212326, "num_tokens": 15599497.0, "step": 4308 }, { "entropy": 0.5342400521039963, "epoch": 4.019598693420439, "grad_norm": 0.4216737151145935, "learning_rate": 0.0002, "loss": 0.5448, "mean_token_accuracy": 0.7806902378797531, "num_tokens": 15603104.0, "step": 4309 }, { "entropy": 0.556865468621254, "epoch": 4.020531964535698, "grad_norm": 0.40034496784210205, "learning_rate": 0.0002, "loss": 0.5619, "mean_token_accuracy": 0.7744745761156082, "num_tokens": 15606753.0, "step": 4310 }, { "entropy": 0.5514436364173889, "epoch": 4.021465235650957, "grad_norm": 0.34042349457740784, "learning_rate": 0.0002, "loss": 0.553, "mean_token_accuracy": 0.7796882838010788, "num_tokens": 15610321.0, "step": 4311 }, { "entropy": 0.5598265901207924, "epoch": 4.022398506766216, "grad_norm": 0.39226576685905457, "learning_rate": 0.0002, "loss": 0.5597, "mean_token_accuracy": 0.7783335447311401, "num_tokens": 15613963.0, "step": 4312 }, { "entropy": 0.57775017619133, "epoch": 4.023331777881475, "grad_norm": 0.4723208248615265, "learning_rate": 0.0002, "loss": 0.5822, "mean_token_accuracy": 0.7676392644643784, "num_tokens": 15617540.0, "step": 4313 }, { "entropy": 0.5731740444898605, "epoch": 4.0242650489967335, "grad_norm": 0.3694972097873688, "learning_rate": 0.0002, "loss": 0.5679, "mean_token_accuracy": 0.7702531963586807, "num_tokens": 15621320.0, "step": 4314 }, { "entropy": 0.5625657141208649, "epoch": 4.0251983201119925, "grad_norm": 0.4385097026824951, "learning_rate": 0.0002, "loss": 0.5635, "mean_token_accuracy": 0.7706658244132996, "num_tokens": 15624875.0, "step": 4315 }, { "entropy": 0.5644065737724304, "epoch": 4.0261315912272515, "grad_norm": 0.3662335276603699, "learning_rate": 0.0002, "loss": 0.5581, "mean_token_accuracy": 0.7735539823770523, "num_tokens": 15628585.0, "step": 4316 }, { "entropy": 0.5806561410427094, "epoch": 4.0270648623425105, "grad_norm": 0.49322089552879333, "learning_rate": 0.0002, "loss": 0.5859, "mean_token_accuracy": 0.7727358341217041, "num_tokens": 15632146.0, "step": 4317 }, { "entropy": 0.5933333188295364, "epoch": 4.0279981334577695, "grad_norm": 0.3863774240016937, "learning_rate": 0.0002, "loss": 0.5812, "mean_token_accuracy": 0.7645541876554489, "num_tokens": 15635909.0, "step": 4318 }, { "entropy": 0.5767905414104462, "epoch": 4.0289314045730285, "grad_norm": 0.40232256054878235, "learning_rate": 0.0002, "loss": 0.5881, "mean_token_accuracy": 0.7623245269060135, "num_tokens": 15639506.0, "step": 4319 }, { "entropy": 0.5396553725004196, "epoch": 4.029864675688287, "grad_norm": 0.3563508987426758, "learning_rate": 0.0002, "loss": 0.5411, "mean_token_accuracy": 0.7831797897815704, "num_tokens": 15643104.0, "step": 4320 }, { "entropy": 0.5910630971193314, "epoch": 4.030797946803546, "grad_norm": 0.3658318817615509, "learning_rate": 0.0002, "loss": 0.5959, "mean_token_accuracy": 0.7607601433992386, "num_tokens": 15646631.0, "step": 4321 }, { "entropy": 0.5717461556196213, "epoch": 4.031731217918805, "grad_norm": 0.36070871353149414, "learning_rate": 0.0002, "loss": 0.5771, "mean_token_accuracy": 0.7744189500808716, "num_tokens": 15650209.0, "step": 4322 }, { "entropy": 0.5687431842088699, "epoch": 4.032664489034064, "grad_norm": 0.37096795439720154, "learning_rate": 0.0002, "loss": 0.5761, "mean_token_accuracy": 0.7678748816251755, "num_tokens": 15653774.0, "step": 4323 }, { "entropy": 0.5841269493103027, "epoch": 4.033597760149323, "grad_norm": 0.37502437829971313, "learning_rate": 0.0002, "loss": 0.5764, "mean_token_accuracy": 0.7727383822202682, "num_tokens": 15657311.0, "step": 4324 }, { "entropy": 0.5584191679954529, "epoch": 4.034531031264582, "grad_norm": 0.42604199051856995, "learning_rate": 0.0002, "loss": 0.57, "mean_token_accuracy": 0.7777440696954727, "num_tokens": 15660846.0, "step": 4325 }, { "entropy": 0.5312782749533653, "epoch": 4.035464302379841, "grad_norm": 0.35560962557792664, "learning_rate": 0.0002, "loss": 0.5287, "mean_token_accuracy": 0.7862493395805359, "num_tokens": 15664470.0, "step": 4326 }, { "entropy": 0.5302693471312523, "epoch": 4.0363975734951, "grad_norm": 0.42508795857429504, "learning_rate": 0.0002, "loss": 0.5399, "mean_token_accuracy": 0.7818353325128555, "num_tokens": 15668063.0, "step": 4327 }, { "entropy": 0.5923568159341812, "epoch": 4.037330844610359, "grad_norm": 0.36380255222320557, "learning_rate": 0.0002, "loss": 0.5942, "mean_token_accuracy": 0.759965717792511, "num_tokens": 15671620.0, "step": 4328 }, { "entropy": 0.5149156004190445, "epoch": 4.038264115725618, "grad_norm": 0.38307228684425354, "learning_rate": 0.0002, "loss": 0.5243, "mean_token_accuracy": 0.7911986410617828, "num_tokens": 15675204.0, "step": 4329 }, { "entropy": 0.5630026906728745, "epoch": 4.039197386840877, "grad_norm": 0.3727272152900696, "learning_rate": 0.0002, "loss": 0.5564, "mean_token_accuracy": 0.7770478129386902, "num_tokens": 15678858.0, "step": 4330 }, { "entropy": 0.5532669872045517, "epoch": 4.040130657956136, "grad_norm": 0.3826324939727783, "learning_rate": 0.0002, "loss": 0.5572, "mean_token_accuracy": 0.7732905149459839, "num_tokens": 15682422.0, "step": 4331 }, { "entropy": 0.581944540143013, "epoch": 4.041063929071395, "grad_norm": 0.3706046938896179, "learning_rate": 0.0002, "loss": 0.5838, "mean_token_accuracy": 0.7618164420127869, "num_tokens": 15686139.0, "step": 4332 }, { "entropy": 0.543464332818985, "epoch": 4.041997200186654, "grad_norm": 0.39570188522338867, "learning_rate": 0.0002, "loss": 0.5513, "mean_token_accuracy": 0.7798887938261032, "num_tokens": 15689641.0, "step": 4333 }, { "entropy": 0.5682021975517273, "epoch": 4.042930471301913, "grad_norm": 0.35274356603622437, "learning_rate": 0.0002, "loss": 0.5685, "mean_token_accuracy": 0.7689846158027649, "num_tokens": 15693225.0, "step": 4334 }, { "entropy": 0.5610690861940384, "epoch": 4.043863742417172, "grad_norm": 0.37168532609939575, "learning_rate": 0.0002, "loss": 0.5635, "mean_token_accuracy": 0.776696503162384, "num_tokens": 15696832.0, "step": 4335 }, { "entropy": 0.5365568697452545, "epoch": 4.044797013532431, "grad_norm": 0.3752303123474121, "learning_rate": 0.0002, "loss": 0.5321, "mean_token_accuracy": 0.785913497209549, "num_tokens": 15700513.0, "step": 4336 }, { "entropy": 0.551312118768692, "epoch": 4.04573028464769, "grad_norm": 0.41410723328590393, "learning_rate": 0.0002, "loss": 0.5483, "mean_token_accuracy": 0.7806616872549057, "num_tokens": 15704054.0, "step": 4337 }, { "entropy": 0.5786242634057999, "epoch": 4.046663555762949, "grad_norm": 0.41100597381591797, "learning_rate": 0.0002, "loss": 0.5886, "mean_token_accuracy": 0.7630598992109299, "num_tokens": 15707682.0, "step": 4338 }, { "entropy": 0.5591981261968613, "epoch": 4.047596826878208, "grad_norm": 0.4512147009372711, "learning_rate": 0.0002, "loss": 0.5668, "mean_token_accuracy": 0.7708046734333038, "num_tokens": 15711341.0, "step": 4339 }, { "entropy": 0.5511506944894791, "epoch": 4.048530097993467, "grad_norm": 0.483455091714859, "learning_rate": 0.0002, "loss": 0.5639, "mean_token_accuracy": 0.7776842415332794, "num_tokens": 15714982.0, "step": 4340 }, { "entropy": 0.553556464612484, "epoch": 4.049463369108726, "grad_norm": 0.4402035176753998, "learning_rate": 0.0002, "loss": 0.5616, "mean_token_accuracy": 0.7722098976373672, "num_tokens": 15718690.0, "step": 4341 }, { "entropy": 0.5454882234334946, "epoch": 4.050396640223985, "grad_norm": 0.4129257798194885, "learning_rate": 0.0002, "loss": 0.5561, "mean_token_accuracy": 0.7746927291154861, "num_tokens": 15722335.0, "step": 4342 }, { "entropy": 0.547361746430397, "epoch": 4.051329911339244, "grad_norm": 0.37307876348495483, "learning_rate": 0.0002, "loss": 0.5441, "mean_token_accuracy": 0.7783026993274689, "num_tokens": 15725954.0, "step": 4343 }, { "entropy": 0.5114463865756989, "epoch": 4.052263182454503, "grad_norm": 0.36340320110321045, "learning_rate": 0.0002, "loss": 0.5091, "mean_token_accuracy": 0.7956323623657227, "num_tokens": 15729519.0, "step": 4344 }, { "entropy": 0.575357973575592, "epoch": 4.053196453569762, "grad_norm": 0.34505680203437805, "learning_rate": 0.0002, "loss": 0.5623, "mean_token_accuracy": 0.775127649307251, "num_tokens": 15733161.0, "step": 4345 }, { "entropy": 0.567499577999115, "epoch": 4.054129724685021, "grad_norm": 0.3648303151130676, "learning_rate": 0.0002, "loss": 0.5612, "mean_token_accuracy": 0.7733801603317261, "num_tokens": 15736704.0, "step": 4346 }, { "entropy": 0.5782628953456879, "epoch": 4.05506299580028, "grad_norm": 0.3507026731967926, "learning_rate": 0.0002, "loss": 0.579, "mean_token_accuracy": 0.7678195387125015, "num_tokens": 15740395.0, "step": 4347 }, { "entropy": 0.5696995109319687, "epoch": 4.055996266915539, "grad_norm": 0.4444122016429901, "learning_rate": 0.0002, "loss": 0.5716, "mean_token_accuracy": 0.7681450843811035, "num_tokens": 15743985.0, "step": 4348 }, { "entropy": 0.5118681117892265, "epoch": 4.056929538030798, "grad_norm": 0.3993775248527527, "learning_rate": 0.0002, "loss": 0.5155, "mean_token_accuracy": 0.797982782125473, "num_tokens": 15747534.0, "step": 4349 }, { "entropy": 0.5285969972610474, "epoch": 4.057862809146057, "grad_norm": 0.42587703466415405, "learning_rate": 0.0002, "loss": 0.5396, "mean_token_accuracy": 0.784998282790184, "num_tokens": 15751005.0, "step": 4350 }, { "entropy": 0.5653172880411148, "epoch": 4.058796080261316, "grad_norm": 0.437892884016037, "learning_rate": 0.0002, "loss": 0.5665, "mean_token_accuracy": 0.7744983732700348, "num_tokens": 15754535.0, "step": 4351 }, { "entropy": 0.5478359907865524, "epoch": 4.059729351376575, "grad_norm": 0.356773316860199, "learning_rate": 0.0002, "loss": 0.5541, "mean_token_accuracy": 0.7755768001079559, "num_tokens": 15758220.0, "step": 4352 }, { "entropy": 0.5445040464401245, "epoch": 4.060662622491834, "grad_norm": 0.35651880502700806, "learning_rate": 0.0002, "loss": 0.5561, "mean_token_accuracy": 0.7812622934579849, "num_tokens": 15761702.0, "step": 4353 }, { "entropy": 0.5328249856829643, "epoch": 4.061595893607093, "grad_norm": 0.3467646837234497, "learning_rate": 0.0002, "loss": 0.5463, "mean_token_accuracy": 0.782383993268013, "num_tokens": 15765306.0, "step": 4354 }, { "entropy": 0.6157335191965103, "epoch": 4.062529164722352, "grad_norm": 0.37333768606185913, "learning_rate": 0.0002, "loss": 0.6163, "mean_token_accuracy": 0.7538351714611053, "num_tokens": 15768990.0, "step": 4355 }, { "entropy": 0.5453936830163002, "epoch": 4.063462435837611, "grad_norm": 0.37382450699806213, "learning_rate": 0.0002, "loss": 0.5527, "mean_token_accuracy": 0.7775781452655792, "num_tokens": 15772615.0, "step": 4356 }, { "entropy": 0.5779876410961151, "epoch": 4.06439570695287, "grad_norm": 0.35245534777641296, "learning_rate": 0.0002, "loss": 0.5799, "mean_token_accuracy": 0.7674523890018463, "num_tokens": 15776342.0, "step": 4357 }, { "entropy": 0.5686265677213669, "epoch": 4.065328978068129, "grad_norm": 0.36990463733673096, "learning_rate": 0.0002, "loss": 0.5678, "mean_token_accuracy": 0.7683667987585068, "num_tokens": 15780098.0, "step": 4358 }, { "entropy": 0.5675742924213409, "epoch": 4.066262249183388, "grad_norm": 0.3924823999404907, "learning_rate": 0.0002, "loss": 0.5701, "mean_token_accuracy": 0.772771805524826, "num_tokens": 15783615.0, "step": 4359 }, { "entropy": 0.5847237706184387, "epoch": 4.067195520298647, "grad_norm": 0.39516112208366394, "learning_rate": 0.0002, "loss": 0.5878, "mean_token_accuracy": 0.7678819447755814, "num_tokens": 15787316.0, "step": 4360 }, { "entropy": 0.5345647037029266, "epoch": 4.068128791413906, "grad_norm": 0.4276958107948303, "learning_rate": 0.0002, "loss": 0.5379, "mean_token_accuracy": 0.7852991670370102, "num_tokens": 15790968.0, "step": 4361 }, { "entropy": 0.554614432156086, "epoch": 4.069062062529165, "grad_norm": 0.3584611415863037, "learning_rate": 0.0002, "loss": 0.5653, "mean_token_accuracy": 0.7772437334060669, "num_tokens": 15794600.0, "step": 4362 }, { "entropy": 0.5542571395635605, "epoch": 4.069995333644424, "grad_norm": 0.34528669714927673, "learning_rate": 0.0002, "loss": 0.5545, "mean_token_accuracy": 0.7787132263183594, "num_tokens": 15798371.0, "step": 4363 }, { "entropy": 0.5513459891080856, "epoch": 4.070928604759683, "grad_norm": 0.35674142837524414, "learning_rate": 0.0002, "loss": 0.5588, "mean_token_accuracy": 0.7754523009061813, "num_tokens": 15802047.0, "step": 4364 }, { "entropy": 0.5587810575962067, "epoch": 4.071861875874942, "grad_norm": 0.3467062711715698, "learning_rate": 0.0002, "loss": 0.5476, "mean_token_accuracy": 0.7795488387346268, "num_tokens": 15805669.0, "step": 4365 }, { "entropy": 0.5755652338266373, "epoch": 4.072795146990201, "grad_norm": 0.4413575530052185, "learning_rate": 0.0002, "loss": 0.582, "mean_token_accuracy": 0.7745592594146729, "num_tokens": 15809215.0, "step": 4366 }, { "entropy": 0.579313188791275, "epoch": 4.07372841810546, "grad_norm": 0.4168565571308136, "learning_rate": 0.0002, "loss": 0.5841, "mean_token_accuracy": 0.7740853577852249, "num_tokens": 15812875.0, "step": 4367 }, { "entropy": 0.5491799339652061, "epoch": 4.074661689220719, "grad_norm": 0.48278188705444336, "learning_rate": 0.0002, "loss": 0.5418, "mean_token_accuracy": 0.7812957614660263, "num_tokens": 15816310.0, "step": 4368 }, { "entropy": 0.5201648026704788, "epoch": 4.075594960335978, "grad_norm": 0.36417141556739807, "learning_rate": 0.0002, "loss": 0.5273, "mean_token_accuracy": 0.7853813022375107, "num_tokens": 15819876.0, "step": 4369 }, { "entropy": 0.5343703180551529, "epoch": 4.0765282314512366, "grad_norm": 0.3820527195930481, "learning_rate": 0.0002, "loss": 0.5354, "mean_token_accuracy": 0.7818801999092102, "num_tokens": 15823623.0, "step": 4370 }, { "entropy": 0.5636161863803864, "epoch": 4.0774615025664955, "grad_norm": 0.3725081980228424, "learning_rate": 0.0002, "loss": 0.5689, "mean_token_accuracy": 0.7773351967334747, "num_tokens": 15827173.0, "step": 4371 }, { "entropy": 0.554010421037674, "epoch": 4.0783947736817545, "grad_norm": 0.37581363320350647, "learning_rate": 0.0002, "loss": 0.5634, "mean_token_accuracy": 0.7730012536048889, "num_tokens": 15830857.0, "step": 4372 }, { "entropy": 0.5734675228595734, "epoch": 4.0793280447970135, "grad_norm": 0.39470601081848145, "learning_rate": 0.0002, "loss": 0.5914, "mean_token_accuracy": 0.7618004232645035, "num_tokens": 15834563.0, "step": 4373 }, { "entropy": 0.5691987872123718, "epoch": 4.0802613159122725, "grad_norm": 0.483268678188324, "learning_rate": 0.0002, "loss": 0.5828, "mean_token_accuracy": 0.7665334045886993, "num_tokens": 15838158.0, "step": 4374 }, { "entropy": 0.5525713711977005, "epoch": 4.0811945870275315, "grad_norm": 0.357499897480011, "learning_rate": 0.0002, "loss": 0.5683, "mean_token_accuracy": 0.7747593224048615, "num_tokens": 15841753.0, "step": 4375 }, { "entropy": 0.5199720412492752, "epoch": 4.0821278581427904, "grad_norm": 0.3666343092918396, "learning_rate": 0.0002, "loss": 0.5219, "mean_token_accuracy": 0.7940666228532791, "num_tokens": 15845247.0, "step": 4376 }, { "entropy": 0.5539559870958328, "epoch": 4.083061129258049, "grad_norm": 0.36106544733047485, "learning_rate": 0.0002, "loss": 0.5447, "mean_token_accuracy": 0.7814560234546661, "num_tokens": 15848969.0, "step": 4377 }, { "entropy": 0.5657629370689392, "epoch": 4.083994400373308, "grad_norm": 0.3233679234981537, "learning_rate": 0.0002, "loss": 0.5523, "mean_token_accuracy": 0.7814423143863678, "num_tokens": 15852719.0, "step": 4378 }, { "entropy": 0.563693031668663, "epoch": 4.084927671488567, "grad_norm": 0.3878922164440155, "learning_rate": 0.0002, "loss": 0.5619, "mean_token_accuracy": 0.774744063615799, "num_tokens": 15856391.0, "step": 4379 }, { "entropy": 0.5846759676933289, "epoch": 4.085860942603826, "grad_norm": 0.4122251868247986, "learning_rate": 0.0002, "loss": 0.5942, "mean_token_accuracy": 0.7638482600450516, "num_tokens": 15859890.0, "step": 4380 }, { "entropy": 0.5595483407378197, "epoch": 4.086794213719085, "grad_norm": 0.4141573905944824, "learning_rate": 0.0002, "loss": 0.5631, "mean_token_accuracy": 0.7739391475915909, "num_tokens": 15863700.0, "step": 4381 }, { "entropy": 0.5573016405105591, "epoch": 4.087727484834344, "grad_norm": 0.444029301404953, "learning_rate": 0.0002, "loss": 0.5708, "mean_token_accuracy": 0.7670770287513733, "num_tokens": 15867337.0, "step": 4382 }, { "entropy": 0.5615010634064674, "epoch": 4.088660755949603, "grad_norm": 0.3499749004840851, "learning_rate": 0.0002, "loss": 0.5558, "mean_token_accuracy": 0.7782021164894104, "num_tokens": 15870955.0, "step": 4383 }, { "entropy": 0.5452065318822861, "epoch": 4.089594027064862, "grad_norm": 0.3909960389137268, "learning_rate": 0.0002, "loss": 0.5446, "mean_token_accuracy": 0.785827025771141, "num_tokens": 15874619.0, "step": 4384 }, { "entropy": 0.5169099494814873, "epoch": 4.090527298180121, "grad_norm": 0.3814428746700287, "learning_rate": 0.0002, "loss": 0.5206, "mean_token_accuracy": 0.7959522902965546, "num_tokens": 15878143.0, "step": 4385 }, { "entropy": 0.5501350909471512, "epoch": 4.09146056929538, "grad_norm": 0.36494341492652893, "learning_rate": 0.0002, "loss": 0.5484, "mean_token_accuracy": 0.7784543633460999, "num_tokens": 15881722.0, "step": 4386 }, { "entropy": 0.5484140813350677, "epoch": 4.092393840410639, "grad_norm": 0.3990732729434967, "learning_rate": 0.0002, "loss": 0.5502, "mean_token_accuracy": 0.7747270613908768, "num_tokens": 15885367.0, "step": 4387 }, { "entropy": 0.5598793625831604, "epoch": 4.093327111525898, "grad_norm": 0.4579596519470215, "learning_rate": 0.0002, "loss": 0.5793, "mean_token_accuracy": 0.7746822983026505, "num_tokens": 15889048.0, "step": 4388 }, { "entropy": 0.569487676024437, "epoch": 4.094260382641157, "grad_norm": 0.37817642092704773, "learning_rate": 0.0002, "loss": 0.5673, "mean_token_accuracy": 0.770484670996666, "num_tokens": 15892754.0, "step": 4389 }, { "entropy": 0.551454946398735, "epoch": 4.095193653756416, "grad_norm": 0.40148138999938965, "learning_rate": 0.0002, "loss": 0.5597, "mean_token_accuracy": 0.7740621417760849, "num_tokens": 15896342.0, "step": 4390 }, { "entropy": 0.5688120871782303, "epoch": 4.096126924871675, "grad_norm": 0.41293197870254517, "learning_rate": 0.0002, "loss": 0.5763, "mean_token_accuracy": 0.765140026807785, "num_tokens": 15899921.0, "step": 4391 }, { "entropy": 0.5658110529184341, "epoch": 4.097060195986934, "grad_norm": 0.3917008936405182, "learning_rate": 0.0002, "loss": 0.5761, "mean_token_accuracy": 0.7726100236177444, "num_tokens": 15903484.0, "step": 4392 }, { "entropy": 0.5226412042975426, "epoch": 4.097993467102193, "grad_norm": 0.3575540781021118, "learning_rate": 0.0002, "loss": 0.5188, "mean_token_accuracy": 0.7888312190771103, "num_tokens": 15907192.0, "step": 4393 }, { "entropy": 0.5631369352340698, "epoch": 4.098926738217452, "grad_norm": 0.34116730093955994, "learning_rate": 0.0002, "loss": 0.5594, "mean_token_accuracy": 0.7728751450777054, "num_tokens": 15910769.0, "step": 4394 }, { "entropy": 0.5931248217821121, "epoch": 4.099860009332711, "grad_norm": 0.3457247316837311, "learning_rate": 0.0002, "loss": 0.5815, "mean_token_accuracy": 0.7680908888578415, "num_tokens": 15914458.0, "step": 4395 }, { "entropy": 0.5424012988805771, "epoch": 4.10079328044797, "grad_norm": 0.41280707716941833, "learning_rate": 0.0002, "loss": 0.5391, "mean_token_accuracy": 0.7801148444414139, "num_tokens": 15917990.0, "step": 4396 }, { "entropy": 0.5563458353281021, "epoch": 4.101726551563229, "grad_norm": 0.3784092366695404, "learning_rate": 0.0002, "loss": 0.561, "mean_token_accuracy": 0.7734064161777496, "num_tokens": 15921623.0, "step": 4397 }, { "entropy": 0.5318582504987717, "epoch": 4.102659822678488, "grad_norm": 0.36413851380348206, "learning_rate": 0.0002, "loss": 0.5208, "mean_token_accuracy": 0.7895262539386749, "num_tokens": 15925210.0, "step": 4398 }, { "entropy": 0.5667240619659424, "epoch": 4.103593093793747, "grad_norm": 0.43014049530029297, "learning_rate": 0.0002, "loss": 0.5738, "mean_token_accuracy": 0.7668261528015137, "num_tokens": 15928832.0, "step": 4399 }, { "entropy": 0.5705763399600983, "epoch": 4.104526364909006, "grad_norm": 0.39544597268104553, "learning_rate": 0.0002, "loss": 0.5794, "mean_token_accuracy": 0.7627464383840561, "num_tokens": 15932462.0, "step": 4400 }, { "entropy": 0.5627043098211288, "epoch": 4.105459636024265, "grad_norm": 0.41506877541542053, "learning_rate": 0.0002, "loss": 0.5824, "mean_token_accuracy": 0.7642354965209961, "num_tokens": 15936044.0, "step": 4401 }, { "entropy": 0.5881014317274094, "epoch": 4.106392907139524, "grad_norm": 0.36006489396095276, "learning_rate": 0.0002, "loss": 0.5843, "mean_token_accuracy": 0.7609738111495972, "num_tokens": 15939715.0, "step": 4402 }, { "entropy": 0.5214153900742531, "epoch": 4.107326178254783, "grad_norm": 0.34497570991516113, "learning_rate": 0.0002, "loss": 0.5213, "mean_token_accuracy": 0.7871826142072678, "num_tokens": 15943404.0, "step": 4403 }, { "entropy": 0.5389950722455978, "epoch": 4.108259449370042, "grad_norm": 0.4079255759716034, "learning_rate": 0.0002, "loss": 0.5407, "mean_token_accuracy": 0.7833768278360367, "num_tokens": 15947020.0, "step": 4404 }, { "entropy": 0.5719279795885086, "epoch": 4.109192720485301, "grad_norm": 0.31818878650665283, "learning_rate": 0.0002, "loss": 0.5637, "mean_token_accuracy": 0.7737962305545807, "num_tokens": 15950587.0, "step": 4405 }, { "entropy": 0.5904742479324341, "epoch": 4.11012599160056, "grad_norm": 0.40725910663604736, "learning_rate": 0.0002, "loss": 0.6012, "mean_token_accuracy": 0.7587917596101761, "num_tokens": 15954259.0, "step": 4406 }, { "entropy": 0.5748631060123444, "epoch": 4.111059262715819, "grad_norm": 0.367063969373703, "learning_rate": 0.0002, "loss": 0.5726, "mean_token_accuracy": 0.7682444155216217, "num_tokens": 15957849.0, "step": 4407 }, { "entropy": 0.5804377198219299, "epoch": 4.111992533831078, "grad_norm": 0.38982218503952026, "learning_rate": 0.0002, "loss": 0.5718, "mean_token_accuracy": 0.7667720913887024, "num_tokens": 15961433.0, "step": 4408 }, { "entropy": 0.5531233251094818, "epoch": 4.112925804946337, "grad_norm": 0.42229214310646057, "learning_rate": 0.0002, "loss": 0.5468, "mean_token_accuracy": 0.7848973572254181, "num_tokens": 15965098.0, "step": 4409 }, { "entropy": 0.6125704646110535, "epoch": 4.113859076061596, "grad_norm": 0.3975871503353119, "learning_rate": 0.0002, "loss": 0.6151, "mean_token_accuracy": 0.7498772293329239, "num_tokens": 15968848.0, "step": 4410 }, { "entropy": 0.5345346331596375, "epoch": 4.114792347176855, "grad_norm": 0.5533455610275269, "learning_rate": 0.0002, "loss": 0.5445, "mean_token_accuracy": 0.783808171749115, "num_tokens": 15972455.0, "step": 4411 }, { "entropy": 0.5651628822088242, "epoch": 4.115725618292114, "grad_norm": 0.49397748708724976, "learning_rate": 0.0002, "loss": 0.5886, "mean_token_accuracy": 0.766636997461319, "num_tokens": 15976005.0, "step": 4412 }, { "entropy": 0.5669545978307724, "epoch": 4.116658889407373, "grad_norm": 0.3837139308452606, "learning_rate": 0.0002, "loss": 0.5697, "mean_token_accuracy": 0.7705878019332886, "num_tokens": 15979641.0, "step": 4413 }, { "entropy": 0.5538704097270966, "epoch": 4.117592160522632, "grad_norm": 0.4029603600502014, "learning_rate": 0.0002, "loss": 0.5508, "mean_token_accuracy": 0.7761311531066895, "num_tokens": 15983159.0, "step": 4414 }, { "entropy": 0.5755529701709747, "epoch": 4.118525431637891, "grad_norm": 0.3801623284816742, "learning_rate": 0.0002, "loss": 0.5784, "mean_token_accuracy": 0.7693308591842651, "num_tokens": 15986783.0, "step": 4415 }, { "entropy": 0.6031590551137924, "epoch": 4.11945870275315, "grad_norm": 0.3869643211364746, "learning_rate": 0.0002, "loss": 0.5927, "mean_token_accuracy": 0.7602563500404358, "num_tokens": 15990446.0, "step": 4416 }, { "entropy": 0.5881306678056717, "epoch": 4.120391973868409, "grad_norm": 0.35071584582328796, "learning_rate": 0.0002, "loss": 0.5855, "mean_token_accuracy": 0.7576500773429871, "num_tokens": 15994054.0, "step": 4417 }, { "entropy": 0.5275491401553154, "epoch": 4.121325244983668, "grad_norm": 0.3480493128299713, "learning_rate": 0.0002, "loss": 0.5203, "mean_token_accuracy": 0.7914706766605377, "num_tokens": 15997749.0, "step": 4418 }, { "entropy": 0.5926742106676102, "epoch": 4.122258516098927, "grad_norm": 0.3841041624546051, "learning_rate": 0.0002, "loss": 0.5971, "mean_token_accuracy": 0.7556651830673218, "num_tokens": 16001503.0, "step": 4419 }, { "entropy": 0.5379577577114105, "epoch": 4.123191787214186, "grad_norm": 0.2910940647125244, "learning_rate": 0.0002, "loss": 0.536, "mean_token_accuracy": 0.7848014384508133, "num_tokens": 16005182.0, "step": 4420 }, { "entropy": 0.5617898553609848, "epoch": 4.124125058329445, "grad_norm": 0.42166396975517273, "learning_rate": 0.0002, "loss": 0.5708, "mean_token_accuracy": 0.7735068053007126, "num_tokens": 16008847.0, "step": 4421 }, { "entropy": 0.5645452737808228, "epoch": 4.125058329444704, "grad_norm": 0.39508384466171265, "learning_rate": 0.0002, "loss": 0.5715, "mean_token_accuracy": 0.7790709137916565, "num_tokens": 16012420.0, "step": 4422 }, { "entropy": 0.5886212438344955, "epoch": 4.125991600559963, "grad_norm": 0.4250461757183075, "learning_rate": 0.0002, "loss": 0.5975, "mean_token_accuracy": 0.75514817237854, "num_tokens": 16015996.0, "step": 4423 }, { "entropy": 0.5909144282341003, "epoch": 4.126924871675222, "grad_norm": 0.3710750341415405, "learning_rate": 0.0002, "loss": 0.6025, "mean_token_accuracy": 0.7556474953889847, "num_tokens": 16019705.0, "step": 4424 }, { "entropy": 0.565814420580864, "epoch": 4.127858142790481, "grad_norm": 0.4087642729282379, "learning_rate": 0.0002, "loss": 0.5691, "mean_token_accuracy": 0.7660773694515228, "num_tokens": 16023516.0, "step": 4425 }, { "entropy": 0.5362553000450134, "epoch": 4.12879141390574, "grad_norm": 0.39698588848114014, "learning_rate": 0.0002, "loss": 0.533, "mean_token_accuracy": 0.7866718918085098, "num_tokens": 16027168.0, "step": 4426 }, { "entropy": 0.57541124522686, "epoch": 4.1297246850209985, "grad_norm": 0.35051223635673523, "learning_rate": 0.0002, "loss": 0.57, "mean_token_accuracy": 0.7724707126617432, "num_tokens": 16030854.0, "step": 4427 }, { "entropy": 0.5471496731042862, "epoch": 4.1306579561362575, "grad_norm": 0.3827662467956543, "learning_rate": 0.0002, "loss": 0.5516, "mean_token_accuracy": 0.77565498650074, "num_tokens": 16034359.0, "step": 4428 }, { "entropy": 0.5535033792257309, "epoch": 4.1315912272515165, "grad_norm": 0.4327697455883026, "learning_rate": 0.0002, "loss": 0.5681, "mean_token_accuracy": 0.7779445350170135, "num_tokens": 16038037.0, "step": 4429 }, { "entropy": 0.5363893210887909, "epoch": 4.1325244983667755, "grad_norm": 0.4233139753341675, "learning_rate": 0.0002, "loss": 0.5432, "mean_token_accuracy": 0.7852437198162079, "num_tokens": 16041678.0, "step": 4430 }, { "entropy": 0.5811562538146973, "epoch": 4.1334577694820345, "grad_norm": 0.39563480019569397, "learning_rate": 0.0002, "loss": 0.5899, "mean_token_accuracy": 0.7625151723623276, "num_tokens": 16045343.0, "step": 4431 }, { "entropy": 0.5544935613870621, "epoch": 4.1343910405972935, "grad_norm": 0.3981841206550598, "learning_rate": 0.0002, "loss": 0.547, "mean_token_accuracy": 0.7840071469545364, "num_tokens": 16048996.0, "step": 4432 }, { "entropy": 0.5765390843153, "epoch": 4.135324311712552, "grad_norm": 0.42809754610061646, "learning_rate": 0.0002, "loss": 0.5792, "mean_token_accuracy": 0.766445517539978, "num_tokens": 16052688.0, "step": 4433 }, { "entropy": 0.5360845625400543, "epoch": 4.136257582827811, "grad_norm": 0.37710654735565186, "learning_rate": 0.0002, "loss": 0.5342, "mean_token_accuracy": 0.7823232710361481, "num_tokens": 16056226.0, "step": 4434 }, { "entropy": 0.553607851266861, "epoch": 4.13719085394307, "grad_norm": 0.43383264541625977, "learning_rate": 0.0002, "loss": 0.5584, "mean_token_accuracy": 0.770741418004036, "num_tokens": 16059887.0, "step": 4435 }, { "entropy": 0.5328576937317848, "epoch": 4.138124125058329, "grad_norm": 0.3369746804237366, "learning_rate": 0.0002, "loss": 0.5304, "mean_token_accuracy": 0.7845747619867325, "num_tokens": 16063417.0, "step": 4436 }, { "entropy": 0.5637959986925125, "epoch": 4.139057396173588, "grad_norm": 0.4271340072154999, "learning_rate": 0.0002, "loss": 0.5733, "mean_token_accuracy": 0.7716436833143234, "num_tokens": 16066934.0, "step": 4437 }, { "entropy": 0.5333796888589859, "epoch": 4.139990667288847, "grad_norm": 0.3972529470920563, "learning_rate": 0.0002, "loss": 0.5429, "mean_token_accuracy": 0.775852620601654, "num_tokens": 16070455.0, "step": 4438 }, { "entropy": 0.5799658745527267, "epoch": 4.140923938404106, "grad_norm": 0.4817938208580017, "learning_rate": 0.0002, "loss": 0.6011, "mean_token_accuracy": 0.7579192370176315, "num_tokens": 16074105.0, "step": 4439 }, { "entropy": 0.5858430415391922, "epoch": 4.141857209519365, "grad_norm": 0.47453686594963074, "learning_rate": 0.0002, "loss": 0.6009, "mean_token_accuracy": 0.7595655918121338, "num_tokens": 16077785.0, "step": 4440 }, { "entropy": 0.5383895635604858, "epoch": 4.142790480634624, "grad_norm": 0.3593476414680481, "learning_rate": 0.0002, "loss": 0.5446, "mean_token_accuracy": 0.7859454303979874, "num_tokens": 16081368.0, "step": 4441 }, { "entropy": 0.5844596922397614, "epoch": 4.143723751749883, "grad_norm": 0.3897358775138855, "learning_rate": 0.0002, "loss": 0.5905, "mean_token_accuracy": 0.762002557516098, "num_tokens": 16084940.0, "step": 4442 }, { "entropy": 0.5610780715942383, "epoch": 4.144657022865142, "grad_norm": 0.3647034168243408, "learning_rate": 0.0002, "loss": 0.5627, "mean_token_accuracy": 0.7712987810373306, "num_tokens": 16088552.0, "step": 4443 }, { "entropy": 0.5394036173820496, "epoch": 4.145590293980401, "grad_norm": 0.4191771447658539, "learning_rate": 0.0002, "loss": 0.5374, "mean_token_accuracy": 0.7807114720344543, "num_tokens": 16092135.0, "step": 4444 }, { "entropy": 0.5799668282270432, "epoch": 4.14652356509566, "grad_norm": 0.33805549144744873, "learning_rate": 0.0002, "loss": 0.5756, "mean_token_accuracy": 0.7699174731969833, "num_tokens": 16095830.0, "step": 4445 }, { "entropy": 0.5882242918014526, "epoch": 4.147456836210919, "grad_norm": 0.39074501395225525, "learning_rate": 0.0002, "loss": 0.5974, "mean_token_accuracy": 0.7619688957929611, "num_tokens": 16099495.0, "step": 4446 }, { "entropy": 0.58576300740242, "epoch": 4.148390107326178, "grad_norm": 0.4362803101539612, "learning_rate": 0.0002, "loss": 0.5886, "mean_token_accuracy": 0.7588661313056946, "num_tokens": 16103050.0, "step": 4447 }, { "entropy": 0.5560915917158127, "epoch": 4.149323378441437, "grad_norm": 0.36938080191612244, "learning_rate": 0.0002, "loss": 0.5479, "mean_token_accuracy": 0.7794412076473236, "num_tokens": 16106708.0, "step": 4448 }, { "entropy": 0.5404798984527588, "epoch": 4.150256649556696, "grad_norm": 0.4012331962585449, "learning_rate": 0.0002, "loss": 0.5441, "mean_token_accuracy": 0.7834742069244385, "num_tokens": 16110274.0, "step": 4449 }, { "entropy": 0.5514147654175758, "epoch": 4.151189920671955, "grad_norm": 0.35605019330978394, "learning_rate": 0.0002, "loss": 0.5438, "mean_token_accuracy": 0.7828662544488907, "num_tokens": 16114007.0, "step": 4450 }, { "entropy": 0.5571222454309464, "epoch": 4.152123191787214, "grad_norm": 0.4114798903465271, "learning_rate": 0.0002, "loss": 0.5605, "mean_token_accuracy": 0.7708314657211304, "num_tokens": 16117597.0, "step": 4451 }, { "entropy": 0.5352520570158958, "epoch": 4.153056462902473, "grad_norm": 0.4071026146411896, "learning_rate": 0.0002, "loss": 0.5425, "mean_token_accuracy": 0.7797213941812515, "num_tokens": 16121245.0, "step": 4452 }, { "entropy": 0.5436455234885216, "epoch": 4.153989734017732, "grad_norm": 0.5323375463485718, "learning_rate": 0.0002, "loss": 0.5582, "mean_token_accuracy": 0.7761431932449341, "num_tokens": 16124864.0, "step": 4453 }, { "entropy": 0.6192911565303802, "epoch": 4.154923005132991, "grad_norm": 0.3971819579601288, "learning_rate": 0.0002, "loss": 0.6292, "mean_token_accuracy": 0.7487154901027679, "num_tokens": 16128525.0, "step": 4454 }, { "entropy": 0.5409982204437256, "epoch": 4.15585627624825, "grad_norm": 0.3764752447605133, "learning_rate": 0.0002, "loss": 0.5456, "mean_token_accuracy": 0.7805296182632446, "num_tokens": 16132182.0, "step": 4455 }, { "entropy": 0.5544396638870239, "epoch": 4.156789547363509, "grad_norm": 0.39595457911491394, "learning_rate": 0.0002, "loss": 0.5659, "mean_token_accuracy": 0.7707878053188324, "num_tokens": 16135778.0, "step": 4456 }, { "entropy": 0.5675004571676254, "epoch": 4.157722818478768, "grad_norm": 0.4170816242694855, "learning_rate": 0.0002, "loss": 0.575, "mean_token_accuracy": 0.7664197683334351, "num_tokens": 16139297.0, "step": 4457 }, { "entropy": 0.5677419155836105, "epoch": 4.158656089594027, "grad_norm": 0.33762887120246887, "learning_rate": 0.0002, "loss": 0.5728, "mean_token_accuracy": 0.7672965973615646, "num_tokens": 16142948.0, "step": 4458 }, { "entropy": 0.5792926996946335, "epoch": 4.159589360709286, "grad_norm": 0.32875898480415344, "learning_rate": 0.0002, "loss": 0.5714, "mean_token_accuracy": 0.7728397399187088, "num_tokens": 16146614.0, "step": 4459 }, { "entropy": 0.5707026422023773, "epoch": 4.160522631824545, "grad_norm": 0.3788049519062042, "learning_rate": 0.0002, "loss": 0.565, "mean_token_accuracy": 0.7706453055143356, "num_tokens": 16150232.0, "step": 4460 }, { "entropy": 0.5643679350614548, "epoch": 4.161455902939804, "grad_norm": 0.3269674479961395, "learning_rate": 0.0002, "loss": 0.5568, "mean_token_accuracy": 0.7724134474992752, "num_tokens": 16153853.0, "step": 4461 }, { "entropy": 0.552356943488121, "epoch": 4.162389174055063, "grad_norm": 0.3684431314468384, "learning_rate": 0.0002, "loss": 0.5507, "mean_token_accuracy": 0.7769126892089844, "num_tokens": 16157465.0, "step": 4462 }, { "entropy": 0.6048998236656189, "epoch": 4.163322445170322, "grad_norm": 0.42384257912635803, "learning_rate": 0.0002, "loss": 0.6014, "mean_token_accuracy": 0.753729984164238, "num_tokens": 16161160.0, "step": 4463 }, { "entropy": 0.5698466897010803, "epoch": 4.164255716285581, "grad_norm": 0.4123237431049347, "learning_rate": 0.0002, "loss": 0.5721, "mean_token_accuracy": 0.764623612165451, "num_tokens": 16164823.0, "step": 4464 }, { "entropy": 0.5277955457568169, "epoch": 4.16518898740084, "grad_norm": 0.4114566147327423, "learning_rate": 0.0002, "loss": 0.5404, "mean_token_accuracy": 0.7825662344694138, "num_tokens": 16168323.0, "step": 4465 }, { "entropy": 0.571990892291069, "epoch": 4.166122258516099, "grad_norm": 0.3914051949977875, "learning_rate": 0.0002, "loss": 0.5643, "mean_token_accuracy": 0.774555504322052, "num_tokens": 16172034.0, "step": 4466 }, { "entropy": 0.5603059381246567, "epoch": 4.167055529631358, "grad_norm": 0.46525004506111145, "learning_rate": 0.0002, "loss": 0.5764, "mean_token_accuracy": 0.7663080990314484, "num_tokens": 16175506.0, "step": 4467 }, { "entropy": 0.5131365060806274, "epoch": 4.167988800746617, "grad_norm": 0.3808034658432007, "learning_rate": 0.0002, "loss": 0.5214, "mean_token_accuracy": 0.785764530301094, "num_tokens": 16179040.0, "step": 4468 }, { "entropy": 0.5662645697593689, "epoch": 4.168922071861876, "grad_norm": 0.42223840951919556, "learning_rate": 0.0002, "loss": 0.5723, "mean_token_accuracy": 0.7669072151184082, "num_tokens": 16182638.0, "step": 4469 }, { "entropy": 0.5388799160718918, "epoch": 4.169855342977135, "grad_norm": 0.33347252011299133, "learning_rate": 0.0002, "loss": 0.5383, "mean_token_accuracy": 0.781153067946434, "num_tokens": 16186373.0, "step": 4470 }, { "entropy": 0.5199207738041878, "epoch": 4.170788614092394, "grad_norm": 0.37282198667526245, "learning_rate": 0.0002, "loss": 0.522, "mean_token_accuracy": 0.7930797934532166, "num_tokens": 16190031.0, "step": 4471 }, { "entropy": 0.5342549905180931, "epoch": 4.171721885207653, "grad_norm": 0.43613868951797485, "learning_rate": 0.0002, "loss": 0.5518, "mean_token_accuracy": 0.778131976723671, "num_tokens": 16193543.0, "step": 4472 }, { "entropy": 0.546243779361248, "epoch": 4.172655156322912, "grad_norm": 0.4062199890613556, "learning_rate": 0.0002, "loss": 0.5533, "mean_token_accuracy": 0.7757628560066223, "num_tokens": 16197163.0, "step": 4473 }, { "entropy": 0.5493147373199463, "epoch": 4.173588427438171, "grad_norm": 0.39030107855796814, "learning_rate": 0.0002, "loss": 0.5574, "mean_token_accuracy": 0.7764144092798233, "num_tokens": 16200858.0, "step": 4474 }, { "entropy": 0.5795082896947861, "epoch": 4.17452169855343, "grad_norm": 0.3530479073524475, "learning_rate": 0.0002, "loss": 0.5696, "mean_token_accuracy": 0.7732589691877365, "num_tokens": 16204526.0, "step": 4475 }, { "entropy": 0.579336866736412, "epoch": 4.175454969668689, "grad_norm": 0.3835013210773468, "learning_rate": 0.0002, "loss": 0.5851, "mean_token_accuracy": 0.7657412886619568, "num_tokens": 16208115.0, "step": 4476 }, { "entropy": 0.5648431330919266, "epoch": 4.176388240783948, "grad_norm": 0.3688071668148041, "learning_rate": 0.0002, "loss": 0.5571, "mean_token_accuracy": 0.7766186445951462, "num_tokens": 16211713.0, "step": 4477 }, { "entropy": 0.5574924498796463, "epoch": 4.177321511899207, "grad_norm": 0.33675137162208557, "learning_rate": 0.0002, "loss": 0.5534, "mean_token_accuracy": 0.7802775800228119, "num_tokens": 16215239.0, "step": 4478 }, { "entropy": 0.5455467253923416, "epoch": 4.178254783014466, "grad_norm": 0.3578890562057495, "learning_rate": 0.0002, "loss": 0.5438, "mean_token_accuracy": 0.7836681753396988, "num_tokens": 16218921.0, "step": 4479 }, { "entropy": 0.5583456009626389, "epoch": 4.179188054129725, "grad_norm": 0.4160853326320648, "learning_rate": 0.0002, "loss": 0.5731, "mean_token_accuracy": 0.7689127027988434, "num_tokens": 16222520.0, "step": 4480 }, { "entropy": 0.5333013162016869, "epoch": 4.180121325244984, "grad_norm": 0.42274150252342224, "learning_rate": 0.0002, "loss": 0.5393, "mean_token_accuracy": 0.7818892300128937, "num_tokens": 16226063.0, "step": 4481 }, { "entropy": 0.559693306684494, "epoch": 4.181054596360243, "grad_norm": 0.44452112913131714, "learning_rate": 0.0002, "loss": 0.5679, "mean_token_accuracy": 0.7688615322113037, "num_tokens": 16229707.0, "step": 4482 }, { "entropy": 0.5479262173175812, "epoch": 4.1819878674755016, "grad_norm": 0.4062844514846802, "learning_rate": 0.0002, "loss": 0.5517, "mean_token_accuracy": 0.7768008708953857, "num_tokens": 16233374.0, "step": 4483 }, { "entropy": 0.527716763317585, "epoch": 4.1829211385907605, "grad_norm": 0.340544730424881, "learning_rate": 0.0002, "loss": 0.532, "mean_token_accuracy": 0.7823335528373718, "num_tokens": 16237015.0, "step": 4484 }, { "entropy": 0.5484567880630493, "epoch": 4.1838544097060195, "grad_norm": 0.4316791296005249, "learning_rate": 0.0002, "loss": 0.556, "mean_token_accuracy": 0.7733221650123596, "num_tokens": 16240562.0, "step": 4485 }, { "entropy": 0.5708028078079224, "epoch": 4.1847876808212785, "grad_norm": 0.4671587646007538, "learning_rate": 0.0002, "loss": 0.5797, "mean_token_accuracy": 0.7729039490222931, "num_tokens": 16244129.0, "step": 4486 }, { "entropy": 0.5718712657690048, "epoch": 4.1857209519365375, "grad_norm": 0.4063180088996887, "learning_rate": 0.0002, "loss": 0.5778, "mean_token_accuracy": 0.7744385451078415, "num_tokens": 16247694.0, "step": 4487 }, { "entropy": 0.5837290436029434, "epoch": 4.1866542230517965, "grad_norm": 0.3640226423740387, "learning_rate": 0.0002, "loss": 0.58, "mean_token_accuracy": 0.7625810652971268, "num_tokens": 16251344.0, "step": 4488 }, { "entropy": 0.5227133259177208, "epoch": 4.1875874941670554, "grad_norm": 0.36458808183670044, "learning_rate": 0.0002, "loss": 0.5199, "mean_token_accuracy": 0.7922407686710358, "num_tokens": 16254962.0, "step": 4489 }, { "entropy": 0.5693673342466354, "epoch": 4.188520765282314, "grad_norm": 0.416572242975235, "learning_rate": 0.0002, "loss": 0.5805, "mean_token_accuracy": 0.7645738571882248, "num_tokens": 16258509.0, "step": 4490 }, { "entropy": 0.542713887989521, "epoch": 4.189454036397573, "grad_norm": 0.3925401568412781, "learning_rate": 0.0002, "loss": 0.5528, "mean_token_accuracy": 0.7774144262075424, "num_tokens": 16262126.0, "step": 4491 }, { "entropy": 0.5552328526973724, "epoch": 4.190387307512832, "grad_norm": 0.40719103813171387, "learning_rate": 0.0002, "loss": 0.5669, "mean_token_accuracy": 0.7742655724287033, "num_tokens": 16265814.0, "step": 4492 }, { "entropy": 0.5548240840435028, "epoch": 4.191320578628091, "grad_norm": 0.359979510307312, "learning_rate": 0.0002, "loss": 0.5658, "mean_token_accuracy": 0.7773142755031586, "num_tokens": 16269494.0, "step": 4493 }, { "entropy": 0.573658674955368, "epoch": 4.19225384974335, "grad_norm": 0.4649319350719452, "learning_rate": 0.0002, "loss": 0.5932, "mean_token_accuracy": 0.7631372064352036, "num_tokens": 16273191.0, "step": 4494 }, { "entropy": 0.5404131561517715, "epoch": 4.193187120858609, "grad_norm": 0.415968120098114, "learning_rate": 0.0002, "loss": 0.5554, "mean_token_accuracy": 0.7748799920082092, "num_tokens": 16276769.0, "step": 4495 }, { "entropy": 0.5250860154628754, "epoch": 4.194120391973868, "grad_norm": 0.3497310280799866, "learning_rate": 0.0002, "loss": 0.5272, "mean_token_accuracy": 0.7851834893226624, "num_tokens": 16280372.0, "step": 4496 }, { "entropy": 0.5905834138393402, "epoch": 4.195053663089127, "grad_norm": 0.4444243013858795, "learning_rate": 0.0002, "loss": 0.5878, "mean_token_accuracy": 0.7691342085599899, "num_tokens": 16283902.0, "step": 4497 }, { "entropy": 0.5490048974752426, "epoch": 4.195986934204386, "grad_norm": 0.3995142877101898, "learning_rate": 0.0002, "loss": 0.5523, "mean_token_accuracy": 0.7766209989786148, "num_tokens": 16287537.0, "step": 4498 }, { "entropy": 0.5602352768182755, "epoch": 4.196920205319645, "grad_norm": 0.3514252305030823, "learning_rate": 0.0002, "loss": 0.5574, "mean_token_accuracy": 0.7702897787094116, "num_tokens": 16291114.0, "step": 4499 }, { "entropy": 0.5639408677816391, "epoch": 4.197853476434904, "grad_norm": 0.3562808632850647, "learning_rate": 0.0002, "loss": 0.5684, "mean_token_accuracy": 0.7750012129545212, "num_tokens": 16294840.0, "step": 4500 }, { "entropy": 0.5542402118444443, "epoch": 4.198786747550163, "grad_norm": 0.34151703119277954, "learning_rate": 0.0002, "loss": 0.5539, "mean_token_accuracy": 0.7783383131027222, "num_tokens": 16298615.0, "step": 4501 }, { "entropy": 0.5454935729503632, "epoch": 4.199720018665422, "grad_norm": 0.34914302825927734, "learning_rate": 0.0002, "loss": 0.5402, "mean_token_accuracy": 0.7876157462596893, "num_tokens": 16302133.0, "step": 4502 }, { "entropy": 0.5308671370148659, "epoch": 4.200653289780681, "grad_norm": 0.41737061738967896, "learning_rate": 0.0002, "loss": 0.5267, "mean_token_accuracy": 0.7843996733427048, "num_tokens": 16305589.0, "step": 4503 }, { "entropy": 0.568629115819931, "epoch": 4.20158656089594, "grad_norm": 0.4494345486164093, "learning_rate": 0.0002, "loss": 0.577, "mean_token_accuracy": 0.7678923457860947, "num_tokens": 16309216.0, "step": 4504 }, { "entropy": 0.5185042843222618, "epoch": 4.202519832011199, "grad_norm": 0.4084509611129761, "learning_rate": 0.0002, "loss": 0.5237, "mean_token_accuracy": 0.7869435697793961, "num_tokens": 16312832.0, "step": 4505 }, { "entropy": 0.5557620376348495, "epoch": 4.203453103126458, "grad_norm": 0.37701573967933655, "learning_rate": 0.0002, "loss": 0.5719, "mean_token_accuracy": 0.7726753801107407, "num_tokens": 16316320.0, "step": 4506 }, { "entropy": 0.5802404880523682, "epoch": 4.204386374241717, "grad_norm": 0.7581464052200317, "learning_rate": 0.0002, "loss": 0.5905, "mean_token_accuracy": 0.7634428143501282, "num_tokens": 16320061.0, "step": 4507 }, { "entropy": 0.5538036376237869, "epoch": 4.205319645356976, "grad_norm": 0.3564257323741913, "learning_rate": 0.0002, "loss": 0.5471, "mean_token_accuracy": 0.7812828123569489, "num_tokens": 16323764.0, "step": 4508 }, { "entropy": 0.4979523867368698, "epoch": 4.206252916472235, "grad_norm": 0.40721994638442993, "learning_rate": 0.0002, "loss": 0.5217, "mean_token_accuracy": 0.7915256023406982, "num_tokens": 16327374.0, "step": 4509 }, { "entropy": 0.5106080919504166, "epoch": 4.207186187587494, "grad_norm": 0.4494459629058838, "learning_rate": 0.0002, "loss": 0.5345, "mean_token_accuracy": 0.7837228327989578, "num_tokens": 16330849.0, "step": 4510 }, { "entropy": 0.6228400468826294, "epoch": 4.208119458702753, "grad_norm": 0.899654746055603, "learning_rate": 0.0002, "loss": 0.6375, "mean_token_accuracy": 0.7464903444051743, "num_tokens": 16334553.0, "step": 4511 }, { "entropy": 0.5854363590478897, "epoch": 4.209052729818012, "grad_norm": 0.49035152792930603, "learning_rate": 0.0002, "loss": 0.5818, "mean_token_accuracy": 0.7577978521585464, "num_tokens": 16338204.0, "step": 4512 }, { "entropy": 0.5608862787485123, "epoch": 4.209986000933271, "grad_norm": 0.45967376232147217, "learning_rate": 0.0002, "loss": 0.5566, "mean_token_accuracy": 0.7759836614131927, "num_tokens": 16341842.0, "step": 4513 }, { "entropy": 0.565482884645462, "epoch": 4.21091927204853, "grad_norm": 0.36401206254959106, "learning_rate": 0.0002, "loss": 0.5635, "mean_token_accuracy": 0.7814563363790512, "num_tokens": 16345355.0, "step": 4514 }, { "entropy": 0.5879130661487579, "epoch": 4.211852543163789, "grad_norm": 0.34686607122421265, "learning_rate": 0.0002, "loss": 0.5858, "mean_token_accuracy": 0.765937015414238, "num_tokens": 16349013.0, "step": 4515 }, { "entropy": 0.5628704726696014, "epoch": 4.212785814279048, "grad_norm": 0.4435798227787018, "learning_rate": 0.0002, "loss": 0.5684, "mean_token_accuracy": 0.7748207598924637, "num_tokens": 16352607.0, "step": 4516 }, { "entropy": 0.5510977357625961, "epoch": 4.213719085394307, "grad_norm": 0.5922273993492126, "learning_rate": 0.0002, "loss": 0.57, "mean_token_accuracy": 0.776917353272438, "num_tokens": 16356159.0, "step": 4517 }, { "entropy": 0.5830553770065308, "epoch": 4.214652356509566, "grad_norm": 0.47999247908592224, "learning_rate": 0.0002, "loss": 0.5857, "mean_token_accuracy": 0.7721502482891083, "num_tokens": 16359766.0, "step": 4518 }, { "entropy": 0.5355807393789291, "epoch": 4.215585627624825, "grad_norm": 0.46855854988098145, "learning_rate": 0.0002, "loss": 0.5497, "mean_token_accuracy": 0.7801009863615036, "num_tokens": 16363409.0, "step": 4519 }, { "entropy": 0.5515029281377792, "epoch": 4.216518898740084, "grad_norm": 0.38685548305511475, "learning_rate": 0.0002, "loss": 0.5581, "mean_token_accuracy": 0.7746160179376602, "num_tokens": 16366935.0, "step": 4520 }, { "entropy": 0.5604618862271309, "epoch": 4.217452169855343, "grad_norm": 0.3747440576553345, "learning_rate": 0.0002, "loss": 0.5592, "mean_token_accuracy": 0.7727388292551041, "num_tokens": 16370669.0, "step": 4521 }, { "entropy": 0.5709213763475418, "epoch": 4.218385440970602, "grad_norm": 0.39434975385665894, "learning_rate": 0.0002, "loss": 0.5694, "mean_token_accuracy": 0.776545524597168, "num_tokens": 16374315.0, "step": 4522 }, { "entropy": 0.5955444425344467, "epoch": 4.219318712085861, "grad_norm": 0.3787243068218231, "learning_rate": 0.0002, "loss": 0.5909, "mean_token_accuracy": 0.7630526572465897, "num_tokens": 16378109.0, "step": 4523 }, { "entropy": 0.5334986746311188, "epoch": 4.22025198320112, "grad_norm": 0.3305770754814148, "learning_rate": 0.0002, "loss": 0.5296, "mean_token_accuracy": 0.7817317247390747, "num_tokens": 16381723.0, "step": 4524 }, { "entropy": 0.5449931025505066, "epoch": 4.221185254316379, "grad_norm": 0.43413999676704407, "learning_rate": 0.0002, "loss": 0.5443, "mean_token_accuracy": 0.7830724865198135, "num_tokens": 16385395.0, "step": 4525 }, { "entropy": 0.5331474244594574, "epoch": 4.222118525431638, "grad_norm": 0.4225224256515503, "learning_rate": 0.0002, "loss": 0.5523, "mean_token_accuracy": 0.7790732532739639, "num_tokens": 16388973.0, "step": 4526 }, { "entropy": 0.5559306293725967, "epoch": 4.223051796546897, "grad_norm": 0.41010338068008423, "learning_rate": 0.0002, "loss": 0.5648, "mean_token_accuracy": 0.776974618434906, "num_tokens": 16392639.0, "step": 4527 }, { "entropy": 0.5695668309926987, "epoch": 4.223985067662156, "grad_norm": 0.4870885908603668, "learning_rate": 0.0002, "loss": 0.5793, "mean_token_accuracy": 0.762761577963829, "num_tokens": 16396233.0, "step": 4528 }, { "entropy": 0.522407628595829, "epoch": 4.224918338777415, "grad_norm": 0.4166819155216217, "learning_rate": 0.0002, "loss": 0.5262, "mean_token_accuracy": 0.7908060252666473, "num_tokens": 16399828.0, "step": 4529 }, { "entropy": 0.5289699286222458, "epoch": 4.225851609892674, "grad_norm": 0.39065057039260864, "learning_rate": 0.0002, "loss": 0.5351, "mean_token_accuracy": 0.7887045443058014, "num_tokens": 16403587.0, "step": 4530 }, { "entropy": 0.5517182946205139, "epoch": 4.226784881007933, "grad_norm": 0.3959144055843353, "learning_rate": 0.0002, "loss": 0.5421, "mean_token_accuracy": 0.7890835702419281, "num_tokens": 16407142.0, "step": 4531 }, { "entropy": 0.5876867920160294, "epoch": 4.227718152123192, "grad_norm": 0.34318530559539795, "learning_rate": 0.0002, "loss": 0.5789, "mean_token_accuracy": 0.7670533210039139, "num_tokens": 16410740.0, "step": 4532 }, { "entropy": 0.5384373664855957, "epoch": 4.228651423238451, "grad_norm": 0.34387582540512085, "learning_rate": 0.0002, "loss": 0.5342, "mean_token_accuracy": 0.7783515155315399, "num_tokens": 16414388.0, "step": 4533 }, { "entropy": 0.5513866096735001, "epoch": 4.22958469435371, "grad_norm": 0.36237654089927673, "learning_rate": 0.0002, "loss": 0.5518, "mean_token_accuracy": 0.7787488996982574, "num_tokens": 16418017.0, "step": 4534 }, { "entropy": 0.5528467297554016, "epoch": 4.230517965468969, "grad_norm": 0.36615580320358276, "learning_rate": 0.0002, "loss": 0.5458, "mean_token_accuracy": 0.7844164967536926, "num_tokens": 16421644.0, "step": 4535 }, { "entropy": 0.5241330415010452, "epoch": 4.231451236584228, "grad_norm": 0.43057534098625183, "learning_rate": 0.0002, "loss": 0.5262, "mean_token_accuracy": 0.7853270918130875, "num_tokens": 16425317.0, "step": 4536 }, { "entropy": 0.579550102353096, "epoch": 4.232384507699487, "grad_norm": 0.4063197672367096, "learning_rate": 0.0002, "loss": 0.576, "mean_token_accuracy": 0.7663921564817429, "num_tokens": 16429028.0, "step": 4537 }, { "entropy": 0.5691178143024445, "epoch": 4.233317778814746, "grad_norm": 0.40930721163749695, "learning_rate": 0.0002, "loss": 0.5797, "mean_token_accuracy": 0.7651995420455933, "num_tokens": 16432779.0, "step": 4538 }, { "entropy": 0.5662098079919815, "epoch": 4.234251049930005, "grad_norm": 0.5893768668174744, "learning_rate": 0.0002, "loss": 0.5965, "mean_token_accuracy": 0.766404926776886, "num_tokens": 16436361.0, "step": 4539 }, { "entropy": 0.5478901118040085, "epoch": 4.2351843210452635, "grad_norm": 0.3598502278327942, "learning_rate": 0.0002, "loss": 0.5489, "mean_token_accuracy": 0.775253176689148, "num_tokens": 16440156.0, "step": 4540 }, { "entropy": 0.5601105690002441, "epoch": 4.2361175921605225, "grad_norm": 0.4260481297969818, "learning_rate": 0.0002, "loss": 0.5711, "mean_token_accuracy": 0.7751760929822922, "num_tokens": 16443769.0, "step": 4541 }, { "entropy": 0.5491198897361755, "epoch": 4.2370508632757815, "grad_norm": 0.4296196401119232, "learning_rate": 0.0002, "loss": 0.5538, "mean_token_accuracy": 0.7779155522584915, "num_tokens": 16447349.0, "step": 4542 }, { "entropy": 0.5806834399700165, "epoch": 4.2379841343910405, "grad_norm": 0.38172921538352966, "learning_rate": 0.0002, "loss": 0.5908, "mean_token_accuracy": 0.7645335346460342, "num_tokens": 16450962.0, "step": 4543 }, { "entropy": 0.5437842309474945, "epoch": 4.2389174055062995, "grad_norm": 0.4206923842430115, "learning_rate": 0.0002, "loss": 0.5532, "mean_token_accuracy": 0.7734051495790482, "num_tokens": 16454498.0, "step": 4544 }, { "entropy": 0.6044817119836807, "epoch": 4.2398506766215585, "grad_norm": 0.35585999488830566, "learning_rate": 0.0002, "loss": 0.6035, "mean_token_accuracy": 0.7597494572401047, "num_tokens": 16458163.0, "step": 4545 }, { "entropy": 0.5571728199720383, "epoch": 4.240783947736817, "grad_norm": 0.347603440284729, "learning_rate": 0.0002, "loss": 0.5586, "mean_token_accuracy": 0.7740349918603897, "num_tokens": 16461704.0, "step": 4546 }, { "entropy": 0.5969555377960205, "epoch": 4.241717218852076, "grad_norm": 0.44922980666160583, "learning_rate": 0.0002, "loss": 0.601, "mean_token_accuracy": 0.7519570887088776, "num_tokens": 16465260.0, "step": 4547 }, { "entropy": 0.5566169321537018, "epoch": 4.242650489967335, "grad_norm": 0.36194726824760437, "learning_rate": 0.0002, "loss": 0.5527, "mean_token_accuracy": 0.7725875079631805, "num_tokens": 16468847.0, "step": 4548 }, { "entropy": 0.5331284403800964, "epoch": 4.243583761082594, "grad_norm": 0.35673680901527405, "learning_rate": 0.0002, "loss": 0.5359, "mean_token_accuracy": 0.7809505611658096, "num_tokens": 16472528.0, "step": 4549 }, { "entropy": 0.5221046209335327, "epoch": 4.244517032197853, "grad_norm": 0.3584834039211273, "learning_rate": 0.0002, "loss": 0.5366, "mean_token_accuracy": 0.7867422252893448, "num_tokens": 16476163.0, "step": 4550 }, { "entropy": 0.6171246320009232, "epoch": 4.245450303313112, "grad_norm": 0.3716277778148651, "learning_rate": 0.0002, "loss": 0.6032, "mean_token_accuracy": 0.7573423087596893, "num_tokens": 16479921.0, "step": 4551 }, { "entropy": 0.5746903792023659, "epoch": 4.246383574428371, "grad_norm": 0.39970505237579346, "learning_rate": 0.0002, "loss": 0.5837, "mean_token_accuracy": 0.7684673219919205, "num_tokens": 16483514.0, "step": 4552 }, { "entropy": 0.5439817905426025, "epoch": 4.24731684554363, "grad_norm": 0.45723673701286316, "learning_rate": 0.0002, "loss": 0.5546, "mean_token_accuracy": 0.780216172337532, "num_tokens": 16487149.0, "step": 4553 }, { "entropy": 0.5562909469008446, "epoch": 4.248250116658889, "grad_norm": 0.37735116481781006, "learning_rate": 0.0002, "loss": 0.5608, "mean_token_accuracy": 0.773700937628746, "num_tokens": 16490857.0, "step": 4554 }, { "entropy": 0.5791928917169571, "epoch": 4.249183387774148, "grad_norm": 0.4481351971626282, "learning_rate": 0.0002, "loss": 0.5957, "mean_token_accuracy": 0.7541704475879669, "num_tokens": 16494490.0, "step": 4555 }, { "entropy": 0.5741078406572342, "epoch": 4.250116658889407, "grad_norm": 0.49846628308296204, "learning_rate": 0.0002, "loss": 0.5864, "mean_token_accuracy": 0.7613990604877472, "num_tokens": 16498200.0, "step": 4556 }, { "entropy": 0.5691894292831421, "epoch": 4.251049930004666, "grad_norm": 0.3294290602207184, "learning_rate": 0.0002, "loss": 0.5646, "mean_token_accuracy": 0.7735412567853928, "num_tokens": 16501940.0, "step": 4557 }, { "entropy": 0.5761946439743042, "epoch": 4.251983201119925, "grad_norm": 0.4437555968761444, "learning_rate": 0.0002, "loss": 0.5748, "mean_token_accuracy": 0.7671098709106445, "num_tokens": 16505565.0, "step": 4558 }, { "entropy": 0.5870713293552399, "epoch": 4.252916472235184, "grad_norm": 0.41930556297302246, "learning_rate": 0.0002, "loss": 0.5901, "mean_token_accuracy": 0.7616381198167801, "num_tokens": 16509353.0, "step": 4559 }, { "entropy": 0.5366149991750717, "epoch": 4.253849743350443, "grad_norm": 0.3634549081325531, "learning_rate": 0.0002, "loss": 0.5351, "mean_token_accuracy": 0.7859407365322113, "num_tokens": 16513043.0, "step": 4560 }, { "entropy": 0.5390907227993011, "epoch": 4.254783014465702, "grad_norm": 0.36334067583084106, "learning_rate": 0.0002, "loss": 0.5325, "mean_token_accuracy": 0.7893301993608475, "num_tokens": 16516579.0, "step": 4561 }, { "entropy": 0.586410716176033, "epoch": 4.255716285580961, "grad_norm": 0.3426935076713562, "learning_rate": 0.0002, "loss": 0.5772, "mean_token_accuracy": 0.7725787162780762, "num_tokens": 16520242.0, "step": 4562 }, { "entropy": 0.5439328402280807, "epoch": 4.25664955669622, "grad_norm": 0.42412158846855164, "learning_rate": 0.0002, "loss": 0.5416, "mean_token_accuracy": 0.7864137887954712, "num_tokens": 16523845.0, "step": 4563 }, { "entropy": 0.5684704631567001, "epoch": 4.257582827811479, "grad_norm": 0.3812936842441559, "learning_rate": 0.0002, "loss": 0.567, "mean_token_accuracy": 0.7735758870840073, "num_tokens": 16527497.0, "step": 4564 }, { "entropy": 0.5477355271577835, "epoch": 4.258516098926738, "grad_norm": 0.35875648260116577, "learning_rate": 0.0002, "loss": 0.5405, "mean_token_accuracy": 0.781907469034195, "num_tokens": 16531065.0, "step": 4565 }, { "entropy": 0.5447731167078018, "epoch": 4.259449370041997, "grad_norm": 0.3861584961414337, "learning_rate": 0.0002, "loss": 0.5453, "mean_token_accuracy": 0.7826742827892303, "num_tokens": 16534582.0, "step": 4566 }, { "entropy": 0.5752970427274704, "epoch": 4.260382641157256, "grad_norm": 0.3489152491092682, "learning_rate": 0.0002, "loss": 0.5677, "mean_token_accuracy": 0.7713454812765121, "num_tokens": 16538318.0, "step": 4567 }, { "entropy": 0.5247169509530067, "epoch": 4.261315912272515, "grad_norm": 0.42027297616004944, "learning_rate": 0.0002, "loss": 0.5386, "mean_token_accuracy": 0.7862849682569504, "num_tokens": 16541796.0, "step": 4568 }, { "entropy": 0.5521944761276245, "epoch": 4.262249183387774, "grad_norm": 0.4701298475265503, "learning_rate": 0.0002, "loss": 0.5726, "mean_token_accuracy": 0.7705735862255096, "num_tokens": 16545371.0, "step": 4569 }, { "entropy": 0.5512542426586151, "epoch": 4.263182454503033, "grad_norm": 0.4199792742729187, "learning_rate": 0.0002, "loss": 0.5733, "mean_token_accuracy": 0.7705302387475967, "num_tokens": 16548848.0, "step": 4570 }, { "entropy": 0.5466200113296509, "epoch": 4.264115725618292, "grad_norm": 0.38444212079048157, "learning_rate": 0.0002, "loss": 0.5544, "mean_token_accuracy": 0.773516520857811, "num_tokens": 16552373.0, "step": 4571 }, { "entropy": 0.5831708014011383, "epoch": 4.265048996733551, "grad_norm": 0.4732194244861603, "learning_rate": 0.0002, "loss": 0.5867, "mean_token_accuracy": 0.770376518368721, "num_tokens": 16556093.0, "step": 4572 }, { "entropy": 0.6070472449064255, "epoch": 4.26598226784881, "grad_norm": 0.3389182686805725, "learning_rate": 0.0002, "loss": 0.6002, "mean_token_accuracy": 0.7625418603420258, "num_tokens": 16559835.0, "step": 4573 }, { "entropy": 0.5243826806545258, "epoch": 4.266915538964069, "grad_norm": 0.33370673656463623, "learning_rate": 0.0002, "loss": 0.5204, "mean_token_accuracy": 0.7866966128349304, "num_tokens": 16563443.0, "step": 4574 }, { "entropy": 0.5613622963428497, "epoch": 4.267848810079328, "grad_norm": 0.40627050399780273, "learning_rate": 0.0002, "loss": 0.5559, "mean_token_accuracy": 0.776722177863121, "num_tokens": 16567168.0, "step": 4575 }, { "entropy": 0.5671862065792084, "epoch": 4.268782081194587, "grad_norm": 0.3959299325942993, "learning_rate": 0.0002, "loss": 0.5779, "mean_token_accuracy": 0.764758437871933, "num_tokens": 16570841.0, "step": 4576 }, { "entropy": 0.5504554063081741, "epoch": 4.269715352309846, "grad_norm": 0.41689175367355347, "learning_rate": 0.0002, "loss": 0.5552, "mean_token_accuracy": 0.7748226672410965, "num_tokens": 16574329.0, "step": 4577 }, { "entropy": 0.5528155416250229, "epoch": 4.270648623425105, "grad_norm": 0.4188728332519531, "learning_rate": 0.0002, "loss": 0.5651, "mean_token_accuracy": 0.7717592120170593, "num_tokens": 16577951.0, "step": 4578 }, { "entropy": 0.5724599808454514, "epoch": 4.271581894540364, "grad_norm": 0.3458888828754425, "learning_rate": 0.0002, "loss": 0.576, "mean_token_accuracy": 0.7706823348999023, "num_tokens": 16581610.0, "step": 4579 }, { "entropy": 0.5822378396987915, "epoch": 4.272515165655623, "grad_norm": 0.4617227017879486, "learning_rate": 0.0002, "loss": 0.5874, "mean_token_accuracy": 0.7635507732629776, "num_tokens": 16585192.0, "step": 4580 }, { "entropy": 0.5726703852415085, "epoch": 4.273448436770882, "grad_norm": 0.3836043179035187, "learning_rate": 0.0002, "loss": 0.5831, "mean_token_accuracy": 0.764376237988472, "num_tokens": 16588886.0, "step": 4581 }, { "entropy": 0.5412057712674141, "epoch": 4.274381707886141, "grad_norm": 0.44249022006988525, "learning_rate": 0.0002, "loss": 0.5436, "mean_token_accuracy": 0.7779527753591537, "num_tokens": 16592539.0, "step": 4582 }, { "entropy": 0.5754400491714478, "epoch": 4.2753149790014, "grad_norm": 0.4521276354789734, "learning_rate": 0.0002, "loss": 0.5779, "mean_token_accuracy": 0.7700896114110947, "num_tokens": 16596072.0, "step": 4583 }, { "entropy": 0.548817902803421, "epoch": 4.276248250116659, "grad_norm": 0.3890576660633087, "learning_rate": 0.0002, "loss": 0.5484, "mean_token_accuracy": 0.772969976067543, "num_tokens": 16599769.0, "step": 4584 }, { "entropy": 0.5526015013456345, "epoch": 4.277181521231918, "grad_norm": 0.3896269202232361, "learning_rate": 0.0002, "loss": 0.5624, "mean_token_accuracy": 0.7758597433567047, "num_tokens": 16603404.0, "step": 4585 }, { "entropy": 0.5556777268648148, "epoch": 4.278114792347177, "grad_norm": 0.5212846398353577, "learning_rate": 0.0002, "loss": 0.5701, "mean_token_accuracy": 0.7698825299739838, "num_tokens": 16606890.0, "step": 4586 }, { "entropy": 0.5650320649147034, "epoch": 4.279048063462436, "grad_norm": 0.45980075001716614, "learning_rate": 0.0002, "loss": 0.5755, "mean_token_accuracy": 0.7676198929548264, "num_tokens": 16610562.0, "step": 4587 }, { "entropy": 0.5655693411827087, "epoch": 4.279981334577695, "grad_norm": 0.36335209012031555, "learning_rate": 0.0002, "loss": 0.5597, "mean_token_accuracy": 0.7713259607553482, "num_tokens": 16614225.0, "step": 4588 }, { "entropy": 0.5841951817274094, "epoch": 4.280914605692954, "grad_norm": 0.37272143363952637, "learning_rate": 0.0002, "loss": 0.5923, "mean_token_accuracy": 0.7545136511325836, "num_tokens": 16617826.0, "step": 4589 }, { "entropy": 0.5855627804994583, "epoch": 4.281847876808213, "grad_norm": 0.49933281540870667, "learning_rate": 0.0002, "loss": 0.594, "mean_token_accuracy": 0.7555498480796814, "num_tokens": 16621359.0, "step": 4590 }, { "entropy": 0.5840333998203278, "epoch": 4.282781147923472, "grad_norm": 0.39573967456817627, "learning_rate": 0.0002, "loss": 0.5779, "mean_token_accuracy": 0.7715776115655899, "num_tokens": 16624992.0, "step": 4591 }, { "entropy": 0.5893713384866714, "epoch": 4.283714419038731, "grad_norm": 0.33937036991119385, "learning_rate": 0.0002, "loss": 0.5925, "mean_token_accuracy": 0.7606225311756134, "num_tokens": 16628683.0, "step": 4592 }, { "entropy": 0.6217667609453201, "epoch": 4.28464769015399, "grad_norm": 0.3892073929309845, "learning_rate": 0.0002, "loss": 0.6252, "mean_token_accuracy": 0.7480908036231995, "num_tokens": 16632443.0, "step": 4593 }, { "entropy": 0.5691698789596558, "epoch": 4.285580961269249, "grad_norm": 0.3751169741153717, "learning_rate": 0.0002, "loss": 0.5689, "mean_token_accuracy": 0.7691144347190857, "num_tokens": 16635969.0, "step": 4594 }, { "entropy": 0.6049752086400986, "epoch": 4.286514232384508, "grad_norm": 0.39304545521736145, "learning_rate": 0.0002, "loss": 0.6018, "mean_token_accuracy": 0.7571255564689636, "num_tokens": 16639531.0, "step": 4595 }, { "entropy": 0.6027060896158218, "epoch": 4.2874475034997666, "grad_norm": 0.3702416718006134, "learning_rate": 0.0002, "loss": 0.5887, "mean_token_accuracy": 0.767469123005867, "num_tokens": 16643167.0, "step": 4596 }, { "entropy": 0.5644853413105011, "epoch": 4.2883807746150255, "grad_norm": 0.3617715835571289, "learning_rate": 0.0002, "loss": 0.5698, "mean_token_accuracy": 0.7744456827640533, "num_tokens": 16646612.0, "step": 4597 }, { "entropy": 0.5630577281117439, "epoch": 4.2893140457302845, "grad_norm": 0.4086513817310333, "learning_rate": 0.0002, "loss": 0.5591, "mean_token_accuracy": 0.7714471369981766, "num_tokens": 16650193.0, "step": 4598 }, { "entropy": 0.6052750051021576, "epoch": 4.2902473168455435, "grad_norm": 0.39043867588043213, "learning_rate": 0.0002, "loss": 0.6014, "mean_token_accuracy": 0.758299246430397, "num_tokens": 16653962.0, "step": 4599 }, { "entropy": 0.5697626322507858, "epoch": 4.2911805879608025, "grad_norm": 0.3842417299747467, "learning_rate": 0.0002, "loss": 0.579, "mean_token_accuracy": 0.7704851776361465, "num_tokens": 16657632.0, "step": 4600 }, { "entropy": 0.556911051273346, "epoch": 4.2921138590760615, "grad_norm": 0.490234911441803, "learning_rate": 0.0002, "loss": 0.5855, "mean_token_accuracy": 0.7638583034276962, "num_tokens": 16661322.0, "step": 4601 }, { "entropy": 0.5764216929674149, "epoch": 4.2930471301913204, "grad_norm": 0.4171096980571747, "learning_rate": 0.0002, "loss": 0.584, "mean_token_accuracy": 0.7671753764152527, "num_tokens": 16664914.0, "step": 4602 }, { "entropy": 0.5697965323925018, "epoch": 4.293980401306579, "grad_norm": 0.405622661113739, "learning_rate": 0.0002, "loss": 0.5795, "mean_token_accuracy": 0.7672004550695419, "num_tokens": 16668503.0, "step": 4603 }, { "entropy": 0.5471824482083321, "epoch": 4.294913672421838, "grad_norm": 0.3818334937095642, "learning_rate": 0.0002, "loss": 0.5585, "mean_token_accuracy": 0.7784087061882019, "num_tokens": 16672092.0, "step": 4604 }, { "entropy": 0.5760680586099625, "epoch": 4.295846943537097, "grad_norm": 0.3953736126422882, "learning_rate": 0.0002, "loss": 0.5773, "mean_token_accuracy": 0.7649439871311188, "num_tokens": 16675742.0, "step": 4605 }, { "entropy": 0.5464799255132675, "epoch": 4.296780214652356, "grad_norm": 0.3404422700405121, "learning_rate": 0.0002, "loss": 0.5381, "mean_token_accuracy": 0.7903730422258377, "num_tokens": 16679331.0, "step": 4606 }, { "entropy": 0.5560935139656067, "epoch": 4.297713485767615, "grad_norm": 0.42352694272994995, "learning_rate": 0.0002, "loss": 0.5552, "mean_token_accuracy": 0.7784016281366348, "num_tokens": 16682797.0, "step": 4607 }, { "entropy": 0.5620854794979095, "epoch": 4.298646756882874, "grad_norm": 0.4155040979385376, "learning_rate": 0.0002, "loss": 0.5554, "mean_token_accuracy": 0.7748286426067352, "num_tokens": 16686531.0, "step": 4608 }, { "entropy": 0.5386885702610016, "epoch": 4.299580027998133, "grad_norm": 0.40871959924697876, "learning_rate": 0.0002, "loss": 0.5547, "mean_token_accuracy": 0.7750599384307861, "num_tokens": 16689999.0, "step": 4609 }, { "entropy": 0.5657964795827866, "epoch": 4.300513299113392, "grad_norm": 0.38618016242980957, "learning_rate": 0.0002, "loss": 0.5637, "mean_token_accuracy": 0.7754220515489578, "num_tokens": 16693626.0, "step": 4610 }, { "entropy": 0.5748744606971741, "epoch": 4.301446570228651, "grad_norm": 0.3862861692905426, "learning_rate": 0.0002, "loss": 0.5806, "mean_token_accuracy": 0.7655459344387054, "num_tokens": 16697304.0, "step": 4611 }, { "entropy": 0.598087415099144, "epoch": 4.30237984134391, "grad_norm": 0.3259623944759369, "learning_rate": 0.0002, "loss": 0.5971, "mean_token_accuracy": 0.7569666653871536, "num_tokens": 16700961.0, "step": 4612 }, { "entropy": 0.5708233863115311, "epoch": 4.303313112459169, "grad_norm": 0.38811710476875305, "learning_rate": 0.0002, "loss": 0.5726, "mean_token_accuracy": 0.768126517534256, "num_tokens": 16704606.0, "step": 4613 }, { "entropy": 0.542794793844223, "epoch": 4.304246383574428, "grad_norm": 0.4362645149230957, "learning_rate": 0.0002, "loss": 0.5451, "mean_token_accuracy": 0.7830037772655487, "num_tokens": 16708218.0, "step": 4614 }, { "entropy": 0.6123175024986267, "epoch": 4.305179654689687, "grad_norm": 0.46499866247177124, "learning_rate": 0.0002, "loss": 0.6274, "mean_token_accuracy": 0.7460983395576477, "num_tokens": 16712026.0, "step": 4615 }, { "entropy": 0.512028656899929, "epoch": 4.306112925804946, "grad_norm": 0.4586975574493408, "learning_rate": 0.0002, "loss": 0.5139, "mean_token_accuracy": 0.7913431972265244, "num_tokens": 16715589.0, "step": 4616 }, { "entropy": 0.5402645617723465, "epoch": 4.307046196920205, "grad_norm": 0.3739077150821686, "learning_rate": 0.0002, "loss": 0.5414, "mean_token_accuracy": 0.7789601385593414, "num_tokens": 16719194.0, "step": 4617 }, { "entropy": 0.5710839927196503, "epoch": 4.307979468035464, "grad_norm": 0.3656806945800781, "learning_rate": 0.0002, "loss": 0.5719, "mean_token_accuracy": 0.7691105753183365, "num_tokens": 16722853.0, "step": 4618 }, { "entropy": 0.5934586077928543, "epoch": 4.308912739150723, "grad_norm": 0.37234845757484436, "learning_rate": 0.0002, "loss": 0.5961, "mean_token_accuracy": 0.7645404487848282, "num_tokens": 16726465.0, "step": 4619 }, { "entropy": 0.5686552673578262, "epoch": 4.309846010265982, "grad_norm": 0.3808324933052063, "learning_rate": 0.0002, "loss": 0.5632, "mean_token_accuracy": 0.7716092169284821, "num_tokens": 16730047.0, "step": 4620 }, { "entropy": 0.5910189300775528, "epoch": 4.310779281381241, "grad_norm": 0.4038862884044647, "learning_rate": 0.0002, "loss": 0.5764, "mean_token_accuracy": 0.7660815417766571, "num_tokens": 16733653.0, "step": 4621 }, { "entropy": 0.5517419129610062, "epoch": 4.3117125524965, "grad_norm": 0.37767869234085083, "learning_rate": 0.0002, "loss": 0.5565, "mean_token_accuracy": 0.7726212292909622, "num_tokens": 16737267.0, "step": 4622 }, { "entropy": 0.5681179314851761, "epoch": 4.312645823611759, "grad_norm": 0.4112359583377838, "learning_rate": 0.0002, "loss": 0.5691, "mean_token_accuracy": 0.7666919827461243, "num_tokens": 16740993.0, "step": 4623 }, { "entropy": 0.5479011237621307, "epoch": 4.313579094727018, "grad_norm": 0.39576083421707153, "learning_rate": 0.0002, "loss": 0.5591, "mean_token_accuracy": 0.7745470106601715, "num_tokens": 16744572.0, "step": 4624 }, { "entropy": 0.5561527758836746, "epoch": 4.314512365842277, "grad_norm": 0.4742555618286133, "learning_rate": 0.0002, "loss": 0.5602, "mean_token_accuracy": 0.7753775268793106, "num_tokens": 16748144.0, "step": 4625 }, { "entropy": 0.5562687665224075, "epoch": 4.315445636957536, "grad_norm": 0.3753771185874939, "learning_rate": 0.0002, "loss": 0.5539, "mean_token_accuracy": 0.7729810178279877, "num_tokens": 16751664.0, "step": 4626 }, { "entropy": 0.5505030751228333, "epoch": 4.316378908072795, "grad_norm": 0.37984994053840637, "learning_rate": 0.0002, "loss": 0.5495, "mean_token_accuracy": 0.7788436114788055, "num_tokens": 16755338.0, "step": 4627 }, { "entropy": 0.5423202142119408, "epoch": 4.317312179188054, "grad_norm": 0.3984731435775757, "learning_rate": 0.0002, "loss": 0.5552, "mean_token_accuracy": 0.777241662144661, "num_tokens": 16758896.0, "step": 4628 }, { "entropy": 0.5806672424077988, "epoch": 4.318245450303313, "grad_norm": 0.5797415375709534, "learning_rate": 0.0002, "loss": 0.5993, "mean_token_accuracy": 0.7603085041046143, "num_tokens": 16762400.0, "step": 4629 }, { "entropy": 0.5614344850182533, "epoch": 4.319178721418572, "grad_norm": 0.3831437826156616, "learning_rate": 0.0002, "loss": 0.5661, "mean_token_accuracy": 0.7686126977205276, "num_tokens": 16765932.0, "step": 4630 }, { "entropy": 0.5449123829603195, "epoch": 4.320111992533831, "grad_norm": 0.3455878794193268, "learning_rate": 0.0002, "loss": 0.5416, "mean_token_accuracy": 0.7816454172134399, "num_tokens": 16769550.0, "step": 4631 }, { "entropy": 0.5775397419929504, "epoch": 4.32104526364909, "grad_norm": 0.35588666796684265, "learning_rate": 0.0002, "loss": 0.5786, "mean_token_accuracy": 0.7685042768716812, "num_tokens": 16773207.0, "step": 4632 }, { "entropy": 0.5417838096618652, "epoch": 4.321978534764349, "grad_norm": 0.379192590713501, "learning_rate": 0.0002, "loss": 0.543, "mean_token_accuracy": 0.774483934044838, "num_tokens": 16776728.0, "step": 4633 }, { "entropy": 0.6085532903671265, "epoch": 4.322911805879608, "grad_norm": 0.4344077408313751, "learning_rate": 0.0002, "loss": 0.6119, "mean_token_accuracy": 0.7501601278781891, "num_tokens": 16780381.0, "step": 4634 }, { "entropy": 0.5989659875631332, "epoch": 4.323845076994867, "grad_norm": 0.359725683927536, "learning_rate": 0.0002, "loss": 0.5806, "mean_token_accuracy": 0.7642632573843002, "num_tokens": 16783961.0, "step": 4635 }, { "entropy": 0.5592113882303238, "epoch": 4.324778348110126, "grad_norm": 0.3689648509025574, "learning_rate": 0.0002, "loss": 0.556, "mean_token_accuracy": 0.7810871005058289, "num_tokens": 16787507.0, "step": 4636 }, { "entropy": 0.5766878426074982, "epoch": 4.325711619225385, "grad_norm": 0.4172409772872925, "learning_rate": 0.0002, "loss": 0.5805, "mean_token_accuracy": 0.7664443105459213, "num_tokens": 16791155.0, "step": 4637 }, { "entropy": 0.5305728167295456, "epoch": 4.326644890340644, "grad_norm": 0.44531023502349854, "learning_rate": 0.0002, "loss": 0.5406, "mean_token_accuracy": 0.783518061041832, "num_tokens": 16794723.0, "step": 4638 }, { "entropy": 0.5695598721504211, "epoch": 4.327578161455903, "grad_norm": 0.3627854883670807, "learning_rate": 0.0002, "loss": 0.5702, "mean_token_accuracy": 0.765716165304184, "num_tokens": 16798385.0, "step": 4639 }, { "entropy": 0.5538819432258606, "epoch": 4.328511432571162, "grad_norm": 0.368792325258255, "learning_rate": 0.0002, "loss": 0.5536, "mean_token_accuracy": 0.774614542722702, "num_tokens": 16802010.0, "step": 4640 }, { "entropy": 0.5377375781536102, "epoch": 4.329444703686421, "grad_norm": 0.41519737243652344, "learning_rate": 0.0002, "loss": 0.5368, "mean_token_accuracy": 0.7819859683513641, "num_tokens": 16805698.0, "step": 4641 }, { "entropy": 0.5558690056204796, "epoch": 4.33037797480168, "grad_norm": 0.4604337513446808, "learning_rate": 0.0002, "loss": 0.5716, "mean_token_accuracy": 0.7668971419334412, "num_tokens": 16809359.0, "step": 4642 }, { "entropy": 0.617822915315628, "epoch": 4.331311245916939, "grad_norm": 0.4477773606777191, "learning_rate": 0.0002, "loss": 0.6108, "mean_token_accuracy": 0.7612538635730743, "num_tokens": 16813143.0, "step": 4643 }, { "entropy": 0.5827959179878235, "epoch": 4.332244517032198, "grad_norm": 0.36747023463249207, "learning_rate": 0.0002, "loss": 0.5815, "mean_token_accuracy": 0.7606979459524155, "num_tokens": 16816787.0, "step": 4644 }, { "entropy": 0.5307241380214691, "epoch": 4.333177788147457, "grad_norm": 0.380158931016922, "learning_rate": 0.0002, "loss": 0.5419, "mean_token_accuracy": 0.7853983640670776, "num_tokens": 16820302.0, "step": 4645 }, { "entropy": 0.543260358273983, "epoch": 4.334111059262716, "grad_norm": 0.39682817459106445, "learning_rate": 0.0002, "loss": 0.5544, "mean_token_accuracy": 0.7782559096813202, "num_tokens": 16823962.0, "step": 4646 }, { "entropy": 0.5935095995664597, "epoch": 4.335044330377975, "grad_norm": 0.42336687445640564, "learning_rate": 0.0002, "loss": 0.6038, "mean_token_accuracy": 0.7575225532054901, "num_tokens": 16827690.0, "step": 4647 }, { "entropy": 0.5757811814546585, "epoch": 4.335977601493234, "grad_norm": 0.36186179518699646, "learning_rate": 0.0002, "loss": 0.5732, "mean_token_accuracy": 0.7716682702302933, "num_tokens": 16831435.0, "step": 4648 }, { "entropy": 0.5683554410934448, "epoch": 4.336910872608493, "grad_norm": 0.4756685793399811, "learning_rate": 0.0002, "loss": 0.5661, "mean_token_accuracy": 0.7770765721797943, "num_tokens": 16835050.0, "step": 4649 }, { "entropy": 0.583026796579361, "epoch": 4.337844143723752, "grad_norm": 0.36454981565475464, "learning_rate": 0.0002, "loss": 0.5836, "mean_token_accuracy": 0.7655878812074661, "num_tokens": 16838699.0, "step": 4650 }, { "entropy": 0.576143428683281, "epoch": 4.338777414839011, "grad_norm": 0.3763681650161743, "learning_rate": 0.0002, "loss": 0.5831, "mean_token_accuracy": 0.7650084644556046, "num_tokens": 16842297.0, "step": 4651 }, { "entropy": 0.5683173388242722, "epoch": 4.33971068595427, "grad_norm": 0.425941526889801, "learning_rate": 0.0002, "loss": 0.5741, "mean_token_accuracy": 0.7658594846725464, "num_tokens": 16845906.0, "step": 4652 }, { "entropy": 0.5661130249500275, "epoch": 4.3406439570695285, "grad_norm": 0.331843763589859, "learning_rate": 0.0002, "loss": 0.5767, "mean_token_accuracy": 0.7689116448163986, "num_tokens": 16849606.0, "step": 4653 }, { "entropy": 0.5815854072570801, "epoch": 4.3415772281847875, "grad_norm": 0.35378992557525635, "learning_rate": 0.0002, "loss": 0.5826, "mean_token_accuracy": 0.7671300321817398, "num_tokens": 16853381.0, "step": 4654 }, { "entropy": 0.5650876611471176, "epoch": 4.3425104993000465, "grad_norm": 0.37343937158584595, "learning_rate": 0.0002, "loss": 0.5727, "mean_token_accuracy": 0.7652049362659454, "num_tokens": 16857023.0, "step": 4655 }, { "entropy": 0.5756454765796661, "epoch": 4.3434437704153055, "grad_norm": 0.42546308040618896, "learning_rate": 0.0002, "loss": 0.5853, "mean_token_accuracy": 0.7601268589496613, "num_tokens": 16860610.0, "step": 4656 }, { "entropy": 0.5572742372751236, "epoch": 4.3443770415305645, "grad_norm": 0.37221020460128784, "learning_rate": 0.0002, "loss": 0.5597, "mean_token_accuracy": 0.7744613140821457, "num_tokens": 16864252.0, "step": 4657 }, { "entropy": 0.5753507167100906, "epoch": 4.3453103126458235, "grad_norm": 0.3967035412788391, "learning_rate": 0.0002, "loss": 0.5801, "mean_token_accuracy": 0.7623188644647598, "num_tokens": 16867843.0, "step": 4658 }, { "entropy": 0.4974977821111679, "epoch": 4.346243583761082, "grad_norm": 0.3385363519191742, "learning_rate": 0.0002, "loss": 0.5024, "mean_token_accuracy": 0.7933927327394485, "num_tokens": 16871381.0, "step": 4659 }, { "entropy": 0.5614067763090134, "epoch": 4.347176854876341, "grad_norm": 0.4345538318157196, "learning_rate": 0.0002, "loss": 0.5628, "mean_token_accuracy": 0.7743257731199265, "num_tokens": 16874985.0, "step": 4660 }, { "entropy": 0.578956201672554, "epoch": 4.3481101259916, "grad_norm": 0.3668467104434967, "learning_rate": 0.0002, "loss": 0.578, "mean_token_accuracy": 0.7713659107685089, "num_tokens": 16878549.0, "step": 4661 }, { "entropy": 0.5829433053731918, "epoch": 4.349043397106859, "grad_norm": 0.3731113374233246, "learning_rate": 0.0002, "loss": 0.5888, "mean_token_accuracy": 0.7633968144655228, "num_tokens": 16882267.0, "step": 4662 }, { "entropy": 0.5497891381382942, "epoch": 4.349976668222118, "grad_norm": 0.36125853657722473, "learning_rate": 0.0002, "loss": 0.5672, "mean_token_accuracy": 0.7703017890453339, "num_tokens": 16885966.0, "step": 4663 }, { "entropy": 0.5734807997941971, "epoch": 4.350909939337377, "grad_norm": 0.3324778974056244, "learning_rate": 0.0002, "loss": 0.5684, "mean_token_accuracy": 0.7734586894512177, "num_tokens": 16889590.0, "step": 4664 }, { "entropy": 0.5303533226251602, "epoch": 4.351843210452636, "grad_norm": 0.5030075311660767, "learning_rate": 0.0002, "loss": 0.5262, "mean_token_accuracy": 0.7891038060188293, "num_tokens": 16893204.0, "step": 4665 }, { "entropy": 0.5276463329792023, "epoch": 4.352776481567895, "grad_norm": 0.37648308277130127, "learning_rate": 0.0002, "loss": 0.5284, "mean_token_accuracy": 0.7791541814804077, "num_tokens": 16896754.0, "step": 4666 }, { "entropy": 0.5677337199449539, "epoch": 4.353709752683154, "grad_norm": 0.37096065282821655, "learning_rate": 0.0002, "loss": 0.5661, "mean_token_accuracy": 0.7718956470489502, "num_tokens": 16900341.0, "step": 4667 }, { "entropy": 0.5406680107116699, "epoch": 4.354643023798413, "grad_norm": 0.3597382605075836, "learning_rate": 0.0002, "loss": 0.5456, "mean_token_accuracy": 0.7783167362213135, "num_tokens": 16903945.0, "step": 4668 }, { "entropy": 0.5669621527194977, "epoch": 4.355576294913672, "grad_norm": 0.44480857253074646, "learning_rate": 0.0002, "loss": 0.5836, "mean_token_accuracy": 0.7648208141326904, "num_tokens": 16907533.0, "step": 4669 }, { "entropy": 0.5446468740701675, "epoch": 4.356509566028931, "grad_norm": 0.4316929876804352, "learning_rate": 0.0002, "loss": 0.5617, "mean_token_accuracy": 0.7716309577226639, "num_tokens": 16911198.0, "step": 4670 }, { "entropy": 0.5801779627799988, "epoch": 4.35744283714419, "grad_norm": 0.40730419754981995, "learning_rate": 0.0002, "loss": 0.5844, "mean_token_accuracy": 0.764004647731781, "num_tokens": 16914923.0, "step": 4671 }, { "entropy": 0.5196635872125626, "epoch": 4.358376108259449, "grad_norm": 0.41802069544792175, "learning_rate": 0.0002, "loss": 0.5154, "mean_token_accuracy": 0.7915468811988831, "num_tokens": 16918594.0, "step": 4672 }, { "entropy": 0.570673942565918, "epoch": 4.359309379374708, "grad_norm": 0.42902857065200806, "learning_rate": 0.0002, "loss": 0.5731, "mean_token_accuracy": 0.7633825838565826, "num_tokens": 16922169.0, "step": 4673 }, { "entropy": 0.5570426136255264, "epoch": 4.360242650489967, "grad_norm": 0.41017329692840576, "learning_rate": 0.0002, "loss": 0.5708, "mean_token_accuracy": 0.7732487767934799, "num_tokens": 16925678.0, "step": 4674 }, { "entropy": 0.5593993365764618, "epoch": 4.361175921605226, "grad_norm": 0.33803436160087585, "learning_rate": 0.0002, "loss": 0.5561, "mean_token_accuracy": 0.777127280831337, "num_tokens": 16929231.0, "step": 4675 }, { "entropy": 0.5817646682262421, "epoch": 4.362109192720485, "grad_norm": 0.3414442837238312, "learning_rate": 0.0002, "loss": 0.5648, "mean_token_accuracy": 0.7754772305488586, "num_tokens": 16932929.0, "step": 4676 }, { "entropy": 0.6373453140258789, "epoch": 4.363042463835744, "grad_norm": 0.4282139837741852, "learning_rate": 0.0002, "loss": 0.628, "mean_token_accuracy": 0.7446738630533218, "num_tokens": 16936654.0, "step": 4677 }, { "entropy": 0.5204852893948555, "epoch": 4.363975734951003, "grad_norm": 0.4299212694168091, "learning_rate": 0.0002, "loss": 0.5386, "mean_token_accuracy": 0.7842329740524292, "num_tokens": 16940171.0, "step": 4678 }, { "entropy": 0.5479604303836823, "epoch": 4.364909006066262, "grad_norm": 0.4205645024776459, "learning_rate": 0.0002, "loss": 0.5609, "mean_token_accuracy": 0.7668548077344894, "num_tokens": 16943867.0, "step": 4679 }, { "entropy": 0.5618674904108047, "epoch": 4.365842277181521, "grad_norm": 0.3679405748844147, "learning_rate": 0.0002, "loss": 0.5615, "mean_token_accuracy": 0.7702279090881348, "num_tokens": 16947437.0, "step": 4680 }, { "entropy": 0.5528646931052208, "epoch": 4.36677554829678, "grad_norm": 0.3780288100242615, "learning_rate": 0.0002, "loss": 0.5574, "mean_token_accuracy": 0.7722366154193878, "num_tokens": 16951050.0, "step": 4681 }, { "entropy": 0.5798207372426987, "epoch": 4.367708819412039, "grad_norm": 0.48529183864593506, "learning_rate": 0.0002, "loss": 0.5921, "mean_token_accuracy": 0.7614002674818039, "num_tokens": 16954786.0, "step": 4682 }, { "entropy": 0.6153538227081299, "epoch": 4.368642090527298, "grad_norm": 0.42850664258003235, "learning_rate": 0.0002, "loss": 0.6137, "mean_token_accuracy": 0.7522264420986176, "num_tokens": 16958424.0, "step": 4683 }, { "entropy": 0.5413488447666168, "epoch": 4.369575361642557, "grad_norm": 0.3412569761276245, "learning_rate": 0.0002, "loss": 0.5333, "mean_token_accuracy": 0.7879227846860886, "num_tokens": 16962030.0, "step": 4684 }, { "entropy": 0.5480045676231384, "epoch": 4.370508632757816, "grad_norm": 0.3573145270347595, "learning_rate": 0.0002, "loss": 0.5496, "mean_token_accuracy": 0.7741654962301254, "num_tokens": 16965535.0, "step": 4685 }, { "entropy": 0.5629580393433571, "epoch": 4.371441903873075, "grad_norm": 0.40035730600357056, "learning_rate": 0.0002, "loss": 0.565, "mean_token_accuracy": 0.7699762880802155, "num_tokens": 16969234.0, "step": 4686 }, { "entropy": 0.5804581046104431, "epoch": 4.372375174988334, "grad_norm": 0.3520801365375519, "learning_rate": 0.0002, "loss": 0.578, "mean_token_accuracy": 0.7620745599269867, "num_tokens": 16972843.0, "step": 4687 }, { "entropy": 0.576716274023056, "epoch": 4.373308446103593, "grad_norm": 0.430040568113327, "learning_rate": 0.0002, "loss": 0.5871, "mean_token_accuracy": 0.7671136260032654, "num_tokens": 16976589.0, "step": 4688 }, { "entropy": 0.5482111722230911, "epoch": 4.374241717218852, "grad_norm": 0.3678080439567566, "learning_rate": 0.0002, "loss": 0.5622, "mean_token_accuracy": 0.7725704312324524, "num_tokens": 16980243.0, "step": 4689 }, { "entropy": 0.5481177270412445, "epoch": 4.375174988334111, "grad_norm": 0.40206068754196167, "learning_rate": 0.0002, "loss": 0.5673, "mean_token_accuracy": 0.77241550385952, "num_tokens": 16983864.0, "step": 4690 }, { "entropy": 0.5537257567048073, "epoch": 4.37610825944937, "grad_norm": 0.37989363074302673, "learning_rate": 0.0002, "loss": 0.5555, "mean_token_accuracy": 0.7785429358482361, "num_tokens": 16987528.0, "step": 4691 }, { "entropy": 0.5646013915538788, "epoch": 4.377041530564629, "grad_norm": 0.38210639357566833, "learning_rate": 0.0002, "loss": 0.5728, "mean_token_accuracy": 0.7690902948379517, "num_tokens": 16991107.0, "step": 4692 }, { "entropy": 0.5668746307492256, "epoch": 4.377974801679888, "grad_norm": 0.39597296714782715, "learning_rate": 0.0002, "loss": 0.5697, "mean_token_accuracy": 0.7697228789329529, "num_tokens": 16994690.0, "step": 4693 }, { "entropy": 0.5161522552371025, "epoch": 4.378908072795147, "grad_norm": 0.3820546865463257, "learning_rate": 0.0002, "loss": 0.5094, "mean_token_accuracy": 0.7930627465248108, "num_tokens": 16998293.0, "step": 4694 }, { "entropy": 0.5786686390638351, "epoch": 4.379841343910406, "grad_norm": 0.3618684411048889, "learning_rate": 0.0002, "loss": 0.5812, "mean_token_accuracy": 0.7697398513555527, "num_tokens": 17001754.0, "step": 4695 }, { "entropy": 0.5949938148260117, "epoch": 4.380774615025665, "grad_norm": 0.33606553077697754, "learning_rate": 0.0002, "loss": 0.5868, "mean_token_accuracy": 0.7581522911787033, "num_tokens": 17005328.0, "step": 4696 }, { "entropy": 0.5840330123901367, "epoch": 4.381707886140924, "grad_norm": 0.3472137749195099, "learning_rate": 0.0002, "loss": 0.591, "mean_token_accuracy": 0.7566090077161789, "num_tokens": 17008975.0, "step": 4697 }, { "entropy": 0.5472902655601501, "epoch": 4.382641157256183, "grad_norm": 0.39333656430244446, "learning_rate": 0.0002, "loss": 0.5601, "mean_token_accuracy": 0.7732668519020081, "num_tokens": 17012673.0, "step": 4698 }, { "entropy": 0.5264808088541031, "epoch": 4.383574428371442, "grad_norm": 0.3094315826892853, "learning_rate": 0.0002, "loss": 0.5199, "mean_token_accuracy": 0.7909749746322632, "num_tokens": 17016293.0, "step": 4699 }, { "entropy": 0.5586999654769897, "epoch": 4.384507699486701, "grad_norm": 0.3823093771934509, "learning_rate": 0.0002, "loss": 0.5658, "mean_token_accuracy": 0.7782613784074783, "num_tokens": 17019944.0, "step": 4700 }, { "entropy": 0.5043866410851479, "epoch": 4.38544097060196, "grad_norm": 0.3680046796798706, "learning_rate": 0.0002, "loss": 0.5056, "mean_token_accuracy": 0.7980969548225403, "num_tokens": 17023414.0, "step": 4701 }, { "entropy": 0.5600251108407974, "epoch": 4.386374241717219, "grad_norm": 0.34734857082366943, "learning_rate": 0.0002, "loss": 0.5629, "mean_token_accuracy": 0.7755666524171829, "num_tokens": 17027089.0, "step": 4702 }, { "entropy": 0.5640757530927658, "epoch": 4.387307512832478, "grad_norm": 0.46661660075187683, "learning_rate": 0.0002, "loss": 0.5787, "mean_token_accuracy": 0.761077493429184, "num_tokens": 17030649.0, "step": 4703 }, { "entropy": 0.5852279216051102, "epoch": 4.388240783947737, "grad_norm": 0.4577287435531616, "learning_rate": 0.0002, "loss": 0.585, "mean_token_accuracy": 0.7650814205408096, "num_tokens": 17034277.0, "step": 4704 }, { "entropy": 0.544544592499733, "epoch": 4.389174055062996, "grad_norm": 0.38205909729003906, "learning_rate": 0.0002, "loss": 0.544, "mean_token_accuracy": 0.7728643119335175, "num_tokens": 17037960.0, "step": 4705 }, { "entropy": 0.5902948081493378, "epoch": 4.390107326178255, "grad_norm": 0.39844128489494324, "learning_rate": 0.0002, "loss": 0.5915, "mean_token_accuracy": 0.7707569152116776, "num_tokens": 17041602.0, "step": 4706 }, { "entropy": 0.5413063615560532, "epoch": 4.391040597293514, "grad_norm": 0.4116442799568176, "learning_rate": 0.0002, "loss": 0.5463, "mean_token_accuracy": 0.7829220294952393, "num_tokens": 17045137.0, "step": 4707 }, { "entropy": 0.5817439407110214, "epoch": 4.391973868408773, "grad_norm": 0.370347797870636, "learning_rate": 0.0002, "loss": 0.5805, "mean_token_accuracy": 0.7653124630451202, "num_tokens": 17048839.0, "step": 4708 }, { "entropy": 0.529910683631897, "epoch": 4.392907139524032, "grad_norm": 0.3894893229007721, "learning_rate": 0.0002, "loss": 0.5363, "mean_token_accuracy": 0.7870247066020966, "num_tokens": 17052426.0, "step": 4709 }, { "entropy": 0.5332825779914856, "epoch": 4.3938404106392905, "grad_norm": 0.39375174045562744, "learning_rate": 0.0002, "loss": 0.5413, "mean_token_accuracy": 0.7860166430473328, "num_tokens": 17056054.0, "step": 4710 }, { "entropy": 0.5480093955993652, "epoch": 4.3947736817545495, "grad_norm": 0.39534926414489746, "learning_rate": 0.0002, "loss": 0.5548, "mean_token_accuracy": 0.7780823111534119, "num_tokens": 17059617.0, "step": 4711 }, { "entropy": 0.5443964675068855, "epoch": 4.3957069528698085, "grad_norm": 0.41443565487861633, "learning_rate": 0.0002, "loss": 0.5679, "mean_token_accuracy": 0.7746897041797638, "num_tokens": 17063224.0, "step": 4712 }, { "entropy": 0.565073624253273, "epoch": 4.3966402239850675, "grad_norm": 0.35442501306533813, "learning_rate": 0.0002, "loss": 0.5596, "mean_token_accuracy": 0.7714420109987259, "num_tokens": 17066789.0, "step": 4713 }, { "entropy": 0.5696223378181458, "epoch": 4.3975734951003265, "grad_norm": 0.44112658500671387, "learning_rate": 0.0002, "loss": 0.5792, "mean_token_accuracy": 0.7707536965608597, "num_tokens": 17070459.0, "step": 4714 }, { "entropy": 0.562053769826889, "epoch": 4.3985067662155855, "grad_norm": 0.3856194317340851, "learning_rate": 0.0002, "loss": 0.5684, "mean_token_accuracy": 0.7672448009252548, "num_tokens": 17074041.0, "step": 4715 }, { "entropy": 0.5814986079931259, "epoch": 4.399440037330844, "grad_norm": 0.37185341119766235, "learning_rate": 0.0002, "loss": 0.5752, "mean_token_accuracy": 0.76861871778965, "num_tokens": 17077713.0, "step": 4716 }, { "entropy": 0.5446520000696182, "epoch": 4.400373308446103, "grad_norm": 0.37687408924102783, "learning_rate": 0.0002, "loss": 0.5473, "mean_token_accuracy": 0.7846891433000565, "num_tokens": 17081318.0, "step": 4717 }, { "entropy": 0.5465152636170387, "epoch": 4.401306579561362, "grad_norm": 0.39660927653312683, "learning_rate": 0.0002, "loss": 0.549, "mean_token_accuracy": 0.7747049480676651, "num_tokens": 17084861.0, "step": 4718 }, { "entropy": 0.5327008217573166, "epoch": 4.402239850676621, "grad_norm": 0.3510056734085083, "learning_rate": 0.0002, "loss": 0.5385, "mean_token_accuracy": 0.7850252985954285, "num_tokens": 17088515.0, "step": 4719 }, { "entropy": 0.5833509266376495, "epoch": 4.40317312179188, "grad_norm": 0.41949155926704407, "learning_rate": 0.0002, "loss": 0.5899, "mean_token_accuracy": 0.762780487537384, "num_tokens": 17092130.0, "step": 4720 }, { "entropy": 0.6068794131278992, "epoch": 4.404106392907139, "grad_norm": 0.43204405903816223, "learning_rate": 0.0002, "loss": 0.6111, "mean_token_accuracy": 0.7542572170495987, "num_tokens": 17095809.0, "step": 4721 }, { "entropy": 0.5666741728782654, "epoch": 4.405039664022398, "grad_norm": 0.4077167212963104, "learning_rate": 0.0002, "loss": 0.5722, "mean_token_accuracy": 0.7663974165916443, "num_tokens": 17099319.0, "step": 4722 }, { "entropy": 0.5684815496206284, "epoch": 4.405972935137657, "grad_norm": 0.5228050947189331, "learning_rate": 0.0002, "loss": 0.5848, "mean_token_accuracy": 0.7655448615550995, "num_tokens": 17102927.0, "step": 4723 }, { "entropy": 0.6018561571836472, "epoch": 4.406906206252916, "grad_norm": 0.45333603024482727, "learning_rate": 0.0002, "loss": 0.613, "mean_token_accuracy": 0.7541264295578003, "num_tokens": 17106536.0, "step": 4724 }, { "entropy": 0.5414431616663933, "epoch": 4.407839477368175, "grad_norm": 0.4065191447734833, "learning_rate": 0.0002, "loss": 0.5525, "mean_token_accuracy": 0.7775253504514694, "num_tokens": 17110109.0, "step": 4725 }, { "entropy": 0.5834645479917526, "epoch": 4.408772748483434, "grad_norm": 0.42941004037857056, "learning_rate": 0.0002, "loss": 0.5875, "mean_token_accuracy": 0.7633649557828903, "num_tokens": 17113818.0, "step": 4726 }, { "entropy": 0.5392599105834961, "epoch": 4.409706019598693, "grad_norm": 0.3844338059425354, "learning_rate": 0.0002, "loss": 0.5404, "mean_token_accuracy": 0.7806790918111801, "num_tokens": 17117423.0, "step": 4727 }, { "entropy": 0.5517618954181671, "epoch": 4.410639290713952, "grad_norm": 0.39306989312171936, "learning_rate": 0.0002, "loss": 0.5591, "mean_token_accuracy": 0.7813950031995773, "num_tokens": 17121026.0, "step": 4728 }, { "entropy": 0.5410718098282814, "epoch": 4.411572561829211, "grad_norm": 0.3800431191921234, "learning_rate": 0.0002, "loss": 0.5419, "mean_token_accuracy": 0.7862123101949692, "num_tokens": 17124573.0, "step": 4729 }, { "entropy": 0.5167459696531296, "epoch": 4.41250583294447, "grad_norm": 0.4437946379184723, "learning_rate": 0.0002, "loss": 0.5265, "mean_token_accuracy": 0.7933396995067596, "num_tokens": 17128083.0, "step": 4730 }, { "entropy": 0.566743478178978, "epoch": 4.413439104059729, "grad_norm": 0.39415067434310913, "learning_rate": 0.0002, "loss": 0.5707, "mean_token_accuracy": 0.7686133533716202, "num_tokens": 17131744.0, "step": 4731 }, { "entropy": 0.5692651867866516, "epoch": 4.414372375174988, "grad_norm": 0.39205077290534973, "learning_rate": 0.0002, "loss": 0.5659, "mean_token_accuracy": 0.7723159790039062, "num_tokens": 17135259.0, "step": 4732 }, { "entropy": 0.5790980756282806, "epoch": 4.415305646290247, "grad_norm": 0.3896775245666504, "learning_rate": 0.0002, "loss": 0.5764, "mean_token_accuracy": 0.7636936455965042, "num_tokens": 17138925.0, "step": 4733 }, { "entropy": 0.5655860006809235, "epoch": 4.416238917405506, "grad_norm": 0.3676462769508362, "learning_rate": 0.0002, "loss": 0.5726, "mean_token_accuracy": 0.7657071650028229, "num_tokens": 17142549.0, "step": 4734 }, { "entropy": 0.5615455359220505, "epoch": 4.417172188520765, "grad_norm": 0.34674471616744995, "learning_rate": 0.0002, "loss": 0.5569, "mean_token_accuracy": 0.7817856222391129, "num_tokens": 17146090.0, "step": 4735 }, { "entropy": 0.571957066655159, "epoch": 4.418105459636024, "grad_norm": 0.373026579618454, "learning_rate": 0.0002, "loss": 0.5776, "mean_token_accuracy": 0.7684651166200638, "num_tokens": 17149592.0, "step": 4736 }, { "entropy": 0.5543761104345322, "epoch": 4.419038730751283, "grad_norm": 0.3503012955188751, "learning_rate": 0.0002, "loss": 0.5582, "mean_token_accuracy": 0.7750866711139679, "num_tokens": 17153154.0, "step": 4737 }, { "entropy": 0.5386656820774078, "epoch": 4.419972001866542, "grad_norm": 0.37612006068229675, "learning_rate": 0.0002, "loss": 0.5409, "mean_token_accuracy": 0.7849409431219101, "num_tokens": 17156832.0, "step": 4738 }, { "entropy": 0.5248478353023529, "epoch": 4.420905272981801, "grad_norm": 0.3810533285140991, "learning_rate": 0.0002, "loss": 0.5267, "mean_token_accuracy": 0.7835360318422318, "num_tokens": 17160447.0, "step": 4739 }, { "entropy": 0.5635346472263336, "epoch": 4.42183854409706, "grad_norm": 0.41568025946617126, "learning_rate": 0.0002, "loss": 0.579, "mean_token_accuracy": 0.7664895355701447, "num_tokens": 17164132.0, "step": 4740 }, { "entropy": 0.5779878944158554, "epoch": 4.422771815212319, "grad_norm": 0.4372031092643738, "learning_rate": 0.0002, "loss": 0.5926, "mean_token_accuracy": 0.7634967267513275, "num_tokens": 17167761.0, "step": 4741 }, { "entropy": 0.5693068206310272, "epoch": 4.423705086327578, "grad_norm": 0.39451080560684204, "learning_rate": 0.0002, "loss": 0.562, "mean_token_accuracy": 0.7770075350999832, "num_tokens": 17171331.0, "step": 4742 }, { "entropy": 0.6011217385530472, "epoch": 4.424638357442837, "grad_norm": 0.39399054646492004, "learning_rate": 0.0002, "loss": 0.6017, "mean_token_accuracy": 0.7611152678728104, "num_tokens": 17175023.0, "step": 4743 }, { "entropy": 0.5665426850318909, "epoch": 4.425571628558096, "grad_norm": 0.3595229685306549, "learning_rate": 0.0002, "loss": 0.5558, "mean_token_accuracy": 0.7770230919122696, "num_tokens": 17178693.0, "step": 4744 }, { "entropy": 0.5506669655442238, "epoch": 4.426504899673355, "grad_norm": 0.38675457239151, "learning_rate": 0.0002, "loss": 0.5429, "mean_token_accuracy": 0.7866318821907043, "num_tokens": 17182422.0, "step": 4745 }, { "entropy": 0.5674890875816345, "epoch": 4.427438170788614, "grad_norm": 0.2821574807167053, "learning_rate": 0.0002, "loss": 0.5443, "mean_token_accuracy": 0.7819769829511642, "num_tokens": 17186170.0, "step": 4746 }, { "entropy": 0.524126723408699, "epoch": 4.428371441903873, "grad_norm": 0.37574583292007446, "learning_rate": 0.0002, "loss": 0.5303, "mean_token_accuracy": 0.7928348332643509, "num_tokens": 17189703.0, "step": 4747 }, { "entropy": 0.5462774336338043, "epoch": 4.429304713019132, "grad_norm": 0.34517166018486023, "learning_rate": 0.0002, "loss": 0.555, "mean_token_accuracy": 0.7785056829452515, "num_tokens": 17193341.0, "step": 4748 }, { "entropy": 0.5719588547945023, "epoch": 4.430237984134391, "grad_norm": 0.35609951615333557, "learning_rate": 0.0002, "loss": 0.5702, "mean_token_accuracy": 0.7713380604982376, "num_tokens": 17197111.0, "step": 4749 }, { "entropy": 0.5561958253383636, "epoch": 4.43117125524965, "grad_norm": 0.36054325103759766, "learning_rate": 0.0002, "loss": 0.5477, "mean_token_accuracy": 0.7800996154546738, "num_tokens": 17200766.0, "step": 4750 }, { "entropy": 0.5539978891611099, "epoch": 4.432104526364909, "grad_norm": 0.40670061111450195, "learning_rate": 0.0002, "loss": 0.5684, "mean_token_accuracy": 0.7761141061782837, "num_tokens": 17204402.0, "step": 4751 }, { "entropy": 0.5567348152399063, "epoch": 4.433037797480168, "grad_norm": 0.46007099747657776, "learning_rate": 0.0002, "loss": 0.572, "mean_token_accuracy": 0.7713145613670349, "num_tokens": 17207971.0, "step": 4752 }, { "entropy": 0.5678975880146027, "epoch": 4.433971068595427, "grad_norm": 0.41920483112335205, "learning_rate": 0.0002, "loss": 0.5866, "mean_token_accuracy": 0.7565597891807556, "num_tokens": 17211584.0, "step": 4753 }, { "entropy": 0.5552102774381638, "epoch": 4.434904339710686, "grad_norm": 0.47984176874160767, "learning_rate": 0.0002, "loss": 0.5657, "mean_token_accuracy": 0.7746181190013885, "num_tokens": 17215265.0, "step": 4754 }, { "entropy": 0.5659389793872833, "epoch": 4.435837610825945, "grad_norm": 0.35407039523124695, "learning_rate": 0.0002, "loss": 0.5707, "mean_token_accuracy": 0.7690668702125549, "num_tokens": 17218905.0, "step": 4755 }, { "entropy": 0.544752761721611, "epoch": 4.436770881941204, "grad_norm": 0.43802326917648315, "learning_rate": 0.0002, "loss": 0.5548, "mean_token_accuracy": 0.7783986032009125, "num_tokens": 17222464.0, "step": 4756 }, { "entropy": 0.557177945971489, "epoch": 4.437704153056463, "grad_norm": 0.3929416239261627, "learning_rate": 0.0002, "loss": 0.5513, "mean_token_accuracy": 0.7801963537931442, "num_tokens": 17226124.0, "step": 4757 }, { "entropy": 0.5571690499782562, "epoch": 4.438637424171722, "grad_norm": 0.37667614221572876, "learning_rate": 0.0002, "loss": 0.554, "mean_token_accuracy": 0.7830390483140945, "num_tokens": 17229885.0, "step": 4758 }, { "entropy": 0.5658641755580902, "epoch": 4.439570695286981, "grad_norm": 0.4249729812145233, "learning_rate": 0.0002, "loss": 0.5673, "mean_token_accuracy": 0.7712273001670837, "num_tokens": 17233534.0, "step": 4759 }, { "entropy": 0.5664212554693222, "epoch": 4.44050396640224, "grad_norm": 0.38295233249664307, "learning_rate": 0.0002, "loss": 0.5618, "mean_token_accuracy": 0.7733796238899231, "num_tokens": 17237181.0, "step": 4760 }, { "entropy": 0.5437911450862885, "epoch": 4.441437237517499, "grad_norm": 0.32384541630744934, "learning_rate": 0.0002, "loss": 0.5495, "mean_token_accuracy": 0.7790758609771729, "num_tokens": 17240719.0, "step": 4761 }, { "entropy": 0.5396060198545456, "epoch": 4.442370508632758, "grad_norm": 0.3643885850906372, "learning_rate": 0.0002, "loss": 0.5388, "mean_token_accuracy": 0.7857991605997086, "num_tokens": 17244304.0, "step": 4762 }, { "entropy": 0.5442587286233902, "epoch": 4.443303779748017, "grad_norm": 0.3535192608833313, "learning_rate": 0.0002, "loss": 0.5534, "mean_token_accuracy": 0.7763226926326752, "num_tokens": 17247744.0, "step": 4763 }, { "entropy": 0.5689135193824768, "epoch": 4.444237050863276, "grad_norm": 0.31683871150016785, "learning_rate": 0.0002, "loss": 0.5608, "mean_token_accuracy": 0.7704321146011353, "num_tokens": 17251387.0, "step": 4764 }, { "entropy": 0.5592212378978729, "epoch": 4.445170321978535, "grad_norm": 0.3564198911190033, "learning_rate": 0.0002, "loss": 0.5625, "mean_token_accuracy": 0.7680592387914658, "num_tokens": 17255097.0, "step": 4765 }, { "entropy": 0.5578014999628067, "epoch": 4.4461035930937935, "grad_norm": 0.3772795498371124, "learning_rate": 0.0002, "loss": 0.5642, "mean_token_accuracy": 0.7702446728944778, "num_tokens": 17258678.0, "step": 4766 }, { "entropy": 0.5351444482803345, "epoch": 4.4470368642090525, "grad_norm": 0.38088786602020264, "learning_rate": 0.0002, "loss": 0.5268, "mean_token_accuracy": 0.7858691066503525, "num_tokens": 17262331.0, "step": 4767 }, { "entropy": 0.57601198554039, "epoch": 4.4479701353243115, "grad_norm": 0.4222445785999298, "learning_rate": 0.0002, "loss": 0.584, "mean_token_accuracy": 0.7625647634267807, "num_tokens": 17265924.0, "step": 4768 }, { "entropy": 0.5852011442184448, "epoch": 4.4489034064395705, "grad_norm": 0.476326584815979, "learning_rate": 0.0002, "loss": 0.6029, "mean_token_accuracy": 0.7566677927970886, "num_tokens": 17269564.0, "step": 4769 }, { "entropy": 0.5795451402664185, "epoch": 4.4498366775548295, "grad_norm": 0.4264727234840393, "learning_rate": 0.0002, "loss": 0.5818, "mean_token_accuracy": 0.7638415992259979, "num_tokens": 17273159.0, "step": 4770 }, { "entropy": 0.5229138135910034, "epoch": 4.4507699486700885, "grad_norm": 0.4038301110267639, "learning_rate": 0.0002, "loss": 0.5345, "mean_token_accuracy": 0.7853285074234009, "num_tokens": 17276812.0, "step": 4771 }, { "entropy": 0.5642301142215729, "epoch": 4.451703219785347, "grad_norm": 0.36728376150131226, "learning_rate": 0.0002, "loss": 0.5636, "mean_token_accuracy": 0.7734222561120987, "num_tokens": 17280380.0, "step": 4772 }, { "entropy": 0.5472255498170853, "epoch": 4.452636490900606, "grad_norm": 0.4221397936344147, "learning_rate": 0.0002, "loss": 0.5481, "mean_token_accuracy": 0.7785835862159729, "num_tokens": 17283920.0, "step": 4773 }, { "entropy": 0.5769768357276917, "epoch": 4.453569762015865, "grad_norm": 0.3671717643737793, "learning_rate": 0.0002, "loss": 0.5715, "mean_token_accuracy": 0.7729549407958984, "num_tokens": 17287706.0, "step": 4774 }, { "entropy": 0.5820193141698837, "epoch": 4.454503033131124, "grad_norm": 0.3763672113418579, "learning_rate": 0.0002, "loss": 0.5916, "mean_token_accuracy": 0.7625409066677094, "num_tokens": 17291288.0, "step": 4775 }, { "entropy": 0.560752734541893, "epoch": 4.455436304246383, "grad_norm": 0.43022283911705017, "learning_rate": 0.0002, "loss": 0.5645, "mean_token_accuracy": 0.7711473703384399, "num_tokens": 17294870.0, "step": 4776 }, { "entropy": 0.5542318820953369, "epoch": 4.456369575361642, "grad_norm": 0.38864725828170776, "learning_rate": 0.0002, "loss": 0.5542, "mean_token_accuracy": 0.7787680923938751, "num_tokens": 17298602.0, "step": 4777 }, { "entropy": 0.5836614370346069, "epoch": 4.457302846476901, "grad_norm": 0.39797669649124146, "learning_rate": 0.0002, "loss": 0.5827, "mean_token_accuracy": 0.766489252448082, "num_tokens": 17302238.0, "step": 4778 }, { "entropy": 0.5814106166362762, "epoch": 4.45823611759216, "grad_norm": 0.38516944646835327, "learning_rate": 0.0002, "loss": 0.5801, "mean_token_accuracy": 0.763225331902504, "num_tokens": 17305841.0, "step": 4779 }, { "entropy": 0.5619276165962219, "epoch": 4.459169388707419, "grad_norm": 0.40797755122184753, "learning_rate": 0.0002, "loss": 0.57, "mean_token_accuracy": 0.7679566442966461, "num_tokens": 17309394.0, "step": 4780 }, { "entropy": 0.5603714287281036, "epoch": 4.460102659822678, "grad_norm": 0.5262644290924072, "learning_rate": 0.0002, "loss": 0.5659, "mean_token_accuracy": 0.7732515633106232, "num_tokens": 17312894.0, "step": 4781 }, { "entropy": 0.5552294701337814, "epoch": 4.461035930937937, "grad_norm": 0.3810431659221649, "learning_rate": 0.0002, "loss": 0.5517, "mean_token_accuracy": 0.7761777937412262, "num_tokens": 17316401.0, "step": 4782 }, { "entropy": 0.5594064742326736, "epoch": 4.461969202053196, "grad_norm": 0.3885859549045563, "learning_rate": 0.0002, "loss": 0.5646, "mean_token_accuracy": 0.7699560225009918, "num_tokens": 17320132.0, "step": 4783 }, { "entropy": 0.5725715905427933, "epoch": 4.462902473168455, "grad_norm": 0.4000626504421234, "learning_rate": 0.0002, "loss": 0.5844, "mean_token_accuracy": 0.7715978920459747, "num_tokens": 17323671.0, "step": 4784 }, { "entropy": 0.5578971654176712, "epoch": 4.463835744283714, "grad_norm": 0.4019957482814789, "learning_rate": 0.0002, "loss": 0.5594, "mean_token_accuracy": 0.778071254491806, "num_tokens": 17327303.0, "step": 4785 }, { "entropy": 0.5803490430116653, "epoch": 4.464769015398973, "grad_norm": 0.38214948773384094, "learning_rate": 0.0002, "loss": 0.5798, "mean_token_accuracy": 0.7640321254730225, "num_tokens": 17330932.0, "step": 4786 }, { "entropy": 0.5363295376300812, "epoch": 4.465702286514232, "grad_norm": 0.3645095229148865, "learning_rate": 0.0002, "loss": 0.5355, "mean_token_accuracy": 0.7861178368330002, "num_tokens": 17334458.0, "step": 4787 }, { "entropy": 0.5919544696807861, "epoch": 4.466635557629491, "grad_norm": 0.3219243586063385, "learning_rate": 0.0002, "loss": 0.5724, "mean_token_accuracy": 0.7709100246429443, "num_tokens": 17338221.0, "step": 4788 }, { "entropy": 0.5516549497842789, "epoch": 4.46756882874475, "grad_norm": 0.4472642242908478, "learning_rate": 0.0002, "loss": 0.5574, "mean_token_accuracy": 0.7724955230951309, "num_tokens": 17341808.0, "step": 4789 }, { "entropy": 0.5710238814353943, "epoch": 4.468502099860009, "grad_norm": 0.35947275161743164, "learning_rate": 0.0002, "loss": 0.5706, "mean_token_accuracy": 0.7757185399532318, "num_tokens": 17345406.0, "step": 4790 }, { "entropy": 0.5397614315152168, "epoch": 4.469435370975268, "grad_norm": 0.4095395803451538, "learning_rate": 0.0002, "loss": 0.5487, "mean_token_accuracy": 0.7795400470495224, "num_tokens": 17349032.0, "step": 4791 }, { "entropy": 0.5808098763227463, "epoch": 4.470368642090527, "grad_norm": 0.47623154520988464, "learning_rate": 0.0002, "loss": 0.586, "mean_token_accuracy": 0.7584972679615021, "num_tokens": 17352781.0, "step": 4792 }, { "entropy": 0.5744175314903259, "epoch": 4.471301913205786, "grad_norm": 0.41202595829963684, "learning_rate": 0.0002, "loss": 0.5856, "mean_token_accuracy": 0.7681480348110199, "num_tokens": 17356484.0, "step": 4793 }, { "entropy": 0.5667805820703506, "epoch": 4.472235184321045, "grad_norm": 0.4242592751979828, "learning_rate": 0.0002, "loss": 0.5777, "mean_token_accuracy": 0.7673287093639374, "num_tokens": 17360206.0, "step": 4794 }, { "entropy": 0.5925455838441849, "epoch": 4.473168455436304, "grad_norm": 0.4820718765258789, "learning_rate": 0.0002, "loss": 0.602, "mean_token_accuracy": 0.7588153034448624, "num_tokens": 17363906.0, "step": 4795 }, { "entropy": 0.5203521698713303, "epoch": 4.474101726551563, "grad_norm": 0.4917142689228058, "learning_rate": 0.0002, "loss": 0.5345, "mean_token_accuracy": 0.7820800244808197, "num_tokens": 17367477.0, "step": 4796 }, { "entropy": 0.5403307229280472, "epoch": 4.475034997666822, "grad_norm": 0.3736501932144165, "learning_rate": 0.0002, "loss": 0.546, "mean_token_accuracy": 0.7741403877735138, "num_tokens": 17371064.0, "step": 4797 }, { "entropy": 0.6148831695318222, "epoch": 4.475968268782081, "grad_norm": 0.4697050452232361, "learning_rate": 0.0002, "loss": 0.6151, "mean_token_accuracy": 0.7504268139600754, "num_tokens": 17374702.0, "step": 4798 }, { "entropy": 0.6061358898878098, "epoch": 4.47690153989734, "grad_norm": 0.35866495966911316, "learning_rate": 0.0002, "loss": 0.597, "mean_token_accuracy": 0.7609979063272476, "num_tokens": 17378422.0, "step": 4799 }, { "entropy": 0.5684889554977417, "epoch": 4.477834811012599, "grad_norm": 0.3152361512184143, "learning_rate": 0.0002, "loss": 0.5632, "mean_token_accuracy": 0.7712717056274414, "num_tokens": 17382030.0, "step": 4800 }, { "entropy": 0.5730877369642258, "epoch": 4.478768082127858, "grad_norm": 0.42027273774147034, "learning_rate": 0.0002, "loss": 0.5782, "mean_token_accuracy": 0.7655376493930817, "num_tokens": 17385520.0, "step": 4801 }, { "entropy": 0.5168729797005653, "epoch": 4.479701353243117, "grad_norm": 0.4272652566432953, "learning_rate": 0.0002, "loss": 0.5205, "mean_token_accuracy": 0.7895040363073349, "num_tokens": 17389157.0, "step": 4802 }, { "entropy": 0.5923026353120804, "epoch": 4.480634624358376, "grad_norm": 0.3977101445198059, "learning_rate": 0.0002, "loss": 0.6, "mean_token_accuracy": 0.7631928473711014, "num_tokens": 17392843.0, "step": 4803 }, { "entropy": 0.56991246342659, "epoch": 4.481567895473635, "grad_norm": 0.3652901351451874, "learning_rate": 0.0002, "loss": 0.5617, "mean_token_accuracy": 0.7730635851621628, "num_tokens": 17396621.0, "step": 4804 }, { "entropy": 0.5299651101231575, "epoch": 4.482501166588894, "grad_norm": 0.41294997930526733, "learning_rate": 0.0002, "loss": 0.5312, "mean_token_accuracy": 0.7872717827558517, "num_tokens": 17400045.0, "step": 4805 }, { "entropy": 0.566728912293911, "epoch": 4.483434437704153, "grad_norm": 0.39931243658065796, "learning_rate": 0.0002, "loss": 0.5771, "mean_token_accuracy": 0.7719633728265762, "num_tokens": 17403712.0, "step": 4806 }, { "entropy": 0.5635720789432526, "epoch": 4.484367708819412, "grad_norm": 0.40274181962013245, "learning_rate": 0.0002, "loss": 0.5679, "mean_token_accuracy": 0.7661291211843491, "num_tokens": 17407444.0, "step": 4807 }, { "entropy": 0.5834224820137024, "epoch": 4.485300979934671, "grad_norm": 0.4449334740638733, "learning_rate": 0.0002, "loss": 0.5967, "mean_token_accuracy": 0.7601579129695892, "num_tokens": 17411091.0, "step": 4808 }, { "entropy": 0.5575105398893356, "epoch": 4.48623425104993, "grad_norm": 0.41816002130508423, "learning_rate": 0.0002, "loss": 0.5748, "mean_token_accuracy": 0.7649667859077454, "num_tokens": 17414698.0, "step": 4809 }, { "entropy": 0.5623730272054672, "epoch": 4.487167522165189, "grad_norm": 0.38647937774658203, "learning_rate": 0.0002, "loss": 0.5712, "mean_token_accuracy": 0.7664888352155685, "num_tokens": 17418340.0, "step": 4810 }, { "entropy": 0.6007379442453384, "epoch": 4.488100793280448, "grad_norm": 0.3353053331375122, "learning_rate": 0.0002, "loss": 0.5908, "mean_token_accuracy": 0.7608205825090408, "num_tokens": 17421949.0, "step": 4811 }, { "entropy": 0.5708445012569427, "epoch": 4.489034064395707, "grad_norm": 0.3588291108608246, "learning_rate": 0.0002, "loss": 0.5637, "mean_token_accuracy": 0.7688730508089066, "num_tokens": 17425712.0, "step": 4812 }, { "entropy": 0.5519346594810486, "epoch": 4.489967335510966, "grad_norm": 0.40661391615867615, "learning_rate": 0.0002, "loss": 0.544, "mean_token_accuracy": 0.7847284078598022, "num_tokens": 17429200.0, "step": 4813 }, { "entropy": 0.5810827314853668, "epoch": 4.490900606626225, "grad_norm": 0.357538640499115, "learning_rate": 0.0002, "loss": 0.5922, "mean_token_accuracy": 0.760045662522316, "num_tokens": 17432620.0, "step": 4814 }, { "entropy": 0.5310829803347588, "epoch": 4.491833877741484, "grad_norm": 0.3604326546192169, "learning_rate": 0.0002, "loss": 0.5387, "mean_token_accuracy": 0.777440682053566, "num_tokens": 17436197.0, "step": 4815 }, { "entropy": 0.5750412344932556, "epoch": 4.492767148856743, "grad_norm": 0.39230573177337646, "learning_rate": 0.0002, "loss": 0.5832, "mean_token_accuracy": 0.7674905061721802, "num_tokens": 17439893.0, "step": 4816 }, { "entropy": 0.5626011043787003, "epoch": 4.493700419972002, "grad_norm": 0.3937736749649048, "learning_rate": 0.0002, "loss": 0.569, "mean_token_accuracy": 0.7709674537181854, "num_tokens": 17443512.0, "step": 4817 }, { "entropy": 0.5991633087396622, "epoch": 4.494633691087261, "grad_norm": 0.36795926094055176, "learning_rate": 0.0002, "loss": 0.5956, "mean_token_accuracy": 0.7605534642934799, "num_tokens": 17447233.0, "step": 4818 }, { "entropy": 0.5650808215141296, "epoch": 4.49556696220252, "grad_norm": 0.42285382747650146, "learning_rate": 0.0002, "loss": 0.5608, "mean_token_accuracy": 0.7752731442451477, "num_tokens": 17450741.0, "step": 4819 }, { "entropy": 0.6046368330717087, "epoch": 4.496500233317779, "grad_norm": 0.42851147055625916, "learning_rate": 0.0002, "loss": 0.6205, "mean_token_accuracy": 0.7580421566963196, "num_tokens": 17454308.0, "step": 4820 }, { "entropy": 0.5484835654497147, "epoch": 4.497433504433038, "grad_norm": 0.3448130190372467, "learning_rate": 0.0002, "loss": 0.5481, "mean_token_accuracy": 0.7849153727293015, "num_tokens": 17457989.0, "step": 4821 }, { "entropy": 0.5306568294763565, "epoch": 4.498366775548297, "grad_norm": 0.3316846787929535, "learning_rate": 0.0002, "loss": 0.527, "mean_token_accuracy": 0.7895179837942123, "num_tokens": 17461625.0, "step": 4822 }, { "entropy": 0.5873294472694397, "epoch": 4.4993000466635555, "grad_norm": 0.3406471312046051, "learning_rate": 0.0002, "loss": 0.5817, "mean_token_accuracy": 0.7688527256250381, "num_tokens": 17465207.0, "step": 4823 }, { "entropy": 0.601048156619072, "epoch": 4.5002333177788145, "grad_norm": 0.3842972218990326, "learning_rate": 0.0002, "loss": 0.5968, "mean_token_accuracy": 0.7649251520633698, "num_tokens": 17468872.0, "step": 4824 }, { "entropy": 0.5554709434509277, "epoch": 4.5011665888940735, "grad_norm": 0.3771896958351135, "learning_rate": 0.0002, "loss": 0.5563, "mean_token_accuracy": 0.7746524661779404, "num_tokens": 17472482.0, "step": 4825 }, { "entropy": 0.5687798857688904, "epoch": 4.5020998600093325, "grad_norm": 0.37926656007766724, "learning_rate": 0.0002, "loss": 0.5639, "mean_token_accuracy": 0.7745184898376465, "num_tokens": 17476179.0, "step": 4826 }, { "entropy": 0.5726940482854843, "epoch": 4.5030331311245915, "grad_norm": 0.3484569489955902, "learning_rate": 0.0002, "loss": 0.5692, "mean_token_accuracy": 0.7704342603683472, "num_tokens": 17479944.0, "step": 4827 }, { "entropy": 0.5499776005744934, "epoch": 4.5039664022398505, "grad_norm": 0.41548749804496765, "learning_rate": 0.0002, "loss": 0.5557, "mean_token_accuracy": 0.7718091458082199, "num_tokens": 17483468.0, "step": 4828 }, { "entropy": 0.568818062543869, "epoch": 4.504899673355109, "grad_norm": 0.4642757475376129, "learning_rate": 0.0002, "loss": 0.5801, "mean_token_accuracy": 0.7647477239370346, "num_tokens": 17487094.0, "step": 4829 }, { "entropy": 0.5429835170507431, "epoch": 4.505832944470368, "grad_norm": 0.5153999924659729, "learning_rate": 0.0002, "loss": 0.5632, "mean_token_accuracy": 0.7725977748632431, "num_tokens": 17490747.0, "step": 4830 }, { "entropy": 0.5675468891859055, "epoch": 4.506766215585627, "grad_norm": 0.37133681774139404, "learning_rate": 0.0002, "loss": 0.5818, "mean_token_accuracy": 0.7639704793691635, "num_tokens": 17494484.0, "step": 4831 }, { "entropy": 0.546793594956398, "epoch": 4.507699486700886, "grad_norm": 0.41995495557785034, "learning_rate": 0.0002, "loss": 0.5465, "mean_token_accuracy": 0.7816768139600754, "num_tokens": 17498124.0, "step": 4832 }, { "entropy": 0.570551410317421, "epoch": 4.508632757816145, "grad_norm": 0.47133171558380127, "learning_rate": 0.0002, "loss": 0.5826, "mean_token_accuracy": 0.7676591724157333, "num_tokens": 17501644.0, "step": 4833 }, { "entropy": 0.57256168872118, "epoch": 4.509566028931404, "grad_norm": 0.3713406026363373, "learning_rate": 0.0002, "loss": 0.5848, "mean_token_accuracy": 0.7623922675848007, "num_tokens": 17505342.0, "step": 4834 }, { "entropy": 0.5276869609951973, "epoch": 4.510499300046663, "grad_norm": 0.37438830733299255, "learning_rate": 0.0002, "loss": 0.5359, "mean_token_accuracy": 0.7867025882005692, "num_tokens": 17508904.0, "step": 4835 }, { "entropy": 0.5624717026948929, "epoch": 4.511432571161922, "grad_norm": 0.3821360468864441, "learning_rate": 0.0002, "loss": 0.5605, "mean_token_accuracy": 0.7748641967773438, "num_tokens": 17512539.0, "step": 4836 }, { "entropy": 0.5040373802185059, "epoch": 4.512365842277181, "grad_norm": 0.37212783098220825, "learning_rate": 0.0002, "loss": 0.5019, "mean_token_accuracy": 0.7914271950721741, "num_tokens": 17515991.0, "step": 4837 }, { "entropy": 0.5793690383434296, "epoch": 4.51329911339244, "grad_norm": 0.38897573947906494, "learning_rate": 0.0002, "loss": 0.5677, "mean_token_accuracy": 0.7731021195650101, "num_tokens": 17519673.0, "step": 4838 }, { "entropy": 0.5586540699005127, "epoch": 4.514232384507699, "grad_norm": 0.45772773027420044, "learning_rate": 0.0002, "loss": 0.5575, "mean_token_accuracy": 0.7726636826992035, "num_tokens": 17523261.0, "step": 4839 }, { "entropy": 0.5587081760168076, "epoch": 4.515165655622958, "grad_norm": 0.35961800813674927, "learning_rate": 0.0002, "loss": 0.5514, "mean_token_accuracy": 0.7717085033655167, "num_tokens": 17526946.0, "step": 4840 }, { "entropy": 0.6024845242500305, "epoch": 4.516098926738217, "grad_norm": 0.34894222021102905, "learning_rate": 0.0002, "loss": 0.5957, "mean_token_accuracy": 0.7644893527030945, "num_tokens": 17530538.0, "step": 4841 }, { "entropy": 0.5687915682792664, "epoch": 4.517032197853476, "grad_norm": 0.3956739604473114, "learning_rate": 0.0002, "loss": 0.5835, "mean_token_accuracy": 0.755761131644249, "num_tokens": 17534190.0, "step": 4842 }, { "entropy": 0.5450727120041847, "epoch": 4.517965468968735, "grad_norm": 0.39620187878608704, "learning_rate": 0.0002, "loss": 0.5491, "mean_token_accuracy": 0.774897038936615, "num_tokens": 17537735.0, "step": 4843 }, { "entropy": 0.5693006813526154, "epoch": 4.518898740083994, "grad_norm": 0.3605823516845703, "learning_rate": 0.0002, "loss": 0.5802, "mean_token_accuracy": 0.772251084446907, "num_tokens": 17541397.0, "step": 4844 }, { "entropy": 0.5651182308793068, "epoch": 4.519832011199253, "grad_norm": 0.4015510082244873, "learning_rate": 0.0002, "loss": 0.5809, "mean_token_accuracy": 0.7685455232858658, "num_tokens": 17545002.0, "step": 4845 }, { "entropy": 0.5822598338127136, "epoch": 4.520765282314512, "grad_norm": 0.4435337781906128, "learning_rate": 0.0002, "loss": 0.6004, "mean_token_accuracy": 0.764484241604805, "num_tokens": 17548588.0, "step": 4846 }, { "entropy": 0.567173033952713, "epoch": 4.521698553429771, "grad_norm": 0.39889320731163025, "learning_rate": 0.0002, "loss": 0.5794, "mean_token_accuracy": 0.759841114282608, "num_tokens": 17552223.0, "step": 4847 }, { "entropy": 0.5198673605918884, "epoch": 4.52263182454503, "grad_norm": 0.37386447191238403, "learning_rate": 0.0002, "loss": 0.5136, "mean_token_accuracy": 0.7982574701309204, "num_tokens": 17555888.0, "step": 4848 }, { "entropy": 0.5510258823633194, "epoch": 4.523565095660289, "grad_norm": 0.3940255343914032, "learning_rate": 0.0002, "loss": 0.5509, "mean_token_accuracy": 0.7768819481134415, "num_tokens": 17559415.0, "step": 4849 }, { "entropy": 0.556463286280632, "epoch": 4.524498366775548, "grad_norm": 0.3421221375465393, "learning_rate": 0.0002, "loss": 0.567, "mean_token_accuracy": 0.7713297307491302, "num_tokens": 17563133.0, "step": 4850 }, { "entropy": 0.5411703288555145, "epoch": 4.525431637890807, "grad_norm": 0.3595718741416931, "learning_rate": 0.0002, "loss": 0.5441, "mean_token_accuracy": 0.7758969366550446, "num_tokens": 17566787.0, "step": 4851 }, { "entropy": 0.570067971944809, "epoch": 4.526364909006066, "grad_norm": 0.3781256079673767, "learning_rate": 0.0002, "loss": 0.5628, "mean_token_accuracy": 0.7718492150306702, "num_tokens": 17570385.0, "step": 4852 }, { "entropy": 0.5682332664728165, "epoch": 4.527298180121325, "grad_norm": 0.3341864347457886, "learning_rate": 0.0002, "loss": 0.5601, "mean_token_accuracy": 0.7788263261318207, "num_tokens": 17574047.0, "step": 4853 }, { "entropy": 0.5168418288230896, "epoch": 4.528231451236584, "grad_norm": 0.3196272850036621, "learning_rate": 0.0002, "loss": 0.5082, "mean_token_accuracy": 0.7947773486375809, "num_tokens": 17577735.0, "step": 4854 }, { "entropy": 0.5205790475010872, "epoch": 4.529164722351843, "grad_norm": 0.4400723874568939, "learning_rate": 0.0002, "loss": 0.5289, "mean_token_accuracy": 0.7856476306915283, "num_tokens": 17581435.0, "step": 4855 }, { "entropy": 0.5679063946008682, "epoch": 4.530097993467102, "grad_norm": 0.39205801486968994, "learning_rate": 0.0002, "loss": 0.5755, "mean_token_accuracy": 0.7734402269124985, "num_tokens": 17585069.0, "step": 4856 }, { "entropy": 0.5887645930051804, "epoch": 4.531031264582361, "grad_norm": 0.3447405695915222, "learning_rate": 0.0002, "loss": 0.5841, "mean_token_accuracy": 0.7684680223464966, "num_tokens": 17588702.0, "step": 4857 }, { "entropy": 0.5446571111679077, "epoch": 4.53196453569762, "grad_norm": 0.3261997401714325, "learning_rate": 0.0002, "loss": 0.5359, "mean_token_accuracy": 0.785684809088707, "num_tokens": 17592288.0, "step": 4858 }, { "entropy": 0.5548210144042969, "epoch": 4.532897806812879, "grad_norm": 0.3187585473060608, "learning_rate": 0.0002, "loss": 0.5499, "mean_token_accuracy": 0.7803417593240738, "num_tokens": 17596039.0, "step": 4859 }, { "entropy": 0.5536800473928452, "epoch": 4.533831077928138, "grad_norm": 0.4121701717376709, "learning_rate": 0.0002, "loss": 0.5545, "mean_token_accuracy": 0.776318296790123, "num_tokens": 17599581.0, "step": 4860 }, { "entropy": 0.5483148992061615, "epoch": 4.534764349043397, "grad_norm": 0.39247459173202515, "learning_rate": 0.0002, "loss": 0.5608, "mean_token_accuracy": 0.7680959105491638, "num_tokens": 17603123.0, "step": 4861 }, { "entropy": 0.6087500005960464, "epoch": 4.535697620158656, "grad_norm": 0.4023435711860657, "learning_rate": 0.0002, "loss": 0.6089, "mean_token_accuracy": 0.7568105459213257, "num_tokens": 17606729.0, "step": 4862 }, { "entropy": 0.5186846479773521, "epoch": 4.536630891273915, "grad_norm": 0.4977905750274658, "learning_rate": 0.0002, "loss": 0.5348, "mean_token_accuracy": 0.7905682027339935, "num_tokens": 17610283.0, "step": 4863 }, { "entropy": 0.5812677145004272, "epoch": 4.537564162389174, "grad_norm": 0.41021087765693665, "learning_rate": 0.0002, "loss": 0.5847, "mean_token_accuracy": 0.7666632235050201, "num_tokens": 17613917.0, "step": 4864 }, { "entropy": 0.5350098684430122, "epoch": 4.538497433504433, "grad_norm": 0.4474293887615204, "learning_rate": 0.0002, "loss": 0.5516, "mean_token_accuracy": 0.7781218588352203, "num_tokens": 17617482.0, "step": 4865 }, { "entropy": 0.6022578030824661, "epoch": 4.539430704619692, "grad_norm": 0.337429404258728, "learning_rate": 0.0002, "loss": 0.6047, "mean_token_accuracy": 0.7590168416500092, "num_tokens": 17621252.0, "step": 4866 }, { "entropy": 0.562186986207962, "epoch": 4.540363975734951, "grad_norm": 0.39323750138282776, "learning_rate": 0.0002, "loss": 0.565, "mean_token_accuracy": 0.7775822579860687, "num_tokens": 17624832.0, "step": 4867 }, { "entropy": 0.5566632896661758, "epoch": 4.54129724685021, "grad_norm": 0.3489106595516205, "learning_rate": 0.0002, "loss": 0.5504, "mean_token_accuracy": 0.7757734954357147, "num_tokens": 17628551.0, "step": 4868 }, { "entropy": 0.5522240176796913, "epoch": 4.542230517965469, "grad_norm": 0.34779608249664307, "learning_rate": 0.0002, "loss": 0.5486, "mean_token_accuracy": 0.7803953588008881, "num_tokens": 17632180.0, "step": 4869 }, { "entropy": 0.5843191891908646, "epoch": 4.543163789080728, "grad_norm": 0.3515261113643646, "learning_rate": 0.0002, "loss": 0.5784, "mean_token_accuracy": 0.7634051889181137, "num_tokens": 17635763.0, "step": 4870 }, { "entropy": 0.5410157740116119, "epoch": 4.544097060195987, "grad_norm": 0.4196840524673462, "learning_rate": 0.0002, "loss": 0.5421, "mean_token_accuracy": 0.7808069288730621, "num_tokens": 17639366.0, "step": 4871 }, { "entropy": 0.5360670015215874, "epoch": 4.545030331311246, "grad_norm": 0.405295729637146, "learning_rate": 0.0002, "loss": 0.5376, "mean_token_accuracy": 0.7813647091388702, "num_tokens": 17642929.0, "step": 4872 }, { "entropy": 0.5770669877529144, "epoch": 4.545963602426505, "grad_norm": 0.373299777507782, "learning_rate": 0.0002, "loss": 0.5667, "mean_token_accuracy": 0.7711631804704666, "num_tokens": 17646600.0, "step": 4873 }, { "entropy": 0.55177141726017, "epoch": 4.546896873541764, "grad_norm": 0.34068775177001953, "learning_rate": 0.0002, "loss": 0.5475, "mean_token_accuracy": 0.7801592200994492, "num_tokens": 17650241.0, "step": 4874 }, { "entropy": 0.5539674982428551, "epoch": 4.547830144657023, "grad_norm": 0.45854437351226807, "learning_rate": 0.0002, "loss": 0.5581, "mean_token_accuracy": 0.772121012210846, "num_tokens": 17653813.0, "step": 4875 }, { "entropy": 0.5514517277479172, "epoch": 4.548763415772282, "grad_norm": 0.4662773907184601, "learning_rate": 0.0002, "loss": 0.5712, "mean_token_accuracy": 0.7677302807569504, "num_tokens": 17657422.0, "step": 4876 }, { "entropy": 0.5859925448894501, "epoch": 4.549696686887541, "grad_norm": 0.35193949937820435, "learning_rate": 0.0002, "loss": 0.5831, "mean_token_accuracy": 0.7674762606620789, "num_tokens": 17661119.0, "step": 4877 }, { "entropy": 0.5574215650558472, "epoch": 4.5506299580028, "grad_norm": 0.5004472136497498, "learning_rate": 0.0002, "loss": 0.5692, "mean_token_accuracy": 0.7727451324462891, "num_tokens": 17664741.0, "step": 4878 }, { "entropy": 0.5128000602126122, "epoch": 4.5515632291180586, "grad_norm": 0.5175631046295166, "learning_rate": 0.0002, "loss": 0.5394, "mean_token_accuracy": 0.784002348780632, "num_tokens": 17668274.0, "step": 4879 }, { "entropy": 0.5896311402320862, "epoch": 4.5524965002333175, "grad_norm": 0.44500064849853516, "learning_rate": 0.0002, "loss": 0.5935, "mean_token_accuracy": 0.7533291876316071, "num_tokens": 17671887.0, "step": 4880 }, { "entropy": 0.5984329879283905, "epoch": 4.5534297713485765, "grad_norm": 0.4442961812019348, "learning_rate": 0.0002, "loss": 0.5907, "mean_token_accuracy": 0.7610793262720108, "num_tokens": 17675476.0, "step": 4881 }, { "entropy": 0.5953851491212845, "epoch": 4.5543630424638355, "grad_norm": 0.39245375990867615, "learning_rate": 0.0002, "loss": 0.6015, "mean_token_accuracy": 0.7616232335567474, "num_tokens": 17679023.0, "step": 4882 }, { "entropy": 0.5877227187156677, "epoch": 4.5552963135790945, "grad_norm": 0.49223241209983826, "learning_rate": 0.0002, "loss": 0.6072, "mean_token_accuracy": 0.7583972364664078, "num_tokens": 17682789.0, "step": 4883 }, { "entropy": 0.5757845789194107, "epoch": 4.5562295846943535, "grad_norm": 0.41956886649131775, "learning_rate": 0.0002, "loss": 0.5801, "mean_token_accuracy": 0.7638586014509201, "num_tokens": 17686368.0, "step": 4884 }, { "entropy": 0.598517119884491, "epoch": 4.5571628558096124, "grad_norm": 0.3665843904018402, "learning_rate": 0.0002, "loss": 0.6077, "mean_token_accuracy": 0.7532899230718613, "num_tokens": 17690087.0, "step": 4885 }, { "entropy": 0.5357743799686432, "epoch": 4.558096126924871, "grad_norm": 0.39076387882232666, "learning_rate": 0.0002, "loss": 0.5415, "mean_token_accuracy": 0.7782648056745529, "num_tokens": 17693711.0, "step": 4886 }, { "entropy": 0.5373507812619209, "epoch": 4.55902939804013, "grad_norm": 0.28244534134864807, "learning_rate": 0.0002, "loss": 0.5275, "mean_token_accuracy": 0.7857670336961746, "num_tokens": 17697479.0, "step": 4887 }, { "entropy": 0.5383172780275345, "epoch": 4.559962669155389, "grad_norm": 0.38541579246520996, "learning_rate": 0.0002, "loss": 0.5386, "mean_token_accuracy": 0.7821384370326996, "num_tokens": 17701131.0, "step": 4888 }, { "entropy": 0.5912415087223053, "epoch": 4.560895940270648, "grad_norm": 0.3622971475124359, "learning_rate": 0.0002, "loss": 0.5849, "mean_token_accuracy": 0.7605313062667847, "num_tokens": 17704702.0, "step": 4889 }, { "entropy": 0.6008728742599487, "epoch": 4.561829211385907, "grad_norm": 0.32936641573905945, "learning_rate": 0.0002, "loss": 0.602, "mean_token_accuracy": 0.758951336145401, "num_tokens": 17708360.0, "step": 4890 }, { "entropy": 0.5416419804096222, "epoch": 4.562762482501166, "grad_norm": 0.3837810158729553, "learning_rate": 0.0002, "loss": 0.5586, "mean_token_accuracy": 0.7777370512485504, "num_tokens": 17711950.0, "step": 4891 }, { "entropy": 0.5769273340702057, "epoch": 4.563695753616425, "grad_norm": 0.4786092936992645, "learning_rate": 0.0002, "loss": 0.5825, "mean_token_accuracy": 0.7637661397457123, "num_tokens": 17715665.0, "step": 4892 }, { "entropy": 0.58542300760746, "epoch": 4.564629024731684, "grad_norm": 0.41529908776283264, "learning_rate": 0.0002, "loss": 0.594, "mean_token_accuracy": 0.7617106139659882, "num_tokens": 17719349.0, "step": 4893 }, { "entropy": 0.6184741854667664, "epoch": 4.565562295846943, "grad_norm": 0.4369775056838989, "learning_rate": 0.0002, "loss": 0.6218, "mean_token_accuracy": 0.7563212662935257, "num_tokens": 17723015.0, "step": 4894 }, { "entropy": 0.5169181376695633, "epoch": 4.566495566962202, "grad_norm": 0.3778000771999359, "learning_rate": 0.0002, "loss": 0.5319, "mean_token_accuracy": 0.782123401761055, "num_tokens": 17726602.0, "step": 4895 }, { "entropy": 0.5754657834768295, "epoch": 4.567428838077461, "grad_norm": 0.3857405185699463, "learning_rate": 0.0002, "loss": 0.5766, "mean_token_accuracy": 0.7734043598175049, "num_tokens": 17730153.0, "step": 4896 }, { "entropy": 0.5911608934402466, "epoch": 4.56836210919272, "grad_norm": 0.35402077436447144, "learning_rate": 0.0002, "loss": 0.5885, "mean_token_accuracy": 0.7683266699314117, "num_tokens": 17733820.0, "step": 4897 }, { "entropy": 0.5808236747980118, "epoch": 4.569295380307979, "grad_norm": 0.4585820734500885, "learning_rate": 0.0002, "loss": 0.5882, "mean_token_accuracy": 0.7628980129957199, "num_tokens": 17737429.0, "step": 4898 }, { "entropy": 0.5127824768424034, "epoch": 4.570228651423238, "grad_norm": 0.3762039244174957, "learning_rate": 0.0002, "loss": 0.5141, "mean_token_accuracy": 0.793137714266777, "num_tokens": 17740967.0, "step": 4899 }, { "entropy": 0.5578572675585747, "epoch": 4.571161922538497, "grad_norm": 0.3430737257003784, "learning_rate": 0.0002, "loss": 0.5527, "mean_token_accuracy": 0.7819212377071381, "num_tokens": 17744630.0, "step": 4900 }, { "entropy": 0.5726819634437561, "epoch": 4.572095193653756, "grad_norm": 0.37427571415901184, "learning_rate": 0.0002, "loss": 0.5805, "mean_token_accuracy": 0.7651753127574921, "num_tokens": 17748239.0, "step": 4901 }, { "entropy": 0.5819738060235977, "epoch": 4.573028464769015, "grad_norm": 0.36536529660224915, "learning_rate": 0.0002, "loss": 0.5784, "mean_token_accuracy": 0.7721845954656601, "num_tokens": 17751861.0, "step": 4902 }, { "entropy": 0.5454451218247414, "epoch": 4.573961735884274, "grad_norm": 0.34601929783821106, "learning_rate": 0.0002, "loss": 0.537, "mean_token_accuracy": 0.786949560046196, "num_tokens": 17755428.0, "step": 4903 }, { "entropy": 0.5405720770359039, "epoch": 4.574895006999533, "grad_norm": 0.35252729058265686, "learning_rate": 0.0002, "loss": 0.5525, "mean_token_accuracy": 0.7742167860269547, "num_tokens": 17759116.0, "step": 4904 }, { "entropy": 0.5135308727622032, "epoch": 4.575828278114792, "grad_norm": 0.3628905415534973, "learning_rate": 0.0002, "loss": 0.521, "mean_token_accuracy": 0.7840693444013596, "num_tokens": 17762734.0, "step": 4905 }, { "entropy": 0.5811951011419296, "epoch": 4.576761549230051, "grad_norm": 0.3007758855819702, "learning_rate": 0.0002, "loss": 0.5778, "mean_token_accuracy": 0.7685562670230865, "num_tokens": 17766490.0, "step": 4906 }, { "entropy": 0.5629690736532211, "epoch": 4.57769482034531, "grad_norm": 0.3381650745868683, "learning_rate": 0.0002, "loss": 0.5666, "mean_token_accuracy": 0.7749863415956497, "num_tokens": 17770145.0, "step": 4907 }, { "entropy": 0.5596809089183807, "epoch": 4.578628091460569, "grad_norm": 0.3843808174133301, "learning_rate": 0.0002, "loss": 0.5649, "mean_token_accuracy": 0.7740264981985092, "num_tokens": 17773769.0, "step": 4908 }, { "entropy": 0.5583049654960632, "epoch": 4.579561362575828, "grad_norm": 0.40866291522979736, "learning_rate": 0.0002, "loss": 0.557, "mean_token_accuracy": 0.7730708867311478, "num_tokens": 17777363.0, "step": 4909 }, { "entropy": 0.573640339076519, "epoch": 4.580494633691087, "grad_norm": 0.38706347346305847, "learning_rate": 0.0002, "loss": 0.584, "mean_token_accuracy": 0.7684453725814819, "num_tokens": 17780987.0, "step": 4910 }, { "entropy": 0.5870187282562256, "epoch": 4.581427904806346, "grad_norm": 0.4140099883079529, "learning_rate": 0.0002, "loss": 0.5882, "mean_token_accuracy": 0.7644986212253571, "num_tokens": 17784619.0, "step": 4911 }, { "entropy": 0.5704992413520813, "epoch": 4.582361175921605, "grad_norm": 0.3808387517929077, "learning_rate": 0.0002, "loss": 0.5751, "mean_token_accuracy": 0.7664143890142441, "num_tokens": 17788293.0, "step": 4912 }, { "entropy": 0.6025023013353348, "epoch": 4.583294447036864, "grad_norm": 0.4608018100261688, "learning_rate": 0.0002, "loss": 0.61, "mean_token_accuracy": 0.7514491081237793, "num_tokens": 17791929.0, "step": 4913 }, { "entropy": 0.5513797849416733, "epoch": 4.584227718152123, "grad_norm": 0.41797003149986267, "learning_rate": 0.0002, "loss": 0.5569, "mean_token_accuracy": 0.7725077718496323, "num_tokens": 17795460.0, "step": 4914 }, { "entropy": 0.5111840143799782, "epoch": 4.585160989267382, "grad_norm": 0.4658350348472595, "learning_rate": 0.0002, "loss": 0.516, "mean_token_accuracy": 0.7892905622720718, "num_tokens": 17799039.0, "step": 4915 }, { "entropy": 0.5401464402675629, "epoch": 4.586094260382641, "grad_norm": 0.35445863008499146, "learning_rate": 0.0002, "loss": 0.5429, "mean_token_accuracy": 0.7866924703121185, "num_tokens": 17802681.0, "step": 4916 }, { "entropy": 0.5723909437656403, "epoch": 4.5870275314979, "grad_norm": 0.4094473719596863, "learning_rate": 0.0002, "loss": 0.573, "mean_token_accuracy": 0.7711297422647476, "num_tokens": 17806311.0, "step": 4917 }, { "entropy": 0.5654982924461365, "epoch": 4.587960802613159, "grad_norm": 0.3944474458694458, "learning_rate": 0.0002, "loss": 0.5628, "mean_token_accuracy": 0.7716998308897018, "num_tokens": 17809890.0, "step": 4918 }, { "entropy": 0.6154283285140991, "epoch": 4.588894073728418, "grad_norm": 0.361325740814209, "learning_rate": 0.0002, "loss": 0.6085, "mean_token_accuracy": 0.7544650733470917, "num_tokens": 17813580.0, "step": 4919 }, { "entropy": 0.5727228820323944, "epoch": 4.589827344843677, "grad_norm": 0.4918725788593292, "learning_rate": 0.0002, "loss": 0.5607, "mean_token_accuracy": 0.7710158377885818, "num_tokens": 17817235.0, "step": 4920 }, { "entropy": 0.5720652788877487, "epoch": 4.590760615958936, "grad_norm": 0.43101727962493896, "learning_rate": 0.0002, "loss": 0.5737, "mean_token_accuracy": 0.7680994272232056, "num_tokens": 17820899.0, "step": 4921 }, { "entropy": 0.5674052387475967, "epoch": 4.591693887074195, "grad_norm": 0.40416428446769714, "learning_rate": 0.0002, "loss": 0.5703, "mean_token_accuracy": 0.77132348716259, "num_tokens": 17824736.0, "step": 4922 }, { "entropy": 0.5856206268072128, "epoch": 4.592627158189454, "grad_norm": 0.4181523621082306, "learning_rate": 0.0002, "loss": 0.5871, "mean_token_accuracy": 0.7682902812957764, "num_tokens": 17828397.0, "step": 4923 }, { "entropy": 0.5355602577328682, "epoch": 4.593560429304713, "grad_norm": 0.48093366622924805, "learning_rate": 0.0002, "loss": 0.5516, "mean_token_accuracy": 0.7780580967664719, "num_tokens": 17831874.0, "step": 4924 }, { "entropy": 0.5499213486909866, "epoch": 4.594493700419972, "grad_norm": 0.41718265414237976, "learning_rate": 0.0002, "loss": 0.5719, "mean_token_accuracy": 0.7685301154851913, "num_tokens": 17835496.0, "step": 4925 }, { "entropy": 0.5388929024338722, "epoch": 4.595426971535231, "grad_norm": 0.3774538040161133, "learning_rate": 0.0002, "loss": 0.5433, "mean_token_accuracy": 0.7802752554416656, "num_tokens": 17839107.0, "step": 4926 }, { "entropy": 0.5668818801641464, "epoch": 4.59636024265049, "grad_norm": 0.3971269130706787, "learning_rate": 0.0002, "loss": 0.5782, "mean_token_accuracy": 0.762268990278244, "num_tokens": 17842634.0, "step": 4927 }, { "entropy": 0.5752602964639664, "epoch": 4.597293513765749, "grad_norm": 0.3261200487613678, "learning_rate": 0.0002, "loss": 0.5817, "mean_token_accuracy": 0.7720722854137421, "num_tokens": 17846418.0, "step": 4928 }, { "entropy": 0.5907959640026093, "epoch": 4.598226784881008, "grad_norm": 0.338697224855423, "learning_rate": 0.0002, "loss": 0.5876, "mean_token_accuracy": 0.7601265162229538, "num_tokens": 17850099.0, "step": 4929 }, { "entropy": 0.5529542714357376, "epoch": 4.599160055996267, "grad_norm": 0.35862720012664795, "learning_rate": 0.0002, "loss": 0.5535, "mean_token_accuracy": 0.77744260430336, "num_tokens": 17853631.0, "step": 4930 }, { "entropy": 0.5241840183734894, "epoch": 4.600093327111526, "grad_norm": 0.34995201230049133, "learning_rate": 0.0002, "loss": 0.5292, "mean_token_accuracy": 0.7857812494039536, "num_tokens": 17857214.0, "step": 4931 }, { "entropy": 0.5560179799795151, "epoch": 4.601026598226785, "grad_norm": 0.3886911869049072, "learning_rate": 0.0002, "loss": 0.5579, "mean_token_accuracy": 0.7783230990171432, "num_tokens": 17860828.0, "step": 4932 }, { "entropy": 0.5601569265127182, "epoch": 4.601959869342044, "grad_norm": 0.36788541078567505, "learning_rate": 0.0002, "loss": 0.5551, "mean_token_accuracy": 0.7718390375375748, "num_tokens": 17864384.0, "step": 4933 }, { "entropy": 0.5611379593610764, "epoch": 4.602893140457303, "grad_norm": 0.4165249466896057, "learning_rate": 0.0002, "loss": 0.5687, "mean_token_accuracy": 0.7693435996770859, "num_tokens": 17867834.0, "step": 4934 }, { "entropy": 0.5699248313903809, "epoch": 4.603826411572562, "grad_norm": 0.4547255337238312, "learning_rate": 0.0002, "loss": 0.577, "mean_token_accuracy": 0.76997409760952, "num_tokens": 17871448.0, "step": 4935 }, { "entropy": 0.5698123425245285, "epoch": 4.6047596826878205, "grad_norm": 0.36557647585868835, "learning_rate": 0.0002, "loss": 0.5741, "mean_token_accuracy": 0.775556817650795, "num_tokens": 17875160.0, "step": 4936 }, { "entropy": 0.564688116312027, "epoch": 4.6056929538030795, "grad_norm": 0.5132119059562683, "learning_rate": 0.0002, "loss": 0.576, "mean_token_accuracy": 0.7697847187519073, "num_tokens": 17878708.0, "step": 4937 }, { "entropy": 0.5665983557701111, "epoch": 4.6066262249183385, "grad_norm": 0.4288041889667511, "learning_rate": 0.0002, "loss": 0.5775, "mean_token_accuracy": 0.7678466439247131, "num_tokens": 17882305.0, "step": 4938 }, { "entropy": 0.5552066117525101, "epoch": 4.6075594960335975, "grad_norm": 0.38323742151260376, "learning_rate": 0.0002, "loss": 0.5622, "mean_token_accuracy": 0.7732952535152435, "num_tokens": 17885921.0, "step": 4939 }, { "entropy": 0.5394971966743469, "epoch": 4.6084927671488565, "grad_norm": 0.34470832347869873, "learning_rate": 0.0002, "loss": 0.5413, "mean_token_accuracy": 0.7835830897092819, "num_tokens": 17889617.0, "step": 4940 }, { "entropy": 0.5684772878885269, "epoch": 4.6094260382641155, "grad_norm": 0.4002438485622406, "learning_rate": 0.0002, "loss": 0.5765, "mean_token_accuracy": 0.7671052366495132, "num_tokens": 17893302.0, "step": 4941 }, { "entropy": 0.5214446038007736, "epoch": 4.610359309379374, "grad_norm": 0.39311322569847107, "learning_rate": 0.0002, "loss": 0.5298, "mean_token_accuracy": 0.788799986243248, "num_tokens": 17896858.0, "step": 4942 }, { "entropy": 0.5975686460733414, "epoch": 4.611292580494633, "grad_norm": 0.3629685938358307, "learning_rate": 0.0002, "loss": 0.6017, "mean_token_accuracy": 0.7609939873218536, "num_tokens": 17900480.0, "step": 4943 }, { "entropy": 0.5247260853648186, "epoch": 4.612225851609892, "grad_norm": 0.37814605236053467, "learning_rate": 0.0002, "loss": 0.5307, "mean_token_accuracy": 0.7878833264112473, "num_tokens": 17904129.0, "step": 4944 }, { "entropy": 0.5988909006118774, "epoch": 4.613159122725151, "grad_norm": 0.4237043857574463, "learning_rate": 0.0002, "loss": 0.5981, "mean_token_accuracy": 0.759353831410408, "num_tokens": 17907791.0, "step": 4945 }, { "entropy": 0.5626085251569748, "epoch": 4.61409239384041, "grad_norm": 0.3508053421974182, "learning_rate": 0.0002, "loss": 0.5681, "mean_token_accuracy": 0.767866775393486, "num_tokens": 17911330.0, "step": 4946 }, { "entropy": 0.5380107015371323, "epoch": 4.615025664955669, "grad_norm": 0.36534425616264343, "learning_rate": 0.0002, "loss": 0.5396, "mean_token_accuracy": 0.7822549045085907, "num_tokens": 17914859.0, "step": 4947 }, { "entropy": 0.5590044558048248, "epoch": 4.615958936070928, "grad_norm": 0.37018176913261414, "learning_rate": 0.0002, "loss": 0.5692, "mean_token_accuracy": 0.7691466361284256, "num_tokens": 17918396.0, "step": 4948 }, { "entropy": 0.596370130777359, "epoch": 4.616892207186187, "grad_norm": 0.3803148865699768, "learning_rate": 0.0002, "loss": 0.5813, "mean_token_accuracy": 0.7629829496145248, "num_tokens": 17922082.0, "step": 4949 }, { "entropy": 0.5839471220970154, "epoch": 4.617825478301446, "grad_norm": 0.371419221162796, "learning_rate": 0.0002, "loss": 0.5794, "mean_token_accuracy": 0.7676007747650146, "num_tokens": 17925680.0, "step": 4950 }, { "entropy": 0.5669324547052383, "epoch": 4.618758749416705, "grad_norm": 0.368034690618515, "learning_rate": 0.0002, "loss": 0.5684, "mean_token_accuracy": 0.7725664973258972, "num_tokens": 17929343.0, "step": 4951 }, { "entropy": 0.5742137283086777, "epoch": 4.619692020531964, "grad_norm": 0.38132011890411377, "learning_rate": 0.0002, "loss": 0.575, "mean_token_accuracy": 0.76503786444664, "num_tokens": 17933068.0, "step": 4952 }, { "entropy": 0.5598926097154617, "epoch": 4.620625291647223, "grad_norm": 0.36973875761032104, "learning_rate": 0.0002, "loss": 0.5622, "mean_token_accuracy": 0.7733671218156815, "num_tokens": 17936718.0, "step": 4953 }, { "entropy": 0.557572603225708, "epoch": 4.621558562762482, "grad_norm": 0.4235858619213104, "learning_rate": 0.0002, "loss": 0.567, "mean_token_accuracy": 0.773655578494072, "num_tokens": 17940342.0, "step": 4954 }, { "entropy": 0.5682855993509293, "epoch": 4.622491833877741, "grad_norm": 0.4238036870956421, "learning_rate": 0.0002, "loss": 0.5774, "mean_token_accuracy": 0.7713465988636017, "num_tokens": 17943952.0, "step": 4955 }, { "entropy": 0.5518262684345245, "epoch": 4.623425104993, "grad_norm": 0.36027878522872925, "learning_rate": 0.0002, "loss": 0.5517, "mean_token_accuracy": 0.7747040241956711, "num_tokens": 17947485.0, "step": 4956 }, { "entropy": 0.605153352022171, "epoch": 4.624358376108259, "grad_norm": 0.37587451934814453, "learning_rate": 0.0002, "loss": 0.6211, "mean_token_accuracy": 0.7509528547525406, "num_tokens": 17951203.0, "step": 4957 }, { "entropy": 0.5593715384602547, "epoch": 4.625291647223518, "grad_norm": 0.4109523594379425, "learning_rate": 0.0002, "loss": 0.5666, "mean_token_accuracy": 0.7779312580823898, "num_tokens": 17954720.0, "step": 4958 }, { "entropy": 0.6022717654705048, "epoch": 4.626224918338777, "grad_norm": 0.4050321578979492, "learning_rate": 0.0002, "loss": 0.5909, "mean_token_accuracy": 0.7690692096948624, "num_tokens": 17958467.0, "step": 4959 }, { "entropy": 0.5721611231565475, "epoch": 4.627158189454036, "grad_norm": 0.3848203122615814, "learning_rate": 0.0002, "loss": 0.5599, "mean_token_accuracy": 0.7704412937164307, "num_tokens": 17962011.0, "step": 4960 }, { "entropy": 0.5627020373940468, "epoch": 4.628091460569295, "grad_norm": 0.3559623956680298, "learning_rate": 0.0002, "loss": 0.566, "mean_token_accuracy": 0.7753792405128479, "num_tokens": 17965676.0, "step": 4961 }, { "entropy": 0.5419072061777115, "epoch": 4.629024731684554, "grad_norm": 0.3701173663139343, "learning_rate": 0.0002, "loss": 0.5431, "mean_token_accuracy": 0.7847242206335068, "num_tokens": 17969217.0, "step": 4962 }, { "entropy": 0.5597690045833588, "epoch": 4.629958002799813, "grad_norm": 0.35718366503715515, "learning_rate": 0.0002, "loss": 0.5625, "mean_token_accuracy": 0.7735681384801865, "num_tokens": 17972902.0, "step": 4963 }, { "entropy": 0.5657523795962334, "epoch": 4.630891273915072, "grad_norm": 0.41195711493492126, "learning_rate": 0.0002, "loss": 0.5733, "mean_token_accuracy": 0.7702198475599289, "num_tokens": 17976620.0, "step": 4964 }, { "entropy": 0.5611817911267281, "epoch": 4.631824545030331, "grad_norm": 0.3862907588481903, "learning_rate": 0.0002, "loss": 0.5649, "mean_token_accuracy": 0.7696749716997147, "num_tokens": 17980233.0, "step": 4965 }, { "entropy": 0.5579839795827866, "epoch": 4.63275781614559, "grad_norm": 0.45833253860473633, "learning_rate": 0.0002, "loss": 0.5714, "mean_token_accuracy": 0.7699311673641205, "num_tokens": 17983899.0, "step": 4966 }, { "entropy": 0.5284889042377472, "epoch": 4.633691087260849, "grad_norm": 0.38397467136383057, "learning_rate": 0.0002, "loss": 0.5345, "mean_token_accuracy": 0.7785120010375977, "num_tokens": 17987439.0, "step": 4967 }, { "entropy": 0.5992829948663712, "epoch": 4.634624358376108, "grad_norm": 0.3382185995578766, "learning_rate": 0.0002, "loss": 0.5903, "mean_token_accuracy": 0.7626045197248459, "num_tokens": 17991029.0, "step": 4968 }, { "entropy": 0.5747019797563553, "epoch": 4.635557629491367, "grad_norm": 0.3707175850868225, "learning_rate": 0.0002, "loss": 0.5689, "mean_token_accuracy": 0.7764086574316025, "num_tokens": 17994672.0, "step": 4969 }, { "entropy": 0.5933233499526978, "epoch": 4.636490900606626, "grad_norm": 0.3848911225795746, "learning_rate": 0.0002, "loss": 0.5855, "mean_token_accuracy": 0.763680636882782, "num_tokens": 17998296.0, "step": 4970 }, { "entropy": 0.5781502276659012, "epoch": 4.637424171721885, "grad_norm": 0.4134160876274109, "learning_rate": 0.0002, "loss": 0.5695, "mean_token_accuracy": 0.770712673664093, "num_tokens": 18001916.0, "step": 4971 }, { "entropy": 0.5306430160999298, "epoch": 4.638357442837144, "grad_norm": 0.38553497195243835, "learning_rate": 0.0002, "loss": 0.54, "mean_token_accuracy": 0.7764711827039719, "num_tokens": 18005592.0, "step": 4972 }, { "entropy": 0.5890820920467377, "epoch": 4.639290713952403, "grad_norm": 0.4130083918571472, "learning_rate": 0.0002, "loss": 0.5887, "mean_token_accuracy": 0.7668064683675766, "num_tokens": 18009303.0, "step": 4973 }, { "entropy": 0.5615467056632042, "epoch": 4.640223985067662, "grad_norm": 0.4337501525878906, "learning_rate": 0.0002, "loss": 0.5831, "mean_token_accuracy": 0.7672310620546341, "num_tokens": 18012929.0, "step": 4974 }, { "entropy": 0.5469682142138481, "epoch": 4.641157256182921, "grad_norm": 0.4875669777393341, "learning_rate": 0.0002, "loss": 0.568, "mean_token_accuracy": 0.7724873423576355, "num_tokens": 18016589.0, "step": 4975 }, { "entropy": 0.5302253216505051, "epoch": 4.64209052729818, "grad_norm": 0.41162344813346863, "learning_rate": 0.0002, "loss": 0.5464, "mean_token_accuracy": 0.7773792445659637, "num_tokens": 18020246.0, "step": 4976 }, { "entropy": 0.5700079798698425, "epoch": 4.643023798413439, "grad_norm": 0.4367320239543915, "learning_rate": 0.0002, "loss": 0.5933, "mean_token_accuracy": 0.7608955800533295, "num_tokens": 18023879.0, "step": 4977 }, { "entropy": 0.5795313566923141, "epoch": 4.643957069528698, "grad_norm": 0.4009832441806793, "learning_rate": 0.0002, "loss": 0.591, "mean_token_accuracy": 0.7645910233259201, "num_tokens": 18027469.0, "step": 4978 }, { "entropy": 0.5432212874293327, "epoch": 4.644890340643957, "grad_norm": 0.45703744888305664, "learning_rate": 0.0002, "loss": 0.5358, "mean_token_accuracy": 0.7845307141542435, "num_tokens": 18031152.0, "step": 4979 }, { "entropy": 0.5747144371271133, "epoch": 4.645823611759216, "grad_norm": 0.4215880334377289, "learning_rate": 0.0002, "loss": 0.5731, "mean_token_accuracy": 0.7708906680345535, "num_tokens": 18034629.0, "step": 4980 }, { "entropy": 0.5442966669797897, "epoch": 4.646756882874475, "grad_norm": 0.3576415777206421, "learning_rate": 0.0002, "loss": 0.5306, "mean_token_accuracy": 0.7953539192676544, "num_tokens": 18038164.0, "step": 4981 }, { "entropy": 0.555517703294754, "epoch": 4.647690153989734, "grad_norm": 0.29764771461486816, "learning_rate": 0.0002, "loss": 0.554, "mean_token_accuracy": 0.7725921273231506, "num_tokens": 18041663.0, "step": 4982 }, { "entropy": 0.5913557708263397, "epoch": 4.648623425104993, "grad_norm": 0.37422189116477966, "learning_rate": 0.0002, "loss": 0.5808, "mean_token_accuracy": 0.7697311043739319, "num_tokens": 18045423.0, "step": 4983 }, { "entropy": 0.578028991818428, "epoch": 4.649556696220252, "grad_norm": 0.3242702782154083, "learning_rate": 0.0002, "loss": 0.5627, "mean_token_accuracy": 0.7736149728298187, "num_tokens": 18049107.0, "step": 4984 }, { "entropy": 0.5675028413534164, "epoch": 4.650489967335511, "grad_norm": 0.40332645177841187, "learning_rate": 0.0002, "loss": 0.5845, "mean_token_accuracy": 0.7699740827083588, "num_tokens": 18052787.0, "step": 4985 }, { "entropy": 0.5820091515779495, "epoch": 4.65142323845077, "grad_norm": 0.4116232693195343, "learning_rate": 0.0002, "loss": 0.5879, "mean_token_accuracy": 0.7641018778085709, "num_tokens": 18056348.0, "step": 4986 }, { "entropy": 0.571269690990448, "epoch": 4.652356509566029, "grad_norm": 0.49794450402259827, "learning_rate": 0.0002, "loss": 0.5832, "mean_token_accuracy": 0.7652582377195358, "num_tokens": 18060092.0, "step": 4987 }, { "entropy": 0.5511510968208313, "epoch": 4.653289780681288, "grad_norm": 0.37264284491539, "learning_rate": 0.0002, "loss": 0.5551, "mean_token_accuracy": 0.7807415276765823, "num_tokens": 18063635.0, "step": 4988 }, { "entropy": 0.5781686380505562, "epoch": 4.654223051796547, "grad_norm": 0.45104488730430603, "learning_rate": 0.0002, "loss": 0.5914, "mean_token_accuracy": 0.7641949355602264, "num_tokens": 18067243.0, "step": 4989 }, { "entropy": 0.5811182260513306, "epoch": 4.655156322911806, "grad_norm": 0.38881951570510864, "learning_rate": 0.0002, "loss": 0.5845, "mean_token_accuracy": 0.7607244253158569, "num_tokens": 18071034.0, "step": 4990 }, { "entropy": 0.5602719038724899, "epoch": 4.656089594027065, "grad_norm": 0.41093888878822327, "learning_rate": 0.0002, "loss": 0.5672, "mean_token_accuracy": 0.7707304060459137, "num_tokens": 18074617.0, "step": 4991 }, { "entropy": 0.5583410412073135, "epoch": 4.6570228651423236, "grad_norm": 0.452378511428833, "learning_rate": 0.0002, "loss": 0.5632, "mean_token_accuracy": 0.7713153958320618, "num_tokens": 18078269.0, "step": 4992 }, { "entropy": 0.6008237898349762, "epoch": 4.6579561362575825, "grad_norm": 0.3383327126502991, "learning_rate": 0.0002, "loss": 0.5933, "mean_token_accuracy": 0.7580035030841827, "num_tokens": 18081992.0, "step": 4993 }, { "entropy": 0.5705653131008148, "epoch": 4.6588894073728415, "grad_norm": 0.36094388365745544, "learning_rate": 0.0002, "loss": 0.5763, "mean_token_accuracy": 0.771105483174324, "num_tokens": 18085573.0, "step": 4994 }, { "entropy": 0.5634460896253586, "epoch": 4.6598226784881005, "grad_norm": 0.32939839363098145, "learning_rate": 0.0002, "loss": 0.5577, "mean_token_accuracy": 0.7744188010692596, "num_tokens": 18089224.0, "step": 4995 }, { "entropy": 0.5051965564489365, "epoch": 4.6607559496033595, "grad_norm": 0.32522428035736084, "learning_rate": 0.0002, "loss": 0.4994, "mean_token_accuracy": 0.799232542514801, "num_tokens": 18092873.0, "step": 4996 }, { "entropy": 0.5869854092597961, "epoch": 4.6616892207186185, "grad_norm": 0.3603444993495941, "learning_rate": 0.0002, "loss": 0.5905, "mean_token_accuracy": 0.7686928361654282, "num_tokens": 18096454.0, "step": 4997 }, { "entropy": 0.5734087526798248, "epoch": 4.6626224918338774, "grad_norm": 0.4866026043891907, "learning_rate": 0.0002, "loss": 0.5828, "mean_token_accuracy": 0.7703135013580322, "num_tokens": 18100083.0, "step": 4998 }, { "entropy": 0.5618395060300827, "epoch": 4.663555762949136, "grad_norm": 0.35154232382774353, "learning_rate": 0.0002, "loss": 0.5596, "mean_token_accuracy": 0.7735393643379211, "num_tokens": 18103648.0, "step": 4999 }, { "entropy": 0.5635645985603333, "epoch": 4.664489034064395, "grad_norm": 0.3624590039253235, "learning_rate": 0.0002, "loss": 0.5683, "mean_token_accuracy": 0.7704758942127228, "num_tokens": 18107349.0, "step": 5000 }, { "entropy": 0.5585620850324631, "epoch": 4.665422305179654, "grad_norm": 0.32337599992752075, "learning_rate": 0.0002, "loss": 0.5472, "mean_token_accuracy": 0.7767833024263382, "num_tokens": 18111049.0, "step": 5001 }, { "entropy": 0.5379780828952789, "epoch": 4.666355576294913, "grad_norm": 0.42132630944252014, "learning_rate": 0.0002, "loss": 0.546, "mean_token_accuracy": 0.7748342454433441, "num_tokens": 18114704.0, "step": 5002 }, { "entropy": 0.5837940573692322, "epoch": 4.667288847410172, "grad_norm": 0.4543617069721222, "learning_rate": 0.0002, "loss": 0.5856, "mean_token_accuracy": 0.7711394131183624, "num_tokens": 18118363.0, "step": 5003 }, { "entropy": 0.5341905578970909, "epoch": 4.668222118525431, "grad_norm": 0.39037516713142395, "learning_rate": 0.0002, "loss": 0.5307, "mean_token_accuracy": 0.7863487154245377, "num_tokens": 18121968.0, "step": 5004 }, { "entropy": 0.5289327874779701, "epoch": 4.66915538964069, "grad_norm": 0.4402127265930176, "learning_rate": 0.0002, "loss": 0.5512, "mean_token_accuracy": 0.7784434705972672, "num_tokens": 18125558.0, "step": 5005 }, { "entropy": 0.5823215246200562, "epoch": 4.670088660755949, "grad_norm": 0.428911417722702, "learning_rate": 0.0002, "loss": 0.5995, "mean_token_accuracy": 0.7578289806842804, "num_tokens": 18129119.0, "step": 5006 }, { "entropy": 0.5070472285151482, "epoch": 4.671021931871208, "grad_norm": 0.3923477828502655, "learning_rate": 0.0002, "loss": 0.5149, "mean_token_accuracy": 0.7868524938821793, "num_tokens": 18132587.0, "step": 5007 }, { "entropy": 0.5931232869625092, "epoch": 4.671955202986467, "grad_norm": 0.3721891939640045, "learning_rate": 0.0002, "loss": 0.5985, "mean_token_accuracy": 0.7571764588356018, "num_tokens": 18136218.0, "step": 5008 }, { "entropy": 0.5718326568603516, "epoch": 4.672888474101726, "grad_norm": 0.3615352213382721, "learning_rate": 0.0002, "loss": 0.5734, "mean_token_accuracy": 0.765153557062149, "num_tokens": 18139830.0, "step": 5009 }, { "entropy": 0.59474016726017, "epoch": 4.673821745216985, "grad_norm": 0.34739500284194946, "learning_rate": 0.0002, "loss": 0.5889, "mean_token_accuracy": 0.7614308446645737, "num_tokens": 18143418.0, "step": 5010 }, { "entropy": 0.5449605733156204, "epoch": 4.674755016332244, "grad_norm": 0.3190179765224457, "learning_rate": 0.0002, "loss": 0.5453, "mean_token_accuracy": 0.7779452502727509, "num_tokens": 18147094.0, "step": 5011 }, { "entropy": 0.5654537677764893, "epoch": 4.675688287447503, "grad_norm": 0.4078752100467682, "learning_rate": 0.0002, "loss": 0.5731, "mean_token_accuracy": 0.7651380449533463, "num_tokens": 18150580.0, "step": 5012 }, { "entropy": 0.5744655504822731, "epoch": 4.676621558562762, "grad_norm": 0.3833051323890686, "learning_rate": 0.0002, "loss": 0.571, "mean_token_accuracy": 0.7714661359786987, "num_tokens": 18154167.0, "step": 5013 }, { "entropy": 0.5979563146829605, "epoch": 4.677554829678021, "grad_norm": 0.3876741826534271, "learning_rate": 0.0002, "loss": 0.5947, "mean_token_accuracy": 0.7584546059370041, "num_tokens": 18157957.0, "step": 5014 }, { "entropy": 0.562088206410408, "epoch": 4.67848810079328, "grad_norm": 0.3548921048641205, "learning_rate": 0.0002, "loss": 0.566, "mean_token_accuracy": 0.7745717018842697, "num_tokens": 18161497.0, "step": 5015 }, { "entropy": 0.562276229262352, "epoch": 4.679421371908539, "grad_norm": 0.43038710951805115, "learning_rate": 0.0002, "loss": 0.5677, "mean_token_accuracy": 0.7737734615802765, "num_tokens": 18165049.0, "step": 5016 }, { "entropy": 0.5521158501505852, "epoch": 4.680354643023798, "grad_norm": 0.3933962881565094, "learning_rate": 0.0002, "loss": 0.5574, "mean_token_accuracy": 0.7749777138233185, "num_tokens": 18168614.0, "step": 5017 }, { "entropy": 0.5621806383132935, "epoch": 4.681287914139057, "grad_norm": 0.4247540831565857, "learning_rate": 0.0002, "loss": 0.5747, "mean_token_accuracy": 0.7705670893192291, "num_tokens": 18172388.0, "step": 5018 }, { "entropy": 0.5635628700256348, "epoch": 4.682221185254316, "grad_norm": 0.42945539951324463, "learning_rate": 0.0002, "loss": 0.5718, "mean_token_accuracy": 0.7666988223791122, "num_tokens": 18176001.0, "step": 5019 }, { "entropy": 0.5511122941970825, "epoch": 4.683154456369575, "grad_norm": 0.4064529538154602, "learning_rate": 0.0002, "loss": 0.5493, "mean_token_accuracy": 0.7860204428434372, "num_tokens": 18179749.0, "step": 5020 }, { "entropy": 0.6048489660024643, "epoch": 4.684087727484834, "grad_norm": 0.35500380396842957, "learning_rate": 0.0002, "loss": 0.6021, "mean_token_accuracy": 0.7570336014032364, "num_tokens": 18183473.0, "step": 5021 }, { "entropy": 0.5815774947404861, "epoch": 4.685020998600093, "grad_norm": 0.3697463274002075, "learning_rate": 0.0002, "loss": 0.5785, "mean_token_accuracy": 0.7721184641122818, "num_tokens": 18187174.0, "step": 5022 }, { "entropy": 0.5607841461896896, "epoch": 4.685954269715352, "grad_norm": 0.37846431136131287, "learning_rate": 0.0002, "loss": 0.5696, "mean_token_accuracy": 0.766611710190773, "num_tokens": 18190797.0, "step": 5023 }, { "entropy": 0.5683610439300537, "epoch": 4.686887540830611, "grad_norm": 0.29998815059661865, "learning_rate": 0.0002, "loss": 0.5559, "mean_token_accuracy": 0.7764761596918106, "num_tokens": 18194420.0, "step": 5024 }, { "entropy": 0.5878759175539017, "epoch": 4.68782081194587, "grad_norm": 0.3252502977848053, "learning_rate": 0.0002, "loss": 0.5741, "mean_token_accuracy": 0.7739330232143402, "num_tokens": 18198187.0, "step": 5025 }, { "entropy": 0.5730148553848267, "epoch": 4.688754083061129, "grad_norm": 0.34262868762016296, "learning_rate": 0.0002, "loss": 0.5668, "mean_token_accuracy": 0.7699866592884064, "num_tokens": 18201876.0, "step": 5026 }, { "entropy": 0.5699986517429352, "epoch": 4.689687354176388, "grad_norm": 0.3510224223136902, "learning_rate": 0.0002, "loss": 0.5767, "mean_token_accuracy": 0.765339583158493, "num_tokens": 18205465.0, "step": 5027 }, { "entropy": 0.5450512170791626, "epoch": 4.690620625291647, "grad_norm": 0.4107488691806793, "learning_rate": 0.0002, "loss": 0.5477, "mean_token_accuracy": 0.7845502495765686, "num_tokens": 18209022.0, "step": 5028 }, { "entropy": 0.585419163107872, "epoch": 4.691553896406906, "grad_norm": 0.3993939757347107, "learning_rate": 0.0002, "loss": 0.5857, "mean_token_accuracy": 0.7656446546316147, "num_tokens": 18212716.0, "step": 5029 }, { "entropy": 0.5504361540079117, "epoch": 4.692487167522165, "grad_norm": 0.41391298174858093, "learning_rate": 0.0002, "loss": 0.5602, "mean_token_accuracy": 0.7732379734516144, "num_tokens": 18216400.0, "step": 5030 }, { "entropy": 0.5807351171970367, "epoch": 4.693420438637424, "grad_norm": 0.4029393792152405, "learning_rate": 0.0002, "loss": 0.6009, "mean_token_accuracy": 0.760921448469162, "num_tokens": 18220019.0, "step": 5031 }, { "entropy": 0.5897707790136337, "epoch": 4.694353709752683, "grad_norm": 0.43813082575798035, "learning_rate": 0.0002, "loss": 0.6018, "mean_token_accuracy": 0.7577258944511414, "num_tokens": 18223780.0, "step": 5032 }, { "entropy": 0.5624167025089264, "epoch": 4.695286980867942, "grad_norm": 0.40226835012435913, "learning_rate": 0.0002, "loss": 0.5532, "mean_token_accuracy": 0.7727477699518204, "num_tokens": 18227362.0, "step": 5033 }, { "entropy": 0.5695414692163467, "epoch": 4.696220251983201, "grad_norm": 0.3680605888366699, "learning_rate": 0.0002, "loss": 0.5711, "mean_token_accuracy": 0.7727688997983932, "num_tokens": 18231017.0, "step": 5034 }, { "entropy": 0.5434642285108566, "epoch": 4.69715352309846, "grad_norm": 0.40561443567276, "learning_rate": 0.0002, "loss": 0.5418, "mean_token_accuracy": 0.7839132249355316, "num_tokens": 18234640.0, "step": 5035 }, { "entropy": 0.5437801480293274, "epoch": 4.698086794213719, "grad_norm": 0.3984902799129486, "learning_rate": 0.0002, "loss": 0.5428, "mean_token_accuracy": 0.7809140384197235, "num_tokens": 18238243.0, "step": 5036 }, { "entropy": 0.5589670091867447, "epoch": 4.699020065328978, "grad_norm": 0.35066697001457214, "learning_rate": 0.0002, "loss": 0.5589, "mean_token_accuracy": 0.7765732556581497, "num_tokens": 18241845.0, "step": 5037 }, { "entropy": 0.5801659375429153, "epoch": 4.699953336444237, "grad_norm": 0.39263102412223816, "learning_rate": 0.0002, "loss": 0.5901, "mean_token_accuracy": 0.7594034671783447, "num_tokens": 18245494.0, "step": 5038 }, { "entropy": 0.5591834187507629, "epoch": 4.700886607559496, "grad_norm": 0.36797958612442017, "learning_rate": 0.0002, "loss": 0.5639, "mean_token_accuracy": 0.769641175866127, "num_tokens": 18249101.0, "step": 5039 }, { "entropy": 0.5689128935337067, "epoch": 4.701819878674755, "grad_norm": 0.3930954337120056, "learning_rate": 0.0002, "loss": 0.5718, "mean_token_accuracy": 0.766726940870285, "num_tokens": 18252759.0, "step": 5040 }, { "entropy": 0.5718165338039398, "epoch": 4.702753149790014, "grad_norm": 0.3527078926563263, "learning_rate": 0.0002, "loss": 0.5704, "mean_token_accuracy": 0.769456148147583, "num_tokens": 18256475.0, "step": 5041 }, { "entropy": 0.5755462348461151, "epoch": 4.703686420905273, "grad_norm": 0.330449640750885, "learning_rate": 0.0002, "loss": 0.5678, "mean_token_accuracy": 0.7741593420505524, "num_tokens": 18260087.0, "step": 5042 }, { "entropy": 0.5838131308555603, "epoch": 4.704619692020532, "grad_norm": 0.4561574161052704, "learning_rate": 0.0002, "loss": 0.595, "mean_token_accuracy": 0.7631572782993317, "num_tokens": 18263685.0, "step": 5043 }, { "entropy": 0.5441323965787888, "epoch": 4.705552963135791, "grad_norm": 0.329885333776474, "learning_rate": 0.0002, "loss": 0.5413, "mean_token_accuracy": 0.7796768099069595, "num_tokens": 18267413.0, "step": 5044 }, { "entropy": 0.5797205567359924, "epoch": 4.70648623425105, "grad_norm": 0.37232518196105957, "learning_rate": 0.0002, "loss": 0.5862, "mean_token_accuracy": 0.7633067518472672, "num_tokens": 18271052.0, "step": 5045 }, { "entropy": 0.5970920473337173, "epoch": 4.707419505366309, "grad_norm": 0.35692012310028076, "learning_rate": 0.0002, "loss": 0.5941, "mean_token_accuracy": 0.7591648995876312, "num_tokens": 18274735.0, "step": 5046 }, { "entropy": 0.5266027599573135, "epoch": 4.708352776481568, "grad_norm": 0.4026413559913635, "learning_rate": 0.0002, "loss": 0.5355, "mean_token_accuracy": 0.7843427658081055, "num_tokens": 18278382.0, "step": 5047 }, { "entropy": 0.5891039818525314, "epoch": 4.709286047596827, "grad_norm": 0.3617851138114929, "learning_rate": 0.0002, "loss": 0.5926, "mean_token_accuracy": 0.7602651715278625, "num_tokens": 18282125.0, "step": 5048 }, { "entropy": 0.5761052668094635, "epoch": 4.7102193187120855, "grad_norm": 0.3768720328807831, "learning_rate": 0.0002, "loss": 0.5766, "mean_token_accuracy": 0.766923114657402, "num_tokens": 18285767.0, "step": 5049 }, { "entropy": 0.5608512610197067, "epoch": 4.7111525898273445, "grad_norm": 0.43096834421157837, "learning_rate": 0.0002, "loss": 0.5673, "mean_token_accuracy": 0.7768329232931137, "num_tokens": 18289343.0, "step": 5050 }, { "entropy": 0.5704580247402191, "epoch": 4.7120858609426035, "grad_norm": 0.395560622215271, "learning_rate": 0.0002, "loss": 0.5761, "mean_token_accuracy": 0.7664727121591568, "num_tokens": 18293018.0, "step": 5051 }, { "entropy": 0.5817736834287643, "epoch": 4.7130191320578625, "grad_norm": 0.3807607889175415, "learning_rate": 0.0002, "loss": 0.5902, "mean_token_accuracy": 0.7636866420507431, "num_tokens": 18296604.0, "step": 5052 }, { "entropy": 0.5642538666725159, "epoch": 4.7139524031731215, "grad_norm": 0.36983779072761536, "learning_rate": 0.0002, "loss": 0.5673, "mean_token_accuracy": 0.7758589684963226, "num_tokens": 18300229.0, "step": 5053 }, { "entropy": 0.5529363602399826, "epoch": 4.7148856742883805, "grad_norm": 0.3400697112083435, "learning_rate": 0.0002, "loss": 0.5601, "mean_token_accuracy": 0.7724595069885254, "num_tokens": 18304040.0, "step": 5054 }, { "entropy": 0.5980473607778549, "epoch": 4.715818945403639, "grad_norm": 0.44896769523620605, "learning_rate": 0.0002, "loss": 0.6149, "mean_token_accuracy": 0.7503587603569031, "num_tokens": 18307746.0, "step": 5055 }, { "entropy": 0.605592355132103, "epoch": 4.716752216518898, "grad_norm": 0.42256245017051697, "learning_rate": 0.0002, "loss": 0.612, "mean_token_accuracy": 0.7550223171710968, "num_tokens": 18311413.0, "step": 5056 }, { "entropy": 0.5795286595821381, "epoch": 4.717685487634157, "grad_norm": 0.43707701563835144, "learning_rate": 0.0002, "loss": 0.5803, "mean_token_accuracy": 0.7613016963005066, "num_tokens": 18315095.0, "step": 5057 }, { "entropy": 0.5604373216629028, "epoch": 4.718618758749416, "grad_norm": 0.37234389781951904, "learning_rate": 0.0002, "loss": 0.553, "mean_token_accuracy": 0.7776554673910141, "num_tokens": 18318760.0, "step": 5058 }, { "entropy": 0.5545274019241333, "epoch": 4.719552029864675, "grad_norm": 0.41477537155151367, "learning_rate": 0.0002, "loss": 0.5529, "mean_token_accuracy": 0.7789442092180252, "num_tokens": 18322323.0, "step": 5059 }, { "entropy": 0.5842546075582504, "epoch": 4.720485300979934, "grad_norm": 0.47883927822113037, "learning_rate": 0.0002, "loss": 0.5891, "mean_token_accuracy": 0.7628040015697479, "num_tokens": 18326010.0, "step": 5060 }, { "entropy": 0.5843833535909653, "epoch": 4.721418572095193, "grad_norm": 0.4231046736240387, "learning_rate": 0.0002, "loss": 0.5894, "mean_token_accuracy": 0.7694827318191528, "num_tokens": 18329645.0, "step": 5061 }, { "entropy": 0.5329452976584435, "epoch": 4.722351843210452, "grad_norm": 0.38733136653900146, "learning_rate": 0.0002, "loss": 0.5446, "mean_token_accuracy": 0.7838130742311478, "num_tokens": 18333133.0, "step": 5062 }, { "entropy": 0.5631667897105217, "epoch": 4.723285114325711, "grad_norm": 0.4057009816169739, "learning_rate": 0.0002, "loss": 0.5695, "mean_token_accuracy": 0.7689582258462906, "num_tokens": 18336662.0, "step": 5063 }, { "entropy": 0.5505263954401016, "epoch": 4.72421838544097, "grad_norm": 0.36115944385528564, "learning_rate": 0.0002, "loss": 0.5512, "mean_token_accuracy": 0.7808873355388641, "num_tokens": 18340287.0, "step": 5064 }, { "entropy": 0.5702440291643143, "epoch": 4.725151656556229, "grad_norm": 0.42071905732154846, "learning_rate": 0.0002, "loss": 0.5881, "mean_token_accuracy": 0.764273390173912, "num_tokens": 18343934.0, "step": 5065 }, { "entropy": 0.5582850724458694, "epoch": 4.726084927671488, "grad_norm": 0.3550545275211334, "learning_rate": 0.0002, "loss": 0.5627, "mean_token_accuracy": 0.7720849364995956, "num_tokens": 18347479.0, "step": 5066 }, { "entropy": 0.5376018136739731, "epoch": 4.727018198786747, "grad_norm": 0.3472418189048767, "learning_rate": 0.0002, "loss": 0.535, "mean_token_accuracy": 0.7830753475427628, "num_tokens": 18351130.0, "step": 5067 }, { "entropy": 0.5334690511226654, "epoch": 4.727951469902006, "grad_norm": 0.4736270606517792, "learning_rate": 0.0002, "loss": 0.5469, "mean_token_accuracy": 0.776764377951622, "num_tokens": 18354740.0, "step": 5068 }, { "entropy": 0.5840517729520798, "epoch": 4.728884741017265, "grad_norm": 0.4066859185695648, "learning_rate": 0.0002, "loss": 0.5842, "mean_token_accuracy": 0.7640927284955978, "num_tokens": 18358452.0, "step": 5069 }, { "entropy": 0.5681255012750626, "epoch": 4.729818012132524, "grad_norm": 0.3891274631023407, "learning_rate": 0.0002, "loss": 0.5698, "mean_token_accuracy": 0.7749434560537338, "num_tokens": 18362020.0, "step": 5070 }, { "entropy": 0.6121579259634018, "epoch": 4.730751283247783, "grad_norm": 0.3703775405883789, "learning_rate": 0.0002, "loss": 0.5992, "mean_token_accuracy": 0.7524862289428711, "num_tokens": 18365613.0, "step": 5071 }, { "entropy": 0.5804189741611481, "epoch": 4.731684554363042, "grad_norm": 0.35658591985702515, "learning_rate": 0.0002, "loss": 0.5728, "mean_token_accuracy": 0.7698460668325424, "num_tokens": 18369332.0, "step": 5072 }, { "entropy": 0.6065908223390579, "epoch": 4.732617825478301, "grad_norm": 0.5089513659477234, "learning_rate": 0.0002, "loss": 0.6214, "mean_token_accuracy": 0.7435170263051987, "num_tokens": 18372852.0, "step": 5073 }, { "entropy": 0.5356921926140785, "epoch": 4.73355109659356, "grad_norm": 0.3563210964202881, "learning_rate": 0.0002, "loss": 0.5379, "mean_token_accuracy": 0.7829476445913315, "num_tokens": 18376384.0, "step": 5074 }, { "entropy": 0.5513594672083855, "epoch": 4.734484367708819, "grad_norm": 0.4178679287433624, "learning_rate": 0.0002, "loss": 0.5638, "mean_token_accuracy": 0.7729248255491257, "num_tokens": 18379942.0, "step": 5075 }, { "entropy": 0.5583355948328972, "epoch": 4.735417638824078, "grad_norm": 0.4414004385471344, "learning_rate": 0.0002, "loss": 0.5659, "mean_token_accuracy": 0.7733424156904221, "num_tokens": 18383518.0, "step": 5076 }, { "entropy": 0.5451033115386963, "epoch": 4.736350909939337, "grad_norm": 0.4189080595970154, "learning_rate": 0.0002, "loss": 0.5642, "mean_token_accuracy": 0.7715637385845184, "num_tokens": 18387034.0, "step": 5077 }, { "entropy": 0.5480293408036232, "epoch": 4.737284181054596, "grad_norm": 0.4308619797229767, "learning_rate": 0.0002, "loss": 0.554, "mean_token_accuracy": 0.7701990753412247, "num_tokens": 18390649.0, "step": 5078 }, { "entropy": 0.5781579464673996, "epoch": 4.738217452169855, "grad_norm": 0.41704225540161133, "learning_rate": 0.0002, "loss": 0.5715, "mean_token_accuracy": 0.7676239162683487, "num_tokens": 18394352.0, "step": 5079 }, { "entropy": 0.5778117179870605, "epoch": 4.739150723285114, "grad_norm": 0.48700037598609924, "learning_rate": 0.0002, "loss": 0.577, "mean_token_accuracy": 0.7703861743211746, "num_tokens": 18397918.0, "step": 5080 }, { "entropy": 0.5668669492006302, "epoch": 4.740083994400373, "grad_norm": 0.39342671632766724, "learning_rate": 0.0002, "loss": 0.5665, "mean_token_accuracy": 0.7746629565954208, "num_tokens": 18401485.0, "step": 5081 }, { "entropy": 0.5942127853631973, "epoch": 4.741017265515632, "grad_norm": 0.3404448628425598, "learning_rate": 0.0002, "loss": 0.5846, "mean_token_accuracy": 0.7675492316484451, "num_tokens": 18405217.0, "step": 5082 }, { "entropy": 0.5665842443704605, "epoch": 4.741950536630891, "grad_norm": 0.3359329402446747, "learning_rate": 0.0002, "loss": 0.5701, "mean_token_accuracy": 0.770055279135704, "num_tokens": 18408908.0, "step": 5083 }, { "entropy": 0.5619405806064606, "epoch": 4.74288380774615, "grad_norm": 0.38861510157585144, "learning_rate": 0.0002, "loss": 0.5736, "mean_token_accuracy": 0.7627428770065308, "num_tokens": 18412381.0, "step": 5084 }, { "entropy": 0.5555403307080269, "epoch": 4.743817078861409, "grad_norm": 0.3391020596027374, "learning_rate": 0.0002, "loss": 0.5488, "mean_token_accuracy": 0.7819044440984726, "num_tokens": 18416071.0, "step": 5085 }, { "entropy": 0.5990473181009293, "epoch": 4.744750349976668, "grad_norm": 0.4421513080596924, "learning_rate": 0.0002, "loss": 0.6043, "mean_token_accuracy": 0.7526031732559204, "num_tokens": 18419874.0, "step": 5086 }, { "entropy": 0.5461741015315056, "epoch": 4.745683621091927, "grad_norm": 0.4245346784591675, "learning_rate": 0.0002, "loss": 0.5645, "mean_token_accuracy": 0.7708476781845093, "num_tokens": 18423476.0, "step": 5087 }, { "entropy": 0.5527116358280182, "epoch": 4.746616892207186, "grad_norm": 0.3820510804653168, "learning_rate": 0.0002, "loss": 0.56, "mean_token_accuracy": 0.7734947204589844, "num_tokens": 18427248.0, "step": 5088 }, { "entropy": 0.5628604739904404, "epoch": 4.747550163322445, "grad_norm": 0.3720964789390564, "learning_rate": 0.0002, "loss": 0.5704, "mean_token_accuracy": 0.7698292583227158, "num_tokens": 18430847.0, "step": 5089 }, { "entropy": 0.5315804034471512, "epoch": 4.748483434437704, "grad_norm": 0.40078020095825195, "learning_rate": 0.0002, "loss": 0.5325, "mean_token_accuracy": 0.7923571765422821, "num_tokens": 18434424.0, "step": 5090 }, { "entropy": 0.5595552176237106, "epoch": 4.749416705552963, "grad_norm": 0.4703252613544464, "learning_rate": 0.0002, "loss": 0.5709, "mean_token_accuracy": 0.7679237723350525, "num_tokens": 18438029.0, "step": 5091 }, { "entropy": 0.5628156960010529, "epoch": 4.750349976668222, "grad_norm": 0.3748379945755005, "learning_rate": 0.0002, "loss": 0.5657, "mean_token_accuracy": 0.7789324820041656, "num_tokens": 18441620.0, "step": 5092 }, { "entropy": 0.5216574519872665, "epoch": 4.751283247783481, "grad_norm": 0.33287954330444336, "learning_rate": 0.0002, "loss": 0.5185, "mean_token_accuracy": 0.7933188080787659, "num_tokens": 18445263.0, "step": 5093 }, { "entropy": 0.5919139832258224, "epoch": 4.75221651889874, "grad_norm": 0.41579148173332214, "learning_rate": 0.0002, "loss": 0.5938, "mean_token_accuracy": 0.7660512775182724, "num_tokens": 18448929.0, "step": 5094 }, { "entropy": 0.5862676203250885, "epoch": 4.753149790013999, "grad_norm": 0.3802694082260132, "learning_rate": 0.0002, "loss": 0.5805, "mean_token_accuracy": 0.7643318921327591, "num_tokens": 18452594.0, "step": 5095 }, { "entropy": 0.551288902759552, "epoch": 4.754083061129258, "grad_norm": 0.3710348308086395, "learning_rate": 0.0002, "loss": 0.5614, "mean_token_accuracy": 0.775962308049202, "num_tokens": 18456256.0, "step": 5096 }, { "entropy": 0.5732976645231247, "epoch": 4.755016332244517, "grad_norm": 0.43090611696243286, "learning_rate": 0.0002, "loss": 0.5827, "mean_token_accuracy": 0.7625113278627396, "num_tokens": 18459809.0, "step": 5097 }, { "entropy": 0.5635059326887131, "epoch": 4.755949603359776, "grad_norm": 0.37242797017097473, "learning_rate": 0.0002, "loss": 0.5681, "mean_token_accuracy": 0.7653497755527496, "num_tokens": 18463469.0, "step": 5098 }, { "entropy": 0.5539982914924622, "epoch": 4.756882874475035, "grad_norm": 0.35279422998428345, "learning_rate": 0.0002, "loss": 0.5611, "mean_token_accuracy": 0.7691478878259659, "num_tokens": 18467120.0, "step": 5099 }, { "entropy": 0.6158871650695801, "epoch": 4.757816145590294, "grad_norm": 0.347899466753006, "learning_rate": 0.0002, "loss": 0.6145, "mean_token_accuracy": 0.7580831944942474, "num_tokens": 18470829.0, "step": 5100 }, { "entropy": 0.5565620958805084, "epoch": 4.758749416705553, "grad_norm": 0.365593284368515, "learning_rate": 0.0002, "loss": 0.5629, "mean_token_accuracy": 0.774996742606163, "num_tokens": 18474443.0, "step": 5101 }, { "entropy": 0.5970499366521835, "epoch": 4.759682687820812, "grad_norm": 0.38508906960487366, "learning_rate": 0.0002, "loss": 0.5962, "mean_token_accuracy": 0.7591585516929626, "num_tokens": 18477992.0, "step": 5102 }, { "entropy": 0.5666817873716354, "epoch": 4.760615958936071, "grad_norm": 0.4194212257862091, "learning_rate": 0.0002, "loss": 0.5676, "mean_token_accuracy": 0.7736937701702118, "num_tokens": 18481695.0, "step": 5103 }, { "entropy": 0.5345659255981445, "epoch": 4.76154923005133, "grad_norm": 0.3435582220554352, "learning_rate": 0.0002, "loss": 0.5381, "mean_token_accuracy": 0.7822657823562622, "num_tokens": 18485300.0, "step": 5104 }, { "entropy": 0.5747577175498009, "epoch": 4.7624825011665886, "grad_norm": 0.36687004566192627, "learning_rate": 0.0002, "loss": 0.5781, "mean_token_accuracy": 0.7638672590255737, "num_tokens": 18488949.0, "step": 5105 }, { "entropy": 0.5695013403892517, "epoch": 4.7634157722818475, "grad_norm": 0.398416668176651, "learning_rate": 0.0002, "loss": 0.563, "mean_token_accuracy": 0.7759317755699158, "num_tokens": 18492591.0, "step": 5106 }, { "entropy": 0.5556615889072418, "epoch": 4.7643490433971065, "grad_norm": 0.399731308221817, "learning_rate": 0.0002, "loss": 0.5618, "mean_token_accuracy": 0.7697865962982178, "num_tokens": 18496331.0, "step": 5107 }, { "entropy": 0.5499328672885895, "epoch": 4.7652823145123655, "grad_norm": 0.4589386582374573, "learning_rate": 0.0002, "loss": 0.5519, "mean_token_accuracy": 0.7821925282478333, "num_tokens": 18499827.0, "step": 5108 }, { "entropy": 0.5341510474681854, "epoch": 4.7662155856276245, "grad_norm": 0.39651715755462646, "learning_rate": 0.0002, "loss": 0.539, "mean_token_accuracy": 0.7774975895881653, "num_tokens": 18503365.0, "step": 5109 }, { "entropy": 0.578183576464653, "epoch": 4.7671488567428835, "grad_norm": 0.4379296600818634, "learning_rate": 0.0002, "loss": 0.5859, "mean_token_accuracy": 0.7633722573518753, "num_tokens": 18506988.0, "step": 5110 }, { "entropy": 0.5514602661132812, "epoch": 4.7680821278581424, "grad_norm": 0.37421566247940063, "learning_rate": 0.0002, "loss": 0.5502, "mean_token_accuracy": 0.7771903574466705, "num_tokens": 18510575.0, "step": 5111 }, { "entropy": 0.5403428226709366, "epoch": 4.769015398973401, "grad_norm": 0.406074196100235, "learning_rate": 0.0002, "loss": 0.5577, "mean_token_accuracy": 0.7756370157003403, "num_tokens": 18514170.0, "step": 5112 }, { "entropy": 0.5706124007701874, "epoch": 4.76994867008866, "grad_norm": 0.3239794969558716, "learning_rate": 0.0002, "loss": 0.5764, "mean_token_accuracy": 0.7622000277042389, "num_tokens": 18517790.0, "step": 5113 }, { "entropy": 0.5587579682469368, "epoch": 4.770881941203919, "grad_norm": 0.38912075757980347, "learning_rate": 0.0002, "loss": 0.563, "mean_token_accuracy": 0.7724925130605698, "num_tokens": 18521404.0, "step": 5114 }, { "entropy": 0.5425563752651215, "epoch": 4.771815212319178, "grad_norm": 0.3219035267829895, "learning_rate": 0.0002, "loss": 0.5402, "mean_token_accuracy": 0.7812967300415039, "num_tokens": 18525087.0, "step": 5115 }, { "entropy": 0.589273676276207, "epoch": 4.772748483434437, "grad_norm": 0.6534790396690369, "learning_rate": 0.0002, "loss": 0.5975, "mean_token_accuracy": 0.7626163363456726, "num_tokens": 18528870.0, "step": 5116 }, { "entropy": 0.5529173985123634, "epoch": 4.773681754549696, "grad_norm": 0.36693084239959717, "learning_rate": 0.0002, "loss": 0.5597, "mean_token_accuracy": 0.768507331609726, "num_tokens": 18532568.0, "step": 5117 }, { "entropy": 0.5914974808692932, "epoch": 4.774615025664955, "grad_norm": 0.3475857377052307, "learning_rate": 0.0002, "loss": 0.5764, "mean_token_accuracy": 0.7657051086425781, "num_tokens": 18536293.0, "step": 5118 }, { "entropy": 0.5759420543909073, "epoch": 4.775548296780214, "grad_norm": 0.421247661113739, "learning_rate": 0.0002, "loss": 0.5624, "mean_token_accuracy": 0.7766792923212051, "num_tokens": 18539861.0, "step": 5119 }, { "entropy": 0.5752776265144348, "epoch": 4.776481567895473, "grad_norm": 0.3892023265361786, "learning_rate": 0.0002, "loss": 0.5713, "mean_token_accuracy": 0.7700335532426834, "num_tokens": 18543546.0, "step": 5120 }, { "entropy": 0.5746910274028778, "epoch": 4.777414839010732, "grad_norm": 0.38955438137054443, "learning_rate": 0.0002, "loss": 0.5781, "mean_token_accuracy": 0.7705122083425522, "num_tokens": 18547088.0, "step": 5121 }, { "entropy": 0.5736331790685654, "epoch": 4.778348110125991, "grad_norm": 0.48265740275382996, "learning_rate": 0.0002, "loss": 0.5802, "mean_token_accuracy": 0.7677689492702484, "num_tokens": 18550668.0, "step": 5122 }, { "entropy": 0.5550585389137268, "epoch": 4.77928138124125, "grad_norm": 0.4533918797969818, "learning_rate": 0.0002, "loss": 0.5722, "mean_token_accuracy": 0.769371435046196, "num_tokens": 18554268.0, "step": 5123 }, { "entropy": 0.5753709822893143, "epoch": 4.780214652356509, "grad_norm": 0.3893023729324341, "learning_rate": 0.0002, "loss": 0.5835, "mean_token_accuracy": 0.7700848579406738, "num_tokens": 18557869.0, "step": 5124 }, { "entropy": 0.558521032333374, "epoch": 4.781147923471768, "grad_norm": 0.4036959409713745, "learning_rate": 0.0002, "loss": 0.5663, "mean_token_accuracy": 0.7692359536886215, "num_tokens": 18561533.0, "step": 5125 }, { "entropy": 0.5600506216287613, "epoch": 4.782081194587027, "grad_norm": 0.3951052725315094, "learning_rate": 0.0002, "loss": 0.568, "mean_token_accuracy": 0.7627758681774139, "num_tokens": 18565131.0, "step": 5126 }, { "entropy": 0.5490160137414932, "epoch": 4.783014465702286, "grad_norm": 0.35038208961486816, "learning_rate": 0.0002, "loss": 0.5646, "mean_token_accuracy": 0.7740890085697174, "num_tokens": 18568710.0, "step": 5127 }, { "entropy": 0.5838751792907715, "epoch": 4.783947736817545, "grad_norm": 0.39104920625686646, "learning_rate": 0.0002, "loss": 0.5958, "mean_token_accuracy": 0.7594696879386902, "num_tokens": 18572191.0, "step": 5128 }, { "entropy": 0.5664685964584351, "epoch": 4.784881007932804, "grad_norm": 0.37496304512023926, "learning_rate": 0.0002, "loss": 0.5637, "mean_token_accuracy": 0.7747564911842346, "num_tokens": 18575881.0, "step": 5129 }, { "entropy": 0.580343559384346, "epoch": 4.785814279048063, "grad_norm": 0.3486812114715576, "learning_rate": 0.0002, "loss": 0.5695, "mean_token_accuracy": 0.7729068994522095, "num_tokens": 18579450.0, "step": 5130 }, { "entropy": 0.5726107954978943, "epoch": 4.786747550163322, "grad_norm": 0.38250645995140076, "learning_rate": 0.0002, "loss": 0.5756, "mean_token_accuracy": 0.7672691494226456, "num_tokens": 18583060.0, "step": 5131 }, { "entropy": 0.5857817381620407, "epoch": 4.787680821278581, "grad_norm": 0.40876755118370056, "learning_rate": 0.0002, "loss": 0.5867, "mean_token_accuracy": 0.7593191266059875, "num_tokens": 18586593.0, "step": 5132 }, { "entropy": 0.557344451546669, "epoch": 4.78861409239384, "grad_norm": 0.3811647295951843, "learning_rate": 0.0002, "loss": 0.5575, "mean_token_accuracy": 0.772866278886795, "num_tokens": 18590189.0, "step": 5133 }, { "entropy": 0.5561444908380508, "epoch": 4.789547363509099, "grad_norm": 0.41450008749961853, "learning_rate": 0.0002, "loss": 0.5648, "mean_token_accuracy": 0.7740989476442337, "num_tokens": 18593805.0, "step": 5134 }, { "entropy": 0.5619507730007172, "epoch": 4.790480634624358, "grad_norm": 0.40010982751846313, "learning_rate": 0.0002, "loss": 0.5636, "mean_token_accuracy": 0.7768718004226685, "num_tokens": 18597318.0, "step": 5135 }, { "entropy": 0.5590502470731735, "epoch": 4.791413905739617, "grad_norm": 0.455120712518692, "learning_rate": 0.0002, "loss": 0.5634, "mean_token_accuracy": 0.7748511135578156, "num_tokens": 18600997.0, "step": 5136 }, { "entropy": 0.5540226697921753, "epoch": 4.792347176854876, "grad_norm": 0.3542335629463196, "learning_rate": 0.0002, "loss": 0.5573, "mean_token_accuracy": 0.7756698131561279, "num_tokens": 18604694.0, "step": 5137 }, { "entropy": 0.5992708653211594, "epoch": 4.793280447970135, "grad_norm": 0.39759984612464905, "learning_rate": 0.0002, "loss": 0.6086, "mean_token_accuracy": 0.7530217468738556, "num_tokens": 18608319.0, "step": 5138 }, { "entropy": 0.5885222405195236, "epoch": 4.794213719085394, "grad_norm": 0.356664776802063, "learning_rate": 0.0002, "loss": 0.5849, "mean_token_accuracy": 0.7727283090353012, "num_tokens": 18611998.0, "step": 5139 }, { "entropy": 0.5429888367652893, "epoch": 4.795146990200653, "grad_norm": 0.4066486656665802, "learning_rate": 0.0002, "loss": 0.5487, "mean_token_accuracy": 0.7754170149564743, "num_tokens": 18615540.0, "step": 5140 }, { "entropy": 0.5757399648427963, "epoch": 4.796080261315912, "grad_norm": 0.4521086812019348, "learning_rate": 0.0002, "loss": 0.5912, "mean_token_accuracy": 0.7597895711660385, "num_tokens": 18619178.0, "step": 5141 }, { "entropy": 0.5457243546843529, "epoch": 4.797013532431171, "grad_norm": 0.380161315202713, "learning_rate": 0.0002, "loss": 0.5437, "mean_token_accuracy": 0.780634269118309, "num_tokens": 18622716.0, "step": 5142 }, { "entropy": 0.5576182007789612, "epoch": 4.79794680354643, "grad_norm": 0.3149712383747101, "learning_rate": 0.0002, "loss": 0.5522, "mean_token_accuracy": 0.7762923538684845, "num_tokens": 18626248.0, "step": 5143 }, { "entropy": 0.5978433936834335, "epoch": 4.798880074661689, "grad_norm": 0.3378632962703705, "learning_rate": 0.0002, "loss": 0.598, "mean_token_accuracy": 0.7627675533294678, "num_tokens": 18629867.0, "step": 5144 }, { "entropy": 0.5691583007574081, "epoch": 4.799813345776948, "grad_norm": 0.35108044743537903, "learning_rate": 0.0002, "loss": 0.5768, "mean_token_accuracy": 0.7693262845277786, "num_tokens": 18633488.0, "step": 5145 }, { "entropy": 0.595000147819519, "epoch": 4.800746616892207, "grad_norm": 0.42836713790893555, "learning_rate": 0.0002, "loss": 0.6055, "mean_token_accuracy": 0.7592506259679794, "num_tokens": 18637022.0, "step": 5146 }, { "entropy": 0.5811780244112015, "epoch": 4.801679888007466, "grad_norm": 0.3372402489185333, "learning_rate": 0.0002, "loss": 0.5818, "mean_token_accuracy": 0.7638733386993408, "num_tokens": 18640753.0, "step": 5147 }, { "entropy": 0.5784092992544174, "epoch": 4.802613159122725, "grad_norm": 0.39809414744377136, "learning_rate": 0.0002, "loss": 0.5728, "mean_token_accuracy": 0.7696513235569, "num_tokens": 18644369.0, "step": 5148 }, { "entropy": 0.5889221131801605, "epoch": 4.803546430237984, "grad_norm": 0.4042304754257202, "learning_rate": 0.0002, "loss": 0.594, "mean_token_accuracy": 0.7626517713069916, "num_tokens": 18648020.0, "step": 5149 }, { "entropy": 0.5174869671463966, "epoch": 4.804479701353243, "grad_norm": 0.4004286825656891, "learning_rate": 0.0002, "loss": 0.5265, "mean_token_accuracy": 0.7922818809747696, "num_tokens": 18651535.0, "step": 5150 }, { "entropy": 0.5726029127836227, "epoch": 4.805412972468502, "grad_norm": 0.383829265832901, "learning_rate": 0.0002, "loss": 0.5822, "mean_token_accuracy": 0.7672967314720154, "num_tokens": 18655202.0, "step": 5151 }, { "entropy": 0.5655099600553513, "epoch": 4.806346243583761, "grad_norm": 0.34822535514831543, "learning_rate": 0.0002, "loss": 0.563, "mean_token_accuracy": 0.7752173990011215, "num_tokens": 18658928.0, "step": 5152 }, { "entropy": 0.5501739084720612, "epoch": 4.80727951469902, "grad_norm": 0.4380563795566559, "learning_rate": 0.0002, "loss": 0.5536, "mean_token_accuracy": 0.7747508734464645, "num_tokens": 18662481.0, "step": 5153 }, { "entropy": 0.5859765112400055, "epoch": 4.808212785814279, "grad_norm": 0.4031268060207367, "learning_rate": 0.0002, "loss": 0.5739, "mean_token_accuracy": 0.7688797414302826, "num_tokens": 18666150.0, "step": 5154 }, { "entropy": 0.5684535279870033, "epoch": 4.809146056929538, "grad_norm": 0.37637659907341003, "learning_rate": 0.0002, "loss": 0.5654, "mean_token_accuracy": 0.7728755623102188, "num_tokens": 18669790.0, "step": 5155 }, { "entropy": 0.5711510181427002, "epoch": 4.810079328044797, "grad_norm": 0.48840948939323425, "learning_rate": 0.0002, "loss": 0.5842, "mean_token_accuracy": 0.7602245360612869, "num_tokens": 18673309.0, "step": 5156 }, { "entropy": 0.5835841447114944, "epoch": 4.811012599160056, "grad_norm": 0.3656798005104065, "learning_rate": 0.0002, "loss": 0.5874, "mean_token_accuracy": 0.7585870325565338, "num_tokens": 18676921.0, "step": 5157 }, { "entropy": 0.5623078495264053, "epoch": 4.811945870275315, "grad_norm": 0.39647746086120605, "learning_rate": 0.0002, "loss": 0.5632, "mean_token_accuracy": 0.7751936912536621, "num_tokens": 18680497.0, "step": 5158 }, { "entropy": 0.5570798963308334, "epoch": 4.812879141390574, "grad_norm": 0.3916913568973541, "learning_rate": 0.0002, "loss": 0.563, "mean_token_accuracy": 0.7723047882318497, "num_tokens": 18684017.0, "step": 5159 }, { "entropy": 0.5400333479046822, "epoch": 4.813812412505833, "grad_norm": 0.3892170786857605, "learning_rate": 0.0002, "loss": 0.549, "mean_token_accuracy": 0.7780390381813049, "num_tokens": 18687535.0, "step": 5160 }, { "entropy": 0.5802188664674759, "epoch": 4.814745683621092, "grad_norm": 0.3708321750164032, "learning_rate": 0.0002, "loss": 0.5859, "mean_token_accuracy": 0.7598800659179688, "num_tokens": 18691292.0, "step": 5161 }, { "entropy": 0.5903623104095459, "epoch": 4.8156789547363505, "grad_norm": 0.3588537275791168, "learning_rate": 0.0002, "loss": 0.5947, "mean_token_accuracy": 0.7535546272993088, "num_tokens": 18694829.0, "step": 5162 }, { "entropy": 0.5716966390609741, "epoch": 4.8166122258516095, "grad_norm": 0.36425524950027466, "learning_rate": 0.0002, "loss": 0.5725, "mean_token_accuracy": 0.771777406334877, "num_tokens": 18698405.0, "step": 5163 }, { "entropy": 0.5477681010961533, "epoch": 4.8175454969668685, "grad_norm": 0.3941708505153656, "learning_rate": 0.0002, "loss": 0.557, "mean_token_accuracy": 0.7763004004955292, "num_tokens": 18702073.0, "step": 5164 }, { "entropy": 0.5553366988897324, "epoch": 4.8184787680821275, "grad_norm": 0.3913637399673462, "learning_rate": 0.0002, "loss": 0.5568, "mean_token_accuracy": 0.7703752666711807, "num_tokens": 18705582.0, "step": 5165 }, { "entropy": 0.5372421741485596, "epoch": 4.8194120391973865, "grad_norm": 0.38751763105392456, "learning_rate": 0.0002, "loss": 0.5417, "mean_token_accuracy": 0.7778544872999191, "num_tokens": 18709243.0, "step": 5166 }, { "entropy": 0.5510534644126892, "epoch": 4.8203453103126455, "grad_norm": 0.3870584964752197, "learning_rate": 0.0002, "loss": 0.5542, "mean_token_accuracy": 0.7784046232700348, "num_tokens": 18712845.0, "step": 5167 }, { "entropy": 0.5629229247570038, "epoch": 4.821278581427904, "grad_norm": 0.4038059413433075, "learning_rate": 0.0002, "loss": 0.5638, "mean_token_accuracy": 0.7677488625049591, "num_tokens": 18716466.0, "step": 5168 }, { "entropy": 0.572410449385643, "epoch": 4.822211852543163, "grad_norm": 0.395091712474823, "learning_rate": 0.0002, "loss": 0.5799, "mean_token_accuracy": 0.7698109745979309, "num_tokens": 18720046.0, "step": 5169 }, { "entropy": 0.5692667663097382, "epoch": 4.823145123658422, "grad_norm": 0.3698265552520752, "learning_rate": 0.0002, "loss": 0.5731, "mean_token_accuracy": 0.773095428943634, "num_tokens": 18723691.0, "step": 5170 }, { "entropy": 0.5703960210084915, "epoch": 4.824078394773681, "grad_norm": 0.39923253655433655, "learning_rate": 0.0002, "loss": 0.574, "mean_token_accuracy": 0.7716745287179947, "num_tokens": 18727187.0, "step": 5171 }, { "entropy": 0.5326645374298096, "epoch": 4.82501166588894, "grad_norm": 0.3794931471347809, "learning_rate": 0.0002, "loss": 0.541, "mean_token_accuracy": 0.7799729555845261, "num_tokens": 18730792.0, "step": 5172 }, { "entropy": 0.5629732459783554, "epoch": 4.825944937004199, "grad_norm": 0.39995747804641724, "learning_rate": 0.0002, "loss": 0.5705, "mean_token_accuracy": 0.7654712647199631, "num_tokens": 18734507.0, "step": 5173 }, { "entropy": 0.5084508284926414, "epoch": 4.826878208119458, "grad_norm": 0.41508132219314575, "learning_rate": 0.0002, "loss": 0.5211, "mean_token_accuracy": 0.7910325229167938, "num_tokens": 18738057.0, "step": 5174 }, { "entropy": 0.5448549091815948, "epoch": 4.827811479234717, "grad_norm": 0.3146677017211914, "learning_rate": 0.0002, "loss": 0.5469, "mean_token_accuracy": 0.7808440625667572, "num_tokens": 18741632.0, "step": 5175 }, { "entropy": 0.5416872948408127, "epoch": 4.828744750349976, "grad_norm": 0.4073014259338379, "learning_rate": 0.0002, "loss": 0.5542, "mean_token_accuracy": 0.7738437801599503, "num_tokens": 18745113.0, "step": 5176 }, { "entropy": 0.5587602853775024, "epoch": 4.829678021465235, "grad_norm": 0.3722480237483978, "learning_rate": 0.0002, "loss": 0.5627, "mean_token_accuracy": 0.7695860415697098, "num_tokens": 18748702.0, "step": 5177 }, { "entropy": 0.5696078240871429, "epoch": 4.830611292580494, "grad_norm": 0.3667600452899933, "learning_rate": 0.0002, "loss": 0.5724, "mean_token_accuracy": 0.7686291933059692, "num_tokens": 18752331.0, "step": 5178 }, { "entropy": 0.6014447957277298, "epoch": 4.831544563695753, "grad_norm": 0.3294428586959839, "learning_rate": 0.0002, "loss": 0.6014, "mean_token_accuracy": 0.753211036324501, "num_tokens": 18756215.0, "step": 5179 }, { "entropy": 0.5903729647397995, "epoch": 4.832477834811012, "grad_norm": 0.37939929962158203, "learning_rate": 0.0002, "loss": 0.5773, "mean_token_accuracy": 0.7701052278280258, "num_tokens": 18759896.0, "step": 5180 }, { "entropy": 0.5584921315312386, "epoch": 4.833411105926271, "grad_norm": 0.3984195590019226, "learning_rate": 0.0002, "loss": 0.5647, "mean_token_accuracy": 0.771717369556427, "num_tokens": 18763520.0, "step": 5181 }, { "entropy": 0.5950011014938354, "epoch": 4.83434437704153, "grad_norm": 0.33643725514411926, "learning_rate": 0.0002, "loss": 0.5881, "mean_token_accuracy": 0.7629101127386093, "num_tokens": 18767245.0, "step": 5182 }, { "entropy": 0.5546718165278435, "epoch": 4.835277648156789, "grad_norm": 0.4158468544483185, "learning_rate": 0.0002, "loss": 0.5672, "mean_token_accuracy": 0.7731850743293762, "num_tokens": 18770814.0, "step": 5183 }, { "entropy": 0.5692325532436371, "epoch": 4.836210919272048, "grad_norm": 0.40066877007484436, "learning_rate": 0.0002, "loss": 0.5732, "mean_token_accuracy": 0.7756513059139252, "num_tokens": 18774447.0, "step": 5184 }, { "entropy": 0.5630367547273636, "epoch": 4.837144190387307, "grad_norm": 0.3890416920185089, "learning_rate": 0.0002, "loss": 0.5726, "mean_token_accuracy": 0.7679532617330551, "num_tokens": 18777975.0, "step": 5185 }, { "entropy": 0.5909515917301178, "epoch": 4.838077461502566, "grad_norm": 0.42932796478271484, "learning_rate": 0.0002, "loss": 0.5948, "mean_token_accuracy": 0.76106196641922, "num_tokens": 18781656.0, "step": 5186 }, { "entropy": 0.5740502327680588, "epoch": 4.839010732617825, "grad_norm": 0.359287291765213, "learning_rate": 0.0002, "loss": 0.5719, "mean_token_accuracy": 0.7680432200431824, "num_tokens": 18785291.0, "step": 5187 }, { "entropy": 0.5549063310027122, "epoch": 4.839944003733084, "grad_norm": 0.33033496141433716, "learning_rate": 0.0002, "loss": 0.5612, "mean_token_accuracy": 0.7736696600914001, "num_tokens": 18789058.0, "step": 5188 }, { "entropy": 0.5699485093355179, "epoch": 4.840877274848343, "grad_norm": 0.378276526927948, "learning_rate": 0.0002, "loss": 0.5739, "mean_token_accuracy": 0.7662181258201599, "num_tokens": 18792718.0, "step": 5189 }, { "entropy": 0.5466859117150307, "epoch": 4.841810545963602, "grad_norm": 0.36671409010887146, "learning_rate": 0.0002, "loss": 0.5496, "mean_token_accuracy": 0.7800489813089371, "num_tokens": 18796325.0, "step": 5190 }, { "entropy": 0.5618728697299957, "epoch": 4.842743817078861, "grad_norm": 0.47634434700012207, "learning_rate": 0.0002, "loss": 0.5709, "mean_token_accuracy": 0.7668459266424179, "num_tokens": 18799915.0, "step": 5191 }, { "entropy": 0.5810050517320633, "epoch": 4.84367708819412, "grad_norm": 0.3551657795906067, "learning_rate": 0.0002, "loss": 0.5766, "mean_token_accuracy": 0.7699255347251892, "num_tokens": 18803508.0, "step": 5192 }, { "entropy": 0.5555650144815445, "epoch": 4.844610359309379, "grad_norm": 0.3210444748401642, "learning_rate": 0.0002, "loss": 0.556, "mean_token_accuracy": 0.7715108841657639, "num_tokens": 18807067.0, "step": 5193 }, { "entropy": 0.5571902990341187, "epoch": 4.845543630424638, "grad_norm": 0.35371944308280945, "learning_rate": 0.0002, "loss": 0.5558, "mean_token_accuracy": 0.7726841419935226, "num_tokens": 18810625.0, "step": 5194 }, { "entropy": 0.5463511198759079, "epoch": 4.846476901539897, "grad_norm": 0.40656596422195435, "learning_rate": 0.0002, "loss": 0.5531, "mean_token_accuracy": 0.776980996131897, "num_tokens": 18814193.0, "step": 5195 }, { "entropy": 0.5073603764176369, "epoch": 4.847410172655156, "grad_norm": 0.4895794093608856, "learning_rate": 0.0002, "loss": 0.5234, "mean_token_accuracy": 0.7861256152391434, "num_tokens": 18817649.0, "step": 5196 }, { "entropy": 0.5551960170269012, "epoch": 4.848343443770415, "grad_norm": 0.3397321403026581, "learning_rate": 0.0002, "loss": 0.561, "mean_token_accuracy": 0.776521697640419, "num_tokens": 18821322.0, "step": 5197 }, { "entropy": 0.5844755619764328, "epoch": 4.849276714885674, "grad_norm": 0.4477311670780182, "learning_rate": 0.0002, "loss": 0.5978, "mean_token_accuracy": 0.7590460777282715, "num_tokens": 18824940.0, "step": 5198 }, { "entropy": 0.5511307716369629, "epoch": 4.850209986000933, "grad_norm": 0.4218621850013733, "learning_rate": 0.0002, "loss": 0.5666, "mean_token_accuracy": 0.7765572816133499, "num_tokens": 18828455.0, "step": 5199 }, { "entropy": 0.5320224910974503, "epoch": 4.851143257116192, "grad_norm": 0.44897276163101196, "learning_rate": 0.0002, "loss": 0.5502, "mean_token_accuracy": 0.7766143828630447, "num_tokens": 18831950.0, "step": 5200 }, { "entropy": 0.5613471865653992, "epoch": 4.852076528231451, "grad_norm": 0.4564073383808136, "learning_rate": 0.0002, "loss": 0.5789, "mean_token_accuracy": 0.7611192017793655, "num_tokens": 18835691.0, "step": 5201 }, { "entropy": 0.5635475963354111, "epoch": 4.85300979934671, "grad_norm": 0.48967933654785156, "learning_rate": 0.0002, "loss": 0.5794, "mean_token_accuracy": 0.7646276354789734, "num_tokens": 18839336.0, "step": 5202 }, { "entropy": 0.5389666184782982, "epoch": 4.853943070461969, "grad_norm": 0.3586329519748688, "learning_rate": 0.0002, "loss": 0.549, "mean_token_accuracy": 0.7858403921127319, "num_tokens": 18842929.0, "step": 5203 }, { "entropy": 0.5590162426233292, "epoch": 4.854876341577228, "grad_norm": 0.3437548279762268, "learning_rate": 0.0002, "loss": 0.5565, "mean_token_accuracy": 0.7797302156686783, "num_tokens": 18846495.0, "step": 5204 }, { "entropy": 0.573760136961937, "epoch": 4.855809612692487, "grad_norm": 0.3886411488056183, "learning_rate": 0.0002, "loss": 0.5753, "mean_token_accuracy": 0.7673644572496414, "num_tokens": 18850144.0, "step": 5205 }, { "entropy": 0.5720073506236076, "epoch": 4.856742883807746, "grad_norm": 0.32299548387527466, "learning_rate": 0.0002, "loss": 0.5703, "mean_token_accuracy": 0.770476907491684, "num_tokens": 18853741.0, "step": 5206 }, { "entropy": 0.6143889427185059, "epoch": 4.857676154923005, "grad_norm": 0.37015628814697266, "learning_rate": 0.0002, "loss": 0.6048, "mean_token_accuracy": 0.7616124004125595, "num_tokens": 18857362.0, "step": 5207 }, { "entropy": 0.5767806023359299, "epoch": 4.858609426038264, "grad_norm": 0.32597801089286804, "learning_rate": 0.0002, "loss": 0.5603, "mean_token_accuracy": 0.7789549976587296, "num_tokens": 18861097.0, "step": 5208 }, { "entropy": 0.5617486089468002, "epoch": 4.859542697153523, "grad_norm": 0.3498903810977936, "learning_rate": 0.0002, "loss": 0.558, "mean_token_accuracy": 0.7733747214078903, "num_tokens": 18864730.0, "step": 5209 }, { "entropy": 0.5642584413290024, "epoch": 4.860475968268782, "grad_norm": 0.32657480239868164, "learning_rate": 0.0002, "loss": 0.5672, "mean_token_accuracy": 0.7723542153835297, "num_tokens": 18868453.0, "step": 5210 }, { "entropy": 0.5644195228815079, "epoch": 4.861409239384041, "grad_norm": 0.39943811297416687, "learning_rate": 0.0002, "loss": 0.5697, "mean_token_accuracy": 0.7718174159526825, "num_tokens": 18872085.0, "step": 5211 }, { "entropy": 0.552848219871521, "epoch": 4.8623425104993, "grad_norm": 0.43149131536483765, "learning_rate": 0.0002, "loss": 0.5544, "mean_token_accuracy": 0.7763472497463226, "num_tokens": 18875635.0, "step": 5212 }, { "entropy": 0.5377706438302994, "epoch": 4.863275781614559, "grad_norm": 0.511028528213501, "learning_rate": 0.0002, "loss": 0.5447, "mean_token_accuracy": 0.7819960117340088, "num_tokens": 18879412.0, "step": 5213 }, { "entropy": 0.5314461588859558, "epoch": 4.864209052729818, "grad_norm": 0.459177702665329, "learning_rate": 0.0002, "loss": 0.5508, "mean_token_accuracy": 0.7789402306079865, "num_tokens": 18882961.0, "step": 5214 }, { "entropy": 0.571778416633606, "epoch": 4.865142323845077, "grad_norm": 0.33033475279808044, "learning_rate": 0.0002, "loss": 0.5694, "mean_token_accuracy": 0.7718916535377502, "num_tokens": 18886603.0, "step": 5215 }, { "entropy": 0.5627578347921371, "epoch": 4.866075594960336, "grad_norm": 0.36910831928253174, "learning_rate": 0.0002, "loss": 0.5614, "mean_token_accuracy": 0.7742450535297394, "num_tokens": 18890193.0, "step": 5216 }, { "entropy": 0.5977398008108139, "epoch": 4.867008866075595, "grad_norm": 0.38123658299446106, "learning_rate": 0.0002, "loss": 0.595, "mean_token_accuracy": 0.762000098824501, "num_tokens": 18893976.0, "step": 5217 }, { "entropy": 0.5514411702752113, "epoch": 4.867942137190854, "grad_norm": 0.35643428564071655, "learning_rate": 0.0002, "loss": 0.5414, "mean_token_accuracy": 0.7774688303470612, "num_tokens": 18897563.0, "step": 5218 }, { "entropy": 0.5669237077236176, "epoch": 4.8688754083061125, "grad_norm": 0.33614540100097656, "learning_rate": 0.0002, "loss": 0.5656, "mean_token_accuracy": 0.7708261609077454, "num_tokens": 18901262.0, "step": 5219 }, { "entropy": 0.5838929861783981, "epoch": 4.8698086794213715, "grad_norm": 0.3922409415245056, "learning_rate": 0.0002, "loss": 0.5906, "mean_token_accuracy": 0.763220950961113, "num_tokens": 18904836.0, "step": 5220 }, { "entropy": 0.5553544610738754, "epoch": 4.8707419505366305, "grad_norm": 0.38742777705192566, "learning_rate": 0.0002, "loss": 0.5643, "mean_token_accuracy": 0.7732299119234085, "num_tokens": 18908427.0, "step": 5221 }, { "entropy": 0.5315223336219788, "epoch": 4.8716752216518895, "grad_norm": 0.36237284541130066, "learning_rate": 0.0002, "loss": 0.5291, "mean_token_accuracy": 0.7840207517147064, "num_tokens": 18911894.0, "step": 5222 }, { "entropy": 0.5729064047336578, "epoch": 4.8726084927671485, "grad_norm": 0.35872775316238403, "learning_rate": 0.0002, "loss": 0.5725, "mean_token_accuracy": 0.7759911715984344, "num_tokens": 18915582.0, "step": 5223 }, { "entropy": 0.5853463560342789, "epoch": 4.8735417638824075, "grad_norm": 0.3520136773586273, "learning_rate": 0.0002, "loss": 0.5902, "mean_token_accuracy": 0.7644035667181015, "num_tokens": 18919203.0, "step": 5224 }, { "entropy": 0.5624598413705826, "epoch": 4.874475034997666, "grad_norm": 0.3328130841255188, "learning_rate": 0.0002, "loss": 0.5656, "mean_token_accuracy": 0.7790891230106354, "num_tokens": 18922809.0, "step": 5225 }, { "entropy": 0.5520014986395836, "epoch": 4.875408306112925, "grad_norm": 0.4361693263053894, "learning_rate": 0.0002, "loss": 0.5714, "mean_token_accuracy": 0.7708020955324173, "num_tokens": 18926327.0, "step": 5226 }, { "entropy": 0.5608643740415573, "epoch": 4.876341577228185, "grad_norm": 0.3782578408718109, "learning_rate": 0.0002, "loss": 0.5728, "mean_token_accuracy": 0.7717126756906509, "num_tokens": 18929946.0, "step": 5227 }, { "entropy": 0.5583661049604416, "epoch": 4.877274848343443, "grad_norm": 0.3898084759712219, "learning_rate": 0.0002, "loss": 0.5722, "mean_token_accuracy": 0.7680395245552063, "num_tokens": 18933627.0, "step": 5228 }, { "entropy": 0.546528697013855, "epoch": 4.878208119458703, "grad_norm": 0.4232458174228668, "learning_rate": 0.0002, "loss": 0.5548, "mean_token_accuracy": 0.7769127488136292, "num_tokens": 18937180.0, "step": 5229 }, { "entropy": 0.555287778377533, "epoch": 4.879141390573961, "grad_norm": 0.38083475828170776, "learning_rate": 0.0002, "loss": 0.5593, "mean_token_accuracy": 0.7724321186542511, "num_tokens": 18940848.0, "step": 5230 }, { "entropy": 0.5558875203132629, "epoch": 4.880074661689221, "grad_norm": 0.3565739393234253, "learning_rate": 0.0002, "loss": 0.5428, "mean_token_accuracy": 0.7788939774036407, "num_tokens": 18944446.0, "step": 5231 }, { "entropy": 0.5914708375930786, "epoch": 4.881007932804479, "grad_norm": 0.41315758228302, "learning_rate": 0.0002, "loss": 0.6012, "mean_token_accuracy": 0.7569282650947571, "num_tokens": 18948079.0, "step": 5232 }, { "entropy": 0.5481231436133385, "epoch": 4.881941203919739, "grad_norm": 0.3736111521720886, "learning_rate": 0.0002, "loss": 0.535, "mean_token_accuracy": 0.7789389938116074, "num_tokens": 18951607.0, "step": 5233 }, { "entropy": 0.551792174577713, "epoch": 4.882874475034997, "grad_norm": 0.3456709384918213, "learning_rate": 0.0002, "loss": 0.5514, "mean_token_accuracy": 0.7819351851940155, "num_tokens": 18955270.0, "step": 5234 }, { "entropy": 0.554887980222702, "epoch": 4.883807746150257, "grad_norm": 0.3842284679412842, "learning_rate": 0.0002, "loss": 0.5621, "mean_token_accuracy": 0.7747032046318054, "num_tokens": 18958852.0, "step": 5235 }, { "entropy": 0.5828744620084763, "epoch": 4.884741017265515, "grad_norm": 0.40122827887535095, "learning_rate": 0.0002, "loss": 0.5853, "mean_token_accuracy": 0.771819069981575, "num_tokens": 18962478.0, "step": 5236 }, { "entropy": 0.5642376244068146, "epoch": 4.885674288380775, "grad_norm": 0.4304533302783966, "learning_rate": 0.0002, "loss": 0.5707, "mean_token_accuracy": 0.7716109752655029, "num_tokens": 18966135.0, "step": 5237 }, { "entropy": 0.5809419751167297, "epoch": 4.886607559496033, "grad_norm": 0.35364052653312683, "learning_rate": 0.0002, "loss": 0.5857, "mean_token_accuracy": 0.7615045011043549, "num_tokens": 18969752.0, "step": 5238 }, { "entropy": 0.5647705942392349, "epoch": 4.887540830611293, "grad_norm": 0.38376420736312866, "learning_rate": 0.0002, "loss": 0.5588, "mean_token_accuracy": 0.7721617817878723, "num_tokens": 18973311.0, "step": 5239 }, { "entropy": 0.5940476357936859, "epoch": 4.888474101726551, "grad_norm": 0.34235697984695435, "learning_rate": 0.0002, "loss": 0.5846, "mean_token_accuracy": 0.7679928839206696, "num_tokens": 18977155.0, "step": 5240 }, { "entropy": 0.5448775291442871, "epoch": 4.889407372841811, "grad_norm": 0.4394550323486328, "learning_rate": 0.0002, "loss": 0.5585, "mean_token_accuracy": 0.7806281447410583, "num_tokens": 18980608.0, "step": 5241 }, { "entropy": 0.5810261070728302, "epoch": 4.890340643957069, "grad_norm": 0.3854098320007324, "learning_rate": 0.0002, "loss": 0.5826, "mean_token_accuracy": 0.7582627087831497, "num_tokens": 18984202.0, "step": 5242 }, { "entropy": 0.556139275431633, "epoch": 4.891273915072329, "grad_norm": 0.45204848051071167, "learning_rate": 0.0002, "loss": 0.572, "mean_token_accuracy": 0.7722677290439606, "num_tokens": 18987704.0, "step": 5243 }, { "entropy": 0.6036161482334137, "epoch": 4.892207186187587, "grad_norm": 0.4029693305492401, "learning_rate": 0.0002, "loss": 0.6103, "mean_token_accuracy": 0.7570687085390091, "num_tokens": 18991392.0, "step": 5244 }, { "entropy": 0.556283712387085, "epoch": 4.893140457302847, "grad_norm": 0.3676453232765198, "learning_rate": 0.0002, "loss": 0.555, "mean_token_accuracy": 0.7743522524833679, "num_tokens": 18995099.0, "step": 5245 }, { "entropy": 0.5903688967227936, "epoch": 4.894073728418105, "grad_norm": 0.4141484200954437, "learning_rate": 0.0002, "loss": 0.6128, "mean_token_accuracy": 0.7514609545469284, "num_tokens": 18998750.0, "step": 5246 }, { "entropy": 0.5836945176124573, "epoch": 4.895006999533365, "grad_norm": 0.3963736295700073, "learning_rate": 0.0002, "loss": 0.5883, "mean_token_accuracy": 0.7612303048372269, "num_tokens": 19002412.0, "step": 5247 }, { "entropy": 0.6009140908718109, "epoch": 4.895940270648623, "grad_norm": 0.40713974833488464, "learning_rate": 0.0002, "loss": 0.6077, "mean_token_accuracy": 0.7565010339021683, "num_tokens": 19006095.0, "step": 5248 }, { "entropy": 0.5744146555662155, "epoch": 4.896873541763883, "grad_norm": 0.3867068886756897, "learning_rate": 0.0002, "loss": 0.5747, "mean_token_accuracy": 0.7650448381900787, "num_tokens": 19009633.0, "step": 5249 }, { "entropy": 0.5770806074142456, "epoch": 4.897806812879141, "grad_norm": 0.45541056990623474, "learning_rate": 0.0002, "loss": 0.5753, "mean_token_accuracy": 0.7686446905136108, "num_tokens": 19013152.0, "step": 5250 }, { "entropy": 0.5610028952360153, "epoch": 4.898740083994401, "grad_norm": 0.3833506107330322, "learning_rate": 0.0002, "loss": 0.5646, "mean_token_accuracy": 0.7712444216012955, "num_tokens": 19016896.0, "step": 5251 }, { "entropy": 0.583490788936615, "epoch": 4.899673355109659, "grad_norm": 0.3437897562980652, "learning_rate": 0.0002, "loss": 0.5856, "mean_token_accuracy": 0.764999270439148, "num_tokens": 19020597.0, "step": 5252 }, { "entropy": 0.5835117101669312, "epoch": 4.900606626224919, "grad_norm": 0.37707024812698364, "learning_rate": 0.0002, "loss": 0.5865, "mean_token_accuracy": 0.7589012682437897, "num_tokens": 19024171.0, "step": 5253 }, { "entropy": 0.5141132771968842, "epoch": 4.901539897340177, "grad_norm": 0.3602462708950043, "learning_rate": 0.0002, "loss": 0.5155, "mean_token_accuracy": 0.7910581231117249, "num_tokens": 19027705.0, "step": 5254 }, { "entropy": 0.5187685936689377, "epoch": 4.902473168455437, "grad_norm": 0.35128310322761536, "learning_rate": 0.0002, "loss": 0.5222, "mean_token_accuracy": 0.7837758362293243, "num_tokens": 19031346.0, "step": 5255 }, { "entropy": 0.5436407700181007, "epoch": 4.903406439570695, "grad_norm": 0.3796846568584442, "learning_rate": 0.0002, "loss": 0.548, "mean_token_accuracy": 0.7762163430452347, "num_tokens": 19035023.0, "step": 5256 }, { "entropy": 0.563270628452301, "epoch": 4.904339710685955, "grad_norm": 0.5145208239555359, "learning_rate": 0.0002, "loss": 0.5782, "mean_token_accuracy": 0.7757045477628708, "num_tokens": 19038730.0, "step": 5257 }, { "entropy": 0.567597970366478, "epoch": 4.905272981801213, "grad_norm": 0.4186343550682068, "learning_rate": 0.0002, "loss": 0.5776, "mean_token_accuracy": 0.7686308175325394, "num_tokens": 19042334.0, "step": 5258 }, { "entropy": 0.5574558228254318, "epoch": 4.906206252916473, "grad_norm": 0.3866247832775116, "learning_rate": 0.0002, "loss": 0.5632, "mean_token_accuracy": 0.7760410308837891, "num_tokens": 19045926.0, "step": 5259 }, { "entropy": 0.5910737961530685, "epoch": 4.907139524031731, "grad_norm": 0.41662657260894775, "learning_rate": 0.0002, "loss": 0.5922, "mean_token_accuracy": 0.7663400322198868, "num_tokens": 19049535.0, "step": 5260 }, { "entropy": 0.5774719715118408, "epoch": 4.908072795146991, "grad_norm": 0.4067746698856354, "learning_rate": 0.0002, "loss": 0.5764, "mean_token_accuracy": 0.7690877914428711, "num_tokens": 19053195.0, "step": 5261 }, { "entropy": 0.5626442134380341, "epoch": 4.909006066262249, "grad_norm": 0.3417803943157196, "learning_rate": 0.0002, "loss": 0.5516, "mean_token_accuracy": 0.7800594866275787, "num_tokens": 19056984.0, "step": 5262 }, { "entropy": 0.5853002071380615, "epoch": 4.909939337377509, "grad_norm": 0.3503539562225342, "learning_rate": 0.0002, "loss": 0.5899, "mean_token_accuracy": 0.7629952281713486, "num_tokens": 19060618.0, "step": 5263 }, { "entropy": 0.5728003978729248, "epoch": 4.910872608492767, "grad_norm": 0.35674116015434265, "learning_rate": 0.0002, "loss": 0.5779, "mean_token_accuracy": 0.7664952874183655, "num_tokens": 19064272.0, "step": 5264 }, { "entropy": 0.5010719671845436, "epoch": 4.911805879608027, "grad_norm": 0.37737002968788147, "learning_rate": 0.0002, "loss": 0.5067, "mean_token_accuracy": 0.7950724810361862, "num_tokens": 19067834.0, "step": 5265 }, { "entropy": 0.5791975408792496, "epoch": 4.912739150723285, "grad_norm": 0.40008336305618286, "learning_rate": 0.0002, "loss": 0.585, "mean_token_accuracy": 0.7631917148828506, "num_tokens": 19071439.0, "step": 5266 }, { "entropy": 0.5449786484241486, "epoch": 4.913672421838545, "grad_norm": 0.4540976285934448, "learning_rate": 0.0002, "loss": 0.5619, "mean_token_accuracy": 0.76899054646492, "num_tokens": 19074905.0, "step": 5267 }, { "entropy": 0.5512082725763321, "epoch": 4.914605692953803, "grad_norm": 0.4045686721801758, "learning_rate": 0.0002, "loss": 0.5549, "mean_token_accuracy": 0.7741516083478928, "num_tokens": 19078441.0, "step": 5268 }, { "entropy": 0.5737831443548203, "epoch": 4.9155389640690625, "grad_norm": 0.42616531252861023, "learning_rate": 0.0002, "loss": 0.5943, "mean_token_accuracy": 0.7595313936471939, "num_tokens": 19082083.0, "step": 5269 }, { "entropy": 0.5594895929098129, "epoch": 4.916472235184321, "grad_norm": 0.45894762873649597, "learning_rate": 0.0002, "loss": 0.573, "mean_token_accuracy": 0.766769215464592, "num_tokens": 19085614.0, "step": 5270 }, { "entropy": 0.5763648301362991, "epoch": 4.9174055062995805, "grad_norm": 0.40066012740135193, "learning_rate": 0.0002, "loss": 0.5839, "mean_token_accuracy": 0.7612225264310837, "num_tokens": 19089110.0, "step": 5271 }, { "entropy": 0.550218865275383, "epoch": 4.918338777414839, "grad_norm": 0.3206861913204193, "learning_rate": 0.0002, "loss": 0.5424, "mean_token_accuracy": 0.7841930836439133, "num_tokens": 19092642.0, "step": 5272 }, { "entropy": 0.545047253370285, "epoch": 4.9192720485300985, "grad_norm": 0.38106095790863037, "learning_rate": 0.0002, "loss": 0.5342, "mean_token_accuracy": 0.7835682779550552, "num_tokens": 19096217.0, "step": 5273 }, { "entropy": 0.6260076612234116, "epoch": 4.920205319645357, "grad_norm": 0.34911441802978516, "learning_rate": 0.0002, "loss": 0.6198, "mean_token_accuracy": 0.7467133849859238, "num_tokens": 19099868.0, "step": 5274 }, { "entropy": 0.5723360180854797, "epoch": 4.921138590760616, "grad_norm": 0.3271065950393677, "learning_rate": 0.0002, "loss": 0.5593, "mean_token_accuracy": 0.7734796553850174, "num_tokens": 19103466.0, "step": 5275 }, { "entropy": 0.5717315375804901, "epoch": 4.9220718618758745, "grad_norm": 0.35511675477027893, "learning_rate": 0.0002, "loss": 0.5686, "mean_token_accuracy": 0.768511563539505, "num_tokens": 19107049.0, "step": 5276 }, { "entropy": 0.578776478767395, "epoch": 4.923005132991134, "grad_norm": 0.33857232332229614, "learning_rate": 0.0002, "loss": 0.5724, "mean_token_accuracy": 0.7724848985671997, "num_tokens": 19110730.0, "step": 5277 }, { "entropy": 0.5353398323059082, "epoch": 4.9239384041063925, "grad_norm": 0.4011839032173157, "learning_rate": 0.0002, "loss": 0.5385, "mean_token_accuracy": 0.7839401364326477, "num_tokens": 19114370.0, "step": 5278 }, { "entropy": 0.5613954812288284, "epoch": 4.924871675221652, "grad_norm": 0.3918183147907257, "learning_rate": 0.0002, "loss": 0.5691, "mean_token_accuracy": 0.7671540975570679, "num_tokens": 19118058.0, "step": 5279 }, { "entropy": 0.5391390025615692, "epoch": 4.9258049463369105, "grad_norm": 0.35848361253738403, "learning_rate": 0.0002, "loss": 0.5523, "mean_token_accuracy": 0.7739707380533218, "num_tokens": 19121670.0, "step": 5280 }, { "entropy": 0.563785120844841, "epoch": 4.92673821745217, "grad_norm": 0.3995574414730072, "learning_rate": 0.0002, "loss": 0.5743, "mean_token_accuracy": 0.7684236615896225, "num_tokens": 19125316.0, "step": 5281 }, { "entropy": 0.5684122294187546, "epoch": 4.927671488567428, "grad_norm": 0.40253186225891113, "learning_rate": 0.0002, "loss": 0.5811, "mean_token_accuracy": 0.7646132856607437, "num_tokens": 19128990.0, "step": 5282 }, { "entropy": 0.5576053857803345, "epoch": 4.928604759682688, "grad_norm": 0.4014016389846802, "learning_rate": 0.0002, "loss": 0.5643, "mean_token_accuracy": 0.7713753134012222, "num_tokens": 19132645.0, "step": 5283 }, { "entropy": 0.5518018379807472, "epoch": 4.929538030797946, "grad_norm": 0.3547157049179077, "learning_rate": 0.0002, "loss": 0.5579, "mean_token_accuracy": 0.774137407541275, "num_tokens": 19136281.0, "step": 5284 }, { "entropy": 0.5530434995889664, "epoch": 4.930471301913206, "grad_norm": 0.4227665960788727, "learning_rate": 0.0002, "loss": 0.5687, "mean_token_accuracy": 0.7704144716262817, "num_tokens": 19139656.0, "step": 5285 }, { "entropy": 0.5867572277784348, "epoch": 4.931404573028464, "grad_norm": 0.43696245551109314, "learning_rate": 0.0002, "loss": 0.5857, "mean_token_accuracy": 0.7735651284456253, "num_tokens": 19143393.0, "step": 5286 }, { "entropy": 0.5488783642649651, "epoch": 4.932337844143724, "grad_norm": 0.3863449692726135, "learning_rate": 0.0002, "loss": 0.5637, "mean_token_accuracy": 0.7707670032978058, "num_tokens": 19147021.0, "step": 5287 }, { "entropy": 0.5833796560764313, "epoch": 4.933271115258982, "grad_norm": 0.34445908665657043, "learning_rate": 0.0002, "loss": 0.5789, "mean_token_accuracy": 0.771406352519989, "num_tokens": 19150552.0, "step": 5288 }, { "entropy": 0.5439732596278191, "epoch": 4.934204386374242, "grad_norm": 0.3752858638763428, "learning_rate": 0.0002, "loss": 0.5378, "mean_token_accuracy": 0.7858706712722778, "num_tokens": 19154213.0, "step": 5289 }, { "entropy": 0.5740639865398407, "epoch": 4.9351376574895, "grad_norm": 0.3812858760356903, "learning_rate": 0.0002, "loss": 0.5753, "mean_token_accuracy": 0.7662524729967117, "num_tokens": 19157958.0, "step": 5290 }, { "entropy": 0.5912611037492752, "epoch": 4.93607092860476, "grad_norm": 0.37467917799949646, "learning_rate": 0.0002, "loss": 0.5936, "mean_token_accuracy": 0.7632912397384644, "num_tokens": 19161657.0, "step": 5291 }, { "entropy": 0.5431942418217659, "epoch": 4.937004199720018, "grad_norm": 0.3699650764465332, "learning_rate": 0.0002, "loss": 0.5496, "mean_token_accuracy": 0.7772209197282791, "num_tokens": 19165240.0, "step": 5292 }, { "entropy": 0.5139699205756187, "epoch": 4.937937470835278, "grad_norm": 0.40933138132095337, "learning_rate": 0.0002, "loss": 0.5311, "mean_token_accuracy": 0.7799742668867111, "num_tokens": 19168823.0, "step": 5293 }, { "entropy": 0.560746818780899, "epoch": 4.938870741950536, "grad_norm": 0.4153490662574768, "learning_rate": 0.0002, "loss": 0.5737, "mean_token_accuracy": 0.7687033265829086, "num_tokens": 19172361.0, "step": 5294 }, { "entropy": 0.556433841586113, "epoch": 4.939804013065796, "grad_norm": 0.3606758713722229, "learning_rate": 0.0002, "loss": 0.5577, "mean_token_accuracy": 0.7714663743972778, "num_tokens": 19175988.0, "step": 5295 }, { "entropy": 0.593312069773674, "epoch": 4.940737284181054, "grad_norm": 0.3589232265949249, "learning_rate": 0.0002, "loss": 0.5839, "mean_token_accuracy": 0.7630837857723236, "num_tokens": 19179752.0, "step": 5296 }, { "entropy": 0.5751971900463104, "epoch": 4.941670555296314, "grad_norm": 0.3263804614543915, "learning_rate": 0.0002, "loss": 0.571, "mean_token_accuracy": 0.7673409581184387, "num_tokens": 19183450.0, "step": 5297 }, { "entropy": 0.5493935495615005, "epoch": 4.942603826411572, "grad_norm": 0.3576168417930603, "learning_rate": 0.0002, "loss": 0.547, "mean_token_accuracy": 0.7843827903270721, "num_tokens": 19187282.0, "step": 5298 }, { "entropy": 0.5771018415689468, "epoch": 4.943537097526832, "grad_norm": 0.4370299279689789, "learning_rate": 0.0002, "loss": 0.5736, "mean_token_accuracy": 0.7682170867919922, "num_tokens": 19190875.0, "step": 5299 }, { "entropy": 0.5892027318477631, "epoch": 4.94447036864209, "grad_norm": 0.41890451312065125, "learning_rate": 0.0002, "loss": 0.5952, "mean_token_accuracy": 0.7585530132055283, "num_tokens": 19194491.0, "step": 5300 }, { "entropy": 0.5732314437627792, "epoch": 4.94540363975735, "grad_norm": 0.36648818850517273, "learning_rate": 0.0002, "loss": 0.5737, "mean_token_accuracy": 0.7645338475704193, "num_tokens": 19198019.0, "step": 5301 }, { "entropy": 0.5994285047054291, "epoch": 4.946336910872608, "grad_norm": 0.41923514008522034, "learning_rate": 0.0002, "loss": 0.6106, "mean_token_accuracy": 0.7544065713882446, "num_tokens": 19201595.0, "step": 5302 }, { "entropy": 0.5941856056451797, "epoch": 4.947270181987868, "grad_norm": 0.3747379779815674, "learning_rate": 0.0002, "loss": 0.5897, "mean_token_accuracy": 0.7598832696676254, "num_tokens": 19205320.0, "step": 5303 }, { "entropy": 0.5887867212295532, "epoch": 4.948203453103126, "grad_norm": 0.38028159737586975, "learning_rate": 0.0002, "loss": 0.5915, "mean_token_accuracy": 0.7631696611642838, "num_tokens": 19208989.0, "step": 5304 }, { "entropy": 0.5761336535215378, "epoch": 4.949136724218386, "grad_norm": 0.37356293201446533, "learning_rate": 0.0002, "loss": 0.5735, "mean_token_accuracy": 0.7691191881895065, "num_tokens": 19212568.0, "step": 5305 }, { "entropy": 0.5914958119392395, "epoch": 4.950069995333644, "grad_norm": 0.3737683594226837, "learning_rate": 0.0002, "loss": 0.5941, "mean_token_accuracy": 0.7610636800527573, "num_tokens": 19216127.0, "step": 5306 }, { "entropy": 0.5491894632577896, "epoch": 4.951003266448904, "grad_norm": 0.39143848419189453, "learning_rate": 0.0002, "loss": 0.5423, "mean_token_accuracy": 0.7800063341856003, "num_tokens": 19219747.0, "step": 5307 }, { "entropy": 0.5167500600218773, "epoch": 4.951936537564162, "grad_norm": 0.3636963367462158, "learning_rate": 0.0002, "loss": 0.5286, "mean_token_accuracy": 0.7904459834098816, "num_tokens": 19223273.0, "step": 5308 }, { "entropy": 0.6075840890407562, "epoch": 4.952869808679422, "grad_norm": 0.45651039481163025, "learning_rate": 0.0002, "loss": 0.61, "mean_token_accuracy": 0.7571623027324677, "num_tokens": 19227020.0, "step": 5309 }, { "entropy": 0.5543019771575928, "epoch": 4.95380307979468, "grad_norm": 0.4349325895309448, "learning_rate": 0.0002, "loss": 0.5654, "mean_token_accuracy": 0.7642548978328705, "num_tokens": 19230646.0, "step": 5310 }, { "entropy": 0.5690474957227707, "epoch": 4.95473635090994, "grad_norm": 0.38035038113594055, "learning_rate": 0.0002, "loss": 0.5784, "mean_token_accuracy": 0.7651447951793671, "num_tokens": 19234248.0, "step": 5311 }, { "entropy": 0.5261747688055038, "epoch": 4.955669622025198, "grad_norm": 0.39761513471603394, "learning_rate": 0.0002, "loss": 0.5353, "mean_token_accuracy": 0.7880674004554749, "num_tokens": 19237901.0, "step": 5312 }, { "entropy": 0.5481677204370499, "epoch": 4.956602893140458, "grad_norm": 0.3728567361831665, "learning_rate": 0.0002, "loss": 0.5649, "mean_token_accuracy": 0.7689776420593262, "num_tokens": 19241616.0, "step": 5313 }, { "entropy": 0.5616655349731445, "epoch": 4.957536164255716, "grad_norm": 0.34953856468200684, "learning_rate": 0.0002, "loss": 0.5625, "mean_token_accuracy": 0.7706323266029358, "num_tokens": 19245198.0, "step": 5314 }, { "entropy": 0.5885468423366547, "epoch": 4.958469435370976, "grad_norm": 0.4548299312591553, "learning_rate": 0.0002, "loss": 0.5937, "mean_token_accuracy": 0.762794241309166, "num_tokens": 19248851.0, "step": 5315 }, { "entropy": 0.5694086402654648, "epoch": 4.959402706486234, "grad_norm": 0.3699735105037689, "learning_rate": 0.0002, "loss": 0.5767, "mean_token_accuracy": 0.7677624523639679, "num_tokens": 19252541.0, "step": 5316 }, { "entropy": 0.5493597835302353, "epoch": 4.960335977601494, "grad_norm": 0.3484140932559967, "learning_rate": 0.0002, "loss": 0.5609, "mean_token_accuracy": 0.7673466056585312, "num_tokens": 19256163.0, "step": 5317 }, { "entropy": 0.5781940370798111, "epoch": 4.961269248716752, "grad_norm": 0.386889785528183, "learning_rate": 0.0002, "loss": 0.5741, "mean_token_accuracy": 0.7701414823532104, "num_tokens": 19259754.0, "step": 5318 }, { "entropy": 0.614410400390625, "epoch": 4.962202519832012, "grad_norm": 0.3823978304862976, "learning_rate": 0.0002, "loss": 0.6075, "mean_token_accuracy": 0.75282783806324, "num_tokens": 19263475.0, "step": 5319 }, { "entropy": 0.5764468908309937, "epoch": 4.96313579094727, "grad_norm": 0.4110611379146576, "learning_rate": 0.0002, "loss": 0.5799, "mean_token_accuracy": 0.7678909152746201, "num_tokens": 19267062.0, "step": 5320 }, { "entropy": 0.5642324686050415, "epoch": 4.96406906206253, "grad_norm": 0.3070579767227173, "learning_rate": 0.0002, "loss": 0.5564, "mean_token_accuracy": 0.7820192575454712, "num_tokens": 19270795.0, "step": 5321 }, { "entropy": 0.5677494406700134, "epoch": 4.965002333177788, "grad_norm": 0.35494524240493774, "learning_rate": 0.0002, "loss": 0.5601, "mean_token_accuracy": 0.7769469767808914, "num_tokens": 19274424.0, "step": 5322 }, { "entropy": 0.5383458957076073, "epoch": 4.965935604293048, "grad_norm": 0.3581012189388275, "learning_rate": 0.0002, "loss": 0.5343, "mean_token_accuracy": 0.7923269122838974, "num_tokens": 19277948.0, "step": 5323 }, { "entropy": 0.5672779828310013, "epoch": 4.966868875408306, "grad_norm": 0.42435523867607117, "learning_rate": 0.0002, "loss": 0.5695, "mean_token_accuracy": 0.7653083652257919, "num_tokens": 19281589.0, "step": 5324 }, { "entropy": 0.5538556724786758, "epoch": 4.9678021465235656, "grad_norm": 0.3972119688987732, "learning_rate": 0.0002, "loss": 0.5625, "mean_token_accuracy": 0.78278748691082, "num_tokens": 19285153.0, "step": 5325 }, { "entropy": 0.5482962280511856, "epoch": 4.968735417638824, "grad_norm": 0.38663744926452637, "learning_rate": 0.0002, "loss": 0.5541, "mean_token_accuracy": 0.7688904255628586, "num_tokens": 19288801.0, "step": 5326 }, { "entropy": 0.5993737727403641, "epoch": 4.9696686887540835, "grad_norm": 0.3877536952495575, "learning_rate": 0.0002, "loss": 0.5974, "mean_token_accuracy": 0.7632242739200592, "num_tokens": 19292445.0, "step": 5327 }, { "entropy": 0.5994735807180405, "epoch": 4.970601959869342, "grad_norm": 0.3876253366470337, "learning_rate": 0.0002, "loss": 0.6055, "mean_token_accuracy": 0.76120924949646, "num_tokens": 19296112.0, "step": 5328 }, { "entropy": 0.5572182834148407, "epoch": 4.9715352309846015, "grad_norm": 0.3969508707523346, "learning_rate": 0.0002, "loss": 0.5705, "mean_token_accuracy": 0.7736941426992416, "num_tokens": 19299770.0, "step": 5329 }, { "entropy": 0.5469140261411667, "epoch": 4.97246850209986, "grad_norm": 0.3777613043785095, "learning_rate": 0.0002, "loss": 0.5507, "mean_token_accuracy": 0.7739361524581909, "num_tokens": 19303424.0, "step": 5330 }, { "entropy": 0.5672814920544624, "epoch": 4.9734017732151194, "grad_norm": 0.4510459005832672, "learning_rate": 0.0002, "loss": 0.5758, "mean_token_accuracy": 0.7641360908746719, "num_tokens": 19307111.0, "step": 5331 }, { "entropy": 0.5169000774621964, "epoch": 4.9743350443303775, "grad_norm": 0.3527889847755432, "learning_rate": 0.0002, "loss": 0.5202, "mean_token_accuracy": 0.786843478679657, "num_tokens": 19310577.0, "step": 5332 }, { "entropy": 0.59471096098423, "epoch": 4.975268315445637, "grad_norm": 0.3412415683269501, "learning_rate": 0.0002, "loss": 0.5947, "mean_token_accuracy": 0.7631819397211075, "num_tokens": 19314222.0, "step": 5333 }, { "entropy": 0.5685037821531296, "epoch": 4.9762015865608955, "grad_norm": 0.3508768379688263, "learning_rate": 0.0002, "loss": 0.5573, "mean_token_accuracy": 0.7712485641241074, "num_tokens": 19317803.0, "step": 5334 }, { "entropy": 0.5539515018463135, "epoch": 4.977134857676155, "grad_norm": 0.35104089975357056, "learning_rate": 0.0002, "loss": 0.5502, "mean_token_accuracy": 0.7764158397912979, "num_tokens": 19321355.0, "step": 5335 }, { "entropy": 0.5843575894832611, "epoch": 4.9780681287914135, "grad_norm": 0.4481710195541382, "learning_rate": 0.0002, "loss": 0.5959, "mean_token_accuracy": 0.7571786195039749, "num_tokens": 19324870.0, "step": 5336 }, { "entropy": 0.5712054297327995, "epoch": 4.979001399906673, "grad_norm": 0.3659404218196869, "learning_rate": 0.0002, "loss": 0.5807, "mean_token_accuracy": 0.7635954916477203, "num_tokens": 19328495.0, "step": 5337 }, { "entropy": 0.5794796049594879, "epoch": 4.979934671021931, "grad_norm": 0.3726680278778076, "learning_rate": 0.0002, "loss": 0.5918, "mean_token_accuracy": 0.7599027156829834, "num_tokens": 19332174.0, "step": 5338 }, { "entropy": 0.5784941911697388, "epoch": 4.980867942137191, "grad_norm": 0.36544662714004517, "learning_rate": 0.0002, "loss": 0.5871, "mean_token_accuracy": 0.7612032890319824, "num_tokens": 19335775.0, "step": 5339 }, { "entropy": 0.581547811627388, "epoch": 4.981801213252449, "grad_norm": 0.347599059343338, "learning_rate": 0.0002, "loss": 0.5767, "mean_token_accuracy": 0.7733725905418396, "num_tokens": 19339469.0, "step": 5340 }, { "entropy": 0.5834157466888428, "epoch": 4.982734484367709, "grad_norm": 0.4149073660373688, "learning_rate": 0.0002, "loss": 0.5854, "mean_token_accuracy": 0.7692352086305618, "num_tokens": 19343107.0, "step": 5341 }, { "entropy": 0.5809771120548248, "epoch": 4.983667755482967, "grad_norm": 0.3800896108150482, "learning_rate": 0.0002, "loss": 0.5739, "mean_token_accuracy": 0.7649085074663162, "num_tokens": 19346740.0, "step": 5342 }, { "entropy": 0.5713161379098892, "epoch": 4.984601026598227, "grad_norm": 0.3893333375453949, "learning_rate": 0.0002, "loss": 0.5822, "mean_token_accuracy": 0.7588733732700348, "num_tokens": 19350335.0, "step": 5343 }, { "entropy": 0.5763756334781647, "epoch": 4.985534297713485, "grad_norm": 0.4326505661010742, "learning_rate": 0.0002, "loss": 0.5782, "mean_token_accuracy": 0.7685773372650146, "num_tokens": 19353951.0, "step": 5344 }, { "entropy": 0.568813756108284, "epoch": 4.986467568828745, "grad_norm": 0.39661461114883423, "learning_rate": 0.0002, "loss": 0.5596, "mean_token_accuracy": 0.7753033339977264, "num_tokens": 19357530.0, "step": 5345 }, { "entropy": 0.5297786220908165, "epoch": 4.987400839944003, "grad_norm": 0.39699694514274597, "learning_rate": 0.0002, "loss": 0.5389, "mean_token_accuracy": 0.785002127289772, "num_tokens": 19361009.0, "step": 5346 }, { "entropy": 0.575795441865921, "epoch": 4.988334111059263, "grad_norm": 0.3758590817451477, "learning_rate": 0.0002, "loss": 0.5718, "mean_token_accuracy": 0.767906591296196, "num_tokens": 19364684.0, "step": 5347 }, { "entropy": 0.5853984504938126, "epoch": 4.989267382174521, "grad_norm": 0.4085390567779541, "learning_rate": 0.0002, "loss": 0.5898, "mean_token_accuracy": 0.7634739279747009, "num_tokens": 19368272.0, "step": 5348 }, { "entropy": 0.5563909113407135, "epoch": 4.990200653289781, "grad_norm": 0.35535627603530884, "learning_rate": 0.0002, "loss": 0.5619, "mean_token_accuracy": 0.7737501412630081, "num_tokens": 19372004.0, "step": 5349 }, { "entropy": 0.5961571931838989, "epoch": 4.991133924405039, "grad_norm": 0.3945954442024231, "learning_rate": 0.0002, "loss": 0.5957, "mean_token_accuracy": 0.7542757838964462, "num_tokens": 19375674.0, "step": 5350 }, { "entropy": 0.5192335247993469, "epoch": 4.992067195520299, "grad_norm": 0.409149169921875, "learning_rate": 0.0002, "loss": 0.522, "mean_token_accuracy": 0.7916966825723648, "num_tokens": 19379251.0, "step": 5351 }, { "entropy": 0.5652161985635757, "epoch": 4.993000466635557, "grad_norm": 0.3975712060928345, "learning_rate": 0.0002, "loss": 0.5684, "mean_token_accuracy": 0.7699392586946487, "num_tokens": 19382860.0, "step": 5352 }, { "entropy": 0.5693072974681854, "epoch": 4.993933737750817, "grad_norm": 0.4919719696044922, "learning_rate": 0.0002, "loss": 0.5745, "mean_token_accuracy": 0.7688096165657043, "num_tokens": 19386515.0, "step": 5353 }, { "entropy": 0.5436797067523003, "epoch": 4.994867008866075, "grad_norm": 0.39878109097480774, "learning_rate": 0.0002, "loss": 0.5484, "mean_token_accuracy": 0.772763267159462, "num_tokens": 19390213.0, "step": 5354 }, { "entropy": 0.5483643561601639, "epoch": 4.995800279981335, "grad_norm": 0.43142226338386536, "learning_rate": 0.0002, "loss": 0.5589, "mean_token_accuracy": 0.7708827704191208, "num_tokens": 19393842.0, "step": 5355 }, { "entropy": 0.5456165224313736, "epoch": 4.996733551096593, "grad_norm": 0.365904301404953, "learning_rate": 0.0002, "loss": 0.5456, "mean_token_accuracy": 0.7830430269241333, "num_tokens": 19397316.0, "step": 5356 }, { "entropy": 0.5677433758974075, "epoch": 4.997666822211853, "grad_norm": 0.372837096452713, "learning_rate": 0.0002, "loss": 0.5656, "mean_token_accuracy": 0.7704169452190399, "num_tokens": 19400887.0, "step": 5357 }, { "entropy": 0.5478221550583839, "epoch": 4.998600093327111, "grad_norm": 0.4025062620639801, "learning_rate": 0.0002, "loss": 0.5539, "mean_token_accuracy": 0.7768195420503616, "num_tokens": 19404470.0, "step": 5358 }, { "entropy": 0.5950629264116287, "epoch": 4.999533364442371, "grad_norm": 0.3943311870098114, "learning_rate": 0.0002, "loss": 0.5953, "mean_token_accuracy": 0.7596324533224106, "num_tokens": 19408067.0, "step": 5359 }, { "entropy": 0.5964286625385284, "epoch": 5.0, "grad_norm": 0.5401034355163574, "learning_rate": 0.0002, "loss": 0.5595, "mean_token_accuracy": 0.7551791071891785, "num_tokens": 19409113.0, "step": 5360 } ], "logging_steps": 1, "max_steps": 5360, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.6326315119183135e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }