{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.5430711610486894, "eval_steps": 500, "global_step": 946, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.124472439289093, "epoch": 0.003745318352059925, "grad_norm": 0.4064895212650299, "learning_rate": 0.0002, "loss": 2.4620742797851562, "mean_token_accuracy": 0.5437362492084503, "num_tokens": 16219.0, "step": 1 }, { "entropy": 1.2432018220424652, "epoch": 0.00749063670411985, "grad_norm": 0.37879112362861633, "learning_rate": 0.0002, "loss": 2.1651668548583984, "mean_token_accuracy": 0.5638100206851959, "num_tokens": 32451.0, "step": 2 }, { "entropy": 1.4062562882900238, "epoch": 0.011235955056179775, "grad_norm": 0.28845661878585815, "learning_rate": 0.0002, "loss": 1.7072796821594238, "mean_token_accuracy": 0.5924695134162903, "num_tokens": 48696.0, "step": 3 }, { "entropy": 1.3798817992210388, "epoch": 0.0149812734082397, "grad_norm": 0.2335132509469986, "learning_rate": 0.0002, "loss": 1.4192372560501099, "mean_token_accuracy": 0.6366562396287918, "num_tokens": 65149.0, "step": 4 }, { "entropy": 1.3547163307666779, "epoch": 0.018726591760299626, "grad_norm": 0.27531901001930237, "learning_rate": 0.0002, "loss": 1.2890108823776245, "mean_token_accuracy": 0.639111116528511, "num_tokens": 81615.0, "step": 5 }, { "entropy": 1.2633765935897827, "epoch": 0.02247191011235955, "grad_norm": 0.15004344284534454, "learning_rate": 0.0002, "loss": 1.1727904081344604, "mean_token_accuracy": 0.6589455008506775, "num_tokens": 98238.0, "step": 6 }, { "entropy": 1.1859196424484253, "epoch": 0.026217228464419477, "grad_norm": 0.10320489853620529, "learning_rate": 0.0002, "loss": 1.0918691158294678, "mean_token_accuracy": 0.6676707565784454, "num_tokens": 114444.0, "step": 7 }, { "entropy": 1.1146739721298218, "epoch": 0.0299625468164794, "grad_norm": 0.1199173703789711, "learning_rate": 0.0002, "loss": 1.0362448692321777, "mean_token_accuracy": 0.6752683073282242, "num_tokens": 130761.0, "step": 8 }, { "entropy": 1.0335184335708618, "epoch": 0.033707865168539325, "grad_norm": 0.12563136219978333, "learning_rate": 0.0002, "loss": 0.9962326288223267, "mean_token_accuracy": 0.6880597323179245, "num_tokens": 147021.0, "step": 9 }, { "entropy": 0.9865177571773529, "epoch": 0.03745318352059925, "grad_norm": 0.1199953481554985, "learning_rate": 0.0002, "loss": 0.9303470849990845, "mean_token_accuracy": 0.6944610327482224, "num_tokens": 163123.0, "step": 10 }, { "entropy": 0.9654616415500641, "epoch": 0.04119850187265917, "grad_norm": 0.11374429613351822, "learning_rate": 0.0002, "loss": 0.8831573724746704, "mean_token_accuracy": 0.7051983922719955, "num_tokens": 179185.0, "step": 11 }, { "entropy": 0.9084527641534805, "epoch": 0.0449438202247191, "grad_norm": 0.11104491353034973, "learning_rate": 0.0002, "loss": 0.8112745881080627, "mean_token_accuracy": 0.717003270983696, "num_tokens": 195302.0, "step": 12 }, { "entropy": 0.8792405873537064, "epoch": 0.04868913857677903, "grad_norm": 0.29082274436950684, "learning_rate": 0.0002, "loss": 0.798420250415802, "mean_token_accuracy": 0.7170884907245636, "num_tokens": 211890.0, "step": 13 }, { "entropy": 0.8252373337745667, "epoch": 0.052434456928838954, "grad_norm": 0.10816927999258041, "learning_rate": 0.0002, "loss": 0.7828125357627869, "mean_token_accuracy": 0.7214709371328354, "num_tokens": 228238.0, "step": 14 }, { "entropy": 0.7244665324687958, "epoch": 0.056179775280898875, "grad_norm": 0.11618702858686447, "learning_rate": 0.0002, "loss": 0.7206279635429382, "mean_token_accuracy": 0.7338205277919769, "num_tokens": 244371.0, "step": 15 }, { "entropy": 0.6871565729379654, "epoch": 0.0599250936329588, "grad_norm": 0.1065768375992775, "learning_rate": 0.0002, "loss": 0.7100083827972412, "mean_token_accuracy": 0.7358262836933136, "num_tokens": 260726.0, "step": 16 }, { "entropy": 0.6935696750879288, "epoch": 0.06367041198501873, "grad_norm": 0.08450760692358017, "learning_rate": 0.0002, "loss": 0.6838802695274353, "mean_token_accuracy": 0.7416488826274872, "num_tokens": 277122.0, "step": 17 }, { "entropy": 0.6860368996858597, "epoch": 0.06741573033707865, "grad_norm": 0.08516346663236618, "learning_rate": 0.0002, "loss": 0.6765270829200745, "mean_token_accuracy": 0.7396037727594376, "num_tokens": 293596.0, "step": 18 }, { "entropy": 0.6689740270376205, "epoch": 0.07116104868913857, "grad_norm": 0.08950749784708023, "learning_rate": 0.0002, "loss": 0.6559870839118958, "mean_token_accuracy": 0.7492983937263489, "num_tokens": 309758.0, "step": 19 }, { "entropy": 0.6853971034288406, "epoch": 0.0749063670411985, "grad_norm": 0.08301156759262085, "learning_rate": 0.0002, "loss": 0.6591368913650513, "mean_token_accuracy": 0.7445396035909653, "num_tokens": 326199.0, "step": 20 }, { "entropy": 0.6475548148155212, "epoch": 0.07865168539325842, "grad_norm": 0.07257863134145737, "learning_rate": 0.0002, "loss": 0.6322771906852722, "mean_token_accuracy": 0.7570293545722961, "num_tokens": 342706.0, "step": 21 }, { "entropy": 0.62291419506073, "epoch": 0.08239700374531835, "grad_norm": 0.07468358427286148, "learning_rate": 0.0002, "loss": 0.6161096096038818, "mean_token_accuracy": 0.7579571604728699, "num_tokens": 358967.0, "step": 22 }, { "entropy": 0.6039848625659943, "epoch": 0.08614232209737828, "grad_norm": 0.06657886505126953, "learning_rate": 0.0002, "loss": 0.5981277823448181, "mean_token_accuracy": 0.7673389315605164, "num_tokens": 375372.0, "step": 23 }, { "entropy": 0.6231608390808105, "epoch": 0.0898876404494382, "grad_norm": 0.06528797745704651, "learning_rate": 0.0002, "loss": 0.6185131072998047, "mean_token_accuracy": 0.7547510862350464, "num_tokens": 391535.0, "step": 24 }, { "entropy": 0.6286156177520752, "epoch": 0.09363295880149813, "grad_norm": 0.06431519240140915, "learning_rate": 0.0002, "loss": 0.6217876672744751, "mean_token_accuracy": 0.7541641592979431, "num_tokens": 407808.0, "step": 25 }, { "entropy": 0.6126427948474884, "epoch": 0.09737827715355805, "grad_norm": 0.06216903775930405, "learning_rate": 0.0002, "loss": 0.6070841550827026, "mean_token_accuracy": 0.759774461388588, "num_tokens": 424098.0, "step": 26 }, { "entropy": 0.6149384081363678, "epoch": 0.10112359550561797, "grad_norm": 0.06437912583351135, "learning_rate": 0.0002, "loss": 0.6078751087188721, "mean_token_accuracy": 0.7595006227493286, "num_tokens": 440539.0, "step": 27 }, { "entropy": 0.6091344654560089, "epoch": 0.10486891385767791, "grad_norm": 0.06495340913534164, "learning_rate": 0.0002, "loss": 0.6011782884597778, "mean_token_accuracy": 0.7595006972551346, "num_tokens": 456799.0, "step": 28 }, { "entropy": 0.608646497130394, "epoch": 0.10861423220973783, "grad_norm": 0.059445418417453766, "learning_rate": 0.0002, "loss": 0.6044275164604187, "mean_token_accuracy": 0.7600021511316299, "num_tokens": 473089.0, "step": 29 }, { "entropy": 0.6043040752410889, "epoch": 0.11235955056179775, "grad_norm": 0.06593701243400574, "learning_rate": 0.0002, "loss": 0.6045087575912476, "mean_token_accuracy": 0.7567310333251953, "num_tokens": 489490.0, "step": 30 }, { "entropy": 0.5747391283512115, "epoch": 0.11610486891385768, "grad_norm": 0.06415696442127228, "learning_rate": 0.0002, "loss": 0.5873428583145142, "mean_token_accuracy": 0.7674129754304886, "num_tokens": 505809.0, "step": 31 }, { "entropy": 0.5926542580127716, "epoch": 0.1198501872659176, "grad_norm": 0.051249004900455475, "learning_rate": 0.0002, "loss": 0.598324179649353, "mean_token_accuracy": 0.759703740477562, "num_tokens": 522016.0, "step": 32 }, { "entropy": 0.5886886864900589, "epoch": 0.12359550561797752, "grad_norm": 0.05292005091905594, "learning_rate": 0.0002, "loss": 0.5881145596504211, "mean_token_accuracy": 0.7697232961654663, "num_tokens": 538100.0, "step": 33 }, { "entropy": 0.5867745727300644, "epoch": 0.12734082397003746, "grad_norm": 0.04721912741661072, "learning_rate": 0.0002, "loss": 0.5836299061775208, "mean_token_accuracy": 0.768671840429306, "num_tokens": 554234.0, "step": 34 }, { "entropy": 0.5881127417087555, "epoch": 0.13108614232209737, "grad_norm": 0.05805843323469162, "learning_rate": 0.0002, "loss": 0.5897107124328613, "mean_token_accuracy": 0.7657543420791626, "num_tokens": 570565.0, "step": 35 }, { "entropy": 0.5939383208751678, "epoch": 0.1348314606741573, "grad_norm": 0.0569508820772171, "learning_rate": 0.0002, "loss": 0.5897835493087769, "mean_token_accuracy": 0.7598359882831573, "num_tokens": 586816.0, "step": 36 }, { "entropy": 0.5979506522417068, "epoch": 0.13857677902621723, "grad_norm": 0.05739126354455948, "learning_rate": 0.0002, "loss": 0.5949404835700989, "mean_token_accuracy": 0.7612607926130295, "num_tokens": 603019.0, "step": 37 }, { "entropy": 0.5742268264293671, "epoch": 0.14232209737827714, "grad_norm": 0.047265954315662384, "learning_rate": 0.0002, "loss": 0.5759380459785461, "mean_token_accuracy": 0.7693933397531509, "num_tokens": 619295.0, "step": 38 }, { "entropy": 0.5710775703191757, "epoch": 0.14606741573033707, "grad_norm": 0.05281650274991989, "learning_rate": 0.0002, "loss": 0.5691424608230591, "mean_token_accuracy": 0.7704602777957916, "num_tokens": 635365.0, "step": 39 }, { "entropy": 0.582334503531456, "epoch": 0.149812734082397, "grad_norm": 0.055993299931287766, "learning_rate": 0.0002, "loss": 0.5809962749481201, "mean_token_accuracy": 0.7662668973207474, "num_tokens": 651665.0, "step": 40 }, { "entropy": 0.5551325976848602, "epoch": 0.15355805243445692, "grad_norm": 0.04340814799070358, "learning_rate": 0.0002, "loss": 0.557377815246582, "mean_token_accuracy": 0.7778407037258148, "num_tokens": 667809.0, "step": 41 }, { "entropy": 0.5822649896144867, "epoch": 0.15730337078651685, "grad_norm": 0.04575135186314583, "learning_rate": 0.0002, "loss": 0.5827720165252686, "mean_token_accuracy": 0.7657051831483841, "num_tokens": 683923.0, "step": 42 }, { "entropy": 0.55968376994133, "epoch": 0.16104868913857678, "grad_norm": 0.04552368074655533, "learning_rate": 0.0002, "loss": 0.5598254799842834, "mean_token_accuracy": 0.7764519304037094, "num_tokens": 700197.0, "step": 43 }, { "entropy": 0.5671757161617279, "epoch": 0.1647940074906367, "grad_norm": 0.04587964341044426, "learning_rate": 0.0002, "loss": 0.5750178694725037, "mean_token_accuracy": 0.7700542360544205, "num_tokens": 716432.0, "step": 44 }, { "entropy": 0.5685836523771286, "epoch": 0.16853932584269662, "grad_norm": 0.03833606839179993, "learning_rate": 0.0002, "loss": 0.5728627443313599, "mean_token_accuracy": 0.7676915228366852, "num_tokens": 732768.0, "step": 45 }, { "entropy": 0.5726271122694016, "epoch": 0.17228464419475656, "grad_norm": 0.04773888736963272, "learning_rate": 0.0002, "loss": 0.5737521052360535, "mean_token_accuracy": 0.7691973745822906, "num_tokens": 748991.0, "step": 46 }, { "entropy": 0.5940001755952835, "epoch": 0.1760299625468165, "grad_norm": 0.035074397921562195, "learning_rate": 0.0002, "loss": 0.58332759141922, "mean_token_accuracy": 0.7648619115352631, "num_tokens": 765572.0, "step": 47 }, { "entropy": 0.5897164344787598, "epoch": 0.1797752808988764, "grad_norm": 0.037994541227817535, "learning_rate": 0.0002, "loss": 0.5864952802658081, "mean_token_accuracy": 0.7641548812389374, "num_tokens": 782005.0, "step": 48 }, { "entropy": 0.5744329988956451, "epoch": 0.18352059925093633, "grad_norm": 0.040346939116716385, "learning_rate": 0.0002, "loss": 0.5669541954994202, "mean_token_accuracy": 0.770287498831749, "num_tokens": 798604.0, "step": 49 }, { "entropy": 0.5779913067817688, "epoch": 0.18726591760299627, "grad_norm": 0.036969687789678574, "learning_rate": 0.0002, "loss": 0.5797433257102966, "mean_token_accuracy": 0.7645184099674225, "num_tokens": 814871.0, "step": 50 }, { "entropy": 0.5663889348506927, "epoch": 0.19101123595505617, "grad_norm": 0.03604266792535782, "learning_rate": 0.0002, "loss": 0.5714061260223389, "mean_token_accuracy": 0.7704311609268188, "num_tokens": 831246.0, "step": 51 }, { "entropy": 0.561771884560585, "epoch": 0.1947565543071161, "grad_norm": 0.04034798592329025, "learning_rate": 0.0002, "loss": 0.5732511878013611, "mean_token_accuracy": 0.7705236822366714, "num_tokens": 847825.0, "step": 52 }, { "entropy": 0.5677134096622467, "epoch": 0.19850187265917604, "grad_norm": 0.03827312961220741, "learning_rate": 0.0002, "loss": 0.5743907690048218, "mean_token_accuracy": 0.7655002921819687, "num_tokens": 864255.0, "step": 53 }, { "entropy": 0.563701331615448, "epoch": 0.20224719101123595, "grad_norm": 0.04143316298723221, "learning_rate": 0.0002, "loss": 0.5607832074165344, "mean_token_accuracy": 0.772660031914711, "num_tokens": 880665.0, "step": 54 }, { "entropy": 0.5692192316055298, "epoch": 0.20599250936329588, "grad_norm": 0.03400753438472748, "learning_rate": 0.0002, "loss": 0.5670974254608154, "mean_token_accuracy": 0.769247904419899, "num_tokens": 896987.0, "step": 55 }, { "entropy": 0.5776625126600266, "epoch": 0.20973782771535582, "grad_norm": 0.035431839525699615, "learning_rate": 0.0002, "loss": 0.5733675360679626, "mean_token_accuracy": 0.7692834436893463, "num_tokens": 913582.0, "step": 56 }, { "entropy": 0.5626319646835327, "epoch": 0.21348314606741572, "grad_norm": 0.03843431547284126, "learning_rate": 0.0002, "loss": 0.5641550421714783, "mean_token_accuracy": 0.7710368186235428, "num_tokens": 929972.0, "step": 57 }, { "entropy": 0.5526942014694214, "epoch": 0.21722846441947566, "grad_norm": 0.03771563246846199, "learning_rate": 0.0002, "loss": 0.5567817687988281, "mean_token_accuracy": 0.7731232047080994, "num_tokens": 945888.0, "step": 58 }, { "entropy": 0.5716714560985565, "epoch": 0.2209737827715356, "grad_norm": 0.036766648292541504, "learning_rate": 0.0002, "loss": 0.5660452246665955, "mean_token_accuracy": 0.7728052884340286, "num_tokens": 962278.0, "step": 59 }, { "entropy": 0.568805992603302, "epoch": 0.2247191011235955, "grad_norm": 0.035415392369031906, "learning_rate": 0.0002, "loss": 0.5717817544937134, "mean_token_accuracy": 0.7711138129234314, "num_tokens": 978682.0, "step": 60 }, { "entropy": 0.5708261281251907, "epoch": 0.22846441947565543, "grad_norm": 0.03432939946651459, "learning_rate": 0.0002, "loss": 0.5735772252082825, "mean_token_accuracy": 0.7677555531263351, "num_tokens": 994945.0, "step": 61 }, { "entropy": 0.5660677701234818, "epoch": 0.23220973782771537, "grad_norm": 0.041112665086984634, "learning_rate": 0.0002, "loss": 0.5750763416290283, "mean_token_accuracy": 0.7678538411855698, "num_tokens": 1011319.0, "step": 62 }, { "entropy": 0.5581584423780441, "epoch": 0.23595505617977527, "grad_norm": 0.03535327687859535, "learning_rate": 0.0002, "loss": 0.5653359889984131, "mean_token_accuracy": 0.7709096819162369, "num_tokens": 1027780.0, "step": 63 }, { "entropy": 0.5639653205871582, "epoch": 0.2397003745318352, "grad_norm": 0.03404325619339943, "learning_rate": 0.0002, "loss": 0.5576256513595581, "mean_token_accuracy": 0.7768308818340302, "num_tokens": 1044141.0, "step": 64 }, { "entropy": 0.5733215659856796, "epoch": 0.24344569288389514, "grad_norm": 0.041786711663007736, "learning_rate": 0.0002, "loss": 0.5677163600921631, "mean_token_accuracy": 0.768655464053154, "num_tokens": 1060152.0, "step": 65 }, { "entropy": 0.5721775144338608, "epoch": 0.24719101123595505, "grad_norm": 0.037091247737407684, "learning_rate": 0.0002, "loss": 0.5689237713813782, "mean_token_accuracy": 0.769687607884407, "num_tokens": 1076350.0, "step": 66 }, { "entropy": 0.5711842328310013, "epoch": 0.250936329588015, "grad_norm": 0.03522708639502525, "learning_rate": 0.0002, "loss": 0.567720890045166, "mean_token_accuracy": 0.7711529284715652, "num_tokens": 1092839.0, "step": 67 }, { "entropy": 0.5565171837806702, "epoch": 0.2546816479400749, "grad_norm": 0.038917530328035355, "learning_rate": 0.0002, "loss": 0.5597351789474487, "mean_token_accuracy": 0.7759623378515244, "num_tokens": 1109005.0, "step": 68 }, { "entropy": 0.5430796295404434, "epoch": 0.25842696629213485, "grad_norm": 0.034353867173194885, "learning_rate": 0.0002, "loss": 0.5536048412322998, "mean_token_accuracy": 0.7768301516771317, "num_tokens": 1125051.0, "step": 69 }, { "entropy": 0.5550204813480377, "epoch": 0.26217228464419473, "grad_norm": 0.03845667093992233, "learning_rate": 0.0002, "loss": 0.5609036087989807, "mean_token_accuracy": 0.7741425037384033, "num_tokens": 1141333.0, "step": 70 }, { "entropy": 0.5524102747440338, "epoch": 0.26591760299625467, "grad_norm": 0.0383320152759552, "learning_rate": 0.0002, "loss": 0.5493491291999817, "mean_token_accuracy": 0.7784009873867035, "num_tokens": 1157440.0, "step": 71 }, { "entropy": 0.5607451796531677, "epoch": 0.2696629213483146, "grad_norm": 0.0344189889729023, "learning_rate": 0.0002, "loss": 0.5574801564216614, "mean_token_accuracy": 0.7733150720596313, "num_tokens": 1173721.0, "step": 72 }, { "entropy": 0.5708478391170502, "epoch": 0.27340823970037453, "grad_norm": 0.03608883544802666, "learning_rate": 0.0002, "loss": 0.5691329836845398, "mean_token_accuracy": 0.7706348299980164, "num_tokens": 1189995.0, "step": 73 }, { "entropy": 0.5674006342887878, "epoch": 0.27715355805243447, "grad_norm": 0.03380035236477852, "learning_rate": 0.0002, "loss": 0.5687033534049988, "mean_token_accuracy": 0.7686747610569, "num_tokens": 1206546.0, "step": 74 }, { "entropy": 0.5619117617607117, "epoch": 0.2808988764044944, "grad_norm": 0.033374786376953125, "learning_rate": 0.0002, "loss": 0.5617104768753052, "mean_token_accuracy": 0.774394765496254, "num_tokens": 1222857.0, "step": 75 }, { "entropy": 0.553475558757782, "epoch": 0.2846441947565543, "grad_norm": 0.03828837722539902, "learning_rate": 0.0002, "loss": 0.5524560809135437, "mean_token_accuracy": 0.7749378681182861, "num_tokens": 1239289.0, "step": 76 }, { "entropy": 0.5745554566383362, "epoch": 0.2883895131086142, "grad_norm": 0.03621216490864754, "learning_rate": 0.0002, "loss": 0.5808500051498413, "mean_token_accuracy": 0.7678203135728836, "num_tokens": 1255521.0, "step": 77 }, { "entropy": 0.5676577985286713, "epoch": 0.29213483146067415, "grad_norm": 0.03588660806417465, "learning_rate": 0.0002, "loss": 0.5705655813217163, "mean_token_accuracy": 0.7692013084888458, "num_tokens": 1271794.0, "step": 78 }, { "entropy": 0.578361302614212, "epoch": 0.2958801498127341, "grad_norm": 0.03781484439969063, "learning_rate": 0.0002, "loss": 0.5760793089866638, "mean_token_accuracy": 0.7664260119199753, "num_tokens": 1288356.0, "step": 79 }, { "entropy": 0.5593062490224838, "epoch": 0.299625468164794, "grad_norm": 0.03217354416847229, "learning_rate": 0.0002, "loss": 0.5657471418380737, "mean_token_accuracy": 0.7739468365907669, "num_tokens": 1304492.0, "step": 80 }, { "entropy": 0.5666437745094299, "epoch": 0.30337078651685395, "grad_norm": 0.03268091008067131, "learning_rate": 0.0002, "loss": 0.5716702938079834, "mean_token_accuracy": 0.7679993361234665, "num_tokens": 1320914.0, "step": 81 }, { "entropy": 0.5685661137104034, "epoch": 0.30711610486891383, "grad_norm": 0.03592272475361824, "learning_rate": 0.0002, "loss": 0.5758165717124939, "mean_token_accuracy": 0.7661760449409485, "num_tokens": 1337161.0, "step": 82 }, { "entropy": 0.5707727521657944, "epoch": 0.31086142322097376, "grad_norm": 0.032845061272382736, "learning_rate": 0.0002, "loss": 0.5710837841033936, "mean_token_accuracy": 0.7702731043100357, "num_tokens": 1353376.0, "step": 83 }, { "entropy": 0.5628758817911148, "epoch": 0.3146067415730337, "grad_norm": 0.029750632122159004, "learning_rate": 0.0002, "loss": 0.5637022852897644, "mean_token_accuracy": 0.7708846777677536, "num_tokens": 1369870.0, "step": 84 }, { "entropy": 0.5795712918043137, "epoch": 0.31835205992509363, "grad_norm": 0.03464500606060028, "learning_rate": 0.0002, "loss": 0.5780152082443237, "mean_token_accuracy": 0.7670614421367645, "num_tokens": 1386403.0, "step": 85 }, { "entropy": 0.5554608702659607, "epoch": 0.32209737827715357, "grad_norm": 0.03547544404864311, "learning_rate": 0.0002, "loss": 0.5557012557983398, "mean_token_accuracy": 0.7721797376871109, "num_tokens": 1402494.0, "step": 86 }, { "entropy": 0.5579323172569275, "epoch": 0.3258426966292135, "grad_norm": 0.03288840129971504, "learning_rate": 0.0002, "loss": 0.560955286026001, "mean_token_accuracy": 0.7751947343349457, "num_tokens": 1418821.0, "step": 87 }, { "entropy": 0.5543566048145294, "epoch": 0.3295880149812734, "grad_norm": 0.04169093072414398, "learning_rate": 0.0002, "loss": 0.5500882267951965, "mean_token_accuracy": 0.7791634202003479, "num_tokens": 1434993.0, "step": 88 }, { "entropy": 0.5734467208385468, "epoch": 0.3333333333333333, "grad_norm": 0.04577335715293884, "learning_rate": 0.0002, "loss": 0.5629557371139526, "mean_token_accuracy": 0.7727752029895782, "num_tokens": 1451307.0, "step": 89 }, { "entropy": 0.5726543813943863, "epoch": 0.33707865168539325, "grad_norm": 0.0342593714594841, "learning_rate": 0.0002, "loss": 0.5802106261253357, "mean_token_accuracy": 0.7650935351848602, "num_tokens": 1467745.0, "step": 90 }, { "entropy": 0.551667258143425, "epoch": 0.3408239700374532, "grad_norm": 0.03779289126396179, "learning_rate": 0.0002, "loss": 0.562962532043457, "mean_token_accuracy": 0.7722999006509781, "num_tokens": 1483931.0, "step": 91 }, { "entropy": 0.5500118583440781, "epoch": 0.3445692883895131, "grad_norm": 0.04092314839363098, "learning_rate": 0.0002, "loss": 0.5627440810203552, "mean_token_accuracy": 0.7718297243118286, "num_tokens": 1500272.0, "step": 92 }, { "entropy": 0.5528086423873901, "epoch": 0.34831460674157305, "grad_norm": 0.03680623322725296, "learning_rate": 0.0002, "loss": 0.5555366277694702, "mean_token_accuracy": 0.7774850875139236, "num_tokens": 1516853.0, "step": 93 }, { "entropy": 0.5520536154508591, "epoch": 0.352059925093633, "grad_norm": 0.037777166813611984, "learning_rate": 0.0002, "loss": 0.5425198078155518, "mean_token_accuracy": 0.7793015986680984, "num_tokens": 1533333.0, "step": 94 }, { "entropy": 0.5685165077447891, "epoch": 0.35580524344569286, "grad_norm": 0.04140891879796982, "learning_rate": 0.0002, "loss": 0.5641899108886719, "mean_token_accuracy": 0.7713409811258316, "num_tokens": 1549757.0, "step": 95 }, { "entropy": 0.5465481728315353, "epoch": 0.3595505617977528, "grad_norm": 0.035262562334537506, "learning_rate": 0.0002, "loss": 0.5490474104881287, "mean_token_accuracy": 0.7827550321817398, "num_tokens": 1565996.0, "step": 96 }, { "entropy": 0.5831216871738434, "epoch": 0.36329588014981273, "grad_norm": 0.036104101687669754, "learning_rate": 0.0002, "loss": 0.589984118938446, "mean_token_accuracy": 0.7600380033254623, "num_tokens": 1582215.0, "step": 97 }, { "entropy": 0.5677650719881058, "epoch": 0.36704119850187267, "grad_norm": 0.03766894340515137, "learning_rate": 0.0002, "loss": 0.5645126104354858, "mean_token_accuracy": 0.7706596851348877, "num_tokens": 1598452.0, "step": 98 }, { "entropy": 0.5670180022716522, "epoch": 0.3707865168539326, "grad_norm": 0.031464677304029465, "learning_rate": 0.0002, "loss": 0.5694231986999512, "mean_token_accuracy": 0.7699034363031387, "num_tokens": 1614973.0, "step": 99 }, { "entropy": 0.556086465716362, "epoch": 0.37453183520599254, "grad_norm": 0.03442725911736488, "learning_rate": 0.0002, "loss": 0.5548810958862305, "mean_token_accuracy": 0.7733764350414276, "num_tokens": 1631172.0, "step": 100 }, { "entropy": 0.5800606608390808, "epoch": 0.3782771535580524, "grad_norm": 0.03572804853320122, "learning_rate": 0.0002, "loss": 0.5861737728118896, "mean_token_accuracy": 0.7624654024839401, "num_tokens": 1647621.0, "step": 101 }, { "entropy": 0.5482688248157501, "epoch": 0.38202247191011235, "grad_norm": 0.03775500878691673, "learning_rate": 0.0002, "loss": 0.5594941973686218, "mean_token_accuracy": 0.7744353115558624, "num_tokens": 1663895.0, "step": 102 }, { "entropy": 0.563491478562355, "epoch": 0.3857677902621723, "grad_norm": 0.031457267701625824, "learning_rate": 0.0002, "loss": 0.564830482006073, "mean_token_accuracy": 0.7690578252077103, "num_tokens": 1680534.0, "step": 103 }, { "entropy": 0.564789205789566, "epoch": 0.3895131086142322, "grad_norm": 0.035452548414468765, "learning_rate": 0.0002, "loss": 0.560291588306427, "mean_token_accuracy": 0.7735853344202042, "num_tokens": 1696770.0, "step": 104 }, { "entropy": 0.5566727668046951, "epoch": 0.39325842696629215, "grad_norm": 0.03198615834116936, "learning_rate": 0.0002, "loss": 0.5535395741462708, "mean_token_accuracy": 0.7722934931516647, "num_tokens": 1713024.0, "step": 105 }, { "entropy": 0.5578596889972687, "epoch": 0.3970037453183521, "grad_norm": 0.03393879160284996, "learning_rate": 0.0002, "loss": 0.5627562999725342, "mean_token_accuracy": 0.7742809951305389, "num_tokens": 1729333.0, "step": 106 }, { "entropy": 0.5788154900074005, "epoch": 0.40074906367041196, "grad_norm": 0.033935144543647766, "learning_rate": 0.0002, "loss": 0.580773115158081, "mean_token_accuracy": 0.7651670575141907, "num_tokens": 1745611.0, "step": 107 }, { "entropy": 0.5737199634313583, "epoch": 0.4044943820224719, "grad_norm": 0.03252919018268585, "learning_rate": 0.0002, "loss": 0.5751349925994873, "mean_token_accuracy": 0.7671079486608505, "num_tokens": 1762357.0, "step": 108 }, { "entropy": 0.5651296824216843, "epoch": 0.40823970037453183, "grad_norm": 0.028949161991477013, "learning_rate": 0.0002, "loss": 0.5604527592658997, "mean_token_accuracy": 0.7729825675487518, "num_tokens": 1778752.0, "step": 109 }, { "entropy": 0.5504195243120193, "epoch": 0.41198501872659177, "grad_norm": 0.028210768476128578, "learning_rate": 0.0002, "loss": 0.549246072769165, "mean_token_accuracy": 0.7782431095838547, "num_tokens": 1794998.0, "step": 110 }, { "entropy": 0.5765475034713745, "epoch": 0.4157303370786517, "grad_norm": 0.02785623073577881, "learning_rate": 0.0002, "loss": 0.5748263597488403, "mean_token_accuracy": 0.7663502544164658, "num_tokens": 1811522.0, "step": 111 }, { "entropy": 0.5662956237792969, "epoch": 0.41947565543071164, "grad_norm": 0.027803661301732063, "learning_rate": 0.0002, "loss": 0.5678505897521973, "mean_token_accuracy": 0.769574448466301, "num_tokens": 1827911.0, "step": 112 }, { "entropy": 0.554324135184288, "epoch": 0.4232209737827715, "grad_norm": 0.03252230957150459, "learning_rate": 0.0002, "loss": 0.5648460984230042, "mean_token_accuracy": 0.7699959129095078, "num_tokens": 1844234.0, "step": 113 }, { "entropy": 0.5458608418703079, "epoch": 0.42696629213483145, "grad_norm": 0.027507655322551727, "learning_rate": 0.0002, "loss": 0.5496413111686707, "mean_token_accuracy": 0.7775106579065323, "num_tokens": 1860498.0, "step": 114 }, { "entropy": 0.5563929826021194, "epoch": 0.4307116104868914, "grad_norm": 0.03014312870800495, "learning_rate": 0.0002, "loss": 0.5582830905914307, "mean_token_accuracy": 0.7708972990512848, "num_tokens": 1876571.0, "step": 115 }, { "entropy": 0.5650668740272522, "epoch": 0.4344569288389513, "grad_norm": 0.032711341977119446, "learning_rate": 0.0002, "loss": 0.5640538930892944, "mean_token_accuracy": 0.7726383656263351, "num_tokens": 1893031.0, "step": 116 }, { "entropy": 0.5807255804538727, "epoch": 0.43820224719101125, "grad_norm": 0.04059470072388649, "learning_rate": 0.0002, "loss": 0.5742425918579102, "mean_token_accuracy": 0.7666837275028229, "num_tokens": 1909366.0, "step": 117 }, { "entropy": 0.5798581689596176, "epoch": 0.4419475655430712, "grad_norm": 0.03380719944834709, "learning_rate": 0.0002, "loss": 0.5788700580596924, "mean_token_accuracy": 0.7679527401924133, "num_tokens": 1925898.0, "step": 118 }, { "entropy": 0.5766737908124924, "epoch": 0.44569288389513106, "grad_norm": 0.030183367431163788, "learning_rate": 0.0002, "loss": 0.5766640901565552, "mean_token_accuracy": 0.7679651975631714, "num_tokens": 1942401.0, "step": 119 }, { "entropy": 0.5603433847427368, "epoch": 0.449438202247191, "grad_norm": 0.0362340547144413, "learning_rate": 0.0002, "loss": 0.5619690418243408, "mean_token_accuracy": 0.7730819880962372, "num_tokens": 1958720.0, "step": 120 }, { "entropy": 0.5559201538562775, "epoch": 0.45318352059925093, "grad_norm": 0.034683868288993835, "learning_rate": 0.0002, "loss": 0.5595064163208008, "mean_token_accuracy": 0.7748750001192093, "num_tokens": 1975119.0, "step": 121 }, { "entropy": 0.5641336888074875, "epoch": 0.45692883895131087, "grad_norm": 0.034222401678562164, "learning_rate": 0.0002, "loss": 0.5678452849388123, "mean_token_accuracy": 0.7732732445001602, "num_tokens": 1991506.0, "step": 122 }, { "entropy": 0.5829679220914841, "epoch": 0.4606741573033708, "grad_norm": 0.034026652574539185, "learning_rate": 0.0002, "loss": 0.5875802040100098, "mean_token_accuracy": 0.7611493021249771, "num_tokens": 2007947.0, "step": 123 }, { "entropy": 0.5581521540880203, "epoch": 0.46441947565543074, "grad_norm": 0.025140831246972084, "learning_rate": 0.0002, "loss": 0.5602667927742004, "mean_token_accuracy": 0.7735796868801117, "num_tokens": 2024401.0, "step": 124 }, { "entropy": 0.5715497881174088, "epoch": 0.4681647940074906, "grad_norm": 0.029785403981804848, "learning_rate": 0.0002, "loss": 0.5672232508659363, "mean_token_accuracy": 0.7685857713222504, "num_tokens": 2040631.0, "step": 125 }, { "entropy": 0.5607001930475235, "epoch": 0.47191011235955055, "grad_norm": 0.04235680401325226, "learning_rate": 0.0002, "loss": 0.5650739073753357, "mean_token_accuracy": 0.7696276903152466, "num_tokens": 2056536.0, "step": 126 }, { "entropy": 0.5663832724094391, "epoch": 0.4756554307116105, "grad_norm": 0.03530610725283623, "learning_rate": 0.0002, "loss": 0.5653817653656006, "mean_token_accuracy": 0.771982342004776, "num_tokens": 2072694.0, "step": 127 }, { "entropy": 0.5544104427099228, "epoch": 0.4794007490636704, "grad_norm": 0.02733522094786167, "learning_rate": 0.0002, "loss": 0.5605688095092773, "mean_token_accuracy": 0.7723411917686462, "num_tokens": 2089137.0, "step": 128 }, { "entropy": 0.5275053828954697, "epoch": 0.48314606741573035, "grad_norm": 0.04322921857237816, "learning_rate": 0.0002, "loss": 0.5484553575515747, "mean_token_accuracy": 0.7770342081785202, "num_tokens": 2105149.0, "step": 129 }, { "entropy": 0.5561497956514359, "epoch": 0.4868913857677903, "grad_norm": 0.038827862590551376, "learning_rate": 0.0002, "loss": 0.55650395154953, "mean_token_accuracy": 0.7764105200767517, "num_tokens": 2121463.0, "step": 130 }, { "entropy": 0.5783034265041351, "epoch": 0.49063670411985016, "grad_norm": 0.029603557661175728, "learning_rate": 0.0002, "loss": 0.5703758001327515, "mean_token_accuracy": 0.7689076513051987, "num_tokens": 2137873.0, "step": 131 }, { "entropy": 0.5802958011627197, "epoch": 0.4943820224719101, "grad_norm": 0.03336755558848381, "learning_rate": 0.0002, "loss": 0.5750676989555359, "mean_token_accuracy": 0.7685631215572357, "num_tokens": 2154043.0, "step": 132 }, { "entropy": 0.5565105229616165, "epoch": 0.49812734082397003, "grad_norm": 0.03589406609535217, "learning_rate": 0.0002, "loss": 0.5438498258590698, "mean_token_accuracy": 0.7815204560756683, "num_tokens": 2170057.0, "step": 133 }, { "entropy": 0.5716612040996552, "epoch": 0.50187265917603, "grad_norm": 0.03452189266681671, "learning_rate": 0.0002, "loss": 0.5778107047080994, "mean_token_accuracy": 0.7688381224870682, "num_tokens": 2186386.0, "step": 134 }, { "entropy": 0.561384916305542, "epoch": 0.5056179775280899, "grad_norm": 0.03864321857690811, "learning_rate": 0.0002, "loss": 0.5704262256622314, "mean_token_accuracy": 0.7647197097539902, "num_tokens": 2202441.0, "step": 135 }, { "entropy": 0.5625592470169067, "epoch": 0.5093632958801498, "grad_norm": 0.029244674369692802, "learning_rate": 0.0002, "loss": 0.5618846416473389, "mean_token_accuracy": 0.7706502974033356, "num_tokens": 2218642.0, "step": 136 }, { "entropy": 0.557224690914154, "epoch": 0.5131086142322098, "grad_norm": 0.03010115958750248, "learning_rate": 0.0002, "loss": 0.5529860854148865, "mean_token_accuracy": 0.7745790481567383, "num_tokens": 2234941.0, "step": 137 }, { "entropy": 0.5669968128204346, "epoch": 0.5168539325842697, "grad_norm": 0.030734272673726082, "learning_rate": 0.0002, "loss": 0.563121497631073, "mean_token_accuracy": 0.7691874206066132, "num_tokens": 2251132.0, "step": 138 }, { "entropy": 0.5601507127285004, "epoch": 0.5205992509363296, "grad_norm": 0.03075527958571911, "learning_rate": 0.0002, "loss": 0.5602597594261169, "mean_token_accuracy": 0.7736657857894897, "num_tokens": 2267424.0, "step": 139 }, { "entropy": 0.5564019232988358, "epoch": 0.5243445692883895, "grad_norm": 0.03025938756763935, "learning_rate": 0.0002, "loss": 0.5628267526626587, "mean_token_accuracy": 0.771067887544632, "num_tokens": 2283849.0, "step": 140 }, { "entropy": 0.5395451635122299, "epoch": 0.5280898876404494, "grad_norm": 0.03199173882603645, "learning_rate": 0.0002, "loss": 0.5487725734710693, "mean_token_accuracy": 0.7775663435459137, "num_tokens": 2299872.0, "step": 141 }, { "entropy": 0.5526085048913956, "epoch": 0.5318352059925093, "grad_norm": 0.030539415776729584, "learning_rate": 0.0002, "loss": 0.5591868162155151, "mean_token_accuracy": 0.7733905166387558, "num_tokens": 2316381.0, "step": 142 }, { "entropy": 0.5586904883384705, "epoch": 0.5355805243445693, "grad_norm": 0.03167688101530075, "learning_rate": 0.0002, "loss": 0.5590608716011047, "mean_token_accuracy": 0.7722269594669342, "num_tokens": 2332636.0, "step": 143 }, { "entropy": 0.5568670481443405, "epoch": 0.5393258426966292, "grad_norm": 0.02876191958785057, "learning_rate": 0.0002, "loss": 0.5519507527351379, "mean_token_accuracy": 0.776704877614975, "num_tokens": 2348823.0, "step": 144 }, { "entropy": 0.5536152571439743, "epoch": 0.5430711610486891, "grad_norm": 0.026966845616698265, "learning_rate": 0.0002, "loss": 0.5451969504356384, "mean_token_accuracy": 0.7772984057664871, "num_tokens": 2365018.0, "step": 145 }, { "entropy": 0.55972820520401, "epoch": 0.5468164794007491, "grad_norm": 0.028171516954898834, "learning_rate": 0.0002, "loss": 0.5568036437034607, "mean_token_accuracy": 0.7727039009332657, "num_tokens": 2381199.0, "step": 146 }, { "entropy": 0.5505439043045044, "epoch": 0.550561797752809, "grad_norm": 0.02772362343966961, "learning_rate": 0.0002, "loss": 0.5527427792549133, "mean_token_accuracy": 0.7765008956193924, "num_tokens": 2397235.0, "step": 147 }, { "entropy": 0.5575017333030701, "epoch": 0.5543071161048689, "grad_norm": 0.030587337911128998, "learning_rate": 0.0002, "loss": 0.5631366968154907, "mean_token_accuracy": 0.7698703855276108, "num_tokens": 2413454.0, "step": 148 }, { "entropy": 0.5469523966312408, "epoch": 0.5580524344569289, "grad_norm": 0.0317547544836998, "learning_rate": 0.0002, "loss": 0.554557740688324, "mean_token_accuracy": 0.776221752166748, "num_tokens": 2429888.0, "step": 149 }, { "entropy": 0.5393165349960327, "epoch": 0.5617977528089888, "grad_norm": 0.028293034061789513, "learning_rate": 0.0002, "loss": 0.538506269454956, "mean_token_accuracy": 0.7823521643877029, "num_tokens": 2446146.0, "step": 150 }, { "entropy": 0.5640445649623871, "epoch": 0.5655430711610487, "grad_norm": 0.027342529967427254, "learning_rate": 0.0002, "loss": 0.5663660764694214, "mean_token_accuracy": 0.7686634063720703, "num_tokens": 2462436.0, "step": 151 }, { "entropy": 0.5660315603017807, "epoch": 0.5692883895131086, "grad_norm": 0.029160011559724808, "learning_rate": 0.0002, "loss": 0.5658541917800903, "mean_token_accuracy": 0.7699626982212067, "num_tokens": 2478983.0, "step": 152 }, { "entropy": 0.5457171052694321, "epoch": 0.5730337078651685, "grad_norm": 0.029130199924111366, "learning_rate": 0.0002, "loss": 0.5439150929450989, "mean_token_accuracy": 0.7802361398935318, "num_tokens": 2495263.0, "step": 153 }, { "entropy": 0.5504166930913925, "epoch": 0.5767790262172284, "grad_norm": 0.03016018122434616, "learning_rate": 0.0002, "loss": 0.5510883331298828, "mean_token_accuracy": 0.775614932179451, "num_tokens": 2511475.0, "step": 154 }, { "entropy": 0.5550555139780045, "epoch": 0.5805243445692884, "grad_norm": 0.03134196624159813, "learning_rate": 0.0002, "loss": 0.5607972145080566, "mean_token_accuracy": 0.7707046419382095, "num_tokens": 2527673.0, "step": 155 }, { "entropy": 0.5454694628715515, "epoch": 0.5842696629213483, "grad_norm": 0.0311669260263443, "learning_rate": 0.0002, "loss": 0.5492562651634216, "mean_token_accuracy": 0.779202476143837, "num_tokens": 2543853.0, "step": 156 }, { "entropy": 0.5742276608943939, "epoch": 0.5880149812734082, "grad_norm": 0.027328435331583023, "learning_rate": 0.0002, "loss": 0.5779210329055786, "mean_token_accuracy": 0.765041321516037, "num_tokens": 2560115.0, "step": 157 }, { "entropy": 0.5670003890991211, "epoch": 0.5917602996254682, "grad_norm": 0.02951730042695999, "learning_rate": 0.0002, "loss": 0.5664114952087402, "mean_token_accuracy": 0.7700729966163635, "num_tokens": 2576322.0, "step": 158 }, { "entropy": 0.5762516111135483, "epoch": 0.5955056179775281, "grad_norm": 0.029969869181513786, "learning_rate": 0.0002, "loss": 0.5735501050949097, "mean_token_accuracy": 0.7683756053447723, "num_tokens": 2592455.0, "step": 159 }, { "entropy": 0.5583818256855011, "epoch": 0.599250936329588, "grad_norm": 0.02687755413353443, "learning_rate": 0.0002, "loss": 0.5561562776565552, "mean_token_accuracy": 0.7738349288702011, "num_tokens": 2608647.0, "step": 160 }, { "entropy": 0.5745189636945724, "epoch": 0.602996254681648, "grad_norm": 0.03188227489590645, "learning_rate": 0.0002, "loss": 0.573383092880249, "mean_token_accuracy": 0.7658237218856812, "num_tokens": 2624851.0, "step": 161 }, { "entropy": 0.5701076835393906, "epoch": 0.6067415730337079, "grad_norm": 0.03216436505317688, "learning_rate": 0.0002, "loss": 0.5696204900741577, "mean_token_accuracy": 0.7674751281738281, "num_tokens": 2641365.0, "step": 162 }, { "entropy": 0.548926368355751, "epoch": 0.6104868913857678, "grad_norm": 0.02745572291314602, "learning_rate": 0.0002, "loss": 0.5530045032501221, "mean_token_accuracy": 0.7764343470335007, "num_tokens": 2657724.0, "step": 163 }, { "entropy": 0.5748997032642365, "epoch": 0.6142322097378277, "grad_norm": 0.03055480308830738, "learning_rate": 0.0002, "loss": 0.5857313275337219, "mean_token_accuracy": 0.7639760226011276, "num_tokens": 2674255.0, "step": 164 }, { "entropy": 0.5685756206512451, "epoch": 0.6179775280898876, "grad_norm": 0.030725592747330666, "learning_rate": 0.0002, "loss": 0.5727284550666809, "mean_token_accuracy": 0.7686582803726196, "num_tokens": 2690670.0, "step": 165 }, { "entropy": 0.547265499830246, "epoch": 0.6217228464419475, "grad_norm": 0.028982795774936676, "learning_rate": 0.0002, "loss": 0.5458434820175171, "mean_token_accuracy": 0.7764610648155212, "num_tokens": 2706990.0, "step": 166 }, { "entropy": 0.5669321566820145, "epoch": 0.6254681647940075, "grad_norm": 0.02999156154692173, "learning_rate": 0.0002, "loss": 0.5610904097557068, "mean_token_accuracy": 0.7703774124383926, "num_tokens": 2723382.0, "step": 167 }, { "entropy": 0.5631402879953384, "epoch": 0.6292134831460674, "grad_norm": 0.02727295272052288, "learning_rate": 0.0002, "loss": 0.5610119700431824, "mean_token_accuracy": 0.7734928578138351, "num_tokens": 2739673.0, "step": 168 }, { "entropy": 0.5462162643671036, "epoch": 0.6329588014981273, "grad_norm": 0.03161296248435974, "learning_rate": 0.0002, "loss": 0.5594881772994995, "mean_token_accuracy": 0.7721333503723145, "num_tokens": 2756004.0, "step": 169 }, { "entropy": 0.5525806844234467, "epoch": 0.6367041198501873, "grad_norm": 0.028923675417900085, "learning_rate": 0.0002, "loss": 0.5581262707710266, "mean_token_accuracy": 0.7746219336986542, "num_tokens": 2772131.0, "step": 170 }, { "entropy": 0.5815936326980591, "epoch": 0.6404494382022472, "grad_norm": 0.029989033937454224, "learning_rate": 0.0002, "loss": 0.5781337022781372, "mean_token_accuracy": 0.7642954289913177, "num_tokens": 2788556.0, "step": 171 }, { "entropy": 0.5742616653442383, "epoch": 0.6441947565543071, "grad_norm": 0.03870734944939613, "learning_rate": 0.0002, "loss": 0.5799432992935181, "mean_token_accuracy": 0.7655478119850159, "num_tokens": 2804635.0, "step": 172 }, { "entropy": 0.576400488615036, "epoch": 0.6479400749063671, "grad_norm": 0.02596936747431755, "learning_rate": 0.0002, "loss": 0.5705851912498474, "mean_token_accuracy": 0.7653899490833282, "num_tokens": 2821201.0, "step": 173 }, { "entropy": 0.5751689076423645, "epoch": 0.651685393258427, "grad_norm": 0.02525261603295803, "learning_rate": 0.0002, "loss": 0.5706028938293457, "mean_token_accuracy": 0.7693078964948654, "num_tokens": 2837952.0, "step": 174 }, { "entropy": 0.557927280664444, "epoch": 0.6554307116104869, "grad_norm": 0.025947891175746918, "learning_rate": 0.0002, "loss": 0.55954509973526, "mean_token_accuracy": 0.7710674405097961, "num_tokens": 2854247.0, "step": 175 }, { "entropy": 0.5340227037668228, "epoch": 0.6591760299625468, "grad_norm": 0.03157508745789528, "learning_rate": 0.0002, "loss": 0.5432956218719482, "mean_token_accuracy": 0.7804963290691376, "num_tokens": 2870169.0, "step": 176 }, { "entropy": 0.5522671341896057, "epoch": 0.6629213483146067, "grad_norm": 0.027346299961209297, "learning_rate": 0.0002, "loss": 0.5591444969177246, "mean_token_accuracy": 0.7712446004152298, "num_tokens": 2886516.0, "step": 177 }, { "entropy": 0.5393896102905273, "epoch": 0.6666666666666666, "grad_norm": 0.027576690539717674, "learning_rate": 0.0002, "loss": 0.5416374206542969, "mean_token_accuracy": 0.7780617028474808, "num_tokens": 2902729.0, "step": 178 }, { "entropy": 0.5685822814702988, "epoch": 0.6704119850187266, "grad_norm": 0.03415964916348457, "learning_rate": 0.0002, "loss": 0.5774993300437927, "mean_token_accuracy": 0.7654603570699692, "num_tokens": 2919059.0, "step": 179 }, { "entropy": 0.5473489463329315, "epoch": 0.6741573033707865, "grad_norm": 0.03175094723701477, "learning_rate": 0.0002, "loss": 0.5478240847587585, "mean_token_accuracy": 0.7771035730838776, "num_tokens": 2935209.0, "step": 180 }, { "entropy": 0.5505825132131577, "epoch": 0.6779026217228464, "grad_norm": 0.027963241562247276, "learning_rate": 0.0002, "loss": 0.5473360419273376, "mean_token_accuracy": 0.7776090204715729, "num_tokens": 2951643.0, "step": 181 }, { "entropy": 0.5541345179080963, "epoch": 0.6816479400749064, "grad_norm": 0.03300129622220993, "learning_rate": 0.0002, "loss": 0.5419403910636902, "mean_token_accuracy": 0.7789575755596161, "num_tokens": 2967938.0, "step": 182 }, { "entropy": 0.5639268904924393, "epoch": 0.6853932584269663, "grad_norm": 0.032656021416187286, "learning_rate": 0.0002, "loss": 0.5597264170646667, "mean_token_accuracy": 0.7759947925806046, "num_tokens": 2984230.0, "step": 183 }, { "entropy": 0.5538647770881653, "epoch": 0.6891385767790262, "grad_norm": 0.03382604569196701, "learning_rate": 0.0002, "loss": 0.5666002631187439, "mean_token_accuracy": 0.7692589312791824, "num_tokens": 3000607.0, "step": 184 }, { "entropy": 0.5578113794326782, "epoch": 0.6928838951310862, "grad_norm": 0.03644486889243126, "learning_rate": 0.0002, "loss": 0.5739911198616028, "mean_token_accuracy": 0.7684497386217117, "num_tokens": 3017077.0, "step": 185 }, { "entropy": 0.5290449112653732, "epoch": 0.6966292134831461, "grad_norm": 0.027713051065802574, "learning_rate": 0.0002, "loss": 0.5355228781700134, "mean_token_accuracy": 0.7826152592897415, "num_tokens": 3032996.0, "step": 186 }, { "entropy": 0.5759813338518143, "epoch": 0.700374531835206, "grad_norm": 0.03057127632200718, "learning_rate": 0.0002, "loss": 0.569280743598938, "mean_token_accuracy": 0.7680912464857101, "num_tokens": 3049460.0, "step": 187 }, { "entropy": 0.5720777213573456, "epoch": 0.704119850187266, "grad_norm": 0.02572391740977764, "learning_rate": 0.0002, "loss": 0.5658439993858337, "mean_token_accuracy": 0.7709487825632095, "num_tokens": 3065672.0, "step": 188 }, { "entropy": 0.5517766922712326, "epoch": 0.7078651685393258, "grad_norm": 0.029554082080721855, "learning_rate": 0.0002, "loss": 0.5389034748077393, "mean_token_accuracy": 0.7830005586147308, "num_tokens": 3082173.0, "step": 189 }, { "entropy": 0.5635267347097397, "epoch": 0.7116104868913857, "grad_norm": 0.025442970916628838, "learning_rate": 0.0002, "loss": 0.5614153742790222, "mean_token_accuracy": 0.7708731889724731, "num_tokens": 3098727.0, "step": 190 }, { "entropy": 0.5624646097421646, "epoch": 0.7153558052434457, "grad_norm": 0.03501886874437332, "learning_rate": 0.0002, "loss": 0.5751168727874756, "mean_token_accuracy": 0.7674457877874374, "num_tokens": 3115031.0, "step": 191 }, { "entropy": 0.5412020832300186, "epoch": 0.7191011235955056, "grad_norm": 0.029673364013433456, "learning_rate": 0.0002, "loss": 0.5503013730049133, "mean_token_accuracy": 0.780591607093811, "num_tokens": 3131271.0, "step": 192 }, { "entropy": 0.557359516620636, "epoch": 0.7228464419475655, "grad_norm": 0.025931306183338165, "learning_rate": 0.0002, "loss": 0.559468150138855, "mean_token_accuracy": 0.7729436904191971, "num_tokens": 3147732.0, "step": 193 }, { "entropy": 0.5394045114517212, "epoch": 0.7265917602996255, "grad_norm": 0.0292246975004673, "learning_rate": 0.0002, "loss": 0.5409769415855408, "mean_token_accuracy": 0.7795000076293945, "num_tokens": 3163963.0, "step": 194 }, { "entropy": 0.5587436705827713, "epoch": 0.7303370786516854, "grad_norm": 0.03306795284152031, "learning_rate": 0.0002, "loss": 0.5556156039237976, "mean_token_accuracy": 0.7742602825164795, "num_tokens": 3179928.0, "step": 195 }, { "entropy": 0.558687686920166, "epoch": 0.7340823970037453, "grad_norm": 0.025363627821207047, "learning_rate": 0.0002, "loss": 0.5573633909225464, "mean_token_accuracy": 0.7759020626544952, "num_tokens": 3196142.0, "step": 196 }, { "entropy": 0.545383557677269, "epoch": 0.7378277153558053, "grad_norm": 0.027863260358572006, "learning_rate": 0.0002, "loss": 0.5485226511955261, "mean_token_accuracy": 0.7776659727096558, "num_tokens": 3212565.0, "step": 197 }, { "entropy": 0.5556656569242477, "epoch": 0.7415730337078652, "grad_norm": 0.035580288618803024, "learning_rate": 0.0002, "loss": 0.5673390626907349, "mean_token_accuracy": 0.7700339257717133, "num_tokens": 3228915.0, "step": 198 }, { "entropy": 0.5520624220371246, "epoch": 0.7453183520599251, "grad_norm": 0.02862994559109211, "learning_rate": 0.0002, "loss": 0.5494414567947388, "mean_token_accuracy": 0.7801119983196259, "num_tokens": 3245273.0, "step": 199 }, { "entropy": 0.5758003443479538, "epoch": 0.7490636704119851, "grad_norm": 0.0339261032640934, "learning_rate": 0.0002, "loss": 0.5687139630317688, "mean_token_accuracy": 0.7678625285625458, "num_tokens": 3261785.0, "step": 200 }, { "entropy": 0.568912148475647, "epoch": 0.7528089887640449, "grad_norm": 0.029947372153401375, "learning_rate": 0.0002, "loss": 0.5638163089752197, "mean_token_accuracy": 0.77249875664711, "num_tokens": 3278313.0, "step": 201 }, { "entropy": 0.5490483492612839, "epoch": 0.7565543071161048, "grad_norm": 0.02934352308511734, "learning_rate": 0.0002, "loss": 0.5535009503364563, "mean_token_accuracy": 0.7746146768331528, "num_tokens": 3294575.0, "step": 202 }, { "entropy": 0.560209795832634, "epoch": 0.7602996254681648, "grad_norm": 0.031990889459848404, "learning_rate": 0.0002, "loss": 0.5637909770011902, "mean_token_accuracy": 0.7735392153263092, "num_tokens": 3310679.0, "step": 203 }, { "entropy": 0.5573873072862625, "epoch": 0.7640449438202247, "grad_norm": 0.02812575176358223, "learning_rate": 0.0002, "loss": 0.5629784464836121, "mean_token_accuracy": 0.7686379998922348, "num_tokens": 3327065.0, "step": 204 }, { "entropy": 0.534591019153595, "epoch": 0.7677902621722846, "grad_norm": 0.03412024676799774, "learning_rate": 0.0002, "loss": 0.546525239944458, "mean_token_accuracy": 0.7761467695236206, "num_tokens": 3343404.0, "step": 205 }, { "entropy": 0.5677939504384995, "epoch": 0.7715355805243446, "grad_norm": 0.02933080866932869, "learning_rate": 0.0002, "loss": 0.5688956379890442, "mean_token_accuracy": 0.7702508270740509, "num_tokens": 3359958.0, "step": 206 }, { "entropy": 0.582836389541626, "epoch": 0.7752808988764045, "grad_norm": 0.027001049369573593, "learning_rate": 0.0002, "loss": 0.5772212147712708, "mean_token_accuracy": 0.7654514610767365, "num_tokens": 3376426.0, "step": 207 }, { "entropy": 0.5876192450523376, "epoch": 0.7790262172284644, "grad_norm": 0.031185103580355644, "learning_rate": 0.0002, "loss": 0.5810344219207764, "mean_token_accuracy": 0.7651431113481522, "num_tokens": 3392821.0, "step": 208 }, { "entropy": 0.5676351487636566, "epoch": 0.7827715355805244, "grad_norm": 0.02849467284977436, "learning_rate": 0.0002, "loss": 0.5602158904075623, "mean_token_accuracy": 0.771087646484375, "num_tokens": 3409137.0, "step": 209 }, { "entropy": 0.5598850250244141, "epoch": 0.7865168539325843, "grad_norm": 0.028652694076299667, "learning_rate": 0.0002, "loss": 0.5560476779937744, "mean_token_accuracy": 0.7744726985692978, "num_tokens": 3425346.0, "step": 210 }, { "entropy": 0.5631076842546463, "epoch": 0.7902621722846442, "grad_norm": 0.03177965059876442, "learning_rate": 0.0002, "loss": 0.5703850984573364, "mean_token_accuracy": 0.7688238769769669, "num_tokens": 3441766.0, "step": 211 }, { "entropy": 0.5571614354848862, "epoch": 0.7940074906367042, "grad_norm": 0.035387102514505386, "learning_rate": 0.0002, "loss": 0.5680047869682312, "mean_token_accuracy": 0.7702172994613647, "num_tokens": 3458303.0, "step": 212 }, { "entropy": 0.5512831062078476, "epoch": 0.797752808988764, "grad_norm": 0.02970981039106846, "learning_rate": 0.0002, "loss": 0.5541270971298218, "mean_token_accuracy": 0.7740521878004074, "num_tokens": 3474455.0, "step": 213 }, { "entropy": 0.5604052096605301, "epoch": 0.8014981273408239, "grad_norm": 0.028583871200680733, "learning_rate": 0.0002, "loss": 0.5585545301437378, "mean_token_accuracy": 0.7712778151035309, "num_tokens": 3490567.0, "step": 214 }, { "entropy": 0.5531798452138901, "epoch": 0.8052434456928839, "grad_norm": 0.027284301817417145, "learning_rate": 0.0002, "loss": 0.5523191690444946, "mean_token_accuracy": 0.7744116485118866, "num_tokens": 3506697.0, "step": 215 }, { "entropy": 0.5611687004566193, "epoch": 0.8089887640449438, "grad_norm": 0.030331265181303024, "learning_rate": 0.0002, "loss": 0.5599703192710876, "mean_token_accuracy": 0.7741329371929169, "num_tokens": 3523064.0, "step": 216 }, { "entropy": 0.5679153800010681, "epoch": 0.8127340823970037, "grad_norm": 0.028981544077396393, "learning_rate": 0.0002, "loss": 0.5729029178619385, "mean_token_accuracy": 0.7667650431394577, "num_tokens": 3539143.0, "step": 217 }, { "entropy": 0.5438763052225113, "epoch": 0.8164794007490637, "grad_norm": 0.02691890485584736, "learning_rate": 0.0002, "loss": 0.5485566854476929, "mean_token_accuracy": 0.7739608585834503, "num_tokens": 3555565.0, "step": 218 }, { "entropy": 0.5619954615831375, "epoch": 0.8202247191011236, "grad_norm": 0.026171443983912468, "learning_rate": 0.0002, "loss": 0.5637154579162598, "mean_token_accuracy": 0.7711703032255173, "num_tokens": 3571906.0, "step": 219 }, { "entropy": 0.5464108288288116, "epoch": 0.8239700374531835, "grad_norm": 0.02858656644821167, "learning_rate": 0.0002, "loss": 0.5461940169334412, "mean_token_accuracy": 0.7789376378059387, "num_tokens": 3588158.0, "step": 220 }, { "entropy": 0.5636538565158844, "epoch": 0.8277153558052435, "grad_norm": 0.02787981554865837, "learning_rate": 0.0002, "loss": 0.5658812522888184, "mean_token_accuracy": 0.7694707363843918, "num_tokens": 3604701.0, "step": 221 }, { "entropy": 0.5738235861063004, "epoch": 0.8314606741573034, "grad_norm": 0.03107610158622265, "learning_rate": 0.0002, "loss": 0.5720517635345459, "mean_token_accuracy": 0.767520397901535, "num_tokens": 3621041.0, "step": 222 }, { "entropy": 0.5418261587619781, "epoch": 0.8352059925093633, "grad_norm": 0.030757945030927658, "learning_rate": 0.0002, "loss": 0.5468308925628662, "mean_token_accuracy": 0.7743646949529648, "num_tokens": 3637338.0, "step": 223 }, { "entropy": 0.5567242801189423, "epoch": 0.8389513108614233, "grad_norm": 0.031262289732694626, "learning_rate": 0.0002, "loss": 0.5633231997489929, "mean_token_accuracy": 0.7722140103578568, "num_tokens": 3653872.0, "step": 224 }, { "entropy": 0.5542743653059006, "epoch": 0.8426966292134831, "grad_norm": 0.03351176902651787, "learning_rate": 0.0002, "loss": 0.5574679374694824, "mean_token_accuracy": 0.7744366973638535, "num_tokens": 3670013.0, "step": 225 }, { "entropy": 0.5486074835062027, "epoch": 0.846441947565543, "grad_norm": 0.0312609001994133, "learning_rate": 0.0002, "loss": 0.545890748500824, "mean_token_accuracy": 0.7778652608394623, "num_tokens": 3686275.0, "step": 226 }, { "entropy": 0.5650633871555328, "epoch": 0.850187265917603, "grad_norm": 0.028242582455277443, "learning_rate": 0.0002, "loss": 0.5587697625160217, "mean_token_accuracy": 0.7728594094514847, "num_tokens": 3702890.0, "step": 227 }, { "entropy": 0.5442924797534943, "epoch": 0.8539325842696629, "grad_norm": 0.03206290304660797, "learning_rate": 0.0002, "loss": 0.5438553690910339, "mean_token_accuracy": 0.7799272388219833, "num_tokens": 3719196.0, "step": 228 }, { "entropy": 0.5688119828701019, "epoch": 0.8576779026217228, "grad_norm": 0.031068341806530952, "learning_rate": 0.0002, "loss": 0.5722005367279053, "mean_token_accuracy": 0.7658038288354874, "num_tokens": 3735614.0, "step": 229 }, { "entropy": 0.5671662837266922, "epoch": 0.8614232209737828, "grad_norm": 0.03664137050509453, "learning_rate": 0.0002, "loss": 0.5779143571853638, "mean_token_accuracy": 0.7624872028827667, "num_tokens": 3751617.0, "step": 230 }, { "entropy": 0.5505847632884979, "epoch": 0.8651685393258427, "grad_norm": 0.031469304114580154, "learning_rate": 0.0002, "loss": 0.5520802140235901, "mean_token_accuracy": 0.7765519469976425, "num_tokens": 3768020.0, "step": 231 }, { "entropy": 0.5407437533140182, "epoch": 0.8689138576779026, "grad_norm": 0.03157830610871315, "learning_rate": 0.0002, "loss": 0.53821861743927, "mean_token_accuracy": 0.7832015603780746, "num_tokens": 3784206.0, "step": 232 }, { "entropy": 0.5574967563152313, "epoch": 0.8726591760299626, "grad_norm": 0.03071594052016735, "learning_rate": 0.0002, "loss": 0.5562031865119934, "mean_token_accuracy": 0.7721244394779205, "num_tokens": 3800616.0, "step": 233 }, { "entropy": 0.5378725826740265, "epoch": 0.8764044943820225, "grad_norm": 0.030823221430182457, "learning_rate": 0.0002, "loss": 0.5407513380050659, "mean_token_accuracy": 0.7836541086435318, "num_tokens": 3816842.0, "step": 234 }, { "entropy": 0.5592721700668335, "epoch": 0.8801498127340824, "grad_norm": 0.03175733983516693, "learning_rate": 0.0002, "loss": 0.5660021305084229, "mean_token_accuracy": 0.7676839083433151, "num_tokens": 3833206.0, "step": 235 }, { "entropy": 0.5588899403810501, "epoch": 0.8838951310861424, "grad_norm": 0.03060559183359146, "learning_rate": 0.0002, "loss": 0.5651678442955017, "mean_token_accuracy": 0.7706761956214905, "num_tokens": 3849556.0, "step": 236 }, { "entropy": 0.5560838133096695, "epoch": 0.8876404494382022, "grad_norm": 0.03011494129896164, "learning_rate": 0.0002, "loss": 0.5619899034500122, "mean_token_accuracy": 0.7695688903331757, "num_tokens": 3865973.0, "step": 237 }, { "entropy": 0.572941854596138, "epoch": 0.8913857677902621, "grad_norm": 0.02626178041100502, "learning_rate": 0.0002, "loss": 0.5712540149688721, "mean_token_accuracy": 0.7688916623592377, "num_tokens": 3882349.0, "step": 238 }, { "entropy": 0.5688192397356033, "epoch": 0.8951310861423221, "grad_norm": 0.0268928874284029, "learning_rate": 0.0002, "loss": 0.562833309173584, "mean_token_accuracy": 0.7708128988742828, "num_tokens": 3898536.0, "step": 239 }, { "entropy": 0.5633461475372314, "epoch": 0.898876404494382, "grad_norm": 0.029186321422457695, "learning_rate": 0.0002, "loss": 0.5525766611099243, "mean_token_accuracy": 0.7749095112085342, "num_tokens": 3914950.0, "step": 240 }, { "entropy": 0.5715253502130508, "epoch": 0.9026217228464419, "grad_norm": 0.029228920117020607, "learning_rate": 0.0002, "loss": 0.5710093975067139, "mean_token_accuracy": 0.7693532109260559, "num_tokens": 3931161.0, "step": 241 }, { "entropy": 0.5170925259590149, "epoch": 0.9063670411985019, "grad_norm": 0.03571123257279396, "learning_rate": 0.0002, "loss": 0.52873295545578, "mean_token_accuracy": 0.7879834473133087, "num_tokens": 3947256.0, "step": 242 }, { "entropy": 0.5353554487228394, "epoch": 0.9101123595505618, "grad_norm": 0.031091809272766113, "learning_rate": 0.0002, "loss": 0.5437985062599182, "mean_token_accuracy": 0.7802935838699341, "num_tokens": 3963703.0, "step": 243 }, { "entropy": 0.5593858063220978, "epoch": 0.9138576779026217, "grad_norm": 0.028724675998091698, "learning_rate": 0.0002, "loss": 0.5654380321502686, "mean_token_accuracy": 0.766664981842041, "num_tokens": 3980237.0, "step": 244 }, { "entropy": 0.5452692359685898, "epoch": 0.9176029962546817, "grad_norm": 0.032008957117795944, "learning_rate": 0.0002, "loss": 0.5489979982376099, "mean_token_accuracy": 0.7783998996019363, "num_tokens": 3996411.0, "step": 245 }, { "entropy": 0.5732362270355225, "epoch": 0.9213483146067416, "grad_norm": 0.026769591495394707, "learning_rate": 0.0002, "loss": 0.5739398002624512, "mean_token_accuracy": 0.7671795785427094, "num_tokens": 4012857.0, "step": 246 }, { "entropy": 0.5656879991292953, "epoch": 0.9250936329588015, "grad_norm": 0.03197095915675163, "learning_rate": 0.0002, "loss": 0.563187301158905, "mean_token_accuracy": 0.7670102566480637, "num_tokens": 4029053.0, "step": 247 }, { "entropy": 0.5575947314500809, "epoch": 0.9288389513108615, "grad_norm": 0.02987116388976574, "learning_rate": 0.0002, "loss": 0.5625151991844177, "mean_token_accuracy": 0.7722823321819305, "num_tokens": 4045520.0, "step": 248 }, { "entropy": 0.5391925573348999, "epoch": 0.9325842696629213, "grad_norm": 0.03071737289428711, "learning_rate": 0.0002, "loss": 0.5494749546051025, "mean_token_accuracy": 0.7774742394685745, "num_tokens": 4061722.0, "step": 249 }, { "entropy": 0.5374163240194321, "epoch": 0.9363295880149812, "grad_norm": 0.03443381190299988, "learning_rate": 0.0002, "loss": 0.5430468916893005, "mean_token_accuracy": 0.7767436355352402, "num_tokens": 4077909.0, "step": 250 }, { "entropy": 0.563934788107872, "epoch": 0.9400749063670412, "grad_norm": 0.03456362709403038, "learning_rate": 0.0002, "loss": 0.5705171227455139, "mean_token_accuracy": 0.7667582482099533, "num_tokens": 4094266.0, "step": 251 }, { "entropy": 0.5498995631933212, "epoch": 0.9438202247191011, "grad_norm": 0.03230346366763115, "learning_rate": 0.0002, "loss": 0.5477432012557983, "mean_token_accuracy": 0.7797223627567291, "num_tokens": 4110154.0, "step": 252 }, { "entropy": 0.5815821886062622, "epoch": 0.947565543071161, "grad_norm": 0.030871113762259483, "learning_rate": 0.0002, "loss": 0.5757232904434204, "mean_token_accuracy": 0.7643865346908569, "num_tokens": 4126298.0, "step": 253 }, { "entropy": 0.568855032324791, "epoch": 0.951310861423221, "grad_norm": 0.03128105401992798, "learning_rate": 0.0002, "loss": 0.5623528361320496, "mean_token_accuracy": 0.7733433544635773, "num_tokens": 4142423.0, "step": 254 }, { "entropy": 0.5580300092697144, "epoch": 0.9550561797752809, "grad_norm": 0.028919901698827744, "learning_rate": 0.0002, "loss": 0.5540750026702881, "mean_token_accuracy": 0.7751399129629135, "num_tokens": 4158616.0, "step": 255 }, { "entropy": 0.5586510896682739, "epoch": 0.9588014981273408, "grad_norm": 0.028054876253008842, "learning_rate": 0.0002, "loss": 0.5566189289093018, "mean_token_accuracy": 0.771488219499588, "num_tokens": 4174981.0, "step": 256 }, { "entropy": 0.5506493747234344, "epoch": 0.9625468164794008, "grad_norm": 0.028799347579479218, "learning_rate": 0.0002, "loss": 0.5535633563995361, "mean_token_accuracy": 0.7742148786783218, "num_tokens": 4191446.0, "step": 257 }, { "entropy": 0.5423731654882431, "epoch": 0.9662921348314607, "grad_norm": 0.033325713127851486, "learning_rate": 0.0002, "loss": 0.5534674525260925, "mean_token_accuracy": 0.773481622338295, "num_tokens": 4207545.0, "step": 258 }, { "entropy": 0.5463626831769943, "epoch": 0.9700374531835206, "grad_norm": 0.029474180191755295, "learning_rate": 0.0002, "loss": 0.5469580888748169, "mean_token_accuracy": 0.778034508228302, "num_tokens": 4223705.0, "step": 259 }, { "entropy": 0.5447346717119217, "epoch": 0.9737827715355806, "grad_norm": 0.02612573839724064, "learning_rate": 0.0002, "loss": 0.5400044322013855, "mean_token_accuracy": 0.7802340239286423, "num_tokens": 4240129.0, "step": 260 }, { "entropy": 0.5821470022201538, "epoch": 0.9775280898876404, "grad_norm": 0.030348099768161774, "learning_rate": 0.0002, "loss": 0.5687776803970337, "mean_token_accuracy": 0.7688710540533066, "num_tokens": 4256543.0, "step": 261 }, { "entropy": 0.5551526695489883, "epoch": 0.9812734082397003, "grad_norm": 0.027197403833270073, "learning_rate": 0.0002, "loss": 0.5550498962402344, "mean_token_accuracy": 0.7730266898870468, "num_tokens": 4272850.0, "step": 262 }, { "entropy": 0.558951735496521, "epoch": 0.9850187265917603, "grad_norm": 0.02930772304534912, "learning_rate": 0.0002, "loss": 0.568732738494873, "mean_token_accuracy": 0.7649472206830978, "num_tokens": 4288981.0, "step": 263 }, { "entropy": 0.5453519076108932, "epoch": 0.9887640449438202, "grad_norm": 0.03282203525304794, "learning_rate": 0.0002, "loss": 0.5584692358970642, "mean_token_accuracy": 0.7731108516454697, "num_tokens": 4305020.0, "step": 264 }, { "entropy": 0.5550204813480377, "epoch": 0.9925093632958801, "grad_norm": 0.030776405707001686, "learning_rate": 0.0002, "loss": 0.5647276639938354, "mean_token_accuracy": 0.7714035212993622, "num_tokens": 4321505.0, "step": 265 }, { "entropy": 0.5713452994823456, "epoch": 0.9962546816479401, "grad_norm": 0.027741121128201485, "learning_rate": 0.0002, "loss": 0.5671746134757996, "mean_token_accuracy": 0.77179254591465, "num_tokens": 4337819.0, "step": 266 }, { "entropy": 0.5695875138044357, "epoch": 1.0, "grad_norm": 0.03063138760626316, "learning_rate": 0.0002, "loss": 0.5631532669067383, "mean_token_accuracy": 0.7723733484745026, "num_tokens": 4354077.0, "step": 267 }, { "entropy": 0.5564615577459335, "epoch": 1.00374531835206, "grad_norm": 0.02938828431069851, "learning_rate": 0.0002, "loss": 0.5473178625106812, "mean_token_accuracy": 0.7778049558401108, "num_tokens": 4370546.0, "step": 268 }, { "entropy": 0.5574217587709427, "epoch": 1.0074906367041199, "grad_norm": 0.029280902817845345, "learning_rate": 0.0002, "loss": 0.5522539019584656, "mean_token_accuracy": 0.774829238653183, "num_tokens": 4386769.0, "step": 269 }, { "entropy": 0.5274022594094276, "epoch": 1.0112359550561798, "grad_norm": 0.03879232704639435, "learning_rate": 0.0002, "loss": 0.5378210544586182, "mean_token_accuracy": 0.7831418812274933, "num_tokens": 4402982.0, "step": 270 }, { "entropy": 0.5290966331958771, "epoch": 1.0149812734082397, "grad_norm": 0.03839439898729324, "learning_rate": 0.0002, "loss": 0.5428091883659363, "mean_token_accuracy": 0.7794705182313919, "num_tokens": 4418967.0, "step": 271 }, { "entropy": 0.5340720564126968, "epoch": 1.0187265917602997, "grad_norm": 0.027254262939095497, "learning_rate": 0.0002, "loss": 0.5355733633041382, "mean_token_accuracy": 0.7818265557289124, "num_tokens": 4435204.0, "step": 272 }, { "entropy": 0.5440738946199417, "epoch": 1.0224719101123596, "grad_norm": 0.03392236679792404, "learning_rate": 0.0002, "loss": 0.5456275939941406, "mean_token_accuracy": 0.780282586812973, "num_tokens": 4451432.0, "step": 273 }, { "entropy": 0.5574818104505539, "epoch": 1.0262172284644195, "grad_norm": 0.026871202513575554, "learning_rate": 0.0002, "loss": 0.5559114217758179, "mean_token_accuracy": 0.777089074254036, "num_tokens": 4467766.0, "step": 274 }, { "entropy": 0.5488097965717316, "epoch": 1.0299625468164795, "grad_norm": 0.029019974172115326, "learning_rate": 0.0002, "loss": 0.5336285829544067, "mean_token_accuracy": 0.7849163711071014, "num_tokens": 4483969.0, "step": 275 }, { "entropy": 0.5530442148447037, "epoch": 1.0337078651685394, "grad_norm": 0.02914772555232048, "learning_rate": 0.0002, "loss": 0.5511333346366882, "mean_token_accuracy": 0.7753048241138458, "num_tokens": 4500202.0, "step": 276 }, { "entropy": 0.5580654293298721, "epoch": 1.0374531835205993, "grad_norm": 0.02970791608095169, "learning_rate": 0.0002, "loss": 0.5622603297233582, "mean_token_accuracy": 0.7713205814361572, "num_tokens": 4516619.0, "step": 277 }, { "entropy": 0.5405817478895187, "epoch": 1.0411985018726593, "grad_norm": 0.0317082442343235, "learning_rate": 0.0002, "loss": 0.5510064363479614, "mean_token_accuracy": 0.7750898003578186, "num_tokens": 4532787.0, "step": 278 }, { "entropy": 0.529707208275795, "epoch": 1.0449438202247192, "grad_norm": 0.032039616256952286, "learning_rate": 0.0002, "loss": 0.5385198593139648, "mean_token_accuracy": 0.7802569419145584, "num_tokens": 4549095.0, "step": 279 }, { "entropy": 0.536220982670784, "epoch": 1.048689138576779, "grad_norm": 0.03247847780585289, "learning_rate": 0.0002, "loss": 0.5422552824020386, "mean_token_accuracy": 0.7777614146471024, "num_tokens": 4565068.0, "step": 280 }, { "entropy": 0.5643364787101746, "epoch": 1.0524344569288389, "grad_norm": 0.03038158267736435, "learning_rate": 0.0002, "loss": 0.5526927709579468, "mean_token_accuracy": 0.7772861868143082, "num_tokens": 4581362.0, "step": 281 }, { "entropy": 0.5710341036319733, "epoch": 1.0561797752808988, "grad_norm": 0.029375184327363968, "learning_rate": 0.0002, "loss": 0.5627338290214539, "mean_token_accuracy": 0.7716031968593597, "num_tokens": 4598044.0, "step": 282 }, { "entropy": 0.5661873072385788, "epoch": 1.0599250936329587, "grad_norm": 0.029537923634052277, "learning_rate": 0.0002, "loss": 0.5619353652000427, "mean_token_accuracy": 0.7722314894199371, "num_tokens": 4614605.0, "step": 283 }, { "entropy": 0.545825719833374, "epoch": 1.0636704119850187, "grad_norm": 0.028511304408311844, "learning_rate": 0.0002, "loss": 0.5431419610977173, "mean_token_accuracy": 0.7778640240430832, "num_tokens": 4630914.0, "step": 284 }, { "entropy": 0.5331753790378571, "epoch": 1.0674157303370786, "grad_norm": 0.032436709851026535, "learning_rate": 0.0002, "loss": 0.5459548830986023, "mean_token_accuracy": 0.7751310169696808, "num_tokens": 4647234.0, "step": 285 }, { "entropy": 0.5640293508768082, "epoch": 1.0711610486891385, "grad_norm": 0.0322943851351738, "learning_rate": 0.0002, "loss": 0.5726660490036011, "mean_token_accuracy": 0.76516292989254, "num_tokens": 4663828.0, "step": 286 }, { "entropy": 0.5655198693275452, "epoch": 1.0749063670411985, "grad_norm": 0.028429750353097916, "learning_rate": 0.0002, "loss": 0.5707299709320068, "mean_token_accuracy": 0.7665908485651016, "num_tokens": 4680191.0, "step": 287 }, { "entropy": 0.5641037821769714, "epoch": 1.0786516853932584, "grad_norm": 0.02850640006363392, "learning_rate": 0.0002, "loss": 0.5591652393341064, "mean_token_accuracy": 0.7727868556976318, "num_tokens": 4696297.0, "step": 288 }, { "entropy": 0.5585228204727173, "epoch": 1.0823970037453183, "grad_norm": 0.03052029199898243, "learning_rate": 0.0002, "loss": 0.5535526275634766, "mean_token_accuracy": 0.7758607268333435, "num_tokens": 4712608.0, "step": 289 }, { "entropy": 0.5454631745815277, "epoch": 1.0861423220973783, "grad_norm": 0.02904430776834488, "learning_rate": 0.0002, "loss": 0.5463353395462036, "mean_token_accuracy": 0.7812290787696838, "num_tokens": 4728702.0, "step": 290 }, { "entropy": 0.547488197684288, "epoch": 1.0898876404494382, "grad_norm": 0.02964003197848797, "learning_rate": 0.0002, "loss": 0.5422903299331665, "mean_token_accuracy": 0.7805432081222534, "num_tokens": 4745177.0, "step": 291 }, { "entropy": 0.5354203134775162, "epoch": 1.0936329588014981, "grad_norm": 0.036443792283535004, "learning_rate": 0.0002, "loss": 0.5374300479888916, "mean_token_accuracy": 0.7797484993934631, "num_tokens": 4761143.0, "step": 292 }, { "entropy": 0.5536107122898102, "epoch": 1.097378277153558, "grad_norm": 0.028762439265847206, "learning_rate": 0.0002, "loss": 0.5621394515037537, "mean_token_accuracy": 0.7706074863672256, "num_tokens": 4777282.0, "step": 293 }, { "entropy": 0.5409039855003357, "epoch": 1.101123595505618, "grad_norm": 0.03404904156923294, "learning_rate": 0.0002, "loss": 0.5510942339897156, "mean_token_accuracy": 0.7781406044960022, "num_tokens": 4793365.0, "step": 294 }, { "entropy": 0.5496554970741272, "epoch": 1.104868913857678, "grad_norm": 0.03300090506672859, "learning_rate": 0.0002, "loss": 0.5508947372436523, "mean_token_accuracy": 0.7776678502559662, "num_tokens": 4809752.0, "step": 295 }, { "entropy": 0.5615599453449249, "epoch": 1.1086142322097379, "grad_norm": 0.02708325907588005, "learning_rate": 0.0002, "loss": 0.5569652915000916, "mean_token_accuracy": 0.7737039029598236, "num_tokens": 4826077.0, "step": 296 }, { "entropy": 0.5593246519565582, "epoch": 1.1123595505617978, "grad_norm": 0.03139323368668556, "learning_rate": 0.0002, "loss": 0.5524771809577942, "mean_token_accuracy": 0.7745187878608704, "num_tokens": 4842333.0, "step": 297 }, { "entropy": 0.5454850494861603, "epoch": 1.1161048689138577, "grad_norm": 0.02898702770471573, "learning_rate": 0.0002, "loss": 0.5425970554351807, "mean_token_accuracy": 0.7789193391799927, "num_tokens": 4858558.0, "step": 298 }, { "entropy": 0.538344144821167, "epoch": 1.1198501872659177, "grad_norm": 0.029788950458168983, "learning_rate": 0.0002, "loss": 0.5424114465713501, "mean_token_accuracy": 0.7777515351772308, "num_tokens": 4874826.0, "step": 299 }, { "entropy": 0.5260975658893585, "epoch": 1.1235955056179776, "grad_norm": 0.03646169230341911, "learning_rate": 0.0002, "loss": 0.5355998277664185, "mean_token_accuracy": 0.7840575128793716, "num_tokens": 4890978.0, "step": 300 }, { "entropy": 0.5369604676961899, "epoch": 1.1273408239700375, "grad_norm": 0.03131569176912308, "learning_rate": 0.0002, "loss": 0.540716290473938, "mean_token_accuracy": 0.780446395277977, "num_tokens": 4907064.0, "step": 301 }, { "entropy": 0.5605516880750656, "epoch": 1.1310861423220975, "grad_norm": 0.034511223435401917, "learning_rate": 0.0002, "loss": 0.5577893257141113, "mean_token_accuracy": 0.7730138152837753, "num_tokens": 4923266.0, "step": 302 }, { "entropy": 0.5472770929336548, "epoch": 1.1348314606741572, "grad_norm": 0.0347181111574173, "learning_rate": 0.0002, "loss": 0.5447498559951782, "mean_token_accuracy": 0.7790001332759857, "num_tokens": 4939554.0, "step": 303 }, { "entropy": 0.5580919533967972, "epoch": 1.1385767790262173, "grad_norm": 0.029458722099661827, "learning_rate": 0.0002, "loss": 0.5602295994758606, "mean_token_accuracy": 0.7698655724525452, "num_tokens": 4955864.0, "step": 304 }, { "entropy": 0.5566238462924957, "epoch": 1.142322097378277, "grad_norm": 0.03371216729283333, "learning_rate": 0.0002, "loss": 0.5516577363014221, "mean_token_accuracy": 0.7762005478143692, "num_tokens": 4972145.0, "step": 305 }, { "entropy": 0.5444543808698654, "epoch": 1.146067415730337, "grad_norm": 0.03240659460425377, "learning_rate": 0.0002, "loss": 0.5465469360351562, "mean_token_accuracy": 0.7778800278902054, "num_tokens": 4988600.0, "step": 306 }, { "entropy": 0.5197838395833969, "epoch": 1.149812734082397, "grad_norm": 0.03453533351421356, "learning_rate": 0.0002, "loss": 0.52244633436203, "mean_token_accuracy": 0.7865428030490875, "num_tokens": 5004593.0, "step": 307 }, { "entropy": 0.5355952382087708, "epoch": 1.1535580524344569, "grad_norm": 0.02796328440308571, "learning_rate": 0.0002, "loss": 0.5417516231536865, "mean_token_accuracy": 0.778742790222168, "num_tokens": 5020798.0, "step": 308 }, { "entropy": 0.5339494347572327, "epoch": 1.1573033707865168, "grad_norm": 0.031283531337976456, "learning_rate": 0.0002, "loss": 0.5422439575195312, "mean_token_accuracy": 0.7790778428316116, "num_tokens": 5037095.0, "step": 309 }, { "entropy": 0.5599728673696518, "epoch": 1.1610486891385767, "grad_norm": 0.029156681150197983, "learning_rate": 0.0002, "loss": 0.5628546476364136, "mean_token_accuracy": 0.7709409445524216, "num_tokens": 5053556.0, "step": 310 }, { "entropy": 0.5527057945728302, "epoch": 1.1647940074906367, "grad_norm": 0.028000809252262115, "learning_rate": 0.0002, "loss": 0.5457457900047302, "mean_token_accuracy": 0.7764673084020615, "num_tokens": 5069817.0, "step": 311 }, { "entropy": 0.5439251810312271, "epoch": 1.1685393258426966, "grad_norm": 0.027509242296218872, "learning_rate": 0.0002, "loss": 0.5400040149688721, "mean_token_accuracy": 0.7789120823144913, "num_tokens": 5086044.0, "step": 312 }, { "entropy": 0.561322957277298, "epoch": 1.1722846441947565, "grad_norm": 0.030032532289624214, "learning_rate": 0.0002, "loss": 0.5588545799255371, "mean_token_accuracy": 0.7742930203676224, "num_tokens": 5102685.0, "step": 313 }, { "entropy": 0.5458335727453232, "epoch": 1.1760299625468165, "grad_norm": 0.029963059350848198, "learning_rate": 0.0002, "loss": 0.5477938055992126, "mean_token_accuracy": 0.777193009853363, "num_tokens": 5119294.0, "step": 314 }, { "entropy": 0.5545150190591812, "epoch": 1.1797752808988764, "grad_norm": 0.03310168907046318, "learning_rate": 0.0002, "loss": 0.5611361265182495, "mean_token_accuracy": 0.7725827246904373, "num_tokens": 5135795.0, "step": 315 }, { "entropy": 0.5393262058496475, "epoch": 1.1835205992509363, "grad_norm": 0.02876197174191475, "learning_rate": 0.0002, "loss": 0.5395398139953613, "mean_token_accuracy": 0.781178891658783, "num_tokens": 5151936.0, "step": 316 }, { "entropy": 0.5356467962265015, "epoch": 1.1872659176029963, "grad_norm": 0.029216231778264046, "learning_rate": 0.0002, "loss": 0.5275884866714478, "mean_token_accuracy": 0.7844340801239014, "num_tokens": 5168072.0, "step": 317 }, { "entropy": 0.5539442598819733, "epoch": 1.1910112359550562, "grad_norm": 0.029222887009382248, "learning_rate": 0.0002, "loss": 0.5549959540367126, "mean_token_accuracy": 0.7750978469848633, "num_tokens": 5184280.0, "step": 318 }, { "entropy": 0.5316408574581146, "epoch": 1.1947565543071161, "grad_norm": 0.03008115477859974, "learning_rate": 0.0002, "loss": 0.536407470703125, "mean_token_accuracy": 0.7843799740076065, "num_tokens": 5200364.0, "step": 319 }, { "entropy": 0.5335765928030014, "epoch": 1.198501872659176, "grad_norm": 0.030437173321843147, "learning_rate": 0.0002, "loss": 0.5371608734130859, "mean_token_accuracy": 0.7834146469831467, "num_tokens": 5216503.0, "step": 320 }, { "entropy": 0.5507327914237976, "epoch": 1.202247191011236, "grad_norm": 0.030706282705068588, "learning_rate": 0.0002, "loss": 0.5528247356414795, "mean_token_accuracy": 0.7763889282941818, "num_tokens": 5232896.0, "step": 321 }, { "entropy": 0.5600829422473907, "epoch": 1.205992509363296, "grad_norm": 0.03131498023867607, "learning_rate": 0.0002, "loss": 0.559609055519104, "mean_token_accuracy": 0.7688225358724594, "num_tokens": 5249400.0, "step": 322 }, { "entropy": 0.5482848882675171, "epoch": 1.2097378277153559, "grad_norm": 0.030239688232541084, "learning_rate": 0.0002, "loss": 0.5498725771903992, "mean_token_accuracy": 0.7751806825399399, "num_tokens": 5265595.0, "step": 323 }, { "entropy": 0.5517048090696335, "epoch": 1.2134831460674158, "grad_norm": 0.03668053448200226, "learning_rate": 0.0002, "loss": 0.5480911135673523, "mean_token_accuracy": 0.7757556736469269, "num_tokens": 5281774.0, "step": 324 }, { "entropy": 0.5576729625463486, "epoch": 1.2172284644194757, "grad_norm": 0.028534850105643272, "learning_rate": 0.0002, "loss": 0.5513843894004822, "mean_token_accuracy": 0.7748550176620483, "num_tokens": 5297913.0, "step": 325 }, { "entropy": 0.5390013605356216, "epoch": 1.2209737827715357, "grad_norm": 0.03146135434508324, "learning_rate": 0.0002, "loss": 0.539669930934906, "mean_token_accuracy": 0.7778647989034653, "num_tokens": 5314070.0, "step": 326 }, { "entropy": 0.5463844388723373, "epoch": 1.2247191011235956, "grad_norm": 0.03442573919892311, "learning_rate": 0.0002, "loss": 0.5508401393890381, "mean_token_accuracy": 0.774851381778717, "num_tokens": 5330361.0, "step": 327 }, { "entropy": 0.5308734029531479, "epoch": 1.2284644194756553, "grad_norm": 0.03126746043562889, "learning_rate": 0.0002, "loss": 0.5370399951934814, "mean_token_accuracy": 0.7805522531270981, "num_tokens": 5346367.0, "step": 328 }, { "entropy": 0.5443529635667801, "epoch": 1.2322097378277155, "grad_norm": 0.028079699724912643, "learning_rate": 0.0002, "loss": 0.5469828248023987, "mean_token_accuracy": 0.7801272124052048, "num_tokens": 5362795.0, "step": 329 }, { "entropy": 0.5508403033018112, "epoch": 1.2359550561797752, "grad_norm": 0.03308681398630142, "learning_rate": 0.0002, "loss": 0.5537492632865906, "mean_token_accuracy": 0.776117667555809, "num_tokens": 5378892.0, "step": 330 }, { "entropy": 0.547036200761795, "epoch": 1.2397003745318351, "grad_norm": 0.030657080933451653, "learning_rate": 0.0002, "loss": 0.5473320484161377, "mean_token_accuracy": 0.7783585488796234, "num_tokens": 5395182.0, "step": 331 }, { "entropy": 0.5384639650583267, "epoch": 1.243445692883895, "grad_norm": 0.03128959983587265, "learning_rate": 0.0002, "loss": 0.5418936610221863, "mean_token_accuracy": 0.7789008319377899, "num_tokens": 5411728.0, "step": 332 }, { "entropy": 0.5433261394500732, "epoch": 1.247191011235955, "grad_norm": 0.02972225658595562, "learning_rate": 0.0002, "loss": 0.5430710315704346, "mean_token_accuracy": 0.7793088257312775, "num_tokens": 5427990.0, "step": 333 }, { "entropy": 0.5405146926641464, "epoch": 1.250936329588015, "grad_norm": 0.028844943270087242, "learning_rate": 0.0002, "loss": 0.538284957408905, "mean_token_accuracy": 0.7814860939979553, "num_tokens": 5443961.0, "step": 334 }, { "entropy": 0.5582905858755112, "epoch": 1.2546816479400749, "grad_norm": 0.0356195829808712, "learning_rate": 0.0002, "loss": 0.558274507522583, "mean_token_accuracy": 0.772399827837944, "num_tokens": 5460135.0, "step": 335 }, { "entropy": 0.5524656623601913, "epoch": 1.2584269662921348, "grad_norm": 0.02986624464392662, "learning_rate": 0.0002, "loss": 0.5503432750701904, "mean_token_accuracy": 0.7768993377685547, "num_tokens": 5476448.0, "step": 336 }, { "entropy": 0.553261786699295, "epoch": 1.2621722846441947, "grad_norm": 0.03385454788804054, "learning_rate": 0.0002, "loss": 0.5513902902603149, "mean_token_accuracy": 0.7756227403879166, "num_tokens": 5492657.0, "step": 337 }, { "entropy": 0.5534822195768356, "epoch": 1.2659176029962547, "grad_norm": 0.03496600687503815, "learning_rate": 0.0002, "loss": 0.5570470690727234, "mean_token_accuracy": 0.7745380252599716, "num_tokens": 5508936.0, "step": 338 }, { "entropy": 0.5206775590777397, "epoch": 1.2696629213483146, "grad_norm": 0.038312628865242004, "learning_rate": 0.0002, "loss": 0.531387209892273, "mean_token_accuracy": 0.7818328887224197, "num_tokens": 5525150.0, "step": 339 }, { "entropy": 0.5372405052185059, "epoch": 1.2734082397003745, "grad_norm": 0.03226601704955101, "learning_rate": 0.0002, "loss": 0.5414312481880188, "mean_token_accuracy": 0.7806438505649567, "num_tokens": 5541125.0, "step": 340 }, { "entropy": 0.5670074820518494, "epoch": 1.2771535580524345, "grad_norm": 0.032290343195199966, "learning_rate": 0.0002, "loss": 0.5651661157608032, "mean_token_accuracy": 0.768811360001564, "num_tokens": 5557589.0, "step": 341 }, { "entropy": 0.5581976920366287, "epoch": 1.2808988764044944, "grad_norm": 0.035112183541059494, "learning_rate": 0.0002, "loss": 0.5540149211883545, "mean_token_accuracy": 0.7756919115781784, "num_tokens": 5574011.0, "step": 342 }, { "entropy": 0.5480058342218399, "epoch": 1.2846441947565543, "grad_norm": 0.029269572347402573, "learning_rate": 0.0002, "loss": 0.5497134923934937, "mean_token_accuracy": 0.7775010466575623, "num_tokens": 5590227.0, "step": 343 }, { "entropy": 0.5551355630159378, "epoch": 1.2883895131086143, "grad_norm": 0.03512820973992348, "learning_rate": 0.0002, "loss": 0.5613937377929688, "mean_token_accuracy": 0.77100470662117, "num_tokens": 5606436.0, "step": 344 }, { "entropy": 0.5681823641061783, "epoch": 1.2921348314606742, "grad_norm": 0.028890319168567657, "learning_rate": 0.0002, "loss": 0.5653828382492065, "mean_token_accuracy": 0.7733339965343475, "num_tokens": 5622955.0, "step": 345 }, { "entropy": 0.5512849390506744, "epoch": 1.2958801498127341, "grad_norm": 0.03168505057692528, "learning_rate": 0.0002, "loss": 0.5475208759307861, "mean_token_accuracy": 0.778771311044693, "num_tokens": 5639583.0, "step": 346 }, { "entropy": 0.5361000895500183, "epoch": 1.299625468164794, "grad_norm": 0.03995742276310921, "learning_rate": 0.0002, "loss": 0.5435983538627625, "mean_token_accuracy": 0.7801041901111603, "num_tokens": 5655726.0, "step": 347 }, { "entropy": 0.5335006862878799, "epoch": 1.303370786516854, "grad_norm": 0.03385796397924423, "learning_rate": 0.0002, "loss": 0.5360836982727051, "mean_token_accuracy": 0.7803510278463364, "num_tokens": 5671935.0, "step": 348 }, { "entropy": 0.5649213343858719, "epoch": 1.3071161048689137, "grad_norm": 0.03367312625050545, "learning_rate": 0.0002, "loss": 0.5654204487800598, "mean_token_accuracy": 0.7698808759450912, "num_tokens": 5688484.0, "step": 349 }, { "entropy": 0.5636743903160095, "epoch": 1.3108614232209739, "grad_norm": 0.028330491855740547, "learning_rate": 0.0002, "loss": 0.564975380897522, "mean_token_accuracy": 0.769644483923912, "num_tokens": 5704874.0, "step": 350 }, { "entropy": 0.5439984649419785, "epoch": 1.3146067415730336, "grad_norm": 0.030180098488926888, "learning_rate": 0.0002, "loss": 0.540916383266449, "mean_token_accuracy": 0.7806600630283356, "num_tokens": 5721250.0, "step": 351 }, { "entropy": 0.5403287261724472, "epoch": 1.3183520599250937, "grad_norm": 0.03425198793411255, "learning_rate": 0.0002, "loss": 0.5408051609992981, "mean_token_accuracy": 0.7801858931779861, "num_tokens": 5737303.0, "step": 352 }, { "entropy": 0.5534793436527252, "epoch": 1.3220973782771535, "grad_norm": 0.029101019725203514, "learning_rate": 0.0002, "loss": 0.5576366782188416, "mean_token_accuracy": 0.773370087146759, "num_tokens": 5753786.0, "step": 353 }, { "entropy": 0.5410192608833313, "epoch": 1.3258426966292136, "grad_norm": 0.0356539785861969, "learning_rate": 0.0002, "loss": 0.5408055186271667, "mean_token_accuracy": 0.7814153283834457, "num_tokens": 5769926.0, "step": 354 }, { "entropy": 0.5472375005483627, "epoch": 1.3295880149812733, "grad_norm": 0.03288782387971878, "learning_rate": 0.0002, "loss": 0.5537273287773132, "mean_token_accuracy": 0.7744840979576111, "num_tokens": 5785998.0, "step": 355 }, { "entropy": 0.5556980893015862, "epoch": 1.3333333333333333, "grad_norm": 0.038231220096349716, "learning_rate": 0.0002, "loss": 0.558592677116394, "mean_token_accuracy": 0.7744520753622055, "num_tokens": 5802256.0, "step": 356 }, { "entropy": 0.5668211281299591, "epoch": 1.3370786516853932, "grad_norm": 0.02924768440425396, "learning_rate": 0.0002, "loss": 0.5691797733306885, "mean_token_accuracy": 0.7683669775724411, "num_tokens": 5818757.0, "step": 357 }, { "entropy": 0.549320325255394, "epoch": 1.3408239700374531, "grad_norm": 0.03099512681365013, "learning_rate": 0.0002, "loss": 0.551908016204834, "mean_token_accuracy": 0.7755500972270966, "num_tokens": 5835041.0, "step": 358 }, { "entropy": 0.5573329776525497, "epoch": 1.344569288389513, "grad_norm": 0.028519438579678535, "learning_rate": 0.0002, "loss": 0.5581731796264648, "mean_token_accuracy": 0.7729284316301346, "num_tokens": 5851618.0, "step": 359 }, { "entropy": 0.5377827435731888, "epoch": 1.348314606741573, "grad_norm": 0.03338128328323364, "learning_rate": 0.0002, "loss": 0.5362961888313293, "mean_token_accuracy": 0.7824237793684006, "num_tokens": 5867600.0, "step": 360 }, { "entropy": 0.549625426530838, "epoch": 1.352059925093633, "grad_norm": 0.032118137925863266, "learning_rate": 0.0002, "loss": 0.5464169979095459, "mean_token_accuracy": 0.779940128326416, "num_tokens": 5883550.0, "step": 361 }, { "entropy": 0.5563124269247055, "epoch": 1.3558052434456929, "grad_norm": 0.028186708688735962, "learning_rate": 0.0002, "loss": 0.5525781512260437, "mean_token_accuracy": 0.7742565721273422, "num_tokens": 5900020.0, "step": 362 }, { "entropy": 0.5396654903888702, "epoch": 1.3595505617977528, "grad_norm": 0.03306869789958, "learning_rate": 0.0002, "loss": 0.5485842227935791, "mean_token_accuracy": 0.7763185799121857, "num_tokens": 5916563.0, "step": 363 }, { "entropy": 0.5324016958475113, "epoch": 1.3632958801498127, "grad_norm": 0.030485033988952637, "learning_rate": 0.0002, "loss": 0.5407555103302002, "mean_token_accuracy": 0.7805987000465393, "num_tokens": 5932915.0, "step": 364 }, { "entropy": 0.5415676534175873, "epoch": 1.3670411985018727, "grad_norm": 0.032210033386945724, "learning_rate": 0.0002, "loss": 0.5420053601264954, "mean_token_accuracy": 0.7789227366447449, "num_tokens": 5949294.0, "step": 365 }, { "entropy": 0.5479710251092911, "epoch": 1.3707865168539326, "grad_norm": 0.030770668759942055, "learning_rate": 0.0002, "loss": 0.5442653894424438, "mean_token_accuracy": 0.7809406220912933, "num_tokens": 5965688.0, "step": 366 }, { "entropy": 0.5611272603273392, "epoch": 1.3745318352059925, "grad_norm": 0.030032480135560036, "learning_rate": 0.0002, "loss": 0.5458992719650269, "mean_token_accuracy": 0.7793887704610825, "num_tokens": 5982353.0, "step": 367 }, { "entropy": 0.5711783468723297, "epoch": 1.3782771535580525, "grad_norm": 0.030471278354525566, "learning_rate": 0.0002, "loss": 0.5689231157302856, "mean_token_accuracy": 0.7691554129123688, "num_tokens": 5998928.0, "step": 368 }, { "entropy": 0.5704734623432159, "epoch": 1.3820224719101124, "grad_norm": 0.0308744665235281, "learning_rate": 0.0002, "loss": 0.5704200267791748, "mean_token_accuracy": 0.7696904093027115, "num_tokens": 6015488.0, "step": 369 }, { "entropy": 0.540970042347908, "epoch": 1.3857677902621723, "grad_norm": 0.029789667576551437, "learning_rate": 0.0002, "loss": 0.5435522794723511, "mean_token_accuracy": 0.7803212404251099, "num_tokens": 6032273.0, "step": 370 }, { "entropy": 0.5323564112186432, "epoch": 1.3895131086142323, "grad_norm": 0.03373701870441437, "learning_rate": 0.0002, "loss": 0.5415207147598267, "mean_token_accuracy": 0.7777475565671921, "num_tokens": 6048761.0, "step": 371 }, { "entropy": 0.5275064408779144, "epoch": 1.3932584269662922, "grad_norm": 0.03547370806336403, "learning_rate": 0.0002, "loss": 0.540917694568634, "mean_token_accuracy": 0.7795429080724716, "num_tokens": 6064848.0, "step": 372 }, { "entropy": 0.5497806072235107, "epoch": 1.3970037453183521, "grad_norm": 0.03201119974255562, "learning_rate": 0.0002, "loss": 0.552889347076416, "mean_token_accuracy": 0.7745427489280701, "num_tokens": 6081258.0, "step": 373 }, { "entropy": 0.5175323188304901, "epoch": 1.4007490636704119, "grad_norm": 0.03368834778666496, "learning_rate": 0.0002, "loss": 0.5198505520820618, "mean_token_accuracy": 0.7878732234239578, "num_tokens": 6097172.0, "step": 374 }, { "entropy": 0.5441398918628693, "epoch": 1.404494382022472, "grad_norm": 0.03139437735080719, "learning_rate": 0.0002, "loss": 0.5445310473442078, "mean_token_accuracy": 0.780688688158989, "num_tokens": 6113446.0, "step": 375 }, { "entropy": 0.5468717068433762, "epoch": 1.4082397003745317, "grad_norm": 0.03169120475649834, "learning_rate": 0.0002, "loss": 0.5426516532897949, "mean_token_accuracy": 0.776495024561882, "num_tokens": 6129738.0, "step": 376 }, { "entropy": 0.5554005056619644, "epoch": 1.4119850187265919, "grad_norm": 0.03649836778640747, "learning_rate": 0.0002, "loss": 0.5584489703178406, "mean_token_accuracy": 0.7743981927633286, "num_tokens": 6146138.0, "step": 377 }, { "entropy": 0.545359656214714, "epoch": 1.4157303370786516, "grad_norm": 0.0333530455827713, "learning_rate": 0.0002, "loss": 0.547561526298523, "mean_token_accuracy": 0.7772817760705948, "num_tokens": 6162466.0, "step": 378 }, { "entropy": 0.5366268008947372, "epoch": 1.4194756554307117, "grad_norm": 0.0315176397562027, "learning_rate": 0.0002, "loss": 0.5370338559150696, "mean_token_accuracy": 0.7830789685249329, "num_tokens": 6178827.0, "step": 379 }, { "entropy": 0.5343760550022125, "epoch": 1.4232209737827715, "grad_norm": 0.03283468633890152, "learning_rate": 0.0002, "loss": 0.5403618812561035, "mean_token_accuracy": 0.7811573594808578, "num_tokens": 6195014.0, "step": 380 }, { "entropy": 0.5374447852373123, "epoch": 1.4269662921348314, "grad_norm": 0.03712209314107895, "learning_rate": 0.0002, "loss": 0.5359081625938416, "mean_token_accuracy": 0.7824594676494598, "num_tokens": 6211204.0, "step": 381 }, { "entropy": 0.5647163391113281, "epoch": 1.4307116104868913, "grad_norm": 0.030612658709287643, "learning_rate": 0.0002, "loss": 0.5665347576141357, "mean_token_accuracy": 0.7709782868623734, "num_tokens": 6227439.0, "step": 382 }, { "entropy": 0.5584586560726166, "epoch": 1.4344569288389513, "grad_norm": 0.03545604646205902, "learning_rate": 0.0002, "loss": 0.5592620372772217, "mean_token_accuracy": 0.7708311080932617, "num_tokens": 6243909.0, "step": 383 }, { "entropy": 0.5563389509916306, "epoch": 1.4382022471910112, "grad_norm": 0.031707633286714554, "learning_rate": 0.0002, "loss": 0.5574153065681458, "mean_token_accuracy": 0.7749636173248291, "num_tokens": 6260228.0, "step": 384 }, { "entropy": 0.5361679270863533, "epoch": 1.4419475655430711, "grad_norm": 0.030576881021261215, "learning_rate": 0.0002, "loss": 0.5358593463897705, "mean_token_accuracy": 0.7815472632646561, "num_tokens": 6276438.0, "step": 385 }, { "entropy": 0.5404613763093948, "epoch": 1.445692883895131, "grad_norm": 0.0397074818611145, "learning_rate": 0.0002, "loss": 0.5409061908721924, "mean_token_accuracy": 0.7812814116477966, "num_tokens": 6292854.0, "step": 386 }, { "entropy": 0.5539507865905762, "epoch": 1.449438202247191, "grad_norm": 0.027634674683213234, "learning_rate": 0.0002, "loss": 0.551899254322052, "mean_token_accuracy": 0.7763891369104385, "num_tokens": 6309146.0, "step": 387 }, { "entropy": 0.5406185388565063, "epoch": 1.453183520599251, "grad_norm": 0.03658418357372284, "learning_rate": 0.0002, "loss": 0.5376873016357422, "mean_token_accuracy": 0.7802905589342117, "num_tokens": 6325371.0, "step": 388 }, { "entropy": 0.5515788942575455, "epoch": 1.4569288389513109, "grad_norm": 0.029648393392562866, "learning_rate": 0.0002, "loss": 0.5481655597686768, "mean_token_accuracy": 0.7753021568059921, "num_tokens": 6341504.0, "step": 389 }, { "entropy": 0.5403069257736206, "epoch": 1.4606741573033708, "grad_norm": 0.0300885122269392, "learning_rate": 0.0002, "loss": 0.5417286157608032, "mean_token_accuracy": 0.7805690169334412, "num_tokens": 6357574.0, "step": 390 }, { "entropy": 0.5320965051651001, "epoch": 1.4644194756554307, "grad_norm": 0.04233168438076973, "learning_rate": 0.0002, "loss": 0.542140543460846, "mean_token_accuracy": 0.7790813148021698, "num_tokens": 6373603.0, "step": 391 }, { "entropy": 0.5370313972234726, "epoch": 1.4681647940074907, "grad_norm": 0.03608033061027527, "learning_rate": 0.0002, "loss": 0.5452749133110046, "mean_token_accuracy": 0.7784496247768402, "num_tokens": 6389874.0, "step": 392 }, { "entropy": 0.5391117632389069, "epoch": 1.4719101123595506, "grad_norm": 0.044416990131139755, "learning_rate": 0.0002, "loss": 0.5447070598602295, "mean_token_accuracy": 0.7758590877056122, "num_tokens": 6406014.0, "step": 393 }, { "entropy": 0.5536396950483322, "epoch": 1.4756554307116105, "grad_norm": 0.028598185628652573, "learning_rate": 0.0002, "loss": 0.5509454011917114, "mean_token_accuracy": 0.7754955738782883, "num_tokens": 6422526.0, "step": 394 }, { "entropy": 0.5600528717041016, "epoch": 1.4794007490636705, "grad_norm": 0.03587036579847336, "learning_rate": 0.0002, "loss": 0.5511722564697266, "mean_token_accuracy": 0.7756818234920502, "num_tokens": 6438826.0, "step": 395 }, { "entropy": 0.5635561943054199, "epoch": 1.4831460674157304, "grad_norm": 0.04037458822131157, "learning_rate": 0.0002, "loss": 0.5569745898246765, "mean_token_accuracy": 0.7768395692110062, "num_tokens": 6455392.0, "step": 396 }, { "entropy": 0.5546122640371323, "epoch": 1.4868913857677903, "grad_norm": 0.03193597123026848, "learning_rate": 0.0002, "loss": 0.5528469085693359, "mean_token_accuracy": 0.7737569063901901, "num_tokens": 6471908.0, "step": 397 }, { "entropy": 0.540926069021225, "epoch": 1.4906367041198503, "grad_norm": 0.03908224403858185, "learning_rate": 0.0002, "loss": 0.5521141290664673, "mean_token_accuracy": 0.7775756865739822, "num_tokens": 6487958.0, "step": 398 }, { "entropy": 0.5474519431591034, "epoch": 1.49438202247191, "grad_norm": 0.04104601964354515, "learning_rate": 0.0002, "loss": 0.5533535480499268, "mean_token_accuracy": 0.7748162597417831, "num_tokens": 6504634.0, "step": 399 }, { "entropy": 0.5560764372348785, "epoch": 1.4981273408239701, "grad_norm": 0.0360972136259079, "learning_rate": 0.0002, "loss": 0.5614410042762756, "mean_token_accuracy": 0.770107239484787, "num_tokens": 6521072.0, "step": 400 }, { "entropy": 0.5673471540212631, "epoch": 1.5018726591760299, "grad_norm": 0.04004177823662758, "learning_rate": 0.0002, "loss": 0.5589927434921265, "mean_token_accuracy": 0.7734557241201401, "num_tokens": 6537361.0, "step": 401 }, { "entropy": 0.5486087501049042, "epoch": 1.50561797752809, "grad_norm": 0.030557790771126747, "learning_rate": 0.0002, "loss": 0.5393815040588379, "mean_token_accuracy": 0.7784638553857803, "num_tokens": 6553620.0, "step": 402 }, { "entropy": 0.5486248284578323, "epoch": 1.5093632958801497, "grad_norm": 0.03941396623849869, "learning_rate": 0.0002, "loss": 0.5509032011032104, "mean_token_accuracy": 0.7800426781177521, "num_tokens": 6569936.0, "step": 403 }, { "entropy": 0.558304026722908, "epoch": 1.5131086142322099, "grad_norm": 0.03858976438641548, "learning_rate": 0.0002, "loss": 0.566615104675293, "mean_token_accuracy": 0.7677357494831085, "num_tokens": 6586223.0, "step": 404 }, { "entropy": 0.5375211238861084, "epoch": 1.5168539325842696, "grad_norm": 0.0333857461810112, "learning_rate": 0.0002, "loss": 0.546052873134613, "mean_token_accuracy": 0.779136061668396, "num_tokens": 6602626.0, "step": 405 }, { "entropy": 0.545025646686554, "epoch": 1.5205992509363297, "grad_norm": 0.03882851079106331, "learning_rate": 0.0002, "loss": 0.5526992678642273, "mean_token_accuracy": 0.7757603526115417, "num_tokens": 6618970.0, "step": 406 }, { "entropy": 0.5616021603345871, "epoch": 1.5243445692883895, "grad_norm": 0.029704444110393524, "learning_rate": 0.0002, "loss": 0.5617290139198303, "mean_token_accuracy": 0.771888017654419, "num_tokens": 6635712.0, "step": 407 }, { "entropy": 0.5517143756151199, "epoch": 1.5280898876404494, "grad_norm": 0.029841486364603043, "learning_rate": 0.0002, "loss": 0.5455192923545837, "mean_token_accuracy": 0.7790273427963257, "num_tokens": 6652005.0, "step": 408 }, { "entropy": 0.5481491684913635, "epoch": 1.5318352059925093, "grad_norm": 0.03239016607403755, "learning_rate": 0.0002, "loss": 0.5448024272918701, "mean_token_accuracy": 0.7801620662212372, "num_tokens": 6668365.0, "step": 409 }, { "entropy": 0.5385047048330307, "epoch": 1.5355805243445693, "grad_norm": 0.029611637815833092, "learning_rate": 0.0002, "loss": 0.5335633754730225, "mean_token_accuracy": 0.785701259970665, "num_tokens": 6684708.0, "step": 410 }, { "entropy": 0.558298259973526, "epoch": 1.5393258426966292, "grad_norm": 0.030493013560771942, "learning_rate": 0.0002, "loss": 0.5560066103935242, "mean_token_accuracy": 0.7725876718759537, "num_tokens": 6701142.0, "step": 411 }, { "entropy": 0.5395427197217941, "epoch": 1.5430711610486891, "grad_norm": 0.032578032463788986, "learning_rate": 0.0002, "loss": 0.5449746251106262, "mean_token_accuracy": 0.7762585133314133, "num_tokens": 6717233.0, "step": 412 }, { "entropy": 0.5387013256549835, "epoch": 1.546816479400749, "grad_norm": 0.0333687961101532, "learning_rate": 0.0002, "loss": 0.5403171181678772, "mean_token_accuracy": 0.7810612767934799, "num_tokens": 6733228.0, "step": 413 }, { "entropy": 0.5673456788063049, "epoch": 1.550561797752809, "grad_norm": 0.036015916615724564, "learning_rate": 0.0002, "loss": 0.5735532641410828, "mean_token_accuracy": 0.7664827108383179, "num_tokens": 6749423.0, "step": 414 }, { "entropy": 0.5494605153799057, "epoch": 1.554307116104869, "grad_norm": 0.02719104290008545, "learning_rate": 0.0002, "loss": 0.5493685007095337, "mean_token_accuracy": 0.776999905705452, "num_tokens": 6765893.0, "step": 415 }, { "entropy": 0.5593840181827545, "epoch": 1.5580524344569289, "grad_norm": 0.03425523266196251, "learning_rate": 0.0002, "loss": 0.5553128719329834, "mean_token_accuracy": 0.7735365033149719, "num_tokens": 6782271.0, "step": 416 }, { "entropy": 0.5617495179176331, "epoch": 1.5617977528089888, "grad_norm": 0.032372213900089264, "learning_rate": 0.0002, "loss": 0.5606021881103516, "mean_token_accuracy": 0.7721095532178879, "num_tokens": 6798813.0, "step": 417 }, { "entropy": 0.5550025552511215, "epoch": 1.5655430711610487, "grad_norm": 0.029182737693190575, "learning_rate": 0.0002, "loss": 0.5564966201782227, "mean_token_accuracy": 0.7731625586748123, "num_tokens": 6815405.0, "step": 418 }, { "entropy": 0.5605382174253464, "epoch": 1.5692883895131087, "grad_norm": 0.030886612832546234, "learning_rate": 0.0002, "loss": 0.5631057024002075, "mean_token_accuracy": 0.7716924250125885, "num_tokens": 6831974.0, "step": 419 }, { "entropy": 0.5414248704910278, "epoch": 1.5730337078651684, "grad_norm": 0.03267752379179001, "learning_rate": 0.0002, "loss": 0.5522453188896179, "mean_token_accuracy": 0.7731709033250809, "num_tokens": 6848314.0, "step": 420 }, { "entropy": 0.5514931678771973, "epoch": 1.5767790262172285, "grad_norm": 0.03168710321187973, "learning_rate": 0.0002, "loss": 0.5525091886520386, "mean_token_accuracy": 0.7754202336072922, "num_tokens": 6864671.0, "step": 421 }, { "entropy": 0.5639499425888062, "epoch": 1.5805243445692883, "grad_norm": 0.032651759684085846, "learning_rate": 0.0002, "loss": 0.5697652697563171, "mean_token_accuracy": 0.7682019621133804, "num_tokens": 6881061.0, "step": 422 }, { "entropy": 0.5544054210186005, "epoch": 1.5842696629213484, "grad_norm": 0.03449453413486481, "learning_rate": 0.0002, "loss": 0.5507102012634277, "mean_token_accuracy": 0.775859922170639, "num_tokens": 6897314.0, "step": 423 }, { "entropy": 0.5711345225572586, "epoch": 1.5880149812734081, "grad_norm": 0.03847847133874893, "learning_rate": 0.0002, "loss": 0.5732009410858154, "mean_token_accuracy": 0.7667471021413803, "num_tokens": 6913609.0, "step": 424 }, { "entropy": 0.5389959663152695, "epoch": 1.5917602996254683, "grad_norm": 0.03514353558421135, "learning_rate": 0.0002, "loss": 0.5444454550743103, "mean_token_accuracy": 0.7799976915121078, "num_tokens": 6929936.0, "step": 425 }, { "entropy": 0.5668403804302216, "epoch": 1.595505617977528, "grad_norm": 0.035787779837846756, "learning_rate": 0.0002, "loss": 0.5658587217330933, "mean_token_accuracy": 0.7714453637599945, "num_tokens": 6946824.0, "step": 426 }, { "entropy": 0.5508380085229874, "epoch": 1.5992509363295881, "grad_norm": 0.03445902094244957, "learning_rate": 0.0002, "loss": 0.5547541975975037, "mean_token_accuracy": 0.7770363837480545, "num_tokens": 6962968.0, "step": 427 }, { "entropy": 0.5622916221618652, "epoch": 1.6029962546816479, "grad_norm": 0.033641569316387177, "learning_rate": 0.0002, "loss": 0.5611415505409241, "mean_token_accuracy": 0.7717165648937225, "num_tokens": 6979281.0, "step": 428 }, { "entropy": 0.5456431210041046, "epoch": 1.606741573033708, "grad_norm": 0.030943863093852997, "learning_rate": 0.0002, "loss": 0.5433369278907776, "mean_token_accuracy": 0.77703957259655, "num_tokens": 6995448.0, "step": 429 }, { "entropy": 0.5349363088607788, "epoch": 1.6104868913857677, "grad_norm": 0.029584866017103195, "learning_rate": 0.0002, "loss": 0.528792142868042, "mean_token_accuracy": 0.7852742522954941, "num_tokens": 7011578.0, "step": 430 }, { "entropy": 0.52534219622612, "epoch": 1.6142322097378277, "grad_norm": 0.031122464686632156, "learning_rate": 0.0002, "loss": 0.5248501300811768, "mean_token_accuracy": 0.7855943292379379, "num_tokens": 7027819.0, "step": 431 }, { "entropy": 0.5471996814012527, "epoch": 1.6179775280898876, "grad_norm": 0.03317458927631378, "learning_rate": 0.0002, "loss": 0.5547217726707458, "mean_token_accuracy": 0.776124969124794, "num_tokens": 7044215.0, "step": 432 }, { "entropy": 0.5501783192157745, "epoch": 1.6217228464419475, "grad_norm": 0.028514394536614418, "learning_rate": 0.0002, "loss": 0.5524763464927673, "mean_token_accuracy": 0.773967519402504, "num_tokens": 7060557.0, "step": 433 }, { "entropy": 0.5516121089458466, "epoch": 1.6254681647940075, "grad_norm": 0.037680886685848236, "learning_rate": 0.0002, "loss": 0.5547643899917603, "mean_token_accuracy": 0.7772052437067032, "num_tokens": 7076827.0, "step": 434 }, { "entropy": 0.5446216315031052, "epoch": 1.6292134831460674, "grad_norm": 0.025961318984627724, "learning_rate": 0.0002, "loss": 0.540472149848938, "mean_token_accuracy": 0.7827950567007065, "num_tokens": 7093240.0, "step": 435 }, { "entropy": 0.5542737692594528, "epoch": 1.6329588014981273, "grad_norm": 0.03385328873991966, "learning_rate": 0.0002, "loss": 0.5622321963310242, "mean_token_accuracy": 0.7715137451887131, "num_tokens": 7109763.0, "step": 436 }, { "entropy": 0.5479970276355743, "epoch": 1.6367041198501873, "grad_norm": 0.027666250243782997, "learning_rate": 0.0002, "loss": 0.5450934767723083, "mean_token_accuracy": 0.7789344042539597, "num_tokens": 7125965.0, "step": 437 }, { "entropy": 0.5606249123811722, "epoch": 1.6404494382022472, "grad_norm": 0.028965814039111137, "learning_rate": 0.0002, "loss": 0.5618120431900024, "mean_token_accuracy": 0.7737310230731964, "num_tokens": 7142275.0, "step": 438 }, { "entropy": 0.5434140264987946, "epoch": 1.6441947565543071, "grad_norm": 0.03233455866575241, "learning_rate": 0.0002, "loss": 0.5448483824729919, "mean_token_accuracy": 0.7776681929826736, "num_tokens": 7158681.0, "step": 439 }, { "entropy": 0.5462686270475388, "epoch": 1.647940074906367, "grad_norm": 0.030159825459122658, "learning_rate": 0.0002, "loss": 0.5512958765029907, "mean_token_accuracy": 0.7788191735744476, "num_tokens": 7174999.0, "step": 440 }, { "entropy": 0.5655659884214401, "epoch": 1.651685393258427, "grad_norm": 0.0356375053524971, "learning_rate": 0.0002, "loss": 0.5668036937713623, "mean_token_accuracy": 0.7672240734100342, "num_tokens": 7191451.0, "step": 441 }, { "entropy": 0.5439184606075287, "epoch": 1.655430711610487, "grad_norm": 0.03394126892089844, "learning_rate": 0.0002, "loss": 0.5443013906478882, "mean_token_accuracy": 0.7794349491596222, "num_tokens": 7207657.0, "step": 442 }, { "entropy": 0.5462498217821121, "epoch": 1.6591760299625467, "grad_norm": 0.03115757368505001, "learning_rate": 0.0002, "loss": 0.5484351515769958, "mean_token_accuracy": 0.7759426087141037, "num_tokens": 7223926.0, "step": 443 }, { "entropy": 0.5479519367218018, "epoch": 1.6629213483146068, "grad_norm": 0.03686544671654701, "learning_rate": 0.0002, "loss": 0.5487886071205139, "mean_token_accuracy": 0.7793583422899246, "num_tokens": 7239926.0, "step": 444 }, { "entropy": 0.5571880787611008, "epoch": 1.6666666666666665, "grad_norm": 0.029902130365371704, "learning_rate": 0.0002, "loss": 0.5566808581352234, "mean_token_accuracy": 0.7738562673330307, "num_tokens": 7256365.0, "step": 445 }, { "entropy": 0.5606496781110764, "epoch": 1.6704119850187267, "grad_norm": 0.03581070154905319, "learning_rate": 0.0002, "loss": 0.5646023750305176, "mean_token_accuracy": 0.7700021713972092, "num_tokens": 7272415.0, "step": 446 }, { "entropy": 0.5493645370006561, "epoch": 1.6741573033707864, "grad_norm": 0.034732386469841, "learning_rate": 0.0002, "loss": 0.5556433796882629, "mean_token_accuracy": 0.7724722027778625, "num_tokens": 7288442.0, "step": 447 }, { "entropy": 0.5454504191875458, "epoch": 1.6779026217228465, "grad_norm": 0.031994741410017014, "learning_rate": 0.0002, "loss": 0.5455131530761719, "mean_token_accuracy": 0.7786727547645569, "num_tokens": 7304778.0, "step": 448 }, { "entropy": 0.5480805784463882, "epoch": 1.6816479400749063, "grad_norm": 0.029919426888227463, "learning_rate": 0.0002, "loss": 0.5464503765106201, "mean_token_accuracy": 0.7800304591655731, "num_tokens": 7320989.0, "step": 449 }, { "entropy": 0.5258940905332565, "epoch": 1.6853932584269664, "grad_norm": 0.032200053334236145, "learning_rate": 0.0002, "loss": 0.5228010416030884, "mean_token_accuracy": 0.7870291918516159, "num_tokens": 7337145.0, "step": 450 }, { "entropy": 0.545346587896347, "epoch": 1.6891385767790261, "grad_norm": 0.037810057401657104, "learning_rate": 0.0002, "loss": 0.5497158765792847, "mean_token_accuracy": 0.7733957171440125, "num_tokens": 7353380.0, "step": 451 }, { "entropy": 0.5455152243375778, "epoch": 1.6928838951310863, "grad_norm": 0.036783650517463684, "learning_rate": 0.0002, "loss": 0.547383725643158, "mean_token_accuracy": 0.7792070508003235, "num_tokens": 7369718.0, "step": 452 }, { "entropy": 0.5610679686069489, "epoch": 1.696629213483146, "grad_norm": 0.032883401960134506, "learning_rate": 0.0002, "loss": 0.5691272616386414, "mean_token_accuracy": 0.7677329927682877, "num_tokens": 7385896.0, "step": 453 }, { "entropy": 0.5505604892969131, "epoch": 1.7003745318352061, "grad_norm": 0.03284638375043869, "learning_rate": 0.0002, "loss": 0.5511571168899536, "mean_token_accuracy": 0.7760978639125824, "num_tokens": 7402228.0, "step": 454 }, { "entropy": 0.5650221109390259, "epoch": 1.7041198501872659, "grad_norm": 0.02887006103992462, "learning_rate": 0.0002, "loss": 0.5633357763290405, "mean_token_accuracy": 0.7709190398454666, "num_tokens": 7418506.0, "step": 455 }, { "entropy": 0.5511359125375748, "epoch": 1.7078651685393258, "grad_norm": 0.02897547371685505, "learning_rate": 0.0002, "loss": 0.5476655960083008, "mean_token_accuracy": 0.7766725867986679, "num_tokens": 7434993.0, "step": 456 }, { "entropy": 0.5589297413825989, "epoch": 1.7116104868913857, "grad_norm": 0.03913537412881851, "learning_rate": 0.0002, "loss": 0.562713623046875, "mean_token_accuracy": 0.7716452181339264, "num_tokens": 7451420.0, "step": 457 }, { "entropy": 0.5587479770183563, "epoch": 1.7153558052434457, "grad_norm": 0.0281817764043808, "learning_rate": 0.0002, "loss": 0.5552535057067871, "mean_token_accuracy": 0.7717525810003281, "num_tokens": 7467745.0, "step": 458 }, { "entropy": 0.5426507443189621, "epoch": 1.7191011235955056, "grad_norm": 0.03837720304727554, "learning_rate": 0.0002, "loss": 0.5466030836105347, "mean_token_accuracy": 0.7787178158760071, "num_tokens": 7484044.0, "step": 459 }, { "entropy": 0.548772931098938, "epoch": 1.7228464419475655, "grad_norm": 0.034067291766405106, "learning_rate": 0.0002, "loss": 0.5531357526779175, "mean_token_accuracy": 0.7748309075832367, "num_tokens": 7500332.0, "step": 460 }, { "entropy": 0.5564078390598297, "epoch": 1.7265917602996255, "grad_norm": 0.03204013407230377, "learning_rate": 0.0002, "loss": 0.5560243725776672, "mean_token_accuracy": 0.7740551978349686, "num_tokens": 7516660.0, "step": 461 }, { "entropy": 0.5405488759279251, "epoch": 1.7303370786516854, "grad_norm": 0.030630316585302353, "learning_rate": 0.0002, "loss": 0.5395958423614502, "mean_token_accuracy": 0.7782745659351349, "num_tokens": 7532934.0, "step": 462 }, { "entropy": 0.5496814846992493, "epoch": 1.7340823970037453, "grad_norm": 0.03725660592317581, "learning_rate": 0.0002, "loss": 0.5496969223022461, "mean_token_accuracy": 0.7755606472492218, "num_tokens": 7549291.0, "step": 463 }, { "entropy": 0.5522442013025284, "epoch": 1.7378277153558053, "grad_norm": 0.039360832422971725, "learning_rate": 0.0002, "loss": 0.5475296378135681, "mean_token_accuracy": 0.7740370631217957, "num_tokens": 7565370.0, "step": 464 }, { "entropy": 0.5205198004841805, "epoch": 1.7415730337078652, "grad_norm": 0.029320131987333298, "learning_rate": 0.0002, "loss": 0.5181597471237183, "mean_token_accuracy": 0.789748415350914, "num_tokens": 7581731.0, "step": 465 }, { "entropy": 0.5322981476783752, "epoch": 1.7453183520599251, "grad_norm": 0.03633226826786995, "learning_rate": 0.0002, "loss": 0.5413781404495239, "mean_token_accuracy": 0.7808037847280502, "num_tokens": 7597822.0, "step": 466 }, { "entropy": 0.524602085351944, "epoch": 1.749063670411985, "grad_norm": 0.04402731731534004, "learning_rate": 0.0002, "loss": 0.532406210899353, "mean_token_accuracy": 0.7855067849159241, "num_tokens": 7613933.0, "step": 467 }, { "entropy": 0.5708600282669067, "epoch": 1.7528089887640448, "grad_norm": 0.0357418954372406, "learning_rate": 0.0002, "loss": 0.5712512731552124, "mean_token_accuracy": 0.7683784365653992, "num_tokens": 7630331.0, "step": 468 }, { "entropy": 0.5579233318567276, "epoch": 1.756554307116105, "grad_norm": 0.15994992852210999, "learning_rate": 0.0002, "loss": 0.5615707635879517, "mean_token_accuracy": 0.7749305069446564, "num_tokens": 7646666.0, "step": 469 }, { "entropy": 0.5672501176595688, "epoch": 1.7602996254681647, "grad_norm": 0.18223144114017487, "learning_rate": 0.0002, "loss": 0.5922040939331055, "mean_token_accuracy": 0.767003208398819, "num_tokens": 7663024.0, "step": 470 }, { "entropy": 0.5853898674249649, "epoch": 1.7640449438202248, "grad_norm": 0.19322983920574188, "learning_rate": 0.0002, "loss": 0.5716003179550171, "mean_token_accuracy": 0.7706755697727203, "num_tokens": 7679445.0, "step": 471 }, { "entropy": 0.5652599781751633, "epoch": 1.7677902621722845, "grad_norm": 0.040028076618909836, "learning_rate": 0.0002, "loss": 0.5545145869255066, "mean_token_accuracy": 0.7762533873319626, "num_tokens": 7695863.0, "step": 472 }, { "entropy": 0.5655337423086166, "epoch": 1.7715355805243447, "grad_norm": 0.03808818385004997, "learning_rate": 0.0002, "loss": 0.5697377324104309, "mean_token_accuracy": 0.7698807120323181, "num_tokens": 7712117.0, "step": 473 }, { "entropy": 0.531586229801178, "epoch": 1.7752808988764044, "grad_norm": 0.03700399026274681, "learning_rate": 0.0002, "loss": 0.5407450199127197, "mean_token_accuracy": 0.7823738306760788, "num_tokens": 7728324.0, "step": 474 }, { "entropy": 0.5400687605142593, "epoch": 1.7790262172284645, "grad_norm": 0.04493065923452377, "learning_rate": 0.0002, "loss": 0.5463284254074097, "mean_token_accuracy": 0.778341680765152, "num_tokens": 7744642.0, "step": 475 }, { "entropy": 0.5348718762397766, "epoch": 1.7827715355805243, "grad_norm": 0.032796818763017654, "learning_rate": 0.0002, "loss": 0.53885817527771, "mean_token_accuracy": 0.7798904478549957, "num_tokens": 7761144.0, "step": 476 }, { "entropy": 0.5612788051366806, "epoch": 1.7865168539325844, "grad_norm": 0.03454861417412758, "learning_rate": 0.0002, "loss": 0.5585771799087524, "mean_token_accuracy": 0.7730214893817902, "num_tokens": 7777603.0, "step": 477 }, { "entropy": 0.5655092746019363, "epoch": 1.7902621722846441, "grad_norm": 0.04326882213354111, "learning_rate": 0.0002, "loss": 0.5594231486320496, "mean_token_accuracy": 0.7714511156082153, "num_tokens": 7794017.0, "step": 478 }, { "entropy": 0.5740013867616653, "epoch": 1.7940074906367043, "grad_norm": 0.03586514666676521, "learning_rate": 0.0002, "loss": 0.5665684342384338, "mean_token_accuracy": 0.7693835347890854, "num_tokens": 7810410.0, "step": 479 }, { "entropy": 0.5689022541046143, "epoch": 1.797752808988764, "grad_norm": 0.03453454375267029, "learning_rate": 0.0002, "loss": 0.5640177130699158, "mean_token_accuracy": 0.7688567489385605, "num_tokens": 7826878.0, "step": 480 }, { "entropy": 0.5344455689191818, "epoch": 1.801498127340824, "grad_norm": 0.04154738038778305, "learning_rate": 0.0002, "loss": 0.5412873029708862, "mean_token_accuracy": 0.7843961417675018, "num_tokens": 7842957.0, "step": 481 }, { "entropy": 0.5326808393001556, "epoch": 1.8052434456928839, "grad_norm": 0.03772249072790146, "learning_rate": 0.0002, "loss": 0.5458777546882629, "mean_token_accuracy": 0.7775137424468994, "num_tokens": 7859243.0, "step": 482 }, { "entropy": 0.552602618932724, "epoch": 1.8089887640449438, "grad_norm": 0.03419940546154976, "learning_rate": 0.0002, "loss": 0.5563470721244812, "mean_token_accuracy": 0.7756804972887039, "num_tokens": 7875641.0, "step": 483 }, { "entropy": 0.5412130802869797, "epoch": 1.8127340823970037, "grad_norm": 0.033059973269701004, "learning_rate": 0.0002, "loss": 0.540538489818573, "mean_token_accuracy": 0.782319188117981, "num_tokens": 7891954.0, "step": 484 }, { "entropy": 0.5559896975755692, "epoch": 1.8164794007490637, "grad_norm": 0.03472665324807167, "learning_rate": 0.0002, "loss": 0.5544817447662354, "mean_token_accuracy": 0.7753840684890747, "num_tokens": 7908283.0, "step": 485 }, { "entropy": 0.5695093274116516, "epoch": 1.8202247191011236, "grad_norm": 0.0319642499089241, "learning_rate": 0.0002, "loss": 0.5608171224594116, "mean_token_accuracy": 0.7743540853261948, "num_tokens": 7924627.0, "step": 486 }, { "entropy": 0.5412854105234146, "epoch": 1.8239700374531835, "grad_norm": 0.032578784972429276, "learning_rate": 0.0002, "loss": 0.5386444330215454, "mean_token_accuracy": 0.7795344591140747, "num_tokens": 7940814.0, "step": 487 }, { "entropy": 0.5442286729812622, "epoch": 1.8277153558052435, "grad_norm": 0.03279658779501915, "learning_rate": 0.0002, "loss": 0.553512454032898, "mean_token_accuracy": 0.7744518220424652, "num_tokens": 7957133.0, "step": 488 }, { "entropy": 0.544167771935463, "epoch": 1.8314606741573034, "grad_norm": 0.034980904310941696, "learning_rate": 0.0002, "loss": 0.5495878458023071, "mean_token_accuracy": 0.7794477045536041, "num_tokens": 7973367.0, "step": 489 }, { "entropy": 0.5514913648366928, "epoch": 1.8352059925093633, "grad_norm": 0.0437743179500103, "learning_rate": 0.0002, "loss": 0.5581385493278503, "mean_token_accuracy": 0.7734484821557999, "num_tokens": 7989443.0, "step": 490 }, { "entropy": 0.5721138119697571, "epoch": 1.8389513108614233, "grad_norm": 0.032419200986623764, "learning_rate": 0.0002, "loss": 0.5644645094871521, "mean_token_accuracy": 0.7717173397541046, "num_tokens": 8005817.0, "step": 491 }, { "entropy": 0.5577604025602341, "epoch": 1.8426966292134832, "grad_norm": 0.04115711897611618, "learning_rate": 0.0002, "loss": 0.5619987845420837, "mean_token_accuracy": 0.77156862616539, "num_tokens": 8022160.0, "step": 492 }, { "entropy": 0.5528861582279205, "epoch": 1.846441947565543, "grad_norm": 0.029432786628603935, "learning_rate": 0.0002, "loss": 0.5476526618003845, "mean_token_accuracy": 0.7781069427728653, "num_tokens": 8038591.0, "step": 493 }, { "entropy": 0.5558982342481613, "epoch": 1.850187265917603, "grad_norm": 0.036472100764513016, "learning_rate": 0.0002, "loss": 0.5545116662979126, "mean_token_accuracy": 0.776875764131546, "num_tokens": 8054879.0, "step": 494 }, { "entropy": 0.5589891523122787, "epoch": 1.8539325842696628, "grad_norm": 0.02796117588877678, "learning_rate": 0.0002, "loss": 0.5532379746437073, "mean_token_accuracy": 0.7751499116420746, "num_tokens": 8071227.0, "step": 495 }, { "entropy": 0.5462375283241272, "epoch": 1.857677902621723, "grad_norm": 0.0307608712464571, "learning_rate": 0.0002, "loss": 0.5444692373275757, "mean_token_accuracy": 0.7788323760032654, "num_tokens": 8087424.0, "step": 496 }, { "entropy": 0.562559187412262, "epoch": 1.8614232209737827, "grad_norm": 0.03130098804831505, "learning_rate": 0.0002, "loss": 0.5660312175750732, "mean_token_accuracy": 0.7673315852880478, "num_tokens": 8104163.0, "step": 497 }, { "entropy": 0.5469489693641663, "epoch": 1.8651685393258428, "grad_norm": 0.031797025352716446, "learning_rate": 0.0002, "loss": 0.5592264533042908, "mean_token_accuracy": 0.7764750421047211, "num_tokens": 8120483.0, "step": 498 }, { "entropy": 0.5529169142246246, "epoch": 1.8689138576779025, "grad_norm": 0.0395452156662941, "learning_rate": 0.0002, "loss": 0.5562450885772705, "mean_token_accuracy": 0.7762233167886734, "num_tokens": 8136774.0, "step": 499 }, { "entropy": 0.5619923919439316, "epoch": 1.8726591760299627, "grad_norm": 0.03070960007607937, "learning_rate": 0.0002, "loss": 0.5671469569206238, "mean_token_accuracy": 0.7695633620023727, "num_tokens": 8152950.0, "step": 500 }, { "entropy": 0.571450412273407, "epoch": 1.8764044943820224, "grad_norm": 0.03263135999441147, "learning_rate": 0.0002, "loss": 0.5684110522270203, "mean_token_accuracy": 0.7683538943529129, "num_tokens": 8169231.0, "step": 501 }, { "entropy": 0.5732105523347855, "epoch": 1.8801498127340825, "grad_norm": 0.04209841415286064, "learning_rate": 0.0002, "loss": 0.571649968624115, "mean_token_accuracy": 0.7642921954393387, "num_tokens": 8185562.0, "step": 502 }, { "entropy": 0.5685284435749054, "epoch": 1.8838951310861423, "grad_norm": 0.03377389535307884, "learning_rate": 0.0002, "loss": 0.56586092710495, "mean_token_accuracy": 0.7697953432798386, "num_tokens": 8201808.0, "step": 503 }, { "entropy": 0.5590908825397491, "epoch": 1.8876404494382022, "grad_norm": 0.0385461188852787, "learning_rate": 0.0002, "loss": 0.5578455924987793, "mean_token_accuracy": 0.7730644196271896, "num_tokens": 8217945.0, "step": 504 }, { "entropy": 0.5606498569250107, "epoch": 1.8913857677902621, "grad_norm": 0.03381400555372238, "learning_rate": 0.0002, "loss": 0.5585749745368958, "mean_token_accuracy": 0.7752718329429626, "num_tokens": 8234181.0, "step": 505 }, { "entropy": 0.5511593520641327, "epoch": 1.895131086142322, "grad_norm": 0.04427889734506607, "learning_rate": 0.0002, "loss": 0.5605770349502563, "mean_token_accuracy": 0.7708971202373505, "num_tokens": 8250412.0, "step": 506 }, { "entropy": 0.5558828562498093, "epoch": 1.898876404494382, "grad_norm": 0.032851386815309525, "learning_rate": 0.0002, "loss": 0.5588455200195312, "mean_token_accuracy": 0.7729152590036392, "num_tokens": 8266940.0, "step": 507 }, { "entropy": 0.5533877611160278, "epoch": 1.902621722846442, "grad_norm": 0.034889817237854004, "learning_rate": 0.0002, "loss": 0.5531287789344788, "mean_token_accuracy": 0.7766410559415817, "num_tokens": 8283192.0, "step": 508 }, { "entropy": 0.55963134765625, "epoch": 1.9063670411985019, "grad_norm": 0.03460029140114784, "learning_rate": 0.0002, "loss": 0.5557897686958313, "mean_token_accuracy": 0.7736343890428543, "num_tokens": 8299357.0, "step": 509 }, { "entropy": 0.5412601754069328, "epoch": 1.9101123595505618, "grad_norm": 0.032328344881534576, "learning_rate": 0.0002, "loss": 0.5438541173934937, "mean_token_accuracy": 0.7753017991781235, "num_tokens": 8315841.0, "step": 510 }, { "entropy": 0.5540103167295456, "epoch": 1.9138576779026217, "grad_norm": 0.03002399578690529, "learning_rate": 0.0002, "loss": 0.5542548894882202, "mean_token_accuracy": 0.7737881243228912, "num_tokens": 8332181.0, "step": 511 }, { "entropy": 0.5422029197216034, "epoch": 1.9176029962546817, "grad_norm": 0.034409623593091965, "learning_rate": 0.0002, "loss": 0.5453910231590271, "mean_token_accuracy": 0.7794903218746185, "num_tokens": 8348319.0, "step": 512 }, { "entropy": 0.5566486120223999, "epoch": 1.9213483146067416, "grad_norm": 0.030252845957875252, "learning_rate": 0.0002, "loss": 0.5601068735122681, "mean_token_accuracy": 0.7728803753852844, "num_tokens": 8364457.0, "step": 513 }, { "entropy": 0.5523079186677933, "epoch": 1.9250936329588015, "grad_norm": 0.02711205929517746, "learning_rate": 0.0002, "loss": 0.5482505559921265, "mean_token_accuracy": 0.7751948684453964, "num_tokens": 8380923.0, "step": 514 }, { "entropy": 0.5604666918516159, "epoch": 1.9288389513108615, "grad_norm": 0.032180819660425186, "learning_rate": 0.0002, "loss": 0.5568802356719971, "mean_token_accuracy": 0.7695084065198898, "num_tokens": 8397239.0, "step": 515 }, { "entropy": 0.5643311589956284, "epoch": 1.9325842696629212, "grad_norm": 0.03032456897199154, "learning_rate": 0.0002, "loss": 0.5628493428230286, "mean_token_accuracy": 0.7717900723218918, "num_tokens": 8413791.0, "step": 516 }, { "entropy": 0.5468644499778748, "epoch": 1.9363295880149813, "grad_norm": 0.03036642260849476, "learning_rate": 0.0002, "loss": 0.5469942688941956, "mean_token_accuracy": 0.7763982564210892, "num_tokens": 8429973.0, "step": 517 }, { "entropy": 0.5639230608940125, "epoch": 1.940074906367041, "grad_norm": 0.03586732968688011, "learning_rate": 0.0002, "loss": 0.5693802237510681, "mean_token_accuracy": 0.7674274742603302, "num_tokens": 8446632.0, "step": 518 }, { "entropy": 0.552105188369751, "epoch": 1.9438202247191012, "grad_norm": 0.028923669829964638, "learning_rate": 0.0002, "loss": 0.5536226630210876, "mean_token_accuracy": 0.7770767658948898, "num_tokens": 8462861.0, "step": 519 }, { "entropy": 0.546203225851059, "epoch": 1.947565543071161, "grad_norm": 0.03517064452171326, "learning_rate": 0.0002, "loss": 0.5486375689506531, "mean_token_accuracy": 0.7788794338703156, "num_tokens": 8479188.0, "step": 520 }, { "entropy": 0.5571713298559189, "epoch": 1.951310861423221, "grad_norm": 0.03267424926161766, "learning_rate": 0.0002, "loss": 0.5605846047401428, "mean_token_accuracy": 0.7741213738918304, "num_tokens": 8495441.0, "step": 521 }, { "entropy": 0.5428985059261322, "epoch": 1.9550561797752808, "grad_norm": 0.03182944655418396, "learning_rate": 0.0002, "loss": 0.5459189414978027, "mean_token_accuracy": 0.7793070673942566, "num_tokens": 8511788.0, "step": 522 }, { "entropy": 0.5454448312520981, "epoch": 1.958801498127341, "grad_norm": 0.033397775143384933, "learning_rate": 0.0002, "loss": 0.5454107522964478, "mean_token_accuracy": 0.7772410660982132, "num_tokens": 8528152.0, "step": 523 }, { "entropy": 0.5469843745231628, "epoch": 1.9625468164794007, "grad_norm": 0.030805334448814392, "learning_rate": 0.0002, "loss": 0.5417147874832153, "mean_token_accuracy": 0.7786692380905151, "num_tokens": 8544780.0, "step": 524 }, { "entropy": 0.5402656495571136, "epoch": 1.9662921348314608, "grad_norm": 0.030130336061120033, "learning_rate": 0.0002, "loss": 0.5425636768341064, "mean_token_accuracy": 0.7805010080337524, "num_tokens": 8561035.0, "step": 525 }, { "entropy": 0.5509428530931473, "epoch": 1.9700374531835205, "grad_norm": 0.0316033698618412, "learning_rate": 0.0002, "loss": 0.5516440272331238, "mean_token_accuracy": 0.775515004992485, "num_tokens": 8577541.0, "step": 526 }, { "entropy": 0.5449865013360977, "epoch": 1.9737827715355807, "grad_norm": 0.03625763952732086, "learning_rate": 0.0002, "loss": 0.5528845191001892, "mean_token_accuracy": 0.7754436731338501, "num_tokens": 8593925.0, "step": 527 }, { "entropy": 0.563062384724617, "epoch": 1.9775280898876404, "grad_norm": 0.029838701710104942, "learning_rate": 0.0002, "loss": 0.5591800808906555, "mean_token_accuracy": 0.7732478529214859, "num_tokens": 8610524.0, "step": 528 }, { "entropy": 0.5514681190252304, "epoch": 1.9812734082397003, "grad_norm": 0.03368176147341728, "learning_rate": 0.0002, "loss": 0.548831582069397, "mean_token_accuracy": 0.7749605923891068, "num_tokens": 8626872.0, "step": 529 }, { "entropy": 0.5520317405462265, "epoch": 1.9850187265917603, "grad_norm": 0.03429826721549034, "learning_rate": 0.0002, "loss": 0.5514442324638367, "mean_token_accuracy": 0.7730523347854614, "num_tokens": 8642960.0, "step": 530 }, { "entropy": 0.5669658333063126, "epoch": 1.9887640449438202, "grad_norm": 0.0307292602956295, "learning_rate": 0.0002, "loss": 0.5723692178726196, "mean_token_accuracy": 0.7651190161705017, "num_tokens": 8659084.0, "step": 531 }, { "entropy": 0.5609945952892303, "epoch": 1.9925093632958801, "grad_norm": 0.036607109010219574, "learning_rate": 0.0002, "loss": 0.5636897683143616, "mean_token_accuracy": 0.7701397836208344, "num_tokens": 8675587.0, "step": 532 }, { "entropy": 0.5549340695142746, "epoch": 1.99625468164794, "grad_norm": 0.03215758502483368, "learning_rate": 0.0002, "loss": 0.5516895651817322, "mean_token_accuracy": 0.7737619578838348, "num_tokens": 8691850.0, "step": 533 }, { "entropy": 0.5620461255311966, "epoch": 2.0, "grad_norm": 0.028028611093759537, "learning_rate": 0.0002, "loss": 0.5578765869140625, "mean_token_accuracy": 0.7716735005378723, "num_tokens": 8708236.0, "step": 534 }, { "entropy": 0.557419016957283, "epoch": 2.0037453183520597, "grad_norm": 0.03629058599472046, "learning_rate": 0.0002, "loss": 0.5479042530059814, "mean_token_accuracy": 0.7768302410840988, "num_tokens": 8724656.0, "step": 535 }, { "entropy": 0.5507587045431137, "epoch": 2.00749063670412, "grad_norm": 0.032850366085767746, "learning_rate": 0.0002, "loss": 0.5528382062911987, "mean_token_accuracy": 0.7756710648536682, "num_tokens": 8741046.0, "step": 536 }, { "entropy": 0.5404622703790665, "epoch": 2.0112359550561796, "grad_norm": 0.031562913209199905, "learning_rate": 0.0002, "loss": 0.5380600094795227, "mean_token_accuracy": 0.7781912684440613, "num_tokens": 8757535.0, "step": 537 }, { "entropy": 0.5316804945468903, "epoch": 2.0149812734082397, "grad_norm": 0.03351443260908127, "learning_rate": 0.0002, "loss": 0.5359355807304382, "mean_token_accuracy": 0.7827723175287247, "num_tokens": 8773824.0, "step": 538 }, { "entropy": 0.5419723987579346, "epoch": 2.0187265917602994, "grad_norm": 0.03948935121297836, "learning_rate": 0.0002, "loss": 0.5471257567405701, "mean_token_accuracy": 0.7790137678384781, "num_tokens": 8790095.0, "step": 539 }, { "entropy": 0.5343683362007141, "epoch": 2.0224719101123596, "grad_norm": 0.031161192804574966, "learning_rate": 0.0002, "loss": 0.5309802889823914, "mean_token_accuracy": 0.7821521759033203, "num_tokens": 8806510.0, "step": 540 }, { "entropy": 0.5364920198917389, "epoch": 2.0262172284644193, "grad_norm": 0.03507857769727707, "learning_rate": 0.0002, "loss": 0.5324068069458008, "mean_token_accuracy": 0.7870013862848282, "num_tokens": 8822654.0, "step": 541 }, { "entropy": 0.5483170747756958, "epoch": 2.0299625468164795, "grad_norm": 0.03222345933318138, "learning_rate": 0.0002, "loss": 0.549699068069458, "mean_token_accuracy": 0.7751237750053406, "num_tokens": 8839285.0, "step": 542 }, { "entropy": 0.5425759255886078, "epoch": 2.033707865168539, "grad_norm": 0.03227977082133293, "learning_rate": 0.0002, "loss": 0.5380892753601074, "mean_token_accuracy": 0.7839174568653107, "num_tokens": 8855507.0, "step": 543 }, { "entropy": 0.5272768065333366, "epoch": 2.0374531835205993, "grad_norm": 0.03487760201096535, "learning_rate": 0.0002, "loss": 0.5265735387802124, "mean_token_accuracy": 0.7857347279787064, "num_tokens": 8871873.0, "step": 544 }, { "entropy": 0.5219558328390121, "epoch": 2.041198501872659, "grad_norm": 0.035983484238386154, "learning_rate": 0.0002, "loss": 0.5337969660758972, "mean_token_accuracy": 0.7834839969873428, "num_tokens": 8887984.0, "step": 545 }, { "entropy": 0.5376651287078857, "epoch": 2.044943820224719, "grad_norm": 0.038352932780981064, "learning_rate": 0.0002, "loss": 0.5438427329063416, "mean_token_accuracy": 0.7784269452095032, "num_tokens": 8904216.0, "step": 546 }, { "entropy": 0.5456122606992722, "epoch": 2.048689138576779, "grad_norm": 0.036168649792671204, "learning_rate": 0.0002, "loss": 0.5431267023086548, "mean_token_accuracy": 0.7829999178647995, "num_tokens": 8920617.0, "step": 547 }, { "entropy": 0.5304486304521561, "epoch": 2.052434456928839, "grad_norm": 0.03324899077415466, "learning_rate": 0.0002, "loss": 0.5289336442947388, "mean_token_accuracy": 0.7849617451429367, "num_tokens": 8936835.0, "step": 548 }, { "entropy": 0.5275251343846321, "epoch": 2.056179775280899, "grad_norm": 0.03898227587342262, "learning_rate": 0.0002, "loss": 0.530302882194519, "mean_token_accuracy": 0.7835600972175598, "num_tokens": 8953009.0, "step": 549 }, { "entropy": 0.5530034005641937, "epoch": 2.059925093632959, "grad_norm": 0.038006141781806946, "learning_rate": 0.0002, "loss": 0.5494067072868347, "mean_token_accuracy": 0.7755949050188065, "num_tokens": 8969428.0, "step": 550 }, { "entropy": 0.5418991297483444, "epoch": 2.0636704119850187, "grad_norm": 0.03261435031890869, "learning_rate": 0.0002, "loss": 0.5322299003601074, "mean_token_accuracy": 0.7837673723697662, "num_tokens": 8985844.0, "step": 551 }, { "entropy": 0.5309967398643494, "epoch": 2.067415730337079, "grad_norm": 0.03797997906804085, "learning_rate": 0.0002, "loss": 0.5291654467582703, "mean_token_accuracy": 0.7849747538566589, "num_tokens": 9002169.0, "step": 552 }, { "entropy": 0.5188492685556412, "epoch": 2.0711610486891385, "grad_norm": 0.038583919405937195, "learning_rate": 0.0002, "loss": 0.5282660722732544, "mean_token_accuracy": 0.7870546579360962, "num_tokens": 9018570.0, "step": 553 }, { "entropy": 0.534794494509697, "epoch": 2.0749063670411987, "grad_norm": 0.03449336439371109, "learning_rate": 0.0002, "loss": 0.5352678298950195, "mean_token_accuracy": 0.7845733165740967, "num_tokens": 9034788.0, "step": 554 }, { "entropy": 0.5308385342359543, "epoch": 2.0786516853932584, "grad_norm": 0.03845726326107979, "learning_rate": 0.0002, "loss": 0.5325117111206055, "mean_token_accuracy": 0.7835551649332047, "num_tokens": 9051109.0, "step": 555 }, { "entropy": 0.5309025943279266, "epoch": 2.0823970037453186, "grad_norm": 0.03809129074215889, "learning_rate": 0.0002, "loss": 0.5253363847732544, "mean_token_accuracy": 0.7868698537349701, "num_tokens": 9067268.0, "step": 556 }, { "entropy": 0.5575416088104248, "epoch": 2.0861423220973783, "grad_norm": 0.034367915242910385, "learning_rate": 0.0002, "loss": 0.5523205995559692, "mean_token_accuracy": 0.7749448716640472, "num_tokens": 9083891.0, "step": 557 }, { "entropy": 0.53434719145298, "epoch": 2.0898876404494384, "grad_norm": 0.03826329484581947, "learning_rate": 0.0002, "loss": 0.5409042835235596, "mean_token_accuracy": 0.7785179018974304, "num_tokens": 9100370.0, "step": 558 }, { "entropy": 0.5194257721304893, "epoch": 2.093632958801498, "grad_norm": 0.03882572054862976, "learning_rate": 0.0002, "loss": 0.5238875150680542, "mean_token_accuracy": 0.7858750522136688, "num_tokens": 9116506.0, "step": 559 }, { "entropy": 0.5331729799509048, "epoch": 2.097378277153558, "grad_norm": 0.045005545020103455, "learning_rate": 0.0002, "loss": 0.5285134315490723, "mean_token_accuracy": 0.7852654755115509, "num_tokens": 9132871.0, "step": 560 }, { "entropy": 0.5405212640762329, "epoch": 2.101123595505618, "grad_norm": 0.04780491814017296, "learning_rate": 0.0002, "loss": 0.5461173057556152, "mean_token_accuracy": 0.7770982980728149, "num_tokens": 9149174.0, "step": 561 }, { "entropy": 0.5288062691688538, "epoch": 2.1048689138576777, "grad_norm": 0.04940470680594444, "learning_rate": 0.0002, "loss": 0.5337265729904175, "mean_token_accuracy": 0.7846069186925888, "num_tokens": 9165316.0, "step": 562 }, { "entropy": 0.531680166721344, "epoch": 2.108614232209738, "grad_norm": 0.05061717331409454, "learning_rate": 0.0002, "loss": 0.5271866321563721, "mean_token_accuracy": 0.7854976505041122, "num_tokens": 9181482.0, "step": 563 }, { "entropy": 0.5314291417598724, "epoch": 2.1123595505617976, "grad_norm": 0.0397643968462944, "learning_rate": 0.0002, "loss": 0.5271567702293396, "mean_token_accuracy": 0.7851341366767883, "num_tokens": 9197662.0, "step": 564 }, { "entropy": 0.5252021998167038, "epoch": 2.1161048689138577, "grad_norm": 0.041956719011068344, "learning_rate": 0.0002, "loss": 0.5281031727790833, "mean_token_accuracy": 0.7877316772937775, "num_tokens": 9214001.0, "step": 565 }, { "entropy": 0.5378998965024948, "epoch": 2.1198501872659175, "grad_norm": 0.03963020071387291, "learning_rate": 0.0002, "loss": 0.5432679653167725, "mean_token_accuracy": 0.7765485197305679, "num_tokens": 9230298.0, "step": 566 }, { "entropy": 0.5449769049882889, "epoch": 2.1235955056179776, "grad_norm": 0.04862145707011223, "learning_rate": 0.0002, "loss": 0.5481102466583252, "mean_token_accuracy": 0.7771643400192261, "num_tokens": 9246648.0, "step": 567 }, { "entropy": 0.5432566553354263, "epoch": 2.1273408239700373, "grad_norm": 0.03826707601547241, "learning_rate": 0.0002, "loss": 0.5354676246643066, "mean_token_accuracy": 0.7808031290769577, "num_tokens": 9263059.0, "step": 568 }, { "entropy": 0.5395092964172363, "epoch": 2.1310861423220975, "grad_norm": 0.04806908592581749, "learning_rate": 0.0002, "loss": 0.5348396897315979, "mean_token_accuracy": 0.7838325351476669, "num_tokens": 9279690.0, "step": 569 }, { "entropy": 0.512074276804924, "epoch": 2.134831460674157, "grad_norm": 0.034932930022478104, "learning_rate": 0.0002, "loss": 0.5059640407562256, "mean_token_accuracy": 0.7954477220773697, "num_tokens": 9296053.0, "step": 570 }, { "entropy": 0.5317389219999313, "epoch": 2.1385767790262173, "grad_norm": 0.054850984364748, "learning_rate": 0.0002, "loss": 0.5419769287109375, "mean_token_accuracy": 0.7804836332798004, "num_tokens": 9312250.0, "step": 571 }, { "entropy": 0.523776650428772, "epoch": 2.142322097378277, "grad_norm": 0.03885575383901596, "learning_rate": 0.0002, "loss": 0.5337730050086975, "mean_token_accuracy": 0.7821401208639145, "num_tokens": 9328588.0, "step": 572 }, { "entropy": 0.5306317359209061, "epoch": 2.146067415730337, "grad_norm": 0.04031698405742645, "learning_rate": 0.0002, "loss": 0.5285602807998657, "mean_token_accuracy": 0.7860189080238342, "num_tokens": 9344771.0, "step": 573 }, { "entropy": 0.5253511220216751, "epoch": 2.149812734082397, "grad_norm": 0.03704000264406204, "learning_rate": 0.0002, "loss": 0.519854724407196, "mean_token_accuracy": 0.7907343953847885, "num_tokens": 9360913.0, "step": 574 }, { "entropy": 0.5498696267604828, "epoch": 2.153558052434457, "grad_norm": 0.03690071031451225, "learning_rate": 0.0002, "loss": 0.5417680144309998, "mean_token_accuracy": 0.7790531069040298, "num_tokens": 9377532.0, "step": 575 }, { "entropy": 0.5402537435293198, "epoch": 2.157303370786517, "grad_norm": 0.0378306582570076, "learning_rate": 0.0002, "loss": 0.541071891784668, "mean_token_accuracy": 0.7788532823324203, "num_tokens": 9393830.0, "step": 576 }, { "entropy": 0.5282108932733536, "epoch": 2.161048689138577, "grad_norm": 0.04091333597898483, "learning_rate": 0.0002, "loss": 0.5348851084709167, "mean_token_accuracy": 0.7821558713912964, "num_tokens": 9410274.0, "step": 577 }, { "entropy": 0.5303814560174942, "epoch": 2.1647940074906367, "grad_norm": 0.03591841831803322, "learning_rate": 0.0002, "loss": 0.5331617593765259, "mean_token_accuracy": 0.7818120270967484, "num_tokens": 9426511.0, "step": 578 }, { "entropy": 0.5272700041532516, "epoch": 2.168539325842697, "grad_norm": 0.03997735306620598, "learning_rate": 0.0002, "loss": 0.5334488153457642, "mean_token_accuracy": 0.7814541161060333, "num_tokens": 9442897.0, "step": 579 }, { "entropy": 0.5336402952671051, "epoch": 2.1722846441947565, "grad_norm": 0.0450415313243866, "learning_rate": 0.0002, "loss": 0.5275048017501831, "mean_token_accuracy": 0.7864081561565399, "num_tokens": 9459023.0, "step": 580 }, { "entropy": 0.538782149553299, "epoch": 2.1760299625468167, "grad_norm": 0.03600127249956131, "learning_rate": 0.0002, "loss": 0.5313720107078552, "mean_token_accuracy": 0.7847412079572678, "num_tokens": 9475337.0, "step": 581 }, { "entropy": 0.5273982435464859, "epoch": 2.1797752808988764, "grad_norm": 0.04744241386651993, "learning_rate": 0.0002, "loss": 0.5319021344184875, "mean_token_accuracy": 0.7850695848464966, "num_tokens": 9491529.0, "step": 582 }, { "entropy": 0.5370319783687592, "epoch": 2.1835205992509366, "grad_norm": 0.035024821758270264, "learning_rate": 0.0002, "loss": 0.5342311859130859, "mean_token_accuracy": 0.7830409854650497, "num_tokens": 9508099.0, "step": 583 }, { "entropy": 0.5350894033908844, "epoch": 2.1872659176029963, "grad_norm": 0.04598443582653999, "learning_rate": 0.0002, "loss": 0.5383565425872803, "mean_token_accuracy": 0.7810914367437363, "num_tokens": 9524506.0, "step": 584 }, { "entropy": 0.5270723178982735, "epoch": 2.191011235955056, "grad_norm": 0.03489379957318306, "learning_rate": 0.0002, "loss": 0.5261937379837036, "mean_token_accuracy": 0.7874008566141129, "num_tokens": 9540868.0, "step": 585 }, { "entropy": 0.5187418013811111, "epoch": 2.194756554307116, "grad_norm": 0.04006824642419815, "learning_rate": 0.0002, "loss": 0.516140341758728, "mean_token_accuracy": 0.7876712679862976, "num_tokens": 9557109.0, "step": 586 }, { "entropy": 0.5397524982690811, "epoch": 2.198501872659176, "grad_norm": 0.037596385926008224, "learning_rate": 0.0002, "loss": 0.5337037444114685, "mean_token_accuracy": 0.7848425805568695, "num_tokens": 9573451.0, "step": 587 }, { "entropy": 0.542935311794281, "epoch": 2.202247191011236, "grad_norm": 0.05163532868027687, "learning_rate": 0.0002, "loss": 0.548254668712616, "mean_token_accuracy": 0.7771319299936295, "num_tokens": 9589800.0, "step": 588 }, { "entropy": 0.524966299533844, "epoch": 2.2059925093632957, "grad_norm": 0.04678061604499817, "learning_rate": 0.0002, "loss": 0.537632405757904, "mean_token_accuracy": 0.7821167409420013, "num_tokens": 9606180.0, "step": 589 }, { "entropy": 0.5223182588815689, "epoch": 2.209737827715356, "grad_norm": 0.04918593540787697, "learning_rate": 0.0002, "loss": 0.5256946086883545, "mean_token_accuracy": 0.7862184792757034, "num_tokens": 9622319.0, "step": 590 }, { "entropy": 0.545245572924614, "epoch": 2.2134831460674156, "grad_norm": 0.044536106288433075, "learning_rate": 0.0002, "loss": 0.5387803316116333, "mean_token_accuracy": 0.7820178419351578, "num_tokens": 9638605.0, "step": 591 }, { "entropy": 0.5572000294923782, "epoch": 2.2172284644194757, "grad_norm": 0.04941220581531525, "learning_rate": 0.0002, "loss": 0.5500818490982056, "mean_token_accuracy": 0.7780845314264297, "num_tokens": 9655041.0, "step": 592 }, { "entropy": 0.524405911564827, "epoch": 2.2209737827715355, "grad_norm": 0.04783201217651367, "learning_rate": 0.0002, "loss": 0.5203397870063782, "mean_token_accuracy": 0.7880013734102249, "num_tokens": 9671239.0, "step": 593 }, { "entropy": 0.5252467542886734, "epoch": 2.2247191011235956, "grad_norm": 0.04301263764500618, "learning_rate": 0.0002, "loss": 0.5267080664634705, "mean_token_accuracy": 0.7888626754283905, "num_tokens": 9687363.0, "step": 594 }, { "entropy": 0.53339484333992, "epoch": 2.2284644194756553, "grad_norm": 0.05318563058972359, "learning_rate": 0.0002, "loss": 0.5481151342391968, "mean_token_accuracy": 0.7762688100337982, "num_tokens": 9703829.0, "step": 595 }, { "entropy": 0.5450247228145599, "epoch": 2.2322097378277155, "grad_norm": 0.03796645253896713, "learning_rate": 0.0002, "loss": 0.5463745594024658, "mean_token_accuracy": 0.7799876779317856, "num_tokens": 9720055.0, "step": 596 }, { "entropy": 0.5355545580387115, "epoch": 2.235955056179775, "grad_norm": 0.04619521647691727, "learning_rate": 0.0002, "loss": 0.5383350253105164, "mean_token_accuracy": 0.7803421318531036, "num_tokens": 9736065.0, "step": 597 }, { "entropy": 0.5393659174442291, "epoch": 2.2397003745318353, "grad_norm": 0.04189852997660637, "learning_rate": 0.0002, "loss": 0.5408390760421753, "mean_token_accuracy": 0.7799636572599411, "num_tokens": 9752285.0, "step": 598 }, { "entropy": 0.5505337119102478, "epoch": 2.243445692883895, "grad_norm": 0.04415363445878029, "learning_rate": 0.0002, "loss": 0.5492491722106934, "mean_token_accuracy": 0.7789665758609772, "num_tokens": 9768797.0, "step": 599 }, { "entropy": 0.5322769433259964, "epoch": 2.247191011235955, "grad_norm": 0.0446348674595356, "learning_rate": 0.0002, "loss": 0.5362676978111267, "mean_token_accuracy": 0.7827903628349304, "num_tokens": 9785259.0, "step": 600 }, { "entropy": 0.5283399671316147, "epoch": 2.250936329588015, "grad_norm": 0.04350518435239792, "learning_rate": 0.0002, "loss": 0.5263485312461853, "mean_token_accuracy": 0.7854094952344894, "num_tokens": 9801683.0, "step": 601 }, { "entropy": 0.5155128389596939, "epoch": 2.254681647940075, "grad_norm": 0.049416691064834595, "learning_rate": 0.0002, "loss": 0.5274794101715088, "mean_token_accuracy": 0.7866163551807404, "num_tokens": 9817897.0, "step": 602 }, { "entropy": 0.555690124630928, "epoch": 2.258426966292135, "grad_norm": 0.042244087904691696, "learning_rate": 0.0002, "loss": 0.5587432384490967, "mean_token_accuracy": 0.7742861956357956, "num_tokens": 9834109.0, "step": 603 }, { "entropy": 0.5449231714010239, "epoch": 2.262172284644195, "grad_norm": 0.04214772582054138, "learning_rate": 0.0002, "loss": 0.5424601435661316, "mean_token_accuracy": 0.7795074135065079, "num_tokens": 9850508.0, "step": 604 }, { "entropy": 0.551129087805748, "epoch": 2.2659176029962547, "grad_norm": 0.04242361709475517, "learning_rate": 0.0002, "loss": 0.5350391268730164, "mean_token_accuracy": 0.7817512005567551, "num_tokens": 9866973.0, "step": 605 }, { "entropy": 0.5557906329631805, "epoch": 2.2696629213483144, "grad_norm": 0.04337119311094284, "learning_rate": 0.0002, "loss": 0.5464892387390137, "mean_token_accuracy": 0.7796575874090195, "num_tokens": 9883567.0, "step": 606 }, { "entropy": 0.5241350680589676, "epoch": 2.2734082397003745, "grad_norm": 0.04597577825188637, "learning_rate": 0.0002, "loss": 0.5339911580085754, "mean_token_accuracy": 0.784000501036644, "num_tokens": 9899884.0, "step": 607 }, { "entropy": 0.5317652076482773, "epoch": 2.2771535580524347, "grad_norm": 0.06419555842876434, "learning_rate": 0.0002, "loss": 0.5507545471191406, "mean_token_accuracy": 0.7757140696048737, "num_tokens": 9916225.0, "step": 608 }, { "entropy": 0.520916298031807, "epoch": 2.2808988764044944, "grad_norm": 0.0413593053817749, "learning_rate": 0.0002, "loss": 0.5282008051872253, "mean_token_accuracy": 0.7836293429136276, "num_tokens": 9932137.0, "step": 609 }, { "entropy": 0.550976499915123, "epoch": 2.284644194756554, "grad_norm": 0.04407277703285217, "learning_rate": 0.0002, "loss": 0.5476412177085876, "mean_token_accuracy": 0.7784940898418427, "num_tokens": 9948364.0, "step": 610 }, { "entropy": 0.5534344464540482, "epoch": 2.2883895131086143, "grad_norm": 0.036215297877788544, "learning_rate": 0.0002, "loss": 0.5448459386825562, "mean_token_accuracy": 0.7809607535600662, "num_tokens": 9964781.0, "step": 611 }, { "entropy": 0.540510505437851, "epoch": 2.292134831460674, "grad_norm": 0.037168748676776886, "learning_rate": 0.0002, "loss": 0.5290323495864868, "mean_token_accuracy": 0.7844896763563156, "num_tokens": 9980949.0, "step": 612 }, { "entropy": 0.537270799279213, "epoch": 2.295880149812734, "grad_norm": 0.0456305667757988, "learning_rate": 0.0002, "loss": 0.5368558764457703, "mean_token_accuracy": 0.781862810254097, "num_tokens": 9997181.0, "step": 613 }, { "entropy": 0.529745414853096, "epoch": 2.299625468164794, "grad_norm": 0.04219827800989151, "learning_rate": 0.0002, "loss": 0.5287020206451416, "mean_token_accuracy": 0.7848487794399261, "num_tokens": 10013303.0, "step": 614 }, { "entropy": 0.5297169536352158, "epoch": 2.303370786516854, "grad_norm": 0.05070658028125763, "learning_rate": 0.0002, "loss": 0.5422332286834717, "mean_token_accuracy": 0.7800150513648987, "num_tokens": 10029569.0, "step": 615 }, { "entropy": 0.5271121859550476, "epoch": 2.3071161048689137, "grad_norm": 0.04743409901857376, "learning_rate": 0.0002, "loss": 0.5323826670646667, "mean_token_accuracy": 0.7835269123315811, "num_tokens": 10045920.0, "step": 616 }, { "entropy": 0.5429159998893738, "epoch": 2.310861423220974, "grad_norm": 0.04348791018128395, "learning_rate": 0.0002, "loss": 0.5469599962234497, "mean_token_accuracy": 0.777765229344368, "num_tokens": 10062068.0, "step": 617 }, { "entropy": 0.5268895328044891, "epoch": 2.3146067415730336, "grad_norm": 0.046540766954422, "learning_rate": 0.0002, "loss": 0.5318824052810669, "mean_token_accuracy": 0.784139409661293, "num_tokens": 10078035.0, "step": 618 }, { "entropy": 0.5406851470470428, "epoch": 2.3183520599250937, "grad_norm": 0.03879360482096672, "learning_rate": 0.0002, "loss": 0.5327763557434082, "mean_token_accuracy": 0.7838515788316727, "num_tokens": 10094069.0, "step": 619 }, { "entropy": 0.5550850629806519, "epoch": 2.3220973782771535, "grad_norm": 0.04021632671356201, "learning_rate": 0.0002, "loss": 0.544082760810852, "mean_token_accuracy": 0.7794292271137238, "num_tokens": 10110562.0, "step": 620 }, { "entropy": 0.5633902698755264, "epoch": 2.3258426966292136, "grad_norm": 0.03872428461909294, "learning_rate": 0.0002, "loss": 0.5591956973075867, "mean_token_accuracy": 0.7731619328260422, "num_tokens": 10127313.0, "step": 621 }, { "entropy": 0.526028499007225, "epoch": 2.3295880149812733, "grad_norm": 0.04169732704758644, "learning_rate": 0.0002, "loss": 0.5296715497970581, "mean_token_accuracy": 0.7846156656742096, "num_tokens": 10143539.0, "step": 622 }, { "entropy": 0.5621512830257416, "epoch": 2.3333333333333335, "grad_norm": 0.03567031770944595, "learning_rate": 0.0002, "loss": 0.5641921758651733, "mean_token_accuracy": 0.7724113464355469, "num_tokens": 10159890.0, "step": 623 }, { "entropy": 0.5621916353702545, "epoch": 2.337078651685393, "grad_norm": 0.044719185680150986, "learning_rate": 0.0002, "loss": 0.5658475756645203, "mean_token_accuracy": 0.768171489238739, "num_tokens": 10176303.0, "step": 624 }, { "entropy": 0.5397062003612518, "epoch": 2.3408239700374533, "grad_norm": 0.03938845917582512, "learning_rate": 0.0002, "loss": 0.5410289168357849, "mean_token_accuracy": 0.7816459834575653, "num_tokens": 10192725.0, "step": 625 }, { "entropy": 0.5308454632759094, "epoch": 2.344569288389513, "grad_norm": 0.0393369197845459, "learning_rate": 0.0002, "loss": 0.5327979326248169, "mean_token_accuracy": 0.7836434692144394, "num_tokens": 10208900.0, "step": 626 }, { "entropy": 0.5351555794477463, "epoch": 2.348314606741573, "grad_norm": 0.044483788311481476, "learning_rate": 0.0002, "loss": 0.537283182144165, "mean_token_accuracy": 0.784860372543335, "num_tokens": 10224853.0, "step": 627 }, { "entropy": 0.5380195677280426, "epoch": 2.352059925093633, "grad_norm": 0.04018259420990944, "learning_rate": 0.0002, "loss": 0.5401010513305664, "mean_token_accuracy": 0.7777950018644333, "num_tokens": 10241181.0, "step": 628 }, { "entropy": 0.5319711565971375, "epoch": 2.355805243445693, "grad_norm": 0.052694015204906464, "learning_rate": 0.0002, "loss": 0.5327081680297852, "mean_token_accuracy": 0.7857355177402496, "num_tokens": 10257569.0, "step": 629 }, { "entropy": 0.5219532996416092, "epoch": 2.359550561797753, "grad_norm": 0.0513097383081913, "learning_rate": 0.0002, "loss": 0.5344624519348145, "mean_token_accuracy": 0.781092032790184, "num_tokens": 10273502.0, "step": 630 }, { "entropy": 0.5303360670804977, "epoch": 2.3632958801498125, "grad_norm": 0.05031297355890274, "learning_rate": 0.0002, "loss": 0.5381285548210144, "mean_token_accuracy": 0.7818425595760345, "num_tokens": 10289765.0, "step": 631 }, { "entropy": 0.5247592329978943, "epoch": 2.3670411985018727, "grad_norm": 0.040263328701257706, "learning_rate": 0.0002, "loss": 0.5220550298690796, "mean_token_accuracy": 0.786396861076355, "num_tokens": 10306027.0, "step": 632 }, { "entropy": 0.5546284765005112, "epoch": 2.370786516853933, "grad_norm": 0.04438352584838867, "learning_rate": 0.0002, "loss": 0.5477085113525391, "mean_token_accuracy": 0.7770822197198868, "num_tokens": 10322169.0, "step": 633 }, { "entropy": 0.5496452152729034, "epoch": 2.3745318352059925, "grad_norm": 0.048432301729917526, "learning_rate": 0.0002, "loss": 0.5438807606697083, "mean_token_accuracy": 0.780827596783638, "num_tokens": 10338568.0, "step": 634 }, { "entropy": 0.5297926962375641, "epoch": 2.3782771535580522, "grad_norm": 0.03634348511695862, "learning_rate": 0.0002, "loss": 0.5239929556846619, "mean_token_accuracy": 0.7896489948034286, "num_tokens": 10354708.0, "step": 635 }, { "entropy": 0.5366943925619125, "epoch": 2.3820224719101124, "grad_norm": 0.051037952303886414, "learning_rate": 0.0002, "loss": 0.5460379123687744, "mean_token_accuracy": 0.7777325063943863, "num_tokens": 10371358.0, "step": 636 }, { "entropy": 0.5219292491674423, "epoch": 2.385767790262172, "grad_norm": 0.03863009437918663, "learning_rate": 0.0002, "loss": 0.5266265273094177, "mean_token_accuracy": 0.7879810929298401, "num_tokens": 10387500.0, "step": 637 }, { "entropy": 0.5288277566432953, "epoch": 2.3895131086142323, "grad_norm": 0.05099929869174957, "learning_rate": 0.0002, "loss": 0.5307456851005554, "mean_token_accuracy": 0.7841700166463852, "num_tokens": 10404042.0, "step": 638 }, { "entropy": 0.5441994965076447, "epoch": 2.393258426966292, "grad_norm": 0.03832423314452171, "learning_rate": 0.0002, "loss": 0.5406984090805054, "mean_token_accuracy": 0.7822638154029846, "num_tokens": 10420308.0, "step": 639 }, { "entropy": 0.5474298596382141, "epoch": 2.397003745318352, "grad_norm": 0.03593610227108002, "learning_rate": 0.0002, "loss": 0.5448755025863647, "mean_token_accuracy": 0.7769681811332703, "num_tokens": 10436473.0, "step": 640 }, { "entropy": 0.5544268637895584, "epoch": 2.400749063670412, "grad_norm": 0.05683998391032219, "learning_rate": 0.0002, "loss": 0.5575302839279175, "mean_token_accuracy": 0.7728745937347412, "num_tokens": 10453006.0, "step": 641 }, { "entropy": 0.5459371656179428, "epoch": 2.404494382022472, "grad_norm": 0.041604217141866684, "learning_rate": 0.0002, "loss": 0.5482038855552673, "mean_token_accuracy": 0.7801420837640762, "num_tokens": 10469281.0, "step": 642 }, { "entropy": 0.5380865782499313, "epoch": 2.4082397003745317, "grad_norm": 0.05113884434103966, "learning_rate": 0.0002, "loss": 0.5394017696380615, "mean_token_accuracy": 0.7834807485342026, "num_tokens": 10485666.0, "step": 643 }, { "entropy": 0.549991711974144, "epoch": 2.411985018726592, "grad_norm": 0.03647167235612869, "learning_rate": 0.0002, "loss": 0.553663969039917, "mean_token_accuracy": 0.774835467338562, "num_tokens": 10501890.0, "step": 644 }, { "entropy": 0.5480955541133881, "epoch": 2.4157303370786516, "grad_norm": 0.04493939131498337, "learning_rate": 0.0002, "loss": 0.5466475486755371, "mean_token_accuracy": 0.7790014296770096, "num_tokens": 10518311.0, "step": 645 }, { "entropy": 0.5469405502080917, "epoch": 2.4194756554307117, "grad_norm": 0.040811046957969666, "learning_rate": 0.0002, "loss": 0.5483651161193848, "mean_token_accuracy": 0.7788845151662827, "num_tokens": 10534519.0, "step": 646 }, { "entropy": 0.542740598320961, "epoch": 2.4232209737827715, "grad_norm": 0.045434851199388504, "learning_rate": 0.0002, "loss": 0.5396543741226196, "mean_token_accuracy": 0.7790694683790207, "num_tokens": 10550595.0, "step": 647 }, { "entropy": 0.535121500492096, "epoch": 2.4269662921348316, "grad_norm": 0.04115886241197586, "learning_rate": 0.0002, "loss": 0.5374845266342163, "mean_token_accuracy": 0.7803627252578735, "num_tokens": 10566917.0, "step": 648 }, { "entropy": 0.5375159233808517, "epoch": 2.4307116104868913, "grad_norm": 0.04332772269845009, "learning_rate": 0.0002, "loss": 0.5381888151168823, "mean_token_accuracy": 0.7793711423873901, "num_tokens": 10583313.0, "step": 649 }, { "entropy": 0.5432725697755814, "epoch": 2.4344569288389515, "grad_norm": 0.041510697454214096, "learning_rate": 0.0002, "loss": 0.5448310375213623, "mean_token_accuracy": 0.7758618593215942, "num_tokens": 10599510.0, "step": 650 }, { "entropy": 0.5411451011896133, "epoch": 2.438202247191011, "grad_norm": 0.04265889525413513, "learning_rate": 0.0002, "loss": 0.5466779470443726, "mean_token_accuracy": 0.7779202163219452, "num_tokens": 10615799.0, "step": 651 }, { "entropy": 0.535615861415863, "epoch": 2.4419475655430714, "grad_norm": 0.04081408306956291, "learning_rate": 0.0002, "loss": 0.539250373840332, "mean_token_accuracy": 0.7790500521659851, "num_tokens": 10632054.0, "step": 652 }, { "entropy": 0.5231917202472687, "epoch": 2.445692883895131, "grad_norm": 0.037281572818756104, "learning_rate": 0.0002, "loss": 0.5242350101470947, "mean_token_accuracy": 0.7875235080718994, "num_tokens": 10648293.0, "step": 653 }, { "entropy": 0.5311395078897476, "epoch": 2.449438202247191, "grad_norm": 0.04048464447259903, "learning_rate": 0.0002, "loss": 0.5264798402786255, "mean_token_accuracy": 0.7850567251443863, "num_tokens": 10664249.0, "step": 654 }, { "entropy": 0.5295854657888412, "epoch": 2.453183520599251, "grad_norm": 0.042382705956697464, "learning_rate": 0.0002, "loss": 0.5322737097740173, "mean_token_accuracy": 0.7859133034944534, "num_tokens": 10680711.0, "step": 655 }, { "entropy": 0.5250136256217957, "epoch": 2.4569288389513106, "grad_norm": 0.047354746609926224, "learning_rate": 0.0002, "loss": 0.524110734462738, "mean_token_accuracy": 0.7874706089496613, "num_tokens": 10696903.0, "step": 656 }, { "entropy": 0.5428455919027328, "epoch": 2.460674157303371, "grad_norm": 0.04214261844754219, "learning_rate": 0.0002, "loss": 0.5400563478469849, "mean_token_accuracy": 0.7825742065906525, "num_tokens": 10713018.0, "step": 657 }, { "entropy": 0.5570447146892548, "epoch": 2.464419475655431, "grad_norm": 0.04198653623461723, "learning_rate": 0.0002, "loss": 0.5468944907188416, "mean_token_accuracy": 0.7801797240972519, "num_tokens": 10729583.0, "step": 658 }, { "entropy": 0.5350753366947174, "epoch": 2.4681647940074907, "grad_norm": 0.03751063346862793, "learning_rate": 0.0002, "loss": 0.5351656675338745, "mean_token_accuracy": 0.7814910113811493, "num_tokens": 10746077.0, "step": 659 }, { "entropy": 0.5235352218151093, "epoch": 2.4719101123595504, "grad_norm": 0.040084533393383026, "learning_rate": 0.0002, "loss": 0.531356692314148, "mean_token_accuracy": 0.7839406430721283, "num_tokens": 10762311.0, "step": 660 }, { "entropy": 0.5389134883880615, "epoch": 2.4756554307116105, "grad_norm": 0.05371229350566864, "learning_rate": 0.0002, "loss": 0.5532786250114441, "mean_token_accuracy": 0.7754277139902115, "num_tokens": 10778652.0, "step": 661 }, { "entropy": 0.5187595188617706, "epoch": 2.4794007490636703, "grad_norm": 0.03975149244070053, "learning_rate": 0.0002, "loss": 0.5151571035385132, "mean_token_accuracy": 0.7930901050567627, "num_tokens": 10794746.0, "step": 662 }, { "entropy": 0.5426436811685562, "epoch": 2.4831460674157304, "grad_norm": 0.03997328504920006, "learning_rate": 0.0002, "loss": 0.5403225421905518, "mean_token_accuracy": 0.7798904031515121, "num_tokens": 10811033.0, "step": 663 }, { "entropy": 0.5267360359430313, "epoch": 2.48689138576779, "grad_norm": 0.043838318437337875, "learning_rate": 0.0002, "loss": 0.526395320892334, "mean_token_accuracy": 0.7879899889230728, "num_tokens": 10827129.0, "step": 664 }, { "entropy": 0.5509849190711975, "epoch": 2.4906367041198503, "grad_norm": 0.037469275295734406, "learning_rate": 0.0002, "loss": 0.5411713719367981, "mean_token_accuracy": 0.7808174937963486, "num_tokens": 10843435.0, "step": 665 }, { "entropy": 0.5449976474046707, "epoch": 2.49438202247191, "grad_norm": 0.05326893553137779, "learning_rate": 0.0002, "loss": 0.5467808842658997, "mean_token_accuracy": 0.7777620851993561, "num_tokens": 10859523.0, "step": 666 }, { "entropy": 0.5301449000835419, "epoch": 2.49812734082397, "grad_norm": 0.04426975175738335, "learning_rate": 0.0002, "loss": 0.5359491109848022, "mean_token_accuracy": 0.7841154336929321, "num_tokens": 10875805.0, "step": 667 }, { "entropy": 0.5325603634119034, "epoch": 2.50187265917603, "grad_norm": 0.04210103675723076, "learning_rate": 0.0002, "loss": 0.5365734100341797, "mean_token_accuracy": 0.782084509730339, "num_tokens": 10892315.0, "step": 668 }, { "entropy": 0.5456321388483047, "epoch": 2.50561797752809, "grad_norm": 0.03740176558494568, "learning_rate": 0.0002, "loss": 0.5444263219833374, "mean_token_accuracy": 0.7780910581350327, "num_tokens": 10908850.0, "step": 669 }, { "entropy": 0.5338556170463562, "epoch": 2.5093632958801497, "grad_norm": 0.04143742844462395, "learning_rate": 0.0002, "loss": 0.5300049185752869, "mean_token_accuracy": 0.787174180150032, "num_tokens": 10925106.0, "step": 670 }, { "entropy": 0.5515117049217224, "epoch": 2.51310861423221, "grad_norm": 0.03918025270104408, "learning_rate": 0.0002, "loss": 0.542182445526123, "mean_token_accuracy": 0.7806340008974075, "num_tokens": 10941543.0, "step": 671 }, { "entropy": 0.5549922436475754, "epoch": 2.5168539325842696, "grad_norm": 0.04009648784995079, "learning_rate": 0.0002, "loss": 0.5559307932853699, "mean_token_accuracy": 0.7725488841533661, "num_tokens": 10957817.0, "step": 672 }, { "entropy": 0.539954200387001, "epoch": 2.5205992509363297, "grad_norm": 0.04543929174542427, "learning_rate": 0.0002, "loss": 0.5482618808746338, "mean_token_accuracy": 0.7789554446935654, "num_tokens": 10974119.0, "step": 673 }, { "entropy": 0.5211862847208977, "epoch": 2.5243445692883895, "grad_norm": 0.0385296531021595, "learning_rate": 0.0002, "loss": 0.5304719805717468, "mean_token_accuracy": 0.7863713204860687, "num_tokens": 10990490.0, "step": 674 }, { "entropy": 0.5547338724136353, "epoch": 2.5280898876404496, "grad_norm": 0.047472305595874786, "learning_rate": 0.0002, "loss": 0.5596637725830078, "mean_token_accuracy": 0.771984726190567, "num_tokens": 11007150.0, "step": 675 }, { "entropy": 0.5423361957073212, "epoch": 2.5318352059925093, "grad_norm": 0.03454773128032684, "learning_rate": 0.0002, "loss": 0.5381237268447876, "mean_token_accuracy": 0.7808732390403748, "num_tokens": 11023385.0, "step": 676 }, { "entropy": 0.5561535805463791, "epoch": 2.535580524344569, "grad_norm": 0.03847538307309151, "learning_rate": 0.0002, "loss": 0.5428014993667603, "mean_token_accuracy": 0.7786359935998917, "num_tokens": 11039943.0, "step": 677 }, { "entropy": 0.544300451874733, "epoch": 2.539325842696629, "grad_norm": 0.04131785407662392, "learning_rate": 0.0002, "loss": 0.5334832668304443, "mean_token_accuracy": 0.7851458042860031, "num_tokens": 11056430.0, "step": 678 }, { "entropy": 0.5311527848243713, "epoch": 2.5430711610486894, "grad_norm": 0.03951219096779823, "learning_rate": 0.0002, "loss": 0.5389747023582458, "mean_token_accuracy": 0.7813056856393814, "num_tokens": 11072776.0, "step": 679 }, { "entropy": 0.5290235728025436, "epoch": 2.546816479400749, "grad_norm": 0.0438111387193203, "learning_rate": 0.0002, "loss": 0.5451354384422302, "mean_token_accuracy": 0.7777683436870575, "num_tokens": 11088991.0, "step": 680 }, { "entropy": 0.5291692391037941, "epoch": 2.550561797752809, "grad_norm": 0.039012420922517776, "learning_rate": 0.0002, "loss": 0.5386437773704529, "mean_token_accuracy": 0.7806796282529831, "num_tokens": 11105235.0, "step": 681 }, { "entropy": 0.5217102319002151, "epoch": 2.554307116104869, "grad_norm": 0.04288937896490097, "learning_rate": 0.0002, "loss": 0.5323805809020996, "mean_token_accuracy": 0.7835096120834351, "num_tokens": 11121333.0, "step": 682 }, { "entropy": 0.5252867043018341, "epoch": 2.558052434456929, "grad_norm": 0.0371013842523098, "learning_rate": 0.0002, "loss": 0.5191121101379395, "mean_token_accuracy": 0.7874591499567032, "num_tokens": 11137249.0, "step": 683 }, { "entropy": 0.5371126532554626, "epoch": 2.561797752808989, "grad_norm": 0.03830140084028244, "learning_rate": 0.0002, "loss": 0.5264033675193787, "mean_token_accuracy": 0.7881854623556137, "num_tokens": 11153699.0, "step": 684 }, { "entropy": 0.5386142879724503, "epoch": 2.5655430711610485, "grad_norm": 0.035421278327703476, "learning_rate": 0.0002, "loss": 0.5367159247398376, "mean_token_accuracy": 0.7793221473693848, "num_tokens": 11170196.0, "step": 685 }, { "entropy": 0.5483710169792175, "epoch": 2.5692883895131087, "grad_norm": 0.04288771376013756, "learning_rate": 0.0002, "loss": 0.5506448149681091, "mean_token_accuracy": 0.7785434424877167, "num_tokens": 11186770.0, "step": 686 }, { "entropy": 0.5472489446401596, "epoch": 2.5730337078651684, "grad_norm": 0.04111029580235481, "learning_rate": 0.0002, "loss": 0.5503485798835754, "mean_token_accuracy": 0.7765214443206787, "num_tokens": 11203191.0, "step": 687 }, { "entropy": 0.523987427353859, "epoch": 2.5767790262172285, "grad_norm": 0.04419523477554321, "learning_rate": 0.0002, "loss": 0.5254223942756653, "mean_token_accuracy": 0.7858942598104477, "num_tokens": 11219530.0, "step": 688 }, { "entropy": 0.5482724606990814, "epoch": 2.5805243445692883, "grad_norm": 0.0384112112224102, "learning_rate": 0.0002, "loss": 0.5467587113380432, "mean_token_accuracy": 0.7784788310527802, "num_tokens": 11236013.0, "step": 689 }, { "entropy": 0.5410710424184799, "epoch": 2.5842696629213484, "grad_norm": 0.04548390954732895, "learning_rate": 0.0002, "loss": 0.5361588001251221, "mean_token_accuracy": 0.7842984944581985, "num_tokens": 11252349.0, "step": 690 }, { "entropy": 0.5413189381361008, "epoch": 2.588014981273408, "grad_norm": 0.03719467297196388, "learning_rate": 0.0002, "loss": 0.5372804403305054, "mean_token_accuracy": 0.7805864661931992, "num_tokens": 11268637.0, "step": 691 }, { "entropy": 0.5587044954299927, "epoch": 2.5917602996254683, "grad_norm": 0.03943658620119095, "learning_rate": 0.0002, "loss": 0.556570291519165, "mean_token_accuracy": 0.7712628394365311, "num_tokens": 11284973.0, "step": 692 }, { "entropy": 0.5220051556825638, "epoch": 2.595505617977528, "grad_norm": 0.04577549174427986, "learning_rate": 0.0002, "loss": 0.5235053896903992, "mean_token_accuracy": 0.7874717712402344, "num_tokens": 11301234.0, "step": 693 }, { "entropy": 0.5253131091594696, "epoch": 2.599250936329588, "grad_norm": 0.055322322994470596, "learning_rate": 0.0002, "loss": 0.539014458656311, "mean_token_accuracy": 0.7832715809345245, "num_tokens": 11317622.0, "step": 694 }, { "entropy": 0.529956579208374, "epoch": 2.602996254681648, "grad_norm": 0.04555559530854225, "learning_rate": 0.0002, "loss": 0.5358556509017944, "mean_token_accuracy": 0.7829083502292633, "num_tokens": 11334260.0, "step": 695 }, { "entropy": 0.5464101433753967, "epoch": 2.606741573033708, "grad_norm": 0.04112941771745682, "learning_rate": 0.0002, "loss": 0.5475582480430603, "mean_token_accuracy": 0.780443549156189, "num_tokens": 11350510.0, "step": 696 }, { "entropy": 0.5290370956063271, "epoch": 2.6104868913857677, "grad_norm": 0.03645879402756691, "learning_rate": 0.0002, "loss": 0.5310324430465698, "mean_token_accuracy": 0.7870594263076782, "num_tokens": 11366960.0, "step": 697 }, { "entropy": 0.5584116280078888, "epoch": 2.6142322097378274, "grad_norm": 0.03702421113848686, "learning_rate": 0.0002, "loss": 0.5555626153945923, "mean_token_accuracy": 0.7766379117965698, "num_tokens": 11383705.0, "step": 698 }, { "entropy": 0.5311998277902603, "epoch": 2.6179775280898876, "grad_norm": 0.039902858436107635, "learning_rate": 0.0002, "loss": 0.5329570770263672, "mean_token_accuracy": 0.7843590825796127, "num_tokens": 11399770.0, "step": 699 }, { "entropy": 0.5450660437345505, "epoch": 2.6217228464419478, "grad_norm": 0.040915053337812424, "learning_rate": 0.0002, "loss": 0.5421010851860046, "mean_token_accuracy": 0.7778819799423218, "num_tokens": 11416143.0, "step": 700 }, { "entropy": 0.5301565080881119, "epoch": 2.6254681647940075, "grad_norm": 0.04668205976486206, "learning_rate": 0.0002, "loss": 0.542178750038147, "mean_token_accuracy": 0.7808790653944016, "num_tokens": 11432391.0, "step": 701 }, { "entropy": 0.5262583941221237, "epoch": 2.629213483146067, "grad_norm": 0.044074323028326035, "learning_rate": 0.0002, "loss": 0.528965413570404, "mean_token_accuracy": 0.7844109088182449, "num_tokens": 11448787.0, "step": 702 }, { "entropy": 0.5375534892082214, "epoch": 2.6329588014981273, "grad_norm": 0.046261075884103775, "learning_rate": 0.0002, "loss": 0.5426000952720642, "mean_token_accuracy": 0.7772792726755142, "num_tokens": 11464834.0, "step": 703 }, { "entropy": 0.5281456708908081, "epoch": 2.6367041198501875, "grad_norm": 0.04074921831488609, "learning_rate": 0.0002, "loss": 0.5224668979644775, "mean_token_accuracy": 0.7867994755506516, "num_tokens": 11481010.0, "step": 704 }, { "entropy": 0.5607274174690247, "epoch": 2.640449438202247, "grad_norm": 0.04910429194569588, "learning_rate": 0.0002, "loss": 0.5609941482543945, "mean_token_accuracy": 0.7746099084615707, "num_tokens": 11497290.0, "step": 705 }, { "entropy": 0.5405243337154388, "epoch": 2.644194756554307, "grad_norm": 0.042494796216487885, "learning_rate": 0.0002, "loss": 0.5373457670211792, "mean_token_accuracy": 0.7792738676071167, "num_tokens": 11513583.0, "step": 706 }, { "entropy": 0.5465130656957626, "epoch": 2.647940074906367, "grad_norm": 0.051266275346279144, "learning_rate": 0.0002, "loss": 0.5519081950187683, "mean_token_accuracy": 0.7757825553417206, "num_tokens": 11530012.0, "step": 707 }, { "entropy": 0.5431560575962067, "epoch": 2.6516853932584272, "grad_norm": 0.03533034771680832, "learning_rate": 0.0002, "loss": 0.5461572408676147, "mean_token_accuracy": 0.7784530967473984, "num_tokens": 11546456.0, "step": 708 }, { "entropy": 0.5154132097959518, "epoch": 2.655430711610487, "grad_norm": 0.04611873999238014, "learning_rate": 0.0002, "loss": 0.5180613398551941, "mean_token_accuracy": 0.7888959646224976, "num_tokens": 11562883.0, "step": 709 }, { "entropy": 0.5712718665599823, "epoch": 2.6591760299625467, "grad_norm": 0.03861664608120918, "learning_rate": 0.0002, "loss": 0.5646159052848816, "mean_token_accuracy": 0.7710563838481903, "num_tokens": 11579392.0, "step": 710 }, { "entropy": 0.5572114437818527, "epoch": 2.662921348314607, "grad_norm": 0.04512866213917732, "learning_rate": 0.0002, "loss": 0.551059901714325, "mean_token_accuracy": 0.7758464813232422, "num_tokens": 11595937.0, "step": 711 }, { "entropy": 0.5336201041936874, "epoch": 2.6666666666666665, "grad_norm": 0.042362719774246216, "learning_rate": 0.0002, "loss": 0.5347069501876831, "mean_token_accuracy": 0.7828791737556458, "num_tokens": 11612066.0, "step": 712 }, { "entropy": 0.5221793055534363, "epoch": 2.6704119850187267, "grad_norm": 0.04037570580840111, "learning_rate": 0.0002, "loss": 0.523446261882782, "mean_token_accuracy": 0.7888407558202744, "num_tokens": 11628437.0, "step": 713 }, { "entropy": 0.5422008782625198, "epoch": 2.6741573033707864, "grad_norm": 0.04662792757153511, "learning_rate": 0.0002, "loss": 0.555385947227478, "mean_token_accuracy": 0.7747650295495987, "num_tokens": 11644722.0, "step": 714 }, { "entropy": 0.5356374382972717, "epoch": 2.6779026217228465, "grad_norm": 0.03770140931010246, "learning_rate": 0.0002, "loss": 0.5397407412528992, "mean_token_accuracy": 0.77961665391922, "num_tokens": 11661403.0, "step": 715 }, { "entropy": 0.5477268397808075, "epoch": 2.6816479400749063, "grad_norm": 0.04137538745999336, "learning_rate": 0.0002, "loss": 0.5421797633171082, "mean_token_accuracy": 0.7774805575609207, "num_tokens": 11677740.0, "step": 716 }, { "entropy": 0.5390584021806717, "epoch": 2.6853932584269664, "grad_norm": 0.04397116228938103, "learning_rate": 0.0002, "loss": 0.5323628187179565, "mean_token_accuracy": 0.7813891172409058, "num_tokens": 11693755.0, "step": 717 }, { "entropy": 0.5430156886577606, "epoch": 2.689138576779026, "grad_norm": 0.03867118060588837, "learning_rate": 0.0002, "loss": 0.5338262319564819, "mean_token_accuracy": 0.7821642309427261, "num_tokens": 11710311.0, "step": 718 }, { "entropy": 0.5369475930929184, "epoch": 2.6928838951310863, "grad_norm": 0.03773213177919388, "learning_rate": 0.0002, "loss": 0.5436868071556091, "mean_token_accuracy": 0.7776243984699249, "num_tokens": 11726751.0, "step": 719 }, { "entropy": 0.5204776674509048, "epoch": 2.696629213483146, "grad_norm": 0.045796290040016174, "learning_rate": 0.0002, "loss": 0.5366164445877075, "mean_token_accuracy": 0.7829219549894333, "num_tokens": 11743104.0, "step": 720 }, { "entropy": 0.5444348156452179, "epoch": 2.700374531835206, "grad_norm": 0.041639544069767, "learning_rate": 0.0002, "loss": 0.5522270202636719, "mean_token_accuracy": 0.7758014649152756, "num_tokens": 11759143.0, "step": 721 }, { "entropy": 0.5301756113767624, "epoch": 2.704119850187266, "grad_norm": 0.04008952155709267, "learning_rate": 0.0002, "loss": 0.5239149928092957, "mean_token_accuracy": 0.7852831333875656, "num_tokens": 11775647.0, "step": 722 }, { "entropy": 0.5141435042023659, "epoch": 2.7078651685393256, "grad_norm": 0.03991787135601044, "learning_rate": 0.0002, "loss": 0.5066305994987488, "mean_token_accuracy": 0.7961233854293823, "num_tokens": 11791695.0, "step": 723 }, { "entropy": 0.5294996351003647, "epoch": 2.7116104868913857, "grad_norm": 0.03514706343412399, "learning_rate": 0.0002, "loss": 0.5277984738349915, "mean_token_accuracy": 0.7842394113540649, "num_tokens": 11807908.0, "step": 724 }, { "entropy": 0.553158238530159, "epoch": 2.715355805243446, "grad_norm": 0.0371016301214695, "learning_rate": 0.0002, "loss": 0.5542132258415222, "mean_token_accuracy": 0.7742846459150314, "num_tokens": 11824455.0, "step": 725 }, { "entropy": 0.5377026200294495, "epoch": 2.7191011235955056, "grad_norm": 0.04648866876959801, "learning_rate": 0.0002, "loss": 0.5486031770706177, "mean_token_accuracy": 0.7776967585086823, "num_tokens": 11840615.0, "step": 726 }, { "entropy": 0.5500117689371109, "epoch": 2.7228464419475653, "grad_norm": 0.03958411142230034, "learning_rate": 0.0002, "loss": 0.5574382543563843, "mean_token_accuracy": 0.7707358449697495, "num_tokens": 11856804.0, "step": 727 }, { "entropy": 0.5287734270095825, "epoch": 2.7265917602996255, "grad_norm": 0.039377059787511826, "learning_rate": 0.0002, "loss": 0.5284842848777771, "mean_token_accuracy": 0.7842006385326385, "num_tokens": 11872824.0, "step": 728 }, { "entropy": 0.5455043613910675, "epoch": 2.7303370786516856, "grad_norm": 0.038099173456430435, "learning_rate": 0.0002, "loss": 0.5363825559616089, "mean_token_accuracy": 0.7839681655168533, "num_tokens": 11889236.0, "step": 729 }, { "entropy": 0.5231508985161781, "epoch": 2.7340823970037453, "grad_norm": 0.04386546462774277, "learning_rate": 0.0002, "loss": 0.5231119394302368, "mean_token_accuracy": 0.7876169681549072, "num_tokens": 11905504.0, "step": 730 }, { "entropy": 0.5425267070531845, "epoch": 2.737827715355805, "grad_norm": 0.03880799189209938, "learning_rate": 0.0002, "loss": 0.5381489992141724, "mean_token_accuracy": 0.7835936099290848, "num_tokens": 11922030.0, "step": 731 }, { "entropy": 0.5379330962896347, "epoch": 2.741573033707865, "grad_norm": 0.04163983464241028, "learning_rate": 0.0002, "loss": 0.5459231734275818, "mean_token_accuracy": 0.7755035907030106, "num_tokens": 11938351.0, "step": 732 }, { "entropy": 0.5344593375921249, "epoch": 2.7453183520599254, "grad_norm": 0.03764946386218071, "learning_rate": 0.0002, "loss": 0.5335820913314819, "mean_token_accuracy": 0.7851902097463608, "num_tokens": 11954720.0, "step": 733 }, { "entropy": 0.5275440439581871, "epoch": 2.749063670411985, "grad_norm": 0.041039030998945236, "learning_rate": 0.0002, "loss": 0.5316729545593262, "mean_token_accuracy": 0.784284695982933, "num_tokens": 11970943.0, "step": 734 }, { "entropy": 0.5440046042203903, "epoch": 2.752808988764045, "grad_norm": 0.03777683153748512, "learning_rate": 0.0002, "loss": 0.5479453802108765, "mean_token_accuracy": 0.7796096056699753, "num_tokens": 11987274.0, "step": 735 }, { "entropy": 0.5314242094755173, "epoch": 2.756554307116105, "grad_norm": 0.04298453778028488, "learning_rate": 0.0002, "loss": 0.5360277891159058, "mean_token_accuracy": 0.7836730033159256, "num_tokens": 12003645.0, "step": 736 }, { "entropy": 0.5434319823980331, "epoch": 2.7602996254681647, "grad_norm": 0.038422685116529465, "learning_rate": 0.0002, "loss": 0.5429157614707947, "mean_token_accuracy": 0.7770098298788071, "num_tokens": 12020104.0, "step": 737 }, { "entropy": 0.5382603704929352, "epoch": 2.764044943820225, "grad_norm": 0.04176581650972366, "learning_rate": 0.0002, "loss": 0.5365764498710632, "mean_token_accuracy": 0.7839252799749374, "num_tokens": 12036423.0, "step": 738 }, { "entropy": 0.5331043303012848, "epoch": 2.7677902621722845, "grad_norm": 0.04350239410996437, "learning_rate": 0.0002, "loss": 0.5356451272964478, "mean_token_accuracy": 0.7829470187425613, "num_tokens": 12052564.0, "step": 739 }, { "entropy": 0.5245354026556015, "epoch": 2.7715355805243447, "grad_norm": 0.04295556619763374, "learning_rate": 0.0002, "loss": 0.5335471034049988, "mean_token_accuracy": 0.7844749689102173, "num_tokens": 12068677.0, "step": 740 }, { "entropy": 0.5476740896701813, "epoch": 2.7752808988764044, "grad_norm": 0.04540206119418144, "learning_rate": 0.0002, "loss": 0.552383542060852, "mean_token_accuracy": 0.7785235494375229, "num_tokens": 12085174.0, "step": 741 }, { "entropy": 0.5276885330677032, "epoch": 2.7790262172284645, "grad_norm": 0.03786449506878853, "learning_rate": 0.0002, "loss": 0.5295007228851318, "mean_token_accuracy": 0.7848162055015564, "num_tokens": 12101546.0, "step": 742 }, { "entropy": 0.5504680871963501, "epoch": 2.7827715355805243, "grad_norm": 0.04417780414223671, "learning_rate": 0.0002, "loss": 0.5459782481193542, "mean_token_accuracy": 0.7778183221817017, "num_tokens": 12117833.0, "step": 743 }, { "entropy": 0.5514437556266785, "epoch": 2.7865168539325844, "grad_norm": 0.03677407279610634, "learning_rate": 0.0002, "loss": 0.5444294810295105, "mean_token_accuracy": 0.7822880744934082, "num_tokens": 12134076.0, "step": 744 }, { "entropy": 0.544072225689888, "epoch": 2.790262172284644, "grad_norm": 0.04843369498848915, "learning_rate": 0.0002, "loss": 0.5418300628662109, "mean_token_accuracy": 0.7809806764125824, "num_tokens": 12149991.0, "step": 745 }, { "entropy": 0.5447394847869873, "epoch": 2.7940074906367043, "grad_norm": 0.04489225894212723, "learning_rate": 0.0002, "loss": 0.5485548377037048, "mean_token_accuracy": 0.7752929180860519, "num_tokens": 12166319.0, "step": 746 }, { "entropy": 0.5193701684474945, "epoch": 2.797752808988764, "grad_norm": 0.04051094502210617, "learning_rate": 0.0002, "loss": 0.5254422426223755, "mean_token_accuracy": 0.7868325263261795, "num_tokens": 12182585.0, "step": 747 }, { "entropy": 0.533800944685936, "epoch": 2.8014981273408237, "grad_norm": 0.03557295724749565, "learning_rate": 0.0002, "loss": 0.5316165089607239, "mean_token_accuracy": 0.7825881540775299, "num_tokens": 12198769.0, "step": 748 }, { "entropy": 0.534054160118103, "epoch": 2.805243445692884, "grad_norm": 0.04074644669890404, "learning_rate": 0.0002, "loss": 0.5342618823051453, "mean_token_accuracy": 0.7828291058540344, "num_tokens": 12215003.0, "step": 749 }, { "entropy": 0.5486414730548859, "epoch": 2.808988764044944, "grad_norm": 0.04066525399684906, "learning_rate": 0.0002, "loss": 0.5566014647483826, "mean_token_accuracy": 0.7741669267416, "num_tokens": 12231307.0, "step": 750 }, { "entropy": 0.5236565172672272, "epoch": 2.8127340823970037, "grad_norm": 0.03859638050198555, "learning_rate": 0.0002, "loss": 0.5243086218833923, "mean_token_accuracy": 0.7863422483205795, "num_tokens": 12247563.0, "step": 751 }, { "entropy": 0.5354926288127899, "epoch": 2.8164794007490634, "grad_norm": 0.040070392191410065, "learning_rate": 0.0002, "loss": 0.5424857139587402, "mean_token_accuracy": 0.7793509066104889, "num_tokens": 12263768.0, "step": 752 }, { "entropy": 0.5465504974126816, "epoch": 2.8202247191011236, "grad_norm": 0.04251793026924133, "learning_rate": 0.0002, "loss": 0.5422512292861938, "mean_token_accuracy": 0.7784619033336639, "num_tokens": 12280224.0, "step": 753 }, { "entropy": 0.5511007905006409, "epoch": 2.8239700374531838, "grad_norm": 0.03704281151294708, "learning_rate": 0.0002, "loss": 0.5432584285736084, "mean_token_accuracy": 0.7793723195791245, "num_tokens": 12296720.0, "step": 754 }, { "entropy": 0.5557062178850174, "epoch": 2.8277153558052435, "grad_norm": 0.04253645986318588, "learning_rate": 0.0002, "loss": 0.5526583194732666, "mean_token_accuracy": 0.7777480781078339, "num_tokens": 12313013.0, "step": 755 }, { "entropy": 0.5158669054508209, "epoch": 2.831460674157303, "grad_norm": 0.036200929433107376, "learning_rate": 0.0002, "loss": 0.5140800476074219, "mean_token_accuracy": 0.7922120690345764, "num_tokens": 12328987.0, "step": 756 }, { "entropy": 0.5495094060897827, "epoch": 2.8352059925093633, "grad_norm": 0.04025623947381973, "learning_rate": 0.0002, "loss": 0.5524377226829529, "mean_token_accuracy": 0.7765700370073318, "num_tokens": 12345487.0, "step": 757 }, { "entropy": 0.5472595542669296, "epoch": 2.8389513108614235, "grad_norm": 0.037925150245428085, "learning_rate": 0.0002, "loss": 0.5513643622398376, "mean_token_accuracy": 0.7754906117916107, "num_tokens": 12362003.0, "step": 758 }, { "entropy": 0.5349185019731522, "epoch": 2.842696629213483, "grad_norm": 0.04107813537120819, "learning_rate": 0.0002, "loss": 0.5352935791015625, "mean_token_accuracy": 0.785232812166214, "num_tokens": 12378308.0, "step": 759 }, { "entropy": 0.5332917869091034, "epoch": 2.846441947565543, "grad_norm": 0.0485457181930542, "learning_rate": 0.0002, "loss": 0.5407130122184753, "mean_token_accuracy": 0.7778820097446442, "num_tokens": 12394745.0, "step": 760 }, { "entropy": 0.5373108834028244, "epoch": 2.850187265917603, "grad_norm": 0.045551612973213196, "learning_rate": 0.0002, "loss": 0.5431134104728699, "mean_token_accuracy": 0.7788770198822021, "num_tokens": 12410653.0, "step": 761 }, { "entropy": 0.5553153157234192, "epoch": 2.853932584269663, "grad_norm": 0.042994849383831024, "learning_rate": 0.0002, "loss": 0.5521018505096436, "mean_token_accuracy": 0.7741047441959381, "num_tokens": 12426820.0, "step": 762 }, { "entropy": 0.5405306816101074, "epoch": 2.857677902621723, "grad_norm": 0.03894044831395149, "learning_rate": 0.0002, "loss": 0.5416905283927917, "mean_token_accuracy": 0.7816338688135147, "num_tokens": 12443026.0, "step": 763 }, { "entropy": 0.5384278744459152, "epoch": 2.8614232209737827, "grad_norm": 0.04121169447898865, "learning_rate": 0.0002, "loss": 0.5407273769378662, "mean_token_accuracy": 0.7787628769874573, "num_tokens": 12459216.0, "step": 764 }, { "entropy": 0.5316817611455917, "epoch": 2.865168539325843, "grad_norm": 0.05211913585662842, "learning_rate": 0.0002, "loss": 0.5382348895072937, "mean_token_accuracy": 0.7807497531175613, "num_tokens": 12475540.0, "step": 765 }, { "entropy": 0.5411743521690369, "epoch": 2.8689138576779025, "grad_norm": 0.05021794140338898, "learning_rate": 0.0002, "loss": 0.5549106001853943, "mean_token_accuracy": 0.7732493728399277, "num_tokens": 12491791.0, "step": 766 }, { "entropy": 0.5427963435649872, "epoch": 2.8726591760299627, "grad_norm": 0.048997581005096436, "learning_rate": 0.0002, "loss": 0.5405234694480896, "mean_token_accuracy": 0.7799372375011444, "num_tokens": 12508102.0, "step": 767 }, { "entropy": 0.5702031701803207, "epoch": 2.8764044943820224, "grad_norm": 0.035217706114053726, "learning_rate": 0.0002, "loss": 0.5628358721733093, "mean_token_accuracy": 0.7744450867176056, "num_tokens": 12524674.0, "step": 768 }, { "entropy": 0.5263065099716187, "epoch": 2.8801498127340825, "grad_norm": 0.04417087137699127, "learning_rate": 0.0002, "loss": 0.5192127227783203, "mean_token_accuracy": 0.7900556176900864, "num_tokens": 12540700.0, "step": 769 }, { "entropy": 0.5679396241903305, "epoch": 2.8838951310861423, "grad_norm": 0.038472775369882584, "learning_rate": 0.0002, "loss": 0.5629768967628479, "mean_token_accuracy": 0.7697183936834335, "num_tokens": 12557124.0, "step": 770 }, { "entropy": 0.541569247841835, "epoch": 2.8876404494382024, "grad_norm": 0.04340888932347298, "learning_rate": 0.0002, "loss": 0.5380176901817322, "mean_token_accuracy": 0.7819050699472427, "num_tokens": 12573582.0, "step": 771 }, { "entropy": 0.5244268327951431, "epoch": 2.891385767790262, "grad_norm": 0.043049633502960205, "learning_rate": 0.0002, "loss": 0.5338467955589294, "mean_token_accuracy": 0.7832711786031723, "num_tokens": 12589568.0, "step": 772 }, { "entropy": 0.5213008224964142, "epoch": 2.895131086142322, "grad_norm": 0.05456610396504402, "learning_rate": 0.0002, "loss": 0.5332724452018738, "mean_token_accuracy": 0.7851873487234116, "num_tokens": 12605650.0, "step": 773 }, { "entropy": 0.5455889403820038, "epoch": 2.898876404494382, "grad_norm": 0.04193198308348656, "learning_rate": 0.0002, "loss": 0.5584859251976013, "mean_token_accuracy": 0.7724700570106506, "num_tokens": 12621922.0, "step": 774 }, { "entropy": 0.5487163811922073, "epoch": 2.902621722846442, "grad_norm": 0.03447289392352104, "learning_rate": 0.0002, "loss": 0.5422307252883911, "mean_token_accuracy": 0.779036745429039, "num_tokens": 12638171.0, "step": 775 }, { "entropy": 0.5613754689693451, "epoch": 2.906367041198502, "grad_norm": 0.03812362253665924, "learning_rate": 0.0002, "loss": 0.5491812229156494, "mean_token_accuracy": 0.7774574309587479, "num_tokens": 12654497.0, "step": 776 }, { "entropy": 0.5419997125864029, "epoch": 2.9101123595505616, "grad_norm": 0.03889596462249756, "learning_rate": 0.0002, "loss": 0.5366528630256653, "mean_token_accuracy": 0.7796314209699631, "num_tokens": 12671014.0, "step": 777 }, { "entropy": 0.5404350906610489, "epoch": 2.9138576779026217, "grad_norm": 0.03634997084736824, "learning_rate": 0.0002, "loss": 0.5370875000953674, "mean_token_accuracy": 0.7817376554012299, "num_tokens": 12687252.0, "step": 778 }, { "entropy": 0.5554278641939163, "epoch": 2.917602996254682, "grad_norm": 0.04131067916750908, "learning_rate": 0.0002, "loss": 0.5544486045837402, "mean_token_accuracy": 0.774728998541832, "num_tokens": 12703762.0, "step": 779 }, { "entropy": 0.5132855176925659, "epoch": 2.9213483146067416, "grad_norm": 0.041993558406829834, "learning_rate": 0.0002, "loss": 0.5225546360015869, "mean_token_accuracy": 0.7885993123054504, "num_tokens": 12720070.0, "step": 780 }, { "entropy": 0.5195116326212883, "epoch": 2.9250936329588013, "grad_norm": 0.045502807945013046, "learning_rate": 0.0002, "loss": 0.5276657938957214, "mean_token_accuracy": 0.7835886776447296, "num_tokens": 12736079.0, "step": 781 }, { "entropy": 0.5291299819946289, "epoch": 2.9288389513108615, "grad_norm": 0.04560597985982895, "learning_rate": 0.0002, "loss": 0.5367044806480408, "mean_token_accuracy": 0.7813848108053207, "num_tokens": 12752163.0, "step": 782 }, { "entropy": 0.5446918457746506, "epoch": 2.932584269662921, "grad_norm": 0.04057231545448303, "learning_rate": 0.0002, "loss": 0.5368906259536743, "mean_token_accuracy": 0.7825321704149246, "num_tokens": 12768377.0, "step": 783 }, { "entropy": 0.5624755024909973, "epoch": 2.9363295880149813, "grad_norm": 0.04997701197862625, "learning_rate": 0.0002, "loss": 0.5559151768684387, "mean_token_accuracy": 0.7733145207166672, "num_tokens": 12784692.0, "step": 784 }, { "entropy": 0.5384950041770935, "epoch": 2.940074906367041, "grad_norm": 0.04062885046005249, "learning_rate": 0.0002, "loss": 0.536974310874939, "mean_token_accuracy": 0.7846025824546814, "num_tokens": 12800887.0, "step": 785 }, { "entropy": 0.5255657434463501, "epoch": 2.943820224719101, "grad_norm": 0.044986989349126816, "learning_rate": 0.0002, "loss": 0.5352227091789246, "mean_token_accuracy": 0.7826129198074341, "num_tokens": 12817261.0, "step": 786 }, { "entropy": 0.532112181186676, "epoch": 2.947565543071161, "grad_norm": 0.04506840929389, "learning_rate": 0.0002, "loss": 0.5401644110679626, "mean_token_accuracy": 0.7819447070360184, "num_tokens": 12833628.0, "step": 787 }, { "entropy": 0.5532176345586777, "epoch": 2.951310861423221, "grad_norm": 0.047445181757211685, "learning_rate": 0.0002, "loss": 0.5567490458488464, "mean_token_accuracy": 0.7756209075450897, "num_tokens": 12850048.0, "step": 788 }, { "entropy": 0.5571421086788177, "epoch": 2.955056179775281, "grad_norm": 0.03836369141936302, "learning_rate": 0.0002, "loss": 0.5471166968345642, "mean_token_accuracy": 0.7780868262052536, "num_tokens": 12866382.0, "step": 789 }, { "entropy": 0.5684118866920471, "epoch": 2.958801498127341, "grad_norm": 0.03691793233156204, "learning_rate": 0.0002, "loss": 0.5584673285484314, "mean_token_accuracy": 0.7734033614397049, "num_tokens": 12882861.0, "step": 790 }, { "entropy": 0.5417571067810059, "epoch": 2.9625468164794007, "grad_norm": 0.03854163736104965, "learning_rate": 0.0002, "loss": 0.5380803346633911, "mean_token_accuracy": 0.7819686830043793, "num_tokens": 12898999.0, "step": 791 }, { "entropy": 0.5183953493833542, "epoch": 2.966292134831461, "grad_norm": 0.04670790210366249, "learning_rate": 0.0002, "loss": 0.527891993522644, "mean_token_accuracy": 0.7858579158782959, "num_tokens": 12915160.0, "step": 792 }, { "entropy": 0.5315932035446167, "epoch": 2.9700374531835205, "grad_norm": 0.05011628568172455, "learning_rate": 0.0002, "loss": 0.5408577919006348, "mean_token_accuracy": 0.7781645357608795, "num_tokens": 12931387.0, "step": 793 }, { "entropy": 0.533274233341217, "epoch": 2.9737827715355807, "grad_norm": 0.038501009345054626, "learning_rate": 0.0002, "loss": 0.5422831773757935, "mean_token_accuracy": 0.7777345776557922, "num_tokens": 12947630.0, "step": 794 }, { "entropy": 0.5588134974241257, "epoch": 2.9775280898876404, "grad_norm": 0.04206021502614021, "learning_rate": 0.0002, "loss": 0.5564273595809937, "mean_token_accuracy": 0.7733636498451233, "num_tokens": 12964026.0, "step": 795 }, { "entropy": 0.5579260289669037, "epoch": 2.9812734082397006, "grad_norm": 0.04490978643298149, "learning_rate": 0.0002, "loss": 0.5504725575447083, "mean_token_accuracy": 0.7786446362733841, "num_tokens": 12980554.0, "step": 796 }, { "entropy": 0.541483461856842, "epoch": 2.9850187265917603, "grad_norm": 0.03570273146033287, "learning_rate": 0.0002, "loss": 0.5293324589729309, "mean_token_accuracy": 0.783537819981575, "num_tokens": 12996979.0, "step": 797 }, { "entropy": 0.5362358242273331, "epoch": 2.98876404494382, "grad_norm": 0.04825478047132492, "learning_rate": 0.0002, "loss": 0.5365868210792542, "mean_token_accuracy": 0.7838873118162155, "num_tokens": 13013323.0, "step": 798 }, { "entropy": 0.5404023975133896, "epoch": 2.99250936329588, "grad_norm": 0.04962825030088425, "learning_rate": 0.0002, "loss": 0.5480868816375732, "mean_token_accuracy": 0.7763252705335617, "num_tokens": 13029636.0, "step": 799 }, { "entropy": 0.5300639569759369, "epoch": 2.9962546816479403, "grad_norm": 0.042783528566360474, "learning_rate": 0.0002, "loss": 0.5343177318572998, "mean_token_accuracy": 0.7828411161899567, "num_tokens": 13046055.0, "step": 800 }, { "entropy": 0.5252282693982124, "epoch": 3.0, "grad_norm": 0.049276161938905716, "learning_rate": 0.0002, "loss": 0.5320798754692078, "mean_token_accuracy": 0.7844677865505219, "num_tokens": 13062401.0, "step": 801 }, { "entropy": 0.545697808265686, "epoch": 3.0037453183520597, "grad_norm": 0.04111013561487198, "learning_rate": 0.0002, "loss": 0.5242352485656738, "mean_token_accuracy": 0.7881960570812225, "num_tokens": 13078838.0, "step": 802 }, { "entropy": 0.5105714052915573, "epoch": 3.00749063670412, "grad_norm": 0.050722841173410416, "learning_rate": 0.0002, "loss": 0.49721649289131165, "mean_token_accuracy": 0.7984847724437714, "num_tokens": 13095019.0, "step": 803 }, { "entropy": 0.518198661506176, "epoch": 3.0112359550561796, "grad_norm": 0.05298876017332077, "learning_rate": 0.0002, "loss": 0.5273076891899109, "mean_token_accuracy": 0.7871041893959045, "num_tokens": 13111294.0, "step": 804 }, { "entropy": 0.48655156791210175, "epoch": 3.0149812734082397, "grad_norm": 0.05474111810326576, "learning_rate": 0.0002, "loss": 0.5008523464202881, "mean_token_accuracy": 0.79793781042099, "num_tokens": 13127173.0, "step": 805 }, { "entropy": 0.4898255914449692, "epoch": 3.0187265917602994, "grad_norm": 0.05198859050869942, "learning_rate": 0.0002, "loss": 0.502049446105957, "mean_token_accuracy": 0.7997064739465714, "num_tokens": 13143319.0, "step": 806 }, { "entropy": 0.5108759626746178, "epoch": 3.0224719101123596, "grad_norm": 0.050299011170864105, "learning_rate": 0.0002, "loss": 0.5128780603408813, "mean_token_accuracy": 0.7923674434423447, "num_tokens": 13159544.0, "step": 807 }, { "entropy": 0.5222347229719162, "epoch": 3.0262172284644193, "grad_norm": 0.047297973185777664, "learning_rate": 0.0002, "loss": 0.5127148628234863, "mean_token_accuracy": 0.7936184853315353, "num_tokens": 13175745.0, "step": 808 }, { "entropy": 0.5319055169820786, "epoch": 3.0299625468164795, "grad_norm": 0.043087251484394073, "learning_rate": 0.0002, "loss": 0.5200571417808533, "mean_token_accuracy": 0.789368748664856, "num_tokens": 13192098.0, "step": 809 }, { "entropy": 0.5223256945610046, "epoch": 3.033707865168539, "grad_norm": 0.045950714498758316, "learning_rate": 0.0002, "loss": 0.5118798613548279, "mean_token_accuracy": 0.7952196598052979, "num_tokens": 13208503.0, "step": 810 }, { "entropy": 0.5253837034106255, "epoch": 3.0374531835205993, "grad_norm": 0.051792871206998825, "learning_rate": 0.0002, "loss": 0.5294127464294434, "mean_token_accuracy": 0.7874963134527206, "num_tokens": 13224945.0, "step": 811 }, { "entropy": 0.5031881630420685, "epoch": 3.041198501872659, "grad_norm": 0.05261905863881111, "learning_rate": 0.0002, "loss": 0.5030893087387085, "mean_token_accuracy": 0.796674519777298, "num_tokens": 13241369.0, "step": 812 }, { "entropy": 0.5100391805171967, "epoch": 3.044943820224719, "grad_norm": 0.05024467036128044, "learning_rate": 0.0002, "loss": 0.5141370296478271, "mean_token_accuracy": 0.7916264235973358, "num_tokens": 13257754.0, "step": 813 }, { "entropy": 0.5079550594091415, "epoch": 3.048689138576779, "grad_norm": 0.05758948624134064, "learning_rate": 0.0002, "loss": 0.512941300868988, "mean_token_accuracy": 0.7929425090551376, "num_tokens": 13273994.0, "step": 814 }, { "entropy": 0.513673685491085, "epoch": 3.052434456928839, "grad_norm": 0.04496518149971962, "learning_rate": 0.0002, "loss": 0.5110280513763428, "mean_token_accuracy": 0.7918824106454849, "num_tokens": 13290072.0, "step": 815 }, { "entropy": 0.5141152441501617, "epoch": 3.056179775280899, "grad_norm": 0.0500110387802124, "learning_rate": 0.0002, "loss": 0.5101944804191589, "mean_token_accuracy": 0.7915782928466797, "num_tokens": 13306210.0, "step": 816 }, { "entropy": 0.5212079957127571, "epoch": 3.059925093632959, "grad_norm": 0.048487596213817596, "learning_rate": 0.0002, "loss": 0.5181204080581665, "mean_token_accuracy": 0.791895255446434, "num_tokens": 13322810.0, "step": 817 }, { "entropy": 0.5105150416493416, "epoch": 3.0636704119850187, "grad_norm": 0.04949360713362694, "learning_rate": 0.0002, "loss": 0.5145678520202637, "mean_token_accuracy": 0.7915669232606888, "num_tokens": 13339105.0, "step": 818 }, { "entropy": 0.5000638663768768, "epoch": 3.067415730337079, "grad_norm": 0.05010031536221504, "learning_rate": 0.0002, "loss": 0.5040720701217651, "mean_token_accuracy": 0.7957489788532257, "num_tokens": 13355562.0, "step": 819 }, { "entropy": 0.4990030825138092, "epoch": 3.0711610486891385, "grad_norm": 0.04833959415555, "learning_rate": 0.0002, "loss": 0.5016943216323853, "mean_token_accuracy": 0.795589417219162, "num_tokens": 13371584.0, "step": 820 }, { "entropy": 0.49931125342845917, "epoch": 3.0749063670411987, "grad_norm": 0.0536712147295475, "learning_rate": 0.0002, "loss": 0.5040884017944336, "mean_token_accuracy": 0.7980391532182693, "num_tokens": 13387562.0, "step": 821 }, { "entropy": 0.522365540266037, "epoch": 3.0786516853932584, "grad_norm": 0.05137619003653526, "learning_rate": 0.0002, "loss": 0.5167077779769897, "mean_token_accuracy": 0.7917557954788208, "num_tokens": 13403730.0, "step": 822 }, { "entropy": 0.5068316459655762, "epoch": 3.0823970037453186, "grad_norm": 0.05163760110735893, "learning_rate": 0.0002, "loss": 0.5044561624526978, "mean_token_accuracy": 0.7993681281805038, "num_tokens": 13419918.0, "step": 823 }, { "entropy": 0.49808672070503235, "epoch": 3.0861423220973783, "grad_norm": 0.06049012020230293, "learning_rate": 0.0002, "loss": 0.5022746920585632, "mean_token_accuracy": 0.7967248558998108, "num_tokens": 13435959.0, "step": 824 }, { "entropy": 0.514209657907486, "epoch": 3.0898876404494384, "grad_norm": 0.04543498158454895, "learning_rate": 0.0002, "loss": 0.5144035220146179, "mean_token_accuracy": 0.789142832159996, "num_tokens": 13452229.0, "step": 825 }, { "entropy": 0.5195358544588089, "epoch": 3.093632958801498, "grad_norm": 0.057822633534669876, "learning_rate": 0.0002, "loss": 0.5155280828475952, "mean_token_accuracy": 0.7921741157770157, "num_tokens": 13468667.0, "step": 826 }, { "entropy": 0.507283978164196, "epoch": 3.097378277153558, "grad_norm": 0.05148691684007645, "learning_rate": 0.0002, "loss": 0.504961371421814, "mean_token_accuracy": 0.7980248332023621, "num_tokens": 13484964.0, "step": 827 }, { "entropy": 0.5191457867622375, "epoch": 3.101123595505618, "grad_norm": 0.045027829706668854, "learning_rate": 0.0002, "loss": 0.5200563669204712, "mean_token_accuracy": 0.7913502901792526, "num_tokens": 13501449.0, "step": 828 }, { "entropy": 0.5351596623659134, "epoch": 3.1048689138576777, "grad_norm": 0.05001077800989151, "learning_rate": 0.0002, "loss": 0.5278201699256897, "mean_token_accuracy": 0.7879630476236343, "num_tokens": 13517966.0, "step": 829 }, { "entropy": 0.5123812630772591, "epoch": 3.108614232209738, "grad_norm": 0.0483224131166935, "learning_rate": 0.0002, "loss": 0.5094588398933411, "mean_token_accuracy": 0.794407531619072, "num_tokens": 13534307.0, "step": 830 }, { "entropy": 0.5005150064826012, "epoch": 3.1123595505617976, "grad_norm": 0.06896387785673141, "learning_rate": 0.0002, "loss": 0.5081024169921875, "mean_token_accuracy": 0.7954099476337433, "num_tokens": 13550484.0, "step": 831 }, { "entropy": 0.5042895451188087, "epoch": 3.1161048689138577, "grad_norm": 0.058579690754413605, "learning_rate": 0.0002, "loss": 0.508193850517273, "mean_token_accuracy": 0.793841764330864, "num_tokens": 13566708.0, "step": 832 }, { "entropy": 0.49759114533662796, "epoch": 3.1198501872659175, "grad_norm": 0.07416244596242905, "learning_rate": 0.0002, "loss": 0.5042813420295715, "mean_token_accuracy": 0.7976614087820053, "num_tokens": 13582827.0, "step": 833 }, { "entropy": 0.5223132967948914, "epoch": 3.1235955056179776, "grad_norm": 0.06452949345111847, "learning_rate": 0.0002, "loss": 0.5273835062980652, "mean_token_accuracy": 0.7855038046836853, "num_tokens": 13599052.0, "step": 834 }, { "entropy": 0.5274243950843811, "epoch": 3.1273408239700373, "grad_norm": 0.05534323304891586, "learning_rate": 0.0002, "loss": 0.527578592300415, "mean_token_accuracy": 0.7877459824085236, "num_tokens": 13615363.0, "step": 835 }, { "entropy": 0.5254645645618439, "epoch": 3.1310861423220975, "grad_norm": 0.05036141723394394, "learning_rate": 0.0002, "loss": 0.5162075161933899, "mean_token_accuracy": 0.7924645841121674, "num_tokens": 13631656.0, "step": 836 }, { "entropy": 0.519648090004921, "epoch": 3.134831460674157, "grad_norm": 0.05153921991586685, "learning_rate": 0.0002, "loss": 0.5139608383178711, "mean_token_accuracy": 0.7937669306993484, "num_tokens": 13648061.0, "step": 837 }, { "entropy": 0.5104959607124329, "epoch": 3.1385767790262173, "grad_norm": 0.0628538653254509, "learning_rate": 0.0002, "loss": 0.5201999545097351, "mean_token_accuracy": 0.7901795506477356, "num_tokens": 13664398.0, "step": 838 }, { "entropy": 0.5013151913881302, "epoch": 3.142322097378277, "grad_norm": 0.05778926610946655, "learning_rate": 0.0002, "loss": 0.5063536763191223, "mean_token_accuracy": 0.7938642650842667, "num_tokens": 13680563.0, "step": 839 }, { "entropy": 0.5136759728193283, "epoch": 3.146067415730337, "grad_norm": 0.0481521412730217, "learning_rate": 0.0002, "loss": 0.5169215202331543, "mean_token_accuracy": 0.7936979234218597, "num_tokens": 13696943.0, "step": 840 }, { "entropy": 0.5035114511847496, "epoch": 3.149812734082397, "grad_norm": 0.052551548928022385, "learning_rate": 0.0002, "loss": 0.5094401240348816, "mean_token_accuracy": 0.7950234562158585, "num_tokens": 13713121.0, "step": 841 }, { "entropy": 0.5143017992377281, "epoch": 3.153558052434457, "grad_norm": 0.051041699945926666, "learning_rate": 0.0002, "loss": 0.5074518322944641, "mean_token_accuracy": 0.7948710173368454, "num_tokens": 13729464.0, "step": 842 }, { "entropy": 0.5306706875562668, "epoch": 3.157303370786517, "grad_norm": 0.0463450625538826, "learning_rate": 0.0002, "loss": 0.5219502449035645, "mean_token_accuracy": 0.7893195748329163, "num_tokens": 13746493.0, "step": 843 }, { "entropy": 0.5117569044232368, "epoch": 3.161048689138577, "grad_norm": 0.06164409592747688, "learning_rate": 0.0002, "loss": 0.5158479809761047, "mean_token_accuracy": 0.7911277264356613, "num_tokens": 13762823.0, "step": 844 }, { "entropy": 0.5204734578728676, "epoch": 3.1647940074906367, "grad_norm": 0.054356031119823456, "learning_rate": 0.0002, "loss": 0.5212512016296387, "mean_token_accuracy": 0.7890127152204514, "num_tokens": 13779000.0, "step": 845 }, { "entropy": 0.5199745744466782, "epoch": 3.168539325842697, "grad_norm": 0.0607718862593174, "learning_rate": 0.0002, "loss": 0.5160431265830994, "mean_token_accuracy": 0.7902602553367615, "num_tokens": 13794975.0, "step": 846 }, { "entropy": 0.4987589195370674, "epoch": 3.1722846441947565, "grad_norm": 0.04878820478916168, "learning_rate": 0.0002, "loss": 0.5000798106193542, "mean_token_accuracy": 0.7972550392150879, "num_tokens": 13811158.0, "step": 847 }, { "entropy": 0.5230295807123184, "epoch": 3.1760299625468167, "grad_norm": 0.06623463332653046, "learning_rate": 0.0002, "loss": 0.5327509641647339, "mean_token_accuracy": 0.7841638922691345, "num_tokens": 13827505.0, "step": 848 }, { "entropy": 0.5071290284395218, "epoch": 3.1797752808988764, "grad_norm": 0.05458921194076538, "learning_rate": 0.0002, "loss": 0.506171464920044, "mean_token_accuracy": 0.796265110373497, "num_tokens": 13843820.0, "step": 849 }, { "entropy": 0.5068354383111, "epoch": 3.1835205992509366, "grad_norm": 0.07471395283937454, "learning_rate": 0.0002, "loss": 0.5159043669700623, "mean_token_accuracy": 0.7950875610113144, "num_tokens": 13860049.0, "step": 850 }, { "entropy": 0.5165606439113617, "epoch": 3.1872659176029963, "grad_norm": 0.04287557676434517, "learning_rate": 0.0002, "loss": 0.5090954303741455, "mean_token_accuracy": 0.7943407446146011, "num_tokens": 13876269.0, "step": 851 }, { "entropy": 0.5112441331148148, "epoch": 3.191011235955056, "grad_norm": 0.055288348346948624, "learning_rate": 0.0002, "loss": 0.5097154974937439, "mean_token_accuracy": 0.7928614467382431, "num_tokens": 13892237.0, "step": 852 }, { "entropy": 0.5263922363519669, "epoch": 3.194756554307116, "grad_norm": 0.05795539170503616, "learning_rate": 0.0002, "loss": 0.5299734473228455, "mean_token_accuracy": 0.7866927832365036, "num_tokens": 13908834.0, "step": 853 }, { "entropy": 0.5262639820575714, "epoch": 3.198501872659176, "grad_norm": 0.04974358528852463, "learning_rate": 0.0002, "loss": 0.5219104290008545, "mean_token_accuracy": 0.789173498749733, "num_tokens": 13925285.0, "step": 854 }, { "entropy": 0.5375918298959732, "epoch": 3.202247191011236, "grad_norm": 0.05287981405854225, "learning_rate": 0.0002, "loss": 0.538820207118988, "mean_token_accuracy": 0.7783188968896866, "num_tokens": 13941531.0, "step": 855 }, { "entropy": 0.5262509882450104, "epoch": 3.2059925093632957, "grad_norm": 0.050868358463048935, "learning_rate": 0.0002, "loss": 0.5281128883361816, "mean_token_accuracy": 0.78641077876091, "num_tokens": 13957808.0, "step": 856 }, { "entropy": 0.5126873999834061, "epoch": 3.209737827715356, "grad_norm": 0.053514108061790466, "learning_rate": 0.0002, "loss": 0.5147566795349121, "mean_token_accuracy": 0.7941258400678635, "num_tokens": 13974052.0, "step": 857 }, { "entropy": 0.5275673717260361, "epoch": 3.2134831460674156, "grad_norm": 0.05271236225962639, "learning_rate": 0.0002, "loss": 0.5292813777923584, "mean_token_accuracy": 0.7857562899589539, "num_tokens": 13990343.0, "step": 858 }, { "entropy": 0.5242348462343216, "epoch": 3.2172284644194757, "grad_norm": 0.07179221510887146, "learning_rate": 0.0002, "loss": 0.5286028981208801, "mean_token_accuracy": 0.7894574105739594, "num_tokens": 14006625.0, "step": 859 }, { "entropy": 0.5096549838781357, "epoch": 3.2209737827715355, "grad_norm": 0.049610402435064316, "learning_rate": 0.0002, "loss": 0.5049244165420532, "mean_token_accuracy": 0.7980163246393204, "num_tokens": 14022899.0, "step": 860 }, { "entropy": 0.5015261322259903, "epoch": 3.2247191011235956, "grad_norm": 0.05947711691260338, "learning_rate": 0.0002, "loss": 0.4989194869995117, "mean_token_accuracy": 0.7979766577482224, "num_tokens": 14039443.0, "step": 861 }, { "entropy": 0.507699728012085, "epoch": 3.2284644194756553, "grad_norm": 0.04882875084877014, "learning_rate": 0.0002, "loss": 0.507795512676239, "mean_token_accuracy": 0.7962815016508102, "num_tokens": 14055656.0, "step": 862 }, { "entropy": 0.5021291598677635, "epoch": 3.2322097378277155, "grad_norm": 0.061408963054418564, "learning_rate": 0.0002, "loss": 0.5129059553146362, "mean_token_accuracy": 0.7919183075428009, "num_tokens": 14071999.0, "step": 863 }, { "entropy": 0.520720586180687, "epoch": 3.235955056179775, "grad_norm": 0.06845266371965408, "learning_rate": 0.0002, "loss": 0.5275195837020874, "mean_token_accuracy": 0.786097377538681, "num_tokens": 14088181.0, "step": 864 }, { "entropy": 0.5245565697550774, "epoch": 3.2397003745318353, "grad_norm": 0.05512849986553192, "learning_rate": 0.0002, "loss": 0.5164670944213867, "mean_token_accuracy": 0.7922011315822601, "num_tokens": 14104382.0, "step": 865 }, { "entropy": 0.523853063583374, "epoch": 3.243445692883895, "grad_norm": 0.05168979614973068, "learning_rate": 0.0002, "loss": 0.5198615789413452, "mean_token_accuracy": 0.7894517928361893, "num_tokens": 14120589.0, "step": 866 }, { "entropy": 0.5336069017648697, "epoch": 3.247191011235955, "grad_norm": 0.04658959433436394, "learning_rate": 0.0002, "loss": 0.5296441912651062, "mean_token_accuracy": 0.7839891761541367, "num_tokens": 14137115.0, "step": 867 }, { "entropy": 0.5032267719507217, "epoch": 3.250936329588015, "grad_norm": 0.06418543308973312, "learning_rate": 0.0002, "loss": 0.5041000843048096, "mean_token_accuracy": 0.7958316802978516, "num_tokens": 14153324.0, "step": 868 }, { "entropy": 0.5415874123573303, "epoch": 3.254681647940075, "grad_norm": 0.05481120944023132, "learning_rate": 0.0002, "loss": 0.5544674396514893, "mean_token_accuracy": 0.7752077877521515, "num_tokens": 14169770.0, "step": 869 }, { "entropy": 0.5231891572475433, "epoch": 3.258426966292135, "grad_norm": 0.055172860622406006, "learning_rate": 0.0002, "loss": 0.527195930480957, "mean_token_accuracy": 0.7866710424423218, "num_tokens": 14186252.0, "step": 870 }, { "entropy": 0.522189661860466, "epoch": 3.262172284644195, "grad_norm": 0.058594439178705215, "learning_rate": 0.0002, "loss": 0.5187022686004639, "mean_token_accuracy": 0.7929898500442505, "num_tokens": 14202621.0, "step": 871 }, { "entropy": 0.5282062888145447, "epoch": 3.2659176029962547, "grad_norm": 0.05134856328368187, "learning_rate": 0.0002, "loss": 0.5219106674194336, "mean_token_accuracy": 0.7889548540115356, "num_tokens": 14218830.0, "step": 872 }, { "entropy": 0.5150680243968964, "epoch": 3.2696629213483144, "grad_norm": 0.05508032441139221, "learning_rate": 0.0002, "loss": 0.5112281441688538, "mean_token_accuracy": 0.7931530773639679, "num_tokens": 14234888.0, "step": 873 }, { "entropy": 0.5219835788011551, "epoch": 3.2734082397003745, "grad_norm": 0.05464804917573929, "learning_rate": 0.0002, "loss": 0.524517297744751, "mean_token_accuracy": 0.7871863842010498, "num_tokens": 14251240.0, "step": 874 }, { "entropy": 0.5211943238973618, "epoch": 3.2771535580524347, "grad_norm": 0.06844772398471832, "learning_rate": 0.0002, "loss": 0.5394464731216431, "mean_token_accuracy": 0.7814126461744308, "num_tokens": 14267612.0, "step": 875 }, { "entropy": 0.5181123912334442, "epoch": 3.2808988764044944, "grad_norm": 0.04897969216108322, "learning_rate": 0.0002, "loss": 0.5221361517906189, "mean_token_accuracy": 0.7895658910274506, "num_tokens": 14284024.0, "step": 876 }, { "entropy": 0.522240474820137, "epoch": 3.284644194756554, "grad_norm": 0.046099789440631866, "learning_rate": 0.0002, "loss": 0.515265941619873, "mean_token_accuracy": 0.7908574789762497, "num_tokens": 14300400.0, "step": 877 }, { "entropy": 0.539507195353508, "epoch": 3.2883895131086143, "grad_norm": 0.048160191625356674, "learning_rate": 0.0002, "loss": 0.5282410979270935, "mean_token_accuracy": 0.7885929346084595, "num_tokens": 14316696.0, "step": 878 }, { "entropy": 0.5196528732776642, "epoch": 3.292134831460674, "grad_norm": 0.05286882072687149, "learning_rate": 0.0002, "loss": 0.5168602466583252, "mean_token_accuracy": 0.7895731180906296, "num_tokens": 14333018.0, "step": 879 }, { "entropy": 0.5102087259292603, "epoch": 3.295880149812734, "grad_norm": 0.059099920094013214, "learning_rate": 0.0002, "loss": 0.5207654237747192, "mean_token_accuracy": 0.7876903861761093, "num_tokens": 14349309.0, "step": 880 }, { "entropy": 0.5270523875951767, "epoch": 3.299625468164794, "grad_norm": 0.05176056921482086, "learning_rate": 0.0002, "loss": 0.5302364230155945, "mean_token_accuracy": 0.7864267975091934, "num_tokens": 14365771.0, "step": 881 }, { "entropy": 0.5273350328207016, "epoch": 3.303370786516854, "grad_norm": 0.053021032363176346, "learning_rate": 0.0002, "loss": 0.51994389295578, "mean_token_accuracy": 0.7906388491392136, "num_tokens": 14382276.0, "step": 882 }, { "entropy": 0.5050782039761543, "epoch": 3.3071161048689137, "grad_norm": 0.05596887692809105, "learning_rate": 0.0002, "loss": 0.5052669644355774, "mean_token_accuracy": 0.7954567670822144, "num_tokens": 14398533.0, "step": 883 }, { "entropy": 0.5178304612636566, "epoch": 3.310861423220974, "grad_norm": 0.051180679351091385, "learning_rate": 0.0002, "loss": 0.5151298642158508, "mean_token_accuracy": 0.7920469641685486, "num_tokens": 14414953.0, "step": 884 }, { "entropy": 0.5152227282524109, "epoch": 3.3146067415730336, "grad_norm": 0.060053881257772446, "learning_rate": 0.0002, "loss": 0.5225366950035095, "mean_token_accuracy": 0.7887113392353058, "num_tokens": 14431177.0, "step": 885 }, { "entropy": 0.5342336893081665, "epoch": 3.3183520599250937, "grad_norm": 0.04932161048054695, "learning_rate": 0.0002, "loss": 0.5272732973098755, "mean_token_accuracy": 0.7877390533685684, "num_tokens": 14447551.0, "step": 886 }, { "entropy": 0.5131062269210815, "epoch": 3.3220973782771535, "grad_norm": 0.056324418634176254, "learning_rate": 0.0002, "loss": 0.511243999004364, "mean_token_accuracy": 0.7933667898178101, "num_tokens": 14463837.0, "step": 887 }, { "entropy": 0.5144293755292892, "epoch": 3.3258426966292136, "grad_norm": 0.049344755709171295, "learning_rate": 0.0002, "loss": 0.5185728073120117, "mean_token_accuracy": 0.7894094735383987, "num_tokens": 14480010.0, "step": 888 }, { "entropy": 0.5006949752569199, "epoch": 3.3295880149812733, "grad_norm": 0.06578890234231949, "learning_rate": 0.0002, "loss": 0.5114624500274658, "mean_token_accuracy": 0.7939462065696716, "num_tokens": 14496280.0, "step": 889 }, { "entropy": 0.5155239552259445, "epoch": 3.3333333333333335, "grad_norm": 0.052595749497413635, "learning_rate": 0.0002, "loss": 0.5211793780326843, "mean_token_accuracy": 0.7900384217500687, "num_tokens": 14512580.0, "step": 890 }, { "entropy": 0.4996938407421112, "epoch": 3.337078651685393, "grad_norm": 0.05196739733219147, "learning_rate": 0.0002, "loss": 0.4989975094795227, "mean_token_accuracy": 0.7975862473249435, "num_tokens": 14528932.0, "step": 891 }, { "entropy": 0.5200860351324081, "epoch": 3.3408239700374533, "grad_norm": 0.05091974139213562, "learning_rate": 0.0002, "loss": 0.5156251192092896, "mean_token_accuracy": 0.7910965532064438, "num_tokens": 14545418.0, "step": 892 }, { "entropy": 0.5055394843220711, "epoch": 3.344569288389513, "grad_norm": 0.0533117949962616, "learning_rate": 0.0002, "loss": 0.5111801028251648, "mean_token_accuracy": 0.791337177157402, "num_tokens": 14561554.0, "step": 893 }, { "entropy": 0.5070675015449524, "epoch": 3.348314606741573, "grad_norm": 0.04844473674893379, "learning_rate": 0.0002, "loss": 0.5077552795410156, "mean_token_accuracy": 0.7912814170122147, "num_tokens": 14578052.0, "step": 894 }, { "entropy": 0.5202019810676575, "epoch": 3.352059925093633, "grad_norm": 0.04764174669981003, "learning_rate": 0.0002, "loss": 0.5175067186355591, "mean_token_accuracy": 0.7899416983127594, "num_tokens": 14594359.0, "step": 895 }, { "entropy": 0.5255243629217148, "epoch": 3.355805243445693, "grad_norm": 0.05360300838947296, "learning_rate": 0.0002, "loss": 0.5318154692649841, "mean_token_accuracy": 0.7854946553707123, "num_tokens": 14610661.0, "step": 896 }, { "entropy": 0.5251385867595673, "epoch": 3.359550561797753, "grad_norm": 0.05500936135649681, "learning_rate": 0.0002, "loss": 0.5363146066665649, "mean_token_accuracy": 0.7834254056215286, "num_tokens": 14626712.0, "step": 897 }, { "entropy": 0.5119743421673775, "epoch": 3.3632958801498125, "grad_norm": 0.04378456994891167, "learning_rate": 0.0002, "loss": 0.5079984068870544, "mean_token_accuracy": 0.7939057648181915, "num_tokens": 14642932.0, "step": 898 }, { "entropy": 0.5284467786550522, "epoch": 3.3670411985018727, "grad_norm": 0.046168722212314606, "learning_rate": 0.0002, "loss": 0.5247387290000916, "mean_token_accuracy": 0.787312924861908, "num_tokens": 14659213.0, "step": 899 }, { "entropy": 0.5423993915319443, "epoch": 3.370786516853933, "grad_norm": 0.04573873057961464, "learning_rate": 0.0002, "loss": 0.5364725589752197, "mean_token_accuracy": 0.7854876816272736, "num_tokens": 14675678.0, "step": 900 }, { "entropy": 0.5328433066606522, "epoch": 3.3745318352059925, "grad_norm": 0.044917598366737366, "learning_rate": 0.0002, "loss": 0.5308316946029663, "mean_token_accuracy": 0.785490483045578, "num_tokens": 14692287.0, "step": 901 }, { "entropy": 0.5370714962482452, "epoch": 3.3782771535580522, "grad_norm": 0.05281532183289528, "learning_rate": 0.0002, "loss": 0.5403937101364136, "mean_token_accuracy": 0.7802177965641022, "num_tokens": 14708736.0, "step": 902 }, { "entropy": 0.5240233987569809, "epoch": 3.3820224719101124, "grad_norm": 0.04636811465024948, "learning_rate": 0.0002, "loss": 0.5222055315971375, "mean_token_accuracy": 0.7886700630187988, "num_tokens": 14725122.0, "step": 903 }, { "entropy": 0.5218504667282104, "epoch": 3.385767790262172, "grad_norm": 0.05728694424033165, "learning_rate": 0.0002, "loss": 0.5256317853927612, "mean_token_accuracy": 0.7890423983335495, "num_tokens": 14741271.0, "step": 904 }, { "entropy": 0.5346123427152634, "epoch": 3.3895131086142323, "grad_norm": 0.046447765082120895, "learning_rate": 0.0002, "loss": 0.5343607664108276, "mean_token_accuracy": 0.7844806611537933, "num_tokens": 14757614.0, "step": 905 }, { "entropy": 0.5300848186016083, "epoch": 3.393258426966292, "grad_norm": 0.06571624428033829, "learning_rate": 0.0002, "loss": 0.5315452814102173, "mean_token_accuracy": 0.7868516147136688, "num_tokens": 14774083.0, "step": 906 }, { "entropy": 0.5144885182380676, "epoch": 3.397003745318352, "grad_norm": 0.05184376239776611, "learning_rate": 0.0002, "loss": 0.5137390494346619, "mean_token_accuracy": 0.7918999344110489, "num_tokens": 14790219.0, "step": 907 }, { "entropy": 0.5159177482128143, "epoch": 3.400749063670412, "grad_norm": 0.0637274757027626, "learning_rate": 0.0002, "loss": 0.5109057426452637, "mean_token_accuracy": 0.792988732457161, "num_tokens": 14806579.0, "step": 908 }, { "entropy": 0.5414174944162369, "epoch": 3.404494382022472, "grad_norm": 0.049117956310510635, "learning_rate": 0.0002, "loss": 0.5352107286453247, "mean_token_accuracy": 0.7849340736865997, "num_tokens": 14823142.0, "step": 909 }, { "entropy": 0.5176117867231369, "epoch": 3.4082397003745317, "grad_norm": 0.06466244161128998, "learning_rate": 0.0002, "loss": 0.522276759147644, "mean_token_accuracy": 0.789726972579956, "num_tokens": 14839440.0, "step": 910 }, { "entropy": 0.5329615920782089, "epoch": 3.411985018726592, "grad_norm": 0.05105730891227722, "learning_rate": 0.0002, "loss": 0.5381749868392944, "mean_token_accuracy": 0.7826534360647202, "num_tokens": 14855956.0, "step": 911 }, { "entropy": 0.5107108354568481, "epoch": 3.4157303370786516, "grad_norm": 0.05413498729467392, "learning_rate": 0.0002, "loss": 0.5151250958442688, "mean_token_accuracy": 0.7922552824020386, "num_tokens": 14872232.0, "step": 912 }, { "entropy": 0.5194525718688965, "epoch": 3.4194756554307117, "grad_norm": 0.049860697239637375, "learning_rate": 0.0002, "loss": 0.5245251655578613, "mean_token_accuracy": 0.7890132665634155, "num_tokens": 14888739.0, "step": 913 }, { "entropy": 0.5260248631238937, "epoch": 3.4232209737827715, "grad_norm": 0.0514976903796196, "learning_rate": 0.0002, "loss": 0.5202233195304871, "mean_token_accuracy": 0.7909575551748276, "num_tokens": 14905100.0, "step": 914 }, { "entropy": 0.5172304511070251, "epoch": 3.4269662921348316, "grad_norm": 0.046695906668901443, "learning_rate": 0.0002, "loss": 0.5149263143539429, "mean_token_accuracy": 0.7901606112718582, "num_tokens": 14921448.0, "step": 915 }, { "entropy": 0.5069386884570122, "epoch": 3.4307116104868913, "grad_norm": 0.05618730187416077, "learning_rate": 0.0002, "loss": 0.5093807578086853, "mean_token_accuracy": 0.7943364530801773, "num_tokens": 14937735.0, "step": 916 }, { "entropy": 0.5155317038297653, "epoch": 3.4344569288389515, "grad_norm": 0.04981003701686859, "learning_rate": 0.0002, "loss": 0.5243242383003235, "mean_token_accuracy": 0.7892241328954697, "num_tokens": 14954139.0, "step": 917 }, { "entropy": 0.5165708512067795, "epoch": 3.438202247191011, "grad_norm": 0.050371985882520676, "learning_rate": 0.0002, "loss": 0.5150896906852722, "mean_token_accuracy": 0.7927063405513763, "num_tokens": 14970507.0, "step": 918 }, { "entropy": 0.5134851261973381, "epoch": 3.4419475655430714, "grad_norm": 0.04879898577928543, "learning_rate": 0.0002, "loss": 0.5160987377166748, "mean_token_accuracy": 0.7906570881605148, "num_tokens": 14986812.0, "step": 919 }, { "entropy": 0.5135181546211243, "epoch": 3.445692883895131, "grad_norm": 0.05624324828386307, "learning_rate": 0.0002, "loss": 0.5219361186027527, "mean_token_accuracy": 0.7903093546628952, "num_tokens": 15003179.0, "step": 920 }, { "entropy": 0.5162501037120819, "epoch": 3.449438202247191, "grad_norm": 0.04822200909256935, "learning_rate": 0.0002, "loss": 0.5126674175262451, "mean_token_accuracy": 0.7924687564373016, "num_tokens": 15019428.0, "step": 921 }, { "entropy": 0.5315191224217415, "epoch": 3.453183520599251, "grad_norm": 0.04490262269973755, "learning_rate": 0.0002, "loss": 0.5248660445213318, "mean_token_accuracy": 0.7871098518371582, "num_tokens": 15035868.0, "step": 922 }, { "entropy": 0.5238284766674042, "epoch": 3.4569288389513106, "grad_norm": 0.051175910979509354, "learning_rate": 0.0002, "loss": 0.521578311920166, "mean_token_accuracy": 0.7883873879909515, "num_tokens": 15052303.0, "step": 923 }, { "entropy": 0.5168250873684883, "epoch": 3.460674157303371, "grad_norm": 0.046608321368694305, "learning_rate": 0.0002, "loss": 0.5207570791244507, "mean_token_accuracy": 0.7900703996419907, "num_tokens": 15068618.0, "step": 924 }, { "entropy": 0.5313585698604584, "epoch": 3.464419475655431, "grad_norm": 0.049307819455862045, "learning_rate": 0.0002, "loss": 0.5298991203308105, "mean_token_accuracy": 0.7864013016223907, "num_tokens": 15084957.0, "step": 925 }, { "entropy": 0.5185838490724564, "epoch": 3.4681647940074907, "grad_norm": 0.05639752745628357, "learning_rate": 0.0002, "loss": 0.5251802802085876, "mean_token_accuracy": 0.787624716758728, "num_tokens": 15101189.0, "step": 926 }, { "entropy": 0.515865795314312, "epoch": 3.4719101123595504, "grad_norm": 0.05554183945059776, "learning_rate": 0.0002, "loss": 0.518955647945404, "mean_token_accuracy": 0.7888496518135071, "num_tokens": 15117511.0, "step": 927 }, { "entropy": 0.5173558592796326, "epoch": 3.4756554307116105, "grad_norm": 0.051211338490247726, "learning_rate": 0.0002, "loss": 0.5185026526451111, "mean_token_accuracy": 0.7890340387821198, "num_tokens": 15133719.0, "step": 928 }, { "entropy": 0.520257018506527, "epoch": 3.4794007490636703, "grad_norm": 0.055278245359659195, "learning_rate": 0.0002, "loss": 0.5183354616165161, "mean_token_accuracy": 0.7902627289295197, "num_tokens": 15149922.0, "step": 929 }, { "entropy": 0.515156589448452, "epoch": 3.4831460674157304, "grad_norm": 0.05468440055847168, "learning_rate": 0.0002, "loss": 0.5097793340682983, "mean_token_accuracy": 0.7964832186698914, "num_tokens": 15166020.0, "step": 930 }, { "entropy": 0.521842934191227, "epoch": 3.48689138576779, "grad_norm": 0.04573323577642441, "learning_rate": 0.0002, "loss": 0.5174736380577087, "mean_token_accuracy": 0.7907158583402634, "num_tokens": 15182296.0, "step": 931 }, { "entropy": 0.5367195308208466, "epoch": 3.4906367041198503, "grad_norm": 0.05060438811779022, "learning_rate": 0.0002, "loss": 0.5360324382781982, "mean_token_accuracy": 0.7832886576652527, "num_tokens": 15198618.0, "step": 932 }, { "entropy": 0.5351738333702087, "epoch": 3.49438202247191, "grad_norm": 0.04796265438199043, "learning_rate": 0.0002, "loss": 0.5342084765434265, "mean_token_accuracy": 0.7837437838315964, "num_tokens": 15215125.0, "step": 933 }, { "entropy": 0.5210021957755089, "epoch": 3.49812734082397, "grad_norm": 0.05278978869318962, "learning_rate": 0.0002, "loss": 0.5260420441627502, "mean_token_accuracy": 0.7890212833881378, "num_tokens": 15231335.0, "step": 934 }, { "entropy": 0.5361146479845047, "epoch": 3.50187265917603, "grad_norm": 0.05599920451641083, "learning_rate": 0.0002, "loss": 0.5407608151435852, "mean_token_accuracy": 0.7809196263551712, "num_tokens": 15247587.0, "step": 935 }, { "entropy": 0.5127650052309036, "epoch": 3.50561797752809, "grad_norm": 0.053348250687122345, "learning_rate": 0.0002, "loss": 0.5172818303108215, "mean_token_accuracy": 0.7908589243888855, "num_tokens": 15263983.0, "step": 936 }, { "entropy": 0.5113075897097588, "epoch": 3.5093632958801497, "grad_norm": 0.047283098101615906, "learning_rate": 0.0002, "loss": 0.5094785690307617, "mean_token_accuracy": 0.7913675010204315, "num_tokens": 15280172.0, "step": 937 }, { "entropy": 0.5144875794649124, "epoch": 3.51310861423221, "grad_norm": 0.05150860175490379, "learning_rate": 0.0002, "loss": 0.5117542743682861, "mean_token_accuracy": 0.7926830351352692, "num_tokens": 15296278.0, "step": 938 }, { "entropy": 0.5282381922006607, "epoch": 3.5168539325842696, "grad_norm": 0.05235690623521805, "learning_rate": 0.0002, "loss": 0.5275253653526306, "mean_token_accuracy": 0.787050798535347, "num_tokens": 15312737.0, "step": 939 }, { "entropy": 0.5191426128149033, "epoch": 3.5205992509363297, "grad_norm": 0.05214005708694458, "learning_rate": 0.0002, "loss": 0.5218259692192078, "mean_token_accuracy": 0.7854390293359756, "num_tokens": 15329171.0, "step": 940 }, { "entropy": 0.488400898873806, "epoch": 3.5243445692883895, "grad_norm": 0.05028095468878746, "learning_rate": 0.0002, "loss": 0.49238866567611694, "mean_token_accuracy": 0.8010139167308807, "num_tokens": 15345040.0, "step": 941 }, { "entropy": 0.530989944934845, "epoch": 3.5280898876404496, "grad_norm": 0.05137421563267708, "learning_rate": 0.0002, "loss": 0.5283138155937195, "mean_token_accuracy": 0.7872757613658905, "num_tokens": 15361506.0, "step": 942 }, { "entropy": 0.5166791379451752, "epoch": 3.5318352059925093, "grad_norm": 0.05064837634563446, "learning_rate": 0.0002, "loss": 0.5200411677360535, "mean_token_accuracy": 0.7893417179584503, "num_tokens": 15377725.0, "step": 943 }, { "entropy": 0.5225488543510437, "epoch": 3.535580524344569, "grad_norm": 0.05224663019180298, "learning_rate": 0.0002, "loss": 0.5252619981765747, "mean_token_accuracy": 0.7887216210365295, "num_tokens": 15394073.0, "step": 944 }, { "entropy": 0.5133933499455452, "epoch": 3.539325842696629, "grad_norm": 0.054900407791137695, "learning_rate": 0.0002, "loss": 0.5187044143676758, "mean_token_accuracy": 0.7941587567329407, "num_tokens": 15410326.0, "step": 945 }, { "entropy": 0.5217478722333908, "epoch": 3.5430711610486894, "grad_norm": 0.05068376660346985, "learning_rate": 0.0002, "loss": 0.5203924179077148, "mean_token_accuracy": 0.7903146594762802, "num_tokens": 15426695.0, "step": 946 } ], "logging_steps": 1, "max_steps": 1335, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.437096036035199e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }