{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9660792116732992, "eval_steps": 1000, "global_step": 41500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.2857219433784486, "epoch": 0.002368770134546144, "grad_norm": 9.544422149658203, "learning_rate": 1.1605873993368073e-06, "loss": 2.8961, "mean_token_accuracy": 0.6633071088790894, "num_tokens": 1213722.0, "step": 50 }, { "entropy": 1.5439852488040924, "epoch": 0.004737540269092288, "grad_norm": 1.5218281745910645, "learning_rate": 2.3448602558029374e-06, "loss": 2.0745, "mean_token_accuracy": 0.6788832449913025, "num_tokens": 2440523.0, "step": 100 }, { "entropy": 1.9061457157135009, "epoch": 0.007106310403638431, "grad_norm": 1.1667029857635498, "learning_rate": 3.529133112269067e-06, "loss": 1.5243, "mean_token_accuracy": 0.6922976732254028, "num_tokens": 3673044.0, "step": 150 }, { "entropy": 1.8150238823890685, "epoch": 0.009475080538184575, "grad_norm": 1.093643307685852, "learning_rate": 4.713405968735197e-06, "loss": 1.4194, "mean_token_accuracy": 0.7095924293994904, "num_tokens": 4883924.0, "step": 200 }, { "entropy": 1.7932313251495362, "epoch": 0.011843850672730718, "grad_norm": 0.9533292055130005, "learning_rate": 5.897678825201327e-06, "loss": 1.4224, "mean_token_accuracy": 0.7063059556484222, "num_tokens": 6115751.0, "step": 250 }, { "entropy": 1.7181093657016755, "epoch": 0.014212620807276862, "grad_norm": 1.2956347465515137, "learning_rate": 7.0819516816674565e-06, "loss": 1.3637, "mean_token_accuracy": 0.7182639849185943, "num_tokens": 7344600.0, "step": 300 }, { "entropy": 1.7070274019241334, "epoch": 0.016581390941823006, "grad_norm": 1.0166376829147339, "learning_rate": 8.266224538133587e-06, "loss": 1.3641, "mean_token_accuracy": 0.7153780800104141, "num_tokens": 8564072.0, "step": 350 }, { "entropy": 1.7232308828830718, "epoch": 0.01895016107636915, "grad_norm": 0.9041787981987, "learning_rate": 9.450497394599716e-06, "loss": 1.4065, "mean_token_accuracy": 0.7088368773460388, "num_tokens": 9774641.0, "step": 400 }, { "entropy": 1.691476699113846, "epoch": 0.02131893121091529, "grad_norm": 1.104785680770874, "learning_rate": 1.0634770251065847e-05, "loss": 1.3464, "mean_token_accuracy": 0.7208190321922302, "num_tokens": 11023876.0, "step": 450 }, { "entropy": 1.7134545636177063, "epoch": 0.023687701345461436, "grad_norm": 1.029800295829773, "learning_rate": 1.1819043107531975e-05, "loss": 1.3758, "mean_token_accuracy": 0.7151617485284806, "num_tokens": 12267386.0, "step": 500 }, { "entropy": 1.6694060420989991, "epoch": 0.02605647148000758, "grad_norm": 0.9847853183746338, "learning_rate": 1.3003315963998106e-05, "loss": 1.3374, "mean_token_accuracy": 0.7215989363193512, "num_tokens": 13530115.0, "step": 550 }, { "entropy": 1.6969332695007324, "epoch": 0.028425241614553724, "grad_norm": 0.9018113017082214, "learning_rate": 1.4187588820464234e-05, "loss": 1.3446, "mean_token_accuracy": 0.720185512304306, "num_tokens": 14757305.0, "step": 600 }, { "entropy": 1.7514292740821837, "epoch": 0.03079401174909987, "grad_norm": 0.803989827632904, "learning_rate": 1.5371861676930365e-05, "loss": 1.4127, "mean_token_accuracy": 0.7072590082883835, "num_tokens": 15958099.0, "step": 650 }, { "entropy": 1.7325732719898224, "epoch": 0.03316278188364601, "grad_norm": 0.865963876247406, "learning_rate": 1.6556134533396493e-05, "loss": 1.3412, "mean_token_accuracy": 0.71946579515934, "num_tokens": 17188325.0, "step": 700 }, { "entropy": 1.707287894487381, "epoch": 0.03553155201819216, "grad_norm": 0.8039044141769409, "learning_rate": 1.7740407389862628e-05, "loss": 1.3502, "mean_token_accuracy": 0.7187299233675003, "num_tokens": 18423224.0, "step": 750 }, { "entropy": 1.708844404220581, "epoch": 0.0379003221527383, "grad_norm": 0.8659459948539734, "learning_rate": 1.8924680246328755e-05, "loss": 1.3596, "mean_token_accuracy": 0.7151121199131012, "num_tokens": 19656464.0, "step": 800 }, { "entropy": 1.7051723492145539, "epoch": 0.04026909228728444, "grad_norm": 0.8470927476882935, "learning_rate": 2.0108953102794883e-05, "loss": 1.309, "mean_token_accuracy": 0.723027645945549, "num_tokens": 20908664.0, "step": 850 }, { "entropy": 1.6959775292873382, "epoch": 0.04263786242183058, "grad_norm": 1.0128496885299683, "learning_rate": 2.1293225959261014e-05, "loss": 1.3337, "mean_token_accuracy": 0.720455265045166, "num_tokens": 22165141.0, "step": 900 }, { "entropy": 1.7231877827644348, "epoch": 0.04500663255637673, "grad_norm": 0.9077188968658447, "learning_rate": 2.2477498815727142e-05, "loss": 1.3309, "mean_token_accuracy": 0.7221323251724243, "num_tokens": 23408996.0, "step": 950 }, { "entropy": 1.7131755888462066, "epoch": 0.04737540269092287, "grad_norm": 0.9538567066192627, "learning_rate": 2.3661771672193277e-05, "loss": 1.3434, "mean_token_accuracy": 0.718843805193901, "num_tokens": 24626341.0, "step": 1000 }, { "epoch": 0.04737540269092287, "eval_entropy": 1.1688038776145298, "eval_loss": NaN, "eval_mean_token_accuracy": 0.7489374687524091, "eval_num_tokens": 24626341.0, "eval_runtime": 739.6274, "eval_samples_per_second": 33.551, "eval_steps_per_second": 4.194, "step": 1000 }, { "entropy": 1.7510324358940124, "epoch": 0.049744172825469016, "grad_norm": 0.8725846409797668, "learning_rate": 2.4846044528659405e-05, "loss": 1.3756, "mean_token_accuracy": 0.714400834441185, "num_tokens": 25873918.0, "step": 1050 }, { "entropy": 1.7741026413440704, "epoch": 0.05211294296001516, "grad_norm": 0.9376819729804993, "learning_rate": 2.6030317385125536e-05, "loss": 1.3554, "mean_token_accuracy": 0.7169699442386627, "num_tokens": 27067052.0, "step": 1100 }, { "entropy": 1.7553015303611756, "epoch": 0.054481713094561304, "grad_norm": 0.7493100166320801, "learning_rate": 2.7214590241591663e-05, "loss": 1.3385, "mean_token_accuracy": 0.721844300031662, "num_tokens": 28308409.0, "step": 1150 }, { "entropy": 1.7428362345695496, "epoch": 0.05685048322910745, "grad_norm": 0.7889260053634644, "learning_rate": 2.8398863098057795e-05, "loss": 1.3293, "mean_token_accuracy": 0.7205719447135925, "num_tokens": 29542172.0, "step": 1200 }, { "entropy": 1.727198257446289, "epoch": 0.05921925336365359, "grad_norm": 0.8638942837715149, "learning_rate": 2.9583135954523922e-05, "loss": 1.3176, "mean_token_accuracy": 0.7266954278945923, "num_tokens": 30777592.0, "step": 1250 }, { "entropy": 1.7321817111968993, "epoch": 0.06158802349819974, "grad_norm": 0.8561661839485168, "learning_rate": 3.076740881099006e-05, "loss": 1.3116, "mean_token_accuracy": 0.7248632162809372, "num_tokens": 32008108.0, "step": 1300 }, { "entropy": 1.7352443253993988, "epoch": 0.06395679363274588, "grad_norm": 0.8668932914733887, "learning_rate": 3.1951681667456185e-05, "loss": 1.3422, "mean_token_accuracy": 0.7209743493795395, "num_tokens": 33257843.0, "step": 1350 }, { "entropy": 1.7172068011760713, "epoch": 0.06632556376729203, "grad_norm": 1.030638337135315, "learning_rate": 3.313595452392231e-05, "loss": 1.3161, "mean_token_accuracy": 0.7226567584276199, "num_tokens": 34486893.0, "step": 1400 }, { "entropy": 1.7621229577064514, "epoch": 0.06869433390183817, "grad_norm": 1.0166395902633667, "learning_rate": 3.432022738038844e-05, "loss": 1.3644, "mean_token_accuracy": 0.7157200646400451, "num_tokens": 35701555.0, "step": 1450 }, { "entropy": 1.739331885576248, "epoch": 0.07106310403638431, "grad_norm": 0.6904604434967041, "learning_rate": 3.550450023685457e-05, "loss": 1.3599, "mean_token_accuracy": 0.716941955089569, "num_tokens": 36936472.0, "step": 1500 }, { "entropy": 1.7242075634002685, "epoch": 0.07343187417093046, "grad_norm": 0.8110722303390503, "learning_rate": 3.66887730933207e-05, "loss": 1.3374, "mean_token_accuracy": 0.7217743951082229, "num_tokens": 38178198.0, "step": 1550 }, { "entropy": 1.7162011814117433, "epoch": 0.0758006443054766, "grad_norm": 0.8524773716926575, "learning_rate": 3.787304594978684e-05, "loss": 1.3228, "mean_token_accuracy": 0.7227801591157913, "num_tokens": 39412346.0, "step": 1600 }, { "entropy": 1.7332351410388946, "epoch": 0.07816941444002275, "grad_norm": 0.7344287037849426, "learning_rate": 3.9057318806252965e-05, "loss": 1.3343, "mean_token_accuracy": 0.721068668961525, "num_tokens": 40640112.0, "step": 1650 }, { "entropy": 1.7606168591976166, "epoch": 0.08053818457456888, "grad_norm": 0.8270729184150696, "learning_rate": 4.024159166271909e-05, "loss": 1.3587, "mean_token_accuracy": 0.7182448714971542, "num_tokens": 41866196.0, "step": 1700 }, { "entropy": 1.7047703182697296, "epoch": 0.08290695470911502, "grad_norm": 0.8977941870689392, "learning_rate": 4.142586451918522e-05, "loss": 1.3278, "mean_token_accuracy": 0.7222372907400131, "num_tokens": 43124550.0, "step": 1750 }, { "entropy": 1.7705276823043823, "epoch": 0.08527572484366117, "grad_norm": 0.8741844892501831, "learning_rate": 4.2610137375651355e-05, "loss": 1.3573, "mean_token_accuracy": 0.7164474505186081, "num_tokens": 44315237.0, "step": 1800 }, { "entropy": 1.7388102066516877, "epoch": 0.08764449497820731, "grad_norm": 0.8004917502403259, "learning_rate": 4.379441023211748e-05, "loss": 1.3342, "mean_token_accuracy": 0.7217385923862457, "num_tokens": 45536080.0, "step": 1850 }, { "entropy": 1.7502016520500183, "epoch": 0.09001326511275345, "grad_norm": 0.8822757005691528, "learning_rate": 4.497868308858361e-05, "loss": 1.336, "mean_token_accuracy": 0.7232905811071396, "num_tokens": 46770718.0, "step": 1900 }, { "entropy": 1.767259726524353, "epoch": 0.0923820352472996, "grad_norm": 0.7748751640319824, "learning_rate": 4.616295594504974e-05, "loss": 1.3553, "mean_token_accuracy": 0.7182161051034928, "num_tokens": 47996075.0, "step": 1950 }, { "entropy": 1.6973827588558197, "epoch": 0.09475080538184574, "grad_norm": 0.8286657333374023, "learning_rate": 4.7347228801515866e-05, "loss": 1.3257, "mean_token_accuracy": 0.7228487819433213, "num_tokens": 49257764.0, "step": 2000 }, { "epoch": 0.09475080538184574, "eval_entropy": 1.204862184174056, "eval_loss": NaN, "eval_mean_token_accuracy": 0.7511153636268459, "eval_num_tokens": 49257764.0, "eval_runtime": 739.5936, "eval_samples_per_second": 33.552, "eval_steps_per_second": 4.194, "step": 2000 }, { "entropy": 1.7507979416847228, "epoch": 0.09711957551639189, "grad_norm": 0.8749801516532898, "learning_rate": 4.8531501657982e-05, "loss": 1.3637, "mean_token_accuracy": 0.7162546402215958, "num_tokens": 50488090.0, "step": 2050 }, { "entropy": 1.7249326765537263, "epoch": 0.09948834565093803, "grad_norm": 0.9060792922973633, "learning_rate": 4.9715774514448135e-05, "loss": 1.325, "mean_token_accuracy": 0.7223574507236481, "num_tokens": 51760869.0, "step": 2100 }, { "entropy": 1.7315819489955901, "epoch": 0.10185711578548418, "grad_norm": 0.7879400253295898, "learning_rate": 5.090004737091426e-05, "loss": 1.3279, "mean_token_accuracy": 0.7231736582517624, "num_tokens": 52971004.0, "step": 2150 }, { "entropy": 1.721841138601303, "epoch": 0.10422588592003032, "grad_norm": 0.6798914074897766, "learning_rate": 5.208432022738039e-05, "loss": 1.3486, "mean_token_accuracy": 0.7193009465932846, "num_tokens": 54200559.0, "step": 2200 }, { "entropy": 1.732561513185501, "epoch": 0.10659465605457646, "grad_norm": 0.9104458689689636, "learning_rate": 5.326859308384652e-05, "loss": 1.313, "mean_token_accuracy": 0.7249197036027908, "num_tokens": 55431492.0, "step": 2250 }, { "entropy": 1.745149908065796, "epoch": 0.10896342618912261, "grad_norm": 1.0170321464538574, "learning_rate": 5.4452865940312646e-05, "loss": 1.3327, "mean_token_accuracy": 0.7201522195339203, "num_tokens": 56685982.0, "step": 2300 }, { "entropy": 1.7817789494991303, "epoch": 0.11133219632366875, "grad_norm": 0.8275519013404846, "learning_rate": 5.5637138796778774e-05, "loss": 1.3902, "mean_token_accuracy": 0.7141750353574753, "num_tokens": 57930317.0, "step": 2350 }, { "entropy": 1.7808838784694672, "epoch": 0.1137009664582149, "grad_norm": 0.8482922315597534, "learning_rate": 5.6821411653244915e-05, "loss": 1.3555, "mean_token_accuracy": 0.7165615385770798, "num_tokens": 59178908.0, "step": 2400 }, { "entropy": 1.7483756732940674, "epoch": 0.11606973659276104, "grad_norm": 1.1124041080474854, "learning_rate": 5.800568450971104e-05, "loss": 1.3078, "mean_token_accuracy": 0.7251748180389405, "num_tokens": 60407544.0, "step": 2450 }, { "entropy": 1.7590064382553101, "epoch": 0.11843850672730719, "grad_norm": 0.8734819889068604, "learning_rate": 5.918995736617717e-05, "loss": 1.3515, "mean_token_accuracy": 0.7170740348100663, "num_tokens": 61646580.0, "step": 2500 }, { "entropy": 1.7470902466773988, "epoch": 0.12080727686185333, "grad_norm": 0.9874738454818726, "learning_rate": 6.03742302226433e-05, "loss": 1.3449, "mean_token_accuracy": 0.7202254205942153, "num_tokens": 62847914.0, "step": 2550 }, { "entropy": 1.7077771651744842, "epoch": 0.12317604699639947, "grad_norm": 0.7741467952728271, "learning_rate": 6.155850307910943e-05, "loss": 1.3253, "mean_token_accuracy": 0.721799430847168, "num_tokens": 64110298.0, "step": 2600 }, { "entropy": 1.7353104615211488, "epoch": 0.12554481713094562, "grad_norm": 0.878971517086029, "learning_rate": 6.274277593557556e-05, "loss": 1.3365, "mean_token_accuracy": 0.7212816894054412, "num_tokens": 65347419.0, "step": 2650 }, { "entropy": 1.734018679857254, "epoch": 0.12791358726549176, "grad_norm": 0.9191023111343384, "learning_rate": 6.392704879204168e-05, "loss": 1.3317, "mean_token_accuracy": 0.7217869812250137, "num_tokens": 66593613.0, "step": 2700 }, { "entropy": 1.7515969347953797, "epoch": 0.1302823574000379, "grad_norm": 0.9526401162147522, "learning_rate": 6.511132164850782e-05, "loss": 1.3381, "mean_token_accuracy": 0.7212713253498078, "num_tokens": 67818628.0, "step": 2750 }, { "entropy": 1.732736051082611, "epoch": 0.13265112753458405, "grad_norm": 0.9250634908676147, "learning_rate": 6.629559450497395e-05, "loss": 1.3267, "mean_token_accuracy": 0.7237467241287231, "num_tokens": 69063328.0, "step": 2800 }, { "entropy": 1.7254245734214784, "epoch": 0.1350198976691302, "grad_norm": 0.8667979836463928, "learning_rate": 6.747986736144007e-05, "loss": 1.3669, "mean_token_accuracy": 0.7158471101522446, "num_tokens": 70325294.0, "step": 2850 }, { "entropy": 1.7342670309543609, "epoch": 0.13738866780367634, "grad_norm": 0.9424638748168945, "learning_rate": 6.866414021790622e-05, "loss": 1.352, "mean_token_accuracy": 0.7186112779378891, "num_tokens": 71578584.0, "step": 2900 }, { "entropy": 1.717711169719696, "epoch": 0.13975743793822248, "grad_norm": 0.8827985525131226, "learning_rate": 6.984841307437234e-05, "loss": 1.3708, "mean_token_accuracy": 0.7152537268400192, "num_tokens": 72849472.0, "step": 2950 }, { "entropy": 1.7510717618465423, "epoch": 0.14212620807276863, "grad_norm": 0.8640701174736023, "learning_rate": 7.103268593083848e-05, "loss": 1.3576, "mean_token_accuracy": 0.7165306961536407, "num_tokens": 74089751.0, "step": 3000 }, { "epoch": 0.14212620807276863, "eval_entropy": 1.1560545717993527, "eval_loss": NaN, "eval_mean_token_accuracy": 0.7539014699553613, "eval_num_tokens": 74089751.0, "eval_runtime": 739.5182, "eval_samples_per_second": 33.556, "eval_steps_per_second": 4.195, "step": 3000 }, { "entropy": 1.7481833016872406, "epoch": 0.14449497820731477, "grad_norm": 0.8835089206695557, "learning_rate": 7.22169587873046e-05, "loss": 1.3735, "mean_token_accuracy": 0.7145194208621979, "num_tokens": 75300868.0, "step": 3050 }, { "entropy": 1.7272928488254546, "epoch": 0.14686374834186092, "grad_norm": 0.858995258808136, "learning_rate": 7.340123164377073e-05, "loss": 1.3456, "mean_token_accuracy": 0.7180565488338471, "num_tokens": 76542456.0, "step": 3100 }, { "entropy": 1.7239915192127229, "epoch": 0.14923251847640706, "grad_norm": 1.038167953491211, "learning_rate": 7.458550450023685e-05, "loss": 1.3364, "mean_token_accuracy": 0.7197621566057205, "num_tokens": 77797173.0, "step": 3150 }, { "entropy": 1.7232668161392213, "epoch": 0.1516012886109532, "grad_norm": 0.7863021492958069, "learning_rate": 7.576977735670299e-05, "loss": 1.3252, "mean_token_accuracy": 0.7219555181264877, "num_tokens": 79036677.0, "step": 3200 }, { "entropy": 1.7575169241428374, "epoch": 0.15397005874549935, "grad_norm": 1.122582197189331, "learning_rate": 7.695405021316912e-05, "loss": 1.3513, "mean_token_accuracy": 0.7171193498373032, "num_tokens": 80257412.0, "step": 3250 }, { "entropy": 1.7478620946407317, "epoch": 0.1563388288800455, "grad_norm": 0.8442687392234802, "learning_rate": 7.813832306963524e-05, "loss": 1.3689, "mean_token_accuracy": 0.7146556586027145, "num_tokens": 81452995.0, "step": 3300 }, { "entropy": 1.7156465804576875, "epoch": 0.1587075990145916, "grad_norm": 0.8353444337844849, "learning_rate": 7.932259592610138e-05, "loss": 1.341, "mean_token_accuracy": 0.7198166775703431, "num_tokens": 82720102.0, "step": 3350 }, { "entropy": 1.6997444534301758, "epoch": 0.16107636914913775, "grad_norm": 1.0969985723495483, "learning_rate": 8.050686878256751e-05, "loss": 1.3462, "mean_token_accuracy": 0.7181236177682877, "num_tokens": 83970755.0, "step": 3400 }, { "entropy": 1.7316451609134673, "epoch": 0.1634451392836839, "grad_norm": 1.048732876777649, "learning_rate": 8.169114163903365e-05, "loss": 1.3286, "mean_token_accuracy": 0.7228594154119492, "num_tokens": 85196313.0, "step": 3450 }, { "entropy": 1.6936505138874054, "epoch": 0.16581390941823004, "grad_norm": 0.9473629593849182, "learning_rate": 8.287541449549977e-05, "loss": 1.3178, "mean_token_accuracy": 0.7219874155521393, "num_tokens": 86444399.0, "step": 3500 }, { "entropy": 1.7614711892604829, "epoch": 0.1681826795527762, "grad_norm": 1.0644205808639526, "learning_rate": 8.40596873519659e-05, "loss": 1.3485, "mean_token_accuracy": 0.7177842026948928, "num_tokens": 87668028.0, "step": 3550 }, { "entropy": 1.760385752916336, "epoch": 0.17055144968732233, "grad_norm": 0.8554447293281555, "learning_rate": 8.524396020843202e-05, "loss": 1.3655, "mean_token_accuracy": 0.7148052769899368, "num_tokens": 88876622.0, "step": 3600 }, { "entropy": 1.7391455006599426, "epoch": 0.17292021982186848, "grad_norm": 0.8997156023979187, "learning_rate": 8.642823306489816e-05, "loss": 1.3729, "mean_token_accuracy": 0.7139494162797928, "num_tokens": 90122997.0, "step": 3650 }, { "entropy": 1.6978785967826844, "epoch": 0.17528898995641462, "grad_norm": 0.9306835532188416, "learning_rate": 8.761250592136429e-05, "loss": 1.3101, "mean_token_accuracy": 0.7250325381755829, "num_tokens": 91386702.0, "step": 3700 }, { "entropy": 1.7522969400882722, "epoch": 0.17765776009096076, "grad_norm": 0.8477308750152588, "learning_rate": 8.879677877783041e-05, "loss": 1.3688, "mean_token_accuracy": 0.715588583946228, "num_tokens": 92610301.0, "step": 3750 }, { "entropy": 1.7447042429447175, "epoch": 0.1800265302255069, "grad_norm": 1.0386239290237427, "learning_rate": 8.998105163429655e-05, "loss": 1.3431, "mean_token_accuracy": 0.718235713839531, "num_tokens": 93824209.0, "step": 3800 }, { "entropy": 1.7764518535137177, "epoch": 0.18239530036005305, "grad_norm": 1.1934313774108887, "learning_rate": 9.116532449076267e-05, "loss": 1.377, "mean_token_accuracy": 0.7121944260597229, "num_tokens": 95073205.0, "step": 3850 }, { "entropy": 1.7176523733139037, "epoch": 0.1847640704945992, "grad_norm": 0.9109567403793335, "learning_rate": 9.234959734722882e-05, "loss": 1.3269, "mean_token_accuracy": 0.7219234961271286, "num_tokens": 96282593.0, "step": 3900 }, { "entropy": 1.7826895797252655, "epoch": 0.18713284062914534, "grad_norm": 0.8581134080886841, "learning_rate": 9.353387020369494e-05, "loss": 1.4049, "mean_token_accuracy": 0.7105602127313614, "num_tokens": 97491980.0, "step": 3950 }, { "entropy": 1.7406088852882384, "epoch": 0.18950161076369149, "grad_norm": 0.8592116236686707, "learning_rate": 9.471814306016107e-05, "loss": 1.3514, "mean_token_accuracy": 0.7164700603485108, "num_tokens": 98726298.0, "step": 4000 }, { "epoch": 0.18950161076369149, "eval_entropy": 1.1647965725124612, "eval_loss": NaN, "eval_mean_token_accuracy": 0.7535035212354303, "eval_num_tokens": 98726298.0, "eval_runtime": 741.2483, "eval_samples_per_second": 33.477, "eval_steps_per_second": 4.185, "step": 4000 }, { "entropy": 1.7602094197273255, "epoch": 0.19187038089823763, "grad_norm": 1.0990040302276611, "learning_rate": 9.590241591662719e-05, "loss": 1.3787, "mean_token_accuracy": 0.7126823592185975, "num_tokens": 99961531.0, "step": 4050 }, { "entropy": 1.736720016002655, "epoch": 0.19423915103278377, "grad_norm": 0.886349081993103, "learning_rate": 9.708668877309333e-05, "loss": 1.3486, "mean_token_accuracy": 0.7173503488302231, "num_tokens": 101187732.0, "step": 4100 }, { "entropy": 1.772811095714569, "epoch": 0.19660792116732992, "grad_norm": 1.147083044052124, "learning_rate": 9.827096162955945e-05, "loss": 1.4016, "mean_token_accuracy": 0.7078080683946609, "num_tokens": 102372338.0, "step": 4150 }, { "entropy": 1.737178726196289, "epoch": 0.19897669130187606, "grad_norm": 1.1906094551086426, "learning_rate": 9.945523448602558e-05, "loss": 1.3823, "mean_token_accuracy": 0.7129582542181016, "num_tokens": 103594020.0, "step": 4200 }, { "entropy": 1.739368189573288, "epoch": 0.2013454614364222, "grad_norm": 0.8465049862861633, "learning_rate": 9.999987539454218e-05, "loss": 1.384, "mean_token_accuracy": 0.7121477049589157, "num_tokens": 104839897.0, "step": 4250 }, { "entropy": 1.728913918733597, "epoch": 0.20371423157096835, "grad_norm": 1.0396977663040161, "learning_rate": 9.999898657946416e-05, "loss": 1.4049, "mean_token_accuracy": 0.709805850982666, "num_tokens": 106084752.0, "step": 4300 }, { "entropy": 1.7548469495773316, "epoch": 0.2060830017055145, "grad_norm": 1.1842293739318848, "learning_rate": 9.999724314980077e-05, "loss": 1.3883, "mean_token_accuracy": 0.7103092032670975, "num_tokens": 107308027.0, "step": 4350 }, { "entropy": 1.7609304535388945, "epoch": 0.20845177184006064, "grad_norm": 0.8410583138465881, "learning_rate": 9.999464513535188e-05, "loss": 1.3632, "mean_token_accuracy": 0.7141008460521698, "num_tokens": 108532695.0, "step": 4400 }, { "entropy": 1.7612316942214965, "epoch": 0.21082054197460678, "grad_norm": 0.9074947237968445, "learning_rate": 9.999119258052436e-05, "loss": 1.3728, "mean_token_accuracy": 0.7128197175264358, "num_tokens": 109768914.0, "step": 4450 }, { "entropy": 1.7695635759830475, "epoch": 0.21318931210915293, "grad_norm": 0.9042698740959167, "learning_rate": 9.99868855443315e-05, "loss": 1.3519, "mean_token_accuracy": 0.7166950708627701, "num_tokens": 110984584.0, "step": 4500 }, { "entropy": 1.7432436084747314, "epoch": 0.21555808224369907, "grad_norm": 1.2357442378997803, "learning_rate": 9.99817241003919e-05, "loss": 1.334, "mean_token_accuracy": 0.7201163339614868, "num_tokens": 112235932.0, "step": 4550 }, { "entropy": 1.7642862284183503, "epoch": 0.21792685237824522, "grad_norm": 1.0687198638916016, "learning_rate": 9.997570833692829e-05, "loss": 1.3798, "mean_token_accuracy": 0.7113319665193558, "num_tokens": 113455353.0, "step": 4600 }, { "entropy": 1.7590344095230102, "epoch": 0.22029562251279136, "grad_norm": 1.1026127338409424, "learning_rate": 9.996883835676589e-05, "loss": 1.3825, "mean_token_accuracy": 0.7098899132013321, "num_tokens": 114694421.0, "step": 4650 }, { "entropy": 1.7447860455513, "epoch": 0.2226643926473375, "grad_norm": 1.0826524496078491, "learning_rate": 9.99611142773308e-05, "loss": 1.3484, "mean_token_accuracy": 0.7184046697616577, "num_tokens": 115913968.0, "step": 4700 }, { "entropy": 1.7905651438236236, "epoch": 0.22503316278188365, "grad_norm": 1.1828806400299072, "learning_rate": 9.995253623064793e-05, "loss": 1.4072, "mean_token_accuracy": 0.7065826892852783, "num_tokens": 117100168.0, "step": 4750 }, { "entropy": 1.7732587778568267, "epoch": 0.2274019329164298, "grad_norm": 0.8388417959213257, "learning_rate": 9.994310436333872e-05, "loss": 1.3876, "mean_token_accuracy": 0.7099131292104721, "num_tokens": 118323063.0, "step": 4800 }, { "entropy": 1.7498207116127014, "epoch": 0.22977070305097594, "grad_norm": 0.9928333759307861, "learning_rate": 9.993281883661866e-05, "loss": 1.3248, "mean_token_accuracy": 0.7209601724147796, "num_tokens": 119542247.0, "step": 4850 }, { "entropy": 1.7807526588439941, "epoch": 0.23213947318552208, "grad_norm": 1.180126428604126, "learning_rate": 9.992167982629455e-05, "loss": 1.3807, "mean_token_accuracy": 0.7116306042671203, "num_tokens": 120783656.0, "step": 4900 }, { "entropy": 1.7660968756675721, "epoch": 0.23450824332006823, "grad_norm": 1.035225749015808, "learning_rate": 9.990968752276143e-05, "loss": 1.3906, "mean_token_accuracy": 0.7096653944253921, "num_tokens": 122014053.0, "step": 4950 }, { "entropy": 1.7683662581443786, "epoch": 0.23687701345461437, "grad_norm": 0.8732820153236389, "learning_rate": 9.989684213099944e-05, "loss": 1.363, "mean_token_accuracy": 0.7147561728954315, "num_tokens": 123247491.0, "step": 5000 }, { "epoch": 0.23687701345461437, "eval_entropy": 1.1902209509963915, "eval_loss": NaN, "eval_mean_token_accuracy": 0.7525513527433308, "eval_num_tokens": 123247491.0, "eval_runtime": 749.4439, "eval_samples_per_second": 33.111, "eval_steps_per_second": 4.139, "step": 5000 }, { "entropy": 1.7724631798267365, "epoch": 0.23924578358916052, "grad_norm": 1.2394686937332153, "learning_rate": 9.988314387057021e-05, "loss": 1.4029, "mean_token_accuracy": 0.7083960479497909, "num_tokens": 124486744.0, "step": 5050 }, { "entropy": 1.7794454956054688, "epoch": 0.24161455372370666, "grad_norm": 1.031551718711853, "learning_rate": 9.986859297561312e-05, "loss": 1.3872, "mean_token_accuracy": 0.7082083231210708, "num_tokens": 125689651.0, "step": 5100 }, { "entropy": 1.8115082442760468, "epoch": 0.2439833238582528, "grad_norm": 1.238067388534546, "learning_rate": 9.985318969484139e-05, "loss": 1.4075, "mean_token_accuracy": 0.7077406024932862, "num_tokens": 126912476.0, "step": 5150 }, { "entropy": 1.7362813007831575, "epoch": 0.24635209399279895, "grad_norm": 0.9080651998519897, "learning_rate": 9.983693429153769e-05, "loss": 1.3715, "mean_token_accuracy": 0.7125364172458649, "num_tokens": 128141273.0, "step": 5200 }, { "entropy": 1.7462396609783173, "epoch": 0.2487208641273451, "grad_norm": 0.9258147478103638, "learning_rate": 9.981982704354978e-05, "loss": 1.3539, "mean_token_accuracy": 0.7153694558143616, "num_tokens": 129367296.0, "step": 5250 }, { "entropy": 1.7526134848594666, "epoch": 0.25108963426189124, "grad_norm": 1.0741764307022095, "learning_rate": 9.980186824328563e-05, "loss": 1.3639, "mean_token_accuracy": 0.7122530096769333, "num_tokens": 130622992.0, "step": 5300 }, { "entropy": 1.8136487221717834, "epoch": 0.25345840439643735, "grad_norm": 1.079744815826416, "learning_rate": 9.978305819770852e-05, "loss": 1.3934, "mean_token_accuracy": 0.7090709501504898, "num_tokens": 131844647.0, "step": 5350 }, { "entropy": 1.7428915858268739, "epoch": 0.2558271745309835, "grad_norm": 1.0281189680099487, "learning_rate": 9.976339722833178e-05, "loss": 1.357, "mean_token_accuracy": 0.7154221564531327, "num_tokens": 133100147.0, "step": 5400 }, { "entropy": 1.8012803518772125, "epoch": 0.25819594466552964, "grad_norm": 1.2619256973266602, "learning_rate": 9.974288567121322e-05, "loss": 1.4075, "mean_token_accuracy": 0.7054576027393341, "num_tokens": 134303236.0, "step": 5450 }, { "entropy": 1.7738253235816956, "epoch": 0.2605647148000758, "grad_norm": 1.0344356298446655, "learning_rate": 9.972152387694946e-05, "loss": 1.3516, "mean_token_accuracy": 0.7141925716400146, "num_tokens": 135527480.0, "step": 5500 }, { "entropy": 1.7168458807468414, "epoch": 0.26293348493462193, "grad_norm": 1.062092661857605, "learning_rate": 9.969931221066992e-05, "loss": 1.3439, "mean_token_accuracy": 0.7171407097578049, "num_tokens": 136777268.0, "step": 5550 }, { "entropy": 1.7599689650535584, "epoch": 0.2653022550691681, "grad_norm": 0.9637967348098755, "learning_rate": 9.96762510520306e-05, "loss": 1.3794, "mean_token_accuracy": 0.7112497627735138, "num_tokens": 137993796.0, "step": 5600 }, { "entropy": 1.7565060186386108, "epoch": 0.2676710252037142, "grad_norm": 0.9759653806686401, "learning_rate": 9.965234079520751e-05, "loss": 1.3797, "mean_token_accuracy": 0.7126868903636933, "num_tokens": 139236029.0, "step": 5650 }, { "entropy": 1.7332323002815246, "epoch": 0.2700397953382604, "grad_norm": 1.1588467359542847, "learning_rate": 9.962758184889003e-05, "loss": 1.3803, "mean_token_accuracy": 0.710934864282608, "num_tokens": 140453476.0, "step": 5700 }, { "entropy": 1.763832380771637, "epoch": 0.2724085654728065, "grad_norm": 0.8555989861488342, "learning_rate": 9.960197463627388e-05, "loss": 1.3641, "mean_token_accuracy": 0.7138992995023727, "num_tokens": 141647360.0, "step": 5750 }, { "entropy": 1.7501352691650391, "epoch": 0.2747773356073527, "grad_norm": 0.9515321850776672, "learning_rate": 9.957551959505387e-05, "loss": 1.4013, "mean_token_accuracy": 0.7074063158035279, "num_tokens": 142881658.0, "step": 5800 }, { "entropy": 1.7732372057437897, "epoch": 0.2771461057418988, "grad_norm": 1.0687644481658936, "learning_rate": 9.954821717741643e-05, "loss": 1.3726, "mean_token_accuracy": 0.7110266560316085, "num_tokens": 144097656.0, "step": 5850 }, { "entropy": 1.8280004715919496, "epoch": 0.27951487587644497, "grad_norm": 0.9914586544036865, "learning_rate": 9.952006785003194e-05, "loss": 1.4141, "mean_token_accuracy": 0.7043382048606872, "num_tokens": 145304660.0, "step": 5900 }, { "entropy": 1.7428024232387542, "epoch": 0.2818836460109911, "grad_norm": 0.9063569903373718, "learning_rate": 9.949107209404665e-05, "loss": 1.3871, "mean_token_accuracy": 0.7085927510261536, "num_tokens": 146556399.0, "step": 5950 }, { "entropy": 1.7348086619377137, "epoch": 0.28425241614553726, "grad_norm": 1.1388063430786133, "learning_rate": 9.946123040507451e-05, "loss": 1.4059, "mean_token_accuracy": 0.7068395394086838, "num_tokens": 147781528.0, "step": 6000 }, { "epoch": 0.28425241614553726, "eval_entropy": 1.1620629866956358, "eval_loss": NaN, "eval_mean_token_accuracy": 0.7536570436719308, "eval_num_tokens": 147781528.0, "eval_runtime": 742.238, "eval_samples_per_second": 33.433, "eval_steps_per_second": 4.179, "step": 6000 }, { "entropy": 1.7318350422382354, "epoch": 0.2866211862800834, "grad_norm": 1.0696161985397339, "learning_rate": 9.943054329318873e-05, "loss": 1.3689, "mean_token_accuracy": 0.7137463581562042, "num_tokens": 148993131.0, "step": 6050 }, { "entropy": 1.7341815280914306, "epoch": 0.28898995641462955, "grad_norm": 1.211084246635437, "learning_rate": 9.9399011282913e-05, "loss": 1.3396, "mean_token_accuracy": 0.7190863400697708, "num_tokens": 150231439.0, "step": 6100 }, { "entropy": 1.8086679303646087, "epoch": 0.29135872654917566, "grad_norm": 0.997982919216156, "learning_rate": 9.936663491321256e-05, "loss": 1.3991, "mean_token_accuracy": 0.7076171565055848, "num_tokens": 151425872.0, "step": 6150 }, { "entropy": 1.7646045112609863, "epoch": 0.29372749668372183, "grad_norm": 1.0052849054336548, "learning_rate": 9.9333414737485e-05, "loss": 1.3833, "mean_token_accuracy": 0.7115501266717911, "num_tokens": 152649154.0, "step": 6200 }, { "entropy": 1.7603888380527497, "epoch": 0.29609626681826795, "grad_norm": 1.1485621929168701, "learning_rate": 9.929935132355075e-05, "loss": 1.3774, "mean_token_accuracy": 0.7107777494192123, "num_tokens": 153909546.0, "step": 6250 }, { "entropy": 1.810437490940094, "epoch": 0.2984650369528141, "grad_norm": 1.1413508653640747, "learning_rate": 9.926444525364341e-05, "loss": 1.378, "mean_token_accuracy": 0.711902762055397, "num_tokens": 155120315.0, "step": 6300 }, { "entropy": 1.7656940996646882, "epoch": 0.30083380708736024, "grad_norm": 0.8839899897575378, "learning_rate": 9.922869712439981e-05, "loss": 1.3904, "mean_token_accuracy": 0.7087905770540237, "num_tokens": 156368001.0, "step": 6350 }, { "entropy": 1.7679949700832367, "epoch": 0.3032025772219064, "grad_norm": 1.285138726234436, "learning_rate": 9.91921075468498e-05, "loss": 1.3891, "mean_token_accuracy": 0.7098447853326797, "num_tokens": 157568259.0, "step": 6400 }, { "entropy": 1.775840550661087, "epoch": 0.3055713473564525, "grad_norm": 1.10303795337677, "learning_rate": 9.915467714640578e-05, "loss": 1.3918, "mean_token_accuracy": 0.7079905581474304, "num_tokens": 158791523.0, "step": 6450 }, { "entropy": 1.7338063383102418, "epoch": 0.3079401174909987, "grad_norm": 1.0604420900344849, "learning_rate": 9.911640656285203e-05, "loss": 1.3554, "mean_token_accuracy": 0.714795948266983, "num_tokens": 160073528.0, "step": 6500 }, { "entropy": 1.7426982474327088, "epoch": 0.3103088876255448, "grad_norm": 0.9847440123558044, "learning_rate": 9.907729645033379e-05, "loss": 1.3512, "mean_token_accuracy": 0.7151961398124694, "num_tokens": 161312761.0, "step": 6550 }, { "entropy": 1.8005949878692626, "epoch": 0.312677657760091, "grad_norm": 1.2713630199432373, "learning_rate": 9.903734747734607e-05, "loss": 1.3597, "mean_token_accuracy": 0.7128104782104492, "num_tokens": 162512008.0, "step": 6600 }, { "entropy": 1.8041615283489227, "epoch": 0.3150464278946371, "grad_norm": 0.99453204870224, "learning_rate": 9.899656032672221e-05, "loss": 1.3642, "mean_token_accuracy": 0.7122291630506515, "num_tokens": 163702726.0, "step": 6650 }, { "entropy": 1.7597569704055787, "epoch": 0.3174151980291832, "grad_norm": 1.2227306365966797, "learning_rate": 9.895493569562221e-05, "loss": 1.3276, "mean_token_accuracy": 0.7197510945796967, "num_tokens": 164943131.0, "step": 6700 }, { "entropy": 1.7358013463020325, "epoch": 0.3197839681637294, "grad_norm": 1.1400933265686035, "learning_rate": 9.891247429552082e-05, "loss": 1.384, "mean_token_accuracy": 0.7089168894290924, "num_tokens": 166167321.0, "step": 6750 }, { "entropy": 1.7530862140655517, "epoch": 0.3221527382982755, "grad_norm": 1.2036629915237427, "learning_rate": 9.886917685219541e-05, "loss": 1.3398, "mean_token_accuracy": 0.7184527868032455, "num_tokens": 167397732.0, "step": 6800 }, { "entropy": 1.7854076647758483, "epoch": 0.3245215084328217, "grad_norm": 1.2987496852874756, "learning_rate": 9.88250441057135e-05, "loss": 1.394, "mean_token_accuracy": 0.7086141872406005, "num_tokens": 168618527.0, "step": 6850 }, { "entropy": 1.7509974801540376, "epoch": 0.3268902785673678, "grad_norm": 1.056751012802124, "learning_rate": 9.878007681042014e-05, "loss": 1.3389, "mean_token_accuracy": 0.7182145416736603, "num_tokens": 169856441.0, "step": 6900 }, { "entropy": 1.740920853614807, "epoch": 0.32925904870191397, "grad_norm": 1.1730940341949463, "learning_rate": 9.873427573492507e-05, "loss": 1.3387, "mean_token_accuracy": 0.718420038819313, "num_tokens": 171123051.0, "step": 6950 }, { "entropy": 1.7686040151119231, "epoch": 0.3316278188364601, "grad_norm": 1.139112949371338, "learning_rate": 9.868764166208946e-05, "loss": 1.373, "mean_token_accuracy": 0.7120095008611679, "num_tokens": 172342540.0, "step": 7000 }, { "epoch": 0.3316278188364601, "eval_entropy": 1.1930562926445525, "eval_loss": NaN, "eval_mean_token_accuracy": 0.7542100738587493, "eval_num_tokens": 172342540.0, "eval_runtime": 741.3646, "eval_samples_per_second": 33.472, "eval_steps_per_second": 4.184, "step": 7000 }, { "entropy": 1.7598109781742095, "epoch": 0.33399658897100626, "grad_norm": 1.2180997133255005, "learning_rate": 9.864017538901267e-05, "loss": 1.4032, "mean_token_accuracy": 0.7083274441957473, "num_tokens": 173589426.0, "step": 7050 }, { "entropy": 1.7652622890472411, "epoch": 0.3363653591055524, "grad_norm": 1.3037455081939697, "learning_rate": 9.859187772701853e-05, "loss": 1.369, "mean_token_accuracy": 0.7140497547388077, "num_tokens": 174848839.0, "step": 7100 }, { "entropy": 1.7913592505455016, "epoch": 0.33873412924009855, "grad_norm": 1.1562169790267944, "learning_rate": 9.854274950164149e-05, "loss": 1.3837, "mean_token_accuracy": 0.7107916122674942, "num_tokens": 176055919.0, "step": 7150 }, { "entropy": 1.7908745443820953, "epoch": 0.34110289937464466, "grad_norm": 1.2559897899627686, "learning_rate": 9.849279155261252e-05, "loss": 1.3907, "mean_token_accuracy": 0.7087368202209473, "num_tokens": 177277309.0, "step": 7200 }, { "entropy": 1.753930516242981, "epoch": 0.34347166950919084, "grad_norm": 0.9901047348976135, "learning_rate": 9.844200473384479e-05, "loss": 1.3527, "mean_token_accuracy": 0.716761229634285, "num_tokens": 178518563.0, "step": 7250 }, { "entropy": 1.739516668319702, "epoch": 0.34584043964373695, "grad_norm": 1.2106683254241943, "learning_rate": 9.8390389913419e-05, "loss": 1.3725, "mean_token_accuracy": 0.7121469175815582, "num_tokens": 179742683.0, "step": 7300 }, { "entropy": 1.756061052083969, "epoch": 0.3482092097782831, "grad_norm": 1.0457638502120972, "learning_rate": 9.833794797356861e-05, "loss": 1.3701, "mean_token_accuracy": 0.7125989294052124, "num_tokens": 180940666.0, "step": 7350 }, { "entropy": 1.7689040386676789, "epoch": 0.35057797991282924, "grad_norm": 0.9141308069229126, "learning_rate": 9.828467981066472e-05, "loss": 1.3718, "mean_token_accuracy": 0.7115379917621613, "num_tokens": 182184090.0, "step": 7400 }, { "entropy": 1.7089093339443207, "epoch": 0.3529467500473754, "grad_norm": 0.8629412055015564, "learning_rate": 9.823058633520074e-05, "loss": 1.3324, "mean_token_accuracy": 0.7187563890218734, "num_tokens": 183446222.0, "step": 7450 }, { "entropy": 1.776807938814163, "epoch": 0.35531552018192153, "grad_norm": 0.9498484134674072, "learning_rate": 9.817566847177689e-05, "loss": 1.375, "mean_token_accuracy": 0.7121974611282349, "num_tokens": 184676077.0, "step": 7500 }, { "entropy": 1.8064971625804902, "epoch": 0.3576842903164677, "grad_norm": 1.0395594835281372, "learning_rate": 9.811992715908434e-05, "loss": 1.3748, "mean_token_accuracy": 0.7101496076583862, "num_tokens": 185903667.0, "step": 7550 }, { "entropy": 1.756836792230606, "epoch": 0.3600530604510138, "grad_norm": 0.9577502608299255, "learning_rate": 9.806336334988918e-05, "loss": 1.3556, "mean_token_accuracy": 0.7159949284791947, "num_tokens": 187154538.0, "step": 7600 }, { "entropy": 1.7699120783805846, "epoch": 0.36242183058556, "grad_norm": 1.4034383296966553, "learning_rate": 9.800597801101612e-05, "loss": 1.3911, "mean_token_accuracy": 0.7097045290470123, "num_tokens": 188378482.0, "step": 7650 }, { "entropy": 1.787111645936966, "epoch": 0.3647906007201061, "grad_norm": 1.0781787633895874, "learning_rate": 9.794777212333202e-05, "loss": 1.3937, "mean_token_accuracy": 0.7096772521734238, "num_tokens": 189611171.0, "step": 7700 }, { "entropy": 1.777391802072525, "epoch": 0.3671593708546523, "grad_norm": 1.1259112358093262, "learning_rate": 9.7888746681729e-05, "loss": 1.3448, "mean_token_accuracy": 0.7169349992275238, "num_tokens": 190834562.0, "step": 7750 }, { "entropy": 1.7815845644474029, "epoch": 0.3695281409891984, "grad_norm": 1.1464273929595947, "learning_rate": 9.782890269510765e-05, "loss": 1.4057, "mean_token_accuracy": 0.7066523498296737, "num_tokens": 192054556.0, "step": 7800 }, { "entropy": 1.7850996911525727, "epoch": 0.37189691112374457, "grad_norm": 1.0448256731033325, "learning_rate": 9.776824118635952e-05, "loss": 1.3829, "mean_token_accuracy": 0.7095517975091934, "num_tokens": 193268475.0, "step": 7850 }, { "entropy": 1.8061986804008483, "epoch": 0.3742656812582907, "grad_norm": 0.9750792384147644, "learning_rate": 9.770676319234984e-05, "loss": 1.3863, "mean_token_accuracy": 0.7090413582324981, "num_tokens": 194477246.0, "step": 7900 }, { "entropy": 1.7675806987285614, "epoch": 0.37663445139283686, "grad_norm": 1.0662715435028076, "learning_rate": 9.764446976389974e-05, "loss": 1.3617, "mean_token_accuracy": 0.712408259510994, "num_tokens": 195727604.0, "step": 7950 }, { "entropy": 1.7661338579654693, "epoch": 0.37900322152738297, "grad_norm": 1.0620079040527344, "learning_rate": 9.758136196576822e-05, "loss": 1.3594, "mean_token_accuracy": 0.7141281938552857, "num_tokens": 196957775.0, "step": 8000 }, { "epoch": 0.37900322152738297, "eval_entropy": 1.1821323013705334, "eval_loss": NaN, "eval_mean_token_accuracy": 0.7540838839660376, "eval_num_tokens": 196957775.0, "eval_runtime": 746.7244, "eval_samples_per_second": 33.232, "eval_steps_per_second": 4.154, "step": 8000 }, { "entropy": 1.8103419041633606, "epoch": 0.38137199166192914, "grad_norm": 0.9513231515884399, "learning_rate": 9.751744087663406e-05, "loss": 1.3912, "mean_token_accuracy": 0.7097796177864075, "num_tokens": 198135404.0, "step": 8050 }, { "entropy": 1.7960492491722106, "epoch": 0.38374076179647526, "grad_norm": 1.0502028465270996, "learning_rate": 9.74527075890773e-05, "loss": 1.4075, "mean_token_accuracy": 0.7074997735023498, "num_tokens": 199322966.0, "step": 8100 }, { "entropy": 1.8010617554187776, "epoch": 0.38610953193102143, "grad_norm": 1.0754374265670776, "learning_rate": 9.73871632095606e-05, "loss": 1.3893, "mean_token_accuracy": 0.7116775345802308, "num_tokens": 200538368.0, "step": 8150 }, { "entropy": 1.7480302667617797, "epoch": 0.38847830206556755, "grad_norm": 1.074485421180725, "learning_rate": 9.732080885841031e-05, "loss": 1.3824, "mean_token_accuracy": 0.7114830583333969, "num_tokens": 201768017.0, "step": 8200 }, { "entropy": 1.7346595871448516, "epoch": 0.3908470722001137, "grad_norm": 1.2857214212417603, "learning_rate": 9.725364566979737e-05, "loss": 1.3483, "mean_token_accuracy": 0.7171267950534821, "num_tokens": 203001309.0, "step": 8250 }, { "entropy": 1.7614091503620148, "epoch": 0.39321584233465984, "grad_norm": 0.9842163324356079, "learning_rate": 9.718567479171784e-05, "loss": 1.3712, "mean_token_accuracy": 0.7125260305404663, "num_tokens": 204234311.0, "step": 8300 }, { "entropy": 1.7672381138801574, "epoch": 0.395584612469206, "grad_norm": 1.098926067352295, "learning_rate": 9.711689738597335e-05, "loss": 1.4068, "mean_token_accuracy": 0.7051201003789902, "num_tokens": 205440916.0, "step": 8350 }, { "entropy": 1.7645212149620055, "epoch": 0.3979533826037521, "grad_norm": 1.0630714893341064, "learning_rate": 9.70473146281512e-05, "loss": 1.3971, "mean_token_accuracy": 0.7092112845182419, "num_tokens": 206679396.0, "step": 8400 }, { "entropy": 1.7202996456623076, "epoch": 0.4003221527382983, "grad_norm": 0.9493738412857056, "learning_rate": 9.697692770760431e-05, "loss": 1.349, "mean_token_accuracy": 0.7158361315727234, "num_tokens": 207946846.0, "step": 8450 }, { "entropy": 1.7327898812294007, "epoch": 0.4026909228728444, "grad_norm": 0.8810617327690125, "learning_rate": 9.690573782743082e-05, "loss": 1.3631, "mean_token_accuracy": 0.7150676685571671, "num_tokens": 209162939.0, "step": 8500 }, { "entropy": 1.7277316284179687, "epoch": 0.4050596930073906, "grad_norm": 1.0136702060699463, "learning_rate": 9.683374620445361e-05, "loss": 1.3714, "mean_token_accuracy": 0.7120017749071121, "num_tokens": 210427784.0, "step": 8550 }, { "entropy": 1.7886472380161285, "epoch": 0.4074284631419367, "grad_norm": 1.0549664497375488, "learning_rate": 9.676095406919943e-05, "loss": 1.3664, "mean_token_accuracy": 0.7133744984865189, "num_tokens": 211638614.0, "step": 8600 }, { "entropy": 1.747572809457779, "epoch": 0.4097972332764829, "grad_norm": 1.1670211553573608, "learning_rate": 9.668736266587792e-05, "loss": 1.3495, "mean_token_accuracy": 0.7146046167612076, "num_tokens": 212839094.0, "step": 8650 }, { "entropy": 1.7699014341831207, "epoch": 0.412166003411029, "grad_norm": 1.0434460639953613, "learning_rate": 9.66129732523603e-05, "loss": 1.3686, "mean_token_accuracy": 0.713732448220253, "num_tokens": 214078185.0, "step": 8700 }, { "entropy": 1.7851570510864259, "epoch": 0.41453477354557516, "grad_norm": 1.0788432359695435, "learning_rate": 9.653778710015788e-05, "loss": 1.3735, "mean_token_accuracy": 0.7115869015455246, "num_tokens": 215291596.0, "step": 8750 }, { "entropy": 1.771958166360855, "epoch": 0.4169035436801213, "grad_norm": 0.9727463722229004, "learning_rate": 9.646180549440038e-05, "loss": 1.3858, "mean_token_accuracy": 0.7092594999074936, "num_tokens": 216522630.0, "step": 8800 }, { "entropy": 1.7790643846988679, "epoch": 0.41927231381466745, "grad_norm": 1.125771403312683, "learning_rate": 9.638502973381389e-05, "loss": 1.3779, "mean_token_accuracy": 0.7110064566135407, "num_tokens": 217765170.0, "step": 8850 }, { "entropy": 1.7827233350276948, "epoch": 0.42164108394921357, "grad_norm": 1.0116534233093262, "learning_rate": 9.63074611306987e-05, "loss": 1.3625, "mean_token_accuracy": 0.7156530952453614, "num_tokens": 218976869.0, "step": 8900 }, { "entropy": 1.75447958111763, "epoch": 0.4240098540837597, "grad_norm": 1.3180460929870605, "learning_rate": 9.622910101090686e-05, "loss": 1.3936, "mean_token_accuracy": 0.7107756125926972, "num_tokens": 220217849.0, "step": 8950 }, { "entropy": 1.748335200548172, "epoch": 0.42637862421830586, "grad_norm": 0.986765444278717, "learning_rate": 9.614995071381956e-05, "loss": 1.3734, "mean_token_accuracy": 0.7136638331413269, "num_tokens": 221451171.0, "step": 9000 }, { "epoch": 0.42637862421830586, "eval_entropy": 1.1782315272926747, "eval_loss": NaN, "eval_mean_token_accuracy": 0.7542952842323493, "eval_num_tokens": 221451171.0, "eval_runtime": 744.3949, "eval_samples_per_second": 33.336, "eval_steps_per_second": 4.167, "step": 9000 }, { "entropy": 1.762521461248398, "epoch": 0.428747394352852, "grad_norm": 1.1056315898895264, "learning_rate": 9.607001159232418e-05, "loss": 1.3411, "mean_token_accuracy": 0.7177901411056519, "num_tokens": 222644153.0, "step": 9050 }, { "entropy": 1.7698546504974366, "epoch": 0.43111616448739815, "grad_norm": 1.0218158960342407, "learning_rate": 9.59892850127912e-05, "loss": 1.3568, "mean_token_accuracy": 0.7160427170991898, "num_tokens": 223885271.0, "step": 9100 }, { "entropy": 1.7873007321357728, "epoch": 0.43348493462194426, "grad_norm": 1.0137804746627808, "learning_rate": 9.590777235505085e-05, "loss": 1.3578, "mean_token_accuracy": 0.7130710703134536, "num_tokens": 225093029.0, "step": 9150 }, { "entropy": 1.7597880065441132, "epoch": 0.43585370475649043, "grad_norm": 1.0279192924499512, "learning_rate": 9.582547501236947e-05, "loss": 1.3552, "mean_token_accuracy": 0.7151528036594391, "num_tokens": 226339608.0, "step": 9200 }, { "entropy": 1.7488136601448059, "epoch": 0.43822247489103655, "grad_norm": 1.2627191543579102, "learning_rate": 9.574239439142576e-05, "loss": 1.3368, "mean_token_accuracy": 0.7172259968519211, "num_tokens": 227578157.0, "step": 9250 }, { "entropy": 1.8161335122585296, "epoch": 0.4405912450255827, "grad_norm": 1.4642895460128784, "learning_rate": 9.56585319122867e-05, "loss": 1.3891, "mean_token_accuracy": 0.7093940156698227, "num_tokens": 228810604.0, "step": 9300 }, { "entropy": 1.7846631932258605, "epoch": 0.44296001516012884, "grad_norm": 1.0811119079589844, "learning_rate": 9.557388900838334e-05, "loss": 1.3681, "mean_token_accuracy": 0.7125671052932739, "num_tokens": 230055004.0, "step": 9350 }, { "entropy": 1.748952749967575, "epoch": 0.445328785294675, "grad_norm": 1.0202217102050781, "learning_rate": 9.548846712648616e-05, "loss": 1.355, "mean_token_accuracy": 0.7164496505260467, "num_tokens": 231284769.0, "step": 9400 }, { "entropy": 1.754820455312729, "epoch": 0.44769755542922113, "grad_norm": 1.2328052520751953, "learning_rate": 9.540226772668053e-05, "loss": 1.3402, "mean_token_accuracy": 0.7169833314418793, "num_tokens": 232505637.0, "step": 9450 }, { "entropy": 1.7367425131797791, "epoch": 0.4500663255637673, "grad_norm": 1.0527913570404053, "learning_rate": 9.531529228234155e-05, "loss": 1.3576, "mean_token_accuracy": 0.7145136260986328, "num_tokens": 233725437.0, "step": 9500 }, { "entropy": 1.733099582195282, "epoch": 0.4524350956983134, "grad_norm": 0.8144567608833313, "learning_rate": 9.522754228010906e-05, "loss": 1.3282, "mean_token_accuracy": 0.720543931722641, "num_tokens": 234955358.0, "step": 9550 }, { "entropy": 1.733365514278412, "epoch": 0.4548038658328596, "grad_norm": 1.0677859783172607, "learning_rate": 9.513901921986206e-05, "loss": 1.3275, "mean_token_accuracy": 0.7202348792552948, "num_tokens": 236197729.0, "step": 9600 }, { "entropy": 1.7879818844795228, "epoch": 0.4571726359674057, "grad_norm": 1.0054843425750732, "learning_rate": 9.504972461469319e-05, "loss": 1.3617, "mean_token_accuracy": 0.7137482041120529, "num_tokens": 237418727.0, "step": 9650 }, { "entropy": 1.7609144997596742, "epoch": 0.4595414061019519, "grad_norm": 1.252611756324768, "learning_rate": 9.495965999088285e-05, "loss": 1.3773, "mean_token_accuracy": 0.7108764094114304, "num_tokens": 238640440.0, "step": 9700 }, { "entropy": 1.7785017716884612, "epoch": 0.461910176236498, "grad_norm": 1.1619056463241577, "learning_rate": 9.486882688787305e-05, "loss": 1.3769, "mean_token_accuracy": 0.7111158293485641, "num_tokens": 239845699.0, "step": 9750 }, { "entropy": 1.7622488391399385, "epoch": 0.46427894637104417, "grad_norm": 1.2110604047775269, "learning_rate": 9.477722685824114e-05, "loss": 1.3853, "mean_token_accuracy": 0.7111801999807358, "num_tokens": 241057039.0, "step": 9800 }, { "entropy": 1.771475486755371, "epoch": 0.4666477165055903, "grad_norm": 0.9056064486503601, "learning_rate": 9.46848614676733e-05, "loss": 1.3612, "mean_token_accuracy": 0.7140835148096084, "num_tokens": 242271603.0, "step": 9850 }, { "entropy": 1.7718469250202178, "epoch": 0.46901648664013645, "grad_norm": 1.2525917291641235, "learning_rate": 9.459173229493772e-05, "loss": 1.3937, "mean_token_accuracy": 0.7090546947717666, "num_tokens": 243506199.0, "step": 9900 }, { "entropy": 1.7770234513282777, "epoch": 0.47138525677468257, "grad_norm": 1.0945196151733398, "learning_rate": 9.449784093185765e-05, "loss": 1.3913, "mean_token_accuracy": 0.7097006791830063, "num_tokens": 244728720.0, "step": 9950 }, { "entropy": 1.7675727343559264, "epoch": 0.47375402690922874, "grad_norm": 0.9690260291099548, "learning_rate": 9.440318898328419e-05, "loss": 1.3915, "mean_token_accuracy": 0.7102323162555695, "num_tokens": 245938116.0, "step": 10000 }, { "epoch": 0.47375402690922874, "eval_entropy": 1.1980976138977295, "eval_loss": NaN, "eval_mean_token_accuracy": 0.7557047149064385, "eval_num_tokens": 245938116.0, "eval_runtime": 744.8001, "eval_samples_per_second": 33.318, "eval_steps_per_second": 4.165, "step": 10000 }, { "entropy": 1.8029465341567994, "epoch": 0.47612279704377486, "grad_norm": 1.1126075983047485, "learning_rate": 9.430777806706885e-05, "loss": 1.425, "mean_token_accuracy": 0.7034233027696609, "num_tokens": 247144026.0, "step": 10050 }, { "entropy": 1.7569603097438813, "epoch": 0.47849156717832103, "grad_norm": 1.0091259479522705, "learning_rate": 9.421160981403587e-05, "loss": 1.3778, "mean_token_accuracy": 0.7116102015972138, "num_tokens": 248387083.0, "step": 10100 }, { "entropy": 1.730289832353592, "epoch": 0.48086033731286715, "grad_norm": 0.9621230959892273, "learning_rate": 9.411468586795443e-05, "loss": 1.3592, "mean_token_accuracy": 0.7129039680957794, "num_tokens": 249644502.0, "step": 10150 }, { "entropy": 1.7547185254096984, "epoch": 0.4832291074474133, "grad_norm": 1.0875402688980103, "learning_rate": 9.401700788551047e-05, "loss": 1.3664, "mean_token_accuracy": 0.7126635414361954, "num_tokens": 250876166.0, "step": 10200 }, { "entropy": 1.7428116750717164, "epoch": 0.48559787758195944, "grad_norm": 1.006138563156128, "learning_rate": 9.391857753627837e-05, "loss": 1.3673, "mean_token_accuracy": 0.7143008214235306, "num_tokens": 252091179.0, "step": 10250 }, { "entropy": 1.7458369052410125, "epoch": 0.4879666477165056, "grad_norm": 1.001531720161438, "learning_rate": 9.381939650269249e-05, "loss": 1.3674, "mean_token_accuracy": 0.7141269159317016, "num_tokens": 253307058.0, "step": 10300 }, { "entropy": 1.7744270980358123, "epoch": 0.4903354178510517, "grad_norm": 1.0080331563949585, "learning_rate": 9.371946648001835e-05, "loss": 1.383, "mean_token_accuracy": 0.7098779672384262, "num_tokens": 254550553.0, "step": 10350 }, { "entropy": 1.7707626497745514, "epoch": 0.4927041879855979, "grad_norm": 1.0779789686203003, "learning_rate": 9.361878917632365e-05, "loss": 1.3529, "mean_token_accuracy": 0.7156933480501175, "num_tokens": 255800272.0, "step": 10400 }, { "entropy": 1.7735213398933412, "epoch": 0.495072958120144, "grad_norm": 0.9861488342285156, "learning_rate": 9.351736631244914e-05, "loss": 1.352, "mean_token_accuracy": 0.7177729392051697, "num_tokens": 257029917.0, "step": 10450 }, { "entropy": 1.7585961294174195, "epoch": 0.4974417282546902, "grad_norm": 1.0564011335372925, "learning_rate": 9.341519962197912e-05, "loss": 1.3421, "mean_token_accuracy": 0.7166235017776489, "num_tokens": 258269464.0, "step": 10500 }, { "entropy": 1.745290095806122, "epoch": 0.4998104983892363, "grad_norm": 1.0212265253067017, "learning_rate": 9.331229085121185e-05, "loss": 1.3898, "mean_token_accuracy": 0.710235812664032, "num_tokens": 259531127.0, "step": 10550 }, { "entropy": 1.7529479134082795, "epoch": 0.5021792685237825, "grad_norm": 1.391863226890564, "learning_rate": 9.320864175912972e-05, "loss": 1.378, "mean_token_accuracy": 0.7103132110834122, "num_tokens": 260797490.0, "step": 10600 }, { "entropy": 1.75710902094841, "epoch": 0.5045480386583286, "grad_norm": 1.0978041887283325, "learning_rate": 9.310425411736916e-05, "loss": 1.3386, "mean_token_accuracy": 0.7186200088262558, "num_tokens": 262023377.0, "step": 10650 }, { "entropy": 1.7746818363666534, "epoch": 0.5069168087928747, "grad_norm": 1.0323866605758667, "learning_rate": 9.299912971019036e-05, "loss": 1.3641, "mean_token_accuracy": 0.7137188649177552, "num_tokens": 263266765.0, "step": 10700 }, { "entropy": 1.754684933423996, "epoch": 0.5092855789274209, "grad_norm": 0.9584967494010925, "learning_rate": 9.289327033444674e-05, "loss": 1.3668, "mean_token_accuracy": 0.7127582091093063, "num_tokens": 264493871.0, "step": 10750 }, { "entropy": 1.7670053398609162, "epoch": 0.511654349061967, "grad_norm": 1.0315459966659546, "learning_rate": 9.278667779955437e-05, "loss": 1.3966, "mean_token_accuracy": 0.7078602635860443, "num_tokens": 265716107.0, "step": 10800 }, { "entropy": 1.7422236442565917, "epoch": 0.5140231191965132, "grad_norm": 1.066741943359375, "learning_rate": 9.267935392746081e-05, "loss": 1.3224, "mean_token_accuracy": 0.7229005527496338, "num_tokens": 266969953.0, "step": 10850 }, { "entropy": 1.7432917177677154, "epoch": 0.5163918893310593, "grad_norm": 1.0382195711135864, "learning_rate": 9.25713005526142e-05, "loss": 1.3466, "mean_token_accuracy": 0.7158039021492004, "num_tokens": 268225977.0, "step": 10900 }, { "entropy": 1.7296686470508575, "epoch": 0.5187606594656055, "grad_norm": 1.1235915422439575, "learning_rate": 9.246251952193176e-05, "loss": 1.3222, "mean_token_accuracy": 0.7224133855104446, "num_tokens": 269466793.0, "step": 10950 }, { "entropy": 1.735136388540268, "epoch": 0.5211294296001516, "grad_norm": 0.990793764591217, "learning_rate": 9.235301269476832e-05, "loss": 1.3191, "mean_token_accuracy": 0.7210667967796326, "num_tokens": 270708159.0, "step": 11000 }, { "epoch": 0.5211294296001516, "eval_entropy": 1.181336676109844, "eval_loss": NaN, "eval_mean_token_accuracy": 0.7557077165440849, "eval_num_tokens": 270708159.0, "eval_runtime": 747.4423, "eval_samples_per_second": 33.2, "eval_steps_per_second": 4.15, "step": 11000 }, { "entropy": 1.751279581785202, "epoch": 0.5234981997346978, "grad_norm": 1.3285608291625977, "learning_rate": 9.224278194288444e-05, "loss": 1.3385, "mean_token_accuracy": 0.7199172627925873, "num_tokens": 271927990.0, "step": 11050 }, { "entropy": 1.7399055349826813, "epoch": 0.5258669698692439, "grad_norm": 1.395338535308838, "learning_rate": 9.213182915041445e-05, "loss": 1.3314, "mean_token_accuracy": 0.7194273501634598, "num_tokens": 273153187.0, "step": 11100 }, { "entropy": 1.73090322971344, "epoch": 0.52823574000379, "grad_norm": 1.6059190034866333, "learning_rate": 9.202015621383431e-05, "loss": 1.3223, "mean_token_accuracy": 0.7193130904436111, "num_tokens": 274381622.0, "step": 11150 }, { "entropy": 1.7387698328495025, "epoch": 0.5306045101383362, "grad_norm": 0.9370666742324829, "learning_rate": 9.190776504192909e-05, "loss": 1.3606, "mean_token_accuracy": 0.7134118205308915, "num_tokens": 275611193.0, "step": 11200 }, { "entropy": 1.7551235890388488, "epoch": 0.5329732802728823, "grad_norm": 0.972176730632782, "learning_rate": 9.179465755576045e-05, "loss": 1.4027, "mean_token_accuracy": 0.7102609771490097, "num_tokens": 276860392.0, "step": 11250 }, { "entropy": 1.742105484008789, "epoch": 0.5353420504074284, "grad_norm": 1.3585799932479858, "learning_rate": 9.16808356886337e-05, "loss": 1.3869, "mean_token_accuracy": 0.7101844340562821, "num_tokens": 278102635.0, "step": 11300 }, { "entropy": 1.7393697941303252, "epoch": 0.5377108205419746, "grad_norm": 0.9401509165763855, "learning_rate": 9.156630138606484e-05, "loss": 1.3764, "mean_token_accuracy": 0.7136105120182037, "num_tokens": 279342491.0, "step": 11350 }, { "entropy": 1.7620924258232116, "epoch": 0.5400795906765208, "grad_norm": 1.03669273853302, "learning_rate": 9.145105660574725e-05, "loss": 1.3836, "mean_token_accuracy": 0.7112589359283448, "num_tokens": 280562523.0, "step": 11400 }, { "entropy": 1.7693402111530303, "epoch": 0.5424483608110668, "grad_norm": 1.0556858777999878, "learning_rate": 9.133510331751828e-05, "loss": 1.3543, "mean_token_accuracy": 0.7159368151426315, "num_tokens": 281804551.0, "step": 11450 }, { "entropy": 1.7435523355007172, "epoch": 0.544817130945613, "grad_norm": 1.36162531375885, "learning_rate": 9.121844350332549e-05, "loss": 1.3505, "mean_token_accuracy": 0.7172021287679672, "num_tokens": 283039847.0, "step": 11500 }, { "entropy": 1.7690045988559724, "epoch": 0.5471859010801592, "grad_norm": 1.1119062900543213, "learning_rate": 9.110107915719292e-05, "loss": 1.3536, "mean_token_accuracy": 0.7164638632535935, "num_tokens": 284295808.0, "step": 11550 }, { "entropy": 1.7988950431346893, "epoch": 0.5495546712147054, "grad_norm": 1.2980992794036865, "learning_rate": 9.098301228518683e-05, "loss": 1.387, "mean_token_accuracy": 0.7079293090105057, "num_tokens": 285481962.0, "step": 11600 }, { "entropy": 1.7422871506214141, "epoch": 0.5519234413492514, "grad_norm": 1.0130205154418945, "learning_rate": 9.086424490538157e-05, "loss": 1.3488, "mean_token_accuracy": 0.7166511958837509, "num_tokens": 286739692.0, "step": 11650 }, { "entropy": 1.7421106839179992, "epoch": 0.5542922114837976, "grad_norm": 1.0390921831130981, "learning_rate": 9.074477904782495e-05, "loss": 1.3213, "mean_token_accuracy": 0.7222142660617829, "num_tokens": 287953436.0, "step": 11700 }, { "entropy": 1.7164568746089934, "epoch": 0.5566609816183438, "grad_norm": 0.9376536011695862, "learning_rate": 9.062461675450366e-05, "loss": 1.3204, "mean_token_accuracy": 0.7219431722164154, "num_tokens": 289187059.0, "step": 11750 }, { "entropy": 1.7607939064502716, "epoch": 0.5590297517528899, "grad_norm": 1.1221693754196167, "learning_rate": 9.050376007930831e-05, "loss": 1.358, "mean_token_accuracy": 0.7148712009191514, "num_tokens": 290395472.0, "step": 11800 }, { "entropy": 1.7365293169021607, "epoch": 0.561398521887436, "grad_norm": 1.2102606296539307, "learning_rate": 9.038221108799832e-05, "loss": 1.3362, "mean_token_accuracy": 0.7193614053726196, "num_tokens": 291650032.0, "step": 11850 }, { "entropy": 1.7262990617752074, "epoch": 0.5637672920219822, "grad_norm": 1.1103631258010864, "learning_rate": 9.025997185816662e-05, "loss": 1.3304, "mean_token_accuracy": 0.7197805154323578, "num_tokens": 292891757.0, "step": 11900 }, { "entropy": 1.7870515859127045, "epoch": 0.5661360621565283, "grad_norm": 1.2359330654144287, "learning_rate": 9.013704447920407e-05, "loss": 1.3947, "mean_token_accuracy": 0.7112246352434158, "num_tokens": 294108078.0, "step": 11950 }, { "entropy": 1.7402713179588318, "epoch": 0.5685048322910745, "grad_norm": 1.2696958780288696, "learning_rate": 9.001343105226397e-05, "loss": 1.3456, "mean_token_accuracy": 0.7186821699142456, "num_tokens": 295347523.0, "step": 12000 }, { "epoch": 0.5685048322910745, "eval_entropy": 1.1782386238578055, "eval_loss": NaN, "eval_mean_token_accuracy": 0.755168300715283, "eval_num_tokens": 295347523.0, "eval_runtime": 747.9571, "eval_samples_per_second": 33.177, "eval_steps_per_second": 4.147, "step": 12000 }, { "entropy": 1.7823381924629211, "epoch": 0.5708736024256206, "grad_norm": 1.090854287147522, "learning_rate": 8.988913369022585e-05, "loss": 1.3752, "mean_token_accuracy": 0.7125837200880051, "num_tokens": 296596547.0, "step": 12050 }, { "entropy": 1.7829466736316681, "epoch": 0.5732423725601667, "grad_norm": 0.9252607226371765, "learning_rate": 8.976415451765952e-05, "loss": 1.3646, "mean_token_accuracy": 0.7142701143026352, "num_tokens": 297794903.0, "step": 12100 }, { "entropy": 1.7680902397632599, "epoch": 0.5756111426947129, "grad_norm": 1.0627940893173218, "learning_rate": 8.96384956707888e-05, "loss": 1.3628, "mean_token_accuracy": 0.7150187093019486, "num_tokens": 299029935.0, "step": 12150 }, { "entropy": 1.7492178344726563, "epoch": 0.5779799128292591, "grad_norm": 1.2822635173797607, "learning_rate": 8.951215929745486e-05, "loss": 1.3594, "mean_token_accuracy": 0.71372334420681, "num_tokens": 300256525.0, "step": 12200 }, { "entropy": 1.7705177330970765, "epoch": 0.5803486829638052, "grad_norm": 1.1303389072418213, "learning_rate": 8.93851475570796e-05, "loss": 1.3498, "mean_token_accuracy": 0.7154391181468963, "num_tokens": 301466189.0, "step": 12250 }, { "entropy": 1.7693954205513, "epoch": 0.5827174530983513, "grad_norm": 1.0360733270645142, "learning_rate": 8.925746262062879e-05, "loss": 1.3523, "mean_token_accuracy": 0.71549709379673, "num_tokens": 302677547.0, "step": 12300 }, { "entropy": 1.7499237847328186, "epoch": 0.5850862232328975, "grad_norm": 1.2163889408111572, "learning_rate": 8.912910667057482e-05, "loss": 1.3219, "mean_token_accuracy": 0.7233135092258454, "num_tokens": 303884552.0, "step": 12350 }, { "entropy": 1.7528709161281586, "epoch": 0.5874549933674437, "grad_norm": 1.0694142580032349, "learning_rate": 8.900008190085946e-05, "loss": 1.3695, "mean_token_accuracy": 0.7140274894237518, "num_tokens": 305112064.0, "step": 12400 }, { "entropy": 1.7593362927436829, "epoch": 0.5898237635019897, "grad_norm": 0.9559013247489929, "learning_rate": 8.887039051685646e-05, "loss": 1.3538, "mean_token_accuracy": 0.7164691358804702, "num_tokens": 306349750.0, "step": 12450 }, { "entropy": 1.7250176286697387, "epoch": 0.5921925336365359, "grad_norm": 1.0856672525405884, "learning_rate": 8.874003473533372e-05, "loss": 1.3617, "mean_token_accuracy": 0.7142321610450745, "num_tokens": 307589875.0, "step": 12500 }, { "entropy": 1.7204779553413392, "epoch": 0.5945613037710821, "grad_norm": 1.0638339519500732, "learning_rate": 8.860901678441542e-05, "loss": 1.3523, "mean_token_accuracy": 0.7161801540851593, "num_tokens": 308844739.0, "step": 12550 }, { "entropy": 1.7241905891895295, "epoch": 0.5969300739056282, "grad_norm": 1.0859177112579346, "learning_rate": 8.847733890354397e-05, "loss": 1.3558, "mean_token_accuracy": 0.714522579908371, "num_tokens": 310070098.0, "step": 12600 }, { "entropy": 1.735662100315094, "epoch": 0.5992988440401743, "grad_norm": 1.1100165843963623, "learning_rate": 8.834500334344178e-05, "loss": 1.363, "mean_token_accuracy": 0.7140331470966339, "num_tokens": 311292251.0, "step": 12650 }, { "entropy": 1.7426686155796052, "epoch": 0.6016676141747205, "grad_norm": 1.109788179397583, "learning_rate": 8.821201236607266e-05, "loss": 1.3491, "mean_token_accuracy": 0.7144311499595642, "num_tokens": 312573175.0, "step": 12700 }, { "entropy": 1.7310996508598329, "epoch": 0.6040363843092666, "grad_norm": 1.4260696172714233, "learning_rate": 8.807836824460329e-05, "loss": 1.3352, "mean_token_accuracy": 0.7185973340272903, "num_tokens": 313821355.0, "step": 12750 }, { "entropy": 1.7413757181167602, "epoch": 0.6064051544438128, "grad_norm": 0.9746555685997009, "learning_rate": 8.794407326336427e-05, "loss": 1.3168, "mean_token_accuracy": 0.7220592141151428, "num_tokens": 315041303.0, "step": 12800 }, { "entropy": 1.7303865098953246, "epoch": 0.6087739245783589, "grad_norm": 0.892135739326477, "learning_rate": 8.780912971781112e-05, "loss": 1.3201, "mean_token_accuracy": 0.7211132681369782, "num_tokens": 316288409.0, "step": 12850 }, { "entropy": 1.7497126710414888, "epoch": 0.611142694712905, "grad_norm": 1.199959397315979, "learning_rate": 8.767353991448503e-05, "loss": 1.3052, "mean_token_accuracy": 0.7245729100704194, "num_tokens": 317526338.0, "step": 12900 }, { "entropy": 1.7545914590358733, "epoch": 0.6135114648474512, "grad_norm": 0.9794778227806091, "learning_rate": 8.753730617097342e-05, "loss": 1.3417, "mean_token_accuracy": 0.7178518337011337, "num_tokens": 318776423.0, "step": 12950 }, { "entropy": 1.761199436187744, "epoch": 0.6158802349819974, "grad_norm": 1.115660548210144, "learning_rate": 8.740043081587043e-05, "loss": 1.3428, "mean_token_accuracy": 0.71872696518898, "num_tokens": 319970665.0, "step": 13000 }, { "epoch": 0.6158802349819974, "eval_entropy": 1.173806337542414, "eval_loss": NaN, "eval_mean_token_accuracy": 0.755712005311562, "eval_num_tokens": 319970665.0, "eval_runtime": 729.7637, "eval_samples_per_second": 34.004, "eval_steps_per_second": 4.251, "step": 13000 }, { "entropy": 1.7386520493030548, "epoch": 0.6182490051165435, "grad_norm": 1.0832492113113403, "learning_rate": 8.726291618873692e-05, "loss": 1.3185, "mean_token_accuracy": 0.7225498640537262, "num_tokens": 321195496.0, "step": 13050 }, { "entropy": 1.788149139881134, "epoch": 0.6206177752510896, "grad_norm": 1.0728507041931152, "learning_rate": 8.712476464006069e-05, "loss": 1.3687, "mean_token_accuracy": 0.7138838738203048, "num_tokens": 322394051.0, "step": 13100 }, { "entropy": 1.7250337314605713, "epoch": 0.6229865453856358, "grad_norm": 0.9454106688499451, "learning_rate": 8.698597853121613e-05, "loss": 1.3206, "mean_token_accuracy": 0.7232500827312469, "num_tokens": 323646049.0, "step": 13150 }, { "entropy": 1.7228785872459411, "epoch": 0.625355315520182, "grad_norm": 1.074063777923584, "learning_rate": 8.684656023442404e-05, "loss": 1.3416, "mean_token_accuracy": 0.7188435053825378, "num_tokens": 324901290.0, "step": 13200 }, { "entropy": 1.7498575222492219, "epoch": 0.627724085654728, "grad_norm": 1.3152785301208496, "learning_rate": 8.670651213271087e-05, "loss": 1.3495, "mean_token_accuracy": 0.7163092708587646, "num_tokens": 326143599.0, "step": 13250 }, { "entropy": 1.773080164194107, "epoch": 0.6300928557892742, "grad_norm": 1.117574691772461, "learning_rate": 8.656583661986815e-05, "loss": 1.3716, "mean_token_accuracy": 0.7143875294923783, "num_tokens": 327369948.0, "step": 13300 }, { "entropy": 1.756659119129181, "epoch": 0.6324616259238204, "grad_norm": 1.0091075897216797, "learning_rate": 8.642453610041152e-05, "loss": 1.3815, "mean_token_accuracy": 0.7113278949260712, "num_tokens": 328609411.0, "step": 13350 }, { "entropy": 1.7753915119171142, "epoch": 0.6348303960583664, "grad_norm": 0.9333689212799072, "learning_rate": 8.628261298953963e-05, "loss": 1.3478, "mean_token_accuracy": 0.7168344795703888, "num_tokens": 329812629.0, "step": 13400 }, { "entropy": 1.7191705119609832, "epoch": 0.6371991661929126, "grad_norm": 0.9254161715507507, "learning_rate": 8.614006971309287e-05, "loss": 1.32, "mean_token_accuracy": 0.7235176879167556, "num_tokens": 331045306.0, "step": 13450 }, { "entropy": 1.7539090728759765, "epoch": 0.6395679363274588, "grad_norm": 1.135908842086792, "learning_rate": 8.599690870751189e-05, "loss": 1.2991, "mean_token_accuracy": 0.7238886666297912, "num_tokens": 332265198.0, "step": 13500 }, { "entropy": 1.7759091782569885, "epoch": 0.641936706462005, "grad_norm": 0.9939352869987488, "learning_rate": 8.585313241979593e-05, "loss": 1.3446, "mean_token_accuracy": 0.7167073094844818, "num_tokens": 333478621.0, "step": 13550 }, { "entropy": 1.8198255062103272, "epoch": 0.644305476596551, "grad_norm": 1.1110658645629883, "learning_rate": 8.570874330746109e-05, "loss": 1.3429, "mean_token_accuracy": 0.7163071328401566, "num_tokens": 334679776.0, "step": 13600 }, { "entropy": 1.7491872441768646, "epoch": 0.6466742467310972, "grad_norm": 1.102397084236145, "learning_rate": 8.556374383849815e-05, "loss": 1.3429, "mean_token_accuracy": 0.7170016378164291, "num_tokens": 335924366.0, "step": 13650 }, { "entropy": 1.7320797193050383, "epoch": 0.6490430168656434, "grad_norm": 0.9770281910896301, "learning_rate": 8.541813649133064e-05, "loss": 1.3012, "mean_token_accuracy": 0.7252740359306336, "num_tokens": 337177387.0, "step": 13700 }, { "entropy": 1.7851051843166352, "epoch": 0.6514117870001895, "grad_norm": 1.2061119079589844, "learning_rate": 8.52719237547722e-05, "loss": 1.3423, "mean_token_accuracy": 0.7174914568662644, "num_tokens": 338406873.0, "step": 13750 }, { "entropy": 1.7197813856601716, "epoch": 0.6537805571347356, "grad_norm": 1.0583444833755493, "learning_rate": 8.512510812798426e-05, "loss": 1.3451, "mean_token_accuracy": 0.7177790975570679, "num_tokens": 339627417.0, "step": 13800 }, { "entropy": 1.7355473148822784, "epoch": 0.6561493272692818, "grad_norm": 1.1621958017349243, "learning_rate": 8.49776921204332e-05, "loss": 1.3587, "mean_token_accuracy": 0.7146015846729279, "num_tokens": 340857014.0, "step": 13850 }, { "entropy": 1.7453387939929963, "epoch": 0.6585180974038279, "grad_norm": 1.0361634492874146, "learning_rate": 8.48296782518475e-05, "loss": 1.3769, "mean_token_accuracy": 0.7130674320459366, "num_tokens": 342093808.0, "step": 13900 }, { "entropy": 1.7622779953479766, "epoch": 0.6608868675383741, "grad_norm": 1.2546138763427734, "learning_rate": 8.468106905217465e-05, "loss": 1.348, "mean_token_accuracy": 0.7160887461900711, "num_tokens": 343326476.0, "step": 13950 }, { "entropy": 1.7349292349815368, "epoch": 0.6632556376729202, "grad_norm": 0.9984197020530701, "learning_rate": 8.453186706153789e-05, "loss": 1.301, "mean_token_accuracy": 0.7255065280199051, "num_tokens": 344557978.0, "step": 14000 }, { "epoch": 0.6632556376729202, "eval_entropy": 1.1754964998965876, "eval_loss": NaN, "eval_mean_token_accuracy": 0.7568369234841843, "eval_num_tokens": 344557978.0, "eval_runtime": 728.4569, "eval_samples_per_second": 34.065, "eval_steps_per_second": 4.258, "step": 14000 }, { "entropy": 1.786487684249878, "epoch": 0.6656244078074663, "grad_norm": 1.1214771270751953, "learning_rate": 8.438207483019291e-05, "loss": 1.3981, "mean_token_accuracy": 0.7088551700115204, "num_tokens": 345789604.0, "step": 14050 }, { "entropy": 1.7951791512966155, "epoch": 0.6679931779420125, "grad_norm": 1.0142500400543213, "learning_rate": 8.42316949184841e-05, "loss": 1.3948, "mean_token_accuracy": 0.7093900120258332, "num_tokens": 347013721.0, "step": 14100 }, { "entropy": 1.7639971029758454, "epoch": 0.6703619480765587, "grad_norm": 1.0026280879974365, "learning_rate": 8.408072989680087e-05, "loss": 1.3031, "mean_token_accuracy": 0.7246174013614655, "num_tokens": 348211806.0, "step": 14150 }, { "entropy": 1.7443763566017152, "epoch": 0.6727307182111048, "grad_norm": 1.0735307931900024, "learning_rate": 8.39291823455337e-05, "loss": 1.3052, "mean_token_accuracy": 0.7236789721250534, "num_tokens": 349491496.0, "step": 14200 }, { "entropy": 1.7840288174152374, "epoch": 0.6750994883456509, "grad_norm": 1.1240233182907104, "learning_rate": 8.377705485503007e-05, "loss": 1.3545, "mean_token_accuracy": 0.7152435338497162, "num_tokens": 350709829.0, "step": 14250 }, { "entropy": 1.7674752044677735, "epoch": 0.6774682584801971, "grad_norm": 0.9507238864898682, "learning_rate": 8.36243500255501e-05, "loss": 1.3193, "mean_token_accuracy": 0.7238988935947418, "num_tokens": 351910781.0, "step": 14300 }, { "entropy": 1.7877480947971345, "epoch": 0.6798370286147433, "grad_norm": 0.9499313831329346, "learning_rate": 8.34710704672222e-05, "loss": 1.3392, "mean_token_accuracy": 0.7172271001338959, "num_tokens": 353134751.0, "step": 14350 }, { "entropy": 1.7766230964660645, "epoch": 0.6822057987492893, "grad_norm": 1.2760356664657593, "learning_rate": 8.331721879999841e-05, "loss": 1.3595, "mean_token_accuracy": 0.7147215807437897, "num_tokens": 354350033.0, "step": 14400 }, { "entropy": 1.7614100205898284, "epoch": 0.6845745688838355, "grad_norm": 1.043785572052002, "learning_rate": 8.316279765360957e-05, "loss": 1.3879, "mean_token_accuracy": 0.7108203011751175, "num_tokens": 355573758.0, "step": 14450 }, { "entropy": 1.7683514368534088, "epoch": 0.6869433390183817, "grad_norm": 1.1136603355407715, "learning_rate": 8.300780966752049e-05, "loss": 1.3451, "mean_token_accuracy": 0.7161549615859986, "num_tokens": 356822721.0, "step": 14500 }, { "entropy": 1.7504412484169007, "epoch": 0.6893121091529278, "grad_norm": 1.132605791091919, "learning_rate": 8.28522574908847e-05, "loss": 1.3433, "mean_token_accuracy": 0.7193030816316605, "num_tokens": 358090609.0, "step": 14550 }, { "entropy": 1.7415921115875244, "epoch": 0.6916808792874739, "grad_norm": 0.901418924331665, "learning_rate": 8.269614378249932e-05, "loss": 1.3098, "mean_token_accuracy": 0.7223669987916946, "num_tokens": 359334849.0, "step": 14600 }, { "entropy": 1.72568878531456, "epoch": 0.6940496494220201, "grad_norm": 1.2013583183288574, "learning_rate": 8.253947121075942e-05, "loss": 1.3413, "mean_token_accuracy": 0.7166631370782852, "num_tokens": 360565890.0, "step": 14650 }, { "entropy": 1.7351550233364106, "epoch": 0.6964184195565662, "grad_norm": 0.9248843193054199, "learning_rate": 8.238224245361262e-05, "loss": 1.3269, "mean_token_accuracy": 0.7205180561542511, "num_tokens": 361780402.0, "step": 14700 }, { "entropy": 1.7353229641914367, "epoch": 0.6987871896911124, "grad_norm": 0.9147818088531494, "learning_rate": 8.222446019851314e-05, "loss": 1.3239, "mean_token_accuracy": 0.7209709006547927, "num_tokens": 362998310.0, "step": 14750 }, { "entropy": 1.7628222048282622, "epoch": 0.7011559598256585, "grad_norm": 1.0660256147384644, "learning_rate": 8.206612714237601e-05, "loss": 1.3736, "mean_token_accuracy": 0.7127251303195954, "num_tokens": 364192705.0, "step": 14800 }, { "entropy": 1.7622958242893219, "epoch": 0.7035247299602047, "grad_norm": 1.133527398109436, "learning_rate": 8.190724599153083e-05, "loss": 1.3252, "mean_token_accuracy": 0.7197421258687973, "num_tokens": 365419544.0, "step": 14850 }, { "entropy": 1.7733166551589965, "epoch": 0.7058935000947508, "grad_norm": 1.0449475049972534, "learning_rate": 8.174781946167563e-05, "loss": 1.3422, "mean_token_accuracy": 0.7184215635061264, "num_tokens": 366668472.0, "step": 14900 }, { "entropy": 1.7824318826198577, "epoch": 0.708262270229297, "grad_norm": 0.9425482749938965, "learning_rate": 8.158785027783038e-05, "loss": 1.351, "mean_token_accuracy": 0.7144128715991974, "num_tokens": 367883921.0, "step": 14950 }, { "entropy": 1.7383503484725953, "epoch": 0.7106310403638431, "grad_norm": 1.0266870260238647, "learning_rate": 8.14273411742905e-05, "loss": 1.3003, "mean_token_accuracy": 0.7255125510692596, "num_tokens": 369125857.0, "step": 15000 }, { "epoch": 0.7106310403638431, "eval_entropy": 1.1784183711370755, "eval_loss": NaN, "eval_mean_token_accuracy": 0.7566592082102632, "eval_num_tokens": 369125857.0, "eval_runtime": 728.4679, "eval_samples_per_second": 34.065, "eval_steps_per_second": 4.258, "step": 15000 }, { "entropy": 1.7569481348991394, "epoch": 0.7129998104983892, "grad_norm": 1.0710499286651611, "learning_rate": 8.126629489457998e-05, "loss": 1.3493, "mean_token_accuracy": 0.7171416920423508, "num_tokens": 370360655.0, "step": 15050 }, { "entropy": 1.7639206099510192, "epoch": 0.7153685806329354, "grad_norm": 1.023205041885376, "learning_rate": 8.110471419140461e-05, "loss": 1.3816, "mean_token_accuracy": 0.7107687264680862, "num_tokens": 371611998.0, "step": 15100 }, { "entropy": 1.7124925446510315, "epoch": 0.7177373507674816, "grad_norm": 1.4796391725540161, "learning_rate": 8.094260182660491e-05, "loss": 1.3103, "mean_token_accuracy": 0.7245303303003311, "num_tokens": 372852886.0, "step": 15150 }, { "entropy": 1.7330104005336762, "epoch": 0.7201061209020276, "grad_norm": 1.1054223775863647, "learning_rate": 8.077996057110881e-05, "loss": 1.3446, "mean_token_accuracy": 0.7186214071512222, "num_tokens": 374060791.0, "step": 15200 }, { "entropy": 1.781588876247406, "epoch": 0.7224748910365738, "grad_norm": 1.2375303506851196, "learning_rate": 8.06167932048845e-05, "loss": 1.3815, "mean_token_accuracy": 0.7106660062074661, "num_tokens": 375260162.0, "step": 15250 }, { "entropy": 1.7753887116909026, "epoch": 0.72484366117112, "grad_norm": 1.0260518789291382, "learning_rate": 8.045310251689269e-05, "loss": 1.3782, "mean_token_accuracy": 0.7120629328489304, "num_tokens": 376480540.0, "step": 15300 }, { "entropy": 1.7605856931209565, "epoch": 0.7272124313056662, "grad_norm": 1.0135972499847412, "learning_rate": 8.028889130503908e-05, "loss": 1.3664, "mean_token_accuracy": 0.714390983581543, "num_tokens": 377707870.0, "step": 15350 }, { "entropy": 1.7473648416996002, "epoch": 0.7295812014402122, "grad_norm": 1.5319159030914307, "learning_rate": 8.012416237612651e-05, "loss": 1.3251, "mean_token_accuracy": 0.7199866360425949, "num_tokens": 378945180.0, "step": 15400 }, { "entropy": 1.7613112390041352, "epoch": 0.7319499715747584, "grad_norm": 1.1516921520233154, "learning_rate": 7.995891854580694e-05, "loss": 1.3398, "mean_token_accuracy": 0.7185401087999344, "num_tokens": 380202318.0, "step": 15450 }, { "entropy": 1.7631869399547577, "epoch": 0.7343187417093046, "grad_norm": 1.2842717170715332, "learning_rate": 7.979316263853338e-05, "loss": 1.3246, "mean_token_accuracy": 0.7208184325695037, "num_tokens": 381422244.0, "step": 15500 }, { "entropy": 1.7426608395576477, "epoch": 0.7366875118438506, "grad_norm": 1.2845314741134644, "learning_rate": 7.962689748751158e-05, "loss": 1.3073, "mean_token_accuracy": 0.7258092379570007, "num_tokens": 382656317.0, "step": 15550 }, { "entropy": 1.7413418543338777, "epoch": 0.7390562819783968, "grad_norm": 1.1051653623580933, "learning_rate": 7.94601259346516e-05, "loss": 1.3248, "mean_token_accuracy": 0.7227061313390731, "num_tokens": 383887770.0, "step": 15600 }, { "entropy": 1.7754042732715607, "epoch": 0.741425052112943, "grad_norm": 1.0099495649337769, "learning_rate": 7.929285083051921e-05, "loss": 1.3818, "mean_token_accuracy": 0.713128559589386, "num_tokens": 385130213.0, "step": 15650 }, { "entropy": 1.7583998191356658, "epoch": 0.7437938222474891, "grad_norm": 1.0357869863510132, "learning_rate": 7.912507503428728e-05, "loss": 1.3513, "mean_token_accuracy": 0.716811910867691, "num_tokens": 386352005.0, "step": 15700 }, { "entropy": 1.7743099415302277, "epoch": 0.7461625923820352, "grad_norm": 1.10836660861969, "learning_rate": 7.895680141368678e-05, "loss": 1.3314, "mean_token_accuracy": 0.7205884575843811, "num_tokens": 387565047.0, "step": 15750 }, { "entropy": 1.7909239864349364, "epoch": 0.7485313625165814, "grad_norm": 1.0026726722717285, "learning_rate": 7.87880328449578e-05, "loss": 1.3547, "mean_token_accuracy": 0.7177285236120224, "num_tokens": 388787505.0, "step": 15800 }, { "entropy": 1.7829215788841248, "epoch": 0.7509001326511275, "grad_norm": 1.3079992532730103, "learning_rate": 7.86187722128004e-05, "loss": 1.329, "mean_token_accuracy": 0.720749350786209, "num_tokens": 390046573.0, "step": 15850 }, { "entropy": 1.7563760423660277, "epoch": 0.7532689027856737, "grad_norm": 1.1663581132888794, "learning_rate": 7.844902241032535e-05, "loss": 1.3364, "mean_token_accuracy": 0.7199984455108642, "num_tokens": 391284239.0, "step": 15900 }, { "entropy": 1.7583712506294251, "epoch": 0.7556376729202198, "grad_norm": 1.0669708251953125, "learning_rate": 7.827878633900461e-05, "loss": 1.3233, "mean_token_accuracy": 0.7232204431295395, "num_tokens": 392511286.0, "step": 15950 }, { "entropy": 1.7710711109638213, "epoch": 0.7580064430547659, "grad_norm": 1.1993380784988403, "learning_rate": 7.81080669086217e-05, "loss": 1.3633, "mean_token_accuracy": 0.7153352189064026, "num_tokens": 393762386.0, "step": 16000 }, { "epoch": 0.7580064430547659, "eval_entropy": 1.205138478718751, "eval_loss": NaN, "eval_mean_token_accuracy": 0.7560521679240454, "eval_num_tokens": 393762386.0, "eval_runtime": 728.478, "eval_samples_per_second": 34.064, "eval_steps_per_second": 4.258, "step": 16000 }, { "entropy": 1.7863057303428649, "epoch": 0.7603752131893121, "grad_norm": 0.9311954975128174, "learning_rate": 7.793686703722212e-05, "loss": 1.3477, "mean_token_accuracy": 0.7189880239963532, "num_tokens": 394971495.0, "step": 16050 }, { "entropy": 1.747820656299591, "epoch": 0.7627439833238583, "grad_norm": 1.0288971662521362, "learning_rate": 7.776518965106327e-05, "loss": 1.3034, "mean_token_accuracy": 0.7258507144451142, "num_tokens": 396221548.0, "step": 16100 }, { "entropy": 1.7468533515930176, "epoch": 0.7651127534584043, "grad_norm": 1.0182217359542847, "learning_rate": 7.759303768456463e-05, "loss": 1.3123, "mean_token_accuracy": 0.7229688459634781, "num_tokens": 397439687.0, "step": 16150 }, { "entropy": 1.7685637962818146, "epoch": 0.7674815235929505, "grad_norm": 1.1658631563186646, "learning_rate": 7.742041408025747e-05, "loss": 1.3163, "mean_token_accuracy": 0.7229421508312225, "num_tokens": 398648499.0, "step": 16200 }, { "entropy": 1.74519140958786, "epoch": 0.7698502937274967, "grad_norm": 1.0480293035507202, "learning_rate": 7.724732178873456e-05, "loss": 1.3396, "mean_token_accuracy": 0.7191978305578232, "num_tokens": 399900933.0, "step": 16250 }, { "entropy": 1.73216095328331, "epoch": 0.7722190638620429, "grad_norm": 1.105089783668518, "learning_rate": 7.707376376859984e-05, "loss": 1.3092, "mean_token_accuracy": 0.7250679528713226, "num_tokens": 401117830.0, "step": 16300 }, { "entropy": 1.76406853556633, "epoch": 0.7745878339965889, "grad_norm": 1.2632781267166138, "learning_rate": 7.689974298641773e-05, "loss": 1.3509, "mean_token_accuracy": 0.7167004567384719, "num_tokens": 402347744.0, "step": 16350 }, { "entropy": 1.7978839790821075, "epoch": 0.7769566041311351, "grad_norm": 1.0637677907943726, "learning_rate": 7.672526241666248e-05, "loss": 1.3469, "mean_token_accuracy": 0.71729552090168, "num_tokens": 403549647.0, "step": 16400 }, { "entropy": 1.764045135974884, "epoch": 0.7793253742656813, "grad_norm": 0.9130464196205139, "learning_rate": 7.655032504166735e-05, "loss": 1.3204, "mean_token_accuracy": 0.7207730168104172, "num_tokens": 404774771.0, "step": 16450 }, { "entropy": 1.7537085354328155, "epoch": 0.7816941444002274, "grad_norm": 1.1020361185073853, "learning_rate": 7.637493385157358e-05, "loss": 1.327, "mean_token_accuracy": 0.7206742608547211, "num_tokens": 406011265.0, "step": 16500 }, { "entropy": 1.755173259973526, "epoch": 0.7840629145347735, "grad_norm": 0.9496687650680542, "learning_rate": 7.619909184427934e-05, "loss": 1.3013, "mean_token_accuracy": 0.7237769782543182, "num_tokens": 407262276.0, "step": 16550 }, { "entropy": 1.8000263261795044, "epoch": 0.7864316846693197, "grad_norm": 1.295494556427002, "learning_rate": 7.602280202538839e-05, "loss": 1.3753, "mean_token_accuracy": 0.7130093973875046, "num_tokens": 408508718.0, "step": 16600 }, { "entropy": 1.746513249874115, "epoch": 0.7888004548038658, "grad_norm": 1.1544225215911865, "learning_rate": 7.584606740815885e-05, "loss": 1.3246, "mean_token_accuracy": 0.7214300912618637, "num_tokens": 409745538.0, "step": 16650 }, { "entropy": 1.8098858451843263, "epoch": 0.791169224938412, "grad_norm": 0.9912792444229126, "learning_rate": 7.566889101345156e-05, "loss": 1.3452, "mean_token_accuracy": 0.7167094177007676, "num_tokens": 410988780.0, "step": 16700 }, { "entropy": 1.735760669708252, "epoch": 0.7935379950729581, "grad_norm": 0.9103946685791016, "learning_rate": 7.549127586967853e-05, "loss": 1.3319, "mean_token_accuracy": 0.7208295828104019, "num_tokens": 412261045.0, "step": 16750 }, { "entropy": 1.7235812985897063, "epoch": 0.7959067652075043, "grad_norm": 0.9902112483978271, "learning_rate": 7.531322501275114e-05, "loss": 1.3523, "mean_token_accuracy": 0.7184983837604523, "num_tokens": 413490577.0, "step": 16800 }, { "entropy": 1.7275058662891387, "epoch": 0.7982755353420504, "grad_norm": 0.857623279094696, "learning_rate": 7.513474148602826e-05, "loss": 1.3474, "mean_token_accuracy": 0.71783855676651, "num_tokens": 414734324.0, "step": 16850 }, { "entropy": 1.712952392101288, "epoch": 0.8006443054765966, "grad_norm": 0.8611600399017334, "learning_rate": 7.495582834026421e-05, "loss": 1.3284, "mean_token_accuracy": 0.7218652653694153, "num_tokens": 415979550.0, "step": 16900 }, { "entropy": 1.7426096272468568, "epoch": 0.8030130756111427, "grad_norm": 1.153913140296936, "learning_rate": 7.47764886335567e-05, "loss": 1.3673, "mean_token_accuracy": 0.7147996026277542, "num_tokens": 417172690.0, "step": 16950 }, { "entropy": 1.7134468042850495, "epoch": 0.8053818457456888, "grad_norm": 0.9624414443969727, "learning_rate": 7.459672543129438e-05, "loss": 1.3301, "mean_token_accuracy": 0.7208444583415985, "num_tokens": 418396867.0, "step": 17000 }, { "epoch": 0.8053818457456888, "eval_entropy": 1.1603839400696954, "eval_loss": NaN, "eval_mean_token_accuracy": 0.7568193746719877, "eval_num_tokens": 418396867.0, "eval_runtime": 730.007, "eval_samples_per_second": 33.993, "eval_steps_per_second": 4.249, "step": 17000 }, { "entropy": 1.7083112740516662, "epoch": 0.807750615880235, "grad_norm": 1.0578949451446533, "learning_rate": 7.441654180610466e-05, "loss": 1.3116, "mean_token_accuracy": 0.7241713929176331, "num_tokens": 419620242.0, "step": 17050 }, { "entropy": 1.7101111936569213, "epoch": 0.8101193860147812, "grad_norm": 1.4528478384017944, "learning_rate": 7.423594083780106e-05, "loss": 1.2894, "mean_token_accuracy": 0.7281060153245926, "num_tokens": 420840365.0, "step": 17100 }, { "entropy": 1.7118054771423339, "epoch": 0.8124881561493272, "grad_norm": 0.9906657338142395, "learning_rate": 7.405492561333052e-05, "loss": 1.3313, "mean_token_accuracy": 0.7208691501617431, "num_tokens": 422103109.0, "step": 17150 }, { "entropy": 1.743634682893753, "epoch": 0.8148569262838734, "grad_norm": 1.1013976335525513, "learning_rate": 7.387349922672082e-05, "loss": 1.3435, "mean_token_accuracy": 0.7182679337263107, "num_tokens": 423336510.0, "step": 17200 }, { "entropy": 1.7483525812625884, "epoch": 0.8172256964184196, "grad_norm": 1.1249130964279175, "learning_rate": 7.369166477902753e-05, "loss": 1.3356, "mean_token_accuracy": 0.7192718476057053, "num_tokens": 424558544.0, "step": 17250 }, { "entropy": 1.7446792232990265, "epoch": 0.8195944665529658, "grad_norm": 0.9279561042785645, "learning_rate": 7.350942537828105e-05, "loss": 1.357, "mean_token_accuracy": 0.7167576777935029, "num_tokens": 425791336.0, "step": 17300 }, { "entropy": 1.7697773826122285, "epoch": 0.8219632366875118, "grad_norm": 1.0355454683303833, "learning_rate": 7.332678413943352e-05, "loss": 1.3279, "mean_token_accuracy": 0.7211151129007339, "num_tokens": 427017026.0, "step": 17350 }, { "entropy": 1.769633387327194, "epoch": 0.824332006822058, "grad_norm": 1.1847100257873535, "learning_rate": 7.314374418430554e-05, "loss": 1.3239, "mean_token_accuracy": 0.7223846167325974, "num_tokens": 428272562.0, "step": 17400 }, { "entropy": 1.754239571094513, "epoch": 0.8267007769566042, "grad_norm": 1.0456291437149048, "learning_rate": 7.296030864153286e-05, "loss": 1.3136, "mean_token_accuracy": 0.7257154327630997, "num_tokens": 429502230.0, "step": 17450 }, { "entropy": 1.727893146276474, "epoch": 0.8290695470911503, "grad_norm": 1.0988260507583618, "learning_rate": 7.277648064651281e-05, "loss": 1.3325, "mean_token_accuracy": 0.7202855634689331, "num_tokens": 430738126.0, "step": 17500 }, { "entropy": 1.7441512525081635, "epoch": 0.8314383172256964, "grad_norm": 1.4698035717010498, "learning_rate": 7.259226334135079e-05, "loss": 1.303, "mean_token_accuracy": 0.7249046045541764, "num_tokens": 431957649.0, "step": 17550 }, { "entropy": 1.7701557087898254, "epoch": 0.8338070873602426, "grad_norm": 0.8762974143028259, "learning_rate": 7.240765987480654e-05, "loss": 1.3501, "mean_token_accuracy": 0.7148396277427673, "num_tokens": 433171928.0, "step": 17600 }, { "entropy": 1.771160396337509, "epoch": 0.8361758574947887, "grad_norm": 0.9736217260360718, "learning_rate": 7.222267340224034e-05, "loss": 1.324, "mean_token_accuracy": 0.7225921380519867, "num_tokens": 434354105.0, "step": 17650 }, { "entropy": 1.7395762205123901, "epoch": 0.8385446276293349, "grad_norm": 1.1522939205169678, "learning_rate": 7.203730708555897e-05, "loss": 1.3243, "mean_token_accuracy": 0.7200587207078933, "num_tokens": 435556247.0, "step": 17700 }, { "entropy": 1.7756438231468201, "epoch": 0.840913397763881, "grad_norm": 1.2958649396896362, "learning_rate": 7.185156409316186e-05, "loss": 1.374, "mean_token_accuracy": 0.7119175827503205, "num_tokens": 436745432.0, "step": 17750 }, { "entropy": 1.7542549967765808, "epoch": 0.8432821678984271, "grad_norm": 1.1340820789337158, "learning_rate": 7.166544759988676e-05, "loss": 1.3066, "mean_token_accuracy": 0.7246117842197418, "num_tokens": 437965467.0, "step": 17800 }, { "entropy": 1.7582876706123352, "epoch": 0.8456509380329733, "grad_norm": 1.3607231378555298, "learning_rate": 7.147896078695551e-05, "loss": 1.304, "mean_token_accuracy": 0.724840202331543, "num_tokens": 439190298.0, "step": 17850 }, { "entropy": 1.7604767334461213, "epoch": 0.8480197081675194, "grad_norm": 1.2566254138946533, "learning_rate": 7.129210684191973e-05, "loss": 1.3237, "mean_token_accuracy": 0.7207924181222916, "num_tokens": 440410573.0, "step": 17900 }, { "entropy": 1.7759632766246796, "epoch": 0.8503884783020655, "grad_norm": 1.3075144290924072, "learning_rate": 7.110488895860633e-05, "loss": 1.3476, "mean_token_accuracy": 0.7191230463981628, "num_tokens": 441635031.0, "step": 17950 }, { "entropy": 1.7343137776851654, "epoch": 0.8527572484366117, "grad_norm": 0.9776571393013, "learning_rate": 7.091731033706281e-05, "loss": 1.3101, "mean_token_accuracy": 0.7254330676794052, "num_tokens": 442870901.0, "step": 18000 }, { "epoch": 0.8527572484366117, "eval_entropy": 1.1681382538694325, "eval_loss": NaN, "eval_mean_token_accuracy": 0.7576727671230478, "eval_num_tokens": 442870901.0, "eval_runtime": 728.745, "eval_samples_per_second": 34.052, "eval_steps_per_second": 4.257, "step": 18000 }, { "entropy": 1.7468896472454072, "epoch": 0.8551260185711579, "grad_norm": 1.1819489002227783, "learning_rate": 7.072937418350267e-05, "loss": 1.3424, "mean_token_accuracy": 0.7205572354793549, "num_tokens": 444113226.0, "step": 18050 }, { "entropy": 1.752768530845642, "epoch": 0.857494788705704, "grad_norm": 1.1274313926696777, "learning_rate": 7.05410837102506e-05, "loss": 1.3276, "mean_token_accuracy": 0.7209989041090011, "num_tokens": 445334171.0, "step": 18100 }, { "entropy": 1.7303791618347169, "epoch": 0.8598635588402501, "grad_norm": 1.3661671876907349, "learning_rate": 7.035244213568752e-05, "loss": 1.2946, "mean_token_accuracy": 0.7279473000764847, "num_tokens": 446563863.0, "step": 18150 }, { "entropy": 1.7404825651645661, "epoch": 0.8622323289747963, "grad_norm": 1.2854536771774292, "learning_rate": 7.016345268419559e-05, "loss": 1.3414, "mean_token_accuracy": 0.7202253836393356, "num_tokens": 447805772.0, "step": 18200 }, { "entropy": 1.7380101013183593, "epoch": 0.8646010991093425, "grad_norm": 1.1535879373550415, "learning_rate": 6.997411858610311e-05, "loss": 1.3059, "mean_token_accuracy": 0.7250276601314545, "num_tokens": 449010921.0, "step": 18250 }, { "entropy": 1.7006947088241577, "epoch": 0.8669698692438885, "grad_norm": 1.076830506324768, "learning_rate": 6.978444307762932e-05, "loss": 1.2936, "mean_token_accuracy": 0.7278600412607193, "num_tokens": 450222021.0, "step": 18300 }, { "entropy": 1.7165203237533568, "epoch": 0.8693386393784347, "grad_norm": 1.1687458753585815, "learning_rate": 6.959442940082907e-05, "loss": 1.3093, "mean_token_accuracy": 0.7266046351194382, "num_tokens": 451437320.0, "step": 18350 }, { "entropy": 1.7525631844997407, "epoch": 0.8717074095129809, "grad_norm": 1.0274993181228638, "learning_rate": 6.940408080353737e-05, "loss": 1.3405, "mean_token_accuracy": 0.7197101265192032, "num_tokens": 452637999.0, "step": 18400 }, { "entropy": 1.6950951242446899, "epoch": 0.874076179647527, "grad_norm": 1.170281171798706, "learning_rate": 6.921340053931389e-05, "loss": 1.322, "mean_token_accuracy": 0.7230970364809036, "num_tokens": 453872583.0, "step": 18450 }, { "entropy": 1.7251943707466126, "epoch": 0.8764449497820731, "grad_norm": 1.0783965587615967, "learning_rate": 6.902239186738742e-05, "loss": 1.3077, "mean_token_accuracy": 0.7254717952013016, "num_tokens": 455115487.0, "step": 18500 }, { "entropy": 1.748286772966385, "epoch": 0.8788137199166193, "grad_norm": 0.9898918867111206, "learning_rate": 6.883105805260006e-05, "loss": 1.336, "mean_token_accuracy": 0.7198050141334533, "num_tokens": 456357289.0, "step": 18550 }, { "entropy": 1.7271603178977966, "epoch": 0.8811824900511654, "grad_norm": 1.0102683305740356, "learning_rate": 6.863940236535146e-05, "loss": 1.2972, "mean_token_accuracy": 0.7267592811584472, "num_tokens": 457601954.0, "step": 18600 }, { "entropy": 1.7312583494186402, "epoch": 0.8835512601857116, "grad_norm": 1.1015334129333496, "learning_rate": 6.844742808154297e-05, "loss": 1.3264, "mean_token_accuracy": 0.7216020065546036, "num_tokens": 458836210.0, "step": 18650 }, { "entropy": 1.7203202879428863, "epoch": 0.8859200303202577, "grad_norm": 1.2608678340911865, "learning_rate": 6.82551384825215e-05, "loss": 1.3112, "mean_token_accuracy": 0.7242467325925827, "num_tokens": 460059995.0, "step": 18700 }, { "entropy": 1.73845316529274, "epoch": 0.8882888004548039, "grad_norm": 1.2071726322174072, "learning_rate": 6.806253685502361e-05, "loss": 1.3422, "mean_token_accuracy": 0.7193791323900223, "num_tokens": 461321260.0, "step": 18750 }, { "entropy": 1.7286129772663117, "epoch": 0.89065757058935, "grad_norm": 0.9281275868415833, "learning_rate": 6.786962649111926e-05, "loss": 1.3346, "mean_token_accuracy": 0.7215994411706924, "num_tokens": 462547797.0, "step": 18800 }, { "entropy": 1.7290022671222687, "epoch": 0.8930263407238962, "grad_norm": 1.4838134050369263, "learning_rate": 6.767641068815546e-05, "loss": 1.2936, "mean_token_accuracy": 0.7260348951816559, "num_tokens": 463769872.0, "step": 18850 }, { "entropy": 1.7164359045028688, "epoch": 0.8953951108584423, "grad_norm": 0.9822611808776855, "learning_rate": 6.748289274870001e-05, "loss": 1.2841, "mean_token_accuracy": 0.7294727778434753, "num_tokens": 465012929.0, "step": 18900 }, { "entropy": 1.778987593650818, "epoch": 0.8977638809929884, "grad_norm": 1.056518793106079, "learning_rate": 6.728907598048503e-05, "loss": 1.3276, "mean_token_accuracy": 0.7213660079240799, "num_tokens": 466199667.0, "step": 18950 }, { "entropy": 1.7454373347759247, "epoch": 0.9001326511275346, "grad_norm": 1.1590458154678345, "learning_rate": 6.709496369635043e-05, "loss": 1.3057, "mean_token_accuracy": 0.7262363374233246, "num_tokens": 467441804.0, "step": 19000 }, { "epoch": 0.9001326511275346, "eval_entropy": 1.1678229505634554, "eval_loss": NaN, "eval_mean_token_accuracy": 0.7581798996773941, "eval_num_tokens": 467441804.0, "eval_runtime": 729.0934, "eval_samples_per_second": 34.035, "eval_steps_per_second": 4.255, "step": 19000 }, { "entropy": 1.7560745322704314, "epoch": 0.9025014212620808, "grad_norm": 1.1119104623794556, "learning_rate": 6.69005592141872e-05, "loss": 1.3103, "mean_token_accuracy": 0.7248196619749069, "num_tokens": 468672442.0, "step": 19050 }, { "entropy": 1.7368404233455659, "epoch": 0.9048701913966268, "grad_norm": 1.0841938257217407, "learning_rate": 6.670586585688086e-05, "loss": 1.3168, "mean_token_accuracy": 0.723661498427391, "num_tokens": 469911642.0, "step": 19100 }, { "entropy": 1.7659292578697205, "epoch": 0.907238961531173, "grad_norm": 1.3782135248184204, "learning_rate": 6.651088695225447e-05, "loss": 1.3044, "mean_token_accuracy": 0.7251561576128006, "num_tokens": 471110856.0, "step": 19150 }, { "entropy": 1.7336669373512268, "epoch": 0.9096077316657192, "grad_norm": 1.1556999683380127, "learning_rate": 6.631562583301191e-05, "loss": 1.297, "mean_token_accuracy": 0.7274081045389176, "num_tokens": 472320155.0, "step": 19200 }, { "entropy": 1.684252212047577, "epoch": 0.9119765018002653, "grad_norm": 0.9659352898597717, "learning_rate": 6.612008583668082e-05, "loss": 1.3105, "mean_token_accuracy": 0.7258839225769043, "num_tokens": 473560540.0, "step": 19250 }, { "entropy": 1.736447709798813, "epoch": 0.9143452719348114, "grad_norm": 1.231652855873108, "learning_rate": 6.592427030555565e-05, "loss": 1.3364, "mean_token_accuracy": 0.7204890990257263, "num_tokens": 474795749.0, "step": 19300 }, { "entropy": 1.6979431188106537, "epoch": 0.9167140420693576, "grad_norm": 1.0034390687942505, "learning_rate": 6.572818258664035e-05, "loss": 1.321, "mean_token_accuracy": 0.7222663134336471, "num_tokens": 476048351.0, "step": 19350 }, { "entropy": 1.7359539401531219, "epoch": 0.9190828122039038, "grad_norm": 1.0725759267807007, "learning_rate": 6.55318260315914e-05, "loss": 1.3228, "mean_token_accuracy": 0.7220259785652161, "num_tokens": 477260246.0, "step": 19400 }, { "entropy": 1.6657227408885955, "epoch": 0.9214515823384499, "grad_norm": 0.9826326370239258, "learning_rate": 6.533520399666033e-05, "loss": 1.2904, "mean_token_accuracy": 0.7296865725517273, "num_tokens": 478504094.0, "step": 19450 }, { "entropy": 1.7167856967449189, "epoch": 0.923820352472996, "grad_norm": 0.9942904710769653, "learning_rate": 6.513831984263641e-05, "loss": 1.2708, "mean_token_accuracy": 0.7317487215995788, "num_tokens": 479728318.0, "step": 19500 }, { "entropy": 1.7254127764701843, "epoch": 0.9261891226075422, "grad_norm": 1.4505666494369507, "learning_rate": 6.494117693478926e-05, "loss": 1.2893, "mean_token_accuracy": 0.7286518901586533, "num_tokens": 480937077.0, "step": 19550 }, { "entropy": 1.7521008849143982, "epoch": 0.9285578927420883, "grad_norm": 1.066002607345581, "learning_rate": 6.474377864281127e-05, "loss": 1.3244, "mean_token_accuracy": 0.7240516602993011, "num_tokens": 482172564.0, "step": 19600 }, { "entropy": 1.7225322866439818, "epoch": 0.9309266628766345, "grad_norm": 1.1396028995513916, "learning_rate": 6.454612834076e-05, "loss": 1.3052, "mean_token_accuracy": 0.7258065021038056, "num_tokens": 483406518.0, "step": 19650 }, { "entropy": 1.7187459325790406, "epoch": 0.9332954330111806, "grad_norm": 0.8960033655166626, "learning_rate": 6.434822940700057e-05, "loss": 1.297, "mean_token_accuracy": 0.7268172729015351, "num_tokens": 484643697.0, "step": 19700 }, { "entropy": 1.7082497942447663, "epoch": 0.9356642031457267, "grad_norm": 1.1821448802947998, "learning_rate": 6.415008522414782e-05, "loss": 1.292, "mean_token_accuracy": 0.7285556894540787, "num_tokens": 485855707.0, "step": 19750 }, { "entropy": 1.7341230428218841, "epoch": 0.9380329732802729, "grad_norm": 1.0824941396713257, "learning_rate": 6.395169917900858e-05, "loss": 1.3135, "mean_token_accuracy": 0.723016293644905, "num_tokens": 487075872.0, "step": 19800 }, { "entropy": 1.712151471376419, "epoch": 0.9404017434148191, "grad_norm": 1.4998127222061157, "learning_rate": 6.375307466252372e-05, "loss": 1.3492, "mean_token_accuracy": 0.7187636381387711, "num_tokens": 488272477.0, "step": 19850 }, { "entropy": 1.683067034482956, "epoch": 0.9427705135493651, "grad_norm": 0.9722391963005066, "learning_rate": 6.355421506971025e-05, "loss": 1.2899, "mean_token_accuracy": 0.728040627837181, "num_tokens": 489486559.0, "step": 19900 }, { "entropy": 1.6999699199199676, "epoch": 0.9451392836839113, "grad_norm": 1.150619626045227, "learning_rate": 6.335512379960322e-05, "loss": 1.2776, "mean_token_accuracy": 0.7324709689617157, "num_tokens": 490706066.0, "step": 19950 }, { "entropy": 1.7298948228359223, "epoch": 0.9475080538184575, "grad_norm": 1.2315185070037842, "learning_rate": 6.315580425519766e-05, "loss": 1.3312, "mean_token_accuracy": 0.7208713871240616, "num_tokens": 491918291.0, "step": 20000 }, { "epoch": 0.9475080538184575, "eval_entropy": 1.15051956327787, "eval_loss": NaN, "eval_mean_token_accuracy": 0.7579382187335588, "eval_num_tokens": 491918291.0, "eval_runtime": 726.7621, "eval_samples_per_second": 34.145, "eval_steps_per_second": 4.268, "step": 20000 }, { "entropy": 1.7038512194156648, "epoch": 0.9498768239530035, "grad_norm": 1.163555383682251, "learning_rate": 6.295625984339043e-05, "loss": 1.3204, "mean_token_accuracy": 0.7242659759521485, "num_tokens": 493116821.0, "step": 20050 }, { "entropy": 1.741351603269577, "epoch": 0.9522455940875497, "grad_norm": 1.1677212715148926, "learning_rate": 6.275649397492195e-05, "loss": 1.3061, "mean_token_accuracy": 0.7256824851036072, "num_tokens": 494289241.0, "step": 20100 }, { "entropy": 1.718240325450897, "epoch": 0.9546143642220959, "grad_norm": 0.9395654797554016, "learning_rate": 6.255651006431793e-05, "loss": 1.2979, "mean_token_accuracy": 0.7259613001346588, "num_tokens": 495524606.0, "step": 20150 }, { "entropy": 1.6947869229316712, "epoch": 0.9569831343566421, "grad_norm": 0.9097754955291748, "learning_rate": 6.235631152983098e-05, "loss": 1.3067, "mean_token_accuracy": 0.7251908606290818, "num_tokens": 496753832.0, "step": 20200 }, { "entropy": 1.6904695987701417, "epoch": 0.9593519044911881, "grad_norm": 1.0224709510803223, "learning_rate": 6.215590179338221e-05, "loss": 1.2916, "mean_token_accuracy": 0.7274919444322586, "num_tokens": 497977841.0, "step": 20250 }, { "entropy": 1.7162616491317748, "epoch": 0.9617206746257343, "grad_norm": 1.1449992656707764, "learning_rate": 6.195528428050273e-05, "loss": 1.3412, "mean_token_accuracy": 0.7184383940696716, "num_tokens": 499188236.0, "step": 20300 }, { "entropy": 1.6742710149288178, "epoch": 0.9640894447602805, "grad_norm": 1.100428581237793, "learning_rate": 6.17544624202751e-05, "loss": 1.2547, "mean_token_accuracy": 0.7356196337938309, "num_tokens": 500446815.0, "step": 20350 }, { "entropy": 1.6939631617069244, "epoch": 0.9664582148948266, "grad_norm": 0.9926633238792419, "learning_rate": 6.15534396452747e-05, "loss": 1.3121, "mean_token_accuracy": 0.7248954975605011, "num_tokens": 501679112.0, "step": 20400 }, { "entropy": 1.739109193086624, "epoch": 0.9688269850293727, "grad_norm": 1.270719051361084, "learning_rate": 6.135221939151108e-05, "loss": 1.3404, "mean_token_accuracy": 0.7209612077474594, "num_tokens": 502912575.0, "step": 20450 }, { "entropy": 1.7274162566661835, "epoch": 0.9711957551639189, "grad_norm": 1.2614290714263916, "learning_rate": 6.115080509836923e-05, "loss": 1.334, "mean_token_accuracy": 0.7216370838880539, "num_tokens": 504141410.0, "step": 20500 }, { "entropy": 1.708759593963623, "epoch": 0.973564525298465, "grad_norm": 1.2522040605545044, "learning_rate": 6.09492002085508e-05, "loss": 1.3175, "mean_token_accuracy": 0.7249479728937149, "num_tokens": 505345687.0, "step": 20550 }, { "entropy": 1.6911339461803436, "epoch": 0.9759332954330112, "grad_norm": 1.0709445476531982, "learning_rate": 6.074740816801516e-05, "loss": 1.2945, "mean_token_accuracy": 0.7283177155256272, "num_tokens": 506583420.0, "step": 20600 }, { "entropy": 1.7511263823509216, "epoch": 0.9783020655675573, "grad_norm": 1.1028821468353271, "learning_rate": 6.054543242592071e-05, "loss": 1.3661, "mean_token_accuracy": 0.7142648506164551, "num_tokens": 507769373.0, "step": 20650 }, { "entropy": 1.7048313403129578, "epoch": 0.9806708357021034, "grad_norm": 1.2044216394424438, "learning_rate": 6.034327643456569e-05, "loss": 1.2878, "mean_token_accuracy": 0.7300767368078231, "num_tokens": 508986124.0, "step": 20700 }, { "entropy": 1.732351886034012, "epoch": 0.9830396058366496, "grad_norm": 1.118547797203064, "learning_rate": 6.014094364932931e-05, "loss": 1.3298, "mean_token_accuracy": 0.7219525814056397, "num_tokens": 510216131.0, "step": 20750 }, { "entropy": 1.7391897797584535, "epoch": 0.9854083759711958, "grad_norm": 1.134662389755249, "learning_rate": 5.993843752861266e-05, "loss": 1.349, "mean_token_accuracy": 0.7185146582126617, "num_tokens": 511452480.0, "step": 20800 }, { "entropy": 1.7354848337173463, "epoch": 0.9877771461057419, "grad_norm": 1.2413026094436646, "learning_rate": 5.9735761533779575e-05, "loss": 1.3117, "mean_token_accuracy": 0.7226764589548111, "num_tokens": 512677249.0, "step": 20850 }, { "entropy": 1.7198446631431579, "epoch": 0.990145916240288, "grad_norm": 1.094545602798462, "learning_rate": 5.953291912909751e-05, "loss": 1.271, "mean_token_accuracy": 0.7310113716125488, "num_tokens": 513916521.0, "step": 20900 }, { "entropy": 1.7085091185569763, "epoch": 0.9925146863748342, "grad_norm": 1.1231886148452759, "learning_rate": 5.932991378167827e-05, "loss": 1.2842, "mean_token_accuracy": 0.7295077663660049, "num_tokens": 515136725.0, "step": 20950 }, { "entropy": 1.6870656645298003, "epoch": 0.9948834565093804, "grad_norm": 1.1282687187194824, "learning_rate": 5.912674896141883e-05, "loss": 1.3022, "mean_token_accuracy": 0.7291330778598786, "num_tokens": 516364497.0, "step": 21000 }, { "epoch": 0.9948834565093804, "eval_entropy": 1.1225133023863065, "eval_loss": NaN, "eval_mean_token_accuracy": 0.7580950103464779, "eval_num_tokens": 516364497.0, "eval_runtime": 728.4546, "eval_samples_per_second": 34.065, "eval_steps_per_second": 4.258, "step": 21000 }, { "entropy": 1.7055912351608276, "epoch": 0.9972522266439264, "grad_norm": 1.0038437843322754, "learning_rate": 5.892342814094193e-05, "loss": 1.3364, "mean_token_accuracy": 0.7213913726806641, "num_tokens": 517597413.0, "step": 21050 }, { "entropy": 1.6995044994354247, "epoch": 0.9996209967784726, "grad_norm": 1.0054970979690552, "learning_rate": 5.871995479553676e-05, "loss": 1.3426, "mean_token_accuracy": 0.7205459761619568, "num_tokens": 518808510.0, "step": 21100 }, { "entropy": 1.6123450350761415, "epoch": 1.0019897669130187, "grad_norm": 1.1733577251434326, "learning_rate": 5.851633240309963e-05, "loss": 1.2043, "mean_token_accuracy": 0.7436364030838013, "num_tokens": 520050715.0, "step": 21150 }, { "entropy": 1.6008358299732208, "epoch": 1.004358537047565, "grad_norm": 1.2594228982925415, "learning_rate": 5.8312564444074366e-05, "loss": 1.1962, "mean_token_accuracy": 0.7485791981220246, "num_tokens": 521272519.0, "step": 21200 }, { "entropy": 1.5864481520652771, "epoch": 1.006727307182111, "grad_norm": 1.1308941841125488, "learning_rate": 5.810865440139299e-05, "loss": 1.2014, "mean_token_accuracy": 0.7478245437145233, "num_tokens": 522505953.0, "step": 21250 }, { "entropy": 1.6050166165828705, "epoch": 1.0090960773166573, "grad_norm": 1.0653034448623657, "learning_rate": 5.790460576041608e-05, "loss": 1.2219, "mean_token_accuracy": 0.7426986521482468, "num_tokens": 523767803.0, "step": 21300 }, { "entropy": 1.598244547843933, "epoch": 1.0114648474512034, "grad_norm": 1.1732730865478516, "learning_rate": 5.77004220088732e-05, "loss": 1.2003, "mean_token_accuracy": 0.7464343649148941, "num_tokens": 525030135.0, "step": 21350 }, { "entropy": 1.5887214350700378, "epoch": 1.0138336175857494, "grad_norm": 1.0639592409133911, "learning_rate": 5.749610663680334e-05, "loss": 1.1959, "mean_token_accuracy": 0.7482451206445694, "num_tokens": 526245960.0, "step": 21400 }, { "entropy": 1.5872322118282318, "epoch": 1.0162023877202957, "grad_norm": 1.4078749418258667, "learning_rate": 5.729166313649523e-05, "loss": 1.1928, "mean_token_accuracy": 0.747797891497612, "num_tokens": 527461999.0, "step": 21450 }, { "entropy": 1.5684971618652344, "epoch": 1.0185711578548418, "grad_norm": 1.0040647983551025, "learning_rate": 5.7087095002427614e-05, "loss": 1.1636, "mean_token_accuracy": 0.7527302461862564, "num_tokens": 528693347.0, "step": 21500 }, { "entropy": 1.5993961930274962, "epoch": 1.0209399279893878, "grad_norm": 1.519827127456665, "learning_rate": 5.688240573120962e-05, "loss": 1.1996, "mean_token_accuracy": 0.7477444261312485, "num_tokens": 529916988.0, "step": 21550 }, { "entropy": 1.6100480878353118, "epoch": 1.023308698123934, "grad_norm": 1.298337459564209, "learning_rate": 5.6677598821520886e-05, "loss": 1.1941, "mean_token_accuracy": 0.746188434958458, "num_tokens": 531136613.0, "step": 21600 }, { "entropy": 1.608763552904129, "epoch": 1.0256774682584802, "grad_norm": 1.2754813432693481, "learning_rate": 5.647267777405177e-05, "loss": 1.1801, "mean_token_accuracy": 0.7486988466978073, "num_tokens": 532395495.0, "step": 21650 }, { "entropy": 1.5974150121212005, "epoch": 1.0280462383930264, "grad_norm": 1.306957721710205, "learning_rate": 5.626764609144364e-05, "loss": 1.229, "mean_token_accuracy": 0.7420145213603974, "num_tokens": 533626086.0, "step": 21700 }, { "entropy": 1.5830400812625884, "epoch": 1.0304150085275725, "grad_norm": 1.2734113931655884, "learning_rate": 5.606250727822883e-05, "loss": 1.2002, "mean_token_accuracy": 0.7472029691934585, "num_tokens": 534872278.0, "step": 21750 }, { "entropy": 1.5808272886276244, "epoch": 1.0327837786621186, "grad_norm": 0.9578977227210999, "learning_rate": 5.585726484077085e-05, "loss": 1.2118, "mean_token_accuracy": 0.745669018626213, "num_tokens": 536104060.0, "step": 21800 }, { "entropy": 1.546217747926712, "epoch": 1.0351525487966649, "grad_norm": 1.1195182800292969, "learning_rate": 5.565192228720439e-05, "loss": 1.1738, "mean_token_accuracy": 0.7508551919460297, "num_tokens": 537338574.0, "step": 21850 }, { "entropy": 1.5846721458435058, "epoch": 1.037521318931211, "grad_norm": 1.2577298879623413, "learning_rate": 5.544648312737547e-05, "loss": 1.1778, "mean_token_accuracy": 0.7510464614629746, "num_tokens": 538557980.0, "step": 21900 }, { "entropy": 1.5822556126117706, "epoch": 1.039890089065757, "grad_norm": 1.1481040716171265, "learning_rate": 5.524095087278126e-05, "loss": 1.1848, "mean_token_accuracy": 0.7497791868448257, "num_tokens": 539784677.0, "step": 21950 }, { "entropy": 1.627133835554123, "epoch": 1.0422588592003033, "grad_norm": 1.1411134004592896, "learning_rate": 5.503532903651023e-05, "loss": 1.2608, "mean_token_accuracy": 0.7371292334794998, "num_tokens": 541002283.0, "step": 22000 }, { "epoch": 1.0422588592003033, "eval_entropy": 1.0797893660903208, "eval_loss": NaN, "eval_mean_token_accuracy": 0.7580676496720636, "eval_num_tokens": 541002283.0, "eval_runtime": 728.5593, "eval_samples_per_second": 34.06, "eval_steps_per_second": 4.258, "step": 22000 }, { "entropy": 1.5906290924549102, "epoch": 1.0446276293348493, "grad_norm": 1.164602518081665, "learning_rate": 5.482962113318203e-05, "loss": 1.2085, "mean_token_accuracy": 0.744525915980339, "num_tokens": 542190090.0, "step": 22050 }, { "entropy": 1.5971219801902772, "epoch": 1.0469963994693954, "grad_norm": 1.2099922895431519, "learning_rate": 5.462383067888741e-05, "loss": 1.2251, "mean_token_accuracy": 0.7439098012447357, "num_tokens": 543425520.0, "step": 22100 }, { "entropy": 1.5796366775035857, "epoch": 1.0493651696039417, "grad_norm": 1.0977132320404053, "learning_rate": 5.441796119112814e-05, "loss": 1.1964, "mean_token_accuracy": 0.7492218172550201, "num_tokens": 544655034.0, "step": 22150 }, { "entropy": 1.6276609122753143, "epoch": 1.0517339397384877, "grad_norm": 1.2101308107376099, "learning_rate": 5.421201618875689e-05, "loss": 1.2242, "mean_token_accuracy": 0.7425367647409439, "num_tokens": 545867278.0, "step": 22200 }, { "entropy": 1.6007154369354248, "epoch": 1.054102709873034, "grad_norm": 1.2157655954360962, "learning_rate": 5.4005999191917034e-05, "loss": 1.2258, "mean_token_accuracy": 0.7422733837366104, "num_tokens": 547117555.0, "step": 22250 }, { "entropy": 1.6148575782775878, "epoch": 1.05647148000758, "grad_norm": 1.2759310007095337, "learning_rate": 5.379991372198259e-05, "loss": 1.187, "mean_token_accuracy": 0.7483934825658798, "num_tokens": 548337677.0, "step": 22300 }, { "entropy": 1.6164679837226867, "epoch": 1.0588402501421261, "grad_norm": 1.041208267211914, "learning_rate": 5.359376330149789e-05, "loss": 1.2082, "mean_token_accuracy": 0.7465775471925735, "num_tokens": 549541289.0, "step": 22350 }, { "entropy": 1.5865103662014008, "epoch": 1.0612090202766724, "grad_norm": 1.0899627208709717, "learning_rate": 5.338755145411749e-05, "loss": 1.1928, "mean_token_accuracy": 0.747542524933815, "num_tokens": 550805086.0, "step": 22400 }, { "entropy": 1.6212577140331268, "epoch": 1.0635777904112185, "grad_norm": 1.08705472946167, "learning_rate": 5.318128170454589e-05, "loss": 1.1852, "mean_token_accuracy": 0.7487930029630661, "num_tokens": 552036289.0, "step": 22450 }, { "entropy": 1.620989305973053, "epoch": 1.0659465605457648, "grad_norm": 1.2424880266189575, "learning_rate": 5.297495757847727e-05, "loss": 1.1865, "mean_token_accuracy": 0.750239091515541, "num_tokens": 553267770.0, "step": 22500 }, { "entropy": 1.5796532726287842, "epoch": 1.0683153306803108, "grad_norm": 1.4131051301956177, "learning_rate": 5.2768582602535246e-05, "loss": 1.177, "mean_token_accuracy": 0.7511077529191971, "num_tokens": 554500484.0, "step": 22550 }, { "entropy": 1.6192417418956757, "epoch": 1.0706841008148569, "grad_norm": 1.2125214338302612, "learning_rate": 5.25621603042126e-05, "loss": 1.2202, "mean_token_accuracy": 0.7429804271459579, "num_tokens": 555724501.0, "step": 22600 }, { "entropy": 1.6070238423347474, "epoch": 1.0730528709494032, "grad_norm": 1.1681252717971802, "learning_rate": 5.235569421181103e-05, "loss": 1.1896, "mean_token_accuracy": 0.748240845799446, "num_tokens": 556951175.0, "step": 22650 }, { "entropy": 1.6153511393070221, "epoch": 1.0754216410839492, "grad_norm": 1.2313953638076782, "learning_rate": 5.21491878543807e-05, "loss": 1.1826, "mean_token_accuracy": 0.7484846365451813, "num_tokens": 558174509.0, "step": 22700 }, { "entropy": 1.6176446998119354, "epoch": 1.0777904112184953, "grad_norm": 1.202515959739685, "learning_rate": 5.194264476166006e-05, "loss": 1.2147, "mean_token_accuracy": 0.7431507217884064, "num_tokens": 559404128.0, "step": 22750 }, { "entropy": 1.6383704769611358, "epoch": 1.0801591813530416, "grad_norm": 1.1402473449707031, "learning_rate": 5.1736068464015463e-05, "loss": 1.2216, "mean_token_accuracy": 0.7412754154205322, "num_tokens": 560601861.0, "step": 22800 }, { "entropy": 1.6056468284130097, "epoch": 1.0825279514875876, "grad_norm": 0.9191189408302307, "learning_rate": 5.152946249238082e-05, "loss": 1.1687, "mean_token_accuracy": 0.751121336221695, "num_tokens": 561852684.0, "step": 22850 }, { "entropy": 1.5993254363536835, "epoch": 1.0848967216221337, "grad_norm": 1.460180640220642, "learning_rate": 5.132283037819723e-05, "loss": 1.2194, "mean_token_accuracy": 0.7445776867866516, "num_tokens": 563087919.0, "step": 22900 }, { "entropy": 1.6140161871910095, "epoch": 1.08726549175668, "grad_norm": 1.2920235395431519, "learning_rate": 5.111617565335264e-05, "loss": 1.2139, "mean_token_accuracy": 0.7439986896514893, "num_tokens": 564333108.0, "step": 22950 }, { "entropy": 1.599245457649231, "epoch": 1.089634261891226, "grad_norm": 1.0727440118789673, "learning_rate": 5.090950185012152e-05, "loss": 1.1895, "mean_token_accuracy": 0.7461957842111587, "num_tokens": 565584977.0, "step": 23000 }, { "epoch": 1.089634261891226, "eval_entropy": 1.0720901108034038, "eval_loss": NaN, "eval_mean_token_accuracy": 0.7579072969698429, "eval_num_tokens": 565584977.0, "eval_runtime": 729.1236, "eval_samples_per_second": 34.034, "eval_steps_per_second": 4.254, "step": 23000 }, { "entropy": 1.5884887778759003, "epoch": 1.0920030320257723, "grad_norm": 1.2358678579330444, "learning_rate": 5.070281250110437e-05, "loss": 1.2144, "mean_token_accuracy": 0.7440959370136261, "num_tokens": 566830458.0, "step": 23050 }, { "entropy": 1.6139474022388458, "epoch": 1.0943718021603184, "grad_norm": 1.1043397188186646, "learning_rate": 5.049611113916745e-05, "loss": 1.2277, "mean_token_accuracy": 0.7419761300086976, "num_tokens": 568033795.0, "step": 23100 }, { "entropy": 1.5839227229356765, "epoch": 1.0967405722948644, "grad_norm": 1.267622470855713, "learning_rate": 5.028940129738234e-05, "loss": 1.1841, "mean_token_accuracy": 0.7496823114156723, "num_tokens": 569296146.0, "step": 23150 }, { "entropy": 1.5836478543281556, "epoch": 1.0991093424294107, "grad_norm": 1.3109345436096191, "learning_rate": 5.0082686508965594e-05, "loss": 1.2001, "mean_token_accuracy": 0.7444102185964584, "num_tokens": 570538060.0, "step": 23200 }, { "entropy": 1.6264153301715851, "epoch": 1.1014781125639568, "grad_norm": 1.166844129562378, "learning_rate": 4.987597030721826e-05, "loss": 1.2166, "mean_token_accuracy": 0.7437608361244201, "num_tokens": 571764299.0, "step": 23250 }, { "entropy": 1.593840502500534, "epoch": 1.1038468826985028, "grad_norm": 1.313543677330017, "learning_rate": 4.966925622546559e-05, "loss": 1.1976, "mean_token_accuracy": 0.7477673798799515, "num_tokens": 572986252.0, "step": 23300 }, { "entropy": 1.6218383753299712, "epoch": 1.1062156528330491, "grad_norm": 1.3079559803009033, "learning_rate": 4.9462547796996554e-05, "loss": 1.2085, "mean_token_accuracy": 0.7460601913928986, "num_tokens": 574203486.0, "step": 23350 }, { "entropy": 1.6025708365440368, "epoch": 1.1085844229675952, "grad_norm": 1.045305848121643, "learning_rate": 4.925584855500357e-05, "loss": 1.1834, "mean_token_accuracy": 0.7496994876861572, "num_tokens": 575431997.0, "step": 23400 }, { "entropy": 1.6023164546489717, "epoch": 1.1109531931021415, "grad_norm": 1.6260581016540527, "learning_rate": 4.904916203252196e-05, "loss": 1.1972, "mean_token_accuracy": 0.7476231580972672, "num_tokens": 576655765.0, "step": 23450 }, { "entropy": 1.6074532234668732, "epoch": 1.1133219632366875, "grad_norm": 1.1137516498565674, "learning_rate": 4.884249176236966e-05, "loss": 1.2031, "mean_token_accuracy": 0.7456535613536834, "num_tokens": 577896889.0, "step": 23500 }, { "entropy": 1.6064541089534758, "epoch": 1.1156907333712336, "grad_norm": 1.0753376483917236, "learning_rate": 4.8635841277086823e-05, "loss": 1.2093, "mean_token_accuracy": 0.7460182595252991, "num_tokens": 579123368.0, "step": 23550 }, { "entropy": 1.6451520609855652, "epoch": 1.1180595035057799, "grad_norm": 1.2830525636672974, "learning_rate": 4.842921410887541e-05, "loss": 1.2173, "mean_token_accuracy": 0.7460962778329849, "num_tokens": 580343576.0, "step": 23600 }, { "entropy": 1.594506859779358, "epoch": 1.120428273640326, "grad_norm": 1.2104618549346924, "learning_rate": 4.822261378953884e-05, "loss": 1.1846, "mean_token_accuracy": 0.7500998550653457, "num_tokens": 581571230.0, "step": 23650 }, { "entropy": 1.5977623069286346, "epoch": 1.122797043774872, "grad_norm": 1.0635625123977661, "learning_rate": 4.8016043850421614e-05, "loss": 1.2121, "mean_token_accuracy": 0.7432440650463105, "num_tokens": 582786589.0, "step": 23700 }, { "entropy": 1.6117420196533203, "epoch": 1.1251658139094183, "grad_norm": 1.2150670289993286, "learning_rate": 4.7809507822348967e-05, "loss": 1.1995, "mean_token_accuracy": 0.746940575838089, "num_tokens": 583979707.0, "step": 23750 }, { "entropy": 1.6140946924686432, "epoch": 1.1275345840439643, "grad_norm": 1.1496037244796753, "learning_rate": 4.7603009235566465e-05, "loss": 1.1965, "mean_token_accuracy": 0.7485955774784088, "num_tokens": 585198089.0, "step": 23800 }, { "entropy": 1.612507269382477, "epoch": 1.1299033541785106, "grad_norm": 1.1946005821228027, "learning_rate": 4.7396551619679735e-05, "loss": 1.1963, "mean_token_accuracy": 0.7465278053283692, "num_tokens": 586406915.0, "step": 23850 }, { "entropy": 1.614688711166382, "epoch": 1.1322721243130567, "grad_norm": 1.3998786211013794, "learning_rate": 4.719013850359412e-05, "loss": 1.202, "mean_token_accuracy": 0.7469007116556168, "num_tokens": 587625422.0, "step": 23900 }, { "entropy": 1.64091064453125, "epoch": 1.1346408944476027, "grad_norm": 1.1203569173812866, "learning_rate": 4.69837734154543e-05, "loss": 1.1882, "mean_token_accuracy": 0.7487949818372727, "num_tokens": 588838858.0, "step": 23950 }, { "entropy": 1.6168962919712067, "epoch": 1.137009664582149, "grad_norm": 1.4909604787826538, "learning_rate": 4.677745988258406e-05, "loss": 1.1948, "mean_token_accuracy": 0.7495903551578522, "num_tokens": 590081510.0, "step": 24000 }, { "epoch": 1.137009664582149, "eval_entropy": 1.0897794357236166, "eval_loss": NaN, "eval_mean_token_accuracy": 0.7586307351547699, "eval_num_tokens": 590081510.0, "eval_runtime": 728.9602, "eval_samples_per_second": 34.042, "eval_steps_per_second": 4.255, "step": 24000 }, { "entropy": 1.6240591382980347, "epoch": 1.139378434716695, "grad_norm": 0.9998461604118347, "learning_rate": 4.657120143142597e-05, "loss": 1.1922, "mean_token_accuracy": 0.7492350846529007, "num_tokens": 591297702.0, "step": 24050 }, { "entropy": 1.655022679567337, "epoch": 1.1417472048512411, "grad_norm": 1.124312162399292, "learning_rate": 4.636500158748109e-05, "loss": 1.2067, "mean_token_accuracy": 0.7455829763412476, "num_tokens": 592508714.0, "step": 24100 }, { "entropy": 1.5910465133190155, "epoch": 1.1441159749857874, "grad_norm": 1.11435866355896, "learning_rate": 4.6158863875248734e-05, "loss": 1.1684, "mean_token_accuracy": 0.7527882248163223, "num_tokens": 593747081.0, "step": 24150 }, { "entropy": 1.6349923205375672, "epoch": 1.1464847451203335, "grad_norm": 1.070635437965393, "learning_rate": 4.595279181816624e-05, "loss": 1.1916, "mean_token_accuracy": 0.749586900472641, "num_tokens": 594943747.0, "step": 24200 }, { "entropy": 1.601109493970871, "epoch": 1.1488535152548796, "grad_norm": 1.1581242084503174, "learning_rate": 4.574678893854871e-05, "loss": 1.1818, "mean_token_accuracy": 0.7509790074825287, "num_tokens": 596175131.0, "step": 24250 }, { "entropy": 1.6024657559394837, "epoch": 1.1512222853894258, "grad_norm": 1.2476531267166138, "learning_rate": 4.554085875752879e-05, "loss": 1.1997, "mean_token_accuracy": 0.747204402089119, "num_tokens": 597415232.0, "step": 24300 }, { "entropy": 1.636058064699173, "epoch": 1.153591055523972, "grad_norm": 1.6576809883117676, "learning_rate": 4.533500479499661e-05, "loss": 1.248, "mean_token_accuracy": 0.7378575146198273, "num_tokens": 598627396.0, "step": 24350 }, { "entropy": 1.6401480340957642, "epoch": 1.1559598256585182, "grad_norm": 1.2274305820465088, "learning_rate": 4.512923056953941e-05, "loss": 1.2219, "mean_token_accuracy": 0.7444866347312927, "num_tokens": 599864565.0, "step": 24400 }, { "entropy": 1.5943049252033235, "epoch": 1.1583285957930642, "grad_norm": 1.2362310886383057, "learning_rate": 4.49235395983816e-05, "loss": 1.1675, "mean_token_accuracy": 0.7534567403793335, "num_tokens": 601090785.0, "step": 24450 }, { "entropy": 1.5798736822605133, "epoch": 1.1606973659276103, "grad_norm": 0.9551867842674255, "learning_rate": 4.4717935397324504e-05, "loss": 1.1633, "mean_token_accuracy": 0.7534276330471039, "num_tokens": 602347409.0, "step": 24500 }, { "entropy": 1.612923823595047, "epoch": 1.1630661360621566, "grad_norm": 1.167639970779419, "learning_rate": 4.4512421480686334e-05, "loss": 1.1752, "mean_token_accuracy": 0.7525352644920349, "num_tokens": 603557548.0, "step": 24550 }, { "entropy": 1.5877990233898163, "epoch": 1.1654349061967026, "grad_norm": 1.1369372606277466, "learning_rate": 4.430700136124209e-05, "loss": 1.1781, "mean_token_accuracy": 0.7510064965486527, "num_tokens": 604816361.0, "step": 24600 }, { "entropy": 1.5707013046741485, "epoch": 1.167803676331249, "grad_norm": 1.3217617273330688, "learning_rate": 4.410167855016356e-05, "loss": 1.1578, "mean_token_accuracy": 0.7544564688205719, "num_tokens": 606031595.0, "step": 24650 }, { "entropy": 1.6142506301403046, "epoch": 1.170172446465795, "grad_norm": 1.0833561420440674, "learning_rate": 4.3896456556959245e-05, "loss": 1.1882, "mean_token_accuracy": 0.7481600660085678, "num_tokens": 607260243.0, "step": 24700 }, { "entropy": 1.5962833178043365, "epoch": 1.172541216600341, "grad_norm": 1.2788193225860596, "learning_rate": 4.369133888941442e-05, "loss": 1.1685, "mean_token_accuracy": 0.7528700757026673, "num_tokens": 608463931.0, "step": 24750 }, { "entropy": 1.6402158641815185, "epoch": 1.1749099867348873, "grad_norm": 1.275524616241455, "learning_rate": 4.348632905353116e-05, "loss": 1.1968, "mean_token_accuracy": 0.7491856187582016, "num_tokens": 609655302.0, "step": 24800 }, { "entropy": 1.614987963438034, "epoch": 1.1772787568694334, "grad_norm": 0.945978581905365, "learning_rate": 4.32814305534684e-05, "loss": 1.209, "mean_token_accuracy": 0.7443074882030487, "num_tokens": 610898851.0, "step": 24850 }, { "entropy": 1.636513990163803, "epoch": 1.1796475270039795, "grad_norm": 1.0590687990188599, "learning_rate": 4.307664689148205e-05, "loss": 1.2299, "mean_token_accuracy": 0.7430419319868088, "num_tokens": 612116242.0, "step": 24900 }, { "entropy": 1.6237515592575074, "epoch": 1.1820162971385257, "grad_norm": 1.3774100542068481, "learning_rate": 4.287198156786516e-05, "loss": 1.1786, "mean_token_accuracy": 0.7511296081542969, "num_tokens": 613335390.0, "step": 24950 }, { "entropy": 1.596169695854187, "epoch": 1.1843850672730718, "grad_norm": 1.1338964700698853, "learning_rate": 4.2667438080888036e-05, "loss": 1.1616, "mean_token_accuracy": 0.7531165385246277, "num_tokens": 614572046.0, "step": 25000 }, { "epoch": 1.1843850672730718, "eval_entropy": 1.0971692777848259, "eval_loss": NaN, "eval_mean_token_accuracy": 0.7585837088482061, "eval_num_tokens": 614572046.0, "eval_runtime": 728.9341, "eval_samples_per_second": 34.043, "eval_steps_per_second": 4.256, "step": 25000 }, { "entropy": 1.6357793831825256, "epoch": 1.1867538374076179, "grad_norm": 1.3235797882080078, "learning_rate": 4.24630199267385e-05, "loss": 1.2175, "mean_token_accuracy": 0.7434951883554458, "num_tokens": 615813138.0, "step": 25050 }, { "entropy": 1.610986716747284, "epoch": 1.1891226075421641, "grad_norm": 0.9810039401054382, "learning_rate": 4.225873059946206e-05, "loss": 1.183, "mean_token_accuracy": 0.7497907614707947, "num_tokens": 617052724.0, "step": 25100 }, { "entropy": 1.5992292380332946, "epoch": 1.1914913776767102, "grad_norm": 1.3308528661727905, "learning_rate": 4.2054573590902295e-05, "loss": 1.1455, "mean_token_accuracy": 0.7566713351011276, "num_tokens": 618273598.0, "step": 25150 }, { "entropy": 1.6277151501178742, "epoch": 1.1938601478112565, "grad_norm": 1.3727302551269531, "learning_rate": 4.1850552390641076e-05, "loss": 1.2243, "mean_token_accuracy": 0.7431273967027664, "num_tokens": 619483045.0, "step": 25200 }, { "entropy": 1.622536163330078, "epoch": 1.1962289179458026, "grad_norm": 1.319136142730713, "learning_rate": 4.164667048593892e-05, "loss": 1.1947, "mean_token_accuracy": 0.7481009513139725, "num_tokens": 620705194.0, "step": 25250 }, { "entropy": 1.6237769031524658, "epoch": 1.1985976880803486, "grad_norm": 1.2181921005249023, "learning_rate": 4.144293136167549e-05, "loss": 1.1737, "mean_token_accuracy": 0.7511861574649811, "num_tokens": 621924964.0, "step": 25300 }, { "entropy": 1.6047194039821624, "epoch": 1.200966458214895, "grad_norm": 1.2977948188781738, "learning_rate": 4.123933850028991e-05, "loss": 1.2143, "mean_token_accuracy": 0.7442529916763305, "num_tokens": 623143777.0, "step": 25350 }, { "entropy": 1.6068167972564698, "epoch": 1.203335228349441, "grad_norm": 1.313650369644165, "learning_rate": 4.103589538172127e-05, "loss": 1.2124, "mean_token_accuracy": 0.7447144162654876, "num_tokens": 624380253.0, "step": 25400 }, { "entropy": 1.5983986258506775, "epoch": 1.2057039984839872, "grad_norm": 1.6129273176193237, "learning_rate": 4.0832605483349193e-05, "loss": 1.1634, "mean_token_accuracy": 0.7545448428392411, "num_tokens": 625600084.0, "step": 25450 }, { "entropy": 1.6317675995826721, "epoch": 1.2080727686185333, "grad_norm": 1.2153606414794922, "learning_rate": 4.062947227993433e-05, "loss": 1.1998, "mean_token_accuracy": 0.7479275733232498, "num_tokens": 626816439.0, "step": 25500 }, { "entropy": 1.591133669614792, "epoch": 1.2104415387530794, "grad_norm": 1.3711997270584106, "learning_rate": 4.042649924355905e-05, "loss": 1.1747, "mean_token_accuracy": 0.7510749793052673, "num_tokens": 628060275.0, "step": 25550 }, { "entropy": 1.616237759590149, "epoch": 1.2128103088876254, "grad_norm": 1.317814588546753, "learning_rate": 4.022368984356801e-05, "loss": 1.1964, "mean_token_accuracy": 0.7473501098155976, "num_tokens": 629291699.0, "step": 25600 }, { "entropy": 1.618736606836319, "epoch": 1.2151790790221717, "grad_norm": 1.1058121919631958, "learning_rate": 4.002104754650887e-05, "loss": 1.2022, "mean_token_accuracy": 0.74667718231678, "num_tokens": 630538034.0, "step": 25650 }, { "entropy": 1.6311498081684113, "epoch": 1.2175478491567178, "grad_norm": 1.0992521047592163, "learning_rate": 3.981857581607313e-05, "loss": 1.1851, "mean_token_accuracy": 0.7504255121946335, "num_tokens": 631771489.0, "step": 25700 }, { "entropy": 1.6083201706409453, "epoch": 1.219916619291264, "grad_norm": 1.2340748310089111, "learning_rate": 3.9616278113036786e-05, "loss": 1.1595, "mean_token_accuracy": 0.7537871873378754, "num_tokens": 632996983.0, "step": 25750 }, { "entropy": 1.6100276720523834, "epoch": 1.22228538942581, "grad_norm": 1.2286880016326904, "learning_rate": 3.9414157895201273e-05, "loss": 1.2196, "mean_token_accuracy": 0.7460716181993484, "num_tokens": 634207237.0, "step": 25800 }, { "entropy": 1.6090706491470337, "epoch": 1.2246541595603562, "grad_norm": 1.2269102334976196, "learning_rate": 3.9212218617334356e-05, "loss": 1.19, "mean_token_accuracy": 0.7494009816646576, "num_tokens": 635435267.0, "step": 25850 }, { "entropy": 1.5722477328777313, "epoch": 1.2270229296949025, "grad_norm": 1.0874273777008057, "learning_rate": 3.901046373111103e-05, "loss": 1.1665, "mean_token_accuracy": 0.7549541050195694, "num_tokens": 636668958.0, "step": 25900 }, { "entropy": 1.6207891261577607, "epoch": 1.2293916998294485, "grad_norm": 1.4808669090270996, "learning_rate": 3.880889668505455e-05, "loss": 1.2441, "mean_token_accuracy": 0.7397470092773437, "num_tokens": 637878020.0, "step": 25950 }, { "entropy": 1.5923774099349977, "epoch": 1.2317604699639948, "grad_norm": 1.1652626991271973, "learning_rate": 3.860752092447749e-05, "loss": 1.1818, "mean_token_accuracy": 0.7510163110494613, "num_tokens": 639111025.0, "step": 26000 }, { "epoch": 1.2317604699639948, "eval_entropy": 1.0619611897482402, "eval_loss": NaN, "eval_mean_token_accuracy": 0.7597063543775787, "eval_num_tokens": 639111025.0, "eval_runtime": 728.2888, "eval_samples_per_second": 34.073, "eval_steps_per_second": 4.259, "step": 26000 }, { "entropy": 1.6090249001979828, "epoch": 1.2341292400985409, "grad_norm": 1.140541434288025, "learning_rate": 3.840633989142289e-05, "loss": 1.2208, "mean_token_accuracy": 0.7445061576366424, "num_tokens": 640330754.0, "step": 26050 }, { "entropy": 1.5675190496444702, "epoch": 1.236498010233087, "grad_norm": 1.1996732950210571, "learning_rate": 3.820535702460533e-05, "loss": 1.1648, "mean_token_accuracy": 0.7530361658334732, "num_tokens": 641577483.0, "step": 26100 }, { "entropy": 1.5749832487106323, "epoch": 1.2388667803676332, "grad_norm": 1.3973077535629272, "learning_rate": 3.800457575935222e-05, "loss": 1.172, "mean_token_accuracy": 0.7515737456083298, "num_tokens": 642810578.0, "step": 26150 }, { "entropy": 1.5630564618110656, "epoch": 1.2412355505021793, "grad_norm": 1.0359597206115723, "learning_rate": 3.780399952754507e-05, "loss": 1.1647, "mean_token_accuracy": 0.7527536135911942, "num_tokens": 644066110.0, "step": 26200 }, { "entropy": 1.592404429912567, "epoch": 1.2436043206367253, "grad_norm": 1.076479434967041, "learning_rate": 3.7603631757560855e-05, "loss": 1.1732, "mean_token_accuracy": 0.7527641028165817, "num_tokens": 645301566.0, "step": 26250 }, { "entropy": 1.6117449808120727, "epoch": 1.2459730907712716, "grad_norm": 1.218085527420044, "learning_rate": 3.7403475874213354e-05, "loss": 1.2315, "mean_token_accuracy": 0.7417248862981797, "num_tokens": 646546442.0, "step": 26300 }, { "entropy": 1.5983488774299621, "epoch": 1.2483418609058177, "grad_norm": 1.2915470600128174, "learning_rate": 3.7203535298694656e-05, "loss": 1.2024, "mean_token_accuracy": 0.7482427370548248, "num_tokens": 647787345.0, "step": 26350 }, { "entropy": 1.5954162907600402, "epoch": 1.2507106310403637, "grad_norm": 1.0252054929733276, "learning_rate": 3.700381344851665e-05, "loss": 1.1757, "mean_token_accuracy": 0.752863358259201, "num_tokens": 649019906.0, "step": 26400 }, { "entropy": 1.5740117967128753, "epoch": 1.25307940117491, "grad_norm": 1.2225929498672485, "learning_rate": 3.6804313737452686e-05, "loss": 1.1731, "mean_token_accuracy": 0.7517155534029007, "num_tokens": 650242573.0, "step": 26450 }, { "entropy": 1.5935308575630187, "epoch": 1.255448171309456, "grad_norm": 1.3027613162994385, "learning_rate": 3.66050395754791e-05, "loss": 1.1661, "mean_token_accuracy": 0.7530785751342773, "num_tokens": 651477593.0, "step": 26500 }, { "entropy": 1.5640760624408723, "epoch": 1.2578169414440024, "grad_norm": 0.9961079955101013, "learning_rate": 3.6405994368717054e-05, "loss": 1.1706, "mean_token_accuracy": 0.7543714487552643, "num_tokens": 652736186.0, "step": 26550 }, { "entropy": 1.6221139824390411, "epoch": 1.2601857115785484, "grad_norm": 1.6644654273986816, "learning_rate": 3.620718151937425e-05, "loss": 1.1881, "mean_token_accuracy": 0.7484775596857071, "num_tokens": 653945306.0, "step": 26600 }, { "entropy": 1.627434605360031, "epoch": 1.2625544817130945, "grad_norm": 1.1984070539474487, "learning_rate": 3.6008604425686766e-05, "loss": 1.2087, "mean_token_accuracy": 0.744699953198433, "num_tokens": 655163994.0, "step": 26650 }, { "entropy": 1.595815200805664, "epoch": 1.2649232518476408, "grad_norm": 0.942965030670166, "learning_rate": 3.581026648186101e-05, "loss": 1.2047, "mean_token_accuracy": 0.7466594022512436, "num_tokens": 656389078.0, "step": 26700 }, { "entropy": 1.5596436941623688, "epoch": 1.2672920219821868, "grad_norm": 1.0333982706069946, "learning_rate": 3.561217107801568e-05, "loss": 1.1366, "mean_token_accuracy": 0.7599540430307389, "num_tokens": 657628840.0, "step": 26750 }, { "entropy": 1.582226196527481, "epoch": 1.269660792116733, "grad_norm": 1.3895862102508545, "learning_rate": 3.5414321600123854e-05, "loss": 1.1594, "mean_token_accuracy": 0.7542477381229401, "num_tokens": 658863710.0, "step": 26800 }, { "entropy": 1.6000851714611053, "epoch": 1.2720295622512792, "grad_norm": 1.2000585794448853, "learning_rate": 3.521672142995506e-05, "loss": 1.1862, "mean_token_accuracy": 0.7507990497350693, "num_tokens": 660068012.0, "step": 26850 }, { "entropy": 1.6038016283512115, "epoch": 1.2743983323858252, "grad_norm": 1.0799274444580078, "learning_rate": 3.501937394501747e-05, "loss": 1.1911, "mean_token_accuracy": 0.7496211153268814, "num_tokens": 661305265.0, "step": 26900 }, { "entropy": 1.6001941812038423, "epoch": 1.2767671025203713, "grad_norm": 1.0266954898834229, "learning_rate": 3.4822282518500286e-05, "loss": 1.1319, "mean_token_accuracy": 0.7590432322025299, "num_tokens": 662525430.0, "step": 26950 }, { "entropy": 1.6020025527477264, "epoch": 1.2791358726549176, "grad_norm": 1.4219437837600708, "learning_rate": 3.4625450519215915e-05, "loss": 1.1896, "mean_token_accuracy": 0.7499201774597168, "num_tokens": 663708332.0, "step": 27000 }, { "epoch": 1.2791358726549176, "eval_entropy": 1.0667552875749993, "eval_loss": NaN, "eval_mean_token_accuracy": 0.7588769892467213, "eval_num_tokens": 663708332.0, "eval_runtime": 730.6419, "eval_samples_per_second": 33.963, "eval_steps_per_second": 4.246, "step": 27000 }, { "entropy": 1.5845714378356934, "epoch": 1.2815046427894636, "grad_norm": 1.2918035984039307, "learning_rate": 3.4428881311542485e-05, "loss": 1.2018, "mean_token_accuracy": 0.7470276898145676, "num_tokens": 664918181.0, "step": 27050 }, { "entropy": 1.567339129447937, "epoch": 1.28387341292401, "grad_norm": 1.3722400665283203, "learning_rate": 3.423257825536637e-05, "loss": 1.1469, "mean_token_accuracy": 0.7578269052505493, "num_tokens": 666127940.0, "step": 27100 }, { "entropy": 1.5779772651195527, "epoch": 1.286242183058556, "grad_norm": 1.2678896188735962, "learning_rate": 3.403654470602463e-05, "loss": 1.2057, "mean_token_accuracy": 0.7468883281946183, "num_tokens": 667363858.0, "step": 27150 }, { "entropy": 1.6206267583370209, "epoch": 1.288610953193102, "grad_norm": 1.415664553642273, "learning_rate": 3.3840784014247825e-05, "loss": 1.1709, "mean_token_accuracy": 0.7533509171009064, "num_tokens": 668586984.0, "step": 27200 }, { "entropy": 1.58282252907753, "epoch": 1.2909797233276483, "grad_norm": 1.0562044382095337, "learning_rate": 3.3645299526102625e-05, "loss": 1.1525, "mean_token_accuracy": 0.7587102675437927, "num_tokens": 669848008.0, "step": 27250 }, { "entropy": 1.6004004609584808, "epoch": 1.2933484934621944, "grad_norm": 1.3321800231933594, "learning_rate": 3.3450094582934624e-05, "loss": 1.168, "mean_token_accuracy": 0.7531924885511398, "num_tokens": 671055921.0, "step": 27300 }, { "entropy": 1.6070753967761993, "epoch": 1.2957172635967407, "grad_norm": 1.1480814218521118, "learning_rate": 3.3255172521311296e-05, "loss": 1.1957, "mean_token_accuracy": 0.7474820953607559, "num_tokens": 672291814.0, "step": 27350 }, { "entropy": 1.618386241197586, "epoch": 1.2980860337312867, "grad_norm": 1.3218634128570557, "learning_rate": 3.306053667296491e-05, "loss": 1.1813, "mean_token_accuracy": 0.749809256196022, "num_tokens": 673529686.0, "step": 27400 }, { "entropy": 1.5894237875938415, "epoch": 1.3004548038658328, "grad_norm": 1.2133702039718628, "learning_rate": 3.286619036473557e-05, "loss": 1.1527, "mean_token_accuracy": 0.7563208711147308, "num_tokens": 674737012.0, "step": 27450 }, { "entropy": 1.5680401778221131, "epoch": 1.302823574000379, "grad_norm": 1.3504135608673096, "learning_rate": 3.267213691851443e-05, "loss": 1.1453, "mean_token_accuracy": 0.7576598930358887, "num_tokens": 676016669.0, "step": 27500 }, { "entropy": 1.5564317107200623, "epoch": 1.3051923441349251, "grad_norm": 1.2370836734771729, "learning_rate": 3.2478379651186814e-05, "loss": 1.151, "mean_token_accuracy": 0.7560758543014526, "num_tokens": 677240518.0, "step": 27550 }, { "entropy": 1.5811952316761018, "epoch": 1.3075611142694714, "grad_norm": 1.161582589149475, "learning_rate": 3.228492187457557e-05, "loss": 1.1623, "mean_token_accuracy": 0.7540548771619797, "num_tokens": 678477906.0, "step": 27600 }, { "entropy": 1.616460270881653, "epoch": 1.3099298844040175, "grad_norm": 1.2357761859893799, "learning_rate": 3.209176689538448e-05, "loss": 1.203, "mean_token_accuracy": 0.7480911284685134, "num_tokens": 679717449.0, "step": 27650 }, { "entropy": 1.6137902176380157, "epoch": 1.3122986545385635, "grad_norm": 1.1097781658172607, "learning_rate": 3.189891801514171e-05, "loss": 1.1877, "mean_token_accuracy": 0.7503223437070846, "num_tokens": 680910674.0, "step": 27700 }, { "entropy": 1.6009632289409637, "epoch": 1.3146674246731096, "grad_norm": 1.260872721672058, "learning_rate": 3.1706378530143385e-05, "loss": 1.1725, "mean_token_accuracy": 0.7530629223585129, "num_tokens": 682144950.0, "step": 27750 }, { "entropy": 1.6063115882873535, "epoch": 1.3170361948076559, "grad_norm": 1.1645578145980835, "learning_rate": 3.1514151731397246e-05, "loss": 1.1647, "mean_token_accuracy": 0.753446283340454, "num_tokens": 683390865.0, "step": 27800 }, { "entropy": 1.6270235812664031, "epoch": 1.319404964942202, "grad_norm": 1.2020407915115356, "learning_rate": 3.1322240904566426e-05, "loss": 1.1735, "mean_token_accuracy": 0.7529788100719452, "num_tokens": 684605889.0, "step": 27850 }, { "entropy": 1.607979006767273, "epoch": 1.3217737350767482, "grad_norm": 1.137190580368042, "learning_rate": 3.1130649329913225e-05, "loss": 1.2056, "mean_token_accuracy": 0.7471660190820694, "num_tokens": 685842856.0, "step": 27900 }, { "entropy": 1.6045704185962677, "epoch": 1.3241425052112943, "grad_norm": 1.21959388256073, "learning_rate": 3.09393802822431e-05, "loss": 1.1506, "mean_token_accuracy": 0.7561245012283325, "num_tokens": 687059905.0, "step": 27950 }, { "entropy": 1.6008513212203979, "epoch": 1.3265112753458403, "grad_norm": 0.969918429851532, "learning_rate": 3.074843703084869e-05, "loss": 1.1717, "mean_token_accuracy": 0.7522702825069427, "num_tokens": 688293184.0, "step": 28000 }, { "epoch": 1.3265112753458403, "eval_entropy": 1.1074502345902315, "eval_loss": NaN, "eval_mean_token_accuracy": 0.7599941444139493, "eval_num_tokens": 688293184.0, "eval_runtime": 728.3714, "eval_samples_per_second": 34.069, "eval_steps_per_second": 4.259, "step": 28000 }, { "entropy": 1.613970617055893, "epoch": 1.3288800454803866, "grad_norm": 1.2240134477615356, "learning_rate": 3.0557822839453874e-05, "loss": 1.1618, "mean_token_accuracy": 0.7536975979804993, "num_tokens": 689517314.0, "step": 28050 }, { "entropy": 1.5815585339069367, "epoch": 1.3312488156149327, "grad_norm": 1.1150712966918945, "learning_rate": 3.036754096615807e-05, "loss": 1.1704, "mean_token_accuracy": 0.753908543586731, "num_tokens": 690755165.0, "step": 28100 }, { "entropy": 1.5578910648822784, "epoch": 1.333617585749479, "grad_norm": 1.2640388011932373, "learning_rate": 3.017759466338046e-05, "loss": 1.1623, "mean_token_accuracy": 0.7535911196470261, "num_tokens": 692007773.0, "step": 28150 }, { "entropy": 1.5919007360935211, "epoch": 1.335986355884025, "grad_norm": 1.1327555179595947, "learning_rate": 2.9987987177804494e-05, "loss": 1.1729, "mean_token_accuracy": 0.7511158144474029, "num_tokens": 693232662.0, "step": 28200 }, { "entropy": 1.6085839200019836, "epoch": 1.338355126018571, "grad_norm": 0.9447433352470398, "learning_rate": 2.979872175032231e-05, "loss": 1.1558, "mean_token_accuracy": 0.7552832061052323, "num_tokens": 694432399.0, "step": 28250 }, { "entropy": 1.6203950083255767, "epoch": 1.3407238961531174, "grad_norm": 1.1614621877670288, "learning_rate": 2.960980161597936e-05, "loss": 1.1892, "mean_token_accuracy": 0.7479177683591842, "num_tokens": 695664920.0, "step": 28300 }, { "entropy": 1.588459266424179, "epoch": 1.3430926662876634, "grad_norm": 1.6014941930770874, "learning_rate": 2.9421230003919155e-05, "loss": 1.1604, "mean_token_accuracy": 0.75300128698349, "num_tokens": 696895792.0, "step": 28350 }, { "entropy": 1.6118758118152618, "epoch": 1.3454614364222097, "grad_norm": 1.1947180032730103, "learning_rate": 2.923301013732799e-05, "loss": 1.1825, "mean_token_accuracy": 0.7502673131227493, "num_tokens": 698079475.0, "step": 28400 }, { "entropy": 1.6115264117717742, "epoch": 1.3478302065567558, "grad_norm": 1.2438665628433228, "learning_rate": 2.9045145233379976e-05, "loss": 1.2001, "mean_token_accuracy": 0.7489022916555405, "num_tokens": 699305883.0, "step": 28450 }, { "entropy": 1.5930208683013916, "epoch": 1.3501989766913018, "grad_norm": 1.2472587823867798, "learning_rate": 2.885763850318193e-05, "loss": 1.1455, "mean_token_accuracy": 0.7588497418165207, "num_tokens": 700517157.0, "step": 28500 }, { "entropy": 1.611283905506134, "epoch": 1.352567746825848, "grad_norm": 1.1896998882293701, "learning_rate": 2.8670493151718526e-05, "loss": 1.2069, "mean_token_accuracy": 0.7471307969093323, "num_tokens": 701725293.0, "step": 28550 }, { "entropy": 1.5695743489265441, "epoch": 1.3549365169603942, "grad_norm": 1.1057043075561523, "learning_rate": 2.8483712377797544e-05, "loss": 1.1538, "mean_token_accuracy": 0.7563241708278656, "num_tokens": 702969110.0, "step": 28600 }, { "entropy": 1.5800132751464844, "epoch": 1.3573052870949402, "grad_norm": 1.1600664854049683, "learning_rate": 2.829729937399515e-05, "loss": 1.1533, "mean_token_accuracy": 0.7582338035106659, "num_tokens": 704225571.0, "step": 28650 }, { "entropy": 1.6222402799129485, "epoch": 1.3596740572294865, "grad_norm": 0.993548572063446, "learning_rate": 2.8111257326601402e-05, "loss": 1.2294, "mean_token_accuracy": 0.742488032579422, "num_tokens": 705467457.0, "step": 28700 }, { "entropy": 1.568291175365448, "epoch": 1.3620428273640326, "grad_norm": 1.0379763841629028, "learning_rate": 2.7925589415565666e-05, "loss": 1.1593, "mean_token_accuracy": 0.7555217838287354, "num_tokens": 706689479.0, "step": 28750 }, { "entropy": 1.6043275892734528, "epoch": 1.3644115974985787, "grad_norm": 1.43356454372406, "learning_rate": 2.774029881444238e-05, "loss": 1.2127, "mean_token_accuracy": 0.7451710641384125, "num_tokens": 707935708.0, "step": 28800 }, { "entropy": 1.5973702204227447, "epoch": 1.366780367633125, "grad_norm": 1.2377339601516724, "learning_rate": 2.7555388690336725e-05, "loss": 1.163, "mean_token_accuracy": 0.7523965907096862, "num_tokens": 709186556.0, "step": 28850 }, { "entropy": 1.5913502633571626, "epoch": 1.369149137767671, "grad_norm": 1.2148689031600952, "learning_rate": 2.737086220385055e-05, "loss": 1.128, "mean_token_accuracy": 0.7610643255710602, "num_tokens": 710387868.0, "step": 28900 }, { "entropy": 1.629042412042618, "epoch": 1.3715179079022173, "grad_norm": 1.3843477964401245, "learning_rate": 2.7186722509028294e-05, "loss": 1.1943, "mean_token_accuracy": 0.7486888426542282, "num_tokens": 711599301.0, "step": 28950 }, { "entropy": 1.5498824548721313, "epoch": 1.3738866780367633, "grad_norm": 1.2801408767700195, "learning_rate": 2.7002972753303167e-05, "loss": 1.1466, "mean_token_accuracy": 0.7573213475942612, "num_tokens": 712853561.0, "step": 29000 }, { "epoch": 1.3738866780367633, "eval_entropy": 1.08869860577322, "eval_loss": NaN, "eval_mean_token_accuracy": 0.7604253247985526, "eval_num_tokens": 712853561.0, "eval_runtime": 728.768, "eval_samples_per_second": 34.051, "eval_steps_per_second": 4.256, "step": 29000 }, { "entropy": 1.5806612002849578, "epoch": 1.3762554481713094, "grad_norm": 1.1750032901763916, "learning_rate": 2.6819616077443243e-05, "loss": 1.1608, "mean_token_accuracy": 0.754431728720665, "num_tokens": 714067279.0, "step": 29050 }, { "entropy": 1.5694472527503966, "epoch": 1.3786242183058555, "grad_norm": 1.0282950401306152, "learning_rate": 2.6636655615497808e-05, "loss": 1.1345, "mean_token_accuracy": 0.7596888369321824, "num_tokens": 715313924.0, "step": 29100 }, { "entropy": 1.5819503486156463, "epoch": 1.3809929884404017, "grad_norm": 1.0335078239440918, "learning_rate": 2.6454094494743865e-05, "loss": 1.172, "mean_token_accuracy": 0.7516703462600708, "num_tokens": 716539594.0, "step": 29150 }, { "entropy": 1.5596589314937592, "epoch": 1.3833617585749478, "grad_norm": 1.1024916172027588, "learning_rate": 2.627193583563259e-05, "loss": 1.1515, "mean_token_accuracy": 0.7565756964683533, "num_tokens": 717779064.0, "step": 29200 }, { "entropy": 1.5737035143375397, "epoch": 1.385730528709494, "grad_norm": 1.3154332637786865, "learning_rate": 2.609018275173601e-05, "loss": 1.1313, "mean_token_accuracy": 0.7610451829433441, "num_tokens": 719013380.0, "step": 29250 }, { "entropy": 1.5879342305660247, "epoch": 1.3880992988440402, "grad_norm": 1.1045840978622437, "learning_rate": 2.590883834969383e-05, "loss": 1.1607, "mean_token_accuracy": 0.753515048623085, "num_tokens": 720213990.0, "step": 29300 }, { "entropy": 1.5969963049888611, "epoch": 1.3904680689785862, "grad_norm": 1.360352635383606, "learning_rate": 2.5727905729160274e-05, "loss": 1.2105, "mean_token_accuracy": 0.7468285745382309, "num_tokens": 721454429.0, "step": 29350 }, { "entropy": 1.5682587361335754, "epoch": 1.3928368391131325, "grad_norm": 1.2134160995483398, "learning_rate": 2.5547387982751186e-05, "loss": 1.1674, "mean_token_accuracy": 0.7542579096555709, "num_tokens": 722678187.0, "step": 29400 }, { "entropy": 1.5867646288871766, "epoch": 1.3952056092476786, "grad_norm": 1.316106915473938, "learning_rate": 2.536728819599108e-05, "loss": 1.1752, "mean_token_accuracy": 0.7528412294387817, "num_tokens": 723910197.0, "step": 29450 }, { "entropy": 1.5651627695560455, "epoch": 1.3975743793822248, "grad_norm": 1.184169054031372, "learning_rate": 2.5187609447260417e-05, "loss": 1.1439, "mean_token_accuracy": 0.7595011454820633, "num_tokens": 725117786.0, "step": 29500 }, { "entropy": 1.5885691118240357, "epoch": 1.399943149516771, "grad_norm": 1.026950716972351, "learning_rate": 2.5008354807743063e-05, "loss": 1.1624, "mean_token_accuracy": 0.7540817469358444, "num_tokens": 726361382.0, "step": 29550 }, { "entropy": 1.6078854203224182, "epoch": 1.402311919651317, "grad_norm": 1.105989694595337, "learning_rate": 2.482952734137369e-05, "loss": 1.1846, "mean_token_accuracy": 0.7512462210655212, "num_tokens": 727584011.0, "step": 29600 }, { "entropy": 1.5742497992515565, "epoch": 1.4046806897858632, "grad_norm": 1.0175246000289917, "learning_rate": 2.4651130104785464e-05, "loss": 1.1383, "mean_token_accuracy": 0.7599206572771072, "num_tokens": 728859452.0, "step": 29650 }, { "entropy": 1.5539786064624785, "epoch": 1.4070494599204093, "grad_norm": 1.19257652759552, "learning_rate": 2.447316614725779e-05, "loss": 1.1285, "mean_token_accuracy": 0.7605455183982849, "num_tokens": 730093871.0, "step": 29700 }, { "entropy": 1.575538364648819, "epoch": 1.4094182300549556, "grad_norm": 1.3367068767547607, "learning_rate": 2.429563851066423e-05, "loss": 1.1549, "mean_token_accuracy": 0.7566698521375657, "num_tokens": 731296865.0, "step": 29750 }, { "entropy": 1.5920424699783324, "epoch": 1.4117870001895017, "grad_norm": 1.1951195001602173, "learning_rate": 2.411855022942043e-05, "loss": 1.1863, "mean_token_accuracy": 0.7511163413524627, "num_tokens": 732528311.0, "step": 29800 }, { "entropy": 1.5808912098407746, "epoch": 1.4141557703240477, "grad_norm": 1.2582076787948608, "learning_rate": 2.394190433043228e-05, "loss": 1.1524, "mean_token_accuracy": 0.756331347823143, "num_tokens": 733754679.0, "step": 29850 }, { "entropy": 1.6012385189533234, "epoch": 1.4165245404585938, "grad_norm": 1.2719967365264893, "learning_rate": 2.376570383304423e-05, "loss": 1.1689, "mean_token_accuracy": 0.7530979549884796, "num_tokens": 734988780.0, "step": 29900 }, { "entropy": 1.5944563674926757, "epoch": 1.41889331059314, "grad_norm": 1.168672800064087, "learning_rate": 2.3589951748987615e-05, "loss": 1.1874, "mean_token_accuracy": 0.7496302407979966, "num_tokens": 736210496.0, "step": 29950 }, { "entropy": 1.5730518507957458, "epoch": 1.4212620807276861, "grad_norm": 1.2104912996292114, "learning_rate": 2.3414651082329214e-05, "loss": 1.1672, "mean_token_accuracy": 0.7543781703710556, "num_tokens": 737427744.0, "step": 30000 }, { "epoch": 1.4212620807276861, "eval_entropy": 1.0714102481581333, "eval_loss": NaN, "eval_mean_token_accuracy": 0.7607390975986735, "eval_num_tokens": 737427744.0, "eval_runtime": 727.6986, "eval_samples_per_second": 34.101, "eval_steps_per_second": 4.263, "step": 30000 }, { "entropy": 1.5904272723197936, "epoch": 1.4236308508622324, "grad_norm": 1.3191770315170288, "learning_rate": 2.323980482941987e-05, "loss": 1.1451, "mean_token_accuracy": 0.7584353858232498, "num_tokens": 738655939.0, "step": 30050 }, { "entropy": 1.5946367990970611, "epoch": 1.4259996209967785, "grad_norm": 1.6591154336929321, "learning_rate": 2.3065415978843334e-05, "loss": 1.1805, "mean_token_accuracy": 0.7520586925745011, "num_tokens": 739862826.0, "step": 30100 }, { "entropy": 1.5869379675388335, "epoch": 1.4283683911313245, "grad_norm": 1.4352805614471436, "learning_rate": 2.2891487511365144e-05, "loss": 1.1486, "mean_token_accuracy": 0.7577965116500854, "num_tokens": 741119357.0, "step": 30150 }, { "entropy": 1.5365791404247284, "epoch": 1.4307371612658708, "grad_norm": 1.2801551818847656, "learning_rate": 2.2718022399881637e-05, "loss": 1.142, "mean_token_accuracy": 0.7584273481369018, "num_tokens": 742333607.0, "step": 30200 }, { "entropy": 1.5572332954406738, "epoch": 1.4331059314004169, "grad_norm": 1.212966799736023, "learning_rate": 2.2545023609369202e-05, "loss": 1.1619, "mean_token_accuracy": 0.7548242086172103, "num_tokens": 743565034.0, "step": 30250 }, { "entropy": 1.566081155538559, "epoch": 1.4354747015349631, "grad_norm": 1.0920426845550537, "learning_rate": 2.237249409683356e-05, "loss": 1.1783, "mean_token_accuracy": 0.7530256235599517, "num_tokens": 744804057.0, "step": 30300 }, { "entropy": 1.6045558285713195, "epoch": 1.4378434716695092, "grad_norm": 0.9273141026496887, "learning_rate": 2.220043681125924e-05, "loss": 1.1419, "mean_token_accuracy": 0.7590651035308837, "num_tokens": 746006768.0, "step": 30350 }, { "entropy": 1.5942323195934296, "epoch": 1.4402122418040553, "grad_norm": 1.1541792154312134, "learning_rate": 2.202885469355916e-05, "loss": 1.1921, "mean_token_accuracy": 0.7489143800735474, "num_tokens": 747223675.0, "step": 30400 }, { "entropy": 1.5869334352016449, "epoch": 1.4425810119386016, "grad_norm": 1.462320327758789, "learning_rate": 2.1857750676524357e-05, "loss": 1.1442, "mean_token_accuracy": 0.7573701620101929, "num_tokens": 748430497.0, "step": 30450 }, { "entropy": 1.586618103981018, "epoch": 1.4449497820731476, "grad_norm": 1.0793588161468506, "learning_rate": 2.168712768477392e-05, "loss": 1.1743, "mean_token_accuracy": 0.7522615754604339, "num_tokens": 749647006.0, "step": 30500 }, { "entropy": 1.6103046894073487, "epoch": 1.447318552207694, "grad_norm": 1.2154242992401123, "learning_rate": 2.1516988634704882e-05, "loss": 1.19, "mean_token_accuracy": 0.7501159131526947, "num_tokens": 750853602.0, "step": 30550 }, { "entropy": 1.5543176436424255, "epoch": 1.44968732234224, "grad_norm": 1.1655502319335938, "learning_rate": 2.1347336434442467e-05, "loss": 1.1284, "mean_token_accuracy": 0.7604024815559387, "num_tokens": 752063383.0, "step": 30600 }, { "entropy": 1.5402367627620697, "epoch": 1.452056092476786, "grad_norm": 0.9396981000900269, "learning_rate": 2.1178173983790333e-05, "loss": 1.1413, "mean_token_accuracy": 0.7587384188175201, "num_tokens": 753297932.0, "step": 30650 }, { "entropy": 1.565843381881714, "epoch": 1.454424862611332, "grad_norm": 1.2412699460983276, "learning_rate": 2.100950417418105e-05, "loss": 1.1336, "mean_token_accuracy": 0.76046923995018, "num_tokens": 754534333.0, "step": 30700 }, { "entropy": 1.5800429701805114, "epoch": 1.4567936327458784, "grad_norm": 1.3534191846847534, "learning_rate": 2.084132988862663e-05, "loss": 1.168, "mean_token_accuracy": 0.7545898991823197, "num_tokens": 755771112.0, "step": 30750 }, { "entropy": 1.5431535518169404, "epoch": 1.4591624028804244, "grad_norm": 1.1893748044967651, "learning_rate": 2.067365400166928e-05, "loss": 1.1317, "mean_token_accuracy": 0.7592762231826782, "num_tokens": 757016170.0, "step": 30800 }, { "entropy": 1.5234503149986267, "epoch": 1.4615311730149707, "grad_norm": 1.2661027908325195, "learning_rate": 2.0506479379332277e-05, "loss": 1.1197, "mean_token_accuracy": 0.7625928592681884, "num_tokens": 758267588.0, "step": 30850 }, { "entropy": 1.595642819404602, "epoch": 1.4638999431495168, "grad_norm": 1.3147796392440796, "learning_rate": 2.0339808879070942e-05, "loss": 1.1943, "mean_token_accuracy": 0.7485580265522003, "num_tokens": 759488024.0, "step": 30900 }, { "entropy": 1.581249178647995, "epoch": 1.4662687132840628, "grad_norm": 1.1915379762649536, "learning_rate": 2.0173645349723823e-05, "loss": 1.1843, "mean_token_accuracy": 0.751889705657959, "num_tokens": 760705547.0, "step": 30950 }, { "entropy": 1.547069821357727, "epoch": 1.4686374834186091, "grad_norm": 1.3527320623397827, "learning_rate": 2.0007991631463985e-05, "loss": 1.124, "mean_token_accuracy": 0.7617496418952941, "num_tokens": 761946385.0, "step": 31000 }, { "epoch": 1.4686374834186091, "eval_entropy": 1.0745692070571167, "eval_loss": NaN, "eval_mean_token_accuracy": 0.7606811193421454, "eval_num_tokens": 761946385.0, "eval_runtime": 750.8644, "eval_samples_per_second": 33.049, "eval_steps_per_second": 4.131, "step": 31000 }, { "entropy": 1.5860296082496643, "epoch": 1.4710062535531552, "grad_norm": 1.1028927564620972, "learning_rate": 1.984285055575052e-05, "loss": 1.1477, "mean_token_accuracy": 0.757946463227272, "num_tokens": 763145229.0, "step": 31050 }, { "entropy": 1.5409555327892304, "epoch": 1.4733750236877015, "grad_norm": 1.3105554580688477, "learning_rate": 1.967822494528007e-05, "loss": 1.1388, "mean_token_accuracy": 0.7595143103599549, "num_tokens": 764368780.0, "step": 31100 }, { "entropy": 1.5342436349391937, "epoch": 1.4757437938222475, "grad_norm": 1.175134539604187, "learning_rate": 1.9514117613938625e-05, "loss": 1.1376, "mean_token_accuracy": 0.7598193883895874, "num_tokens": 765622021.0, "step": 31150 }, { "entropy": 1.568356648683548, "epoch": 1.4781125639567936, "grad_norm": 1.1193444728851318, "learning_rate": 1.935053136675339e-05, "loss": 1.1488, "mean_token_accuracy": 0.7594327408075333, "num_tokens": 766856360.0, "step": 31200 }, { "entropy": 1.5699161183834076, "epoch": 1.4804813340913396, "grad_norm": 1.0560715198516846, "learning_rate": 1.9187468999844936e-05, "loss": 1.1459, "mean_token_accuracy": 0.7583613079786301, "num_tokens": 768058852.0, "step": 31250 }, { "entropy": 1.588351196050644, "epoch": 1.482850104225886, "grad_norm": 1.2862846851348877, "learning_rate": 1.9024933300379277e-05, "loss": 1.1692, "mean_token_accuracy": 0.7539873999357224, "num_tokens": 769278437.0, "step": 31300 }, { "entropy": 1.5751077544689178, "epoch": 1.485218874360432, "grad_norm": 1.1171611547470093, "learning_rate": 1.8862927046520312e-05, "loss": 1.1468, "mean_token_accuracy": 0.7574340969324111, "num_tokens": 770491005.0, "step": 31350 }, { "entropy": 1.5523167753219604, "epoch": 1.4875876444949783, "grad_norm": 1.1966944932937622, "learning_rate": 1.8701453007382314e-05, "loss": 1.1322, "mean_token_accuracy": 0.7628469413518906, "num_tokens": 771700031.0, "step": 31400 }, { "entropy": 1.5793978321552276, "epoch": 1.4899564146295243, "grad_norm": 1.404768466949463, "learning_rate": 1.8540513942982602e-05, "loss": 1.1309, "mean_token_accuracy": 0.7610709732770919, "num_tokens": 772941795.0, "step": 31450 }, { "entropy": 1.5804509365558623, "epoch": 1.4923251847640704, "grad_norm": 1.3773914575576782, "learning_rate": 1.838011260419435e-05, "loss": 1.1556, "mean_token_accuracy": 0.7568354111909866, "num_tokens": 774162687.0, "step": 31500 }, { "entropy": 1.5457658851146698, "epoch": 1.4946939548986167, "grad_norm": 0.9370711445808411, "learning_rate": 1.822025173269964e-05, "loss": 1.1291, "mean_token_accuracy": 0.7615066528320312, "num_tokens": 775426714.0, "step": 31550 }, { "entropy": 1.550627862215042, "epoch": 1.4970627250331627, "grad_norm": 1.1992812156677246, "learning_rate": 1.8060934060942487e-05, "loss": 1.1443, "mean_token_accuracy": 0.7579187524318695, "num_tokens": 776645207.0, "step": 31600 }, { "entropy": 1.5496788358688354, "epoch": 1.499431495167709, "grad_norm": 1.290854811668396, "learning_rate": 1.7902162312082194e-05, "loss": 1.1542, "mean_token_accuracy": 0.7575876170396805, "num_tokens": 777890539.0, "step": 31650 }, { "entropy": 1.583651841878891, "epoch": 1.501800265302255, "grad_norm": 1.4201711416244507, "learning_rate": 1.7743939199946818e-05, "loss": 1.1669, "mean_token_accuracy": 0.7559886735677719, "num_tokens": 779106659.0, "step": 31700 }, { "entropy": 1.5621719944477082, "epoch": 1.5041690354368011, "grad_norm": 1.0013508796691895, "learning_rate": 1.7586267428986763e-05, "loss": 1.1622, "mean_token_accuracy": 0.7543949365615845, "num_tokens": 780313881.0, "step": 31750 }, { "entropy": 1.5784629476070404, "epoch": 1.5065378055713472, "grad_norm": 1.293186068534851, "learning_rate": 1.742914969422856e-05, "loss": 1.1484, "mean_token_accuracy": 0.7578674453496933, "num_tokens": 781544604.0, "step": 31800 }, { "entropy": 1.5572881984710694, "epoch": 1.5089065757058935, "grad_norm": 1.1909185647964478, "learning_rate": 1.7272588681228767e-05, "loss": 1.1025, "mean_token_accuracy": 0.7669892936944962, "num_tokens": 782765240.0, "step": 31850 }, { "entropy": 1.5987060451507569, "epoch": 1.5112753458404398, "grad_norm": 1.3331712484359741, "learning_rate": 1.7116587066028172e-05, "loss": 1.1787, "mean_token_accuracy": 0.7533298796415329, "num_tokens": 783994667.0, "step": 31900 }, { "entropy": 1.5784035372734069, "epoch": 1.5136441159749858, "grad_norm": 1.3433549404144287, "learning_rate": 1.6961147515105897e-05, "loss": 1.1539, "mean_token_accuracy": 0.7583291745185852, "num_tokens": 785241722.0, "step": 31950 }, { "entropy": 1.5924919998645783, "epoch": 1.5160128861095319, "grad_norm": 0.9708880186080933, "learning_rate": 1.6806272685333967e-05, "loss": 1.168, "mean_token_accuracy": 0.7547562402486802, "num_tokens": 786450293.0, "step": 32000 }, { "epoch": 1.5160128861095319, "eval_entropy": 1.0883530624676796, "eval_loss": NaN, "eval_mean_token_accuracy": 0.760518561690796, "eval_num_tokens": 786450293.0, "eval_runtime": 728.4178, "eval_samples_per_second": 34.067, "eval_steps_per_second": 4.259, "step": 32000 }, { "entropy": 1.5847830092906952, "epoch": 1.518381656244078, "grad_norm": 1.0674691200256348, "learning_rate": 1.6651965223931798e-05, "loss": 1.122, "mean_token_accuracy": 0.7640283882617951, "num_tokens": 787651249.0, "step": 32050 }, { "entropy": 1.6060099351406096, "epoch": 1.5207504263786242, "grad_norm": 1.3451073169708252, "learning_rate": 1.6498227768420986e-05, "loss": 1.1986, "mean_token_accuracy": 0.7503712397813797, "num_tokens": 788894856.0, "step": 32100 }, { "entropy": 1.5495011293888092, "epoch": 1.5231191965131705, "grad_norm": 1.184458613395691, "learning_rate": 1.634506294658023e-05, "loss": 1.1241, "mean_token_accuracy": 0.7635609942674637, "num_tokens": 790124279.0, "step": 32150 }, { "entropy": 1.5789199233055116, "epoch": 1.5254879666477166, "grad_norm": 1.4204998016357422, "learning_rate": 1.619247337640041e-05, "loss": 1.1481, "mean_token_accuracy": 0.7602039396762847, "num_tokens": 791346787.0, "step": 32200 }, { "entropy": 1.5563798201084138, "epoch": 1.5278567367822626, "grad_norm": 1.1505266427993774, "learning_rate": 1.6040461666039808e-05, "loss": 1.1499, "mean_token_accuracy": 0.7575086969137191, "num_tokens": 792593563.0, "step": 32250 }, { "entropy": 1.6074644064903258, "epoch": 1.5302255069168087, "grad_norm": 1.185583472251892, "learning_rate": 1.5889030413779622e-05, "loss": 1.156, "mean_token_accuracy": 0.7562423485517502, "num_tokens": 793829790.0, "step": 32300 }, { "entropy": 1.5752575540542602, "epoch": 1.532594277051355, "grad_norm": 1.4243769645690918, "learning_rate": 1.5738182207979435e-05, "loss": 1.1459, "mean_token_accuracy": 0.7583240360021591, "num_tokens": 795055789.0, "step": 32350 }, { "entropy": 1.5923644971847535, "epoch": 1.534963047185901, "grad_norm": 1.6619261503219604, "learning_rate": 1.558791962703304e-05, "loss": 1.154, "mean_token_accuracy": 0.7567561262845993, "num_tokens": 796275636.0, "step": 32400 }, { "entropy": 1.583539651632309, "epoch": 1.5373318173204473, "grad_norm": 1.5260084867477417, "learning_rate": 1.5438245239324372e-05, "loss": 1.1293, "mean_token_accuracy": 0.7631356823444366, "num_tokens": 797503738.0, "step": 32450 }, { "entropy": 1.5718154168128968, "epoch": 1.5397005874549934, "grad_norm": 1.1916577816009521, "learning_rate": 1.5289161603183565e-05, "loss": 1.1556, "mean_token_accuracy": 0.756939308643341, "num_tokens": 798743606.0, "step": 32500 }, { "entropy": 1.5819120156764983, "epoch": 1.5420693575895394, "grad_norm": 1.1773018836975098, "learning_rate": 1.5140671266843276e-05, "loss": 1.1722, "mean_token_accuracy": 0.7551066309213639, "num_tokens": 799964473.0, "step": 32550 }, { "entropy": 1.575450291633606, "epoch": 1.5444381277240855, "grad_norm": 1.0022114515304565, "learning_rate": 1.4992776768395073e-05, "loss": 1.1449, "mean_token_accuracy": 0.7597598391771316, "num_tokens": 801188088.0, "step": 32600 }, { "entropy": 1.5459000968933105, "epoch": 1.5468068978586318, "grad_norm": 1.2957897186279297, "learning_rate": 1.4845480635746129e-05, "loss": 1.1227, "mean_token_accuracy": 0.7632001984119415, "num_tokens": 802438523.0, "step": 32650 }, { "entropy": 1.5870135259628295, "epoch": 1.549175667993178, "grad_norm": 1.3867087364196777, "learning_rate": 1.469878538657593e-05, "loss": 1.1395, "mean_token_accuracy": 0.7591842120885849, "num_tokens": 803649365.0, "step": 32700 }, { "entropy": 1.5655232286453247, "epoch": 1.5515444381277241, "grad_norm": 0.9858147501945496, "learning_rate": 1.4552693528293287e-05, "loss": 1.1343, "mean_token_accuracy": 0.7619771939516068, "num_tokens": 804874548.0, "step": 32750 }, { "entropy": 1.5606813442707062, "epoch": 1.5539132082622702, "grad_norm": 1.1506080627441406, "learning_rate": 1.4407207557993468e-05, "loss": 1.1358, "mean_token_accuracy": 0.7607605350017548, "num_tokens": 806110451.0, "step": 32800 }, { "entropy": 1.5905800759792328, "epoch": 1.5562819783968163, "grad_norm": 1.2425259351730347, "learning_rate": 1.4262329962415521e-05, "loss": 1.1416, "mean_token_accuracy": 0.7600742274522782, "num_tokens": 807342732.0, "step": 32850 }, { "entropy": 1.5680935847759248, "epoch": 1.5586507485313625, "grad_norm": 1.153823733329773, "learning_rate": 1.4118063217899746e-05, "loss": 1.1335, "mean_token_accuracy": 0.7605480921268463, "num_tokens": 808586619.0, "step": 32900 }, { "entropy": 1.5620489943027496, "epoch": 1.5610195186659088, "grad_norm": 1.050882339477539, "learning_rate": 1.397440979034544e-05, "loss": 1.1522, "mean_token_accuracy": 0.756206591129303, "num_tokens": 809832674.0, "step": 32950 }, { "entropy": 1.605532693862915, "epoch": 1.5633882888004549, "grad_norm": 1.3032130002975464, "learning_rate": 1.383137213516862e-05, "loss": 1.1758, "mean_token_accuracy": 0.7512508201599121, "num_tokens": 811045391.0, "step": 33000 }, { "epoch": 1.5633882888004549, "eval_entropy": 1.0869283330279116, "eval_loss": NaN, "eval_mean_token_accuracy": 0.7607752118838518, "eval_num_tokens": 811045391.0, "eval_runtime": 725.7757, "eval_samples_per_second": 34.191, "eval_steps_per_second": 4.274, "step": 33000 }, { "entropy": 1.592424702644348, "epoch": 1.565757058935001, "grad_norm": 1.4170438051223755, "learning_rate": 1.36889526972602e-05, "loss": 1.1393, "mean_token_accuracy": 0.7614479339122773, "num_tokens": 812284992.0, "step": 33050 }, { "entropy": 1.5892753875255585, "epoch": 1.568125829069547, "grad_norm": 1.1543865203857422, "learning_rate": 1.3547153910944083e-05, "loss": 1.1511, "mean_token_accuracy": 0.7588121032714844, "num_tokens": 813547664.0, "step": 33100 }, { "entropy": 1.584560890197754, "epoch": 1.5704945992040933, "grad_norm": 0.9274981021881104, "learning_rate": 1.3405978199935615e-05, "loss": 1.1374, "mean_token_accuracy": 0.7598586791753769, "num_tokens": 814802491.0, "step": 33150 }, { "entropy": 1.6207016718387603, "epoch": 1.5728633693386393, "grad_norm": 1.2314906120300293, "learning_rate": 1.3265427977300137e-05, "loss": 1.1615, "mean_token_accuracy": 0.7563327193260193, "num_tokens": 816017932.0, "step": 33200 }, { "entropy": 1.586843957901001, "epoch": 1.5752321394731856, "grad_norm": 1.1286264657974243, "learning_rate": 1.3125505645411745e-05, "loss": 1.1426, "mean_token_accuracy": 0.7603234398365021, "num_tokens": 817264849.0, "step": 33250 }, { "entropy": 1.591685062646866, "epoch": 1.5776009096077317, "grad_norm": 1.0517480373382568, "learning_rate": 1.2986213595912234e-05, "loss": 1.1207, "mean_token_accuracy": 0.7635256379842759, "num_tokens": 818484337.0, "step": 33300 }, { "entropy": 1.6172384572029115, "epoch": 1.5799696797422778, "grad_norm": 1.360130786895752, "learning_rate": 1.2847554209670182e-05, "loss": 1.161, "mean_token_accuracy": 0.7545110338926315, "num_tokens": 819712425.0, "step": 33350 }, { "entropy": 1.5726615214347839, "epoch": 1.5823384498768238, "grad_norm": 0.9944781064987183, "learning_rate": 1.2709529856740331e-05, "loss": 1.1207, "mean_token_accuracy": 0.7627239066362381, "num_tokens": 820956336.0, "step": 33400 }, { "entropy": 1.58195317029953, "epoch": 1.58470722001137, "grad_norm": 1.3657851219177246, "learning_rate": 1.2572142896322991e-05, "loss": 1.1318, "mean_token_accuracy": 0.7620331639051438, "num_tokens": 822181235.0, "step": 33450 }, { "entropy": 1.5519816017150878, "epoch": 1.5870759901459164, "grad_norm": 1.1192278861999512, "learning_rate": 1.2435395676723765e-05, "loss": 1.1255, "mean_token_accuracy": 0.763149077296257, "num_tokens": 823394039.0, "step": 33500 }, { "entropy": 1.5487982165813445, "epoch": 1.5894447602804624, "grad_norm": 0.9175589084625244, "learning_rate": 1.229929053531339e-05, "loss": 1.1266, "mean_token_accuracy": 0.762880043387413, "num_tokens": 824629826.0, "step": 33550 }, { "entropy": 1.5455662417411804, "epoch": 1.5918135304150085, "grad_norm": 1.543500542640686, "learning_rate": 1.2163829798487796e-05, "loss": 1.1179, "mean_token_accuracy": 0.7645809006690979, "num_tokens": 825885699.0, "step": 33600 }, { "entropy": 1.545372655391693, "epoch": 1.5941823005495546, "grad_norm": 1.042571783065796, "learning_rate": 1.2029015781628333e-05, "loss": 1.1253, "mean_token_accuracy": 0.7624981206655502, "num_tokens": 827144142.0, "step": 33650 }, { "entropy": 1.5542615973949432, "epoch": 1.5965510706841008, "grad_norm": 1.2017757892608643, "learning_rate": 1.1894850789062234e-05, "loss": 1.1095, "mean_token_accuracy": 0.7662106871604919, "num_tokens": 828358780.0, "step": 33700 }, { "entropy": 1.5387887310981752, "epoch": 1.598919840818647, "grad_norm": 1.2897499799728394, "learning_rate": 1.1761337114023157e-05, "loss": 1.1393, "mean_token_accuracy": 0.7597699278593063, "num_tokens": 829617688.0, "step": 33750 }, { "entropy": 1.57563338637352, "epoch": 1.6012886109531932, "grad_norm": 1.7554948329925537, "learning_rate": 1.1628477038612035e-05, "loss": 1.1186, "mean_token_accuracy": 0.7649687886238098, "num_tokens": 830817095.0, "step": 33800 }, { "entropy": 1.5856982839107514, "epoch": 1.6036573810877393, "grad_norm": 0.9697763919830322, "learning_rate": 1.1496272833758042e-05, "loss": 1.1803, "mean_token_accuracy": 0.7541155385971069, "num_tokens": 832068396.0, "step": 33850 }, { "entropy": 1.582381078004837, "epoch": 1.6060261512222853, "grad_norm": 1.2476204633712769, "learning_rate": 1.1364726759179856e-05, "loss": 1.1366, "mean_token_accuracy": 0.7601368808746338, "num_tokens": 833258832.0, "step": 33900 }, { "entropy": 1.5806366765499116, "epoch": 1.6083949213568314, "grad_norm": 1.464986801147461, "learning_rate": 1.12338410633469e-05, "loss": 1.1401, "mean_token_accuracy": 0.7602562707662582, "num_tokens": 834452824.0, "step": 33950 }, { "entropy": 1.5599962186813354, "epoch": 1.6107636914913777, "grad_norm": 1.1796619892120361, "learning_rate": 1.1103617983441017e-05, "loss": 1.1369, "mean_token_accuracy": 0.7609011316299439, "num_tokens": 835712240.0, "step": 34000 }, { "epoch": 1.6107636914913777, "eval_entropy": 1.0767018733141425, "eval_loss": NaN, "eval_mean_token_accuracy": 0.760854017967028, "eval_num_tokens": 835712240.0, "eval_runtime": 727.3112, "eval_samples_per_second": 34.119, "eval_steps_per_second": 4.265, "step": 34000 }, { "entropy": 1.579708174467087, "epoch": 1.613132461625924, "grad_norm": 1.2132450342178345, "learning_rate": 1.0974059745318177e-05, "loss": 1.1412, "mean_token_accuracy": 0.7581533867120743, "num_tokens": 836951903.0, "step": 34050 }, { "entropy": 1.5457484829425812, "epoch": 1.61550123176047, "grad_norm": 0.9479733109474182, "learning_rate": 1.0845168563470492e-05, "loss": 1.1319, "mean_token_accuracy": 0.7595140463113785, "num_tokens": 838214157.0, "step": 34100 }, { "entropy": 1.5466408836841583, "epoch": 1.617870001895016, "grad_norm": 1.7005789279937744, "learning_rate": 1.071694664098828e-05, "loss": 1.1175, "mean_token_accuracy": 0.7649167954921723, "num_tokens": 839471563.0, "step": 34150 }, { "entropy": 1.5700657200813293, "epoch": 1.6202387720295621, "grad_norm": 1.059528112411499, "learning_rate": 1.0589396169522465e-05, "loss": 1.1615, "mean_token_accuracy": 0.7565118598937989, "num_tokens": 840715891.0, "step": 34200 }, { "entropy": 1.5375142538547515, "epoch": 1.6226075421641084, "grad_norm": 1.559818148612976, "learning_rate": 1.0462519329247094e-05, "loss": 1.1356, "mean_token_accuracy": 0.7605293154716491, "num_tokens": 841942053.0, "step": 34250 }, { "entropy": 1.587239592075348, "epoch": 1.6249763122986547, "grad_norm": 1.1134872436523438, "learning_rate": 1.03363182888221e-05, "loss": 1.1564, "mean_token_accuracy": 0.7570129364728928, "num_tokens": 843161472.0, "step": 34300 }, { "entropy": 1.5743994867801667, "epoch": 1.6273450824332008, "grad_norm": 1.3683475255966187, "learning_rate": 1.021079520535619e-05, "loss": 1.159, "mean_token_accuracy": 0.7565000504255295, "num_tokens": 844415080.0, "step": 34350 }, { "entropy": 1.5728708267211915, "epoch": 1.6297138525677468, "grad_norm": 1.1361534595489502, "learning_rate": 1.0085952224369998e-05, "loss": 1.1464, "mean_token_accuracy": 0.7604904717206955, "num_tokens": 845652767.0, "step": 34400 }, { "entropy": 1.555993628501892, "epoch": 1.6320826227022929, "grad_norm": 1.2197624444961548, "learning_rate": 9.961791479759453e-06, "loss": 1.1094, "mean_token_accuracy": 0.7654684072732926, "num_tokens": 846861078.0, "step": 34450 }, { "entropy": 1.5712345719337464, "epoch": 1.6344513928368392, "grad_norm": 1.2012556791305542, "learning_rate": 9.83831509375922e-06, "loss": 1.1318, "mean_token_accuracy": 0.7618235784769058, "num_tokens": 848079801.0, "step": 34500 }, { "entropy": 1.5478745126724243, "epoch": 1.6368201629713852, "grad_norm": 1.1320964097976685, "learning_rate": 9.715525176906482e-06, "loss": 1.1156, "mean_token_accuracy": 0.763830555677414, "num_tokens": 849324814.0, "step": 34550 }, { "entropy": 1.5780341172218322, "epoch": 1.6391889331059315, "grad_norm": 1.025578498840332, "learning_rate": 9.59342382800486e-06, "loss": 1.1426, "mean_token_accuracy": 0.7610353720188141, "num_tokens": 850545817.0, "step": 34600 }, { "entropy": 1.5829120945930482, "epoch": 1.6415577032404776, "grad_norm": 1.0543193817138672, "learning_rate": 9.472013134088525e-06, "loss": 1.1659, "mean_token_accuracy": 0.7564774835109711, "num_tokens": 851771892.0, "step": 34650 }, { "entropy": 1.5125657570362092, "epoch": 1.6439264733750236, "grad_norm": 1.047337532043457, "learning_rate": 9.351295170386536e-06, "loss": 1.1436, "mean_token_accuracy": 0.7618407100439072, "num_tokens": 853004916.0, "step": 34700 }, { "entropy": 1.5835665547847748, "epoch": 1.6462952435095697, "grad_norm": 1.4141535758972168, "learning_rate": 9.231272000287355e-06, "loss": 1.1394, "mean_token_accuracy": 0.7607215863466262, "num_tokens": 854213875.0, "step": 34750 }, { "entropy": 1.5290465533733368, "epoch": 1.648664013644116, "grad_norm": 1.578470230102539, "learning_rate": 9.111945675303619e-06, "loss": 1.0863, "mean_token_accuracy": 0.7710424029827118, "num_tokens": 855445223.0, "step": 34800 }, { "entropy": 1.549327657222748, "epoch": 1.6510327837786622, "grad_norm": 1.0670899152755737, "learning_rate": 8.993318235037001e-06, "loss": 1.1251, "mean_token_accuracy": 0.7622793889045716, "num_tokens": 856681494.0, "step": 34850 }, { "entropy": 1.5306969308853149, "epoch": 1.6534015539132083, "grad_norm": 1.3352553844451904, "learning_rate": 8.875391707143432e-06, "loss": 1.1102, "mean_token_accuracy": 0.7646553814411163, "num_tokens": 857925423.0, "step": 34900 }, { "entropy": 1.534108463525772, "epoch": 1.6557703240477544, "grad_norm": 1.3138459920883179, "learning_rate": 8.75816810729837e-06, "loss": 1.1059, "mean_token_accuracy": 0.7676987838745117, "num_tokens": 859172535.0, "step": 34950 }, { "entropy": 1.561165556907654, "epoch": 1.6581390941823004, "grad_norm": 1.1682779788970947, "learning_rate": 8.641649439162396e-06, "loss": 1.1193, "mean_token_accuracy": 0.7643628352880478, "num_tokens": 860388305.0, "step": 35000 }, { "epoch": 1.6581390941823004, "eval_entropy": 1.0728673784675942, "eval_loss": NaN, "eval_mean_token_accuracy": 0.761188728099558, "eval_num_tokens": 860388305.0, "eval_runtime": 727.1748, "eval_samples_per_second": 34.125, "eval_steps_per_second": 4.266, "step": 35000 }, { "entropy": 1.5597219800949096, "epoch": 1.6605078643168467, "grad_norm": 1.1003355979919434, "learning_rate": 8.525837694346932e-06, "loss": 1.1456, "mean_token_accuracy": 0.7606293076276779, "num_tokens": 861636981.0, "step": 35050 }, { "entropy": 1.5564636278152466, "epoch": 1.662876634451393, "grad_norm": 1.22114098072052, "learning_rate": 8.410734852380231e-06, "loss": 1.1478, "mean_token_accuracy": 0.7590148377418519, "num_tokens": 862893390.0, "step": 35100 }, { "entropy": 1.5701312077045442, "epoch": 1.665245404585939, "grad_norm": 1.4223356246948242, "learning_rate": 8.296342880673513e-06, "loss": 1.1266, "mean_token_accuracy": 0.763382934331894, "num_tokens": 864117153.0, "step": 35150 }, { "entropy": 1.587481471300125, "epoch": 1.6676141747204851, "grad_norm": 1.5688437223434448, "learning_rate": 8.182663734487372e-06, "loss": 1.1656, "mean_token_accuracy": 0.7555125683546067, "num_tokens": 865348622.0, "step": 35200 }, { "entropy": 1.5680071783065797, "epoch": 1.6699829448550312, "grad_norm": 1.2558252811431885, "learning_rate": 8.069699356898309e-06, "loss": 1.151, "mean_token_accuracy": 0.7581069612503052, "num_tokens": 866584445.0, "step": 35250 }, { "entropy": 1.5600192046165466, "epoch": 1.6723517149895775, "grad_norm": 1.3348325490951538, "learning_rate": 7.95745167876556e-06, "loss": 1.1564, "mean_token_accuracy": 0.7571524727344513, "num_tokens": 867822403.0, "step": 35300 }, { "entropy": 1.562008023262024, "epoch": 1.6747204851241235, "grad_norm": 1.3110319375991821, "learning_rate": 7.84592261869806e-06, "loss": 1.1462, "mean_token_accuracy": 0.7590863239765168, "num_tokens": 869072939.0, "step": 35350 }, { "entropy": 1.5816670620441438, "epoch": 1.6770892552586698, "grad_norm": 1.2804386615753174, "learning_rate": 7.735114083021683e-06, "loss": 1.1353, "mean_token_accuracy": 0.7603730088472367, "num_tokens": 870288358.0, "step": 35400 }, { "entropy": 1.546754379272461, "epoch": 1.6794580253932159, "grad_norm": 1.1443445682525635, "learning_rate": 7.625027965746634e-06, "loss": 1.1473, "mean_token_accuracy": 0.7597916102409363, "num_tokens": 871537045.0, "step": 35450 }, { "entropy": 1.5389190435409545, "epoch": 1.681826795527762, "grad_norm": 1.2796293497085571, "learning_rate": 7.515666148535067e-06, "loss": 1.1159, "mean_token_accuracy": 0.7650646787881851, "num_tokens": 872759023.0, "step": 35500 }, { "entropy": 1.5631573498249054, "epoch": 1.684195565662308, "grad_norm": 1.503520131111145, "learning_rate": 7.407030500668971e-06, "loss": 1.1688, "mean_token_accuracy": 0.7553135341405869, "num_tokens": 873995801.0, "step": 35550 }, { "entropy": 1.5810019493103027, "epoch": 1.6865643357968543, "grad_norm": 1.126876950263977, "learning_rate": 7.299122879018155e-06, "loss": 1.1475, "mean_token_accuracy": 0.7582389563322067, "num_tokens": 875225780.0, "step": 35600 }, { "entropy": 1.5617643618583679, "epoch": 1.6889331059314006, "grad_norm": 1.041165828704834, "learning_rate": 7.191945128008548e-06, "loss": 1.1599, "mean_token_accuracy": 0.7565414899587631, "num_tokens": 876441973.0, "step": 35650 }, { "entropy": 1.5521779787540435, "epoch": 1.6913018760659466, "grad_norm": 1.024032711982727, "learning_rate": 7.085499079590674e-06, "loss": 1.1359, "mean_token_accuracy": 0.7614186578989028, "num_tokens": 877691572.0, "step": 35700 }, { "entropy": 1.5647426414489747, "epoch": 1.6936706462004927, "grad_norm": 1.4068409204483032, "learning_rate": 6.979786553208306e-06, "loss": 1.1434, "mean_token_accuracy": 0.7604501461982727, "num_tokens": 878910690.0, "step": 35750 }, { "entropy": 1.5498666989803314, "epoch": 1.6960394163350387, "grad_norm": 1.2371301651000977, "learning_rate": 6.8748093557674084e-06, "loss": 1.1359, "mean_token_accuracy": 0.7612047231197357, "num_tokens": 880160478.0, "step": 35800 }, { "entropy": 1.5569170558452605, "epoch": 1.698408186469585, "grad_norm": 1.2279973030090332, "learning_rate": 6.770569281605244e-06, "loss": 1.1249, "mean_token_accuracy": 0.7620308262109756, "num_tokens": 881367218.0, "step": 35850 }, { "entropy": 1.5841618192195892, "epoch": 1.700776956604131, "grad_norm": 1.173069953918457, "learning_rate": 6.667068112459662e-06, "loss": 1.1585, "mean_token_accuracy": 0.7556693691015244, "num_tokens": 882584025.0, "step": 35900 }, { "entropy": 1.5525937521457671, "epoch": 1.7031457267386774, "grad_norm": 1.2860488891601562, "learning_rate": 6.56430761743872e-06, "loss": 1.1681, "mean_token_accuracy": 0.7552351075410842, "num_tokens": 883859871.0, "step": 35950 }, { "entropy": 1.5703179001808167, "epoch": 1.7055144968732234, "grad_norm": 1.2826309204101562, "learning_rate": 6.462289552990353e-06, "loss": 1.1341, "mean_token_accuracy": 0.7611742705106735, "num_tokens": 885071859.0, "step": 36000 }, { "epoch": 1.7055144968732234, "eval_entropy": 1.0768880580103375, "eval_loss": NaN, "eval_mean_token_accuracy": 0.7609103475268005, "eval_num_tokens": 885071859.0, "eval_runtime": 728.1701, "eval_samples_per_second": 34.079, "eval_steps_per_second": 4.26, "step": 36000 }, { "entropy": 1.5391144120693208, "epoch": 1.7078832670077695, "grad_norm": 1.2603217363357544, "learning_rate": 6.361015662872433e-06, "loss": 1.1158, "mean_token_accuracy": 0.7669140672683716, "num_tokens": 886316413.0, "step": 36050 }, { "entropy": 1.5330300676822661, "epoch": 1.7102520371423156, "grad_norm": 1.2248510122299194, "learning_rate": 6.260487678122911e-06, "loss": 1.0644, "mean_token_accuracy": 0.7753306698799133, "num_tokens": 887565349.0, "step": 36100 }, { "entropy": 1.5597666120529174, "epoch": 1.7126208072768618, "grad_norm": 1.0783295631408691, "learning_rate": 6.160707317030256e-06, "loss": 1.109, "mean_token_accuracy": 0.7654628306627274, "num_tokens": 888770171.0, "step": 36150 }, { "entropy": 1.5886062026023864, "epoch": 1.7149895774114081, "grad_norm": 1.518917202949524, "learning_rate": 6.0616762851040675e-06, "loss": 1.1602, "mean_token_accuracy": 0.7575176376104354, "num_tokens": 889998256.0, "step": 36200 }, { "entropy": 1.5691123294830323, "epoch": 1.7173583475459542, "grad_norm": 1.1943705081939697, "learning_rate": 5.963396275045951e-06, "loss": 1.1476, "mean_token_accuracy": 0.758755573630333, "num_tokens": 891199335.0, "step": 36250 }, { "entropy": 1.5685584223270417, "epoch": 1.7197271176805002, "grad_norm": 1.2932955026626587, "learning_rate": 5.865868966720556e-06, "loss": 1.1354, "mean_token_accuracy": 0.7614442694187165, "num_tokens": 892434722.0, "step": 36300 }, { "entropy": 1.5710162222385406, "epoch": 1.7220958878150463, "grad_norm": 1.200039267539978, "learning_rate": 5.769096027126869e-06, "loss": 1.1766, "mean_token_accuracy": 0.7540206718444824, "num_tokens": 893676597.0, "step": 36350 }, { "entropy": 1.562828722000122, "epoch": 1.7244646579495926, "grad_norm": 1.2064563035964966, "learning_rate": 5.673079110369722e-06, "loss": 1.121, "mean_token_accuracy": 0.7634602183103562, "num_tokens": 894910050.0, "step": 36400 }, { "entropy": 1.5540617489814759, "epoch": 1.7268334280841389, "grad_norm": 1.4902048110961914, "learning_rate": 5.577819857631539e-06, "loss": 1.1201, "mean_token_accuracy": 0.7639645302295685, "num_tokens": 896142711.0, "step": 36450 }, { "entropy": 1.554260642528534, "epoch": 1.729202198218685, "grad_norm": 1.2376636266708374, "learning_rate": 5.483319897144257e-06, "loss": 1.141, "mean_token_accuracy": 0.7609711056947708, "num_tokens": 897387745.0, "step": 36500 }, { "entropy": 1.5512582790851592, "epoch": 1.731570968353231, "grad_norm": 1.0070257186889648, "learning_rate": 5.389580844161491e-06, "loss": 1.151, "mean_token_accuracy": 0.7582071113586426, "num_tokens": 898612694.0, "step": 36550 }, { "entropy": 1.5260178673267364, "epoch": 1.733939738487777, "grad_norm": 1.035585880279541, "learning_rate": 5.296604300930968e-06, "loss": 1.1097, "mean_token_accuracy": 0.7681008791923523, "num_tokens": 899864115.0, "step": 36600 }, { "entropy": 1.557324800491333, "epoch": 1.7363085086223233, "grad_norm": 1.2301568984985352, "learning_rate": 5.204391856667101e-06, "loss": 1.1191, "mean_token_accuracy": 0.7642790126800537, "num_tokens": 901100268.0, "step": 36650 }, { "entropy": 1.5380194628238677, "epoch": 1.7386772787568694, "grad_norm": 1.246462345123291, "learning_rate": 5.112945087523824e-06, "loss": 1.1108, "mean_token_accuracy": 0.7644780373573303, "num_tokens": 902310249.0, "step": 36700 }, { "entropy": 1.5637565624713898, "epoch": 1.7410460488914157, "grad_norm": 1.1634399890899658, "learning_rate": 5.022265556567668e-06, "loss": 1.1319, "mean_token_accuracy": 0.7625255084037781, "num_tokens": 903523545.0, "step": 36750 }, { "entropy": 1.5375991368293762, "epoch": 1.7434148190259617, "grad_norm": 1.3280473947525024, "learning_rate": 4.9323548137510555e-06, "loss": 1.1053, "mean_token_accuracy": 0.7662223023176193, "num_tokens": 904774364.0, "step": 36800 }, { "entropy": 1.5782112526893615, "epoch": 1.7457835891605078, "grad_norm": 1.3013827800750732, "learning_rate": 4.843214395885776e-06, "loss": 1.1594, "mean_token_accuracy": 0.758129763007164, "num_tokens": 906007167.0, "step": 36850 }, { "entropy": 1.541445196866989, "epoch": 1.7481523592950539, "grad_norm": 0.9861883521080017, "learning_rate": 4.754845826616727e-06, "loss": 1.1442, "mean_token_accuracy": 0.7601429998874665, "num_tokens": 907201311.0, "step": 36900 }, { "entropy": 1.5591549813747405, "epoch": 1.7505211294296001, "grad_norm": 1.1912263631820679, "learning_rate": 4.667250616395885e-06, "loss": 1.1229, "mean_token_accuracy": 0.7642272913455963, "num_tokens": 908429689.0, "step": 36950 }, { "entropy": 1.53731192111969, "epoch": 1.7528898995641464, "grad_norm": 1.2835556268692017, "learning_rate": 4.580430262456503e-06, "loss": 1.0855, "mean_token_accuracy": 0.770037140250206, "num_tokens": 909656463.0, "step": 37000 }, { "epoch": 1.7528898995641464, "eval_entropy": 1.0678023306651703, "eval_loss": NaN, "eval_mean_token_accuracy": 0.7609675386542278, "eval_num_tokens": 909656463.0, "eval_runtime": 744.4803, "eval_samples_per_second": 33.332, "eval_steps_per_second": 4.167, "step": 37000 }, { "entropy": 1.555112097263336, "epoch": 1.7552586696986925, "grad_norm": 1.195916771888733, "learning_rate": 4.4943862487874575e-06, "loss": 1.1449, "mean_token_accuracy": 0.7592443466186524, "num_tokens": 910867530.0, "step": 37050 }, { "entropy": 1.5418341505527495, "epoch": 1.7576274398332385, "grad_norm": 1.139011025428772, "learning_rate": 4.409120046107945e-06, "loss": 1.1017, "mean_token_accuracy": 0.7676555049419403, "num_tokens": 912088709.0, "step": 37100 }, { "entropy": 1.5542387223243714, "epoch": 1.7599962099677846, "grad_norm": 1.1180921792984009, "learning_rate": 4.324633111842308e-06, "loss": 1.1473, "mean_token_accuracy": 0.759157150387764, "num_tokens": 913332056.0, "step": 37150 }, { "entropy": 1.5596051335334777, "epoch": 1.762364980102331, "grad_norm": 1.006624460220337, "learning_rate": 4.240926890095148e-06, "loss": 1.1482, "mean_token_accuracy": 0.7598807489871979, "num_tokens": 914537591.0, "step": 37200 }, { "entropy": 1.5528207927942277, "epoch": 1.7647337502368772, "grad_norm": 0.971926748752594, "learning_rate": 4.158002811626621e-06, "loss": 1.1571, "mean_token_accuracy": 0.7576300024986267, "num_tokens": 915743333.0, "step": 37250 }, { "entropy": 1.5758486306667328, "epoch": 1.7671025203714232, "grad_norm": 1.1977986097335815, "learning_rate": 4.075862293827986e-06, "loss": 1.1495, "mean_token_accuracy": 0.7577051311731339, "num_tokens": 916959683.0, "step": 37300 }, { "entropy": 1.544589899778366, "epoch": 1.7694712905059693, "grad_norm": 1.3282675743103027, "learning_rate": 3.994506740697407e-06, "loss": 1.1269, "mean_token_accuracy": 0.7617994117736816, "num_tokens": 918211320.0, "step": 37350 }, { "entropy": 1.5542384481430054, "epoch": 1.7718400606405154, "grad_norm": 1.4619874954223633, "learning_rate": 3.9139375428159095e-06, "loss": 1.1173, "mean_token_accuracy": 0.7629494529962539, "num_tokens": 919446181.0, "step": 37400 }, { "entropy": 1.5291326987743377, "epoch": 1.7742088307750616, "grad_norm": 1.3056997060775757, "learning_rate": 3.834156077323636e-06, "loss": 1.0887, "mean_token_accuracy": 0.7687935763597489, "num_tokens": 920685182.0, "step": 37450 }, { "entropy": 1.5420164275169372, "epoch": 1.7765776009096077, "grad_norm": 1.2205777168273926, "learning_rate": 3.7551637078963085e-06, "loss": 1.1142, "mean_token_accuracy": 0.7653926169872284, "num_tokens": 921939020.0, "step": 37500 }, { "entropy": 1.567058709859848, "epoch": 1.778946371044154, "grad_norm": 0.9547618627548218, "learning_rate": 3.6769617847219164e-06, "loss": 1.1223, "mean_token_accuracy": 0.7639624851942063, "num_tokens": 923177265.0, "step": 37550 }, { "entropy": 1.5774991846084594, "epoch": 1.7813151411787, "grad_norm": 1.2139365673065186, "learning_rate": 3.5995516444776276e-06, "loss": 1.1457, "mean_token_accuracy": 0.7596712547540665, "num_tokens": 924378635.0, "step": 37600 }, { "entropy": 1.5741923189163207, "epoch": 1.783683911313246, "grad_norm": 1.3455299139022827, "learning_rate": 3.5229346103069547e-06, "loss": 1.1265, "mean_token_accuracy": 0.7622561120986938, "num_tokens": 925558387.0, "step": 37650 }, { "entropy": 1.5316821897029878, "epoch": 1.7860526814477922, "grad_norm": 1.833621859550476, "learning_rate": 3.4471119917971473e-06, "loss": 1.1108, "mean_token_accuracy": 0.7637511855363845, "num_tokens": 926797544.0, "step": 37700 }, { "entropy": 1.5604101026058197, "epoch": 1.7884214515823385, "grad_norm": 1.2970396280288696, "learning_rate": 3.3720850849567944e-06, "loss": 1.112, "mean_token_accuracy": 0.7661514669656754, "num_tokens": 928034501.0, "step": 37750 }, { "entropy": 1.543112144470215, "epoch": 1.7907902217168847, "grad_norm": 0.9984686970710754, "learning_rate": 3.297855172193659e-06, "loss": 1.1264, "mean_token_accuracy": 0.7631747448444366, "num_tokens": 929281453.0, "step": 37800 }, { "entropy": 1.5535426819324494, "epoch": 1.7931589918514308, "grad_norm": 1.123579740524292, "learning_rate": 3.22442352229278e-06, "loss": 1.1449, "mean_token_accuracy": 0.760178684592247, "num_tokens": 930526986.0, "step": 37850 }, { "entropy": 1.533100154399872, "epoch": 1.7955277619859769, "grad_norm": 1.1597360372543335, "learning_rate": 3.1517913903947707e-06, "loss": 1.1216, "mean_token_accuracy": 0.7638274627923965, "num_tokens": 931736763.0, "step": 37900 }, { "entropy": 1.5554070842266083, "epoch": 1.797896532120523, "grad_norm": 1.2038190364837646, "learning_rate": 3.0799600179743927e-06, "loss": 1.1308, "mean_token_accuracy": 0.7614258807897568, "num_tokens": 932923483.0, "step": 37950 }, { "entropy": 1.540804421901703, "epoch": 1.8002653022550692, "grad_norm": 1.0928473472595215, "learning_rate": 3.00893063281929e-06, "loss": 1.1338, "mean_token_accuracy": 0.7619293278455734, "num_tokens": 934170514.0, "step": 38000 }, { "epoch": 1.8002653022550692, "eval_entropy": 1.0719800475410766, "eval_loss": NaN, "eval_mean_token_accuracy": 0.7611359905939115, "eval_num_tokens": 934170514.0, "eval_runtime": 723.3133, "eval_samples_per_second": 34.307, "eval_steps_per_second": 4.289, "step": 38000 }, { "entropy": 1.5533955585956574, "epoch": 1.8026340723896153, "grad_norm": 1.0474472045898438, "learning_rate": 2.9387044490090385e-06, "loss": 1.1715, "mean_token_accuracy": 0.7546948331594467, "num_tokens": 935423080.0, "step": 38050 }, { "entropy": 1.5497667694091797, "epoch": 1.8050028425241615, "grad_norm": 1.1220715045928955, "learning_rate": 2.869282666894402e-06, "loss": 1.1327, "mean_token_accuracy": 0.7612757116556168, "num_tokens": 936650729.0, "step": 38100 }, { "entropy": 1.5674825513362884, "epoch": 1.8073716126587076, "grad_norm": 1.4526468515396118, "learning_rate": 2.8006664730767683e-06, "loss": 1.1427, "mean_token_accuracy": 0.7606144285202027, "num_tokens": 937892138.0, "step": 38150 }, { "entropy": 1.5554044562578202, "epoch": 1.8097403827932537, "grad_norm": 1.2976021766662598, "learning_rate": 2.7328570403879205e-06, "loss": 1.1397, "mean_token_accuracy": 0.7616138017177582, "num_tokens": 939126149.0, "step": 38200 }, { "entropy": 1.568613636493683, "epoch": 1.8121091529277997, "grad_norm": 1.0613607168197632, "learning_rate": 2.665855527869948e-06, "loss": 1.1463, "mean_token_accuracy": 0.759678093791008, "num_tokens": 940346079.0, "step": 38250 }, { "entropy": 1.5337522840499878, "epoch": 1.814477923062346, "grad_norm": 1.267045259475708, "learning_rate": 2.59966308075546e-06, "loss": 1.1493, "mean_token_accuracy": 0.7598909741640091, "num_tokens": 941597120.0, "step": 38300 }, { "entropy": 1.5623528015613557, "epoch": 1.8168466931968923, "grad_norm": 1.3286635875701904, "learning_rate": 2.5342808304479993e-06, "loss": 1.1713, "mean_token_accuracy": 0.754482525587082, "num_tokens": 942822640.0, "step": 38350 }, { "entropy": 1.5387673115730285, "epoch": 1.8192154633314384, "grad_norm": 1.1870768070220947, "learning_rate": 2.46970989450272e-06, "loss": 1.1121, "mean_token_accuracy": 0.7645811969041825, "num_tokens": 944054652.0, "step": 38400 }, { "entropy": 1.5411259424686432, "epoch": 1.8215842334659844, "grad_norm": 1.487240195274353, "learning_rate": 2.405951376607257e-06, "loss": 1.0865, "mean_token_accuracy": 0.7703835678100586, "num_tokens": 945284822.0, "step": 38450 }, { "entropy": 1.586391316652298, "epoch": 1.8239530036005305, "grad_norm": 1.198615312576294, "learning_rate": 2.3430063665628943e-06, "loss": 1.163, "mean_token_accuracy": 0.7574268835783005, "num_tokens": 946506870.0, "step": 38500 }, { "entropy": 1.5495548892021178, "epoch": 1.8263217737350768, "grad_norm": 1.2452329397201538, "learning_rate": 2.280875940265903e-06, "loss": 1.1172, "mean_token_accuracy": 0.7652907830476761, "num_tokens": 947752324.0, "step": 38550 }, { "entropy": 1.5577371573448182, "epoch": 1.828690543869623, "grad_norm": 1.1785380840301514, "learning_rate": 2.2195611596891872e-06, "loss": 1.113, "mean_token_accuracy": 0.7650933820009231, "num_tokens": 948980177.0, "step": 38600 }, { "entropy": 1.5553138053417206, "epoch": 1.831059314004169, "grad_norm": 1.3556625843048096, "learning_rate": 2.159063072864087e-06, "loss": 1.1187, "mean_token_accuracy": 0.7657572621107102, "num_tokens": 950168267.0, "step": 38650 }, { "entropy": 1.5441812425851822, "epoch": 1.8334280841387152, "grad_norm": 1.2191582918167114, "learning_rate": 2.09938271386253e-06, "loss": 1.132, "mean_token_accuracy": 0.7622571617364884, "num_tokens": 951380706.0, "step": 38700 }, { "entropy": 1.5715741848945617, "epoch": 1.8357968542732612, "grad_norm": 1.22894287109375, "learning_rate": 2.040521102779286e-06, "loss": 1.1266, "mean_token_accuracy": 0.7630192279815674, "num_tokens": 952592270.0, "step": 38750 }, { "entropy": 1.5562168991565704, "epoch": 1.8381656244078075, "grad_norm": 1.5664132833480835, "learning_rate": 1.982479245714569e-06, "loss": 1.1185, "mean_token_accuracy": 0.765923129916191, "num_tokens": 953815987.0, "step": 38800 }, { "entropy": 1.574343602657318, "epoch": 1.8405343945423536, "grad_norm": 1.1616158485412598, "learning_rate": 1.925258134756858e-06, "loss": 1.1508, "mean_token_accuracy": 0.7590267878770828, "num_tokens": 955053412.0, "step": 38850 }, { "entropy": 1.5616822016239167, "epoch": 1.8429031646768999, "grad_norm": 1.0465819835662842, "learning_rate": 1.8688587479658793e-06, "loss": 1.1668, "mean_token_accuracy": 0.7562251263856887, "num_tokens": 956294592.0, "step": 38900 }, { "entropy": 1.533266224861145, "epoch": 1.845271934811446, "grad_norm": 1.3192518949508667, "learning_rate": 1.8132820493559521e-06, "loss": 1.1001, "mean_token_accuracy": 0.7673191577196121, "num_tokens": 957522550.0, "step": 38950 }, { "entropy": 1.552718700170517, "epoch": 1.847640704945992, "grad_norm": 1.5773288011550903, "learning_rate": 1.758528988879471e-06, "loss": 1.1048, "mean_token_accuracy": 0.76726045191288, "num_tokens": 958757267.0, "step": 39000 }, { "epoch": 1.847640704945992, "eval_entropy": 1.0707699135186055, "eval_loss": NaN, "eval_mean_token_accuracy": 0.7611362978498987, "eval_num_tokens": 958757267.0, "eval_runtime": 728.2004, "eval_samples_per_second": 34.077, "eval_steps_per_second": 4.26, "step": 39000 }, { "entropy": 1.5483810114860534, "epoch": 1.850009475080538, "grad_norm": 1.4050700664520264, "learning_rate": 1.704600502410686e-06, "loss": 1.1036, "mean_token_accuracy": 0.7662443941831589, "num_tokens": 959971891.0, "step": 39050 }, { "entropy": 1.538731288909912, "epoch": 1.8523782452150843, "grad_norm": 1.1801916360855103, "learning_rate": 1.6514975117296994e-06, "loss": 1.101, "mean_token_accuracy": 0.7680959284305573, "num_tokens": 961210335.0, "step": 39100 }, { "entropy": 1.5468962013721466, "epoch": 1.8547470153496306, "grad_norm": 1.2052723169326782, "learning_rate": 1.599220924506728e-06, "loss": 1.1602, "mean_token_accuracy": 0.7576704436540603, "num_tokens": 962476786.0, "step": 39150 }, { "entropy": 1.5435814583301544, "epoch": 1.8571157854841767, "grad_norm": 1.056043267250061, "learning_rate": 1.547771634286549e-06, "loss": 1.1059, "mean_token_accuracy": 0.7669939565658569, "num_tokens": 963730472.0, "step": 39200 }, { "entropy": 1.557974625825882, "epoch": 1.8594845556187227, "grad_norm": 1.2635796070098877, "learning_rate": 1.4971505204732673e-06, "loss": 1.1212, "mean_token_accuracy": 0.7648367810249329, "num_tokens": 964964095.0, "step": 39250 }, { "entropy": 1.5005019557476045, "epoch": 1.8618533257532688, "grad_norm": 1.2261488437652588, "learning_rate": 1.4473584483152614e-06, "loss": 1.0945, "mean_token_accuracy": 0.7694985699653626, "num_tokens": 966211342.0, "step": 39300 }, { "entropy": 1.595631295442581, "epoch": 1.864222095887815, "grad_norm": 1.1187533140182495, "learning_rate": 1.3983962688904062e-06, "loss": 1.1547, "mean_token_accuracy": 0.7579811322689056, "num_tokens": 967415509.0, "step": 39350 }, { "entropy": 1.5132013654708862, "epoch": 1.8665908660223614, "grad_norm": 1.1046701669692993, "learning_rate": 1.3502648190915124e-06, "loss": 1.1251, "mean_token_accuracy": 0.7633352410793305, "num_tokens": 968645582.0, "step": 39400 }, { "entropy": 1.5683543026447295, "epoch": 1.8689596361569074, "grad_norm": 0.9930199384689331, "learning_rate": 1.3029649216120376e-06, "loss": 1.1359, "mean_token_accuracy": 0.7611577039957047, "num_tokens": 969861208.0, "step": 39450 }, { "entropy": 1.5816809368133544, "epoch": 1.8713284062914535, "grad_norm": 1.0561200380325317, "learning_rate": 1.2564973849320204e-06, "loss": 1.14, "mean_token_accuracy": 0.7613069009780884, "num_tokens": 971090749.0, "step": 39500 }, { "entropy": 1.5379055535793305, "epoch": 1.8736971764259995, "grad_norm": 1.1951854228973389, "learning_rate": 1.2108630033042412e-06, "loss": 1.1165, "mean_token_accuracy": 0.7650814574956893, "num_tokens": 972346968.0, "step": 39550 }, { "entropy": 1.5865603411197662, "epoch": 1.8760659465605458, "grad_norm": 1.2471119165420532, "learning_rate": 1.1660625567406768e-06, "loss": 1.1328, "mean_token_accuracy": 0.7614058357477188, "num_tokens": 973571764.0, "step": 39600 }, { "entropy": 1.5396224319934846, "epoch": 1.8784347166950919, "grad_norm": 1.348791241645813, "learning_rate": 1.1220968109991515e-06, "loss": 1.0901, "mean_token_accuracy": 0.769990593791008, "num_tokens": 974799757.0, "step": 39650 }, { "entropy": 1.553302252292633, "epoch": 1.8808034868296382, "grad_norm": 1.2576488256454468, "learning_rate": 1.0789665175702456e-06, "loss": 1.1081, "mean_token_accuracy": 0.7674774092435837, "num_tokens": 976007380.0, "step": 39700 }, { "entropy": 1.5356010353565217, "epoch": 1.8831722569641842, "grad_norm": 1.2327417135238647, "learning_rate": 1.036672413664458e-06, "loss": 1.1085, "mean_token_accuracy": 0.7655727046728135, "num_tokens": 977252116.0, "step": 39750 }, { "entropy": 1.5608667409420014, "epoch": 1.8855410270987303, "grad_norm": 1.2925286293029785, "learning_rate": 9.952152221996024e-07, "loss": 1.1201, "mean_token_accuracy": 0.7642844372987747, "num_tokens": 978457729.0, "step": 39800 }, { "entropy": 1.5483964371681214, "epoch": 1.8879097972332763, "grad_norm": 1.4176242351531982, "learning_rate": 9.54595651788448e-07, "loss": 1.1321, "mean_token_accuracy": 0.7614534211158752, "num_tokens": 979721797.0, "step": 39850 }, { "entropy": 1.556614215373993, "epoch": 1.8902785673678226, "grad_norm": 1.2233829498291016, "learning_rate": 9.148143967266209e-07, "loss": 1.1657, "mean_token_accuracy": 0.7561358803510666, "num_tokens": 980985021.0, "step": 39900 }, { "entropy": 1.5509166061878203, "epoch": 1.892647337502369, "grad_norm": 1.1454182863235474, "learning_rate": 8.758721369807099e-07, "loss": 1.103, "mean_token_accuracy": 0.7666506910324097, "num_tokens": 982221836.0, "step": 39950 }, { "entropy": 1.5321409046649932, "epoch": 1.895016107636915, "grad_norm": 1.1777846813201904, "learning_rate": 8.377695381766804e-07, "loss": 1.1016, "mean_token_accuracy": 0.7675345009565353, "num_tokens": 983496374.0, "step": 40000 }, { "epoch": 1.895016107636915, "eval_entropy": 1.0718190068778954, "eval_loss": NaN, "eval_mean_token_accuracy": 0.7612931992664097, "eval_num_tokens": 983496374.0, "eval_runtime": 728.5673, "eval_samples_per_second": 34.06, "eval_steps_per_second": 4.258, "step": 40000 }, { "entropy": 1.563276962041855, "epoch": 1.897384877771461, "grad_norm": 1.0903818607330322, "learning_rate": 8.00507251588456e-07, "loss": 1.1352, "mean_token_accuracy": 0.761111610531807, "num_tokens": 984743304.0, "step": 40050 }, { "entropy": 1.5520950222015382, "epoch": 1.899753647906007, "grad_norm": 1.4384498596191406, "learning_rate": 7.64085914126822e-07, "loss": 1.105, "mean_token_accuracy": 0.7678878873586654, "num_tokens": 985955766.0, "step": 40100 }, { "entropy": 1.522156765460968, "epoch": 1.9021224180405534, "grad_norm": 1.1247589588165283, "learning_rate": 7.285061483285227e-07, "loss": 1.0875, "mean_token_accuracy": 0.7697209113836289, "num_tokens": 987194885.0, "step": 40150 }, { "entropy": 1.5538606441020966, "epoch": 1.9044911881750994, "grad_norm": 1.1288164854049683, "learning_rate": 6.937685623456147e-07, "loss": 1.1135, "mean_token_accuracy": 0.7657132083177567, "num_tokens": 988419847.0, "step": 40200 }, { "entropy": 1.5719120705127716, "epoch": 1.9068599583096457, "grad_norm": 1.0764816999435425, "learning_rate": 6.598737499350915e-07, "loss": 1.1339, "mean_token_accuracy": 0.7616824221611023, "num_tokens": 989644278.0, "step": 40250 }, { "entropy": 1.531646077632904, "epoch": 1.9092287284441918, "grad_norm": 1.442927360534668, "learning_rate": 6.268222904487087e-07, "loss": 1.1163, "mean_token_accuracy": 0.7660135048627853, "num_tokens": 990874401.0, "step": 40300 }, { "entropy": 1.5490214943885803, "epoch": 1.9115974985787378, "grad_norm": 1.215856671333313, "learning_rate": 5.946147488231135e-07, "loss": 1.1019, "mean_token_accuracy": 0.7664029818773269, "num_tokens": 992095108.0, "step": 40350 }, { "entropy": 1.5366157114505767, "epoch": 1.913966268713284, "grad_norm": 1.2367199659347534, "learning_rate": 5.632516755701588e-07, "loss": 1.1322, "mean_token_accuracy": 0.763009768128395, "num_tokens": 993348612.0, "step": 40400 }, { "entropy": 1.5416124892234802, "epoch": 1.9163350388478302, "grad_norm": 1.5094550848007202, "learning_rate": 5.327336067674992e-07, "loss": 1.121, "mean_token_accuracy": 0.7645821911096573, "num_tokens": 994580681.0, "step": 40450 }, { "entropy": 1.5396121156215667, "epoch": 1.9187038089823765, "grad_norm": 1.1620501279830933, "learning_rate": 5.030610640494427e-07, "loss": 1.0964, "mean_token_accuracy": 0.7677625626325607, "num_tokens": 995811651.0, "step": 40500 }, { "entropy": 1.551676151752472, "epoch": 1.9210725791169225, "grad_norm": 1.1541757583618164, "learning_rate": 4.7423455459803536e-07, "loss": 1.1273, "mean_token_accuracy": 0.7630288958549499, "num_tokens": 997009574.0, "step": 40550 }, { "entropy": 1.5580232727527619, "epoch": 1.9234413492514686, "grad_norm": 1.1015989780426025, "learning_rate": 4.46254571134358e-07, "loss": 1.1074, "mean_token_accuracy": 0.7681276690959931, "num_tokens": 998241396.0, "step": 40600 }, { "entropy": 1.5923674273490906, "epoch": 1.9258101193860147, "grad_norm": 1.3336706161499023, "learning_rate": 4.1912159191015433e-07, "loss": 1.1296, "mean_token_accuracy": 0.7642046666145325, "num_tokens": 999467429.0, "step": 40650 }, { "entropy": 1.5437606346607209, "epoch": 1.928178889520561, "grad_norm": 1.2162717580795288, "learning_rate": 3.928360806996212e-07, "loss": 1.1328, "mean_token_accuracy": 0.7629256331920624, "num_tokens": 1000699068.0, "step": 40700 }, { "entropy": 1.5417852425575256, "epoch": 1.9305476596551072, "grad_norm": 1.6152098178863525, "learning_rate": 3.673984867914815e-07, "loss": 1.1152, "mean_token_accuracy": 0.7649567657709122, "num_tokens": 1001935400.0, "step": 40750 }, { "entropy": 1.5360798180103301, "epoch": 1.9329164297896533, "grad_norm": 1.1792229413986206, "learning_rate": 3.4280924498132917e-07, "loss": 1.0897, "mean_token_accuracy": 0.7702373021841049, "num_tokens": 1003148072.0, "step": 40800 }, { "entropy": 1.5614196360111237, "epoch": 1.9352851999241993, "grad_norm": 1.244520664215088, "learning_rate": 3.1906877556417414e-07, "loss": 1.1636, "mean_token_accuracy": 0.7582697266340256, "num_tokens": 1004371846.0, "step": 40850 }, { "entropy": 1.5311019134521484, "epoch": 1.9376539700587454, "grad_norm": 1.064553141593933, "learning_rate": 2.961774843272702e-07, "loss": 1.0873, "mean_token_accuracy": 0.7693702638149261, "num_tokens": 1005607072.0, "step": 40900 }, { "entropy": 1.5764271855354308, "epoch": 1.9400227401932917, "grad_norm": 1.3938215970993042, "learning_rate": 2.7413576254317065e-07, "loss": 1.1587, "mean_token_accuracy": 0.7578467607498169, "num_tokens": 1006850136.0, "step": 40950 }, { "entropy": 1.5480285215377807, "epoch": 1.9423915103278377, "grad_norm": 1.2054554224014282, "learning_rate": 2.529439869630612e-07, "loss": 1.1434, "mean_token_accuracy": 0.7606914877891541, "num_tokens": 1008107025.0, "step": 41000 }, { "epoch": 1.9423915103278377, "eval_entropy": 1.068777510758218, "eval_loss": NaN, "eval_mean_token_accuracy": 0.7612478421396935, "eval_num_tokens": 1008107025.0, "eval_runtime": 729.2965, "eval_samples_per_second": 34.026, "eval_steps_per_second": 4.253, "step": 41000 }, { "entropy": 1.5319510400295258, "epoch": 1.944760280462384, "grad_norm": 1.2180676460266113, "learning_rate": 2.326025198102877e-07, "loss": 1.1041, "mean_token_accuracy": 0.7670031028985977, "num_tokens": 1009369117.0, "step": 41050 }, { "entropy": 1.5614441645145416, "epoch": 1.94712905059693, "grad_norm": 1.0921282768249512, "learning_rate": 2.1311170877418296e-07, "loss": 1.1394, "mean_token_accuracy": 0.7615219783782959, "num_tokens": 1010587367.0, "step": 41100 }, { "entropy": 1.5393009448051453, "epoch": 1.9494978207314761, "grad_norm": 0.9701796770095825, "learning_rate": 1.9447188700413287e-07, "loss": 1.097, "mean_token_accuracy": 0.7700717490911484, "num_tokens": 1011833377.0, "step": 41150 }, { "entropy": 1.541386902332306, "epoch": 1.9518665908660222, "grad_norm": 1.24893319606781, "learning_rate": 1.7668337310386418e-07, "loss": 1.1177, "mean_token_accuracy": 0.7653918391466141, "num_tokens": 1013068815.0, "step": 41200 }, { "entropy": 1.5547258961200714, "epoch": 1.9542353610005685, "grad_norm": 1.079546570777893, "learning_rate": 1.5974647112600994e-07, "loss": 1.123, "mean_token_accuracy": 0.7637561255693436, "num_tokens": 1014274097.0, "step": 41250 }, { "entropy": 1.5249077999591827, "epoch": 1.9566041311351148, "grad_norm": 1.5189976692199707, "learning_rate": 1.436614705669026e-07, "loss": 1.1098, "mean_token_accuracy": 0.7675369191169739, "num_tokens": 1015545120.0, "step": 41300 }, { "entropy": 1.569046869277954, "epoch": 1.9589729012696608, "grad_norm": 1.0235174894332886, "learning_rate": 1.2842864636164464e-07, "loss": 1.1425, "mean_token_accuracy": 0.7613073486089706, "num_tokens": 1016784041.0, "step": 41350 }, { "entropy": 1.5418232583999634, "epoch": 1.961341671404207, "grad_norm": 1.1395779848098755, "learning_rate": 1.1404825887937898e-07, "loss": 1.1096, "mean_token_accuracy": 0.7651181477308273, "num_tokens": 1018023715.0, "step": 41400 }, { "entropy": 1.5499148654937744, "epoch": 1.963710441538753, "grad_norm": 1.103959560394287, "learning_rate": 1.0052055391887027e-07, "loss": 1.1536, "mean_token_accuracy": 0.7575215709209442, "num_tokens": 1019247827.0, "step": 41450 }, { "entropy": 1.5448267459869385, "epoch": 1.9660792116732992, "grad_norm": 1.159401297569275, "learning_rate": 8.784576270428058e-08, "loss": 1.1252, "mean_token_accuracy": 0.7638620465993882, "num_tokens": 1020467713.0, "step": 41500 } ], "logging_steps": 50, "max_steps": 42216, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.1122127009405626e+20, "train_batch_size": 8, "trial_name": null, "trial_params": null }