{ "best_global_step": 6948, "best_metric": 5.525067329406738, "best_model_checkpoint": "./output/checkpoint-6948", "epoch": 4.0, "eval_steps": 500, "global_step": 6948, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 3.606692385673523, "epoch": 0.028785261945883708, "grad_norm": 3.2999913692474365, "learning_rate": 4.9e-07, "loss": 13.6598, "mean_token_accuracy": 0.16028020828962325, "num_tokens": 53993.0, "step": 50 }, { "entropy": 3.618675880432129, "epoch": 0.057570523891767415, "grad_norm": 3.101252555847168, "learning_rate": 9.9e-07, "loss": 14.0188, "mean_token_accuracy": 0.1508466500043869, "num_tokens": 110134.0, "step": 100 }, { "entropy": 3.5215235900878907, "epoch": 0.08635578583765112, "grad_norm": 3.513662815093994, "learning_rate": 1.49e-06, "loss": 12.8555, "mean_token_accuracy": 0.18527640983462335, "num_tokens": 160191.0, "step": 150 }, { "entropy": 3.667909698486328, "epoch": 0.11514104778353483, "grad_norm": 4.327610492706299, "learning_rate": 1.99e-06, "loss": 13.5394, "mean_token_accuracy": 0.157139780074358, "num_tokens": 214993.0, "step": 200 }, { "entropy": 3.768263258934021, "epoch": 0.14392630972941853, "grad_norm": 4.290107250213623, "learning_rate": 1.988450206246317e-06, "loss": 12.8912, "mean_token_accuracy": 0.17374794125556947, "num_tokens": 268184.0, "step": 250 }, { "entropy": 3.990619196891785, "epoch": 0.17271157167530224, "grad_norm": 4.444278717041016, "learning_rate": 1.976664702416028e-06, "loss": 12.455, "mean_token_accuracy": 0.17780130118131637, "num_tokens": 319458.0, "step": 300 }, { "entropy": 4.162646284103394, "epoch": 0.20149683362118595, "grad_norm": 5.615262508392334, "learning_rate": 1.9648791985857395e-06, "loss": 12.0893, "mean_token_accuracy": 0.18191319867968558, "num_tokens": 373337.0, "step": 350 }, { "entropy": 4.532100868225098, "epoch": 0.23028209556706966, "grad_norm": 10.074016571044922, "learning_rate": 1.9530936947554507e-06, "loss": 11.9261, "mean_token_accuracy": 0.169477596282959, "num_tokens": 427526.0, "step": 400 }, { "entropy": 4.923871030807495, "epoch": 0.25906735751295334, "grad_norm": 16.220163345336914, "learning_rate": 1.9413081909251622e-06, "loss": 11.0048, "mean_token_accuracy": 0.1704501649737358, "num_tokens": 480528.0, "step": 450 }, { "entropy": 5.521005854606629, "epoch": 0.28785261945883706, "grad_norm": 29.904008865356445, "learning_rate": 1.9295226870948733e-06, "loss": 9.6524, "mean_token_accuracy": 0.16450899541378022, "num_tokens": 535314.0, "step": 500 }, { "entropy": 6.092623329162597, "epoch": 0.31663788140472077, "grad_norm": 17.821575164794922, "learning_rate": 1.9177371832645845e-06, "loss": 8.1054, "mean_token_accuracy": 0.17205011785030366, "num_tokens": 588410.0, "step": 550 }, { "entropy": 6.385262680053711, "epoch": 0.3454231433506045, "grad_norm": 5.502202987670898, "learning_rate": 1.9059516794342958e-06, "loss": 7.4313, "mean_token_accuracy": 0.1734227080643177, "num_tokens": 641736.0, "step": 600 }, { "entropy": 6.278562617301941, "epoch": 0.3742084052964882, "grad_norm": 5.4657697677612305, "learning_rate": 1.8941661756040071e-06, "loss": 6.9266, "mean_token_accuracy": 0.18680249139666558, "num_tokens": 692200.0, "step": 650 }, { "entropy": 6.553266277313233, "epoch": 0.4029936672423719, "grad_norm": 4.955812931060791, "learning_rate": 1.8823806717737183e-06, "loss": 6.9847, "mean_token_accuracy": 0.16679802387952805, "num_tokens": 745830.0, "step": 700 }, { "entropy": 6.470935583114624, "epoch": 0.4317789291882556, "grad_norm": 4.198381423950195, "learning_rate": 1.8705951679434296e-06, "loss": 6.7277, "mean_token_accuracy": 0.17847734570503235, "num_tokens": 798872.0, "step": 750 }, { "entropy": 6.5620588779449465, "epoch": 0.4605641911341393, "grad_norm": 3.1793746948242188, "learning_rate": 1.8588096641131407e-06, "loss": 6.7032, "mean_token_accuracy": 0.17336134731769562, "num_tokens": 853045.0, "step": 800 }, { "entropy": 6.532204885482788, "epoch": 0.48934945308002303, "grad_norm": 3.824537515640259, "learning_rate": 1.847024160282852e-06, "loss": 6.5762, "mean_token_accuracy": 0.1805124071240425, "num_tokens": 907679.0, "step": 850 }, { "entropy": 6.535988225936889, "epoch": 0.5181347150259067, "grad_norm": 4.350001811981201, "learning_rate": 1.8352386564525632e-06, "loss": 6.505, "mean_token_accuracy": 0.1842605724930763, "num_tokens": 964170.0, "step": 900 }, { "entropy": 6.204533562660218, "epoch": 0.5469199769717904, "grad_norm": 2.193660020828247, "learning_rate": 1.8234531526222745e-06, "loss": 6.1211, "mean_token_accuracy": 0.21968430042266845, "num_tokens": 1015909.0, "step": 950 }, { "entropy": 6.308737449645996, "epoch": 0.5757052389176741, "grad_norm": 2.325622320175171, "learning_rate": 1.8116676487919857e-06, "loss": 6.1653, "mean_token_accuracy": 0.21636426240205764, "num_tokens": 1068859.0, "step": 1000 }, { "entropy": 6.332560749053955, "epoch": 0.6044905008635578, "grad_norm": 2.0439090728759766, "learning_rate": 1.799882144961697e-06, "loss": 6.1559, "mean_token_accuracy": 0.21859725564718246, "num_tokens": 1123202.0, "step": 1050 }, { "entropy": 6.042124252319336, "epoch": 0.6332757628094415, "grad_norm": 3.621903657913208, "learning_rate": 1.7880966411314081e-06, "loss": 5.8441, "mean_token_accuracy": 0.24906315237283708, "num_tokens": 1173403.0, "step": 1100 }, { "entropy": 5.921343173980713, "epoch": 0.6620610247553252, "grad_norm": 5.658033847808838, "learning_rate": 1.7763111373011195e-06, "loss": 5.7104, "mean_token_accuracy": 0.2625067520141602, "num_tokens": 1225026.0, "step": 1150 }, { "entropy": 6.093586492538452, "epoch": 0.690846286701209, "grad_norm": 2.4292995929718018, "learning_rate": 1.7645256334708308e-06, "loss": 5.8658, "mean_token_accuracy": 0.24842385441064835, "num_tokens": 1279013.0, "step": 1200 }, { "entropy": 6.119112596511841, "epoch": 0.7196315486470927, "grad_norm": 3.369384288787842, "learning_rate": 1.752740129640542e-06, "loss": 5.8784, "mean_token_accuracy": 0.24857850253582, "num_tokens": 1332547.0, "step": 1250 }, { "entropy": 6.025163550376892, "epoch": 0.7484168105929764, "grad_norm": 2.5110116004943848, "learning_rate": 1.7409546258102533e-06, "loss": 5.7769, "mean_token_accuracy": 0.25835376888513567, "num_tokens": 1385192.0, "step": 1300 }, { "entropy": 5.877259612083435, "epoch": 0.7772020725388601, "grad_norm": 2.4179303646087646, "learning_rate": 1.7291691219799646e-06, "loss": 5.6284, "mean_token_accuracy": 0.2756252554059029, "num_tokens": 1437071.0, "step": 1350 }, { "entropy": 6.002246947288513, "epoch": 0.8059873344847438, "grad_norm": 3.494359016418457, "learning_rate": 1.717383618149676e-06, "loss": 5.747, "mean_token_accuracy": 0.26462210685014725, "num_tokens": 1490818.0, "step": 1400 }, { "entropy": 5.991955623626709, "epoch": 0.8347725964306275, "grad_norm": 2.340975761413574, "learning_rate": 1.705598114319387e-06, "loss": 5.7379, "mean_token_accuracy": 0.26444981098175047, "num_tokens": 1544997.0, "step": 1450 }, { "entropy": 5.91768889427185, "epoch": 0.8635578583765112, "grad_norm": 2.2394514083862305, "learning_rate": 1.6938126104890984e-06, "loss": 5.6564, "mean_token_accuracy": 0.2730415526032448, "num_tokens": 1598302.0, "step": 1500 }, { "entropy": 5.982716989517212, "epoch": 0.8923431203223949, "grad_norm": 1.876839518547058, "learning_rate": 1.6820271066588098e-06, "loss": 5.7215, "mean_token_accuracy": 0.26642445534467696, "num_tokens": 1655267.0, "step": 1550 }, { "entropy": 5.820467872619629, "epoch": 0.9211283822682786, "grad_norm": 2.219966173171997, "learning_rate": 1.6702416028285209e-06, "loss": 5.5555, "mean_token_accuracy": 0.2856418335437775, "num_tokens": 1709199.0, "step": 1600 }, { "entropy": 5.996349005699158, "epoch": 0.9499136442141624, "grad_norm": 2.247213840484619, "learning_rate": 1.6584560989982322e-06, "loss": 5.7283, "mean_token_accuracy": 0.2696125540137291, "num_tokens": 1765443.0, "step": 1650 }, { "entropy": 5.696683068275451, "epoch": 0.9786989061600461, "grad_norm": 2.8499979972839355, "learning_rate": 1.6466705951679433e-06, "loss": 5.4335, "mean_token_accuracy": 0.29918427973985673, "num_tokens": 1817494.0, "step": 1700 }, { "epoch": 1.0, "eval_entropy": 5.993559589034401, "eval_loss": 5.737204551696777, "eval_mean_token_accuracy": 0.2618687468739699, "eval_model_preparation_time": 0.0045, "eval_num_tokens": 1856362.0, "eval_runtime": 50.5332, "eval_samples_per_second": 8.588, "eval_steps_per_second": 4.294, "step": 1737 }, { "entropy": 5.746842083930969, "epoch": 1.0074841681059297, "grad_norm": 2.33052921295166, "learning_rate": 1.6348850913376547e-06, "loss": 5.4796, "mean_token_accuracy": 0.2966849410533905, "num_tokens": 1870353.0, "step": 1750 }, { "entropy": 5.859029049873352, "epoch": 1.0362694300518134, "grad_norm": 1.6248886585235596, "learning_rate": 1.6230995875073658e-06, "loss": 5.5975, "mean_token_accuracy": 0.2838129925727844, "num_tokens": 1926205.0, "step": 1800 }, { "entropy": 5.731445336341858, "epoch": 1.065054691997697, "grad_norm": 1.6941566467285156, "learning_rate": 1.6113140836770771e-06, "loss": 5.476, "mean_token_accuracy": 0.2992346465587616, "num_tokens": 1979821.0, "step": 1850 }, { "entropy": 5.6993954515457155, "epoch": 1.0938399539435808, "grad_norm": 1.1746597290039062, "learning_rate": 1.5995285798467883e-06, "loss": 5.4608, "mean_token_accuracy": 0.3000726142525673, "num_tokens": 2034373.0, "step": 1900 }, { "entropy": 5.668873124122619, "epoch": 1.1226252158894645, "grad_norm": 1.728211760520935, "learning_rate": 1.5877430760164996e-06, "loss": 5.4347, "mean_token_accuracy": 0.3033922725915909, "num_tokens": 2087339.0, "step": 1950 }, { "entropy": 5.624621086120605, "epoch": 1.1514104778353482, "grad_norm": 1.4078539609909058, "learning_rate": 1.5759575721862107e-06, "loss": 5.3954, "mean_token_accuracy": 0.30784171640872954, "num_tokens": 2139520.0, "step": 2000 }, { "entropy": 5.7141213130950925, "epoch": 1.180195739781232, "grad_norm": 2.186459541320801, "learning_rate": 1.564172068355922e-06, "loss": 5.4847, "mean_token_accuracy": 0.29594049394130706, "num_tokens": 2193987.0, "step": 2050 }, { "entropy": 5.632415266036987, "epoch": 1.2089810017271156, "grad_norm": 1.3601349592208862, "learning_rate": 1.5523865645256334e-06, "loss": 5.4135, "mean_token_accuracy": 0.30366597563028336, "num_tokens": 2249616.0, "step": 2100 }, { "entropy": 5.510904269218445, "epoch": 1.2377662636729994, "grad_norm": 2.065760612487793, "learning_rate": 1.5406010606953445e-06, "loss": 5.2904, "mean_token_accuracy": 0.3211754837632179, "num_tokens": 2300863.0, "step": 2150 }, { "entropy": 5.703383626937867, "epoch": 1.266551525618883, "grad_norm": 1.1172698736190796, "learning_rate": 1.5288155568650559e-06, "loss": 5.4802, "mean_token_accuracy": 0.29713701367378237, "num_tokens": 2356029.0, "step": 2200 }, { "entropy": 5.565930342674255, "epoch": 1.2953367875647668, "grad_norm": 1.7528513669967651, "learning_rate": 1.5170300530347672e-06, "loss": 5.3518, "mean_token_accuracy": 0.31301232606172563, "num_tokens": 2408957.0, "step": 2250 }, { "entropy": 5.496430187225342, "epoch": 1.3241220495106505, "grad_norm": 1.892640233039856, "learning_rate": 1.5052445492044786e-06, "loss": 5.2967, "mean_token_accuracy": 0.3181899458169937, "num_tokens": 2462569.0, "step": 2300 }, { "entropy": 5.725150098800659, "epoch": 1.3529073114565342, "grad_norm": 1.774940848350525, "learning_rate": 1.4934590453741897e-06, "loss": 5.5215, "mean_token_accuracy": 0.29055028676986694, "num_tokens": 2518544.0, "step": 2350 }, { "entropy": 5.4884827613830565, "epoch": 1.381692573402418, "grad_norm": 2.2167599201202393, "learning_rate": 1.481673541543901e-06, "loss": 5.2917, "mean_token_accuracy": 0.31803421139717103, "num_tokens": 2570863.0, "step": 2400 }, { "entropy": 5.697079472541809, "epoch": 1.4104778353483016, "grad_norm": 1.6489030122756958, "learning_rate": 1.4698880377136124e-06, "loss": 5.4982, "mean_token_accuracy": 0.2925163987278938, "num_tokens": 2626998.0, "step": 2450 }, { "entropy": 5.46209939956665, "epoch": 1.4392630972941853, "grad_norm": 1.153914451599121, "learning_rate": 1.4581025338833235e-06, "loss": 5.2736, "mean_token_accuracy": 0.3182168474793434, "num_tokens": 2681568.0, "step": 2500 }, { "entropy": 5.4405768728256225, "epoch": 1.468048359240069, "grad_norm": 3.6614978313446045, "learning_rate": 1.4463170300530348e-06, "loss": 5.2515, "mean_token_accuracy": 0.3218736210465431, "num_tokens": 2733587.0, "step": 2550 }, { "entropy": 5.528175053596496, "epoch": 1.4968336211859528, "grad_norm": 1.0849746465682983, "learning_rate": 1.434531526222746e-06, "loss": 5.3378, "mean_token_accuracy": 0.31061659604310987, "num_tokens": 2787003.0, "step": 2600 }, { "entropy": 5.46110897064209, "epoch": 1.5256188831318365, "grad_norm": 1.8315683603286743, "learning_rate": 1.4227460223924573e-06, "loss": 5.2782, "mean_token_accuracy": 0.31781029611825945, "num_tokens": 2840263.0, "step": 2650 }, { "entropy": 5.455560960769653, "epoch": 1.5544041450777202, "grad_norm": 1.1859091520309448, "learning_rate": 1.4109605185621684e-06, "loss": 5.2735, "mean_token_accuracy": 0.3194814011454582, "num_tokens": 2894186.0, "step": 2700 }, { "entropy": 5.430496115684509, "epoch": 1.583189407023604, "grad_norm": 2.3500001430511475, "learning_rate": 1.3991750147318797e-06, "loss": 5.2464, "mean_token_accuracy": 0.32140792965888976, "num_tokens": 2948171.0, "step": 2750 }, { "entropy": 5.588023023605347, "epoch": 1.6119746689694876, "grad_norm": 1.727825403213501, "learning_rate": 1.3873895109015909e-06, "loss": 5.4028, "mean_token_accuracy": 0.3039530631899834, "num_tokens": 3002678.0, "step": 2800 }, { "entropy": 5.410525422096253, "epoch": 1.6407599309153713, "grad_norm": 1.3401474952697754, "learning_rate": 1.3756040070713022e-06, "loss": 5.2298, "mean_token_accuracy": 0.324065263569355, "num_tokens": 3055844.0, "step": 2850 }, { "entropy": 5.36959942817688, "epoch": 1.669545192861255, "grad_norm": 1.1892589330673218, "learning_rate": 1.3638185032410133e-06, "loss": 5.1956, "mean_token_accuracy": 0.32639502108097074, "num_tokens": 3108636.0, "step": 2900 }, { "entropy": 5.53826907157898, "epoch": 1.6983304548071387, "grad_norm": 1.2652360200881958, "learning_rate": 1.3520329994107247e-06, "loss": 5.3583, "mean_token_accuracy": 0.3074926760792732, "num_tokens": 3162627.0, "step": 2950 }, { "entropy": 5.417449145317078, "epoch": 1.7271157167530224, "grad_norm": 1.584312915802002, "learning_rate": 1.340247495580436e-06, "loss": 5.2388, "mean_token_accuracy": 0.32019727885723115, "num_tokens": 3216409.0, "step": 3000 }, { "entropy": 5.241390740871429, "epoch": 1.7559009786989062, "grad_norm": 1.5219439268112183, "learning_rate": 1.3284619917501471e-06, "loss": 5.0645, "mean_token_accuracy": 0.3445430138707161, "num_tokens": 3266967.0, "step": 3050 }, { "entropy": 5.405424036979675, "epoch": 1.7846862406447899, "grad_norm": 2.1165153980255127, "learning_rate": 1.3166764879198585e-06, "loss": 5.232, "mean_token_accuracy": 0.32085000157356264, "num_tokens": 3319877.0, "step": 3100 }, { "entropy": 5.123006024360657, "epoch": 1.8134715025906736, "grad_norm": 1.2189785242080688, "learning_rate": 1.3048909840895698e-06, "loss": 4.9582, "mean_token_accuracy": 0.356108532845974, "num_tokens": 3368569.0, "step": 3150 }, { "entropy": 5.417610831260681, "epoch": 1.8422567645365573, "grad_norm": 1.5157604217529297, "learning_rate": 1.2931054802592812e-06, "loss": 5.2454, "mean_token_accuracy": 0.31976755023002623, "num_tokens": 3422449.0, "step": 3200 }, { "entropy": 5.409690895080566, "epoch": 1.871042026482441, "grad_norm": 1.3088161945343018, "learning_rate": 1.2813199764289923e-06, "loss": 5.2348, "mean_token_accuracy": 0.32325415283441544, "num_tokens": 3474399.0, "step": 3250 }, { "entropy": 5.44662567615509, "epoch": 1.8998272884283247, "grad_norm": 2.178372621536255, "learning_rate": 1.2695344725987036e-06, "loss": 5.2661, "mean_token_accuracy": 0.3182847076654434, "num_tokens": 3527726.0, "step": 3300 }, { "entropy": 5.512614865303039, "epoch": 1.9286125503742084, "grad_norm": 1.3050425052642822, "learning_rate": 1.2577489687684147e-06, "loss": 5.3416, "mean_token_accuracy": 0.3084403133392334, "num_tokens": 3581980.0, "step": 3350 }, { "entropy": 5.379772834777832, "epoch": 1.9573978123200921, "grad_norm": 1.4584404230117798, "learning_rate": 1.245963464938126e-06, "loss": 5.2087, "mean_token_accuracy": 0.32388432770967485, "num_tokens": 3635393.0, "step": 3400 }, { "entropy": 5.483665924072266, "epoch": 1.9861830742659758, "grad_norm": 1.2157734632492065, "learning_rate": 1.2341779611078374e-06, "loss": 5.3101, "mean_token_accuracy": 0.3121953472495079, "num_tokens": 3689894.0, "step": 3450 }, { "epoch": 2.0, "eval_entropy": 5.711394641805904, "eval_loss": 5.55628776550293, "eval_mean_token_accuracy": 0.2764948787777105, "eval_model_preparation_time": 0.0045, "eval_num_tokens": 3712724.0, "eval_runtime": 50.187, "eval_samples_per_second": 8.648, "eval_steps_per_second": 4.324, "step": 3474 }, { "entropy": 5.349283556938172, "epoch": 2.0149683362118593, "grad_norm": 1.1696771383285522, "learning_rate": 1.2223924572775486e-06, "loss": 5.1782, "mean_token_accuracy": 0.33028870791196824, "num_tokens": 3740861.0, "step": 3500 }, { "entropy": 5.4721107006073, "epoch": 2.043753598157743, "grad_norm": 1.8449370861053467, "learning_rate": 1.2106069534472599e-06, "loss": 5.2978, "mean_token_accuracy": 0.31511022299528124, "num_tokens": 3794869.0, "step": 3550 }, { "entropy": 5.404226851463318, "epoch": 2.0725388601036268, "grad_norm": 3.789496660232544, "learning_rate": 1.198821449616971e-06, "loss": 5.2371, "mean_token_accuracy": 0.32092176616191864, "num_tokens": 3848573.0, "step": 3600 }, { "entropy": 5.435445628166199, "epoch": 2.1013241220495105, "grad_norm": 2.2847959995269775, "learning_rate": 1.1870359457866824e-06, "loss": 5.2662, "mean_token_accuracy": 0.3186633634567261, "num_tokens": 3901204.0, "step": 3650 }, { "entropy": 5.4066293334960935, "epoch": 2.130109383995394, "grad_norm": 1.0950902700424194, "learning_rate": 1.1752504419563935e-06, "loss": 5.2345, "mean_token_accuracy": 0.32156052827835085, "num_tokens": 3953753.0, "step": 3700 }, { "entropy": 5.272332944869995, "epoch": 2.158894645941278, "grad_norm": 2.1477339267730713, "learning_rate": 1.1634649381261048e-06, "loss": 5.1091, "mean_token_accuracy": 0.3380983591079712, "num_tokens": 4005481.0, "step": 3750 }, { "entropy": 5.4118804311752315, "epoch": 2.1876799078871616, "grad_norm": 1.4509484767913818, "learning_rate": 1.151679434295816e-06, "loss": 5.2448, "mean_token_accuracy": 0.3208243528008461, "num_tokens": 4058829.0, "step": 3800 }, { "entropy": 5.4763900089263915, "epoch": 2.2164651698330453, "grad_norm": 1.0856804847717285, "learning_rate": 1.1398939304655273e-06, "loss": 5.3042, "mean_token_accuracy": 0.31338351368904116, "num_tokens": 4113326.0, "step": 3850 }, { "entropy": 5.328452725410461, "epoch": 2.245250431778929, "grad_norm": 3.2843880653381348, "learning_rate": 1.1281084266352386e-06, "loss": 5.1624, "mean_token_accuracy": 0.3305218696594238, "num_tokens": 4165454.0, "step": 3900 }, { "entropy": 5.383157343864441, "epoch": 2.2740356937248127, "grad_norm": 2.207082748413086, "learning_rate": 1.1163229228049497e-06, "loss": 5.2163, "mean_token_accuracy": 0.32331310987472534, "num_tokens": 4219250.0, "step": 3950 }, { "entropy": 5.585261764526368, "epoch": 2.3028209556706964, "grad_norm": 2.7102835178375244, "learning_rate": 1.104537418974661e-06, "loss": 5.4137, "mean_token_accuracy": 0.29959124475717547, "num_tokens": 4274711.0, "step": 4000 }, { "entropy": 5.434073266983032, "epoch": 2.33160621761658, "grad_norm": 1.3775779008865356, "learning_rate": 1.0927519151443724e-06, "loss": 5.2644, "mean_token_accuracy": 0.3175011593103409, "num_tokens": 4328616.0, "step": 4050 }, { "entropy": 5.462391858100891, "epoch": 2.360391479562464, "grad_norm": 1.4101024866104126, "learning_rate": 1.0809664113140838e-06, "loss": 5.2924, "mean_token_accuracy": 0.3137941011786461, "num_tokens": 4382416.0, "step": 4100 }, { "entropy": 5.529892563819885, "epoch": 2.3891767415083476, "grad_norm": 1.2311837673187256, "learning_rate": 1.0691809074837949e-06, "loss": 5.364, "mean_token_accuracy": 0.3046491605043411, "num_tokens": 4437848.0, "step": 4150 }, { "entropy": 5.4370484542846675, "epoch": 2.4179620034542313, "grad_norm": 1.0929864645004272, "learning_rate": 1.0573954036535062e-06, "loss": 5.2734, "mean_token_accuracy": 0.3169013774394989, "num_tokens": 4491185.0, "step": 4200 }, { "entropy": 5.395377616882325, "epoch": 2.446747265400115, "grad_norm": 1.5457273721694946, "learning_rate": 1.0456098998232174e-06, "loss": 5.2276, "mean_token_accuracy": 0.32221508473157884, "num_tokens": 4544086.0, "step": 4250 }, { "entropy": 5.443737335205078, "epoch": 2.4755325273459987, "grad_norm": 1.4844346046447754, "learning_rate": 1.0338243959929287e-06, "loss": 5.2786, "mean_token_accuracy": 0.3157751387357712, "num_tokens": 4597677.0, "step": 4300 }, { "entropy": 5.419876251220703, "epoch": 2.5043177892918824, "grad_norm": 1.2481963634490967, "learning_rate": 1.02203889216264e-06, "loss": 5.2564, "mean_token_accuracy": 0.31889803290367125, "num_tokens": 4651343.0, "step": 4350 }, { "entropy": 5.578677978515625, "epoch": 2.533103051237766, "grad_norm": 2.0005414485931396, "learning_rate": 1.0102533883323512e-06, "loss": 5.4145, "mean_token_accuracy": 0.30037090003490446, "num_tokens": 4705985.0, "step": 4400 }, { "entropy": 5.279946126937866, "epoch": 2.56188831318365, "grad_norm": 1.080521821975708, "learning_rate": 9.984678845020625e-07, "loss": 5.1226, "mean_token_accuracy": 0.3341303279995918, "num_tokens": 4757741.0, "step": 4450 }, { "entropy": 5.551463279724121, "epoch": 2.5906735751295336, "grad_norm": 1.28898024559021, "learning_rate": 9.866823806717736e-07, "loss": 5.3832, "mean_token_accuracy": 0.3028248634934425, "num_tokens": 4812808.0, "step": 4500 }, { "entropy": 5.3787487554550175, "epoch": 2.6194588370754173, "grad_norm": 1.5697983503341675, "learning_rate": 9.74896876841485e-07, "loss": 5.2141, "mean_token_accuracy": 0.3227942296862602, "num_tokens": 4866572.0, "step": 4550 }, { "entropy": 5.460358958244324, "epoch": 2.648244099021301, "grad_norm": 1.3180441856384277, "learning_rate": 9.63111373011196e-07, "loss": 5.2954, "mean_token_accuracy": 0.31269474506378175, "num_tokens": 4921312.0, "step": 4600 }, { "entropy": 5.434084935188293, "epoch": 2.6770293609671847, "grad_norm": 1.2409590482711792, "learning_rate": 9.513258691809074e-07, "loss": 5.271, "mean_token_accuracy": 0.3172155100107193, "num_tokens": 4974289.0, "step": 4650 }, { "entropy": 5.406955418586731, "epoch": 2.7058146229130684, "grad_norm": 1.4782609939575195, "learning_rate": 9.395403653506187e-07, "loss": 5.2473, "mean_token_accuracy": 0.32031788885593415, "num_tokens": 5028149.0, "step": 4700 }, { "entropy": 5.206603040695191, "epoch": 2.734599884858952, "grad_norm": 2.351633071899414, "learning_rate": 9.2775486152033e-07, "loss": 5.0478, "mean_token_accuracy": 0.3428420132398605, "num_tokens": 5079349.0, "step": 4750 }, { "entropy": 5.388812799453735, "epoch": 2.763385146804836, "grad_norm": 7.564618110656738, "learning_rate": 9.159693576900412e-07, "loss": 5.2281, "mean_token_accuracy": 0.3222071170806885, "num_tokens": 5132564.0, "step": 4800 }, { "entropy": 5.374106278419495, "epoch": 2.7921704087507195, "grad_norm": 1.4734679460525513, "learning_rate": 9.041838538597525e-07, "loss": 5.2161, "mean_token_accuracy": 0.3219477406144142, "num_tokens": 5185921.0, "step": 4850 }, { "entropy": 5.232998585700988, "epoch": 2.8209556706966032, "grad_norm": 1.4175471067428589, "learning_rate": 8.923983500294637e-07, "loss": 5.0769, "mean_token_accuracy": 0.3403926733136177, "num_tokens": 5237521.0, "step": 4900 }, { "entropy": 5.394891719818116, "epoch": 2.849740932642487, "grad_norm": 4.951873779296875, "learning_rate": 8.806128461991749e-07, "loss": 5.2344, "mean_token_accuracy": 0.3213117456436157, "num_tokens": 5291104.0, "step": 4950 }, { "entropy": 5.413805012702942, "epoch": 2.8785261945883707, "grad_norm": 1.679518461227417, "learning_rate": 8.688273423688863e-07, "loss": 5.2597, "mean_token_accuracy": 0.3165634173154831, "num_tokens": 5345058.0, "step": 5000 }, { "entropy": 5.256177935600281, "epoch": 2.9073114565342544, "grad_norm": 1.8892916440963745, "learning_rate": 8.570418385385975e-07, "loss": 5.1004, "mean_token_accuracy": 0.3369427987933159, "num_tokens": 5395918.0, "step": 5050 }, { "entropy": 5.259814453125, "epoch": 2.936096718480138, "grad_norm": 1.3802675008773804, "learning_rate": 8.452563347083087e-07, "loss": 5.1057, "mean_token_accuracy": 0.3362414276599884, "num_tokens": 5448086.0, "step": 5100 }, { "entropy": 5.416206178665161, "epoch": 2.964881980426022, "grad_norm": 1.7677236795425415, "learning_rate": 8.3347083087802e-07, "loss": 5.2562, "mean_token_accuracy": 0.31725785195827483, "num_tokens": 5501959.0, "step": 5150 }, { "entropy": 5.507337794303894, "epoch": 2.9936672423719055, "grad_norm": 1.021727442741394, "learning_rate": 8.216853270477313e-07, "loss": 5.344, "mean_token_accuracy": 0.30679062128067014, "num_tokens": 5557908.0, "step": 5200 }, { "epoch": 3.0, "eval_entropy": 5.682707933786278, "eval_loss": 5.53223991394043, "eval_mean_token_accuracy": 0.27747743456594404, "eval_model_preparation_time": 0.0045, "eval_num_tokens": 5569086.0, "eval_runtime": 49.9944, "eval_samples_per_second": 8.681, "eval_steps_per_second": 4.34, "step": 5211 }, { "entropy": 5.209756035804748, "epoch": 3.0224525043177892, "grad_norm": 1.725786566734314, "learning_rate": 8.098998232174425e-07, "loss": 5.0541, "mean_token_accuracy": 0.34166110813617706, "num_tokens": 5608917.0, "step": 5250 }, { "entropy": 5.396296281814575, "epoch": 3.051237766263673, "grad_norm": 0.7720207571983337, "learning_rate": 7.981143193871538e-07, "loss": 5.2337, "mean_token_accuracy": 0.32116260558366777, "num_tokens": 5662712.0, "step": 5300 }, { "entropy": 5.341518473625183, "epoch": 3.0800230282095566, "grad_norm": 2.2686808109283447, "learning_rate": 7.86328815556865e-07, "loss": 5.1824, "mean_token_accuracy": 0.32726580530405047, "num_tokens": 5715921.0, "step": 5350 }, { "entropy": 5.376176896095276, "epoch": 3.1088082901554404, "grad_norm": 1.2420796155929565, "learning_rate": 7.745433117265762e-07, "loss": 5.2162, "mean_token_accuracy": 0.32142678707838057, "num_tokens": 5769436.0, "step": 5400 }, { "entropy": 5.4553061914443965, "epoch": 3.137593552101324, "grad_norm": 1.2402859926223755, "learning_rate": 7.627578078962876e-07, "loss": 5.2971, "mean_token_accuracy": 0.31396267503499986, "num_tokens": 5823126.0, "step": 5450 }, { "entropy": 5.385247969627381, "epoch": 3.166378814047208, "grad_norm": 1.112062931060791, "learning_rate": 7.509723040659988e-07, "loss": 5.2324, "mean_token_accuracy": 0.3207343602180481, "num_tokens": 5875751.0, "step": 5500 }, { "entropy": 5.55422221660614, "epoch": 3.1951640759930915, "grad_norm": 1.5440446138381958, "learning_rate": 7.3918680023571e-07, "loss": 5.3902, "mean_token_accuracy": 0.3006985321640968, "num_tokens": 5932163.0, "step": 5550 }, { "entropy": 5.403217372894287, "epoch": 3.223949337938975, "grad_norm": 0.8481096625328064, "learning_rate": 7.274012964054213e-07, "loss": 5.2417, "mean_token_accuracy": 0.3210747820138931, "num_tokens": 5985889.0, "step": 5600 }, { "entropy": 5.388293180465698, "epoch": 3.252734599884859, "grad_norm": 0.9305989146232605, "learning_rate": 7.156157925751326e-07, "loss": 5.2319, "mean_token_accuracy": 0.3206030324101448, "num_tokens": 6040052.0, "step": 5650 }, { "entropy": 5.401709322929382, "epoch": 3.2815198618307426, "grad_norm": 0.8080459237098694, "learning_rate": 7.038302887448438e-07, "loss": 5.2438, "mean_token_accuracy": 0.3199671137332916, "num_tokens": 6092350.0, "step": 5700 }, { "entropy": 5.4320423412323, "epoch": 3.3103051237766263, "grad_norm": 1.9186089038848877, "learning_rate": 6.920447849145551e-07, "loss": 5.2696, "mean_token_accuracy": 0.31657984614372253, "num_tokens": 6146112.0, "step": 5750 }, { "entropy": 5.276471285820008, "epoch": 3.33909038572251, "grad_norm": 1.032879114151001, "learning_rate": 6.802592810842663e-07, "loss": 5.1224, "mean_token_accuracy": 0.3347566506266594, "num_tokens": 6197916.0, "step": 5800 }, { "entropy": 5.122317051887512, "epoch": 3.3678756476683938, "grad_norm": 3.156858444213867, "learning_rate": 6.684737772539775e-07, "loss": 4.9706, "mean_token_accuracy": 0.35455317378044127, "num_tokens": 6247565.0, "step": 5850 }, { "entropy": 5.346597375869751, "epoch": 3.3966609096142775, "grad_norm": 1.2619549036026, "learning_rate": 6.566882734236889e-07, "loss": 5.1902, "mean_token_accuracy": 0.3258721518516541, "num_tokens": 6300481.0, "step": 5900 }, { "entropy": 5.413151068687439, "epoch": 3.425446171560161, "grad_norm": 1.801740050315857, "learning_rate": 6.449027695934001e-07, "loss": 5.2513, "mean_token_accuracy": 0.3187857499718666, "num_tokens": 6353098.0, "step": 5950 }, { "entropy": 5.464186942577362, "epoch": 3.454231433506045, "grad_norm": 1.6306997537612915, "learning_rate": 6.331172657631113e-07, "loss": 5.3043, "mean_token_accuracy": 0.31154109388589857, "num_tokens": 6407984.0, "step": 6000 }, { "entropy": 5.401795778274536, "epoch": 3.4830166954519286, "grad_norm": 1.1694583892822266, "learning_rate": 6.213317619328226e-07, "loss": 5.2427, "mean_token_accuracy": 0.31954523265361784, "num_tokens": 6461854.0, "step": 6050 }, { "entropy": 5.317689285278321, "epoch": 3.5118019573978123, "grad_norm": 0.9361855387687683, "learning_rate": 6.095462581025339e-07, "loss": 5.1588, "mean_token_accuracy": 0.330586878657341, "num_tokens": 6514882.0, "step": 6100 }, { "entropy": 5.478708257675171, "epoch": 3.540587219343696, "grad_norm": 1.05711030960083, "learning_rate": 5.977607542722451e-07, "loss": 5.321, "mean_token_accuracy": 0.3104448106884956, "num_tokens": 6569455.0, "step": 6150 }, { "entropy": 5.309361801147461, "epoch": 3.5693724812895797, "grad_norm": 1.3499550819396973, "learning_rate": 5.859752504419564e-07, "loss": 5.153, "mean_token_accuracy": 0.331512533724308, "num_tokens": 6621734.0, "step": 6200 }, { "entropy": 5.296572666168213, "epoch": 3.5981577432354634, "grad_norm": 1.940708875656128, "learning_rate": 5.741897466116676e-07, "loss": 5.14, "mean_token_accuracy": 0.3299832499027252, "num_tokens": 6674994.0, "step": 6250 }, { "entropy": 5.544284400939941, "epoch": 3.626943005181347, "grad_norm": 1.8903827667236328, "learning_rate": 5.624042427813788e-07, "loss": 5.3885, "mean_token_accuracy": 0.3016947290301323, "num_tokens": 6730674.0, "step": 6300 }, { "entropy": 5.333053431510925, "epoch": 3.655728267127231, "grad_norm": 1.1618578433990479, "learning_rate": 5.506187389510902e-07, "loss": 5.1781, "mean_token_accuracy": 0.3275001719594002, "num_tokens": 6784235.0, "step": 6350 }, { "entropy": 5.4938449716568, "epoch": 3.6845135290731146, "grad_norm": 1.384329080581665, "learning_rate": 5.388332351208014e-07, "loss": 5.3399, "mean_token_accuracy": 0.3068840709328651, "num_tokens": 6839590.0, "step": 6400 }, { "entropy": 5.277545223236084, "epoch": 3.7132987910189983, "grad_norm": 1.8918265104293823, "learning_rate": 5.270477312905126e-07, "loss": 5.1221, "mean_token_accuracy": 0.33364981949329375, "num_tokens": 6891301.0, "step": 6450 }, { "entropy": 5.40100293636322, "epoch": 3.742084052964882, "grad_norm": 1.6968809366226196, "learning_rate": 5.152622274602239e-07, "loss": 5.2471, "mean_token_accuracy": 0.31912936180830004, "num_tokens": 6945510.0, "step": 6500 }, { "entropy": 5.561220169067383, "epoch": 3.7708693149107657, "grad_norm": 2.066960573196411, "learning_rate": 5.034767236299352e-07, "loss": 5.4026, "mean_token_accuracy": 0.2984810543060303, "num_tokens": 7001870.0, "step": 6550 }, { "entropy": 5.3108087682724, "epoch": 3.7996545768566494, "grad_norm": 1.6065007448196411, "learning_rate": 4.916912197996464e-07, "loss": 5.155, "mean_token_accuracy": 0.3304683968424797, "num_tokens": 7053974.0, "step": 6600 }, { "entropy": 5.323807754516602, "epoch": 3.828439838802533, "grad_norm": 2.6806318759918213, "learning_rate": 4.799057159693577e-07, "loss": 5.1653, "mean_token_accuracy": 0.3294159671664238, "num_tokens": 7107061.0, "step": 6650 }, { "entropy": 5.4716163873672485, "epoch": 3.857225100748417, "grad_norm": 1.8264856338500977, "learning_rate": 4.6812021213906895e-07, "loss": 5.3124, "mean_token_accuracy": 0.3109353107213974, "num_tokens": 7161697.0, "step": 6700 }, { "entropy": 5.382365622520447, "epoch": 3.8860103626943006, "grad_norm": 0.9954923987388611, "learning_rate": 4.563347083087802e-07, "loss": 5.2237, "mean_token_accuracy": 0.32161149621009827, "num_tokens": 7215524.0, "step": 6750 }, { "entropy": 5.277496585845947, "epoch": 3.9147956246401843, "grad_norm": 1.267786979675293, "learning_rate": 4.445492044784914e-07, "loss": 5.1265, "mean_token_accuracy": 0.3319795566797257, "num_tokens": 7267329.0, "step": 6800 }, { "entropy": 5.550942025184631, "epoch": 3.943580886586068, "grad_norm": 0.9425063133239746, "learning_rate": 4.3276370064820265e-07, "loss": 5.3898, "mean_token_accuracy": 0.30050904959440233, "num_tokens": 7324070.0, "step": 6850 }, { "entropy": 5.125799627304077, "epoch": 3.9723661485319517, "grad_norm": 5.447021007537842, "learning_rate": 4.20978196817914e-07, "loss": 4.9781, "mean_token_accuracy": 0.3520450854301453, "num_tokens": 7375083.0, "step": 6900 }, { "epoch": 4.0, "eval_entropy": 5.6681923492712905, "eval_loss": 5.525067329406738, "eval_mean_token_accuracy": 0.2779707208893816, "eval_model_preparation_time": 0.0045, "eval_num_tokens": 7425448.0, "eval_runtime": 49.7944, "eval_samples_per_second": 8.716, "eval_steps_per_second": 4.358, "step": 6948 } ], "logging_steps": 50, "max_steps": 8685, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.016969752533504e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }