{
  "best_global_step": 6948,
  "best_metric": 5.525067329406738,
  "best_model_checkpoint": "./output/checkpoint-6948",
  "epoch": 4.0,
  "eval_steps": 500,
  "global_step": 6948,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "entropy": 3.606692385673523,
      "epoch": 0.028785261945883708,
      "grad_norm": 3.2999913692474365,
      "learning_rate": 4.9e-07,
      "loss": 13.6598,
      "mean_token_accuracy": 0.16028020828962325,
      "num_tokens": 53993.0,
      "step": 50
    },
    {
      "entropy": 3.618675880432129,
      "epoch": 0.057570523891767415,
      "grad_norm": 3.101252555847168,
      "learning_rate": 9.9e-07,
      "loss": 14.0188,
      "mean_token_accuracy": 0.1508466500043869,
      "num_tokens": 110134.0,
      "step": 100
    },
    {
      "entropy": 3.5215235900878907,
      "epoch": 0.08635578583765112,
      "grad_norm": 3.513662815093994,
      "learning_rate": 1.49e-06,
      "loss": 12.8555,
      "mean_token_accuracy": 0.18527640983462335,
      "num_tokens": 160191.0,
      "step": 150
    },
    {
      "entropy": 3.667909698486328,
      "epoch": 0.11514104778353483,
      "grad_norm": 4.327610492706299,
      "learning_rate": 1.99e-06,
      "loss": 13.5394,
      "mean_token_accuracy": 0.157139780074358,
      "num_tokens": 214993.0,
      "step": 200
    },
    {
      "entropy": 3.768263258934021,
      "epoch": 0.14392630972941853,
      "grad_norm": 4.290107250213623,
      "learning_rate": 1.988450206246317e-06,
      "loss": 12.8912,
      "mean_token_accuracy": 0.17374794125556947,
      "num_tokens": 268184.0,
      "step": 250
    },
    {
      "entropy": 3.990619196891785,
      "epoch": 0.17271157167530224,
      "grad_norm": 4.444278717041016,
      "learning_rate": 1.976664702416028e-06,
      "loss": 12.455,
      "mean_token_accuracy": 0.17780130118131637,
      "num_tokens": 319458.0,
      "step": 300
    },
    {
      "entropy": 4.162646284103394,
      "epoch": 0.20149683362118595,
      "grad_norm": 5.615262508392334,
      "learning_rate": 1.9648791985857395e-06,
      "loss": 12.0893,
      "mean_token_accuracy": 0.18191319867968558,
      "num_tokens": 373337.0,
      "step": 350
    },
    {
      "entropy": 4.532100868225098,
      "epoch": 0.23028209556706966,
      "grad_norm": 10.074016571044922,
      "learning_rate": 1.9530936947554507e-06,
      "loss": 11.9261,
      "mean_token_accuracy": 0.169477596282959,
      "num_tokens": 427526.0,
      "step": 400
    },
    {
      "entropy": 4.923871030807495,
      "epoch": 0.25906735751295334,
      "grad_norm": 16.220163345336914,
      "learning_rate": 1.9413081909251622e-06,
      "loss": 11.0048,
      "mean_token_accuracy": 0.1704501649737358,
      "num_tokens": 480528.0,
      "step": 450
    },
    {
      "entropy": 5.521005854606629,
      "epoch": 0.28785261945883706,
      "grad_norm": 29.904008865356445,
      "learning_rate": 1.9295226870948733e-06,
      "loss": 9.6524,
      "mean_token_accuracy": 0.16450899541378022,
      "num_tokens": 535314.0,
      "step": 500
    },
    {
      "entropy": 6.092623329162597,
      "epoch": 0.31663788140472077,
      "grad_norm": 17.821575164794922,
      "learning_rate": 1.9177371832645845e-06,
      "loss": 8.1054,
      "mean_token_accuracy": 0.17205011785030366,
      "num_tokens": 588410.0,
      "step": 550
    },
    {
      "entropy": 6.385262680053711,
      "epoch": 0.3454231433506045,
      "grad_norm": 5.502202987670898,
      "learning_rate": 1.9059516794342958e-06,
      "loss": 7.4313,
      "mean_token_accuracy": 0.1734227080643177,
      "num_tokens": 641736.0,
      "step": 600
    },
    {
      "entropy": 6.278562617301941,
      "epoch": 0.3742084052964882,
      "grad_norm": 5.4657697677612305,
      "learning_rate": 1.8941661756040071e-06,
      "loss": 6.9266,
      "mean_token_accuracy": 0.18680249139666558,
      "num_tokens": 692200.0,
      "step": 650
    },
    {
      "entropy": 6.553266277313233,
      "epoch": 0.4029936672423719,
      "grad_norm": 4.955812931060791,
      "learning_rate": 1.8823806717737183e-06,
      "loss": 6.9847,
      "mean_token_accuracy": 0.16679802387952805,
      "num_tokens": 745830.0,
      "step": 700
    },
    {
      "entropy": 6.470935583114624,
      "epoch": 0.4317789291882556,
      "grad_norm": 4.198381423950195,
      "learning_rate": 1.8705951679434296e-06,
      "loss": 6.7277,
      "mean_token_accuracy": 0.17847734570503235,
      "num_tokens": 798872.0,
      "step": 750
    },
    {
      "entropy": 6.5620588779449465,
      "epoch": 0.4605641911341393,
      "grad_norm": 3.1793746948242188,
      "learning_rate": 1.8588096641131407e-06,
      "loss": 6.7032,
      "mean_token_accuracy": 0.17336134731769562,
      "num_tokens": 853045.0,
      "step": 800
    },
    {
      "entropy": 6.532204885482788,
      "epoch": 0.48934945308002303,
      "grad_norm": 3.824537515640259,
      "learning_rate": 1.847024160282852e-06,
      "loss": 6.5762,
      "mean_token_accuracy": 0.1805124071240425,
      "num_tokens": 907679.0,
      "step": 850
    },
    {
      "entropy": 6.535988225936889,
      "epoch": 0.5181347150259067,
      "grad_norm": 4.350001811981201,
      "learning_rate": 1.8352386564525632e-06,
      "loss": 6.505,
      "mean_token_accuracy": 0.1842605724930763,
      "num_tokens": 964170.0,
      "step": 900
    },
    {
      "entropy": 6.204533562660218,
      "epoch": 0.5469199769717904,
      "grad_norm": 2.193660020828247,
      "learning_rate": 1.8234531526222745e-06,
      "loss": 6.1211,
      "mean_token_accuracy": 0.21968430042266845,
      "num_tokens": 1015909.0,
      "step": 950
    },
    {
      "entropy": 6.308737449645996,
      "epoch": 0.5757052389176741,
      "grad_norm": 2.325622320175171,
      "learning_rate": 1.8116676487919857e-06,
      "loss": 6.1653,
      "mean_token_accuracy": 0.21636426240205764,
      "num_tokens": 1068859.0,
      "step": 1000
    },
    {
      "entropy": 6.332560749053955,
      "epoch": 0.6044905008635578,
      "grad_norm": 2.0439090728759766,
      "learning_rate": 1.799882144961697e-06,
      "loss": 6.1559,
      "mean_token_accuracy": 0.21859725564718246,
      "num_tokens": 1123202.0,
      "step": 1050
    },
    {
      "entropy": 6.042124252319336,
      "epoch": 0.6332757628094415,
      "grad_norm": 3.621903657913208,
      "learning_rate": 1.7880966411314081e-06,
      "loss": 5.8441,
      "mean_token_accuracy": 0.24906315237283708,
      "num_tokens": 1173403.0,
      "step": 1100
    },
    {
      "entropy": 5.921343173980713,
      "epoch": 0.6620610247553252,
      "grad_norm": 5.658033847808838,
      "learning_rate": 1.7763111373011195e-06,
      "loss": 5.7104,
      "mean_token_accuracy": 0.2625067520141602,
      "num_tokens": 1225026.0,
      "step": 1150
    },
    {
      "entropy": 6.093586492538452,
      "epoch": 0.690846286701209,
      "grad_norm": 2.4292995929718018,
      "learning_rate": 1.7645256334708308e-06,
      "loss": 5.8658,
      "mean_token_accuracy": 0.24842385441064835,
      "num_tokens": 1279013.0,
      "step": 1200
    },
    {
      "entropy": 6.119112596511841,
      "epoch": 0.7196315486470927,
      "grad_norm": 3.369384288787842,
      "learning_rate": 1.752740129640542e-06,
      "loss": 5.8784,
      "mean_token_accuracy": 0.24857850253582,
      "num_tokens": 1332547.0,
      "step": 1250
    },
    {
      "entropy": 6.025163550376892,
      "epoch": 0.7484168105929764,
      "grad_norm": 2.5110116004943848,
      "learning_rate": 1.7409546258102533e-06,
      "loss": 5.7769,
      "mean_token_accuracy": 0.25835376888513567,
      "num_tokens": 1385192.0,
      "step": 1300
    },
    {
      "entropy": 5.877259612083435,
      "epoch": 0.7772020725388601,
      "grad_norm": 2.4179303646087646,
      "learning_rate": 1.7291691219799646e-06,
      "loss": 5.6284,
      "mean_token_accuracy": 0.2756252554059029,
      "num_tokens": 1437071.0,
      "step": 1350
    },
    {
      "entropy": 6.002246947288513,
      "epoch": 0.8059873344847438,
      "grad_norm": 3.494359016418457,
      "learning_rate": 1.717383618149676e-06,
      "loss": 5.747,
      "mean_token_accuracy": 0.26462210685014725,
      "num_tokens": 1490818.0,
      "step": 1400
    },
    {
      "entropy": 5.991955623626709,
      "epoch": 0.8347725964306275,
      "grad_norm": 2.340975761413574,
      "learning_rate": 1.705598114319387e-06,
      "loss": 5.7379,
      "mean_token_accuracy": 0.26444981098175047,
      "num_tokens": 1544997.0,
      "step": 1450
    },
    {
      "entropy": 5.91768889427185,
      "epoch": 0.8635578583765112,
      "grad_norm": 2.2394514083862305,
      "learning_rate": 1.6938126104890984e-06,
      "loss": 5.6564,
      "mean_token_accuracy": 0.2730415526032448,
      "num_tokens": 1598302.0,
      "step": 1500
    },
    {
      "entropy": 5.982716989517212,
      "epoch": 0.8923431203223949,
      "grad_norm": 1.876839518547058,
      "learning_rate": 1.6820271066588098e-06,
      "loss": 5.7215,
      "mean_token_accuracy": 0.26642445534467696,
      "num_tokens": 1655267.0,
      "step": 1550
    },
    {
      "entropy": 5.820467872619629,
      "epoch": 0.9211283822682786,
      "grad_norm": 2.219966173171997,
      "learning_rate": 1.6702416028285209e-06,
      "loss": 5.5555,
      "mean_token_accuracy": 0.2856418335437775,
      "num_tokens": 1709199.0,
      "step": 1600
    },
    {
      "entropy": 5.996349005699158,
      "epoch": 0.9499136442141624,
      "grad_norm": 2.247213840484619,
      "learning_rate": 1.6584560989982322e-06,
      "loss": 5.7283,
      "mean_token_accuracy": 0.2696125540137291,
      "num_tokens": 1765443.0,
      "step": 1650
    },
    {
      "entropy": 5.696683068275451,
      "epoch": 0.9786989061600461,
      "grad_norm": 2.8499979972839355,
      "learning_rate": 1.6466705951679433e-06,
      "loss": 5.4335,
      "mean_token_accuracy": 0.29918427973985673,
      "num_tokens": 1817494.0,
      "step": 1700
    },
    {
      "epoch": 1.0,
      "eval_entropy": 5.993559589034401,
      "eval_loss": 5.737204551696777,
      "eval_mean_token_accuracy": 0.2618687468739699,
      "eval_model_preparation_time": 0.0045,
      "eval_num_tokens": 1856362.0,
      "eval_runtime": 50.5332,
      "eval_samples_per_second": 8.588,
      "eval_steps_per_second": 4.294,
      "step": 1737
    },
    {
      "entropy": 5.746842083930969,
      "epoch": 1.0074841681059297,
      "grad_norm": 2.33052921295166,
      "learning_rate": 1.6348850913376547e-06,
      "loss": 5.4796,
      "mean_token_accuracy": 0.2966849410533905,
      "num_tokens": 1870353.0,
      "step": 1750
    },
    {
      "entropy": 5.859029049873352,
      "epoch": 1.0362694300518134,
      "grad_norm": 1.6248886585235596,
      "learning_rate": 1.6230995875073658e-06,
      "loss": 5.5975,
      "mean_token_accuracy": 0.2838129925727844,
      "num_tokens": 1926205.0,
      "step": 1800
    },
    {
      "entropy": 5.731445336341858,
      "epoch": 1.065054691997697,
      "grad_norm": 1.6941566467285156,
      "learning_rate": 1.6113140836770771e-06,
      "loss": 5.476,
      "mean_token_accuracy": 0.2992346465587616,
      "num_tokens": 1979821.0,
      "step": 1850
    },
    {
      "entropy": 5.6993954515457155,
      "epoch": 1.0938399539435808,
      "grad_norm": 1.1746597290039062,
      "learning_rate": 1.5995285798467883e-06,
      "loss": 5.4608,
      "mean_token_accuracy": 0.3000726142525673,
      "num_tokens": 2034373.0,
      "step": 1900
    },
    {
      "entropy": 5.668873124122619,
      "epoch": 1.1226252158894645,
      "grad_norm": 1.728211760520935,
      "learning_rate": 1.5877430760164996e-06,
      "loss": 5.4347,
      "mean_token_accuracy": 0.3033922725915909,
      "num_tokens": 2087339.0,
      "step": 1950
    },
    {
      "entropy": 5.624621086120605,
      "epoch": 1.1514104778353482,
      "grad_norm": 1.4078539609909058,
      "learning_rate": 1.5759575721862107e-06,
      "loss": 5.3954,
      "mean_token_accuracy": 0.30784171640872954,
      "num_tokens": 2139520.0,
      "step": 2000
    },
    {
      "entropy": 5.7141213130950925,
      "epoch": 1.180195739781232,
      "grad_norm": 2.186459541320801,
      "learning_rate": 1.564172068355922e-06,
      "loss": 5.4847,
      "mean_token_accuracy": 0.29594049394130706,
      "num_tokens": 2193987.0,
      "step": 2050
    },
    {
      "entropy": 5.632415266036987,
      "epoch": 1.2089810017271156,
      "grad_norm": 1.3601349592208862,
      "learning_rate": 1.5523865645256334e-06,
      "loss": 5.4135,
      "mean_token_accuracy": 0.30366597563028336,
      "num_tokens": 2249616.0,
      "step": 2100
    },
    {
      "entropy": 5.510904269218445,
      "epoch": 1.2377662636729994,
      "grad_norm": 2.065760612487793,
      "learning_rate": 1.5406010606953445e-06,
      "loss": 5.2904,
      "mean_token_accuracy": 0.3211754837632179,
      "num_tokens": 2300863.0,
      "step": 2150
    },
    {
      "entropy": 5.703383626937867,
      "epoch": 1.266551525618883,
      "grad_norm": 1.1172698736190796,
      "learning_rate": 1.5288155568650559e-06,
      "loss": 5.4802,
      "mean_token_accuracy": 0.29713701367378237,
      "num_tokens": 2356029.0,
      "step": 2200
    },
    {
      "entropy": 5.565930342674255,
      "epoch": 1.2953367875647668,
      "grad_norm": 1.7528513669967651,
      "learning_rate": 1.5170300530347672e-06,
      "loss": 5.3518,
      "mean_token_accuracy": 0.31301232606172563,
      "num_tokens": 2408957.0,
      "step": 2250
    },
    {
      "entropy": 5.496430187225342,
      "epoch": 1.3241220495106505,
      "grad_norm": 1.892640233039856,
      "learning_rate": 1.5052445492044786e-06,
      "loss": 5.2967,
      "mean_token_accuracy": 0.3181899458169937,
      "num_tokens": 2462569.0,
      "step": 2300
    },
    {
      "entropy": 5.725150098800659,
      "epoch": 1.3529073114565342,
      "grad_norm": 1.774940848350525,
      "learning_rate": 1.4934590453741897e-06,
      "loss": 5.5215,
      "mean_token_accuracy": 0.29055028676986694,
      "num_tokens": 2518544.0,
      "step": 2350
    },
    {
      "entropy": 5.4884827613830565,
      "epoch": 1.381692573402418,
      "grad_norm": 2.2167599201202393,
      "learning_rate": 1.481673541543901e-06,
      "loss": 5.2917,
      "mean_token_accuracy": 0.31803421139717103,
      "num_tokens": 2570863.0,
      "step": 2400
    },
    {
      "entropy": 5.697079472541809,
      "epoch": 1.4104778353483016,
      "grad_norm": 1.6489030122756958,
      "learning_rate": 1.4698880377136124e-06,
      "loss": 5.4982,
      "mean_token_accuracy": 0.2925163987278938,
      "num_tokens": 2626998.0,
      "step": 2450
    },
    {
      "entropy": 5.46209939956665,
      "epoch": 1.4392630972941853,
      "grad_norm": 1.153914451599121,
      "learning_rate": 1.4581025338833235e-06,
      "loss": 5.2736,
      "mean_token_accuracy": 0.3182168474793434,
      "num_tokens": 2681568.0,
      "step": 2500
    },
    {
      "entropy": 5.4405768728256225,
      "epoch": 1.468048359240069,
      "grad_norm": 3.6614978313446045,
      "learning_rate": 1.4463170300530348e-06,
      "loss": 5.2515,
      "mean_token_accuracy": 0.3218736210465431,
      "num_tokens": 2733587.0,
      "step": 2550
    },
    {
      "entropy": 5.528175053596496,
      "epoch": 1.4968336211859528,
      "grad_norm": 1.0849746465682983,
      "learning_rate": 1.434531526222746e-06,
      "loss": 5.3378,
      "mean_token_accuracy": 0.31061659604310987,
      "num_tokens": 2787003.0,
      "step": 2600
    },
    {
      "entropy": 5.46110897064209,
      "epoch": 1.5256188831318365,
      "grad_norm": 1.8315683603286743,
      "learning_rate": 1.4227460223924573e-06,
      "loss": 5.2782,
      "mean_token_accuracy": 0.31781029611825945,
      "num_tokens": 2840263.0,
      "step": 2650
    },
    {
      "entropy": 5.455560960769653,
      "epoch": 1.5544041450777202,
      "grad_norm": 1.1859091520309448,
      "learning_rate": 1.4109605185621684e-06,
      "loss": 5.2735,
      "mean_token_accuracy": 0.3194814011454582,
      "num_tokens": 2894186.0,
      "step": 2700
    },
    {
      "entropy": 5.430496115684509,
      "epoch": 1.583189407023604,
      "grad_norm": 2.3500001430511475,
      "learning_rate": 1.3991750147318797e-06,
      "loss": 5.2464,
      "mean_token_accuracy": 0.32140792965888976,
      "num_tokens": 2948171.0,
      "step": 2750
    },
    {
      "entropy": 5.588023023605347,
      "epoch": 1.6119746689694876,
      "grad_norm": 1.727825403213501,
      "learning_rate": 1.3873895109015909e-06,
      "loss": 5.4028,
      "mean_token_accuracy": 0.3039530631899834,
      "num_tokens": 3002678.0,
      "step": 2800
    },
    {
      "entropy": 5.410525422096253,
      "epoch": 1.6407599309153713,
      "grad_norm": 1.3401474952697754,
      "learning_rate": 1.3756040070713022e-06,
      "loss": 5.2298,
      "mean_token_accuracy": 0.324065263569355,
      "num_tokens": 3055844.0,
      "step": 2850
    },
    {
      "entropy": 5.36959942817688,
      "epoch": 1.669545192861255,
      "grad_norm": 1.1892589330673218,
      "learning_rate": 1.3638185032410133e-06,
      "loss": 5.1956,
      "mean_token_accuracy": 0.32639502108097074,
      "num_tokens": 3108636.0,
      "step": 2900
    },
    {
      "entropy": 5.53826907157898,
      "epoch": 1.6983304548071387,
      "grad_norm": 1.2652360200881958,
      "learning_rate": 1.3520329994107247e-06,
      "loss": 5.3583,
      "mean_token_accuracy": 0.3074926760792732,
      "num_tokens": 3162627.0,
      "step": 2950
    },
    {
      "entropy": 5.417449145317078,
      "epoch": 1.7271157167530224,
      "grad_norm": 1.584312915802002,
      "learning_rate": 1.340247495580436e-06,
      "loss": 5.2388,
      "mean_token_accuracy": 0.32019727885723115,
      "num_tokens": 3216409.0,
      "step": 3000
    },
    {
      "entropy": 5.241390740871429,
      "epoch": 1.7559009786989062,
      "grad_norm": 1.5219439268112183,
      "learning_rate": 1.3284619917501471e-06,
      "loss": 5.0645,
      "mean_token_accuracy": 0.3445430138707161,
      "num_tokens": 3266967.0,
      "step": 3050
    },
    {
      "entropy": 5.405424036979675,
      "epoch": 1.7846862406447899,
      "grad_norm": 2.1165153980255127,
      "learning_rate": 1.3166764879198585e-06,
      "loss": 5.232,
      "mean_token_accuracy": 0.32085000157356264,
      "num_tokens": 3319877.0,
      "step": 3100
    },
    {
      "entropy": 5.123006024360657,
      "epoch": 1.8134715025906736,
      "grad_norm": 1.2189785242080688,
      "learning_rate": 1.3048909840895698e-06,
      "loss": 4.9582,
      "mean_token_accuracy": 0.356108532845974,
      "num_tokens": 3368569.0,
      "step": 3150
    },
    {
      "entropy": 5.417610831260681,
      "epoch": 1.8422567645365573,
      "grad_norm": 1.5157604217529297,
      "learning_rate": 1.2931054802592812e-06,
      "loss": 5.2454,
      "mean_token_accuracy": 0.31976755023002623,
      "num_tokens": 3422449.0,
      "step": 3200
    },
    {
      "entropy": 5.409690895080566,
      "epoch": 1.871042026482441,
      "grad_norm": 1.3088161945343018,
      "learning_rate": 1.2813199764289923e-06,
      "loss": 5.2348,
      "mean_token_accuracy": 0.32325415283441544,
      "num_tokens": 3474399.0,
      "step": 3250
    },
    {
      "entropy": 5.44662567615509,
      "epoch": 1.8998272884283247,
      "grad_norm": 2.178372621536255,
      "learning_rate": 1.2695344725987036e-06,
      "loss": 5.2661,
      "mean_token_accuracy": 0.3182847076654434,
      "num_tokens": 3527726.0,
      "step": 3300
    },
    {
      "entropy": 5.512614865303039,
      "epoch": 1.9286125503742084,
      "grad_norm": 1.3050425052642822,
      "learning_rate": 1.2577489687684147e-06,
      "loss": 5.3416,
      "mean_token_accuracy": 0.3084403133392334,
      "num_tokens": 3581980.0,
      "step": 3350
    },
    {
      "entropy": 5.379772834777832,
      "epoch": 1.9573978123200921,
      "grad_norm": 1.4584404230117798,
      "learning_rate": 1.245963464938126e-06,
      "loss": 5.2087,
      "mean_token_accuracy": 0.32388432770967485,
      "num_tokens": 3635393.0,
      "step": 3400
    },
    {
      "entropy": 5.483665924072266,
      "epoch": 1.9861830742659758,
      "grad_norm": 1.2157734632492065,
      "learning_rate": 1.2341779611078374e-06,
      "loss": 5.3101,
      "mean_token_accuracy": 0.3121953472495079,
      "num_tokens": 3689894.0,
      "step": 3450
    },
    {
      "epoch": 2.0,
      "eval_entropy": 5.711394641805904,
      "eval_loss": 5.55628776550293,
      "eval_mean_token_accuracy": 0.2764948787777105,
      "eval_model_preparation_time": 0.0045,
      "eval_num_tokens": 3712724.0,
      "eval_runtime": 50.187,
      "eval_samples_per_second": 8.648,
      "eval_steps_per_second": 4.324,
      "step": 3474
    },
    {
      "entropy": 5.349283556938172,
      "epoch": 2.0149683362118593,
      "grad_norm": 1.1696771383285522,
      "learning_rate": 1.2223924572775486e-06,
      "loss": 5.1782,
      "mean_token_accuracy": 0.33028870791196824,
      "num_tokens": 3740861.0,
      "step": 3500
    },
    {
      "entropy": 5.4721107006073,
      "epoch": 2.043753598157743,
      "grad_norm": 1.8449370861053467,
      "learning_rate": 1.2106069534472599e-06,
      "loss": 5.2978,
      "mean_token_accuracy": 0.31511022299528124,
      "num_tokens": 3794869.0,
      "step": 3550
    },
    {
      "entropy": 5.404226851463318,
      "epoch": 2.0725388601036268,
      "grad_norm": 3.789496660232544,
      "learning_rate": 1.198821449616971e-06,
      "loss": 5.2371,
      "mean_token_accuracy": 0.32092176616191864,
      "num_tokens": 3848573.0,
      "step": 3600
    },
    {
      "entropy": 5.435445628166199,
      "epoch": 2.1013241220495105,
      "grad_norm": 2.2847959995269775,
      "learning_rate": 1.1870359457866824e-06,
      "loss": 5.2662,
      "mean_token_accuracy": 0.3186633634567261,
      "num_tokens": 3901204.0,
      "step": 3650
    },
    {
      "entropy": 5.4066293334960935,
      "epoch": 2.130109383995394,
      "grad_norm": 1.0950902700424194,
      "learning_rate": 1.1752504419563935e-06,
      "loss": 5.2345,
      "mean_token_accuracy": 0.32156052827835085,
      "num_tokens": 3953753.0,
      "step": 3700
    },
    {
      "entropy": 5.272332944869995,
      "epoch": 2.158894645941278,
      "grad_norm": 2.1477339267730713,
      "learning_rate": 1.1634649381261048e-06,
      "loss": 5.1091,
      "mean_token_accuracy": 0.3380983591079712,
      "num_tokens": 4005481.0,
      "step": 3750
    },
    {
      "entropy": 5.4118804311752315,
      "epoch": 2.1876799078871616,
      "grad_norm": 1.4509484767913818,
      "learning_rate": 1.151679434295816e-06,
      "loss": 5.2448,
      "mean_token_accuracy": 0.3208243528008461,
      "num_tokens": 4058829.0,
      "step": 3800
    },
    {
      "entropy": 5.4763900089263915,
      "epoch": 2.2164651698330453,
      "grad_norm": 1.0856804847717285,
      "learning_rate": 1.1398939304655273e-06,
      "loss": 5.3042,
      "mean_token_accuracy": 0.31338351368904116,
      "num_tokens": 4113326.0,
      "step": 3850
    },
    {
      "entropy": 5.328452725410461,
      "epoch": 2.245250431778929,
      "grad_norm": 3.2843880653381348,
      "learning_rate": 1.1281084266352386e-06,
      "loss": 5.1624,
      "mean_token_accuracy": 0.3305218696594238,
      "num_tokens": 4165454.0,
      "step": 3900
    },
    {
      "entropy": 5.383157343864441,
      "epoch": 2.2740356937248127,
      "grad_norm": 2.207082748413086,
      "learning_rate": 1.1163229228049497e-06,
      "loss": 5.2163,
      "mean_token_accuracy": 0.32331310987472534,
      "num_tokens": 4219250.0,
      "step": 3950
    },
    {
      "entropy": 5.585261764526368,
      "epoch": 2.3028209556706964,
      "grad_norm": 2.7102835178375244,
      "learning_rate": 1.104537418974661e-06,
      "loss": 5.4137,
      "mean_token_accuracy": 0.29959124475717547,
      "num_tokens": 4274711.0,
      "step": 4000
    },
    {
      "entropy": 5.434073266983032,
      "epoch": 2.33160621761658,
      "grad_norm": 1.3775779008865356,
      "learning_rate": 1.0927519151443724e-06,
      "loss": 5.2644,
      "mean_token_accuracy": 0.3175011593103409,
      "num_tokens": 4328616.0,
      "step": 4050
    },
    {
      "entropy": 5.462391858100891,
      "epoch": 2.360391479562464,
      "grad_norm": 1.4101024866104126,
      "learning_rate": 1.0809664113140838e-06,
      "loss": 5.2924,
      "mean_token_accuracy": 0.3137941011786461,
      "num_tokens": 4382416.0,
      "step": 4100
    },
    {
      "entropy": 5.529892563819885,
      "epoch": 2.3891767415083476,
      "grad_norm": 1.2311837673187256,
      "learning_rate": 1.0691809074837949e-06,
      "loss": 5.364,
      "mean_token_accuracy": 0.3046491605043411,
      "num_tokens": 4437848.0,
      "step": 4150
    },
    {
      "entropy": 5.4370484542846675,
      "epoch": 2.4179620034542313,
      "grad_norm": 1.0929864645004272,
      "learning_rate": 1.0573954036535062e-06,
      "loss": 5.2734,
      "mean_token_accuracy": 0.3169013774394989,
      "num_tokens": 4491185.0,
      "step": 4200
    },
    {
      "entropy": 5.395377616882325,
      "epoch": 2.446747265400115,
      "grad_norm": 1.5457273721694946,
      "learning_rate": 1.0456098998232174e-06,
      "loss": 5.2276,
      "mean_token_accuracy": 0.32221508473157884,
      "num_tokens": 4544086.0,
      "step": 4250
    },
    {
      "entropy": 5.443737335205078,
      "epoch": 2.4755325273459987,
      "grad_norm": 1.4844346046447754,
      "learning_rate": 1.0338243959929287e-06,
      "loss": 5.2786,
      "mean_token_accuracy": 0.3157751387357712,
      "num_tokens": 4597677.0,
      "step": 4300
    },
    {
      "entropy": 5.419876251220703,
      "epoch": 2.5043177892918824,
      "grad_norm": 1.2481963634490967,
      "learning_rate": 1.02203889216264e-06,
      "loss": 5.2564,
      "mean_token_accuracy": 0.31889803290367125,
      "num_tokens": 4651343.0,
      "step": 4350
    },
    {
      "entropy": 5.578677978515625,
      "epoch": 2.533103051237766,
      "grad_norm": 2.0005414485931396,
      "learning_rate": 1.0102533883323512e-06,
      "loss": 5.4145,
      "mean_token_accuracy": 0.30037090003490446,
      "num_tokens": 4705985.0,
      "step": 4400
    },
    {
      "entropy": 5.279946126937866,
      "epoch": 2.56188831318365,
      "grad_norm": 1.080521821975708,
      "learning_rate": 9.984678845020625e-07,
      "loss": 5.1226,
      "mean_token_accuracy": 0.3341303279995918,
      "num_tokens": 4757741.0,
      "step": 4450
    },
    {
      "entropy": 5.551463279724121,
      "epoch": 2.5906735751295336,
      "grad_norm": 1.28898024559021,
      "learning_rate": 9.866823806717736e-07,
      "loss": 5.3832,
      "mean_token_accuracy": 0.3028248634934425,
      "num_tokens": 4812808.0,
      "step": 4500
    },
    {
      "entropy": 5.3787487554550175,
      "epoch": 2.6194588370754173,
      "grad_norm": 1.5697983503341675,
      "learning_rate": 9.74896876841485e-07,
      "loss": 5.2141,
      "mean_token_accuracy": 0.3227942296862602,
      "num_tokens": 4866572.0,
      "step": 4550
    },
    {
      "entropy": 5.460358958244324,
      "epoch": 2.648244099021301,
      "grad_norm": 1.3180441856384277,
      "learning_rate": 9.63111373011196e-07,
      "loss": 5.2954,
      "mean_token_accuracy": 0.31269474506378175,
      "num_tokens": 4921312.0,
      "step": 4600
    },
    {
      "entropy": 5.434084935188293,
      "epoch": 2.6770293609671847,
      "grad_norm": 1.2409590482711792,
      "learning_rate": 9.513258691809074e-07,
      "loss": 5.271,
      "mean_token_accuracy": 0.3172155100107193,
      "num_tokens": 4974289.0,
      "step": 4650
    },
    {
      "entropy": 5.406955418586731,
      "epoch": 2.7058146229130684,
      "grad_norm": 1.4782609939575195,
      "learning_rate": 9.395403653506187e-07,
      "loss": 5.2473,
      "mean_token_accuracy": 0.32031788885593415,
      "num_tokens": 5028149.0,
      "step": 4700
    },
    {
      "entropy": 5.206603040695191,
      "epoch": 2.734599884858952,
      "grad_norm": 2.351633071899414,
      "learning_rate": 9.2775486152033e-07,
      "loss": 5.0478,
      "mean_token_accuracy": 0.3428420132398605,
      "num_tokens": 5079349.0,
      "step": 4750
    },
    {
      "entropy": 5.388812799453735,
      "epoch": 2.763385146804836,
      "grad_norm": 7.564618110656738,
      "learning_rate": 9.159693576900412e-07,
      "loss": 5.2281,
      "mean_token_accuracy": 0.3222071170806885,
      "num_tokens": 5132564.0,
      "step": 4800
    },
    {
      "entropy": 5.374106278419495,
      "epoch": 2.7921704087507195,
      "grad_norm": 1.4734679460525513,
      "learning_rate": 9.041838538597525e-07,
      "loss": 5.2161,
      "mean_token_accuracy": 0.3219477406144142,
      "num_tokens": 5185921.0,
      "step": 4850
    },
    {
      "entropy": 5.232998585700988,
      "epoch": 2.8209556706966032,
      "grad_norm": 1.4175471067428589,
      "learning_rate": 8.923983500294637e-07,
      "loss": 5.0769,
      "mean_token_accuracy": 0.3403926733136177,
      "num_tokens": 5237521.0,
      "step": 4900
    },
    {
      "entropy": 5.394891719818116,
      "epoch": 2.849740932642487,
      "grad_norm": 4.951873779296875,
      "learning_rate": 8.806128461991749e-07,
      "loss": 5.2344,
      "mean_token_accuracy": 0.3213117456436157,
      "num_tokens": 5291104.0,
      "step": 4950
    },
    {
      "entropy": 5.413805012702942,
      "epoch": 2.8785261945883707,
      "grad_norm": 1.679518461227417,
      "learning_rate": 8.688273423688863e-07,
      "loss": 5.2597,
      "mean_token_accuracy": 0.3165634173154831,
      "num_tokens": 5345058.0,
      "step": 5000
    },
    {
      "entropy": 5.256177935600281,
      "epoch": 2.9073114565342544,
      "grad_norm": 1.8892916440963745,
      "learning_rate": 8.570418385385975e-07,
      "loss": 5.1004,
      "mean_token_accuracy": 0.3369427987933159,
      "num_tokens": 5395918.0,
      "step": 5050
    },
    {
      "entropy": 5.259814453125,
      "epoch": 2.936096718480138,
      "grad_norm": 1.3802675008773804,
      "learning_rate": 8.452563347083087e-07,
      "loss": 5.1057,
      "mean_token_accuracy": 0.3362414276599884,
      "num_tokens": 5448086.0,
      "step": 5100
    },
    {
      "entropy": 5.416206178665161,
      "epoch": 2.964881980426022,
      "grad_norm": 1.7677236795425415,
      "learning_rate": 8.3347083087802e-07,
      "loss": 5.2562,
      "mean_token_accuracy": 0.31725785195827483,
      "num_tokens": 5501959.0,
      "step": 5150
    },
    {
      "entropy": 5.507337794303894,
      "epoch": 2.9936672423719055,
      "grad_norm": 1.021727442741394,
      "learning_rate": 8.216853270477313e-07,
      "loss": 5.344,
      "mean_token_accuracy": 0.30679062128067014,
      "num_tokens": 5557908.0,
      "step": 5200
    },
    {
      "epoch": 3.0,
      "eval_entropy": 5.682707933786278,
      "eval_loss": 5.53223991394043,
      "eval_mean_token_accuracy": 0.27747743456594404,
      "eval_model_preparation_time": 0.0045,
      "eval_num_tokens": 5569086.0,
      "eval_runtime": 49.9944,
      "eval_samples_per_second": 8.681,
      "eval_steps_per_second": 4.34,
      "step": 5211
    },
    {
      "entropy": 5.209756035804748,
      "epoch": 3.0224525043177892,
      "grad_norm": 1.725786566734314,
      "learning_rate": 8.098998232174425e-07,
      "loss": 5.0541,
      "mean_token_accuracy": 0.34166110813617706,
      "num_tokens": 5608917.0,
      "step": 5250
    },
    {
      "entropy": 5.396296281814575,
      "epoch": 3.051237766263673,
      "grad_norm": 0.7720207571983337,
      "learning_rate": 7.981143193871538e-07,
      "loss": 5.2337,
      "mean_token_accuracy": 0.32116260558366777,
      "num_tokens": 5662712.0,
      "step": 5300
    },
    {
      "entropy": 5.341518473625183,
      "epoch": 3.0800230282095566,
      "grad_norm": 2.2686808109283447,
      "learning_rate": 7.86328815556865e-07,
      "loss": 5.1824,
      "mean_token_accuracy": 0.32726580530405047,
      "num_tokens": 5715921.0,
      "step": 5350
    },
    {
      "entropy": 5.376176896095276,
      "epoch": 3.1088082901554404,
      "grad_norm": 1.2420796155929565,
      "learning_rate": 7.745433117265762e-07,
      "loss": 5.2162,
      "mean_token_accuracy": 0.32142678707838057,
      "num_tokens": 5769436.0,
      "step": 5400
    },
    {
      "entropy": 5.4553061914443965,
      "epoch": 3.137593552101324,
      "grad_norm": 1.2402859926223755,
      "learning_rate": 7.627578078962876e-07,
      "loss": 5.2971,
      "mean_token_accuracy": 0.31396267503499986,
      "num_tokens": 5823126.0,
      "step": 5450
    },
    {
      "entropy": 5.385247969627381,
      "epoch": 3.166378814047208,
      "grad_norm": 1.112062931060791,
      "learning_rate": 7.509723040659988e-07,
      "loss": 5.2324,
      "mean_token_accuracy": 0.3207343602180481,
      "num_tokens": 5875751.0,
      "step": 5500
    },
    {
      "entropy": 5.55422221660614,
      "epoch": 3.1951640759930915,
      "grad_norm": 1.5440446138381958,
      "learning_rate": 7.3918680023571e-07,
      "loss": 5.3902,
      "mean_token_accuracy": 0.3006985321640968,
      "num_tokens": 5932163.0,
      "step": 5550
    },
    {
      "entropy": 5.403217372894287,
      "epoch": 3.223949337938975,
      "grad_norm": 0.8481096625328064,
      "learning_rate": 7.274012964054213e-07,
      "loss": 5.2417,
      "mean_token_accuracy": 0.3210747820138931,
      "num_tokens": 5985889.0,
      "step": 5600
    },
    {
      "entropy": 5.388293180465698,
      "epoch": 3.252734599884859,
      "grad_norm": 0.9305989146232605,
      "learning_rate": 7.156157925751326e-07,
      "loss": 5.2319,
      "mean_token_accuracy": 0.3206030324101448,
      "num_tokens": 6040052.0,
      "step": 5650
    },
    {
      "entropy": 5.401709322929382,
      "epoch": 3.2815198618307426,
      "grad_norm": 0.8080459237098694,
      "learning_rate": 7.038302887448438e-07,
      "loss": 5.2438,
      "mean_token_accuracy": 0.3199671137332916,
      "num_tokens": 6092350.0,
      "step": 5700
    },
    {
      "entropy": 5.4320423412323,
      "epoch": 3.3103051237766263,
      "grad_norm": 1.9186089038848877,
      "learning_rate": 6.920447849145551e-07,
      "loss": 5.2696,
      "mean_token_accuracy": 0.31657984614372253,
      "num_tokens": 6146112.0,
      "step": 5750
    },
    {
      "entropy": 5.276471285820008,
      "epoch": 3.33909038572251,
      "grad_norm": 1.032879114151001,
      "learning_rate": 6.802592810842663e-07,
      "loss": 5.1224,
      "mean_token_accuracy": 0.3347566506266594,
      "num_tokens": 6197916.0,
      "step": 5800
    },
    {
      "entropy": 5.122317051887512,
      "epoch": 3.3678756476683938,
      "grad_norm": 3.156858444213867,
      "learning_rate": 6.684737772539775e-07,
      "loss": 4.9706,
      "mean_token_accuracy": 0.35455317378044127,
      "num_tokens": 6247565.0,
      "step": 5850
    },
    {
      "entropy": 5.346597375869751,
      "epoch": 3.3966609096142775,
      "grad_norm": 1.2619549036026,
      "learning_rate": 6.566882734236889e-07,
      "loss": 5.1902,
      "mean_token_accuracy": 0.3258721518516541,
      "num_tokens": 6300481.0,
      "step": 5900
    },
    {
      "entropy": 5.413151068687439,
      "epoch": 3.425446171560161,
      "grad_norm": 1.801740050315857,
      "learning_rate": 6.449027695934001e-07,
      "loss": 5.2513,
      "mean_token_accuracy": 0.3187857499718666,
      "num_tokens": 6353098.0,
      "step": 5950
    },
    {
      "entropy": 5.464186942577362,
      "epoch": 3.454231433506045,
      "grad_norm": 1.6306997537612915,
      "learning_rate": 6.331172657631113e-07,
      "loss": 5.3043,
      "mean_token_accuracy": 0.31154109388589857,
      "num_tokens": 6407984.0,
      "step": 6000
    },
    {
      "entropy": 5.401795778274536,
      "epoch": 3.4830166954519286,
      "grad_norm": 1.1694583892822266,
      "learning_rate": 6.213317619328226e-07,
      "loss": 5.2427,
      "mean_token_accuracy": 0.31954523265361784,
      "num_tokens": 6461854.0,
      "step": 6050
    },
    {
      "entropy": 5.317689285278321,
      "epoch": 3.5118019573978123,
      "grad_norm": 0.9361855387687683,
      "learning_rate": 6.095462581025339e-07,
      "loss": 5.1588,
      "mean_token_accuracy": 0.330586878657341,
      "num_tokens": 6514882.0,
      "step": 6100
    },
    {
      "entropy": 5.478708257675171,
      "epoch": 3.540587219343696,
      "grad_norm": 1.05711030960083,
      "learning_rate": 5.977607542722451e-07,
      "loss": 5.321,
      "mean_token_accuracy": 0.3104448106884956,
      "num_tokens": 6569455.0,
      "step": 6150
    },
    {
      "entropy": 5.309361801147461,
      "epoch": 3.5693724812895797,
      "grad_norm": 1.3499550819396973,
      "learning_rate": 5.859752504419564e-07,
      "loss": 5.153,
      "mean_token_accuracy": 0.331512533724308,
      "num_tokens": 6621734.0,
      "step": 6200
    },
    {
      "entropy": 5.296572666168213,
      "epoch": 3.5981577432354634,
      "grad_norm": 1.940708875656128,
      "learning_rate": 5.741897466116676e-07,
      "loss": 5.14,
      "mean_token_accuracy": 0.3299832499027252,
      "num_tokens": 6674994.0,
      "step": 6250
    },
    {
      "entropy": 5.544284400939941,
      "epoch": 3.626943005181347,
      "grad_norm": 1.8903827667236328,
      "learning_rate": 5.624042427813788e-07,
      "loss": 5.3885,
      "mean_token_accuracy": 0.3016947290301323,
      "num_tokens": 6730674.0,
      "step": 6300
    },
    {
      "entropy": 5.333053431510925,
      "epoch": 3.655728267127231,
      "grad_norm": 1.1618578433990479,
      "learning_rate": 5.506187389510902e-07,
      "loss": 5.1781,
      "mean_token_accuracy": 0.3275001719594002,
      "num_tokens": 6784235.0,
      "step": 6350
    },
    {
      "entropy": 5.4938449716568,
      "epoch": 3.6845135290731146,
      "grad_norm": 1.384329080581665,
      "learning_rate": 5.388332351208014e-07,
      "loss": 5.3399,
      "mean_token_accuracy": 0.3068840709328651,
      "num_tokens": 6839590.0,
      "step": 6400
    },
    {
      "entropy": 5.277545223236084,
      "epoch": 3.7132987910189983,
      "grad_norm": 1.8918265104293823,
      "learning_rate": 5.270477312905126e-07,
      "loss": 5.1221,
      "mean_token_accuracy": 0.33364981949329375,
      "num_tokens": 6891301.0,
      "step": 6450
    },
    {
      "entropy": 5.40100293636322,
      "epoch": 3.742084052964882,
      "grad_norm": 1.6968809366226196,
      "learning_rate": 5.152622274602239e-07,
      "loss": 5.2471,
      "mean_token_accuracy": 0.31912936180830004,
      "num_tokens": 6945510.0,
      "step": 6500
    },
    {
      "entropy": 5.561220169067383,
      "epoch": 3.7708693149107657,
      "grad_norm": 2.066960573196411,
      "learning_rate": 5.034767236299352e-07,
      "loss": 5.4026,
      "mean_token_accuracy": 0.2984810543060303,
      "num_tokens": 7001870.0,
      "step": 6550
    },
    {
      "entropy": 5.3108087682724,
      "epoch": 3.7996545768566494,
      "grad_norm": 1.6065007448196411,
      "learning_rate": 4.916912197996464e-07,
      "loss": 5.155,
      "mean_token_accuracy": 0.3304683968424797,
      "num_tokens": 7053974.0,
      "step": 6600
    },
    {
      "entropy": 5.323807754516602,
      "epoch": 3.828439838802533,
      "grad_norm": 2.6806318759918213,
      "learning_rate": 4.799057159693577e-07,
      "loss": 5.1653,
      "mean_token_accuracy": 0.3294159671664238,
      "num_tokens": 7107061.0,
      "step": 6650
    },
    {
      "entropy": 5.4716163873672485,
      "epoch": 3.857225100748417,
      "grad_norm": 1.8264856338500977,
      "learning_rate": 4.6812021213906895e-07,
      "loss": 5.3124,
      "mean_token_accuracy": 0.3109353107213974,
      "num_tokens": 7161697.0,
      "step": 6700
    },
    {
      "entropy": 5.382365622520447,
      "epoch": 3.8860103626943006,
      "grad_norm": 0.9954923987388611,
      "learning_rate": 4.563347083087802e-07,
      "loss": 5.2237,
      "mean_token_accuracy": 0.32161149621009827,
      "num_tokens": 7215524.0,
      "step": 6750
    },
    {
      "entropy": 5.277496585845947,
      "epoch": 3.9147956246401843,
      "grad_norm": 1.267786979675293,
      "learning_rate": 4.445492044784914e-07,
      "loss": 5.1265,
      "mean_token_accuracy": 0.3319795566797257,
      "num_tokens": 7267329.0,
      "step": 6800
    },
    {
      "entropy": 5.550942025184631,
      "epoch": 3.943580886586068,
      "grad_norm": 0.9425063133239746,
      "learning_rate": 4.3276370064820265e-07,
      "loss": 5.3898,
      "mean_token_accuracy": 0.30050904959440233,
      "num_tokens": 7324070.0,
      "step": 6850
    },
    {
      "entropy": 5.125799627304077,
      "epoch": 3.9723661485319517,
      "grad_norm": 5.447021007537842,
      "learning_rate": 4.20978196817914e-07,
      "loss": 4.9781,
      "mean_token_accuracy": 0.3520450854301453,
      "num_tokens": 7375083.0,
      "step": 6900
    },
    {
      "epoch": 4.0,
      "eval_entropy": 5.6681923492712905,
      "eval_loss": 5.525067329406738,
      "eval_mean_token_accuracy": 0.2779707208893816,
      "eval_model_preparation_time": 0.0045,
      "eval_num_tokens": 7425448.0,
      "eval_runtime": 49.7944,
      "eval_samples_per_second": 8.716,
      "eval_steps_per_second": 4.358,
      "step": 6948
    }
  ],
  "logging_steps": 50,
  "max_steps": 8685,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.016969752533504e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}