{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.666851903306474, "eval_steps": 500, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 4.080089821666479, "epoch": 0.04445679355376494, "grad_norm": 2.7002620697021484, "learning_rate": 6.61764705882353e-05, "loss": 4.8247, "mean_token_accuracy": 0.26705394631717355, "num_tokens": 163434.0, "step": 10 }, { "entropy": 2.475909074395895, "epoch": 0.08891358710752988, "grad_norm": 0.7268410325050354, "learning_rate": 0.00013970588235294118, "loss": 2.7008, "mean_token_accuracy": 0.4736104167997837, "num_tokens": 326768.0, "step": 20 }, { "entropy": 2.4751133039593696, "epoch": 0.1333703806612948, "grad_norm": 0.6923421025276184, "learning_rate": 0.00021323529411764705, "loss": 2.527, "mean_token_accuracy": 0.4972056133672595, "num_tokens": 489268.0, "step": 30 }, { "entropy": 2.4025521516799926, "epoch": 0.17782717421505975, "grad_norm": 0.7557116746902466, "learning_rate": 0.0002867647058823529, "loss": 2.4375, "mean_token_accuracy": 0.5120668156072498, "num_tokens": 652245.0, "step": 40 }, { "entropy": 2.3454058580100536, "epoch": 0.22228396776882467, "grad_norm": 0.6406869292259216, "learning_rate": 0.0003602941176470588, "loss": 2.3741, "mean_token_accuracy": 0.5260033827275038, "num_tokens": 815544.0, "step": 50 }, { "entropy": 2.2708897083997726, "epoch": 0.2667407613225896, "grad_norm": 0.8530856370925903, "learning_rate": 0.0004338235294117647, "loss": 2.3009, "mean_token_accuracy": 0.5377379301935434, "num_tokens": 978734.0, "step": 60 }, { "entropy": 2.216454330831766, "epoch": 0.31119755487635453, "grad_norm": 0.6344852447509766, "learning_rate": 0.0004999966516456925, "loss": 2.2346, "mean_token_accuracy": 0.5448098734021187, "num_tokens": 1142045.0, "step": 70 }, { "entropy": 2.2090395711362363, "epoch": 0.3556543484301195, "grad_norm": 0.6977742910385132, "learning_rate": 0.000499594957644556, "loss": 2.2313, "mean_token_accuracy": 0.5475962175056338, "num_tokens": 1305101.0, "step": 80 }, { "entropy": 2.1481428153812887, "epoch": 0.4001111419838844, "grad_norm": 0.9081956148147583, "learning_rate": 0.0004985248255012393, "loss": 2.1787, "mean_token_accuracy": 0.5571485862135888, "num_tokens": 1468499.0, "step": 90 }, { "entropy": 2.126115245744586, "epoch": 0.44456793553764934, "grad_norm": 0.8921639323234558, "learning_rate": 0.0004967891211275719, "loss": 2.1385, "mean_token_accuracy": 0.5636989219114185, "num_tokens": 1631775.0, "step": 100 }, { "entropy": 2.0657279253005982, "epoch": 0.48902472909141426, "grad_norm": 0.7108869552612305, "learning_rate": 0.0004943924928987371, "loss": 2.0797, "mean_token_accuracy": 0.5761262526735663, "num_tokens": 1795003.0, "step": 110 }, { "entropy": 2.042630685120821, "epoch": 0.5334815226451792, "grad_norm": 0.9017110466957092, "learning_rate": 0.0004913413592044966, "loss": 2.074, "mean_token_accuracy": 0.576865964755416, "num_tokens": 1958262.0, "step": 120 }, { "entropy": 1.9980590514838696, "epoch": 0.5779383161989442, "grad_norm": 0.984071671962738, "learning_rate": 0.0004876438912601562, "loss": 2.0242, "mean_token_accuracy": 0.5904566749930382, "num_tokens": 2120485.0, "step": 130 }, { "entropy": 2.056968325190246, "epoch": 0.6223951097527091, "grad_norm": 0.9656462669372559, "learning_rate": 0.0004833099912233028, "loss": 2.0646, "mean_token_accuracy": 0.582532343827188, "num_tokens": 2282892.0, "step": 140 }, { "entropy": 1.9450239988043905, "epoch": 0.666851903306474, "grad_norm": 0.8873224854469299, "learning_rate": 0.00047835126567492, "loss": 1.9611, "mean_token_accuracy": 0.5966836862266064, "num_tokens": 2446252.0, "step": 150 }, { "entropy": 2.0017862655222416, "epoch": 0.711308696860239, "grad_norm": 0.9066752791404724, "learning_rate": 0.00047278099453590216, "loss": 2.0178, "mean_token_accuracy": 0.5848092636093497, "num_tokens": 2609666.0, "step": 160 }, { "entropy": 1.8802081950008869, "epoch": 0.7557654904140039, "grad_norm": 0.8870635032653809, "learning_rate": 0.0004666140955022107, "loss": 1.9088, "mean_token_accuracy": 0.610439893975854, "num_tokens": 2772216.0, "step": 170 }, { "entropy": 1.9238170295953751, "epoch": 0.8002222839677688, "grad_norm": 0.8861916065216064, "learning_rate": 0.0004598670840939184, "loss": 1.9254, "mean_token_accuracy": 0.6038264224305749, "num_tokens": 2935027.0, "step": 180 }, { "entropy": 1.779969944804907, "epoch": 0.8446790775215337, "grad_norm": 0.9129126071929932, "learning_rate": 0.0004525580294251356, "loss": 1.7986, "mean_token_accuracy": 0.6294678594917059, "num_tokens": 3097530.0, "step": 190 }, { "entropy": 1.7979737279005348, "epoch": 0.8891358710752987, "grad_norm": 0.8641685843467712, "learning_rate": 0.0004447065058132684, "loss": 1.8191, "mean_token_accuracy": 0.6282314421609044, "num_tokens": 3260036.0, "step": 200 }, { "entropy": 1.8591997269541025, "epoch": 0.9335926646290637, "grad_norm": 1.096100091934204, "learning_rate": 0.00043633354035720506, "loss": 1.8622, "mean_token_accuracy": 0.6153368357568979, "num_tokens": 3423249.0, "step": 210 }, { "entropy": 1.73656656621024, "epoch": 0.9780494581828285, "grad_norm": 0.8999007940292358, "learning_rate": 0.0004274615566248206, "loss": 1.7403, "mean_token_accuracy": 0.6391685012727976, "num_tokens": 3586732.0, "step": 220 }, { "entropy": 1.7403309533618532, "epoch": 1.0222283967768824, "grad_norm": 0.9680184721946716, "learning_rate": 0.00041811431460060926, "loss": 1.7414, "mean_token_accuracy": 0.6385732677372746, "num_tokens": 3749001.0, "step": 230 }, { "entropy": 1.6463226695545017, "epoch": 1.0666851903306473, "grad_norm": 1.188348650932312, "learning_rate": 0.00040831684705427203, "loss": 1.6575, "mean_token_accuracy": 0.653388250619173, "num_tokens": 3912441.0, "step": 240 }, { "entropy": 1.6219884321093558, "epoch": 1.1111419838844123, "grad_norm": 0.9402503967285156, "learning_rate": 0.00039809539250066717, "loss": 1.616, "mean_token_accuracy": 0.6620589151978493, "num_tokens": 4074498.0, "step": 250 }, { "entropy": 1.5720586974173785, "epoch": 1.1555987774381773, "grad_norm": 1.0742427110671997, "learning_rate": 0.00038747732493066647, "loss": 1.5702, "mean_token_accuracy": 0.6679834071546793, "num_tokens": 4237249.0, "step": 260 }, { "entropy": 1.6063256619498134, "epoch": 1.2000555709919423, "grad_norm": 1.0826376676559448, "learning_rate": 0.0003764910805011017, "loss": 1.6076, "mean_token_accuracy": 0.6603890936821699, "num_tokens": 4400673.0, "step": 270 }, { "entropy": 1.6607086526229977, "epoch": 1.2445123645457072, "grad_norm": 1.1589428186416626, "learning_rate": 0.0003651660813801344, "loss": 1.6578, "mean_token_accuracy": 0.652839015610516, "num_tokens": 4563571.0, "step": 280 }, { "entropy": 1.5815417014062405, "epoch": 1.288969158099472, "grad_norm": 1.130180835723877, "learning_rate": 0.0003535326569519959, "loss": 1.5775, "mean_token_accuracy": 0.6679099775850773, "num_tokens": 4726859.0, "step": 290 }, { "entropy": 1.6092839901335538, "epoch": 1.333425951653237, "grad_norm": 0.9171048402786255, "learning_rate": 0.0003416219625921204, "loss": 1.6241, "mean_token_accuracy": 0.656397813372314, "num_tokens": 4889977.0, "step": 300 }, { "entropy": 1.5430887764319778, "epoch": 1.377882745207002, "grad_norm": 1.2212048768997192, "learning_rate": 0.00032946589623019827, "loss": 1.5316, "mean_token_accuracy": 0.6758351223543286, "num_tokens": 5053344.0, "step": 310 }, { "entropy": 1.5891861728392542, "epoch": 1.422339538760767, "grad_norm": 1.2055883407592773, "learning_rate": 0.00031709701292460037, "loss": 1.5768, "mean_token_accuracy": 0.6684434033930302, "num_tokens": 5216641.0, "step": 320 }, { "entropy": 1.4777619161643087, "epoch": 1.4667963323145319, "grad_norm": 1.1511977910995483, "learning_rate": 0.00030454843767695194, "loss": 1.4737, "mean_token_accuracy": 0.6856943031772971, "num_tokens": 5379690.0, "step": 330 }, { "entropy": 1.536539927031845, "epoch": 1.5112531258682966, "grad_norm": 1.1793866157531738, "learning_rate": 0.0002918537767203464, "loss": 1.5243, "mean_token_accuracy": 0.6775767827406526, "num_tokens": 5543019.0, "step": 340 }, { "entropy": 1.4959282116033137, "epoch": 1.5557099194220618, "grad_norm": 1.001905083656311, "learning_rate": 0.0002790470275187772, "loss": 1.504, "mean_token_accuracy": 0.6810265580192209, "num_tokens": 5705946.0, "step": 350 }, { "entropy": 1.5315143384039402, "epoch": 1.6001667129758266, "grad_norm": 1.0786097049713135, "learning_rate": 0.00026616248771881706, "loss": 1.5182, "mean_token_accuracy": 0.6769340887665749, "num_tokens": 5869367.0, "step": 360 }, { "entropy": 1.4619380568154157, "epoch": 1.6446235065295916, "grad_norm": 1.1511582136154175, "learning_rate": 0.0002532346632973818, "loss": 1.4542, "mean_token_accuracy": 0.6922258980572223, "num_tokens": 6032042.0, "step": 370 }, { "entropy": 1.3948858159594237, "epoch": 1.6890803000833565, "grad_norm": 1.3182344436645508, "learning_rate": 0.00024029817615156491, "loss": 1.3733, "mean_token_accuracy": 0.7100105246528983, "num_tokens": 6194524.0, "step": 380 }, { "entropy": 1.502246926818043, "epoch": 1.7335370936371213, "grad_norm": 1.149990200996399, "learning_rate": 0.0002273876713780292, "loss": 1.5031, "mean_token_accuracy": 0.6846974194049835, "num_tokens": 6358008.0, "step": 390 }, { "entropy": 1.4208066834136843, "epoch": 1.7779938871908865, "grad_norm": 1.1152338981628418, "learning_rate": 0.00021453772449026595, "loss": 1.4044, "mean_token_accuracy": 0.7048833057284355, "num_tokens": 6519678.0, "step": 400 }, { "entropy": 1.4979191770311444, "epoch": 1.8224506807446512, "grad_norm": 1.0883276462554932, "learning_rate": 0.00020178274882220573, "loss": 1.4849, "mean_token_accuracy": 0.6878569139167666, "num_tokens": 6683001.0, "step": 410 }, { "entropy": 1.4353665138129146, "epoch": 1.8669074742984162, "grad_norm": 0.9934560656547546, "learning_rate": 0.00018915690336615977, "loss": 1.4027, "mean_token_accuracy": 0.7004664979875088, "num_tokens": 6846358.0, "step": 420 }, { "entropy": 1.3516410222277044, "epoch": 1.9113642678521812, "grad_norm": 1.3949309587478638, "learning_rate": 0.00017669400129191205, "loss": 1.3303, "mean_token_accuracy": 0.7134283676743507, "num_tokens": 7009309.0, "step": 430 }, { "entropy": 1.303240181831643, "epoch": 1.9558210614059461, "grad_norm": 0.9951959252357483, "learning_rate": 0.00016442741939195614, "loss": 1.2784, "mean_token_accuracy": 0.7300505785271525, "num_tokens": 7172608.0, "step": 440 }, { "entropy": 1.3642704431084718, "epoch": 2.0, "grad_norm": 1.0989266633987427, "learning_rate": 0.0001523900086953891, "loss": 1.3297, "mean_token_accuracy": 0.7180480212910371, "num_tokens": 7334830.0, "step": 450 }, { "entropy": 1.312969586532563, "epoch": 2.0444567935537648, "grad_norm": 1.2978543043136597, "learning_rate": 0.0001406140064898494, "loss": 1.2625, "mean_token_accuracy": 0.7279225319623948, "num_tokens": 7497148.0, "step": 460 }, { "entropy": 1.2975676921661943, "epoch": 2.08891358710753, "grad_norm": 1.0483146905899048, "learning_rate": 0.00012913094998710944, "loss": 1.2614, "mean_token_accuracy": 0.7284373817965388, "num_tokens": 7660378.0, "step": 470 }, { "entropy": 1.2763791465200485, "epoch": 2.1333703806612947, "grad_norm": 1.0130515098571777, "learning_rate": 0.00011797159186353621, "loss": 1.2505, "mean_token_accuracy": 0.733047577738762, "num_tokens": 7823678.0, "step": 480 }, { "entropy": 1.2947511278558523, "epoch": 2.17782717421506, "grad_norm": 1.1858347654342651, "learning_rate": 0.0001071658179016087, "loss": 1.2613, "mean_token_accuracy": 0.729880428686738, "num_tokens": 7987185.0, "step": 490 }, { "entropy": 1.2410555441398174, "epoch": 2.2222839677688246, "grad_norm": 1.012299656867981, "learning_rate": 9.674256695305733e-05, "loss": 1.1959, "mean_token_accuracy": 0.7429393958300352, "num_tokens": 8150314.0, "step": 500 }, { "entropy": 1.2186435043811799, "epoch": 2.2667407613225894, "grad_norm": 1.2160053253173828, "learning_rate": 8.67297534379701e-05, "loss": 1.1706, "mean_token_accuracy": 0.7483966367319226, "num_tokens": 8313597.0, "step": 510 }, { "entropy": 1.2957685018423946, "epoch": 2.3111975548763546, "grad_norm": 1.0054808855056763, "learning_rate": 7.715419258742051e-05, "loss": 1.2519, "mean_token_accuracy": 0.7310865389183163, "num_tokens": 8476942.0, "step": 520 }, { "entropy": 1.1714724350254984, "epoch": 2.3556543484301193, "grad_norm": 0.9889060854911804, "learning_rate": 6.804152862982438e-05, "loss": 1.1277, "mean_token_accuracy": 0.7567101055756211, "num_tokens": 8640389.0, "step": 530 }, { "entropy": 1.2453054987825454, "epoch": 2.4001111419838845, "grad_norm": 1.033257246017456, "learning_rate": 5.941616611334891e-05, "loss": 1.2081, "mean_token_accuracy": 0.7403238713741302, "num_tokens": 8803672.0, "step": 540 }, { "entropy": 1.1717943134717643, "epoch": 2.4445679355376493, "grad_norm": 1.0136687755584717, "learning_rate": 5.130120454829887e-05, "loss": 1.1236, "mean_token_accuracy": 0.7593533847481012, "num_tokens": 8965555.0, "step": 550 }, { "entropy": 1.165727266203612, "epoch": 2.4890247290914145, "grad_norm": 0.9948190450668335, "learning_rate": 4.3718376544513937e-05, "loss": 1.1158, "mean_token_accuracy": 0.760404358804226, "num_tokens": 9129027.0, "step": 560 }, { "entropy": 1.2672561651095748, "epoch": 2.5334815226451792, "grad_norm": 1.116560697555542, "learning_rate": 3.6687989609449965e-05, "loss": 1.229, "mean_token_accuracy": 0.7357183141633868, "num_tokens": 9291052.0, "step": 570 }, { "entropy": 1.2553348348941653, "epoch": 2.577938316198944, "grad_norm": 1.05418062210083, "learning_rate": 3.022887176281547e-05, "loss": 1.2094, "mean_token_accuracy": 0.740975316427648, "num_tokens": 9454236.0, "step": 580 }, { "entropy": 1.232571629062295, "epoch": 2.622395109752709, "grad_norm": 1.0469932556152344, "learning_rate": 2.435832111341152e-05, "loss": 1.1916, "mean_token_accuracy": 0.7476834831759334, "num_tokens": 9616388.0, "step": 590 }, { "entropy": 1.2260436179116367, "epoch": 2.666851903306474, "grad_norm": 1.0891305208206177, "learning_rate": 1.909205953321294e-05, "loss": 1.1714, "mean_token_accuracy": 0.7486020162701607, "num_tokens": 9779866.0, "step": 600 } ], "logging_steps": 10, "max_steps": 675, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.1942125322051704e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }