| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.666851903306474, | |
| "eval_steps": 500, | |
| "global_step": 600, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 4.080089821666479, | |
| "epoch": 0.04445679355376494, | |
| "grad_norm": 2.7002620697021484, | |
| "learning_rate": 6.61764705882353e-05, | |
| "loss": 4.8247, | |
| "mean_token_accuracy": 0.26705394631717355, | |
| "num_tokens": 163434.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 2.475909074395895, | |
| "epoch": 0.08891358710752988, | |
| "grad_norm": 0.7268410325050354, | |
| "learning_rate": 0.00013970588235294118, | |
| "loss": 2.7008, | |
| "mean_token_accuracy": 0.4736104167997837, | |
| "num_tokens": 326768.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 2.4751133039593696, | |
| "epoch": 0.1333703806612948, | |
| "grad_norm": 0.6923421025276184, | |
| "learning_rate": 0.00021323529411764705, | |
| "loss": 2.527, | |
| "mean_token_accuracy": 0.4972056133672595, | |
| "num_tokens": 489268.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 2.4025521516799926, | |
| "epoch": 0.17782717421505975, | |
| "grad_norm": 0.7557116746902466, | |
| "learning_rate": 0.0002867647058823529, | |
| "loss": 2.4375, | |
| "mean_token_accuracy": 0.5120668156072498, | |
| "num_tokens": 652245.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 2.3454058580100536, | |
| "epoch": 0.22228396776882467, | |
| "grad_norm": 0.6406869292259216, | |
| "learning_rate": 0.0003602941176470588, | |
| "loss": 2.3741, | |
| "mean_token_accuracy": 0.5260033827275038, | |
| "num_tokens": 815544.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 2.2708897083997726, | |
| "epoch": 0.2667407613225896, | |
| "grad_norm": 0.8530856370925903, | |
| "learning_rate": 0.0004338235294117647, | |
| "loss": 2.3009, | |
| "mean_token_accuracy": 0.5377379301935434, | |
| "num_tokens": 978734.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 2.216454330831766, | |
| "epoch": 0.31119755487635453, | |
| "grad_norm": 0.6344852447509766, | |
| "learning_rate": 0.0004999966516456925, | |
| "loss": 2.2346, | |
| "mean_token_accuracy": 0.5448098734021187, | |
| "num_tokens": 1142045.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 2.2090395711362363, | |
| "epoch": 0.3556543484301195, | |
| "grad_norm": 0.6977742910385132, | |
| "learning_rate": 0.000499594957644556, | |
| "loss": 2.2313, | |
| "mean_token_accuracy": 0.5475962175056338, | |
| "num_tokens": 1305101.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 2.1481428153812887, | |
| "epoch": 0.4001111419838844, | |
| "grad_norm": 0.9081956148147583, | |
| "learning_rate": 0.0004985248255012393, | |
| "loss": 2.1787, | |
| "mean_token_accuracy": 0.5571485862135888, | |
| "num_tokens": 1468499.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 2.126115245744586, | |
| "epoch": 0.44456793553764934, | |
| "grad_norm": 0.8921639323234558, | |
| "learning_rate": 0.0004967891211275719, | |
| "loss": 2.1385, | |
| "mean_token_accuracy": 0.5636989219114185, | |
| "num_tokens": 1631775.0, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 2.0657279253005982, | |
| "epoch": 0.48902472909141426, | |
| "grad_norm": 0.7108869552612305, | |
| "learning_rate": 0.0004943924928987371, | |
| "loss": 2.0797, | |
| "mean_token_accuracy": 0.5761262526735663, | |
| "num_tokens": 1795003.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 2.042630685120821, | |
| "epoch": 0.5334815226451792, | |
| "grad_norm": 0.9017110466957092, | |
| "learning_rate": 0.0004913413592044966, | |
| "loss": 2.074, | |
| "mean_token_accuracy": 0.576865964755416, | |
| "num_tokens": 1958262.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 1.9980590514838696, | |
| "epoch": 0.5779383161989442, | |
| "grad_norm": 0.984071671962738, | |
| "learning_rate": 0.0004876438912601562, | |
| "loss": 2.0242, | |
| "mean_token_accuracy": 0.5904566749930382, | |
| "num_tokens": 2120485.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 2.056968325190246, | |
| "epoch": 0.6223951097527091, | |
| "grad_norm": 0.9656462669372559, | |
| "learning_rate": 0.0004833099912233028, | |
| "loss": 2.0646, | |
| "mean_token_accuracy": 0.582532343827188, | |
| "num_tokens": 2282892.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 1.9450239988043905, | |
| "epoch": 0.666851903306474, | |
| "grad_norm": 0.8873224854469299, | |
| "learning_rate": 0.00047835126567492, | |
| "loss": 1.9611, | |
| "mean_token_accuracy": 0.5966836862266064, | |
| "num_tokens": 2446252.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 2.0017862655222416, | |
| "epoch": 0.711308696860239, | |
| "grad_norm": 0.9066752791404724, | |
| "learning_rate": 0.00047278099453590216, | |
| "loss": 2.0178, | |
| "mean_token_accuracy": 0.5848092636093497, | |
| "num_tokens": 2609666.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 1.8802081950008869, | |
| "epoch": 0.7557654904140039, | |
| "grad_norm": 0.8870635032653809, | |
| "learning_rate": 0.0004666140955022107, | |
| "loss": 1.9088, | |
| "mean_token_accuracy": 0.610439893975854, | |
| "num_tokens": 2772216.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 1.9238170295953751, | |
| "epoch": 0.8002222839677688, | |
| "grad_norm": 0.8861916065216064, | |
| "learning_rate": 0.0004598670840939184, | |
| "loss": 1.9254, | |
| "mean_token_accuracy": 0.6038264224305749, | |
| "num_tokens": 2935027.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 1.779969944804907, | |
| "epoch": 0.8446790775215337, | |
| "grad_norm": 0.9129126071929932, | |
| "learning_rate": 0.0004525580294251356, | |
| "loss": 1.7986, | |
| "mean_token_accuracy": 0.6294678594917059, | |
| "num_tokens": 3097530.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 1.7979737279005348, | |
| "epoch": 0.8891358710752987, | |
| "grad_norm": 0.8641685843467712, | |
| "learning_rate": 0.0004447065058132684, | |
| "loss": 1.8191, | |
| "mean_token_accuracy": 0.6282314421609044, | |
| "num_tokens": 3260036.0, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 1.8591997269541025, | |
| "epoch": 0.9335926646290637, | |
| "grad_norm": 1.096100091934204, | |
| "learning_rate": 0.00043633354035720506, | |
| "loss": 1.8622, | |
| "mean_token_accuracy": 0.6153368357568979, | |
| "num_tokens": 3423249.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 1.73656656621024, | |
| "epoch": 0.9780494581828285, | |
| "grad_norm": 0.8999007940292358, | |
| "learning_rate": 0.0004274615566248206, | |
| "loss": 1.7403, | |
| "mean_token_accuracy": 0.6391685012727976, | |
| "num_tokens": 3586732.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 1.7403309533618532, | |
| "epoch": 1.0222283967768824, | |
| "grad_norm": 0.9680184721946716, | |
| "learning_rate": 0.00041811431460060926, | |
| "loss": 1.7414, | |
| "mean_token_accuracy": 0.6385732677372746, | |
| "num_tokens": 3749001.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 1.6463226695545017, | |
| "epoch": 1.0666851903306473, | |
| "grad_norm": 1.188348650932312, | |
| "learning_rate": 0.00040831684705427203, | |
| "loss": 1.6575, | |
| "mean_token_accuracy": 0.653388250619173, | |
| "num_tokens": 3912441.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 1.6219884321093558, | |
| "epoch": 1.1111419838844123, | |
| "grad_norm": 0.9402503967285156, | |
| "learning_rate": 0.00039809539250066717, | |
| "loss": 1.616, | |
| "mean_token_accuracy": 0.6620589151978493, | |
| "num_tokens": 4074498.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 1.5720586974173785, | |
| "epoch": 1.1555987774381773, | |
| "grad_norm": 1.0742427110671997, | |
| "learning_rate": 0.00038747732493066647, | |
| "loss": 1.5702, | |
| "mean_token_accuracy": 0.6679834071546793, | |
| "num_tokens": 4237249.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 1.6063256619498134, | |
| "epoch": 1.2000555709919423, | |
| "grad_norm": 1.0826376676559448, | |
| "learning_rate": 0.0003764910805011017, | |
| "loss": 1.6076, | |
| "mean_token_accuracy": 0.6603890936821699, | |
| "num_tokens": 4400673.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 1.6607086526229977, | |
| "epoch": 1.2445123645457072, | |
| "grad_norm": 1.1589428186416626, | |
| "learning_rate": 0.0003651660813801344, | |
| "loss": 1.6578, | |
| "mean_token_accuracy": 0.652839015610516, | |
| "num_tokens": 4563571.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 1.5815417014062405, | |
| "epoch": 1.288969158099472, | |
| "grad_norm": 1.130180835723877, | |
| "learning_rate": 0.0003535326569519959, | |
| "loss": 1.5775, | |
| "mean_token_accuracy": 0.6679099775850773, | |
| "num_tokens": 4726859.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 1.6092839901335538, | |
| "epoch": 1.333425951653237, | |
| "grad_norm": 0.9171048402786255, | |
| "learning_rate": 0.0003416219625921204, | |
| "loss": 1.6241, | |
| "mean_token_accuracy": 0.656397813372314, | |
| "num_tokens": 4889977.0, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 1.5430887764319778, | |
| "epoch": 1.377882745207002, | |
| "grad_norm": 1.2212048768997192, | |
| "learning_rate": 0.00032946589623019827, | |
| "loss": 1.5316, | |
| "mean_token_accuracy": 0.6758351223543286, | |
| "num_tokens": 5053344.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 1.5891861728392542, | |
| "epoch": 1.422339538760767, | |
| "grad_norm": 1.2055883407592773, | |
| "learning_rate": 0.00031709701292460037, | |
| "loss": 1.5768, | |
| "mean_token_accuracy": 0.6684434033930302, | |
| "num_tokens": 5216641.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 1.4777619161643087, | |
| "epoch": 1.4667963323145319, | |
| "grad_norm": 1.1511977910995483, | |
| "learning_rate": 0.00030454843767695194, | |
| "loss": 1.4737, | |
| "mean_token_accuracy": 0.6856943031772971, | |
| "num_tokens": 5379690.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 1.536539927031845, | |
| "epoch": 1.5112531258682966, | |
| "grad_norm": 1.1793866157531738, | |
| "learning_rate": 0.0002918537767203464, | |
| "loss": 1.5243, | |
| "mean_token_accuracy": 0.6775767827406526, | |
| "num_tokens": 5543019.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 1.4959282116033137, | |
| "epoch": 1.5557099194220618, | |
| "grad_norm": 1.001905083656311, | |
| "learning_rate": 0.0002790470275187772, | |
| "loss": 1.504, | |
| "mean_token_accuracy": 0.6810265580192209, | |
| "num_tokens": 5705946.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 1.5315143384039402, | |
| "epoch": 1.6001667129758266, | |
| "grad_norm": 1.0786097049713135, | |
| "learning_rate": 0.00026616248771881706, | |
| "loss": 1.5182, | |
| "mean_token_accuracy": 0.6769340887665749, | |
| "num_tokens": 5869367.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 1.4619380568154157, | |
| "epoch": 1.6446235065295916, | |
| "grad_norm": 1.1511582136154175, | |
| "learning_rate": 0.0002532346632973818, | |
| "loss": 1.4542, | |
| "mean_token_accuracy": 0.6922258980572223, | |
| "num_tokens": 6032042.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 1.3948858159594237, | |
| "epoch": 1.6890803000833565, | |
| "grad_norm": 1.3182344436645508, | |
| "learning_rate": 0.00024029817615156491, | |
| "loss": 1.3733, | |
| "mean_token_accuracy": 0.7100105246528983, | |
| "num_tokens": 6194524.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 1.502246926818043, | |
| "epoch": 1.7335370936371213, | |
| "grad_norm": 1.149990200996399, | |
| "learning_rate": 0.0002273876713780292, | |
| "loss": 1.5031, | |
| "mean_token_accuracy": 0.6846974194049835, | |
| "num_tokens": 6358008.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 1.4208066834136843, | |
| "epoch": 1.7779938871908865, | |
| "grad_norm": 1.1152338981628418, | |
| "learning_rate": 0.00021453772449026595, | |
| "loss": 1.4044, | |
| "mean_token_accuracy": 0.7048833057284355, | |
| "num_tokens": 6519678.0, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 1.4979191770311444, | |
| "epoch": 1.8224506807446512, | |
| "grad_norm": 1.0883276462554932, | |
| "learning_rate": 0.00020178274882220573, | |
| "loss": 1.4849, | |
| "mean_token_accuracy": 0.6878569139167666, | |
| "num_tokens": 6683001.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 1.4353665138129146, | |
| "epoch": 1.8669074742984162, | |
| "grad_norm": 0.9934560656547546, | |
| "learning_rate": 0.00018915690336615977, | |
| "loss": 1.4027, | |
| "mean_token_accuracy": 0.7004664979875088, | |
| "num_tokens": 6846358.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 1.3516410222277044, | |
| "epoch": 1.9113642678521812, | |
| "grad_norm": 1.3949309587478638, | |
| "learning_rate": 0.00017669400129191205, | |
| "loss": 1.3303, | |
| "mean_token_accuracy": 0.7134283676743507, | |
| "num_tokens": 7009309.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 1.303240181831643, | |
| "epoch": 1.9558210614059461, | |
| "grad_norm": 0.9951959252357483, | |
| "learning_rate": 0.00016442741939195614, | |
| "loss": 1.2784, | |
| "mean_token_accuracy": 0.7300505785271525, | |
| "num_tokens": 7172608.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 1.3642704431084718, | |
| "epoch": 2.0, | |
| "grad_norm": 1.0989266633987427, | |
| "learning_rate": 0.0001523900086953891, | |
| "loss": 1.3297, | |
| "mean_token_accuracy": 0.7180480212910371, | |
| "num_tokens": 7334830.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 1.312969586532563, | |
| "epoch": 2.0444567935537648, | |
| "grad_norm": 1.2978543043136597, | |
| "learning_rate": 0.0001406140064898494, | |
| "loss": 1.2625, | |
| "mean_token_accuracy": 0.7279225319623948, | |
| "num_tokens": 7497148.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 1.2975676921661943, | |
| "epoch": 2.08891358710753, | |
| "grad_norm": 1.0483146905899048, | |
| "learning_rate": 0.00012913094998710944, | |
| "loss": 1.2614, | |
| "mean_token_accuracy": 0.7284373817965388, | |
| "num_tokens": 7660378.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 1.2763791465200485, | |
| "epoch": 2.1333703806612947, | |
| "grad_norm": 1.0130515098571777, | |
| "learning_rate": 0.00011797159186353621, | |
| "loss": 1.2505, | |
| "mean_token_accuracy": 0.733047577738762, | |
| "num_tokens": 7823678.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 1.2947511278558523, | |
| "epoch": 2.17782717421506, | |
| "grad_norm": 1.1858347654342651, | |
| "learning_rate": 0.0001071658179016087, | |
| "loss": 1.2613, | |
| "mean_token_accuracy": 0.729880428686738, | |
| "num_tokens": 7987185.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 1.2410555441398174, | |
| "epoch": 2.2222839677688246, | |
| "grad_norm": 1.012299656867981, | |
| "learning_rate": 9.674256695305733e-05, | |
| "loss": 1.1959, | |
| "mean_token_accuracy": 0.7429393958300352, | |
| "num_tokens": 8150314.0, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 1.2186435043811799, | |
| "epoch": 2.2667407613225894, | |
| "grad_norm": 1.2160053253173828, | |
| "learning_rate": 8.67297534379701e-05, | |
| "loss": 1.1706, | |
| "mean_token_accuracy": 0.7483966367319226, | |
| "num_tokens": 8313597.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 1.2957685018423946, | |
| "epoch": 2.3111975548763546, | |
| "grad_norm": 1.0054808855056763, | |
| "learning_rate": 7.715419258742051e-05, | |
| "loss": 1.2519, | |
| "mean_token_accuracy": 0.7310865389183163, | |
| "num_tokens": 8476942.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 1.1714724350254984, | |
| "epoch": 2.3556543484301193, | |
| "grad_norm": 0.9889060854911804, | |
| "learning_rate": 6.804152862982438e-05, | |
| "loss": 1.1277, | |
| "mean_token_accuracy": 0.7567101055756211, | |
| "num_tokens": 8640389.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 1.2453054987825454, | |
| "epoch": 2.4001111419838845, | |
| "grad_norm": 1.033257246017456, | |
| "learning_rate": 5.941616611334891e-05, | |
| "loss": 1.2081, | |
| "mean_token_accuracy": 0.7403238713741302, | |
| "num_tokens": 8803672.0, | |
| "step": 540 | |
| }, | |
| { | |
| "entropy": 1.1717943134717643, | |
| "epoch": 2.4445679355376493, | |
| "grad_norm": 1.0136687755584717, | |
| "learning_rate": 5.130120454829887e-05, | |
| "loss": 1.1236, | |
| "mean_token_accuracy": 0.7593533847481012, | |
| "num_tokens": 8965555.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 1.165727266203612, | |
| "epoch": 2.4890247290914145, | |
| "grad_norm": 0.9948190450668335, | |
| "learning_rate": 4.3718376544513937e-05, | |
| "loss": 1.1158, | |
| "mean_token_accuracy": 0.760404358804226, | |
| "num_tokens": 9129027.0, | |
| "step": 560 | |
| }, | |
| { | |
| "entropy": 1.2672561651095748, | |
| "epoch": 2.5334815226451792, | |
| "grad_norm": 1.116560697555542, | |
| "learning_rate": 3.6687989609449965e-05, | |
| "loss": 1.229, | |
| "mean_token_accuracy": 0.7357183141633868, | |
| "num_tokens": 9291052.0, | |
| "step": 570 | |
| }, | |
| { | |
| "entropy": 1.2553348348941653, | |
| "epoch": 2.577938316198944, | |
| "grad_norm": 1.05418062210083, | |
| "learning_rate": 3.022887176281547e-05, | |
| "loss": 1.2094, | |
| "mean_token_accuracy": 0.740975316427648, | |
| "num_tokens": 9454236.0, | |
| "step": 580 | |
| }, | |
| { | |
| "entropy": 1.232571629062295, | |
| "epoch": 2.622395109752709, | |
| "grad_norm": 1.0469932556152344, | |
| "learning_rate": 2.435832111341152e-05, | |
| "loss": 1.1916, | |
| "mean_token_accuracy": 0.7476834831759334, | |
| "num_tokens": 9616388.0, | |
| "step": 590 | |
| }, | |
| { | |
| "entropy": 1.2260436179116367, | |
| "epoch": 2.666851903306474, | |
| "grad_norm": 1.0891305208206177, | |
| "learning_rate": 1.909205953321294e-05, | |
| "loss": 1.1714, | |
| "mean_token_accuracy": 0.7486020162701607, | |
| "num_tokens": 9779866.0, | |
| "step": 600 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 675, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.1942125322051704e+18, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |