{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1245, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.060350030175015085, "grad_norm": 0.25875431299209595, "learning_rate": 0.00013636363636363637, "loss": 1.8139, "mean_token_accuracy": 0.6307312214374542, "num_tokens": 156890.0, "step": 25 }, { "epoch": 0.12070006035003017, "grad_norm": 0.2245057374238968, "learning_rate": 0.0002784090909090909, "loss": 0.8686, "mean_token_accuracy": 0.7807172381877899, "num_tokens": 283625.0, "step": 50 }, { "epoch": 0.18105009052504525, "grad_norm": 0.18950718641281128, "learning_rate": 0.0004204545454545455, "loss": 0.6245, "mean_token_accuracy": 0.8282737296819687, "num_tokens": 439023.0, "step": 75 }, { "epoch": 0.24140012070006034, "grad_norm": 0.24096715450286865, "learning_rate": 0.0004999811888222022, "loss": 0.4905, "mean_token_accuracy": 0.8619039803743362, "num_tokens": 563539.0, "step": 100 }, { "epoch": 0.30175015087507545, "grad_norm": 0.17966966331005096, "learning_rate": 0.0004997985428296869, "loss": 0.3821, "mean_token_accuracy": 0.8908107584714889, "num_tokens": 722861.0, "step": 125 }, { "epoch": 0.3621001810500905, "grad_norm": 0.21009193360805511, "learning_rate": 0.0004994217332277896, "loss": 0.3269, "mean_token_accuracy": 0.9061362200975418, "num_tokens": 850582.0, "step": 150 }, { "epoch": 0.4224502112251056, "grad_norm": 0.23502632975578308, "learning_rate": 0.0004988510529033761, "loss": 0.2387, "mean_token_accuracy": 0.9319302082061768, "num_tokens": 1009486.0, "step": 175 }, { "epoch": 0.4828002414001207, "grad_norm": 0.2148028463125229, "learning_rate": 0.0004980869454353018, "loss": 0.2076, "mean_token_accuracy": 0.9411827009916306, "num_tokens": 1137451.0, "step": 200 }, { "epoch": 0.5431502715751357, "grad_norm": 0.1632697582244873, "learning_rate": 0.0004971300047496247, "loss": 0.1619, "mean_token_accuracy": 0.9540650862455368, "num_tokens": 1293610.0, "step": 225 }, { "epoch": 0.6035003017501509, "grad_norm": 0.26104432344436646, "learning_rate": 0.0004959809746579596, "loss": 0.164, "mean_token_accuracy": 0.9534644478559494, "num_tokens": 1420307.0, "step": 250 }, { "epoch": 0.663850331925166, "grad_norm": 0.12273592501878738, "learning_rate": 0.0004946407482793272, "loss": 0.1295, "mean_token_accuracy": 0.9624715512990951, "num_tokens": 1578380.0, "step": 275 }, { "epoch": 0.724200362100181, "grad_norm": 0.32087773084640503, "learning_rate": 0.0004931103673459494, "loss": 0.1113, "mean_token_accuracy": 0.9689557248353958, "num_tokens": 1706886.0, "step": 300 }, { "epoch": 0.7845503922751962, "grad_norm": 0.1366167813539505, "learning_rate": 0.0004913910213935311, "loss": 0.1113, "mean_token_accuracy": 0.9688878679275512, "num_tokens": 1866557.0, "step": 325 }, { "epoch": 0.8449004224502112, "grad_norm": 0.24156679213047028, "learning_rate": 0.000489484046836657, "loss": 0.1045, "mean_token_accuracy": 0.9704328417778015, "num_tokens": 1992314.0, "step": 350 }, { "epoch": 0.9052504526252263, "grad_norm": 0.14394783973693848, "learning_rate": 0.0004873909259300225, "loss": 0.0948, "mean_token_accuracy": 0.9738239508867264, "num_tokens": 2149150.0, "step": 375 }, { "epoch": 0.9656004828002414, "grad_norm": 0.22457388043403625, "learning_rate": 0.0004851132856163051, "loss": 0.0826, "mean_token_accuracy": 0.9772097253799439, "num_tokens": 2275425.0, "step": 400 }, { "epoch": 1.0, "eval_loss": 0.09257517755031586, "eval_mean_token_accuracy": 0.9736895409790245, "eval_num_tokens": 2354180.0, "eval_runtime": 15.7044, "eval_samples_per_second": 23.497, "eval_steps_per_second": 11.78, "step": 415 }, { "epoch": 1.024140012070006, "grad_norm": 0.15583111345767975, "learning_rate": 0.0004826528962615731, "loss": 0.0995, "mean_token_accuracy": 0.9726028952401938, "num_tokens": 2422079.0, "step": 425 }, { "epoch": 1.0844900422450212, "grad_norm": 0.11416012048721313, "learning_rate": 0.0004800116702792146, "loss": 0.0592, "mean_token_accuracy": 0.9830048185586929, "num_tokens": 2563265.0, "step": 450 }, { "epoch": 1.1448400724200363, "grad_norm": 0.0843014195561409, "learning_rate": 0.00047719166064345484, "loss": 0.0773, "mean_token_accuracy": 0.977996414899826, "num_tokens": 2706077.0, "step": 475 }, { "epoch": 1.2051901025950513, "grad_norm": 0.11646401882171631, "learning_rate": 0.0004741950592936188, "loss": 0.0494, "mean_token_accuracy": 0.9854312521219254, "num_tokens": 2847990.0, "step": 500 }, { "epoch": 1.2655401327700664, "grad_norm": 0.08569753915071487, "learning_rate": 0.00047102419543037903, "loss": 0.0834, "mean_token_accuracy": 0.9780280828475952, "num_tokens": 2989791.0, "step": 525 }, { "epoch": 1.3258901629450814, "grad_norm": 0.10648876428604126, "learning_rate": 0.00046768153370531276, "loss": 0.0572, "mean_token_accuracy": 0.9838361299037933, "num_tokens": 3132225.0, "step": 550 }, { "epoch": 1.3862401931200965, "grad_norm": 0.07524841278791428, "learning_rate": 0.00046416967230517524, "loss": 0.0674, "mean_token_accuracy": 0.9807477170228958, "num_tokens": 3272843.0, "step": 575 }, { "epoch": 1.4465902232951118, "grad_norm": 0.07833650708198547, "learning_rate": 0.00046049134093237943, "loss": 0.0472, "mean_token_accuracy": 0.9861810153722763, "num_tokens": 3417150.0, "step": 600 }, { "epoch": 1.5069402534701268, "grad_norm": 0.06221695989370346, "learning_rate": 0.0004566493986832504, "loss": 0.0606, "mean_token_accuracy": 0.9831046575307846, "num_tokens": 3558624.0, "step": 625 }, { "epoch": 1.567290283645142, "grad_norm": 0.1267794370651245, "learning_rate": 0.0004526468318257052, "loss": 0.0452, "mean_token_accuracy": 0.9872009134292603, "num_tokens": 3701282.0, "step": 650 }, { "epoch": 1.627640313820157, "grad_norm": 0.08861544728279114, "learning_rate": 0.0004484867514780834, "loss": 0.0608, "mean_token_accuracy": 0.9829268258810043, "num_tokens": 3846134.0, "step": 675 }, { "epoch": 1.687990343995172, "grad_norm": 0.12759177386760712, "learning_rate": 0.0004441723911909354, "loss": 0.0434, "mean_token_accuracy": 0.9877904134988785, "num_tokens": 3989906.0, "step": 700 }, { "epoch": 1.748340374170187, "grad_norm": 0.08757840096950531, "learning_rate": 0.00043970710443364506, "loss": 0.055, "mean_token_accuracy": 0.9843549174070358, "num_tokens": 4130356.0, "step": 725 }, { "epoch": 1.8086904043452021, "grad_norm": 0.0843178927898407, "learning_rate": 0.0004350943619878427, "loss": 0.0433, "mean_token_accuracy": 0.9873012232780457, "num_tokens": 4272859.0, "step": 750 }, { "epoch": 1.8690404345202172, "grad_norm": 0.07654984295368195, "learning_rate": 0.00043033774924963297, "loss": 0.0586, "mean_token_accuracy": 0.9836700081825256, "num_tokens": 4414328.0, "step": 775 }, { "epoch": 1.9293904646952322, "grad_norm": 0.08279535174369812, "learning_rate": 0.00042544096344273566, "loss": 0.0406, "mean_token_accuracy": 0.9882488793134689, "num_tokens": 4555974.0, "step": 800 }, { "epoch": 1.9897404948702473, "grad_norm": 0.09290555119514465, "learning_rate": 0.00042040781074470415, "loss": 0.0464, "mean_token_accuracy": 0.9870415306091309, "num_tokens": 4689001.0, "step": 825 }, { "epoch": 2.0, "eval_loss": 0.05456389859318733, "eval_mean_token_accuracy": 0.9851698475915033, "eval_num_tokens": 4708360.0, "eval_runtime": 15.7238, "eval_samples_per_second": 23.468, "eval_steps_per_second": 11.766, "step": 830 }, { "epoch": 2.048280024140012, "grad_norm": 0.05181082338094711, "learning_rate": 0.0004152422033284574, "loss": 0.0445, "mean_token_accuracy": 0.9867442065907508, "num_tokens": 4838142.0, "step": 850 }, { "epoch": 2.1086300543150274, "grad_norm": 0.05355146527290344, "learning_rate": 0.0004099481563214226, "loss": 0.0314, "mean_token_accuracy": 0.990835223197937, "num_tokens": 4971140.0, "step": 875 }, { "epoch": 2.1689800844900424, "grad_norm": 0.04749223589897156, "learning_rate": 0.00040452978468465383, "loss": 0.0446, "mean_token_accuracy": 0.9865801340341568, "num_tokens": 5122361.0, "step": 900 }, { "epoch": 2.2293301146650575, "grad_norm": 0.06724860519170761, "learning_rate": 0.00039899130001435203, "loss": 0.0321, "mean_token_accuracy": 0.9902541941404343, "num_tokens": 5255402.0, "step": 925 }, { "epoch": 2.2896801448400725, "grad_norm": 0.07366824150085449, "learning_rate": 0.000393337007268272, "loss": 0.0414, "mean_token_accuracy": 0.9877721995115281, "num_tokens": 5406401.0, "step": 950 }, { "epoch": 2.3500301750150876, "grad_norm": 0.04158030077815056, "learning_rate": 0.0003875713014195614, "loss": 0.0304, "mean_token_accuracy": 0.9910214012861251, "num_tokens": 5538594.0, "step": 975 }, { "epoch": 2.4103802051901027, "grad_norm": 0.05240670219063759, "learning_rate": 0.0003816986640406329, "loss": 0.0474, "mean_token_accuracy": 0.9857887053489685, "num_tokens": 5690102.0, "step": 1000 }, { "epoch": 2.4707302353651177, "grad_norm": 0.077730692923069, "learning_rate": 0.00037572365981972333, "loss": 0.0334, "mean_token_accuracy": 0.9903035759925842, "num_tokens": 5823065.0, "step": 1025 }, { "epoch": 2.5310802655401328, "grad_norm": 0.04822751507163048, "learning_rate": 0.00036965093301284994, "loss": 0.0449, "mean_token_accuracy": 0.9864302569627762, "num_tokens": 5974567.0, "step": 1050 }, { "epoch": 2.591430295715148, "grad_norm": 0.07845977693796158, "learning_rate": 0.00036348520383391885, "loss": 0.031, "mean_token_accuracy": 0.9906397736072541, "num_tokens": 6107171.0, "step": 1075 }, { "epoch": 2.651780325890163, "grad_norm": 0.04880121350288391, "learning_rate": 0.00035723126478579383, "loss": 0.0432, "mean_token_accuracy": 0.9871418106555939, "num_tokens": 6257293.0, "step": 1100 }, { "epoch": 2.712130356065178, "grad_norm": 0.05152444541454315, "learning_rate": 0.00035089397693517546, "loss": 0.0308, "mean_token_accuracy": 0.9905445170402527, "num_tokens": 6389689.0, "step": 1125 }, { "epoch": 2.772480386240193, "grad_norm": 0.04494641348719597, "learning_rate": 0.00034447826613418793, "loss": 0.0418, "mean_token_accuracy": 0.9878531390428543, "num_tokens": 6541638.0, "step": 1150 }, { "epoch": 2.832830416415208, "grad_norm": 0.07286397367715836, "learning_rate": 0.0003379891191916081, "loss": 0.0299, "mean_token_accuracy": 0.9908888912200928, "num_tokens": 6674920.0, "step": 1175 }, { "epoch": 2.8931804465902236, "grad_norm": 0.04308425635099411, "learning_rate": 0.0003314315799967154, "loss": 0.0438, "mean_token_accuracy": 0.9870698708295822, "num_tokens": 6827190.0, "step": 1200 }, { "epoch": 2.9535304767652386, "grad_norm": 0.0426165796816349, "learning_rate": 0.00032481074559877334, "loss": 0.0287, "mean_token_accuracy": 0.9914436733722687, "num_tokens": 6960777.0, "step": 1225 }, { "epoch": 3.0, "eval_loss": 0.04540110006928444, "eval_mean_token_accuracy": 0.9879436412373105, "eval_num_tokens": 7062540.0, "eval_runtime": 15.6959, "eval_samples_per_second": 23.509, "eval_steps_per_second": 11.787, "step": 1245 } ], "logging_steps": 25, "max_steps": 2905, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.06824728551936e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }