| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 1245, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.060350030175015085, | |
| "grad_norm": 0.25875431299209595, | |
| "learning_rate": 0.00013636363636363637, | |
| "loss": 1.8139, | |
| "mean_token_accuracy": 0.6307312214374542, | |
| "num_tokens": 156890.0, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.12070006035003017, | |
| "grad_norm": 0.2245057374238968, | |
| "learning_rate": 0.0002784090909090909, | |
| "loss": 0.8686, | |
| "mean_token_accuracy": 0.7807172381877899, | |
| "num_tokens": 283625.0, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.18105009052504525, | |
| "grad_norm": 0.18950718641281128, | |
| "learning_rate": 0.0004204545454545455, | |
| "loss": 0.6245, | |
| "mean_token_accuracy": 0.8282737296819687, | |
| "num_tokens": 439023.0, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.24140012070006034, | |
| "grad_norm": 0.24096715450286865, | |
| "learning_rate": 0.0004999811888222022, | |
| "loss": 0.4905, | |
| "mean_token_accuracy": 0.8619039803743362, | |
| "num_tokens": 563539.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.30175015087507545, | |
| "grad_norm": 0.17966966331005096, | |
| "learning_rate": 0.0004997985428296869, | |
| "loss": 0.3821, | |
| "mean_token_accuracy": 0.8908107584714889, | |
| "num_tokens": 722861.0, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.3621001810500905, | |
| "grad_norm": 0.21009193360805511, | |
| "learning_rate": 0.0004994217332277896, | |
| "loss": 0.3269, | |
| "mean_token_accuracy": 0.9061362200975418, | |
| "num_tokens": 850582.0, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.4224502112251056, | |
| "grad_norm": 0.23502632975578308, | |
| "learning_rate": 0.0004988510529033761, | |
| "loss": 0.2387, | |
| "mean_token_accuracy": 0.9319302082061768, | |
| "num_tokens": 1009486.0, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.4828002414001207, | |
| "grad_norm": 0.2148028463125229, | |
| "learning_rate": 0.0004980869454353018, | |
| "loss": 0.2076, | |
| "mean_token_accuracy": 0.9411827009916306, | |
| "num_tokens": 1137451.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.5431502715751357, | |
| "grad_norm": 0.1632697582244873, | |
| "learning_rate": 0.0004971300047496247, | |
| "loss": 0.1619, | |
| "mean_token_accuracy": 0.9540650862455368, | |
| "num_tokens": 1293610.0, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.6035003017501509, | |
| "grad_norm": 0.26104432344436646, | |
| "learning_rate": 0.0004959809746579596, | |
| "loss": 0.164, | |
| "mean_token_accuracy": 0.9534644478559494, | |
| "num_tokens": 1420307.0, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.663850331925166, | |
| "grad_norm": 0.12273592501878738, | |
| "learning_rate": 0.0004946407482793272, | |
| "loss": 0.1295, | |
| "mean_token_accuracy": 0.9624715512990951, | |
| "num_tokens": 1578380.0, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.724200362100181, | |
| "grad_norm": 0.32087773084640503, | |
| "learning_rate": 0.0004931103673459494, | |
| "loss": 0.1113, | |
| "mean_token_accuracy": 0.9689557248353958, | |
| "num_tokens": 1706886.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.7845503922751962, | |
| "grad_norm": 0.1366167813539505, | |
| "learning_rate": 0.0004913910213935311, | |
| "loss": 0.1113, | |
| "mean_token_accuracy": 0.9688878679275512, | |
| "num_tokens": 1866557.0, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.8449004224502112, | |
| "grad_norm": 0.24156679213047028, | |
| "learning_rate": 0.000489484046836657, | |
| "loss": 0.1045, | |
| "mean_token_accuracy": 0.9704328417778015, | |
| "num_tokens": 1992314.0, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.9052504526252263, | |
| "grad_norm": 0.14394783973693848, | |
| "learning_rate": 0.0004873909259300225, | |
| "loss": 0.0948, | |
| "mean_token_accuracy": 0.9738239508867264, | |
| "num_tokens": 2149150.0, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.9656004828002414, | |
| "grad_norm": 0.22457388043403625, | |
| "learning_rate": 0.0004851132856163051, | |
| "loss": 0.0826, | |
| "mean_token_accuracy": 0.9772097253799439, | |
| "num_tokens": 2275425.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.09257517755031586, | |
| "eval_mean_token_accuracy": 0.9736895409790245, | |
| "eval_num_tokens": 2354180.0, | |
| "eval_runtime": 15.7044, | |
| "eval_samples_per_second": 23.497, | |
| "eval_steps_per_second": 11.78, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 1.024140012070006, | |
| "grad_norm": 0.15583111345767975, | |
| "learning_rate": 0.0004826528962615731, | |
| "loss": 0.0995, | |
| "mean_token_accuracy": 0.9726028952401938, | |
| "num_tokens": 2422079.0, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 1.0844900422450212, | |
| "grad_norm": 0.11416012048721313, | |
| "learning_rate": 0.0004800116702792146, | |
| "loss": 0.0592, | |
| "mean_token_accuracy": 0.9830048185586929, | |
| "num_tokens": 2563265.0, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.1448400724200363, | |
| "grad_norm": 0.0843014195561409, | |
| "learning_rate": 0.00047719166064345484, | |
| "loss": 0.0773, | |
| "mean_token_accuracy": 0.977996414899826, | |
| "num_tokens": 2706077.0, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 1.2051901025950513, | |
| "grad_norm": 0.11646401882171631, | |
| "learning_rate": 0.0004741950592936188, | |
| "loss": 0.0494, | |
| "mean_token_accuracy": 0.9854312521219254, | |
| "num_tokens": 2847990.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.2655401327700664, | |
| "grad_norm": 0.08569753915071487, | |
| "learning_rate": 0.00047102419543037903, | |
| "loss": 0.0834, | |
| "mean_token_accuracy": 0.9780280828475952, | |
| "num_tokens": 2989791.0, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 1.3258901629450814, | |
| "grad_norm": 0.10648876428604126, | |
| "learning_rate": 0.00046768153370531276, | |
| "loss": 0.0572, | |
| "mean_token_accuracy": 0.9838361299037933, | |
| "num_tokens": 3132225.0, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.3862401931200965, | |
| "grad_norm": 0.07524841278791428, | |
| "learning_rate": 0.00046416967230517524, | |
| "loss": 0.0674, | |
| "mean_token_accuracy": 0.9807477170228958, | |
| "num_tokens": 3272843.0, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 1.4465902232951118, | |
| "grad_norm": 0.07833650708198547, | |
| "learning_rate": 0.00046049134093237943, | |
| "loss": 0.0472, | |
| "mean_token_accuracy": 0.9861810153722763, | |
| "num_tokens": 3417150.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.5069402534701268, | |
| "grad_norm": 0.06221695989370346, | |
| "learning_rate": 0.0004566493986832504, | |
| "loss": 0.0606, | |
| "mean_token_accuracy": 0.9831046575307846, | |
| "num_tokens": 3558624.0, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 1.567290283645142, | |
| "grad_norm": 0.1267794370651245, | |
| "learning_rate": 0.0004526468318257052, | |
| "loss": 0.0452, | |
| "mean_token_accuracy": 0.9872009134292603, | |
| "num_tokens": 3701282.0, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.627640313820157, | |
| "grad_norm": 0.08861544728279114, | |
| "learning_rate": 0.0004484867514780834, | |
| "loss": 0.0608, | |
| "mean_token_accuracy": 0.9829268258810043, | |
| "num_tokens": 3846134.0, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 1.687990343995172, | |
| "grad_norm": 0.12759177386760712, | |
| "learning_rate": 0.0004441723911909354, | |
| "loss": 0.0434, | |
| "mean_token_accuracy": 0.9877904134988785, | |
| "num_tokens": 3989906.0, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.748340374170187, | |
| "grad_norm": 0.08757840096950531, | |
| "learning_rate": 0.00043970710443364506, | |
| "loss": 0.055, | |
| "mean_token_accuracy": 0.9843549174070358, | |
| "num_tokens": 4130356.0, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 1.8086904043452021, | |
| "grad_norm": 0.0843178927898407, | |
| "learning_rate": 0.0004350943619878427, | |
| "loss": 0.0433, | |
| "mean_token_accuracy": 0.9873012232780457, | |
| "num_tokens": 4272859.0, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.8690404345202172, | |
| "grad_norm": 0.07654984295368195, | |
| "learning_rate": 0.00043033774924963297, | |
| "loss": 0.0586, | |
| "mean_token_accuracy": 0.9836700081825256, | |
| "num_tokens": 4414328.0, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 1.9293904646952322, | |
| "grad_norm": 0.08279535174369812, | |
| "learning_rate": 0.00042544096344273566, | |
| "loss": 0.0406, | |
| "mean_token_accuracy": 0.9882488793134689, | |
| "num_tokens": 4555974.0, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.9897404948702473, | |
| "grad_norm": 0.09290555119514465, | |
| "learning_rate": 0.00042040781074470415, | |
| "loss": 0.0464, | |
| "mean_token_accuracy": 0.9870415306091309, | |
| "num_tokens": 4689001.0, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 0.05456389859318733, | |
| "eval_mean_token_accuracy": 0.9851698475915033, | |
| "eval_num_tokens": 4708360.0, | |
| "eval_runtime": 15.7238, | |
| "eval_samples_per_second": 23.468, | |
| "eval_steps_per_second": 11.766, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 2.048280024140012, | |
| "grad_norm": 0.05181082338094711, | |
| "learning_rate": 0.0004152422033284574, | |
| "loss": 0.0445, | |
| "mean_token_accuracy": 0.9867442065907508, | |
| "num_tokens": 4838142.0, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 2.1086300543150274, | |
| "grad_norm": 0.05355146527290344, | |
| "learning_rate": 0.0004099481563214226, | |
| "loss": 0.0314, | |
| "mean_token_accuracy": 0.990835223197937, | |
| "num_tokens": 4971140.0, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 2.1689800844900424, | |
| "grad_norm": 0.04749223589897156, | |
| "learning_rate": 0.00040452978468465383, | |
| "loss": 0.0446, | |
| "mean_token_accuracy": 0.9865801340341568, | |
| "num_tokens": 5122361.0, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 2.2293301146650575, | |
| "grad_norm": 0.06724860519170761, | |
| "learning_rate": 0.00039899130001435203, | |
| "loss": 0.0321, | |
| "mean_token_accuracy": 0.9902541941404343, | |
| "num_tokens": 5255402.0, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 2.2896801448400725, | |
| "grad_norm": 0.07366824150085449, | |
| "learning_rate": 0.000393337007268272, | |
| "loss": 0.0414, | |
| "mean_token_accuracy": 0.9877721995115281, | |
| "num_tokens": 5406401.0, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 2.3500301750150876, | |
| "grad_norm": 0.04158030077815056, | |
| "learning_rate": 0.0003875713014195614, | |
| "loss": 0.0304, | |
| "mean_token_accuracy": 0.9910214012861251, | |
| "num_tokens": 5538594.0, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 2.4103802051901027, | |
| "grad_norm": 0.05240670219063759, | |
| "learning_rate": 0.0003816986640406329, | |
| "loss": 0.0474, | |
| "mean_token_accuracy": 0.9857887053489685, | |
| "num_tokens": 5690102.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 2.4707302353651177, | |
| "grad_norm": 0.077730692923069, | |
| "learning_rate": 0.00037572365981972333, | |
| "loss": 0.0334, | |
| "mean_token_accuracy": 0.9903035759925842, | |
| "num_tokens": 5823065.0, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 2.5310802655401328, | |
| "grad_norm": 0.04822751507163048, | |
| "learning_rate": 0.00036965093301284994, | |
| "loss": 0.0449, | |
| "mean_token_accuracy": 0.9864302569627762, | |
| "num_tokens": 5974567.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 2.591430295715148, | |
| "grad_norm": 0.07845977693796158, | |
| "learning_rate": 0.00036348520383391885, | |
| "loss": 0.031, | |
| "mean_token_accuracy": 0.9906397736072541, | |
| "num_tokens": 6107171.0, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 2.651780325890163, | |
| "grad_norm": 0.04880121350288391, | |
| "learning_rate": 0.00035723126478579383, | |
| "loss": 0.0432, | |
| "mean_token_accuracy": 0.9871418106555939, | |
| "num_tokens": 6257293.0, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 2.712130356065178, | |
| "grad_norm": 0.05152444541454315, | |
| "learning_rate": 0.00035089397693517546, | |
| "loss": 0.0308, | |
| "mean_token_accuracy": 0.9905445170402527, | |
| "num_tokens": 6389689.0, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 2.772480386240193, | |
| "grad_norm": 0.04494641348719597, | |
| "learning_rate": 0.00034447826613418793, | |
| "loss": 0.0418, | |
| "mean_token_accuracy": 0.9878531390428543, | |
| "num_tokens": 6541638.0, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 2.832830416415208, | |
| "grad_norm": 0.07286397367715836, | |
| "learning_rate": 0.0003379891191916081, | |
| "loss": 0.0299, | |
| "mean_token_accuracy": 0.9908888912200928, | |
| "num_tokens": 6674920.0, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 2.8931804465902236, | |
| "grad_norm": 0.04308425635099411, | |
| "learning_rate": 0.0003314315799967154, | |
| "loss": 0.0438, | |
| "mean_token_accuracy": 0.9870698708295822, | |
| "num_tokens": 6827190.0, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 2.9535304767652386, | |
| "grad_norm": 0.0426165796816349, | |
| "learning_rate": 0.00032481074559877334, | |
| "loss": 0.0287, | |
| "mean_token_accuracy": 0.9914436733722687, | |
| "num_tokens": 6960777.0, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 0.04540110006928444, | |
| "eval_mean_token_accuracy": 0.9879436412373105, | |
| "eval_num_tokens": 7062540.0, | |
| "eval_runtime": 15.6959, | |
| "eval_samples_per_second": 23.509, | |
| "eval_steps_per_second": 11.787, | |
| "step": 1245 | |
| } | |
| ], | |
| "logging_steps": 25, | |
| "max_steps": 2905, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 7, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.06824728551936e+17, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |