{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 834,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03597122302158273,
      "grad_norm": 26.93948859971876,
      "learning_rate": 5e-06,
      "loss": 1.0175,
      "step": 10
    },
    {
      "epoch": 0.07194244604316546,
      "grad_norm": 2.6743423167480063,
      "learning_rate": 5e-06,
      "loss": 0.9337,
      "step": 20
    },
    {
      "epoch": 0.1079136690647482,
      "grad_norm": 1.100805597904231,
      "learning_rate": 5e-06,
      "loss": 0.8923,
      "step": 30
    },
    {
      "epoch": 0.14388489208633093,
      "grad_norm": 0.8355539701078896,
      "learning_rate": 5e-06,
      "loss": 0.8673,
      "step": 40
    },
    {
      "epoch": 0.17985611510791366,
      "grad_norm": 0.7157506047100403,
      "learning_rate": 5e-06,
      "loss": 0.8553,
      "step": 50
    },
    {
      "epoch": 0.2158273381294964,
      "grad_norm": 0.9806631521043339,
      "learning_rate": 5e-06,
      "loss": 0.8492,
      "step": 60
    },
    {
      "epoch": 0.2517985611510791,
      "grad_norm": 0.8360835611944488,
      "learning_rate": 5e-06,
      "loss": 0.8382,
      "step": 70
    },
    {
      "epoch": 0.28776978417266186,
      "grad_norm": 0.7078472519601653,
      "learning_rate": 5e-06,
      "loss": 0.8318,
      "step": 80
    },
    {
      "epoch": 0.3237410071942446,
      "grad_norm": 0.6255785562847258,
      "learning_rate": 5e-06,
      "loss": 0.825,
      "step": 90
    },
    {
      "epoch": 0.3597122302158273,
      "grad_norm": 0.6950072028339258,
      "learning_rate": 5e-06,
      "loss": 0.8225,
      "step": 100
    },
    {
      "epoch": 0.39568345323741005,
      "grad_norm": 0.622757689781733,
      "learning_rate": 5e-06,
      "loss": 0.8165,
      "step": 110
    },
    {
      "epoch": 0.4316546762589928,
      "grad_norm": 0.6855173384055511,
      "learning_rate": 5e-06,
      "loss": 0.8162,
      "step": 120
    },
    {
      "epoch": 0.4676258992805755,
      "grad_norm": 0.555459004966806,
      "learning_rate": 5e-06,
      "loss": 0.8141,
      "step": 130
    },
    {
      "epoch": 0.5035971223021583,
      "grad_norm": 0.7189252900166325,
      "learning_rate": 5e-06,
      "loss": 0.8113,
      "step": 140
    },
    {
      "epoch": 0.539568345323741,
      "grad_norm": 0.8411135438726722,
      "learning_rate": 5e-06,
      "loss": 0.8069,
      "step": 150
    },
    {
      "epoch": 0.5755395683453237,
      "grad_norm": 0.9141854769887011,
      "learning_rate": 5e-06,
      "loss": 0.8087,
      "step": 160
    },
    {
      "epoch": 0.6115107913669064,
      "grad_norm": 0.6527584548807389,
      "learning_rate": 5e-06,
      "loss": 0.8048,
      "step": 170
    },
    {
      "epoch": 0.6474820143884892,
      "grad_norm": 0.6986581112545092,
      "learning_rate": 5e-06,
      "loss": 0.8051,
      "step": 180
    },
    {
      "epoch": 0.6834532374100719,
      "grad_norm": 0.6094857952430536,
      "learning_rate": 5e-06,
      "loss": 0.8044,
      "step": 190
    },
    {
      "epoch": 0.7194244604316546,
      "grad_norm": 0.74096920276776,
      "learning_rate": 5e-06,
      "loss": 0.7989,
      "step": 200
    },
    {
      "epoch": 0.7553956834532374,
      "grad_norm": 0.6584952886572538,
      "learning_rate": 5e-06,
      "loss": 0.8025,
      "step": 210
    },
    {
      "epoch": 0.7913669064748201,
      "grad_norm": 0.5838446606699556,
      "learning_rate": 5e-06,
      "loss": 0.7988,
      "step": 220
    },
    {
      "epoch": 0.8273381294964028,
      "grad_norm": 0.5916175411049406,
      "learning_rate": 5e-06,
      "loss": 0.7985,
      "step": 230
    },
    {
      "epoch": 0.8633093525179856,
      "grad_norm": 0.626471567693148,
      "learning_rate": 5e-06,
      "loss": 0.7973,
      "step": 240
    },
    {
      "epoch": 0.8992805755395683,
      "grad_norm": 0.6338741269795162,
      "learning_rate": 5e-06,
      "loss": 0.7933,
      "step": 250
    },
    {
      "epoch": 0.935251798561151,
      "grad_norm": 0.8343555675066444,
      "learning_rate": 5e-06,
      "loss": 0.7969,
      "step": 260
    },
    {
      "epoch": 0.9712230215827338,
      "grad_norm": 0.6221641429373133,
      "learning_rate": 5e-06,
      "loss": 0.7933,
      "step": 270
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.7923575043678284,
      "eval_runtime": 27.9533,
      "eval_samples_per_second": 267.732,
      "eval_steps_per_second": 1.073,
      "step": 278
    },
    {
      "epoch": 1.0071942446043165,
      "grad_norm": 0.8944971285319924,
      "learning_rate": 5e-06,
      "loss": 0.7823,
      "step": 280
    },
    {
      "epoch": 1.0431654676258992,
      "grad_norm": 0.7668083853056575,
      "learning_rate": 5e-06,
      "loss": 0.7574,
      "step": 290
    },
    {
      "epoch": 1.079136690647482,
      "grad_norm": 0.6176816592509634,
      "learning_rate": 5e-06,
      "loss": 0.7529,
      "step": 300
    },
    {
      "epoch": 1.1151079136690647,
      "grad_norm": 0.6475301176330789,
      "learning_rate": 5e-06,
      "loss": 0.7558,
      "step": 310
    },
    {
      "epoch": 1.1510791366906474,
      "grad_norm": 0.5811910989874788,
      "learning_rate": 5e-06,
      "loss": 0.7623,
      "step": 320
    },
    {
      "epoch": 1.1870503597122302,
      "grad_norm": 0.6269454462814978,
      "learning_rate": 5e-06,
      "loss": 0.7601,
      "step": 330
    },
    {
      "epoch": 1.223021582733813,
      "grad_norm": 0.5423886247053047,
      "learning_rate": 5e-06,
      "loss": 0.7535,
      "step": 340
    },
    {
      "epoch": 1.2589928057553956,
      "grad_norm": 0.6670401432003603,
      "learning_rate": 5e-06,
      "loss": 0.757,
      "step": 350
    },
    {
      "epoch": 1.2949640287769784,
      "grad_norm": 0.7095322132659916,
      "learning_rate": 5e-06,
      "loss": 0.759,
      "step": 360
    },
    {
      "epoch": 1.330935251798561,
      "grad_norm": 0.6870367808903867,
      "learning_rate": 5e-06,
      "loss": 0.7567,
      "step": 370
    },
    {
      "epoch": 1.3669064748201438,
      "grad_norm": 0.6640094117573664,
      "learning_rate": 5e-06,
      "loss": 0.7592,
      "step": 380
    },
    {
      "epoch": 1.4028776978417266,
      "grad_norm": 0.5994950619117767,
      "learning_rate": 5e-06,
      "loss": 0.7529,
      "step": 390
    },
    {
      "epoch": 1.4388489208633093,
      "grad_norm": 0.7392872817621052,
      "learning_rate": 5e-06,
      "loss": 0.7554,
      "step": 400
    },
    {
      "epoch": 1.474820143884892,
      "grad_norm": 0.5656749568866071,
      "learning_rate": 5e-06,
      "loss": 0.7547,
      "step": 410
    },
    {
      "epoch": 1.5107913669064748,
      "grad_norm": 0.921484641426356,
      "learning_rate": 5e-06,
      "loss": 0.7532,
      "step": 420
    },
    {
      "epoch": 1.5467625899280577,
      "grad_norm": 0.540059029380678,
      "learning_rate": 5e-06,
      "loss": 0.7585,
      "step": 430
    },
    {
      "epoch": 1.5827338129496402,
      "grad_norm": 0.6558652758296812,
      "learning_rate": 5e-06,
      "loss": 0.7515,
      "step": 440
    },
    {
      "epoch": 1.6187050359712232,
      "grad_norm": 0.57268163367781,
      "learning_rate": 5e-06,
      "loss": 0.7562,
      "step": 450
    },
    {
      "epoch": 1.6546762589928057,
      "grad_norm": 0.5407189047091853,
      "learning_rate": 5e-06,
      "loss": 0.7559,
      "step": 460
    },
    {
      "epoch": 1.6906474820143886,
      "grad_norm": 0.6077940984618293,
      "learning_rate": 5e-06,
      "loss": 0.757,
      "step": 470
    },
    {
      "epoch": 1.7266187050359711,
      "grad_norm": 1.001124812241379,
      "learning_rate": 5e-06,
      "loss": 0.7552,
      "step": 480
    },
    {
      "epoch": 1.762589928057554,
      "grad_norm": 0.6254013722291123,
      "learning_rate": 5e-06,
      "loss": 0.753,
      "step": 490
    },
    {
      "epoch": 1.7985611510791366,
      "grad_norm": 0.5767617312575639,
      "learning_rate": 5e-06,
      "loss": 0.7594,
      "step": 500
    },
    {
      "epoch": 1.8345323741007196,
      "grad_norm": 0.665915353902276,
      "learning_rate": 5e-06,
      "loss": 0.7554,
      "step": 510
    },
    {
      "epoch": 1.870503597122302,
      "grad_norm": 0.5596777388150926,
      "learning_rate": 5e-06,
      "loss": 0.7537,
      "step": 520
    },
    {
      "epoch": 1.906474820143885,
      "grad_norm": 0.5547398560915929,
      "learning_rate": 5e-06,
      "loss": 0.7555,
      "step": 530
    },
    {
      "epoch": 1.9424460431654675,
      "grad_norm": 0.5874602156110944,
      "learning_rate": 5e-06,
      "loss": 0.7509,
      "step": 540
    },
    {
      "epoch": 1.9784172661870505,
      "grad_norm": 0.6369533697170318,
      "learning_rate": 5e-06,
      "loss": 0.7503,
      "step": 550
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.7788412570953369,
      "eval_runtime": 27.8988,
      "eval_samples_per_second": 268.255,
      "eval_steps_per_second": 1.075,
      "step": 556
    },
    {
      "epoch": 2.014388489208633,
      "grad_norm": 1.0929207520027995,
      "learning_rate": 5e-06,
      "loss": 0.735,
      "step": 560
    },
    {
      "epoch": 2.050359712230216,
      "grad_norm": 0.687310495052166,
      "learning_rate": 5e-06,
      "loss": 0.7131,
      "step": 570
    },
    {
      "epoch": 2.0863309352517985,
      "grad_norm": 0.6848749958758751,
      "learning_rate": 5e-06,
      "loss": 0.7129,
      "step": 580
    },
    {
      "epoch": 2.1223021582733814,
      "grad_norm": 0.9700661070159223,
      "learning_rate": 5e-06,
      "loss": 0.7154,
      "step": 590
    },
    {
      "epoch": 2.158273381294964,
      "grad_norm": 0.7429316335562708,
      "learning_rate": 5e-06,
      "loss": 0.7163,
      "step": 600
    },
    {
      "epoch": 2.194244604316547,
      "grad_norm": 0.5731198010767242,
      "learning_rate": 5e-06,
      "loss": 0.7197,
      "step": 610
    },
    {
      "epoch": 2.2302158273381294,
      "grad_norm": 0.6519774548706885,
      "learning_rate": 5e-06,
      "loss": 0.7192,
      "step": 620
    },
    {
      "epoch": 2.2661870503597124,
      "grad_norm": 0.7092939571259266,
      "learning_rate": 5e-06,
      "loss": 0.717,
      "step": 630
    },
    {
      "epoch": 2.302158273381295,
      "grad_norm": 0.8300683342338049,
      "learning_rate": 5e-06,
      "loss": 0.7171,
      "step": 640
    },
    {
      "epoch": 2.338129496402878,
      "grad_norm": 0.6364079517115279,
      "learning_rate": 5e-06,
      "loss": 0.7179,
      "step": 650
    },
    {
      "epoch": 2.3741007194244603,
      "grad_norm": 0.6830216482631195,
      "learning_rate": 5e-06,
      "loss": 0.7208,
      "step": 660
    },
    {
      "epoch": 2.4100719424460433,
      "grad_norm": 0.580810416113199,
      "learning_rate": 5e-06,
      "loss": 0.7201,
      "step": 670
    },
    {
      "epoch": 2.446043165467626,
      "grad_norm": 0.7709663647446697,
      "learning_rate": 5e-06,
      "loss": 0.7165,
      "step": 680
    },
    {
      "epoch": 2.4820143884892087,
      "grad_norm": 0.6587806242655105,
      "learning_rate": 5e-06,
      "loss": 0.7199,
      "step": 690
    },
    {
      "epoch": 2.5179856115107913,
      "grad_norm": 0.6679031168226195,
      "learning_rate": 5e-06,
      "loss": 0.7228,
      "step": 700
    },
    {
      "epoch": 2.553956834532374,
      "grad_norm": 0.5802019851320436,
      "learning_rate": 5e-06,
      "loss": 0.7211,
      "step": 710
    },
    {
      "epoch": 2.5899280575539567,
      "grad_norm": 0.633360775543426,
      "learning_rate": 5e-06,
      "loss": 0.7192,
      "step": 720
    },
    {
      "epoch": 2.6258992805755397,
      "grad_norm": 0.7014721250700231,
      "learning_rate": 5e-06,
      "loss": 0.7208,
      "step": 730
    },
    {
      "epoch": 2.661870503597122,
      "grad_norm": 0.5972726636881343,
      "learning_rate": 5e-06,
      "loss": 0.7184,
      "step": 740
    },
    {
      "epoch": 2.697841726618705,
      "grad_norm": 0.5454556975289979,
      "learning_rate": 5e-06,
      "loss": 0.7139,
      "step": 750
    },
    {
      "epoch": 2.7338129496402876,
      "grad_norm": 0.5626224999737693,
      "learning_rate": 5e-06,
      "loss": 0.7207,
      "step": 760
    },
    {
      "epoch": 2.7697841726618706,
      "grad_norm": 0.5106193565014756,
      "learning_rate": 5e-06,
      "loss": 0.7193,
      "step": 770
    },
    {
      "epoch": 2.805755395683453,
      "grad_norm": 0.6138738602878809,
      "learning_rate": 5e-06,
      "loss": 0.7185,
      "step": 780
    },
    {
      "epoch": 2.841726618705036,
      "grad_norm": 0.6093685279993987,
      "learning_rate": 5e-06,
      "loss": 0.7217,
      "step": 790
    },
    {
      "epoch": 2.8776978417266186,
      "grad_norm": 0.5564883285882788,
      "learning_rate": 5e-06,
      "loss": 0.7213,
      "step": 800
    },
    {
      "epoch": 2.9136690647482015,
      "grad_norm": 0.5906548449538034,
      "learning_rate": 5e-06,
      "loss": 0.7183,
      "step": 810
    },
    {
      "epoch": 2.949640287769784,
      "grad_norm": 0.5460219561244413,
      "learning_rate": 5e-06,
      "loss": 0.7216,
      "step": 820
    },
    {
      "epoch": 2.985611510791367,
      "grad_norm": 0.6453368774762195,
      "learning_rate": 5e-06,
      "loss": 0.7198,
      "step": 830
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.7752296328544617,
      "eval_runtime": 27.5746,
      "eval_samples_per_second": 271.409,
      "eval_steps_per_second": 1.088,
      "step": 834
    },
    {
      "epoch": 3.0,
      "step": 834,
      "total_flos": 1396981062696960.0,
      "train_loss": 0.7675551453368555,
      "train_runtime": 5571.5313,
      "train_samples_per_second": 76.563,
      "train_steps_per_second": 0.15
    }
  ],
  "logging_steps": 10,
  "max_steps": 834,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1396981062696960.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}