| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 1000, |
| "global_step": 19088, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.026194467728415757, |
| "grad_norm": 1.5032150745391846, |
| "learning_rate": 4.869027661357922e-05, |
| "loss": 4.396, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.052388935456831515, |
| "grad_norm": 1.729953408241272, |
| "learning_rate": 4.7380553227158424e-05, |
| "loss": 2.9272, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.052388935456831515, |
| "eval_accuracy": 0.44252870820096124, |
| "eval_loss": 2.4654057025909424, |
| "eval_runtime": 53.4591, |
| "eval_samples_per_second": 114.929, |
| "eval_steps_per_second": 3.592, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.07858340318524727, |
| "grad_norm": 1.887872576713562, |
| "learning_rate": 4.607082984073764e-05, |
| "loss": 2.3046, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.10477787091366303, |
| "grad_norm": 1.500243902206421, |
| "learning_rate": 4.4761106454316845e-05, |
| "loss": 1.9903, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.10477787091366303, |
| "eval_accuracy": 0.5657442303580157, |
| "eval_loss": 1.8096131086349487, |
| "eval_runtime": 54.753, |
| "eval_samples_per_second": 112.213, |
| "eval_steps_per_second": 3.507, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.1309723386420788, |
| "grad_norm": 1.718226432800293, |
| "learning_rate": 4.345138306789606e-05, |
| "loss": 1.8341, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.15716680637049454, |
| "grad_norm": 1.5894819498062134, |
| "learning_rate": 4.2141659681475273e-05, |
| "loss": 1.7407, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.15716680637049454, |
| "eval_accuracy": 0.5995431889459107, |
| "eval_loss": 1.6194320917129517, |
| "eval_runtime": 54.256, |
| "eval_samples_per_second": 113.241, |
| "eval_steps_per_second": 3.539, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.18336127409891032, |
| "grad_norm": 1.4065430164337158, |
| "learning_rate": 4.0831936295054484e-05, |
| "loss": 1.6698, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.20955574182732606, |
| "grad_norm": 1.3207411766052246, |
| "learning_rate": 3.95222129086337e-05, |
| "loss": 1.623, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.20955574182732606, |
| "eval_accuracy": 0.6163172170291626, |
| "eval_loss": 1.5256904363632202, |
| "eval_runtime": 54.1432, |
| "eval_samples_per_second": 113.477, |
| "eval_steps_per_second": 3.546, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.23575020955574183, |
| "grad_norm": 1.3076437711715698, |
| "learning_rate": 3.821248952221291e-05, |
| "loss": 1.5788, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.2619446772841576, |
| "grad_norm": 1.2652488946914673, |
| "learning_rate": 3.690276613579212e-05, |
| "loss": 1.5462, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.2619446772841576, |
| "eval_accuracy": 0.6284878141291952, |
| "eval_loss": 1.4622727632522583, |
| "eval_runtime": 53.2548, |
| "eval_samples_per_second": 115.37, |
| "eval_steps_per_second": 3.605, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.28813914501257337, |
| "grad_norm": 1.1916437149047852, |
| "learning_rate": 3.5593042749371334e-05, |
| "loss": 1.515, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.3143336127409891, |
| "grad_norm": 1.3247679471969604, |
| "learning_rate": 3.4283319362950544e-05, |
| "loss": 1.4894, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.3143336127409891, |
| "eval_accuracy": 0.6376865619399642, |
| "eval_loss": 1.4145272970199585, |
| "eval_runtime": 52.5759, |
| "eval_samples_per_second": 116.86, |
| "eval_steps_per_second": 3.652, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.34052808046940486, |
| "grad_norm": 1.1543827056884766, |
| "learning_rate": 3.297359597652976e-05, |
| "loss": 1.4695, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.36672254819782063, |
| "grad_norm": 1.1558704376220703, |
| "learning_rate": 3.166387259010897e-05, |
| "loss": 1.4533, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.36672254819782063, |
| "eval_accuracy": 0.6449126153164711, |
| "eval_loss": 1.3772516250610352, |
| "eval_runtime": 52.553, |
| "eval_samples_per_second": 116.911, |
| "eval_steps_per_second": 3.653, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.3929170159262364, |
| "grad_norm": 1.2273856401443481, |
| "learning_rate": 3.035414920368818e-05, |
| "loss": 1.4362, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.4191114836546521, |
| "grad_norm": 1.1397446393966675, |
| "learning_rate": 2.9044425817267394e-05, |
| "loss": 1.4216, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.4191114836546521, |
| "eval_accuracy": 0.6501475185320952, |
| "eval_loss": 1.3503440618515015, |
| "eval_runtime": 52.6263, |
| "eval_samples_per_second": 116.748, |
| "eval_steps_per_second": 3.648, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.4453059513830679, |
| "grad_norm": 1.335091471672058, |
| "learning_rate": 2.7734702430846605e-05, |
| "loss": 1.4069, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.47150041911148366, |
| "grad_norm": 1.1339608430862427, |
| "learning_rate": 2.642497904442582e-05, |
| "loss": 1.394, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.47150041911148366, |
| "eval_accuracy": 0.6541592525558, |
| "eval_loss": 1.3288276195526123, |
| "eval_runtime": 52.5059, |
| "eval_samples_per_second": 117.015, |
| "eval_steps_per_second": 3.657, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.49769488683989943, |
| "grad_norm": 1.1313315629959106, |
| "learning_rate": 2.5115255658005033e-05, |
| "loss": 1.3842, |
| "step": 9500 |
| }, |
| { |
| "epoch": 0.5238893545683152, |
| "grad_norm": 1.1140952110290527, |
| "learning_rate": 2.3805532271584244e-05, |
| "loss": 1.3762, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.5238893545683152, |
| "eval_accuracy": 0.6584938026942815, |
| "eval_loss": 1.3073725700378418, |
| "eval_runtime": 52.8006, |
| "eval_samples_per_second": 116.362, |
| "eval_steps_per_second": 3.636, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.5500838222967309, |
| "grad_norm": 1.1034098863601685, |
| "learning_rate": 2.2495808885163454e-05, |
| "loss": 1.3638, |
| "step": 10500 |
| }, |
| { |
| "epoch": 0.5762782900251467, |
| "grad_norm": 1.1063100099563599, |
| "learning_rate": 2.1186085498742665e-05, |
| "loss": 1.3551, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.5762782900251467, |
| "eval_accuracy": 0.662028233443304, |
| "eval_loss": 1.2898907661437988, |
| "eval_runtime": 52.4641, |
| "eval_samples_per_second": 117.109, |
| "eval_steps_per_second": 3.66, |
| "step": 11000 |
| }, |
| { |
| "epoch": 0.6024727577535625, |
| "grad_norm": 1.124444842338562, |
| "learning_rate": 1.987636211232188e-05, |
| "loss": 1.3471, |
| "step": 11500 |
| }, |
| { |
| "epoch": 0.6286672254819782, |
| "grad_norm": 1.1195414066314697, |
| "learning_rate": 1.856663872590109e-05, |
| "loss": 1.3385, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.6286672254819782, |
| "eval_accuracy": 0.6649752629622842, |
| "eval_loss": 1.27458655834198, |
| "eval_runtime": 52.9529, |
| "eval_samples_per_second": 116.028, |
| "eval_steps_per_second": 3.626, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.654861693210394, |
| "grad_norm": 1.1512706279754639, |
| "learning_rate": 1.7256915339480304e-05, |
| "loss": 1.3312, |
| "step": 12500 |
| }, |
| { |
| "epoch": 0.6810561609388097, |
| "grad_norm": 1.186962604522705, |
| "learning_rate": 1.5947191953059515e-05, |
| "loss": 1.323, |
| "step": 13000 |
| }, |
| { |
| "epoch": 0.6810561609388097, |
| "eval_accuracy": 0.6676055222079668, |
| "eval_loss": 1.2622450590133667, |
| "eval_runtime": 52.9391, |
| "eval_samples_per_second": 116.058, |
| "eval_steps_per_second": 3.627, |
| "step": 13000 |
| }, |
| { |
| "epoch": 0.7072506286672254, |
| "grad_norm": 1.1225796937942505, |
| "learning_rate": 1.4637468566638727e-05, |
| "loss": 1.317, |
| "step": 13500 |
| }, |
| { |
| "epoch": 0.7334450963956413, |
| "grad_norm": 1.1574194431304932, |
| "learning_rate": 1.3327745180217938e-05, |
| "loss": 1.318, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.7334450963956413, |
| "eval_accuracy": 0.669778047613229, |
| "eval_loss": 1.2517098188400269, |
| "eval_runtime": 53.1952, |
| "eval_samples_per_second": 115.499, |
| "eval_steps_per_second": 3.609, |
| "step": 14000 |
| }, |
| { |
| "epoch": 0.759639564124057, |
| "grad_norm": 1.1191082000732422, |
| "learning_rate": 1.201802179379715e-05, |
| "loss": 1.3062, |
| "step": 14500 |
| }, |
| { |
| "epoch": 0.7858340318524728, |
| "grad_norm": 1.101965069770813, |
| "learning_rate": 1.0708298407376362e-05, |
| "loss": 1.3015, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.7858340318524728, |
| "eval_accuracy": 0.6718552714646465, |
| "eval_loss": 1.2416285276412964, |
| "eval_runtime": 53.148, |
| "eval_samples_per_second": 115.602, |
| "eval_steps_per_second": 3.613, |
| "step": 15000 |
| }, |
| { |
| "epoch": 0.8120284995808885, |
| "grad_norm": 1.1240930557250977, |
| "learning_rate": 9.398575020955575e-06, |
| "loss": 1.2988, |
| "step": 15500 |
| }, |
| { |
| "epoch": 0.8382229673093042, |
| "grad_norm": 1.1306583881378174, |
| "learning_rate": 8.088851634534787e-06, |
| "loss": 1.2967, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.8382229673093042, |
| "eval_accuracy": 0.6734273493503584, |
| "eval_loss": 1.233677864074707, |
| "eval_runtime": 52.7559, |
| "eval_samples_per_second": 116.461, |
| "eval_steps_per_second": 3.639, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.8644174350377201, |
| "grad_norm": 1.1197803020477295, |
| "learning_rate": 6.779128248113999e-06, |
| "loss": 1.2894, |
| "step": 16500 |
| }, |
| { |
| "epoch": 0.8906119027661358, |
| "grad_norm": 1.1068426370620728, |
| "learning_rate": 5.46940486169321e-06, |
| "loss": 1.2859, |
| "step": 17000 |
| }, |
| { |
| "epoch": 0.8906119027661358, |
| "eval_accuracy": 0.6749321274743402, |
| "eval_loss": 1.2268511056900024, |
| "eval_runtime": 53.0583, |
| "eval_samples_per_second": 115.797, |
| "eval_steps_per_second": 3.619, |
| "step": 17000 |
| }, |
| { |
| "epoch": 0.9168063704945516, |
| "grad_norm": 1.1375619173049927, |
| "learning_rate": 4.159681475272423e-06, |
| "loss": 1.2834, |
| "step": 17500 |
| }, |
| { |
| "epoch": 0.9430008382229673, |
| "grad_norm": 1.1194686889648438, |
| "learning_rate": 2.8499580888516347e-06, |
| "loss": 1.2785, |
| "step": 18000 |
| }, |
| { |
| "epoch": 0.9430008382229673, |
| "eval_accuracy": 0.675972489512056, |
| "eval_loss": 1.2216249704360962, |
| "eval_runtime": 53.0273, |
| "eval_samples_per_second": 115.865, |
| "eval_steps_per_second": 3.621, |
| "step": 18000 |
| }, |
| { |
| "epoch": 0.969195305951383, |
| "grad_norm": 1.0964257717132568, |
| "learning_rate": 1.5402347024308467e-06, |
| "loss": 1.2771, |
| "step": 18500 |
| }, |
| { |
| "epoch": 0.9953897736797989, |
| "grad_norm": 1.1362768411636353, |
| "learning_rate": 2.3051131601005868e-07, |
| "loss": 1.2753, |
| "step": 19000 |
| }, |
| { |
| "epoch": 0.9953897736797989, |
| "eval_accuracy": 0.6768104749613066, |
| "eval_loss": 1.2177869081497192, |
| "eval_runtime": 52.3892, |
| "eval_samples_per_second": 117.276, |
| "eval_steps_per_second": 3.665, |
| "step": 19000 |
| }, |
| { |
| "epoch": 1.0, |
| "step": 19088, |
| "total_flos": 3.19199049547776e+17, |
| "train_loss": 1.5656107469256546, |
| "train_runtime": 7404.3159, |
| "train_samples_per_second": 82.494, |
| "train_steps_per_second": 2.578 |
| } |
| ], |
| "logging_steps": 500, |
| "max_steps": 19088, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.19199049547776e+17, |
| "train_batch_size": 32, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|