{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 1000,
  "global_step": 19282,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02593092002904263,
      "grad_norm": 1.788983702659607,
      "learning_rate": 4.8703453998547873e-05,
      "loss": 4.3664,
      "step": 500
    },
    {
      "epoch": 0.05186184005808526,
      "grad_norm": 2.7369565963745117,
      "learning_rate": 4.740690799709574e-05,
      "loss": 2.8944,
      "step": 1000
    },
    {
      "epoch": 0.05186184005808526,
      "eval_accuracy": 0.4515463904526746,
      "eval_loss": 2.4388132095336914,
      "eval_runtime": 54.0003,
      "eval_samples_per_second": 114.851,
      "eval_steps_per_second": 3.593,
      "step": 1000
    },
    {
      "epoch": 0.07779276008712789,
      "grad_norm": 1.7776174545288086,
      "learning_rate": 4.611036199564361e-05,
      "loss": 2.262,
      "step": 1500
    },
    {
      "epoch": 0.10372368011617052,
      "grad_norm": 1.502121090888977,
      "learning_rate": 4.481381599419148e-05,
      "loss": 1.9548,
      "step": 2000
    },
    {
      "epoch": 0.10372368011617052,
      "eval_accuracy": 0.5749121385180512,
      "eval_loss": 1.7761105298995972,
      "eval_runtime": 54.3267,
      "eval_samples_per_second": 114.161,
      "eval_steps_per_second": 3.571,
      "step": 2000
    },
    {
      "epoch": 0.12965460014521316,
      "grad_norm": 1.4310986995697021,
      "learning_rate": 4.3517269992739344e-05,
      "loss": 1.8029,
      "step": 2500
    },
    {
      "epoch": 0.15558552017425578,
      "grad_norm": 1.4096485376358032,
      "learning_rate": 4.2220723991287215e-05,
      "loss": 1.711,
      "step": 3000
    },
    {
      "epoch": 0.15558552017425578,
      "eval_accuracy": 0.6069719256204366,
      "eval_loss": 1.5925041437149048,
      "eval_runtime": 54.3191,
      "eval_samples_per_second": 114.177,
      "eval_steps_per_second": 3.571,
      "step": 3000
    },
    {
      "epoch": 0.1815164402032984,
      "grad_norm": 1.5363044738769531,
      "learning_rate": 4.0924177989835086e-05,
      "loss": 1.6399,
      "step": 3500
    },
    {
      "epoch": 0.20744736023234103,
      "grad_norm": 1.3037211894989014,
      "learning_rate": 3.962763198838295e-05,
      "loss": 1.5894,
      "step": 4000
    },
    {
      "epoch": 0.20744736023234103,
      "eval_accuracy": 0.6245891417740249,
      "eval_loss": 1.494728446006775,
      "eval_runtime": 54.2507,
      "eval_samples_per_second": 114.321,
      "eval_steps_per_second": 3.576,
      "step": 4000
    },
    {
      "epoch": 0.23337828026138369,
      "grad_norm": 1.2526813745498657,
      "learning_rate": 3.833108598693082e-05,
      "loss": 1.5478,
      "step": 4500
    },
    {
      "epoch": 0.2593092002904263,
      "grad_norm": 1.3765000104904175,
      "learning_rate": 3.7034539985478686e-05,
      "loss": 1.5161,
      "step": 5000
    },
    {
      "epoch": 0.2593092002904263,
      "eval_accuracy": 0.6354598822377167,
      "eval_loss": 1.4352596998214722,
      "eval_runtime": 54.0077,
      "eval_samples_per_second": 114.835,
      "eval_steps_per_second": 3.592,
      "step": 5000
    },
    {
      "epoch": 0.28524012031946894,
      "grad_norm": 1.232720971107483,
      "learning_rate": 3.573799398402656e-05,
      "loss": 1.4843,
      "step": 5500
    },
    {
      "epoch": 0.31117104034851156,
      "grad_norm": 1.1925880908966064,
      "learning_rate": 3.444144798257442e-05,
      "loss": 1.4649,
      "step": 6000
    },
    {
      "epoch": 0.31117104034851156,
      "eval_accuracy": 0.6444491938557328,
      "eval_loss": 1.387238621711731,
      "eval_runtime": 54.3146,
      "eval_samples_per_second": 114.187,
      "eval_steps_per_second": 3.572,
      "step": 6000
    },
    {
      "epoch": 0.3371019603775542,
      "grad_norm": 1.1595429182052612,
      "learning_rate": 3.314490198112229e-05,
      "loss": 1.4448,
      "step": 6500
    },
    {
      "epoch": 0.3630328804065968,
      "grad_norm": 1.1648014783859253,
      "learning_rate": 3.184835597967016e-05,
      "loss": 1.4264,
      "step": 7000
    },
    {
      "epoch": 0.3630328804065968,
      "eval_accuracy": 0.6509181126890294,
      "eval_loss": 1.3541431427001953,
      "eval_runtime": 54.2062,
      "eval_samples_per_second": 114.415,
      "eval_steps_per_second": 3.579,
      "step": 7000
    },
    {
      "epoch": 0.38896380043563944,
      "grad_norm": 1.265906572341919,
      "learning_rate": 3.055180997821803e-05,
      "loss": 1.4074,
      "step": 7500
    },
    {
      "epoch": 0.41489472046468207,
      "grad_norm": 1.1421406269073486,
      "learning_rate": 2.9255263976765896e-05,
      "loss": 1.3953,
      "step": 8000
    },
    {
      "epoch": 0.41489472046468207,
      "eval_accuracy": 0.6560071278996495,
      "eval_loss": 1.3276734352111816,
      "eval_runtime": 53.8158,
      "eval_samples_per_second": 115.245,
      "eval_steps_per_second": 3.605,
      "step": 8000
    },
    {
      "epoch": 0.4408256404937247,
      "grad_norm": 1.147186279296875,
      "learning_rate": 2.7958717975313763e-05,
      "loss": 1.3818,
      "step": 8500
    },
    {
      "epoch": 0.46675656052276737,
      "grad_norm": 1.1580846309661865,
      "learning_rate": 2.6662171973861634e-05,
      "loss": 1.3682,
      "step": 9000
    },
    {
      "epoch": 0.46675656052276737,
      "eval_accuracy": 0.660986444318564,
      "eval_loss": 1.3023183345794678,
      "eval_runtime": 54.6308,
      "eval_samples_per_second": 113.526,
      "eval_steps_per_second": 3.551,
      "step": 9000
    },
    {
      "epoch": 0.49268748055181,
      "grad_norm": 1.0931369066238403,
      "learning_rate": 2.5365625972409502e-05,
      "loss": 1.3596,
      "step": 9500
    },
    {
      "epoch": 0.5186184005808526,
      "grad_norm": 1.1315568685531616,
      "learning_rate": 2.406907997095737e-05,
      "loss": 1.349,
      "step": 10000
    },
    {
      "epoch": 0.5186184005808526,
      "eval_accuracy": 0.6649929720271234,
      "eval_loss": 1.2824217081069946,
      "eval_runtime": 53.9326,
      "eval_samples_per_second": 114.995,
      "eval_steps_per_second": 3.597,
      "step": 10000
    },
    {
      "epoch": 0.5445493206098952,
      "grad_norm": 1.088722825050354,
      "learning_rate": 2.2772533969505237e-05,
      "loss": 1.3395,
      "step": 10500
    },
    {
      "epoch": 0.5704802406389379,
      "grad_norm": 1.1066893339157104,
      "learning_rate": 2.147598796805311e-05,
      "loss": 1.3305,
      "step": 11000
    },
    {
      "epoch": 0.5704802406389379,
      "eval_accuracy": 0.6681658834866437,
      "eval_loss": 1.2654088735580444,
      "eval_runtime": 54.0356,
      "eval_samples_per_second": 114.776,
      "eval_steps_per_second": 3.59,
      "step": 11000
    },
    {
      "epoch": 0.5964111606679805,
      "grad_norm": 1.1504087448120117,
      "learning_rate": 2.0179441966600976e-05,
      "loss": 1.3238,
      "step": 11500
    },
    {
      "epoch": 0.6223420806970231,
      "grad_norm": 1.0969709157943726,
      "learning_rate": 1.8882895965148844e-05,
      "loss": 1.3179,
      "step": 12000
    },
    {
      "epoch": 0.6223420806970231,
      "eval_accuracy": 0.6708257639590924,
      "eval_loss": 1.2524346113204956,
      "eval_runtime": 54.3034,
      "eval_samples_per_second": 114.21,
      "eval_steps_per_second": 3.573,
      "step": 12000
    },
    {
      "epoch": 0.6482730007260658,
      "grad_norm": 1.112231731414795,
      "learning_rate": 1.7586349963696715e-05,
      "loss": 1.3056,
      "step": 12500
    },
    {
      "epoch": 0.6742039207551084,
      "grad_norm": 1.141927719116211,
      "learning_rate": 1.6289803962244583e-05,
      "loss": 1.3027,
      "step": 13000
    },
    {
      "epoch": 0.6742039207551084,
      "eval_accuracy": 0.6735423851858717,
      "eval_loss": 1.2395827770233154,
      "eval_runtime": 54.1312,
      "eval_samples_per_second": 114.574,
      "eval_steps_per_second": 3.584,
      "step": 13000
    },
    {
      "epoch": 0.700134840784151,
      "grad_norm": 1.1116396188735962,
      "learning_rate": 1.499325796079245e-05,
      "loss": 1.2961,
      "step": 13500
    },
    {
      "epoch": 0.7260657608131936,
      "grad_norm": 1.0832737684249878,
      "learning_rate": 1.369671195934032e-05,
      "loss": 1.2895,
      "step": 14000
    },
    {
      "epoch": 0.7260657608131936,
      "eval_accuracy": 0.6757629976518784,
      "eval_loss": 1.2289599180221558,
      "eval_runtime": 54.7148,
      "eval_samples_per_second": 113.351,
      "eval_steps_per_second": 3.546,
      "step": 14000
    },
    {
      "epoch": 0.7519966808422363,
      "grad_norm": 1.1050748825073242,
      "learning_rate": 1.2400165957888187e-05,
      "loss": 1.2825,
      "step": 14500
    },
    {
      "epoch": 0.7779276008712789,
      "grad_norm": 1.1190505027770996,
      "learning_rate": 1.1103619956436055e-05,
      "loss": 1.2797,
      "step": 15000
    },
    {
      "epoch": 0.7779276008712789,
      "eval_accuracy": 0.6775377223567713,
      "eval_loss": 1.219657301902771,
      "eval_runtime": 54.2283,
      "eval_samples_per_second": 114.368,
      "eval_steps_per_second": 3.577,
      "step": 15000
    },
    {
      "epoch": 0.8038585209003215,
      "grad_norm": 1.1283358335494995,
      "learning_rate": 9.807073954983924e-06,
      "loss": 1.2737,
      "step": 15500
    },
    {
      "epoch": 0.8297894409293641,
      "grad_norm": 5.891765117645264,
      "learning_rate": 8.51052795353179e-06,
      "loss": 1.2697,
      "step": 16000
    },
    {
      "epoch": 0.8297894409293641,
      "eval_accuracy": 0.6790465535823433,
      "eval_loss": 1.2125846147537231,
      "eval_runtime": 53.4813,
      "eval_samples_per_second": 115.966,
      "eval_steps_per_second": 3.627,
      "step": 16000
    },
    {
      "epoch": 0.8557203609584068,
      "grad_norm": 1.1195549964904785,
      "learning_rate": 7.21398195207966e-06,
      "loss": 1.2664,
      "step": 16500
    },
    {
      "epoch": 0.8816512809874494,
      "grad_norm": 1.1249275207519531,
      "learning_rate": 5.917435950627528e-06,
      "loss": 1.2629,
      "step": 17000
    },
    {
      "epoch": 0.8816512809874494,
      "eval_accuracy": 0.6806890408070049,
      "eval_loss": 1.2044044733047485,
      "eval_runtime": 53.0931,
      "eval_samples_per_second": 116.814,
      "eval_steps_per_second": 3.654,
      "step": 17000
    },
    {
      "epoch": 0.9075822010164921,
      "grad_norm": 1.1231811046600342,
      "learning_rate": 4.620889949175397e-06,
      "loss": 1.258,
      "step": 17500
    },
    {
      "epoch": 0.9335131210455347,
      "grad_norm": 1.1008754968643188,
      "learning_rate": 3.324343947723265e-06,
      "loss": 1.2562,
      "step": 18000
    },
    {
      "epoch": 0.9335131210455347,
      "eval_accuracy": 0.682017247297958,
      "eval_loss": 1.1984517574310303,
      "eval_runtime": 52.9563,
      "eval_samples_per_second": 117.115,
      "eval_steps_per_second": 3.663,
      "step": 18000
    },
    {
      "epoch": 0.9594440410745774,
      "grad_norm": 1.1336047649383545,
      "learning_rate": 2.027797946271134e-06,
      "loss": 1.2536,
      "step": 18500
    },
    {
      "epoch": 0.98537496110362,
      "grad_norm": 1.116174578666687,
      "learning_rate": 7.312519448190021e-07,
      "loss": 1.2531,
      "step": 19000
    },
    {
      "epoch": 0.98537496110362,
      "eval_accuracy": 0.6826912013688392,
      "eval_loss": 1.1951990127563477,
      "eval_runtime": 52.9834,
      "eval_samples_per_second": 117.055,
      "eval_steps_per_second": 3.662,
      "step": 19000
    },
    {
      "epoch": 1.0,
      "step": 19282,
      "total_flos": 3.22443774001152e+17,
      "train_loss": 1.535953860030161,
      "train_runtime": 7521.8474,
      "train_samples_per_second": 82.03,
      "train_steps_per_second": 2.563
    }
  ],
  "logging_steps": 500,
  "max_steps": 19282,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.22443774001152e+17,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}