{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 4455, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06734006734006734, "grad_norm": 1277.759765625, "learning_rate": 0.00014849999999999998, "loss": 306.280546875, "step": 100 }, { "epoch": 0.13468013468013468, "grad_norm": 45.35179138183594, "learning_rate": 0.0002985, "loss": 116.630830078125, "step": 200 }, { "epoch": 0.20202020202020202, "grad_norm": 38.45261764526367, "learning_rate": 0.00029301997649823735, "loss": 81.141572265625, "step": 300 }, { "epoch": 0.26936026936026936, "grad_norm": 30.12755584716797, "learning_rate": 0.0002859694477085781, "loss": 71.679775390625, "step": 400 }, { "epoch": 0.3367003367003367, "grad_norm": 26.099262237548828, "learning_rate": 0.0002789189189189189, "loss": 67.6156201171875, "step": 500 }, { "epoch": 0.3367003367003367, "eval_loss": 6.426710605621338, "eval_runtime": 71.9526, "eval_samples_per_second": 13.342, "eval_steps_per_second": 3.336, "step": 500 }, { "epoch": 0.40404040404040403, "grad_norm": 25.531190872192383, "learning_rate": 0.00027186839012925966, "loss": 65.3996484375, "step": 600 }, { "epoch": 0.4713804713804714, "grad_norm": 25.456987380981445, "learning_rate": 0.00026481786133960043, "loss": 63.7711328125, "step": 700 }, { "epoch": 0.5387205387205387, "grad_norm": 24.871572494506836, "learning_rate": 0.00025776733254994125, "loss": 62.25517578125, "step": 800 }, { "epoch": 0.6060606060606061, "grad_norm": 29.645320892333984, "learning_rate": 0.000250716803760282, "loss": 60.900390625, "step": 900 }, { "epoch": 0.6734006734006734, "grad_norm": 25.431486129760742, "learning_rate": 0.00024366627497062276, "loss": 60.542490234375, "step": 1000 }, { "epoch": 0.6734006734006734, "eval_loss": 5.928141117095947, "eval_runtime": 71.83, "eval_samples_per_second": 13.365, "eval_steps_per_second": 3.341, "step": 1000 }, { "epoch": 0.7407407407407407, "grad_norm": 23.62899398803711, "learning_rate": 0.00023661574618096356, "loss": 59.750263671875, "step": 1100 }, { "epoch": 0.8080808080808081, "grad_norm": 18.71799087524414, "learning_rate": 0.00022956521739130433, "loss": 58.8070458984375, "step": 1200 }, { "epoch": 0.8754208754208754, "grad_norm": 21.216167449951172, "learning_rate": 0.00022251468860164512, "loss": 58.904775390625, "step": 1300 }, { "epoch": 0.9427609427609428, "grad_norm": 25.631328582763672, "learning_rate": 0.00021546415981198587, "loss": 57.6883837890625, "step": 1400 }, { "epoch": 1.0101010101010102, "grad_norm": 24.354522705078125, "learning_rate": 0.00020841363102232666, "loss": 57.590849609375, "step": 1500 }, { "epoch": 1.0101010101010102, "eval_loss": 5.705260276794434, "eval_runtime": 72.7006, "eval_samples_per_second": 13.205, "eval_steps_per_second": 3.301, "step": 1500 }, { "epoch": 1.0774410774410774, "grad_norm": 28.926172256469727, "learning_rate": 0.00020136310223266743, "loss": 54.9418896484375, "step": 1600 }, { "epoch": 1.144781144781145, "grad_norm": 23.21477508544922, "learning_rate": 0.00019431257344300823, "loss": 55.0524072265625, "step": 1700 }, { "epoch": 1.2121212121212122, "grad_norm": 22.93968391418457, "learning_rate": 0.00018726204465334897, "loss": 54.60087890625, "step": 1800 }, { "epoch": 1.2794612794612794, "grad_norm": 21.029342651367188, "learning_rate": 0.00018021151586368976, "loss": 53.961689453125, "step": 1900 }, { "epoch": 1.3468013468013469, "grad_norm": 22.02585220336914, "learning_rate": 0.00017316098707403053, "loss": 54.352119140625, "step": 2000 }, { "epoch": 1.3468013468013469, "eval_loss": 5.576821804046631, "eval_runtime": 70.7614, "eval_samples_per_second": 13.567, "eval_steps_per_second": 3.392, "step": 2000 }, { "epoch": 1.4141414141414141, "grad_norm": 22.914306640625, "learning_rate": 0.00016611045828437133, "loss": 53.9832177734375, "step": 2100 }, { "epoch": 1.4814814814814814, "grad_norm": 22.693891525268555, "learning_rate": 0.00015905992949471207, "loss": 53.846845703125, "step": 2200 }, { "epoch": 1.5488215488215489, "grad_norm": 22.133243560791016, "learning_rate": 0.00015200940070505287, "loss": 54.00986328125, "step": 2300 }, { "epoch": 1.6161616161616161, "grad_norm": 23.607688903808594, "learning_rate": 0.00014495887191539364, "loss": 53.8654638671875, "step": 2400 }, { "epoch": 1.6835016835016834, "grad_norm": 22.71261215209961, "learning_rate": 0.0001379083431257344, "loss": 53.5878173828125, "step": 2500 }, { "epoch": 1.6835016835016834, "eval_loss": 5.490618705749512, "eval_runtime": 72.5632, "eval_samples_per_second": 13.23, "eval_steps_per_second": 3.307, "step": 2500 }, { "epoch": 1.7508417508417509, "grad_norm": 19.942703247070312, "learning_rate": 0.0001308578143360752, "loss": 52.879384765625, "step": 2600 }, { "epoch": 1.8181818181818183, "grad_norm": 20.36492919921875, "learning_rate": 0.00012380728554641597, "loss": 52.678017578125, "step": 2700 }, { "epoch": 1.8855218855218854, "grad_norm": 20.429353713989258, "learning_rate": 0.00011675675675675675, "loss": 53.1516796875, "step": 2800 }, { "epoch": 1.9528619528619529, "grad_norm": 20.908979415893555, "learning_rate": 0.00010970622796709752, "loss": 52.3822607421875, "step": 2900 }, { "epoch": 2.0202020202020203, "grad_norm": 24.01288604736328, "learning_rate": 0.0001026556991774383, "loss": 51.8819384765625, "step": 3000 }, { "epoch": 2.0202020202020203, "eval_loss": 5.448428153991699, "eval_runtime": 70.4982, "eval_samples_per_second": 13.617, "eval_steps_per_second": 3.404, "step": 3000 }, { "epoch": 2.0875420875420874, "grad_norm": 19.735408782958984, "learning_rate": 9.560517038777907e-05, "loss": 51.3095654296875, "step": 3100 }, { "epoch": 2.154882154882155, "grad_norm": 23.40346336364746, "learning_rate": 8.855464159811985e-05, "loss": 51.257080078125, "step": 3200 }, { "epoch": 2.2222222222222223, "grad_norm": 19.5986385345459, "learning_rate": 8.150411280846062e-05, "loss": 50.9421484375, "step": 3300 }, { "epoch": 2.28956228956229, "grad_norm": 22.559755325317383, "learning_rate": 7.44535840188014e-05, "loss": 50.7065576171875, "step": 3400 }, { "epoch": 2.356902356902357, "grad_norm": 21.577171325683594, "learning_rate": 6.740305522914217e-05, "loss": 51.1507177734375, "step": 3500 }, { "epoch": 2.356902356902357, "eval_loss": 5.395638942718506, "eval_runtime": 73.4284, "eval_samples_per_second": 13.074, "eval_steps_per_second": 3.268, "step": 3500 }, { "epoch": 2.4242424242424243, "grad_norm": 29.17755889892578, "learning_rate": 6.0352526439482956e-05, "loss": 50.8910009765625, "step": 3600 }, { "epoch": 2.4915824915824913, "grad_norm": 20.923023223876953, "learning_rate": 5.330199764982373e-05, "loss": 50.6894189453125, "step": 3700 }, { "epoch": 2.558922558922559, "grad_norm": 18.506322860717773, "learning_rate": 4.625146886016451e-05, "loss": 51.0062451171875, "step": 3800 }, { "epoch": 2.6262626262626263, "grad_norm": 22.548017501831055, "learning_rate": 3.920094007050528e-05, "loss": 50.7424365234375, "step": 3900 }, { "epoch": 2.6936026936026938, "grad_norm": 22.250715255737305, "learning_rate": 3.2150411280846066e-05, "loss": 50.8699951171875, "step": 4000 }, { "epoch": 2.6936026936026938, "eval_loss": 5.3642354011535645, "eval_runtime": 71.4784, "eval_samples_per_second": 13.431, "eval_steps_per_second": 3.358, "step": 4000 }, { "epoch": 2.760942760942761, "grad_norm": 21.673240661621094, "learning_rate": 2.5099882491186838e-05, "loss": 50.46708984375, "step": 4100 }, { "epoch": 2.8282828282828283, "grad_norm": 21.410390853881836, "learning_rate": 1.8049353701527613e-05, "loss": 50.4184521484375, "step": 4200 }, { "epoch": 2.8956228956228958, "grad_norm": 21.75411033630371, "learning_rate": 1.0998824911868389e-05, "loss": 50.68787109375, "step": 4300 }, { "epoch": 2.962962962962963, "grad_norm": 21.319902420043945, "learning_rate": 3.948296122209165e-06, "loss": 50.4838671875, "step": 4400 } ], "logging_steps": 100, "max_steps": 4455, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.714151970521088e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }