{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 27080, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.36927621861152143, "grad_norm": 13247.388671875, "learning_rate": 0.0005993999999999999, "loss": 0.6625, "step": 1000 }, { "epoch": 0.7385524372230429, "grad_norm": 9940.6201171875, "learning_rate": 0.0005770168711656441, "loss": 0.4984, "step": 2000 }, { "epoch": 1.0, "eval_accuracy": 0.46332530241587166, "eval_loss": 1.8212859630584717, "eval_runtime": 10.2423, "eval_samples_per_second": 60.826, "eval_steps_per_second": 1.953, "step": 2708 }, { "epoch": 1.1078286558345642, "grad_norm": 8275.166015625, "learning_rate": 0.000554010736196319, "loss": 0.4532, "step": 3000 }, { "epoch": 1.4771048744460857, "grad_norm": 7695.97265625, "learning_rate": 0.0005310046012269938, "loss": 0.4253, "step": 4000 }, { "epoch": 1.846381093057607, "grad_norm": 6635.02685546875, "learning_rate": 0.0005079984662576687, "loss": 0.408, "step": 5000 }, { "epoch": 2.0, "eval_accuracy": 0.5106941037150584, "eval_loss": 1.6397608518600464, "eval_runtime": 9.7196, "eval_samples_per_second": 64.097, "eval_steps_per_second": 2.058, "step": 5416 }, { "epoch": 2.2156573116691285, "grad_norm": 7749.61181640625, "learning_rate": 0.00048499233128834355, "loss": 0.3934, "step": 6000 }, { "epoch": 2.58493353028065, "grad_norm": 6143.23388671875, "learning_rate": 0.0004619861963190184, "loss": 0.3836, "step": 7000 }, { "epoch": 2.9542097488921715, "grad_norm": 6868.88916015625, "learning_rate": 0.00043898006134969323, "loss": 0.3764, "step": 8000 }, { "epoch": 3.0, "eval_accuracy": 0.5333796131966716, "eval_loss": 1.5575107336044312, "eval_runtime": 9.7301, "eval_samples_per_second": 64.028, "eval_steps_per_second": 2.055, "step": 8124 }, { "epoch": 3.323485967503693, "grad_norm": 6560.751953125, "learning_rate": 0.00041597392638036807, "loss": 0.3684, "step": 9000 }, { "epoch": 3.692762186115214, "grad_norm": 6382.58203125, "learning_rate": 0.0003929677914110429, "loss": 0.3628, "step": 10000 }, { "epoch": 4.0, "eval_accuracy": 0.547043690494514, "eval_loss": 1.5047563314437866, "eval_runtime": 9.717, "eval_samples_per_second": 64.115, "eval_steps_per_second": 2.058, "step": 10832 }, { "epoch": 4.062038404726736, "grad_norm": 5827.2607421875, "learning_rate": 0.0003699616564417178, "loss": 0.3585, "step": 11000 }, { "epoch": 4.431314623338257, "grad_norm": 6025.1650390625, "learning_rate": 0.00034695552147239264, "loss": 0.3539, "step": 12000 }, { "epoch": 4.800590841949779, "grad_norm": 5609.11474609375, "learning_rate": 0.0003239493865030675, "loss": 0.3505, "step": 13000 }, { "epoch": 5.0, "eval_accuracy": 0.5583267630586173, "eval_loss": 1.4623572826385498, "eval_runtime": 9.7131, "eval_samples_per_second": 64.14, "eval_steps_per_second": 2.059, "step": 13540 }, { "epoch": 5.1698670605613, "grad_norm": 5670.53466796875, "learning_rate": 0.0003009432515337423, "loss": 0.3465, "step": 14000 }, { "epoch": 5.539143279172821, "grad_norm": 6664.6083984375, "learning_rate": 0.00027793711656441715, "loss": 0.3426, "step": 15000 }, { "epoch": 5.908419497784343, "grad_norm": 5798.9404296875, "learning_rate": 0.000254930981595092, "loss": 0.3399, "step": 16000 }, { "epoch": 6.0, "eval_accuracy": 0.5711521487154196, "eval_loss": 1.427840232849121, "eval_runtime": 9.718, "eval_samples_per_second": 64.108, "eval_steps_per_second": 2.058, "step": 16248 }, { "epoch": 6.277695716395864, "grad_norm": 5907.61376953125, "learning_rate": 0.00023192484662576683, "loss": 0.3356, "step": 17000 }, { "epoch": 6.646971935007386, "grad_norm": 5923.216796875, "learning_rate": 0.0002089187116564417, "loss": 0.3334, "step": 18000 }, { "epoch": 7.0, "eval_accuracy": 0.5765989326313872, "eval_loss": 1.406113624572754, "eval_runtime": 9.7146, "eval_samples_per_second": 64.13, "eval_steps_per_second": 2.059, "step": 18956 }, { "epoch": 7.016248153618907, "grad_norm": 6729.748046875, "learning_rate": 0.00018591257668711654, "loss": 0.3315, "step": 19000 }, { "epoch": 7.385524372230428, "grad_norm": 6439.97607421875, "learning_rate": 0.00016290644171779137, "loss": 0.3267, "step": 20000 }, { "epoch": 7.75480059084195, "grad_norm": 6263.06689453125, "learning_rate": 0.00013990030674846624, "loss": 0.3248, "step": 21000 }, { "epoch": 8.0, "eval_accuracy": 0.5833838537723848, "eval_loss": 1.3805782794952393, "eval_runtime": 9.8981, "eval_samples_per_second": 62.941, "eval_steps_per_second": 2.021, "step": 21664 }, { "epoch": 8.124076809453472, "grad_norm": 6769.88916015625, "learning_rate": 0.00011689417177914109, "loss": 0.3226, "step": 22000 }, { "epoch": 8.493353028064993, "grad_norm": 5885.490234375, "learning_rate": 9.388803680981594e-05, "loss": 0.3188, "step": 23000 }, { "epoch": 8.862629246676514, "grad_norm": 6447.85498046875, "learning_rate": 7.08819018404908e-05, "loss": 0.3178, "step": 24000 }, { "epoch": 9.0, "eval_accuracy": 0.5892798245972238, "eval_loss": 1.3639582395553589, "eval_runtime": 9.9358, "eval_samples_per_second": 62.703, "eval_steps_per_second": 2.013, "step": 24372 }, { "epoch": 9.231905465288035, "grad_norm": 6391.68896484375, "learning_rate": 4.787576687116564e-05, "loss": 0.3141, "step": 25000 }, { "epoch": 9.601181683899556, "grad_norm": 6642.7880859375, "learning_rate": 2.4869631901840487e-05, "loss": 0.3118, "step": 26000 }, { "epoch": 9.970457902511079, "grad_norm": 6367.48193359375, "learning_rate": 1.863496932515337e-06, "loss": 0.311, "step": 27000 }, { "epoch": 10.0, "eval_accuracy": 0.5924084271233505, "eval_loss": 1.3527692556381226, "eval_runtime": 9.7249, "eval_samples_per_second": 64.063, "eval_steps_per_second": 2.057, "step": 27080 }, { "epoch": 10.0, "step": 27080, "total_flos": 2.2639125528576e+17, "train_loss": 0.36916396628454484, "train_runtime": 23461.5127, "train_samples_per_second": 36.93, "train_steps_per_second": 1.154 } ], "logging_steps": 1000, "max_steps": 27080, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.2639125528576e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }