{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 324, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.030864197530864196, "grad_norm": 2.801547146682613, "learning_rate": 6.060606060606061e-06, "loss": 3.4098, "mean_token_accuracy": 0.37030486166477206, "step": 10 }, { "epoch": 0.06172839506172839, "grad_norm": 1.75272536141732, "learning_rate": 1.2121212121212122e-05, "loss": 3.3597, "mean_token_accuracy": 0.3771016627550125, "step": 20 }, { "epoch": 0.09259259259259259, "grad_norm": 1.398235214213489, "learning_rate": 1.8181818181818182e-05, "loss": 3.3316, "mean_token_accuracy": 0.3800418496131897, "step": 30 }, { "epoch": 0.12345679012345678, "grad_norm": 1.1493166383946674, "learning_rate": 1.9971458739130598e-05, "loss": 3.2765, "mean_token_accuracy": 0.3864120662212372, "step": 40 }, { "epoch": 0.15432098765432098, "grad_norm": 1.0928521826388469, "learning_rate": 1.983205687062742e-05, "loss": 3.2601, "mean_token_accuracy": 0.38865621387958527, "step": 50 }, { "epoch": 0.18518518518518517, "grad_norm": 1.051132616366509, "learning_rate": 1.957817324187987e-05, "loss": 3.2511, "mean_token_accuracy": 0.38999360203742983, "step": 60 }, { "epoch": 0.21604938271604937, "grad_norm": 1.0185881030179378, "learning_rate": 1.921276400388451e-05, "loss": 3.2235, "mean_token_accuracy": 0.3924990862607956, "step": 70 }, { "epoch": 0.24691358024691357, "grad_norm": 1.0076933028462502, "learning_rate": 1.874008388109276e-05, "loss": 3.2341, "mean_token_accuracy": 0.39132562279701233, "step": 80 }, { "epoch": 0.2777777777777778, "grad_norm": 0.9614110884294764, "learning_rate": 1.816563663057211e-05, "loss": 3.2218, "mean_token_accuracy": 0.3926112145185471, "step": 90 }, { "epoch": 0.30864197530864196, "grad_norm": 0.8588255327540169, "learning_rate": 1.749611095777581e-05, "loss": 3.2184, "mean_token_accuracy": 0.3942494422197342, "step": 100 }, { "epoch": 0.30864197530864196, "eval_loss": 3.005662441253662, "eval_mean_token_accuracy": 0.4227954422434171, "eval_runtime": 1.7433, "eval_samples_per_second": 53.347, "eval_steps_per_second": 6.883, "step": 100 }, { "epoch": 0.3395061728395062, "grad_norm": 0.9111279933093522, "learning_rate": 1.673930263510011e-05, "loss": 3.2183, "mean_token_accuracy": 0.39443579018116, "step": 110 }, { "epoch": 0.37037037037037035, "grad_norm": 0.8443631468404235, "learning_rate": 1.5904023730059227e-05, "loss": 3.174, "mean_token_accuracy": 0.3968636095523834, "step": 120 }, { "epoch": 0.4012345679012346, "grad_norm": 0.7784470701259443, "learning_rate": 1.5000000000000002e-05, "loss": 3.198, "mean_token_accuracy": 0.39601509273052216, "step": 130 }, { "epoch": 0.43209876543209874, "grad_norm": 0.9408416181737065, "learning_rate": 1.4037757648064019e-05, "loss": 3.169, "mean_token_accuracy": 0.39953720271587373, "step": 140 }, { "epoch": 0.46296296296296297, "grad_norm": 0.8992234272178823, "learning_rate": 1.3028500758979507e-05, "loss": 3.1816, "mean_token_accuracy": 0.39692102670669555, "step": 150 }, { "epoch": 0.49382716049382713, "grad_norm": 0.7966343469917108, "learning_rate": 1.1983980841786899e-05, "loss": 3.1658, "mean_token_accuracy": 0.39880101978778837, "step": 160 }, { "epoch": 0.5246913580246914, "grad_norm": 0.7567594319754051, "learning_rate": 1.0916359998506549e-05, "loss": 3.1597, "mean_token_accuracy": 0.3991874396800995, "step": 170 }, { "epoch": 0.5555555555555556, "grad_norm": 0.8942537839940679, "learning_rate": 9.838069311974986e-06, "loss": 3.1839, "mean_token_accuracy": 0.39815137088298796, "step": 180 }, { "epoch": 0.5864197530864198, "grad_norm": 0.7891927796973276, "learning_rate": 8.7616641017427e-06, "loss": 3.1867, "mean_token_accuracy": 0.3971911668777466, "step": 190 }, { "epoch": 0.6172839506172839, "grad_norm": 0.8968961057445863, "learning_rate": 7.699677733393827e-06, "loss": 3.1794, "mean_token_accuracy": 0.3982805788516998, "step": 200 }, { "epoch": 0.6172839506172839, "eval_loss": 2.987029790878296, "eval_mean_token_accuracy": 0.42388613273700076, "eval_runtime": 1.7289, "eval_samples_per_second": 53.791, "eval_steps_per_second": 6.941, "step": 200 }, { "epoch": 0.6481481481481481, "grad_norm": 0.749592048458194, "learning_rate": 6.664475683491797e-06, "loss": 3.1534, "mean_token_accuracy": 0.4000447064638138, "step": 210 }, { "epoch": 0.6790123456790124, "grad_norm": 0.8463738403795209, "learning_rate": 5.6681115593784705e-06, "loss": 3.1758, "mean_token_accuracy": 0.39696604907512667, "step": 220 }, { "epoch": 0.7098765432098766, "grad_norm": 0.773029467101032, "learning_rate": 4.722186750292511e-06, "loss": 3.1788, "mean_token_accuracy": 0.398376402258873, "step": 230 }, { "epoch": 0.7407407407407407, "grad_norm": 0.7480708017713564, "learning_rate": 3.837715343990727e-06, "loss": 3.1579, "mean_token_accuracy": 0.40122034549713137, "step": 240 }, { "epoch": 0.7716049382716049, "grad_norm": 0.7727925167197635, "learning_rate": 3.024995881745972e-06, "loss": 3.1877, "mean_token_accuracy": 0.39733474254608153, "step": 250 }, { "epoch": 0.8024691358024691, "grad_norm": 0.8121479296047495, "learning_rate": 2.293491444971109e-06, "loss": 3.1588, "mean_token_accuracy": 0.39986406564712523, "step": 260 }, { "epoch": 0.8333333333333334, "grad_norm": 0.818064094402815, "learning_rate": 1.6517194697072903e-06, "loss": 3.1651, "mean_token_accuracy": 0.3996410697698593, "step": 270 }, { "epoch": 0.8641975308641975, "grad_norm": 0.7271998536527885, "learning_rate": 1.1071525719463094e-06, "loss": 3.1971, "mean_token_accuracy": 0.39565774202346804, "step": 280 }, { "epoch": 0.8950617283950617, "grad_norm": 0.7601308393782793, "learning_rate": 6.661315385496426e-07, "loss": 3.1713, "mean_token_accuracy": 0.3977700412273407, "step": 290 }, { "epoch": 0.9259259259259259, "grad_norm": 0.7456928451079772, "learning_rate": 3.3379149687388866e-07, "loss": 3.1662, "mean_token_accuracy": 0.3995864540338516, "step": 300 }, { "epoch": 0.9259259259259259, "eval_loss": 2.9822428226470947, "eval_mean_token_accuracy": 0.4246668443083763, "eval_runtime": 1.7305, "eval_samples_per_second": 53.742, "eval_steps_per_second": 6.934, "step": 300 }, { "epoch": 0.9567901234567902, "grad_norm": 0.7365449920194871, "learning_rate": 1.1400212276321377e-07, "loss": 3.1553, "mean_token_accuracy": 0.401428085565567, "step": 310 }, { "epoch": 0.9876543209876543, "grad_norm": 0.7469534948900145, "learning_rate": 9.322583110392692e-09, "loss": 3.1715, "mean_token_accuracy": 0.3986253648996353, "step": 320 }, { "epoch": 1.0, "mean_token_accuracy": 0.4006178230047226, "step": 324, "total_flos": 15459466346496.0, "train_loss": 3.207212707142771, "train_runtime": 884.063, "train_samples_per_second": 23.43, "train_steps_per_second": 0.366 } ], "logging_steps": 10, "max_steps": 324, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 15459466346496.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }