{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 200, "global_step": 318, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.09456264775413711, "grad_norm": 18.87253189086914, "learning_rate": 2.8125e-06, "loss": 2.3773, "step": 10 }, { "epoch": 0.18912529550827423, "grad_norm": 6.278682708740234, "learning_rate": 5.9375e-06, "loss": 1.002, "step": 20 }, { "epoch": 0.28368794326241137, "grad_norm": 6.021182537078857, "learning_rate": 9.0625e-06, "loss": 0.8042, "step": 30 }, { "epoch": 0.37825059101654845, "grad_norm": 5.4127421379089355, "learning_rate": 9.985226282835216e-06, "loss": 0.8278, "step": 40 }, { "epoch": 0.4728132387706856, "grad_norm": 5.558228015899658, "learning_rate": 9.913075312749867e-06, "loss": 0.7046, "step": 50 }, { "epoch": 0.5673758865248227, "grad_norm": 5.16882848739624, "learning_rate": 9.781702165490638e-06, "loss": 0.6698, "step": 60 }, { "epoch": 0.6619385342789598, "grad_norm": 4.812554836273193, "learning_rate": 9.59269041210166e-06, "loss": 0.7291, "step": 70 }, { "epoch": 0.7565011820330969, "grad_norm": 4.815040111541748, "learning_rate": 9.348318399002347e-06, "loss": 0.641, "step": 80 }, { "epoch": 0.851063829787234, "grad_norm": 4.179919719696045, "learning_rate": 9.051531784814817e-06, "loss": 0.6477, "step": 90 }, { "epoch": 0.9456264775413712, "grad_norm": 5.225935935974121, "learning_rate": 8.705908033414426e-06, "loss": 0.5922, "step": 100 }, { "epoch": 1.037825059101655, "grad_norm": 3.746854066848755, "learning_rate": 8.315613291203977e-06, "loss": 0.5398, "step": 110 }, { "epoch": 1.132387706855792, "grad_norm": 4.180168628692627, "learning_rate": 7.885352168412677e-06, "loss": 0.3628, "step": 120 }, { "epoch": 1.226950354609929, "grad_norm": 4.26162052154541, "learning_rate": 7.420311029755688e-06, "loss": 0.3878, "step": 130 }, { "epoch": 1.3215130023640662, "grad_norm": 4.134862422943115, "learning_rate": 6.926095478028312e-06, "loss": 0.3484, "step": 140 }, { "epoch": 1.4160756501182032, "grad_norm": 3.880824565887451, "learning_rate": 6.408662784207149e-06, "loss": 0.3692, "step": 150 }, { "epoch": 1.5106382978723403, "grad_norm": 4.255764484405518, "learning_rate": 5.8742500785453226e-06, "loss": 0.3478, "step": 160 }, { "epoch": 1.6052009456264775, "grad_norm": 3.5713248252868652, "learning_rate": 5.3292991682458576e-06, "loss": 0.3675, "step": 170 }, { "epoch": 1.6997635933806148, "grad_norm": 3.488389015197754, "learning_rate": 4.7803788879604585e-06, "loss": 0.3758, "step": 180 }, { "epoch": 1.7943262411347518, "grad_norm": 3.949796676635742, "learning_rate": 4.234105919100261e-06, "loss": 0.3756, "step": 190 }, { "epoch": 1.8888888888888888, "grad_norm": 3.332965612411499, "learning_rate": 3.6970650324020784e-06, "loss": 0.3716, "step": 200 }, { "epoch": 1.8888888888888888, "eval_loss": 0.6453335285186768, "eval_runtime": 55.7022, "eval_samples_per_second": 5.906, "eval_steps_per_second": 0.844, "step": 200 }, { "epoch": 1.983451536643026, "grad_norm": 3.7968406677246094, "learning_rate": 3.1757297151456844e-06, "loss": 0.3388, "step": 210 }, { "epoch": 2.07565011820331, "grad_norm": 2.6286983489990234, "learning_rate": 2.6763841397811576e-06, "loss": 0.1871, "step": 220 }, { "epoch": 2.1702127659574466, "grad_norm": 437.03662109375, "learning_rate": 2.2050474145559326e-06, "loss": 0.1906, "step": 230 }, { "epoch": 2.264775413711584, "grad_norm": 2.9663329124450684, "learning_rate": 1.7674010292239746e-06, "loss": 0.1742, "step": 240 }, { "epoch": 2.359338061465721, "grad_norm": 2.8688528537750244, "learning_rate": 1.3687203704060343e-06, "loss": 0.143, "step": 250 }, { "epoch": 2.453900709219858, "grad_norm": 3.0793418884277344, "learning_rate": 1.013811132114384e-06, "loss": 0.1432, "step": 260 }, { "epoch": 2.548463356973995, "grad_norm": 2.9158718585968018, "learning_rate": 7.06951387949118e-07, "loss": 0.1317, "step": 270 }, { "epoch": 2.6430260047281324, "grad_norm": 2.4157230854034424, "learning_rate": 4.5184002322740784e-07, "loss": 0.1366, "step": 280 }, { "epoch": 2.7375886524822697, "grad_norm": 2.75584077835083, "learning_rate": 2.5155214864446556e-07, "loss": 0.1303, "step": 290 }, { "epoch": 2.8321513002364065, "grad_norm": 2.8111653327941895, "learning_rate": 1.0850203290965699e-07, "loss": 0.1506, "step": 300 }, { "epoch": 2.9267139479905437, "grad_norm": 2.849290609359741, "learning_rate": 2.441400116752146e-08, "loss": 0.136, "step": 310 }, { "epoch": 3.0, "step": 318, "total_flos": 55872632029184.0, "train_loss": 0.46639804007872093, "train_runtime": 7754.2521, "train_samples_per_second": 1.145, "train_steps_per_second": 0.041 } ], "logging_steps": 10, "max_steps": 318, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 55872632029184.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }