{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 325, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015384615384615385, "grad_norm": 1.4592643976211548, "learning_rate": 1.4634146341463416e-06, "loss": 1.3666, "step": 5 }, { "epoch": 0.03076923076923077, "grad_norm": 0.9001633524894714, "learning_rate": 3.2926829268292685e-06, "loss": 1.353, "step": 10 }, { "epoch": 0.046153846153846156, "grad_norm": 0.7452582716941833, "learning_rate": 5.121951219512195e-06, "loss": 1.3232, "step": 15 }, { "epoch": 0.06153846153846154, "grad_norm": 0.6290757656097412, "learning_rate": 6.951219512195123e-06, "loss": 1.3691, "step": 20 }, { "epoch": 0.07692307692307693, "grad_norm": 0.4892721474170685, "learning_rate": 8.780487804878048e-06, "loss": 1.3552, "step": 25 }, { "epoch": 0.09230769230769231, "grad_norm": 0.5850591659545898, "learning_rate": 1.0609756097560975e-05, "loss": 1.3401, "step": 30 }, { "epoch": 0.1076923076923077, "grad_norm": 0.5620784163475037, "learning_rate": 1.2439024390243903e-05, "loss": 1.2889, "step": 35 }, { "epoch": 0.12307692307692308, "grad_norm": 0.5427800416946411, "learning_rate": 1.4268292682926829e-05, "loss": 1.2625, "step": 40 }, { "epoch": 0.13846153846153847, "grad_norm": 0.5011927485466003, "learning_rate": 1.6097560975609757e-05, "loss": 1.2391, "step": 45 }, { "epoch": 0.15384615384615385, "grad_norm": 0.5249335169792175, "learning_rate": 1.7926829268292684e-05, "loss": 1.2474, "step": 50 }, { "epoch": 0.16923076923076924, "grad_norm": 0.50667804479599, "learning_rate": 1.975609756097561e-05, "loss": 1.2475, "step": 55 }, { "epoch": 0.18461538461538463, "grad_norm": 0.48870664834976196, "learning_rate": 2.1585365853658537e-05, "loss": 1.2505, "step": 60 }, { "epoch": 0.2, "grad_norm": 0.44815170764923096, "learning_rate": 2.3414634146341466e-05, "loss": 1.2401, "step": 65 }, { "epoch": 0.2153846153846154, "grad_norm": 0.4926839768886566, "learning_rate": 2.524390243902439e-05, "loss": 1.2612, "step": 70 }, { "epoch": 0.23076923076923078, "grad_norm": 0.6198599934577942, "learning_rate": 2.707317073170732e-05, "loss": 1.2474, "step": 75 }, { "epoch": 0.24615384615384617, "grad_norm": 0.49995529651641846, "learning_rate": 2.8902439024390242e-05, "loss": 1.2197, "step": 80 }, { "epoch": 0.26153846153846155, "grad_norm": 0.5290886163711548, "learning_rate": 2.9999875637756577e-05, "loss": 1.1353, "step": 85 }, { "epoch": 0.27692307692307694, "grad_norm": 0.5151082873344421, "learning_rate": 2.9998476586200195e-05, "loss": 1.1778, "step": 90 }, { "epoch": 0.2923076923076923, "grad_norm": 0.6584219932556152, "learning_rate": 2.9995523175756406e-05, "loss": 1.1296, "step": 95 }, { "epoch": 0.3076923076923077, "grad_norm": 0.5882734060287476, "learning_rate": 2.9991015712500275e-05, "loss": 1.1377, "step": 100 }, { "epoch": 0.3230769230769231, "grad_norm": 0.5423492193222046, "learning_rate": 2.9984954663560287e-05, "loss": 1.1778, "step": 105 }, { "epoch": 0.3384615384615385, "grad_norm": 0.7454944849014282, "learning_rate": 2.9977340657069916e-05, "loss": 1.0789, "step": 110 }, { "epoch": 0.35384615384615387, "grad_norm": 0.5375627875328064, "learning_rate": 2.9968174482102552e-05, "loss": 1.1178, "step": 115 }, { "epoch": 0.36923076923076925, "grad_norm": 0.7350935339927673, "learning_rate": 2.9957457088589697e-05, "loss": 1.0682, "step": 120 }, { "epoch": 0.38461538461538464, "grad_norm": 0.624808669090271, "learning_rate": 2.994518958722255e-05, "loss": 1.0331, "step": 125 }, { "epoch": 0.4, "grad_norm": 0.679461658000946, "learning_rate": 2.993137324933688e-05, "loss": 1.1324, "step": 130 }, { "epoch": 0.4153846153846154, "grad_norm": 0.6046873331069946, "learning_rate": 2.9916009506781284e-05, "loss": 1.0288, "step": 135 }, { "epoch": 0.4307692307692308, "grad_norm": 0.7874692678451538, "learning_rate": 2.9899099951768775e-05, "loss": 1.118, "step": 140 }, { "epoch": 0.4461538461538462, "grad_norm": 0.6846326589584351, "learning_rate": 2.988064633671181e-05, "loss": 1.109, "step": 145 }, { "epoch": 0.46153846153846156, "grad_norm": 0.6826982498168945, "learning_rate": 2.9860650574040666e-05, "loss": 1.0325, "step": 150 }, { "epoch": 0.47692307692307695, "grad_norm": 0.9006183743476868, "learning_rate": 2.9839114736005216e-05, "loss": 1.0584, "step": 155 }, { "epoch": 0.49230769230769234, "grad_norm": 0.8280394077301025, "learning_rate": 2.981604105446022e-05, "loss": 0.9616, "step": 160 }, { "epoch": 0.5076923076923077, "grad_norm": 0.7612792253494263, "learning_rate": 2.979143192063399e-05, "loss": 1.0125, "step": 165 }, { "epoch": 0.5230769230769231, "grad_norm": 0.8651503324508667, "learning_rate": 2.976528988488061e-05, "loss": 0.9571, "step": 170 }, { "epoch": 0.5384615384615384, "grad_norm": 0.8503299951553345, "learning_rate": 2.97376176564156e-05, "loss": 0.9423, "step": 175 }, { "epoch": 0.5538461538461539, "grad_norm": 0.7497279047966003, "learning_rate": 2.9708418103035166e-05, "loss": 1.014, "step": 180 }, { "epoch": 0.5692307692307692, "grad_norm": 0.881376326084137, "learning_rate": 2.9677694250818998e-05, "loss": 0.9457, "step": 185 }, { "epoch": 0.5846153846153846, "grad_norm": 0.8077456951141357, "learning_rate": 2.9645449283816644e-05, "loss": 0.9003, "step": 190 }, { "epoch": 0.6, "grad_norm": 0.8906092047691345, "learning_rate": 2.9611686543717565e-05, "loss": 0.944, "step": 195 }, { "epoch": 0.6153846153846154, "grad_norm": 0.8934299349784851, "learning_rate": 2.95764095295048e-05, "loss": 0.9277, "step": 200 }, { "epoch": 0.6307692307692307, "grad_norm": 0.9526508450508118, "learning_rate": 2.9539621897092342e-05, "loss": 0.8852, "step": 205 }, { "epoch": 0.6461538461538462, "grad_norm": 0.8791632056236267, "learning_rate": 2.950132745894629e-05, "loss": 0.9177, "step": 210 }, { "epoch": 0.6615384615384615, "grad_norm": 0.9002313613891602, "learning_rate": 2.946153018368971e-05, "loss": 0.8687, "step": 215 }, { "epoch": 0.676923076923077, "grad_norm": 0.8795701861381531, "learning_rate": 2.9420234195691383e-05, "loss": 0.8655, "step": 220 }, { "epoch": 0.6923076923076923, "grad_norm": 0.8691343069076538, "learning_rate": 2.9377443774638358e-05, "loss": 0.8868, "step": 225 }, { "epoch": 0.7076923076923077, "grad_norm": 0.9766488075256348, "learning_rate": 2.933316335509242e-05, "loss": 0.8838, "step": 230 }, { "epoch": 0.7230769230769231, "grad_norm": 0.9698830842971802, "learning_rate": 2.928739752603055e-05, "loss": 0.8904, "step": 235 }, { "epoch": 0.7384615384615385, "grad_norm": 0.9286680221557617, "learning_rate": 2.9240151030369314e-05, "loss": 0.8388, "step": 240 }, { "epoch": 0.7538461538461538, "grad_norm": 0.8796747326850891, "learning_rate": 2.919142876447335e-05, "loss": 0.8116, "step": 245 }, { "epoch": 0.7692307692307693, "grad_norm": 1.0402207374572754, "learning_rate": 2.914123577764795e-05, "loss": 0.8384, "step": 250 }, { "epoch": 0.7846153846153846, "grad_norm": 1.0921474695205688, "learning_rate": 2.9089577271615735e-05, "loss": 0.8572, "step": 255 }, { "epoch": 0.8, "grad_norm": 1.0024511814117432, "learning_rate": 2.9036458599977625e-05, "loss": 0.7863, "step": 260 }, { "epoch": 0.8153846153846154, "grad_norm": 0.9102811217308044, "learning_rate": 2.8981885267658e-05, "loss": 0.8251, "step": 265 }, { "epoch": 0.8307692307692308, "grad_norm": 1.1564185619354248, "learning_rate": 2.892586293033419e-05, "loss": 0.7996, "step": 270 }, { "epoch": 0.8461538461538461, "grad_norm": 0.9751123785972595, "learning_rate": 2.886839739385037e-05, "loss": 0.792, "step": 275 }, { "epoch": 0.8615384615384616, "grad_norm": 0.9479052424430847, "learning_rate": 2.880949461361587e-05, "loss": 0.7419, "step": 280 }, { "epoch": 0.8769230769230769, "grad_norm": 1.006443738937378, "learning_rate": 2.874916069398798e-05, "loss": 0.8018, "step": 285 }, { "epoch": 0.8923076923076924, "grad_norm": 0.9871719479560852, "learning_rate": 2.8687401887639343e-05, "loss": 0.7702, "step": 290 }, { "epoch": 0.9076923076923077, "grad_norm": 1.0391933917999268, "learning_rate": 2.8624224594909953e-05, "loss": 0.7881, "step": 295 }, { "epoch": 0.9230769230769231, "grad_norm": 0.9722039699554443, "learning_rate": 2.8559635363143857e-05, "loss": 0.764, "step": 300 }, { "epoch": 0.9384615384615385, "grad_norm": 1.0607675313949585, "learning_rate": 2.849364088601063e-05, "loss": 0.7665, "step": 305 }, { "epoch": 0.9538461538461539, "grad_norm": 1.122786045074463, "learning_rate": 2.8426248002811686e-05, "loss": 0.7217, "step": 310 }, { "epoch": 0.9692307692307692, "grad_norm": 1.0698699951171875, "learning_rate": 2.8357463697771474e-05, "loss": 0.7319, "step": 315 }, { "epoch": 0.9846153846153847, "grad_norm": 1.0934139490127563, "learning_rate": 2.8287295099313694e-05, "loss": 0.7172, "step": 320 }, { "epoch": 1.0, "grad_norm": 1.0971643924713135, "learning_rate": 2.8215749479322523e-05, "loss": 0.6796, "step": 325 } ], "logging_steps": 5, "max_steps": 1625, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.566837909793014e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }