{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 4029, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02482005460412013, "grad_norm": 0.1369238644838333, "learning_rate": 6.666666666666667e-06, "loss": 2.4042, "step": 100 }, { "epoch": 0.04964010920824026, "grad_norm": 0.17341101169586182, "learning_rate": 1.3333333333333333e-05, "loss": 2.3581, "step": 200 }, { "epoch": 0.07446016381236038, "grad_norm": 0.30298689007759094, "learning_rate": 2e-05, "loss": 2.2979, "step": 300 }, { "epoch": 0.09928021841648052, "grad_norm": 0.4181392788887024, "learning_rate": 1.9964532702725803e-05, "loss": 2.2736, "step": 400 }, { "epoch": 0.12410027302060064, "grad_norm": 0.4833754301071167, "learning_rate": 1.9858382396738395e-05, "loss": 2.2352, "step": 500 }, { "epoch": 0.14892032762472077, "grad_norm": 0.5508949756622314, "learning_rate": 1.9682302054929414e-05, "loss": 2.1951, "step": 600 }, { "epoch": 0.17374038222884092, "grad_norm": 0.5856565833091736, "learning_rate": 1.943754069606428e-05, "loss": 2.1662, "step": 700 }, { "epoch": 0.19856043683296104, "grad_norm": 0.5611233115196228, "learning_rate": 1.9125834524918215e-05, "loss": 2.1815, "step": 800 }, { "epoch": 0.22338049143708116, "grad_norm": 0.6802138090133667, "learning_rate": 1.8749394616578068e-05, "loss": 2.1675, "step": 900 }, { "epoch": 0.2482005460412013, "grad_norm": 0.6513592004776001, "learning_rate": 1.8310891232270827e-05, "loss": 2.1402, "step": 1000 }, { "epoch": 0.2730206006453214, "grad_norm": 0.6889598369598389, "learning_rate": 1.781343487797389e-05, "loss": 2.1334, "step": 1100 }, { "epoch": 0.29784065524944153, "grad_norm": 0.7928256988525391, "learning_rate": 1.7260554240167017e-05, "loss": 2.1295, "step": 1200 }, { "epoch": 0.32266070985356166, "grad_norm": 0.7162489295005798, "learning_rate": 1.665617115523785e-05, "loss": 2.1232, "step": 1300 }, { "epoch": 0.34748076445768183, "grad_norm": 0.7136086225509644, "learning_rate": 1.6004572790094535e-05, "loss": 2.1148, "step": 1400 }, { "epoch": 0.37230081906180196, "grad_norm": 0.7688263654708862, "learning_rate": 1.531038123132105e-05, "loss": 2.0873, "step": 1500 }, { "epoch": 0.3971208736659221, "grad_norm": 0.772521436214447, "learning_rate": 1.4578520698593441e-05, "loss": 2.117, "step": 1600 }, { "epoch": 0.4219409282700422, "grad_norm": 1.010330080986023, "learning_rate": 1.3814182614927217e-05, "loss": 2.071, "step": 1700 }, { "epoch": 0.4467609828741623, "grad_norm": 0.6752054691314697, "learning_rate": 1.3022788781528653e-05, "loss": 2.0636, "step": 1800 }, { "epoch": 0.47158103747828245, "grad_norm": 0.841232180595398, "learning_rate": 1.220995291846777e-05, "loss": 2.0532, "step": 1900 }, { "epoch": 0.4964010920824026, "grad_norm": 0.7984778881072998, "learning_rate": 1.1381440843982634e-05, "loss": 2.0438, "step": 2000 }, { "epoch": 0.5212211466865228, "grad_norm": 0.8068585395812988, "learning_rate": 1.0543129574881446e-05, "loss": 2.0687, "step": 2100 }, { "epoch": 0.5460412012906428, "grad_norm": 0.8497598767280579, "learning_rate": 9.700965638162112e-06, "loss": 2.0477, "step": 2200 }, { "epoch": 0.570861255894763, "grad_norm": 0.7474705576896667, "learning_rate": 8.860922889564078e-06, "loss": 2.0429, "step": 2300 }, { "epoch": 0.5956813104988831, "grad_norm": 1.0781651735305786, "learning_rate": 8.028960138264857e-06, "loss": 2.0389, "step": 2400 }, { "epoch": 0.6205013651030032, "grad_norm": 0.8750322461128235, "learning_rate": 7.21097887830873e-06, "loss": 2.046, "step": 2500 }, { "epoch": 0.6453214197071233, "grad_norm": 0.9259145855903625, "learning_rate": 6.4127814265980095e-06, "loss": 2.0243, "step": 2600 }, { "epoch": 0.6701414743112435, "grad_norm": 1.1625196933746338, "learning_rate": 5.640029764393366e-06, "loss": 2.0513, "step": 2700 }, { "epoch": 0.6949615289153637, "grad_norm": 0.8271129727363586, "learning_rate": 4.8982053742793025e-06, "loss": 2.0228, "step": 2800 }, { "epoch": 0.7197815835194837, "grad_norm": 0.7196031212806702, "learning_rate": 4.1925703574897115e-06, "loss": 2.0496, "step": 2900 }, { "epoch": 0.7446016381236039, "grad_norm": 0.7880265712738037, "learning_rate": 3.528130107406099e-06, "loss": 2.0145, "step": 3000 }, { "epoch": 0.769421692727724, "grad_norm": 0.909106433391571, "learning_rate": 2.909597804002603e-06, "loss": 2.0437, "step": 3100 }, { "epoch": 0.7942417473318442, "grad_norm": 1.2606161832809448, "learning_rate": 2.341360981094921e-06, "loss": 2.0443, "step": 3200 }, { "epoch": 0.8190618019359642, "grad_norm": 0.795652449131012, "learning_rate": 1.8274504035470942e-06, "loss": 2.0568, "step": 3300 }, { "epoch": 0.8438818565400844, "grad_norm": 0.8904260993003845, "learning_rate": 1.3715114752043746e-06, "loss": 2.0787, "step": 3400 }, { "epoch": 0.8687019111442045, "grad_norm": 1.0925287008285522, "learning_rate": 9.767783803688414e-07, "loss": 2.045, "step": 3500 }, { "epoch": 0.8935219657483247, "grad_norm": 0.799608588218689, "learning_rate": 6.460511422441984e-07, "loss": 2.0167, "step": 3600 }, { "epoch": 0.9183420203524447, "grad_norm": 0.9094216227531433, "learning_rate": 3.8167576108468994e-07, "loss": 2.057, "step": 3700 }, { "epoch": 0.9431620749565649, "grad_norm": 0.8395094871520996, "learning_rate": 1.855275729374284e-07, "loss": 2.0425, "step": 3800 }, { "epoch": 0.9679821295606851, "grad_norm": 0.8606423735618591, "learning_rate": 5.89979470221802e-08, "loss": 2.0208, "step": 3900 }, { "epoch": 0.9928021841648051, "grad_norm": 0.8908767700195312, "learning_rate": 2.9844161102077218e-09, "loss": 2.0512, "step": 4000 }, { "epoch": 1.0, "step": 4029, "total_flos": 7.32108351012864e+16, "train_loss": 2.105005581936963, "train_runtime": 1251.4031, "train_samples_per_second": 6.438, "train_steps_per_second": 3.22 } ], "logging_steps": 100, "max_steps": 4029, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.32108351012864e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }