{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 16446, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.09120758847136082, "grad_norm": 2.8996551036834717, "learning_rate": 4.849507479022255e-05, "loss": 0.8414, "step": 500 }, { "epoch": 0.09120758847136082, "eval_loss": 0.8034613132476807, "eval_runtime": 67.9143, "eval_samples_per_second": 35.883, "eval_steps_per_second": 4.491, "step": 500 }, { "epoch": 0.18241517694272164, "grad_norm": 3.896031141281128, "learning_rate": 4.697494831569987e-05, "loss": 0.8357, "step": 1000 }, { "epoch": 0.18241517694272164, "eval_loss": 0.7636010050773621, "eval_runtime": 66.4597, "eval_samples_per_second": 36.669, "eval_steps_per_second": 4.589, "step": 1000 }, { "epoch": 0.2736227654140824, "grad_norm": 3.0594074726104736, "learning_rate": 4.5454821841177186e-05, "loss": 0.784, "step": 1500 }, { "epoch": 0.2736227654140824, "eval_loss": 0.7404520511627197, "eval_runtime": 66.4753, "eval_samples_per_second": 36.66, "eval_steps_per_second": 4.588, "step": 1500 }, { "epoch": 0.36483035388544327, "grad_norm": 2.4666194915771484, "learning_rate": 4.393469536665451e-05, "loss": 0.7527, "step": 2000 }, { "epoch": 0.36483035388544327, "eval_loss": 0.720551073551178, "eval_runtime": 66.4761, "eval_samples_per_second": 36.66, "eval_steps_per_second": 4.588, "step": 2000 }, { "epoch": 0.45603794235680406, "grad_norm": 3.1520893573760986, "learning_rate": 4.2414568892131825e-05, "loss": 0.7588, "step": 2500 }, { "epoch": 0.45603794235680406, "eval_loss": 0.7047598958015442, "eval_runtime": 66.6712, "eval_samples_per_second": 36.552, "eval_steps_per_second": 4.575, "step": 2500 }, { "epoch": 0.5472455308281649, "grad_norm": 2.88120174407959, "learning_rate": 4.0894442417609145e-05, "loss": 0.7203, "step": 3000 }, { "epoch": 0.5472455308281649, "eval_loss": 0.6884846091270447, "eval_runtime": 66.5954, "eval_samples_per_second": 36.594, "eval_steps_per_second": 4.58, "step": 3000 }, { "epoch": 0.6384531192995258, "grad_norm": 2.235811710357666, "learning_rate": 3.9374315943086464e-05, "loss": 0.7098, "step": 3500 }, { "epoch": 0.6384531192995258, "eval_loss": 0.6729713082313538, "eval_runtime": 66.5208, "eval_samples_per_second": 36.635, "eval_steps_per_second": 4.585, "step": 3500 }, { "epoch": 0.7296607077708865, "grad_norm": 4.2170891761779785, "learning_rate": 3.785418946856379e-05, "loss": 0.6946, "step": 4000 }, { "epoch": 0.7296607077708865, "eval_loss": 0.6604794263839722, "eval_runtime": 66.5342, "eval_samples_per_second": 36.628, "eval_steps_per_second": 4.584, "step": 4000 }, { "epoch": 0.8208682962422473, "grad_norm": 3.053722381591797, "learning_rate": 3.633406299404111e-05, "loss": 0.684, "step": 4500 }, { "epoch": 0.8208682962422473, "eval_loss": 0.646191418170929, "eval_runtime": 66.6938, "eval_samples_per_second": 36.54, "eval_steps_per_second": 4.573, "step": 4500 }, { "epoch": 0.9120758847136081, "grad_norm": 2.7756240367889404, "learning_rate": 3.481393651951842e-05, "loss": 0.6772, "step": 5000 }, { "epoch": 0.9120758847136081, "eval_loss": 0.6353339552879333, "eval_runtime": 66.5122, "eval_samples_per_second": 36.64, "eval_steps_per_second": 4.586, "step": 5000 }, { "epoch": 1.003283473184969, "grad_norm": 1.9400490522384644, "learning_rate": 3.329381004499574e-05, "loss": 0.6393, "step": 5500 }, { "epoch": 1.003283473184969, "eval_loss": 0.643139123916626, "eval_runtime": 66.5024, "eval_samples_per_second": 36.645, "eval_steps_per_second": 4.586, "step": 5500 }, { "epoch": 1.09449106165633, "grad_norm": 2.2215569019317627, "learning_rate": 3.177368357047307e-05, "loss": 0.4513, "step": 6000 }, { "epoch": 1.09449106165633, "eval_loss": 0.6397776007652283, "eval_runtime": 66.4386, "eval_samples_per_second": 36.68, "eval_steps_per_second": 4.591, "step": 6000 }, { "epoch": 1.1856986501276907, "grad_norm": 2.093801736831665, "learning_rate": 3.0253557095950385e-05, "loss": 0.4427, "step": 6500 }, { "epoch": 1.1856986501276907, "eval_loss": 0.636920154094696, "eval_runtime": 66.5515, "eval_samples_per_second": 36.618, "eval_steps_per_second": 4.583, "step": 6500 }, { "epoch": 1.2769062385990515, "grad_norm": 4.663333892822266, "learning_rate": 2.8733430621427704e-05, "loss": 0.4468, "step": 7000 }, { "epoch": 1.2769062385990515, "eval_loss": 0.6325265169143677, "eval_runtime": 66.5785, "eval_samples_per_second": 36.603, "eval_steps_per_second": 4.581, "step": 7000 }, { "epoch": 1.3681138270704123, "grad_norm": 3.2814857959747314, "learning_rate": 2.721330414690502e-05, "loss": 0.4555, "step": 7500 }, { "epoch": 1.3681138270704123, "eval_loss": 0.6226893067359924, "eval_runtime": 66.6251, "eval_samples_per_second": 36.578, "eval_steps_per_second": 4.578, "step": 7500 }, { "epoch": 1.459321415541773, "grad_norm": 1.9859124422073364, "learning_rate": 2.5693177672382347e-05, "loss": 0.4306, "step": 8000 }, { "epoch": 1.459321415541773, "eval_loss": 0.6232908368110657, "eval_runtime": 66.5211, "eval_samples_per_second": 36.635, "eval_steps_per_second": 4.585, "step": 8000 }, { "epoch": 1.5505290040131339, "grad_norm": 2.86844539642334, "learning_rate": 2.4173051197859663e-05, "loss": 0.4399, "step": 8500 }, { "epoch": 1.5505290040131339, "eval_loss": 0.6131536960601807, "eval_runtime": 66.4759, "eval_samples_per_second": 36.66, "eval_steps_per_second": 4.588, "step": 8500 }, { "epoch": 1.6417365924844947, "grad_norm": 2.0596117973327637, "learning_rate": 2.2652924723336982e-05, "loss": 0.4357, "step": 9000 }, { "epoch": 1.6417365924844947, "eval_loss": 0.6083381175994873, "eval_runtime": 66.5319, "eval_samples_per_second": 36.629, "eval_steps_per_second": 4.584, "step": 9000 }, { "epoch": 1.7329441809558555, "grad_norm": 2.2367780208587646, "learning_rate": 2.1132798248814302e-05, "loss": 0.4387, "step": 9500 }, { "epoch": 1.7329441809558555, "eval_loss": 0.6063674688339233, "eval_runtime": 66.5477, "eval_samples_per_second": 36.62, "eval_steps_per_second": 4.583, "step": 9500 }, { "epoch": 1.8241517694272162, "grad_norm": 2.397143602371216, "learning_rate": 1.961267177429162e-05, "loss": 0.4353, "step": 10000 }, { "epoch": 1.8241517694272162, "eval_loss": 0.5975276231765747, "eval_runtime": 66.5784, "eval_samples_per_second": 36.603, "eval_steps_per_second": 4.581, "step": 10000 }, { "epoch": 1.915359357898577, "grad_norm": 3.369065284729004, "learning_rate": 1.8092545299768944e-05, "loss": 0.4334, "step": 10500 }, { "epoch": 1.915359357898577, "eval_loss": 0.5899476408958435, "eval_runtime": 66.4613, "eval_samples_per_second": 36.668, "eval_steps_per_second": 4.589, "step": 10500 }, { "epoch": 2.006566946369938, "grad_norm": 2.555560827255249, "learning_rate": 1.657241882524626e-05, "loss": 0.4188, "step": 11000 }, { "epoch": 2.006566946369938, "eval_loss": 0.670684278011322, "eval_runtime": 66.4465, "eval_samples_per_second": 36.676, "eval_steps_per_second": 4.59, "step": 11000 }, { "epoch": 2.0977745348412986, "grad_norm": 3.1445837020874023, "learning_rate": 1.5052292350723582e-05, "loss": 0.2206, "step": 11500 }, { "epoch": 2.0977745348412986, "eval_loss": 0.7085195183753967, "eval_runtime": 66.4599, "eval_samples_per_second": 36.669, "eval_steps_per_second": 4.589, "step": 11500 }, { "epoch": 2.18898212331266, "grad_norm": 1.821514368057251, "learning_rate": 1.35321658762009e-05, "loss": 0.2125, "step": 12000 }, { "epoch": 2.18898212331266, "eval_loss": 0.7055649161338806, "eval_runtime": 66.4182, "eval_samples_per_second": 36.692, "eval_steps_per_second": 4.592, "step": 12000 }, { "epoch": 2.28018971178402, "grad_norm": 2.517010450363159, "learning_rate": 1.2015079654627266e-05, "loss": 0.2136, "step": 12500 }, { "epoch": 2.28018971178402, "eval_loss": 0.7125562429428101, "eval_runtime": 66.4822, "eval_samples_per_second": 36.656, "eval_steps_per_second": 4.588, "step": 12500 }, { "epoch": 2.3713973002553814, "grad_norm": 2.654905319213867, "learning_rate": 1.049799343305363e-05, "loss": 0.2186, "step": 13000 }, { "epoch": 2.3713973002553814, "eval_loss": 0.711520254611969, "eval_runtime": 66.4793, "eval_samples_per_second": 36.658, "eval_steps_per_second": 4.588, "step": 13000 }, { "epoch": 2.462604888726742, "grad_norm": 1.6538429260253906, "learning_rate": 8.97786695853095e-06, "loss": 0.2119, "step": 13500 }, { "epoch": 2.462604888726742, "eval_loss": 0.7095320820808411, "eval_runtime": 66.4626, "eval_samples_per_second": 36.667, "eval_steps_per_second": 4.589, "step": 13500 }, { "epoch": 2.553812477198103, "grad_norm": 2.4926905632019043, "learning_rate": 7.45774048400827e-06, "loss": 0.2093, "step": 14000 }, { "epoch": 2.553812477198103, "eval_loss": 0.7063737511634827, "eval_runtime": 66.4346, "eval_samples_per_second": 36.683, "eval_steps_per_second": 4.591, "step": 14000 }, { "epoch": 2.645020065669464, "grad_norm": 2.814349412918091, "learning_rate": 5.93761400948559e-06, "loss": 0.203, "step": 14500 }, { "epoch": 2.645020065669464, "eval_loss": 0.7055577635765076, "eval_runtime": 66.5157, "eval_samples_per_second": 36.638, "eval_steps_per_second": 4.585, "step": 14500 }, { "epoch": 2.7362276541408246, "grad_norm": 2.242487907409668, "learning_rate": 4.417487534962909e-06, "loss": 0.2019, "step": 15000 }, { "epoch": 2.7362276541408246, "eval_loss": 0.7038553357124329, "eval_runtime": 66.6171, "eval_samples_per_second": 36.582, "eval_steps_per_second": 4.578, "step": 15000 }, { "epoch": 2.8274352426121854, "grad_norm": 3.0727193355560303, "learning_rate": 2.8973610604402286e-06, "loss": 0.2, "step": 15500 }, { "epoch": 2.8274352426121854, "eval_loss": 0.7052037119865417, "eval_runtime": 66.5239, "eval_samples_per_second": 36.633, "eval_steps_per_second": 4.585, "step": 15500 }, { "epoch": 2.918642831083546, "grad_norm": 2.016516923904419, "learning_rate": 1.3772345859175483e-06, "loss": 0.2011, "step": 16000 }, { "epoch": 2.918642831083546, "eval_loss": 0.7026786804199219, "eval_runtime": 66.488, "eval_samples_per_second": 36.653, "eval_steps_per_second": 4.587, "step": 16000 } ], "logging_steps": 500, "max_steps": 16446, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.901338457754829e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }