{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 16446,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.09120758847136082,
      "grad_norm": 2.8996551036834717,
      "learning_rate": 4.849507479022255e-05,
      "loss": 0.8414,
      "step": 500
    },
    {
      "epoch": 0.09120758847136082,
      "eval_loss": 0.8034613132476807,
      "eval_runtime": 67.9143,
      "eval_samples_per_second": 35.883,
      "eval_steps_per_second": 4.491,
      "step": 500
    },
    {
      "epoch": 0.18241517694272164,
      "grad_norm": 3.896031141281128,
      "learning_rate": 4.697494831569987e-05,
      "loss": 0.8357,
      "step": 1000
    },
    {
      "epoch": 0.18241517694272164,
      "eval_loss": 0.7636010050773621,
      "eval_runtime": 66.4597,
      "eval_samples_per_second": 36.669,
      "eval_steps_per_second": 4.589,
      "step": 1000
    },
    {
      "epoch": 0.2736227654140824,
      "grad_norm": 3.0594074726104736,
      "learning_rate": 4.5454821841177186e-05,
      "loss": 0.784,
      "step": 1500
    },
    {
      "epoch": 0.2736227654140824,
      "eval_loss": 0.7404520511627197,
      "eval_runtime": 66.4753,
      "eval_samples_per_second": 36.66,
      "eval_steps_per_second": 4.588,
      "step": 1500
    },
    {
      "epoch": 0.36483035388544327,
      "grad_norm": 2.4666194915771484,
      "learning_rate": 4.393469536665451e-05,
      "loss": 0.7527,
      "step": 2000
    },
    {
      "epoch": 0.36483035388544327,
      "eval_loss": 0.720551073551178,
      "eval_runtime": 66.4761,
      "eval_samples_per_second": 36.66,
      "eval_steps_per_second": 4.588,
      "step": 2000
    },
    {
      "epoch": 0.45603794235680406,
      "grad_norm": 3.1520893573760986,
      "learning_rate": 4.2414568892131825e-05,
      "loss": 0.7588,
      "step": 2500
    },
    {
      "epoch": 0.45603794235680406,
      "eval_loss": 0.7047598958015442,
      "eval_runtime": 66.6712,
      "eval_samples_per_second": 36.552,
      "eval_steps_per_second": 4.575,
      "step": 2500
    },
    {
      "epoch": 0.5472455308281649,
      "grad_norm": 2.88120174407959,
      "learning_rate": 4.0894442417609145e-05,
      "loss": 0.7203,
      "step": 3000
    },
    {
      "epoch": 0.5472455308281649,
      "eval_loss": 0.6884846091270447,
      "eval_runtime": 66.5954,
      "eval_samples_per_second": 36.594,
      "eval_steps_per_second": 4.58,
      "step": 3000
    },
    {
      "epoch": 0.6384531192995258,
      "grad_norm": 2.235811710357666,
      "learning_rate": 3.9374315943086464e-05,
      "loss": 0.7098,
      "step": 3500
    },
    {
      "epoch": 0.6384531192995258,
      "eval_loss": 0.6729713082313538,
      "eval_runtime": 66.5208,
      "eval_samples_per_second": 36.635,
      "eval_steps_per_second": 4.585,
      "step": 3500
    },
    {
      "epoch": 0.7296607077708865,
      "grad_norm": 4.2170891761779785,
      "learning_rate": 3.785418946856379e-05,
      "loss": 0.6946,
      "step": 4000
    },
    {
      "epoch": 0.7296607077708865,
      "eval_loss": 0.6604794263839722,
      "eval_runtime": 66.5342,
      "eval_samples_per_second": 36.628,
      "eval_steps_per_second": 4.584,
      "step": 4000
    },
    {
      "epoch": 0.8208682962422473,
      "grad_norm": 3.053722381591797,
      "learning_rate": 3.633406299404111e-05,
      "loss": 0.684,
      "step": 4500
    },
    {
      "epoch": 0.8208682962422473,
      "eval_loss": 0.646191418170929,
      "eval_runtime": 66.6938,
      "eval_samples_per_second": 36.54,
      "eval_steps_per_second": 4.573,
      "step": 4500
    },
    {
      "epoch": 0.9120758847136081,
      "grad_norm": 2.7756240367889404,
      "learning_rate": 3.481393651951842e-05,
      "loss": 0.6772,
      "step": 5000
    },
    {
      "epoch": 0.9120758847136081,
      "eval_loss": 0.6353339552879333,
      "eval_runtime": 66.5122,
      "eval_samples_per_second": 36.64,
      "eval_steps_per_second": 4.586,
      "step": 5000
    },
    {
      "epoch": 1.003283473184969,
      "grad_norm": 1.9400490522384644,
      "learning_rate": 3.329381004499574e-05,
      "loss": 0.6393,
      "step": 5500
    },
    {
      "epoch": 1.003283473184969,
      "eval_loss": 0.643139123916626,
      "eval_runtime": 66.5024,
      "eval_samples_per_second": 36.645,
      "eval_steps_per_second": 4.586,
      "step": 5500
    },
    {
      "epoch": 1.09449106165633,
      "grad_norm": 2.2215569019317627,
      "learning_rate": 3.177368357047307e-05,
      "loss": 0.4513,
      "step": 6000
    },
    {
      "epoch": 1.09449106165633,
      "eval_loss": 0.6397776007652283,
      "eval_runtime": 66.4386,
      "eval_samples_per_second": 36.68,
      "eval_steps_per_second": 4.591,
      "step": 6000
    },
    {
      "epoch": 1.1856986501276907,
      "grad_norm": 2.093801736831665,
      "learning_rate": 3.0253557095950385e-05,
      "loss": 0.4427,
      "step": 6500
    },
    {
      "epoch": 1.1856986501276907,
      "eval_loss": 0.636920154094696,
      "eval_runtime": 66.5515,
      "eval_samples_per_second": 36.618,
      "eval_steps_per_second": 4.583,
      "step": 6500
    },
    {
      "epoch": 1.2769062385990515,
      "grad_norm": 4.663333892822266,
      "learning_rate": 2.8733430621427704e-05,
      "loss": 0.4468,
      "step": 7000
    },
    {
      "epoch": 1.2769062385990515,
      "eval_loss": 0.6325265169143677,
      "eval_runtime": 66.5785,
      "eval_samples_per_second": 36.603,
      "eval_steps_per_second": 4.581,
      "step": 7000
    },
    {
      "epoch": 1.3681138270704123,
      "grad_norm": 3.2814857959747314,
      "learning_rate": 2.721330414690502e-05,
      "loss": 0.4555,
      "step": 7500
    },
    {
      "epoch": 1.3681138270704123,
      "eval_loss": 0.6226893067359924,
      "eval_runtime": 66.6251,
      "eval_samples_per_second": 36.578,
      "eval_steps_per_second": 4.578,
      "step": 7500
    },
    {
      "epoch": 1.459321415541773,
      "grad_norm": 1.9859124422073364,
      "learning_rate": 2.5693177672382347e-05,
      "loss": 0.4306,
      "step": 8000
    },
    {
      "epoch": 1.459321415541773,
      "eval_loss": 0.6232908368110657,
      "eval_runtime": 66.5211,
      "eval_samples_per_second": 36.635,
      "eval_steps_per_second": 4.585,
      "step": 8000
    },
    {
      "epoch": 1.5505290040131339,
      "grad_norm": 2.86844539642334,
      "learning_rate": 2.4173051197859663e-05,
      "loss": 0.4399,
      "step": 8500
    },
    {
      "epoch": 1.5505290040131339,
      "eval_loss": 0.6131536960601807,
      "eval_runtime": 66.4759,
      "eval_samples_per_second": 36.66,
      "eval_steps_per_second": 4.588,
      "step": 8500
    },
    {
      "epoch": 1.6417365924844947,
      "grad_norm": 2.0596117973327637,
      "learning_rate": 2.2652924723336982e-05,
      "loss": 0.4357,
      "step": 9000
    },
    {
      "epoch": 1.6417365924844947,
      "eval_loss": 0.6083381175994873,
      "eval_runtime": 66.5319,
      "eval_samples_per_second": 36.629,
      "eval_steps_per_second": 4.584,
      "step": 9000
    },
    {
      "epoch": 1.7329441809558555,
      "grad_norm": 2.2367780208587646,
      "learning_rate": 2.1132798248814302e-05,
      "loss": 0.4387,
      "step": 9500
    },
    {
      "epoch": 1.7329441809558555,
      "eval_loss": 0.6063674688339233,
      "eval_runtime": 66.5477,
      "eval_samples_per_second": 36.62,
      "eval_steps_per_second": 4.583,
      "step": 9500
    },
    {
      "epoch": 1.8241517694272162,
      "grad_norm": 2.397143602371216,
      "learning_rate": 1.961267177429162e-05,
      "loss": 0.4353,
      "step": 10000
    },
    {
      "epoch": 1.8241517694272162,
      "eval_loss": 0.5975276231765747,
      "eval_runtime": 66.5784,
      "eval_samples_per_second": 36.603,
      "eval_steps_per_second": 4.581,
      "step": 10000
    },
    {
      "epoch": 1.915359357898577,
      "grad_norm": 3.369065284729004,
      "learning_rate": 1.8092545299768944e-05,
      "loss": 0.4334,
      "step": 10500
    },
    {
      "epoch": 1.915359357898577,
      "eval_loss": 0.5899476408958435,
      "eval_runtime": 66.4613,
      "eval_samples_per_second": 36.668,
      "eval_steps_per_second": 4.589,
      "step": 10500
    },
    {
      "epoch": 2.006566946369938,
      "grad_norm": 2.555560827255249,
      "learning_rate": 1.657241882524626e-05,
      "loss": 0.4188,
      "step": 11000
    },
    {
      "epoch": 2.006566946369938,
      "eval_loss": 0.670684278011322,
      "eval_runtime": 66.4465,
      "eval_samples_per_second": 36.676,
      "eval_steps_per_second": 4.59,
      "step": 11000
    },
    {
      "epoch": 2.0977745348412986,
      "grad_norm": 3.1445837020874023,
      "learning_rate": 1.5052292350723582e-05,
      "loss": 0.2206,
      "step": 11500
    },
    {
      "epoch": 2.0977745348412986,
      "eval_loss": 0.7085195183753967,
      "eval_runtime": 66.4599,
      "eval_samples_per_second": 36.669,
      "eval_steps_per_second": 4.589,
      "step": 11500
    },
    {
      "epoch": 2.18898212331266,
      "grad_norm": 1.821514368057251,
      "learning_rate": 1.35321658762009e-05,
      "loss": 0.2125,
      "step": 12000
    },
    {
      "epoch": 2.18898212331266,
      "eval_loss": 0.7055649161338806,
      "eval_runtime": 66.4182,
      "eval_samples_per_second": 36.692,
      "eval_steps_per_second": 4.592,
      "step": 12000
    },
    {
      "epoch": 2.28018971178402,
      "grad_norm": 2.517010450363159,
      "learning_rate": 1.2015079654627266e-05,
      "loss": 0.2136,
      "step": 12500
    },
    {
      "epoch": 2.28018971178402,
      "eval_loss": 0.7125562429428101,
      "eval_runtime": 66.4822,
      "eval_samples_per_second": 36.656,
      "eval_steps_per_second": 4.588,
      "step": 12500
    },
    {
      "epoch": 2.3713973002553814,
      "grad_norm": 2.654905319213867,
      "learning_rate": 1.049799343305363e-05,
      "loss": 0.2186,
      "step": 13000
    },
    {
      "epoch": 2.3713973002553814,
      "eval_loss": 0.711520254611969,
      "eval_runtime": 66.4793,
      "eval_samples_per_second": 36.658,
      "eval_steps_per_second": 4.588,
      "step": 13000
    },
    {
      "epoch": 2.462604888726742,
      "grad_norm": 1.6538429260253906,
      "learning_rate": 8.97786695853095e-06,
      "loss": 0.2119,
      "step": 13500
    },
    {
      "epoch": 2.462604888726742,
      "eval_loss": 0.7095320820808411,
      "eval_runtime": 66.4626,
      "eval_samples_per_second": 36.667,
      "eval_steps_per_second": 4.589,
      "step": 13500
    },
    {
      "epoch": 2.553812477198103,
      "grad_norm": 2.4926905632019043,
      "learning_rate": 7.45774048400827e-06,
      "loss": 0.2093,
      "step": 14000
    },
    {
      "epoch": 2.553812477198103,
      "eval_loss": 0.7063737511634827,
      "eval_runtime": 66.4346,
      "eval_samples_per_second": 36.683,
      "eval_steps_per_second": 4.591,
      "step": 14000
    },
    {
      "epoch": 2.645020065669464,
      "grad_norm": 2.814349412918091,
      "learning_rate": 5.93761400948559e-06,
      "loss": 0.203,
      "step": 14500
    },
    {
      "epoch": 2.645020065669464,
      "eval_loss": 0.7055577635765076,
      "eval_runtime": 66.5157,
      "eval_samples_per_second": 36.638,
      "eval_steps_per_second": 4.585,
      "step": 14500
    },
    {
      "epoch": 2.7362276541408246,
      "grad_norm": 2.242487907409668,
      "learning_rate": 4.417487534962909e-06,
      "loss": 0.2019,
      "step": 15000
    },
    {
      "epoch": 2.7362276541408246,
      "eval_loss": 0.7038553357124329,
      "eval_runtime": 66.6171,
      "eval_samples_per_second": 36.582,
      "eval_steps_per_second": 4.578,
      "step": 15000
    },
    {
      "epoch": 2.8274352426121854,
      "grad_norm": 3.0727193355560303,
      "learning_rate": 2.8973610604402286e-06,
      "loss": 0.2,
      "step": 15500
    },
    {
      "epoch": 2.8274352426121854,
      "eval_loss": 0.7052037119865417,
      "eval_runtime": 66.5239,
      "eval_samples_per_second": 36.633,
      "eval_steps_per_second": 4.585,
      "step": 15500
    },
    {
      "epoch": 2.918642831083546,
      "grad_norm": 2.016516923904419,
      "learning_rate": 1.3772345859175483e-06,
      "loss": 0.2011,
      "step": 16000
    },
    {
      "epoch": 2.918642831083546,
      "eval_loss": 0.7026786804199219,
      "eval_runtime": 66.488,
      "eval_samples_per_second": 36.653,
      "eval_steps_per_second": 4.587,
      "step": 16000
    }
  ],
  "logging_steps": 500,
  "max_steps": 16446,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 8.901338457754829e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}