| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.994350282485876, |
| "eval_steps": 500, |
| "global_step": 795, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.03766478342749529, |
| "grad_norm": 7.01964876508617, |
| "learning_rate": 5e-06, |
| "loss": 1.0512, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.07532956685499058, |
| "grad_norm": 1.7064746932399855, |
| "learning_rate": 5e-06, |
| "loss": 0.9195, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.11299435028248588, |
| "grad_norm": 1.5997363297482388, |
| "learning_rate": 5e-06, |
| "loss": 0.8822, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.15065913370998116, |
| "grad_norm": 1.2080439221780226, |
| "learning_rate": 5e-06, |
| "loss": 0.8521, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.18832391713747645, |
| "grad_norm": 1.0679693197690212, |
| "learning_rate": 5e-06, |
| "loss": 0.8377, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.22598870056497175, |
| "grad_norm": 0.9608926103732748, |
| "learning_rate": 5e-06, |
| "loss": 0.8208, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.263653483992467, |
| "grad_norm": 1.1752578271193257, |
| "learning_rate": 5e-06, |
| "loss": 0.8101, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.3013182674199623, |
| "grad_norm": 0.7689747973770075, |
| "learning_rate": 5e-06, |
| "loss": 0.8014, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.3389830508474576, |
| "grad_norm": 0.8655895794881668, |
| "learning_rate": 5e-06, |
| "loss": 0.7916, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.3766478342749529, |
| "grad_norm": 0.7877262798626278, |
| "learning_rate": 5e-06, |
| "loss": 0.7837, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.4143126177024482, |
| "grad_norm": 0.670131754521518, |
| "learning_rate": 5e-06, |
| "loss": 0.7818, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.4519774011299435, |
| "grad_norm": 0.6897806811013272, |
| "learning_rate": 5e-06, |
| "loss": 0.7761, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.4896421845574388, |
| "grad_norm": 0.9612838426596706, |
| "learning_rate": 5e-06, |
| "loss": 0.7801, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.527306967984934, |
| "grad_norm": 0.6741872992998229, |
| "learning_rate": 5e-06, |
| "loss": 0.7709, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.5649717514124294, |
| "grad_norm": 0.7449128082443985, |
| "learning_rate": 5e-06, |
| "loss": 0.7697, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.6026365348399246, |
| "grad_norm": 0.702009783976525, |
| "learning_rate": 5e-06, |
| "loss": 0.7661, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.64030131826742, |
| "grad_norm": 0.9412048703347311, |
| "learning_rate": 5e-06, |
| "loss": 0.7701, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.6779661016949152, |
| "grad_norm": 0.8925674460202105, |
| "learning_rate": 5e-06, |
| "loss": 0.7709, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.7156308851224106, |
| "grad_norm": 0.6803231082977221, |
| "learning_rate": 5e-06, |
| "loss": 0.7655, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.7532956685499058, |
| "grad_norm": 0.63167541083718, |
| "learning_rate": 5e-06, |
| "loss": 0.7626, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.7909604519774012, |
| "grad_norm": 0.6839121568063468, |
| "learning_rate": 5e-06, |
| "loss": 0.7584, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.8286252354048964, |
| "grad_norm": 0.5919745674111999, |
| "learning_rate": 5e-06, |
| "loss": 0.7546, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.8662900188323918, |
| "grad_norm": 0.8606865903126677, |
| "learning_rate": 5e-06, |
| "loss": 0.7622, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.903954802259887, |
| "grad_norm": 0.8112347024571781, |
| "learning_rate": 5e-06, |
| "loss": 0.7542, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.9416195856873822, |
| "grad_norm": 0.7929648837913739, |
| "learning_rate": 5e-06, |
| "loss": 0.7598, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.9792843691148776, |
| "grad_norm": 0.6660909939006495, |
| "learning_rate": 5e-06, |
| "loss": 0.7563, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.9981167608286252, |
| "eval_loss": 0.7525370717048645, |
| "eval_runtime": 282.6778, |
| "eval_samples_per_second": 25.301, |
| "eval_steps_per_second": 0.396, |
| "step": 265 |
| }, |
| { |
| "epoch": 1.0169491525423728, |
| "grad_norm": 1.3153331677845188, |
| "learning_rate": 5e-06, |
| "loss": 0.775, |
| "step": 270 |
| }, |
| { |
| "epoch": 1.054613935969868, |
| "grad_norm": 0.8617206715995941, |
| "learning_rate": 5e-06, |
| "loss": 0.6997, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.0922787193973635, |
| "grad_norm": 0.6944168653287546, |
| "learning_rate": 5e-06, |
| "loss": 0.7005, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.1299435028248588, |
| "grad_norm": 0.9411772868385534, |
| "learning_rate": 5e-06, |
| "loss": 0.6973, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.167608286252354, |
| "grad_norm": 0.6694366573751462, |
| "learning_rate": 5e-06, |
| "loss": 0.7051, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.2052730696798493, |
| "grad_norm": 0.7156857674386213, |
| "learning_rate": 5e-06, |
| "loss": 0.7018, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.2429378531073447, |
| "grad_norm": 0.8805127872682743, |
| "learning_rate": 5e-06, |
| "loss": 0.7009, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.28060263653484, |
| "grad_norm": 0.7003145394069171, |
| "learning_rate": 5e-06, |
| "loss": 0.7003, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.3182674199623352, |
| "grad_norm": 0.9332351519153556, |
| "learning_rate": 5e-06, |
| "loss": 0.7039, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.3559322033898304, |
| "grad_norm": 0.5931688673479261, |
| "learning_rate": 5e-06, |
| "loss": 0.6953, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.3935969868173257, |
| "grad_norm": 0.7444491178664134, |
| "learning_rate": 5e-06, |
| "loss": 0.7021, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.4312617702448212, |
| "grad_norm": 0.6201120088074494, |
| "learning_rate": 5e-06, |
| "loss": 0.6989, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.4689265536723164, |
| "grad_norm": 0.6394360492158847, |
| "learning_rate": 5e-06, |
| "loss": 0.7006, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.5065913370998116, |
| "grad_norm": 0.5968659634612707, |
| "learning_rate": 5e-06, |
| "loss": 0.7003, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.544256120527307, |
| "grad_norm": 0.6066424887033086, |
| "learning_rate": 5e-06, |
| "loss": 0.7013, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.5819209039548023, |
| "grad_norm": 0.7667292931170824, |
| "learning_rate": 5e-06, |
| "loss": 0.7022, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.6195856873822976, |
| "grad_norm": 0.5682752376913638, |
| "learning_rate": 5e-06, |
| "loss": 0.6947, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.6572504708097928, |
| "grad_norm": 0.6764654533865712, |
| "learning_rate": 5e-06, |
| "loss": 0.7, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.694915254237288, |
| "grad_norm": 0.6800954995181525, |
| "learning_rate": 5e-06, |
| "loss": 0.7044, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.7325800376647833, |
| "grad_norm": 0.5796176798333441, |
| "learning_rate": 5e-06, |
| "loss": 0.6957, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.7702448210922788, |
| "grad_norm": 0.6427876261770084, |
| "learning_rate": 5e-06, |
| "loss": 0.6982, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.807909604519774, |
| "grad_norm": 0.7709175834054774, |
| "learning_rate": 5e-06, |
| "loss": 0.6927, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.8455743879472695, |
| "grad_norm": 0.62498067885703, |
| "learning_rate": 5e-06, |
| "loss": 0.6966, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.8832391713747647, |
| "grad_norm": 0.6802211302093598, |
| "learning_rate": 5e-06, |
| "loss": 0.7008, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.92090395480226, |
| "grad_norm": 0.7015346819134285, |
| "learning_rate": 5e-06, |
| "loss": 0.6926, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.9585687382297552, |
| "grad_norm": 0.6679611093712937, |
| "learning_rate": 5e-06, |
| "loss": 0.7, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.9962335216572504, |
| "grad_norm": 0.6337629504219587, |
| "learning_rate": 5e-06, |
| "loss": 0.6959, |
| "step": 530 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 0.7400202751159668, |
| "eval_runtime": 282.1309, |
| "eval_samples_per_second": 25.35, |
| "eval_steps_per_second": 0.397, |
| "step": 531 |
| }, |
| { |
| "epoch": 2.0338983050847457, |
| "grad_norm": 1.0318615681166752, |
| "learning_rate": 5e-06, |
| "loss": 0.6778, |
| "step": 540 |
| }, |
| { |
| "epoch": 2.071563088512241, |
| "grad_norm": 0.6334271963975697, |
| "learning_rate": 5e-06, |
| "loss": 0.6449, |
| "step": 550 |
| }, |
| { |
| "epoch": 2.109227871939736, |
| "grad_norm": 1.1074270168911127, |
| "learning_rate": 5e-06, |
| "loss": 0.6411, |
| "step": 560 |
| }, |
| { |
| "epoch": 2.146892655367232, |
| "grad_norm": 0.7758960956340408, |
| "learning_rate": 5e-06, |
| "loss": 0.642, |
| "step": 570 |
| }, |
| { |
| "epoch": 2.184557438794727, |
| "grad_norm": 0.6355849787561325, |
| "learning_rate": 5e-06, |
| "loss": 0.6437, |
| "step": 580 |
| }, |
| { |
| "epoch": 2.2222222222222223, |
| "grad_norm": 0.6846746599985979, |
| "learning_rate": 5e-06, |
| "loss": 0.6436, |
| "step": 590 |
| }, |
| { |
| "epoch": 2.2598870056497176, |
| "grad_norm": 0.6043112947114725, |
| "learning_rate": 5e-06, |
| "loss": 0.645, |
| "step": 600 |
| }, |
| { |
| "epoch": 2.297551789077213, |
| "grad_norm": 0.817414985795291, |
| "learning_rate": 5e-06, |
| "loss": 0.6457, |
| "step": 610 |
| }, |
| { |
| "epoch": 2.335216572504708, |
| "grad_norm": 0.698455857380687, |
| "learning_rate": 5e-06, |
| "loss": 0.6455, |
| "step": 620 |
| }, |
| { |
| "epoch": 2.3728813559322033, |
| "grad_norm": 0.6479771691734695, |
| "learning_rate": 5e-06, |
| "loss": 0.6462, |
| "step": 630 |
| }, |
| { |
| "epoch": 2.4105461393596985, |
| "grad_norm": 0.7093548332196533, |
| "learning_rate": 5e-06, |
| "loss": 0.6507, |
| "step": 640 |
| }, |
| { |
| "epoch": 2.4482109227871938, |
| "grad_norm": 0.8925613153879164, |
| "learning_rate": 5e-06, |
| "loss": 0.6466, |
| "step": 650 |
| }, |
| { |
| "epoch": 2.4858757062146895, |
| "grad_norm": 0.6757355564710857, |
| "learning_rate": 5e-06, |
| "loss": 0.6436, |
| "step": 660 |
| }, |
| { |
| "epoch": 2.5235404896421847, |
| "grad_norm": 0.6081496236845628, |
| "learning_rate": 5e-06, |
| "loss": 0.648, |
| "step": 670 |
| }, |
| { |
| "epoch": 2.56120527306968, |
| "grad_norm": 0.7447422506860626, |
| "learning_rate": 5e-06, |
| "loss": 0.6477, |
| "step": 680 |
| }, |
| { |
| "epoch": 2.598870056497175, |
| "grad_norm": 0.8278808479195525, |
| "learning_rate": 5e-06, |
| "loss": 0.6456, |
| "step": 690 |
| }, |
| { |
| "epoch": 2.6365348399246704, |
| "grad_norm": 0.8544558166675883, |
| "learning_rate": 5e-06, |
| "loss": 0.6459, |
| "step": 700 |
| }, |
| { |
| "epoch": 2.6741996233521657, |
| "grad_norm": 0.7429520936151375, |
| "learning_rate": 5e-06, |
| "loss": 0.6459, |
| "step": 710 |
| }, |
| { |
| "epoch": 2.711864406779661, |
| "grad_norm": 0.7127745829357879, |
| "learning_rate": 5e-06, |
| "loss": 0.6498, |
| "step": 720 |
| }, |
| { |
| "epoch": 2.7495291902071566, |
| "grad_norm": 0.6076001592702167, |
| "learning_rate": 5e-06, |
| "loss": 0.651, |
| "step": 730 |
| }, |
| { |
| "epoch": 2.7871939736346514, |
| "grad_norm": 0.5977493204127112, |
| "learning_rate": 5e-06, |
| "loss": 0.6507, |
| "step": 740 |
| }, |
| { |
| "epoch": 2.824858757062147, |
| "grad_norm": 0.6943542767826754, |
| "learning_rate": 5e-06, |
| "loss": 0.652, |
| "step": 750 |
| }, |
| { |
| "epoch": 2.8625235404896423, |
| "grad_norm": 0.7797150538117115, |
| "learning_rate": 5e-06, |
| "loss": 0.651, |
| "step": 760 |
| }, |
| { |
| "epoch": 2.9001883239171375, |
| "grad_norm": 0.5704333728088763, |
| "learning_rate": 5e-06, |
| "loss": 0.6464, |
| "step": 770 |
| }, |
| { |
| "epoch": 2.937853107344633, |
| "grad_norm": 0.5780251500512679, |
| "learning_rate": 5e-06, |
| "loss": 0.6448, |
| "step": 780 |
| }, |
| { |
| "epoch": 2.975517890772128, |
| "grad_norm": 0.6517902655313229, |
| "learning_rate": 5e-06, |
| "loss": 0.6504, |
| "step": 790 |
| }, |
| { |
| "epoch": 2.994350282485876, |
| "eval_loss": 0.741532027721405, |
| "eval_runtime": 280.9763, |
| "eval_samples_per_second": 25.454, |
| "eval_steps_per_second": 0.399, |
| "step": 795 |
| }, |
| { |
| "epoch": 2.994350282485876, |
| "step": 795, |
| "total_flos": 1331445230469120.0, |
| "train_loss": 0.7161390892364694, |
| "train_runtime": 46894.4924, |
| "train_samples_per_second": 8.693, |
| "train_steps_per_second": 0.017 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 795, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1331445230469120.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |