{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.10101010101010101, "grad_norm": 5.43838668916967, "learning_rate": 6.4000000000000006e-06, "loss": 0.768, "loss_nan_ranks": 0, "loss_rank_avg": 0.20029328763484955, "step": 5, "valid_targets_mean": 3894.4, "valid_targets_min": 1551 }, { "epoch": 0.20202020202020202, "grad_norm": 1.8173704060845222, "learning_rate": 1.4400000000000001e-05, "loss": 0.6809, "loss_nan_ranks": 0, "loss_rank_avg": 0.12393468618392944, "step": 10, "valid_targets_mean": 3313.9, "valid_targets_min": 658 }, { "epoch": 0.30303030303030304, "grad_norm": 0.8994415488525478, "learning_rate": 2.2400000000000002e-05, "loss": 0.6367, "loss_nan_ranks": 0, "loss_rank_avg": 0.15267133712768555, "step": 15, "valid_targets_mean": 3214.2, "valid_targets_min": 748 }, { "epoch": 0.40404040404040403, "grad_norm": 0.5231491285791463, "learning_rate": 3.0400000000000004e-05, "loss": 0.603, "loss_nan_ranks": 0, "loss_rank_avg": 0.1451815664768219, "step": 20, "valid_targets_mean": 4062.5, "valid_targets_min": 752 }, { "epoch": 0.5050505050505051, "grad_norm": 0.48610136532802667, "learning_rate": 3.8400000000000005e-05, "loss": 0.5739, "loss_nan_ranks": 0, "loss_rank_avg": 0.15977895259857178, "step": 25, "valid_targets_mean": 4920.6, "valid_targets_min": 689 }, { "epoch": 0.6060606060606061, "grad_norm": 0.3657562059313739, "learning_rate": 3.9968815283639625e-05, "loss": 0.531, "loss_nan_ranks": 0, "loss_rank_avg": 0.12183522433042526, "step": 30, "valid_targets_mean": 4170.1, "valid_targets_min": 848 }, { "epoch": 0.7070707070707071, "grad_norm": 0.3117441303599555, "learning_rate": 3.9842294026289565e-05, "loss": 0.5137, "loss_nan_ranks": 0, "loss_rank_avg": 0.11094912886619568, "step": 35, "valid_targets_mean": 4246.0, "valid_targets_min": 626 }, { "epoch": 0.8080808080808081, "grad_norm": 0.31427213112004465, "learning_rate": 3.9619103106983835e-05, "loss": 0.5033, "loss_nan_ranks": 0, "loss_rank_avg": 0.11524395644664764, "step": 40, "valid_targets_mean": 3335.3, "valid_targets_min": 589 }, { "epoch": 0.9090909090909091, "grad_norm": 0.27843711420334, "learning_rate": 3.930032988944623e-05, "loss": 0.4921, "loss_nan_ranks": 0, "loss_rank_avg": 0.12719261646270752, "step": 45, "valid_targets_mean": 4673.1, "valid_targets_min": 842 }, { "epoch": 1.0, "grad_norm": 0.35336756908591643, "learning_rate": 3.888752740474962e-05, "loss": 0.4725, "loss_nan_ranks": 0, "loss_rank_avg": 0.23522385954856873, "step": 50, "valid_targets_mean": 4525.1, "valid_targets_min": 618 }, { "epoch": 1.101010101010101, "grad_norm": 0.2618047514835116, "learning_rate": 3.838270678510469e-05, "loss": 0.4631, "loss_nan_ranks": 0, "loss_rank_avg": 0.0997854694724083, "step": 55, "valid_targets_mean": 3254.4, "valid_targets_min": 620 }, { "epoch": 1.202020202020202, "grad_norm": 0.2596681731982267, "learning_rate": 3.778832746582596e-05, "loss": 0.4695, "loss_nan_ranks": 0, "loss_rank_avg": 0.15060186386108398, "step": 60, "valid_targets_mean": 5336.2, "valid_targets_min": 994 }, { "epoch": 1.303030303030303, "grad_norm": 0.26134191299357834, "learning_rate": 3.710728520321014e-05, "loss": 0.4622, "loss_nan_ranks": 0, "loss_rank_avg": 0.10998662561178207, "step": 65, "valid_targets_mean": 3828.9, "valid_targets_min": 813 }, { "epoch": 1.404040404040404, "grad_norm": 0.2505910129541225, "learning_rate": 3.634289796670257e-05, "loss": 0.4639, "loss_nan_ranks": 0, "loss_rank_avg": 0.11412525177001953, "step": 70, "valid_targets_mean": 4118.6, "valid_targets_min": 589 }, { "epoch": 1.5050505050505052, "grad_norm": 0.28469306377718256, "learning_rate": 3.549888977408359e-05, "loss": 0.4547, "loss_nan_ranks": 0, "loss_rank_avg": 0.10224676132202148, "step": 75, "valid_targets_mean": 3621.8, "valid_targets_min": 807 }, { "epoch": 1.606060606060606, "grad_norm": 0.26506094775038846, "learning_rate": 3.457937254842823e-05, "loss": 0.4478, "loss_nan_ranks": 0, "loss_rank_avg": 0.1216074526309967, "step": 80, "valid_targets_mean": 4121.2, "valid_targets_min": 900 }, { "epoch": 1.7070707070707072, "grad_norm": 0.27422041538604486, "learning_rate": 3.3588826085230336e-05, "loss": 0.4405, "loss_nan_ranks": 0, "loss_rank_avg": 0.09807954728603363, "step": 85, "valid_targets_mean": 3189.9, "valid_targets_min": 777 }, { "epoch": 1.808080808080808, "grad_norm": 0.25008604850493343, "learning_rate": 3.253207622728921e-05, "loss": 0.441, "loss_nan_ranks": 0, "loss_rank_avg": 0.11396118998527527, "step": 90, "valid_targets_mean": 4577.4, "valid_targets_min": 875 }, { "epoch": 1.9090909090909092, "grad_norm": 0.26410650818418535, "learning_rate": 3.141427135368864e-05, "loss": 0.4447, "loss_nan_ranks": 0, "loss_rank_avg": 0.10761921852827072, "step": 95, "valid_targets_mean": 3803.5, "valid_targets_min": 613 }, { "epoch": 2.0, "grad_norm": 0.37011104936880673, "learning_rate": 3.024085729741143e-05, "loss": 0.4481, "loss_nan_ranks": 0, "loss_rank_avg": 0.254130095243454, "step": 100, "valid_targets_mean": 4326.2, "valid_targets_min": 550 }, { "epoch": 2.101010101010101, "grad_norm": 0.2633444829921788, "learning_rate": 2.9017550813788616e-05, "loss": 0.4286, "loss_nan_ranks": 0, "loss_rank_avg": 0.10730044543743134, "step": 105, "valid_targets_mean": 4681.4, "valid_targets_min": 808 }, { "epoch": 2.202020202020202, "grad_norm": 0.27947047290967925, "learning_rate": 2.7750311729042062e-05, "loss": 0.4249, "loss_nan_ranks": 0, "loss_rank_avg": 0.11274366080760956, "step": 110, "valid_targets_mean": 4363.5, "valid_targets_min": 721 }, { "epoch": 2.303030303030303, "grad_norm": 0.26878537952096837, "learning_rate": 2.6445313904610227e-05, "loss": 0.4319, "loss_nan_ranks": 0, "loss_rank_avg": 0.10825060307979584, "step": 115, "valid_targets_mean": 3570.9, "valid_targets_min": 740 }, { "epoch": 2.404040404040404, "grad_norm": 0.27801838465637635, "learning_rate": 2.510891515871581e-05, "loss": 0.4335, "loss_nan_ranks": 0, "loss_rank_avg": 0.09939642250537872, "step": 120, "valid_targets_mean": 3389.2, "valid_targets_min": 712 }, { "epoch": 2.505050505050505, "grad_norm": 0.26781305081846457, "learning_rate": 2.37476262917145e-05, "loss": 0.4247, "loss_nan_ranks": 0, "loss_rank_avg": 0.109991654753685, "step": 125, "valid_targets_mean": 4646.0, "valid_targets_min": 748 }, { "epoch": 2.606060606060606, "grad_norm": 0.26237181084548494, "learning_rate": 2.2368079366130028e-05, "loss": 0.4187, "loss_nan_ranks": 0, "loss_rank_avg": 0.11524119973182678, "step": 130, "valid_targets_mean": 4932.1, "valid_targets_min": 845 }, { "epoch": 2.707070707070707, "grad_norm": 0.2585522405239844, "learning_rate": 2.097699539591227e-05, "loss": 0.4274, "loss_nan_ranks": 0, "loss_rank_avg": 0.08941829204559326, "step": 135, "valid_targets_mean": 3169.6, "valid_targets_min": 832 }, { "epoch": 2.808080808080808, "grad_norm": 0.26016350721342923, "learning_rate": 1.9581151602332865e-05, "loss": 0.4177, "loss_nan_ranks": 0, "loss_rank_avg": 0.1133737787604332, "step": 140, "valid_targets_mean": 5011.3, "valid_targets_min": 1002 }, { "epoch": 2.909090909090909, "grad_norm": 0.27030558024007334, "learning_rate": 1.8187348396044402e-05, "loss": 0.4231, "loss_nan_ranks": 0, "loss_rank_avg": 0.10732856392860413, "step": 145, "valid_targets_mean": 4244.3, "valid_targets_min": 849 }, { "epoch": 3.0, "grad_norm": 0.3695239483214702, "learning_rate": 1.6802376246163307e-05, "loss": 0.4307, "loss_nan_ranks": 0, "loss_rank_avg": 0.2080117017030716, "step": 150, "valid_targets_mean": 3466.7, "valid_targets_min": 600 }, { "epoch": 3.101010101010101, "grad_norm": 0.2516175673157399, "learning_rate": 1.5432982597786886e-05, "loss": 0.4228, "loss_nan_ranks": 0, "loss_rank_avg": 0.11081644892692566, "step": 155, "valid_targets_mean": 4376.6, "valid_targets_min": 1393 }, { "epoch": 3.202020202020202, "grad_norm": 0.25899041127457506, "learning_rate": 1.4085838999119075e-05, "loss": 0.4183, "loss_nan_ranks": 0, "loss_rank_avg": 0.10279497504234314, "step": 160, "valid_targets_mean": 3922.6, "valid_targets_min": 1037 }, { "epoch": 3.303030303030303, "grad_norm": 0.2568302854985244, "learning_rate": 1.2767508598358158e-05, "loss": 0.4119, "loss_nan_ranks": 0, "loss_rank_avg": 0.10400432348251343, "step": 165, "valid_targets_mean": 4147.3, "valid_targets_min": 840 }, { "epoch": 3.404040404040404, "grad_norm": 0.2575745984749214, "learning_rate": 1.1484414168698547e-05, "loss": 0.4109, "loss_nan_ranks": 0, "loss_rank_avg": 0.1014273464679718, "step": 170, "valid_targets_mean": 3870.5, "valid_targets_min": 738 }, { "epoch": 3.505050505050505, "grad_norm": 0.2708052183398422, "learning_rate": 1.0242806817225344e-05, "loss": 0.4183, "loss_nan_ranks": 0, "loss_rank_avg": 0.0942554920911789, "step": 175, "valid_targets_mean": 4129.5, "valid_targets_min": 691 }, { "epoch": 3.606060606060606, "grad_norm": 0.39589597035376406, "learning_rate": 9.048735530148998e-06, "loss": 0.4057, "loss_nan_ranks": 0, "loss_rank_avg": 0.10414694994688034, "step": 180, "valid_targets_mean": 3647.8, "valid_targets_min": 1355 }, { "epoch": 3.707070707070707, "grad_norm": 0.2754352204699097, "learning_rate": 7.908017702752504e-06, "loss": 0.4167, "loss_nan_ranks": 0, "loss_rank_avg": 0.11268728971481323, "step": 185, "valid_targets_mean": 4025.8, "valid_targets_min": 631 }, { "epoch": 3.808080808080808, "grad_norm": 0.26307739642578865, "learning_rate": 6.826210797626389e-06, "loss": 0.4158, "loss_nan_ranks": 0, "loss_rank_avg": 0.08769197762012482, "step": 190, "valid_targets_mean": 3541.8, "valid_targets_min": 1132 }, { "epoch": 3.909090909090909, "grad_norm": 0.25965429680059315, "learning_rate": 5.8085852692695864e-06, "loss": 0.4088, "loss_nan_ranks": 0, "loss_rank_avg": 0.10568307340145111, "step": 195, "valid_targets_mean": 4072.8, "valid_targets_min": 658 }, { "epoch": 4.0, "grad_norm": 0.37112181378749864, "learning_rate": 4.8600988869648745e-06, "loss": 0.4077, "loss_nan_ranks": 0, "loss_rank_avg": 0.18167036771774292, "step": 200, "valid_targets_mean": 3388.4, "valid_targets_min": 757 }, { "epoch": 4.101010101010101, "grad_norm": 0.24190724490489265, "learning_rate": 3.985372581025333e-06, "loss": 0.3992, "loss_nan_ranks": 0, "loss_rank_avg": 0.09219323098659515, "step": 205, "valid_targets_mean": 3395.9, "valid_targets_min": 1280 }, { "epoch": 4.202020202020202, "grad_norm": 0.26150260454768026, "learning_rate": 3.1886679300863156e-06, "loss": 0.407, "loss_nan_ranks": 0, "loss_rank_avg": 0.09850673377513885, "step": 210, "valid_targets_mean": 3216.6, "valid_targets_min": 784 }, { "epoch": 4.303030303030303, "grad_norm": 0.26872823184659334, "learning_rate": 2.473866399122733e-06, "loss": 0.4133, "loss_nan_ranks": 0, "loss_rank_avg": 0.10747270286083221, "step": 215, "valid_targets_mean": 3487.2, "valid_targets_min": 773 }, { "epoch": 4.404040404040404, "grad_norm": 0.2549088626047512, "learning_rate": 1.8444504293418286e-06, "loss": 0.4124, "loss_nan_ranks": 0, "loss_rank_avg": 0.10171965509653091, "step": 220, "valid_targets_mean": 3727.9, "valid_targets_min": 928 }, { "epoch": 4.505050505050505, "grad_norm": 0.32301587125223463, "learning_rate": 1.3034864720797112e-06, "loss": 0.4125, "loss_nan_ranks": 0, "loss_rank_avg": 0.11444351077079773, "step": 225, "valid_targets_mean": 3867.6, "valid_targets_min": 791 }, { "epoch": 4.606060606060606, "grad_norm": 0.24811664458418323, "learning_rate": 8.536100493586552e-07, "loss": 0.4023, "loss_nan_ranks": 0, "loss_rank_avg": 0.0989425927400589, "step": 230, "valid_targets_mean": 3649.8, "valid_targets_min": 675 }, { "epoch": 4.707070707070707, "grad_norm": 0.24999205236156857, "learning_rate": 4.970129138887347e-07, "loss": 0.4078, "loss_nan_ranks": 0, "loss_rank_avg": 0.09448540210723877, "step": 235, "valid_targets_mean": 3779.3, "valid_targets_min": 613 }, { "epoch": 4.808080808080808, "grad_norm": 0.2533525398749002, "learning_rate": 2.3543237106894434e-07, "loss": 0.4138, "loss_nan_ranks": 0, "loss_rank_avg": 0.10717539489269257, "step": 240, "valid_targets_mean": 3630.1, "valid_targets_min": 1185 }, { "epoch": 4.909090909090909, "grad_norm": 0.2324383918083103, "learning_rate": 7.01428150099126e-08, "loss": 0.4099, "loss_nan_ranks": 0, "loss_rank_avg": 0.09807487577199936, "step": 245, "valid_targets_mean": 4431.1, "valid_targets_min": 1588 }, { "epoch": 5.0, "grad_norm": 0.3564637724019629, "learning_rate": 1.949519813915224e-09, "loss": 0.4103, "loss_nan_ranks": 0, "loss_rank_avg": 0.20247647166252136, "step": 250, "valid_targets_mean": 3850.3, "valid_targets_min": 771 }, { "epoch": 5.0, "loss_nan_ranks": 0, "loss_rank_avg": 0.20247647166252136, "step": 250, "total_flos": 4.268418036190413e+17, "train_loss": 0.4559480676651001, "train_runtime": 5383.7809, "train_samples_per_second": 2.935, "train_steps_per_second": 0.046, "valid_targets_mean": 3850.3, "valid_targets_min": 771 } ], "logging_steps": 5, "max_steps": 250, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.268418036190413e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }