{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 250,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.10101010101010101,
      "grad_norm": 5.43838668916967,
      "learning_rate": 6.4000000000000006e-06,
      "loss": 0.768,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.20029328763484955,
      "step": 5,
      "valid_targets_mean": 3894.4,
      "valid_targets_min": 1551
    },
    {
      "epoch": 0.20202020202020202,
      "grad_norm": 1.8173704060845222,
      "learning_rate": 1.4400000000000001e-05,
      "loss": 0.6809,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.12393468618392944,
      "step": 10,
      "valid_targets_mean": 3313.9,
      "valid_targets_min": 658
    },
    {
      "epoch": 0.30303030303030304,
      "grad_norm": 0.8994415488525478,
      "learning_rate": 2.2400000000000002e-05,
      "loss": 0.6367,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.15267133712768555,
      "step": 15,
      "valid_targets_mean": 3214.2,
      "valid_targets_min": 748
    },
    {
      "epoch": 0.40404040404040403,
      "grad_norm": 0.5231491285791463,
      "learning_rate": 3.0400000000000004e-05,
      "loss": 0.603,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.1451815664768219,
      "step": 20,
      "valid_targets_mean": 4062.5,
      "valid_targets_min": 752
    },
    {
      "epoch": 0.5050505050505051,
      "grad_norm": 0.48610136532802667,
      "learning_rate": 3.8400000000000005e-05,
      "loss": 0.5739,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.15977895259857178,
      "step": 25,
      "valid_targets_mean": 4920.6,
      "valid_targets_min": 689
    },
    {
      "epoch": 0.6060606060606061,
      "grad_norm": 0.3657562059313739,
      "learning_rate": 3.9968815283639625e-05,
      "loss": 0.531,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.12183522433042526,
      "step": 30,
      "valid_targets_mean": 4170.1,
      "valid_targets_min": 848
    },
    {
      "epoch": 0.7070707070707071,
      "grad_norm": 0.3117441303599555,
      "learning_rate": 3.9842294026289565e-05,
      "loss": 0.5137,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.11094912886619568,
      "step": 35,
      "valid_targets_mean": 4246.0,
      "valid_targets_min": 626
    },
    {
      "epoch": 0.8080808080808081,
      "grad_norm": 0.31427213112004465,
      "learning_rate": 3.9619103106983835e-05,
      "loss": 0.5033,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.11524395644664764,
      "step": 40,
      "valid_targets_mean": 3335.3,
      "valid_targets_min": 589
    },
    {
      "epoch": 0.9090909090909091,
      "grad_norm": 0.27843711420334,
      "learning_rate": 3.930032988944623e-05,
      "loss": 0.4921,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.12719261646270752,
      "step": 45,
      "valid_targets_mean": 4673.1,
      "valid_targets_min": 842
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.35336756908591643,
      "learning_rate": 3.888752740474962e-05,
      "loss": 0.4725,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.23522385954856873,
      "step": 50,
      "valid_targets_mean": 4525.1,
      "valid_targets_min": 618
    },
    {
      "epoch": 1.101010101010101,
      "grad_norm": 0.2618047514835116,
      "learning_rate": 3.838270678510469e-05,
      "loss": 0.4631,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.0997854694724083,
      "step": 55,
      "valid_targets_mean": 3254.4,
      "valid_targets_min": 620
    },
    {
      "epoch": 1.202020202020202,
      "grad_norm": 0.2596681731982267,
      "learning_rate": 3.778832746582596e-05,
      "loss": 0.4695,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.15060186386108398,
      "step": 60,
      "valid_targets_mean": 5336.2,
      "valid_targets_min": 994
    },
    {
      "epoch": 1.303030303030303,
      "grad_norm": 0.26134191299357834,
      "learning_rate": 3.710728520321014e-05,
      "loss": 0.4622,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.10998662561178207,
      "step": 65,
      "valid_targets_mean": 3828.9,
      "valid_targets_min": 813
    },
    {
      "epoch": 1.404040404040404,
      "grad_norm": 0.2505910129541225,
      "learning_rate": 3.634289796670257e-05,
      "loss": 0.4639,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.11412525177001953,
      "step": 70,
      "valid_targets_mean": 4118.6,
      "valid_targets_min": 589
    },
    {
      "epoch": 1.5050505050505052,
      "grad_norm": 0.28469306377718256,
      "learning_rate": 3.549888977408359e-05,
      "loss": 0.4547,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.10224676132202148,
      "step": 75,
      "valid_targets_mean": 3621.8,
      "valid_targets_min": 807
    },
    {
      "epoch": 1.606060606060606,
      "grad_norm": 0.26506094775038846,
      "learning_rate": 3.457937254842823e-05,
      "loss": 0.4478,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.1216074526309967,
      "step": 80,
      "valid_targets_mean": 4121.2,
      "valid_targets_min": 900
    },
    {
      "epoch": 1.7070707070707072,
      "grad_norm": 0.27422041538604486,
      "learning_rate": 3.3588826085230336e-05,
      "loss": 0.4405,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.09807954728603363,
      "step": 85,
      "valid_targets_mean": 3189.9,
      "valid_targets_min": 777
    },
    {
      "epoch": 1.808080808080808,
      "grad_norm": 0.25008604850493343,
      "learning_rate": 3.253207622728921e-05,
      "loss": 0.441,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.11396118998527527,
      "step": 90,
      "valid_targets_mean": 4577.4,
      "valid_targets_min": 875
    },
    {
      "epoch": 1.9090909090909092,
      "grad_norm": 0.26410650818418535,
      "learning_rate": 3.141427135368864e-05,
      "loss": 0.4447,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.10761921852827072,
      "step": 95,
      "valid_targets_mean": 3803.5,
      "valid_targets_min": 613
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.37011104936880673,
      "learning_rate": 3.024085729741143e-05,
      "loss": 0.4481,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.254130095243454,
      "step": 100,
      "valid_targets_mean": 4326.2,
      "valid_targets_min": 550
    },
    {
      "epoch": 2.101010101010101,
      "grad_norm": 0.2633444829921788,
      "learning_rate": 2.9017550813788616e-05,
      "loss": 0.4286,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.10730044543743134,
      "step": 105,
      "valid_targets_mean": 4681.4,
      "valid_targets_min": 808
    },
    {
      "epoch": 2.202020202020202,
      "grad_norm": 0.27947047290967925,
      "learning_rate": 2.7750311729042062e-05,
      "loss": 0.4249,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.11274366080760956,
      "step": 110,
      "valid_targets_mean": 4363.5,
      "valid_targets_min": 721
    },
    {
      "epoch": 2.303030303030303,
      "grad_norm": 0.26878537952096837,
      "learning_rate": 2.6445313904610227e-05,
      "loss": 0.4319,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.10825060307979584,
      "step": 115,
      "valid_targets_mean": 3570.9,
      "valid_targets_min": 740
    },
    {
      "epoch": 2.404040404040404,
      "grad_norm": 0.27801838465637635,
      "learning_rate": 2.510891515871581e-05,
      "loss": 0.4335,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.09939642250537872,
      "step": 120,
      "valid_targets_mean": 3389.2,
      "valid_targets_min": 712
    },
    {
      "epoch": 2.505050505050505,
      "grad_norm": 0.26781305081846457,
      "learning_rate": 2.37476262917145e-05,
      "loss": 0.4247,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.109991654753685,
      "step": 125,
      "valid_targets_mean": 4646.0,
      "valid_targets_min": 748
    },
    {
      "epoch": 2.606060606060606,
      "grad_norm": 0.26237181084548494,
      "learning_rate": 2.2368079366130028e-05,
      "loss": 0.4187,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.11524119973182678,
      "step": 130,
      "valid_targets_mean": 4932.1,
      "valid_targets_min": 845
    },
    {
      "epoch": 2.707070707070707,
      "grad_norm": 0.2585522405239844,
      "learning_rate": 2.097699539591227e-05,
      "loss": 0.4274,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.08941829204559326,
      "step": 135,
      "valid_targets_mean": 3169.6,
      "valid_targets_min": 832
    },
    {
      "epoch": 2.808080808080808,
      "grad_norm": 0.26016350721342923,
      "learning_rate": 1.9581151602332865e-05,
      "loss": 0.4177,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.1133737787604332,
      "step": 140,
      "valid_targets_mean": 5011.3,
      "valid_targets_min": 1002
    },
    {
      "epoch": 2.909090909090909,
      "grad_norm": 0.27030558024007334,
      "learning_rate": 1.8187348396044402e-05,
      "loss": 0.4231,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.10732856392860413,
      "step": 145,
      "valid_targets_mean": 4244.3,
      "valid_targets_min": 849
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.3695239483214702,
      "learning_rate": 1.6802376246163307e-05,
      "loss": 0.4307,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.2080117017030716,
      "step": 150,
      "valid_targets_mean": 3466.7,
      "valid_targets_min": 600
    },
    {
      "epoch": 3.101010101010101,
      "grad_norm": 0.2516175673157399,
      "learning_rate": 1.5432982597786886e-05,
      "loss": 0.4228,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.11081644892692566,
      "step": 155,
      "valid_targets_mean": 4376.6,
      "valid_targets_min": 1393
    },
    {
      "epoch": 3.202020202020202,
      "grad_norm": 0.25899041127457506,
      "learning_rate": 1.4085838999119075e-05,
      "loss": 0.4183,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.10279497504234314,
      "step": 160,
      "valid_targets_mean": 3922.6,
      "valid_targets_min": 1037
    },
    {
      "epoch": 3.303030303030303,
      "grad_norm": 0.2568302854985244,
      "learning_rate": 1.2767508598358158e-05,
      "loss": 0.4119,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.10400432348251343,
      "step": 165,
      "valid_targets_mean": 4147.3,
      "valid_targets_min": 840
    },
    {
      "epoch": 3.404040404040404,
      "grad_norm": 0.2575745984749214,
      "learning_rate": 1.1484414168698547e-05,
      "loss": 0.4109,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.1014273464679718,
      "step": 170,
      "valid_targets_mean": 3870.5,
      "valid_targets_min": 738
    },
    {
      "epoch": 3.505050505050505,
      "grad_norm": 0.2708052183398422,
      "learning_rate": 1.0242806817225344e-05,
      "loss": 0.4183,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.0942554920911789,
      "step": 175,
      "valid_targets_mean": 4129.5,
      "valid_targets_min": 691
    },
    {
      "epoch": 3.606060606060606,
      "grad_norm": 0.39589597035376406,
      "learning_rate": 9.048735530148998e-06,
      "loss": 0.4057,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.10414694994688034,
      "step": 180,
      "valid_targets_mean": 3647.8,
      "valid_targets_min": 1355
    },
    {
      "epoch": 3.707070707070707,
      "grad_norm": 0.2754352204699097,
      "learning_rate": 7.908017702752504e-06,
      "loss": 0.4167,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.11268728971481323,
      "step": 185,
      "valid_targets_mean": 4025.8,
      "valid_targets_min": 631
    },
    {
      "epoch": 3.808080808080808,
      "grad_norm": 0.26307739642578865,
      "learning_rate": 6.826210797626389e-06,
      "loss": 0.4158,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.08769197762012482,
      "step": 190,
      "valid_targets_mean": 3541.8,
      "valid_targets_min": 1132
    },
    {
      "epoch": 3.909090909090909,
      "grad_norm": 0.25965429680059315,
      "learning_rate": 5.8085852692695864e-06,
      "loss": 0.4088,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.10568307340145111,
      "step": 195,
      "valid_targets_mean": 4072.8,
      "valid_targets_min": 658
    },
    {
      "epoch": 4.0,
      "grad_norm": 0.37112181378749864,
      "learning_rate": 4.8600988869648745e-06,
      "loss": 0.4077,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.18167036771774292,
      "step": 200,
      "valid_targets_mean": 3388.4,
      "valid_targets_min": 757
    },
    {
      "epoch": 4.101010101010101,
      "grad_norm": 0.24190724490489265,
      "learning_rate": 3.985372581025333e-06,
      "loss": 0.3992,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.09219323098659515,
      "step": 205,
      "valid_targets_mean": 3395.9,
      "valid_targets_min": 1280
    },
    {
      "epoch": 4.202020202020202,
      "grad_norm": 0.26150260454768026,
      "learning_rate": 3.1886679300863156e-06,
      "loss": 0.407,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.09850673377513885,
      "step": 210,
      "valid_targets_mean": 3216.6,
      "valid_targets_min": 784
    },
    {
      "epoch": 4.303030303030303,
      "grad_norm": 0.26872823184659334,
      "learning_rate": 2.473866399122733e-06,
      "loss": 0.4133,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.10747270286083221,
      "step": 215,
      "valid_targets_mean": 3487.2,
      "valid_targets_min": 773
    },
    {
      "epoch": 4.404040404040404,
      "grad_norm": 0.2549088626047512,
      "learning_rate": 1.8444504293418286e-06,
      "loss": 0.4124,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.10171965509653091,
      "step": 220,
      "valid_targets_mean": 3727.9,
      "valid_targets_min": 928
    },
    {
      "epoch": 4.505050505050505,
      "grad_norm": 0.32301587125223463,
      "learning_rate": 1.3034864720797112e-06,
      "loss": 0.4125,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.11444351077079773,
      "step": 225,
      "valid_targets_mean": 3867.6,
      "valid_targets_min": 791
    },
    {
      "epoch": 4.606060606060606,
      "grad_norm": 0.24811664458418323,
      "learning_rate": 8.536100493586552e-07,
      "loss": 0.4023,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.0989425927400589,
      "step": 230,
      "valid_targets_mean": 3649.8,
      "valid_targets_min": 675
    },
    {
      "epoch": 4.707070707070707,
      "grad_norm": 0.24999205236156857,
      "learning_rate": 4.970129138887347e-07,
      "loss": 0.4078,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.09448540210723877,
      "step": 235,
      "valid_targets_mean": 3779.3,
      "valid_targets_min": 613
    },
    {
      "epoch": 4.808080808080808,
      "grad_norm": 0.2533525398749002,
      "learning_rate": 2.3543237106894434e-07,
      "loss": 0.4138,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.10717539489269257,
      "step": 240,
      "valid_targets_mean": 3630.1,
      "valid_targets_min": 1185
    },
    {
      "epoch": 4.909090909090909,
      "grad_norm": 0.2324383918083103,
      "learning_rate": 7.01428150099126e-08,
      "loss": 0.4099,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.09807487577199936,
      "step": 245,
      "valid_targets_mean": 4431.1,
      "valid_targets_min": 1588
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.3564637724019629,
      "learning_rate": 1.949519813915224e-09,
      "loss": 0.4103,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.20247647166252136,
      "step": 250,
      "valid_targets_mean": 3850.3,
      "valid_targets_min": 771
    },
    {
      "epoch": 5.0,
      "loss_nan_ranks": 0,
      "loss_rank_avg": 0.20247647166252136,
      "step": 250,
      "total_flos": 4.268418036190413e+17,
      "train_loss": 0.4559480676651001,
      "train_runtime": 5383.7809,
      "train_samples_per_second": 2.935,
      "train_steps_per_second": 0.046,
      "valid_targets_mean": 3850.3,
      "valid_targets_min": 771
    }
  ],
  "logging_steps": 5,
  "max_steps": 250,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 4.268418036190413e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}