| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 50, |
| "global_step": 361, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.013869625520110958, |
| "grad_norm": 4.510591983795166, |
| "learning_rate": 1.111111111111111e-06, |
| "loss": 4.4401, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.027739251040221916, |
| "grad_norm": 3.330329418182373, |
| "learning_rate": 2.5e-06, |
| "loss": 4.3997, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.04160887656033287, |
| "grad_norm": 2.752856731414795, |
| "learning_rate": 3.88888888888889e-06, |
| "loss": 4.3407, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.05547850208044383, |
| "grad_norm": 1.670164704322815, |
| "learning_rate": 5.2777777777777785e-06, |
| "loss": 4.2228, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.06934812760055478, |
| "grad_norm": 1.4805010557174683, |
| "learning_rate": 6.666666666666667e-06, |
| "loss": 4.1725, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.08321775312066575, |
| "grad_norm": 1.4973056316375732, |
| "learning_rate": 8.055555555555557e-06, |
| "loss": 4.0898, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.0970873786407767, |
| "grad_norm": 1.232718586921692, |
| "learning_rate": 9.444444444444445e-06, |
| "loss": 4.0469, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.11095700416088766, |
| "grad_norm": 1.0980631113052368, |
| "learning_rate": 1.0833333333333334e-05, |
| "loss": 4.0108, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.12482662968099861, |
| "grad_norm": 1.0764951705932617, |
| "learning_rate": 1.2222222222222224e-05, |
| "loss": 3.9719, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.13869625520110956, |
| "grad_norm": 1.2205593585968018, |
| "learning_rate": 1.3611111111111113e-05, |
| "loss": 3.8779, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.13869625520110956, |
| "eval_loss": 3.850510597229004, |
| "eval_runtime": 33.5856, |
| "eval_samples_per_second": 59.549, |
| "eval_steps_per_second": 14.887, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.15256588072122051, |
| "grad_norm": 0.9968748688697815, |
| "learning_rate": 1.5000000000000002e-05, |
| "loss": 3.8635, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.1664355062413315, |
| "grad_norm": 0.8354158401489258, |
| "learning_rate": 1.638888888888889e-05, |
| "loss": 3.8142, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.18030513176144244, |
| "grad_norm": 1.072789192199707, |
| "learning_rate": 1.7777777777777777e-05, |
| "loss": 3.764, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.1941747572815534, |
| "grad_norm": 0.8747042417526245, |
| "learning_rate": 1.916666666666667e-05, |
| "loss": 3.7425, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.20804438280166435, |
| "grad_norm": 0.8491944074630737, |
| "learning_rate": 2e-05, |
| "loss": 3.7508, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.22191400832177532, |
| "grad_norm": 1.2049877643585205, |
| "learning_rate": 2e-05, |
| "loss": 3.7437, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.23578363384188628, |
| "grad_norm": 0.8578179478645325, |
| "learning_rate": 2e-05, |
| "loss": 3.7026, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.24965325936199723, |
| "grad_norm": 0.926129937171936, |
| "learning_rate": 2e-05, |
| "loss": 3.6973, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.2635228848821082, |
| "grad_norm": 0.8127467632293701, |
| "learning_rate": 2e-05, |
| "loss": 3.6644, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.27739251040221913, |
| "grad_norm": 0.8188264966011047, |
| "learning_rate": 2e-05, |
| "loss": 3.7131, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.27739251040221913, |
| "eval_loss": 3.662523031234741, |
| "eval_runtime": 21.6908, |
| "eval_samples_per_second": 92.205, |
| "eval_steps_per_second": 23.051, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.2912621359223301, |
| "grad_norm": 0.8815653920173645, |
| "learning_rate": 2e-05, |
| "loss": 3.698, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.30513176144244103, |
| "grad_norm": 0.8444380164146423, |
| "learning_rate": 2e-05, |
| "loss": 3.7047, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.31900138696255204, |
| "grad_norm": 0.8192554712295532, |
| "learning_rate": 2e-05, |
| "loss": 3.6493, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.332871012482663, |
| "grad_norm": 0.7751811146736145, |
| "learning_rate": 2e-05, |
| "loss": 3.6687, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.34674063800277394, |
| "grad_norm": 0.8989075422286987, |
| "learning_rate": 2e-05, |
| "loss": 3.6364, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.3606102635228849, |
| "grad_norm": 0.8365870118141174, |
| "learning_rate": 2e-05, |
| "loss": 3.638, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.37447988904299584, |
| "grad_norm": 0.8214474320411682, |
| "learning_rate": 2e-05, |
| "loss": 3.6234, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.3883495145631068, |
| "grad_norm": 0.8589164018630981, |
| "learning_rate": 2e-05, |
| "loss": 3.6011, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.40221914008321774, |
| "grad_norm": 0.7401110529899597, |
| "learning_rate": 2e-05, |
| "loss": 3.6116, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.4160887656033287, |
| "grad_norm": 0.9119060635566711, |
| "learning_rate": 2e-05, |
| "loss": 3.66, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.4160887656033287, |
| "eval_loss": 3.6079025268554688, |
| "eval_runtime": 21.5551, |
| "eval_samples_per_second": 92.786, |
| "eval_steps_per_second": 23.196, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.42995839112343964, |
| "grad_norm": 0.7333055138587952, |
| "learning_rate": 2e-05, |
| "loss": 3.6081, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.44382801664355065, |
| "grad_norm": 0.9201464653015137, |
| "learning_rate": 2e-05, |
| "loss": 3.5902, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.4576976421636616, |
| "grad_norm": 0.8326563835144043, |
| "learning_rate": 2e-05, |
| "loss": 3.616, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.47156726768377255, |
| "grad_norm": 0.7910550236701965, |
| "learning_rate": 2e-05, |
| "loss": 3.5961, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.4854368932038835, |
| "grad_norm": 0.7993020415306091, |
| "learning_rate": 2e-05, |
| "loss": 3.6279, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.49930651872399445, |
| "grad_norm": 0.7476230263710022, |
| "learning_rate": 2e-05, |
| "loss": 3.6089, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.5131761442441054, |
| "grad_norm": 0.8083401322364807, |
| "learning_rate": 2e-05, |
| "loss": 3.5774, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.5270457697642164, |
| "grad_norm": 0.8746952414512634, |
| "learning_rate": 2e-05, |
| "loss": 3.5767, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.5409153952843273, |
| "grad_norm": 0.7660844326019287, |
| "learning_rate": 2e-05, |
| "loss": 3.5744, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.5547850208044383, |
| "grad_norm": 0.9199999570846558, |
| "learning_rate": 2e-05, |
| "loss": 3.5692, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.5547850208044383, |
| "eval_loss": 3.57633113861084, |
| "eval_runtime": 21.5806, |
| "eval_samples_per_second": 92.676, |
| "eval_steps_per_second": 23.169, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.5686546463245492, |
| "grad_norm": 0.7814918756484985, |
| "learning_rate": 2e-05, |
| "loss": 3.5922, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.5825242718446602, |
| "grad_norm": 0.8722035884857178, |
| "learning_rate": 2e-05, |
| "loss": 3.566, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.5963938973647711, |
| "grad_norm": 0.7965853214263916, |
| "learning_rate": 2e-05, |
| "loss": 3.5569, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.6102635228848821, |
| "grad_norm": 0.7837920188903809, |
| "learning_rate": 1.9586206896551725e-05, |
| "loss": 3.5482, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.624133148404993, |
| "grad_norm": 0.8320760726928711, |
| "learning_rate": 1.8896551724137934e-05, |
| "loss": 3.574, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.6380027739251041, |
| "grad_norm": 0.8384792804718018, |
| "learning_rate": 1.820689655172414e-05, |
| "loss": 3.559, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.651872399445215, |
| "grad_norm": 0.8892226219177246, |
| "learning_rate": 1.7517241379310347e-05, |
| "loss": 3.592, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.665742024965326, |
| "grad_norm": 0.7530568838119507, |
| "learning_rate": 1.6827586206896552e-05, |
| "loss": 3.5113, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.6796116504854369, |
| "grad_norm": 0.7810556292533875, |
| "learning_rate": 1.613793103448276e-05, |
| "loss": 3.5498, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.6934812760055479, |
| "grad_norm": 0.8023419380187988, |
| "learning_rate": 1.5448275862068965e-05, |
| "loss": 3.5663, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.6934812760055479, |
| "eval_loss": 3.5521113872528076, |
| "eval_runtime": 21.6583, |
| "eval_samples_per_second": 92.343, |
| "eval_steps_per_second": 23.086, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.7073509015256588, |
| "grad_norm": 0.816082775592804, |
| "learning_rate": 1.4758620689655172e-05, |
| "loss": 3.5967, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.7212205270457698, |
| "grad_norm": 0.7597996592521667, |
| "learning_rate": 1.4068965517241382e-05, |
| "loss": 3.5758, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.7350901525658807, |
| "grad_norm": 0.7658893465995789, |
| "learning_rate": 1.3379310344827587e-05, |
| "loss": 3.5614, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.7489597780859917, |
| "grad_norm": 0.8528255224227905, |
| "learning_rate": 1.2689655172413794e-05, |
| "loss": 3.5871, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.7628294036061026, |
| "grad_norm": 0.8154652714729309, |
| "learning_rate": 1.2e-05, |
| "loss": 3.5798, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.7766990291262136, |
| "grad_norm": 0.7679687142372131, |
| "learning_rate": 1.1310344827586209e-05, |
| "loss": 3.5735, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.7905686546463245, |
| "grad_norm": 0.8223329782485962, |
| "learning_rate": 1.0620689655172414e-05, |
| "loss": 3.5398, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.8044382801664355, |
| "grad_norm": 0.7668578624725342, |
| "learning_rate": 9.931034482758622e-06, |
| "loss": 3.5625, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.8183079056865464, |
| "grad_norm": 0.7521477937698364, |
| "learning_rate": 9.241379310344829e-06, |
| "loss": 3.5387, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.8321775312066574, |
| "grad_norm": 0.7685747742652893, |
| "learning_rate": 8.551724137931035e-06, |
| "loss": 3.5442, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.8321775312066574, |
| "eval_loss": 3.5378079414367676, |
| "eval_runtime": 21.7896, |
| "eval_samples_per_second": 91.787, |
| "eval_steps_per_second": 22.947, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.8460471567267683, |
| "grad_norm": 0.7819119691848755, |
| "learning_rate": 7.862068965517242e-06, |
| "loss": 3.4996, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.8599167822468793, |
| "grad_norm": 0.7009981274604797, |
| "learning_rate": 7.1724137931034495e-06, |
| "loss": 3.5478, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.8737864077669902, |
| "grad_norm": 0.7951462268829346, |
| "learning_rate": 6.482758620689655e-06, |
| "loss": 3.5395, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.8876560332871013, |
| "grad_norm": 0.8697261810302734, |
| "learning_rate": 5.793103448275863e-06, |
| "loss": 3.5568, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.9015256588072122, |
| "grad_norm": 0.7426744103431702, |
| "learning_rate": 5.103448275862069e-06, |
| "loss": 3.5467, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.9153952843273232, |
| "grad_norm": 0.8066877126693726, |
| "learning_rate": 4.413793103448276e-06, |
| "loss": 3.533, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.9292649098474342, |
| "grad_norm": 0.8604612946510315, |
| "learning_rate": 3.7241379310344837e-06, |
| "loss": 3.5227, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.9431345353675451, |
| "grad_norm": 0.7136410474777222, |
| "learning_rate": 3.0344827586206895e-06, |
| "loss": 3.5113, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.957004160887656, |
| "grad_norm": 0.701519250869751, |
| "learning_rate": 2.344827586206897e-06, |
| "loss": 3.5372, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.970873786407767, |
| "grad_norm": 0.8079059720039368, |
| "learning_rate": 1.6551724137931047e-06, |
| "loss": 3.5555, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.970873786407767, |
| "eval_loss": 3.5296669006347656, |
| "eval_runtime": 21.758, |
| "eval_samples_per_second": 91.92, |
| "eval_steps_per_second": 22.98, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.984743411927878, |
| "grad_norm": 0.8491346836090088, |
| "learning_rate": 9.655172413793101e-07, |
| "loss": 3.5319, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.9986130374479889, |
| "grad_norm": 0.7234743237495422, |
| "learning_rate": 2.758620689655178e-07, |
| "loss": 3.5472, |
| "step": 360 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 361, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 50, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|