{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 50, "global_step": 361, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013869625520110958, "grad_norm": 4.510591983795166, "learning_rate": 1.111111111111111e-06, "loss": 4.4401, "step": 5 }, { "epoch": 0.027739251040221916, "grad_norm": 3.330329418182373, "learning_rate": 2.5e-06, "loss": 4.3997, "step": 10 }, { "epoch": 0.04160887656033287, "grad_norm": 2.752856731414795, "learning_rate": 3.88888888888889e-06, "loss": 4.3407, "step": 15 }, { "epoch": 0.05547850208044383, "grad_norm": 1.670164704322815, "learning_rate": 5.2777777777777785e-06, "loss": 4.2228, "step": 20 }, { "epoch": 0.06934812760055478, "grad_norm": 1.4805010557174683, "learning_rate": 6.666666666666667e-06, "loss": 4.1725, "step": 25 }, { "epoch": 0.08321775312066575, "grad_norm": 1.4973056316375732, "learning_rate": 8.055555555555557e-06, "loss": 4.0898, "step": 30 }, { "epoch": 0.0970873786407767, "grad_norm": 1.232718586921692, "learning_rate": 9.444444444444445e-06, "loss": 4.0469, "step": 35 }, { "epoch": 0.11095700416088766, "grad_norm": 1.0980631113052368, "learning_rate": 1.0833333333333334e-05, "loss": 4.0108, "step": 40 }, { "epoch": 0.12482662968099861, "grad_norm": 1.0764951705932617, "learning_rate": 1.2222222222222224e-05, "loss": 3.9719, "step": 45 }, { "epoch": 0.13869625520110956, "grad_norm": 1.2205593585968018, "learning_rate": 1.3611111111111113e-05, "loss": 3.8779, "step": 50 }, { "epoch": 0.13869625520110956, "eval_loss": 3.850510597229004, "eval_runtime": 33.5856, "eval_samples_per_second": 59.549, "eval_steps_per_second": 14.887, "step": 50 }, { "epoch": 0.15256588072122051, "grad_norm": 0.9968748688697815, "learning_rate": 1.5000000000000002e-05, "loss": 3.8635, "step": 55 }, { "epoch": 0.1664355062413315, "grad_norm": 0.8354158401489258, "learning_rate": 1.638888888888889e-05, "loss": 3.8142, "step": 60 }, { "epoch": 0.18030513176144244, "grad_norm": 1.072789192199707, "learning_rate": 1.7777777777777777e-05, "loss": 3.764, "step": 65 }, { "epoch": 0.1941747572815534, "grad_norm": 0.8747042417526245, "learning_rate": 1.916666666666667e-05, "loss": 3.7425, "step": 70 }, { "epoch": 0.20804438280166435, "grad_norm": 0.8491944074630737, "learning_rate": 2e-05, "loss": 3.7508, "step": 75 }, { "epoch": 0.22191400832177532, "grad_norm": 1.2049877643585205, "learning_rate": 2e-05, "loss": 3.7437, "step": 80 }, { "epoch": 0.23578363384188628, "grad_norm": 0.8578179478645325, "learning_rate": 2e-05, "loss": 3.7026, "step": 85 }, { "epoch": 0.24965325936199723, "grad_norm": 0.926129937171936, "learning_rate": 2e-05, "loss": 3.6973, "step": 90 }, { "epoch": 0.2635228848821082, "grad_norm": 0.8127467632293701, "learning_rate": 2e-05, "loss": 3.6644, "step": 95 }, { "epoch": 0.27739251040221913, "grad_norm": 0.8188264966011047, "learning_rate": 2e-05, "loss": 3.7131, "step": 100 }, { "epoch": 0.27739251040221913, "eval_loss": 3.662523031234741, "eval_runtime": 21.6908, "eval_samples_per_second": 92.205, "eval_steps_per_second": 23.051, "step": 100 }, { "epoch": 0.2912621359223301, "grad_norm": 0.8815653920173645, "learning_rate": 2e-05, "loss": 3.698, "step": 105 }, { "epoch": 0.30513176144244103, "grad_norm": 0.8444380164146423, "learning_rate": 2e-05, "loss": 3.7047, "step": 110 }, { "epoch": 0.31900138696255204, "grad_norm": 0.8192554712295532, "learning_rate": 2e-05, "loss": 3.6493, "step": 115 }, { "epoch": 0.332871012482663, "grad_norm": 0.7751811146736145, "learning_rate": 2e-05, "loss": 3.6687, "step": 120 }, { "epoch": 0.34674063800277394, "grad_norm": 0.8989075422286987, "learning_rate": 2e-05, "loss": 3.6364, "step": 125 }, { "epoch": 0.3606102635228849, "grad_norm": 0.8365870118141174, "learning_rate": 2e-05, "loss": 3.638, "step": 130 }, { "epoch": 0.37447988904299584, "grad_norm": 0.8214474320411682, "learning_rate": 2e-05, "loss": 3.6234, "step": 135 }, { "epoch": 0.3883495145631068, "grad_norm": 0.8589164018630981, "learning_rate": 2e-05, "loss": 3.6011, "step": 140 }, { "epoch": 0.40221914008321774, "grad_norm": 0.7401110529899597, "learning_rate": 2e-05, "loss": 3.6116, "step": 145 }, { "epoch": 0.4160887656033287, "grad_norm": 0.9119060635566711, "learning_rate": 2e-05, "loss": 3.66, "step": 150 }, { "epoch": 0.4160887656033287, "eval_loss": 3.6079025268554688, "eval_runtime": 21.5551, "eval_samples_per_second": 92.786, "eval_steps_per_second": 23.196, "step": 150 }, { "epoch": 0.42995839112343964, "grad_norm": 0.7333055138587952, "learning_rate": 2e-05, "loss": 3.6081, "step": 155 }, { "epoch": 0.44382801664355065, "grad_norm": 0.9201464653015137, "learning_rate": 2e-05, "loss": 3.5902, "step": 160 }, { "epoch": 0.4576976421636616, "grad_norm": 0.8326563835144043, "learning_rate": 2e-05, "loss": 3.616, "step": 165 }, { "epoch": 0.47156726768377255, "grad_norm": 0.7910550236701965, "learning_rate": 2e-05, "loss": 3.5961, "step": 170 }, { "epoch": 0.4854368932038835, "grad_norm": 0.7993020415306091, "learning_rate": 2e-05, "loss": 3.6279, "step": 175 }, { "epoch": 0.49930651872399445, "grad_norm": 0.7476230263710022, "learning_rate": 2e-05, "loss": 3.6089, "step": 180 }, { "epoch": 0.5131761442441054, "grad_norm": 0.8083401322364807, "learning_rate": 2e-05, "loss": 3.5774, "step": 185 }, { "epoch": 0.5270457697642164, "grad_norm": 0.8746952414512634, "learning_rate": 2e-05, "loss": 3.5767, "step": 190 }, { "epoch": 0.5409153952843273, "grad_norm": 0.7660844326019287, "learning_rate": 2e-05, "loss": 3.5744, "step": 195 }, { "epoch": 0.5547850208044383, "grad_norm": 0.9199999570846558, "learning_rate": 2e-05, "loss": 3.5692, "step": 200 }, { "epoch": 0.5547850208044383, "eval_loss": 3.57633113861084, "eval_runtime": 21.5806, "eval_samples_per_second": 92.676, "eval_steps_per_second": 23.169, "step": 200 }, { "epoch": 0.5686546463245492, "grad_norm": 0.7814918756484985, "learning_rate": 2e-05, "loss": 3.5922, "step": 205 }, { "epoch": 0.5825242718446602, "grad_norm": 0.8722035884857178, "learning_rate": 2e-05, "loss": 3.566, "step": 210 }, { "epoch": 0.5963938973647711, "grad_norm": 0.7965853214263916, "learning_rate": 2e-05, "loss": 3.5569, "step": 215 }, { "epoch": 0.6102635228848821, "grad_norm": 0.7837920188903809, "learning_rate": 1.9586206896551725e-05, "loss": 3.5482, "step": 220 }, { "epoch": 0.624133148404993, "grad_norm": 0.8320760726928711, "learning_rate": 1.8896551724137934e-05, "loss": 3.574, "step": 225 }, { "epoch": 0.6380027739251041, "grad_norm": 0.8384792804718018, "learning_rate": 1.820689655172414e-05, "loss": 3.559, "step": 230 }, { "epoch": 0.651872399445215, "grad_norm": 0.8892226219177246, "learning_rate": 1.7517241379310347e-05, "loss": 3.592, "step": 235 }, { "epoch": 0.665742024965326, "grad_norm": 0.7530568838119507, "learning_rate": 1.6827586206896552e-05, "loss": 3.5113, "step": 240 }, { "epoch": 0.6796116504854369, "grad_norm": 0.7810556292533875, "learning_rate": 1.613793103448276e-05, "loss": 3.5498, "step": 245 }, { "epoch": 0.6934812760055479, "grad_norm": 0.8023419380187988, "learning_rate": 1.5448275862068965e-05, "loss": 3.5663, "step": 250 }, { "epoch": 0.6934812760055479, "eval_loss": 3.5521113872528076, "eval_runtime": 21.6583, "eval_samples_per_second": 92.343, "eval_steps_per_second": 23.086, "step": 250 }, { "epoch": 0.7073509015256588, "grad_norm": 0.816082775592804, "learning_rate": 1.4758620689655172e-05, "loss": 3.5967, "step": 255 }, { "epoch": 0.7212205270457698, "grad_norm": 0.7597996592521667, "learning_rate": 1.4068965517241382e-05, "loss": 3.5758, "step": 260 }, { "epoch": 0.7350901525658807, "grad_norm": 0.7658893465995789, "learning_rate": 1.3379310344827587e-05, "loss": 3.5614, "step": 265 }, { "epoch": 0.7489597780859917, "grad_norm": 0.8528255224227905, "learning_rate": 1.2689655172413794e-05, "loss": 3.5871, "step": 270 }, { "epoch": 0.7628294036061026, "grad_norm": 0.8154652714729309, "learning_rate": 1.2e-05, "loss": 3.5798, "step": 275 }, { "epoch": 0.7766990291262136, "grad_norm": 0.7679687142372131, "learning_rate": 1.1310344827586209e-05, "loss": 3.5735, "step": 280 }, { "epoch": 0.7905686546463245, "grad_norm": 0.8223329782485962, "learning_rate": 1.0620689655172414e-05, "loss": 3.5398, "step": 285 }, { "epoch": 0.8044382801664355, "grad_norm": 0.7668578624725342, "learning_rate": 9.931034482758622e-06, "loss": 3.5625, "step": 290 }, { "epoch": 0.8183079056865464, "grad_norm": 0.7521477937698364, "learning_rate": 9.241379310344829e-06, "loss": 3.5387, "step": 295 }, { "epoch": 0.8321775312066574, "grad_norm": 0.7685747742652893, "learning_rate": 8.551724137931035e-06, "loss": 3.5442, "step": 300 }, { "epoch": 0.8321775312066574, "eval_loss": 3.5378079414367676, "eval_runtime": 21.7896, "eval_samples_per_second": 91.787, "eval_steps_per_second": 22.947, "step": 300 }, { "epoch": 0.8460471567267683, "grad_norm": 0.7819119691848755, "learning_rate": 7.862068965517242e-06, "loss": 3.4996, "step": 305 }, { "epoch": 0.8599167822468793, "grad_norm": 0.7009981274604797, "learning_rate": 7.1724137931034495e-06, "loss": 3.5478, "step": 310 }, { "epoch": 0.8737864077669902, "grad_norm": 0.7951462268829346, "learning_rate": 6.482758620689655e-06, "loss": 3.5395, "step": 315 }, { "epoch": 0.8876560332871013, "grad_norm": 0.8697261810302734, "learning_rate": 5.793103448275863e-06, "loss": 3.5568, "step": 320 }, { "epoch": 0.9015256588072122, "grad_norm": 0.7426744103431702, "learning_rate": 5.103448275862069e-06, "loss": 3.5467, "step": 325 }, { "epoch": 0.9153952843273232, "grad_norm": 0.8066877126693726, "learning_rate": 4.413793103448276e-06, "loss": 3.533, "step": 330 }, { "epoch": 0.9292649098474342, "grad_norm": 0.8604612946510315, "learning_rate": 3.7241379310344837e-06, "loss": 3.5227, "step": 335 }, { "epoch": 0.9431345353675451, "grad_norm": 0.7136410474777222, "learning_rate": 3.0344827586206895e-06, "loss": 3.5113, "step": 340 }, { "epoch": 0.957004160887656, "grad_norm": 0.701519250869751, "learning_rate": 2.344827586206897e-06, "loss": 3.5372, "step": 345 }, { "epoch": 0.970873786407767, "grad_norm": 0.8079059720039368, "learning_rate": 1.6551724137931047e-06, "loss": 3.5555, "step": 350 }, { "epoch": 0.970873786407767, "eval_loss": 3.5296669006347656, "eval_runtime": 21.758, "eval_samples_per_second": 91.92, "eval_steps_per_second": 22.98, "step": 350 }, { "epoch": 0.984743411927878, "grad_norm": 0.8491346836090088, "learning_rate": 9.655172413793101e-07, "loss": 3.5319, "step": 355 }, { "epoch": 0.9986130374479889, "grad_norm": 0.7234743237495422, "learning_rate": 2.758620689655178e-07, "loss": 3.5472, "step": 360 } ], "logging_steps": 5, "max_steps": 361, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }