{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9973380656610469, "eval_steps": 500, "global_step": 844, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.023661638568470866, "grad_norm": 1.2408271523327399, "learning_rate": 5e-06, "loss": 0.8878, "step": 10 }, { "epoch": 0.04732327713694173, "grad_norm": 10.221763472546659, "learning_rate": 5e-06, "loss": 0.7989, "step": 20 }, { "epoch": 0.0709849157054126, "grad_norm": 1.2230059638293505, "learning_rate": 5e-06, "loss": 0.773, "step": 30 }, { "epoch": 0.09464655427388347, "grad_norm": 2.6694221939382583, "learning_rate": 5e-06, "loss": 0.7466, "step": 40 }, { "epoch": 0.11830819284235433, "grad_norm": 0.7655921008659343, "learning_rate": 5e-06, "loss": 0.7308, "step": 50 }, { "epoch": 0.1419698314108252, "grad_norm": 0.6751841899446792, "learning_rate": 5e-06, "loss": 0.7178, "step": 60 }, { "epoch": 0.16563146997929606, "grad_norm": 0.545261037277831, "learning_rate": 5e-06, "loss": 0.7116, "step": 70 }, { "epoch": 0.18929310854776693, "grad_norm": 0.8030523103589834, "learning_rate": 5e-06, "loss": 0.7017, "step": 80 }, { "epoch": 0.2129547471162378, "grad_norm": 0.8014531123871866, "learning_rate": 5e-06, "loss": 0.6861, "step": 90 }, { "epoch": 0.23661638568470866, "grad_norm": 0.4918470297754101, "learning_rate": 5e-06, "loss": 0.6852, "step": 100 }, { "epoch": 0.26027802425317953, "grad_norm": 0.69215978679395, "learning_rate": 5e-06, "loss": 0.69, "step": 110 }, { "epoch": 0.2839396628216504, "grad_norm": 0.8786435457825235, "learning_rate": 5e-06, "loss": 0.6773, "step": 120 }, { "epoch": 0.30760130139012126, "grad_norm": 0.49069358486584114, "learning_rate": 5e-06, "loss": 0.6737, "step": 130 }, { "epoch": 0.33126293995859213, "grad_norm": 0.7921488279977867, "learning_rate": 5e-06, "loss": 0.6821, "step": 140 }, { "epoch": 0.354924578527063, "grad_norm": 0.77230991386959, "learning_rate": 5e-06, "loss": 0.6648, "step": 150 }, { "epoch": 0.37858621709553386, "grad_norm": 0.47987920788300265, "learning_rate": 5e-06, "loss": 0.669, "step": 160 }, { "epoch": 0.4022478556640047, "grad_norm": 0.5618200809563821, "learning_rate": 5e-06, "loss": 0.6668, "step": 170 }, { "epoch": 0.4259094942324756, "grad_norm": 0.7304782642194491, "learning_rate": 5e-06, "loss": 0.6737, "step": 180 }, { "epoch": 0.44957113280094646, "grad_norm": 0.46280184605813207, "learning_rate": 5e-06, "loss": 0.6697, "step": 190 }, { "epoch": 0.4732327713694173, "grad_norm": 0.7079097684721737, "learning_rate": 5e-06, "loss": 0.6686, "step": 200 }, { "epoch": 0.4968944099378882, "grad_norm": 0.774761573498746, "learning_rate": 5e-06, "loss": 0.6694, "step": 210 }, { "epoch": 0.5205560485063591, "grad_norm": 0.576730626392715, "learning_rate": 5e-06, "loss": 0.6677, "step": 220 }, { "epoch": 0.54421768707483, "grad_norm": 0.5744988270185307, "learning_rate": 5e-06, "loss": 0.6602, "step": 230 }, { "epoch": 0.5678793256433008, "grad_norm": 0.5394481930250411, "learning_rate": 5e-06, "loss": 0.6644, "step": 240 }, { "epoch": 0.5915409642117717, "grad_norm": 0.5182952984171931, "learning_rate": 5e-06, "loss": 0.6615, "step": 250 }, { "epoch": 0.6152026027802425, "grad_norm": 0.6364320156443367, "learning_rate": 5e-06, "loss": 0.6519, "step": 260 }, { "epoch": 0.6388642413487134, "grad_norm": 0.6324207034276161, "learning_rate": 5e-06, "loss": 0.6639, "step": 270 }, { "epoch": 0.6625258799171843, "grad_norm": 0.6620182705762153, "learning_rate": 5e-06, "loss": 0.6651, "step": 280 }, { "epoch": 0.6861875184856552, "grad_norm": 0.46128169756980925, "learning_rate": 5e-06, "loss": 0.6596, "step": 290 }, { "epoch": 0.709849157054126, "grad_norm": 0.622188372470794, "learning_rate": 5e-06, "loss": 0.6534, "step": 300 }, { "epoch": 0.7335107956225969, "grad_norm": 0.4904698615453566, "learning_rate": 5e-06, "loss": 0.6618, "step": 310 }, { "epoch": 0.7571724341910677, "grad_norm": 0.4555806118897353, "learning_rate": 5e-06, "loss": 0.6554, "step": 320 }, { "epoch": 0.7808340727595386, "grad_norm": 0.5273034701797177, "learning_rate": 5e-06, "loss": 0.654, "step": 330 }, { "epoch": 0.8044957113280095, "grad_norm": 0.5442233535066454, "learning_rate": 5e-06, "loss": 0.6537, "step": 340 }, { "epoch": 0.8281573498964804, "grad_norm": 0.6380409398524519, "learning_rate": 5e-06, "loss": 0.6601, "step": 350 }, { "epoch": 0.8518189884649512, "grad_norm": 0.4389996927828098, "learning_rate": 5e-06, "loss": 0.6537, "step": 360 }, { "epoch": 0.8754806270334221, "grad_norm": 0.4608268531740333, "learning_rate": 5e-06, "loss": 0.6565, "step": 370 }, { "epoch": 0.8991422656018929, "grad_norm": 0.5330723429667825, "learning_rate": 5e-06, "loss": 0.6477, "step": 380 }, { "epoch": 0.9228039041703638, "grad_norm": 0.5929849990200475, "learning_rate": 5e-06, "loss": 0.6552, "step": 390 }, { "epoch": 0.9464655427388347, "grad_norm": 0.4773172047297779, "learning_rate": 5e-06, "loss": 0.6464, "step": 400 }, { "epoch": 0.9701271813073056, "grad_norm": 0.4606137860127268, "learning_rate": 5e-06, "loss": 0.6489, "step": 410 }, { "epoch": 0.9937888198757764, "grad_norm": 0.526120099445913, "learning_rate": 5e-06, "loss": 0.6478, "step": 420 }, { "epoch": 0.9985211475894705, "eval_loss": 0.6501929759979248, "eval_runtime": 449.6535, "eval_samples_per_second": 25.328, "eval_steps_per_second": 0.396, "step": 422 }, { "epoch": 1.0177462289263532, "grad_norm": 0.5143362353922324, "learning_rate": 5e-06, "loss": 0.6515, "step": 430 }, { "epoch": 1.041407867494824, "grad_norm": 0.5162162401792869, "learning_rate": 5e-06, "loss": 0.605, "step": 440 }, { "epoch": 1.0650695060632949, "grad_norm": 0.7393357078452915, "learning_rate": 5e-06, "loss": 0.603, "step": 450 }, { "epoch": 1.0887311446317658, "grad_norm": 0.649426932177774, "learning_rate": 5e-06, "loss": 0.6134, "step": 460 }, { "epoch": 1.1123927832002367, "grad_norm": 0.5705639188659947, "learning_rate": 5e-06, "loss": 0.6106, "step": 470 }, { "epoch": 1.1360544217687074, "grad_norm": 0.7543562567579628, "learning_rate": 5e-06, "loss": 0.611, "step": 480 }, { "epoch": 1.1597160603371783, "grad_norm": 0.5499597181388575, "learning_rate": 5e-06, "loss": 0.6079, "step": 490 }, { "epoch": 1.1833776989056493, "grad_norm": 0.5262121393467482, "learning_rate": 5e-06, "loss": 0.6036, "step": 500 }, { "epoch": 1.2070393374741202, "grad_norm": 0.5683114548160128, "learning_rate": 5e-06, "loss": 0.6034, "step": 510 }, { "epoch": 1.2307009760425909, "grad_norm": 0.6610172663362014, "learning_rate": 5e-06, "loss": 0.6099, "step": 520 }, { "epoch": 1.2543626146110618, "grad_norm": 0.6007955010537178, "learning_rate": 5e-06, "loss": 0.6125, "step": 530 }, { "epoch": 1.2780242531795327, "grad_norm": 0.5585264375543114, "learning_rate": 5e-06, "loss": 0.6121, "step": 540 }, { "epoch": 1.3016858917480034, "grad_norm": 0.4689366084615487, "learning_rate": 5e-06, "loss": 0.6089, "step": 550 }, { "epoch": 1.3253475303164743, "grad_norm": 0.443719906754886, "learning_rate": 5e-06, "loss": 0.6073, "step": 560 }, { "epoch": 1.3490091688849453, "grad_norm": 0.8624897115990705, "learning_rate": 5e-06, "loss": 0.6084, "step": 570 }, { "epoch": 1.3726708074534162, "grad_norm": 0.5498793437391156, "learning_rate": 5e-06, "loss": 0.611, "step": 580 }, { "epoch": 1.396332446021887, "grad_norm": 0.44457160894446396, "learning_rate": 5e-06, "loss": 0.6115, "step": 590 }, { "epoch": 1.4199940845903578, "grad_norm": 0.5196837986130378, "learning_rate": 5e-06, "loss": 0.6008, "step": 600 }, { "epoch": 1.4436557231588287, "grad_norm": 0.40806642647037533, "learning_rate": 5e-06, "loss": 0.6002, "step": 610 }, { "epoch": 1.4673173617272997, "grad_norm": 0.449778520265882, "learning_rate": 5e-06, "loss": 0.6037, "step": 620 }, { "epoch": 1.4909790002957704, "grad_norm": 0.46760792115141014, "learning_rate": 5e-06, "loss": 0.6157, "step": 630 }, { "epoch": 1.5146406388642415, "grad_norm": 0.4490152450206069, "learning_rate": 5e-06, "loss": 0.6101, "step": 640 }, { "epoch": 1.5383022774327122, "grad_norm": 0.42442779950583953, "learning_rate": 5e-06, "loss": 0.6042, "step": 650 }, { "epoch": 1.5619639160011831, "grad_norm": 0.5976128445381751, "learning_rate": 5e-06, "loss": 0.609, "step": 660 }, { "epoch": 1.585625554569654, "grad_norm": 0.7381067199080075, "learning_rate": 5e-06, "loss": 0.6015, "step": 670 }, { "epoch": 1.6092871931381247, "grad_norm": 0.4692365896477618, "learning_rate": 5e-06, "loss": 0.6098, "step": 680 }, { "epoch": 1.6329488317065957, "grad_norm": 0.5475052095467955, "learning_rate": 5e-06, "loss": 0.601, "step": 690 }, { "epoch": 1.6566104702750666, "grad_norm": 0.5706027825471482, "learning_rate": 5e-06, "loss": 0.6107, "step": 700 }, { "epoch": 1.6802721088435373, "grad_norm": 0.5270197331562642, "learning_rate": 5e-06, "loss": 0.609, "step": 710 }, { "epoch": 1.7039337474120084, "grad_norm": 0.6598391343305342, "learning_rate": 5e-06, "loss": 0.6118, "step": 720 }, { "epoch": 1.7275953859804791, "grad_norm": 0.5570434796027114, "learning_rate": 5e-06, "loss": 0.6116, "step": 730 }, { "epoch": 1.75125702454895, "grad_norm": 0.4955844130516369, "learning_rate": 5e-06, "loss": 0.6039, "step": 740 }, { "epoch": 1.774918663117421, "grad_norm": 0.47770168087128073, "learning_rate": 5e-06, "loss": 0.6101, "step": 750 }, { "epoch": 1.7985803016858917, "grad_norm": 0.4667370666965365, "learning_rate": 5e-06, "loss": 0.614, "step": 760 }, { "epoch": 1.8222419402543626, "grad_norm": 0.4616819567056668, "learning_rate": 5e-06, "loss": 0.6158, "step": 770 }, { "epoch": 1.8459035788228335, "grad_norm": 0.43467879051005953, "learning_rate": 5e-06, "loss": 0.6067, "step": 780 }, { "epoch": 1.8695652173913042, "grad_norm": 0.48362881437134725, "learning_rate": 5e-06, "loss": 0.6054, "step": 790 }, { "epoch": 1.8932268559597754, "grad_norm": 0.49747648081112666, "learning_rate": 5e-06, "loss": 0.6137, "step": 800 }, { "epoch": 1.916888494528246, "grad_norm": 0.4097820122920606, "learning_rate": 5e-06, "loss": 0.6114, "step": 810 }, { "epoch": 1.940550133096717, "grad_norm": 0.47535675742314604, "learning_rate": 5e-06, "loss": 0.5996, "step": 820 }, { "epoch": 1.964211771665188, "grad_norm": 0.49949616004506914, "learning_rate": 5e-06, "loss": 0.6108, "step": 830 }, { "epoch": 1.9878734102336586, "grad_norm": 0.4387152081138621, "learning_rate": 5e-06, "loss": 0.5981, "step": 840 }, { "epoch": 1.9973380656610469, "eval_loss": 0.6398828029632568, "eval_runtime": 449.4321, "eval_samples_per_second": 25.341, "eval_steps_per_second": 0.396, "step": 844 }, { "epoch": 1.9973380656610469, "step": 844, "total_flos": 1413522055495680.0, "train_loss": 0.645099672378522, "train_runtime": 50035.9585, "train_samples_per_second": 8.649, "train_steps_per_second": 0.017 } ], "logging_steps": 10, "max_steps": 844, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1413522055495680.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }