{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9973380656610469,
  "eval_steps": 500,
  "global_step": 844,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.023661638568470866,
      "grad_norm": 1.2408271523327399,
      "learning_rate": 5e-06,
      "loss": 0.8878,
      "step": 10
    },
    {
      "epoch": 0.04732327713694173,
      "grad_norm": 10.221763472546659,
      "learning_rate": 5e-06,
      "loss": 0.7989,
      "step": 20
    },
    {
      "epoch": 0.0709849157054126,
      "grad_norm": 1.2230059638293505,
      "learning_rate": 5e-06,
      "loss": 0.773,
      "step": 30
    },
    {
      "epoch": 0.09464655427388347,
      "grad_norm": 2.6694221939382583,
      "learning_rate": 5e-06,
      "loss": 0.7466,
      "step": 40
    },
    {
      "epoch": 0.11830819284235433,
      "grad_norm": 0.7655921008659343,
      "learning_rate": 5e-06,
      "loss": 0.7308,
      "step": 50
    },
    {
      "epoch": 0.1419698314108252,
      "grad_norm": 0.6751841899446792,
      "learning_rate": 5e-06,
      "loss": 0.7178,
      "step": 60
    },
    {
      "epoch": 0.16563146997929606,
      "grad_norm": 0.545261037277831,
      "learning_rate": 5e-06,
      "loss": 0.7116,
      "step": 70
    },
    {
      "epoch": 0.18929310854776693,
      "grad_norm": 0.8030523103589834,
      "learning_rate": 5e-06,
      "loss": 0.7017,
      "step": 80
    },
    {
      "epoch": 0.2129547471162378,
      "grad_norm": 0.8014531123871866,
      "learning_rate": 5e-06,
      "loss": 0.6861,
      "step": 90
    },
    {
      "epoch": 0.23661638568470866,
      "grad_norm": 0.4918470297754101,
      "learning_rate": 5e-06,
      "loss": 0.6852,
      "step": 100
    },
    {
      "epoch": 0.26027802425317953,
      "grad_norm": 0.69215978679395,
      "learning_rate": 5e-06,
      "loss": 0.69,
      "step": 110
    },
    {
      "epoch": 0.2839396628216504,
      "grad_norm": 0.8786435457825235,
      "learning_rate": 5e-06,
      "loss": 0.6773,
      "step": 120
    },
    {
      "epoch": 0.30760130139012126,
      "grad_norm": 0.49069358486584114,
      "learning_rate": 5e-06,
      "loss": 0.6737,
      "step": 130
    },
    {
      "epoch": 0.33126293995859213,
      "grad_norm": 0.7921488279977867,
      "learning_rate": 5e-06,
      "loss": 0.6821,
      "step": 140
    },
    {
      "epoch": 0.354924578527063,
      "grad_norm": 0.77230991386959,
      "learning_rate": 5e-06,
      "loss": 0.6648,
      "step": 150
    },
    {
      "epoch": 0.37858621709553386,
      "grad_norm": 0.47987920788300265,
      "learning_rate": 5e-06,
      "loss": 0.669,
      "step": 160
    },
    {
      "epoch": 0.4022478556640047,
      "grad_norm": 0.5618200809563821,
      "learning_rate": 5e-06,
      "loss": 0.6668,
      "step": 170
    },
    {
      "epoch": 0.4259094942324756,
      "grad_norm": 0.7304782642194491,
      "learning_rate": 5e-06,
      "loss": 0.6737,
      "step": 180
    },
    {
      "epoch": 0.44957113280094646,
      "grad_norm": 0.46280184605813207,
      "learning_rate": 5e-06,
      "loss": 0.6697,
      "step": 190
    },
    {
      "epoch": 0.4732327713694173,
      "grad_norm": 0.7079097684721737,
      "learning_rate": 5e-06,
      "loss": 0.6686,
      "step": 200
    },
    {
      "epoch": 0.4968944099378882,
      "grad_norm": 0.774761573498746,
      "learning_rate": 5e-06,
      "loss": 0.6694,
      "step": 210
    },
    {
      "epoch": 0.5205560485063591,
      "grad_norm": 0.576730626392715,
      "learning_rate": 5e-06,
      "loss": 0.6677,
      "step": 220
    },
    {
      "epoch": 0.54421768707483,
      "grad_norm": 0.5744988270185307,
      "learning_rate": 5e-06,
      "loss": 0.6602,
      "step": 230
    },
    {
      "epoch": 0.5678793256433008,
      "grad_norm": 0.5394481930250411,
      "learning_rate": 5e-06,
      "loss": 0.6644,
      "step": 240
    },
    {
      "epoch": 0.5915409642117717,
      "grad_norm": 0.5182952984171931,
      "learning_rate": 5e-06,
      "loss": 0.6615,
      "step": 250
    },
    {
      "epoch": 0.6152026027802425,
      "grad_norm": 0.6364320156443367,
      "learning_rate": 5e-06,
      "loss": 0.6519,
      "step": 260
    },
    {
      "epoch": 0.6388642413487134,
      "grad_norm": 0.6324207034276161,
      "learning_rate": 5e-06,
      "loss": 0.6639,
      "step": 270
    },
    {
      "epoch": 0.6625258799171843,
      "grad_norm": 0.6620182705762153,
      "learning_rate": 5e-06,
      "loss": 0.6651,
      "step": 280
    },
    {
      "epoch": 0.6861875184856552,
      "grad_norm": 0.46128169756980925,
      "learning_rate": 5e-06,
      "loss": 0.6596,
      "step": 290
    },
    {
      "epoch": 0.709849157054126,
      "grad_norm": 0.622188372470794,
      "learning_rate": 5e-06,
      "loss": 0.6534,
      "step": 300
    },
    {
      "epoch": 0.7335107956225969,
      "grad_norm": 0.4904698615453566,
      "learning_rate": 5e-06,
      "loss": 0.6618,
      "step": 310
    },
    {
      "epoch": 0.7571724341910677,
      "grad_norm": 0.4555806118897353,
      "learning_rate": 5e-06,
      "loss": 0.6554,
      "step": 320
    },
    {
      "epoch": 0.7808340727595386,
      "grad_norm": 0.5273034701797177,
      "learning_rate": 5e-06,
      "loss": 0.654,
      "step": 330
    },
    {
      "epoch": 0.8044957113280095,
      "grad_norm": 0.5442233535066454,
      "learning_rate": 5e-06,
      "loss": 0.6537,
      "step": 340
    },
    {
      "epoch": 0.8281573498964804,
      "grad_norm": 0.6380409398524519,
      "learning_rate": 5e-06,
      "loss": 0.6601,
      "step": 350
    },
    {
      "epoch": 0.8518189884649512,
      "grad_norm": 0.4389996927828098,
      "learning_rate": 5e-06,
      "loss": 0.6537,
      "step": 360
    },
    {
      "epoch": 0.8754806270334221,
      "grad_norm": 0.4608268531740333,
      "learning_rate": 5e-06,
      "loss": 0.6565,
      "step": 370
    },
    {
      "epoch": 0.8991422656018929,
      "grad_norm": 0.5330723429667825,
      "learning_rate": 5e-06,
      "loss": 0.6477,
      "step": 380
    },
    {
      "epoch": 0.9228039041703638,
      "grad_norm": 0.5929849990200475,
      "learning_rate": 5e-06,
      "loss": 0.6552,
      "step": 390
    },
    {
      "epoch": 0.9464655427388347,
      "grad_norm": 0.4773172047297779,
      "learning_rate": 5e-06,
      "loss": 0.6464,
      "step": 400
    },
    {
      "epoch": 0.9701271813073056,
      "grad_norm": 0.4606137860127268,
      "learning_rate": 5e-06,
      "loss": 0.6489,
      "step": 410
    },
    {
      "epoch": 0.9937888198757764,
      "grad_norm": 0.526120099445913,
      "learning_rate": 5e-06,
      "loss": 0.6478,
      "step": 420
    },
    {
      "epoch": 0.9985211475894705,
      "eval_loss": 0.6501929759979248,
      "eval_runtime": 449.6535,
      "eval_samples_per_second": 25.328,
      "eval_steps_per_second": 0.396,
      "step": 422
    },
    {
      "epoch": 1.0177462289263532,
      "grad_norm": 0.5143362353922324,
      "learning_rate": 5e-06,
      "loss": 0.6515,
      "step": 430
    },
    {
      "epoch": 1.041407867494824,
      "grad_norm": 0.5162162401792869,
      "learning_rate": 5e-06,
      "loss": 0.605,
      "step": 440
    },
    {
      "epoch": 1.0650695060632949,
      "grad_norm": 0.7393357078452915,
      "learning_rate": 5e-06,
      "loss": 0.603,
      "step": 450
    },
    {
      "epoch": 1.0887311446317658,
      "grad_norm": 0.649426932177774,
      "learning_rate": 5e-06,
      "loss": 0.6134,
      "step": 460
    },
    {
      "epoch": 1.1123927832002367,
      "grad_norm": 0.5705639188659947,
      "learning_rate": 5e-06,
      "loss": 0.6106,
      "step": 470
    },
    {
      "epoch": 1.1360544217687074,
      "grad_norm": 0.7543562567579628,
      "learning_rate": 5e-06,
      "loss": 0.611,
      "step": 480
    },
    {
      "epoch": 1.1597160603371783,
      "grad_norm": 0.5499597181388575,
      "learning_rate": 5e-06,
      "loss": 0.6079,
      "step": 490
    },
    {
      "epoch": 1.1833776989056493,
      "grad_norm": 0.5262121393467482,
      "learning_rate": 5e-06,
      "loss": 0.6036,
      "step": 500
    },
    {
      "epoch": 1.2070393374741202,
      "grad_norm": 0.5683114548160128,
      "learning_rate": 5e-06,
      "loss": 0.6034,
      "step": 510
    },
    {
      "epoch": 1.2307009760425909,
      "grad_norm": 0.6610172663362014,
      "learning_rate": 5e-06,
      "loss": 0.6099,
      "step": 520
    },
    {
      "epoch": 1.2543626146110618,
      "grad_norm": 0.6007955010537178,
      "learning_rate": 5e-06,
      "loss": 0.6125,
      "step": 530
    },
    {
      "epoch": 1.2780242531795327,
      "grad_norm": 0.5585264375543114,
      "learning_rate": 5e-06,
      "loss": 0.6121,
      "step": 540
    },
    {
      "epoch": 1.3016858917480034,
      "grad_norm": 0.4689366084615487,
      "learning_rate": 5e-06,
      "loss": 0.6089,
      "step": 550
    },
    {
      "epoch": 1.3253475303164743,
      "grad_norm": 0.443719906754886,
      "learning_rate": 5e-06,
      "loss": 0.6073,
      "step": 560
    },
    {
      "epoch": 1.3490091688849453,
      "grad_norm": 0.8624897115990705,
      "learning_rate": 5e-06,
      "loss": 0.6084,
      "step": 570
    },
    {
      "epoch": 1.3726708074534162,
      "grad_norm": 0.5498793437391156,
      "learning_rate": 5e-06,
      "loss": 0.611,
      "step": 580
    },
    {
      "epoch": 1.396332446021887,
      "grad_norm": 0.44457160894446396,
      "learning_rate": 5e-06,
      "loss": 0.6115,
      "step": 590
    },
    {
      "epoch": 1.4199940845903578,
      "grad_norm": 0.5196837986130378,
      "learning_rate": 5e-06,
      "loss": 0.6008,
      "step": 600
    },
    {
      "epoch": 1.4436557231588287,
      "grad_norm": 0.40806642647037533,
      "learning_rate": 5e-06,
      "loss": 0.6002,
      "step": 610
    },
    {
      "epoch": 1.4673173617272997,
      "grad_norm": 0.449778520265882,
      "learning_rate": 5e-06,
      "loss": 0.6037,
      "step": 620
    },
    {
      "epoch": 1.4909790002957704,
      "grad_norm": 0.46760792115141014,
      "learning_rate": 5e-06,
      "loss": 0.6157,
      "step": 630
    },
    {
      "epoch": 1.5146406388642415,
      "grad_norm": 0.4490152450206069,
      "learning_rate": 5e-06,
      "loss": 0.6101,
      "step": 640
    },
    {
      "epoch": 1.5383022774327122,
      "grad_norm": 0.42442779950583953,
      "learning_rate": 5e-06,
      "loss": 0.6042,
      "step": 650
    },
    {
      "epoch": 1.5619639160011831,
      "grad_norm": 0.5976128445381751,
      "learning_rate": 5e-06,
      "loss": 0.609,
      "step": 660
    },
    {
      "epoch": 1.585625554569654,
      "grad_norm": 0.7381067199080075,
      "learning_rate": 5e-06,
      "loss": 0.6015,
      "step": 670
    },
    {
      "epoch": 1.6092871931381247,
      "grad_norm": 0.4692365896477618,
      "learning_rate": 5e-06,
      "loss": 0.6098,
      "step": 680
    },
    {
      "epoch": 1.6329488317065957,
      "grad_norm": 0.5475052095467955,
      "learning_rate": 5e-06,
      "loss": 0.601,
      "step": 690
    },
    {
      "epoch": 1.6566104702750666,
      "grad_norm": 0.5706027825471482,
      "learning_rate": 5e-06,
      "loss": 0.6107,
      "step": 700
    },
    {
      "epoch": 1.6802721088435373,
      "grad_norm": 0.5270197331562642,
      "learning_rate": 5e-06,
      "loss": 0.609,
      "step": 710
    },
    {
      "epoch": 1.7039337474120084,
      "grad_norm": 0.6598391343305342,
      "learning_rate": 5e-06,
      "loss": 0.6118,
      "step": 720
    },
    {
      "epoch": 1.7275953859804791,
      "grad_norm": 0.5570434796027114,
      "learning_rate": 5e-06,
      "loss": 0.6116,
      "step": 730
    },
    {
      "epoch": 1.75125702454895,
      "grad_norm": 0.4955844130516369,
      "learning_rate": 5e-06,
      "loss": 0.6039,
      "step": 740
    },
    {
      "epoch": 1.774918663117421,
      "grad_norm": 0.47770168087128073,
      "learning_rate": 5e-06,
      "loss": 0.6101,
      "step": 750
    },
    {
      "epoch": 1.7985803016858917,
      "grad_norm": 0.4667370666965365,
      "learning_rate": 5e-06,
      "loss": 0.614,
      "step": 760
    },
    {
      "epoch": 1.8222419402543626,
      "grad_norm": 0.4616819567056668,
      "learning_rate": 5e-06,
      "loss": 0.6158,
      "step": 770
    },
    {
      "epoch": 1.8459035788228335,
      "grad_norm": 0.43467879051005953,
      "learning_rate": 5e-06,
      "loss": 0.6067,
      "step": 780
    },
    {
      "epoch": 1.8695652173913042,
      "grad_norm": 0.48362881437134725,
      "learning_rate": 5e-06,
      "loss": 0.6054,
      "step": 790
    },
    {
      "epoch": 1.8932268559597754,
      "grad_norm": 0.49747648081112666,
      "learning_rate": 5e-06,
      "loss": 0.6137,
      "step": 800
    },
    {
      "epoch": 1.916888494528246,
      "grad_norm": 0.4097820122920606,
      "learning_rate": 5e-06,
      "loss": 0.6114,
      "step": 810
    },
    {
      "epoch": 1.940550133096717,
      "grad_norm": 0.47535675742314604,
      "learning_rate": 5e-06,
      "loss": 0.5996,
      "step": 820
    },
    {
      "epoch": 1.964211771665188,
      "grad_norm": 0.49949616004506914,
      "learning_rate": 5e-06,
      "loss": 0.6108,
      "step": 830
    },
    {
      "epoch": 1.9878734102336586,
      "grad_norm": 0.4387152081138621,
      "learning_rate": 5e-06,
      "loss": 0.5981,
      "step": 840
    },
    {
      "epoch": 1.9973380656610469,
      "eval_loss": 0.6398828029632568,
      "eval_runtime": 449.4321,
      "eval_samples_per_second": 25.341,
      "eval_steps_per_second": 0.396,
      "step": 844
    },
    {
      "epoch": 1.9973380656610469,
      "step": 844,
      "total_flos": 1413522055495680.0,
      "train_loss": 0.645099672378522,
      "train_runtime": 50035.9585,
      "train_samples_per_second": 8.649,
      "train_steps_per_second": 0.017
    }
  ],
  "logging_steps": 10,
  "max_steps": 844,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1413522055495680.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}