{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9956063268892796, "eval_steps": 500, "global_step": 852, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0351493848857645, "grad_norm": 6.865658287926604, "learning_rate": 5e-06, "loss": 1.0224, "step": 10 }, { "epoch": 0.070298769771529, "grad_norm": 1.300129454515723, "learning_rate": 5e-06, "loss": 0.8893, "step": 20 }, { "epoch": 0.1054481546572935, "grad_norm": 1.662141333626877, "learning_rate": 5e-06, "loss": 0.8461, "step": 30 }, { "epoch": 0.140597539543058, "grad_norm": 1.1867317674423972, "learning_rate": 5e-06, "loss": 0.8221, "step": 40 }, { "epoch": 0.1757469244288225, "grad_norm": 1.3406663541941202, "learning_rate": 5e-06, "loss": 0.8082, "step": 50 }, { "epoch": 0.210896309314587, "grad_norm": 0.9748883756928828, "learning_rate": 5e-06, "loss": 0.7926, "step": 60 }, { "epoch": 0.2460456942003515, "grad_norm": 0.9049088495136057, "learning_rate": 5e-06, "loss": 0.7786, "step": 70 }, { "epoch": 0.281195079086116, "grad_norm": 0.7756246074861972, "learning_rate": 5e-06, "loss": 0.7672, "step": 80 }, { "epoch": 0.3163444639718805, "grad_norm": 0.6803113923201367, "learning_rate": 5e-06, "loss": 0.7666, "step": 90 }, { "epoch": 0.351493848857645, "grad_norm": 0.7087390396178761, "learning_rate": 5e-06, "loss": 0.7626, "step": 100 }, { "epoch": 0.3866432337434095, "grad_norm": 0.6904558645177432, "learning_rate": 5e-06, "loss": 0.7558, "step": 110 }, { "epoch": 0.421792618629174, "grad_norm": 0.5406381121382178, "learning_rate": 5e-06, "loss": 0.751, "step": 120 }, { "epoch": 0.45694200351493847, "grad_norm": 0.7263515515258443, "learning_rate": 5e-06, "loss": 0.7541, "step": 130 }, { "epoch": 0.492091388400703, "grad_norm": 0.6381225954297634, "learning_rate": 5e-06, "loss": 0.7507, "step": 140 }, { "epoch": 0.5272407732864675, "grad_norm": 0.9282926398227679, "learning_rate": 5e-06, "loss": 0.7425, "step": 150 }, { "epoch": 0.562390158172232, "grad_norm": 0.703837658050583, "learning_rate": 5e-06, "loss": 0.7473, "step": 160 }, { "epoch": 0.5975395430579965, "grad_norm": 0.7762432087380096, "learning_rate": 5e-06, "loss": 0.7408, "step": 170 }, { "epoch": 0.632688927943761, "grad_norm": 0.5947662547404722, "learning_rate": 5e-06, "loss": 0.7347, "step": 180 }, { "epoch": 0.6678383128295254, "grad_norm": 0.628944705791063, "learning_rate": 5e-06, "loss": 0.7406, "step": 190 }, { "epoch": 0.70298769771529, "grad_norm": 0.5977406811055224, "learning_rate": 5e-06, "loss": 0.7347, "step": 200 }, { "epoch": 0.7381370826010545, "grad_norm": 0.582107769314153, "learning_rate": 5e-06, "loss": 0.737, "step": 210 }, { "epoch": 0.773286467486819, "grad_norm": 0.6326552735959291, "learning_rate": 5e-06, "loss": 0.7328, "step": 220 }, { "epoch": 0.8084358523725835, "grad_norm": 0.6637547116847639, "learning_rate": 5e-06, "loss": 0.7311, "step": 230 }, { "epoch": 0.843585237258348, "grad_norm": 0.6997143410926964, "learning_rate": 5e-06, "loss": 0.7341, "step": 240 }, { "epoch": 0.8787346221441125, "grad_norm": 0.6162729226466245, "learning_rate": 5e-06, "loss": 0.7332, "step": 250 }, { "epoch": 0.9138840070298769, "grad_norm": 0.6199166403621413, "learning_rate": 5e-06, "loss": 0.7262, "step": 260 }, { "epoch": 0.9490333919156415, "grad_norm": 0.6034966296550427, "learning_rate": 5e-06, "loss": 0.729, "step": 270 }, { "epoch": 0.984182776801406, "grad_norm": 0.6195682554180708, "learning_rate": 5e-06, "loss": 0.7264, "step": 280 }, { "epoch": 0.9982425307557118, "eval_loss": 0.721891462802887, "eval_runtime": 302.9942, "eval_samples_per_second": 25.304, "eval_steps_per_second": 0.396, "step": 284 }, { "epoch": 1.0197715289982425, "grad_norm": 0.6617437136575776, "learning_rate": 5e-06, "loss": 0.7503, "step": 290 }, { "epoch": 1.054920913884007, "grad_norm": 0.596469668606961, "learning_rate": 5e-06, "loss": 0.6752, "step": 300 }, { "epoch": 1.0900702987697715, "grad_norm": 0.7286257897811691, "learning_rate": 5e-06, "loss": 0.6745, "step": 310 }, { "epoch": 1.1252196836555362, "grad_norm": 0.6703518701287363, "learning_rate": 5e-06, "loss": 0.6772, "step": 320 }, { "epoch": 1.1603690685413006, "grad_norm": 0.6678193952959378, "learning_rate": 5e-06, "loss": 0.6757, "step": 330 }, { "epoch": 1.195518453427065, "grad_norm": 0.6272061731880971, "learning_rate": 5e-06, "loss": 0.6772, "step": 340 }, { "epoch": 1.2306678383128296, "grad_norm": 0.5618776589312474, "learning_rate": 5e-06, "loss": 0.6767, "step": 350 }, { "epoch": 1.265817223198594, "grad_norm": 0.656461597570214, "learning_rate": 5e-06, "loss": 0.6742, "step": 360 }, { "epoch": 1.3009666080843585, "grad_norm": 0.95088298783439, "learning_rate": 5e-06, "loss": 0.6803, "step": 370 }, { "epoch": 1.336115992970123, "grad_norm": 0.6256062888068228, "learning_rate": 5e-06, "loss": 0.6789, "step": 380 }, { "epoch": 1.3712653778558876, "grad_norm": 0.5178529345876333, "learning_rate": 5e-06, "loss": 0.6757, "step": 390 }, { "epoch": 1.406414762741652, "grad_norm": 0.6633111117626306, "learning_rate": 5e-06, "loss": 0.6786, "step": 400 }, { "epoch": 1.4415641476274166, "grad_norm": 0.5753214727933854, "learning_rate": 5e-06, "loss": 0.6686, "step": 410 }, { "epoch": 1.476713532513181, "grad_norm": 0.7023169996268164, "learning_rate": 5e-06, "loss": 0.674, "step": 420 }, { "epoch": 1.5118629173989455, "grad_norm": 0.601050736097527, "learning_rate": 5e-06, "loss": 0.676, "step": 430 }, { "epoch": 1.54701230228471, "grad_norm": 0.6375081303020413, "learning_rate": 5e-06, "loss": 0.6779, "step": 440 }, { "epoch": 1.5821616871704745, "grad_norm": 0.6076189491485879, "learning_rate": 5e-06, "loss": 0.6811, "step": 450 }, { "epoch": 1.6173110720562391, "grad_norm": 0.6123755131309624, "learning_rate": 5e-06, "loss": 0.6749, "step": 460 }, { "epoch": 1.6524604569420034, "grad_norm": 0.5847478738087437, "learning_rate": 5e-06, "loss": 0.6747, "step": 470 }, { "epoch": 1.687609841827768, "grad_norm": 0.6041574755100807, "learning_rate": 5e-06, "loss": 0.6681, "step": 480 }, { "epoch": 1.7227592267135325, "grad_norm": 0.6972984159432736, "learning_rate": 5e-06, "loss": 0.6686, "step": 490 }, { "epoch": 1.757908611599297, "grad_norm": 0.592332789109461, "learning_rate": 5e-06, "loss": 0.6763, "step": 500 }, { "epoch": 1.7930579964850615, "grad_norm": 0.7081266254056617, "learning_rate": 5e-06, "loss": 0.6707, "step": 510 }, { "epoch": 1.828207381370826, "grad_norm": 0.5655551892586738, "learning_rate": 5e-06, "loss": 0.675, "step": 520 }, { "epoch": 1.8633567662565906, "grad_norm": 0.5912936045849521, "learning_rate": 5e-06, "loss": 0.6746, "step": 530 }, { "epoch": 1.8985061511423549, "grad_norm": 0.6418543236430647, "learning_rate": 5e-06, "loss": 0.6743, "step": 540 }, { "epoch": 1.9336555360281196, "grad_norm": 0.8406203952305934, "learning_rate": 5e-06, "loss": 0.6777, "step": 550 }, { "epoch": 1.968804920913884, "grad_norm": 0.6740623987469322, "learning_rate": 5e-06, "loss": 0.6783, "step": 560 }, { "epoch": 1.9969244288224957, "eval_loss": 0.7088373899459839, "eval_runtime": 302.5633, "eval_samples_per_second": 25.34, "eval_steps_per_second": 0.397, "step": 568 }, { "epoch": 2.0043936731107204, "grad_norm": 0.8982450142296012, "learning_rate": 5e-06, "loss": 0.7079, "step": 570 }, { "epoch": 2.039543057996485, "grad_norm": 0.9691008221084222, "learning_rate": 5e-06, "loss": 0.6203, "step": 580 }, { "epoch": 2.0746924428822497, "grad_norm": 0.6447824341633516, "learning_rate": 5e-06, "loss": 0.6244, "step": 590 }, { "epoch": 2.109841827768014, "grad_norm": 0.7064323342581214, "learning_rate": 5e-06, "loss": 0.6189, "step": 600 }, { "epoch": 2.1449912126537787, "grad_norm": 0.5819596596280016, "learning_rate": 5e-06, "loss": 0.6207, "step": 610 }, { "epoch": 2.180140597539543, "grad_norm": 0.7981926624790863, "learning_rate": 5e-06, "loss": 0.6203, "step": 620 }, { "epoch": 2.2152899824253076, "grad_norm": 0.6853162161955834, "learning_rate": 5e-06, "loss": 0.6281, "step": 630 }, { "epoch": 2.2504393673110723, "grad_norm": 0.6819271490957453, "learning_rate": 5e-06, "loss": 0.6246, "step": 640 }, { "epoch": 2.2855887521968365, "grad_norm": 0.678545369804577, "learning_rate": 5e-06, "loss": 0.6277, "step": 650 }, { "epoch": 2.3207381370826012, "grad_norm": 0.6597702524075268, "learning_rate": 5e-06, "loss": 0.631, "step": 660 }, { "epoch": 2.3558875219683655, "grad_norm": 0.5352899370053985, "learning_rate": 5e-06, "loss": 0.627, "step": 670 }, { "epoch": 2.39103690685413, "grad_norm": 0.575976735916134, "learning_rate": 5e-06, "loss": 0.6252, "step": 680 }, { "epoch": 2.4261862917398944, "grad_norm": 0.6538224434833726, "learning_rate": 5e-06, "loss": 0.631, "step": 690 }, { "epoch": 2.461335676625659, "grad_norm": 0.8281376962806699, "learning_rate": 5e-06, "loss": 0.6238, "step": 700 }, { "epoch": 2.4964850615114234, "grad_norm": 0.5971561231648772, "learning_rate": 5e-06, "loss": 0.6244, "step": 710 }, { "epoch": 2.531634446397188, "grad_norm": 0.5668390272889466, "learning_rate": 5e-06, "loss": 0.6254, "step": 720 }, { "epoch": 2.5667838312829527, "grad_norm": 0.7378544776528181, "learning_rate": 5e-06, "loss": 0.6248, "step": 730 }, { "epoch": 2.601933216168717, "grad_norm": 0.6067368368819991, "learning_rate": 5e-06, "loss": 0.6256, "step": 740 }, { "epoch": 2.6370826010544817, "grad_norm": 0.6816545127839443, "learning_rate": 5e-06, "loss": 0.6286, "step": 750 }, { "epoch": 2.672231985940246, "grad_norm": 0.787032141068753, "learning_rate": 5e-06, "loss": 0.628, "step": 760 }, { "epoch": 2.7073813708260106, "grad_norm": 0.6393338928189319, "learning_rate": 5e-06, "loss": 0.6267, "step": 770 }, { "epoch": 2.7425307557117753, "grad_norm": 0.5562264277034894, "learning_rate": 5e-06, "loss": 0.6261, "step": 780 }, { "epoch": 2.7776801405975395, "grad_norm": 0.5896436524802737, "learning_rate": 5e-06, "loss": 0.6256, "step": 790 }, { "epoch": 2.812829525483304, "grad_norm": 0.5828475505687344, "learning_rate": 5e-06, "loss": 0.6247, "step": 800 }, { "epoch": 2.8479789103690685, "grad_norm": 0.634394806473084, "learning_rate": 5e-06, "loss": 0.6269, "step": 810 }, { "epoch": 2.883128295254833, "grad_norm": 0.6117384621451529, "learning_rate": 5e-06, "loss": 0.6279, "step": 820 }, { "epoch": 2.9182776801405974, "grad_norm": 0.5540272640106404, "learning_rate": 5e-06, "loss": 0.6212, "step": 830 }, { "epoch": 2.953427065026362, "grad_norm": 0.5600169828318418, "learning_rate": 5e-06, "loss": 0.6282, "step": 840 }, { "epoch": 2.9885764499121263, "grad_norm": 0.7592332443324643, "learning_rate": 5e-06, "loss": 0.6277, "step": 850 }, { "epoch": 2.9956063268892796, "eval_loss": 0.7103263735771179, "eval_runtime": 302.7761, "eval_samples_per_second": 25.322, "eval_steps_per_second": 0.396, "step": 852 }, { "epoch": 2.9956063268892796, "step": 852, "total_flos": 1426922353459200.0, "train_loss": 0.6913109551852857, "train_runtime": 50624.2334, "train_samples_per_second": 8.632, "train_steps_per_second": 0.017 } ], "logging_steps": 10, "max_steps": 852, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1426922353459200.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }