{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9956063268892796,
  "eval_steps": 500,
  "global_step": 852,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0351493848857645,
      "grad_norm": 6.865658287926604,
      "learning_rate": 5e-06,
      "loss": 1.0224,
      "step": 10
    },
    {
      "epoch": 0.070298769771529,
      "grad_norm": 1.300129454515723,
      "learning_rate": 5e-06,
      "loss": 0.8893,
      "step": 20
    },
    {
      "epoch": 0.1054481546572935,
      "grad_norm": 1.662141333626877,
      "learning_rate": 5e-06,
      "loss": 0.8461,
      "step": 30
    },
    {
      "epoch": 0.140597539543058,
      "grad_norm": 1.1867317674423972,
      "learning_rate": 5e-06,
      "loss": 0.8221,
      "step": 40
    },
    {
      "epoch": 0.1757469244288225,
      "grad_norm": 1.3406663541941202,
      "learning_rate": 5e-06,
      "loss": 0.8082,
      "step": 50
    },
    {
      "epoch": 0.210896309314587,
      "grad_norm": 0.9748883756928828,
      "learning_rate": 5e-06,
      "loss": 0.7926,
      "step": 60
    },
    {
      "epoch": 0.2460456942003515,
      "grad_norm": 0.9049088495136057,
      "learning_rate": 5e-06,
      "loss": 0.7786,
      "step": 70
    },
    {
      "epoch": 0.281195079086116,
      "grad_norm": 0.7756246074861972,
      "learning_rate": 5e-06,
      "loss": 0.7672,
      "step": 80
    },
    {
      "epoch": 0.3163444639718805,
      "grad_norm": 0.6803113923201367,
      "learning_rate": 5e-06,
      "loss": 0.7666,
      "step": 90
    },
    {
      "epoch": 0.351493848857645,
      "grad_norm": 0.7087390396178761,
      "learning_rate": 5e-06,
      "loss": 0.7626,
      "step": 100
    },
    {
      "epoch": 0.3866432337434095,
      "grad_norm": 0.6904558645177432,
      "learning_rate": 5e-06,
      "loss": 0.7558,
      "step": 110
    },
    {
      "epoch": 0.421792618629174,
      "grad_norm": 0.5406381121382178,
      "learning_rate": 5e-06,
      "loss": 0.751,
      "step": 120
    },
    {
      "epoch": 0.45694200351493847,
      "grad_norm": 0.7263515515258443,
      "learning_rate": 5e-06,
      "loss": 0.7541,
      "step": 130
    },
    {
      "epoch": 0.492091388400703,
      "grad_norm": 0.6381225954297634,
      "learning_rate": 5e-06,
      "loss": 0.7507,
      "step": 140
    },
    {
      "epoch": 0.5272407732864675,
      "grad_norm": 0.9282926398227679,
      "learning_rate": 5e-06,
      "loss": 0.7425,
      "step": 150
    },
    {
      "epoch": 0.562390158172232,
      "grad_norm": 0.703837658050583,
      "learning_rate": 5e-06,
      "loss": 0.7473,
      "step": 160
    },
    {
      "epoch": 0.5975395430579965,
      "grad_norm": 0.7762432087380096,
      "learning_rate": 5e-06,
      "loss": 0.7408,
      "step": 170
    },
    {
      "epoch": 0.632688927943761,
      "grad_norm": 0.5947662547404722,
      "learning_rate": 5e-06,
      "loss": 0.7347,
      "step": 180
    },
    {
      "epoch": 0.6678383128295254,
      "grad_norm": 0.628944705791063,
      "learning_rate": 5e-06,
      "loss": 0.7406,
      "step": 190
    },
    {
      "epoch": 0.70298769771529,
      "grad_norm": 0.5977406811055224,
      "learning_rate": 5e-06,
      "loss": 0.7347,
      "step": 200
    },
    {
      "epoch": 0.7381370826010545,
      "grad_norm": 0.582107769314153,
      "learning_rate": 5e-06,
      "loss": 0.737,
      "step": 210
    },
    {
      "epoch": 0.773286467486819,
      "grad_norm": 0.6326552735959291,
      "learning_rate": 5e-06,
      "loss": 0.7328,
      "step": 220
    },
    {
      "epoch": 0.8084358523725835,
      "grad_norm": 0.6637547116847639,
      "learning_rate": 5e-06,
      "loss": 0.7311,
      "step": 230
    },
    {
      "epoch": 0.843585237258348,
      "grad_norm": 0.6997143410926964,
      "learning_rate": 5e-06,
      "loss": 0.7341,
      "step": 240
    },
    {
      "epoch": 0.8787346221441125,
      "grad_norm": 0.6162729226466245,
      "learning_rate": 5e-06,
      "loss": 0.7332,
      "step": 250
    },
    {
      "epoch": 0.9138840070298769,
      "grad_norm": 0.6199166403621413,
      "learning_rate": 5e-06,
      "loss": 0.7262,
      "step": 260
    },
    {
      "epoch": 0.9490333919156415,
      "grad_norm": 0.6034966296550427,
      "learning_rate": 5e-06,
      "loss": 0.729,
      "step": 270
    },
    {
      "epoch": 0.984182776801406,
      "grad_norm": 0.6195682554180708,
      "learning_rate": 5e-06,
      "loss": 0.7264,
      "step": 280
    },
    {
      "epoch": 0.9982425307557118,
      "eval_loss": 0.721891462802887,
      "eval_runtime": 302.9942,
      "eval_samples_per_second": 25.304,
      "eval_steps_per_second": 0.396,
      "step": 284
    },
    {
      "epoch": 1.0197715289982425,
      "grad_norm": 0.6617437136575776,
      "learning_rate": 5e-06,
      "loss": 0.7503,
      "step": 290
    },
    {
      "epoch": 1.054920913884007,
      "grad_norm": 0.596469668606961,
      "learning_rate": 5e-06,
      "loss": 0.6752,
      "step": 300
    },
    {
      "epoch": 1.0900702987697715,
      "grad_norm": 0.7286257897811691,
      "learning_rate": 5e-06,
      "loss": 0.6745,
      "step": 310
    },
    {
      "epoch": 1.1252196836555362,
      "grad_norm": 0.6703518701287363,
      "learning_rate": 5e-06,
      "loss": 0.6772,
      "step": 320
    },
    {
      "epoch": 1.1603690685413006,
      "grad_norm": 0.6678193952959378,
      "learning_rate": 5e-06,
      "loss": 0.6757,
      "step": 330
    },
    {
      "epoch": 1.195518453427065,
      "grad_norm": 0.6272061731880971,
      "learning_rate": 5e-06,
      "loss": 0.6772,
      "step": 340
    },
    {
      "epoch": 1.2306678383128296,
      "grad_norm": 0.5618776589312474,
      "learning_rate": 5e-06,
      "loss": 0.6767,
      "step": 350
    },
    {
      "epoch": 1.265817223198594,
      "grad_norm": 0.656461597570214,
      "learning_rate": 5e-06,
      "loss": 0.6742,
      "step": 360
    },
    {
      "epoch": 1.3009666080843585,
      "grad_norm": 0.95088298783439,
      "learning_rate": 5e-06,
      "loss": 0.6803,
      "step": 370
    },
    {
      "epoch": 1.336115992970123,
      "grad_norm": 0.6256062888068228,
      "learning_rate": 5e-06,
      "loss": 0.6789,
      "step": 380
    },
    {
      "epoch": 1.3712653778558876,
      "grad_norm": 0.5178529345876333,
      "learning_rate": 5e-06,
      "loss": 0.6757,
      "step": 390
    },
    {
      "epoch": 1.406414762741652,
      "grad_norm": 0.6633111117626306,
      "learning_rate": 5e-06,
      "loss": 0.6786,
      "step": 400
    },
    {
      "epoch": 1.4415641476274166,
      "grad_norm": 0.5753214727933854,
      "learning_rate": 5e-06,
      "loss": 0.6686,
      "step": 410
    },
    {
      "epoch": 1.476713532513181,
      "grad_norm": 0.7023169996268164,
      "learning_rate": 5e-06,
      "loss": 0.674,
      "step": 420
    },
    {
      "epoch": 1.5118629173989455,
      "grad_norm": 0.601050736097527,
      "learning_rate": 5e-06,
      "loss": 0.676,
      "step": 430
    },
    {
      "epoch": 1.54701230228471,
      "grad_norm": 0.6375081303020413,
      "learning_rate": 5e-06,
      "loss": 0.6779,
      "step": 440
    },
    {
      "epoch": 1.5821616871704745,
      "grad_norm": 0.6076189491485879,
      "learning_rate": 5e-06,
      "loss": 0.6811,
      "step": 450
    },
    {
      "epoch": 1.6173110720562391,
      "grad_norm": 0.6123755131309624,
      "learning_rate": 5e-06,
      "loss": 0.6749,
      "step": 460
    },
    {
      "epoch": 1.6524604569420034,
      "grad_norm": 0.5847478738087437,
      "learning_rate": 5e-06,
      "loss": 0.6747,
      "step": 470
    },
    {
      "epoch": 1.687609841827768,
      "grad_norm": 0.6041574755100807,
      "learning_rate": 5e-06,
      "loss": 0.6681,
      "step": 480
    },
    {
      "epoch": 1.7227592267135325,
      "grad_norm": 0.6972984159432736,
      "learning_rate": 5e-06,
      "loss": 0.6686,
      "step": 490
    },
    {
      "epoch": 1.757908611599297,
      "grad_norm": 0.592332789109461,
      "learning_rate": 5e-06,
      "loss": 0.6763,
      "step": 500
    },
    {
      "epoch": 1.7930579964850615,
      "grad_norm": 0.7081266254056617,
      "learning_rate": 5e-06,
      "loss": 0.6707,
      "step": 510
    },
    {
      "epoch": 1.828207381370826,
      "grad_norm": 0.5655551892586738,
      "learning_rate": 5e-06,
      "loss": 0.675,
      "step": 520
    },
    {
      "epoch": 1.8633567662565906,
      "grad_norm": 0.5912936045849521,
      "learning_rate": 5e-06,
      "loss": 0.6746,
      "step": 530
    },
    {
      "epoch": 1.8985061511423549,
      "grad_norm": 0.6418543236430647,
      "learning_rate": 5e-06,
      "loss": 0.6743,
      "step": 540
    },
    {
      "epoch": 1.9336555360281196,
      "grad_norm": 0.8406203952305934,
      "learning_rate": 5e-06,
      "loss": 0.6777,
      "step": 550
    },
    {
      "epoch": 1.968804920913884,
      "grad_norm": 0.6740623987469322,
      "learning_rate": 5e-06,
      "loss": 0.6783,
      "step": 560
    },
    {
      "epoch": 1.9969244288224957,
      "eval_loss": 0.7088373899459839,
      "eval_runtime": 302.5633,
      "eval_samples_per_second": 25.34,
      "eval_steps_per_second": 0.397,
      "step": 568
    },
    {
      "epoch": 2.0043936731107204,
      "grad_norm": 0.8982450142296012,
      "learning_rate": 5e-06,
      "loss": 0.7079,
      "step": 570
    },
    {
      "epoch": 2.039543057996485,
      "grad_norm": 0.9691008221084222,
      "learning_rate": 5e-06,
      "loss": 0.6203,
      "step": 580
    },
    {
      "epoch": 2.0746924428822497,
      "grad_norm": 0.6447824341633516,
      "learning_rate": 5e-06,
      "loss": 0.6244,
      "step": 590
    },
    {
      "epoch": 2.109841827768014,
      "grad_norm": 0.7064323342581214,
      "learning_rate": 5e-06,
      "loss": 0.6189,
      "step": 600
    },
    {
      "epoch": 2.1449912126537787,
      "grad_norm": 0.5819596596280016,
      "learning_rate": 5e-06,
      "loss": 0.6207,
      "step": 610
    },
    {
      "epoch": 2.180140597539543,
      "grad_norm": 0.7981926624790863,
      "learning_rate": 5e-06,
      "loss": 0.6203,
      "step": 620
    },
    {
      "epoch": 2.2152899824253076,
      "grad_norm": 0.6853162161955834,
      "learning_rate": 5e-06,
      "loss": 0.6281,
      "step": 630
    },
    {
      "epoch": 2.2504393673110723,
      "grad_norm": 0.6819271490957453,
      "learning_rate": 5e-06,
      "loss": 0.6246,
      "step": 640
    },
    {
      "epoch": 2.2855887521968365,
      "grad_norm": 0.678545369804577,
      "learning_rate": 5e-06,
      "loss": 0.6277,
      "step": 650
    },
    {
      "epoch": 2.3207381370826012,
      "grad_norm": 0.6597702524075268,
      "learning_rate": 5e-06,
      "loss": 0.631,
      "step": 660
    },
    {
      "epoch": 2.3558875219683655,
      "grad_norm": 0.5352899370053985,
      "learning_rate": 5e-06,
      "loss": 0.627,
      "step": 670
    },
    {
      "epoch": 2.39103690685413,
      "grad_norm": 0.575976735916134,
      "learning_rate": 5e-06,
      "loss": 0.6252,
      "step": 680
    },
    {
      "epoch": 2.4261862917398944,
      "grad_norm": 0.6538224434833726,
      "learning_rate": 5e-06,
      "loss": 0.631,
      "step": 690
    },
    {
      "epoch": 2.461335676625659,
      "grad_norm": 0.8281376962806699,
      "learning_rate": 5e-06,
      "loss": 0.6238,
      "step": 700
    },
    {
      "epoch": 2.4964850615114234,
      "grad_norm": 0.5971561231648772,
      "learning_rate": 5e-06,
      "loss": 0.6244,
      "step": 710
    },
    {
      "epoch": 2.531634446397188,
      "grad_norm": 0.5668390272889466,
      "learning_rate": 5e-06,
      "loss": 0.6254,
      "step": 720
    },
    {
      "epoch": 2.5667838312829527,
      "grad_norm": 0.7378544776528181,
      "learning_rate": 5e-06,
      "loss": 0.6248,
      "step": 730
    },
    {
      "epoch": 2.601933216168717,
      "grad_norm": 0.6067368368819991,
      "learning_rate": 5e-06,
      "loss": 0.6256,
      "step": 740
    },
    {
      "epoch": 2.6370826010544817,
      "grad_norm": 0.6816545127839443,
      "learning_rate": 5e-06,
      "loss": 0.6286,
      "step": 750
    },
    {
      "epoch": 2.672231985940246,
      "grad_norm": 0.787032141068753,
      "learning_rate": 5e-06,
      "loss": 0.628,
      "step": 760
    },
    {
      "epoch": 2.7073813708260106,
      "grad_norm": 0.6393338928189319,
      "learning_rate": 5e-06,
      "loss": 0.6267,
      "step": 770
    },
    {
      "epoch": 2.7425307557117753,
      "grad_norm": 0.5562264277034894,
      "learning_rate": 5e-06,
      "loss": 0.6261,
      "step": 780
    },
    {
      "epoch": 2.7776801405975395,
      "grad_norm": 0.5896436524802737,
      "learning_rate": 5e-06,
      "loss": 0.6256,
      "step": 790
    },
    {
      "epoch": 2.812829525483304,
      "grad_norm": 0.5828475505687344,
      "learning_rate": 5e-06,
      "loss": 0.6247,
      "step": 800
    },
    {
      "epoch": 2.8479789103690685,
      "grad_norm": 0.634394806473084,
      "learning_rate": 5e-06,
      "loss": 0.6269,
      "step": 810
    },
    {
      "epoch": 2.883128295254833,
      "grad_norm": 0.6117384621451529,
      "learning_rate": 5e-06,
      "loss": 0.6279,
      "step": 820
    },
    {
      "epoch": 2.9182776801405974,
      "grad_norm": 0.5540272640106404,
      "learning_rate": 5e-06,
      "loss": 0.6212,
      "step": 830
    },
    {
      "epoch": 2.953427065026362,
      "grad_norm": 0.5600169828318418,
      "learning_rate": 5e-06,
      "loss": 0.6282,
      "step": 840
    },
    {
      "epoch": 2.9885764499121263,
      "grad_norm": 0.7592332443324643,
      "learning_rate": 5e-06,
      "loss": 0.6277,
      "step": 850
    },
    {
      "epoch": 2.9956063268892796,
      "eval_loss": 0.7103263735771179,
      "eval_runtime": 302.7761,
      "eval_samples_per_second": 25.322,
      "eval_steps_per_second": 0.396,
      "step": 852
    },
    {
      "epoch": 2.9956063268892796,
      "step": 852,
      "total_flos": 1426922353459200.0,
      "train_loss": 0.6913109551852857,
      "train_runtime": 50624.2334,
      "train_samples_per_second": 8.632,
      "train_steps_per_second": 0.017
    }
  ],
  "logging_steps": 10,
  "max_steps": 852,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1426922353459200.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}