{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9971469329529246, "eval_steps": 500, "global_step": 525, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05706134094151213, "grad_norm": 14.191660560442434, "learning_rate": 5e-06, "loss": 1.1357, "step": 10 }, { "epoch": 0.11412268188302425, "grad_norm": 3.2725072406272364, "learning_rate": 5e-06, "loss": 0.9928, "step": 20 }, { "epoch": 0.17118402282453637, "grad_norm": 4.495391053372432, "learning_rate": 5e-06, "loss": 0.9322, "step": 30 }, { "epoch": 0.2282453637660485, "grad_norm": 5.085341885044574, "learning_rate": 5e-06, "loss": 0.8899, "step": 40 }, { "epoch": 0.28530670470756064, "grad_norm": 4.5927160200176855, "learning_rate": 5e-06, "loss": 0.8676, "step": 50 }, { "epoch": 0.34236804564907275, "grad_norm": 1.6911849555640204, "learning_rate": 5e-06, "loss": 0.8394, "step": 60 }, { "epoch": 0.39942938659058486, "grad_norm": 1.466261010867929, "learning_rate": 5e-06, "loss": 0.8166, "step": 70 }, { "epoch": 0.456490727532097, "grad_norm": 1.5136151777383209, "learning_rate": 5e-06, "loss": 0.8095, "step": 80 }, { "epoch": 0.5135520684736091, "grad_norm": 1.2618165735232854, "learning_rate": 5e-06, "loss": 0.8023, "step": 90 }, { "epoch": 0.5706134094151213, "grad_norm": 1.6421126077036754, "learning_rate": 5e-06, "loss": 0.7912, "step": 100 }, { "epoch": 0.6276747503566333, "grad_norm": 0.9444257126116525, "learning_rate": 5e-06, "loss": 0.7844, "step": 110 }, { "epoch": 0.6847360912981455, "grad_norm": 1.0658378058753986, "learning_rate": 5e-06, "loss": 0.7821, "step": 120 }, { "epoch": 0.7417974322396577, "grad_norm": 1.0219994955594212, "learning_rate": 5e-06, "loss": 0.7761, "step": 130 }, { "epoch": 0.7988587731811697, "grad_norm": 1.0628427150837625, "learning_rate": 5e-06, "loss": 0.7747, "step": 140 }, { "epoch": 0.8559201141226819, "grad_norm": 0.9619954966621683, "learning_rate": 5e-06, "loss": 0.7722, "step": 150 }, { "epoch": 0.912981455064194, "grad_norm": 0.8603887735435021, "learning_rate": 5e-06, "loss": 0.7698, "step": 160 }, { "epoch": 0.9700427960057061, "grad_norm": 0.8416466459767842, "learning_rate": 5e-06, "loss": 0.7665, "step": 170 }, { "epoch": 0.9985734664764622, "eval_loss": 0.7644989490509033, "eval_runtime": 121.9669, "eval_samples_per_second": 38.707, "eval_steps_per_second": 0.607, "step": 175 }, { "epoch": 1.027817403708987, "grad_norm": 0.8822220105180665, "learning_rate": 5e-06, "loss": 0.7951, "step": 180 }, { "epoch": 1.0848787446504993, "grad_norm": 0.8529113410206094, "learning_rate": 5e-06, "loss": 0.7077, "step": 190 }, { "epoch": 1.1419400855920114, "grad_norm": 1.6886404017331722, "learning_rate": 5e-06, "loss": 0.7039, "step": 200 }, { "epoch": 1.1990014265335236, "grad_norm": 1.5586898240952252, "learning_rate": 5e-06, "loss": 0.7018, "step": 210 }, { "epoch": 1.2560627674750355, "grad_norm": 1.0565182550630585, "learning_rate": 5e-06, "loss": 0.6986, "step": 220 }, { "epoch": 1.313124108416548, "grad_norm": 0.8847228395407172, "learning_rate": 5e-06, "loss": 0.7016, "step": 230 }, { "epoch": 1.3701854493580599, "grad_norm": 0.7426270405443446, "learning_rate": 5e-06, "loss": 0.7005, "step": 240 }, { "epoch": 1.427246790299572, "grad_norm": 1.031506144419473, "learning_rate": 5e-06, "loss": 0.7047, "step": 250 }, { "epoch": 1.4843081312410842, "grad_norm": 0.8734427470796896, "learning_rate": 5e-06, "loss": 0.7011, "step": 260 }, { "epoch": 1.5413694721825963, "grad_norm": 1.057801790522887, "learning_rate": 5e-06, "loss": 0.6988, "step": 270 }, { "epoch": 1.5984308131241085, "grad_norm": 0.8415840230824246, "learning_rate": 5e-06, "loss": 0.6987, "step": 280 }, { "epoch": 1.6554921540656204, "grad_norm": 0.827701826950944, "learning_rate": 5e-06, "loss": 0.7045, "step": 290 }, { "epoch": 1.7125534950071328, "grad_norm": 0.7740671969466235, "learning_rate": 5e-06, "loss": 0.6981, "step": 300 }, { "epoch": 1.7696148359486448, "grad_norm": 0.8907728175194645, "learning_rate": 5e-06, "loss": 0.7043, "step": 310 }, { "epoch": 1.826676176890157, "grad_norm": 0.7591532846679572, "learning_rate": 5e-06, "loss": 0.7002, "step": 320 }, { "epoch": 1.883737517831669, "grad_norm": 1.037223455268037, "learning_rate": 5e-06, "loss": 0.7029, "step": 330 }, { "epoch": 1.940798858773181, "grad_norm": 0.9095266922636082, "learning_rate": 5e-06, "loss": 0.7038, "step": 340 }, { "epoch": 1.9978601997146934, "grad_norm": 0.8437114369311308, "learning_rate": 5e-06, "loss": 0.6988, "step": 350 }, { "epoch": 1.9978601997146934, "eval_loss": 0.752321720123291, "eval_runtime": 119.2436, "eval_samples_per_second": 39.591, "eval_steps_per_second": 0.621, "step": 350 }, { "epoch": 2.055634807417974, "grad_norm": 0.7992701008382876, "learning_rate": 5e-06, "loss": 0.697, "step": 360 }, { "epoch": 2.1126961483594866, "grad_norm": 0.9009096072867547, "learning_rate": 5e-06, "loss": 0.6309, "step": 370 }, { "epoch": 2.1697574893009985, "grad_norm": 0.9335524571114746, "learning_rate": 5e-06, "loss": 0.6301, "step": 380 }, { "epoch": 2.226818830242511, "grad_norm": 0.9107893164568975, "learning_rate": 5e-06, "loss": 0.6274, "step": 390 }, { "epoch": 2.283880171184023, "grad_norm": 0.8275181427943094, "learning_rate": 5e-06, "loss": 0.631, "step": 400 }, { "epoch": 2.340941512125535, "grad_norm": 0.7487796908821762, "learning_rate": 5e-06, "loss": 0.6321, "step": 410 }, { "epoch": 2.398002853067047, "grad_norm": 1.1741881512767371, "learning_rate": 5e-06, "loss": 0.6352, "step": 420 }, { "epoch": 2.455064194008559, "grad_norm": 0.9540225993983626, "learning_rate": 5e-06, "loss": 0.6356, "step": 430 }, { "epoch": 2.512125534950071, "grad_norm": 0.9795761392377677, "learning_rate": 5e-06, "loss": 0.635, "step": 440 }, { "epoch": 2.5691868758915835, "grad_norm": 0.7321465764810406, "learning_rate": 5e-06, "loss": 0.6349, "step": 450 }, { "epoch": 2.626248216833096, "grad_norm": 0.8103933471152502, "learning_rate": 5e-06, "loss": 0.6355, "step": 460 }, { "epoch": 2.683309557774608, "grad_norm": 1.0066498655926475, "learning_rate": 5e-06, "loss": 0.6377, "step": 470 }, { "epoch": 2.7403708987161197, "grad_norm": 0.9338836682210027, "learning_rate": 5e-06, "loss": 0.6367, "step": 480 }, { "epoch": 2.797432239657632, "grad_norm": 0.8149026499533006, "learning_rate": 5e-06, "loss": 0.6382, "step": 490 }, { "epoch": 2.854493580599144, "grad_norm": 0.8107310696209651, "learning_rate": 5e-06, "loss": 0.6357, "step": 500 }, { "epoch": 2.911554921540656, "grad_norm": 0.78901956302398, "learning_rate": 5e-06, "loss": 0.6363, "step": 510 }, { "epoch": 2.9686162624821684, "grad_norm": 0.8105645400500955, "learning_rate": 5e-06, "loss": 0.644, "step": 520 }, { "epoch": 2.9971469329529246, "eval_loss": 0.7613628506660461, "eval_runtime": 124.0128, "eval_samples_per_second": 38.069, "eval_steps_per_second": 0.597, "step": 525 }, { "epoch": 2.9971469329529246, "step": 525, "total_flos": 879185174200320.0, "train_loss": 0.7275740219297864, "train_runtime": 16864.3599, "train_samples_per_second": 15.956, "train_steps_per_second": 0.031 } ], "logging_steps": 10, "max_steps": 525, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 879185174200320.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }