{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9971469329529246,
  "eval_steps": 500,
  "global_step": 525,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.05706134094151213,
      "grad_norm": 14.191660560442434,
      "learning_rate": 5e-06,
      "loss": 1.1357,
      "step": 10
    },
    {
      "epoch": 0.11412268188302425,
      "grad_norm": 3.2725072406272364,
      "learning_rate": 5e-06,
      "loss": 0.9928,
      "step": 20
    },
    {
      "epoch": 0.17118402282453637,
      "grad_norm": 4.495391053372432,
      "learning_rate": 5e-06,
      "loss": 0.9322,
      "step": 30
    },
    {
      "epoch": 0.2282453637660485,
      "grad_norm": 5.085341885044574,
      "learning_rate": 5e-06,
      "loss": 0.8899,
      "step": 40
    },
    {
      "epoch": 0.28530670470756064,
      "grad_norm": 4.5927160200176855,
      "learning_rate": 5e-06,
      "loss": 0.8676,
      "step": 50
    },
    {
      "epoch": 0.34236804564907275,
      "grad_norm": 1.6911849555640204,
      "learning_rate": 5e-06,
      "loss": 0.8394,
      "step": 60
    },
    {
      "epoch": 0.39942938659058486,
      "grad_norm": 1.466261010867929,
      "learning_rate": 5e-06,
      "loss": 0.8166,
      "step": 70
    },
    {
      "epoch": 0.456490727532097,
      "grad_norm": 1.5136151777383209,
      "learning_rate": 5e-06,
      "loss": 0.8095,
      "step": 80
    },
    {
      "epoch": 0.5135520684736091,
      "grad_norm": 1.2618165735232854,
      "learning_rate": 5e-06,
      "loss": 0.8023,
      "step": 90
    },
    {
      "epoch": 0.5706134094151213,
      "grad_norm": 1.6421126077036754,
      "learning_rate": 5e-06,
      "loss": 0.7912,
      "step": 100
    },
    {
      "epoch": 0.6276747503566333,
      "grad_norm": 0.9444257126116525,
      "learning_rate": 5e-06,
      "loss": 0.7844,
      "step": 110
    },
    {
      "epoch": 0.6847360912981455,
      "grad_norm": 1.0658378058753986,
      "learning_rate": 5e-06,
      "loss": 0.7821,
      "step": 120
    },
    {
      "epoch": 0.7417974322396577,
      "grad_norm": 1.0219994955594212,
      "learning_rate": 5e-06,
      "loss": 0.7761,
      "step": 130
    },
    {
      "epoch": 0.7988587731811697,
      "grad_norm": 1.0628427150837625,
      "learning_rate": 5e-06,
      "loss": 0.7747,
      "step": 140
    },
    {
      "epoch": 0.8559201141226819,
      "grad_norm": 0.9619954966621683,
      "learning_rate": 5e-06,
      "loss": 0.7722,
      "step": 150
    },
    {
      "epoch": 0.912981455064194,
      "grad_norm": 0.8603887735435021,
      "learning_rate": 5e-06,
      "loss": 0.7698,
      "step": 160
    },
    {
      "epoch": 0.9700427960057061,
      "grad_norm": 0.8416466459767842,
      "learning_rate": 5e-06,
      "loss": 0.7665,
      "step": 170
    },
    {
      "epoch": 0.9985734664764622,
      "eval_loss": 0.7644989490509033,
      "eval_runtime": 121.9669,
      "eval_samples_per_second": 38.707,
      "eval_steps_per_second": 0.607,
      "step": 175
    },
    {
      "epoch": 1.027817403708987,
      "grad_norm": 0.8822220105180665,
      "learning_rate": 5e-06,
      "loss": 0.7951,
      "step": 180
    },
    {
      "epoch": 1.0848787446504993,
      "grad_norm": 0.8529113410206094,
      "learning_rate": 5e-06,
      "loss": 0.7077,
      "step": 190
    },
    {
      "epoch": 1.1419400855920114,
      "grad_norm": 1.6886404017331722,
      "learning_rate": 5e-06,
      "loss": 0.7039,
      "step": 200
    },
    {
      "epoch": 1.1990014265335236,
      "grad_norm": 1.5586898240952252,
      "learning_rate": 5e-06,
      "loss": 0.7018,
      "step": 210
    },
    {
      "epoch": 1.2560627674750355,
      "grad_norm": 1.0565182550630585,
      "learning_rate": 5e-06,
      "loss": 0.6986,
      "step": 220
    },
    {
      "epoch": 1.313124108416548,
      "grad_norm": 0.8847228395407172,
      "learning_rate": 5e-06,
      "loss": 0.7016,
      "step": 230
    },
    {
      "epoch": 1.3701854493580599,
      "grad_norm": 0.7426270405443446,
      "learning_rate": 5e-06,
      "loss": 0.7005,
      "step": 240
    },
    {
      "epoch": 1.427246790299572,
      "grad_norm": 1.031506144419473,
      "learning_rate": 5e-06,
      "loss": 0.7047,
      "step": 250
    },
    {
      "epoch": 1.4843081312410842,
      "grad_norm": 0.8734427470796896,
      "learning_rate": 5e-06,
      "loss": 0.7011,
      "step": 260
    },
    {
      "epoch": 1.5413694721825963,
      "grad_norm": 1.057801790522887,
      "learning_rate": 5e-06,
      "loss": 0.6988,
      "step": 270
    },
    {
      "epoch": 1.5984308131241085,
      "grad_norm": 0.8415840230824246,
      "learning_rate": 5e-06,
      "loss": 0.6987,
      "step": 280
    },
    {
      "epoch": 1.6554921540656204,
      "grad_norm": 0.827701826950944,
      "learning_rate": 5e-06,
      "loss": 0.7045,
      "step": 290
    },
    {
      "epoch": 1.7125534950071328,
      "grad_norm": 0.7740671969466235,
      "learning_rate": 5e-06,
      "loss": 0.6981,
      "step": 300
    },
    {
      "epoch": 1.7696148359486448,
      "grad_norm": 0.8907728175194645,
      "learning_rate": 5e-06,
      "loss": 0.7043,
      "step": 310
    },
    {
      "epoch": 1.826676176890157,
      "grad_norm": 0.7591532846679572,
      "learning_rate": 5e-06,
      "loss": 0.7002,
      "step": 320
    },
    {
      "epoch": 1.883737517831669,
      "grad_norm": 1.037223455268037,
      "learning_rate": 5e-06,
      "loss": 0.7029,
      "step": 330
    },
    {
      "epoch": 1.940798858773181,
      "grad_norm": 0.9095266922636082,
      "learning_rate": 5e-06,
      "loss": 0.7038,
      "step": 340
    },
    {
      "epoch": 1.9978601997146934,
      "grad_norm": 0.8437114369311308,
      "learning_rate": 5e-06,
      "loss": 0.6988,
      "step": 350
    },
    {
      "epoch": 1.9978601997146934,
      "eval_loss": 0.752321720123291,
      "eval_runtime": 119.2436,
      "eval_samples_per_second": 39.591,
      "eval_steps_per_second": 0.621,
      "step": 350
    },
    {
      "epoch": 2.055634807417974,
      "grad_norm": 0.7992701008382876,
      "learning_rate": 5e-06,
      "loss": 0.697,
      "step": 360
    },
    {
      "epoch": 2.1126961483594866,
      "grad_norm": 0.9009096072867547,
      "learning_rate": 5e-06,
      "loss": 0.6309,
      "step": 370
    },
    {
      "epoch": 2.1697574893009985,
      "grad_norm": 0.9335524571114746,
      "learning_rate": 5e-06,
      "loss": 0.6301,
      "step": 380
    },
    {
      "epoch": 2.226818830242511,
      "grad_norm": 0.9107893164568975,
      "learning_rate": 5e-06,
      "loss": 0.6274,
      "step": 390
    },
    {
      "epoch": 2.283880171184023,
      "grad_norm": 0.8275181427943094,
      "learning_rate": 5e-06,
      "loss": 0.631,
      "step": 400
    },
    {
      "epoch": 2.340941512125535,
      "grad_norm": 0.7487796908821762,
      "learning_rate": 5e-06,
      "loss": 0.6321,
      "step": 410
    },
    {
      "epoch": 2.398002853067047,
      "grad_norm": 1.1741881512767371,
      "learning_rate": 5e-06,
      "loss": 0.6352,
      "step": 420
    },
    {
      "epoch": 2.455064194008559,
      "grad_norm": 0.9540225993983626,
      "learning_rate": 5e-06,
      "loss": 0.6356,
      "step": 430
    },
    {
      "epoch": 2.512125534950071,
      "grad_norm": 0.9795761392377677,
      "learning_rate": 5e-06,
      "loss": 0.635,
      "step": 440
    },
    {
      "epoch": 2.5691868758915835,
      "grad_norm": 0.7321465764810406,
      "learning_rate": 5e-06,
      "loss": 0.6349,
      "step": 450
    },
    {
      "epoch": 2.626248216833096,
      "grad_norm": 0.8103933471152502,
      "learning_rate": 5e-06,
      "loss": 0.6355,
      "step": 460
    },
    {
      "epoch": 2.683309557774608,
      "grad_norm": 1.0066498655926475,
      "learning_rate": 5e-06,
      "loss": 0.6377,
      "step": 470
    },
    {
      "epoch": 2.7403708987161197,
      "grad_norm": 0.9338836682210027,
      "learning_rate": 5e-06,
      "loss": 0.6367,
      "step": 480
    },
    {
      "epoch": 2.797432239657632,
      "grad_norm": 0.8149026499533006,
      "learning_rate": 5e-06,
      "loss": 0.6382,
      "step": 490
    },
    {
      "epoch": 2.854493580599144,
      "grad_norm": 0.8107310696209651,
      "learning_rate": 5e-06,
      "loss": 0.6357,
      "step": 500
    },
    {
      "epoch": 2.911554921540656,
      "grad_norm": 0.78901956302398,
      "learning_rate": 5e-06,
      "loss": 0.6363,
      "step": 510
    },
    {
      "epoch": 2.9686162624821684,
      "grad_norm": 0.8105645400500955,
      "learning_rate": 5e-06,
      "loss": 0.644,
      "step": 520
    },
    {
      "epoch": 2.9971469329529246,
      "eval_loss": 0.7613628506660461,
      "eval_runtime": 124.0128,
      "eval_samples_per_second": 38.069,
      "eval_steps_per_second": 0.597,
      "step": 525
    },
    {
      "epoch": 2.9971469329529246,
      "step": 525,
      "total_flos": 879185174200320.0,
      "train_loss": 0.7275740219297864,
      "train_runtime": 16864.3599,
      "train_samples_per_second": 15.956,
      "train_steps_per_second": 0.031
    }
  ],
  "logging_steps": 10,
  "max_steps": 525,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 879185174200320.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}