{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.997624703087886,
  "eval_steps": 500,
  "global_step": 280,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.007125890736342043,
      "grad_norm": 2.014270943330479,
      "learning_rate": 3.5714285714285716e-07,
      "loss": 2.0833,
      "step": 1
    },
    {
      "epoch": 0.035629453681710214,
      "grad_norm": 1.6415901607490422,
      "learning_rate": 1.7857142857142859e-06,
      "loss": 2.0703,
      "step": 5
    },
    {
      "epoch": 0.07125890736342043,
      "grad_norm": 0.9045439354344873,
      "learning_rate": 3.5714285714285718e-06,
      "loss": 1.7357,
      "step": 10
    },
    {
      "epoch": 0.10688836104513064,
      "grad_norm": 0.29255939881487364,
      "learning_rate": 5.357142857142857e-06,
      "loss": 1.0634,
      "step": 15
    },
    {
      "epoch": 0.14251781472684086,
      "grad_norm": 0.18199017505467613,
      "learning_rate": 7.1428571428571436e-06,
      "loss": 0.7719,
      "step": 20
    },
    {
      "epoch": 0.17814726840855108,
      "grad_norm": 0.1337369431738206,
      "learning_rate": 8.92857142857143e-06,
      "loss": 0.6677,
      "step": 25
    },
    {
      "epoch": 0.21377672209026127,
      "grad_norm": 0.11045661942717355,
      "learning_rate": 9.998445910004082e-06,
      "loss": 0.5819,
      "step": 30
    },
    {
      "epoch": 0.2494061757719715,
      "grad_norm": 0.10847403382185834,
      "learning_rate": 9.980973490458728e-06,
      "loss": 0.541,
      "step": 35
    },
    {
      "epoch": 0.2850356294536817,
      "grad_norm": 0.11896254274796987,
      "learning_rate": 9.944154131125643e-06,
      "loss": 0.4698,
      "step": 40
    },
    {
      "epoch": 0.32066508313539194,
      "grad_norm": 0.09943694170299246,
      "learning_rate": 9.888130844596525e-06,
      "loss": 0.4676,
      "step": 45
    },
    {
      "epoch": 0.35629453681710216,
      "grad_norm": 0.08861914811199816,
      "learning_rate": 9.81312123475006e-06,
      "loss": 0.477,
      "step": 50
    },
    {
      "epoch": 0.3919239904988123,
      "grad_norm": 0.0903117913307708,
      "learning_rate": 9.719416651541839e-06,
      "loss": 0.4609,
      "step": 55
    },
    {
      "epoch": 0.42755344418052255,
      "grad_norm": 0.09078544419971217,
      "learning_rate": 9.60738105935204e-06,
      "loss": 0.4001,
      "step": 60
    },
    {
      "epoch": 0.46318289786223277,
      "grad_norm": 0.07183071959915902,
      "learning_rate": 9.477449623286505e-06,
      "loss": 0.4261,
      "step": 65
    },
    {
      "epoch": 0.498812351543943,
      "grad_norm": 0.06897912595572098,
      "learning_rate": 9.330127018922195e-06,
      "loss": 0.3817,
      "step": 70
    },
    {
      "epoch": 0.5344418052256532,
      "grad_norm": 0.0723143122366438,
      "learning_rate": 9.165985472062245e-06,
      "loss": 0.4229,
      "step": 75
    },
    {
      "epoch": 0.5700712589073634,
      "grad_norm": 0.07774774592307501,
      "learning_rate": 8.985662536114614e-06,
      "loss": 0.4301,
      "step": 80
    },
    {
      "epoch": 0.6057007125890737,
      "grad_norm": 0.06507489587665306,
      "learning_rate": 8.789858615727266e-06,
      "loss": 0.3971,
      "step": 85
    },
    {
      "epoch": 0.6413301662707839,
      "grad_norm": 0.06634165567645521,
      "learning_rate": 8.579334246298593e-06,
      "loss": 0.3883,
      "step": 90
    },
    {
      "epoch": 0.6769596199524941,
      "grad_norm": 0.06591696283097749,
      "learning_rate": 8.35490713992985e-06,
      "loss": 0.3771,
      "step": 95
    },
    {
      "epoch": 0.7125890736342043,
      "grad_norm": 0.06595542170677902,
      "learning_rate": 8.117449009293668e-06,
      "loss": 0.4218,
      "step": 100
    },
    {
      "epoch": 0.7482185273159145,
      "grad_norm": 0.06259662018084211,
      "learning_rate": 7.86788218175523e-06,
      "loss": 0.374,
      "step": 105
    },
    {
      "epoch": 0.7838479809976246,
      "grad_norm": 0.05818262826755867,
      "learning_rate": 7.607176016897491e-06,
      "loss": 0.4117,
      "step": 110
    },
    {
      "epoch": 0.8194774346793349,
      "grad_norm": 0.05150004181000599,
      "learning_rate": 7.336343141365311e-06,
      "loss": 0.3836,
      "step": 115
    },
    {
      "epoch": 0.8551068883610451,
      "grad_norm": 0.05963071253301088,
      "learning_rate": 7.056435515653059e-06,
      "loss": 0.3714,
      "step": 120
    },
    {
      "epoch": 0.8907363420427553,
      "grad_norm": 0.056071605705336806,
      "learning_rate": 6.768540348112908e-06,
      "loss": 0.3682,
      "step": 125
    },
    {
      "epoch": 0.9263657957244655,
      "grad_norm": 0.05426625640333866,
      "learning_rate": 6.473775872054522e-06,
      "loss": 0.3876,
      "step": 130
    },
    {
      "epoch": 0.9619952494061758,
      "grad_norm": 0.04885065381619708,
      "learning_rate": 6.173287002338577e-06,
      "loss": 0.3337,
      "step": 135
    },
    {
      "epoch": 0.997624703087886,
      "grad_norm": 0.059980824843577495,
      "learning_rate": 5.8682408883346535e-06,
      "loss": 0.3689,
      "step": 140
    },
    {
      "epoch": 0.997624703087886,
      "eval_loss": 0.31315726041793823,
      "eval_runtime": 3.975,
      "eval_samples_per_second": 17.359,
      "eval_steps_per_second": 4.528,
      "step": 140
    },
    {
      "epoch": 1.03562945368171,
      "grad_norm": 0.05546346679815539,
      "learning_rate": 5.559822380516539e-06,
      "loss": 0.4304,
      "step": 145
    },
    {
      "epoch": 1.0712589073634204,
      "grad_norm": 0.06795836481113786,
      "learning_rate": 5.249229428303486e-06,
      "loss": 0.3636,
      "step": 150
    },
    {
      "epoch": 1.1068883610451306,
      "grad_norm": 0.057169860948488506,
      "learning_rate": 4.937668427022925e-06,
      "loss": 0.325,
      "step": 155
    },
    {
      "epoch": 1.1425178147268409,
      "grad_norm": 0.06883436674952785,
      "learning_rate": 4.626349532067879e-06,
      "loss": 0.3478,
      "step": 160
    },
    {
      "epoch": 1.178147268408551,
      "grad_norm": 0.06753514376330658,
      "learning_rate": 4.316481958449634e-06,
      "loss": 0.3583,
      "step": 165
    },
    {
      "epoch": 1.2137767220902613,
      "grad_norm": 0.06424945692176337,
      "learning_rate": 4.009269284003014e-06,
      "loss": 0.3368,
      "step": 170
    },
    {
      "epoch": 1.2494061757719714,
      "grad_norm": 0.10246279297452984,
      "learning_rate": 3.705904774487396e-06,
      "loss": 0.323,
      "step": 175
    },
    {
      "epoch": 1.2850356294536818,
      "grad_norm": 0.07408591934649965,
      "learning_rate": 3.4075667487415785e-06,
      "loss": 0.3213,
      "step": 180
    },
    {
      "epoch": 1.3206650831353919,
      "grad_norm": 0.07135541120127381,
      "learning_rate": 3.1154140018949743e-06,
      "loss": 0.3533,
      "step": 185
    },
    {
      "epoch": 1.3562945368171022,
      "grad_norm": 0.06620006117618393,
      "learning_rate": 2.83058130441221e-06,
      "loss": 0.3524,
      "step": 190
    },
    {
      "epoch": 1.3919239904988123,
      "grad_norm": 0.07786338824259277,
      "learning_rate": 2.5541749944535554e-06,
      "loss": 0.3338,
      "step": 195
    },
    {
      "epoch": 1.4275534441805227,
      "grad_norm": 0.09055157205194621,
      "learning_rate": 2.2872686806712037e-06,
      "loss": 0.3618,
      "step": 200
    },
    {
      "epoch": 1.4631828978622328,
      "grad_norm": 0.10379209054364397,
      "learning_rate": 2.030899072132493e-06,
      "loss": 0.35,
      "step": 205
    },
    {
      "epoch": 1.4988123515439429,
      "grad_norm": 0.09353822202985496,
      "learning_rate": 1.7860619515673034e-06,
      "loss": 0.3236,
      "step": 210
    },
    {
      "epoch": 1.5344418052256532,
      "grad_norm": 0.12742915464154184,
      "learning_rate": 1.553708307580265e-06,
      "loss": 0.3386,
      "step": 215
    },
    {
      "epoch": 1.5700712589073635,
      "grad_norm": 0.08379438960110396,
      "learning_rate": 1.3347406408508695e-06,
      "loss": 0.3171,
      "step": 220
    },
    {
      "epoch": 1.6057007125890737,
      "grad_norm": 0.08932277272469959,
      "learning_rate": 1.1300094586688632e-06,
      "loss": 0.3201,
      "step": 225
    },
    {
      "epoch": 1.6413301662707838,
      "grad_norm": 0.1131311853778167,
      "learning_rate": 9.403099714207175e-07,
      "loss": 0.3451,
      "step": 230
    },
    {
      "epoch": 1.676959619952494,
      "grad_norm": 0.08882627556488333,
      "learning_rate": 7.663790038585794e-07,
      "loss": 0.3129,
      "step": 235
    },
    {
      "epoch": 1.7125890736342044,
      "grad_norm": 0.10344808811283311,
      "learning_rate": 6.088921331488568e-07,
      "loss": 0.3256,
      "step": 240
    },
    {
      "epoch": 1.7482185273159145,
      "grad_norm": 0.11584657777909668,
      "learning_rate": 4.6846106481675035e-07,
      "loss": 0.3254,
      "step": 245
    },
    {
      "epoch": 1.7838479809976246,
      "grad_norm": 0.1347731993934301,
      "learning_rate": 3.4563125677897936e-07,
      "loss": 0.3209,
      "step": 250
    },
    {
      "epoch": 1.8194774346793348,
      "grad_norm": 0.09310880851694624,
      "learning_rate": 2.4087980069338825e-07,
      "loss": 0.3205,
      "step": 255
    },
    {
      "epoch": 1.855106888361045,
      "grad_norm": 0.09106940567745804,
      "learning_rate": 1.5461356885461077e-07,
      "loss": 0.3026,
      "step": 260
    },
    {
      "epoch": 1.8907363420427554,
      "grad_norm": 0.12326041580096593,
      "learning_rate": 8.716763383355863e-08,
      "loss": 0.325,
      "step": 265
    },
    {
      "epoch": 1.9263657957244655,
      "grad_norm": 0.07845758422297669,
      "learning_rate": 3.8803966999139686e-08,
      "loss": 0.2845,
      "step": 270
    },
    {
      "epoch": 1.9619952494061756,
      "grad_norm": 0.09114091009511459,
      "learning_rate": 9.710420977340763e-09,
      "loss": 0.2961,
      "step": 275
    },
    {
      "epoch": 1.997624703087886,
      "grad_norm": 0.09919205831626536,
      "learning_rate": 0.0,
      "loss": 0.3141,
      "step": 280
    },
    {
      "epoch": 1.997624703087886,
      "eval_loss": 0.26854342222213745,
      "eval_runtime": 3.626,
      "eval_samples_per_second": 19.029,
      "eval_steps_per_second": 4.964,
      "step": 280
    },
    {
      "epoch": 1.997624703087886,
      "step": 280,
      "total_flos": 7.474053798973932e+17,
      "train_loss": 0.45149746750082287,
      "train_runtime": 2861.5261,
      "train_samples_per_second": 4.707,
      "train_steps_per_second": 0.098
    }
  ],
  "logging_steps": 5,
  "max_steps": 280,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 7.474053798973932e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}