{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.7472527472527473,
  "eval_steps": 500,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.054945054945054944,
      "grad_norm": 7.214223861694336,
      "learning_rate": 6.666666666666667e-06,
      "loss": 1.2206,
      "step": 10
    },
    {
      "epoch": 0.10989010989010989,
      "grad_norm": 1.9139518737792969,
      "learning_rate": 1.3333333333333333e-05,
      "loss": 0.9332,
      "step": 20
    },
    {
      "epoch": 0.16483516483516483,
      "grad_norm": 1.8469916582107544,
      "learning_rate": 2e-05,
      "loss": 0.6775,
      "step": 30
    },
    {
      "epoch": 0.21978021978021978,
      "grad_norm": 1.5840134620666504,
      "learning_rate": 2.6666666666666667e-05,
      "loss": 0.4775,
      "step": 40
    },
    {
      "epoch": 0.27472527472527475,
      "grad_norm": 1.324249029159546,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 0.3462,
      "step": 50
    },
    {
      "epoch": 0.32967032967032966,
      "grad_norm": 9.703359603881836,
      "learning_rate": 4e-05,
      "loss": 0.2783,
      "step": 60
    },
    {
      "epoch": 0.38461538461538464,
      "grad_norm": 0.7924676537513733,
      "learning_rate": 4.666666666666667e-05,
      "loss": 0.2422,
      "step": 70
    },
    {
      "epoch": 0.43956043956043955,
      "grad_norm": 0.7410662770271301,
      "learning_rate": 5.333333333333333e-05,
      "loss": 0.1935,
      "step": 80
    },
    {
      "epoch": 0.4945054945054945,
      "grad_norm": 0.8206967711448669,
      "learning_rate": 6e-05,
      "loss": 0.1737,
      "step": 90
    },
    {
      "epoch": 0.5494505494505495,
      "grad_norm": 0.7770851254463196,
      "learning_rate": 6.666666666666667e-05,
      "loss": 0.1686,
      "step": 100
    },
    {
      "epoch": 0.6043956043956044,
      "grad_norm": 0.77186119556427,
      "learning_rate": 7.333333333333333e-05,
      "loss": 0.1791,
      "step": 110
    },
    {
      "epoch": 0.6593406593406593,
      "grad_norm": 0.6624640822410583,
      "learning_rate": 8e-05,
      "loss": 0.1631,
      "step": 120
    },
    {
      "epoch": 0.7142857142857143,
      "grad_norm": 0.6289125084877014,
      "learning_rate": 8.666666666666667e-05,
      "loss": 0.1695,
      "step": 130
    },
    {
      "epoch": 0.7692307692307693,
      "grad_norm": 0.7366447448730469,
      "learning_rate": 9.333333333333334e-05,
      "loss": 0.1596,
      "step": 140
    },
    {
      "epoch": 0.8241758241758241,
      "grad_norm": 0.6936759948730469,
      "learning_rate": 0.0001,
      "loss": 0.1742,
      "step": 150
    },
    {
      "epoch": 0.8791208791208791,
      "grad_norm": 0.7598293423652649,
      "learning_rate": 0.00010666666666666667,
      "loss": 0.1555,
      "step": 160
    },
    {
      "epoch": 0.9340659340659341,
      "grad_norm": 0.5829370021820068,
      "learning_rate": 0.00011333333333333334,
      "loss": 0.1495,
      "step": 170
    },
    {
      "epoch": 0.989010989010989,
      "grad_norm": 0.7476902604103088,
      "learning_rate": 0.00012,
      "loss": 0.1413,
      "step": 180
    },
    {
      "epoch": 1.043956043956044,
      "grad_norm": 0.611395537853241,
      "learning_rate": 0.00012666666666666666,
      "loss": 0.1584,
      "step": 190
    },
    {
      "epoch": 1.098901098901099,
      "grad_norm": 0.38252344727516174,
      "learning_rate": 0.00013333333333333334,
      "loss": 0.1274,
      "step": 200
    },
    {
      "epoch": 1.1538461538461537,
      "grad_norm": 0.6068706512451172,
      "learning_rate": 0.00014,
      "loss": 0.119,
      "step": 210
    },
    {
      "epoch": 1.2087912087912087,
      "grad_norm": 0.601772665977478,
      "learning_rate": 0.00014666666666666666,
      "loss": 0.1451,
      "step": 220
    },
    {
      "epoch": 1.2637362637362637,
      "grad_norm": 0.4933900833129883,
      "learning_rate": 0.00015333333333333334,
      "loss": 0.1405,
      "step": 230
    },
    {
      "epoch": 1.3186813186813187,
      "grad_norm": 0.37198406457901,
      "learning_rate": 0.00016,
      "loss": 0.1435,
      "step": 240
    },
    {
      "epoch": 1.3736263736263736,
      "grad_norm": 0.2324627786874771,
      "learning_rate": 0.0001666666666666667,
      "loss": 0.1405,
      "step": 250
    },
    {
      "epoch": 1.4285714285714286,
      "grad_norm": 0.6498947739601135,
      "learning_rate": 0.00017333333333333334,
      "loss": 0.1276,
      "step": 260
    },
    {
      "epoch": 1.4835164835164836,
      "grad_norm": 0.48399585485458374,
      "learning_rate": 0.00018,
      "loss": 0.1058,
      "step": 270
    },
    {
      "epoch": 1.5384615384615383,
      "grad_norm": 0.30425164103507996,
      "learning_rate": 0.0001866666666666667,
      "loss": 0.1046,
      "step": 280
    },
    {
      "epoch": 1.5934065934065935,
      "grad_norm": 0.44099316000938416,
      "learning_rate": 0.00019333333333333333,
      "loss": 0.1133,
      "step": 290
    },
    {
      "epoch": 1.6483516483516483,
      "grad_norm": 0.5143241286277771,
      "learning_rate": 0.0002,
      "loss": 0.1237,
      "step": 300
    },
    {
      "epoch": 1.7032967032967035,
      "grad_norm": 0.2673356831073761,
      "learning_rate": 0.000191869918699187,
      "loss": 0.1101,
      "step": 310
    },
    {
      "epoch": 1.7582417582417582,
      "grad_norm": 0.3179146349430084,
      "learning_rate": 0.000183739837398374,
      "loss": 0.113,
      "step": 320
    },
    {
      "epoch": 1.8131868131868132,
      "grad_norm": 0.37721434235572815,
      "learning_rate": 0.000175609756097561,
      "loss": 0.1113,
      "step": 330
    },
    {
      "epoch": 1.8681318681318682,
      "grad_norm": 0.4035661816596985,
      "learning_rate": 0.00016747967479674797,
      "loss": 0.1058,
      "step": 340
    },
    {
      "epoch": 1.9230769230769231,
      "grad_norm": 0.42484527826309204,
      "learning_rate": 0.00015934959349593497,
      "loss": 0.1142,
      "step": 350
    },
    {
      "epoch": 1.978021978021978,
      "grad_norm": 0.37304389476776123,
      "learning_rate": 0.00015121951219512197,
      "loss": 0.0937,
      "step": 360
    },
    {
      "epoch": 2.032967032967033,
      "grad_norm": 0.1278969645500183,
      "learning_rate": 0.00014308943089430895,
      "loss": 0.1039,
      "step": 370
    },
    {
      "epoch": 2.087912087912088,
      "grad_norm": 0.19862954318523407,
      "learning_rate": 0.00013495934959349595,
      "loss": 0.1006,
      "step": 380
    },
    {
      "epoch": 2.142857142857143,
      "grad_norm": 0.2047436237335205,
      "learning_rate": 0.00012682926829268293,
      "loss": 0.0709,
      "step": 390
    },
    {
      "epoch": 2.197802197802198,
      "grad_norm": 0.2357422411441803,
      "learning_rate": 0.00011869918699186993,
      "loss": 0.0909,
      "step": 400
    },
    {
      "epoch": 2.2527472527472527,
      "grad_norm": 0.19021956622600555,
      "learning_rate": 0.00011056910569105692,
      "loss": 0.0939,
      "step": 410
    },
    {
      "epoch": 2.3076923076923075,
      "grad_norm": 0.2202429175376892,
      "learning_rate": 0.0001024390243902439,
      "loss": 0.0972,
      "step": 420
    },
    {
      "epoch": 2.3626373626373627,
      "grad_norm": 0.24319039285182953,
      "learning_rate": 9.43089430894309e-05,
      "loss": 0.1028,
      "step": 430
    },
    {
      "epoch": 2.4175824175824174,
      "grad_norm": 0.26337578892707825,
      "learning_rate": 8.61788617886179e-05,
      "loss": 0.106,
      "step": 440
    },
    {
      "epoch": 2.4725274725274726,
      "grad_norm": 0.18105241656303406,
      "learning_rate": 7.804878048780489e-05,
      "loss": 0.08,
      "step": 450
    },
    {
      "epoch": 2.5274725274725274,
      "grad_norm": 0.3710761070251465,
      "learning_rate": 6.991869918699188e-05,
      "loss": 0.0843,
      "step": 460
    },
    {
      "epoch": 2.5824175824175826,
      "grad_norm": 0.16677603125572205,
      "learning_rate": 6.178861788617887e-05,
      "loss": 0.0929,
      "step": 470
    },
    {
      "epoch": 2.6373626373626373,
      "grad_norm": 0.06506508588790894,
      "learning_rate": 5.365853658536586e-05,
      "loss": 0.0956,
      "step": 480
    },
    {
      "epoch": 2.6923076923076925,
      "grad_norm": 0.13316453993320465,
      "learning_rate": 4.5528455284552844e-05,
      "loss": 0.0935,
      "step": 490
    },
    {
      "epoch": 2.7472527472527473,
      "grad_norm": 0.04071825370192528,
      "learning_rate": 3.739837398373984e-05,
      "loss": 0.0949,
      "step": 500
    }
  ],
  "logging_steps": 10,
  "max_steps": 546,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 50,
  "total_flos": 0.0,
  "train_batch_size": 128,
  "trial_name": null,
  "trial_params": null
}