{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9971181556195967,
  "eval_steps": 500,
  "global_step": 780,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03842459173871278,
      "grad_norm": 2.22598139805258,
      "learning_rate": 5e-06,
      "loss": 1.0605,
      "step": 10
    },
    {
      "epoch": 0.07684918347742556,
      "grad_norm": 1.8651908058801285,
      "learning_rate": 5e-06,
      "loss": 0.9358,
      "step": 20
    },
    {
      "epoch": 0.11527377521613832,
      "grad_norm": 2.6599993200524916,
      "learning_rate": 5e-06,
      "loss": 0.893,
      "step": 30
    },
    {
      "epoch": 0.15369836695485112,
      "grad_norm": 1.7240152968762026,
      "learning_rate": 5e-06,
      "loss": 0.8656,
      "step": 40
    },
    {
      "epoch": 0.19212295869356388,
      "grad_norm": 1.3749810808043914,
      "learning_rate": 5e-06,
      "loss": 0.845,
      "step": 50
    },
    {
      "epoch": 0.23054755043227665,
      "grad_norm": 1.1007270333926593,
      "learning_rate": 5e-06,
      "loss": 0.8299,
      "step": 60
    },
    {
      "epoch": 0.2689721421709894,
      "grad_norm": 1.2050224496780755,
      "learning_rate": 5e-06,
      "loss": 0.818,
      "step": 70
    },
    {
      "epoch": 0.30739673390970224,
      "grad_norm": 0.9788138383899402,
      "learning_rate": 5e-06,
      "loss": 0.8033,
      "step": 80
    },
    {
      "epoch": 0.345821325648415,
      "grad_norm": 0.884055914711954,
      "learning_rate": 5e-06,
      "loss": 0.8036,
      "step": 90
    },
    {
      "epoch": 0.38424591738712777,
      "grad_norm": 0.8235015272280632,
      "learning_rate": 5e-06,
      "loss": 0.7934,
      "step": 100
    },
    {
      "epoch": 0.42267050912584053,
      "grad_norm": 1.245984363361222,
      "learning_rate": 5e-06,
      "loss": 0.7926,
      "step": 110
    },
    {
      "epoch": 0.4610951008645533,
      "grad_norm": 0.8556601852472671,
      "learning_rate": 5e-06,
      "loss": 0.7844,
      "step": 120
    },
    {
      "epoch": 0.49951969260326606,
      "grad_norm": 1.1413084645120115,
      "learning_rate": 5e-06,
      "loss": 0.7801,
      "step": 130
    },
    {
      "epoch": 0.5379442843419788,
      "grad_norm": 0.8043285660887262,
      "learning_rate": 5e-06,
      "loss": 0.7784,
      "step": 140
    },
    {
      "epoch": 0.5763688760806917,
      "grad_norm": 0.7459273515185546,
      "learning_rate": 5e-06,
      "loss": 0.7741,
      "step": 150
    },
    {
      "epoch": 0.6147934678194045,
      "grad_norm": 0.8660117432164065,
      "learning_rate": 5e-06,
      "loss": 0.7736,
      "step": 160
    },
    {
      "epoch": 0.6532180595581172,
      "grad_norm": 1.0007814674866071,
      "learning_rate": 5e-06,
      "loss": 0.7706,
      "step": 170
    },
    {
      "epoch": 0.69164265129683,
      "grad_norm": 0.7055795079240674,
      "learning_rate": 5e-06,
      "loss": 0.7664,
      "step": 180
    },
    {
      "epoch": 0.7300672430355427,
      "grad_norm": 0.7838600424732732,
      "learning_rate": 5e-06,
      "loss": 0.7698,
      "step": 190
    },
    {
      "epoch": 0.7684918347742555,
      "grad_norm": 0.7583120615774692,
      "learning_rate": 5e-06,
      "loss": 0.7641,
      "step": 200
    },
    {
      "epoch": 0.8069164265129684,
      "grad_norm": 0.5733684412033053,
      "learning_rate": 5e-06,
      "loss": 0.7623,
      "step": 210
    },
    {
      "epoch": 0.8453410182516811,
      "grad_norm": 0.859078978836711,
      "learning_rate": 5e-06,
      "loss": 0.7594,
      "step": 220
    },
    {
      "epoch": 0.8837656099903939,
      "grad_norm": 0.6923227057286715,
      "learning_rate": 5e-06,
      "loss": 0.7619,
      "step": 230
    },
    {
      "epoch": 0.9221902017291066,
      "grad_norm": 0.8251505086096959,
      "learning_rate": 5e-06,
      "loss": 0.7555,
      "step": 240
    },
    {
      "epoch": 0.9606147934678194,
      "grad_norm": 0.7435533797370215,
      "learning_rate": 5e-06,
      "loss": 0.7585,
      "step": 250
    },
    {
      "epoch": 0.9990393852065321,
      "grad_norm": 0.7678197391666464,
      "learning_rate": 5e-06,
      "loss": 0.7576,
      "step": 260
    },
    {
      "epoch": 0.9990393852065321,
      "eval_loss": 0.7591712474822998,
      "eval_runtime": 277.1989,
      "eval_samples_per_second": 25.3,
      "eval_steps_per_second": 0.397,
      "step": 260
    },
    {
      "epoch": 1.037463976945245,
      "grad_norm": 1.5561145218285635,
      "learning_rate": 5e-06,
      "loss": 0.768,
      "step": 270
    },
    {
      "epoch": 1.0758885686839577,
      "grad_norm": 1.1330094006549452,
      "learning_rate": 5e-06,
      "loss": 0.7048,
      "step": 280
    },
    {
      "epoch": 1.1143131604226706,
      "grad_norm": 1.1036306351344938,
      "learning_rate": 5e-06,
      "loss": 0.7049,
      "step": 290
    },
    {
      "epoch": 1.1527377521613833,
      "grad_norm": 0.8088529561657445,
      "learning_rate": 5e-06,
      "loss": 0.7065,
      "step": 300
    },
    {
      "epoch": 1.191162343900096,
      "grad_norm": 0.6453683425935199,
      "learning_rate": 5e-06,
      "loss": 0.7053,
      "step": 310
    },
    {
      "epoch": 1.229586935638809,
      "grad_norm": 0.7446769681200912,
      "learning_rate": 5e-06,
      "loss": 0.705,
      "step": 320
    },
    {
      "epoch": 1.2680115273775217,
      "grad_norm": 0.6286087733342373,
      "learning_rate": 5e-06,
      "loss": 0.7034,
      "step": 330
    },
    {
      "epoch": 1.3064361191162344,
      "grad_norm": 0.7100549600622268,
      "learning_rate": 5e-06,
      "loss": 0.7042,
      "step": 340
    },
    {
      "epoch": 1.344860710854947,
      "grad_norm": 0.9359540512097944,
      "learning_rate": 5e-06,
      "loss": 0.7074,
      "step": 350
    },
    {
      "epoch": 1.38328530259366,
      "grad_norm": 0.8699027772413719,
      "learning_rate": 5e-06,
      "loss": 0.7029,
      "step": 360
    },
    {
      "epoch": 1.4217098943323727,
      "grad_norm": 0.7184467018616941,
      "learning_rate": 5e-06,
      "loss": 0.7049,
      "step": 370
    },
    {
      "epoch": 1.4601344860710854,
      "grad_norm": 0.707621532856516,
      "learning_rate": 5e-06,
      "loss": 0.7011,
      "step": 380
    },
    {
      "epoch": 1.4985590778097984,
      "grad_norm": 0.6450672044350859,
      "learning_rate": 5e-06,
      "loss": 0.7033,
      "step": 390
    },
    {
      "epoch": 1.536983669548511,
      "grad_norm": 0.6206488333062592,
      "learning_rate": 5e-06,
      "loss": 0.7007,
      "step": 400
    },
    {
      "epoch": 1.5754082612872238,
      "grad_norm": 0.6551961537173275,
      "learning_rate": 5e-06,
      "loss": 0.7073,
      "step": 410
    },
    {
      "epoch": 1.6138328530259365,
      "grad_norm": 0.759327282861503,
      "learning_rate": 5e-06,
      "loss": 0.7014,
      "step": 420
    },
    {
      "epoch": 1.6522574447646494,
      "grad_norm": 0.6747434203983423,
      "learning_rate": 5e-06,
      "loss": 0.7026,
      "step": 430
    },
    {
      "epoch": 1.6906820365033621,
      "grad_norm": 0.7077879716432668,
      "learning_rate": 5e-06,
      "loss": 0.697,
      "step": 440
    },
    {
      "epoch": 1.729106628242075,
      "grad_norm": 0.6097521106522427,
      "learning_rate": 5e-06,
      "loss": 0.7006,
      "step": 450
    },
    {
      "epoch": 1.7675312199807878,
      "grad_norm": 0.6284752403489111,
      "learning_rate": 5e-06,
      "loss": 0.6945,
      "step": 460
    },
    {
      "epoch": 1.8059558117195005,
      "grad_norm": 0.6322821309286489,
      "learning_rate": 5e-06,
      "loss": 0.7004,
      "step": 470
    },
    {
      "epoch": 1.8443804034582132,
      "grad_norm": 0.6503834070635269,
      "learning_rate": 5e-06,
      "loss": 0.6978,
      "step": 480
    },
    {
      "epoch": 1.882804995196926,
      "grad_norm": 0.5928523527781089,
      "learning_rate": 5e-06,
      "loss": 0.6965,
      "step": 490
    },
    {
      "epoch": 1.9212295869356388,
      "grad_norm": 0.7766248894494607,
      "learning_rate": 5e-06,
      "loss": 0.6983,
      "step": 500
    },
    {
      "epoch": 1.9596541786743515,
      "grad_norm": 0.6619286278573577,
      "learning_rate": 5e-06,
      "loss": 0.7017,
      "step": 510
    },
    {
      "epoch": 1.9980787704130645,
      "grad_norm": 0.6972596313365015,
      "learning_rate": 5e-06,
      "loss": 0.7051,
      "step": 520
    },
    {
      "epoch": 1.9980787704130645,
      "eval_loss": 0.7455958724021912,
      "eval_runtime": 276.3849,
      "eval_samples_per_second": 25.374,
      "eval_steps_per_second": 0.398,
      "step": 520
    },
    {
      "epoch": 2.036503362151777,
      "grad_norm": 0.90057408209001,
      "learning_rate": 5e-06,
      "loss": 0.703,
      "step": 530
    },
    {
      "epoch": 2.07492795389049,
      "grad_norm": 0.5876798686485762,
      "learning_rate": 5e-06,
      "loss": 0.6459,
      "step": 540
    },
    {
      "epoch": 2.1133525456292026,
      "grad_norm": 0.6750213227294546,
      "learning_rate": 5e-06,
      "loss": 0.6429,
      "step": 550
    },
    {
      "epoch": 2.1517771373679153,
      "grad_norm": 0.6562193769008317,
      "learning_rate": 5e-06,
      "loss": 0.647,
      "step": 560
    },
    {
      "epoch": 2.1902017291066285,
      "grad_norm": 1.0106178235163223,
      "learning_rate": 5e-06,
      "loss": 0.647,
      "step": 570
    },
    {
      "epoch": 2.228626320845341,
      "grad_norm": 1.203791734016265,
      "learning_rate": 5e-06,
      "loss": 0.6464,
      "step": 580
    },
    {
      "epoch": 2.267050912584054,
      "grad_norm": 1.072833126484535,
      "learning_rate": 5e-06,
      "loss": 0.6436,
      "step": 590
    },
    {
      "epoch": 2.3054755043227666,
      "grad_norm": 0.7265275190167136,
      "learning_rate": 5e-06,
      "loss": 0.6496,
      "step": 600
    },
    {
      "epoch": 2.3439000960614793,
      "grad_norm": 0.7159981148293034,
      "learning_rate": 5e-06,
      "loss": 0.6488,
      "step": 610
    },
    {
      "epoch": 2.382324687800192,
      "grad_norm": 0.7499566009343647,
      "learning_rate": 5e-06,
      "loss": 0.6471,
      "step": 620
    },
    {
      "epoch": 2.4207492795389047,
      "grad_norm": 0.7298967563003717,
      "learning_rate": 5e-06,
      "loss": 0.6484,
      "step": 630
    },
    {
      "epoch": 2.459173871277618,
      "grad_norm": 0.7953731123916896,
      "learning_rate": 5e-06,
      "loss": 0.6457,
      "step": 640
    },
    {
      "epoch": 2.4975984630163306,
      "grad_norm": 0.7193497283697384,
      "learning_rate": 5e-06,
      "loss": 0.6456,
      "step": 650
    },
    {
      "epoch": 2.5360230547550433,
      "grad_norm": 0.6304268096938748,
      "learning_rate": 5e-06,
      "loss": 0.6538,
      "step": 660
    },
    {
      "epoch": 2.574447646493756,
      "grad_norm": 0.6119037125847125,
      "learning_rate": 5e-06,
      "loss": 0.6511,
      "step": 670
    },
    {
      "epoch": 2.6128722382324687,
      "grad_norm": 0.668738855127415,
      "learning_rate": 5e-06,
      "loss": 0.6497,
      "step": 680
    },
    {
      "epoch": 2.6512968299711814,
      "grad_norm": 0.6759770116611106,
      "learning_rate": 5e-06,
      "loss": 0.6532,
      "step": 690
    },
    {
      "epoch": 2.689721421709894,
      "grad_norm": 0.7216343244870027,
      "learning_rate": 5e-06,
      "loss": 0.6504,
      "step": 700
    },
    {
      "epoch": 2.7281460134486073,
      "grad_norm": 0.6238981674987125,
      "learning_rate": 5e-06,
      "loss": 0.6509,
      "step": 710
    },
    {
      "epoch": 2.76657060518732,
      "grad_norm": 0.8189531719324571,
      "learning_rate": 5e-06,
      "loss": 0.6512,
      "step": 720
    },
    {
      "epoch": 2.8049951969260327,
      "grad_norm": 0.641751150175285,
      "learning_rate": 5e-06,
      "loss": 0.6535,
      "step": 730
    },
    {
      "epoch": 2.8434197886647454,
      "grad_norm": 0.8091455165748344,
      "learning_rate": 5e-06,
      "loss": 0.6513,
      "step": 740
    },
    {
      "epoch": 2.881844380403458,
      "grad_norm": 0.6798414752987015,
      "learning_rate": 5e-06,
      "loss": 0.6496,
      "step": 750
    },
    {
      "epoch": 2.920268972142171,
      "grad_norm": 0.8078397653599894,
      "learning_rate": 5e-06,
      "loss": 0.6505,
      "step": 760
    },
    {
      "epoch": 2.9586935638808836,
      "grad_norm": 0.7023876525964049,
      "learning_rate": 5e-06,
      "loss": 0.6511,
      "step": 770
    },
    {
      "epoch": 2.9971181556195967,
      "grad_norm": 0.6381184685536545,
      "learning_rate": 5e-06,
      "loss": 0.6472,
      "step": 780
    },
    {
      "epoch": 2.9971181556195967,
      "eval_loss": 0.7485793232917786,
      "eval_runtime": 276.3503,
      "eval_samples_per_second": 25.377,
      "eval_steps_per_second": 0.398,
      "step": 780
    },
    {
      "epoch": 2.9971181556195967,
      "step": 780,
      "total_flos": 1306319671787520.0,
      "train_loss": 0.7206086501097068,
      "train_runtime": 45973.7379,
      "train_samples_per_second": 8.694,
      "train_steps_per_second": 0.017
    }
  ],
  "logging_steps": 10,
  "max_steps": 780,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1306319671787520.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}