| { |
| "best_metric": 3.404348134994507, |
| "best_model_checkpoint": "./output/checkpoint-750", |
| "epoch": 1.1574074074074074, |
| "eval_steps": 150, |
| "global_step": 750, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.015432098765432098, |
| "grad_norm": 1.2556171417236328, |
| "learning_rate": 2.9999999999999997e-05, |
| "loss": 4.2432, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.030864197530864196, |
| "grad_norm": 1.3756800889968872, |
| "learning_rate": 5.9999999999999995e-05, |
| "loss": 4.2354, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.046296296296296294, |
| "grad_norm": 1.1585898399353027, |
| "learning_rate": 8.999999999999999e-05, |
| "loss": 4.2311, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.06172839506172839, |
| "grad_norm": 1.3726162910461426, |
| "learning_rate": 0.00011999999999999999, |
| "loss": 4.1963, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.07716049382716049, |
| "grad_norm": 1.207290768623352, |
| "learning_rate": 0.00015, |
| "loss": 4.2085, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.09259259259259259, |
| "grad_norm": 1.361877202987671, |
| "learning_rate": 0.00017999999999999998, |
| "loss": 4.1297, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.10802469135802469, |
| "grad_norm": 1.432780146598816, |
| "learning_rate": 0.00020999999999999998, |
| "loss": 4.1494, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.12345679012345678, |
| "grad_norm": 1.7335089445114136, |
| "learning_rate": 0.00023999999999999998, |
| "loss": 4.0941, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.1388888888888889, |
| "grad_norm": 1.1360023021697998, |
| "learning_rate": 0.00027, |
| "loss": 4.1655, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.15432098765432098, |
| "grad_norm": 1.9188594818115234, |
| "learning_rate": 0.0003, |
| "loss": 4.1419, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.1697530864197531, |
| "grad_norm": 1.5362151861190796, |
| "learning_rate": 0.00029999691704375486, |
| "loss": 4.0569, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.18518518518518517, |
| "grad_norm": 1.398676872253418, |
| "learning_rate": 0.00029998766830174786, |
| "loss": 4.1547, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.2006172839506173, |
| "grad_norm": 1.465765118598938, |
| "learning_rate": 0.00029997225415415846, |
| "loss": 4.0257, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.21604938271604937, |
| "grad_norm": 1.7107757329940796, |
| "learning_rate": 0.00029995067523460196, |
| "loss": 4.1388, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.23148148148148148, |
| "grad_norm": 1.5202412605285645, |
| "learning_rate": 0.0002999229324301032, |
| "loss": 4.1026, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.23148148148148148, |
| "eval_loss": 4.2441534996032715, |
| "eval_runtime": 12.6941, |
| "eval_samples_per_second": 39.388, |
| "eval_steps_per_second": 39.388, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.24691358024691357, |
| "grad_norm": 1.3799288272857666, |
| "learning_rate": 0.0002998890268810601, |
| "loss": 4.1424, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.2623456790123457, |
| "grad_norm": 1.7501742839813232, |
| "learning_rate": 0.0002998489599811972, |
| "loss": 4.0572, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.2777777777777778, |
| "grad_norm": 1.8857972621917725, |
| "learning_rate": 0.00029980273337750765, |
| "loss": 4.0301, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.2932098765432099, |
| "grad_norm": 1.4475533962249756, |
| "learning_rate": 0.00029975034897018613, |
| "loss": 3.9859, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.30864197530864196, |
| "grad_norm": 2.1174092292785645, |
| "learning_rate": 0.00029969180891255043, |
| "loss": 4.0501, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.32407407407407407, |
| "grad_norm": 1.6035856008529663, |
| "learning_rate": 0.00029962711561095306, |
| "loss": 3.9958, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.3395061728395062, |
| "grad_norm": 1.641836404800415, |
| "learning_rate": 0.00029955627172468223, |
| "loss": 3.9698, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.3549382716049383, |
| "grad_norm": 2.0835461616516113, |
| "learning_rate": 0.0002994792801658527, |
| "loss": 3.9256, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.37037037037037035, |
| "grad_norm": 1.725764274597168, |
| "learning_rate": 0.00029939614409928584, |
| "loss": 3.9947, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.38580246913580246, |
| "grad_norm": 1.7063647508621216, |
| "learning_rate": 0.0002993068669423797, |
| "loss": 3.9562, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.4012345679012346, |
| "grad_norm": 1.8369818925857544, |
| "learning_rate": 0.0002992114523649686, |
| "loss": 4.034, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.4166666666666667, |
| "grad_norm": 1.623962163925171, |
| "learning_rate": 0.000299109904289172, |
| "loss": 3.9337, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.43209876543209874, |
| "grad_norm": 1.886649489402771, |
| "learning_rate": 0.0002990022268892337, |
| "loss": 4.0057, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.44753086419753085, |
| "grad_norm": 1.757165789604187, |
| "learning_rate": 0.00029888842459134974, |
| "loss": 3.9772, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.46296296296296297, |
| "grad_norm": 1.5515315532684326, |
| "learning_rate": 0.0002987685020734869, |
| "loss": 3.8073, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.46296296296296297, |
| "eval_loss": 4.050190448760986, |
| "eval_runtime": 12.7855, |
| "eval_samples_per_second": 39.107, |
| "eval_steps_per_second": 39.107, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.4783950617283951, |
| "grad_norm": 2.2727859020233154, |
| "learning_rate": 0.0002986424642651902, |
| "loss": 4.0273, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.49382716049382713, |
| "grad_norm": 1.9832032918930054, |
| "learning_rate": 0.00029851031634738024, |
| "loss": 3.9517, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.5092592592592593, |
| "grad_norm": 2.112586498260498, |
| "learning_rate": 0.0002983720637521404, |
| "loss": 3.8791, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.5246913580246914, |
| "grad_norm": 2.052028179168701, |
| "learning_rate": 0.00029822771216249334, |
| "loss": 3.8507, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.5401234567901234, |
| "grad_norm": 2.184110641479492, |
| "learning_rate": 0.00029807726751216753, |
| "loss": 3.8, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.5555555555555556, |
| "grad_norm": 2.0305166244506836, |
| "learning_rate": 0.0002979207359853532, |
| "loss": 3.8285, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.5709876543209876, |
| "grad_norm": 2.6513309478759766, |
| "learning_rate": 0.0002977581240164485, |
| "loss": 3.935, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.5864197530864198, |
| "grad_norm": 1.953681230545044, |
| "learning_rate": 0.00029758943828979444, |
| "loss": 3.899, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.6018518518518519, |
| "grad_norm": 2.1595969200134277, |
| "learning_rate": 0.00029741468573940056, |
| "loss": 3.9902, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.6172839506172839, |
| "grad_norm": 2.0108890533447266, |
| "learning_rate": 0.0002972338735486598, |
| "loss": 3.7605, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.6327160493827161, |
| "grad_norm": 2.363704204559326, |
| "learning_rate": 0.00029704700915005305, |
| "loss": 3.7982, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.6481481481481481, |
| "grad_norm": 2.1953635215759277, |
| "learning_rate": 0.00029685410022484393, |
| "loss": 3.8828, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.6635802469135802, |
| "grad_norm": 2.266935348510742, |
| "learning_rate": 0.0002966551547027627, |
| "loss": 3.835, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.6790123456790124, |
| "grad_norm": 2.0851755142211914, |
| "learning_rate": 0.0002964501807616806, |
| "loss": 3.7015, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.6944444444444444, |
| "grad_norm": 2.31782865524292, |
| "learning_rate": 0.0002962391868272735, |
| "loss": 3.7793, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.6944444444444444, |
| "eval_loss": 3.8206655979156494, |
| "eval_runtime": 12.8936, |
| "eval_samples_per_second": 38.779, |
| "eval_steps_per_second": 38.779, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.7098765432098766, |
| "grad_norm": 2.024686813354492, |
| "learning_rate": 0.0002960221815726757, |
| "loss": 3.69, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.7253086419753086, |
| "grad_norm": 2.0512404441833496, |
| "learning_rate": 0.00029579917391812314, |
| "loss": 3.6132, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.7407407407407407, |
| "grad_norm": 2.0870227813720703, |
| "learning_rate": 0.0002955701730305872, |
| "loss": 3.6681, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.7561728395061729, |
| "grad_norm": 2.38336443901062, |
| "learning_rate": 0.00029533518832339727, |
| "loss": 3.5982, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.7716049382716049, |
| "grad_norm": 3.0686960220336914, |
| "learning_rate": 0.0002950942294558544, |
| "loss": 3.7158, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.7870370370370371, |
| "grad_norm": 1.8391352891921997, |
| "learning_rate": 0.0002948473063328338, |
| "loss": 3.4874, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.8024691358024691, |
| "grad_norm": 3.0002377033233643, |
| "learning_rate": 0.00029459442910437797, |
| "loss": 3.5523, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.8179012345679012, |
| "grad_norm": 2.9197888374328613, |
| "learning_rate": 0.0002943356081652793, |
| "loss": 3.5964, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.8333333333333334, |
| "grad_norm": 2.4608469009399414, |
| "learning_rate": 0.0002940708541546529, |
| "loss": 3.6013, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.8487654320987654, |
| "grad_norm": 2.6457505226135254, |
| "learning_rate": 0.00029380017795549906, |
| "loss": 3.5637, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.8641975308641975, |
| "grad_norm": 2.789729356765747, |
| "learning_rate": 0.0002935235906942563, |
| "loss": 3.6736, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.8796296296296297, |
| "grad_norm": 2.2199699878692627, |
| "learning_rate": 0.00029324110374034354, |
| "loss": 3.6183, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.8950617283950617, |
| "grad_norm": 2.9708974361419678, |
| "learning_rate": 0.00029295272870569303, |
| "loss": 3.4817, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.9104938271604939, |
| "grad_norm": 2.5598032474517822, |
| "learning_rate": 0.00029265847744427303, |
| "loss": 3.3191, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.9259259259259259, |
| "grad_norm": 2.723311424255371, |
| "learning_rate": 0.0002923583620516003, |
| "loss": 3.6747, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.9259259259259259, |
| "eval_loss": 3.631777286529541, |
| "eval_runtime": 14.5253, |
| "eval_samples_per_second": 34.423, |
| "eval_steps_per_second": 34.423, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.941358024691358, |
| "grad_norm": 3.062471866607666, |
| "learning_rate": 0.0002920523948642432, |
| "loss": 3.4515, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.9567901234567902, |
| "grad_norm": 2.787814140319824, |
| "learning_rate": 0.00029174058845931434, |
| "loss": 3.6189, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.9722222222222222, |
| "grad_norm": 2.71366810798645, |
| "learning_rate": 0.0002914229556539539, |
| "loss": 3.4178, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.9876543209876543, |
| "grad_norm": 2.544954776763916, |
| "learning_rate": 0.00029109950950480235, |
| "loss": 3.5334, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.0030864197530864, |
| "grad_norm": 2.5218825340270996, |
| "learning_rate": 0.00029077026330746403, |
| "loss": 3.3136, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.0185185185185186, |
| "grad_norm": 3.377516746520996, |
| "learning_rate": 0.00029043523059596053, |
| "loss": 3.0759, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.0339506172839505, |
| "grad_norm": 3.7312910556793213, |
| "learning_rate": 0.00029009442514217447, |
| "loss": 3.0688, |
| "step": 670 |
| }, |
| { |
| "epoch": 1.0493827160493827, |
| "grad_norm": 3.517127513885498, |
| "learning_rate": 0.00028974786095528306, |
| "loss": 3.1925, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.0648148148148149, |
| "grad_norm": 3.9042561054229736, |
| "learning_rate": 0.00028939555228118277, |
| "loss": 3.1548, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.0802469135802468, |
| "grad_norm": 3.7608659267425537, |
| "learning_rate": 0.0002890375136019032, |
| "loss": 3.0335, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.095679012345679, |
| "grad_norm": 3.591569662094116, |
| "learning_rate": 0.0002886737596350122, |
| "loss": 3.1341, |
| "step": 710 |
| }, |
| { |
| "epoch": 1.1111111111111112, |
| "grad_norm": 3.2849345207214355, |
| "learning_rate": 0.0002883043053330106, |
| "loss": 2.9512, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.126543209876543, |
| "grad_norm": 3.238452196121216, |
| "learning_rate": 0.0002879291658827176, |
| "loss": 2.9095, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.1419753086419753, |
| "grad_norm": 4.236405849456787, |
| "learning_rate": 0.00028754835670464677, |
| "loss": 2.9601, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.1574074074074074, |
| "grad_norm": 3.696310043334961, |
| "learning_rate": 0.00028716189345237184, |
| "loss": 2.9871, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.1574074074074074, |
| "eval_loss": 3.404348134994507, |
| "eval_runtime": 13.6336, |
| "eval_samples_per_second": 36.674, |
| "eval_steps_per_second": 36.674, |
| "step": 750 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 5000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 8, |
| "save_steps": 150, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2848396981714944.0, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|