{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.994764397905759,
  "eval_steps": 500,
  "global_step": 858,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
| "log_history": [ | |
| { | |
| "epoch": 0.034904013961605584, | |
| "grad_norm": 1.636154953792276, | |
| "learning_rate": 5e-06, | |
| "loss": 0.9609, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.06980802792321117, | |
| "grad_norm": 1.1847809904244873, | |
| "learning_rate": 5e-06, | |
| "loss": 0.8537, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.10471204188481675, | |
| "grad_norm": 1.1055793026372138, | |
| "learning_rate": 5e-06, | |
| "loss": 0.8208, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.13961605584642234, | |
| "grad_norm": 1.1454147166881017, | |
| "learning_rate": 5e-06, | |
| "loss": 0.8046, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.17452006980802792, | |
| "grad_norm": 0.8149143192535275, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7882, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.2094240837696335, | |
| "grad_norm": 0.8125713341369675, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7822, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.2443280977312391, | |
| "grad_norm": 0.6705477513854557, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7746, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.2792321116928447, | |
| "grad_norm": 0.742108317973775, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7674, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.31413612565445026, | |
| "grad_norm": 0.5830122580252405, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7613, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.34904013961605584, | |
| "grad_norm": 0.6867621444893013, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7581, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.38394415357766143, | |
| "grad_norm": 0.5706894443800514, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7561, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.418848167539267, | |
| "grad_norm": 0.5933615440045283, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7555, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.4537521815008726, | |
| "grad_norm": 0.5976459344111194, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7479, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.4886561954624782, | |
| "grad_norm": 0.7595901956158283, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7445, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.5235602094240838, | |
| "grad_norm": 0.5243859212111798, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7453, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.5584642233856894, | |
| "grad_norm": 0.7053972260403277, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7459, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.5933682373472949, | |
| "grad_norm": 0.7356815513429203, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7406, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.6282722513089005, | |
| "grad_norm": 0.5406768959780917, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7403, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.6631762652705061, | |
| "grad_norm": 0.5731257742921576, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7414, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.6980802792321117, | |
| "grad_norm": 0.5893545114403889, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7354, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.7329842931937173, | |
| "grad_norm": 0.6666154404813628, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7378, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.7678883071553229, | |
| "grad_norm": 0.6379810550334492, | |
| "learning_rate": 5e-06, | |
| "loss": 0.736, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.8027923211169284, | |
| "grad_norm": 0.5761611687799336, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7365, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.837696335078534, | |
| "grad_norm": 0.5490954549201844, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7351, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.8726003490401396, | |
| "grad_norm": 0.5577745326979847, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7305, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.9075043630017452, | |
| "grad_norm": 0.5309350088615197, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7344, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.9424083769633508, | |
| "grad_norm": 0.5171633944749564, | |
| "learning_rate": 5e-06, | |
| "loss": 0.7312, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.9773123909249564, | |
| "grad_norm": 0.6439135188078838, | |
| "learning_rate": 5e-06, | |
| "loss": 0.729, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.012216404886562, | |
| "grad_norm": 0.7944207887971882, | |
| "learning_rate": 5e-06, | |
| "loss": 0.752, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 1.0471204188481675, | |
| "grad_norm": 0.6463556134515147, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6937, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.082024432809773, | |
| "grad_norm": 0.9747756816715487, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6922, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 1.1169284467713787, | |
| "grad_norm": 0.6041467541568463, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6946, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.1518324607329844, | |
| "grad_norm": 0.6555191903371557, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6905, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 1.1867364746945899, | |
| "grad_norm": 0.7808834156906888, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6905, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.2216404886561953, | |
| "grad_norm": 0.689883618215288, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6886, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.256544502617801, | |
| "grad_norm": 0.5458990467442779, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6958, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.2914485165794067, | |
| "grad_norm": 0.49774723989961944, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6872, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.3263525305410122, | |
| "grad_norm": 0.615067023750174, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6913, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.3612565445026177, | |
| "grad_norm": 0.5120804275981703, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6943, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.3961605584642234, | |
| "grad_norm": 0.6511934985434475, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6922, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.431064572425829, | |
| "grad_norm": 0.6015850091580557, | |
| "learning_rate": 5e-06, | |
| "loss": 0.691, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.4659685863874345, | |
| "grad_norm": 0.5548178493075747, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6876, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.50087260034904, | |
| "grad_norm": 0.6084101340671536, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6871, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.5357766143106457, | |
| "grad_norm": 0.5785174417745115, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6893, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.5706806282722514, | |
| "grad_norm": 0.6387925686406533, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6889, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.6055846422338569, | |
| "grad_norm": 0.511039789752418, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6882, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.6404886561954624, | |
| "grad_norm": 0.6303156357824996, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6874, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.675392670157068, | |
| "grad_norm": 0.5463553041688999, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6826, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.7102966841186737, | |
| "grad_norm": 0.6680053603003989, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6887, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.7452006980802792, | |
| "grad_norm": 0.5861342009392054, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6877, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.7801047120418847, | |
| "grad_norm": 0.5441609154940179, | |
| "learning_rate": 5e-06, | |
| "loss": 0.687, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.8150087260034904, | |
| "grad_norm": 0.493678213169674, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6865, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.849912739965096, | |
| "grad_norm": 0.7325770535166638, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6889, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.8848167539267016, | |
| "grad_norm": 0.6627130061862745, | |
| "learning_rate": 5e-06, | |
| "loss": 0.683, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.919720767888307, | |
| "grad_norm": 0.5392435344182795, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6869, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.9546247818499127, | |
| "grad_norm": 0.5374264329462486, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6874, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.9895287958115184, | |
| "grad_norm": 0.4520265683973087, | |
| "learning_rate": 5e-06, | |
| "loss": 0.686, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 2.024432809773124, | |
| "grad_norm": 0.7616966695399988, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6906, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 2.0593368237347294, | |
| "grad_norm": 0.8040603166806708, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6442, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 2.094240837696335, | |
| "grad_norm": 0.5161547369323151, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6462, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 2.1291448516579408, | |
| "grad_norm": 0.5566643583686863, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6465, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 2.164048865619546, | |
| "grad_norm": 0.7404248587220047, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6501, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 2.1989528795811517, | |
| "grad_norm": 0.602474854437427, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6451, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 2.2338568935427574, | |
| "grad_norm": 0.663987316506295, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6454, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 2.268760907504363, | |
| "grad_norm": 0.513580773343669, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6464, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 2.303664921465969, | |
| "grad_norm": 0.5572279672626476, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6495, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 2.338568935427574, | |
| "grad_norm": 0.6124769197735679, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6466, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 2.3734729493891797, | |
| "grad_norm": 0.6689033731877824, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6487, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 2.4083769633507854, | |
| "grad_norm": 0.6242198004638967, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6479, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 2.4432809773123907, | |
| "grad_norm": 0.5816264133167447, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6473, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 2.4781849912739964, | |
| "grad_norm": 0.6182232441775428, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6486, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 2.513089005235602, | |
| "grad_norm": 0.6438939308222409, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6523, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 2.5479930191972078, | |
| "grad_norm": 0.6270289995094971, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6467, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 2.5828970331588135, | |
| "grad_norm": 0.5230942315565851, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6491, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 2.6178010471204187, | |
| "grad_norm": 0.5004924007290114, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6519, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 2.6527050610820244, | |
| "grad_norm": 0.5669482852337735, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6501, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 2.68760907504363, | |
| "grad_norm": 0.6432274167649737, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6487, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 2.7225130890052354, | |
| "grad_norm": 0.5996384982145978, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6521, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 2.757417102966841, | |
| "grad_norm": 0.5437537271625021, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6502, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 2.7923211169284468, | |
| "grad_norm": 0.5375883439387915, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6494, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 2.8272251308900525, | |
| "grad_norm": 0.9130751404600511, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6521, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 2.862129144851658, | |
| "grad_norm": 0.7285776360956339, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6497, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 2.8970331588132634, | |
| "grad_norm": 0.6108923967332701, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6491, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 2.931937172774869, | |
| "grad_norm": 0.6234985887070743, | |
| "learning_rate": 5e-06, | |
| "loss": 0.6471, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 2.966841186736475, | |
| "grad_norm": 0.6458996378236107, | |
| "learning_rate": 5e-06, | |
| "loss": 0.648, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 2.994764397905759, | |
| "step": 858, | |
| "total_flos": 1436763197276160.0, | |
| "train_loss": 0.7012652700597589, | |
| "train_runtime": 12908.7913, | |
| "train_samples_per_second": 34.079, | |
| "train_steps_per_second": 0.066 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 858, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1436763197276160.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |