{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9990041824337781,
  "eval_steps": 100,
  "global_step": 627,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01593308105954989,
      "grad_norm": 5.976239945638455,
      "learning_rate": 2.631578947368421e-06,
      "loss": 0.8689,
      "step": 10
    },
    {
      "epoch": 0.03186616211909978,
      "grad_norm": 4.067977130361014,
      "learning_rate": 4.999966626509336e-06,
      "loss": 0.6193,
      "step": 20
    },
    {
      "epoch": 0.04779924317864967,
      "grad_norm": 3.93622618786125,
      "learning_rate": 4.995962885666031e-06,
      "loss": 0.4451,
      "step": 30
    },
    {
      "epoch": 0.06373232423819956,
      "grad_norm": 1.979025192947513,
      "learning_rate": 4.985296693000866e-06,
      "loss": 0.3984,
      "step": 40
    },
    {
      "epoch": 0.07966540529774946,
      "grad_norm": 4.958516213423643,
      "learning_rate": 4.967996519688298e-06,
      "loss": 0.3429,
      "step": 50
    },
    {
      "epoch": 0.09559848635729934,
      "grad_norm": 1.901531491013398,
      "learning_rate": 4.944108544929091e-06,
      "loss": 0.3135,
      "step": 60
    },
    {
      "epoch": 0.11153156741684923,
      "grad_norm": 2.6637317005625105,
      "learning_rate": 4.913696532684593e-06,
      "loss": 0.3014,
      "step": 70
    },
    {
      "epoch": 0.12746464847639913,
      "grad_norm": 1.721469735007685,
      "learning_rate": 4.876841661472136e-06,
      "loss": 0.3393,
      "step": 80
    },
    {
      "epoch": 0.143397729535949,
      "grad_norm": 2.285020500263398,
      "learning_rate": 4.833642307675948e-06,
      "loss": 0.2983,
      "step": 90
    },
    {
      "epoch": 0.15933081059549892,
      "grad_norm": 1.846260142695851,
      "learning_rate": 4.784213782951926e-06,
      "loss": 0.2895,
      "step": 100
    },
    {
      "epoch": 0.1752638916550488,
      "grad_norm": 2.437561340834138,
      "learning_rate": 4.728688026427245e-06,
      "loss": 0.2802,
      "step": 110
    },
    {
      "epoch": 0.19119697271459868,
      "grad_norm": 1.814073132276511,
      "learning_rate": 4.667213252516408e-06,
      "loss": 0.2764,
      "step": 120
    },
    {
      "epoch": 0.2071300537741486,
      "grad_norm": 1.7520200232040006,
      "learning_rate": 4.599953555293807e-06,
      "loss": 0.2723,
      "step": 130
    },
    {
      "epoch": 0.22306313483369847,
      "grad_norm": 1.7012731088046187,
      "learning_rate": 4.527088470478851e-06,
      "loss": 0.2766,
      "step": 140
    },
    {
      "epoch": 0.23899621589324835,
      "grad_norm": 1.4552675894129845,
      "learning_rate": 4.448812496202849e-06,
      "loss": 0.2675,
      "step": 150
    },
    {
      "epoch": 0.25492929695279826,
      "grad_norm": 1.3972467983262729,
      "learning_rate": 4.365334573836851e-06,
      "loss": 0.2693,
      "step": 160
    },
    {
      "epoch": 0.27086237801234814,
      "grad_norm": 1.7017564789089161,
      "learning_rate": 4.276877530266284e-06,
      "loss": 0.2648,
      "step": 170
    },
    {
      "epoch": 0.286795459071898,
      "grad_norm": 1.3763883863698911,
      "learning_rate": 4.183677483101101e-06,
      "loss": 0.2668,
      "step": 180
    },
    {
      "epoch": 0.3027285401314479,
      "grad_norm": 2.187618573791854,
      "learning_rate": 4.085983210409114e-06,
      "loss": 0.2788,
      "step": 190
    },
    {
      "epoch": 0.31866162119099783,
      "grad_norm": 1.457524811306034,
      "learning_rate": 3.98405548665489e-06,
      "loss": 0.2548,
      "step": 200
    },
    {
      "epoch": 0.3345947022505477,
      "grad_norm": 1.4844524404614106,
      "learning_rate": 3.878166386616752e-06,
      "loss": 0.2334,
      "step": 210
    },
    {
      "epoch": 0.3505277833100976,
      "grad_norm": 1.7097951513131715,
      "learning_rate": 3.7685985591399677e-06,
      "loss": 0.2526,
      "step": 220
    },
    {
      "epoch": 0.3664608643696475,
      "grad_norm": 2.5926772898483885,
      "learning_rate": 3.655644472664667e-06,
      "loss": 0.2537,
      "step": 230
    },
    {
      "epoch": 0.38239394542919736,
      "grad_norm": 2.3399494453043745,
      "learning_rate": 3.539605634542399e-06,
      "loss": 0.261,
      "step": 240
    },
    {
      "epoch": 0.39832702648874724,
      "grad_norm": 2.3700041684156394,
      "learning_rate": 3.4207917862252083e-06,
      "loss": 0.2655,
      "step": 250
    },
    {
      "epoch": 0.4142601075482972,
      "grad_norm": 2.0696626816914088,
      "learning_rate": 3.2995200764754924e-06,
      "loss": 0.2488,
      "step": 260
    },
    {
      "epoch": 0.43019318860784705,
      "grad_norm": 1.3941586957894263,
      "learning_rate": 3.1761142148035993e-06,
      "loss": 0.2404,
      "step": 270
    },
    {
      "epoch": 0.44612626966739694,
      "grad_norm": 1.740968716187846,
      "learning_rate": 3.0509036073928686e-06,
      "loss": 0.2497,
      "step": 280
    },
    {
      "epoch": 0.4620593507269468,
      "grad_norm": 1.463183890861242,
      "learning_rate": 2.9242224778185985e-06,
      "loss": 0.2522,
      "step": 290
    },
    {
      "epoch": 0.4779924317864967,
      "grad_norm": 2.646999989780854,
      "learning_rate": 2.7964089749079907e-06,
      "loss": 0.2439,
      "step": 300
    },
    {
      "epoch": 0.4939255128460466,
      "grad_norm": 1.3983229614766708,
      "learning_rate": 2.667804270122454e-06,
      "loss": 0.2331,
      "step": 310
    },
    {
      "epoch": 0.5098585939055965,
      "grad_norm": 1.4379739717093198,
      "learning_rate": 2.538751646871617e-06,
      "loss": 0.2331,
      "step": 320
    },
    {
      "epoch": 0.5257916749651463,
      "grad_norm": 1.499769972907629,
      "learning_rate": 2.4095955841899372e-06,
      "loss": 0.2374,
      "step": 330
    },
    {
      "epoch": 0.5417247560246963,
      "grad_norm": 1.3346410958529098,
      "learning_rate": 2.280680837221835e-06,
      "loss": 0.2252,
      "step": 340
    },
    {
      "epoch": 0.5576578370842462,
      "grad_norm": 1.4481302153632802,
      "learning_rate": 2.1523515169698144e-06,
      "loss": 0.241,
      "step": 350
    },
    {
      "epoch": 0.573590918143796,
      "grad_norm": 2.286737307402928,
      "learning_rate": 2.0249501717619894e-06,
      "loss": 0.2319,
      "step": 360
    },
    {
      "epoch": 0.589523999203346,
      "grad_norm": 1.28289335918518,
      "learning_rate": 1.8988168728908277e-06,
      "loss": 0.2202,
      "step": 370
    },
    {
      "epoch": 0.6054570802628958,
      "grad_norm": 1.4034454327013157,
      "learning_rate": 1.7742883068638447e-06,
      "loss": 0.2325,
      "step": 380
    },
    {
      "epoch": 0.6213901613224457,
      "grad_norm": 1.581917998658176,
      "learning_rate": 1.65169687668926e-06,
      "loss": 0.2273,
      "step": 390
    },
    {
      "epoch": 0.6373232423819957,
      "grad_norm": 1.499413918873696,
      "learning_rate": 1.531369814595567e-06,
      "loss": 0.2401,
      "step": 400
    },
    {
      "epoch": 0.6532563234415455,
      "grad_norm": 1.26561372061873,
      "learning_rate": 1.4136283085534158e-06,
      "loss": 0.2134,
      "step": 410
    },
    {
      "epoch": 0.6691894045010954,
      "grad_norm": 1.399381000277604,
      "learning_rate": 1.2987866449313824e-06,
      "loss": 0.2232,
      "step": 420
    },
    {
      "epoch": 0.6851224855606453,
      "grad_norm": 1.500478318908901,
      "learning_rate": 1.187151369574127e-06,
      "loss": 0.2304,
      "step": 430
    },
    {
      "epoch": 0.7010555666201952,
      "grad_norm": 1.4005423659968532,
      "learning_rate": 1.0790204695422571e-06,
      "loss": 0.215,
      "step": 440
    },
    {
      "epoch": 0.716988647679745,
      "grad_norm": 1.5134639641942145,
      "learning_rate": 9.746825776980864e-07,
      "loss": 0.2259,
      "step": 450
    },
    {
      "epoch": 0.732921728739295,
      "grad_norm": 1.16505843435412,
      "learning_rate": 8.744162022604671e-07,
      "loss": 0.2012,
      "step": 460
    },
    {
      "epoch": 0.7488548097988449,
      "grad_norm": 1.8363118423070643,
      "learning_rate": 7.784889833852433e-07,
      "loss": 0.208,
      "step": 470
    },
    {
      "epoch": 0.7647878908583947,
      "grad_norm": 1.3521882743327087,
      "learning_rate": 6.871569787557375e-07,
      "loss": 0.2169,
      "step": 480
    },
    {
      "epoch": 0.7807209719179447,
      "grad_norm": 1.3456412659902603,
      "learning_rate": 6.006639800902223e-07,
      "loss": 0.2353,
      "step": 490
    },
    {
      "epoch": 0.7966540529774945,
      "grad_norm": 1.2307940792861065,
      "learning_rate": 5.192408623908246e-07,
      "loss": 0.2202,
      "step": 500
    },
    {
      "epoch": 0.8125871340370444,
      "grad_norm": 2.6755130100833613,
      "learning_rate": 4.431049676709093e-07,
      "loss": 0.2282,
      "step": 510
    },
    {
      "epoch": 0.8285202150965943,
      "grad_norm": 1.4296114226991548,
      "learning_rate": 3.72459524805954e-07,
      "loss": 0.2265,
      "step": 520
    },
    {
      "epoch": 0.8444532961561442,
      "grad_norm": 1.3346792046123495,
      "learning_rate": 3.074931070564921e-07,
      "loss": 0.2103,
      "step": 530
    },
    {
      "epoch": 0.8603863772156941,
      "grad_norm": 1.3521565356216243,
      "learning_rate": 2.4837912871116645e-07,
      "loss": 0.2192,
      "step": 540
    },
    {
      "epoch": 0.8763194582752439,
      "grad_norm": 1.3074956128752757,
      "learning_rate": 1.9527538219348775e-07,
      "loss": 0.2159,
      "step": 550
    },
    {
      "epoch": 0.8922525393347939,
      "grad_norm": 1.6764404498838295,
      "learning_rate": 1.4832361686790697e-07,
      "loss": 0.2052,
      "step": 560
    },
    {
      "epoch": 0.9081856203943438,
      "grad_norm": 1.319097300978499,
      "learning_rate": 1.0764916066947795e-07,
      "loss": 0.2036,
      "step": 570
    },
    {
      "epoch": 0.9241187014538936,
      "grad_norm": 1.4874961576772896,
      "learning_rate": 7.336058556710241e-08,
      "loss": 0.2054,
      "step": 580
    },
    {
      "epoch": 0.9400517825134436,
      "grad_norm": 1.2833278629486689,
      "learning_rate": 4.5549417753326106e-08,
      "loss": 0.2259,
      "step": 590
    },
    {
      "epoch": 0.9559848635729934,
      "grad_norm": 1.76408107001633,
      "learning_rate": 2.4289893334276116e-08,
      "loss": 0.227,
      "step": 600
    },
    {
      "epoch": 0.9719179446325433,
      "grad_norm": 2.1005708168542325,
      "learning_rate": 9.638760171873373e-09,
      "loss": 0.2119,
      "step": 610
    },
    {
      "epoch": 0.9878510256920932,
      "grad_norm": 1.5104598295315261,
      "learning_rate": 1.6351264072653194e-09,
      "loss": 0.2176,
      "step": 620
    }
  ],
  "logging_steps": 10,
  "max_steps": 627,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 183428678942720.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}