{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9990041824337781, "eval_steps": 100, "global_step": 627, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01593308105954989, "grad_norm": 6.123967491063448, "learning_rate": 2.631578947368421e-06, "loss": 0.7844, "step": 10 }, { "epoch": 0.03186616211909978, "grad_norm": 4.195346424811152, "learning_rate": 4.999966626509336e-06, "loss": 0.4873, "step": 20 }, { "epoch": 0.04779924317864967, "grad_norm": 2.3937960648002496, "learning_rate": 4.995962885666031e-06, "loss": 0.3576, "step": 30 }, { "epoch": 0.06373232423819956, "grad_norm": 2.0119365099499613, "learning_rate": 4.985296693000866e-06, "loss": 0.3362, "step": 40 }, { "epoch": 0.07966540529774946, "grad_norm": 6.364486514806231, "learning_rate": 4.967996519688298e-06, "loss": 0.3002, "step": 50 }, { "epoch": 0.09559848635729934, "grad_norm": 2.149725330685237, "learning_rate": 4.944108544929091e-06, "loss": 0.2702, "step": 60 }, { "epoch": 0.11153156741684923, "grad_norm": 179.0810015887418, "learning_rate": 4.913696532684593e-06, "loss": 0.3265, "step": 70 }, { "epoch": 0.12746464847639913, "grad_norm": 2.3719183425912354, "learning_rate": 4.876841661472136e-06, "loss": 0.4095, "step": 80 }, { "epoch": 0.143397729535949, "grad_norm": 5.36119881007005, "learning_rate": 4.833642307675948e-06, "loss": 0.2789, "step": 90 }, { "epoch": 0.15933081059549892, "grad_norm": 1.884067588257083, "learning_rate": 4.784213782951926e-06, "loss": 0.2808, "step": 100 }, { "epoch": 0.1752638916550488, "grad_norm": 37.862372083970605, "learning_rate": 4.728688026427245e-06, "loss": 0.2711, "step": 110 }, { "epoch": 0.19119697271459868, "grad_norm": 2.462558490018586, "learning_rate": 4.667213252516408e-06, "loss": 0.2496, "step": 120 }, { "epoch": 0.2071300537741486, "grad_norm": 2.109315595561499, "learning_rate": 4.599953555293807e-06, "loss": 0.2455, "step": 130 }, { "epoch": 0.22306313483369847, "grad_norm": 2.0083488558558833, "learning_rate": 4.527088470478851e-06, "loss": 0.2522, "step": 140 }, { "epoch": 0.23899621589324835, "grad_norm": 1.655486020092584, "learning_rate": 4.448812496202849e-06, "loss": 0.2517, "step": 150 }, { "epoch": 0.25492929695279826, "grad_norm": 1.6528337383021185, "learning_rate": 4.365334573836851e-06, "loss": 0.2566, "step": 160 }, { "epoch": 0.27086237801234814, "grad_norm": 1.8785516918363623, "learning_rate": 4.276877530266284e-06, "loss": 0.2404, "step": 170 }, { "epoch": 0.286795459071898, "grad_norm": 1.6846196856668103, "learning_rate": 4.183677483101101e-06, "loss": 0.2452, "step": 180 }, { "epoch": 0.3027285401314479, "grad_norm": 2.020720522276084, "learning_rate": 4.085983210409114e-06, "loss": 0.2523, "step": 190 }, { "epoch": 0.31866162119099783, "grad_norm": 1.6343932551444422, "learning_rate": 3.98405548665489e-06, "loss": 0.2284, "step": 200 }, { "epoch": 0.3345947022505477, "grad_norm": 1.830257135625606, "learning_rate": 3.878166386616752e-06, "loss": 0.2164, "step": 210 }, { "epoch": 0.3505277833100976, "grad_norm": 2.6262705273633036, "learning_rate": 3.7685985591399677e-06, "loss": 0.2358, "step": 220 }, { "epoch": 0.3664608643696475, "grad_norm": 3.891410262587011, "learning_rate": 3.655644472664667e-06, "loss": 0.234, "step": 230 }, { "epoch": 0.38239394542919736, "grad_norm": 3.0175466455395115, "learning_rate": 3.539605634542399e-06, "loss": 0.237, "step": 240 }, { "epoch": 0.39832702648874724, "grad_norm": 1.7383677821741295, "learning_rate": 3.4207917862252083e-06, "loss": 0.2406, "step": 250 }, { "epoch": 0.4142601075482972, "grad_norm": 1.9322292163817105, "learning_rate": 3.2995200764754924e-06, "loss": 0.224, "step": 260 }, { "epoch": 0.43019318860784705, "grad_norm": 1.477693042337343, "learning_rate": 3.1761142148035993e-06, "loss": 0.2201, "step": 270 }, { "epoch": 0.44612626966739694, "grad_norm": 1.8474067145641697, "learning_rate": 3.0509036073928686e-06, "loss": 0.225, "step": 280 }, { "epoch": 0.4620593507269468, "grad_norm": 1.5510783177161291, "learning_rate": 2.9242224778185985e-06, "loss": 0.2314, "step": 290 }, { "epoch": 0.4779924317864967, "grad_norm": 2.433823482819606, "learning_rate": 2.7964089749079907e-06, "loss": 0.2199, "step": 300 }, { "epoch": 0.4939255128460466, "grad_norm": 1.5158466644893986, "learning_rate": 2.667804270122454e-06, "loss": 0.2101, "step": 310 }, { "epoch": 0.5098585939055965, "grad_norm": 1.496925875732646, "learning_rate": 2.538751646871617e-06, "loss": 0.21, "step": 320 }, { "epoch": 0.5257916749651463, "grad_norm": 1.6528604902515285, "learning_rate": 2.4095955841899372e-06, "loss": 0.2148, "step": 330 }, { "epoch": 0.5417247560246963, "grad_norm": 1.4331715070790367, "learning_rate": 2.280680837221835e-06, "loss": 0.2015, "step": 340 }, { "epoch": 0.5576578370842462, "grad_norm": 1.8026216721078392, "learning_rate": 2.1523515169698144e-06, "loss": 0.2189, "step": 350 }, { "epoch": 0.573590918143796, "grad_norm": 2.769205324123664, "learning_rate": 2.0249501717619894e-06, "loss": 0.2098, "step": 360 }, { "epoch": 0.589523999203346, "grad_norm": 1.4824634904460874, "learning_rate": 1.8988168728908277e-06, "loss": 0.1976, "step": 370 }, { "epoch": 0.6054570802628958, "grad_norm": 1.5417502167114872, "learning_rate": 1.7742883068638447e-06, "loss": 0.2071, "step": 380 }, { "epoch": 0.6213901613224457, "grad_norm": 1.6185776121068578, "learning_rate": 1.65169687668926e-06, "loss": 0.2043, "step": 390 }, { "epoch": 0.6373232423819957, "grad_norm": 1.6099311926386768, "learning_rate": 1.531369814595567e-06, "loss": 0.2118, "step": 400 }, { "epoch": 0.6532563234415455, "grad_norm": 1.4106643427656438, "learning_rate": 1.4136283085534158e-06, "loss": 0.1883, "step": 410 }, { "epoch": 0.6691894045010954, "grad_norm": 1.4983565765754983, "learning_rate": 1.2987866449313824e-06, "loss": 0.1971, "step": 420 }, { "epoch": 0.6851224855606453, "grad_norm": 1.6344675457287534, "learning_rate": 1.187151369574127e-06, "loss": 0.2033, "step": 430 }, { "epoch": 0.7010555666201952, "grad_norm": 1.5810003041618759, "learning_rate": 1.0790204695422571e-06, "loss": 0.191, "step": 440 }, { "epoch": 0.716988647679745, "grad_norm": 1.669954129289104, "learning_rate": 9.746825776980864e-07, "loss": 0.1984, "step": 450 }, { "epoch": 0.732921728739295, "grad_norm": 1.3119689193301776, "learning_rate": 8.744162022604671e-07, "loss": 0.1782, "step": 460 }, { "epoch": 0.7488548097988449, "grad_norm": 2.142011140780938, "learning_rate": 7.784889833852433e-07, "loss": 0.1844, "step": 470 }, { "epoch": 0.7647878908583947, "grad_norm": 1.3982306847588983, "learning_rate": 6.871569787557375e-07, "loss": 0.1884, "step": 480 }, { "epoch": 0.7807209719179447, "grad_norm": 1.4269559759325665, "learning_rate": 6.006639800902223e-07, "loss": 0.2041, "step": 490 }, { "epoch": 0.7966540529774945, "grad_norm": 1.3402320824249279, "learning_rate": 5.192408623908246e-07, "loss": 0.1935, "step": 500 }, { "epoch": 0.8125871340370444, "grad_norm": 4.081818040872233, "learning_rate": 4.431049676709093e-07, "loss": 0.2023, "step": 510 }, { "epoch": 0.8285202150965943, "grad_norm": 1.5620677708803261, "learning_rate": 3.72459524805954e-07, "loss": 0.2009, "step": 520 }, { "epoch": 0.8444532961561442, "grad_norm": 1.657553455475968, "learning_rate": 3.074931070564921e-07, "loss": 0.1861, "step": 530 }, { "epoch": 0.8603863772156941, "grad_norm": 1.4767107380746103, "learning_rate": 2.4837912871116645e-07, "loss": 0.1933, "step": 540 }, { "epoch": 0.8763194582752439, "grad_norm": 1.4093479736420487, "learning_rate": 1.9527538219348775e-07, "loss": 0.1896, "step": 550 }, { "epoch": 0.8922525393347939, "grad_norm": 1.8203174948013054, "learning_rate": 1.4832361686790697e-07, "loss": 0.1768, "step": 560 }, { "epoch": 0.9081856203943438, "grad_norm": 1.3643578889605608, "learning_rate": 1.0764916066947795e-07, "loss": 0.1763, "step": 570 }, { "epoch": 0.9241187014538936, "grad_norm": 1.6532008723644016, "learning_rate": 7.336058556710241e-08, "loss": 0.1801, "step": 580 }, { "epoch": 0.9400517825134436, "grad_norm": 1.4228921803437908, "learning_rate": 4.5549417753326106e-08, "loss": 0.1974, "step": 590 }, { "epoch": 0.9559848635729934, "grad_norm": 1.7441673390589285, "learning_rate": 2.4289893334276116e-08, "loss": 0.196, "step": 600 }, { "epoch": 0.9719179446325433, "grad_norm": 2.271358181919917, "learning_rate": 9.638760171873373e-09, "loss": 0.1817, "step": 610 }, { "epoch": 0.9878510256920932, "grad_norm": 1.5715295700071357, "learning_rate": 1.6351264072653194e-09, "loss": 0.1918, "step": 620 } ], "logging_steps": 10, "max_steps": 627, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 206046578081792.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }