{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "grad_norm": 0.4471716284751892, "learning_rate": 1.8e-05, "loss": 1.287, "step": 10 }, { "grad_norm": 0.20870500802993774, "learning_rate": 3.8e-05, "loss": 1.202, "step": 20 }, { "grad_norm": 0.15201318264007568, "learning_rate": 5.8e-05, "loss": 1.2002, "step": 30 }, { "grad_norm": 0.16838377714157104, "learning_rate": 7.800000000000001e-05, "loss": 1.1862, "step": 40 }, { "grad_norm": 0.8558338284492493, "learning_rate": 9.8e-05, "loss": 1.1176, "step": 50 }, { "grad_norm": 0.3106129765510559, "learning_rate": 9.997785653888835e-05, "loss": 1.0896, "step": 60 }, { "grad_norm": 0.18132999539375305, "learning_rate": 9.990133642141359e-05, "loss": 1.0682, "step": 70 }, { "grad_norm": 0.20137448608875275, "learning_rate": 9.977024992520602e-05, "loss": 1.0698, "step": 80 }, { "grad_norm": 0.4021719992160797, "learning_rate": 9.95847403914247e-05, "loss": 1.0598, "step": 90 }, { "grad_norm": 0.25229543447494507, "learning_rate": 9.934501067202117e-05, "loss": 1.0643, "step": 100 }, { "grad_norm": 0.41172972321510315, "learning_rate": 9.905132290792394e-05, "loss": 1.0437, "step": 110 }, { "grad_norm": 0.3424518406391144, "learning_rate": 9.870399824239117e-05, "loss": 1.0168, "step": 120 }, { "grad_norm": 0.5512681007385254, "learning_rate": 9.830341646984521e-05, "loss": 0.9655, "step": 130 }, { "grad_norm": 0.6318266987800598, "learning_rate": 9.785001562057309e-05, "loss": 0.9044, "step": 140 }, { "grad_norm": 0.68224036693573, "learning_rate": 9.734429148174675e-05, "loss": 0.825, "step": 150 }, { "grad_norm": 0.7105351090431213, "learning_rate": 9.6786797055287e-05, "loss": 0.7075, "step": 160 }, { "grad_norm": 0.9974402189254761, "learning_rate": 9.617814195316411e-05, "loss": 0.6054, "step": 170 }, { "grad_norm": 0.7529901266098022, "learning_rate": 9.551899173079607e-05, "loss": 0.5167, "step": 180 }, { "grad_norm": 0.7296067476272583, "learning_rate": 9.481006715927351e-05, "loss": 0.4093, "step": 190 }, { "grad_norm": 0.8161115050315857, "learning_rate": 9.405214343720707e-05, "loss": 0.3432, "step": 200 }, { "grad_norm": 0.8231457471847534, "learning_rate": 9.32460493430591e-05, "loss": 0.3133, "step": 210 }, { "grad_norm": 0.6054251790046692, "learning_rate": 9.239266632888659e-05, "loss": 0.269, "step": 220 }, { "grad_norm": 0.8241866230964661, "learning_rate": 9.14929275564863e-05, "loss": 0.2279, "step": 230 }, { "grad_norm": 0.6421325206756592, "learning_rate": 9.0547816876996e-05, "loss": 0.2042, "step": 240 }, { "grad_norm": 0.7083246111869812, "learning_rate": 8.955836775506776e-05, "loss": 0.2063, "step": 250 }, { "grad_norm": 0.6825037002563477, "learning_rate": 8.852566213878947e-05, "loss": 0.1857, "step": 260 }, { "grad_norm": 0.9095971584320068, "learning_rate": 8.745082927659047e-05, "loss": 0.1583, "step": 270 }, { "grad_norm": 0.7291728258132935, "learning_rate": 8.633504448242505e-05, "loss": 0.168, "step": 280 }, { "grad_norm": 0.5694337487220764, "learning_rate": 8.517952785058385e-05, "loss": 0.1521, "step": 290 }, { "grad_norm": 0.6113538146018982, "learning_rate": 8.398554292153866e-05, "loss": 0.1446, "step": 300 }, { "grad_norm": 0.7520995140075684, "learning_rate": 8.275439530027948e-05, "loss": 0.1436, "step": 310 }, { "grad_norm": 0.5477866530418396, "learning_rate": 8.148743122865463e-05, "loss": 0.1256, "step": 320 }, { "grad_norm": 0.7027655839920044, "learning_rate": 8.018603611327504e-05, "loss": 0.1078, "step": 330 }, { "grad_norm": 0.6788212060928345, "learning_rate": 7.88516330105925e-05, "loss": 0.0999, "step": 340 }, { "grad_norm": 0.8349484801292419, "learning_rate": 7.748568107080832e-05, "loss": 0.0976, "step": 350 }, { "grad_norm": 0.6667965650558472, "learning_rate": 7.608967394231387e-05, "loss": 0.0889, "step": 360 }, { "grad_norm": 0.5367528200149536, "learning_rate": 7.466513813840825e-05, "loss": 0.0831, "step": 370 }, { "grad_norm": 0.6868879199028015, "learning_rate": 7.32136313680782e-05, "loss": 0.0782, "step": 380 }, { "grad_norm": 0.5659252405166626, "learning_rate": 7.173674083266624e-05, "loss": 0.0844, "step": 390 }, { "grad_norm": 0.5637879371643066, "learning_rate": 7.023608149028937e-05, "loss": 0.0857, "step": 400 }, { "grad_norm": 0.5377715826034546, "learning_rate": 6.871329428990602e-05, "loss": 0.0887, "step": 410 }, { "grad_norm": 0.6541546583175659, "learning_rate": 6.71700443769625e-05, "loss": 0.0708, "step": 420 }, { "grad_norm": 0.6478805541992188, "learning_rate": 6.56080192725808e-05, "loss": 0.0722, "step": 430 }, { "grad_norm": 0.5954216122627258, "learning_rate": 6.402892702827916e-05, "loss": 0.0765, "step": 440 }, { "grad_norm": 0.533097505569458, "learning_rate": 6.243449435824276e-05, "loss": 0.0793, "step": 450 }, { "grad_norm": 0.5573644042015076, "learning_rate": 6.0826464751186994e-05, "loss": 0.0747, "step": 460 }, { "grad_norm": 0.5666183829307556, "learning_rate": 5.9206596563878357e-05, "loss": 0.0804, "step": 470 }, { "grad_norm": 0.6182389259338379, "learning_rate": 5.757666109839702e-05, "loss": 0.0675, "step": 480 }, { "grad_norm": 0.41103699803352356, "learning_rate": 5.5938440665244006e-05, "loss": 0.0638, "step": 490 }, { "grad_norm": 0.5240882635116577, "learning_rate": 5.4293726634410855e-05, "loss": 0.064, "step": 500 }, { "grad_norm": 0.5724607110023499, "learning_rate": 5.264431747654284e-05, "loss": 0.0615, "step": 510 }, { "grad_norm": 0.4552757441997528, "learning_rate": 5.0992016796337686e-05, "loss": 0.0627, "step": 520 }, { "grad_norm": 0.5225529670715332, "learning_rate": 4.93386313603304e-05, "loss": 0.0608, "step": 530 }, { "grad_norm": 0.491335391998291, "learning_rate": 4.7685969121220456e-05, "loss": 0.0693, "step": 540 }, { "grad_norm": 0.48702317476272583, "learning_rate": 4.60358372409022e-05, "loss": 0.0595, "step": 550 }, { "grad_norm": 0.5675700902938843, "learning_rate": 4.439004011435979e-05, "loss": 0.0651, "step": 560 }, { "grad_norm": 0.5530186295509338, "learning_rate": 4.275037739658771e-05, "loss": 0.0701, "step": 570 }, { "grad_norm": 0.6130643486976624, "learning_rate": 4.111864203469457e-05, "loss": 0.0589, "step": 580 }, { "grad_norm": 0.523269534111023, "learning_rate": 3.949661830734172e-05, "loss": 0.057, "step": 590 }, { "grad_norm": 0.5698956847190857, "learning_rate": 3.788607987366069e-05, "loss": 0.0677, "step": 600 }, { "grad_norm": 0.4857608675956726, "learning_rate": 3.628878783378302e-05, "loss": 0.0522, "step": 610 }, { "grad_norm": 0.5062139630317688, "learning_rate": 3.470648880310313e-05, "loss": 0.0565, "step": 620 }, { "grad_norm": 0.5451298356056213, "learning_rate": 3.3140913002379995e-05, "loss": 0.0526, "step": 630 }, { "grad_norm": 0.47404196858406067, "learning_rate": 3.1593772365766105e-05, "loss": 0.0498, "step": 640 }, { "grad_norm": 0.4543800354003906, "learning_rate": 3.006675866883275e-05, "loss": 0.0501, "step": 650 }, { "grad_norm": 0.4308624863624573, "learning_rate": 2.8561541678638142e-05, "loss": 0.054, "step": 660 }, { "grad_norm": 0.4590848386287689, "learning_rate": 2.707976732786166e-05, "loss": 0.0527, "step": 670 }, { "grad_norm": 0.46471887826919556, "learning_rate": 2.562305591500069e-05, "loss": 0.0515, "step": 680 }, { "grad_norm": 0.5157451033592224, "learning_rate": 2.419300033259798e-05, "loss": 0.0496, "step": 690 }, { "grad_norm": 0.3607851564884186, "learning_rate": 2.279116432543705e-05, "loss": 0.0495, "step": 700 }, { "grad_norm": 0.45652496814727783, "learning_rate": 2.1419080780610123e-05, "loss": 0.0623, "step": 710 }, { "grad_norm": 0.42067599296569824, "learning_rate": 2.0078250051328784e-05, "loss": 0.0515, "step": 720 }, { "grad_norm": 0.447702556848526, "learning_rate": 1.877013831630961e-05, "loss": 0.0409, "step": 730 }, { "grad_norm": 0.46375778317451477, "learning_rate": 1.749617597652934e-05, "loss": 0.0568, "step": 740 }, { "grad_norm": 0.40698182582855225, "learning_rate": 1.62577560911024e-05, "loss": 0.0549, "step": 750 }, { "grad_norm": 0.36565569043159485, "learning_rate": 1.5056232853991209e-05, "loss": 0.0619, "step": 760 }, { "grad_norm": 0.3338938355445862, "learning_rate": 1.389292011321498e-05, "loss": 0.0471, "step": 770 }, { "grad_norm": 0.391041100025177, "learning_rate": 1.2769089934176126e-05, "loss": 0.0513, "step": 780 }, { "grad_norm": 0.2763245701789856, "learning_rate": 1.1685971208675539e-05, "loss": 0.0551, "step": 790 }, { "grad_norm": 0.30885049700737, "learning_rate": 1.0644748311137376e-05, "loss": 0.0513, "step": 800 }, { "grad_norm": 0.28447505831718445, "learning_rate": 9.646559803512994e-06, "loss": 0.0481, "step": 810 }, { "grad_norm": 0.34004244208335876, "learning_rate": 8.692497190280224e-06, "loss": 0.0498, "step": 820 }, { "grad_norm": 0.27130886912345886, "learning_rate": 7.783603724899257e-06, "loss": 0.0462, "step": 830 }, { "grad_norm": 0.2683325707912445, "learning_rate": 6.92087326903022e-06, "loss": 0.046, "step": 840 }, { "grad_norm": 0.3312835991382599, "learning_rate": 6.1052492057601275e-06, "loss": 0.0483, "step": 850 }, { "grad_norm": 0.35007426142692566, "learning_rate": 5.337623408027293e-06, "loss": 0.056, "step": 860 }, { "grad_norm": 0.2915596067905426, "learning_rate": 4.618835263371396e-06, "loss": 0.0485, "step": 870 }, { "grad_norm": 0.3173588812351227, "learning_rate": 3.949670756075447e-06, "loss": 0.0473, "step": 880 }, { "grad_norm": 0.3005145788192749, "learning_rate": 3.3308616077036115e-06, "loss": 0.0479, "step": 890 }, { "grad_norm": 0.28296926617622375, "learning_rate": 2.7630844769743757e-06, "loss": 0.0422, "step": 900 }, { "grad_norm": 0.2193201184272766, "learning_rate": 2.2469602198441573e-06, "loss": 0.0421, "step": 910 }, { "grad_norm": 0.1950123906135559, "learning_rate": 1.7830532106104747e-06, "loss": 0.0447, "step": 920 }, { "grad_norm": 0.32784703373908997, "learning_rate": 1.3718707247769135e-06, "loss": 0.0448, "step": 930 }, { "grad_norm": 0.20192043483257294, "learning_rate": 1.0138623843548078e-06, "loss": 0.0414, "step": 940 }, { "grad_norm": 0.23864571750164032, "learning_rate": 7.094196662081831e-07, "loss": 0.0476, "step": 950 }, { "grad_norm": 0.22843579947948456, "learning_rate": 4.5887547397955864e-07, "loss": 0.0449, "step": 960 }, { "grad_norm": 0.24962212145328522, "learning_rate": 2.625037740646763e-07, "loss": 0.0455, "step": 970 }, { "grad_norm": 0.316610187292099, "learning_rate": 1.2051929603428825e-07, "loss": 0.0467, "step": 980 }, { "grad_norm": 0.32058215141296387, "learning_rate": 3.3077297830541584e-08, "loss": 0.0506, "step": 990 }, { "grad_norm": 0.2989563047885895, "learning_rate": 2.7339599464326627e-10, "loss": 0.0532, "step": 1000 } ], "logging_steps": 10, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 48, "trial_name": null, "trial_params": null }