{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 566, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0088339222614841, "grad_norm": 0.318359375, "learning_rate": 8.620689655172414e-06, "loss": 0.145, "step": 5 }, { "epoch": 0.0176678445229682, "grad_norm": 0.1845703125, "learning_rate": 1.7241379310344828e-05, "loss": 0.1351, "step": 10 }, { "epoch": 0.026501766784452298, "grad_norm": 0.1513671875, "learning_rate": 2.5862068965517244e-05, "loss": 0.1249, "step": 15 }, { "epoch": 0.0353356890459364, "grad_norm": 0.12109375, "learning_rate": 3.4482758620689657e-05, "loss": 0.1189, "step": 20 }, { "epoch": 0.044169611307420496, "grad_norm": 0.09619140625, "learning_rate": 4.3103448275862066e-05, "loss": 0.1181, "step": 25 }, { "epoch": 0.053003533568904596, "grad_norm": 0.0908203125, "learning_rate": 4.999961496300632e-05, "loss": 0.1127, "step": 30 }, { "epoch": 0.061837455830388695, "grad_norm": 0.0830078125, "learning_rate": 4.9986140051876094e-05, "loss": 0.1102, "step": 35 }, { "epoch": 0.0706713780918728, "grad_norm": 0.08447265625, "learning_rate": 4.995342646712217e-05, "loss": 0.1093, "step": 40 }, { "epoch": 0.07950530035335689, "grad_norm": 0.07958984375, "learning_rate": 4.9901502197807084e-05, "loss": 0.109, "step": 45 }, { "epoch": 0.08833922261484099, "grad_norm": 0.07958984375, "learning_rate": 4.9830411669255416e-05, "loss": 0.107, "step": 50 }, { "epoch": 0.09717314487632508, "grad_norm": 0.07763671875, "learning_rate": 4.974021570504443e-05, "loss": 0.1041, "step": 55 }, { "epoch": 0.10600706713780919, "grad_norm": 0.07421875, "learning_rate": 4.963099147496465e-05, "loss": 0.1029, "step": 60 }, { "epoch": 0.11484098939929328, "grad_norm": 0.0810546875, "learning_rate": 4.9502832428995005e-05, "loss": 0.1047, "step": 65 }, { "epoch": 0.12367491166077739, "grad_norm": 0.0771484375, "learning_rate": 4.935584821734901e-05, "loss": 0.1044, "step": 70 }, { "epoch": 0.13250883392226148, "grad_norm": 0.07568359375, "learning_rate": 4.919016459666026e-05, "loss": 0.1063, "step": 75 }, { "epoch": 0.1413427561837456, "grad_norm": 0.0830078125, "learning_rate": 4.9005923322387706e-05, "loss": 0.1025, "step": 80 }, { "epoch": 0.1501766784452297, "grad_norm": 0.0859375, "learning_rate": 4.880328202753264e-05, "loss": 0.1022, "step": 85 }, { "epoch": 0.15901060070671377, "grad_norm": 0.0791015625, "learning_rate": 4.858241408777117e-05, "loss": 0.1031, "step": 90 }, { "epoch": 0.16784452296819788, "grad_norm": 0.07421875, "learning_rate": 4.834350847311758e-05, "loss": 0.1016, "step": 95 }, { "epoch": 0.17667844522968199, "grad_norm": 0.076171875, "learning_rate": 4.8086769586245554e-05, "loss": 0.1008, "step": 100 }, { "epoch": 0.1855123674911661, "grad_norm": 0.0751953125, "learning_rate": 4.7812417087605456e-05, "loss": 0.1043, "step": 105 }, { "epoch": 0.19434628975265017, "grad_norm": 0.07275390625, "learning_rate": 4.752068570748746e-05, "loss": 0.1038, "step": 110 }, { "epoch": 0.20318021201413428, "grad_norm": 0.078125, "learning_rate": 4.721182504519118e-05, "loss": 0.1014, "step": 115 }, { "epoch": 0.21201413427561838, "grad_norm": 0.0712890625, "learning_rate": 4.688609935547371e-05, "loss": 0.1004, "step": 120 }, { "epoch": 0.22084805653710246, "grad_norm": 0.07177734375, "learning_rate": 4.654378732245869e-05, "loss": 0.1017, "step": 125 }, { "epoch": 0.22968197879858657, "grad_norm": 0.07421875, "learning_rate": 4.618518182120011e-05, "loss": 0.1006, "step": 130 }, { "epoch": 0.23851590106007067, "grad_norm": 0.0751953125, "learning_rate": 4.5810589667104347e-05, "loss": 0.1008, "step": 135 }, { "epoch": 0.24734982332155478, "grad_norm": 0.08447265625, "learning_rate": 4.542033135342537e-05, "loss": 0.1012, "step": 140 }, { "epoch": 0.25618374558303886, "grad_norm": 0.07275390625, "learning_rate": 4.5014740777057405e-05, "loss": 0.1026, "step": 145 }, { "epoch": 0.26501766784452296, "grad_norm": 0.07861328125, "learning_rate": 4.45941649528596e-05, "loss": 0.1017, "step": 150 }, { "epoch": 0.27385159010600707, "grad_norm": 0.0771484375, "learning_rate": 4.4158963716757444e-05, "loss": 0.099, "step": 155 }, { "epoch": 0.2826855123674912, "grad_norm": 0.07666015625, "learning_rate": 4.370950941787456e-05, "loss": 0.1021, "step": 160 }, { "epoch": 0.2915194346289753, "grad_norm": 0.0712890625, "learning_rate": 4.324618659995855e-05, "loss": 0.1006, "step": 165 }, { "epoch": 0.3003533568904594, "grad_norm": 0.076171875, "learning_rate": 4.27693916723734e-05, "loss": 0.1006, "step": 170 }, { "epoch": 0.30918727915194344, "grad_norm": 0.07275390625, "learning_rate": 4.227953257093985e-05, "loss": 0.0995, "step": 175 }, { "epoch": 0.31802120141342755, "grad_norm": 0.07763671875, "learning_rate": 4.1777028408913985e-05, "loss": 0.1005, "step": 180 }, { "epoch": 0.32685512367491165, "grad_norm": 0.07373046875, "learning_rate": 4.126230911840269e-05, "loss": 0.0995, "step": 185 }, { "epoch": 0.33568904593639576, "grad_norm": 0.07666015625, "learning_rate": 4.07358150825226e-05, "loss": 0.0984, "step": 190 }, { "epoch": 0.34452296819787986, "grad_norm": 0.07568359375, "learning_rate": 4.0197996758617594e-05, "loss": 0.0979, "step": 195 }, { "epoch": 0.35335689045936397, "grad_norm": 0.076171875, "learning_rate": 3.964931429285675e-05, "loss": 0.0998, "step": 200 }, { "epoch": 0.3621908127208481, "grad_norm": 0.07958984375, "learning_rate": 3.909023712654291e-05, "loss": 0.1012, "step": 205 }, { "epoch": 0.3710247349823322, "grad_norm": 0.07373046875, "learning_rate": 3.852124359446845e-05, "loss": 0.0987, "step": 210 }, { "epoch": 0.37985865724381623, "grad_norm": 0.0732421875, "learning_rate": 3.794282051566199e-05, "loss": 0.0982, "step": 215 }, { "epoch": 0.38869257950530034, "grad_norm": 0.0751953125, "learning_rate": 3.7355462776876184e-05, "loss": 0.0984, "step": 220 }, { "epoch": 0.39752650176678445, "grad_norm": 0.06982421875, "learning_rate": 3.6759672909172846e-05, "loss": 0.0973, "step": 225 }, { "epoch": 0.40636042402826855, "grad_norm": 0.07080078125, "learning_rate": 3.615596065796791e-05, "loss": 0.1007, "step": 230 }, { "epoch": 0.41519434628975266, "grad_norm": 0.07421875, "learning_rate": 3.554484254690379e-05, "loss": 0.0972, "step": 235 }, { "epoch": 0.42402826855123676, "grad_norm": 0.07080078125, "learning_rate": 3.492684143592252e-05, "loss": 0.0974, "step": 240 }, { "epoch": 0.43286219081272087, "grad_norm": 0.076171875, "learning_rate": 3.4302486073917686e-05, "loss": 0.0991, "step": 245 }, { "epoch": 0.4416961130742049, "grad_norm": 0.076171875, "learning_rate": 3.3672310646347844e-05, "loss": 0.0979, "step": 250 }, { "epoch": 0.450530035335689, "grad_norm": 0.07275390625, "learning_rate": 3.3036854318198575e-05, "loss": 0.0987, "step": 255 }, { "epoch": 0.45936395759717313, "grad_norm": 0.07470703125, "learning_rate": 3.2396660772684114e-05, "loss": 0.0999, "step": 260 }, { "epoch": 0.46819787985865724, "grad_norm": 0.07373046875, "learning_rate": 3.1752277746083325e-05, "loss": 0.0979, "step": 265 }, { "epoch": 0.47703180212014135, "grad_norm": 0.07373046875, "learning_rate": 3.110425655910795e-05, "loss": 0.0983, "step": 270 }, { "epoch": 0.48586572438162545, "grad_norm": 0.07080078125, "learning_rate": 3.045315164520405e-05, "loss": 0.0981, "step": 275 }, { "epoch": 0.49469964664310956, "grad_norm": 0.07568359375, "learning_rate": 2.9799520076190268e-05, "loss": 0.0987, "step": 280 }, { "epoch": 0.5035335689045937, "grad_norm": 0.07080078125, "learning_rate": 2.914392108563883e-05, "loss": 0.0963, "step": 285 }, { "epoch": 0.5123674911660777, "grad_norm": 0.07470703125, "learning_rate": 2.848691559040687e-05, "loss": 0.0977, "step": 290 }, { "epoch": 0.5212014134275619, "grad_norm": 0.072265625, "learning_rate": 2.7829065710727682e-05, "loss": 0.0959, "step": 295 }, { "epoch": 0.5300353356890459, "grad_norm": 0.07275390625, "learning_rate": 2.7170934289272327e-05, "loss": 0.0983, "step": 300 }, { "epoch": 0.5388692579505301, "grad_norm": 0.07177734375, "learning_rate": 2.6513084409593137e-05, "loss": 0.0981, "step": 305 }, { "epoch": 0.5477031802120141, "grad_norm": 0.07373046875, "learning_rate": 2.585607891436118e-05, "loss": 0.0972, "step": 310 }, { "epoch": 0.5565371024734982, "grad_norm": 0.078125, "learning_rate": 2.5200479923809738e-05, "loss": 0.0964, "step": 315 }, { "epoch": 0.5653710247349824, "grad_norm": 0.06787109375, "learning_rate": 2.4546848354795954e-05, "loss": 0.0966, "step": 320 }, { "epoch": 0.5742049469964664, "grad_norm": 0.068359375, "learning_rate": 2.3895743440892053e-05, "loss": 0.0983, "step": 325 }, { "epoch": 0.5830388692579506, "grad_norm": 0.0712890625, "learning_rate": 2.3247722253916677e-05, "loss": 0.0983, "step": 330 }, { "epoch": 0.5918727915194346, "grad_norm": 0.07177734375, "learning_rate": 2.2603339227315902e-05, "loss": 0.0982, "step": 335 }, { "epoch": 0.6007067137809188, "grad_norm": 0.06787109375, "learning_rate": 2.1963145681801434e-05, "loss": 0.0968, "step": 340 }, { "epoch": 0.6095406360424028, "grad_norm": 0.0712890625, "learning_rate": 2.132768935365215e-05, "loss": 0.0976, "step": 345 }, { "epoch": 0.6183745583038869, "grad_norm": 0.0693359375, "learning_rate": 2.069751392608232e-05, "loss": 0.0974, "step": 350 }, { "epoch": 0.627208480565371, "grad_norm": 0.0712890625, "learning_rate": 2.0073158564077483e-05, "loss": 0.0992, "step": 355 }, { "epoch": 0.6360424028268551, "grad_norm": 0.0703125, "learning_rate": 1.9455157453096225e-05, "loss": 0.0992, "step": 360 }, { "epoch": 0.6448763250883393, "grad_norm": 0.06884765625, "learning_rate": 1.8844039342032095e-05, "loss": 0.0961, "step": 365 }, { "epoch": 0.6537102473498233, "grad_norm": 0.06884765625, "learning_rate": 1.8240327090827153e-05, "loss": 0.097, "step": 370 }, { "epoch": 0.6625441696113075, "grad_norm": 0.068359375, "learning_rate": 1.764453722312383e-05, "loss": 0.0979, "step": 375 }, { "epoch": 0.6713780918727915, "grad_norm": 0.0693359375, "learning_rate": 1.705717948433801e-05, "loss": 0.0963, "step": 380 }, { "epoch": 0.6802120141342756, "grad_norm": 0.06787109375, "learning_rate": 1.6478756405531564e-05, "loss": 0.0969, "step": 385 }, { "epoch": 0.6890459363957597, "grad_norm": 0.0673828125, "learning_rate": 1.5909762873457096e-05, "loss": 0.0963, "step": 390 }, { "epoch": 0.6978798586572438, "grad_norm": 0.06787109375, "learning_rate": 1.5350685707143258e-05, "loss": 0.0973, "step": 395 }, { "epoch": 0.7067137809187279, "grad_norm": 0.06591796875, "learning_rate": 1.4802003241382406e-05, "loss": 0.0963, "step": 400 }, { "epoch": 0.715547703180212, "grad_norm": 0.0673828125, "learning_rate": 1.4264184917477397e-05, "loss": 0.0964, "step": 405 }, { "epoch": 0.7243816254416962, "grad_norm": 0.0673828125, "learning_rate": 1.3737690881597321e-05, "loss": 0.0981, "step": 410 }, { "epoch": 0.7332155477031802, "grad_norm": 0.068359375, "learning_rate": 1.3222971591086014e-05, "loss": 0.0977, "step": 415 }, { "epoch": 0.7420494699646644, "grad_norm": 0.06689453125, "learning_rate": 1.2720467429060156e-05, "loss": 0.0975, "step": 420 }, { "epoch": 0.7508833922261484, "grad_norm": 0.06787109375, "learning_rate": 1.2230608327626608e-05, "loss": 0.0978, "step": 425 }, { "epoch": 0.7597173144876325, "grad_norm": 0.0712890625, "learning_rate": 1.1753813400041453e-05, "loss": 0.0954, "step": 430 }, { "epoch": 0.7685512367491166, "grad_norm": 0.06640625, "learning_rate": 1.1290490582125454e-05, "loss": 0.0952, "step": 435 }, { "epoch": 0.7773851590106007, "grad_norm": 0.068359375, "learning_rate": 1.0841036283242558e-05, "loss": 0.0971, "step": 440 }, { "epoch": 0.7862190812720848, "grad_norm": 0.068359375, "learning_rate": 1.0405835047140401e-05, "loss": 0.0982, "step": 445 }, { "epoch": 0.7950530035335689, "grad_norm": 0.06689453125, "learning_rate": 9.985259222942602e-06, "loss": 0.0952, "step": 450 }, { "epoch": 0.803886925795053, "grad_norm": 0.0703125, "learning_rate": 9.57966864657463e-06, "loss": 0.0978, "step": 455 }, { "epoch": 0.8127208480565371, "grad_norm": 0.06982421875, "learning_rate": 9.189410332895662e-06, "loss": 0.0989, "step": 460 }, { "epoch": 0.8215547703180212, "grad_norm": 0.06787109375, "learning_rate": 8.814818178799892e-06, "loss": 0.0981, "step": 465 }, { "epoch": 0.8303886925795053, "grad_norm": 0.06591796875, "learning_rate": 8.456212677541312e-06, "loss": 0.0945, "step": 470 }, { "epoch": 0.8392226148409894, "grad_norm": 0.0673828125, "learning_rate": 8.113900644526301e-06, "loss": 0.0988, "step": 475 }, { "epoch": 0.8480565371024735, "grad_norm": 0.06689453125, "learning_rate": 7.788174954808826e-06, "loss": 0.0973, "step": 480 }, { "epoch": 0.8568904593639576, "grad_norm": 0.06689453125, "learning_rate": 7.479314292512542e-06, "loss": 0.0972, "step": 485 }, { "epoch": 0.8657243816254417, "grad_norm": 0.06689453125, "learning_rate": 7.187582912394548e-06, "loss": 0.0977, "step": 490 }, { "epoch": 0.8745583038869258, "grad_norm": 0.06591796875, "learning_rate": 6.913230413754452e-06, "loss": 0.0962, "step": 495 }, { "epoch": 0.8833922261484098, "grad_norm": 0.06982421875, "learning_rate": 6.656491526882422e-06, "loss": 0.0958, "step": 500 }, { "epoch": 0.892226148409894, "grad_norm": 0.06494140625, "learning_rate": 6.417585912228833e-06, "loss": 0.0959, "step": 505 }, { "epoch": 0.901060070671378, "grad_norm": 0.0703125, "learning_rate": 6.196717972467361e-06, "loss": 0.0978, "step": 510 }, { "epoch": 0.9098939929328622, "grad_norm": 0.0673828125, "learning_rate": 5.994076677612297e-06, "loss": 0.0982, "step": 515 }, { "epoch": 0.9187279151943463, "grad_norm": 0.06640625, "learning_rate": 5.809835403339747e-06, "loss": 0.0971, "step": 520 }, { "epoch": 0.9275618374558304, "grad_norm": 0.06787109375, "learning_rate": 5.644151782650993e-06, "loss": 0.0953, "step": 525 }, { "epoch": 0.9363957597173145, "grad_norm": 0.06982421875, "learning_rate": 5.497167571004998e-06, "loss": 0.0956, "step": 530 }, { "epoch": 0.9452296819787986, "grad_norm": 0.06689453125, "learning_rate": 5.36900852503536e-06, "loss": 0.0963, "step": 535 }, { "epoch": 0.9540636042402827, "grad_norm": 0.06494140625, "learning_rate": 5.259784294955576e-06, "loss": 0.0985, "step": 540 }, { "epoch": 0.9628975265017667, "grad_norm": 0.06884765625, "learning_rate": 5.169588330744585e-06, "loss": 0.0982, "step": 545 }, { "epoch": 0.9717314487632509, "grad_norm": 0.068359375, "learning_rate": 5.098497802192923e-06, "loss": 0.0979, "step": 550 }, { "epoch": 0.980565371024735, "grad_norm": 0.06591796875, "learning_rate": 5.046573532877835e-06, "loss": 0.0977, "step": 555 }, { "epoch": 0.9893992932862191, "grad_norm": 0.06689453125, "learning_rate": 5.013859948123909e-06, "loss": 0.0976, "step": 560 }, { "epoch": 0.9982332155477032, "grad_norm": 0.06591796875, "learning_rate": 5.000385036993684e-06, "loss": 0.0977, "step": 565 }, { "epoch": 1.0, "step": 566, "total_flos": 5.828408442271826e+17, "train_loss": 0.10050818478102819, "train_runtime": 2227.4528, "train_samples_per_second": 32.502, "train_steps_per_second": 0.254 } ], "logging_steps": 5, "max_steps": 566, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.828408442271826e+17, "train_batch_size": 128, "trial_name": null, "trial_params": null }