{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2578815034491651, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005157630068983302, "grad_norm": 0.25815722346305847, "learning_rate": 0.00019998936857560623, "loss": 2.1913, "mean_token_accuracy": 0.3947448147460818, "num_tokens": 24791.0, "step": 10 }, { "epoch": 0.010315260137966605, "grad_norm": 0.3397273123264313, "learning_rate": 0.0001999526208749509, "loss": 1.839, "mean_token_accuracy": 0.46727082105353474, "num_tokens": 47657.0, "step": 20 }, { "epoch": 0.015472890206949906, "grad_norm": 0.31899315118789673, "learning_rate": 0.00019988963528997362, "loss": 1.6338, "mean_token_accuracy": 0.5085321174934506, "num_tokens": 71833.0, "step": 30 }, { "epoch": 0.02063052027593321, "grad_norm": 0.33701273798942566, "learning_rate": 0.00019980042835459288, "loss": 1.5019, "mean_token_accuracy": 0.5493818091228604, "num_tokens": 94695.0, "step": 40 }, { "epoch": 0.02578815034491651, "grad_norm": 0.34330254793167114, "learning_rate": 0.000199685023485916, "loss": 1.4607, "mean_token_accuracy": 0.5583337539806962, "num_tokens": 118278.0, "step": 50 }, { "epoch": 0.030945780413899813, "grad_norm": 0.3338906168937683, "learning_rate": 0.0001995434509780921, "loss": 1.3955, "mean_token_accuracy": 0.5763469154015184, "num_tokens": 142428.0, "step": 60 }, { "epoch": 0.03610341048288312, "grad_norm": 0.3558220863342285, "learning_rate": 0.00019937574799435957, "loss": 1.3424, "mean_token_accuracy": 0.5941972561180592, "num_tokens": 166242.0, "step": 70 }, { "epoch": 0.04126104055186642, "grad_norm": 0.34792616963386536, "learning_rate": 0.00019918195855729082, "loss": 1.3115, "mean_token_accuracy": 0.5970643986016512, "num_tokens": 189317.0, "step": 80 }, { "epoch": 0.04641867062084972, "grad_norm": 0.3633214831352234, "learning_rate": 0.00019896213353723613, "loss": 1.3081, "mean_token_accuracy": 0.5974381363019348, "num_tokens": 212852.0, "step": 90 }, { "epoch": 0.05157630068983302, "grad_norm": 0.420682817697525, "learning_rate": 0.00019871633063896994, "loss": 1.2799, "mean_token_accuracy": 0.6089719075709581, "num_tokens": 236286.0, "step": 100 }, { "epoch": 0.056733930758816324, "grad_norm": 0.37725815176963806, "learning_rate": 0.00019844461438654328, "loss": 1.2995, "mean_token_accuracy": 0.6003120748326183, "num_tokens": 259169.0, "step": 110 }, { "epoch": 0.061891560827799626, "grad_norm": 0.35575857758522034, "learning_rate": 0.000198147056106346, "loss": 1.261, "mean_token_accuracy": 0.6272577648982406, "num_tokens": 282438.0, "step": 120 }, { "epoch": 0.06704919089678293, "grad_norm": 0.38050541281700134, "learning_rate": 0.0001978237339083833, "loss": 1.2588, "mean_token_accuracy": 0.6213426964357496, "num_tokens": 305763.0, "step": 130 }, { "epoch": 0.07220682096576624, "grad_norm": 0.356004536151886, "learning_rate": 0.00019747473266577159, "loss": 1.1867, "mean_token_accuracy": 0.6391215188428759, "num_tokens": 330538.0, "step": 140 }, { "epoch": 0.07736445103474954, "grad_norm": 0.4616079032421112, "learning_rate": 0.00019710014399245906, "loss": 1.2113, "mean_token_accuracy": 0.6334994403645396, "num_tokens": 353239.0, "step": 150 }, { "epoch": 0.08252208110373284, "grad_norm": 0.45747244358062744, "learning_rate": 0.00019670006621917675, "loss": 1.1924, "mean_token_accuracy": 0.6410702392458916, "num_tokens": 375364.0, "step": 160 }, { "epoch": 0.08767971117271614, "grad_norm": 0.3868316411972046, "learning_rate": 0.0001962746043676264, "loss": 1.2024, "mean_token_accuracy": 0.6303978456184268, "num_tokens": 399123.0, "step": 170 }, { "epoch": 0.09283734124169944, "grad_norm": 0.40062034130096436, "learning_rate": 0.00019582387012291182, "loss": 1.1887, "mean_token_accuracy": 0.638477023690939, "num_tokens": 421761.0, "step": 180 }, { "epoch": 0.09799497131068274, "grad_norm": 0.40518611669540405, "learning_rate": 0.00019534798180422138, "loss": 1.1861, "mean_token_accuracy": 0.6374255264177918, "num_tokens": 445675.0, "step": 190 }, { "epoch": 0.10315260137966605, "grad_norm": 0.4045695960521698, "learning_rate": 0.0001948470643337687, "loss": 1.1445, "mean_token_accuracy": 0.641272259876132, "num_tokens": 469975.0, "step": 200 }, { "epoch": 0.10831023144864935, "grad_norm": 0.3835983872413635, "learning_rate": 0.00019432124920400017, "loss": 1.1414, "mean_token_accuracy": 0.6493382846936584, "num_tokens": 493727.0, "step": 210 }, { "epoch": 0.11346786151763265, "grad_norm": 0.39580345153808594, "learning_rate": 0.0001937706744430778, "loss": 1.1333, "mean_token_accuracy": 0.6460228314623236, "num_tokens": 516991.0, "step": 220 }, { "epoch": 0.11862549158661595, "grad_norm": 0.392665833234787, "learning_rate": 0.00019319548457864648, "loss": 1.1408, "mean_token_accuracy": 0.6520120551809668, "num_tokens": 541253.0, "step": 230 }, { "epoch": 0.12378312165559925, "grad_norm": 0.3695116639137268, "learning_rate": 0.0001925958305998947, "loss": 1.11, "mean_token_accuracy": 0.6565015500411391, "num_tokens": 565471.0, "step": 240 }, { "epoch": 0.12894075172458255, "grad_norm": 0.38127511739730835, "learning_rate": 0.0001919718699179199, "loss": 1.0965, "mean_token_accuracy": 0.6642474669963121, "num_tokens": 589030.0, "step": 250 }, { "epoch": 0.13409838179356587, "grad_norm": 0.3783718943595886, "learning_rate": 0.00019132376632440695, "loss": 1.062, "mean_token_accuracy": 0.6770766332745553, "num_tokens": 612514.0, "step": 260 }, { "epoch": 0.13925601186254916, "grad_norm": 0.42868489027023315, "learning_rate": 0.00019065168994863288, "loss": 1.1059, "mean_token_accuracy": 0.6585826754570008, "num_tokens": 635574.0, "step": 270 }, { "epoch": 0.14441364193153247, "grad_norm": 0.4161641299724579, "learning_rate": 0.00018995581721280695, "loss": 1.0985, "mean_token_accuracy": 0.6587576447054744, "num_tokens": 659029.0, "step": 280 }, { "epoch": 0.14957127200051576, "grad_norm": 0.36837488412857056, "learning_rate": 0.00018923633078575953, "loss": 1.0987, "mean_token_accuracy": 0.6716255461797118, "num_tokens": 682537.0, "step": 290 }, { "epoch": 0.15472890206949907, "grad_norm": 0.3812052309513092, "learning_rate": 0.0001884934195349908, "loss": 1.0731, "mean_token_accuracy": 0.6624803204089403, "num_tokens": 705616.0, "step": 300 }, { "epoch": 0.15988653213848236, "grad_norm": 0.38784265518188477, "learning_rate": 0.00018772727847709257, "loss": 1.0669, "mean_token_accuracy": 0.6701639717444777, "num_tokens": 729415.0, "step": 310 }, { "epoch": 0.16504416220746568, "grad_norm": 0.3632284700870514, "learning_rate": 0.00018693810872655558, "loss": 1.074, "mean_token_accuracy": 0.6647017451003194, "num_tokens": 753385.0, "step": 320 }, { "epoch": 0.17020179227644897, "grad_norm": 0.4154379069805145, "learning_rate": 0.0001861261174429765, "loss": 1.0724, "mean_token_accuracy": 0.6690206056460738, "num_tokens": 776884.0, "step": 330 }, { "epoch": 0.17535942234543228, "grad_norm": 0.4121210277080536, "learning_rate": 0.00018529151777667784, "loss": 1.0599, "mean_token_accuracy": 0.674660662189126, "num_tokens": 800821.0, "step": 340 }, { "epoch": 0.18051705241441557, "grad_norm": 0.4217364192008972, "learning_rate": 0.00018443452881275512, "loss": 1.0652, "mean_token_accuracy": 0.6764787383377552, "num_tokens": 823505.0, "step": 350 }, { "epoch": 0.18567468248339888, "grad_norm": 0.43876639008522034, "learning_rate": 0.00018355537551356654, "loss": 1.0353, "mean_token_accuracy": 0.684059496410191, "num_tokens": 846313.0, "step": 360 }, { "epoch": 0.19083231255238217, "grad_norm": 0.377739816904068, "learning_rate": 0.0001826542886596796, "loss": 1.0532, "mean_token_accuracy": 0.6820366451516747, "num_tokens": 869767.0, "step": 370 }, { "epoch": 0.1959899426213655, "grad_norm": 0.38219141960144043, "learning_rate": 0.00018173150478929042, "loss": 1.0524, "mean_token_accuracy": 0.6820811878889799, "num_tokens": 893966.0, "step": 380 }, { "epoch": 0.20114757269034877, "grad_norm": 0.3853937089443207, "learning_rate": 0.00018078726613613162, "loss": 1.0277, "mean_token_accuracy": 0.687343406304717, "num_tokens": 917272.0, "step": 390 }, { "epoch": 0.2063052027593321, "grad_norm": 0.36827078461647034, "learning_rate": 0.00017982182056588535, "loss": 1.0081, "mean_token_accuracy": 0.6875007605180145, "num_tokens": 940965.0, "step": 400 }, { "epoch": 0.21146283282831538, "grad_norm": 0.41124311089515686, "learning_rate": 0.00017883542151111764, "loss": 1.0568, "mean_token_accuracy": 0.6763140456750989, "num_tokens": 965284.0, "step": 410 }, { "epoch": 0.2166204628972987, "grad_norm": 0.4158463776111603, "learning_rate": 0.00017782832790475166, "loss": 1.046, "mean_token_accuracy": 0.67484475299716, "num_tokens": 989038.0, "step": 420 }, { "epoch": 0.22177809296628198, "grad_norm": 0.33250564336776733, "learning_rate": 0.00017680080411209677, "loss": 1.0307, "mean_token_accuracy": 0.6823460660874844, "num_tokens": 1013429.0, "step": 430 }, { "epoch": 0.2269357230352653, "grad_norm": 0.3930635154247284, "learning_rate": 0.00017575311986145196, "loss": 1.0365, "mean_token_accuracy": 0.6863100994378328, "num_tokens": 1037050.0, "step": 440 }, { "epoch": 0.23209335310424858, "grad_norm": 0.3810296952724457, "learning_rate": 0.0001746855501733013, "loss": 1.041, "mean_token_accuracy": 0.6770287297666073, "num_tokens": 1060608.0, "step": 450 }, { "epoch": 0.2372509831732319, "grad_norm": 0.43654826283454895, "learning_rate": 0.00017359837528812012, "loss": 1.0147, "mean_token_accuracy": 0.6897374652326107, "num_tokens": 1084685.0, "step": 460 }, { "epoch": 0.24240861324221522, "grad_norm": 0.38834720849990845, "learning_rate": 0.00017249188059281098, "loss": 0.9982, "mean_token_accuracy": 0.6943748012185097, "num_tokens": 1107888.0, "step": 470 }, { "epoch": 0.2475662433111985, "grad_norm": 0.36283308267593384, "learning_rate": 0.0001713663565457887, "loss": 0.9835, "mean_token_accuracy": 0.7002836445346474, "num_tokens": 1130809.0, "step": 480 }, { "epoch": 0.2527238733801818, "grad_norm": 0.3753542900085449, "learning_rate": 0.00017022209860073414, "loss": 1.0063, "mean_token_accuracy": 0.6868171758949757, "num_tokens": 1154529.0, "step": 490 }, { "epoch": 0.2578815034491651, "grad_norm": 0.3620479106903076, "learning_rate": 0.00016905940712903662, "loss": 0.9876, "mean_token_accuracy": 0.7012953195720911, "num_tokens": 1178719.0, "step": 500 } ], "logging_steps": 10, "max_steps": 1939, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.925092470733824e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }