| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.2578815034491651, | |
| "eval_steps": 500, | |
| "global_step": 500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.005157630068983302, | |
| "grad_norm": 0.25815722346305847, | |
| "learning_rate": 0.00019998936857560623, | |
| "loss": 2.1913, | |
| "mean_token_accuracy": 0.3947448147460818, | |
| "num_tokens": 24791.0, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.010315260137966605, | |
| "grad_norm": 0.3397273123264313, | |
| "learning_rate": 0.0001999526208749509, | |
| "loss": 1.839, | |
| "mean_token_accuracy": 0.46727082105353474, | |
| "num_tokens": 47657.0, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.015472890206949906, | |
| "grad_norm": 0.31899315118789673, | |
| "learning_rate": 0.00019988963528997362, | |
| "loss": 1.6338, | |
| "mean_token_accuracy": 0.5085321174934506, | |
| "num_tokens": 71833.0, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.02063052027593321, | |
| "grad_norm": 0.33701273798942566, | |
| "learning_rate": 0.00019980042835459288, | |
| "loss": 1.5019, | |
| "mean_token_accuracy": 0.5493818091228604, | |
| "num_tokens": 94695.0, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.02578815034491651, | |
| "grad_norm": 0.34330254793167114, | |
| "learning_rate": 0.000199685023485916, | |
| "loss": 1.4607, | |
| "mean_token_accuracy": 0.5583337539806962, | |
| "num_tokens": 118278.0, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.030945780413899813, | |
| "grad_norm": 0.3338906168937683, | |
| "learning_rate": 0.0001995434509780921, | |
| "loss": 1.3955, | |
| "mean_token_accuracy": 0.5763469154015184, | |
| "num_tokens": 142428.0, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.03610341048288312, | |
| "grad_norm": 0.3558220863342285, | |
| "learning_rate": 0.00019937574799435957, | |
| "loss": 1.3424, | |
| "mean_token_accuracy": 0.5941972561180592, | |
| "num_tokens": 166242.0, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.04126104055186642, | |
| "grad_norm": 0.34792616963386536, | |
| "learning_rate": 0.00019918195855729082, | |
| "loss": 1.3115, | |
| "mean_token_accuracy": 0.5970643986016512, | |
| "num_tokens": 189317.0, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.04641867062084972, | |
| "grad_norm": 0.3633214831352234, | |
| "learning_rate": 0.00019896213353723613, | |
| "loss": 1.3081, | |
| "mean_token_accuracy": 0.5974381363019348, | |
| "num_tokens": 212852.0, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.05157630068983302, | |
| "grad_norm": 0.420682817697525, | |
| "learning_rate": 0.00019871633063896994, | |
| "loss": 1.2799, | |
| "mean_token_accuracy": 0.6089719075709581, | |
| "num_tokens": 236286.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.056733930758816324, | |
| "grad_norm": 0.37725815176963806, | |
| "learning_rate": 0.00019844461438654328, | |
| "loss": 1.2995, | |
| "mean_token_accuracy": 0.6003120748326183, | |
| "num_tokens": 259169.0, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.061891560827799626, | |
| "grad_norm": 0.35575857758522034, | |
| "learning_rate": 0.000198147056106346, | |
| "loss": 1.261, | |
| "mean_token_accuracy": 0.6272577648982406, | |
| "num_tokens": 282438.0, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.06704919089678293, | |
| "grad_norm": 0.38050541281700134, | |
| "learning_rate": 0.0001978237339083833, | |
| "loss": 1.2588, | |
| "mean_token_accuracy": 0.6213426964357496, | |
| "num_tokens": 305763.0, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.07220682096576624, | |
| "grad_norm": 0.356004536151886, | |
| "learning_rate": 0.00019747473266577159, | |
| "loss": 1.1867, | |
| "mean_token_accuracy": 0.6391215188428759, | |
| "num_tokens": 330538.0, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.07736445103474954, | |
| "grad_norm": 0.4616079032421112, | |
| "learning_rate": 0.00019710014399245906, | |
| "loss": 1.2113, | |
| "mean_token_accuracy": 0.6334994403645396, | |
| "num_tokens": 353239.0, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.08252208110373284, | |
| "grad_norm": 0.45747244358062744, | |
| "learning_rate": 0.00019670006621917675, | |
| "loss": 1.1924, | |
| "mean_token_accuracy": 0.6410702392458916, | |
| "num_tokens": 375364.0, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.08767971117271614, | |
| "grad_norm": 0.3868316411972046, | |
| "learning_rate": 0.0001962746043676264, | |
| "loss": 1.2024, | |
| "mean_token_accuracy": 0.6303978456184268, | |
| "num_tokens": 399123.0, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.09283734124169944, | |
| "grad_norm": 0.40062034130096436, | |
| "learning_rate": 0.00019582387012291182, | |
| "loss": 1.1887, | |
| "mean_token_accuracy": 0.638477023690939, | |
| "num_tokens": 421761.0, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.09799497131068274, | |
| "grad_norm": 0.40518611669540405, | |
| "learning_rate": 0.00019534798180422138, | |
| "loss": 1.1861, | |
| "mean_token_accuracy": 0.6374255264177918, | |
| "num_tokens": 445675.0, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.10315260137966605, | |
| "grad_norm": 0.4045695960521698, | |
| "learning_rate": 0.0001948470643337687, | |
| "loss": 1.1445, | |
| "mean_token_accuracy": 0.641272259876132, | |
| "num_tokens": 469975.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.10831023144864935, | |
| "grad_norm": 0.3835983872413635, | |
| "learning_rate": 0.00019432124920400017, | |
| "loss": 1.1414, | |
| "mean_token_accuracy": 0.6493382846936584, | |
| "num_tokens": 493727.0, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.11346786151763265, | |
| "grad_norm": 0.39580345153808594, | |
| "learning_rate": 0.0001937706744430778, | |
| "loss": 1.1333, | |
| "mean_token_accuracy": 0.6460228314623236, | |
| "num_tokens": 516991.0, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.11862549158661595, | |
| "grad_norm": 0.392665833234787, | |
| "learning_rate": 0.00019319548457864648, | |
| "loss": 1.1408, | |
| "mean_token_accuracy": 0.6520120551809668, | |
| "num_tokens": 541253.0, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.12378312165559925, | |
| "grad_norm": 0.3695116639137268, | |
| "learning_rate": 0.0001925958305998947, | |
| "loss": 1.11, | |
| "mean_token_accuracy": 0.6565015500411391, | |
| "num_tokens": 565471.0, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.12894075172458255, | |
| "grad_norm": 0.38127511739730835, | |
| "learning_rate": 0.0001919718699179199, | |
| "loss": 1.0965, | |
| "mean_token_accuracy": 0.6642474669963121, | |
| "num_tokens": 589030.0, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.13409838179356587, | |
| "grad_norm": 0.3783718943595886, | |
| "learning_rate": 0.00019132376632440695, | |
| "loss": 1.062, | |
| "mean_token_accuracy": 0.6770766332745553, | |
| "num_tokens": 612514.0, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.13925601186254916, | |
| "grad_norm": 0.42868489027023315, | |
| "learning_rate": 0.00019065168994863288, | |
| "loss": 1.1059, | |
| "mean_token_accuracy": 0.6585826754570008, | |
| "num_tokens": 635574.0, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.14441364193153247, | |
| "grad_norm": 0.4161641299724579, | |
| "learning_rate": 0.00018995581721280695, | |
| "loss": 1.0985, | |
| "mean_token_accuracy": 0.6587576447054744, | |
| "num_tokens": 659029.0, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.14957127200051576, | |
| "grad_norm": 0.36837488412857056, | |
| "learning_rate": 0.00018923633078575953, | |
| "loss": 1.0987, | |
| "mean_token_accuracy": 0.6716255461797118, | |
| "num_tokens": 682537.0, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.15472890206949907, | |
| "grad_norm": 0.3812052309513092, | |
| "learning_rate": 0.0001884934195349908, | |
| "loss": 1.0731, | |
| "mean_token_accuracy": 0.6624803204089403, | |
| "num_tokens": 705616.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.15988653213848236, | |
| "grad_norm": 0.38784265518188477, | |
| "learning_rate": 0.00018772727847709257, | |
| "loss": 1.0669, | |
| "mean_token_accuracy": 0.6701639717444777, | |
| "num_tokens": 729415.0, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.16504416220746568, | |
| "grad_norm": 0.3632284700870514, | |
| "learning_rate": 0.00018693810872655558, | |
| "loss": 1.074, | |
| "mean_token_accuracy": 0.6647017451003194, | |
| "num_tokens": 753385.0, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.17020179227644897, | |
| "grad_norm": 0.4154379069805145, | |
| "learning_rate": 0.0001861261174429765, | |
| "loss": 1.0724, | |
| "mean_token_accuracy": 0.6690206056460738, | |
| "num_tokens": 776884.0, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.17535942234543228, | |
| "grad_norm": 0.4121210277080536, | |
| "learning_rate": 0.00018529151777667784, | |
| "loss": 1.0599, | |
| "mean_token_accuracy": 0.674660662189126, | |
| "num_tokens": 800821.0, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.18051705241441557, | |
| "grad_norm": 0.4217364192008972, | |
| "learning_rate": 0.00018443452881275512, | |
| "loss": 1.0652, | |
| "mean_token_accuracy": 0.6764787383377552, | |
| "num_tokens": 823505.0, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.18567468248339888, | |
| "grad_norm": 0.43876639008522034, | |
| "learning_rate": 0.00018355537551356654, | |
| "loss": 1.0353, | |
| "mean_token_accuracy": 0.684059496410191, | |
| "num_tokens": 846313.0, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.19083231255238217, | |
| "grad_norm": 0.377739816904068, | |
| "learning_rate": 0.0001826542886596796, | |
| "loss": 1.0532, | |
| "mean_token_accuracy": 0.6820366451516747, | |
| "num_tokens": 869767.0, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.1959899426213655, | |
| "grad_norm": 0.38219141960144043, | |
| "learning_rate": 0.00018173150478929042, | |
| "loss": 1.0524, | |
| "mean_token_accuracy": 0.6820811878889799, | |
| "num_tokens": 893966.0, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.20114757269034877, | |
| "grad_norm": 0.3853937089443207, | |
| "learning_rate": 0.00018078726613613162, | |
| "loss": 1.0277, | |
| "mean_token_accuracy": 0.687343406304717, | |
| "num_tokens": 917272.0, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.2063052027593321, | |
| "grad_norm": 0.36827078461647034, | |
| "learning_rate": 0.00017982182056588535, | |
| "loss": 1.0081, | |
| "mean_token_accuracy": 0.6875007605180145, | |
| "num_tokens": 940965.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.21146283282831538, | |
| "grad_norm": 0.41124311089515686, | |
| "learning_rate": 0.00017883542151111764, | |
| "loss": 1.0568, | |
| "mean_token_accuracy": 0.6763140456750989, | |
| "num_tokens": 965284.0, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.2166204628972987, | |
| "grad_norm": 0.4158463776111603, | |
| "learning_rate": 0.00017782832790475166, | |
| "loss": 1.046, | |
| "mean_token_accuracy": 0.67484475299716, | |
| "num_tokens": 989038.0, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.22177809296628198, | |
| "grad_norm": 0.33250564336776733, | |
| "learning_rate": 0.00017680080411209677, | |
| "loss": 1.0307, | |
| "mean_token_accuracy": 0.6823460660874844, | |
| "num_tokens": 1013429.0, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.2269357230352653, | |
| "grad_norm": 0.3930635154247284, | |
| "learning_rate": 0.00017575311986145196, | |
| "loss": 1.0365, | |
| "mean_token_accuracy": 0.6863100994378328, | |
| "num_tokens": 1037050.0, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.23209335310424858, | |
| "grad_norm": 0.3810296952724457, | |
| "learning_rate": 0.0001746855501733013, | |
| "loss": 1.041, | |
| "mean_token_accuracy": 0.6770287297666073, | |
| "num_tokens": 1060608.0, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.2372509831732319, | |
| "grad_norm": 0.43654826283454895, | |
| "learning_rate": 0.00017359837528812012, | |
| "loss": 1.0147, | |
| "mean_token_accuracy": 0.6897374652326107, | |
| "num_tokens": 1084685.0, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.24240861324221522, | |
| "grad_norm": 0.38834720849990845, | |
| "learning_rate": 0.00017249188059281098, | |
| "loss": 0.9982, | |
| "mean_token_accuracy": 0.6943748012185097, | |
| "num_tokens": 1107888.0, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.2475662433111985, | |
| "grad_norm": 0.36283308267593384, | |
| "learning_rate": 0.0001713663565457887, | |
| "loss": 0.9835, | |
| "mean_token_accuracy": 0.7002836445346474, | |
| "num_tokens": 1130809.0, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.2527238733801818, | |
| "grad_norm": 0.3753542900085449, | |
| "learning_rate": 0.00017022209860073414, | |
| "loss": 1.0063, | |
| "mean_token_accuracy": 0.6868171758949757, | |
| "num_tokens": 1154529.0, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.2578815034491651, | |
| "grad_norm": 0.3620479106903076, | |
| "learning_rate": 0.00016905940712903662, | |
| "loss": 0.9876, | |
| "mean_token_accuracy": 0.7012953195720911, | |
| "num_tokens": 1178719.0, | |
| "step": 500 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 1939, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.925092470733824e+16, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |