{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.23675643681562591, "eval_steps": 200, "global_step": 800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002959455460195324, "eval_loss": 10.376261711120605, "eval_runtime": 10.819, "eval_samples_per_second": 138.829, "eval_steps_per_second": 34.754, "step": 1 }, { "epoch": 0.002959455460195324, "grad_norm": 0.298828125, "learning_rate": 1.6000000000000003e-05, "loss": 10.3804, "step": 10 }, { "epoch": 0.005918910920390648, "grad_norm": 0.357421875, "learning_rate": 3.2000000000000005e-05, "loss": 10.3767, "step": 20 }, { "epoch": 0.008878366380585973, "grad_norm": 0.443359375, "learning_rate": 4.8e-05, "loss": 10.3754, "step": 30 }, { "epoch": 0.011837821840781295, "grad_norm": 0.5625, "learning_rate": 6.400000000000001e-05, "loss": 10.3767, "step": 40 }, { "epoch": 0.01479727730097662, "grad_norm": 1.109375, "learning_rate": 8e-05, "loss": 10.3722, "step": 50 }, { "epoch": 0.017756732761171946, "grad_norm": 0.294921875, "learning_rate": 9.6e-05, "loss": 10.3804, "step": 60 }, { "epoch": 0.020716188221367268, "grad_norm": 0.373046875, "learning_rate": 0.00011200000000000001, "loss": 10.3739, "step": 70 }, { "epoch": 0.02367564368156259, "grad_norm": 0.42578125, "learning_rate": 0.00012800000000000002, "loss": 10.3736, "step": 80 }, { "epoch": 0.026635099141757917, "grad_norm": 0.70703125, "learning_rate": 0.000144, "loss": 10.3643, "step": 90 }, { "epoch": 0.02959455460195324, "grad_norm": 1.59375, "learning_rate": 0.00016, "loss": 10.364, "step": 100 }, { "epoch": 0.032554010062148565, "grad_norm": 0.478515625, "learning_rate": 0.00017600000000000002, "loss": 10.3561, "step": 110 }, { "epoch": 0.03551346552234389, "grad_norm": 0.73046875, "learning_rate": 0.000192, "loss": 10.3211, "step": 120 }, { "epoch": 0.03847292098253921, "grad_norm": 0.84375, "learning_rate": 0.0001999978128380225, "loss": 10.2582, "step": 130 }, { "epoch": 0.041432376442734536, "grad_norm": 0.72265625, "learning_rate": 0.0001999803161162393, "loss": 10.172, "step": 140 }, { "epoch": 0.04439183190292986, "grad_norm": 1.2734375, "learning_rate": 0.00019994532573409262, "loss": 10.1033, "step": 150 }, { "epoch": 0.04735128736312518, "grad_norm": 0.43359375, "learning_rate": 0.00019989284781388617, "loss": 10.0041, "step": 160 }, { "epoch": 0.05031074282332051, "grad_norm": 0.41796875, "learning_rate": 0.00019982289153773646, "loss": 9.9331, "step": 170 }, { "epoch": 0.053270198283515834, "grad_norm": 0.46875, "learning_rate": 0.00019973546914596623, "loss": 9.8548, "step": 180 }, { "epoch": 0.05622965374371116, "grad_norm": 0.64453125, "learning_rate": 0.00019963059593496268, "loss": 9.7692, "step": 190 }, { "epoch": 0.05918910920390648, "grad_norm": 1.140625, "learning_rate": 0.00019950829025450114, "loss": 9.7054, "step": 200 }, { "epoch": 0.05918910920390648, "eval_loss": 9.686193466186523, "eval_runtime": 20.1405, "eval_samples_per_second": 74.576, "eval_steps_per_second": 18.669, "step": 200 }, { "epoch": 0.062148564664101805, "grad_norm": 0.46484375, "learning_rate": 0.0001993685735045343, "loss": 9.6486, "step": 210 }, { "epoch": 0.06510802012429713, "grad_norm": 0.51171875, "learning_rate": 0.0001992114701314478, "loss": 9.6029, "step": 220 }, { "epoch": 0.06806747558449246, "grad_norm": 0.5078125, "learning_rate": 0.000199037007623783, "loss": 9.5554, "step": 230 }, { "epoch": 0.07102693104468778, "grad_norm": 0.609375, "learning_rate": 0.00019884521650742715, "loss": 9.4941, "step": 240 }, { "epoch": 0.0739863865048831, "grad_norm": 1.78125, "learning_rate": 0.00019863613034027224, "loss": 9.508, "step": 250 }, { "epoch": 0.07694584196507842, "grad_norm": 0.5078125, "learning_rate": 0.0001984097857063434, "loss": 9.3502, "step": 260 }, { "epoch": 0.07990529742527375, "grad_norm": 0.55859375, "learning_rate": 0.0001981662222093976, "loss": 9.3473, "step": 270 }, { "epoch": 0.08286475288546907, "grad_norm": 0.5234375, "learning_rate": 0.00019790548246599447, "loss": 9.2955, "step": 280 }, { "epoch": 0.0858242083456644, "grad_norm": 0.625, "learning_rate": 0.00019762761209803927, "loss": 9.2712, "step": 290 }, { "epoch": 0.08878366380585972, "grad_norm": 1.140625, "learning_rate": 0.0001973326597248006, "loss": 9.2969, "step": 300 }, { "epoch": 0.09174311926605505, "grad_norm": 0.455078125, "learning_rate": 0.00019702067695440332, "loss": 9.1616, "step": 310 }, { "epoch": 0.09470257472625036, "grad_norm": 0.4609375, "learning_rate": 0.00019669171837479873, "loss": 9.1605, "step": 320 }, { "epoch": 0.09766203018644569, "grad_norm": 0.474609375, "learning_rate": 0.00019634584154421317, "loss": 9.1402, "step": 330 }, { "epoch": 0.10062148564664102, "grad_norm": 0.578125, "learning_rate": 0.00019598310698107702, "loss": 9.0839, "step": 340 }, { "epoch": 0.10358094110683634, "grad_norm": 1.296875, "learning_rate": 0.00019560357815343577, "loss": 9.0709, "step": 350 }, { "epoch": 0.10654039656703167, "grad_norm": 0.57421875, "learning_rate": 0.00019520732146784491, "loss": 9.0372, "step": 360 }, { "epoch": 0.109499852027227, "grad_norm": 0.76953125, "learning_rate": 0.0001947944062577507, "loss": 9.0209, "step": 370 }, { "epoch": 0.11245930748742232, "grad_norm": 0.5390625, "learning_rate": 0.00019436490477135878, "loss": 8.9724, "step": 380 }, { "epoch": 0.11541876294761765, "grad_norm": 0.6171875, "learning_rate": 0.00019391889215899299, "loss": 9.0212, "step": 390 }, { "epoch": 0.11837821840781296, "grad_norm": 1.421875, "learning_rate": 0.0001934564464599461, "loss": 8.9091, "step": 400 }, { "epoch": 0.11837821840781296, "eval_loss": 8.961220741271973, "eval_runtime": 13.0065, "eval_samples_per_second": 115.48, "eval_steps_per_second": 28.909, "step": 400 }, { "epoch": 0.12133767386800828, "grad_norm": 0.443359375, "learning_rate": 0.00019297764858882514, "loss": 8.9547, "step": 410 }, { "epoch": 0.12429712932820361, "grad_norm": 0.466796875, "learning_rate": 0.00019248258232139388, "loss": 8.9394, "step": 420 }, { "epoch": 0.12725658478839894, "grad_norm": 0.61328125, "learning_rate": 0.00019197133427991436, "loss": 8.9748, "step": 430 }, { "epoch": 0.13021604024859426, "grad_norm": 0.73046875, "learning_rate": 0.00019144399391799043, "loss": 8.9198, "step": 440 }, { "epoch": 0.1331754957087896, "grad_norm": 1.203125, "learning_rate": 0.00019090065350491626, "loss": 8.8904, "step": 450 }, { "epoch": 0.1361349511689849, "grad_norm": 0.494140625, "learning_rate": 0.0001903414081095315, "loss": 8.8971, "step": 460 }, { "epoch": 0.13909440662918024, "grad_norm": 0.48046875, "learning_rate": 0.00018976635558358722, "loss": 8.84, "step": 470 }, { "epoch": 0.14205386208937557, "grad_norm": 0.55859375, "learning_rate": 0.00018917559654462474, "loss": 8.838, "step": 480 }, { "epoch": 0.1450133175495709, "grad_norm": 0.5703125, "learning_rate": 0.00018856923435837022, "loss": 8.7761, "step": 490 }, { "epoch": 0.1479727730097662, "grad_norm": 0.96875, "learning_rate": 0.0001879473751206489, "loss": 8.8421, "step": 500 }, { "epoch": 0.15093222846996152, "grad_norm": 0.478515625, "learning_rate": 0.00018731012763882133, "loss": 8.7691, "step": 510 }, { "epoch": 0.15389168393015684, "grad_norm": 0.4921875, "learning_rate": 0.00018665760341274505, "loss": 8.7749, "step": 520 }, { "epoch": 0.15685113939035217, "grad_norm": 0.51171875, "learning_rate": 0.00018598991661526572, "loss": 8.79, "step": 530 }, { "epoch": 0.1598105948505475, "grad_norm": 0.58203125, "learning_rate": 0.00018530718407223974, "loss": 8.8742, "step": 540 }, { "epoch": 0.16277005031074282, "grad_norm": 1.234375, "learning_rate": 0.00018460952524209355, "loss": 8.7845, "step": 550 }, { "epoch": 0.16572950577093815, "grad_norm": 0.470703125, "learning_rate": 0.00018389706219492147, "loss": 8.8165, "step": 560 }, { "epoch": 0.16868896123113347, "grad_norm": 0.486328125, "learning_rate": 0.00018316991959112716, "loss": 8.7024, "step": 570 }, { "epoch": 0.1716484166913288, "grad_norm": 0.53515625, "learning_rate": 0.00018242822465961176, "loss": 8.7764, "step": 580 }, { "epoch": 0.17460787215152412, "grad_norm": 0.58984375, "learning_rate": 0.00018167210717551224, "loss": 8.7501, "step": 590 }, { "epoch": 0.17756732761171945, "grad_norm": 1.28125, "learning_rate": 0.00018090169943749476, "loss": 8.7257, "step": 600 }, { "epoch": 0.17756732761171945, "eval_loss": 8.762685775756836, "eval_runtime": 18.9408, "eval_samples_per_second": 79.3, "eval_steps_per_second": 19.851, "step": 600 }, { "epoch": 0.18052678307191478, "grad_norm": 0.54296875, "learning_rate": 0.00018011713624460608, "loss": 8.7709, "step": 610 }, { "epoch": 0.1834862385321101, "grad_norm": 0.53515625, "learning_rate": 0.00017931855487268782, "loss": 8.7334, "step": 620 }, { "epoch": 0.18644569399230543, "grad_norm": 0.56640625, "learning_rate": 0.0001785060950503568, "loss": 8.824, "step": 630 }, { "epoch": 0.18940514945250073, "grad_norm": 0.69921875, "learning_rate": 0.00017767989893455698, "loss": 8.6731, "step": 640 }, { "epoch": 0.19236460491269605, "grad_norm": 0.90625, "learning_rate": 0.00017684011108568592, "loss": 8.7669, "step": 650 }, { "epoch": 0.19532406037289138, "grad_norm": 0.49609375, "learning_rate": 0.00017598687844230088, "loss": 8.6911, "step": 660 }, { "epoch": 0.1982835158330867, "grad_norm": 0.44140625, "learning_rate": 0.00017512035029540885, "loss": 8.6932, "step": 670 }, { "epoch": 0.20124297129328203, "grad_norm": 0.52734375, "learning_rate": 0.000174240678262345, "loss": 8.71, "step": 680 }, { "epoch": 0.20420242675347736, "grad_norm": 0.59375, "learning_rate": 0.000173348016260244, "loss": 8.7219, "step": 690 }, { "epoch": 0.20716188221367268, "grad_norm": 1.3515625, "learning_rate": 0.00017244252047910892, "loss": 8.6973, "step": 700 }, { "epoch": 0.210121337673868, "grad_norm": 0.462890625, "learning_rate": 0.00017152434935448256, "loss": 8.6743, "step": 710 }, { "epoch": 0.21308079313406333, "grad_norm": 0.451171875, "learning_rate": 0.0001705936635397259, "loss": 8.7094, "step": 720 }, { "epoch": 0.21604024859425866, "grad_norm": 0.57421875, "learning_rate": 0.00016965062587790823, "loss": 8.7353, "step": 730 }, { "epoch": 0.218999704054454, "grad_norm": 0.5546875, "learning_rate": 0.00016869540137331445, "loss": 8.6939, "step": 740 }, { "epoch": 0.2219591595146493, "grad_norm": 1.0703125, "learning_rate": 0.00016772815716257412, "loss": 8.7202, "step": 750 }, { "epoch": 0.22491861497484464, "grad_norm": 0.51171875, "learning_rate": 0.00016674906248541726, "loss": 8.6779, "step": 760 }, { "epoch": 0.22787807043503996, "grad_norm": 0.671875, "learning_rate": 0.00016575828865506245, "loss": 8.6627, "step": 770 }, { "epoch": 0.2308375258952353, "grad_norm": 0.4375, "learning_rate": 0.0001647560090282419, "loss": 8.7348, "step": 780 }, { "epoch": 0.2337969813554306, "grad_norm": 0.6875, "learning_rate": 0.000163742398974869, "loss": 8.7236, "step": 790 }, { "epoch": 0.23675643681562591, "grad_norm": 1.4140625, "learning_rate": 0.0001627176358473537, "loss": 8.7416, "step": 800 }, { "epoch": 0.23675643681562591, "eval_loss": 8.710856437683105, "eval_runtime": 16.7859, "eval_samples_per_second": 89.48, "eval_steps_per_second": 22.4, "step": 800 } ], "logging_steps": 10, "max_steps": 2500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 400, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 20509072293888.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }