{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.23675643681562591,
  "eval_steps": 200,
  "global_step": 800,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0002959455460195324,
      "eval_loss": 10.376261711120605,
      "eval_runtime": 10.819,
      "eval_samples_per_second": 138.829,
      "eval_steps_per_second": 34.754,
      "step": 1
    },
    {
      "epoch": 0.002959455460195324,
      "grad_norm": 0.298828125,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 10.3804,
      "step": 10
    },
    {
      "epoch": 0.005918910920390648,
      "grad_norm": 0.357421875,
      "learning_rate": 3.2000000000000005e-05,
      "loss": 10.3767,
      "step": 20
    },
    {
      "epoch": 0.008878366380585973,
      "grad_norm": 0.443359375,
      "learning_rate": 4.8e-05,
      "loss": 10.3754,
      "step": 30
    },
    {
      "epoch": 0.011837821840781295,
      "grad_norm": 0.5625,
      "learning_rate": 6.400000000000001e-05,
      "loss": 10.3767,
      "step": 40
    },
    {
      "epoch": 0.01479727730097662,
      "grad_norm": 1.109375,
      "learning_rate": 8e-05,
      "loss": 10.3722,
      "step": 50
    },
    {
      "epoch": 0.017756732761171946,
      "grad_norm": 0.294921875,
      "learning_rate": 9.6e-05,
      "loss": 10.3804,
      "step": 60
    },
    {
      "epoch": 0.020716188221367268,
      "grad_norm": 0.373046875,
      "learning_rate": 0.00011200000000000001,
      "loss": 10.3739,
      "step": 70
    },
    {
      "epoch": 0.02367564368156259,
      "grad_norm": 0.42578125,
      "learning_rate": 0.00012800000000000002,
      "loss": 10.3736,
      "step": 80
    },
    {
      "epoch": 0.026635099141757917,
      "grad_norm": 0.70703125,
      "learning_rate": 0.000144,
      "loss": 10.3643,
      "step": 90
    },
    {
      "epoch": 0.02959455460195324,
      "grad_norm": 1.59375,
      "learning_rate": 0.00016,
      "loss": 10.364,
      "step": 100
    },
    {
      "epoch": 0.032554010062148565,
      "grad_norm": 0.478515625,
      "learning_rate": 0.00017600000000000002,
      "loss": 10.3561,
      "step": 110
    },
    {
      "epoch": 0.03551346552234389,
      "grad_norm": 0.73046875,
      "learning_rate": 0.000192,
      "loss": 10.3211,
      "step": 120
    },
    {
      "epoch": 0.03847292098253921,
      "grad_norm": 0.84375,
      "learning_rate": 0.0001999978128380225,
      "loss": 10.2582,
      "step": 130
    },
    {
      "epoch": 0.041432376442734536,
      "grad_norm": 0.72265625,
      "learning_rate": 0.0001999803161162393,
      "loss": 10.172,
      "step": 140
    },
    {
      "epoch": 0.04439183190292986,
      "grad_norm": 1.2734375,
      "learning_rate": 0.00019994532573409262,
      "loss": 10.1033,
      "step": 150
    },
    {
      "epoch": 0.04735128736312518,
      "grad_norm": 0.43359375,
      "learning_rate": 0.00019989284781388617,
      "loss": 10.0041,
      "step": 160
    },
    {
      "epoch": 0.05031074282332051,
      "grad_norm": 0.41796875,
      "learning_rate": 0.00019982289153773646,
      "loss": 9.9331,
      "step": 170
    },
    {
      "epoch": 0.053270198283515834,
      "grad_norm": 0.46875,
      "learning_rate": 0.00019973546914596623,
      "loss": 9.8548,
      "step": 180
    },
    {
      "epoch": 0.05622965374371116,
      "grad_norm": 0.64453125,
      "learning_rate": 0.00019963059593496268,
      "loss": 9.7692,
      "step": 190
    },
    {
      "epoch": 0.05918910920390648,
      "grad_norm": 1.140625,
      "learning_rate": 0.00019950829025450114,
      "loss": 9.7054,
      "step": 200
    },
    {
      "epoch": 0.05918910920390648,
      "eval_loss": 9.686193466186523,
      "eval_runtime": 20.1405,
      "eval_samples_per_second": 74.576,
      "eval_steps_per_second": 18.669,
      "step": 200
    },
    {
      "epoch": 0.062148564664101805,
      "grad_norm": 0.46484375,
      "learning_rate": 0.0001993685735045343,
      "loss": 9.6486,
      "step": 210
    },
    {
      "epoch": 0.06510802012429713,
      "grad_norm": 0.51171875,
      "learning_rate": 0.0001992114701314478,
      "loss": 9.6029,
      "step": 220
    },
    {
      "epoch": 0.06806747558449246,
      "grad_norm": 0.5078125,
      "learning_rate": 0.000199037007623783,
      "loss": 9.5554,
      "step": 230
    },
    {
      "epoch": 0.07102693104468778,
      "grad_norm": 0.609375,
      "learning_rate": 0.00019884521650742715,
      "loss": 9.4941,
      "step": 240
    },
    {
      "epoch": 0.0739863865048831,
      "grad_norm": 1.78125,
      "learning_rate": 0.00019863613034027224,
      "loss": 9.508,
      "step": 250
    },
    {
      "epoch": 0.07694584196507842,
      "grad_norm": 0.5078125,
      "learning_rate": 0.0001984097857063434,
      "loss": 9.3502,
      "step": 260
    },
    {
      "epoch": 0.07990529742527375,
      "grad_norm": 0.55859375,
      "learning_rate": 0.0001981662222093976,
      "loss": 9.3473,
      "step": 270
    },
    {
      "epoch": 0.08286475288546907,
      "grad_norm": 0.5234375,
      "learning_rate": 0.00019790548246599447,
      "loss": 9.2955,
      "step": 280
    },
    {
      "epoch": 0.0858242083456644,
      "grad_norm": 0.625,
      "learning_rate": 0.00019762761209803927,
      "loss": 9.2712,
      "step": 290
    },
    {
      "epoch": 0.08878366380585972,
      "grad_norm": 1.140625,
      "learning_rate": 0.0001973326597248006,
      "loss": 9.2969,
      "step": 300
    },
    {
      "epoch": 0.09174311926605505,
      "grad_norm": 0.455078125,
      "learning_rate": 0.00019702067695440332,
      "loss": 9.1616,
      "step": 310
    },
    {
      "epoch": 0.09470257472625036,
      "grad_norm": 0.4609375,
      "learning_rate": 0.00019669171837479873,
      "loss": 9.1605,
      "step": 320
    },
    {
      "epoch": 0.09766203018644569,
      "grad_norm": 0.474609375,
      "learning_rate": 0.00019634584154421317,
      "loss": 9.1402,
      "step": 330
    },
    {
      "epoch": 0.10062148564664102,
      "grad_norm": 0.578125,
      "learning_rate": 0.00019598310698107702,
      "loss": 9.0839,
      "step": 340
    },
    {
      "epoch": 0.10358094110683634,
      "grad_norm": 1.296875,
      "learning_rate": 0.00019560357815343577,
      "loss": 9.0709,
      "step": 350
    },
    {
      "epoch": 0.10654039656703167,
      "grad_norm": 0.57421875,
      "learning_rate": 0.00019520732146784491,
      "loss": 9.0372,
      "step": 360
    },
    {
      "epoch": 0.109499852027227,
      "grad_norm": 0.76953125,
      "learning_rate": 0.0001947944062577507,
      "loss": 9.0209,
      "step": 370
    },
    {
      "epoch": 0.11245930748742232,
      "grad_norm": 0.5390625,
      "learning_rate": 0.00019436490477135878,
      "loss": 8.9724,
      "step": 380
    },
    {
      "epoch": 0.11541876294761765,
      "grad_norm": 0.6171875,
      "learning_rate": 0.00019391889215899299,
      "loss": 9.0212,
      "step": 390
    },
    {
      "epoch": 0.11837821840781296,
      "grad_norm": 1.421875,
      "learning_rate": 0.0001934564464599461,
      "loss": 8.9091,
      "step": 400
    },
    {
      "epoch": 0.11837821840781296,
      "eval_loss": 8.961220741271973,
      "eval_runtime": 13.0065,
      "eval_samples_per_second": 115.48,
      "eval_steps_per_second": 28.909,
      "step": 400
    },
    {
      "epoch": 0.12133767386800828,
      "grad_norm": 0.443359375,
      "learning_rate": 0.00019297764858882514,
      "loss": 8.9547,
      "step": 410
    },
    {
      "epoch": 0.12429712932820361,
      "grad_norm": 0.466796875,
      "learning_rate": 0.00019248258232139388,
      "loss": 8.9394,
      "step": 420
    },
    {
      "epoch": 0.12725658478839894,
      "grad_norm": 0.61328125,
      "learning_rate": 0.00019197133427991436,
      "loss": 8.9748,
      "step": 430
    },
    {
      "epoch": 0.13021604024859426,
      "grad_norm": 0.73046875,
      "learning_rate": 0.00019144399391799043,
      "loss": 8.9198,
      "step": 440
    },
    {
      "epoch": 0.1331754957087896,
      "grad_norm": 1.203125,
      "learning_rate": 0.00019090065350491626,
      "loss": 8.8904,
      "step": 450
    },
    {
      "epoch": 0.1361349511689849,
      "grad_norm": 0.494140625,
      "learning_rate": 0.0001903414081095315,
      "loss": 8.8971,
      "step": 460
    },
    {
      "epoch": 0.13909440662918024,
      "grad_norm": 0.48046875,
      "learning_rate": 0.00018976635558358722,
      "loss": 8.84,
      "step": 470
    },
    {
      "epoch": 0.14205386208937557,
      "grad_norm": 0.55859375,
      "learning_rate": 0.00018917559654462474,
      "loss": 8.838,
      "step": 480
    },
    {
      "epoch": 0.1450133175495709,
      "grad_norm": 0.5703125,
      "learning_rate": 0.00018856923435837022,
      "loss": 8.7761,
      "step": 490
    },
    {
      "epoch": 0.1479727730097662,
      "grad_norm": 0.96875,
      "learning_rate": 0.0001879473751206489,
      "loss": 8.8421,
      "step": 500
    },
    {
      "epoch": 0.15093222846996152,
      "grad_norm": 0.478515625,
      "learning_rate": 0.00018731012763882133,
      "loss": 8.7691,
      "step": 510
    },
    {
      "epoch": 0.15389168393015684,
      "grad_norm": 0.4921875,
      "learning_rate": 0.00018665760341274505,
      "loss": 8.7749,
      "step": 520
    },
    {
      "epoch": 0.15685113939035217,
      "grad_norm": 0.51171875,
      "learning_rate": 0.00018598991661526572,
      "loss": 8.79,
      "step": 530
    },
    {
      "epoch": 0.1598105948505475,
      "grad_norm": 0.58203125,
      "learning_rate": 0.00018530718407223974,
      "loss": 8.8742,
      "step": 540
    },
    {
      "epoch": 0.16277005031074282,
      "grad_norm": 1.234375,
      "learning_rate": 0.00018460952524209355,
      "loss": 8.7845,
      "step": 550
    },
    {
      "epoch": 0.16572950577093815,
      "grad_norm": 0.470703125,
      "learning_rate": 0.00018389706219492147,
      "loss": 8.8165,
      "step": 560
    },
    {
      "epoch": 0.16868896123113347,
      "grad_norm": 0.486328125,
      "learning_rate": 0.00018316991959112716,
      "loss": 8.7024,
      "step": 570
    },
    {
      "epoch": 0.1716484166913288,
      "grad_norm": 0.53515625,
      "learning_rate": 0.00018242822465961176,
      "loss": 8.7764,
      "step": 580
    },
    {
      "epoch": 0.17460787215152412,
      "grad_norm": 0.58984375,
      "learning_rate": 0.00018167210717551224,
      "loss": 8.7501,
      "step": 590
    },
    {
      "epoch": 0.17756732761171945,
      "grad_norm": 1.28125,
      "learning_rate": 0.00018090169943749476,
      "loss": 8.7257,
      "step": 600
    },
    {
      "epoch": 0.17756732761171945,
      "eval_loss": 8.762685775756836,
      "eval_runtime": 18.9408,
      "eval_samples_per_second": 79.3,
      "eval_steps_per_second": 19.851,
      "step": 600
    },
    {
      "epoch": 0.18052678307191478,
      "grad_norm": 0.54296875,
      "learning_rate": 0.00018011713624460608,
      "loss": 8.7709,
      "step": 610
    },
    {
      "epoch": 0.1834862385321101,
      "grad_norm": 0.53515625,
      "learning_rate": 0.00017931855487268782,
      "loss": 8.7334,
      "step": 620
    },
    {
      "epoch": 0.18644569399230543,
      "grad_norm": 0.56640625,
      "learning_rate": 0.0001785060950503568,
      "loss": 8.824,
      "step": 630
    },
    {
      "epoch": 0.18940514945250073,
      "grad_norm": 0.69921875,
      "learning_rate": 0.00017767989893455698,
      "loss": 8.6731,
      "step": 640
    },
    {
      "epoch": 0.19236460491269605,
      "grad_norm": 0.90625,
      "learning_rate": 0.00017684011108568592,
      "loss": 8.7669,
      "step": 650
    },
    {
      "epoch": 0.19532406037289138,
      "grad_norm": 0.49609375,
      "learning_rate": 0.00017598687844230088,
      "loss": 8.6911,
      "step": 660
    },
    {
      "epoch": 0.1982835158330867,
      "grad_norm": 0.44140625,
      "learning_rate": 0.00017512035029540885,
      "loss": 8.6932,
      "step": 670
    },
    {
      "epoch": 0.20124297129328203,
      "grad_norm": 0.52734375,
      "learning_rate": 0.000174240678262345,
      "loss": 8.71,
      "step": 680
    },
    {
      "epoch": 0.20420242675347736,
      "grad_norm": 0.59375,
      "learning_rate": 0.000173348016260244,
      "loss": 8.7219,
      "step": 690
    },
    {
      "epoch": 0.20716188221367268,
      "grad_norm": 1.3515625,
      "learning_rate": 0.00017244252047910892,
      "loss": 8.6973,
      "step": 700
    },
    {
      "epoch": 0.210121337673868,
      "grad_norm": 0.462890625,
      "learning_rate": 0.00017152434935448256,
      "loss": 8.6743,
      "step": 710
    },
    {
      "epoch": 0.21308079313406333,
      "grad_norm": 0.451171875,
      "learning_rate": 0.0001705936635397259,
      "loss": 8.7094,
      "step": 720
    },
    {
      "epoch": 0.21604024859425866,
      "grad_norm": 0.57421875,
      "learning_rate": 0.00016965062587790823,
      "loss": 8.7353,
      "step": 730
    },
    {
      "epoch": 0.218999704054454,
      "grad_norm": 0.5546875,
      "learning_rate": 0.00016869540137331445,
      "loss": 8.6939,
      "step": 740
    },
    {
      "epoch": 0.2219591595146493,
      "grad_norm": 1.0703125,
      "learning_rate": 0.00016772815716257412,
      "loss": 8.7202,
      "step": 750
    },
    {
      "epoch": 0.22491861497484464,
      "grad_norm": 0.51171875,
      "learning_rate": 0.00016674906248541726,
      "loss": 8.6779,
      "step": 760
    },
    {
      "epoch": 0.22787807043503996,
      "grad_norm": 0.671875,
      "learning_rate": 0.00016575828865506245,
      "loss": 8.6627,
      "step": 770
    },
    {
      "epoch": 0.2308375258952353,
      "grad_norm": 0.4375,
      "learning_rate": 0.0001647560090282419,
      "loss": 8.7348,
      "step": 780
    },
    {
      "epoch": 0.2337969813554306,
      "grad_norm": 0.6875,
      "learning_rate": 0.000163742398974869,
      "loss": 8.7236,
      "step": 790
    },
    {
      "epoch": 0.23675643681562591,
      "grad_norm": 1.4140625,
      "learning_rate": 0.0001627176358473537,
      "loss": 8.7416,
      "step": 800
    },
    {
      "epoch": 0.23675643681562591,
      "eval_loss": 8.710856437683105,
      "eval_runtime": 16.7859,
      "eval_samples_per_second": 89.48,
      "eval_steps_per_second": 22.4,
      "step": 800
    }
  ],
  "logging_steps": 10,
  "max_steps": 2500,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 400,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 20509072293888.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}