{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0125, "grad_norm": 0.14092598855495453, "learning_rate": 0.0004, "loss": 1.1529, "step": 25 }, { "epoch": 0.025, "grad_norm": 0.14281609654426575, "learning_rate": 0.0004998852503731983, "loss": 1.0472, "step": 50 }, { "epoch": 0.0375, "grad_norm": 0.24040694534778595, "learning_rate": 0.0004993848168027977, "loss": 0.8532, "step": 75 }, { "epoch": 0.05, "grad_norm": 0.14735238254070282, "learning_rate": 0.0004984880506341147, "loss": 0.9761, "step": 100 }, { "epoch": 0.05, "eval_loss": 0.9448406100273132, "eval_runtime": 322.186, "eval_samples_per_second": 3.405, "eval_steps_per_second": 0.056, "step": 100 }, { "epoch": 0.0625, "grad_norm": 0.1359478235244751, "learning_rate": 0.0004971963770447935, "loss": 0.9568, "step": 125 }, { "epoch": 0.075, "grad_norm": 0.17497147619724274, "learning_rate": 0.0004955118488155782, "loss": 0.7111, "step": 150 }, { "epoch": 0.0875, "grad_norm": 0.14382271468639374, "learning_rate": 0.0004934371430679492, "loss": 0.9413, "step": 175 }, { "epoch": 0.1, "grad_norm": 0.16079047322273254, "learning_rate": 0.0004909755570095319, "loss": 0.8981, "step": 200 }, { "epoch": 0.1, "eval_loss": 0.9723050594329834, "eval_runtime": 322.7576, "eval_samples_per_second": 3.399, "eval_steps_per_second": 0.056, "step": 200 }, { "epoch": 0.1125, "grad_norm": 0.14729101955890656, "learning_rate": 0.0004881310026940389, "loss": 0.6378, "step": 225 }, { "epoch": 0.125, "grad_norm": 0.15030288696289062, "learning_rate": 0.0004849080008040734, "loss": 0.9271, "step": 250 }, { "epoch": 0.1375, "grad_norm": 0.1613348424434662, "learning_rate": 0.00048131167346667446, "loss": 0.8457, "step": 275 }, { "epoch": 0.15, "grad_norm": 0.15532569587230682, "learning_rate": 0.00047734773611302284, "loss": 0.603, "step": 300 }, { "epoch": 0.15, "eval_loss": 1.006589651107788, "eval_runtime": 323.2105, "eval_samples_per_second": 3.394, "eval_steps_per_second": 0.056, "step": 300 }, { "epoch": 0.1625, "grad_norm": 0.16015686094760895, "learning_rate": 0.0004730224883952422, "loss": 0.9036, "step": 325 }, { "epoch": 0.175, "grad_norm": 0.15767253935337067, "learning_rate": 0.0004683428041747334, "loss": 0.8283, "step": 350 }, { "epoch": 0.1875, "grad_norm": 0.17757417261600494, "learning_rate": 0.0004633161205979517, "loss": 0.5945, "step": 375 }, { "epoch": 0.2, "grad_norm": 0.17248600721359253, "learning_rate": 0.0004579504262769877, "loss": 0.8655, "step": 400 }, { "epoch": 0.2, "eval_loss": 1.0158599615097046, "eval_runtime": 323.8034, "eval_samples_per_second": 3.388, "eval_steps_per_second": 0.056, "step": 400 }, { "epoch": 0.2125, "grad_norm": 0.17826460301876068, "learning_rate": 0.0004522542485937369, "loss": 0.8079, "step": 425 }, { "epoch": 0.225, "grad_norm": 0.19307631254196167, "learning_rate": 0.00044623664014783386, "loss": 0.5737, "step": 450 }, { "epoch": 0.2375, "grad_norm": 0.1877959966659546, "learning_rate": 0.00043990716436988924, "loss": 0.8605, "step": 475 }, { "epoch": 0.25, "grad_norm": 0.15268854796886444, "learning_rate": 0.0004332758803228925, "loss": 0.7674, "step": 500 }, { "epoch": 0.25, "eval_loss": 1.0431231260299683, "eval_runtime": 323.1376, "eval_samples_per_second": 3.395, "eval_steps_per_second": 0.056, "step": 500 }, { "epoch": 0.2625, "grad_norm": 0.16773808002471924, "learning_rate": 0.00042635332671593575, "loss": 0.5884, "step": 525 }, { "epoch": 0.275, "grad_norm": 0.15766142308712006, "learning_rate": 0.00041915050515566445, "loss": 0.8178, "step": 550 }, { "epoch": 0.2875, "grad_norm": 0.16790153086185455, "learning_rate": 0.00041167886266207167, "loss": 0.7797, "step": 575 }, { "epoch": 0.3, "grad_norm": 0.15149210393428802, "learning_rate": 0.0004039502734764241, "loss": 0.7334, "step": 600 }, { "epoch": 0.3, "eval_loss": 1.0655592679977417, "eval_runtime": 321.6139, "eval_samples_per_second": 3.411, "eval_steps_per_second": 0.056, "step": 600 }, { "epoch": 0.3125, "grad_norm": 0.15414434671401978, "learning_rate": 0.0003959770201902294, "loss": 0.744, "step": 625 }, { "epoch": 0.325, "grad_norm": 0.1529635190963745, "learning_rate": 0.0003877717742252371, "loss": 0.6345, "step": 650 }, { "epoch": 0.3375, "grad_norm": 0.16185611486434937, "learning_rate": 0.00037934757569549495, "loss": 0.7354, "step": 675 }, { "epoch": 0.35, "grad_norm": 0.16656433045864105, "learning_rate": 0.00037071781268346345, "loss": 0.7455, "step": 700 }, { "epoch": 0.35, "eval_loss": 1.0835301876068115, "eval_runtime": 322.4593, "eval_samples_per_second": 3.402, "eval_steps_per_second": 0.056, "step": 700 }, { "epoch": 0.3625, "grad_norm": 0.15590643882751465, "learning_rate": 0.00036189619996312495, "loss": 0.5972, "step": 725 }, { "epoch": 0.375, "grad_norm": 0.1837926208972931, "learning_rate": 0.00035289675720390174, "loss": 0.7592, "step": 750 }, { "epoch": 0.3875, "grad_norm": 0.1620703488588333, "learning_rate": 0.00034373378669002105, "loss": 0.736, "step": 775 }, { "epoch": 0.4, "grad_norm": 0.16613048315048218, "learning_rate": 0.00033442185059073706, "loss": 0.564, "step": 800 }, { "epoch": 0.4, "eval_loss": 1.096523642539978, "eval_runtime": 321.7553, "eval_samples_per_second": 3.409, "eval_steps_per_second": 0.056, "step": 800 }, { "epoch": 0.4125, "grad_norm": 0.1587529182434082, "learning_rate": 0.00032497574781753367, "loss": 0.7598, "step": 825 }, { "epoch": 0.425, "grad_norm": 0.16457463800907135, "learning_rate": 0.000315410490505086, "loss": 0.7292, "step": 850 }, { "epoch": 0.4375, "grad_norm": 0.16429653763771057, "learning_rate": 0.0003057412801533589, "loss": 0.5329, "step": 875 }, { "epoch": 0.45, "grad_norm": 0.16424661874771118, "learning_rate": 0.0002959834834687587, "loss": 0.7785, "step": 900 }, { "epoch": 0.45, "eval_loss": 1.0958806276321411, "eval_runtime": 322.3489, "eval_samples_per_second": 3.403, "eval_steps_per_second": 0.056, "step": 900 }, { "epoch": 0.4625, "grad_norm": 0.16132202744483948, "learning_rate": 0.00028615260794273236, "loss": 0.7259, "step": 925 }, { "epoch": 0.475, "grad_norm": 0.16069433093070984, "learning_rate": 0.00027626427720662416, "loss": 0.4996, "step": 950 }, { "epoch": 0.4875, "grad_norm": 0.16033512353897095, "learning_rate": 0.00026633420620195917, "loss": 0.7768, "step": 975 }, { "epoch": 0.5, "grad_norm": 0.16236689686775208, "learning_rate": 0.00025637817620561263, "loss": 0.7225, "step": 1000 }, { "epoch": 0.5, "eval_loss": 1.1207813024520874, "eval_runtime": 322.5881, "eval_samples_per_second": 3.401, "eval_steps_per_second": 0.056, "step": 1000 }, { "epoch": 0.5125, "grad_norm": 0.15189126133918762, "learning_rate": 0.0002464120097495559, "loss": 0.4932, "step": 1025 }, { "epoch": 0.525, "grad_norm": 0.1648908108472824, "learning_rate": 0.00023645154547503855, "loss": 0.7902, "step": 1050 }, { "epoch": 0.5375, "grad_norm": 0.16001655161380768, "learning_rate": 0.00022651261296116894, "loss": 0.7003, "step": 1075 }, { "epoch": 0.55, "grad_norm": 0.1633903682231903, "learning_rate": 0.00021661100756789666, "loss": 0.4709, "step": 1100 }, { "epoch": 0.55, "eval_loss": 1.1372770071029663, "eval_runtime": 323.5516, "eval_samples_per_second": 3.39, "eval_steps_per_second": 0.056, "step": 1100 }, { "epoch": 0.5625, "grad_norm": 0.1652471274137497, "learning_rate": 0.00020676246533337764, "loss": 0.8076, "step": 1125 }, { "epoch": 0.575, "grad_norm": 0.17891553044319153, "learning_rate": 0.00019698263796561526, "loss": 0.7156, "step": 1150 }, { "epoch": 0.5875, "grad_norm": 0.16443009674549103, "learning_rate": 0.00018728706796812333, "loss": 0.6316, "step": 1175 }, { "epoch": 0.6, "grad_norm": 0.16446325182914734, "learning_rate": 0.00017769116393914037, "loss": 0.6956, "step": 1200 }, { "epoch": 0.6, "eval_loss": 1.1236783266067505, "eval_runtime": 320.7164, "eval_samples_per_second": 3.42, "eval_steps_per_second": 0.056, "step": 1200 }, { "epoch": 0.6125, "grad_norm": 0.1620441973209381, "learning_rate": 0.00016821017608365264, "loss": 0.6163, "step": 1225 }, { "epoch": 0.625, "grad_norm": 0.17003227770328522, "learning_rate": 0.00015885917197714112, "loss": 0.6232, "step": 1250 }, { "epoch": 0.6375, "grad_norm": 0.17415954172611237, "learning_rate": 0.00014965301261957238, "loss": 0.6991, "step": 1275 }, { "epoch": 0.65, "grad_norm": 0.1617245227098465, "learning_rate": 0.00014060632881768558, "loss": 0.599, "step": 1300 }, { "epoch": 0.65, "eval_loss": 1.1370735168457031, "eval_runtime": 320.7099, "eval_samples_per_second": 3.421, "eval_steps_per_second": 0.056, "step": 1300 }, { "epoch": 0.6625, "grad_norm": 0.1753346472978592, "learning_rate": 0.00013173349793311424, "loss": 0.6611, "step": 1325 }, { "epoch": 0.675, "grad_norm": 0.17510834336280823, "learning_rate": 0.0001230486210332916, "loss": 0.6814, "step": 1350 }, { "epoch": 0.6875, "grad_norm": 0.17805688083171844, "learning_rate": 0.00011456550048145536, "loss": 0.5757, "step": 1375 }, { "epoch": 0.7, "grad_norm": 0.17829716205596924, "learning_rate": 0.00010629761800136473, "loss": 0.6646, "step": 1400 }, { "epoch": 0.7, "eval_loss": 1.1476235389709473, "eval_runtime": 321.7747, "eval_samples_per_second": 3.409, "eval_steps_per_second": 0.056, "step": 1400 }, { "epoch": 0.7125, "grad_norm": 0.18059992790222168, "learning_rate": 9.82581132515907e-05, "loss": 0.6797, "step": 1425 }, { "epoch": 0.725, "grad_norm": 0.15256117284297943, "learning_rate": 9.045976294343145e-05, "loss": 0.5596, "step": 1450 }, { "epoch": 0.7375, "grad_norm": 0.17262427508831024, "learning_rate": 8.291496053563699e-05, "loss": 0.6905, "step": 1475 }, { "epoch": 0.75, "grad_norm": 0.17382751405239105, "learning_rate": 7.563569653821565e-05, "loss": 0.6772, "step": 1500 }, { "epoch": 0.75, "eval_loss": 1.1411069631576538, "eval_runtime": 323.0865, "eval_samples_per_second": 3.395, "eval_steps_per_second": 0.056, "step": 1500 }, { "epoch": 0.7625, "grad_norm": 0.15558482706546783, "learning_rate": 6.863353945662288e-05, "loss": 0.5172, "step": 1525 }, { "epoch": 0.775, "grad_norm": 0.1950470507144928, "learning_rate": 6.191961740661687e-05, "loss": 0.7039, "step": 1550 }, { "epoch": 0.7875, "grad_norm": 0.17224222421646118, "learning_rate": 5.550460042899982e-05, "loss": 0.6914, "step": 1575 }, { "epoch": 0.8, "grad_norm": 0.1652670055627823, "learning_rate": 4.9398683532350855e-05, "loss": 0.488, "step": 1600 }, { "epoch": 0.8, "eval_loss": 1.1589314937591553, "eval_runtime": 321.6657, "eval_samples_per_second": 3.41, "eval_steps_per_second": 0.056, "step": 1600 }, { "epoch": 0.8125, "grad_norm": 0.1697286069393158, "learning_rate": 4.3611570490698945e-05, "loss": 0.7453, "step": 1625 }, { "epoch": 0.825, "grad_norm": 0.16859295964241028, "learning_rate": 3.815245842188697e-05, "loss": 0.6625, "step": 1650 }, { "epoch": 0.8375, "grad_norm": 0.16462524235248566, "learning_rate": 3.30300231711339e-05, "loss": 0.4719, "step": 1675 }, { "epoch": 0.85, "grad_norm": 0.17534367740154266, "learning_rate": 2.8252405523025106e-05, "loss": 0.7468, "step": 1700 }, { "epoch": 0.85, "eval_loss": 1.161086916923523, "eval_runtime": 322.1848, "eval_samples_per_second": 3.405, "eval_steps_per_second": 0.056, "step": 1700 }, { "epoch": 0.8625, "grad_norm": 0.16568109393119812, "learning_rate": 2.3827198263843162e-05, "loss": 0.709, "step": 1725 }, { "epoch": 0.875, "grad_norm": 0.17694273591041565, "learning_rate": 1.9761434114799497e-05, "loss": 0.5756, "step": 1750 }, { "epoch": 0.8875, "grad_norm": 0.16244478523731232, "learning_rate": 1.606157455534535e-05, "loss": 0.6545, "step": 1775 }, { "epoch": 0.9, "grad_norm": 0.1651735007762909, "learning_rate": 1.2733499554322708e-05, "loss": 0.6352, "step": 1800 }, { "epoch": 0.9, "eval_loss": 1.1563351154327393, "eval_runtime": 320.396, "eval_samples_per_second": 3.424, "eval_steps_per_second": 0.056, "step": 1800 }, { "epoch": 0.9125, "grad_norm": 0.17625375092029572, "learning_rate": 9.782498225276437e-06, "loss": 0.5515, "step": 1825 }, { "epoch": 0.925, "grad_norm": 0.17532864212989807, "learning_rate": 7.213260420777607e-06, "loss": 0.686, "step": 1850 }, { "epoch": 0.9375, "grad_norm": 0.15695761144161224, "learning_rate": 5.029869279117167e-06, "loss": 0.6296, "step": 1875 }, { "epoch": 0.95, "grad_norm": 0.17587953805923462, "learning_rate": 3.235794735214709e-06, "loss": 0.569, "step": 1900 }, { "epoch": 0.95, "eval_loss": 1.1628855466842651, "eval_runtime": 324.1332, "eval_samples_per_second": 3.384, "eval_steps_per_second": 0.056, "step": 1900 }, { "epoch": 0.9625, "grad_norm": 0.17684130370616913, "learning_rate": 1.8338880060553287e-06, "loss": 0.6861, "step": 1925 }, { "epoch": 0.975, "grad_norm": 0.16606929898262024, "learning_rate": 8.263770594185149e-07, "loss": 0.6187, "step": 1950 }, { "epoch": 0.9875, "grad_norm": 0.1606944501399994, "learning_rate": 2.1486307310000787e-07, "loss": 0.5915, "step": 1975 }, { "epoch": 1.0, "grad_norm": 0.16651476919651031, "learning_rate": 3.1789025450867925e-10, "loss": 0.6817, "step": 2000 }, { "epoch": 1.0, "eval_loss": 1.160001277923584, "eval_runtime": 322.869, "eval_samples_per_second": 3.398, "eval_steps_per_second": 0.056, "step": 2000 } ], "logging_steps": 25, "max_steps": 2000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.0158630867238912e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }