{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0125, "grad_norm": 0.14176353812217712, "learning_rate": 0.0004, "loss": 1.1525, "step": 25 }, { "epoch": 0.025, "grad_norm": 0.1460508555173874, "learning_rate": 0.0004998852503731983, "loss": 1.047, "step": 50 }, { "epoch": 0.0375, "grad_norm": 0.2368021011352539, "learning_rate": 0.0004993848168027977, "loss": 0.8529, "step": 75 }, { "epoch": 0.05, "grad_norm": 0.14488168060779572, "learning_rate": 0.0004984880506341147, "loss": 0.9756, "step": 100 }, { "epoch": 0.05, "eval_loss": 0.9470569491386414, "eval_runtime": 845.9115, "eval_samples_per_second": 1.297, "eval_steps_per_second": 0.021, "step": 100 }, { "epoch": 0.0625, "grad_norm": 0.1415175348520279, "learning_rate": 0.0004971963770447935, "loss": 0.9564, "step": 125 }, { "epoch": 0.075, "grad_norm": 0.1751064509153366, "learning_rate": 0.0004955118488155782, "loss": 0.711, "step": 150 }, { "epoch": 0.0875, "grad_norm": 0.1439943015575409, "learning_rate": 0.0004934371430679492, "loss": 0.9409, "step": 175 }, { "epoch": 0.1, "grad_norm": 0.15947112441062927, "learning_rate": 0.0004909755570095319, "loss": 0.8979, "step": 200 }, { "epoch": 0.1, "eval_loss": 0.9711233973503113, "eval_runtime": 845.6433, "eval_samples_per_second": 1.297, "eval_steps_per_second": 0.021, "step": 200 }, { "epoch": 0.1125, "grad_norm": 0.14685837924480438, "learning_rate": 0.0004881310026940389, "loss": 0.6376, "step": 225 }, { "epoch": 0.125, "grad_norm": 0.15040776133537292, "learning_rate": 0.0004849080008040734, "loss": 0.927, "step": 250 }, { "epoch": 0.1375, "grad_norm": 0.16087745130062103, "learning_rate": 0.00048131167346667446, "loss": 0.8456, "step": 275 }, { "epoch": 0.15, "grad_norm": 0.15025638043880463, "learning_rate": 0.00047734773611302284, "loss": 0.6029, "step": 300 }, { "epoch": 0.15, "eval_loss": 1.0056413412094116, "eval_runtime": 848.447, "eval_samples_per_second": 1.293, "eval_steps_per_second": 0.021, "step": 300 }, { "epoch": 0.1625, "grad_norm": 0.15893957018852234, "learning_rate": 0.0004730224883952422, "loss": 0.9036, "step": 325 }, { "epoch": 0.175, "grad_norm": 0.1535714715719223, "learning_rate": 0.0004683428041747334, "loss": 0.828, "step": 350 }, { "epoch": 0.1875, "grad_norm": 0.1718970686197281, "learning_rate": 0.0004633161205979517, "loss": 0.5944, "step": 375 }, { "epoch": 0.2, "grad_norm": 0.17664535343647003, "learning_rate": 0.0004579504262769877, "loss": 0.8654, "step": 400 }, { "epoch": 0.2, "eval_loss": 1.0151112079620361, "eval_runtime": 839.7482, "eval_samples_per_second": 1.306, "eval_steps_per_second": 0.021, "step": 400 }, { "epoch": 0.2125, "grad_norm": 0.17122088372707367, "learning_rate": 0.0004522542485937369, "loss": 0.8078, "step": 425 }, { "epoch": 0.225, "grad_norm": 0.19420970976352692, "learning_rate": 0.00044623664014783386, "loss": 0.5735, "step": 450 }, { "epoch": 0.2375, "grad_norm": 0.18166953325271606, "learning_rate": 0.00043990716436988924, "loss": 0.8604, "step": 475 }, { "epoch": 0.25, "grad_norm": 0.1502382755279541, "learning_rate": 0.0004332758803228925, "loss": 0.7673, "step": 500 }, { "epoch": 0.25, "eval_loss": 1.0454745292663574, "eval_runtime": 841.3951, "eval_samples_per_second": 1.304, "eval_steps_per_second": 0.021, "step": 500 }, { "epoch": 0.2625, "grad_norm": 0.16875839233398438, "learning_rate": 0.00042635332671593575, "loss": 0.5882, "step": 525 }, { "epoch": 0.275, "grad_norm": 0.16070039570331573, "learning_rate": 0.00041915050515566445, "loss": 0.8175, "step": 550 }, { "epoch": 0.2875, "grad_norm": 0.16584184765815735, "learning_rate": 0.00041167886266207167, "loss": 0.7795, "step": 575 }, { "epoch": 0.3, "grad_norm": 0.15345798432826996, "learning_rate": 0.0004039502734764241, "loss": 0.7331, "step": 600 }, { "epoch": 0.3, "eval_loss": 1.0627943277359009, "eval_runtime": 849.4944, "eval_samples_per_second": 1.291, "eval_steps_per_second": 0.021, "step": 600 }, { "epoch": 0.3125, "grad_norm": 0.16241031885147095, "learning_rate": 0.0003959770201902294, "loss": 0.7436, "step": 625 }, { "epoch": 0.325, "grad_norm": 0.1531112939119339, "learning_rate": 0.0003877717742252371, "loss": 0.6345, "step": 650 }, { "epoch": 0.3375, "grad_norm": 0.15790140628814697, "learning_rate": 0.00037934757569549495, "loss": 0.7351, "step": 675 }, { "epoch": 0.35, "grad_norm": 0.1822807490825653, "learning_rate": 0.00037071781268346345, "loss": 0.745, "step": 700 }, { "epoch": 0.35, "eval_loss": 1.07315993309021, "eval_runtime": 837.5276, "eval_samples_per_second": 1.31, "eval_steps_per_second": 0.021, "step": 700 }, { "epoch": 0.3625, "grad_norm": 0.15983282029628754, "learning_rate": 0.00036189619996312495, "loss": 0.5972, "step": 725 }, { "epoch": 0.375, "grad_norm": 0.18202389776706696, "learning_rate": 0.00035289675720390174, "loss": 0.759, "step": 750 }, { "epoch": 0.3875, "grad_norm": 0.16057106852531433, "learning_rate": 0.00034373378669002105, "loss": 0.7358, "step": 775 }, { "epoch": 0.4, "grad_norm": 0.1680625081062317, "learning_rate": 0.00033442185059073706, "loss": 0.5636, "step": 800 }, { "epoch": 0.4, "eval_loss": 1.0932456254959106, "eval_runtime": 840.1331, "eval_samples_per_second": 1.306, "eval_steps_per_second": 0.021, "step": 800 }, { "epoch": 0.4125, "grad_norm": 0.15613198280334473, "learning_rate": 0.00032497574781753367, "loss": 0.7596, "step": 825 }, { "epoch": 0.425, "grad_norm": 0.1628854125738144, "learning_rate": 0.000315410490505086, "loss": 0.7282, "step": 850 }, { "epoch": 0.4375, "grad_norm": 0.16740979254245758, "learning_rate": 0.0003057412801533589, "loss": 0.5325, "step": 875 }, { "epoch": 0.45, "grad_norm": 0.1634828895330429, "learning_rate": 0.0002959834834687587, "loss": 0.778, "step": 900 }, { "epoch": 0.45, "eval_loss": 1.0952215194702148, "eval_runtime": 836.473, "eval_samples_per_second": 1.311, "eval_steps_per_second": 0.022, "step": 900 }, { "epoch": 0.4625, "grad_norm": 0.161700040102005, "learning_rate": 0.00028615260794273236, "loss": 0.7255, "step": 925 }, { "epoch": 0.475, "grad_norm": 0.16280074417591095, "learning_rate": 0.00027626427720662416, "loss": 0.4993, "step": 950 }, { "epoch": 0.4875, "grad_norm": 0.16114428639411926, "learning_rate": 0.00026633420620195917, "loss": 0.7765, "step": 975 }, { "epoch": 0.5, "grad_norm": 0.16209788620471954, "learning_rate": 0.00025637817620561263, "loss": 0.7221, "step": 1000 }, { "epoch": 0.5, "eval_loss": 1.1190329790115356, "eval_runtime": 835.4164, "eval_samples_per_second": 1.313, "eval_steps_per_second": 0.022, "step": 1000 }, { "epoch": 0.5125, "grad_norm": 0.15057620406150818, "learning_rate": 0.0002464120097495559, "loss": 0.4929, "step": 1025 }, { "epoch": 0.525, "grad_norm": 0.16715486347675323, "learning_rate": 0.00023645154547503855, "loss": 0.7896, "step": 1050 }, { "epoch": 0.5375, "grad_norm": 0.1591680645942688, "learning_rate": 0.00022651261296116894, "loss": 0.6999, "step": 1075 }, { "epoch": 0.55, "grad_norm": 0.17062760889530182, "learning_rate": 0.00021661100756789666, "loss": 0.4708, "step": 1100 }, { "epoch": 0.55, "eval_loss": 1.1381303071975708, "eval_runtime": 839.2576, "eval_samples_per_second": 1.307, "eval_steps_per_second": 0.021, "step": 1100 }, { "epoch": 0.5625, "grad_norm": 0.16746024787425995, "learning_rate": 0.00020676246533337764, "loss": 0.8073, "step": 1125 }, { "epoch": 0.575, "grad_norm": 0.18011848628520966, "learning_rate": 0.00019698263796561526, "loss": 0.7153, "step": 1150 }, { "epoch": 0.5875, "grad_norm": 0.16664239764213562, "learning_rate": 0.00018728706796812333, "loss": 0.6313, "step": 1175 }, { "epoch": 0.6, "grad_norm": 0.1630394607782364, "learning_rate": 0.00017769116393914037, "loss": 0.6952, "step": 1200 }, { "epoch": 0.6, "eval_loss": 1.1242510080337524, "eval_runtime": 847.0203, "eval_samples_per_second": 1.295, "eval_steps_per_second": 0.021, "step": 1200 }, { "epoch": 0.6125, "grad_norm": 0.16016128659248352, "learning_rate": 0.00016821017608365264, "loss": 0.6161, "step": 1225 }, { "epoch": 0.625, "grad_norm": 0.17186138033866882, "learning_rate": 0.00015885917197714112, "loss": 0.623, "step": 1250 }, { "epoch": 0.6375, "grad_norm": 0.1764240562915802, "learning_rate": 0.00014965301261957238, "loss": 0.6988, "step": 1275 }, { "epoch": 0.65, "grad_norm": 0.16019247472286224, "learning_rate": 0.00014060632881768558, "loss": 0.599, "step": 1300 }, { "epoch": 0.65, "eval_loss": 1.1349693536758423, "eval_runtime": 842.5056, "eval_samples_per_second": 1.302, "eval_steps_per_second": 0.021, "step": 1300 }, { "epoch": 0.6625, "grad_norm": 0.17425072193145752, "learning_rate": 0.00013173349793311424, "loss": 0.6607, "step": 1325 }, { "epoch": 0.675, "grad_norm": 0.17765691876411438, "learning_rate": 0.0001230486210332916, "loss": 0.6811, "step": 1350 }, { "epoch": 0.6875, "grad_norm": 0.17980526387691498, "learning_rate": 0.00011456550048145536, "loss": 0.5755, "step": 1375 }, { "epoch": 0.7, "grad_norm": 0.1814012974500656, "learning_rate": 0.00010629761800136473, "loss": 0.6642, "step": 1400 }, { "epoch": 0.7, "eval_loss": 1.1495640277862549, "eval_runtime": 840.9164, "eval_samples_per_second": 1.305, "eval_steps_per_second": 0.021, "step": 1400 }, { "epoch": 0.7125, "grad_norm": 0.1855439394712448, "learning_rate": 9.82581132515907e-05, "loss": 0.6796, "step": 1425 }, { "epoch": 0.725, "grad_norm": 0.15141044557094574, "learning_rate": 9.045976294343145e-05, "loss": 0.5593, "step": 1450 }, { "epoch": 0.7375, "grad_norm": 0.17237244546413422, "learning_rate": 8.291496053563699e-05, "loss": 0.69, "step": 1475 }, { "epoch": 0.75, "grad_norm": 0.17433880269527435, "learning_rate": 7.563569653821565e-05, "loss": 0.6768, "step": 1500 }, { "epoch": 0.75, "eval_loss": 1.1441528797149658, "eval_runtime": 844.4916, "eval_samples_per_second": 1.299, "eval_steps_per_second": 0.021, "step": 1500 }, { "epoch": 0.7625, "grad_norm": 0.15868628025054932, "learning_rate": 6.863353945662288e-05, "loss": 0.517, "step": 1525 }, { "epoch": 0.775, "grad_norm": 0.18802335858345032, "learning_rate": 6.191961740661687e-05, "loss": 0.7035, "step": 1550 }, { "epoch": 0.7875, "grad_norm": 0.17180030047893524, "learning_rate": 5.550460042899982e-05, "loss": 0.6911, "step": 1575 }, { "epoch": 0.8, "grad_norm": 0.16377338767051697, "learning_rate": 4.9398683532350855e-05, "loss": 0.4876, "step": 1600 }, { "epoch": 0.8, "eval_loss": 1.1581627130508423, "eval_runtime": 844.0055, "eval_samples_per_second": 1.3, "eval_steps_per_second": 0.021, "step": 1600 }, { "epoch": 0.8125, "grad_norm": 0.16953328251838684, "learning_rate": 4.3611570490698945e-05, "loss": 0.745, "step": 1625 }, { "epoch": 0.825, "grad_norm": 0.16867636144161224, "learning_rate": 3.815245842188697e-05, "loss": 0.6623, "step": 1650 }, { "epoch": 0.8375, "grad_norm": 0.16276288032531738, "learning_rate": 3.30300231711339e-05, "loss": 0.4716, "step": 1675 }, { "epoch": 0.85, "grad_norm": 0.17455314099788666, "learning_rate": 2.8252405523025106e-05, "loss": 0.7464, "step": 1700 }, { "epoch": 0.85, "eval_loss": 1.1617317199707031, "eval_runtime": 841.9311, "eval_samples_per_second": 1.303, "eval_steps_per_second": 0.021, "step": 1700 }, { "epoch": 0.8625, "grad_norm": 0.16539457440376282, "learning_rate": 2.3827198263843162e-05, "loss": 0.7088, "step": 1725 }, { "epoch": 0.875, "grad_norm": 0.17668606340885162, "learning_rate": 1.9761434114799497e-05, "loss": 0.5753, "step": 1750 }, { "epoch": 0.8875, "grad_norm": 0.17462626099586487, "learning_rate": 1.606157455534535e-05, "loss": 0.6541, "step": 1775 }, { "epoch": 0.9, "grad_norm": 0.1645047813653946, "learning_rate": 1.2733499554322708e-05, "loss": 0.6349, "step": 1800 }, { "epoch": 0.9, "eval_loss": 1.1562622785568237, "eval_runtime": 838.8692, "eval_samples_per_second": 1.308, "eval_steps_per_second": 0.021, "step": 1800 }, { "epoch": 0.9125, "grad_norm": 0.17606309056282043, "learning_rate": 9.782498225276437e-06, "loss": 0.5512, "step": 1825 }, { "epoch": 0.925, "grad_norm": 0.17557215690612793, "learning_rate": 7.213260420777607e-06, "loss": 0.6858, "step": 1850 }, { "epoch": 0.9375, "grad_norm": 0.15605369210243225, "learning_rate": 5.029869279117167e-06, "loss": 0.6293, "step": 1875 }, { "epoch": 0.95, "grad_norm": 0.17496590316295624, "learning_rate": 3.235794735214709e-06, "loss": 0.5686, "step": 1900 }, { "epoch": 0.95, "eval_loss": 1.1630581617355347, "eval_runtime": 829.7381, "eval_samples_per_second": 1.322, "eval_steps_per_second": 0.022, "step": 1900 }, { "epoch": 0.9625, "grad_norm": 0.17726068198680878, "learning_rate": 1.8338880060553287e-06, "loss": 0.6856, "step": 1925 }, { "epoch": 0.975, "grad_norm": 0.165832057595253, "learning_rate": 8.263770594185149e-07, "loss": 0.6185, "step": 1950 }, { "epoch": 0.9875, "grad_norm": 0.16085895895957947, "learning_rate": 2.1486307310000787e-07, "loss": 0.5909, "step": 1975 }, { "epoch": 1.0, "grad_norm": 0.166295126080513, "learning_rate": 3.1789025450867925e-10, "loss": 0.6814, "step": 2000 }, { "epoch": 1.0, "eval_loss": 1.1603492498397827, "eval_runtime": 844.3926, "eval_samples_per_second": 1.299, "eval_steps_per_second": 0.021, "step": 2000 } ], "logging_steps": 25, "max_steps": 2000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.0158630867238912e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }