{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.175, "eval_steps": 500, "global_step": 700, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0025, "grad_norm": 28.533191680908203, "learning_rate": 0.00019950000000000002, "loss": 10.2028, "step": 10 }, { "epoch": 0.005, "grad_norm": 1.5962224006652832, "learning_rate": 0.000199, "loss": 1.8731, "step": 20 }, { "epoch": 0.0075, "grad_norm": 0.8784465193748474, "learning_rate": 0.00019850000000000003, "loss": 0.5834, "step": 30 }, { "epoch": 0.01, "grad_norm": 0.6551746726036072, "learning_rate": 0.00019800000000000002, "loss": 0.4274, "step": 40 }, { "epoch": 0.0125, "grad_norm": 0.7458402514457703, "learning_rate": 0.00019750000000000003, "loss": 0.2921, "step": 50 }, { "epoch": 0.015, "grad_norm": 0.6910417675971985, "learning_rate": 0.00019700000000000002, "loss": 0.1889, "step": 60 }, { "epoch": 0.0175, "grad_norm": 0.575136125087738, "learning_rate": 0.0001965, "loss": 0.1379, "step": 70 }, { "epoch": 0.02, "grad_norm": 1.2535680532455444, "learning_rate": 0.000196, "loss": 0.1053, "step": 80 }, { "epoch": 0.0225, "grad_norm": 0.9697864651679993, "learning_rate": 0.0001955, "loss": 0.0756, "step": 90 }, { "epoch": 0.025, "grad_norm": 0.508269727230072, "learning_rate": 0.000195, "loss": 0.055, "step": 100 }, { "epoch": 0.0275, "grad_norm": 0.6621774435043335, "learning_rate": 0.0001945, "loss": 0.0487, "step": 110 }, { "epoch": 0.03, "grad_norm": 0.5406679511070251, "learning_rate": 0.000194, "loss": 0.0406, "step": 120 }, { "epoch": 0.0325, "grad_norm": 0.35967350006103516, "learning_rate": 0.00019350000000000001, "loss": 0.0347, "step": 130 }, { "epoch": 0.035, "grad_norm": 0.6122244000434875, "learning_rate": 0.000193, "loss": 0.0334, "step": 140 }, { "epoch": 0.0375, "grad_norm": 0.4679579734802246, "learning_rate": 0.00019250000000000002, "loss": 0.0342, "step": 150 }, { "epoch": 0.04, "grad_norm": 0.6229879856109619, "learning_rate": 0.000192, "loss": 0.0328, "step": 160 }, { "epoch": 0.0425, "grad_norm": 0.4741787314414978, "learning_rate": 0.00019150000000000002, "loss": 0.0328, "step": 170 }, { "epoch": 0.045, "grad_norm": 0.3581089377403259, "learning_rate": 0.000191, "loss": 0.0329, "step": 180 }, { "epoch": 0.0475, "grad_norm": 0.2805705964565277, "learning_rate": 0.00019050000000000002, "loss": 0.0316, "step": 190 }, { "epoch": 0.05, "grad_norm": 0.36797094345092773, "learning_rate": 0.00019, "loss": 0.0314, "step": 200 }, { "epoch": 0.0525, "grad_norm": 0.22872206568717957, "learning_rate": 0.0001895, "loss": 0.0304, "step": 210 }, { "epoch": 0.055, "grad_norm": 0.3525296151638031, "learning_rate": 0.00018899999999999999, "loss": 0.0315, "step": 220 }, { "epoch": 0.0575, "grad_norm": 0.21026159822940826, "learning_rate": 0.0001885, "loss": 0.0302, "step": 230 }, { "epoch": 0.06, "grad_norm": 0.1741417497396469, "learning_rate": 0.000188, "loss": 0.0307, "step": 240 }, { "epoch": 0.0625, "grad_norm": 0.35116010904312134, "learning_rate": 0.0001875, "loss": 0.0305, "step": 250 }, { "epoch": 0.065, "grad_norm": 0.2572971284389496, "learning_rate": 0.00018700000000000002, "loss": 0.0313, "step": 260 }, { "epoch": 0.0675, "grad_norm": 0.2466694414615631, "learning_rate": 0.0001865, "loss": 0.0299, "step": 270 }, { "epoch": 0.07, "grad_norm": 0.19943873584270477, "learning_rate": 0.00018600000000000002, "loss": 0.0304, "step": 280 }, { "epoch": 0.0725, "grad_norm": 0.3378709852695465, "learning_rate": 0.0001855, "loss": 0.0299, "step": 290 }, { "epoch": 0.075, "grad_norm": 0.23438668251037598, "learning_rate": 0.00018500000000000002, "loss": 0.0301, "step": 300 }, { "epoch": 0.0775, "grad_norm": 0.3201534152030945, "learning_rate": 0.0001845, "loss": 0.0307, "step": 310 }, { "epoch": 0.08, "grad_norm": 0.19868455827236176, "learning_rate": 0.00018400000000000003, "loss": 0.0294, "step": 320 }, { "epoch": 0.0825, "grad_norm": 0.20487827062606812, "learning_rate": 0.00018350000000000002, "loss": 0.0309, "step": 330 }, { "epoch": 0.085, "grad_norm": 0.3057793378829956, "learning_rate": 0.000183, "loss": 0.0307, "step": 340 }, { "epoch": 0.0875, "grad_norm": 0.12229125201702118, "learning_rate": 0.0001825, "loss": 0.0303, "step": 350 }, { "epoch": 0.09, "grad_norm": 0.18177232146263123, "learning_rate": 0.000182, "loss": 0.0302, "step": 360 }, { "epoch": 0.0925, "grad_norm": 0.28575578331947327, "learning_rate": 0.0001815, "loss": 0.0304, "step": 370 }, { "epoch": 0.095, "grad_norm": 0.19034205377101898, "learning_rate": 0.000181, "loss": 0.03, "step": 380 }, { "epoch": 0.0975, "grad_norm": 0.23103861510753632, "learning_rate": 0.0001805, "loss": 0.0305, "step": 390 }, { "epoch": 0.1, "grad_norm": 0.15927983820438385, "learning_rate": 0.00018, "loss": 0.0292, "step": 400 }, { "epoch": 0.1025, "grad_norm": 0.19252969324588776, "learning_rate": 0.0001795, "loss": 0.0314, "step": 410 }, { "epoch": 0.105, "grad_norm": 0.20013266801834106, "learning_rate": 0.00017900000000000001, "loss": 0.0299, "step": 420 }, { "epoch": 0.1075, "grad_norm": 0.14499768614768982, "learning_rate": 0.0001785, "loss": 0.0304, "step": 430 }, { "epoch": 0.11, "grad_norm": 0.16168737411499023, "learning_rate": 0.00017800000000000002, "loss": 0.0312, "step": 440 }, { "epoch": 0.1125, "grad_norm": 0.22811977565288544, "learning_rate": 0.0001775, "loss": 0.0297, "step": 450 }, { "epoch": 0.115, "grad_norm": 0.13746319711208344, "learning_rate": 0.00017700000000000002, "loss": 0.0299, "step": 460 }, { "epoch": 0.1175, "grad_norm": 0.24217799305915833, "learning_rate": 0.0001765, "loss": 0.0298, "step": 470 }, { "epoch": 0.12, "grad_norm": 0.20098623633384705, "learning_rate": 0.00017600000000000002, "loss": 0.031, "step": 480 }, { "epoch": 0.1225, "grad_norm": 0.16132541000843048, "learning_rate": 0.0001755, "loss": 0.0298, "step": 490 }, { "epoch": 0.125, "grad_norm": 0.1743323802947998, "learning_rate": 0.000175, "loss": 0.0311, "step": 500 }, { "epoch": 0.1275, "grad_norm": 0.25642746686935425, "learning_rate": 0.0001745, "loss": 0.0293, "step": 510 }, { "epoch": 0.13, "grad_norm": 0.14132989943027496, "learning_rate": 0.000174, "loss": 0.0298, "step": 520 }, { "epoch": 0.1325, "grad_norm": 0.15414279699325562, "learning_rate": 0.00017350000000000002, "loss": 0.0306, "step": 530 }, { "epoch": 0.135, "grad_norm": 0.1769929677248001, "learning_rate": 0.000173, "loss": 0.0308, "step": 540 }, { "epoch": 0.1375, "grad_norm": 0.23861835896968842, "learning_rate": 0.00017250000000000002, "loss": 0.03, "step": 550 }, { "epoch": 0.14, "grad_norm": 0.14238761365413666, "learning_rate": 0.000172, "loss": 0.0302, "step": 560 }, { "epoch": 0.1425, "grad_norm": 0.12197570502758026, "learning_rate": 0.00017150000000000002, "loss": 0.0291, "step": 570 }, { "epoch": 0.145, "grad_norm": 0.16140304505825043, "learning_rate": 0.000171, "loss": 0.0308, "step": 580 }, { "epoch": 0.1475, "grad_norm": 0.1349712610244751, "learning_rate": 0.00017050000000000002, "loss": 0.0296, "step": 590 }, { "epoch": 0.15, "grad_norm": 0.15457607805728912, "learning_rate": 0.00017, "loss": 0.0294, "step": 600 }, { "epoch": 0.1525, "grad_norm": 0.24771223962306976, "learning_rate": 0.00016950000000000003, "loss": 0.0295, "step": 610 }, { "epoch": 0.155, "grad_norm": 0.1785576194524765, "learning_rate": 0.00016900000000000002, "loss": 0.0294, "step": 620 }, { "epoch": 0.1575, "grad_norm": 0.208939328789711, "learning_rate": 0.0001685, "loss": 0.0287, "step": 630 }, { "epoch": 0.16, "grad_norm": 0.15111897885799408, "learning_rate": 0.000168, "loss": 0.0297, "step": 640 }, { "epoch": 0.1625, "grad_norm": 0.16397146880626678, "learning_rate": 0.0001675, "loss": 0.0306, "step": 650 }, { "epoch": 0.165, "grad_norm": 0.1457725465297699, "learning_rate": 0.000167, "loss": 0.029, "step": 660 }, { "epoch": 0.1675, "grad_norm": 0.1790078729391098, "learning_rate": 0.0001665, "loss": 0.0293, "step": 670 }, { "epoch": 0.17, "grad_norm": 0.13654978573322296, "learning_rate": 0.000166, "loss": 0.0292, "step": 680 }, { "epoch": 0.1725, "grad_norm": 0.14379267394542694, "learning_rate": 0.0001655, "loss": 0.0295, "step": 690 }, { "epoch": 0.175, "grad_norm": 0.21510595083236694, "learning_rate": 0.000165, "loss": 0.0306, "step": 700 } ], "logging_steps": 10, "max_steps": 4000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 770786761113600.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }