{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.091703056768559, "eval_steps": 500, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010917030567685589, "grad_norm": 0.0361328125, "learning_rate": 0.00019800000000000002, "loss": 1.2963, "step": 50 }, { "epoch": 0.021834061135371178, "grad_norm": 4.6875, "learning_rate": 0.000196, "loss": 0.9099, "step": 100 }, { "epoch": 0.03275109170305677, "grad_norm": 0.4140625, "learning_rate": 0.000194, "loss": 0.6184, "step": 150 }, { "epoch": 0.043668122270742356, "grad_norm": 0.94140625, "learning_rate": 0.000192, "loss": 0.6445, "step": 200 }, { "epoch": 0.05458515283842795, "grad_norm": 0.02490234375, "learning_rate": 0.00019, "loss": 0.5761, "step": 250 }, { "epoch": 0.06550218340611354, "grad_norm": 1.8359375, "learning_rate": 0.000188, "loss": 0.8202, "step": 300 }, { "epoch": 0.07641921397379912, "grad_norm": 25.0, "learning_rate": 0.00018600000000000002, "loss": 0.6368, "step": 350 }, { "epoch": 0.08733624454148471, "grad_norm": 0.006439208984375, "learning_rate": 0.00018400000000000003, "loss": 0.3766, "step": 400 }, { "epoch": 0.0982532751091703, "grad_norm": 176.0, "learning_rate": 0.000182, "loss": 0.5329, "step": 450 }, { "epoch": 0.1091703056768559, "grad_norm": 0.019287109375, "learning_rate": 0.00018, "loss": 0.6169, "step": 500 }, { "epoch": 0.12008733624454149, "grad_norm": 7.03125, "learning_rate": 0.00017800000000000002, "loss": 0.6633, "step": 550 }, { "epoch": 0.13100436681222707, "grad_norm": 0.01361083984375, "learning_rate": 0.00017600000000000002, "loss": 0.5915, "step": 600 }, { "epoch": 0.14192139737991266, "grad_norm": 0.1484375, "learning_rate": 0.000174, "loss": 0.513, "step": 650 }, { "epoch": 0.15283842794759825, "grad_norm": 0.00958251953125, "learning_rate": 0.000172, "loss": 0.4981, "step": 700 }, { "epoch": 0.16375545851528384, "grad_norm": 0.00677490234375, "learning_rate": 0.00017, "loss": 0.3848, "step": 750 }, { "epoch": 0.17467248908296942, "grad_norm": 0.07666015625, "learning_rate": 0.000168, "loss": 0.452, "step": 800 }, { "epoch": 0.185589519650655, "grad_norm": 0.0145263671875, "learning_rate": 0.000166, "loss": 0.7262, "step": 850 }, { "epoch": 0.1965065502183406, "grad_norm": 0.0172119140625, "learning_rate": 0.000164, "loss": 0.4341, "step": 900 }, { "epoch": 0.2074235807860262, "grad_norm": 1.796875, "learning_rate": 0.000162, "loss": 0.5116, "step": 950 }, { "epoch": 0.2183406113537118, "grad_norm": 0.01123046875, "learning_rate": 0.00016, "loss": 0.5325, "step": 1000 }, { "epoch": 0.2292576419213974, "grad_norm": 28.625, "learning_rate": 0.00015800000000000002, "loss": 0.4908, "step": 1050 }, { "epoch": 0.24017467248908297, "grad_norm": 0.0517578125, "learning_rate": 0.00015600000000000002, "loss": 0.5384, "step": 1100 }, { "epoch": 0.25109170305676853, "grad_norm": 0.055419921875, "learning_rate": 0.000154, "loss": 0.5571, "step": 1150 }, { "epoch": 0.26200873362445415, "grad_norm": 0.392578125, "learning_rate": 0.000152, "loss": 0.5969, "step": 1200 }, { "epoch": 0.27292576419213976, "grad_norm": 0.51171875, "learning_rate": 0.00015000000000000001, "loss": 0.3076, "step": 1250 }, { "epoch": 0.2838427947598253, "grad_norm": 47.25, "learning_rate": 0.000148, "loss": 0.3439, "step": 1300 }, { "epoch": 0.29475982532751094, "grad_norm": 0.00799560546875, "learning_rate": 0.000146, "loss": 0.3013, "step": 1350 }, { "epoch": 0.3056768558951965, "grad_norm": 0.06396484375, "learning_rate": 0.000144, "loss": 0.5307, "step": 1400 }, { "epoch": 0.3165938864628821, "grad_norm": 20.75, "learning_rate": 0.000142, "loss": 0.3934, "step": 1450 }, { "epoch": 0.32751091703056767, "grad_norm": 0.0703125, "learning_rate": 0.00014, "loss": 0.2161, "step": 1500 }, { "epoch": 0.3384279475982533, "grad_norm": 40.75, "learning_rate": 0.000138, "loss": 0.4879, "step": 1550 }, { "epoch": 0.34934497816593885, "grad_norm": 3.3125, "learning_rate": 0.00013600000000000003, "loss": 0.4656, "step": 1600 }, { "epoch": 0.36026200873362446, "grad_norm": 0.020751953125, "learning_rate": 0.000134, "loss": 0.2679, "step": 1650 }, { "epoch": 0.37117903930131, "grad_norm": 68.5, "learning_rate": 0.000132, "loss": 0.5902, "step": 1700 }, { "epoch": 0.38209606986899564, "grad_norm": 0.091796875, "learning_rate": 0.00013000000000000002, "loss": 0.2799, "step": 1750 }, { "epoch": 0.3930131004366812, "grad_norm": 0.0247802734375, "learning_rate": 0.00012800000000000002, "loss": 0.4931, "step": 1800 }, { "epoch": 0.4039301310043668, "grad_norm": 0.10107421875, "learning_rate": 0.000126, "loss": 0.3611, "step": 1850 }, { "epoch": 0.4148471615720524, "grad_norm": 0.007354736328125, "learning_rate": 0.000124, "loss": 0.3994, "step": 1900 }, { "epoch": 0.425764192139738, "grad_norm": 0.06884765625, "learning_rate": 0.000122, "loss": 0.3919, "step": 1950 }, { "epoch": 0.4366812227074236, "grad_norm": 86.5, "learning_rate": 0.00012, "loss": 0.2731, "step": 2000 }, { "epoch": 0.44759825327510916, "grad_norm": 0.0067138671875, "learning_rate": 0.000118, "loss": 0.3157, "step": 2050 }, { "epoch": 0.4585152838427948, "grad_norm": 0.027099609375, "learning_rate": 0.000116, "loss": 0.3493, "step": 2100 }, { "epoch": 0.46943231441048033, "grad_norm": 0.08056640625, "learning_rate": 0.00011399999999999999, "loss": 0.3887, "step": 2150 }, { "epoch": 0.48034934497816595, "grad_norm": 0.0086669921875, "learning_rate": 0.00011200000000000001, "loss": 0.2559, "step": 2200 }, { "epoch": 0.4912663755458515, "grad_norm": 0.1044921875, "learning_rate": 0.00011000000000000002, "loss": 0.4685, "step": 2250 }, { "epoch": 0.5021834061135371, "grad_norm": 0.00445556640625, "learning_rate": 0.00010800000000000001, "loss": 0.3723, "step": 2300 }, { "epoch": 0.5131004366812227, "grad_norm": 0.00201416015625, "learning_rate": 0.00010600000000000002, "loss": 0.3303, "step": 2350 }, { "epoch": 0.5240174672489083, "grad_norm": 0.0361328125, "learning_rate": 0.00010400000000000001, "loss": 0.3119, "step": 2400 }, { "epoch": 0.5349344978165939, "grad_norm": 18.25, "learning_rate": 0.00010200000000000001, "loss": 0.4043, "step": 2450 }, { "epoch": 0.5458515283842795, "grad_norm": 0.0026702880859375, "learning_rate": 0.0001, "loss": 0.114, "step": 2500 }, { "epoch": 0.5567685589519651, "grad_norm": 0.06982421875, "learning_rate": 9.8e-05, "loss": 0.2757, "step": 2550 }, { "epoch": 0.5676855895196506, "grad_norm": 0.0146484375, "learning_rate": 9.6e-05, "loss": 0.3825, "step": 2600 }, { "epoch": 0.5786026200873362, "grad_norm": 0.265625, "learning_rate": 9.4e-05, "loss": 0.4058, "step": 2650 }, { "epoch": 0.5895196506550219, "grad_norm": 2.71875, "learning_rate": 9.200000000000001e-05, "loss": 0.2844, "step": 2700 }, { "epoch": 0.6004366812227074, "grad_norm": 0.01806640625, "learning_rate": 9e-05, "loss": 0.317, "step": 2750 }, { "epoch": 0.611353711790393, "grad_norm": 1.078125, "learning_rate": 8.800000000000001e-05, "loss": 0.3611, "step": 2800 }, { "epoch": 0.6222707423580786, "grad_norm": 0.0830078125, "learning_rate": 8.6e-05, "loss": 0.2253, "step": 2850 }, { "epoch": 0.6331877729257642, "grad_norm": 0.017822265625, "learning_rate": 8.4e-05, "loss": 0.2998, "step": 2900 }, { "epoch": 0.6441048034934498, "grad_norm": 50.0, "learning_rate": 8.2e-05, "loss": 0.3148, "step": 2950 }, { "epoch": 0.6550218340611353, "grad_norm": 0.00156402587890625, "learning_rate": 8e-05, "loss": 0.3505, "step": 3000 }, { "epoch": 0.665938864628821, "grad_norm": 0.0107421875, "learning_rate": 7.800000000000001e-05, "loss": 0.1272, "step": 3050 }, { "epoch": 0.6768558951965066, "grad_norm": 0.00897216796875, "learning_rate": 7.6e-05, "loss": 0.2141, "step": 3100 }, { "epoch": 0.6877729257641921, "grad_norm": 0.0042724609375, "learning_rate": 7.4e-05, "loss": 0.3637, "step": 3150 }, { "epoch": 0.6986899563318777, "grad_norm": 0.0189208984375, "learning_rate": 7.2e-05, "loss": 0.2633, "step": 3200 }, { "epoch": 0.7096069868995634, "grad_norm": 0.0101318359375, "learning_rate": 7e-05, "loss": 0.1656, "step": 3250 }, { "epoch": 0.7205240174672489, "grad_norm": 0.06982421875, "learning_rate": 6.800000000000001e-05, "loss": 0.1287, "step": 3300 }, { "epoch": 0.7314410480349345, "grad_norm": 16.0, "learning_rate": 6.6e-05, "loss": 0.1781, "step": 3350 }, { "epoch": 0.74235807860262, "grad_norm": 0.1083984375, "learning_rate": 6.400000000000001e-05, "loss": 0.2521, "step": 3400 }, { "epoch": 0.7532751091703057, "grad_norm": 0.00106048583984375, "learning_rate": 6.2e-05, "loss": 0.1658, "step": 3450 }, { "epoch": 0.7641921397379913, "grad_norm": 0.0020599365234375, "learning_rate": 6e-05, "loss": 0.232, "step": 3500 }, { "epoch": 0.7751091703056768, "grad_norm": 0.04052734375, "learning_rate": 5.8e-05, "loss": 0.0531, "step": 3550 }, { "epoch": 0.7860262008733624, "grad_norm": 208.0, "learning_rate": 5.6000000000000006e-05, "loss": 0.152, "step": 3600 }, { "epoch": 0.7969432314410481, "grad_norm": 0.00262451171875, "learning_rate": 5.4000000000000005e-05, "loss": 0.3011, "step": 3650 }, { "epoch": 0.8078602620087336, "grad_norm": 0.0267333984375, "learning_rate": 5.2000000000000004e-05, "loss": 0.3626, "step": 3700 }, { "epoch": 0.8187772925764192, "grad_norm": 0.017578125, "learning_rate": 5e-05, "loss": 0.1233, "step": 3750 }, { "epoch": 0.8296943231441049, "grad_norm": 2.453125, "learning_rate": 4.8e-05, "loss": 0.3622, "step": 3800 }, { "epoch": 0.8406113537117904, "grad_norm": 57.25, "learning_rate": 4.600000000000001e-05, "loss": 0.3681, "step": 3850 }, { "epoch": 0.851528384279476, "grad_norm": 0.006195068359375, "learning_rate": 4.4000000000000006e-05, "loss": 0.4212, "step": 3900 }, { "epoch": 0.8624454148471615, "grad_norm": 0.0205078125, "learning_rate": 4.2e-05, "loss": 0.2464, "step": 3950 }, { "epoch": 0.8733624454148472, "grad_norm": 30.125, "learning_rate": 4e-05, "loss": 0.3118, "step": 4000 }, { "epoch": 0.8842794759825328, "grad_norm": 0.01165771484375, "learning_rate": 3.8e-05, "loss": 0.2954, "step": 4050 }, { "epoch": 0.8951965065502183, "grad_norm": 0.003509521484375, "learning_rate": 3.6e-05, "loss": 0.1704, "step": 4100 }, { "epoch": 0.9061135371179039, "grad_norm": 0.005859375, "learning_rate": 3.4000000000000007e-05, "loss": 0.3955, "step": 4150 }, { "epoch": 0.9170305676855895, "grad_norm": 0.00323486328125, "learning_rate": 3.2000000000000005e-05, "loss": 0.3858, "step": 4200 }, { "epoch": 0.9279475982532751, "grad_norm": 0.0021209716796875, "learning_rate": 3e-05, "loss": 0.0347, "step": 4250 }, { "epoch": 0.9388646288209607, "grad_norm": 0.07666015625, "learning_rate": 2.8000000000000003e-05, "loss": 0.2532, "step": 4300 }, { "epoch": 0.9497816593886463, "grad_norm": 0.003997802734375, "learning_rate": 2.6000000000000002e-05, "loss": 0.224, "step": 4350 }, { "epoch": 0.9606986899563319, "grad_norm": 0.0152587890625, "learning_rate": 2.4e-05, "loss": 0.0007, "step": 4400 }, { "epoch": 0.9716157205240175, "grad_norm": 0.00799560546875, "learning_rate": 2.2000000000000003e-05, "loss": 0.1745, "step": 4450 }, { "epoch": 0.982532751091703, "grad_norm": 0.00482177734375, "learning_rate": 2e-05, "loss": 0.2485, "step": 4500 }, { "epoch": 0.9934497816593887, "grad_norm": 0.009521484375, "learning_rate": 1.8e-05, "loss": 0.177, "step": 4550 }, { "epoch": 1.0043668122270741, "grad_norm": 0.006622314453125, "learning_rate": 1.6000000000000003e-05, "loss": 0.177, "step": 4600 }, { "epoch": 1.01528384279476, "grad_norm": 0.0023956298828125, "learning_rate": 1.4000000000000001e-05, "loss": 0.1522, "step": 4650 }, { "epoch": 1.0262008733624455, "grad_norm": 0.033203125, "learning_rate": 1.2e-05, "loss": 0.2442, "step": 4700 }, { "epoch": 1.037117903930131, "grad_norm": 0.0037841796875, "learning_rate": 1e-05, "loss": 0.2411, "step": 4750 }, { "epoch": 1.0480349344978166, "grad_norm": 0.0019683837890625, "learning_rate": 8.000000000000001e-06, "loss": 0.1355, "step": 4800 }, { "epoch": 1.0589519650655022, "grad_norm": 2.984375, "learning_rate": 6e-06, "loss": 0.365, "step": 4850 }, { "epoch": 1.0698689956331877, "grad_norm": 0.004119873046875, "learning_rate": 4.000000000000001e-06, "loss": 0.2094, "step": 4900 }, { "epoch": 1.0807860262008733, "grad_norm": 0.0024566650390625, "learning_rate": 2.0000000000000003e-06, "loss": 0.1219, "step": 4950 }, { "epoch": 1.091703056768559, "grad_norm": 0.0034332275390625, "learning_rate": 0.0, "loss": 0.2887, "step": 5000 } ], "logging_steps": 50, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.981914056646656e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }