| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.091703056768559, |
| "eval_steps": 500, |
| "global_step": 5000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.010917030567685589, |
| "grad_norm": 0.0361328125, |
| "learning_rate": 0.00019800000000000002, |
| "loss": 1.2963, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.021834061135371178, |
| "grad_norm": 4.6875, |
| "learning_rate": 0.000196, |
| "loss": 0.9099, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.03275109170305677, |
| "grad_norm": 0.4140625, |
| "learning_rate": 0.000194, |
| "loss": 0.6184, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.043668122270742356, |
| "grad_norm": 0.94140625, |
| "learning_rate": 0.000192, |
| "loss": 0.6445, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.05458515283842795, |
| "grad_norm": 0.02490234375, |
| "learning_rate": 0.00019, |
| "loss": 0.5761, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.06550218340611354, |
| "grad_norm": 1.8359375, |
| "learning_rate": 0.000188, |
| "loss": 0.8202, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.07641921397379912, |
| "grad_norm": 25.0, |
| "learning_rate": 0.00018600000000000002, |
| "loss": 0.6368, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.08733624454148471, |
| "grad_norm": 0.006439208984375, |
| "learning_rate": 0.00018400000000000003, |
| "loss": 0.3766, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.0982532751091703, |
| "grad_norm": 176.0, |
| "learning_rate": 0.000182, |
| "loss": 0.5329, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.1091703056768559, |
| "grad_norm": 0.019287109375, |
| "learning_rate": 0.00018, |
| "loss": 0.6169, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.12008733624454149, |
| "grad_norm": 7.03125, |
| "learning_rate": 0.00017800000000000002, |
| "loss": 0.6633, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.13100436681222707, |
| "grad_norm": 0.01361083984375, |
| "learning_rate": 0.00017600000000000002, |
| "loss": 0.5915, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.14192139737991266, |
| "grad_norm": 0.1484375, |
| "learning_rate": 0.000174, |
| "loss": 0.513, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.15283842794759825, |
| "grad_norm": 0.00958251953125, |
| "learning_rate": 0.000172, |
| "loss": 0.4981, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.16375545851528384, |
| "grad_norm": 0.00677490234375, |
| "learning_rate": 0.00017, |
| "loss": 0.3848, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.17467248908296942, |
| "grad_norm": 0.07666015625, |
| "learning_rate": 0.000168, |
| "loss": 0.452, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.185589519650655, |
| "grad_norm": 0.0145263671875, |
| "learning_rate": 0.000166, |
| "loss": 0.7262, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.1965065502183406, |
| "grad_norm": 0.0172119140625, |
| "learning_rate": 0.000164, |
| "loss": 0.4341, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.2074235807860262, |
| "grad_norm": 1.796875, |
| "learning_rate": 0.000162, |
| "loss": 0.5116, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.2183406113537118, |
| "grad_norm": 0.01123046875, |
| "learning_rate": 0.00016, |
| "loss": 0.5325, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.2292576419213974, |
| "grad_norm": 28.625, |
| "learning_rate": 0.00015800000000000002, |
| "loss": 0.4908, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.24017467248908297, |
| "grad_norm": 0.0517578125, |
| "learning_rate": 0.00015600000000000002, |
| "loss": 0.5384, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.25109170305676853, |
| "grad_norm": 0.055419921875, |
| "learning_rate": 0.000154, |
| "loss": 0.5571, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.26200873362445415, |
| "grad_norm": 0.392578125, |
| "learning_rate": 0.000152, |
| "loss": 0.5969, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.27292576419213976, |
| "grad_norm": 0.51171875, |
| "learning_rate": 0.00015000000000000001, |
| "loss": 0.3076, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.2838427947598253, |
| "grad_norm": 47.25, |
| "learning_rate": 0.000148, |
| "loss": 0.3439, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.29475982532751094, |
| "grad_norm": 0.00799560546875, |
| "learning_rate": 0.000146, |
| "loss": 0.3013, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.3056768558951965, |
| "grad_norm": 0.06396484375, |
| "learning_rate": 0.000144, |
| "loss": 0.5307, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.3165938864628821, |
| "grad_norm": 20.75, |
| "learning_rate": 0.000142, |
| "loss": 0.3934, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.32751091703056767, |
| "grad_norm": 0.0703125, |
| "learning_rate": 0.00014, |
| "loss": 0.2161, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.3384279475982533, |
| "grad_norm": 40.75, |
| "learning_rate": 0.000138, |
| "loss": 0.4879, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.34934497816593885, |
| "grad_norm": 3.3125, |
| "learning_rate": 0.00013600000000000003, |
| "loss": 0.4656, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.36026200873362446, |
| "grad_norm": 0.020751953125, |
| "learning_rate": 0.000134, |
| "loss": 0.2679, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.37117903930131, |
| "grad_norm": 68.5, |
| "learning_rate": 0.000132, |
| "loss": 0.5902, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.38209606986899564, |
| "grad_norm": 0.091796875, |
| "learning_rate": 0.00013000000000000002, |
| "loss": 0.2799, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.3930131004366812, |
| "grad_norm": 0.0247802734375, |
| "learning_rate": 0.00012800000000000002, |
| "loss": 0.4931, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.4039301310043668, |
| "grad_norm": 0.10107421875, |
| "learning_rate": 0.000126, |
| "loss": 0.3611, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.4148471615720524, |
| "grad_norm": 0.007354736328125, |
| "learning_rate": 0.000124, |
| "loss": 0.3994, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.425764192139738, |
| "grad_norm": 0.06884765625, |
| "learning_rate": 0.000122, |
| "loss": 0.3919, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.4366812227074236, |
| "grad_norm": 86.5, |
| "learning_rate": 0.00012, |
| "loss": 0.2731, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.44759825327510916, |
| "grad_norm": 0.0067138671875, |
| "learning_rate": 0.000118, |
| "loss": 0.3157, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.4585152838427948, |
| "grad_norm": 0.027099609375, |
| "learning_rate": 0.000116, |
| "loss": 0.3493, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.46943231441048033, |
| "grad_norm": 0.08056640625, |
| "learning_rate": 0.00011399999999999999, |
| "loss": 0.3887, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.48034934497816595, |
| "grad_norm": 0.0086669921875, |
| "learning_rate": 0.00011200000000000001, |
| "loss": 0.2559, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.4912663755458515, |
| "grad_norm": 0.1044921875, |
| "learning_rate": 0.00011000000000000002, |
| "loss": 0.4685, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.5021834061135371, |
| "grad_norm": 0.00445556640625, |
| "learning_rate": 0.00010800000000000001, |
| "loss": 0.3723, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.5131004366812227, |
| "grad_norm": 0.00201416015625, |
| "learning_rate": 0.00010600000000000002, |
| "loss": 0.3303, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.5240174672489083, |
| "grad_norm": 0.0361328125, |
| "learning_rate": 0.00010400000000000001, |
| "loss": 0.3119, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.5349344978165939, |
| "grad_norm": 18.25, |
| "learning_rate": 0.00010200000000000001, |
| "loss": 0.4043, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.5458515283842795, |
| "grad_norm": 0.0026702880859375, |
| "learning_rate": 0.0001, |
| "loss": 0.114, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.5567685589519651, |
| "grad_norm": 0.06982421875, |
| "learning_rate": 9.8e-05, |
| "loss": 0.2757, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.5676855895196506, |
| "grad_norm": 0.0146484375, |
| "learning_rate": 9.6e-05, |
| "loss": 0.3825, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.5786026200873362, |
| "grad_norm": 0.265625, |
| "learning_rate": 9.4e-05, |
| "loss": 0.4058, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.5895196506550219, |
| "grad_norm": 2.71875, |
| "learning_rate": 9.200000000000001e-05, |
| "loss": 0.2844, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.6004366812227074, |
| "grad_norm": 0.01806640625, |
| "learning_rate": 9e-05, |
| "loss": 0.317, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.611353711790393, |
| "grad_norm": 1.078125, |
| "learning_rate": 8.800000000000001e-05, |
| "loss": 0.3611, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.6222707423580786, |
| "grad_norm": 0.0830078125, |
| "learning_rate": 8.6e-05, |
| "loss": 0.2253, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.6331877729257642, |
| "grad_norm": 0.017822265625, |
| "learning_rate": 8.4e-05, |
| "loss": 0.2998, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.6441048034934498, |
| "grad_norm": 50.0, |
| "learning_rate": 8.2e-05, |
| "loss": 0.3148, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.6550218340611353, |
| "grad_norm": 0.00156402587890625, |
| "learning_rate": 8e-05, |
| "loss": 0.3505, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.665938864628821, |
| "grad_norm": 0.0107421875, |
| "learning_rate": 7.800000000000001e-05, |
| "loss": 0.1272, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.6768558951965066, |
| "grad_norm": 0.00897216796875, |
| "learning_rate": 7.6e-05, |
| "loss": 0.2141, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.6877729257641921, |
| "grad_norm": 0.0042724609375, |
| "learning_rate": 7.4e-05, |
| "loss": 0.3637, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.6986899563318777, |
| "grad_norm": 0.0189208984375, |
| "learning_rate": 7.2e-05, |
| "loss": 0.2633, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.7096069868995634, |
| "grad_norm": 0.0101318359375, |
| "learning_rate": 7e-05, |
| "loss": 0.1656, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.7205240174672489, |
| "grad_norm": 0.06982421875, |
| "learning_rate": 6.800000000000001e-05, |
| "loss": 0.1287, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.7314410480349345, |
| "grad_norm": 16.0, |
| "learning_rate": 6.6e-05, |
| "loss": 0.1781, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.74235807860262, |
| "grad_norm": 0.1083984375, |
| "learning_rate": 6.400000000000001e-05, |
| "loss": 0.2521, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.7532751091703057, |
| "grad_norm": 0.00106048583984375, |
| "learning_rate": 6.2e-05, |
| "loss": 0.1658, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.7641921397379913, |
| "grad_norm": 0.0020599365234375, |
| "learning_rate": 6e-05, |
| "loss": 0.232, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.7751091703056768, |
| "grad_norm": 0.04052734375, |
| "learning_rate": 5.8e-05, |
| "loss": 0.0531, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.7860262008733624, |
| "grad_norm": 208.0, |
| "learning_rate": 5.6000000000000006e-05, |
| "loss": 0.152, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.7969432314410481, |
| "grad_norm": 0.00262451171875, |
| "learning_rate": 5.4000000000000005e-05, |
| "loss": 0.3011, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.8078602620087336, |
| "grad_norm": 0.0267333984375, |
| "learning_rate": 5.2000000000000004e-05, |
| "loss": 0.3626, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.8187772925764192, |
| "grad_norm": 0.017578125, |
| "learning_rate": 5e-05, |
| "loss": 0.1233, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.8296943231441049, |
| "grad_norm": 2.453125, |
| "learning_rate": 4.8e-05, |
| "loss": 0.3622, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.8406113537117904, |
| "grad_norm": 57.25, |
| "learning_rate": 4.600000000000001e-05, |
| "loss": 0.3681, |
| "step": 3850 |
| }, |
| { |
| "epoch": 0.851528384279476, |
| "grad_norm": 0.006195068359375, |
| "learning_rate": 4.4000000000000006e-05, |
| "loss": 0.4212, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.8624454148471615, |
| "grad_norm": 0.0205078125, |
| "learning_rate": 4.2e-05, |
| "loss": 0.2464, |
| "step": 3950 |
| }, |
| { |
| "epoch": 0.8733624454148472, |
| "grad_norm": 30.125, |
| "learning_rate": 4e-05, |
| "loss": 0.3118, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.8842794759825328, |
| "grad_norm": 0.01165771484375, |
| "learning_rate": 3.8e-05, |
| "loss": 0.2954, |
| "step": 4050 |
| }, |
| { |
| "epoch": 0.8951965065502183, |
| "grad_norm": 0.003509521484375, |
| "learning_rate": 3.6e-05, |
| "loss": 0.1704, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.9061135371179039, |
| "grad_norm": 0.005859375, |
| "learning_rate": 3.4000000000000007e-05, |
| "loss": 0.3955, |
| "step": 4150 |
| }, |
| { |
| "epoch": 0.9170305676855895, |
| "grad_norm": 0.00323486328125, |
| "learning_rate": 3.2000000000000005e-05, |
| "loss": 0.3858, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.9279475982532751, |
| "grad_norm": 0.0021209716796875, |
| "learning_rate": 3e-05, |
| "loss": 0.0347, |
| "step": 4250 |
| }, |
| { |
| "epoch": 0.9388646288209607, |
| "grad_norm": 0.07666015625, |
| "learning_rate": 2.8000000000000003e-05, |
| "loss": 0.2532, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.9497816593886463, |
| "grad_norm": 0.003997802734375, |
| "learning_rate": 2.6000000000000002e-05, |
| "loss": 0.224, |
| "step": 4350 |
| }, |
| { |
| "epoch": 0.9606986899563319, |
| "grad_norm": 0.0152587890625, |
| "learning_rate": 2.4e-05, |
| "loss": 0.0007, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.9716157205240175, |
| "grad_norm": 0.00799560546875, |
| "learning_rate": 2.2000000000000003e-05, |
| "loss": 0.1745, |
| "step": 4450 |
| }, |
| { |
| "epoch": 0.982532751091703, |
| "grad_norm": 0.00482177734375, |
| "learning_rate": 2e-05, |
| "loss": 0.2485, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.9934497816593887, |
| "grad_norm": 0.009521484375, |
| "learning_rate": 1.8e-05, |
| "loss": 0.177, |
| "step": 4550 |
| }, |
| { |
| "epoch": 1.0043668122270741, |
| "grad_norm": 0.006622314453125, |
| "learning_rate": 1.6000000000000003e-05, |
| "loss": 0.177, |
| "step": 4600 |
| }, |
| { |
| "epoch": 1.01528384279476, |
| "grad_norm": 0.0023956298828125, |
| "learning_rate": 1.4000000000000001e-05, |
| "loss": 0.1522, |
| "step": 4650 |
| }, |
| { |
| "epoch": 1.0262008733624455, |
| "grad_norm": 0.033203125, |
| "learning_rate": 1.2e-05, |
| "loss": 0.2442, |
| "step": 4700 |
| }, |
| { |
| "epoch": 1.037117903930131, |
| "grad_norm": 0.0037841796875, |
| "learning_rate": 1e-05, |
| "loss": 0.2411, |
| "step": 4750 |
| }, |
| { |
| "epoch": 1.0480349344978166, |
| "grad_norm": 0.0019683837890625, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 0.1355, |
| "step": 4800 |
| }, |
| { |
| "epoch": 1.0589519650655022, |
| "grad_norm": 2.984375, |
| "learning_rate": 6e-06, |
| "loss": 0.365, |
| "step": 4850 |
| }, |
| { |
| "epoch": 1.0698689956331877, |
| "grad_norm": 0.004119873046875, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 0.2094, |
| "step": 4900 |
| }, |
| { |
| "epoch": 1.0807860262008733, |
| "grad_norm": 0.0024566650390625, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 0.1219, |
| "step": 4950 |
| }, |
| { |
| "epoch": 1.091703056768559, |
| "grad_norm": 0.0034332275390625, |
| "learning_rate": 0.0, |
| "loss": 0.2887, |
| "step": 5000 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 5000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.981914056646656e+16, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|