| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.27114967462039047, |
| "eval_steps": 500, |
| "global_step": 5000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0027114967462039045, |
| "grad_norm": 0.00136566162109375, |
| "learning_rate": 0.00019800000000000002, |
| "loss": 0.9637, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.005422993492407809, |
| "grad_norm": 0.01171875, |
| "learning_rate": 0.000196, |
| "loss": 0.8921, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.008134490238611713, |
| "grad_norm": 13.9375, |
| "learning_rate": 0.000194, |
| "loss": 0.4537, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.010845986984815618, |
| "grad_norm": 0.00579833984375, |
| "learning_rate": 0.000192, |
| "loss": 0.4135, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.013557483731019523, |
| "grad_norm": 57.0, |
| "learning_rate": 0.00019, |
| "loss": 0.251, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.016268980477223426, |
| "grad_norm": 159.0, |
| "learning_rate": 0.000188, |
| "loss": 0.4018, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.01898047722342733, |
| "grad_norm": 0.0012359619140625, |
| "learning_rate": 0.00018600000000000002, |
| "loss": 0.1588, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.021691973969631236, |
| "grad_norm": 0.011962890625, |
| "learning_rate": 0.00018400000000000003, |
| "loss": 0.5163, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.02440347071583514, |
| "grad_norm": 0.06591796875, |
| "learning_rate": 0.000182, |
| "loss": 0.4493, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.027114967462039046, |
| "grad_norm": 14.125, |
| "learning_rate": 0.00018, |
| "loss": 0.3355, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.02982646420824295, |
| "grad_norm": 0.0113525390625, |
| "learning_rate": 0.00017800000000000002, |
| "loss": 0.2711, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.03253796095444685, |
| "grad_norm": 0.0224609375, |
| "learning_rate": 0.00017600000000000002, |
| "loss": 0.3634, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.03524945770065076, |
| "grad_norm": 0.1396484375, |
| "learning_rate": 0.000174, |
| "loss": 0.4355, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.03796095444685466, |
| "grad_norm": 22.375, |
| "learning_rate": 0.000172, |
| "loss": 0.2781, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.04067245119305857, |
| "grad_norm": 0.034423828125, |
| "learning_rate": 0.00017, |
| "loss": 0.3848, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.04338394793926247, |
| "grad_norm": 0.0286865234375, |
| "learning_rate": 0.000168, |
| "loss": 0.2264, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.04609544468546638, |
| "grad_norm": 2.171875, |
| "learning_rate": 0.000166, |
| "loss": 0.4542, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.04880694143167028, |
| "grad_norm": 0.0380859375, |
| "learning_rate": 0.000164, |
| "loss": 0.1554, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.05151843817787419, |
| "grad_norm": 0.0264892578125, |
| "learning_rate": 0.000162, |
| "loss": 0.2198, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.05422993492407809, |
| "grad_norm": 0.0673828125, |
| "learning_rate": 0.00016, |
| "loss": 0.3759, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.056941431670282, |
| "grad_norm": 0.0091552734375, |
| "learning_rate": 0.00015800000000000002, |
| "loss": 0.2689, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.0596529284164859, |
| "grad_norm": 0.01031494140625, |
| "learning_rate": 0.00015600000000000002, |
| "loss": 0.2071, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.06236442516268981, |
| "grad_norm": 0.09912109375, |
| "learning_rate": 0.000154, |
| "loss": 0.1407, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.0650759219088937, |
| "grad_norm": 0.08447265625, |
| "learning_rate": 0.000152, |
| "loss": 0.3599, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.06778741865509762, |
| "grad_norm": 0.023193359375, |
| "learning_rate": 0.00015000000000000001, |
| "loss": 0.3034, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.07049891540130152, |
| "grad_norm": 0.103515625, |
| "learning_rate": 0.000148, |
| "loss": 0.2354, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.07321041214750543, |
| "grad_norm": 0.03759765625, |
| "learning_rate": 0.000146, |
| "loss": 0.2458, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.07592190889370933, |
| "grad_norm": 0.0137939453125, |
| "learning_rate": 0.000144, |
| "loss": 0.2756, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.07863340563991324, |
| "grad_norm": 0.0126953125, |
| "learning_rate": 0.000142, |
| "loss": 0.1256, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.08134490238611713, |
| "grad_norm": 0.236328125, |
| "learning_rate": 0.00014, |
| "loss": 0.3264, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.08405639913232105, |
| "grad_norm": 0.01416015625, |
| "learning_rate": 0.000138, |
| "loss": 0.0995, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.08676789587852494, |
| "grad_norm": 0.1005859375, |
| "learning_rate": 0.00013600000000000003, |
| "loss": 0.3766, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.08947939262472886, |
| "grad_norm": 0.02490234375, |
| "learning_rate": 0.000134, |
| "loss": 0.3071, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.09219088937093275, |
| "grad_norm": 0.0224609375, |
| "learning_rate": 0.000132, |
| "loss": 0.2511, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.09490238611713665, |
| "grad_norm": 3.65625, |
| "learning_rate": 0.00013000000000000002, |
| "loss": 0.1508, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.09761388286334056, |
| "grad_norm": 0.0133056640625, |
| "learning_rate": 0.00012800000000000002, |
| "loss": 0.2386, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.10032537960954446, |
| "grad_norm": 13.75, |
| "learning_rate": 0.000126, |
| "loss": 0.2268, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.10303687635574837, |
| "grad_norm": 0.0244140625, |
| "learning_rate": 0.000124, |
| "loss": 0.1943, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.10574837310195227, |
| "grad_norm": 0.011962890625, |
| "learning_rate": 0.000122, |
| "loss": 0.2053, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.10845986984815618, |
| "grad_norm": 0.267578125, |
| "learning_rate": 0.00012, |
| "loss": 0.1648, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.11117136659436008, |
| "grad_norm": 0.0181884765625, |
| "learning_rate": 0.000118, |
| "loss": 0.1398, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.113882863340564, |
| "grad_norm": 0.083984375, |
| "learning_rate": 0.000116, |
| "loss": 0.1745, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.11659436008676789, |
| "grad_norm": 0.053955078125, |
| "learning_rate": 0.00011399999999999999, |
| "loss": 0.1719, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.1193058568329718, |
| "grad_norm": 0.0390625, |
| "learning_rate": 0.00011200000000000001, |
| "loss": 0.1191, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.1220173535791757, |
| "grad_norm": 0.00799560546875, |
| "learning_rate": 0.00011000000000000002, |
| "loss": 0.0768, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.12472885032537961, |
| "grad_norm": 0.007720947265625, |
| "learning_rate": 0.00010800000000000001, |
| "loss": 0.1663, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.12744034707158353, |
| "grad_norm": 0.0289306640625, |
| "learning_rate": 0.00010600000000000002, |
| "loss": 0.1374, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.1301518438177874, |
| "grad_norm": 0.28125, |
| "learning_rate": 0.00010400000000000001, |
| "loss": 0.3337, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.13286334056399132, |
| "grad_norm": 0.01708984375, |
| "learning_rate": 0.00010200000000000001, |
| "loss": 0.3759, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.13557483731019523, |
| "grad_norm": 0.01611328125, |
| "learning_rate": 0.0001, |
| "loss": 0.237, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.13828633405639915, |
| "grad_norm": 42.75, |
| "learning_rate": 9.8e-05, |
| "loss": 0.2289, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.14099783080260303, |
| "grad_norm": 19.0, |
| "learning_rate": 9.6e-05, |
| "loss": 0.39, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.14370932754880694, |
| "grad_norm": 0.045166015625, |
| "learning_rate": 9.4e-05, |
| "loss": 0.2849, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.14642082429501085, |
| "grad_norm": 0.05224609375, |
| "learning_rate": 9.200000000000001e-05, |
| "loss": 0.281, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.14913232104121474, |
| "grad_norm": 0.0303955078125, |
| "learning_rate": 9e-05, |
| "loss": 0.107, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.15184381778741865, |
| "grad_norm": 0.0220947265625, |
| "learning_rate": 8.800000000000001e-05, |
| "loss": 0.3079, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.15455531453362256, |
| "grad_norm": 1.6953125, |
| "learning_rate": 8.6e-05, |
| "loss": 0.1839, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.15726681127982647, |
| "grad_norm": 1.03125, |
| "learning_rate": 8.4e-05, |
| "loss": 0.2098, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.15997830802603036, |
| "grad_norm": 0.2080078125, |
| "learning_rate": 8.2e-05, |
| "loss": 0.2058, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.16268980477223427, |
| "grad_norm": 0.0152587890625, |
| "learning_rate": 8e-05, |
| "loss": 0.2558, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.16540130151843818, |
| "grad_norm": 0.01025390625, |
| "learning_rate": 7.800000000000001e-05, |
| "loss": 0.2058, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.1681127982646421, |
| "grad_norm": 0.01904296875, |
| "learning_rate": 7.6e-05, |
| "loss": 0.1929, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.17082429501084598, |
| "grad_norm": 0.00732421875, |
| "learning_rate": 7.4e-05, |
| "loss": 0.1941, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.1735357917570499, |
| "grad_norm": 60.75, |
| "learning_rate": 7.2e-05, |
| "loss": 0.2042, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.1762472885032538, |
| "grad_norm": 0.028076171875, |
| "learning_rate": 7e-05, |
| "loss": 0.2161, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.1789587852494577, |
| "grad_norm": 1.5, |
| "learning_rate": 6.800000000000001e-05, |
| "loss": 0.1245, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.1816702819956616, |
| "grad_norm": 74.0, |
| "learning_rate": 6.6e-05, |
| "loss": 0.1341, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.1843817787418655, |
| "grad_norm": 0.0230712890625, |
| "learning_rate": 6.400000000000001e-05, |
| "loss": 0.1216, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.18709327548806942, |
| "grad_norm": 0.27734375, |
| "learning_rate": 6.2e-05, |
| "loss": 0.1534, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.1898047722342733, |
| "grad_norm": 1.421875, |
| "learning_rate": 6e-05, |
| "loss": 0.1427, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.19251626898047722, |
| "grad_norm": 0.006134033203125, |
| "learning_rate": 5.8e-05, |
| "loss": 0.1307, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.19522776572668113, |
| "grad_norm": 0.12060546875, |
| "learning_rate": 5.6000000000000006e-05, |
| "loss": 0.2316, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.19793926247288504, |
| "grad_norm": 0.0264892578125, |
| "learning_rate": 5.4000000000000005e-05, |
| "loss": 0.0439, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.20065075921908893, |
| "grad_norm": 0.054931640625, |
| "learning_rate": 5.2000000000000004e-05, |
| "loss": 0.2922, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.20336225596529284, |
| "grad_norm": 0.015380859375, |
| "learning_rate": 5e-05, |
| "loss": 0.3503, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.20607375271149675, |
| "grad_norm": 0.294921875, |
| "learning_rate": 4.8e-05, |
| "loss": 0.1714, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.20878524945770066, |
| "grad_norm": 0.146484375, |
| "learning_rate": 4.600000000000001e-05, |
| "loss": 0.0389, |
| "step": 3850 |
| }, |
| { |
| "epoch": 0.21149674620390455, |
| "grad_norm": 0.021240234375, |
| "learning_rate": 4.4000000000000006e-05, |
| "loss": 0.1058, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.21420824295010846, |
| "grad_norm": 0.0252685546875, |
| "learning_rate": 4.2e-05, |
| "loss": 0.1039, |
| "step": 3950 |
| }, |
| { |
| "epoch": 0.21691973969631237, |
| "grad_norm": 0.0751953125, |
| "learning_rate": 4e-05, |
| "loss": 0.1852, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.21963123644251628, |
| "grad_norm": 0.00927734375, |
| "learning_rate": 3.8e-05, |
| "loss": 0.0974, |
| "step": 4050 |
| }, |
| { |
| "epoch": 0.22234273318872017, |
| "grad_norm": 60.75, |
| "learning_rate": 3.6e-05, |
| "loss": 0.1371, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.22505422993492408, |
| "grad_norm": 0.16796875, |
| "learning_rate": 3.4000000000000007e-05, |
| "loss": 0.2533, |
| "step": 4150 |
| }, |
| { |
| "epoch": 0.227765726681128, |
| "grad_norm": 0.011474609375, |
| "learning_rate": 3.2000000000000005e-05, |
| "loss": 0.2726, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.2304772234273319, |
| "grad_norm": 0.2041015625, |
| "learning_rate": 3e-05, |
| "loss": 0.1344, |
| "step": 4250 |
| }, |
| { |
| "epoch": 0.23318872017353579, |
| "grad_norm": 0.031982421875, |
| "learning_rate": 2.8000000000000003e-05, |
| "loss": 0.1649, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.2359002169197397, |
| "grad_norm": 64.0, |
| "learning_rate": 2.6000000000000002e-05, |
| "loss": 0.0462, |
| "step": 4350 |
| }, |
| { |
| "epoch": 0.2386117136659436, |
| "grad_norm": 0.01348876953125, |
| "learning_rate": 2.4e-05, |
| "loss": 0.2292, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.2413232104121475, |
| "grad_norm": 0.0084228515625, |
| "learning_rate": 2.2000000000000003e-05, |
| "loss": 0.1774, |
| "step": 4450 |
| }, |
| { |
| "epoch": 0.2440347071583514, |
| "grad_norm": 0.03125, |
| "learning_rate": 2e-05, |
| "loss": 0.1743, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.24674620390455532, |
| "grad_norm": 0.01007080078125, |
| "learning_rate": 1.8e-05, |
| "loss": 0.3265, |
| "step": 4550 |
| }, |
| { |
| "epoch": 0.24945770065075923, |
| "grad_norm": 0.0157470703125, |
| "learning_rate": 1.6000000000000003e-05, |
| "loss": 0.0298, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.25216919739696314, |
| "grad_norm": 0.005828857421875, |
| "learning_rate": 1.4000000000000001e-05, |
| "loss": 0.0788, |
| "step": 4650 |
| }, |
| { |
| "epoch": 0.25488069414316705, |
| "grad_norm": 0.0400390625, |
| "learning_rate": 1.2e-05, |
| "loss": 0.1261, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.2575921908893709, |
| "grad_norm": 0.0108642578125, |
| "learning_rate": 1e-05, |
| "loss": 0.1364, |
| "step": 4750 |
| }, |
| { |
| "epoch": 0.2603036876355748, |
| "grad_norm": 0.0079345703125, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 0.1416, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.26301518438177873, |
| "grad_norm": 0.00830078125, |
| "learning_rate": 6e-06, |
| "loss": 0.1362, |
| "step": 4850 |
| }, |
| { |
| "epoch": 0.26572668112798264, |
| "grad_norm": 0.06298828125, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 0.112, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.26843817787418656, |
| "grad_norm": 0.0185546875, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 0.2715, |
| "step": 4950 |
| }, |
| { |
| "epoch": 0.27114967462039047, |
| "grad_norm": 0.2412109375, |
| "learning_rate": 0.0, |
| "loss": 0.2406, |
| "step": 5000 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 5000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 4.387481970192384e+16, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|