| { |
| "best_global_step": 180, |
| "best_metric": 0.22945284843444824, |
| "best_model_checkpoint": "saves_multiple/p-tuning/llama-3-8b-instruct/train_copa_789_1760637874/checkpoint-180", |
| "epoch": 20.0, |
| "eval_steps": 90, |
| "global_step": 1800, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.05555555555555555, |
| "grad_norm": 215.8870849609375, |
| "learning_rate": 2.2222222222222223e-05, |
| "loss": 4.687, |
| "num_input_tokens_seen": 1632, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.1111111111111111, |
| "grad_norm": 22.133590698242188, |
| "learning_rate": 5e-05, |
| "loss": 1.6103, |
| "num_input_tokens_seen": 3232, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.16666666666666666, |
| "grad_norm": 7.232245922088623, |
| "learning_rate": 7.777777777777778e-05, |
| "loss": 0.3703, |
| "num_input_tokens_seen": 4832, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.2222222222222222, |
| "grad_norm": 1.963056206703186, |
| "learning_rate": 0.00010555555555555555, |
| "loss": 0.2378, |
| "num_input_tokens_seen": 6432, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.2777777777777778, |
| "grad_norm": 6.656666278839111, |
| "learning_rate": 0.00013333333333333334, |
| "loss": 0.2584, |
| "num_input_tokens_seen": 7968, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.3333333333333333, |
| "grad_norm": 2.5032241344451904, |
| "learning_rate": 0.0001611111111111111, |
| "loss": 0.5879, |
| "num_input_tokens_seen": 9504, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.3888888888888889, |
| "grad_norm": 3.4075732231140137, |
| "learning_rate": 0.00018888888888888888, |
| "loss": 0.28, |
| "num_input_tokens_seen": 11104, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.4444444444444444, |
| "grad_norm": 1.5475021600723267, |
| "learning_rate": 0.00021666666666666668, |
| "loss": 0.2908, |
| "num_input_tokens_seen": 12704, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 7.280683517456055, |
| "learning_rate": 0.00024444444444444443, |
| "loss": 0.2511, |
| "num_input_tokens_seen": 14240, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.5555555555555556, |
| "grad_norm": 0.17710517346858978, |
| "learning_rate": 0.0002722222222222222, |
| "loss": 0.2512, |
| "num_input_tokens_seen": 15808, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.6111111111111112, |
| "grad_norm": 0.8657889366149902, |
| "learning_rate": 0.0003, |
| "loss": 0.4036, |
| "num_input_tokens_seen": 17344, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.6666666666666666, |
| "grad_norm": 1.1960246562957764, |
| "learning_rate": 0.0003277777777777778, |
| "loss": 0.2508, |
| "num_input_tokens_seen": 18912, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.7222222222222222, |
| "grad_norm": 0.656658411026001, |
| "learning_rate": 0.00035555555555555557, |
| "loss": 0.2354, |
| "num_input_tokens_seen": 20448, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.7777777777777778, |
| "grad_norm": 1.5388158559799194, |
| "learning_rate": 0.00038333333333333334, |
| "loss": 0.2819, |
| "num_input_tokens_seen": 21984, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.8333333333333334, |
| "grad_norm": 4.028906345367432, |
| "learning_rate": 0.0004111111111111111, |
| "loss": 0.2688, |
| "num_input_tokens_seen": 23552, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.8888888888888888, |
| "grad_norm": 0.4602603018283844, |
| "learning_rate": 0.0004388888888888889, |
| "loss": 0.2861, |
| "num_input_tokens_seen": 25120, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.9444444444444444, |
| "grad_norm": 2.0211222171783447, |
| "learning_rate": 0.00046666666666666666, |
| "loss": 0.2392, |
| "num_input_tokens_seen": 26656, |
| "step": 85 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.11489757150411606, |
| "learning_rate": 0.0004944444444444445, |
| "loss": 0.245, |
| "num_input_tokens_seen": 28192, |
| "step": 90 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 0.2327098846435547, |
| "eval_runtime": 1.1469, |
| "eval_samples_per_second": 34.877, |
| "eval_steps_per_second": 8.719, |
| "num_input_tokens_seen": 28192, |
| "step": 90 |
| }, |
| { |
| "epoch": 1.0555555555555556, |
| "grad_norm": 0.2278575897216797, |
| "learning_rate": 0.0005222222222222223, |
| "loss": 0.235, |
| "num_input_tokens_seen": 29792, |
| "step": 95 |
| }, |
| { |
| "epoch": 1.1111111111111112, |
| "grad_norm": 0.25667938590049744, |
| "learning_rate": 0.00055, |
| "loss": 0.2347, |
| "num_input_tokens_seen": 31328, |
| "step": 100 |
| }, |
| { |
| "epoch": 1.1666666666666667, |
| "grad_norm": 0.09546233713626862, |
| "learning_rate": 0.0005777777777777778, |
| "loss": 0.2524, |
| "num_input_tokens_seen": 32832, |
| "step": 105 |
| }, |
| { |
| "epoch": 1.2222222222222223, |
| "grad_norm": 0.16348819434642792, |
| "learning_rate": 0.0006055555555555556, |
| "loss": 0.2344, |
| "num_input_tokens_seen": 34304, |
| "step": 110 |
| }, |
| { |
| "epoch": 1.2777777777777777, |
| "grad_norm": 0.1358136087656021, |
| "learning_rate": 0.0006333333333333333, |
| "loss": 0.2314, |
| "num_input_tokens_seen": 35840, |
| "step": 115 |
| }, |
| { |
| "epoch": 1.3333333333333333, |
| "grad_norm": 0.12788349390029907, |
| "learning_rate": 0.0006611111111111111, |
| "loss": 0.2244, |
| "num_input_tokens_seen": 37376, |
| "step": 120 |
| }, |
| { |
| "epoch": 1.3888888888888888, |
| "grad_norm": 0.0501859076321125, |
| "learning_rate": 0.000688888888888889, |
| "loss": 0.2462, |
| "num_input_tokens_seen": 38944, |
| "step": 125 |
| }, |
| { |
| "epoch": 1.4444444444444444, |
| "grad_norm": 0.26624178886413574, |
| "learning_rate": 0.0007166666666666667, |
| "loss": 0.2241, |
| "num_input_tokens_seen": 40512, |
| "step": 130 |
| }, |
| { |
| "epoch": 1.5, |
| "grad_norm": 0.037180013954639435, |
| "learning_rate": 0.0007444444444444445, |
| "loss": 0.2453, |
| "num_input_tokens_seen": 42080, |
| "step": 135 |
| }, |
| { |
| "epoch": 1.5555555555555556, |
| "grad_norm": 0.10765981674194336, |
| "learning_rate": 0.0007722222222222223, |
| "loss": 0.2264, |
| "num_input_tokens_seen": 43680, |
| "step": 140 |
| }, |
| { |
| "epoch": 1.6111111111111112, |
| "grad_norm": 0.12236373126506805, |
| "learning_rate": 0.0008, |
| "loss": 0.2341, |
| "num_input_tokens_seen": 45216, |
| "step": 145 |
| }, |
| { |
| "epoch": 1.6666666666666665, |
| "grad_norm": 0.17313969135284424, |
| "learning_rate": 0.0008277777777777778, |
| "loss": 0.2343, |
| "num_input_tokens_seen": 46720, |
| "step": 150 |
| }, |
| { |
| "epoch": 1.7222222222222223, |
| "grad_norm": 0.9564811587333679, |
| "learning_rate": 0.0008555555555555556, |
| "loss": 0.2399, |
| "num_input_tokens_seen": 48288, |
| "step": 155 |
| }, |
| { |
| "epoch": 1.7777777777777777, |
| "grad_norm": 0.09225412458181381, |
| "learning_rate": 0.0008833333333333333, |
| "loss": 0.2303, |
| "num_input_tokens_seen": 49888, |
| "step": 160 |
| }, |
| { |
| "epoch": 1.8333333333333335, |
| "grad_norm": 0.09442174434661865, |
| "learning_rate": 0.0009111111111111111, |
| "loss": 0.2393, |
| "num_input_tokens_seen": 51424, |
| "step": 165 |
| }, |
| { |
| "epoch": 1.8888888888888888, |
| "grad_norm": 0.056451309472322464, |
| "learning_rate": 0.000938888888888889, |
| "loss": 0.2384, |
| "num_input_tokens_seen": 52992, |
| "step": 170 |
| }, |
| { |
| "epoch": 1.9444444444444444, |
| "grad_norm": 0.18257667124271393, |
| "learning_rate": 0.0009666666666666667, |
| "loss": 0.228, |
| "num_input_tokens_seen": 54592, |
| "step": 175 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 0.30549758672714233, |
| "learning_rate": 0.0009944444444444445, |
| "loss": 0.2443, |
| "num_input_tokens_seen": 56192, |
| "step": 180 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 0.22945284843444824, |
| "eval_runtime": 0.8098, |
| "eval_samples_per_second": 49.394, |
| "eval_steps_per_second": 12.349, |
| "num_input_tokens_seen": 56192, |
| "step": 180 |
| }, |
| { |
| "epoch": 2.0555555555555554, |
| "grad_norm": 0.19198457896709442, |
| "learning_rate": 0.000999984957239884, |
| "loss": 0.2365, |
| "num_input_tokens_seen": 57760, |
| "step": 185 |
| }, |
| { |
| "epoch": 2.111111111111111, |
| "grad_norm": 4.2871012687683105, |
| "learning_rate": 0.0009999238475781956, |
| "loss": 0.2684, |
| "num_input_tokens_seen": 59264, |
| "step": 190 |
| }, |
| { |
| "epoch": 2.1666666666666665, |
| "grad_norm": 1.6800514459609985, |
| "learning_rate": 0.000999815736583355, |
| "loss": 0.2517, |
| "num_input_tokens_seen": 60768, |
| "step": 195 |
| }, |
| { |
| "epoch": 2.2222222222222223, |
| "grad_norm": 0.0673963725566864, |
| "learning_rate": 0.000999660634419631, |
| "loss": 0.2444, |
| "num_input_tokens_seen": 62272, |
| "step": 200 |
| }, |
| { |
| "epoch": 2.2777777777777777, |
| "grad_norm": 0.046518296003341675, |
| "learning_rate": 0.0009994585556692623, |
| "loss": 0.2327, |
| "num_input_tokens_seen": 63808, |
| "step": 205 |
| }, |
| { |
| "epoch": 2.3333333333333335, |
| "grad_norm": 0.05228637158870697, |
| "learning_rate": 0.0009992095193310836, |
| "loss": 0.2406, |
| "num_input_tokens_seen": 65344, |
| "step": 210 |
| }, |
| { |
| "epoch": 2.388888888888889, |
| "grad_norm": 0.21179161965847015, |
| "learning_rate": 0.0009989135488187406, |
| "loss": 0.2357, |
| "num_input_tokens_seen": 66912, |
| "step": 215 |
| }, |
| { |
| "epoch": 2.4444444444444446, |
| "grad_norm": 0.337697297334671, |
| "learning_rate": 0.0009985706719584887, |
| "loss": 0.2345, |
| "num_input_tokens_seen": 68480, |
| "step": 220 |
| }, |
| { |
| "epoch": 2.5, |
| "grad_norm": 0.12051483988761902, |
| "learning_rate": 0.000998180920986577, |
| "loss": 0.2317, |
| "num_input_tokens_seen": 70048, |
| "step": 225 |
| }, |
| { |
| "epoch": 2.5555555555555554, |
| "grad_norm": 0.026494326069951057, |
| "learning_rate": 0.0009977443325462165, |
| "loss": 0.2303, |
| "num_input_tokens_seen": 71680, |
| "step": 230 |
| }, |
| { |
| "epoch": 2.611111111111111, |
| "grad_norm": 0.02274704910814762, |
| "learning_rate": 0.0009972609476841367, |
| "loss": 0.2404, |
| "num_input_tokens_seen": 73280, |
| "step": 235 |
| }, |
| { |
| "epoch": 2.6666666666666665, |
| "grad_norm": 0.12976308166980743, |
| "learning_rate": 0.0009967308118467252, |
| "loss": 0.2217, |
| "num_input_tokens_seen": 74752, |
| "step": 240 |
| }, |
| { |
| "epoch": 2.7222222222222223, |
| "grad_norm": 0.08644629269838333, |
| "learning_rate": 0.0009961539748757548, |
| "loss": 0.2598, |
| "num_input_tokens_seen": 76288, |
| "step": 245 |
| }, |
| { |
| "epoch": 2.7777777777777777, |
| "grad_norm": 0.0433812253177166, |
| "learning_rate": 0.0009955304910036994, |
| "loss": 0.2304, |
| "num_input_tokens_seen": 77888, |
| "step": 250 |
| }, |
| { |
| "epoch": 2.8333333333333335, |
| "grad_norm": 0.020187288522720337, |
| "learning_rate": 0.0009948604188486328, |
| "loss": 0.2377, |
| "num_input_tokens_seen": 79456, |
| "step": 255 |
| }, |
| { |
| "epoch": 2.888888888888889, |
| "grad_norm": 0.023410074412822723, |
| "learning_rate": 0.000994143821408719, |
| "loss": 0.2363, |
| "num_input_tokens_seen": 81088, |
| "step": 260 |
| }, |
| { |
| "epoch": 2.9444444444444446, |
| "grad_norm": 0.029354019090533257, |
| "learning_rate": 0.0009933807660562897, |
| "loss": 0.2357, |
| "num_input_tokens_seen": 82592, |
| "step": 265 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 0.01759847067296505, |
| "learning_rate": 0.0009925713245315083, |
| "loss": 0.2401, |
| "num_input_tokens_seen": 84192, |
| "step": 270 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 0.23134836554527283, |
| "eval_runtime": 0.813, |
| "eval_samples_per_second": 49.203, |
| "eval_steps_per_second": 12.301, |
| "num_input_tokens_seen": 84192, |
| "step": 270 |
| }, |
| { |
| "epoch": 3.0555555555555554, |
| "grad_norm": 0.06405995786190033, |
| "learning_rate": 0.0009917155729356273, |
| "loss": 0.2304, |
| "num_input_tokens_seen": 85696, |
| "step": 275 |
| }, |
| { |
| "epoch": 3.111111111111111, |
| "grad_norm": 0.0503934845328331, |
| "learning_rate": 0.000990813591723832, |
| "loss": 0.2375, |
| "num_input_tokens_seen": 87296, |
| "step": 280 |
| }, |
| { |
| "epoch": 3.1666666666666665, |
| "grad_norm": 0.016397392377257347, |
| "learning_rate": 0.000989865465697677, |
| "loss": 0.2318, |
| "num_input_tokens_seen": 88832, |
| "step": 285 |
| }, |
| { |
| "epoch": 3.2222222222222223, |
| "grad_norm": 0.06879417598247528, |
| "learning_rate": 0.0009888712839971133, |
| "loss": 0.2166, |
| "num_input_tokens_seen": 90368, |
| "step": 290 |
| }, |
| { |
| "epoch": 3.2777777777777777, |
| "grad_norm": 0.1464420109987259, |
| "learning_rate": 0.0009878311400921072, |
| "loss": 0.2354, |
| "num_input_tokens_seen": 91968, |
| "step": 295 |
| }, |
| { |
| "epoch": 3.3333333333333335, |
| "grad_norm": 0.40577006340026855, |
| "learning_rate": 0.0009867451317738534, |
| "loss": 0.2449, |
| "num_input_tokens_seen": 93568, |
| "step": 300 |
| }, |
| { |
| "epoch": 3.388888888888889, |
| "grad_norm": 0.03198301047086716, |
| "learning_rate": 0.0009856133611455802, |
| "loss": 0.2378, |
| "num_input_tokens_seen": 95168, |
| "step": 305 |
| }, |
| { |
| "epoch": 3.4444444444444446, |
| "grad_norm": 0.08572801202535629, |
| "learning_rate": 0.0009844359346129503, |
| "loss": 0.2425, |
| "num_input_tokens_seen": 96704, |
| "step": 310 |
| }, |
| { |
| "epoch": 3.5, |
| "grad_norm": 0.02046571485698223, |
| "learning_rate": 0.0009832129628740574, |
| "loss": 0.2357, |
| "num_input_tokens_seen": 98304, |
| "step": 315 |
| }, |
| { |
| "epoch": 3.5555555555555554, |
| "grad_norm": 0.06534157693386078, |
| "learning_rate": 0.0009819445609090174, |
| "loss": 0.2368, |
| "num_input_tokens_seen": 99840, |
| "step": 320 |
| }, |
| { |
| "epoch": 3.611111111111111, |
| "grad_norm": 0.012981094419956207, |
| "learning_rate": 0.0009806308479691594, |
| "loss": 0.2315, |
| "num_input_tokens_seen": 101376, |
| "step": 325 |
| }, |
| { |
| "epoch": 3.6666666666666665, |
| "grad_norm": 0.01108462642878294, |
| "learning_rate": 0.0009792719475658143, |
| "loss": 0.2294, |
| "num_input_tokens_seen": 102976, |
| "step": 330 |
| }, |
| { |
| "epoch": 3.7222222222222223, |
| "grad_norm": 0.090813547372818, |
| "learning_rate": 0.0009778679874587015, |
| "loss": 0.2325, |
| "num_input_tokens_seen": 104576, |
| "step": 335 |
| }, |
| { |
| "epoch": 3.7777777777777777, |
| "grad_norm": 0.04561146721243858, |
| "learning_rate": 0.0009764190996439181, |
| "loss": 0.2304, |
| "num_input_tokens_seen": 106112, |
| "step": 340 |
| }, |
| { |
| "epoch": 3.8333333333333335, |
| "grad_norm": 0.01101192831993103, |
| "learning_rate": 0.0009749254203415288, |
| "loss": 0.2314, |
| "num_input_tokens_seen": 107712, |
| "step": 345 |
| }, |
| { |
| "epoch": 3.888888888888889, |
| "grad_norm": 0.05584251880645752, |
| "learning_rate": 0.000973387089982759, |
| "loss": 0.234, |
| "num_input_tokens_seen": 109344, |
| "step": 350 |
| }, |
| { |
| "epoch": 3.9444444444444446, |
| "grad_norm": 0.09156426787376404, |
| "learning_rate": 0.0009718042531967918, |
| "loss": 0.2401, |
| "num_input_tokens_seen": 110944, |
| "step": 355 |
| }, |
| { |
| "epoch": 4.0, |
| "grad_norm": 0.010413792915642262, |
| "learning_rate": 0.0009701770587971706, |
| "loss": 0.2337, |
| "num_input_tokens_seen": 112544, |
| "step": 360 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_loss": 0.23112532496452332, |
| "eval_runtime": 0.8133, |
| "eval_samples_per_second": 49.181, |
| "eval_steps_per_second": 12.295, |
| "num_input_tokens_seen": 112544, |
| "step": 360 |
| }, |
| { |
| "epoch": 4.055555555555555, |
| "grad_norm": 0.05406933277845383, |
| "learning_rate": 0.0009685056597678075, |
| "loss": 0.2357, |
| "num_input_tokens_seen": 114144, |
| "step": 365 |
| }, |
| { |
| "epoch": 4.111111111111111, |
| "grad_norm": 0.11962758749723434, |
| "learning_rate": 0.0009667902132486009, |
| "loss": 0.2285, |
| "num_input_tokens_seen": 115712, |
| "step": 370 |
| }, |
| { |
| "epoch": 4.166666666666667, |
| "grad_norm": 0.07900924980640411, |
| "learning_rate": 0.0009650308805206616, |
| "loss": 0.2343, |
| "num_input_tokens_seen": 117312, |
| "step": 375 |
| }, |
| { |
| "epoch": 4.222222222222222, |
| "grad_norm": 0.06815142184495926, |
| "learning_rate": 0.0009632278269911492, |
| "loss": 0.2245, |
| "num_input_tokens_seen": 118848, |
| "step": 380 |
| }, |
| { |
| "epoch": 4.277777777777778, |
| "grad_norm": 0.0701470896601677, |
| "learning_rate": 0.0009613812221777212, |
| "loss": 0.2225, |
| "num_input_tokens_seen": 120416, |
| "step": 385 |
| }, |
| { |
| "epoch": 4.333333333333333, |
| "grad_norm": 0.022089658305048943, |
| "learning_rate": 0.0009594912396925958, |
| "loss": 0.2649, |
| "num_input_tokens_seen": 122016, |
| "step": 390 |
| }, |
| { |
| "epoch": 4.388888888888889, |
| "grad_norm": 0.04512215778231621, |
| "learning_rate": 0.0009575580572262289, |
| "loss": 0.2293, |
| "num_input_tokens_seen": 123584, |
| "step": 395 |
| }, |
| { |
| "epoch": 4.444444444444445, |
| "grad_norm": 0.013591914437711239, |
| "learning_rate": 0.0009555818565306084, |
| "loss": 0.2338, |
| "num_input_tokens_seen": 125216, |
| "step": 400 |
| }, |
| { |
| "epoch": 4.5, |
| "grad_norm": 0.08284556865692139, |
| "learning_rate": 0.0009535628234021669, |
| "loss": 0.2345, |
| "num_input_tokens_seen": 126784, |
| "step": 405 |
| }, |
| { |
| "epoch": 4.555555555555555, |
| "grad_norm": 0.08072531223297119, |
| "learning_rate": 0.0009515011476643126, |
| "loss": 0.2291, |
| "num_input_tokens_seen": 128352, |
| "step": 410 |
| }, |
| { |
| "epoch": 4.611111111111111, |
| "grad_norm": 0.057940974831581116, |
| "learning_rate": 0.0009493970231495835, |
| "loss": 0.2256, |
| "num_input_tokens_seen": 129920, |
| "step": 415 |
| }, |
| { |
| "epoch": 4.666666666666667, |
| "grad_norm": 0.10934175550937653, |
| "learning_rate": 0.0009472506476814238, |
| "loss": 0.2505, |
| "num_input_tokens_seen": 131488, |
| "step": 420 |
| }, |
| { |
| "epoch": 4.722222222222222, |
| "grad_norm": 0.04054225981235504, |
| "learning_rate": 0.0009450622230555847, |
| "loss": 0.2353, |
| "num_input_tokens_seen": 133056, |
| "step": 425 |
| }, |
| { |
| "epoch": 4.777777777777778, |
| "grad_norm": 0.03740108013153076, |
| "learning_rate": 0.0009428319550211531, |
| "loss": 0.231, |
| "num_input_tokens_seen": 134624, |
| "step": 430 |
| }, |
| { |
| "epoch": 4.833333333333333, |
| "grad_norm": 0.07108583301305771, |
| "learning_rate": 0.000940560053261206, |
| "loss": 0.2312, |
| "num_input_tokens_seen": 136192, |
| "step": 435 |
| }, |
| { |
| "epoch": 4.888888888888889, |
| "grad_norm": 0.033905934542417526, |
| "learning_rate": 0.0009382467313730985, |
| "loss": 0.2328, |
| "num_input_tokens_seen": 137824, |
| "step": 440 |
| }, |
| { |
| "epoch": 4.944444444444445, |
| "grad_norm": 0.07359946519136429, |
| "learning_rate": 0.0009358922068483812, |
| "loss": 0.2322, |
| "num_input_tokens_seen": 139392, |
| "step": 445 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 0.07416823506355286, |
| "learning_rate": 0.0009334967010523523, |
| "loss": 0.2353, |
| "num_input_tokens_seen": 140960, |
| "step": 450 |
| }, |
| { |
| "epoch": 5.0, |
| "eval_loss": 0.233585923910141, |
| "eval_runtime": 0.8135, |
| "eval_samples_per_second": 49.169, |
| "eval_steps_per_second": 12.292, |
| "num_input_tokens_seen": 140960, |
| "step": 450 |
| }, |
| { |
| "epoch": 5.055555555555555, |
| "grad_norm": 0.037136659026145935, |
| "learning_rate": 0.0009310604392032455, |
| "loss": 0.2314, |
| "num_input_tokens_seen": 142560, |
| "step": 455 |
| }, |
| { |
| "epoch": 5.111111111111111, |
| "grad_norm": 0.008749793283641338, |
| "learning_rate": 0.0009285836503510562, |
| "loss": 0.2301, |
| "num_input_tokens_seen": 144096, |
| "step": 460 |
| }, |
| { |
| "epoch": 5.166666666666667, |
| "grad_norm": 0.04284720867872238, |
| "learning_rate": 0.0009260665673560057, |
| "loss": 0.2352, |
| "num_input_tokens_seen": 145632, |
| "step": 465 |
| }, |
| { |
| "epoch": 5.222222222222222, |
| "grad_norm": 0.03149145096540451, |
| "learning_rate": 0.0009235094268666498, |
| "loss": 0.2297, |
| "num_input_tokens_seen": 147232, |
| "step": 470 |
| }, |
| { |
| "epoch": 5.277777777777778, |
| "grad_norm": 0.07827319204807281, |
| "learning_rate": 0.0009209124692976287, |
| "loss": 0.2324, |
| "num_input_tokens_seen": 148864, |
| "step": 475 |
| }, |
| { |
| "epoch": 5.333333333333333, |
| "grad_norm": 0.008484285324811935, |
| "learning_rate": 0.0009182759388070649, |
| "loss": 0.2366, |
| "num_input_tokens_seen": 150432, |
| "step": 480 |
| }, |
| { |
| "epoch": 5.388888888888889, |
| "grad_norm": 0.037567220628261566, |
| "learning_rate": 0.0009156000832736073, |
| "loss": 0.2271, |
| "num_input_tokens_seen": 152032, |
| "step": 485 |
| }, |
| { |
| "epoch": 5.444444444444445, |
| "grad_norm": 0.04004412144422531, |
| "learning_rate": 0.0009128851542731271, |
| "loss": 0.2315, |
| "num_input_tokens_seen": 153600, |
| "step": 490 |
| }, |
| { |
| "epoch": 5.5, |
| "grad_norm": 0.04386899247765541, |
| "learning_rate": 0.0009101314070550646, |
| "loss": 0.2342, |
| "num_input_tokens_seen": 155200, |
| "step": 495 |
| }, |
| { |
| "epoch": 5.555555555555555, |
| "grad_norm": 0.009182831272482872, |
| "learning_rate": 0.0009073391005184324, |
| "loss": 0.2372, |
| "num_input_tokens_seen": 156800, |
| "step": 500 |
| }, |
| { |
| "epoch": 5.611111111111111, |
| "grad_norm": 0.06390406936407089, |
| "learning_rate": 0.0009045084971874737, |
| "loss": 0.229, |
| "num_input_tokens_seen": 158336, |
| "step": 505 |
| }, |
| { |
| "epoch": 5.666666666666667, |
| "grad_norm": 0.007993469946086407, |
| "learning_rate": 0.0009016398631869811, |
| "loss": 0.2347, |
| "num_input_tokens_seen": 159904, |
| "step": 510 |
| }, |
| { |
| "epoch": 5.722222222222222, |
| "grad_norm": 0.03192123398184776, |
| "learning_rate": 0.0008987334682172759, |
| "loss": 0.2299, |
| "num_input_tokens_seen": 161504, |
| "step": 515 |
| }, |
| { |
| "epoch": 5.777777777777778, |
| "grad_norm": 0.03337305784225464, |
| "learning_rate": 0.0008957895855288517, |
| "loss": 0.2309, |
| "num_input_tokens_seen": 163008, |
| "step": 520 |
| }, |
| { |
| "epoch": 5.833333333333333, |
| "grad_norm": 0.007557485718280077, |
| "learning_rate": 0.000892808491896685, |
| "loss": 0.2309, |
| "num_input_tokens_seen": 164512, |
| "step": 525 |
| }, |
| { |
| "epoch": 5.888888888888889, |
| "grad_norm": 0.008209764957427979, |
| "learning_rate": 0.0008897904675942128, |
| "loss": 0.2319, |
| "num_input_tokens_seen": 166080, |
| "step": 530 |
| }, |
| { |
| "epoch": 5.944444444444445, |
| "grad_norm": 0.007890612818300724, |
| "learning_rate": 0.000886735796366982, |
| "loss": 0.2314, |
| "num_input_tokens_seen": 167680, |
| "step": 535 |
| }, |
| { |
| "epoch": 6.0, |
| "grad_norm": 0.0075538670644164085, |
| "learning_rate": 0.0008836447654059734, |
| "loss": 0.2335, |
| "num_input_tokens_seen": 169216, |
| "step": 540 |
| }, |
| { |
| "epoch": 6.0, |
| "eval_loss": 0.23072807490825653, |
| "eval_runtime": 0.8144, |
| "eval_samples_per_second": 49.116, |
| "eval_steps_per_second": 12.279, |
| "num_input_tokens_seen": 169216, |
| "step": 540 |
| }, |
| { |
| "epoch": 6.055555555555555, |
| "grad_norm": 0.008767198771238327, |
| "learning_rate": 0.0008805176653206003, |
| "loss": 0.2326, |
| "num_input_tokens_seen": 170784, |
| "step": 545 |
| }, |
| { |
| "epoch": 6.111111111111111, |
| "grad_norm": 0.032195769250392914, |
| "learning_rate": 0.000877354790111386, |
| "loss": 0.2273, |
| "num_input_tokens_seen": 172352, |
| "step": 550 |
| }, |
| { |
| "epoch": 6.166666666666667, |
| "grad_norm": 0.008529037237167358, |
| "learning_rate": 0.0008741564371423235, |
| "loss": 0.2227, |
| "num_input_tokens_seen": 173920, |
| "step": 555 |
| }, |
| { |
| "epoch": 6.222222222222222, |
| "grad_norm": 0.059331003576517105, |
| "learning_rate": 0.0008709229071129177, |
| "loss": 0.2277, |
| "num_input_tokens_seen": 175488, |
| "step": 560 |
| }, |
| { |
| "epoch": 6.277777777777778, |
| "grad_norm": 0.06853003799915314, |
| "learning_rate": 0.0008676545040299144, |
| "loss": 0.2339, |
| "num_input_tokens_seen": 177024, |
| "step": 565 |
| }, |
| { |
| "epoch": 6.333333333333333, |
| "grad_norm": 0.008488861843943596, |
| "learning_rate": 0.0008643515351787192, |
| "loss": 0.2281, |
| "num_input_tokens_seen": 178624, |
| "step": 570 |
| }, |
| { |
| "epoch": 6.388888888888889, |
| "grad_norm": 0.03643598034977913, |
| "learning_rate": 0.0008610143110945068, |
| "loss": 0.2325, |
| "num_input_tokens_seen": 180096, |
| "step": 575 |
| }, |
| { |
| "epoch": 6.444444444444445, |
| "grad_norm": 0.06390345096588135, |
| "learning_rate": 0.0008576431455330258, |
| "loss": 0.2229, |
| "num_input_tokens_seen": 181600, |
| "step": 580 |
| }, |
| { |
| "epoch": 6.5, |
| "grad_norm": 0.01061676349490881, |
| "learning_rate": 0.0008542383554411, |
| "loss": 0.2242, |
| "num_input_tokens_seen": 183232, |
| "step": 585 |
| }, |
| { |
| "epoch": 6.555555555555555, |
| "grad_norm": 0.03071434609591961, |
| "learning_rate": 0.0008508002609268301, |
| "loss": 0.2356, |
| "num_input_tokens_seen": 184768, |
| "step": 590 |
| }, |
| { |
| "epoch": 6.611111111111111, |
| "grad_norm": 0.04599233716726303, |
| "learning_rate": 0.0008473291852294987, |
| "loss": 0.2313, |
| "num_input_tokens_seen": 186208, |
| "step": 595 |
| }, |
| { |
| "epoch": 6.666666666666667, |
| "grad_norm": 0.028627226129174232, |
| "learning_rate": 0.0008438254546891792, |
| "loss": 0.2385, |
| "num_input_tokens_seen": 187776, |
| "step": 600 |
| }, |
| { |
| "epoch": 6.722222222222222, |
| "grad_norm": 0.012562990188598633, |
| "learning_rate": 0.0008402893987160552, |
| "loss": 0.2202, |
| "num_input_tokens_seen": 189344, |
| "step": 605 |
| }, |
| { |
| "epoch": 6.777777777777778, |
| "grad_norm": 0.012583953328430653, |
| "learning_rate": 0.0008367213497594501, |
| "loss": 0.2342, |
| "num_input_tokens_seen": 190912, |
| "step": 610 |
| }, |
| { |
| "epoch": 6.833333333333333, |
| "grad_norm": 0.028570545837283134, |
| "learning_rate": 0.0008331216432765713, |
| "loss": 0.241, |
| "num_input_tokens_seen": 192512, |
| "step": 615 |
| }, |
| { |
| "epoch": 6.888888888888889, |
| "grad_norm": 0.03644197806715965, |
| "learning_rate": 0.0008294906177009707, |
| "loss": 0.2338, |
| "num_input_tokens_seen": 194112, |
| "step": 620 |
| }, |
| { |
| "epoch": 6.944444444444445, |
| "grad_norm": 0.010511292144656181, |
| "learning_rate": 0.0008258286144107276, |
| "loss": 0.2335, |
| "num_input_tokens_seen": 195712, |
| "step": 625 |
| }, |
| { |
| "epoch": 7.0, |
| "grad_norm": 0.03820033371448517, |
| "learning_rate": 0.0008221359776963525, |
| "loss": 0.2306, |
| "num_input_tokens_seen": 197248, |
| "step": 630 |
| }, |
| { |
| "epoch": 7.0, |
| "eval_loss": 0.23276683688163757, |
| "eval_runtime": 0.8141, |
| "eval_samples_per_second": 49.134, |
| "eval_steps_per_second": 12.283, |
| "num_input_tokens_seen": 197248, |
| "step": 630 |
| }, |
| { |
| "epoch": 7.055555555555555, |
| "grad_norm": 0.010046997107565403, |
| "learning_rate": 0.000818413054728418, |
| "loss": 0.2401, |
| "num_input_tokens_seen": 198784, |
| "step": 635 |
| }, |
| { |
| "epoch": 7.111111111111111, |
| "grad_norm": 0.03139125183224678, |
| "learning_rate": 0.0008146601955249188, |
| "loss": 0.2324, |
| "num_input_tokens_seen": 200352, |
| "step": 640 |
| }, |
| { |
| "epoch": 7.166666666666667, |
| "grad_norm": 0.008107461966574192, |
| "learning_rate": 0.0008108777529183644, |
| "loss": 0.2297, |
| "num_input_tokens_seen": 201856, |
| "step": 645 |
| }, |
| { |
| "epoch": 7.222222222222222, |
| "grad_norm": 0.012090813368558884, |
| "learning_rate": 0.000807066082522607, |
| "loss": 0.2299, |
| "num_input_tokens_seen": 203488, |
| "step": 650 |
| }, |
| { |
| "epoch": 7.277777777777778, |
| "grad_norm": 0.033598825335502625, |
| "learning_rate": 0.0008032255426994069, |
| "loss": 0.2278, |
| "num_input_tokens_seen": 205056, |
| "step": 655 |
| }, |
| { |
| "epoch": 7.333333333333333, |
| "grad_norm": 0.009637514129281044, |
| "learning_rate": 0.0007993564945247409, |
| "loss": 0.2163, |
| "num_input_tokens_seen": 206656, |
| "step": 660 |
| }, |
| { |
| "epoch": 7.388888888888889, |
| "grad_norm": 0.035691387951374054, |
| "learning_rate": 0.0007954593017548556, |
| "loss": 0.2323, |
| "num_input_tokens_seen": 208288, |
| "step": 665 |
| }, |
| { |
| "epoch": 7.444444444444445, |
| "grad_norm": 0.03339530527591705, |
| "learning_rate": 0.0007915343307920673, |
| "loss": 0.2278, |
| "num_input_tokens_seen": 209856, |
| "step": 670 |
| }, |
| { |
| "epoch": 7.5, |
| "grad_norm": 0.05436629429459572, |
| "learning_rate": 0.0007875819506503144, |
| "loss": 0.2611, |
| "num_input_tokens_seen": 211424, |
| "step": 675 |
| }, |
| { |
| "epoch": 7.555555555555555, |
| "grad_norm": 0.028627492487430573, |
| "learning_rate": 0.0007836025329204635, |
| "loss": 0.2402, |
| "num_input_tokens_seen": 212928, |
| "step": 680 |
| }, |
| { |
| "epoch": 7.611111111111111, |
| "grad_norm": 0.009143758565187454, |
| "learning_rate": 0.0007795964517353734, |
| "loss": 0.2251, |
| "num_input_tokens_seen": 214496, |
| "step": 685 |
| }, |
| { |
| "epoch": 7.666666666666667, |
| "grad_norm": 0.0279055368155241, |
| "learning_rate": 0.0007755640837347215, |
| "loss": 0.227, |
| "num_input_tokens_seen": 216064, |
| "step": 690 |
| }, |
| { |
| "epoch": 7.722222222222222, |
| "grad_norm": 0.008520636707544327, |
| "learning_rate": 0.0007715058080295917, |
| "loss": 0.2359, |
| "num_input_tokens_seen": 217600, |
| "step": 695 |
| }, |
| { |
| "epoch": 7.777777777777778, |
| "grad_norm": 0.007954890839755535, |
| "learning_rate": 0.0007674220061668323, |
| "loss": 0.2393, |
| "num_input_tokens_seen": 219136, |
| "step": 700 |
| }, |
| { |
| "epoch": 7.833333333333333, |
| "grad_norm": 0.0061981771141290665, |
| "learning_rate": 0.0007633130620931837, |
| "loss": 0.2368, |
| "num_input_tokens_seen": 220704, |
| "step": 705 |
| }, |
| { |
| "epoch": 7.888888888888889, |
| "grad_norm": 0.008587339892983437, |
| "learning_rate": 0.0007591793621191819, |
| "loss": 0.2299, |
| "num_input_tokens_seen": 222272, |
| "step": 710 |
| }, |
| { |
| "epoch": 7.944444444444445, |
| "grad_norm": 0.014931959100067616, |
| "learning_rate": 0.0007550212948828377, |
| "loss": 0.2227, |
| "num_input_tokens_seen": 223872, |
| "step": 715 |
| }, |
| { |
| "epoch": 8.0, |
| "grad_norm": 0.03580380976200104, |
| "learning_rate": 0.0007508392513130979, |
| "loss": 0.2366, |
| "num_input_tokens_seen": 225440, |
| "step": 720 |
| }, |
| { |
| "epoch": 8.0, |
| "eval_loss": 0.23274831473827362, |
| "eval_runtime": 0.8134, |
| "eval_samples_per_second": 49.175, |
| "eval_steps_per_second": 12.294, |
| "num_input_tokens_seen": 225440, |
| "step": 720 |
| }, |
| { |
| "epoch": 8.055555555555555, |
| "grad_norm": 0.016410276293754578, |
| "learning_rate": 0.0007466336245930927, |
| "loss": 0.2335, |
| "num_input_tokens_seen": 226912, |
| "step": 725 |
| }, |
| { |
| "epoch": 8.11111111111111, |
| "grad_norm": 0.010216175578534603, |
| "learning_rate": 0.0007424048101231686, |
| "loss": 0.2378, |
| "num_input_tokens_seen": 228512, |
| "step": 730 |
| }, |
| { |
| "epoch": 8.166666666666666, |
| "grad_norm": 0.026450112462043762, |
| "learning_rate": 0.0007381532054837144, |
| "loss": 0.2351, |
| "num_input_tokens_seen": 230080, |
| "step": 735 |
| }, |
| { |
| "epoch": 8.222222222222221, |
| "grad_norm": 0.06160770729184151, |
| "learning_rate": 0.0007338792103977821, |
| "loss": 0.2347, |
| "num_input_tokens_seen": 231616, |
| "step": 740 |
| }, |
| { |
| "epoch": 8.277777777777779, |
| "grad_norm": 0.029110893607139587, |
| "learning_rate": 0.0007295832266935059, |
| "loss": 0.2294, |
| "num_input_tokens_seen": 233184, |
| "step": 745 |
| }, |
| { |
| "epoch": 8.333333333333334, |
| "grad_norm": 0.00839630514383316, |
| "learning_rate": 0.0007252656582663236, |
| "loss": 0.2356, |
| "num_input_tokens_seen": 234688, |
| "step": 750 |
| }, |
| { |
| "epoch": 8.38888888888889, |
| "grad_norm": 0.008437261916697025, |
| "learning_rate": 0.0007209269110410039, |
| "loss": 0.2282, |
| "num_input_tokens_seen": 236256, |
| "step": 755 |
| }, |
| { |
| "epoch": 8.444444444444445, |
| "grad_norm": 0.026610706001520157, |
| "learning_rate": 0.0007165673929334815, |
| "loss": 0.2304, |
| "num_input_tokens_seen": 237856, |
| "step": 760 |
| }, |
| { |
| "epoch": 8.5, |
| "grad_norm": 0.052499689161777496, |
| "learning_rate": 0.0007121875138125077, |
| "loss": 0.2314, |
| "num_input_tokens_seen": 239392, |
| "step": 765 |
| }, |
| { |
| "epoch": 8.555555555555555, |
| "grad_norm": 0.05292768031358719, |
| "learning_rate": 0.0007077876854611145, |
| "loss": 0.2302, |
| "num_input_tokens_seen": 240960, |
| "step": 770 |
| }, |
| { |
| "epoch": 8.61111111111111, |
| "grad_norm": 0.030896423384547234, |
| "learning_rate": 0.0007033683215379002, |
| "loss": 0.2295, |
| "num_input_tokens_seen": 242496, |
| "step": 775 |
| }, |
| { |
| "epoch": 8.666666666666666, |
| "grad_norm": 0.009958789683878422, |
| "learning_rate": 0.000698929837538139, |
| "loss": 0.2379, |
| "num_input_tokens_seen": 244064, |
| "step": 780 |
| }, |
| { |
| "epoch": 8.722222222222221, |
| "grad_norm": 0.02802702784538269, |
| "learning_rate": 0.0006944726507547168, |
| "loss": 0.2384, |
| "num_input_tokens_seen": 245664, |
| "step": 785 |
| }, |
| { |
| "epoch": 8.777777777777779, |
| "grad_norm": 0.02543003484606743, |
| "learning_rate": 0.0006899971802388996, |
| "loss": 0.2293, |
| "num_input_tokens_seen": 247264, |
| "step": 790 |
| }, |
| { |
| "epoch": 8.833333333333334, |
| "grad_norm": 0.00691555580124259, |
| "learning_rate": 0.0006855038467609335, |
| "loss": 0.2368, |
| "num_input_tokens_seen": 248832, |
| "step": 795 |
| }, |
| { |
| "epoch": 8.88888888888889, |
| "grad_norm": 0.028850693255662918, |
| "learning_rate": 0.0006809930727704874, |
| "loss": 0.2304, |
| "num_input_tokens_seen": 250432, |
| "step": 800 |
| }, |
| { |
| "epoch": 8.944444444444445, |
| "grad_norm": 0.008373413234949112, |
| "learning_rate": 0.0006764652823569344, |
| "loss": 0.2268, |
| "num_input_tokens_seen": 252064, |
| "step": 805 |
| }, |
| { |
| "epoch": 9.0, |
| "grad_norm": 0.030820351094007492, |
| "learning_rate": 0.0006719209012094805, |
| "loss": 0.2396, |
| "num_input_tokens_seen": 253632, |
| "step": 810 |
| }, |
| { |
| "epoch": 9.0, |
| "eval_loss": 0.23413518071174622, |
| "eval_runtime": 0.814, |
| "eval_samples_per_second": 49.14, |
| "eval_steps_per_second": 12.285, |
| "num_input_tokens_seen": 253632, |
| "step": 810 |
| }, |
| { |
| "epoch": 9.055555555555555, |
| "grad_norm": 0.007299968507140875, |
| "learning_rate": 0.0006673603565771424, |
| "loss": 0.227, |
| "num_input_tokens_seen": 255232, |
| "step": 815 |
| }, |
| { |
| "epoch": 9.11111111111111, |
| "grad_norm": 0.007571425288915634, |
| "learning_rate": 0.0006627840772285784, |
| "loss": 0.2285, |
| "num_input_tokens_seen": 256800, |
| "step": 820 |
| }, |
| { |
| "epoch": 9.166666666666666, |
| "grad_norm": 0.007662767544388771, |
| "learning_rate": 0.0006581924934117783, |
| "loss": 0.2347, |
| "num_input_tokens_seen": 258368, |
| "step": 825 |
| }, |
| { |
| "epoch": 9.222222222222221, |
| "grad_norm": 0.027202824130654335, |
| "learning_rate": 0.0006535860368136113, |
| "loss": 0.231, |
| "num_input_tokens_seen": 259872, |
| "step": 830 |
| }, |
| { |
| "epoch": 9.277777777777779, |
| "grad_norm": 0.028580373153090477, |
| "learning_rate": 0.0006489651405192409, |
| "loss": 0.2346, |
| "num_input_tokens_seen": 261376, |
| "step": 835 |
| }, |
| { |
| "epoch": 9.333333333333334, |
| "grad_norm": 0.026986513286828995, |
| "learning_rate": 0.0006443302389714074, |
| "loss": 0.2336, |
| "num_input_tokens_seen": 262944, |
| "step": 840 |
| }, |
| { |
| "epoch": 9.38888888888889, |
| "grad_norm": 0.057506296783685684, |
| "learning_rate": 0.0006396817679295822, |
| "loss": 0.2384, |
| "num_input_tokens_seen": 264544, |
| "step": 845 |
| }, |
| { |
| "epoch": 9.444444444444445, |
| "grad_norm": 0.008491387590765953, |
| "learning_rate": 0.0006350201644290005, |
| "loss": 0.2325, |
| "num_input_tokens_seen": 266080, |
| "step": 850 |
| }, |
| { |
| "epoch": 9.5, |
| "grad_norm": 0.026944199576973915, |
| "learning_rate": 0.0006303458667395708, |
| "loss": 0.2329, |
| "num_input_tokens_seen": 267680, |
| "step": 855 |
| }, |
| { |
| "epoch": 9.555555555555555, |
| "grad_norm": 0.02705678716301918, |
| "learning_rate": 0.0006256593143246718, |
| "loss": 0.2303, |
| "num_input_tokens_seen": 269280, |
| "step": 860 |
| }, |
| { |
| "epoch": 9.61111111111111, |
| "grad_norm": 0.007777373772114515, |
| "learning_rate": 0.0006209609477998338, |
| "loss": 0.2318, |
| "num_input_tokens_seen": 270912, |
| "step": 865 |
| }, |
| { |
| "epoch": 9.666666666666666, |
| "grad_norm": 0.027663791552186012, |
| "learning_rate": 0.0006162512088913149, |
| "loss": 0.2273, |
| "num_input_tokens_seen": 272512, |
| "step": 870 |
| }, |
| { |
| "epoch": 9.722222222222221, |
| "grad_norm": 0.028824860230088234, |
| "learning_rate": 0.0006115305403945697, |
| "loss": 0.2331, |
| "num_input_tokens_seen": 274048, |
| "step": 875 |
| }, |
| { |
| "epoch": 9.777777777777779, |
| "grad_norm": 0.051675502210855484, |
| "learning_rate": 0.0006067993861326201, |
| "loss": 0.2304, |
| "num_input_tokens_seen": 275648, |
| "step": 880 |
| }, |
| { |
| "epoch": 9.833333333333334, |
| "grad_norm": 0.00994098000228405, |
| "learning_rate": 0.0006020581909143279, |
| "loss": 0.2346, |
| "num_input_tokens_seen": 277248, |
| "step": 885 |
| }, |
| { |
| "epoch": 9.88888888888889, |
| "grad_norm": 0.02850501798093319, |
| "learning_rate": 0.0005973074004925755, |
| "loss": 0.23, |
| "num_input_tokens_seen": 278816, |
| "step": 890 |
| }, |
| { |
| "epoch": 9.944444444444445, |
| "grad_norm": 0.027266889810562134, |
| "learning_rate": 0.0005925474615223572, |
| "loss": 0.2304, |
| "num_input_tokens_seen": 280416, |
| "step": 895 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 0.05180501565337181, |
| "learning_rate": 0.0005877788215187867, |
| "loss": 0.228, |
| "num_input_tokens_seen": 281984, |
| "step": 900 |
| }, |
| { |
| "epoch": 10.0, |
| "eval_loss": 0.23290684819221497, |
| "eval_runtime": 0.8154, |
| "eval_samples_per_second": 49.058, |
| "eval_steps_per_second": 12.264, |
| "num_input_tokens_seen": 281984, |
| "step": 900 |
| }, |
| { |
| "epoch": 10.055555555555555, |
| "grad_norm": 0.051713358610868454, |
| "learning_rate": 0.0005830019288150222, |
| "loss": 0.2385, |
| "num_input_tokens_seen": 283552, |
| "step": 905 |
| }, |
| { |
| "epoch": 10.11111111111111, |
| "grad_norm": 0.01033524889498949, |
| "learning_rate": 0.0005782172325201155, |
| "loss": 0.2296, |
| "num_input_tokens_seen": 285120, |
| "step": 910 |
| }, |
| { |
| "epoch": 10.166666666666666, |
| "grad_norm": 0.05586029216647148, |
| "learning_rate": 0.0005734251824767894, |
| "loss": 0.2321, |
| "num_input_tokens_seen": 286720, |
| "step": 915 |
| }, |
| { |
| "epoch": 10.222222222222221, |
| "grad_norm": 0.011709915474057198, |
| "learning_rate": 0.0005686262292191438, |
| "loss": 0.2324, |
| "num_input_tokens_seen": 288288, |
| "step": 920 |
| }, |
| { |
| "epoch": 10.277777777777779, |
| "grad_norm": 0.031557176262140274, |
| "learning_rate": 0.0005638208239302974, |
| "loss": 0.2286, |
| "num_input_tokens_seen": 289856, |
| "step": 925 |
| }, |
| { |
| "epoch": 10.333333333333334, |
| "grad_norm": 0.05073035880923271, |
| "learning_rate": 0.0005590094183999698, |
| "loss": 0.2289, |
| "num_input_tokens_seen": 291392, |
| "step": 930 |
| }, |
| { |
| "epoch": 10.38888888888889, |
| "grad_norm": 0.05272587388753891, |
| "learning_rate": 0.0005541924649820054, |
| "loss": 0.2294, |
| "num_input_tokens_seen": 292864, |
| "step": 935 |
| }, |
| { |
| "epoch": 10.444444444444445, |
| "grad_norm": 0.03139587864279747, |
| "learning_rate": 0.000549370416551844, |
| "loss": 0.2315, |
| "num_input_tokens_seen": 294496, |
| "step": 940 |
| }, |
| { |
| "epoch": 10.5, |
| "grad_norm": 0.03361722081899643, |
| "learning_rate": 0.0005445437264639432, |
| "loss": 0.2263, |
| "num_input_tokens_seen": 296096, |
| "step": 945 |
| }, |
| { |
| "epoch": 10.555555555555555, |
| "grad_norm": 0.03754409775137901, |
| "learning_rate": 0.0005397128485091551, |
| "loss": 0.2281, |
| "num_input_tokens_seen": 297632, |
| "step": 950 |
| }, |
| { |
| "epoch": 10.61111111111111, |
| "grad_norm": 0.030270714312791824, |
| "learning_rate": 0.0005348782368720626, |
| "loss": 0.2347, |
| "num_input_tokens_seen": 299168, |
| "step": 955 |
| }, |
| { |
| "epoch": 10.666666666666666, |
| "grad_norm": 0.03540631756186485, |
| "learning_rate": 0.0005300403460882783, |
| "loss": 0.231, |
| "num_input_tokens_seen": 300736, |
| "step": 960 |
| }, |
| { |
| "epoch": 10.722222222222221, |
| "grad_norm": 0.032770272344350815, |
| "learning_rate": 0.00052519963100171, |
| "loss": 0.2333, |
| "num_input_tokens_seen": 302304, |
| "step": 965 |
| }, |
| { |
| "epoch": 10.777777777777779, |
| "grad_norm": 0.030736278742551804, |
| "learning_rate": 0.000520356546721798, |
| "loss": 0.225, |
| "num_input_tokens_seen": 303936, |
| "step": 970 |
| }, |
| { |
| "epoch": 10.833333333333334, |
| "grad_norm": 0.03407590091228485, |
| "learning_rate": 0.0005155115485807269, |
| "loss": 0.2315, |
| "num_input_tokens_seen": 305504, |
| "step": 975 |
| }, |
| { |
| "epoch": 10.88888888888889, |
| "grad_norm": 0.03474384918808937, |
| "learning_rate": 0.0005106650920906171, |
| "loss": 0.2336, |
| "num_input_tokens_seen": 307040, |
| "step": 980 |
| }, |
| { |
| "epoch": 10.944444444444445, |
| "grad_norm": 0.03182826563715935, |
| "learning_rate": 0.0005058176329006986, |
| "loss": 0.2247, |
| "num_input_tokens_seen": 308608, |
| "step": 985 |
| }, |
| { |
| "epoch": 11.0, |
| "grad_norm": 0.08343902230262756, |
| "learning_rate": 0.0005009696267544715, |
| "loss": 0.2447, |
| "num_input_tokens_seen": 310176, |
| "step": 990 |
| }, |
| { |
| "epoch": 11.0, |
| "eval_loss": 0.23567497730255127, |
| "eval_runtime": 0.8148, |
| "eval_samples_per_second": 49.094, |
| "eval_steps_per_second": 12.274, |
| "num_input_tokens_seen": 310176, |
| "step": 990 |
| }, |
| { |
| "epoch": 11.055555555555555, |
| "grad_norm": 0.05768521502614021, |
| "learning_rate": 0.0004961215294468599, |
| "loss": 0.2235, |
| "num_input_tokens_seen": 311712, |
| "step": 995 |
| }, |
| { |
| "epoch": 11.11111111111111, |
| "grad_norm": 0.01326485350728035, |
| "learning_rate": 0.0004912737967813582, |
| "loss": 0.2318, |
| "num_input_tokens_seen": 313280, |
| "step": 1000 |
| }, |
| { |
| "epoch": 11.166666666666666, |
| "grad_norm": 0.028805643320083618, |
| "learning_rate": 0.0004864268845271786, |
| "loss": 0.2188, |
| "num_input_tokens_seen": 314816, |
| "step": 1005 |
| }, |
| { |
| "epoch": 11.222222222222221, |
| "grad_norm": 0.013591939583420753, |
| "learning_rate": 0.0004815812483764, |
| "loss": 0.2347, |
| "num_input_tokens_seen": 316384, |
| "step": 1010 |
| }, |
| { |
| "epoch": 11.277777777777779, |
| "grad_norm": 0.04025491699576378, |
| "learning_rate": 0.0004767373439011267, |
| "loss": 0.2427, |
| "num_input_tokens_seen": 317984, |
| "step": 1015 |
| }, |
| { |
| "epoch": 11.333333333333334, |
| "grad_norm": 0.020464973524212837, |
| "learning_rate": 0.00047189562651065565, |
| "loss": 0.2307, |
| "num_input_tokens_seen": 319552, |
| "step": 1020 |
| }, |
| { |
| "epoch": 11.38888888888889, |
| "grad_norm": 0.06203271448612213, |
| "learning_rate": 0.00046705655140866074, |
| "loss": 0.2357, |
| "num_input_tokens_seen": 321184, |
| "step": 1025 |
| }, |
| { |
| "epoch": 11.444444444444445, |
| "grad_norm": 0.029454471543431282, |
| "learning_rate": 0.0004622205735503961, |
| "loss": 0.2282, |
| "num_input_tokens_seen": 322688, |
| "step": 1030 |
| }, |
| { |
| "epoch": 11.5, |
| "grad_norm": 0.06667706370353699, |
| "learning_rate": 0.00045738814759992174, |
| "loss": 0.234, |
| "num_input_tokens_seen": 324224, |
| "step": 1035 |
| }, |
| { |
| "epoch": 11.555555555555555, |
| "grad_norm": 0.012882853858172894, |
| "learning_rate": 0.00045255972788735873, |
| "loss": 0.2319, |
| "num_input_tokens_seen": 325760, |
| "step": 1040 |
| }, |
| { |
| "epoch": 11.61111111111111, |
| "grad_norm": 0.03125486522912979, |
| "learning_rate": 0.00044773576836617336, |
| "loss": 0.2318, |
| "num_input_tokens_seen": 327264, |
| "step": 1045 |
| }, |
| { |
| "epoch": 11.666666666666666, |
| "grad_norm": 0.012210973538458347, |
| "learning_rate": 0.000442916722570498, |
| "loss": 0.2313, |
| "num_input_tokens_seen": 328832, |
| "step": 1050 |
| }, |
| { |
| "epoch": 11.722222222222221, |
| "grad_norm": 0.01437061931937933, |
| "learning_rate": 0.0004381030435724919, |
| "loss": 0.2272, |
| "num_input_tokens_seen": 330464, |
| "step": 1055 |
| }, |
| { |
| "epoch": 11.777777777777779, |
| "grad_norm": 0.012531839311122894, |
| "learning_rate": 0.00043329518393974364, |
| "loss": 0.2293, |
| "num_input_tokens_seen": 332064, |
| "step": 1060 |
| }, |
| { |
| "epoch": 11.833333333333334, |
| "grad_norm": 0.03327002376317978, |
| "learning_rate": 0.0004284935956927229, |
| "loss": 0.2299, |
| "num_input_tokens_seen": 333664, |
| "step": 1065 |
| }, |
| { |
| "epoch": 11.88888888888889, |
| "grad_norm": 0.014634636230766773, |
| "learning_rate": 0.00042369873026228263, |
| "loss": 0.2268, |
| "num_input_tokens_seen": 335232, |
| "step": 1070 |
| }, |
| { |
| "epoch": 11.944444444444445, |
| "grad_norm": 0.014177635312080383, |
| "learning_rate": 0.00041891103844721633, |
| "loss": 0.229, |
| "num_input_tokens_seen": 336800, |
| "step": 1075 |
| }, |
| { |
| "epoch": 12.0, |
| "grad_norm": 0.03053419105708599, |
| "learning_rate": 0.00041413097037187657, |
| "loss": 0.229, |
| "num_input_tokens_seen": 338400, |
| "step": 1080 |
| }, |
| { |
| "epoch": 12.0, |
| "eval_loss": 0.23775029182434082, |
| "eval_runtime": 0.8142, |
| "eval_samples_per_second": 49.127, |
| "eval_steps_per_second": 12.282, |
| "num_input_tokens_seen": 338400, |
| "step": 1080 |
| }, |
| { |
| "epoch": 12.055555555555555, |
| "grad_norm": 0.03754847124218941, |
| "learning_rate": 0.00040935897544385424, |
| "loss": 0.2332, |
| "num_input_tokens_seen": 339968, |
| "step": 1085 |
| }, |
| { |
| "epoch": 12.11111111111111, |
| "grad_norm": 0.02007185108959675, |
| "learning_rate": 0.0004045955023117276, |
| "loss": 0.2273, |
| "num_input_tokens_seen": 341536, |
| "step": 1090 |
| }, |
| { |
| "epoch": 12.166666666666666, |
| "grad_norm": 0.021748747676610947, |
| "learning_rate": 0.00039984099882288133, |
| "loss": 0.2319, |
| "num_input_tokens_seen": 343104, |
| "step": 1095 |
| }, |
| { |
| "epoch": 12.222222222222221, |
| "grad_norm": 0.03744516894221306, |
| "learning_rate": 0.0003950959119814013, |
| "loss": 0.2293, |
| "num_input_tokens_seen": 344640, |
| "step": 1100 |
| }, |
| { |
| "epoch": 12.277777777777779, |
| "grad_norm": 0.042804788798093796, |
| "learning_rate": 0.0003903606879060483, |
| "loss": 0.2314, |
| "num_input_tokens_seen": 346272, |
| "step": 1105 |
| }, |
| { |
| "epoch": 12.333333333333334, |
| "grad_norm": 0.023323964327573776, |
| "learning_rate": 0.0003856357717883161, |
| "loss": 0.2309, |
| "num_input_tokens_seen": 347808, |
| "step": 1110 |
| }, |
| { |
| "epoch": 12.38888888888889, |
| "grad_norm": 0.05394618213176727, |
| "learning_rate": 0.00038092160785057466, |
| "loss": 0.2305, |
| "num_input_tokens_seen": 349344, |
| "step": 1115 |
| }, |
| { |
| "epoch": 12.444444444444445, |
| "grad_norm": 0.017731385305523872, |
| "learning_rate": 0.00037621863930430713, |
| "loss": 0.2288, |
| "num_input_tokens_seen": 350912, |
| "step": 1120 |
| }, |
| { |
| "epoch": 12.5, |
| "grad_norm": 0.042858049273490906, |
| "learning_rate": 0.000371527308308439, |
| "loss": 0.2278, |
| "num_input_tokens_seen": 352480, |
| "step": 1125 |
| }, |
| { |
| "epoch": 12.555555555555555, |
| "grad_norm": 0.04008388891816139, |
| "learning_rate": 0.00036684805592776895, |
| "loss": 0.228, |
| "num_input_tokens_seen": 354048, |
| "step": 1130 |
| }, |
| { |
| "epoch": 12.61111111111111, |
| "grad_norm": 0.05050317198038101, |
| "learning_rate": 0.00036218132209150044, |
| "loss": 0.2243, |
| "num_input_tokens_seen": 355648, |
| "step": 1135 |
| }, |
| { |
| "epoch": 12.666666666666666, |
| "grad_norm": 0.08396083861589432, |
| "learning_rate": 0.0003575275455518811, |
| "loss": 0.2372, |
| "num_input_tokens_seen": 357248, |
| "step": 1140 |
| }, |
| { |
| "epoch": 12.722222222222221, |
| "grad_norm": 0.0884268507361412, |
| "learning_rate": 0.00035288716384295236, |
| "loss": 0.2273, |
| "num_input_tokens_seen": 358752, |
| "step": 1145 |
| }, |
| { |
| "epoch": 12.777777777777779, |
| "grad_norm": 0.04433482140302658, |
| "learning_rate": 0.00034826061323941484, |
| "loss": 0.2411, |
| "num_input_tokens_seen": 360288, |
| "step": 1150 |
| }, |
| { |
| "epoch": 12.833333333333334, |
| "grad_norm": 0.03444892540574074, |
| "learning_rate": 0.0003436483287156091, |
| "loss": 0.2358, |
| "num_input_tokens_seen": 361920, |
| "step": 1155 |
| }, |
| { |
| "epoch": 12.88888888888889, |
| "grad_norm": 0.0388001948595047, |
| "learning_rate": 0.000339050743904623, |
| "loss": 0.2316, |
| "num_input_tokens_seen": 363520, |
| "step": 1160 |
| }, |
| { |
| "epoch": 12.944444444444445, |
| "grad_norm": 0.04146473854780197, |
| "learning_rate": 0.000334468291057521, |
| "loss": 0.2319, |
| "num_input_tokens_seen": 365088, |
| "step": 1165 |
| }, |
| { |
| "epoch": 13.0, |
| "grad_norm": 0.016760699450969696, |
| "learning_rate": 0.00032990140100270637, |
| "loss": 0.2283, |
| "num_input_tokens_seen": 366688, |
| "step": 1170 |
| }, |
| { |
| "epoch": 13.0, |
| "eval_loss": 0.23198899626731873, |
| "eval_runtime": 0.8146, |
| "eval_samples_per_second": 49.103, |
| "eval_steps_per_second": 12.276, |
| "num_input_tokens_seen": 366688, |
| "step": 1170 |
| }, |
| { |
| "epoch": 13.055555555555555, |
| "grad_norm": 0.04751737415790558, |
| "learning_rate": 0.0003253505031054155, |
| "loss": 0.2336, |
| "num_input_tokens_seen": 368192, |
| "step": 1175 |
| }, |
| { |
| "epoch": 13.11111111111111, |
| "grad_norm": 0.018362808972597122, |
| "learning_rate": 0.00032081602522734986, |
| "loss": 0.2247, |
| "num_input_tokens_seen": 369792, |
| "step": 1180 |
| }, |
| { |
| "epoch": 13.166666666666666, |
| "grad_norm": 0.03477654606103897, |
| "learning_rate": 0.00031629839368645086, |
| "loss": 0.2294, |
| "num_input_tokens_seen": 371360, |
| "step": 1185 |
| }, |
| { |
| "epoch": 13.222222222222221, |
| "grad_norm": 0.07415346056222916, |
| "learning_rate": 0.0003117980332168179, |
| "loss": 0.2288, |
| "num_input_tokens_seen": 372928, |
| "step": 1190 |
| }, |
| { |
| "epoch": 13.277777777777779, |
| "grad_norm": 0.04998091980814934, |
| "learning_rate": 0.00030731536692877595, |
| "loss": 0.2358, |
| "num_input_tokens_seen": 374528, |
| "step": 1195 |
| }, |
| { |
| "epoch": 13.333333333333334, |
| "grad_norm": 0.0374421626329422, |
| "learning_rate": 0.0003028508162690967, |
| "loss": 0.2287, |
| "num_input_tokens_seen": 376000, |
| "step": 1200 |
| }, |
| { |
| "epoch": 13.38888888888889, |
| "grad_norm": 0.08486609905958176, |
| "learning_rate": 0.000298404800981375, |
| "loss": 0.23, |
| "num_input_tokens_seen": 377600, |
| "step": 1205 |
| }, |
| { |
| "epoch": 13.444444444444445, |
| "grad_norm": 0.031455185264348984, |
| "learning_rate": 0.0002939777390665658, |
| "loss": 0.2304, |
| "num_input_tokens_seen": 379200, |
| "step": 1210 |
| }, |
| { |
| "epoch": 13.5, |
| "grad_norm": 0.04608649015426636, |
| "learning_rate": 0.0002895700467436855, |
| "loss": 0.2258, |
| "num_input_tokens_seen": 380704, |
| "step": 1215 |
| }, |
| { |
| "epoch": 13.555555555555555, |
| "grad_norm": 0.078786201775074, |
| "learning_rate": 0.00028518213841067906, |
| "loss": 0.2228, |
| "num_input_tokens_seen": 382272, |
| "step": 1220 |
| }, |
| { |
| "epoch": 13.61111111111111, |
| "grad_norm": 0.046005699783563614, |
| "learning_rate": 0.00028081442660546124, |
| "loss": 0.2239, |
| "num_input_tokens_seen": 383776, |
| "step": 1225 |
| }, |
| { |
| "epoch": 13.666666666666666, |
| "grad_norm": 0.0584123358130455, |
| "learning_rate": 0.00027646732196712974, |
| "loss": 0.2389, |
| "num_input_tokens_seen": 385344, |
| "step": 1230 |
| }, |
| { |
| "epoch": 13.722222222222221, |
| "grad_norm": 0.046881191432476044, |
| "learning_rate": 0.00027214123319735785, |
| "loss": 0.2284, |
| "num_input_tokens_seen": 386912, |
| "step": 1235 |
| }, |
| { |
| "epoch": 13.777777777777779, |
| "grad_norm": 0.05353561043739319, |
| "learning_rate": 0.00026783656702197156, |
| "loss": 0.2259, |
| "num_input_tokens_seen": 388480, |
| "step": 1240 |
| }, |
| { |
| "epoch": 13.833333333333334, |
| "grad_norm": 0.05366063863039017, |
| "learning_rate": 0.00026355372815270835, |
| "loss": 0.2302, |
| "num_input_tokens_seen": 390048, |
| "step": 1245 |
| }, |
| { |
| "epoch": 13.88888888888889, |
| "grad_norm": 0.025699380785226822, |
| "learning_rate": 0.000259293119249168, |
| "loss": 0.2346, |
| "num_input_tokens_seen": 391584, |
| "step": 1250 |
| }, |
| { |
| "epoch": 13.944444444444445, |
| "grad_norm": 0.09284758567810059, |
| "learning_rate": 0.00025505514088095655, |
| "loss": 0.2374, |
| "num_input_tokens_seen": 393184, |
| "step": 1255 |
| }, |
| { |
| "epoch": 14.0, |
| "grad_norm": 0.03431705757975578, |
| "learning_rate": 0.0002508401914900249, |
| "loss": 0.2299, |
| "num_input_tokens_seen": 394752, |
| "step": 1260 |
| }, |
| { |
| "epoch": 14.0, |
| "eval_loss": 0.23503117263317108, |
| "eval_runtime": 0.8137, |
| "eval_samples_per_second": 49.157, |
| "eval_steps_per_second": 12.289, |
| "num_input_tokens_seen": 394752, |
| "step": 1260 |
| }, |
| { |
| "epoch": 14.055555555555555, |
| "grad_norm": 0.07217755913734436, |
| "learning_rate": 0.00024664866735320885, |
| "loss": 0.2287, |
| "num_input_tokens_seen": 396320, |
| "step": 1265 |
| }, |
| { |
| "epoch": 14.11111111111111, |
| "grad_norm": 0.03390711545944214, |
| "learning_rate": 0.00024248096254497287, |
| "loss": 0.2307, |
| "num_input_tokens_seen": 397920, |
| "step": 1270 |
| }, |
| { |
| "epoch": 14.166666666666666, |
| "grad_norm": 0.11280521005392075, |
| "learning_rate": 0.00023833746890035963, |
| "loss": 0.2273, |
| "num_input_tokens_seen": 399552, |
| "step": 1275 |
| }, |
| { |
| "epoch": 14.222222222222221, |
| "grad_norm": 0.06520415097475052, |
| "learning_rate": 0.0002342185759781511, |
| "loss": 0.2272, |
| "num_input_tokens_seen": 401120, |
| "step": 1280 |
| }, |
| { |
| "epoch": 14.277777777777779, |
| "grad_norm": 0.10993943363428116, |
| "learning_rate": 0.00023012467102424372, |
| "loss": 0.2273, |
| "num_input_tokens_seen": 402720, |
| "step": 1285 |
| }, |
| { |
| "epoch": 14.333333333333334, |
| "grad_norm": 0.03927115350961685, |
| "learning_rate": 0.00022605613893524008, |
| "loss": 0.2316, |
| "num_input_tokens_seen": 404320, |
| "step": 1290 |
| }, |
| { |
| "epoch": 14.38888888888889, |
| "grad_norm": 0.0648801177740097, |
| "learning_rate": 0.00022201336222226332, |
| "loss": 0.2286, |
| "num_input_tokens_seen": 405824, |
| "step": 1295 |
| }, |
| { |
| "epoch": 14.444444444444445, |
| "grad_norm": 0.08081918209791183, |
| "learning_rate": 0.0002179967209749929, |
| "loss": 0.2324, |
| "num_input_tokens_seen": 407360, |
| "step": 1300 |
| }, |
| { |
| "epoch": 14.5, |
| "grad_norm": 0.0945676863193512, |
| "learning_rate": 0.00021400659282593083, |
| "loss": 0.2218, |
| "num_input_tokens_seen": 408928, |
| "step": 1305 |
| }, |
| { |
| "epoch": 14.555555555555555, |
| "grad_norm": 0.05751676857471466, |
| "learning_rate": 0.0002100433529148979, |
| "loss": 0.2345, |
| "num_input_tokens_seen": 410496, |
| "step": 1310 |
| }, |
| { |
| "epoch": 14.61111111111111, |
| "grad_norm": 0.059024445712566376, |
| "learning_rate": 0.00020610737385376348, |
| "loss": 0.2312, |
| "num_input_tokens_seen": 412032, |
| "step": 1315 |
| }, |
| { |
| "epoch": 14.666666666666666, |
| "grad_norm": 0.05571332573890686, |
| "learning_rate": 0.00020219902569141402, |
| "loss": 0.2198, |
| "num_input_tokens_seen": 413536, |
| "step": 1320 |
| }, |
| { |
| "epoch": 14.722222222222221, |
| "grad_norm": 0.07701590657234192, |
| "learning_rate": 0.00019831867587896218, |
| "loss": 0.2254, |
| "num_input_tokens_seen": 415136, |
| "step": 1325 |
| }, |
| { |
| "epoch": 14.777777777777779, |
| "grad_norm": 0.13892430067062378, |
| "learning_rate": 0.0001944666892352001, |
| "loss": 0.2146, |
| "num_input_tokens_seen": 416704, |
| "step": 1330 |
| }, |
| { |
| "epoch": 14.833333333333334, |
| "grad_norm": 0.0762837678194046, |
| "learning_rate": 0.00019064342791230072, |
| "loss": 0.2589, |
| "num_input_tokens_seen": 418272, |
| "step": 1335 |
| }, |
| { |
| "epoch": 14.88888888888889, |
| "grad_norm": 0.055135730654001236, |
| "learning_rate": 0.00018684925136176834, |
| "loss": 0.2212, |
| "num_input_tokens_seen": 419808, |
| "step": 1340 |
| }, |
| { |
| "epoch": 14.944444444444445, |
| "grad_norm": 0.07139123976230621, |
| "learning_rate": 0.0001830845163006448, |
| "loss": 0.227, |
| "num_input_tokens_seen": 421344, |
| "step": 1345 |
| }, |
| { |
| "epoch": 15.0, |
| "grad_norm": 0.044596634805202484, |
| "learning_rate": 0.00017934957667797225, |
| "loss": 0.2301, |
| "num_input_tokens_seen": 422912, |
| "step": 1350 |
| }, |
| { |
| "epoch": 15.0, |
| "eval_loss": 0.23467722535133362, |
| "eval_runtime": 0.8152, |
| "eval_samples_per_second": 49.068, |
| "eval_steps_per_second": 12.267, |
| "num_input_tokens_seen": 422912, |
| "step": 1350 |
| }, |
| { |
| "epoch": 15.055555555555555, |
| "grad_norm": 0.07222557067871094, |
| "learning_rate": 0.000175644783641515, |
| "loss": 0.2299, |
| "num_input_tokens_seen": 424480, |
| "step": 1355 |
| }, |
| { |
| "epoch": 15.11111111111111, |
| "grad_norm": 0.10835325717926025, |
| "learning_rate": 0.00017197048550474643, |
| "loss": 0.2289, |
| "num_input_tokens_seen": 426048, |
| "step": 1360 |
| }, |
| { |
| "epoch": 15.166666666666666, |
| "grad_norm": 0.050258129835128784, |
| "learning_rate": 0.0001683270277141014, |
| "loss": 0.2294, |
| "num_input_tokens_seen": 427648, |
| "step": 1365 |
| }, |
| { |
| "epoch": 15.222222222222221, |
| "grad_norm": 0.07297654449939728, |
| "learning_rate": 0.00016471475281649818, |
| "loss": 0.2284, |
| "num_input_tokens_seen": 429216, |
| "step": 1370 |
| }, |
| { |
| "epoch": 15.277777777777779, |
| "grad_norm": 0.13614331185817719, |
| "learning_rate": 0.0001611340004271339, |
| "loss": 0.2362, |
| "num_input_tokens_seen": 430720, |
| "step": 1375 |
| }, |
| { |
| "epoch": 15.333333333333334, |
| "grad_norm": 0.05731874704360962, |
| "learning_rate": 0.0001575851071975541, |
| "loss": 0.2274, |
| "num_input_tokens_seen": 432320, |
| "step": 1380 |
| }, |
| { |
| "epoch": 15.38888888888889, |
| "grad_norm": 0.08178743720054626, |
| "learning_rate": 0.00015406840678400203, |
| "loss": 0.2222, |
| "num_input_tokens_seen": 433888, |
| "step": 1385 |
| }, |
| { |
| "epoch": 15.444444444444445, |
| "grad_norm": 0.055484700947999954, |
| "learning_rate": 0.00015058422981604997, |
| "loss": 0.2184, |
| "num_input_tokens_seen": 435456, |
| "step": 1390 |
| }, |
| { |
| "epoch": 15.5, |
| "grad_norm": 0.08678986132144928, |
| "learning_rate": 0.00014713290386551348, |
| "loss": 0.22, |
| "num_input_tokens_seen": 437024, |
| "step": 1395 |
| }, |
| { |
| "epoch": 15.555555555555555, |
| "grad_norm": 0.17864876985549927, |
| "learning_rate": 0.00014371475341565454, |
| "loss": 0.2566, |
| "num_input_tokens_seen": 438528, |
| "step": 1400 |
| }, |
| { |
| "epoch": 15.61111111111111, |
| "grad_norm": 0.08130117505788803, |
| "learning_rate": 0.00014033009983067452, |
| "loss": 0.2336, |
| "num_input_tokens_seen": 440096, |
| "step": 1405 |
| }, |
| { |
| "epoch": 15.666666666666666, |
| "grad_norm": 0.07935910671949387, |
| "learning_rate": 0.00013697926132550054, |
| "loss": 0.2329, |
| "num_input_tokens_seen": 441632, |
| "step": 1410 |
| }, |
| { |
| "epoch": 15.722222222222221, |
| "grad_norm": 0.08176440745592117, |
| "learning_rate": 0.0001336625529358682, |
| "loss": 0.2294, |
| "num_input_tokens_seen": 443136, |
| "step": 1415 |
| }, |
| { |
| "epoch": 15.777777777777779, |
| "grad_norm": 0.04429703205823898, |
| "learning_rate": 0.00013038028648870205, |
| "loss": 0.2289, |
| "num_input_tokens_seen": 444736, |
| "step": 1420 |
| }, |
| { |
| "epoch": 15.833333333333334, |
| "grad_norm": 0.04416218772530556, |
| "learning_rate": 0.0001271327705727991, |
| "loss": 0.2307, |
| "num_input_tokens_seen": 446336, |
| "step": 1425 |
| }, |
| { |
| "epoch": 15.88888888888889, |
| "grad_norm": 0.061331622302532196, |
| "learning_rate": 0.0001239203105098165, |
| "loss": 0.2248, |
| "num_input_tokens_seen": 447936, |
| "step": 1430 |
| }, |
| { |
| "epoch": 15.944444444444445, |
| "grad_norm": 0.12402181327342987, |
| "learning_rate": 0.00012074320832556557, |
| "loss": 0.2271, |
| "num_input_tokens_seen": 449440, |
| "step": 1435 |
| }, |
| { |
| "epoch": 16.0, |
| "grad_norm": 0.07083910703659058, |
| "learning_rate": 0.00011760176272161627, |
| "loss": 0.2232, |
| "num_input_tokens_seen": 451008, |
| "step": 1440 |
| }, |
| { |
| "epoch": 16.0, |
| "eval_loss": 0.23479977250099182, |
| "eval_runtime": 0.8141, |
| "eval_samples_per_second": 49.136, |
| "eval_steps_per_second": 12.284, |
| "num_input_tokens_seen": 451008, |
| "step": 1440 |
| }, |
| { |
| "epoch": 16.055555555555557, |
| "grad_norm": 0.11320801079273224, |
| "learning_rate": 0.00011449626904721472, |
| "loss": 0.2316, |
| "num_input_tokens_seen": 452576, |
| "step": 1445 |
| }, |
| { |
| "epoch": 16.11111111111111, |
| "grad_norm": 0.07996129244565964, |
| "learning_rate": 0.00011142701927151455, |
| "loss": 0.2261, |
| "num_input_tokens_seen": 454112, |
| "step": 1450 |
| }, |
| { |
| "epoch": 16.166666666666668, |
| "grad_norm": 0.07149044424295425, |
| "learning_rate": 0.00010839430195612793, |
| "loss": 0.2259, |
| "num_input_tokens_seen": 455680, |
| "step": 1455 |
| }, |
| { |
| "epoch": 16.22222222222222, |
| "grad_norm": 0.09157592058181763, |
| "learning_rate": 0.00010539840222799463, |
| "loss": 0.2243, |
| "num_input_tokens_seen": 457280, |
| "step": 1460 |
| }, |
| { |
| "epoch": 16.27777777777778, |
| "grad_norm": 0.05001309514045715, |
| "learning_rate": 0.00010243960175257604, |
| "loss": 0.2255, |
| "num_input_tokens_seen": 458848, |
| "step": 1465 |
| }, |
| { |
| "epoch": 16.333333333333332, |
| "grad_norm": 0.12440861761569977, |
| "learning_rate": 9.9518178707374e-05, |
| "loss": 0.2238, |
| "num_input_tokens_seen": 460416, |
| "step": 1470 |
| }, |
| { |
| "epoch": 16.38888888888889, |
| "grad_norm": 0.10305523872375488, |
| "learning_rate": 9.663440775577653e-05, |
| "loss": 0.2266, |
| "num_input_tokens_seen": 462048, |
| "step": 1475 |
| }, |
| { |
| "epoch": 16.444444444444443, |
| "grad_norm": 0.1269642859697342, |
| "learning_rate": 9.378856002123548e-05, |
| "loss": 0.218, |
| "num_input_tokens_seen": 463584, |
| "step": 1480 |
| }, |
| { |
| "epoch": 16.5, |
| "grad_norm": 0.09578834474086761, |
| "learning_rate": 9.098090306177625e-05, |
| "loss": 0.2233, |
| "num_input_tokens_seen": 465152, |
| "step": 1485 |
| }, |
| { |
| "epoch": 16.555555555555557, |
| "grad_norm": 0.1153278648853302, |
| "learning_rate": 8.821170084484247e-05, |
| "loss": 0.2287, |
| "num_input_tokens_seen": 466720, |
| "step": 1490 |
| }, |
| { |
| "epoch": 16.61111111111111, |
| "grad_norm": 0.08815108239650726, |
| "learning_rate": 8.548121372247918e-05, |
| "loss": 0.2262, |
| "num_input_tokens_seen": 468256, |
| "step": 1495 |
| }, |
| { |
| "epoch": 16.666666666666668, |
| "grad_norm": 0.11618749797344208, |
| "learning_rate": 8.278969840685458e-05, |
| "loss": 0.2108, |
| "num_input_tokens_seen": 469792, |
| "step": 1500 |
| }, |
| { |
| "epoch": 16.72222222222222, |
| "grad_norm": 0.13226836919784546, |
| "learning_rate": 8.013740794612512e-05, |
| "loss": 0.2431, |
| "num_input_tokens_seen": 471360, |
| "step": 1505 |
| }, |
| { |
| "epoch": 16.77777777777778, |
| "grad_norm": 0.14580771327018738, |
| "learning_rate": 7.752459170064491e-05, |
| "loss": 0.2221, |
| "num_input_tokens_seen": 472896, |
| "step": 1510 |
| }, |
| { |
| "epoch": 16.833333333333332, |
| "grad_norm": 0.13856364786624908, |
| "learning_rate": 7.4951495319521e-05, |
| "loss": 0.2229, |
| "num_input_tokens_seen": 474496, |
| "step": 1515 |
| }, |
| { |
| "epoch": 16.88888888888889, |
| "grad_norm": 0.16624583303928375, |
| "learning_rate": 7.241836071751879e-05, |
| "loss": 0.2221, |
| "num_input_tokens_seen": 476064, |
| "step": 1520 |
| }, |
| { |
| "epoch": 16.944444444444443, |
| "grad_norm": 0.1337256133556366, |
| "learning_rate": 6.992542605231739e-05, |
| "loss": 0.2251, |
| "num_input_tokens_seen": 477600, |
| "step": 1525 |
| }, |
| { |
| "epoch": 17.0, |
| "grad_norm": 0.08969806879758835, |
| "learning_rate": 6.747292570211916e-05, |
| "loss": 0.2318, |
| "num_input_tokens_seen": 479104, |
| "step": 1530 |
| }, |
| { |
| "epoch": 17.0, |
| "eval_loss": 0.24350161850452423, |
| "eval_runtime": 0.8143, |
| "eval_samples_per_second": 49.124, |
| "eval_steps_per_second": 12.281, |
| "num_input_tokens_seen": 479104, |
| "step": 1530 |
| }, |
| { |
| "epoch": 17.055555555555557, |
| "grad_norm": 0.08840487152338028, |
| "learning_rate": 6.506109024361429e-05, |
| "loss": 0.2191, |
| "num_input_tokens_seen": 480736, |
| "step": 1535 |
| }, |
| { |
| "epoch": 17.11111111111111, |
| "grad_norm": 0.16554512083530426, |
| "learning_rate": 6.269014643030213e-05, |
| "loss": 0.2099, |
| "num_input_tokens_seen": 482304, |
| "step": 1540 |
| }, |
| { |
| "epoch": 17.166666666666668, |
| "grad_norm": 0.10631433129310608, |
| "learning_rate": 6.0360317171172794e-05, |
| "loss": 0.2332, |
| "num_input_tokens_seen": 483872, |
| "step": 1545 |
| }, |
| { |
| "epoch": 17.22222222222222, |
| "grad_norm": 0.11119335144758224, |
| "learning_rate": 5.807182150975027e-05, |
| "loss": 0.2026, |
| "num_input_tokens_seen": 485408, |
| "step": 1550 |
| }, |
| { |
| "epoch": 17.27777777777778, |
| "grad_norm": 0.1245507299900055, |
| "learning_rate": 5.5824874603498056e-05, |
| "loss": 0.2208, |
| "num_input_tokens_seen": 486944, |
| "step": 1555 |
| }, |
| { |
| "epoch": 17.333333333333332, |
| "grad_norm": 0.1490304321050644, |
| "learning_rate": 5.361968770359071e-05, |
| "loss": 0.2278, |
| "num_input_tokens_seen": 488544, |
| "step": 1560 |
| }, |
| { |
| "epoch": 17.38888888888889, |
| "grad_norm": 0.13573351502418518, |
| "learning_rate": 5.145646813505339e-05, |
| "loss": 0.2334, |
| "num_input_tokens_seen": 490144, |
| "step": 1565 |
| }, |
| { |
| "epoch": 17.444444444444443, |
| "grad_norm": 0.10809604823589325, |
| "learning_rate": 4.933541927726887e-05, |
| "loss": 0.2179, |
| "num_input_tokens_seen": 491680, |
| "step": 1570 |
| }, |
| { |
| "epoch": 17.5, |
| "grad_norm": 0.2656223177909851, |
| "learning_rate": 4.725674054485712e-05, |
| "loss": 0.2309, |
| "num_input_tokens_seen": 493248, |
| "step": 1575 |
| }, |
| { |
| "epoch": 17.555555555555557, |
| "grad_norm": 0.14225415885448456, |
| "learning_rate": 4.522062736892635e-05, |
| "loss": 0.2227, |
| "num_input_tokens_seen": 494816, |
| "step": 1580 |
| }, |
| { |
| "epoch": 17.61111111111111, |
| "grad_norm": 0.11132286489009857, |
| "learning_rate": 4.322727117869951e-05, |
| "loss": 0.2249, |
| "num_input_tokens_seen": 496416, |
| "step": 1585 |
| }, |
| { |
| "epoch": 17.666666666666668, |
| "grad_norm": 0.20258449018001556, |
| "learning_rate": 4.127685938351694e-05, |
| "loss": 0.2353, |
| "num_input_tokens_seen": 497952, |
| "step": 1590 |
| }, |
| { |
| "epoch": 17.72222222222222, |
| "grad_norm": 0.18990765511989594, |
| "learning_rate": 3.936957535521624e-05, |
| "loss": 0.2128, |
| "num_input_tokens_seen": 499520, |
| "step": 1595 |
| }, |
| { |
| "epoch": 17.77777777777778, |
| "grad_norm": 0.21558478474617004, |
| "learning_rate": 3.750559841089196e-05, |
| "loss": 0.2246, |
| "num_input_tokens_seen": 501120, |
| "step": 1600 |
| }, |
| { |
| "epoch": 17.833333333333332, |
| "grad_norm": 0.21279899775981903, |
| "learning_rate": 3.56851037960379e-05, |
| "loss": 0.2251, |
| "num_input_tokens_seen": 502688, |
| "step": 1605 |
| }, |
| { |
| "epoch": 17.88888888888889, |
| "grad_norm": 0.19791842997074127, |
| "learning_rate": 3.3908262668069845e-05, |
| "loss": 0.219, |
| "num_input_tokens_seen": 504224, |
| "step": 1610 |
| }, |
| { |
| "epoch": 17.944444444444443, |
| "grad_norm": 0.22439955174922943, |
| "learning_rate": 3.217524208023431e-05, |
| "loss": 0.2137, |
| "num_input_tokens_seen": 505824, |
| "step": 1615 |
| }, |
| { |
| "epoch": 18.0, |
| "grad_norm": 0.2424042671918869, |
| "learning_rate": 3.048620496590304e-05, |
| "loss": 0.2203, |
| "num_input_tokens_seen": 507392, |
| "step": 1620 |
| }, |
| { |
| "epoch": 18.0, |
| "eval_loss": 0.24611559510231018, |
| "eval_runtime": 0.8158, |
| "eval_samples_per_second": 49.03, |
| "eval_steps_per_second": 12.257, |
| "num_input_tokens_seen": 507392, |
| "step": 1620 |
| }, |
| { |
| "epoch": 18.055555555555557, |
| "grad_norm": 0.1351904571056366, |
| "learning_rate": 2.884131012325386e-05, |
| "loss": 0.2125, |
| "num_input_tokens_seen": 508928, |
| "step": 1625 |
| }, |
| { |
| "epoch": 18.11111111111111, |
| "grad_norm": 0.2632122039794922, |
| "learning_rate": 2.724071220034158e-05, |
| "loss": 0.2346, |
| "num_input_tokens_seen": 510496, |
| "step": 1630 |
| }, |
| { |
| "epoch": 18.166666666666668, |
| "grad_norm": 0.22769327461719513, |
| "learning_rate": 2.5684561680557994e-05, |
| "loss": 0.2089, |
| "num_input_tokens_seen": 512032, |
| "step": 1635 |
| }, |
| { |
| "epoch": 18.22222222222222, |
| "grad_norm": 0.21476925909519196, |
| "learning_rate": 2.417300486848373e-05, |
| "loss": 0.2317, |
| "num_input_tokens_seen": 513504, |
| "step": 1640 |
| }, |
| { |
| "epoch": 18.27777777777778, |
| "grad_norm": 0.16752052307128906, |
| "learning_rate": 2.2706183876134045e-05, |
| "loss": 0.2126, |
| "num_input_tokens_seen": 515040, |
| "step": 1645 |
| }, |
| { |
| "epoch": 18.333333333333332, |
| "grad_norm": 0.32024410367012024, |
| "learning_rate": 2.1284236609596887e-05, |
| "loss": 0.2101, |
| "num_input_tokens_seen": 516640, |
| "step": 1650 |
| }, |
| { |
| "epoch": 18.38888888888889, |
| "grad_norm": 0.29665425419807434, |
| "learning_rate": 1.990729675606784e-05, |
| "loss": 0.209, |
| "num_input_tokens_seen": 518208, |
| "step": 1655 |
| }, |
| { |
| "epoch": 18.444444444444443, |
| "grad_norm": 0.24566873908042908, |
| "learning_rate": 1.8575493771281205e-05, |
| "loss": 0.2161, |
| "num_input_tokens_seen": 519776, |
| "step": 1660 |
| }, |
| { |
| "epoch": 18.5, |
| "grad_norm": 0.278112530708313, |
| "learning_rate": 1.728895286733906e-05, |
| "loss": 0.2034, |
| "num_input_tokens_seen": 521344, |
| "step": 1665 |
| }, |
| { |
| "epoch": 18.555555555555557, |
| "grad_norm": 0.611442506313324, |
| "learning_rate": 1.6047795000938782e-05, |
| "loss": 0.2009, |
| "num_input_tokens_seen": 522976, |
| "step": 1670 |
| }, |
| { |
| "epoch": 18.61111111111111, |
| "grad_norm": 0.32947102189064026, |
| "learning_rate": 1.4852136862001764e-05, |
| "loss": 0.2189, |
| "num_input_tokens_seen": 524544, |
| "step": 1675 |
| }, |
| { |
| "epoch": 18.666666666666668, |
| "grad_norm": 0.49668529629707336, |
| "learning_rate": 1.3702090862701855e-05, |
| "loss": 0.209, |
| "num_input_tokens_seen": 526112, |
| "step": 1680 |
| }, |
| { |
| "epoch": 18.72222222222222, |
| "grad_norm": 0.911090612411499, |
| "learning_rate": 1.2597765126897198e-05, |
| "loss": 0.2027, |
| "num_input_tokens_seen": 527712, |
| "step": 1685 |
| }, |
| { |
| "epoch": 18.77777777777778, |
| "grad_norm": 0.7248960137367249, |
| "learning_rate": 1.1539263479964535e-05, |
| "loss": 0.1916, |
| "num_input_tokens_seen": 529280, |
| "step": 1690 |
| }, |
| { |
| "epoch": 18.833333333333332, |
| "grad_norm": 0.508635938167572, |
| "learning_rate": 1.0526685439037842e-05, |
| "loss": 0.2805, |
| "num_input_tokens_seen": 530816, |
| "step": 1695 |
| }, |
| { |
| "epoch": 18.88888888888889, |
| "grad_norm": 0.5231343507766724, |
| "learning_rate": 9.560126203652263e-06, |
| "loss": 0.196, |
| "num_input_tokens_seen": 532448, |
| "step": 1700 |
| }, |
| { |
| "epoch": 18.944444444444443, |
| "grad_norm": 0.8021954298019409, |
| "learning_rate": 8.639676646793382e-06, |
| "loss": 0.2193, |
| "num_input_tokens_seen": 534016, |
| "step": 1705 |
| }, |
| { |
| "epoch": 19.0, |
| "grad_norm": 0.42625030875205994, |
| "learning_rate": 7.76542330635388e-06, |
| "loss": 0.2287, |
| "num_input_tokens_seen": 535584, |
| "step": 1710 |
| }, |
| { |
| "epoch": 19.0, |
| "eval_loss": 0.26315268874168396, |
| "eval_runtime": 0.819, |
| "eval_samples_per_second": 48.84, |
| "eval_steps_per_second": 12.21, |
| "num_input_tokens_seen": 535584, |
| "step": 1710 |
| }, |
| { |
| "epoch": 19.055555555555557, |
| "grad_norm": 0.2445189207792282, |
| "learning_rate": 6.9374483769975016e-06, |
| "loss": 0.2139, |
| "num_input_tokens_seen": 537152, |
| "step": 1715 |
| }, |
| { |
| "epoch": 19.11111111111111, |
| "grad_norm": 0.5057767033576965, |
| "learning_rate": 6.15582970243117e-06, |
| "loss": 0.2267, |
| "num_input_tokens_seen": 538720, |
| "step": 1720 |
| }, |
| { |
| "epoch": 19.166666666666668, |
| "grad_norm": 0.3208341896533966, |
| "learning_rate": 5.42064076808646e-06, |
| "loss": 0.1985, |
| "num_input_tokens_seen": 540256, |
| "step": 1725 |
| }, |
| { |
| "epoch": 19.22222222222222, |
| "grad_norm": 0.3510449528694153, |
| "learning_rate": 4.731950694210896e-06, |
| "loss": 0.2141, |
| "num_input_tokens_seen": 541824, |
| "step": 1730 |
| }, |
| { |
| "epoch": 19.27777777777778, |
| "grad_norm": 0.2291206270456314, |
| "learning_rate": 4.089824229369155e-06, |
| "loss": 0.2031, |
| "num_input_tokens_seen": 543424, |
| "step": 1735 |
| }, |
| { |
| "epoch": 19.333333333333332, |
| "grad_norm": 0.5885674953460693, |
| "learning_rate": 3.4943217443557664e-06, |
| "loss": 0.2143, |
| "num_input_tokens_seen": 545024, |
| "step": 1740 |
| }, |
| { |
| "epoch": 19.38888888888889, |
| "grad_norm": 0.4313626289367676, |
| "learning_rate": 2.9454992265193214e-06, |
| "loss": 0.2094, |
| "num_input_tokens_seen": 546560, |
| "step": 1745 |
| }, |
| { |
| "epoch": 19.444444444444443, |
| "grad_norm": 0.36777594685554504, |
| "learning_rate": 2.4434082744984598e-06, |
| "loss": 0.2101, |
| "num_input_tokens_seen": 548096, |
| "step": 1750 |
| }, |
| { |
| "epoch": 19.5, |
| "grad_norm": 0.4024662375450134, |
| "learning_rate": 1.9880960933710836e-06, |
| "loss": 0.1968, |
| "num_input_tokens_seen": 549664, |
| "step": 1755 |
| }, |
| { |
| "epoch": 19.555555555555557, |
| "grad_norm": 0.4908435344696045, |
| "learning_rate": 1.5796054902157964e-06, |
| "loss": 0.2048, |
| "num_input_tokens_seen": 551264, |
| "step": 1760 |
| }, |
| { |
| "epoch": 19.61111111111111, |
| "grad_norm": 0.3945781886577606, |
| "learning_rate": 1.2179748700879012e-06, |
| "loss": 0.2301, |
| "num_input_tokens_seen": 552800, |
| "step": 1765 |
| }, |
| { |
| "epoch": 19.666666666666668, |
| "grad_norm": 0.6240169405937195, |
| "learning_rate": 9.032382324080101e-07, |
| "loss": 0.1973, |
| "num_input_tokens_seen": 554336, |
| "step": 1770 |
| }, |
| { |
| "epoch": 19.72222222222222, |
| "grad_norm": 0.2913757264614105, |
| "learning_rate": 6.354251677661571e-07, |
| "loss": 0.2249, |
| "num_input_tokens_seen": 555872, |
| "step": 1775 |
| }, |
| { |
| "epoch": 19.77777777777778, |
| "grad_norm": 0.6081423759460449, |
| "learning_rate": 4.1456085513935646e-07, |
| "loss": 0.1961, |
| "num_input_tokens_seen": 557504, |
| "step": 1780 |
| }, |
| { |
| "epoch": 19.833333333333332, |
| "grad_norm": 0.2613461911678314, |
| "learning_rate": 2.4066605952444145e-07, |
| "loss": 0.2223, |
| "num_input_tokens_seen": 559040, |
| "step": 1785 |
| }, |
| { |
| "epoch": 19.88888888888889, |
| "grad_norm": 0.31355512142181396, |
| "learning_rate": 1.1375712998595855e-07, |
| "loss": 0.2048, |
| "num_input_tokens_seen": 560640, |
| "step": 1790 |
| }, |
| { |
| "epoch": 19.944444444444443, |
| "grad_norm": 0.3902145326137543, |
| "learning_rate": 3.384599811889766e-08, |
| "loss": 0.2315, |
| "num_input_tokens_seen": 562176, |
| "step": 1795 |
| }, |
| { |
| "epoch": 20.0, |
| "grad_norm": 0.2755837142467499, |
| "learning_rate": 9.40176926922387e-10, |
| "loss": 0.1829, |
| "num_input_tokens_seen": 563744, |
| "step": 1800 |
| }, |
| { |
| "epoch": 20.0, |
| "eval_loss": 0.25516048073768616, |
| "eval_runtime": 0.8152, |
| "eval_samples_per_second": 49.069, |
| "eval_steps_per_second": 12.267, |
| "num_input_tokens_seen": 563744, |
| "step": 1800 |
| }, |
| { |
| "epoch": 20.0, |
| "num_input_tokens_seen": 563744, |
| "step": 1800, |
| "total_flos": 2.538513752575181e+16, |
| "train_loss": 0.24851805749866698, |
| "train_runtime": 322.5049, |
| "train_samples_per_second": 22.325, |
| "train_steps_per_second": 5.581 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 1800, |
| "num_input_tokens_seen": 563744, |
| "num_train_epochs": 20, |
| "save_steps": 90, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.538513752575181e+16, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|