{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0005, "eval_steps": 500, "global_step": 1001, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005, "grad_norm": 412.5579833984375, "learning_rate": 2.0000000000000002e-07, "loss": 3.0887, "num_input_tokens_seen": 8388608, "step": 1 }, { "epoch": 0.001, "grad_norm": 443.02325439453125, "learning_rate": 4.0000000000000003e-07, "loss": 3.0742, "num_input_tokens_seen": 16777216, "step": 2 }, { "epoch": 0.0015, "grad_norm": 430.74072265625, "learning_rate": 6.000000000000001e-07, "loss": 3.3292, "num_input_tokens_seen": 25165824, "step": 3 }, { "epoch": 0.002, "grad_norm": 426.48583984375, "learning_rate": 8.000000000000001e-07, "loss": 3.2173, "num_input_tokens_seen": 33554432, "step": 4 }, { "epoch": 0.0025, "grad_norm": 445.1883850097656, "learning_rate": 1.0000000000000002e-06, "loss": 2.9962, "num_input_tokens_seen": 41943040, "step": 5 }, { "epoch": 0.003, "grad_norm": 311.1723937988281, "learning_rate": 1.2000000000000002e-06, "loss": 2.9596, "num_input_tokens_seen": 50331648, "step": 6 }, { "epoch": 0.0035, "grad_norm": 304.4527587890625, "learning_rate": 1.4000000000000001e-06, "loss": 2.8535, "num_input_tokens_seen": 58720256, "step": 7 }, { "epoch": 0.004, "grad_norm": 135.69871520996094, "learning_rate": 1.6000000000000001e-06, "loss": 2.2207, "num_input_tokens_seen": 67108864, "step": 8 }, { "epoch": 0.0045, "grad_norm": 141.8043975830078, "learning_rate": 1.8000000000000001e-06, "loss": 2.2832, "num_input_tokens_seen": 75497472, "step": 9 }, { "epoch": 0.005, "grad_norm": 97.46916961669922, "learning_rate": 2.0000000000000003e-06, "loss": 2.2242, "num_input_tokens_seen": 83886080, "step": 10 }, { "epoch": 0.0055, "grad_norm": 49.4164924621582, "learning_rate": 2.2e-06, "loss": 2.0308, "num_input_tokens_seen": 92274688, "step": 11 }, { "epoch": 0.006, "grad_norm": 42.89604187011719, "learning_rate": 2.4000000000000003e-06, "loss": 2.0429, "num_input_tokens_seen": 100663296, "step": 12 }, { "epoch": 0.0065, "grad_norm": 35.186466217041016, "learning_rate": 2.6e-06, "loss": 1.7657, "num_input_tokens_seen": 109051904, "step": 13 }, { "epoch": 0.007, "grad_norm": 27.82587242126465, "learning_rate": 2.8000000000000003e-06, "loss": 1.9782, "num_input_tokens_seen": 117440512, "step": 14 }, { "epoch": 0.0075, "grad_norm": 18.570419311523438, "learning_rate": 3e-06, "loss": 1.8374, "num_input_tokens_seen": 125829120, "step": 15 }, { "epoch": 0.008, "grad_norm": 14.944059371948242, "learning_rate": 3.2000000000000003e-06, "loss": 1.824, "num_input_tokens_seen": 134217728, "step": 16 }, { "epoch": 0.0085, "grad_norm": 17.627771377563477, "learning_rate": 3.4000000000000005e-06, "loss": 1.853, "num_input_tokens_seen": 142606336, "step": 17 }, { "epoch": 0.009, "grad_norm": 15.03504753112793, "learning_rate": 3.6000000000000003e-06, "loss": 1.8538, "num_input_tokens_seen": 150994944, "step": 18 }, { "epoch": 0.0095, "grad_norm": 10.141422271728516, "learning_rate": 3.8000000000000005e-06, "loss": 1.6118, "num_input_tokens_seen": 159383552, "step": 19 }, { "epoch": 0.01, "grad_norm": 7.695674419403076, "learning_rate": 4.000000000000001e-06, "loss": 1.6979, "num_input_tokens_seen": 167772160, "step": 20 }, { "epoch": 0.0105, "grad_norm": 5.906360626220703, "learning_rate": 4.2000000000000004e-06, "loss": 1.642, "num_input_tokens_seen": 176160768, "step": 21 }, { "epoch": 0.011, "grad_norm": 7.0395283699035645, "learning_rate": 4.4e-06, "loss": 1.7246, "num_input_tokens_seen": 184549376, "step": 22 }, { "epoch": 0.0115, "grad_norm": 2.8150055408477783, "learning_rate": 4.600000000000001e-06, "loss": 1.5182, "num_input_tokens_seen": 192937984, "step": 23 }, { "epoch": 0.012, "grad_norm": 2.834420680999756, "learning_rate": 4.800000000000001e-06, "loss": 1.6918, "num_input_tokens_seen": 201326592, "step": 24 }, { "epoch": 0.0125, "grad_norm": 2.087716817855835, "learning_rate": 5e-06, "loss": 1.6816, "num_input_tokens_seen": 209715200, "step": 25 }, { "epoch": 0.013, "grad_norm": 1.6437803506851196, "learning_rate": 5.2e-06, "loss": 1.4976, "num_input_tokens_seen": 218103808, "step": 26 }, { "epoch": 0.0135, "grad_norm": 1.8645679950714111, "learning_rate": 5.400000000000001e-06, "loss": 1.7057, "num_input_tokens_seen": 226492416, "step": 27 }, { "epoch": 0.014, "grad_norm": 2.494386911392212, "learning_rate": 5.600000000000001e-06, "loss": 1.634, "num_input_tokens_seen": 234881024, "step": 28 }, { "epoch": 0.0145, "grad_norm": 3.4246933460235596, "learning_rate": 5.8e-06, "loss": 1.5829, "num_input_tokens_seen": 243269632, "step": 29 }, { "epoch": 0.015, "grad_norm": 2.1508476734161377, "learning_rate": 6e-06, "loss": 1.6899, "num_input_tokens_seen": 251658240, "step": 30 }, { "epoch": 0.0155, "grad_norm": 2.865145444869995, "learning_rate": 6.200000000000001e-06, "loss": 1.6139, "num_input_tokens_seen": 260046848, "step": 31 }, { "epoch": 0.016, "grad_norm": 1.5092774629592896, "learning_rate": 6.4000000000000006e-06, "loss": 1.7102, "num_input_tokens_seen": 268435456, "step": 32 }, { "epoch": 0.0165, "grad_norm": 2.086214780807495, "learning_rate": 6.600000000000001e-06, "loss": 1.5729, "num_input_tokens_seen": 276824064, "step": 33 }, { "epoch": 0.017, "grad_norm": 3.169787883758545, "learning_rate": 6.800000000000001e-06, "loss": 1.6501, "num_input_tokens_seen": 285212672, "step": 34 }, { "epoch": 0.0175, "grad_norm": 2.5079052448272705, "learning_rate": 7e-06, "loss": 1.6801, "num_input_tokens_seen": 293601280, "step": 35 }, { "epoch": 0.018, "grad_norm": 1.7760392427444458, "learning_rate": 7.2000000000000005e-06, "loss": 1.608, "num_input_tokens_seen": 301989888, "step": 36 }, { "epoch": 0.0185, "grad_norm": 1.5771219730377197, "learning_rate": 7.4e-06, "loss": 1.5296, "num_input_tokens_seen": 310378496, "step": 37 }, { "epoch": 0.019, "grad_norm": 1.3967360258102417, "learning_rate": 7.600000000000001e-06, "loss": 1.4373, "num_input_tokens_seen": 318767104, "step": 38 }, { "epoch": 0.0195, "grad_norm": 1.2799304723739624, "learning_rate": 7.800000000000002e-06, "loss": 1.5938, "num_input_tokens_seen": 327155712, "step": 39 }, { "epoch": 0.02, "grad_norm": 0.9541518092155457, "learning_rate": 8.000000000000001e-06, "loss": 1.5984, "num_input_tokens_seen": 335544320, "step": 40 }, { "epoch": 0.0205, "grad_norm": 1.1842225790023804, "learning_rate": 8.2e-06, "loss": 1.6011, "num_input_tokens_seen": 343932928, "step": 41 }, { "epoch": 0.021, "grad_norm": 2.33016037940979, "learning_rate": 8.400000000000001e-06, "loss": 1.398, "num_input_tokens_seen": 352321536, "step": 42 }, { "epoch": 0.0215, "grad_norm": 1.1873712539672852, "learning_rate": 8.6e-06, "loss": 1.5905, "num_input_tokens_seen": 360710144, "step": 43 }, { "epoch": 0.022, "grad_norm": 1.2696932554244995, "learning_rate": 8.8e-06, "loss": 1.482, "num_input_tokens_seen": 369098752, "step": 44 }, { "epoch": 0.0225, "grad_norm": 1.1331920623779297, "learning_rate": 9e-06, "loss": 1.5517, "num_input_tokens_seen": 377487360, "step": 45 }, { "epoch": 0.023, "grad_norm": 0.8957023024559021, "learning_rate": 9.200000000000002e-06, "loss": 1.4977, "num_input_tokens_seen": 385875968, "step": 46 }, { "epoch": 0.0235, "grad_norm": 1.3905730247497559, "learning_rate": 9.4e-06, "loss": 1.5301, "num_input_tokens_seen": 394264576, "step": 47 }, { "epoch": 0.024, "grad_norm": 1.6883445978164673, "learning_rate": 9.600000000000001e-06, "loss": 1.5458, "num_input_tokens_seen": 402653184, "step": 48 }, { "epoch": 0.0245, "grad_norm": 1.2196577787399292, "learning_rate": 9.800000000000001e-06, "loss": 1.641, "num_input_tokens_seen": 411041792, "step": 49 }, { "epoch": 0.025, "grad_norm": 0.9018034934997559, "learning_rate": 1e-05, "loss": 1.5061, "num_input_tokens_seen": 419430400, "step": 50 }, { "epoch": 0.0255, "grad_norm": 1.5921214818954468, "learning_rate": 1.02e-05, "loss": 1.5479, "num_input_tokens_seen": 427819008, "step": 51 }, { "epoch": 0.026, "grad_norm": 1.1603463888168335, "learning_rate": 1.04e-05, "loss": 1.728, "num_input_tokens_seen": 436207616, "step": 52 }, { "epoch": 0.0265, "grad_norm": 1.0216683149337769, "learning_rate": 1.0600000000000002e-05, "loss": 1.5272, "num_input_tokens_seen": 444596224, "step": 53 }, { "epoch": 0.027, "grad_norm": 1.5849745273590088, "learning_rate": 1.0800000000000002e-05, "loss": 1.5133, "num_input_tokens_seen": 452984832, "step": 54 }, { "epoch": 0.0275, "grad_norm": 1.2114629745483398, "learning_rate": 1.1000000000000001e-05, "loss": 1.4469, "num_input_tokens_seen": 461373440, "step": 55 }, { "epoch": 0.028, "grad_norm": 1.6813267469406128, "learning_rate": 1.1200000000000001e-05, "loss": 1.4083, "num_input_tokens_seen": 469762048, "step": 56 }, { "epoch": 0.0285, "grad_norm": 1.2210255861282349, "learning_rate": 1.14e-05, "loss": 1.5488, "num_input_tokens_seen": 478150656, "step": 57 }, { "epoch": 0.029, "grad_norm": 1.0760395526885986, "learning_rate": 1.16e-05, "loss": 1.5792, "num_input_tokens_seen": 486539264, "step": 58 }, { "epoch": 0.0295, "grad_norm": 1.5512540340423584, "learning_rate": 1.18e-05, "loss": 1.424, "num_input_tokens_seen": 494927872, "step": 59 }, { "epoch": 0.03, "grad_norm": 1.0693914890289307, "learning_rate": 1.2e-05, "loss": 1.5086, "num_input_tokens_seen": 503316480, "step": 60 }, { "epoch": 0.0305, "grad_norm": 1.6010892391204834, "learning_rate": 1.22e-05, "loss": 1.4852, "num_input_tokens_seen": 511705088, "step": 61 }, { "epoch": 0.031, "grad_norm": 1.2374985218048096, "learning_rate": 1.2400000000000002e-05, "loss": 1.6066, "num_input_tokens_seen": 520093696, "step": 62 }, { "epoch": 0.0315, "grad_norm": 1.2361466884613037, "learning_rate": 1.2600000000000001e-05, "loss": 1.5206, "num_input_tokens_seen": 528482304, "step": 63 }, { "epoch": 0.032, "grad_norm": 1.2030378580093384, "learning_rate": 1.2800000000000001e-05, "loss": 1.6945, "num_input_tokens_seen": 536870912, "step": 64 }, { "epoch": 0.0325, "grad_norm": 1.1588894128799438, "learning_rate": 1.3000000000000001e-05, "loss": 1.5522, "num_input_tokens_seen": 545259520, "step": 65 }, { "epoch": 0.033, "grad_norm": 1.3740041255950928, "learning_rate": 1.3200000000000002e-05, "loss": 1.4767, "num_input_tokens_seen": 553648128, "step": 66 }, { "epoch": 0.0335, "grad_norm": 1.5017540454864502, "learning_rate": 1.3400000000000002e-05, "loss": 1.4603, "num_input_tokens_seen": 562036736, "step": 67 }, { "epoch": 0.034, "grad_norm": 1.2395786046981812, "learning_rate": 1.3600000000000002e-05, "loss": 1.4505, "num_input_tokens_seen": 570425344, "step": 68 }, { "epoch": 0.0345, "grad_norm": 1.7764869928359985, "learning_rate": 1.38e-05, "loss": 1.5503, "num_input_tokens_seen": 578813952, "step": 69 }, { "epoch": 0.035, "grad_norm": 1.9530940055847168, "learning_rate": 1.4e-05, "loss": 1.398, "num_input_tokens_seen": 587202560, "step": 70 }, { "epoch": 0.0355, "grad_norm": 1.3631656169891357, "learning_rate": 1.4200000000000001e-05, "loss": 1.5875, "num_input_tokens_seen": 595591168, "step": 71 }, { "epoch": 0.036, "grad_norm": 1.2517352104187012, "learning_rate": 1.4400000000000001e-05, "loss": 1.7108, "num_input_tokens_seen": 603979776, "step": 72 }, { "epoch": 0.0365, "grad_norm": 1.306931734085083, "learning_rate": 1.46e-05, "loss": 1.4414, "num_input_tokens_seen": 612368384, "step": 73 }, { "epoch": 0.037, "grad_norm": 1.4231802225112915, "learning_rate": 1.48e-05, "loss": 1.4402, "num_input_tokens_seen": 620756992, "step": 74 }, { "epoch": 0.0375, "grad_norm": 1.2013376951217651, "learning_rate": 1.5000000000000002e-05, "loss": 1.3087, "num_input_tokens_seen": 629145600, "step": 75 }, { "epoch": 0.038, "grad_norm": 1.208516001701355, "learning_rate": 1.5200000000000002e-05, "loss": 1.5853, "num_input_tokens_seen": 637534208, "step": 76 }, { "epoch": 0.0385, "grad_norm": 0.9534114003181458, "learning_rate": 1.54e-05, "loss": 1.5989, "num_input_tokens_seen": 645922816, "step": 77 }, { "epoch": 0.039, "grad_norm": 0.9337539076805115, "learning_rate": 1.5600000000000003e-05, "loss": 1.6234, "num_input_tokens_seen": 654311424, "step": 78 }, { "epoch": 0.0395, "grad_norm": 1.3310667276382446, "learning_rate": 1.58e-05, "loss": 1.532, "num_input_tokens_seen": 662700032, "step": 79 }, { "epoch": 0.04, "grad_norm": 1.5133103132247925, "learning_rate": 1.6000000000000003e-05, "loss": 1.6173, "num_input_tokens_seen": 671088640, "step": 80 }, { "epoch": 0.0405, "grad_norm": 0.9211172461509705, "learning_rate": 1.62e-05, "loss": 1.7171, "num_input_tokens_seen": 679477248, "step": 81 }, { "epoch": 0.041, "grad_norm": 1.3290948867797852, "learning_rate": 1.64e-05, "loss": 1.4691, "num_input_tokens_seen": 687865856, "step": 82 }, { "epoch": 0.0415, "grad_norm": 1.529194712638855, "learning_rate": 1.66e-05, "loss": 1.3846, "num_input_tokens_seen": 696254464, "step": 83 }, { "epoch": 0.042, "grad_norm": 1.8044465780258179, "learning_rate": 1.6800000000000002e-05, "loss": 1.6476, "num_input_tokens_seen": 704643072, "step": 84 }, { "epoch": 0.0425, "grad_norm": 1.1989140510559082, "learning_rate": 1.7e-05, "loss": 1.574, "num_input_tokens_seen": 713031680, "step": 85 }, { "epoch": 0.043, "grad_norm": 2.2860641479492188, "learning_rate": 1.72e-05, "loss": 1.4513, "num_input_tokens_seen": 721420288, "step": 86 }, { "epoch": 0.0435, "grad_norm": 1.4773674011230469, "learning_rate": 1.7400000000000003e-05, "loss": 1.5491, "num_input_tokens_seen": 729808896, "step": 87 }, { "epoch": 0.044, "grad_norm": 1.1364495754241943, "learning_rate": 1.76e-05, "loss": 1.7041, "num_input_tokens_seen": 738197504, "step": 88 }, { "epoch": 0.0445, "grad_norm": 1.465047001838684, "learning_rate": 1.7800000000000002e-05, "loss": 1.4433, "num_input_tokens_seen": 746586112, "step": 89 }, { "epoch": 0.045, "grad_norm": 1.4849730730056763, "learning_rate": 1.8e-05, "loss": 1.4223, "num_input_tokens_seen": 754974720, "step": 90 }, { "epoch": 0.0455, "grad_norm": 1.3632209300994873, "learning_rate": 1.8200000000000002e-05, "loss": 1.6045, "num_input_tokens_seen": 763363328, "step": 91 }, { "epoch": 0.046, "grad_norm": 1.773093819618225, "learning_rate": 1.8400000000000003e-05, "loss": 1.503, "num_input_tokens_seen": 771751936, "step": 92 }, { "epoch": 0.0465, "grad_norm": 1.0889328718185425, "learning_rate": 1.86e-05, "loss": 1.5306, "num_input_tokens_seen": 780140544, "step": 93 }, { "epoch": 0.047, "grad_norm": 1.1769163608551025, "learning_rate": 1.88e-05, "loss": 1.4637, "num_input_tokens_seen": 788529152, "step": 94 }, { "epoch": 0.0475, "grad_norm": 1.2278140783309937, "learning_rate": 1.9e-05, "loss": 1.4143, "num_input_tokens_seen": 796917760, "step": 95 }, { "epoch": 0.048, "grad_norm": 1.515855073928833, "learning_rate": 1.9200000000000003e-05, "loss": 1.4587, "num_input_tokens_seen": 805306368, "step": 96 }, { "epoch": 0.0485, "grad_norm": 1.4236325025558472, "learning_rate": 1.94e-05, "loss": 1.6362, "num_input_tokens_seen": 813694976, "step": 97 }, { "epoch": 0.049, "grad_norm": 0.9047524929046631, "learning_rate": 1.9600000000000002e-05, "loss": 1.5615, "num_input_tokens_seen": 822083584, "step": 98 }, { "epoch": 0.0495, "grad_norm": 1.0978785753250122, "learning_rate": 1.98e-05, "loss": 1.5942, "num_input_tokens_seen": 830472192, "step": 99 }, { "epoch": 0.05, "grad_norm": 1.447261929512024, "learning_rate": 2e-05, "loss": 1.4845, "num_input_tokens_seen": 838860800, "step": 100 }, { "epoch": 0.0505, "grad_norm": 1.5815658569335938, "learning_rate": 2.0200000000000003e-05, "loss": 1.5163, "num_input_tokens_seen": 847249408, "step": 101 }, { "epoch": 0.051, "grad_norm": 1.0602869987487793, "learning_rate": 2.04e-05, "loss": 1.5627, "num_input_tokens_seen": 855638016, "step": 102 }, { "epoch": 0.0515, "grad_norm": 2.6368749141693115, "learning_rate": 2.0600000000000003e-05, "loss": 1.4506, "num_input_tokens_seen": 864026624, "step": 103 }, { "epoch": 0.052, "grad_norm": 1.5542439222335815, "learning_rate": 2.08e-05, "loss": 1.5836, "num_input_tokens_seen": 872415232, "step": 104 }, { "epoch": 0.0525, "grad_norm": 1.80384361743927, "learning_rate": 2.1000000000000002e-05, "loss": 1.6419, "num_input_tokens_seen": 880803840, "step": 105 }, { "epoch": 0.053, "grad_norm": 1.621586561203003, "learning_rate": 2.1200000000000004e-05, "loss": 1.3437, "num_input_tokens_seen": 889192448, "step": 106 }, { "epoch": 0.0535, "grad_norm": 1.8528887033462524, "learning_rate": 2.1400000000000002e-05, "loss": 1.5, "num_input_tokens_seen": 897581056, "step": 107 }, { "epoch": 0.054, "grad_norm": 1.3142207860946655, "learning_rate": 2.1600000000000003e-05, "loss": 1.5708, "num_input_tokens_seen": 905969664, "step": 108 }, { "epoch": 0.0545, "grad_norm": 1.350847601890564, "learning_rate": 2.1800000000000005e-05, "loss": 1.5892, "num_input_tokens_seen": 914358272, "step": 109 }, { "epoch": 0.055, "grad_norm": 1.519858479499817, "learning_rate": 2.2000000000000003e-05, "loss": 1.5513, "num_input_tokens_seen": 922746880, "step": 110 }, { "epoch": 0.0555, "grad_norm": 1.6471163034439087, "learning_rate": 2.2200000000000004e-05, "loss": 1.5137, "num_input_tokens_seen": 931135488, "step": 111 }, { "epoch": 0.056, "grad_norm": 1.3542113304138184, "learning_rate": 2.2400000000000002e-05, "loss": 1.5368, "num_input_tokens_seen": 939524096, "step": 112 }, { "epoch": 0.0565, "grad_norm": 1.8961020708084106, "learning_rate": 2.26e-05, "loss": 1.4339, "num_input_tokens_seen": 947912704, "step": 113 }, { "epoch": 0.057, "grad_norm": 1.3555443286895752, "learning_rate": 2.28e-05, "loss": 1.4529, "num_input_tokens_seen": 956301312, "step": 114 }, { "epoch": 0.0575, "grad_norm": 0.978428840637207, "learning_rate": 2.3e-05, "loss": 1.7014, "num_input_tokens_seen": 964689920, "step": 115 }, { "epoch": 0.058, "grad_norm": 1.1933631896972656, "learning_rate": 2.32e-05, "loss": 1.4238, "num_input_tokens_seen": 973078528, "step": 116 }, { "epoch": 0.0585, "grad_norm": 1.160028100013733, "learning_rate": 2.34e-05, "loss": 1.7248, "num_input_tokens_seen": 981467136, "step": 117 }, { "epoch": 0.059, "grad_norm": 1.3412951231002808, "learning_rate": 2.36e-05, "loss": 1.5906, "num_input_tokens_seen": 989855744, "step": 118 }, { "epoch": 0.0595, "grad_norm": 1.5971200466156006, "learning_rate": 2.3800000000000003e-05, "loss": 1.4209, "num_input_tokens_seen": 998244352, "step": 119 }, { "epoch": 0.06, "grad_norm": 1.839640498161316, "learning_rate": 2.4e-05, "loss": 1.3599, "num_input_tokens_seen": 1006632960, "step": 120 }, { "epoch": 0.0605, "grad_norm": 1.384678602218628, "learning_rate": 2.4200000000000002e-05, "loss": 1.5075, "num_input_tokens_seen": 1015021568, "step": 121 }, { "epoch": 0.061, "grad_norm": 1.1456788778305054, "learning_rate": 2.44e-05, "loss": 1.7151, "num_input_tokens_seen": 1023410176, "step": 122 }, { "epoch": 0.0615, "grad_norm": 0.8662890791893005, "learning_rate": 2.46e-05, "loss": 1.5478, "num_input_tokens_seen": 1031798784, "step": 123 }, { "epoch": 0.062, "grad_norm": 1.442311406135559, "learning_rate": 2.4800000000000003e-05, "loss": 1.6116, "num_input_tokens_seen": 1040187392, "step": 124 }, { "epoch": 0.0625, "grad_norm": 1.0386899709701538, "learning_rate": 2.5e-05, "loss": 1.568, "num_input_tokens_seen": 1048576000, "step": 125 }, { "epoch": 0.063, "grad_norm": 1.5259876251220703, "learning_rate": 2.5200000000000003e-05, "loss": 1.5687, "num_input_tokens_seen": 1056964608, "step": 126 }, { "epoch": 0.0635, "grad_norm": 1.4336763620376587, "learning_rate": 2.54e-05, "loss": 1.6075, "num_input_tokens_seen": 1065353216, "step": 127 }, { "epoch": 0.064, "grad_norm": 1.2190485000610352, "learning_rate": 2.5600000000000002e-05, "loss": 1.4311, "num_input_tokens_seen": 1073741824, "step": 128 }, { "epoch": 0.0645, "grad_norm": 1.133845567703247, "learning_rate": 2.5800000000000004e-05, "loss": 1.4361, "num_input_tokens_seen": 1082130432, "step": 129 }, { "epoch": 0.065, "grad_norm": 1.3018219470977783, "learning_rate": 2.6000000000000002e-05, "loss": 1.6208, "num_input_tokens_seen": 1090519040, "step": 130 }, { "epoch": 0.0655, "grad_norm": 1.2958109378814697, "learning_rate": 2.6200000000000003e-05, "loss": 1.3764, "num_input_tokens_seen": 1098907648, "step": 131 }, { "epoch": 0.066, "grad_norm": 1.839138150215149, "learning_rate": 2.6400000000000005e-05, "loss": 1.3464, "num_input_tokens_seen": 1107296256, "step": 132 }, { "epoch": 0.0665, "grad_norm": 1.5408761501312256, "learning_rate": 2.6600000000000003e-05, "loss": 1.6076, "num_input_tokens_seen": 1115684864, "step": 133 }, { "epoch": 0.067, "grad_norm": 1.2489920854568481, "learning_rate": 2.6800000000000004e-05, "loss": 1.5839, "num_input_tokens_seen": 1124073472, "step": 134 }, { "epoch": 0.0675, "grad_norm": 1.24154794216156, "learning_rate": 2.7000000000000002e-05, "loss": 1.4888, "num_input_tokens_seen": 1132462080, "step": 135 }, { "epoch": 0.068, "grad_norm": 1.6921497583389282, "learning_rate": 2.7200000000000004e-05, "loss": 1.4236, "num_input_tokens_seen": 1140850688, "step": 136 }, { "epoch": 0.0685, "grad_norm": 1.4992108345031738, "learning_rate": 2.7400000000000005e-05, "loss": 1.58, "num_input_tokens_seen": 1149239296, "step": 137 }, { "epoch": 0.069, "grad_norm": 1.301435112953186, "learning_rate": 2.76e-05, "loss": 1.4688, "num_input_tokens_seen": 1157627904, "step": 138 }, { "epoch": 0.0695, "grad_norm": 2.358342170715332, "learning_rate": 2.78e-05, "loss": 1.6317, "num_input_tokens_seen": 1166016512, "step": 139 }, { "epoch": 0.07, "grad_norm": 1.2117111682891846, "learning_rate": 2.8e-05, "loss": 1.4887, "num_input_tokens_seen": 1174405120, "step": 140 }, { "epoch": 0.0705, "grad_norm": 2.953007936477661, "learning_rate": 2.82e-05, "loss": 1.4228, "num_input_tokens_seen": 1182793728, "step": 141 }, { "epoch": 0.071, "grad_norm": 1.7483651638031006, "learning_rate": 2.8400000000000003e-05, "loss": 1.4996, "num_input_tokens_seen": 1191182336, "step": 142 }, { "epoch": 0.0715, "grad_norm": 2.0823328495025635, "learning_rate": 2.86e-05, "loss": 1.5173, "num_input_tokens_seen": 1199570944, "step": 143 }, { "epoch": 0.072, "grad_norm": 1.5575473308563232, "learning_rate": 2.8800000000000002e-05, "loss": 1.6488, "num_input_tokens_seen": 1207959552, "step": 144 }, { "epoch": 0.0725, "grad_norm": 1.4316424131393433, "learning_rate": 2.9e-05, "loss": 1.5737, "num_input_tokens_seen": 1216348160, "step": 145 }, { "epoch": 0.073, "grad_norm": 1.232532024383545, "learning_rate": 2.92e-05, "loss": 1.524, "num_input_tokens_seen": 1224736768, "step": 146 }, { "epoch": 0.0735, "grad_norm": 1.520302653312683, "learning_rate": 2.9400000000000003e-05, "loss": 1.6555, "num_input_tokens_seen": 1233125376, "step": 147 }, { "epoch": 0.074, "grad_norm": 1.3599549531936646, "learning_rate": 2.96e-05, "loss": 1.4547, "num_input_tokens_seen": 1241513984, "step": 148 }, { "epoch": 0.0745, "grad_norm": 1.3655874729156494, "learning_rate": 2.9800000000000003e-05, "loss": 1.4444, "num_input_tokens_seen": 1249902592, "step": 149 }, { "epoch": 0.075, "grad_norm": 1.110838532447815, "learning_rate": 3.0000000000000004e-05, "loss": 1.5493, "num_input_tokens_seen": 1258291200, "step": 150 }, { "epoch": 0.0755, "grad_norm": 1.1049745082855225, "learning_rate": 3.0200000000000002e-05, "loss": 1.4894, "num_input_tokens_seen": 1266679808, "step": 151 }, { "epoch": 0.076, "grad_norm": 1.3869433403015137, "learning_rate": 3.0400000000000004e-05, "loss": 1.5701, "num_input_tokens_seen": 1275068416, "step": 152 }, { "epoch": 0.0765, "grad_norm": 0.9968905448913574, "learning_rate": 3.0600000000000005e-05, "loss": 1.4069, "num_input_tokens_seen": 1283457024, "step": 153 }, { "epoch": 0.077, "grad_norm": 1.2496414184570312, "learning_rate": 3.08e-05, "loss": 1.6171, "num_input_tokens_seen": 1291845632, "step": 154 }, { "epoch": 0.0775, "grad_norm": 1.2284398078918457, "learning_rate": 3.1e-05, "loss": 1.3802, "num_input_tokens_seen": 1300234240, "step": 155 }, { "epoch": 0.078, "grad_norm": 6.807265281677246, "learning_rate": 3.1200000000000006e-05, "loss": 1.4788, "num_input_tokens_seen": 1308622848, "step": 156 }, { "epoch": 0.0785, "grad_norm": 2.8714168071746826, "learning_rate": 3.1400000000000004e-05, "loss": 1.4898, "num_input_tokens_seen": 1317011456, "step": 157 }, { "epoch": 0.079, "grad_norm": 1.6808459758758545, "learning_rate": 3.16e-05, "loss": 1.5658, "num_input_tokens_seen": 1325400064, "step": 158 }, { "epoch": 0.0795, "grad_norm": 1.448957085609436, "learning_rate": 3.180000000000001e-05, "loss": 1.6529, "num_input_tokens_seen": 1333788672, "step": 159 }, { "epoch": 0.08, "grad_norm": 1.4101011753082275, "learning_rate": 3.2000000000000005e-05, "loss": 1.6686, "num_input_tokens_seen": 1342177280, "step": 160 }, { "epoch": 0.0805, "grad_norm": 1.8087806701660156, "learning_rate": 3.2200000000000003e-05, "loss": 1.3667, "num_input_tokens_seen": 1350565888, "step": 161 }, { "epoch": 0.081, "grad_norm": 1.772939920425415, "learning_rate": 3.24e-05, "loss": 1.4576, "num_input_tokens_seen": 1358954496, "step": 162 }, { "epoch": 0.0815, "grad_norm": 1.165248155593872, "learning_rate": 3.26e-05, "loss": 1.5332, "num_input_tokens_seen": 1367343104, "step": 163 }, { "epoch": 0.082, "grad_norm": 1.6000652313232422, "learning_rate": 3.28e-05, "loss": 1.2258, "num_input_tokens_seen": 1375731712, "step": 164 }, { "epoch": 0.0825, "grad_norm": 2.204563617706299, "learning_rate": 3.3e-05, "loss": 1.5246, "num_input_tokens_seen": 1384120320, "step": 165 }, { "epoch": 0.083, "grad_norm": 1.367166519165039, "learning_rate": 3.32e-05, "loss": 1.4654, "num_input_tokens_seen": 1392508928, "step": 166 }, { "epoch": 0.0835, "grad_norm": 1.8968291282653809, "learning_rate": 3.34e-05, "loss": 1.7745, "num_input_tokens_seen": 1400897536, "step": 167 }, { "epoch": 0.084, "grad_norm": 1.4040907621383667, "learning_rate": 3.3600000000000004e-05, "loss": 1.6478, "num_input_tokens_seen": 1409286144, "step": 168 }, { "epoch": 0.0845, "grad_norm": 1.2746425867080688, "learning_rate": 3.38e-05, "loss": 1.5, "num_input_tokens_seen": 1417674752, "step": 169 }, { "epoch": 0.085, "grad_norm": 1.3624781370162964, "learning_rate": 3.4e-05, "loss": 1.536, "num_input_tokens_seen": 1426063360, "step": 170 }, { "epoch": 0.0855, "grad_norm": 1.2191846370697021, "learning_rate": 3.4200000000000005e-05, "loss": 1.3079, "num_input_tokens_seen": 1434451968, "step": 171 }, { "epoch": 0.086, "grad_norm": 1.5852552652359009, "learning_rate": 3.44e-05, "loss": 1.4818, "num_input_tokens_seen": 1442840576, "step": 172 }, { "epoch": 0.0865, "grad_norm": 1.594234585762024, "learning_rate": 3.46e-05, "loss": 1.4661, "num_input_tokens_seen": 1451229184, "step": 173 }, { "epoch": 0.087, "grad_norm": 1.589820384979248, "learning_rate": 3.4800000000000006e-05, "loss": 1.4564, "num_input_tokens_seen": 1459617792, "step": 174 }, { "epoch": 0.0875, "grad_norm": 1.5420101881027222, "learning_rate": 3.5000000000000004e-05, "loss": 1.5041, "num_input_tokens_seen": 1468006400, "step": 175 }, { "epoch": 0.088, "grad_norm": 1.1910130977630615, "learning_rate": 3.52e-05, "loss": 1.5391, "num_input_tokens_seen": 1476395008, "step": 176 }, { "epoch": 0.0885, "grad_norm": 1.1627064943313599, "learning_rate": 3.54e-05, "loss": 1.4564, "num_input_tokens_seen": 1484783616, "step": 177 }, { "epoch": 0.089, "grad_norm": 1.2218005657196045, "learning_rate": 3.5600000000000005e-05, "loss": 1.5223, "num_input_tokens_seen": 1493172224, "step": 178 }, { "epoch": 0.0895, "grad_norm": 1.0689893960952759, "learning_rate": 3.58e-05, "loss": 1.4585, "num_input_tokens_seen": 1501560832, "step": 179 }, { "epoch": 0.09, "grad_norm": 1.0907436609268188, "learning_rate": 3.6e-05, "loss": 1.5093, "num_input_tokens_seen": 1509949440, "step": 180 }, { "epoch": 0.0905, "grad_norm": 1.4522995948791504, "learning_rate": 3.6200000000000006e-05, "loss": 1.4059, "num_input_tokens_seen": 1518338048, "step": 181 }, { "epoch": 0.091, "grad_norm": 1.4474709033966064, "learning_rate": 3.6400000000000004e-05, "loss": 1.5482, "num_input_tokens_seen": 1526726656, "step": 182 }, { "epoch": 0.0915, "grad_norm": 1.640328049659729, "learning_rate": 3.66e-05, "loss": 1.421, "num_input_tokens_seen": 1535115264, "step": 183 }, { "epoch": 0.092, "grad_norm": 1.1758472919464111, "learning_rate": 3.680000000000001e-05, "loss": 1.5539, "num_input_tokens_seen": 1543503872, "step": 184 }, { "epoch": 0.0925, "grad_norm": 1.4281258583068848, "learning_rate": 3.7000000000000005e-05, "loss": 1.4736, "num_input_tokens_seen": 1551892480, "step": 185 }, { "epoch": 0.093, "grad_norm": 1.530848503112793, "learning_rate": 3.72e-05, "loss": 1.5419, "num_input_tokens_seen": 1560281088, "step": 186 }, { "epoch": 0.0935, "grad_norm": 1.1989493370056152, "learning_rate": 3.740000000000001e-05, "loss": 1.5677, "num_input_tokens_seen": 1568669696, "step": 187 }, { "epoch": 0.094, "grad_norm": 1.709241509437561, "learning_rate": 3.76e-05, "loss": 1.5661, "num_input_tokens_seen": 1577058304, "step": 188 }, { "epoch": 0.0945, "grad_norm": 1.2679917812347412, "learning_rate": 3.7800000000000004e-05, "loss": 1.5265, "num_input_tokens_seen": 1585446912, "step": 189 }, { "epoch": 0.095, "grad_norm": 1.252685546875, "learning_rate": 3.8e-05, "loss": 1.4125, "num_input_tokens_seen": 1593835520, "step": 190 }, { "epoch": 0.0955, "grad_norm": 1.1314305067062378, "learning_rate": 3.82e-05, "loss": 1.5629, "num_input_tokens_seen": 1602224128, "step": 191 }, { "epoch": 0.096, "grad_norm": 1.3145653009414673, "learning_rate": 3.8400000000000005e-05, "loss": 1.4342, "num_input_tokens_seen": 1610612736, "step": 192 }, { "epoch": 0.0965, "grad_norm": 1.9945522546768188, "learning_rate": 3.86e-05, "loss": 1.6386, "num_input_tokens_seen": 1619001344, "step": 193 }, { "epoch": 0.097, "grad_norm": 1.0219146013259888, "learning_rate": 3.88e-05, "loss": 1.5971, "num_input_tokens_seen": 1627389952, "step": 194 }, { "epoch": 0.0975, "grad_norm": 1.8139615058898926, "learning_rate": 3.9e-05, "loss": 1.6514, "num_input_tokens_seen": 1635778560, "step": 195 }, { "epoch": 0.098, "grad_norm": 1.1766538619995117, "learning_rate": 3.9200000000000004e-05, "loss": 1.4922, "num_input_tokens_seen": 1644167168, "step": 196 }, { "epoch": 0.0985, "grad_norm": 1.7942572832107544, "learning_rate": 3.94e-05, "loss": 1.4603, "num_input_tokens_seen": 1652555776, "step": 197 }, { "epoch": 0.099, "grad_norm": 1.5922130346298218, "learning_rate": 3.96e-05, "loss": 1.4355, "num_input_tokens_seen": 1660944384, "step": 198 }, { "epoch": 0.0995, "grad_norm": 1.207627773284912, "learning_rate": 3.9800000000000005e-05, "loss": 1.4737, "num_input_tokens_seen": 1669332992, "step": 199 }, { "epoch": 0.1, "grad_norm": 1.5598710775375366, "learning_rate": 4e-05, "loss": 1.5503, "num_input_tokens_seen": 1677721600, "step": 200 }, { "epoch": 0.1005, "grad_norm": 1.0401601791381836, "learning_rate": 3.9999845787629415e-05, "loss": 1.4716, "num_input_tokens_seen": 1686110208, "step": 201 }, { "epoch": 0.101, "grad_norm": 1.5625003576278687, "learning_rate": 3.99993831528958e-05, "loss": 1.5045, "num_input_tokens_seen": 1694498816, "step": 202 }, { "epoch": 0.1015, "grad_norm": 0.9797676205635071, "learning_rate": 3.9998612102933544e-05, "loss": 1.5389, "num_input_tokens_seen": 1702887424, "step": 203 }, { "epoch": 0.102, "grad_norm": 1.2792385816574097, "learning_rate": 3.999753264963321e-05, "loss": 1.5862, "num_input_tokens_seen": 1711276032, "step": 204 }, { "epoch": 0.1025, "grad_norm": 1.156701683998108, "learning_rate": 3.9996144809641296e-05, "loss": 1.5397, "num_input_tokens_seen": 1719664640, "step": 205 }, { "epoch": 0.103, "grad_norm": 1.1689105033874512, "learning_rate": 3.9994448604360016e-05, "loss": 1.5393, "num_input_tokens_seen": 1728053248, "step": 206 }, { "epoch": 0.1035, "grad_norm": 1.3956072330474854, "learning_rate": 3.999244405994694e-05, "loss": 1.4317, "num_input_tokens_seen": 1736441856, "step": 207 }, { "epoch": 0.104, "grad_norm": 1.883193016052246, "learning_rate": 3.9990131207314634e-05, "loss": 1.4605, "num_input_tokens_seen": 1744830464, "step": 208 }, { "epoch": 0.1045, "grad_norm": 1.210907220840454, "learning_rate": 3.998751008213014e-05, "loss": 1.415, "num_input_tokens_seen": 1753219072, "step": 209 }, { "epoch": 0.105, "grad_norm": 1.2669427394866943, "learning_rate": 3.9984580724814464e-05, "loss": 1.3665, "num_input_tokens_seen": 1761607680, "step": 210 }, { "epoch": 0.1055, "grad_norm": 1.8047226667404175, "learning_rate": 3.99813431805419e-05, "loss": 1.4922, "num_input_tokens_seen": 1769996288, "step": 211 }, { "epoch": 0.106, "grad_norm": 1.2756640911102295, "learning_rate": 3.9977797499239404e-05, "loss": 1.5212, "num_input_tokens_seen": 1778384896, "step": 212 }, { "epoch": 0.1065, "grad_norm": 1.0521904230117798, "learning_rate": 3.997394373558576e-05, "loss": 1.5597, "num_input_tokens_seen": 1786773504, "step": 213 }, { "epoch": 0.107, "grad_norm": 1.2665338516235352, "learning_rate": 3.996978194901077e-05, "loss": 1.6852, "num_input_tokens_seen": 1795162112, "step": 214 }, { "epoch": 0.1075, "grad_norm": 1.0749468803405762, "learning_rate": 3.996531220369432e-05, "loss": 1.4212, "num_input_tokens_seen": 1803550720, "step": 215 }, { "epoch": 0.108, "grad_norm": 1.3273717164993286, "learning_rate": 3.9960534568565436e-05, "loss": 1.4239, "num_input_tokens_seen": 1811939328, "step": 216 }, { "epoch": 0.1085, "grad_norm": 1.1981056928634644, "learning_rate": 3.995544911730115e-05, "loss": 1.5102, "num_input_tokens_seen": 1820327936, "step": 217 }, { "epoch": 0.109, "grad_norm": 1.1900936365127563, "learning_rate": 3.995005592832541e-05, "loss": 1.5633, "num_input_tokens_seen": 1828716544, "step": 218 }, { "epoch": 0.1095, "grad_norm": 0.9460403323173523, "learning_rate": 3.994435508480786e-05, "loss": 1.5169, "num_input_tokens_seen": 1837105152, "step": 219 }, { "epoch": 0.11, "grad_norm": 1.7271934747695923, "learning_rate": 3.9938346674662565e-05, "loss": 1.642, "num_input_tokens_seen": 1845493760, "step": 220 }, { "epoch": 0.1105, "grad_norm": 1.1056196689605713, "learning_rate": 3.9932030790546636e-05, "loss": 1.4402, "num_input_tokens_seen": 1853882368, "step": 221 }, { "epoch": 0.111, "grad_norm": 1.5984708070755005, "learning_rate": 3.9925407529858826e-05, "loss": 1.4715, "num_input_tokens_seen": 1862270976, "step": 222 }, { "epoch": 0.1115, "grad_norm": 1.494849443435669, "learning_rate": 3.991847699473801e-05, "loss": 1.4581, "num_input_tokens_seen": 1870659584, "step": 223 }, { "epoch": 0.112, "grad_norm": 1.2106796503067017, "learning_rate": 3.99112392920616e-05, "loss": 1.484, "num_input_tokens_seen": 1879048192, "step": 224 }, { "epoch": 0.1125, "grad_norm": 1.198693871498108, "learning_rate": 3.990369453344394e-05, "loss": 1.6248, "num_input_tokens_seen": 1887436800, "step": 225 }, { "epoch": 0.113, "grad_norm": 1.261634349822998, "learning_rate": 3.989584283523453e-05, "loss": 1.531, "num_input_tokens_seen": 1895825408, "step": 226 }, { "epoch": 0.1135, "grad_norm": 1.6074720621109009, "learning_rate": 3.988768431851628e-05, "loss": 1.3933, "num_input_tokens_seen": 1904214016, "step": 227 }, { "epoch": 0.114, "grad_norm": 1.302821159362793, "learning_rate": 3.98792191091036e-05, "loss": 1.3738, "num_input_tokens_seen": 1912602624, "step": 228 }, { "epoch": 0.1145, "grad_norm": 1.083345651626587, "learning_rate": 3.987044733754049e-05, "loss": 1.6179, "num_input_tokens_seen": 1920991232, "step": 229 }, { "epoch": 0.115, "grad_norm": 0.9179165959358215, "learning_rate": 3.986136913909853e-05, "loss": 1.6625, "num_input_tokens_seen": 1929379840, "step": 230 }, { "epoch": 0.1155, "grad_norm": 1.0027072429656982, "learning_rate": 3.985198465377476e-05, "loss": 1.2675, "num_input_tokens_seen": 1937768448, "step": 231 }, { "epoch": 0.116, "grad_norm": 1.3542875051498413, "learning_rate": 3.9842294026289565e-05, "loss": 1.5128, "num_input_tokens_seen": 1946157056, "step": 232 }, { "epoch": 0.1165, "grad_norm": 1.1843351125717163, "learning_rate": 3.9832297406084386e-05, "loss": 1.5214, "num_input_tokens_seen": 1954545664, "step": 233 }, { "epoch": 0.117, "grad_norm": 1.3169660568237305, "learning_rate": 3.98219949473195e-05, "loss": 1.6508, "num_input_tokens_seen": 1962934272, "step": 234 }, { "epoch": 0.1175, "grad_norm": 1.350449800491333, "learning_rate": 3.981138680887154e-05, "loss": 1.6205, "num_input_tokens_seen": 1971322880, "step": 235 }, { "epoch": 0.118, "grad_norm": 1.2358680963516235, "learning_rate": 3.980047315433116e-05, "loss": 1.4588, "num_input_tokens_seen": 1979711488, "step": 236 }, { "epoch": 0.1185, "grad_norm": 1.2926589250564575, "learning_rate": 3.978925415200037e-05, "loss": 1.3785, "num_input_tokens_seen": 1988100096, "step": 237 }, { "epoch": 0.119, "grad_norm": 1.7286139726638794, "learning_rate": 3.97777299748901e-05, "loss": 1.5533, "num_input_tokens_seen": 1996488704, "step": 238 }, { "epoch": 0.1195, "grad_norm": 1.0642439126968384, "learning_rate": 3.976590080071739e-05, "loss": 1.4134, "num_input_tokens_seen": 2004877312, "step": 239 }, { "epoch": 0.12, "grad_norm": 1.1486881971359253, "learning_rate": 3.9753766811902756e-05, "loss": 1.4183, "num_input_tokens_seen": 2013265920, "step": 240 }, { "epoch": 0.1205, "grad_norm": 1.4171710014343262, "learning_rate": 3.974132819556731e-05, "loss": 1.4274, "num_input_tokens_seen": 2021654528, "step": 241 }, { "epoch": 0.121, "grad_norm": 1.0526764392852783, "learning_rate": 3.972858514352991e-05, "loss": 1.6306, "num_input_tokens_seen": 2030043136, "step": 242 }, { "epoch": 0.1215, "grad_norm": 1.1677898168563843, "learning_rate": 3.971553785230418e-05, "loss": 1.5371, "num_input_tokens_seen": 2038431744, "step": 243 }, { "epoch": 0.122, "grad_norm": 1.0388144254684448, "learning_rate": 3.970218652309548e-05, "loss": 1.4155, "num_input_tokens_seen": 2046820352, "step": 244 }, { "epoch": 0.1225, "grad_norm": 6.161566257476807, "learning_rate": 3.9688531361797834e-05, "loss": 1.4207, "num_input_tokens_seen": 2055208960, "step": 245 }, { "epoch": 0.123, "grad_norm": 2.01348614692688, "learning_rate": 3.9674572578990724e-05, "loss": 1.4573, "num_input_tokens_seen": 2063597568, "step": 246 }, { "epoch": 0.1235, "grad_norm": 1.6936241388320923, "learning_rate": 3.9660310389935837e-05, "loss": 1.5489, "num_input_tokens_seen": 2071986176, "step": 247 }, { "epoch": 0.124, "grad_norm": 1.1356407403945923, "learning_rate": 3.964574501457378e-05, "loss": 1.6481, "num_input_tokens_seen": 2080374784, "step": 248 }, { "epoch": 0.1245, "grad_norm": 1.4091911315917969, "learning_rate": 3.9630876677520656e-05, "loss": 1.514, "num_input_tokens_seen": 2088763392, "step": 249 }, { "epoch": 0.125, "grad_norm": 1.2380797863006592, "learning_rate": 3.961570560806461e-05, "loss": 1.3364, "num_input_tokens_seen": 2097152000, "step": 250 }, { "epoch": 0.1255, "grad_norm": 1.576047658920288, "learning_rate": 3.960023204016231e-05, "loss": 1.6203, "num_input_tokens_seen": 2105540608, "step": 251 }, { "epoch": 0.126, "grad_norm": 1.013575792312622, "learning_rate": 3.958445621243532e-05, "loss": 1.5956, "num_input_tokens_seen": 2113929216, "step": 252 }, { "epoch": 0.1265, "grad_norm": 1.075700044631958, "learning_rate": 3.9568378368166406e-05, "loss": 1.4344, "num_input_tokens_seen": 2122317824, "step": 253 }, { "epoch": 0.127, "grad_norm": 1.365511417388916, "learning_rate": 3.955199875529582e-05, "loss": 1.5382, "num_input_tokens_seen": 2130706432, "step": 254 }, { "epoch": 0.1275, "grad_norm": 1.1434495449066162, "learning_rate": 3.953531762641745e-05, "loss": 1.48, "num_input_tokens_seen": 2139095040, "step": 255 }, { "epoch": 0.128, "grad_norm": 1.03788161277771, "learning_rate": 3.951833523877495e-05, "loss": 1.5976, "num_input_tokens_seen": 2147483648, "step": 256 }, { "epoch": 0.1285, "grad_norm": 1.0065782070159912, "learning_rate": 3.9501051854257745e-05, "loss": 1.532, "num_input_tokens_seen": 2155872256, "step": 257 }, { "epoch": 0.129, "grad_norm": 4.652547359466553, "learning_rate": 3.948346773939699e-05, "loss": 1.4492, "num_input_tokens_seen": 2164260864, "step": 258 }, { "epoch": 0.1295, "grad_norm": 1.5066403150558472, "learning_rate": 3.94655831653615e-05, "loss": 1.5709, "num_input_tokens_seen": 2172649472, "step": 259 }, { "epoch": 0.13, "grad_norm": 1.4068509340286255, "learning_rate": 3.9447398407953536e-05, "loss": 1.6171, "num_input_tokens_seen": 2181038080, "step": 260 }, { "epoch": 0.1305, "grad_norm": 1.3719857931137085, "learning_rate": 3.942891374760455e-05, "loss": 1.4291, "num_input_tokens_seen": 2189426688, "step": 261 }, { "epoch": 0.131, "grad_norm": 1.5399225950241089, "learning_rate": 3.941012946937085e-05, "loss": 1.4308, "num_input_tokens_seen": 2197815296, "step": 262 }, { "epoch": 0.1315, "grad_norm": 1.1638039350509644, "learning_rate": 3.9391045862929275e-05, "loss": 1.5893, "num_input_tokens_seen": 2206203904, "step": 263 }, { "epoch": 0.132, "grad_norm": 1.2434509992599487, "learning_rate": 3.9371663222572625e-05, "loss": 1.442, "num_input_tokens_seen": 2214592512, "step": 264 }, { "epoch": 0.1325, "grad_norm": 1.0376842021942139, "learning_rate": 3.93519818472052e-05, "loss": 1.5357, "num_input_tokens_seen": 2222981120, "step": 265 }, { "epoch": 0.133, "grad_norm": 0.8054550886154175, "learning_rate": 3.933200204033815e-05, "loss": 1.5604, "num_input_tokens_seen": 2231369728, "step": 266 }, { "epoch": 0.1335, "grad_norm": 0.912632942199707, "learning_rate": 3.931172411008482e-05, "loss": 1.4744, "num_input_tokens_seen": 2239758336, "step": 267 }, { "epoch": 0.134, "grad_norm": 1.0088155269622803, "learning_rate": 3.9291148369155964e-05, "loss": 1.4997, "num_input_tokens_seen": 2248146944, "step": 268 }, { "epoch": 0.1345, "grad_norm": 0.8963494896888733, "learning_rate": 3.927027513485498e-05, "loss": 1.523, "num_input_tokens_seen": 2256535552, "step": 269 }, { "epoch": 0.135, "grad_norm": 0.7654589414596558, "learning_rate": 3.9249104729072944e-05, "loss": 1.5157, "num_input_tokens_seen": 2264924160, "step": 270 }, { "epoch": 0.1355, "grad_norm": 0.8809084296226501, "learning_rate": 3.9227637478283725e-05, "loss": 1.4474, "num_input_tokens_seen": 2273312768, "step": 271 }, { "epoch": 0.136, "grad_norm": 1.2865358591079712, "learning_rate": 3.9205873713538864e-05, "loss": 1.5414, "num_input_tokens_seen": 2281701376, "step": 272 }, { "epoch": 0.1365, "grad_norm": 1.593856930732727, "learning_rate": 3.918381377046255e-05, "loss": 1.3743, "num_input_tokens_seen": 2290089984, "step": 273 }, { "epoch": 0.137, "grad_norm": 1.0723228454589844, "learning_rate": 3.916145798924639e-05, "loss": 1.5014, "num_input_tokens_seen": 2298478592, "step": 274 }, { "epoch": 0.1375, "grad_norm": 1.2044541835784912, "learning_rate": 3.913880671464418e-05, "loss": 1.5039, "num_input_tokens_seen": 2306867200, "step": 275 }, { "epoch": 0.138, "grad_norm": 1.074403166770935, "learning_rate": 3.911586029596661e-05, "loss": 1.4905, "num_input_tokens_seen": 2315255808, "step": 276 }, { "epoch": 0.1385, "grad_norm": 1.145273208618164, "learning_rate": 3.9092619087075825e-05, "loss": 1.5985, "num_input_tokens_seen": 2323644416, "step": 277 }, { "epoch": 0.139, "grad_norm": 0.9891590476036072, "learning_rate": 3.906908344638002e-05, "loss": 1.4371, "num_input_tokens_seen": 2332033024, "step": 278 }, { "epoch": 0.1395, "grad_norm": 1.1161611080169678, "learning_rate": 3.904525373682791e-05, "loss": 1.5372, "num_input_tokens_seen": 2340421632, "step": 279 }, { "epoch": 0.14, "grad_norm": 1.4517216682434082, "learning_rate": 3.9021130325903076e-05, "loss": 1.3466, "num_input_tokens_seen": 2348810240, "step": 280 }, { "epoch": 0.1405, "grad_norm": 1.2670317888259888, "learning_rate": 3.8996713585618354e-05, "loss": 1.4057, "num_input_tokens_seen": 2357198848, "step": 281 }, { "epoch": 0.141, "grad_norm": 1.4455859661102295, "learning_rate": 3.897200389251009e-05, "loss": 1.395, "num_input_tokens_seen": 2365587456, "step": 282 }, { "epoch": 0.1415, "grad_norm": 0.8360774517059326, "learning_rate": 3.8947001627632326e-05, "loss": 1.5327, "num_input_tokens_seen": 2373976064, "step": 283 }, { "epoch": 0.142, "grad_norm": 0.7943998575210571, "learning_rate": 3.892170717655091e-05, "loss": 1.4345, "num_input_tokens_seen": 2382364672, "step": 284 }, { "epoch": 0.1425, "grad_norm": 1.4543523788452148, "learning_rate": 3.889612092933756e-05, "loss": 1.4521, "num_input_tokens_seen": 2390753280, "step": 285 }, { "epoch": 0.143, "grad_norm": 0.9397439956665039, "learning_rate": 3.887024328056387e-05, "loss": 1.6032, "num_input_tokens_seen": 2399141888, "step": 286 }, { "epoch": 0.1435, "grad_norm": 0.762290358543396, "learning_rate": 3.88440746292952e-05, "loss": 1.4352, "num_input_tokens_seen": 2407530496, "step": 287 }, { "epoch": 0.144, "grad_norm": 0.6979811191558838, "learning_rate": 3.8817615379084514e-05, "loss": 1.6963, "num_input_tokens_seen": 2415919104, "step": 288 }, { "epoch": 0.1445, "grad_norm": 0.7770799398422241, "learning_rate": 3.879086593796618e-05, "loss": 1.5172, "num_input_tokens_seen": 2424307712, "step": 289 }, { "epoch": 0.145, "grad_norm": 0.8327502012252808, "learning_rate": 3.876382671844969e-05, "loss": 1.5107, "num_input_tokens_seen": 2432696320, "step": 290 }, { "epoch": 0.1455, "grad_norm": 0.8581123948097229, "learning_rate": 3.873649813751323e-05, "loss": 1.368, "num_input_tokens_seen": 2441084928, "step": 291 }, { "epoch": 0.146, "grad_norm": 1.1327030658721924, "learning_rate": 3.870888061659735e-05, "loss": 1.5626, "num_input_tokens_seen": 2449473536, "step": 292 }, { "epoch": 0.1465, "grad_norm": 0.7921344041824341, "learning_rate": 3.8680974581598375e-05, "loss": 1.4555, "num_input_tokens_seen": 2457862144, "step": 293 }, { "epoch": 0.147, "grad_norm": 1.0657182931900024, "learning_rate": 3.865278046286189e-05, "loss": 1.5181, "num_input_tokens_seen": 2466250752, "step": 294 }, { "epoch": 0.1475, "grad_norm": 1.0408730506896973, "learning_rate": 3.862429869517607e-05, "loss": 1.4967, "num_input_tokens_seen": 2474639360, "step": 295 }, { "epoch": 0.148, "grad_norm": 1.5368201732635498, "learning_rate": 3.859552971776503e-05, "loss": 1.4739, "num_input_tokens_seen": 2483027968, "step": 296 }, { "epoch": 0.1485, "grad_norm": 0.8569972515106201, "learning_rate": 3.856647397428198e-05, "loss": 1.425, "num_input_tokens_seen": 2491416576, "step": 297 }, { "epoch": 0.149, "grad_norm": 0.98533695936203, "learning_rate": 3.853713191280242e-05, "loss": 1.6042, "num_input_tokens_seen": 2499805184, "step": 298 }, { "epoch": 0.1495, "grad_norm": 0.9723827242851257, "learning_rate": 3.850750398581725e-05, "loss": 1.6315, "num_input_tokens_seen": 2508193792, "step": 299 }, { "epoch": 0.15, "grad_norm": 1.0792138576507568, "learning_rate": 3.8477590650225735e-05, "loss": 1.4135, "num_input_tokens_seen": 2516582400, "step": 300 }, { "epoch": 0.1505, "grad_norm": 1.1905463933944702, "learning_rate": 3.8447392367328535e-05, "loss": 1.3862, "num_input_tokens_seen": 2524971008, "step": 301 }, { "epoch": 0.151, "grad_norm": 1.1530685424804688, "learning_rate": 3.8416909602820534e-05, "loss": 1.3182, "num_input_tokens_seen": 2533359616, "step": 302 }, { "epoch": 0.1515, "grad_norm": 1.30711829662323, "learning_rate": 3.8386142826783645e-05, "loss": 1.3789, "num_input_tokens_seen": 2541748224, "step": 303 }, { "epoch": 0.152, "grad_norm": 0.9950928688049316, "learning_rate": 3.835509251367963e-05, "loss": 1.5674, "num_input_tokens_seen": 2550136832, "step": 304 }, { "epoch": 0.1525, "grad_norm": 0.92460036277771, "learning_rate": 3.832375914234272e-05, "loss": 1.362, "num_input_tokens_seen": 2558525440, "step": 305 }, { "epoch": 0.153, "grad_norm": 1.242473840713501, "learning_rate": 3.829214319597228e-05, "loss": 1.4396, "num_input_tokens_seen": 2566914048, "step": 306 }, { "epoch": 0.1535, "grad_norm": 1.2101365327835083, "learning_rate": 3.826024516212529e-05, "loss": 1.4095, "num_input_tokens_seen": 2575302656, "step": 307 }, { "epoch": 0.154, "grad_norm": 1.0523734092712402, "learning_rate": 3.8228065532708905e-05, "loss": 1.4848, "num_input_tokens_seen": 2583691264, "step": 308 }, { "epoch": 0.1545, "grad_norm": 0.8966120481491089, "learning_rate": 3.819560480397282e-05, "loss": 1.4843, "num_input_tokens_seen": 2592079872, "step": 309 }, { "epoch": 0.155, "grad_norm": 0.8767443299293518, "learning_rate": 3.816286347650163e-05, "loss": 1.5952, "num_input_tokens_seen": 2600468480, "step": 310 }, { "epoch": 0.1555, "grad_norm": 1.0079834461212158, "learning_rate": 3.81298420552071e-05, "loss": 1.4734, "num_input_tokens_seen": 2608857088, "step": 311 }, { "epoch": 0.156, "grad_norm": 1.2203563451766968, "learning_rate": 3.809654104932039e-05, "loss": 1.3708, "num_input_tokens_seen": 2617245696, "step": 312 }, { "epoch": 0.1565, "grad_norm": 0.9881919026374817, "learning_rate": 3.8062960972384223e-05, "loss": 1.6418, "num_input_tokens_seen": 2625634304, "step": 313 }, { "epoch": 0.157, "grad_norm": 0.9492523670196533, "learning_rate": 3.802910234224491e-05, "loss": 1.5082, "num_input_tokens_seen": 2634022912, "step": 314 }, { "epoch": 0.1575, "grad_norm": 0.8579779863357544, "learning_rate": 3.7994965681044436e-05, "loss": 1.5221, "num_input_tokens_seen": 2642411520, "step": 315 }, { "epoch": 0.158, "grad_norm": 0.9302236437797546, "learning_rate": 3.796055151521231e-05, "loss": 1.4642, "num_input_tokens_seen": 2650800128, "step": 316 }, { "epoch": 0.1585, "grad_norm": 0.8026265501976013, "learning_rate": 3.792586037545758e-05, "loss": 1.3628, "num_input_tokens_seen": 2659188736, "step": 317 }, { "epoch": 0.159, "grad_norm": 1.0016106367111206, "learning_rate": 3.78908927967605e-05, "loss": 1.4537, "num_input_tokens_seen": 2667577344, "step": 318 }, { "epoch": 0.1595, "grad_norm": 0.8716861009597778, "learning_rate": 3.785564931836442e-05, "loss": 1.6082, "num_input_tokens_seen": 2675965952, "step": 319 }, { "epoch": 0.16, "grad_norm": 0.885236918926239, "learning_rate": 3.782013048376736e-05, "loss": 1.6032, "num_input_tokens_seen": 2684354560, "step": 320 }, { "epoch": 0.1605, "grad_norm": 0.7577134966850281, "learning_rate": 3.778433684071369e-05, "loss": 1.5775, "num_input_tokens_seen": 2692743168, "step": 321 }, { "epoch": 0.161, "grad_norm": 1.3161879777908325, "learning_rate": 3.774826894118567e-05, "loss": 1.4185, "num_input_tokens_seen": 2701131776, "step": 322 }, { "epoch": 0.1615, "grad_norm": 1.4146431684494019, "learning_rate": 3.7711927341394916e-05, "loss": 1.3288, "num_input_tokens_seen": 2709520384, "step": 323 }, { "epoch": 0.162, "grad_norm": 0.9457252621650696, "learning_rate": 3.7675312601773874e-05, "loss": 1.5753, "num_input_tokens_seen": 2717908992, "step": 324 }, { "epoch": 0.1625, "grad_norm": 0.929100751876831, "learning_rate": 3.76384252869671e-05, "loss": 1.5252, "num_input_tokens_seen": 2726297600, "step": 325 }, { "epoch": 0.163, "grad_norm": 0.9035556316375732, "learning_rate": 3.760126596582264e-05, "loss": 1.4524, "num_input_tokens_seen": 2734686208, "step": 326 }, { "epoch": 0.1635, "grad_norm": 1.2999314069747925, "learning_rate": 3.756383521138319e-05, "loss": 1.4461, "num_input_tokens_seen": 2743074816, "step": 327 }, { "epoch": 0.164, "grad_norm": 0.8283589482307434, "learning_rate": 3.7526133600877275e-05, "loss": 1.4055, "num_input_tokens_seen": 2751463424, "step": 328 }, { "epoch": 0.1645, "grad_norm": 0.932754635810852, "learning_rate": 3.748816171571038e-05, "loss": 1.468, "num_input_tokens_seen": 2759852032, "step": 329 }, { "epoch": 0.165, "grad_norm": 1.0350176095962524, "learning_rate": 3.744992014145595e-05, "loss": 1.2676, "num_input_tokens_seen": 2768240640, "step": 330 }, { "epoch": 0.1655, "grad_norm": 1.0135241746902466, "learning_rate": 3.741140946784635e-05, "loss": 1.4816, "num_input_tokens_seen": 2776629248, "step": 331 }, { "epoch": 0.166, "grad_norm": 0.8522953987121582, "learning_rate": 3.737263028876383e-05, "loss": 1.549, "num_input_tokens_seen": 2785017856, "step": 332 }, { "epoch": 0.1665, "grad_norm": 0.9122476577758789, "learning_rate": 3.733358320223128e-05, "loss": 1.4926, "num_input_tokens_seen": 2793406464, "step": 333 }, { "epoch": 0.167, "grad_norm": 1.0672262907028198, "learning_rate": 3.729426881040311e-05, "loss": 1.3914, "num_input_tokens_seen": 2801795072, "step": 334 }, { "epoch": 0.1675, "grad_norm": 1.051777720451355, "learning_rate": 3.725468771955584e-05, "loss": 1.4951, "num_input_tokens_seen": 2810183680, "step": 335 }, { "epoch": 0.168, "grad_norm": 1.1109224557876587, "learning_rate": 3.721484054007888e-05, "loss": 1.4601, "num_input_tokens_seen": 2818572288, "step": 336 }, { "epoch": 0.1685, "grad_norm": 0.8404030203819275, "learning_rate": 3.717472788646501e-05, "loss": 1.3862, "num_input_tokens_seen": 2826960896, "step": 337 }, { "epoch": 0.169, "grad_norm": 0.7329870462417603, "learning_rate": 3.7134350377301e-05, "loss": 1.6388, "num_input_tokens_seen": 2835349504, "step": 338 }, { "epoch": 0.1695, "grad_norm": 0.9302310943603516, "learning_rate": 3.709370863525796e-05, "loss": 1.4883, "num_input_tokens_seen": 2843738112, "step": 339 }, { "epoch": 0.17, "grad_norm": 1.0194952487945557, "learning_rate": 3.705280328708185e-05, "loss": 1.6099, "num_input_tokens_seen": 2852126720, "step": 340 }, { "epoch": 0.1705, "grad_norm": 1.0555291175842285, "learning_rate": 3.701163496358373e-05, "loss": 1.5221, "num_input_tokens_seen": 2860515328, "step": 341 }, { "epoch": 0.171, "grad_norm": 0.8368796110153198, "learning_rate": 3.6970204299630077e-05, "loss": 1.4309, "num_input_tokens_seen": 2868903936, "step": 342 }, { "epoch": 0.1715, "grad_norm": 0.9990943074226379, "learning_rate": 3.692851193413299e-05, "loss": 1.5313, "num_input_tokens_seen": 2877292544, "step": 343 }, { "epoch": 0.172, "grad_norm": 1.0590474605560303, "learning_rate": 3.6886558510040305e-05, "loss": 1.4084, "num_input_tokens_seen": 2885681152, "step": 344 }, { "epoch": 0.1725, "grad_norm": 1.1361685991287231, "learning_rate": 3.684434467432573e-05, "loss": 1.5136, "num_input_tokens_seen": 2894069760, "step": 345 }, { "epoch": 0.173, "grad_norm": 1.1363329887390137, "learning_rate": 3.680187107797884e-05, "loss": 1.3386, "num_input_tokens_seen": 2902458368, "step": 346 }, { "epoch": 0.1735, "grad_norm": 0.8391625285148621, "learning_rate": 3.675913837599503e-05, "loss": 1.5767, "num_input_tokens_seen": 2910846976, "step": 347 }, { "epoch": 0.174, "grad_norm": 0.9546042084693909, "learning_rate": 3.671614722736541e-05, "loss": 1.4376, "num_input_tokens_seen": 2919235584, "step": 348 }, { "epoch": 0.1745, "grad_norm": 0.957488477230072, "learning_rate": 3.667289829506669e-05, "loss": 1.5229, "num_input_tokens_seen": 2927624192, "step": 349 }, { "epoch": 0.175, "grad_norm": 0.9508236646652222, "learning_rate": 3.662939224605091e-05, "loss": 1.5911, "num_input_tokens_seen": 2936012800, "step": 350 }, { "epoch": 0.1755, "grad_norm": 1.3240845203399658, "learning_rate": 3.658562975123516e-05, "loss": 1.5105, "num_input_tokens_seen": 2944401408, "step": 351 }, { "epoch": 0.176, "grad_norm": 0.8311107158660889, "learning_rate": 3.654161148549124e-05, "loss": 1.4136, "num_input_tokens_seen": 2952790016, "step": 352 }, { "epoch": 0.1765, "grad_norm": 1.1078195571899414, "learning_rate": 3.649733812763527e-05, "loss": 1.4567, "num_input_tokens_seen": 2961178624, "step": 353 }, { "epoch": 0.177, "grad_norm": 0.8839166760444641, "learning_rate": 3.64528103604172e-05, "loss": 1.5807, "num_input_tokens_seen": 2969567232, "step": 354 }, { "epoch": 0.1775, "grad_norm": 1.2083498239517212, "learning_rate": 3.640802887051027e-05, "loss": 1.2894, "num_input_tokens_seen": 2977955840, "step": 355 }, { "epoch": 0.178, "grad_norm": 1.245565414428711, "learning_rate": 3.636299434850047e-05, "loss": 1.436, "num_input_tokens_seen": 2986344448, "step": 356 }, { "epoch": 0.1785, "grad_norm": 0.7781410217285156, "learning_rate": 3.631770748887583e-05, "loss": 1.6099, "num_input_tokens_seen": 2994733056, "step": 357 }, { "epoch": 0.179, "grad_norm": 1.188174843788147, "learning_rate": 3.627216899001575e-05, "loss": 1.4733, "num_input_tokens_seen": 3003121664, "step": 358 }, { "epoch": 0.1795, "grad_norm": 0.7973136305809021, "learning_rate": 3.62263795541802e-05, "loss": 1.4058, "num_input_tokens_seen": 3011510272, "step": 359 }, { "epoch": 0.18, "grad_norm": 0.9052129983901978, "learning_rate": 3.6180339887498953e-05, "loss": 1.5637, "num_input_tokens_seen": 3019898880, "step": 360 }, { "epoch": 0.1805, "grad_norm": 0.7846525311470032, "learning_rate": 3.6134050699960604e-05, "loss": 1.4831, "num_input_tokens_seen": 3028287488, "step": 361 }, { "epoch": 0.181, "grad_norm": 0.8140630125999451, "learning_rate": 3.608751270540169e-05, "loss": 1.4714, "num_input_tokens_seen": 3036676096, "step": 362 }, { "epoch": 0.1815, "grad_norm": 0.8658472299575806, "learning_rate": 3.604072662149567e-05, "loss": 1.5229, "num_input_tokens_seen": 3045064704, "step": 363 }, { "epoch": 0.182, "grad_norm": 1.0609042644500732, "learning_rate": 3.599369316974182e-05, "loss": 1.5374, "num_input_tokens_seen": 3053453312, "step": 364 }, { "epoch": 0.1825, "grad_norm": 1.131906509399414, "learning_rate": 3.594641307545414e-05, "loss": 1.3994, "num_input_tokens_seen": 3061841920, "step": 365 }, { "epoch": 0.183, "grad_norm": 0.8365881443023682, "learning_rate": 3.58988870677502e-05, "loss": 1.5038, "num_input_tokens_seen": 3070230528, "step": 366 }, { "epoch": 0.1835, "grad_norm": 0.7198511362075806, "learning_rate": 3.585111587953982e-05, "loss": 1.3586, "num_input_tokens_seen": 3078619136, "step": 367 }, { "epoch": 0.184, "grad_norm": 1.0345085859298706, "learning_rate": 3.580310024751381e-05, "loss": 1.3342, "num_input_tokens_seen": 3087007744, "step": 368 }, { "epoch": 0.1845, "grad_norm": 1.111305594444275, "learning_rate": 3.575484091213262e-05, "loss": 1.5304, "num_input_tokens_seen": 3095396352, "step": 369 }, { "epoch": 0.185, "grad_norm": 0.8057035207748413, "learning_rate": 3.57063386176149e-05, "loss": 1.4772, "num_input_tokens_seen": 3103784960, "step": 370 }, { "epoch": 0.1855, "grad_norm": 0.6957182884216309, "learning_rate": 3.565759411192604e-05, "loss": 1.605, "num_input_tokens_seen": 3112173568, "step": 371 }, { "epoch": 0.186, "grad_norm": 0.8788880705833435, "learning_rate": 3.5608608146766597e-05, "loss": 1.5986, "num_input_tokens_seen": 3120562176, "step": 372 }, { "epoch": 0.1865, "grad_norm": 0.738642156124115, "learning_rate": 3.555938147756077e-05, "loss": 1.3805, "num_input_tokens_seen": 3128950784, "step": 373 }, { "epoch": 0.187, "grad_norm": 0.8225732445716858, "learning_rate": 3.5509914863444694e-05, "loss": 1.3732, "num_input_tokens_seen": 3137339392, "step": 374 }, { "epoch": 0.1875, "grad_norm": 1.1334948539733887, "learning_rate": 3.546020906725474e-05, "loss": 1.519, "num_input_tokens_seen": 3145728000, "step": 375 }, { "epoch": 0.188, "grad_norm": 0.7885330319404602, "learning_rate": 3.541026485551579e-05, "loss": 1.3916, "num_input_tokens_seen": 3154116608, "step": 376 }, { "epoch": 0.1885, "grad_norm": 0.782975971698761, "learning_rate": 3.536008299842936e-05, "loss": 1.4815, "num_input_tokens_seen": 3162505216, "step": 377 }, { "epoch": 0.189, "grad_norm": 0.7733896970748901, "learning_rate": 3.530966426986177e-05, "loss": 1.4121, "num_input_tokens_seen": 3170893824, "step": 378 }, { "epoch": 0.1895, "grad_norm": 0.8163855075836182, "learning_rate": 3.525900944733218e-05, "loss": 1.3504, "num_input_tokens_seen": 3179282432, "step": 379 }, { "epoch": 0.19, "grad_norm": 0.9350858330726624, "learning_rate": 3.520811931200063e-05, "loss": 1.3843, "num_input_tokens_seen": 3187671040, "step": 380 }, { "epoch": 0.1905, "grad_norm": 0.9935964941978455, "learning_rate": 3.515699464865594e-05, "loss": 1.3983, "num_input_tokens_seen": 3196059648, "step": 381 }, { "epoch": 0.191, "grad_norm": 0.8148149847984314, "learning_rate": 3.5105636245703675e-05, "loss": 1.4509, "num_input_tokens_seen": 3204448256, "step": 382 }, { "epoch": 0.1915, "grad_norm": 0.5832996964454651, "learning_rate": 3.505404489515394e-05, "loss": 1.6755, "num_input_tokens_seen": 3212836864, "step": 383 }, { "epoch": 0.192, "grad_norm": 0.8905732035636902, "learning_rate": 3.5002221392609196e-05, "loss": 1.4067, "num_input_tokens_seen": 3221225472, "step": 384 }, { "epoch": 0.1925, "grad_norm": 0.9013821482658386, "learning_rate": 3.495016653725194e-05, "loss": 1.4696, "num_input_tokens_seen": 3229614080, "step": 385 }, { "epoch": 0.193, "grad_norm": 0.8977445363998413, "learning_rate": 3.489788113183244e-05, "loss": 1.2823, "num_input_tokens_seen": 3238002688, "step": 386 }, { "epoch": 0.1935, "grad_norm": 0.6694108247756958, "learning_rate": 3.484536598265634e-05, "loss": 1.606, "num_input_tokens_seen": 3246391296, "step": 387 }, { "epoch": 0.194, "grad_norm": 0.789479672908783, "learning_rate": 3.47926218995722e-05, "loss": 1.3914, "num_input_tokens_seen": 3254779904, "step": 388 }, { "epoch": 0.1945, "grad_norm": 1.217543363571167, "learning_rate": 3.473964969595902e-05, "loss": 1.4358, "num_input_tokens_seen": 3263168512, "step": 389 }, { "epoch": 0.195, "grad_norm": 0.9232218265533447, "learning_rate": 3.468645018871371e-05, "loss": 1.5252, "num_input_tokens_seen": 3271557120, "step": 390 }, { "epoch": 0.1955, "grad_norm": 0.8410552144050598, "learning_rate": 3.46330241982385e-05, "loss": 1.5064, "num_input_tokens_seen": 3279945728, "step": 391 }, { "epoch": 0.196, "grad_norm": 0.7464767694473267, "learning_rate": 3.457937254842823e-05, "loss": 1.5469, "num_input_tokens_seen": 3288334336, "step": 392 }, { "epoch": 0.1965, "grad_norm": 0.931220293045044, "learning_rate": 3.4525496066657735e-05, "loss": 1.4228, "num_input_tokens_seen": 3296722944, "step": 393 }, { "epoch": 0.197, "grad_norm": 0.7736884355545044, "learning_rate": 3.4471395583768985e-05, "loss": 1.5042, "num_input_tokens_seen": 3305111552, "step": 394 }, { "epoch": 0.1975, "grad_norm": 0.6560108661651611, "learning_rate": 3.441707193405838e-05, "loss": 1.5902, "num_input_tokens_seen": 3313500160, "step": 395 }, { "epoch": 0.198, "grad_norm": 0.7676752805709839, "learning_rate": 3.436252595526378e-05, "loss": 1.4203, "num_input_tokens_seen": 3321888768, "step": 396 }, { "epoch": 0.1985, "grad_norm": 0.9736531376838684, "learning_rate": 3.430775848855166e-05, "loss": 1.3697, "num_input_tokens_seen": 3330277376, "step": 397 }, { "epoch": 0.199, "grad_norm": 1.216167688369751, "learning_rate": 3.425277037850411e-05, "loss": 1.4303, "num_input_tokens_seen": 3338665984, "step": 398 }, { "epoch": 0.1995, "grad_norm": 0.7341766357421875, "learning_rate": 3.419756247310581e-05, "loss": 1.3222, "num_input_tokens_seen": 3347054592, "step": 399 }, { "epoch": 0.2, "grad_norm": 0.830595850944519, "learning_rate": 3.4142135623730954e-05, "loss": 1.4865, "num_input_tokens_seen": 3355443200, "step": 400 }, { "epoch": 0.2005, "grad_norm": 0.9123224020004272, "learning_rate": 3.408649068513013e-05, "loss": 1.4786, "num_input_tokens_seen": 3363831808, "step": 401 }, { "epoch": 0.201, "grad_norm": 0.7426742315292358, "learning_rate": 3.403062851541712e-05, "loss": 1.4676, "num_input_tokens_seen": 3372220416, "step": 402 }, { "epoch": 0.2015, "grad_norm": 0.751975953578949, "learning_rate": 3.397454997605569e-05, "loss": 1.3365, "num_input_tokens_seen": 3380609024, "step": 403 }, { "epoch": 0.202, "grad_norm": 1.2541900873184204, "learning_rate": 3.391825593184629e-05, "loss": 1.4688, "num_input_tokens_seen": 3388997632, "step": 404 }, { "epoch": 0.2025, "grad_norm": 0.718085765838623, "learning_rate": 3.3861747250912724e-05, "loss": 1.4313, "num_input_tokens_seen": 3397386240, "step": 405 }, { "epoch": 0.203, "grad_norm": 0.8636963367462158, "learning_rate": 3.3805024804688745e-05, "loss": 1.1946, "num_input_tokens_seen": 3405774848, "step": 406 }, { "epoch": 0.2035, "grad_norm": 0.9231307506561279, "learning_rate": 3.374808946790466e-05, "loss": 1.4451, "num_input_tokens_seen": 3414163456, "step": 407 }, { "epoch": 0.204, "grad_norm": 0.6937165856361389, "learning_rate": 3.369094211857378e-05, "loss": 1.3791, "num_input_tokens_seen": 3422552064, "step": 408 }, { "epoch": 0.2045, "grad_norm": 0.8068282604217529, "learning_rate": 3.363358363797893e-05, "loss": 1.3055, "num_input_tokens_seen": 3430940672, "step": 409 }, { "epoch": 0.205, "grad_norm": 1.053564429283142, "learning_rate": 3.357601491065884e-05, "loss": 1.3462, "num_input_tokens_seen": 3439329280, "step": 410 }, { "epoch": 0.2055, "grad_norm": 0.7462815642356873, "learning_rate": 3.35182368243945e-05, "loss": 1.5061, "num_input_tokens_seen": 3447717888, "step": 411 }, { "epoch": 0.206, "grad_norm": 0.6738744378089905, "learning_rate": 3.346025027019547e-05, "loss": 1.4853, "num_input_tokens_seen": 3456106496, "step": 412 }, { "epoch": 0.2065, "grad_norm": 0.7402742505073547, "learning_rate": 3.3402056142286156e-05, "loss": 1.5601, "num_input_tokens_seen": 3464495104, "step": 413 }, { "epoch": 0.207, "grad_norm": 0.7452412247657776, "learning_rate": 3.3343655338091996e-05, "loss": 1.5008, "num_input_tokens_seen": 3472883712, "step": 414 }, { "epoch": 0.2075, "grad_norm": 0.7277024984359741, "learning_rate": 3.328504875822564e-05, "loss": 1.4185, "num_input_tokens_seen": 3481272320, "step": 415 }, { "epoch": 0.208, "grad_norm": 0.8247030377388, "learning_rate": 3.322623730647304e-05, "loss": 1.3824, "num_input_tokens_seen": 3489660928, "step": 416 }, { "epoch": 0.2085, "grad_norm": 1.0731184482574463, "learning_rate": 3.316722188977955e-05, "loss": 1.4415, "num_input_tokens_seen": 3498049536, "step": 417 }, { "epoch": 0.209, "grad_norm": 0.6655694246292114, "learning_rate": 3.310800341823588e-05, "loss": 1.4068, "num_input_tokens_seen": 3506438144, "step": 418 }, { "epoch": 0.2095, "grad_norm": 0.7759659290313721, "learning_rate": 3.3048582805064137e-05, "loss": 1.3946, "num_input_tokens_seen": 3514826752, "step": 419 }, { "epoch": 0.21, "grad_norm": 0.6646033525466919, "learning_rate": 3.298896096660367e-05, "loss": 1.4531, "num_input_tokens_seen": 3523215360, "step": 420 }, { "epoch": 0.2105, "grad_norm": 0.8200505971908569, "learning_rate": 3.2929138822297004e-05, "loss": 1.3261, "num_input_tokens_seen": 3531603968, "step": 421 }, { "epoch": 0.211, "grad_norm": 1.0771782398223877, "learning_rate": 3.286911729467558e-05, "loss": 1.2967, "num_input_tokens_seen": 3539992576, "step": 422 }, { "epoch": 0.2115, "grad_norm": 0.909013569355011, "learning_rate": 3.280889730934562e-05, "loss": 1.4012, "num_input_tokens_seen": 3548381184, "step": 423 }, { "epoch": 0.212, "grad_norm": 0.9211897253990173, "learning_rate": 3.27484797949738e-05, "loss": 1.5176, "num_input_tokens_seen": 3556769792, "step": 424 }, { "epoch": 0.2125, "grad_norm": 0.9550484418869019, "learning_rate": 3.268786568327291e-05, "loss": 1.5174, "num_input_tokens_seen": 3565158400, "step": 425 }, { "epoch": 0.213, "grad_norm": 0.7973341345787048, "learning_rate": 3.262705590898756e-05, "loss": 1.5307, "num_input_tokens_seen": 3573547008, "step": 426 }, { "epoch": 0.2135, "grad_norm": 1.4084835052490234, "learning_rate": 3.2566051409879676e-05, "loss": 1.4755, "num_input_tokens_seen": 3581935616, "step": 427 }, { "epoch": 0.214, "grad_norm": 0.9049050211906433, "learning_rate": 3.250485312671411e-05, "loss": 1.3363, "num_input_tokens_seen": 3590324224, "step": 428 }, { "epoch": 0.2145, "grad_norm": 1.0391597747802734, "learning_rate": 3.244346200324409e-05, "loss": 1.3376, "num_input_tokens_seen": 3598712832, "step": 429 }, { "epoch": 0.215, "grad_norm": 0.7400302886962891, "learning_rate": 3.238187898619669e-05, "loss": 1.5293, "num_input_tokens_seen": 3607101440, "step": 430 }, { "epoch": 0.2155, "grad_norm": 0.7565677165985107, "learning_rate": 3.23201050252582e-05, "loss": 1.4848, "num_input_tokens_seen": 3615490048, "step": 431 }, { "epoch": 0.216, "grad_norm": 0.6060124039649963, "learning_rate": 3.2258141073059533e-05, "loss": 1.4681, "num_input_tokens_seen": 3623878656, "step": 432 }, { "epoch": 0.2165, "grad_norm": 0.6991977095603943, "learning_rate": 3.219598808516148e-05, "loss": 1.3987, "num_input_tokens_seen": 3632267264, "step": 433 }, { "epoch": 0.217, "grad_norm": 0.7281671166419983, "learning_rate": 3.2133647020039995e-05, "loss": 1.3886, "num_input_tokens_seen": 3640655872, "step": 434 }, { "epoch": 0.2175, "grad_norm": 0.6527149081230164, "learning_rate": 3.207111883907143e-05, "loss": 1.5513, "num_input_tokens_seen": 3649044480, "step": 435 }, { "epoch": 0.218, "grad_norm": 0.6604170203208923, "learning_rate": 3.200840450651769e-05, "loss": 1.5738, "num_input_tokens_seen": 3657433088, "step": 436 }, { "epoch": 0.2185, "grad_norm": 0.6788345575332642, "learning_rate": 3.194550498951134e-05, "loss": 1.417, "num_input_tokens_seen": 3665821696, "step": 437 }, { "epoch": 0.219, "grad_norm": 0.6693570613861084, "learning_rate": 3.188242125804078e-05, "loss": 1.4801, "num_input_tokens_seen": 3674210304, "step": 438 }, { "epoch": 0.2195, "grad_norm": 0.6979196667671204, "learning_rate": 3.181915428493515e-05, "loss": 1.4487, "num_input_tokens_seen": 3682598912, "step": 439 }, { "epoch": 0.22, "grad_norm": 0.7262624502182007, "learning_rate": 3.1755705045849465e-05, "loss": 1.4608, "num_input_tokens_seen": 3690987520, "step": 440 }, { "epoch": 0.2205, "grad_norm": 0.6817556023597717, "learning_rate": 3.1692074519249476e-05, "loss": 1.5764, "num_input_tokens_seen": 3699376128, "step": 441 }, { "epoch": 0.221, "grad_norm": 0.7283326387405396, "learning_rate": 3.1628263686396614e-05, "loss": 1.462, "num_input_tokens_seen": 3707764736, "step": 442 }, { "epoch": 0.2215, "grad_norm": 0.7863017320632935, "learning_rate": 3.156427353133286e-05, "loss": 1.5356, "num_input_tokens_seen": 3716153344, "step": 443 }, { "epoch": 0.222, "grad_norm": 0.6429500579833984, "learning_rate": 3.150010504086558e-05, "loss": 1.6187, "num_input_tokens_seen": 3724541952, "step": 444 }, { "epoch": 0.2225, "grad_norm": 0.8246350288391113, "learning_rate": 3.1435759204552246e-05, "loss": 1.459, "num_input_tokens_seen": 3732930560, "step": 445 }, { "epoch": 0.223, "grad_norm": 0.8313559293746948, "learning_rate": 3.1371237014685285e-05, "loss": 1.445, "num_input_tokens_seen": 3741319168, "step": 446 }, { "epoch": 0.2235, "grad_norm": 0.678228497505188, "learning_rate": 3.130653946627666e-05, "loss": 1.567, "num_input_tokens_seen": 3749707776, "step": 447 }, { "epoch": 0.224, "grad_norm": 0.8126465082168579, "learning_rate": 3.124166755704261e-05, "loss": 1.3926, "num_input_tokens_seen": 3758096384, "step": 448 }, { "epoch": 0.2245, "grad_norm": 1.3224822282791138, "learning_rate": 3.117662228738823e-05, "loss": 1.3824, "num_input_tokens_seen": 3766484992, "step": 449 }, { "epoch": 0.225, "grad_norm": 1.047465443611145, "learning_rate": 3.111140466039205e-05, "loss": 1.4348, "num_input_tokens_seen": 3774873600, "step": 450 }, { "epoch": 0.2255, "grad_norm": 0.7666661739349365, "learning_rate": 3.104601568179054e-05, "loss": 1.5393, "num_input_tokens_seen": 3783262208, "step": 451 }, { "epoch": 0.226, "grad_norm": 0.9954845905303955, "learning_rate": 3.098045635996264e-05, "loss": 1.3856, "num_input_tokens_seen": 3791650816, "step": 452 }, { "epoch": 0.2265, "grad_norm": 1.5355603694915771, "learning_rate": 3.09147277059142e-05, "loss": 1.5093, "num_input_tokens_seen": 3800039424, "step": 453 }, { "epoch": 0.227, "grad_norm": 0.9586007595062256, "learning_rate": 3.084883073326238e-05, "loss": 1.3665, "num_input_tokens_seen": 3808428032, "step": 454 }, { "epoch": 0.2275, "grad_norm": 1.163979172706604, "learning_rate": 3.078276645822001e-05, "loss": 1.5178, "num_input_tokens_seen": 3816816640, "step": 455 }, { "epoch": 0.228, "grad_norm": 1.00368070602417, "learning_rate": 3.0716535899579936e-05, "loss": 1.6259, "num_input_tokens_seen": 3825205248, "step": 456 }, { "epoch": 0.2285, "grad_norm": 0.9432997703552246, "learning_rate": 3.065014007869931e-05, "loss": 1.5349, "num_input_tokens_seen": 3833593856, "step": 457 }, { "epoch": 0.229, "grad_norm": 1.0078848600387573, "learning_rate": 3.058358001948381e-05, "loss": 1.4382, "num_input_tokens_seen": 3841982464, "step": 458 }, { "epoch": 0.2295, "grad_norm": 1.014064073562622, "learning_rate": 3.0516856748371914e-05, "loss": 1.4839, "num_input_tokens_seen": 3850371072, "step": 459 }, { "epoch": 0.23, "grad_norm": 0.8157885074615479, "learning_rate": 3.0449971294318977e-05, "loss": 1.4889, "num_input_tokens_seen": 3858759680, "step": 460 }, { "epoch": 0.2305, "grad_norm": 0.762134313583374, "learning_rate": 3.0382924688781462e-05, "loss": 1.594, "num_input_tokens_seen": 3867148288, "step": 461 }, { "epoch": 0.231, "grad_norm": 0.8391034603118896, "learning_rate": 3.031571796570095e-05, "loss": 1.3632, "num_input_tokens_seen": 3875536896, "step": 462 }, { "epoch": 0.2315, "grad_norm": 0.809033215045929, "learning_rate": 3.0248352161488267e-05, "loss": 1.6475, "num_input_tokens_seen": 3883925504, "step": 463 }, { "epoch": 0.232, "grad_norm": 0.746643602848053, "learning_rate": 3.018082831500743e-05, "loss": 1.4414, "num_input_tokens_seen": 3892314112, "step": 464 }, { "epoch": 0.2325, "grad_norm": 0.752721905708313, "learning_rate": 3.0113147467559697e-05, "loss": 1.5054, "num_input_tokens_seen": 3900702720, "step": 465 }, { "epoch": 0.233, "grad_norm": 0.9590891599655151, "learning_rate": 3.004531066286745e-05, "loss": 1.6844, "num_input_tokens_seen": 3909091328, "step": 466 }, { "epoch": 0.2335, "grad_norm": 1.0775322914123535, "learning_rate": 2.997731894705815e-05, "loss": 1.4237, "num_input_tokens_seen": 3917479936, "step": 467 }, { "epoch": 0.234, "grad_norm": 0.7942163348197937, "learning_rate": 2.9909173368648154e-05, "loss": 1.3686, "num_input_tokens_seen": 3925868544, "step": 468 }, { "epoch": 0.2345, "grad_norm": 0.8021498918533325, "learning_rate": 2.9840874978526582e-05, "loss": 1.4463, "num_input_tokens_seen": 3934257152, "step": 469 }, { "epoch": 0.235, "grad_norm": 0.6999263763427734, "learning_rate": 2.9772424829939103e-05, "loss": 1.5182, "num_input_tokens_seen": 3942645760, "step": 470 }, { "epoch": 0.2355, "grad_norm": 0.6573587656021118, "learning_rate": 2.9703823978471676e-05, "loss": 1.4216, "num_input_tokens_seen": 3951034368, "step": 471 }, { "epoch": 0.236, "grad_norm": 0.5885545611381531, "learning_rate": 2.9635073482034307e-05, "loss": 1.4596, "num_input_tokens_seen": 3959422976, "step": 472 }, { "epoch": 0.2365, "grad_norm": 0.7636064887046814, "learning_rate": 2.9566174400844692e-05, "loss": 1.2986, "num_input_tokens_seen": 3967811584, "step": 473 }, { "epoch": 0.237, "grad_norm": 0.767098605632782, "learning_rate": 2.949712779741189e-05, "loss": 1.3722, "num_input_tokens_seen": 3976200192, "step": 474 }, { "epoch": 0.2375, "grad_norm": 0.7839322686195374, "learning_rate": 2.9427934736519962e-05, "loss": 1.5351, "num_input_tokens_seen": 3984588800, "step": 475 }, { "epoch": 0.238, "grad_norm": 0.7171193361282349, "learning_rate": 2.935859628521147e-05, "loss": 1.4625, "num_input_tokens_seen": 3992977408, "step": 476 }, { "epoch": 0.2385, "grad_norm": 1.4963669776916504, "learning_rate": 2.9289113512771133e-05, "loss": 1.6766, "num_input_tokens_seen": 4001366016, "step": 477 }, { "epoch": 0.239, "grad_norm": 0.7156916856765747, "learning_rate": 2.921948749070925e-05, "loss": 1.6142, "num_input_tokens_seen": 4009754624, "step": 478 }, { "epoch": 0.2395, "grad_norm": 0.6309584975242615, "learning_rate": 2.914971929274521e-05, "loss": 1.4025, "num_input_tokens_seen": 4018143232, "step": 479 }, { "epoch": 0.24, "grad_norm": 0.7521786689758301, "learning_rate": 2.9079809994790937e-05, "loss": 1.4687, "num_input_tokens_seen": 4026531840, "step": 480 }, { "epoch": 0.2405, "grad_norm": 0.5882706046104431, "learning_rate": 2.900976067493429e-05, "loss": 1.2888, "num_input_tokens_seen": 4034920448, "step": 481 }, { "epoch": 0.241, "grad_norm": 0.7344236373901367, "learning_rate": 2.8939572413422426e-05, "loss": 1.2505, "num_input_tokens_seen": 4043309056, "step": 482 }, { "epoch": 0.2415, "grad_norm": 0.7438739538192749, "learning_rate": 2.886924629264517e-05, "loss": 1.3081, "num_input_tokens_seen": 4051697664, "step": 483 }, { "epoch": 0.242, "grad_norm": 0.9241055250167847, "learning_rate": 2.8798783397118305e-05, "loss": 1.4013, "num_input_tokens_seen": 4060086272, "step": 484 }, { "epoch": 0.2425, "grad_norm": 1.5059905052185059, "learning_rate": 2.872818481346684e-05, "loss": 1.4172, "num_input_tokens_seen": 4068474880, "step": 485 }, { "epoch": 0.243, "grad_norm": 0.824744701385498, "learning_rate": 2.8657451630408287e-05, "loss": 1.5448, "num_input_tokens_seen": 4076863488, "step": 486 }, { "epoch": 0.2435, "grad_norm": 1.162174105644226, "learning_rate": 2.85865849387358e-05, "loss": 1.4611, "num_input_tokens_seen": 4085252096, "step": 487 }, { "epoch": 0.244, "grad_norm": 0.873450756072998, "learning_rate": 2.8515585831301456e-05, "loss": 1.3448, "num_input_tokens_seen": 4093640704, "step": 488 }, { "epoch": 0.2445, "grad_norm": 2.251950979232788, "learning_rate": 2.844445540299931e-05, "loss": 1.5159, "num_input_tokens_seen": 4102029312, "step": 489 }, { "epoch": 0.245, "grad_norm": 1.2113547325134277, "learning_rate": 2.8373194750748566e-05, "loss": 1.4218, "num_input_tokens_seen": 4110417920, "step": 490 }, { "epoch": 0.2455, "grad_norm": 1.1472817659378052, "learning_rate": 2.8301804973476628e-05, "loss": 1.3113, "num_input_tokens_seen": 4118806528, "step": 491 }, { "epoch": 0.246, "grad_norm": 0.9820563197135925, "learning_rate": 2.823028717210218e-05, "loss": 1.4429, "num_input_tokens_seen": 4127195136, "step": 492 }, { "epoch": 0.2465, "grad_norm": 0.7963792085647583, "learning_rate": 2.8158642449518186e-05, "loss": 1.5824, "num_input_tokens_seen": 4135583744, "step": 493 }, { "epoch": 0.247, "grad_norm": 0.7588343024253845, "learning_rate": 2.8086871910574904e-05, "loss": 1.3568, "num_input_tokens_seen": 4143972352, "step": 494 }, { "epoch": 0.2475, "grad_norm": 0.8434267044067383, "learning_rate": 2.8014976662062818e-05, "loss": 1.4751, "num_input_tokens_seen": 4152360960, "step": 495 }, { "epoch": 0.248, "grad_norm": 0.7480296492576599, "learning_rate": 2.7942957812695613e-05, "loss": 1.4948, "num_input_tokens_seen": 4160749568, "step": 496 }, { "epoch": 0.2485, "grad_norm": 0.778762698173523, "learning_rate": 2.787081647309303e-05, "loss": 1.4823, "num_input_tokens_seen": 4169138176, "step": 497 }, { "epoch": 0.249, "grad_norm": 0.5632816553115845, "learning_rate": 2.7798553755763768e-05, "loss": 1.5916, "num_input_tokens_seen": 4177526784, "step": 498 }, { "epoch": 0.2495, "grad_norm": 0.8150230050086975, "learning_rate": 2.7726170775088324e-05, "loss": 1.6006, "num_input_tokens_seen": 4185915392, "step": 499 }, { "epoch": 0.25, "grad_norm": 0.5832118391990662, "learning_rate": 2.7653668647301797e-05, "loss": 1.5992, "num_input_tokens_seen": 4194304000, "step": 500 }, { "epoch": 0.2505, "grad_norm": 0.7656115293502808, "learning_rate": 2.7581048490476695e-05, "loss": 1.5432, "num_input_tokens_seen": 4202692608, "step": 501 }, { "epoch": 0.251, "grad_norm": 0.5938467383384705, "learning_rate": 2.7508311424505665e-05, "loss": 1.507, "num_input_tokens_seen": 4211081216, "step": 502 }, { "epoch": 0.2515, "grad_norm": 0.5441139936447144, "learning_rate": 2.7435458571084247e-05, "loss": 1.4525, "num_input_tokens_seen": 4219469824, "step": 503 }, { "epoch": 0.252, "grad_norm": 0.6305999755859375, "learning_rate": 2.7362491053693564e-05, "loss": 1.4723, "num_input_tokens_seen": 4227858432, "step": 504 }, { "epoch": 0.2525, "grad_norm": 0.6024186015129089, "learning_rate": 2.7289409997583002e-05, "loss": 1.4195, "num_input_tokens_seen": 4236247040, "step": 505 }, { "epoch": 0.253, "grad_norm": 0.553775429725647, "learning_rate": 2.7216216529752836e-05, "loss": 1.4429, "num_input_tokens_seen": 4244635648, "step": 506 }, { "epoch": 0.2535, "grad_norm": 0.5768627524375916, "learning_rate": 2.7142911778936913e-05, "loss": 1.4958, "num_input_tokens_seen": 4253024256, "step": 507 }, { "epoch": 0.254, "grad_norm": 0.784758985042572, "learning_rate": 2.7069496875585145e-05, "loss": 1.3976, "num_input_tokens_seen": 4261412864, "step": 508 }, { "epoch": 0.2545, "grad_norm": 0.6774864792823792, "learning_rate": 2.6995972951846177e-05, "loss": 1.2998, "num_input_tokens_seen": 4269801472, "step": 509 }, { "epoch": 0.255, "grad_norm": 0.8152163624763489, "learning_rate": 2.692234114154986e-05, "loss": 1.239, "num_input_tokens_seen": 4278190080, "step": 510 }, { "epoch": 0.2555, "grad_norm": 0.758950412273407, "learning_rate": 2.68486025801898e-05, "loss": 1.379, "num_input_tokens_seen": 4286578688, "step": 511 }, { "epoch": 0.256, "grad_norm": 0.5301639437675476, "learning_rate": 2.6774758404905833e-05, "loss": 1.4378, "num_input_tokens_seen": 4294967296, "step": 512 }, { "epoch": 0.2565, "grad_norm": 0.5760114789009094, "learning_rate": 2.670080975446648e-05, "loss": 1.3666, "num_input_tokens_seen": 4303355904, "step": 513 }, { "epoch": 0.257, "grad_norm": 0.6986238360404968, "learning_rate": 2.662675776925142e-05, "loss": 1.3647, "num_input_tokens_seen": 4311744512, "step": 514 }, { "epoch": 0.2575, "grad_norm": 0.7341744899749756, "learning_rate": 2.6552603591233875e-05, "loss": 1.4368, "num_input_tokens_seen": 4320133120, "step": 515 }, { "epoch": 0.258, "grad_norm": 0.6045746207237244, "learning_rate": 2.647834836396299e-05, "loss": 1.4528, "num_input_tokens_seen": 4328521728, "step": 516 }, { "epoch": 0.2585, "grad_norm": 0.5556396842002869, "learning_rate": 2.6403993232546235e-05, "loss": 1.384, "num_input_tokens_seen": 4336910336, "step": 517 }, { "epoch": 0.259, "grad_norm": 0.7080802321434021, "learning_rate": 2.6329539343631725e-05, "loss": 1.5486, "num_input_tokens_seen": 4345298944, "step": 518 }, { "epoch": 0.2595, "grad_norm": 0.5549618005752563, "learning_rate": 2.625498784539052e-05, "loss": 1.3982, "num_input_tokens_seen": 4353687552, "step": 519 }, { "epoch": 0.26, "grad_norm": 0.6162033677101135, "learning_rate": 2.618033988749895e-05, "loss": 1.2825, "num_input_tokens_seen": 4362076160, "step": 520 }, { "epoch": 0.2605, "grad_norm": 0.9670022130012512, "learning_rate": 2.6105596621120873e-05, "loss": 1.3772, "num_input_tokens_seen": 4370464768, "step": 521 }, { "epoch": 0.261, "grad_norm": 1.053465485572815, "learning_rate": 2.6030759198889915e-05, "loss": 1.445, "num_input_tokens_seen": 4378853376, "step": 522 }, { "epoch": 0.2615, "grad_norm": 0.539896547794342, "learning_rate": 2.595582877489171e-05, "loss": 1.3613, "num_input_tokens_seen": 4387241984, "step": 523 }, { "epoch": 0.262, "grad_norm": 0.7269028425216675, "learning_rate": 2.588080650464608e-05, "loss": 1.3752, "num_input_tokens_seen": 4395630592, "step": 524 }, { "epoch": 0.2625, "grad_norm": 0.7713400721549988, "learning_rate": 2.580569354508925e-05, "loss": 1.4271, "num_input_tokens_seen": 4404019200, "step": 525 }, { "epoch": 0.263, "grad_norm": 0.5016767978668213, "learning_rate": 2.573049105455597e-05, "loss": 1.5652, "num_input_tokens_seen": 4412407808, "step": 526 }, { "epoch": 0.2635, "grad_norm": 0.6196660399436951, "learning_rate": 2.5655200192761668e-05, "loss": 1.5806, "num_input_tokens_seen": 4420796416, "step": 527 }, { "epoch": 0.264, "grad_norm": 0.5495991706848145, "learning_rate": 2.557982212078459e-05, "loss": 1.5978, "num_input_tokens_seen": 4429185024, "step": 528 }, { "epoch": 0.2645, "grad_norm": 0.8191635608673096, "learning_rate": 2.550435800104783e-05, "loss": 1.5199, "num_input_tokens_seen": 4437573632, "step": 529 }, { "epoch": 0.265, "grad_norm": 0.7542608380317688, "learning_rate": 2.5428808997301486e-05, "loss": 1.4653, "num_input_tokens_seen": 4445962240, "step": 530 }, { "epoch": 0.2655, "grad_norm": 0.49044930934906006, "learning_rate": 2.535317627460465e-05, "loss": 1.3972, "num_input_tokens_seen": 4454350848, "step": 531 }, { "epoch": 0.266, "grad_norm": 0.6356293559074402, "learning_rate": 2.5277460999307462e-05, "loss": 1.4841, "num_input_tokens_seen": 4462739456, "step": 532 }, { "epoch": 0.2665, "grad_norm": 0.602867841720581, "learning_rate": 2.5201664339033138e-05, "loss": 1.5647, "num_input_tokens_seen": 4471128064, "step": 533 }, { "epoch": 0.267, "grad_norm": 0.832502543926239, "learning_rate": 2.5125787462659937e-05, "loss": 1.4769, "num_input_tokens_seen": 4479516672, "step": 534 }, { "epoch": 0.2675, "grad_norm": 0.8413757085800171, "learning_rate": 2.504983154030316e-05, "loss": 1.3794, "num_input_tokens_seen": 4487905280, "step": 535 }, { "epoch": 0.268, "grad_norm": 0.7393339276313782, "learning_rate": 2.4973797743297103e-05, "loss": 1.5675, "num_input_tokens_seen": 4496293888, "step": 536 }, { "epoch": 0.2685, "grad_norm": 0.6585960984230042, "learning_rate": 2.489768724417695e-05, "loss": 1.6703, "num_input_tokens_seen": 4504682496, "step": 537 }, { "epoch": 0.269, "grad_norm": 0.7164105176925659, "learning_rate": 2.4821501216660778e-05, "loss": 1.3813, "num_input_tokens_seen": 4513071104, "step": 538 }, { "epoch": 0.2695, "grad_norm": 0.5881420373916626, "learning_rate": 2.474524083563136e-05, "loss": 1.3829, "num_input_tokens_seen": 4521459712, "step": 539 }, { "epoch": 0.27, "grad_norm": 0.5825918912887573, "learning_rate": 2.4668907277118114e-05, "loss": 1.3653, "num_input_tokens_seen": 4529848320, "step": 540 }, { "epoch": 0.2705, "grad_norm": 0.7076876163482666, "learning_rate": 2.459250171827894e-05, "loss": 1.4445, "num_input_tokens_seen": 4538236928, "step": 541 }, { "epoch": 0.271, "grad_norm": 0.5759537220001221, "learning_rate": 2.4516025337382078e-05, "loss": 1.5127, "num_input_tokens_seen": 4546625536, "step": 542 }, { "epoch": 0.2715, "grad_norm": 0.5978606343269348, "learning_rate": 2.443947931378792e-05, "loss": 1.4295, "num_input_tokens_seen": 4555014144, "step": 543 }, { "epoch": 0.272, "grad_norm": 0.5363240838050842, "learning_rate": 2.4362864827930855e-05, "loss": 1.2487, "num_input_tokens_seen": 4563402752, "step": 544 }, { "epoch": 0.2725, "grad_norm": 0.6239610314369202, "learning_rate": 2.4286183061301016e-05, "loss": 1.3886, "num_input_tokens_seen": 4571791360, "step": 545 }, { "epoch": 0.273, "grad_norm": 0.5882096290588379, "learning_rate": 2.4209435196426112e-05, "loss": 1.4466, "num_input_tokens_seen": 4580179968, "step": 546 }, { "epoch": 0.2735, "grad_norm": 0.5329456925392151, "learning_rate": 2.4132622416853164e-05, "loss": 1.3682, "num_input_tokens_seen": 4588568576, "step": 547 }, { "epoch": 0.274, "grad_norm": 0.6302275657653809, "learning_rate": 2.405574590713025e-05, "loss": 1.1858, "num_input_tokens_seen": 4596957184, "step": 548 }, { "epoch": 0.2745, "grad_norm": 0.5893291234970093, "learning_rate": 2.3978806852788253e-05, "loss": 1.4673, "num_input_tokens_seen": 4605345792, "step": 549 }, { "epoch": 0.275, "grad_norm": 0.5820369124412537, "learning_rate": 2.390180644032257e-05, "loss": 1.3326, "num_input_tokens_seen": 4613734400, "step": 550 }, { "epoch": 0.2755, "grad_norm": 0.4964269697666168, "learning_rate": 2.382474585717481e-05, "loss": 1.3376, "num_input_tokens_seen": 4622123008, "step": 551 }, { "epoch": 0.276, "grad_norm": 0.7295953631401062, "learning_rate": 2.37476262917145e-05, "loss": 1.4808, "num_input_tokens_seen": 4630511616, "step": 552 }, { "epoch": 0.2765, "grad_norm": 0.6741532683372498, "learning_rate": 2.3670448933220732e-05, "loss": 1.4168, "num_input_tokens_seen": 4638900224, "step": 553 }, { "epoch": 0.277, "grad_norm": 0.5214787125587463, "learning_rate": 2.3593214971863857e-05, "loss": 1.5511, "num_input_tokens_seen": 4647288832, "step": 554 }, { "epoch": 0.2775, "grad_norm": 0.8530207276344299, "learning_rate": 2.3515925598687097e-05, "loss": 1.5042, "num_input_tokens_seen": 4655677440, "step": 555 }, { "epoch": 0.278, "grad_norm": 0.6344778537750244, "learning_rate": 2.3438582005588192e-05, "loss": 1.3884, "num_input_tokens_seen": 4664066048, "step": 556 }, { "epoch": 0.2785, "grad_norm": 0.559327244758606, "learning_rate": 2.3361185385301042e-05, "loss": 1.4922, "num_input_tokens_seen": 4672454656, "step": 557 }, { "epoch": 0.279, "grad_norm": 0.6840835213661194, "learning_rate": 2.328373693137726e-05, "loss": 1.4604, "num_input_tokens_seen": 4680843264, "step": 558 }, { "epoch": 0.2795, "grad_norm": 0.6126253008842468, "learning_rate": 2.3206237838167825e-05, "loss": 1.5406, "num_input_tokens_seen": 4689231872, "step": 559 }, { "epoch": 0.28, "grad_norm": 0.7576929926872253, "learning_rate": 2.312868930080462e-05, "loss": 1.4167, "num_input_tokens_seen": 4697620480, "step": 560 }, { "epoch": 0.2805, "grad_norm": 0.9730141758918762, "learning_rate": 2.3051092515182022e-05, "loss": 1.5018, "num_input_tokens_seen": 4706009088, "step": 561 }, { "epoch": 0.281, "grad_norm": 0.549049973487854, "learning_rate": 2.2973448677938466e-05, "loss": 1.3886, "num_input_tokens_seen": 4714397696, "step": 562 }, { "epoch": 0.2815, "grad_norm": 0.9518319368362427, "learning_rate": 2.289575898643796e-05, "loss": 1.4624, "num_input_tokens_seen": 4722786304, "step": 563 }, { "epoch": 0.282, "grad_norm": 0.8367601633071899, "learning_rate": 2.2818024638751655e-05, "loss": 1.2965, "num_input_tokens_seen": 4731174912, "step": 564 }, { "epoch": 0.2825, "grad_norm": 0.6762754917144775, "learning_rate": 2.2740246833639366e-05, "loss": 1.3959, "num_input_tokens_seen": 4739563520, "step": 565 }, { "epoch": 0.283, "grad_norm": 0.8794370889663696, "learning_rate": 2.266242677053105e-05, "loss": 1.4526, "num_input_tokens_seen": 4747952128, "step": 566 }, { "epoch": 0.2835, "grad_norm": 0.6587076783180237, "learning_rate": 2.2584565649508355e-05, "loss": 1.3469, "num_input_tokens_seen": 4756340736, "step": 567 }, { "epoch": 0.284, "grad_norm": 0.7360010147094727, "learning_rate": 2.2506664671286087e-05, "loss": 1.3586, "num_input_tokens_seen": 4764729344, "step": 568 }, { "epoch": 0.2845, "grad_norm": 0.7257161140441895, "learning_rate": 2.2428725037193697e-05, "loss": 1.3982, "num_input_tokens_seen": 4773117952, "step": 569 }, { "epoch": 0.285, "grad_norm": 0.6716630458831787, "learning_rate": 2.2350747949156756e-05, "loss": 1.5609, "num_input_tokens_seen": 4781506560, "step": 570 }, { "epoch": 0.2855, "grad_norm": 0.7084539532661438, "learning_rate": 2.2272734609678426e-05, "loss": 1.406, "num_input_tokens_seen": 4789895168, "step": 571 }, { "epoch": 0.286, "grad_norm": 0.6002200841903687, "learning_rate": 2.2194686221820905e-05, "loss": 1.4629, "num_input_tokens_seen": 4798283776, "step": 572 }, { "epoch": 0.2865, "grad_norm": 0.6542272567749023, "learning_rate": 2.2116603989186895e-05, "loss": 1.4454, "num_input_tokens_seen": 4806672384, "step": 573 }, { "epoch": 0.287, "grad_norm": 0.7670809626579285, "learning_rate": 2.2038489115901e-05, "loss": 1.3285, "num_input_tokens_seen": 4815060992, "step": 574 }, { "epoch": 0.2875, "grad_norm": 0.5654203295707703, "learning_rate": 2.196034280659122e-05, "loss": 1.4207, "num_input_tokens_seen": 4823449600, "step": 575 }, { "epoch": 0.288, "grad_norm": 0.7025532126426697, "learning_rate": 2.1882166266370292e-05, "loss": 1.2446, "num_input_tokens_seen": 4831838208, "step": 576 }, { "epoch": 0.2885, "grad_norm": 0.6945000290870667, "learning_rate": 2.1803960700817185e-05, "loss": 1.4721, "num_input_tokens_seen": 4840226816, "step": 577 }, { "epoch": 0.289, "grad_norm": 0.6775925755500793, "learning_rate": 2.1725727315958473e-05, "loss": 1.3402, "num_input_tokens_seen": 4848615424, "step": 578 }, { "epoch": 0.2895, "grad_norm": 0.7750461101531982, "learning_rate": 2.1647467318249715e-05, "loss": 1.4354, "num_input_tokens_seen": 4857004032, "step": 579 }, { "epoch": 0.29, "grad_norm": 0.6834791302680969, "learning_rate": 2.1569181914556904e-05, "loss": 1.5643, "num_input_tokens_seen": 4865392640, "step": 580 }, { "epoch": 0.2905, "grad_norm": 0.6802902817726135, "learning_rate": 2.1490872312137795e-05, "loss": 1.3149, "num_input_tokens_seen": 4873781248, "step": 581 }, { "epoch": 0.291, "grad_norm": 1.0233876705169678, "learning_rate": 2.1412539718623337e-05, "loss": 1.5537, "num_input_tokens_seen": 4882169856, "step": 582 }, { "epoch": 0.2915, "grad_norm": 0.6507788300514221, "learning_rate": 2.1334185341999024e-05, "loss": 1.2769, "num_input_tokens_seen": 4890558464, "step": 583 }, { "epoch": 0.292, "grad_norm": 0.5922709107398987, "learning_rate": 2.125581039058627e-05, "loss": 1.4924, "num_input_tokens_seen": 4898947072, "step": 584 }, { "epoch": 0.2925, "grad_norm": 0.7806411385536194, "learning_rate": 2.117741607302378e-05, "loss": 1.3305, "num_input_tokens_seen": 4907335680, "step": 585 }, { "epoch": 0.293, "grad_norm": 0.6991647481918335, "learning_rate": 2.109900359824892e-05, "loss": 1.4842, "num_input_tokens_seen": 4915724288, "step": 586 }, { "epoch": 0.2935, "grad_norm": 0.6910050511360168, "learning_rate": 2.1020574175479035e-05, "loss": 1.4521, "num_input_tokens_seen": 4924112896, "step": 587 }, { "epoch": 0.294, "grad_norm": 0.7409821152687073, "learning_rate": 2.0942129014192854e-05, "loss": 1.5209, "num_input_tokens_seen": 4932501504, "step": 588 }, { "epoch": 0.2945, "grad_norm": 1.4206409454345703, "learning_rate": 2.0863669324111807e-05, "loss": 1.4107, "num_input_tokens_seen": 4940890112, "step": 589 }, { "epoch": 0.295, "grad_norm": 0.6781050562858582, "learning_rate": 2.0785196315181374e-05, "loss": 1.5288, "num_input_tokens_seen": 4949278720, "step": 590 }, { "epoch": 0.2955, "grad_norm": 1.0001184940338135, "learning_rate": 2.0706711197552427e-05, "loss": 1.4751, "num_input_tokens_seen": 4957667328, "step": 591 }, { "epoch": 0.296, "grad_norm": 0.6228082180023193, "learning_rate": 2.0628215181562567e-05, "loss": 1.4661, "num_input_tokens_seen": 4966055936, "step": 592 }, { "epoch": 0.2965, "grad_norm": 0.6182650327682495, "learning_rate": 2.054970947771747e-05, "loss": 1.3928, "num_input_tokens_seen": 4974444544, "step": 593 }, { "epoch": 0.297, "grad_norm": 0.6820820569992065, "learning_rate": 2.0471195296672207e-05, "loss": 1.4402, "num_input_tokens_seen": 4982833152, "step": 594 }, { "epoch": 0.2975, "grad_norm": 0.6595681309700012, "learning_rate": 2.0392673849212565e-05, "loss": 1.4636, "num_input_tokens_seen": 4991221760, "step": 595 }, { "epoch": 0.298, "grad_norm": 0.5305392742156982, "learning_rate": 2.0314146346236415e-05, "loss": 1.3205, "num_input_tokens_seen": 4999610368, "step": 596 }, { "epoch": 0.2985, "grad_norm": 0.5211024880409241, "learning_rate": 2.0235613998734985e-05, "loss": 1.4325, "num_input_tokens_seen": 5007998976, "step": 597 }, { "epoch": 0.299, "grad_norm": 0.51418536901474, "learning_rate": 2.0157078017774228e-05, "loss": 1.4516, "num_input_tokens_seen": 5016387584, "step": 598 }, { "epoch": 0.2995, "grad_norm": 0.472655326128006, "learning_rate": 2.0078539614476122e-05, "loss": 1.4689, "num_input_tokens_seen": 5024776192, "step": 599 }, { "epoch": 0.3, "grad_norm": 0.4495573043823242, "learning_rate": 2e-05, "loss": 1.4367, "num_input_tokens_seen": 5033164800, "step": 600 }, { "epoch": 0.3005, "grad_norm": 0.5164429545402527, "learning_rate": 1.9921460385523884e-05, "loss": 1.5228, "num_input_tokens_seen": 5041553408, "step": 601 }, { "epoch": 0.301, "grad_norm": 0.4836256802082062, "learning_rate": 1.9842921982225782e-05, "loss": 1.395, "num_input_tokens_seen": 5049942016, "step": 602 }, { "epoch": 0.3015, "grad_norm": 0.4672565758228302, "learning_rate": 1.9764386001265015e-05, "loss": 1.2658, "num_input_tokens_seen": 5058330624, "step": 603 }, { "epoch": 0.302, "grad_norm": 0.518621563911438, "learning_rate": 1.9685853653763592e-05, "loss": 1.3537, "num_input_tokens_seen": 5066719232, "step": 604 }, { "epoch": 0.3025, "grad_norm": 0.4430660605430603, "learning_rate": 1.960732615078744e-05, "loss": 1.4805, "num_input_tokens_seen": 5075107840, "step": 605 }, { "epoch": 0.303, "grad_norm": 0.5217835903167725, "learning_rate": 1.95288047033278e-05, "loss": 1.5621, "num_input_tokens_seen": 5083496448, "step": 606 }, { "epoch": 0.3035, "grad_norm": 0.4984613358974457, "learning_rate": 1.9450290522282533e-05, "loss": 1.3864, "num_input_tokens_seen": 5091885056, "step": 607 }, { "epoch": 0.304, "grad_norm": 0.42779552936553955, "learning_rate": 1.9371784818437436e-05, "loss": 1.4034, "num_input_tokens_seen": 5100273664, "step": 608 }, { "epoch": 0.3045, "grad_norm": 0.450816810131073, "learning_rate": 1.929328880244758e-05, "loss": 1.3803, "num_input_tokens_seen": 5108662272, "step": 609 }, { "epoch": 0.305, "grad_norm": 0.5176210999488831, "learning_rate": 1.9214803684818636e-05, "loss": 1.5259, "num_input_tokens_seen": 5117050880, "step": 610 }, { "epoch": 0.3055, "grad_norm": 0.5661643743515015, "learning_rate": 1.9136330675888192e-05, "loss": 1.3864, "num_input_tokens_seen": 5125439488, "step": 611 }, { "epoch": 0.306, "grad_norm": 0.45199450850486755, "learning_rate": 1.905787098580715e-05, "loss": 1.3537, "num_input_tokens_seen": 5133828096, "step": 612 }, { "epoch": 0.3065, "grad_norm": 0.5624679923057556, "learning_rate": 1.897942582452097e-05, "loss": 1.3428, "num_input_tokens_seen": 5142216704, "step": 613 }, { "epoch": 0.307, "grad_norm": 0.5675469040870667, "learning_rate": 1.890099640175109e-05, "loss": 1.5039, "num_input_tokens_seen": 5150605312, "step": 614 }, { "epoch": 0.3075, "grad_norm": 0.5343053340911865, "learning_rate": 1.882258392697622e-05, "loss": 1.4343, "num_input_tokens_seen": 5158993920, "step": 615 }, { "epoch": 0.308, "grad_norm": 0.5548115372657776, "learning_rate": 1.8744189609413733e-05, "loss": 1.3258, "num_input_tokens_seen": 5167382528, "step": 616 }, { "epoch": 0.3085, "grad_norm": 0.5104579329490662, "learning_rate": 1.8665814658000982e-05, "loss": 1.3997, "num_input_tokens_seen": 5175771136, "step": 617 }, { "epoch": 0.309, "grad_norm": 0.6124238967895508, "learning_rate": 1.8587460281376673e-05, "loss": 1.3717, "num_input_tokens_seen": 5184159744, "step": 618 }, { "epoch": 0.3095, "grad_norm": 0.4247097969055176, "learning_rate": 1.8509127687862208e-05, "loss": 1.56, "num_input_tokens_seen": 5192548352, "step": 619 }, { "epoch": 0.31, "grad_norm": 0.5699355602264404, "learning_rate": 1.8430818085443106e-05, "loss": 1.4855, "num_input_tokens_seen": 5200936960, "step": 620 }, { "epoch": 0.3105, "grad_norm": 0.5173696279525757, "learning_rate": 1.835253268175029e-05, "loss": 1.4375, "num_input_tokens_seen": 5209325568, "step": 621 }, { "epoch": 0.311, "grad_norm": 0.5886808037757874, "learning_rate": 1.8274272684041537e-05, "loss": 1.3632, "num_input_tokens_seen": 5217714176, "step": 622 }, { "epoch": 0.3115, "grad_norm": 0.6511006355285645, "learning_rate": 1.8196039299182818e-05, "loss": 1.3691, "num_input_tokens_seen": 5226102784, "step": 623 }, { "epoch": 0.312, "grad_norm": 0.47680914402008057, "learning_rate": 1.8117833733629715e-05, "loss": 1.2702, "num_input_tokens_seen": 5234491392, "step": 624 }, { "epoch": 0.3125, "grad_norm": 0.5211230516433716, "learning_rate": 1.8039657193408788e-05, "loss": 1.4756, "num_input_tokens_seen": 5242880000, "step": 625 }, { "epoch": 0.313, "grad_norm": 0.6009602546691895, "learning_rate": 1.7961510884099005e-05, "loss": 1.5261, "num_input_tokens_seen": 5251268608, "step": 626 }, { "epoch": 0.3135, "grad_norm": 0.6021015644073486, "learning_rate": 1.7883396010813116e-05, "loss": 1.2861, "num_input_tokens_seen": 5259657216, "step": 627 }, { "epoch": 0.314, "grad_norm": 0.7084016799926758, "learning_rate": 1.7805313778179095e-05, "loss": 1.4946, "num_input_tokens_seen": 5268045824, "step": 628 }, { "epoch": 0.3145, "grad_norm": 0.49211904406547546, "learning_rate": 1.772726539032158e-05, "loss": 1.3503, "num_input_tokens_seen": 5276434432, "step": 629 }, { "epoch": 0.315, "grad_norm": 0.5996968150138855, "learning_rate": 1.764925205084325e-05, "loss": 1.3383, "num_input_tokens_seen": 5284823040, "step": 630 }, { "epoch": 0.3155, "grad_norm": 0.4948015809059143, "learning_rate": 1.7571274962806316e-05, "loss": 1.634, "num_input_tokens_seen": 5293211648, "step": 631 }, { "epoch": 0.316, "grad_norm": 0.5732881426811218, "learning_rate": 1.7493335328713913e-05, "loss": 1.3245, "num_input_tokens_seen": 5301600256, "step": 632 }, { "epoch": 0.3165, "grad_norm": 0.6721447706222534, "learning_rate": 1.741543435049165e-05, "loss": 1.4615, "num_input_tokens_seen": 5309988864, "step": 633 }, { "epoch": 0.317, "grad_norm": 0.5314796566963196, "learning_rate": 1.7337573229468958e-05, "loss": 1.3558, "num_input_tokens_seen": 5318377472, "step": 634 }, { "epoch": 0.3175, "grad_norm": 0.6996693015098572, "learning_rate": 1.7259753166360644e-05, "loss": 1.3847, "num_input_tokens_seen": 5326766080, "step": 635 }, { "epoch": 0.318, "grad_norm": 0.6030979156494141, "learning_rate": 1.7181975361248348e-05, "loss": 1.3559, "num_input_tokens_seen": 5335154688, "step": 636 }, { "epoch": 0.3185, "grad_norm": 0.5246545076370239, "learning_rate": 1.7104241013562045e-05, "loss": 1.3436, "num_input_tokens_seen": 5343543296, "step": 637 }, { "epoch": 0.319, "grad_norm": 0.4644118547439575, "learning_rate": 1.702655132206154e-05, "loss": 1.4038, "num_input_tokens_seen": 5351931904, "step": 638 }, { "epoch": 0.3195, "grad_norm": 0.5122376680374146, "learning_rate": 1.6948907484817985e-05, "loss": 1.4261, "num_input_tokens_seen": 5360320512, "step": 639 }, { "epoch": 0.32, "grad_norm": 0.5542669296264648, "learning_rate": 1.687131069919538e-05, "loss": 1.4136, "num_input_tokens_seen": 5368709120, "step": 640 }, { "epoch": 0.3205, "grad_norm": 0.5668465495109558, "learning_rate": 1.679376216183218e-05, "loss": 1.4643, "num_input_tokens_seen": 5377097728, "step": 641 }, { "epoch": 0.321, "grad_norm": 0.5116631388664246, "learning_rate": 1.6716263068622744e-05, "loss": 1.4002, "num_input_tokens_seen": 5385486336, "step": 642 }, { "epoch": 0.3215, "grad_norm": 0.4890478551387787, "learning_rate": 1.6638814614698965e-05, "loss": 1.3683, "num_input_tokens_seen": 5393874944, "step": 643 }, { "epoch": 0.322, "grad_norm": 0.5171346664428711, "learning_rate": 1.6561417994411808e-05, "loss": 1.5243, "num_input_tokens_seen": 5402263552, "step": 644 }, { "epoch": 0.3225, "grad_norm": 0.5781503915786743, "learning_rate": 1.648407440131291e-05, "loss": 1.2987, "num_input_tokens_seen": 5410652160, "step": 645 }, { "epoch": 0.323, "grad_norm": 0.5926806330680847, "learning_rate": 1.640678502813615e-05, "loss": 1.3718, "num_input_tokens_seen": 5419040768, "step": 646 }, { "epoch": 0.3235, "grad_norm": 0.4966941773891449, "learning_rate": 1.6329551066779278e-05, "loss": 1.4791, "num_input_tokens_seen": 5427429376, "step": 647 }, { "epoch": 0.324, "grad_norm": 0.48014533519744873, "learning_rate": 1.6252373708285505e-05, "loss": 1.3792, "num_input_tokens_seen": 5435817984, "step": 648 }, { "epoch": 0.3245, "grad_norm": 0.5070712566375732, "learning_rate": 1.6175254142825196e-05, "loss": 1.2988, "num_input_tokens_seen": 5444206592, "step": 649 }, { "epoch": 0.325, "grad_norm": 0.5408443808555603, "learning_rate": 1.609819355967744e-05, "loss": 1.2435, "num_input_tokens_seen": 5452595200, "step": 650 }, { "epoch": 0.3255, "grad_norm": 0.5190852284431458, "learning_rate": 1.602119314721175e-05, "loss": 1.4163, "num_input_tokens_seen": 5460983808, "step": 651 }, { "epoch": 0.326, "grad_norm": 0.45008084177970886, "learning_rate": 1.5944254092869756e-05, "loss": 1.4073, "num_input_tokens_seen": 5469372416, "step": 652 }, { "epoch": 0.3265, "grad_norm": 0.4805797040462494, "learning_rate": 1.5867377583146836e-05, "loss": 1.4555, "num_input_tokens_seen": 5477761024, "step": 653 }, { "epoch": 0.327, "grad_norm": 0.6190395355224609, "learning_rate": 1.579056480357389e-05, "loss": 1.3031, "num_input_tokens_seen": 5486149632, "step": 654 }, { "epoch": 0.3275, "grad_norm": 0.5672743320465088, "learning_rate": 1.571381693869899e-05, "loss": 1.3732, "num_input_tokens_seen": 5494538240, "step": 655 }, { "epoch": 0.328, "grad_norm": 0.4496897757053375, "learning_rate": 1.5637135172069155e-05, "loss": 1.4723, "num_input_tokens_seen": 5502926848, "step": 656 }, { "epoch": 0.3285, "grad_norm": 0.5514234900474548, "learning_rate": 1.5560520686212083e-05, "loss": 1.2985, "num_input_tokens_seen": 5511315456, "step": 657 }, { "epoch": 0.329, "grad_norm": 0.5014287829399109, "learning_rate": 1.548397466261793e-05, "loss": 1.5959, "num_input_tokens_seen": 5519704064, "step": 658 }, { "epoch": 0.3295, "grad_norm": 0.5554429888725281, "learning_rate": 1.5407498281721063e-05, "loss": 1.2987, "num_input_tokens_seen": 5528092672, "step": 659 }, { "epoch": 0.33, "grad_norm": 0.562203586101532, "learning_rate": 1.53310927228819e-05, "loss": 1.4116, "num_input_tokens_seen": 5536481280, "step": 660 }, { "epoch": 0.3305, "grad_norm": 0.4993477761745453, "learning_rate": 1.5254759164368644e-05, "loss": 1.4463, "num_input_tokens_seen": 5544869888, "step": 661 }, { "epoch": 0.331, "grad_norm": 0.5042338371276855, "learning_rate": 1.517849878333923e-05, "loss": 1.4157, "num_input_tokens_seen": 5553258496, "step": 662 }, { "epoch": 0.3315, "grad_norm": 0.5745116472244263, "learning_rate": 1.5102312755823053e-05, "loss": 1.4815, "num_input_tokens_seen": 5561647104, "step": 663 }, { "epoch": 0.332, "grad_norm": 0.46626922488212585, "learning_rate": 1.5026202256702909e-05, "loss": 1.5073, "num_input_tokens_seen": 5570035712, "step": 664 }, { "epoch": 0.3325, "grad_norm": 0.5493903756141663, "learning_rate": 1.4950168459696841e-05, "loss": 1.4008, "num_input_tokens_seen": 5578424320, "step": 665 }, { "epoch": 0.333, "grad_norm": 0.42233845591545105, "learning_rate": 1.4874212537340067e-05, "loss": 1.4189, "num_input_tokens_seen": 5586812928, "step": 666 }, { "epoch": 0.3335, "grad_norm": 0.46876290440559387, "learning_rate": 1.4798335660966869e-05, "loss": 1.5545, "num_input_tokens_seen": 5595201536, "step": 667 }, { "epoch": 0.334, "grad_norm": 0.4012518525123596, "learning_rate": 1.4722539000692548e-05, "loss": 1.5397, "num_input_tokens_seen": 5603590144, "step": 668 }, { "epoch": 0.3345, "grad_norm": 0.4659953713417053, "learning_rate": 1.4646823725395351e-05, "loss": 1.385, "num_input_tokens_seen": 5611978752, "step": 669 }, { "epoch": 0.335, "grad_norm": 0.39889195561408997, "learning_rate": 1.4571191002698517e-05, "loss": 1.4432, "num_input_tokens_seen": 5620367360, "step": 670 }, { "epoch": 0.3355, "grad_norm": 0.3971866965293884, "learning_rate": 1.4495641998952172e-05, "loss": 1.3255, "num_input_tokens_seen": 5628755968, "step": 671 }, { "epoch": 0.336, "grad_norm": 0.39519765973091125, "learning_rate": 1.4420177879215419e-05, "loss": 1.3468, "num_input_tokens_seen": 5637144576, "step": 672 }, { "epoch": 0.3365, "grad_norm": 0.4671727120876312, "learning_rate": 1.434479980723833e-05, "loss": 1.4167, "num_input_tokens_seen": 5645533184, "step": 673 }, { "epoch": 0.337, "grad_norm": 0.4683206379413605, "learning_rate": 1.4269508945444033e-05, "loss": 1.2581, "num_input_tokens_seen": 5653921792, "step": 674 }, { "epoch": 0.3375, "grad_norm": 0.4941425323486328, "learning_rate": 1.4194306454910757e-05, "loss": 1.2882, "num_input_tokens_seen": 5662310400, "step": 675 }, { "epoch": 0.338, "grad_norm": 0.3830873668193817, "learning_rate": 1.4119193495353925e-05, "loss": 1.3762, "num_input_tokens_seen": 5670699008, "step": 676 }, { "epoch": 0.3385, "grad_norm": 0.4463721513748169, "learning_rate": 1.40441712251083e-05, "loss": 1.5396, "num_input_tokens_seen": 5679087616, "step": 677 }, { "epoch": 0.339, "grad_norm": 0.45546117424964905, "learning_rate": 1.3969240801110088e-05, "loss": 1.1974, "num_input_tokens_seen": 5687476224, "step": 678 }, { "epoch": 0.3395, "grad_norm": 0.4943059980869293, "learning_rate": 1.3894403378879132e-05, "loss": 1.3352, "num_input_tokens_seen": 5695864832, "step": 679 }, { "epoch": 0.34, "grad_norm": 0.3816196918487549, "learning_rate": 1.3819660112501054e-05, "loss": 1.4715, "num_input_tokens_seen": 5704253440, "step": 680 }, { "epoch": 0.3405, "grad_norm": 0.4322736859321594, "learning_rate": 1.3745012154609492e-05, "loss": 1.4253, "num_input_tokens_seen": 5712642048, "step": 681 }, { "epoch": 0.341, "grad_norm": 0.42386630177497864, "learning_rate": 1.3670460656368278e-05, "loss": 1.4005, "num_input_tokens_seen": 5721030656, "step": 682 }, { "epoch": 0.3415, "grad_norm": 0.4334513247013092, "learning_rate": 1.3596006767453766e-05, "loss": 1.4121, "num_input_tokens_seen": 5729419264, "step": 683 }, { "epoch": 0.342, "grad_norm": 0.4680859446525574, "learning_rate": 1.3521651636037017e-05, "loss": 1.2901, "num_input_tokens_seen": 5737807872, "step": 684 }, { "epoch": 0.3425, "grad_norm": 0.404777467250824, "learning_rate": 1.3447396408766134e-05, "loss": 1.4667, "num_input_tokens_seen": 5746196480, "step": 685 }, { "epoch": 0.343, "grad_norm": 0.42837515473365784, "learning_rate": 1.3373242230748579e-05, "loss": 1.4082, "num_input_tokens_seen": 5754585088, "step": 686 }, { "epoch": 0.3435, "grad_norm": 0.408573716878891, "learning_rate": 1.3299190245533522e-05, "loss": 1.5449, "num_input_tokens_seen": 5762973696, "step": 687 }, { "epoch": 0.344, "grad_norm": 0.43913209438323975, "learning_rate": 1.3225241595094173e-05, "loss": 1.3952, "num_input_tokens_seen": 5771362304, "step": 688 }, { "epoch": 0.3445, "grad_norm": 0.38361233472824097, "learning_rate": 1.3151397419810207e-05, "loss": 1.3481, "num_input_tokens_seen": 5779750912, "step": 689 }, { "epoch": 0.345, "grad_norm": 0.44679775834083557, "learning_rate": 1.3077658858450137e-05, "loss": 1.3201, "num_input_tokens_seen": 5788139520, "step": 690 }, { "epoch": 0.3455, "grad_norm": 0.404319167137146, "learning_rate": 1.3004027048153826e-05, "loss": 1.4606, "num_input_tokens_seen": 5796528128, "step": 691 }, { "epoch": 0.346, "grad_norm": 0.40052586793899536, "learning_rate": 1.2930503124414862e-05, "loss": 1.4713, "num_input_tokens_seen": 5804916736, "step": 692 }, { "epoch": 0.3465, "grad_norm": 0.4191119968891144, "learning_rate": 1.2857088221063099e-05, "loss": 1.4651, "num_input_tokens_seen": 5813305344, "step": 693 }, { "epoch": 0.347, "grad_norm": 0.41156262159347534, "learning_rate": 1.2783783470247164e-05, "loss": 1.4181, "num_input_tokens_seen": 5821693952, "step": 694 }, { "epoch": 0.3475, "grad_norm": 0.4054810404777527, "learning_rate": 1.2710590002417008e-05, "loss": 1.3518, "num_input_tokens_seen": 5830082560, "step": 695 }, { "epoch": 0.348, "grad_norm": 0.5507049560546875, "learning_rate": 1.2637508946306443e-05, "loss": 1.3982, "num_input_tokens_seen": 5838471168, "step": 696 }, { "epoch": 0.3485, "grad_norm": 0.5741444826126099, "learning_rate": 1.2564541428915762e-05, "loss": 1.4619, "num_input_tokens_seen": 5846859776, "step": 697 }, { "epoch": 0.349, "grad_norm": 0.4412943422794342, "learning_rate": 1.2491688575494337e-05, "loss": 1.546, "num_input_tokens_seen": 5855248384, "step": 698 }, { "epoch": 0.3495, "grad_norm": 0.6128819584846497, "learning_rate": 1.2418951509523312e-05, "loss": 1.3792, "num_input_tokens_seen": 5863636992, "step": 699 }, { "epoch": 0.35, "grad_norm": 0.4005577266216278, "learning_rate": 1.2346331352698206e-05, "loss": 1.327, "num_input_tokens_seen": 5872025600, "step": 700 }, { "epoch": 0.3505, "grad_norm": 0.5048608183860779, "learning_rate": 1.2273829224911685e-05, "loss": 1.3723, "num_input_tokens_seen": 5880414208, "step": 701 }, { "epoch": 0.351, "grad_norm": 0.3869295120239258, "learning_rate": 1.2201446244236242e-05, "loss": 1.4289, "num_input_tokens_seen": 5888802816, "step": 702 }, { "epoch": 0.3515, "grad_norm": 0.4177543818950653, "learning_rate": 1.2129183526906971e-05, "loss": 1.5365, "num_input_tokens_seen": 5897191424, "step": 703 }, { "epoch": 0.352, "grad_norm": 0.3738914132118225, "learning_rate": 1.205704218730439e-05, "loss": 1.535, "num_input_tokens_seen": 5905580032, "step": 704 }, { "epoch": 0.3525, "grad_norm": 0.4541512429714203, "learning_rate": 1.1985023337937185e-05, "loss": 1.3511, "num_input_tokens_seen": 5913968640, "step": 705 }, { "epoch": 0.353, "grad_norm": 0.38867321610450745, "learning_rate": 1.1913128089425103e-05, "loss": 1.257, "num_input_tokens_seen": 5922357248, "step": 706 }, { "epoch": 0.3535, "grad_norm": 0.4386734962463379, "learning_rate": 1.1841357550481817e-05, "loss": 1.2996, "num_input_tokens_seen": 5930745856, "step": 707 }, { "epoch": 0.354, "grad_norm": 0.7539464831352234, "learning_rate": 1.1769712827897825e-05, "loss": 1.5229, "num_input_tokens_seen": 5939134464, "step": 708 }, { "epoch": 0.3545, "grad_norm": 0.4095211625099182, "learning_rate": 1.1698195026523379e-05, "loss": 1.4399, "num_input_tokens_seen": 5947523072, "step": 709 }, { "epoch": 0.355, "grad_norm": 0.43154534697532654, "learning_rate": 1.1626805249251444e-05, "loss": 1.583, "num_input_tokens_seen": 5955911680, "step": 710 }, { "epoch": 0.3555, "grad_norm": 0.38890644907951355, "learning_rate": 1.1555544597000693e-05, "loss": 1.3619, "num_input_tokens_seen": 5964300288, "step": 711 }, { "epoch": 0.356, "grad_norm": 0.42766648530960083, "learning_rate": 1.1484414168698547e-05, "loss": 1.3051, "num_input_tokens_seen": 5972688896, "step": 712 }, { "epoch": 0.3565, "grad_norm": 0.3693629801273346, "learning_rate": 1.1413415061264205e-05, "loss": 1.3839, "num_input_tokens_seen": 5981077504, "step": 713 }, { "epoch": 0.357, "grad_norm": 0.39556047320365906, "learning_rate": 1.134254836959173e-05, "loss": 1.3957, "num_input_tokens_seen": 5989466112, "step": 714 }, { "epoch": 0.3575, "grad_norm": 0.35693857073783875, "learning_rate": 1.1271815186533156e-05, "loss": 1.4229, "num_input_tokens_seen": 5997854720, "step": 715 }, { "epoch": 0.358, "grad_norm": 0.45855775475502014, "learning_rate": 1.1201216602881696e-05, "loss": 1.4924, "num_input_tokens_seen": 6006243328, "step": 716 }, { "epoch": 0.3585, "grad_norm": 0.46495652198791504, "learning_rate": 1.1130753707354836e-05, "loss": 1.412, "num_input_tokens_seen": 6014631936, "step": 717 }, { "epoch": 0.359, "grad_norm": 0.41153737902641296, "learning_rate": 1.106042758657758e-05, "loss": 1.2981, "num_input_tokens_seen": 6023020544, "step": 718 }, { "epoch": 0.3595, "grad_norm": 0.41452744603157043, "learning_rate": 1.0990239325065714e-05, "loss": 1.4342, "num_input_tokens_seen": 6031409152, "step": 719 }, { "epoch": 0.36, "grad_norm": 0.37425488233566284, "learning_rate": 1.0920190005209066e-05, "loss": 1.4595, "num_input_tokens_seen": 6039797760, "step": 720 }, { "epoch": 0.3605, "grad_norm": 0.36869382858276367, "learning_rate": 1.085028070725479e-05, "loss": 1.4808, "num_input_tokens_seen": 6048186368, "step": 721 }, { "epoch": 0.361, "grad_norm": 0.3328341841697693, "learning_rate": 1.0780512509290758e-05, "loss": 1.4267, "num_input_tokens_seen": 6056574976, "step": 722 }, { "epoch": 0.3615, "grad_norm": 0.39769792556762695, "learning_rate": 1.0710886487228868e-05, "loss": 1.4068, "num_input_tokens_seen": 6064963584, "step": 723 }, { "epoch": 0.362, "grad_norm": 0.368804931640625, "learning_rate": 1.0641403714788537e-05, "loss": 1.3572, "num_input_tokens_seen": 6073352192, "step": 724 }, { "epoch": 0.3625, "grad_norm": 0.4038560390472412, "learning_rate": 1.0572065263480046e-05, "loss": 1.4903, "num_input_tokens_seen": 6081740800, "step": 725 }, { "epoch": 0.363, "grad_norm": 0.36016955971717834, "learning_rate": 1.0502872202588113e-05, "loss": 1.3402, "num_input_tokens_seen": 6090129408, "step": 726 }, { "epoch": 0.3635, "grad_norm": 0.46069440245628357, "learning_rate": 1.043382559915532e-05, "loss": 1.3255, "num_input_tokens_seen": 6098518016, "step": 727 }, { "epoch": 0.364, "grad_norm": 0.39502382278442383, "learning_rate": 1.0364926517965693e-05, "loss": 1.3882, "num_input_tokens_seen": 6106906624, "step": 728 }, { "epoch": 0.3645, "grad_norm": 0.3514525890350342, "learning_rate": 1.0296176021528326e-05, "loss": 1.4047, "num_input_tokens_seen": 6115295232, "step": 729 }, { "epoch": 0.365, "grad_norm": 0.35697489976882935, "learning_rate": 1.0227575170060909e-05, "loss": 1.4461, "num_input_tokens_seen": 6123683840, "step": 730 }, { "epoch": 0.3655, "grad_norm": 0.35604503750801086, "learning_rate": 1.0159125021473421e-05, "loss": 1.4992, "num_input_tokens_seen": 6132072448, "step": 731 }, { "epoch": 0.366, "grad_norm": 0.3803131878376007, "learning_rate": 1.009082663135185e-05, "loss": 1.4753, "num_input_tokens_seen": 6140461056, "step": 732 }, { "epoch": 0.3665, "grad_norm": 0.38193321228027344, "learning_rate": 1.0022681052941856e-05, "loss": 1.356, "num_input_tokens_seen": 6148849664, "step": 733 }, { "epoch": 0.367, "grad_norm": 0.4219174087047577, "learning_rate": 9.95468933713255e-06, "loss": 1.4134, "num_input_tokens_seen": 6157238272, "step": 734 }, { "epoch": 0.3675, "grad_norm": 0.37221747636795044, "learning_rate": 9.886852532440312e-06, "loss": 1.3637, "num_input_tokens_seen": 6165626880, "step": 735 }, { "epoch": 0.368, "grad_norm": 0.471129834651947, "learning_rate": 9.819171684992575e-06, "loss": 1.4212, "num_input_tokens_seen": 6174015488, "step": 736 }, { "epoch": 0.3685, "grad_norm": 0.3800898790359497, "learning_rate": 9.751647838511747e-06, "loss": 1.434, "num_input_tokens_seen": 6182404096, "step": 737 }, { "epoch": 0.369, "grad_norm": 0.40310239791870117, "learning_rate": 9.684282034299053e-06, "loss": 1.4536, "num_input_tokens_seen": 6190792704, "step": 738 }, { "epoch": 0.3695, "grad_norm": 0.31282833218574524, "learning_rate": 9.61707531121855e-06, "loss": 1.3445, "num_input_tokens_seen": 6199181312, "step": 739 }, { "epoch": 0.37, "grad_norm": 0.4354435205459595, "learning_rate": 9.550028705681024e-06, "loss": 1.4359, "num_input_tokens_seen": 6207569920, "step": 740 }, { "epoch": 0.3705, "grad_norm": 0.4455682039260864, "learning_rate": 9.483143251628088e-06, "loss": 1.5311, "num_input_tokens_seen": 6215958528, "step": 741 }, { "epoch": 0.371, "grad_norm": 0.39577606320381165, "learning_rate": 9.416419980516192e-06, "loss": 1.4777, "num_input_tokens_seen": 6224347136, "step": 742 }, { "epoch": 0.3715, "grad_norm": 0.4234643578529358, "learning_rate": 9.349859921300704e-06, "loss": 1.4215, "num_input_tokens_seen": 6232735744, "step": 743 }, { "epoch": 0.372, "grad_norm": 0.40708428621292114, "learning_rate": 9.283464100420064e-06, "loss": 1.5666, "num_input_tokens_seen": 6241124352, "step": 744 }, { "epoch": 0.3725, "grad_norm": 0.4814571142196655, "learning_rate": 9.217233541779995e-06, "loss": 1.4099, "num_input_tokens_seen": 6249512960, "step": 745 }, { "epoch": 0.373, "grad_norm": 0.44198286533355713, "learning_rate": 9.15116926673763e-06, "loss": 1.4867, "num_input_tokens_seen": 6257901568, "step": 746 }, { "epoch": 0.3735, "grad_norm": 0.528195858001709, "learning_rate": 9.085272294085803e-06, "loss": 1.4261, "num_input_tokens_seen": 6266290176, "step": 747 }, { "epoch": 0.374, "grad_norm": 0.5089737176895142, "learning_rate": 9.019543640037363e-06, "loss": 1.394, "num_input_tokens_seen": 6274678784, "step": 748 }, { "epoch": 0.3745, "grad_norm": 0.42153167724609375, "learning_rate": 8.95398431820947e-06, "loss": 1.6003, "num_input_tokens_seen": 6283067392, "step": 749 }, { "epoch": 0.375, "grad_norm": 0.48496297001838684, "learning_rate": 8.888595339607961e-06, "loss": 1.3454, "num_input_tokens_seen": 6291456000, "step": 750 }, { "epoch": 0.3755, "grad_norm": 0.5419741868972778, "learning_rate": 8.82337771261177e-06, "loss": 1.415, "num_input_tokens_seen": 6299844608, "step": 751 }, { "epoch": 0.376, "grad_norm": 0.43826228380203247, "learning_rate": 8.758332442957394e-06, "loss": 1.3491, "num_input_tokens_seen": 6308233216, "step": 752 }, { "epoch": 0.3765, "grad_norm": 0.4173643887042999, "learning_rate": 8.693460533723346e-06, "loss": 1.5206, "num_input_tokens_seen": 6316621824, "step": 753 }, { "epoch": 0.377, "grad_norm": 0.3852335214614868, "learning_rate": 8.62876298531472e-06, "loss": 1.3818, "num_input_tokens_seen": 6325010432, "step": 754 }, { "epoch": 0.3775, "grad_norm": 1.0346558094024658, "learning_rate": 8.564240795447758e-06, "loss": 1.5792, "num_input_tokens_seen": 6333399040, "step": 755 }, { "epoch": 0.378, "grad_norm": 0.4322704076766968, "learning_rate": 8.499894959134436e-06, "loss": 1.2725, "num_input_tokens_seen": 6341787648, "step": 756 }, { "epoch": 0.3785, "grad_norm": 0.37390002608299255, "learning_rate": 8.435726468667135e-06, "loss": 1.4501, "num_input_tokens_seen": 6350176256, "step": 757 }, { "epoch": 0.379, "grad_norm": 0.4188503623008728, "learning_rate": 8.37173631360339e-06, "loss": 1.4103, "num_input_tokens_seen": 6358564864, "step": 758 }, { "epoch": 0.3795, "grad_norm": 0.5148934125900269, "learning_rate": 8.307925480750535e-06, "loss": 1.5353, "num_input_tokens_seen": 6366953472, "step": 759 }, { "epoch": 0.38, "grad_norm": 0.46682509779930115, "learning_rate": 8.24429495415054e-06, "loss": 1.4309, "num_input_tokens_seen": 6375342080, "step": 760 }, { "epoch": 0.3805, "grad_norm": 0.5225061774253845, "learning_rate": 8.180845715064851e-06, "loss": 1.4119, "num_input_tokens_seen": 6383730688, "step": 761 }, { "epoch": 0.381, "grad_norm": 0.3820711374282837, "learning_rate": 8.117578741959232e-06, "loss": 1.1499, "num_input_tokens_seen": 6392119296, "step": 762 }, { "epoch": 0.3815, "grad_norm": 0.38416001200675964, "learning_rate": 8.054495010488658e-06, "loss": 1.6055, "num_input_tokens_seen": 6400507904, "step": 763 }, { "epoch": 0.382, "grad_norm": 0.4727804958820343, "learning_rate": 7.991595493482323e-06, "loss": 1.2637, "num_input_tokens_seen": 6408896512, "step": 764 }, { "epoch": 0.3825, "grad_norm": 0.5752984881401062, "learning_rate": 7.928881160928572e-06, "loss": 1.511, "num_input_tokens_seen": 6417285120, "step": 765 }, { "epoch": 0.383, "grad_norm": 0.47958895564079285, "learning_rate": 7.86635297996001e-06, "loss": 1.3375, "num_input_tokens_seen": 6425673728, "step": 766 }, { "epoch": 0.3835, "grad_norm": 0.44161489605903625, "learning_rate": 7.804011914838524e-06, "loss": 1.5681, "num_input_tokens_seen": 6434062336, "step": 767 }, { "epoch": 0.384, "grad_norm": 0.39978259801864624, "learning_rate": 7.741858926940475e-06, "loss": 1.7071, "num_input_tokens_seen": 6442450944, "step": 768 }, { "epoch": 0.3845, "grad_norm": 0.45671382546424866, "learning_rate": 7.679894974741807e-06, "loss": 1.3448, "num_input_tokens_seen": 6450839552, "step": 769 }, { "epoch": 0.385, "grad_norm": 0.383653461933136, "learning_rate": 7.618121013803319e-06, "loss": 1.2981, "num_input_tokens_seen": 6459228160, "step": 770 }, { "epoch": 0.3855, "grad_norm": 0.537684977054596, "learning_rate": 7.556537996755919e-06, "loss": 1.2535, "num_input_tokens_seen": 6467616768, "step": 771 }, { "epoch": 0.386, "grad_norm": 0.36121198534965515, "learning_rate": 7.495146873285904e-06, "loss": 1.3337, "num_input_tokens_seen": 6476005376, "step": 772 }, { "epoch": 0.3865, "grad_norm": 0.35980767011642456, "learning_rate": 7.433948590120326e-06, "loss": 1.6477, "num_input_tokens_seen": 6484393984, "step": 773 }, { "epoch": 0.387, "grad_norm": 0.4171326756477356, "learning_rate": 7.3729440910124464e-06, "loss": 1.3981, "num_input_tokens_seen": 6492782592, "step": 774 }, { "epoch": 0.3875, "grad_norm": 0.3761611878871918, "learning_rate": 7.312134316727093e-06, "loss": 1.545, "num_input_tokens_seen": 6501171200, "step": 775 }, { "epoch": 0.388, "grad_norm": 0.37193554639816284, "learning_rate": 7.251520205026206e-06, "loss": 1.4064, "num_input_tokens_seen": 6509559808, "step": 776 }, { "epoch": 0.3885, "grad_norm": 0.3899398446083069, "learning_rate": 7.191102690654384e-06, "loss": 1.0782, "num_input_tokens_seen": 6517948416, "step": 777 }, { "epoch": 0.389, "grad_norm": 0.3811258375644684, "learning_rate": 7.130882705324422e-06, "loss": 1.5228, "num_input_tokens_seen": 6526337024, "step": 778 }, { "epoch": 0.3895, "grad_norm": 0.3311116695404053, "learning_rate": 7.070861177703006e-06, "loss": 1.4572, "num_input_tokens_seen": 6534725632, "step": 779 }, { "epoch": 0.39, "grad_norm": 0.3486502468585968, "learning_rate": 7.01103903339633e-06, "loss": 1.385, "num_input_tokens_seen": 6543114240, "step": 780 }, { "epoch": 0.3905, "grad_norm": 0.38137882947921753, "learning_rate": 6.95141719493587e-06, "loss": 1.3502, "num_input_tokens_seen": 6551502848, "step": 781 }, { "epoch": 0.391, "grad_norm": 0.33792567253112793, "learning_rate": 6.891996581764124e-06, "loss": 1.5721, "num_input_tokens_seen": 6559891456, "step": 782 }, { "epoch": 0.3915, "grad_norm": 0.3679531216621399, "learning_rate": 6.832778110220457e-06, "loss": 1.4466, "num_input_tokens_seen": 6568280064, "step": 783 }, { "epoch": 0.392, "grad_norm": 0.3734787702560425, "learning_rate": 6.773762693526967e-06, "loss": 1.3574, "num_input_tokens_seen": 6576668672, "step": 784 }, { "epoch": 0.3925, "grad_norm": 0.3447019159793854, "learning_rate": 6.7149512417743725e-06, "loss": 1.3066, "num_input_tokens_seen": 6585057280, "step": 785 }, { "epoch": 0.393, "grad_norm": 0.44054052233695984, "learning_rate": 6.656344661908003e-06, "loss": 1.2998, "num_input_tokens_seen": 6593445888, "step": 786 }, { "epoch": 0.3935, "grad_norm": 0.3736647963523865, "learning_rate": 6.597943857713849e-06, "loss": 1.2595, "num_input_tokens_seen": 6601834496, "step": 787 }, { "epoch": 0.394, "grad_norm": 0.3501863479614258, "learning_rate": 6.539749729804539e-06, "loss": 1.4001, "num_input_tokens_seen": 6610223104, "step": 788 }, { "epoch": 0.3945, "grad_norm": 0.3729817271232605, "learning_rate": 6.4817631756055086e-06, "loss": 1.4816, "num_input_tokens_seen": 6618611712, "step": 789 }, { "epoch": 0.395, "grad_norm": 0.3676244616508484, "learning_rate": 6.423985089341165e-06, "loss": 1.2956, "num_input_tokens_seen": 6627000320, "step": 790 }, { "epoch": 0.3955, "grad_norm": 0.33653560280799866, "learning_rate": 6.366416362021077e-06, "loss": 1.3932, "num_input_tokens_seen": 6635388928, "step": 791 }, { "epoch": 0.396, "grad_norm": 4.367507457733154, "learning_rate": 6.3090578814262256e-06, "loss": 1.3941, "num_input_tokens_seen": 6643777536, "step": 792 }, { "epoch": 0.3965, "grad_norm": 0.38902419805526733, "learning_rate": 6.251910532095349e-06, "loss": 1.3425, "num_input_tokens_seen": 6652166144, "step": 793 }, { "epoch": 0.397, "grad_norm": 0.3469020426273346, "learning_rate": 6.1949751953112565e-06, "loss": 1.4045, "num_input_tokens_seen": 6660554752, "step": 794 }, { "epoch": 0.3975, "grad_norm": 0.32471761107444763, "learning_rate": 6.138252749087286e-06, "loss": 1.4689, "num_input_tokens_seen": 6668943360, "step": 795 }, { "epoch": 0.398, "grad_norm": 0.3643701672554016, "learning_rate": 6.081744068153714e-06, "loss": 1.4039, "num_input_tokens_seen": 6677331968, "step": 796 }, { "epoch": 0.3985, "grad_norm": 0.3417051434516907, "learning_rate": 6.02545002394432e-06, "loss": 1.3409, "num_input_tokens_seen": 6685720576, "step": 797 }, { "epoch": 0.399, "grad_norm": 0.29954588413238525, "learning_rate": 5.969371484582887e-06, "loss": 1.3717, "num_input_tokens_seen": 6694109184, "step": 798 }, { "epoch": 0.3995, "grad_norm": 0.3241947889328003, "learning_rate": 5.913509314869874e-06, "loss": 1.4585, "num_input_tokens_seen": 6702497792, "step": 799 }, { "epoch": 0.4, "grad_norm": 0.332887202501297, "learning_rate": 5.857864376269051e-06, "loss": 1.3768, "num_input_tokens_seen": 6710886400, "step": 800 }, { "epoch": 0.4005, "grad_norm": 0.42092156410217285, "learning_rate": 5.802437526894198e-06, "loss": 1.5152, "num_input_tokens_seen": 6719275008, "step": 801 }, { "epoch": 0.401, "grad_norm": 0.3968624174594879, "learning_rate": 5.747229621495893e-06, "loss": 1.2536, "num_input_tokens_seen": 6727663616, "step": 802 }, { "epoch": 0.4015, "grad_norm": 0.3537593483924866, "learning_rate": 5.692241511448342e-06, "loss": 1.4378, "num_input_tokens_seen": 6736052224, "step": 803 }, { "epoch": 0.402, "grad_norm": 0.3879539966583252, "learning_rate": 5.637474044736227e-06, "loss": 1.3811, "num_input_tokens_seen": 6744440832, "step": 804 }, { "epoch": 0.4025, "grad_norm": 0.3723753094673157, "learning_rate": 5.582928065941624e-06, "loss": 1.3662, "num_input_tokens_seen": 6752829440, "step": 805 }, { "epoch": 0.403, "grad_norm": 0.3457501530647278, "learning_rate": 5.528604416231016e-06, "loss": 1.3809, "num_input_tokens_seen": 6761218048, "step": 806 }, { "epoch": 0.4035, "grad_norm": 0.37048009037971497, "learning_rate": 5.474503933342272e-06, "loss": 1.5293, "num_input_tokens_seen": 6769606656, "step": 807 }, { "epoch": 0.404, "grad_norm": 0.3256695866584778, "learning_rate": 5.4206274515717735e-06, "loss": 1.4058, "num_input_tokens_seen": 6777995264, "step": 808 }, { "epoch": 0.4045, "grad_norm": 0.3388415575027466, "learning_rate": 5.366975801761507e-06, "loss": 1.6318, "num_input_tokens_seen": 6786383872, "step": 809 }, { "epoch": 0.405, "grad_norm": 0.3703429698944092, "learning_rate": 5.313549811286294e-06, "loss": 1.38, "num_input_tokens_seen": 6794772480, "step": 810 }, { "epoch": 0.4055, "grad_norm": 0.34759944677352905, "learning_rate": 5.260350304040987e-06, "loss": 1.3529, "num_input_tokens_seen": 6803161088, "step": 811 }, { "epoch": 0.406, "grad_norm": 2.019902229309082, "learning_rate": 5.207378100427804e-06, "loss": 1.5112, "num_input_tokens_seen": 6811549696, "step": 812 }, { "epoch": 0.4065, "grad_norm": 0.41291892528533936, "learning_rate": 5.154634017343662e-06, "loss": 1.5495, "num_input_tokens_seen": 6819938304, "step": 813 }, { "epoch": 0.407, "grad_norm": 0.3619902729988098, "learning_rate": 5.102118868167565e-06, "loss": 1.346, "num_input_tokens_seen": 6828326912, "step": 814 }, { "epoch": 0.4075, "grad_norm": 0.30278557538986206, "learning_rate": 5.049833462748061e-06, "loss": 1.3187, "num_input_tokens_seen": 6836715520, "step": 815 }, { "epoch": 0.408, "grad_norm": 0.3572129011154175, "learning_rate": 4.997778607390809e-06, "loss": 1.3678, "num_input_tokens_seen": 6845104128, "step": 816 }, { "epoch": 0.4085, "grad_norm": 0.35668709874153137, "learning_rate": 4.945955104846061e-06, "loss": 1.3922, "num_input_tokens_seen": 6853492736, "step": 817 }, { "epoch": 0.409, "grad_norm": 0.3320825397968292, "learning_rate": 4.89436375429633e-06, "loss": 1.5999, "num_input_tokens_seen": 6861881344, "step": 818 }, { "epoch": 0.4095, "grad_norm": 0.3514334261417389, "learning_rate": 4.843005351344065e-06, "loss": 1.3739, "num_input_tokens_seen": 6870269952, "step": 819 }, { "epoch": 0.41, "grad_norm": 0.30500420928001404, "learning_rate": 4.791880687999382e-06, "loss": 1.5723, "num_input_tokens_seen": 6878658560, "step": 820 }, { "epoch": 0.4105, "grad_norm": 0.331772118806839, "learning_rate": 4.740990552667823e-06, "loss": 1.3552, "num_input_tokens_seen": 6887047168, "step": 821 }, { "epoch": 0.411, "grad_norm": 0.2891063988208771, "learning_rate": 4.6903357301382405e-06, "loss": 1.3274, "num_input_tokens_seen": 6895435776, "step": 822 }, { "epoch": 0.4115, "grad_norm": 0.31751158833503723, "learning_rate": 4.639917001570644e-06, "loss": 1.2424, "num_input_tokens_seen": 6903824384, "step": 823 }, { "epoch": 0.412, "grad_norm": 0.48061904311180115, "learning_rate": 4.589735144484217e-06, "loss": 1.296, "num_input_tokens_seen": 6912212992, "step": 824 }, { "epoch": 0.4125, "grad_norm": 0.31406256556510925, "learning_rate": 4.53979093274526e-06, "loss": 1.4939, "num_input_tokens_seen": 6920601600, "step": 825 }, { "epoch": 0.413, "grad_norm": 0.369199275970459, "learning_rate": 4.490085136555313e-06, "loss": 1.45, "num_input_tokens_seen": 6928990208, "step": 826 }, { "epoch": 0.4135, "grad_norm": 0.2938808500766754, "learning_rate": 4.440618522439237e-06, "loss": 1.3813, "num_input_tokens_seen": 6937378816, "step": 827 }, { "epoch": 0.414, "grad_norm": 0.319866418838501, "learning_rate": 4.391391853233404e-06, "loss": 1.3415, "num_input_tokens_seen": 6945767424, "step": 828 }, { "epoch": 0.4145, "grad_norm": 0.33815205097198486, "learning_rate": 4.342405888073971e-06, "loss": 1.5604, "num_input_tokens_seen": 6954156032, "step": 829 }, { "epoch": 0.415, "grad_norm": 0.32216718792915344, "learning_rate": 4.293661382385106e-06, "loss": 1.4807, "num_input_tokens_seen": 6962544640, "step": 830 }, { "epoch": 0.4155, "grad_norm": 0.2970641553401947, "learning_rate": 4.245159087867383e-06, "loss": 1.4238, "num_input_tokens_seen": 6970933248, "step": 831 }, { "epoch": 0.416, "grad_norm": 0.2760828137397766, "learning_rate": 4.196899752486192e-06, "loss": 1.3684, "num_input_tokens_seen": 6979321856, "step": 832 }, { "epoch": 0.4165, "grad_norm": 0.3223128616809845, "learning_rate": 4.148884120460186e-06, "loss": 1.5162, "num_input_tokens_seen": 6987710464, "step": 833 }, { "epoch": 0.417, "grad_norm": 0.31736284494400024, "learning_rate": 4.1011129322498e-06, "loss": 1.2987, "num_input_tokens_seen": 6996099072, "step": 834 }, { "epoch": 0.4175, "grad_norm": 0.3846115171909332, "learning_rate": 4.05358692454586e-06, "loss": 1.5683, "num_input_tokens_seen": 7004487680, "step": 835 }, { "epoch": 0.418, "grad_norm": 0.3098331093788147, "learning_rate": 4.006306830258189e-06, "loss": 1.4323, "num_input_tokens_seen": 7012876288, "step": 836 }, { "epoch": 0.4185, "grad_norm": 0.3555186986923218, "learning_rate": 3.9592733785043405e-06, "loss": 1.2484, "num_input_tokens_seen": 7021264896, "step": 837 }, { "epoch": 0.419, "grad_norm": 0.3833989202976227, "learning_rate": 3.91248729459831e-06, "loss": 1.3651, "num_input_tokens_seen": 7029653504, "step": 838 }, { "epoch": 0.4195, "grad_norm": 0.3087936043739319, "learning_rate": 3.865949300039404e-06, "loss": 1.4754, "num_input_tokens_seen": 7038042112, "step": 839 }, { "epoch": 0.42, "grad_norm": 0.3653687536716461, "learning_rate": 3.819660112501053e-06, "loss": 1.3972, "num_input_tokens_seen": 7046430720, "step": 840 }, { "epoch": 0.4205, "grad_norm": 0.3193002939224243, "learning_rate": 3.773620445819799e-06, "loss": 1.1439, "num_input_tokens_seen": 7054819328, "step": 841 }, { "epoch": 0.421, "grad_norm": 0.3219449520111084, "learning_rate": 3.727831009984262e-06, "loss": 1.473, "num_input_tokens_seen": 7063207936, "step": 842 }, { "epoch": 0.4215, "grad_norm": 0.3474680185317993, "learning_rate": 3.682292511124179e-06, "loss": 1.3857, "num_input_tokens_seen": 7071596544, "step": 843 }, { "epoch": 0.422, "grad_norm": 0.2984297275543213, "learning_rate": 3.637005651499528e-06, "loss": 1.3787, "num_input_tokens_seen": 7079985152, "step": 844 }, { "epoch": 0.4225, "grad_norm": 0.40929341316223145, "learning_rate": 3.5919711294897285e-06, "loss": 1.52, "num_input_tokens_seen": 7088373760, "step": 845 }, { "epoch": 0.423, "grad_norm": 0.2667399048805237, "learning_rate": 3.5471896395828064e-06, "loss": 1.2748, "num_input_tokens_seen": 7096762368, "step": 846 }, { "epoch": 0.4235, "grad_norm": 0.29983726143836975, "learning_rate": 3.502661872364732e-06, "loss": 1.2588, "num_input_tokens_seen": 7105150976, "step": 847 }, { "epoch": 0.424, "grad_norm": 0.336478590965271, "learning_rate": 3.4583885145087613e-06, "loss": 1.3578, "num_input_tokens_seen": 7113539584, "step": 848 }, { "epoch": 0.4245, "grad_norm": 0.3049528896808624, "learning_rate": 3.414370248764849e-06, "loss": 1.426, "num_input_tokens_seen": 7121928192, "step": 849 }, { "epoch": 0.425, "grad_norm": 0.31462106108665466, "learning_rate": 3.3706077539490933e-06, "loss": 1.4603, "num_input_tokens_seen": 7130316800, "step": 850 }, { "epoch": 0.4255, "grad_norm": 0.31278783082962036, "learning_rate": 3.327101704933313e-06, "loss": 1.2372, "num_input_tokens_seen": 7138705408, "step": 851 }, { "epoch": 0.426, "grad_norm": 0.295612096786499, "learning_rate": 3.2838527726345994e-06, "loss": 1.4237, "num_input_tokens_seen": 7147094016, "step": 852 }, { "epoch": 0.4265, "grad_norm": 0.3268527686595917, "learning_rate": 3.240861624004983e-06, "loss": 1.3505, "num_input_tokens_seen": 7155482624, "step": 853 }, { "epoch": 0.427, "grad_norm": 0.32720014452934265, "learning_rate": 3.198128922021162e-06, "loss": 1.4295, "num_input_tokens_seen": 7163871232, "step": 854 }, { "epoch": 0.4275, "grad_norm": 0.37289947271347046, "learning_rate": 3.155655325674272e-06, "loss": 1.3217, "num_input_tokens_seen": 7172259840, "step": 855 }, { "epoch": 0.428, "grad_norm": 0.3099896013736725, "learning_rate": 3.1134414899597033e-06, "loss": 1.3894, "num_input_tokens_seen": 7180648448, "step": 856 }, { "epoch": 0.4285, "grad_norm": 0.3067796230316162, "learning_rate": 3.0714880658670165e-06, "loss": 1.5158, "num_input_tokens_seen": 7189037056, "step": 857 }, { "epoch": 0.429, "grad_norm": 0.37800902128219604, "learning_rate": 3.0297957003699284e-06, "loss": 1.3586, "num_input_tokens_seen": 7197425664, "step": 858 }, { "epoch": 0.4295, "grad_norm": 0.27667638659477234, "learning_rate": 2.9883650364162784e-06, "loss": 1.3589, "num_input_tokens_seen": 7205814272, "step": 859 }, { "epoch": 0.43, "grad_norm": 0.3086688220500946, "learning_rate": 2.947196712918157e-06, "loss": 1.3992, "num_input_tokens_seen": 7214202880, "step": 860 }, { "epoch": 0.4305, "grad_norm": 0.30200591683387756, "learning_rate": 2.906291364742042e-06, "loss": 1.5816, "num_input_tokens_seen": 7222591488, "step": 861 }, { "epoch": 0.431, "grad_norm": 0.2709788680076599, "learning_rate": 2.8656496226990092e-06, "loss": 1.3399, "num_input_tokens_seen": 7230980096, "step": 862 }, { "epoch": 0.4315, "grad_norm": 0.2946958541870117, "learning_rate": 2.8252721135349892e-06, "loss": 1.2181, "num_input_tokens_seen": 7239368704, "step": 863 }, { "epoch": 0.432, "grad_norm": 0.2847934663295746, "learning_rate": 2.7851594599211297e-06, "loss": 1.4519, "num_input_tokens_seen": 7247757312, "step": 864 }, { "epoch": 0.4325, "grad_norm": 0.32021042704582214, "learning_rate": 2.7453122804441636e-06, "loss": 1.3525, "num_input_tokens_seen": 7256145920, "step": 865 }, { "epoch": 0.433, "grad_norm": 0.3354461193084717, "learning_rate": 2.705731189596901e-06, "loss": 1.3606, "num_input_tokens_seen": 7264534528, "step": 866 }, { "epoch": 0.4335, "grad_norm": 0.2807963192462921, "learning_rate": 2.6664167977687182e-06, "loss": 1.2865, "num_input_tokens_seen": 7272923136, "step": 867 }, { "epoch": 0.434, "grad_norm": 0.3701249063014984, "learning_rate": 2.6273697112361786e-06, "loss": 1.3952, "num_input_tokens_seen": 7281311744, "step": 868 }, { "epoch": 0.4345, "grad_norm": 0.26810991764068604, "learning_rate": 2.588590532153652e-06, "loss": 1.237, "num_input_tokens_seen": 7289700352, "step": 869 }, { "epoch": 0.435, "grad_norm": 0.3003408908843994, "learning_rate": 2.550079858544057e-06, "loss": 1.4199, "num_input_tokens_seen": 7298088960, "step": 870 }, { "epoch": 0.4355, "grad_norm": 0.30752259492874146, "learning_rate": 2.511838284289625e-06, "loss": 1.4534, "num_input_tokens_seen": 7306477568, "step": 871 }, { "epoch": 0.436, "grad_norm": 0.27997586131095886, "learning_rate": 2.473866399122733e-06, "loss": 1.3808, "num_input_tokens_seen": 7314866176, "step": 872 }, { "epoch": 0.4365, "grad_norm": 0.29734259843826294, "learning_rate": 2.436164788616815e-06, "loss": 1.3357, "num_input_tokens_seen": 7323254784, "step": 873 }, { "epoch": 0.437, "grad_norm": 0.2851868271827698, "learning_rate": 2.398734034177361e-06, "loss": 1.487, "num_input_tokens_seen": 7331643392, "step": 874 }, { "epoch": 0.4375, "grad_norm": 0.26372966170310974, "learning_rate": 2.3615747130329013e-06, "loss": 1.3459, "num_input_tokens_seen": 7340032000, "step": 875 }, { "epoch": 0.438, "grad_norm": 0.2983658015727997, "learning_rate": 2.324687398226131e-06, "loss": 1.2504, "num_input_tokens_seen": 7348420608, "step": 876 }, { "epoch": 0.4385, "grad_norm": 0.2787192165851593, "learning_rate": 2.288072658605087e-06, "loss": 1.438, "num_input_tokens_seen": 7356809216, "step": 877 }, { "epoch": 0.439, "grad_norm": 0.2981388568878174, "learning_rate": 2.2517310588143372e-06, "loss": 1.4062, "num_input_tokens_seen": 7365197824, "step": 878 }, { "epoch": 0.4395, "grad_norm": 0.2956690788269043, "learning_rate": 2.215663159286314e-06, "loss": 1.3157, "num_input_tokens_seen": 7373586432, "step": 879 }, { "epoch": 0.44, "grad_norm": 0.2586216330528259, "learning_rate": 2.1798695162326444e-06, "loss": 1.4289, "num_input_tokens_seen": 7381975040, "step": 880 }, { "epoch": 0.4405, "grad_norm": 0.3166693449020386, "learning_rate": 2.144350681635585e-06, "loss": 1.3983, "num_input_tokens_seen": 7390363648, "step": 881 }, { "epoch": 0.441, "grad_norm": 0.29220932722091675, "learning_rate": 2.1091072032395e-06, "loss": 1.5127, "num_input_tokens_seen": 7398752256, "step": 882 }, { "epoch": 0.4415, "grad_norm": 0.29734739661216736, "learning_rate": 2.0741396245424263e-06, "loss": 1.393, "num_input_tokens_seen": 7407140864, "step": 883 }, { "epoch": 0.442, "grad_norm": 0.2723188102245331, "learning_rate": 2.0394484847876894e-06, "loss": 1.4143, "num_input_tokens_seen": 7415529472, "step": 884 }, { "epoch": 0.4425, "grad_norm": 0.3048396110534668, "learning_rate": 2.0050343189555743e-06, "loss": 1.4618, "num_input_tokens_seen": 7423918080, "step": 885 }, { "epoch": 0.443, "grad_norm": 0.2891803979873657, "learning_rate": 1.970897657755084e-06, "loss": 1.5123, "num_input_tokens_seen": 7432306688, "step": 886 }, { "epoch": 0.4435, "grad_norm": 0.3083032965660095, "learning_rate": 1.937039027615779e-06, "loss": 1.3842, "num_input_tokens_seen": 7440695296, "step": 887 }, { "epoch": 0.444, "grad_norm": 0.2674204111099243, "learning_rate": 1.903458950679613e-06, "loss": 1.4032, "num_input_tokens_seen": 7449083904, "step": 888 }, { "epoch": 0.4445, "grad_norm": 0.3049355149269104, "learning_rate": 1.8701579447929076e-06, "loss": 1.5021, "num_input_tokens_seen": 7457472512, "step": 889 }, { "epoch": 0.445, "grad_norm": 0.30539655685424805, "learning_rate": 1.837136523498373e-06, "loss": 1.5079, "num_input_tokens_seen": 7465861120, "step": 890 }, { "epoch": 0.4455, "grad_norm": 0.25622060894966125, "learning_rate": 1.80439519602718e-06, "loss": 1.3642, "num_input_tokens_seen": 7474249728, "step": 891 }, { "epoch": 0.446, "grad_norm": 0.2763841450214386, "learning_rate": 1.7719344672910942e-06, "loss": 1.3546, "num_input_tokens_seen": 7482638336, "step": 892 }, { "epoch": 0.4465, "grad_norm": 0.27978193759918213, "learning_rate": 1.7397548378747142e-06, "loss": 1.327, "num_input_tokens_seen": 7491026944, "step": 893 }, { "epoch": 0.447, "grad_norm": 0.27141082286834717, "learning_rate": 1.7078568040277276e-06, "loss": 1.3473, "num_input_tokens_seen": 7499415552, "step": 894 }, { "epoch": 0.4475, "grad_norm": 0.25104501843452454, "learning_rate": 1.676240857657283e-06, "loss": 1.3926, "num_input_tokens_seen": 7507804160, "step": 895 }, { "epoch": 0.448, "grad_norm": 0.29892927408218384, "learning_rate": 1.6449074863203773e-06, "loss": 1.3195, "num_input_tokens_seen": 7516192768, "step": 896 }, { "epoch": 0.4485, "grad_norm": 0.28884321451187134, "learning_rate": 1.6138571732163643e-06, "loss": 1.5636, "num_input_tokens_seen": 7524581376, "step": 897 }, { "epoch": 0.449, "grad_norm": 0.3087320327758789, "learning_rate": 1.5830903971794765e-06, "loss": 1.324, "num_input_tokens_seen": 7532969984, "step": 898 }, { "epoch": 0.4495, "grad_norm": 0.318194180727005, "learning_rate": 1.5526076326714635e-06, "loss": 1.4597, "num_input_tokens_seen": 7541358592, "step": 899 }, { "epoch": 0.45, "grad_norm": 0.26725178956985474, "learning_rate": 1.5224093497742654e-06, "loss": 1.3111, "num_input_tokens_seen": 7549747200, "step": 900 }, { "epoch": 0.4505, "grad_norm": 0.3461545705795288, "learning_rate": 1.4924960141827605e-06, "loss": 1.3402, "num_input_tokens_seen": 7558135808, "step": 901 }, { "epoch": 0.451, "grad_norm": 0.3121661841869354, "learning_rate": 1.4628680871975842e-06, "loss": 1.3725, "num_input_tokens_seen": 7566524416, "step": 902 }, { "epoch": 0.4515, "grad_norm": 0.3888826370239258, "learning_rate": 1.4335260257180262e-06, "loss": 1.2918, "num_input_tokens_seen": 7574913024, "step": 903 }, { "epoch": 0.452, "grad_norm": 0.26517921686172485, "learning_rate": 1.4044702822349731e-06, "loss": 1.2748, "num_input_tokens_seen": 7583301632, "step": 904 }, { "epoch": 0.4525, "grad_norm": 0.26547905802726746, "learning_rate": 1.3757013048239287e-06, "loss": 1.4139, "num_input_tokens_seen": 7591690240, "step": 905 }, { "epoch": 0.453, "grad_norm": 0.2867394685745239, "learning_rate": 1.3472195371381202e-06, "loss": 1.379, "num_input_tokens_seen": 7600078848, "step": 906 }, { "epoch": 0.4535, "grad_norm": 0.25690674781799316, "learning_rate": 1.3190254184016294e-06, "loss": 1.4272, "num_input_tokens_seen": 7608467456, "step": 907 }, { "epoch": 0.454, "grad_norm": 0.2735510468482971, "learning_rate": 1.2911193834026548e-06, "loss": 1.4282, "num_input_tokens_seen": 7616856064, "step": 908 }, { "epoch": 0.4545, "grad_norm": 0.2754003703594208, "learning_rate": 1.2635018624867712e-06, "loss": 1.4639, "num_input_tokens_seen": 7625244672, "step": 909 }, { "epoch": 0.455, "grad_norm": 0.26509949564933777, "learning_rate": 1.236173281550319e-06, "loss": 1.269, "num_input_tokens_seen": 7633633280, "step": 910 }, { "epoch": 0.4555, "grad_norm": 0.26368507742881775, "learning_rate": 1.209134062033821e-06, "loss": 1.3834, "num_input_tokens_seen": 7642021888, "step": 911 }, { "epoch": 0.456, "grad_norm": 0.27090704441070557, "learning_rate": 1.182384620915491e-06, "loss": 1.3751, "num_input_tokens_seen": 7650410496, "step": 912 }, { "epoch": 0.4565, "grad_norm": 0.30463507771492004, "learning_rate": 1.1559253707048046e-06, "loss": 1.4265, "num_input_tokens_seen": 7658799104, "step": 913 }, { "epoch": 0.457, "grad_norm": 0.28881165385246277, "learning_rate": 1.1297567194361303e-06, "loss": 1.2824, "num_input_tokens_seen": 7667187712, "step": 914 }, { "epoch": 0.4575, "grad_norm": 0.2604656517505646, "learning_rate": 1.103879070662439e-06, "loss": 1.3111, "num_input_tokens_seen": 7675576320, "step": 915 }, { "epoch": 0.458, "grad_norm": 0.2724970579147339, "learning_rate": 1.0782928234490941e-06, "loss": 1.2933, "num_input_tokens_seen": 7683964928, "step": 916 }, { "epoch": 0.4585, "grad_norm": 0.28571322560310364, "learning_rate": 1.0529983723676751e-06, "loss": 1.4072, "num_input_tokens_seen": 7692353536, "step": 917 }, { "epoch": 0.459, "grad_norm": 0.25053897500038147, "learning_rate": 1.027996107489908e-06, "loss": 1.3556, "num_input_tokens_seen": 7700742144, "step": 918 }, { "epoch": 0.4595, "grad_norm": 0.24220338463783264, "learning_rate": 1.0032864143816456e-06, "loss": 1.2246, "num_input_tokens_seen": 7709130752, "step": 919 }, { "epoch": 0.46, "grad_norm": 0.25879448652267456, "learning_rate": 9.788696740969295e-07, "loss": 1.3957, "num_input_tokens_seen": 7717519360, "step": 920 }, { "epoch": 0.4605, "grad_norm": 0.26545363664627075, "learning_rate": 9.547462631720906e-07, "loss": 1.4378, "num_input_tokens_seen": 7725907968, "step": 921 }, { "epoch": 0.461, "grad_norm": 0.2611714005470276, "learning_rate": 9.30916553619976e-07, "loss": 1.398, "num_input_tokens_seen": 7734296576, "step": 922 }, { "epoch": 0.4615, "grad_norm": 0.31541547179222107, "learning_rate": 9.073809129241784e-07, "loss": 1.3662, "num_input_tokens_seen": 7742685184, "step": 923 }, { "epoch": 0.462, "grad_norm": 0.25709474086761475, "learning_rate": 8.841397040333976e-07, "loss": 1.2697, "num_input_tokens_seen": 7751073792, "step": 924 }, { "epoch": 0.4625, "grad_norm": 0.3479345738887787, "learning_rate": 8.611932853558236e-07, "loss": 1.4419, "num_input_tokens_seen": 7759462400, "step": 925 }, { "epoch": 0.463, "grad_norm": 0.2671267092227936, "learning_rate": 8.38542010753618e-07, "loss": 1.366, "num_input_tokens_seen": 7767851008, "step": 926 }, { "epoch": 0.4635, "grad_norm": 0.25018930435180664, "learning_rate": 8.161862295374567e-07, "loss": 1.5786, "num_input_tokens_seen": 7776239616, "step": 927 }, { "epoch": 0.464, "grad_norm": 0.2930094003677368, "learning_rate": 7.941262864611387e-07, "loss": 1.321, "num_input_tokens_seen": 7784628224, "step": 928 }, { "epoch": 0.4645, "grad_norm": 0.23606263101100922, "learning_rate": 7.723625217162811e-07, "loss": 1.1955, "num_input_tokens_seen": 7793016832, "step": 929 }, { "epoch": 0.465, "grad_norm": 0.23684702813625336, "learning_rate": 7.508952709270567e-07, "loss": 1.3062, "num_input_tokens_seen": 7801405440, "step": 930 }, { "epoch": 0.4655, "grad_norm": 0.2372099906206131, "learning_rate": 7.29724865145025e-07, "loss": 1.2958, "num_input_tokens_seen": 7809794048, "step": 931 }, { "epoch": 0.466, "grad_norm": 0.249996617436409, "learning_rate": 7.088516308440386e-07, "loss": 1.3162, "num_input_tokens_seen": 7818182656, "step": 932 }, { "epoch": 0.4665, "grad_norm": 0.4657593369483948, "learning_rate": 6.882758899151886e-07, "loss": 1.376, "num_input_tokens_seen": 7826571264, "step": 933 }, { "epoch": 0.467, "grad_norm": 0.24294114112854004, "learning_rate": 6.679979596618546e-07, "loss": 1.5417, "num_input_tokens_seen": 7834959872, "step": 934 }, { "epoch": 0.4675, "grad_norm": 0.2363034188747406, "learning_rate": 6.480181527948049e-07, "loss": 1.3153, "num_input_tokens_seen": 7843348480, "step": 935 }, { "epoch": 0.468, "grad_norm": 0.25250643491744995, "learning_rate": 6.283367774273785e-07, "loss": 1.5223, "num_input_tokens_seen": 7851737088, "step": 936 }, { "epoch": 0.4685, "grad_norm": 0.23639926314353943, "learning_rate": 6.089541370707297e-07, "loss": 1.3308, "num_input_tokens_seen": 7860125696, "step": 937 }, { "epoch": 0.469, "grad_norm": 0.27431753277778625, "learning_rate": 5.898705306291508e-07, "loss": 1.4283, "num_input_tokens_seen": 7868514304, "step": 938 }, { "epoch": 0.4695, "grad_norm": 0.26388487219810486, "learning_rate": 5.71086252395463e-07, "loss": 1.2778, "num_input_tokens_seen": 7876902912, "step": 939 }, { "epoch": 0.47, "grad_norm": 0.24989992380142212, "learning_rate": 5.526015920464689e-07, "loss": 1.2934, "num_input_tokens_seen": 7885291520, "step": 940 }, { "epoch": 0.4705, "grad_norm": 0.24453981220722198, "learning_rate": 5.344168346385003e-07, "loss": 1.3573, "num_input_tokens_seen": 7893680128, "step": 941 }, { "epoch": 0.471, "grad_norm": 0.23557046055793762, "learning_rate": 5.165322606030132e-07, "loss": 1.3027, "num_input_tokens_seen": 7902068736, "step": 942 }, { "epoch": 0.4715, "grad_norm": 0.2599053382873535, "learning_rate": 4.98948145742264e-07, "loss": 1.4185, "num_input_tokens_seen": 7910457344, "step": 943 }, { "epoch": 0.472, "grad_norm": 0.2594621479511261, "learning_rate": 4.816647612250513e-07, "loss": 1.422, "num_input_tokens_seen": 7918845952, "step": 944 }, { "epoch": 0.4725, "grad_norm": 0.24601015448570251, "learning_rate": 4.646823735825523e-07, "loss": 1.4249, "num_input_tokens_seen": 7927234560, "step": 945 }, { "epoch": 0.473, "grad_norm": 0.2665488123893738, "learning_rate": 4.4800124470418815e-07, "loss": 1.3231, "num_input_tokens_seen": 7935623168, "step": 946 }, { "epoch": 0.4735, "grad_norm": 0.2772173285484314, "learning_rate": 4.3162163183360084e-07, "loss": 1.3807, "num_input_tokens_seen": 7944011776, "step": 947 }, { "epoch": 0.474, "grad_norm": 0.24356356263160706, "learning_rate": 4.155437875646828e-07, "loss": 1.2934, "num_input_tokens_seen": 7952400384, "step": 948 }, { "epoch": 0.4745, "grad_norm": 0.3069448471069336, "learning_rate": 3.997679598376891e-07, "loss": 1.3525, "num_input_tokens_seen": 7960788992, "step": 949 }, { "epoch": 0.475, "grad_norm": 0.23641465604305267, "learning_rate": 3.842943919353914e-07, "loss": 1.4811, "num_input_tokens_seen": 7969177600, "step": 950 }, { "epoch": 0.4755, "grad_norm": 0.24180881679058075, "learning_rate": 3.6912332247935224e-07, "loss": 1.3682, "num_input_tokens_seen": 7977566208, "step": 951 }, { "epoch": 0.476, "grad_norm": 0.26675185561180115, "learning_rate": 3.5425498542622784e-07, "loss": 1.4107, "num_input_tokens_seen": 7985954816, "step": 952 }, { "epoch": 0.4765, "grad_norm": 0.2614452540874481, "learning_rate": 3.396896100641689e-07, "loss": 1.4663, "num_input_tokens_seen": 7994343424, "step": 953 }, { "epoch": 0.477, "grad_norm": 0.28876617550849915, "learning_rate": 3.2542742100928114e-07, "loss": 1.5379, "num_input_tokens_seen": 8002732032, "step": 954 }, { "epoch": 0.4775, "grad_norm": 0.2845548987388611, "learning_rate": 3.114686382021681e-07, "loss": 1.5455, "num_input_tokens_seen": 8011120640, "step": 955 }, { "epoch": 0.478, "grad_norm": 0.2539234161376953, "learning_rate": 2.9781347690452266e-07, "loss": 1.4225, "num_input_tokens_seen": 8019509248, "step": 956 }, { "epoch": 0.4785, "grad_norm": 0.2430867850780487, "learning_rate": 2.8446214769582534e-07, "loss": 1.3769, "num_input_tokens_seen": 8027897856, "step": 957 }, { "epoch": 0.479, "grad_norm": 0.2622883915901184, "learning_rate": 2.714148564700914e-07, "loss": 1.5041, "num_input_tokens_seen": 8036286464, "step": 958 }, { "epoch": 0.4795, "grad_norm": 0.2362249344587326, "learning_rate": 2.586718044326886e-07, "loss": 1.356, "num_input_tokens_seen": 8044675072, "step": 959 }, { "epoch": 0.48, "grad_norm": 0.27168476581573486, "learning_rate": 2.462331880972468e-07, "loss": 1.3921, "num_input_tokens_seen": 8053063680, "step": 960 }, { "epoch": 0.4805, "grad_norm": 0.24687404930591583, "learning_rate": 2.340991992826136e-07, "loss": 1.4027, "num_input_tokens_seen": 8061452288, "step": 961 }, { "epoch": 0.481, "grad_norm": 0.2963731586933136, "learning_rate": 2.222700251099097e-07, "loss": 1.3255, "num_input_tokens_seen": 8069840896, "step": 962 }, { "epoch": 0.4815, "grad_norm": 0.23553644120693207, "learning_rate": 2.107458479996316e-07, "loss": 1.499, "num_input_tokens_seen": 8078229504, "step": 963 }, { "epoch": 0.482, "grad_norm": 0.23809978365898132, "learning_rate": 1.9952684566884927e-07, "loss": 1.4551, "num_input_tokens_seen": 8086618112, "step": 964 }, { "epoch": 0.4825, "grad_norm": 0.25255122780799866, "learning_rate": 1.88613191128455e-07, "loss": 1.3352, "num_input_tokens_seen": 8095006720, "step": 965 }, { "epoch": 0.483, "grad_norm": 0.258586049079895, "learning_rate": 1.780050526805055e-07, "loss": 1.4319, "num_input_tokens_seen": 8103395328, "step": 966 }, { "epoch": 0.4835, "grad_norm": 0.3484399616718292, "learning_rate": 1.6770259391561518e-07, "loss": 1.5366, "num_input_tokens_seen": 8111783936, "step": 967 }, { "epoch": 0.484, "grad_norm": 0.23989766836166382, "learning_rate": 1.577059737104447e-07, "loss": 1.3264, "num_input_tokens_seen": 8120172544, "step": 968 }, { "epoch": 0.4845, "grad_norm": 0.2582119405269623, "learning_rate": 1.4801534622524316e-07, "loss": 1.3255, "num_input_tokens_seen": 8128561152, "step": 969 }, { "epoch": 0.485, "grad_norm": 0.2474099099636078, "learning_rate": 1.3863086090147415e-07, "loss": 1.6036, "num_input_tokens_seen": 8136949760, "step": 970 }, { "epoch": 0.4855, "grad_norm": 0.25209760665893555, "learning_rate": 1.2955266245951338e-07, "loss": 1.3672, "num_input_tokens_seen": 8145338368, "step": 971 }, { "epoch": 0.486, "grad_norm": 0.2993927001953125, "learning_rate": 1.2078089089640809e-07, "loss": 1.4853, "num_input_tokens_seen": 8153726976, "step": 972 }, { "epoch": 0.4865, "grad_norm": 0.24984106421470642, "learning_rate": 1.1231568148372562e-07, "loss": 1.3645, "num_input_tokens_seen": 8162115584, "step": 973 }, { "epoch": 0.487, "grad_norm": 0.2713457942008972, "learning_rate": 1.0415716476547045e-07, "loss": 1.4428, "num_input_tokens_seen": 8170504192, "step": 974 }, { "epoch": 0.4875, "grad_norm": 0.23345446586608887, "learning_rate": 9.630546655606365e-08, "loss": 1.2593, "num_input_tokens_seen": 8178892800, "step": 975 }, { "epoch": 0.488, "grad_norm": 0.23903486132621765, "learning_rate": 8.876070793840008e-08, "loss": 1.3409, "num_input_tokens_seen": 8187281408, "step": 976 }, { "epoch": 0.4885, "grad_norm": 0.2698642909526825, "learning_rate": 8.15230052619942e-08, "loss": 1.3661, "num_input_tokens_seen": 8195670016, "step": 977 }, { "epoch": 0.489, "grad_norm": 0.2532804608345032, "learning_rate": 7.459247014117488e-08, "loss": 1.4019, "num_input_tokens_seen": 8204058624, "step": 978 }, { "epoch": 0.4895, "grad_norm": 0.244890034198761, "learning_rate": 6.796920945336682e-08, "loss": 1.3339, "num_input_tokens_seen": 8212447232, "step": 979 }, { "epoch": 0.49, "grad_norm": 0.23683778941631317, "learning_rate": 6.165332533744072e-08, "loss": 1.3438, "num_input_tokens_seen": 8220835840, "step": 980 }, { "epoch": 0.4905, "grad_norm": 0.24453355371952057, "learning_rate": 5.5644915192145654e-08, "loss": 1.4663, "num_input_tokens_seen": 8229224448, "step": 981 }, { "epoch": 0.491, "grad_norm": 0.2259846329689026, "learning_rate": 4.9944071674599135e-08, "loss": 1.2937, "num_input_tokens_seen": 8237613056, "step": 982 }, { "epoch": 0.4915, "grad_norm": 0.2521670162677765, "learning_rate": 4.4550882698857214e-08, "loss": 1.2651, "num_input_tokens_seen": 8246001664, "step": 983 }, { "epoch": 0.492, "grad_norm": 0.2303016483783722, "learning_rate": 3.946543143456882e-08, "loss": 1.4663, "num_input_tokens_seen": 8254390272, "step": 984 }, { "epoch": 0.4925, "grad_norm": 0.2453220933675766, "learning_rate": 3.468779630568353e-08, "loss": 1.3876, "num_input_tokens_seen": 8262778880, "step": 985 }, { "epoch": 0.493, "grad_norm": 0.25257933139801025, "learning_rate": 3.021805098924136e-08, "loss": 1.256, "num_input_tokens_seen": 8271167488, "step": 986 }, { "epoch": 0.4935, "grad_norm": 0.2717069387435913, "learning_rate": 2.6056264414249245e-08, "loss": 1.2968, "num_input_tokens_seen": 8279556096, "step": 987 }, { "epoch": 0.494, "grad_norm": 0.24595245718955994, "learning_rate": 2.220250076060193e-08, "loss": 1.4732, "num_input_tokens_seen": 8287944704, "step": 988 }, { "epoch": 0.4945, "grad_norm": 0.2430465966463089, "learning_rate": 1.8656819458100496e-08, "loss": 1.5087, "num_input_tokens_seen": 8296333312, "step": 989 }, { "epoch": 0.495, "grad_norm": 0.3049032986164093, "learning_rate": 1.541927518554198e-08, "loss": 1.5372, "num_input_tokens_seen": 8304721920, "step": 990 }, { "epoch": 0.4955, "grad_norm": 0.23862385749816895, "learning_rate": 1.2489917869860091e-08, "loss": 1.4927, "num_input_tokens_seen": 8313110528, "step": 991 }, { "epoch": 0.496, "grad_norm": 0.24209170043468475, "learning_rate": 9.868792685368001e-09, "loss": 1.5102, "num_input_tokens_seen": 8321499136, "step": 992 }, { "epoch": 0.4965, "grad_norm": 0.2436770349740982, "learning_rate": 7.55594005306337e-09, "loss": 1.3454, "num_input_tokens_seen": 8329887744, "step": 993 }, { "epoch": 0.497, "grad_norm": 0.28859907388687134, "learning_rate": 5.551395639988855e-09, "loss": 1.4936, "num_input_tokens_seen": 8338276352, "step": 994 }, { "epoch": 0.4975, "grad_norm": 0.2565382122993469, "learning_rate": 3.855190358703631e-09, "loss": 1.314, "num_input_tokens_seen": 8346664960, "step": 995 }, { "epoch": 0.498, "grad_norm": 0.2295544445514679, "learning_rate": 2.467350366788246e-09, "loss": 1.3402, "num_input_tokens_seen": 8355053568, "step": 996 }, { "epoch": 0.4985, "grad_norm": 0.2493482232093811, "learning_rate": 1.3878970664538138e-09, "loss": 1.371, "num_input_tokens_seen": 8363442176, "step": 997 }, { "epoch": 0.499, "grad_norm": 0.24827319383621216, "learning_rate": 6.168471042067303e-10, "loss": 1.3969, "num_input_tokens_seen": 8371830784, "step": 998 }, { "epoch": 0.4995, "grad_norm": 0.25917744636535645, "learning_rate": 1.5421237058887984e-10, "loss": 1.4008, "num_input_tokens_seen": 8380219392, "step": 999 }, { "epoch": 0.5, "grad_norm": 0.2483104020357132, "learning_rate": 0.0, "loss": 1.3491, "num_input_tokens_seen": 8388608000, "step": 1000 }, { "epoch": 0.0005, "grad_norm": 0.2557491958141327, "learning_rate": 1.5421237058887984e-10, "loss": 1.3519, "num_input_tokens_seen": 8396996608, "step": 1001 }, { "epoch": 0.0005, "num_input_tokens_seen": 8396996608, "step": 1001, "total_flos": 5.908015032569954e+18, "train_loss": 0.0013505319258073469, "train_runtime": 212.0248, "train_samples_per_second": 301.852, "train_steps_per_second": 4.716 } ], "logging_steps": 1.0, "max_steps": 1000, "num_input_tokens_seen": 8396996608, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.908015032569954e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }