| { | |
| "best_global_step": 5058, | |
| "best_metric": 0.18196314573287964, | |
| "best_model_checkpoint": "saves/prefix-tuning/llama-3-8b-instruct/train_rte_1754652145/checkpoint-5058", | |
| "epoch": 10.0, | |
| "eval_steps": 281, | |
| "global_step": 5610, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.008912655971479501, | |
| "grad_norm": 2.640630006790161, | |
| "learning_rate": 3.5650623885918005e-07, | |
| "loss": 11.4646, | |
| "num_input_tokens_seen": 3168, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.017825311942959002, | |
| "grad_norm": 2.218021869659424, | |
| "learning_rate": 8.021390374331552e-07, | |
| "loss": 11.5893, | |
| "num_input_tokens_seen": 6272, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.026737967914438502, | |
| "grad_norm": 2.5081377029418945, | |
| "learning_rate": 1.2477718360071302e-06, | |
| "loss": 11.4013, | |
| "num_input_tokens_seen": 10144, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.035650623885918005, | |
| "grad_norm": 2.2221429347991943, | |
| "learning_rate": 1.6934046345811053e-06, | |
| "loss": 11.4758, | |
| "num_input_tokens_seen": 13536, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.044563279857397504, | |
| "grad_norm": 2.0649468898773193, | |
| "learning_rate": 2.1390374331550802e-06, | |
| "loss": 11.3651, | |
| "num_input_tokens_seen": 16128, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.053475935828877004, | |
| "grad_norm": 2.8866872787475586, | |
| "learning_rate": 2.5846702317290554e-06, | |
| "loss": 11.4745, | |
| "num_input_tokens_seen": 18784, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.062388591800356503, | |
| "grad_norm": 2.091982841491699, | |
| "learning_rate": 3.0303030303030305e-06, | |
| "loss": 11.5827, | |
| "num_input_tokens_seen": 22336, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.07130124777183601, | |
| "grad_norm": 2.3348405361175537, | |
| "learning_rate": 3.4759358288770056e-06, | |
| "loss": 11.5288, | |
| "num_input_tokens_seen": 25408, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.08021390374331551, | |
| "grad_norm": 2.6505680084228516, | |
| "learning_rate": 3.92156862745098e-06, | |
| "loss": 11.3178, | |
| "num_input_tokens_seen": 27968, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.08912655971479501, | |
| "grad_norm": 2.241699457168579, | |
| "learning_rate": 4.3672014260249555e-06, | |
| "loss": 11.2239, | |
| "num_input_tokens_seen": 30752, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.09803921568627451, | |
| "grad_norm": 2.149437427520752, | |
| "learning_rate": 4.812834224598931e-06, | |
| "loss": 11.3085, | |
| "num_input_tokens_seen": 33376, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.10695187165775401, | |
| "grad_norm": 2.2778542041778564, | |
| "learning_rate": 5.258467023172906e-06, | |
| "loss": 11.3491, | |
| "num_input_tokens_seen": 37280, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.11586452762923351, | |
| "grad_norm": 2.1370596885681152, | |
| "learning_rate": 5.704099821746881e-06, | |
| "loss": 11.0892, | |
| "num_input_tokens_seen": 40640, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.12477718360071301, | |
| "grad_norm": 2.1671693325042725, | |
| "learning_rate": 6.149732620320856e-06, | |
| "loss": 11.4158, | |
| "num_input_tokens_seen": 44128, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.13368983957219252, | |
| "grad_norm": 2.1441879272460938, | |
| "learning_rate": 6.59536541889483e-06, | |
| "loss": 11.0242, | |
| "num_input_tokens_seen": 47648, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.14260249554367202, | |
| "grad_norm": 2.2412052154541016, | |
| "learning_rate": 7.040998217468805e-06, | |
| "loss": 10.8869, | |
| "num_input_tokens_seen": 50816, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.15151515151515152, | |
| "grad_norm": 2.3039534091949463, | |
| "learning_rate": 7.4866310160427806e-06, | |
| "loss": 11.031, | |
| "num_input_tokens_seen": 53728, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.16042780748663102, | |
| "grad_norm": 2.3388712406158447, | |
| "learning_rate": 7.932263814616755e-06, | |
| "loss": 10.9959, | |
| "num_input_tokens_seen": 57056, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.16934046345811052, | |
| "grad_norm": 2.324082851409912, | |
| "learning_rate": 8.377896613190733e-06, | |
| "loss": 10.8078, | |
| "num_input_tokens_seen": 59808, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.17825311942959002, | |
| "grad_norm": 2.343338966369629, | |
| "learning_rate": 8.823529411764707e-06, | |
| "loss": 10.6152, | |
| "num_input_tokens_seen": 62848, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.18716577540106952, | |
| "grad_norm": 2.170870542526245, | |
| "learning_rate": 9.269162210338681e-06, | |
| "loss": 10.7252, | |
| "num_input_tokens_seen": 65856, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.19607843137254902, | |
| "grad_norm": 2.1757500171661377, | |
| "learning_rate": 9.714795008912657e-06, | |
| "loss": 10.702, | |
| "num_input_tokens_seen": 68672, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.20499108734402852, | |
| "grad_norm": 2.319809675216675, | |
| "learning_rate": 1.0160427807486631e-05, | |
| "loss": 10.7596, | |
| "num_input_tokens_seen": 71840, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.21390374331550802, | |
| "grad_norm": 2.85723876953125, | |
| "learning_rate": 1.0606060606060607e-05, | |
| "loss": 10.6329, | |
| "num_input_tokens_seen": 74624, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.22281639928698752, | |
| "grad_norm": 2.3634092807769775, | |
| "learning_rate": 1.1051693404634582e-05, | |
| "loss": 10.694, | |
| "num_input_tokens_seen": 78080, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.23172905525846701, | |
| "grad_norm": 2.2238471508026123, | |
| "learning_rate": 1.1497326203208558e-05, | |
| "loss": 10.4616, | |
| "num_input_tokens_seen": 81408, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.24064171122994651, | |
| "grad_norm": 2.2605199813842773, | |
| "learning_rate": 1.1942959001782532e-05, | |
| "loss": 10.2704, | |
| "num_input_tokens_seen": 84192, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.24955436720142601, | |
| "grad_norm": 2.334446668624878, | |
| "learning_rate": 1.2388591800356506e-05, | |
| "loss": 10.1217, | |
| "num_input_tokens_seen": 87264, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.25846702317290554, | |
| "grad_norm": 2.1008996963500977, | |
| "learning_rate": 1.2834224598930484e-05, | |
| "loss": 9.9505, | |
| "num_input_tokens_seen": 90336, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.26737967914438504, | |
| "grad_norm": 2.1396262645721436, | |
| "learning_rate": 1.3279857397504458e-05, | |
| "loss": 9.9953, | |
| "num_input_tokens_seen": 93760, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.27629233511586454, | |
| "grad_norm": 1.9306892156600952, | |
| "learning_rate": 1.3725490196078432e-05, | |
| "loss": 10.0273, | |
| "num_input_tokens_seen": 97120, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.28520499108734404, | |
| "grad_norm": 2.2339835166931152, | |
| "learning_rate": 1.4171122994652408e-05, | |
| "loss": 9.8194, | |
| "num_input_tokens_seen": 100160, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.29411764705882354, | |
| "grad_norm": 2.1370038986206055, | |
| "learning_rate": 1.4616755793226383e-05, | |
| "loss": 9.7234, | |
| "num_input_tokens_seen": 103136, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.30303030303030304, | |
| "grad_norm": 2.2204971313476562, | |
| "learning_rate": 1.5062388591800359e-05, | |
| "loss": 9.4737, | |
| "num_input_tokens_seen": 105696, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.31194295900178254, | |
| "grad_norm": 2.0649607181549072, | |
| "learning_rate": 1.5508021390374333e-05, | |
| "loss": 9.299, | |
| "num_input_tokens_seen": 108800, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.32085561497326204, | |
| "grad_norm": 2.166388511657715, | |
| "learning_rate": 1.5953654188948307e-05, | |
| "loss": 9.3115, | |
| "num_input_tokens_seen": 111808, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.32976827094474154, | |
| "grad_norm": 2.0328972339630127, | |
| "learning_rate": 1.639928698752228e-05, | |
| "loss": 9.3707, | |
| "num_input_tokens_seen": 114944, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.33868092691622104, | |
| "grad_norm": 2.443514347076416, | |
| "learning_rate": 1.684491978609626e-05, | |
| "loss": 8.9663, | |
| "num_input_tokens_seen": 118112, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.34759358288770054, | |
| "grad_norm": 2.0616464614868164, | |
| "learning_rate": 1.7290552584670233e-05, | |
| "loss": 8.9474, | |
| "num_input_tokens_seen": 120896, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.35650623885918004, | |
| "grad_norm": 2.2355945110321045, | |
| "learning_rate": 1.7736185383244208e-05, | |
| "loss": 8.6637, | |
| "num_input_tokens_seen": 123904, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.36541889483065954, | |
| "grad_norm": 2.044498920440674, | |
| "learning_rate": 1.8181818181818182e-05, | |
| "loss": 8.6211, | |
| "num_input_tokens_seen": 127008, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.37433155080213903, | |
| "grad_norm": 2.1903281211853027, | |
| "learning_rate": 1.862745098039216e-05, | |
| "loss": 8.4521, | |
| "num_input_tokens_seen": 129984, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.38324420677361853, | |
| "grad_norm": 2.253875255584717, | |
| "learning_rate": 1.9073083778966134e-05, | |
| "loss": 8.4635, | |
| "num_input_tokens_seen": 133152, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.39215686274509803, | |
| "grad_norm": 2.23766827583313, | |
| "learning_rate": 1.951871657754011e-05, | |
| "loss": 8.4012, | |
| "num_input_tokens_seen": 136096, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.40106951871657753, | |
| "grad_norm": 2.4483225345611572, | |
| "learning_rate": 1.9964349376114083e-05, | |
| "loss": 8.019, | |
| "num_input_tokens_seen": 139136, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.40998217468805703, | |
| "grad_norm": 2.141366958618164, | |
| "learning_rate": 2.0409982174688057e-05, | |
| "loss": 8.2362, | |
| "num_input_tokens_seen": 142080, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.41889483065953653, | |
| "grad_norm": 2.049794912338257, | |
| "learning_rate": 2.0855614973262035e-05, | |
| "loss": 8.3716, | |
| "num_input_tokens_seen": 145824, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.42780748663101603, | |
| "grad_norm": 2.0718395709991455, | |
| "learning_rate": 2.130124777183601e-05, | |
| "loss": 7.742, | |
| "num_input_tokens_seen": 149280, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.43672014260249553, | |
| "grad_norm": 2.133650064468384, | |
| "learning_rate": 2.1746880570409983e-05, | |
| "loss": 7.7851, | |
| "num_input_tokens_seen": 152544, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.44563279857397503, | |
| "grad_norm": 2.0652763843536377, | |
| "learning_rate": 2.2192513368983957e-05, | |
| "loss": 7.4258, | |
| "num_input_tokens_seen": 156416, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.45454545454545453, | |
| "grad_norm": 1.8414599895477295, | |
| "learning_rate": 2.2638146167557932e-05, | |
| "loss": 7.1734, | |
| "num_input_tokens_seen": 159712, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.46345811051693403, | |
| "grad_norm": 2.0587077140808105, | |
| "learning_rate": 2.308377896613191e-05, | |
| "loss": 6.8801, | |
| "num_input_tokens_seen": 162400, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.47237076648841353, | |
| "grad_norm": 1.8652368783950806, | |
| "learning_rate": 2.3529411764705884e-05, | |
| "loss": 7.0346, | |
| "num_input_tokens_seen": 166048, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.48128342245989303, | |
| "grad_norm": 1.6939105987548828, | |
| "learning_rate": 2.3975044563279858e-05, | |
| "loss": 6.5944, | |
| "num_input_tokens_seen": 168576, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.49019607843137253, | |
| "grad_norm": 1.9076436758041382, | |
| "learning_rate": 2.4420677361853832e-05, | |
| "loss": 6.7204, | |
| "num_input_tokens_seen": 172320, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.49910873440285203, | |
| "grad_norm": 1.65463387966156, | |
| "learning_rate": 2.4866310160427807e-05, | |
| "loss": 6.7786, | |
| "num_input_tokens_seen": 175424, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.5008912655971479, | |
| "eval_loss": 6.320615768432617, | |
| "eval_runtime": 4.2449, | |
| "eval_samples_per_second": 58.659, | |
| "eval_steps_per_second": 14.841, | |
| "num_input_tokens_seen": 176032, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 0.5080213903743316, | |
| "grad_norm": 2.3921778202056885, | |
| "learning_rate": 2.5311942959001784e-05, | |
| "loss": 6.4536, | |
| "num_input_tokens_seen": 178016, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.5169340463458111, | |
| "grad_norm": 1.5767650604248047, | |
| "learning_rate": 2.575757575757576e-05, | |
| "loss": 6.7214, | |
| "num_input_tokens_seen": 181888, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.5258467023172906, | |
| "grad_norm": 1.6409612894058228, | |
| "learning_rate": 2.6203208556149733e-05, | |
| "loss": 6.0779, | |
| "num_input_tokens_seen": 184960, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.5347593582887701, | |
| "grad_norm": 1.5643103122711182, | |
| "learning_rate": 2.6648841354723707e-05, | |
| "loss": 5.8182, | |
| "num_input_tokens_seen": 187488, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.5436720142602496, | |
| "grad_norm": 1.7608228921890259, | |
| "learning_rate": 2.7094474153297685e-05, | |
| "loss": 6.2207, | |
| "num_input_tokens_seen": 191232, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.5525846702317291, | |
| "grad_norm": 1.5273125171661377, | |
| "learning_rate": 2.754010695187166e-05, | |
| "loss": 5.8064, | |
| "num_input_tokens_seen": 194272, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.5614973262032086, | |
| "grad_norm": 1.3673619031906128, | |
| "learning_rate": 2.7985739750445633e-05, | |
| "loss": 5.7312, | |
| "num_input_tokens_seen": 197184, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.5704099821746881, | |
| "grad_norm": 1.3092046976089478, | |
| "learning_rate": 2.8431372549019608e-05, | |
| "loss": 5.36, | |
| "num_input_tokens_seen": 199840, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.5793226381461676, | |
| "grad_norm": 1.5241113901138306, | |
| "learning_rate": 2.8877005347593582e-05, | |
| "loss": 5.6509, | |
| "num_input_tokens_seen": 203008, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.5882352941176471, | |
| "grad_norm": 1.2224637269973755, | |
| "learning_rate": 2.932263814616756e-05, | |
| "loss": 5.3917, | |
| "num_input_tokens_seen": 206400, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.5971479500891266, | |
| "grad_norm": 1.1933878660202026, | |
| "learning_rate": 2.9768270944741534e-05, | |
| "loss": 5.2637, | |
| "num_input_tokens_seen": 209440, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.6060606060606061, | |
| "grad_norm": 1.1900209188461304, | |
| "learning_rate": 3.0213903743315508e-05, | |
| "loss": 5.4659, | |
| "num_input_tokens_seen": 212736, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.6149732620320856, | |
| "grad_norm": 1.3414652347564697, | |
| "learning_rate": 3.065953654188948e-05, | |
| "loss": 5.324, | |
| "num_input_tokens_seen": 216096, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.6238859180035651, | |
| "grad_norm": 1.1607022285461426, | |
| "learning_rate": 3.110516934046346e-05, | |
| "loss": 5.2878, | |
| "num_input_tokens_seen": 219200, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.6327985739750446, | |
| "grad_norm": 1.153671383857727, | |
| "learning_rate": 3.155080213903743e-05, | |
| "loss": 4.9444, | |
| "num_input_tokens_seen": 221952, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.6417112299465241, | |
| "grad_norm": 1.139689326286316, | |
| "learning_rate": 3.199643493761141e-05, | |
| "loss": 4.891, | |
| "num_input_tokens_seen": 225376, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.6506238859180036, | |
| "grad_norm": 1.0437010526657104, | |
| "learning_rate": 3.2442067736185386e-05, | |
| "loss": 4.9337, | |
| "num_input_tokens_seen": 228736, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.6595365418894831, | |
| "grad_norm": 1.2458043098449707, | |
| "learning_rate": 3.288770053475936e-05, | |
| "loss": 4.7023, | |
| "num_input_tokens_seen": 231648, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.6684491978609626, | |
| "grad_norm": 1.0675745010375977, | |
| "learning_rate": 3.3333333333333335e-05, | |
| "loss": 4.594, | |
| "num_input_tokens_seen": 234976, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.6773618538324421, | |
| "grad_norm": 1.0720183849334717, | |
| "learning_rate": 3.3778966131907306e-05, | |
| "loss": 4.82, | |
| "num_input_tokens_seen": 238368, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.6862745098039216, | |
| "grad_norm": 1.044710636138916, | |
| "learning_rate": 3.4224598930481284e-05, | |
| "loss": 4.5563, | |
| "num_input_tokens_seen": 241440, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.6951871657754011, | |
| "grad_norm": 1.0943641662597656, | |
| "learning_rate": 3.467023172905526e-05, | |
| "loss": 4.5969, | |
| "num_input_tokens_seen": 244448, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.7040998217468806, | |
| "grad_norm": 1.082396149635315, | |
| "learning_rate": 3.511586452762923e-05, | |
| "loss": 4.3737, | |
| "num_input_tokens_seen": 246880, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.7130124777183601, | |
| "grad_norm": 1.1410984992980957, | |
| "learning_rate": 3.556149732620321e-05, | |
| "loss": 4.3754, | |
| "num_input_tokens_seen": 250240, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.7219251336898396, | |
| "grad_norm": 1.1234968900680542, | |
| "learning_rate": 3.600713012477718e-05, | |
| "loss": 4.3313, | |
| "num_input_tokens_seen": 253184, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.7308377896613191, | |
| "grad_norm": 1.2889167070388794, | |
| "learning_rate": 3.645276292335116e-05, | |
| "loss": 4.1676, | |
| "num_input_tokens_seen": 255968, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.7397504456327986, | |
| "grad_norm": 0.9909088611602783, | |
| "learning_rate": 3.6898395721925136e-05, | |
| "loss": 4.1332, | |
| "num_input_tokens_seen": 258688, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.7486631016042781, | |
| "grad_norm": 1.12320077419281, | |
| "learning_rate": 3.734402852049911e-05, | |
| "loss": 4.1551, | |
| "num_input_tokens_seen": 262240, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.7575757575757576, | |
| "grad_norm": 1.1998422145843506, | |
| "learning_rate": 3.7789661319073085e-05, | |
| "loss": 4.1066, | |
| "num_input_tokens_seen": 265952, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.7664884135472371, | |
| "grad_norm": 1.6095830202102661, | |
| "learning_rate": 3.8235294117647055e-05, | |
| "loss": 4.3427, | |
| "num_input_tokens_seen": 269312, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.7754010695187166, | |
| "grad_norm": 1.1973387002944946, | |
| "learning_rate": 3.868092691622103e-05, | |
| "loss": 4.0544, | |
| "num_input_tokens_seen": 272128, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.7843137254901961, | |
| "grad_norm": 1.13062584400177, | |
| "learning_rate": 3.912655971479501e-05, | |
| "loss": 4.0524, | |
| "num_input_tokens_seen": 275552, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.7932263814616756, | |
| "grad_norm": 1.095451831817627, | |
| "learning_rate": 3.957219251336899e-05, | |
| "loss": 3.9436, | |
| "num_input_tokens_seen": 278720, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.8021390374331551, | |
| "grad_norm": 0.9978923201560974, | |
| "learning_rate": 4.0017825311942966e-05, | |
| "loss": 3.6121, | |
| "num_input_tokens_seen": 281536, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.8110516934046346, | |
| "grad_norm": 1.036067008972168, | |
| "learning_rate": 4.046345811051694e-05, | |
| "loss": 3.8184, | |
| "num_input_tokens_seen": 284672, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.8199643493761141, | |
| "grad_norm": 0.8888896107673645, | |
| "learning_rate": 4.0909090909090915e-05, | |
| "loss": 3.7184, | |
| "num_input_tokens_seen": 288416, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.8288770053475936, | |
| "grad_norm": 0.8882661461830139, | |
| "learning_rate": 4.1354723707664886e-05, | |
| "loss": 3.6762, | |
| "num_input_tokens_seen": 291232, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.8377896613190731, | |
| "grad_norm": 1.3067046403884888, | |
| "learning_rate": 4.180035650623886e-05, | |
| "loss": 3.7256, | |
| "num_input_tokens_seen": 294784, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.8467023172905526, | |
| "grad_norm": 1.1890095472335815, | |
| "learning_rate": 4.224598930481284e-05, | |
| "loss": 3.4105, | |
| "num_input_tokens_seen": 297632, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.8556149732620321, | |
| "grad_norm": 0.9891613125801086, | |
| "learning_rate": 4.269162210338681e-05, | |
| "loss": 3.2745, | |
| "num_input_tokens_seen": 300416, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.8645276292335116, | |
| "grad_norm": 0.9931787848472595, | |
| "learning_rate": 4.313725490196079e-05, | |
| "loss": 3.1763, | |
| "num_input_tokens_seen": 303232, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.8734402852049911, | |
| "grad_norm": 0.8934875130653381, | |
| "learning_rate": 4.358288770053476e-05, | |
| "loss": 3.2828, | |
| "num_input_tokens_seen": 306144, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.8823529411764706, | |
| "grad_norm": 1.265254259109497, | |
| "learning_rate": 4.402852049910874e-05, | |
| "loss": 3.2048, | |
| "num_input_tokens_seen": 308576, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.8912655971479501, | |
| "grad_norm": 1.0396374464035034, | |
| "learning_rate": 4.4474153297682716e-05, | |
| "loss": 3.197, | |
| "num_input_tokens_seen": 312000, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.9001782531194296, | |
| "grad_norm": 0.8916023373603821, | |
| "learning_rate": 4.491978609625669e-05, | |
| "loss": 2.9296, | |
| "num_input_tokens_seen": 314848, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.9090909090909091, | |
| "grad_norm": 1.1076226234436035, | |
| "learning_rate": 4.5365418894830664e-05, | |
| "loss": 3.0006, | |
| "num_input_tokens_seen": 318112, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.9180035650623886, | |
| "grad_norm": 1.0348403453826904, | |
| "learning_rate": 4.5811051693404635e-05, | |
| "loss": 3.2128, | |
| "num_input_tokens_seen": 321152, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.9269162210338681, | |
| "grad_norm": 0.9368388056755066, | |
| "learning_rate": 4.625668449197861e-05, | |
| "loss": 2.6109, | |
| "num_input_tokens_seen": 323552, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.9358288770053476, | |
| "grad_norm": 0.9401017427444458, | |
| "learning_rate": 4.670231729055259e-05, | |
| "loss": 2.6761, | |
| "num_input_tokens_seen": 326112, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.9447415329768271, | |
| "grad_norm": 1.0641679763793945, | |
| "learning_rate": 4.714795008912656e-05, | |
| "loss": 2.7169, | |
| "num_input_tokens_seen": 328800, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.9536541889483066, | |
| "grad_norm": 1.1021815538406372, | |
| "learning_rate": 4.759358288770054e-05, | |
| "loss": 3.1103, | |
| "num_input_tokens_seen": 332512, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.9625668449197861, | |
| "grad_norm": 0.8338248133659363, | |
| "learning_rate": 4.803921568627452e-05, | |
| "loss": 2.3949, | |
| "num_input_tokens_seen": 335360, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.9714795008912656, | |
| "grad_norm": 1.311125636100769, | |
| "learning_rate": 4.848484848484849e-05, | |
| "loss": 2.9292, | |
| "num_input_tokens_seen": 339488, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.9803921568627451, | |
| "grad_norm": 0.993326723575592, | |
| "learning_rate": 4.8930481283422465e-05, | |
| "loss": 2.2154, | |
| "num_input_tokens_seen": 342176, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.9893048128342246, | |
| "grad_norm": 1.0523838996887207, | |
| "learning_rate": 4.9376114081996436e-05, | |
| "loss": 2.6187, | |
| "num_input_tokens_seen": 345568, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.9982174688057041, | |
| "grad_norm": 1.2461936473846436, | |
| "learning_rate": 4.9821746880570414e-05, | |
| "loss": 2.0606, | |
| "num_input_tokens_seen": 348000, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.0017825311942958, | |
| "eval_loss": 2.2780375480651855, | |
| "eval_runtime": 4.2492, | |
| "eval_samples_per_second": 58.599, | |
| "eval_steps_per_second": 14.826, | |
| "num_input_tokens_seen": 349200, | |
| "step": 562 | |
| }, | |
| { | |
| "epoch": 1.0071301247771836, | |
| "grad_norm": 0.8942297697067261, | |
| "learning_rate": 4.99999564446608e-05, | |
| "loss": 2.598, | |
| "num_input_tokens_seen": 350960, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 1.0160427807486632, | |
| "grad_norm": 0.9582070708274841, | |
| "learning_rate": 4.9999690273693036e-05, | |
| "loss": 2.0767, | |
| "num_input_tokens_seen": 354288, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.0249554367201426, | |
| "grad_norm": 1.0559678077697754, | |
| "learning_rate": 4.999918213174131e-05, | |
| "loss": 2.1588, | |
| "num_input_tokens_seen": 357648, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 1.0338680926916222, | |
| "grad_norm": 1.2316597700119019, | |
| "learning_rate": 4.9998432023723915e-05, | |
| "loss": 2.0186, | |
| "num_input_tokens_seen": 360496, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.0427807486631016, | |
| "grad_norm": 1.1366970539093018, | |
| "learning_rate": 4.9997439956901106e-05, | |
| "loss": 2.0455, | |
| "num_input_tokens_seen": 363376, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 1.0516934046345812, | |
| "grad_norm": 1.041366696357727, | |
| "learning_rate": 4.999620594087507e-05, | |
| "loss": 1.995, | |
| "num_input_tokens_seen": 366320, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.0606060606060606, | |
| "grad_norm": 0.9262757301330566, | |
| "learning_rate": 4.999472998758978e-05, | |
| "loss": 1.912, | |
| "num_input_tokens_seen": 369488, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 1.0695187165775402, | |
| "grad_norm": 1.3618220090866089, | |
| "learning_rate": 4.999301211133095e-05, | |
| "loss": 1.8174, | |
| "num_input_tokens_seen": 372656, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.0784313725490196, | |
| "grad_norm": 0.9017401337623596, | |
| "learning_rate": 4.999105232872582e-05, | |
| "loss": 1.7304, | |
| "num_input_tokens_seen": 376048, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 1.0873440285204992, | |
| "grad_norm": 1.131372332572937, | |
| "learning_rate": 4.998885065874305e-05, | |
| "loss": 2.0501, | |
| "num_input_tokens_seen": 379472, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.0962566844919786, | |
| "grad_norm": 0.743751585483551, | |
| "learning_rate": 4.9986407122692504e-05, | |
| "loss": 1.6725, | |
| "num_input_tokens_seen": 382288, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 1.1051693404634582, | |
| "grad_norm": 1.2746849060058594, | |
| "learning_rate": 4.998372174422507e-05, | |
| "loss": 1.5424, | |
| "num_input_tokens_seen": 385392, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.1140819964349375, | |
| "grad_norm": 1.250909686088562, | |
| "learning_rate": 4.998079454933244e-05, | |
| "loss": 1.9679, | |
| "num_input_tokens_seen": 389200, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 1.1229946524064172, | |
| "grad_norm": 0.8632287979125977, | |
| "learning_rate": 4.99776255663468e-05, | |
| "loss": 1.2718, | |
| "num_input_tokens_seen": 391664, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.1319073083778965, | |
| "grad_norm": 0.773535966873169, | |
| "learning_rate": 4.997421482594059e-05, | |
| "loss": 1.3693, | |
| "num_input_tokens_seen": 394416, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 1.1408199643493762, | |
| "grad_norm": 1.104138731956482, | |
| "learning_rate": 4.997056236112625e-05, | |
| "loss": 1.9817, | |
| "num_input_tokens_seen": 399248, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.1497326203208555, | |
| "grad_norm": 0.7540408372879028, | |
| "learning_rate": 4.9966668207255826e-05, | |
| "loss": 1.2948, | |
| "num_input_tokens_seen": 402032, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 1.1586452762923352, | |
| "grad_norm": 0.9450183510780334, | |
| "learning_rate": 4.996253240202069e-05, | |
| "loss": 1.2707, | |
| "num_input_tokens_seen": 405296, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.1675579322638145, | |
| "grad_norm": 1.1226730346679688, | |
| "learning_rate": 4.9958154985451114e-05, | |
| "loss": 1.2088, | |
| "num_input_tokens_seen": 408400, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 1.1764705882352942, | |
| "grad_norm": 0.9736111760139465, | |
| "learning_rate": 4.995353599991595e-05, | |
| "loss": 1.4309, | |
| "num_input_tokens_seen": 412016, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.1853832442067735, | |
| "grad_norm": 0.9693507552146912, | |
| "learning_rate": 4.994867549012215e-05, | |
| "loss": 1.2743, | |
| "num_input_tokens_seen": 415504, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 1.1942959001782532, | |
| "grad_norm": 1.0443888902664185, | |
| "learning_rate": 4.99435735031144e-05, | |
| "loss": 1.1155, | |
| "num_input_tokens_seen": 418448, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.2032085561497325, | |
| "grad_norm": 1.0174163579940796, | |
| "learning_rate": 4.993823008827465e-05, | |
| "loss": 1.092, | |
| "num_input_tokens_seen": 421168, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 1.2121212121212122, | |
| "grad_norm": 0.7569769620895386, | |
| "learning_rate": 4.9932645297321555e-05, | |
| "loss": 0.9307, | |
| "num_input_tokens_seen": 423632, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.2210338680926915, | |
| "grad_norm": 0.7273694276809692, | |
| "learning_rate": 4.9926819184310103e-05, | |
| "loss": 0.9791, | |
| "num_input_tokens_seen": 426640, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 1.2299465240641712, | |
| "grad_norm": 0.952115535736084, | |
| "learning_rate": 4.9920751805631e-05, | |
| "loss": 1.1522, | |
| "num_input_tokens_seen": 430032, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.2388591800356505, | |
| "grad_norm": 1.1709868907928467, | |
| "learning_rate": 4.991444322001014e-05, | |
| "loss": 1.0973, | |
| "num_input_tokens_seen": 433008, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 1.2477718360071302, | |
| "grad_norm": 0.6561676263809204, | |
| "learning_rate": 4.99078934885081e-05, | |
| "loss": 1.0868, | |
| "num_input_tokens_seen": 436400, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.2566844919786098, | |
| "grad_norm": 0.8287897109985352, | |
| "learning_rate": 4.990110267451944e-05, | |
| "loss": 0.8352, | |
| "num_input_tokens_seen": 439248, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 1.2655971479500892, | |
| "grad_norm": 0.9313675165176392, | |
| "learning_rate": 4.989407084377218e-05, | |
| "loss": 0.8707, | |
| "num_input_tokens_seen": 442416, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.2745098039215685, | |
| "grad_norm": 0.9105520844459534, | |
| "learning_rate": 4.988679806432712e-05, | |
| "loss": 0.9153, | |
| "num_input_tokens_seen": 445616, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 1.2834224598930482, | |
| "grad_norm": 0.7386419773101807, | |
| "learning_rate": 4.9879284406577195e-05, | |
| "loss": 0.7514, | |
| "num_input_tokens_seen": 448528, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.2923351158645278, | |
| "grad_norm": 0.8464149236679077, | |
| "learning_rate": 4.98715299432468e-05, | |
| "loss": 0.897, | |
| "num_input_tokens_seen": 451664, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 1.3012477718360071, | |
| "grad_norm": 0.7016708254814148, | |
| "learning_rate": 4.986353474939106e-05, | |
| "loss": 0.9608, | |
| "num_input_tokens_seen": 455120, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.3101604278074865, | |
| "grad_norm": 0.7350292801856995, | |
| "learning_rate": 4.9855298902395134e-05, | |
| "loss": 0.8485, | |
| "num_input_tokens_seen": 458352, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 1.3190730837789661, | |
| "grad_norm": 0.657071053981781, | |
| "learning_rate": 4.9846822481973455e-05, | |
| "loss": 0.9055, | |
| "num_input_tokens_seen": 461488, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.3279857397504458, | |
| "grad_norm": 0.7406115531921387, | |
| "learning_rate": 4.9838105570168946e-05, | |
| "loss": 0.9068, | |
| "num_input_tokens_seen": 464848, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 1.3368983957219251, | |
| "grad_norm": 0.9874480962753296, | |
| "learning_rate": 4.982914825135224e-05, | |
| "loss": 1.0902, | |
| "num_input_tokens_seen": 468944, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.3458110516934045, | |
| "grad_norm": 0.7415845990180969, | |
| "learning_rate": 4.981995061222087e-05, | |
| "loss": 0.6795, | |
| "num_input_tokens_seen": 471312, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 1.3547237076648841, | |
| "grad_norm": 0.6649575233459473, | |
| "learning_rate": 4.98105127417984e-05, | |
| "loss": 0.6273, | |
| "num_input_tokens_seen": 474128, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.3636363636363638, | |
| "grad_norm": 1.0872315168380737, | |
| "learning_rate": 4.9800834731433596e-05, | |
| "loss": 0.5981, | |
| "num_input_tokens_seen": 476592, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 1.3725490196078431, | |
| "grad_norm": 0.7500861287117004, | |
| "learning_rate": 4.9790916674799526e-05, | |
| "loss": 1.014, | |
| "num_input_tokens_seen": 480240, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.3814616755793225, | |
| "grad_norm": 1.2134431600570679, | |
| "learning_rate": 4.9780758667892656e-05, | |
| "loss": 0.681, | |
| "num_input_tokens_seen": 483472, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 1.3903743315508021, | |
| "grad_norm": 0.8633726835250854, | |
| "learning_rate": 4.977036080903193e-05, | |
| "loss": 0.6929, | |
| "num_input_tokens_seen": 486768, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.3992869875222818, | |
| "grad_norm": 0.903477668762207, | |
| "learning_rate": 4.975972319885779e-05, | |
| "loss": 0.5834, | |
| "num_input_tokens_seen": 489392, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 1.4081996434937611, | |
| "grad_norm": 0.7039727568626404, | |
| "learning_rate": 4.974884594033123e-05, | |
| "loss": 0.7406, | |
| "num_input_tokens_seen": 492560, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 1.4171122994652405, | |
| "grad_norm": 0.9972723126411438, | |
| "learning_rate": 4.9737729138732805e-05, | |
| "loss": 0.5558, | |
| "num_input_tokens_seen": 495344, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 1.4260249554367201, | |
| "grad_norm": 1.2662111520767212, | |
| "learning_rate": 4.972637290166158e-05, | |
| "loss": 0.6374, | |
| "num_input_tokens_seen": 498128, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.4349376114081998, | |
| "grad_norm": 1.4038677215576172, | |
| "learning_rate": 4.97147773390341e-05, | |
| "loss": 0.8173, | |
| "num_input_tokens_seen": 501488, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 1.4438502673796791, | |
| "grad_norm": 0.730514407157898, | |
| "learning_rate": 4.9702942563083356e-05, | |
| "loss": 0.5782, | |
| "num_input_tokens_seen": 504272, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 1.4527629233511585, | |
| "grad_norm": 0.5917222499847412, | |
| "learning_rate": 4.969086868835765e-05, | |
| "loss": 0.4533, | |
| "num_input_tokens_seen": 506672, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 1.4616755793226381, | |
| "grad_norm": 0.49027279019355774, | |
| "learning_rate": 4.967855583171954e-05, | |
| "loss": 0.4866, | |
| "num_input_tokens_seen": 509232, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 1.4705882352941178, | |
| "grad_norm": 1.1144423484802246, | |
| "learning_rate": 4.9666004112344656e-05, | |
| "loss": 0.7116, | |
| "num_input_tokens_seen": 512528, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 1.4795008912655971, | |
| "grad_norm": 0.6267158389091492, | |
| "learning_rate": 4.965321365172057e-05, | |
| "loss": 0.576, | |
| "num_input_tokens_seen": 514896, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 1.4884135472370765, | |
| "grad_norm": 0.8494957089424133, | |
| "learning_rate": 4.9640184573645646e-05, | |
| "loss": 0.6064, | |
| "num_input_tokens_seen": 518384, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 1.4973262032085561, | |
| "grad_norm": 1.1032313108444214, | |
| "learning_rate": 4.962691700422778e-05, | |
| "loss": 0.8595, | |
| "num_input_tokens_seen": 522448, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.5026737967914439, | |
| "eval_loss": 0.5879648327827454, | |
| "eval_runtime": 4.2487, | |
| "eval_samples_per_second": 58.606, | |
| "eval_steps_per_second": 14.828, | |
| "num_input_tokens_seen": 524208, | |
| "step": 843 | |
| }, | |
| { | |
| "epoch": 1.5062388591800357, | |
| "grad_norm": 0.7947481274604797, | |
| "learning_rate": 4.9613411071883267e-05, | |
| "loss": 0.4532, | |
| "num_input_tokens_seen": 525264, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 1.5151515151515151, | |
| "grad_norm": 0.6550034284591675, | |
| "learning_rate": 4.959966690733544e-05, | |
| "loss": 0.7043, | |
| "num_input_tokens_seen": 528528, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.5240641711229945, | |
| "grad_norm": 1.126085877418518, | |
| "learning_rate": 4.958568464361353e-05, | |
| "loss": 0.6396, | |
| "num_input_tokens_seen": 531536, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 1.5329768270944741, | |
| "grad_norm": 0.6209072470664978, | |
| "learning_rate": 4.9571464416051294e-05, | |
| "loss": 0.5435, | |
| "num_input_tokens_seen": 534704, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 1.5418894830659537, | |
| "grad_norm": 0.5790075063705444, | |
| "learning_rate": 4.955700636228573e-05, | |
| "loss": 0.359, | |
| "num_input_tokens_seen": 537264, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 1.5508021390374331, | |
| "grad_norm": 0.9781410694122314, | |
| "learning_rate": 4.954231062225576e-05, | |
| "loss": 0.6823, | |
| "num_input_tokens_seen": 541328, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.5597147950089125, | |
| "grad_norm": 0.7598072290420532, | |
| "learning_rate": 4.9527377338200855e-05, | |
| "loss": 0.4973, | |
| "num_input_tokens_seen": 544496, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 1.5686274509803921, | |
| "grad_norm": 0.8549111485481262, | |
| "learning_rate": 4.951220665465964e-05, | |
| "loss": 0.6291, | |
| "num_input_tokens_seen": 547696, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 1.5775401069518717, | |
| "grad_norm": 0.7234603762626648, | |
| "learning_rate": 4.949679871846857e-05, | |
| "loss": 0.4632, | |
| "num_input_tokens_seen": 550416, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 1.5864527629233511, | |
| "grad_norm": 0.5888731479644775, | |
| "learning_rate": 4.948115367876043e-05, | |
| "loss": 0.5336, | |
| "num_input_tokens_seen": 553968, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 1.5953654188948305, | |
| "grad_norm": 0.8173357844352722, | |
| "learning_rate": 4.94652716869629e-05, | |
| "loss": 0.3634, | |
| "num_input_tokens_seen": 556656, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 1.6042780748663101, | |
| "grad_norm": 0.5093280673027039, | |
| "learning_rate": 4.944915289679716e-05, | |
| "loss": 0.3877, | |
| "num_input_tokens_seen": 559536, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.6131907308377897, | |
| "grad_norm": 0.9982839226722717, | |
| "learning_rate": 4.94327974642763e-05, | |
| "loss": 0.5395, | |
| "num_input_tokens_seen": 562704, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 1.6221033868092691, | |
| "grad_norm": 1.0210356712341309, | |
| "learning_rate": 4.94162055477039e-05, | |
| "loss": 0.5995, | |
| "num_input_tokens_seen": 566352, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 1.6310160427807485, | |
| "grad_norm": 1.2152962684631348, | |
| "learning_rate": 4.939937730767243e-05, | |
| "loss": 0.5234, | |
| "num_input_tokens_seen": 569584, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 1.6399286987522281, | |
| "grad_norm": 0.8112650513648987, | |
| "learning_rate": 4.9382312907061755e-05, | |
| "loss": 0.3781, | |
| "num_input_tokens_seen": 571824, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 1.6488413547237077, | |
| "grad_norm": 0.8025038838386536, | |
| "learning_rate": 4.9365012511037514e-05, | |
| "loss": 0.5397, | |
| "num_input_tokens_seen": 575248, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 1.6577540106951871, | |
| "grad_norm": 1.2283076047897339, | |
| "learning_rate": 4.934747628704952e-05, | |
| "loss": 0.4426, | |
| "num_input_tokens_seen": 578032, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 1.6666666666666665, | |
| "grad_norm": 0.8238184452056885, | |
| "learning_rate": 4.932970440483018e-05, | |
| "loss": 0.4614, | |
| "num_input_tokens_seen": 581744, | |
| "step": 935 | |
| }, | |
| { | |
| "epoch": 1.6755793226381461, | |
| "grad_norm": 0.7958811521530151, | |
| "learning_rate": 4.931169703639282e-05, | |
| "loss": 0.4136, | |
| "num_input_tokens_seen": 584880, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 1.6844919786096257, | |
| "grad_norm": 1.2087262868881226, | |
| "learning_rate": 4.929345435603003e-05, | |
| "loss": 0.4801, | |
| "num_input_tokens_seen": 587856, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 1.6934046345811051, | |
| "grad_norm": 0.7868252992630005, | |
| "learning_rate": 4.9274976540311956e-05, | |
| "loss": 0.5347, | |
| "num_input_tokens_seen": 590928, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.7023172905525845, | |
| "grad_norm": 0.9967821836471558, | |
| "learning_rate": 4.9256263768084635e-05, | |
| "loss": 0.37, | |
| "num_input_tokens_seen": 594096, | |
| "step": 955 | |
| }, | |
| { | |
| "epoch": 1.7112299465240641, | |
| "grad_norm": 0.8641761541366577, | |
| "learning_rate": 4.923731622046823e-05, | |
| "loss": 0.3977, | |
| "num_input_tokens_seen": 597136, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 1.7201426024955437, | |
| "grad_norm": 0.6801542639732361, | |
| "learning_rate": 4.9218134080855273e-05, | |
| "loss": 0.5575, | |
| "num_input_tokens_seen": 600912, | |
| "step": 965 | |
| }, | |
| { | |
| "epoch": 1.7290552584670231, | |
| "grad_norm": 0.9356634616851807, | |
| "learning_rate": 4.919871753490891e-05, | |
| "loss": 0.5977, | |
| "num_input_tokens_seen": 604240, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 1.7379679144385025, | |
| "grad_norm": 0.898560106754303, | |
| "learning_rate": 4.917906677056111e-05, | |
| "loss": 0.4074, | |
| "num_input_tokens_seen": 607248, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 1.7468805704099821, | |
| "grad_norm": 0.7507029175758362, | |
| "learning_rate": 4.9159181978010814e-05, | |
| "loss": 0.4812, | |
| "num_input_tokens_seen": 610736, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 1.7557932263814617, | |
| "grad_norm": 0.9444867372512817, | |
| "learning_rate": 4.9139063349722113e-05, | |
| "loss": 0.4682, | |
| "num_input_tokens_seen": 614128, | |
| "step": 985 | |
| }, | |
| { | |
| "epoch": 1.7647058823529411, | |
| "grad_norm": 0.9318161010742188, | |
| "learning_rate": 4.911871108042241e-05, | |
| "loss": 0.4571, | |
| "num_input_tokens_seen": 617232, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 1.7736185383244205, | |
| "grad_norm": 0.7218228578567505, | |
| "learning_rate": 4.909812536710048e-05, | |
| "loss": 0.5007, | |
| "num_input_tokens_seen": 620880, | |
| "step": 995 | |
| }, | |
| { | |
| "epoch": 1.7825311942959001, | |
| "grad_norm": 0.7028499841690063, | |
| "learning_rate": 4.9077306409004585e-05, | |
| "loss": 0.6674, | |
| "num_input_tokens_seen": 624368, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.7914438502673797, | |
| "grad_norm": 0.5062604546546936, | |
| "learning_rate": 4.9056254407640604e-05, | |
| "loss": 0.3413, | |
| "num_input_tokens_seen": 627152, | |
| "step": 1005 | |
| }, | |
| { | |
| "epoch": 1.8003565062388591, | |
| "grad_norm": 0.49366044998168945, | |
| "learning_rate": 4.903496956676998e-05, | |
| "loss": 0.3736, | |
| "num_input_tokens_seen": 629680, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 1.8092691622103387, | |
| "grad_norm": 0.6387802958488464, | |
| "learning_rate": 4.901345209240784e-05, | |
| "loss": 0.3377, | |
| "num_input_tokens_seen": 632848, | |
| "step": 1015 | |
| }, | |
| { | |
| "epoch": 1.8181818181818183, | |
| "grad_norm": 0.8644296526908875, | |
| "learning_rate": 4.8991702192820924e-05, | |
| "loss": 0.4588, | |
| "num_input_tokens_seen": 635920, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 1.8270944741532977, | |
| "grad_norm": 0.4941517114639282, | |
| "learning_rate": 4.896972007852563e-05, | |
| "loss": 0.3705, | |
| "num_input_tokens_seen": 639056, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 1.8360071301247771, | |
| "grad_norm": 0.5460651516914368, | |
| "learning_rate": 4.894750596228594e-05, | |
| "loss": 0.3389, | |
| "num_input_tokens_seen": 642192, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 1.8449197860962567, | |
| "grad_norm": 0.7782461643218994, | |
| "learning_rate": 4.8925060059111394e-05, | |
| "loss": 0.4158, | |
| "num_input_tokens_seen": 645488, | |
| "step": 1035 | |
| }, | |
| { | |
| "epoch": 1.8538324420677363, | |
| "grad_norm": 0.5338404178619385, | |
| "learning_rate": 4.890238258625496e-05, | |
| "loss": 0.3644, | |
| "num_input_tokens_seen": 648336, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 1.8627450980392157, | |
| "grad_norm": 0.8528239727020264, | |
| "learning_rate": 4.887947376321099e-05, | |
| "loss": 0.3682, | |
| "num_input_tokens_seen": 651696, | |
| "step": 1045 | |
| }, | |
| { | |
| "epoch": 1.8716577540106951, | |
| "grad_norm": 0.4754684865474701, | |
| "learning_rate": 4.885633381171304e-05, | |
| "loss": 0.3467, | |
| "num_input_tokens_seen": 654640, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.8805704099821747, | |
| "grad_norm": 0.9799590110778809, | |
| "learning_rate": 4.883296295573176e-05, | |
| "loss": 0.511, | |
| "num_input_tokens_seen": 658128, | |
| "step": 1055 | |
| }, | |
| { | |
| "epoch": 1.8894830659536543, | |
| "grad_norm": 0.6689459085464478, | |
| "learning_rate": 4.880936142147271e-05, | |
| "loss": 0.3246, | |
| "num_input_tokens_seen": 660848, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 1.8983957219251337, | |
| "grad_norm": 0.7261871099472046, | |
| "learning_rate": 4.878552943737418e-05, | |
| "loss": 0.2685, | |
| "num_input_tokens_seen": 663120, | |
| "step": 1065 | |
| }, | |
| { | |
| "epoch": 1.9073083778966131, | |
| "grad_norm": 0.7026433944702148, | |
| "learning_rate": 4.876146723410498e-05, | |
| "loss": 0.3756, | |
| "num_input_tokens_seen": 666288, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 1.9162210338680927, | |
| "grad_norm": 1.4159960746765137, | |
| "learning_rate": 4.873717504456219e-05, | |
| "loss": 0.3687, | |
| "num_input_tokens_seen": 669360, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 1.9251336898395723, | |
| "grad_norm": 0.7870906591415405, | |
| "learning_rate": 4.8712653103868916e-05, | |
| "loss": 0.2532, | |
| "num_input_tokens_seen": 671344, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 1.9340463458110517, | |
| "grad_norm": 0.8793025612831116, | |
| "learning_rate": 4.868790164937204e-05, | |
| "loss": 0.3925, | |
| "num_input_tokens_seen": 674672, | |
| "step": 1085 | |
| }, | |
| { | |
| "epoch": 1.9429590017825311, | |
| "grad_norm": 0.40374019742012024, | |
| "learning_rate": 4.8662920920639866e-05, | |
| "loss": 0.3251, | |
| "num_input_tokens_seen": 677968, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 1.9518716577540107, | |
| "grad_norm": 0.5041529536247253, | |
| "learning_rate": 4.8637711159459855e-05, | |
| "loss": 0.3022, | |
| "num_input_tokens_seen": 680560, | |
| "step": 1095 | |
| }, | |
| { | |
| "epoch": 1.9607843137254903, | |
| "grad_norm": 1.0466898679733276, | |
| "learning_rate": 4.8612272609836263e-05, | |
| "loss": 0.3464, | |
| "num_input_tokens_seen": 683824, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.9696969696969697, | |
| "grad_norm": 0.8734254240989685, | |
| "learning_rate": 4.858660551798778e-05, | |
| "loss": 0.4663, | |
| "num_input_tokens_seen": 687216, | |
| "step": 1105 | |
| }, | |
| { | |
| "epoch": 1.9786096256684491, | |
| "grad_norm": 0.589005172252655, | |
| "learning_rate": 4.856071013234513e-05, | |
| "loss": 0.3396, | |
| "num_input_tokens_seen": 690128, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 1.9875222816399287, | |
| "grad_norm": 0.570462167263031, | |
| "learning_rate": 4.85345867035487e-05, | |
| "loss": 0.3839, | |
| "num_input_tokens_seen": 693232, | |
| "step": 1115 | |
| }, | |
| { | |
| "epoch": 1.9964349376114083, | |
| "grad_norm": 0.9086877107620239, | |
| "learning_rate": 4.8508235484446095e-05, | |
| "loss": 0.4327, | |
| "num_input_tokens_seen": 696880, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 2.0035650623885917, | |
| "eval_loss": 0.37957677245140076, | |
| "eval_runtime": 4.2451, | |
| "eval_samples_per_second": 58.656, | |
| "eval_steps_per_second": 14.841, | |
| "num_input_tokens_seen": 699264, | |
| "step": 1124 | |
| }, | |
| { | |
| "epoch": 2.0053475935828877, | |
| "grad_norm": 0.9719306826591492, | |
| "learning_rate": 4.8481656730089695e-05, | |
| "loss": 0.4008, | |
| "num_input_tokens_seen": 700096, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 2.014260249554367, | |
| "grad_norm": 0.9481471180915833, | |
| "learning_rate": 4.8454850697734174e-05, | |
| "loss": 0.4113, | |
| "num_input_tokens_seen": 703360, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 2.0231729055258465, | |
| "grad_norm": 0.7257654666900635, | |
| "learning_rate": 4.842781764683403e-05, | |
| "loss": 0.3966, | |
| "num_input_tokens_seen": 706624, | |
| "step": 1135 | |
| }, | |
| { | |
| "epoch": 2.0320855614973263, | |
| "grad_norm": 0.8015730977058411, | |
| "learning_rate": 4.8400557839041064e-05, | |
| "loss": 0.3069, | |
| "num_input_tokens_seen": 709472, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 2.0409982174688057, | |
| "grad_norm": 0.43969354033470154, | |
| "learning_rate": 4.837307153820184e-05, | |
| "loss": 0.337, | |
| "num_input_tokens_seen": 713152, | |
| "step": 1145 | |
| }, | |
| { | |
| "epoch": 2.049910873440285, | |
| "grad_norm": 0.934760570526123, | |
| "learning_rate": 4.8345359010355155e-05, | |
| "loss": 0.3539, | |
| "num_input_tokens_seen": 716480, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 2.0588235294117645, | |
| "grad_norm": 0.4905712306499481, | |
| "learning_rate": 4.831742052372943e-05, | |
| "loss": 0.3069, | |
| "num_input_tokens_seen": 719104, | |
| "step": 1155 | |
| }, | |
| { | |
| "epoch": 2.0677361853832443, | |
| "grad_norm": 0.6868427395820618, | |
| "learning_rate": 4.828925634874014e-05, | |
| "loss": 0.3006, | |
| "num_input_tokens_seen": 722016, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 2.0766488413547237, | |
| "grad_norm": 0.6591427326202393, | |
| "learning_rate": 4.8260866757987177e-05, | |
| "loss": 0.2809, | |
| "num_input_tokens_seen": 725184, | |
| "step": 1165 | |
| }, | |
| { | |
| "epoch": 2.085561497326203, | |
| "grad_norm": 1.2832831144332886, | |
| "learning_rate": 4.823225202625226e-05, | |
| "loss": 0.3441, | |
| "num_input_tokens_seen": 728352, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 2.0944741532976825, | |
| "grad_norm": 0.7174959182739258, | |
| "learning_rate": 4.820341243049618e-05, | |
| "loss": 0.4048, | |
| "num_input_tokens_seen": 731712, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 2.1033868092691623, | |
| "grad_norm": 0.6431313157081604, | |
| "learning_rate": 4.8174348249856236e-05, | |
| "loss": 0.3201, | |
| "num_input_tokens_seen": 734880, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 2.1122994652406417, | |
| "grad_norm": 0.658487856388092, | |
| "learning_rate": 4.814505976564343e-05, | |
| "loss": 0.3509, | |
| "num_input_tokens_seen": 737728, | |
| "step": 1185 | |
| }, | |
| { | |
| "epoch": 2.121212121212121, | |
| "grad_norm": 0.7958409786224365, | |
| "learning_rate": 4.8115547261339824e-05, | |
| "loss": 0.3429, | |
| "num_input_tokens_seen": 741376, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 2.1301247771836005, | |
| "grad_norm": 0.6729584336280823, | |
| "learning_rate": 4.808581102259573e-05, | |
| "loss": 0.2909, | |
| "num_input_tokens_seen": 744256, | |
| "step": 1195 | |
| }, | |
| { | |
| "epoch": 2.1390374331550803, | |
| "grad_norm": 0.740015983581543, | |
| "learning_rate": 4.8055851337227006e-05, | |
| "loss": 0.2479, | |
| "num_input_tokens_seen": 746944, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 2.1479500891265597, | |
| "grad_norm": 0.5458919405937195, | |
| "learning_rate": 4.802566849521222e-05, | |
| "loss": 0.2943, | |
| "num_input_tokens_seen": 750272, | |
| "step": 1205 | |
| }, | |
| { | |
| "epoch": 2.156862745098039, | |
| "grad_norm": 0.508515477180481, | |
| "learning_rate": 4.799526278868987e-05, | |
| "loss": 0.2486, | |
| "num_input_tokens_seen": 753024, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 2.165775401069519, | |
| "grad_norm": 0.8448687791824341, | |
| "learning_rate": 4.796463451195554e-05, | |
| "loss": 0.388, | |
| "num_input_tokens_seen": 756576, | |
| "step": 1215 | |
| }, | |
| { | |
| "epoch": 2.1746880570409983, | |
| "grad_norm": 0.5762525200843811, | |
| "learning_rate": 4.7933783961459094e-05, | |
| "loss": 0.3068, | |
| "num_input_tokens_seen": 759680, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 2.1836007130124777, | |
| "grad_norm": 0.6639679670333862, | |
| "learning_rate": 4.790271143580174e-05, | |
| "loss": 0.331, | |
| "num_input_tokens_seen": 762880, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 2.192513368983957, | |
| "grad_norm": 0.5362179279327393, | |
| "learning_rate": 4.7871417235733196e-05, | |
| "loss": 0.2964, | |
| "num_input_tokens_seen": 765920, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 2.2014260249554365, | |
| "grad_norm": 0.5786792039871216, | |
| "learning_rate": 4.783990166414875e-05, | |
| "loss": 0.4138, | |
| "num_input_tokens_seen": 769728, | |
| "step": 1235 | |
| }, | |
| { | |
| "epoch": 2.2103386809269163, | |
| "grad_norm": 0.47215279936790466, | |
| "learning_rate": 4.780816502608632e-05, | |
| "loss": 0.3199, | |
| "num_input_tokens_seen": 772832, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 2.2192513368983957, | |
| "grad_norm": 0.4350599944591522, | |
| "learning_rate": 4.777620762872355e-05, | |
| "loss": 0.3148, | |
| "num_input_tokens_seen": 776352, | |
| "step": 1245 | |
| }, | |
| { | |
| "epoch": 2.228163992869875, | |
| "grad_norm": 0.6416548490524292, | |
| "learning_rate": 4.774402978137479e-05, | |
| "loss": 0.3055, | |
| "num_input_tokens_seen": 779456, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 2.237076648841355, | |
| "grad_norm": 0.2961161434650421, | |
| "learning_rate": 4.7711631795488096e-05, | |
| "loss": 0.2604, | |
| "num_input_tokens_seen": 782112, | |
| "step": 1255 | |
| }, | |
| { | |
| "epoch": 2.2459893048128343, | |
| "grad_norm": 0.5333968997001648, | |
| "learning_rate": 4.767901398464227e-05, | |
| "loss": 0.346, | |
| "num_input_tokens_seen": 784864, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 2.2549019607843137, | |
| "grad_norm": 0.7181191444396973, | |
| "learning_rate": 4.7646176664543763e-05, | |
| "loss": 0.2688, | |
| "num_input_tokens_seen": 787936, | |
| "step": 1265 | |
| }, | |
| { | |
| "epoch": 2.263814616755793, | |
| "grad_norm": 1.1632299423217773, | |
| "learning_rate": 4.761312015302367e-05, | |
| "loss": 0.2973, | |
| "num_input_tokens_seen": 790976, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 2.2727272727272725, | |
| "grad_norm": 1.0037575960159302, | |
| "learning_rate": 4.757984477003462e-05, | |
| "loss": 0.3304, | |
| "num_input_tokens_seen": 794016, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 2.2816399286987523, | |
| "grad_norm": 0.6830529570579529, | |
| "learning_rate": 4.7546350837647666e-05, | |
| "loss": 0.2141, | |
| "num_input_tokens_seen": 796864, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 2.2905525846702317, | |
| "grad_norm": 0.7043412327766418, | |
| "learning_rate": 4.7512638680049245e-05, | |
| "loss": 0.3195, | |
| "num_input_tokens_seen": 800096, | |
| "step": 1285 | |
| }, | |
| { | |
| "epoch": 2.299465240641711, | |
| "grad_norm": 0.6342535018920898, | |
| "learning_rate": 4.7478708623537956e-05, | |
| "loss": 0.2506, | |
| "num_input_tokens_seen": 803392, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 2.308377896613191, | |
| "grad_norm": 1.047386646270752, | |
| "learning_rate": 4.7444560996521415e-05, | |
| "loss": 0.3365, | |
| "num_input_tokens_seen": 806400, | |
| "step": 1295 | |
| }, | |
| { | |
| "epoch": 2.3172905525846703, | |
| "grad_norm": 1.372889518737793, | |
| "learning_rate": 4.741019612951312e-05, | |
| "loss": 0.4817, | |
| "num_input_tokens_seen": 809568, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.3262032085561497, | |
| "grad_norm": 0.4855256974697113, | |
| "learning_rate": 4.737561435512923e-05, | |
| "loss": 0.2226, | |
| "num_input_tokens_seen": 812768, | |
| "step": 1305 | |
| }, | |
| { | |
| "epoch": 2.335115864527629, | |
| "grad_norm": 0.5740591287612915, | |
| "learning_rate": 4.734081600808531e-05, | |
| "loss": 0.2448, | |
| "num_input_tokens_seen": 815968, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 2.344028520499109, | |
| "grad_norm": 0.5068109631538391, | |
| "learning_rate": 4.7305801425193165e-05, | |
| "loss": 0.2175, | |
| "num_input_tokens_seen": 818976, | |
| "step": 1315 | |
| }, | |
| { | |
| "epoch": 2.3529411764705883, | |
| "grad_norm": 0.9766526818275452, | |
| "learning_rate": 4.727057094535749e-05, | |
| "loss": 0.2615, | |
| "num_input_tokens_seen": 821760, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 2.3618538324420677, | |
| "grad_norm": 0.5878629684448242, | |
| "learning_rate": 4.72351249095727e-05, | |
| "loss": 0.3121, | |
| "num_input_tokens_seen": 824288, | |
| "step": 1325 | |
| }, | |
| { | |
| "epoch": 2.370766488413547, | |
| "grad_norm": 0.8109356760978699, | |
| "learning_rate": 4.7199463660919514e-05, | |
| "loss": 0.3045, | |
| "num_input_tokens_seen": 827424, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 2.379679144385027, | |
| "grad_norm": 0.6713225245475769, | |
| "learning_rate": 4.7163587544561705e-05, | |
| "loss": 0.2503, | |
| "num_input_tokens_seen": 830176, | |
| "step": 1335 | |
| }, | |
| { | |
| "epoch": 2.3885918003565063, | |
| "grad_norm": 0.7476429343223572, | |
| "learning_rate": 4.7127496907742734e-05, | |
| "loss": 0.357, | |
| "num_input_tokens_seen": 833664, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 2.3975044563279857, | |
| "grad_norm": 1.1430628299713135, | |
| "learning_rate": 4.709119209978242e-05, | |
| "loss": 0.3525, | |
| "num_input_tokens_seen": 836736, | |
| "step": 1345 | |
| }, | |
| { | |
| "epoch": 2.406417112299465, | |
| "grad_norm": 0.5232317447662354, | |
| "learning_rate": 4.7054673472073506e-05, | |
| "loss": 0.3624, | |
| "num_input_tokens_seen": 840160, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 2.415329768270945, | |
| "grad_norm": 0.9793670773506165, | |
| "learning_rate": 4.7017941378078314e-05, | |
| "loss": 0.3082, | |
| "num_input_tokens_seen": 843168, | |
| "step": 1355 | |
| }, | |
| { | |
| "epoch": 2.4242424242424243, | |
| "grad_norm": 0.6311604380607605, | |
| "learning_rate": 4.698099617332528e-05, | |
| "loss": 0.2339, | |
| "num_input_tokens_seen": 845952, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 2.4331550802139037, | |
| "grad_norm": 0.9364222288131714, | |
| "learning_rate": 4.694383821540555e-05, | |
| "loss": 0.2302, | |
| "num_input_tokens_seen": 848448, | |
| "step": 1365 | |
| }, | |
| { | |
| "epoch": 2.442067736185383, | |
| "grad_norm": 1.2326656579971313, | |
| "learning_rate": 4.690646786396945e-05, | |
| "loss": 0.2639, | |
| "num_input_tokens_seen": 851552, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 2.450980392156863, | |
| "grad_norm": 0.7579092979431152, | |
| "learning_rate": 4.686888548072312e-05, | |
| "loss": 0.3276, | |
| "num_input_tokens_seen": 854752, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 2.4598930481283423, | |
| "grad_norm": 0.9993529915809631, | |
| "learning_rate": 4.683109142942492e-05, | |
| "loss": 0.2741, | |
| "num_input_tokens_seen": 857600, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 2.4688057040998217, | |
| "grad_norm": 0.5094732642173767, | |
| "learning_rate": 4.679308607588192e-05, | |
| "loss": 0.4073, | |
| "num_input_tokens_seen": 861248, | |
| "step": 1385 | |
| }, | |
| { | |
| "epoch": 2.477718360071301, | |
| "grad_norm": 0.6214059591293335, | |
| "learning_rate": 4.6754869787946386e-05, | |
| "loss": 0.3205, | |
| "num_input_tokens_seen": 865056, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 2.486631016042781, | |
| "grad_norm": 0.432815283536911, | |
| "learning_rate": 4.6716442935512214e-05, | |
| "loss": 0.2478, | |
| "num_input_tokens_seen": 867936, | |
| "step": 1395 | |
| }, | |
| { | |
| "epoch": 2.4955436720142603, | |
| "grad_norm": 0.5354329347610474, | |
| "learning_rate": 4.6677805890511354e-05, | |
| "loss": 0.2816, | |
| "num_input_tokens_seen": 871136, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.5044563279857397, | |
| "grad_norm": 0.5837387442588806, | |
| "learning_rate": 4.663895902691018e-05, | |
| "loss": 0.239, | |
| "num_input_tokens_seen": 873600, | |
| "step": 1405 | |
| }, | |
| { | |
| "epoch": 2.5044563279857397, | |
| "eval_loss": 0.286673367023468, | |
| "eval_runtime": 4.2516, | |
| "eval_samples_per_second": 58.566, | |
| "eval_steps_per_second": 14.818, | |
| "num_input_tokens_seen": 873600, | |
| "step": 1405 | |
| }, | |
| { | |
| "epoch": 2.5133689839572195, | |
| "grad_norm": 0.48573535680770874, | |
| "learning_rate": 4.659990272070591e-05, | |
| "loss": 0.31, | |
| "num_input_tokens_seen": 877152, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 2.522281639928699, | |
| "grad_norm": 0.5476496815681458, | |
| "learning_rate": 4.656063734992294e-05, | |
| "loss": 0.2718, | |
| "num_input_tokens_seen": 880096, | |
| "step": 1415 | |
| }, | |
| { | |
| "epoch": 2.5311942959001783, | |
| "grad_norm": 0.5417474508285522, | |
| "learning_rate": 4.6521163294609196e-05, | |
| "loss": 0.2433, | |
| "num_input_tokens_seen": 882944, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 2.5401069518716577, | |
| "grad_norm": 0.7648299932479858, | |
| "learning_rate": 4.6481480936832444e-05, | |
| "loss": 0.3607, | |
| "num_input_tokens_seen": 886848, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 2.549019607843137, | |
| "grad_norm": 0.6219758987426758, | |
| "learning_rate": 4.644159066067662e-05, | |
| "loss": 0.2771, | |
| "num_input_tokens_seen": 890272, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 2.557932263814617, | |
| "grad_norm": 0.6586949825286865, | |
| "learning_rate": 4.640149285223806e-05, | |
| "loss": 0.2683, | |
| "num_input_tokens_seen": 893600, | |
| "step": 1435 | |
| }, | |
| { | |
| "epoch": 2.5668449197860963, | |
| "grad_norm": 1.156497836112976, | |
| "learning_rate": 4.636118789962184e-05, | |
| "loss": 0.2513, | |
| "num_input_tokens_seen": 896448, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 2.5757575757575757, | |
| "grad_norm": 0.6117565631866455, | |
| "learning_rate": 4.632067619293795e-05, | |
| "loss": 0.2491, | |
| "num_input_tokens_seen": 899424, | |
| "step": 1445 | |
| }, | |
| { | |
| "epoch": 2.5846702317290555, | |
| "grad_norm": 0.6213181614875793, | |
| "learning_rate": 4.6279958124297554e-05, | |
| "loss": 0.2476, | |
| "num_input_tokens_seen": 902624, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 2.593582887700535, | |
| "grad_norm": 0.8394727110862732, | |
| "learning_rate": 4.623903408780916e-05, | |
| "loss": 0.2327, | |
| "num_input_tokens_seen": 905568, | |
| "step": 1455 | |
| }, | |
| { | |
| "epoch": 2.6024955436720143, | |
| "grad_norm": 0.65825355052948, | |
| "learning_rate": 4.619790447957488e-05, | |
| "loss": 0.321, | |
| "num_input_tokens_seen": 908960, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 2.6114081996434937, | |
| "grad_norm": 0.7782941460609436, | |
| "learning_rate": 4.615656969768649e-05, | |
| "loss": 0.2843, | |
| "num_input_tokens_seen": 912640, | |
| "step": 1465 | |
| }, | |
| { | |
| "epoch": 2.620320855614973, | |
| "grad_norm": 0.8492444157600403, | |
| "learning_rate": 4.611503014222168e-05, | |
| "loss": 0.2464, | |
| "num_input_tokens_seen": 915328, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 2.629233511586453, | |
| "grad_norm": 1.3704971075057983, | |
| "learning_rate": 4.6073286215240105e-05, | |
| "loss": 0.2942, | |
| "num_input_tokens_seen": 918656, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 2.6381461675579323, | |
| "grad_norm": 0.8433835506439209, | |
| "learning_rate": 4.6031338320779534e-05, | |
| "loss": 0.2215, | |
| "num_input_tokens_seen": 921344, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 2.6470588235294117, | |
| "grad_norm": 0.5805216431617737, | |
| "learning_rate": 4.598918686485193e-05, | |
| "loss": 0.2321, | |
| "num_input_tokens_seen": 924192, | |
| "step": 1485 | |
| }, | |
| { | |
| "epoch": 2.6559714795008915, | |
| "grad_norm": 0.4831686317920685, | |
| "learning_rate": 4.594683225543952e-05, | |
| "loss": 0.2957, | |
| "num_input_tokens_seen": 927424, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 2.664884135472371, | |
| "grad_norm": 0.7766821980476379, | |
| "learning_rate": 4.590427490249084e-05, | |
| "loss": 0.2587, | |
| "num_input_tokens_seen": 930080, | |
| "step": 1495 | |
| }, | |
| { | |
| "epoch": 2.6737967914438503, | |
| "grad_norm": 0.4486106038093567, | |
| "learning_rate": 4.5861515217916785e-05, | |
| "loss": 0.202, | |
| "num_input_tokens_seen": 932768, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.6827094474153297, | |
| "grad_norm": 0.43728289008140564, | |
| "learning_rate": 4.581855361558659e-05, | |
| "loss": 0.2685, | |
| "num_input_tokens_seen": 935904, | |
| "step": 1505 | |
| }, | |
| { | |
| "epoch": 2.691622103386809, | |
| "grad_norm": 0.5914068222045898, | |
| "learning_rate": 4.577539051132386e-05, | |
| "loss": 0.2218, | |
| "num_input_tokens_seen": 938784, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 2.700534759358289, | |
| "grad_norm": 0.4907556176185608, | |
| "learning_rate": 4.573202632290252e-05, | |
| "loss": 0.2022, | |
| "num_input_tokens_seen": 941280, | |
| "step": 1515 | |
| }, | |
| { | |
| "epoch": 2.7094474153297683, | |
| "grad_norm": 0.7610965967178345, | |
| "learning_rate": 4.568846147004279e-05, | |
| "loss": 0.2046, | |
| "num_input_tokens_seen": 944672, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 2.7183600713012477, | |
| "grad_norm": 0.7069556713104248, | |
| "learning_rate": 4.5644696374407105e-05, | |
| "loss": 0.2896, | |
| "num_input_tokens_seen": 948032, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 2.7272727272727275, | |
| "grad_norm": 0.7775002121925354, | |
| "learning_rate": 4.560073145959602e-05, | |
| "loss": 0.322, | |
| "num_input_tokens_seen": 952000, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 2.736185383244207, | |
| "grad_norm": 0.5535850524902344, | |
| "learning_rate": 4.555656715114419e-05, | |
| "loss": 0.278, | |
| "num_input_tokens_seen": 955456, | |
| "step": 1535 | |
| }, | |
| { | |
| "epoch": 2.7450980392156863, | |
| "grad_norm": 0.6513121724128723, | |
| "learning_rate": 4.551220387651615e-05, | |
| "loss": 0.2629, | |
| "num_input_tokens_seen": 959232, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 2.7540106951871657, | |
| "grad_norm": 0.5215713977813721, | |
| "learning_rate": 4.546764206510221e-05, | |
| "loss": 0.2042, | |
| "num_input_tokens_seen": 962304, | |
| "step": 1545 | |
| }, | |
| { | |
| "epoch": 2.762923351158645, | |
| "grad_norm": 0.5402376651763916, | |
| "learning_rate": 4.542288214821433e-05, | |
| "loss": 0.213, | |
| "num_input_tokens_seen": 965344, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 2.771836007130125, | |
| "grad_norm": 1.1007705926895142, | |
| "learning_rate": 4.5377924559081946e-05, | |
| "loss": 0.1996, | |
| "num_input_tokens_seen": 968032, | |
| "step": 1555 | |
| }, | |
| { | |
| "epoch": 2.7807486631016043, | |
| "grad_norm": 0.5571001172065735, | |
| "learning_rate": 4.533276973284771e-05, | |
| "loss": 0.2281, | |
| "num_input_tokens_seen": 970624, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 2.7896613190730837, | |
| "grad_norm": 0.7429901361465454, | |
| "learning_rate": 4.528741810656336e-05, | |
| "loss": 0.2868, | |
| "num_input_tokens_seen": 973760, | |
| "step": 1565 | |
| }, | |
| { | |
| "epoch": 2.7985739750445635, | |
| "grad_norm": 0.3642044961452484, | |
| "learning_rate": 4.5241870119185426e-05, | |
| "loss": 0.2662, | |
| "num_input_tokens_seen": 976480, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 2.807486631016043, | |
| "grad_norm": 0.5374373197555542, | |
| "learning_rate": 4.519612621157103e-05, | |
| "loss": 0.241, | |
| "num_input_tokens_seen": 979328, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 2.8163992869875223, | |
| "grad_norm": 0.9241515398025513, | |
| "learning_rate": 4.515018682647359e-05, | |
| "loss": 0.2839, | |
| "num_input_tokens_seen": 982624, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 2.8253119429590017, | |
| "grad_norm": 0.6853222846984863, | |
| "learning_rate": 4.510405240853854e-05, | |
| "loss": 0.2158, | |
| "num_input_tokens_seen": 985664, | |
| "step": 1585 | |
| }, | |
| { | |
| "epoch": 2.834224598930481, | |
| "grad_norm": 0.5483903884887695, | |
| "learning_rate": 4.505772340429905e-05, | |
| "loss": 0.2571, | |
| "num_input_tokens_seen": 989024, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 2.843137254901961, | |
| "grad_norm": 0.4872891902923584, | |
| "learning_rate": 4.501120026217164e-05, | |
| "loss": 0.2331, | |
| "num_input_tokens_seen": 992160, | |
| "step": 1595 | |
| }, | |
| { | |
| "epoch": 2.8520499108734403, | |
| "grad_norm": 0.5892439484596252, | |
| "learning_rate": 4.496448343245192e-05, | |
| "loss": 0.2645, | |
| "num_input_tokens_seen": 995328, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 2.8609625668449197, | |
| "grad_norm": 0.6122104525566101, | |
| "learning_rate": 4.4917573367310184e-05, | |
| "loss": 0.3106, | |
| "num_input_tokens_seen": 999136, | |
| "step": 1605 | |
| }, | |
| { | |
| "epoch": 2.8698752228163995, | |
| "grad_norm": 0.657755970954895, | |
| "learning_rate": 4.4870470520787035e-05, | |
| "loss": 0.2123, | |
| "num_input_tokens_seen": 1001920, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 2.878787878787879, | |
| "grad_norm": 0.6398863196372986, | |
| "learning_rate": 4.482317534878901e-05, | |
| "loss": 0.385, | |
| "num_input_tokens_seen": 1005632, | |
| "step": 1615 | |
| }, | |
| { | |
| "epoch": 2.8877005347593583, | |
| "grad_norm": 0.9357530474662781, | |
| "learning_rate": 4.477568830908415e-05, | |
| "loss": 0.2565, | |
| "num_input_tokens_seen": 1009408, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 2.8966131907308377, | |
| "grad_norm": 0.767514705657959, | |
| "learning_rate": 4.4728009861297586e-05, | |
| "loss": 0.2551, | |
| "num_input_tokens_seen": 1012448, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 2.905525846702317, | |
| "grad_norm": 0.5800440311431885, | |
| "learning_rate": 4.468014046690707e-05, | |
| "loss": 0.2587, | |
| "num_input_tokens_seen": 1015616, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 2.914438502673797, | |
| "grad_norm": 0.487104207277298, | |
| "learning_rate": 4.463208058923851e-05, | |
| "loss": 0.2677, | |
| "num_input_tokens_seen": 1018944, | |
| "step": 1635 | |
| }, | |
| { | |
| "epoch": 2.9233511586452763, | |
| "grad_norm": 0.799360454082489, | |
| "learning_rate": 4.458383069346152e-05, | |
| "loss": 0.2031, | |
| "num_input_tokens_seen": 1021696, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 2.9322638146167557, | |
| "grad_norm": 0.5832977890968323, | |
| "learning_rate": 4.453539124658486e-05, | |
| "loss": 0.2505, | |
| "num_input_tokens_seen": 1024832, | |
| "step": 1645 | |
| }, | |
| { | |
| "epoch": 2.9411764705882355, | |
| "grad_norm": 0.7471289038658142, | |
| "learning_rate": 4.4486762717451975e-05, | |
| "loss": 0.2521, | |
| "num_input_tokens_seen": 1027712, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 2.950089126559715, | |
| "grad_norm": 0.5479772090911865, | |
| "learning_rate": 4.443794557673641e-05, | |
| "loss": 0.2542, | |
| "num_input_tokens_seen": 1031040, | |
| "step": 1655 | |
| }, | |
| { | |
| "epoch": 2.9590017825311943, | |
| "grad_norm": 0.5916025042533875, | |
| "learning_rate": 4.43889402969373e-05, | |
| "loss": 0.1892, | |
| "num_input_tokens_seen": 1033440, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 2.9679144385026737, | |
| "grad_norm": 0.7155612111091614, | |
| "learning_rate": 4.4339747352374726e-05, | |
| "loss": 0.2661, | |
| "num_input_tokens_seen": 1036864, | |
| "step": 1665 | |
| }, | |
| { | |
| "epoch": 2.976827094474153, | |
| "grad_norm": 0.4465028941631317, | |
| "learning_rate": 4.4290367219185206e-05, | |
| "loss": 0.2583, | |
| "num_input_tokens_seen": 1039808, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 2.985739750445633, | |
| "grad_norm": 0.5775701999664307, | |
| "learning_rate": 4.424080037531705e-05, | |
| "loss": 0.2162, | |
| "num_input_tokens_seen": 1043200, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 2.9946524064171123, | |
| "grad_norm": 0.49966952204704285, | |
| "learning_rate": 4.4191047300525704e-05, | |
| "loss": 0.1902, | |
| "num_input_tokens_seen": 1045504, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 3.0035650623885917, | |
| "grad_norm": 0.5228843092918396, | |
| "learning_rate": 4.414110847636916e-05, | |
| "loss": 0.196, | |
| "num_input_tokens_seen": 1047768, | |
| "step": 1685 | |
| }, | |
| { | |
| "epoch": 3.0053475935828877, | |
| "eval_loss": 0.2455865740776062, | |
| "eval_runtime": 4.252, | |
| "eval_samples_per_second": 58.561, | |
| "eval_steps_per_second": 14.817, | |
| "num_input_tokens_seen": 1048184, | |
| "step": 1686 | |
| }, | |
| { | |
| "epoch": 3.0124777183600715, | |
| "grad_norm": 0.3864419162273407, | |
| "learning_rate": 4.409098438620326e-05, | |
| "loss": 0.1859, | |
| "num_input_tokens_seen": 1050456, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 3.021390374331551, | |
| "grad_norm": 0.7427952885627747, | |
| "learning_rate": 4.404067551517703e-05, | |
| "loss": 0.2342, | |
| "num_input_tokens_seen": 1053592, | |
| "step": 1695 | |
| }, | |
| { | |
| "epoch": 3.0303030303030303, | |
| "grad_norm": 0.8005133867263794, | |
| "learning_rate": 4.399018235022799e-05, | |
| "loss": 0.2547, | |
| "num_input_tokens_seen": 1056664, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 3.0392156862745097, | |
| "grad_norm": 0.42377611994743347, | |
| "learning_rate": 4.393950538007743e-05, | |
| "loss": 0.2227, | |
| "num_input_tokens_seen": 1059384, | |
| "step": 1705 | |
| }, | |
| { | |
| "epoch": 3.0481283422459895, | |
| "grad_norm": 0.4982529878616333, | |
| "learning_rate": 4.3888645095225675e-05, | |
| "loss": 0.1863, | |
| "num_input_tokens_seen": 1062168, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 3.057040998217469, | |
| "grad_norm": 0.9931812882423401, | |
| "learning_rate": 4.383760198794734e-05, | |
| "loss": 0.2083, | |
| "num_input_tokens_seen": 1064952, | |
| "step": 1715 | |
| }, | |
| { | |
| "epoch": 3.0659536541889483, | |
| "grad_norm": 0.6572649478912354, | |
| "learning_rate": 4.37863765522866e-05, | |
| "loss": 0.1863, | |
| "num_input_tokens_seen": 1067416, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 3.0748663101604277, | |
| "grad_norm": 0.6921285390853882, | |
| "learning_rate": 4.3734969284052345e-05, | |
| "loss": 0.2354, | |
| "num_input_tokens_seen": 1070552, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 3.0837789661319075, | |
| "grad_norm": 0.7747342586517334, | |
| "learning_rate": 4.368338068081343e-05, | |
| "loss": 0.3332, | |
| "num_input_tokens_seen": 1074136, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 3.092691622103387, | |
| "grad_norm": 1.056235432624817, | |
| "learning_rate": 4.3631611241893874e-05, | |
| "loss": 0.2396, | |
| "num_input_tokens_seen": 1077848, | |
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 3.1016042780748663, | |
| "grad_norm": 0.7865013480186462, | |
| "learning_rate": 4.3579661468367924e-05, | |
| "loss": 0.2057, | |
| "num_input_tokens_seen": 1080664, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 3.1105169340463457, | |
| "grad_norm": 0.6681080460548401, | |
| "learning_rate": 4.352753186305536e-05, | |
| "loss": 0.2823, | |
| "num_input_tokens_seen": 1083992, | |
| "step": 1745 | |
| }, | |
| { | |
| "epoch": 3.1194295900178255, | |
| "grad_norm": 0.4991186559200287, | |
| "learning_rate": 4.347522293051648e-05, | |
| "loss": 0.2609, | |
| "num_input_tokens_seen": 1087800, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 3.128342245989305, | |
| "grad_norm": 0.5108634829521179, | |
| "learning_rate": 4.3422735177047324e-05, | |
| "loss": 0.2318, | |
| "num_input_tokens_seen": 1090776, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 3.1372549019607843, | |
| "grad_norm": 1.343435525894165, | |
| "learning_rate": 4.337006911067473e-05, | |
| "loss": 0.2593, | |
| "num_input_tokens_seen": 1093624, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 3.1461675579322637, | |
| "grad_norm": 0.7029876708984375, | |
| "learning_rate": 4.331722524115139e-05, | |
| "loss": 0.1993, | |
| "num_input_tokens_seen": 1096472, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 3.1550802139037435, | |
| "grad_norm": 0.5673936605453491, | |
| "learning_rate": 4.3264204079950975e-05, | |
| "loss": 0.2703, | |
| "num_input_tokens_seen": 1099736, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 3.163992869875223, | |
| "grad_norm": 0.49642717838287354, | |
| "learning_rate": 4.321100614026315e-05, | |
| "loss": 0.3485, | |
| "num_input_tokens_seen": 1103384, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 3.1729055258467023, | |
| "grad_norm": 0.7280632257461548, | |
| "learning_rate": 4.31576319369886e-05, | |
| "loss": 0.2451, | |
| "num_input_tokens_seen": 1106520, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 3.1818181818181817, | |
| "grad_norm": 0.642463207244873, | |
| "learning_rate": 4.310408198673406e-05, | |
| "loss": 0.2062, | |
| "num_input_tokens_seen": 1109208, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 3.1907308377896615, | |
| "grad_norm": 0.7189128994941711, | |
| "learning_rate": 4.305035680780732e-05, | |
| "loss": 0.2478, | |
| "num_input_tokens_seen": 1112536, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 3.199643493761141, | |
| "grad_norm": 1.2781462669372559, | |
| "learning_rate": 4.299645692021221e-05, | |
| "loss": 0.2381, | |
| "num_input_tokens_seen": 1115992, | |
| "step": 1795 | |
| }, | |
| { | |
| "epoch": 3.2085561497326203, | |
| "grad_norm": 0.598044753074646, | |
| "learning_rate": 4.294238284564354e-05, | |
| "loss": 0.2208, | |
| "num_input_tokens_seen": 1119192, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 3.2174688057040997, | |
| "grad_norm": 0.6014571189880371, | |
| "learning_rate": 4.2888135107482067e-05, | |
| "loss": 0.2393, | |
| "num_input_tokens_seen": 1122552, | |
| "step": 1805 | |
| }, | |
| { | |
| "epoch": 3.2263814616755795, | |
| "grad_norm": 0.8126239776611328, | |
| "learning_rate": 4.283371423078945e-05, | |
| "loss": 0.2321, | |
| "num_input_tokens_seen": 1126072, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 3.235294117647059, | |
| "grad_norm": 0.6001937985420227, | |
| "learning_rate": 4.277912074230312e-05, | |
| "loss": 0.1901, | |
| "num_input_tokens_seen": 1128792, | |
| "step": 1815 | |
| }, | |
| { | |
| "epoch": 3.2442067736185383, | |
| "grad_norm": 0.6077953577041626, | |
| "learning_rate": 4.272435517043125e-05, | |
| "loss": 0.2166, | |
| "num_input_tokens_seen": 1132152, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 3.2531194295900177, | |
| "grad_norm": 0.38485997915267944, | |
| "learning_rate": 4.2669418045247576e-05, | |
| "loss": 0.2028, | |
| "num_input_tokens_seen": 1135064, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 3.2620320855614975, | |
| "grad_norm": 0.5066972970962524, | |
| "learning_rate": 4.2614309898486297e-05, | |
| "loss": 0.247, | |
| "num_input_tokens_seen": 1137976, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 3.270944741532977, | |
| "grad_norm": 0.5907444357872009, | |
| "learning_rate": 4.25590312635369e-05, | |
| "loss": 0.1952, | |
| "num_input_tokens_seen": 1141080, | |
| "step": 1835 | |
| }, | |
| { | |
| "epoch": 3.2798573975044563, | |
| "grad_norm": 0.6255643963813782, | |
| "learning_rate": 4.250358267543907e-05, | |
| "loss": 0.2124, | |
| "num_input_tokens_seen": 1144376, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 3.2887700534759357, | |
| "grad_norm": 0.9536407589912415, | |
| "learning_rate": 4.244796467087741e-05, | |
| "loss": 0.23, | |
| "num_input_tokens_seen": 1147224, | |
| "step": 1845 | |
| }, | |
| { | |
| "epoch": 3.2976827094474155, | |
| "grad_norm": 0.7920709252357483, | |
| "learning_rate": 4.2392177788176335e-05, | |
| "loss": 0.2005, | |
| "num_input_tokens_seen": 1150360, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 3.306595365418895, | |
| "grad_norm": 0.4633888602256775, | |
| "learning_rate": 4.2336222567294804e-05, | |
| "loss": 0.1962, | |
| "num_input_tokens_seen": 1153688, | |
| "step": 1855 | |
| }, | |
| { | |
| "epoch": 3.3155080213903743, | |
| "grad_norm": 0.384843111038208, | |
| "learning_rate": 4.228009954982112e-05, | |
| "loss": 0.2039, | |
| "num_input_tokens_seen": 1157016, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 3.3244206773618536, | |
| "grad_norm": 0.4141569435596466, | |
| "learning_rate": 4.22238092789677e-05, | |
| "loss": 0.2075, | |
| "num_input_tokens_seen": 1159768, | |
| "step": 1865 | |
| }, | |
| { | |
| "epoch": 3.3333333333333335, | |
| "grad_norm": 0.5076260566711426, | |
| "learning_rate": 4.2167352299565746e-05, | |
| "loss": 0.198, | |
| "num_input_tokens_seen": 1162520, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 3.342245989304813, | |
| "grad_norm": 0.6106960773468018, | |
| "learning_rate": 4.21107291580601e-05, | |
| "loss": 0.1931, | |
| "num_input_tokens_seen": 1165336, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 3.3511586452762923, | |
| "grad_norm": 0.49231547117233276, | |
| "learning_rate": 4.205394040250382e-05, | |
| "loss": 0.2574, | |
| "num_input_tokens_seen": 1168632, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 3.3600713012477716, | |
| "grad_norm": 0.5341747403144836, | |
| "learning_rate": 4.199698658255298e-05, | |
| "loss": 0.2002, | |
| "num_input_tokens_seen": 1171352, | |
| "step": 1885 | |
| }, | |
| { | |
| "epoch": 3.3689839572192515, | |
| "grad_norm": 0.5527672171592712, | |
| "learning_rate": 4.193986824946125e-05, | |
| "loss": 0.2148, | |
| "num_input_tokens_seen": 1174360, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 3.377896613190731, | |
| "grad_norm": 0.5493122935295105, | |
| "learning_rate": 4.188258595607468e-05, | |
| "loss": 0.2173, | |
| "num_input_tokens_seen": 1177368, | |
| "step": 1895 | |
| }, | |
| { | |
| "epoch": 3.3868092691622103, | |
| "grad_norm": 0.6076507568359375, | |
| "learning_rate": 4.182514025682625e-05, | |
| "loss": 0.2365, | |
| "num_input_tokens_seen": 1180824, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 3.3957219251336896, | |
| "grad_norm": 0.38345441222190857, | |
| "learning_rate": 4.176753170773052e-05, | |
| "loss": 0.237, | |
| "num_input_tokens_seen": 1183544, | |
| "step": 1905 | |
| }, | |
| { | |
| "epoch": 3.4046345811051695, | |
| "grad_norm": 0.8067929744720459, | |
| "learning_rate": 4.170976086637832e-05, | |
| "loss": 0.1945, | |
| "num_input_tokens_seen": 1185848, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 3.413547237076649, | |
| "grad_norm": 0.5404775142669678, | |
| "learning_rate": 4.1651828291931264e-05, | |
| "loss": 0.1856, | |
| "num_input_tokens_seen": 1189176, | |
| "step": 1915 | |
| }, | |
| { | |
| "epoch": 3.4224598930481283, | |
| "grad_norm": 0.6067723631858826, | |
| "learning_rate": 4.159373454511636e-05, | |
| "loss": 0.2464, | |
| "num_input_tokens_seen": 1192984, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 3.431372549019608, | |
| "grad_norm": 0.6056991815567017, | |
| "learning_rate": 4.1535480188220636e-05, | |
| "loss": 0.2909, | |
| "num_input_tokens_seen": 1196888, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 3.4402852049910875, | |
| "grad_norm": 0.7518835067749023, | |
| "learning_rate": 4.1477065785085634e-05, | |
| "loss": 0.2496, | |
| "num_input_tokens_seen": 1200792, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 3.449197860962567, | |
| "grad_norm": 0.41140249371528625, | |
| "learning_rate": 4.141849190110199e-05, | |
| "loss": 0.2267, | |
| "num_input_tokens_seen": 1203832, | |
| "step": 1935 | |
| }, | |
| { | |
| "epoch": 3.4581105169340463, | |
| "grad_norm": 0.44746679067611694, | |
| "learning_rate": 4.1359759103203935e-05, | |
| "loss": 0.215, | |
| "num_input_tokens_seen": 1207160, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 3.4670231729055256, | |
| "grad_norm": 0.7266998291015625, | |
| "learning_rate": 4.130086795986383e-05, | |
| "loss": 0.2169, | |
| "num_input_tokens_seen": 1210616, | |
| "step": 1945 | |
| }, | |
| { | |
| "epoch": 3.4759358288770055, | |
| "grad_norm": 0.5968104600906372, | |
| "learning_rate": 4.124181904108664e-05, | |
| "loss": 0.1875, | |
| "num_input_tokens_seen": 1213528, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 3.484848484848485, | |
| "grad_norm": 0.5463330149650574, | |
| "learning_rate": 4.1182612918404466e-05, | |
| "loss": 0.1969, | |
| "num_input_tokens_seen": 1216568, | |
| "step": 1955 | |
| }, | |
| { | |
| "epoch": 3.4937611408199643, | |
| "grad_norm": 0.6442824006080627, | |
| "learning_rate": 4.1123250164870955e-05, | |
| "loss": 0.3184, | |
| "num_input_tokens_seen": 1219896, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 3.502673796791444, | |
| "grad_norm": 0.701900064945221, | |
| "learning_rate": 4.1063731355055763e-05, | |
| "loss": 0.2079, | |
| "num_input_tokens_seen": 1222904, | |
| "step": 1965 | |
| }, | |
| { | |
| "epoch": 3.5062388591800357, | |
| "eval_loss": 0.22395405173301697, | |
| "eval_runtime": 4.2462, | |
| "eval_samples_per_second": 58.641, | |
| "eval_steps_per_second": 14.837, | |
| "num_input_tokens_seen": 1223864, | |
| "step": 1967 | |
| }, | |
| { | |
| "epoch": 3.5115864527629235, | |
| "grad_norm": 0.39802566170692444, | |
| "learning_rate": 4.100405706503904e-05, | |
| "loss": 0.158, | |
| "num_input_tokens_seen": 1225496, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 3.520499108734403, | |
| "grad_norm": 0.7380387783050537, | |
| "learning_rate": 4.094422787240581e-05, | |
| "loss": 0.1725, | |
| "num_input_tokens_seen": 1228280, | |
| "step": 1975 | |
| }, | |
| { | |
| "epoch": 3.5294117647058822, | |
| "grad_norm": 0.6759628653526306, | |
| "learning_rate": 4.088424435624038e-05, | |
| "loss": 0.2052, | |
| "num_input_tokens_seen": 1231288, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 3.5383244206773616, | |
| "grad_norm": 1.158799409866333, | |
| "learning_rate": 4.082410709712077e-05, | |
| "loss": 0.2018, | |
| "num_input_tokens_seen": 1234456, | |
| "step": 1985 | |
| }, | |
| { | |
| "epoch": 3.5472370766488415, | |
| "grad_norm": 0.7307495474815369, | |
| "learning_rate": 4.0763816677113064e-05, | |
| "loss": 0.2669, | |
| "num_input_tokens_seen": 1237912, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 3.556149732620321, | |
| "grad_norm": 0.9738561511039734, | |
| "learning_rate": 4.070337367976578e-05, | |
| "loss": 0.2444, | |
| "num_input_tokens_seen": 1240984, | |
| "step": 1995 | |
| }, | |
| { | |
| "epoch": 3.5650623885918002, | |
| "grad_norm": 0.5394619703292847, | |
| "learning_rate": 4.064277869010421e-05, | |
| "loss": 0.2265, | |
| "num_input_tokens_seen": 1244280, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 3.57397504456328, | |
| "grad_norm": 0.7028752565383911, | |
| "learning_rate": 4.058203229462482e-05, | |
| "loss": 0.2192, | |
| "num_input_tokens_seen": 1246904, | |
| "step": 2005 | |
| }, | |
| { | |
| "epoch": 3.5828877005347595, | |
| "grad_norm": 1.353464126586914, | |
| "learning_rate": 4.052113508128948e-05, | |
| "loss": 0.2313, | |
| "num_input_tokens_seen": 1249880, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 3.591800356506239, | |
| "grad_norm": 0.8846970796585083, | |
| "learning_rate": 4.0460087639519836e-05, | |
| "loss": 0.1889, | |
| "num_input_tokens_seen": 1252408, | |
| "step": 2015 | |
| }, | |
| { | |
| "epoch": 3.6007130124777182, | |
| "grad_norm": 1.0351589918136597, | |
| "learning_rate": 4.039889056019159e-05, | |
| "loss": 0.2567, | |
| "num_input_tokens_seen": 1255800, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 3.6096256684491976, | |
| "grad_norm": 0.6438773274421692, | |
| "learning_rate": 4.03375444356288e-05, | |
| "loss": 0.2018, | |
| "num_input_tokens_seen": 1259160, | |
| "step": 2025 | |
| }, | |
| { | |
| "epoch": 3.6185383244206775, | |
| "grad_norm": 0.8322818279266357, | |
| "learning_rate": 4.0276049859598084e-05, | |
| "loss": 0.2269, | |
| "num_input_tokens_seen": 1262488, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 3.627450980392157, | |
| "grad_norm": 0.5302309393882751, | |
| "learning_rate": 4.021440742730295e-05, | |
| "loss": 0.2032, | |
| "num_input_tokens_seen": 1265368, | |
| "step": 2035 | |
| }, | |
| { | |
| "epoch": 3.6363636363636362, | |
| "grad_norm": 0.8041933178901672, | |
| "learning_rate": 4.015261773537799e-05, | |
| "loss": 0.2316, | |
| "num_input_tokens_seen": 1269112, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 3.645276292335116, | |
| "grad_norm": 0.5872630476951599, | |
| "learning_rate": 4.009068138188311e-05, | |
| "loss": 0.2389, | |
| "num_input_tokens_seen": 1272408, | |
| "step": 2045 | |
| }, | |
| { | |
| "epoch": 3.6541889483065955, | |
| "grad_norm": 0.5462104678153992, | |
| "learning_rate": 4.002859896629776e-05, | |
| "loss": 0.1955, | |
| "num_input_tokens_seen": 1275640, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 3.663101604278075, | |
| "grad_norm": 0.7330032587051392, | |
| "learning_rate": 3.99663710895151e-05, | |
| "loss": 0.2116, | |
| "num_input_tokens_seen": 1278616, | |
| "step": 2055 | |
| }, | |
| { | |
| "epoch": 3.6720142602495542, | |
| "grad_norm": 0.5604473352432251, | |
| "learning_rate": 3.990399835383623e-05, | |
| "loss": 0.2285, | |
| "num_input_tokens_seen": 1281624, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 3.6809269162210336, | |
| "grad_norm": 0.49228572845458984, | |
| "learning_rate": 3.984148136296431e-05, | |
| "loss": 0.2026, | |
| "num_input_tokens_seen": 1284216, | |
| "step": 2065 | |
| }, | |
| { | |
| "epoch": 3.6898395721925135, | |
| "grad_norm": 0.8332962393760681, | |
| "learning_rate": 3.977882072199874e-05, | |
| "loss": 0.2028, | |
| "num_input_tokens_seen": 1286808, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 3.698752228163993, | |
| "grad_norm": 0.6717101335525513, | |
| "learning_rate": 3.971601703742932e-05, | |
| "loss": 0.2117, | |
| "num_input_tokens_seen": 1289944, | |
| "step": 2075 | |
| }, | |
| { | |
| "epoch": 3.7076648841354722, | |
| "grad_norm": 0.6963510513305664, | |
| "learning_rate": 3.965307091713037e-05, | |
| "loss": 0.1899, | |
| "num_input_tokens_seen": 1292856, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 3.716577540106952, | |
| "grad_norm": 0.771668553352356, | |
| "learning_rate": 3.95899829703548e-05, | |
| "loss": 0.2491, | |
| "num_input_tokens_seen": 1296792, | |
| "step": 2085 | |
| }, | |
| { | |
| "epoch": 3.7254901960784315, | |
| "grad_norm": 0.9969800710678101, | |
| "learning_rate": 3.9526753807728295e-05, | |
| "loss": 0.2512, | |
| "num_input_tokens_seen": 1299800, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 3.734402852049911, | |
| "grad_norm": 0.5737549066543579, | |
| "learning_rate": 3.946338404124334e-05, | |
| "loss": 0.1831, | |
| "num_input_tokens_seen": 1302648, | |
| "step": 2095 | |
| }, | |
| { | |
| "epoch": 3.7433155080213902, | |
| "grad_norm": 0.5544306039810181, | |
| "learning_rate": 3.939987428425331e-05, | |
| "loss": 0.1678, | |
| "num_input_tokens_seen": 1305016, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 3.7522281639928696, | |
| "grad_norm": 0.4125676155090332, | |
| "learning_rate": 3.933622515146658e-05, | |
| "loss": 0.1715, | |
| "num_input_tokens_seen": 1308024, | |
| "step": 2105 | |
| }, | |
| { | |
| "epoch": 3.7611408199643495, | |
| "grad_norm": 0.6266154646873474, | |
| "learning_rate": 3.9272437258940494e-05, | |
| "loss": 0.2112, | |
| "num_input_tokens_seen": 1310552, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 3.770053475935829, | |
| "grad_norm": 0.44769471883773804, | |
| "learning_rate": 3.9208511224075484e-05, | |
| "loss": 0.2325, | |
| "num_input_tokens_seen": 1313656, | |
| "step": 2115 | |
| }, | |
| { | |
| "epoch": 3.7789661319073082, | |
| "grad_norm": 0.5761722922325134, | |
| "learning_rate": 3.914444766560902e-05, | |
| "loss": 0.2712, | |
| "num_input_tokens_seen": 1316728, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 3.787878787878788, | |
| "grad_norm": 0.556746780872345, | |
| "learning_rate": 3.908024720360968e-05, | |
| "loss": 0.2286, | |
| "num_input_tokens_seen": 1320344, | |
| "step": 2125 | |
| }, | |
| { | |
| "epoch": 3.7967914438502675, | |
| "grad_norm": 0.45677894353866577, | |
| "learning_rate": 3.9015910459471126e-05, | |
| "loss": 0.196, | |
| "num_input_tokens_seen": 1323416, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 3.805704099821747, | |
| "grad_norm": 0.6750150322914124, | |
| "learning_rate": 3.8951438055906084e-05, | |
| "loss": 0.1779, | |
| "num_input_tokens_seen": 1326360, | |
| "step": 2135 | |
| }, | |
| { | |
| "epoch": 3.8146167557932262, | |
| "grad_norm": 0.9360057711601257, | |
| "learning_rate": 3.888683061694032e-05, | |
| "loss": 0.2523, | |
| "num_input_tokens_seen": 1329944, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 3.8235294117647056, | |
| "grad_norm": 0.4923909604549408, | |
| "learning_rate": 3.882208876790661e-05, | |
| "loss": 0.1995, | |
| "num_input_tokens_seen": 1333080, | |
| "step": 2145 | |
| }, | |
| { | |
| "epoch": 3.8324420677361855, | |
| "grad_norm": 0.6493288278579712, | |
| "learning_rate": 3.8757213135438655e-05, | |
| "loss": 0.1972, | |
| "num_input_tokens_seen": 1336504, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 3.841354723707665, | |
| "grad_norm": 0.5835461616516113, | |
| "learning_rate": 3.869220434746509e-05, | |
| "loss": 0.2229, | |
| "num_input_tokens_seen": 1339704, | |
| "step": 2155 | |
| }, | |
| { | |
| "epoch": 3.8502673796791442, | |
| "grad_norm": 0.6278809309005737, | |
| "learning_rate": 3.862706303320329e-05, | |
| "loss": 0.2137, | |
| "num_input_tokens_seen": 1343032, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 3.859180035650624, | |
| "grad_norm": 0.7989611625671387, | |
| "learning_rate": 3.856178982315342e-05, | |
| "loss": 0.2522, | |
| "num_input_tokens_seen": 1346104, | |
| "step": 2165 | |
| }, | |
| { | |
| "epoch": 3.8680926916221035, | |
| "grad_norm": 0.4888596534729004, | |
| "learning_rate": 3.849638534909219e-05, | |
| "loss": 0.1977, | |
| "num_input_tokens_seen": 1348984, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 3.877005347593583, | |
| "grad_norm": 0.590801477432251, | |
| "learning_rate": 3.843085024406686e-05, | |
| "loss": 0.2031, | |
| "num_input_tokens_seen": 1351480, | |
| "step": 2175 | |
| }, | |
| { | |
| "epoch": 3.8859180035650622, | |
| "grad_norm": 0.6255959868431091, | |
| "learning_rate": 3.836518514238903e-05, | |
| "loss": 0.2707, | |
| "num_input_tokens_seen": 1355448, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 3.8948306595365416, | |
| "grad_norm": 0.5446547269821167, | |
| "learning_rate": 3.8299390679628555e-05, | |
| "loss": 0.1831, | |
| "num_input_tokens_seen": 1358392, | |
| "step": 2185 | |
| }, | |
| { | |
| "epoch": 3.9037433155080214, | |
| "grad_norm": 0.5819702744483948, | |
| "learning_rate": 3.8233467492607354e-05, | |
| "loss": 0.2039, | |
| "num_input_tokens_seen": 1361368, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 3.912655971479501, | |
| "grad_norm": 0.5366934537887573, | |
| "learning_rate": 3.816741621939327e-05, | |
| "loss": 0.1955, | |
| "num_input_tokens_seen": 1364536, | |
| "step": 2195 | |
| }, | |
| { | |
| "epoch": 3.9215686274509802, | |
| "grad_norm": 1.1435610055923462, | |
| "learning_rate": 3.81012374992939e-05, | |
| "loss": 0.2049, | |
| "num_input_tokens_seen": 1367800, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 3.93048128342246, | |
| "grad_norm": 0.5551317930221558, | |
| "learning_rate": 3.803493197285036e-05, | |
| "loss": 0.2268, | |
| "num_input_tokens_seen": 1371224, | |
| "step": 2205 | |
| }, | |
| { | |
| "epoch": 3.9393939393939394, | |
| "grad_norm": 1.10652756690979, | |
| "learning_rate": 3.7968500281831146e-05, | |
| "loss": 0.1848, | |
| "num_input_tokens_seen": 1373944, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 3.948306595365419, | |
| "grad_norm": 0.9579757452011108, | |
| "learning_rate": 3.79019430692259e-05, | |
| "loss": 0.2114, | |
| "num_input_tokens_seen": 1377240, | |
| "step": 2215 | |
| }, | |
| { | |
| "epoch": 3.9572192513368982, | |
| "grad_norm": 0.42045828700065613, | |
| "learning_rate": 3.783526097923915e-05, | |
| "loss": 0.2034, | |
| "num_input_tokens_seen": 1380248, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 3.966131907308378, | |
| "grad_norm": 0.6384634375572205, | |
| "learning_rate": 3.7768454657284154e-05, | |
| "loss": 0.1566, | |
| "num_input_tokens_seen": 1382712, | |
| "step": 2225 | |
| }, | |
| { | |
| "epoch": 3.9750445632798574, | |
| "grad_norm": 0.9116731882095337, | |
| "learning_rate": 3.770152474997657e-05, | |
| "loss": 0.2102, | |
| "num_input_tokens_seen": 1385976, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 3.983957219251337, | |
| "grad_norm": 0.6810240149497986, | |
| "learning_rate": 3.763447190512824e-05, | |
| "loss": 0.2052, | |
| "num_input_tokens_seen": 1389624, | |
| "step": 2235 | |
| }, | |
| { | |
| "epoch": 3.9928698752228167, | |
| "grad_norm": 0.3541090488433838, | |
| "learning_rate": 3.7567296771740925e-05, | |
| "loss": 0.244, | |
| "num_input_tokens_seen": 1392728, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 4.001782531194296, | |
| "grad_norm": 1.0409997701644897, | |
| "learning_rate": 3.7500000000000003e-05, | |
| "loss": 0.2358, | |
| "num_input_tokens_seen": 1395704, | |
| "step": 2245 | |
| }, | |
| { | |
| "epoch": 4.007130124777183, | |
| "eval_loss": 0.21653257310390472, | |
| "eval_runtime": 4.2509, | |
| "eval_samples_per_second": 58.576, | |
| "eval_steps_per_second": 14.82, | |
| "num_input_tokens_seen": 1397624, | |
| "step": 2248 | |
| }, | |
| { | |
| "epoch": 4.010695187165775, | |
| "grad_norm": 0.5523825287818909, | |
| "learning_rate": 3.743258224126819e-05, | |
| "loss": 0.1735, | |
| "num_input_tokens_seen": 1398584, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 4.019607843137255, | |
| "grad_norm": 0.7276411652565002, | |
| "learning_rate": 3.736504414807922e-05, | |
| "loss": 0.1992, | |
| "num_input_tokens_seen": 1401784, | |
| "step": 2255 | |
| }, | |
| { | |
| "epoch": 4.028520499108734, | |
| "grad_norm": 0.36699721217155457, | |
| "learning_rate": 3.729738637413156e-05, | |
| "loss": 0.1728, | |
| "num_input_tokens_seen": 1404312, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 4.037433155080214, | |
| "grad_norm": 0.7663154006004333, | |
| "learning_rate": 3.722960957428203e-05, | |
| "loss": 0.1866, | |
| "num_input_tokens_seen": 1407352, | |
| "step": 2265 | |
| }, | |
| { | |
| "epoch": 4.046345811051693, | |
| "grad_norm": 0.4959503412246704, | |
| "learning_rate": 3.716171440453952e-05, | |
| "loss": 0.1823, | |
| "num_input_tokens_seen": 1410648, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 4.055258467023173, | |
| "grad_norm": 0.6325064897537231, | |
| "learning_rate": 3.709370152205863e-05, | |
| "loss": 0.1698, | |
| "num_input_tokens_seen": 1413816, | |
| "step": 2275 | |
| }, | |
| { | |
| "epoch": 4.064171122994653, | |
| "grad_norm": 0.4548736810684204, | |
| "learning_rate": 3.7025571585133254e-05, | |
| "loss": 0.1626, | |
| "num_input_tokens_seen": 1416024, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 4.073083778966132, | |
| "grad_norm": 0.3842249810695648, | |
| "learning_rate": 3.69573252531903e-05, | |
| "loss": 0.1929, | |
| "num_input_tokens_seen": 1419128, | |
| "step": 2285 | |
| }, | |
| { | |
| "epoch": 4.081996434937611, | |
| "grad_norm": 0.6341343522071838, | |
| "learning_rate": 3.6888963186783224e-05, | |
| "loss": 0.1625, | |
| "num_input_tokens_seen": 1421720, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 4.090909090909091, | |
| "grad_norm": 0.5091090798377991, | |
| "learning_rate": 3.682048604758567e-05, | |
| "loss": 0.1771, | |
| "num_input_tokens_seen": 1424632, | |
| "step": 2295 | |
| }, | |
| { | |
| "epoch": 4.09982174688057, | |
| "grad_norm": 0.24424993991851807, | |
| "learning_rate": 3.67518944983851e-05, | |
| "loss": 0.1739, | |
| "num_input_tokens_seen": 1427480, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 4.10873440285205, | |
| "grad_norm": 0.589100182056427, | |
| "learning_rate": 3.668318920307632e-05, | |
| "loss": 0.2092, | |
| "num_input_tokens_seen": 1430296, | |
| "step": 2305 | |
| }, | |
| { | |
| "epoch": 4.117647058823529, | |
| "grad_norm": 0.41250258684158325, | |
| "learning_rate": 3.6614370826655074e-05, | |
| "loss": 0.1714, | |
| "num_input_tokens_seen": 1432920, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 4.126559714795009, | |
| "grad_norm": 0.7590497136116028, | |
| "learning_rate": 3.654544003521164e-05, | |
| "loss": 0.2039, | |
| "num_input_tokens_seen": 1435544, | |
| "step": 2315 | |
| }, | |
| { | |
| "epoch": 4.135472370766489, | |
| "grad_norm": 0.8127907514572144, | |
| "learning_rate": 3.647639749592433e-05, | |
| "loss": 0.1583, | |
| "num_input_tokens_seen": 1438040, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 4.144385026737968, | |
| "grad_norm": 0.6445732712745667, | |
| "learning_rate": 3.640724387705308e-05, | |
| "loss": 0.2149, | |
| "num_input_tokens_seen": 1441528, | |
| "step": 2325 | |
| }, | |
| { | |
| "epoch": 4.153297682709447, | |
| "grad_norm": 0.44771522283554077, | |
| "learning_rate": 3.633797984793294e-05, | |
| "loss": 0.1543, | |
| "num_input_tokens_seen": 1444920, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 4.162210338680927, | |
| "grad_norm": 0.47167617082595825, | |
| "learning_rate": 3.626860607896764e-05, | |
| "loss": 0.2014, | |
| "num_input_tokens_seen": 1447896, | |
| "step": 2335 | |
| }, | |
| { | |
| "epoch": 4.171122994652406, | |
| "grad_norm": 0.49547502398490906, | |
| "learning_rate": 3.6199123241623046e-05, | |
| "loss": 0.2085, | |
| "num_input_tokens_seen": 1451256, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 4.180035650623886, | |
| "grad_norm": 0.5464377403259277, | |
| "learning_rate": 3.6129532008420715e-05, | |
| "loss": 0.1821, | |
| "num_input_tokens_seen": 1454136, | |
| "step": 2345 | |
| }, | |
| { | |
| "epoch": 4.188948306595365, | |
| "grad_norm": 0.44719406962394714, | |
| "learning_rate": 3.605983305293137e-05, | |
| "loss": 0.1703, | |
| "num_input_tokens_seen": 1456504, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 4.197860962566845, | |
| "grad_norm": 0.905034065246582, | |
| "learning_rate": 3.599002704976835e-05, | |
| "loss": 0.1734, | |
| "num_input_tokens_seen": 1459768, | |
| "step": 2355 | |
| }, | |
| { | |
| "epoch": 4.206773618538325, | |
| "grad_norm": 0.3426745533943176, | |
| "learning_rate": 3.592011467458113e-05, | |
| "loss": 0.1501, | |
| "num_input_tokens_seen": 1462392, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 4.215686274509804, | |
| "grad_norm": 1.105431318283081, | |
| "learning_rate": 3.585009660404873e-05, | |
| "loss": 0.2289, | |
| "num_input_tokens_seen": 1466040, | |
| "step": 2365 | |
| }, | |
| { | |
| "epoch": 4.224598930481283, | |
| "grad_norm": 0.6577187776565552, | |
| "learning_rate": 3.577997351587322e-05, | |
| "loss": 0.2166, | |
| "num_input_tokens_seen": 1469208, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 4.233511586452763, | |
| "grad_norm": 0.5719982981681824, | |
| "learning_rate": 3.5709746088773085e-05, | |
| "loss": 0.222, | |
| "num_input_tokens_seen": 1472536, | |
| "step": 2375 | |
| }, | |
| { | |
| "epoch": 4.242424242424242, | |
| "grad_norm": 0.4010562598705292, | |
| "learning_rate": 3.563941500247676e-05, | |
| "loss": 0.1836, | |
| "num_input_tokens_seen": 1475608, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 4.251336898395722, | |
| "grad_norm": 0.6845771074295044, | |
| "learning_rate": 3.5568980937715945e-05, | |
| "loss": 0.1762, | |
| "num_input_tokens_seen": 1479256, | |
| "step": 2385 | |
| }, | |
| { | |
| "epoch": 4.260249554367201, | |
| "grad_norm": 0.5753139853477478, | |
| "learning_rate": 3.54984445762191e-05, | |
| "loss": 0.2054, | |
| "num_input_tokens_seen": 1483064, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 4.269162210338681, | |
| "grad_norm": 0.586729109287262, | |
| "learning_rate": 3.5427806600704785e-05, | |
| "loss": 0.1733, | |
| "num_input_tokens_seen": 1485880, | |
| "step": 2395 | |
| }, | |
| { | |
| "epoch": 4.278074866310161, | |
| "grad_norm": 0.5614349842071533, | |
| "learning_rate": 3.535706769487509e-05, | |
| "loss": 0.1777, | |
| "num_input_tokens_seen": 1489208, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 4.28698752228164, | |
| "grad_norm": 0.6715386509895325, | |
| "learning_rate": 3.5286228543409004e-05, | |
| "loss": 0.1883, | |
| "num_input_tokens_seen": 1492216, | |
| "step": 2405 | |
| }, | |
| { | |
| "epoch": 4.295900178253119, | |
| "grad_norm": 0.5051096677780151, | |
| "learning_rate": 3.5215289831955786e-05, | |
| "loss": 0.2037, | |
| "num_input_tokens_seen": 1495960, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 4.304812834224599, | |
| "grad_norm": 0.8140228390693665, | |
| "learning_rate": 3.514425224712835e-05, | |
| "loss": 0.1892, | |
| "num_input_tokens_seen": 1498584, | |
| "step": 2415 | |
| }, | |
| { | |
| "epoch": 4.313725490196078, | |
| "grad_norm": 0.45702996850013733, | |
| "learning_rate": 3.507311647649657e-05, | |
| "loss": 0.179, | |
| "num_input_tokens_seen": 1501880, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 4.322638146167558, | |
| "grad_norm": 0.6330050230026245, | |
| "learning_rate": 3.5001883208580665e-05, | |
| "loss": 0.1901, | |
| "num_input_tokens_seen": 1505112, | |
| "step": 2425 | |
| }, | |
| { | |
| "epoch": 4.331550802139038, | |
| "grad_norm": 0.5689657330513, | |
| "learning_rate": 3.493055313284456e-05, | |
| "loss": 0.2295, | |
| "num_input_tokens_seen": 1507768, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 4.340463458110517, | |
| "grad_norm": 0.9648520946502686, | |
| "learning_rate": 3.485912693968913e-05, | |
| "loss": 0.2049, | |
| "num_input_tokens_seen": 1511224, | |
| "step": 2435 | |
| }, | |
| { | |
| "epoch": 4.349376114081997, | |
| "grad_norm": 0.4425726532936096, | |
| "learning_rate": 3.478760532044561e-05, | |
| "loss": 0.2032, | |
| "num_input_tokens_seen": 1514456, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 4.358288770053476, | |
| "grad_norm": 0.5605233311653137, | |
| "learning_rate": 3.471598896736881e-05, | |
| "loss": 0.207, | |
| "num_input_tokens_seen": 1517400, | |
| "step": 2445 | |
| }, | |
| { | |
| "epoch": 4.367201426024955, | |
| "grad_norm": 0.5907042622566223, | |
| "learning_rate": 3.464427857363052e-05, | |
| "loss": 0.2018, | |
| "num_input_tokens_seen": 1520664, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 4.376114081996435, | |
| "grad_norm": 0.8678156137466431, | |
| "learning_rate": 3.457247483331272e-05, | |
| "loss": 0.2408, | |
| "num_input_tokens_seen": 1523960, | |
| "step": 2455 | |
| }, | |
| { | |
| "epoch": 4.385026737967914, | |
| "grad_norm": 0.4271613359451294, | |
| "learning_rate": 3.4500578441400876e-05, | |
| "loss": 0.1568, | |
| "num_input_tokens_seen": 1526616, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 4.393939393939394, | |
| "grad_norm": 1.1846132278442383, | |
| "learning_rate": 3.4428590093777244e-05, | |
| "loss": 0.3417, | |
| "num_input_tokens_seen": 1530808, | |
| "step": 2465 | |
| }, | |
| { | |
| "epoch": 4.402852049910873, | |
| "grad_norm": 0.49708229303359985, | |
| "learning_rate": 3.43565104872141e-05, | |
| "loss": 0.1599, | |
| "num_input_tokens_seen": 1533336, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 4.411764705882353, | |
| "grad_norm": 0.35631561279296875, | |
| "learning_rate": 3.428434031936704e-05, | |
| "loss": 0.1646, | |
| "num_input_tokens_seen": 1535864, | |
| "step": 2475 | |
| }, | |
| { | |
| "epoch": 4.420677361853833, | |
| "grad_norm": 0.6264846324920654, | |
| "learning_rate": 3.421208028876815e-05, | |
| "loss": 0.2114, | |
| "num_input_tokens_seen": 1539192, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 4.429590017825312, | |
| "grad_norm": 0.3950527310371399, | |
| "learning_rate": 3.413973109481935e-05, | |
| "loss": 0.227, | |
| "num_input_tokens_seen": 1542712, | |
| "step": 2485 | |
| }, | |
| { | |
| "epoch": 4.438502673796791, | |
| "grad_norm": 0.7369870543479919, | |
| "learning_rate": 3.406729343778552e-05, | |
| "loss": 0.1871, | |
| "num_input_tokens_seen": 1545272, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 4.447415329768271, | |
| "grad_norm": 0.549528956413269, | |
| "learning_rate": 3.3994768018787815e-05, | |
| "loss": 0.3024, | |
| "num_input_tokens_seen": 1549464, | |
| "step": 2495 | |
| }, | |
| { | |
| "epoch": 4.45632798573975, | |
| "grad_norm": 0.5840650796890259, | |
| "learning_rate": 3.392215553979679e-05, | |
| "loss": 0.2244, | |
| "num_input_tokens_seen": 1552280, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 4.46524064171123, | |
| "grad_norm": 0.399300754070282, | |
| "learning_rate": 3.38494567036257e-05, | |
| "loss": 0.2032, | |
| "num_input_tokens_seen": 1555448, | |
| "step": 2505 | |
| }, | |
| { | |
| "epoch": 4.47415329768271, | |
| "grad_norm": 0.47554269433021545, | |
| "learning_rate": 3.3776672213923587e-05, | |
| "loss": 0.2211, | |
| "num_input_tokens_seen": 1559480, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 4.483065953654189, | |
| "grad_norm": 0.3855815827846527, | |
| "learning_rate": 3.370380277516858e-05, | |
| "loss": 0.1718, | |
| "num_input_tokens_seen": 1562872, | |
| "step": 2515 | |
| }, | |
| { | |
| "epoch": 4.491978609625669, | |
| "grad_norm": 0.5743004679679871, | |
| "learning_rate": 3.3630849092661e-05, | |
| "loss": 0.183, | |
| "num_input_tokens_seen": 1565752, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 4.500891265597148, | |
| "grad_norm": 0.527409553527832, | |
| "learning_rate": 3.355781187251657e-05, | |
| "loss": 0.1778, | |
| "num_input_tokens_seen": 1568600, | |
| "step": 2525 | |
| }, | |
| { | |
| "epoch": 4.508021390374331, | |
| "eval_loss": 0.2118549942970276, | |
| "eval_runtime": 4.2596, | |
| "eval_samples_per_second": 58.457, | |
| "eval_steps_per_second": 14.79, | |
| "num_input_tokens_seen": 1570936, | |
| "step": 2529 | |
| }, | |
| { | |
| "epoch": 4.509803921568627, | |
| "grad_norm": 0.39879217743873596, | |
| "learning_rate": 3.3484691821659584e-05, | |
| "loss": 0.1747, | |
| "num_input_tokens_seen": 1571512, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 4.518716577540107, | |
| "grad_norm": 0.5035882592201233, | |
| "learning_rate": 3.3411489647816016e-05, | |
| "loss": 0.1871, | |
| "num_input_tokens_seen": 1574232, | |
| "step": 2535 | |
| }, | |
| { | |
| "epoch": 4.527629233511586, | |
| "grad_norm": 1.1074864864349365, | |
| "learning_rate": 3.3338206059506736e-05, | |
| "loss": 0.2403, | |
| "num_input_tokens_seen": 1577816, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 4.536541889483066, | |
| "grad_norm": 0.8603164553642273, | |
| "learning_rate": 3.326484176604061e-05, | |
| "loss": 0.2662, | |
| "num_input_tokens_seen": 1581368, | |
| "step": 2545 | |
| }, | |
| { | |
| "epoch": 4.545454545454545, | |
| "grad_norm": 0.43185243010520935, | |
| "learning_rate": 3.3191397477507655e-05, | |
| "loss": 0.1828, | |
| "num_input_tokens_seen": 1583800, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 4.554367201426025, | |
| "grad_norm": 0.397795170545578, | |
| "learning_rate": 3.3117873904772123e-05, | |
| "loss": 0.206, | |
| "num_input_tokens_seen": 1587384, | |
| "step": 2555 | |
| }, | |
| { | |
| "epoch": 4.563279857397505, | |
| "grad_norm": 0.7756383419036865, | |
| "learning_rate": 3.30442717594657e-05, | |
| "loss": 0.1919, | |
| "num_input_tokens_seen": 1590328, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 4.572192513368984, | |
| "grad_norm": 0.7332653999328613, | |
| "learning_rate": 3.297059175398056e-05, | |
| "loss": 0.2376, | |
| "num_input_tokens_seen": 1594136, | |
| "step": 2565 | |
| }, | |
| { | |
| "epoch": 4.581105169340463, | |
| "grad_norm": 0.541881799697876, | |
| "learning_rate": 3.289683460146244e-05, | |
| "loss": 0.1923, | |
| "num_input_tokens_seen": 1597656, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 4.590017825311943, | |
| "grad_norm": 0.48139122128486633, | |
| "learning_rate": 3.282300101580386e-05, | |
| "loss": 0.198, | |
| "num_input_tokens_seen": 1600536, | |
| "step": 2575 | |
| }, | |
| { | |
| "epoch": 4.598930481283422, | |
| "grad_norm": 0.7859025001525879, | |
| "learning_rate": 3.274909171163706e-05, | |
| "loss": 0.1965, | |
| "num_input_tokens_seen": 1603832, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 4.607843137254902, | |
| "grad_norm": 0.8468954563140869, | |
| "learning_rate": 3.2675107404327194e-05, | |
| "loss": 0.1882, | |
| "num_input_tokens_seen": 1607480, | |
| "step": 2585 | |
| }, | |
| { | |
| "epoch": 4.616755793226382, | |
| "grad_norm": 0.6784586310386658, | |
| "learning_rate": 3.2601048809965355e-05, | |
| "loss": 0.187, | |
| "num_input_tokens_seen": 1610296, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 4.625668449197861, | |
| "grad_norm": 0.4848667085170746, | |
| "learning_rate": 3.2526916645361666e-05, | |
| "loss": 0.1797, | |
| "num_input_tokens_seen": 1613336, | |
| "step": 2595 | |
| }, | |
| { | |
| "epoch": 4.634581105169341, | |
| "grad_norm": 0.4509483575820923, | |
| "learning_rate": 3.2452711628038324e-05, | |
| "loss": 0.159, | |
| "num_input_tokens_seen": 1616152, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 4.64349376114082, | |
| "grad_norm": 0.9891667366027832, | |
| "learning_rate": 3.2378434476222666e-05, | |
| "loss": 0.2153, | |
| "num_input_tokens_seen": 1620024, | |
| "step": 2605 | |
| }, | |
| { | |
| "epoch": 4.652406417112299, | |
| "grad_norm": 0.45274657011032104, | |
| "learning_rate": 3.2304085908840244e-05, | |
| "loss": 0.1975, | |
| "num_input_tokens_seen": 1623544, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 4.661319073083779, | |
| "grad_norm": 0.5668216943740845, | |
| "learning_rate": 3.222966664550777e-05, | |
| "loss": 0.1748, | |
| "num_input_tokens_seen": 1626296, | |
| "step": 2615 | |
| }, | |
| { | |
| "epoch": 4.670231729055258, | |
| "grad_norm": 0.6975745558738708, | |
| "learning_rate": 3.2155177406526304e-05, | |
| "loss": 0.1868, | |
| "num_input_tokens_seen": 1629336, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 4.6791443850267385, | |
| "grad_norm": 0.7208099961280823, | |
| "learning_rate": 3.208061891287414e-05, | |
| "loss": 0.214, | |
| "num_input_tokens_seen": 1632888, | |
| "step": 2625 | |
| }, | |
| { | |
| "epoch": 4.688057040998218, | |
| "grad_norm": 0.41192349791526794, | |
| "learning_rate": 3.200599188619989e-05, | |
| "loss": 0.1753, | |
| "num_input_tokens_seen": 1635768, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 4.696969696969697, | |
| "grad_norm": 1.2426398992538452, | |
| "learning_rate": 3.1931297048815534e-05, | |
| "loss": 0.2339, | |
| "num_input_tokens_seen": 1639256, | |
| "step": 2635 | |
| }, | |
| { | |
| "epoch": 4.705882352941177, | |
| "grad_norm": 0.4843774735927582, | |
| "learning_rate": 3.185653512368933e-05, | |
| "loss": 0.2591, | |
| "num_input_tokens_seen": 1643128, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 4.714795008912656, | |
| "grad_norm": 0.6016537547111511, | |
| "learning_rate": 3.178170683443893e-05, | |
| "loss": 0.1748, | |
| "num_input_tokens_seen": 1646424, | |
| "step": 2645 | |
| }, | |
| { | |
| "epoch": 4.723707664884135, | |
| "grad_norm": 0.5028678178787231, | |
| "learning_rate": 3.1706812905324276e-05, | |
| "loss": 0.1844, | |
| "num_input_tokens_seen": 1649240, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 4.732620320855615, | |
| "grad_norm": 0.694146454334259, | |
| "learning_rate": 3.1631854061240684e-05, | |
| "loss": 0.1668, | |
| "num_input_tokens_seen": 1652184, | |
| "step": 2655 | |
| }, | |
| { | |
| "epoch": 4.741532976827094, | |
| "grad_norm": 0.6105802655220032, | |
| "learning_rate": 3.155683102771173e-05, | |
| "loss": 0.2189, | |
| "num_input_tokens_seen": 1655480, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 4.750445632798574, | |
| "grad_norm": 0.8289818167686462, | |
| "learning_rate": 3.1481744530882305e-05, | |
| "loss": 0.2437, | |
| "num_input_tokens_seen": 1659352, | |
| "step": 2665 | |
| }, | |
| { | |
| "epoch": 4.759358288770054, | |
| "grad_norm": 0.5131431221961975, | |
| "learning_rate": 3.1406595297511566e-05, | |
| "loss": 0.1756, | |
| "num_input_tokens_seen": 1661976, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 4.768270944741533, | |
| "grad_norm": 0.6698647737503052, | |
| "learning_rate": 3.133138405496587e-05, | |
| "loss": 0.1713, | |
| "num_input_tokens_seen": 1664504, | |
| "step": 2675 | |
| }, | |
| { | |
| "epoch": 4.777183600713013, | |
| "grad_norm": 0.5975663065910339, | |
| "learning_rate": 3.125611153121178e-05, | |
| "loss": 0.1763, | |
| "num_input_tokens_seen": 1667288, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 4.786096256684492, | |
| "grad_norm": 0.5346847772598267, | |
| "learning_rate": 3.118077845480897e-05, | |
| "loss": 0.1686, | |
| "num_input_tokens_seen": 1670360, | |
| "step": 2685 | |
| }, | |
| { | |
| "epoch": 4.795008912655971, | |
| "grad_norm": 0.5491595268249512, | |
| "learning_rate": 3.110538555490324e-05, | |
| "loss": 0.1884, | |
| "num_input_tokens_seen": 1673624, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 4.803921568627451, | |
| "grad_norm": 0.35313117504119873, | |
| "learning_rate": 3.1029933561219375e-05, | |
| "loss": 0.1675, | |
| "num_input_tokens_seen": 1676440, | |
| "step": 2695 | |
| }, | |
| { | |
| "epoch": 4.81283422459893, | |
| "grad_norm": 0.5857532024383545, | |
| "learning_rate": 3.095442320405418e-05, | |
| "loss": 0.1637, | |
| "num_input_tokens_seen": 1679448, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 4.8217468805704105, | |
| "grad_norm": 0.6775690913200378, | |
| "learning_rate": 3.0878855214269293e-05, | |
| "loss": 0.1642, | |
| "num_input_tokens_seen": 1682520, | |
| "step": 2705 | |
| }, | |
| { | |
| "epoch": 4.83065953654189, | |
| "grad_norm": 0.5732465386390686, | |
| "learning_rate": 3.0803230323284225e-05, | |
| "loss": 0.1834, | |
| "num_input_tokens_seen": 1685656, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 4.839572192513369, | |
| "grad_norm": 1.1239274740219116, | |
| "learning_rate": 3.0727549263069224e-05, | |
| "loss": 0.2211, | |
| "num_input_tokens_seen": 1688856, | |
| "step": 2715 | |
| }, | |
| { | |
| "epoch": 4.848484848484849, | |
| "grad_norm": 0.8710312247276306, | |
| "learning_rate": 3.065181276613817e-05, | |
| "loss": 0.1483, | |
| "num_input_tokens_seen": 1691768, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 4.857397504456328, | |
| "grad_norm": 0.28014299273490906, | |
| "learning_rate": 3.057602156554155e-05, | |
| "loss": 0.1538, | |
| "num_input_tokens_seen": 1694488, | |
| "step": 2725 | |
| }, | |
| { | |
| "epoch": 4.866310160427807, | |
| "grad_norm": 0.5496522784233093, | |
| "learning_rate": 3.0500176394859293e-05, | |
| "loss": 0.2051, | |
| "num_input_tokens_seen": 1697752, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 4.875222816399287, | |
| "grad_norm": 0.673943817615509, | |
| "learning_rate": 3.042427798819373e-05, | |
| "loss": 0.1897, | |
| "num_input_tokens_seen": 1700408, | |
| "step": 2735 | |
| }, | |
| { | |
| "epoch": 4.884135472370766, | |
| "grad_norm": 0.7624504566192627, | |
| "learning_rate": 3.0348327080162435e-05, | |
| "loss": 0.1842, | |
| "num_input_tokens_seen": 1703512, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 4.893048128342246, | |
| "grad_norm": 0.5836613774299622, | |
| "learning_rate": 3.0272324405891172e-05, | |
| "loss": 0.1811, | |
| "num_input_tokens_seen": 1707032, | |
| "step": 2745 | |
| }, | |
| { | |
| "epoch": 4.901960784313726, | |
| "grad_norm": 0.6330267190933228, | |
| "learning_rate": 3.0196270701006706e-05, | |
| "loss": 0.1925, | |
| "num_input_tokens_seen": 1710328, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 4.910873440285205, | |
| "grad_norm": 0.764445960521698, | |
| "learning_rate": 3.012016670162977e-05, | |
| "loss": 0.1888, | |
| "num_input_tokens_seen": 1712632, | |
| "step": 2755 | |
| }, | |
| { | |
| "epoch": 4.919786096256685, | |
| "grad_norm": 0.3074583113193512, | |
| "learning_rate": 3.0044013144367866e-05, | |
| "loss": 0.2241, | |
| "num_input_tokens_seen": 1716344, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 4.928698752228164, | |
| "grad_norm": 0.4822777509689331, | |
| "learning_rate": 2.996781076630816e-05, | |
| "loss": 0.1661, | |
| "num_input_tokens_seen": 1718712, | |
| "step": 2765 | |
| }, | |
| { | |
| "epoch": 4.937611408199643, | |
| "grad_norm": 0.56252521276474, | |
| "learning_rate": 2.9891560305010392e-05, | |
| "loss": 0.1863, | |
| "num_input_tokens_seen": 1722328, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 4.946524064171123, | |
| "grad_norm": 0.5701931118965149, | |
| "learning_rate": 2.9815262498499657e-05, | |
| "loss": 0.2022, | |
| "num_input_tokens_seen": 1725464, | |
| "step": 2775 | |
| }, | |
| { | |
| "epoch": 4.955436720142602, | |
| "grad_norm": 0.6118953227996826, | |
| "learning_rate": 2.9738918085259314e-05, | |
| "loss": 0.1703, | |
| "num_input_tokens_seen": 1728472, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 4.9643493761140824, | |
| "grad_norm": 0.43155810236930847, | |
| "learning_rate": 2.9662527804223827e-05, | |
| "loss": 0.1658, | |
| "num_input_tokens_seen": 1731160, | |
| "step": 2785 | |
| }, | |
| { | |
| "epoch": 4.973262032085562, | |
| "grad_norm": 0.622303307056427, | |
| "learning_rate": 2.9586092394771637e-05, | |
| "loss": 0.2174, | |
| "num_input_tokens_seen": 1734264, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 4.982174688057041, | |
| "grad_norm": 0.592126727104187, | |
| "learning_rate": 2.950961259671793e-05, | |
| "loss": 0.1573, | |
| "num_input_tokens_seen": 1737144, | |
| "step": 2795 | |
| }, | |
| { | |
| "epoch": 4.991087344028521, | |
| "grad_norm": 0.4473949372768402, | |
| "learning_rate": 2.943308915030757e-05, | |
| "loss": 0.1619, | |
| "num_input_tokens_seen": 1740664, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 1.4496628046035767, | |
| "learning_rate": 2.935652279620788e-05, | |
| "loss": 0.194, | |
| "num_input_tokens_seen": 1743216, | |
| "step": 2805 | |
| }, | |
| { | |
| "epoch": 5.008912655971479, | |
| "grad_norm": 0.5206677913665771, | |
| "learning_rate": 2.9279914275501473e-05, | |
| "loss": 0.2055, | |
| "num_input_tokens_seen": 1746384, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 5.008912655971479, | |
| "eval_loss": 0.19685669243335724, | |
| "eval_runtime": 4.2355, | |
| "eval_samples_per_second": 58.788, | |
| "eval_steps_per_second": 14.874, | |
| "num_input_tokens_seen": 1746384, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 5.017825311942959, | |
| "grad_norm": 0.46784770488739014, | |
| "learning_rate": 2.9203264329679115e-05, | |
| "loss": 0.1835, | |
| "num_input_tokens_seen": 1749680, | |
| "step": 2815 | |
| }, | |
| { | |
| "epoch": 5.026737967914438, | |
| "grad_norm": 0.9836930632591248, | |
| "learning_rate": 2.9126573700632504e-05, | |
| "loss": 0.1855, | |
| "num_input_tokens_seen": 1753104, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 5.035650623885918, | |
| "grad_norm": 0.48144713044166565, | |
| "learning_rate": 2.9049843130647112e-05, | |
| "loss": 0.1857, | |
| "num_input_tokens_seen": 1756112, | |
| "step": 2825 | |
| }, | |
| { | |
| "epoch": 5.044563279857398, | |
| "grad_norm": 0.49128931760787964, | |
| "learning_rate": 2.8973073362394998e-05, | |
| "loss": 0.1802, | |
| "num_input_tokens_seen": 1759344, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 5.053475935828877, | |
| "grad_norm": 0.4599247872829437, | |
| "learning_rate": 2.8896265138927638e-05, | |
| "loss": 0.1939, | |
| "num_input_tokens_seen": 1762288, | |
| "step": 2835 | |
| }, | |
| { | |
| "epoch": 5.062388591800357, | |
| "grad_norm": 0.4987725615501404, | |
| "learning_rate": 2.881941920366868e-05, | |
| "loss": 0.1583, | |
| "num_input_tokens_seen": 1765072, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 5.071301247771836, | |
| "grad_norm": 0.4939536452293396, | |
| "learning_rate": 2.8742536300406804e-05, | |
| "loss": 0.2022, | |
| "num_input_tokens_seen": 1767952, | |
| "step": 2845 | |
| }, | |
| { | |
| "epoch": 5.080213903743315, | |
| "grad_norm": 0.2937607765197754, | |
| "learning_rate": 2.8665617173288516e-05, | |
| "loss": 0.1696, | |
| "num_input_tokens_seen": 1770896, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 5.089126559714795, | |
| "grad_norm": 0.6866093277931213, | |
| "learning_rate": 2.8588662566810893e-05, | |
| "loss": 0.1683, | |
| "num_input_tokens_seen": 1773840, | |
| "step": 2855 | |
| }, | |
| { | |
| "epoch": 5.098039215686274, | |
| "grad_norm": 0.5026021003723145, | |
| "learning_rate": 2.851167322581445e-05, | |
| "loss": 0.1924, | |
| "num_input_tokens_seen": 1776720, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 5.106951871657754, | |
| "grad_norm": 0.5058155059814453, | |
| "learning_rate": 2.8434649895475877e-05, | |
| "loss": 0.1572, | |
| "num_input_tokens_seen": 1779088, | |
| "step": 2865 | |
| }, | |
| { | |
| "epoch": 5.115864527629234, | |
| "grad_norm": 0.47404804825782776, | |
| "learning_rate": 2.8357593321300856e-05, | |
| "loss": 0.1753, | |
| "num_input_tokens_seen": 1781776, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 5.124777183600713, | |
| "grad_norm": 0.5163501501083374, | |
| "learning_rate": 2.828050424911683e-05, | |
| "loss": 0.1685, | |
| "num_input_tokens_seen": 1784720, | |
| "step": 2875 | |
| }, | |
| { | |
| "epoch": 5.133689839572193, | |
| "grad_norm": 0.6680046319961548, | |
| "learning_rate": 2.8203383425065787e-05, | |
| "loss": 0.1854, | |
| "num_input_tokens_seen": 1787856, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 5.142602495543672, | |
| "grad_norm": 0.47441810369491577, | |
| "learning_rate": 2.812623159559704e-05, | |
| "loss": 0.1793, | |
| "num_input_tokens_seen": 1791088, | |
| "step": 2885 | |
| }, | |
| { | |
| "epoch": 5.151515151515151, | |
| "grad_norm": 0.4247751533985138, | |
| "learning_rate": 2.8049049507460003e-05, | |
| "loss": 0.2227, | |
| "num_input_tokens_seen": 1795056, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 5.160427807486631, | |
| "grad_norm": 0.4086715281009674, | |
| "learning_rate": 2.7971837907696973e-05, | |
| "loss": 0.2894, | |
| "num_input_tokens_seen": 1798928, | |
| "step": 2895 | |
| }, | |
| { | |
| "epoch": 5.16934046345811, | |
| "grad_norm": 0.48060083389282227, | |
| "learning_rate": 2.7894597543635863e-05, | |
| "loss": 0.1778, | |
| "num_input_tokens_seen": 1802384, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 5.17825311942959, | |
| "grad_norm": 0.5457305312156677, | |
| "learning_rate": 2.781732916288303e-05, | |
| "loss": 0.1873, | |
| "num_input_tokens_seen": 1805616, | |
| "step": 2905 | |
| }, | |
| { | |
| "epoch": 5.18716577540107, | |
| "grad_norm": 0.7138332724571228, | |
| "learning_rate": 2.774003351331597e-05, | |
| "loss": 0.1532, | |
| "num_input_tokens_seen": 1809008, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 5.196078431372549, | |
| "grad_norm": 0.5133665204048157, | |
| "learning_rate": 2.7662711343076135e-05, | |
| "loss": 0.1604, | |
| "num_input_tokens_seen": 1812784, | |
| "step": 2915 | |
| }, | |
| { | |
| "epoch": 5.204991087344029, | |
| "grad_norm": 0.48487603664398193, | |
| "learning_rate": 2.7585363400561658e-05, | |
| "loss": 0.155, | |
| "num_input_tokens_seen": 1815248, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 5.213903743315508, | |
| "grad_norm": 0.5267552137374878, | |
| "learning_rate": 2.7507990434420126e-05, | |
| "loss": 0.186, | |
| "num_input_tokens_seen": 1818032, | |
| "step": 2925 | |
| }, | |
| { | |
| "epoch": 5.222816399286987, | |
| "grad_norm": 0.45045390725135803, | |
| "learning_rate": 2.7430593193541325e-05, | |
| "loss": 0.1804, | |
| "num_input_tokens_seen": 1821232, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 5.231729055258467, | |
| "grad_norm": 0.5850667953491211, | |
| "learning_rate": 2.7353172427049995e-05, | |
| "loss": 0.2057, | |
| "num_input_tokens_seen": 1824784, | |
| "step": 2935 | |
| }, | |
| { | |
| "epoch": 5.240641711229946, | |
| "grad_norm": 0.4316384792327881, | |
| "learning_rate": 2.7275728884298596e-05, | |
| "loss": 0.1754, | |
| "num_input_tokens_seen": 1827088, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 5.249554367201426, | |
| "grad_norm": 0.350407212972641, | |
| "learning_rate": 2.719826331486e-05, | |
| "loss": 0.1627, | |
| "num_input_tokens_seen": 1829328, | |
| "step": 2945 | |
| }, | |
| { | |
| "epoch": 5.258467023172906, | |
| "grad_norm": 0.6626913547515869, | |
| "learning_rate": 2.7120776468520314e-05, | |
| "loss": 0.2147, | |
| "num_input_tokens_seen": 1833136, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 5.267379679144385, | |
| "grad_norm": 0.711764931678772, | |
| "learning_rate": 2.7043269095271573e-05, | |
| "loss": 0.185, | |
| "num_input_tokens_seen": 1835632, | |
| "step": 2955 | |
| }, | |
| { | |
| "epoch": 5.276292335115865, | |
| "grad_norm": 0.5972061157226562, | |
| "learning_rate": 2.6965741945304467e-05, | |
| "loss": 0.199, | |
| "num_input_tokens_seen": 1838992, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 5.285204991087344, | |
| "grad_norm": 0.9157897233963013, | |
| "learning_rate": 2.6888195769001146e-05, | |
| "loss": 0.1782, | |
| "num_input_tokens_seen": 1841840, | |
| "step": 2965 | |
| }, | |
| { | |
| "epoch": 5.294117647058823, | |
| "grad_norm": 0.4935537874698639, | |
| "learning_rate": 2.681063131692787e-05, | |
| "loss": 0.1843, | |
| "num_input_tokens_seen": 1844560, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 5.303030303030303, | |
| "grad_norm": 0.5020252466201782, | |
| "learning_rate": 2.673304933982783e-05, | |
| "loss": 0.1891, | |
| "num_input_tokens_seen": 1848624, | |
| "step": 2975 | |
| }, | |
| { | |
| "epoch": 5.311942959001782, | |
| "grad_norm": 0.5348985195159912, | |
| "learning_rate": 2.6655450588613806e-05, | |
| "loss": 0.1925, | |
| "num_input_tokens_seen": 1851952, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 5.320855614973262, | |
| "grad_norm": 0.42828452587127686, | |
| "learning_rate": 2.657783581436097e-05, | |
| "loss": 0.2381, | |
| "num_input_tokens_seen": 1855696, | |
| "step": 2985 | |
| }, | |
| { | |
| "epoch": 5.329768270944742, | |
| "grad_norm": 0.6298767328262329, | |
| "learning_rate": 2.6500205768299535e-05, | |
| "loss": 0.193, | |
| "num_input_tokens_seen": 1859408, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 5.338680926916221, | |
| "grad_norm": 0.6732975244522095, | |
| "learning_rate": 2.642256120180758e-05, | |
| "loss": 0.1508, | |
| "num_input_tokens_seen": 1861936, | |
| "step": 2995 | |
| }, | |
| { | |
| "epoch": 5.347593582887701, | |
| "grad_norm": 0.6173202991485596, | |
| "learning_rate": 2.6344902866403687e-05, | |
| "loss": 0.1724, | |
| "num_input_tokens_seen": 1864624, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 5.35650623885918, | |
| "grad_norm": 0.4392896890640259, | |
| "learning_rate": 2.6267231513739726e-05, | |
| "loss": 0.2092, | |
| "num_input_tokens_seen": 1867600, | |
| "step": 3005 | |
| }, | |
| { | |
| "epoch": 5.365418894830659, | |
| "grad_norm": 0.621001660823822, | |
| "learning_rate": 2.6189547895593562e-05, | |
| "loss": 0.1982, | |
| "num_input_tokens_seen": 1870672, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 5.374331550802139, | |
| "grad_norm": 0.5161955952644348, | |
| "learning_rate": 2.611185276386176e-05, | |
| "loss": 0.1923, | |
| "num_input_tokens_seen": 1874160, | |
| "step": 3015 | |
| }, | |
| { | |
| "epoch": 5.383244206773618, | |
| "grad_norm": 0.5126301050186157, | |
| "learning_rate": 2.6034146870552346e-05, | |
| "loss": 0.1906, | |
| "num_input_tokens_seen": 1877616, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 5.392156862745098, | |
| "grad_norm": 0.6807987093925476, | |
| "learning_rate": 2.595643096777748e-05, | |
| "loss": 0.1862, | |
| "num_input_tokens_seen": 1880432, | |
| "step": 3025 | |
| }, | |
| { | |
| "epoch": 5.401069518716578, | |
| "grad_norm": 0.6361598372459412, | |
| "learning_rate": 2.5878705807746245e-05, | |
| "loss": 0.2137, | |
| "num_input_tokens_seen": 1884528, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 5.409982174688057, | |
| "grad_norm": 0.6302884221076965, | |
| "learning_rate": 2.580097214275727e-05, | |
| "loss": 0.1688, | |
| "num_input_tokens_seen": 1887152, | |
| "step": 3035 | |
| }, | |
| { | |
| "epoch": 5.418894830659537, | |
| "grad_norm": 0.5410829186439514, | |
| "learning_rate": 2.5723230725191554e-05, | |
| "loss": 0.1772, | |
| "num_input_tokens_seen": 1890032, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 5.427807486631016, | |
| "grad_norm": 0.5092021822929382, | |
| "learning_rate": 2.5645482307505108e-05, | |
| "loss": 0.1677, | |
| "num_input_tokens_seen": 1892304, | |
| "step": 3045 | |
| }, | |
| { | |
| "epoch": 5.436720142602495, | |
| "grad_norm": 0.7809433937072754, | |
| "learning_rate": 2.55677276422217e-05, | |
| "loss": 0.1875, | |
| "num_input_tokens_seen": 1895728, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 5.445632798573975, | |
| "grad_norm": 0.43497583270072937, | |
| "learning_rate": 2.548996748192556e-05, | |
| "loss": 0.167, | |
| "num_input_tokens_seen": 1898384, | |
| "step": 3055 | |
| }, | |
| { | |
| "epoch": 5.454545454545454, | |
| "grad_norm": 0.36343979835510254, | |
| "learning_rate": 2.541220257925412e-05, | |
| "loss": 0.1719, | |
| "num_input_tokens_seen": 1901104, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 5.463458110516934, | |
| "grad_norm": 0.6379041075706482, | |
| "learning_rate": 2.5334433686890702e-05, | |
| "loss": 0.1879, | |
| "num_input_tokens_seen": 1904976, | |
| "step": 3065 | |
| }, | |
| { | |
| "epoch": 5.472370766488414, | |
| "grad_norm": 0.501068651676178, | |
| "learning_rate": 2.5256661557557247e-05, | |
| "loss": 0.1898, | |
| "num_input_tokens_seen": 1908688, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 5.481283422459893, | |
| "grad_norm": 0.4064844250679016, | |
| "learning_rate": 2.517888694400704e-05, | |
| "loss": 0.1471, | |
| "num_input_tokens_seen": 1911792, | |
| "step": 3075 | |
| }, | |
| { | |
| "epoch": 5.490196078431373, | |
| "grad_norm": 0.7375326156616211, | |
| "learning_rate": 2.5101110599017374e-05, | |
| "loss": 0.223, | |
| "num_input_tokens_seen": 1915248, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 5.499108734402852, | |
| "grad_norm": 0.7120162844657898, | |
| "learning_rate": 2.502333327538235e-05, | |
| "loss": 0.1666, | |
| "num_input_tokens_seen": 1918544, | |
| "step": 3085 | |
| }, | |
| { | |
| "epoch": 5.508021390374331, | |
| "grad_norm": 0.4658108353614807, | |
| "learning_rate": 2.4945555725905502e-05, | |
| "loss": 0.2039, | |
| "num_input_tokens_seen": 1922032, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 5.509803921568627, | |
| "eval_loss": 0.19006255269050598, | |
| "eval_runtime": 4.2606, | |
| "eval_samples_per_second": 58.442, | |
| "eval_steps_per_second": 14.787, | |
| "num_input_tokens_seen": 1922384, | |
| "step": 3091 | |
| }, | |
| { | |
| "epoch": 5.516934046345811, | |
| "grad_norm": 0.6522291898727417, | |
| "learning_rate": 2.4867778703392554e-05, | |
| "loss": 0.1586, | |
| "num_input_tokens_seen": 1924400, | |
| "step": 3095 | |
| }, | |
| { | |
| "epoch": 5.52584670231729, | |
| "grad_norm": 0.5256299376487732, | |
| "learning_rate": 2.479000296064417e-05, | |
| "loss": 0.2169, | |
| "num_input_tokens_seen": 1927376, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 5.53475935828877, | |
| "grad_norm": 0.5868116021156311, | |
| "learning_rate": 2.4712229250448567e-05, | |
| "loss": 0.1768, | |
| "num_input_tokens_seen": 1930352, | |
| "step": 3105 | |
| }, | |
| { | |
| "epoch": 5.54367201426025, | |
| "grad_norm": 0.6082111597061157, | |
| "learning_rate": 2.4634458325574323e-05, | |
| "loss": 0.2153, | |
| "num_input_tokens_seen": 1933680, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 5.552584670231729, | |
| "grad_norm": 0.5021962523460388, | |
| "learning_rate": 2.4556690938763062e-05, | |
| "loss": 0.1667, | |
| "num_input_tokens_seen": 1937488, | |
| "step": 3115 | |
| }, | |
| { | |
| "epoch": 5.561497326203209, | |
| "grad_norm": 0.5544887781143188, | |
| "learning_rate": 2.4478927842722154e-05, | |
| "loss": 0.1854, | |
| "num_input_tokens_seen": 1940368, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 5.570409982174688, | |
| "grad_norm": 0.6153222322463989, | |
| "learning_rate": 2.4401169790117427e-05, | |
| "loss": 0.1775, | |
| "num_input_tokens_seen": 1943728, | |
| "step": 3125 | |
| }, | |
| { | |
| "epoch": 5.579322638146167, | |
| "grad_norm": 0.7217985987663269, | |
| "learning_rate": 2.4323417533565916e-05, | |
| "loss": 0.1929, | |
| "num_input_tokens_seen": 1946832, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 5.588235294117647, | |
| "grad_norm": 0.5232107639312744, | |
| "learning_rate": 2.424567182562854e-05, | |
| "loss": 0.205, | |
| "num_input_tokens_seen": 1949904, | |
| "step": 3135 | |
| }, | |
| { | |
| "epoch": 5.597147950089127, | |
| "grad_norm": 0.5853015184402466, | |
| "learning_rate": 2.4167933418802837e-05, | |
| "loss": 0.1431, | |
| "num_input_tokens_seen": 1952432, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 5.606060606060606, | |
| "grad_norm": 0.7414368391036987, | |
| "learning_rate": 2.4090203065515695e-05, | |
| "loss": 0.1622, | |
| "num_input_tokens_seen": 1955216, | |
| "step": 3145 | |
| }, | |
| { | |
| "epoch": 5.614973262032086, | |
| "grad_norm": 0.4388047456741333, | |
| "learning_rate": 2.4012481518116022e-05, | |
| "loss": 0.1707, | |
| "num_input_tokens_seen": 1958096, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 5.623885918003565, | |
| "grad_norm": 0.5946722626686096, | |
| "learning_rate": 2.3934769528867513e-05, | |
| "loss": 0.198, | |
| "num_input_tokens_seen": 1961456, | |
| "step": 3155 | |
| }, | |
| { | |
| "epoch": 5.632798573975045, | |
| "grad_norm": 0.4028293192386627, | |
| "learning_rate": 2.385706784994135e-05, | |
| "loss": 0.162, | |
| "num_input_tokens_seen": 1964272, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 5.641711229946524, | |
| "grad_norm": 0.4915693700313568, | |
| "learning_rate": 2.3779377233408923e-05, | |
| "loss": 0.192, | |
| "num_input_tokens_seen": 1967120, | |
| "step": 3165 | |
| }, | |
| { | |
| "epoch": 5.650623885918003, | |
| "grad_norm": 0.4452253580093384, | |
| "learning_rate": 2.3701698431234528e-05, | |
| "loss": 0.1601, | |
| "num_input_tokens_seen": 1969872, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 5.659536541889483, | |
| "grad_norm": 0.5284585356712341, | |
| "learning_rate": 2.362403219526815e-05, | |
| "loss": 0.1605, | |
| "num_input_tokens_seen": 1972944, | |
| "step": 3175 | |
| }, | |
| { | |
| "epoch": 5.668449197860962, | |
| "grad_norm": 0.48784369230270386, | |
| "learning_rate": 2.3546379277238107e-05, | |
| "loss": 0.1533, | |
| "num_input_tokens_seen": 1975888, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 5.677361853832442, | |
| "grad_norm": 0.5844167470932007, | |
| "learning_rate": 2.3468740428743833e-05, | |
| "loss": 0.1903, | |
| "num_input_tokens_seen": 1979088, | |
| "step": 3185 | |
| }, | |
| { | |
| "epoch": 5.686274509803922, | |
| "grad_norm": 0.6798781752586365, | |
| "learning_rate": 2.339111640124859e-05, | |
| "loss": 0.171, | |
| "num_input_tokens_seen": 1981520, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 5.695187165775401, | |
| "grad_norm": 0.8696448802947998, | |
| "learning_rate": 2.3313507946072172e-05, | |
| "loss": 0.1648, | |
| "num_input_tokens_seen": 1984880, | |
| "step": 3195 | |
| }, | |
| { | |
| "epoch": 5.704099821746881, | |
| "grad_norm": 0.4180395007133484, | |
| "learning_rate": 2.323591581438365e-05, | |
| "loss": 0.1617, | |
| "num_input_tokens_seen": 1987440, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 5.71301247771836, | |
| "grad_norm": 0.6146518588066101, | |
| "learning_rate": 2.3158340757194116e-05, | |
| "loss": 0.1963, | |
| "num_input_tokens_seen": 1990640, | |
| "step": 3205 | |
| }, | |
| { | |
| "epoch": 5.721925133689839, | |
| "grad_norm": 0.8348390460014343, | |
| "learning_rate": 2.3080783525349388e-05, | |
| "loss": 0.1653, | |
| "num_input_tokens_seen": 1993808, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 5.730837789661319, | |
| "grad_norm": 0.7081406712532043, | |
| "learning_rate": 2.3003244869522743e-05, | |
| "loss": 0.1779, | |
| "num_input_tokens_seen": 1996688, | |
| "step": 3215 | |
| }, | |
| { | |
| "epoch": 5.739750445632799, | |
| "grad_norm": 0.5054243206977844, | |
| "learning_rate": 2.2925725540207688e-05, | |
| "loss": 0.1565, | |
| "num_input_tokens_seen": 1999696, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 5.748663101604278, | |
| "grad_norm": 0.5454304814338684, | |
| "learning_rate": 2.2848226287710645e-05, | |
| "loss": 0.1536, | |
| "num_input_tokens_seen": 2002032, | |
| "step": 3225 | |
| }, | |
| { | |
| "epoch": 5.757575757575758, | |
| "grad_norm": 0.6999877095222473, | |
| "learning_rate": 2.277074786214372e-05, | |
| "loss": 0.1683, | |
| "num_input_tokens_seen": 2005584, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 5.766488413547237, | |
| "grad_norm": 0.765386164188385, | |
| "learning_rate": 2.2693291013417453e-05, | |
| "loss": 0.1592, | |
| "num_input_tokens_seen": 2008176, | |
| "step": 3235 | |
| }, | |
| { | |
| "epoch": 5.775401069518717, | |
| "grad_norm": 0.7968612909317017, | |
| "learning_rate": 2.2615856491233513e-05, | |
| "loss": 0.3207, | |
| "num_input_tokens_seen": 2011376, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 5.784313725490196, | |
| "grad_norm": 0.3482127785682678, | |
| "learning_rate": 2.2538445045077488e-05, | |
| "loss": 0.1455, | |
| "num_input_tokens_seen": 2014224, | |
| "step": 3245 | |
| }, | |
| { | |
| "epoch": 5.793226381461675, | |
| "grad_norm": 0.5806959271430969, | |
| "learning_rate": 2.246105742421162e-05, | |
| "loss": 0.1741, | |
| "num_input_tokens_seen": 2016912, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 5.802139037433155, | |
| "grad_norm": 0.7654284834861755, | |
| "learning_rate": 2.2383694377667543e-05, | |
| "loss": 0.1575, | |
| "num_input_tokens_seen": 2020048, | |
| "step": 3255 | |
| }, | |
| { | |
| "epoch": 5.811051693404634, | |
| "grad_norm": 0.642106831073761, | |
| "learning_rate": 2.2306356654239012e-05, | |
| "loss": 0.1756, | |
| "num_input_tokens_seen": 2023216, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 5.819964349376114, | |
| "grad_norm": 0.43349790573120117, | |
| "learning_rate": 2.222904500247473e-05, | |
| "loss": 0.1924, | |
| "num_input_tokens_seen": 2026928, | |
| "step": 3265 | |
| }, | |
| { | |
| "epoch": 5.828877005347594, | |
| "grad_norm": 0.4377082884311676, | |
| "learning_rate": 2.2151760170671004e-05, | |
| "loss": 0.1696, | |
| "num_input_tokens_seen": 2029584, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 5.837789661319073, | |
| "grad_norm": 0.40771257877349854, | |
| "learning_rate": 2.207450290686458e-05, | |
| "loss": 0.1603, | |
| "num_input_tokens_seen": 2032720, | |
| "step": 3275 | |
| }, | |
| { | |
| "epoch": 5.846702317290553, | |
| "grad_norm": 0.5143370628356934, | |
| "learning_rate": 2.1997273958825375e-05, | |
| "loss": 0.1845, | |
| "num_input_tokens_seen": 2036176, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 5.855614973262032, | |
| "grad_norm": 0.5394704341888428, | |
| "learning_rate": 2.1920074074049225e-05, | |
| "loss": 0.1801, | |
| "num_input_tokens_seen": 2039632, | |
| "step": 3285 | |
| }, | |
| { | |
| "epoch": 5.864527629233511, | |
| "grad_norm": 0.6020737290382385, | |
| "learning_rate": 2.1842903999750665e-05, | |
| "loss": 0.1862, | |
| "num_input_tokens_seen": 2043184, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 5.873440285204991, | |
| "grad_norm": 0.7539795637130737, | |
| "learning_rate": 2.1765764482855715e-05, | |
| "loss": 0.1628, | |
| "num_input_tokens_seen": 2046416, | |
| "step": 3295 | |
| }, | |
| { | |
| "epoch": 5.882352941176471, | |
| "grad_norm": 0.6914777755737305, | |
| "learning_rate": 2.1688656269994612e-05, | |
| "loss": 0.1768, | |
| "num_input_tokens_seen": 2049008, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 5.89126559714795, | |
| "grad_norm": 1.2212262153625488, | |
| "learning_rate": 2.1611580107494597e-05, | |
| "loss": 0.1982, | |
| "num_input_tokens_seen": 2052656, | |
| "step": 3305 | |
| }, | |
| { | |
| "epoch": 5.90017825311943, | |
| "grad_norm": 0.5432605743408203, | |
| "learning_rate": 2.153453674137272e-05, | |
| "loss": 0.1885, | |
| "num_input_tokens_seen": 2055888, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 5.909090909090909, | |
| "grad_norm": 0.5268386006355286, | |
| "learning_rate": 2.1457526917328588e-05, | |
| "loss": 0.1492, | |
| "num_input_tokens_seen": 2059056, | |
| "step": 3315 | |
| }, | |
| { | |
| "epoch": 5.918003565062389, | |
| "grad_norm": 0.8248959183692932, | |
| "learning_rate": 2.1380551380737128e-05, | |
| "loss": 0.1755, | |
| "num_input_tokens_seen": 2062096, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 5.926916221033868, | |
| "grad_norm": 0.5520910024642944, | |
| "learning_rate": 2.130361087664145e-05, | |
| "loss": 0.1899, | |
| "num_input_tokens_seen": 2065168, | |
| "step": 3325 | |
| }, | |
| { | |
| "epoch": 5.935828877005347, | |
| "grad_norm": 0.5292351841926575, | |
| "learning_rate": 2.122670614974555e-05, | |
| "loss": 0.1983, | |
| "num_input_tokens_seen": 2067856, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 5.944741532976827, | |
| "grad_norm": 0.8153255581855774, | |
| "learning_rate": 2.1149837944407136e-05, | |
| "loss": 0.1517, | |
| "num_input_tokens_seen": 2071056, | |
| "step": 3335 | |
| }, | |
| { | |
| "epoch": 5.953654188948306, | |
| "grad_norm": 0.7868825197219849, | |
| "learning_rate": 2.107300700463045e-05, | |
| "loss": 0.193, | |
| "num_input_tokens_seen": 2074192, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 5.962566844919786, | |
| "grad_norm": 0.39180079102516174, | |
| "learning_rate": 2.0996214074059034e-05, | |
| "loss": 0.166, | |
| "num_input_tokens_seen": 2077040, | |
| "step": 3345 | |
| }, | |
| { | |
| "epoch": 5.971479500891266, | |
| "grad_norm": 0.5239204168319702, | |
| "learning_rate": 2.0919459895968517e-05, | |
| "loss": 0.1395, | |
| "num_input_tokens_seen": 2079312, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 5.980392156862745, | |
| "grad_norm": 0.4734959304332733, | |
| "learning_rate": 2.084274521325948e-05, | |
| "loss": 0.1701, | |
| "num_input_tokens_seen": 2082864, | |
| "step": 3355 | |
| }, | |
| { | |
| "epoch": 5.989304812834225, | |
| "grad_norm": 0.6230949759483337, | |
| "learning_rate": 2.0766070768450206e-05, | |
| "loss": 0.1928, | |
| "num_input_tokens_seen": 2085872, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 5.998217468805704, | |
| "grad_norm": 0.6036242246627808, | |
| "learning_rate": 2.0689437303669508e-05, | |
| "loss": 0.1673, | |
| "num_input_tokens_seen": 2088272, | |
| "step": 3365 | |
| }, | |
| { | |
| "epoch": 6.007130124777183, | |
| "grad_norm": 0.6001238822937012, | |
| "learning_rate": 2.0612845560649603e-05, | |
| "loss": 0.1752, | |
| "num_input_tokens_seen": 2091232, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 6.010695187165775, | |
| "eval_loss": 0.19044770300388336, | |
| "eval_runtime": 4.266, | |
| "eval_samples_per_second": 58.369, | |
| "eval_steps_per_second": 14.768, | |
| "num_input_tokens_seen": 2092320, | |
| "step": 3372 | |
| }, | |
| { | |
| "epoch": 6.016042780748663, | |
| "grad_norm": 0.9030793309211731, | |
| "learning_rate": 2.0536296280718825e-05, | |
| "loss": 0.1664, | |
| "num_input_tokens_seen": 2093952, | |
| "step": 3375 | |
| }, | |
| { | |
| "epoch": 6.024955436720143, | |
| "grad_norm": 0.6371573209762573, | |
| "learning_rate": 2.0459790204794545e-05, | |
| "loss": 0.1941, | |
| "num_input_tokens_seen": 2097728, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 6.033868092691622, | |
| "grad_norm": 0.4168316125869751, | |
| "learning_rate": 2.0383328073375955e-05, | |
| "loss": 0.2223, | |
| "num_input_tokens_seen": 2100736, | |
| "step": 3385 | |
| }, | |
| { | |
| "epoch": 6.042780748663102, | |
| "grad_norm": 0.8262919187545776, | |
| "learning_rate": 2.0306910626536926e-05, | |
| "loss": 0.1762, | |
| "num_input_tokens_seen": 2104032, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 6.051693404634581, | |
| "grad_norm": 0.482316255569458, | |
| "learning_rate": 2.0230538603918787e-05, | |
| "loss": 0.1594, | |
| "num_input_tokens_seen": 2107264, | |
| "step": 3395 | |
| }, | |
| { | |
| "epoch": 6.0606060606060606, | |
| "grad_norm": 1.0964471101760864, | |
| "learning_rate": 2.015421274472325e-05, | |
| "loss": 0.1881, | |
| "num_input_tokens_seen": 2110336, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 6.06951871657754, | |
| "grad_norm": 0.49298667907714844, | |
| "learning_rate": 2.0077933787705204e-05, | |
| "loss": 0.151, | |
| "num_input_tokens_seen": 2113248, | |
| "step": 3405 | |
| }, | |
| { | |
| "epoch": 6.078431372549019, | |
| "grad_norm": 0.6304886341094971, | |
| "learning_rate": 2.000170247116554e-05, | |
| "loss": 0.1657, | |
| "num_input_tokens_seen": 2116032, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 6.087344028520499, | |
| "grad_norm": 0.4530024230480194, | |
| "learning_rate": 1.9925519532944104e-05, | |
| "loss": 0.1692, | |
| "num_input_tokens_seen": 2118848, | |
| "step": 3415 | |
| }, | |
| { | |
| "epoch": 6.096256684491979, | |
| "grad_norm": 0.5926321744918823, | |
| "learning_rate": 1.9849385710412424e-05, | |
| "loss": 0.3085, | |
| "num_input_tokens_seen": 2122208, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 6.105169340463458, | |
| "grad_norm": 0.5866901874542236, | |
| "learning_rate": 1.977330174046667e-05, | |
| "loss": 0.1675, | |
| "num_input_tokens_seen": 2125248, | |
| "step": 3425 | |
| }, | |
| { | |
| "epoch": 6.114081996434938, | |
| "grad_norm": 0.35337719321250916, | |
| "learning_rate": 1.9697268359520506e-05, | |
| "loss": 0.2589, | |
| "num_input_tokens_seen": 2129248, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 6.122994652406417, | |
| "grad_norm": 0.4666219651699066, | |
| "learning_rate": 1.9621286303497915e-05, | |
| "loss": 0.1709, | |
| "num_input_tokens_seen": 2131904, | |
| "step": 3435 | |
| }, | |
| { | |
| "epoch": 6.1319073083778965, | |
| "grad_norm": 0.6858420372009277, | |
| "learning_rate": 1.954535630782612e-05, | |
| "loss": 0.183, | |
| "num_input_tokens_seen": 2135552, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 6.140819964349376, | |
| "grad_norm": 0.41474148631095886, | |
| "learning_rate": 1.9469479107428463e-05, | |
| "loss": 0.1723, | |
| "num_input_tokens_seen": 2138688, | |
| "step": 3445 | |
| }, | |
| { | |
| "epoch": 6.149732620320855, | |
| "grad_norm": 0.60605388879776, | |
| "learning_rate": 1.9393655436717283e-05, | |
| "loss": 0.1506, | |
| "num_input_tokens_seen": 2141248, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 6.158645276292335, | |
| "grad_norm": 0.9076442122459412, | |
| "learning_rate": 1.9317886029586778e-05, | |
| "loss": 0.2039, | |
| "num_input_tokens_seen": 2144768, | |
| "step": 3455 | |
| }, | |
| { | |
| "epoch": 6.167557932263815, | |
| "grad_norm": 0.9373259544372559, | |
| "learning_rate": 1.9242171619405986e-05, | |
| "loss": 0.1797, | |
| "num_input_tokens_seen": 2147552, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 6.176470588235294, | |
| "grad_norm": 0.6851420998573303, | |
| "learning_rate": 1.916651293901157e-05, | |
| "loss": 0.1825, | |
| "num_input_tokens_seen": 2151040, | |
| "step": 3465 | |
| }, | |
| { | |
| "epoch": 6.185383244206774, | |
| "grad_norm": 0.6892784833908081, | |
| "learning_rate": 1.909091072070083e-05, | |
| "loss": 0.171, | |
| "num_input_tokens_seen": 2155040, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 6.194295900178253, | |
| "grad_norm": 0.6285828948020935, | |
| "learning_rate": 1.9015365696224564e-05, | |
| "loss": 0.158, | |
| "num_input_tokens_seen": 2157824, | |
| "step": 3475 | |
| }, | |
| { | |
| "epoch": 6.2032085561497325, | |
| "grad_norm": 0.5884494781494141, | |
| "learning_rate": 1.893987859677997e-05, | |
| "loss": 0.181, | |
| "num_input_tokens_seen": 2160672, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 6.212121212121212, | |
| "grad_norm": 0.7425735592842102, | |
| "learning_rate": 1.886445015300362e-05, | |
| "loss": 0.1473, | |
| "num_input_tokens_seen": 2163552, | |
| "step": 3485 | |
| }, | |
| { | |
| "epoch": 6.221033868092691, | |
| "grad_norm": 0.39105650782585144, | |
| "learning_rate": 1.8789081094964347e-05, | |
| "loss": 0.1441, | |
| "num_input_tokens_seen": 2167456, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 6.229946524064171, | |
| "grad_norm": 0.30422699451446533, | |
| "learning_rate": 1.8713772152156205e-05, | |
| "loss": 0.1294, | |
| "num_input_tokens_seen": 2170560, | |
| "step": 3495 | |
| }, | |
| { | |
| "epoch": 6.238859180035651, | |
| "grad_norm": 0.7964766621589661, | |
| "learning_rate": 1.863852405349135e-05, | |
| "loss": 0.1838, | |
| "num_input_tokens_seen": 2173152, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 6.24777183600713, | |
| "grad_norm": 0.6463519334793091, | |
| "learning_rate": 1.856333752729311e-05, | |
| "loss": 0.1637, | |
| "num_input_tokens_seen": 2175808, | |
| "step": 3505 | |
| }, | |
| { | |
| "epoch": 6.25668449197861, | |
| "grad_norm": 0.8007080554962158, | |
| "learning_rate": 1.848821330128878e-05, | |
| "loss": 0.1717, | |
| "num_input_tokens_seen": 2178304, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 6.265597147950089, | |
| "grad_norm": 1.0539445877075195, | |
| "learning_rate": 1.8413152102602687e-05, | |
| "loss": 0.1892, | |
| "num_input_tokens_seen": 2181312, | |
| "step": 3515 | |
| }, | |
| { | |
| "epoch": 6.2745098039215685, | |
| "grad_norm": 0.6273789405822754, | |
| "learning_rate": 1.8338154657749128e-05, | |
| "loss": 0.1699, | |
| "num_input_tokens_seen": 2184128, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 6.283422459893048, | |
| "grad_norm": 0.5192899703979492, | |
| "learning_rate": 1.826322169262531e-05, | |
| "loss": 0.1772, | |
| "num_input_tokens_seen": 2187584, | |
| "step": 3525 | |
| }, | |
| { | |
| "epoch": 6.292335115864527, | |
| "grad_norm": 0.6465858221054077, | |
| "learning_rate": 1.818835393250434e-05, | |
| "loss": 0.1814, | |
| "num_input_tokens_seen": 2191168, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 6.301247771836007, | |
| "grad_norm": 0.5996541380882263, | |
| "learning_rate": 1.8113552102028236e-05, | |
| "loss": 0.1888, | |
| "num_input_tokens_seen": 2194880, | |
| "step": 3535 | |
| }, | |
| { | |
| "epoch": 6.310160427807487, | |
| "grad_norm": 0.3005512058734894, | |
| "learning_rate": 1.803881692520087e-05, | |
| "loss": 0.1483, | |
| "num_input_tokens_seen": 2197184, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 6.319073083778966, | |
| "grad_norm": 0.4426136016845703, | |
| "learning_rate": 1.796414912538095e-05, | |
| "loss": 0.162, | |
| "num_input_tokens_seen": 2200160, | |
| "step": 3545 | |
| }, | |
| { | |
| "epoch": 6.327985739750446, | |
| "grad_norm": 0.7000912427902222, | |
| "learning_rate": 1.7889549425275093e-05, | |
| "loss": 0.1686, | |
| "num_input_tokens_seen": 2203776, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 6.336898395721925, | |
| "grad_norm": 0.5500680804252625, | |
| "learning_rate": 1.7815018546930754e-05, | |
| "loss": 0.1716, | |
| "num_input_tokens_seen": 2207104, | |
| "step": 3555 | |
| }, | |
| { | |
| "epoch": 6.3458110516934045, | |
| "grad_norm": 0.5378794074058533, | |
| "learning_rate": 1.7740557211729258e-05, | |
| "loss": 0.1653, | |
| "num_input_tokens_seen": 2210400, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 6.354723707664884, | |
| "grad_norm": 0.20100829005241394, | |
| "learning_rate": 1.7666166140378852e-05, | |
| "loss": 0.1604, | |
| "num_input_tokens_seen": 2213728, | |
| "step": 3565 | |
| }, | |
| { | |
| "epoch": 6.363636363636363, | |
| "grad_norm": 0.33214375376701355, | |
| "learning_rate": 1.7591846052907673e-05, | |
| "loss": 0.1524, | |
| "num_input_tokens_seen": 2216416, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 6.372549019607844, | |
| "grad_norm": 1.197052240371704, | |
| "learning_rate": 1.7517597668656823e-05, | |
| "loss": 0.1849, | |
| "num_input_tokens_seen": 2219328, | |
| "step": 3575 | |
| }, | |
| { | |
| "epoch": 6.381461675579323, | |
| "grad_norm": 0.704537034034729, | |
| "learning_rate": 1.7443421706273395e-05, | |
| "loss": 0.1927, | |
| "num_input_tokens_seen": 2222496, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 6.390374331550802, | |
| "grad_norm": 0.6272372007369995, | |
| "learning_rate": 1.7369318883703506e-05, | |
| "loss": 0.1855, | |
| "num_input_tokens_seen": 2225504, | |
| "step": 3585 | |
| }, | |
| { | |
| "epoch": 6.399286987522282, | |
| "grad_norm": 0.8482812643051147, | |
| "learning_rate": 1.7295289918185348e-05, | |
| "loss": 0.1753, | |
| "num_input_tokens_seen": 2229312, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 6.408199643493761, | |
| "grad_norm": 0.5499706864356995, | |
| "learning_rate": 1.722133552624227e-05, | |
| "loss": 0.1939, | |
| "num_input_tokens_seen": 2232544, | |
| "step": 3595 | |
| }, | |
| { | |
| "epoch": 6.4171122994652405, | |
| "grad_norm": 0.48051542043685913, | |
| "learning_rate": 1.714745642367583e-05, | |
| "loss": 0.1707, | |
| "num_input_tokens_seen": 2235808, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 6.42602495543672, | |
| "grad_norm": 1.0482089519500732, | |
| "learning_rate": 1.707365332555883e-05, | |
| "loss": 0.183, | |
| "num_input_tokens_seen": 2239040, | |
| "step": 3605 | |
| }, | |
| { | |
| "epoch": 6.434937611408199, | |
| "grad_norm": 0.5002045631408691, | |
| "learning_rate": 1.699992694622847e-05, | |
| "loss": 0.1476, | |
| "num_input_tokens_seen": 2241728, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 6.443850267379679, | |
| "grad_norm": 0.5338446497917175, | |
| "learning_rate": 1.6926277999279372e-05, | |
| "loss": 0.1712, | |
| "num_input_tokens_seen": 2244928, | |
| "step": 3615 | |
| }, | |
| { | |
| "epoch": 6.452762923351159, | |
| "grad_norm": 0.5092248320579529, | |
| "learning_rate": 1.6852707197556677e-05, | |
| "loss": 0.1569, | |
| "num_input_tokens_seen": 2247936, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 6.461675579322638, | |
| "grad_norm": 0.4300782382488251, | |
| "learning_rate": 1.67792152531492e-05, | |
| "loss": 0.1658, | |
| "num_input_tokens_seen": 2250560, | |
| "step": 3625 | |
| }, | |
| { | |
| "epoch": 6.470588235294118, | |
| "grad_norm": 0.3229581415653229, | |
| "learning_rate": 1.6705802877382464e-05, | |
| "loss": 0.1451, | |
| "num_input_tokens_seen": 2253248, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 6.479500891265597, | |
| "grad_norm": 0.5048878788948059, | |
| "learning_rate": 1.6632470780811866e-05, | |
| "loss": 0.1803, | |
| "num_input_tokens_seen": 2256320, | |
| "step": 3635 | |
| }, | |
| { | |
| "epoch": 6.4884135472370765, | |
| "grad_norm": 0.7852115631103516, | |
| "learning_rate": 1.6559219673215784e-05, | |
| "loss": 0.1825, | |
| "num_input_tokens_seen": 2259168, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 6.497326203208556, | |
| "grad_norm": 0.3399798572063446, | |
| "learning_rate": 1.6486050263588702e-05, | |
| "loss": 0.1856, | |
| "num_input_tokens_seen": 2262240, | |
| "step": 3645 | |
| }, | |
| { | |
| "epoch": 6.506238859180035, | |
| "grad_norm": 0.5445297360420227, | |
| "learning_rate": 1.641296326013436e-05, | |
| "loss": 0.2109, | |
| "num_input_tokens_seen": 2265600, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 6.5115864527629235, | |
| "eval_loss": 0.1881975382566452, | |
| "eval_runtime": 4.2584, | |
| "eval_samples_per_second": 58.472, | |
| "eval_steps_per_second": 14.794, | |
| "num_input_tokens_seen": 2267520, | |
| "step": 3653 | |
| }, | |
| { | |
| "epoch": 6.515151515151516, | |
| "grad_norm": 0.33709490299224854, | |
| "learning_rate": 1.633995937025889e-05, | |
| "loss": 0.1652, | |
| "num_input_tokens_seen": 2268768, | |
| "step": 3655 | |
| }, | |
| { | |
| "epoch": 6.524064171122995, | |
| "grad_norm": 0.4406679570674896, | |
| "learning_rate": 1.6267039300563965e-05, | |
| "loss": 0.2093, | |
| "num_input_tokens_seen": 2272256, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 6.532976827094474, | |
| "grad_norm": 0.6629878878593445, | |
| "learning_rate": 1.619420375683996e-05, | |
| "loss": 0.1718, | |
| "num_input_tokens_seen": 2275968, | |
| "step": 3665 | |
| }, | |
| { | |
| "epoch": 6.541889483065954, | |
| "grad_norm": 0.665874183177948, | |
| "learning_rate": 1.6121453444059153e-05, | |
| "loss": 0.1913, | |
| "num_input_tokens_seen": 2278784, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 6.550802139037433, | |
| "grad_norm": 0.5533963441848755, | |
| "learning_rate": 1.6048789066368858e-05, | |
| "loss": 0.1798, | |
| "num_input_tokens_seen": 2281472, | |
| "step": 3675 | |
| }, | |
| { | |
| "epoch": 6.5597147950089125, | |
| "grad_norm": 0.40691274404525757, | |
| "learning_rate": 1.5976211327084606e-05, | |
| "loss": 0.1737, | |
| "num_input_tokens_seen": 2284608, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 6.568627450980392, | |
| "grad_norm": 0.7153930068016052, | |
| "learning_rate": 1.59037209286834e-05, | |
| "loss": 0.1607, | |
| "num_input_tokens_seen": 2287296, | |
| "step": 3685 | |
| }, | |
| { | |
| "epoch": 6.577540106951871, | |
| "grad_norm": 0.4068545401096344, | |
| "learning_rate": 1.583131857279685e-05, | |
| "loss": 0.1584, | |
| "num_input_tokens_seen": 2290176, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 6.586452762923351, | |
| "grad_norm": 0.5864424109458923, | |
| "learning_rate": 1.57590049602044e-05, | |
| "loss": 0.175, | |
| "num_input_tokens_seen": 2292960, | |
| "step": 3695 | |
| }, | |
| { | |
| "epoch": 6.595365418894831, | |
| "grad_norm": 0.729058027267456, | |
| "learning_rate": 1.5686780790826574e-05, | |
| "loss": 0.1749, | |
| "num_input_tokens_seen": 2296192, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 6.60427807486631, | |
| "grad_norm": 0.7947399616241455, | |
| "learning_rate": 1.561464676371816e-05, | |
| "loss": 0.1895, | |
| "num_input_tokens_seen": 2300224, | |
| "step": 3705 | |
| }, | |
| { | |
| "epoch": 6.61319073083779, | |
| "grad_norm": 0.5141013860702515, | |
| "learning_rate": 1.5542603577061464e-05, | |
| "loss": 0.1672, | |
| "num_input_tokens_seen": 2303040, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 6.622103386809269, | |
| "grad_norm": 0.7291932702064514, | |
| "learning_rate": 1.5470651928159564e-05, | |
| "loss": 0.1447, | |
| "num_input_tokens_seen": 2305600, | |
| "step": 3715 | |
| }, | |
| { | |
| "epoch": 6.6310160427807485, | |
| "grad_norm": 0.48628827929496765, | |
| "learning_rate": 1.539879251342954e-05, | |
| "loss": 0.1646, | |
| "num_input_tokens_seen": 2308736, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 6.639928698752228, | |
| "grad_norm": 0.6047589778900146, | |
| "learning_rate": 1.5327026028395724e-05, | |
| "loss": 0.1547, | |
| "num_input_tokens_seen": 2311840, | |
| "step": 3725 | |
| }, | |
| { | |
| "epoch": 6.648841354723707, | |
| "grad_norm": 0.5494013428688049, | |
| "learning_rate": 1.5255353167683017e-05, | |
| "loss": 0.1728, | |
| "num_input_tokens_seen": 2315808, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 6.657754010695188, | |
| "grad_norm": 0.6367866396903992, | |
| "learning_rate": 1.5183774625010119e-05, | |
| "loss": 0.1566, | |
| "num_input_tokens_seen": 2319072, | |
| "step": 3735 | |
| }, | |
| { | |
| "epoch": 6.666666666666667, | |
| "grad_norm": 0.6009120345115662, | |
| "learning_rate": 1.5112291093182818e-05, | |
| "loss": 0.187, | |
| "num_input_tokens_seen": 2323104, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 6.675579322638146, | |
| "grad_norm": 0.5307632088661194, | |
| "learning_rate": 1.5040903264087328e-05, | |
| "loss": 0.174, | |
| "num_input_tokens_seen": 2325984, | |
| "step": 3745 | |
| }, | |
| { | |
| "epoch": 6.684491978609626, | |
| "grad_norm": 0.4566698372364044, | |
| "learning_rate": 1.4969611828683517e-05, | |
| "loss": 0.1415, | |
| "num_input_tokens_seen": 2329152, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 6.693404634581105, | |
| "grad_norm": 0.5744293928146362, | |
| "learning_rate": 1.4898417476998289e-05, | |
| "loss": 0.2178, | |
| "num_input_tokens_seen": 2332768, | |
| "step": 3755 | |
| }, | |
| { | |
| "epoch": 6.7023172905525845, | |
| "grad_norm": 0.4906589984893799, | |
| "learning_rate": 1.4827320898118884e-05, | |
| "loss": 0.1595, | |
| "num_input_tokens_seen": 2335680, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 6.711229946524064, | |
| "grad_norm": 0.643140435218811, | |
| "learning_rate": 1.4756322780186193e-05, | |
| "loss": 0.1865, | |
| "num_input_tokens_seen": 2338656, | |
| "step": 3765 | |
| }, | |
| { | |
| "epoch": 6.720142602495543, | |
| "grad_norm": 0.6035706996917725, | |
| "learning_rate": 1.4685423810388094e-05, | |
| "loss": 0.1639, | |
| "num_input_tokens_seen": 2342016, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 6.729055258467023, | |
| "grad_norm": 0.35557159781455994, | |
| "learning_rate": 1.4614624674952842e-05, | |
| "loss": 0.1617, | |
| "num_input_tokens_seen": 2345120, | |
| "step": 3775 | |
| }, | |
| { | |
| "epoch": 6.737967914438503, | |
| "grad_norm": 0.589004397392273, | |
| "learning_rate": 1.4543926059142379e-05, | |
| "loss": 0.1699, | |
| "num_input_tokens_seen": 2348512, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 6.746880570409982, | |
| "grad_norm": 0.4238247573375702, | |
| "learning_rate": 1.4473328647245726e-05, | |
| "loss": 0.1614, | |
| "num_input_tokens_seen": 2350688, | |
| "step": 3785 | |
| }, | |
| { | |
| "epoch": 6.755793226381462, | |
| "grad_norm": 0.6005486845970154, | |
| "learning_rate": 1.4402833122572368e-05, | |
| "loss": 0.1801, | |
| "num_input_tokens_seen": 2353504, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 6.764705882352941, | |
| "grad_norm": 0.6389063000679016, | |
| "learning_rate": 1.4332440167445613e-05, | |
| "loss": 0.1597, | |
| "num_input_tokens_seen": 2356672, | |
| "step": 3795 | |
| }, | |
| { | |
| "epoch": 6.7736185383244205, | |
| "grad_norm": 0.4916219115257263, | |
| "learning_rate": 1.4262150463195981e-05, | |
| "loss": 0.1759, | |
| "num_input_tokens_seen": 2360288, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 6.7825311942959, | |
| "grad_norm": 0.6930426359176636, | |
| "learning_rate": 1.4191964690154702e-05, | |
| "loss": 0.1552, | |
| "num_input_tokens_seen": 2362944, | |
| "step": 3805 | |
| }, | |
| { | |
| "epoch": 6.791443850267379, | |
| "grad_norm": 0.5594033598899841, | |
| "learning_rate": 1.412188352764699e-05, | |
| "loss": 0.1858, | |
| "num_input_tokens_seen": 2366080, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 6.80035650623886, | |
| "grad_norm": 0.6492391228675842, | |
| "learning_rate": 1.4051907653985552e-05, | |
| "loss": 0.1954, | |
| "num_input_tokens_seen": 2369632, | |
| "step": 3815 | |
| }, | |
| { | |
| "epoch": 6.809269162210339, | |
| "grad_norm": 0.7449959516525269, | |
| "learning_rate": 1.3982037746464043e-05, | |
| "loss": 0.1986, | |
| "num_input_tokens_seen": 2373504, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 6.818181818181818, | |
| "grad_norm": 0.6552306413650513, | |
| "learning_rate": 1.3912274481350433e-05, | |
| "loss": 0.1672, | |
| "num_input_tokens_seen": 2376480, | |
| "step": 3825 | |
| }, | |
| { | |
| "epoch": 6.827094474153298, | |
| "grad_norm": 0.5298140048980713, | |
| "learning_rate": 1.3842618533880531e-05, | |
| "loss": 0.1679, | |
| "num_input_tokens_seen": 2379488, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 6.836007130124777, | |
| "grad_norm": 0.6472254395484924, | |
| "learning_rate": 1.3773070578251424e-05, | |
| "loss": 0.179, | |
| "num_input_tokens_seen": 2382496, | |
| "step": 3835 | |
| }, | |
| { | |
| "epoch": 6.8449197860962565, | |
| "grad_norm": 0.5164865851402283, | |
| "learning_rate": 1.3703631287614935e-05, | |
| "loss": 0.1802, | |
| "num_input_tokens_seen": 2386304, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 6.853832442067736, | |
| "grad_norm": 0.4910835325717926, | |
| "learning_rate": 1.363430133407112e-05, | |
| "loss": 0.1772, | |
| "num_input_tokens_seen": 2389504, | |
| "step": 3845 | |
| }, | |
| { | |
| "epoch": 6.862745098039216, | |
| "grad_norm": 0.5745038986206055, | |
| "learning_rate": 1.3565081388661782e-05, | |
| "loss": 0.1634, | |
| "num_input_tokens_seen": 2392320, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 6.871657754010696, | |
| "grad_norm": 0.5505916476249695, | |
| "learning_rate": 1.3495972121363968e-05, | |
| "loss": 0.1739, | |
| "num_input_tokens_seen": 2395648, | |
| "step": 3855 | |
| }, | |
| { | |
| "epoch": 6.880570409982175, | |
| "grad_norm": 0.6166315674781799, | |
| "learning_rate": 1.3426974201083439e-05, | |
| "loss": 0.1693, | |
| "num_input_tokens_seen": 2398080, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 6.889483065953654, | |
| "grad_norm": 1.0031318664550781, | |
| "learning_rate": 1.3358088295648274e-05, | |
| "loss": 0.175, | |
| "num_input_tokens_seen": 2400448, | |
| "step": 3865 | |
| }, | |
| { | |
| "epoch": 6.898395721925134, | |
| "grad_norm": 0.43097200989723206, | |
| "learning_rate": 1.328931507180233e-05, | |
| "loss": 0.1634, | |
| "num_input_tokens_seen": 2403424, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 6.907308377896613, | |
| "grad_norm": 0.4086379110813141, | |
| "learning_rate": 1.3220655195198847e-05, | |
| "loss": 0.1469, | |
| "num_input_tokens_seen": 2405984, | |
| "step": 3875 | |
| }, | |
| { | |
| "epoch": 6.9162210338680925, | |
| "grad_norm": 0.40902405977249146, | |
| "learning_rate": 1.3152109330393985e-05, | |
| "loss": 0.1677, | |
| "num_input_tokens_seen": 2409472, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 6.925133689839572, | |
| "grad_norm": 0.629298985004425, | |
| "learning_rate": 1.3083678140840366e-05, | |
| "loss": 0.1898, | |
| "num_input_tokens_seen": 2412384, | |
| "step": 3885 | |
| }, | |
| { | |
| "epoch": 6.934046345811051, | |
| "grad_norm": 0.4956974387168884, | |
| "learning_rate": 1.3015362288880678e-05, | |
| "loss": 0.1628, | |
| "num_input_tokens_seen": 2415328, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 6.942959001782532, | |
| "grad_norm": 0.31115609407424927, | |
| "learning_rate": 1.2947162435741278e-05, | |
| "loss": 0.1869, | |
| "num_input_tokens_seen": 2418848, | |
| "step": 3895 | |
| }, | |
| { | |
| "epoch": 6.951871657754011, | |
| "grad_norm": 0.5426957011222839, | |
| "learning_rate": 1.2879079241525783e-05, | |
| "loss": 0.1615, | |
| "num_input_tokens_seen": 2421824, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 6.96078431372549, | |
| "grad_norm": 0.6043846011161804, | |
| "learning_rate": 1.2811113365208627e-05, | |
| "loss": 0.189, | |
| "num_input_tokens_seen": 2424224, | |
| "step": 3905 | |
| }, | |
| { | |
| "epoch": 6.96969696969697, | |
| "grad_norm": 0.48290809988975525, | |
| "learning_rate": 1.2743265464628786e-05, | |
| "loss": 0.1779, | |
| "num_input_tokens_seen": 2427616, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 6.978609625668449, | |
| "grad_norm": 0.5067238211631775, | |
| "learning_rate": 1.2675536196483306e-05, | |
| "loss": 0.1568, | |
| "num_input_tokens_seen": 2430368, | |
| "step": 3915 | |
| }, | |
| { | |
| "epoch": 6.9875222816399285, | |
| "grad_norm": 0.43254604935646057, | |
| "learning_rate": 1.260792621632102e-05, | |
| "loss": 0.1876, | |
| "num_input_tokens_seen": 2433376, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 6.996434937611408, | |
| "grad_norm": 0.8352137804031372, | |
| "learning_rate": 1.2540436178536186e-05, | |
| "loss": 0.186, | |
| "num_input_tokens_seen": 2436608, | |
| "step": 3925 | |
| }, | |
| { | |
| "epoch": 7.005347593582887, | |
| "grad_norm": 0.8926360011100769, | |
| "learning_rate": 1.2473066736362124e-05, | |
| "loss": 0.1554, | |
| "num_input_tokens_seen": 2439064, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 7.0124777183600715, | |
| "eval_loss": 0.18532642722129822, | |
| "eval_runtime": 4.2481, | |
| "eval_samples_per_second": 58.614, | |
| "eval_steps_per_second": 14.83, | |
| "num_input_tokens_seen": 2441688, | |
| "step": 3934 | |
| }, | |
| { | |
| "epoch": 7.0142602495543676, | |
| "grad_norm": 0.40735986828804016, | |
| "learning_rate": 1.2405818541864905e-05, | |
| "loss": 0.1639, | |
| "num_input_tokens_seen": 2442328, | |
| "step": 3935 | |
| }, | |
| { | |
| "epoch": 7.023172905525847, | |
| "grad_norm": 0.8125144243240356, | |
| "learning_rate": 1.2338692245937077e-05, | |
| "loss": 0.1518, | |
| "num_input_tokens_seen": 2445272, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 7.032085561497326, | |
| "grad_norm": 0.352469801902771, | |
| "learning_rate": 1.2271688498291335e-05, | |
| "loss": 0.1499, | |
| "num_input_tokens_seen": 2448216, | |
| "step": 3945 | |
| }, | |
| { | |
| "epoch": 7.040998217468806, | |
| "grad_norm": 0.5842772722244263, | |
| "learning_rate": 1.2204807947454203e-05, | |
| "loss": 0.173, | |
| "num_input_tokens_seen": 2451704, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 7.049910873440285, | |
| "grad_norm": 0.8481732606887817, | |
| "learning_rate": 1.2138051240759826e-05, | |
| "loss": 0.1489, | |
| "num_input_tokens_seen": 2454392, | |
| "step": 3955 | |
| }, | |
| { | |
| "epoch": 7.0588235294117645, | |
| "grad_norm": 0.6517293453216553, | |
| "learning_rate": 1.2071419024343633e-05, | |
| "loss": 0.1674, | |
| "num_input_tokens_seen": 2457112, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 7.067736185383244, | |
| "grad_norm": 0.5270460844039917, | |
| "learning_rate": 1.2004911943136143e-05, | |
| "loss": 0.1551, | |
| "num_input_tokens_seen": 2460312, | |
| "step": 3965 | |
| }, | |
| { | |
| "epoch": 7.076648841354723, | |
| "grad_norm": 0.5227533578872681, | |
| "learning_rate": 1.1938530640856696e-05, | |
| "loss": 0.1572, | |
| "num_input_tokens_seen": 2463224, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 7.0855614973262036, | |
| "grad_norm": 0.29230085015296936, | |
| "learning_rate": 1.1872275760007198e-05, | |
| "loss": 0.1661, | |
| "num_input_tokens_seen": 2466008, | |
| "step": 3975 | |
| }, | |
| { | |
| "epoch": 7.094474153297683, | |
| "grad_norm": 0.5345339179039001, | |
| "learning_rate": 1.1806147941865938e-05, | |
| "loss": 0.1784, | |
| "num_input_tokens_seen": 2469176, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 7.103386809269162, | |
| "grad_norm": 0.4222520589828491, | |
| "learning_rate": 1.1740147826481385e-05, | |
| "loss": 0.1405, | |
| "num_input_tokens_seen": 2472408, | |
| "step": 3985 | |
| }, | |
| { | |
| "epoch": 7.112299465240642, | |
| "grad_norm": 0.5282605290412903, | |
| "learning_rate": 1.1674276052665973e-05, | |
| "loss": 0.1902, | |
| "num_input_tokens_seen": 2475608, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 7.121212121212121, | |
| "grad_norm": 0.4751206636428833, | |
| "learning_rate": 1.1608533257989901e-05, | |
| "loss": 0.1489, | |
| "num_input_tokens_seen": 2478680, | |
| "step": 3995 | |
| }, | |
| { | |
| "epoch": 7.1301247771836005, | |
| "grad_norm": 0.3280528783798218, | |
| "learning_rate": 1.1542920078775018e-05, | |
| "loss": 0.1666, | |
| "num_input_tokens_seen": 2481592, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 7.13903743315508, | |
| "grad_norm": 0.9430297017097473, | |
| "learning_rate": 1.14774371500886e-05, | |
| "loss": 0.2094, | |
| "num_input_tokens_seen": 2485176, | |
| "step": 4005 | |
| }, | |
| { | |
| "epoch": 7.14795008912656, | |
| "grad_norm": 0.27522483468055725, | |
| "learning_rate": 1.141208510573725e-05, | |
| "loss": 0.1596, | |
| "num_input_tokens_seen": 2488152, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 7.1568627450980395, | |
| "grad_norm": 0.5842289328575134, | |
| "learning_rate": 1.1346864578260758e-05, | |
| "loss": 0.1904, | |
| "num_input_tokens_seen": 2491320, | |
| "step": 4015 | |
| }, | |
| { | |
| "epoch": 7.165775401069519, | |
| "grad_norm": 0.38907817006111145, | |
| "learning_rate": 1.1281776198925939e-05, | |
| "loss": 0.1459, | |
| "num_input_tokens_seen": 2493944, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 7.174688057040998, | |
| "grad_norm": 0.31314197182655334, | |
| "learning_rate": 1.121682059772056e-05, | |
| "loss": 0.1407, | |
| "num_input_tokens_seen": 2496664, | |
| "step": 4025 | |
| }, | |
| { | |
| "epoch": 7.183600713012478, | |
| "grad_norm": 0.5018792748451233, | |
| "learning_rate": 1.1151998403347244e-05, | |
| "loss": 0.2596, | |
| "num_input_tokens_seen": 2500216, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 7.192513368983957, | |
| "grad_norm": 0.4724593162536621, | |
| "learning_rate": 1.1087310243217386e-05, | |
| "loss": 0.1538, | |
| "num_input_tokens_seen": 2503544, | |
| "step": 4035 | |
| }, | |
| { | |
| "epoch": 7.2014260249554365, | |
| "grad_norm": 0.647865891456604, | |
| "learning_rate": 1.1022756743445028e-05, | |
| "loss": 0.1738, | |
| "num_input_tokens_seen": 2507160, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 7.210338680926916, | |
| "grad_norm": 0.48006606101989746, | |
| "learning_rate": 1.0958338528840893e-05, | |
| "loss": 0.1834, | |
| "num_input_tokens_seen": 2510232, | |
| "step": 4045 | |
| }, | |
| { | |
| "epoch": 7.219251336898395, | |
| "grad_norm": 0.4462122917175293, | |
| "learning_rate": 1.0894056222906226e-05, | |
| "loss": 0.1348, | |
| "num_input_tokens_seen": 2513144, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 7.2281639928698755, | |
| "grad_norm": 0.48262760043144226, | |
| "learning_rate": 1.0829910447826868e-05, | |
| "loss": 0.1547, | |
| "num_input_tokens_seen": 2516504, | |
| "step": 4055 | |
| }, | |
| { | |
| "epoch": 7.237076648841355, | |
| "grad_norm": 0.5589674711227417, | |
| "learning_rate": 1.0765901824467167e-05, | |
| "loss": 0.1723, | |
| "num_input_tokens_seen": 2518648, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 7.245989304812834, | |
| "grad_norm": 0.4827505946159363, | |
| "learning_rate": 1.0702030972363963e-05, | |
| "loss": 0.1625, | |
| "num_input_tokens_seen": 2521880, | |
| "step": 4065 | |
| }, | |
| { | |
| "epoch": 7.254901960784314, | |
| "grad_norm": 0.5129882097244263, | |
| "learning_rate": 1.063829850972065e-05, | |
| "loss": 0.1871, | |
| "num_input_tokens_seen": 2525336, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 7.263814616755793, | |
| "grad_norm": 0.5441546440124512, | |
| "learning_rate": 1.0574705053401127e-05, | |
| "loss": 0.1591, | |
| "num_input_tokens_seen": 2528184, | |
| "step": 4075 | |
| }, | |
| { | |
| "epoch": 7.2727272727272725, | |
| "grad_norm": 0.42811569571495056, | |
| "learning_rate": 1.0511251218923868e-05, | |
| "loss": 0.1592, | |
| "num_input_tokens_seen": 2530904, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 7.281639928698752, | |
| "grad_norm": 0.43192997574806213, | |
| "learning_rate": 1.0447937620455964e-05, | |
| "loss": 0.178, | |
| "num_input_tokens_seen": 2533656, | |
| "step": 4085 | |
| }, | |
| { | |
| "epoch": 7.290552584670232, | |
| "grad_norm": 0.7238538265228271, | |
| "learning_rate": 1.0384764870807149e-05, | |
| "loss": 0.1817, | |
| "num_input_tokens_seen": 2535928, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 7.2994652406417115, | |
| "grad_norm": 0.4946947991847992, | |
| "learning_rate": 1.0321733581423884e-05, | |
| "loss": 0.1685, | |
| "num_input_tokens_seen": 2539352, | |
| "step": 4095 | |
| }, | |
| { | |
| "epoch": 7.308377896613191, | |
| "grad_norm": 0.5055748224258423, | |
| "learning_rate": 1.025884436238346e-05, | |
| "loss": 0.1722, | |
| "num_input_tokens_seen": 2542456, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 7.31729055258467, | |
| "grad_norm": 0.9246964454650879, | |
| "learning_rate": 1.0196097822388075e-05, | |
| "loss": 0.1772, | |
| "num_input_tokens_seen": 2545816, | |
| "step": 4105 | |
| }, | |
| { | |
| "epoch": 7.32620320855615, | |
| "grad_norm": 0.8303518891334534, | |
| "learning_rate": 1.013349456875892e-05, | |
| "loss": 0.1608, | |
| "num_input_tokens_seen": 2548824, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 7.335115864527629, | |
| "grad_norm": 0.5074154734611511, | |
| "learning_rate": 1.0071035207430352e-05, | |
| "loss": 0.1655, | |
| "num_input_tokens_seen": 2552152, | |
| "step": 4115 | |
| }, | |
| { | |
| "epoch": 7.3440285204991085, | |
| "grad_norm": 0.4153769910335541, | |
| "learning_rate": 1.0008720342943966e-05, | |
| "loss": 0.1643, | |
| "num_input_tokens_seen": 2555768, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 7.352941176470588, | |
| "grad_norm": 0.3799455165863037, | |
| "learning_rate": 9.94655057844281e-06, | |
| "loss": 0.1602, | |
| "num_input_tokens_seen": 2558328, | |
| "step": 4125 | |
| }, | |
| { | |
| "epoch": 7.361853832442068, | |
| "grad_norm": 0.6474289298057556, | |
| "learning_rate": 9.884526515665508e-06, | |
| "loss": 0.17, | |
| "num_input_tokens_seen": 2561368, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 7.3707664884135475, | |
| "grad_norm": 0.7523593902587891, | |
| "learning_rate": 9.822648754940431e-06, | |
| "loss": 0.156, | |
| "num_input_tokens_seen": 2564056, | |
| "step": 4135 | |
| }, | |
| { | |
| "epoch": 7.379679144385027, | |
| "grad_norm": 0.5380316972732544, | |
| "learning_rate": 9.760917895179894e-06, | |
| "loss": 0.1746, | |
| "num_input_tokens_seen": 2566744, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 7.388591800356506, | |
| "grad_norm": 1.0373018980026245, | |
| "learning_rate": 9.699334533874386e-06, | |
| "loss": 0.1959, | |
| "num_input_tokens_seen": 2569656, | |
| "step": 4145 | |
| }, | |
| { | |
| "epoch": 7.397504456327986, | |
| "grad_norm": 0.6027229428291321, | |
| "learning_rate": 9.637899267086758e-06, | |
| "loss": 0.1752, | |
| "num_input_tokens_seen": 2573112, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 7.406417112299465, | |
| "grad_norm": 0.5722499489784241, | |
| "learning_rate": 9.576612689446444e-06, | |
| "loss": 0.1712, | |
| "num_input_tokens_seen": 2576952, | |
| "step": 4155 | |
| }, | |
| { | |
| "epoch": 7.4153297682709445, | |
| "grad_norm": 0.5797430276870728, | |
| "learning_rate": 9.515475394143742e-06, | |
| "loss": 0.1445, | |
| "num_input_tokens_seen": 2579896, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 7.424242424242424, | |
| "grad_norm": 0.4454365670681, | |
| "learning_rate": 9.45448797292403e-06, | |
| "loss": 0.2141, | |
| "num_input_tokens_seen": 2583544, | |
| "step": 4165 | |
| }, | |
| { | |
| "epoch": 7.433155080213904, | |
| "grad_norm": 0.3823348879814148, | |
| "learning_rate": 9.393651016082083e-06, | |
| "loss": 0.154, | |
| "num_input_tokens_seen": 2586200, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 7.4420677361853835, | |
| "grad_norm": 0.44054359197616577, | |
| "learning_rate": 9.332965112456337e-06, | |
| "loss": 0.1803, | |
| "num_input_tokens_seen": 2589496, | |
| "step": 4175 | |
| }, | |
| { | |
| "epoch": 7.450980392156863, | |
| "grad_norm": 0.4444521963596344, | |
| "learning_rate": 9.272430849423174e-06, | |
| "loss": 0.1813, | |
| "num_input_tokens_seen": 2591928, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 7.459893048128342, | |
| "grad_norm": 0.6432741284370422, | |
| "learning_rate": 9.21204881289125e-06, | |
| "loss": 0.1793, | |
| "num_input_tokens_seen": 2595064, | |
| "step": 4185 | |
| }, | |
| { | |
| "epoch": 7.468805704099822, | |
| "grad_norm": 0.5586231350898743, | |
| "learning_rate": 9.151819587295845e-06, | |
| "loss": 0.162, | |
| "num_input_tokens_seen": 2597944, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 7.477718360071301, | |
| "grad_norm": 0.4838408827781677, | |
| "learning_rate": 9.09174375559319e-06, | |
| "loss": 0.1969, | |
| "num_input_tokens_seen": 2601656, | |
| "step": 4195 | |
| }, | |
| { | |
| "epoch": 7.4866310160427805, | |
| "grad_norm": 0.4085644483566284, | |
| "learning_rate": 9.031821899254796e-06, | |
| "loss": 0.1497, | |
| "num_input_tokens_seen": 2604472, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 7.49554367201426, | |
| "grad_norm": 0.3888384699821472, | |
| "learning_rate": 8.972054598261892e-06, | |
| "loss": 0.1631, | |
| "num_input_tokens_seen": 2607992, | |
| "step": 4205 | |
| }, | |
| { | |
| "epoch": 7.50445632798574, | |
| "grad_norm": 0.7054049372673035, | |
| "learning_rate": 8.912442431099724e-06, | |
| "loss": 0.1672, | |
| "num_input_tokens_seen": 2611800, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 7.5133689839572195, | |
| "grad_norm": 0.5162657499313354, | |
| "learning_rate": 8.852985974752045e-06, | |
| "loss": 0.1665, | |
| "num_input_tokens_seen": 2614936, | |
| "step": 4215 | |
| }, | |
| { | |
| "epoch": 7.5133689839572195, | |
| "eval_loss": 0.18536153435707092, | |
| "eval_runtime": 4.2538, | |
| "eval_samples_per_second": 58.536, | |
| "eval_steps_per_second": 14.81, | |
| "num_input_tokens_seen": 2614936, | |
| "step": 4215 | |
| }, | |
| { | |
| "epoch": 7.522281639928699, | |
| "grad_norm": 0.357683002948761, | |
| "learning_rate": 8.793685804695482e-06, | |
| "loss": 0.2229, | |
| "num_input_tokens_seen": 2618744, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 7.531194295900178, | |
| "grad_norm": 0.4619935154914856, | |
| "learning_rate": 8.734542494893955e-06, | |
| "loss": 0.1613, | |
| "num_input_tokens_seen": 2621496, | |
| "step": 4225 | |
| }, | |
| { | |
| "epoch": 7.540106951871658, | |
| "grad_norm": 0.5771064758300781, | |
| "learning_rate": 8.675556617793143e-06, | |
| "loss": 0.1607, | |
| "num_input_tokens_seen": 2624568, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 7.549019607843137, | |
| "grad_norm": 0.5340394377708435, | |
| "learning_rate": 8.616728744314956e-06, | |
| "loss": 0.1969, | |
| "num_input_tokens_seen": 2627832, | |
| "step": 4235 | |
| }, | |
| { | |
| "epoch": 7.5579322638146165, | |
| "grad_norm": 0.5918867588043213, | |
| "learning_rate": 8.558059443851998e-06, | |
| "loss": 0.1702, | |
| "num_input_tokens_seen": 2631160, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 7.566844919786096, | |
| "grad_norm": 0.4290253520011902, | |
| "learning_rate": 8.499549284262017e-06, | |
| "loss": 0.158, | |
| "num_input_tokens_seen": 2634488, | |
| "step": 4245 | |
| }, | |
| { | |
| "epoch": 7.575757575757576, | |
| "grad_norm": 0.6583709120750427, | |
| "learning_rate": 8.441198831862485e-06, | |
| "loss": 0.1691, | |
| "num_input_tokens_seen": 2637240, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 7.5846702317290555, | |
| "grad_norm": 0.6762195825576782, | |
| "learning_rate": 8.383008651425035e-06, | |
| "loss": 0.1565, | |
| "num_input_tokens_seen": 2639992, | |
| "step": 4255 | |
| }, | |
| { | |
| "epoch": 7.593582887700535, | |
| "grad_norm": 0.29171764850616455, | |
| "learning_rate": 8.32497930617006e-06, | |
| "loss": 0.1893, | |
| "num_input_tokens_seen": 2643832, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 7.602495543672014, | |
| "grad_norm": 0.4991152286529541, | |
| "learning_rate": 8.267111357761243e-06, | |
| "loss": 0.1343, | |
| "num_input_tokens_seen": 2646712, | |
| "step": 4265 | |
| }, | |
| { | |
| "epoch": 7.611408199643494, | |
| "grad_norm": 0.6517699360847473, | |
| "learning_rate": 8.209405366300088e-06, | |
| "loss": 0.1455, | |
| "num_input_tokens_seen": 2650072, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 7.620320855614973, | |
| "grad_norm": 1.1518526077270508, | |
| "learning_rate": 8.151861890320528e-06, | |
| "loss": 0.1928, | |
| "num_input_tokens_seen": 2653656, | |
| "step": 4275 | |
| }, | |
| { | |
| "epoch": 7.6292335115864525, | |
| "grad_norm": 0.7069615721702576, | |
| "learning_rate": 8.094481486783534e-06, | |
| "loss": 0.2059, | |
| "num_input_tokens_seen": 2657464, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 7.638146167557933, | |
| "grad_norm": 0.3675689697265625, | |
| "learning_rate": 8.0372647110717e-06, | |
| "loss": 0.1825, | |
| "num_input_tokens_seen": 2660568, | |
| "step": 4285 | |
| }, | |
| { | |
| "epoch": 7.647058823529412, | |
| "grad_norm": 0.5671415328979492, | |
| "learning_rate": 7.98021211698385e-06, | |
| "loss": 0.1507, | |
| "num_input_tokens_seen": 2663448, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 7.6559714795008915, | |
| "grad_norm": 0.5237590074539185, | |
| "learning_rate": 7.923324256729738e-06, | |
| "loss": 0.1794, | |
| "num_input_tokens_seen": 2666136, | |
| "step": 4295 | |
| }, | |
| { | |
| "epoch": 7.664884135472371, | |
| "grad_norm": 0.6967838406562805, | |
| "learning_rate": 7.866601680924633e-06, | |
| "loss": 0.183, | |
| "num_input_tokens_seen": 2669048, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 7.67379679144385, | |
| "grad_norm": 0.48244914412498474, | |
| "learning_rate": 7.810044938584038e-06, | |
| "loss": 0.1663, | |
| "num_input_tokens_seen": 2671800, | |
| "step": 4305 | |
| }, | |
| { | |
| "epoch": 7.68270944741533, | |
| "grad_norm": 0.5121620893478394, | |
| "learning_rate": 7.75365457711837e-06, | |
| "loss": 0.1757, | |
| "num_input_tokens_seen": 2675448, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 7.691622103386809, | |
| "grad_norm": 0.5723910331726074, | |
| "learning_rate": 7.697431142327632e-06, | |
| "loss": 0.1654, | |
| "num_input_tokens_seen": 2678392, | |
| "step": 4315 | |
| }, | |
| { | |
| "epoch": 7.7005347593582885, | |
| "grad_norm": 0.4338489770889282, | |
| "learning_rate": 7.641375178396151e-06, | |
| "loss": 0.1645, | |
| "num_input_tokens_seen": 2681112, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 7.709447415329768, | |
| "grad_norm": 0.5260465145111084, | |
| "learning_rate": 7.585487227887328e-06, | |
| "loss": 0.1636, | |
| "num_input_tokens_seen": 2684856, | |
| "step": 4325 | |
| }, | |
| { | |
| "epoch": 7.718360071301248, | |
| "grad_norm": 0.37905287742614746, | |
| "learning_rate": 7.529767831738366e-06, | |
| "loss": 0.1682, | |
| "num_input_tokens_seen": 2687576, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 7.7272727272727275, | |
| "grad_norm": 0.5463063716888428, | |
| "learning_rate": 7.474217529255018e-06, | |
| "loss": 0.1472, | |
| "num_input_tokens_seen": 2690328, | |
| "step": 4335 | |
| }, | |
| { | |
| "epoch": 7.736185383244207, | |
| "grad_norm": 0.640016496181488, | |
| "learning_rate": 7.4188368581064124e-06, | |
| "loss": 0.17, | |
| "num_input_tokens_seen": 2694168, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 7.745098039215686, | |
| "grad_norm": 0.42445164918899536, | |
| "learning_rate": 7.3636263543197945e-06, | |
| "loss": 0.1617, | |
| "num_input_tokens_seen": 2697208, | |
| "step": 4345 | |
| }, | |
| { | |
| "epoch": 7.754010695187166, | |
| "grad_norm": 1.0092363357543945, | |
| "learning_rate": 7.30858655227539e-06, | |
| "loss": 0.182, | |
| "num_input_tokens_seen": 2700376, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 7.762923351158645, | |
| "grad_norm": 0.2814575433731079, | |
| "learning_rate": 7.253717984701208e-06, | |
| "loss": 0.1667, | |
| "num_input_tokens_seen": 2703256, | |
| "step": 4355 | |
| }, | |
| { | |
| "epoch": 7.7718360071301245, | |
| "grad_norm": 0.5186646580696106, | |
| "learning_rate": 7.199021182667873e-06, | |
| "loss": 0.1594, | |
| "num_input_tokens_seen": 2705752, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 7.780748663101605, | |
| "grad_norm": 0.4522174000740051, | |
| "learning_rate": 7.1444966755834954e-06, | |
| "loss": 0.1373, | |
| "num_input_tokens_seen": 2708888, | |
| "step": 4365 | |
| }, | |
| { | |
| "epoch": 7.789661319073084, | |
| "grad_norm": 0.4952068328857422, | |
| "learning_rate": 7.0901449911885685e-06, | |
| "loss": 0.159, | |
| "num_input_tokens_seen": 2711576, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 7.7985739750445635, | |
| "grad_norm": 0.47718411684036255, | |
| "learning_rate": 7.035966655550838e-06, | |
| "loss": 0.1856, | |
| "num_input_tokens_seen": 2715000, | |
| "step": 4375 | |
| }, | |
| { | |
| "epoch": 7.807486631016043, | |
| "grad_norm": 0.5538311004638672, | |
| "learning_rate": 6.98196219306019e-06, | |
| "loss": 0.1708, | |
| "num_input_tokens_seen": 2717880, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 7.816399286987522, | |
| "grad_norm": 0.40867936611175537, | |
| "learning_rate": 6.928132126423636e-06, | |
| "loss": 0.1424, | |
| "num_input_tokens_seen": 2721240, | |
| "step": 4385 | |
| }, | |
| { | |
| "epoch": 7.825311942959002, | |
| "grad_norm": 0.579886257648468, | |
| "learning_rate": 6.8744769766601854e-06, | |
| "loss": 0.1844, | |
| "num_input_tokens_seen": 2724696, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 7.834224598930481, | |
| "grad_norm": 0.4526924788951874, | |
| "learning_rate": 6.820997263095849e-06, | |
| "loss": 0.1754, | |
| "num_input_tokens_seen": 2727960, | |
| "step": 4395 | |
| }, | |
| { | |
| "epoch": 7.8431372549019605, | |
| "grad_norm": 0.5530297756195068, | |
| "learning_rate": 6.767693503358608e-06, | |
| "loss": 0.1816, | |
| "num_input_tokens_seen": 2731000, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 7.85204991087344, | |
| "grad_norm": 0.3621399700641632, | |
| "learning_rate": 6.7145662133733715e-06, | |
| "loss": 0.1751, | |
| "num_input_tokens_seen": 2734264, | |
| "step": 4405 | |
| }, | |
| { | |
| "epoch": 7.86096256684492, | |
| "grad_norm": 0.5544110536575317, | |
| "learning_rate": 6.6616159073570135e-06, | |
| "loss": 0.1635, | |
| "num_input_tokens_seen": 2736664, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 7.8698752228163995, | |
| "grad_norm": 0.504298985004425, | |
| "learning_rate": 6.6088430978133914e-06, | |
| "loss": 0.1685, | |
| "num_input_tokens_seen": 2739672, | |
| "step": 4415 | |
| }, | |
| { | |
| "epoch": 7.878787878787879, | |
| "grad_norm": 0.45025068521499634, | |
| "learning_rate": 6.556248295528389e-06, | |
| "loss": 0.1576, | |
| "num_input_tokens_seen": 2742552, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 7.887700534759358, | |
| "grad_norm": 0.9994719624519348, | |
| "learning_rate": 6.5038320095649395e-06, | |
| "loss": 0.1938, | |
| "num_input_tokens_seen": 2745880, | |
| "step": 4425 | |
| }, | |
| { | |
| "epoch": 7.896613190730838, | |
| "grad_norm": 0.5288066267967224, | |
| "learning_rate": 6.451594747258155e-06, | |
| "loss": 0.1818, | |
| "num_input_tokens_seen": 2749912, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 7.905525846702317, | |
| "grad_norm": 0.5786968469619751, | |
| "learning_rate": 6.399537014210355e-06, | |
| "loss": 0.1757, | |
| "num_input_tokens_seen": 2753368, | |
| "step": 4435 | |
| }, | |
| { | |
| "epoch": 7.9144385026737964, | |
| "grad_norm": 0.3910267651081085, | |
| "learning_rate": 6.3476593142862275e-06, | |
| "loss": 0.1794, | |
| "num_input_tokens_seen": 2756568, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 7.923351158645277, | |
| "grad_norm": 1.0030827522277832, | |
| "learning_rate": 6.29596214960792e-06, | |
| "loss": 0.1752, | |
| "num_input_tokens_seen": 2759704, | |
| "step": 4445 | |
| }, | |
| { | |
| "epoch": 7.932263814616756, | |
| "grad_norm": 0.41212958097457886, | |
| "learning_rate": 6.244446020550182e-06, | |
| "loss": 0.1709, | |
| "num_input_tokens_seen": 2762584, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 7.9411764705882355, | |
| "grad_norm": 0.5541166067123413, | |
| "learning_rate": 6.193111425735515e-06, | |
| "loss": 0.1763, | |
| "num_input_tokens_seen": 2765752, | |
| "step": 4455 | |
| }, | |
| { | |
| "epoch": 7.950089126559715, | |
| "grad_norm": 0.6690767407417297, | |
| "learning_rate": 6.141958862029384e-06, | |
| "loss": 0.1624, | |
| "num_input_tokens_seen": 2768696, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 7.959001782531194, | |
| "grad_norm": 0.5791964530944824, | |
| "learning_rate": 6.090988824535374e-06, | |
| "loss": 0.1844, | |
| "num_input_tokens_seen": 2772120, | |
| "step": 4465 | |
| }, | |
| { | |
| "epoch": 7.967914438502674, | |
| "grad_norm": 0.40184465050697327, | |
| "learning_rate": 6.040201806590387e-06, | |
| "loss": 0.1918, | |
| "num_input_tokens_seen": 2775384, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 7.976827094474153, | |
| "grad_norm": 0.4650464951992035, | |
| "learning_rate": 5.989598299759919e-06, | |
| "loss": 0.1778, | |
| "num_input_tokens_seen": 2778520, | |
| "step": 4475 | |
| }, | |
| { | |
| "epoch": 7.9857397504456324, | |
| "grad_norm": 0.5422367453575134, | |
| "learning_rate": 5.939178793833233e-06, | |
| "loss": 0.1734, | |
| "num_input_tokens_seen": 2780888, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 7.994652406417112, | |
| "grad_norm": 0.5420627593994141, | |
| "learning_rate": 5.888943776818684e-06, | |
| "loss": 0.1781, | |
| "num_input_tokens_seen": 2784312, | |
| "step": 4485 | |
| }, | |
| { | |
| "epoch": 8.003565062388592, | |
| "grad_norm": 0.465055912733078, | |
| "learning_rate": 5.83889373493896e-06, | |
| "loss": 0.1861, | |
| "num_input_tokens_seen": 2787056, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 8.01247771836007, | |
| "grad_norm": 0.8877488970756531, | |
| "learning_rate": 5.789029152626374e-06, | |
| "loss": 0.1686, | |
| "num_input_tokens_seen": 2790288, | |
| "step": 4495 | |
| }, | |
| { | |
| "epoch": 8.014260249554367, | |
| "eval_loss": 0.18306031823158264, | |
| "eval_runtime": 4.2492, | |
| "eval_samples_per_second": 58.599, | |
| "eval_steps_per_second": 14.826, | |
| "num_input_tokens_seen": 2790832, | |
| "step": 4496 | |
| }, | |
| { | |
| "epoch": 8.02139037433155, | |
| "grad_norm": 0.3791468143463135, | |
| "learning_rate": 5.73935051251818e-06, | |
| "loss": 0.1626, | |
| "num_input_tokens_seen": 2793136, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 8.030303030303031, | |
| "grad_norm": 0.6450890302658081, | |
| "learning_rate": 5.689858295451914e-06, | |
| "loss": 0.1684, | |
| "num_input_tokens_seen": 2796464, | |
| "step": 4505 | |
| }, | |
| { | |
| "epoch": 8.03921568627451, | |
| "grad_norm": 0.36496949195861816, | |
| "learning_rate": 5.640552980460742e-06, | |
| "loss": 0.1524, | |
| "num_input_tokens_seen": 2799344, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 8.04812834224599, | |
| "grad_norm": 0.5503035187721252, | |
| "learning_rate": 5.591435044768783e-06, | |
| "loss": 0.1529, | |
| "num_input_tokens_seen": 2801648, | |
| "step": 4515 | |
| }, | |
| { | |
| "epoch": 8.057040998217468, | |
| "grad_norm": 0.4298340678215027, | |
| "learning_rate": 5.542504963786552e-06, | |
| "loss": 0.1769, | |
| "num_input_tokens_seen": 2804976, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 8.065953654188949, | |
| "grad_norm": 0.44245445728302, | |
| "learning_rate": 5.493763211106293e-06, | |
| "loss": 0.1543, | |
| "num_input_tokens_seen": 2807472, | |
| "step": 4525 | |
| }, | |
| { | |
| "epoch": 8.074866310160427, | |
| "grad_norm": 0.27881208062171936, | |
| "learning_rate": 5.4452102584974545e-06, | |
| "loss": 0.1436, | |
| "num_input_tokens_seen": 2810768, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 8.083778966131907, | |
| "grad_norm": 0.9025391340255737, | |
| "learning_rate": 5.396846575902095e-06, | |
| "loss": 0.1822, | |
| "num_input_tokens_seen": 2814480, | |
| "step": 4535 | |
| }, | |
| { | |
| "epoch": 8.092691622103386, | |
| "grad_norm": 0.33398008346557617, | |
| "learning_rate": 5.348672631430318e-06, | |
| "loss": 0.1551, | |
| "num_input_tokens_seen": 2817968, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 8.101604278074866, | |
| "grad_norm": 0.45554453134536743, | |
| "learning_rate": 5.300688891355765e-06, | |
| "loss": 0.1626, | |
| "num_input_tokens_seen": 2820784, | |
| "step": 4545 | |
| }, | |
| { | |
| "epoch": 8.110516934046347, | |
| "grad_norm": 0.38997194170951843, | |
| "learning_rate": 5.252895820111112e-06, | |
| "loss": 0.1377, | |
| "num_input_tokens_seen": 2823824, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 8.119429590017825, | |
| "grad_norm": 0.5823608040809631, | |
| "learning_rate": 5.205293880283552e-06, | |
| "loss": 0.1602, | |
| "num_input_tokens_seen": 2826832, | |
| "step": 4555 | |
| }, | |
| { | |
| "epoch": 8.128342245989305, | |
| "grad_norm": 0.6442610025405884, | |
| "learning_rate": 5.157883532610305e-06, | |
| "loss": 0.189, | |
| "num_input_tokens_seen": 2830256, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 8.137254901960784, | |
| "grad_norm": 0.6161116361618042, | |
| "learning_rate": 5.110665235974219e-06, | |
| "loss": 0.181, | |
| "num_input_tokens_seen": 2832848, | |
| "step": 4565 | |
| }, | |
| { | |
| "epoch": 8.146167557932264, | |
| "grad_norm": 0.5139124989509583, | |
| "learning_rate": 5.06363944739924e-06, | |
| "loss": 0.1593, | |
| "num_input_tokens_seen": 2835664, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 8.155080213903743, | |
| "grad_norm": 0.4244152903556824, | |
| "learning_rate": 5.0168066220460715e-06, | |
| "loss": 0.1533, | |
| "num_input_tokens_seen": 2838864, | |
| "step": 4575 | |
| }, | |
| { | |
| "epoch": 8.163992869875223, | |
| "grad_norm": 0.8236415386199951, | |
| "learning_rate": 4.97016721320773e-06, | |
| "loss": 0.1638, | |
| "num_input_tokens_seen": 2841840, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 8.172905525846703, | |
| "grad_norm": 0.6396406292915344, | |
| "learning_rate": 4.9237216723051485e-06, | |
| "loss": 0.1693, | |
| "num_input_tokens_seen": 2844976, | |
| "step": 4585 | |
| }, | |
| { | |
| "epoch": 8.181818181818182, | |
| "grad_norm": 0.41378054022789, | |
| "learning_rate": 4.877470448882815e-06, | |
| "loss": 0.1585, | |
| "num_input_tokens_seen": 2847856, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 8.190730837789662, | |
| "grad_norm": 0.5032555460929871, | |
| "learning_rate": 4.831413990604447e-06, | |
| "loss": 0.1465, | |
| "num_input_tokens_seen": 2850192, | |
| "step": 4595 | |
| }, | |
| { | |
| "epoch": 8.19964349376114, | |
| "grad_norm": 0.4285055994987488, | |
| "learning_rate": 4.7855527432486336e-06, | |
| "loss": 0.1517, | |
| "num_input_tokens_seen": 2853008, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 8.20855614973262, | |
| "grad_norm": 0.5328398942947388, | |
| "learning_rate": 4.739887150704508e-06, | |
| "loss": 0.2001, | |
| "num_input_tokens_seen": 2856464, | |
| "step": 4605 | |
| }, | |
| { | |
| "epoch": 8.2174688057041, | |
| "grad_norm": 0.45751845836639404, | |
| "learning_rate": 4.694417654967492e-06, | |
| "loss": 0.1507, | |
| "num_input_tokens_seen": 2858864, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 8.22638146167558, | |
| "grad_norm": 0.44036829471588135, | |
| "learning_rate": 4.649144696134972e-06, | |
| "loss": 0.1711, | |
| "num_input_tokens_seen": 2861488, | |
| "step": 4615 | |
| }, | |
| { | |
| "epoch": 8.235294117647058, | |
| "grad_norm": 0.4446769654750824, | |
| "learning_rate": 4.6040687124020794e-06, | |
| "loss": 0.168, | |
| "num_input_tokens_seen": 2865136, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 8.244206773618538, | |
| "grad_norm": 0.6855089068412781, | |
| "learning_rate": 4.5591901400574285e-06, | |
| "loss": 0.1646, | |
| "num_input_tokens_seen": 2867984, | |
| "step": 4625 | |
| }, | |
| { | |
| "epoch": 8.253119429590019, | |
| "grad_norm": 0.6599955558776855, | |
| "learning_rate": 4.514509413478888e-06, | |
| "loss": 0.1795, | |
| "num_input_tokens_seen": 2871088, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 8.262032085561497, | |
| "grad_norm": 0.42294609546661377, | |
| "learning_rate": 4.470026965129384e-06, | |
| "loss": 0.1433, | |
| "num_input_tokens_seen": 2874352, | |
| "step": 4635 | |
| }, | |
| { | |
| "epoch": 8.270944741532977, | |
| "grad_norm": 0.4342804551124573, | |
| "learning_rate": 4.425743225552731e-06, | |
| "loss": 0.1762, | |
| "num_input_tokens_seen": 2877840, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 8.279857397504456, | |
| "grad_norm": 0.5680054426193237, | |
| "learning_rate": 4.381658623369445e-06, | |
| "loss": 0.1532, | |
| "num_input_tokens_seen": 2881456, | |
| "step": 4645 | |
| }, | |
| { | |
| "epoch": 8.288770053475936, | |
| "grad_norm": 0.5137624740600586, | |
| "learning_rate": 4.337773585272581e-06, | |
| "loss": 0.1694, | |
| "num_input_tokens_seen": 2884400, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 8.297682709447415, | |
| "grad_norm": 0.3794878125190735, | |
| "learning_rate": 4.294088536023652e-06, | |
| "loss": 0.1475, | |
| "num_input_tokens_seen": 2887536, | |
| "step": 4655 | |
| }, | |
| { | |
| "epoch": 8.306595365418895, | |
| "grad_norm": 0.6075329184532166, | |
| "learning_rate": 4.250603898448455e-06, | |
| "loss": 0.1811, | |
| "num_input_tokens_seen": 2890352, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 8.315508021390375, | |
| "grad_norm": 0.45767733454704285, | |
| "learning_rate": 4.2073200934330315e-06, | |
| "loss": 0.1871, | |
| "num_input_tokens_seen": 2893520, | |
| "step": 4665 | |
| }, | |
| { | |
| "epoch": 8.324420677361854, | |
| "grad_norm": 0.46819356083869934, | |
| "learning_rate": 4.164237539919577e-06, | |
| "loss": 0.1842, | |
| "num_input_tokens_seen": 2896048, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 8.333333333333334, | |
| "grad_norm": 0.5235320329666138, | |
| "learning_rate": 4.121356654902364e-06, | |
| "loss": 0.164, | |
| "num_input_tokens_seen": 2899472, | |
| "step": 4675 | |
| }, | |
| { | |
| "epoch": 8.342245989304812, | |
| "grad_norm": 0.8180021047592163, | |
| "learning_rate": 4.078677853423724e-06, | |
| "loss": 0.1573, | |
| "num_input_tokens_seen": 2902832, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 8.351158645276293, | |
| "grad_norm": 0.9956904649734497, | |
| "learning_rate": 4.036201548570049e-06, | |
| "loss": 0.2367, | |
| "num_input_tokens_seen": 2906576, | |
| "step": 4685 | |
| }, | |
| { | |
| "epoch": 8.360071301247771, | |
| "grad_norm": 0.6165153980255127, | |
| "learning_rate": 3.993928151467766e-06, | |
| "loss": 0.1987, | |
| "num_input_tokens_seen": 2909840, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 8.368983957219251, | |
| "grad_norm": 0.48898622393608093, | |
| "learning_rate": 3.951858071279352e-06, | |
| "loss": 0.1454, | |
| "num_input_tokens_seen": 2912752, | |
| "step": 4695 | |
| }, | |
| { | |
| "epoch": 8.37789661319073, | |
| "grad_norm": 0.48024001717567444, | |
| "learning_rate": 3.909991715199412e-06, | |
| "loss": 0.1633, | |
| "num_input_tokens_seen": 2915024, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 8.38680926916221, | |
| "grad_norm": 0.4968958795070648, | |
| "learning_rate": 3.8683294884506945e-06, | |
| "loss": 0.1655, | |
| "num_input_tokens_seen": 2918480, | |
| "step": 4705 | |
| }, | |
| { | |
| "epoch": 8.39572192513369, | |
| "grad_norm": 0.5491753220558167, | |
| "learning_rate": 3.826871794280193e-06, | |
| "loss": 0.1729, | |
| "num_input_tokens_seen": 2921712, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 8.404634581105169, | |
| "grad_norm": 0.5808373093605042, | |
| "learning_rate": 3.7856190339552513e-06, | |
| "loss": 0.1851, | |
| "num_input_tokens_seen": 2925040, | |
| "step": 4715 | |
| }, | |
| { | |
| "epoch": 8.41354723707665, | |
| "grad_norm": 0.9629413485527039, | |
| "learning_rate": 3.7445716067596503e-06, | |
| "loss": 0.1578, | |
| "num_input_tokens_seen": 2928112, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 8.422459893048128, | |
| "grad_norm": 0.8614413142204285, | |
| "learning_rate": 3.7037299099897586e-06, | |
| "loss": 0.1865, | |
| "num_input_tokens_seen": 2932368, | |
| "step": 4725 | |
| }, | |
| { | |
| "epoch": 8.431372549019608, | |
| "grad_norm": 0.5639718770980835, | |
| "learning_rate": 3.663094338950704e-06, | |
| "loss": 0.1738, | |
| "num_input_tokens_seen": 2935088, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 8.440285204991087, | |
| "grad_norm": 0.5123082995414734, | |
| "learning_rate": 3.6226652869525285e-06, | |
| "loss": 0.1471, | |
| "num_input_tokens_seen": 2937840, | |
| "step": 4735 | |
| }, | |
| { | |
| "epoch": 8.449197860962567, | |
| "grad_norm": 0.5894414186477661, | |
| "learning_rate": 3.5824431453063662e-06, | |
| "loss": 0.1638, | |
| "num_input_tokens_seen": 2941008, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 8.458110516934047, | |
| "grad_norm": 0.34330514073371887, | |
| "learning_rate": 3.5424283033207024e-06, | |
| "loss": 0.1672, | |
| "num_input_tokens_seen": 2944464, | |
| "step": 4745 | |
| }, | |
| { | |
| "epoch": 8.467023172905526, | |
| "grad_norm": 0.37955033779144287, | |
| "learning_rate": 3.5026211482975497e-06, | |
| "loss": 0.1584, | |
| "num_input_tokens_seen": 2947376, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 8.475935828877006, | |
| "grad_norm": 0.9495477080345154, | |
| "learning_rate": 3.463022065528748e-06, | |
| "loss": 0.1767, | |
| "num_input_tokens_seen": 2950480, | |
| "step": 4755 | |
| }, | |
| { | |
| "epoch": 8.484848484848484, | |
| "grad_norm": 0.3263673782348633, | |
| "learning_rate": 3.4236314382922103e-06, | |
| "loss": 0.1429, | |
| "num_input_tokens_seen": 2953392, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 8.493761140819965, | |
| "grad_norm": 0.5537719130516052, | |
| "learning_rate": 3.3844496478482064e-06, | |
| "loss": 0.1588, | |
| "num_input_tokens_seen": 2956272, | |
| "step": 4765 | |
| }, | |
| { | |
| "epoch": 8.502673796791443, | |
| "grad_norm": 0.30169588327407837, | |
| "learning_rate": 3.345477073435685e-06, | |
| "loss": 0.167, | |
| "num_input_tokens_seen": 2959056, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 8.511586452762923, | |
| "grad_norm": 0.5430099964141846, | |
| "learning_rate": 3.3067140922686174e-06, | |
| "loss": 0.1655, | |
| "num_input_tokens_seen": 2962480, | |
| "step": 4775 | |
| }, | |
| { | |
| "epoch": 8.515151515151516, | |
| "eval_loss": 0.1827203780412674, | |
| "eval_runtime": 4.2534, | |
| "eval_samples_per_second": 58.541, | |
| "eval_steps_per_second": 14.812, | |
| "num_input_tokens_seen": 2963888, | |
| "step": 4777 | |
| }, | |
| { | |
| "epoch": 8.520499108734402, | |
| "grad_norm": 0.44720202684402466, | |
| "learning_rate": 3.268161079532317e-06, | |
| "loss": 0.1494, | |
| "num_input_tokens_seen": 2965360, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 8.529411764705882, | |
| "grad_norm": 0.3062620162963867, | |
| "learning_rate": 3.22981840837982e-06, | |
| "loss": 0.1712, | |
| "num_input_tokens_seen": 2968464, | |
| "step": 4785 | |
| }, | |
| { | |
| "epoch": 8.538324420677363, | |
| "grad_norm": 0.9861251711845398, | |
| "learning_rate": 3.1916864499282856e-06, | |
| "loss": 0.1779, | |
| "num_input_tokens_seen": 2972144, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 8.547237076648841, | |
| "grad_norm": 0.43644168972969055, | |
| "learning_rate": 3.1537655732553768e-06, | |
| "loss": 0.1509, | |
| "num_input_tokens_seen": 2974384, | |
| "step": 4795 | |
| }, | |
| { | |
| "epoch": 8.556149732620321, | |
| "grad_norm": 0.5110581517219543, | |
| "learning_rate": 3.1160561453957183e-06, | |
| "loss": 0.1578, | |
| "num_input_tokens_seen": 2977104, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 8.5650623885918, | |
| "grad_norm": 0.5604438781738281, | |
| "learning_rate": 3.078558531337336e-06, | |
| "loss": 0.1694, | |
| "num_input_tokens_seen": 2980464, | |
| "step": 4805 | |
| }, | |
| { | |
| "epoch": 8.57397504456328, | |
| "grad_norm": 0.5687141418457031, | |
| "learning_rate": 3.0412730940181015e-06, | |
| "loss": 0.1643, | |
| "num_input_tokens_seen": 2983248, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 8.582887700534759, | |
| "grad_norm": 0.9281808137893677, | |
| "learning_rate": 3.0042001943222376e-06, | |
| "loss": 0.165, | |
| "num_input_tokens_seen": 2986256, | |
| "step": 4815 | |
| }, | |
| { | |
| "epoch": 8.591800356506239, | |
| "grad_norm": 0.6919686794281006, | |
| "learning_rate": 2.967340191076834e-06, | |
| "loss": 0.1902, | |
| "num_input_tokens_seen": 2990256, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 8.60071301247772, | |
| "grad_norm": 0.7080613374710083, | |
| "learning_rate": 2.930693441048371e-06, | |
| "loss": 0.149, | |
| "num_input_tokens_seen": 2992592, | |
| "step": 4825 | |
| }, | |
| { | |
| "epoch": 8.609625668449198, | |
| "grad_norm": 0.5117068886756897, | |
| "learning_rate": 2.8942602989392386e-06, | |
| "loss": 0.174, | |
| "num_input_tokens_seen": 2995888, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 8.618538324420678, | |
| "grad_norm": 0.37796565890312195, | |
| "learning_rate": 2.858041117384341e-06, | |
| "loss": 0.148, | |
| "num_input_tokens_seen": 2999280, | |
| "step": 4835 | |
| }, | |
| { | |
| "epoch": 8.627450980392156, | |
| "grad_norm": 0.6607238054275513, | |
| "learning_rate": 2.8220362469476624e-06, | |
| "loss": 0.1541, | |
| "num_input_tokens_seen": 3002864, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 8.636363636363637, | |
| "grad_norm": 0.4288221001625061, | |
| "learning_rate": 2.7862460361188614e-06, | |
| "loss": 0.1521, | |
| "num_input_tokens_seen": 3004944, | |
| "step": 4845 | |
| }, | |
| { | |
| "epoch": 8.645276292335115, | |
| "grad_norm": 0.49076348543167114, | |
| "learning_rate": 2.750670831309957e-06, | |
| "loss": 0.1682, | |
| "num_input_tokens_seen": 3008464, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 8.654188948306595, | |
| "grad_norm": 0.615407407283783, | |
| "learning_rate": 2.7153109768518925e-06, | |
| "loss": 0.171, | |
| "num_input_tokens_seen": 3012240, | |
| "step": 4855 | |
| }, | |
| { | |
| "epoch": 8.663101604278076, | |
| "grad_norm": 0.5121405124664307, | |
| "learning_rate": 2.680166814991256e-06, | |
| "loss": 0.1606, | |
| "num_input_tokens_seen": 3015056, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 8.672014260249554, | |
| "grad_norm": 0.7262160778045654, | |
| "learning_rate": 2.645238685886961e-06, | |
| "loss": 0.2009, | |
| "num_input_tokens_seen": 3018160, | |
| "step": 4865 | |
| }, | |
| { | |
| "epoch": 8.680926916221035, | |
| "grad_norm": 0.5012710690498352, | |
| "learning_rate": 2.6105269276069573e-06, | |
| "loss": 0.1641, | |
| "num_input_tokens_seen": 3021392, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 8.689839572192513, | |
| "grad_norm": 0.681621789932251, | |
| "learning_rate": 2.5760318761249263e-06, | |
| "loss": 0.1751, | |
| "num_input_tokens_seen": 3024240, | |
| "step": 4875 | |
| }, | |
| { | |
| "epoch": 8.698752228163993, | |
| "grad_norm": 0.4795394539833069, | |
| "learning_rate": 2.541753865317076e-06, | |
| "loss": 0.171, | |
| "num_input_tokens_seen": 3026800, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 8.707664884135472, | |
| "grad_norm": 0.4269944429397583, | |
| "learning_rate": 2.507693226958871e-06, | |
| "loss": 0.1673, | |
| "num_input_tokens_seen": 3029968, | |
| "step": 4885 | |
| }, | |
| { | |
| "epoch": 8.716577540106952, | |
| "grad_norm": 0.6113168597221375, | |
| "learning_rate": 2.473850290721838e-06, | |
| "loss": 0.1568, | |
| "num_input_tokens_seen": 3032656, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 8.72549019607843, | |
| "grad_norm": 0.5832796692848206, | |
| "learning_rate": 2.4402253841703914e-06, | |
| "loss": 0.1645, | |
| "num_input_tokens_seen": 3035376, | |
| "step": 4895 | |
| }, | |
| { | |
| "epoch": 8.73440285204991, | |
| "grad_norm": 0.4533407986164093, | |
| "learning_rate": 2.4068188327586257e-06, | |
| "loss": 0.1798, | |
| "num_input_tokens_seen": 3038512, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 8.743315508021391, | |
| "grad_norm": 0.6923168897628784, | |
| "learning_rate": 2.373630959827186e-06, | |
| "loss": 0.161, | |
| "num_input_tokens_seen": 3041744, | |
| "step": 4905 | |
| }, | |
| { | |
| "epoch": 8.75222816399287, | |
| "grad_norm": 0.5411429405212402, | |
| "learning_rate": 2.3406620866001485e-06, | |
| "loss": 0.1696, | |
| "num_input_tokens_seen": 3045232, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 8.76114081996435, | |
| "grad_norm": 0.40592697262763977, | |
| "learning_rate": 2.3079125321818996e-06, | |
| "loss": 0.1636, | |
| "num_input_tokens_seen": 3047728, | |
| "step": 4915 | |
| }, | |
| { | |
| "epoch": 8.770053475935828, | |
| "grad_norm": 0.7785168886184692, | |
| "learning_rate": 2.275382613554031e-06, | |
| "loss": 0.1534, | |
| "num_input_tokens_seen": 3050864, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 8.778966131907309, | |
| "grad_norm": 0.46840912103652954, | |
| "learning_rate": 2.2430726455723113e-06, | |
| "loss": 0.1651, | |
| "num_input_tokens_seen": 3053680, | |
| "step": 4925 | |
| }, | |
| { | |
| "epoch": 8.787878787878787, | |
| "grad_norm": 0.5858107209205627, | |
| "learning_rate": 2.210982940963596e-06, | |
| "loss": 0.1632, | |
| "num_input_tokens_seen": 3057136, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 8.796791443850267, | |
| "grad_norm": 0.8381409049034119, | |
| "learning_rate": 2.1791138103228275e-06, | |
| "loss": 0.1736, | |
| "num_input_tokens_seen": 3060144, | |
| "step": 4935 | |
| }, | |
| { | |
| "epoch": 8.805704099821746, | |
| "grad_norm": 0.4155525863170624, | |
| "learning_rate": 2.1474655621100347e-06, | |
| "loss": 0.1759, | |
| "num_input_tokens_seen": 3063024, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 8.814616755793226, | |
| "grad_norm": 0.7829816937446594, | |
| "learning_rate": 2.116038502647319e-06, | |
| "loss": 0.1736, | |
| "num_input_tokens_seen": 3066320, | |
| "step": 4945 | |
| }, | |
| { | |
| "epoch": 8.823529411764707, | |
| "grad_norm": 0.44637227058410645, | |
| "learning_rate": 2.084832936115902e-06, | |
| "loss": 0.1513, | |
| "num_input_tokens_seen": 3069296, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 8.832442067736185, | |
| "grad_norm": 0.49461662769317627, | |
| "learning_rate": 2.0538491645531982e-06, | |
| "loss": 0.1745, | |
| "num_input_tokens_seen": 3071888, | |
| "step": 4955 | |
| }, | |
| { | |
| "epoch": 8.841354723707665, | |
| "grad_norm": 0.5589842200279236, | |
| "learning_rate": 2.0230874878498648e-06, | |
| "loss": 0.2835, | |
| "num_input_tokens_seen": 3075984, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 8.850267379679144, | |
| "grad_norm": 0.544204592704773, | |
| "learning_rate": 1.9925482037469188e-06, | |
| "loss": 0.1654, | |
| "num_input_tokens_seen": 3079152, | |
| "step": 4965 | |
| }, | |
| { | |
| "epoch": 8.859180035650624, | |
| "grad_norm": 0.5478450059890747, | |
| "learning_rate": 1.9622316078328566e-06, | |
| "loss": 0.1682, | |
| "num_input_tokens_seen": 3082544, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 8.868092691622103, | |
| "grad_norm": 0.5605227947235107, | |
| "learning_rate": 1.9321379935407697e-06, | |
| "loss": 0.145, | |
| "num_input_tokens_seen": 3085680, | |
| "step": 4975 | |
| }, | |
| { | |
| "epoch": 8.877005347593583, | |
| "grad_norm": 0.5030500292778015, | |
| "learning_rate": 1.9022676521455117e-06, | |
| "loss": 0.1795, | |
| "num_input_tokens_seen": 3089392, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 8.885918003565063, | |
| "grad_norm": 0.6063732504844666, | |
| "learning_rate": 1.8726208727609219e-06, | |
| "loss": 0.1604, | |
| "num_input_tokens_seen": 3092656, | |
| "step": 4985 | |
| }, | |
| { | |
| "epoch": 8.894830659536542, | |
| "grad_norm": 0.6032387018203735, | |
| "learning_rate": 1.8431979423369604e-06, | |
| "loss": 0.1646, | |
| "num_input_tokens_seen": 3095600, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 8.903743315508022, | |
| "grad_norm": 0.4930381774902344, | |
| "learning_rate": 1.8139991456569694e-06, | |
| "loss": 0.1622, | |
| "num_input_tokens_seen": 3098320, | |
| "step": 4995 | |
| }, | |
| { | |
| "epoch": 8.9126559714795, | |
| "grad_norm": 0.8425898551940918, | |
| "learning_rate": 1.7850247653349223e-06, | |
| "loss": 0.1554, | |
| "num_input_tokens_seen": 3101520, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 8.92156862745098, | |
| "grad_norm": 0.6207576394081116, | |
| "learning_rate": 1.7562750818126556e-06, | |
| "loss": 0.1733, | |
| "num_input_tokens_seen": 3104816, | |
| "step": 5005 | |
| }, | |
| { | |
| "epoch": 8.93048128342246, | |
| "grad_norm": 0.5085470676422119, | |
| "learning_rate": 1.727750373357187e-06, | |
| "loss": 0.1686, | |
| "num_input_tokens_seen": 3108176, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 8.93939393939394, | |
| "grad_norm": 0.4193607568740845, | |
| "learning_rate": 1.699450916058018e-06, | |
| "loss": 0.1473, | |
| "num_input_tokens_seen": 3111248, | |
| "step": 5015 | |
| }, | |
| { | |
| "epoch": 8.94830659536542, | |
| "grad_norm": 0.3501569330692291, | |
| "learning_rate": 1.6713769838244325e-06, | |
| "loss": 0.154, | |
| "num_input_tokens_seen": 3114224, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 8.957219251336898, | |
| "grad_norm": 0.40926966071128845, | |
| "learning_rate": 1.6435288483828748e-06, | |
| "loss": 0.1529, | |
| "num_input_tokens_seen": 3117232, | |
| "step": 5025 | |
| }, | |
| { | |
| "epoch": 8.966131907308379, | |
| "grad_norm": 0.3181830644607544, | |
| "learning_rate": 1.615906779274326e-06, | |
| "loss": 0.2044, | |
| "num_input_tokens_seen": 3120240, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 8.975044563279857, | |
| "grad_norm": 0.9511982798576355, | |
| "learning_rate": 1.588511043851662e-06, | |
| "loss": 0.2427, | |
| "num_input_tokens_seen": 3123792, | |
| "step": 5035 | |
| }, | |
| { | |
| "epoch": 8.983957219251337, | |
| "grad_norm": 0.3971862494945526, | |
| "learning_rate": 1.5613419072770864e-06, | |
| "loss": 0.1803, | |
| "num_input_tokens_seen": 3127184, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 8.992869875222816, | |
| "grad_norm": 0.527430534362793, | |
| "learning_rate": 1.534399632519573e-06, | |
| "loss": 0.1621, | |
| "num_input_tokens_seen": 3130480, | |
| "step": 5045 | |
| }, | |
| { | |
| "epoch": 9.001782531194296, | |
| "grad_norm": 0.4454513490200043, | |
| "learning_rate": 1.5076844803522922e-06, | |
| "loss": 0.1472, | |
| "num_input_tokens_seen": 3132712, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 9.010695187165775, | |
| "grad_norm": 0.8424109816551208, | |
| "learning_rate": 1.4811967093501189e-06, | |
| "loss": 0.1594, | |
| "num_input_tokens_seen": 3135400, | |
| "step": 5055 | |
| }, | |
| { | |
| "epoch": 9.016042780748663, | |
| "eval_loss": 0.18196314573287964, | |
| "eval_runtime": 4.2599, | |
| "eval_samples_per_second": 58.452, | |
| "eval_steps_per_second": 14.789, | |
| "num_input_tokens_seen": 3137352, | |
| "step": 5058 | |
| }, | |
| { | |
| "epoch": 9.019607843137255, | |
| "grad_norm": 0.8189364075660706, | |
| "learning_rate": 1.4549365758871142e-06, | |
| "loss": 0.1552, | |
| "num_input_tokens_seen": 3138248, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 9.028520499108735, | |
| "grad_norm": 0.40512701869010925, | |
| "learning_rate": 1.4289043341340375e-06, | |
| "loss": 0.1724, | |
| "num_input_tokens_seen": 3141480, | |
| "step": 5065 | |
| }, | |
| { | |
| "epoch": 9.037433155080214, | |
| "grad_norm": 0.5652516484260559, | |
| "learning_rate": 1.4031002360558849e-06, | |
| "loss": 0.1694, | |
| "num_input_tokens_seen": 3144904, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 9.046345811051694, | |
| "grad_norm": 0.5365282893180847, | |
| "learning_rate": 1.377524531409491e-06, | |
| "loss": 0.1725, | |
| "num_input_tokens_seen": 3148968, | |
| "step": 5075 | |
| }, | |
| { | |
| "epoch": 9.055258467023172, | |
| "grad_norm": 0.3831281065940857, | |
| "learning_rate": 1.3521774677410476e-06, | |
| "loss": 0.1522, | |
| "num_input_tokens_seen": 3151912, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 9.064171122994653, | |
| "grad_norm": 0.4094650149345398, | |
| "learning_rate": 1.3270592903837503e-06, | |
| "loss": 0.1649, | |
| "num_input_tokens_seen": 3155080, | |
| "step": 5085 | |
| }, | |
| { | |
| "epoch": 9.073083778966131, | |
| "grad_norm": 0.7728195786476135, | |
| "learning_rate": 1.3021702424554221e-06, | |
| "loss": 0.1512, | |
| "num_input_tokens_seen": 3157768, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 9.081996434937611, | |
| "grad_norm": 0.6765234470367432, | |
| "learning_rate": 1.2775105648561352e-06, | |
| "loss": 0.1841, | |
| "num_input_tokens_seen": 3161224, | |
| "step": 5095 | |
| }, | |
| { | |
| "epoch": 9.090909090909092, | |
| "grad_norm": 0.5181841254234314, | |
| "learning_rate": 1.2530804962659098e-06, | |
| "loss": 0.1716, | |
| "num_input_tokens_seen": 3163944, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 9.09982174688057, | |
| "grad_norm": 0.8874284625053406, | |
| "learning_rate": 1.2288802731423883e-06, | |
| "loss": 0.176, | |
| "num_input_tokens_seen": 3166728, | |
| "step": 5105 | |
| }, | |
| { | |
| "epoch": 9.10873440285205, | |
| "grad_norm": 0.6627284288406372, | |
| "learning_rate": 1.2049101297185422e-06, | |
| "loss": 0.1661, | |
| "num_input_tokens_seen": 3170120, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 9.117647058823529, | |
| "grad_norm": 0.7040612101554871, | |
| "learning_rate": 1.1811702980004058e-06, | |
| "loss": 0.1486, | |
| "num_input_tokens_seen": 3173000, | |
| "step": 5115 | |
| }, | |
| { | |
| "epoch": 9.12655971479501, | |
| "grad_norm": 0.6169217228889465, | |
| "learning_rate": 1.1576610077648513e-06, | |
| "loss": 0.1868, | |
| "num_input_tokens_seen": 3176520, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 9.135472370766488, | |
| "grad_norm": 0.464032381772995, | |
| "learning_rate": 1.134382486557342e-06, | |
| "loss": 0.1539, | |
| "num_input_tokens_seen": 3179496, | |
| "step": 5125 | |
| }, | |
| { | |
| "epoch": 9.144385026737968, | |
| "grad_norm": 0.679073691368103, | |
| "learning_rate": 1.1113349596897331e-06, | |
| "loss": 0.1429, | |
| "num_input_tokens_seen": 3182248, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 9.153297682709447, | |
| "grad_norm": 0.32752713561058044, | |
| "learning_rate": 1.0885186502381017e-06, | |
| "loss": 0.154, | |
| "num_input_tokens_seen": 3184840, | |
| "step": 5135 | |
| }, | |
| { | |
| "epoch": 9.162210338680927, | |
| "grad_norm": 0.6518117189407349, | |
| "learning_rate": 1.0659337790405704e-06, | |
| "loss": 0.1727, | |
| "num_input_tokens_seen": 3187720, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 9.171122994652407, | |
| "grad_norm": 0.6068860292434692, | |
| "learning_rate": 1.0435805646951958e-06, | |
| "loss": 0.1512, | |
| "num_input_tokens_seen": 3190536, | |
| "step": 5145 | |
| }, | |
| { | |
| "epoch": 9.180035650623886, | |
| "grad_norm": 0.42867806553840637, | |
| "learning_rate": 1.0214592235578274e-06, | |
| "loss": 0.162, | |
| "num_input_tokens_seen": 3193608, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 9.188948306595366, | |
| "grad_norm": 0.49051374197006226, | |
| "learning_rate": 9.995699697400247e-07, | |
| "loss": 0.181, | |
| "num_input_tokens_seen": 3196936, | |
| "step": 5155 | |
| }, | |
| { | |
| "epoch": 9.197860962566844, | |
| "grad_norm": 0.5725313425064087, | |
| "learning_rate": 9.77913015106982e-07, | |
| "loss": 0.1708, | |
| "num_input_tokens_seen": 3200040, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 9.206773618538325, | |
| "grad_norm": 0.9723972082138062, | |
| "learning_rate": 9.564885692754793e-07, | |
| "loss": 0.1814, | |
| "num_input_tokens_seen": 3203240, | |
| "step": 5165 | |
| }, | |
| { | |
| "epoch": 9.215686274509803, | |
| "grad_norm": 0.506613552570343, | |
| "learning_rate": 9.352968396118628e-07, | |
| "loss": 0.1726, | |
| "num_input_tokens_seen": 3206376, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 9.224598930481283, | |
| "grad_norm": 0.6921798586845398, | |
| "learning_rate": 9.143380312300137e-07, | |
| "loss": 0.1543, | |
| "num_input_tokens_seen": 3209480, | |
| "step": 5175 | |
| }, | |
| { | |
| "epoch": 9.233511586452764, | |
| "grad_norm": 0.5370962023735046, | |
| "learning_rate": 8.936123469893892e-07, | |
| "loss": 0.2448, | |
| "num_input_tokens_seen": 3213448, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 9.242424242424242, | |
| "grad_norm": 0.6006255745887756, | |
| "learning_rate": 8.731199874930374e-07, | |
| "loss": 0.1604, | |
| "num_input_tokens_seen": 3216776, | |
| "step": 5185 | |
| }, | |
| { | |
| "epoch": 9.251336898395722, | |
| "grad_norm": 0.5161803960800171, | |
| "learning_rate": 8.528611510856766e-07, | |
| "loss": 0.1543, | |
| "num_input_tokens_seen": 3219752, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 9.260249554367201, | |
| "grad_norm": 0.5216704607009888, | |
| "learning_rate": 8.328360338517583e-07, | |
| "loss": 0.1659, | |
| "num_input_tokens_seen": 3223048, | |
| "step": 5195 | |
| }, | |
| { | |
| "epoch": 9.269162210338681, | |
| "grad_norm": 0.43477028608322144, | |
| "learning_rate": 8.130448296135768e-07, | |
| "loss": 0.1847, | |
| "num_input_tokens_seen": 3226984, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 9.27807486631016, | |
| "grad_norm": 0.5066149234771729, | |
| "learning_rate": 7.934877299293875e-07, | |
| "loss": 0.1806, | |
| "num_input_tokens_seen": 3230088, | |
| "step": 5205 | |
| }, | |
| { | |
| "epoch": 9.28698752228164, | |
| "grad_norm": 0.9408987760543823, | |
| "learning_rate": 7.741649240915666e-07, | |
| "loss": 0.1692, | |
| "num_input_tokens_seen": 3232840, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 9.29590017825312, | |
| "grad_norm": 0.41510528326034546, | |
| "learning_rate": 7.550765991247654e-07, | |
| "loss": 0.144, | |
| "num_input_tokens_seen": 3235944, | |
| "step": 5215 | |
| }, | |
| { | |
| "epoch": 9.304812834224599, | |
| "grad_norm": 0.5157932043075562, | |
| "learning_rate": 7.362229397840981e-07, | |
| "loss": 0.1744, | |
| "num_input_tokens_seen": 3238728, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 9.313725490196079, | |
| "grad_norm": 0.44517961144447327, | |
| "learning_rate": 7.17604128553373e-07, | |
| "loss": 0.1478, | |
| "num_input_tokens_seen": 3241256, | |
| "step": 5225 | |
| }, | |
| { | |
| "epoch": 9.322638146167558, | |
| "grad_norm": 0.6294628977775574, | |
| "learning_rate": 6.992203456432977e-07, | |
| "loss": 0.1887, | |
| "num_input_tokens_seen": 3244680, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 9.331550802139038, | |
| "grad_norm": 0.3271355628967285, | |
| "learning_rate": 6.810717689897633e-07, | |
| "loss": 0.1474, | |
| "num_input_tokens_seen": 3247560, | |
| "step": 5235 | |
| }, | |
| { | |
| "epoch": 9.340463458110516, | |
| "grad_norm": 0.5900879502296448, | |
| "learning_rate": 6.631585742521068e-07, | |
| "loss": 0.1654, | |
| "num_input_tokens_seen": 3251176, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 9.349376114081997, | |
| "grad_norm": 1.2029948234558105, | |
| "learning_rate": 6.454809348114044e-07, | |
| "loss": 0.1985, | |
| "num_input_tokens_seen": 3254152, | |
| "step": 5245 | |
| }, | |
| { | |
| "epoch": 9.358288770053475, | |
| "grad_norm": 0.7293168902397156, | |
| "learning_rate": 6.280390217688114e-07, | |
| "loss": 0.1636, | |
| "num_input_tokens_seen": 3256744, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 9.367201426024955, | |
| "grad_norm": 0.28766605257987976, | |
| "learning_rate": 6.108330039438892e-07, | |
| "loss": 0.1729, | |
| "num_input_tokens_seen": 3259400, | |
| "step": 5255 | |
| }, | |
| { | |
| "epoch": 9.376114081996436, | |
| "grad_norm": 0.7399141788482666, | |
| "learning_rate": 5.938630478729917e-07, | |
| "loss": 0.1547, | |
| "num_input_tokens_seen": 3262728, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 9.385026737967914, | |
| "grad_norm": 0.45791682600975037, | |
| "learning_rate": 5.771293178076286e-07, | |
| "loss": 0.1693, | |
| "num_input_tokens_seen": 3266376, | |
| "step": 5265 | |
| }, | |
| { | |
| "epoch": 9.393939393939394, | |
| "grad_norm": 0.6668148636817932, | |
| "learning_rate": 5.606319757128914e-07, | |
| "loss": 0.169, | |
| "num_input_tokens_seen": 3268808, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 9.402852049910873, | |
| "grad_norm": 0.580091655254364, | |
| "learning_rate": 5.443711812658792e-07, | |
| "loss": 0.174, | |
| "num_input_tokens_seen": 3272008, | |
| "step": 5275 | |
| }, | |
| { | |
| "epoch": 9.411764705882353, | |
| "grad_norm": 0.47462576627731323, | |
| "learning_rate": 5.283470918541616e-07, | |
| "loss": 0.1395, | |
| "num_input_tokens_seen": 3274920, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 9.420677361853832, | |
| "grad_norm": 0.4406573474407196, | |
| "learning_rate": 5.125598625742523e-07, | |
| "loss": 0.1781, | |
| "num_input_tokens_seen": 3278376, | |
| "step": 5285 | |
| }, | |
| { | |
| "epoch": 9.429590017825312, | |
| "grad_norm": 0.4939647614955902, | |
| "learning_rate": 4.970096462300927e-07, | |
| "loss": 0.1745, | |
| "num_input_tokens_seen": 3281704, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 9.43850267379679, | |
| "grad_norm": 0.3747076988220215, | |
| "learning_rate": 4.816965933315987e-07, | |
| "loss": 0.1692, | |
| "num_input_tokens_seen": 3285256, | |
| "step": 5295 | |
| }, | |
| { | |
| "epoch": 9.44741532976827, | |
| "grad_norm": 0.5448613166809082, | |
| "learning_rate": 4.6662085209318305e-07, | |
| "loss": 0.1651, | |
| "num_input_tokens_seen": 3288616, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 9.456327985739751, | |
| "grad_norm": 0.5583840608596802, | |
| "learning_rate": 4.517825684323324e-07, | |
| "loss": 0.1549, | |
| "num_input_tokens_seen": 3291752, | |
| "step": 5305 | |
| }, | |
| { | |
| "epoch": 9.46524064171123, | |
| "grad_norm": 0.4584488272666931, | |
| "learning_rate": 4.3718188596819086e-07, | |
| "loss": 0.1519, | |
| "num_input_tokens_seen": 3294344, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 9.47415329768271, | |
| "grad_norm": 0.6175810694694519, | |
| "learning_rate": 4.228189460201676e-07, | |
| "loss": 0.1706, | |
| "num_input_tokens_seen": 3297512, | |
| "step": 5315 | |
| }, | |
| { | |
| "epoch": 9.483065953654188, | |
| "grad_norm": 0.5118115544319153, | |
| "learning_rate": 4.086938876065732e-07, | |
| "loss": 0.1538, | |
| "num_input_tokens_seen": 3300296, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 9.491978609625669, | |
| "grad_norm": 0.5376412868499756, | |
| "learning_rate": 3.948068474432715e-07, | |
| "loss": 0.274, | |
| "num_input_tokens_seen": 3304360, | |
| "step": 5325 | |
| }, | |
| { | |
| "epoch": 9.500891265597147, | |
| "grad_norm": 0.5221200585365295, | |
| "learning_rate": 3.8115795994236313e-07, | |
| "loss": 0.1658, | |
| "num_input_tokens_seen": 3307304, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 9.509803921568627, | |
| "grad_norm": 0.4227612316608429, | |
| "learning_rate": 3.6774735721087085e-07, | |
| "loss": 0.1618, | |
| "num_input_tokens_seen": 3310536, | |
| "step": 5335 | |
| }, | |
| { | |
| "epoch": 9.516934046345812, | |
| "eval_loss": 0.183439701795578, | |
| "eval_runtime": 4.2535, | |
| "eval_samples_per_second": 58.539, | |
| "eval_steps_per_second": 14.811, | |
| "num_input_tokens_seen": 3312648, | |
| "step": 5339 | |
| }, | |
| { | |
| "epoch": 9.518716577540108, | |
| "grad_norm": 0.601445734500885, | |
| "learning_rate": 3.5457516904947587e-07, | |
| "loss": 0.1771, | |
| "num_input_tokens_seen": 3313672, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 9.527629233511586, | |
| "grad_norm": 0.5191211700439453, | |
| "learning_rate": 3.416415229512443e-07, | |
| "loss": 0.1688, | |
| "num_input_tokens_seen": 3317224, | |
| "step": 5345 | |
| }, | |
| { | |
| "epoch": 9.536541889483066, | |
| "grad_norm": 0.6869432330131531, | |
| "learning_rate": 3.2894654410041417e-07, | |
| "loss": 0.1661, | |
| "num_input_tokens_seen": 3319848, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 9.545454545454545, | |
| "grad_norm": 0.905884325504303, | |
| "learning_rate": 3.1649035537117123e-07, | |
| "loss": 0.1521, | |
| "num_input_tokens_seen": 3322664, | |
| "step": 5355 | |
| }, | |
| { | |
| "epoch": 9.554367201426025, | |
| "grad_norm": 0.5753766894340515, | |
| "learning_rate": 3.042730773264557e-07, | |
| "loss": 0.1512, | |
| "num_input_tokens_seen": 3325928, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 9.563279857397504, | |
| "grad_norm": 0.5148957967758179, | |
| "learning_rate": 2.9229482821680197e-07, | |
| "loss": 0.1496, | |
| "num_input_tokens_seen": 3328680, | |
| "step": 5365 | |
| }, | |
| { | |
| "epoch": 9.572192513368984, | |
| "grad_norm": 0.47426876425743103, | |
| "learning_rate": 2.8055572397919784e-07, | |
| "loss": 0.152, | |
| "num_input_tokens_seen": 3331976, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 9.581105169340464, | |
| "grad_norm": 0.5953306555747986, | |
| "learning_rate": 2.690558782359576e-07, | |
| "loss": 0.1609, | |
| "num_input_tokens_seen": 3334888, | |
| "step": 5375 | |
| }, | |
| { | |
| "epoch": 9.590017825311943, | |
| "grad_norm": 0.49842748045921326, | |
| "learning_rate": 2.5779540229361745e-07, | |
| "loss": 0.1822, | |
| "num_input_tokens_seen": 3337960, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 9.598930481283423, | |
| "grad_norm": 0.6325761079788208, | |
| "learning_rate": 2.467744051418641e-07, | |
| "loss": 0.155, | |
| "num_input_tokens_seen": 3340936, | |
| "step": 5385 | |
| }, | |
| { | |
| "epoch": 9.607843137254902, | |
| "grad_norm": 0.8439469933509827, | |
| "learning_rate": 2.3599299345248292e-07, | |
| "loss": 0.1561, | |
| "num_input_tokens_seen": 3343784, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 9.616755793226382, | |
| "grad_norm": 0.7139554619789124, | |
| "learning_rate": 2.2545127157831413e-07, | |
| "loss": 0.1669, | |
| "num_input_tokens_seen": 3347016, | |
| "step": 5395 | |
| }, | |
| { | |
| "epoch": 9.62566844919786, | |
| "grad_norm": 0.3963601291179657, | |
| "learning_rate": 2.1514934155226208e-07, | |
| "loss": 0.1412, | |
| "num_input_tokens_seen": 3349800, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 9.63458110516934, | |
| "grad_norm": 0.5459052324295044, | |
| "learning_rate": 2.0508730308627933e-07, | |
| "loss": 0.1527, | |
| "num_input_tokens_seen": 3353640, | |
| "step": 5405 | |
| }, | |
| { | |
| "epoch": 9.643493761140821, | |
| "grad_norm": 0.7221339344978333, | |
| "learning_rate": 1.9526525357043136e-07, | |
| "loss": 0.1708, | |
| "num_input_tokens_seen": 3356904, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 9.6524064171123, | |
| "grad_norm": 0.39834100008010864, | |
| "learning_rate": 1.8568328807193337e-07, | |
| "loss": 0.1623, | |
| "num_input_tokens_seen": 3360232, | |
| "step": 5415 | |
| }, | |
| { | |
| "epoch": 9.66131907308378, | |
| "grad_norm": 0.3296028673648834, | |
| "learning_rate": 1.7634149933423993e-07, | |
| "loss": 0.1723, | |
| "num_input_tokens_seen": 3362824, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 9.670231729055258, | |
| "grad_norm": 0.6187313199043274, | |
| "learning_rate": 1.6723997777614574e-07, | |
| "loss": 0.2013, | |
| "num_input_tokens_seen": 3366152, | |
| "step": 5425 | |
| }, | |
| { | |
| "epoch": 9.679144385026738, | |
| "grad_norm": 0.4088561236858368, | |
| "learning_rate": 1.5837881149090294e-07, | |
| "loss": 0.1668, | |
| "num_input_tokens_seen": 3369192, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 9.688057040998217, | |
| "grad_norm": 0.6721343994140625, | |
| "learning_rate": 1.497580862453829e-07, | |
| "loss": 0.1767, | |
| "num_input_tokens_seen": 3372776, | |
| "step": 5435 | |
| }, | |
| { | |
| "epoch": 9.696969696969697, | |
| "grad_norm": 0.6333170533180237, | |
| "learning_rate": 1.4137788547923246e-07, | |
| "loss": 0.1829, | |
| "num_input_tokens_seen": 3376232, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 9.705882352941176, | |
| "grad_norm": 0.6064999103546143, | |
| "learning_rate": 1.3323829030407465e-07, | |
| "loss": 0.1916, | |
| "num_input_tokens_seen": 3379912, | |
| "step": 5445 | |
| }, | |
| { | |
| "epoch": 9.714795008912656, | |
| "grad_norm": 0.5454294085502625, | |
| "learning_rate": 1.2533937950272023e-07, | |
| "loss": 0.1639, | |
| "num_input_tokens_seen": 3382824, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 9.723707664884136, | |
| "grad_norm": 0.4902726411819458, | |
| "learning_rate": 1.176812295283991e-07, | |
| "loss": 0.1577, | |
| "num_input_tokens_seen": 3385640, | |
| "step": 5455 | |
| }, | |
| { | |
| "epoch": 9.732620320855615, | |
| "grad_norm": 0.4689973294734955, | |
| "learning_rate": 1.1026391450404128e-07, | |
| "loss": 0.1652, | |
| "num_input_tokens_seen": 3389672, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 9.741532976827095, | |
| "grad_norm": 0.6127117276191711, | |
| "learning_rate": 1.0308750622153307e-07, | |
| "loss": 0.1815, | |
| "num_input_tokens_seen": 3393096, | |
| "step": 5465 | |
| }, | |
| { | |
| "epoch": 9.750445632798574, | |
| "grad_norm": 0.40860888361930847, | |
| "learning_rate": 9.615207414103434e-08, | |
| "loss": 0.149, | |
| "num_input_tokens_seen": 3396136, | |
| "step": 5470 | |
| }, | |
| { | |
| "epoch": 9.759358288770054, | |
| "grad_norm": 0.5143342018127441, | |
| "learning_rate": 8.945768539031785e-08, | |
| "loss": 0.1785, | |
| "num_input_tokens_seen": 3399304, | |
| "step": 5475 | |
| }, | |
| { | |
| "epoch": 9.768270944741532, | |
| "grad_norm": 0.599516749382019, | |
| "learning_rate": 8.30044047640921e-08, | |
| "loss": 0.1617, | |
| "num_input_tokens_seen": 3402216, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 9.777183600713013, | |
| "grad_norm": 0.37185174226760864, | |
| "learning_rate": 7.679229472340176e-08, | |
| "loss": 0.1554, | |
| "num_input_tokens_seen": 3405096, | |
| "step": 5485 | |
| }, | |
| { | |
| "epoch": 9.786096256684491, | |
| "grad_norm": 0.4413319528102875, | |
| "learning_rate": 7.082141539500597e-08, | |
| "loss": 0.1639, | |
| "num_input_tokens_seen": 3407912, | |
| "step": 5490 | |
| }, | |
| { | |
| "epoch": 9.795008912655971, | |
| "grad_norm": 0.7090705633163452, | |
| "learning_rate": 6.509182457080376e-08, | |
| "loss": 0.1679, | |
| "num_input_tokens_seen": 3410856, | |
| "step": 5495 | |
| }, | |
| { | |
| "epoch": 9.803921568627452, | |
| "grad_norm": 0.5437349677085876, | |
| "learning_rate": 5.9603577707267875e-08, | |
| "loss": 0.1559, | |
| "num_input_tokens_seen": 3413928, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 9.81283422459893, | |
| "grad_norm": 0.5729760527610779, | |
| "learning_rate": 5.435672792491742e-08, | |
| "loss": 0.1623, | |
| "num_input_tokens_seen": 3417416, | |
| "step": 5505 | |
| }, | |
| { | |
| "epoch": 9.82174688057041, | |
| "grad_norm": 0.38444051146507263, | |
| "learning_rate": 4.935132600780157e-08, | |
| "loss": 0.1769, | |
| "num_input_tokens_seen": 3420136, | |
| "step": 5510 | |
| }, | |
| { | |
| "epoch": 9.830659536541889, | |
| "grad_norm": 0.4345572292804718, | |
| "learning_rate": 4.4587420402997235e-08, | |
| "loss": 0.1537, | |
| "num_input_tokens_seen": 3423272, | |
| "step": 5515 | |
| }, | |
| { | |
| "epoch": 9.83957219251337, | |
| "grad_norm": 0.44134852290153503, | |
| "learning_rate": 4.006505722015386e-08, | |
| "loss": 0.1499, | |
| "num_input_tokens_seen": 3426472, | |
| "step": 5520 | |
| }, | |
| { | |
| "epoch": 9.848484848484848, | |
| "grad_norm": 0.6951932907104492, | |
| "learning_rate": 3.578428023103819e-08, | |
| "loss": 0.1725, | |
| "num_input_tokens_seen": 3429992, | |
| "step": 5525 | |
| }, | |
| { | |
| "epoch": 9.857397504456328, | |
| "grad_norm": 0.47553181648254395, | |
| "learning_rate": 3.1745130869123566e-08, | |
| "loss": 0.1554, | |
| "num_input_tokens_seen": 3432456, | |
| "step": 5530 | |
| }, | |
| { | |
| "epoch": 9.866310160427808, | |
| "grad_norm": 0.5962952375411987, | |
| "learning_rate": 2.794764822916518e-08, | |
| "loss": 0.1618, | |
| "num_input_tokens_seen": 3434888, | |
| "step": 5535 | |
| }, | |
| { | |
| "epoch": 9.875222816399287, | |
| "grad_norm": 0.4873346984386444, | |
| "learning_rate": 2.4391869066844874e-08, | |
| "loss": 0.1773, | |
| "num_input_tokens_seen": 3437832, | |
| "step": 5540 | |
| }, | |
| { | |
| "epoch": 9.884135472370767, | |
| "grad_norm": 0.65750652551651, | |
| "learning_rate": 2.1077827798404726e-08, | |
| "loss": 0.1697, | |
| "num_input_tokens_seen": 3440872, | |
| "step": 5545 | |
| }, | |
| { | |
| "epoch": 9.893048128342246, | |
| "grad_norm": 0.4054161012172699, | |
| "learning_rate": 1.8005556500313993e-08, | |
| "loss": 0.1495, | |
| "num_input_tokens_seen": 3443784, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 9.901960784313726, | |
| "grad_norm": 0.605219841003418, | |
| "learning_rate": 1.51750849089638e-08, | |
| "loss": 0.1643, | |
| "num_input_tokens_seen": 3447592, | |
| "step": 5555 | |
| }, | |
| { | |
| "epoch": 9.910873440285204, | |
| "grad_norm": 0.3572712540626526, | |
| "learning_rate": 1.2586440420372936e-08, | |
| "loss": 0.1714, | |
| "num_input_tokens_seen": 3451048, | |
| "step": 5560 | |
| }, | |
| { | |
| "epoch": 9.919786096256685, | |
| "grad_norm": 0.5080024600028992, | |
| "learning_rate": 1.023964808992417e-08, | |
| "loss": 0.1497, | |
| "num_input_tokens_seen": 3453928, | |
| "step": 5565 | |
| }, | |
| { | |
| "epoch": 9.928698752228165, | |
| "grad_norm": 0.5494665503501892, | |
| "learning_rate": 8.134730632125554e-09, | |
| "loss": 0.1739, | |
| "num_input_tokens_seen": 3456968, | |
| "step": 5570 | |
| }, | |
| { | |
| "epoch": 9.937611408199643, | |
| "grad_norm": 0.5445519089698792, | |
| "learning_rate": 6.271708420385603e-09, | |
| "loss": 0.1683, | |
| "num_input_tokens_seen": 3460616, | |
| "step": 5575 | |
| }, | |
| { | |
| "epoch": 9.946524064171124, | |
| "grad_norm": 0.4502975046634674, | |
| "learning_rate": 4.650599486827334e-09, | |
| "loss": 0.1625, | |
| "num_input_tokens_seen": 3463592, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 9.955436720142602, | |
| "grad_norm": 0.713843047618866, | |
| "learning_rate": 3.2714195220912013e-09, | |
| "loss": 0.1604, | |
| "num_input_tokens_seen": 3466888, | |
| "step": 5585 | |
| }, | |
| { | |
| "epoch": 9.964349376114082, | |
| "grad_norm": 0.457069456577301, | |
| "learning_rate": 2.134181875204644e-09, | |
| "loss": 0.1602, | |
| "num_input_tokens_seen": 3470408, | |
| "step": 5590 | |
| }, | |
| { | |
| "epoch": 9.973262032085561, | |
| "grad_norm": 0.5743651390075684, | |
| "learning_rate": 1.2388975534460834e-09, | |
| "loss": 0.1584, | |
| "num_input_tokens_seen": 3473608, | |
| "step": 5595 | |
| }, | |
| { | |
| "epoch": 9.982174688057041, | |
| "grad_norm": 0.41813942790031433, | |
| "learning_rate": 5.855752222366783e-10, | |
| "loss": 0.163, | |
| "num_input_tokens_seen": 3476616, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 9.99108734402852, | |
| "grad_norm": 0.4171542227268219, | |
| "learning_rate": 1.7422120505705686e-10, | |
| "loss": 0.1549, | |
| "num_input_tokens_seen": 3479624, | |
| "step": 5605 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "grad_norm": 1.7215794324874878, | |
| "learning_rate": 4.839483383478616e-12, | |
| "loss": 0.1694, | |
| "num_input_tokens_seen": 3481336, | |
| "step": 5610 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "num_input_tokens_seen": 3481336, | |
| "step": 5610, | |
| "total_flos": 1.5676298662753075e+17, | |
| "train_loss": 0.9318533902924754, | |
| "train_runtime": 970.4341, | |
| "train_samples_per_second": 23.093, | |
| "train_steps_per_second": 5.781 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 5610, | |
| "num_input_tokens_seen": 3481336, | |
| "num_train_epochs": 10, | |
| "save_steps": 281, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.5676298662753075e+17, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |