| { |
| "best_global_step": 1992, |
| "best_metric": 0.16210927069187164, |
| "best_model_checkpoint": "saves_multiple/prefix-tuning/llama-3-8b-instruct/train_rte_456_1760637784/checkpoint-1992", |
| "epoch": 20.0, |
| "eval_steps": 996, |
| "global_step": 9960, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.010040160642570281, |
| "grad_norm": 194.0540008544922, |
| "learning_rate": 4.016064257028112e-08, |
| "loss": 8.9772, |
| "num_input_tokens_seen": 2176, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.020080321285140562, |
| "grad_norm": 214.29051208496094, |
| "learning_rate": 9.036144578313253e-08, |
| "loss": 9.1361, |
| "num_input_tokens_seen": 6144, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.030120481927710843, |
| "grad_norm": 204.60433959960938, |
| "learning_rate": 1.4056224899598394e-07, |
| "loss": 8.8855, |
| "num_input_tokens_seen": 9408, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.040160642570281124, |
| "grad_norm": 167.90016174316406, |
| "learning_rate": 1.9076305220883537e-07, |
| "loss": 8.6766, |
| "num_input_tokens_seen": 11840, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.050200803212851405, |
| "grad_norm": 216.1599578857422, |
| "learning_rate": 2.409638554216868e-07, |
| "loss": 8.3983, |
| "num_input_tokens_seen": 14912, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.060240963855421686, |
| "grad_norm": 162.7516326904297, |
| "learning_rate": 2.911646586345382e-07, |
| "loss": 8.1478, |
| "num_input_tokens_seen": 18304, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.07028112449799197, |
| "grad_norm": 177.10873413085938, |
| "learning_rate": 3.413654618473896e-07, |
| "loss": 7.7226, |
| "num_input_tokens_seen": 21344, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.08032128514056225, |
| "grad_norm": 161.67369079589844, |
| "learning_rate": 3.91566265060241e-07, |
| "loss": 7.4309, |
| "num_input_tokens_seen": 25088, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.09036144578313253, |
| "grad_norm": 146.87649536132812, |
| "learning_rate": 4.417670682730924e-07, |
| "loss": 7.098, |
| "num_input_tokens_seen": 28224, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.10040160642570281, |
| "grad_norm": 110.42221069335938, |
| "learning_rate": 4.919678714859438e-07, |
| "loss": 6.5589, |
| "num_input_tokens_seen": 31968, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.11044176706827309, |
| "grad_norm": 104.6060791015625, |
| "learning_rate": 5.421686746987952e-07, |
| "loss": 6.3171, |
| "num_input_tokens_seen": 35104, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.12048192771084337, |
| "grad_norm": 90.55043029785156, |
| "learning_rate": 5.923694779116467e-07, |
| "loss": 5.7753, |
| "num_input_tokens_seen": 38240, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.13052208835341367, |
| "grad_norm": 82.31653594970703, |
| "learning_rate": 6.425702811244979e-07, |
| "loss": 5.6056, |
| "num_input_tokens_seen": 40992, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.14056224899598393, |
| "grad_norm": 80.8585433959961, |
| "learning_rate": 6.927710843373495e-07, |
| "loss": 5.2274, |
| "num_input_tokens_seen": 44544, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.15060240963855423, |
| "grad_norm": 78.3877182006836, |
| "learning_rate": 7.429718875502008e-07, |
| "loss": 4.6893, |
| "num_input_tokens_seen": 48000, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.1606425702811245, |
| "grad_norm": 94.57144165039062, |
| "learning_rate": 7.931726907630523e-07, |
| "loss": 3.8478, |
| "num_input_tokens_seen": 51648, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.1706827309236948, |
| "grad_norm": 75.64936828613281, |
| "learning_rate": 8.433734939759036e-07, |
| "loss": 3.5727, |
| "num_input_tokens_seen": 54656, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.18072289156626506, |
| "grad_norm": 65.45319366455078, |
| "learning_rate": 8.935742971887551e-07, |
| "loss": 3.1493, |
| "num_input_tokens_seen": 57088, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.19076305220883535, |
| "grad_norm": 68.6191635131836, |
| "learning_rate": 9.437751004016064e-07, |
| "loss": 2.8682, |
| "num_input_tokens_seen": 60256, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.20080321285140562, |
| "grad_norm": 64.3413314819336, |
| "learning_rate": 9.93975903614458e-07, |
| "loss": 2.6081, |
| "num_input_tokens_seen": 63072, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.21084337349397592, |
| "grad_norm": 59.27682113647461, |
| "learning_rate": 1.0441767068273092e-06, |
| "loss": 1.6849, |
| "num_input_tokens_seen": 65312, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.22088353413654618, |
| "grad_norm": 43.84572219848633, |
| "learning_rate": 1.0943775100401608e-06, |
| "loss": 1.5056, |
| "num_input_tokens_seen": 68352, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.23092369477911648, |
| "grad_norm": 45.57728576660156, |
| "learning_rate": 1.1445783132530121e-06, |
| "loss": 1.2485, |
| "num_input_tokens_seen": 71104, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.24096385542168675, |
| "grad_norm": 46.22006607055664, |
| "learning_rate": 1.1947791164658635e-06, |
| "loss": 1.0266, |
| "num_input_tokens_seen": 75040, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.25100401606425704, |
| "grad_norm": 53.21188735961914, |
| "learning_rate": 1.2449799196787148e-06, |
| "loss": 0.8694, |
| "num_input_tokens_seen": 77920, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.26104417670682734, |
| "grad_norm": 31.533672332763672, |
| "learning_rate": 1.2951807228915664e-06, |
| "loss": 0.5457, |
| "num_input_tokens_seen": 80000, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.2710843373493976, |
| "grad_norm": 50.55741500854492, |
| "learning_rate": 1.345381526104418e-06, |
| "loss": 0.6048, |
| "num_input_tokens_seen": 82944, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.28112449799196787, |
| "grad_norm": 44.2205924987793, |
| "learning_rate": 1.395582329317269e-06, |
| "loss": 0.3919, |
| "num_input_tokens_seen": 86368, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.29116465863453816, |
| "grad_norm": 34.489261627197266, |
| "learning_rate": 1.4457831325301204e-06, |
| "loss": 0.312, |
| "num_input_tokens_seen": 89344, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.30120481927710846, |
| "grad_norm": 21.01329231262207, |
| "learning_rate": 1.495983935742972e-06, |
| "loss": 0.2272, |
| "num_input_tokens_seen": 92128, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.3112449799196787, |
| "grad_norm": 23.1704158782959, |
| "learning_rate": 1.5461847389558236e-06, |
| "loss": 0.2378, |
| "num_input_tokens_seen": 95296, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.321285140562249, |
| "grad_norm": 20.119722366333008, |
| "learning_rate": 1.5963855421686747e-06, |
| "loss": 0.1967, |
| "num_input_tokens_seen": 98272, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.3313253012048193, |
| "grad_norm": 61.622379302978516, |
| "learning_rate": 1.6465863453815263e-06, |
| "loss": 0.2241, |
| "num_input_tokens_seen": 101344, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.3413654618473896, |
| "grad_norm": 29.35689926147461, |
| "learning_rate": 1.6967871485943776e-06, |
| "loss": 0.1959, |
| "num_input_tokens_seen": 103808, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.3514056224899598, |
| "grad_norm": 28.446813583374023, |
| "learning_rate": 1.7469879518072292e-06, |
| "loss": 0.1855, |
| "num_input_tokens_seen": 106880, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.3614457831325301, |
| "grad_norm": 30.122529983520508, |
| "learning_rate": 1.7971887550200803e-06, |
| "loss": 0.1717, |
| "num_input_tokens_seen": 110016, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.3714859437751004, |
| "grad_norm": 19.520801544189453, |
| "learning_rate": 1.8473895582329318e-06, |
| "loss": 0.1764, |
| "num_input_tokens_seen": 112896, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.3815261044176707, |
| "grad_norm": 28.57624053955078, |
| "learning_rate": 1.8975903614457832e-06, |
| "loss": 0.2006, |
| "num_input_tokens_seen": 115808, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.39156626506024095, |
| "grad_norm": 26.32992172241211, |
| "learning_rate": 1.947791164658635e-06, |
| "loss": 0.2027, |
| "num_input_tokens_seen": 118656, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.40160642570281124, |
| "grad_norm": 30.263748168945312, |
| "learning_rate": 1.997991967871486e-06, |
| "loss": 0.2035, |
| "num_input_tokens_seen": 121728, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.41164658634538154, |
| "grad_norm": 39.99821472167969, |
| "learning_rate": 2.0481927710843377e-06, |
| "loss": 0.335, |
| "num_input_tokens_seen": 124128, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.42168674698795183, |
| "grad_norm": 7.4671549797058105, |
| "learning_rate": 2.098393574297189e-06, |
| "loss": 0.1571, |
| "num_input_tokens_seen": 127072, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.43172690763052207, |
| "grad_norm": 24.52522087097168, |
| "learning_rate": 2.1485943775100404e-06, |
| "loss": 0.1704, |
| "num_input_tokens_seen": 129888, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.44176706827309237, |
| "grad_norm": 11.86972427368164, |
| "learning_rate": 2.1987951807228917e-06, |
| "loss": 0.1659, |
| "num_input_tokens_seen": 132800, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.45180722891566266, |
| "grad_norm": 16.253108978271484, |
| "learning_rate": 2.248995983935743e-06, |
| "loss": 0.1919, |
| "num_input_tokens_seen": 136480, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.46184738955823296, |
| "grad_norm": 31.7047119140625, |
| "learning_rate": 2.2991967871485944e-06, |
| "loss": 0.1908, |
| "num_input_tokens_seen": 140352, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.4718875502008032, |
| "grad_norm": 21.108055114746094, |
| "learning_rate": 2.349397590361446e-06, |
| "loss": 0.1697, |
| "num_input_tokens_seen": 143616, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.4819277108433735, |
| "grad_norm": 15.181034088134766, |
| "learning_rate": 2.399598393574297e-06, |
| "loss": 0.1851, |
| "num_input_tokens_seen": 146656, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.4919678714859438, |
| "grad_norm": 23.453062057495117, |
| "learning_rate": 2.449799196787149e-06, |
| "loss": 0.143, |
| "num_input_tokens_seen": 149664, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.5020080321285141, |
| "grad_norm": 27.794658660888672, |
| "learning_rate": 2.5e-06, |
| "loss": 0.2288, |
| "num_input_tokens_seen": 153344, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.5120481927710844, |
| "grad_norm": 39.275882720947266, |
| "learning_rate": 2.5502008032128516e-06, |
| "loss": 0.2078, |
| "num_input_tokens_seen": 156224, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.5220883534136547, |
| "grad_norm": 25.21923065185547, |
| "learning_rate": 2.6004016064257033e-06, |
| "loss": 0.1754, |
| "num_input_tokens_seen": 159648, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.5321285140562249, |
| "grad_norm": 15.785544395446777, |
| "learning_rate": 2.6506024096385547e-06, |
| "loss": 0.1866, |
| "num_input_tokens_seen": 162304, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.5421686746987951, |
| "grad_norm": 32.11054229736328, |
| "learning_rate": 2.700803212851406e-06, |
| "loss": 0.1895, |
| "num_input_tokens_seen": 165632, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.5522088353413654, |
| "grad_norm": 12.410296440124512, |
| "learning_rate": 2.751004016064257e-06, |
| "loss": 0.1845, |
| "num_input_tokens_seen": 169024, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.5622489959839357, |
| "grad_norm": 16.866844177246094, |
| "learning_rate": 2.8012048192771087e-06, |
| "loss": 0.1919, |
| "num_input_tokens_seen": 173056, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.572289156626506, |
| "grad_norm": 12.017806053161621, |
| "learning_rate": 2.85140562248996e-06, |
| "loss": 0.186, |
| "num_input_tokens_seen": 175904, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.5823293172690763, |
| "grad_norm": 10.06004810333252, |
| "learning_rate": 2.9016064257028114e-06, |
| "loss": 0.1724, |
| "num_input_tokens_seen": 178816, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.5923694779116466, |
| "grad_norm": 22.80934715270996, |
| "learning_rate": 2.9518072289156627e-06, |
| "loss": 0.1727, |
| "num_input_tokens_seen": 182464, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.6024096385542169, |
| "grad_norm": 10.76999282836914, |
| "learning_rate": 3.0020080321285145e-06, |
| "loss": 0.1692, |
| "num_input_tokens_seen": 186368, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.6124497991967871, |
| "grad_norm": 21.18651008605957, |
| "learning_rate": 3.052208835341366e-06, |
| "loss": 0.2044, |
| "num_input_tokens_seen": 189184, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.6224899598393574, |
| "grad_norm": 25.353771209716797, |
| "learning_rate": 3.1024096385542172e-06, |
| "loss": 0.1926, |
| "num_input_tokens_seen": 192640, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.6325301204819277, |
| "grad_norm": 15.923639297485352, |
| "learning_rate": 3.152610441767068e-06, |
| "loss": 0.1875, |
| "num_input_tokens_seen": 195968, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.642570281124498, |
| "grad_norm": 11.498019218444824, |
| "learning_rate": 3.20281124497992e-06, |
| "loss": 0.1517, |
| "num_input_tokens_seen": 199168, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.6526104417670683, |
| "grad_norm": 12.303227424621582, |
| "learning_rate": 3.2530120481927713e-06, |
| "loss": 0.1917, |
| "num_input_tokens_seen": 201728, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.6626506024096386, |
| "grad_norm": 9.231768608093262, |
| "learning_rate": 3.3032128514056226e-06, |
| "loss": 0.1671, |
| "num_input_tokens_seen": 204704, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.6726907630522089, |
| "grad_norm": 8.526322364807129, |
| "learning_rate": 3.3534136546184744e-06, |
| "loss": 0.1743, |
| "num_input_tokens_seen": 207616, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.6827309236947792, |
| "grad_norm": 29.402254104614258, |
| "learning_rate": 3.4036144578313257e-06, |
| "loss": 0.1898, |
| "num_input_tokens_seen": 210528, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.6927710843373494, |
| "grad_norm": 24.25574493408203, |
| "learning_rate": 3.453815261044177e-06, |
| "loss": 0.2041, |
| "num_input_tokens_seen": 214752, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.7028112449799196, |
| "grad_norm": 7.983010768890381, |
| "learning_rate": 3.504016064257029e-06, |
| "loss": 0.2241, |
| "num_input_tokens_seen": 217408, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.7128514056224899, |
| "grad_norm": 9.211853981018066, |
| "learning_rate": 3.5542168674698798e-06, |
| "loss": 0.2229, |
| "num_input_tokens_seen": 220480, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.7228915662650602, |
| "grad_norm": 6.233531951904297, |
| "learning_rate": 3.604417670682731e-06, |
| "loss": 0.1506, |
| "num_input_tokens_seen": 223520, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.7329317269076305, |
| "grad_norm": 20.131092071533203, |
| "learning_rate": 3.6546184738955825e-06, |
| "loss": 0.153, |
| "num_input_tokens_seen": 226752, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.7429718875502008, |
| "grad_norm": 21.862136840820312, |
| "learning_rate": 3.7048192771084342e-06, |
| "loss": 0.1937, |
| "num_input_tokens_seen": 230080, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.7530120481927711, |
| "grad_norm": 17.098079681396484, |
| "learning_rate": 3.7550200803212856e-06, |
| "loss": 0.1582, |
| "num_input_tokens_seen": 233280, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.7630522088353414, |
| "grad_norm": 17.18459701538086, |
| "learning_rate": 3.805220883534137e-06, |
| "loss": 0.1743, |
| "num_input_tokens_seen": 236416, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.7730923694779116, |
| "grad_norm": 8.394510269165039, |
| "learning_rate": 3.855421686746989e-06, |
| "loss": 0.1722, |
| "num_input_tokens_seen": 240352, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.7831325301204819, |
| "grad_norm": 9.847726821899414, |
| "learning_rate": 3.90562248995984e-06, |
| "loss": 0.1773, |
| "num_input_tokens_seen": 243296, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.7931726907630522, |
| "grad_norm": 16.50226593017578, |
| "learning_rate": 3.9558232931726905e-06, |
| "loss": 0.1752, |
| "num_input_tokens_seen": 246656, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.8032128514056225, |
| "grad_norm": 7.0121355056762695, |
| "learning_rate": 4.006024096385543e-06, |
| "loss": 0.166, |
| "num_input_tokens_seen": 250112, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.8132530120481928, |
| "grad_norm": 4.569051265716553, |
| "learning_rate": 4.056224899598394e-06, |
| "loss": 0.1758, |
| "num_input_tokens_seen": 253632, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.8232931726907631, |
| "grad_norm": 8.35086441040039, |
| "learning_rate": 4.106425702811245e-06, |
| "loss": 0.1631, |
| "num_input_tokens_seen": 256864, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.8333333333333334, |
| "grad_norm": 20.156761169433594, |
| "learning_rate": 4.156626506024097e-06, |
| "loss": 0.1934, |
| "num_input_tokens_seen": 259584, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.8433734939759037, |
| "grad_norm": 23.27678680419922, |
| "learning_rate": 4.206827309236948e-06, |
| "loss": 0.1684, |
| "num_input_tokens_seen": 262560, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.8534136546184738, |
| "grad_norm": 21.717554092407227, |
| "learning_rate": 4.2570281124497995e-06, |
| "loss": 0.26, |
| "num_input_tokens_seen": 265760, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.8634538152610441, |
| "grad_norm": 20.287023544311523, |
| "learning_rate": 4.307228915662651e-06, |
| "loss": 0.1703, |
| "num_input_tokens_seen": 267904, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.8734939759036144, |
| "grad_norm": 5.225078105926514, |
| "learning_rate": 4.357429718875502e-06, |
| "loss": 0.1868, |
| "num_input_tokens_seen": 272288, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.8835341365461847, |
| "grad_norm": 5.046104907989502, |
| "learning_rate": 4.4076305220883535e-06, |
| "loss": 0.1644, |
| "num_input_tokens_seen": 275232, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.893574297188755, |
| "grad_norm": 12.361194610595703, |
| "learning_rate": 4.457831325301205e-06, |
| "loss": 0.177, |
| "num_input_tokens_seen": 278624, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.9036144578313253, |
| "grad_norm": 10.267531394958496, |
| "learning_rate": 4.508032128514056e-06, |
| "loss": 0.1735, |
| "num_input_tokens_seen": 281696, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.9136546184738956, |
| "grad_norm": 9.60224437713623, |
| "learning_rate": 4.558232931726908e-06, |
| "loss": 0.1862, |
| "num_input_tokens_seen": 284832, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.9236947791164659, |
| "grad_norm": 6.127063751220703, |
| "learning_rate": 4.60843373493976e-06, |
| "loss": 0.1664, |
| "num_input_tokens_seen": 287424, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.9337349397590361, |
| "grad_norm": 4.402464866638184, |
| "learning_rate": 4.658634538152611e-06, |
| "loss": 0.1646, |
| "num_input_tokens_seen": 289792, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.9437751004016064, |
| "grad_norm": 18.304546356201172, |
| "learning_rate": 4.7088353413654624e-06, |
| "loss": 0.1703, |
| "num_input_tokens_seen": 293152, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.9538152610441767, |
| "grad_norm": 5.38660192489624, |
| "learning_rate": 4.759036144578314e-06, |
| "loss": 0.1596, |
| "num_input_tokens_seen": 296320, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.963855421686747, |
| "grad_norm": 13.366776466369629, |
| "learning_rate": 4.809236947791165e-06, |
| "loss": 0.1694, |
| "num_input_tokens_seen": 299616, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.9738955823293173, |
| "grad_norm": 7.804983615875244, |
| "learning_rate": 4.8594377510040165e-06, |
| "loss": 0.1726, |
| "num_input_tokens_seen": 302528, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.9839357429718876, |
| "grad_norm": 10.998905181884766, |
| "learning_rate": 4.909638554216868e-06, |
| "loss": 0.1914, |
| "num_input_tokens_seen": 305120, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.9939759036144579, |
| "grad_norm": 13.699494361877441, |
| "learning_rate": 4.959839357429719e-06, |
| "loss": 0.1552, |
| "num_input_tokens_seen": 308928, |
| "step": 495 |
| }, |
| { |
| "epoch": 1.0040160642570282, |
| "grad_norm": 6.078279495239258, |
| "learning_rate": 5.0100401606425705e-06, |
| "loss": 0.1611, |
| "num_input_tokens_seen": 312512, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.0140562248995983, |
| "grad_norm": 14.086797714233398, |
| "learning_rate": 5.060240963855422e-06, |
| "loss": 0.182, |
| "num_input_tokens_seen": 315232, |
| "step": 505 |
| }, |
| { |
| "epoch": 1.0240963855421688, |
| "grad_norm": 15.124547004699707, |
| "learning_rate": 5.110441767068274e-06, |
| "loss": 0.217, |
| "num_input_tokens_seen": 317952, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.034136546184739, |
| "grad_norm": 5.822263717651367, |
| "learning_rate": 5.1606425702811245e-06, |
| "loss": 0.1535, |
| "num_input_tokens_seen": 321696, |
| "step": 515 |
| }, |
| { |
| "epoch": 1.0441767068273093, |
| "grad_norm": 6.9545722007751465, |
| "learning_rate": 5.210843373493977e-06, |
| "loss": 0.1515, |
| "num_input_tokens_seen": 325184, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.0542168674698795, |
| "grad_norm": 6.776535511016846, |
| "learning_rate": 5.261044176706827e-06, |
| "loss": 0.1761, |
| "num_input_tokens_seen": 328832, |
| "step": 525 |
| }, |
| { |
| "epoch": 1.0642570281124497, |
| "grad_norm": 6.585295677185059, |
| "learning_rate": 5.3112449799196794e-06, |
| "loss": 0.1718, |
| "num_input_tokens_seen": 332320, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.0742971887550201, |
| "grad_norm": 5.499662399291992, |
| "learning_rate": 5.361445783132531e-06, |
| "loss": 0.1717, |
| "num_input_tokens_seen": 335584, |
| "step": 535 |
| }, |
| { |
| "epoch": 1.0843373493975903, |
| "grad_norm": 16.805959701538086, |
| "learning_rate": 5.411646586345381e-06, |
| "loss": 0.1412, |
| "num_input_tokens_seen": 338464, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.0943775100401607, |
| "grad_norm": 4.637701034545898, |
| "learning_rate": 5.4618473895582335e-06, |
| "loss": 0.192, |
| "num_input_tokens_seen": 341248, |
| "step": 545 |
| }, |
| { |
| "epoch": 1.104417670682731, |
| "grad_norm": 8.762520790100098, |
| "learning_rate": 5.512048192771085e-06, |
| "loss": 0.1577, |
| "num_input_tokens_seen": 344288, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.1144578313253013, |
| "grad_norm": 15.590083122253418, |
| "learning_rate": 5.562248995983936e-06, |
| "loss": 0.1656, |
| "num_input_tokens_seen": 347456, |
| "step": 555 |
| }, |
| { |
| "epoch": 1.1244979919678715, |
| "grad_norm": 3.6315433979034424, |
| "learning_rate": 5.6124497991967875e-06, |
| "loss": 0.144, |
| "num_input_tokens_seen": 350400, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.1345381526104417, |
| "grad_norm": 7.174181938171387, |
| "learning_rate": 5.66265060240964e-06, |
| "loss": 0.1922, |
| "num_input_tokens_seen": 352896, |
| "step": 565 |
| }, |
| { |
| "epoch": 1.144578313253012, |
| "grad_norm": 13.603584289550781, |
| "learning_rate": 5.71285140562249e-06, |
| "loss": 0.1733, |
| "num_input_tokens_seen": 355328, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.1546184738955823, |
| "grad_norm": 4.278202533721924, |
| "learning_rate": 5.7630522088353416e-06, |
| "loss": 0.1604, |
| "num_input_tokens_seen": 358240, |
| "step": 575 |
| }, |
| { |
| "epoch": 1.1646586345381527, |
| "grad_norm": 7.5159149169921875, |
| "learning_rate": 5.813253012048194e-06, |
| "loss": 0.2166, |
| "num_input_tokens_seen": 361792, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.1746987951807228, |
| "grad_norm": 2.941038131713867, |
| "learning_rate": 5.863453815261044e-06, |
| "loss": 0.1522, |
| "num_input_tokens_seen": 364960, |
| "step": 585 |
| }, |
| { |
| "epoch": 1.1847389558232932, |
| "grad_norm": 14.475259780883789, |
| "learning_rate": 5.9136546184738964e-06, |
| "loss": 0.1469, |
| "num_input_tokens_seen": 368416, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.1947791164658634, |
| "grad_norm": 9.602474212646484, |
| "learning_rate": 5.963855421686747e-06, |
| "loss": 0.1788, |
| "num_input_tokens_seen": 371872, |
| "step": 595 |
| }, |
| { |
| "epoch": 1.2048192771084336, |
| "grad_norm": 11.142339706420898, |
| "learning_rate": 6.014056224899599e-06, |
| "loss": 0.1649, |
| "num_input_tokens_seen": 374432, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.214859437751004, |
| "grad_norm": 4.123423099517822, |
| "learning_rate": 6.0642570281124505e-06, |
| "loss": 0.1436, |
| "num_input_tokens_seen": 378080, |
| "step": 605 |
| }, |
| { |
| "epoch": 1.2248995983935742, |
| "grad_norm": 15.446894645690918, |
| "learning_rate": 6.114457831325302e-06, |
| "loss": 0.2001, |
| "num_input_tokens_seen": 380928, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.2349397590361446, |
| "grad_norm": 12.775602340698242, |
| "learning_rate": 6.164658634538153e-06, |
| "loss": 0.1959, |
| "num_input_tokens_seen": 383872, |
| "step": 615 |
| }, |
| { |
| "epoch": 1.2449799196787148, |
| "grad_norm": 3.7326459884643555, |
| "learning_rate": 6.214859437751004e-06, |
| "loss": 0.1745, |
| "num_input_tokens_seen": 386752, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.2550200803212852, |
| "grad_norm": 6.751396179199219, |
| "learning_rate": 6.265060240963856e-06, |
| "loss": 0.1546, |
| "num_input_tokens_seen": 389600, |
| "step": 625 |
| }, |
| { |
| "epoch": 1.2650602409638554, |
| "grad_norm": 8.127579689025879, |
| "learning_rate": 6.315261044176707e-06, |
| "loss": 0.1835, |
| "num_input_tokens_seen": 392672, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.2751004016064256, |
| "grad_norm": 10.662273406982422, |
| "learning_rate": 6.365461847389559e-06, |
| "loss": 0.2297, |
| "num_input_tokens_seen": 395520, |
| "step": 635 |
| }, |
| { |
| "epoch": 1.285140562248996, |
| "grad_norm": 12.054793357849121, |
| "learning_rate": 6.41566265060241e-06, |
| "loss": 0.2924, |
| "num_input_tokens_seen": 398496, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.2951807228915664, |
| "grad_norm": 5.321606636047363, |
| "learning_rate": 6.465863453815262e-06, |
| "loss": 0.1721, |
| "num_input_tokens_seen": 401312, |
| "step": 645 |
| }, |
| { |
| "epoch": 1.3052208835341366, |
| "grad_norm": 5.2592597007751465, |
| "learning_rate": 6.516064257028113e-06, |
| "loss": 0.187, |
| "num_input_tokens_seen": 404960, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.3152610441767068, |
| "grad_norm": 6.9956183433532715, |
| "learning_rate": 6.566265060240964e-06, |
| "loss": 0.1458, |
| "num_input_tokens_seen": 408032, |
| "step": 655 |
| }, |
| { |
| "epoch": 1.3253012048192772, |
| "grad_norm": 3.2474989891052246, |
| "learning_rate": 6.616465863453816e-06, |
| "loss": 0.165, |
| "num_input_tokens_seen": 411968, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.3353413654618473, |
| "grad_norm": 8.723363876342773, |
| "learning_rate": 6.666666666666667e-06, |
| "loss": 0.1554, |
| "num_input_tokens_seen": 415168, |
| "step": 665 |
| }, |
| { |
| "epoch": 1.3453815261044177, |
| "grad_norm": 14.606755256652832, |
| "learning_rate": 6.716867469879519e-06, |
| "loss": 0.1623, |
| "num_input_tokens_seen": 418112, |
| "step": 670 |
| }, |
| { |
| "epoch": 1.355421686746988, |
| "grad_norm": 5.352765083312988, |
| "learning_rate": 6.76706827309237e-06, |
| "loss": 0.1602, |
| "num_input_tokens_seen": 420608, |
| "step": 675 |
| }, |
| { |
| "epoch": 1.3654618473895583, |
| "grad_norm": 4.150283336639404, |
| "learning_rate": 6.8172690763052215e-06, |
| "loss": 0.1651, |
| "num_input_tokens_seen": 423680, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.3755020080321285, |
| "grad_norm": 4.1706061363220215, |
| "learning_rate": 6.867469879518073e-06, |
| "loss": 0.15, |
| "num_input_tokens_seen": 426368, |
| "step": 685 |
| }, |
| { |
| "epoch": 1.3855421686746987, |
| "grad_norm": 9.573399543762207, |
| "learning_rate": 6.917670682730925e-06, |
| "loss": 0.1684, |
| "num_input_tokens_seen": 429568, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.395582329317269, |
| "grad_norm": 24.372079849243164, |
| "learning_rate": 6.9678714859437756e-06, |
| "loss": 0.1996, |
| "num_input_tokens_seen": 432320, |
| "step": 695 |
| }, |
| { |
| "epoch": 1.4056224899598393, |
| "grad_norm": 14.810078620910645, |
| "learning_rate": 7.018072289156627e-06, |
| "loss": 0.2936, |
| "num_input_tokens_seen": 435360, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.4156626506024097, |
| "grad_norm": 2.108583927154541, |
| "learning_rate": 7.068273092369478e-06, |
| "loss": 0.1966, |
| "num_input_tokens_seen": 438112, |
| "step": 705 |
| }, |
| { |
| "epoch": 1.4257028112449799, |
| "grad_norm": 7.824947357177734, |
| "learning_rate": 7.11847389558233e-06, |
| "loss": 0.2228, |
| "num_input_tokens_seen": 440672, |
| "step": 710 |
| }, |
| { |
| "epoch": 1.4357429718875503, |
| "grad_norm": 4.722110271453857, |
| "learning_rate": 7.168674698795182e-06, |
| "loss": 0.1611, |
| "num_input_tokens_seen": 443424, |
| "step": 715 |
| }, |
| { |
| "epoch": 1.4457831325301205, |
| "grad_norm": 9.50132942199707, |
| "learning_rate": 7.218875502008032e-06, |
| "loss": 0.1893, |
| "num_input_tokens_seen": 446528, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.4558232931726907, |
| "grad_norm": 4.900175094604492, |
| "learning_rate": 7.2690763052208845e-06, |
| "loss": 0.164, |
| "num_input_tokens_seen": 450016, |
| "step": 725 |
| }, |
| { |
| "epoch": 1.465863453815261, |
| "grad_norm": 4.841922760009766, |
| "learning_rate": 7.319277108433736e-06, |
| "loss": 0.1482, |
| "num_input_tokens_seen": 453664, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.4759036144578312, |
| "grad_norm": 3.754279851913452, |
| "learning_rate": 7.369477911646586e-06, |
| "loss": 0.1891, |
| "num_input_tokens_seen": 457216, |
| "step": 735 |
| }, |
| { |
| "epoch": 1.4859437751004017, |
| "grad_norm": 8.406981468200684, |
| "learning_rate": 7.4196787148594385e-06, |
| "loss": 0.1748, |
| "num_input_tokens_seen": 460416, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.4959839357429718, |
| "grad_norm": 6.8733015060424805, |
| "learning_rate": 7.469879518072289e-06, |
| "loss": 0.1654, |
| "num_input_tokens_seen": 463776, |
| "step": 745 |
| }, |
| { |
| "epoch": 1.5060240963855422, |
| "grad_norm": 6.487129211425781, |
| "learning_rate": 7.520080321285141e-06, |
| "loss": 0.1918, |
| "num_input_tokens_seen": 467200, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.5160642570281124, |
| "grad_norm": 3.5012166500091553, |
| "learning_rate": 7.570281124497993e-06, |
| "loss": 0.1481, |
| "num_input_tokens_seen": 469728, |
| "step": 755 |
| }, |
| { |
| "epoch": 1.5261044176706826, |
| "grad_norm": 2.2680747509002686, |
| "learning_rate": 7.620481927710845e-06, |
| "loss": 0.1537, |
| "num_input_tokens_seen": 472832, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.536144578313253, |
| "grad_norm": 6.076399326324463, |
| "learning_rate": 7.670682730923695e-06, |
| "loss": 0.1617, |
| "num_input_tokens_seen": 476256, |
| "step": 765 |
| }, |
| { |
| "epoch": 1.5461847389558234, |
| "grad_norm": 2.6769142150878906, |
| "learning_rate": 7.720883534136547e-06, |
| "loss": 0.1653, |
| "num_input_tokens_seen": 478784, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.5562248995983936, |
| "grad_norm": 2.721581220626831, |
| "learning_rate": 7.771084337349398e-06, |
| "loss": 0.1399, |
| "num_input_tokens_seen": 481920, |
| "step": 775 |
| }, |
| { |
| "epoch": 1.5662650602409638, |
| "grad_norm": 2.004193067550659, |
| "learning_rate": 7.82128514056225e-06, |
| "loss": 0.1345, |
| "num_input_tokens_seen": 485344, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.5763052208835342, |
| "grad_norm": 5.72704553604126, |
| "learning_rate": 7.8714859437751e-06, |
| "loss": 0.1843, |
| "num_input_tokens_seen": 487968, |
| "step": 785 |
| }, |
| { |
| "epoch": 1.5863453815261044, |
| "grad_norm": 4.490149021148682, |
| "learning_rate": 7.921686746987952e-06, |
| "loss": 0.1692, |
| "num_input_tokens_seen": 490880, |
| "step": 790 |
| }, |
| { |
| "epoch": 1.5963855421686746, |
| "grad_norm": 2.184390068054199, |
| "learning_rate": 7.971887550200803e-06, |
| "loss": 0.1647, |
| "num_input_tokens_seen": 493568, |
| "step": 795 |
| }, |
| { |
| "epoch": 1.606425702811245, |
| "grad_norm": 6.28218412399292, |
| "learning_rate": 8.022088353413655e-06, |
| "loss": 0.1725, |
| "num_input_tokens_seen": 497184, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.6164658634538154, |
| "grad_norm": 3.339231014251709, |
| "learning_rate": 8.072289156626508e-06, |
| "loss": 0.1703, |
| "num_input_tokens_seen": 499680, |
| "step": 805 |
| }, |
| { |
| "epoch": 1.6265060240963856, |
| "grad_norm": 3.661980152130127, |
| "learning_rate": 8.122489959839357e-06, |
| "loss": 0.1438, |
| "num_input_tokens_seen": 502656, |
| "step": 810 |
| }, |
| { |
| "epoch": 1.6365461847389557, |
| "grad_norm": 1.343388557434082, |
| "learning_rate": 8.172690763052209e-06, |
| "loss": 0.1673, |
| "num_input_tokens_seen": 505056, |
| "step": 815 |
| }, |
| { |
| "epoch": 1.6465863453815262, |
| "grad_norm": 2.127262592315674, |
| "learning_rate": 8.222891566265062e-06, |
| "loss": 0.1565, |
| "num_input_tokens_seen": 508320, |
| "step": 820 |
| }, |
| { |
| "epoch": 1.6566265060240963, |
| "grad_norm": 4.568302631378174, |
| "learning_rate": 8.273092369477911e-06, |
| "loss": 0.1429, |
| "num_input_tokens_seen": 512032, |
| "step": 825 |
| }, |
| { |
| "epoch": 1.6666666666666665, |
| "grad_norm": 6.50462007522583, |
| "learning_rate": 8.323293172690764e-06, |
| "loss": 0.2157, |
| "num_input_tokens_seen": 514752, |
| "step": 830 |
| }, |
| { |
| "epoch": 1.676706827309237, |
| "grad_norm": 3.402589797973633, |
| "learning_rate": 8.373493975903614e-06, |
| "loss": 0.1982, |
| "num_input_tokens_seen": 518176, |
| "step": 835 |
| }, |
| { |
| "epoch": 1.6867469879518073, |
| "grad_norm": 6.337620258331299, |
| "learning_rate": 8.423694779116467e-06, |
| "loss": 0.1626, |
| "num_input_tokens_seen": 521216, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.6967871485943775, |
| "grad_norm": 4.079775810241699, |
| "learning_rate": 8.473895582329319e-06, |
| "loss": 0.1674, |
| "num_input_tokens_seen": 525216, |
| "step": 845 |
| }, |
| { |
| "epoch": 1.7068273092369477, |
| "grad_norm": 7.396521091461182, |
| "learning_rate": 8.52409638554217e-06, |
| "loss": 0.1472, |
| "num_input_tokens_seen": 528896, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.716867469879518, |
| "grad_norm": 9.742034912109375, |
| "learning_rate": 8.574297188755021e-06, |
| "loss": 0.2251, |
| "num_input_tokens_seen": 531744, |
| "step": 855 |
| }, |
| { |
| "epoch": 1.7269076305220885, |
| "grad_norm": 6.10853385925293, |
| "learning_rate": 8.624497991967873e-06, |
| "loss": 0.1385, |
| "num_input_tokens_seen": 534464, |
| "step": 860 |
| }, |
| { |
| "epoch": 1.7369477911646585, |
| "grad_norm": 1.38788640499115, |
| "learning_rate": 8.674698795180724e-06, |
| "loss": 0.1555, |
| "num_input_tokens_seen": 538208, |
| "step": 865 |
| }, |
| { |
| "epoch": 1.7469879518072289, |
| "grad_norm": 3.5134339332580566, |
| "learning_rate": 8.724899598393575e-06, |
| "loss": 0.1201, |
| "num_input_tokens_seen": 541440, |
| "step": 870 |
| }, |
| { |
| "epoch": 1.7570281124497993, |
| "grad_norm": 4.719893932342529, |
| "learning_rate": 8.775100401606427e-06, |
| "loss": 0.2551, |
| "num_input_tokens_seen": 543968, |
| "step": 875 |
| }, |
| { |
| "epoch": 1.7670682730923695, |
| "grad_norm": 8.232791900634766, |
| "learning_rate": 8.825301204819278e-06, |
| "loss": 0.1633, |
| "num_input_tokens_seen": 546752, |
| "step": 880 |
| }, |
| { |
| "epoch": 1.7771084337349397, |
| "grad_norm": 2.3899688720703125, |
| "learning_rate": 8.87550200803213e-06, |
| "loss": 0.1441, |
| "num_input_tokens_seen": 549632, |
| "step": 885 |
| }, |
| { |
| "epoch": 1.78714859437751, |
| "grad_norm": 5.850103378295898, |
| "learning_rate": 8.92570281124498e-06, |
| "loss": 0.1765, |
| "num_input_tokens_seen": 552736, |
| "step": 890 |
| }, |
| { |
| "epoch": 1.7971887550200805, |
| "grad_norm": 2.2977206707000732, |
| "learning_rate": 8.975903614457832e-06, |
| "loss": 0.1459, |
| "num_input_tokens_seen": 555808, |
| "step": 895 |
| }, |
| { |
| "epoch": 1.8072289156626506, |
| "grad_norm": 2.4535903930664062, |
| "learning_rate": 9.026104417670683e-06, |
| "loss": 0.1618, |
| "num_input_tokens_seen": 558624, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.8172690763052208, |
| "grad_norm": 2.2958626747131348, |
| "learning_rate": 9.076305220883535e-06, |
| "loss": 0.1617, |
| "num_input_tokens_seen": 562368, |
| "step": 905 |
| }, |
| { |
| "epoch": 1.8273092369477912, |
| "grad_norm": 2.7728888988494873, |
| "learning_rate": 9.126506024096386e-06, |
| "loss": 0.1613, |
| "num_input_tokens_seen": 565824, |
| "step": 910 |
| }, |
| { |
| "epoch": 1.8373493975903614, |
| "grad_norm": 8.566793441772461, |
| "learning_rate": 9.176706827309237e-06, |
| "loss": 0.1839, |
| "num_input_tokens_seen": 569888, |
| "step": 915 |
| }, |
| { |
| "epoch": 1.8473895582329316, |
| "grad_norm": 8.40902042388916, |
| "learning_rate": 9.226907630522089e-06, |
| "loss": 0.216, |
| "num_input_tokens_seen": 572160, |
| "step": 920 |
| }, |
| { |
| "epoch": 1.857429718875502, |
| "grad_norm": 2.0097877979278564, |
| "learning_rate": 9.27710843373494e-06, |
| "loss": 0.1682, |
| "num_input_tokens_seen": 575072, |
| "step": 925 |
| }, |
| { |
| "epoch": 1.8674698795180724, |
| "grad_norm": 2.2905232906341553, |
| "learning_rate": 9.327309236947793e-06, |
| "loss": 0.1811, |
| "num_input_tokens_seen": 578112, |
| "step": 930 |
| }, |
| { |
| "epoch": 1.8775100401606426, |
| "grad_norm": 4.189212322235107, |
| "learning_rate": 9.377510040160643e-06, |
| "loss": 0.1639, |
| "num_input_tokens_seen": 581248, |
| "step": 935 |
| }, |
| { |
| "epoch": 1.8875502008032128, |
| "grad_norm": 3.09934401512146, |
| "learning_rate": 9.427710843373494e-06, |
| "loss": 0.1592, |
| "num_input_tokens_seen": 584608, |
| "step": 940 |
| }, |
| { |
| "epoch": 1.8975903614457832, |
| "grad_norm": 2.5684139728546143, |
| "learning_rate": 9.477911646586347e-06, |
| "loss": 0.1492, |
| "num_input_tokens_seen": 588192, |
| "step": 945 |
| }, |
| { |
| "epoch": 1.9076305220883534, |
| "grad_norm": 2.9508438110351562, |
| "learning_rate": 9.528112449799197e-06, |
| "loss": 0.1581, |
| "num_input_tokens_seen": 591552, |
| "step": 950 |
| }, |
| { |
| "epoch": 1.9176706827309236, |
| "grad_norm": 4.4684247970581055, |
| "learning_rate": 9.57831325301205e-06, |
| "loss": 0.1515, |
| "num_input_tokens_seen": 594784, |
| "step": 955 |
| }, |
| { |
| "epoch": 1.927710843373494, |
| "grad_norm": 4.187466144561768, |
| "learning_rate": 9.6285140562249e-06, |
| "loss": 0.1574, |
| "num_input_tokens_seen": 598016, |
| "step": 960 |
| }, |
| { |
| "epoch": 1.9377510040160644, |
| "grad_norm": 1.2344138622283936, |
| "learning_rate": 9.678714859437753e-06, |
| "loss": 0.166, |
| "num_input_tokens_seen": 601216, |
| "step": 965 |
| }, |
| { |
| "epoch": 1.9477911646586346, |
| "grad_norm": 5.241006374359131, |
| "learning_rate": 9.728915662650604e-06, |
| "loss": 0.1514, |
| "num_input_tokens_seen": 604640, |
| "step": 970 |
| }, |
| { |
| "epoch": 1.9578313253012047, |
| "grad_norm": 1.7580270767211914, |
| "learning_rate": 9.779116465863454e-06, |
| "loss": 0.1461, |
| "num_input_tokens_seen": 607680, |
| "step": 975 |
| }, |
| { |
| "epoch": 1.9678714859437751, |
| "grad_norm": 2.8425869941711426, |
| "learning_rate": 9.829317269076307e-06, |
| "loss": 0.1787, |
| "num_input_tokens_seen": 611296, |
| "step": 980 |
| }, |
| { |
| "epoch": 1.9779116465863453, |
| "grad_norm": 2.437546730041504, |
| "learning_rate": 9.879518072289156e-06, |
| "loss": 0.152, |
| "num_input_tokens_seen": 614912, |
| "step": 985 |
| }, |
| { |
| "epoch": 1.9879518072289155, |
| "grad_norm": 5.1114726066589355, |
| "learning_rate": 9.92971887550201e-06, |
| "loss": 0.2004, |
| "num_input_tokens_seen": 618272, |
| "step": 990 |
| }, |
| { |
| "epoch": 1.997991967871486, |
| "grad_norm": 3.159273147583008, |
| "learning_rate": 9.97991967871486e-06, |
| "loss": 0.1312, |
| "num_input_tokens_seen": 621824, |
| "step": 995 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 0.16479995846748352, |
| "eval_runtime": 8.0659, |
| "eval_samples_per_second": 61.742, |
| "eval_steps_per_second": 15.497, |
| "num_input_tokens_seen": 622720, |
| "step": 996 |
| }, |
| { |
| "epoch": 2.0080321285140563, |
| "grad_norm": 0.732082724571228, |
| "learning_rate": 9.999997236378723e-06, |
| "loss": 0.1477, |
| "num_input_tokens_seen": 625344, |
| "step": 1000 |
| }, |
| { |
| "epoch": 2.0180722891566263, |
| "grad_norm": 1.478649616241455, |
| "learning_rate": 9.999980347593088e-06, |
| "loss": 0.1797, |
| "num_input_tokens_seen": 629056, |
| "step": 1005 |
| }, |
| { |
| "epoch": 2.0281124497991967, |
| "grad_norm": 1.8761229515075684, |
| "learning_rate": 9.999948105418771e-06, |
| "loss": 0.1524, |
| "num_input_tokens_seen": 632640, |
| "step": 1010 |
| }, |
| { |
| "epoch": 2.038152610441767, |
| "grad_norm": 2.1610300540924072, |
| "learning_rate": 9.999900509954779e-06, |
| "loss": 0.1683, |
| "num_input_tokens_seen": 636032, |
| "step": 1015 |
| }, |
| { |
| "epoch": 2.0481927710843375, |
| "grad_norm": 2.8162364959716797, |
| "learning_rate": 9.999837561347259e-06, |
| "loss": 0.1756, |
| "num_input_tokens_seen": 639008, |
| "step": 1020 |
| }, |
| { |
| "epoch": 2.0582329317269075, |
| "grad_norm": 0.9307097792625427, |
| "learning_rate": 9.99975925978951e-06, |
| "loss": 0.1659, |
| "num_input_tokens_seen": 642048, |
| "step": 1025 |
| }, |
| { |
| "epoch": 2.068273092369478, |
| "grad_norm": 2.646435260772705, |
| "learning_rate": 9.99966560552197e-06, |
| "loss": 0.1371, |
| "num_input_tokens_seen": 645312, |
| "step": 1030 |
| }, |
| { |
| "epoch": 2.0783132530120483, |
| "grad_norm": 3.3150744438171387, |
| "learning_rate": 9.999556598832224e-06, |
| "loss": 0.1625, |
| "num_input_tokens_seen": 648864, |
| "step": 1035 |
| }, |
| { |
| "epoch": 2.0883534136546187, |
| "grad_norm": 3.675476551055908, |
| "learning_rate": 9.999432240054994e-06, |
| "loss": 0.1701, |
| "num_input_tokens_seen": 651616, |
| "step": 1040 |
| }, |
| { |
| "epoch": 2.0983935742971886, |
| "grad_norm": 4.225813865661621, |
| "learning_rate": 9.999292529572152e-06, |
| "loss": 0.1761, |
| "num_input_tokens_seen": 654912, |
| "step": 1045 |
| }, |
| { |
| "epoch": 2.108433734939759, |
| "grad_norm": 5.603430271148682, |
| "learning_rate": 9.9991374678127e-06, |
| "loss": 0.1769, |
| "num_input_tokens_seen": 658400, |
| "step": 1050 |
| }, |
| { |
| "epoch": 2.1184738955823295, |
| "grad_norm": 2.353452205657959, |
| "learning_rate": 9.998967055252791e-06, |
| "loss": 0.1754, |
| "num_input_tokens_seen": 661536, |
| "step": 1055 |
| }, |
| { |
| "epoch": 2.1285140562248994, |
| "grad_norm": 2.809842109680176, |
| "learning_rate": 9.998781292415705e-06, |
| "loss": 0.1485, |
| "num_input_tokens_seen": 664736, |
| "step": 1060 |
| }, |
| { |
| "epoch": 2.13855421686747, |
| "grad_norm": 1.9131516218185425, |
| "learning_rate": 9.998580179871864e-06, |
| "loss": 0.1628, |
| "num_input_tokens_seen": 667520, |
| "step": 1065 |
| }, |
| { |
| "epoch": 2.1485943775100402, |
| "grad_norm": 3.7786145210266113, |
| "learning_rate": 9.998363718238819e-06, |
| "loss": 0.1636, |
| "num_input_tokens_seen": 670976, |
| "step": 1070 |
| }, |
| { |
| "epoch": 2.1586345381526106, |
| "grad_norm": 5.997560501098633, |
| "learning_rate": 9.998131908181262e-06, |
| "loss": 0.1487, |
| "num_input_tokens_seen": 674816, |
| "step": 1075 |
| }, |
| { |
| "epoch": 2.1686746987951806, |
| "grad_norm": 2.7555580139160156, |
| "learning_rate": 9.997884750411004e-06, |
| "loss": 0.153, |
| "num_input_tokens_seen": 678080, |
| "step": 1080 |
| }, |
| { |
| "epoch": 2.178714859437751, |
| "grad_norm": 2.348747730255127, |
| "learning_rate": 9.997622245686993e-06, |
| "loss": 0.1557, |
| "num_input_tokens_seen": 680736, |
| "step": 1085 |
| }, |
| { |
| "epoch": 2.1887550200803214, |
| "grad_norm": 3.540637493133545, |
| "learning_rate": 9.997344394815298e-06, |
| "loss": 0.1665, |
| "num_input_tokens_seen": 684064, |
| "step": 1090 |
| }, |
| { |
| "epoch": 2.1987951807228914, |
| "grad_norm": 4.5958476066589355, |
| "learning_rate": 9.997051198649117e-06, |
| "loss": 0.1743, |
| "num_input_tokens_seen": 686560, |
| "step": 1095 |
| }, |
| { |
| "epoch": 2.208835341365462, |
| "grad_norm": 1.7110495567321777, |
| "learning_rate": 9.996742658088759e-06, |
| "loss": 0.183, |
| "num_input_tokens_seen": 689312, |
| "step": 1100 |
| }, |
| { |
| "epoch": 2.218875502008032, |
| "grad_norm": 3.817140817642212, |
| "learning_rate": 9.996418774081658e-06, |
| "loss": 0.1928, |
| "num_input_tokens_seen": 692320, |
| "step": 1105 |
| }, |
| { |
| "epoch": 2.2289156626506026, |
| "grad_norm": 0.9928373098373413, |
| "learning_rate": 9.996079547622362e-06, |
| "loss": 0.1581, |
| "num_input_tokens_seen": 695040, |
| "step": 1110 |
| }, |
| { |
| "epoch": 2.2389558232931726, |
| "grad_norm": 1.4945827722549438, |
| "learning_rate": 9.995724979752533e-06, |
| "loss": 0.1677, |
| "num_input_tokens_seen": 697952, |
| "step": 1115 |
| }, |
| { |
| "epoch": 2.248995983935743, |
| "grad_norm": 3.399029016494751, |
| "learning_rate": 9.995355071560933e-06, |
| "loss": 0.1657, |
| "num_input_tokens_seen": 700992, |
| "step": 1120 |
| }, |
| { |
| "epoch": 2.2590361445783134, |
| "grad_norm": 5.047579288482666, |
| "learning_rate": 9.994969824183441e-06, |
| "loss": 0.157, |
| "num_input_tokens_seen": 703936, |
| "step": 1125 |
| }, |
| { |
| "epoch": 2.2690763052208833, |
| "grad_norm": 2.0834877490997314, |
| "learning_rate": 9.994569238803027e-06, |
| "loss": 0.1444, |
| "num_input_tokens_seen": 707424, |
| "step": 1130 |
| }, |
| { |
| "epoch": 2.2791164658634537, |
| "grad_norm": 2.91137957572937, |
| "learning_rate": 9.994153316649769e-06, |
| "loss": 0.1327, |
| "num_input_tokens_seen": 710592, |
| "step": 1135 |
| }, |
| { |
| "epoch": 2.289156626506024, |
| "grad_norm": 11.498336791992188, |
| "learning_rate": 9.993722059000833e-06, |
| "loss": 0.2246, |
| "num_input_tokens_seen": 714368, |
| "step": 1140 |
| }, |
| { |
| "epoch": 2.2991967871485945, |
| "grad_norm": 4.052079677581787, |
| "learning_rate": 9.993275467180476e-06, |
| "loss": 0.1575, |
| "num_input_tokens_seen": 717344, |
| "step": 1145 |
| }, |
| { |
| "epoch": 2.3092369477911645, |
| "grad_norm": 1.9310601949691772, |
| "learning_rate": 9.992813542560045e-06, |
| "loss": 0.1554, |
| "num_input_tokens_seen": 720576, |
| "step": 1150 |
| }, |
| { |
| "epoch": 2.319277108433735, |
| "grad_norm": 9.181812286376953, |
| "learning_rate": 9.992336286557967e-06, |
| "loss": 0.1799, |
| "num_input_tokens_seen": 723296, |
| "step": 1155 |
| }, |
| { |
| "epoch": 2.3293172690763053, |
| "grad_norm": 1.3441740274429321, |
| "learning_rate": 9.991843700639747e-06, |
| "loss": 0.1408, |
| "num_input_tokens_seen": 726720, |
| "step": 1160 |
| }, |
| { |
| "epoch": 2.3393574297188753, |
| "grad_norm": 5.780555248260498, |
| "learning_rate": 9.991335786317964e-06, |
| "loss": 0.1704, |
| "num_input_tokens_seen": 730240, |
| "step": 1165 |
| }, |
| { |
| "epoch": 2.3493975903614457, |
| "grad_norm": 1.7718957662582397, |
| "learning_rate": 9.990812545152264e-06, |
| "loss": 0.1711, |
| "num_input_tokens_seen": 733984, |
| "step": 1170 |
| }, |
| { |
| "epoch": 2.359437751004016, |
| "grad_norm": 1.9247894287109375, |
| "learning_rate": 9.990273978749358e-06, |
| "loss": 0.1465, |
| "num_input_tokens_seen": 737056, |
| "step": 1175 |
| }, |
| { |
| "epoch": 2.3694779116465865, |
| "grad_norm": 2.258086681365967, |
| "learning_rate": 9.98972008876302e-06, |
| "loss": 0.1695, |
| "num_input_tokens_seen": 739744, |
| "step": 1180 |
| }, |
| { |
| "epoch": 2.3795180722891565, |
| "grad_norm": 1.809032917022705, |
| "learning_rate": 9.98915087689407e-06, |
| "loss": 0.169, |
| "num_input_tokens_seen": 742912, |
| "step": 1185 |
| }, |
| { |
| "epoch": 2.389558232931727, |
| "grad_norm": 2.019859552383423, |
| "learning_rate": 9.988566344890383e-06, |
| "loss": 0.1525, |
| "num_input_tokens_seen": 746400, |
| "step": 1190 |
| }, |
| { |
| "epoch": 2.3995983935742973, |
| "grad_norm": 1.4076141119003296, |
| "learning_rate": 9.987966494546873e-06, |
| "loss": 0.1502, |
| "num_input_tokens_seen": 750144, |
| "step": 1195 |
| }, |
| { |
| "epoch": 2.4096385542168672, |
| "grad_norm": 1.8707914352416992, |
| "learning_rate": 9.987351327705498e-06, |
| "loss": 0.1395, |
| "num_input_tokens_seen": 752640, |
| "step": 1200 |
| }, |
| { |
| "epoch": 2.4196787148594376, |
| "grad_norm": 3.6235058307647705, |
| "learning_rate": 9.986720846255244e-06, |
| "loss": 0.1586, |
| "num_input_tokens_seen": 755584, |
| "step": 1205 |
| }, |
| { |
| "epoch": 2.429718875502008, |
| "grad_norm": 1.7184830904006958, |
| "learning_rate": 9.986075052132124e-06, |
| "loss": 0.1645, |
| "num_input_tokens_seen": 758656, |
| "step": 1210 |
| }, |
| { |
| "epoch": 2.4397590361445785, |
| "grad_norm": 2.1086556911468506, |
| "learning_rate": 9.98541394731917e-06, |
| "loss": 0.149, |
| "num_input_tokens_seen": 761280, |
| "step": 1215 |
| }, |
| { |
| "epoch": 2.4497991967871484, |
| "grad_norm": 1.5263396501541138, |
| "learning_rate": 9.984737533846429e-06, |
| "loss": 0.1714, |
| "num_input_tokens_seen": 765440, |
| "step": 1220 |
| }, |
| { |
| "epoch": 2.459839357429719, |
| "grad_norm": 2.3205933570861816, |
| "learning_rate": 9.984045813790959e-06, |
| "loss": 0.1557, |
| "num_input_tokens_seen": 769184, |
| "step": 1225 |
| }, |
| { |
| "epoch": 2.4698795180722892, |
| "grad_norm": 1.9002354145050049, |
| "learning_rate": 9.983338789276817e-06, |
| "loss": 0.1521, |
| "num_input_tokens_seen": 771584, |
| "step": 1230 |
| }, |
| { |
| "epoch": 2.479919678714859, |
| "grad_norm": 1.5902975797653198, |
| "learning_rate": 9.982616462475055e-06, |
| "loss": 0.1492, |
| "num_input_tokens_seen": 774464, |
| "step": 1235 |
| }, |
| { |
| "epoch": 2.4899598393574296, |
| "grad_norm": 3.1418979167938232, |
| "learning_rate": 9.981878835603718e-06, |
| "loss": 0.171, |
| "num_input_tokens_seen": 777568, |
| "step": 1240 |
| }, |
| { |
| "epoch": 2.5, |
| "grad_norm": 1.7949987649917603, |
| "learning_rate": 9.981125910927824e-06, |
| "loss": 0.1484, |
| "num_input_tokens_seen": 779936, |
| "step": 1245 |
| }, |
| { |
| "epoch": 2.5100401606425704, |
| "grad_norm": 0.7500330805778503, |
| "learning_rate": 9.980357690759376e-06, |
| "loss": 0.1717, |
| "num_input_tokens_seen": 782752, |
| "step": 1250 |
| }, |
| { |
| "epoch": 2.520080321285141, |
| "grad_norm": 1.0766161680221558, |
| "learning_rate": 9.979574177457337e-06, |
| "loss": 0.1508, |
| "num_input_tokens_seen": 785984, |
| "step": 1255 |
| }, |
| { |
| "epoch": 2.5301204819277108, |
| "grad_norm": 2.405531406402588, |
| "learning_rate": 9.978775373427634e-06, |
| "loss": 0.1551, |
| "num_input_tokens_seen": 789280, |
| "step": 1260 |
| }, |
| { |
| "epoch": 2.540160642570281, |
| "grad_norm": 2.476111888885498, |
| "learning_rate": 9.977961281123146e-06, |
| "loss": 0.1623, |
| "num_input_tokens_seen": 792384, |
| "step": 1265 |
| }, |
| { |
| "epoch": 2.550200803212851, |
| "grad_norm": 4.53995418548584, |
| "learning_rate": 9.9771319030437e-06, |
| "loss": 0.1581, |
| "num_input_tokens_seen": 795200, |
| "step": 1270 |
| }, |
| { |
| "epoch": 2.5602409638554215, |
| "grad_norm": 3.0957741737365723, |
| "learning_rate": 9.976287241736055e-06, |
| "loss": 0.1561, |
| "num_input_tokens_seen": 798144, |
| "step": 1275 |
| }, |
| { |
| "epoch": 2.570281124497992, |
| "grad_norm": 2.2821428775787354, |
| "learning_rate": 9.975427299793908e-06, |
| "loss": 0.1669, |
| "num_input_tokens_seen": 800992, |
| "step": 1280 |
| }, |
| { |
| "epoch": 2.5803212851405624, |
| "grad_norm": 2.018181324005127, |
| "learning_rate": 9.974552079857873e-06, |
| "loss": 0.1416, |
| "num_input_tokens_seen": 803872, |
| "step": 1285 |
| }, |
| { |
| "epoch": 2.5903614457831328, |
| "grad_norm": 1.4955519437789917, |
| "learning_rate": 9.973661584615476e-06, |
| "loss": 0.1636, |
| "num_input_tokens_seen": 807616, |
| "step": 1290 |
| }, |
| { |
| "epoch": 2.6004016064257027, |
| "grad_norm": 2.3408939838409424, |
| "learning_rate": 9.972755816801155e-06, |
| "loss": 0.1555, |
| "num_input_tokens_seen": 810592, |
| "step": 1295 |
| }, |
| { |
| "epoch": 2.610441767068273, |
| "grad_norm": 12.589311599731445, |
| "learning_rate": 9.971834779196238e-06, |
| "loss": 0.1738, |
| "num_input_tokens_seen": 812992, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.6204819277108435, |
| "grad_norm": 2.955625295639038, |
| "learning_rate": 9.970898474628951e-06, |
| "loss": 0.1559, |
| "num_input_tokens_seen": 816544, |
| "step": 1305 |
| }, |
| { |
| "epoch": 2.6305220883534135, |
| "grad_norm": 3.5705697536468506, |
| "learning_rate": 9.969946905974392e-06, |
| "loss": 0.1491, |
| "num_input_tokens_seen": 819904, |
| "step": 1310 |
| }, |
| { |
| "epoch": 2.640562248995984, |
| "grad_norm": 3.9580843448638916, |
| "learning_rate": 9.968980076154533e-06, |
| "loss": 0.1472, |
| "num_input_tokens_seen": 822848, |
| "step": 1315 |
| }, |
| { |
| "epoch": 2.6506024096385543, |
| "grad_norm": 2.570261001586914, |
| "learning_rate": 9.96799798813821e-06, |
| "loss": 0.1526, |
| "num_input_tokens_seen": 825696, |
| "step": 1320 |
| }, |
| { |
| "epoch": 2.6606425702811247, |
| "grad_norm": 2.2151670455932617, |
| "learning_rate": 9.96700064494111e-06, |
| "loss": 0.138, |
| "num_input_tokens_seen": 828704, |
| "step": 1325 |
| }, |
| { |
| "epoch": 2.6706827309236947, |
| "grad_norm": 3.911294937133789, |
| "learning_rate": 9.965988049625763e-06, |
| "loss": 0.1962, |
| "num_input_tokens_seen": 831744, |
| "step": 1330 |
| }, |
| { |
| "epoch": 2.680722891566265, |
| "grad_norm": 3.1704447269439697, |
| "learning_rate": 9.964960205301534e-06, |
| "loss": 0.1459, |
| "num_input_tokens_seen": 834720, |
| "step": 1335 |
| }, |
| { |
| "epoch": 2.6907630522088355, |
| "grad_norm": 3.241382360458374, |
| "learning_rate": 9.963917115124621e-06, |
| "loss": 0.1723, |
| "num_input_tokens_seen": 838048, |
| "step": 1340 |
| }, |
| { |
| "epoch": 2.7008032128514055, |
| "grad_norm": 3.085642099380493, |
| "learning_rate": 9.962858782298023e-06, |
| "loss": 0.1566, |
| "num_input_tokens_seen": 841216, |
| "step": 1345 |
| }, |
| { |
| "epoch": 2.710843373493976, |
| "grad_norm": 3.294822931289673, |
| "learning_rate": 9.961785210071554e-06, |
| "loss": 0.1866, |
| "num_input_tokens_seen": 844576, |
| "step": 1350 |
| }, |
| { |
| "epoch": 2.7208835341365463, |
| "grad_norm": 1.466409683227539, |
| "learning_rate": 9.960696401741825e-06, |
| "loss": 0.1571, |
| "num_input_tokens_seen": 847872, |
| "step": 1355 |
| }, |
| { |
| "epoch": 2.7309236947791167, |
| "grad_norm": 1.720889687538147, |
| "learning_rate": 9.959592360652224e-06, |
| "loss": 0.1448, |
| "num_input_tokens_seen": 850848, |
| "step": 1360 |
| }, |
| { |
| "epoch": 2.7409638554216866, |
| "grad_norm": 2.6980252265930176, |
| "learning_rate": 9.95847309019292e-06, |
| "loss": 0.1496, |
| "num_input_tokens_seen": 853664, |
| "step": 1365 |
| }, |
| { |
| "epoch": 2.751004016064257, |
| "grad_norm": 3.6680150032043457, |
| "learning_rate": 9.957338593800844e-06, |
| "loss": 0.1483, |
| "num_input_tokens_seen": 856928, |
| "step": 1370 |
| }, |
| { |
| "epoch": 2.7610441767068274, |
| "grad_norm": 8.08189868927002, |
| "learning_rate": 9.956188874959686e-06, |
| "loss": 0.1877, |
| "num_input_tokens_seen": 860192, |
| "step": 1375 |
| }, |
| { |
| "epoch": 2.7710843373493974, |
| "grad_norm": 3.0358712673187256, |
| "learning_rate": 9.955023937199876e-06, |
| "loss": 0.1748, |
| "num_input_tokens_seen": 863616, |
| "step": 1380 |
| }, |
| { |
| "epoch": 2.781124497991968, |
| "grad_norm": 2.551198720932007, |
| "learning_rate": 9.953843784098573e-06, |
| "loss": 0.1268, |
| "num_input_tokens_seen": 867296, |
| "step": 1385 |
| }, |
| { |
| "epoch": 2.791164658634538, |
| "grad_norm": 3.2741429805755615, |
| "learning_rate": 9.952648419279662e-06, |
| "loss": 0.1956, |
| "num_input_tokens_seen": 870368, |
| "step": 1390 |
| }, |
| { |
| "epoch": 2.8012048192771086, |
| "grad_norm": 4.114120006561279, |
| "learning_rate": 9.951437846413738e-06, |
| "loss": 0.2096, |
| "num_input_tokens_seen": 873472, |
| "step": 1395 |
| }, |
| { |
| "epoch": 2.8112449799196786, |
| "grad_norm": 3.035440444946289, |
| "learning_rate": 9.950212069218095e-06, |
| "loss": 0.1534, |
| "num_input_tokens_seen": 876224, |
| "step": 1400 |
| }, |
| { |
| "epoch": 2.821285140562249, |
| "grad_norm": 2.0006754398345947, |
| "learning_rate": 9.948971091456715e-06, |
| "loss": 0.1606, |
| "num_input_tokens_seen": 879392, |
| "step": 1405 |
| }, |
| { |
| "epoch": 2.8313253012048194, |
| "grad_norm": 3.954497814178467, |
| "learning_rate": 9.947714916940257e-06, |
| "loss": 0.1265, |
| "num_input_tokens_seen": 882656, |
| "step": 1410 |
| }, |
| { |
| "epoch": 2.8413654618473894, |
| "grad_norm": 4.2772417068481445, |
| "learning_rate": 9.946443549526041e-06, |
| "loss": 0.1417, |
| "num_input_tokens_seen": 885696, |
| "step": 1415 |
| }, |
| { |
| "epoch": 2.8514056224899598, |
| "grad_norm": 4.809564590454102, |
| "learning_rate": 9.945156993118042e-06, |
| "loss": 0.1702, |
| "num_input_tokens_seen": 888640, |
| "step": 1420 |
| }, |
| { |
| "epoch": 2.86144578313253, |
| "grad_norm": 3.161689043045044, |
| "learning_rate": 9.943855251666873e-06, |
| "loss": 0.1291, |
| "num_input_tokens_seen": 892384, |
| "step": 1425 |
| }, |
| { |
| "epoch": 2.8714859437751006, |
| "grad_norm": 2.3096818923950195, |
| "learning_rate": 9.942538329169786e-06, |
| "loss": 0.1453, |
| "num_input_tokens_seen": 895328, |
| "step": 1430 |
| }, |
| { |
| "epoch": 2.8815261044176705, |
| "grad_norm": 12.994061470031738, |
| "learning_rate": 9.941206229670634e-06, |
| "loss": 0.181, |
| "num_input_tokens_seen": 897952, |
| "step": 1435 |
| }, |
| { |
| "epoch": 2.891566265060241, |
| "grad_norm": 3.6042988300323486, |
| "learning_rate": 9.939858957259887e-06, |
| "loss": 0.1356, |
| "num_input_tokens_seen": 901792, |
| "step": 1440 |
| }, |
| { |
| "epoch": 2.9016064257028114, |
| "grad_norm": 3.180940866470337, |
| "learning_rate": 9.938496516074597e-06, |
| "loss": 0.1256, |
| "num_input_tokens_seen": 905664, |
| "step": 1445 |
| }, |
| { |
| "epoch": 2.9116465863453813, |
| "grad_norm": 5.906370162963867, |
| "learning_rate": 9.937118910298398e-06, |
| "loss": 0.1685, |
| "num_input_tokens_seen": 907904, |
| "step": 1450 |
| }, |
| { |
| "epoch": 2.9216867469879517, |
| "grad_norm": 3.592151403427124, |
| "learning_rate": 9.935726144161492e-06, |
| "loss": 0.1314, |
| "num_input_tokens_seen": 910816, |
| "step": 1455 |
| }, |
| { |
| "epoch": 2.931726907630522, |
| "grad_norm": 4.761033058166504, |
| "learning_rate": 9.934318221940632e-06, |
| "loss": 0.1309, |
| "num_input_tokens_seen": 913568, |
| "step": 1460 |
| }, |
| { |
| "epoch": 2.9417670682730925, |
| "grad_norm": 8.582740783691406, |
| "learning_rate": 9.932895147959106e-06, |
| "loss": 0.3052, |
| "num_input_tokens_seen": 916320, |
| "step": 1465 |
| }, |
| { |
| "epoch": 2.9518072289156625, |
| "grad_norm": 5.78909158706665, |
| "learning_rate": 9.931456926586738e-06, |
| "loss": 0.1818, |
| "num_input_tokens_seen": 919136, |
| "step": 1470 |
| }, |
| { |
| "epoch": 2.961847389558233, |
| "grad_norm": 3.7451789379119873, |
| "learning_rate": 9.930003562239858e-06, |
| "loss": 0.1883, |
| "num_input_tokens_seen": 922080, |
| "step": 1475 |
| }, |
| { |
| "epoch": 2.9718875502008033, |
| "grad_norm": 2.4840102195739746, |
| "learning_rate": 9.928535059381298e-06, |
| "loss": 0.1681, |
| "num_input_tokens_seen": 925088, |
| "step": 1480 |
| }, |
| { |
| "epoch": 2.9819277108433733, |
| "grad_norm": 1.527234435081482, |
| "learning_rate": 9.927051422520373e-06, |
| "loss": 0.1436, |
| "num_input_tokens_seen": 928160, |
| "step": 1485 |
| }, |
| { |
| "epoch": 2.9919678714859437, |
| "grad_norm": 3.8078765869140625, |
| "learning_rate": 9.925552656212871e-06, |
| "loss": 0.1555, |
| "num_input_tokens_seen": 930688, |
| "step": 1490 |
| }, |
| { |
| "epoch": 3.002008032128514, |
| "grad_norm": 1.0531964302062988, |
| "learning_rate": 9.924038765061042e-06, |
| "loss": 0.1497, |
| "num_input_tokens_seen": 933504, |
| "step": 1495 |
| }, |
| { |
| "epoch": 3.0120481927710845, |
| "grad_norm": 2.8714663982391357, |
| "learning_rate": 9.922509753713572e-06, |
| "loss": 0.1453, |
| "num_input_tokens_seen": 936448, |
| "step": 1500 |
| }, |
| { |
| "epoch": 3.0220883534136544, |
| "grad_norm": 4.788018226623535, |
| "learning_rate": 9.920965626865582e-06, |
| "loss": 0.1549, |
| "num_input_tokens_seen": 939488, |
| "step": 1505 |
| }, |
| { |
| "epoch": 3.032128514056225, |
| "grad_norm": 4.411100387573242, |
| "learning_rate": 9.919406389258607e-06, |
| "loss": 0.145, |
| "num_input_tokens_seen": 942240, |
| "step": 1510 |
| }, |
| { |
| "epoch": 3.0421686746987953, |
| "grad_norm": 2.9873862266540527, |
| "learning_rate": 9.917832045680584e-06, |
| "loss": 0.1603, |
| "num_input_tokens_seen": 946048, |
| "step": 1515 |
| }, |
| { |
| "epoch": 3.0522088353413657, |
| "grad_norm": 5.309106826782227, |
| "learning_rate": 9.91624260096583e-06, |
| "loss": 0.1407, |
| "num_input_tokens_seen": 948672, |
| "step": 1520 |
| }, |
| { |
| "epoch": 3.0622489959839356, |
| "grad_norm": 4.250939846038818, |
| "learning_rate": 9.91463805999504e-06, |
| "loss": 0.1642, |
| "num_input_tokens_seen": 951744, |
| "step": 1525 |
| }, |
| { |
| "epoch": 3.072289156626506, |
| "grad_norm": 3.651625871658325, |
| "learning_rate": 9.913018427695257e-06, |
| "loss": 0.1516, |
| "num_input_tokens_seen": 955136, |
| "step": 1530 |
| }, |
| { |
| "epoch": 3.0823293172690764, |
| "grad_norm": 2.637655258178711, |
| "learning_rate": 9.911383709039876e-06, |
| "loss": 0.1336, |
| "num_input_tokens_seen": 958240, |
| "step": 1535 |
| }, |
| { |
| "epoch": 3.0923694779116464, |
| "grad_norm": 6.339487552642822, |
| "learning_rate": 9.909733909048606e-06, |
| "loss": 0.1601, |
| "num_input_tokens_seen": 961056, |
| "step": 1540 |
| }, |
| { |
| "epoch": 3.102409638554217, |
| "grad_norm": 3.671630382537842, |
| "learning_rate": 9.908069032787473e-06, |
| "loss": 0.1588, |
| "num_input_tokens_seen": 963808, |
| "step": 1545 |
| }, |
| { |
| "epoch": 3.112449799196787, |
| "grad_norm": 3.158716917037964, |
| "learning_rate": 9.906389085368792e-06, |
| "loss": 0.1487, |
| "num_input_tokens_seen": 967168, |
| "step": 1550 |
| }, |
| { |
| "epoch": 3.1224899598393576, |
| "grad_norm": 1.9967892169952393, |
| "learning_rate": 9.904694071951167e-06, |
| "loss": 0.1448, |
| "num_input_tokens_seen": 970272, |
| "step": 1555 |
| }, |
| { |
| "epoch": 3.1325301204819276, |
| "grad_norm": 2.418212413787842, |
| "learning_rate": 9.902983997739453e-06, |
| "loss": 0.1227, |
| "num_input_tokens_seen": 972960, |
| "step": 1560 |
| }, |
| { |
| "epoch": 3.142570281124498, |
| "grad_norm": 6.753594875335693, |
| "learning_rate": 9.90125886798476e-06, |
| "loss": 0.1283, |
| "num_input_tokens_seen": 976064, |
| "step": 1565 |
| }, |
| { |
| "epoch": 3.1526104417670684, |
| "grad_norm": 4.105684280395508, |
| "learning_rate": 9.899518687984424e-06, |
| "loss": 0.1485, |
| "num_input_tokens_seen": 979168, |
| "step": 1570 |
| }, |
| { |
| "epoch": 3.1626506024096384, |
| "grad_norm": 4.228103160858154, |
| "learning_rate": 9.897763463082e-06, |
| "loss": 0.1622, |
| "num_input_tokens_seen": 982528, |
| "step": 1575 |
| }, |
| { |
| "epoch": 3.1726907630522088, |
| "grad_norm": 4.657632350921631, |
| "learning_rate": 9.89599319866724e-06, |
| "loss": 0.1166, |
| "num_input_tokens_seen": 985472, |
| "step": 1580 |
| }, |
| { |
| "epoch": 3.182730923694779, |
| "grad_norm": 3.033367872238159, |
| "learning_rate": 9.894207900176074e-06, |
| "loss": 0.1433, |
| "num_input_tokens_seen": 988448, |
| "step": 1585 |
| }, |
| { |
| "epoch": 3.1927710843373496, |
| "grad_norm": 3.6265132427215576, |
| "learning_rate": 9.892407573090603e-06, |
| "loss": 0.1531, |
| "num_input_tokens_seen": 991392, |
| "step": 1590 |
| }, |
| { |
| "epoch": 3.2028112449799195, |
| "grad_norm": 3.417398691177368, |
| "learning_rate": 9.890592222939071e-06, |
| "loss": 0.191, |
| "num_input_tokens_seen": 993760, |
| "step": 1595 |
| }, |
| { |
| "epoch": 3.21285140562249, |
| "grad_norm": 2.1362621784210205, |
| "learning_rate": 9.888761855295855e-06, |
| "loss": 0.1723, |
| "num_input_tokens_seen": 997216, |
| "step": 1600 |
| }, |
| { |
| "epoch": 3.2228915662650603, |
| "grad_norm": 2.6160507202148438, |
| "learning_rate": 9.886916475781448e-06, |
| "loss": 0.1387, |
| "num_input_tokens_seen": 1000160, |
| "step": 1605 |
| }, |
| { |
| "epoch": 3.2329317269076308, |
| "grad_norm": 1.8726855516433716, |
| "learning_rate": 9.885056090062436e-06, |
| "loss": 0.1349, |
| "num_input_tokens_seen": 1003424, |
| "step": 1610 |
| }, |
| { |
| "epoch": 3.2429718875502007, |
| "grad_norm": 1.9157968759536743, |
| "learning_rate": 9.883180703851488e-06, |
| "loss": 0.1236, |
| "num_input_tokens_seen": 1006080, |
| "step": 1615 |
| }, |
| { |
| "epoch": 3.253012048192771, |
| "grad_norm": 4.427457809448242, |
| "learning_rate": 9.881290322907332e-06, |
| "loss": 0.1659, |
| "num_input_tokens_seen": 1009472, |
| "step": 1620 |
| }, |
| { |
| "epoch": 3.2630522088353415, |
| "grad_norm": 3.359321355819702, |
| "learning_rate": 9.879384953034745e-06, |
| "loss": 0.1175, |
| "num_input_tokens_seen": 1012576, |
| "step": 1625 |
| }, |
| { |
| "epoch": 3.2730923694779115, |
| "grad_norm": 4.155451774597168, |
| "learning_rate": 9.877464600084521e-06, |
| "loss": 0.1796, |
| "num_input_tokens_seen": 1015744, |
| "step": 1630 |
| }, |
| { |
| "epoch": 3.283132530120482, |
| "grad_norm": 3.184243679046631, |
| "learning_rate": 9.875529269953474e-06, |
| "loss": 0.1309, |
| "num_input_tokens_seen": 1018336, |
| "step": 1635 |
| }, |
| { |
| "epoch": 3.2931726907630523, |
| "grad_norm": 3.387753963470459, |
| "learning_rate": 9.873578968584399e-06, |
| "loss": 0.152, |
| "num_input_tokens_seen": 1021056, |
| "step": 1640 |
| }, |
| { |
| "epoch": 3.3032128514056227, |
| "grad_norm": 1.9127076864242554, |
| "learning_rate": 9.871613701966067e-06, |
| "loss": 0.1473, |
| "num_input_tokens_seen": 1024576, |
| "step": 1645 |
| }, |
| { |
| "epoch": 3.3132530120481927, |
| "grad_norm": 1.9807343482971191, |
| "learning_rate": 9.869633476133205e-06, |
| "loss": 0.1158, |
| "num_input_tokens_seen": 1027840, |
| "step": 1650 |
| }, |
| { |
| "epoch": 3.323293172690763, |
| "grad_norm": 2.7899370193481445, |
| "learning_rate": 9.867638297166467e-06, |
| "loss": 0.1114, |
| "num_input_tokens_seen": 1031232, |
| "step": 1655 |
| }, |
| { |
| "epoch": 3.3333333333333335, |
| "grad_norm": 7.52949333190918, |
| "learning_rate": 9.865628171192432e-06, |
| "loss": 0.1627, |
| "num_input_tokens_seen": 1034624, |
| "step": 1660 |
| }, |
| { |
| "epoch": 3.3433734939759034, |
| "grad_norm": 3.0961153507232666, |
| "learning_rate": 9.863603104383575e-06, |
| "loss": 0.1279, |
| "num_input_tokens_seen": 1037792, |
| "step": 1665 |
| }, |
| { |
| "epoch": 3.353413654618474, |
| "grad_norm": 4.504798889160156, |
| "learning_rate": 9.861563102958243e-06, |
| "loss": 0.1248, |
| "num_input_tokens_seen": 1040352, |
| "step": 1670 |
| }, |
| { |
| "epoch": 3.3634538152610443, |
| "grad_norm": 3.05240797996521, |
| "learning_rate": 9.859508173180653e-06, |
| "loss": 0.1567, |
| "num_input_tokens_seen": 1043328, |
| "step": 1675 |
| }, |
| { |
| "epoch": 3.3734939759036147, |
| "grad_norm": 2.5293796062469482, |
| "learning_rate": 9.857438321360853e-06, |
| "loss": 0.1332, |
| "num_input_tokens_seen": 1046912, |
| "step": 1680 |
| }, |
| { |
| "epoch": 3.3835341365461846, |
| "grad_norm": 3.3068597316741943, |
| "learning_rate": 9.855353553854719e-06, |
| "loss": 0.1333, |
| "num_input_tokens_seen": 1050272, |
| "step": 1685 |
| }, |
| { |
| "epoch": 3.393574297188755, |
| "grad_norm": 1.7536911964416504, |
| "learning_rate": 9.853253877063922e-06, |
| "loss": 0.1552, |
| "num_input_tokens_seen": 1052512, |
| "step": 1690 |
| }, |
| { |
| "epoch": 3.4036144578313254, |
| "grad_norm": 1.41140615940094, |
| "learning_rate": 9.85113929743592e-06, |
| "loss": 0.1335, |
| "num_input_tokens_seen": 1056000, |
| "step": 1695 |
| }, |
| { |
| "epoch": 3.4136546184738954, |
| "grad_norm": 9.492226600646973, |
| "learning_rate": 9.849009821463931e-06, |
| "loss": 0.179, |
| "num_input_tokens_seen": 1058624, |
| "step": 1700 |
| }, |
| { |
| "epoch": 3.423694779116466, |
| "grad_norm": 1.4923559427261353, |
| "learning_rate": 9.846865455686915e-06, |
| "loss": 0.1076, |
| "num_input_tokens_seen": 1061280, |
| "step": 1705 |
| }, |
| { |
| "epoch": 3.433734939759036, |
| "grad_norm": 2.582712173461914, |
| "learning_rate": 9.844706206689557e-06, |
| "loss": 0.1559, |
| "num_input_tokens_seen": 1064576, |
| "step": 1710 |
| }, |
| { |
| "epoch": 3.4437751004016066, |
| "grad_norm": 7.048799514770508, |
| "learning_rate": 9.842532081102234e-06, |
| "loss": 0.1566, |
| "num_input_tokens_seen": 1067232, |
| "step": 1715 |
| }, |
| { |
| "epoch": 3.4538152610441766, |
| "grad_norm": 3.5871365070343018, |
| "learning_rate": 9.840343085601018e-06, |
| "loss": 0.1286, |
| "num_input_tokens_seen": 1070624, |
| "step": 1720 |
| }, |
| { |
| "epoch": 3.463855421686747, |
| "grad_norm": 11.149718284606934, |
| "learning_rate": 9.838139226907631e-06, |
| "loss": 0.171, |
| "num_input_tokens_seen": 1074208, |
| "step": 1725 |
| }, |
| { |
| "epoch": 3.4738955823293174, |
| "grad_norm": 2.5568132400512695, |
| "learning_rate": 9.835920511789441e-06, |
| "loss": 0.1862, |
| "num_input_tokens_seen": 1078144, |
| "step": 1730 |
| }, |
| { |
| "epoch": 3.4839357429718874, |
| "grad_norm": 2.298809051513672, |
| "learning_rate": 9.833686947059436e-06, |
| "loss": 0.1129, |
| "num_input_tokens_seen": 1081728, |
| "step": 1735 |
| }, |
| { |
| "epoch": 3.4939759036144578, |
| "grad_norm": 2.6856706142425537, |
| "learning_rate": 9.831438539576194e-06, |
| "loss": 0.1192, |
| "num_input_tokens_seen": 1084320, |
| "step": 1740 |
| }, |
| { |
| "epoch": 3.504016064257028, |
| "grad_norm": 2.332166910171509, |
| "learning_rate": 9.829175296243885e-06, |
| "loss": 0.1328, |
| "num_input_tokens_seen": 1087168, |
| "step": 1745 |
| }, |
| { |
| "epoch": 3.5140562248995986, |
| "grad_norm": 3.9273953437805176, |
| "learning_rate": 9.826897224012221e-06, |
| "loss": 0.1197, |
| "num_input_tokens_seen": 1090304, |
| "step": 1750 |
| }, |
| { |
| "epoch": 3.5240963855421685, |
| "grad_norm": 2.3180413246154785, |
| "learning_rate": 9.82460432987646e-06, |
| "loss": 0.1532, |
| "num_input_tokens_seen": 1093248, |
| "step": 1755 |
| }, |
| { |
| "epoch": 3.534136546184739, |
| "grad_norm": 3.322129011154175, |
| "learning_rate": 9.822296620877364e-06, |
| "loss": 0.1532, |
| "num_input_tokens_seen": 1096160, |
| "step": 1760 |
| }, |
| { |
| "epoch": 3.5441767068273093, |
| "grad_norm": 2.576061248779297, |
| "learning_rate": 9.819974104101198e-06, |
| "loss": 0.1268, |
| "num_input_tokens_seen": 1099712, |
| "step": 1765 |
| }, |
| { |
| "epoch": 3.5542168674698793, |
| "grad_norm": 2.1420481204986572, |
| "learning_rate": 9.817636786679682e-06, |
| "loss": 0.1301, |
| "num_input_tokens_seen": 1102528, |
| "step": 1770 |
| }, |
| { |
| "epoch": 3.5642570281124497, |
| "grad_norm": 3.8734970092773438, |
| "learning_rate": 9.815284675789999e-06, |
| "loss": 0.1342, |
| "num_input_tokens_seen": 1106368, |
| "step": 1775 |
| }, |
| { |
| "epoch": 3.57429718875502, |
| "grad_norm": 3.125272035598755, |
| "learning_rate": 9.81291777865475e-06, |
| "loss": 0.1162, |
| "num_input_tokens_seen": 1109344, |
| "step": 1780 |
| }, |
| { |
| "epoch": 3.5843373493975905, |
| "grad_norm": 3.877983570098877, |
| "learning_rate": 9.810536102541941e-06, |
| "loss": 0.0825, |
| "num_input_tokens_seen": 1112480, |
| "step": 1785 |
| }, |
| { |
| "epoch": 3.5943775100401605, |
| "grad_norm": 4.222232818603516, |
| "learning_rate": 9.808139654764962e-06, |
| "loss": 0.2169, |
| "num_input_tokens_seen": 1115104, |
| "step": 1790 |
| }, |
| { |
| "epoch": 3.604417670682731, |
| "grad_norm": 4.677252292633057, |
| "learning_rate": 9.80572844268256e-06, |
| "loss": 0.21, |
| "num_input_tokens_seen": 1117376, |
| "step": 1795 |
| }, |
| { |
| "epoch": 3.6144578313253013, |
| "grad_norm": 2.8699440956115723, |
| "learning_rate": 9.80330247369882e-06, |
| "loss": 0.1643, |
| "num_input_tokens_seen": 1120576, |
| "step": 1800 |
| }, |
| { |
| "epoch": 3.6244979919678713, |
| "grad_norm": 3.1444013118743896, |
| "learning_rate": 9.800861755263141e-06, |
| "loss": 0.1449, |
| "num_input_tokens_seen": 1123712, |
| "step": 1805 |
| }, |
| { |
| "epoch": 3.6345381526104417, |
| "grad_norm": 0.9077942371368408, |
| "learning_rate": 9.79840629487021e-06, |
| "loss": 0.1334, |
| "num_input_tokens_seen": 1127232, |
| "step": 1810 |
| }, |
| { |
| "epoch": 3.644578313253012, |
| "grad_norm": 5.333505153656006, |
| "learning_rate": 9.795936100059986e-06, |
| "loss": 0.1875, |
| "num_input_tokens_seen": 1130016, |
| "step": 1815 |
| }, |
| { |
| "epoch": 3.6546184738955825, |
| "grad_norm": 2.171931028366089, |
| "learning_rate": 9.79345117841767e-06, |
| "loss": 0.1633, |
| "num_input_tokens_seen": 1133632, |
| "step": 1820 |
| }, |
| { |
| "epoch": 3.664658634538153, |
| "grad_norm": 3.1967790126800537, |
| "learning_rate": 9.790951537573686e-06, |
| "loss": 0.1679, |
| "num_input_tokens_seen": 1136512, |
| "step": 1825 |
| }, |
| { |
| "epoch": 3.674698795180723, |
| "grad_norm": 1.7990355491638184, |
| "learning_rate": 9.788437185203655e-06, |
| "loss": 0.152, |
| "num_input_tokens_seen": 1139424, |
| "step": 1830 |
| }, |
| { |
| "epoch": 3.6847389558232932, |
| "grad_norm": 1.3819535970687866, |
| "learning_rate": 9.785908129028374e-06, |
| "loss": 0.1428, |
| "num_input_tokens_seen": 1142976, |
| "step": 1835 |
| }, |
| { |
| "epoch": 3.694779116465863, |
| "grad_norm": 2.1355533599853516, |
| "learning_rate": 9.78336437681379e-06, |
| "loss": 0.1709, |
| "num_input_tokens_seen": 1146624, |
| "step": 1840 |
| }, |
| { |
| "epoch": 3.7048192771084336, |
| "grad_norm": 3.2626771926879883, |
| "learning_rate": 9.780805936370976e-06, |
| "loss": 0.1462, |
| "num_input_tokens_seen": 1149632, |
| "step": 1845 |
| }, |
| { |
| "epoch": 3.714859437751004, |
| "grad_norm": 2.5651373863220215, |
| "learning_rate": 9.77823281555611e-06, |
| "loss": 0.1669, |
| "num_input_tokens_seen": 1153760, |
| "step": 1850 |
| }, |
| { |
| "epoch": 3.7248995983935744, |
| "grad_norm": 2.5659170150756836, |
| "learning_rate": 9.775645022270448e-06, |
| "loss": 0.1465, |
| "num_input_tokens_seen": 1156992, |
| "step": 1855 |
| }, |
| { |
| "epoch": 3.734939759036145, |
| "grad_norm": 1.7440698146820068, |
| "learning_rate": 9.773042564460299e-06, |
| "loss": 0.1401, |
| "num_input_tokens_seen": 1160032, |
| "step": 1860 |
| }, |
| { |
| "epoch": 3.744979919678715, |
| "grad_norm": 2.497500419616699, |
| "learning_rate": 9.770425450117005e-06, |
| "loss": 0.1428, |
| "num_input_tokens_seen": 1164128, |
| "step": 1865 |
| }, |
| { |
| "epoch": 3.755020080321285, |
| "grad_norm": 2.873211622238159, |
| "learning_rate": 9.767793687276913e-06, |
| "loss": 0.123, |
| "num_input_tokens_seen": 1167264, |
| "step": 1870 |
| }, |
| { |
| "epoch": 3.765060240963855, |
| "grad_norm": 2.283451795578003, |
| "learning_rate": 9.76514728402135e-06, |
| "loss": 0.1539, |
| "num_input_tokens_seen": 1169920, |
| "step": 1875 |
| }, |
| { |
| "epoch": 3.7751004016064256, |
| "grad_norm": 1.920371174812317, |
| "learning_rate": 9.762486248476597e-06, |
| "loss": 0.1462, |
| "num_input_tokens_seen": 1172640, |
| "step": 1880 |
| }, |
| { |
| "epoch": 3.785140562248996, |
| "grad_norm": 9.184189796447754, |
| "learning_rate": 9.759810588813872e-06, |
| "loss": 0.1893, |
| "num_input_tokens_seen": 1174816, |
| "step": 1885 |
| }, |
| { |
| "epoch": 3.7951807228915664, |
| "grad_norm": 2.4213876724243164, |
| "learning_rate": 9.757120313249292e-06, |
| "loss": 0.1554, |
| "num_input_tokens_seen": 1177568, |
| "step": 1890 |
| }, |
| { |
| "epoch": 3.805220883534137, |
| "grad_norm": 2.758159875869751, |
| "learning_rate": 9.754415430043864e-06, |
| "loss": 0.1431, |
| "num_input_tokens_seen": 1181472, |
| "step": 1895 |
| }, |
| { |
| "epoch": 3.8152610441767068, |
| "grad_norm": 1.6902081966400146, |
| "learning_rate": 9.751695947503442e-06, |
| "loss": 0.1324, |
| "num_input_tokens_seen": 1184064, |
| "step": 1900 |
| }, |
| { |
| "epoch": 3.825301204819277, |
| "grad_norm": 1.6394085884094238, |
| "learning_rate": 9.748961873978713e-06, |
| "loss": 0.1494, |
| "num_input_tokens_seen": 1186976, |
| "step": 1905 |
| }, |
| { |
| "epoch": 3.835341365461847, |
| "grad_norm": 2.8501033782958984, |
| "learning_rate": 9.74621321786517e-06, |
| "loss": 0.1644, |
| "num_input_tokens_seen": 1190080, |
| "step": 1910 |
| }, |
| { |
| "epoch": 3.8453815261044175, |
| "grad_norm": 1.711416482925415, |
| "learning_rate": 9.743449987603082e-06, |
| "loss": 0.1484, |
| "num_input_tokens_seen": 1192800, |
| "step": 1915 |
| }, |
| { |
| "epoch": 3.855421686746988, |
| "grad_norm": 2.7892041206359863, |
| "learning_rate": 9.740672191677474e-06, |
| "loss": 0.1237, |
| "num_input_tokens_seen": 1195936, |
| "step": 1920 |
| }, |
| { |
| "epoch": 3.8654618473895583, |
| "grad_norm": 2.6959807872772217, |
| "learning_rate": 9.737879838618095e-06, |
| "loss": 0.1634, |
| "num_input_tokens_seen": 1199232, |
| "step": 1925 |
| }, |
| { |
| "epoch": 3.8755020080321287, |
| "grad_norm": 1.1288927793502808, |
| "learning_rate": 9.735072936999392e-06, |
| "loss": 0.1529, |
| "num_input_tokens_seen": 1202464, |
| "step": 1930 |
| }, |
| { |
| "epoch": 3.8855421686746987, |
| "grad_norm": 6.117106914520264, |
| "learning_rate": 9.732251495440495e-06, |
| "loss": 0.1659, |
| "num_input_tokens_seen": 1205632, |
| "step": 1935 |
| }, |
| { |
| "epoch": 3.895582329317269, |
| "grad_norm": 3.2092738151550293, |
| "learning_rate": 9.729415522605171e-06, |
| "loss": 0.1869, |
| "num_input_tokens_seen": 1208768, |
| "step": 1940 |
| }, |
| { |
| "epoch": 3.9056224899598395, |
| "grad_norm": 2.194758176803589, |
| "learning_rate": 9.726565027201813e-06, |
| "loss": 0.14, |
| "num_input_tokens_seen": 1211872, |
| "step": 1945 |
| }, |
| { |
| "epoch": 3.9156626506024095, |
| "grad_norm": 6.0824971199035645, |
| "learning_rate": 9.72370001798341e-06, |
| "loss": 0.1424, |
| "num_input_tokens_seen": 1215360, |
| "step": 1950 |
| }, |
| { |
| "epoch": 3.92570281124498, |
| "grad_norm": 3.891514539718628, |
| "learning_rate": 9.720820503747517e-06, |
| "loss": 0.1403, |
| "num_input_tokens_seen": 1218080, |
| "step": 1955 |
| }, |
| { |
| "epoch": 3.9357429718875503, |
| "grad_norm": 5.07526159286499, |
| "learning_rate": 9.717926493336227e-06, |
| "loss": 0.167, |
| "num_input_tokens_seen": 1221216, |
| "step": 1960 |
| }, |
| { |
| "epoch": 3.9457831325301207, |
| "grad_norm": 9.483345031738281, |
| "learning_rate": 9.715017995636151e-06, |
| "loss": 0.1711, |
| "num_input_tokens_seen": 1224096, |
| "step": 1965 |
| }, |
| { |
| "epoch": 3.9558232931726907, |
| "grad_norm": 14.616900444030762, |
| "learning_rate": 9.712095019578382e-06, |
| "loss": 0.1591, |
| "num_input_tokens_seen": 1227584, |
| "step": 1970 |
| }, |
| { |
| "epoch": 3.965863453815261, |
| "grad_norm": 2.052248239517212, |
| "learning_rate": 9.70915757413847e-06, |
| "loss": 0.1023, |
| "num_input_tokens_seen": 1230592, |
| "step": 1975 |
| }, |
| { |
| "epoch": 3.9759036144578315, |
| "grad_norm": 2.4284846782684326, |
| "learning_rate": 9.706205668336404e-06, |
| "loss": 0.1923, |
| "num_input_tokens_seen": 1234592, |
| "step": 1980 |
| }, |
| { |
| "epoch": 3.9859437751004014, |
| "grad_norm": 4.282687664031982, |
| "learning_rate": 9.703239311236567e-06, |
| "loss": 0.1158, |
| "num_input_tokens_seen": 1238464, |
| "step": 1985 |
| }, |
| { |
| "epoch": 3.995983935742972, |
| "grad_norm": 1.9346394538879395, |
| "learning_rate": 9.700258511947722e-06, |
| "loss": 0.1786, |
| "num_input_tokens_seen": 1241760, |
| "step": 1990 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_loss": 0.16210927069187164, |
| "eval_runtime": 8.0638, |
| "eval_samples_per_second": 61.757, |
| "eval_steps_per_second": 15.501, |
| "num_input_tokens_seen": 1242912, |
| "step": 1992 |
| }, |
| { |
| "epoch": 4.006024096385542, |
| "grad_norm": 2.6092324256896973, |
| "learning_rate": 9.697263279622982e-06, |
| "loss": 0.1517, |
| "num_input_tokens_seen": 1245120, |
| "step": 1995 |
| }, |
| { |
| "epoch": 4.016064257028113, |
| "grad_norm": 4.089380264282227, |
| "learning_rate": 9.694253623459773e-06, |
| "loss": 0.1196, |
| "num_input_tokens_seen": 1247680, |
| "step": 2000 |
| }, |
| { |
| "epoch": 4.026104417670683, |
| "grad_norm": 1.3615751266479492, |
| "learning_rate": 9.691229552699817e-06, |
| "loss": 0.1008, |
| "num_input_tokens_seen": 1250944, |
| "step": 2005 |
| }, |
| { |
| "epoch": 4.036144578313253, |
| "grad_norm": 5.772199630737305, |
| "learning_rate": 9.688191076629096e-06, |
| "loss": 0.0652, |
| "num_input_tokens_seen": 1253888, |
| "step": 2010 |
| }, |
| { |
| "epoch": 4.046184738955823, |
| "grad_norm": 8.530108451843262, |
| "learning_rate": 9.685138204577829e-06, |
| "loss": 0.1276, |
| "num_input_tokens_seen": 1257312, |
| "step": 2015 |
| }, |
| { |
| "epoch": 4.056224899598393, |
| "grad_norm": 0.802332878112793, |
| "learning_rate": 9.682070945920437e-06, |
| "loss": 0.1594, |
| "num_input_tokens_seen": 1260320, |
| "step": 2020 |
| }, |
| { |
| "epoch": 4.066265060240964, |
| "grad_norm": 2.7650415897369385, |
| "learning_rate": 9.678989310075524e-06, |
| "loss": 0.1536, |
| "num_input_tokens_seen": 1263968, |
| "step": 2025 |
| }, |
| { |
| "epoch": 4.076305220883534, |
| "grad_norm": 11.720952033996582, |
| "learning_rate": 9.675893306505834e-06, |
| "loss": 0.2476, |
| "num_input_tokens_seen": 1266912, |
| "step": 2030 |
| }, |
| { |
| "epoch": 4.086345381526105, |
| "grad_norm": 1.9159353971481323, |
| "learning_rate": 9.672782944718234e-06, |
| "loss": 0.1311, |
| "num_input_tokens_seen": 1270016, |
| "step": 2035 |
| }, |
| { |
| "epoch": 4.096385542168675, |
| "grad_norm": 2.143418312072754, |
| "learning_rate": 9.669658234263682e-06, |
| "loss": 0.1124, |
| "num_input_tokens_seen": 1273984, |
| "step": 2040 |
| }, |
| { |
| "epoch": 4.106425702811245, |
| "grad_norm": 2.5297770500183105, |
| "learning_rate": 9.666519184737193e-06, |
| "loss": 0.1372, |
| "num_input_tokens_seen": 1276992, |
| "step": 2045 |
| }, |
| { |
| "epoch": 4.116465863453815, |
| "grad_norm": 4.700195789337158, |
| "learning_rate": 9.663365805777815e-06, |
| "loss": 0.1124, |
| "num_input_tokens_seen": 1279520, |
| "step": 2050 |
| }, |
| { |
| "epoch": 4.126506024096385, |
| "grad_norm": 7.075033664703369, |
| "learning_rate": 9.660198107068597e-06, |
| "loss": 0.1087, |
| "num_input_tokens_seen": 1282496, |
| "step": 2055 |
| }, |
| { |
| "epoch": 4.136546184738956, |
| "grad_norm": 3.068173408508301, |
| "learning_rate": 9.657016098336557e-06, |
| "loss": 0.1004, |
| "num_input_tokens_seen": 1284960, |
| "step": 2060 |
| }, |
| { |
| "epoch": 4.146586345381526, |
| "grad_norm": 2.9260811805725098, |
| "learning_rate": 9.65381978935266e-06, |
| "loss": 0.1622, |
| "num_input_tokens_seen": 1288544, |
| "step": 2065 |
| }, |
| { |
| "epoch": 4.156626506024097, |
| "grad_norm": 1.7771689891815186, |
| "learning_rate": 9.650609189931778e-06, |
| "loss": 0.1515, |
| "num_input_tokens_seen": 1291904, |
| "step": 2070 |
| }, |
| { |
| "epoch": 4.166666666666667, |
| "grad_norm": 3.8127546310424805, |
| "learning_rate": 9.647384309932665e-06, |
| "loss": 0.1402, |
| "num_input_tokens_seen": 1294880, |
| "step": 2075 |
| }, |
| { |
| "epoch": 4.176706827309237, |
| "grad_norm": 1.967512607574463, |
| "learning_rate": 9.644145159257928e-06, |
| "loss": 0.194, |
| "num_input_tokens_seen": 1298432, |
| "step": 2080 |
| }, |
| { |
| "epoch": 4.186746987951807, |
| "grad_norm": 2.751413106918335, |
| "learning_rate": 9.640891747853995e-06, |
| "loss": 0.1261, |
| "num_input_tokens_seen": 1301568, |
| "step": 2085 |
| }, |
| { |
| "epoch": 4.196787148594377, |
| "grad_norm": 4.411474704742432, |
| "learning_rate": 9.63762408571108e-06, |
| "loss": 0.1632, |
| "num_input_tokens_seen": 1304288, |
| "step": 2090 |
| }, |
| { |
| "epoch": 4.206827309236948, |
| "grad_norm": 1.2499157190322876, |
| "learning_rate": 9.634342182863163e-06, |
| "loss": 0.1517, |
| "num_input_tokens_seen": 1306784, |
| "step": 2095 |
| }, |
| { |
| "epoch": 4.216867469879518, |
| "grad_norm": 2.7209055423736572, |
| "learning_rate": 9.63104604938795e-06, |
| "loss": 0.1451, |
| "num_input_tokens_seen": 1309760, |
| "step": 2100 |
| }, |
| { |
| "epoch": 4.2269076305220885, |
| "grad_norm": 1.8991820812225342, |
| "learning_rate": 9.627735695406842e-06, |
| "loss": 0.1158, |
| "num_input_tokens_seen": 1312928, |
| "step": 2105 |
| }, |
| { |
| "epoch": 4.236947791164659, |
| "grad_norm": 8.971945762634277, |
| "learning_rate": 9.62441113108491e-06, |
| "loss": 0.1374, |
| "num_input_tokens_seen": 1316192, |
| "step": 2110 |
| }, |
| { |
| "epoch": 4.246987951807229, |
| "grad_norm": 2.6375224590301514, |
| "learning_rate": 9.621072366630859e-06, |
| "loss": 0.1737, |
| "num_input_tokens_seen": 1319072, |
| "step": 2115 |
| }, |
| { |
| "epoch": 4.257028112449799, |
| "grad_norm": 2.076023578643799, |
| "learning_rate": 9.617719412297002e-06, |
| "loss": 0.1038, |
| "num_input_tokens_seen": 1321760, |
| "step": 2120 |
| }, |
| { |
| "epoch": 4.267068273092369, |
| "grad_norm": 4.5401225090026855, |
| "learning_rate": 9.614352278379217e-06, |
| "loss": 0.1736, |
| "num_input_tokens_seen": 1325600, |
| "step": 2125 |
| }, |
| { |
| "epoch": 4.27710843373494, |
| "grad_norm": 3.4096763134002686, |
| "learning_rate": 9.610970975216933e-06, |
| "loss": 0.1458, |
| "num_input_tokens_seen": 1328992, |
| "step": 2130 |
| }, |
| { |
| "epoch": 4.28714859437751, |
| "grad_norm": 6.173547744750977, |
| "learning_rate": 9.60757551319308e-06, |
| "loss": 0.1654, |
| "num_input_tokens_seen": 1333152, |
| "step": 2135 |
| }, |
| { |
| "epoch": 4.2971887550200805, |
| "grad_norm": 3.055192232131958, |
| "learning_rate": 9.604165902734069e-06, |
| "loss": 0.1271, |
| "num_input_tokens_seen": 1335488, |
| "step": 2140 |
| }, |
| { |
| "epoch": 4.307228915662651, |
| "grad_norm": 2.3239598274230957, |
| "learning_rate": 9.600742154309756e-06, |
| "loss": 0.1365, |
| "num_input_tokens_seen": 1338720, |
| "step": 2145 |
| }, |
| { |
| "epoch": 4.317269076305221, |
| "grad_norm": 4.2289347648620605, |
| "learning_rate": 9.59730427843341e-06, |
| "loss": 0.1103, |
| "num_input_tokens_seen": 1342272, |
| "step": 2150 |
| }, |
| { |
| "epoch": 4.327309236947791, |
| "grad_norm": 2.9112138748168945, |
| "learning_rate": 9.593852285661684e-06, |
| "loss": 0.1479, |
| "num_input_tokens_seen": 1344704, |
| "step": 2155 |
| }, |
| { |
| "epoch": 4.337349397590361, |
| "grad_norm": 3.277313232421875, |
| "learning_rate": 9.590386186594574e-06, |
| "loss": 0.1618, |
| "num_input_tokens_seen": 1347392, |
| "step": 2160 |
| }, |
| { |
| "epoch": 4.347389558232932, |
| "grad_norm": 1.3917313814163208, |
| "learning_rate": 9.586905991875397e-06, |
| "loss": 0.1114, |
| "num_input_tokens_seen": 1350912, |
| "step": 2165 |
| }, |
| { |
| "epoch": 4.357429718875502, |
| "grad_norm": 1.485863208770752, |
| "learning_rate": 9.583411712190749e-06, |
| "loss": 0.1447, |
| "num_input_tokens_seen": 1353824, |
| "step": 2170 |
| }, |
| { |
| "epoch": 4.367469879518072, |
| "grad_norm": 3.005549907684326, |
| "learning_rate": 9.579903358270482e-06, |
| "loss": 0.1457, |
| "num_input_tokens_seen": 1356416, |
| "step": 2175 |
| }, |
| { |
| "epoch": 4.377510040160643, |
| "grad_norm": 1.8012841939926147, |
| "learning_rate": 9.576380940887661e-06, |
| "loss": 0.1266, |
| "num_input_tokens_seen": 1359712, |
| "step": 2180 |
| }, |
| { |
| "epoch": 4.387550200803213, |
| "grad_norm": 1.5120962858200073, |
| "learning_rate": 9.572844470858537e-06, |
| "loss": 0.1224, |
| "num_input_tokens_seen": 1362368, |
| "step": 2185 |
| }, |
| { |
| "epoch": 4.397590361445783, |
| "grad_norm": 6.252309322357178, |
| "learning_rate": 9.569293959042513e-06, |
| "loss": 0.1564, |
| "num_input_tokens_seen": 1365312, |
| "step": 2190 |
| }, |
| { |
| "epoch": 4.407630522088353, |
| "grad_norm": 4.203272819519043, |
| "learning_rate": 9.56572941634211e-06, |
| "loss": 0.2695, |
| "num_input_tokens_seen": 1368416, |
| "step": 2195 |
| }, |
| { |
| "epoch": 4.417670682730924, |
| "grad_norm": 2.676398515701294, |
| "learning_rate": 9.562150853702931e-06, |
| "loss": 0.1535, |
| "num_input_tokens_seen": 1371456, |
| "step": 2200 |
| }, |
| { |
| "epoch": 4.427710843373494, |
| "grad_norm": 1.36383056640625, |
| "learning_rate": 9.558558282113634e-06, |
| "loss": 0.1436, |
| "num_input_tokens_seen": 1375040, |
| "step": 2205 |
| }, |
| { |
| "epoch": 4.437751004016064, |
| "grad_norm": 3.8174405097961426, |
| "learning_rate": 9.554951712605891e-06, |
| "loss": 0.1319, |
| "num_input_tokens_seen": 1379104, |
| "step": 2210 |
| }, |
| { |
| "epoch": 4.447791164658635, |
| "grad_norm": 5.0451154708862305, |
| "learning_rate": 9.551331156254358e-06, |
| "loss": 0.1159, |
| "num_input_tokens_seen": 1382848, |
| "step": 2215 |
| }, |
| { |
| "epoch": 4.457831325301205, |
| "grad_norm": 2.2237813472747803, |
| "learning_rate": 9.547696624176642e-06, |
| "loss": 0.1147, |
| "num_input_tokens_seen": 1386432, |
| "step": 2220 |
| }, |
| { |
| "epoch": 4.467871485943775, |
| "grad_norm": 6.175833702087402, |
| "learning_rate": 9.544048127533262e-06, |
| "loss": 0.1554, |
| "num_input_tokens_seen": 1390048, |
| "step": 2225 |
| }, |
| { |
| "epoch": 4.477911646586345, |
| "grad_norm": 11.775164604187012, |
| "learning_rate": 9.540385677527617e-06, |
| "loss": 0.2124, |
| "num_input_tokens_seen": 1393344, |
| "step": 2230 |
| }, |
| { |
| "epoch": 4.4879518072289155, |
| "grad_norm": 1.2393088340759277, |
| "learning_rate": 9.53670928540596e-06, |
| "loss": 0.1014, |
| "num_input_tokens_seen": 1396096, |
| "step": 2235 |
| }, |
| { |
| "epoch": 4.497991967871486, |
| "grad_norm": 2.5581798553466797, |
| "learning_rate": 9.533018962457347e-06, |
| "loss": 0.0831, |
| "num_input_tokens_seen": 1399168, |
| "step": 2240 |
| }, |
| { |
| "epoch": 4.508032128514056, |
| "grad_norm": 4.483763217926025, |
| "learning_rate": 9.529314720013618e-06, |
| "loss": 0.1206, |
| "num_input_tokens_seen": 1402976, |
| "step": 2245 |
| }, |
| { |
| "epoch": 4.518072289156627, |
| "grad_norm": 1.5558322668075562, |
| "learning_rate": 9.52559656944935e-06, |
| "loss": 0.1574, |
| "num_input_tokens_seen": 1406496, |
| "step": 2250 |
| }, |
| { |
| "epoch": 4.528112449799197, |
| "grad_norm": 5.891838550567627, |
| "learning_rate": 9.521864522181834e-06, |
| "loss": 0.1341, |
| "num_input_tokens_seen": 1409344, |
| "step": 2255 |
| }, |
| { |
| "epoch": 4.538152610441767, |
| "grad_norm": 3.315089702606201, |
| "learning_rate": 9.518118589671025e-06, |
| "loss": 0.1485, |
| "num_input_tokens_seen": 1412544, |
| "step": 2260 |
| }, |
| { |
| "epoch": 4.548192771084337, |
| "grad_norm": 12.753541946411133, |
| "learning_rate": 9.514358783419518e-06, |
| "loss": 0.1299, |
| "num_input_tokens_seen": 1415680, |
| "step": 2265 |
| }, |
| { |
| "epoch": 4.5582329317269075, |
| "grad_norm": 3.243870735168457, |
| "learning_rate": 9.510585114972518e-06, |
| "loss": 0.2371, |
| "num_input_tokens_seen": 1419040, |
| "step": 2270 |
| }, |
| { |
| "epoch": 4.568273092369478, |
| "grad_norm": 2.4356415271759033, |
| "learning_rate": 9.506797595917787e-06, |
| "loss": 0.1112, |
| "num_input_tokens_seen": 1422048, |
| "step": 2275 |
| }, |
| { |
| "epoch": 4.578313253012048, |
| "grad_norm": 3.8205935955047607, |
| "learning_rate": 9.502996237885623e-06, |
| "loss": 0.1132, |
| "num_input_tokens_seen": 1425504, |
| "step": 2280 |
| }, |
| { |
| "epoch": 4.588353413654619, |
| "grad_norm": 4.907054424285889, |
| "learning_rate": 9.499181052548813e-06, |
| "loss": 0.1357, |
| "num_input_tokens_seen": 1428608, |
| "step": 2285 |
| }, |
| { |
| "epoch": 4.598393574297189, |
| "grad_norm": 4.737507343292236, |
| "learning_rate": 9.495352051622612e-06, |
| "loss": 0.1227, |
| "num_input_tokens_seen": 1430752, |
| "step": 2290 |
| }, |
| { |
| "epoch": 4.608433734939759, |
| "grad_norm": 2.2299745082855225, |
| "learning_rate": 9.491509246864691e-06, |
| "loss": 0.1814, |
| "num_input_tokens_seen": 1433600, |
| "step": 2295 |
| }, |
| { |
| "epoch": 4.618473895582329, |
| "grad_norm": 1.2637619972229004, |
| "learning_rate": 9.487652650075116e-06, |
| "loss": 0.1479, |
| "num_input_tokens_seen": 1436352, |
| "step": 2300 |
| }, |
| { |
| "epoch": 4.628514056224899, |
| "grad_norm": 2.430283546447754, |
| "learning_rate": 9.483782273096295e-06, |
| "loss": 0.1756, |
| "num_input_tokens_seen": 1439296, |
| "step": 2305 |
| }, |
| { |
| "epoch": 4.63855421686747, |
| "grad_norm": 6.702330589294434, |
| "learning_rate": 9.479898127812957e-06, |
| "loss": 0.1802, |
| "num_input_tokens_seen": 1443456, |
| "step": 2310 |
| }, |
| { |
| "epoch": 4.64859437751004, |
| "grad_norm": 3.452070474624634, |
| "learning_rate": 9.476000226152107e-06, |
| "loss": 0.1391, |
| "num_input_tokens_seen": 1446624, |
| "step": 2315 |
| }, |
| { |
| "epoch": 4.658634538152611, |
| "grad_norm": 2.66707706451416, |
| "learning_rate": 9.472088580082991e-06, |
| "loss": 0.1071, |
| "num_input_tokens_seen": 1450016, |
| "step": 2320 |
| }, |
| { |
| "epoch": 4.668674698795181, |
| "grad_norm": 1.9608304500579834, |
| "learning_rate": 9.468163201617063e-06, |
| "loss": 0.1438, |
| "num_input_tokens_seen": 1453856, |
| "step": 2325 |
| }, |
| { |
| "epoch": 4.678714859437751, |
| "grad_norm": 2.3657283782958984, |
| "learning_rate": 9.46422410280794e-06, |
| "loss": 0.0999, |
| "num_input_tokens_seen": 1456832, |
| "step": 2330 |
| }, |
| { |
| "epoch": 4.688755020080321, |
| "grad_norm": 8.516715049743652, |
| "learning_rate": 9.460271295751373e-06, |
| "loss": 0.2223, |
| "num_input_tokens_seen": 1459488, |
| "step": 2335 |
| }, |
| { |
| "epoch": 4.698795180722891, |
| "grad_norm": 0.987719714641571, |
| "learning_rate": 9.456304792585207e-06, |
| "loss": 0.1433, |
| "num_input_tokens_seen": 1462400, |
| "step": 2340 |
| }, |
| { |
| "epoch": 4.708835341365462, |
| "grad_norm": 2.08788800239563, |
| "learning_rate": 9.452324605489344e-06, |
| "loss": 0.1415, |
| "num_input_tokens_seen": 1465248, |
| "step": 2345 |
| }, |
| { |
| "epoch": 4.718875502008032, |
| "grad_norm": 6.095128059387207, |
| "learning_rate": 9.448330746685704e-06, |
| "loss": 0.1313, |
| "num_input_tokens_seen": 1468128, |
| "step": 2350 |
| }, |
| { |
| "epoch": 4.728915662650603, |
| "grad_norm": 35.532806396484375, |
| "learning_rate": 9.444323228438186e-06, |
| "loss": 0.2186, |
| "num_input_tokens_seen": 1471040, |
| "step": 2355 |
| }, |
| { |
| "epoch": 4.738955823293173, |
| "grad_norm": 5.288688659667969, |
| "learning_rate": 9.440302063052638e-06, |
| "loss": 0.1963, |
| "num_input_tokens_seen": 1473568, |
| "step": 2360 |
| }, |
| { |
| "epoch": 4.7489959839357425, |
| "grad_norm": 4.212613582611084, |
| "learning_rate": 9.436267262876808e-06, |
| "loss": 0.1266, |
| "num_input_tokens_seen": 1477184, |
| "step": 2365 |
| }, |
| { |
| "epoch": 4.759036144578313, |
| "grad_norm": 2.8863844871520996, |
| "learning_rate": 9.43221884030032e-06, |
| "loss": 0.1391, |
| "num_input_tokens_seen": 1480512, |
| "step": 2370 |
| }, |
| { |
| "epoch": 4.769076305220883, |
| "grad_norm": 4.399484157562256, |
| "learning_rate": 9.428156807754622e-06, |
| "loss": 0.1569, |
| "num_input_tokens_seen": 1483776, |
| "step": 2375 |
| }, |
| { |
| "epoch": 4.779116465863454, |
| "grad_norm": 2.0701286792755127, |
| "learning_rate": 9.424081177712955e-06, |
| "loss": 0.1241, |
| "num_input_tokens_seen": 1486464, |
| "step": 2380 |
| }, |
| { |
| "epoch": 4.789156626506024, |
| "grad_norm": 2.4887611865997314, |
| "learning_rate": 9.419991962690317e-06, |
| "loss": 0.1112, |
| "num_input_tokens_seen": 1489056, |
| "step": 2385 |
| }, |
| { |
| "epoch": 4.7991967871485945, |
| "grad_norm": 3.219498872756958, |
| "learning_rate": 9.415889175243416e-06, |
| "loss": 0.1215, |
| "num_input_tokens_seen": 1491808, |
| "step": 2390 |
| }, |
| { |
| "epoch": 4.809236947791165, |
| "grad_norm": 5.092496871948242, |
| "learning_rate": 9.411772827970642e-06, |
| "loss": 0.1055, |
| "num_input_tokens_seen": 1495008, |
| "step": 2395 |
| }, |
| { |
| "epoch": 4.8192771084337345, |
| "grad_norm": 4.519360542297363, |
| "learning_rate": 9.40764293351202e-06, |
| "loss": 0.1365, |
| "num_input_tokens_seen": 1497760, |
| "step": 2400 |
| }, |
| { |
| "epoch": 4.829317269076305, |
| "grad_norm": 1.1300933361053467, |
| "learning_rate": 9.403499504549174e-06, |
| "loss": 0.175, |
| "num_input_tokens_seen": 1500544, |
| "step": 2405 |
| }, |
| { |
| "epoch": 4.839357429718875, |
| "grad_norm": 2.9588253498077393, |
| "learning_rate": 9.399342553805289e-06, |
| "loss": 0.1112, |
| "num_input_tokens_seen": 1503232, |
| "step": 2410 |
| }, |
| { |
| "epoch": 4.849397590361446, |
| "grad_norm": 3.217325448989868, |
| "learning_rate": 9.395172094045073e-06, |
| "loss": 0.1045, |
| "num_input_tokens_seen": 1506432, |
| "step": 2415 |
| }, |
| { |
| "epoch": 4.859437751004016, |
| "grad_norm": 5.2091569900512695, |
| "learning_rate": 9.390988138074713e-06, |
| "loss": 0.1387, |
| "num_input_tokens_seen": 1510336, |
| "step": 2420 |
| }, |
| { |
| "epoch": 4.8694779116465865, |
| "grad_norm": 8.337028503417969, |
| "learning_rate": 9.38679069874184e-06, |
| "loss": 0.2045, |
| "num_input_tokens_seen": 1512928, |
| "step": 2425 |
| }, |
| { |
| "epoch": 4.879518072289157, |
| "grad_norm": 4.4194111824035645, |
| "learning_rate": 9.382579788935487e-06, |
| "loss": 0.1364, |
| "num_input_tokens_seen": 1515968, |
| "step": 2430 |
| }, |
| { |
| "epoch": 4.889558232931726, |
| "grad_norm": 2.803941011428833, |
| "learning_rate": 9.378355421586053e-06, |
| "loss": 0.1537, |
| "num_input_tokens_seen": 1519168, |
| "step": 2435 |
| }, |
| { |
| "epoch": 4.899598393574297, |
| "grad_norm": 5.292468547821045, |
| "learning_rate": 9.374117609665263e-06, |
| "loss": 0.1241, |
| "num_input_tokens_seen": 1522432, |
| "step": 2440 |
| }, |
| { |
| "epoch": 4.909638554216867, |
| "grad_norm": 3.3019838333129883, |
| "learning_rate": 9.369866366186116e-06, |
| "loss": 0.1524, |
| "num_input_tokens_seen": 1525696, |
| "step": 2445 |
| }, |
| { |
| "epoch": 4.919678714859438, |
| "grad_norm": 7.390207767486572, |
| "learning_rate": 9.365601704202869e-06, |
| "loss": 0.1753, |
| "num_input_tokens_seen": 1528736, |
| "step": 2450 |
| }, |
| { |
| "epoch": 4.929718875502008, |
| "grad_norm": 2.4274277687072754, |
| "learning_rate": 9.36132363681097e-06, |
| "loss": 0.1714, |
| "num_input_tokens_seen": 1531648, |
| "step": 2455 |
| }, |
| { |
| "epoch": 4.9397590361445785, |
| "grad_norm": 1.9142770767211914, |
| "learning_rate": 9.35703217714704e-06, |
| "loss": 0.1605, |
| "num_input_tokens_seen": 1534720, |
| "step": 2460 |
| }, |
| { |
| "epoch": 4.949799196787149, |
| "grad_norm": 3.327009677886963, |
| "learning_rate": 9.35272733838882e-06, |
| "loss": 0.195, |
| "num_input_tokens_seen": 1536928, |
| "step": 2465 |
| }, |
| { |
| "epoch": 4.959839357429718, |
| "grad_norm": 2.2915942668914795, |
| "learning_rate": 9.348409133755137e-06, |
| "loss": 0.1326, |
| "num_input_tokens_seen": 1539648, |
| "step": 2470 |
| }, |
| { |
| "epoch": 4.969879518072289, |
| "grad_norm": 4.97459602355957, |
| "learning_rate": 9.344077576505853e-06, |
| "loss": 0.1515, |
| "num_input_tokens_seen": 1543552, |
| "step": 2475 |
| }, |
| { |
| "epoch": 4.979919678714859, |
| "grad_norm": 2.44541335105896, |
| "learning_rate": 9.339732679941842e-06, |
| "loss": 0.1143, |
| "num_input_tokens_seen": 1546912, |
| "step": 2480 |
| }, |
| { |
| "epoch": 4.98995983935743, |
| "grad_norm": 2.091432809829712, |
| "learning_rate": 9.335374457404928e-06, |
| "loss": 0.1388, |
| "num_input_tokens_seen": 1550688, |
| "step": 2485 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 2.9539296627044678, |
| "learning_rate": 9.331002922277865e-06, |
| "loss": 0.1338, |
| "num_input_tokens_seen": 1553472, |
| "step": 2490 |
| }, |
| { |
| "epoch": 5.01004016064257, |
| "grad_norm": 1.839915156364441, |
| "learning_rate": 9.326618087984278e-06, |
| "loss": 0.0774, |
| "num_input_tokens_seen": 1557056, |
| "step": 2495 |
| }, |
| { |
| "epoch": 5.020080321285141, |
| "grad_norm": 3.636218786239624, |
| "learning_rate": 9.322219967988638e-06, |
| "loss": 0.1358, |
| "num_input_tokens_seen": 1559968, |
| "step": 2500 |
| }, |
| { |
| "epoch": 5.030120481927711, |
| "grad_norm": 11.675433158874512, |
| "learning_rate": 9.317808575796202e-06, |
| "loss": 0.1477, |
| "num_input_tokens_seen": 1563040, |
| "step": 2505 |
| }, |
| { |
| "epoch": 5.040160642570281, |
| "grad_norm": 2.875239372253418, |
| "learning_rate": 9.313383924952988e-06, |
| "loss": 0.1027, |
| "num_input_tokens_seen": 1565760, |
| "step": 2510 |
| }, |
| { |
| "epoch": 5.050200803212851, |
| "grad_norm": 3.811685085296631, |
| "learning_rate": 9.308946029045726e-06, |
| "loss": 0.0806, |
| "num_input_tokens_seen": 1568928, |
| "step": 2515 |
| }, |
| { |
| "epoch": 5.0602409638554215, |
| "grad_norm": 1.25583815574646, |
| "learning_rate": 9.304494901701821e-06, |
| "loss": 0.1202, |
| "num_input_tokens_seen": 1571808, |
| "step": 2520 |
| }, |
| { |
| "epoch": 5.070281124497992, |
| "grad_norm": 3.814985513687134, |
| "learning_rate": 9.300030556589303e-06, |
| "loss": 0.1632, |
| "num_input_tokens_seen": 1575200, |
| "step": 2525 |
| }, |
| { |
| "epoch": 5.080321285140562, |
| "grad_norm": 4.336440086364746, |
| "learning_rate": 9.29555300741679e-06, |
| "loss": 0.1406, |
| "num_input_tokens_seen": 1578880, |
| "step": 2530 |
| }, |
| { |
| "epoch": 5.090361445783133, |
| "grad_norm": 3.6324188709259033, |
| "learning_rate": 9.291062267933446e-06, |
| "loss": 0.1248, |
| "num_input_tokens_seen": 1581120, |
| "step": 2535 |
| }, |
| { |
| "epoch": 5.100401606425703, |
| "grad_norm": 7.417630195617676, |
| "learning_rate": 9.28655835192894e-06, |
| "loss": 0.1795, |
| "num_input_tokens_seen": 1584064, |
| "step": 2540 |
| }, |
| { |
| "epoch": 5.110441767068273, |
| "grad_norm": 5.5329155921936035, |
| "learning_rate": 9.282041273233402e-06, |
| "loss": 0.1542, |
| "num_input_tokens_seen": 1587744, |
| "step": 2545 |
| }, |
| { |
| "epoch": 5.120481927710843, |
| "grad_norm": 5.481621265411377, |
| "learning_rate": 9.277511045717377e-06, |
| "loss": 0.1454, |
| "num_input_tokens_seen": 1590624, |
| "step": 2550 |
| }, |
| { |
| "epoch": 5.1305220883534135, |
| "grad_norm": 5.395864009857178, |
| "learning_rate": 9.27296768329179e-06, |
| "loss": 0.1246, |
| "num_input_tokens_seen": 1594016, |
| "step": 2555 |
| }, |
| { |
| "epoch": 5.140562248995984, |
| "grad_norm": 2.902698278427124, |
| "learning_rate": 9.268411199907898e-06, |
| "loss": 0.1345, |
| "num_input_tokens_seen": 1596640, |
| "step": 2560 |
| }, |
| { |
| "epoch": 5.150602409638554, |
| "grad_norm": 14.940276145935059, |
| "learning_rate": 9.263841609557247e-06, |
| "loss": 0.1773, |
| "num_input_tokens_seen": 1599840, |
| "step": 2565 |
| }, |
| { |
| "epoch": 5.160642570281125, |
| "grad_norm": 3.3749656677246094, |
| "learning_rate": 9.259258926271632e-06, |
| "loss": 0.1523, |
| "num_input_tokens_seen": 1602656, |
| "step": 2570 |
| }, |
| { |
| "epoch": 5.170682730923695, |
| "grad_norm": 9.682775497436523, |
| "learning_rate": 9.254663164123052e-06, |
| "loss": 0.119, |
| "num_input_tokens_seen": 1606176, |
| "step": 2575 |
| }, |
| { |
| "epoch": 5.180722891566265, |
| "grad_norm": 7.369882583618164, |
| "learning_rate": 9.250054337223666e-06, |
| "loss": 0.1502, |
| "num_input_tokens_seen": 1608768, |
| "step": 2580 |
| }, |
| { |
| "epoch": 5.190763052208835, |
| "grad_norm": 2.4143364429473877, |
| "learning_rate": 9.245432459725754e-06, |
| "loss": 0.1165, |
| "num_input_tokens_seen": 1611168, |
| "step": 2585 |
| }, |
| { |
| "epoch": 5.2008032128514055, |
| "grad_norm": 4.51724100112915, |
| "learning_rate": 9.240797545821666e-06, |
| "loss": 0.1484, |
| "num_input_tokens_seen": 1614720, |
| "step": 2590 |
| }, |
| { |
| "epoch": 5.210843373493976, |
| "grad_norm": 3.192105293273926, |
| "learning_rate": 9.236149609743786e-06, |
| "loss": 0.1225, |
| "num_input_tokens_seen": 1617504, |
| "step": 2595 |
| }, |
| { |
| "epoch": 5.220883534136546, |
| "grad_norm": 1.9884601831436157, |
| "learning_rate": 9.231488665764485e-06, |
| "loss": 0.0974, |
| "num_input_tokens_seen": 1620672, |
| "step": 2600 |
| }, |
| { |
| "epoch": 5.230923694779117, |
| "grad_norm": 3.4887187480926514, |
| "learning_rate": 9.226814728196072e-06, |
| "loss": 0.1373, |
| "num_input_tokens_seen": 1623488, |
| "step": 2605 |
| }, |
| { |
| "epoch": 5.240963855421687, |
| "grad_norm": 4.321260452270508, |
| "learning_rate": 9.222127811390765e-06, |
| "loss": 0.1718, |
| "num_input_tokens_seen": 1626080, |
| "step": 2610 |
| }, |
| { |
| "epoch": 5.2510040160642575, |
| "grad_norm": 1.5512542724609375, |
| "learning_rate": 9.217427929740625e-06, |
| "loss": 0.0963, |
| "num_input_tokens_seen": 1629536, |
| "step": 2615 |
| }, |
| { |
| "epoch": 5.261044176706827, |
| "grad_norm": 1.7714067697525024, |
| "learning_rate": 9.212715097677537e-06, |
| "loss": 0.1076, |
| "num_input_tokens_seen": 1632768, |
| "step": 2620 |
| }, |
| { |
| "epoch": 5.271084337349397, |
| "grad_norm": 5.0114264488220215, |
| "learning_rate": 9.207989329673143e-06, |
| "loss": 0.1039, |
| "num_input_tokens_seen": 1636256, |
| "step": 2625 |
| }, |
| { |
| "epoch": 5.281124497991968, |
| "grad_norm": 2.490394353866577, |
| "learning_rate": 9.203250640238813e-06, |
| "loss": 0.1435, |
| "num_input_tokens_seen": 1639264, |
| "step": 2630 |
| }, |
| { |
| "epoch": 5.291164658634538, |
| "grad_norm": 1.5815428495407104, |
| "learning_rate": 9.198499043925591e-06, |
| "loss": 0.1152, |
| "num_input_tokens_seen": 1642432, |
| "step": 2635 |
| }, |
| { |
| "epoch": 5.301204819277109, |
| "grad_norm": 2.301948308944702, |
| "learning_rate": 9.193734555324154e-06, |
| "loss": 0.1229, |
| "num_input_tokens_seen": 1645600, |
| "step": 2640 |
| }, |
| { |
| "epoch": 5.311244979919679, |
| "grad_norm": 7.6477179527282715, |
| "learning_rate": 9.18895718906477e-06, |
| "loss": 0.1259, |
| "num_input_tokens_seen": 1648192, |
| "step": 2645 |
| }, |
| { |
| "epoch": 5.321285140562249, |
| "grad_norm": 11.513751029968262, |
| "learning_rate": 9.184166959817247e-06, |
| "loss": 0.1592, |
| "num_input_tokens_seen": 1651776, |
| "step": 2650 |
| }, |
| { |
| "epoch": 5.331325301204819, |
| "grad_norm": 7.205603122711182, |
| "learning_rate": 9.179363882290896e-06, |
| "loss": 0.0776, |
| "num_input_tokens_seen": 1654944, |
| "step": 2655 |
| }, |
| { |
| "epoch": 5.341365461847389, |
| "grad_norm": 3.9215004444122314, |
| "learning_rate": 9.17454797123448e-06, |
| "loss": 0.1496, |
| "num_input_tokens_seen": 1657344, |
| "step": 2660 |
| }, |
| { |
| "epoch": 5.35140562248996, |
| "grad_norm": 2.7696170806884766, |
| "learning_rate": 9.169719241436162e-06, |
| "loss": 0.0683, |
| "num_input_tokens_seen": 1659680, |
| "step": 2665 |
| }, |
| { |
| "epoch": 5.36144578313253, |
| "grad_norm": 7.695875644683838, |
| "learning_rate": 9.164877707723476e-06, |
| "loss": 0.2095, |
| "num_input_tokens_seen": 1662560, |
| "step": 2670 |
| }, |
| { |
| "epoch": 5.371485943775101, |
| "grad_norm": 7.6388020515441895, |
| "learning_rate": 9.160023384963271e-06, |
| "loss": 0.114, |
| "num_input_tokens_seen": 1665728, |
| "step": 2675 |
| }, |
| { |
| "epoch": 5.381526104417671, |
| "grad_norm": 11.195474624633789, |
| "learning_rate": 9.155156288061666e-06, |
| "loss": 0.1034, |
| "num_input_tokens_seen": 1669216, |
| "step": 2680 |
| }, |
| { |
| "epoch": 5.391566265060241, |
| "grad_norm": 7.711627006530762, |
| "learning_rate": 9.150276431964007e-06, |
| "loss": 0.154, |
| "num_input_tokens_seen": 1672768, |
| "step": 2685 |
| }, |
| { |
| "epoch": 5.401606425702811, |
| "grad_norm": 5.470666885375977, |
| "learning_rate": 9.145383831654814e-06, |
| "loss": 0.1459, |
| "num_input_tokens_seen": 1675520, |
| "step": 2690 |
| }, |
| { |
| "epoch": 5.411646586345381, |
| "grad_norm": 3.5182509422302246, |
| "learning_rate": 9.14047850215775e-06, |
| "loss": 0.0966, |
| "num_input_tokens_seen": 1678784, |
| "step": 2695 |
| }, |
| { |
| "epoch": 5.421686746987952, |
| "grad_norm": 2.275167942047119, |
| "learning_rate": 9.13556045853556e-06, |
| "loss": 0.1964, |
| "num_input_tokens_seen": 1681376, |
| "step": 2700 |
| }, |
| { |
| "epoch": 5.431726907630522, |
| "grad_norm": 1.8862415552139282, |
| "learning_rate": 9.130629715890027e-06, |
| "loss": 0.1234, |
| "num_input_tokens_seen": 1684864, |
| "step": 2705 |
| }, |
| { |
| "epoch": 5.4417670682730925, |
| "grad_norm": 2.1846301555633545, |
| "learning_rate": 9.125686289361935e-06, |
| "loss": 0.1196, |
| "num_input_tokens_seen": 1688896, |
| "step": 2710 |
| }, |
| { |
| "epoch": 5.451807228915663, |
| "grad_norm": 2.483438491821289, |
| "learning_rate": 9.120730194131011e-06, |
| "loss": 0.0986, |
| "num_input_tokens_seen": 1692288, |
| "step": 2715 |
| }, |
| { |
| "epoch": 5.461847389558233, |
| "grad_norm": 4.106010913848877, |
| "learning_rate": 9.115761445415887e-06, |
| "loss": 0.1174, |
| "num_input_tokens_seen": 1695200, |
| "step": 2720 |
| }, |
| { |
| "epoch": 5.471887550200803, |
| "grad_norm": 0.8280296325683594, |
| "learning_rate": 9.110780058474052e-06, |
| "loss": 0.1247, |
| "num_input_tokens_seen": 1698720, |
| "step": 2725 |
| }, |
| { |
| "epoch": 5.481927710843373, |
| "grad_norm": 1.9926586151123047, |
| "learning_rate": 9.105786048601795e-06, |
| "loss": 0.1565, |
| "num_input_tokens_seen": 1701536, |
| "step": 2730 |
| }, |
| { |
| "epoch": 5.491967871485944, |
| "grad_norm": 2.2215921878814697, |
| "learning_rate": 9.100779431134175e-06, |
| "loss": 0.1237, |
| "num_input_tokens_seen": 1704864, |
| "step": 2735 |
| }, |
| { |
| "epoch": 5.502008032128514, |
| "grad_norm": 4.604000091552734, |
| "learning_rate": 9.09576022144496e-06, |
| "loss": 0.1348, |
| "num_input_tokens_seen": 1708000, |
| "step": 2740 |
| }, |
| { |
| "epoch": 5.5120481927710845, |
| "grad_norm": 1.5299623012542725, |
| "learning_rate": 9.090728434946584e-06, |
| "loss": 0.1286, |
| "num_input_tokens_seen": 1711296, |
| "step": 2745 |
| }, |
| { |
| "epoch": 5.522088353413655, |
| "grad_norm": 6.929166316986084, |
| "learning_rate": 9.085684087090108e-06, |
| "loss": 0.1311, |
| "num_input_tokens_seen": 1714880, |
| "step": 2750 |
| }, |
| { |
| "epoch": 5.532128514056225, |
| "grad_norm": 17.860334396362305, |
| "learning_rate": 9.080627193365155e-06, |
| "loss": 0.1346, |
| "num_input_tokens_seen": 1717728, |
| "step": 2755 |
| }, |
| { |
| "epoch": 5.542168674698795, |
| "grad_norm": 1.9477763175964355, |
| "learning_rate": 9.075557769299877e-06, |
| "loss": 0.1556, |
| "num_input_tokens_seen": 1721280, |
| "step": 2760 |
| }, |
| { |
| "epoch": 5.552208835341365, |
| "grad_norm": 1.6732323169708252, |
| "learning_rate": 9.070475830460906e-06, |
| "loss": 0.1214, |
| "num_input_tokens_seen": 1723968, |
| "step": 2765 |
| }, |
| { |
| "epoch": 5.562248995983936, |
| "grad_norm": 10.01714038848877, |
| "learning_rate": 9.065381392453296e-06, |
| "loss": 0.1406, |
| "num_input_tokens_seen": 1727424, |
| "step": 2770 |
| }, |
| { |
| "epoch": 5.572289156626506, |
| "grad_norm": 3.083617687225342, |
| "learning_rate": 9.060274470920487e-06, |
| "loss": 0.1231, |
| "num_input_tokens_seen": 1730528, |
| "step": 2775 |
| }, |
| { |
| "epoch": 5.582329317269076, |
| "grad_norm": 2.5101115703582764, |
| "learning_rate": 9.055155081544253e-06, |
| "loss": 0.1405, |
| "num_input_tokens_seen": 1734208, |
| "step": 2780 |
| }, |
| { |
| "epoch": 5.592369477911647, |
| "grad_norm": 4.0800580978393555, |
| "learning_rate": 9.050023240044649e-06, |
| "loss": 0.1144, |
| "num_input_tokens_seen": 1737728, |
| "step": 2785 |
| }, |
| { |
| "epoch": 5.602409638554217, |
| "grad_norm": 8.37118911743164, |
| "learning_rate": 9.044878962179968e-06, |
| "loss": 0.1405, |
| "num_input_tokens_seen": 1740800, |
| "step": 2790 |
| }, |
| { |
| "epoch": 5.612449799196787, |
| "grad_norm": 3.184250593185425, |
| "learning_rate": 9.039722263746693e-06, |
| "loss": 0.1596, |
| "num_input_tokens_seen": 1744096, |
| "step": 2795 |
| }, |
| { |
| "epoch": 5.622489959839357, |
| "grad_norm": 2.413656711578369, |
| "learning_rate": 9.034553160579444e-06, |
| "loss": 0.0979, |
| "num_input_tokens_seen": 1746720, |
| "step": 2800 |
| }, |
| { |
| "epoch": 5.632530120481928, |
| "grad_norm": 13.534455299377441, |
| "learning_rate": 9.029371668550933e-06, |
| "loss": 0.1587, |
| "num_input_tokens_seen": 1750304, |
| "step": 2805 |
| }, |
| { |
| "epoch": 5.642570281124498, |
| "grad_norm": 1.7071505784988403, |
| "learning_rate": 9.024177803571917e-06, |
| "loss": 0.13, |
| "num_input_tokens_seen": 1753600, |
| "step": 2810 |
| }, |
| { |
| "epoch": 5.652610441767068, |
| "grad_norm": 4.593850135803223, |
| "learning_rate": 9.018971581591141e-06, |
| "loss": 0.1681, |
| "num_input_tokens_seen": 1756096, |
| "step": 2815 |
| }, |
| { |
| "epoch": 5.662650602409639, |
| "grad_norm": 6.782101154327393, |
| "learning_rate": 9.013753018595302e-06, |
| "loss": 0.1039, |
| "num_input_tokens_seen": 1759072, |
| "step": 2820 |
| }, |
| { |
| "epoch": 5.672690763052209, |
| "grad_norm": 2.9062986373901367, |
| "learning_rate": 9.008522130608984e-06, |
| "loss": 0.0958, |
| "num_input_tokens_seen": 1762720, |
| "step": 2825 |
| }, |
| { |
| "epoch": 5.682730923694779, |
| "grad_norm": 2.183579444885254, |
| "learning_rate": 9.003278933694625e-06, |
| "loss": 0.1527, |
| "num_input_tokens_seen": 1765472, |
| "step": 2830 |
| }, |
| { |
| "epoch": 5.692771084337349, |
| "grad_norm": 5.613504409790039, |
| "learning_rate": 8.998023443952453e-06, |
| "loss": 0.0948, |
| "num_input_tokens_seen": 1769472, |
| "step": 2835 |
| }, |
| { |
| "epoch": 5.7028112449799195, |
| "grad_norm": 8.625523567199707, |
| "learning_rate": 8.992755677520448e-06, |
| "loss": 0.1371, |
| "num_input_tokens_seen": 1772640, |
| "step": 2840 |
| }, |
| { |
| "epoch": 5.71285140562249, |
| "grad_norm": 3.748905897140503, |
| "learning_rate": 8.987475650574289e-06, |
| "loss": 0.1788, |
| "num_input_tokens_seen": 1775744, |
| "step": 2845 |
| }, |
| { |
| "epoch": 5.72289156626506, |
| "grad_norm": 4.181033611297607, |
| "learning_rate": 8.982183379327299e-06, |
| "loss": 0.1061, |
| "num_input_tokens_seen": 1778944, |
| "step": 2850 |
| }, |
| { |
| "epoch": 5.732931726907631, |
| "grad_norm": 4.234264373779297, |
| "learning_rate": 8.9768788800304e-06, |
| "loss": 0.1245, |
| "num_input_tokens_seen": 1782400, |
| "step": 2855 |
| }, |
| { |
| "epoch": 5.742971887550201, |
| "grad_norm": 2.122401714324951, |
| "learning_rate": 8.971562168972065e-06, |
| "loss": 0.144, |
| "num_input_tokens_seen": 1784416, |
| "step": 2860 |
| }, |
| { |
| "epoch": 5.753012048192771, |
| "grad_norm": 6.364081859588623, |
| "learning_rate": 8.966233262478266e-06, |
| "loss": 0.1747, |
| "num_input_tokens_seen": 1787392, |
| "step": 2865 |
| }, |
| { |
| "epoch": 5.763052208835341, |
| "grad_norm": 4.270403861999512, |
| "learning_rate": 8.960892176912418e-06, |
| "loss": 0.1084, |
| "num_input_tokens_seen": 1790976, |
| "step": 2870 |
| }, |
| { |
| "epoch": 5.7730923694779115, |
| "grad_norm": 7.241430759429932, |
| "learning_rate": 8.955538928675343e-06, |
| "loss": 0.1494, |
| "num_input_tokens_seen": 1793952, |
| "step": 2875 |
| }, |
| { |
| "epoch": 5.783132530120482, |
| "grad_norm": 2.145287036895752, |
| "learning_rate": 8.950173534205202e-06, |
| "loss": 0.1379, |
| "num_input_tokens_seen": 1797568, |
| "step": 2880 |
| }, |
| { |
| "epoch": 5.793172690763052, |
| "grad_norm": 3.997114896774292, |
| "learning_rate": 8.944796009977459e-06, |
| "loss": 0.1645, |
| "num_input_tokens_seen": 1800128, |
| "step": 2885 |
| }, |
| { |
| "epoch": 5.803212851405623, |
| "grad_norm": 4.003738880157471, |
| "learning_rate": 8.939406372504823e-06, |
| "loss": 0.1543, |
| "num_input_tokens_seen": 1803712, |
| "step": 2890 |
| }, |
| { |
| "epoch": 5.813253012048193, |
| "grad_norm": 4.363209247589111, |
| "learning_rate": 8.934004638337197e-06, |
| "loss": 0.0882, |
| "num_input_tokens_seen": 1806784, |
| "step": 2895 |
| }, |
| { |
| "epoch": 5.823293172690763, |
| "grad_norm": 2.9139928817749023, |
| "learning_rate": 8.928590824061633e-06, |
| "loss": 0.0888, |
| "num_input_tokens_seen": 1809312, |
| "step": 2900 |
| }, |
| { |
| "epoch": 5.833333333333333, |
| "grad_norm": 5.311194896697998, |
| "learning_rate": 8.923164946302274e-06, |
| "loss": 0.1286, |
| "num_input_tokens_seen": 1812192, |
| "step": 2905 |
| }, |
| { |
| "epoch": 5.843373493975903, |
| "grad_norm": 2.110668420791626, |
| "learning_rate": 8.917727021720308e-06, |
| "loss": 0.1309, |
| "num_input_tokens_seen": 1815168, |
| "step": 2910 |
| }, |
| { |
| "epoch": 5.853413654618474, |
| "grad_norm": 8.800142288208008, |
| "learning_rate": 8.912277067013914e-06, |
| "loss": 0.123, |
| "num_input_tokens_seen": 1818176, |
| "step": 2915 |
| }, |
| { |
| "epoch": 5.863453815261044, |
| "grad_norm": 2.967930555343628, |
| "learning_rate": 8.906815098918214e-06, |
| "loss": 0.1411, |
| "num_input_tokens_seen": 1821120, |
| "step": 2920 |
| }, |
| { |
| "epoch": 5.873493975903615, |
| "grad_norm": 2.1932666301727295, |
| "learning_rate": 8.901341134205214e-06, |
| "loss": 0.1104, |
| "num_input_tokens_seen": 1823840, |
| "step": 2925 |
| }, |
| { |
| "epoch": 5.883534136546185, |
| "grad_norm": 12.93001937866211, |
| "learning_rate": 8.895855189683768e-06, |
| "loss": 0.1315, |
| "num_input_tokens_seen": 1827168, |
| "step": 2930 |
| }, |
| { |
| "epoch": 5.893574297188755, |
| "grad_norm": 3.48124361038208, |
| "learning_rate": 8.890357282199504e-06, |
| "loss": 0.1622, |
| "num_input_tokens_seen": 1829504, |
| "step": 2935 |
| }, |
| { |
| "epoch": 5.903614457831325, |
| "grad_norm": 8.852481842041016, |
| "learning_rate": 8.884847428634792e-06, |
| "loss": 0.156, |
| "num_input_tokens_seen": 1832640, |
| "step": 2940 |
| }, |
| { |
| "epoch": 5.913654618473895, |
| "grad_norm": 4.551000595092773, |
| "learning_rate": 8.879325645908686e-06, |
| "loss": 0.1578, |
| "num_input_tokens_seen": 1836448, |
| "step": 2945 |
| }, |
| { |
| "epoch": 5.923694779116466, |
| "grad_norm": 5.305069446563721, |
| "learning_rate": 8.873791950976865e-06, |
| "loss": 0.1497, |
| "num_input_tokens_seen": 1839104, |
| "step": 2950 |
| }, |
| { |
| "epoch": 5.933734939759036, |
| "grad_norm": 9.336993217468262, |
| "learning_rate": 8.868246360831589e-06, |
| "loss": 0.1957, |
| "num_input_tokens_seen": 1841952, |
| "step": 2955 |
| }, |
| { |
| "epoch": 5.943775100401607, |
| "grad_norm": 2.6634745597839355, |
| "learning_rate": 8.862688892501648e-06, |
| "loss": 0.1177, |
| "num_input_tokens_seen": 1846272, |
| "step": 2960 |
| }, |
| { |
| "epoch": 5.953815261044177, |
| "grad_norm": 2.5295801162719727, |
| "learning_rate": 8.857119563052301e-06, |
| "loss": 0.1179, |
| "num_input_tokens_seen": 1849888, |
| "step": 2965 |
| }, |
| { |
| "epoch": 5.9638554216867465, |
| "grad_norm": 1.7765917778015137, |
| "learning_rate": 8.851538389585234e-06, |
| "loss": 0.1344, |
| "num_input_tokens_seen": 1853152, |
| "step": 2970 |
| }, |
| { |
| "epoch": 5.973895582329317, |
| "grad_norm": 1.9854011535644531, |
| "learning_rate": 8.845945389238496e-06, |
| "loss": 0.0961, |
| "num_input_tokens_seen": 1856128, |
| "step": 2975 |
| }, |
| { |
| "epoch": 5.983935742971887, |
| "grad_norm": 10.58693790435791, |
| "learning_rate": 8.840340579186457e-06, |
| "loss": 0.1184, |
| "num_input_tokens_seen": 1858496, |
| "step": 2980 |
| }, |
| { |
| "epoch": 5.993975903614458, |
| "grad_norm": 0.9854339957237244, |
| "learning_rate": 8.834723976639752e-06, |
| "loss": 0.1139, |
| "num_input_tokens_seen": 1860928, |
| "step": 2985 |
| }, |
| { |
| "epoch": 6.0, |
| "eval_loss": 0.19226641952991486, |
| "eval_runtime": 8.0748, |
| "eval_samples_per_second": 61.673, |
| "eval_steps_per_second": 15.48, |
| "num_input_tokens_seen": 1862848, |
| "step": 2988 |
| }, |
| { |
| "epoch": 6.004016064257028, |
| "grad_norm": 2.078145742416382, |
| "learning_rate": 8.829095598845224e-06, |
| "loss": 0.1233, |
| "num_input_tokens_seen": 1864064, |
| "step": 2990 |
| }, |
| { |
| "epoch": 6.014056224899599, |
| "grad_norm": 12.33251667022705, |
| "learning_rate": 8.823455463085873e-06, |
| "loss": 0.0896, |
| "num_input_tokens_seen": 1867360, |
| "step": 2995 |
| }, |
| { |
| "epoch": 6.024096385542169, |
| "grad_norm": 4.962930202484131, |
| "learning_rate": 8.81780358668081e-06, |
| "loss": 0.114, |
| "num_input_tokens_seen": 1870112, |
| "step": 3000 |
| }, |
| { |
| "epoch": 6.034136546184739, |
| "grad_norm": 18.131410598754883, |
| "learning_rate": 8.812139986985194e-06, |
| "loss": 0.0927, |
| "num_input_tokens_seen": 1873632, |
| "step": 3005 |
| }, |
| { |
| "epoch": 6.044176706827309, |
| "grad_norm": 2.0991618633270264, |
| "learning_rate": 8.806464681390182e-06, |
| "loss": 0.1233, |
| "num_input_tokens_seen": 1876480, |
| "step": 3010 |
| }, |
| { |
| "epoch": 6.054216867469879, |
| "grad_norm": 5.510014057159424, |
| "learning_rate": 8.800777687322875e-06, |
| "loss": 0.1091, |
| "num_input_tokens_seen": 1880032, |
| "step": 3015 |
| }, |
| { |
| "epoch": 6.06425702811245, |
| "grad_norm": 0.9164973497390747, |
| "learning_rate": 8.795079022246269e-06, |
| "loss": 0.1362, |
| "num_input_tokens_seen": 1882400, |
| "step": 3020 |
| }, |
| { |
| "epoch": 6.07429718875502, |
| "grad_norm": 2.5013444423675537, |
| "learning_rate": 8.789368703659199e-06, |
| "loss": 0.1909, |
| "num_input_tokens_seen": 1885632, |
| "step": 3025 |
| }, |
| { |
| "epoch": 6.0843373493975905, |
| "grad_norm": 7.10783052444458, |
| "learning_rate": 8.78364674909628e-06, |
| "loss": 0.1584, |
| "num_input_tokens_seen": 1889088, |
| "step": 3030 |
| }, |
| { |
| "epoch": 6.094377510040161, |
| "grad_norm": 12.30833911895752, |
| "learning_rate": 8.777913176127859e-06, |
| "loss": 0.1345, |
| "num_input_tokens_seen": 1893056, |
| "step": 3035 |
| }, |
| { |
| "epoch": 6.104417670682731, |
| "grad_norm": 3.545053720474243, |
| "learning_rate": 8.772168002359962e-06, |
| "loss": 0.1093, |
| "num_input_tokens_seen": 1896896, |
| "step": 3040 |
| }, |
| { |
| "epoch": 6.114457831325301, |
| "grad_norm": 1.529579758644104, |
| "learning_rate": 8.766411245434234e-06, |
| "loss": 0.081, |
| "num_input_tokens_seen": 1899968, |
| "step": 3045 |
| }, |
| { |
| "epoch": 6.124497991967871, |
| "grad_norm": 13.889845848083496, |
| "learning_rate": 8.760642923027888e-06, |
| "loss": 0.0967, |
| "num_input_tokens_seen": 1902944, |
| "step": 3050 |
| }, |
| { |
| "epoch": 6.134538152610442, |
| "grad_norm": 5.338316917419434, |
| "learning_rate": 8.754863052853658e-06, |
| "loss": 0.1337, |
| "num_input_tokens_seen": 1906368, |
| "step": 3055 |
| }, |
| { |
| "epoch": 6.144578313253012, |
| "grad_norm": 7.715173721313477, |
| "learning_rate": 8.74907165265973e-06, |
| "loss": 0.1273, |
| "num_input_tokens_seen": 1908832, |
| "step": 3060 |
| }, |
| { |
| "epoch": 6.1546184738955825, |
| "grad_norm": 7.223991870880127, |
| "learning_rate": 8.743268740229693e-06, |
| "loss": 0.1598, |
| "num_input_tokens_seen": 1911360, |
| "step": 3065 |
| }, |
| { |
| "epoch": 6.164658634538153, |
| "grad_norm": 6.749616622924805, |
| "learning_rate": 8.7374543333825e-06, |
| "loss": 0.1322, |
| "num_input_tokens_seen": 1914816, |
| "step": 3070 |
| }, |
| { |
| "epoch": 6.174698795180723, |
| "grad_norm": 5.285307884216309, |
| "learning_rate": 8.731628449972382e-06, |
| "loss": 0.0712, |
| "num_input_tokens_seen": 1917728, |
| "step": 3075 |
| }, |
| { |
| "epoch": 6.184738955823293, |
| "grad_norm": 12.172269821166992, |
| "learning_rate": 8.725791107888825e-06, |
| "loss": 0.0885, |
| "num_input_tokens_seen": 1920672, |
| "step": 3080 |
| }, |
| { |
| "epoch": 6.194779116465863, |
| "grad_norm": 8.986129760742188, |
| "learning_rate": 8.719942325056496e-06, |
| "loss": 0.1114, |
| "num_input_tokens_seen": 1923776, |
| "step": 3085 |
| }, |
| { |
| "epoch": 6.204819277108434, |
| "grad_norm": 4.885204315185547, |
| "learning_rate": 8.71408211943519e-06, |
| "loss": 0.1036, |
| "num_input_tokens_seen": 1926464, |
| "step": 3090 |
| }, |
| { |
| "epoch": 6.214859437751004, |
| "grad_norm": 9.752225875854492, |
| "learning_rate": 8.70821050901978e-06, |
| "loss": 0.1345, |
| "num_input_tokens_seen": 1929792, |
| "step": 3095 |
| }, |
| { |
| "epoch": 6.224899598393574, |
| "grad_norm": 4.695402145385742, |
| "learning_rate": 8.702327511840165e-06, |
| "loss": 0.0932, |
| "num_input_tokens_seen": 1933664, |
| "step": 3100 |
| }, |
| { |
| "epoch": 6.234939759036145, |
| "grad_norm": 2.340029001235962, |
| "learning_rate": 8.6964331459612e-06, |
| "loss": 0.1173, |
| "num_input_tokens_seen": 1936704, |
| "step": 3105 |
| }, |
| { |
| "epoch": 6.244979919678715, |
| "grad_norm": 11.728029251098633, |
| "learning_rate": 8.690527429482658e-06, |
| "loss": 0.1965, |
| "num_input_tokens_seen": 1939552, |
| "step": 3110 |
| }, |
| { |
| "epoch": 6.255020080321285, |
| "grad_norm": 1.949618935585022, |
| "learning_rate": 8.68461038053916e-06, |
| "loss": 0.1401, |
| "num_input_tokens_seen": 1942944, |
| "step": 3115 |
| }, |
| { |
| "epoch": 6.265060240963855, |
| "grad_norm": 2.2153432369232178, |
| "learning_rate": 8.678682017300126e-06, |
| "loss": 0.0998, |
| "num_input_tokens_seen": 1945600, |
| "step": 3120 |
| }, |
| { |
| "epoch": 6.275100401606426, |
| "grad_norm": 2.1864218711853027, |
| "learning_rate": 8.672742357969724e-06, |
| "loss": 0.1296, |
| "num_input_tokens_seen": 1948416, |
| "step": 3125 |
| }, |
| { |
| "epoch": 6.285140562248996, |
| "grad_norm": 16.295392990112305, |
| "learning_rate": 8.666791420786805e-06, |
| "loss": 0.1204, |
| "num_input_tokens_seen": 1951296, |
| "step": 3130 |
| }, |
| { |
| "epoch": 6.295180722891566, |
| "grad_norm": 5.46299934387207, |
| "learning_rate": 8.660829224024849e-06, |
| "loss": 0.1233, |
| "num_input_tokens_seen": 1954784, |
| "step": 3135 |
| }, |
| { |
| "epoch": 6.305220883534137, |
| "grad_norm": 4.170035362243652, |
| "learning_rate": 8.654855785991915e-06, |
| "loss": 0.134, |
| "num_input_tokens_seen": 1957664, |
| "step": 3140 |
| }, |
| { |
| "epoch": 6.315261044176707, |
| "grad_norm": 9.24455451965332, |
| "learning_rate": 8.648871125030576e-06, |
| "loss": 0.078, |
| "num_input_tokens_seen": 1960736, |
| "step": 3145 |
| }, |
| { |
| "epoch": 6.325301204819277, |
| "grad_norm": 4.151712417602539, |
| "learning_rate": 8.642875259517871e-06, |
| "loss": 0.0517, |
| "num_input_tokens_seen": 1964448, |
| "step": 3150 |
| }, |
| { |
| "epoch": 6.335341365461847, |
| "grad_norm": 2.0245907306671143, |
| "learning_rate": 8.636868207865244e-06, |
| "loss": 0.1463, |
| "num_input_tokens_seen": 1967808, |
| "step": 3155 |
| }, |
| { |
| "epoch": 6.3453815261044175, |
| "grad_norm": 3.730196475982666, |
| "learning_rate": 8.630849988518486e-06, |
| "loss": 0.0814, |
| "num_input_tokens_seen": 1970592, |
| "step": 3160 |
| }, |
| { |
| "epoch": 6.355421686746988, |
| "grad_norm": 0.7733585238456726, |
| "learning_rate": 8.62482061995768e-06, |
| "loss": 0.0911, |
| "num_input_tokens_seen": 1973856, |
| "step": 3165 |
| }, |
| { |
| "epoch": 6.365461847389558, |
| "grad_norm": 13.04262638092041, |
| "learning_rate": 8.618780120697152e-06, |
| "loss": 0.1716, |
| "num_input_tokens_seen": 1977760, |
| "step": 3170 |
| }, |
| { |
| "epoch": 6.375502008032129, |
| "grad_norm": 27.22624397277832, |
| "learning_rate": 8.612728509285395e-06, |
| "loss": 0.1568, |
| "num_input_tokens_seen": 1981408, |
| "step": 3175 |
| }, |
| { |
| "epoch": 6.385542168674699, |
| "grad_norm": 1.5505954027175903, |
| "learning_rate": 8.606665804305034e-06, |
| "loss": 0.0847, |
| "num_input_tokens_seen": 1985056, |
| "step": 3180 |
| }, |
| { |
| "epoch": 6.395582329317269, |
| "grad_norm": 7.722294330596924, |
| "learning_rate": 8.600592024372756e-06, |
| "loss": 0.1526, |
| "num_input_tokens_seen": 1988000, |
| "step": 3185 |
| }, |
| { |
| "epoch": 6.405622489959839, |
| "grad_norm": 4.913753509521484, |
| "learning_rate": 8.594507188139251e-06, |
| "loss": 0.1492, |
| "num_input_tokens_seen": 1991168, |
| "step": 3190 |
| }, |
| { |
| "epoch": 6.4156626506024095, |
| "grad_norm": 5.078114032745361, |
| "learning_rate": 8.588411314289169e-06, |
| "loss": 0.0747, |
| "num_input_tokens_seen": 1994560, |
| "step": 3195 |
| }, |
| { |
| "epoch": 6.42570281124498, |
| "grad_norm": 1.480210542678833, |
| "learning_rate": 8.582304421541045e-06, |
| "loss": 0.1097, |
| "num_input_tokens_seen": 1997248, |
| "step": 3200 |
| }, |
| { |
| "epoch": 6.43574297188755, |
| "grad_norm": 27.352954864501953, |
| "learning_rate": 8.576186528647253e-06, |
| "loss": 0.1648, |
| "num_input_tokens_seen": 2000736, |
| "step": 3205 |
| }, |
| { |
| "epoch": 6.445783132530121, |
| "grad_norm": 2.2470853328704834, |
| "learning_rate": 8.570057654393943e-06, |
| "loss": 0.0742, |
| "num_input_tokens_seen": 2004192, |
| "step": 3210 |
| }, |
| { |
| "epoch": 6.455823293172691, |
| "grad_norm": 6.305330753326416, |
| "learning_rate": 8.563917817600988e-06, |
| "loss": 0.1466, |
| "num_input_tokens_seen": 2007616, |
| "step": 3215 |
| }, |
| { |
| "epoch": 6.4658634538152615, |
| "grad_norm": 7.9827752113342285, |
| "learning_rate": 8.557767037121923e-06, |
| "loss": 0.1116, |
| "num_input_tokens_seen": 2010720, |
| "step": 3220 |
| }, |
| { |
| "epoch": 6.475903614457831, |
| "grad_norm": 4.350990295410156, |
| "learning_rate": 8.551605331843885e-06, |
| "loss": 0.1186, |
| "num_input_tokens_seen": 2014368, |
| "step": 3225 |
| }, |
| { |
| "epoch": 6.485943775100401, |
| "grad_norm": 2.8370282649993896, |
| "learning_rate": 8.545432720687558e-06, |
| "loss": 0.1393, |
| "num_input_tokens_seen": 2017280, |
| "step": 3230 |
| }, |
| { |
| "epoch": 6.495983935742972, |
| "grad_norm": 3.6251227855682373, |
| "learning_rate": 8.53924922260712e-06, |
| "loss": 0.2241, |
| "num_input_tokens_seen": 2020256, |
| "step": 3235 |
| }, |
| { |
| "epoch": 6.506024096385542, |
| "grad_norm": 2.832481861114502, |
| "learning_rate": 8.533054856590175e-06, |
| "loss": 0.1532, |
| "num_input_tokens_seen": 2023968, |
| "step": 3240 |
| }, |
| { |
| "epoch": 6.516064257028113, |
| "grad_norm": 7.332273483276367, |
| "learning_rate": 8.526849641657697e-06, |
| "loss": 0.1268, |
| "num_input_tokens_seen": 2026208, |
| "step": 3245 |
| }, |
| { |
| "epoch": 6.526104417670683, |
| "grad_norm": 15.295309066772461, |
| "learning_rate": 8.520633596863978e-06, |
| "loss": 0.1392, |
| "num_input_tokens_seen": 2028512, |
| "step": 3250 |
| }, |
| { |
| "epoch": 6.5361445783132535, |
| "grad_norm": 3.655897617340088, |
| "learning_rate": 8.514406741296565e-06, |
| "loss": 0.161, |
| "num_input_tokens_seen": 2031456, |
| "step": 3255 |
| }, |
| { |
| "epoch": 6.546184738955823, |
| "grad_norm": 3.424639940261841, |
| "learning_rate": 8.508169094076197e-06, |
| "loss": 0.108, |
| "num_input_tokens_seen": 2033888, |
| "step": 3260 |
| }, |
| { |
| "epoch": 6.556224899598393, |
| "grad_norm": 3.003465175628662, |
| "learning_rate": 8.501920674356755e-06, |
| "loss": 0.1776, |
| "num_input_tokens_seen": 2037312, |
| "step": 3265 |
| }, |
| { |
| "epoch": 6.566265060240964, |
| "grad_norm": 2.6872596740722656, |
| "learning_rate": 8.495661501325197e-06, |
| "loss": 0.1337, |
| "num_input_tokens_seen": 2040448, |
| "step": 3270 |
| }, |
| { |
| "epoch": 6.576305220883534, |
| "grad_norm": 5.785903453826904, |
| "learning_rate": 8.489391594201503e-06, |
| "loss": 0.1077, |
| "num_input_tokens_seen": 2043968, |
| "step": 3275 |
| }, |
| { |
| "epoch": 6.586345381526105, |
| "grad_norm": 4.747533798217773, |
| "learning_rate": 8.483110972238612e-06, |
| "loss": 0.1124, |
| "num_input_tokens_seen": 2047584, |
| "step": 3280 |
| }, |
| { |
| "epoch": 6.596385542168675, |
| "grad_norm": 9.165260314941406, |
| "learning_rate": 8.476819654722365e-06, |
| "loss": 0.0998, |
| "num_input_tokens_seen": 2051136, |
| "step": 3285 |
| }, |
| { |
| "epoch": 6.606425702811245, |
| "grad_norm": 3.9798431396484375, |
| "learning_rate": 8.47051766097145e-06, |
| "loss": 0.1312, |
| "num_input_tokens_seen": 2054784, |
| "step": 3290 |
| }, |
| { |
| "epoch": 6.616465863453815, |
| "grad_norm": 3.7183773517608643, |
| "learning_rate": 8.46420501033733e-06, |
| "loss": 0.1411, |
| "num_input_tokens_seen": 2058176, |
| "step": 3295 |
| }, |
| { |
| "epoch": 6.626506024096385, |
| "grad_norm": 1.8856403827667236, |
| "learning_rate": 8.457881722204201e-06, |
| "loss": 0.1068, |
| "num_input_tokens_seen": 2061472, |
| "step": 3300 |
| }, |
| { |
| "epoch": 6.636546184738956, |
| "grad_norm": 6.899387359619141, |
| "learning_rate": 8.45154781598892e-06, |
| "loss": 0.1221, |
| "num_input_tokens_seen": 2064288, |
| "step": 3305 |
| }, |
| { |
| "epoch": 6.646586345381526, |
| "grad_norm": 6.279878616333008, |
| "learning_rate": 8.445203311140944e-06, |
| "loss": 0.0784, |
| "num_input_tokens_seen": 2067936, |
| "step": 3310 |
| }, |
| { |
| "epoch": 6.656626506024097, |
| "grad_norm": 5.195173263549805, |
| "learning_rate": 8.438848227142282e-06, |
| "loss": 0.1184, |
| "num_input_tokens_seen": 2070752, |
| "step": 3315 |
| }, |
| { |
| "epoch": 6.666666666666667, |
| "grad_norm": 2.3702588081359863, |
| "learning_rate": 8.432482583507425e-06, |
| "loss": 0.0321, |
| "num_input_tokens_seen": 2073664, |
| "step": 3320 |
| }, |
| { |
| "epoch": 6.676706827309237, |
| "grad_norm": 0.9415891170501709, |
| "learning_rate": 8.42610639978329e-06, |
| "loss": 0.1295, |
| "num_input_tokens_seen": 2077376, |
| "step": 3325 |
| }, |
| { |
| "epoch": 6.686746987951807, |
| "grad_norm": 0.7066106796264648, |
| "learning_rate": 8.41971969554916e-06, |
| "loss": 0.074, |
| "num_input_tokens_seen": 2080608, |
| "step": 3330 |
| }, |
| { |
| "epoch": 6.696787148594377, |
| "grad_norm": 2.8969779014587402, |
| "learning_rate": 8.413322490416623e-06, |
| "loss": 0.1991, |
| "num_input_tokens_seen": 2083104, |
| "step": 3335 |
| }, |
| { |
| "epoch": 6.706827309236948, |
| "grad_norm": 2.466632843017578, |
| "learning_rate": 8.40691480402951e-06, |
| "loss": 0.1012, |
| "num_input_tokens_seen": 2085856, |
| "step": 3340 |
| }, |
| { |
| "epoch": 6.716867469879518, |
| "grad_norm": 8.746673583984375, |
| "learning_rate": 8.40049665606384e-06, |
| "loss": 0.1205, |
| "num_input_tokens_seen": 2088928, |
| "step": 3345 |
| }, |
| { |
| "epoch": 6.7269076305220885, |
| "grad_norm": 4.114537715911865, |
| "learning_rate": 8.394068066227752e-06, |
| "loss": 0.2032, |
| "num_input_tokens_seen": 2091616, |
| "step": 3350 |
| }, |
| { |
| "epoch": 6.736947791164659, |
| "grad_norm": 11.16876220703125, |
| "learning_rate": 8.387629054261454e-06, |
| "loss": 0.1548, |
| "num_input_tokens_seen": 2094272, |
| "step": 3355 |
| }, |
| { |
| "epoch": 6.746987951807229, |
| "grad_norm": 3.5258936882019043, |
| "learning_rate": 8.381179639937152e-06, |
| "loss": 0.1488, |
| "num_input_tokens_seen": 2097152, |
| "step": 3360 |
| }, |
| { |
| "epoch": 6.757028112449799, |
| "grad_norm": 5.957228660583496, |
| "learning_rate": 8.374719843059e-06, |
| "loss": 0.1051, |
| "num_input_tokens_seen": 2100480, |
| "step": 3365 |
| }, |
| { |
| "epoch": 6.767068273092369, |
| "grad_norm": 3.576631784439087, |
| "learning_rate": 8.368249683463028e-06, |
| "loss": 0.1419, |
| "num_input_tokens_seen": 2103552, |
| "step": 3370 |
| }, |
| { |
| "epoch": 6.77710843373494, |
| "grad_norm": 5.911473274230957, |
| "learning_rate": 8.361769181017089e-06, |
| "loss": 0.1153, |
| "num_input_tokens_seen": 2106848, |
| "step": 3375 |
| }, |
| { |
| "epoch": 6.78714859437751, |
| "grad_norm": 7.324421405792236, |
| "learning_rate": 8.355278355620795e-06, |
| "loss": 0.1921, |
| "num_input_tokens_seen": 2109632, |
| "step": 3380 |
| }, |
| { |
| "epoch": 6.7971887550200805, |
| "grad_norm": 1.6949467658996582, |
| "learning_rate": 8.348777227205462e-06, |
| "loss": 0.1737, |
| "num_input_tokens_seen": 2112128, |
| "step": 3385 |
| }, |
| { |
| "epoch": 6.807228915662651, |
| "grad_norm": 4.560188293457031, |
| "learning_rate": 8.342265815734034e-06, |
| "loss": 0.0897, |
| "num_input_tokens_seen": 2114592, |
| "step": 3390 |
| }, |
| { |
| "epoch": 6.817269076305221, |
| "grad_norm": 10.877038955688477, |
| "learning_rate": 8.335744141201037e-06, |
| "loss": 0.1537, |
| "num_input_tokens_seen": 2117728, |
| "step": 3395 |
| }, |
| { |
| "epoch": 6.827309236947791, |
| "grad_norm": 9.559526443481445, |
| "learning_rate": 8.329212223632511e-06, |
| "loss": 0.1561, |
| "num_input_tokens_seen": 2121792, |
| "step": 3400 |
| }, |
| { |
| "epoch": 6.837349397590361, |
| "grad_norm": 2.976337194442749, |
| "learning_rate": 8.32267008308595e-06, |
| "loss": 0.1087, |
| "num_input_tokens_seen": 2124736, |
| "step": 3405 |
| }, |
| { |
| "epoch": 6.847389558232932, |
| "grad_norm": 3.3281893730163574, |
| "learning_rate": 8.316117739650235e-06, |
| "loss": 0.133, |
| "num_input_tokens_seen": 2127456, |
| "step": 3410 |
| }, |
| { |
| "epoch": 6.857429718875502, |
| "grad_norm": 3.8284034729003906, |
| "learning_rate": 8.309555213445583e-06, |
| "loss": 0.1316, |
| "num_input_tokens_seen": 2130720, |
| "step": 3415 |
| }, |
| { |
| "epoch": 6.867469879518072, |
| "grad_norm": 5.083580017089844, |
| "learning_rate": 8.302982524623475e-06, |
| "loss": 0.0751, |
| "num_input_tokens_seen": 2133376, |
| "step": 3420 |
| }, |
| { |
| "epoch": 6.877510040160643, |
| "grad_norm": 2.2928466796875, |
| "learning_rate": 8.296399693366601e-06, |
| "loss": 0.1011, |
| "num_input_tokens_seen": 2135872, |
| "step": 3425 |
| }, |
| { |
| "epoch": 6.887550200803213, |
| "grad_norm": 0.9877734184265137, |
| "learning_rate": 8.289806739888791e-06, |
| "loss": 0.1031, |
| "num_input_tokens_seen": 2138592, |
| "step": 3430 |
| }, |
| { |
| "epoch": 6.897590361445783, |
| "grad_norm": 12.757080078125, |
| "learning_rate": 8.283203684434963e-06, |
| "loss": 0.1485, |
| "num_input_tokens_seen": 2141312, |
| "step": 3435 |
| }, |
| { |
| "epoch": 6.907630522088353, |
| "grad_norm": 2.4897549152374268, |
| "learning_rate": 8.27659054728105e-06, |
| "loss": 0.1108, |
| "num_input_tokens_seen": 2143936, |
| "step": 3440 |
| }, |
| { |
| "epoch": 6.917670682730924, |
| "grad_norm": 1.2650902271270752, |
| "learning_rate": 8.269967348733947e-06, |
| "loss": 0.1032, |
| "num_input_tokens_seen": 2147456, |
| "step": 3445 |
| }, |
| { |
| "epoch": 6.927710843373494, |
| "grad_norm": 11.644404411315918, |
| "learning_rate": 8.26333410913144e-06, |
| "loss": 0.1131, |
| "num_input_tokens_seen": 2150624, |
| "step": 3450 |
| }, |
| { |
| "epoch": 6.937751004016064, |
| "grad_norm": 4.924938201904297, |
| "learning_rate": 8.256690848842153e-06, |
| "loss": 0.124, |
| "num_input_tokens_seen": 2154176, |
| "step": 3455 |
| }, |
| { |
| "epoch": 6.947791164658635, |
| "grad_norm": 15.072479248046875, |
| "learning_rate": 8.250037588265473e-06, |
| "loss": 0.1661, |
| "num_input_tokens_seen": 2157056, |
| "step": 3460 |
| }, |
| { |
| "epoch": 6.957831325301205, |
| "grad_norm": 1.2311360836029053, |
| "learning_rate": 8.243374347831505e-06, |
| "loss": 0.0956, |
| "num_input_tokens_seen": 2160480, |
| "step": 3465 |
| }, |
| { |
| "epoch": 6.967871485943775, |
| "grad_norm": 4.724909782409668, |
| "learning_rate": 8.236701148000989e-06, |
| "loss": 0.1597, |
| "num_input_tokens_seen": 2163840, |
| "step": 3470 |
| }, |
| { |
| "epoch": 6.977911646586345, |
| "grad_norm": 4.713107585906982, |
| "learning_rate": 8.230018009265255e-06, |
| "loss": 0.1122, |
| "num_input_tokens_seen": 2166848, |
| "step": 3475 |
| }, |
| { |
| "epoch": 6.9879518072289155, |
| "grad_norm": 3.1349759101867676, |
| "learning_rate": 8.223324952146145e-06, |
| "loss": 0.1319, |
| "num_input_tokens_seen": 2169568, |
| "step": 3480 |
| }, |
| { |
| "epoch": 6.997991967871486, |
| "grad_norm": 6.093204021453857, |
| "learning_rate": 8.216621997195966e-06, |
| "loss": 0.0853, |
| "num_input_tokens_seen": 2172288, |
| "step": 3485 |
| }, |
| { |
| "epoch": 7.008032128514056, |
| "grad_norm": 2.002347946166992, |
| "learning_rate": 8.209909164997409e-06, |
| "loss": 0.1287, |
| "num_input_tokens_seen": 2175136, |
| "step": 3490 |
| }, |
| { |
| "epoch": 7.018072289156627, |
| "grad_norm": 9.41158390045166, |
| "learning_rate": 8.203186476163503e-06, |
| "loss": 0.0723, |
| "num_input_tokens_seen": 2178848, |
| "step": 3495 |
| }, |
| { |
| "epoch": 7.028112449799197, |
| "grad_norm": 11.366344451904297, |
| "learning_rate": 8.196453951337538e-06, |
| "loss": 0.0719, |
| "num_input_tokens_seen": 2181568, |
| "step": 3500 |
| }, |
| { |
| "epoch": 7.038152610441767, |
| "grad_norm": 0.7649056315422058, |
| "learning_rate": 8.189711611193012e-06, |
| "loss": 0.1081, |
| "num_input_tokens_seen": 2185664, |
| "step": 3505 |
| }, |
| { |
| "epoch": 7.048192771084337, |
| "grad_norm": 3.984673500061035, |
| "learning_rate": 8.182959476433555e-06, |
| "loss": 0.1156, |
| "num_input_tokens_seen": 2189536, |
| "step": 3510 |
| }, |
| { |
| "epoch": 7.0582329317269075, |
| "grad_norm": 12.080748558044434, |
| "learning_rate": 8.176197567792883e-06, |
| "loss": 0.0488, |
| "num_input_tokens_seen": 2192672, |
| "step": 3515 |
| }, |
| { |
| "epoch": 7.068273092369478, |
| "grad_norm": 4.369615077972412, |
| "learning_rate": 8.169425906034718e-06, |
| "loss": 0.1427, |
| "num_input_tokens_seen": 2195136, |
| "step": 3520 |
| }, |
| { |
| "epoch": 7.078313253012048, |
| "grad_norm": 7.291754722595215, |
| "learning_rate": 8.162644511952735e-06, |
| "loss": 0.1883, |
| "num_input_tokens_seen": 2198368, |
| "step": 3525 |
| }, |
| { |
| "epoch": 7.088353413654619, |
| "grad_norm": 1.121962308883667, |
| "learning_rate": 8.155853406370488e-06, |
| "loss": 0.1243, |
| "num_input_tokens_seen": 2201376, |
| "step": 3530 |
| }, |
| { |
| "epoch": 7.098393574297189, |
| "grad_norm": 5.575697422027588, |
| "learning_rate": 8.149052610141357e-06, |
| "loss": 0.1082, |
| "num_input_tokens_seen": 2204160, |
| "step": 3535 |
| }, |
| { |
| "epoch": 7.108433734939759, |
| "grad_norm": 11.145264625549316, |
| "learning_rate": 8.142242144148478e-06, |
| "loss": 0.1217, |
| "num_input_tokens_seen": 2207296, |
| "step": 3540 |
| }, |
| { |
| "epoch": 7.118473895582329, |
| "grad_norm": 0.5861991047859192, |
| "learning_rate": 8.135422029304682e-06, |
| "loss": 0.0637, |
| "num_input_tokens_seen": 2210528, |
| "step": 3545 |
| }, |
| { |
| "epoch": 7.128514056224899, |
| "grad_norm": 10.110943794250488, |
| "learning_rate": 8.128592286552422e-06, |
| "loss": 0.101, |
| "num_input_tokens_seen": 2212960, |
| "step": 3550 |
| }, |
| { |
| "epoch": 7.13855421686747, |
| "grad_norm": 22.04067039489746, |
| "learning_rate": 8.12175293686372e-06, |
| "loss": 0.114, |
| "num_input_tokens_seen": 2216032, |
| "step": 3555 |
| }, |
| { |
| "epoch": 7.14859437751004, |
| "grad_norm": 30.2784366607666, |
| "learning_rate": 8.1149040012401e-06, |
| "loss": 0.1794, |
| "num_input_tokens_seen": 2218944, |
| "step": 3560 |
| }, |
| { |
| "epoch": 7.158634538152611, |
| "grad_norm": 17.950511932373047, |
| "learning_rate": 8.108045500712518e-06, |
| "loss": 0.1161, |
| "num_input_tokens_seen": 2222336, |
| "step": 3565 |
| }, |
| { |
| "epoch": 7.168674698795181, |
| "grad_norm": 28.468650817871094, |
| "learning_rate": 8.101177456341301e-06, |
| "loss": 0.1088, |
| "num_input_tokens_seen": 2225472, |
| "step": 3570 |
| }, |
| { |
| "epoch": 7.178714859437751, |
| "grad_norm": 10.21017074584961, |
| "learning_rate": 8.094299889216081e-06, |
| "loss": 0.1078, |
| "num_input_tokens_seen": 2228320, |
| "step": 3575 |
| }, |
| { |
| "epoch": 7.188755020080321, |
| "grad_norm": 6.275518417358398, |
| "learning_rate": 8.087412820455738e-06, |
| "loss": 0.1111, |
| "num_input_tokens_seen": 2231648, |
| "step": 3580 |
| }, |
| { |
| "epoch": 7.198795180722891, |
| "grad_norm": 3.738758087158203, |
| "learning_rate": 8.080516271208319e-06, |
| "loss": 0.0929, |
| "num_input_tokens_seen": 2234560, |
| "step": 3585 |
| }, |
| { |
| "epoch": 7.208835341365462, |
| "grad_norm": 4.079806327819824, |
| "learning_rate": 8.07361026265099e-06, |
| "loss": 0.1326, |
| "num_input_tokens_seen": 2237728, |
| "step": 3590 |
| }, |
| { |
| "epoch": 7.218875502008032, |
| "grad_norm": 4.522058010101318, |
| "learning_rate": 8.066694815989961e-06, |
| "loss": 0.0802, |
| "num_input_tokens_seen": 2240992, |
| "step": 3595 |
| }, |
| { |
| "epoch": 7.228915662650603, |
| "grad_norm": 7.339025020599365, |
| "learning_rate": 8.059769952460423e-06, |
| "loss": 0.1238, |
| "num_input_tokens_seen": 2244608, |
| "step": 3600 |
| }, |
| { |
| "epoch": 7.238955823293173, |
| "grad_norm": 4.954975128173828, |
| "learning_rate": 8.052835693326484e-06, |
| "loss": 0.1064, |
| "num_input_tokens_seen": 2247840, |
| "step": 3605 |
| }, |
| { |
| "epoch": 7.2489959839357425, |
| "grad_norm": 31.905029296875, |
| "learning_rate": 8.045892059881101e-06, |
| "loss": 0.2156, |
| "num_input_tokens_seen": 2251104, |
| "step": 3610 |
| }, |
| { |
| "epoch": 7.259036144578313, |
| "grad_norm": 2.187739372253418, |
| "learning_rate": 8.038939073446022e-06, |
| "loss": 0.136, |
| "num_input_tokens_seen": 2254240, |
| "step": 3615 |
| }, |
| { |
| "epoch": 7.269076305220883, |
| "grad_norm": 1.4481827020645142, |
| "learning_rate": 8.031976755371709e-06, |
| "loss": 0.119, |
| "num_input_tokens_seen": 2257472, |
| "step": 3620 |
| }, |
| { |
| "epoch": 7.279116465863454, |
| "grad_norm": 1.2066372632980347, |
| "learning_rate": 8.025005127037282e-06, |
| "loss": 0.0584, |
| "num_input_tokens_seen": 2260640, |
| "step": 3625 |
| }, |
| { |
| "epoch": 7.289156626506024, |
| "grad_norm": 6.761447429656982, |
| "learning_rate": 8.018024209850448e-06, |
| "loss": 0.1104, |
| "num_input_tokens_seen": 2264544, |
| "step": 3630 |
| }, |
| { |
| "epoch": 7.2991967871485945, |
| "grad_norm": 3.3396995067596436, |
| "learning_rate": 8.01103402524744e-06, |
| "loss": 0.1133, |
| "num_input_tokens_seen": 2268064, |
| "step": 3635 |
| }, |
| { |
| "epoch": 7.309236947791165, |
| "grad_norm": 13.4473876953125, |
| "learning_rate": 8.004034594692946e-06, |
| "loss": 0.098, |
| "num_input_tokens_seen": 2271136, |
| "step": 3640 |
| }, |
| { |
| "epoch": 7.3192771084337345, |
| "grad_norm": 4.999135971069336, |
| "learning_rate": 7.997025939680047e-06, |
| "loss": 0.0922, |
| "num_input_tokens_seen": 2274016, |
| "step": 3645 |
| }, |
| { |
| "epoch": 7.329317269076305, |
| "grad_norm": 7.845126628875732, |
| "learning_rate": 7.990008081730145e-06, |
| "loss": 0.1477, |
| "num_input_tokens_seen": 2277344, |
| "step": 3650 |
| }, |
| { |
| "epoch": 7.339357429718875, |
| "grad_norm": 5.341637134552002, |
| "learning_rate": 7.982981042392907e-06, |
| "loss": 0.0949, |
| "num_input_tokens_seen": 2280480, |
| "step": 3655 |
| }, |
| { |
| "epoch": 7.349397590361446, |
| "grad_norm": 10.169334411621094, |
| "learning_rate": 7.975944843246195e-06, |
| "loss": 0.1056, |
| "num_input_tokens_seen": 2283616, |
| "step": 3660 |
| }, |
| { |
| "epoch": 7.359437751004016, |
| "grad_norm": 8.788763046264648, |
| "learning_rate": 7.968899505895987e-06, |
| "loss": 0.0823, |
| "num_input_tokens_seen": 2285888, |
| "step": 3665 |
| }, |
| { |
| "epoch": 7.3694779116465865, |
| "grad_norm": 3.360013008117676, |
| "learning_rate": 7.961845051976334e-06, |
| "loss": 0.0945, |
| "num_input_tokens_seen": 2289920, |
| "step": 3670 |
| }, |
| { |
| "epoch": 7.379518072289157, |
| "grad_norm": 5.178712844848633, |
| "learning_rate": 7.954781503149272e-06, |
| "loss": 0.1121, |
| "num_input_tokens_seen": 2293152, |
| "step": 3675 |
| }, |
| { |
| "epoch": 7.389558232931727, |
| "grad_norm": 4.379055976867676, |
| "learning_rate": 7.94770888110477e-06, |
| "loss": 0.1065, |
| "num_input_tokens_seen": 2295680, |
| "step": 3680 |
| }, |
| { |
| "epoch": 7.399598393574297, |
| "grad_norm": 7.7414069175720215, |
| "learning_rate": 7.940627207560655e-06, |
| "loss": 0.1099, |
| "num_input_tokens_seen": 2299264, |
| "step": 3685 |
| }, |
| { |
| "epoch": 7.409638554216867, |
| "grad_norm": 11.378726959228516, |
| "learning_rate": 7.933536504262554e-06, |
| "loss": 0.1326, |
| "num_input_tokens_seen": 2302528, |
| "step": 3690 |
| }, |
| { |
| "epoch": 7.419678714859438, |
| "grad_norm": 4.260462284088135, |
| "learning_rate": 7.926436792983813e-06, |
| "loss": 0.2, |
| "num_input_tokens_seen": 2305344, |
| "step": 3695 |
| }, |
| { |
| "epoch": 7.429718875502008, |
| "grad_norm": 12.941370964050293, |
| "learning_rate": 7.919328095525446e-06, |
| "loss": 0.1095, |
| "num_input_tokens_seen": 2308480, |
| "step": 3700 |
| }, |
| { |
| "epoch": 7.4397590361445785, |
| "grad_norm": 14.121201515197754, |
| "learning_rate": 7.912210433716054e-06, |
| "loss": 0.0761, |
| "num_input_tokens_seen": 2311712, |
| "step": 3705 |
| }, |
| { |
| "epoch": 7.449799196787149, |
| "grad_norm": 1.1341758966445923, |
| "learning_rate": 7.90508382941177e-06, |
| "loss": 0.1312, |
| "num_input_tokens_seen": 2314816, |
| "step": 3710 |
| }, |
| { |
| "epoch": 7.459839357429719, |
| "grad_norm": 2.659677743911743, |
| "learning_rate": 7.897948304496189e-06, |
| "loss": 0.1492, |
| "num_input_tokens_seen": 2317088, |
| "step": 3715 |
| }, |
| { |
| "epoch": 7.469879518072289, |
| "grad_norm": 1.2822554111480713, |
| "learning_rate": 7.890803880880291e-06, |
| "loss": 0.0939, |
| "num_input_tokens_seen": 2320192, |
| "step": 3720 |
| }, |
| { |
| "epoch": 7.479919678714859, |
| "grad_norm": 0.40696293115615845, |
| "learning_rate": 7.883650580502384e-06, |
| "loss": 0.1041, |
| "num_input_tokens_seen": 2323328, |
| "step": 3725 |
| }, |
| { |
| "epoch": 7.48995983935743, |
| "grad_norm": 7.722894191741943, |
| "learning_rate": 7.876488425328037e-06, |
| "loss": 0.1267, |
| "num_input_tokens_seen": 2325760, |
| "step": 3730 |
| }, |
| { |
| "epoch": 7.5, |
| "grad_norm": 0.4684428870677948, |
| "learning_rate": 7.869317437350007e-06, |
| "loss": 0.0458, |
| "num_input_tokens_seen": 2329280, |
| "step": 3735 |
| }, |
| { |
| "epoch": 7.51004016064257, |
| "grad_norm": 6.411214351654053, |
| "learning_rate": 7.862137638588171e-06, |
| "loss": 0.1071, |
| "num_input_tokens_seen": 2332544, |
| "step": 3740 |
| }, |
| { |
| "epoch": 7.520080321285141, |
| "grad_norm": 7.014988422393799, |
| "learning_rate": 7.854949051089467e-06, |
| "loss": 0.1379, |
| "num_input_tokens_seen": 2336352, |
| "step": 3745 |
| }, |
| { |
| "epoch": 7.530120481927711, |
| "grad_norm": 0.7333240509033203, |
| "learning_rate": 7.847751696927813e-06, |
| "loss": 0.0351, |
| "num_input_tokens_seen": 2339840, |
| "step": 3750 |
| }, |
| { |
| "epoch": 7.540160642570282, |
| "grad_norm": 2.350759267807007, |
| "learning_rate": 7.840545598204056e-06, |
| "loss": 0.1339, |
| "num_input_tokens_seen": 2343808, |
| "step": 3755 |
| }, |
| { |
| "epoch": 7.550200803212851, |
| "grad_norm": 11.666228294372559, |
| "learning_rate": 7.833330777045886e-06, |
| "loss": 0.1692, |
| "num_input_tokens_seen": 2346816, |
| "step": 3760 |
| }, |
| { |
| "epoch": 7.5602409638554215, |
| "grad_norm": 13.286596298217773, |
| "learning_rate": 7.826107255607784e-06, |
| "loss": 0.1368, |
| "num_input_tokens_seen": 2349888, |
| "step": 3765 |
| }, |
| { |
| "epoch": 7.570281124497992, |
| "grad_norm": 8.502277374267578, |
| "learning_rate": 7.818875056070944e-06, |
| "loss": 0.0795, |
| "num_input_tokens_seen": 2352960, |
| "step": 3770 |
| }, |
| { |
| "epoch": 7.580321285140562, |
| "grad_norm": 9.710450172424316, |
| "learning_rate": 7.811634200643202e-06, |
| "loss": 0.1426, |
| "num_input_tokens_seen": 2356672, |
| "step": 3775 |
| }, |
| { |
| "epoch": 7.590361445783133, |
| "grad_norm": 13.459878921508789, |
| "learning_rate": 7.804384711558983e-06, |
| "loss": 0.1388, |
| "num_input_tokens_seen": 2360032, |
| "step": 3780 |
| }, |
| { |
| "epoch": 7.600401606425703, |
| "grad_norm": 5.783694267272949, |
| "learning_rate": 7.797126611079219e-06, |
| "loss": 0.0769, |
| "num_input_tokens_seen": 2363456, |
| "step": 3785 |
| }, |
| { |
| "epoch": 7.610441767068274, |
| "grad_norm": 13.784896850585938, |
| "learning_rate": 7.789859921491288e-06, |
| "loss": 0.1245, |
| "num_input_tokens_seen": 2366912, |
| "step": 3790 |
| }, |
| { |
| "epoch": 7.620481927710843, |
| "grad_norm": 6.058487892150879, |
| "learning_rate": 7.782584665108934e-06, |
| "loss": 0.1209, |
| "num_input_tokens_seen": 2371008, |
| "step": 3795 |
| }, |
| { |
| "epoch": 7.6305220883534135, |
| "grad_norm": 10.394042015075684, |
| "learning_rate": 7.775300864272214e-06, |
| "loss": 0.0855, |
| "num_input_tokens_seen": 2374016, |
| "step": 3800 |
| }, |
| { |
| "epoch": 7.640562248995984, |
| "grad_norm": 9.327715873718262, |
| "learning_rate": 7.768008541347423e-06, |
| "loss": 0.166, |
| "num_input_tokens_seen": 2377408, |
| "step": 3805 |
| }, |
| { |
| "epoch": 7.650602409638554, |
| "grad_norm": 27.500553131103516, |
| "learning_rate": 7.760707718727023e-06, |
| "loss": 0.2516, |
| "num_input_tokens_seen": 2379680, |
| "step": 3810 |
| }, |
| { |
| "epoch": 7.660642570281125, |
| "grad_norm": 11.609461784362793, |
| "learning_rate": 7.753398418829572e-06, |
| "loss": 0.1049, |
| "num_input_tokens_seen": 2383200, |
| "step": 3815 |
| }, |
| { |
| "epoch": 7.670682730923695, |
| "grad_norm": 1.3528823852539062, |
| "learning_rate": 7.746080664099667e-06, |
| "loss": 0.0645, |
| "num_input_tokens_seen": 2386048, |
| "step": 3820 |
| }, |
| { |
| "epoch": 7.6807228915662655, |
| "grad_norm": 1.679419994354248, |
| "learning_rate": 7.73875447700786e-06, |
| "loss": 0.0484, |
| "num_input_tokens_seen": 2389152, |
| "step": 3825 |
| }, |
| { |
| "epoch": 7.690763052208835, |
| "grad_norm": 21.143573760986328, |
| "learning_rate": 7.731419880050599e-06, |
| "loss": 0.198, |
| "num_input_tokens_seen": 2392064, |
| "step": 3830 |
| }, |
| { |
| "epoch": 7.7008032128514055, |
| "grad_norm": 11.375654220581055, |
| "learning_rate": 7.72407689575016e-06, |
| "loss": 0.1013, |
| "num_input_tokens_seen": 2395488, |
| "step": 3835 |
| }, |
| { |
| "epoch": 7.710843373493976, |
| "grad_norm": 38.27085876464844, |
| "learning_rate": 7.716725546654564e-06, |
| "loss": 0.1659, |
| "num_input_tokens_seen": 2398496, |
| "step": 3840 |
| }, |
| { |
| "epoch": 7.720883534136546, |
| "grad_norm": 11.487302780151367, |
| "learning_rate": 7.709365855337528e-06, |
| "loss": 0.0891, |
| "num_input_tokens_seen": 2401728, |
| "step": 3845 |
| }, |
| { |
| "epoch": 7.730923694779117, |
| "grad_norm": 3.648078203201294, |
| "learning_rate": 7.701997844398379e-06, |
| "loss": 0.1085, |
| "num_input_tokens_seen": 2404320, |
| "step": 3850 |
| }, |
| { |
| "epoch": 7.740963855421687, |
| "grad_norm": 13.500536918640137, |
| "learning_rate": 7.694621536461995e-06, |
| "loss": 0.1266, |
| "num_input_tokens_seen": 2407424, |
| "step": 3855 |
| }, |
| { |
| "epoch": 7.7510040160642575, |
| "grad_norm": 2.1225502490997314, |
| "learning_rate": 7.687236954178729e-06, |
| "loss": 0.0699, |
| "num_input_tokens_seen": 2411136, |
| "step": 3860 |
| }, |
| { |
| "epoch": 7.761044176706827, |
| "grad_norm": 7.856932640075684, |
| "learning_rate": 7.67984412022434e-06, |
| "loss": 0.099, |
| "num_input_tokens_seen": 2414080, |
| "step": 3865 |
| }, |
| { |
| "epoch": 7.771084337349397, |
| "grad_norm": 9.14748477935791, |
| "learning_rate": 7.672443057299931e-06, |
| "loss": 0.1714, |
| "num_input_tokens_seen": 2416832, |
| "step": 3870 |
| }, |
| { |
| "epoch": 7.781124497991968, |
| "grad_norm": 0.6267831325531006, |
| "learning_rate": 7.665033788131869e-06, |
| "loss": 0.0734, |
| "num_input_tokens_seen": 2419680, |
| "step": 3875 |
| }, |
| { |
| "epoch": 7.791164658634538, |
| "grad_norm": 4.2000250816345215, |
| "learning_rate": 7.657616335471723e-06, |
| "loss": 0.1235, |
| "num_input_tokens_seen": 2422848, |
| "step": 3880 |
| }, |
| { |
| "epoch": 7.801204819277109, |
| "grad_norm": 29.275188446044922, |
| "learning_rate": 7.650190722096188e-06, |
| "loss": 0.1255, |
| "num_input_tokens_seen": 2426816, |
| "step": 3885 |
| }, |
| { |
| "epoch": 7.811244979919679, |
| "grad_norm": 4.945994853973389, |
| "learning_rate": 7.64275697080702e-06, |
| "loss": 0.1027, |
| "num_input_tokens_seen": 2429440, |
| "step": 3890 |
| }, |
| { |
| "epoch": 7.821285140562249, |
| "grad_norm": 3.7940685749053955, |
| "learning_rate": 7.635315104430959e-06, |
| "loss": 0.0784, |
| "num_input_tokens_seen": 2432064, |
| "step": 3895 |
| }, |
| { |
| "epoch": 7.831325301204819, |
| "grad_norm": 1.6572811603546143, |
| "learning_rate": 7.6278651458196724e-06, |
| "loss": 0.0596, |
| "num_input_tokens_seen": 2435328, |
| "step": 3900 |
| }, |
| { |
| "epoch": 7.841365461847389, |
| "grad_norm": 15.092347145080566, |
| "learning_rate": 7.620407117849674e-06, |
| "loss": 0.1102, |
| "num_input_tokens_seen": 2438240, |
| "step": 3905 |
| }, |
| { |
| "epoch": 7.85140562248996, |
| "grad_norm": 0.5130079388618469, |
| "learning_rate": 7.6129410434222505e-06, |
| "loss": 0.036, |
| "num_input_tokens_seen": 2441312, |
| "step": 3910 |
| }, |
| { |
| "epoch": 7.86144578313253, |
| "grad_norm": 25.864091873168945, |
| "learning_rate": 7.6054669454634025e-06, |
| "loss": 0.0824, |
| "num_input_tokens_seen": 2444288, |
| "step": 3915 |
| }, |
| { |
| "epoch": 7.871485943775101, |
| "grad_norm": 36.89551544189453, |
| "learning_rate": 7.597984846923765e-06, |
| "loss": 0.0896, |
| "num_input_tokens_seen": 2447360, |
| "step": 3920 |
| }, |
| { |
| "epoch": 7.881526104417671, |
| "grad_norm": 33.04966354370117, |
| "learning_rate": 7.5904947707785434e-06, |
| "loss": 0.1538, |
| "num_input_tokens_seen": 2449920, |
| "step": 3925 |
| }, |
| { |
| "epoch": 7.891566265060241, |
| "grad_norm": 40.92861557006836, |
| "learning_rate": 7.582996740027438e-06, |
| "loss": 0.2447, |
| "num_input_tokens_seen": 2453120, |
| "step": 3930 |
| }, |
| { |
| "epoch": 7.901606425702811, |
| "grad_norm": 7.3696770668029785, |
| "learning_rate": 7.575490777694572e-06, |
| "loss": 0.1763, |
| "num_input_tokens_seen": 2456512, |
| "step": 3935 |
| }, |
| { |
| "epoch": 7.911646586345381, |
| "grad_norm": 10.613399505615234, |
| "learning_rate": 7.567976906828431e-06, |
| "loss": 0.1581, |
| "num_input_tokens_seen": 2459488, |
| "step": 3940 |
| }, |
| { |
| "epoch": 7.921686746987952, |
| "grad_norm": 10.762381553649902, |
| "learning_rate": 7.560455150501781e-06, |
| "loss": 0.1783, |
| "num_input_tokens_seen": 2462880, |
| "step": 3945 |
| }, |
| { |
| "epoch": 7.931726907630522, |
| "grad_norm": 9.3571138381958, |
| "learning_rate": 7.552925531811601e-06, |
| "loss": 0.1394, |
| "num_input_tokens_seen": 2466432, |
| "step": 3950 |
| }, |
| { |
| "epoch": 7.9417670682730925, |
| "grad_norm": 7.034041881561279, |
| "learning_rate": 7.545388073879018e-06, |
| "loss": 0.125, |
| "num_input_tokens_seen": 2470048, |
| "step": 3955 |
| }, |
| { |
| "epoch": 7.951807228915663, |
| "grad_norm": 3.9113035202026367, |
| "learning_rate": 7.537842799849223e-06, |
| "loss": 0.1295, |
| "num_input_tokens_seen": 2473344, |
| "step": 3960 |
| }, |
| { |
| "epoch": 7.961847389558233, |
| "grad_norm": 3.321641445159912, |
| "learning_rate": 7.530289732891415e-06, |
| "loss": 0.09, |
| "num_input_tokens_seen": 2475904, |
| "step": 3965 |
| }, |
| { |
| "epoch": 7.971887550200803, |
| "grad_norm": 4.113049507141113, |
| "learning_rate": 7.522728896198718e-06, |
| "loss": 0.1257, |
| "num_input_tokens_seen": 2479584, |
| "step": 3970 |
| }, |
| { |
| "epoch": 7.981927710843373, |
| "grad_norm": 4.71891975402832, |
| "learning_rate": 7.515160312988117e-06, |
| "loss": 0.0629, |
| "num_input_tokens_seen": 2482208, |
| "step": 3975 |
| }, |
| { |
| "epoch": 7.991967871485944, |
| "grad_norm": 3.222778081893921, |
| "learning_rate": 7.507584006500381e-06, |
| "loss": 0.0842, |
| "num_input_tokens_seen": 2485760, |
| "step": 3980 |
| }, |
| { |
| "epoch": 8.0, |
| "eval_loss": 0.1914680153131485, |
| "eval_runtime": 8.0714, |
| "eval_samples_per_second": 61.699, |
| "eval_steps_per_second": 15.487, |
| "num_input_tokens_seen": 2487712, |
| "step": 3984 |
| }, |
| { |
| "epoch": 8.002008032128513, |
| "grad_norm": 9.23020076751709, |
| "learning_rate": 7.500000000000001e-06, |
| "loss": 0.128, |
| "num_input_tokens_seen": 2488608, |
| "step": 3985 |
| }, |
| { |
| "epoch": 8.012048192771084, |
| "grad_norm": 21.651845932006836, |
| "learning_rate": 7.492408316775105e-06, |
| "loss": 0.1054, |
| "num_input_tokens_seen": 2491424, |
| "step": 3990 |
| }, |
| { |
| "epoch": 8.022088353413654, |
| "grad_norm": 17.216054916381836, |
| "learning_rate": 7.4848089801374005e-06, |
| "loss": 0.1303, |
| "num_input_tokens_seen": 2495136, |
| "step": 3995 |
| }, |
| { |
| "epoch": 8.032128514056225, |
| "grad_norm": 10.99027156829834, |
| "learning_rate": 7.47720201342209e-06, |
| "loss": 0.0562, |
| "num_input_tokens_seen": 2497504, |
| "step": 4000 |
| }, |
| { |
| "epoch": 8.042168674698795, |
| "grad_norm": 15.369956970214844, |
| "learning_rate": 7.469587439987811e-06, |
| "loss": 0.0594, |
| "num_input_tokens_seen": 2500928, |
| "step": 4005 |
| }, |
| { |
| "epoch": 8.052208835341366, |
| "grad_norm": 1.2656151056289673, |
| "learning_rate": 7.461965283216557e-06, |
| "loss": 0.0109, |
| "num_input_tokens_seen": 2504288, |
| "step": 4010 |
| }, |
| { |
| "epoch": 8.062248995983936, |
| "grad_norm": 1.8451780080795288, |
| "learning_rate": 7.454335566513603e-06, |
| "loss": 0.1269, |
| "num_input_tokens_seen": 2507072, |
| "step": 4015 |
| }, |
| { |
| "epoch": 8.072289156626505, |
| "grad_norm": 10.279462814331055, |
| "learning_rate": 7.446698313307445e-06, |
| "loss": 0.0798, |
| "num_input_tokens_seen": 2510176, |
| "step": 4020 |
| }, |
| { |
| "epoch": 8.082329317269076, |
| "grad_norm": 18.197235107421875, |
| "learning_rate": 7.43905354704972e-06, |
| "loss": 0.0449, |
| "num_input_tokens_seen": 2512576, |
| "step": 4025 |
| }, |
| { |
| "epoch": 8.092369477911646, |
| "grad_norm": 5.9086761474609375, |
| "learning_rate": 7.431401291215131e-06, |
| "loss": 0.1062, |
| "num_input_tokens_seen": 2515744, |
| "step": 4030 |
| }, |
| { |
| "epoch": 8.102409638554217, |
| "grad_norm": 5.934508800506592, |
| "learning_rate": 7.4237415693013846e-06, |
| "loss": 0.1092, |
| "num_input_tokens_seen": 2519136, |
| "step": 4035 |
| }, |
| { |
| "epoch": 8.112449799196787, |
| "grad_norm": 2.209650754928589, |
| "learning_rate": 7.416074404829108e-06, |
| "loss": 0.1781, |
| "num_input_tokens_seen": 2522432, |
| "step": 4040 |
| }, |
| { |
| "epoch": 8.122489959839358, |
| "grad_norm": 28.947336196899414, |
| "learning_rate": 7.408399821341787e-06, |
| "loss": 0.0839, |
| "num_input_tokens_seen": 2525856, |
| "step": 4045 |
| }, |
| { |
| "epoch": 8.132530120481928, |
| "grad_norm": 1.6147160530090332, |
| "learning_rate": 7.400717842405688e-06, |
| "loss": 0.0912, |
| "num_input_tokens_seen": 2528736, |
| "step": 4050 |
| }, |
| { |
| "epoch": 8.142570281124499, |
| "grad_norm": 20.141408920288086, |
| "learning_rate": 7.393028491609782e-06, |
| "loss": 0.1502, |
| "num_input_tokens_seen": 2532448, |
| "step": 4055 |
| }, |
| { |
| "epoch": 8.152610441767068, |
| "grad_norm": 6.467395782470703, |
| "learning_rate": 7.385331792565682e-06, |
| "loss": 0.1389, |
| "num_input_tokens_seen": 2535872, |
| "step": 4060 |
| }, |
| { |
| "epoch": 8.162650602409638, |
| "grad_norm": 6.237977981567383, |
| "learning_rate": 7.377627768907563e-06, |
| "loss": 0.1405, |
| "num_input_tokens_seen": 2538880, |
| "step": 4065 |
| }, |
| { |
| "epoch": 8.17269076305221, |
| "grad_norm": 0.7651534080505371, |
| "learning_rate": 7.369916444292092e-06, |
| "loss": 0.0228, |
| "num_input_tokens_seen": 2541888, |
| "step": 4070 |
| }, |
| { |
| "epoch": 8.182730923694779, |
| "grad_norm": 13.309526443481445, |
| "learning_rate": 7.362197842398355e-06, |
| "loss": 0.0577, |
| "num_input_tokens_seen": 2545216, |
| "step": 4075 |
| }, |
| { |
| "epoch": 8.19277108433735, |
| "grad_norm": 14.438260078430176, |
| "learning_rate": 7.354471986927785e-06, |
| "loss": 0.107, |
| "num_input_tokens_seen": 2548768, |
| "step": 4080 |
| }, |
| { |
| "epoch": 8.20281124497992, |
| "grad_norm": 1.590266466140747, |
| "learning_rate": 7.346738901604086e-06, |
| "loss": 0.0757, |
| "num_input_tokens_seen": 2551776, |
| "step": 4085 |
| }, |
| { |
| "epoch": 8.21285140562249, |
| "grad_norm": 9.592524528503418, |
| "learning_rate": 7.338998610173166e-06, |
| "loss": 0.1362, |
| "num_input_tokens_seen": 2555872, |
| "step": 4090 |
| }, |
| { |
| "epoch": 8.22289156626506, |
| "grad_norm": 0.7704935073852539, |
| "learning_rate": 7.331251136403057e-06, |
| "loss": 0.1629, |
| "num_input_tokens_seen": 2559040, |
| "step": 4095 |
| }, |
| { |
| "epoch": 8.23293172690763, |
| "grad_norm": 26.26668357849121, |
| "learning_rate": 7.323496504083849e-06, |
| "loss": 0.1447, |
| "num_input_tokens_seen": 2562560, |
| "step": 4100 |
| }, |
| { |
| "epoch": 8.242971887550201, |
| "grad_norm": 13.633817672729492, |
| "learning_rate": 7.315734737027612e-06, |
| "loss": 0.11, |
| "num_input_tokens_seen": 2565504, |
| "step": 4105 |
| }, |
| { |
| "epoch": 8.25301204819277, |
| "grad_norm": 17.625274658203125, |
| "learning_rate": 7.307965859068324e-06, |
| "loss": 0.0784, |
| "num_input_tokens_seen": 2568256, |
| "step": 4110 |
| }, |
| { |
| "epoch": 8.263052208835342, |
| "grad_norm": 28.72048568725586, |
| "learning_rate": 7.300189894061802e-06, |
| "loss": 0.085, |
| "num_input_tokens_seen": 2571648, |
| "step": 4115 |
| }, |
| { |
| "epoch": 8.273092369477911, |
| "grad_norm": 6.543407917022705, |
| "learning_rate": 7.292406865885619e-06, |
| "loss": 0.0661, |
| "num_input_tokens_seen": 2575104, |
| "step": 4120 |
| }, |
| { |
| "epoch": 8.283132530120483, |
| "grad_norm": 51.86807632446289, |
| "learning_rate": 7.284616798439045e-06, |
| "loss": 0.1056, |
| "num_input_tokens_seen": 2578400, |
| "step": 4125 |
| }, |
| { |
| "epoch": 8.293172690763052, |
| "grad_norm": 13.690403938293457, |
| "learning_rate": 7.2768197156429564e-06, |
| "loss": 0.1329, |
| "num_input_tokens_seen": 2581376, |
| "step": 4130 |
| }, |
| { |
| "epoch": 8.303212851405622, |
| "grad_norm": 13.89306640625, |
| "learning_rate": 7.2690156414397775e-06, |
| "loss": 0.0798, |
| "num_input_tokens_seen": 2584192, |
| "step": 4135 |
| }, |
| { |
| "epoch": 8.313253012048193, |
| "grad_norm": 10.305960655212402, |
| "learning_rate": 7.261204599793399e-06, |
| "loss": 0.1572, |
| "num_input_tokens_seen": 2587040, |
| "step": 4140 |
| }, |
| { |
| "epoch": 8.323293172690763, |
| "grad_norm": 3.0132784843444824, |
| "learning_rate": 7.2533866146891085e-06, |
| "loss": 0.0726, |
| "num_input_tokens_seen": 2590112, |
| "step": 4145 |
| }, |
| { |
| "epoch": 8.333333333333334, |
| "grad_norm": 13.267522811889648, |
| "learning_rate": 7.245561710133511e-06, |
| "loss": 0.1047, |
| "num_input_tokens_seen": 2592640, |
| "step": 4150 |
| }, |
| { |
| "epoch": 8.343373493975903, |
| "grad_norm": 14.66914176940918, |
| "learning_rate": 7.23772991015446e-06, |
| "loss": 0.0888, |
| "num_input_tokens_seen": 2595616, |
| "step": 4155 |
| }, |
| { |
| "epoch": 8.353413654618475, |
| "grad_norm": 5.335896015167236, |
| "learning_rate": 7.229891238800988e-06, |
| "loss": 0.182, |
| "num_input_tokens_seen": 2599936, |
| "step": 4160 |
| }, |
| { |
| "epoch": 8.363453815261044, |
| "grad_norm": 34.975765228271484, |
| "learning_rate": 7.22204572014322e-06, |
| "loss": 0.1266, |
| "num_input_tokens_seen": 2602912, |
| "step": 4165 |
| }, |
| { |
| "epoch": 8.373493975903614, |
| "grad_norm": 6.264699935913086, |
| "learning_rate": 7.214193378272312e-06, |
| "loss": 0.0899, |
| "num_input_tokens_seen": 2605536, |
| "step": 4170 |
| }, |
| { |
| "epoch": 8.383534136546185, |
| "grad_norm": 26.686826705932617, |
| "learning_rate": 7.2063342373003676e-06, |
| "loss": 0.1795, |
| "num_input_tokens_seen": 2607936, |
| "step": 4175 |
| }, |
| { |
| "epoch": 8.393574297188755, |
| "grad_norm": 4.992021083831787, |
| "learning_rate": 7.198468321360376e-06, |
| "loss": 0.0467, |
| "num_input_tokens_seen": 2610656, |
| "step": 4180 |
| }, |
| { |
| "epoch": 8.403614457831326, |
| "grad_norm": 10.714327812194824, |
| "learning_rate": 7.190595654606118e-06, |
| "loss": 0.1586, |
| "num_input_tokens_seen": 2613952, |
| "step": 4185 |
| }, |
| { |
| "epoch": 8.413654618473895, |
| "grad_norm": 26.027496337890625, |
| "learning_rate": 7.182716261212116e-06, |
| "loss": 0.1357, |
| "num_input_tokens_seen": 2616864, |
| "step": 4190 |
| }, |
| { |
| "epoch": 8.423694779116467, |
| "grad_norm": 12.172310829162598, |
| "learning_rate": 7.174830165373542e-06, |
| "loss": 0.1129, |
| "num_input_tokens_seen": 2620480, |
| "step": 4195 |
| }, |
| { |
| "epoch": 8.433734939759036, |
| "grad_norm": 22.3388671875, |
| "learning_rate": 7.1669373913061505e-06, |
| "loss": 0.081, |
| "num_input_tokens_seen": 2623392, |
| "step": 4200 |
| }, |
| { |
| "epoch": 8.443775100401606, |
| "grad_norm": 8.700096130371094, |
| "learning_rate": 7.1590379632462004e-06, |
| "loss": 0.0565, |
| "num_input_tokens_seen": 2626432, |
| "step": 4205 |
| }, |
| { |
| "epoch": 8.453815261044177, |
| "grad_norm": 17.64000701904297, |
| "learning_rate": 7.151131905450386e-06, |
| "loss": 0.0768, |
| "num_input_tokens_seen": 2629888, |
| "step": 4210 |
| }, |
| { |
| "epoch": 8.463855421686747, |
| "grad_norm": 22.10565948486328, |
| "learning_rate": 7.14321924219576e-06, |
| "loss": 0.1255, |
| "num_input_tokens_seen": 2632864, |
| "step": 4215 |
| }, |
| { |
| "epoch": 8.473895582329318, |
| "grad_norm": 11.11665153503418, |
| "learning_rate": 7.1352999977796565e-06, |
| "loss": 0.1375, |
| "num_input_tokens_seen": 2636096, |
| "step": 4220 |
| }, |
| { |
| "epoch": 8.483935742971887, |
| "grad_norm": 16.72979164123535, |
| "learning_rate": 7.127374196519616e-06, |
| "loss": 0.0704, |
| "num_input_tokens_seen": 2638368, |
| "step": 4225 |
| }, |
| { |
| "epoch": 8.493975903614459, |
| "grad_norm": 0.9664208889007568, |
| "learning_rate": 7.119441862753316e-06, |
| "loss": 0.059, |
| "num_input_tokens_seen": 2642080, |
| "step": 4230 |
| }, |
| { |
| "epoch": 8.504016064257028, |
| "grad_norm": 8.617629051208496, |
| "learning_rate": 7.111503020838495e-06, |
| "loss": 0.1087, |
| "num_input_tokens_seen": 2644640, |
| "step": 4235 |
| }, |
| { |
| "epoch": 8.514056224899598, |
| "grad_norm": 15.599228858947754, |
| "learning_rate": 7.103557695152874e-06, |
| "loss": 0.0577, |
| "num_input_tokens_seen": 2647616, |
| "step": 4240 |
| }, |
| { |
| "epoch": 8.524096385542169, |
| "grad_norm": 0.8850897550582886, |
| "learning_rate": 7.095605910094081e-06, |
| "loss": 0.0759, |
| "num_input_tokens_seen": 2650560, |
| "step": 4245 |
| }, |
| { |
| "epoch": 8.534136546184738, |
| "grad_norm": 23.22264862060547, |
| "learning_rate": 7.087647690079584e-06, |
| "loss": 0.2256, |
| "num_input_tokens_seen": 2653216, |
| "step": 4250 |
| }, |
| { |
| "epoch": 8.54417670682731, |
| "grad_norm": 1.2891957759857178, |
| "learning_rate": 7.079683059546607e-06, |
| "loss": 0.0457, |
| "num_input_tokens_seen": 2656256, |
| "step": 4255 |
| }, |
| { |
| "epoch": 8.55421686746988, |
| "grad_norm": 38.172786712646484, |
| "learning_rate": 7.071712042952061e-06, |
| "loss": 0.0834, |
| "num_input_tokens_seen": 2659040, |
| "step": 4260 |
| }, |
| { |
| "epoch": 8.56425702811245, |
| "grad_norm": 1.834019660949707, |
| "learning_rate": 7.063734664772461e-06, |
| "loss": 0.1075, |
| "num_input_tokens_seen": 2661824, |
| "step": 4265 |
| }, |
| { |
| "epoch": 8.57429718875502, |
| "grad_norm": 2.8576161861419678, |
| "learning_rate": 7.055750949503867e-06, |
| "loss": 0.0369, |
| "num_input_tokens_seen": 2664576, |
| "step": 4270 |
| }, |
| { |
| "epoch": 8.58433734939759, |
| "grad_norm": 7.74680233001709, |
| "learning_rate": 7.047760921661788e-06, |
| "loss": 0.0959, |
| "num_input_tokens_seen": 2667712, |
| "step": 4275 |
| }, |
| { |
| "epoch": 8.594377510040161, |
| "grad_norm": 43.823997497558594, |
| "learning_rate": 7.039764605781121e-06, |
| "loss": 0.1003, |
| "num_input_tokens_seen": 2670944, |
| "step": 4280 |
| }, |
| { |
| "epoch": 8.60441767068273, |
| "grad_norm": 38.42998123168945, |
| "learning_rate": 7.031762026416074e-06, |
| "loss": 0.0871, |
| "num_input_tokens_seen": 2674336, |
| "step": 4285 |
| }, |
| { |
| "epoch": 8.614457831325302, |
| "grad_norm": 0.930115818977356, |
| "learning_rate": 7.023753208140084e-06, |
| "loss": 0.0914, |
| "num_input_tokens_seen": 2677824, |
| "step": 4290 |
| }, |
| { |
| "epoch": 8.624497991967871, |
| "grad_norm": 55.05161666870117, |
| "learning_rate": 7.01573817554575e-06, |
| "loss": 0.068, |
| "num_input_tokens_seen": 2680544, |
| "step": 4295 |
| }, |
| { |
| "epoch": 8.634538152610443, |
| "grad_norm": 0.6600883603096008, |
| "learning_rate": 7.0077169532447474e-06, |
| "loss": 0.022, |
| "num_input_tokens_seen": 2683904, |
| "step": 4300 |
| }, |
| { |
| "epoch": 8.644578313253012, |
| "grad_norm": 0.6066603660583496, |
| "learning_rate": 6.999689565867764e-06, |
| "loss": 0.0531, |
| "num_input_tokens_seen": 2687200, |
| "step": 4305 |
| }, |
| { |
| "epoch": 8.654618473895582, |
| "grad_norm": 55.28033447265625, |
| "learning_rate": 6.991656038064416e-06, |
| "loss": 0.1906, |
| "num_input_tokens_seen": 2690560, |
| "step": 4310 |
| }, |
| { |
| "epoch": 8.664658634538153, |
| "grad_norm": 34.07001495361328, |
| "learning_rate": 6.983616394503177e-06, |
| "loss": 0.0982, |
| "num_input_tokens_seen": 2693728, |
| "step": 4315 |
| }, |
| { |
| "epoch": 8.674698795180722, |
| "grad_norm": 15.091057777404785, |
| "learning_rate": 6.975570659871295e-06, |
| "loss": 0.1921, |
| "num_input_tokens_seen": 2697312, |
| "step": 4320 |
| }, |
| { |
| "epoch": 8.684738955823294, |
| "grad_norm": 25.58677101135254, |
| "learning_rate": 6.967518858874727e-06, |
| "loss": 0.0666, |
| "num_input_tokens_seen": 2700480, |
| "step": 4325 |
| }, |
| { |
| "epoch": 8.694779116465863, |
| "grad_norm": 6.8150529861450195, |
| "learning_rate": 6.959461016238056e-06, |
| "loss": 0.1458, |
| "num_input_tokens_seen": 2703520, |
| "step": 4330 |
| }, |
| { |
| "epoch": 8.704819277108435, |
| "grad_norm": 15.582724571228027, |
| "learning_rate": 6.951397156704418e-06, |
| "loss": 0.0863, |
| "num_input_tokens_seen": 2706688, |
| "step": 4335 |
| }, |
| { |
| "epoch": 8.714859437751004, |
| "grad_norm": 68.85173797607422, |
| "learning_rate": 6.943327305035424e-06, |
| "loss": 0.1132, |
| "num_input_tokens_seen": 2709440, |
| "step": 4340 |
| }, |
| { |
| "epoch": 8.724899598393574, |
| "grad_norm": 17.743255615234375, |
| "learning_rate": 6.9352514860110876e-06, |
| "loss": 0.0675, |
| "num_input_tokens_seen": 2712512, |
| "step": 4345 |
| }, |
| { |
| "epoch": 8.734939759036145, |
| "grad_norm": 4.8611297607421875, |
| "learning_rate": 6.927169724429737e-06, |
| "loss": 0.0307, |
| "num_input_tokens_seen": 2715296, |
| "step": 4350 |
| }, |
| { |
| "epoch": 8.744979919678714, |
| "grad_norm": 39.961326599121094, |
| "learning_rate": 6.919082045107963e-06, |
| "loss": 0.0683, |
| "num_input_tokens_seen": 2718720, |
| "step": 4355 |
| }, |
| { |
| "epoch": 8.755020080321286, |
| "grad_norm": 8.697257041931152, |
| "learning_rate": 6.910988472880515e-06, |
| "loss": 0.0932, |
| "num_input_tokens_seen": 2721536, |
| "step": 4360 |
| }, |
| { |
| "epoch": 8.765060240963855, |
| "grad_norm": 4.740288257598877, |
| "learning_rate": 6.902889032600245e-06, |
| "loss": 0.0802, |
| "num_input_tokens_seen": 2725024, |
| "step": 4365 |
| }, |
| { |
| "epoch": 8.775100401606426, |
| "grad_norm": 42.13846206665039, |
| "learning_rate": 6.894783749138021e-06, |
| "loss": 0.1271, |
| "num_input_tokens_seen": 2728288, |
| "step": 4370 |
| }, |
| { |
| "epoch": 8.785140562248996, |
| "grad_norm": 0.44398602843284607, |
| "learning_rate": 6.886672647382653e-06, |
| "loss": 0.1137, |
| "num_input_tokens_seen": 2731424, |
| "step": 4375 |
| }, |
| { |
| "epoch": 8.795180722891565, |
| "grad_norm": 1.1284281015396118, |
| "learning_rate": 6.878555752240821e-06, |
| "loss": 0.1214, |
| "num_input_tokens_seen": 2735008, |
| "step": 4380 |
| }, |
| { |
| "epoch": 8.805220883534137, |
| "grad_norm": 27.683223724365234, |
| "learning_rate": 6.870433088636992e-06, |
| "loss": 0.0553, |
| "num_input_tokens_seen": 2737728, |
| "step": 4385 |
| }, |
| { |
| "epoch": 8.815261044176706, |
| "grad_norm": 10.467280387878418, |
| "learning_rate": 6.862304681513344e-06, |
| "loss": 0.2023, |
| "num_input_tokens_seen": 2741120, |
| "step": 4390 |
| }, |
| { |
| "epoch": 8.825301204819278, |
| "grad_norm": 76.25865173339844, |
| "learning_rate": 6.8541705558296954e-06, |
| "loss": 0.1059, |
| "num_input_tokens_seen": 2744384, |
| "step": 4395 |
| }, |
| { |
| "epoch": 8.835341365461847, |
| "grad_norm": 5.276292324066162, |
| "learning_rate": 6.8460307365634225e-06, |
| "loss": 0.0959, |
| "num_input_tokens_seen": 2747296, |
| "step": 4400 |
| }, |
| { |
| "epoch": 8.845381526104418, |
| "grad_norm": 4.337225914001465, |
| "learning_rate": 6.837885248709386e-06, |
| "loss": 0.0289, |
| "num_input_tokens_seen": 2750880, |
| "step": 4405 |
| }, |
| { |
| "epoch": 8.855421686746988, |
| "grad_norm": 9.778970718383789, |
| "learning_rate": 6.829734117279853e-06, |
| "loss": 0.121, |
| "num_input_tokens_seen": 2753696, |
| "step": 4410 |
| }, |
| { |
| "epoch": 8.865461847389557, |
| "grad_norm": 9.512594223022461, |
| "learning_rate": 6.8215773673044175e-06, |
| "loss": 0.1068, |
| "num_input_tokens_seen": 2756544, |
| "step": 4415 |
| }, |
| { |
| "epoch": 8.875502008032129, |
| "grad_norm": 0.6467050909996033, |
| "learning_rate": 6.81341502382993e-06, |
| "loss": 0.1459, |
| "num_input_tokens_seen": 2759008, |
| "step": 4420 |
| }, |
| { |
| "epoch": 8.885542168674698, |
| "grad_norm": 11.161055564880371, |
| "learning_rate": 6.805247111920416e-06, |
| "loss": 0.1331, |
| "num_input_tokens_seen": 2762112, |
| "step": 4425 |
| }, |
| { |
| "epoch": 8.89558232931727, |
| "grad_norm": 25.896102905273438, |
| "learning_rate": 6.797073656656998e-06, |
| "loss": 0.0897, |
| "num_input_tokens_seen": 2765216, |
| "step": 4430 |
| }, |
| { |
| "epoch": 8.905622489959839, |
| "grad_norm": 8.515508651733398, |
| "learning_rate": 6.788894683137822e-06, |
| "loss": 0.0792, |
| "num_input_tokens_seen": 2767744, |
| "step": 4435 |
| }, |
| { |
| "epoch": 8.91566265060241, |
| "grad_norm": 3.874532699584961, |
| "learning_rate": 6.780710216477979e-06, |
| "loss": 0.0705, |
| "num_input_tokens_seen": 2770976, |
| "step": 4440 |
| }, |
| { |
| "epoch": 8.92570281124498, |
| "grad_norm": 11.629859924316406, |
| "learning_rate": 6.772520281809426e-06, |
| "loss": 0.1015, |
| "num_input_tokens_seen": 2774016, |
| "step": 4445 |
| }, |
| { |
| "epoch": 8.93574297188755, |
| "grad_norm": 17.52058219909668, |
| "learning_rate": 6.7643249042809146e-06, |
| "loss": 0.153, |
| "num_input_tokens_seen": 2776768, |
| "step": 4450 |
| }, |
| { |
| "epoch": 8.94578313253012, |
| "grad_norm": 13.657364845275879, |
| "learning_rate": 6.7561241090579045e-06, |
| "loss": 0.0843, |
| "num_input_tokens_seen": 2779520, |
| "step": 4455 |
| }, |
| { |
| "epoch": 8.95582329317269, |
| "grad_norm": 9.738412857055664, |
| "learning_rate": 6.747917921322496e-06, |
| "loss": 0.0886, |
| "num_input_tokens_seen": 2783136, |
| "step": 4460 |
| }, |
| { |
| "epoch": 8.965863453815262, |
| "grad_norm": 4.442209720611572, |
| "learning_rate": 6.739706366273346e-06, |
| "loss": 0.0707, |
| "num_input_tokens_seen": 2786688, |
| "step": 4465 |
| }, |
| { |
| "epoch": 8.975903614457831, |
| "grad_norm": 2.1606521606445312, |
| "learning_rate": 6.731489469125591e-06, |
| "loss": 0.0703, |
| "num_input_tokens_seen": 2790432, |
| "step": 4470 |
| }, |
| { |
| "epoch": 8.985943775100402, |
| "grad_norm": 5.452800750732422, |
| "learning_rate": 6.723267255110773e-06, |
| "loss": 0.1706, |
| "num_input_tokens_seen": 2793696, |
| "step": 4475 |
| }, |
| { |
| "epoch": 8.995983935742972, |
| "grad_norm": 1.0247032642364502, |
| "learning_rate": 6.715039749476764e-06, |
| "loss": 0.1357, |
| "num_input_tokens_seen": 2796512, |
| "step": 4480 |
| }, |
| { |
| "epoch": 9.006024096385541, |
| "grad_norm": 7.481777191162109, |
| "learning_rate": 6.7068069774876785e-06, |
| "loss": 0.1072, |
| "num_input_tokens_seen": 2799168, |
| "step": 4485 |
| }, |
| { |
| "epoch": 9.016064257028113, |
| "grad_norm": 3.347785472869873, |
| "learning_rate": 6.698568964423808e-06, |
| "loss": 0.0907, |
| "num_input_tokens_seen": 2802304, |
| "step": 4490 |
| }, |
| { |
| "epoch": 9.026104417670682, |
| "grad_norm": 10.880471229553223, |
| "learning_rate": 6.690325735581532e-06, |
| "loss": 0.0329, |
| "num_input_tokens_seen": 2805952, |
| "step": 4495 |
| }, |
| { |
| "epoch": 9.036144578313253, |
| "grad_norm": 6.32849645614624, |
| "learning_rate": 6.682077316273252e-06, |
| "loss": 0.1234, |
| "num_input_tokens_seen": 2809440, |
| "step": 4500 |
| }, |
| { |
| "epoch": 9.046184738955823, |
| "grad_norm": 0.3392082750797272, |
| "learning_rate": 6.673823731827306e-06, |
| "loss": 0.1117, |
| "num_input_tokens_seen": 2812576, |
| "step": 4505 |
| }, |
| { |
| "epoch": 9.056224899598394, |
| "grad_norm": 5.663910388946533, |
| "learning_rate": 6.665565007587888e-06, |
| "loss": 0.0835, |
| "num_input_tokens_seen": 2815552, |
| "step": 4510 |
| }, |
| { |
| "epoch": 9.066265060240964, |
| "grad_norm": 21.429214477539062, |
| "learning_rate": 6.657301168914983e-06, |
| "loss": 0.0963, |
| "num_input_tokens_seen": 2818304, |
| "step": 4515 |
| }, |
| { |
| "epoch": 9.076305220883533, |
| "grad_norm": 0.16872386634349823, |
| "learning_rate": 6.649032241184271e-06, |
| "loss": 0.068, |
| "num_input_tokens_seen": 2822016, |
| "step": 4520 |
| }, |
| { |
| "epoch": 9.086345381526105, |
| "grad_norm": 0.11669722944498062, |
| "learning_rate": 6.640758249787067e-06, |
| "loss": 0.0016, |
| "num_input_tokens_seen": 2824992, |
| "step": 4525 |
| }, |
| { |
| "epoch": 9.096385542168674, |
| "grad_norm": 10.158061981201172, |
| "learning_rate": 6.632479220130232e-06, |
| "loss": 0.2172, |
| "num_input_tokens_seen": 2828384, |
| "step": 4530 |
| }, |
| { |
| "epoch": 9.106425702811245, |
| "grad_norm": 0.6976897120475769, |
| "learning_rate": 6.624195177636098e-06, |
| "loss": 0.0183, |
| "num_input_tokens_seen": 2831616, |
| "step": 4535 |
| }, |
| { |
| "epoch": 9.116465863453815, |
| "grad_norm": 21.54926872253418, |
| "learning_rate": 6.615906147742389e-06, |
| "loss": 0.0771, |
| "num_input_tokens_seen": 2834912, |
| "step": 4540 |
| }, |
| { |
| "epoch": 9.126506024096386, |
| "grad_norm": 0.19631217420101166, |
| "learning_rate": 6.6076121559021445e-06, |
| "loss": 0.0624, |
| "num_input_tokens_seen": 2838080, |
| "step": 4545 |
| }, |
| { |
| "epoch": 9.136546184738956, |
| "grad_norm": 1.3235666751861572, |
| "learning_rate": 6.599313227583642e-06, |
| "loss": 0.0352, |
| "num_input_tokens_seen": 2841056, |
| "step": 4550 |
| }, |
| { |
| "epoch": 9.146586345381525, |
| "grad_norm": 0.08484455943107605, |
| "learning_rate": 6.591009388270315e-06, |
| "loss": 0.084, |
| "num_input_tokens_seen": 2844192, |
| "step": 4555 |
| }, |
| { |
| "epoch": 9.156626506024097, |
| "grad_norm": 0.18264135718345642, |
| "learning_rate": 6.582700663460679e-06, |
| "loss": 0.0175, |
| "num_input_tokens_seen": 2847296, |
| "step": 4560 |
| }, |
| { |
| "epoch": 9.166666666666666, |
| "grad_norm": 0.31725814938545227, |
| "learning_rate": 6.57438707866825e-06, |
| "loss": 0.0691, |
| "num_input_tokens_seen": 2850656, |
| "step": 4565 |
| }, |
| { |
| "epoch": 9.176706827309237, |
| "grad_norm": 0.6224172115325928, |
| "learning_rate": 6.566068659421467e-06, |
| "loss": 0.1824, |
| "num_input_tokens_seen": 2854272, |
| "step": 4570 |
| }, |
| { |
| "epoch": 9.186746987951807, |
| "grad_norm": 25.0843563079834, |
| "learning_rate": 6.557745431263617e-06, |
| "loss": 0.105, |
| "num_input_tokens_seen": 2857248, |
| "step": 4575 |
| }, |
| { |
| "epoch": 9.196787148594378, |
| "grad_norm": 10.956290245056152, |
| "learning_rate": 6.5494174197527515e-06, |
| "loss": 0.1361, |
| "num_input_tokens_seen": 2859392, |
| "step": 4580 |
| }, |
| { |
| "epoch": 9.206827309236948, |
| "grad_norm": 16.801361083984375, |
| "learning_rate": 6.54108465046161e-06, |
| "loss": 0.0357, |
| "num_input_tokens_seen": 2862432, |
| "step": 4585 |
| }, |
| { |
| "epoch": 9.216867469879517, |
| "grad_norm": 44.96355056762695, |
| "learning_rate": 6.532747148977543e-06, |
| "loss": 0.1851, |
| "num_input_tokens_seen": 2865728, |
| "step": 4590 |
| }, |
| { |
| "epoch": 9.226907630522089, |
| "grad_norm": 7.259774684906006, |
| "learning_rate": 6.52440494090243e-06, |
| "loss": 0.0511, |
| "num_input_tokens_seen": 2868448, |
| "step": 4595 |
| }, |
| { |
| "epoch": 9.236947791164658, |
| "grad_norm": 2.497241497039795, |
| "learning_rate": 6.516058051852605e-06, |
| "loss": 0.1878, |
| "num_input_tokens_seen": 2871168, |
| "step": 4600 |
| }, |
| { |
| "epoch": 9.24698795180723, |
| "grad_norm": 9.350099563598633, |
| "learning_rate": 6.507706507458776e-06, |
| "loss": 0.0551, |
| "num_input_tokens_seen": 2873728, |
| "step": 4605 |
| }, |
| { |
| "epoch": 9.257028112449799, |
| "grad_norm": 13.690436363220215, |
| "learning_rate": 6.499350333365945e-06, |
| "loss": 0.0495, |
| "num_input_tokens_seen": 2877376, |
| "step": 4610 |
| }, |
| { |
| "epoch": 9.26706827309237, |
| "grad_norm": 8.819265365600586, |
| "learning_rate": 6.490989555233328e-06, |
| "loss": 0.0838, |
| "num_input_tokens_seen": 2880864, |
| "step": 4615 |
| }, |
| { |
| "epoch": 9.27710843373494, |
| "grad_norm": 0.7378672361373901, |
| "learning_rate": 6.482624198734284e-06, |
| "loss": 0.0913, |
| "num_input_tokens_seen": 2883584, |
| "step": 4620 |
| }, |
| { |
| "epoch": 9.28714859437751, |
| "grad_norm": 0.7477921843528748, |
| "learning_rate": 6.4742542895562276e-06, |
| "loss": 0.0817, |
| "num_input_tokens_seen": 2886272, |
| "step": 4625 |
| }, |
| { |
| "epoch": 9.29718875502008, |
| "grad_norm": 1.3603986501693726, |
| "learning_rate": 6.465879853400553e-06, |
| "loss": 0.1111, |
| "num_input_tokens_seen": 2889216, |
| "step": 4630 |
| }, |
| { |
| "epoch": 9.30722891566265, |
| "grad_norm": 2.1609530448913574, |
| "learning_rate": 6.457500915982555e-06, |
| "loss": 0.1591, |
| "num_input_tokens_seen": 2892224, |
| "step": 4635 |
| }, |
| { |
| "epoch": 9.317269076305221, |
| "grad_norm": 15.493494987487793, |
| "learning_rate": 6.449117503031355e-06, |
| "loss": 0.1163, |
| "num_input_tokens_seen": 2895584, |
| "step": 4640 |
| }, |
| { |
| "epoch": 9.32730923694779, |
| "grad_norm": 26.324840545654297, |
| "learning_rate": 6.440729640289809e-06, |
| "loss": 0.126, |
| "num_input_tokens_seen": 2897920, |
| "step": 4645 |
| }, |
| { |
| "epoch": 9.337349397590362, |
| "grad_norm": 8.079339027404785, |
| "learning_rate": 6.432337353514444e-06, |
| "loss": 0.111, |
| "num_input_tokens_seen": 2900224, |
| "step": 4650 |
| }, |
| { |
| "epoch": 9.347389558232932, |
| "grad_norm": 0.812627911567688, |
| "learning_rate": 6.4239406684753695e-06, |
| "loss": 0.1038, |
| "num_input_tokens_seen": 2903200, |
| "step": 4655 |
| }, |
| { |
| "epoch": 9.357429718875501, |
| "grad_norm": 33.051639556884766, |
| "learning_rate": 6.4155396109561995e-06, |
| "loss": 0.1554, |
| "num_input_tokens_seen": 2905824, |
| "step": 4660 |
| }, |
| { |
| "epoch": 9.367469879518072, |
| "grad_norm": 0.4954128563404083, |
| "learning_rate": 6.407134206753977e-06, |
| "loss": 0.0318, |
| "num_input_tokens_seen": 2909312, |
| "step": 4665 |
| }, |
| { |
| "epoch": 9.377510040160642, |
| "grad_norm": 35.997711181640625, |
| "learning_rate": 6.39872448167909e-06, |
| "loss": 0.067, |
| "num_input_tokens_seen": 2912416, |
| "step": 4670 |
| }, |
| { |
| "epoch": 9.387550200803213, |
| "grad_norm": 38.44462203979492, |
| "learning_rate": 6.3903104615551956e-06, |
| "loss": 0.075, |
| "num_input_tokens_seen": 2915232, |
| "step": 4675 |
| }, |
| { |
| "epoch": 9.397590361445783, |
| "grad_norm": 0.6909103393554688, |
| "learning_rate": 6.381892172219142e-06, |
| "loss": 0.0786, |
| "num_input_tokens_seen": 2918624, |
| "step": 4680 |
| }, |
| { |
| "epoch": 9.407630522088354, |
| "grad_norm": 0.7175284028053284, |
| "learning_rate": 6.373469639520881e-06, |
| "loss": 0.0335, |
| "num_input_tokens_seen": 2922112, |
| "step": 4685 |
| }, |
| { |
| "epoch": 9.417670682730924, |
| "grad_norm": 0.5845101475715637, |
| "learning_rate": 6.3650428893234e-06, |
| "loss": 0.0375, |
| "num_input_tokens_seen": 2924800, |
| "step": 4690 |
| }, |
| { |
| "epoch": 9.427710843373493, |
| "grad_norm": 1.1597626209259033, |
| "learning_rate": 6.356611947502633e-06, |
| "loss": 0.0801, |
| "num_input_tokens_seen": 2928064, |
| "step": 4695 |
| }, |
| { |
| "epoch": 9.437751004016064, |
| "grad_norm": 5.7896504402160645, |
| "learning_rate": 6.348176839947389e-06, |
| "loss": 0.1439, |
| "num_input_tokens_seen": 2931168, |
| "step": 4700 |
| }, |
| { |
| "epoch": 9.447791164658634, |
| "grad_norm": 65.76715087890625, |
| "learning_rate": 6.3397375925592675e-06, |
| "loss": 0.223, |
| "num_input_tokens_seen": 2934496, |
| "step": 4705 |
| }, |
| { |
| "epoch": 9.457831325301205, |
| "grad_norm": 0.3577728271484375, |
| "learning_rate": 6.331294231252576e-06, |
| "loss": 0.0278, |
| "num_input_tokens_seen": 2937984, |
| "step": 4710 |
| }, |
| { |
| "epoch": 9.467871485943775, |
| "grad_norm": 10.809001922607422, |
| "learning_rate": 6.3228467819542606e-06, |
| "loss": 0.0346, |
| "num_input_tokens_seen": 2940928, |
| "step": 4715 |
| }, |
| { |
| "epoch": 9.477911646586346, |
| "grad_norm": 0.2872489094734192, |
| "learning_rate": 6.314395270603819e-06, |
| "loss": 0.0529, |
| "num_input_tokens_seen": 2944448, |
| "step": 4720 |
| }, |
| { |
| "epoch": 9.487951807228916, |
| "grad_norm": 0.39773765206336975, |
| "learning_rate": 6.305939723153218e-06, |
| "loss": 0.052, |
| "num_input_tokens_seen": 2947584, |
| "step": 4725 |
| }, |
| { |
| "epoch": 9.497991967871485, |
| "grad_norm": 0.39810657501220703, |
| "learning_rate": 6.297480165566823e-06, |
| "loss": 0.0856, |
| "num_input_tokens_seen": 2951136, |
| "step": 4730 |
| }, |
| { |
| "epoch": 9.508032128514056, |
| "grad_norm": 20.94434928894043, |
| "learning_rate": 6.289016623821308e-06, |
| "loss": 0.1398, |
| "num_input_tokens_seen": 2953760, |
| "step": 4735 |
| }, |
| { |
| "epoch": 9.518072289156626, |
| "grad_norm": 0.32911476492881775, |
| "learning_rate": 6.280549123905588e-06, |
| "loss": 0.0166, |
| "num_input_tokens_seen": 2956736, |
| "step": 4740 |
| }, |
| { |
| "epoch": 9.528112449799197, |
| "grad_norm": 30.02129554748535, |
| "learning_rate": 6.2720776918207285e-06, |
| "loss": 0.094, |
| "num_input_tokens_seen": 2960224, |
| "step": 4745 |
| }, |
| { |
| "epoch": 9.538152610441767, |
| "grad_norm": 1.1178313493728638, |
| "learning_rate": 6.263602353579868e-06, |
| "loss": 0.1783, |
| "num_input_tokens_seen": 2963616, |
| "step": 4750 |
| }, |
| { |
| "epoch": 9.548192771084338, |
| "grad_norm": 42.708438873291016, |
| "learning_rate": 6.255123135208141e-06, |
| "loss": 0.0599, |
| "num_input_tokens_seen": 2966848, |
| "step": 4755 |
| }, |
| { |
| "epoch": 9.558232931726907, |
| "grad_norm": 44.14406967163086, |
| "learning_rate": 6.246640062742598e-06, |
| "loss": 0.1511, |
| "num_input_tokens_seen": 2969568, |
| "step": 4760 |
| }, |
| { |
| "epoch": 9.568273092369477, |
| "grad_norm": 43.29121398925781, |
| "learning_rate": 6.2381531622321234e-06, |
| "loss": 0.1446, |
| "num_input_tokens_seen": 2971488, |
| "step": 4765 |
| }, |
| { |
| "epoch": 9.578313253012048, |
| "grad_norm": 77.22794342041016, |
| "learning_rate": 6.229662459737354e-06, |
| "loss": 0.1338, |
| "num_input_tokens_seen": 2974656, |
| "step": 4770 |
| }, |
| { |
| "epoch": 9.588353413654618, |
| "grad_norm": 3.0715153217315674, |
| "learning_rate": 6.221167981330607e-06, |
| "loss": 0.1145, |
| "num_input_tokens_seen": 2977536, |
| "step": 4775 |
| }, |
| { |
| "epoch": 9.598393574297189, |
| "grad_norm": 1.3485350608825684, |
| "learning_rate": 6.212669753095788e-06, |
| "loss": 0.1066, |
| "num_input_tokens_seen": 2981632, |
| "step": 4780 |
| }, |
| { |
| "epoch": 9.608433734939759, |
| "grad_norm": 19.530614852905273, |
| "learning_rate": 6.204167801128319e-06, |
| "loss": 0.109, |
| "num_input_tokens_seen": 2985184, |
| "step": 4785 |
| }, |
| { |
| "epoch": 9.61847389558233, |
| "grad_norm": 10.45262622833252, |
| "learning_rate": 6.19566215153506e-06, |
| "loss": 0.1409, |
| "num_input_tokens_seen": 2988352, |
| "step": 4790 |
| }, |
| { |
| "epoch": 9.6285140562249, |
| "grad_norm": 29.81780242919922, |
| "learning_rate": 6.18715283043422e-06, |
| "loss": 0.0658, |
| "num_input_tokens_seen": 2991808, |
| "step": 4795 |
| }, |
| { |
| "epoch": 9.638554216867469, |
| "grad_norm": 26.38890266418457, |
| "learning_rate": 6.178639863955287e-06, |
| "loss": 0.1023, |
| "num_input_tokens_seen": 2994688, |
| "step": 4800 |
| }, |
| { |
| "epoch": 9.64859437751004, |
| "grad_norm": 5.988182067871094, |
| "learning_rate": 6.170123278238939e-06, |
| "loss": 0.0813, |
| "num_input_tokens_seen": 2998304, |
| "step": 4805 |
| }, |
| { |
| "epoch": 9.65863453815261, |
| "grad_norm": 0.7817508578300476, |
| "learning_rate": 6.161603099436968e-06, |
| "loss": 0.0725, |
| "num_input_tokens_seen": 3000928, |
| "step": 4810 |
| }, |
| { |
| "epoch": 9.668674698795181, |
| "grad_norm": 7.621532917022705, |
| "learning_rate": 6.153079353712201e-06, |
| "loss": 0.0623, |
| "num_input_tokens_seen": 3004224, |
| "step": 4815 |
| }, |
| { |
| "epoch": 9.67871485943775, |
| "grad_norm": 28.789592742919922, |
| "learning_rate": 6.144552067238418e-06, |
| "loss": 0.1374, |
| "num_input_tokens_seen": 3007200, |
| "step": 4820 |
| }, |
| { |
| "epoch": 9.688755020080322, |
| "grad_norm": 12.002851486206055, |
| "learning_rate": 6.136021266200271e-06, |
| "loss": 0.0868, |
| "num_input_tokens_seen": 3009920, |
| "step": 4825 |
| }, |
| { |
| "epoch": 9.698795180722891, |
| "grad_norm": 17.804859161376953, |
| "learning_rate": 6.1274869767932e-06, |
| "loss": 0.073, |
| "num_input_tokens_seen": 3013152, |
| "step": 4830 |
| }, |
| { |
| "epoch": 9.708835341365463, |
| "grad_norm": 37.982666015625, |
| "learning_rate": 6.118949225223365e-06, |
| "loss": 0.0502, |
| "num_input_tokens_seen": 3015936, |
| "step": 4835 |
| }, |
| { |
| "epoch": 9.718875502008032, |
| "grad_norm": 0.0973651334643364, |
| "learning_rate": 6.110408037707551e-06, |
| "loss": 0.0402, |
| "num_input_tokens_seen": 3019424, |
| "step": 4840 |
| }, |
| { |
| "epoch": 9.728915662650602, |
| "grad_norm": 9.358402252197266, |
| "learning_rate": 6.1018634404730945e-06, |
| "loss": 0.207, |
| "num_input_tokens_seen": 3023040, |
| "step": 4845 |
| }, |
| { |
| "epoch": 9.738955823293173, |
| "grad_norm": 59.125640869140625, |
| "learning_rate": 6.093315459757807e-06, |
| "loss": 0.0914, |
| "num_input_tokens_seen": 3025728, |
| "step": 4850 |
| }, |
| { |
| "epoch": 9.748995983935743, |
| "grad_norm": 15.047895431518555, |
| "learning_rate": 6.084764121809878e-06, |
| "loss": 0.0947, |
| "num_input_tokens_seen": 3028352, |
| "step": 4855 |
| }, |
| { |
| "epoch": 9.759036144578314, |
| "grad_norm": 46.9537239074707, |
| "learning_rate": 6.076209452887821e-06, |
| "loss": 0.0236, |
| "num_input_tokens_seen": 3031968, |
| "step": 4860 |
| }, |
| { |
| "epoch": 9.769076305220883, |
| "grad_norm": 6.795496463775635, |
| "learning_rate": 6.067651479260368e-06, |
| "loss": 0.0284, |
| "num_input_tokens_seen": 3035072, |
| "step": 4865 |
| }, |
| { |
| "epoch": 9.779116465863455, |
| "grad_norm": 1.5318920612335205, |
| "learning_rate": 6.059090227206402e-06, |
| "loss": 0.0481, |
| "num_input_tokens_seen": 3037568, |
| "step": 4870 |
| }, |
| { |
| "epoch": 9.789156626506024, |
| "grad_norm": 0.43727999925613403, |
| "learning_rate": 6.0505257230148715e-06, |
| "loss": 0.0885, |
| "num_input_tokens_seen": 3040384, |
| "step": 4875 |
| }, |
| { |
| "epoch": 9.799196787148594, |
| "grad_norm": 1.1928132772445679, |
| "learning_rate": 6.041957992984711e-06, |
| "loss": 0.0831, |
| "num_input_tokens_seen": 3043104, |
| "step": 4880 |
| }, |
| { |
| "epoch": 9.809236947791165, |
| "grad_norm": 7.331599235534668, |
| "learning_rate": 6.033387063424765e-06, |
| "loss": 0.1508, |
| "num_input_tokens_seen": 3046240, |
| "step": 4885 |
| }, |
| { |
| "epoch": 9.819277108433734, |
| "grad_norm": 7.764211177825928, |
| "learning_rate": 6.0248129606536984e-06, |
| "loss": 0.0875, |
| "num_input_tokens_seen": 3049792, |
| "step": 4890 |
| }, |
| { |
| "epoch": 9.829317269076306, |
| "grad_norm": 1.7602699995040894, |
| "learning_rate": 6.01623571099992e-06, |
| "loss": 0.1742, |
| "num_input_tokens_seen": 3052928, |
| "step": 4895 |
| }, |
| { |
| "epoch": 9.839357429718875, |
| "grad_norm": 0.1980728656053543, |
| "learning_rate": 6.0076553408015035e-06, |
| "loss": 0.0964, |
| "num_input_tokens_seen": 3056416, |
| "step": 4900 |
| }, |
| { |
| "epoch": 9.849397590361447, |
| "grad_norm": 64.95994567871094, |
| "learning_rate": 5.999071876406104e-06, |
| "loss": 0.1073, |
| "num_input_tokens_seen": 3059456, |
| "step": 4905 |
| }, |
| { |
| "epoch": 9.859437751004016, |
| "grad_norm": 29.12643814086914, |
| "learning_rate": 5.990485344170879e-06, |
| "loss": 0.2183, |
| "num_input_tokens_seen": 3062816, |
| "step": 4910 |
| }, |
| { |
| "epoch": 9.869477911646586, |
| "grad_norm": 52.59812545776367, |
| "learning_rate": 5.9818957704624046e-06, |
| "loss": 0.1465, |
| "num_input_tokens_seen": 3065472, |
| "step": 4915 |
| }, |
| { |
| "epoch": 9.879518072289157, |
| "grad_norm": 46.38893127441406, |
| "learning_rate": 5.973303181656597e-06, |
| "loss": 0.1317, |
| "num_input_tokens_seen": 3068480, |
| "step": 4920 |
| }, |
| { |
| "epoch": 9.889558232931726, |
| "grad_norm": 8.468809127807617, |
| "learning_rate": 5.964707604138632e-06, |
| "loss": 0.1163, |
| "num_input_tokens_seen": 3072032, |
| "step": 4925 |
| }, |
| { |
| "epoch": 9.899598393574298, |
| "grad_norm": 10.626388549804688, |
| "learning_rate": 5.956109064302862e-06, |
| "loss": 0.0549, |
| "num_input_tokens_seen": 3074336, |
| "step": 4930 |
| }, |
| { |
| "epoch": 9.909638554216867, |
| "grad_norm": 6.986921310424805, |
| "learning_rate": 5.947507588552734e-06, |
| "loss": 0.0499, |
| "num_input_tokens_seen": 3077728, |
| "step": 4935 |
| }, |
| { |
| "epoch": 9.919678714859439, |
| "grad_norm": 21.21433448791504, |
| "learning_rate": 5.9389032033007135e-06, |
| "loss": 0.0673, |
| "num_input_tokens_seen": 3080992, |
| "step": 4940 |
| }, |
| { |
| "epoch": 9.929718875502008, |
| "grad_norm": 5.551342010498047, |
| "learning_rate": 5.930295934968197e-06, |
| "loss": 0.0601, |
| "num_input_tokens_seen": 3084768, |
| "step": 4945 |
| }, |
| { |
| "epoch": 9.939759036144578, |
| "grad_norm": 7.235230445861816, |
| "learning_rate": 5.9216858099854365e-06, |
| "loss": 0.036, |
| "num_input_tokens_seen": 3088160, |
| "step": 4950 |
| }, |
| { |
| "epoch": 9.949799196787149, |
| "grad_norm": 8.167715072631836, |
| "learning_rate": 5.913072854791458e-06, |
| "loss": 0.0386, |
| "num_input_tokens_seen": 3091104, |
| "step": 4955 |
| }, |
| { |
| "epoch": 9.959839357429718, |
| "grad_norm": 34.79731369018555, |
| "learning_rate": 5.90445709583397e-06, |
| "loss": 0.2039, |
| "num_input_tokens_seen": 3094272, |
| "step": 4960 |
| }, |
| { |
| "epoch": 9.96987951807229, |
| "grad_norm": 15.844338417053223, |
| "learning_rate": 5.895838559569298e-06, |
| "loss": 0.0434, |
| "num_input_tokens_seen": 3098240, |
| "step": 4965 |
| }, |
| { |
| "epoch": 9.97991967871486, |
| "grad_norm": 7.910551071166992, |
| "learning_rate": 5.887217272462295e-06, |
| "loss": 0.1129, |
| "num_input_tokens_seen": 3101056, |
| "step": 4970 |
| }, |
| { |
| "epoch": 9.98995983935743, |
| "grad_norm": 11.544766426086426, |
| "learning_rate": 5.878593260986256e-06, |
| "loss": 0.0883, |
| "num_input_tokens_seen": 3104576, |
| "step": 4975 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 19.9595890045166, |
| "learning_rate": 5.869966551622848e-06, |
| "loss": 0.0943, |
| "num_input_tokens_seen": 3108288, |
| "step": 4980 |
| }, |
| { |
| "epoch": 10.0, |
| "eval_loss": 0.29868796467781067, |
| "eval_runtime": 8.071, |
| "eval_samples_per_second": 61.703, |
| "eval_steps_per_second": 15.488, |
| "num_input_tokens_seen": 3108288, |
| "step": 4980 |
| }, |
| { |
| "epoch": 10.01004016064257, |
| "grad_norm": 7.36317253112793, |
| "learning_rate": 5.861337170862018e-06, |
| "loss": 0.0089, |
| "num_input_tokens_seen": 3111360, |
| "step": 4985 |
| }, |
| { |
| "epoch": 10.02008032128514, |
| "grad_norm": 1.2009451389312744, |
| "learning_rate": 5.852705145201919e-06, |
| "loss": 0.0079, |
| "num_input_tokens_seen": 3114496, |
| "step": 4990 |
| }, |
| { |
| "epoch": 10.03012048192771, |
| "grad_norm": 0.47578689455986023, |
| "learning_rate": 5.844070501148823e-06, |
| "loss": 0.1431, |
| "num_input_tokens_seen": 3117120, |
| "step": 4995 |
| }, |
| { |
| "epoch": 10.040160642570282, |
| "grad_norm": 35.32996368408203, |
| "learning_rate": 5.835433265217043e-06, |
| "loss": 0.0972, |
| "num_input_tokens_seen": 3121376, |
| "step": 5000 |
| }, |
| { |
| "epoch": 10.050200803212851, |
| "grad_norm": 36.080787658691406, |
| "learning_rate": 5.8267934639288525e-06, |
| "loss": 0.1294, |
| "num_input_tokens_seen": 3124352, |
| "step": 5005 |
| }, |
| { |
| "epoch": 10.060240963855422, |
| "grad_norm": 47.2873649597168, |
| "learning_rate": 5.818151123814401e-06, |
| "loss": 0.1062, |
| "num_input_tokens_seen": 3127264, |
| "step": 5010 |
| }, |
| { |
| "epoch": 10.070281124497992, |
| "grad_norm": 22.52366065979004, |
| "learning_rate": 5.809506271411635e-06, |
| "loss": 0.0603, |
| "num_input_tokens_seen": 3130368, |
| "step": 5015 |
| }, |
| { |
| "epoch": 10.080321285140561, |
| "grad_norm": 1.4479087591171265, |
| "learning_rate": 5.800858933266214e-06, |
| "loss": 0.044, |
| "num_input_tokens_seen": 3134016, |
| "step": 5020 |
| }, |
| { |
| "epoch": 10.090361445783133, |
| "grad_norm": 2.216742515563965, |
| "learning_rate": 5.792209135931428e-06, |
| "loss": 0.0111, |
| "num_input_tokens_seen": 3137120, |
| "step": 5025 |
| }, |
| { |
| "epoch": 10.100401606425702, |
| "grad_norm": 8.245423316955566, |
| "learning_rate": 5.7835569059681255e-06, |
| "loss": 0.0852, |
| "num_input_tokens_seen": 3140288, |
| "step": 5030 |
| }, |
| { |
| "epoch": 10.110441767068274, |
| "grad_norm": 13.112469673156738, |
| "learning_rate": 5.77490226994462e-06, |
| "loss": 0.1236, |
| "num_input_tokens_seen": 3142912, |
| "step": 5035 |
| }, |
| { |
| "epoch": 10.120481927710843, |
| "grad_norm": 27.564695358276367, |
| "learning_rate": 5.766245254436613e-06, |
| "loss": 0.0678, |
| "num_input_tokens_seen": 3146112, |
| "step": 5040 |
| }, |
| { |
| "epoch": 10.130522088353414, |
| "grad_norm": 43.371463775634766, |
| "learning_rate": 5.757585886027114e-06, |
| "loss": 0.0703, |
| "num_input_tokens_seen": 3148928, |
| "step": 5045 |
| }, |
| { |
| "epoch": 10.140562248995984, |
| "grad_norm": 3.151542901992798, |
| "learning_rate": 5.748924191306359e-06, |
| "loss": 0.021, |
| "num_input_tokens_seen": 3152256, |
| "step": 5050 |
| }, |
| { |
| "epoch": 10.150602409638553, |
| "grad_norm": 21.960346221923828, |
| "learning_rate": 5.740260196871726e-06, |
| "loss": 0.1104, |
| "num_input_tokens_seen": 3155360, |
| "step": 5055 |
| }, |
| { |
| "epoch": 10.160642570281125, |
| "grad_norm": 0.18042011559009552, |
| "learning_rate": 5.73159392932765e-06, |
| "loss": 0.0009, |
| "num_input_tokens_seen": 3158784, |
| "step": 5060 |
| }, |
| { |
| "epoch": 10.170682730923694, |
| "grad_norm": 54.923194885253906, |
| "learning_rate": 5.722925415285555e-06, |
| "loss": 0.1475, |
| "num_input_tokens_seen": 3161952, |
| "step": 5065 |
| }, |
| { |
| "epoch": 10.180722891566266, |
| "grad_norm": 0.09134513884782791, |
| "learning_rate": 5.714254681363756e-06, |
| "loss": 0.0859, |
| "num_input_tokens_seen": 3165536, |
| "step": 5070 |
| }, |
| { |
| "epoch": 10.190763052208835, |
| "grad_norm": 3.2933976650238037, |
| "learning_rate": 5.705581754187387e-06, |
| "loss": 0.0771, |
| "num_input_tokens_seen": 3168064, |
| "step": 5075 |
| }, |
| { |
| "epoch": 10.200803212851406, |
| "grad_norm": 0.30672141909599304, |
| "learning_rate": 5.69690666038832e-06, |
| "loss": 0.0067, |
| "num_input_tokens_seen": 3171232, |
| "step": 5080 |
| }, |
| { |
| "epoch": 10.210843373493976, |
| "grad_norm": 60.519798278808594, |
| "learning_rate": 5.688229426605075e-06, |
| "loss": 0.089, |
| "num_input_tokens_seen": 3174368, |
| "step": 5085 |
| }, |
| { |
| "epoch": 10.220883534136545, |
| "grad_norm": 0.8040590286254883, |
| "learning_rate": 5.679550079482747e-06, |
| "loss": 0.0532, |
| "num_input_tokens_seen": 3177792, |
| "step": 5090 |
| }, |
| { |
| "epoch": 10.230923694779117, |
| "grad_norm": 9.954076766967773, |
| "learning_rate": 5.670868645672916e-06, |
| "loss": 0.0561, |
| "num_input_tokens_seen": 3180704, |
| "step": 5095 |
| }, |
| { |
| "epoch": 10.240963855421686, |
| "grad_norm": 77.2634048461914, |
| "learning_rate": 5.6621851518335725e-06, |
| "loss": 0.085, |
| "num_input_tokens_seen": 3184320, |
| "step": 5100 |
| }, |
| { |
| "epoch": 10.251004016064257, |
| "grad_norm": 22.160968780517578, |
| "learning_rate": 5.653499624629035e-06, |
| "loss": 0.0311, |
| "num_input_tokens_seen": 3187552, |
| "step": 5105 |
| }, |
| { |
| "epoch": 10.261044176706827, |
| "grad_norm": 0.13508616387844086, |
| "learning_rate": 5.644812090729863e-06, |
| "loss": 0.0029, |
| "num_input_tokens_seen": 3190496, |
| "step": 5110 |
| }, |
| { |
| "epoch": 10.271084337349398, |
| "grad_norm": 0.18858233094215393, |
| "learning_rate": 5.636122576812776e-06, |
| "loss": 0.0672, |
| "num_input_tokens_seen": 3193760, |
| "step": 5115 |
| }, |
| { |
| "epoch": 10.281124497991968, |
| "grad_norm": 0.03702637925744057, |
| "learning_rate": 5.627431109560577e-06, |
| "loss": 0.0477, |
| "num_input_tokens_seen": 3197536, |
| "step": 5120 |
| }, |
| { |
| "epoch": 10.291164658634537, |
| "grad_norm": 0.5422800779342651, |
| "learning_rate": 5.618737715662067e-06, |
| "loss": 0.0529, |
| "num_input_tokens_seen": 3201536, |
| "step": 5125 |
| }, |
| { |
| "epoch": 10.301204819277109, |
| "grad_norm": 34.62778854370117, |
| "learning_rate": 5.61004242181196e-06, |
| "loss": 0.1767, |
| "num_input_tokens_seen": 3205056, |
| "step": 5130 |
| }, |
| { |
| "epoch": 10.311244979919678, |
| "grad_norm": 0.6812612414360046, |
| "learning_rate": 5.601345254710808e-06, |
| "loss": 0.0998, |
| "num_input_tokens_seen": 3208608, |
| "step": 5135 |
| }, |
| { |
| "epoch": 10.32128514056225, |
| "grad_norm": 90.58126831054688, |
| "learning_rate": 5.592646241064913e-06, |
| "loss": 0.1761, |
| "num_input_tokens_seen": 3211648, |
| "step": 5140 |
| }, |
| { |
| "epoch": 10.331325301204819, |
| "grad_norm": 39.69200134277344, |
| "learning_rate": 5.583945407586247e-06, |
| "loss": 0.0906, |
| "num_input_tokens_seen": 3214560, |
| "step": 5145 |
| }, |
| { |
| "epoch": 10.34136546184739, |
| "grad_norm": 45.602333068847656, |
| "learning_rate": 5.5752427809923704e-06, |
| "loss": 0.0525, |
| "num_input_tokens_seen": 3218112, |
| "step": 5150 |
| }, |
| { |
| "epoch": 10.35140562248996, |
| "grad_norm": 0.26112157106399536, |
| "learning_rate": 5.566538388006351e-06, |
| "loss": 0.1533, |
| "num_input_tokens_seen": 3220992, |
| "step": 5155 |
| }, |
| { |
| "epoch": 10.36144578313253, |
| "grad_norm": 0.5084413886070251, |
| "learning_rate": 5.557832255356677e-06, |
| "loss": 0.0048, |
| "num_input_tokens_seen": 3224128, |
| "step": 5160 |
| }, |
| { |
| "epoch": 10.3714859437751, |
| "grad_norm": 27.925601959228516, |
| "learning_rate": 5.549124409777185e-06, |
| "loss": 0.1247, |
| "num_input_tokens_seen": 3227648, |
| "step": 5165 |
| }, |
| { |
| "epoch": 10.38152610441767, |
| "grad_norm": 5.449309349060059, |
| "learning_rate": 5.540414878006965e-06, |
| "loss": 0.0086, |
| "num_input_tokens_seen": 3230848, |
| "step": 5170 |
| }, |
| { |
| "epoch": 10.391566265060241, |
| "grad_norm": 0.9163462519645691, |
| "learning_rate": 5.5317036867902885e-06, |
| "loss": 0.0039, |
| "num_input_tokens_seen": 3234656, |
| "step": 5175 |
| }, |
| { |
| "epoch": 10.401606425702811, |
| "grad_norm": 83.0402603149414, |
| "learning_rate": 5.52299086287652e-06, |
| "loss": 0.0651, |
| "num_input_tokens_seen": 3237760, |
| "step": 5180 |
| }, |
| { |
| "epoch": 10.411646586345382, |
| "grad_norm": 0.2617938220500946, |
| "learning_rate": 5.514276433020044e-06, |
| "loss": 0.0379, |
| "num_input_tokens_seen": 3240928, |
| "step": 5185 |
| }, |
| { |
| "epoch": 10.421686746987952, |
| "grad_norm": 0.2599602937698364, |
| "learning_rate": 5.505560423980164e-06, |
| "loss": 0.0041, |
| "num_input_tokens_seen": 3244512, |
| "step": 5190 |
| }, |
| { |
| "epoch": 10.431726907630521, |
| "grad_norm": 0.06482914090156555, |
| "learning_rate": 5.496842862521046e-06, |
| "loss": 0.1364, |
| "num_input_tokens_seen": 3247488, |
| "step": 5195 |
| }, |
| { |
| "epoch": 10.441767068273093, |
| "grad_norm": 0.03865060955286026, |
| "learning_rate": 5.4881237754116135e-06, |
| "loss": 0.1852, |
| "num_input_tokens_seen": 3249952, |
| "step": 5200 |
| }, |
| { |
| "epoch": 10.451807228915662, |
| "grad_norm": 16.58278465270996, |
| "learning_rate": 5.479403189425481e-06, |
| "loss": 0.2229, |
| "num_input_tokens_seen": 3253248, |
| "step": 5205 |
| }, |
| { |
| "epoch": 10.461847389558233, |
| "grad_norm": 1.5567082166671753, |
| "learning_rate": 5.4706811313408616e-06, |
| "loss": 0.0127, |
| "num_input_tokens_seen": 3255808, |
| "step": 5210 |
| }, |
| { |
| "epoch": 10.471887550200803, |
| "grad_norm": 0.2649865746498108, |
| "learning_rate": 5.461957627940489e-06, |
| "loss": 0.0817, |
| "num_input_tokens_seen": 3259008, |
| "step": 5215 |
| }, |
| { |
| "epoch": 10.481927710843374, |
| "grad_norm": 41.40445327758789, |
| "learning_rate": 5.453232706011539e-06, |
| "loss": 0.1075, |
| "num_input_tokens_seen": 3262208, |
| "step": 5220 |
| }, |
| { |
| "epoch": 10.491967871485944, |
| "grad_norm": 74.19205474853516, |
| "learning_rate": 5.44450639234554e-06, |
| "loss": 0.0785, |
| "num_input_tokens_seen": 3265216, |
| "step": 5225 |
| }, |
| { |
| "epoch": 10.502008032128515, |
| "grad_norm": 0.2126062661409378, |
| "learning_rate": 5.435778713738292e-06, |
| "loss": 0.0788, |
| "num_input_tokens_seen": 3267936, |
| "step": 5230 |
| }, |
| { |
| "epoch": 10.512048192771084, |
| "grad_norm": 49.84227752685547, |
| "learning_rate": 5.427049696989792e-06, |
| "loss": 0.0193, |
| "num_input_tokens_seen": 3271552, |
| "step": 5235 |
| }, |
| { |
| "epoch": 10.522088353413654, |
| "grad_norm": 15.486082077026367, |
| "learning_rate": 5.418319368904137e-06, |
| "loss": 0.1446, |
| "num_input_tokens_seen": 3274304, |
| "step": 5240 |
| }, |
| { |
| "epoch": 10.532128514056225, |
| "grad_norm": 114.0123519897461, |
| "learning_rate": 5.409587756289462e-06, |
| "loss": 0.1745, |
| "num_input_tokens_seen": 3277056, |
| "step": 5245 |
| }, |
| { |
| "epoch": 10.542168674698795, |
| "grad_norm": 14.48117446899414, |
| "learning_rate": 5.40085488595784e-06, |
| "loss": 0.025, |
| "num_input_tokens_seen": 3280448, |
| "step": 5250 |
| }, |
| { |
| "epoch": 10.552208835341366, |
| "grad_norm": 0.0807952955365181, |
| "learning_rate": 5.392120784725206e-06, |
| "loss": 0.068, |
| "num_input_tokens_seen": 3284672, |
| "step": 5255 |
| }, |
| { |
| "epoch": 10.562248995983936, |
| "grad_norm": 0.8095536231994629, |
| "learning_rate": 5.383385479411276e-06, |
| "loss": 0.055, |
| "num_input_tokens_seen": 3287648, |
| "step": 5260 |
| }, |
| { |
| "epoch": 10.572289156626507, |
| "grad_norm": 67.51933288574219, |
| "learning_rate": 5.374648996839462e-06, |
| "loss": 0.0597, |
| "num_input_tokens_seen": 3291040, |
| "step": 5265 |
| }, |
| { |
| "epoch": 10.582329317269076, |
| "grad_norm": 35.59123992919922, |
| "learning_rate": 5.3659113638367936e-06, |
| "loss": 0.0567, |
| "num_input_tokens_seen": 3293536, |
| "step": 5270 |
| }, |
| { |
| "epoch": 10.592369477911646, |
| "grad_norm": 0.17060035467147827, |
| "learning_rate": 5.357172607233831e-06, |
| "loss": 0.0485, |
| "num_input_tokens_seen": 3296704, |
| "step": 5275 |
| }, |
| { |
| "epoch": 10.602409638554217, |
| "grad_norm": 11.730257034301758, |
| "learning_rate": 5.348432753864582e-06, |
| "loss": 0.0804, |
| "num_input_tokens_seen": 3299744, |
| "step": 5280 |
| }, |
| { |
| "epoch": 10.612449799196787, |
| "grad_norm": 8.082003593444824, |
| "learning_rate": 5.339691830566428e-06, |
| "loss": 0.2024, |
| "num_input_tokens_seen": 3302432, |
| "step": 5285 |
| }, |
| { |
| "epoch": 10.622489959839358, |
| "grad_norm": 39.70650100708008, |
| "learning_rate": 5.330949864180034e-06, |
| "loss": 0.0596, |
| "num_input_tokens_seen": 3305760, |
| "step": 5290 |
| }, |
| { |
| "epoch": 10.632530120481928, |
| "grad_norm": 0.2803881764411926, |
| "learning_rate": 5.322206881549266e-06, |
| "loss": 0.0486, |
| "num_input_tokens_seen": 3309312, |
| "step": 5295 |
| }, |
| { |
| "epoch": 10.642570281124499, |
| "grad_norm": 0.115199975669384, |
| "learning_rate": 5.313462909521111e-06, |
| "loss": 0.0613, |
| "num_input_tokens_seen": 3312224, |
| "step": 5300 |
| }, |
| { |
| "epoch": 10.652610441767068, |
| "grad_norm": 23.403295516967773, |
| "learning_rate": 5.304717974945596e-06, |
| "loss": 0.1876, |
| "num_input_tokens_seen": 3314912, |
| "step": 5305 |
| }, |
| { |
| "epoch": 10.662650602409638, |
| "grad_norm": 0.8288989067077637, |
| "learning_rate": 5.2959721046757004e-06, |
| "loss": 0.1077, |
| "num_input_tokens_seen": 3317824, |
| "step": 5310 |
| }, |
| { |
| "epoch": 10.67269076305221, |
| "grad_norm": 1.3652393817901611, |
| "learning_rate": 5.287225325567281e-06, |
| "loss": 0.0748, |
| "num_input_tokens_seen": 3321216, |
| "step": 5315 |
| }, |
| { |
| "epoch": 10.682730923694779, |
| "grad_norm": 0.05525651574134827, |
| "learning_rate": 5.2784776644789825e-06, |
| "loss": 0.0298, |
| "num_input_tokens_seen": 3324640, |
| "step": 5320 |
| }, |
| { |
| "epoch": 10.69277108433735, |
| "grad_norm": 8.195664405822754, |
| "learning_rate": 5.269729148272158e-06, |
| "loss": 0.1266, |
| "num_input_tokens_seen": 3327232, |
| "step": 5325 |
| }, |
| { |
| "epoch": 10.70281124497992, |
| "grad_norm": 26.851442337036133, |
| "learning_rate": 5.260979803810787e-06, |
| "loss": 0.0633, |
| "num_input_tokens_seen": 3330304, |
| "step": 5330 |
| }, |
| { |
| "epoch": 10.71285140562249, |
| "grad_norm": 0.7560333013534546, |
| "learning_rate": 5.252229657961394e-06, |
| "loss": 0.0565, |
| "num_input_tokens_seen": 3333472, |
| "step": 5335 |
| }, |
| { |
| "epoch": 10.72289156626506, |
| "grad_norm": 10.821014404296875, |
| "learning_rate": 5.2434787375929605e-06, |
| "loss": 0.0313, |
| "num_input_tokens_seen": 3336704, |
| "step": 5340 |
| }, |
| { |
| "epoch": 10.73293172690763, |
| "grad_norm": 1.1041162014007568, |
| "learning_rate": 5.2347270695768505e-06, |
| "loss": 0.0202, |
| "num_input_tokens_seen": 3339392, |
| "step": 5345 |
| }, |
| { |
| "epoch": 10.742971887550201, |
| "grad_norm": 50.26311492919922, |
| "learning_rate": 5.225974680786721e-06, |
| "loss": 0.1127, |
| "num_input_tokens_seen": 3342400, |
| "step": 5350 |
| }, |
| { |
| "epoch": 10.75301204819277, |
| "grad_norm": 34.33879470825195, |
| "learning_rate": 5.217221598098444e-06, |
| "loss": 0.1213, |
| "num_input_tokens_seen": 3345792, |
| "step": 5355 |
| }, |
| { |
| "epoch": 10.763052208835342, |
| "grad_norm": 26.683555603027344, |
| "learning_rate": 5.208467848390018e-06, |
| "loss": 0.1532, |
| "num_input_tokens_seen": 3349248, |
| "step": 5360 |
| }, |
| { |
| "epoch": 10.773092369477911, |
| "grad_norm": 14.137528419494629, |
| "learning_rate": 5.199713458541495e-06, |
| "loss": 0.0453, |
| "num_input_tokens_seen": 3352384, |
| "step": 5365 |
| }, |
| { |
| "epoch": 10.783132530120483, |
| "grad_norm": 3.5798494815826416, |
| "learning_rate": 5.190958455434891e-06, |
| "loss": 0.0667, |
| "num_input_tokens_seen": 3355648, |
| "step": 5370 |
| }, |
| { |
| "epoch": 10.793172690763052, |
| "grad_norm": 5.3735761642456055, |
| "learning_rate": 5.182202865954105e-06, |
| "loss": 0.1253, |
| "num_input_tokens_seen": 3358400, |
| "step": 5375 |
| }, |
| { |
| "epoch": 10.803212851405622, |
| "grad_norm": 2.211503267288208, |
| "learning_rate": 5.173446716984837e-06, |
| "loss": 0.0201, |
| "num_input_tokens_seen": 3361408, |
| "step": 5380 |
| }, |
| { |
| "epoch": 10.813253012048193, |
| "grad_norm": 9.731382369995117, |
| "learning_rate": 5.164690035414501e-06, |
| "loss": 0.0566, |
| "num_input_tokens_seen": 3365216, |
| "step": 5385 |
| }, |
| { |
| "epoch": 10.823293172690763, |
| "grad_norm": 20.726686477661133, |
| "learning_rate": 5.155932848132155e-06, |
| "loss": 0.0725, |
| "num_input_tokens_seen": 3368736, |
| "step": 5390 |
| }, |
| { |
| "epoch": 10.833333333333334, |
| "grad_norm": 0.17550311982631683, |
| "learning_rate": 5.1471751820284e-06, |
| "loss": 0.0465, |
| "num_input_tokens_seen": 3372096, |
| "step": 5395 |
| }, |
| { |
| "epoch": 10.843373493975903, |
| "grad_norm": 1.0040000677108765, |
| "learning_rate": 5.138417063995315e-06, |
| "loss": 0.0601, |
| "num_input_tokens_seen": 3375296, |
| "step": 5400 |
| }, |
| { |
| "epoch": 10.853413654618475, |
| "grad_norm": 5.7787065505981445, |
| "learning_rate": 5.129658520926361e-06, |
| "loss": 0.0839, |
| "num_input_tokens_seen": 3378880, |
| "step": 5405 |
| }, |
| { |
| "epoch": 10.863453815261044, |
| "grad_norm": 9.573227882385254, |
| "learning_rate": 5.1208995797163085e-06, |
| "loss": 0.162, |
| "num_input_tokens_seen": 3381600, |
| "step": 5410 |
| }, |
| { |
| "epoch": 10.873493975903614, |
| "grad_norm": 27.446544647216797, |
| "learning_rate": 5.112140267261151e-06, |
| "loss": 0.0322, |
| "num_input_tokens_seen": 3385024, |
| "step": 5415 |
| }, |
| { |
| "epoch": 10.883534136546185, |
| "grad_norm": 0.08992139995098114, |
| "learning_rate": 5.103380610458016e-06, |
| "loss": 0.1112, |
| "num_input_tokens_seen": 3387744, |
| "step": 5420 |
| }, |
| { |
| "epoch": 10.893574297188755, |
| "grad_norm": 0.1688283532857895, |
| "learning_rate": 5.094620636205096e-06, |
| "loss": 0.1092, |
| "num_input_tokens_seen": 3390464, |
| "step": 5425 |
| }, |
| { |
| "epoch": 10.903614457831326, |
| "grad_norm": 0.43699127435684204, |
| "learning_rate": 5.085860371401552e-06, |
| "loss": 0.1259, |
| "num_input_tokens_seen": 3393312, |
| "step": 5430 |
| }, |
| { |
| "epoch": 10.913654618473895, |
| "grad_norm": 107.20014953613281, |
| "learning_rate": 5.077099842947441e-06, |
| "loss": 0.1288, |
| "num_input_tokens_seen": 3396704, |
| "step": 5435 |
| }, |
| { |
| "epoch": 10.923694779116467, |
| "grad_norm": 12.245671272277832, |
| "learning_rate": 5.068339077743629e-06, |
| "loss": 0.0167, |
| "num_input_tokens_seen": 3399264, |
| "step": 5440 |
| }, |
| { |
| "epoch": 10.933734939759036, |
| "grad_norm": 0.1279267817735672, |
| "learning_rate": 5.059578102691707e-06, |
| "loss": 0.0114, |
| "num_input_tokens_seen": 3402144, |
| "step": 5445 |
| }, |
| { |
| "epoch": 10.943775100401606, |
| "grad_norm": 0.1616511046886444, |
| "learning_rate": 5.050816944693913e-06, |
| "loss": 0.002, |
| "num_input_tokens_seen": 3404608, |
| "step": 5450 |
| }, |
| { |
| "epoch": 10.953815261044177, |
| "grad_norm": 22.025815963745117, |
| "learning_rate": 5.042055630653042e-06, |
| "loss": 0.0584, |
| "num_input_tokens_seen": 3407584, |
| "step": 5455 |
| }, |
| { |
| "epoch": 10.963855421686747, |
| "grad_norm": 0.27465957403182983, |
| "learning_rate": 5.0332941874723775e-06, |
| "loss": 0.0499, |
| "num_input_tokens_seen": 3410848, |
| "step": 5460 |
| }, |
| { |
| "epoch": 10.973895582329318, |
| "grad_norm": 4.218213081359863, |
| "learning_rate": 5.02453264205559e-06, |
| "loss": 0.0993, |
| "num_input_tokens_seen": 3413792, |
| "step": 5465 |
| }, |
| { |
| "epoch": 10.983935742971887, |
| "grad_norm": 15.106077194213867, |
| "learning_rate": 5.01577102130667e-06, |
| "loss": 0.091, |
| "num_input_tokens_seen": 3416896, |
| "step": 5470 |
| }, |
| { |
| "epoch": 10.993975903614459, |
| "grad_norm": 8.20037841796875, |
| "learning_rate": 5.007009352129835e-06, |
| "loss": 0.0081, |
| "num_input_tokens_seen": 3419712, |
| "step": 5475 |
| }, |
| { |
| "epoch": 11.004016064257028, |
| "grad_norm": 0.12353216856718063, |
| "learning_rate": 4.998247661429453e-06, |
| "loss": 0.0095, |
| "num_input_tokens_seen": 3423168, |
| "step": 5480 |
| }, |
| { |
| "epoch": 11.014056224899598, |
| "grad_norm": 0.3131659924983978, |
| "learning_rate": 4.98948597610996e-06, |
| "loss": 0.0066, |
| "num_input_tokens_seen": 3426688, |
| "step": 5485 |
| }, |
| { |
| "epoch": 11.024096385542169, |
| "grad_norm": 11.450029373168945, |
| "learning_rate": 4.980724323075772e-06, |
| "loss": 0.1703, |
| "num_input_tokens_seen": 3429952, |
| "step": 5490 |
| }, |
| { |
| "epoch": 11.034136546184738, |
| "grad_norm": 0.36036553978919983, |
| "learning_rate": 4.971962729231211e-06, |
| "loss": 0.006, |
| "num_input_tokens_seen": 3433088, |
| "step": 5495 |
| }, |
| { |
| "epoch": 11.04417670682731, |
| "grad_norm": 0.09164968132972717, |
| "learning_rate": 4.9632012214804086e-06, |
| "loss": 0.0025, |
| "num_input_tokens_seen": 3435840, |
| "step": 5500 |
| }, |
| { |
| "epoch": 11.05421686746988, |
| "grad_norm": 0.10280407220125198, |
| "learning_rate": 4.954439826727243e-06, |
| "loss": 0.0105, |
| "num_input_tokens_seen": 3438976, |
| "step": 5505 |
| }, |
| { |
| "epoch": 11.06425702811245, |
| "grad_norm": 0.31124135851860046, |
| "learning_rate": 4.945678571875234e-06, |
| "loss": 0.0452, |
| "num_input_tokens_seen": 3442208, |
| "step": 5510 |
| }, |
| { |
| "epoch": 11.07429718875502, |
| "grad_norm": 23.169099807739258, |
| "learning_rate": 4.936917483827483e-06, |
| "loss": 0.007, |
| "num_input_tokens_seen": 3445632, |
| "step": 5515 |
| }, |
| { |
| "epoch": 11.08433734939759, |
| "grad_norm": 0.12971937656402588, |
| "learning_rate": 4.928156589486571e-06, |
| "loss": 0.1426, |
| "num_input_tokens_seen": 3448608, |
| "step": 5520 |
| }, |
| { |
| "epoch": 11.094377510040161, |
| "grad_norm": 0.3725661635398865, |
| "learning_rate": 4.919395915754486e-06, |
| "loss": 0.0008, |
| "num_input_tokens_seen": 3451264, |
| "step": 5525 |
| }, |
| { |
| "epoch": 11.10441767068273, |
| "grad_norm": 163.01356506347656, |
| "learning_rate": 4.910635489532543e-06, |
| "loss": 0.0699, |
| "num_input_tokens_seen": 3454496, |
| "step": 5530 |
| }, |
| { |
| "epoch": 11.114457831325302, |
| "grad_norm": 17.35430335998535, |
| "learning_rate": 4.901875337721289e-06, |
| "loss": 0.1167, |
| "num_input_tokens_seen": 3458016, |
| "step": 5535 |
| }, |
| { |
| "epoch": 11.124497991967871, |
| "grad_norm": 51.6560173034668, |
| "learning_rate": 4.893115487220434e-06, |
| "loss": 0.0807, |
| "num_input_tokens_seen": 3461344, |
| "step": 5540 |
| }, |
| { |
| "epoch": 11.134538152610443, |
| "grad_norm": 11.429407119750977, |
| "learning_rate": 4.884355964928767e-06, |
| "loss": 0.1003, |
| "num_input_tokens_seen": 3463424, |
| "step": 5545 |
| }, |
| { |
| "epoch": 11.144578313253012, |
| "grad_norm": 0.01519758440554142, |
| "learning_rate": 4.875596797744056e-06, |
| "loss": 0.0127, |
| "num_input_tokens_seen": 3466560, |
| "step": 5550 |
| }, |
| { |
| "epoch": 11.154618473895582, |
| "grad_norm": 5.342797756195068, |
| "learning_rate": 4.866838012562993e-06, |
| "loss": 0.1129, |
| "num_input_tokens_seen": 3469664, |
| "step": 5555 |
| }, |
| { |
| "epoch": 11.164658634538153, |
| "grad_norm": 1.1703826189041138, |
| "learning_rate": 4.858079636281086e-06, |
| "loss": 0.0025, |
| "num_input_tokens_seen": 3472544, |
| "step": 5560 |
| }, |
| { |
| "epoch": 11.174698795180722, |
| "grad_norm": 39.996116638183594, |
| "learning_rate": 4.8493216957925915e-06, |
| "loss": 0.0965, |
| "num_input_tokens_seen": 3475072, |
| "step": 5565 |
| }, |
| { |
| "epoch": 11.184738955823294, |
| "grad_norm": 35.59189987182617, |
| "learning_rate": 4.840564217990432e-06, |
| "loss": 0.0605, |
| "num_input_tokens_seen": 3477984, |
| "step": 5570 |
| }, |
| { |
| "epoch": 11.194779116465863, |
| "grad_norm": 0.035814475268125534, |
| "learning_rate": 4.831807229766101e-06, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 3481152, |
| "step": 5575 |
| }, |
| { |
| "epoch": 11.204819277108435, |
| "grad_norm": 23.645578384399414, |
| "learning_rate": 4.823050758009597e-06, |
| "loss": 0.0041, |
| "num_input_tokens_seen": 3484800, |
| "step": 5580 |
| }, |
| { |
| "epoch": 11.214859437751004, |
| "grad_norm": 0.532557487487793, |
| "learning_rate": 4.814294829609325e-06, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 3487776, |
| "step": 5585 |
| }, |
| { |
| "epoch": 11.224899598393574, |
| "grad_norm": 0.031553834676742554, |
| "learning_rate": 4.805539471452026e-06, |
| "loss": 0.1039, |
| "num_input_tokens_seen": 3491552, |
| "step": 5590 |
| }, |
| { |
| "epoch": 11.234939759036145, |
| "grad_norm": 33.89256286621094, |
| "learning_rate": 4.796784710422692e-06, |
| "loss": 0.0078, |
| "num_input_tokens_seen": 3495296, |
| "step": 5595 |
| }, |
| { |
| "epoch": 11.244979919678714, |
| "grad_norm": 7.040841102600098, |
| "learning_rate": 4.788030573404475e-06, |
| "loss": 0.0828, |
| "num_input_tokens_seen": 3498208, |
| "step": 5600 |
| }, |
| { |
| "epoch": 11.255020080321286, |
| "grad_norm": 0.027555659413337708, |
| "learning_rate": 4.779277087278615e-06, |
| "loss": 0.0301, |
| "num_input_tokens_seen": 3501472, |
| "step": 5605 |
| }, |
| { |
| "epoch": 11.265060240963855, |
| "grad_norm": 0.02722824178636074, |
| "learning_rate": 4.770524278924353e-06, |
| "loss": 0.0149, |
| "num_input_tokens_seen": 3504352, |
| "step": 5610 |
| }, |
| { |
| "epoch": 11.275100401606426, |
| "grad_norm": 0.2287973165512085, |
| "learning_rate": 4.761772175218848e-06, |
| "loss": 0.061, |
| "num_input_tokens_seen": 3507904, |
| "step": 5615 |
| }, |
| { |
| "epoch": 11.285140562248996, |
| "grad_norm": 0.07063689827919006, |
| "learning_rate": 4.753020803037098e-06, |
| "loss": 0.0376, |
| "num_input_tokens_seen": 3511328, |
| "step": 5620 |
| }, |
| { |
| "epoch": 11.295180722891565, |
| "grad_norm": 0.009446073323488235, |
| "learning_rate": 4.744270189251848e-06, |
| "loss": 0.0009, |
| "num_input_tokens_seen": 3514432, |
| "step": 5625 |
| }, |
| { |
| "epoch": 11.305220883534137, |
| "grad_norm": 60.107078552246094, |
| "learning_rate": 4.735520360733523e-06, |
| "loss": 0.073, |
| "num_input_tokens_seen": 3517824, |
| "step": 5630 |
| }, |
| { |
| "epoch": 11.315261044176706, |
| "grad_norm": 0.03736821189522743, |
| "learning_rate": 4.7267713443501274e-06, |
| "loss": 0.0801, |
| "num_input_tokens_seen": 3520416, |
| "step": 5635 |
| }, |
| { |
| "epoch": 11.325301204819278, |
| "grad_norm": 0.0403904914855957, |
| "learning_rate": 4.718023166967181e-06, |
| "loss": 0.1055, |
| "num_input_tokens_seen": 3523648, |
| "step": 5640 |
| }, |
| { |
| "epoch": 11.335341365461847, |
| "grad_norm": 0.44946715235710144, |
| "learning_rate": 4.7092758554476215e-06, |
| "loss": 0.0693, |
| "num_input_tokens_seen": 3526624, |
| "step": 5645 |
| }, |
| { |
| "epoch": 11.345381526104418, |
| "grad_norm": 0.4542866349220276, |
| "learning_rate": 4.700529436651729e-06, |
| "loss": 0.1391, |
| "num_input_tokens_seen": 3530080, |
| "step": 5650 |
| }, |
| { |
| "epoch": 11.355421686746988, |
| "grad_norm": 0.5738996267318726, |
| "learning_rate": 4.691783937437043e-06, |
| "loss": 0.0008, |
| "num_input_tokens_seen": 3533184, |
| "step": 5655 |
| }, |
| { |
| "epoch": 11.365461847389557, |
| "grad_norm": 0.34505975246429443, |
| "learning_rate": 4.683039384658276e-06, |
| "loss": 0.0281, |
| "num_input_tokens_seen": 3536608, |
| "step": 5660 |
| }, |
| { |
| "epoch": 11.375502008032129, |
| "grad_norm": 0.1401638686656952, |
| "learning_rate": 4.67429580516724e-06, |
| "loss": 0.105, |
| "num_input_tokens_seen": 3539840, |
| "step": 5665 |
| }, |
| { |
| "epoch": 11.385542168674698, |
| "grad_norm": 0.23378750681877136, |
| "learning_rate": 4.665553225812758e-06, |
| "loss": 0.0015, |
| "num_input_tokens_seen": 3541952, |
| "step": 5670 |
| }, |
| { |
| "epoch": 11.39558232931727, |
| "grad_norm": 18.162364959716797, |
| "learning_rate": 4.656811673440572e-06, |
| "loss": 0.1544, |
| "num_input_tokens_seen": 3544992, |
| "step": 5675 |
| }, |
| { |
| "epoch": 11.405622489959839, |
| "grad_norm": 34.31365203857422, |
| "learning_rate": 4.648071174893285e-06, |
| "loss": 0.0318, |
| "num_input_tokens_seen": 3547872, |
| "step": 5680 |
| }, |
| { |
| "epoch": 11.41566265060241, |
| "grad_norm": 0.40164482593536377, |
| "learning_rate": 4.6393317570102505e-06, |
| "loss": 0.0131, |
| "num_input_tokens_seen": 3550880, |
| "step": 5685 |
| }, |
| { |
| "epoch": 11.42570281124498, |
| "grad_norm": 0.06918898224830627, |
| "learning_rate": 4.6305934466275145e-06, |
| "loss": 0.0499, |
| "num_input_tokens_seen": 3554464, |
| "step": 5690 |
| }, |
| { |
| "epoch": 11.43574297188755, |
| "grad_norm": 0.07604250311851501, |
| "learning_rate": 4.6218562705777185e-06, |
| "loss": 0.0011, |
| "num_input_tokens_seen": 3557344, |
| "step": 5695 |
| }, |
| { |
| "epoch": 11.44578313253012, |
| "grad_norm": 0.8705164194107056, |
| "learning_rate": 4.613120255690014e-06, |
| "loss": 0.1489, |
| "num_input_tokens_seen": 3560096, |
| "step": 5700 |
| }, |
| { |
| "epoch": 11.45582329317269, |
| "grad_norm": 0.12667706608772278, |
| "learning_rate": 4.604385428789997e-06, |
| "loss": 0.0229, |
| "num_input_tokens_seen": 3562560, |
| "step": 5705 |
| }, |
| { |
| "epoch": 11.465863453815262, |
| "grad_norm": 0.11770451068878174, |
| "learning_rate": 4.595651816699612e-06, |
| "loss": 0.0591, |
| "num_input_tokens_seen": 3565472, |
| "step": 5710 |
| }, |
| { |
| "epoch": 11.475903614457831, |
| "grad_norm": 94.34481811523438, |
| "learning_rate": 4.586919446237071e-06, |
| "loss": 0.0946, |
| "num_input_tokens_seen": 3568288, |
| "step": 5715 |
| }, |
| { |
| "epoch": 11.485943775100402, |
| "grad_norm": 0.2393973171710968, |
| "learning_rate": 4.578188344216777e-06, |
| "loss": 0.0236, |
| "num_input_tokens_seen": 3571712, |
| "step": 5720 |
| }, |
| { |
| "epoch": 11.495983935742972, |
| "grad_norm": 156.00209045410156, |
| "learning_rate": 4.5694585374492314e-06, |
| "loss": 0.0604, |
| "num_input_tokens_seen": 3574528, |
| "step": 5725 |
| }, |
| { |
| "epoch": 11.506024096385541, |
| "grad_norm": 2.538038492202759, |
| "learning_rate": 4.560730052740967e-06, |
| "loss": 0.0027, |
| "num_input_tokens_seen": 3577504, |
| "step": 5730 |
| }, |
| { |
| "epoch": 11.516064257028113, |
| "grad_norm": 0.1259893774986267, |
| "learning_rate": 4.552002916894454e-06, |
| "loss": 0.0029, |
| "num_input_tokens_seen": 3581024, |
| "step": 5735 |
| }, |
| { |
| "epoch": 11.526104417670682, |
| "grad_norm": 0.10142702609300613, |
| "learning_rate": 4.543277156708013e-06, |
| "loss": 0.0853, |
| "num_input_tokens_seen": 3583552, |
| "step": 5740 |
| }, |
| { |
| "epoch": 11.536144578313253, |
| "grad_norm": 0.24175593256950378, |
| "learning_rate": 4.534552798975755e-06, |
| "loss": 0.0414, |
| "num_input_tokens_seen": 3587136, |
| "step": 5745 |
| }, |
| { |
| "epoch": 11.546184738955823, |
| "grad_norm": 0.05081957206130028, |
| "learning_rate": 4.525829870487468e-06, |
| "loss": 0.0038, |
| "num_input_tokens_seen": 3590368, |
| "step": 5750 |
| }, |
| { |
| "epoch": 11.556224899598394, |
| "grad_norm": 0.03754664212465286, |
| "learning_rate": 4.517108398028566e-06, |
| "loss": 0.0486, |
| "num_input_tokens_seen": 3592896, |
| "step": 5755 |
| }, |
| { |
| "epoch": 11.566265060240964, |
| "grad_norm": 1.1687461137771606, |
| "learning_rate": 4.508388408379985e-06, |
| "loss": 0.0376, |
| "num_input_tokens_seen": 3595424, |
| "step": 5760 |
| }, |
| { |
| "epoch": 11.576305220883533, |
| "grad_norm": 0.5024897456169128, |
| "learning_rate": 4.499669928318105e-06, |
| "loss": 0.0384, |
| "num_input_tokens_seen": 3599136, |
| "step": 5765 |
| }, |
| { |
| "epoch": 11.586345381526105, |
| "grad_norm": 0.012420962564647198, |
| "learning_rate": 4.490952984614676e-06, |
| "loss": 0.0472, |
| "num_input_tokens_seen": 3602496, |
| "step": 5770 |
| }, |
| { |
| "epoch": 11.596385542168674, |
| "grad_norm": 53.2065544128418, |
| "learning_rate": 4.482237604036729e-06, |
| "loss": 0.0978, |
| "num_input_tokens_seen": 3605824, |
| "step": 5775 |
| }, |
| { |
| "epoch": 11.606425702811245, |
| "grad_norm": 25.26783561706543, |
| "learning_rate": 4.473523813346491e-06, |
| "loss": 0.1101, |
| "num_input_tokens_seen": 3608544, |
| "step": 5780 |
| }, |
| { |
| "epoch": 11.616465863453815, |
| "grad_norm": 0.04346461594104767, |
| "learning_rate": 4.464811639301314e-06, |
| "loss": 0.0407, |
| "num_input_tokens_seen": 3611328, |
| "step": 5785 |
| }, |
| { |
| "epoch": 11.626506024096386, |
| "grad_norm": 31.01521873474121, |
| "learning_rate": 4.456101108653579e-06, |
| "loss": 0.065, |
| "num_input_tokens_seen": 3613376, |
| "step": 5790 |
| }, |
| { |
| "epoch": 11.636546184738956, |
| "grad_norm": 54.869712829589844, |
| "learning_rate": 4.447392248150627e-06, |
| "loss": 0.1865, |
| "num_input_tokens_seen": 3616032, |
| "step": 5795 |
| }, |
| { |
| "epoch": 11.646586345381525, |
| "grad_norm": 3.378321647644043, |
| "learning_rate": 4.438685084534663e-06, |
| "loss": 0.022, |
| "num_input_tokens_seen": 3619552, |
| "step": 5800 |
| }, |
| { |
| "epoch": 11.656626506024097, |
| "grad_norm": 0.15011410415172577, |
| "learning_rate": 4.429979644542689e-06, |
| "loss": 0.0459, |
| "num_input_tokens_seen": 3623200, |
| "step": 5805 |
| }, |
| { |
| "epoch": 11.666666666666666, |
| "grad_norm": 44.3188591003418, |
| "learning_rate": 4.421275954906409e-06, |
| "loss": 0.0709, |
| "num_input_tokens_seen": 3626208, |
| "step": 5810 |
| }, |
| { |
| "epoch": 11.676706827309237, |
| "grad_norm": 0.18596585094928741, |
| "learning_rate": 4.412574042352156e-06, |
| "loss": 0.0649, |
| "num_input_tokens_seen": 3629632, |
| "step": 5815 |
| }, |
| { |
| "epoch": 11.686746987951807, |
| "grad_norm": 0.8435150384902954, |
| "learning_rate": 4.403873933600803e-06, |
| "loss": 0.0237, |
| "num_input_tokens_seen": 3632224, |
| "step": 5820 |
| }, |
| { |
| "epoch": 11.696787148594378, |
| "grad_norm": 0.5117378234863281, |
| "learning_rate": 4.395175655367682e-06, |
| "loss": 0.0045, |
| "num_input_tokens_seen": 3635424, |
| "step": 5825 |
| }, |
| { |
| "epoch": 11.706827309236948, |
| "grad_norm": 31.202089309692383, |
| "learning_rate": 4.386479234362512e-06, |
| "loss": 0.1638, |
| "num_input_tokens_seen": 3638560, |
| "step": 5830 |
| }, |
| { |
| "epoch": 11.716867469879517, |
| "grad_norm": 0.47105512022972107, |
| "learning_rate": 4.377784697289304e-06, |
| "loss": 0.0549, |
| "num_input_tokens_seen": 3642560, |
| "step": 5835 |
| }, |
| { |
| "epoch": 11.726907630522089, |
| "grad_norm": 1.4522799253463745, |
| "learning_rate": 4.36909207084628e-06, |
| "loss": 0.0198, |
| "num_input_tokens_seen": 3646016, |
| "step": 5840 |
| }, |
| { |
| "epoch": 11.736947791164658, |
| "grad_norm": 1.9571293592453003, |
| "learning_rate": 4.360401381725806e-06, |
| "loss": 0.0741, |
| "num_input_tokens_seen": 3649152, |
| "step": 5845 |
| }, |
| { |
| "epoch": 11.74698795180723, |
| "grad_norm": 38.02421569824219, |
| "learning_rate": 4.3517126566142864e-06, |
| "loss": 0.0736, |
| "num_input_tokens_seen": 3652096, |
| "step": 5850 |
| }, |
| { |
| "epoch": 11.757028112449799, |
| "grad_norm": 53.22858810424805, |
| "learning_rate": 4.343025922192104e-06, |
| "loss": 0.2828, |
| "num_input_tokens_seen": 3655776, |
| "step": 5855 |
| }, |
| { |
| "epoch": 11.76706827309237, |
| "grad_norm": 15.424005508422852, |
| "learning_rate": 4.334341205133527e-06, |
| "loss": 0.0433, |
| "num_input_tokens_seen": 3658656, |
| "step": 5860 |
| }, |
| { |
| "epoch": 11.77710843373494, |
| "grad_norm": 0.08036069571971893, |
| "learning_rate": 4.325658532106623e-06, |
| "loss": 0.0372, |
| "num_input_tokens_seen": 3661440, |
| "step": 5865 |
| }, |
| { |
| "epoch": 11.78714859437751, |
| "grad_norm": 24.228103637695312, |
| "learning_rate": 4.316977929773191e-06, |
| "loss": 0.0765, |
| "num_input_tokens_seen": 3664288, |
| "step": 5870 |
| }, |
| { |
| "epoch": 11.79718875502008, |
| "grad_norm": 0.32403498888015747, |
| "learning_rate": 4.308299424788667e-06, |
| "loss": 0.0367, |
| "num_input_tokens_seen": 3667744, |
| "step": 5875 |
| }, |
| { |
| "epoch": 11.80722891566265, |
| "grad_norm": 13.046582221984863, |
| "learning_rate": 4.299623043802046e-06, |
| "loss": 0.0453, |
| "num_input_tokens_seen": 3670624, |
| "step": 5880 |
| }, |
| { |
| "epoch": 11.817269076305221, |
| "grad_norm": 0.5890432596206665, |
| "learning_rate": 4.2909488134558086e-06, |
| "loss": 0.0436, |
| "num_input_tokens_seen": 3673600, |
| "step": 5885 |
| }, |
| { |
| "epoch": 11.82730923694779, |
| "grad_norm": 0.2298094779253006, |
| "learning_rate": 4.2822767603858185e-06, |
| "loss": 0.0026, |
| "num_input_tokens_seen": 3676928, |
| "step": 5890 |
| }, |
| { |
| "epoch": 11.837349397590362, |
| "grad_norm": 0.3329310119152069, |
| "learning_rate": 4.2736069112212656e-06, |
| "loss": 0.1138, |
| "num_input_tokens_seen": 3680064, |
| "step": 5895 |
| }, |
| { |
| "epoch": 11.847389558232932, |
| "grad_norm": 0.017066599801182747, |
| "learning_rate": 4.264939292584565e-06, |
| "loss": 0.0151, |
| "num_input_tokens_seen": 3683040, |
| "step": 5900 |
| }, |
| { |
| "epoch": 11.857429718875501, |
| "grad_norm": 0.07971663773059845, |
| "learning_rate": 4.256273931091284e-06, |
| "loss": 0.0137, |
| "num_input_tokens_seen": 3686400, |
| "step": 5905 |
| }, |
| { |
| "epoch": 11.867469879518072, |
| "grad_norm": 14.964932441711426, |
| "learning_rate": 4.247610853350063e-06, |
| "loss": 0.0368, |
| "num_input_tokens_seen": 3689216, |
| "step": 5910 |
| }, |
| { |
| "epoch": 11.877510040160642, |
| "grad_norm": 16.467721939086914, |
| "learning_rate": 4.238950085962522e-06, |
| "loss": 0.0593, |
| "num_input_tokens_seen": 3692288, |
| "step": 5915 |
| }, |
| { |
| "epoch": 11.887550200803213, |
| "grad_norm": 23.15678596496582, |
| "learning_rate": 4.230291655523197e-06, |
| "loss": 0.028, |
| "num_input_tokens_seen": 3696288, |
| "step": 5920 |
| }, |
| { |
| "epoch": 11.897590361445783, |
| "grad_norm": 0.4339298903942108, |
| "learning_rate": 4.2216355886194355e-06, |
| "loss": 0.0461, |
| "num_input_tokens_seen": 3699456, |
| "step": 5925 |
| }, |
| { |
| "epoch": 11.907630522088354, |
| "grad_norm": 74.98004150390625, |
| "learning_rate": 4.212981911831338e-06, |
| "loss": 0.0741, |
| "num_input_tokens_seen": 3703232, |
| "step": 5930 |
| }, |
| { |
| "epoch": 11.917670682730924, |
| "grad_norm": 0.4801337420940399, |
| "learning_rate": 4.204330651731662e-06, |
| "loss": 0.0208, |
| "num_input_tokens_seen": 3705568, |
| "step": 5935 |
| }, |
| { |
| "epoch": 11.927710843373493, |
| "grad_norm": 1.0646982192993164, |
| "learning_rate": 4.195681834885743e-06, |
| "loss": 0.0302, |
| "num_input_tokens_seen": 3709152, |
| "step": 5940 |
| }, |
| { |
| "epoch": 11.937751004016064, |
| "grad_norm": 0.011887975037097931, |
| "learning_rate": 4.187035487851412e-06, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 3713056, |
| "step": 5945 |
| }, |
| { |
| "epoch": 11.947791164658634, |
| "grad_norm": 11.081732749938965, |
| "learning_rate": 4.178391637178923e-06, |
| "loss": 0.0046, |
| "num_input_tokens_seen": 3715744, |
| "step": 5950 |
| }, |
| { |
| "epoch": 11.957831325301205, |
| "grad_norm": 0.07225532084703445, |
| "learning_rate": 4.169750309410856e-06, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 3718912, |
| "step": 5955 |
| }, |
| { |
| "epoch": 11.967871485943775, |
| "grad_norm": 0.05781983956694603, |
| "learning_rate": 4.161111531082052e-06, |
| "loss": 0.3039, |
| "num_input_tokens_seen": 3721504, |
| "step": 5960 |
| }, |
| { |
| "epoch": 11.977911646586346, |
| "grad_norm": 42.374271392822266, |
| "learning_rate": 4.152475328719517e-06, |
| "loss": 0.0095, |
| "num_input_tokens_seen": 3724960, |
| "step": 5965 |
| }, |
| { |
| "epoch": 11.987951807228916, |
| "grad_norm": 206.51792907714844, |
| "learning_rate": 4.14384172884235e-06, |
| "loss": 0.0668, |
| "num_input_tokens_seen": 3728160, |
| "step": 5970 |
| }, |
| { |
| "epoch": 11.997991967871485, |
| "grad_norm": 14.492680549621582, |
| "learning_rate": 4.13521075796166e-06, |
| "loss": 0.1454, |
| "num_input_tokens_seen": 3732000, |
| "step": 5975 |
| }, |
| { |
| "epoch": 12.0, |
| "eval_loss": 0.5769983530044556, |
| "eval_runtime": 8.0773, |
| "eval_samples_per_second": 61.655, |
| "eval_steps_per_second": 15.476, |
| "num_input_tokens_seen": 3732864, |
| "step": 5976 |
| }, |
| { |
| "epoch": 12.008032128514056, |
| "grad_norm": 0.06490268558263779, |
| "learning_rate": 4.126582442580478e-06, |
| "loss": 0.0756, |
| "num_input_tokens_seen": 3735424, |
| "step": 5980 |
| }, |
| { |
| "epoch": 12.018072289156626, |
| "grad_norm": 0.2292412966489792, |
| "learning_rate": 4.117956809193687e-06, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 3738816, |
| "step": 5985 |
| }, |
| { |
| "epoch": 12.028112449799197, |
| "grad_norm": 0.057011283934116364, |
| "learning_rate": 4.109333884287929e-06, |
| "loss": 0.0439, |
| "num_input_tokens_seen": 3742176, |
| "step": 5990 |
| }, |
| { |
| "epoch": 12.038152610441767, |
| "grad_norm": 0.018794192001223564, |
| "learning_rate": 4.1007136943415325e-06, |
| "loss": 0.067, |
| "num_input_tokens_seen": 3744928, |
| "step": 5995 |
| }, |
| { |
| "epoch": 12.048192771084338, |
| "grad_norm": 0.15085923671722412, |
| "learning_rate": 4.092096265824429e-06, |
| "loss": 0.0044, |
| "num_input_tokens_seen": 3748288, |
| "step": 6000 |
| }, |
| { |
| "epoch": 12.058232931726907, |
| "grad_norm": 0.032142024487257004, |
| "learning_rate": 4.083481625198065e-06, |
| "loss": 0.0213, |
| "num_input_tokens_seen": 3751744, |
| "step": 6005 |
| }, |
| { |
| "epoch": 12.068273092369479, |
| "grad_norm": 0.4201153516769409, |
| "learning_rate": 4.074869798915333e-06, |
| "loss": 0.0013, |
| "num_input_tokens_seen": 3754624, |
| "step": 6010 |
| }, |
| { |
| "epoch": 12.078313253012048, |
| "grad_norm": 1.5055813789367676, |
| "learning_rate": 4.066260813420477e-06, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 3757120, |
| "step": 6015 |
| }, |
| { |
| "epoch": 12.088353413654618, |
| "grad_norm": 0.006558163091540337, |
| "learning_rate": 4.0576546951490225e-06, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 3759872, |
| "step": 6020 |
| }, |
| { |
| "epoch": 12.098393574297189, |
| "grad_norm": 0.017629623413085938, |
| "learning_rate": 4.049051470527692e-06, |
| "loss": 0.0741, |
| "num_input_tokens_seen": 3762848, |
| "step": 6025 |
| }, |
| { |
| "epoch": 12.108433734939759, |
| "grad_norm": 0.0402434803545475, |
| "learning_rate": 4.040451165974313e-06, |
| "loss": 0.0594, |
| "num_input_tokens_seen": 3766080, |
| "step": 6030 |
| }, |
| { |
| "epoch": 12.11847389558233, |
| "grad_norm": 0.3141787052154541, |
| "learning_rate": 4.031853807897759e-06, |
| "loss": 0.0397, |
| "num_input_tokens_seen": 3769216, |
| "step": 6035 |
| }, |
| { |
| "epoch": 12.1285140562249, |
| "grad_norm": 0.008833534084260464, |
| "learning_rate": 4.023259422697846e-06, |
| "loss": 0.0017, |
| "num_input_tokens_seen": 3772480, |
| "step": 6040 |
| }, |
| { |
| "epoch": 12.13855421686747, |
| "grad_norm": 38.78855895996094, |
| "learning_rate": 4.014668036765267e-06, |
| "loss": 0.0134, |
| "num_input_tokens_seen": 3776096, |
| "step": 6045 |
| }, |
| { |
| "epoch": 12.14859437751004, |
| "grad_norm": 0.13365915417671204, |
| "learning_rate": 4.006079676481504e-06, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 3779520, |
| "step": 6050 |
| }, |
| { |
| "epoch": 12.15863453815261, |
| "grad_norm": 0.04451864957809448, |
| "learning_rate": 3.997494368218745e-06, |
| "loss": 0.0451, |
| "num_input_tokens_seen": 3782560, |
| "step": 6055 |
| }, |
| { |
| "epoch": 12.168674698795181, |
| "grad_norm": 0.008807549253106117, |
| "learning_rate": 3.988912138339812e-06, |
| "loss": 0.0011, |
| "num_input_tokens_seen": 3785216, |
| "step": 6060 |
| }, |
| { |
| "epoch": 12.17871485943775, |
| "grad_norm": 0.057633060961961746, |
| "learning_rate": 3.980333013198067e-06, |
| "loss": 0.0215, |
| "num_input_tokens_seen": 3788256, |
| "step": 6065 |
| }, |
| { |
| "epoch": 12.188755020080322, |
| "grad_norm": 0.4633331894874573, |
| "learning_rate": 3.971757019137342e-06, |
| "loss": 0.0067, |
| "num_input_tokens_seen": 3791552, |
| "step": 6070 |
| }, |
| { |
| "epoch": 12.198795180722891, |
| "grad_norm": 0.022058850154280663, |
| "learning_rate": 3.9631841824918585e-06, |
| "loss": 0.0042, |
| "num_input_tokens_seen": 3795008, |
| "step": 6075 |
| }, |
| { |
| "epoch": 12.208835341365463, |
| "grad_norm": 25.464744567871094, |
| "learning_rate": 3.954614529586135e-06, |
| "loss": 0.0195, |
| "num_input_tokens_seen": 3797504, |
| "step": 6080 |
| }, |
| { |
| "epoch": 12.218875502008032, |
| "grad_norm": 0.02658812515437603, |
| "learning_rate": 3.946048086734921e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 3800768, |
| "step": 6085 |
| }, |
| { |
| "epoch": 12.228915662650602, |
| "grad_norm": 27.161558151245117, |
| "learning_rate": 3.9374848802430995e-06, |
| "loss": 0.0444, |
| "num_input_tokens_seen": 3804032, |
| "step": 6090 |
| }, |
| { |
| "epoch": 12.238955823293173, |
| "grad_norm": 0.11181746423244476, |
| "learning_rate": 3.928924936405625e-06, |
| "loss": 0.0084, |
| "num_input_tokens_seen": 3807360, |
| "step": 6095 |
| }, |
| { |
| "epoch": 12.248995983935743, |
| "grad_norm": 58.37928009033203, |
| "learning_rate": 3.920368281507431e-06, |
| "loss": 0.0319, |
| "num_input_tokens_seen": 3810304, |
| "step": 6100 |
| }, |
| { |
| "epoch": 12.259036144578314, |
| "grad_norm": 0.0796833336353302, |
| "learning_rate": 3.911814941823349e-06, |
| "loss": 0.0386, |
| "num_input_tokens_seen": 3813504, |
| "step": 6105 |
| }, |
| { |
| "epoch": 12.269076305220883, |
| "grad_norm": 0.011359083466231823, |
| "learning_rate": 3.9032649436180325e-06, |
| "loss": 0.0416, |
| "num_input_tokens_seen": 3815584, |
| "step": 6110 |
| }, |
| { |
| "epoch": 12.279116465863455, |
| "grad_norm": 0.0377965085208416, |
| "learning_rate": 3.894718313145873e-06, |
| "loss": 0.0038, |
| "num_input_tokens_seen": 3819360, |
| "step": 6115 |
| }, |
| { |
| "epoch": 12.289156626506024, |
| "grad_norm": 0.0067506153136491776, |
| "learning_rate": 3.88617507665092e-06, |
| "loss": 0.0426, |
| "num_input_tokens_seen": 3822336, |
| "step": 6120 |
| }, |
| { |
| "epoch": 12.299196787148594, |
| "grad_norm": 0.011676016263663769, |
| "learning_rate": 3.877635260366807e-06, |
| "loss": 0.0295, |
| "num_input_tokens_seen": 3825184, |
| "step": 6125 |
| }, |
| { |
| "epoch": 12.309236947791165, |
| "grad_norm": 0.05182220786809921, |
| "learning_rate": 3.869098890516656e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 3828864, |
| "step": 6130 |
| }, |
| { |
| "epoch": 12.319277108433734, |
| "grad_norm": 17.081266403198242, |
| "learning_rate": 3.8605659933130165e-06, |
| "loss": 0.0386, |
| "num_input_tokens_seen": 3831168, |
| "step": 6135 |
| }, |
| { |
| "epoch": 12.329317269076306, |
| "grad_norm": 0.012902950868010521, |
| "learning_rate": 3.852036594957762e-06, |
| "loss": 0.0117, |
| "num_input_tokens_seen": 3834304, |
| "step": 6140 |
| }, |
| { |
| "epoch": 12.339357429718875, |
| "grad_norm": 0.16895687580108643, |
| "learning_rate": 3.843510721642036e-06, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 3837792, |
| "step": 6145 |
| }, |
| { |
| "epoch": 12.349397590361447, |
| "grad_norm": 154.15745544433594, |
| "learning_rate": 3.834988399546145e-06, |
| "loss": 0.0596, |
| "num_input_tokens_seen": 3840736, |
| "step": 6150 |
| }, |
| { |
| "epoch": 12.359437751004016, |
| "grad_norm": 0.030498886480927467, |
| "learning_rate": 3.826469654839501e-06, |
| "loss": 0.0105, |
| "num_input_tokens_seen": 3843968, |
| "step": 6155 |
| }, |
| { |
| "epoch": 12.369477911646586, |
| "grad_norm": 0.03108968771994114, |
| "learning_rate": 3.817954513680524e-06, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 3846560, |
| "step": 6160 |
| }, |
| { |
| "epoch": 12.379518072289157, |
| "grad_norm": 16.620052337646484, |
| "learning_rate": 3.8094430022165713e-06, |
| "loss": 0.0571, |
| "num_input_tokens_seen": 3849728, |
| "step": 6165 |
| }, |
| { |
| "epoch": 12.389558232931726, |
| "grad_norm": 0.028063397854566574, |
| "learning_rate": 3.800935146583854e-06, |
| "loss": 0.0173, |
| "num_input_tokens_seen": 3852416, |
| "step": 6170 |
| }, |
| { |
| "epoch": 12.399598393574298, |
| "grad_norm": 0.2141476422548294, |
| "learning_rate": 3.7924309729073616e-06, |
| "loss": 0.0484, |
| "num_input_tokens_seen": 3855968, |
| "step": 6175 |
| }, |
| { |
| "epoch": 12.409638554216867, |
| "grad_norm": 11.455037117004395, |
| "learning_rate": 3.7839305073007675e-06, |
| "loss": 0.0015, |
| "num_input_tokens_seen": 3859552, |
| "step": 6180 |
| }, |
| { |
| "epoch": 12.419678714859439, |
| "grad_norm": 28.907880783081055, |
| "learning_rate": 3.775433775866369e-06, |
| "loss": 0.0115, |
| "num_input_tokens_seen": 3862112, |
| "step": 6185 |
| }, |
| { |
| "epoch": 12.429718875502008, |
| "grad_norm": 2.5717875957489014, |
| "learning_rate": 3.766940804694992e-06, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 3865536, |
| "step": 6190 |
| }, |
| { |
| "epoch": 12.439759036144578, |
| "grad_norm": 0.01706070452928543, |
| "learning_rate": 3.758451619865915e-06, |
| "loss": 0.0134, |
| "num_input_tokens_seen": 3868512, |
| "step": 6195 |
| }, |
| { |
| "epoch": 12.449799196787149, |
| "grad_norm": 4.302461624145508, |
| "learning_rate": 3.749966247446794e-06, |
| "loss": 0.0032, |
| "num_input_tokens_seen": 3870912, |
| "step": 6200 |
| }, |
| { |
| "epoch": 12.459839357429718, |
| "grad_norm": 12.282868385314941, |
| "learning_rate": 3.7414847134935716e-06, |
| "loss": 0.1196, |
| "num_input_tokens_seen": 3873568, |
| "step": 6205 |
| }, |
| { |
| "epoch": 12.46987951807229, |
| "grad_norm": 20.93626594543457, |
| "learning_rate": 3.7330070440504097e-06, |
| "loss": 0.0025, |
| "num_input_tokens_seen": 3876608, |
| "step": 6210 |
| }, |
| { |
| "epoch": 12.47991967871486, |
| "grad_norm": 0.0019983912352472544, |
| "learning_rate": 3.7245332651496038e-06, |
| "loss": 0.0249, |
| "num_input_tokens_seen": 3879232, |
| "step": 6215 |
| }, |
| { |
| "epoch": 12.48995983935743, |
| "grad_norm": 0.015782205387949944, |
| "learning_rate": 3.716063402811496e-06, |
| "loss": 0.0179, |
| "num_input_tokens_seen": 3882752, |
| "step": 6220 |
| }, |
| { |
| "epoch": 12.5, |
| "grad_norm": 0.0136459581553936, |
| "learning_rate": 3.707597483044411e-06, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 3885344, |
| "step": 6225 |
| }, |
| { |
| "epoch": 12.51004016064257, |
| "grad_norm": 4.084256172180176, |
| "learning_rate": 3.699135531844559e-06, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 3887648, |
| "step": 6230 |
| }, |
| { |
| "epoch": 12.52008032128514, |
| "grad_norm": 0.046021297574043274, |
| "learning_rate": 3.6906775751959667e-06, |
| "loss": 0.001, |
| "num_input_tokens_seen": 3891008, |
| "step": 6235 |
| }, |
| { |
| "epoch": 12.53012048192771, |
| "grad_norm": 0.012802932411432266, |
| "learning_rate": 3.682223639070398e-06, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 3894016, |
| "step": 6240 |
| }, |
| { |
| "epoch": 12.540160642570282, |
| "grad_norm": 0.008216789923608303, |
| "learning_rate": 3.673773749427266e-06, |
| "loss": 0.0022, |
| "num_input_tokens_seen": 3897056, |
| "step": 6245 |
| }, |
| { |
| "epoch": 12.550200803212851, |
| "grad_norm": 0.049423910677433014, |
| "learning_rate": 3.6653279322135637e-06, |
| "loss": 0.0298, |
| "num_input_tokens_seen": 3900064, |
| "step": 6250 |
| }, |
| { |
| "epoch": 12.560240963855422, |
| "grad_norm": 3.1109814643859863, |
| "learning_rate": 3.656886213363772e-06, |
| "loss": 0.0707, |
| "num_input_tokens_seen": 3903424, |
| "step": 6255 |
| }, |
| { |
| "epoch": 12.570281124497992, |
| "grad_norm": 0.013427079655230045, |
| "learning_rate": 3.6484486187997927e-06, |
| "loss": 0.0367, |
| "num_input_tokens_seen": 3906528, |
| "step": 6260 |
| }, |
| { |
| "epoch": 12.580321285140561, |
| "grad_norm": 0.43069183826446533, |
| "learning_rate": 3.640015174430864e-06, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 3909728, |
| "step": 6265 |
| }, |
| { |
| "epoch": 12.590361445783133, |
| "grad_norm": 0.037010353058576584, |
| "learning_rate": 3.6315859061534743e-06, |
| "loss": 0.1657, |
| "num_input_tokens_seen": 3913056, |
| "step": 6270 |
| }, |
| { |
| "epoch": 12.600401606425702, |
| "grad_norm": 1.318424940109253, |
| "learning_rate": 3.623160839851292e-06, |
| "loss": 0.1218, |
| "num_input_tokens_seen": 3916032, |
| "step": 6275 |
| }, |
| { |
| "epoch": 12.610441767068274, |
| "grad_norm": 0.23550517857074738, |
| "learning_rate": 3.6147400013950833e-06, |
| "loss": 0.0096, |
| "num_input_tokens_seen": 3919200, |
| "step": 6280 |
| }, |
| { |
| "epoch": 12.620481927710843, |
| "grad_norm": 0.05175193399190903, |
| "learning_rate": 3.60632341664263e-06, |
| "loss": 0.0043, |
| "num_input_tokens_seen": 3922048, |
| "step": 6285 |
| }, |
| { |
| "epoch": 12.630522088353414, |
| "grad_norm": 0.21679647266864777, |
| "learning_rate": 3.5979111114386556e-06, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 3926208, |
| "step": 6290 |
| }, |
| { |
| "epoch": 12.640562248995984, |
| "grad_norm": 0.030079122632741928, |
| "learning_rate": 3.5895031116147355e-06, |
| "loss": 0.038, |
| "num_input_tokens_seen": 3929792, |
| "step": 6295 |
| }, |
| { |
| "epoch": 12.650602409638553, |
| "grad_norm": 0.020691825076937675, |
| "learning_rate": 3.5810994429892343e-06, |
| "loss": 0.0355, |
| "num_input_tokens_seen": 3932768, |
| "step": 6300 |
| }, |
| { |
| "epoch": 12.660642570281125, |
| "grad_norm": 0.17100581526756287, |
| "learning_rate": 3.5727001313672073e-06, |
| "loss": 0.1505, |
| "num_input_tokens_seen": 3936032, |
| "step": 6305 |
| }, |
| { |
| "epoch": 12.670682730923694, |
| "grad_norm": 0.002263088943436742, |
| "learning_rate": 3.5643052025403366e-06, |
| "loss": 0.0018, |
| "num_input_tokens_seen": 3939136, |
| "step": 6310 |
| }, |
| { |
| "epoch": 12.680722891566266, |
| "grad_norm": 0.3044714033603668, |
| "learning_rate": 3.555914682286845e-06, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 3942688, |
| "step": 6315 |
| }, |
| { |
| "epoch": 12.690763052208835, |
| "grad_norm": 6.629093647003174, |
| "learning_rate": 3.547528596371418e-06, |
| "loss": 0.0031, |
| "num_input_tokens_seen": 3945472, |
| "step": 6320 |
| }, |
| { |
| "epoch": 12.700803212851406, |
| "grad_norm": 0.17978572845458984, |
| "learning_rate": 3.539146970545124e-06, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 3948224, |
| "step": 6325 |
| }, |
| { |
| "epoch": 12.710843373493976, |
| "grad_norm": 6.806268692016602, |
| "learning_rate": 3.530769830545333e-06, |
| "loss": 0.0669, |
| "num_input_tokens_seen": 3951840, |
| "step": 6330 |
| }, |
| { |
| "epoch": 12.720883534136545, |
| "grad_norm": 12.894400596618652, |
| "learning_rate": 3.5223972020956454e-06, |
| "loss": 0.1137, |
| "num_input_tokens_seen": 3955424, |
| "step": 6335 |
| }, |
| { |
| "epoch": 12.730923694779117, |
| "grad_norm": 1.2106947898864746, |
| "learning_rate": 3.514029110905809e-06, |
| "loss": 0.0008, |
| "num_input_tokens_seen": 3957952, |
| "step": 6340 |
| }, |
| { |
| "epoch": 12.740963855421686, |
| "grad_norm": 0.024472283199429512, |
| "learning_rate": 3.505665582671631e-06, |
| "loss": 0.0704, |
| "num_input_tokens_seen": 3961152, |
| "step": 6345 |
| }, |
| { |
| "epoch": 12.751004016064257, |
| "grad_norm": 0.2990010976791382, |
| "learning_rate": 3.4973066430749175e-06, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 3964480, |
| "step": 6350 |
| }, |
| { |
| "epoch": 12.761044176706827, |
| "grad_norm": 0.4378281831741333, |
| "learning_rate": 3.488952317783374e-06, |
| "loss": 0.024, |
| "num_input_tokens_seen": 3966912, |
| "step": 6355 |
| }, |
| { |
| "epoch": 12.771084337349398, |
| "grad_norm": 0.05266943201422691, |
| "learning_rate": 3.480602632450545e-06, |
| "loss": 0.0229, |
| "num_input_tokens_seen": 3969152, |
| "step": 6360 |
| }, |
| { |
| "epoch": 12.781124497991968, |
| "grad_norm": 0.08936101943254471, |
| "learning_rate": 3.4722576127157244e-06, |
| "loss": 0.0341, |
| "num_input_tokens_seen": 3972160, |
| "step": 6365 |
| }, |
| { |
| "epoch": 12.791164658634537, |
| "grad_norm": 0.012653462588787079, |
| "learning_rate": 3.4639172842038766e-06, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 3974784, |
| "step": 6370 |
| }, |
| { |
| "epoch": 12.801204819277109, |
| "grad_norm": 0.008684534579515457, |
| "learning_rate": 3.4555816725255666e-06, |
| "loss": 0.0721, |
| "num_input_tokens_seen": 3978592, |
| "step": 6375 |
| }, |
| { |
| "epoch": 12.811244979919678, |
| "grad_norm": 14.522257804870605, |
| "learning_rate": 3.447250803276869e-06, |
| "loss": 0.076, |
| "num_input_tokens_seen": 3982272, |
| "step": 6380 |
| }, |
| { |
| "epoch": 12.82128514056225, |
| "grad_norm": 0.026460807770490646, |
| "learning_rate": 3.438924702039301e-06, |
| "loss": 0.0026, |
| "num_input_tokens_seen": 3985344, |
| "step": 6385 |
| }, |
| { |
| "epoch": 12.831325301204819, |
| "grad_norm": 0.39210912585258484, |
| "learning_rate": 3.430603394379738e-06, |
| "loss": 0.003, |
| "num_input_tokens_seen": 3988064, |
| "step": 6390 |
| }, |
| { |
| "epoch": 12.84136546184739, |
| "grad_norm": 0.493512362241745, |
| "learning_rate": 3.422286905850332e-06, |
| "loss": 0.014, |
| "num_input_tokens_seen": 3990976, |
| "step": 6395 |
| }, |
| { |
| "epoch": 12.85140562248996, |
| "grad_norm": 0.03762149438261986, |
| "learning_rate": 3.4139752619884415e-06, |
| "loss": 0.1316, |
| "num_input_tokens_seen": 3994848, |
| "step": 6400 |
| }, |
| { |
| "epoch": 12.861445783132531, |
| "grad_norm": 23.084840774536133, |
| "learning_rate": 3.4056684883165454e-06, |
| "loss": 0.0486, |
| "num_input_tokens_seen": 3997984, |
| "step": 6405 |
| }, |
| { |
| "epoch": 12.8714859437751, |
| "grad_norm": 0.08375642448663712, |
| "learning_rate": 3.3973666103421675e-06, |
| "loss": 0.0143, |
| "num_input_tokens_seen": 4000896, |
| "step": 6410 |
| }, |
| { |
| "epoch": 12.88152610441767, |
| "grad_norm": 0.24988406896591187, |
| "learning_rate": 3.389069653557805e-06, |
| "loss": 0.0106, |
| "num_input_tokens_seen": 4003776, |
| "step": 6415 |
| }, |
| { |
| "epoch": 12.891566265060241, |
| "grad_norm": 0.08291018009185791, |
| "learning_rate": 3.3807776434408326e-06, |
| "loss": 0.0806, |
| "num_input_tokens_seen": 4006656, |
| "step": 6420 |
| }, |
| { |
| "epoch": 12.901606425702811, |
| "grad_norm": 14.322249412536621, |
| "learning_rate": 3.3724906054534434e-06, |
| "loss": 0.0295, |
| "num_input_tokens_seen": 4010432, |
| "step": 6425 |
| }, |
| { |
| "epoch": 12.911646586345382, |
| "grad_norm": 0.2638911008834839, |
| "learning_rate": 3.3642085650425625e-06, |
| "loss": 0.0012, |
| "num_input_tokens_seen": 4013312, |
| "step": 6430 |
| }, |
| { |
| "epoch": 12.921686746987952, |
| "grad_norm": 0.043951284140348434, |
| "learning_rate": 3.355931547639764e-06, |
| "loss": 0.0029, |
| "num_input_tokens_seen": 4016256, |
| "step": 6435 |
| }, |
| { |
| "epoch": 12.931726907630523, |
| "grad_norm": 10.951936721801758, |
| "learning_rate": 3.3476595786612044e-06, |
| "loss": 0.006, |
| "num_input_tokens_seen": 4019264, |
| "step": 6440 |
| }, |
| { |
| "epoch": 12.941767068273093, |
| "grad_norm": 0.3326930105686188, |
| "learning_rate": 3.3393926835075307e-06, |
| "loss": 0.0607, |
| "num_input_tokens_seen": 4022496, |
| "step": 6445 |
| }, |
| { |
| "epoch": 12.951807228915662, |
| "grad_norm": 0.16518734395503998, |
| "learning_rate": 3.331130887563815e-06, |
| "loss": 0.0022, |
| "num_input_tokens_seen": 4025504, |
| "step": 6450 |
| }, |
| { |
| "epoch": 12.961847389558233, |
| "grad_norm": 105.1984634399414, |
| "learning_rate": 3.322874216199471e-06, |
| "loss": 0.0381, |
| "num_input_tokens_seen": 4028672, |
| "step": 6455 |
| }, |
| { |
| "epoch": 12.971887550200803, |
| "grad_norm": 0.021035606041550636, |
| "learning_rate": 3.3146226947681724e-06, |
| "loss": 0.152, |
| "num_input_tokens_seen": 4032672, |
| "step": 6460 |
| }, |
| { |
| "epoch": 12.981927710843374, |
| "grad_norm": 7.944100856781006, |
| "learning_rate": 3.306376348607787e-06, |
| "loss": 0.0037, |
| "num_input_tokens_seen": 4035968, |
| "step": 6465 |
| }, |
| { |
| "epoch": 12.991967871485944, |
| "grad_norm": 0.0038000282365828753, |
| "learning_rate": 3.2981352030402795e-06, |
| "loss": 0.0083, |
| "num_input_tokens_seen": 4039200, |
| "step": 6470 |
| }, |
| { |
| "epoch": 13.002008032128513, |
| "grad_norm": 9.69698429107666, |
| "learning_rate": 3.289899283371657e-06, |
| "loss": 0.0037, |
| "num_input_tokens_seen": 4042080, |
| "step": 6475 |
| }, |
| { |
| "epoch": 13.012048192771084, |
| "grad_norm": 0.06602434813976288, |
| "learning_rate": 3.2816686148918708e-06, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 4045088, |
| "step": 6480 |
| }, |
| { |
| "epoch": 13.022088353413654, |
| "grad_norm": 0.011850385926663876, |
| "learning_rate": 3.2734432228747527e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4048736, |
| "step": 6485 |
| }, |
| { |
| "epoch": 13.032128514056225, |
| "grad_norm": 0.028685562312602997, |
| "learning_rate": 3.26522313257793e-06, |
| "loss": 0.0145, |
| "num_input_tokens_seen": 4052416, |
| "step": 6490 |
| }, |
| { |
| "epoch": 13.042168674698795, |
| "grad_norm": 0.20602266490459442, |
| "learning_rate": 3.2570083692427474e-06, |
| "loss": 0.0338, |
| "num_input_tokens_seen": 4055328, |
| "step": 6495 |
| }, |
| { |
| "epoch": 13.052208835341366, |
| "grad_norm": 0.17084050178527832, |
| "learning_rate": 3.248798958094197e-06, |
| "loss": 0.0124, |
| "num_input_tokens_seen": 4058496, |
| "step": 6500 |
| }, |
| { |
| "epoch": 13.062248995983936, |
| "grad_norm": 56.381507873535156, |
| "learning_rate": 3.240594924340835e-06, |
| "loss": 0.018, |
| "num_input_tokens_seen": 4060832, |
| "step": 6505 |
| }, |
| { |
| "epoch": 13.072289156626505, |
| "grad_norm": 0.294530987739563, |
| "learning_rate": 3.232396293174702e-06, |
| "loss": 0.0559, |
| "num_input_tokens_seen": 4063584, |
| "step": 6510 |
| }, |
| { |
| "epoch": 13.082329317269076, |
| "grad_norm": 0.17592327296733856, |
| "learning_rate": 3.224203089771254e-06, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 4066368, |
| "step": 6515 |
| }, |
| { |
| "epoch": 13.092369477911646, |
| "grad_norm": 0.05794261023402214, |
| "learning_rate": 3.2160153392892737e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4069312, |
| "step": 6520 |
| }, |
| { |
| "epoch": 13.102409638554217, |
| "grad_norm": 0.054852936416864395, |
| "learning_rate": 3.2078330668708057e-06, |
| "loss": 0.0104, |
| "num_input_tokens_seen": 4072416, |
| "step": 6525 |
| }, |
| { |
| "epoch": 13.112449799196787, |
| "grad_norm": 0.00928029976785183, |
| "learning_rate": 3.19965629764107e-06, |
| "loss": 0.0019, |
| "num_input_tokens_seen": 4075424, |
| "step": 6530 |
| }, |
| { |
| "epoch": 13.122489959839358, |
| "grad_norm": 0.12830331921577454, |
| "learning_rate": 3.1914850567083866e-06, |
| "loss": 0.028, |
| "num_input_tokens_seen": 4078656, |
| "step": 6535 |
| }, |
| { |
| "epoch": 13.132530120481928, |
| "grad_norm": 0.1490611582994461, |
| "learning_rate": 3.1833193691641045e-06, |
| "loss": 0.061, |
| "num_input_tokens_seen": 4081216, |
| "step": 6540 |
| }, |
| { |
| "epoch": 13.142570281124499, |
| "grad_norm": 1.6569007635116577, |
| "learning_rate": 3.1751592600825143e-06, |
| "loss": 0.0281, |
| "num_input_tokens_seen": 4084256, |
| "step": 6545 |
| }, |
| { |
| "epoch": 13.152610441767068, |
| "grad_norm": 0.006913966964930296, |
| "learning_rate": 3.1670047545207817e-06, |
| "loss": 0.0015, |
| "num_input_tokens_seen": 4087712, |
| "step": 6550 |
| }, |
| { |
| "epoch": 13.162650602409638, |
| "grad_norm": 0.2316937893629074, |
| "learning_rate": 3.1588558775188647e-06, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 4090464, |
| "step": 6555 |
| }, |
| { |
| "epoch": 13.17269076305221, |
| "grad_norm": 0.1495800018310547, |
| "learning_rate": 3.1507126540994337e-06, |
| "loss": 0.0249, |
| "num_input_tokens_seen": 4093600, |
| "step": 6560 |
| }, |
| { |
| "epoch": 13.182730923694779, |
| "grad_norm": 0.04467432200908661, |
| "learning_rate": 3.1425751092678064e-06, |
| "loss": 0.019, |
| "num_input_tokens_seen": 4096864, |
| "step": 6565 |
| }, |
| { |
| "epoch": 13.19277108433735, |
| "grad_norm": 1.2640222311019897, |
| "learning_rate": 3.134443268011855e-06, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 4100480, |
| "step": 6570 |
| }, |
| { |
| "epoch": 13.20281124497992, |
| "grad_norm": 0.034024544060230255, |
| "learning_rate": 3.126317155301941e-06, |
| "loss": 0.1201, |
| "num_input_tokens_seen": 4103712, |
| "step": 6575 |
| }, |
| { |
| "epoch": 13.21285140562249, |
| "grad_norm": 2.878603219985962, |
| "learning_rate": 3.11819679609084e-06, |
| "loss": 0.0016, |
| "num_input_tokens_seen": 4106976, |
| "step": 6580 |
| }, |
| { |
| "epoch": 13.22289156626506, |
| "grad_norm": 8.289514541625977, |
| "learning_rate": 3.1100822153136513e-06, |
| "loss": 0.0047, |
| "num_input_tokens_seen": 4110464, |
| "step": 6585 |
| }, |
| { |
| "epoch": 13.23293172690763, |
| "grad_norm": 0.03438607603311539, |
| "learning_rate": 3.1019734378877403e-06, |
| "loss": 0.012, |
| "num_input_tokens_seen": 4113600, |
| "step": 6590 |
| }, |
| { |
| "epoch": 13.242971887550201, |
| "grad_norm": 0.005333933513611555, |
| "learning_rate": 3.0938704887126425e-06, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 4116800, |
| "step": 6595 |
| }, |
| { |
| "epoch": 13.25301204819277, |
| "grad_norm": 0.6303266286849976, |
| "learning_rate": 3.0857733926700033e-06, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 4120256, |
| "step": 6600 |
| }, |
| { |
| "epoch": 13.263052208835342, |
| "grad_norm": 75.38514709472656, |
| "learning_rate": 3.077682174623495e-06, |
| "loss": 0.0218, |
| "num_input_tokens_seen": 4123136, |
| "step": 6605 |
| }, |
| { |
| "epoch": 13.273092369477911, |
| "grad_norm": 0.02914111502468586, |
| "learning_rate": 3.0695968594187366e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4126752, |
| "step": 6610 |
| }, |
| { |
| "epoch": 13.283132530120483, |
| "grad_norm": 0.02487659826874733, |
| "learning_rate": 3.0615174718832218e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4130080, |
| "step": 6615 |
| }, |
| { |
| "epoch": 13.293172690763052, |
| "grad_norm": 0.07562565803527832, |
| "learning_rate": 3.053444036826246e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4133184, |
| "step": 6620 |
| }, |
| { |
| "epoch": 13.303212851405622, |
| "grad_norm": 0.03663274273276329, |
| "learning_rate": 3.045376579038821e-06, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 4136192, |
| "step": 6625 |
| }, |
| { |
| "epoch": 13.313253012048193, |
| "grad_norm": 0.007939444854855537, |
| "learning_rate": 3.037315123293611e-06, |
| "loss": 0.1104, |
| "num_input_tokens_seen": 4139552, |
| "step": 6630 |
| }, |
| { |
| "epoch": 13.323293172690763, |
| "grad_norm": 0.00802676472812891, |
| "learning_rate": 3.0292596943448416e-06, |
| "loss": 0.0125, |
| "num_input_tokens_seen": 4143040, |
| "step": 6635 |
| }, |
| { |
| "epoch": 13.333333333333334, |
| "grad_norm": 0.20601074397563934, |
| "learning_rate": 3.0212103169282415e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4146240, |
| "step": 6640 |
| }, |
| { |
| "epoch": 13.343373493975903, |
| "grad_norm": 0.12495647370815277, |
| "learning_rate": 3.013167015760946e-06, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 4150272, |
| "step": 6645 |
| }, |
| { |
| "epoch": 13.353413654618475, |
| "grad_norm": 1.3309050798416138, |
| "learning_rate": 3.0051298155414426e-06, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 4154624, |
| "step": 6650 |
| }, |
| { |
| "epoch": 13.363453815261044, |
| "grad_norm": 0.1865474134683609, |
| "learning_rate": 2.9970987409494784e-06, |
| "loss": 0.0158, |
| "num_input_tokens_seen": 4157152, |
| "step": 6655 |
| }, |
| { |
| "epoch": 13.373493975903614, |
| "grad_norm": 0.7567386031150818, |
| "learning_rate": 2.989073816645992e-06, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 4159552, |
| "step": 6660 |
| }, |
| { |
| "epoch": 13.383534136546185, |
| "grad_norm": 1.041918158531189, |
| "learning_rate": 2.9810550672730367e-06, |
| "loss": 0.0344, |
| "num_input_tokens_seen": 4163008, |
| "step": 6665 |
| }, |
| { |
| "epoch": 13.393574297188755, |
| "grad_norm": 0.1040244847536087, |
| "learning_rate": 2.9730425174537057e-06, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 4166432, |
| "step": 6670 |
| }, |
| { |
| "epoch": 13.403614457831326, |
| "grad_norm": 0.002490447601303458, |
| "learning_rate": 2.965036191792052e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4169472, |
| "step": 6675 |
| }, |
| { |
| "epoch": 13.413654618473895, |
| "grad_norm": 0.009692768566310406, |
| "learning_rate": 2.9570361148730213e-06, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 4172704, |
| "step": 6680 |
| }, |
| { |
| "epoch": 13.423694779116467, |
| "grad_norm": 0.05210626497864723, |
| "learning_rate": 2.9490423112623646e-06, |
| "loss": 0.0648, |
| "num_input_tokens_seen": 4176000, |
| "step": 6685 |
| }, |
| { |
| "epoch": 13.433734939759036, |
| "grad_norm": 0.009161800146102905, |
| "learning_rate": 2.9410548055065748e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4178720, |
| "step": 6690 |
| }, |
| { |
| "epoch": 13.443775100401606, |
| "grad_norm": 0.010142548009753227, |
| "learning_rate": 2.933073622132806e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4181760, |
| "step": 6695 |
| }, |
| { |
| "epoch": 13.453815261044177, |
| "grad_norm": 8.248490333557129, |
| "learning_rate": 2.9250987856487932e-06, |
| "loss": 0.0604, |
| "num_input_tokens_seen": 4185152, |
| "step": 6700 |
| }, |
| { |
| "epoch": 13.463855421686747, |
| "grad_norm": 0.09948313981294632, |
| "learning_rate": 2.9171303205427883e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4188320, |
| "step": 6705 |
| }, |
| { |
| "epoch": 13.473895582329318, |
| "grad_norm": 0.012316162697970867, |
| "learning_rate": 2.909168251283474e-06, |
| "loss": 0.0507, |
| "num_input_tokens_seen": 4191776, |
| "step": 6710 |
| }, |
| { |
| "epoch": 13.483935742971887, |
| "grad_norm": 0.01065347995609045, |
| "learning_rate": 2.9012126023198973e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4194752, |
| "step": 6715 |
| }, |
| { |
| "epoch": 13.493975903614459, |
| "grad_norm": 0.005095439963042736, |
| "learning_rate": 2.893263398081386e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4197280, |
| "step": 6720 |
| }, |
| { |
| "epoch": 13.504016064257028, |
| "grad_norm": 0.004570506047457457, |
| "learning_rate": 2.8853206629774823e-06, |
| "loss": 0.0579, |
| "num_input_tokens_seen": 4200736, |
| "step": 6725 |
| }, |
| { |
| "epoch": 13.514056224899598, |
| "grad_norm": 0.0041636135429143906, |
| "learning_rate": 2.877384421397862e-06, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 4203968, |
| "step": 6730 |
| }, |
| { |
| "epoch": 13.524096385542169, |
| "grad_norm": 0.006220974028110504, |
| "learning_rate": 2.8694546977122595e-06, |
| "loss": 0.0273, |
| "num_input_tokens_seen": 4206528, |
| "step": 6735 |
| }, |
| { |
| "epoch": 13.534136546184738, |
| "grad_norm": 0.13364745676517487, |
| "learning_rate": 2.8615315162703962e-06, |
| "loss": 0.0669, |
| "num_input_tokens_seen": 4209472, |
| "step": 6740 |
| }, |
| { |
| "epoch": 13.54417670682731, |
| "grad_norm": 0.054688554257154465, |
| "learning_rate": 2.853614901401909e-06, |
| "loss": 0.0193, |
| "num_input_tokens_seen": 4212960, |
| "step": 6745 |
| }, |
| { |
| "epoch": 13.55421686746988, |
| "grad_norm": 0.6423424482345581, |
| "learning_rate": 2.84570487741626e-06, |
| "loss": 0.0445, |
| "num_input_tokens_seen": 4216160, |
| "step": 6750 |
| }, |
| { |
| "epoch": 13.56425702811245, |
| "grad_norm": 0.0701519250869751, |
| "learning_rate": 2.837801468602687e-06, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 4219232, |
| "step": 6755 |
| }, |
| { |
| "epoch": 13.57429718875502, |
| "grad_norm": 20.19487953186035, |
| "learning_rate": 2.8299046992300995e-06, |
| "loss": 0.1348, |
| "num_input_tokens_seen": 4221920, |
| "step": 6760 |
| }, |
| { |
| "epoch": 13.58433734939759, |
| "grad_norm": 0.07352690398693085, |
| "learning_rate": 2.8220145935470276e-06, |
| "loss": 0.0464, |
| "num_input_tokens_seen": 4225152, |
| "step": 6765 |
| }, |
| { |
| "epoch": 13.594377510040161, |
| "grad_norm": 0.26299330592155457, |
| "learning_rate": 2.8141311757815454e-06, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 4228736, |
| "step": 6770 |
| }, |
| { |
| "epoch": 13.60441767068273, |
| "grad_norm": 0.006310007069259882, |
| "learning_rate": 2.806254470141174e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4231872, |
| "step": 6775 |
| }, |
| { |
| "epoch": 13.614457831325302, |
| "grad_norm": 76.36661529541016, |
| "learning_rate": 2.798384500812842e-06, |
| "loss": 0.028, |
| "num_input_tokens_seen": 4234784, |
| "step": 6780 |
| }, |
| { |
| "epoch": 13.624497991967871, |
| "grad_norm": 0.17646919190883636, |
| "learning_rate": 2.790521291962775e-06, |
| "loss": 0.0424, |
| "num_input_tokens_seen": 4237696, |
| "step": 6785 |
| }, |
| { |
| "epoch": 13.634538152610443, |
| "grad_norm": 42.649139404296875, |
| "learning_rate": 2.7826648677364555e-06, |
| "loss": 0.0216, |
| "num_input_tokens_seen": 4240928, |
| "step": 6790 |
| }, |
| { |
| "epoch": 13.644578313253012, |
| "grad_norm": 0.010923897847533226, |
| "learning_rate": 2.774815252258522e-06, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 4244480, |
| "step": 6795 |
| }, |
| { |
| "epoch": 13.654618473895582, |
| "grad_norm": 23.903295516967773, |
| "learning_rate": 2.7669724696327094e-06, |
| "loss": 0.0276, |
| "num_input_tokens_seen": 4247552, |
| "step": 6800 |
| }, |
| { |
| "epoch": 13.664658634538153, |
| "grad_norm": 0.018534662202000618, |
| "learning_rate": 2.759136543941773e-06, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 4250304, |
| "step": 6805 |
| }, |
| { |
| "epoch": 13.674698795180722, |
| "grad_norm": 0.014366156421601772, |
| "learning_rate": 2.751307499247403e-06, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 4254016, |
| "step": 6810 |
| }, |
| { |
| "epoch": 13.684738955823294, |
| "grad_norm": 0.11602424085140228, |
| "learning_rate": 2.743485359590173e-06, |
| "loss": 0.011, |
| "num_input_tokens_seen": 4256704, |
| "step": 6815 |
| }, |
| { |
| "epoch": 13.694779116465863, |
| "grad_norm": 0.0550687350332737, |
| "learning_rate": 2.7356701489894468e-06, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 4259584, |
| "step": 6820 |
| }, |
| { |
| "epoch": 13.704819277108435, |
| "grad_norm": 41.35027313232422, |
| "learning_rate": 2.7278618914433105e-06, |
| "loss": 0.0145, |
| "num_input_tokens_seen": 4262368, |
| "step": 6825 |
| }, |
| { |
| "epoch": 13.714859437751004, |
| "grad_norm": 0.009249622002243996, |
| "learning_rate": 2.720060610928501e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4265792, |
| "step": 6830 |
| }, |
| { |
| "epoch": 13.724899598393574, |
| "grad_norm": 0.05187319219112396, |
| "learning_rate": 2.712266331400332e-06, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 4268448, |
| "step": 6835 |
| }, |
| { |
| "epoch": 13.734939759036145, |
| "grad_norm": 0.032774023711681366, |
| "learning_rate": 2.704479076792618e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4272192, |
| "step": 6840 |
| }, |
| { |
| "epoch": 13.744979919678714, |
| "grad_norm": 0.01256249938160181, |
| "learning_rate": 2.696698871017601e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4275040, |
| "step": 6845 |
| }, |
| { |
| "epoch": 13.755020080321286, |
| "grad_norm": 0.006609546486288309, |
| "learning_rate": 2.6889257379658804e-06, |
| "loss": 0.0166, |
| "num_input_tokens_seen": 4278144, |
| "step": 6850 |
| }, |
| { |
| "epoch": 13.765060240963855, |
| "grad_norm": 0.03675635904073715, |
| "learning_rate": 2.6811597015063373e-06, |
| "loss": 0.0872, |
| "num_input_tokens_seen": 4281344, |
| "step": 6855 |
| }, |
| { |
| "epoch": 13.775100401606426, |
| "grad_norm": 38.53529739379883, |
| "learning_rate": 2.6734007854860596e-06, |
| "loss": 0.034, |
| "num_input_tokens_seen": 4284032, |
| "step": 6860 |
| }, |
| { |
| "epoch": 13.785140562248996, |
| "grad_norm": 1.9347114562988281, |
| "learning_rate": 2.66564901373027e-06, |
| "loss": 0.0428, |
| "num_input_tokens_seen": 4286848, |
| "step": 6865 |
| }, |
| { |
| "epoch": 13.795180722891565, |
| "grad_norm": 28.785078048706055, |
| "learning_rate": 2.657904410042261e-06, |
| "loss": 0.048, |
| "num_input_tokens_seen": 4289536, |
| "step": 6870 |
| }, |
| { |
| "epoch": 13.805220883534137, |
| "grad_norm": 0.011087162420153618, |
| "learning_rate": 2.6501669982033006e-06, |
| "loss": 0.0023, |
| "num_input_tokens_seen": 4292960, |
| "step": 6875 |
| }, |
| { |
| "epoch": 13.815261044176706, |
| "grad_norm": 1.338516116142273, |
| "learning_rate": 2.6424368019725877e-06, |
| "loss": 0.0009, |
| "num_input_tokens_seen": 4296064, |
| "step": 6880 |
| }, |
| { |
| "epoch": 13.825301204819278, |
| "grad_norm": 0.04255475848913193, |
| "learning_rate": 2.634713845087152e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4299744, |
| "step": 6885 |
| }, |
| { |
| "epoch": 13.835341365461847, |
| "grad_norm": 0.021324431523680687, |
| "learning_rate": 2.626998151261798e-06, |
| "loss": 0.0173, |
| "num_input_tokens_seen": 4302912, |
| "step": 6890 |
| }, |
| { |
| "epoch": 13.845381526104418, |
| "grad_norm": 0.011794524267315865, |
| "learning_rate": 2.6192897441890337e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4306464, |
| "step": 6895 |
| }, |
| { |
| "epoch": 13.855421686746988, |
| "grad_norm": 253.84390258789062, |
| "learning_rate": 2.6115886475389786e-06, |
| "loss": 0.0425, |
| "num_input_tokens_seen": 4310240, |
| "step": 6900 |
| }, |
| { |
| "epoch": 13.865461847389557, |
| "grad_norm": 0.6399533152580261, |
| "learning_rate": 2.603894884959317e-06, |
| "loss": 0.1078, |
| "num_input_tokens_seen": 4313568, |
| "step": 6905 |
| }, |
| { |
| "epoch": 13.875502008032129, |
| "grad_norm": 0.027019036933779716, |
| "learning_rate": 2.5962084800752064e-06, |
| "loss": 0.0262, |
| "num_input_tokens_seen": 4316832, |
| "step": 6910 |
| }, |
| { |
| "epoch": 13.885542168674698, |
| "grad_norm": 0.009591503068804741, |
| "learning_rate": 2.588529456489211e-06, |
| "loss": 0.002, |
| "num_input_tokens_seen": 4319904, |
| "step": 6915 |
| }, |
| { |
| "epoch": 13.89558232931727, |
| "grad_norm": 0.010872164741158485, |
| "learning_rate": 2.580857837781231e-06, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 4322496, |
| "step": 6920 |
| }, |
| { |
| "epoch": 13.905622489959839, |
| "grad_norm": 0.021072586998343468, |
| "learning_rate": 2.573193647508426e-06, |
| "loss": 0.0508, |
| "num_input_tokens_seen": 4325696, |
| "step": 6925 |
| }, |
| { |
| "epoch": 13.91566265060241, |
| "grad_norm": 2.339646577835083, |
| "learning_rate": 2.5655369092051495e-06, |
| "loss": 0.0536, |
| "num_input_tokens_seen": 4328672, |
| "step": 6930 |
| }, |
| { |
| "epoch": 13.92570281124498, |
| "grad_norm": 0.1687999665737152, |
| "learning_rate": 2.557887646382868e-06, |
| "loss": 0.03, |
| "num_input_tokens_seen": 4331680, |
| "step": 6935 |
| }, |
| { |
| "epoch": 13.93574297188755, |
| "grad_norm": 0.004229373764246702, |
| "learning_rate": 2.5502458825300956e-06, |
| "loss": 0.0061, |
| "num_input_tokens_seen": 4334688, |
| "step": 6940 |
| }, |
| { |
| "epoch": 13.94578313253012, |
| "grad_norm": 25.813377380371094, |
| "learning_rate": 2.542611641112318e-06, |
| "loss": 0.0367, |
| "num_input_tokens_seen": 4338240, |
| "step": 6945 |
| }, |
| { |
| "epoch": 13.95582329317269, |
| "grad_norm": 0.0337495282292366, |
| "learning_rate": 2.534984945571923e-06, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 4341824, |
| "step": 6950 |
| }, |
| { |
| "epoch": 13.965863453815262, |
| "grad_norm": 0.009060739539563656, |
| "learning_rate": 2.5273658193281252e-06, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 4344800, |
| "step": 6955 |
| }, |
| { |
| "epoch": 13.975903614457831, |
| "grad_norm": 182.3167266845703, |
| "learning_rate": 2.519754285776903e-06, |
| "loss": 0.0111, |
| "num_input_tokens_seen": 4347936, |
| "step": 6960 |
| }, |
| { |
| "epoch": 13.985943775100402, |
| "grad_norm": 0.032948389649391174, |
| "learning_rate": 2.5121503682909095e-06, |
| "loss": 0.0019, |
| "num_input_tokens_seen": 4350976, |
| "step": 6965 |
| }, |
| { |
| "epoch": 13.995983935742972, |
| "grad_norm": 3.3692076206207275, |
| "learning_rate": 2.504554090219418e-06, |
| "loss": 0.0009, |
| "num_input_tokens_seen": 4354016, |
| "step": 6970 |
| }, |
| { |
| "epoch": 14.0, |
| "eval_loss": 0.8435496687889099, |
| "eval_runtime": 8.0735, |
| "eval_samples_per_second": 61.684, |
| "eval_steps_per_second": 15.483, |
| "num_input_tokens_seen": 4355328, |
| "step": 6972 |
| }, |
| { |
| "epoch": 14.006024096385541, |
| "grad_norm": 0.05708456039428711, |
| "learning_rate": 2.496965474888243e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4356832, |
| "step": 6975 |
| }, |
| { |
| "epoch": 14.016064257028113, |
| "grad_norm": 0.002826336305588484, |
| "learning_rate": 2.489384545599666e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4360320, |
| "step": 6980 |
| }, |
| { |
| "epoch": 14.026104417670682, |
| "grad_norm": 0.015671400353312492, |
| "learning_rate": 2.4818113256323745e-06, |
| "loss": 0.0025, |
| "num_input_tokens_seen": 4363424, |
| "step": 6985 |
| }, |
| { |
| "epoch": 14.036144578313253, |
| "grad_norm": 0.03397877886891365, |
| "learning_rate": 2.474245838241371e-06, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 4366240, |
| "step": 6990 |
| }, |
| { |
| "epoch": 14.046184738955823, |
| "grad_norm": 0.002914144191890955, |
| "learning_rate": 2.466688106657927e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4368704, |
| "step": 6995 |
| }, |
| { |
| "epoch": 14.056224899598394, |
| "grad_norm": 0.005297825671732426, |
| "learning_rate": 2.459138154089486e-06, |
| "loss": 0.0157, |
| "num_input_tokens_seen": 4372320, |
| "step": 7000 |
| }, |
| { |
| "epoch": 14.066265060240964, |
| "grad_norm": 0.021198710426688194, |
| "learning_rate": 2.4515960037196146e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4375104, |
| "step": 7005 |
| }, |
| { |
| "epoch": 14.076305220883533, |
| "grad_norm": 0.1993994563817978, |
| "learning_rate": 2.444061678707915e-06, |
| "loss": 0.017, |
| "num_input_tokens_seen": 4377888, |
| "step": 7010 |
| }, |
| { |
| "epoch": 14.086345381526105, |
| "grad_norm": 0.2237624228000641, |
| "learning_rate": 2.4365352021899635e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4381536, |
| "step": 7015 |
| }, |
| { |
| "epoch": 14.096385542168674, |
| "grad_norm": 0.02824407070875168, |
| "learning_rate": 2.4290165972772363e-06, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 4384096, |
| "step": 7020 |
| }, |
| { |
| "epoch": 14.106425702811245, |
| "grad_norm": 0.0019500048365443945, |
| "learning_rate": 2.42150588705703e-06, |
| "loss": 0.0072, |
| "num_input_tokens_seen": 4387168, |
| "step": 7025 |
| }, |
| { |
| "epoch": 14.116465863453815, |
| "grad_norm": 34.514892578125, |
| "learning_rate": 2.4140030945924137e-06, |
| "loss": 0.0612, |
| "num_input_tokens_seen": 4389728, |
| "step": 7030 |
| }, |
| { |
| "epoch": 14.126506024096386, |
| "grad_norm": 0.023807184770703316, |
| "learning_rate": 2.4065082429221315e-06, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 4393184, |
| "step": 7035 |
| }, |
| { |
| "epoch": 14.136546184738956, |
| "grad_norm": 0.007570713758468628, |
| "learning_rate": 2.3990213550605496e-06, |
| "loss": 0.0024, |
| "num_input_tokens_seen": 4396608, |
| "step": 7040 |
| }, |
| { |
| "epoch": 14.146586345381525, |
| "grad_norm": 0.03999396786093712, |
| "learning_rate": 2.391542453997578e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4399520, |
| "step": 7045 |
| }, |
| { |
| "epoch": 14.156626506024097, |
| "grad_norm": 4.837028503417969, |
| "learning_rate": 2.3840715626986016e-06, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 4402784, |
| "step": 7050 |
| }, |
| { |
| "epoch": 14.166666666666666, |
| "grad_norm": 0.004308843053877354, |
| "learning_rate": 2.37660870410441e-06, |
| "loss": 0.0016, |
| "num_input_tokens_seen": 4406016, |
| "step": 7055 |
| }, |
| { |
| "epoch": 14.176706827309237, |
| "grad_norm": 0.013919214718043804, |
| "learning_rate": 2.3691539011311276e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4409600, |
| "step": 7060 |
| }, |
| { |
| "epoch": 14.186746987951807, |
| "grad_norm": 0.06342492997646332, |
| "learning_rate": 2.3617071766701415e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4412352, |
| "step": 7065 |
| }, |
| { |
| "epoch": 14.196787148594378, |
| "grad_norm": 0.05328844487667084, |
| "learning_rate": 2.354268553588033e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4415072, |
| "step": 7070 |
| }, |
| { |
| "epoch": 14.206827309236948, |
| "grad_norm": 0.01577562279999256, |
| "learning_rate": 2.346838054726505e-06, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 4418848, |
| "step": 7075 |
| }, |
| { |
| "epoch": 14.216867469879517, |
| "grad_norm": 1.3071502447128296, |
| "learning_rate": 2.3394157029023145e-06, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 4421664, |
| "step": 7080 |
| }, |
| { |
| "epoch": 14.226907630522089, |
| "grad_norm": 0.034468941390514374, |
| "learning_rate": 2.3320015209072056e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4424736, |
| "step": 7085 |
| }, |
| { |
| "epoch": 14.236947791164658, |
| "grad_norm": 102.81072998046875, |
| "learning_rate": 2.324595531507827e-06, |
| "loss": 0.0074, |
| "num_input_tokens_seen": 4427296, |
| "step": 7090 |
| }, |
| { |
| "epoch": 14.24698795180723, |
| "grad_norm": 0.0323479101061821, |
| "learning_rate": 2.317197757445676e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4430848, |
| "step": 7095 |
| }, |
| { |
| "epoch": 14.257028112449799, |
| "grad_norm": 0.03737180680036545, |
| "learning_rate": 2.309808221437022e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4433536, |
| "step": 7100 |
| }, |
| { |
| "epoch": 14.26706827309237, |
| "grad_norm": 0.0033175817225128412, |
| "learning_rate": 2.302426946172836e-06, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 4436544, |
| "step": 7105 |
| }, |
| { |
| "epoch": 14.27710843373494, |
| "grad_norm": 7.275900363922119, |
| "learning_rate": 2.295053954318731e-06, |
| "loss": 0.0113, |
| "num_input_tokens_seen": 4439424, |
| "step": 7110 |
| }, |
| { |
| "epoch": 14.28714859437751, |
| "grad_norm": 0.00315207545645535, |
| "learning_rate": 2.2876892685148696e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4442400, |
| "step": 7115 |
| }, |
| { |
| "epoch": 14.29718875502008, |
| "grad_norm": 0.015147917903959751, |
| "learning_rate": 2.2803329113759256e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4445408, |
| "step": 7120 |
| }, |
| { |
| "epoch": 14.30722891566265, |
| "grad_norm": 36.320823669433594, |
| "learning_rate": 2.2729849054909812e-06, |
| "loss": 0.0087, |
| "num_input_tokens_seen": 4448928, |
| "step": 7125 |
| }, |
| { |
| "epoch": 14.317269076305221, |
| "grad_norm": 0.006546009331941605, |
| "learning_rate": 2.26564527342349e-06, |
| "loss": 0.1558, |
| "num_input_tokens_seen": 4452416, |
| "step": 7130 |
| }, |
| { |
| "epoch": 14.32730923694779, |
| "grad_norm": 0.018252495676279068, |
| "learning_rate": 2.258314037711184e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4454976, |
| "step": 7135 |
| }, |
| { |
| "epoch": 14.337349397590362, |
| "grad_norm": 0.1240961030125618, |
| "learning_rate": 2.2509912208660125e-06, |
| "loss": 0.0016, |
| "num_input_tokens_seen": 4457984, |
| "step": 7140 |
| }, |
| { |
| "epoch": 14.347389558232932, |
| "grad_norm": 25.80049705505371, |
| "learning_rate": 2.2436768453740743e-06, |
| "loss": 0.0348, |
| "num_input_tokens_seen": 4460992, |
| "step": 7145 |
| }, |
| { |
| "epoch": 14.357429718875501, |
| "grad_norm": 0.013544696383178234, |
| "learning_rate": 2.236370933695549e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4463904, |
| "step": 7150 |
| }, |
| { |
| "epoch": 14.367469879518072, |
| "grad_norm": 0.022857604548335075, |
| "learning_rate": 2.2290735082646254e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4466656, |
| "step": 7155 |
| }, |
| { |
| "epoch": 14.377510040160642, |
| "grad_norm": 352.6915283203125, |
| "learning_rate": 2.2217845914894315e-06, |
| "loss": 0.0789, |
| "num_input_tokens_seen": 4470208, |
| "step": 7160 |
| }, |
| { |
| "epoch": 14.387550200803213, |
| "grad_norm": 0.05897314473986626, |
| "learning_rate": 2.214504205751971e-06, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 4474144, |
| "step": 7165 |
| }, |
| { |
| "epoch": 14.397590361445783, |
| "grad_norm": 0.042249646037817, |
| "learning_rate": 2.2072323734080503e-06, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 4477184, |
| "step": 7170 |
| }, |
| { |
| "epoch": 14.407630522088354, |
| "grad_norm": 0.08079247176647186, |
| "learning_rate": 2.1999691167872107e-06, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 4480064, |
| "step": 7175 |
| }, |
| { |
| "epoch": 14.417670682730924, |
| "grad_norm": 0.4898930788040161, |
| "learning_rate": 2.1927144581926597e-06, |
| "loss": 0.034, |
| "num_input_tokens_seen": 4483616, |
| "step": 7180 |
| }, |
| { |
| "epoch": 14.427710843373493, |
| "grad_norm": 0.01570323295891285, |
| "learning_rate": 2.1854684199012036e-06, |
| "loss": 0.0182, |
| "num_input_tokens_seen": 4487488, |
| "step": 7185 |
| }, |
| { |
| "epoch": 14.437751004016064, |
| "grad_norm": 0.021506065502762794, |
| "learning_rate": 2.178231024163179e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4489696, |
| "step": 7190 |
| }, |
| { |
| "epoch": 14.447791164658634, |
| "grad_norm": 0.21415483951568604, |
| "learning_rate": 2.1710022932023805e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4493088, |
| "step": 7195 |
| }, |
| { |
| "epoch": 14.457831325301205, |
| "grad_norm": 0.007669499143958092, |
| "learning_rate": 2.163782249216005e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4497024, |
| "step": 7200 |
| }, |
| { |
| "epoch": 14.467871485943775, |
| "grad_norm": 0.014763305895030499, |
| "learning_rate": 2.15657091437456e-06, |
| "loss": 0.0022, |
| "num_input_tokens_seen": 4499712, |
| "step": 7205 |
| }, |
| { |
| "epoch": 14.477911646586346, |
| "grad_norm": 2.469639778137207, |
| "learning_rate": 2.1493683108218254e-06, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 4502400, |
| "step": 7210 |
| }, |
| { |
| "epoch": 14.487951807228916, |
| "grad_norm": 0.02391161024570465, |
| "learning_rate": 2.142174460674755e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4505088, |
| "step": 7215 |
| }, |
| { |
| "epoch": 14.497991967871485, |
| "grad_norm": 0.13245859742164612, |
| "learning_rate": 2.134989386023437e-06, |
| "loss": 0.0474, |
| "num_input_tokens_seen": 4508384, |
| "step": 7220 |
| }, |
| { |
| "epoch": 14.508032128514056, |
| "grad_norm": 0.012419681996107101, |
| "learning_rate": 2.127813108931007e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4511584, |
| "step": 7225 |
| }, |
| { |
| "epoch": 14.518072289156626, |
| "grad_norm": 0.014137223362922668, |
| "learning_rate": 2.1206456514335794e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4514816, |
| "step": 7230 |
| }, |
| { |
| "epoch": 14.528112449799197, |
| "grad_norm": 0.2595354914665222, |
| "learning_rate": 2.113487035540201e-06, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 4517824, |
| "step": 7235 |
| }, |
| { |
| "epoch": 14.538152610441767, |
| "grad_norm": 0.010352588258683681, |
| "learning_rate": 2.1063372832327535e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4521088, |
| "step": 7240 |
| }, |
| { |
| "epoch": 14.548192771084338, |
| "grad_norm": 0.018129389733076096, |
| "learning_rate": 2.099196416465913e-06, |
| "loss": 0.0092, |
| "num_input_tokens_seen": 4524416, |
| "step": 7245 |
| }, |
| { |
| "epoch": 14.558232931726907, |
| "grad_norm": 0.005967453587800264, |
| "learning_rate": 2.092064457167066e-06, |
| "loss": 0.0919, |
| "num_input_tokens_seen": 4527520, |
| "step": 7250 |
| }, |
| { |
| "epoch": 14.568273092369477, |
| "grad_norm": 0.004777940455824137, |
| "learning_rate": 2.084941427236245e-06, |
| "loss": 0.0014, |
| "num_input_tokens_seen": 4530976, |
| "step": 7255 |
| }, |
| { |
| "epoch": 14.578313253012048, |
| "grad_norm": 0.009696507826447487, |
| "learning_rate": 2.0778273485460677e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4534048, |
| "step": 7260 |
| }, |
| { |
| "epoch": 14.588353413654618, |
| "grad_norm": 0.009302028454840183, |
| "learning_rate": 2.0707222429416613e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4537536, |
| "step": 7265 |
| }, |
| { |
| "epoch": 14.598393574297189, |
| "grad_norm": 0.008531935513019562, |
| "learning_rate": 2.063626132240602e-06, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 4540256, |
| "step": 7270 |
| }, |
| { |
| "epoch": 14.608433734939759, |
| "grad_norm": 0.00231292680837214, |
| "learning_rate": 2.0565390382328448e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4543552, |
| "step": 7275 |
| }, |
| { |
| "epoch": 14.61847389558233, |
| "grad_norm": 0.018894299864768982, |
| "learning_rate": 2.049460982680656e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4546304, |
| "step": 7280 |
| }, |
| { |
| "epoch": 14.6285140562249, |
| "grad_norm": 0.0022834010887891054, |
| "learning_rate": 2.04239198731855e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4549312, |
| "step": 7285 |
| }, |
| { |
| "epoch": 14.638554216867469, |
| "grad_norm": 0.20074783265590668, |
| "learning_rate": 2.035332073853217e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4553152, |
| "step": 7290 |
| }, |
| { |
| "epoch": 14.64859437751004, |
| "grad_norm": 0.9416258335113525, |
| "learning_rate": 2.0282812639634636e-06, |
| "loss": 0.0692, |
| "num_input_tokens_seen": 4555712, |
| "step": 7295 |
| }, |
| { |
| "epoch": 14.65863453815261, |
| "grad_norm": 0.0018730978481471539, |
| "learning_rate": 2.0212395793001384e-06, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 4558304, |
| "step": 7300 |
| }, |
| { |
| "epoch": 14.668674698795181, |
| "grad_norm": 0.027835896238684654, |
| "learning_rate": 2.0142070414860704e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4560992, |
| "step": 7305 |
| }, |
| { |
| "epoch": 14.67871485943775, |
| "grad_norm": 35.51850509643555, |
| "learning_rate": 2.007183672116002e-06, |
| "loss": 0.0026, |
| "num_input_tokens_seen": 4564384, |
| "step": 7310 |
| }, |
| { |
| "epoch": 14.688755020080322, |
| "grad_norm": 0.07114183902740479, |
| "learning_rate": 2.000169492756523e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4567936, |
| "step": 7315 |
| }, |
| { |
| "epoch": 14.698795180722891, |
| "grad_norm": 1.0696017742156982, |
| "learning_rate": 1.9931645249459997e-06, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 4571072, |
| "step": 7320 |
| }, |
| { |
| "epoch": 14.708835341365463, |
| "grad_norm": 0.0018745064735412598, |
| "learning_rate": 1.986168790194521e-06, |
| "loss": 0.0013, |
| "num_input_tokens_seen": 4574496, |
| "step": 7325 |
| }, |
| { |
| "epoch": 14.718875502008032, |
| "grad_norm": 0.03353925794363022, |
| "learning_rate": 1.9791823099838107e-06, |
| "loss": 0.0039, |
| "num_input_tokens_seen": 4577440, |
| "step": 7330 |
| }, |
| { |
| "epoch": 14.728915662650602, |
| "grad_norm": 0.01741660013794899, |
| "learning_rate": 1.9722051057671896e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4580608, |
| "step": 7335 |
| }, |
| { |
| "epoch": 14.738955823293173, |
| "grad_norm": 0.011953890323638916, |
| "learning_rate": 1.965237198969481e-06, |
| "loss": 0.0201, |
| "num_input_tokens_seen": 4584160, |
| "step": 7340 |
| }, |
| { |
| "epoch": 14.748995983935743, |
| "grad_norm": 0.27335840463638306, |
| "learning_rate": 1.9582786109869713e-06, |
| "loss": 0.0036, |
| "num_input_tokens_seen": 4587072, |
| "step": 7345 |
| }, |
| { |
| "epoch": 14.759036144578314, |
| "grad_norm": 341.1443176269531, |
| "learning_rate": 1.951329363187323e-06, |
| "loss": 0.0585, |
| "num_input_tokens_seen": 4590272, |
| "step": 7350 |
| }, |
| { |
| "epoch": 14.769076305220883, |
| "grad_norm": 0.007021079305559397, |
| "learning_rate": 1.944389476909518e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4593824, |
| "step": 7355 |
| }, |
| { |
| "epoch": 14.779116465863455, |
| "grad_norm": 0.0021303293760865927, |
| "learning_rate": 1.9374589734638e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4596352, |
| "step": 7360 |
| }, |
| { |
| "epoch": 14.789156626506024, |
| "grad_norm": 35.135520935058594, |
| "learning_rate": 1.930537874131588e-06, |
| "loss": 0.0116, |
| "num_input_tokens_seen": 4599616, |
| "step": 7365 |
| }, |
| { |
| "epoch": 14.799196787148594, |
| "grad_norm": 0.003619756083935499, |
| "learning_rate": 1.9236262001654372e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4603584, |
| "step": 7370 |
| }, |
| { |
| "epoch": 14.809236947791165, |
| "grad_norm": 0.004386731423437595, |
| "learning_rate": 1.9167239727889527e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4607136, |
| "step": 7375 |
| }, |
| { |
| "epoch": 14.819277108433734, |
| "grad_norm": 0.012418882921338081, |
| "learning_rate": 1.9098312131967327e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4609888, |
| "step": 7380 |
| }, |
| { |
| "epoch": 14.829317269076306, |
| "grad_norm": 0.008296665735542774, |
| "learning_rate": 1.9029479425543052e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4612384, |
| "step": 7385 |
| }, |
| { |
| "epoch": 14.839357429718875, |
| "grad_norm": 10.158157348632812, |
| "learning_rate": 1.8960741819980576e-06, |
| "loss": 0.0029, |
| "num_input_tokens_seen": 4615424, |
| "step": 7390 |
| }, |
| { |
| "epoch": 14.849397590361447, |
| "grad_norm": 0.003470318391919136, |
| "learning_rate": 1.889209952635178e-06, |
| "loss": 0.0015, |
| "num_input_tokens_seen": 4618848, |
| "step": 7395 |
| }, |
| { |
| "epoch": 14.859437751004016, |
| "grad_norm": 0.006565614603459835, |
| "learning_rate": 1.8823552755435847e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4622176, |
| "step": 7400 |
| }, |
| { |
| "epoch": 14.869477911646586, |
| "grad_norm": 0.001681746100075543, |
| "learning_rate": 1.875510171771865e-06, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 4625088, |
| "step": 7405 |
| }, |
| { |
| "epoch": 14.879518072289157, |
| "grad_norm": 0.003082460956647992, |
| "learning_rate": 1.868674662339207e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4628640, |
| "step": 7410 |
| }, |
| { |
| "epoch": 14.889558232931726, |
| "grad_norm": 0.011504475958645344, |
| "learning_rate": 1.8618487682353453e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4631808, |
| "step": 7415 |
| }, |
| { |
| "epoch": 14.899598393574298, |
| "grad_norm": 0.10181345045566559, |
| "learning_rate": 1.855032510420477e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4634176, |
| "step": 7420 |
| }, |
| { |
| "epoch": 14.909638554216867, |
| "grad_norm": 0.0036355298943817616, |
| "learning_rate": 1.848225909825222e-06, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 4636704, |
| "step": 7425 |
| }, |
| { |
| "epoch": 14.919678714859439, |
| "grad_norm": 0.012753061018884182, |
| "learning_rate": 1.8414289873505337e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4639776, |
| "step": 7430 |
| }, |
| { |
| "epoch": 14.929718875502008, |
| "grad_norm": 0.1419248729944229, |
| "learning_rate": 1.8346417638676533e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4642848, |
| "step": 7435 |
| }, |
| { |
| "epoch": 14.939759036144578, |
| "grad_norm": 0.014822770841419697, |
| "learning_rate": 1.8278642602180435e-06, |
| "loss": 0.0051, |
| "num_input_tokens_seen": 4646400, |
| "step": 7440 |
| }, |
| { |
| "epoch": 14.949799196787149, |
| "grad_norm": 28.095958709716797, |
| "learning_rate": 1.8210964972133095e-06, |
| "loss": 0.0857, |
| "num_input_tokens_seen": 4649088, |
| "step": 7445 |
| }, |
| { |
| "epoch": 14.959839357429718, |
| "grad_norm": 0.0036780142690986395, |
| "learning_rate": 1.814338495635158e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4652512, |
| "step": 7450 |
| }, |
| { |
| "epoch": 14.96987951807229, |
| "grad_norm": 0.06401392817497253, |
| "learning_rate": 1.8075902762353093e-06, |
| "loss": 0.0551, |
| "num_input_tokens_seen": 4655584, |
| "step": 7455 |
| }, |
| { |
| "epoch": 14.97991967871486, |
| "grad_norm": 0.010513795539736748, |
| "learning_rate": 1.8008518597354575e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4658272, |
| "step": 7460 |
| }, |
| { |
| "epoch": 14.98995983935743, |
| "grad_norm": 0.012088480405509472, |
| "learning_rate": 1.7941232668271863e-06, |
| "loss": 0.053, |
| "num_input_tokens_seen": 4662432, |
| "step": 7465 |
| }, |
| { |
| "epoch": 15.0, |
| "grad_norm": 0.0037560504861176014, |
| "learning_rate": 1.787404518171919e-06, |
| "loss": 0.1219, |
| "num_input_tokens_seen": 4665120, |
| "step": 7470 |
| }, |
| { |
| "epoch": 15.01004016064257, |
| "grad_norm": 0.07317201793193817, |
| "learning_rate": 1.7806956344008475e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4668640, |
| "step": 7475 |
| }, |
| { |
| "epoch": 15.02008032128514, |
| "grad_norm": 0.10222109407186508, |
| "learning_rate": 1.773996636114873e-06, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 4672064, |
| "step": 7480 |
| }, |
| { |
| "epoch": 15.03012048192771, |
| "grad_norm": 0.01750057004392147, |
| "learning_rate": 1.7673075438845423e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4675264, |
| "step": 7485 |
| }, |
| { |
| "epoch": 15.040160642570282, |
| "grad_norm": 0.009696793742477894, |
| "learning_rate": 1.7606283782499812e-06, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 4677728, |
| "step": 7490 |
| }, |
| { |
| "epoch": 15.050200803212851, |
| "grad_norm": 0.049690455198287964, |
| "learning_rate": 1.753959159720836e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4680608, |
| "step": 7495 |
| }, |
| { |
| "epoch": 15.060240963855422, |
| "grad_norm": 0.006616874132305384, |
| "learning_rate": 1.7472999087762081e-06, |
| "loss": 0.0488, |
| "num_input_tokens_seen": 4683712, |
| "step": 7500 |
| }, |
| { |
| "epoch": 15.070281124497992, |
| "grad_norm": 0.013991329818964005, |
| "learning_rate": 1.7406506458645923e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4687520, |
| "step": 7505 |
| }, |
| { |
| "epoch": 15.080321285140561, |
| "grad_norm": 0.00402922835201025, |
| "learning_rate": 1.7340113914038115e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4690560, |
| "step": 7510 |
| }, |
| { |
| "epoch": 15.090361445783133, |
| "grad_norm": 5.365704536437988, |
| "learning_rate": 1.727382165780957e-06, |
| "loss": 0.0018, |
| "num_input_tokens_seen": 4693696, |
| "step": 7515 |
| }, |
| { |
| "epoch": 15.100401606425702, |
| "grad_norm": 0.006192977540194988, |
| "learning_rate": 1.7207629893523236e-06, |
| "loss": 0.0672, |
| "num_input_tokens_seen": 4696800, |
| "step": 7520 |
| }, |
| { |
| "epoch": 15.110441767068274, |
| "grad_norm": 0.11119314283132553, |
| "learning_rate": 1.7141538824433506e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4699776, |
| "step": 7525 |
| }, |
| { |
| "epoch": 15.120481927710843, |
| "grad_norm": 0.08953115344047546, |
| "learning_rate": 1.7075548653485535e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4702528, |
| "step": 7530 |
| }, |
| { |
| "epoch": 15.130522088353414, |
| "grad_norm": 0.05335596948862076, |
| "learning_rate": 1.7009659583314659e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4705152, |
| "step": 7535 |
| }, |
| { |
| "epoch": 15.140562248995984, |
| "grad_norm": 0.05979358032345772, |
| "learning_rate": 1.6943871816245826e-06, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 4707776, |
| "step": 7540 |
| }, |
| { |
| "epoch": 15.150602409638553, |
| "grad_norm": 0.070265032351017, |
| "learning_rate": 1.6878185554292787e-06, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 4710368, |
| "step": 7545 |
| }, |
| { |
| "epoch": 15.160642570281125, |
| "grad_norm": 0.025123678147792816, |
| "learning_rate": 1.6812600999157753e-06, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 4713536, |
| "step": 7550 |
| }, |
| { |
| "epoch": 15.170682730923694, |
| "grad_norm": 0.019195713102817535, |
| "learning_rate": 1.6747118352230495e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4716672, |
| "step": 7555 |
| }, |
| { |
| "epoch": 15.180722891566266, |
| "grad_norm": 0.003091650316491723, |
| "learning_rate": 1.6681737814587912e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4719872, |
| "step": 7560 |
| }, |
| { |
| "epoch": 15.190763052208835, |
| "grad_norm": 0.008971435017883778, |
| "learning_rate": 1.6616459586993394e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4723776, |
| "step": 7565 |
| }, |
| { |
| "epoch": 15.200803212851406, |
| "grad_norm": 0.007008485496044159, |
| "learning_rate": 1.6551283869896073e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4726976, |
| "step": 7570 |
| }, |
| { |
| "epoch": 15.210843373493976, |
| "grad_norm": 0.6058262586593628, |
| "learning_rate": 1.6486210863430424e-06, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 4730176, |
| "step": 7575 |
| }, |
| { |
| "epoch": 15.220883534136545, |
| "grad_norm": 0.013418142683804035, |
| "learning_rate": 1.6421240767415397e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4733152, |
| "step": 7580 |
| }, |
| { |
| "epoch": 15.230923694779117, |
| "grad_norm": 0.005203918553888798, |
| "learning_rate": 1.6356373781354058e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4735648, |
| "step": 7585 |
| }, |
| { |
| "epoch": 15.240963855421686, |
| "grad_norm": 0.004218837711960077, |
| "learning_rate": 1.629161010443277e-06, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 4739136, |
| "step": 7590 |
| }, |
| { |
| "epoch": 15.251004016064257, |
| "grad_norm": 0.0019372202223166823, |
| "learning_rate": 1.6226949935520708e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4742432, |
| "step": 7595 |
| }, |
| { |
| "epoch": 15.261044176706827, |
| "grad_norm": 0.0026460830122232437, |
| "learning_rate": 1.6162393473169186e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4746304, |
| "step": 7600 |
| }, |
| { |
| "epoch": 15.271084337349398, |
| "grad_norm": 0.02616897039115429, |
| "learning_rate": 1.6097940915611082e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4749536, |
| "step": 7605 |
| }, |
| { |
| "epoch": 15.281124497991968, |
| "grad_norm": 0.002473875880241394, |
| "learning_rate": 1.60335924607602e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4753120, |
| "step": 7610 |
| }, |
| { |
| "epoch": 15.291164658634537, |
| "grad_norm": 0.007338056806474924, |
| "learning_rate": 1.5969348306210692e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4755968, |
| "step": 7615 |
| }, |
| { |
| "epoch": 15.301204819277109, |
| "grad_norm": 0.018916072323918343, |
| "learning_rate": 1.5905208649236426e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4758560, |
| "step": 7620 |
| }, |
| { |
| "epoch": 15.311244979919678, |
| "grad_norm": 0.007263466715812683, |
| "learning_rate": 1.5841173686790368e-06, |
| "loss": 0.0139, |
| "num_input_tokens_seen": 4762368, |
| "step": 7625 |
| }, |
| { |
| "epoch": 15.32128514056225, |
| "grad_norm": 0.1382419317960739, |
| "learning_rate": 1.5777243615504085e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4765888, |
| "step": 7630 |
| }, |
| { |
| "epoch": 15.331325301204819, |
| "grad_norm": 0.0016897142631933093, |
| "learning_rate": 1.5713418631686938e-06, |
| "loss": 0.0022, |
| "num_input_tokens_seen": 4768928, |
| "step": 7635 |
| }, |
| { |
| "epoch": 15.34136546184739, |
| "grad_norm": 0.21704323589801788, |
| "learning_rate": 1.564969893132568e-06, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 4771904, |
| "step": 7640 |
| }, |
| { |
| "epoch": 15.35140562248996, |
| "grad_norm": 0.009838147088885307, |
| "learning_rate": 1.5586084710083737e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4775104, |
| "step": 7645 |
| }, |
| { |
| "epoch": 15.36144578313253, |
| "grad_norm": 0.0028061573393642902, |
| "learning_rate": 1.5522576163300635e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4778496, |
| "step": 7650 |
| }, |
| { |
| "epoch": 15.3714859437751, |
| "grad_norm": 0.0043613361194729805, |
| "learning_rate": 1.545917348599147e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4781344, |
| "step": 7655 |
| }, |
| { |
| "epoch": 15.38152610441767, |
| "grad_norm": 0.00559863680973649, |
| "learning_rate": 1.5395876872846132e-06, |
| "loss": 0.0213, |
| "num_input_tokens_seen": 4784352, |
| "step": 7660 |
| }, |
| { |
| "epoch": 15.391566265060241, |
| "grad_norm": 0.004609005060046911, |
| "learning_rate": 1.5332686518228951e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4787424, |
| "step": 7665 |
| }, |
| { |
| "epoch": 15.401606425702811, |
| "grad_norm": 0.00308047141879797, |
| "learning_rate": 1.5269602616177842e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4790656, |
| "step": 7670 |
| }, |
| { |
| "epoch": 15.411646586345382, |
| "grad_norm": 0.033832404762506485, |
| "learning_rate": 1.5206625360403943e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4793536, |
| "step": 7675 |
| }, |
| { |
| "epoch": 15.421686746987952, |
| "grad_norm": 0.011339832097291946, |
| "learning_rate": 1.5143754944290862e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4796704, |
| "step": 7680 |
| }, |
| { |
| "epoch": 15.431726907630521, |
| "grad_norm": 0.09786521643400192, |
| "learning_rate": 1.5080991560894142e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4800032, |
| "step": 7685 |
| }, |
| { |
| "epoch": 15.441767068273093, |
| "grad_norm": 0.003034294117242098, |
| "learning_rate": 1.5018335402940681e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4803552, |
| "step": 7690 |
| }, |
| { |
| "epoch": 15.451807228915662, |
| "grad_norm": 0.002162687247619033, |
| "learning_rate": 1.4955786662828053e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4806848, |
| "step": 7695 |
| }, |
| { |
| "epoch": 15.461847389558233, |
| "grad_norm": 0.022024383768439293, |
| "learning_rate": 1.4893345532624086e-06, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 4809152, |
| "step": 7700 |
| }, |
| { |
| "epoch": 15.471887550200803, |
| "grad_norm": 0.005043026525527239, |
| "learning_rate": 1.4831012204066114e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4812064, |
| "step": 7705 |
| }, |
| { |
| "epoch": 15.481927710843374, |
| "grad_norm": 0.007820216938853264, |
| "learning_rate": 1.4768786868560443e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4815040, |
| "step": 7710 |
| }, |
| { |
| "epoch": 15.491967871485944, |
| "grad_norm": 0.003925836179405451, |
| "learning_rate": 1.4706669717181782e-06, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 4818880, |
| "step": 7715 |
| }, |
| { |
| "epoch": 15.502008032128515, |
| "grad_norm": 0.030795995146036148, |
| "learning_rate": 1.4644660940672628e-06, |
| "loss": 0.0345, |
| "num_input_tokens_seen": 4821696, |
| "step": 7720 |
| }, |
| { |
| "epoch": 15.512048192771084, |
| "grad_norm": 0.003375057829543948, |
| "learning_rate": 1.4582760729442707e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4824608, |
| "step": 7725 |
| }, |
| { |
| "epoch": 15.522088353413654, |
| "grad_norm": 1.4269546270370483, |
| "learning_rate": 1.4520969273568364e-06, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 4827360, |
| "step": 7730 |
| }, |
| { |
| "epoch": 15.532128514056225, |
| "grad_norm": 0.0026180180720984936, |
| "learning_rate": 1.445928676279199e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4830496, |
| "step": 7735 |
| }, |
| { |
| "epoch": 15.542168674698795, |
| "grad_norm": 0.8371602892875671, |
| "learning_rate": 1.4397713386521444e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4833536, |
| "step": 7740 |
| }, |
| { |
| "epoch": 15.552208835341366, |
| "grad_norm": 0.0011596613330766559, |
| "learning_rate": 1.4336249333829466e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4836192, |
| "step": 7745 |
| }, |
| { |
| "epoch": 15.562248995983936, |
| "grad_norm": 0.0891217365860939, |
| "learning_rate": 1.4274894793453075e-06, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 4840320, |
| "step": 7750 |
| }, |
| { |
| "epoch": 15.572289156626507, |
| "grad_norm": 22.992704391479492, |
| "learning_rate": 1.421364995379309e-06, |
| "loss": 0.0766, |
| "num_input_tokens_seen": 4843744, |
| "step": 7755 |
| }, |
| { |
| "epoch": 15.582329317269076, |
| "grad_norm": 0.0040291850455105305, |
| "learning_rate": 1.4152515002913358e-06, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 4846464, |
| "step": 7760 |
| }, |
| { |
| "epoch": 15.592369477911646, |
| "grad_norm": 0.005161698441952467, |
| "learning_rate": 1.4091490128540374e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4849184, |
| "step": 7765 |
| }, |
| { |
| "epoch": 15.602409638554217, |
| "grad_norm": 43.281211853027344, |
| "learning_rate": 1.403057551806259e-06, |
| "loss": 0.0025, |
| "num_input_tokens_seen": 4851936, |
| "step": 7770 |
| }, |
| { |
| "epoch": 15.612449799196787, |
| "grad_norm": 0.0021972369868308306, |
| "learning_rate": 1.3969771358529866e-06, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 4855040, |
| "step": 7775 |
| }, |
| { |
| "epoch": 15.622489959839358, |
| "grad_norm": 0.014925581403076649, |
| "learning_rate": 1.3909077836652968e-06, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 4857952, |
| "step": 7780 |
| }, |
| { |
| "epoch": 15.632530120481928, |
| "grad_norm": 0.30831727385520935, |
| "learning_rate": 1.3848495138802803e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4860960, |
| "step": 7785 |
| }, |
| { |
| "epoch": 15.642570281124499, |
| "grad_norm": 0.010729658417403698, |
| "learning_rate": 1.3788023451010114e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4864544, |
| "step": 7790 |
| }, |
| { |
| "epoch": 15.652610441767068, |
| "grad_norm": 0.10216429084539413, |
| "learning_rate": 1.3727662958964627e-06, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 4867616, |
| "step": 7795 |
| }, |
| { |
| "epoch": 15.662650602409638, |
| "grad_norm": 0.001300434349104762, |
| "learning_rate": 1.3667413848014738e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4870304, |
| "step": 7800 |
| }, |
| { |
| "epoch": 15.67269076305221, |
| "grad_norm": 0.012358872219920158, |
| "learning_rate": 1.3607276303166766e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4874240, |
| "step": 7805 |
| }, |
| { |
| "epoch": 15.682730923694779, |
| "grad_norm": 0.035743288695812225, |
| "learning_rate": 1.3547250509084453e-06, |
| "loss": 0.0249, |
| "num_input_tokens_seen": 4876960, |
| "step": 7810 |
| }, |
| { |
| "epoch": 15.69277108433735, |
| "grad_norm": 0.014657980762422085, |
| "learning_rate": 1.3487336650088417e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4879872, |
| "step": 7815 |
| }, |
| { |
| "epoch": 15.70281124497992, |
| "grad_norm": 0.03732848912477493, |
| "learning_rate": 1.3427534910155475e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4883424, |
| "step": 7820 |
| }, |
| { |
| "epoch": 15.71285140562249, |
| "grad_norm": 0.0021094426047056913, |
| "learning_rate": 1.3367845472918272e-06, |
| "loss": 0.0061, |
| "num_input_tokens_seen": 4886912, |
| "step": 7825 |
| }, |
| { |
| "epoch": 15.72289156626506, |
| "grad_norm": 0.0018562499899417162, |
| "learning_rate": 1.330826852166454e-06, |
| "loss": 0.0611, |
| "num_input_tokens_seen": 4890336, |
| "step": 7830 |
| }, |
| { |
| "epoch": 15.73293172690763, |
| "grad_norm": 0.0016846376238390803, |
| "learning_rate": 1.3248804239336616e-06, |
| "loss": 0.0278, |
| "num_input_tokens_seen": 4894144, |
| "step": 7835 |
| }, |
| { |
| "epoch": 15.742971887550201, |
| "grad_norm": 0.022804176434874535, |
| "learning_rate": 1.3189452808530866e-06, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 4897536, |
| "step": 7840 |
| }, |
| { |
| "epoch": 15.75301204819277, |
| "grad_norm": 0.023388752713799477, |
| "learning_rate": 1.3130214411497121e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4900544, |
| "step": 7845 |
| }, |
| { |
| "epoch": 15.763052208835342, |
| "grad_norm": 0.05187452584505081, |
| "learning_rate": 1.3071089230138124e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4903680, |
| "step": 7850 |
| }, |
| { |
| "epoch": 15.773092369477911, |
| "grad_norm": 0.017268287017941475, |
| "learning_rate": 1.3012077446008969e-06, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 4906528, |
| "step": 7855 |
| }, |
| { |
| "epoch": 15.783132530120483, |
| "grad_norm": 0.013094757683575153, |
| "learning_rate": 1.2953179240316533e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4910176, |
| "step": 7860 |
| }, |
| { |
| "epoch": 15.793172690763052, |
| "grad_norm": 0.0332111194729805, |
| "learning_rate": 1.289439479391893e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4913184, |
| "step": 7865 |
| }, |
| { |
| "epoch": 15.803212851405622, |
| "grad_norm": 0.003533572657033801, |
| "learning_rate": 1.2835724287325001e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4916320, |
| "step": 7870 |
| }, |
| { |
| "epoch": 15.813253012048193, |
| "grad_norm": 0.00497691472992301, |
| "learning_rate": 1.277716790069361e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4919360, |
| "step": 7875 |
| }, |
| { |
| "epoch": 15.823293172690763, |
| "grad_norm": 0.03421995788812637, |
| "learning_rate": 1.2718725813833322e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4922880, |
| "step": 7880 |
| }, |
| { |
| "epoch": 15.833333333333334, |
| "grad_norm": 0.13940131664276123, |
| "learning_rate": 1.266039820620159e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4925408, |
| "step": 7885 |
| }, |
| { |
| "epoch": 15.843373493975903, |
| "grad_norm": 0.011046111583709717, |
| "learning_rate": 1.2602185256904453e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4928896, |
| "step": 7890 |
| }, |
| { |
| "epoch": 15.853413654618475, |
| "grad_norm": 0.0013307826593518257, |
| "learning_rate": 1.2544087144695826e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4931872, |
| "step": 7895 |
| }, |
| { |
| "epoch": 15.863453815261044, |
| "grad_norm": 0.014156588353216648, |
| "learning_rate": 1.2486104047976937e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4935136, |
| "step": 7900 |
| }, |
| { |
| "epoch": 15.873493975903614, |
| "grad_norm": 0.005145099479705095, |
| "learning_rate": 1.2428236144795959e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4938176, |
| "step": 7905 |
| }, |
| { |
| "epoch": 15.883534136546185, |
| "grad_norm": 0.02292685955762863, |
| "learning_rate": 1.2370483612847201e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4940672, |
| "step": 7910 |
| }, |
| { |
| "epoch": 15.893574297188755, |
| "grad_norm": 0.009769303724169731, |
| "learning_rate": 1.2312846629470826e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4944192, |
| "step": 7915 |
| }, |
| { |
| "epoch": 15.903614457831326, |
| "grad_norm": 0.0011390167055651546, |
| "learning_rate": 1.225532537165211e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4947488, |
| "step": 7920 |
| }, |
| { |
| "epoch": 15.913654618473895, |
| "grad_norm": 0.0024841073900461197, |
| "learning_rate": 1.219792001602101e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4949824, |
| "step": 7925 |
| }, |
| { |
| "epoch": 15.923694779116467, |
| "grad_norm": 0.0029108019080013037, |
| "learning_rate": 1.2140630738851544e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4952768, |
| "step": 7930 |
| }, |
| { |
| "epoch": 15.933734939759036, |
| "grad_norm": 0.09127728641033173, |
| "learning_rate": 1.2083457716061326e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4956544, |
| "step": 7935 |
| }, |
| { |
| "epoch": 15.943775100401606, |
| "grad_norm": 0.0016728171613067389, |
| "learning_rate": 1.2026401123210968e-06, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 4959648, |
| "step": 7940 |
| }, |
| { |
| "epoch": 15.953815261044177, |
| "grad_norm": 0.002108454005792737, |
| "learning_rate": 1.1969461135503573e-06, |
| "loss": 0.0104, |
| "num_input_tokens_seen": 4961888, |
| "step": 7945 |
| }, |
| { |
| "epoch": 15.963855421686747, |
| "grad_norm": 0.0029469470027834177, |
| "learning_rate": 1.1912637927784176e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4965216, |
| "step": 7950 |
| }, |
| { |
| "epoch": 15.973895582329318, |
| "grad_norm": 0.0053405375219881535, |
| "learning_rate": 1.1855931674539222e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4968608, |
| "step": 7955 |
| }, |
| { |
| "epoch": 15.983935742971887, |
| "grad_norm": 0.004863628186285496, |
| "learning_rate": 1.1799342549896027e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4971456, |
| "step": 7960 |
| }, |
| { |
| "epoch": 15.993975903614459, |
| "grad_norm": 0.0013398093869909644, |
| "learning_rate": 1.174287072762224e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4974112, |
| "step": 7965 |
| }, |
| { |
| "epoch": 16.0, |
| "eval_loss": 1.0401936769485474, |
| "eval_runtime": 8.0743, |
| "eval_samples_per_second": 61.677, |
| "eval_steps_per_second": 15.481, |
| "num_input_tokens_seen": 4976032, |
| "step": 7968 |
| }, |
| { |
| "epoch": 16.004016064257026, |
| "grad_norm": 0.005879928823560476, |
| "learning_rate": 1.1686516381125307e-06, |
| "loss": 0.0017, |
| "num_input_tokens_seen": 4977152, |
| "step": 7970 |
| }, |
| { |
| "epoch": 16.014056224899598, |
| "grad_norm": 0.06779361516237259, |
| "learning_rate": 1.163027968345195e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4981088, |
| "step": 7975 |
| }, |
| { |
| "epoch": 16.02409638554217, |
| "grad_norm": 0.015680238604545593, |
| "learning_rate": 1.1574160807287615e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4984064, |
| "step": 7980 |
| }, |
| { |
| "epoch": 16.03413654618474, |
| "grad_norm": 0.0018723757239058614, |
| "learning_rate": 1.1518159924955974e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4987424, |
| "step": 7985 |
| }, |
| { |
| "epoch": 16.044176706827308, |
| "grad_norm": 0.14989595115184784, |
| "learning_rate": 1.1462277208418338e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 4990112, |
| "step": 7990 |
| }, |
| { |
| "epoch": 16.05421686746988, |
| "grad_norm": 0.0020622089505195618, |
| "learning_rate": 1.1406512829273253e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4993600, |
| "step": 7995 |
| }, |
| { |
| "epoch": 16.06425702811245, |
| "grad_norm": 0.009413612075150013, |
| "learning_rate": 1.1350866958755757e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 4996512, |
| "step": 8000 |
| }, |
| { |
| "epoch": 16.07429718875502, |
| "grad_norm": 0.18731117248535156, |
| "learning_rate": 1.1295339767737125e-06, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 4999168, |
| "step": 8005 |
| }, |
| { |
| "epoch": 16.08433734939759, |
| "grad_norm": 0.019764700904488564, |
| "learning_rate": 1.1239931426724076e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5002336, |
| "step": 8010 |
| }, |
| { |
| "epoch": 16.09437751004016, |
| "grad_norm": 0.006744361482560635, |
| "learning_rate": 1.1184642105858484e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 5005536, |
| "step": 8015 |
| }, |
| { |
| "epoch": 16.104417670682732, |
| "grad_norm": 0.16189798712730408, |
| "learning_rate": 1.1129471974916696e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 5008192, |
| "step": 8020 |
| }, |
| { |
| "epoch": 16.1144578313253, |
| "grad_norm": 0.0014629423385486007, |
| "learning_rate": 1.1074421203309033e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 5010944, |
| "step": 8025 |
| }, |
| { |
| "epoch": 16.12449799196787, |
| "grad_norm": 0.007868721149861813, |
| "learning_rate": 1.1019489960079389e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5013888, |
| "step": 8030 |
| }, |
| { |
| "epoch": 16.134538152610443, |
| "grad_norm": 0.09024116396903992, |
| "learning_rate": 1.0964678413904529e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 5017184, |
| "step": 8035 |
| }, |
| { |
| "epoch": 16.14457831325301, |
| "grad_norm": 0.0039504412561655045, |
| "learning_rate": 1.0909986733093737e-06, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 5020256, |
| "step": 8040 |
| }, |
| { |
| "epoch": 16.15461847389558, |
| "grad_norm": 0.001555570401251316, |
| "learning_rate": 1.0855415085588194e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5023040, |
| "step": 8045 |
| }, |
| { |
| "epoch": 16.164658634538153, |
| "grad_norm": 0.011033423244953156, |
| "learning_rate": 1.08009636389605e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5026752, |
| "step": 8050 |
| }, |
| { |
| "epoch": 16.174698795180724, |
| "grad_norm": 0.0012489588698372245, |
| "learning_rate": 1.0746632560414154e-06, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 5029536, |
| "step": 8055 |
| }, |
| { |
| "epoch": 16.184738955823292, |
| "grad_norm": 0.0016975250327959657, |
| "learning_rate": 1.069242201678305e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 5032832, |
| "step": 8060 |
| }, |
| { |
| "epoch": 16.194779116465863, |
| "grad_norm": 0.0009509876254014671, |
| "learning_rate": 1.0638332174530953e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5036416, |
| "step": 8065 |
| }, |
| { |
| "epoch": 16.204819277108435, |
| "grad_norm": 0.0030300356447696686, |
| "learning_rate": 1.058436319975098e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5039392, |
| "step": 8070 |
| }, |
| { |
| "epoch": 16.214859437751002, |
| "grad_norm": 0.008067947812378407, |
| "learning_rate": 1.053051525816512e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5042720, |
| "step": 8075 |
| }, |
| { |
| "epoch": 16.224899598393574, |
| "grad_norm": 0.0013017337769269943, |
| "learning_rate": 1.0476788515123687e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5045760, |
| "step": 8080 |
| }, |
| { |
| "epoch": 16.234939759036145, |
| "grad_norm": 0.001618090900592506, |
| "learning_rate": 1.0423183135604874e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5048032, |
| "step": 8085 |
| }, |
| { |
| "epoch": 16.244979919678716, |
| "grad_norm": 0.012940296903252602, |
| "learning_rate": 1.036969928421413e-06, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 5051040, |
| "step": 8090 |
| }, |
| { |
| "epoch": 16.255020080321284, |
| "grad_norm": 0.004096478223800659, |
| "learning_rate": 1.0316337125183817e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5054080, |
| "step": 8095 |
| }, |
| { |
| "epoch": 16.265060240963855, |
| "grad_norm": 0.003049603197723627, |
| "learning_rate": 1.0263096822372537e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5057088, |
| "step": 8100 |
| }, |
| { |
| "epoch": 16.275100401606426, |
| "grad_norm": 0.016989678144454956, |
| "learning_rate": 1.0209978539264747e-06, |
| "loss": 0.0008, |
| "num_input_tokens_seen": 5059904, |
| "step": 8105 |
| }, |
| { |
| "epoch": 16.285140562248998, |
| "grad_norm": 0.0049511161632835865, |
| "learning_rate": 1.0156982438970254e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5062656, |
| "step": 8110 |
| }, |
| { |
| "epoch": 16.295180722891565, |
| "grad_norm": 0.007807273417711258, |
| "learning_rate": 1.010410868422359e-06, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5066240, |
| "step": 8115 |
| }, |
| { |
| "epoch": 16.305220883534137, |
| "grad_norm": 0.00864755641669035, |
| "learning_rate": 1.0051357437383708e-06, |
| "loss": 0.0018, |
| "num_input_tokens_seen": 5069600, |
| "step": 8120 |
| }, |
| { |
| "epoch": 16.315261044176708, |
| "grad_norm": 0.0649455189704895, |
| "learning_rate": 9.998728860433277e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5073280, |
| "step": 8125 |
| }, |
| { |
| "epoch": 16.325301204819276, |
| "grad_norm": 0.012507440522313118, |
| "learning_rate": 9.94622311497836e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5076128, |
| "step": 8130 |
| }, |
| { |
| "epoch": 16.335341365461847, |
| "grad_norm": 0.0012168296379968524, |
| "learning_rate": 9.893840362247809e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5079776, |
| "step": 8135 |
| }, |
| { |
| "epoch": 16.34538152610442, |
| "grad_norm": 0.0013548055430874228, |
| "learning_rate": 9.841580763092812e-07, |
| "loss": 0.0139, |
| "num_input_tokens_seen": 5083168, |
| "step": 8140 |
| }, |
| { |
| "epoch": 16.355421686746986, |
| "grad_norm": 3.4245684146881104, |
| "learning_rate": 9.789444477986375e-07, |
| "loss": 0.0023, |
| "num_input_tokens_seen": 5085792, |
| "step": 8145 |
| }, |
| { |
| "epoch": 16.365461847389557, |
| "grad_norm": 0.003231622511520982, |
| "learning_rate": 9.737431667022866e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5089632, |
| "step": 8150 |
| }, |
| { |
| "epoch": 16.37550200803213, |
| "grad_norm": 0.011450034566223621, |
| "learning_rate": 9.685542489917494e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5092064, |
| "step": 8155 |
| }, |
| { |
| "epoch": 16.3855421686747, |
| "grad_norm": 0.00948801077902317, |
| "learning_rate": 9.633777106005826e-07, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 5095488, |
| "step": 8160 |
| }, |
| { |
| "epoch": 16.395582329317268, |
| "grad_norm": 0.0029805630911141634, |
| "learning_rate": 9.582135674243292e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5098944, |
| "step": 8165 |
| }, |
| { |
| "epoch": 16.40562248995984, |
| "grad_norm": 0.002190305618569255, |
| "learning_rate": 9.530618353204718e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5101600, |
| "step": 8170 |
| }, |
| { |
| "epoch": 16.41566265060241, |
| "grad_norm": 0.0043740589171648026, |
| "learning_rate": 9.479225301083811e-07, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 5103904, |
| "step": 8175 |
| }, |
| { |
| "epoch": 16.42570281124498, |
| "grad_norm": 0.0013769206125289202, |
| "learning_rate": 9.427956675692695e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5107616, |
| "step": 8180 |
| }, |
| { |
| "epoch": 16.43574297188755, |
| "grad_norm": 0.04764657840132713, |
| "learning_rate": 9.376812634461418e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5110400, |
| "step": 8185 |
| }, |
| { |
| "epoch": 16.44578313253012, |
| "grad_norm": 0.0012534708948805928, |
| "learning_rate": 9.32579333443746e-07, |
| "loss": 0.0033, |
| "num_input_tokens_seen": 5113504, |
| "step": 8190 |
| }, |
| { |
| "epoch": 16.455823293172692, |
| "grad_norm": 0.015134445391595364, |
| "learning_rate": 9.27489893228527e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5116768, |
| "step": 8195 |
| }, |
| { |
| "epoch": 16.46586345381526, |
| "grad_norm": 0.013975398615002632, |
| "learning_rate": 9.224129584285768e-07, |
| "loss": 0.0013, |
| "num_input_tokens_seen": 5120224, |
| "step": 8200 |
| }, |
| { |
| "epoch": 16.47590361445783, |
| "grad_norm": 0.2885809540748596, |
| "learning_rate": 9.173485446335862e-07, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 5123584, |
| "step": 8205 |
| }, |
| { |
| "epoch": 16.485943775100402, |
| "grad_norm": 0.027181854471564293, |
| "learning_rate": 9.122966673948025e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5126752, |
| "step": 8210 |
| }, |
| { |
| "epoch": 16.495983935742974, |
| "grad_norm": 0.007998216897249222, |
| "learning_rate": 9.072573422249692e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5129312, |
| "step": 8215 |
| }, |
| { |
| "epoch": 16.50602409638554, |
| "grad_norm": 0.001557971932925284, |
| "learning_rate": 9.022305845982948e-07, |
| "loss": 0.0584, |
| "num_input_tokens_seen": 5132192, |
| "step": 8220 |
| }, |
| { |
| "epoch": 16.516064257028113, |
| "grad_norm": 0.004761459771543741, |
| "learning_rate": 8.972164099503899e-07, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 5135520, |
| "step": 8225 |
| }, |
| { |
| "epoch": 16.526104417670684, |
| "grad_norm": 0.049848757684230804, |
| "learning_rate": 8.922148336782288e-07, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 5138432, |
| "step": 8230 |
| }, |
| { |
| "epoch": 16.53614457831325, |
| "grad_norm": 10.381841659545898, |
| "learning_rate": 8.87225871140105e-07, |
| "loss": 0.004, |
| "num_input_tokens_seen": 5141952, |
| "step": 8235 |
| }, |
| { |
| "epoch": 16.546184738955823, |
| "grad_norm": 0.0021748128347098827, |
| "learning_rate": 8.822495376555695e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5145344, |
| "step": 8240 |
| }, |
| { |
| "epoch": 16.556224899598394, |
| "grad_norm": 0.06430647522211075, |
| "learning_rate": 8.772858485054042e-07, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 5148096, |
| "step": 8245 |
| }, |
| { |
| "epoch": 16.566265060240966, |
| "grad_norm": 0.0011095363879576325, |
| "learning_rate": 8.723348189315534e-07, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 5150784, |
| "step": 8250 |
| }, |
| { |
| "epoch": 16.576305220883533, |
| "grad_norm": 0.0028599195647984743, |
| "learning_rate": 8.673964641370974e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5153056, |
| "step": 8255 |
| }, |
| { |
| "epoch": 16.586345381526105, |
| "grad_norm": 0.0014031616738066077, |
| "learning_rate": 8.624707992861897e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5156448, |
| "step": 8260 |
| }, |
| { |
| "epoch": 16.596385542168676, |
| "grad_norm": 0.002864877926185727, |
| "learning_rate": 8.575578395040202e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5160672, |
| "step": 8265 |
| }, |
| { |
| "epoch": 16.606425702811244, |
| "grad_norm": 0.0012285938719287515, |
| "learning_rate": 8.526575998767638e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5163840, |
| "step": 8270 |
| }, |
| { |
| "epoch": 16.616465863453815, |
| "grad_norm": 0.0011251465184614062, |
| "learning_rate": 8.477700954515372e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5167552, |
| "step": 8275 |
| }, |
| { |
| "epoch": 16.626506024096386, |
| "grad_norm": 0.09981559216976166, |
| "learning_rate": 8.428953412363495e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5170496, |
| "step": 8280 |
| }, |
| { |
| "epoch": 16.636546184738958, |
| "grad_norm": 0.004276310559362173, |
| "learning_rate": 8.380333522000588e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5173504, |
| "step": 8285 |
| }, |
| { |
| "epoch": 16.646586345381525, |
| "grad_norm": 0.037310317158699036, |
| "learning_rate": 8.331841432723253e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5176640, |
| "step": 8290 |
| }, |
| { |
| "epoch": 16.656626506024097, |
| "grad_norm": 0.0023100704420357943, |
| "learning_rate": 8.28347729343566e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5180096, |
| "step": 8295 |
| }, |
| { |
| "epoch": 16.666666666666668, |
| "grad_norm": 0.06090432405471802, |
| "learning_rate": 8.235241252649073e-07, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 5183616, |
| "step": 8300 |
| }, |
| { |
| "epoch": 16.676706827309236, |
| "grad_norm": 24.074342727661133, |
| "learning_rate": 8.187133458481416e-07, |
| "loss": 0.0765, |
| "num_input_tokens_seen": 5186720, |
| "step": 8305 |
| }, |
| { |
| "epoch": 16.686746987951807, |
| "grad_norm": 0.030896145850419998, |
| "learning_rate": 8.139154058656801e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5190560, |
| "step": 8310 |
| }, |
| { |
| "epoch": 16.696787148594378, |
| "grad_norm": 0.006053561810404062, |
| "learning_rate": 8.091303200505074e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5194304, |
| "step": 8315 |
| }, |
| { |
| "epoch": 16.70682730923695, |
| "grad_norm": 0.0027364008128643036, |
| "learning_rate": 8.043581030961372e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5197792, |
| "step": 8320 |
| }, |
| { |
| "epoch": 16.716867469879517, |
| "grad_norm": 0.002491719089448452, |
| "learning_rate": 7.99598769656571e-07, |
| "loss": 0.0155, |
| "num_input_tokens_seen": 5201280, |
| "step": 8325 |
| }, |
| { |
| "epoch": 16.72690763052209, |
| "grad_norm": 0.015071824193000793, |
| "learning_rate": 7.948523343462411e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5204704, |
| "step": 8330 |
| }, |
| { |
| "epoch": 16.73694779116466, |
| "grad_norm": 0.0035658315755426884, |
| "learning_rate": 7.901188117399817e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5208320, |
| "step": 8335 |
| }, |
| { |
| "epoch": 16.746987951807228, |
| "grad_norm": 0.0027445750311017036, |
| "learning_rate": 7.853982163729684e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5211136, |
| "step": 8340 |
| }, |
| { |
| "epoch": 16.7570281124498, |
| "grad_norm": 0.006281423382461071, |
| "learning_rate": 7.806905627406891e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5214528, |
| "step": 8345 |
| }, |
| { |
| "epoch": 16.76706827309237, |
| "grad_norm": 0.005827156826853752, |
| "learning_rate": 7.759958652988858e-07, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 5218048, |
| "step": 8350 |
| }, |
| { |
| "epoch": 16.77710843373494, |
| "grad_norm": 0.009136940352618694, |
| "learning_rate": 7.713141384635186e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5221248, |
| "step": 8355 |
| }, |
| { |
| "epoch": 16.78714859437751, |
| "grad_norm": 0.005150144919753075, |
| "learning_rate": 7.666453966107201e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5223776, |
| "step": 8360 |
| }, |
| { |
| "epoch": 16.79718875502008, |
| "grad_norm": 0.004599465057253838, |
| "learning_rate": 7.619896540767435e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5226176, |
| "step": 8365 |
| }, |
| { |
| "epoch": 16.80722891566265, |
| "grad_norm": 0.006183779798448086, |
| "learning_rate": 7.573469251579346e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5229312, |
| "step": 8370 |
| }, |
| { |
| "epoch": 16.81726907630522, |
| "grad_norm": 0.0017236809944733977, |
| "learning_rate": 7.527172241106718e-07, |
| "loss": 0.0067, |
| "num_input_tokens_seen": 5231744, |
| "step": 8375 |
| }, |
| { |
| "epoch": 16.82730923694779, |
| "grad_norm": 0.010270994156599045, |
| "learning_rate": 7.481005651513312e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5234464, |
| "step": 8380 |
| }, |
| { |
| "epoch": 16.837349397590362, |
| "grad_norm": 0.0016576339257881045, |
| "learning_rate": 7.434969624562405e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5238368, |
| "step": 8385 |
| }, |
| { |
| "epoch": 16.847389558232933, |
| "grad_norm": 0.001661753747612238, |
| "learning_rate": 7.389064301616355e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5241792, |
| "step": 8390 |
| }, |
| { |
| "epoch": 16.8574297188755, |
| "grad_norm": 0.0016957195475697517, |
| "learning_rate": 7.343289823636168e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5244960, |
| "step": 8395 |
| }, |
| { |
| "epoch": 16.867469879518072, |
| "grad_norm": 0.013983628712594509, |
| "learning_rate": 7.297646331181069e-07, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 5247520, |
| "step": 8400 |
| }, |
| { |
| "epoch": 16.877510040160644, |
| "grad_norm": 0.00148250802885741, |
| "learning_rate": 7.252133964408065e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5250272, |
| "step": 8405 |
| }, |
| { |
| "epoch": 16.88755020080321, |
| "grad_norm": 0.017954643815755844, |
| "learning_rate": 7.206752863071515e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5252864, |
| "step": 8410 |
| }, |
| { |
| "epoch": 16.897590361445783, |
| "grad_norm": 0.0016218151431530714, |
| "learning_rate": 7.161503166522704e-07, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 5255840, |
| "step": 8415 |
| }, |
| { |
| "epoch": 16.907630522088354, |
| "grad_norm": 0.0025824366603046656, |
| "learning_rate": 7.116385013709404e-07, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 5258400, |
| "step": 8420 |
| }, |
| { |
| "epoch": 16.917670682730925, |
| "grad_norm": 0.0045729330740869045, |
| "learning_rate": 7.0713985431755e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5261376, |
| "step": 8425 |
| }, |
| { |
| "epoch": 16.927710843373493, |
| "grad_norm": 0.0545303151011467, |
| "learning_rate": 7.026543893060456e-07, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 5263968, |
| "step": 8430 |
| }, |
| { |
| "epoch": 16.937751004016064, |
| "grad_norm": 0.12884321808815002, |
| "learning_rate": 6.981821201098999e-07, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 5267488, |
| "step": 8435 |
| }, |
| { |
| "epoch": 16.947791164658636, |
| "grad_norm": 0.0024236757308244705, |
| "learning_rate": 6.937230604620642e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5270528, |
| "step": 8440 |
| }, |
| { |
| "epoch": 16.957831325301203, |
| "grad_norm": 0.131021186709404, |
| "learning_rate": 6.892772240549267e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5274048, |
| "step": 8445 |
| }, |
| { |
| "epoch": 16.967871485943775, |
| "grad_norm": 0.0023038825020194054, |
| "learning_rate": 6.848446245402751e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5276320, |
| "step": 8450 |
| }, |
| { |
| "epoch": 16.977911646586346, |
| "grad_norm": 0.0010659729596227407, |
| "learning_rate": 6.804252755292429e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5278688, |
| "step": 8455 |
| }, |
| { |
| "epoch": 16.987951807228917, |
| "grad_norm": 0.0022493836004287004, |
| "learning_rate": 6.760191905922847e-07, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 5281120, |
| "step": 8460 |
| }, |
| { |
| "epoch": 16.997991967871485, |
| "grad_norm": 0.0067631215788424015, |
| "learning_rate": 6.716263832591163e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5284064, |
| "step": 8465 |
| }, |
| { |
| "epoch": 17.008032128514056, |
| "grad_norm": 0.03593532368540764, |
| "learning_rate": 6.672468670186899e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5287968, |
| "step": 8470 |
| }, |
| { |
| "epoch": 17.018072289156628, |
| "grad_norm": 0.0016852463595569134, |
| "learning_rate": 6.628806553191397e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5291744, |
| "step": 8475 |
| }, |
| { |
| "epoch": 17.028112449799195, |
| "grad_norm": 0.0035839611664414406, |
| "learning_rate": 6.585277615677472e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5293984, |
| "step": 8480 |
| }, |
| { |
| "epoch": 17.038152610441767, |
| "grad_norm": 0.0018862821161746979, |
| "learning_rate": 6.541881991309013e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5296704, |
| "step": 8485 |
| }, |
| { |
| "epoch": 17.048192771084338, |
| "grad_norm": 0.0017078607343137264, |
| "learning_rate": 6.498619813340473e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5299872, |
| "step": 8490 |
| }, |
| { |
| "epoch": 17.05823293172691, |
| "grad_norm": 0.0025925240479409695, |
| "learning_rate": 6.455491214616622e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5303584, |
| "step": 8495 |
| }, |
| { |
| "epoch": 17.068273092369477, |
| "grad_norm": 0.002037809230387211, |
| "learning_rate": 6.412496327571999e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5307488, |
| "step": 8500 |
| }, |
| { |
| "epoch": 17.07831325301205, |
| "grad_norm": 0.006466969382017851, |
| "learning_rate": 6.369635284230563e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5311328, |
| "step": 8505 |
| }, |
| { |
| "epoch": 17.08835341365462, |
| "grad_norm": 0.0013107856502756476, |
| "learning_rate": 6.32690821620528e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5314720, |
| "step": 8510 |
| }, |
| { |
| "epoch": 17.098393574297187, |
| "grad_norm": 0.023452557623386383, |
| "learning_rate": 6.284315254697726e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5318752, |
| "step": 8515 |
| }, |
| { |
| "epoch": 17.10843373493976, |
| "grad_norm": 0.0020837204065173864, |
| "learning_rate": 6.241856530497669e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5321952, |
| "step": 8520 |
| }, |
| { |
| "epoch": 17.11847389558233, |
| "grad_norm": 0.0032065757550299168, |
| "learning_rate": 6.199532173982692e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5325056, |
| "step": 8525 |
| }, |
| { |
| "epoch": 17.1285140562249, |
| "grad_norm": 0.004327481612563133, |
| "learning_rate": 6.157342315117754e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5327936, |
| "step": 8530 |
| }, |
| { |
| "epoch": 17.13855421686747, |
| "grad_norm": 0.0018718891078606248, |
| "learning_rate": 6.115287083454823e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5331968, |
| "step": 8535 |
| }, |
| { |
| "epoch": 17.14859437751004, |
| "grad_norm": 0.0009799738181754947, |
| "learning_rate": 6.073366608132481e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5334144, |
| "step": 8540 |
| }, |
| { |
| "epoch": 17.15863453815261, |
| "grad_norm": 0.0023493673652410507, |
| "learning_rate": 6.031581017875482e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5336928, |
| "step": 8545 |
| }, |
| { |
| "epoch": 17.16867469879518, |
| "grad_norm": 0.0019072717987000942, |
| "learning_rate": 5.989930440994451e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5339904, |
| "step": 8550 |
| }, |
| { |
| "epoch": 17.17871485943775, |
| "grad_norm": 0.003958335146307945, |
| "learning_rate": 5.948415005385344e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5343552, |
| "step": 8555 |
| }, |
| { |
| "epoch": 17.188755020080322, |
| "grad_norm": 0.6150096654891968, |
| "learning_rate": 5.907034838529224e-07, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 5346752, |
| "step": 8560 |
| }, |
| { |
| "epoch": 17.198795180722893, |
| "grad_norm": 0.0013654690701514482, |
| "learning_rate": 5.865790067491739e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5349952, |
| "step": 8565 |
| }, |
| { |
| "epoch": 17.20883534136546, |
| "grad_norm": 0.8217050433158875, |
| "learning_rate": 5.824680818922762e-07, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 5352448, |
| "step": 8570 |
| }, |
| { |
| "epoch": 17.218875502008032, |
| "grad_norm": 0.0032255176920443773, |
| "learning_rate": 5.783707219056078e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5356032, |
| "step": 8575 |
| }, |
| { |
| "epoch": 17.228915662650603, |
| "grad_norm": 0.019121866673231125, |
| "learning_rate": 5.742869393708872e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5358368, |
| "step": 8580 |
| }, |
| { |
| "epoch": 17.23895582329317, |
| "grad_norm": 0.002871948992833495, |
| "learning_rate": 5.702167468281461e-07, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 5361216, |
| "step": 8585 |
| }, |
| { |
| "epoch": 17.248995983935743, |
| "grad_norm": 0.0008276253938674927, |
| "learning_rate": 5.661601567756819e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5364128, |
| "step": 8590 |
| }, |
| { |
| "epoch": 17.259036144578314, |
| "grad_norm": 0.00140668754465878, |
| "learning_rate": 5.621171816700249e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5367200, |
| "step": 8595 |
| }, |
| { |
| "epoch": 17.269076305220885, |
| "grad_norm": 0.0022226141300052404, |
| "learning_rate": 5.580878339258978e-07, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 5370144, |
| "step": 8600 |
| }, |
| { |
| "epoch": 17.279116465863453, |
| "grad_norm": 0.0009751960169523954, |
| "learning_rate": 5.540721259161774e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5373024, |
| "step": 8605 |
| }, |
| { |
| "epoch": 17.289156626506024, |
| "grad_norm": 0.018170801922678947, |
| "learning_rate": 5.500700699718564e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5375904, |
| "step": 8610 |
| }, |
| { |
| "epoch": 17.299196787148595, |
| "grad_norm": 0.0021682889200747013, |
| "learning_rate": 5.460816783820089e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5379264, |
| "step": 8615 |
| }, |
| { |
| "epoch": 17.309236947791163, |
| "grad_norm": 0.0069380393251776695, |
| "learning_rate": 5.42106963393747e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5382208, |
| "step": 8620 |
| }, |
| { |
| "epoch": 17.319277108433734, |
| "grad_norm": 0.11309646815061569, |
| "learning_rate": 5.381459372121878e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5385568, |
| "step": 8625 |
| }, |
| { |
| "epoch": 17.329317269076306, |
| "grad_norm": 0.007803209591656923, |
| "learning_rate": 5.341986120004145e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5389056, |
| "step": 8630 |
| }, |
| { |
| "epoch": 17.339357429718877, |
| "grad_norm": 0.001378397922962904, |
| "learning_rate": 5.302649998794368e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5391840, |
| "step": 8635 |
| }, |
| { |
| "epoch": 17.349397590361445, |
| "grad_norm": 0.008238635957241058, |
| "learning_rate": 5.263451129281605e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5395008, |
| "step": 8640 |
| }, |
| { |
| "epoch": 17.359437751004016, |
| "grad_norm": 0.0009625194361433387, |
| "learning_rate": 5.224389631833393e-07, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 5397728, |
| "step": 8645 |
| }, |
| { |
| "epoch": 17.369477911646587, |
| "grad_norm": 0.019273938611149788, |
| "learning_rate": 5.185465626395486e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5401248, |
| "step": 8650 |
| }, |
| { |
| "epoch": 17.379518072289155, |
| "grad_norm": 0.0024227574467658997, |
| "learning_rate": 5.146679232491436e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5405024, |
| "step": 8655 |
| }, |
| { |
| "epoch": 17.389558232931726, |
| "grad_norm": 0.007197659928351641, |
| "learning_rate": 5.108030569222211e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5407968, |
| "step": 8660 |
| }, |
| { |
| "epoch": 17.399598393574298, |
| "grad_norm": 0.009146859869360924, |
| "learning_rate": 5.0695197552659e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5410944, |
| "step": 8665 |
| }, |
| { |
| "epoch": 17.40963855421687, |
| "grad_norm": 0.002492484636604786, |
| "learning_rate": 5.031146908877221e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5414240, |
| "step": 8670 |
| }, |
| { |
| "epoch": 17.419678714859437, |
| "grad_norm": 0.007068297825753689, |
| "learning_rate": 4.99291214788733e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5416896, |
| "step": 8675 |
| }, |
| { |
| "epoch": 17.429718875502008, |
| "grad_norm": 0.0012975713470950723, |
| "learning_rate": 4.954815589703277e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5419744, |
| "step": 8680 |
| }, |
| { |
| "epoch": 17.43975903614458, |
| "grad_norm": 0.0051535964012146, |
| "learning_rate": 4.916857351307802e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5422560, |
| "step": 8685 |
| }, |
| { |
| "epoch": 17.449799196787147, |
| "grad_norm": 0.004130291286855936, |
| "learning_rate": 4.879037549258875e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5426016, |
| "step": 8690 |
| }, |
| { |
| "epoch": 17.45983935742972, |
| "grad_norm": 0.0020685733761638403, |
| "learning_rate": 4.841356299689359e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5429280, |
| "step": 8695 |
| }, |
| { |
| "epoch": 17.46987951807229, |
| "grad_norm": 0.0011084630386903882, |
| "learning_rate": 4.803813718306716e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5432576, |
| "step": 8700 |
| }, |
| { |
| "epoch": 17.47991967871486, |
| "grad_norm": 0.008571324869990349, |
| "learning_rate": 4.7664099203925284e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5436064, |
| "step": 8705 |
| }, |
| { |
| "epoch": 17.48995983935743, |
| "grad_norm": 0.0010640741093084216, |
| "learning_rate": 4.7291450208022836e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5438880, |
| "step": 8710 |
| }, |
| { |
| "epoch": 17.5, |
| "grad_norm": 0.0077699050307273865, |
| "learning_rate": 4.692019133964931e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5441696, |
| "step": 8715 |
| }, |
| { |
| "epoch": 17.51004016064257, |
| "grad_norm": 0.0019389991648495197, |
| "learning_rate": 4.65503237388254e-07, |
| "loss": 0.0009, |
| "num_input_tokens_seen": 5444448, |
| "step": 8720 |
| }, |
| { |
| "epoch": 17.52008032128514, |
| "grad_norm": 0.0016935811145231128, |
| "learning_rate": 4.618184854129981e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5447424, |
| "step": 8725 |
| }, |
| { |
| "epoch": 17.53012048192771, |
| "grad_norm": 0.0011479026870802045, |
| "learning_rate": 4.581476687854558e-07, |
| "loss": 0.0619, |
| "num_input_tokens_seen": 5450688, |
| "step": 8730 |
| }, |
| { |
| "epoch": 17.54016064257028, |
| "grad_norm": 0.0018761102110147476, |
| "learning_rate": 4.5449079877756653e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5453472, |
| "step": 8735 |
| }, |
| { |
| "epoch": 17.550200803212853, |
| "grad_norm": 0.001550107728689909, |
| "learning_rate": 4.508478866184435e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5456800, |
| "step": 8740 |
| }, |
| { |
| "epoch": 17.56024096385542, |
| "grad_norm": 0.0016778951976448298, |
| "learning_rate": 4.4721894349434027e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5460256, |
| "step": 8745 |
| }, |
| { |
| "epoch": 17.570281124497992, |
| "grad_norm": 0.0011093484936282039, |
| "learning_rate": 4.4360398054861473e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5463712, |
| "step": 8750 |
| }, |
| { |
| "epoch": 17.580321285140563, |
| "grad_norm": 0.011861991137266159, |
| "learning_rate": 4.4000300888169753e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5467104, |
| "step": 8755 |
| }, |
| { |
| "epoch": 17.59036144578313, |
| "grad_norm": 0.002717207185924053, |
| "learning_rate": 4.364160395510547e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5469888, |
| "step": 8760 |
| }, |
| { |
| "epoch": 17.600401606425702, |
| "grad_norm": 0.010903539136052132, |
| "learning_rate": 4.328430835711589e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5473216, |
| "step": 8765 |
| }, |
| { |
| "epoch": 17.610441767068274, |
| "grad_norm": 0.0062487199902534485, |
| "learning_rate": 4.2928415191344664e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5476768, |
| "step": 8770 |
| }, |
| { |
| "epoch": 17.620481927710845, |
| "grad_norm": 0.008080641739070415, |
| "learning_rate": 4.2573925550629393e-07, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 5479648, |
| "step": 8775 |
| }, |
| { |
| "epoch": 17.630522088353413, |
| "grad_norm": 199.20767211914062, |
| "learning_rate": 4.2220840523497896e-07, |
| "loss": 0.0178, |
| "num_input_tokens_seen": 5483360, |
| "step": 8780 |
| }, |
| { |
| "epoch": 17.640562248995984, |
| "grad_norm": 0.0291756484657526, |
| "learning_rate": 4.1869161194164565e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5486528, |
| "step": 8785 |
| }, |
| { |
| "epoch": 17.650602409638555, |
| "grad_norm": 0.0011264854110777378, |
| "learning_rate": 4.15188886425279e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5489728, |
| "step": 8790 |
| }, |
| { |
| "epoch": 17.660642570281123, |
| "grad_norm": 0.0019059681799262762, |
| "learning_rate": 4.117002394416586e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5492320, |
| "step": 8795 |
| }, |
| { |
| "epoch": 17.670682730923694, |
| "grad_norm": 0.0012327972799539566, |
| "learning_rate": 4.082256817033392e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5495840, |
| "step": 8800 |
| }, |
| { |
| "epoch": 17.680722891566266, |
| "grad_norm": 0.0454624705016613, |
| "learning_rate": 4.047652238796096e-07, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 5498784, |
| "step": 8805 |
| }, |
| { |
| "epoch": 17.690763052208837, |
| "grad_norm": 0.02524404413998127, |
| "learning_rate": 4.0131887659646265e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5501088, |
| "step": 8810 |
| }, |
| { |
| "epoch": 17.700803212851405, |
| "grad_norm": 0.0016588432481512427, |
| "learning_rate": 3.9788665043656083e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5504512, |
| "step": 8815 |
| }, |
| { |
| "epoch": 17.710843373493976, |
| "grad_norm": 0.004111200571060181, |
| "learning_rate": 3.94468555939207e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5507616, |
| "step": 8820 |
| }, |
| { |
| "epoch": 17.720883534136547, |
| "grad_norm": 0.056082893162965775, |
| "learning_rate": 3.9106460360030853e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5510624, |
| "step": 8825 |
| }, |
| { |
| "epoch": 17.730923694779115, |
| "grad_norm": 0.0011907127918675542, |
| "learning_rate": 3.8767480387234714e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5513952, |
| "step": 8830 |
| }, |
| { |
| "epoch": 17.740963855421686, |
| "grad_norm": 0.001506793312728405, |
| "learning_rate": 3.84299167164347e-07, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 5516704, |
| "step": 8835 |
| }, |
| { |
| "epoch": 17.751004016064257, |
| "grad_norm": 0.001404171111062169, |
| "learning_rate": 3.809377038418405e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5519328, |
| "step": 8840 |
| }, |
| { |
| "epoch": 17.76104417670683, |
| "grad_norm": 0.0012987729860469699, |
| "learning_rate": 3.775904242268391e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5522688, |
| "step": 8845 |
| }, |
| { |
| "epoch": 17.771084337349397, |
| "grad_norm": 0.0013029174879193306, |
| "learning_rate": 3.742573385977999e-07, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 5525024, |
| "step": 8850 |
| }, |
| { |
| "epoch": 17.781124497991968, |
| "grad_norm": 0.001227586530148983, |
| "learning_rate": 3.7093845718959575e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5527808, |
| "step": 8855 |
| }, |
| { |
| "epoch": 17.79116465863454, |
| "grad_norm": 1.0415514707565308, |
| "learning_rate": 3.676337901934812e-07, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 5530688, |
| "step": 8860 |
| }, |
| { |
| "epoch": 17.801204819277107, |
| "grad_norm": 0.0038885881658643484, |
| "learning_rate": 3.6434334775706403e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5533696, |
| "step": 8865 |
| }, |
| { |
| "epoch": 17.811244979919678, |
| "grad_norm": 0.12921766936779022, |
| "learning_rate": 3.610671399842719e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5536448, |
| "step": 8870 |
| }, |
| { |
| "epoch": 17.82128514056225, |
| "grad_norm": 0.024864312261343002, |
| "learning_rate": 3.578051769353219e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5539808, |
| "step": 8875 |
| }, |
| { |
| "epoch": 17.83132530120482, |
| "grad_norm": 0.0016896923771128058, |
| "learning_rate": 3.5455746862669336e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5542848, |
| "step": 8880 |
| }, |
| { |
| "epoch": 17.84136546184739, |
| "grad_norm": 0.0020967130549252033, |
| "learning_rate": 3.513240250310873e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5545376, |
| "step": 8885 |
| }, |
| { |
| "epoch": 17.85140562248996, |
| "grad_norm": 0.01755034364759922, |
| "learning_rate": 3.4810485607740975e-07, |
| "loss": 0.0411, |
| "num_input_tokens_seen": 5549088, |
| "step": 8890 |
| }, |
| { |
| "epoch": 17.86144578313253, |
| "grad_norm": 0.47964027523994446, |
| "learning_rate": 3.4489997165072785e-07, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 5551712, |
| "step": 8895 |
| }, |
| { |
| "epoch": 17.8714859437751, |
| "grad_norm": 0.014545414596796036, |
| "learning_rate": 3.4170938159224675e-07, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 5554432, |
| "step": 8900 |
| }, |
| { |
| "epoch": 17.88152610441767, |
| "grad_norm": 0.0012772183399647474, |
| "learning_rate": 3.385330956992816e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5557504, |
| "step": 8905 |
| }, |
| { |
| "epoch": 17.89156626506024, |
| "grad_norm": 0.004275870509445667, |
| "learning_rate": 3.3537112372521777e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5560608, |
| "step": 8910 |
| }, |
| { |
| "epoch": 17.901606425702813, |
| "grad_norm": 0.016472170129418373, |
| "learning_rate": 3.3222347537949395e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5563584, |
| "step": 8915 |
| }, |
| { |
| "epoch": 17.91164658634538, |
| "grad_norm": 0.0016320595750585198, |
| "learning_rate": 3.290901603275587e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5566592, |
| "step": 8920 |
| }, |
| { |
| "epoch": 17.92168674698795, |
| "grad_norm": 0.002692002570256591, |
| "learning_rate": 3.2597118819085227e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5569536, |
| "step": 8925 |
| }, |
| { |
| "epoch": 17.931726907630523, |
| "grad_norm": 0.08017129451036453, |
| "learning_rate": 3.228665685467702e-07, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 5572448, |
| "step": 8930 |
| }, |
| { |
| "epoch": 17.94176706827309, |
| "grad_norm": 0.0011358013143762946, |
| "learning_rate": 3.1977631092863613e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5575296, |
| "step": 8935 |
| }, |
| { |
| "epoch": 17.951807228915662, |
| "grad_norm": 0.04995585232973099, |
| "learning_rate": 3.167004248256733e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5578912, |
| "step": 8940 |
| }, |
| { |
| "epoch": 17.961847389558233, |
| "grad_norm": 0.010064242407679558, |
| "learning_rate": 3.1363891968297367e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5581056, |
| "step": 8945 |
| }, |
| { |
| "epoch": 17.971887550200805, |
| "grad_norm": 0.0012515847338363528, |
| "learning_rate": 3.105918049014689e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5584352, |
| "step": 8950 |
| }, |
| { |
| "epoch": 17.981927710843372, |
| "grad_norm": 0.0015645629027858377, |
| "learning_rate": 3.075590898379044e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5587872, |
| "step": 8955 |
| }, |
| { |
| "epoch": 17.991967871485944, |
| "grad_norm": 0.01546509936451912, |
| "learning_rate": 3.04540783804807e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5591680, |
| "step": 8960 |
| }, |
| { |
| "epoch": 18.0, |
| "eval_loss": 1.093041181564331, |
| "eval_runtime": 8.0723, |
| "eval_samples_per_second": 61.692, |
| "eval_steps_per_second": 15.485, |
| "num_input_tokens_seen": 5594752, |
| "step": 8964 |
| }, |
| { |
| "epoch": 18.002008032128515, |
| "grad_norm": 0.3365046977996826, |
| "learning_rate": 3.015368960704584e-07, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 5595360, |
| "step": 8965 |
| }, |
| { |
| "epoch": 18.012048192771083, |
| "grad_norm": 0.017432406544685364, |
| "learning_rate": 2.985474358588658e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5598368, |
| "step": 8970 |
| }, |
| { |
| "epoch": 18.022088353413654, |
| "grad_norm": 0.003238762030377984, |
| "learning_rate": 2.9557241234973446e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5601664, |
| "step": 8975 |
| }, |
| { |
| "epoch": 18.032128514056225, |
| "grad_norm": 0.02988354116678238, |
| "learning_rate": 2.926118346784379e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5604736, |
| "step": 8980 |
| }, |
| { |
| "epoch": 18.042168674698797, |
| "grad_norm": 0.001696154591627419, |
| "learning_rate": 2.8966571193599304e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5607936, |
| "step": 8985 |
| }, |
| { |
| "epoch": 18.052208835341364, |
| "grad_norm": 0.0027232773136347532, |
| "learning_rate": 2.8673405316902824e-07, |
| "loss": 0.0157, |
| "num_input_tokens_seen": 5611200, |
| "step": 8990 |
| }, |
| { |
| "epoch": 18.062248995983936, |
| "grad_norm": 0.004504075739532709, |
| "learning_rate": 2.8381686737975867e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5613856, |
| "step": 8995 |
| }, |
| { |
| "epoch": 18.072289156626507, |
| "grad_norm": 0.002720191143453121, |
| "learning_rate": 2.809141635259555e-07, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 5617152, |
| "step": 9000 |
| }, |
| { |
| "epoch": 18.082329317269075, |
| "grad_norm": 0.0020942571572959423, |
| "learning_rate": 2.780259505209249e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5620160, |
| "step": 9005 |
| }, |
| { |
| "epoch": 18.092369477911646, |
| "grad_norm": 0.017170244827866554, |
| "learning_rate": 2.7515223723346974e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5623424, |
| "step": 9010 |
| }, |
| { |
| "epoch": 18.102409638554217, |
| "grad_norm": 0.008794093504548073, |
| "learning_rate": 2.722930324878748e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5626208, |
| "step": 9015 |
| }, |
| { |
| "epoch": 18.11244979919679, |
| "grad_norm": 0.00417192792519927, |
| "learning_rate": 2.694483450638685e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5629280, |
| "step": 9020 |
| }, |
| { |
| "epoch": 18.122489959839356, |
| "grad_norm": 0.031785111874341965, |
| "learning_rate": 2.666181836966053e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5632256, |
| "step": 9025 |
| }, |
| { |
| "epoch": 18.132530120481928, |
| "grad_norm": 0.004091055132448673, |
| "learning_rate": 2.6380255707663285e-07, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 5634688, |
| "step": 9030 |
| }, |
| { |
| "epoch": 18.1425702811245, |
| "grad_norm": 0.06754046678543091, |
| "learning_rate": 2.610014738498656e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5637984, |
| "step": 9035 |
| }, |
| { |
| "epoch": 18.152610441767067, |
| "grad_norm": 0.012670686468482018, |
| "learning_rate": 2.5821494261756284e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5641440, |
| "step": 9040 |
| }, |
| { |
| "epoch": 18.162650602409638, |
| "grad_norm": 17.17615509033203, |
| "learning_rate": 2.554429719362972e-07, |
| "loss": 0.0529, |
| "num_input_tokens_seen": 5644960, |
| "step": 9045 |
| }, |
| { |
| "epoch": 18.17269076305221, |
| "grad_norm": 0.0012001717695966363, |
| "learning_rate": 2.526855703179304e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5648512, |
| "step": 9050 |
| }, |
| { |
| "epoch": 18.18273092369478, |
| "grad_norm": 0.005140680354088545, |
| "learning_rate": 2.4994274622958726e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5651584, |
| "step": 9055 |
| }, |
| { |
| "epoch": 18.19277108433735, |
| "grad_norm": 0.025966104120016098, |
| "learning_rate": 2.4721450809363054e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5654720, |
| "step": 9060 |
| }, |
| { |
| "epoch": 18.20281124497992, |
| "grad_norm": 0.003484898479655385, |
| "learning_rate": 2.4450086428763345e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5657952, |
| "step": 9065 |
| }, |
| { |
| "epoch": 18.21285140562249, |
| "grad_norm": 0.00537458062171936, |
| "learning_rate": 2.4180182314435305e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5661120, |
| "step": 9070 |
| }, |
| { |
| "epoch": 18.22289156626506, |
| "grad_norm": 0.06362808495759964, |
| "learning_rate": 2.3911739295170875e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5664704, |
| "step": 9075 |
| }, |
| { |
| "epoch": 18.23293172690763, |
| "grad_norm": 0.013311965391039848, |
| "learning_rate": 2.364475819527523e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5667744, |
| "step": 9080 |
| }, |
| { |
| "epoch": 18.2429718875502, |
| "grad_norm": 0.016375111415982246, |
| "learning_rate": 2.3379239834564526e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5670496, |
| "step": 9085 |
| }, |
| { |
| "epoch": 18.253012048192772, |
| "grad_norm": 0.008078474551439285, |
| "learning_rate": 2.3115185028363186e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5673632, |
| "step": 9090 |
| }, |
| { |
| "epoch": 18.26305220883534, |
| "grad_norm": 0.0013551748124882579, |
| "learning_rate": 2.2852594587501887e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5676672, |
| "step": 9095 |
| }, |
| { |
| "epoch": 18.27309236947791, |
| "grad_norm": 0.009037856943905354, |
| "learning_rate": 2.259146931831413e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5680352, |
| "step": 9100 |
| }, |
| { |
| "epoch": 18.283132530120483, |
| "grad_norm": 0.005470726173371077, |
| "learning_rate": 2.2331810022634847e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5683104, |
| "step": 9105 |
| }, |
| { |
| "epoch": 18.29317269076305, |
| "grad_norm": 0.001181105268187821, |
| "learning_rate": 2.2073617497797018e-07, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 5686688, |
| "step": 9110 |
| }, |
| { |
| "epoch": 18.303212851405622, |
| "grad_norm": 0.0015586339868605137, |
| "learning_rate": 2.1816892536629775e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5689600, |
| "step": 9115 |
| }, |
| { |
| "epoch": 18.313253012048193, |
| "grad_norm": 0.0013247845927253366, |
| "learning_rate": 2.1561635927456083e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5692768, |
| "step": 9120 |
| }, |
| { |
| "epoch": 18.323293172690764, |
| "grad_norm": 0.0010992807801812887, |
| "learning_rate": 2.1307848454089452e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5695584, |
| "step": 9125 |
| }, |
| { |
| "epoch": 18.333333333333332, |
| "grad_norm": 0.006108762696385384, |
| "learning_rate": 2.1055530895832897e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5698784, |
| "step": 9130 |
| }, |
| { |
| "epoch": 18.343373493975903, |
| "grad_norm": 0.006278180982917547, |
| "learning_rate": 2.0804684027474987e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5701504, |
| "step": 9135 |
| }, |
| { |
| "epoch": 18.353413654618475, |
| "grad_norm": 0.0014436625642701983, |
| "learning_rate": 2.055530861928884e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5705216, |
| "step": 9140 |
| }, |
| { |
| "epoch": 18.363453815261042, |
| "grad_norm": 0.009470781311392784, |
| "learning_rate": 2.0307405437029027e-07, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 5708576, |
| "step": 9145 |
| }, |
| { |
| "epoch": 18.373493975903614, |
| "grad_norm": 0.00394744286313653, |
| "learning_rate": 2.006097524192918e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5712288, |
| "step": 9150 |
| }, |
| { |
| "epoch": 18.383534136546185, |
| "grad_norm": 0.007234565913677216, |
| "learning_rate": 1.9816018790700165e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5715648, |
| "step": 9155 |
| }, |
| { |
| "epoch": 18.393574297188756, |
| "grad_norm": 0.0013976708287373185, |
| "learning_rate": 1.9572536835527013e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5718720, |
| "step": 9160 |
| }, |
| { |
| "epoch": 18.403614457831324, |
| "grad_norm": 0.016879552975296974, |
| "learning_rate": 1.933053012406749e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5722560, |
| "step": 9165 |
| }, |
| { |
| "epoch": 18.413654618473895, |
| "grad_norm": 0.0010123576503247023, |
| "learning_rate": 1.908999939944911e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5725408, |
| "step": 9170 |
| }, |
| { |
| "epoch": 18.423694779116467, |
| "grad_norm": 0.0012418956030160189, |
| "learning_rate": 1.8850945400266994e-07, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 5729024, |
| "step": 9175 |
| }, |
| { |
| "epoch": 18.433734939759034, |
| "grad_norm": 0.0023059435188770294, |
| "learning_rate": 1.861336886058196e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5731584, |
| "step": 9180 |
| }, |
| { |
| "epoch": 18.443775100401606, |
| "grad_norm": 0.015861524268984795, |
| "learning_rate": 1.8377270509917777e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5734624, |
| "step": 9185 |
| }, |
| { |
| "epoch": 18.453815261044177, |
| "grad_norm": 0.001924677286297083, |
| "learning_rate": 1.81426510732593e-07, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 5737920, |
| "step": 9190 |
| }, |
| { |
| "epoch": 18.46385542168675, |
| "grad_norm": 0.004833642393350601, |
| "learning_rate": 1.7909511271050006e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5740896, |
| "step": 9195 |
| }, |
| { |
| "epoch": 18.473895582329316, |
| "grad_norm": 0.0023220451548695564, |
| "learning_rate": 1.7677851819189907e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5744000, |
| "step": 9200 |
| }, |
| { |
| "epoch": 18.483935742971887, |
| "grad_norm": 0.01160132884979248, |
| "learning_rate": 1.7447673429033361e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5746816, |
| "step": 9205 |
| }, |
| { |
| "epoch": 18.49397590361446, |
| "grad_norm": 0.005880299024283886, |
| "learning_rate": 1.7218976807386767e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5749696, |
| "step": 9210 |
| }, |
| { |
| "epoch": 18.50401606425703, |
| "grad_norm": 0.003148122224956751, |
| "learning_rate": 1.6991762656506483e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5752544, |
| "step": 9215 |
| }, |
| { |
| "epoch": 18.514056224899598, |
| "grad_norm": 0.0012982593616470695, |
| "learning_rate": 1.6766031674096795e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5756672, |
| "step": 9220 |
| }, |
| { |
| "epoch": 18.52409638554217, |
| "grad_norm": 0.02512902021408081, |
| "learning_rate": 1.654178455330735e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5759520, |
| "step": 9225 |
| }, |
| { |
| "epoch": 18.53413654618474, |
| "grad_norm": 0.002674127696081996, |
| "learning_rate": 1.631902198273172e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5762848, |
| "step": 9230 |
| }, |
| { |
| "epoch": 18.544176706827308, |
| "grad_norm": 0.002228042809292674, |
| "learning_rate": 1.6097744646404457e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5766496, |
| "step": 9235 |
| }, |
| { |
| "epoch": 18.55421686746988, |
| "grad_norm": 0.003513950854539871, |
| "learning_rate": 1.5877953223799703e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5769600, |
| "step": 9240 |
| }, |
| { |
| "epoch": 18.56425702811245, |
| "grad_norm": 0.004237438552081585, |
| "learning_rate": 1.565964838982881e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5772800, |
| "step": 9245 |
| }, |
| { |
| "epoch": 18.57429718875502, |
| "grad_norm": 0.0016228174790740013, |
| "learning_rate": 1.544283081483805e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5776416, |
| "step": 9250 |
| }, |
| { |
| "epoch": 18.58433734939759, |
| "grad_norm": 0.0069399080239236355, |
| "learning_rate": 1.5227501164607138e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5778976, |
| "step": 9255 |
| }, |
| { |
| "epoch": 18.59437751004016, |
| "grad_norm": 0.006944271270185709, |
| "learning_rate": 1.501366010034644e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5782400, |
| "step": 9260 |
| }, |
| { |
| "epoch": 18.604417670682732, |
| "grad_norm": 0.013072614558041096, |
| "learning_rate": 1.4801308278695636e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5784640, |
| "step": 9265 |
| }, |
| { |
| "epoch": 18.6144578313253, |
| "grad_norm": 0.0017089575994759798, |
| "learning_rate": 1.45904463517213e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5787936, |
| "step": 9270 |
| }, |
| { |
| "epoch": 18.62449799196787, |
| "grad_norm": 0.0009865846950560808, |
| "learning_rate": 1.4381074966914987e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5791584, |
| "step": 9275 |
| }, |
| { |
| "epoch": 18.634538152610443, |
| "grad_norm": 0.0016180879902094603, |
| "learning_rate": 1.4173194767191257e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5794912, |
| "step": 9280 |
| }, |
| { |
| "epoch": 18.644578313253014, |
| "grad_norm": 0.0051255906000733376, |
| "learning_rate": 1.396680639088571e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5797568, |
| "step": 9285 |
| }, |
| { |
| "epoch": 18.65461847389558, |
| "grad_norm": 0.0013111892621964216, |
| "learning_rate": 1.3761910471753126e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5801088, |
| "step": 9290 |
| }, |
| { |
| "epoch": 18.664658634538153, |
| "grad_norm": 0.0022690477780997753, |
| "learning_rate": 1.3558507638965158e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5804096, |
| "step": 9295 |
| }, |
| { |
| "epoch": 18.674698795180724, |
| "grad_norm": 0.008249749429523945, |
| "learning_rate": 1.3356598517108966e-07, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 5807392, |
| "step": 9300 |
| }, |
| { |
| "epoch": 18.684738955823292, |
| "grad_norm": 0.009947913698852062, |
| "learning_rate": 1.3156183726184657e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5810848, |
| "step": 9305 |
| }, |
| { |
| "epoch": 18.694779116465863, |
| "grad_norm": 0.0014019834343343973, |
| "learning_rate": 1.295726388160412e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5814176, |
| "step": 9310 |
| }, |
| { |
| "epoch": 18.704819277108435, |
| "grad_norm": 0.0023297134321182966, |
| "learning_rate": 1.2759839594188307e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5816736, |
| "step": 9315 |
| }, |
| { |
| "epoch": 18.714859437751002, |
| "grad_norm": 0.001655187108553946, |
| "learning_rate": 1.2563911470166057e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5819360, |
| "step": 9320 |
| }, |
| { |
| "epoch": 18.724899598393574, |
| "grad_norm": 0.002192035550251603, |
| "learning_rate": 1.2369480111171784e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5822304, |
| "step": 9325 |
| }, |
| { |
| "epoch": 18.734939759036145, |
| "grad_norm": 0.0010390159441158175, |
| "learning_rate": 1.2176546114243903e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5824768, |
| "step": 9330 |
| }, |
| { |
| "epoch": 18.744979919678716, |
| "grad_norm": 0.0041192579083144665, |
| "learning_rate": 1.198511007182296e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5827488, |
| "step": 9335 |
| }, |
| { |
| "epoch": 18.755020080321284, |
| "grad_norm": 0.0012042642338201404, |
| "learning_rate": 1.1795172571749503e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5830496, |
| "step": 9340 |
| }, |
| { |
| "epoch": 18.765060240963855, |
| "grad_norm": 0.0018591363914310932, |
| "learning_rate": 1.160673419726288e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5833952, |
| "step": 9345 |
| }, |
| { |
| "epoch": 18.775100401606426, |
| "grad_norm": 0.005543689243495464, |
| "learning_rate": 1.1419795526998679e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5837280, |
| "step": 9350 |
| }, |
| { |
| "epoch": 18.785140562248998, |
| "grad_norm": 0.0025698868557810783, |
| "learning_rate": 1.1234357134987717e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5839936, |
| "step": 9355 |
| }, |
| { |
| "epoch": 18.795180722891565, |
| "grad_norm": 0.023136422038078308, |
| "learning_rate": 1.1050419590653726e-07, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 5843584, |
| "step": 9360 |
| }, |
| { |
| "epoch": 18.805220883534137, |
| "grad_norm": 0.001498569967225194, |
| "learning_rate": 1.0867983458811792e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5846624, |
| "step": 9365 |
| }, |
| { |
| "epoch": 18.815261044176708, |
| "grad_norm": 0.06827542185783386, |
| "learning_rate": 1.0687049299666796e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5850112, |
| "step": 9370 |
| }, |
| { |
| "epoch": 18.825301204819276, |
| "grad_norm": 0.004503254778683186, |
| "learning_rate": 1.050761766881131e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5853856, |
| "step": 9375 |
| }, |
| { |
| "epoch": 18.835341365461847, |
| "grad_norm": 0.021971486508846283, |
| "learning_rate": 1.0329689117224262e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5857024, |
| "step": 9380 |
| }, |
| { |
| "epoch": 18.84538152610442, |
| "grad_norm": 0.0739317312836647, |
| "learning_rate": 1.0153264191269052e-07, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5860128, |
| "step": 9385 |
| }, |
| { |
| "epoch": 18.855421686746986, |
| "grad_norm": 0.0029136035591363907, |
| "learning_rate": 9.978343432691884e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5862336, |
| "step": 9390 |
| }, |
| { |
| "epoch": 18.865461847389557, |
| "grad_norm": 0.002241392619907856, |
| "learning_rate": 9.804927378620155e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5865792, |
| "step": 9395 |
| }, |
| { |
| "epoch": 18.87550200803213, |
| "grad_norm": 0.0049470034427940845, |
| "learning_rate": 9.633016561560793e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5869280, |
| "step": 9400 |
| }, |
| { |
| "epoch": 18.8855421686747, |
| "grad_norm": 0.0011665538186207414, |
| "learning_rate": 9.462611509398534e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5872288, |
| "step": 9405 |
| }, |
| { |
| "epoch": 18.895582329317268, |
| "grad_norm": 0.0019446617225185037, |
| "learning_rate": 9.293712745394479e-08, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 5874688, |
| "step": 9410 |
| }, |
| { |
| "epoch": 18.90562248995984, |
| "grad_norm": 0.12640930712223053, |
| "learning_rate": 9.126320788184374e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5877824, |
| "step": 9415 |
| }, |
| { |
| "epoch": 18.91566265060241, |
| "grad_norm": 0.014082228764891624, |
| "learning_rate": 8.960436151776886e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5881056, |
| "step": 9420 |
| }, |
| { |
| "epoch": 18.92570281124498, |
| "grad_norm": 0.0034161340445280075, |
| "learning_rate": 8.796059345552389e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5884320, |
| "step": 9425 |
| }, |
| { |
| "epoch": 18.93574297188755, |
| "grad_norm": 0.0019884402863681316, |
| "learning_rate": 8.633190874261011e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5887648, |
| "step": 9430 |
| }, |
| { |
| "epoch": 18.94578313253012, |
| "grad_norm": 0.013954327441751957, |
| "learning_rate": 8.471831238021366e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5890976, |
| "step": 9435 |
| }, |
| { |
| "epoch": 18.955823293172692, |
| "grad_norm": 0.04434090852737427, |
| "learning_rate": 8.31198093231872e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5893344, |
| "step": 9440 |
| }, |
| { |
| "epoch": 18.96586345381526, |
| "grad_norm": 0.006404112558811903, |
| "learning_rate": 8.153640448003875e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5895808, |
| "step": 9445 |
| }, |
| { |
| "epoch": 18.97590361445783, |
| "grad_norm": 0.0015161640476435423, |
| "learning_rate": 7.996810271291344e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5899200, |
| "step": 9450 |
| }, |
| { |
| "epoch": 18.985943775100402, |
| "grad_norm": 0.09693264961242676, |
| "learning_rate": 7.841490883757907e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5902336, |
| "step": 9455 |
| }, |
| { |
| "epoch": 18.99598393574297, |
| "grad_norm": 0.005461865570396185, |
| "learning_rate": 7.687682762341276e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5905248, |
| "step": 9460 |
| }, |
| { |
| "epoch": 19.00602409638554, |
| "grad_norm": 0.0033444438595324755, |
| "learning_rate": 7.535386379338371e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5908704, |
| "step": 9465 |
| }, |
| { |
| "epoch": 19.016064257028113, |
| "grad_norm": 0.019571533426642418, |
| "learning_rate": 7.384602202404335e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5912832, |
| "step": 9470 |
| }, |
| { |
| "epoch": 19.026104417670684, |
| "grad_norm": 0.0012603729264810681, |
| "learning_rate": 7.235330694550402e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5917056, |
| "step": 9475 |
| }, |
| { |
| "epoch": 19.03614457831325, |
| "grad_norm": 0.014011326245963573, |
| "learning_rate": 7.087572314143198e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5920192, |
| "step": 9480 |
| }, |
| { |
| "epoch": 19.046184738955823, |
| "grad_norm": 0.0010616021463647485, |
| "learning_rate": 6.94132751490284e-08, |
| "loss": 0.0059, |
| "num_input_tokens_seen": 5922368, |
| "step": 9485 |
| }, |
| { |
| "epoch": 19.056224899598394, |
| "grad_norm": 0.008679233491420746, |
| "learning_rate": 6.796596745901717e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5925056, |
| "step": 9490 |
| }, |
| { |
| "epoch": 19.066265060240966, |
| "grad_norm": 0.0020861446391791105, |
| "learning_rate": 6.653380451563219e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5928256, |
| "step": 9495 |
| }, |
| { |
| "epoch": 19.076305220883533, |
| "grad_norm": 0.00580920884385705, |
| "learning_rate": 6.511679071659949e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5931392, |
| "step": 9500 |
| }, |
| { |
| "epoch": 19.086345381526105, |
| "grad_norm": 0.005442566704005003, |
| "learning_rate": 6.371493041313126e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5934464, |
| "step": 9505 |
| }, |
| { |
| "epoch": 19.096385542168676, |
| "grad_norm": 0.001245411578565836, |
| "learning_rate": 6.232822790990467e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5937568, |
| "step": 9510 |
| }, |
| { |
| "epoch": 19.106425702811244, |
| "grad_norm": 0.04193849116563797, |
| "learning_rate": 6.095668746505245e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5940768, |
| "step": 9515 |
| }, |
| { |
| "epoch": 19.116465863453815, |
| "grad_norm": 0.003646423341706395, |
| "learning_rate": 5.96003132901507e-08, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 5944032, |
| "step": 9520 |
| }, |
| { |
| "epoch": 19.126506024096386, |
| "grad_norm": 0.29433780908584595, |
| "learning_rate": 5.825910955020386e-08, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 5947168, |
| "step": 9525 |
| }, |
| { |
| "epoch": 19.136546184738958, |
| "grad_norm": 0.0010262697469443083, |
| "learning_rate": 5.693308036363143e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5950592, |
| "step": 9530 |
| }, |
| { |
| "epoch": 19.146586345381525, |
| "grad_norm": 0.00138461880851537, |
| "learning_rate": 5.562222980225907e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5952960, |
| "step": 9535 |
| }, |
| { |
| "epoch": 19.156626506024097, |
| "grad_norm": 0.011420581489801407, |
| "learning_rate": 5.432656189130137e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5956288, |
| "step": 9540 |
| }, |
| { |
| "epoch": 19.166666666666668, |
| "grad_norm": 0.003382657188922167, |
| "learning_rate": 5.3046080609352455e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5958752, |
| "step": 9545 |
| }, |
| { |
| "epoch": 19.176706827309236, |
| "grad_norm": 0.0018110686214640737, |
| "learning_rate": 5.178078988837432e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5962144, |
| "step": 9550 |
| }, |
| { |
| "epoch": 19.186746987951807, |
| "grad_norm": 0.0012735830387100577, |
| "learning_rate": 5.053069361368068e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5965280, |
| "step": 9555 |
| }, |
| { |
| "epoch": 19.196787148594378, |
| "grad_norm": 0.0035719373263418674, |
| "learning_rate": 4.9295795623930945e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5968192, |
| "step": 9560 |
| }, |
| { |
| "epoch": 19.20682730923695, |
| "grad_norm": 0.0012759892269968987, |
| "learning_rate": 4.807609971111238e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5971264, |
| "step": 9565 |
| }, |
| { |
| "epoch": 19.216867469879517, |
| "grad_norm": 0.031974148005247116, |
| "learning_rate": 4.68716096205335e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5973344, |
| "step": 9570 |
| }, |
| { |
| "epoch": 19.22690763052209, |
| "grad_norm": 0.0022920300252735615, |
| "learning_rate": 4.5682329050810715e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5977248, |
| "step": 9575 |
| }, |
| { |
| "epoch": 19.23694779116466, |
| "grad_norm": 0.011636001989245415, |
| "learning_rate": 4.450826165385336e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5980704, |
| "step": 9580 |
| }, |
| { |
| "epoch": 19.246987951807228, |
| "grad_norm": 0.0016436435980722308, |
| "learning_rate": 4.33494110348609e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5983936, |
| "step": 9585 |
| }, |
| { |
| "epoch": 19.2570281124498, |
| "grad_norm": 0.03811986371874809, |
| "learning_rate": 4.2205780752301865e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5987424, |
| "step": 9590 |
| }, |
| { |
| "epoch": 19.26706827309237, |
| "grad_norm": 0.001627400633879006, |
| "learning_rate": 4.107737431791159e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5990112, |
| "step": 9595 |
| }, |
| { |
| "epoch": 19.27710843373494, |
| "grad_norm": 0.002810975071042776, |
| "learning_rate": 3.996419519667505e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5993472, |
| "step": 9600 |
| }, |
| { |
| "epoch": 19.28714859437751, |
| "grad_norm": 0.0020102846901863813, |
| "learning_rate": 3.8866246806821273e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5996320, |
| "step": 9605 |
| }, |
| { |
| "epoch": 19.29718875502008, |
| "grad_norm": 0.0016769138164818287, |
| "learning_rate": 3.7783532519808376e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 5999360, |
| "step": 9610 |
| }, |
| { |
| "epoch": 19.30722891566265, |
| "grad_norm": 0.0018848153995350003, |
| "learning_rate": 3.671605566031633e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6002016, |
| "step": 9615 |
| }, |
| { |
| "epoch": 19.31726907630522, |
| "grad_norm": 0.0016581976087763906, |
| "learning_rate": 3.566381950623588e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6004448, |
| "step": 9620 |
| }, |
| { |
| "epoch": 19.32730923694779, |
| "grad_norm": 0.0015075030969455838, |
| "learning_rate": 3.462682728865685e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6007392, |
| "step": 9625 |
| }, |
| { |
| "epoch": 19.337349397590362, |
| "grad_norm": 0.011512084864079952, |
| "learning_rate": 3.3605082191860985e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6010176, |
| "step": 9630 |
| }, |
| { |
| "epoch": 19.347389558232933, |
| "grad_norm": 0.0016901310300454497, |
| "learning_rate": 3.259858735331134e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6013120, |
| "step": 9635 |
| }, |
| { |
| "epoch": 19.3574297188755, |
| "grad_norm": 0.027195386588573456, |
| "learning_rate": 3.1607345863640114e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6015296, |
| "step": 9640 |
| }, |
| { |
| "epoch": 19.367469879518072, |
| "grad_norm": 0.02286182902753353, |
| "learning_rate": 3.063136076664364e-08, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 6018016, |
| "step": 9645 |
| }, |
| { |
| "epoch": 19.377510040160644, |
| "grad_norm": 17.473379135131836, |
| "learning_rate": 2.967063505926848e-08, |
| "loss": 0.0474, |
| "num_input_tokens_seen": 6021408, |
| "step": 9650 |
| }, |
| { |
| "epoch": 19.38755020080321, |
| "grad_norm": 0.001090511097572744, |
| "learning_rate": 2.8725171691605934e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6024544, |
| "step": 9655 |
| }, |
| { |
| "epoch": 19.397590361445783, |
| "grad_norm": 0.007710614707320929, |
| "learning_rate": 2.7794973566880323e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6027872, |
| "step": 9660 |
| }, |
| { |
| "epoch": 19.407630522088354, |
| "grad_norm": 0.012881418690085411, |
| "learning_rate": 2.6880043541441804e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6030784, |
| "step": 9665 |
| }, |
| { |
| "epoch": 19.417670682730925, |
| "grad_norm": 0.0014010410523042083, |
| "learning_rate": 2.5980384424756366e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6034208, |
| "step": 9670 |
| }, |
| { |
| "epoch": 19.427710843373493, |
| "grad_norm": 0.00728964526206255, |
| "learning_rate": 2.5095998979398628e-08, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 6037632, |
| "step": 9675 |
| }, |
| { |
| "epoch": 19.437751004016064, |
| "grad_norm": 0.0008074513752944767, |
| "learning_rate": 2.4226889921041273e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6040928, |
| "step": 9680 |
| }, |
| { |
| "epoch": 19.447791164658636, |
| "grad_norm": 0.0009832337964326143, |
| "learning_rate": 2.3373059918448958e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6044096, |
| "step": 9685 |
| }, |
| { |
| "epoch": 19.457831325301203, |
| "grad_norm": 0.03476724773645401, |
| "learning_rate": 2.2534511593468866e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6047456, |
| "step": 9690 |
| }, |
| { |
| "epoch": 19.467871485943775, |
| "grad_norm": 0.2634872794151306, |
| "learning_rate": 2.171124752102238e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6049856, |
| "step": 9695 |
| }, |
| { |
| "epoch": 19.477911646586346, |
| "grad_norm": 0.001191947259940207, |
| "learning_rate": 2.0903270229098992e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6052704, |
| "step": 9700 |
| }, |
| { |
| "epoch": 19.487951807228917, |
| "grad_norm": 0.01017684955149889, |
| "learning_rate": 2.0110582198745177e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6055456, |
| "step": 9705 |
| }, |
| { |
| "epoch": 19.497991967871485, |
| "grad_norm": 0.0016589167062193155, |
| "learning_rate": 1.9333185864061077e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6058304, |
| "step": 9710 |
| }, |
| { |
| "epoch": 19.508032128514056, |
| "grad_norm": 0.0013716747052967548, |
| "learning_rate": 1.8571083612188845e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6061440, |
| "step": 9715 |
| }, |
| { |
| "epoch": 19.518072289156628, |
| "grad_norm": 0.005404317285865545, |
| "learning_rate": 1.7824277783308197e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6065024, |
| "step": 9720 |
| }, |
| { |
| "epoch": 19.528112449799195, |
| "grad_norm": 0.02352655865252018, |
| "learning_rate": 1.7092770670628644e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6068256, |
| "step": 9725 |
| }, |
| { |
| "epoch": 19.538152610441767, |
| "grad_norm": 0.003068252932280302, |
| "learning_rate": 1.637656452038172e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6071200, |
| "step": 9730 |
| }, |
| { |
| "epoch": 19.548192771084338, |
| "grad_norm": 0.0032745555508881807, |
| "learning_rate": 1.5675661531813215e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6074656, |
| "step": 9735 |
| }, |
| { |
| "epoch": 19.55823293172691, |
| "grad_norm": 0.009923930279910564, |
| "learning_rate": 1.4990063857180383e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6077408, |
| "step": 9740 |
| }, |
| { |
| "epoch": 19.568273092369477, |
| "grad_norm": 0.00499066011980176, |
| "learning_rate": 1.431977360173975e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6080352, |
| "step": 9745 |
| }, |
| { |
| "epoch": 19.57831325301205, |
| "grad_norm": 0.002552691148594022, |
| "learning_rate": 1.3664792823745442e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6082848, |
| "step": 9750 |
| }, |
| { |
| "epoch": 19.58835341365462, |
| "grad_norm": 0.0025797574780881405, |
| "learning_rate": 1.3025123534440299e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6085664, |
| "step": 9755 |
| }, |
| { |
| "epoch": 19.598393574297187, |
| "grad_norm": 0.0038030825089663267, |
| "learning_rate": 1.240076769804921e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6088608, |
| "step": 9760 |
| }, |
| { |
| "epoch": 19.60843373493976, |
| "grad_norm": 0.022904515266418457, |
| "learning_rate": 1.1791727231776906e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6091296, |
| "step": 9765 |
| }, |
| { |
| "epoch": 19.61847389558233, |
| "grad_norm": 0.0012896016705781221, |
| "learning_rate": 1.1198004005796847e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6094880, |
| "step": 9770 |
| }, |
| { |
| "epoch": 19.6285140562249, |
| "grad_norm": 0.024325761944055557, |
| "learning_rate": 1.0619599843249006e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6098208, |
| "step": 9775 |
| }, |
| { |
| "epoch": 19.63855421686747, |
| "grad_norm": 0.0039957738481462, |
| "learning_rate": 1.0056516520232651e-08, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6101024, |
| "step": 9780 |
| }, |
| { |
| "epoch": 19.64859437751004, |
| "grad_norm": 0.012007399462163448, |
| "learning_rate": 9.508755765802457e-09, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6103904, |
| "step": 9785 |
| }, |
| { |
| "epoch": 19.65863453815261, |
| "grad_norm": 0.0020268342923372984, |
| "learning_rate": 8.976319261962407e-09, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6106816, |
| "step": 9790 |
| }, |
| { |
| "epoch": 19.66867469879518, |
| "grad_norm": 0.004168129526078701, |
| "learning_rate": 8.459208643659122e-09, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6110368, |
| "step": 9795 |
| }, |
| { |
| "epoch": 19.67871485943775, |
| "grad_norm": 0.003230429720133543, |
| "learning_rate": 7.957425498778537e-09, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6113856, |
| "step": 9800 |
| }, |
| { |
| "epoch": 19.688755020080322, |
| "grad_norm": 0.01263430342078209, |
| "learning_rate": 7.470971368142011e-09, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6116448, |
| "step": 9805 |
| }, |
| { |
| "epoch": 19.698795180722893, |
| "grad_norm": 0.0028617747593671083, |
| "learning_rate": 6.999847745498556e-09, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6120096, |
| "step": 9810 |
| }, |
| { |
| "epoch": 19.70883534136546, |
| "grad_norm": 0.0074871014803647995, |
| "learning_rate": 6.544056077523175e-09, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6123008, |
| "step": 9815 |
| }, |
| { |
| "epoch": 19.718875502008032, |
| "grad_norm": 0.004571064841002226, |
| "learning_rate": 6.1035977638101985e-09, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6126720, |
| "step": 9820 |
| }, |
| { |
| "epoch": 19.728915662650603, |
| "grad_norm": 0.0010783788748085499, |
| "learning_rate": 5.678474156871061e-09, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6129760, |
| "step": 9825 |
| }, |
| { |
| "epoch": 19.73895582329317, |
| "grad_norm": 0.0013977461494505405, |
| "learning_rate": 5.268686562127645e-09, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6133344, |
| "step": 9830 |
| }, |
| { |
| "epoch": 19.748995983935743, |
| "grad_norm": 0.00812604185193777, |
| "learning_rate": 4.874236237911723e-09, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6136576, |
| "step": 9835 |
| }, |
| { |
| "epoch": 19.759036144578314, |
| "grad_norm": 0.0008148096385411918, |
| "learning_rate": 4.495124395456629e-09, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6139136, |
| "step": 9840 |
| }, |
| { |
| "epoch": 19.769076305220885, |
| "grad_norm": 0.006598465144634247, |
| "learning_rate": 4.1313521988983754e-09, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6142240, |
| "step": 9845 |
| }, |
| { |
| "epoch": 19.779116465863453, |
| "grad_norm": 0.005599440075457096, |
| "learning_rate": 3.7829207652673175e-09, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6145088, |
| "step": 9850 |
| }, |
| { |
| "epoch": 19.789156626506024, |
| "grad_norm": 0.018151333555579185, |
| "learning_rate": 3.44983116448927e-09, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6147968, |
| "step": 9855 |
| }, |
| { |
| "epoch": 19.799196787148595, |
| "grad_norm": 0.015009772963821888, |
| "learning_rate": 3.1320844193788445e-09, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6151296, |
| "step": 9860 |
| }, |
| { |
| "epoch": 19.809236947791163, |
| "grad_norm": 0.0013042399659752846, |
| "learning_rate": 2.8296815056377824e-09, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6154880, |
| "step": 9865 |
| }, |
| { |
| "epoch": 19.819277108433734, |
| "grad_norm": 0.012515922077000141, |
| "learning_rate": 2.54262335185107e-09, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6158528, |
| "step": 9870 |
| }, |
| { |
| "epoch": 19.829317269076306, |
| "grad_norm": 0.0033991907257586718, |
| "learning_rate": 2.2709108394863845e-09, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6161600, |
| "step": 9875 |
| }, |
| { |
| "epoch": 19.839357429718877, |
| "grad_norm": 0.0012220785720273852, |
| "learning_rate": 2.0145448028874305e-09, |
| "loss": 0.0002, |
| "num_input_tokens_seen": 6164288, |
| "step": 9880 |
| }, |
| { |
| "epoch": 19.849397590361445, |
| "grad_norm": 0.0031056124716997147, |
| "learning_rate": 1.7735260292750522e-09, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6167904, |
| "step": 9885 |
| }, |
| { |
| "epoch": 19.859437751004016, |
| "grad_norm": 0.00221474701538682, |
| "learning_rate": 1.547855258743347e-09, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6171360, |
| "step": 9890 |
| }, |
| { |
| "epoch": 19.869477911646587, |
| "grad_norm": 0.006416819524019957, |
| "learning_rate": 1.3375331842574446e-09, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6174848, |
| "step": 9895 |
| }, |
| { |
| "epoch": 19.879518072289155, |
| "grad_norm": 0.011413169093430042, |
| "learning_rate": 1.1425604516512868e-09, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6177472, |
| "step": 9900 |
| }, |
| { |
| "epoch": 19.889558232931726, |
| "grad_norm": 0.04103207588195801, |
| "learning_rate": 9.629376596248518e-10, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6180576, |
| "step": 9905 |
| }, |
| { |
| "epoch": 19.899598393574298, |
| "grad_norm": 0.003425066592171788, |
| "learning_rate": 7.986653597447102e-10, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6183520, |
| "step": 9910 |
| }, |
| { |
| "epoch": 19.90963855421687, |
| "grad_norm": 0.00264586228877306, |
| "learning_rate": 6.497440564395829e-10, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6188000, |
| "step": 9915 |
| }, |
| { |
| "epoch": 19.919678714859437, |
| "grad_norm": 0.00533295376226306, |
| "learning_rate": 5.161742070014519e-10, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6190560, |
| "step": 9920 |
| }, |
| { |
| "epoch": 19.929718875502008, |
| "grad_norm": 0.0016277596587315202, |
| "learning_rate": 3.9795622158111945e-10, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6194080, |
| "step": 9925 |
| }, |
| { |
| "epoch": 19.93975903614458, |
| "grad_norm": 0.004762938711792231, |
| "learning_rate": 2.950904631893181e-10, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6197728, |
| "step": 9930 |
| }, |
| { |
| "epoch": 19.949799196787147, |
| "grad_norm": 0.0040444061160087585, |
| "learning_rate": 2.0757724769560062e-10, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6201088, |
| "step": 9935 |
| }, |
| { |
| "epoch": 19.95983935742972, |
| "grad_norm": 0.05396367609500885, |
| "learning_rate": 1.354168438255643e-10, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6204672, |
| "step": 9940 |
| }, |
| { |
| "epoch": 19.96987951807229, |
| "grad_norm": 0.0035722742322832346, |
| "learning_rate": 7.860947316140621e-11, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6207360, |
| "step": 9945 |
| }, |
| { |
| "epoch": 19.97991967871486, |
| "grad_norm": 0.0020782332867383957, |
| "learning_rate": 3.715531014025775e-11, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6210368, |
| "step": 9950 |
| }, |
| { |
| "epoch": 19.98995983935743, |
| "grad_norm": 0.02569785714149475, |
| "learning_rate": 1.1054482056405136e-11, |
| "loss": 0.0, |
| "num_input_tokens_seen": 6212800, |
| "step": 9955 |
| }, |
| { |
| "epoch": 20.0, |
| "grad_norm": 0.0010948892449960113, |
| "learning_rate": 3.0706905573829603e-13, |
| "loss": 0.0001, |
| "num_input_tokens_seen": 6215968, |
| "step": 9960 |
| }, |
| { |
| "epoch": 20.0, |
| "eval_loss": 1.102670431137085, |
| "eval_runtime": 8.0733, |
| "eval_samples_per_second": 61.685, |
| "eval_steps_per_second": 15.483, |
| "num_input_tokens_seen": 6215968, |
| "step": 9960 |
| }, |
| { |
| "epoch": 20.0, |
| "num_input_tokens_seen": 6215968, |
| "step": 9960, |
| "total_flos": 2.7990222962137498e+17, |
| "train_loss": 0.14292573261673278, |
| "train_runtime": 1559.9375, |
| "train_samples_per_second": 25.539, |
| "train_steps_per_second": 6.385 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 9960, |
| "num_input_tokens_seen": 6215968, |
| "num_train_epochs": 20, |
| "save_steps": 996, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.7990222962137498e+17, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|