| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.0005, | |
| "eval_steps": 500, | |
| "global_step": 1001, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0005, | |
| "grad_norm": 375.7668762207031, | |
| "learning_rate": 2.0000000000000002e-07, | |
| "loss": 3.2089, | |
| "num_input_tokens_seen": 8388608, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.001, | |
| "grad_norm": 365.5732421875, | |
| "learning_rate": 4.0000000000000003e-07, | |
| "loss": 3.2601, | |
| "num_input_tokens_seen": 16777216, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.0015, | |
| "grad_norm": 374.2887878417969, | |
| "learning_rate": 6.000000000000001e-07, | |
| "loss": 3.3201, | |
| "num_input_tokens_seen": 25165824, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.002, | |
| "grad_norm": 358.9713439941406, | |
| "learning_rate": 8.000000000000001e-07, | |
| "loss": 3.3766, | |
| "num_input_tokens_seen": 33554432, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.0025, | |
| "grad_norm": 330.81744384765625, | |
| "learning_rate": 1.0000000000000002e-06, | |
| "loss": 3.3024, | |
| "num_input_tokens_seen": 41943040, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.003, | |
| "grad_norm": 286.5671691894531, | |
| "learning_rate": 1.2000000000000002e-06, | |
| "loss": 3.1243, | |
| "num_input_tokens_seen": 50331648, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.0035, | |
| "grad_norm": 277.4550476074219, | |
| "learning_rate": 1.4000000000000001e-06, | |
| "loss": 3.0079, | |
| "num_input_tokens_seen": 58720256, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.004, | |
| "grad_norm": 132.09463500976562, | |
| "learning_rate": 1.6000000000000001e-06, | |
| "loss": 2.5348, | |
| "num_input_tokens_seen": 67108864, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.0045, | |
| "grad_norm": 113.34139251708984, | |
| "learning_rate": 1.8000000000000001e-06, | |
| "loss": 2.389, | |
| "num_input_tokens_seen": 75497472, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.005, | |
| "grad_norm": 81.76200866699219, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": 2.4319, | |
| "num_input_tokens_seen": 83886080, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0055, | |
| "grad_norm": 43.78662109375, | |
| "learning_rate": 2.2e-06, | |
| "loss": 2.0879, | |
| "num_input_tokens_seen": 92274688, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.006, | |
| "grad_norm": 33.5757942199707, | |
| "learning_rate": 2.4000000000000003e-06, | |
| "loss": 1.8163, | |
| "num_input_tokens_seen": 100663296, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.0065, | |
| "grad_norm": 35.40586853027344, | |
| "learning_rate": 2.6e-06, | |
| "loss": 2.1241, | |
| "num_input_tokens_seen": 109051904, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.007, | |
| "grad_norm": 24.623470306396484, | |
| "learning_rate": 2.8000000000000003e-06, | |
| "loss": 2.0751, | |
| "num_input_tokens_seen": 117440512, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.0075, | |
| "grad_norm": 21.34939193725586, | |
| "learning_rate": 3e-06, | |
| "loss": 1.9823, | |
| "num_input_tokens_seen": 125829120, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.008, | |
| "grad_norm": 17.1300106048584, | |
| "learning_rate": 3.2000000000000003e-06, | |
| "loss": 1.8128, | |
| "num_input_tokens_seen": 134217728, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.0085, | |
| "grad_norm": 15.088403701782227, | |
| "learning_rate": 3.4000000000000005e-06, | |
| "loss": 1.859, | |
| "num_input_tokens_seen": 142606336, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.009, | |
| "grad_norm": 14.965604782104492, | |
| "learning_rate": 3.6000000000000003e-06, | |
| "loss": 1.8818, | |
| "num_input_tokens_seen": 150994944, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.0095, | |
| "grad_norm": 11.057519912719727, | |
| "learning_rate": 3.8000000000000005e-06, | |
| "loss": 1.8023, | |
| "num_input_tokens_seen": 159383552, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 6.788076877593994, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": 1.7203, | |
| "num_input_tokens_seen": 167772160, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.0105, | |
| "grad_norm": 4.412460803985596, | |
| "learning_rate": 4.2000000000000004e-06, | |
| "loss": 1.6308, | |
| "num_input_tokens_seen": 176160768, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.011, | |
| "grad_norm": 5.173304557800293, | |
| "learning_rate": 4.4e-06, | |
| "loss": 1.7863, | |
| "num_input_tokens_seen": 184549376, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.0115, | |
| "grad_norm": 2.5053155422210693, | |
| "learning_rate": 4.600000000000001e-06, | |
| "loss": 1.7971, | |
| "num_input_tokens_seen": 192937984, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.012, | |
| "grad_norm": 2.0483975410461426, | |
| "learning_rate": 4.800000000000001e-06, | |
| "loss": 1.6683, | |
| "num_input_tokens_seen": 201326592, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.0125, | |
| "grad_norm": 2.09352970123291, | |
| "learning_rate": 5e-06, | |
| "loss": 1.6038, | |
| "num_input_tokens_seen": 209715200, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.013, | |
| "grad_norm": 4.401305675506592, | |
| "learning_rate": 5.2e-06, | |
| "loss": 1.6353, | |
| "num_input_tokens_seen": 218103808, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.0135, | |
| "grad_norm": 1.700209140777588, | |
| "learning_rate": 5.400000000000001e-06, | |
| "loss": 1.6772, | |
| "num_input_tokens_seen": 226492416, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.014, | |
| "grad_norm": 1.6402190923690796, | |
| "learning_rate": 5.600000000000001e-06, | |
| "loss": 1.5686, | |
| "num_input_tokens_seen": 234881024, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.0145, | |
| "grad_norm": 2.4193408489227295, | |
| "learning_rate": 5.8e-06, | |
| "loss": 1.6837, | |
| "num_input_tokens_seen": 243269632, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.015, | |
| "grad_norm": 1.9186208248138428, | |
| "learning_rate": 6e-06, | |
| "loss": 1.6922, | |
| "num_input_tokens_seen": 251658240, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.0155, | |
| "grad_norm": 2.1412179470062256, | |
| "learning_rate": 6.200000000000001e-06, | |
| "loss": 1.6934, | |
| "num_input_tokens_seen": 260046848, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.016, | |
| "grad_norm": 1.463192105293274, | |
| "learning_rate": 6.4000000000000006e-06, | |
| "loss": 1.6489, | |
| "num_input_tokens_seen": 268435456, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.0165, | |
| "grad_norm": 2.23980975151062, | |
| "learning_rate": 6.600000000000001e-06, | |
| "loss": 1.5714, | |
| "num_input_tokens_seen": 276824064, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.017, | |
| "grad_norm": 1.8727904558181763, | |
| "learning_rate": 6.800000000000001e-06, | |
| "loss": 1.7643, | |
| "num_input_tokens_seen": 285212672, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.0175, | |
| "grad_norm": 2.000786066055298, | |
| "learning_rate": 7e-06, | |
| "loss": 1.5766, | |
| "num_input_tokens_seen": 293601280, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.018, | |
| "grad_norm": 1.1636618375778198, | |
| "learning_rate": 7.2000000000000005e-06, | |
| "loss": 1.7253, | |
| "num_input_tokens_seen": 301989888, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.0185, | |
| "grad_norm": 1.5350532531738281, | |
| "learning_rate": 7.4e-06, | |
| "loss": 1.6706, | |
| "num_input_tokens_seen": 310378496, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.019, | |
| "grad_norm": 1.5187338590621948, | |
| "learning_rate": 7.600000000000001e-06, | |
| "loss": 1.6531, | |
| "num_input_tokens_seen": 318767104, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.0195, | |
| "grad_norm": 1.4783555269241333, | |
| "learning_rate": 7.800000000000002e-06, | |
| "loss": 1.6585, | |
| "num_input_tokens_seen": 327155712, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.3482269048690796, | |
| "learning_rate": 8.000000000000001e-06, | |
| "loss": 1.6662, | |
| "num_input_tokens_seen": 335544320, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.0205, | |
| "grad_norm": 1.1753276586532593, | |
| "learning_rate": 8.2e-06, | |
| "loss": 1.5193, | |
| "num_input_tokens_seen": 343932928, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.021, | |
| "grad_norm": 1.7950923442840576, | |
| "learning_rate": 8.400000000000001e-06, | |
| "loss": 1.6096, | |
| "num_input_tokens_seen": 352321536, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.0215, | |
| "grad_norm": 1.162549614906311, | |
| "learning_rate": 8.6e-06, | |
| "loss": 1.6544, | |
| "num_input_tokens_seen": 360710144, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.022, | |
| "grad_norm": 1.879497766494751, | |
| "learning_rate": 8.8e-06, | |
| "loss": 1.5784, | |
| "num_input_tokens_seen": 369098752, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.0225, | |
| "grad_norm": 1.1027729511260986, | |
| "learning_rate": 9e-06, | |
| "loss": 1.539, | |
| "num_input_tokens_seen": 377487360, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.023, | |
| "grad_norm": 1.475902795791626, | |
| "learning_rate": 9.200000000000002e-06, | |
| "loss": 1.6817, | |
| "num_input_tokens_seen": 385875968, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.0235, | |
| "grad_norm": 1.3338828086853027, | |
| "learning_rate": 9.4e-06, | |
| "loss": 1.5864, | |
| "num_input_tokens_seen": 394264576, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.024, | |
| "grad_norm": 1.277995228767395, | |
| "learning_rate": 9.600000000000001e-06, | |
| "loss": 1.5404, | |
| "num_input_tokens_seen": 402653184, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.0245, | |
| "grad_norm": 1.3274115324020386, | |
| "learning_rate": 9.800000000000001e-06, | |
| "loss": 1.6878, | |
| "num_input_tokens_seen": 411041792, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.025, | |
| "grad_norm": 1.0426124334335327, | |
| "learning_rate": 1e-05, | |
| "loss": 1.6855, | |
| "num_input_tokens_seen": 419430400, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.0255, | |
| "grad_norm": 1.1196398735046387, | |
| "learning_rate": 1.02e-05, | |
| "loss": 1.6636, | |
| "num_input_tokens_seen": 427819008, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.026, | |
| "grad_norm": 0.9055407047271729, | |
| "learning_rate": 1.04e-05, | |
| "loss": 1.6622, | |
| "num_input_tokens_seen": 436207616, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.0265, | |
| "grad_norm": 1.0774548053741455, | |
| "learning_rate": 1.0600000000000002e-05, | |
| "loss": 1.6751, | |
| "num_input_tokens_seen": 444596224, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.027, | |
| "grad_norm": 1.4852126836776733, | |
| "learning_rate": 1.0800000000000002e-05, | |
| "loss": 1.7416, | |
| "num_input_tokens_seen": 452984832, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.0275, | |
| "grad_norm": 1.1024013757705688, | |
| "learning_rate": 1.1000000000000001e-05, | |
| "loss": 1.6691, | |
| "num_input_tokens_seen": 461373440, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.028, | |
| "grad_norm": 0.7177935838699341, | |
| "learning_rate": 1.1200000000000001e-05, | |
| "loss": 1.8125, | |
| "num_input_tokens_seen": 469762048, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.0285, | |
| "grad_norm": 0.8956972360610962, | |
| "learning_rate": 1.14e-05, | |
| "loss": 1.6641, | |
| "num_input_tokens_seen": 478150656, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.029, | |
| "grad_norm": 1.6255810260772705, | |
| "learning_rate": 1.16e-05, | |
| "loss": 1.5453, | |
| "num_input_tokens_seen": 486539264, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.0295, | |
| "grad_norm": 1.249619483947754, | |
| "learning_rate": 1.18e-05, | |
| "loss": 1.6133, | |
| "num_input_tokens_seen": 494927872, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 1.2520281076431274, | |
| "learning_rate": 1.2e-05, | |
| "loss": 1.6106, | |
| "num_input_tokens_seen": 503316480, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.0305, | |
| "grad_norm": 0.9336643218994141, | |
| "learning_rate": 1.22e-05, | |
| "loss": 1.5936, | |
| "num_input_tokens_seen": 511705088, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.031, | |
| "grad_norm": 0.760066568851471, | |
| "learning_rate": 1.2400000000000002e-05, | |
| "loss": 1.6764, | |
| "num_input_tokens_seen": 520093696, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.0315, | |
| "grad_norm": 1.1788405179977417, | |
| "learning_rate": 1.2600000000000001e-05, | |
| "loss": 1.4128, | |
| "num_input_tokens_seen": 528482304, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "grad_norm": 1.311288833618164, | |
| "learning_rate": 1.2800000000000001e-05, | |
| "loss": 1.7495, | |
| "num_input_tokens_seen": 536870912, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.0325, | |
| "grad_norm": 0.9069179892539978, | |
| "learning_rate": 1.3000000000000001e-05, | |
| "loss": 1.7312, | |
| "num_input_tokens_seen": 545259520, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.033, | |
| "grad_norm": 1.8227250576019287, | |
| "learning_rate": 1.3200000000000002e-05, | |
| "loss": 1.7095, | |
| "num_input_tokens_seen": 553648128, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.0335, | |
| "grad_norm": 1.4830560684204102, | |
| "learning_rate": 1.3400000000000002e-05, | |
| "loss": 1.4721, | |
| "num_input_tokens_seen": 562036736, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.034, | |
| "grad_norm": 1.8095840215682983, | |
| "learning_rate": 1.3600000000000002e-05, | |
| "loss": 1.5717, | |
| "num_input_tokens_seen": 570425344, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.0345, | |
| "grad_norm": 1.7319716215133667, | |
| "learning_rate": 1.38e-05, | |
| "loss": 1.5344, | |
| "num_input_tokens_seen": 578813952, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.035, | |
| "grad_norm": 1.4535399675369263, | |
| "learning_rate": 1.4e-05, | |
| "loss": 1.5126, | |
| "num_input_tokens_seen": 587202560, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.0355, | |
| "grad_norm": 1.2049615383148193, | |
| "learning_rate": 1.4200000000000001e-05, | |
| "loss": 1.6775, | |
| "num_input_tokens_seen": 595591168, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.036, | |
| "grad_norm": 0.9136773943901062, | |
| "learning_rate": 1.4400000000000001e-05, | |
| "loss": 1.7429, | |
| "num_input_tokens_seen": 603979776, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.0365, | |
| "grad_norm": 1.14919114112854, | |
| "learning_rate": 1.46e-05, | |
| "loss": 1.5888, | |
| "num_input_tokens_seen": 612368384, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.037, | |
| "grad_norm": 1.284545660018921, | |
| "learning_rate": 1.48e-05, | |
| "loss": 1.5506, | |
| "num_input_tokens_seen": 620756992, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.0375, | |
| "grad_norm": 1.3817670345306396, | |
| "learning_rate": 1.5000000000000002e-05, | |
| "loss": 1.5731, | |
| "num_input_tokens_seen": 629145600, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.038, | |
| "grad_norm": 1.4588181972503662, | |
| "learning_rate": 1.5200000000000002e-05, | |
| "loss": 1.6044, | |
| "num_input_tokens_seen": 637534208, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.0385, | |
| "grad_norm": 0.8650463223457336, | |
| "learning_rate": 1.54e-05, | |
| "loss": 1.6842, | |
| "num_input_tokens_seen": 645922816, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.039, | |
| "grad_norm": 1.5716619491577148, | |
| "learning_rate": 1.5600000000000003e-05, | |
| "loss": 1.5799, | |
| "num_input_tokens_seen": 654311424, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.0395, | |
| "grad_norm": 1.0133064985275269, | |
| "learning_rate": 1.58e-05, | |
| "loss": 1.7551, | |
| "num_input_tokens_seen": 662700032, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 1.4739152193069458, | |
| "learning_rate": 1.6000000000000003e-05, | |
| "loss": 1.5831, | |
| "num_input_tokens_seen": 671088640, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.0405, | |
| "grad_norm": 1.1332342624664307, | |
| "learning_rate": 1.62e-05, | |
| "loss": 1.5975, | |
| "num_input_tokens_seen": 679477248, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.041, | |
| "grad_norm": 1.2728761434555054, | |
| "learning_rate": 1.64e-05, | |
| "loss": 1.5585, | |
| "num_input_tokens_seen": 687865856, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.0415, | |
| "grad_norm": 1.1755008697509766, | |
| "learning_rate": 1.66e-05, | |
| "loss": 1.6756, | |
| "num_input_tokens_seen": 696254464, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.042, | |
| "grad_norm": 2.5836265087127686, | |
| "learning_rate": 1.6800000000000002e-05, | |
| "loss": 1.451, | |
| "num_input_tokens_seen": 704643072, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.0425, | |
| "grad_norm": 1.4453366994857788, | |
| "learning_rate": 1.7e-05, | |
| "loss": 1.5373, | |
| "num_input_tokens_seen": 713031680, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.043, | |
| "grad_norm": 2.125744104385376, | |
| "learning_rate": 1.72e-05, | |
| "loss": 1.5472, | |
| "num_input_tokens_seen": 721420288, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.0435, | |
| "grad_norm": 1.2962861061096191, | |
| "learning_rate": 1.7400000000000003e-05, | |
| "loss": 1.5372, | |
| "num_input_tokens_seen": 729808896, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.044, | |
| "grad_norm": 1.7616335153579712, | |
| "learning_rate": 1.76e-05, | |
| "loss": 1.6122, | |
| "num_input_tokens_seen": 738197504, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.0445, | |
| "grad_norm": 1.2126449346542358, | |
| "learning_rate": 1.7800000000000002e-05, | |
| "loss": 1.7006, | |
| "num_input_tokens_seen": 746586112, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.045, | |
| "grad_norm": 1.1429980993270874, | |
| "learning_rate": 1.8e-05, | |
| "loss": 1.5913, | |
| "num_input_tokens_seen": 754974720, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.0455, | |
| "grad_norm": 1.083024501800537, | |
| "learning_rate": 1.8200000000000002e-05, | |
| "loss": 1.7909, | |
| "num_input_tokens_seen": 763363328, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.046, | |
| "grad_norm": 1.1309971809387207, | |
| "learning_rate": 1.8400000000000003e-05, | |
| "loss": 1.6412, | |
| "num_input_tokens_seen": 771751936, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.0465, | |
| "grad_norm": 1.1906055212020874, | |
| "learning_rate": 1.86e-05, | |
| "loss": 1.6325, | |
| "num_input_tokens_seen": 780140544, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.047, | |
| "grad_norm": 1.0961945056915283, | |
| "learning_rate": 1.88e-05, | |
| "loss": 1.5412, | |
| "num_input_tokens_seen": 788529152, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.0475, | |
| "grad_norm": 1.5861741304397583, | |
| "learning_rate": 1.9e-05, | |
| "loss": 1.5966, | |
| "num_input_tokens_seen": 796917760, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.048, | |
| "grad_norm": 2.4101991653442383, | |
| "learning_rate": 1.9200000000000003e-05, | |
| "loss": 1.6316, | |
| "num_input_tokens_seen": 805306368, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.0485, | |
| "grad_norm": 1.6004070043563843, | |
| "learning_rate": 1.94e-05, | |
| "loss": 1.6441, | |
| "num_input_tokens_seen": 813694976, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.049, | |
| "grad_norm": 3.2058792114257812, | |
| "learning_rate": 1.9600000000000002e-05, | |
| "loss": 1.7082, | |
| "num_input_tokens_seen": 822083584, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.0495, | |
| "grad_norm": 1.95637845993042, | |
| "learning_rate": 1.98e-05, | |
| "loss": 1.616, | |
| "num_input_tokens_seen": 830472192, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 2.3890223503112793, | |
| "learning_rate": 2e-05, | |
| "loss": 1.4997, | |
| "num_input_tokens_seen": 838860800, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.0505, | |
| "grad_norm": 1.4467096328735352, | |
| "learning_rate": 2.0200000000000003e-05, | |
| "loss": 1.6202, | |
| "num_input_tokens_seen": 847249408, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.051, | |
| "grad_norm": 2.6852478981018066, | |
| "learning_rate": 2.04e-05, | |
| "loss": 1.5475, | |
| "num_input_tokens_seen": 855638016, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.0515, | |
| "grad_norm": 2.040956497192383, | |
| "learning_rate": 2.0600000000000003e-05, | |
| "loss": 1.5854, | |
| "num_input_tokens_seen": 864026624, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.052, | |
| "grad_norm": 1.3061244487762451, | |
| "learning_rate": 2.08e-05, | |
| "loss": 1.5338, | |
| "num_input_tokens_seen": 872415232, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.0525, | |
| "grad_norm": 1.4860060214996338, | |
| "learning_rate": 2.1000000000000002e-05, | |
| "loss": 1.5763, | |
| "num_input_tokens_seen": 880803840, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.053, | |
| "grad_norm": 1.0595941543579102, | |
| "learning_rate": 2.1200000000000004e-05, | |
| "loss": 1.7691, | |
| "num_input_tokens_seen": 889192448, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.0535, | |
| "grad_norm": 0.8778529763221741, | |
| "learning_rate": 2.1400000000000002e-05, | |
| "loss": 1.5958, | |
| "num_input_tokens_seen": 897581056, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.054, | |
| "grad_norm": 1.0940550565719604, | |
| "learning_rate": 2.1600000000000003e-05, | |
| "loss": 1.5972, | |
| "num_input_tokens_seen": 905969664, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.0545, | |
| "grad_norm": 1.440792202949524, | |
| "learning_rate": 2.1800000000000005e-05, | |
| "loss": 1.5493, | |
| "num_input_tokens_seen": 914358272, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.055, | |
| "grad_norm": 0.9604665040969849, | |
| "learning_rate": 2.2000000000000003e-05, | |
| "loss": 1.6586, | |
| "num_input_tokens_seen": 922746880, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.0555, | |
| "grad_norm": 1.0158045291900635, | |
| "learning_rate": 2.2200000000000004e-05, | |
| "loss": 1.7715, | |
| "num_input_tokens_seen": 931135488, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.056, | |
| "grad_norm": 1.5087007284164429, | |
| "learning_rate": 2.2400000000000002e-05, | |
| "loss": 1.4964, | |
| "num_input_tokens_seen": 939524096, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.0565, | |
| "grad_norm": 1.9200010299682617, | |
| "learning_rate": 2.26e-05, | |
| "loss": 1.4965, | |
| "num_input_tokens_seen": 947912704, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.057, | |
| "grad_norm": 1.462496280670166, | |
| "learning_rate": 2.28e-05, | |
| "loss": 1.6141, | |
| "num_input_tokens_seen": 956301312, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.0575, | |
| "grad_norm": 1.0114740133285522, | |
| "learning_rate": 2.3e-05, | |
| "loss": 1.5548, | |
| "num_input_tokens_seen": 964689920, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.058, | |
| "grad_norm": 1.520970106124878, | |
| "learning_rate": 2.32e-05, | |
| "loss": 1.5195, | |
| "num_input_tokens_seen": 973078528, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.0585, | |
| "grad_norm": 1.1274619102478027, | |
| "learning_rate": 2.34e-05, | |
| "loss": 1.6381, | |
| "num_input_tokens_seen": 981467136, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.059, | |
| "grad_norm": 1.0581841468811035, | |
| "learning_rate": 2.36e-05, | |
| "loss": 1.6444, | |
| "num_input_tokens_seen": 989855744, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.0595, | |
| "grad_norm": 1.247220516204834, | |
| "learning_rate": 2.3800000000000003e-05, | |
| "loss": 1.5465, | |
| "num_input_tokens_seen": 998244352, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 0.9493988156318665, | |
| "learning_rate": 2.4e-05, | |
| "loss": 1.6384, | |
| "num_input_tokens_seen": 1006632960, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.0605, | |
| "grad_norm": 1.418355107307434, | |
| "learning_rate": 2.4200000000000002e-05, | |
| "loss": 1.5904, | |
| "num_input_tokens_seen": 1015021568, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.061, | |
| "grad_norm": 1.3078217506408691, | |
| "learning_rate": 2.44e-05, | |
| "loss": 1.6312, | |
| "num_input_tokens_seen": 1023410176, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.0615, | |
| "grad_norm": 0.9902186989784241, | |
| "learning_rate": 2.46e-05, | |
| "loss": 1.5939, | |
| "num_input_tokens_seen": 1031798784, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.062, | |
| "grad_norm": 1.1039105653762817, | |
| "learning_rate": 2.4800000000000003e-05, | |
| "loss": 1.6746, | |
| "num_input_tokens_seen": 1040187392, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.0625, | |
| "grad_norm": 1.724129557609558, | |
| "learning_rate": 2.5e-05, | |
| "loss": 1.6997, | |
| "num_input_tokens_seen": 1048576000, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.063, | |
| "grad_norm": 1.0156750679016113, | |
| "learning_rate": 2.5200000000000003e-05, | |
| "loss": 1.6944, | |
| "num_input_tokens_seen": 1056964608, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.0635, | |
| "grad_norm": 2.0937423706054688, | |
| "learning_rate": 2.54e-05, | |
| "loss": 1.6348, | |
| "num_input_tokens_seen": 1065353216, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.064, | |
| "grad_norm": 1.2552897930145264, | |
| "learning_rate": 2.5600000000000002e-05, | |
| "loss": 1.5971, | |
| "num_input_tokens_seen": 1073741824, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.0645, | |
| "grad_norm": 1.9421988725662231, | |
| "learning_rate": 2.5800000000000004e-05, | |
| "loss": 1.6288, | |
| "num_input_tokens_seen": 1082130432, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.065, | |
| "grad_norm": 2.0198121070861816, | |
| "learning_rate": 2.6000000000000002e-05, | |
| "loss": 1.6626, | |
| "num_input_tokens_seen": 1090519040, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.0655, | |
| "grad_norm": 1.5626660585403442, | |
| "learning_rate": 2.6200000000000003e-05, | |
| "loss": 1.5689, | |
| "num_input_tokens_seen": 1098907648, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.066, | |
| "grad_norm": 1.5933582782745361, | |
| "learning_rate": 2.6400000000000005e-05, | |
| "loss": 1.7013, | |
| "num_input_tokens_seen": 1107296256, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.0665, | |
| "grad_norm": 1.3158071041107178, | |
| "learning_rate": 2.6600000000000003e-05, | |
| "loss": 1.5135, | |
| "num_input_tokens_seen": 1115684864, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.067, | |
| "grad_norm": 2.2418034076690674, | |
| "learning_rate": 2.6800000000000004e-05, | |
| "loss": 1.5196, | |
| "num_input_tokens_seen": 1124073472, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.0675, | |
| "grad_norm": 1.055151343345642, | |
| "learning_rate": 2.7000000000000002e-05, | |
| "loss": 1.611, | |
| "num_input_tokens_seen": 1132462080, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.068, | |
| "grad_norm": 2.362305164337158, | |
| "learning_rate": 2.7200000000000004e-05, | |
| "loss": 1.4535, | |
| "num_input_tokens_seen": 1140850688, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.0685, | |
| "grad_norm": 1.489244818687439, | |
| "learning_rate": 2.7400000000000005e-05, | |
| "loss": 1.6779, | |
| "num_input_tokens_seen": 1149239296, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.069, | |
| "grad_norm": 1.487452745437622, | |
| "learning_rate": 2.76e-05, | |
| "loss": 1.5136, | |
| "num_input_tokens_seen": 1157627904, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.0695, | |
| "grad_norm": 1.4844437837600708, | |
| "learning_rate": 2.78e-05, | |
| "loss": 1.5241, | |
| "num_input_tokens_seen": 1166016512, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 1.2633287906646729, | |
| "learning_rate": 2.8e-05, | |
| "loss": 1.4707, | |
| "num_input_tokens_seen": 1174405120, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.0705, | |
| "grad_norm": 1.3002358675003052, | |
| "learning_rate": 2.82e-05, | |
| "loss": 1.4836, | |
| "num_input_tokens_seen": 1182793728, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.071, | |
| "grad_norm": 1.2630964517593384, | |
| "learning_rate": 2.8400000000000003e-05, | |
| "loss": 1.5179, | |
| "num_input_tokens_seen": 1191182336, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.0715, | |
| "grad_norm": 1.4223785400390625, | |
| "learning_rate": 2.86e-05, | |
| "loss": 1.6415, | |
| "num_input_tokens_seen": 1199570944, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.072, | |
| "grad_norm": 1.4245790243148804, | |
| "learning_rate": 2.8800000000000002e-05, | |
| "loss": 1.5909, | |
| "num_input_tokens_seen": 1207959552, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.0725, | |
| "grad_norm": 0.8491201400756836, | |
| "learning_rate": 2.9e-05, | |
| "loss": 1.6497, | |
| "num_input_tokens_seen": 1216348160, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.073, | |
| "grad_norm": 1.3185845613479614, | |
| "learning_rate": 2.92e-05, | |
| "loss": 1.6994, | |
| "num_input_tokens_seen": 1224736768, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.0735, | |
| "grad_norm": 1.564438819885254, | |
| "learning_rate": 2.9400000000000003e-05, | |
| "loss": 1.6212, | |
| "num_input_tokens_seen": 1233125376, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.074, | |
| "grad_norm": 0.9664462804794312, | |
| "learning_rate": 2.96e-05, | |
| "loss": 1.6184, | |
| "num_input_tokens_seen": 1241513984, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.0745, | |
| "grad_norm": 1.6645387411117554, | |
| "learning_rate": 2.9800000000000003e-05, | |
| "loss": 1.4731, | |
| "num_input_tokens_seen": 1249902592, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.075, | |
| "grad_norm": 1.7425169944763184, | |
| "learning_rate": 3.0000000000000004e-05, | |
| "loss": 1.5503, | |
| "num_input_tokens_seen": 1258291200, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.0755, | |
| "grad_norm": 1.3575599193572998, | |
| "learning_rate": 3.0200000000000002e-05, | |
| "loss": 1.5707, | |
| "num_input_tokens_seen": 1266679808, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.076, | |
| "grad_norm": 1.5275225639343262, | |
| "learning_rate": 3.0400000000000004e-05, | |
| "loss": 1.5864, | |
| "num_input_tokens_seen": 1275068416, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.0765, | |
| "grad_norm": 1.1285806894302368, | |
| "learning_rate": 3.0600000000000005e-05, | |
| "loss": 1.6406, | |
| "num_input_tokens_seen": 1283457024, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.077, | |
| "grad_norm": 1.988125205039978, | |
| "learning_rate": 3.08e-05, | |
| "loss": 1.5999, | |
| "num_input_tokens_seen": 1291845632, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.0775, | |
| "grad_norm": 1.1705149412155151, | |
| "learning_rate": 3.1e-05, | |
| "loss": 1.7029, | |
| "num_input_tokens_seen": 1300234240, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.078, | |
| "grad_norm": 1.810225486755371, | |
| "learning_rate": 3.1200000000000006e-05, | |
| "loss": 1.5762, | |
| "num_input_tokens_seen": 1308622848, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.0785, | |
| "grad_norm": 1.5474299192428589, | |
| "learning_rate": 3.1400000000000004e-05, | |
| "loss": 1.629, | |
| "num_input_tokens_seen": 1317011456, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.079, | |
| "grad_norm": 1.2300294637680054, | |
| "learning_rate": 3.16e-05, | |
| "loss": 1.5959, | |
| "num_input_tokens_seen": 1325400064, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.0795, | |
| "grad_norm": 1.2600280046463013, | |
| "learning_rate": 3.180000000000001e-05, | |
| "loss": 1.5476, | |
| "num_input_tokens_seen": 1333788672, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 1.2196714878082275, | |
| "learning_rate": 3.2000000000000005e-05, | |
| "loss": 1.5161, | |
| "num_input_tokens_seen": 1342177280, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.0805, | |
| "grad_norm": 1.740477442741394, | |
| "learning_rate": 3.2200000000000003e-05, | |
| "loss": 1.6133, | |
| "num_input_tokens_seen": 1350565888, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.081, | |
| "grad_norm": 1.0034888982772827, | |
| "learning_rate": 3.24e-05, | |
| "loss": 1.6335, | |
| "num_input_tokens_seen": 1358954496, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.0815, | |
| "grad_norm": 1.6216298341751099, | |
| "learning_rate": 3.26e-05, | |
| "loss": 1.5654, | |
| "num_input_tokens_seen": 1367343104, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.082, | |
| "grad_norm": 1.3670564889907837, | |
| "learning_rate": 3.28e-05, | |
| "loss": 1.5737, | |
| "num_input_tokens_seen": 1375731712, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.0825, | |
| "grad_norm": 1.2599210739135742, | |
| "learning_rate": 3.3e-05, | |
| "loss": 1.5826, | |
| "num_input_tokens_seen": 1384120320, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.083, | |
| "grad_norm": 1.2521430253982544, | |
| "learning_rate": 3.32e-05, | |
| "loss": 1.4784, | |
| "num_input_tokens_seen": 1392508928, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.0835, | |
| "grad_norm": 1.4046518802642822, | |
| "learning_rate": 3.34e-05, | |
| "loss": 1.5848, | |
| "num_input_tokens_seen": 1400897536, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.084, | |
| "grad_norm": 1.317063808441162, | |
| "learning_rate": 3.3600000000000004e-05, | |
| "loss": 1.6185, | |
| "num_input_tokens_seen": 1409286144, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.0845, | |
| "grad_norm": 1.4442344903945923, | |
| "learning_rate": 3.38e-05, | |
| "loss": 1.4859, | |
| "num_input_tokens_seen": 1417674752, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.085, | |
| "grad_norm": 1.178648591041565, | |
| "learning_rate": 3.4e-05, | |
| "loss": 1.507, | |
| "num_input_tokens_seen": 1426063360, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.0855, | |
| "grad_norm": 1.582335352897644, | |
| "learning_rate": 3.4200000000000005e-05, | |
| "loss": 1.507, | |
| "num_input_tokens_seen": 1434451968, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.086, | |
| "grad_norm": 1.521285057067871, | |
| "learning_rate": 3.44e-05, | |
| "loss": 1.6079, | |
| "num_input_tokens_seen": 1442840576, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.0865, | |
| "grad_norm": 0.9032558798789978, | |
| "learning_rate": 3.46e-05, | |
| "loss": 1.7599, | |
| "num_input_tokens_seen": 1451229184, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.087, | |
| "grad_norm": 1.0596917867660522, | |
| "learning_rate": 3.4800000000000006e-05, | |
| "loss": 1.5575, | |
| "num_input_tokens_seen": 1459617792, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.0875, | |
| "grad_norm": 1.5911486148834229, | |
| "learning_rate": 3.5000000000000004e-05, | |
| "loss": 1.5351, | |
| "num_input_tokens_seen": 1468006400, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.088, | |
| "grad_norm": 1.4693293571472168, | |
| "learning_rate": 3.52e-05, | |
| "loss": 1.5911, | |
| "num_input_tokens_seen": 1476395008, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.0885, | |
| "grad_norm": 1.173514485359192, | |
| "learning_rate": 3.54e-05, | |
| "loss": 1.7058, | |
| "num_input_tokens_seen": 1484783616, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.089, | |
| "grad_norm": 1.9427965879440308, | |
| "learning_rate": 3.5600000000000005e-05, | |
| "loss": 1.5192, | |
| "num_input_tokens_seen": 1493172224, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.0895, | |
| "grad_norm": 1.2264207601547241, | |
| "learning_rate": 3.58e-05, | |
| "loss": 1.5401, | |
| "num_input_tokens_seen": 1501560832, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 1.6922028064727783, | |
| "learning_rate": 3.6e-05, | |
| "loss": 1.7325, | |
| "num_input_tokens_seen": 1509949440, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.0905, | |
| "grad_norm": 0.9629112482070923, | |
| "learning_rate": 3.6200000000000006e-05, | |
| "loss": 1.6422, | |
| "num_input_tokens_seen": 1518338048, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.091, | |
| "grad_norm": 2.7276177406311035, | |
| "learning_rate": 3.6400000000000004e-05, | |
| "loss": 1.6333, | |
| "num_input_tokens_seen": 1526726656, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.0915, | |
| "grad_norm": 2.0043885707855225, | |
| "learning_rate": 3.66e-05, | |
| "loss": 1.559, | |
| "num_input_tokens_seen": 1535115264, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.092, | |
| "grad_norm": 2.4818575382232666, | |
| "learning_rate": 3.680000000000001e-05, | |
| "loss": 1.642, | |
| "num_input_tokens_seen": 1543503872, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.0925, | |
| "grad_norm": 1.5776571035385132, | |
| "learning_rate": 3.7000000000000005e-05, | |
| "loss": 1.5202, | |
| "num_input_tokens_seen": 1551892480, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.093, | |
| "grad_norm": 1.2193448543548584, | |
| "learning_rate": 3.72e-05, | |
| "loss": 1.6203, | |
| "num_input_tokens_seen": 1560281088, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.0935, | |
| "grad_norm": 1.853638768196106, | |
| "learning_rate": 3.740000000000001e-05, | |
| "loss": 1.5167, | |
| "num_input_tokens_seen": 1568669696, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.094, | |
| "grad_norm": 1.3324044942855835, | |
| "learning_rate": 3.76e-05, | |
| "loss": 1.8053, | |
| "num_input_tokens_seen": 1577058304, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.0945, | |
| "grad_norm": 1.5402125120162964, | |
| "learning_rate": 3.7800000000000004e-05, | |
| "loss": 1.5468, | |
| "num_input_tokens_seen": 1585446912, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.095, | |
| "grad_norm": 2.0242340564727783, | |
| "learning_rate": 3.8e-05, | |
| "loss": 1.6636, | |
| "num_input_tokens_seen": 1593835520, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.0955, | |
| "grad_norm": 1.3856596946716309, | |
| "learning_rate": 3.82e-05, | |
| "loss": 1.5563, | |
| "num_input_tokens_seen": 1602224128, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.096, | |
| "grad_norm": 1.8583145141601562, | |
| "learning_rate": 3.8400000000000005e-05, | |
| "loss": 1.5695, | |
| "num_input_tokens_seen": 1610612736, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.0965, | |
| "grad_norm": 2.943809747695923, | |
| "learning_rate": 3.86e-05, | |
| "loss": 1.609, | |
| "num_input_tokens_seen": 1619001344, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.097, | |
| "grad_norm": 1.8209037780761719, | |
| "learning_rate": 3.88e-05, | |
| "loss": 1.6175, | |
| "num_input_tokens_seen": 1627389952, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.0975, | |
| "grad_norm": 2.229835033416748, | |
| "learning_rate": 3.9e-05, | |
| "loss": 1.6544, | |
| "num_input_tokens_seen": 1635778560, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.098, | |
| "grad_norm": 1.3505150079727173, | |
| "learning_rate": 3.9200000000000004e-05, | |
| "loss": 1.607, | |
| "num_input_tokens_seen": 1644167168, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.0985, | |
| "grad_norm": 2.747910737991333, | |
| "learning_rate": 3.94e-05, | |
| "loss": 1.6337, | |
| "num_input_tokens_seen": 1652555776, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.099, | |
| "grad_norm": 1.9009253978729248, | |
| "learning_rate": 3.96e-05, | |
| "loss": 1.7661, | |
| "num_input_tokens_seen": 1660944384, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.0995, | |
| "grad_norm": 2.3057968616485596, | |
| "learning_rate": 3.9800000000000005e-05, | |
| "loss": 1.68, | |
| "num_input_tokens_seen": 1669332992, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 1.8019423484802246, | |
| "learning_rate": 4e-05, | |
| "loss": 1.6364, | |
| "num_input_tokens_seen": 1677721600, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.1005, | |
| "grad_norm": 2.0466740131378174, | |
| "learning_rate": 3.9999845787629415e-05, | |
| "loss": 1.5807, | |
| "num_input_tokens_seen": 1686110208, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.101, | |
| "grad_norm": 1.6153452396392822, | |
| "learning_rate": 3.99993831528958e-05, | |
| "loss": 1.7046, | |
| "num_input_tokens_seen": 1694498816, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.1015, | |
| "grad_norm": 1.7713414430618286, | |
| "learning_rate": 3.9998612102933544e-05, | |
| "loss": 1.5998, | |
| "num_input_tokens_seen": 1702887424, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.102, | |
| "grad_norm": 1.7932119369506836, | |
| "learning_rate": 3.999753264963321e-05, | |
| "loss": 1.653, | |
| "num_input_tokens_seen": 1711276032, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.1025, | |
| "grad_norm": 1.5555003881454468, | |
| "learning_rate": 3.9996144809641296e-05, | |
| "loss": 1.5376, | |
| "num_input_tokens_seen": 1719664640, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.103, | |
| "grad_norm": 1.2789753675460815, | |
| "learning_rate": 3.9994448604360016e-05, | |
| "loss": 1.6359, | |
| "num_input_tokens_seen": 1728053248, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.1035, | |
| "grad_norm": 1.5414743423461914, | |
| "learning_rate": 3.999244405994694e-05, | |
| "loss": 1.4903, | |
| "num_input_tokens_seen": 1736441856, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.104, | |
| "grad_norm": 1.3504588603973389, | |
| "learning_rate": 3.9990131207314634e-05, | |
| "loss": 1.6295, | |
| "num_input_tokens_seen": 1744830464, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.1045, | |
| "grad_norm": 1.046614170074463, | |
| "learning_rate": 3.998751008213014e-05, | |
| "loss": 1.6115, | |
| "num_input_tokens_seen": 1753219072, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.105, | |
| "grad_norm": 1.5764557123184204, | |
| "learning_rate": 3.9984580724814464e-05, | |
| "loss": 1.5491, | |
| "num_input_tokens_seen": 1761607680, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.1055, | |
| "grad_norm": 1.3029578924179077, | |
| "learning_rate": 3.99813431805419e-05, | |
| "loss": 1.6617, | |
| "num_input_tokens_seen": 1769996288, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.106, | |
| "grad_norm": 1.4238810539245605, | |
| "learning_rate": 3.9977797499239404e-05, | |
| "loss": 1.4616, | |
| "num_input_tokens_seen": 1778384896, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.1065, | |
| "grad_norm": 1.2134512662887573, | |
| "learning_rate": 3.997394373558576e-05, | |
| "loss": 1.6862, | |
| "num_input_tokens_seen": 1786773504, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.107, | |
| "grad_norm": 1.372286319732666, | |
| "learning_rate": 3.996978194901077e-05, | |
| "loss": 1.562, | |
| "num_input_tokens_seen": 1795162112, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.1075, | |
| "grad_norm": 1.2671712636947632, | |
| "learning_rate": 3.996531220369432e-05, | |
| "loss": 1.7226, | |
| "num_input_tokens_seen": 1803550720, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.108, | |
| "grad_norm": 1.2417782545089722, | |
| "learning_rate": 3.9960534568565436e-05, | |
| "loss": 1.627, | |
| "num_input_tokens_seen": 1811939328, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.1085, | |
| "grad_norm": 1.6749941110610962, | |
| "learning_rate": 3.995544911730115e-05, | |
| "loss": 1.5675, | |
| "num_input_tokens_seen": 1820327936, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.109, | |
| "grad_norm": 1.2601590156555176, | |
| "learning_rate": 3.995005592832541e-05, | |
| "loss": 1.6097, | |
| "num_input_tokens_seen": 1828716544, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.1095, | |
| "grad_norm": 1.2672165632247925, | |
| "learning_rate": 3.994435508480786e-05, | |
| "loss": 1.5779, | |
| "num_input_tokens_seen": 1837105152, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 4.7115397453308105, | |
| "learning_rate": 3.9938346674662565e-05, | |
| "loss": 1.628, | |
| "num_input_tokens_seen": 1845493760, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.1105, | |
| "grad_norm": 1.9876790046691895, | |
| "learning_rate": 3.9932030790546636e-05, | |
| "loss": 1.5769, | |
| "num_input_tokens_seen": 1853882368, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.111, | |
| "grad_norm": 1.5821529626846313, | |
| "learning_rate": 3.9925407529858826e-05, | |
| "loss": 1.787, | |
| "num_input_tokens_seen": 1862270976, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.1115, | |
| "grad_norm": 1.7662357091903687, | |
| "learning_rate": 3.991847699473801e-05, | |
| "loss": 1.5917, | |
| "num_input_tokens_seen": 1870659584, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.112, | |
| "grad_norm": 1.5561827421188354, | |
| "learning_rate": 3.99112392920616e-05, | |
| "loss": 1.5063, | |
| "num_input_tokens_seen": 1879048192, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.1125, | |
| "grad_norm": 1.459555983543396, | |
| "learning_rate": 3.990369453344394e-05, | |
| "loss": 1.6357, | |
| "num_input_tokens_seen": 1887436800, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.113, | |
| "grad_norm": 1.1611353158950806, | |
| "learning_rate": 3.989584283523453e-05, | |
| "loss": 1.4989, | |
| "num_input_tokens_seen": 1895825408, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.1135, | |
| "grad_norm": 1.3985382318496704, | |
| "learning_rate": 3.988768431851628e-05, | |
| "loss": 1.6556, | |
| "num_input_tokens_seen": 1904214016, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.114, | |
| "grad_norm": 1.4399328231811523, | |
| "learning_rate": 3.98792191091036e-05, | |
| "loss": 1.5873, | |
| "num_input_tokens_seen": 1912602624, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.1145, | |
| "grad_norm": 1.1547013521194458, | |
| "learning_rate": 3.987044733754049e-05, | |
| "loss": 1.6546, | |
| "num_input_tokens_seen": 1920991232, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.115, | |
| "grad_norm": 0.9509936571121216, | |
| "learning_rate": 3.986136913909853e-05, | |
| "loss": 1.708, | |
| "num_input_tokens_seen": 1929379840, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.1155, | |
| "grad_norm": 1.4778096675872803, | |
| "learning_rate": 3.985198465377476e-05, | |
| "loss": 1.5725, | |
| "num_input_tokens_seen": 1937768448, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.116, | |
| "grad_norm": 1.0294121503829956, | |
| "learning_rate": 3.9842294026289565e-05, | |
| "loss": 1.6508, | |
| "num_input_tokens_seen": 1946157056, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.1165, | |
| "grad_norm": 1.242614507675171, | |
| "learning_rate": 3.9832297406084386e-05, | |
| "loss": 1.6116, | |
| "num_input_tokens_seen": 1954545664, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 0.117, | |
| "grad_norm": 1.3411107063293457, | |
| "learning_rate": 3.98219949473195e-05, | |
| "loss": 1.5746, | |
| "num_input_tokens_seen": 1962934272, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.1175, | |
| "grad_norm": 1.501001238822937, | |
| "learning_rate": 3.981138680887154e-05, | |
| "loss": 1.5524, | |
| "num_input_tokens_seen": 1971322880, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.118, | |
| "grad_norm": 1.329789638519287, | |
| "learning_rate": 3.980047315433116e-05, | |
| "loss": 1.5056, | |
| "num_input_tokens_seen": 1979711488, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.1185, | |
| "grad_norm": 0.9432595372200012, | |
| "learning_rate": 3.978925415200037e-05, | |
| "loss": 1.598, | |
| "num_input_tokens_seen": 1988100096, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.119, | |
| "grad_norm": 0.8246760368347168, | |
| "learning_rate": 3.97777299748901e-05, | |
| "loss": 1.6249, | |
| "num_input_tokens_seen": 1996488704, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.1195, | |
| "grad_norm": 0.9633040428161621, | |
| "learning_rate": 3.976590080071739e-05, | |
| "loss": 1.4641, | |
| "num_input_tokens_seen": 2004877312, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 1.3066949844360352, | |
| "learning_rate": 3.9753766811902756e-05, | |
| "loss": 1.7069, | |
| "num_input_tokens_seen": 2013265920, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.1205, | |
| "grad_norm": 0.9971048831939697, | |
| "learning_rate": 3.974132819556731e-05, | |
| "loss": 1.4151, | |
| "num_input_tokens_seen": 2021654528, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 0.121, | |
| "grad_norm": 1.5287730693817139, | |
| "learning_rate": 3.972858514352991e-05, | |
| "loss": 1.6452, | |
| "num_input_tokens_seen": 2030043136, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.1215, | |
| "grad_norm": 1.1083930730819702, | |
| "learning_rate": 3.971553785230418e-05, | |
| "loss": 1.539, | |
| "num_input_tokens_seen": 2038431744, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.122, | |
| "grad_norm": 1.510451078414917, | |
| "learning_rate": 3.970218652309548e-05, | |
| "loss": 1.663, | |
| "num_input_tokens_seen": 2046820352, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.1225, | |
| "grad_norm": 1.0733381509780884, | |
| "learning_rate": 3.9688531361797834e-05, | |
| "loss": 1.6186, | |
| "num_input_tokens_seen": 2055208960, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.123, | |
| "grad_norm": 1.0600595474243164, | |
| "learning_rate": 3.9674572578990724e-05, | |
| "loss": 1.5896, | |
| "num_input_tokens_seen": 2063597568, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.1235, | |
| "grad_norm": 1.1060857772827148, | |
| "learning_rate": 3.9660310389935837e-05, | |
| "loss": 1.5245, | |
| "num_input_tokens_seen": 2071986176, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 0.124, | |
| "grad_norm": 1.2730605602264404, | |
| "learning_rate": 3.964574501457378e-05, | |
| "loss": 1.6357, | |
| "num_input_tokens_seen": 2080374784, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 0.1245, | |
| "grad_norm": 1.1669179201126099, | |
| "learning_rate": 3.9630876677520656e-05, | |
| "loss": 1.6614, | |
| "num_input_tokens_seen": 2088763392, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 0.125, | |
| "grad_norm": 1.1669305562973022, | |
| "learning_rate": 3.961570560806461e-05, | |
| "loss": 1.6551, | |
| "num_input_tokens_seen": 2097152000, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.1255, | |
| "grad_norm": 1.2370777130126953, | |
| "learning_rate": 3.960023204016231e-05, | |
| "loss": 1.5691, | |
| "num_input_tokens_seen": 2105540608, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 0.126, | |
| "grad_norm": 1.1188710927963257, | |
| "learning_rate": 3.958445621243532e-05, | |
| "loss": 1.5289, | |
| "num_input_tokens_seen": 2113929216, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.1265, | |
| "grad_norm": 1.344351887702942, | |
| "learning_rate": 3.9568378368166406e-05, | |
| "loss": 1.5749, | |
| "num_input_tokens_seen": 2122317824, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 0.127, | |
| "grad_norm": 1.4007827043533325, | |
| "learning_rate": 3.955199875529582e-05, | |
| "loss": 1.5165, | |
| "num_input_tokens_seen": 2130706432, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 0.1275, | |
| "grad_norm": 1.2763276100158691, | |
| "learning_rate": 3.953531762641745e-05, | |
| "loss": 1.571, | |
| "num_input_tokens_seen": 2139095040, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.128, | |
| "grad_norm": 1.1570581197738647, | |
| "learning_rate": 3.951833523877495e-05, | |
| "loss": 1.4853, | |
| "num_input_tokens_seen": 2147483648, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.1285, | |
| "grad_norm": 1.0198652744293213, | |
| "learning_rate": 3.9501051854257745e-05, | |
| "loss": 1.5691, | |
| "num_input_tokens_seen": 2155872256, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 0.129, | |
| "grad_norm": 1.125144600868225, | |
| "learning_rate": 3.948346773939699e-05, | |
| "loss": 1.5856, | |
| "num_input_tokens_seen": 2164260864, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 0.1295, | |
| "grad_norm": 0.8690728545188904, | |
| "learning_rate": 3.94655831653615e-05, | |
| "loss": 1.6253, | |
| "num_input_tokens_seen": 2172649472, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 1.2358198165893555, | |
| "learning_rate": 3.9447398407953536e-05, | |
| "loss": 1.4732, | |
| "num_input_tokens_seen": 2181038080, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.1305, | |
| "grad_norm": 0.9834250211715698, | |
| "learning_rate": 3.942891374760455e-05, | |
| "loss": 1.8804, | |
| "num_input_tokens_seen": 2189426688, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 0.131, | |
| "grad_norm": 0.9687766432762146, | |
| "learning_rate": 3.941012946937085e-05, | |
| "loss": 1.5559, | |
| "num_input_tokens_seen": 2197815296, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 0.1315, | |
| "grad_norm": 1.0704816579818726, | |
| "learning_rate": 3.9391045862929275e-05, | |
| "loss": 1.5955, | |
| "num_input_tokens_seen": 2206203904, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 0.132, | |
| "grad_norm": 0.9592090249061584, | |
| "learning_rate": 3.9371663222572625e-05, | |
| "loss": 1.5681, | |
| "num_input_tokens_seen": 2214592512, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 0.1325, | |
| "grad_norm": 0.9682651162147522, | |
| "learning_rate": 3.93519818472052e-05, | |
| "loss": 1.4599, | |
| "num_input_tokens_seen": 2222981120, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.133, | |
| "grad_norm": 1.170053243637085, | |
| "learning_rate": 3.933200204033815e-05, | |
| "loss": 1.472, | |
| "num_input_tokens_seen": 2231369728, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 0.1335, | |
| "grad_norm": 1.0805102586746216, | |
| "learning_rate": 3.931172411008482e-05, | |
| "loss": 1.6009, | |
| "num_input_tokens_seen": 2239758336, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 0.134, | |
| "grad_norm": 1.4825639724731445, | |
| "learning_rate": 3.9291148369155964e-05, | |
| "loss": 1.5004, | |
| "num_input_tokens_seen": 2248146944, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 0.1345, | |
| "grad_norm": 0.8528715968132019, | |
| "learning_rate": 3.927027513485498e-05, | |
| "loss": 1.6396, | |
| "num_input_tokens_seen": 2256535552, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 0.135, | |
| "grad_norm": 1.50923752784729, | |
| "learning_rate": 3.9249104729072944e-05, | |
| "loss": 1.6655, | |
| "num_input_tokens_seen": 2264924160, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.1355, | |
| "grad_norm": 0.8830758929252625, | |
| "learning_rate": 3.9227637478283725e-05, | |
| "loss": 1.5853, | |
| "num_input_tokens_seen": 2273312768, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 0.136, | |
| "grad_norm": 1.577521800994873, | |
| "learning_rate": 3.9205873713538864e-05, | |
| "loss": 1.8288, | |
| "num_input_tokens_seen": 2281701376, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 0.1365, | |
| "grad_norm": 1.2477850914001465, | |
| "learning_rate": 3.918381377046255e-05, | |
| "loss": 1.4954, | |
| "num_input_tokens_seen": 2290089984, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 0.137, | |
| "grad_norm": 1.199828028678894, | |
| "learning_rate": 3.916145798924639e-05, | |
| "loss": 1.7353, | |
| "num_input_tokens_seen": 2298478592, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 0.1375, | |
| "grad_norm": 1.0330992937088013, | |
| "learning_rate": 3.913880671464418e-05, | |
| "loss": 1.5266, | |
| "num_input_tokens_seen": 2306867200, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.138, | |
| "grad_norm": 1.4635016918182373, | |
| "learning_rate": 3.911586029596661e-05, | |
| "loss": 1.7229, | |
| "num_input_tokens_seen": 2315255808, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 0.1385, | |
| "grad_norm": 0.9528157114982605, | |
| "learning_rate": 3.9092619087075825e-05, | |
| "loss": 1.5581, | |
| "num_input_tokens_seen": 2323644416, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 0.139, | |
| "grad_norm": 1.5162746906280518, | |
| "learning_rate": 3.906908344638002e-05, | |
| "loss": 1.5841, | |
| "num_input_tokens_seen": 2332033024, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 0.1395, | |
| "grad_norm": 0.930320680141449, | |
| "learning_rate": 3.904525373682791e-05, | |
| "loss": 1.6222, | |
| "num_input_tokens_seen": 2340421632, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 1.5436041355133057, | |
| "learning_rate": 3.9021130325903076e-05, | |
| "loss": 1.639, | |
| "num_input_tokens_seen": 2348810240, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.1405, | |
| "grad_norm": 1.2980411052703857, | |
| "learning_rate": 3.8996713585618354e-05, | |
| "loss": 1.6228, | |
| "num_input_tokens_seen": 2357198848, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 0.141, | |
| "grad_norm": 1.1375272274017334, | |
| "learning_rate": 3.897200389251009e-05, | |
| "loss": 1.5062, | |
| "num_input_tokens_seen": 2365587456, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 0.1415, | |
| "grad_norm": 0.9488835334777832, | |
| "learning_rate": 3.8947001627632326e-05, | |
| "loss": 1.6049, | |
| "num_input_tokens_seen": 2373976064, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 0.142, | |
| "grad_norm": 0.9401798248291016, | |
| "learning_rate": 3.892170717655091e-05, | |
| "loss": 1.6477, | |
| "num_input_tokens_seen": 2382364672, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 0.1425, | |
| "grad_norm": 0.8705446124076843, | |
| "learning_rate": 3.889612092933756e-05, | |
| "loss": 1.6496, | |
| "num_input_tokens_seen": 2390753280, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.143, | |
| "grad_norm": 1.1039817333221436, | |
| "learning_rate": 3.887024328056387e-05, | |
| "loss": 1.4861, | |
| "num_input_tokens_seen": 2399141888, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 0.1435, | |
| "grad_norm": 1.0411194562911987, | |
| "learning_rate": 3.88440746292952e-05, | |
| "loss": 1.6675, | |
| "num_input_tokens_seen": 2407530496, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 0.144, | |
| "grad_norm": 0.9333168268203735, | |
| "learning_rate": 3.8817615379084514e-05, | |
| "loss": 1.5752, | |
| "num_input_tokens_seen": 2415919104, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 0.1445, | |
| "grad_norm": 1.1265161037445068, | |
| "learning_rate": 3.879086593796618e-05, | |
| "loss": 1.5223, | |
| "num_input_tokens_seen": 2424307712, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 0.145, | |
| "grad_norm": 1.404354214668274, | |
| "learning_rate": 3.876382671844969e-05, | |
| "loss": 1.5282, | |
| "num_input_tokens_seen": 2432696320, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.1455, | |
| "grad_norm": 0.9804477095603943, | |
| "learning_rate": 3.873649813751323e-05, | |
| "loss": 1.4289, | |
| "num_input_tokens_seen": 2441084928, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 0.146, | |
| "grad_norm": 0.8276107311248779, | |
| "learning_rate": 3.870888061659735e-05, | |
| "loss": 1.458, | |
| "num_input_tokens_seen": 2449473536, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 0.1465, | |
| "grad_norm": 0.9293767213821411, | |
| "learning_rate": 3.8680974581598375e-05, | |
| "loss": 1.4105, | |
| "num_input_tokens_seen": 2457862144, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 0.147, | |
| "grad_norm": 0.8964887261390686, | |
| "learning_rate": 3.865278046286189e-05, | |
| "loss": 1.6092, | |
| "num_input_tokens_seen": 2466250752, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 0.1475, | |
| "grad_norm": 1.1574569940567017, | |
| "learning_rate": 3.862429869517607e-05, | |
| "loss": 1.5505, | |
| "num_input_tokens_seen": 2474639360, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.148, | |
| "grad_norm": 0.9000768661499023, | |
| "learning_rate": 3.859552971776503e-05, | |
| "loss": 1.4806, | |
| "num_input_tokens_seen": 2483027968, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 0.1485, | |
| "grad_norm": 0.8124544620513916, | |
| "learning_rate": 3.856647397428198e-05, | |
| "loss": 1.6933, | |
| "num_input_tokens_seen": 2491416576, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 0.149, | |
| "grad_norm": 0.7809738516807556, | |
| "learning_rate": 3.853713191280242e-05, | |
| "loss": 1.7366, | |
| "num_input_tokens_seen": 2499805184, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 0.1495, | |
| "grad_norm": 0.8317289352416992, | |
| "learning_rate": 3.850750398581725e-05, | |
| "loss": 1.6753, | |
| "num_input_tokens_seen": 2508193792, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 0.8096636533737183, | |
| "learning_rate": 3.8477590650225735e-05, | |
| "loss": 1.4439, | |
| "num_input_tokens_seen": 2516582400, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.1505, | |
| "grad_norm": 0.7422524094581604, | |
| "learning_rate": 3.8447392367328535e-05, | |
| "loss": 1.5424, | |
| "num_input_tokens_seen": 2524971008, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 0.151, | |
| "grad_norm": 1.0304864645004272, | |
| "learning_rate": 3.8416909602820534e-05, | |
| "loss": 1.4648, | |
| "num_input_tokens_seen": 2533359616, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 0.1515, | |
| "grad_norm": 1.0413343906402588, | |
| "learning_rate": 3.8386142826783645e-05, | |
| "loss": 1.4848, | |
| "num_input_tokens_seen": 2541748224, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 0.152, | |
| "grad_norm": 0.7976297736167908, | |
| "learning_rate": 3.835509251367963e-05, | |
| "loss": 1.6543, | |
| "num_input_tokens_seen": 2550136832, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 0.1525, | |
| "grad_norm": 0.9848310947418213, | |
| "learning_rate": 3.832375914234272e-05, | |
| "loss": 1.4926, | |
| "num_input_tokens_seen": 2558525440, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.153, | |
| "grad_norm": 1.7787138223648071, | |
| "learning_rate": 3.829214319597228e-05, | |
| "loss": 1.4342, | |
| "num_input_tokens_seen": 2566914048, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 0.1535, | |
| "grad_norm": 1.0632442235946655, | |
| "learning_rate": 3.826024516212529e-05, | |
| "loss": 1.6567, | |
| "num_input_tokens_seen": 2575302656, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 0.154, | |
| "grad_norm": 0.9911245703697205, | |
| "learning_rate": 3.8228065532708905e-05, | |
| "loss": 1.7441, | |
| "num_input_tokens_seen": 2583691264, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 0.1545, | |
| "grad_norm": 4.416871070861816, | |
| "learning_rate": 3.819560480397282e-05, | |
| "loss": 1.5855, | |
| "num_input_tokens_seen": 2592079872, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 0.155, | |
| "grad_norm": 1.6025869846343994, | |
| "learning_rate": 3.816286347650163e-05, | |
| "loss": 1.4225, | |
| "num_input_tokens_seen": 2600468480, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.1555, | |
| "grad_norm": 1.451319932937622, | |
| "learning_rate": 3.81298420552071e-05, | |
| "loss": 1.6691, | |
| "num_input_tokens_seen": 2608857088, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 0.156, | |
| "grad_norm": 1.2376505136489868, | |
| "learning_rate": 3.809654104932039e-05, | |
| "loss": 1.5486, | |
| "num_input_tokens_seen": 2617245696, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 0.1565, | |
| "grad_norm": 1.3247355222702026, | |
| "learning_rate": 3.8062960972384223e-05, | |
| "loss": 1.5454, | |
| "num_input_tokens_seen": 2625634304, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 0.157, | |
| "grad_norm": 1.2022751569747925, | |
| "learning_rate": 3.802910234224491e-05, | |
| "loss": 1.6298, | |
| "num_input_tokens_seen": 2634022912, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 0.1575, | |
| "grad_norm": 0.9472240805625916, | |
| "learning_rate": 3.7994965681044436e-05, | |
| "loss": 1.6695, | |
| "num_input_tokens_seen": 2642411520, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.158, | |
| "grad_norm": 0.9871085286140442, | |
| "learning_rate": 3.796055151521231e-05, | |
| "loss": 1.5988, | |
| "num_input_tokens_seen": 2650800128, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 0.1585, | |
| "grad_norm": 1.435222864151001, | |
| "learning_rate": 3.792586037545758e-05, | |
| "loss": 1.6043, | |
| "num_input_tokens_seen": 2659188736, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 0.159, | |
| "grad_norm": 1.1999521255493164, | |
| "learning_rate": 3.78908927967605e-05, | |
| "loss": 1.6211, | |
| "num_input_tokens_seen": 2667577344, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 0.1595, | |
| "grad_norm": 0.8386388421058655, | |
| "learning_rate": 3.785564931836442e-05, | |
| "loss": 1.6005, | |
| "num_input_tokens_seen": 2675965952, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 1.205376148223877, | |
| "learning_rate": 3.782013048376736e-05, | |
| "loss": 1.6663, | |
| "num_input_tokens_seen": 2684354560, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.1605, | |
| "grad_norm": 0.9612499475479126, | |
| "learning_rate": 3.778433684071369e-05, | |
| "loss": 1.5949, | |
| "num_input_tokens_seen": 2692743168, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 0.161, | |
| "grad_norm": 0.8631348609924316, | |
| "learning_rate": 3.774826894118567e-05, | |
| "loss": 1.5974, | |
| "num_input_tokens_seen": 2701131776, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 0.1615, | |
| "grad_norm": 0.8702800273895264, | |
| "learning_rate": 3.7711927341394916e-05, | |
| "loss": 1.624, | |
| "num_input_tokens_seen": 2709520384, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 0.162, | |
| "grad_norm": 1.03826904296875, | |
| "learning_rate": 3.7675312601773874e-05, | |
| "loss": 1.5419, | |
| "num_input_tokens_seen": 2717908992, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 0.1625, | |
| "grad_norm": 0.9504969716072083, | |
| "learning_rate": 3.76384252869671e-05, | |
| "loss": 1.5117, | |
| "num_input_tokens_seen": 2726297600, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.163, | |
| "grad_norm": 0.9232383966445923, | |
| "learning_rate": 3.760126596582264e-05, | |
| "loss": 1.5539, | |
| "num_input_tokens_seen": 2734686208, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 0.1635, | |
| "grad_norm": 0.7623271942138672, | |
| "learning_rate": 3.756383521138319e-05, | |
| "loss": 1.6595, | |
| "num_input_tokens_seen": 2743074816, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 0.164, | |
| "grad_norm": 0.7739065289497375, | |
| "learning_rate": 3.7526133600877275e-05, | |
| "loss": 1.6576, | |
| "num_input_tokens_seen": 2751463424, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 0.1645, | |
| "grad_norm": 0.7299744486808777, | |
| "learning_rate": 3.748816171571038e-05, | |
| "loss": 1.6392, | |
| "num_input_tokens_seen": 2759852032, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 0.165, | |
| "grad_norm": 0.7746779322624207, | |
| "learning_rate": 3.744992014145595e-05, | |
| "loss": 1.4524, | |
| "num_input_tokens_seen": 2768240640, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.1655, | |
| "grad_norm": 1.0410550832748413, | |
| "learning_rate": 3.741140946784635e-05, | |
| "loss": 1.4499, | |
| "num_input_tokens_seen": 2776629248, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 0.166, | |
| "grad_norm": 0.9332314133644104, | |
| "learning_rate": 3.737263028876383e-05, | |
| "loss": 1.5378, | |
| "num_input_tokens_seen": 2785017856, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 0.1665, | |
| "grad_norm": 0.7206718921661377, | |
| "learning_rate": 3.733358320223128e-05, | |
| "loss": 1.5865, | |
| "num_input_tokens_seen": 2793406464, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 0.167, | |
| "grad_norm": 0.7538252472877502, | |
| "learning_rate": 3.729426881040311e-05, | |
| "loss": 1.5233, | |
| "num_input_tokens_seen": 2801795072, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 0.1675, | |
| "grad_norm": 1.1447285413742065, | |
| "learning_rate": 3.725468771955584e-05, | |
| "loss": 1.6, | |
| "num_input_tokens_seen": 2810183680, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.168, | |
| "grad_norm": 1.0104703903198242, | |
| "learning_rate": 3.721484054007888e-05, | |
| "loss": 1.5188, | |
| "num_input_tokens_seen": 2818572288, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 0.1685, | |
| "grad_norm": 1.0000296831130981, | |
| "learning_rate": 3.717472788646501e-05, | |
| "loss": 1.5123, | |
| "num_input_tokens_seen": 2826960896, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 0.169, | |
| "grad_norm": 0.9542180895805359, | |
| "learning_rate": 3.7134350377301e-05, | |
| "loss": 1.5264, | |
| "num_input_tokens_seen": 2835349504, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 0.1695, | |
| "grad_norm": 1.0717406272888184, | |
| "learning_rate": 3.709370863525796e-05, | |
| "loss": 1.5088, | |
| "num_input_tokens_seen": 2843738112, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 1.015666127204895, | |
| "learning_rate": 3.705280328708185e-05, | |
| "loss": 1.4847, | |
| "num_input_tokens_seen": 2852126720, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.1705, | |
| "grad_norm": 0.9344543218612671, | |
| "learning_rate": 3.701163496358373e-05, | |
| "loss": 1.9649, | |
| "num_input_tokens_seen": 2860515328, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 0.171, | |
| "grad_norm": 0.9627848267555237, | |
| "learning_rate": 3.6970204299630077e-05, | |
| "loss": 1.5983, | |
| "num_input_tokens_seen": 2868903936, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 0.1715, | |
| "grad_norm": 1.7102587223052979, | |
| "learning_rate": 3.692851193413299e-05, | |
| "loss": 1.6775, | |
| "num_input_tokens_seen": 2877292544, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 0.172, | |
| "grad_norm": 1.195408582687378, | |
| "learning_rate": 3.6886558510040305e-05, | |
| "loss": 1.5342, | |
| "num_input_tokens_seen": 2885681152, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 0.1725, | |
| "grad_norm": 1.8404492139816284, | |
| "learning_rate": 3.684434467432573e-05, | |
| "loss": 1.5057, | |
| "num_input_tokens_seen": 2894069760, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.173, | |
| "grad_norm": 1.3141196966171265, | |
| "learning_rate": 3.680187107797884e-05, | |
| "loss": 1.708, | |
| "num_input_tokens_seen": 2902458368, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 0.1735, | |
| "grad_norm": 1.7167338132858276, | |
| "learning_rate": 3.675913837599503e-05, | |
| "loss": 1.5517, | |
| "num_input_tokens_seen": 2910846976, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 0.174, | |
| "grad_norm": 1.209182858467102, | |
| "learning_rate": 3.671614722736541e-05, | |
| "loss": 1.5185, | |
| "num_input_tokens_seen": 2919235584, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 0.1745, | |
| "grad_norm": 1.7432202100753784, | |
| "learning_rate": 3.667289829506669e-05, | |
| "loss": 1.5564, | |
| "num_input_tokens_seen": 2927624192, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 0.175, | |
| "grad_norm": 1.204798698425293, | |
| "learning_rate": 3.662939224605091e-05, | |
| "loss": 1.568, | |
| "num_input_tokens_seen": 2936012800, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.1755, | |
| "grad_norm": 1.8268729448318481, | |
| "learning_rate": 3.658562975123516e-05, | |
| "loss": 1.5379, | |
| "num_input_tokens_seen": 2944401408, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 0.176, | |
| "grad_norm": 1.4031938314437866, | |
| "learning_rate": 3.654161148549124e-05, | |
| "loss": 1.6265, | |
| "num_input_tokens_seen": 2952790016, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 0.1765, | |
| "grad_norm": 1.4836796522140503, | |
| "learning_rate": 3.649733812763527e-05, | |
| "loss": 1.5787, | |
| "num_input_tokens_seen": 2961178624, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 0.177, | |
| "grad_norm": 1.1972601413726807, | |
| "learning_rate": 3.64528103604172e-05, | |
| "loss": 1.5386, | |
| "num_input_tokens_seen": 2969567232, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 0.1775, | |
| "grad_norm": 1.1761174201965332, | |
| "learning_rate": 3.640802887051027e-05, | |
| "loss": 1.6225, | |
| "num_input_tokens_seen": 2977955840, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.178, | |
| "grad_norm": 1.0581141710281372, | |
| "learning_rate": 3.636299434850047e-05, | |
| "loss": 1.515, | |
| "num_input_tokens_seen": 2986344448, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 0.1785, | |
| "grad_norm": 1.0779668092727661, | |
| "learning_rate": 3.631770748887583e-05, | |
| "loss": 1.5816, | |
| "num_input_tokens_seen": 2994733056, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 0.179, | |
| "grad_norm": 1.050459623336792, | |
| "learning_rate": 3.627216899001575e-05, | |
| "loss": 1.7315, | |
| "num_input_tokens_seen": 3003121664, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 0.1795, | |
| "grad_norm": 0.9864397048950195, | |
| "learning_rate": 3.62263795541802e-05, | |
| "loss": 1.6159, | |
| "num_input_tokens_seen": 3011510272, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 0.8089951872825623, | |
| "learning_rate": 3.6180339887498953e-05, | |
| "loss": 1.3576, | |
| "num_input_tokens_seen": 3019898880, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.1805, | |
| "grad_norm": 0.7993234395980835, | |
| "learning_rate": 3.6134050699960604e-05, | |
| "loss": 1.7524, | |
| "num_input_tokens_seen": 3028287488, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 0.181, | |
| "grad_norm": 1.0345861911773682, | |
| "learning_rate": 3.608751270540169e-05, | |
| "loss": 1.6794, | |
| "num_input_tokens_seen": 3036676096, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 0.1815, | |
| "grad_norm": 0.8384249210357666, | |
| "learning_rate": 3.604072662149567e-05, | |
| "loss": 1.458, | |
| "num_input_tokens_seen": 3045064704, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 0.182, | |
| "grad_norm": 0.9244734644889832, | |
| "learning_rate": 3.599369316974182e-05, | |
| "loss": 1.6775, | |
| "num_input_tokens_seen": 3053453312, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 0.1825, | |
| "grad_norm": 0.8893707394599915, | |
| "learning_rate": 3.594641307545414e-05, | |
| "loss": 1.5501, | |
| "num_input_tokens_seen": 3061841920, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.183, | |
| "grad_norm": 0.9696424007415771, | |
| "learning_rate": 3.58988870677502e-05, | |
| "loss": 1.5262, | |
| "num_input_tokens_seen": 3070230528, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 0.1835, | |
| "grad_norm": 0.7615416646003723, | |
| "learning_rate": 3.585111587953982e-05, | |
| "loss": 1.5647, | |
| "num_input_tokens_seen": 3078619136, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 0.184, | |
| "grad_norm": 0.759972333908081, | |
| "learning_rate": 3.580310024751381e-05, | |
| "loss": 1.5869, | |
| "num_input_tokens_seen": 3087007744, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 0.1845, | |
| "grad_norm": 0.7315559983253479, | |
| "learning_rate": 3.575484091213262e-05, | |
| "loss": 1.4055, | |
| "num_input_tokens_seen": 3095396352, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 0.185, | |
| "grad_norm": 0.7774316668510437, | |
| "learning_rate": 3.57063386176149e-05, | |
| "loss": 1.521, | |
| "num_input_tokens_seen": 3103784960, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.1855, | |
| "grad_norm": 0.879809558391571, | |
| "learning_rate": 3.565759411192604e-05, | |
| "loss": 1.4916, | |
| "num_input_tokens_seen": 3112173568, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 0.186, | |
| "grad_norm": 0.7339460849761963, | |
| "learning_rate": 3.5608608146766597e-05, | |
| "loss": 1.5412, | |
| "num_input_tokens_seen": 3120562176, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 0.1865, | |
| "grad_norm": 0.7702769637107849, | |
| "learning_rate": 3.555938147756077e-05, | |
| "loss": 1.6866, | |
| "num_input_tokens_seen": 3128950784, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 0.187, | |
| "grad_norm": 0.6904353499412537, | |
| "learning_rate": 3.5509914863444694e-05, | |
| "loss": 1.6718, | |
| "num_input_tokens_seen": 3137339392, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 0.1875, | |
| "grad_norm": 0.572349488735199, | |
| "learning_rate": 3.546020906725474e-05, | |
| "loss": 1.4651, | |
| "num_input_tokens_seen": 3145728000, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.188, | |
| "grad_norm": 0.7264822125434875, | |
| "learning_rate": 3.541026485551579e-05, | |
| "loss": 1.5973, | |
| "num_input_tokens_seen": 3154116608, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 0.1885, | |
| "grad_norm": 0.5672950744628906, | |
| "learning_rate": 3.536008299842936e-05, | |
| "loss": 1.6443, | |
| "num_input_tokens_seen": 3162505216, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 0.189, | |
| "grad_norm": 0.6739945411682129, | |
| "learning_rate": 3.530966426986177e-05, | |
| "loss": 1.5385, | |
| "num_input_tokens_seen": 3170893824, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 0.1895, | |
| "grad_norm": 0.6269484758377075, | |
| "learning_rate": 3.525900944733218e-05, | |
| "loss": 1.6951, | |
| "num_input_tokens_seen": 3179282432, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 0.7589824199676514, | |
| "learning_rate": 3.520811931200063e-05, | |
| "loss": 1.5563, | |
| "num_input_tokens_seen": 3187671040, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.1905, | |
| "grad_norm": 0.9890035390853882, | |
| "learning_rate": 3.515699464865594e-05, | |
| "loss": 1.5256, | |
| "num_input_tokens_seen": 3196059648, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 0.191, | |
| "grad_norm": 1.1618750095367432, | |
| "learning_rate": 3.5105636245703675e-05, | |
| "loss": 1.5294, | |
| "num_input_tokens_seen": 3204448256, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 0.1915, | |
| "grad_norm": 0.7670680284500122, | |
| "learning_rate": 3.505404489515394e-05, | |
| "loss": 1.4686, | |
| "num_input_tokens_seen": 3212836864, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 0.192, | |
| "grad_norm": 0.6254603862762451, | |
| "learning_rate": 3.5002221392609196e-05, | |
| "loss": 1.61, | |
| "num_input_tokens_seen": 3221225472, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 0.1925, | |
| "grad_norm": 0.8577165007591248, | |
| "learning_rate": 3.495016653725194e-05, | |
| "loss": 1.4902, | |
| "num_input_tokens_seen": 3229614080, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.193, | |
| "grad_norm": 1.0208790302276611, | |
| "learning_rate": 3.489788113183244e-05, | |
| "loss": 1.6458, | |
| "num_input_tokens_seen": 3238002688, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 0.1935, | |
| "grad_norm": 0.6695219874382019, | |
| "learning_rate": 3.484536598265634e-05, | |
| "loss": 1.4563, | |
| "num_input_tokens_seen": 3246391296, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 0.194, | |
| "grad_norm": 1.5969139337539673, | |
| "learning_rate": 3.47926218995722e-05, | |
| "loss": 1.4984, | |
| "num_input_tokens_seen": 3254779904, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 0.1945, | |
| "grad_norm": 0.8430823683738708, | |
| "learning_rate": 3.473964969595902e-05, | |
| "loss": 1.5257, | |
| "num_input_tokens_seen": 3263168512, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 0.195, | |
| "grad_norm": 0.9442088007926941, | |
| "learning_rate": 3.468645018871371e-05, | |
| "loss": 1.572, | |
| "num_input_tokens_seen": 3271557120, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.1955, | |
| "grad_norm": 0.8615273237228394, | |
| "learning_rate": 3.46330241982385e-05, | |
| "loss": 1.5203, | |
| "num_input_tokens_seen": 3279945728, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 0.196, | |
| "grad_norm": 0.8076956868171692, | |
| "learning_rate": 3.457937254842823e-05, | |
| "loss": 1.5902, | |
| "num_input_tokens_seen": 3288334336, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 0.1965, | |
| "grad_norm": 0.944568395614624, | |
| "learning_rate": 3.4525496066657735e-05, | |
| "loss": 1.4154, | |
| "num_input_tokens_seen": 3296722944, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 0.197, | |
| "grad_norm": 0.8723218441009521, | |
| "learning_rate": 3.4471395583768985e-05, | |
| "loss": 1.4952, | |
| "num_input_tokens_seen": 3305111552, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 0.1975, | |
| "grad_norm": 0.7548305988311768, | |
| "learning_rate": 3.441707193405838e-05, | |
| "loss": 1.5176, | |
| "num_input_tokens_seen": 3313500160, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.198, | |
| "grad_norm": 1.0144670009613037, | |
| "learning_rate": 3.436252595526378e-05, | |
| "loss": 1.4915, | |
| "num_input_tokens_seen": 3321888768, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 0.1985, | |
| "grad_norm": 1.0607627630233765, | |
| "learning_rate": 3.430775848855166e-05, | |
| "loss": 1.4538, | |
| "num_input_tokens_seen": 3330277376, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 0.199, | |
| "grad_norm": 0.7729026079177856, | |
| "learning_rate": 3.425277037850411e-05, | |
| "loss": 1.5519, | |
| "num_input_tokens_seen": 3338665984, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 0.1995, | |
| "grad_norm": 0.7398813366889954, | |
| "learning_rate": 3.419756247310581e-05, | |
| "loss": 1.6044, | |
| "num_input_tokens_seen": 3347054592, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 1.170116901397705, | |
| "learning_rate": 3.4142135623730954e-05, | |
| "loss": 1.4198, | |
| "num_input_tokens_seen": 3355443200, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.2005, | |
| "grad_norm": 1.000035285949707, | |
| "learning_rate": 3.408649068513013e-05, | |
| "loss": 1.6274, | |
| "num_input_tokens_seen": 3363831808, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 0.201, | |
| "grad_norm": 0.8894368410110474, | |
| "learning_rate": 3.403062851541712e-05, | |
| "loss": 1.6038, | |
| "num_input_tokens_seen": 3372220416, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 0.2015, | |
| "grad_norm": 0.7087017297744751, | |
| "learning_rate": 3.397454997605569e-05, | |
| "loss": 1.5605, | |
| "num_input_tokens_seen": 3380609024, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 0.202, | |
| "grad_norm": 0.9844698905944824, | |
| "learning_rate": 3.391825593184629e-05, | |
| "loss": 1.5458, | |
| "num_input_tokens_seen": 3388997632, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 0.2025, | |
| "grad_norm": 0.779665470123291, | |
| "learning_rate": 3.3861747250912724e-05, | |
| "loss": 1.6731, | |
| "num_input_tokens_seen": 3397386240, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.203, | |
| "grad_norm": 0.866980791091919, | |
| "learning_rate": 3.3805024804688745e-05, | |
| "loss": 1.5318, | |
| "num_input_tokens_seen": 3405774848, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 0.2035, | |
| "grad_norm": 0.9056677222251892, | |
| "learning_rate": 3.374808946790466e-05, | |
| "loss": 1.5489, | |
| "num_input_tokens_seen": 3414163456, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 0.204, | |
| "grad_norm": 0.706774115562439, | |
| "learning_rate": 3.369094211857378e-05, | |
| "loss": 1.4679, | |
| "num_input_tokens_seen": 3422552064, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 0.2045, | |
| "grad_norm": 0.9233213067054749, | |
| "learning_rate": 3.363358363797893e-05, | |
| "loss": 1.7411, | |
| "num_input_tokens_seen": 3430940672, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 0.205, | |
| "grad_norm": 0.7863237261772156, | |
| "learning_rate": 3.357601491065884e-05, | |
| "loss": 1.6124, | |
| "num_input_tokens_seen": 3439329280, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.2055, | |
| "grad_norm": 0.8109866380691528, | |
| "learning_rate": 3.35182368243945e-05, | |
| "loss": 1.6007, | |
| "num_input_tokens_seen": 3447717888, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 0.206, | |
| "grad_norm": 1.6634656190872192, | |
| "learning_rate": 3.346025027019547e-05, | |
| "loss": 1.5278, | |
| "num_input_tokens_seen": 3456106496, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 0.2065, | |
| "grad_norm": 0.989840030670166, | |
| "learning_rate": 3.3402056142286156e-05, | |
| "loss": 1.5711, | |
| "num_input_tokens_seen": 3464495104, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 0.207, | |
| "grad_norm": 1.2160106897354126, | |
| "learning_rate": 3.3343655338091996e-05, | |
| "loss": 1.6061, | |
| "num_input_tokens_seen": 3472883712, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 0.2075, | |
| "grad_norm": 0.6767374873161316, | |
| "learning_rate": 3.328504875822564e-05, | |
| "loss": 1.6041, | |
| "num_input_tokens_seen": 3481272320, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.208, | |
| "grad_norm": 1.2567026615142822, | |
| "learning_rate": 3.322623730647304e-05, | |
| "loss": 1.6222, | |
| "num_input_tokens_seen": 3489660928, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 0.2085, | |
| "grad_norm": 0.7106500267982483, | |
| "learning_rate": 3.316722188977955e-05, | |
| "loss": 1.5692, | |
| "num_input_tokens_seen": 3498049536, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 0.209, | |
| "grad_norm": 0.8964235186576843, | |
| "learning_rate": 3.310800341823588e-05, | |
| "loss": 1.5388, | |
| "num_input_tokens_seen": 3506438144, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 0.2095, | |
| "grad_norm": 0.7153457999229431, | |
| "learning_rate": 3.3048582805064137e-05, | |
| "loss": 1.5536, | |
| "num_input_tokens_seen": 3514826752, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 0.8886182308197021, | |
| "learning_rate": 3.298896096660367e-05, | |
| "loss": 1.3777, | |
| "num_input_tokens_seen": 3523215360, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.2105, | |
| "grad_norm": 0.9441683292388916, | |
| "learning_rate": 3.2929138822297004e-05, | |
| "loss": 1.5147, | |
| "num_input_tokens_seen": 3531603968, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 0.211, | |
| "grad_norm": 0.9555985927581787, | |
| "learning_rate": 3.286911729467558e-05, | |
| "loss": 1.4988, | |
| "num_input_tokens_seen": 3539992576, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 0.2115, | |
| "grad_norm": 0.8580599427223206, | |
| "learning_rate": 3.280889730934562e-05, | |
| "loss": 1.4216, | |
| "num_input_tokens_seen": 3548381184, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 0.212, | |
| "grad_norm": 0.6099493503570557, | |
| "learning_rate": 3.27484797949738e-05, | |
| "loss": 1.5914, | |
| "num_input_tokens_seen": 3556769792, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 0.2125, | |
| "grad_norm": 0.8253036737442017, | |
| "learning_rate": 3.268786568327291e-05, | |
| "loss": 1.6376, | |
| "num_input_tokens_seen": 3565158400, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.213, | |
| "grad_norm": 0.8198022842407227, | |
| "learning_rate": 3.262705590898756e-05, | |
| "loss": 1.5287, | |
| "num_input_tokens_seen": 3573547008, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 0.2135, | |
| "grad_norm": 0.6324616074562073, | |
| "learning_rate": 3.2566051409879676e-05, | |
| "loss": 1.4937, | |
| "num_input_tokens_seen": 3581935616, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 0.214, | |
| "grad_norm": 0.7904665470123291, | |
| "learning_rate": 3.250485312671411e-05, | |
| "loss": 1.433, | |
| "num_input_tokens_seen": 3590324224, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 0.2145, | |
| "grad_norm": 0.8345609307289124, | |
| "learning_rate": 3.244346200324409e-05, | |
| "loss": 1.7622, | |
| "num_input_tokens_seen": 3598712832, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 0.215, | |
| "grad_norm": 0.764594316482544, | |
| "learning_rate": 3.238187898619669e-05, | |
| "loss": 1.5426, | |
| "num_input_tokens_seen": 3607101440, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.2155, | |
| "grad_norm": 1.053906798362732, | |
| "learning_rate": 3.23201050252582e-05, | |
| "loss": 1.4862, | |
| "num_input_tokens_seen": 3615490048, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 0.216, | |
| "grad_norm": 0.8447480201721191, | |
| "learning_rate": 3.2258141073059533e-05, | |
| "loss": 1.4908, | |
| "num_input_tokens_seen": 3623878656, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 0.2165, | |
| "grad_norm": 0.6034976243972778, | |
| "learning_rate": 3.219598808516148e-05, | |
| "loss": 1.5979, | |
| "num_input_tokens_seen": 3632267264, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 0.217, | |
| "grad_norm": 0.6233925223350525, | |
| "learning_rate": 3.2133647020039995e-05, | |
| "loss": 1.4783, | |
| "num_input_tokens_seen": 3640655872, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 0.2175, | |
| "grad_norm": 0.734876275062561, | |
| "learning_rate": 3.207111883907143e-05, | |
| "loss": 1.5213, | |
| "num_input_tokens_seen": 3649044480, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.218, | |
| "grad_norm": 0.8191720843315125, | |
| "learning_rate": 3.200840450651769e-05, | |
| "loss": 1.4316, | |
| "num_input_tokens_seen": 3657433088, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 0.2185, | |
| "grad_norm": 0.7886914610862732, | |
| "learning_rate": 3.194550498951134e-05, | |
| "loss": 1.4348, | |
| "num_input_tokens_seen": 3665821696, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 0.219, | |
| "grad_norm": 0.5620763301849365, | |
| "learning_rate": 3.188242125804078e-05, | |
| "loss": 1.5731, | |
| "num_input_tokens_seen": 3674210304, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 0.2195, | |
| "grad_norm": 0.6232312917709351, | |
| "learning_rate": 3.181915428493515e-05, | |
| "loss": 1.5685, | |
| "num_input_tokens_seen": 3682598912, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 0.6559078097343445, | |
| "learning_rate": 3.1755705045849465e-05, | |
| "loss": 1.6089, | |
| "num_input_tokens_seen": 3690987520, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.2205, | |
| "grad_norm": 0.5715806484222412, | |
| "learning_rate": 3.1692074519249476e-05, | |
| "loss": 1.5513, | |
| "num_input_tokens_seen": 3699376128, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 0.221, | |
| "grad_norm": 0.8140203952789307, | |
| "learning_rate": 3.1628263686396614e-05, | |
| "loss": 1.565, | |
| "num_input_tokens_seen": 3707764736, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 0.2215, | |
| "grad_norm": 0.7068142890930176, | |
| "learning_rate": 3.156427353133286e-05, | |
| "loss": 1.3783, | |
| "num_input_tokens_seen": 3716153344, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 0.222, | |
| "grad_norm": 0.5702927112579346, | |
| "learning_rate": 3.150010504086558e-05, | |
| "loss": 1.4583, | |
| "num_input_tokens_seen": 3724541952, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 0.2225, | |
| "grad_norm": 0.5769900679588318, | |
| "learning_rate": 3.1435759204552246e-05, | |
| "loss": 1.688, | |
| "num_input_tokens_seen": 3732930560, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.223, | |
| "grad_norm": 0.7947237491607666, | |
| "learning_rate": 3.1371237014685285e-05, | |
| "loss": 1.4639, | |
| "num_input_tokens_seen": 3741319168, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 0.2235, | |
| "grad_norm": 1.5811595916748047, | |
| "learning_rate": 3.130653946627666e-05, | |
| "loss": 1.4475, | |
| "num_input_tokens_seen": 3749707776, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 0.224, | |
| "grad_norm": 0.7136287093162537, | |
| "learning_rate": 3.124166755704261e-05, | |
| "loss": 1.587, | |
| "num_input_tokens_seen": 3758096384, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 0.2245, | |
| "grad_norm": 0.8462235927581787, | |
| "learning_rate": 3.117662228738823e-05, | |
| "loss": 1.5585, | |
| "num_input_tokens_seen": 3766484992, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 0.225, | |
| "grad_norm": 0.8650524020195007, | |
| "learning_rate": 3.111140466039205e-05, | |
| "loss": 1.6918, | |
| "num_input_tokens_seen": 3774873600, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.2255, | |
| "grad_norm": 0.6417969465255737, | |
| "learning_rate": 3.104601568179054e-05, | |
| "loss": 1.5202, | |
| "num_input_tokens_seen": 3783262208, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 0.226, | |
| "grad_norm": 0.8439732193946838, | |
| "learning_rate": 3.098045635996264e-05, | |
| "loss": 1.531, | |
| "num_input_tokens_seen": 3791650816, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 0.2265, | |
| "grad_norm": 0.7102932929992676, | |
| "learning_rate": 3.09147277059142e-05, | |
| "loss": 1.594, | |
| "num_input_tokens_seen": 3800039424, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 0.227, | |
| "grad_norm": 0.7026742100715637, | |
| "learning_rate": 3.084883073326238e-05, | |
| "loss": 1.4399, | |
| "num_input_tokens_seen": 3808428032, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 0.2275, | |
| "grad_norm": 0.769747257232666, | |
| "learning_rate": 3.078276645822001e-05, | |
| "loss": 1.5332, | |
| "num_input_tokens_seen": 3816816640, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.228, | |
| "grad_norm": 0.5745607018470764, | |
| "learning_rate": 3.0716535899579936e-05, | |
| "loss": 1.5472, | |
| "num_input_tokens_seen": 3825205248, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 0.2285, | |
| "grad_norm": 0.5724217295646667, | |
| "learning_rate": 3.065014007869931e-05, | |
| "loss": 1.4897, | |
| "num_input_tokens_seen": 3833593856, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 0.229, | |
| "grad_norm": 0.6574141979217529, | |
| "learning_rate": 3.058358001948381e-05, | |
| "loss": 1.5466, | |
| "num_input_tokens_seen": 3841982464, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 0.2295, | |
| "grad_norm": 0.5983175039291382, | |
| "learning_rate": 3.0516856748371914e-05, | |
| "loss": 1.6937, | |
| "num_input_tokens_seen": 3850371072, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 0.607641875743866, | |
| "learning_rate": 3.0449971294318977e-05, | |
| "loss": 1.5146, | |
| "num_input_tokens_seen": 3858759680, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.2305, | |
| "grad_norm": 0.7943156361579895, | |
| "learning_rate": 3.0382924688781462e-05, | |
| "loss": 1.4795, | |
| "num_input_tokens_seen": 3867148288, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 0.231, | |
| "grad_norm": 0.7880296111106873, | |
| "learning_rate": 3.031571796570095e-05, | |
| "loss": 1.595, | |
| "num_input_tokens_seen": 3875536896, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 0.2315, | |
| "grad_norm": 0.6290007829666138, | |
| "learning_rate": 3.0248352161488267e-05, | |
| "loss": 1.7595, | |
| "num_input_tokens_seen": 3883925504, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 0.232, | |
| "grad_norm": 0.8008227944374084, | |
| "learning_rate": 3.018082831500743e-05, | |
| "loss": 1.5703, | |
| "num_input_tokens_seen": 3892314112, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 0.2325, | |
| "grad_norm": 0.8298013210296631, | |
| "learning_rate": 3.0113147467559697e-05, | |
| "loss": 1.7367, | |
| "num_input_tokens_seen": 3900702720, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.233, | |
| "grad_norm": 0.6246507167816162, | |
| "learning_rate": 3.004531066286745e-05, | |
| "loss": 1.5911, | |
| "num_input_tokens_seen": 3909091328, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 0.2335, | |
| "grad_norm": 0.9494962096214294, | |
| "learning_rate": 2.997731894705815e-05, | |
| "loss": 1.6025, | |
| "num_input_tokens_seen": 3917479936, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 0.234, | |
| "grad_norm": 0.9917277097702026, | |
| "learning_rate": 2.9909173368648154e-05, | |
| "loss": 1.6784, | |
| "num_input_tokens_seen": 3925868544, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 0.2345, | |
| "grad_norm": 0.6494435667991638, | |
| "learning_rate": 2.9840874978526582e-05, | |
| "loss": 1.502, | |
| "num_input_tokens_seen": 3934257152, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 0.235, | |
| "grad_norm": 1.1123274564743042, | |
| "learning_rate": 2.9772424829939103e-05, | |
| "loss": 1.5277, | |
| "num_input_tokens_seen": 3942645760, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.2355, | |
| "grad_norm": 0.8015885353088379, | |
| "learning_rate": 2.9703823978471676e-05, | |
| "loss": 1.52, | |
| "num_input_tokens_seen": 3951034368, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 0.236, | |
| "grad_norm": 0.6025406122207642, | |
| "learning_rate": 2.9635073482034307e-05, | |
| "loss": 1.5666, | |
| "num_input_tokens_seen": 3959422976, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 0.2365, | |
| "grad_norm": 0.782339870929718, | |
| "learning_rate": 2.9566174400844692e-05, | |
| "loss": 1.5704, | |
| "num_input_tokens_seen": 3967811584, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 0.237, | |
| "grad_norm": 0.5577786564826965, | |
| "learning_rate": 2.949712779741189e-05, | |
| "loss": 1.5305, | |
| "num_input_tokens_seen": 3976200192, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 0.2375, | |
| "grad_norm": 0.6102334856987, | |
| "learning_rate": 2.9427934736519962e-05, | |
| "loss": 1.5881, | |
| "num_input_tokens_seen": 3984588800, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.238, | |
| "grad_norm": 0.47674456238746643, | |
| "learning_rate": 2.935859628521147e-05, | |
| "loss": 1.6161, | |
| "num_input_tokens_seen": 3992977408, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 0.2385, | |
| "grad_norm": 0.5674923658370972, | |
| "learning_rate": 2.9289113512771133e-05, | |
| "loss": 1.6069, | |
| "num_input_tokens_seen": 4001366016, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 0.239, | |
| "grad_norm": 0.6357706189155579, | |
| "learning_rate": 2.921948749070925e-05, | |
| "loss": 1.6064, | |
| "num_input_tokens_seen": 4009754624, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 0.2395, | |
| "grad_norm": 0.5138919949531555, | |
| "learning_rate": 2.914971929274521e-05, | |
| "loss": 1.4089, | |
| "num_input_tokens_seen": 4018143232, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 0.5422305464744568, | |
| "learning_rate": 2.9079809994790937e-05, | |
| "loss": 1.4315, | |
| "num_input_tokens_seen": 4026531840, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.2405, | |
| "grad_norm": 0.5790091156959534, | |
| "learning_rate": 2.900976067493429e-05, | |
| "loss": 1.6108, | |
| "num_input_tokens_seen": 4034920448, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 0.241, | |
| "grad_norm": 0.5500643849372864, | |
| "learning_rate": 2.8939572413422426e-05, | |
| "loss": 1.5919, | |
| "num_input_tokens_seen": 4043309056, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 0.2415, | |
| "grad_norm": 0.5417104959487915, | |
| "learning_rate": 2.886924629264517e-05, | |
| "loss": 1.5704, | |
| "num_input_tokens_seen": 4051697664, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 0.242, | |
| "grad_norm": 0.6382232904434204, | |
| "learning_rate": 2.8798783397118305e-05, | |
| "loss": 1.598, | |
| "num_input_tokens_seen": 4060086272, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 0.2425, | |
| "grad_norm": 0.59275221824646, | |
| "learning_rate": 2.872818481346684e-05, | |
| "loss": 1.5564, | |
| "num_input_tokens_seen": 4068474880, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.243, | |
| "grad_norm": 0.7460857033729553, | |
| "learning_rate": 2.8657451630408287e-05, | |
| "loss": 1.7001, | |
| "num_input_tokens_seen": 4076863488, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 0.2435, | |
| "grad_norm": 0.5753904581069946, | |
| "learning_rate": 2.85865849387358e-05, | |
| "loss": 1.5682, | |
| "num_input_tokens_seen": 4085252096, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 0.244, | |
| "grad_norm": 0.6409214735031128, | |
| "learning_rate": 2.8515585831301456e-05, | |
| "loss": 1.4705, | |
| "num_input_tokens_seen": 4093640704, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 0.2445, | |
| "grad_norm": 0.715114414691925, | |
| "learning_rate": 2.844445540299931e-05, | |
| "loss": 1.5677, | |
| "num_input_tokens_seen": 4102029312, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 0.245, | |
| "grad_norm": 0.5775078535079956, | |
| "learning_rate": 2.8373194750748566e-05, | |
| "loss": 1.6113, | |
| "num_input_tokens_seen": 4110417920, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.2455, | |
| "grad_norm": 0.621399998664856, | |
| "learning_rate": 2.8301804973476628e-05, | |
| "loss": 1.6211, | |
| "num_input_tokens_seen": 4118806528, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 0.246, | |
| "grad_norm": 0.6096208095550537, | |
| "learning_rate": 2.823028717210218e-05, | |
| "loss": 1.6534, | |
| "num_input_tokens_seen": 4127195136, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 0.2465, | |
| "grad_norm": 0.6161178946495056, | |
| "learning_rate": 2.8158642449518186e-05, | |
| "loss": 1.4828, | |
| "num_input_tokens_seen": 4135583744, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 0.247, | |
| "grad_norm": 0.5037221908569336, | |
| "learning_rate": 2.8086871910574904e-05, | |
| "loss": 1.5372, | |
| "num_input_tokens_seen": 4143972352, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 0.2475, | |
| "grad_norm": 3.227553367614746, | |
| "learning_rate": 2.8014976662062818e-05, | |
| "loss": 1.5292, | |
| "num_input_tokens_seen": 4152360960, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.248, | |
| "grad_norm": 0.6233062744140625, | |
| "learning_rate": 2.7942957812695613e-05, | |
| "loss": 1.5397, | |
| "num_input_tokens_seen": 4160749568, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 0.2485, | |
| "grad_norm": 0.5814934372901917, | |
| "learning_rate": 2.787081647309303e-05, | |
| "loss": 1.472, | |
| "num_input_tokens_seen": 4169138176, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 0.249, | |
| "grad_norm": 0.6146552562713623, | |
| "learning_rate": 2.7798553755763768e-05, | |
| "loss": 1.4988, | |
| "num_input_tokens_seen": 4177526784, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 0.2495, | |
| "grad_norm": 0.5402436852455139, | |
| "learning_rate": 2.7726170775088324e-05, | |
| "loss": 1.6962, | |
| "num_input_tokens_seen": 4185915392, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 0.6572995185852051, | |
| "learning_rate": 2.7653668647301797e-05, | |
| "loss": 1.5652, | |
| "num_input_tokens_seen": 4194304000, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.2505, | |
| "grad_norm": 0.6256840229034424, | |
| "learning_rate": 2.7581048490476695e-05, | |
| "loss": 1.6247, | |
| "num_input_tokens_seen": 4202692608, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 0.251, | |
| "grad_norm": 0.5442299842834473, | |
| "learning_rate": 2.7508311424505665e-05, | |
| "loss": 1.5023, | |
| "num_input_tokens_seen": 4211081216, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 0.2515, | |
| "grad_norm": 0.7635493278503418, | |
| "learning_rate": 2.7435458571084247e-05, | |
| "loss": 1.3966, | |
| "num_input_tokens_seen": 4219469824, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 0.252, | |
| "grad_norm": 0.7595793008804321, | |
| "learning_rate": 2.7362491053693564e-05, | |
| "loss": 1.5442, | |
| "num_input_tokens_seen": 4227858432, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 0.2525, | |
| "grad_norm": 0.6373950839042664, | |
| "learning_rate": 2.7289409997583002e-05, | |
| "loss": 1.639, | |
| "num_input_tokens_seen": 4236247040, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.253, | |
| "grad_norm": 0.7036408185958862, | |
| "learning_rate": 2.7216216529752836e-05, | |
| "loss": 1.5527, | |
| "num_input_tokens_seen": 4244635648, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 0.2535, | |
| "grad_norm": 0.676279604434967, | |
| "learning_rate": 2.7142911778936913e-05, | |
| "loss": 1.447, | |
| "num_input_tokens_seen": 4253024256, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 0.254, | |
| "grad_norm": 0.8009625673294067, | |
| "learning_rate": 2.7069496875585145e-05, | |
| "loss": 1.6359, | |
| "num_input_tokens_seen": 4261412864, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 0.2545, | |
| "grad_norm": 0.7147114872932434, | |
| "learning_rate": 2.6995972951846177e-05, | |
| "loss": 1.5243, | |
| "num_input_tokens_seen": 4269801472, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 0.255, | |
| "grad_norm": 0.808479905128479, | |
| "learning_rate": 2.692234114154986e-05, | |
| "loss": 1.3225, | |
| "num_input_tokens_seen": 4278190080, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.2555, | |
| "grad_norm": 0.7318254113197327, | |
| "learning_rate": 2.68486025801898e-05, | |
| "loss": 1.3925, | |
| "num_input_tokens_seen": 4286578688, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 0.256, | |
| "grad_norm": 1.4810857772827148, | |
| "learning_rate": 2.6774758404905833e-05, | |
| "loss": 1.5665, | |
| "num_input_tokens_seen": 4294967296, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 0.2565, | |
| "grad_norm": 0.6845510601997375, | |
| "learning_rate": 2.670080975446648e-05, | |
| "loss": 1.5702, | |
| "num_input_tokens_seen": 4303355904, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 0.257, | |
| "grad_norm": 0.7219899296760559, | |
| "learning_rate": 2.662675776925142e-05, | |
| "loss": 1.602, | |
| "num_input_tokens_seen": 4311744512, | |
| "step": 514 | |
| }, | |
| { | |
| "epoch": 0.2575, | |
| "grad_norm": 0.6542209982872009, | |
| "learning_rate": 2.6552603591233875e-05, | |
| "loss": 1.7091, | |
| "num_input_tokens_seen": 4320133120, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.258, | |
| "grad_norm": 0.7145106196403503, | |
| "learning_rate": 2.647834836396299e-05, | |
| "loss": 1.6122, | |
| "num_input_tokens_seen": 4328521728, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 0.2585, | |
| "grad_norm": 0.6836573481559753, | |
| "learning_rate": 2.6403993232546235e-05, | |
| "loss": 1.4866, | |
| "num_input_tokens_seen": 4336910336, | |
| "step": 517 | |
| }, | |
| { | |
| "epoch": 0.259, | |
| "grad_norm": 0.5894479751586914, | |
| "learning_rate": 2.6329539343631725e-05, | |
| "loss": 1.4505, | |
| "num_input_tokens_seen": 4345298944, | |
| "step": 518 | |
| }, | |
| { | |
| "epoch": 0.2595, | |
| "grad_norm": 0.6113694906234741, | |
| "learning_rate": 2.625498784539052e-05, | |
| "loss": 1.5411, | |
| "num_input_tokens_seen": 4353687552, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 0.5657572746276855, | |
| "learning_rate": 2.618033988749895e-05, | |
| "loss": 1.6, | |
| "num_input_tokens_seen": 4362076160, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.2605, | |
| "grad_norm": 0.5951750874519348, | |
| "learning_rate": 2.6105596621120873e-05, | |
| "loss": 1.4667, | |
| "num_input_tokens_seen": 4370464768, | |
| "step": 521 | |
| }, | |
| { | |
| "epoch": 0.261, | |
| "grad_norm": 0.5741889476776123, | |
| "learning_rate": 2.6030759198889915e-05, | |
| "loss": 1.3723, | |
| "num_input_tokens_seen": 4378853376, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 0.2615, | |
| "grad_norm": 0.7268882989883423, | |
| "learning_rate": 2.595582877489171e-05, | |
| "loss": 1.3528, | |
| "num_input_tokens_seen": 4387241984, | |
| "step": 523 | |
| }, | |
| { | |
| "epoch": 0.262, | |
| "grad_norm": 0.5853300094604492, | |
| "learning_rate": 2.588080650464608e-05, | |
| "loss": 1.7055, | |
| "num_input_tokens_seen": 4395630592, | |
| "step": 524 | |
| }, | |
| { | |
| "epoch": 0.2625, | |
| "grad_norm": 0.5188358426094055, | |
| "learning_rate": 2.580569354508925e-05, | |
| "loss": 1.5842, | |
| "num_input_tokens_seen": 4404019200, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.263, | |
| "grad_norm": 0.6717216372489929, | |
| "learning_rate": 2.573049105455597e-05, | |
| "loss": 1.6009, | |
| "num_input_tokens_seen": 4412407808, | |
| "step": 526 | |
| }, | |
| { | |
| "epoch": 0.2635, | |
| "grad_norm": 0.6268966794013977, | |
| "learning_rate": 2.5655200192761668e-05, | |
| "loss": 1.4833, | |
| "num_input_tokens_seen": 4420796416, | |
| "step": 527 | |
| }, | |
| { | |
| "epoch": 0.264, | |
| "grad_norm": 0.5843154788017273, | |
| "learning_rate": 2.557982212078459e-05, | |
| "loss": 1.4979, | |
| "num_input_tokens_seen": 4429185024, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 0.2645, | |
| "grad_norm": 0.6628950834274292, | |
| "learning_rate": 2.550435800104783e-05, | |
| "loss": 1.5867, | |
| "num_input_tokens_seen": 4437573632, | |
| "step": 529 | |
| }, | |
| { | |
| "epoch": 0.265, | |
| "grad_norm": 0.7546433806419373, | |
| "learning_rate": 2.5428808997301486e-05, | |
| "loss": 1.5423, | |
| "num_input_tokens_seen": 4445962240, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.2655, | |
| "grad_norm": 0.5483337640762329, | |
| "learning_rate": 2.535317627460465e-05, | |
| "loss": 1.4613, | |
| "num_input_tokens_seen": 4454350848, | |
| "step": 531 | |
| }, | |
| { | |
| "epoch": 0.266, | |
| "grad_norm": 0.5763765573501587, | |
| "learning_rate": 2.5277460999307462e-05, | |
| "loss": 1.5069, | |
| "num_input_tokens_seen": 4462739456, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 0.2665, | |
| "grad_norm": 0.7406538724899292, | |
| "learning_rate": 2.5201664339033138e-05, | |
| "loss": 1.4382, | |
| "num_input_tokens_seen": 4471128064, | |
| "step": 533 | |
| }, | |
| { | |
| "epoch": 0.267, | |
| "grad_norm": 0.5709603428840637, | |
| "learning_rate": 2.5125787462659937e-05, | |
| "loss": 1.3419, | |
| "num_input_tokens_seen": 4479516672, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 0.2675, | |
| "grad_norm": 0.7651158571243286, | |
| "learning_rate": 2.504983154030316e-05, | |
| "loss": 1.5017, | |
| "num_input_tokens_seen": 4487905280, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.268, | |
| "grad_norm": 0.6950023174285889, | |
| "learning_rate": 2.4973797743297103e-05, | |
| "loss": 1.4883, | |
| "num_input_tokens_seen": 4496293888, | |
| "step": 536 | |
| }, | |
| { | |
| "epoch": 0.2685, | |
| "grad_norm": 0.5981586575508118, | |
| "learning_rate": 2.489768724417695e-05, | |
| "loss": 1.5267, | |
| "num_input_tokens_seen": 4504682496, | |
| "step": 537 | |
| }, | |
| { | |
| "epoch": 0.269, | |
| "grad_norm": 0.6037119030952454, | |
| "learning_rate": 2.4821501216660778e-05, | |
| "loss": 1.6222, | |
| "num_input_tokens_seen": 4513071104, | |
| "step": 538 | |
| }, | |
| { | |
| "epoch": 0.2695, | |
| "grad_norm": 0.6097348928451538, | |
| "learning_rate": 2.474524083563136e-05, | |
| "loss": 1.3873, | |
| "num_input_tokens_seen": 4521459712, | |
| "step": 539 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 0.5978874564170837, | |
| "learning_rate": 2.4668907277118114e-05, | |
| "loss": 1.567, | |
| "num_input_tokens_seen": 4529848320, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.2705, | |
| "grad_norm": 0.5034315586090088, | |
| "learning_rate": 2.459250171827894e-05, | |
| "loss": 1.4653, | |
| "num_input_tokens_seen": 4538236928, | |
| "step": 541 | |
| }, | |
| { | |
| "epoch": 0.271, | |
| "grad_norm": 0.5134051442146301, | |
| "learning_rate": 2.4516025337382078e-05, | |
| "loss": 1.469, | |
| "num_input_tokens_seen": 4546625536, | |
| "step": 542 | |
| }, | |
| { | |
| "epoch": 0.2715, | |
| "grad_norm": 0.5771511197090149, | |
| "learning_rate": 2.443947931378792e-05, | |
| "loss": 1.4125, | |
| "num_input_tokens_seen": 4555014144, | |
| "step": 543 | |
| }, | |
| { | |
| "epoch": 0.272, | |
| "grad_norm": 0.5153135061264038, | |
| "learning_rate": 2.4362864827930855e-05, | |
| "loss": 1.3929, | |
| "num_input_tokens_seen": 4563402752, | |
| "step": 544 | |
| }, | |
| { | |
| "epoch": 0.2725, | |
| "grad_norm": 0.47954922914505005, | |
| "learning_rate": 2.4286183061301016e-05, | |
| "loss": 1.606, | |
| "num_input_tokens_seen": 4571791360, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.273, | |
| "grad_norm": 0.46164366602897644, | |
| "learning_rate": 2.4209435196426112e-05, | |
| "loss": 1.5043, | |
| "num_input_tokens_seen": 4580179968, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 0.2735, | |
| "grad_norm": 0.5116024613380432, | |
| "learning_rate": 2.4132622416853164e-05, | |
| "loss": 1.6474, | |
| "num_input_tokens_seen": 4588568576, | |
| "step": 547 | |
| }, | |
| { | |
| "epoch": 0.274, | |
| "grad_norm": 0.5638750195503235, | |
| "learning_rate": 2.405574590713025e-05, | |
| "loss": 1.4535, | |
| "num_input_tokens_seen": 4596957184, | |
| "step": 548 | |
| }, | |
| { | |
| "epoch": 0.2745, | |
| "grad_norm": 0.6490784287452698, | |
| "learning_rate": 2.3978806852788253e-05, | |
| "loss": 1.4552, | |
| "num_input_tokens_seen": 4605345792, | |
| "step": 549 | |
| }, | |
| { | |
| "epoch": 0.275, | |
| "grad_norm": 0.6507761478424072, | |
| "learning_rate": 2.390180644032257e-05, | |
| "loss": 1.4237, | |
| "num_input_tokens_seen": 4613734400, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.2755, | |
| "grad_norm": 0.5712783932685852, | |
| "learning_rate": 2.382474585717481e-05, | |
| "loss": 1.4194, | |
| "num_input_tokens_seen": 4622123008, | |
| "step": 551 | |
| }, | |
| { | |
| "epoch": 0.276, | |
| "grad_norm": 0.6233762502670288, | |
| "learning_rate": 2.37476262917145e-05, | |
| "loss": 1.5597, | |
| "num_input_tokens_seen": 4630511616, | |
| "step": 552 | |
| }, | |
| { | |
| "epoch": 0.2765, | |
| "grad_norm": 0.6161183714866638, | |
| "learning_rate": 2.3670448933220732e-05, | |
| "loss": 1.6496, | |
| "num_input_tokens_seen": 4638900224, | |
| "step": 553 | |
| }, | |
| { | |
| "epoch": 0.277, | |
| "grad_norm": 0.5277646780014038, | |
| "learning_rate": 2.3593214971863857e-05, | |
| "loss": 1.49, | |
| "num_input_tokens_seen": 4647288832, | |
| "step": 554 | |
| }, | |
| { | |
| "epoch": 0.2775, | |
| "grad_norm": 0.5984926819801331, | |
| "learning_rate": 2.3515925598687097e-05, | |
| "loss": 1.5425, | |
| "num_input_tokens_seen": 4655677440, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.278, | |
| "grad_norm": 0.650562584400177, | |
| "learning_rate": 2.3438582005588192e-05, | |
| "loss": 1.3518, | |
| "num_input_tokens_seen": 4664066048, | |
| "step": 556 | |
| }, | |
| { | |
| "epoch": 0.2785, | |
| "grad_norm": 0.5822499394416809, | |
| "learning_rate": 2.3361185385301042e-05, | |
| "loss": 1.5796, | |
| "num_input_tokens_seen": 4672454656, | |
| "step": 557 | |
| }, | |
| { | |
| "epoch": 0.279, | |
| "grad_norm": 0.5401993989944458, | |
| "learning_rate": 2.328373693137726e-05, | |
| "loss": 1.569, | |
| "num_input_tokens_seen": 4680843264, | |
| "step": 558 | |
| }, | |
| { | |
| "epoch": 0.2795, | |
| "grad_norm": 0.5291101336479187, | |
| "learning_rate": 2.3206237838167825e-05, | |
| "loss": 1.5372, | |
| "num_input_tokens_seen": 4689231872, | |
| "step": 559 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 0.5063018202781677, | |
| "learning_rate": 2.312868930080462e-05, | |
| "loss": 1.4165, | |
| "num_input_tokens_seen": 4697620480, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.2805, | |
| "grad_norm": 0.5932354927062988, | |
| "learning_rate": 2.3051092515182022e-05, | |
| "loss": 1.4155, | |
| "num_input_tokens_seen": 4706009088, | |
| "step": 561 | |
| }, | |
| { | |
| "epoch": 0.281, | |
| "grad_norm": 0.7603226900100708, | |
| "learning_rate": 2.2973448677938466e-05, | |
| "loss": 1.4976, | |
| "num_input_tokens_seen": 4714397696, | |
| "step": 562 | |
| }, | |
| { | |
| "epoch": 0.2815, | |
| "grad_norm": 0.5681598782539368, | |
| "learning_rate": 2.289575898643796e-05, | |
| "loss": 1.4042, | |
| "num_input_tokens_seen": 4722786304, | |
| "step": 563 | |
| }, | |
| { | |
| "epoch": 0.282, | |
| "grad_norm": 0.5139986872673035, | |
| "learning_rate": 2.2818024638751655e-05, | |
| "loss": 1.543, | |
| "num_input_tokens_seen": 4731174912, | |
| "step": 564 | |
| }, | |
| { | |
| "epoch": 0.2825, | |
| "grad_norm": 0.5610659122467041, | |
| "learning_rate": 2.2740246833639366e-05, | |
| "loss": 1.5532, | |
| "num_input_tokens_seen": 4739563520, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.283, | |
| "grad_norm": 0.5930039286613464, | |
| "learning_rate": 2.266242677053105e-05, | |
| "loss": 1.5949, | |
| "num_input_tokens_seen": 4747952128, | |
| "step": 566 | |
| }, | |
| { | |
| "epoch": 0.2835, | |
| "grad_norm": 0.6450530886650085, | |
| "learning_rate": 2.2584565649508355e-05, | |
| "loss": 1.4703, | |
| "num_input_tokens_seen": 4756340736, | |
| "step": 567 | |
| }, | |
| { | |
| "epoch": 0.284, | |
| "grad_norm": 0.5637064576148987, | |
| "learning_rate": 2.2506664671286087e-05, | |
| "loss": 1.4424, | |
| "num_input_tokens_seen": 4764729344, | |
| "step": 568 | |
| }, | |
| { | |
| "epoch": 0.2845, | |
| "grad_norm": 0.5345636606216431, | |
| "learning_rate": 2.2428725037193697e-05, | |
| "loss": 1.5623, | |
| "num_input_tokens_seen": 4773117952, | |
| "step": 569 | |
| }, | |
| { | |
| "epoch": 0.285, | |
| "grad_norm": 0.5629446506500244, | |
| "learning_rate": 2.2350747949156756e-05, | |
| "loss": 1.5205, | |
| "num_input_tokens_seen": 4781506560, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.2855, | |
| "grad_norm": 0.6573411822319031, | |
| "learning_rate": 2.2272734609678426e-05, | |
| "loss": 1.5845, | |
| "num_input_tokens_seen": 4789895168, | |
| "step": 571 | |
| }, | |
| { | |
| "epoch": 0.286, | |
| "grad_norm": 0.4794755280017853, | |
| "learning_rate": 2.2194686221820905e-05, | |
| "loss": 1.3785, | |
| "num_input_tokens_seen": 4798283776, | |
| "step": 572 | |
| }, | |
| { | |
| "epoch": 0.2865, | |
| "grad_norm": 0.5555365085601807, | |
| "learning_rate": 2.2116603989186895e-05, | |
| "loss": 1.6039, | |
| "num_input_tokens_seen": 4806672384, | |
| "step": 573 | |
| }, | |
| { | |
| "epoch": 0.287, | |
| "grad_norm": 0.5825105309486389, | |
| "learning_rate": 2.2038489115901e-05, | |
| "loss": 1.4356, | |
| "num_input_tokens_seen": 4815060992, | |
| "step": 574 | |
| }, | |
| { | |
| "epoch": 0.2875, | |
| "grad_norm": 0.52816241979599, | |
| "learning_rate": 2.196034280659122e-05, | |
| "loss": 1.5229, | |
| "num_input_tokens_seen": 4823449600, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.288, | |
| "grad_norm": 0.5466467142105103, | |
| "learning_rate": 2.1882166266370292e-05, | |
| "loss": 1.5349, | |
| "num_input_tokens_seen": 4831838208, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 0.2885, | |
| "grad_norm": 0.560653567314148, | |
| "learning_rate": 2.1803960700817185e-05, | |
| "loss": 1.5525, | |
| "num_input_tokens_seen": 4840226816, | |
| "step": 577 | |
| }, | |
| { | |
| "epoch": 0.289, | |
| "grad_norm": 0.5358963012695312, | |
| "learning_rate": 2.1725727315958473e-05, | |
| "loss": 1.4915, | |
| "num_input_tokens_seen": 4848615424, | |
| "step": 578 | |
| }, | |
| { | |
| "epoch": 0.2895, | |
| "grad_norm": 0.5298023223876953, | |
| "learning_rate": 2.1647467318249715e-05, | |
| "loss": 1.4079, | |
| "num_input_tokens_seen": 4857004032, | |
| "step": 579 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 0.6100833415985107, | |
| "learning_rate": 2.1569181914556904e-05, | |
| "loss": 1.6071, | |
| "num_input_tokens_seen": 4865392640, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.2905, | |
| "grad_norm": 0.47884467244148254, | |
| "learning_rate": 2.1490872312137795e-05, | |
| "loss": 1.4504, | |
| "num_input_tokens_seen": 4873781248, | |
| "step": 581 | |
| }, | |
| { | |
| "epoch": 0.291, | |
| "grad_norm": 0.5199120044708252, | |
| "learning_rate": 2.1412539718623337e-05, | |
| "loss": 1.4338, | |
| "num_input_tokens_seen": 4882169856, | |
| "step": 582 | |
| }, | |
| { | |
| "epoch": 0.2915, | |
| "grad_norm": 0.4816211760044098, | |
| "learning_rate": 2.1334185341999024e-05, | |
| "loss": 1.6256, | |
| "num_input_tokens_seen": 4890558464, | |
| "step": 583 | |
| }, | |
| { | |
| "epoch": 0.292, | |
| "grad_norm": 0.48673585057258606, | |
| "learning_rate": 2.125581039058627e-05, | |
| "loss": 1.4931, | |
| "num_input_tokens_seen": 4898947072, | |
| "step": 584 | |
| }, | |
| { | |
| "epoch": 0.2925, | |
| "grad_norm": 0.5214781761169434, | |
| "learning_rate": 2.117741607302378e-05, | |
| "loss": 1.4079, | |
| "num_input_tokens_seen": 4907335680, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.293, | |
| "grad_norm": 0.49936363101005554, | |
| "learning_rate": 2.109900359824892e-05, | |
| "loss": 1.683, | |
| "num_input_tokens_seen": 4915724288, | |
| "step": 586 | |
| }, | |
| { | |
| "epoch": 0.2935, | |
| "grad_norm": 0.4919891357421875, | |
| "learning_rate": 2.1020574175479035e-05, | |
| "loss": 1.4582, | |
| "num_input_tokens_seen": 4924112896, | |
| "step": 587 | |
| }, | |
| { | |
| "epoch": 0.294, | |
| "grad_norm": 0.5366162061691284, | |
| "learning_rate": 2.0942129014192854e-05, | |
| "loss": 1.4047, | |
| "num_input_tokens_seen": 4932501504, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 0.2945, | |
| "grad_norm": 0.49260836839675903, | |
| "learning_rate": 2.0863669324111807e-05, | |
| "loss": 1.5952, | |
| "num_input_tokens_seen": 4940890112, | |
| "step": 589 | |
| }, | |
| { | |
| "epoch": 0.295, | |
| "grad_norm": 0.4582832455635071, | |
| "learning_rate": 2.0785196315181374e-05, | |
| "loss": 1.6684, | |
| "num_input_tokens_seen": 4949278720, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.2955, | |
| "grad_norm": 0.5586093068122864, | |
| "learning_rate": 2.0706711197552427e-05, | |
| "loss": 1.6649, | |
| "num_input_tokens_seen": 4957667328, | |
| "step": 591 | |
| }, | |
| { | |
| "epoch": 0.296, | |
| "grad_norm": 0.5372852087020874, | |
| "learning_rate": 2.0628215181562567e-05, | |
| "loss": 1.5242, | |
| "num_input_tokens_seen": 4966055936, | |
| "step": 592 | |
| }, | |
| { | |
| "epoch": 0.2965, | |
| "grad_norm": 0.5798465609550476, | |
| "learning_rate": 2.054970947771747e-05, | |
| "loss": 1.5445, | |
| "num_input_tokens_seen": 4974444544, | |
| "step": 593 | |
| }, | |
| { | |
| "epoch": 0.297, | |
| "grad_norm": 0.4393630623817444, | |
| "learning_rate": 2.0471195296672207e-05, | |
| "loss": 1.5248, | |
| "num_input_tokens_seen": 4982833152, | |
| "step": 594 | |
| }, | |
| { | |
| "epoch": 0.2975, | |
| "grad_norm": 0.4671383202075958, | |
| "learning_rate": 2.0392673849212565e-05, | |
| "loss": 1.5268, | |
| "num_input_tokens_seen": 4991221760, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.298, | |
| "grad_norm": 0.49570146203041077, | |
| "learning_rate": 2.0314146346236415e-05, | |
| "loss": 1.7117, | |
| "num_input_tokens_seen": 4999610368, | |
| "step": 596 | |
| }, | |
| { | |
| "epoch": 0.2985, | |
| "grad_norm": 0.4883139133453369, | |
| "learning_rate": 2.0235613998734985e-05, | |
| "loss": 1.4293, | |
| "num_input_tokens_seen": 5007998976, | |
| "step": 597 | |
| }, | |
| { | |
| "epoch": 0.299, | |
| "grad_norm": 0.5154351592063904, | |
| "learning_rate": 2.0157078017774228e-05, | |
| "loss": 1.6194, | |
| "num_input_tokens_seen": 5016387584, | |
| "step": 598 | |
| }, | |
| { | |
| "epoch": 0.2995, | |
| "grad_norm": 0.5525362491607666, | |
| "learning_rate": 2.0078539614476122e-05, | |
| "loss": 1.5709, | |
| "num_input_tokens_seen": 5024776192, | |
| "step": 599 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 0.4198490083217621, | |
| "learning_rate": 2e-05, | |
| "loss": 1.677, | |
| "num_input_tokens_seen": 5033164800, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.3005, | |
| "grad_norm": 0.6447516083717346, | |
| "learning_rate": 1.9921460385523884e-05, | |
| "loss": 1.743, | |
| "num_input_tokens_seen": 5041553408, | |
| "step": 601 | |
| }, | |
| { | |
| "epoch": 0.301, | |
| "grad_norm": 0.5225803256034851, | |
| "learning_rate": 1.9842921982225782e-05, | |
| "loss": 1.4039, | |
| "num_input_tokens_seen": 5049942016, | |
| "step": 602 | |
| }, | |
| { | |
| "epoch": 0.3015, | |
| "grad_norm": 0.46639594435691833, | |
| "learning_rate": 1.9764386001265015e-05, | |
| "loss": 1.5738, | |
| "num_input_tokens_seen": 5058330624, | |
| "step": 603 | |
| }, | |
| { | |
| "epoch": 0.302, | |
| "grad_norm": 0.46976351737976074, | |
| "learning_rate": 1.9685853653763592e-05, | |
| "loss": 1.3899, | |
| "num_input_tokens_seen": 5066719232, | |
| "step": 604 | |
| }, | |
| { | |
| "epoch": 0.3025, | |
| "grad_norm": 0.5326806902885437, | |
| "learning_rate": 1.960732615078744e-05, | |
| "loss": 1.4218, | |
| "num_input_tokens_seen": 5075107840, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 0.303, | |
| "grad_norm": 0.6198422312736511, | |
| "learning_rate": 1.95288047033278e-05, | |
| "loss": 1.3974, | |
| "num_input_tokens_seen": 5083496448, | |
| "step": 606 | |
| }, | |
| { | |
| "epoch": 0.3035, | |
| "grad_norm": 0.6740239858627319, | |
| "learning_rate": 1.9450290522282533e-05, | |
| "loss": 1.5228, | |
| "num_input_tokens_seen": 5091885056, | |
| "step": 607 | |
| }, | |
| { | |
| "epoch": 0.304, | |
| "grad_norm": 0.4955751895904541, | |
| "learning_rate": 1.9371784818437436e-05, | |
| "loss": 1.701, | |
| "num_input_tokens_seen": 5100273664, | |
| "step": 608 | |
| }, | |
| { | |
| "epoch": 0.3045, | |
| "grad_norm": 0.5521119236946106, | |
| "learning_rate": 1.929328880244758e-05, | |
| "loss": 1.6082, | |
| "num_input_tokens_seen": 5108662272, | |
| "step": 609 | |
| }, | |
| { | |
| "epoch": 0.305, | |
| "grad_norm": 0.6112958788871765, | |
| "learning_rate": 1.9214803684818636e-05, | |
| "loss": 1.4346, | |
| "num_input_tokens_seen": 5117050880, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.3055, | |
| "grad_norm": 0.6295256018638611, | |
| "learning_rate": 1.9136330675888192e-05, | |
| "loss": 1.5434, | |
| "num_input_tokens_seen": 5125439488, | |
| "step": 611 | |
| }, | |
| { | |
| "epoch": 0.306, | |
| "grad_norm": 0.5456522703170776, | |
| "learning_rate": 1.905787098580715e-05, | |
| "loss": 1.5229, | |
| "num_input_tokens_seen": 5133828096, | |
| "step": 612 | |
| }, | |
| { | |
| "epoch": 0.3065, | |
| "grad_norm": 0.640235424041748, | |
| "learning_rate": 1.897942582452097e-05, | |
| "loss": 1.6001, | |
| "num_input_tokens_seen": 5142216704, | |
| "step": 613 | |
| }, | |
| { | |
| "epoch": 0.307, | |
| "grad_norm": 0.5226327776908875, | |
| "learning_rate": 1.890099640175109e-05, | |
| "loss": 1.4943, | |
| "num_input_tokens_seen": 5150605312, | |
| "step": 614 | |
| }, | |
| { | |
| "epoch": 0.3075, | |
| "grad_norm": 0.5394514799118042, | |
| "learning_rate": 1.882258392697622e-05, | |
| "loss": 1.5993, | |
| "num_input_tokens_seen": 5158993920, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 0.308, | |
| "grad_norm": 0.5722643136978149, | |
| "learning_rate": 1.8744189609413733e-05, | |
| "loss": 1.5002, | |
| "num_input_tokens_seen": 5167382528, | |
| "step": 616 | |
| }, | |
| { | |
| "epoch": 0.3085, | |
| "grad_norm": 0.5504662394523621, | |
| "learning_rate": 1.8665814658000982e-05, | |
| "loss": 1.5726, | |
| "num_input_tokens_seen": 5175771136, | |
| "step": 617 | |
| }, | |
| { | |
| "epoch": 0.309, | |
| "grad_norm": 0.5669524073600769, | |
| "learning_rate": 1.8587460281376673e-05, | |
| "loss": 1.4174, | |
| "num_input_tokens_seen": 5184159744, | |
| "step": 618 | |
| }, | |
| { | |
| "epoch": 0.3095, | |
| "grad_norm": 0.5515470504760742, | |
| "learning_rate": 1.8509127687862208e-05, | |
| "loss": 1.5665, | |
| "num_input_tokens_seen": 5192548352, | |
| "step": 619 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 0.5559475421905518, | |
| "learning_rate": 1.8430818085443106e-05, | |
| "loss": 1.6525, | |
| "num_input_tokens_seen": 5200936960, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.3105, | |
| "grad_norm": 0.5412907600402832, | |
| "learning_rate": 1.835253268175029e-05, | |
| "loss": 1.5381, | |
| "num_input_tokens_seen": 5209325568, | |
| "step": 621 | |
| }, | |
| { | |
| "epoch": 0.311, | |
| "grad_norm": 0.514880359172821, | |
| "learning_rate": 1.8274272684041537e-05, | |
| "loss": 1.347, | |
| "num_input_tokens_seen": 5217714176, | |
| "step": 622 | |
| }, | |
| { | |
| "epoch": 0.3115, | |
| "grad_norm": 0.661659300327301, | |
| "learning_rate": 1.8196039299182818e-05, | |
| "loss": 1.5291, | |
| "num_input_tokens_seen": 5226102784, | |
| "step": 623 | |
| }, | |
| { | |
| "epoch": 0.312, | |
| "grad_norm": 0.5264044404029846, | |
| "learning_rate": 1.8117833733629715e-05, | |
| "loss": 1.5451, | |
| "num_input_tokens_seen": 5234491392, | |
| "step": 624 | |
| }, | |
| { | |
| "epoch": 0.3125, | |
| "grad_norm": 0.6714223027229309, | |
| "learning_rate": 1.8039657193408788e-05, | |
| "loss": 1.5878, | |
| "num_input_tokens_seen": 5242880000, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.313, | |
| "grad_norm": 0.6652107834815979, | |
| "learning_rate": 1.7961510884099005e-05, | |
| "loss": 1.4576, | |
| "num_input_tokens_seen": 5251268608, | |
| "step": 626 | |
| }, | |
| { | |
| "epoch": 0.3135, | |
| "grad_norm": 0.4789321720600128, | |
| "learning_rate": 1.7883396010813116e-05, | |
| "loss": 1.6615, | |
| "num_input_tokens_seen": 5259657216, | |
| "step": 627 | |
| }, | |
| { | |
| "epoch": 0.314, | |
| "grad_norm": 0.5451124906539917, | |
| "learning_rate": 1.7805313778179095e-05, | |
| "loss": 1.506, | |
| "num_input_tokens_seen": 5268045824, | |
| "step": 628 | |
| }, | |
| { | |
| "epoch": 0.3145, | |
| "grad_norm": 0.6458382606506348, | |
| "learning_rate": 1.772726539032158e-05, | |
| "loss": 1.5675, | |
| "num_input_tokens_seen": 5276434432, | |
| "step": 629 | |
| }, | |
| { | |
| "epoch": 0.315, | |
| "grad_norm": 0.4400927424430847, | |
| "learning_rate": 1.764925205084325e-05, | |
| "loss": 1.3921, | |
| "num_input_tokens_seen": 5284823040, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.3155, | |
| "grad_norm": 0.7637887597084045, | |
| "learning_rate": 1.7571274962806316e-05, | |
| "loss": 1.3816, | |
| "num_input_tokens_seen": 5293211648, | |
| "step": 631 | |
| }, | |
| { | |
| "epoch": 0.316, | |
| "grad_norm": 0.5151621103286743, | |
| "learning_rate": 1.7493335328713913e-05, | |
| "loss": 1.4235, | |
| "num_input_tokens_seen": 5301600256, | |
| "step": 632 | |
| }, | |
| { | |
| "epoch": 0.3165, | |
| "grad_norm": 0.4786008298397064, | |
| "learning_rate": 1.741543435049165e-05, | |
| "loss": 1.5014, | |
| "num_input_tokens_seen": 5309988864, | |
| "step": 633 | |
| }, | |
| { | |
| "epoch": 0.317, | |
| "grad_norm": 0.5057694911956787, | |
| "learning_rate": 1.7337573229468958e-05, | |
| "loss": 1.5739, | |
| "num_input_tokens_seen": 5318377472, | |
| "step": 634 | |
| }, | |
| { | |
| "epoch": 0.3175, | |
| "grad_norm": 1.5806396007537842, | |
| "learning_rate": 1.7259753166360644e-05, | |
| "loss": 1.5426, | |
| "num_input_tokens_seen": 5326766080, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 0.318, | |
| "grad_norm": 0.45514407753944397, | |
| "learning_rate": 1.7181975361248348e-05, | |
| "loss": 1.6838, | |
| "num_input_tokens_seen": 5335154688, | |
| "step": 636 | |
| }, | |
| { | |
| "epoch": 0.3185, | |
| "grad_norm": 0.5524787902832031, | |
| "learning_rate": 1.7104241013562045e-05, | |
| "loss": 1.402, | |
| "num_input_tokens_seen": 5343543296, | |
| "step": 637 | |
| }, | |
| { | |
| "epoch": 0.319, | |
| "grad_norm": 0.47155940532684326, | |
| "learning_rate": 1.702655132206154e-05, | |
| "loss": 1.5449, | |
| "num_input_tokens_seen": 5351931904, | |
| "step": 638 | |
| }, | |
| { | |
| "epoch": 0.3195, | |
| "grad_norm": 0.5457002520561218, | |
| "learning_rate": 1.6948907484817985e-05, | |
| "loss": 1.5456, | |
| "num_input_tokens_seen": 5360320512, | |
| "step": 639 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.4440613389015198, | |
| "learning_rate": 1.687131069919538e-05, | |
| "loss": 1.598, | |
| "num_input_tokens_seen": 5368709120, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.3205, | |
| "grad_norm": 0.4771471619606018, | |
| "learning_rate": 1.679376216183218e-05, | |
| "loss": 1.4177, | |
| "num_input_tokens_seen": 5377097728, | |
| "step": 641 | |
| }, | |
| { | |
| "epoch": 0.321, | |
| "grad_norm": 0.4079873263835907, | |
| "learning_rate": 1.6716263068622744e-05, | |
| "loss": 1.5967, | |
| "num_input_tokens_seen": 5385486336, | |
| "step": 642 | |
| }, | |
| { | |
| "epoch": 0.3215, | |
| "grad_norm": 0.48604148626327515, | |
| "learning_rate": 1.6638814614698965e-05, | |
| "loss": 1.4505, | |
| "num_input_tokens_seen": 5393874944, | |
| "step": 643 | |
| }, | |
| { | |
| "epoch": 0.322, | |
| "grad_norm": 0.450703501701355, | |
| "learning_rate": 1.6561417994411808e-05, | |
| "loss": 1.4972, | |
| "num_input_tokens_seen": 5402263552, | |
| "step": 644 | |
| }, | |
| { | |
| "epoch": 0.3225, | |
| "grad_norm": 0.4407959580421448, | |
| "learning_rate": 1.648407440131291e-05, | |
| "loss": 1.6219, | |
| "num_input_tokens_seen": 5410652160, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 0.323, | |
| "grad_norm": 0.4869948923587799, | |
| "learning_rate": 1.640678502813615e-05, | |
| "loss": 1.577, | |
| "num_input_tokens_seen": 5419040768, | |
| "step": 646 | |
| }, | |
| { | |
| "epoch": 0.3235, | |
| "grad_norm": 0.4437527060508728, | |
| "learning_rate": 1.6329551066779278e-05, | |
| "loss": 1.4866, | |
| "num_input_tokens_seen": 5427429376, | |
| "step": 647 | |
| }, | |
| { | |
| "epoch": 0.324, | |
| "grad_norm": 0.39153116941452026, | |
| "learning_rate": 1.6252373708285505e-05, | |
| "loss": 1.3777, | |
| "num_input_tokens_seen": 5435817984, | |
| "step": 648 | |
| }, | |
| { | |
| "epoch": 0.3245, | |
| "grad_norm": 0.4412202835083008, | |
| "learning_rate": 1.6175254142825196e-05, | |
| "loss": 1.5029, | |
| "num_input_tokens_seen": 5444206592, | |
| "step": 649 | |
| }, | |
| { | |
| "epoch": 0.325, | |
| "grad_norm": 0.40190449357032776, | |
| "learning_rate": 1.609819355967744e-05, | |
| "loss": 1.5957, | |
| "num_input_tokens_seen": 5452595200, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.3255, | |
| "grad_norm": 0.40953540802001953, | |
| "learning_rate": 1.602119314721175e-05, | |
| "loss": 1.4924, | |
| "num_input_tokens_seen": 5460983808, | |
| "step": 651 | |
| }, | |
| { | |
| "epoch": 0.326, | |
| "grad_norm": 0.3974117934703827, | |
| "learning_rate": 1.5944254092869756e-05, | |
| "loss": 1.423, | |
| "num_input_tokens_seen": 5469372416, | |
| "step": 652 | |
| }, | |
| { | |
| "epoch": 0.3265, | |
| "grad_norm": 0.3962326943874359, | |
| "learning_rate": 1.5867377583146836e-05, | |
| "loss": 1.3479, | |
| "num_input_tokens_seen": 5477761024, | |
| "step": 653 | |
| }, | |
| { | |
| "epoch": 0.327, | |
| "grad_norm": 0.3850420415401459, | |
| "learning_rate": 1.579056480357389e-05, | |
| "loss": 1.3674, | |
| "num_input_tokens_seen": 5486149632, | |
| "step": 654 | |
| }, | |
| { | |
| "epoch": 0.3275, | |
| "grad_norm": 0.3804101049900055, | |
| "learning_rate": 1.571381693869899e-05, | |
| "loss": 1.6215, | |
| "num_input_tokens_seen": 5494538240, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 0.328, | |
| "grad_norm": 0.3984707295894623, | |
| "learning_rate": 1.5637135172069155e-05, | |
| "loss": 1.4702, | |
| "num_input_tokens_seen": 5502926848, | |
| "step": 656 | |
| }, | |
| { | |
| "epoch": 0.3285, | |
| "grad_norm": 0.4196572005748749, | |
| "learning_rate": 1.5560520686212083e-05, | |
| "loss": 1.3952, | |
| "num_input_tokens_seen": 5511315456, | |
| "step": 657 | |
| }, | |
| { | |
| "epoch": 0.329, | |
| "grad_norm": 0.4226216673851013, | |
| "learning_rate": 1.548397466261793e-05, | |
| "loss": 1.414, | |
| "num_input_tokens_seen": 5519704064, | |
| "step": 658 | |
| }, | |
| { | |
| "epoch": 0.3295, | |
| "grad_norm": 0.37507861852645874, | |
| "learning_rate": 1.5407498281721063e-05, | |
| "loss": 1.5991, | |
| "num_input_tokens_seen": 5528092672, | |
| "step": 659 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 0.36862072348594666, | |
| "learning_rate": 1.53310927228819e-05, | |
| "loss": 1.5811, | |
| "num_input_tokens_seen": 5536481280, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.3305, | |
| "grad_norm": 0.4065583646297455, | |
| "learning_rate": 1.5254759164368644e-05, | |
| "loss": 1.5603, | |
| "num_input_tokens_seen": 5544869888, | |
| "step": 661 | |
| }, | |
| { | |
| "epoch": 0.331, | |
| "grad_norm": 0.4069935381412506, | |
| "learning_rate": 1.517849878333923e-05, | |
| "loss": 1.4642, | |
| "num_input_tokens_seen": 5553258496, | |
| "step": 662 | |
| }, | |
| { | |
| "epoch": 0.3315, | |
| "grad_norm": 0.4405422806739807, | |
| "learning_rate": 1.5102312755823053e-05, | |
| "loss": 1.4566, | |
| "num_input_tokens_seen": 5561647104, | |
| "step": 663 | |
| }, | |
| { | |
| "epoch": 0.332, | |
| "grad_norm": 0.4785662293434143, | |
| "learning_rate": 1.5026202256702909e-05, | |
| "loss": 1.3465, | |
| "num_input_tokens_seen": 5570035712, | |
| "step": 664 | |
| }, | |
| { | |
| "epoch": 0.3325, | |
| "grad_norm": 0.40616917610168457, | |
| "learning_rate": 1.4950168459696841e-05, | |
| "loss": 1.5543, | |
| "num_input_tokens_seen": 5578424320, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 0.333, | |
| "grad_norm": 0.3933257460594177, | |
| "learning_rate": 1.4874212537340067e-05, | |
| "loss": 1.6011, | |
| "num_input_tokens_seen": 5586812928, | |
| "step": 666 | |
| }, | |
| { | |
| "epoch": 0.3335, | |
| "grad_norm": 0.4299718737602234, | |
| "learning_rate": 1.4798335660966869e-05, | |
| "loss": 1.3836, | |
| "num_input_tokens_seen": 5595201536, | |
| "step": 667 | |
| }, | |
| { | |
| "epoch": 0.334, | |
| "grad_norm": 0.36443138122558594, | |
| "learning_rate": 1.4722539000692548e-05, | |
| "loss": 1.5179, | |
| "num_input_tokens_seen": 5603590144, | |
| "step": 668 | |
| }, | |
| { | |
| "epoch": 0.3345, | |
| "grad_norm": 0.42329415678977966, | |
| "learning_rate": 1.4646823725395351e-05, | |
| "loss": 1.5232, | |
| "num_input_tokens_seen": 5611978752, | |
| "step": 669 | |
| }, | |
| { | |
| "epoch": 0.335, | |
| "grad_norm": 0.411154568195343, | |
| "learning_rate": 1.4571191002698517e-05, | |
| "loss": 1.5944, | |
| "num_input_tokens_seen": 5620367360, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.3355, | |
| "grad_norm": 0.3869856894016266, | |
| "learning_rate": 1.4495641998952172e-05, | |
| "loss": 1.5364, | |
| "num_input_tokens_seen": 5628755968, | |
| "step": 671 | |
| }, | |
| { | |
| "epoch": 0.336, | |
| "grad_norm": 0.41792887449264526, | |
| "learning_rate": 1.4420177879215419e-05, | |
| "loss": 1.4382, | |
| "num_input_tokens_seen": 5637144576, | |
| "step": 672 | |
| }, | |
| { | |
| "epoch": 0.3365, | |
| "grad_norm": 0.38612478971481323, | |
| "learning_rate": 1.434479980723833e-05, | |
| "loss": 1.6015, | |
| "num_input_tokens_seen": 5645533184, | |
| "step": 673 | |
| }, | |
| { | |
| "epoch": 0.337, | |
| "grad_norm": 0.45299631357192993, | |
| "learning_rate": 1.4269508945444033e-05, | |
| "loss": 1.5707, | |
| "num_input_tokens_seen": 5653921792, | |
| "step": 674 | |
| }, | |
| { | |
| "epoch": 0.3375, | |
| "grad_norm": 0.4732898473739624, | |
| "learning_rate": 1.4194306454910757e-05, | |
| "loss": 1.3082, | |
| "num_input_tokens_seen": 5662310400, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.338, | |
| "grad_norm": 0.4313901662826538, | |
| "learning_rate": 1.4119193495353925e-05, | |
| "loss": 1.4625, | |
| "num_input_tokens_seen": 5670699008, | |
| "step": 676 | |
| }, | |
| { | |
| "epoch": 0.3385, | |
| "grad_norm": 0.41319823265075684, | |
| "learning_rate": 1.40441712251083e-05, | |
| "loss": 1.5491, | |
| "num_input_tokens_seen": 5679087616, | |
| "step": 677 | |
| }, | |
| { | |
| "epoch": 0.339, | |
| "grad_norm": 0.4047788381576538, | |
| "learning_rate": 1.3969240801110088e-05, | |
| "loss": 1.4689, | |
| "num_input_tokens_seen": 5687476224, | |
| "step": 678 | |
| }, | |
| { | |
| "epoch": 0.3395, | |
| "grad_norm": 0.3699304461479187, | |
| "learning_rate": 1.3894403378879132e-05, | |
| "loss": 1.5072, | |
| "num_input_tokens_seen": 5695864832, | |
| "step": 679 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 0.4087560772895813, | |
| "learning_rate": 1.3819660112501054e-05, | |
| "loss": 1.4847, | |
| "num_input_tokens_seen": 5704253440, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.3405, | |
| "grad_norm": 0.4037911593914032, | |
| "learning_rate": 1.3745012154609492e-05, | |
| "loss": 1.4601, | |
| "num_input_tokens_seen": 5712642048, | |
| "step": 681 | |
| }, | |
| { | |
| "epoch": 0.341, | |
| "grad_norm": 0.41595056653022766, | |
| "learning_rate": 1.3670460656368278e-05, | |
| "loss": 1.5049, | |
| "num_input_tokens_seen": 5721030656, | |
| "step": 682 | |
| }, | |
| { | |
| "epoch": 0.3415, | |
| "grad_norm": 0.4090314209461212, | |
| "learning_rate": 1.3596006767453766e-05, | |
| "loss": 1.5409, | |
| "num_input_tokens_seen": 5729419264, | |
| "step": 683 | |
| }, | |
| { | |
| "epoch": 0.342, | |
| "grad_norm": 0.45403459668159485, | |
| "learning_rate": 1.3521651636037017e-05, | |
| "loss": 1.5363, | |
| "num_input_tokens_seen": 5737807872, | |
| "step": 684 | |
| }, | |
| { | |
| "epoch": 0.3425, | |
| "grad_norm": 0.45154640078544617, | |
| "learning_rate": 1.3447396408766134e-05, | |
| "loss": 1.4215, | |
| "num_input_tokens_seen": 5746196480, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 0.343, | |
| "grad_norm": 0.378068208694458, | |
| "learning_rate": 1.3373242230748579e-05, | |
| "loss": 1.5783, | |
| "num_input_tokens_seen": 5754585088, | |
| "step": 686 | |
| }, | |
| { | |
| "epoch": 0.3435, | |
| "grad_norm": 0.3453136086463928, | |
| "learning_rate": 1.3299190245533522e-05, | |
| "loss": 1.4638, | |
| "num_input_tokens_seen": 5762973696, | |
| "step": 687 | |
| }, | |
| { | |
| "epoch": 0.344, | |
| "grad_norm": 0.39520764350891113, | |
| "learning_rate": 1.3225241595094173e-05, | |
| "loss": 1.366, | |
| "num_input_tokens_seen": 5771362304, | |
| "step": 688 | |
| }, | |
| { | |
| "epoch": 0.3445, | |
| "grad_norm": 0.4085905849933624, | |
| "learning_rate": 1.3151397419810207e-05, | |
| "loss": 1.5515, | |
| "num_input_tokens_seen": 5779750912, | |
| "step": 689 | |
| }, | |
| { | |
| "epoch": 0.345, | |
| "grad_norm": 0.3634837567806244, | |
| "learning_rate": 1.3077658858450137e-05, | |
| "loss": 1.576, | |
| "num_input_tokens_seen": 5788139520, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.3455, | |
| "grad_norm": 0.421149879693985, | |
| "learning_rate": 1.3004027048153826e-05, | |
| "loss": 1.525, | |
| "num_input_tokens_seen": 5796528128, | |
| "step": 691 | |
| }, | |
| { | |
| "epoch": 0.346, | |
| "grad_norm": 0.49760574102401733, | |
| "learning_rate": 1.2930503124414862e-05, | |
| "loss": 1.582, | |
| "num_input_tokens_seen": 5804916736, | |
| "step": 692 | |
| }, | |
| { | |
| "epoch": 0.3465, | |
| "grad_norm": 0.4031181037425995, | |
| "learning_rate": 1.2857088221063099e-05, | |
| "loss": 1.4645, | |
| "num_input_tokens_seen": 5813305344, | |
| "step": 693 | |
| }, | |
| { | |
| "epoch": 0.347, | |
| "grad_norm": 0.4272618293762207, | |
| "learning_rate": 1.2783783470247164e-05, | |
| "loss": 1.5368, | |
| "num_input_tokens_seen": 5821693952, | |
| "step": 694 | |
| }, | |
| { | |
| "epoch": 0.3475, | |
| "grad_norm": 0.3895905315876007, | |
| "learning_rate": 1.2710590002417008e-05, | |
| "loss": 1.4271, | |
| "num_input_tokens_seen": 5830082560, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 0.348, | |
| "grad_norm": 0.3700122535228729, | |
| "learning_rate": 1.2637508946306443e-05, | |
| "loss": 1.6742, | |
| "num_input_tokens_seen": 5838471168, | |
| "step": 696 | |
| }, | |
| { | |
| "epoch": 0.3485, | |
| "grad_norm": 0.44432806968688965, | |
| "learning_rate": 1.2564541428915762e-05, | |
| "loss": 1.2532, | |
| "num_input_tokens_seen": 5846859776, | |
| "step": 697 | |
| }, | |
| { | |
| "epoch": 0.349, | |
| "grad_norm": 0.514743983745575, | |
| "learning_rate": 1.2491688575494337e-05, | |
| "loss": 1.4461, | |
| "num_input_tokens_seen": 5855248384, | |
| "step": 698 | |
| }, | |
| { | |
| "epoch": 0.3495, | |
| "grad_norm": 0.3472532629966736, | |
| "learning_rate": 1.2418951509523312e-05, | |
| "loss": 1.6218, | |
| "num_input_tokens_seen": 5863636992, | |
| "step": 699 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 0.583053469657898, | |
| "learning_rate": 1.2346331352698206e-05, | |
| "loss": 1.5894, | |
| "num_input_tokens_seen": 5872025600, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.3505, | |
| "grad_norm": 0.4825255870819092, | |
| "learning_rate": 1.2273829224911685e-05, | |
| "loss": 1.4588, | |
| "num_input_tokens_seen": 5880414208, | |
| "step": 701 | |
| }, | |
| { | |
| "epoch": 0.351, | |
| "grad_norm": 0.5164452195167542, | |
| "learning_rate": 1.2201446244236242e-05, | |
| "loss": 1.365, | |
| "num_input_tokens_seen": 5888802816, | |
| "step": 702 | |
| }, | |
| { | |
| "epoch": 0.3515, | |
| "grad_norm": 0.5216825008392334, | |
| "learning_rate": 1.2129183526906971e-05, | |
| "loss": 1.5358, | |
| "num_input_tokens_seen": 5897191424, | |
| "step": 703 | |
| }, | |
| { | |
| "epoch": 0.352, | |
| "grad_norm": 0.4355267286300659, | |
| "learning_rate": 1.205704218730439e-05, | |
| "loss": 1.5211, | |
| "num_input_tokens_seen": 5905580032, | |
| "step": 704 | |
| }, | |
| { | |
| "epoch": 0.3525, | |
| "grad_norm": 0.5499055981636047, | |
| "learning_rate": 1.1985023337937185e-05, | |
| "loss": 1.5887, | |
| "num_input_tokens_seen": 5913968640, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 0.353, | |
| "grad_norm": 0.39691853523254395, | |
| "learning_rate": 1.1913128089425103e-05, | |
| "loss": 1.4468, | |
| "num_input_tokens_seen": 5922357248, | |
| "step": 706 | |
| }, | |
| { | |
| "epoch": 0.3535, | |
| "grad_norm": 0.7014262676239014, | |
| "learning_rate": 1.1841357550481817e-05, | |
| "loss": 1.3292, | |
| "num_input_tokens_seen": 5930745856, | |
| "step": 707 | |
| }, | |
| { | |
| "epoch": 0.354, | |
| "grad_norm": 0.42565855383872986, | |
| "learning_rate": 1.1769712827897825e-05, | |
| "loss": 1.5207, | |
| "num_input_tokens_seen": 5939134464, | |
| "step": 708 | |
| }, | |
| { | |
| "epoch": 0.3545, | |
| "grad_norm": 0.5033617615699768, | |
| "learning_rate": 1.1698195026523379e-05, | |
| "loss": 1.6845, | |
| "num_input_tokens_seen": 5947523072, | |
| "step": 709 | |
| }, | |
| { | |
| "epoch": 0.355, | |
| "grad_norm": 0.3542759418487549, | |
| "learning_rate": 1.1626805249251444e-05, | |
| "loss": 1.3674, | |
| "num_input_tokens_seen": 5955911680, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.3555, | |
| "grad_norm": 0.4811363220214844, | |
| "learning_rate": 1.1555544597000693e-05, | |
| "loss": 1.5443, | |
| "num_input_tokens_seen": 5964300288, | |
| "step": 711 | |
| }, | |
| { | |
| "epoch": 0.356, | |
| "grad_norm": 0.4083254039287567, | |
| "learning_rate": 1.1484414168698547e-05, | |
| "loss": 1.3636, | |
| "num_input_tokens_seen": 5972688896, | |
| "step": 712 | |
| }, | |
| { | |
| "epoch": 0.3565, | |
| "grad_norm": 0.46758443117141724, | |
| "learning_rate": 1.1413415061264205e-05, | |
| "loss": 1.4704, | |
| "num_input_tokens_seen": 5981077504, | |
| "step": 713 | |
| }, | |
| { | |
| "epoch": 0.357, | |
| "grad_norm": 1.2478874921798706, | |
| "learning_rate": 1.134254836959173e-05, | |
| "loss": 1.5195, | |
| "num_input_tokens_seen": 5989466112, | |
| "step": 714 | |
| }, | |
| { | |
| "epoch": 0.3575, | |
| "grad_norm": 0.5220457911491394, | |
| "learning_rate": 1.1271815186533156e-05, | |
| "loss": 1.6146, | |
| "num_input_tokens_seen": 5997854720, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 0.358, | |
| "grad_norm": 0.4007498025894165, | |
| "learning_rate": 1.1201216602881696e-05, | |
| "loss": 1.6766, | |
| "num_input_tokens_seen": 6006243328, | |
| "step": 716 | |
| }, | |
| { | |
| "epoch": 0.3585, | |
| "grad_norm": 0.4468950033187866, | |
| "learning_rate": 1.1130753707354836e-05, | |
| "loss": 1.5807, | |
| "num_input_tokens_seen": 6014631936, | |
| "step": 717 | |
| }, | |
| { | |
| "epoch": 0.359, | |
| "grad_norm": 0.5144544243812561, | |
| "learning_rate": 1.106042758657758e-05, | |
| "loss": 1.4072, | |
| "num_input_tokens_seen": 6023020544, | |
| "step": 718 | |
| }, | |
| { | |
| "epoch": 0.3595, | |
| "grad_norm": 0.4818895757198334, | |
| "learning_rate": 1.0990239325065714e-05, | |
| "loss": 1.4987, | |
| "num_input_tokens_seen": 6031409152, | |
| "step": 719 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 0.4314427673816681, | |
| "learning_rate": 1.0920190005209066e-05, | |
| "loss": 1.4462, | |
| "num_input_tokens_seen": 6039797760, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.3605, | |
| "grad_norm": 0.349338173866272, | |
| "learning_rate": 1.085028070725479e-05, | |
| "loss": 1.4223, | |
| "num_input_tokens_seen": 6048186368, | |
| "step": 721 | |
| }, | |
| { | |
| "epoch": 0.361, | |
| "grad_norm": 0.43962350487709045, | |
| "learning_rate": 1.0780512509290758e-05, | |
| "loss": 1.5743, | |
| "num_input_tokens_seen": 6056574976, | |
| "step": 722 | |
| }, | |
| { | |
| "epoch": 0.3615, | |
| "grad_norm": 0.3500334620475769, | |
| "learning_rate": 1.0710886487228868e-05, | |
| "loss": 1.4279, | |
| "num_input_tokens_seen": 6064963584, | |
| "step": 723 | |
| }, | |
| { | |
| "epoch": 0.362, | |
| "grad_norm": 0.4147644639015198, | |
| "learning_rate": 1.0641403714788537e-05, | |
| "loss": 1.2636, | |
| "num_input_tokens_seen": 6073352192, | |
| "step": 724 | |
| }, | |
| { | |
| "epoch": 0.3625, | |
| "grad_norm": 2.5849838256835938, | |
| "learning_rate": 1.0572065263480046e-05, | |
| "loss": 1.5996, | |
| "num_input_tokens_seen": 6081740800, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 0.363, | |
| "grad_norm": 0.5650848746299744, | |
| "learning_rate": 1.0502872202588113e-05, | |
| "loss": 1.3946, | |
| "num_input_tokens_seen": 6090129408, | |
| "step": 726 | |
| }, | |
| { | |
| "epoch": 0.3635, | |
| "grad_norm": 0.5338028073310852, | |
| "learning_rate": 1.043382559915532e-05, | |
| "loss": 1.542, | |
| "num_input_tokens_seen": 6098518016, | |
| "step": 727 | |
| }, | |
| { | |
| "epoch": 0.364, | |
| "grad_norm": 0.4497188627719879, | |
| "learning_rate": 1.0364926517965693e-05, | |
| "loss": 1.5702, | |
| "num_input_tokens_seen": 6106906624, | |
| "step": 728 | |
| }, | |
| { | |
| "epoch": 0.3645, | |
| "grad_norm": 0.3634839355945587, | |
| "learning_rate": 1.0296176021528326e-05, | |
| "loss": 1.4027, | |
| "num_input_tokens_seen": 6115295232, | |
| "step": 729 | |
| }, | |
| { | |
| "epoch": 0.365, | |
| "grad_norm": 0.46165332198143005, | |
| "learning_rate": 1.0227575170060909e-05, | |
| "loss": 1.4641, | |
| "num_input_tokens_seen": 6123683840, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.3655, | |
| "grad_norm": 0.3360598683357239, | |
| "learning_rate": 1.0159125021473421e-05, | |
| "loss": 1.5792, | |
| "num_input_tokens_seen": 6132072448, | |
| "step": 731 | |
| }, | |
| { | |
| "epoch": 0.366, | |
| "grad_norm": 0.445002943277359, | |
| "learning_rate": 1.009082663135185e-05, | |
| "loss": 1.5181, | |
| "num_input_tokens_seen": 6140461056, | |
| "step": 732 | |
| }, | |
| { | |
| "epoch": 0.3665, | |
| "grad_norm": 0.38506197929382324, | |
| "learning_rate": 1.0022681052941856e-05, | |
| "loss": 1.3657, | |
| "num_input_tokens_seen": 6148849664, | |
| "step": 733 | |
| }, | |
| { | |
| "epoch": 0.367, | |
| "grad_norm": 0.4203294515609741, | |
| "learning_rate": 9.95468933713255e-06, | |
| "loss": 1.3131, | |
| "num_input_tokens_seen": 6157238272, | |
| "step": 734 | |
| }, | |
| { | |
| "epoch": 0.3675, | |
| "grad_norm": 0.38559603691101074, | |
| "learning_rate": 9.886852532440312e-06, | |
| "loss": 1.5369, | |
| "num_input_tokens_seen": 6165626880, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 0.368, | |
| "grad_norm": 0.39812353253364563, | |
| "learning_rate": 9.819171684992575e-06, | |
| "loss": 1.2618, | |
| "num_input_tokens_seen": 6174015488, | |
| "step": 736 | |
| }, | |
| { | |
| "epoch": 0.3685, | |
| "grad_norm": 0.39168643951416016, | |
| "learning_rate": 9.751647838511747e-06, | |
| "loss": 1.5753, | |
| "num_input_tokens_seen": 6182404096, | |
| "step": 737 | |
| }, | |
| { | |
| "epoch": 0.369, | |
| "grad_norm": 0.3801339566707611, | |
| "learning_rate": 9.684282034299053e-06, | |
| "loss": 1.5585, | |
| "num_input_tokens_seen": 6190792704, | |
| "step": 738 | |
| }, | |
| { | |
| "epoch": 0.3695, | |
| "grad_norm": 0.4241108000278473, | |
| "learning_rate": 9.61707531121855e-06, | |
| "loss": 1.5152, | |
| "num_input_tokens_seen": 6199181312, | |
| "step": 739 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 0.3776848316192627, | |
| "learning_rate": 9.550028705681024e-06, | |
| "loss": 1.4332, | |
| "num_input_tokens_seen": 6207569920, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.3705, | |
| "grad_norm": 0.4260753095149994, | |
| "learning_rate": 9.483143251628088e-06, | |
| "loss": 1.6118, | |
| "num_input_tokens_seen": 6215958528, | |
| "step": 741 | |
| }, | |
| { | |
| "epoch": 0.371, | |
| "grad_norm": 0.3607058525085449, | |
| "learning_rate": 9.416419980516192e-06, | |
| "loss": 1.505, | |
| "num_input_tokens_seen": 6224347136, | |
| "step": 742 | |
| }, | |
| { | |
| "epoch": 0.3715, | |
| "grad_norm": 0.39192166924476624, | |
| "learning_rate": 9.349859921300704e-06, | |
| "loss": 1.4151, | |
| "num_input_tokens_seen": 6232735744, | |
| "step": 743 | |
| }, | |
| { | |
| "epoch": 0.372, | |
| "grad_norm": 0.3367891311645508, | |
| "learning_rate": 9.283464100420064e-06, | |
| "loss": 1.5143, | |
| "num_input_tokens_seen": 6241124352, | |
| "step": 744 | |
| }, | |
| { | |
| "epoch": 0.3725, | |
| "grad_norm": 0.33778345584869385, | |
| "learning_rate": 9.217233541779995e-06, | |
| "loss": 1.3913, | |
| "num_input_tokens_seen": 6249512960, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 0.373, | |
| "grad_norm": 0.31511205434799194, | |
| "learning_rate": 9.15116926673763e-06, | |
| "loss": 1.4947, | |
| "num_input_tokens_seen": 6257901568, | |
| "step": 746 | |
| }, | |
| { | |
| "epoch": 0.3735, | |
| "grad_norm": 0.33166080713272095, | |
| "learning_rate": 9.085272294085803e-06, | |
| "loss": 1.4156, | |
| "num_input_tokens_seen": 6266290176, | |
| "step": 747 | |
| }, | |
| { | |
| "epoch": 0.374, | |
| "grad_norm": 0.31525635719299316, | |
| "learning_rate": 9.019543640037363e-06, | |
| "loss": 1.4404, | |
| "num_input_tokens_seen": 6274678784, | |
| "step": 748 | |
| }, | |
| { | |
| "epoch": 0.3745, | |
| "grad_norm": 0.32022103667259216, | |
| "learning_rate": 8.95398431820947e-06, | |
| "loss": 1.4356, | |
| "num_input_tokens_seen": 6283067392, | |
| "step": 749 | |
| }, | |
| { | |
| "epoch": 0.375, | |
| "grad_norm": 0.33671078085899353, | |
| "learning_rate": 8.888595339607961e-06, | |
| "loss": 1.4918, | |
| "num_input_tokens_seen": 6291456000, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.3755, | |
| "grad_norm": 0.35733476281166077, | |
| "learning_rate": 8.82337771261177e-06, | |
| "loss": 1.4251, | |
| "num_input_tokens_seen": 6299844608, | |
| "step": 751 | |
| }, | |
| { | |
| "epoch": 0.376, | |
| "grad_norm": 0.3112577199935913, | |
| "learning_rate": 8.758332442957394e-06, | |
| "loss": 1.4574, | |
| "num_input_tokens_seen": 6308233216, | |
| "step": 752 | |
| }, | |
| { | |
| "epoch": 0.3765, | |
| "grad_norm": 0.3170168399810791, | |
| "learning_rate": 8.693460533723346e-06, | |
| "loss": 1.4763, | |
| "num_input_tokens_seen": 6316621824, | |
| "step": 753 | |
| }, | |
| { | |
| "epoch": 0.377, | |
| "grad_norm": 0.3522033989429474, | |
| "learning_rate": 8.62876298531472e-06, | |
| "loss": 1.4088, | |
| "num_input_tokens_seen": 6325010432, | |
| "step": 754 | |
| }, | |
| { | |
| "epoch": 0.3775, | |
| "grad_norm": 0.3583701550960541, | |
| "learning_rate": 8.564240795447758e-06, | |
| "loss": 1.5587, | |
| "num_input_tokens_seen": 6333399040, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 0.378, | |
| "grad_norm": 0.3301263153553009, | |
| "learning_rate": 8.499894959134436e-06, | |
| "loss": 1.4383, | |
| "num_input_tokens_seen": 6341787648, | |
| "step": 756 | |
| }, | |
| { | |
| "epoch": 0.3785, | |
| "grad_norm": 0.3722430467605591, | |
| "learning_rate": 8.435726468667135e-06, | |
| "loss": 1.4768, | |
| "num_input_tokens_seen": 6350176256, | |
| "step": 757 | |
| }, | |
| { | |
| "epoch": 0.379, | |
| "grad_norm": 0.32975760102272034, | |
| "learning_rate": 8.37173631360339e-06, | |
| "loss": 1.5434, | |
| "num_input_tokens_seen": 6358564864, | |
| "step": 758 | |
| }, | |
| { | |
| "epoch": 0.3795, | |
| "grad_norm": 0.33969563245773315, | |
| "learning_rate": 8.307925480750535e-06, | |
| "loss": 1.4865, | |
| "num_input_tokens_seen": 6366953472, | |
| "step": 759 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 0.31190067529678345, | |
| "learning_rate": 8.24429495415054e-06, | |
| "loss": 1.5021, | |
| "num_input_tokens_seen": 6375342080, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.3805, | |
| "grad_norm": 0.33034002780914307, | |
| "learning_rate": 8.180845715064851e-06, | |
| "loss": 1.5358, | |
| "num_input_tokens_seen": 6383730688, | |
| "step": 761 | |
| }, | |
| { | |
| "epoch": 0.381, | |
| "grad_norm": 0.32401880621910095, | |
| "learning_rate": 8.117578741959232e-06, | |
| "loss": 1.5265, | |
| "num_input_tokens_seen": 6392119296, | |
| "step": 762 | |
| }, | |
| { | |
| "epoch": 0.3815, | |
| "grad_norm": 0.34233585000038147, | |
| "learning_rate": 8.054495010488658e-06, | |
| "loss": 1.5108, | |
| "num_input_tokens_seen": 6400507904, | |
| "step": 763 | |
| }, | |
| { | |
| "epoch": 0.382, | |
| "grad_norm": 0.33030426502227783, | |
| "learning_rate": 7.991595493482323e-06, | |
| "loss": 1.4886, | |
| "num_input_tokens_seen": 6408896512, | |
| "step": 764 | |
| }, | |
| { | |
| "epoch": 0.3825, | |
| "grad_norm": 0.3493650555610657, | |
| "learning_rate": 7.928881160928572e-06, | |
| "loss": 1.5745, | |
| "num_input_tokens_seen": 6417285120, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 0.383, | |
| "grad_norm": 0.3051527738571167, | |
| "learning_rate": 7.86635297996001e-06, | |
| "loss": 1.6423, | |
| "num_input_tokens_seen": 6425673728, | |
| "step": 766 | |
| }, | |
| { | |
| "epoch": 0.3835, | |
| "grad_norm": 0.3218837380409241, | |
| "learning_rate": 7.804011914838524e-06, | |
| "loss": 1.6958, | |
| "num_input_tokens_seen": 6434062336, | |
| "step": 767 | |
| }, | |
| { | |
| "epoch": 0.384, | |
| "grad_norm": 0.40201205015182495, | |
| "learning_rate": 7.741858926940475e-06, | |
| "loss": 1.5848, | |
| "num_input_tokens_seen": 6442450944, | |
| "step": 768 | |
| }, | |
| { | |
| "epoch": 0.3845, | |
| "grad_norm": 0.3280126452445984, | |
| "learning_rate": 7.679894974741807e-06, | |
| "loss": 1.4155, | |
| "num_input_tokens_seen": 6450839552, | |
| "step": 769 | |
| }, | |
| { | |
| "epoch": 0.385, | |
| "grad_norm": 0.3949170708656311, | |
| "learning_rate": 7.618121013803319e-06, | |
| "loss": 1.3645, | |
| "num_input_tokens_seen": 6459228160, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.3855, | |
| "grad_norm": 0.3333275318145752, | |
| "learning_rate": 7.556537996755919e-06, | |
| "loss": 1.4209, | |
| "num_input_tokens_seen": 6467616768, | |
| "step": 771 | |
| }, | |
| { | |
| "epoch": 0.386, | |
| "grad_norm": 0.32655903697013855, | |
| "learning_rate": 7.495146873285904e-06, | |
| "loss": 1.4555, | |
| "num_input_tokens_seen": 6476005376, | |
| "step": 772 | |
| }, | |
| { | |
| "epoch": 0.3865, | |
| "grad_norm": 0.3308366537094116, | |
| "learning_rate": 7.433948590120326e-06, | |
| "loss": 1.6011, | |
| "num_input_tokens_seen": 6484393984, | |
| "step": 773 | |
| }, | |
| { | |
| "epoch": 0.387, | |
| "grad_norm": 0.37043994665145874, | |
| "learning_rate": 7.3729440910124464e-06, | |
| "loss": 1.3665, | |
| "num_input_tokens_seen": 6492782592, | |
| "step": 774 | |
| }, | |
| { | |
| "epoch": 0.3875, | |
| "grad_norm": 0.3787115812301636, | |
| "learning_rate": 7.312134316727093e-06, | |
| "loss": 1.4787, | |
| "num_input_tokens_seen": 6501171200, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 0.388, | |
| "grad_norm": 0.35611215233802795, | |
| "learning_rate": 7.251520205026206e-06, | |
| "loss": 1.3719, | |
| "num_input_tokens_seen": 6509559808, | |
| "step": 776 | |
| }, | |
| { | |
| "epoch": 0.3885, | |
| "grad_norm": 0.3926028311252594, | |
| "learning_rate": 7.191102690654384e-06, | |
| "loss": 1.5147, | |
| "num_input_tokens_seen": 6517948416, | |
| "step": 777 | |
| }, | |
| { | |
| "epoch": 0.389, | |
| "grad_norm": 0.3550088703632355, | |
| "learning_rate": 7.130882705324422e-06, | |
| "loss": 1.419, | |
| "num_input_tokens_seen": 6526337024, | |
| "step": 778 | |
| }, | |
| { | |
| "epoch": 0.3895, | |
| "grad_norm": 0.35230788588523865, | |
| "learning_rate": 7.070861177703006e-06, | |
| "loss": 1.4705, | |
| "num_input_tokens_seen": 6534725632, | |
| "step": 779 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 0.32987460494041443, | |
| "learning_rate": 7.01103903339633e-06, | |
| "loss": 1.4474, | |
| "num_input_tokens_seen": 6543114240, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.3905, | |
| "grad_norm": 0.378489226102829, | |
| "learning_rate": 6.95141719493587e-06, | |
| "loss": 1.34, | |
| "num_input_tokens_seen": 6551502848, | |
| "step": 781 | |
| }, | |
| { | |
| "epoch": 0.391, | |
| "grad_norm": 0.3114675283432007, | |
| "learning_rate": 6.891996581764124e-06, | |
| "loss": 1.4218, | |
| "num_input_tokens_seen": 6559891456, | |
| "step": 782 | |
| }, | |
| { | |
| "epoch": 0.3915, | |
| "grad_norm": 0.3373682498931885, | |
| "learning_rate": 6.832778110220457e-06, | |
| "loss": 1.5748, | |
| "num_input_tokens_seen": 6568280064, | |
| "step": 783 | |
| }, | |
| { | |
| "epoch": 0.392, | |
| "grad_norm": 0.301872193813324, | |
| "learning_rate": 6.773762693526967e-06, | |
| "loss": 1.5562, | |
| "num_input_tokens_seen": 6576668672, | |
| "step": 784 | |
| }, | |
| { | |
| "epoch": 0.3925, | |
| "grad_norm": 0.3254591226577759, | |
| "learning_rate": 6.7149512417743725e-06, | |
| "loss": 1.4617, | |
| "num_input_tokens_seen": 6585057280, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 0.393, | |
| "grad_norm": 0.31254011392593384, | |
| "learning_rate": 6.656344661908003e-06, | |
| "loss": 1.5, | |
| "num_input_tokens_seen": 6593445888, | |
| "step": 786 | |
| }, | |
| { | |
| "epoch": 0.3935, | |
| "grad_norm": 0.30517539381980896, | |
| "learning_rate": 6.597943857713849e-06, | |
| "loss": 1.504, | |
| "num_input_tokens_seen": 6601834496, | |
| "step": 787 | |
| }, | |
| { | |
| "epoch": 0.394, | |
| "grad_norm": 0.34713372588157654, | |
| "learning_rate": 6.539749729804539e-06, | |
| "loss": 1.5454, | |
| "num_input_tokens_seen": 6610223104, | |
| "step": 788 | |
| }, | |
| { | |
| "epoch": 0.3945, | |
| "grad_norm": 0.33461976051330566, | |
| "learning_rate": 6.4817631756055086e-06, | |
| "loss": 1.3616, | |
| "num_input_tokens_seen": 6618611712, | |
| "step": 789 | |
| }, | |
| { | |
| "epoch": 0.395, | |
| "grad_norm": 0.3106061816215515, | |
| "learning_rate": 6.423985089341165e-06, | |
| "loss": 1.3729, | |
| "num_input_tokens_seen": 6627000320, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.3955, | |
| "grad_norm": 0.32616499066352844, | |
| "learning_rate": 6.366416362021077e-06, | |
| "loss": 1.3952, | |
| "num_input_tokens_seen": 6635388928, | |
| "step": 791 | |
| }, | |
| { | |
| "epoch": 0.396, | |
| "grad_norm": 0.29314351081848145, | |
| "learning_rate": 6.3090578814262256e-06, | |
| "loss": 1.4549, | |
| "num_input_tokens_seen": 6643777536, | |
| "step": 792 | |
| }, | |
| { | |
| "epoch": 0.3965, | |
| "grad_norm": 0.31268253922462463, | |
| "learning_rate": 6.251910532095349e-06, | |
| "loss": 1.3809, | |
| "num_input_tokens_seen": 6652166144, | |
| "step": 793 | |
| }, | |
| { | |
| "epoch": 0.397, | |
| "grad_norm": 0.33296966552734375, | |
| "learning_rate": 6.1949751953112565e-06, | |
| "loss": 1.5162, | |
| "num_input_tokens_seen": 6660554752, | |
| "step": 794 | |
| }, | |
| { | |
| "epoch": 0.3975, | |
| "grad_norm": 0.3133097290992737, | |
| "learning_rate": 6.138252749087286e-06, | |
| "loss": 1.359, | |
| "num_input_tokens_seen": 6668943360, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 0.398, | |
| "grad_norm": 0.2918235957622528, | |
| "learning_rate": 6.081744068153714e-06, | |
| "loss": 1.5294, | |
| "num_input_tokens_seen": 6677331968, | |
| "step": 796 | |
| }, | |
| { | |
| "epoch": 0.3985, | |
| "grad_norm": 0.35399124026298523, | |
| "learning_rate": 6.02545002394432e-06, | |
| "loss": 1.4941, | |
| "num_input_tokens_seen": 6685720576, | |
| "step": 797 | |
| }, | |
| { | |
| "epoch": 0.399, | |
| "grad_norm": 0.35472235083580017, | |
| "learning_rate": 5.969371484582887e-06, | |
| "loss": 1.5192, | |
| "num_input_tokens_seen": 6694109184, | |
| "step": 798 | |
| }, | |
| { | |
| "epoch": 0.3995, | |
| "grad_norm": 0.32111871242523193, | |
| "learning_rate": 5.913509314869874e-06, | |
| "loss": 1.5339, | |
| "num_input_tokens_seen": 6702497792, | |
| "step": 799 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.3097288906574249, | |
| "learning_rate": 5.857864376269051e-06, | |
| "loss": 1.3977, | |
| "num_input_tokens_seen": 6710886400, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.4005, | |
| "grad_norm": 0.3005337119102478, | |
| "learning_rate": 5.802437526894198e-06, | |
| "loss": 1.44, | |
| "num_input_tokens_seen": 6719275008, | |
| "step": 801 | |
| }, | |
| { | |
| "epoch": 0.401, | |
| "grad_norm": 0.31911328434944153, | |
| "learning_rate": 5.747229621495893e-06, | |
| "loss": 1.3874, | |
| "num_input_tokens_seen": 6727663616, | |
| "step": 802 | |
| }, | |
| { | |
| "epoch": 0.4015, | |
| "grad_norm": 0.3250430226325989, | |
| "learning_rate": 5.692241511448342e-06, | |
| "loss": 1.5702, | |
| "num_input_tokens_seen": 6736052224, | |
| "step": 803 | |
| }, | |
| { | |
| "epoch": 0.402, | |
| "grad_norm": 0.31666699051856995, | |
| "learning_rate": 5.637474044736227e-06, | |
| "loss": 1.5223, | |
| "num_input_tokens_seen": 6744440832, | |
| "step": 804 | |
| }, | |
| { | |
| "epoch": 0.4025, | |
| "grad_norm": 0.30936866998672485, | |
| "learning_rate": 5.582928065941624e-06, | |
| "loss": 1.422, | |
| "num_input_tokens_seen": 6752829440, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 0.403, | |
| "grad_norm": 0.38043034076690674, | |
| "learning_rate": 5.528604416231016e-06, | |
| "loss": 1.4052, | |
| "num_input_tokens_seen": 6761218048, | |
| "step": 806 | |
| }, | |
| { | |
| "epoch": 0.4035, | |
| "grad_norm": 0.34034010767936707, | |
| "learning_rate": 5.474503933342272e-06, | |
| "loss": 1.4711, | |
| "num_input_tokens_seen": 6769606656, | |
| "step": 807 | |
| }, | |
| { | |
| "epoch": 0.404, | |
| "grad_norm": 0.3767036199569702, | |
| "learning_rate": 5.4206274515717735e-06, | |
| "loss": 1.3419, | |
| "num_input_tokens_seen": 6777995264, | |
| "step": 808 | |
| }, | |
| { | |
| "epoch": 0.4045, | |
| "grad_norm": 0.32522067427635193, | |
| "learning_rate": 5.366975801761507e-06, | |
| "loss": 1.5404, | |
| "num_input_tokens_seen": 6786383872, | |
| "step": 809 | |
| }, | |
| { | |
| "epoch": 0.405, | |
| "grad_norm": 0.348499059677124, | |
| "learning_rate": 5.313549811286294e-06, | |
| "loss": 1.6486, | |
| "num_input_tokens_seen": 6794772480, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.4055, | |
| "grad_norm": 0.27966755628585815, | |
| "learning_rate": 5.260350304040987e-06, | |
| "loss": 1.4463, | |
| "num_input_tokens_seen": 6803161088, | |
| "step": 811 | |
| }, | |
| { | |
| "epoch": 0.406, | |
| "grad_norm": 0.36147329211235046, | |
| "learning_rate": 5.207378100427804e-06, | |
| "loss": 1.4963, | |
| "num_input_tokens_seen": 6811549696, | |
| "step": 812 | |
| }, | |
| { | |
| "epoch": 0.4065, | |
| "grad_norm": 0.3296336233615875, | |
| "learning_rate": 5.154634017343662e-06, | |
| "loss": 1.4308, | |
| "num_input_tokens_seen": 6819938304, | |
| "step": 813 | |
| }, | |
| { | |
| "epoch": 0.407, | |
| "grad_norm": 0.29398980736732483, | |
| "learning_rate": 5.102118868167565e-06, | |
| "loss": 1.4888, | |
| "num_input_tokens_seen": 6828326912, | |
| "step": 814 | |
| }, | |
| { | |
| "epoch": 0.4075, | |
| "grad_norm": 0.38029155135154724, | |
| "learning_rate": 5.049833462748061e-06, | |
| "loss": 1.5156, | |
| "num_input_tokens_seen": 6836715520, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 0.408, | |
| "grad_norm": 0.320686012506485, | |
| "learning_rate": 4.997778607390809e-06, | |
| "loss": 1.5002, | |
| "num_input_tokens_seen": 6845104128, | |
| "step": 816 | |
| }, | |
| { | |
| "epoch": 0.4085, | |
| "grad_norm": 0.29965582489967346, | |
| "learning_rate": 4.945955104846061e-06, | |
| "loss": 1.6507, | |
| "num_input_tokens_seen": 6853492736, | |
| "step": 817 | |
| }, | |
| { | |
| "epoch": 0.409, | |
| "grad_norm": 0.291906476020813, | |
| "learning_rate": 4.89436375429633e-06, | |
| "loss": 1.5245, | |
| "num_input_tokens_seen": 6861881344, | |
| "step": 818 | |
| }, | |
| { | |
| "epoch": 0.4095, | |
| "grad_norm": 0.28034621477127075, | |
| "learning_rate": 4.843005351344065e-06, | |
| "loss": 1.5288, | |
| "num_input_tokens_seen": 6870269952, | |
| "step": 819 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 0.32334470748901367, | |
| "learning_rate": 4.791880687999382e-06, | |
| "loss": 1.4864, | |
| "num_input_tokens_seen": 6878658560, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.4105, | |
| "grad_norm": 0.2975791394710541, | |
| "learning_rate": 4.740990552667823e-06, | |
| "loss": 1.5017, | |
| "num_input_tokens_seen": 6887047168, | |
| "step": 821 | |
| }, | |
| { | |
| "epoch": 0.411, | |
| "grad_norm": 0.30968302488327026, | |
| "learning_rate": 4.6903357301382405e-06, | |
| "loss": 1.5463, | |
| "num_input_tokens_seen": 6895435776, | |
| "step": 822 | |
| }, | |
| { | |
| "epoch": 0.4115, | |
| "grad_norm": 0.3349354565143585, | |
| "learning_rate": 4.639917001570644e-06, | |
| "loss": 1.4112, | |
| "num_input_tokens_seen": 6903824384, | |
| "step": 823 | |
| }, | |
| { | |
| "epoch": 0.412, | |
| "grad_norm": 0.29564395546913147, | |
| "learning_rate": 4.589735144484217e-06, | |
| "loss": 1.4759, | |
| "num_input_tokens_seen": 6912212992, | |
| "step": 824 | |
| }, | |
| { | |
| "epoch": 0.4125, | |
| "grad_norm": 0.3293468654155731, | |
| "learning_rate": 4.53979093274526e-06, | |
| "loss": 1.6654, | |
| "num_input_tokens_seen": 6920601600, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 0.413, | |
| "grad_norm": 0.291838139295578, | |
| "learning_rate": 4.490085136555313e-06, | |
| "loss": 1.5502, | |
| "num_input_tokens_seen": 6928990208, | |
| "step": 826 | |
| }, | |
| { | |
| "epoch": 0.4135, | |
| "grad_norm": 0.2951217591762543, | |
| "learning_rate": 4.440618522439237e-06, | |
| "loss": 1.5273, | |
| "num_input_tokens_seen": 6937378816, | |
| "step": 827 | |
| }, | |
| { | |
| "epoch": 0.414, | |
| "grad_norm": 0.2911154329776764, | |
| "learning_rate": 4.391391853233404e-06, | |
| "loss": 1.4542, | |
| "num_input_tokens_seen": 6945767424, | |
| "step": 828 | |
| }, | |
| { | |
| "epoch": 0.4145, | |
| "grad_norm": 0.2849898934364319, | |
| "learning_rate": 4.342405888073971e-06, | |
| "loss": 1.4963, | |
| "num_input_tokens_seen": 6954156032, | |
| "step": 829 | |
| }, | |
| { | |
| "epoch": 0.415, | |
| "grad_norm": 0.30378979444503784, | |
| "learning_rate": 4.293661382385106e-06, | |
| "loss": 1.4233, | |
| "num_input_tokens_seen": 6962544640, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.4155, | |
| "grad_norm": 0.2819633483886719, | |
| "learning_rate": 4.245159087867383e-06, | |
| "loss": 1.444, | |
| "num_input_tokens_seen": 6970933248, | |
| "step": 831 | |
| }, | |
| { | |
| "epoch": 0.416, | |
| "grad_norm": 0.2961499094963074, | |
| "learning_rate": 4.196899752486192e-06, | |
| "loss": 1.5515, | |
| "num_input_tokens_seen": 6979321856, | |
| "step": 832 | |
| }, | |
| { | |
| "epoch": 0.4165, | |
| "grad_norm": 0.27551859617233276, | |
| "learning_rate": 4.148884120460186e-06, | |
| "loss": 1.4978, | |
| "num_input_tokens_seen": 6987710464, | |
| "step": 833 | |
| }, | |
| { | |
| "epoch": 0.417, | |
| "grad_norm": 0.3395516574382782, | |
| "learning_rate": 4.1011129322498e-06, | |
| "loss": 1.3703, | |
| "num_input_tokens_seen": 6996099072, | |
| "step": 834 | |
| }, | |
| { | |
| "epoch": 0.4175, | |
| "grad_norm": 0.28008756041526794, | |
| "learning_rate": 4.05358692454586e-06, | |
| "loss": 1.6245, | |
| "num_input_tokens_seen": 7004487680, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 0.418, | |
| "grad_norm": 0.27910134196281433, | |
| "learning_rate": 4.006306830258189e-06, | |
| "loss": 1.6699, | |
| "num_input_tokens_seen": 7012876288, | |
| "step": 836 | |
| }, | |
| { | |
| "epoch": 0.4185, | |
| "grad_norm": 0.2810775637626648, | |
| "learning_rate": 3.9592733785043405e-06, | |
| "loss": 1.4522, | |
| "num_input_tokens_seen": 7021264896, | |
| "step": 837 | |
| }, | |
| { | |
| "epoch": 0.419, | |
| "grad_norm": 0.28560468554496765, | |
| "learning_rate": 3.91248729459831e-06, | |
| "loss": 1.5036, | |
| "num_input_tokens_seen": 7029653504, | |
| "step": 838 | |
| }, | |
| { | |
| "epoch": 0.4195, | |
| "grad_norm": 0.26259660720825195, | |
| "learning_rate": 3.865949300039404e-06, | |
| "loss": 1.4584, | |
| "num_input_tokens_seen": 7038042112, | |
| "step": 839 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 0.2673461437225342, | |
| "learning_rate": 3.819660112501053e-06, | |
| "loss": 1.6309, | |
| "num_input_tokens_seen": 7046430720, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.4205, | |
| "grad_norm": 0.2707984447479248, | |
| "learning_rate": 3.773620445819799e-06, | |
| "loss": 1.5435, | |
| "num_input_tokens_seen": 7054819328, | |
| "step": 841 | |
| }, | |
| { | |
| "epoch": 0.421, | |
| "grad_norm": 0.29399508237838745, | |
| "learning_rate": 3.727831009984262e-06, | |
| "loss": 1.6607, | |
| "num_input_tokens_seen": 7063207936, | |
| "step": 842 | |
| }, | |
| { | |
| "epoch": 0.4215, | |
| "grad_norm": 0.2908207178115845, | |
| "learning_rate": 3.682292511124179e-06, | |
| "loss": 1.2691, | |
| "num_input_tokens_seen": 7071596544, | |
| "step": 843 | |
| }, | |
| { | |
| "epoch": 0.422, | |
| "grad_norm": 0.30071473121643066, | |
| "learning_rate": 3.637005651499528e-06, | |
| "loss": 1.3875, | |
| "num_input_tokens_seen": 7079985152, | |
| "step": 844 | |
| }, | |
| { | |
| "epoch": 0.4225, | |
| "grad_norm": 0.2938913106918335, | |
| "learning_rate": 3.5919711294897285e-06, | |
| "loss": 1.4228, | |
| "num_input_tokens_seen": 7088373760, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 0.423, | |
| "grad_norm": 0.2846091687679291, | |
| "learning_rate": 3.5471896395828064e-06, | |
| "loss": 1.6032, | |
| "num_input_tokens_seen": 7096762368, | |
| "step": 846 | |
| }, | |
| { | |
| "epoch": 0.4235, | |
| "grad_norm": 0.28457286953926086, | |
| "learning_rate": 3.502661872364732e-06, | |
| "loss": 1.5612, | |
| "num_input_tokens_seen": 7105150976, | |
| "step": 847 | |
| }, | |
| { | |
| "epoch": 0.424, | |
| "grad_norm": 0.4077390730381012, | |
| "learning_rate": 3.4583885145087613e-06, | |
| "loss": 1.5145, | |
| "num_input_tokens_seen": 7113539584, | |
| "step": 848 | |
| }, | |
| { | |
| "epoch": 0.4245, | |
| "grad_norm": 0.26391610503196716, | |
| "learning_rate": 3.414370248764849e-06, | |
| "loss": 1.555, | |
| "num_input_tokens_seen": 7121928192, | |
| "step": 849 | |
| }, | |
| { | |
| "epoch": 0.425, | |
| "grad_norm": 0.2773348391056061, | |
| "learning_rate": 3.3706077539490933e-06, | |
| "loss": 1.573, | |
| "num_input_tokens_seen": 7130316800, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.4255, | |
| "grad_norm": 0.27040889859199524, | |
| "learning_rate": 3.327101704933313e-06, | |
| "loss": 1.4085, | |
| "num_input_tokens_seen": 7138705408, | |
| "step": 851 | |
| }, | |
| { | |
| "epoch": 0.426, | |
| "grad_norm": 0.26301881670951843, | |
| "learning_rate": 3.2838527726345994e-06, | |
| "loss": 1.4402, | |
| "num_input_tokens_seen": 7147094016, | |
| "step": 852 | |
| }, | |
| { | |
| "epoch": 0.4265, | |
| "grad_norm": 0.2666051685810089, | |
| "learning_rate": 3.240861624004983e-06, | |
| "loss": 1.3919, | |
| "num_input_tokens_seen": 7155482624, | |
| "step": 853 | |
| }, | |
| { | |
| "epoch": 0.427, | |
| "grad_norm": 0.2823437750339508, | |
| "learning_rate": 3.198128922021162e-06, | |
| "loss": 1.5125, | |
| "num_input_tokens_seen": 7163871232, | |
| "step": 854 | |
| }, | |
| { | |
| "epoch": 0.4275, | |
| "grad_norm": 0.2518790066242218, | |
| "learning_rate": 3.155655325674272e-06, | |
| "loss": 1.4826, | |
| "num_input_tokens_seen": 7172259840, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 0.428, | |
| "grad_norm": 0.26308494806289673, | |
| "learning_rate": 3.1134414899597033e-06, | |
| "loss": 1.6418, | |
| "num_input_tokens_seen": 7180648448, | |
| "step": 856 | |
| }, | |
| { | |
| "epoch": 0.4285, | |
| "grad_norm": 0.2775367796421051, | |
| "learning_rate": 3.0714880658670165e-06, | |
| "loss": 1.6232, | |
| "num_input_tokens_seen": 7189037056, | |
| "step": 857 | |
| }, | |
| { | |
| "epoch": 0.429, | |
| "grad_norm": 0.27367228269577026, | |
| "learning_rate": 3.0297957003699284e-06, | |
| "loss": 1.6465, | |
| "num_input_tokens_seen": 7197425664, | |
| "step": 858 | |
| }, | |
| { | |
| "epoch": 0.4295, | |
| "grad_norm": 0.26115256547927856, | |
| "learning_rate": 2.9883650364162784e-06, | |
| "loss": 1.5252, | |
| "num_input_tokens_seen": 7205814272, | |
| "step": 859 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 0.2825610339641571, | |
| "learning_rate": 2.947196712918157e-06, | |
| "loss": 1.4566, | |
| "num_input_tokens_seen": 7214202880, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.4305, | |
| "grad_norm": 0.2604261338710785, | |
| "learning_rate": 2.906291364742042e-06, | |
| "loss": 1.465, | |
| "num_input_tokens_seen": 7222591488, | |
| "step": 861 | |
| }, | |
| { | |
| "epoch": 0.431, | |
| "grad_norm": 0.28668832778930664, | |
| "learning_rate": 2.8656496226990092e-06, | |
| "loss": 1.567, | |
| "num_input_tokens_seen": 7230980096, | |
| "step": 862 | |
| }, | |
| { | |
| "epoch": 0.4315, | |
| "grad_norm": 0.2882774770259857, | |
| "learning_rate": 2.8252721135349892e-06, | |
| "loss": 1.277, | |
| "num_input_tokens_seen": 7239368704, | |
| "step": 863 | |
| }, | |
| { | |
| "epoch": 0.432, | |
| "grad_norm": 0.26201221346855164, | |
| "learning_rate": 2.7851594599211297e-06, | |
| "loss": 1.5073, | |
| "num_input_tokens_seen": 7247757312, | |
| "step": 864 | |
| }, | |
| { | |
| "epoch": 0.4325, | |
| "grad_norm": 0.2823950946331024, | |
| "learning_rate": 2.7453122804441636e-06, | |
| "loss": 1.5586, | |
| "num_input_tokens_seen": 7256145920, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 0.433, | |
| "grad_norm": 0.29726940393447876, | |
| "learning_rate": 2.705731189596901e-06, | |
| "loss": 1.495, | |
| "num_input_tokens_seen": 7264534528, | |
| "step": 866 | |
| }, | |
| { | |
| "epoch": 0.4335, | |
| "grad_norm": 0.2634095847606659, | |
| "learning_rate": 2.6664167977687182e-06, | |
| "loss": 1.6844, | |
| "num_input_tokens_seen": 7272923136, | |
| "step": 867 | |
| }, | |
| { | |
| "epoch": 0.434, | |
| "grad_norm": 0.2488356977701187, | |
| "learning_rate": 2.6273697112361786e-06, | |
| "loss": 1.5279, | |
| "num_input_tokens_seen": 7281311744, | |
| "step": 868 | |
| }, | |
| { | |
| "epoch": 0.4345, | |
| "grad_norm": 0.2584473788738251, | |
| "learning_rate": 2.588590532153652e-06, | |
| "loss": 1.6323, | |
| "num_input_tokens_seen": 7289700352, | |
| "step": 869 | |
| }, | |
| { | |
| "epoch": 0.435, | |
| "grad_norm": 0.2461184859275818, | |
| "learning_rate": 2.550079858544057e-06, | |
| "loss": 1.2039, | |
| "num_input_tokens_seen": 7298088960, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.4355, | |
| "grad_norm": 0.26180657744407654, | |
| "learning_rate": 2.511838284289625e-06, | |
| "loss": 1.5785, | |
| "num_input_tokens_seen": 7306477568, | |
| "step": 871 | |
| }, | |
| { | |
| "epoch": 0.436, | |
| "grad_norm": 0.25122514367103577, | |
| "learning_rate": 2.473866399122733e-06, | |
| "loss": 1.5489, | |
| "num_input_tokens_seen": 7314866176, | |
| "step": 872 | |
| }, | |
| { | |
| "epoch": 0.4365, | |
| "grad_norm": 0.2759020924568176, | |
| "learning_rate": 2.436164788616815e-06, | |
| "loss": 1.4657, | |
| "num_input_tokens_seen": 7323254784, | |
| "step": 873 | |
| }, | |
| { | |
| "epoch": 0.437, | |
| "grad_norm": 0.2796644866466522, | |
| "learning_rate": 2.398734034177361e-06, | |
| "loss": 1.5458, | |
| "num_input_tokens_seen": 7331643392, | |
| "step": 874 | |
| }, | |
| { | |
| "epoch": 0.4375, | |
| "grad_norm": 0.27356427907943726, | |
| "learning_rate": 2.3615747130329013e-06, | |
| "loss": 1.566, | |
| "num_input_tokens_seen": 7340032000, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 0.438, | |
| "grad_norm": 0.3139825761318207, | |
| "learning_rate": 2.324687398226131e-06, | |
| "loss": 1.6028, | |
| "num_input_tokens_seen": 7348420608, | |
| "step": 876 | |
| }, | |
| { | |
| "epoch": 0.4385, | |
| "grad_norm": 0.2848818302154541, | |
| "learning_rate": 2.288072658605087e-06, | |
| "loss": 1.3853, | |
| "num_input_tokens_seen": 7356809216, | |
| "step": 877 | |
| }, | |
| { | |
| "epoch": 0.439, | |
| "grad_norm": 0.2559528946876526, | |
| "learning_rate": 2.2517310588143372e-06, | |
| "loss": 1.3481, | |
| "num_input_tokens_seen": 7365197824, | |
| "step": 878 | |
| }, | |
| { | |
| "epoch": 0.4395, | |
| "grad_norm": 0.2623364329338074, | |
| "learning_rate": 2.215663159286314e-06, | |
| "loss": 1.4133, | |
| "num_input_tokens_seen": 7373586432, | |
| "step": 879 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 0.2571483552455902, | |
| "learning_rate": 2.1798695162326444e-06, | |
| "loss": 1.3396, | |
| "num_input_tokens_seen": 7381975040, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.4405, | |
| "grad_norm": 0.2681962847709656, | |
| "learning_rate": 2.144350681635585e-06, | |
| "loss": 1.4452, | |
| "num_input_tokens_seen": 7390363648, | |
| "step": 881 | |
| }, | |
| { | |
| "epoch": 0.441, | |
| "grad_norm": 0.28450822830200195, | |
| "learning_rate": 2.1091072032395e-06, | |
| "loss": 1.5543, | |
| "num_input_tokens_seen": 7398752256, | |
| "step": 882 | |
| }, | |
| { | |
| "epoch": 0.4415, | |
| "grad_norm": 0.2630172669887543, | |
| "learning_rate": 2.0741396245424263e-06, | |
| "loss": 1.4874, | |
| "num_input_tokens_seen": 7407140864, | |
| "step": 883 | |
| }, | |
| { | |
| "epoch": 0.442, | |
| "grad_norm": 0.26452624797821045, | |
| "learning_rate": 2.0394484847876894e-06, | |
| "loss": 1.4417, | |
| "num_input_tokens_seen": 7415529472, | |
| "step": 884 | |
| }, | |
| { | |
| "epoch": 0.4425, | |
| "grad_norm": 0.26220640540122986, | |
| "learning_rate": 2.0050343189555743e-06, | |
| "loss": 1.3356, | |
| "num_input_tokens_seen": 7423918080, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 0.443, | |
| "grad_norm": 0.2857254445552826, | |
| "learning_rate": 1.970897657755084e-06, | |
| "loss": 1.6374, | |
| "num_input_tokens_seen": 7432306688, | |
| "step": 886 | |
| }, | |
| { | |
| "epoch": 0.4435, | |
| "grad_norm": 0.26213598251342773, | |
| "learning_rate": 1.937039027615779e-06, | |
| "loss": 1.3902, | |
| "num_input_tokens_seen": 7440695296, | |
| "step": 887 | |
| }, | |
| { | |
| "epoch": 0.444, | |
| "grad_norm": 0.260454386472702, | |
| "learning_rate": 1.903458950679613e-06, | |
| "loss": 1.3173, | |
| "num_input_tokens_seen": 7449083904, | |
| "step": 888 | |
| }, | |
| { | |
| "epoch": 0.4445, | |
| "grad_norm": 0.26337119936943054, | |
| "learning_rate": 1.8701579447929076e-06, | |
| "loss": 1.4369, | |
| "num_input_tokens_seen": 7457472512, | |
| "step": 889 | |
| }, | |
| { | |
| "epoch": 0.445, | |
| "grad_norm": 0.3558846712112427, | |
| "learning_rate": 1.837136523498373e-06, | |
| "loss": 1.5975, | |
| "num_input_tokens_seen": 7465861120, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.4455, | |
| "grad_norm": 0.25488045811653137, | |
| "learning_rate": 1.80439519602718e-06, | |
| "loss": 1.6925, | |
| "num_input_tokens_seen": 7474249728, | |
| "step": 891 | |
| }, | |
| { | |
| "epoch": 0.446, | |
| "grad_norm": 0.251467227935791, | |
| "learning_rate": 1.7719344672910942e-06, | |
| "loss": 1.5467, | |
| "num_input_tokens_seen": 7482638336, | |
| "step": 892 | |
| }, | |
| { | |
| "epoch": 0.4465, | |
| "grad_norm": 0.2576838433742523, | |
| "learning_rate": 1.7397548378747142e-06, | |
| "loss": 1.4626, | |
| "num_input_tokens_seen": 7491026944, | |
| "step": 893 | |
| }, | |
| { | |
| "epoch": 0.447, | |
| "grad_norm": 0.2626551687717438, | |
| "learning_rate": 1.7078568040277276e-06, | |
| "loss": 1.528, | |
| "num_input_tokens_seen": 7499415552, | |
| "step": 894 | |
| }, | |
| { | |
| "epoch": 0.4475, | |
| "grad_norm": 0.24569034576416016, | |
| "learning_rate": 1.676240857657283e-06, | |
| "loss": 1.5258, | |
| "num_input_tokens_seen": 7507804160, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 0.448, | |
| "grad_norm": 0.2767206132411957, | |
| "learning_rate": 1.6449074863203773e-06, | |
| "loss": 1.5373, | |
| "num_input_tokens_seen": 7516192768, | |
| "step": 896 | |
| }, | |
| { | |
| "epoch": 0.4485, | |
| "grad_norm": 0.26214465498924255, | |
| "learning_rate": 1.6138571732163643e-06, | |
| "loss": 1.5565, | |
| "num_input_tokens_seen": 7524581376, | |
| "step": 897 | |
| }, | |
| { | |
| "epoch": 0.449, | |
| "grad_norm": 0.2768023908138275, | |
| "learning_rate": 1.5830903971794765e-06, | |
| "loss": 1.4132, | |
| "num_input_tokens_seen": 7532969984, | |
| "step": 898 | |
| }, | |
| { | |
| "epoch": 0.4495, | |
| "grad_norm": 0.2570737898349762, | |
| "learning_rate": 1.5526076326714635e-06, | |
| "loss": 1.5753, | |
| "num_input_tokens_seen": 7541358592, | |
| "step": 899 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 0.2738839387893677, | |
| "learning_rate": 1.5224093497742654e-06, | |
| "loss": 1.5405, | |
| "num_input_tokens_seen": 7549747200, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.4505, | |
| "grad_norm": 0.27674537897109985, | |
| "learning_rate": 1.4924960141827605e-06, | |
| "loss": 1.3759, | |
| "num_input_tokens_seen": 7558135808, | |
| "step": 901 | |
| }, | |
| { | |
| "epoch": 0.451, | |
| "grad_norm": 0.2571975886821747, | |
| "learning_rate": 1.4628680871975842e-06, | |
| "loss": 1.5067, | |
| "num_input_tokens_seen": 7566524416, | |
| "step": 902 | |
| }, | |
| { | |
| "epoch": 0.4515, | |
| "grad_norm": 0.2580336332321167, | |
| "learning_rate": 1.4335260257180262e-06, | |
| "loss": 1.4883, | |
| "num_input_tokens_seen": 7574913024, | |
| "step": 903 | |
| }, | |
| { | |
| "epoch": 0.452, | |
| "grad_norm": 0.26173335313796997, | |
| "learning_rate": 1.4044702822349731e-06, | |
| "loss": 1.5439, | |
| "num_input_tokens_seen": 7583301632, | |
| "step": 904 | |
| }, | |
| { | |
| "epoch": 0.4525, | |
| "grad_norm": 0.24484549462795258, | |
| "learning_rate": 1.3757013048239287e-06, | |
| "loss": 1.5541, | |
| "num_input_tokens_seen": 7591690240, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 0.453, | |
| "grad_norm": 0.24992741644382477, | |
| "learning_rate": 1.3472195371381202e-06, | |
| "loss": 1.4184, | |
| "num_input_tokens_seen": 7600078848, | |
| "step": 906 | |
| }, | |
| { | |
| "epoch": 0.4535, | |
| "grad_norm": 0.23614566028118134, | |
| "learning_rate": 1.3190254184016294e-06, | |
| "loss": 1.4825, | |
| "num_input_tokens_seen": 7608467456, | |
| "step": 907 | |
| }, | |
| { | |
| "epoch": 0.454, | |
| "grad_norm": 0.25955870747566223, | |
| "learning_rate": 1.2911193834026548e-06, | |
| "loss": 1.541, | |
| "num_input_tokens_seen": 7616856064, | |
| "step": 908 | |
| }, | |
| { | |
| "epoch": 0.4545, | |
| "grad_norm": 0.24878355860710144, | |
| "learning_rate": 1.2635018624867712e-06, | |
| "loss": 1.4872, | |
| "num_input_tokens_seen": 7625244672, | |
| "step": 909 | |
| }, | |
| { | |
| "epoch": 0.455, | |
| "grad_norm": 0.306045800447464, | |
| "learning_rate": 1.236173281550319e-06, | |
| "loss": 1.4772, | |
| "num_input_tokens_seen": 7633633280, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.4555, | |
| "grad_norm": 0.24964255094528198, | |
| "learning_rate": 1.209134062033821e-06, | |
| "loss": 1.692, | |
| "num_input_tokens_seen": 7642021888, | |
| "step": 911 | |
| }, | |
| { | |
| "epoch": 0.456, | |
| "grad_norm": 0.2841110825538635, | |
| "learning_rate": 1.182384620915491e-06, | |
| "loss": 1.3972, | |
| "num_input_tokens_seen": 7650410496, | |
| "step": 912 | |
| }, | |
| { | |
| "epoch": 0.4565, | |
| "grad_norm": 0.2565704882144928, | |
| "learning_rate": 1.1559253707048046e-06, | |
| "loss": 1.5447, | |
| "num_input_tokens_seen": 7658799104, | |
| "step": 913 | |
| }, | |
| { | |
| "epoch": 0.457, | |
| "grad_norm": 0.24990417063236237, | |
| "learning_rate": 1.1297567194361303e-06, | |
| "loss": 1.557, | |
| "num_input_tokens_seen": 7667187712, | |
| "step": 914 | |
| }, | |
| { | |
| "epoch": 0.4575, | |
| "grad_norm": 0.25711295008659363, | |
| "learning_rate": 1.103879070662439e-06, | |
| "loss": 1.4012, | |
| "num_input_tokens_seen": 7675576320, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 0.458, | |
| "grad_norm": 0.2509455978870392, | |
| "learning_rate": 1.0782928234490941e-06, | |
| "loss": 1.3762, | |
| "num_input_tokens_seen": 7683964928, | |
| "step": 916 | |
| }, | |
| { | |
| "epoch": 0.4585, | |
| "grad_norm": 0.24873420596122742, | |
| "learning_rate": 1.0529983723676751e-06, | |
| "loss": 1.6085, | |
| "num_input_tokens_seen": 7692353536, | |
| "step": 917 | |
| }, | |
| { | |
| "epoch": 0.459, | |
| "grad_norm": 0.24984948337078094, | |
| "learning_rate": 1.027996107489908e-06, | |
| "loss": 1.4029, | |
| "num_input_tokens_seen": 7700742144, | |
| "step": 918 | |
| }, | |
| { | |
| "epoch": 0.4595, | |
| "grad_norm": 0.269065260887146, | |
| "learning_rate": 1.0032864143816456e-06, | |
| "loss": 1.4508, | |
| "num_input_tokens_seen": 7709130752, | |
| "step": 919 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 0.24322226643562317, | |
| "learning_rate": 9.788696740969295e-07, | |
| "loss": 1.4074, | |
| "num_input_tokens_seen": 7717519360, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.4605, | |
| "grad_norm": 0.2334384173154831, | |
| "learning_rate": 9.547462631720906e-07, | |
| "loss": 1.4163, | |
| "num_input_tokens_seen": 7725907968, | |
| "step": 921 | |
| }, | |
| { | |
| "epoch": 0.461, | |
| "grad_norm": 0.2344134896993637, | |
| "learning_rate": 9.30916553619976e-07, | |
| "loss": 1.532, | |
| "num_input_tokens_seen": 7734296576, | |
| "step": 922 | |
| }, | |
| { | |
| "epoch": 0.4615, | |
| "grad_norm": 0.24026800692081451, | |
| "learning_rate": 9.073809129241784e-07, | |
| "loss": 1.3782, | |
| "num_input_tokens_seen": 7742685184, | |
| "step": 923 | |
| }, | |
| { | |
| "epoch": 0.462, | |
| "grad_norm": 0.24797479808330536, | |
| "learning_rate": 8.841397040333976e-07, | |
| "loss": 1.3886, | |
| "num_input_tokens_seen": 7751073792, | |
| "step": 924 | |
| }, | |
| { | |
| "epoch": 0.4625, | |
| "grad_norm": 0.25900912284851074, | |
| "learning_rate": 8.611932853558236e-07, | |
| "loss": 1.4859, | |
| "num_input_tokens_seen": 7759462400, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 0.463, | |
| "grad_norm": 0.23255640268325806, | |
| "learning_rate": 8.38542010753618e-07, | |
| "loss": 1.3226, | |
| "num_input_tokens_seen": 7767851008, | |
| "step": 926 | |
| }, | |
| { | |
| "epoch": 0.4635, | |
| "grad_norm": 0.23756912350654602, | |
| "learning_rate": 8.161862295374567e-07, | |
| "loss": 1.4173, | |
| "num_input_tokens_seen": 7776239616, | |
| "step": 927 | |
| }, | |
| { | |
| "epoch": 0.464, | |
| "grad_norm": 0.2541607916355133, | |
| "learning_rate": 7.941262864611387e-07, | |
| "loss": 1.4765, | |
| "num_input_tokens_seen": 7784628224, | |
| "step": 928 | |
| }, | |
| { | |
| "epoch": 0.4645, | |
| "grad_norm": 0.27562472224235535, | |
| "learning_rate": 7.723625217162811e-07, | |
| "loss": 1.5217, | |
| "num_input_tokens_seen": 7793016832, | |
| "step": 929 | |
| }, | |
| { | |
| "epoch": 0.465, | |
| "grad_norm": 0.24648788571357727, | |
| "learning_rate": 7.508952709270567e-07, | |
| "loss": 1.5589, | |
| "num_input_tokens_seen": 7801405440, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.4655, | |
| "grad_norm": 0.2653839588165283, | |
| "learning_rate": 7.29724865145025e-07, | |
| "loss": 1.3987, | |
| "num_input_tokens_seen": 7809794048, | |
| "step": 931 | |
| }, | |
| { | |
| "epoch": 0.466, | |
| "grad_norm": 0.2542785406112671, | |
| "learning_rate": 7.088516308440386e-07, | |
| "loss": 1.3908, | |
| "num_input_tokens_seen": 7818182656, | |
| "step": 932 | |
| }, | |
| { | |
| "epoch": 0.4665, | |
| "grad_norm": 0.2655661404132843, | |
| "learning_rate": 6.882758899151886e-07, | |
| "loss": 1.4311, | |
| "num_input_tokens_seen": 7826571264, | |
| "step": 933 | |
| }, | |
| { | |
| "epoch": 0.467, | |
| "grad_norm": 0.24946841597557068, | |
| "learning_rate": 6.679979596618546e-07, | |
| "loss": 1.4361, | |
| "num_input_tokens_seen": 7834959872, | |
| "step": 934 | |
| }, | |
| { | |
| "epoch": 0.4675, | |
| "grad_norm": 0.27184778451919556, | |
| "learning_rate": 6.480181527948049e-07, | |
| "loss": 1.7702, | |
| "num_input_tokens_seen": 7843348480, | |
| "step": 935 | |
| }, | |
| { | |
| "epoch": 0.468, | |
| "grad_norm": 0.33869075775146484, | |
| "learning_rate": 6.283367774273785e-07, | |
| "loss": 1.4957, | |
| "num_input_tokens_seen": 7851737088, | |
| "step": 936 | |
| }, | |
| { | |
| "epoch": 0.4685, | |
| "grad_norm": 0.2356402724981308, | |
| "learning_rate": 6.089541370707297e-07, | |
| "loss": 1.5427, | |
| "num_input_tokens_seen": 7860125696, | |
| "step": 937 | |
| }, | |
| { | |
| "epoch": 0.469, | |
| "grad_norm": 0.26490145921707153, | |
| "learning_rate": 5.898705306291508e-07, | |
| "loss": 1.5574, | |
| "num_input_tokens_seen": 7868514304, | |
| "step": 938 | |
| }, | |
| { | |
| "epoch": 0.4695, | |
| "grad_norm": 0.26912957429885864, | |
| "learning_rate": 5.71086252395463e-07, | |
| "loss": 1.3498, | |
| "num_input_tokens_seen": 7876902912, | |
| "step": 939 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 0.3503011167049408, | |
| "learning_rate": 5.526015920464689e-07, | |
| "loss": 1.595, | |
| "num_input_tokens_seen": 7885291520, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.4705, | |
| "grad_norm": 0.2570345997810364, | |
| "learning_rate": 5.344168346385003e-07, | |
| "loss": 1.4517, | |
| "num_input_tokens_seen": 7893680128, | |
| "step": 941 | |
| }, | |
| { | |
| "epoch": 0.471, | |
| "grad_norm": 0.3460192382335663, | |
| "learning_rate": 5.165322606030132e-07, | |
| "loss": 1.3609, | |
| "num_input_tokens_seen": 7902068736, | |
| "step": 942 | |
| }, | |
| { | |
| "epoch": 0.4715, | |
| "grad_norm": 0.25308606028556824, | |
| "learning_rate": 4.98948145742264e-07, | |
| "loss": 1.4926, | |
| "num_input_tokens_seen": 7910457344, | |
| "step": 943 | |
| }, | |
| { | |
| "epoch": 0.472, | |
| "grad_norm": 0.29238274693489075, | |
| "learning_rate": 4.816647612250513e-07, | |
| "loss": 1.4488, | |
| "num_input_tokens_seen": 7918845952, | |
| "step": 944 | |
| }, | |
| { | |
| "epoch": 0.4725, | |
| "grad_norm": 0.24855025112628937, | |
| "learning_rate": 4.646823735825523e-07, | |
| "loss": 1.4463, | |
| "num_input_tokens_seen": 7927234560, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 0.473, | |
| "grad_norm": 0.23923517763614655, | |
| "learning_rate": 4.4800124470418815e-07, | |
| "loss": 1.4419, | |
| "num_input_tokens_seen": 7935623168, | |
| "step": 946 | |
| }, | |
| { | |
| "epoch": 0.4735, | |
| "grad_norm": 0.27583202719688416, | |
| "learning_rate": 4.3162163183360084e-07, | |
| "loss": 1.5707, | |
| "num_input_tokens_seen": 7944011776, | |
| "step": 947 | |
| }, | |
| { | |
| "epoch": 0.474, | |
| "grad_norm": 0.2602940499782562, | |
| "learning_rate": 4.155437875646828e-07, | |
| "loss": 1.4926, | |
| "num_input_tokens_seen": 7952400384, | |
| "step": 948 | |
| }, | |
| { | |
| "epoch": 0.4745, | |
| "grad_norm": 0.2309136837720871, | |
| "learning_rate": 3.997679598376891e-07, | |
| "loss": 1.4826, | |
| "num_input_tokens_seen": 7960788992, | |
| "step": 949 | |
| }, | |
| { | |
| "epoch": 0.475, | |
| "grad_norm": 0.23487992584705353, | |
| "learning_rate": 3.842943919353914e-07, | |
| "loss": 1.3799, | |
| "num_input_tokens_seen": 7969177600, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.4755, | |
| "grad_norm": 0.23048855364322662, | |
| "learning_rate": 3.6912332247935224e-07, | |
| "loss": 1.5084, | |
| "num_input_tokens_seen": 7977566208, | |
| "step": 951 | |
| }, | |
| { | |
| "epoch": 0.476, | |
| "grad_norm": 0.24234017729759216, | |
| "learning_rate": 3.5425498542622784e-07, | |
| "loss": 1.5914, | |
| "num_input_tokens_seen": 7985954816, | |
| "step": 952 | |
| }, | |
| { | |
| "epoch": 0.4765, | |
| "grad_norm": 0.34213560819625854, | |
| "learning_rate": 3.396896100641689e-07, | |
| "loss": 1.3576, | |
| "num_input_tokens_seen": 7994343424, | |
| "step": 953 | |
| }, | |
| { | |
| "epoch": 0.477, | |
| "grad_norm": 0.2516288757324219, | |
| "learning_rate": 3.2542742100928114e-07, | |
| "loss": 1.4572, | |
| "num_input_tokens_seen": 8002732032, | |
| "step": 954 | |
| }, | |
| { | |
| "epoch": 0.4775, | |
| "grad_norm": 0.24182823300361633, | |
| "learning_rate": 3.114686382021681e-07, | |
| "loss": 1.6151, | |
| "num_input_tokens_seen": 8011120640, | |
| "step": 955 | |
| }, | |
| { | |
| "epoch": 0.478, | |
| "grad_norm": 0.23201249539852142, | |
| "learning_rate": 2.9781347690452266e-07, | |
| "loss": 1.374, | |
| "num_input_tokens_seen": 8019509248, | |
| "step": 956 | |
| }, | |
| { | |
| "epoch": 0.4785, | |
| "grad_norm": 0.28376397490501404, | |
| "learning_rate": 2.8446214769582534e-07, | |
| "loss": 1.4513, | |
| "num_input_tokens_seen": 8027897856, | |
| "step": 957 | |
| }, | |
| { | |
| "epoch": 0.479, | |
| "grad_norm": 0.27368590235710144, | |
| "learning_rate": 2.714148564700914e-07, | |
| "loss": 1.5391, | |
| "num_input_tokens_seen": 8036286464, | |
| "step": 958 | |
| }, | |
| { | |
| "epoch": 0.4795, | |
| "grad_norm": 0.22406716644763947, | |
| "learning_rate": 2.586718044326886e-07, | |
| "loss": 1.3113, | |
| "num_input_tokens_seen": 8044675072, | |
| "step": 959 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.23694166541099548, | |
| "learning_rate": 2.462331880972468e-07, | |
| "loss": 1.4956, | |
| "num_input_tokens_seen": 8053063680, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.4805, | |
| "grad_norm": 0.23591896891593933, | |
| "learning_rate": 2.340991992826136e-07, | |
| "loss": 1.573, | |
| "num_input_tokens_seen": 8061452288, | |
| "step": 961 | |
| }, | |
| { | |
| "epoch": 0.481, | |
| "grad_norm": 0.2507438659667969, | |
| "learning_rate": 2.222700251099097e-07, | |
| "loss": 1.4986, | |
| "num_input_tokens_seen": 8069840896, | |
| "step": 962 | |
| }, | |
| { | |
| "epoch": 0.4815, | |
| "grad_norm": 0.23362399637699127, | |
| "learning_rate": 2.107458479996316e-07, | |
| "loss": 1.4341, | |
| "num_input_tokens_seen": 8078229504, | |
| "step": 963 | |
| }, | |
| { | |
| "epoch": 0.482, | |
| "grad_norm": 0.2520342767238617, | |
| "learning_rate": 1.9952684566884927e-07, | |
| "loss": 1.5952, | |
| "num_input_tokens_seen": 8086618112, | |
| "step": 964 | |
| }, | |
| { | |
| "epoch": 0.4825, | |
| "grad_norm": 0.2540062367916107, | |
| "learning_rate": 1.88613191128455e-07, | |
| "loss": 1.4212, | |
| "num_input_tokens_seen": 8095006720, | |
| "step": 965 | |
| }, | |
| { | |
| "epoch": 0.483, | |
| "grad_norm": 0.2427206039428711, | |
| "learning_rate": 1.780050526805055e-07, | |
| "loss": 1.36, | |
| "num_input_tokens_seen": 8103395328, | |
| "step": 966 | |
| }, | |
| { | |
| "epoch": 0.4835, | |
| "grad_norm": 0.23076407611370087, | |
| "learning_rate": 1.6770259391561518e-07, | |
| "loss": 1.532, | |
| "num_input_tokens_seen": 8111783936, | |
| "step": 967 | |
| }, | |
| { | |
| "epoch": 0.484, | |
| "grad_norm": 0.2622830271720886, | |
| "learning_rate": 1.577059737104447e-07, | |
| "loss": 1.5584, | |
| "num_input_tokens_seen": 8120172544, | |
| "step": 968 | |
| }, | |
| { | |
| "epoch": 0.4845, | |
| "grad_norm": 0.24084292352199554, | |
| "learning_rate": 1.4801534622524316e-07, | |
| "loss": 1.4908, | |
| "num_input_tokens_seen": 8128561152, | |
| "step": 969 | |
| }, | |
| { | |
| "epoch": 0.485, | |
| "grad_norm": 0.23624394834041595, | |
| "learning_rate": 1.3863086090147415e-07, | |
| "loss": 1.5083, | |
| "num_input_tokens_seen": 8136949760, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.4855, | |
| "grad_norm": 0.2352854162454605, | |
| "learning_rate": 1.2955266245951338e-07, | |
| "loss": 1.5316, | |
| "num_input_tokens_seen": 8145338368, | |
| "step": 971 | |
| }, | |
| { | |
| "epoch": 0.486, | |
| "grad_norm": 0.2318330854177475, | |
| "learning_rate": 1.2078089089640809e-07, | |
| "loss": 1.4232, | |
| "num_input_tokens_seen": 8153726976, | |
| "step": 972 | |
| }, | |
| { | |
| "epoch": 0.4865, | |
| "grad_norm": 0.23645414412021637, | |
| "learning_rate": 1.1231568148372562e-07, | |
| "loss": 1.4502, | |
| "num_input_tokens_seen": 8162115584, | |
| "step": 973 | |
| }, | |
| { | |
| "epoch": 0.487, | |
| "grad_norm": 0.237844780087471, | |
| "learning_rate": 1.0415716476547045e-07, | |
| "loss": 1.5448, | |
| "num_input_tokens_seen": 8170504192, | |
| "step": 974 | |
| }, | |
| { | |
| "epoch": 0.4875, | |
| "grad_norm": 0.23230130970478058, | |
| "learning_rate": 9.630546655606365e-08, | |
| "loss": 1.5471, | |
| "num_input_tokens_seen": 8178892800, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 0.488, | |
| "grad_norm": 0.25277188420295715, | |
| "learning_rate": 8.876070793840008e-08, | |
| "loss": 1.5526, | |
| "num_input_tokens_seen": 8187281408, | |
| "step": 976 | |
| }, | |
| { | |
| "epoch": 0.4885, | |
| "grad_norm": 0.23660285770893097, | |
| "learning_rate": 8.15230052619942e-08, | |
| "loss": 1.1917, | |
| "num_input_tokens_seen": 8195670016, | |
| "step": 977 | |
| }, | |
| { | |
| "epoch": 0.489, | |
| "grad_norm": 0.243132546544075, | |
| "learning_rate": 7.459247014117488e-08, | |
| "loss": 1.5268, | |
| "num_input_tokens_seen": 8204058624, | |
| "step": 978 | |
| }, | |
| { | |
| "epoch": 0.4895, | |
| "grad_norm": 0.23707322776317596, | |
| "learning_rate": 6.796920945336682e-08, | |
| "loss": 1.471, | |
| "num_input_tokens_seen": 8212447232, | |
| "step": 979 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 0.24316631257534027, | |
| "learning_rate": 6.165332533744072e-08, | |
| "loss": 1.4282, | |
| "num_input_tokens_seen": 8220835840, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.4905, | |
| "grad_norm": 0.2317165583372116, | |
| "learning_rate": 5.5644915192145654e-08, | |
| "loss": 1.5112, | |
| "num_input_tokens_seen": 8229224448, | |
| "step": 981 | |
| }, | |
| { | |
| "epoch": 0.491, | |
| "grad_norm": 0.22775763273239136, | |
| "learning_rate": 4.9944071674599135e-08, | |
| "loss": 1.4401, | |
| "num_input_tokens_seen": 8237613056, | |
| "step": 982 | |
| }, | |
| { | |
| "epoch": 0.4915, | |
| "grad_norm": 0.2643609046936035, | |
| "learning_rate": 4.4550882698857214e-08, | |
| "loss": 1.6142, | |
| "num_input_tokens_seen": 8246001664, | |
| "step": 983 | |
| }, | |
| { | |
| "epoch": 0.492, | |
| "grad_norm": 0.236323744058609, | |
| "learning_rate": 3.946543143456882e-08, | |
| "loss": 1.4429, | |
| "num_input_tokens_seen": 8254390272, | |
| "step": 984 | |
| }, | |
| { | |
| "epoch": 0.4925, | |
| "grad_norm": 1.1297415494918823, | |
| "learning_rate": 3.468779630568353e-08, | |
| "loss": 1.5828, | |
| "num_input_tokens_seen": 8262778880, | |
| "step": 985 | |
| }, | |
| { | |
| "epoch": 0.493, | |
| "grad_norm": 0.23412106931209564, | |
| "learning_rate": 3.021805098924136e-08, | |
| "loss": 1.4264, | |
| "num_input_tokens_seen": 8271167488, | |
| "step": 986 | |
| }, | |
| { | |
| "epoch": 0.4935, | |
| "grad_norm": 0.23799893260002136, | |
| "learning_rate": 2.6056264414249245e-08, | |
| "loss": 1.6704, | |
| "num_input_tokens_seen": 8279556096, | |
| "step": 987 | |
| }, | |
| { | |
| "epoch": 0.494, | |
| "grad_norm": 0.23002925515174866, | |
| "learning_rate": 2.220250076060193e-08, | |
| "loss": 1.4746, | |
| "num_input_tokens_seen": 8287944704, | |
| "step": 988 | |
| }, | |
| { | |
| "epoch": 0.4945, | |
| "grad_norm": 0.35217779874801636, | |
| "learning_rate": 1.8656819458100496e-08, | |
| "loss": 1.4988, | |
| "num_input_tokens_seen": 8296333312, | |
| "step": 989 | |
| }, | |
| { | |
| "epoch": 0.495, | |
| "grad_norm": 0.2293684184551239, | |
| "learning_rate": 1.541927518554198e-08, | |
| "loss": 1.5303, | |
| "num_input_tokens_seen": 8304721920, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.4955, | |
| "grad_norm": 0.2379152923822403, | |
| "learning_rate": 1.2489917869860091e-08, | |
| "loss": 1.4344, | |
| "num_input_tokens_seen": 8313110528, | |
| "step": 991 | |
| }, | |
| { | |
| "epoch": 0.496, | |
| "grad_norm": 0.26264408230781555, | |
| "learning_rate": 9.868792685368001e-09, | |
| "loss": 1.6185, | |
| "num_input_tokens_seen": 8321499136, | |
| "step": 992 | |
| }, | |
| { | |
| "epoch": 0.4965, | |
| "grad_norm": 1.379636526107788, | |
| "learning_rate": 7.55594005306337e-09, | |
| "loss": 1.6366, | |
| "num_input_tokens_seen": 8329887744, | |
| "step": 993 | |
| }, | |
| { | |
| "epoch": 0.497, | |
| "grad_norm": 0.25221097469329834, | |
| "learning_rate": 5.551395639988855e-09, | |
| "loss": 1.4015, | |
| "num_input_tokens_seen": 8338276352, | |
| "step": 994 | |
| }, | |
| { | |
| "epoch": 0.4975, | |
| "grad_norm": 0.258603572845459, | |
| "learning_rate": 3.855190358703631e-09, | |
| "loss": 1.6689, | |
| "num_input_tokens_seen": 8346664960, | |
| "step": 995 | |
| }, | |
| { | |
| "epoch": 0.498, | |
| "grad_norm": 0.2482948750257492, | |
| "learning_rate": 2.467350366788246e-09, | |
| "loss": 1.3846, | |
| "num_input_tokens_seen": 8355053568, | |
| "step": 996 | |
| }, | |
| { | |
| "epoch": 0.4985, | |
| "grad_norm": 0.25976622104644775, | |
| "learning_rate": 1.3878970664538138e-09, | |
| "loss": 1.5265, | |
| "num_input_tokens_seen": 8363442176, | |
| "step": 997 | |
| }, | |
| { | |
| "epoch": 0.499, | |
| "grad_norm": 0.22654418647289276, | |
| "learning_rate": 6.168471042067303e-10, | |
| "loss": 1.4878, | |
| "num_input_tokens_seen": 8371830784, | |
| "step": 998 | |
| }, | |
| { | |
| "epoch": 0.4995, | |
| "grad_norm": 0.23816251754760742, | |
| "learning_rate": 1.5421237058887984e-10, | |
| "loss": 1.6286, | |
| "num_input_tokens_seen": 8380219392, | |
| "step": 999 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 0.24939778447151184, | |
| "learning_rate": 0.0, | |
| "loss": 1.5803, | |
| "num_input_tokens_seen": 8388608000, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.5005, | |
| "grad_norm": 0.24398168921470642, | |
| "learning_rate": 1.5421237058887984e-10, | |
| "loss": 1.499, | |
| "num_input_tokens_seen": 8396996608, | |
| "step": 1001 | |
| }, | |
| { | |
| "epoch": 0.5005, | |
| "num_input_tokens_seen": 8396996608, | |
| "step": 1001, | |
| "total_flos": 5.908015032569954e+18, | |
| "train_loss": 0.001497477203696877, | |
| "train_runtime": 201.1036, | |
| "train_samples_per_second": 318.244, | |
| "train_steps_per_second": 4.973 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 1000, | |
| "num_input_tokens_seen": 8396996608, | |
| "num_train_epochs": 1, | |
| "save_steps": 200, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.908015032569954e+18, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |