| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 4.953191489361702, | |
| "eval_steps": 500, | |
| "global_step": 440, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.011347517730496455, | |
| "grad_norm": 28.768582165086425, | |
| "learning_rate": 4.999936276068748e-05, | |
| "loss": 2.5462, | |
| "num_input_tokens_seen": 262144, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.02269503546099291, | |
| "grad_norm": 93.43249139527468, | |
| "learning_rate": 4.9997451075235834e-05, | |
| "loss": 4.367, | |
| "num_input_tokens_seen": 524288, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.03404255319148936, | |
| "grad_norm": 27.45199589426381, | |
| "learning_rate": 4.999426504110115e-05, | |
| "loss": 3.7754, | |
| "num_input_tokens_seen": 786432, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.04539007092198582, | |
| "grad_norm": 21.466687229475028, | |
| "learning_rate": 4.9989804820704735e-05, | |
| "loss": 3.4412, | |
| "num_input_tokens_seen": 1048576, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.05673758865248227, | |
| "grad_norm": 8.511408964141404, | |
| "learning_rate": 4.99840706414248e-05, | |
| "loss": 2.5369, | |
| "num_input_tokens_seen": 1310720, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.06808510638297872, | |
| "grad_norm": 17.110148299969282, | |
| "learning_rate": 4.9977062795584893e-05, | |
| "loss": 2.531, | |
| "num_input_tokens_seen": 1572864, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.07943262411347518, | |
| "grad_norm": 8.528722034783323, | |
| "learning_rate": 4.9968781640439026e-05, | |
| "loss": 2.2971, | |
| "num_input_tokens_seen": 1835008, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.09078014184397164, | |
| "grad_norm": 7.852506050761374, | |
| "learning_rate": 4.995922759815339e-05, | |
| "loss": 2.2175, | |
| "num_input_tokens_seen": 2097152, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.10212765957446808, | |
| "grad_norm": 5.024862487166255, | |
| "learning_rate": 4.9948401155784904e-05, | |
| "loss": 2.0184, | |
| "num_input_tokens_seen": 2359296, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.11347517730496454, | |
| "grad_norm": 5.07110237293465, | |
| "learning_rate": 4.993630286525634e-05, | |
| "loss": 1.97, | |
| "num_input_tokens_seen": 2621440, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.12482269503546099, | |
| "grad_norm": 3.902962877759486, | |
| "learning_rate": 4.99229333433282e-05, | |
| "loss": 1.9201, | |
| "num_input_tokens_seen": 2883584, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.13617021276595745, | |
| "grad_norm": 3.6676315365476855, | |
| "learning_rate": 4.9908293271567286e-05, | |
| "loss": 1.8835, | |
| "num_input_tokens_seen": 3145728, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.1475177304964539, | |
| "grad_norm": 2.929600242853849, | |
| "learning_rate": 4.9892383396311934e-05, | |
| "loss": 1.808, | |
| "num_input_tokens_seen": 3407872, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.15886524822695036, | |
| "grad_norm": 2.5508693562661717, | |
| "learning_rate": 4.987520452863399e-05, | |
| "loss": 1.7956, | |
| "num_input_tokens_seen": 3670016, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.1702127659574468, | |
| "grad_norm": 2.2262214099883053, | |
| "learning_rate": 4.985675754429744e-05, | |
| "loss": 1.781, | |
| "num_input_tokens_seen": 3932160, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.18156028368794327, | |
| "grad_norm": 2.8994102519435656, | |
| "learning_rate": 4.9837043383713753e-05, | |
| "loss": 1.7132, | |
| "num_input_tokens_seen": 4194304, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.19290780141843972, | |
| "grad_norm": 3.2652620723767845, | |
| "learning_rate": 4.981606305189401e-05, | |
| "loss": 1.7316, | |
| "num_input_tokens_seen": 4456448, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.20425531914893616, | |
| "grad_norm": 2.588037121879393, | |
| "learning_rate": 4.979381761839757e-05, | |
| "loss": 1.7198, | |
| "num_input_tokens_seen": 4718592, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.21560283687943263, | |
| "grad_norm": 1.9459781834170677, | |
| "learning_rate": 4.9770308217277614e-05, | |
| "loss": 1.6976, | |
| "num_input_tokens_seen": 4980736, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.22695035460992907, | |
| "grad_norm": 4.291046579259317, | |
| "learning_rate": 4.9745536047023324e-05, | |
| "loss": 1.7159, | |
| "num_input_tokens_seen": 5242880, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.23829787234042554, | |
| "grad_norm": 1.9625030923747173, | |
| "learning_rate": 4.971950237049874e-05, | |
| "loss": 1.6871, | |
| "num_input_tokens_seen": 5505024, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.24964539007092199, | |
| "grad_norm": 3.677910756835627, | |
| "learning_rate": 4.9692208514878444e-05, | |
| "loss": 1.641, | |
| "num_input_tokens_seen": 5767168, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.26099290780141843, | |
| "grad_norm": 2.2017196904664247, | |
| "learning_rate": 4.966365587157986e-05, | |
| "loss": 1.6336, | |
| "num_input_tokens_seen": 6029312, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.2723404255319149, | |
| "grad_norm": 2.9082791522352958, | |
| "learning_rate": 4.963384589619233e-05, | |
| "loss": 1.6184, | |
| "num_input_tokens_seen": 6291456, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.28368794326241137, | |
| "grad_norm": 2.306973394969553, | |
| "learning_rate": 4.96027801084029e-05, | |
| "loss": 1.6175, | |
| "num_input_tokens_seen": 6553600, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.2950354609929078, | |
| "grad_norm": 1.8276771179473579, | |
| "learning_rate": 4.957046009191889e-05, | |
| "loss": 1.6145, | |
| "num_input_tokens_seen": 6815744, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.30638297872340425, | |
| "grad_norm": 2.7024583313160213, | |
| "learning_rate": 4.95368874943871e-05, | |
| "loss": 1.5905, | |
| "num_input_tokens_seen": 7077888, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.3177304964539007, | |
| "grad_norm": 1.8227465911208784, | |
| "learning_rate": 4.9502064027309836e-05, | |
| "loss": 1.5847, | |
| "num_input_tokens_seen": 7340032, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.32907801418439714, | |
| "grad_norm": 2.189585770318853, | |
| "learning_rate": 4.946599146595769e-05, | |
| "loss": 1.5862, | |
| "num_input_tokens_seen": 7602176, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.3404255319148936, | |
| "grad_norm": 1.6963880678360608, | |
| "learning_rate": 4.942867164927899e-05, | |
| "loss": 1.5856, | |
| "num_input_tokens_seen": 7864320, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.3517730496453901, | |
| "grad_norm": 1.884475186213032, | |
| "learning_rate": 4.9390106479806085e-05, | |
| "loss": 1.5462, | |
| "num_input_tokens_seen": 8126464, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.36312056737588655, | |
| "grad_norm": 1.7633790709929305, | |
| "learning_rate": 4.935029792355834e-05, | |
| "loss": 1.5519, | |
| "num_input_tokens_seen": 8388608, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.37446808510638296, | |
| "grad_norm": 1.6293655236791516, | |
| "learning_rate": 4.9309248009941914e-05, | |
| "loss": 1.5426, | |
| "num_input_tokens_seen": 8650752, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.38581560283687943, | |
| "grad_norm": 1.6363171652251638, | |
| "learning_rate": 4.9266958831646315e-05, | |
| "loss": 1.5179, | |
| "num_input_tokens_seen": 8912896, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.3971631205673759, | |
| "grad_norm": 1.76404697988344, | |
| "learning_rate": 4.922343254453768e-05, | |
| "loss": 1.5046, | |
| "num_input_tokens_seen": 9175040, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.4085106382978723, | |
| "grad_norm": 1.8922889495300352, | |
| "learning_rate": 4.917867136754893e-05, | |
| "loss": 1.5147, | |
| "num_input_tokens_seen": 9437184, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.4198581560283688, | |
| "grad_norm": 1.4943749144982619, | |
| "learning_rate": 4.913267758256658e-05, | |
| "loss": 1.5326, | |
| "num_input_tokens_seen": 9699328, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.43120567375886526, | |
| "grad_norm": 1.652782449680942, | |
| "learning_rate": 4.9085453534314476e-05, | |
| "loss": 1.5253, | |
| "num_input_tokens_seen": 9961472, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.4425531914893617, | |
| "grad_norm": 1.8558129403608419, | |
| "learning_rate": 4.9037001630234215e-05, | |
| "loss": 1.5003, | |
| "num_input_tokens_seen": 10223616, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.45390070921985815, | |
| "grad_norm": 1.664990274441562, | |
| "learning_rate": 4.898732434036244e-05, | |
| "loss": 1.5021, | |
| "num_input_tokens_seen": 10485760, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.4652482269503546, | |
| "grad_norm": 1.7198271557244196, | |
| "learning_rate": 4.893642419720491e-05, | |
| "loss": 1.4748, | |
| "num_input_tokens_seen": 10747904, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.4765957446808511, | |
| "grad_norm": 1.7242786140938589, | |
| "learning_rate": 4.888430379560742e-05, | |
| "loss": 1.5064, | |
| "num_input_tokens_seen": 11010048, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.4879432624113475, | |
| "grad_norm": 1.289994987213828, | |
| "learning_rate": 4.883096579262346e-05, | |
| "loss": 1.4787, | |
| "num_input_tokens_seen": 11272192, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.49929078014184397, | |
| "grad_norm": 2.0636727883365693, | |
| "learning_rate": 4.877641290737884e-05, | |
| "loss": 1.5032, | |
| "num_input_tokens_seen": 11534336, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.5106382978723404, | |
| "grad_norm": 1.50384393546133, | |
| "learning_rate": 4.872064792093299e-05, | |
| "loss": 1.5017, | |
| "num_input_tokens_seen": 11796480, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.5219858156028369, | |
| "grad_norm": 1.7274141300795698, | |
| "learning_rate": 4.866367367613725e-05, | |
| "loss": 1.4899, | |
| "num_input_tokens_seen": 12058624, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.5333333333333333, | |
| "grad_norm": 1.4776150452075232, | |
| "learning_rate": 4.86054930774899e-05, | |
| "loss": 1.4501, | |
| "num_input_tokens_seen": 12320768, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.5446808510638298, | |
| "grad_norm": 2.0772123001061296, | |
| "learning_rate": 4.854610909098812e-05, | |
| "loss": 1.4729, | |
| "num_input_tokens_seen": 12582912, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.5560283687943263, | |
| "grad_norm": 1.4784518476625488, | |
| "learning_rate": 4.848552474397676e-05, | |
| "loss": 1.4639, | |
| "num_input_tokens_seen": 12845056, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.5673758865248227, | |
| "grad_norm": 1.5172983637361528, | |
| "learning_rate": 4.842374312499405e-05, | |
| "loss": 1.4626, | |
| "num_input_tokens_seen": 13107200, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.5787234042553191, | |
| "grad_norm": 1.590645940567932, | |
| "learning_rate": 4.836076738361408e-05, | |
| "loss": 1.4767, | |
| "num_input_tokens_seen": 13369344, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.5900709219858156, | |
| "grad_norm": 1.5086587054410023, | |
| "learning_rate": 4.829660073028631e-05, | |
| "loss": 1.453, | |
| "num_input_tokens_seen": 13631488, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.601418439716312, | |
| "grad_norm": 1.4196104282840498, | |
| "learning_rate": 4.823124643617187e-05, | |
| "loss": 1.4406, | |
| "num_input_tokens_seen": 13893632, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.6127659574468085, | |
| "grad_norm": 1.5140910517728237, | |
| "learning_rate": 4.8164707832976783e-05, | |
| "loss": 1.4498, | |
| "num_input_tokens_seen": 14155776, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.624113475177305, | |
| "grad_norm": 1.3210319443750393, | |
| "learning_rate": 4.8096988312782174e-05, | |
| "loss": 1.4288, | |
| "num_input_tokens_seen": 14417920, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.6354609929078014, | |
| "grad_norm": 1.5120891443494813, | |
| "learning_rate": 4.802809132787125e-05, | |
| "loss": 1.4267, | |
| "num_input_tokens_seen": 14680064, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.6468085106382979, | |
| "grad_norm": 1.7385310877259594, | |
| "learning_rate": 4.7958020390553426e-05, | |
| "loss": 1.4775, | |
| "num_input_tokens_seen": 14942208, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.6581560283687943, | |
| "grad_norm": 1.3858014810326225, | |
| "learning_rate": 4.7886779072985156e-05, | |
| "loss": 1.4387, | |
| "num_input_tokens_seen": 15204352, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.6695035460992907, | |
| "grad_norm": 1.8605931025096338, | |
| "learning_rate": 4.78143710069879e-05, | |
| "loss": 1.4093, | |
| "num_input_tokens_seen": 15466496, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.6808510638297872, | |
| "grad_norm": 1.3831101615966328, | |
| "learning_rate": 4.774079988386296e-05, | |
| "loss": 1.421, | |
| "num_input_tokens_seen": 15728640, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.6921985815602837, | |
| "grad_norm": 1.6322196763337964, | |
| "learning_rate": 4.766606945420329e-05, | |
| "loss": 1.4411, | |
| "num_input_tokens_seen": 15990784, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.7035460992907802, | |
| "grad_norm": 1.6072671722268586, | |
| "learning_rate": 4.759018352770229e-05, | |
| "loss": 1.4283, | |
| "num_input_tokens_seen": 16252928, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.7148936170212766, | |
| "grad_norm": 1.2535166201622518, | |
| "learning_rate": 4.751314597295963e-05, | |
| "loss": 1.4526, | |
| "num_input_tokens_seen": 16515072, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.7262411347517731, | |
| "grad_norm": 1.5543521846201784, | |
| "learning_rate": 4.743496071728396e-05, | |
| "loss": 1.4148, | |
| "num_input_tokens_seen": 16777216, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.7375886524822695, | |
| "grad_norm": 1.3594873685564324, | |
| "learning_rate": 4.735563174649278e-05, | |
| "loss": 1.3976, | |
| "num_input_tokens_seen": 17039360, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.7489361702127659, | |
| "grad_norm": 1.3662132975871213, | |
| "learning_rate": 4.72751631047092e-05, | |
| "loss": 1.4137, | |
| "num_input_tokens_seen": 17301504, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.7602836879432624, | |
| "grad_norm": 1.2729255728645377, | |
| "learning_rate": 4.719355889415576e-05, | |
| "loss": 1.3951, | |
| "num_input_tokens_seen": 17563648, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.7716312056737589, | |
| "grad_norm": 1.2598405597617752, | |
| "learning_rate": 4.711082327494536e-05, | |
| "loss": 1.4049, | |
| "num_input_tokens_seen": 17825792, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.7829787234042553, | |
| "grad_norm": 1.336014117151041, | |
| "learning_rate": 4.7026960464869116e-05, | |
| "loss": 1.4167, | |
| "num_input_tokens_seen": 18087936, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.7943262411347518, | |
| "grad_norm": 1.4872379534302926, | |
| "learning_rate": 4.6941974739181395e-05, | |
| "loss": 1.4048, | |
| "num_input_tokens_seen": 18350080, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.8056737588652483, | |
| "grad_norm": 1.2920324415831732, | |
| "learning_rate": 4.6855870430381816e-05, | |
| "loss": 1.4083, | |
| "num_input_tokens_seen": 18612224, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.8170212765957446, | |
| "grad_norm": 1.6143627607763888, | |
| "learning_rate": 4.6768651927994434e-05, | |
| "loss": 1.3906, | |
| "num_input_tokens_seen": 18874368, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.8283687943262411, | |
| "grad_norm": 1.2568838039190413, | |
| "learning_rate": 4.668032367834392e-05, | |
| "loss": 1.3973, | |
| "num_input_tokens_seen": 19136512, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.8397163120567376, | |
| "grad_norm": 1.5547346669542705, | |
| "learning_rate": 4.6590890184328925e-05, | |
| "loss": 1.3918, | |
| "num_input_tokens_seen": 19398656, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.851063829787234, | |
| "grad_norm": 1.4356818846930628, | |
| "learning_rate": 4.6500356005192514e-05, | |
| "loss": 1.3819, | |
| "num_input_tokens_seen": 19660800, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.8624113475177305, | |
| "grad_norm": 1.3284920385369607, | |
| "learning_rate": 4.640872575628973e-05, | |
| "loss": 1.3933, | |
| "num_input_tokens_seen": 19922944, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.873758865248227, | |
| "grad_norm": 1.107701469460885, | |
| "learning_rate": 4.6316004108852305e-05, | |
| "loss": 1.4081, | |
| "num_input_tokens_seen": 20185088, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.8851063829787233, | |
| "grad_norm": 1.2529354895940894, | |
| "learning_rate": 4.622219578975057e-05, | |
| "loss": 1.3801, | |
| "num_input_tokens_seen": 20447232, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.8964539007092198, | |
| "grad_norm": 1.3380392380500745, | |
| "learning_rate": 4.6127305581252414e-05, | |
| "loss": 1.3655, | |
| "num_input_tokens_seen": 20709376, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.9078014184397163, | |
| "grad_norm": 1.2047088627238154, | |
| "learning_rate": 4.6031338320779534e-05, | |
| "loss": 1.3896, | |
| "num_input_tokens_seen": 20971520, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.9191489361702128, | |
| "grad_norm": 1.4991837815250253, | |
| "learning_rate": 4.593429890066082e-05, | |
| "loss": 1.405, | |
| "num_input_tokens_seen": 21233664, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.9304964539007092, | |
| "grad_norm": 1.1414547231387788, | |
| "learning_rate": 4.583619226788294e-05, | |
| "loss": 1.3843, | |
| "num_input_tokens_seen": 21495808, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.9418439716312057, | |
| "grad_norm": 1.4695936511434455, | |
| "learning_rate": 4.573702342383816e-05, | |
| "loss": 1.3698, | |
| "num_input_tokens_seen": 21757952, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.9531914893617022, | |
| "grad_norm": 1.2845207314880724, | |
| "learning_rate": 4.563679742406935e-05, | |
| "loss": 1.3806, | |
| "num_input_tokens_seen": 22020096, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.9645390070921985, | |
| "grad_norm": 1.4475613066375035, | |
| "learning_rate": 4.5535519378012295e-05, | |
| "loss": 1.3715, | |
| "num_input_tokens_seen": 22282240, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.975886524822695, | |
| "grad_norm": 1.4999411758550487, | |
| "learning_rate": 4.543319444873517e-05, | |
| "loss": 1.3718, | |
| "num_input_tokens_seen": 22544384, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.9872340425531915, | |
| "grad_norm": 1.261663898483524, | |
| "learning_rate": 4.532982785267541e-05, | |
| "loss": 1.3564, | |
| "num_input_tokens_seen": 22806528, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.9985815602836879, | |
| "grad_norm": 1.3464933652070896, | |
| "learning_rate": 4.522542485937369e-05, | |
| "loss": 1.3744, | |
| "num_input_tokens_seen": 23068672, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 1.3464933652070896, | |
| "learning_rate": 4.511999079120534e-05, | |
| "loss": 1.2363, | |
| "num_input_tokens_seen": 23101440, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 1.0113475177304965, | |
| "grad_norm": 3.3795576075508627, | |
| "learning_rate": 4.5013531023109014e-05, | |
| "loss": 1.0013, | |
| "num_input_tokens_seen": 23363584, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 1.022695035460993, | |
| "grad_norm": 1.98501369471366, | |
| "learning_rate": 4.4906050982312664e-05, | |
| "loss": 1.0396, | |
| "num_input_tokens_seen": 23625728, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 1.0340425531914894, | |
| "grad_norm": 1.7550611786324055, | |
| "learning_rate": 4.479755614805688e-05, | |
| "loss": 0.967, | |
| "num_input_tokens_seen": 23887872, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 1.0453900709219859, | |
| "grad_norm": 2.3047442852698063, | |
| "learning_rate": 4.4688052051315545e-05, | |
| "loss": 1.0097, | |
| "num_input_tokens_seen": 24150016, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 1.0567375886524824, | |
| "grad_norm": 1.4568858521127028, | |
| "learning_rate": 4.457754427451389e-05, | |
| "loss": 0.9699, | |
| "num_input_tokens_seen": 24412160, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 1.0680851063829788, | |
| "grad_norm": 1.9193925592577146, | |
| "learning_rate": 4.446603845124388e-05, | |
| "loss": 0.9548, | |
| "num_input_tokens_seen": 24674304, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 1.0794326241134753, | |
| "grad_norm": 1.3493703965773043, | |
| "learning_rate": 4.4353540265977064e-05, | |
| "loss": 0.9504, | |
| "num_input_tokens_seen": 24936448, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 1.0907801418439715, | |
| "grad_norm": 1.7615744346425224, | |
| "learning_rate": 4.4240055453774734e-05, | |
| "loss": 0.9807, | |
| "num_input_tokens_seen": 25198592, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 1.102127659574468, | |
| "grad_norm": 1.8062764758660264, | |
| "learning_rate": 4.412558979999558e-05, | |
| "loss": 0.9363, | |
| "num_input_tokens_seen": 25460736, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 1.1134751773049645, | |
| "grad_norm": 1.7426430146262821, | |
| "learning_rate": 4.401014914000078e-05, | |
| "loss": 0.968, | |
| "num_input_tokens_seen": 25722880, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 1.124822695035461, | |
| "grad_norm": 1.5389141278391227, | |
| "learning_rate": 4.389373935885646e-05, | |
| "loss": 0.9428, | |
| "num_input_tokens_seen": 25985024, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 1.1361702127659574, | |
| "grad_norm": 1.4116557578228721, | |
| "learning_rate": 4.3776366391033746e-05, | |
| "loss": 0.921, | |
| "num_input_tokens_seen": 26247168, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 1.147517730496454, | |
| "grad_norm": 1.4555202749322727, | |
| "learning_rate": 4.365803622010618e-05, | |
| "loss": 0.9368, | |
| "num_input_tokens_seen": 26509312, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 1.1588652482269504, | |
| "grad_norm": 1.51393536895009, | |
| "learning_rate": 4.35387548784447e-05, | |
| "loss": 0.9424, | |
| "num_input_tokens_seen": 26771456, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 1.1702127659574468, | |
| "grad_norm": 1.3497959200668166, | |
| "learning_rate": 4.341852844691012e-05, | |
| "loss": 0.9395, | |
| "num_input_tokens_seen": 27033600, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 1.1815602836879433, | |
| "grad_norm": 1.963720316682781, | |
| "learning_rate": 4.329736305454314e-05, | |
| "loss": 0.9575, | |
| "num_input_tokens_seen": 27295744, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 1.1929078014184398, | |
| "grad_norm": 1.5398810468770388, | |
| "learning_rate": 4.3175264878251845e-05, | |
| "loss": 0.9087, | |
| "num_input_tokens_seen": 27557888, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 1.2042553191489362, | |
| "grad_norm": 2.249980638617661, | |
| "learning_rate": 4.305224014249688e-05, | |
| "loss": 0.9613, | |
| "num_input_tokens_seen": 27820032, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 1.2156028368794327, | |
| "grad_norm": 1.7994413618778806, | |
| "learning_rate": 4.292829511897409e-05, | |
| "loss": 0.9336, | |
| "num_input_tokens_seen": 28082176, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 1.226950354609929, | |
| "grad_norm": 2.1172062063526695, | |
| "learning_rate": 4.280343612629479e-05, | |
| "loss": 0.9444, | |
| "num_input_tokens_seen": 28344320, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 1.2382978723404254, | |
| "grad_norm": 1.689025623361111, | |
| "learning_rate": 4.267766952966369e-05, | |
| "loss": 0.9054, | |
| "num_input_tokens_seen": 28606464, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 1.249645390070922, | |
| "grad_norm": 1.7237793371868935, | |
| "learning_rate": 4.255100174055434e-05, | |
| "loss": 0.9391, | |
| "num_input_tokens_seen": 28868608, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 1.2609929078014184, | |
| "grad_norm": 1.5807920095324717, | |
| "learning_rate": 4.242343921638234e-05, | |
| "loss": 0.9232, | |
| "num_input_tokens_seen": 29130752, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 1.2723404255319148, | |
| "grad_norm": 1.5455548794679541, | |
| "learning_rate": 4.22949884601761e-05, | |
| "loss": 0.9383, | |
| "num_input_tokens_seen": 29392896, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 1.2836879432624113, | |
| "grad_norm": 1.505150180088299, | |
| "learning_rate": 4.2165656020245336e-05, | |
| "loss": 0.921, | |
| "num_input_tokens_seen": 29655040, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 1.2950354609929078, | |
| "grad_norm": 1.7329578957192628, | |
| "learning_rate": 4.2035448489847284e-05, | |
| "loss": 0.9525, | |
| "num_input_tokens_seen": 29917184, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 1.3063829787234043, | |
| "grad_norm": 1.4120446620663845, | |
| "learning_rate": 4.1904372506850484e-05, | |
| "loss": 0.8843, | |
| "num_input_tokens_seen": 30179328, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 1.3177304964539007, | |
| "grad_norm": 1.440521563342665, | |
| "learning_rate": 4.1772434753396504e-05, | |
| "loss": 0.9295, | |
| "num_input_tokens_seen": 30441472, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 1.3290780141843972, | |
| "grad_norm": 1.2751014560476446, | |
| "learning_rate": 4.1639641955559205e-05, | |
| "loss": 0.965, | |
| "num_input_tokens_seen": 30703616, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 1.3404255319148937, | |
| "grad_norm": 1.353326348573935, | |
| "learning_rate": 4.1506000883001875e-05, | |
| "loss": 0.9193, | |
| "num_input_tokens_seen": 30965760, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 1.3517730496453901, | |
| "grad_norm": 1.3458043729465723, | |
| "learning_rate": 4.137151834863213e-05, | |
| "loss": 0.9418, | |
| "num_input_tokens_seen": 31227904, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.3631205673758866, | |
| "grad_norm": 1.3301133192597863, | |
| "learning_rate": 4.123620120825459e-05, | |
| "loss": 0.9139, | |
| "num_input_tokens_seen": 31490048, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 1.374468085106383, | |
| "grad_norm": 1.519558123717531, | |
| "learning_rate": 4.1100056360221384e-05, | |
| "loss": 0.9378, | |
| "num_input_tokens_seen": 31752192, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 1.3858156028368795, | |
| "grad_norm": 1.2225151449916911, | |
| "learning_rate": 4.096309074508046e-05, | |
| "loss": 0.9334, | |
| "num_input_tokens_seen": 32014336, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 1.397163120567376, | |
| "grad_norm": 1.3505855040891415, | |
| "learning_rate": 4.082531134522176e-05, | |
| "loss": 0.9227, | |
| "num_input_tokens_seen": 32276480, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 1.4085106382978723, | |
| "grad_norm": 1.1984405892159349, | |
| "learning_rate": 4.06867251845213e-05, | |
| "loss": 0.9232, | |
| "num_input_tokens_seen": 32538624, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 1.4198581560283687, | |
| "grad_norm": 1.2105780592446167, | |
| "learning_rate": 4.054733932798306e-05, | |
| "loss": 0.9126, | |
| "num_input_tokens_seen": 32800768, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 1.4312056737588652, | |
| "grad_norm": 1.4082252271919538, | |
| "learning_rate": 4.0407160881378824e-05, | |
| "loss": 0.9026, | |
| "num_input_tokens_seen": 33062912, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 1.4425531914893617, | |
| "grad_norm": 1.242226249471717, | |
| "learning_rate": 4.0266196990885955e-05, | |
| "loss": 0.9329, | |
| "num_input_tokens_seen": 33325056, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 1.4539007092198581, | |
| "grad_norm": 1.2320327101088704, | |
| "learning_rate": 4.012445484272307e-05, | |
| "loss": 0.9055, | |
| "num_input_tokens_seen": 33587200, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 1.4652482269503546, | |
| "grad_norm": 1.097364604214534, | |
| "learning_rate": 3.9981941662783674e-05, | |
| "loss": 0.9505, | |
| "num_input_tokens_seen": 33849344, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 1.476595744680851, | |
| "grad_norm": 1.287506918091485, | |
| "learning_rate": 3.9838664716267855e-05, | |
| "loss": 0.9205, | |
| "num_input_tokens_seen": 34111488, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 1.4879432624113476, | |
| "grad_norm": 1.1710856488092862, | |
| "learning_rate": 3.969463130731183e-05, | |
| "loss": 0.9757, | |
| "num_input_tokens_seen": 34373632, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 1.499290780141844, | |
| "grad_norm": 1.1761537048031911, | |
| "learning_rate": 3.954984877861565e-05, | |
| "loss": 0.9375, | |
| "num_input_tokens_seen": 34635776, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 1.5106382978723403, | |
| "grad_norm": 1.211479891065261, | |
| "learning_rate": 3.9404324511068825e-05, | |
| "loss": 0.9631, | |
| "num_input_tokens_seen": 34897920, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 1.5219858156028367, | |
| "grad_norm": 1.1227811129578062, | |
| "learning_rate": 3.92580659233741e-05, | |
| "loss": 0.9413, | |
| "num_input_tokens_seen": 35160064, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 1.5333333333333332, | |
| "grad_norm": 1.2565270122423726, | |
| "learning_rate": 3.911108047166924e-05, | |
| "loss": 0.9436, | |
| "num_input_tokens_seen": 35422208, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 1.5446808510638297, | |
| "grad_norm": 1.194511509764168, | |
| "learning_rate": 3.8963375649146866e-05, | |
| "loss": 0.9537, | |
| "num_input_tokens_seen": 35684352, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 1.5560283687943262, | |
| "grad_norm": 1.147953601769395, | |
| "learning_rate": 3.881495898567257e-05, | |
| "loss": 0.9279, | |
| "num_input_tokens_seen": 35946496, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 1.5673758865248226, | |
| "grad_norm": 1.128582567310139, | |
| "learning_rate": 3.866583804740095e-05, | |
| "loss": 0.9053, | |
| "num_input_tokens_seen": 36208640, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 1.578723404255319, | |
| "grad_norm": 1.1883091515233541, | |
| "learning_rate": 3.851602043638994e-05, | |
| "loss": 0.9499, | |
| "num_input_tokens_seen": 36470784, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 1.5900709219858156, | |
| "grad_norm": 1.154451792017842, | |
| "learning_rate": 3.8365513790213265e-05, | |
| "loss": 0.9262, | |
| "num_input_tokens_seen": 36732928, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 1.601418439716312, | |
| "grad_norm": 1.2185944157116932, | |
| "learning_rate": 3.821432578157105e-05, | |
| "loss": 0.934, | |
| "num_input_tokens_seen": 36995072, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 1.6127659574468085, | |
| "grad_norm": 1.1779364162194832, | |
| "learning_rate": 3.8062464117898724e-05, | |
| "loss": 0.9588, | |
| "num_input_tokens_seen": 37257216, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 1.624113475177305, | |
| "grad_norm": 1.1736568320305454, | |
| "learning_rate": 3.790993654097405e-05, | |
| "loss": 0.9616, | |
| "num_input_tokens_seen": 37519360, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 1.6354609929078014, | |
| "grad_norm": 1.174721701784409, | |
| "learning_rate": 3.77567508265225e-05, | |
| "loss": 0.9551, | |
| "num_input_tokens_seen": 37781504, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 1.646808510638298, | |
| "grad_norm": 1.1583059430503744, | |
| "learning_rate": 3.76029147838208e-05, | |
| "loss": 0.9636, | |
| "num_input_tokens_seen": 38043648, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 1.6581560283687944, | |
| "grad_norm": 1.2781768836563048, | |
| "learning_rate": 3.74484362552989e-05, | |
| "loss": 0.949, | |
| "num_input_tokens_seen": 38305792, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 1.6695035460992909, | |
| "grad_norm": 1.2187569053893257, | |
| "learning_rate": 3.72933231161401e-05, | |
| "loss": 0.9325, | |
| "num_input_tokens_seen": 38567936, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 1.6808510638297873, | |
| "grad_norm": 1.1127467597849947, | |
| "learning_rate": 3.713758327387961e-05, | |
| "loss": 0.9175, | |
| "num_input_tokens_seen": 38830080, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 1.6921985815602838, | |
| "grad_norm": 1.5466609004385776, | |
| "learning_rate": 3.6981224668001424e-05, | |
| "loss": 0.9579, | |
| "num_input_tokens_seen": 39092224, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.7035460992907803, | |
| "grad_norm": 1.2711234448756215, | |
| "learning_rate": 3.682425526953359e-05, | |
| "loss": 0.9568, | |
| "num_input_tokens_seen": 39354368, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 1.7148936170212767, | |
| "grad_norm": 1.2561675729769082, | |
| "learning_rate": 3.6666683080641846e-05, | |
| "loss": 0.931, | |
| "num_input_tokens_seen": 39616512, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 1.7262411347517732, | |
| "grad_norm": 1.2872779270009815, | |
| "learning_rate": 3.6508516134221635e-05, | |
| "loss": 0.9456, | |
| "num_input_tokens_seen": 39878656, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 1.7375886524822695, | |
| "grad_norm": 1.211380322340917, | |
| "learning_rate": 3.634976249348867e-05, | |
| "loss": 0.9525, | |
| "num_input_tokens_seen": 40140800, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 1.748936170212766, | |
| "grad_norm": 1.262195787306148, | |
| "learning_rate": 3.619043025156782e-05, | |
| "loss": 0.9254, | |
| "num_input_tokens_seen": 40402944, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 1.7602836879432624, | |
| "grad_norm": 1.158471503672054, | |
| "learning_rate": 3.603052753108053e-05, | |
| "loss": 0.9594, | |
| "num_input_tokens_seen": 40665088, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 1.7716312056737589, | |
| "grad_norm": 1.2179519684336035, | |
| "learning_rate": 3.58700624837308e-05, | |
| "loss": 0.9565, | |
| "num_input_tokens_seen": 40927232, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 1.7829787234042553, | |
| "grad_norm": 1.2186733946981332, | |
| "learning_rate": 3.5709043289889536e-05, | |
| "loss": 0.957, | |
| "num_input_tokens_seen": 41189376, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 1.7943262411347518, | |
| "grad_norm": 1.1317921418808867, | |
| "learning_rate": 3.554747815817756e-05, | |
| "loss": 0.9698, | |
| "num_input_tokens_seen": 41451520, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 1.8056737588652483, | |
| "grad_norm": 1.2084681311643604, | |
| "learning_rate": 3.5385375325047166e-05, | |
| "loss": 0.9534, | |
| "num_input_tokens_seen": 41713664, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.8170212765957445, | |
| "grad_norm": 1.1480526352440725, | |
| "learning_rate": 3.522274305436217e-05, | |
| "loss": 0.9458, | |
| "num_input_tokens_seen": 41975808, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 1.828368794326241, | |
| "grad_norm": 1.158020278288251, | |
| "learning_rate": 3.50595896369767e-05, | |
| "loss": 0.9299, | |
| "num_input_tokens_seen": 42237952, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 1.8397163120567375, | |
| "grad_norm": 1.2652775197204456, | |
| "learning_rate": 3.4895923390312466e-05, | |
| "loss": 0.9702, | |
| "num_input_tokens_seen": 42500096, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 1.851063829787234, | |
| "grad_norm": 1.083067001347801, | |
| "learning_rate": 3.4731752657934794e-05, | |
| "loss": 0.9805, | |
| "num_input_tokens_seen": 42762240, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 1.8624113475177304, | |
| "grad_norm": 1.3218549445020937, | |
| "learning_rate": 3.456708580912725e-05, | |
| "loss": 0.9199, | |
| "num_input_tokens_seen": 43024384, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 1.8737588652482269, | |
| "grad_norm": 1.1602075511901062, | |
| "learning_rate": 3.4401931238464994e-05, | |
| "loss": 0.9702, | |
| "num_input_tokens_seen": 43286528, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 1.8851063829787233, | |
| "grad_norm": 1.0903691972788288, | |
| "learning_rate": 3.423629736538685e-05, | |
| "loss": 0.9444, | |
| "num_input_tokens_seen": 43548672, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 1.8964539007092198, | |
| "grad_norm": 1.3066277139158358, | |
| "learning_rate": 3.4070192633766025e-05, | |
| "loss": 0.9559, | |
| "num_input_tokens_seen": 43810816, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 1.9078014184397163, | |
| "grad_norm": 1.1068620390544621, | |
| "learning_rate": 3.390362551147974e-05, | |
| "loss": 0.9547, | |
| "num_input_tokens_seen": 44072960, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 1.9191489361702128, | |
| "grad_norm": 1.2160218176405404, | |
| "learning_rate": 3.3736604489977466e-05, | |
| "loss": 0.9774, | |
| "num_input_tokens_seen": 44335104, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 1.9304964539007092, | |
| "grad_norm": 1.1169244406424832, | |
| "learning_rate": 3.356913808384807e-05, | |
| "loss": 0.9485, | |
| "num_input_tokens_seen": 44597248, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 1.9418439716312057, | |
| "grad_norm": 1.1440632925298737, | |
| "learning_rate": 3.3401234830385756e-05, | |
| "loss": 0.9556, | |
| "num_input_tokens_seen": 44859392, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 1.9531914893617022, | |
| "grad_norm": 1.156402005245823, | |
| "learning_rate": 3.323290328915483e-05, | |
| "loss": 0.9635, | |
| "num_input_tokens_seen": 45121536, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 1.9645390070921986, | |
| "grad_norm": 1.0465098069882341, | |
| "learning_rate": 3.306415204155335e-05, | |
| "loss": 0.9327, | |
| "num_input_tokens_seen": 45383680, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 1.9758865248226951, | |
| "grad_norm": 1.097782670172884, | |
| "learning_rate": 3.2894989690375626e-05, | |
| "loss": 0.9274, | |
| "num_input_tokens_seen": 45645824, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 1.9872340425531916, | |
| "grad_norm": 1.138066760772727, | |
| "learning_rate": 3.272542485937369e-05, | |
| "loss": 0.9402, | |
| "num_input_tokens_seen": 45907968, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 1.998581560283688, | |
| "grad_norm": 1.1812784275655155, | |
| "learning_rate": 3.255546619281765e-05, | |
| "loss": 0.9699, | |
| "num_input_tokens_seen": 46170112, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 1.1812784275655155, | |
| "learning_rate": 3.2385122355055005e-05, | |
| "loss": 0.7485, | |
| "num_input_tokens_seen": 46202880, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 2.0113475177304965, | |
| "grad_norm": 3.8292467760018236, | |
| "learning_rate": 3.221440203006897e-05, | |
| "loss": 0.5128, | |
| "num_input_tokens_seen": 46465024, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 2.022695035460993, | |
| "grad_norm": 2.6270275589892638, | |
| "learning_rate": 3.2043313921035743e-05, | |
| "loss": 0.494, | |
| "num_input_tokens_seen": 46727168, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 2.0340425531914894, | |
| "grad_norm": 1.9013960211342629, | |
| "learning_rate": 3.1871866749880846e-05, | |
| "loss": 0.4649, | |
| "num_input_tokens_seen": 46989312, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 2.045390070921986, | |
| "grad_norm": 2.6940464488513314, | |
| "learning_rate": 3.170006925683448e-05, | |
| "loss": 0.4609, | |
| "num_input_tokens_seen": 47251456, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 2.0567375886524824, | |
| "grad_norm": 2.9136774444822446, | |
| "learning_rate": 3.152793019998594e-05, | |
| "loss": 0.4303, | |
| "num_input_tokens_seen": 47513600, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 2.068085106382979, | |
| "grad_norm": 2.284711351155428, | |
| "learning_rate": 3.135545835483718e-05, | |
| "loss": 0.4343, | |
| "num_input_tokens_seen": 47775744, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 2.0794326241134753, | |
| "grad_norm": 1.8761627155008236, | |
| "learning_rate": 3.118266251385539e-05, | |
| "loss": 0.4224, | |
| "num_input_tokens_seen": 48037888, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 2.0907801418439718, | |
| "grad_norm": 1.537845244765071, | |
| "learning_rate": 3.100955148602481e-05, | |
| "loss": 0.4173, | |
| "num_input_tokens_seen": 48300032, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 2.1021276595744682, | |
| "grad_norm": 1.4350770359924392, | |
| "learning_rate": 3.083613409639764e-05, | |
| "loss": 0.3962, | |
| "num_input_tokens_seen": 48562176, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 2.1134751773049647, | |
| "grad_norm": 1.3504256900149412, | |
| "learning_rate": 3.0662419185644115e-05, | |
| "loss": 0.393, | |
| "num_input_tokens_seen": 48824320, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 2.124822695035461, | |
| "grad_norm": 1.4687686815955256, | |
| "learning_rate": 3.0488415609601862e-05, | |
| "loss": 0.3826, | |
| "num_input_tokens_seen": 49086464, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 2.1361702127659576, | |
| "grad_norm": 1.275174193399539, | |
| "learning_rate": 3.0314132238824415e-05, | |
| "loss": 0.3961, | |
| "num_input_tokens_seen": 49348608, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 2.147517730496454, | |
| "grad_norm": 1.43179460079197, | |
| "learning_rate": 3.013957795812902e-05, | |
| "loss": 0.3759, | |
| "num_input_tokens_seen": 49610752, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 2.1588652482269506, | |
| "grad_norm": 1.4035121217192592, | |
| "learning_rate": 2.996476166614364e-05, | |
| "loss": 0.3716, | |
| "num_input_tokens_seen": 49872896, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 2.1702127659574466, | |
| "grad_norm": 1.39291598371889, | |
| "learning_rate": 2.9789692274853388e-05, | |
| "loss": 0.3875, | |
| "num_input_tokens_seen": 50135040, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 2.181560283687943, | |
| "grad_norm": 1.630578340810045, | |
| "learning_rate": 2.9614378709146133e-05, | |
| "loss": 0.374, | |
| "num_input_tokens_seen": 50397184, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 2.1929078014184396, | |
| "grad_norm": 1.3814469271709833, | |
| "learning_rate": 2.943882990635759e-05, | |
| "loss": 0.3694, | |
| "num_input_tokens_seen": 50659328, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 2.204255319148936, | |
| "grad_norm": 1.6114694228119848, | |
| "learning_rate": 2.92630548158156e-05, | |
| "loss": 0.3901, | |
| "num_input_tokens_seen": 50921472, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 2.2156028368794325, | |
| "grad_norm": 5.245754079879191, | |
| "learning_rate": 2.9087062398384e-05, | |
| "loss": 0.3825, | |
| "num_input_tokens_seen": 51183616, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 2.226950354609929, | |
| "grad_norm": 2.250670427725288, | |
| "learning_rate": 2.8910861626005776e-05, | |
| "loss": 0.4064, | |
| "num_input_tokens_seen": 51445760, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 2.2382978723404254, | |
| "grad_norm": 1.7095359356479407, | |
| "learning_rate": 2.873446148124563e-05, | |
| "loss": 0.4096, | |
| "num_input_tokens_seen": 51707904, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 2.249645390070922, | |
| "grad_norm": 1.4355808796008729, | |
| "learning_rate": 2.8557870956832132e-05, | |
| "loss": 0.3814, | |
| "num_input_tokens_seen": 51970048, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 2.2609929078014184, | |
| "grad_norm": 1.73510400599197, | |
| "learning_rate": 2.8381099055199222e-05, | |
| "loss": 0.3932, | |
| "num_input_tokens_seen": 52232192, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 2.272340425531915, | |
| "grad_norm": 1.555638605050248, | |
| "learning_rate": 2.8204154788027325e-05, | |
| "loss": 0.3613, | |
| "num_input_tokens_seen": 52494336, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 2.2836879432624113, | |
| "grad_norm": 1.1987449143902036, | |
| "learning_rate": 2.8027047175783873e-05, | |
| "loss": 0.359, | |
| "num_input_tokens_seen": 52756480, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 2.295035460992908, | |
| "grad_norm": 1.442324830425997, | |
| "learning_rate": 2.7849785247263515e-05, | |
| "loss": 0.3897, | |
| "num_input_tokens_seen": 53018624, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 2.3063829787234043, | |
| "grad_norm": 1.3866321638394548, | |
| "learning_rate": 2.767237803912783e-05, | |
| "loss": 0.3945, | |
| "num_input_tokens_seen": 53280768, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 2.3177304964539007, | |
| "grad_norm": 1.2184999424862926, | |
| "learning_rate": 2.7494834595444568e-05, | |
| "loss": 0.3682, | |
| "num_input_tokens_seen": 53542912, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 2.329078014184397, | |
| "grad_norm": 1.2180997743390845, | |
| "learning_rate": 2.731716396722672e-05, | |
| "loss": 0.3686, | |
| "num_input_tokens_seen": 53805056, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 2.3404255319148937, | |
| "grad_norm": 1.315597212375225, | |
| "learning_rate": 2.7139375211970996e-05, | |
| "loss": 0.3683, | |
| "num_input_tokens_seen": 54067200, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 2.35177304964539, | |
| "grad_norm": 1.1389971788403916, | |
| "learning_rate": 2.6961477393196126e-05, | |
| "loss": 0.3656, | |
| "num_input_tokens_seen": 54329344, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 2.3631205673758866, | |
| "grad_norm": 1.1423683445861843, | |
| "learning_rate": 2.6783479579980807e-05, | |
| "loss": 0.3659, | |
| "num_input_tokens_seen": 54591488, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 2.374468085106383, | |
| "grad_norm": 1.1906990298985376, | |
| "learning_rate": 2.6605390846501377e-05, | |
| "loss": 0.3581, | |
| "num_input_tokens_seen": 54853632, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 2.3858156028368795, | |
| "grad_norm": 1.3437643319906911, | |
| "learning_rate": 2.6427220271569203e-05, | |
| "loss": 0.3693, | |
| "num_input_tokens_seen": 55115776, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 2.397163120567376, | |
| "grad_norm": 1.3294483159523118, | |
| "learning_rate": 2.624897693816785e-05, | |
| "loss": 0.3746, | |
| "num_input_tokens_seen": 55377920, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 2.4085106382978725, | |
| "grad_norm": 1.1245200003757012, | |
| "learning_rate": 2.6070669932990067e-05, | |
| "loss": 0.3741, | |
| "num_input_tokens_seen": 55640064, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 2.419858156028369, | |
| "grad_norm": 1.2751337825743647, | |
| "learning_rate": 2.5892308345974515e-05, | |
| "loss": 0.3583, | |
| "num_input_tokens_seen": 55902208, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 2.4312056737588654, | |
| "grad_norm": 1.1580659329331486, | |
| "learning_rate": 2.5713901269842404e-05, | |
| "loss": 0.3622, | |
| "num_input_tokens_seen": 56164352, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 2.4425531914893615, | |
| "grad_norm": 1.1800640589939102, | |
| "learning_rate": 2.5535457799633955e-05, | |
| "loss": 0.3588, | |
| "num_input_tokens_seen": 56426496, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 2.453900709219858, | |
| "grad_norm": 1.198033947606563, | |
| "learning_rate": 2.5356987032244683e-05, | |
| "loss": 0.3614, | |
| "num_input_tokens_seen": 56688640, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 2.4652482269503544, | |
| "grad_norm": 1.1820243301759021, | |
| "learning_rate": 2.5178498065961736e-05, | |
| "loss": 0.3727, | |
| "num_input_tokens_seen": 56950784, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 2.476595744680851, | |
| "grad_norm": 1.1419500525476782, | |
| "learning_rate": 2.5e-05, | |
| "loss": 0.3595, | |
| "num_input_tokens_seen": 57212928, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 2.4879432624113473, | |
| "grad_norm": 1.1601188236644033, | |
| "learning_rate": 2.4821501934038266e-05, | |
| "loss": 0.3475, | |
| "num_input_tokens_seen": 57475072, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 2.499290780141844, | |
| "grad_norm": 1.2274900948257617, | |
| "learning_rate": 2.4643012967755326e-05, | |
| "loss": 0.373, | |
| "num_input_tokens_seen": 57737216, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 2.5106382978723403, | |
| "grad_norm": 1.2801139230605143, | |
| "learning_rate": 2.446454220036605e-05, | |
| "loss": 0.3709, | |
| "num_input_tokens_seen": 57999360, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 2.5219858156028367, | |
| "grad_norm": 1.175617548960192, | |
| "learning_rate": 2.42860987301576e-05, | |
| "loss": 0.3616, | |
| "num_input_tokens_seen": 58261504, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 2.533333333333333, | |
| "grad_norm": 1.2552798520658928, | |
| "learning_rate": 2.410769165402549e-05, | |
| "loss": 0.3873, | |
| "num_input_tokens_seen": 58523648, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 2.5446808510638297, | |
| "grad_norm": 1.1904955099860972, | |
| "learning_rate": 2.3929330067009942e-05, | |
| "loss": 0.3739, | |
| "num_input_tokens_seen": 58785792, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 2.556028368794326, | |
| "grad_norm": 1.096367614278807, | |
| "learning_rate": 2.3751023061832158e-05, | |
| "loss": 0.3567, | |
| "num_input_tokens_seen": 59047936, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 2.5673758865248226, | |
| "grad_norm": 1.1935334517672878, | |
| "learning_rate": 2.35727797284308e-05, | |
| "loss": 0.3902, | |
| "num_input_tokens_seen": 59310080, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 2.578723404255319, | |
| "grad_norm": 1.18472305830408, | |
| "learning_rate": 2.339460915349862e-05, | |
| "loss": 0.3814, | |
| "num_input_tokens_seen": 59572224, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 2.5900709219858156, | |
| "grad_norm": 1.171242395850753, | |
| "learning_rate": 2.3216520420019195e-05, | |
| "loss": 0.3786, | |
| "num_input_tokens_seen": 59834368, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 2.601418439716312, | |
| "grad_norm": 1.0900508856776798, | |
| "learning_rate": 2.303852260680388e-05, | |
| "loss": 0.3773, | |
| "num_input_tokens_seen": 60096512, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 2.6127659574468085, | |
| "grad_norm": 1.2090568930739582, | |
| "learning_rate": 2.2860624788029013e-05, | |
| "loss": 0.3892, | |
| "num_input_tokens_seen": 60358656, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 2.624113475177305, | |
| "grad_norm": 1.187407278266127, | |
| "learning_rate": 2.268283603277328e-05, | |
| "loss": 0.3784, | |
| "num_input_tokens_seen": 60620800, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 2.6354609929078014, | |
| "grad_norm": 1.1154631653760168, | |
| "learning_rate": 2.250516540455543e-05, | |
| "loss": 0.3617, | |
| "num_input_tokens_seen": 60882944, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 2.646808510638298, | |
| "grad_norm": 1.1730027507450338, | |
| "learning_rate": 2.2327621960872187e-05, | |
| "loss": 0.3681, | |
| "num_input_tokens_seen": 61145088, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 2.6581560283687944, | |
| "grad_norm": 1.0970354831582407, | |
| "learning_rate": 2.2150214752736488e-05, | |
| "loss": 0.3684, | |
| "num_input_tokens_seen": 61407232, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 2.669503546099291, | |
| "grad_norm": 1.0380524878431867, | |
| "learning_rate": 2.197295282421613e-05, | |
| "loss": 0.3544, | |
| "num_input_tokens_seen": 61669376, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 2.6808510638297873, | |
| "grad_norm": 1.1922728093817796, | |
| "learning_rate": 2.179584521197268e-05, | |
| "loss": 0.3669, | |
| "num_input_tokens_seen": 61931520, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 2.692198581560284, | |
| "grad_norm": 1.1115431834278655, | |
| "learning_rate": 2.1618900944800777e-05, | |
| "loss": 0.3786, | |
| "num_input_tokens_seen": 62193664, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 2.7035460992907803, | |
| "grad_norm": 1.1319193354917367, | |
| "learning_rate": 2.1442129043167874e-05, | |
| "loss": 0.3669, | |
| "num_input_tokens_seen": 62455808, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 2.7148936170212767, | |
| "grad_norm": 1.1615463220718218, | |
| "learning_rate": 2.1265538518754374e-05, | |
| "loss": 0.3665, | |
| "num_input_tokens_seen": 62717952, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 2.726241134751773, | |
| "grad_norm": 1.1434386890748394, | |
| "learning_rate": 2.1089138373994223e-05, | |
| "loss": 0.392, | |
| "num_input_tokens_seen": 62980096, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 2.7375886524822697, | |
| "grad_norm": 1.1369786895076892, | |
| "learning_rate": 2.0912937601616005e-05, | |
| "loss": 0.372, | |
| "num_input_tokens_seen": 63242240, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 2.748936170212766, | |
| "grad_norm": 1.0632983215067038, | |
| "learning_rate": 2.0736945184184405e-05, | |
| "loss": 0.3648, | |
| "num_input_tokens_seen": 63504384, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 2.7602836879432626, | |
| "grad_norm": 1.1587019300349575, | |
| "learning_rate": 2.0561170093642423e-05, | |
| "loss": 0.3875, | |
| "num_input_tokens_seen": 63766528, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 2.771631205673759, | |
| "grad_norm": 1.1459361371000099, | |
| "learning_rate": 2.038562129085387e-05, | |
| "loss": 0.3734, | |
| "num_input_tokens_seen": 64028672, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 2.7829787234042556, | |
| "grad_norm": 1.1468262297024947, | |
| "learning_rate": 2.0210307725146615e-05, | |
| "loss": 0.3593, | |
| "num_input_tokens_seen": 64290816, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 2.794326241134752, | |
| "grad_norm": 1.1534392716010342, | |
| "learning_rate": 2.003523833385637e-05, | |
| "loss": 0.3844, | |
| "num_input_tokens_seen": 64552960, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 2.8056737588652485, | |
| "grad_norm": 1.1768077334945029, | |
| "learning_rate": 1.9860422041870987e-05, | |
| "loss": 0.3801, | |
| "num_input_tokens_seen": 64815104, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 2.8170212765957445, | |
| "grad_norm": 1.1272963366670457, | |
| "learning_rate": 1.9685867761175584e-05, | |
| "loss": 0.3849, | |
| "num_input_tokens_seen": 65077248, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 2.828368794326241, | |
| "grad_norm": 1.1169732679286002, | |
| "learning_rate": 1.9511584390398147e-05, | |
| "loss": 0.3739, | |
| "num_input_tokens_seen": 65339392, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 2.8397163120567375, | |
| "grad_norm": 1.1016941662820015, | |
| "learning_rate": 1.9337580814355888e-05, | |
| "loss": 0.366, | |
| "num_input_tokens_seen": 65601536, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 2.851063829787234, | |
| "grad_norm": 1.1758369662868582, | |
| "learning_rate": 1.9163865903602374e-05, | |
| "loss": 0.3629, | |
| "num_input_tokens_seen": 65863680, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 2.8624113475177304, | |
| "grad_norm": 1.1242921844137204, | |
| "learning_rate": 1.899044851397519e-05, | |
| "loss": 0.3814, | |
| "num_input_tokens_seen": 66125824, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 2.873758865248227, | |
| "grad_norm": 1.1614961100727705, | |
| "learning_rate": 1.881733748614461e-05, | |
| "loss": 0.3775, | |
| "num_input_tokens_seen": 66387968, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 2.8851063829787233, | |
| "grad_norm": 1.1581204676106365, | |
| "learning_rate": 1.8644541645162834e-05, | |
| "loss": 0.3783, | |
| "num_input_tokens_seen": 66650112, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 2.89645390070922, | |
| "grad_norm": 1.1120579438295783, | |
| "learning_rate": 1.8472069800014068e-05, | |
| "loss": 0.35, | |
| "num_input_tokens_seen": 66912256, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 2.9078014184397163, | |
| "grad_norm": 1.1756299409660191, | |
| "learning_rate": 1.8299930743165535e-05, | |
| "loss": 0.3742, | |
| "num_input_tokens_seen": 67174400, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 2.9191489361702128, | |
| "grad_norm": 1.1402458104536393, | |
| "learning_rate": 1.8128133250119157e-05, | |
| "loss": 0.3652, | |
| "num_input_tokens_seen": 67436544, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 2.9304964539007092, | |
| "grad_norm": 1.1433045669706552, | |
| "learning_rate": 1.795668607896426e-05, | |
| "loss": 0.3832, | |
| "num_input_tokens_seen": 67698688, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 2.9418439716312057, | |
| "grad_norm": 1.1856499993733003, | |
| "learning_rate": 1.778559796993104e-05, | |
| "loss": 0.3807, | |
| "num_input_tokens_seen": 67960832, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 2.953191489361702, | |
| "grad_norm": 1.1271393587454326, | |
| "learning_rate": 1.7614877644945e-05, | |
| "loss": 0.3799, | |
| "num_input_tokens_seen": 68222976, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 2.9645390070921986, | |
| "grad_norm": 1.0826436831731212, | |
| "learning_rate": 1.7444533807182357e-05, | |
| "loss": 0.3412, | |
| "num_input_tokens_seen": 68485120, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 2.975886524822695, | |
| "grad_norm": 1.131271727043019, | |
| "learning_rate": 1.7274575140626318e-05, | |
| "loss": 0.3666, | |
| "num_input_tokens_seen": 68747264, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 2.9872340425531916, | |
| "grad_norm": 1.0929232699820757, | |
| "learning_rate": 1.710501030962438e-05, | |
| "loss": 0.3787, | |
| "num_input_tokens_seen": 69009408, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 2.998581560283688, | |
| "grad_norm": 1.0625348413281066, | |
| "learning_rate": 1.6935847958446657e-05, | |
| "loss": 0.3623, | |
| "num_input_tokens_seen": 69271552, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 1.0625348413281066, | |
| "learning_rate": 1.6767096710845174e-05, | |
| "loss": 0.2925, | |
| "num_input_tokens_seen": 69304320, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 3.0113475177304965, | |
| "grad_norm": 2.86130141515677, | |
| "learning_rate": 1.6598765169614243e-05, | |
| "loss": 0.1334, | |
| "num_input_tokens_seen": 69566464, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 3.022695035460993, | |
| "grad_norm": 1.5425933485480858, | |
| "learning_rate": 1.643086191615194e-05, | |
| "loss": 0.1248, | |
| "num_input_tokens_seen": 69828608, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 3.0340425531914894, | |
| "grad_norm": 1.2430694797405355, | |
| "learning_rate": 1.6263395510022543e-05, | |
| "loss": 0.1212, | |
| "num_input_tokens_seen": 70090752, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 3.045390070921986, | |
| "grad_norm": 0.9436822278912739, | |
| "learning_rate": 1.6096374488520265e-05, | |
| "loss": 0.1026, | |
| "num_input_tokens_seen": 70352896, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 3.0567375886524824, | |
| "grad_norm": 1.0262534287871352, | |
| "learning_rate": 1.5929807366233977e-05, | |
| "loss": 0.1147, | |
| "num_input_tokens_seen": 70615040, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 3.068085106382979, | |
| "grad_norm": 1.41916462482431, | |
| "learning_rate": 1.5763702634613152e-05, | |
| "loss": 0.1112, | |
| "num_input_tokens_seen": 70877184, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 3.0794326241134753, | |
| "grad_norm": 1.6458131533253877, | |
| "learning_rate": 1.559806876153501e-05, | |
| "loss": 0.1142, | |
| "num_input_tokens_seen": 71139328, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 3.0907801418439718, | |
| "grad_norm": 1.4067415425269045, | |
| "learning_rate": 1.5432914190872757e-05, | |
| "loss": 0.1094, | |
| "num_input_tokens_seen": 71401472, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 3.1021276595744682, | |
| "grad_norm": 1.2565976210206782, | |
| "learning_rate": 1.5268247342065215e-05, | |
| "loss": 0.1024, | |
| "num_input_tokens_seen": 71663616, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 3.1134751773049647, | |
| "grad_norm": 1.0843813224286079, | |
| "learning_rate": 1.5104076609687545e-05, | |
| "loss": 0.1023, | |
| "num_input_tokens_seen": 71925760, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 3.124822695035461, | |
| "grad_norm": 0.9477884459564624, | |
| "learning_rate": 1.4940410363023306e-05, | |
| "loss": 0.0978, | |
| "num_input_tokens_seen": 72187904, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 3.1361702127659576, | |
| "grad_norm": 0.822005236524651, | |
| "learning_rate": 1.4777256945637834e-05, | |
| "loss": 0.099, | |
| "num_input_tokens_seen": 72450048, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 3.147517730496454, | |
| "grad_norm": 0.8023754053464645, | |
| "learning_rate": 1.4614624674952842e-05, | |
| "loss": 0.0956, | |
| "num_input_tokens_seen": 72712192, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 3.1588652482269506, | |
| "grad_norm": 0.8464902902144091, | |
| "learning_rate": 1.4452521841822436e-05, | |
| "loss": 0.0998, | |
| "num_input_tokens_seen": 72974336, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 3.1702127659574466, | |
| "grad_norm": 0.7987684436490388, | |
| "learning_rate": 1.4290956710110475e-05, | |
| "loss": 0.0945, | |
| "num_input_tokens_seen": 73236480, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 3.181560283687943, | |
| "grad_norm": 0.8094393208784639, | |
| "learning_rate": 1.4129937516269203e-05, | |
| "loss": 0.0988, | |
| "num_input_tokens_seen": 73498624, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 3.1929078014184396, | |
| "grad_norm": 0.8777616390752891, | |
| "learning_rate": 1.3969472468919461e-05, | |
| "loss": 0.0979, | |
| "num_input_tokens_seen": 73760768, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 3.204255319148936, | |
| "grad_norm": 0.8153492833919085, | |
| "learning_rate": 1.3809569748432189e-05, | |
| "loss": 0.0937, | |
| "num_input_tokens_seen": 74022912, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 3.2156028368794325, | |
| "grad_norm": 0.8075245623239392, | |
| "learning_rate": 1.3650237506511331e-05, | |
| "loss": 0.0895, | |
| "num_input_tokens_seen": 74285056, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 3.226950354609929, | |
| "grad_norm": 0.8173043815021659, | |
| "learning_rate": 1.3491483865778365e-05, | |
| "loss": 0.0924, | |
| "num_input_tokens_seen": 74547200, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 3.2382978723404254, | |
| "grad_norm": 0.9317108513266112, | |
| "learning_rate": 1.3333316919358157e-05, | |
| "loss": 0.0987, | |
| "num_input_tokens_seen": 74809344, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 3.249645390070922, | |
| "grad_norm": 0.8288147543181388, | |
| "learning_rate": 1.3175744730466408e-05, | |
| "loss": 0.0841, | |
| "num_input_tokens_seen": 75071488, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 3.2609929078014184, | |
| "grad_norm": 0.8070883717373725, | |
| "learning_rate": 1.301877533199859e-05, | |
| "loss": 0.0931, | |
| "num_input_tokens_seen": 75333632, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 3.272340425531915, | |
| "grad_norm": 0.8688265252758833, | |
| "learning_rate": 1.2862416726120396e-05, | |
| "loss": 0.0918, | |
| "num_input_tokens_seen": 75595776, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 3.2836879432624113, | |
| "grad_norm": 0.7969272205965748, | |
| "learning_rate": 1.2706676883859903e-05, | |
| "loss": 0.0893, | |
| "num_input_tokens_seen": 75857920, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 3.295035460992908, | |
| "grad_norm": 0.7533077115551139, | |
| "learning_rate": 1.2551563744701109e-05, | |
| "loss": 0.0886, | |
| "num_input_tokens_seen": 76120064, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 3.3063829787234043, | |
| "grad_norm": 0.7785634970662512, | |
| "learning_rate": 1.2397085216179208e-05, | |
| "loss": 0.0883, | |
| "num_input_tokens_seen": 76382208, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 3.3177304964539007, | |
| "grad_norm": 0.7903142511464022, | |
| "learning_rate": 1.2243249173477513e-05, | |
| "loss": 0.0882, | |
| "num_input_tokens_seen": 76644352, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 3.329078014184397, | |
| "grad_norm": 0.7742080002169154, | |
| "learning_rate": 1.2090063459025955e-05, | |
| "loss": 0.0885, | |
| "num_input_tokens_seen": 76906496, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 3.3404255319148937, | |
| "grad_norm": 0.8054772194334612, | |
| "learning_rate": 1.1937535882101281e-05, | |
| "loss": 0.0933, | |
| "num_input_tokens_seen": 77168640, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 3.35177304964539, | |
| "grad_norm": 0.7675259582762006, | |
| "learning_rate": 1.1785674218428952e-05, | |
| "loss": 0.0943, | |
| "num_input_tokens_seen": 77430784, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 3.3631205673758866, | |
| "grad_norm": 0.7414624503341706, | |
| "learning_rate": 1.163448620978674e-05, | |
| "loss": 0.0883, | |
| "num_input_tokens_seen": 77692928, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 3.374468085106383, | |
| "grad_norm": 0.8192053000270058, | |
| "learning_rate": 1.148397956361007e-05, | |
| "loss": 0.0945, | |
| "num_input_tokens_seen": 77955072, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 3.3858156028368795, | |
| "grad_norm": 0.7961744887050209, | |
| "learning_rate": 1.1334161952599054e-05, | |
| "loss": 0.0859, | |
| "num_input_tokens_seen": 78217216, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 3.397163120567376, | |
| "grad_norm": 0.806567989025415, | |
| "learning_rate": 1.1185041014327433e-05, | |
| "loss": 0.0982, | |
| "num_input_tokens_seen": 78479360, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 3.4085106382978725, | |
| "grad_norm": 0.7627938171296575, | |
| "learning_rate": 1.1036624350853145e-05, | |
| "loss": 0.0891, | |
| "num_input_tokens_seen": 78741504, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 3.419858156028369, | |
| "grad_norm": 0.7764856826754996, | |
| "learning_rate": 1.0888919528330777e-05, | |
| "loss": 0.0905, | |
| "num_input_tokens_seen": 79003648, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 3.4312056737588654, | |
| "grad_norm": 0.8011164595440228, | |
| "learning_rate": 1.0741934076625895e-05, | |
| "loss": 0.0891, | |
| "num_input_tokens_seen": 79265792, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 3.4425531914893615, | |
| "grad_norm": 0.7764472737957309, | |
| "learning_rate": 1.059567548893118e-05, | |
| "loss": 0.0869, | |
| "num_input_tokens_seen": 79527936, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 3.453900709219858, | |
| "grad_norm": 0.7201240824906335, | |
| "learning_rate": 1.0450151221384358e-05, | |
| "loss": 0.09, | |
| "num_input_tokens_seen": 79790080, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 3.4652482269503544, | |
| "grad_norm": 0.8109600636209235, | |
| "learning_rate": 1.0305368692688174e-05, | |
| "loss": 0.0876, | |
| "num_input_tokens_seen": 80052224, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 3.476595744680851, | |
| "grad_norm": 0.7559387558842937, | |
| "learning_rate": 1.016133528373215e-05, | |
| "loss": 0.0875, | |
| "num_input_tokens_seen": 80314368, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 3.4879432624113473, | |
| "grad_norm": 0.7260621903497292, | |
| "learning_rate": 1.0018058337216327e-05, | |
| "loss": 0.079, | |
| "num_input_tokens_seen": 80576512, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 3.499290780141844, | |
| "grad_norm": 0.7591840412491047, | |
| "learning_rate": 9.875545157276939e-06, | |
| "loss": 0.0793, | |
| "num_input_tokens_seen": 80838656, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 3.5106382978723403, | |
| "grad_norm": 0.741059488771299, | |
| "learning_rate": 9.733803009114045e-06, | |
| "loss": 0.0881, | |
| "num_input_tokens_seen": 81100800, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 3.5219858156028367, | |
| "grad_norm": 0.7374917809789198, | |
| "learning_rate": 9.592839118621187e-06, | |
| "loss": 0.0909, | |
| "num_input_tokens_seen": 81362944, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 3.533333333333333, | |
| "grad_norm": 0.7828847705307154, | |
| "learning_rate": 9.452660672016949e-06, | |
| "loss": 0.0808, | |
| "num_input_tokens_seen": 81625088, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 3.5446808510638297, | |
| "grad_norm": 0.7578842723626861, | |
| "learning_rate": 9.313274815478698e-06, | |
| "loss": 0.0853, | |
| "num_input_tokens_seen": 81887232, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 3.556028368794326, | |
| "grad_norm": 0.8012195295633705, | |
| "learning_rate": 9.174688654778243e-06, | |
| "loss": 0.0913, | |
| "num_input_tokens_seen": 82149376, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 3.5673758865248226, | |
| "grad_norm": 0.7505651055808276, | |
| "learning_rate": 9.036909254919549e-06, | |
| "loss": 0.0851, | |
| "num_input_tokens_seen": 82411520, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 3.578723404255319, | |
| "grad_norm": 0.7563671980529104, | |
| "learning_rate": 8.899943639778619e-06, | |
| "loss": 0.0898, | |
| "num_input_tokens_seen": 82673664, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 3.5900709219858156, | |
| "grad_norm": 0.7466202200881212, | |
| "learning_rate": 8.763798791745411e-06, | |
| "loss": 0.0808, | |
| "num_input_tokens_seen": 82935808, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 3.601418439716312, | |
| "grad_norm": 0.6880453348725938, | |
| "learning_rate": 8.628481651367876e-06, | |
| "loss": 0.084, | |
| "num_input_tokens_seen": 83197952, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 3.6127659574468085, | |
| "grad_norm": 0.7694297499307822, | |
| "learning_rate": 8.49399911699814e-06, | |
| "loss": 0.0957, | |
| "num_input_tokens_seen": 83460096, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 3.624113475177305, | |
| "grad_norm": 0.7252376317705156, | |
| "learning_rate": 8.360358044440797e-06, | |
| "loss": 0.0893, | |
| "num_input_tokens_seen": 83722240, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 3.6354609929078014, | |
| "grad_norm": 0.7725027392222746, | |
| "learning_rate": 8.227565246603493e-06, | |
| "loss": 0.0804, | |
| "num_input_tokens_seen": 83984384, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 3.646808510638298, | |
| "grad_norm": 0.6952306274806225, | |
| "learning_rate": 8.09562749314952e-06, | |
| "loss": 0.0868, | |
| "num_input_tokens_seen": 84246528, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 3.6581560283687944, | |
| "grad_norm": 0.7076094401023963, | |
| "learning_rate": 7.96455151015272e-06, | |
| "loss": 0.089, | |
| "num_input_tokens_seen": 84508672, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 3.669503546099291, | |
| "grad_norm": 0.7549486979904751, | |
| "learning_rate": 7.83434397975466e-06, | |
| "loss": 0.0866, | |
| "num_input_tokens_seen": 84770816, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 3.6808510638297873, | |
| "grad_norm": 0.7136449420637923, | |
| "learning_rate": 7.705011539823911e-06, | |
| "loss": 0.0815, | |
| "num_input_tokens_seen": 85032960, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 3.692198581560284, | |
| "grad_norm": 0.7048331650984001, | |
| "learning_rate": 7.576560783617668e-06, | |
| "loss": 0.0818, | |
| "num_input_tokens_seen": 85295104, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 3.7035460992907803, | |
| "grad_norm": 0.7432477427581144, | |
| "learning_rate": 7.448998259445664e-06, | |
| "loss": 0.0857, | |
| "num_input_tokens_seen": 85557248, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 3.7148936170212767, | |
| "grad_norm": 0.7596413190844352, | |
| "learning_rate": 7.3223304703363135e-06, | |
| "loss": 0.0768, | |
| "num_input_tokens_seen": 85819392, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 3.726241134751773, | |
| "grad_norm": 0.743937151653106, | |
| "learning_rate": 7.196563873705209e-06, | |
| "loss": 0.0924, | |
| "num_input_tokens_seen": 86081536, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 3.7375886524822697, | |
| "grad_norm": 0.754620960814025, | |
| "learning_rate": 7.071704881025915e-06, | |
| "loss": 0.0864, | |
| "num_input_tokens_seen": 86343680, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 3.748936170212766, | |
| "grad_norm": 0.7675088057052156, | |
| "learning_rate": 6.947759857503119e-06, | |
| "loss": 0.0887, | |
| "num_input_tokens_seen": 86605824, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 3.7602836879432626, | |
| "grad_norm": 0.7509318055805201, | |
| "learning_rate": 6.824735121748163e-06, | |
| "loss": 0.083, | |
| "num_input_tokens_seen": 86867968, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 3.771631205673759, | |
| "grad_norm": 0.7079671243094418, | |
| "learning_rate": 6.70263694545687e-06, | |
| "loss": 0.0783, | |
| "num_input_tokens_seen": 87130112, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 3.7829787234042556, | |
| "grad_norm": 0.6902777214012117, | |
| "learning_rate": 6.5814715530898745e-06, | |
| "loss": 0.077, | |
| "num_input_tokens_seen": 87392256, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 3.794326241134752, | |
| "grad_norm": 0.736131108315663, | |
| "learning_rate": 6.461245121555307e-06, | |
| "loss": 0.0832, | |
| "num_input_tokens_seen": 87654400, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 3.8056737588652485, | |
| "grad_norm": 0.7602848818372718, | |
| "learning_rate": 6.341963779893828e-06, | |
| "loss": 0.0815, | |
| "num_input_tokens_seen": 87916544, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 3.8170212765957445, | |
| "grad_norm": 0.7035246279871188, | |
| "learning_rate": 6.223633608966254e-06, | |
| "loss": 0.0802, | |
| "num_input_tokens_seen": 88178688, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 3.828368794326241, | |
| "grad_norm": 0.7007067705436694, | |
| "learning_rate": 6.106260641143546e-06, | |
| "loss": 0.0866, | |
| "num_input_tokens_seen": 88440832, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 3.8397163120567375, | |
| "grad_norm": 0.7399965287319934, | |
| "learning_rate": 5.989850859999227e-06, | |
| "loss": 0.0768, | |
| "num_input_tokens_seen": 88702976, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 3.851063829787234, | |
| "grad_norm": 0.7007319108851067, | |
| "learning_rate": 5.874410200004421e-06, | |
| "loss": 0.0744, | |
| "num_input_tokens_seen": 88965120, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 3.8624113475177304, | |
| "grad_norm": 0.6845592652085333, | |
| "learning_rate": 5.759944546225271e-06, | |
| "loss": 0.081, | |
| "num_input_tokens_seen": 89227264, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 3.873758865248227, | |
| "grad_norm": 0.68562597454897, | |
| "learning_rate": 5.646459734022938e-06, | |
| "loss": 0.0708, | |
| "num_input_tokens_seen": 89489408, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 3.8851063829787233, | |
| "grad_norm": 0.7125683393543343, | |
| "learning_rate": 5.533961548756128e-06, | |
| "loss": 0.078, | |
| "num_input_tokens_seen": 89751552, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 3.89645390070922, | |
| "grad_norm": 0.7549793531707268, | |
| "learning_rate": 5.422455725486114e-06, | |
| "loss": 0.0878, | |
| "num_input_tokens_seen": 90013696, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 3.9078014184397163, | |
| "grad_norm": 0.743147041969246, | |
| "learning_rate": 5.311947948684457e-06, | |
| "loss": 0.0825, | |
| "num_input_tokens_seen": 90275840, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 3.9191489361702128, | |
| "grad_norm": 0.7265981456361329, | |
| "learning_rate": 5.202443851943126e-06, | |
| "loss": 0.0818, | |
| "num_input_tokens_seen": 90537984, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 3.9304964539007092, | |
| "grad_norm": 0.7077026530723873, | |
| "learning_rate": 5.093949017687341e-06, | |
| "loss": 0.081, | |
| "num_input_tokens_seen": 90800128, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 3.9418439716312057, | |
| "grad_norm": 0.7036326076512418, | |
| "learning_rate": 4.986468976890993e-06, | |
| "loss": 0.0747, | |
| "num_input_tokens_seen": 91062272, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 3.953191489361702, | |
| "grad_norm": 0.6905130968128987, | |
| "learning_rate": 4.880009208794667e-06, | |
| "loss": 0.0811, | |
| "num_input_tokens_seen": 91324416, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 3.9645390070921986, | |
| "grad_norm": 0.7073702264699631, | |
| "learning_rate": 4.7745751406263165e-06, | |
| "loss": 0.0749, | |
| "num_input_tokens_seen": 91586560, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 3.975886524822695, | |
| "grad_norm": 0.6905489552770284, | |
| "learning_rate": 4.670172147324592e-06, | |
| "loss": 0.0787, | |
| "num_input_tokens_seen": 91848704, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 3.9872340425531916, | |
| "grad_norm": 0.7261963446135224, | |
| "learning_rate": 4.566805551264827e-06, | |
| "loss": 0.0811, | |
| "num_input_tokens_seen": 92110848, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 3.998581560283688, | |
| "grad_norm": 0.7198472258747808, | |
| "learning_rate": 4.4644806219877184e-06, | |
| "loss": 0.0738, | |
| "num_input_tokens_seen": 92372992, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 0.7198472258747808, | |
| "learning_rate": 4.36320257593065e-06, | |
| "loss": 0.0466, | |
| "num_input_tokens_seen": 92405760, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 4.0113475177304965, | |
| "grad_norm": 1.4688330189672048, | |
| "learning_rate": 4.262976576161842e-06, | |
| "loss": 0.0249, | |
| "num_input_tokens_seen": 92667904, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 4.022695035460993, | |
| "grad_norm": 0.5330531371566788, | |
| "learning_rate": 4.1638077321170646e-06, | |
| "loss": 0.0231, | |
| "num_input_tokens_seen": 92930048, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 4.034042553191489, | |
| "grad_norm": 0.4922654159647282, | |
| "learning_rate": 4.0657010993391865e-06, | |
| "loss": 0.0222, | |
| "num_input_tokens_seen": 93192192, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 4.045390070921986, | |
| "grad_norm": 0.4561531097868261, | |
| "learning_rate": 3.968661679220468e-06, | |
| "loss": 0.0197, | |
| "num_input_tokens_seen": 93454336, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 4.056737588652482, | |
| "grad_norm": 0.42335290314314167, | |
| "learning_rate": 3.872694418747594e-06, | |
| "loss": 0.0189, | |
| "num_input_tokens_seen": 93716480, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 4.068085106382979, | |
| "grad_norm": 0.4008499835789085, | |
| "learning_rate": 3.777804210249436e-06, | |
| "loss": 0.0176, | |
| "num_input_tokens_seen": 93978624, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 4.079432624113475, | |
| "grad_norm": 0.38452603039597705, | |
| "learning_rate": 3.6839958911476957e-06, | |
| "loss": 0.0198, | |
| "num_input_tokens_seen": 94240768, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 4.090780141843972, | |
| "grad_norm": 0.3766839792161632, | |
| "learning_rate": 3.591274243710277e-06, | |
| "loss": 0.0177, | |
| "num_input_tokens_seen": 94502912, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 4.102127659574468, | |
| "grad_norm": 0.3464025018288397, | |
| "learning_rate": 3.499643994807486e-06, | |
| "loss": 0.0165, | |
| "num_input_tokens_seen": 94765056, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 4.113475177304965, | |
| "grad_norm": 0.3369150140378179, | |
| "learning_rate": 3.4091098156710744e-06, | |
| "loss": 0.0161, | |
| "num_input_tokens_seen": 95027200, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 4.124822695035461, | |
| "grad_norm": 0.31335039715603397, | |
| "learning_rate": 3.319676321656082e-06, | |
| "loss": 0.0156, | |
| "num_input_tokens_seen": 95289344, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 4.136170212765958, | |
| "grad_norm": 0.37158481677020094, | |
| "learning_rate": 3.2313480720055745e-06, | |
| "loss": 0.0165, | |
| "num_input_tokens_seen": 95551488, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 4.147517730496454, | |
| "grad_norm": 0.37465306599622433, | |
| "learning_rate": 3.1441295696181897e-06, | |
| "loss": 0.0152, | |
| "num_input_tokens_seen": 95813632, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 4.158865248226951, | |
| "grad_norm": 0.35876073758096233, | |
| "learning_rate": 3.058025260818609e-06, | |
| "loss": 0.0149, | |
| "num_input_tokens_seen": 96075776, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 4.170212765957447, | |
| "grad_norm": 0.3949875998401012, | |
| "learning_rate": 2.9730395351308866e-06, | |
| "loss": 0.0153, | |
| "num_input_tokens_seen": 96337920, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 4.1815602836879435, | |
| "grad_norm": 0.3828061015559776, | |
| "learning_rate": 2.889176725054643e-06, | |
| "loss": 0.0173, | |
| "num_input_tokens_seen": 96600064, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 4.19290780141844, | |
| "grad_norm": 0.45575162609289305, | |
| "learning_rate": 2.80644110584424e-06, | |
| "loss": 0.0148, | |
| "num_input_tokens_seen": 96862208, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 4.2042553191489365, | |
| "grad_norm": 0.42419973216850165, | |
| "learning_rate": 2.7248368952908053e-06, | |
| "loss": 0.0165, | |
| "num_input_tokens_seen": 97124352, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 4.215602836879433, | |
| "grad_norm": 0.42022424947578696, | |
| "learning_rate": 2.6443682535072177e-06, | |
| "loss": 0.0154, | |
| "num_input_tokens_seen": 97386496, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 4.226950354609929, | |
| "grad_norm": 0.4092868425580408, | |
| "learning_rate": 2.565039282716045e-06, | |
| "loss": 0.0149, | |
| "num_input_tokens_seen": 97648640, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 4.238297872340426, | |
| "grad_norm": 0.403775041864487, | |
| "learning_rate": 2.486854027040375e-06, | |
| "loss": 0.0143, | |
| "num_input_tokens_seen": 97910784, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 4.249645390070922, | |
| "grad_norm": 0.400415704748068, | |
| "learning_rate": 2.4098164722977073e-06, | |
| "loss": 0.016, | |
| "num_input_tokens_seen": 98172928, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 4.260992907801419, | |
| "grad_norm": 0.3970826955886615, | |
| "learning_rate": 2.333930545796717e-06, | |
| "loss": 0.0159, | |
| "num_input_tokens_seen": 98435072, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 4.272340425531915, | |
| "grad_norm": 0.4227943452768279, | |
| "learning_rate": 2.2592001161370392e-06, | |
| "loss": 0.0162, | |
| "num_input_tokens_seen": 98697216, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 4.283687943262412, | |
| "grad_norm": 0.38818903700312996, | |
| "learning_rate": 2.185628993012101e-06, | |
| "loss": 0.0144, | |
| "num_input_tokens_seen": 98959360, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 4.295035460992908, | |
| "grad_norm": 0.37517565866002883, | |
| "learning_rate": 2.11322092701485e-06, | |
| "loss": 0.0147, | |
| "num_input_tokens_seen": 99221504, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 4.306382978723404, | |
| "grad_norm": 0.4122413600819348, | |
| "learning_rate": 2.0419796094465788e-06, | |
| "loss": 0.0149, | |
| "num_input_tokens_seen": 99483648, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 4.317730496453901, | |
| "grad_norm": 0.3844299207768967, | |
| "learning_rate": 1.97190867212875e-06, | |
| "loss": 0.0163, | |
| "num_input_tokens_seen": 99745792, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 4.329078014184397, | |
| "grad_norm": 0.39386937733627553, | |
| "learning_rate": 1.9030116872178316e-06, | |
| "loss": 0.0147, | |
| "num_input_tokens_seen": 100007936, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 4.340425531914893, | |
| "grad_norm": 0.338450589586265, | |
| "learning_rate": 1.8352921670232143e-06, | |
| "loss": 0.0142, | |
| "num_input_tokens_seen": 100270080, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 4.35177304964539, | |
| "grad_norm": 0.35418453635661823, | |
| "learning_rate": 1.768753563828135e-06, | |
| "loss": 0.0148, | |
| "num_input_tokens_seen": 100532224, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 4.363120567375886, | |
| "grad_norm": 0.35934554094848764, | |
| "learning_rate": 1.703399269713693e-06, | |
| "loss": 0.0148, | |
| "num_input_tokens_seen": 100794368, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 4.374468085106383, | |
| "grad_norm": 0.34543051645814415, | |
| "learning_rate": 1.6392326163859273e-06, | |
| "loss": 0.0158, | |
| "num_input_tokens_seen": 101056512, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 4.385815602836879, | |
| "grad_norm": 0.3413445651057825, | |
| "learning_rate": 1.5762568750059604e-06, | |
| "loss": 0.0149, | |
| "num_input_tokens_seen": 101318656, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 4.397163120567376, | |
| "grad_norm": 0.352456974681825, | |
| "learning_rate": 1.5144752560232372e-06, | |
| "loss": 0.0159, | |
| "num_input_tokens_seen": 101580800, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 4.408510638297872, | |
| "grad_norm": 0.341100842734161, | |
| "learning_rate": 1.4538909090118846e-06, | |
| "loss": 0.0137, | |
| "num_input_tokens_seen": 101842944, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 4.4198581560283685, | |
| "grad_norm": 0.35870504615922777, | |
| "learning_rate": 1.3945069225101026e-06, | |
| "loss": 0.0151, | |
| "num_input_tokens_seen": 102105088, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 4.431205673758865, | |
| "grad_norm": 0.35676458850601245, | |
| "learning_rate": 1.3363263238627493e-06, | |
| "loss": 0.0166, | |
| "num_input_tokens_seen": 102367232, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 4.4425531914893615, | |
| "grad_norm": 0.3756541331179786, | |
| "learning_rate": 1.2793520790670116e-06, | |
| "loss": 0.0152, | |
| "num_input_tokens_seen": 102629376, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 4.453900709219858, | |
| "grad_norm": 0.3445790534591682, | |
| "learning_rate": 1.2235870926211619e-06, | |
| "loss": 0.0144, | |
| "num_input_tokens_seen": 102891520, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 4.465248226950354, | |
| "grad_norm": 0.34515720116215964, | |
| "learning_rate": 1.1690342073765375e-06, | |
| "loss": 0.0144, | |
| "num_input_tokens_seen": 103153664, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 4.476595744680851, | |
| "grad_norm": 0.345726417626317, | |
| "learning_rate": 1.1156962043925828e-06, | |
| "loss": 0.0158, | |
| "num_input_tokens_seen": 103415808, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 4.487943262411347, | |
| "grad_norm": 0.3542601574103813, | |
| "learning_rate": 1.0635758027950888e-06, | |
| "loss": 0.0133, | |
| "num_input_tokens_seen": 103677952, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 4.499290780141844, | |
| "grad_norm": 0.32899237428693795, | |
| "learning_rate": 1.0126756596375686e-06, | |
| "loss": 0.016, | |
| "num_input_tokens_seen": 103940096, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 4.51063829787234, | |
| "grad_norm": 0.3481316804818486, | |
| "learning_rate": 9.629983697657886e-07, | |
| "loss": 0.0165, | |
| "num_input_tokens_seen": 104202240, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 4.521985815602837, | |
| "grad_norm": 0.3233545153466053, | |
| "learning_rate": 9.145464656855257e-07, | |
| "loss": 0.0134, | |
| "num_input_tokens_seen": 104464384, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 4.533333333333333, | |
| "grad_norm": 0.33628106449340045, | |
| "learning_rate": 8.673224174334221e-07, | |
| "loss": 0.0132, | |
| "num_input_tokens_seen": 104726528, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 4.54468085106383, | |
| "grad_norm": 0.3399953670308035, | |
| "learning_rate": 8.213286324510738e-07, | |
| "loss": 0.0156, | |
| "num_input_tokens_seen": 104988672, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 4.556028368794326, | |
| "grad_norm": 0.3850226318275653, | |
| "learning_rate": 7.765674554623181e-07, | |
| "loss": 0.0165, | |
| "num_input_tokens_seen": 105250816, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 4.567375886524823, | |
| "grad_norm": 0.353247574363122, | |
| "learning_rate": 7.330411683536876e-07, | |
| "loss": 0.0144, | |
| "num_input_tokens_seen": 105512960, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 4.578723404255319, | |
| "grad_norm": 0.32706270694141454, | |
| "learning_rate": 6.907519900580861e-07, | |
| "loss": 0.0153, | |
| "num_input_tokens_seen": 105775104, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 4.590070921985816, | |
| "grad_norm": 0.37578666168375224, | |
| "learning_rate": 6.497020764416633e-07, | |
| "loss": 0.0156, | |
| "num_input_tokens_seen": 106037248, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 4.601418439716312, | |
| "grad_norm": 0.3185935927438266, | |
| "learning_rate": 6.098935201939187e-07, | |
| "loss": 0.0136, | |
| "num_input_tokens_seen": 106299392, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 4.6127659574468085, | |
| "grad_norm": 0.3198588451035788, | |
| "learning_rate": 5.713283507210148e-07, | |
| "loss": 0.0139, | |
| "num_input_tokens_seen": 106561536, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 4.624113475177305, | |
| "grad_norm": 0.3127004347421057, | |
| "learning_rate": 5.340085340423129e-07, | |
| "loss": 0.0144, | |
| "num_input_tokens_seen": 106823680, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 4.6354609929078014, | |
| "grad_norm": 0.3412415661804973, | |
| "learning_rate": 4.979359726901639e-07, | |
| "loss": 0.0131, | |
| "num_input_tokens_seen": 107085824, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 4.646808510638298, | |
| "grad_norm": 0.3048113188870906, | |
| "learning_rate": 4.63112505612906e-07, | |
| "loss": 0.0131, | |
| "num_input_tokens_seen": 107347968, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 4.658156028368794, | |
| "grad_norm": 0.3005056677101423, | |
| "learning_rate": 4.2953990808111135e-07, | |
| "loss": 0.015, | |
| "num_input_tokens_seen": 107610112, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 4.669503546099291, | |
| "grad_norm": 0.3504689469184949, | |
| "learning_rate": 3.972198915970976e-07, | |
| "loss": 0.0152, | |
| "num_input_tokens_seen": 107872256, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 4.680851063829787, | |
| "grad_norm": 0.38892190993563536, | |
| "learning_rate": 3.6615410380767544e-07, | |
| "loss": 0.0153, | |
| "num_input_tokens_seen": 108134400, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 4.692198581560284, | |
| "grad_norm": 0.3234118453449877, | |
| "learning_rate": 3.3634412842014353e-07, | |
| "loss": 0.0139, | |
| "num_input_tokens_seen": 108396544, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 4.70354609929078, | |
| "grad_norm": 0.33351975045706705, | |
| "learning_rate": 3.077914851215585e-07, | |
| "loss": 0.0139, | |
| "num_input_tokens_seen": 108658688, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 4.714893617021277, | |
| "grad_norm": 0.3243353662442738, | |
| "learning_rate": 2.804976295012612e-07, | |
| "loss": 0.014, | |
| "num_input_tokens_seen": 108920832, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 4.726241134751773, | |
| "grad_norm": 0.3191756927282505, | |
| "learning_rate": 2.544639529766829e-07, | |
| "loss": 0.0137, | |
| "num_input_tokens_seen": 109182976, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 4.73758865248227, | |
| "grad_norm": 0.34240534777520126, | |
| "learning_rate": 2.2969178272238545e-07, | |
| "loss": 0.0139, | |
| "num_input_tokens_seen": 109445120, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 4.748936170212766, | |
| "grad_norm": 0.2956411840988351, | |
| "learning_rate": 2.061823816024322e-07, | |
| "loss": 0.0132, | |
| "num_input_tokens_seen": 109707264, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 4.760283687943263, | |
| "grad_norm": 0.36482955125245486, | |
| "learning_rate": 1.8393694810599493e-07, | |
| "loss": 0.016, | |
| "num_input_tokens_seen": 109969408, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 4.771631205673759, | |
| "grad_norm": 0.3332106952260904, | |
| "learning_rate": 1.6295661628624447e-07, | |
| "loss": 0.0136, | |
| "num_input_tokens_seen": 110231552, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 4.782978723404256, | |
| "grad_norm": 0.3411172110927968, | |
| "learning_rate": 1.4324245570256633e-07, | |
| "loss": 0.0169, | |
| "num_input_tokens_seen": 110493696, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 4.794326241134752, | |
| "grad_norm": 0.34024588053935717, | |
| "learning_rate": 1.2479547136600989e-07, | |
| "loss": 0.0133, | |
| "num_input_tokens_seen": 110755840, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 4.8056737588652485, | |
| "grad_norm": 0.34496629497126063, | |
| "learning_rate": 1.0761660368806548e-07, | |
| "loss": 0.0144, | |
| "num_input_tokens_seen": 111017984, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 4.817021276595745, | |
| "grad_norm": 0.33516309634270675, | |
| "learning_rate": 9.170672843271666e-08, | |
| "loss": 0.0153, | |
| "num_input_tokens_seen": 111280128, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 4.828368794326241, | |
| "grad_norm": 0.35890460501548294, | |
| "learning_rate": 7.706665667180091e-08, | |
| "loss": 0.0127, | |
| "num_input_tokens_seen": 111542272, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 4.839716312056738, | |
| "grad_norm": 0.3167706249520597, | |
| "learning_rate": 6.369713474366212e-08, | |
| "loss": 0.0139, | |
| "num_input_tokens_seen": 111804416, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 4.851063829787234, | |
| "grad_norm": 0.34919750392535776, | |
| "learning_rate": 5.159884421509498e-08, | |
| "loss": 0.0142, | |
| "num_input_tokens_seen": 112066560, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 4.862411347517731, | |
| "grad_norm": 0.322002789312833, | |
| "learning_rate": 4.07724018466088e-08, | |
| "loss": 0.0156, | |
| "num_input_tokens_seen": 112328704, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 4.873758865248227, | |
| "grad_norm": 0.35090105881839767, | |
| "learning_rate": 3.1218359560974966e-08, | |
| "loss": 0.0154, | |
| "num_input_tokens_seen": 112590848, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 4.885106382978723, | |
| "grad_norm": 0.39964488827914374, | |
| "learning_rate": 2.2937204415107717e-08, | |
| "loss": 0.0151, | |
| "num_input_tokens_seen": 112852992, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 4.89645390070922, | |
| "grad_norm": 0.3076123689331371, | |
| "learning_rate": 1.5929358575206275e-08, | |
| "loss": 0.0144, | |
| "num_input_tokens_seen": 113115136, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 4.907801418439716, | |
| "grad_norm": 0.34810902058971566, | |
| "learning_rate": 1.0195179295269252e-08, | |
| "loss": 0.0146, | |
| "num_input_tokens_seen": 113377280, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 4.919148936170213, | |
| "grad_norm": 0.3213963089203548, | |
| "learning_rate": 5.7349588988481194e-09, | |
| "loss": 0.0125, | |
| "num_input_tokens_seen": 113639424, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 4.930496453900709, | |
| "grad_norm": 0.32788714163055016, | |
| "learning_rate": 2.5489247641674596e-09, | |
| "loss": 0.0129, | |
| "num_input_tokens_seen": 113901568, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 4.941843971631206, | |
| "grad_norm": 0.36471489051146844, | |
| "learning_rate": 6.372393125203546e-10, | |
| "loss": 0.0148, | |
| "num_input_tokens_seen": 114163712, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 4.953191489361702, | |
| "grad_norm": 0.3392645067497023, | |
| "learning_rate": 0.0, | |
| "loss": 0.0161, | |
| "num_input_tokens_seen": 114425856, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 4.953191489361702, | |
| "num_input_tokens_seen": 114425856, | |
| "step": 440, | |
| "total_flos": 182736094494720.0, | |
| "train_loss": 0.6214238642672585, | |
| "train_runtime": 10972.3782, | |
| "train_samples_per_second": 10.27, | |
| "train_steps_per_second": 0.04 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 440, | |
| "num_input_tokens_seen": 114425856, | |
| "num_train_epochs": 5, | |
| "save_steps": 5000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 182736094494720.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |