| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0, |
| "eval_steps": 500, |
| "global_step": 1274, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 0.5458984375, |
| "epoch": 0.007849293563579277, |
| "grad_norm": 4.360797014923199, |
| "learning_rate": 6.25e-07, |
| "loss": 0.6017, |
| "mean_token_accuracy": 0.8533750414848328, |
| "num_tokens": 184600.0, |
| "step": 5 |
| }, |
| { |
| "entropy": 0.624609375, |
| "epoch": 0.015698587127158554, |
| "grad_norm": 4.173989171952699, |
| "learning_rate": 1.40625e-06, |
| "loss": 0.6725, |
| "mean_token_accuracy": 0.8375870227813721, |
| "num_tokens": 364953.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 0.65390625, |
| "epoch": 0.023547880690737835, |
| "grad_norm": 4.134126723754578, |
| "learning_rate": 2.1875000000000002e-06, |
| "loss": 0.6032, |
| "mean_token_accuracy": 0.8524928987026215, |
| "num_tokens": 549934.0, |
| "step": 15 |
| }, |
| { |
| "entropy": 0.65234375, |
| "epoch": 0.03139717425431711, |
| "grad_norm": 4.553331995001466, |
| "learning_rate": 2.96875e-06, |
| "loss": 0.6252, |
| "mean_token_accuracy": 0.8429424941539765, |
| "num_tokens": 729587.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 0.5826171875, |
| "epoch": 0.03924646781789639, |
| "grad_norm": 3.86036196792769, |
| "learning_rate": 3.7500000000000005e-06, |
| "loss": 0.5924, |
| "mean_token_accuracy": 0.8477238178253174, |
| "num_tokens": 902536.0, |
| "step": 25 |
| }, |
| { |
| "entropy": 0.641015625, |
| "epoch": 0.04709576138147567, |
| "grad_norm": 4.179091616309323, |
| "learning_rate": 4.53125e-06, |
| "loss": 0.6158, |
| "mean_token_accuracy": 0.8436026930809021, |
| "num_tokens": 1085970.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 0.6009765625, |
| "epoch": 0.054945054945054944, |
| "grad_norm": 3.4664489534719136, |
| "learning_rate": 5.3125e-06, |
| "loss": 0.5899, |
| "mean_token_accuracy": 0.8470608413219451, |
| "num_tokens": 1262961.0, |
| "step": 35 |
| }, |
| { |
| "entropy": 0.59296875, |
| "epoch": 0.06279434850863422, |
| "grad_norm": 4.61485579559788, |
| "learning_rate": 6.093750000000001e-06, |
| "loss": 0.5756, |
| "mean_token_accuracy": 0.852037787437439, |
| "num_tokens": 1434864.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 0.6490234375, |
| "epoch": 0.0706436420722135, |
| "grad_norm": 3.76615983960646, |
| "learning_rate": 6.875e-06, |
| "loss": 0.6202, |
| "mean_token_accuracy": 0.8431805431842804, |
| "num_tokens": 1613822.0, |
| "step": 45 |
| }, |
| { |
| "entropy": 0.6232421875, |
| "epoch": 0.07849293563579278, |
| "grad_norm": 4.323268612794359, |
| "learning_rate": 7.656250000000001e-06, |
| "loss": 0.6209, |
| "mean_token_accuracy": 0.8405247449874877, |
| "num_tokens": 1789186.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 0.6388671875, |
| "epoch": 0.08634222919937205, |
| "grad_norm": 3.652593030026506, |
| "learning_rate": 8.4375e-06, |
| "loss": 0.613, |
| "mean_token_accuracy": 0.8430534422397613, |
| "num_tokens": 1969254.0, |
| "step": 55 |
| }, |
| { |
| "entropy": 0.6021484375, |
| "epoch": 0.09419152276295134, |
| "grad_norm": 3.9563909364035212, |
| "learning_rate": 9.21875e-06, |
| "loss": 0.5886, |
| "mean_token_accuracy": 0.8484231889247894, |
| "num_tokens": 2149465.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 0.6640625, |
| "epoch": 0.10204081632653061, |
| "grad_norm": 4.647691513931947, |
| "learning_rate": 1e-05, |
| "loss": 0.6597, |
| "mean_token_accuracy": 0.8311540305614471, |
| "num_tokens": 2316427.0, |
| "step": 65 |
| }, |
| { |
| "entropy": 0.6666015625, |
| "epoch": 0.10989010989010989, |
| "grad_norm": 3.860853295594871, |
| "learning_rate": 1.0781250000000001e-05, |
| "loss": 0.6299, |
| "mean_token_accuracy": 0.8398839890956878, |
| "num_tokens": 2476047.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 0.6080078125, |
| "epoch": 0.11773940345368916, |
| "grad_norm": 4.134599413536603, |
| "learning_rate": 1.1562500000000002e-05, |
| "loss": 0.6148, |
| "mean_token_accuracy": 0.8433849811553955, |
| "num_tokens": 2666514.0, |
| "step": 75 |
| }, |
| { |
| "entropy": 0.66953125, |
| "epoch": 0.12558869701726844, |
| "grad_norm": 4.393935267584192, |
| "learning_rate": 1.234375e-05, |
| "loss": 0.655, |
| "mean_token_accuracy": 0.8323938012123108, |
| "num_tokens": 2845307.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 0.6310546875, |
| "epoch": 0.13343799058084774, |
| "grad_norm": 3.997712311552316, |
| "learning_rate": 1.3125e-05, |
| "loss": 0.6027, |
| "mean_token_accuracy": 0.8435549437999725, |
| "num_tokens": 3023624.0, |
| "step": 85 |
| }, |
| { |
| "entropy": 0.726953125, |
| "epoch": 0.141287284144427, |
| "grad_norm": 4.8386502206763815, |
| "learning_rate": 1.3906250000000001e-05, |
| "loss": 0.7214, |
| "mean_token_accuracy": 0.8174331367015839, |
| "num_tokens": 3195544.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 0.70546875, |
| "epoch": 0.14913657770800628, |
| "grad_norm": 4.7174464913779195, |
| "learning_rate": 1.4687500000000001e-05, |
| "loss": 0.6952, |
| "mean_token_accuracy": 0.8213136374950409, |
| "num_tokens": 3361327.0, |
| "step": 95 |
| }, |
| { |
| "entropy": 0.659765625, |
| "epoch": 0.15698587127158556, |
| "grad_norm": 4.517079736996241, |
| "learning_rate": 1.546875e-05, |
| "loss": 0.6254, |
| "mean_token_accuracy": 0.8344886541366577, |
| "num_tokens": 3528717.0, |
| "step": 100 |
| }, |
| { |
| "entropy": 0.628515625, |
| "epoch": 0.16483516483516483, |
| "grad_norm": 4.025576117988761, |
| "learning_rate": 1.6250000000000002e-05, |
| "loss": 0.6156, |
| "mean_token_accuracy": 0.8382946670055389, |
| "num_tokens": 3711033.0, |
| "step": 105 |
| }, |
| { |
| "entropy": 0.706640625, |
| "epoch": 0.1726844583987441, |
| "grad_norm": 4.39476825016141, |
| "learning_rate": 1.703125e-05, |
| "loss": 0.6876, |
| "mean_token_accuracy": 0.8211474418640137, |
| "num_tokens": 3896865.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 0.73046875, |
| "epoch": 0.18053375196232338, |
| "grad_norm": 4.5449040365136275, |
| "learning_rate": 1.7812500000000003e-05, |
| "loss": 0.706, |
| "mean_token_accuracy": 0.8200587868690491, |
| "num_tokens": 4064399.0, |
| "step": 115 |
| }, |
| { |
| "entropy": 0.798828125, |
| "epoch": 0.18838304552590268, |
| "grad_norm": 4.7172406953905055, |
| "learning_rate": 1.859375e-05, |
| "loss": 0.7724, |
| "mean_token_accuracy": 0.8062412679195404, |
| "num_tokens": 4250087.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 0.7640625, |
| "epoch": 0.19623233908948196, |
| "grad_norm": 4.334898424571733, |
| "learning_rate": 1.9375e-05, |
| "loss": 0.7589, |
| "mean_token_accuracy": 0.8066916942596436, |
| "num_tokens": 4424556.0, |
| "step": 125 |
| }, |
| { |
| "entropy": 0.66796875, |
| "epoch": 0.20408163265306123, |
| "grad_norm": 4.117466362089365, |
| "learning_rate": 1.999996242489157e-05, |
| "loss": 0.6492, |
| "mean_token_accuracy": 0.8317610502243042, |
| "num_tokens": 4602319.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 0.6859375, |
| "epoch": 0.2119309262166405, |
| "grad_norm": 4.969581124334222, |
| "learning_rate": 1.9998647325745995e-05, |
| "loss": 0.6877, |
| "mean_token_accuracy": 0.8207987725734711, |
| "num_tokens": 4781504.0, |
| "step": 135 |
| }, |
| { |
| "entropy": 0.72421875, |
| "epoch": 0.21978021978021978, |
| "grad_norm": 4.719028124484732, |
| "learning_rate": 1.9995453753547198e-05, |
| "loss": 0.7159, |
| "mean_token_accuracy": 0.8126244425773621, |
| "num_tokens": 4956176.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 0.676171875, |
| "epoch": 0.22762951334379905, |
| "grad_norm": 8.762448889791337, |
| "learning_rate": 1.9990382308280272e-05, |
| "loss": 0.6673, |
| "mean_token_accuracy": 0.8245218932628632, |
| "num_tokens": 5128454.0, |
| "step": 145 |
| }, |
| { |
| "entropy": 0.75859375, |
| "epoch": 0.23547880690737832, |
| "grad_norm": 4.8961918895212, |
| "learning_rate": 1.9983433942731427e-05, |
| "loss": 0.7384, |
| "mean_token_accuracy": 0.8101492524147034, |
| "num_tokens": 5318384.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 0.760546875, |
| "epoch": 0.24332810047095763, |
| "grad_norm": 5.258590906515805, |
| "learning_rate": 1.9974609962308986e-05, |
| "loss": 0.7455, |
| "mean_token_accuracy": 0.805279117822647, |
| "num_tokens": 5486676.0, |
| "step": 155 |
| }, |
| { |
| "entropy": 0.758203125, |
| "epoch": 0.25117739403453687, |
| "grad_norm": 4.883940068899578, |
| "learning_rate": 1.9963912024798136e-05, |
| "loss": 0.735, |
| "mean_token_accuracy": 0.8099753499031067, |
| "num_tokens": 5654901.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 0.71796875, |
| "epoch": 0.25902668759811615, |
| "grad_norm": 5.168092897390165, |
| "learning_rate": 1.9951342140049483e-05, |
| "loss": 0.6834, |
| "mean_token_accuracy": 0.8199768126010895, |
| "num_tokens": 5836491.0, |
| "step": 165 |
| }, |
| { |
| "entropy": 0.75078125, |
| "epoch": 0.2668759811616955, |
| "grad_norm": 5.194891788462451, |
| "learning_rate": 1.9936902669601436e-05, |
| "loss": 0.7382, |
| "mean_token_accuracy": 0.8087680697441101, |
| "num_tokens": 6016945.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 0.7578125, |
| "epoch": 0.27472527472527475, |
| "grad_norm": 4.272844578361102, |
| "learning_rate": 1.992059632623657e-05, |
| "loss": 0.7216, |
| "mean_token_accuracy": 0.810324364900589, |
| "num_tokens": 6211712.0, |
| "step": 175 |
| }, |
| { |
| "entropy": 0.7265625, |
| "epoch": 0.282574568288854, |
| "grad_norm": 5.177357949949477, |
| "learning_rate": 1.9902426173471933e-05, |
| "loss": 0.723, |
| "mean_token_accuracy": 0.8130602538585663, |
| "num_tokens": 6391206.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 0.690625, |
| "epoch": 0.2904238618524333, |
| "grad_norm": 4.3708505660541634, |
| "learning_rate": 1.9882395624983522e-05, |
| "loss": 0.6693, |
| "mean_token_accuracy": 0.8209006905555725, |
| "num_tokens": 6568364.0, |
| "step": 185 |
| }, |
| { |
| "entropy": 0.76484375, |
| "epoch": 0.29827315541601257, |
| "grad_norm": 4.5630509683508835, |
| "learning_rate": 1.986050844396493e-05, |
| "loss": 0.737, |
| "mean_token_accuracy": 0.8066314697265625, |
| "num_tokens": 6743609.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 0.79609375, |
| "epoch": 0.30612244897959184, |
| "grad_norm": 4.704418610612491, |
| "learning_rate": 1.9836768742420355e-05, |
| "loss": 0.7801, |
| "mean_token_accuracy": 0.796642541885376, |
| "num_tokens": 6905826.0, |
| "step": 195 |
| }, |
| { |
| "entropy": 0.830859375, |
| "epoch": 0.3139717425431711, |
| "grad_norm": 7.369608742272206, |
| "learning_rate": 1.9811180980392054e-05, |
| "loss": 0.8205, |
| "mean_token_accuracy": 0.7892749607563019, |
| "num_tokens": 7082387.0, |
| "step": 200 |
| }, |
| { |
| "entropy": 0.808203125, |
| "epoch": 0.3218210361067504, |
| "grad_norm": 6.33690320523207, |
| "learning_rate": 1.9783749965122444e-05, |
| "loss": 0.8285, |
| "mean_token_accuracy": 0.7886267483234406, |
| "num_tokens": 7266538.0, |
| "step": 205 |
| }, |
| { |
| "entropy": 0.805078125, |
| "epoch": 0.32967032967032966, |
| "grad_norm": 5.4501950578656695, |
| "learning_rate": 1.975448085015093e-05, |
| "loss": 0.7669, |
| "mean_token_accuracy": 0.8004365921020508, |
| "num_tokens": 7441353.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 0.8328125, |
| "epoch": 0.33751962323390894, |
| "grad_norm": 5.481262275844823, |
| "learning_rate": 1.9723379134345698e-05, |
| "loss": 0.7928, |
| "mean_token_accuracy": 0.7955616354942322, |
| "num_tokens": 7610090.0, |
| "step": 215 |
| }, |
| { |
| "entropy": 0.6796875, |
| "epoch": 0.3453689167974882, |
| "grad_norm": 14.03469496366746, |
| "learning_rate": 1.9690450660870657e-05, |
| "loss": 0.7119, |
| "mean_token_accuracy": 0.8117523312568664, |
| "num_tokens": 7785150.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 0.837109375, |
| "epoch": 0.3532182103610675, |
| "grad_norm": 21.481304681046133, |
| "learning_rate": 1.965570161608762e-05, |
| "loss": 0.7928, |
| "mean_token_accuracy": 0.796742171049118, |
| "num_tokens": 7947571.0, |
| "step": 225 |
| }, |
| { |
| "entropy": 0.694921875, |
| "epoch": 0.36106750392464676, |
| "grad_norm": 4.843158336605314, |
| "learning_rate": 1.961913852839409e-05, |
| "loss": 0.7393, |
| "mean_token_accuracy": 0.8110623478889465, |
| "num_tokens": 8121912.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 0.84140625, |
| "epoch": 0.36891679748822603, |
| "grad_norm": 4.852685356865867, |
| "learning_rate": 1.958076826699676e-05, |
| "loss": 0.8183, |
| "mean_token_accuracy": 0.7912623822689057, |
| "num_tokens": 8298107.0, |
| "step": 235 |
| }, |
| { |
| "entropy": 0.7515625, |
| "epoch": 0.37676609105180536, |
| "grad_norm": 4.73767538351422, |
| "learning_rate": 1.954059804062092e-05, |
| "loss": 0.7296, |
| "mean_token_accuracy": 0.809317535161972, |
| "num_tokens": 8474685.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 0.759765625, |
| "epoch": 0.38461538461538464, |
| "grad_norm": 4.736881234284725, |
| "learning_rate": 1.9498635396156217e-05, |
| "loss": 0.7585, |
| "mean_token_accuracy": 0.8065651416778564, |
| "num_tokens": 8646326.0, |
| "step": 245 |
| }, |
| { |
| "entropy": 0.790625, |
| "epoch": 0.3924646781789639, |
| "grad_norm": 6.139673216563283, |
| "learning_rate": 1.945488821723873e-05, |
| "loss": 0.8023, |
| "mean_token_accuracy": 0.7948877096176148, |
| "num_tokens": 8829018.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 0.77734375, |
| "epoch": 0.4003139717425432, |
| "grad_norm": 4.928971267663469, |
| "learning_rate": 1.9409364722769882e-05, |
| "loss": 0.7675, |
| "mean_token_accuracy": 0.7988379955291748, |
| "num_tokens": 9005749.0, |
| "step": 255 |
| }, |
| { |
| "entropy": 0.776171875, |
| "epoch": 0.40816326530612246, |
| "grad_norm": 5.40486354385506, |
| "learning_rate": 1.936207346537233e-05, |
| "loss": 0.7616, |
| "mean_token_accuracy": 0.8005188524723053, |
| "num_tokens": 9183456.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 0.7015625, |
| "epoch": 0.41601255886970173, |
| "grad_norm": 4.788382627075379, |
| "learning_rate": 1.931302332978316e-05, |
| "loss": 0.6827, |
| "mean_token_accuracy": 0.8215398371219635, |
| "num_tokens": 9366675.0, |
| "step": 265 |
| }, |
| { |
| "entropy": 0.812109375, |
| "epoch": 0.423861852433281, |
| "grad_norm": 4.627337335063183, |
| "learning_rate": 1.9262223531184678e-05, |
| "loss": 0.8108, |
| "mean_token_accuracy": 0.7931458294391632, |
| "num_tokens": 9537586.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 0.833984375, |
| "epoch": 0.4317111459968603, |
| "grad_norm": 5.563859895314659, |
| "learning_rate": 1.9209683613473143e-05, |
| "loss": 0.849, |
| "mean_token_accuracy": 0.7871452808380127, |
| "num_tokens": 9706784.0, |
| "step": 275 |
| }, |
| { |
| "entropy": 0.767578125, |
| "epoch": 0.43956043956043955, |
| "grad_norm": 4.599264719197712, |
| "learning_rate": 1.9155413447465715e-05, |
| "loss": 0.752, |
| "mean_token_accuracy": 0.8034415602684021, |
| "num_tokens": 9875538.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 0.809765625, |
| "epoch": 0.4474097331240188, |
| "grad_norm": 5.321937764249497, |
| "learning_rate": 1.9099423229046015e-05, |
| "loss": 0.7833, |
| "mean_token_accuracy": 0.7978603601455688, |
| "num_tokens": 10044475.0, |
| "step": 285 |
| }, |
| { |
| "entropy": 0.8265625, |
| "epoch": 0.4552590266875981, |
| "grad_norm": 5.63723949982722, |
| "learning_rate": 1.9041723477248575e-05, |
| "loss": 0.8204, |
| "mean_token_accuracy": 0.7882011353969574, |
| "num_tokens": 10222319.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 0.776953125, |
| "epoch": 0.4631083202511774, |
| "grad_norm": 4.6603074318506, |
| "learning_rate": 1.8982325032282616e-05, |
| "loss": 0.7813, |
| "mean_token_accuracy": 0.7975976228713989, |
| "num_tokens": 10399979.0, |
| "step": 295 |
| }, |
| { |
| "entropy": 0.832421875, |
| "epoch": 0.47095761381475665, |
| "grad_norm": 11.534881035486686, |
| "learning_rate": 1.8921239053495465e-05, |
| "loss": 0.807, |
| "mean_token_accuracy": 0.7901058197021484, |
| "num_tokens": 10572020.0, |
| "step": 300 |
| }, |
| { |
| "entropy": 0.78671875, |
| "epoch": 0.478806907378336, |
| "grad_norm": 4.825964771710648, |
| "learning_rate": 1.8858477017276002e-05, |
| "loss": 0.7848, |
| "mean_token_accuracy": 0.7961919009685516, |
| "num_tokens": 10747199.0, |
| "step": 305 |
| }, |
| { |
| "entropy": 0.815625, |
| "epoch": 0.48665620094191525, |
| "grad_norm": 29.937628894998547, |
| "learning_rate": 1.8794050714898596e-05, |
| "loss": 0.7982, |
| "mean_token_accuracy": 0.7925647079944611, |
| "num_tokens": 10924508.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 0.801953125, |
| "epoch": 0.4945054945054945, |
| "grad_norm": 5.031515759113725, |
| "learning_rate": 1.87279722503078e-05, |
| "loss": 0.7959, |
| "mean_token_accuracy": 0.7924800992012024, |
| "num_tokens": 11097529.0, |
| "step": 315 |
| }, |
| { |
| "entropy": 0.7953125, |
| "epoch": 0.5023547880690737, |
| "grad_norm": 5.5920861837911335, |
| "learning_rate": 1.866025403784439e-05, |
| "loss": 0.7869, |
| "mean_token_accuracy": 0.7924351811408996, |
| "num_tokens": 11278748.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 0.773046875, |
| "epoch": 0.5102040816326531, |
| "grad_norm": 4.767882154179232, |
| "learning_rate": 1.859090879991302e-05, |
| "loss": 0.7659, |
| "mean_token_accuracy": 0.7989370346069335, |
| "num_tokens": 11456413.0, |
| "step": 325 |
| }, |
| { |
| "entropy": 0.83359375, |
| "epoch": 0.5180533751962323, |
| "grad_norm": 4.615403763103987, |
| "learning_rate": 1.8519949564592047e-05, |
| "loss": 0.8124, |
| "mean_token_accuracy": 0.7914808392524719, |
| "num_tokens": 11625785.0, |
| "step": 330 |
| }, |
| { |
| "entropy": 0.809765625, |
| "epoch": 0.5259026687598116, |
| "grad_norm": 4.106320362967877, |
| "learning_rate": 1.8447389663185905e-05, |
| "loss": 0.8026, |
| "mean_token_accuracy": 0.7902366161346436, |
| "num_tokens": 11812125.0, |
| "step": 335 |
| }, |
| { |
| "entropy": 0.77109375, |
| "epoch": 0.533751962323391, |
| "grad_norm": 5.182177337614667, |
| "learning_rate": 1.837324272772052e-05, |
| "loss": 0.7623, |
| "mean_token_accuracy": 0.8023563742637634, |
| "num_tokens": 12001790.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 0.79921875, |
| "epoch": 0.5416012558869702, |
| "grad_norm": 5.040875278584284, |
| "learning_rate": 1.829752268838222e-05, |
| "loss": 0.7811, |
| "mean_token_accuracy": 0.7928344547748566, |
| "num_tokens": 12179297.0, |
| "step": 345 |
| }, |
| { |
| "entropy": 0.79453125, |
| "epoch": 0.5494505494505495, |
| "grad_norm": 8.20904916308416, |
| "learning_rate": 1.8220243770900623e-05, |
| "loss": 0.7627, |
| "mean_token_accuracy": 0.8005800724029541, |
| "num_tokens": 12353848.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 0.777734375, |
| "epoch": 0.5572998430141287, |
| "grad_norm": 4.8018626718228985, |
| "learning_rate": 1.8141420493876035e-05, |
| "loss": 0.7752, |
| "mean_token_accuracy": 0.7982768774032593, |
| "num_tokens": 12531024.0, |
| "step": 355 |
| }, |
| { |
| "entropy": 0.765625, |
| "epoch": 0.565149136577708, |
| "grad_norm": 5.109362259703448, |
| "learning_rate": 1.806106766605178e-05, |
| "loss": 0.7332, |
| "mean_token_accuracy": 0.8097867131233215, |
| "num_tokens": 12713443.0, |
| "step": 360 |
| }, |
| { |
| "entropy": 0.7828125, |
| "epoch": 0.5729984301412873, |
| "grad_norm": 5.017082784768869, |
| "learning_rate": 1.7979200383532055e-05, |
| "loss": 0.787, |
| "mean_token_accuracy": 0.7963611066341401, |
| "num_tokens": 12896207.0, |
| "step": 365 |
| }, |
| { |
| "entropy": 0.76328125, |
| "epoch": 0.5808477237048666, |
| "grad_norm": 5.429786099705206, |
| "learning_rate": 1.789583402694577e-05, |
| "loss": 0.751, |
| "mean_token_accuracy": 0.8028202712535858, |
| "num_tokens": 13074959.0, |
| "step": 370 |
| }, |
| { |
| "entropy": 0.800390625, |
| "epoch": 0.5886970172684458, |
| "grad_norm": 4.665804832964679, |
| "learning_rate": 1.7810984258556955e-05, |
| "loss": 0.7902, |
| "mean_token_accuracy": 0.7924364864826202, |
| "num_tokens": 13239381.0, |
| "step": 375 |
| }, |
| { |
| "entropy": 0.821484375, |
| "epoch": 0.5965463108320251, |
| "grad_norm": 5.110027949764656, |
| "learning_rate": 1.7724667019322258e-05, |
| "loss": 0.8013, |
| "mean_token_accuracy": 0.7881995797157287, |
| "num_tokens": 13409287.0, |
| "step": 380 |
| }, |
| { |
| "entropy": 0.79921875, |
| "epoch": 0.6043956043956044, |
| "grad_norm": 5.316113035916057, |
| "learning_rate": 1.7636898525896057e-05, |
| "loss": 0.7955, |
| "mean_token_accuracy": 0.7934599101543427, |
| "num_tokens": 13587551.0, |
| "step": 385 |
| }, |
| { |
| "entropy": 0.784375, |
| "epoch": 0.6122448979591837, |
| "grad_norm": 4.69729907059058, |
| "learning_rate": 1.7547695267583794e-05, |
| "loss": 0.7497, |
| "mean_token_accuracy": 0.8004011273384094, |
| "num_tokens": 13765322.0, |
| "step": 390 |
| }, |
| { |
| "entropy": 0.835546875, |
| "epoch": 0.6200941915227629, |
| "grad_norm": 4.7498568521557285, |
| "learning_rate": 1.74570740032441e-05, |
| "loss": 0.8571, |
| "mean_token_accuracy": 0.7803294241428376, |
| "num_tokens": 13941975.0, |
| "step": 395 |
| }, |
| { |
| "entropy": 0.864453125, |
| "epoch": 0.6279434850863422, |
| "grad_norm": 4.6116162900514315, |
| "learning_rate": 1.736505175814025e-05, |
| "loss": 0.851, |
| "mean_token_accuracy": 0.7785885393619537, |
| "num_tokens": 14124355.0, |
| "step": 400 |
| }, |
| { |
| "entropy": 0.801171875, |
| "epoch": 0.6357927786499215, |
| "grad_norm": 5.733457038092804, |
| "learning_rate": 1.7271645820741586e-05, |
| "loss": 0.8094, |
| "mean_token_accuracy": 0.78756765127182, |
| "num_tokens": 14293787.0, |
| "step": 405 |
| }, |
| { |
| "entropy": 0.739453125, |
| "epoch": 0.6436420722135008, |
| "grad_norm": 4.338135609354382, |
| "learning_rate": 1.7176873739475475e-05, |
| "loss": 0.7284, |
| "mean_token_accuracy": 0.8109029471874237, |
| "num_tokens": 14474817.0, |
| "step": 410 |
| }, |
| { |
| "entropy": 0.767578125, |
| "epoch": 0.6514913657770801, |
| "grad_norm": 5.302940753837162, |
| "learning_rate": 1.7080753319430452e-05, |
| "loss": 0.7589, |
| "mean_token_accuracy": 0.80281902551651, |
| "num_tokens": 14652072.0, |
| "step": 415 |
| }, |
| { |
| "entropy": 0.8328125, |
| "epoch": 0.6593406593406593, |
| "grad_norm": 5.1332141590489, |
| "learning_rate": 1.6983302619011125e-05, |
| "loss": 0.8207, |
| "mean_token_accuracy": 0.7852710247039795, |
| "num_tokens": 14825230.0, |
| "step": 420 |
| }, |
| { |
| "entropy": 0.72265625, |
| "epoch": 0.6671899529042387, |
| "grad_norm": 4.472690721905866, |
| "learning_rate": 1.6884539946545486e-05, |
| "loss": 0.6966, |
| "mean_token_accuracy": 0.8123364210128784, |
| "num_tokens": 14998108.0, |
| "step": 425 |
| }, |
| { |
| "entropy": 0.796484375, |
| "epoch": 0.6750392464678179, |
| "grad_norm": 4.634375280959423, |
| "learning_rate": 1.6784483856845287e-05, |
| "loss": 0.792, |
| "mean_token_accuracy": 0.7913555383682251, |
| "num_tokens": 15179352.0, |
| "step": 430 |
| }, |
| { |
| "entropy": 0.786328125, |
| "epoch": 0.6828885400313972, |
| "grad_norm": 4.746241903035234, |
| "learning_rate": 1.6683153147720098e-05, |
| "loss": 0.7729, |
| "mean_token_accuracy": 0.7997291147708893, |
| "num_tokens": 15350374.0, |
| "step": 435 |
| }, |
| { |
| "entropy": 0.7984375, |
| "epoch": 0.6907378335949764, |
| "grad_norm": 4.585308958448852, |
| "learning_rate": 1.6580566856445684e-05, |
| "loss": 0.8056, |
| "mean_token_accuracy": 0.7899181842803955, |
| "num_tokens": 15526013.0, |
| "step": 440 |
| }, |
| { |
| "entropy": 0.819921875, |
| "epoch": 0.6985871271585558, |
| "grad_norm": 4.601688747850842, |
| "learning_rate": 1.647674425618747e-05, |
| "loss": 0.8099, |
| "mean_token_accuracy": 0.7879339218139648, |
| "num_tokens": 15699724.0, |
| "step": 445 |
| }, |
| { |
| "entropy": 0.76875, |
| "epoch": 0.706436420722135, |
| "grad_norm": 5.412743559231603, |
| "learning_rate": 1.6371704852379587e-05, |
| "loss": 0.7586, |
| "mean_token_accuracy": 0.7998056769371032, |
| "num_tokens": 15874961.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 0.797265625, |
| "epoch": 0.7142857142857143, |
| "grad_norm": 5.991686412229486, |
| "learning_rate": 1.6265468379060364e-05, |
| "loss": 0.7824, |
| "mean_token_accuracy": 0.7962137937545777, |
| "num_tokens": 16041765.0, |
| "step": 455 |
| }, |
| { |
| "entropy": 0.79296875, |
| "epoch": 0.7221350078492935, |
| "grad_norm": 4.671481067368767, |
| "learning_rate": 1.615805479516484e-05, |
| "loss": 0.7786, |
| "mean_token_accuracy": 0.7946656525135041, |
| "num_tokens": 16217463.0, |
| "step": 460 |
| }, |
| { |
| "entropy": 0.7421875, |
| "epoch": 0.7299843014128728, |
| "grad_norm": 4.396017574728974, |
| "learning_rate": 1.6049484280775012e-05, |
| "loss": 0.7342, |
| "mean_token_accuracy": 0.8057381689548493, |
| "num_tokens": 16400762.0, |
| "step": 465 |
| }, |
| { |
| "entropy": 0.836328125, |
| "epoch": 0.7378335949764521, |
| "grad_norm": 5.547458007168034, |
| "learning_rate": 1.593977723332855e-05, |
| "loss": 0.8167, |
| "mean_token_accuracy": 0.785160768032074, |
| "num_tokens": 16580373.0, |
| "step": 470 |
| }, |
| { |
| "entropy": 0.81328125, |
| "epoch": 0.7456828885400314, |
| "grad_norm": 5.112513515390749, |
| "learning_rate": 1.5828954263786688e-05, |
| "loss": 0.8067, |
| "mean_token_accuracy": 0.7897652447223663, |
| "num_tokens": 16755173.0, |
| "step": 475 |
| }, |
| { |
| "entropy": 0.771875, |
| "epoch": 0.7535321821036107, |
| "grad_norm": 4.284314946059211, |
| "learning_rate": 1.571703619276197e-05, |
| "loss": 0.7624, |
| "mean_token_accuracy": 0.7975770771503449, |
| "num_tokens": 16928371.0, |
| "step": 480 |
| }, |
| { |
| "entropy": 0.7578125, |
| "epoch": 0.7613814756671899, |
| "grad_norm": 4.801660029299967, |
| "learning_rate": 1.5604044046606638e-05, |
| "loss": 0.7405, |
| "mean_token_accuracy": 0.8036318182945251, |
| "num_tokens": 17109329.0, |
| "step": 485 |
| }, |
| { |
| "entropy": 0.818359375, |
| "epoch": 0.7692307692307693, |
| "grad_norm": 5.28306110517908, |
| "learning_rate": 1.548999905346234e-05, |
| "loss": 0.8107, |
| "mean_token_accuracy": 0.7882427215576172, |
| "num_tokens": 17278892.0, |
| "step": 490 |
| }, |
| { |
| "entropy": 0.7859375, |
| "epoch": 0.7770800627943485, |
| "grad_norm": 15.112721306709377, |
| "learning_rate": 1.537492263927196e-05, |
| "loss": 0.8027, |
| "mean_token_accuracy": 0.796792733669281, |
| "num_tokens": 17453647.0, |
| "step": 495 |
| }, |
| { |
| "entropy": 0.915234375, |
| "epoch": 0.7849293563579278, |
| "grad_norm": 4.296741151037327, |
| "learning_rate": 1.5258836423754258e-05, |
| "loss": 0.8982, |
| "mean_token_accuracy": 0.7748664259910584, |
| "num_tokens": 17623664.0, |
| "step": 500 |
| }, |
| { |
| "entropy": 0.8140625, |
| "epoch": 0.792778649921507, |
| "grad_norm": 4.661848552053591, |
| "learning_rate": 1.5141762216342107e-05, |
| "loss": 0.7966, |
| "mean_token_accuracy": 0.792145174741745, |
| "num_tokens": 17783873.0, |
| "step": 505 |
| }, |
| { |
| "entropy": 0.772265625, |
| "epoch": 0.8006279434850864, |
| "grad_norm": 4.675795378963432, |
| "learning_rate": 1.5023722012085098e-05, |
| "loss": 0.7635, |
| "mean_token_accuracy": 0.7970840752124786, |
| "num_tokens": 17955480.0, |
| "step": 510 |
| }, |
| { |
| "entropy": 0.765625, |
| "epoch": 0.8084772370486656, |
| "grad_norm": 4.729782812344114, |
| "learning_rate": 1.4904737987517293e-05, |
| "loss": 0.7329, |
| "mean_token_accuracy": 0.8076993823051453, |
| "num_tokens": 18120464.0, |
| "step": 515 |
| }, |
| { |
| "entropy": 0.814453125, |
| "epoch": 0.8163265306122449, |
| "grad_norm": 5.0071131052667015, |
| "learning_rate": 1.4784832496490824e-05, |
| "loss": 0.7925, |
| "mean_token_accuracy": 0.7883158624172211, |
| "num_tokens": 18294198.0, |
| "step": 520 |
| }, |
| { |
| "entropy": 0.7734375, |
| "epoch": 0.8241758241758241, |
| "grad_norm": 5.267980807907245, |
| "learning_rate": 1.4664028065976245e-05, |
| "loss": 0.7613, |
| "mean_token_accuracy": 0.8003257095813752, |
| "num_tokens": 18475591.0, |
| "step": 525 |
| }, |
| { |
| "entropy": 0.785546875, |
| "epoch": 0.8320251177394035, |
| "grad_norm": 4.671829616451408, |
| "learning_rate": 1.4542347391830308e-05, |
| "loss": 0.7572, |
| "mean_token_accuracy": 0.8027947068214416, |
| "num_tokens": 18651337.0, |
| "step": 530 |
| }, |
| { |
| "entropy": 0.777734375, |
| "epoch": 0.8398744113029827, |
| "grad_norm": 4.5168552231692916, |
| "learning_rate": 1.4419813334532037e-05, |
| "loss": 0.7769, |
| "mean_token_accuracy": 0.7937651753425599, |
| "num_tokens": 18811548.0, |
| "step": 535 |
| }, |
| { |
| "entropy": 0.745703125, |
| "epoch": 0.847723704866562, |
| "grad_norm": 4.675368428382704, |
| "learning_rate": 1.4296448914887866e-05, |
| "loss": 0.7328, |
| "mean_token_accuracy": 0.8040679156780243, |
| "num_tokens": 18980140.0, |
| "step": 540 |
| }, |
| { |
| "entropy": 0.73046875, |
| "epoch": 0.8555729984301413, |
| "grad_norm": 4.5422953697606285, |
| "learning_rate": 1.4172277309706677e-05, |
| "loss": 0.7198, |
| "mean_token_accuracy": 0.8091396510601043, |
| "num_tokens": 19155730.0, |
| "step": 545 |
| }, |
| { |
| "entropy": 0.77734375, |
| "epoch": 0.8634222919937206, |
| "grad_norm": 4.113548725011141, |
| "learning_rate": 1.4047321847445474e-05, |
| "loss": 0.7875, |
| "mean_token_accuracy": 0.7970530390739441, |
| "num_tokens": 19331206.0, |
| "step": 550 |
| }, |
| { |
| "entropy": 0.79609375, |
| "epoch": 0.8712715855572999, |
| "grad_norm": 4.491027067352344, |
| "learning_rate": 1.392160600382663e-05, |
| "loss": 0.7744, |
| "mean_token_accuracy": 0.7995224177837372, |
| "num_tokens": 19501358.0, |
| "step": 555 |
| }, |
| { |
| "entropy": 0.7640625, |
| "epoch": 0.8791208791208791, |
| "grad_norm": 6.530128605470617, |
| "learning_rate": 1.3795153397427426e-05, |
| "loss": 0.7383, |
| "mean_token_accuracy": 0.8075939774513244, |
| "num_tokens": 19682092.0, |
| "step": 560 |
| }, |
| { |
| "entropy": 0.791015625, |
| "epoch": 0.8869701726844584, |
| "grad_norm": 5.163885415034253, |
| "learning_rate": 1.3667987785242776e-05, |
| "loss": 0.7831, |
| "mean_token_accuracy": 0.7966715455055237, |
| "num_tokens": 19854955.0, |
| "step": 565 |
| }, |
| { |
| "entropy": 0.755859375, |
| "epoch": 0.8948194662480377, |
| "grad_norm": 4.740210922746398, |
| "learning_rate": 1.3540133058221927e-05, |
| "loss": 0.7349, |
| "mean_token_accuracy": 0.8023954510688782, |
| "num_tokens": 20014539.0, |
| "step": 570 |
| }, |
| { |
| "entropy": 0.7421875, |
| "epoch": 0.902668759811617, |
| "grad_norm": 4.792699385906238, |
| "learning_rate": 1.3411613236779996e-05, |
| "loss": 0.7484, |
| "mean_token_accuracy": 0.8043593823909759, |
| "num_tokens": 20185864.0, |
| "step": 575 |
| }, |
| { |
| "entropy": 0.784765625, |
| "epoch": 0.9105180533751962, |
| "grad_norm": 5.861965682781705, |
| "learning_rate": 1.328245246628521e-05, |
| "loss": 0.7676, |
| "mean_token_accuracy": 0.7994747996330261, |
| "num_tokens": 20353415.0, |
| "step": 580 |
| }, |
| { |
| "entropy": 0.7734375, |
| "epoch": 0.9183673469387755, |
| "grad_norm": 4.553143635133164, |
| "learning_rate": 1.3152675012522629e-05, |
| "loss": 0.7602, |
| "mean_token_accuracy": 0.8037185490131378, |
| "num_tokens": 20528137.0, |
| "step": 585 |
| }, |
| { |
| "entropy": 0.8125, |
| "epoch": 0.9262166405023547, |
| "grad_norm": 4.341140007225268, |
| "learning_rate": 1.302230525713527e-05, |
| "loss": 0.8159, |
| "mean_token_accuracy": 0.787601375579834, |
| "num_tokens": 20704595.0, |
| "step": 590 |
| }, |
| { |
| "entropy": 0.815625, |
| "epoch": 0.9340659340659341, |
| "grad_norm": 4.534727275721023, |
| "learning_rate": 1.2891367693043477e-05, |
| "loss": 0.8041, |
| "mean_token_accuracy": 0.7893571734428406, |
| "num_tokens": 20874161.0, |
| "step": 595 |
| }, |
| { |
| "entropy": 0.7796875, |
| "epoch": 0.9419152276295133, |
| "grad_norm": 5.302756725693076, |
| "learning_rate": 1.2759886919843354e-05, |
| "loss": 0.7448, |
| "mean_token_accuracy": 0.8065652489662171, |
| "num_tokens": 21034972.0, |
| "step": 600 |
| }, |
| { |
| "entropy": 0.778515625, |
| "epoch": 0.9497645211930926, |
| "grad_norm": 4.358915295776673, |
| "learning_rate": 1.262788763918518e-05, |
| "loss": 0.7892, |
| "mean_token_accuracy": 0.7949841141700744, |
| "num_tokens": 21208208.0, |
| "step": 605 |
| }, |
| { |
| "entropy": 0.80546875, |
| "epoch": 0.957613814756672, |
| "grad_norm": 4.657817652151732, |
| "learning_rate": 1.2495394650132628e-05, |
| "loss": 0.7964, |
| "mean_token_accuracy": 0.7912369012832642, |
| "num_tokens": 21385817.0, |
| "step": 610 |
| }, |
| { |
| "entropy": 0.805859375, |
| "epoch": 0.9654631083202512, |
| "grad_norm": 4.053778957869286, |
| "learning_rate": 1.2362432844503725e-05, |
| "loss": 0.7902, |
| "mean_token_accuracy": 0.7954855740070343, |
| "num_tokens": 21554177.0, |
| "step": 615 |
| }, |
| { |
| "entropy": 0.77578125, |
| "epoch": 0.9733124018838305, |
| "grad_norm": 4.449402252518514, |
| "learning_rate": 1.222902720219433e-05, |
| "loss": 0.7802, |
| "mean_token_accuracy": 0.7978703141212463, |
| "num_tokens": 21733009.0, |
| "step": 620 |
| }, |
| { |
| "entropy": 0.82265625, |
| "epoch": 0.9811616954474097, |
| "grad_norm": 4.034946890134492, |
| "learning_rate": 1.209520278648512e-05, |
| "loss": 0.7812, |
| "mean_token_accuracy": 0.797600257396698, |
| "num_tokens": 21916510.0, |
| "step": 625 |
| }, |
| { |
| "entropy": 0.6578125, |
| "epoch": 0.989010989010989, |
| "grad_norm": 4.245901684288803, |
| "learning_rate": 1.1960984739332851e-05, |
| "loss": 0.6496, |
| "mean_token_accuracy": 0.824074250459671, |
| "num_tokens": 22091705.0, |
| "step": 630 |
| }, |
| { |
| "entropy": 0.796484375, |
| "epoch": 0.9968602825745683, |
| "grad_norm": 4.100658254128751, |
| "learning_rate": 1.1826398276646897e-05, |
| "loss": 0.7845, |
| "mean_token_accuracy": 0.7937996804714202, |
| "num_tokens": 22260035.0, |
| "step": 635 |
| }, |
| { |
| "entropy": 0.558984375, |
| "epoch": 1.0047095761381475, |
| "grad_norm": 5.439305366155218, |
| "learning_rate": 1.1691468683551865e-05, |
| "loss": 0.5154, |
| "mean_token_accuracy": 0.8592477262020111, |
| "num_tokens": 22441941.0, |
| "step": 640 |
| }, |
| { |
| "entropy": 0.3642578125, |
| "epoch": 1.012558869701727, |
| "grad_norm": 4.420472353359968, |
| "learning_rate": 1.1556221309637204e-05, |
| "loss": 0.3321, |
| "mean_token_accuracy": 0.9043326616287232, |
| "num_tokens": 22615765.0, |
| "step": 645 |
| }, |
| { |
| "entropy": 0.3724609375, |
| "epoch": 1.0204081632653061, |
| "grad_norm": 4.573568228629025, |
| "learning_rate": 1.1420681564194694e-05, |
| "loss": 0.335, |
| "mean_token_accuracy": 0.9050298452377319, |
| "num_tokens": 22788401.0, |
| "step": 650 |
| }, |
| { |
| "entropy": 0.4099609375, |
| "epoch": 1.0282574568288854, |
| "grad_norm": 4.215481005342787, |
| "learning_rate": 1.1284874911444763e-05, |
| "loss": 0.3632, |
| "mean_token_accuracy": 0.8970784783363343, |
| "num_tokens": 22960605.0, |
| "step": 655 |
| }, |
| { |
| "entropy": 0.3654296875, |
| "epoch": 1.0361067503924646, |
| "grad_norm": 4.218508213156498, |
| "learning_rate": 1.1148826865752445e-05, |
| "loss": 0.3575, |
| "mean_token_accuracy": 0.898915582895279, |
| "num_tokens": 23139019.0, |
| "step": 660 |
| }, |
| { |
| "entropy": 0.455078125, |
| "epoch": 1.043956043956044, |
| "grad_norm": 3.741632896639657, |
| "learning_rate": 1.1012562986833909e-05, |
| "loss": 0.4289, |
| "mean_token_accuracy": 0.8870194792747498, |
| "num_tokens": 23320382.0, |
| "step": 665 |
| }, |
| { |
| "entropy": 0.3673828125, |
| "epoch": 1.0518053375196232, |
| "grad_norm": 4.3558521664226095, |
| "learning_rate": 1.0876108874954498e-05, |
| "loss": 0.3376, |
| "mean_token_accuracy": 0.902996426820755, |
| "num_tokens": 23483411.0, |
| "step": 670 |
| }, |
| { |
| "entropy": 0.402734375, |
| "epoch": 1.0596546310832025, |
| "grad_norm": 4.363221525545958, |
| "learning_rate": 1.0739490166119155e-05, |
| "loss": 0.3553, |
| "mean_token_accuracy": 0.8993325650691986, |
| "num_tokens": 23655293.0, |
| "step": 675 |
| }, |
| { |
| "entropy": 0.347265625, |
| "epoch": 1.0675039246467817, |
| "grad_norm": 4.3509011526726225, |
| "learning_rate": 1.060273252725609e-05, |
| "loss": 0.3179, |
| "mean_token_accuracy": 0.9088070094585419, |
| "num_tokens": 23827360.0, |
| "step": 680 |
| }, |
| { |
| "entropy": 0.3892578125, |
| "epoch": 1.0753532182103611, |
| "grad_norm": 3.9376143288253336, |
| "learning_rate": 1.0465861651394673e-05, |
| "loss": 0.3389, |
| "mean_token_accuracy": 0.9021193146705627, |
| "num_tokens": 23998614.0, |
| "step": 685 |
| }, |
| { |
| "entropy": 0.394140625, |
| "epoch": 1.0832025117739403, |
| "grad_norm": 4.541470682953176, |
| "learning_rate": 1.0328903252838415e-05, |
| "loss": 0.3572, |
| "mean_token_accuracy": 0.8969906985759735, |
| "num_tokens": 24162044.0, |
| "step": 690 |
| }, |
| { |
| "entropy": 0.378515625, |
| "epoch": 1.0910518053375196, |
| "grad_norm": 4.055789916536888, |
| "learning_rate": 1.0191883062333964e-05, |
| "loss": 0.326, |
| "mean_token_accuracy": 0.9068751513957978, |
| "num_tokens": 24337261.0, |
| "step": 695 |
| }, |
| { |
| "entropy": 0.333984375, |
| "epoch": 1.098901098901099, |
| "grad_norm": 5.830501058734357, |
| "learning_rate": 1.0054826822236983e-05, |
| "loss": 0.2959, |
| "mean_token_accuracy": 0.9131093323230743, |
| "num_tokens": 24502246.0, |
| "step": 700 |
| }, |
| { |
| "entropy": 0.3708984375, |
| "epoch": 1.1067503924646782, |
| "grad_norm": 4.028032326842961, |
| "learning_rate": 9.917760281675867e-06, |
| "loss": 0.3345, |
| "mean_token_accuracy": 0.903233277797699, |
| "num_tokens": 24678284.0, |
| "step": 705 |
| }, |
| { |
| "entropy": 0.389453125, |
| "epoch": 1.1145996860282574, |
| "grad_norm": 4.032598919287802, |
| "learning_rate": 9.780709191714187e-06, |
| "loss": 0.3506, |
| "mean_token_accuracy": 0.8997887074947357, |
| "num_tokens": 24847385.0, |
| "step": 710 |
| }, |
| { |
| "entropy": 0.3677734375, |
| "epoch": 1.1224489795918366, |
| "grad_norm": 5.251015713215736, |
| "learning_rate": 9.643699300512781e-06, |
| "loss": 0.3321, |
| "mean_token_accuracy": 0.9033790588378906, |
| "num_tokens": 25015708.0, |
| "step": 715 |
| }, |
| { |
| "entropy": 0.386328125, |
| "epoch": 1.130298273155416, |
| "grad_norm": 3.635272069781761, |
| "learning_rate": 9.506756348492348e-06, |
| "loss": 0.3631, |
| "mean_token_accuracy": 0.8966520011425019, |
| "num_tokens": 25189049.0, |
| "step": 720 |
| }, |
| { |
| "entropy": 0.36796875, |
| "epoch": 1.1381475667189953, |
| "grad_norm": 3.905345940736465, |
| "learning_rate": 9.369906063497547e-06, |
| "loss": 0.3171, |
| "mean_token_accuracy": 0.9075049877166748, |
| "num_tokens": 25370100.0, |
| "step": 725 |
| }, |
| { |
| "entropy": 0.3763671875, |
| "epoch": 1.1459968602825745, |
| "grad_norm": 4.368923829495938, |
| "learning_rate": 9.233174155963432e-06, |
| "loss": 0.3491, |
| "mean_token_accuracy": 0.9008313238620758, |
| "num_tokens": 25538013.0, |
| "step": 730 |
| }, |
| { |
| "entropy": 0.3888671875, |
| "epoch": 1.1538461538461537, |
| "grad_norm": 3.95972099627997, |
| "learning_rate": 9.096586314085162e-06, |
| "loss": 0.3509, |
| "mean_token_accuracy": 0.8985212922096253, |
| "num_tokens": 25710812.0, |
| "step": 735 |
| }, |
| { |
| "entropy": 0.3560546875, |
| "epoch": 1.1616954474097332, |
| "grad_norm": 3.9507409203470085, |
| "learning_rate": 8.960168198991885e-06, |
| "loss": 0.3241, |
| "mean_token_accuracy": 0.9068559765815735, |
| "num_tokens": 25878865.0, |
| "step": 740 |
| }, |
| { |
| "entropy": 0.342578125, |
| "epoch": 1.1695447409733124, |
| "grad_norm": 3.7761986277605653, |
| "learning_rate": 8.823945439925725e-06, |
| "loss": 0.3036, |
| "mean_token_accuracy": 0.9105329990386963, |
| "num_tokens": 26053389.0, |
| "step": 745 |
| }, |
| { |
| "entropy": 0.3412109375, |
| "epoch": 1.1773940345368916, |
| "grad_norm": 3.793472691376912, |
| "learning_rate": 8.687943629426725e-06, |
| "loss": 0.3032, |
| "mean_token_accuracy": 0.9119175374507904, |
| "num_tokens": 26237554.0, |
| "step": 750 |
| }, |
| { |
| "entropy": 0.3552734375, |
| "epoch": 1.185243328100471, |
| "grad_norm": 4.031904479069314, |
| "learning_rate": 8.552188318524737e-06, |
| "loss": 0.3305, |
| "mean_token_accuracy": 0.9051981568336487, |
| "num_tokens": 26420657.0, |
| "step": 755 |
| }, |
| { |
| "entropy": 0.3556640625, |
| "epoch": 1.1930926216640503, |
| "grad_norm": 4.18585912610383, |
| "learning_rate": 8.416705011939052e-06, |
| "loss": 0.3203, |
| "mean_token_accuracy": 0.9096585392951966, |
| "num_tokens": 26594423.0, |
| "step": 760 |
| }, |
| { |
| "entropy": 0.36640625, |
| "epoch": 1.2009419152276295, |
| "grad_norm": 3.62287005225727, |
| "learning_rate": 8.281519163286772e-06, |
| "loss": 0.3283, |
| "mean_token_accuracy": 0.9058149755001068, |
| "num_tokens": 26772548.0, |
| "step": 765 |
| }, |
| { |
| "entropy": 0.377734375, |
| "epoch": 1.2087912087912087, |
| "grad_norm": 3.790409760971858, |
| "learning_rate": 8.146656170300772e-06, |
| "loss": 0.3457, |
| "mean_token_accuracy": 0.9028036296367645, |
| "num_tokens": 26938630.0, |
| "step": 770 |
| }, |
| { |
| "entropy": 0.3884765625, |
| "epoch": 1.2166405023547882, |
| "grad_norm": 3.818641932499593, |
| "learning_rate": 8.01214137005815e-06, |
| "loss": 0.3496, |
| "mean_token_accuracy": 0.9000514507293701, |
| "num_tokens": 27111807.0, |
| "step": 775 |
| }, |
| { |
| "entropy": 0.34375, |
| "epoch": 1.2244897959183674, |
| "grad_norm": 3.577131008292094, |
| "learning_rate": 7.878000034220092e-06, |
| "loss": 0.3108, |
| "mean_token_accuracy": 0.9111146748065948, |
| "num_tokens": 27301565.0, |
| "step": 780 |
| }, |
| { |
| "entropy": 0.35859375, |
| "epoch": 1.2323390894819466, |
| "grad_norm": 3.9774086899528927, |
| "learning_rate": 7.74425736428401e-06, |
| "loss": 0.3098, |
| "mean_token_accuracy": 0.9107671201229095, |
| "num_tokens": 27481197.0, |
| "step": 785 |
| }, |
| { |
| "entropy": 0.3740234375, |
| "epoch": 1.2401883830455258, |
| "grad_norm": 3.616653726364246, |
| "learning_rate": 7.6109384868488646e-06, |
| "loss": 0.3445, |
| "mean_token_accuracy": 0.9035484492778778, |
| "num_tokens": 27656624.0, |
| "step": 790 |
| }, |
| { |
| "entropy": 0.3419921875, |
| "epoch": 1.2480376766091053, |
| "grad_norm": 3.6294243587435164, |
| "learning_rate": 7.478068448894577e-06, |
| "loss": 0.2992, |
| "mean_token_accuracy": 0.9126500189304352, |
| "num_tokens": 27825076.0, |
| "step": 795 |
| }, |
| { |
| "entropy": 0.378515625, |
| "epoch": 1.2558869701726845, |
| "grad_norm": 3.5723456408449685, |
| "learning_rate": 7.3456722130763665e-06, |
| "loss": 0.3424, |
| "mean_token_accuracy": 0.9039673745632172, |
| "num_tokens": 28007761.0, |
| "step": 800 |
| }, |
| { |
| "entropy": 0.369140625, |
| "epoch": 1.2637362637362637, |
| "grad_norm": 3.8410419633102793, |
| "learning_rate": 7.213774653034958e-06, |
| "loss": 0.3435, |
| "mean_token_accuracy": 0.9041143357753754, |
| "num_tokens": 28187081.0, |
| "step": 805 |
| }, |
| { |
| "entropy": 0.35234375, |
| "epoch": 1.2715855572998431, |
| "grad_norm": 3.9529644075488384, |
| "learning_rate": 7.082400548723505e-06, |
| "loss": 0.3122, |
| "mean_token_accuracy": 0.9107161283493042, |
| "num_tokens": 28363030.0, |
| "step": 810 |
| }, |
| { |
| "entropy": 0.366796875, |
| "epoch": 1.2794348508634223, |
| "grad_norm": 4.064104450162385, |
| "learning_rate": 6.951574581752111e-06, |
| "loss": 0.3319, |
| "mean_token_accuracy": 0.9035371005535126, |
| "num_tokens": 28535595.0, |
| "step": 815 |
| }, |
| { |
| "entropy": 0.381640625, |
| "epoch": 1.2872841444270016, |
| "grad_norm": 5.9889597321012555, |
| "learning_rate": 6.8213213307508205e-06, |
| "loss": 0.3437, |
| "mean_token_accuracy": 0.9042888820171356, |
| "num_tokens": 28714743.0, |
| "step": 820 |
| }, |
| { |
| "entropy": 0.3158203125, |
| "epoch": 1.2951334379905808, |
| "grad_norm": 3.9581361848534358, |
| "learning_rate": 6.6916652667519855e-06, |
| "loss": 0.3069, |
| "mean_token_accuracy": 0.9155528962612152, |
| "num_tokens": 28889873.0, |
| "step": 825 |
| }, |
| { |
| "entropy": 0.35390625, |
| "epoch": 1.30298273155416, |
| "grad_norm": 3.677250554121152, |
| "learning_rate": 6.562630748592794e-06, |
| "loss": 0.32, |
| "mean_token_accuracy": 0.9077256739139556, |
| "num_tokens": 29059408.0, |
| "step": 830 |
| }, |
| { |
| "entropy": 0.3638671875, |
| "epoch": 1.3108320251177394, |
| "grad_norm": 4.061917185197391, |
| "learning_rate": 6.434242018338948e-06, |
| "loss": 0.3294, |
| "mean_token_accuracy": 0.9073351263999939, |
| "num_tokens": 29236192.0, |
| "step": 835 |
| }, |
| { |
| "entropy": 0.3373046875, |
| "epoch": 1.3186813186813187, |
| "grad_norm": 3.7207684760296558, |
| "learning_rate": 6.3065231967302055e-06, |
| "loss": 0.2991, |
| "mean_token_accuracy": 0.9142134130001068, |
| "num_tokens": 29405817.0, |
| "step": 840 |
| }, |
| { |
| "entropy": 0.33486328125, |
| "epoch": 1.3265306122448979, |
| "grad_norm": 4.167446472870183, |
| "learning_rate": 6.179498278648766e-06, |
| "loss": 0.2989, |
| "mean_token_accuracy": 0.9135854482650757, |
| "num_tokens": 29584863.0, |
| "step": 845 |
| }, |
| { |
| "entropy": 0.3779296875, |
| "epoch": 1.3343799058084773, |
| "grad_norm": 3.6630259747898126, |
| "learning_rate": 6.053191128611298e-06, |
| "loss": 0.3288, |
| "mean_token_accuracy": 0.9065738797187806, |
| "num_tokens": 29760752.0, |
| "step": 850 |
| }, |
| { |
| "entropy": 0.34375, |
| "epoch": 1.3422291993720565, |
| "grad_norm": 3.653325382479137, |
| "learning_rate": 5.927625476285426e-06, |
| "loss": 0.3081, |
| "mean_token_accuracy": 0.910161966085434, |
| "num_tokens": 29933046.0, |
| "step": 855 |
| }, |
| { |
| "entropy": 0.3578125, |
| "epoch": 1.3500784929356358, |
| "grad_norm": 5.149179896152341, |
| "learning_rate": 5.802824912031588e-06, |
| "loss": 0.324, |
| "mean_token_accuracy": 0.9067489266395569, |
| "num_tokens": 30107202.0, |
| "step": 860 |
| }, |
| { |
| "entropy": 0.3896484375, |
| "epoch": 1.3579277864992152, |
| "grad_norm": 4.046787648445982, |
| "learning_rate": 5.678812882471047e-06, |
| "loss": 0.3563, |
| "mean_token_accuracy": 0.8968492567539215, |
| "num_tokens": 30287266.0, |
| "step": 865 |
| }, |
| { |
| "entropy": 0.3703125, |
| "epoch": 1.3657770800627944, |
| "grad_norm": 3.71919627416169, |
| "learning_rate": 5.555612686080909e-06, |
| "loss": 0.3345, |
| "mean_token_accuracy": 0.9046319007873536, |
| "num_tokens": 30454583.0, |
| "step": 870 |
| }, |
| { |
| "entropy": 0.35546875, |
| "epoch": 1.3736263736263736, |
| "grad_norm": 4.812266161183036, |
| "learning_rate": 5.4332474688169766e-06, |
| "loss": 0.3162, |
| "mean_token_accuracy": 0.9111818552017212, |
| "num_tokens": 30646710.0, |
| "step": 875 |
| }, |
| { |
| "entropy": 0.3578125, |
| "epoch": 1.3814756671899528, |
| "grad_norm": 3.470032833070242, |
| "learning_rate": 5.311740219765247e-06, |
| "loss": 0.3304, |
| "mean_token_accuracy": 0.9047977864742279, |
| "num_tokens": 30838566.0, |
| "step": 880 |
| }, |
| { |
| "entropy": 0.3638671875, |
| "epoch": 1.389324960753532, |
| "grad_norm": 4.00242956343701, |
| "learning_rate": 5.191113766822905e-06, |
| "loss": 0.3265, |
| "mean_token_accuracy": 0.9078042209148407, |
| "num_tokens": 31016269.0, |
| "step": 885 |
| }, |
| { |
| "entropy": 0.360546875, |
| "epoch": 1.3971742543171115, |
| "grad_norm": 6.163854036366698, |
| "learning_rate": 5.071390772409579e-06, |
| "loss": 0.3307, |
| "mean_token_accuracy": 0.9069823741912841, |
| "num_tokens": 31193240.0, |
| "step": 890 |
| }, |
| { |
| "entropy": 0.335546875, |
| "epoch": 1.4050235478806907, |
| "grad_norm": 3.7842513748094695, |
| "learning_rate": 4.952593729209671e-06, |
| "loss": 0.3014, |
| "mean_token_accuracy": 0.912151551246643, |
| "num_tokens": 31362873.0, |
| "step": 895 |
| }, |
| { |
| "entropy": 0.3505859375, |
| "epoch": 1.41287284144427, |
| "grad_norm": 3.8252428181776716, |
| "learning_rate": 4.834744955946631e-06, |
| "loss": 0.3171, |
| "mean_token_accuracy": 0.9103285014629364, |
| "num_tokens": 31532789.0, |
| "step": 900 |
| }, |
| { |
| "entropy": 0.3349609375, |
| "epoch": 1.4207221350078494, |
| "grad_norm": 3.8195957170982107, |
| "learning_rate": 4.717866593189847e-06, |
| "loss": 0.3016, |
| "mean_token_accuracy": 0.912840747833252, |
| "num_tokens": 31701494.0, |
| "step": 905 |
| }, |
| { |
| "entropy": 0.37734375, |
| "epoch": 1.4285714285714286, |
| "grad_norm": 3.822109137774168, |
| "learning_rate": 4.60198059919505e-06, |
| "loss": 0.3351, |
| "mean_token_accuracy": 0.9047119557857514, |
| "num_tokens": 31872676.0, |
| "step": 910 |
| }, |
| { |
| "entropy": 0.3310546875, |
| "epoch": 1.4364207221350078, |
| "grad_norm": 3.8366883679239243, |
| "learning_rate": 4.487108745778958e-06, |
| "loss": 0.2812, |
| "mean_token_accuracy": 0.91913822889328, |
| "num_tokens": 32041074.0, |
| "step": 915 |
| }, |
| { |
| "entropy": 0.3435546875, |
| "epoch": 1.4442700156985873, |
| "grad_norm": 3.801981633982777, |
| "learning_rate": 4.373272614228932e-06, |
| "loss": 0.3144, |
| "mean_token_accuracy": 0.9115765929222107, |
| "num_tokens": 32212406.0, |
| "step": 920 |
| }, |
| { |
| "entropy": 0.346484375, |
| "epoch": 1.4521193092621665, |
| "grad_norm": 4.447578921600131, |
| "learning_rate": 4.260493591248458e-06, |
| "loss": 0.306, |
| "mean_token_accuracy": 0.911074984073639, |
| "num_tokens": 32378986.0, |
| "step": 925 |
| }, |
| { |
| "entropy": 0.3501953125, |
| "epoch": 1.4599686028257457, |
| "grad_norm": 4.190227519433511, |
| "learning_rate": 4.148792864939164e-06, |
| "loss": 0.3054, |
| "mean_token_accuracy": 0.9116972923278809, |
| "num_tokens": 32558393.0, |
| "step": 930 |
| }, |
| { |
| "entropy": 0.3251953125, |
| "epoch": 1.467817896389325, |
| "grad_norm": 4.161740647309898, |
| "learning_rate": 4.038191420820139e-06, |
| "loss": 0.2966, |
| "mean_token_accuracy": 0.9156083226203918, |
| "num_tokens": 32728716.0, |
| "step": 935 |
| }, |
| { |
| "entropy": 0.327734375, |
| "epoch": 1.4756671899529041, |
| "grad_norm": 3.6311813451272252, |
| "learning_rate": 3.92871003788535e-06, |
| "loss": 0.288, |
| "mean_token_accuracy": 0.918038284778595, |
| "num_tokens": 32905750.0, |
| "step": 940 |
| }, |
| { |
| "entropy": 0.3388671875, |
| "epoch": 1.4835164835164836, |
| "grad_norm": 3.738059542189779, |
| "learning_rate": 3.820369284699823e-06, |
| "loss": 0.2991, |
| "mean_token_accuracy": 0.9139463484287262, |
| "num_tokens": 33075994.0, |
| "step": 945 |
| }, |
| { |
| "entropy": 0.341796875, |
| "epoch": 1.4913657770800628, |
| "grad_norm": 3.7412909653457738, |
| "learning_rate": 3.713189515535368e-06, |
| "loss": 0.3152, |
| "mean_token_accuracy": 0.9099179923534393, |
| "num_tokens": 33248229.0, |
| "step": 950 |
| }, |
| { |
| "entropy": 0.3126953125, |
| "epoch": 1.499215070643642, |
| "grad_norm": 3.545489585791932, |
| "learning_rate": 3.607190866546578e-06, |
| "loss": 0.2855, |
| "mean_token_accuracy": 0.9172944724559784, |
| "num_tokens": 33422754.0, |
| "step": 955 |
| }, |
| { |
| "entropy": 0.32294921875, |
| "epoch": 1.5070643642072215, |
| "grad_norm": 3.933250466037461, |
| "learning_rate": 3.502393251987776e-06, |
| "loss": 0.2897, |
| "mean_token_accuracy": 0.9173169553279876, |
| "num_tokens": 33592867.0, |
| "step": 960 |
| }, |
| { |
| "entropy": 0.33154296875, |
| "epoch": 1.5149136577708007, |
| "grad_norm": 3.5231854886347347, |
| "learning_rate": 3.3988163604716928e-06, |
| "loss": 0.3022, |
| "mean_token_accuracy": 0.91502086520195, |
| "num_tokens": 33764810.0, |
| "step": 965 |
| }, |
| { |
| "entropy": 0.3322265625, |
| "epoch": 1.5227629513343799, |
| "grad_norm": 3.9047550692209225, |
| "learning_rate": 3.296479651270502e-06, |
| "loss": 0.289, |
| "mean_token_accuracy": 0.9168073177337647, |
| "num_tokens": 33935483.0, |
| "step": 970 |
| }, |
| { |
| "entropy": 0.3173828125, |
| "epoch": 1.5306122448979593, |
| "grad_norm": 3.7792544290205647, |
| "learning_rate": 3.195402350659945e-06, |
| "loss": 0.2933, |
| "mean_token_accuracy": 0.916690468788147, |
| "num_tokens": 34111426.0, |
| "step": 975 |
| }, |
| { |
| "entropy": 0.3521484375, |
| "epoch": 1.5384615384615383, |
| "grad_norm": 3.6732078529565517, |
| "learning_rate": 3.0956034483072573e-06, |
| "loss": 0.3133, |
| "mean_token_accuracy": 0.909661453962326, |
| "num_tokens": 34290003.0, |
| "step": 980 |
| }, |
| { |
| "entropy": 0.3787109375, |
| "epoch": 1.5463108320251178, |
| "grad_norm": 5.190106114131474, |
| "learning_rate": 2.997101693703518e-06, |
| "loss": 0.3385, |
| "mean_token_accuracy": 0.9050460577011108, |
| "num_tokens": 34460879.0, |
| "step": 985 |
| }, |
| { |
| "entropy": 0.2978515625, |
| "epoch": 1.554160125588697, |
| "grad_norm": 3.8928764556590694, |
| "learning_rate": 2.8999155926411203e-06, |
| "loss": 0.2647, |
| "mean_token_accuracy": 0.9240784645080566, |
| "num_tokens": 34645393.0, |
| "step": 990 |
| }, |
| { |
| "entropy": 0.3353515625, |
| "epoch": 1.5620094191522762, |
| "grad_norm": 3.995014435930791, |
| "learning_rate": 2.8040634037370727e-06, |
| "loss": 0.305, |
| "mean_token_accuracy": 0.9108369588851929, |
| "num_tokens": 34819057.0, |
| "step": 995 |
| }, |
| { |
| "entropy": 0.33828125, |
| "epoch": 1.5698587127158556, |
| "grad_norm": 3.5439052316501423, |
| "learning_rate": 2.7095631350026585e-06, |
| "loss": 0.2977, |
| "mean_token_accuracy": 0.9142949402332305, |
| "num_tokens": 34993858.0, |
| "step": 1000 |
| }, |
| { |
| "entropy": 0.3111328125, |
| "epoch": 1.5777080062794349, |
| "grad_norm": 3.4692400046730576, |
| "learning_rate": 2.616432540460255e-06, |
| "loss": 0.2801, |
| "mean_token_accuracy": 0.9186800062656403, |
| "num_tokens": 35176584.0, |
| "step": 1005 |
| }, |
| { |
| "entropy": 0.348828125, |
| "epoch": 1.585557299843014, |
| "grad_norm": 4.36529770249698, |
| "learning_rate": 2.524689116807826e-06, |
| "loss": 0.306, |
| "mean_token_accuracy": 0.9119703054428101, |
| "num_tokens": 35358793.0, |
| "step": 1010 |
| }, |
| { |
| "entropy": 0.3302734375, |
| "epoch": 1.5934065934065935, |
| "grad_norm": 4.023141037723821, |
| "learning_rate": 2.4343501001317604e-06, |
| "loss": 0.296, |
| "mean_token_accuracy": 0.9139317333698272, |
| "num_tokens": 35538704.0, |
| "step": 1015 |
| }, |
| { |
| "entropy": 0.315625, |
| "epoch": 1.6012558869701727, |
| "grad_norm": 3.523923427899709, |
| "learning_rate": 2.345432462668702e-06, |
| "loss": 0.288, |
| "mean_token_accuracy": 0.917642080783844, |
| "num_tokens": 35716046.0, |
| "step": 1020 |
| }, |
| { |
| "entropy": 0.31044921875, |
| "epoch": 1.609105180533752, |
| "grad_norm": 3.644734085017535, |
| "learning_rate": 2.257952909616914e-06, |
| "loss": 0.2666, |
| "mean_token_accuracy": 0.9209400594234467, |
| "num_tokens": 35891208.0, |
| "step": 1025 |
| }, |
| { |
| "entropy": 0.3283203125, |
| "epoch": 1.6169544740973314, |
| "grad_norm": 3.3371550953729354, |
| "learning_rate": 2.1719278759978225e-06, |
| "loss": 0.2935, |
| "mean_token_accuracy": 0.9164785146713257, |
| "num_tokens": 36067493.0, |
| "step": 1030 |
| }, |
| { |
| "entropy": 0.3154296875, |
| "epoch": 1.6248037676609104, |
| "grad_norm": 4.576016784982279, |
| "learning_rate": 2.0873735235683535e-06, |
| "loss": 0.2867, |
| "mean_token_accuracy": 0.919123786687851, |
| "num_tokens": 36242711.0, |
| "step": 1035 |
| }, |
| { |
| "entropy": 0.330078125, |
| "epoch": 1.6326530612244898, |
| "grad_norm": 3.7915992206510256, |
| "learning_rate": 2.004305737784541e-06, |
| "loss": 0.306, |
| "mean_token_accuracy": 0.9124962151050567, |
| "num_tokens": 36414692.0, |
| "step": 1040 |
| }, |
| { |
| "entropy": 0.34404296875, |
| "epoch": 1.640502354788069, |
| "grad_norm": 3.4114214410740784, |
| "learning_rate": 1.922740124817113e-06, |
| "loss": 0.3324, |
| "mean_token_accuracy": 0.9155471563339234, |
| "num_tokens": 36588961.0, |
| "step": 1045 |
| }, |
| { |
| "entropy": 0.32578125, |
| "epoch": 1.6483516483516483, |
| "grad_norm": 3.6594471068690626, |
| "learning_rate": 1.8426920086195065e-06, |
| "loss": 0.2862, |
| "mean_token_accuracy": 0.9176535487174988, |
| "num_tokens": 36769165.0, |
| "step": 1050 |
| }, |
| { |
| "entropy": 0.33359375, |
| "epoch": 1.6562009419152277, |
| "grad_norm": 3.5169016563130806, |
| "learning_rate": 1.7641764280489081e-06, |
| "loss": 0.3011, |
| "mean_token_accuracy": 0.9151181995868682, |
| "num_tokens": 36937514.0, |
| "step": 1055 |
| }, |
| { |
| "entropy": 0.3498046875, |
| "epoch": 1.664050235478807, |
| "grad_norm": 3.7880404375931866, |
| "learning_rate": 1.6872081340408763e-06, |
| "loss": 0.3118, |
| "mean_token_accuracy": 0.9124642968177795, |
| "num_tokens": 37112141.0, |
| "step": 1060 |
| }, |
| { |
| "entropy": 0.3388671875, |
| "epoch": 1.6718995290423861, |
| "grad_norm": 3.698633480496351, |
| "learning_rate": 1.6118015868380387e-06, |
| "loss": 0.303, |
| "mean_token_accuracy": 0.9128694295883178, |
| "num_tokens": 37284315.0, |
| "step": 1065 |
| }, |
| { |
| "entropy": 0.3212890625, |
| "epoch": 1.6797488226059656, |
| "grad_norm": 3.3123418573768357, |
| "learning_rate": 1.5379709532733944e-06, |
| "loss": 0.2808, |
| "mean_token_accuracy": 0.9190841019153595, |
| "num_tokens": 37466201.0, |
| "step": 1070 |
| }, |
| { |
| "entropy": 0.3390625, |
| "epoch": 1.6875981161695446, |
| "grad_norm": 3.593562594089776, |
| "learning_rate": 1.4657301041087812e-06, |
| "loss": 0.3123, |
| "mean_token_accuracy": 0.9126930415630341, |
| "num_tokens": 37643306.0, |
| "step": 1075 |
| }, |
| { |
| "entropy": 0.311328125, |
| "epoch": 1.695447409733124, |
| "grad_norm": 3.3618127002719485, |
| "learning_rate": 1.395092611428902e-06, |
| "loss": 0.2776, |
| "mean_token_accuracy": 0.9204743444919586, |
| "num_tokens": 37818425.0, |
| "step": 1080 |
| }, |
| { |
| "entropy": 0.33984375, |
| "epoch": 1.7032967032967035, |
| "grad_norm": 3.6575930674463466, |
| "learning_rate": 1.3260717460915296e-06, |
| "loss": 0.3101, |
| "mean_token_accuracy": 0.9130974769592285, |
| "num_tokens": 37992256.0, |
| "step": 1085 |
| }, |
| { |
| "entropy": 0.326953125, |
| "epoch": 1.7111459968602825, |
| "grad_norm": 3.5686085325723558, |
| "learning_rate": 1.2586804752342596e-06, |
| "loss": 0.2819, |
| "mean_token_accuracy": 0.9184079587459564, |
| "num_tokens": 38169078.0, |
| "step": 1090 |
| }, |
| { |
| "entropy": 0.30869140625, |
| "epoch": 1.718995290423862, |
| "grad_norm": 3.520061945368019, |
| "learning_rate": 1.1929314598383423e-06, |
| "loss": 0.2821, |
| "mean_token_accuracy": 0.917450076341629, |
| "num_tokens": 38356668.0, |
| "step": 1095 |
| }, |
| { |
| "entropy": 0.306640625, |
| "epoch": 1.7268445839874411, |
| "grad_norm": 3.9895743717145584, |
| "learning_rate": 1.1288370523500303e-06, |
| "loss": 0.2636, |
| "mean_token_accuracy": 0.9235479176044464, |
| "num_tokens": 38523569.0, |
| "step": 1100 |
| }, |
| { |
| "entropy": 0.3361328125, |
| "epoch": 1.7346938775510203, |
| "grad_norm": 3.776146789812367, |
| "learning_rate": 1.0664092943598936e-06, |
| "loss": 0.3036, |
| "mean_token_accuracy": 0.9140356600284576, |
| "num_tokens": 38694147.0, |
| "step": 1105 |
| }, |
| { |
| "entropy": 0.316015625, |
| "epoch": 1.7425431711145998, |
| "grad_norm": 3.4444521360956735, |
| "learning_rate": 1.0056599143405244e-06, |
| "loss": 0.2799, |
| "mean_token_accuracy": 0.9198241591453552, |
| "num_tokens": 38867562.0, |
| "step": 1110 |
| }, |
| { |
| "entropy": 0.3056640625, |
| "epoch": 1.750392464678179, |
| "grad_norm": 3.478490953949651, |
| "learning_rate": 9.466003254430933e-07, |
| "loss": 0.2735, |
| "mean_token_accuracy": 0.9212567389011384, |
| "num_tokens": 39034512.0, |
| "step": 1115 |
| }, |
| { |
| "entropy": 0.32734375, |
| "epoch": 1.7582417582417582, |
| "grad_norm": 3.5207204633120157, |
| "learning_rate": 8.892416233531064e-07, |
| "loss": 0.2837, |
| "mean_token_accuracy": 0.9190377771854401, |
| "num_tokens": 39203376.0, |
| "step": 1120 |
| }, |
| { |
| "entropy": 0.33671875, |
| "epoch": 1.7660910518053377, |
| "grad_norm": 3.396775939311065, |
| "learning_rate": 8.335945842058524e-07, |
| "loss": 0.2894, |
| "mean_token_accuracy": 0.917039567232132, |
| "num_tokens": 39391730.0, |
| "step": 1125 |
| }, |
| { |
| "entropy": 0.31611328125, |
| "epoch": 1.7739403453689166, |
| "grad_norm": 3.633075296397874, |
| "learning_rate": 7.79669662561845e-07, |
| "loss": 0.2834, |
| "mean_token_accuracy": 0.9175218880176544, |
| "num_tokens": 39559203.0, |
| "step": 1130 |
| }, |
| { |
| "entropy": 0.3142578125, |
| "epoch": 1.781789638932496, |
| "grad_norm": 3.693691007663445, |
| "learning_rate": 7.274769894426992e-07, |
| "loss": 0.2841, |
| "mean_token_accuracy": 0.9189928412437439, |
| "num_tokens": 39727754.0, |
| "step": 1135 |
| }, |
| { |
| "entropy": 0.3185546875, |
| "epoch": 1.7896389324960753, |
| "grad_norm": 3.3504527473164267, |
| "learning_rate": 6.770263704277958e-07, |
| "loss": 0.2838, |
| "mean_token_accuracy": 0.9185858130455017, |
| "num_tokens": 39911621.0, |
| "step": 1140 |
| }, |
| { |
| "entropy": 0.3326171875, |
| "epoch": 1.7974882260596545, |
| "grad_norm": 4.048970301070765, |
| "learning_rate": 6.283272838120747e-07, |
| "loss": 0.2917, |
| "mean_token_accuracy": 0.9172453165054322, |
| "num_tokens": 40080024.0, |
| "step": 1145 |
| }, |
| { |
| "entropy": 0.3310546875, |
| "epoch": 1.805337519623234, |
| "grad_norm": 4.157221708254746, |
| "learning_rate": 5.813888788253153e-07, |
| "loss": 0.2858, |
| "mean_token_accuracy": 0.9179641664028168, |
| "num_tokens": 40259404.0, |
| "step": 1150 |
| }, |
| { |
| "entropy": 0.3177734375, |
| "epoch": 1.8131868131868132, |
| "grad_norm": 3.7177900235708328, |
| "learning_rate": 5.362199739132656e-07, |
| "loss": 0.2813, |
| "mean_token_accuracy": 0.9183613300323487, |
| "num_tokens": 40435057.0, |
| "step": 1155 |
| }, |
| { |
| "entropy": 0.36015625, |
| "epoch": 1.8210361067503924, |
| "grad_norm": 3.7466301147455314, |
| "learning_rate": 4.928290550808734e-07, |
| "loss": 0.3155, |
| "mean_token_accuracy": 0.9126826763153076, |
| "num_tokens": 40617015.0, |
| "step": 1160 |
| }, |
| { |
| "entropy": 0.337890625, |
| "epoch": 1.8288854003139718, |
| "grad_norm": 3.714640014839032, |
| "learning_rate": 4.512242742980155e-07, |
| "loss": 0.2992, |
| "mean_token_accuracy": 0.914109718799591, |
| "num_tokens": 40785015.0, |
| "step": 1165 |
| }, |
| { |
| "entropy": 0.30908203125, |
| "epoch": 1.836734693877551, |
| "grad_norm": 3.510391275134882, |
| "learning_rate": 4.114134479679543e-07, |
| "loss": 0.2734, |
| "mean_token_accuracy": 0.9234184563159943, |
| "num_tokens": 40977530.0, |
| "step": 1170 |
| }, |
| { |
| "entropy": 0.318359375, |
| "epoch": 1.8445839874411303, |
| "grad_norm": 3.6233154389122, |
| "learning_rate": 3.734040554588514e-07, |
| "loss": 0.2828, |
| "mean_token_accuracy": 0.9189339816570282, |
| "num_tokens": 41153483.0, |
| "step": 1175 |
| }, |
| { |
| "entropy": 0.3365234375, |
| "epoch": 1.8524332810047097, |
| "grad_norm": 3.993058357311181, |
| "learning_rate": 3.372032376986034e-07, |
| "loss": 0.3026, |
| "mean_token_accuracy": 0.9142104327678681, |
| "num_tokens": 41332210.0, |
| "step": 1180 |
| }, |
| { |
| "entropy": 0.39375, |
| "epoch": 1.8602825745682887, |
| "grad_norm": 4.328973024123323, |
| "learning_rate": 3.028177958332512e-07, |
| "loss": 0.3665, |
| "mean_token_accuracy": 0.8986062467098236, |
| "num_tokens": 41507869.0, |
| "step": 1185 |
| }, |
| { |
| "entropy": 0.34140625, |
| "epoch": 1.8681318681318682, |
| "grad_norm": 3.2104548151581347, |
| "learning_rate": 2.7025418994922835e-07, |
| "loss": 0.3053, |
| "mean_token_accuracy": 0.9121370255947113, |
| "num_tokens": 41690709.0, |
| "step": 1190 |
| }, |
| { |
| "entropy": 0.31875, |
| "epoch": 1.8759811616954474, |
| "grad_norm": 3.7412102895291426, |
| "learning_rate": 2.3951853785969535e-07, |
| "loss": 0.2721, |
| "mean_token_accuracy": 0.9221031606197357, |
| "num_tokens": 41863334.0, |
| "step": 1195 |
| }, |
| { |
| "entropy": 0.3087890625, |
| "epoch": 1.8838304552590266, |
| "grad_norm": 3.280442340689253, |
| "learning_rate": 2.106166139551602e-07, |
| "loss": 0.2706, |
| "mean_token_accuracy": 0.9225331664085388, |
| "num_tokens": 42034420.0, |
| "step": 1200 |
| }, |
| { |
| "entropy": 0.3197265625, |
| "epoch": 1.891679748822606, |
| "grad_norm": 4.156766829239504, |
| "learning_rate": 1.8355384811863274e-07, |
| "loss": 0.2933, |
| "mean_token_accuracy": 0.9180307269096375, |
| "num_tokens": 42205691.0, |
| "step": 1205 |
| }, |
| { |
| "entropy": 0.3162109375, |
| "epoch": 1.8995290423861853, |
| "grad_norm": 3.7746269581832683, |
| "learning_rate": 1.5833532470549862e-07, |
| "loss": 0.2794, |
| "mean_token_accuracy": 0.9198680520057678, |
| "num_tokens": 42379079.0, |
| "step": 1210 |
| }, |
| { |
| "entropy": 0.313671875, |
| "epoch": 1.9073783359497645, |
| "grad_norm": 3.6980881739964797, |
| "learning_rate": 1.349657815883032e-07, |
| "loss": 0.2771, |
| "mean_token_accuracy": 0.9206208467483521, |
| "num_tokens": 42561550.0, |
| "step": 1215 |
| }, |
| { |
| "entropy": 0.3236328125, |
| "epoch": 1.915227629513344, |
| "grad_norm": 3.6057672291053167, |
| "learning_rate": 1.134496092666415e-07, |
| "loss": 0.2907, |
| "mean_token_accuracy": 0.9175450682640076, |
| "num_tokens": 42733313.0, |
| "step": 1220 |
| }, |
| { |
| "entropy": 0.326171875, |
| "epoch": 1.9230769230769231, |
| "grad_norm": 4.7667736940463055, |
| "learning_rate": 9.379085004229571e-08, |
| "loss": 0.2992, |
| "mean_token_accuracy": 0.916048800945282, |
| "num_tokens": 42920454.0, |
| "step": 1225 |
| }, |
| { |
| "entropy": 0.3322265625, |
| "epoch": 1.9309262166405023, |
| "grad_norm": 3.326013461136122, |
| "learning_rate": 7.599319725980047e-08, |
| "loss": 0.2969, |
| "mean_token_accuracy": 0.9171286225318909, |
| "num_tokens": 43103243.0, |
| "step": 1230 |
| }, |
| { |
| "entropy": 0.319921875, |
| "epoch": 1.9387755102040818, |
| "grad_norm": 3.78897614241427, |
| "learning_rate": 6.005999461256684e-08, |
| "loss": 0.2754, |
| "mean_token_accuracy": 0.9204862177371979, |
| "num_tokens": 43268888.0, |
| "step": 1235 |
| }, |
| { |
| "entropy": 0.2892578125, |
| "epoch": 1.9466248037676608, |
| "grad_norm": 2.9548706636867004, |
| "learning_rate": 4.599423551468807e-08, |
| "loss": 0.2507, |
| "mean_token_accuracy": 0.9277023196220398, |
| "num_tokens": 43458666.0, |
| "step": 1240 |
| }, |
| { |
| "entropy": 0.32265625, |
| "epoch": 1.9544740973312402, |
| "grad_norm": 3.410711053990863, |
| "learning_rate": 3.379856253855951e-08, |
| "loss": 0.2941, |
| "mean_token_accuracy": 0.916301691532135, |
| "num_tokens": 43643099.0, |
| "step": 1245 |
| }, |
| { |
| "entropy": 0.353515625, |
| "epoch": 1.9623233908948194, |
| "grad_norm": 3.5072404677907123, |
| "learning_rate": 2.347526691841906e-08, |
| "loss": 0.3051, |
| "mean_token_accuracy": 0.911671257019043, |
| "num_tokens": 43824484.0, |
| "step": 1250 |
| }, |
| { |
| "entropy": 0.309375, |
| "epoch": 1.9701726844583987, |
| "grad_norm": 3.7842286783508015, |
| "learning_rate": 1.5026288119874833e-08, |
| "loss": 0.2791, |
| "mean_token_accuracy": 0.9217668533325195, |
| "num_tokens": 43998806.0, |
| "step": 1255 |
| }, |
| { |
| "entropy": 0.3134765625, |
| "epoch": 1.978021978021978, |
| "grad_norm": 3.5676523086191914, |
| "learning_rate": 8.453213475543287e-09, |
| "loss": 0.2865, |
| "mean_token_accuracy": 0.9178596436977386, |
| "num_tokens": 44188721.0, |
| "step": 1260 |
| }, |
| { |
| "entropy": 0.3376953125, |
| "epoch": 1.9858712715855573, |
| "grad_norm": 3.569512799557757, |
| "learning_rate": 3.757277886824451e-09, |
| "loss": 0.2936, |
| "mean_token_accuracy": 0.917294180393219, |
| "num_tokens": 44350631.0, |
| "step": 1265 |
| }, |
| { |
| "entropy": 0.29462890625, |
| "epoch": 1.9937205651491365, |
| "grad_norm": 3.519511879531829, |
| "learning_rate": 9.393635919041632e-10, |
| "loss": 0.2592, |
| "mean_token_accuracy": 0.9252937197685241, |
| "num_tokens": 44526708.0, |
| "step": 1270 |
| }, |
| { |
| "entropy": 0.320068359375, |
| "epoch": 2.0, |
| "mean_token_accuracy": 0.9173652082681656, |
| "num_tokens": 44672115.0, |
| "step": 1274, |
| "total_flos": 152753014702080.0, |
| "train_loss": 0.5267494187998809, |
| "train_runtime": 3760.0762, |
| "train_samples_per_second": 21.678, |
| "train_steps_per_second": 0.339 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 1274, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 152753014702080.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|