zephyr-7b-sft-iter2 / trainer_state.json
billxbf's picture
Model save
70ebca7 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 1274,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 0.5458984375,
"epoch": 0.007849293563579277,
"grad_norm": 4.360797014923199,
"learning_rate": 6.25e-07,
"loss": 0.6017,
"mean_token_accuracy": 0.8533750414848328,
"num_tokens": 184600.0,
"step": 5
},
{
"entropy": 0.624609375,
"epoch": 0.015698587127158554,
"grad_norm": 4.173989171952699,
"learning_rate": 1.40625e-06,
"loss": 0.6725,
"mean_token_accuracy": 0.8375870227813721,
"num_tokens": 364953.0,
"step": 10
},
{
"entropy": 0.65390625,
"epoch": 0.023547880690737835,
"grad_norm": 4.134126723754578,
"learning_rate": 2.1875000000000002e-06,
"loss": 0.6032,
"mean_token_accuracy": 0.8524928987026215,
"num_tokens": 549934.0,
"step": 15
},
{
"entropy": 0.65234375,
"epoch": 0.03139717425431711,
"grad_norm": 4.553331995001466,
"learning_rate": 2.96875e-06,
"loss": 0.6252,
"mean_token_accuracy": 0.8429424941539765,
"num_tokens": 729587.0,
"step": 20
},
{
"entropy": 0.5826171875,
"epoch": 0.03924646781789639,
"grad_norm": 3.86036196792769,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.5924,
"mean_token_accuracy": 0.8477238178253174,
"num_tokens": 902536.0,
"step": 25
},
{
"entropy": 0.641015625,
"epoch": 0.04709576138147567,
"grad_norm": 4.179091616309323,
"learning_rate": 4.53125e-06,
"loss": 0.6158,
"mean_token_accuracy": 0.8436026930809021,
"num_tokens": 1085970.0,
"step": 30
},
{
"entropy": 0.6009765625,
"epoch": 0.054945054945054944,
"grad_norm": 3.4664489534719136,
"learning_rate": 5.3125e-06,
"loss": 0.5899,
"mean_token_accuracy": 0.8470608413219451,
"num_tokens": 1262961.0,
"step": 35
},
{
"entropy": 0.59296875,
"epoch": 0.06279434850863422,
"grad_norm": 4.61485579559788,
"learning_rate": 6.093750000000001e-06,
"loss": 0.5756,
"mean_token_accuracy": 0.852037787437439,
"num_tokens": 1434864.0,
"step": 40
},
{
"entropy": 0.6490234375,
"epoch": 0.0706436420722135,
"grad_norm": 3.76615983960646,
"learning_rate": 6.875e-06,
"loss": 0.6202,
"mean_token_accuracy": 0.8431805431842804,
"num_tokens": 1613822.0,
"step": 45
},
{
"entropy": 0.6232421875,
"epoch": 0.07849293563579278,
"grad_norm": 4.323268612794359,
"learning_rate": 7.656250000000001e-06,
"loss": 0.6209,
"mean_token_accuracy": 0.8405247449874877,
"num_tokens": 1789186.0,
"step": 50
},
{
"entropy": 0.6388671875,
"epoch": 0.08634222919937205,
"grad_norm": 3.652593030026506,
"learning_rate": 8.4375e-06,
"loss": 0.613,
"mean_token_accuracy": 0.8430534422397613,
"num_tokens": 1969254.0,
"step": 55
},
{
"entropy": 0.6021484375,
"epoch": 0.09419152276295134,
"grad_norm": 3.9563909364035212,
"learning_rate": 9.21875e-06,
"loss": 0.5886,
"mean_token_accuracy": 0.8484231889247894,
"num_tokens": 2149465.0,
"step": 60
},
{
"entropy": 0.6640625,
"epoch": 0.10204081632653061,
"grad_norm": 4.647691513931947,
"learning_rate": 1e-05,
"loss": 0.6597,
"mean_token_accuracy": 0.8311540305614471,
"num_tokens": 2316427.0,
"step": 65
},
{
"entropy": 0.6666015625,
"epoch": 0.10989010989010989,
"grad_norm": 3.860853295594871,
"learning_rate": 1.0781250000000001e-05,
"loss": 0.6299,
"mean_token_accuracy": 0.8398839890956878,
"num_tokens": 2476047.0,
"step": 70
},
{
"entropy": 0.6080078125,
"epoch": 0.11773940345368916,
"grad_norm": 4.134599413536603,
"learning_rate": 1.1562500000000002e-05,
"loss": 0.6148,
"mean_token_accuracy": 0.8433849811553955,
"num_tokens": 2666514.0,
"step": 75
},
{
"entropy": 0.66953125,
"epoch": 0.12558869701726844,
"grad_norm": 4.393935267584192,
"learning_rate": 1.234375e-05,
"loss": 0.655,
"mean_token_accuracy": 0.8323938012123108,
"num_tokens": 2845307.0,
"step": 80
},
{
"entropy": 0.6310546875,
"epoch": 0.13343799058084774,
"grad_norm": 3.997712311552316,
"learning_rate": 1.3125e-05,
"loss": 0.6027,
"mean_token_accuracy": 0.8435549437999725,
"num_tokens": 3023624.0,
"step": 85
},
{
"entropy": 0.726953125,
"epoch": 0.141287284144427,
"grad_norm": 4.8386502206763815,
"learning_rate": 1.3906250000000001e-05,
"loss": 0.7214,
"mean_token_accuracy": 0.8174331367015839,
"num_tokens": 3195544.0,
"step": 90
},
{
"entropy": 0.70546875,
"epoch": 0.14913657770800628,
"grad_norm": 4.7174464913779195,
"learning_rate": 1.4687500000000001e-05,
"loss": 0.6952,
"mean_token_accuracy": 0.8213136374950409,
"num_tokens": 3361327.0,
"step": 95
},
{
"entropy": 0.659765625,
"epoch": 0.15698587127158556,
"grad_norm": 4.517079736996241,
"learning_rate": 1.546875e-05,
"loss": 0.6254,
"mean_token_accuracy": 0.8344886541366577,
"num_tokens": 3528717.0,
"step": 100
},
{
"entropy": 0.628515625,
"epoch": 0.16483516483516483,
"grad_norm": 4.025576117988761,
"learning_rate": 1.6250000000000002e-05,
"loss": 0.6156,
"mean_token_accuracy": 0.8382946670055389,
"num_tokens": 3711033.0,
"step": 105
},
{
"entropy": 0.706640625,
"epoch": 0.1726844583987441,
"grad_norm": 4.39476825016141,
"learning_rate": 1.703125e-05,
"loss": 0.6876,
"mean_token_accuracy": 0.8211474418640137,
"num_tokens": 3896865.0,
"step": 110
},
{
"entropy": 0.73046875,
"epoch": 0.18053375196232338,
"grad_norm": 4.5449040365136275,
"learning_rate": 1.7812500000000003e-05,
"loss": 0.706,
"mean_token_accuracy": 0.8200587868690491,
"num_tokens": 4064399.0,
"step": 115
},
{
"entropy": 0.798828125,
"epoch": 0.18838304552590268,
"grad_norm": 4.7172406953905055,
"learning_rate": 1.859375e-05,
"loss": 0.7724,
"mean_token_accuracy": 0.8062412679195404,
"num_tokens": 4250087.0,
"step": 120
},
{
"entropy": 0.7640625,
"epoch": 0.19623233908948196,
"grad_norm": 4.334898424571733,
"learning_rate": 1.9375e-05,
"loss": 0.7589,
"mean_token_accuracy": 0.8066916942596436,
"num_tokens": 4424556.0,
"step": 125
},
{
"entropy": 0.66796875,
"epoch": 0.20408163265306123,
"grad_norm": 4.117466362089365,
"learning_rate": 1.999996242489157e-05,
"loss": 0.6492,
"mean_token_accuracy": 0.8317610502243042,
"num_tokens": 4602319.0,
"step": 130
},
{
"entropy": 0.6859375,
"epoch": 0.2119309262166405,
"grad_norm": 4.969581124334222,
"learning_rate": 1.9998647325745995e-05,
"loss": 0.6877,
"mean_token_accuracy": 0.8207987725734711,
"num_tokens": 4781504.0,
"step": 135
},
{
"entropy": 0.72421875,
"epoch": 0.21978021978021978,
"grad_norm": 4.719028124484732,
"learning_rate": 1.9995453753547198e-05,
"loss": 0.7159,
"mean_token_accuracy": 0.8126244425773621,
"num_tokens": 4956176.0,
"step": 140
},
{
"entropy": 0.676171875,
"epoch": 0.22762951334379905,
"grad_norm": 8.762448889791337,
"learning_rate": 1.9990382308280272e-05,
"loss": 0.6673,
"mean_token_accuracy": 0.8245218932628632,
"num_tokens": 5128454.0,
"step": 145
},
{
"entropy": 0.75859375,
"epoch": 0.23547880690737832,
"grad_norm": 4.8961918895212,
"learning_rate": 1.9983433942731427e-05,
"loss": 0.7384,
"mean_token_accuracy": 0.8101492524147034,
"num_tokens": 5318384.0,
"step": 150
},
{
"entropy": 0.760546875,
"epoch": 0.24332810047095763,
"grad_norm": 5.258590906515805,
"learning_rate": 1.9974609962308986e-05,
"loss": 0.7455,
"mean_token_accuracy": 0.805279117822647,
"num_tokens": 5486676.0,
"step": 155
},
{
"entropy": 0.758203125,
"epoch": 0.25117739403453687,
"grad_norm": 4.883940068899578,
"learning_rate": 1.9963912024798136e-05,
"loss": 0.735,
"mean_token_accuracy": 0.8099753499031067,
"num_tokens": 5654901.0,
"step": 160
},
{
"entropy": 0.71796875,
"epoch": 0.25902668759811615,
"grad_norm": 5.168092897390165,
"learning_rate": 1.9951342140049483e-05,
"loss": 0.6834,
"mean_token_accuracy": 0.8199768126010895,
"num_tokens": 5836491.0,
"step": 165
},
{
"entropy": 0.75078125,
"epoch": 0.2668759811616955,
"grad_norm": 5.194891788462451,
"learning_rate": 1.9936902669601436e-05,
"loss": 0.7382,
"mean_token_accuracy": 0.8087680697441101,
"num_tokens": 6016945.0,
"step": 170
},
{
"entropy": 0.7578125,
"epoch": 0.27472527472527475,
"grad_norm": 4.272844578361102,
"learning_rate": 1.992059632623657e-05,
"loss": 0.7216,
"mean_token_accuracy": 0.810324364900589,
"num_tokens": 6211712.0,
"step": 175
},
{
"entropy": 0.7265625,
"epoch": 0.282574568288854,
"grad_norm": 5.177357949949477,
"learning_rate": 1.9902426173471933e-05,
"loss": 0.723,
"mean_token_accuracy": 0.8130602538585663,
"num_tokens": 6391206.0,
"step": 180
},
{
"entropy": 0.690625,
"epoch": 0.2904238618524333,
"grad_norm": 4.3708505660541634,
"learning_rate": 1.9882395624983522e-05,
"loss": 0.6693,
"mean_token_accuracy": 0.8209006905555725,
"num_tokens": 6568364.0,
"step": 185
},
{
"entropy": 0.76484375,
"epoch": 0.29827315541601257,
"grad_norm": 4.5630509683508835,
"learning_rate": 1.986050844396493e-05,
"loss": 0.737,
"mean_token_accuracy": 0.8066314697265625,
"num_tokens": 6743609.0,
"step": 190
},
{
"entropy": 0.79609375,
"epoch": 0.30612244897959184,
"grad_norm": 4.704418610612491,
"learning_rate": 1.9836768742420355e-05,
"loss": 0.7801,
"mean_token_accuracy": 0.796642541885376,
"num_tokens": 6905826.0,
"step": 195
},
{
"entropy": 0.830859375,
"epoch": 0.3139717425431711,
"grad_norm": 7.369608742272206,
"learning_rate": 1.9811180980392054e-05,
"loss": 0.8205,
"mean_token_accuracy": 0.7892749607563019,
"num_tokens": 7082387.0,
"step": 200
},
{
"entropy": 0.808203125,
"epoch": 0.3218210361067504,
"grad_norm": 6.33690320523207,
"learning_rate": 1.9783749965122444e-05,
"loss": 0.8285,
"mean_token_accuracy": 0.7886267483234406,
"num_tokens": 7266538.0,
"step": 205
},
{
"entropy": 0.805078125,
"epoch": 0.32967032967032966,
"grad_norm": 5.4501950578656695,
"learning_rate": 1.975448085015093e-05,
"loss": 0.7669,
"mean_token_accuracy": 0.8004365921020508,
"num_tokens": 7441353.0,
"step": 210
},
{
"entropy": 0.8328125,
"epoch": 0.33751962323390894,
"grad_norm": 5.481262275844823,
"learning_rate": 1.9723379134345698e-05,
"loss": 0.7928,
"mean_token_accuracy": 0.7955616354942322,
"num_tokens": 7610090.0,
"step": 215
},
{
"entropy": 0.6796875,
"epoch": 0.3453689167974882,
"grad_norm": 14.03469496366746,
"learning_rate": 1.9690450660870657e-05,
"loss": 0.7119,
"mean_token_accuracy": 0.8117523312568664,
"num_tokens": 7785150.0,
"step": 220
},
{
"entropy": 0.837109375,
"epoch": 0.3532182103610675,
"grad_norm": 21.481304681046133,
"learning_rate": 1.965570161608762e-05,
"loss": 0.7928,
"mean_token_accuracy": 0.796742171049118,
"num_tokens": 7947571.0,
"step": 225
},
{
"entropy": 0.694921875,
"epoch": 0.36106750392464676,
"grad_norm": 4.843158336605314,
"learning_rate": 1.961913852839409e-05,
"loss": 0.7393,
"mean_token_accuracy": 0.8110623478889465,
"num_tokens": 8121912.0,
"step": 230
},
{
"entropy": 0.84140625,
"epoch": 0.36891679748822603,
"grad_norm": 4.852685356865867,
"learning_rate": 1.958076826699676e-05,
"loss": 0.8183,
"mean_token_accuracy": 0.7912623822689057,
"num_tokens": 8298107.0,
"step": 235
},
{
"entropy": 0.7515625,
"epoch": 0.37676609105180536,
"grad_norm": 4.73767538351422,
"learning_rate": 1.954059804062092e-05,
"loss": 0.7296,
"mean_token_accuracy": 0.809317535161972,
"num_tokens": 8474685.0,
"step": 240
},
{
"entropy": 0.759765625,
"epoch": 0.38461538461538464,
"grad_norm": 4.736881234284725,
"learning_rate": 1.9498635396156217e-05,
"loss": 0.7585,
"mean_token_accuracy": 0.8065651416778564,
"num_tokens": 8646326.0,
"step": 245
},
{
"entropy": 0.790625,
"epoch": 0.3924646781789639,
"grad_norm": 6.139673216563283,
"learning_rate": 1.945488821723873e-05,
"loss": 0.8023,
"mean_token_accuracy": 0.7948877096176148,
"num_tokens": 8829018.0,
"step": 250
},
{
"entropy": 0.77734375,
"epoch": 0.4003139717425432,
"grad_norm": 4.928971267663469,
"learning_rate": 1.9409364722769882e-05,
"loss": 0.7675,
"mean_token_accuracy": 0.7988379955291748,
"num_tokens": 9005749.0,
"step": 255
},
{
"entropy": 0.776171875,
"epoch": 0.40816326530612246,
"grad_norm": 5.40486354385506,
"learning_rate": 1.936207346537233e-05,
"loss": 0.7616,
"mean_token_accuracy": 0.8005188524723053,
"num_tokens": 9183456.0,
"step": 260
},
{
"entropy": 0.7015625,
"epoch": 0.41601255886970173,
"grad_norm": 4.788382627075379,
"learning_rate": 1.931302332978316e-05,
"loss": 0.6827,
"mean_token_accuracy": 0.8215398371219635,
"num_tokens": 9366675.0,
"step": 265
},
{
"entropy": 0.812109375,
"epoch": 0.423861852433281,
"grad_norm": 4.627337335063183,
"learning_rate": 1.9262223531184678e-05,
"loss": 0.8108,
"mean_token_accuracy": 0.7931458294391632,
"num_tokens": 9537586.0,
"step": 270
},
{
"entropy": 0.833984375,
"epoch": 0.4317111459968603,
"grad_norm": 5.563859895314659,
"learning_rate": 1.9209683613473143e-05,
"loss": 0.849,
"mean_token_accuracy": 0.7871452808380127,
"num_tokens": 9706784.0,
"step": 275
},
{
"entropy": 0.767578125,
"epoch": 0.43956043956043955,
"grad_norm": 4.599264719197712,
"learning_rate": 1.9155413447465715e-05,
"loss": 0.752,
"mean_token_accuracy": 0.8034415602684021,
"num_tokens": 9875538.0,
"step": 280
},
{
"entropy": 0.809765625,
"epoch": 0.4474097331240188,
"grad_norm": 5.321937764249497,
"learning_rate": 1.9099423229046015e-05,
"loss": 0.7833,
"mean_token_accuracy": 0.7978603601455688,
"num_tokens": 10044475.0,
"step": 285
},
{
"entropy": 0.8265625,
"epoch": 0.4552590266875981,
"grad_norm": 5.63723949982722,
"learning_rate": 1.9041723477248575e-05,
"loss": 0.8204,
"mean_token_accuracy": 0.7882011353969574,
"num_tokens": 10222319.0,
"step": 290
},
{
"entropy": 0.776953125,
"epoch": 0.4631083202511774,
"grad_norm": 4.6603074318506,
"learning_rate": 1.8982325032282616e-05,
"loss": 0.7813,
"mean_token_accuracy": 0.7975976228713989,
"num_tokens": 10399979.0,
"step": 295
},
{
"entropy": 0.832421875,
"epoch": 0.47095761381475665,
"grad_norm": 11.534881035486686,
"learning_rate": 1.8921239053495465e-05,
"loss": 0.807,
"mean_token_accuracy": 0.7901058197021484,
"num_tokens": 10572020.0,
"step": 300
},
{
"entropy": 0.78671875,
"epoch": 0.478806907378336,
"grad_norm": 4.825964771710648,
"learning_rate": 1.8858477017276002e-05,
"loss": 0.7848,
"mean_token_accuracy": 0.7961919009685516,
"num_tokens": 10747199.0,
"step": 305
},
{
"entropy": 0.815625,
"epoch": 0.48665620094191525,
"grad_norm": 29.937628894998547,
"learning_rate": 1.8794050714898596e-05,
"loss": 0.7982,
"mean_token_accuracy": 0.7925647079944611,
"num_tokens": 10924508.0,
"step": 310
},
{
"entropy": 0.801953125,
"epoch": 0.4945054945054945,
"grad_norm": 5.031515759113725,
"learning_rate": 1.87279722503078e-05,
"loss": 0.7959,
"mean_token_accuracy": 0.7924800992012024,
"num_tokens": 11097529.0,
"step": 315
},
{
"entropy": 0.7953125,
"epoch": 0.5023547880690737,
"grad_norm": 5.5920861837911335,
"learning_rate": 1.866025403784439e-05,
"loss": 0.7869,
"mean_token_accuracy": 0.7924351811408996,
"num_tokens": 11278748.0,
"step": 320
},
{
"entropy": 0.773046875,
"epoch": 0.5102040816326531,
"grad_norm": 4.767882154179232,
"learning_rate": 1.859090879991302e-05,
"loss": 0.7659,
"mean_token_accuracy": 0.7989370346069335,
"num_tokens": 11456413.0,
"step": 325
},
{
"entropy": 0.83359375,
"epoch": 0.5180533751962323,
"grad_norm": 4.615403763103987,
"learning_rate": 1.8519949564592047e-05,
"loss": 0.8124,
"mean_token_accuracy": 0.7914808392524719,
"num_tokens": 11625785.0,
"step": 330
},
{
"entropy": 0.809765625,
"epoch": 0.5259026687598116,
"grad_norm": 4.106320362967877,
"learning_rate": 1.8447389663185905e-05,
"loss": 0.8026,
"mean_token_accuracy": 0.7902366161346436,
"num_tokens": 11812125.0,
"step": 335
},
{
"entropy": 0.77109375,
"epoch": 0.533751962323391,
"grad_norm": 5.182177337614667,
"learning_rate": 1.837324272772052e-05,
"loss": 0.7623,
"mean_token_accuracy": 0.8023563742637634,
"num_tokens": 12001790.0,
"step": 340
},
{
"entropy": 0.79921875,
"epoch": 0.5416012558869702,
"grad_norm": 5.040875278584284,
"learning_rate": 1.829752268838222e-05,
"loss": 0.7811,
"mean_token_accuracy": 0.7928344547748566,
"num_tokens": 12179297.0,
"step": 345
},
{
"entropy": 0.79453125,
"epoch": 0.5494505494505495,
"grad_norm": 8.20904916308416,
"learning_rate": 1.8220243770900623e-05,
"loss": 0.7627,
"mean_token_accuracy": 0.8005800724029541,
"num_tokens": 12353848.0,
"step": 350
},
{
"entropy": 0.777734375,
"epoch": 0.5572998430141287,
"grad_norm": 4.8018626718228985,
"learning_rate": 1.8141420493876035e-05,
"loss": 0.7752,
"mean_token_accuracy": 0.7982768774032593,
"num_tokens": 12531024.0,
"step": 355
},
{
"entropy": 0.765625,
"epoch": 0.565149136577708,
"grad_norm": 5.109362259703448,
"learning_rate": 1.806106766605178e-05,
"loss": 0.7332,
"mean_token_accuracy": 0.8097867131233215,
"num_tokens": 12713443.0,
"step": 360
},
{
"entropy": 0.7828125,
"epoch": 0.5729984301412873,
"grad_norm": 5.017082784768869,
"learning_rate": 1.7979200383532055e-05,
"loss": 0.787,
"mean_token_accuracy": 0.7963611066341401,
"num_tokens": 12896207.0,
"step": 365
},
{
"entropy": 0.76328125,
"epoch": 0.5808477237048666,
"grad_norm": 5.429786099705206,
"learning_rate": 1.789583402694577e-05,
"loss": 0.751,
"mean_token_accuracy": 0.8028202712535858,
"num_tokens": 13074959.0,
"step": 370
},
{
"entropy": 0.800390625,
"epoch": 0.5886970172684458,
"grad_norm": 4.665804832964679,
"learning_rate": 1.7810984258556955e-05,
"loss": 0.7902,
"mean_token_accuracy": 0.7924364864826202,
"num_tokens": 13239381.0,
"step": 375
},
{
"entropy": 0.821484375,
"epoch": 0.5965463108320251,
"grad_norm": 5.110027949764656,
"learning_rate": 1.7724667019322258e-05,
"loss": 0.8013,
"mean_token_accuracy": 0.7881995797157287,
"num_tokens": 13409287.0,
"step": 380
},
{
"entropy": 0.79921875,
"epoch": 0.6043956043956044,
"grad_norm": 5.316113035916057,
"learning_rate": 1.7636898525896057e-05,
"loss": 0.7955,
"mean_token_accuracy": 0.7934599101543427,
"num_tokens": 13587551.0,
"step": 385
},
{
"entropy": 0.784375,
"epoch": 0.6122448979591837,
"grad_norm": 4.69729907059058,
"learning_rate": 1.7547695267583794e-05,
"loss": 0.7497,
"mean_token_accuracy": 0.8004011273384094,
"num_tokens": 13765322.0,
"step": 390
},
{
"entropy": 0.835546875,
"epoch": 0.6200941915227629,
"grad_norm": 4.7498568521557285,
"learning_rate": 1.74570740032441e-05,
"loss": 0.8571,
"mean_token_accuracy": 0.7803294241428376,
"num_tokens": 13941975.0,
"step": 395
},
{
"entropy": 0.864453125,
"epoch": 0.6279434850863422,
"grad_norm": 4.6116162900514315,
"learning_rate": 1.736505175814025e-05,
"loss": 0.851,
"mean_token_accuracy": 0.7785885393619537,
"num_tokens": 14124355.0,
"step": 400
},
{
"entropy": 0.801171875,
"epoch": 0.6357927786499215,
"grad_norm": 5.733457038092804,
"learning_rate": 1.7271645820741586e-05,
"loss": 0.8094,
"mean_token_accuracy": 0.78756765127182,
"num_tokens": 14293787.0,
"step": 405
},
{
"entropy": 0.739453125,
"epoch": 0.6436420722135008,
"grad_norm": 4.338135609354382,
"learning_rate": 1.7176873739475475e-05,
"loss": 0.7284,
"mean_token_accuracy": 0.8109029471874237,
"num_tokens": 14474817.0,
"step": 410
},
{
"entropy": 0.767578125,
"epoch": 0.6514913657770801,
"grad_norm": 5.302940753837162,
"learning_rate": 1.7080753319430452e-05,
"loss": 0.7589,
"mean_token_accuracy": 0.80281902551651,
"num_tokens": 14652072.0,
"step": 415
},
{
"entropy": 0.8328125,
"epoch": 0.6593406593406593,
"grad_norm": 5.1332141590489,
"learning_rate": 1.6983302619011125e-05,
"loss": 0.8207,
"mean_token_accuracy": 0.7852710247039795,
"num_tokens": 14825230.0,
"step": 420
},
{
"entropy": 0.72265625,
"epoch": 0.6671899529042387,
"grad_norm": 4.472690721905866,
"learning_rate": 1.6884539946545486e-05,
"loss": 0.6966,
"mean_token_accuracy": 0.8123364210128784,
"num_tokens": 14998108.0,
"step": 425
},
{
"entropy": 0.796484375,
"epoch": 0.6750392464678179,
"grad_norm": 4.634375280959423,
"learning_rate": 1.6784483856845287e-05,
"loss": 0.792,
"mean_token_accuracy": 0.7913555383682251,
"num_tokens": 15179352.0,
"step": 430
},
{
"entropy": 0.786328125,
"epoch": 0.6828885400313972,
"grad_norm": 4.746241903035234,
"learning_rate": 1.6683153147720098e-05,
"loss": 0.7729,
"mean_token_accuracy": 0.7997291147708893,
"num_tokens": 15350374.0,
"step": 435
},
{
"entropy": 0.7984375,
"epoch": 0.6907378335949764,
"grad_norm": 4.585308958448852,
"learning_rate": 1.6580566856445684e-05,
"loss": 0.8056,
"mean_token_accuracy": 0.7899181842803955,
"num_tokens": 15526013.0,
"step": 440
},
{
"entropy": 0.819921875,
"epoch": 0.6985871271585558,
"grad_norm": 4.601688747850842,
"learning_rate": 1.647674425618747e-05,
"loss": 0.8099,
"mean_token_accuracy": 0.7879339218139648,
"num_tokens": 15699724.0,
"step": 445
},
{
"entropy": 0.76875,
"epoch": 0.706436420722135,
"grad_norm": 5.412743559231603,
"learning_rate": 1.6371704852379587e-05,
"loss": 0.7586,
"mean_token_accuracy": 0.7998056769371032,
"num_tokens": 15874961.0,
"step": 450
},
{
"entropy": 0.797265625,
"epoch": 0.7142857142857143,
"grad_norm": 5.991686412229486,
"learning_rate": 1.6265468379060364e-05,
"loss": 0.7824,
"mean_token_accuracy": 0.7962137937545777,
"num_tokens": 16041765.0,
"step": 455
},
{
"entropy": 0.79296875,
"epoch": 0.7221350078492935,
"grad_norm": 4.671481067368767,
"learning_rate": 1.615805479516484e-05,
"loss": 0.7786,
"mean_token_accuracy": 0.7946656525135041,
"num_tokens": 16217463.0,
"step": 460
},
{
"entropy": 0.7421875,
"epoch": 0.7299843014128728,
"grad_norm": 4.396017574728974,
"learning_rate": 1.6049484280775012e-05,
"loss": 0.7342,
"mean_token_accuracy": 0.8057381689548493,
"num_tokens": 16400762.0,
"step": 465
},
{
"entropy": 0.836328125,
"epoch": 0.7378335949764521,
"grad_norm": 5.547458007168034,
"learning_rate": 1.593977723332855e-05,
"loss": 0.8167,
"mean_token_accuracy": 0.785160768032074,
"num_tokens": 16580373.0,
"step": 470
},
{
"entropy": 0.81328125,
"epoch": 0.7456828885400314,
"grad_norm": 5.112513515390749,
"learning_rate": 1.5828954263786688e-05,
"loss": 0.8067,
"mean_token_accuracy": 0.7897652447223663,
"num_tokens": 16755173.0,
"step": 475
},
{
"entropy": 0.771875,
"epoch": 0.7535321821036107,
"grad_norm": 4.284314946059211,
"learning_rate": 1.571703619276197e-05,
"loss": 0.7624,
"mean_token_accuracy": 0.7975770771503449,
"num_tokens": 16928371.0,
"step": 480
},
{
"entropy": 0.7578125,
"epoch": 0.7613814756671899,
"grad_norm": 4.801660029299967,
"learning_rate": 1.5604044046606638e-05,
"loss": 0.7405,
"mean_token_accuracy": 0.8036318182945251,
"num_tokens": 17109329.0,
"step": 485
},
{
"entropy": 0.818359375,
"epoch": 0.7692307692307693,
"grad_norm": 5.28306110517908,
"learning_rate": 1.548999905346234e-05,
"loss": 0.8107,
"mean_token_accuracy": 0.7882427215576172,
"num_tokens": 17278892.0,
"step": 490
},
{
"entropy": 0.7859375,
"epoch": 0.7770800627943485,
"grad_norm": 15.112721306709377,
"learning_rate": 1.537492263927196e-05,
"loss": 0.8027,
"mean_token_accuracy": 0.796792733669281,
"num_tokens": 17453647.0,
"step": 495
},
{
"entropy": 0.915234375,
"epoch": 0.7849293563579278,
"grad_norm": 4.296741151037327,
"learning_rate": 1.5258836423754258e-05,
"loss": 0.8982,
"mean_token_accuracy": 0.7748664259910584,
"num_tokens": 17623664.0,
"step": 500
},
{
"entropy": 0.8140625,
"epoch": 0.792778649921507,
"grad_norm": 4.661848552053591,
"learning_rate": 1.5141762216342107e-05,
"loss": 0.7966,
"mean_token_accuracy": 0.792145174741745,
"num_tokens": 17783873.0,
"step": 505
},
{
"entropy": 0.772265625,
"epoch": 0.8006279434850864,
"grad_norm": 4.675795378963432,
"learning_rate": 1.5023722012085098e-05,
"loss": 0.7635,
"mean_token_accuracy": 0.7970840752124786,
"num_tokens": 17955480.0,
"step": 510
},
{
"entropy": 0.765625,
"epoch": 0.8084772370486656,
"grad_norm": 4.729782812344114,
"learning_rate": 1.4904737987517293e-05,
"loss": 0.7329,
"mean_token_accuracy": 0.8076993823051453,
"num_tokens": 18120464.0,
"step": 515
},
{
"entropy": 0.814453125,
"epoch": 0.8163265306122449,
"grad_norm": 5.0071131052667015,
"learning_rate": 1.4784832496490824e-05,
"loss": 0.7925,
"mean_token_accuracy": 0.7883158624172211,
"num_tokens": 18294198.0,
"step": 520
},
{
"entropy": 0.7734375,
"epoch": 0.8241758241758241,
"grad_norm": 5.267980807907245,
"learning_rate": 1.4664028065976245e-05,
"loss": 0.7613,
"mean_token_accuracy": 0.8003257095813752,
"num_tokens": 18475591.0,
"step": 525
},
{
"entropy": 0.785546875,
"epoch": 0.8320251177394035,
"grad_norm": 4.671829616451408,
"learning_rate": 1.4542347391830308e-05,
"loss": 0.7572,
"mean_token_accuracy": 0.8027947068214416,
"num_tokens": 18651337.0,
"step": 530
},
{
"entropy": 0.777734375,
"epoch": 0.8398744113029827,
"grad_norm": 4.5168552231692916,
"learning_rate": 1.4419813334532037e-05,
"loss": 0.7769,
"mean_token_accuracy": 0.7937651753425599,
"num_tokens": 18811548.0,
"step": 535
},
{
"entropy": 0.745703125,
"epoch": 0.847723704866562,
"grad_norm": 4.675368428382704,
"learning_rate": 1.4296448914887866e-05,
"loss": 0.7328,
"mean_token_accuracy": 0.8040679156780243,
"num_tokens": 18980140.0,
"step": 540
},
{
"entropy": 0.73046875,
"epoch": 0.8555729984301413,
"grad_norm": 4.5422953697606285,
"learning_rate": 1.4172277309706677e-05,
"loss": 0.7198,
"mean_token_accuracy": 0.8091396510601043,
"num_tokens": 19155730.0,
"step": 545
},
{
"entropy": 0.77734375,
"epoch": 0.8634222919937206,
"grad_norm": 4.113548725011141,
"learning_rate": 1.4047321847445474e-05,
"loss": 0.7875,
"mean_token_accuracy": 0.7970530390739441,
"num_tokens": 19331206.0,
"step": 550
},
{
"entropy": 0.79609375,
"epoch": 0.8712715855572999,
"grad_norm": 4.491027067352344,
"learning_rate": 1.392160600382663e-05,
"loss": 0.7744,
"mean_token_accuracy": 0.7995224177837372,
"num_tokens": 19501358.0,
"step": 555
},
{
"entropy": 0.7640625,
"epoch": 0.8791208791208791,
"grad_norm": 6.530128605470617,
"learning_rate": 1.3795153397427426e-05,
"loss": 0.7383,
"mean_token_accuracy": 0.8075939774513244,
"num_tokens": 19682092.0,
"step": 560
},
{
"entropy": 0.791015625,
"epoch": 0.8869701726844584,
"grad_norm": 5.163885415034253,
"learning_rate": 1.3667987785242776e-05,
"loss": 0.7831,
"mean_token_accuracy": 0.7966715455055237,
"num_tokens": 19854955.0,
"step": 565
},
{
"entropy": 0.755859375,
"epoch": 0.8948194662480377,
"grad_norm": 4.740210922746398,
"learning_rate": 1.3540133058221927e-05,
"loss": 0.7349,
"mean_token_accuracy": 0.8023954510688782,
"num_tokens": 20014539.0,
"step": 570
},
{
"entropy": 0.7421875,
"epoch": 0.902668759811617,
"grad_norm": 4.792699385906238,
"learning_rate": 1.3411613236779996e-05,
"loss": 0.7484,
"mean_token_accuracy": 0.8043593823909759,
"num_tokens": 20185864.0,
"step": 575
},
{
"entropy": 0.784765625,
"epoch": 0.9105180533751962,
"grad_norm": 5.861965682781705,
"learning_rate": 1.328245246628521e-05,
"loss": 0.7676,
"mean_token_accuracy": 0.7994747996330261,
"num_tokens": 20353415.0,
"step": 580
},
{
"entropy": 0.7734375,
"epoch": 0.9183673469387755,
"grad_norm": 4.553143635133164,
"learning_rate": 1.3152675012522629e-05,
"loss": 0.7602,
"mean_token_accuracy": 0.8037185490131378,
"num_tokens": 20528137.0,
"step": 585
},
{
"entropy": 0.8125,
"epoch": 0.9262166405023547,
"grad_norm": 4.341140007225268,
"learning_rate": 1.302230525713527e-05,
"loss": 0.8159,
"mean_token_accuracy": 0.787601375579834,
"num_tokens": 20704595.0,
"step": 590
},
{
"entropy": 0.815625,
"epoch": 0.9340659340659341,
"grad_norm": 4.534727275721023,
"learning_rate": 1.2891367693043477e-05,
"loss": 0.8041,
"mean_token_accuracy": 0.7893571734428406,
"num_tokens": 20874161.0,
"step": 595
},
{
"entropy": 0.7796875,
"epoch": 0.9419152276295133,
"grad_norm": 5.302756725693076,
"learning_rate": 1.2759886919843354e-05,
"loss": 0.7448,
"mean_token_accuracy": 0.8065652489662171,
"num_tokens": 21034972.0,
"step": 600
},
{
"entropy": 0.778515625,
"epoch": 0.9497645211930926,
"grad_norm": 4.358915295776673,
"learning_rate": 1.262788763918518e-05,
"loss": 0.7892,
"mean_token_accuracy": 0.7949841141700744,
"num_tokens": 21208208.0,
"step": 605
},
{
"entropy": 0.80546875,
"epoch": 0.957613814756672,
"grad_norm": 4.657817652151732,
"learning_rate": 1.2495394650132628e-05,
"loss": 0.7964,
"mean_token_accuracy": 0.7912369012832642,
"num_tokens": 21385817.0,
"step": 610
},
{
"entropy": 0.805859375,
"epoch": 0.9654631083202512,
"grad_norm": 4.053778957869286,
"learning_rate": 1.2362432844503725e-05,
"loss": 0.7902,
"mean_token_accuracy": 0.7954855740070343,
"num_tokens": 21554177.0,
"step": 615
},
{
"entropy": 0.77578125,
"epoch": 0.9733124018838305,
"grad_norm": 4.449402252518514,
"learning_rate": 1.222902720219433e-05,
"loss": 0.7802,
"mean_token_accuracy": 0.7978703141212463,
"num_tokens": 21733009.0,
"step": 620
},
{
"entropy": 0.82265625,
"epoch": 0.9811616954474097,
"grad_norm": 4.034946890134492,
"learning_rate": 1.209520278648512e-05,
"loss": 0.7812,
"mean_token_accuracy": 0.797600257396698,
"num_tokens": 21916510.0,
"step": 625
},
{
"entropy": 0.6578125,
"epoch": 0.989010989010989,
"grad_norm": 4.245901684288803,
"learning_rate": 1.1960984739332851e-05,
"loss": 0.6496,
"mean_token_accuracy": 0.824074250459671,
"num_tokens": 22091705.0,
"step": 630
},
{
"entropy": 0.796484375,
"epoch": 0.9968602825745683,
"grad_norm": 4.100658254128751,
"learning_rate": 1.1826398276646897e-05,
"loss": 0.7845,
"mean_token_accuracy": 0.7937996804714202,
"num_tokens": 22260035.0,
"step": 635
},
{
"entropy": 0.558984375,
"epoch": 1.0047095761381475,
"grad_norm": 5.439305366155218,
"learning_rate": 1.1691468683551865e-05,
"loss": 0.5154,
"mean_token_accuracy": 0.8592477262020111,
"num_tokens": 22441941.0,
"step": 640
},
{
"entropy": 0.3642578125,
"epoch": 1.012558869701727,
"grad_norm": 4.420472353359968,
"learning_rate": 1.1556221309637204e-05,
"loss": 0.3321,
"mean_token_accuracy": 0.9043326616287232,
"num_tokens": 22615765.0,
"step": 645
},
{
"entropy": 0.3724609375,
"epoch": 1.0204081632653061,
"grad_norm": 4.573568228629025,
"learning_rate": 1.1420681564194694e-05,
"loss": 0.335,
"mean_token_accuracy": 0.9050298452377319,
"num_tokens": 22788401.0,
"step": 650
},
{
"entropy": 0.4099609375,
"epoch": 1.0282574568288854,
"grad_norm": 4.215481005342787,
"learning_rate": 1.1284874911444763e-05,
"loss": 0.3632,
"mean_token_accuracy": 0.8970784783363343,
"num_tokens": 22960605.0,
"step": 655
},
{
"entropy": 0.3654296875,
"epoch": 1.0361067503924646,
"grad_norm": 4.218508213156498,
"learning_rate": 1.1148826865752445e-05,
"loss": 0.3575,
"mean_token_accuracy": 0.898915582895279,
"num_tokens": 23139019.0,
"step": 660
},
{
"entropy": 0.455078125,
"epoch": 1.043956043956044,
"grad_norm": 3.741632896639657,
"learning_rate": 1.1012562986833909e-05,
"loss": 0.4289,
"mean_token_accuracy": 0.8870194792747498,
"num_tokens": 23320382.0,
"step": 665
},
{
"entropy": 0.3673828125,
"epoch": 1.0518053375196232,
"grad_norm": 4.3558521664226095,
"learning_rate": 1.0876108874954498e-05,
"loss": 0.3376,
"mean_token_accuracy": 0.902996426820755,
"num_tokens": 23483411.0,
"step": 670
},
{
"entropy": 0.402734375,
"epoch": 1.0596546310832025,
"grad_norm": 4.363221525545958,
"learning_rate": 1.0739490166119155e-05,
"loss": 0.3553,
"mean_token_accuracy": 0.8993325650691986,
"num_tokens": 23655293.0,
"step": 675
},
{
"entropy": 0.347265625,
"epoch": 1.0675039246467817,
"grad_norm": 4.3509011526726225,
"learning_rate": 1.060273252725609e-05,
"loss": 0.3179,
"mean_token_accuracy": 0.9088070094585419,
"num_tokens": 23827360.0,
"step": 680
},
{
"entropy": 0.3892578125,
"epoch": 1.0753532182103611,
"grad_norm": 3.9376143288253336,
"learning_rate": 1.0465861651394673e-05,
"loss": 0.3389,
"mean_token_accuracy": 0.9021193146705627,
"num_tokens": 23998614.0,
"step": 685
},
{
"entropy": 0.394140625,
"epoch": 1.0832025117739403,
"grad_norm": 4.541470682953176,
"learning_rate": 1.0328903252838415e-05,
"loss": 0.3572,
"mean_token_accuracy": 0.8969906985759735,
"num_tokens": 24162044.0,
"step": 690
},
{
"entropy": 0.378515625,
"epoch": 1.0910518053375196,
"grad_norm": 4.055789916536888,
"learning_rate": 1.0191883062333964e-05,
"loss": 0.326,
"mean_token_accuracy": 0.9068751513957978,
"num_tokens": 24337261.0,
"step": 695
},
{
"entropy": 0.333984375,
"epoch": 1.098901098901099,
"grad_norm": 5.830501058734357,
"learning_rate": 1.0054826822236983e-05,
"loss": 0.2959,
"mean_token_accuracy": 0.9131093323230743,
"num_tokens": 24502246.0,
"step": 700
},
{
"entropy": 0.3708984375,
"epoch": 1.1067503924646782,
"grad_norm": 4.028032326842961,
"learning_rate": 9.917760281675867e-06,
"loss": 0.3345,
"mean_token_accuracy": 0.903233277797699,
"num_tokens": 24678284.0,
"step": 705
},
{
"entropy": 0.389453125,
"epoch": 1.1145996860282574,
"grad_norm": 4.032598919287802,
"learning_rate": 9.780709191714187e-06,
"loss": 0.3506,
"mean_token_accuracy": 0.8997887074947357,
"num_tokens": 24847385.0,
"step": 710
},
{
"entropy": 0.3677734375,
"epoch": 1.1224489795918366,
"grad_norm": 5.251015713215736,
"learning_rate": 9.643699300512781e-06,
"loss": 0.3321,
"mean_token_accuracy": 0.9033790588378906,
"num_tokens": 25015708.0,
"step": 715
},
{
"entropy": 0.386328125,
"epoch": 1.130298273155416,
"grad_norm": 3.635272069781761,
"learning_rate": 9.506756348492348e-06,
"loss": 0.3631,
"mean_token_accuracy": 0.8966520011425019,
"num_tokens": 25189049.0,
"step": 720
},
{
"entropy": 0.36796875,
"epoch": 1.1381475667189953,
"grad_norm": 3.905345940736465,
"learning_rate": 9.369906063497547e-06,
"loss": 0.3171,
"mean_token_accuracy": 0.9075049877166748,
"num_tokens": 25370100.0,
"step": 725
},
{
"entropy": 0.3763671875,
"epoch": 1.1459968602825745,
"grad_norm": 4.368923829495938,
"learning_rate": 9.233174155963432e-06,
"loss": 0.3491,
"mean_token_accuracy": 0.9008313238620758,
"num_tokens": 25538013.0,
"step": 730
},
{
"entropy": 0.3888671875,
"epoch": 1.1538461538461537,
"grad_norm": 3.95972099627997,
"learning_rate": 9.096586314085162e-06,
"loss": 0.3509,
"mean_token_accuracy": 0.8985212922096253,
"num_tokens": 25710812.0,
"step": 735
},
{
"entropy": 0.3560546875,
"epoch": 1.1616954474097332,
"grad_norm": 3.9507409203470085,
"learning_rate": 8.960168198991885e-06,
"loss": 0.3241,
"mean_token_accuracy": 0.9068559765815735,
"num_tokens": 25878865.0,
"step": 740
},
{
"entropy": 0.342578125,
"epoch": 1.1695447409733124,
"grad_norm": 3.7761986277605653,
"learning_rate": 8.823945439925725e-06,
"loss": 0.3036,
"mean_token_accuracy": 0.9105329990386963,
"num_tokens": 26053389.0,
"step": 745
},
{
"entropy": 0.3412109375,
"epoch": 1.1773940345368916,
"grad_norm": 3.793472691376912,
"learning_rate": 8.687943629426725e-06,
"loss": 0.3032,
"mean_token_accuracy": 0.9119175374507904,
"num_tokens": 26237554.0,
"step": 750
},
{
"entropy": 0.3552734375,
"epoch": 1.185243328100471,
"grad_norm": 4.031904479069314,
"learning_rate": 8.552188318524737e-06,
"loss": 0.3305,
"mean_token_accuracy": 0.9051981568336487,
"num_tokens": 26420657.0,
"step": 755
},
{
"entropy": 0.3556640625,
"epoch": 1.1930926216640503,
"grad_norm": 4.18585912610383,
"learning_rate": 8.416705011939052e-06,
"loss": 0.3203,
"mean_token_accuracy": 0.9096585392951966,
"num_tokens": 26594423.0,
"step": 760
},
{
"entropy": 0.36640625,
"epoch": 1.2009419152276295,
"grad_norm": 3.62287005225727,
"learning_rate": 8.281519163286772e-06,
"loss": 0.3283,
"mean_token_accuracy": 0.9058149755001068,
"num_tokens": 26772548.0,
"step": 765
},
{
"entropy": 0.377734375,
"epoch": 1.2087912087912087,
"grad_norm": 3.790409760971858,
"learning_rate": 8.146656170300772e-06,
"loss": 0.3457,
"mean_token_accuracy": 0.9028036296367645,
"num_tokens": 26938630.0,
"step": 770
},
{
"entropy": 0.3884765625,
"epoch": 1.2166405023547882,
"grad_norm": 3.818641932499593,
"learning_rate": 8.01214137005815e-06,
"loss": 0.3496,
"mean_token_accuracy": 0.9000514507293701,
"num_tokens": 27111807.0,
"step": 775
},
{
"entropy": 0.34375,
"epoch": 1.2244897959183674,
"grad_norm": 3.577131008292094,
"learning_rate": 7.878000034220092e-06,
"loss": 0.3108,
"mean_token_accuracy": 0.9111146748065948,
"num_tokens": 27301565.0,
"step": 780
},
{
"entropy": 0.35859375,
"epoch": 1.2323390894819466,
"grad_norm": 3.9774086899528927,
"learning_rate": 7.74425736428401e-06,
"loss": 0.3098,
"mean_token_accuracy": 0.9107671201229095,
"num_tokens": 27481197.0,
"step": 785
},
{
"entropy": 0.3740234375,
"epoch": 1.2401883830455258,
"grad_norm": 3.616653726364246,
"learning_rate": 7.6109384868488646e-06,
"loss": 0.3445,
"mean_token_accuracy": 0.9035484492778778,
"num_tokens": 27656624.0,
"step": 790
},
{
"entropy": 0.3419921875,
"epoch": 1.2480376766091053,
"grad_norm": 3.6294243587435164,
"learning_rate": 7.478068448894577e-06,
"loss": 0.2992,
"mean_token_accuracy": 0.9126500189304352,
"num_tokens": 27825076.0,
"step": 795
},
{
"entropy": 0.378515625,
"epoch": 1.2558869701726845,
"grad_norm": 3.5723456408449685,
"learning_rate": 7.3456722130763665e-06,
"loss": 0.3424,
"mean_token_accuracy": 0.9039673745632172,
"num_tokens": 28007761.0,
"step": 800
},
{
"entropy": 0.369140625,
"epoch": 1.2637362637362637,
"grad_norm": 3.8410419633102793,
"learning_rate": 7.213774653034958e-06,
"loss": 0.3435,
"mean_token_accuracy": 0.9041143357753754,
"num_tokens": 28187081.0,
"step": 805
},
{
"entropy": 0.35234375,
"epoch": 1.2715855572998431,
"grad_norm": 3.9529644075488384,
"learning_rate": 7.082400548723505e-06,
"loss": 0.3122,
"mean_token_accuracy": 0.9107161283493042,
"num_tokens": 28363030.0,
"step": 810
},
{
"entropy": 0.366796875,
"epoch": 1.2794348508634223,
"grad_norm": 4.064104450162385,
"learning_rate": 6.951574581752111e-06,
"loss": 0.3319,
"mean_token_accuracy": 0.9035371005535126,
"num_tokens": 28535595.0,
"step": 815
},
{
"entropy": 0.381640625,
"epoch": 1.2872841444270016,
"grad_norm": 5.9889597321012555,
"learning_rate": 6.8213213307508205e-06,
"loss": 0.3437,
"mean_token_accuracy": 0.9042888820171356,
"num_tokens": 28714743.0,
"step": 820
},
{
"entropy": 0.3158203125,
"epoch": 1.2951334379905808,
"grad_norm": 3.9581361848534358,
"learning_rate": 6.6916652667519855e-06,
"loss": 0.3069,
"mean_token_accuracy": 0.9155528962612152,
"num_tokens": 28889873.0,
"step": 825
},
{
"entropy": 0.35390625,
"epoch": 1.30298273155416,
"grad_norm": 3.677250554121152,
"learning_rate": 6.562630748592794e-06,
"loss": 0.32,
"mean_token_accuracy": 0.9077256739139556,
"num_tokens": 29059408.0,
"step": 830
},
{
"entropy": 0.3638671875,
"epoch": 1.3108320251177394,
"grad_norm": 4.061917185197391,
"learning_rate": 6.434242018338948e-06,
"loss": 0.3294,
"mean_token_accuracy": 0.9073351263999939,
"num_tokens": 29236192.0,
"step": 835
},
{
"entropy": 0.3373046875,
"epoch": 1.3186813186813187,
"grad_norm": 3.7207684760296558,
"learning_rate": 6.3065231967302055e-06,
"loss": 0.2991,
"mean_token_accuracy": 0.9142134130001068,
"num_tokens": 29405817.0,
"step": 840
},
{
"entropy": 0.33486328125,
"epoch": 1.3265306122448979,
"grad_norm": 4.167446472870183,
"learning_rate": 6.179498278648766e-06,
"loss": 0.2989,
"mean_token_accuracy": 0.9135854482650757,
"num_tokens": 29584863.0,
"step": 845
},
{
"entropy": 0.3779296875,
"epoch": 1.3343799058084773,
"grad_norm": 3.6630259747898126,
"learning_rate": 6.053191128611298e-06,
"loss": 0.3288,
"mean_token_accuracy": 0.9065738797187806,
"num_tokens": 29760752.0,
"step": 850
},
{
"entropy": 0.34375,
"epoch": 1.3422291993720565,
"grad_norm": 3.653325382479137,
"learning_rate": 5.927625476285426e-06,
"loss": 0.3081,
"mean_token_accuracy": 0.910161966085434,
"num_tokens": 29933046.0,
"step": 855
},
{
"entropy": 0.3578125,
"epoch": 1.3500784929356358,
"grad_norm": 5.149179896152341,
"learning_rate": 5.802824912031588e-06,
"loss": 0.324,
"mean_token_accuracy": 0.9067489266395569,
"num_tokens": 30107202.0,
"step": 860
},
{
"entropy": 0.3896484375,
"epoch": 1.3579277864992152,
"grad_norm": 4.046787648445982,
"learning_rate": 5.678812882471047e-06,
"loss": 0.3563,
"mean_token_accuracy": 0.8968492567539215,
"num_tokens": 30287266.0,
"step": 865
},
{
"entropy": 0.3703125,
"epoch": 1.3657770800627944,
"grad_norm": 3.71919627416169,
"learning_rate": 5.555612686080909e-06,
"loss": 0.3345,
"mean_token_accuracy": 0.9046319007873536,
"num_tokens": 30454583.0,
"step": 870
},
{
"entropy": 0.35546875,
"epoch": 1.3736263736263736,
"grad_norm": 4.812266161183036,
"learning_rate": 5.4332474688169766e-06,
"loss": 0.3162,
"mean_token_accuracy": 0.9111818552017212,
"num_tokens": 30646710.0,
"step": 875
},
{
"entropy": 0.3578125,
"epoch": 1.3814756671899528,
"grad_norm": 3.470032833070242,
"learning_rate": 5.311740219765247e-06,
"loss": 0.3304,
"mean_token_accuracy": 0.9047977864742279,
"num_tokens": 30838566.0,
"step": 880
},
{
"entropy": 0.3638671875,
"epoch": 1.389324960753532,
"grad_norm": 4.00242956343701,
"learning_rate": 5.191113766822905e-06,
"loss": 0.3265,
"mean_token_accuracy": 0.9078042209148407,
"num_tokens": 31016269.0,
"step": 885
},
{
"entropy": 0.360546875,
"epoch": 1.3971742543171115,
"grad_norm": 6.163854036366698,
"learning_rate": 5.071390772409579e-06,
"loss": 0.3307,
"mean_token_accuracy": 0.9069823741912841,
"num_tokens": 31193240.0,
"step": 890
},
{
"entropy": 0.335546875,
"epoch": 1.4050235478806907,
"grad_norm": 3.7842513748094695,
"learning_rate": 4.952593729209671e-06,
"loss": 0.3014,
"mean_token_accuracy": 0.912151551246643,
"num_tokens": 31362873.0,
"step": 895
},
{
"entropy": 0.3505859375,
"epoch": 1.41287284144427,
"grad_norm": 3.8252428181776716,
"learning_rate": 4.834744955946631e-06,
"loss": 0.3171,
"mean_token_accuracy": 0.9103285014629364,
"num_tokens": 31532789.0,
"step": 900
},
{
"entropy": 0.3349609375,
"epoch": 1.4207221350078494,
"grad_norm": 3.8195957170982107,
"learning_rate": 4.717866593189847e-06,
"loss": 0.3016,
"mean_token_accuracy": 0.912840747833252,
"num_tokens": 31701494.0,
"step": 905
},
{
"entropy": 0.37734375,
"epoch": 1.4285714285714286,
"grad_norm": 3.822109137774168,
"learning_rate": 4.60198059919505e-06,
"loss": 0.3351,
"mean_token_accuracy": 0.9047119557857514,
"num_tokens": 31872676.0,
"step": 910
},
{
"entropy": 0.3310546875,
"epoch": 1.4364207221350078,
"grad_norm": 3.8366883679239243,
"learning_rate": 4.487108745778958e-06,
"loss": 0.2812,
"mean_token_accuracy": 0.91913822889328,
"num_tokens": 32041074.0,
"step": 915
},
{
"entropy": 0.3435546875,
"epoch": 1.4442700156985873,
"grad_norm": 3.801981633982777,
"learning_rate": 4.373272614228932e-06,
"loss": 0.3144,
"mean_token_accuracy": 0.9115765929222107,
"num_tokens": 32212406.0,
"step": 920
},
{
"entropy": 0.346484375,
"epoch": 1.4521193092621665,
"grad_norm": 4.447578921600131,
"learning_rate": 4.260493591248458e-06,
"loss": 0.306,
"mean_token_accuracy": 0.911074984073639,
"num_tokens": 32378986.0,
"step": 925
},
{
"entropy": 0.3501953125,
"epoch": 1.4599686028257457,
"grad_norm": 4.190227519433511,
"learning_rate": 4.148792864939164e-06,
"loss": 0.3054,
"mean_token_accuracy": 0.9116972923278809,
"num_tokens": 32558393.0,
"step": 930
},
{
"entropy": 0.3251953125,
"epoch": 1.467817896389325,
"grad_norm": 4.161740647309898,
"learning_rate": 4.038191420820139e-06,
"loss": 0.2966,
"mean_token_accuracy": 0.9156083226203918,
"num_tokens": 32728716.0,
"step": 935
},
{
"entropy": 0.327734375,
"epoch": 1.4756671899529041,
"grad_norm": 3.6311813451272252,
"learning_rate": 3.92871003788535e-06,
"loss": 0.288,
"mean_token_accuracy": 0.918038284778595,
"num_tokens": 32905750.0,
"step": 940
},
{
"entropy": 0.3388671875,
"epoch": 1.4835164835164836,
"grad_norm": 3.738059542189779,
"learning_rate": 3.820369284699823e-06,
"loss": 0.2991,
"mean_token_accuracy": 0.9139463484287262,
"num_tokens": 33075994.0,
"step": 945
},
{
"entropy": 0.341796875,
"epoch": 1.4913657770800628,
"grad_norm": 3.7412909653457738,
"learning_rate": 3.713189515535368e-06,
"loss": 0.3152,
"mean_token_accuracy": 0.9099179923534393,
"num_tokens": 33248229.0,
"step": 950
},
{
"entropy": 0.3126953125,
"epoch": 1.499215070643642,
"grad_norm": 3.545489585791932,
"learning_rate": 3.607190866546578e-06,
"loss": 0.2855,
"mean_token_accuracy": 0.9172944724559784,
"num_tokens": 33422754.0,
"step": 955
},
{
"entropy": 0.32294921875,
"epoch": 1.5070643642072215,
"grad_norm": 3.933250466037461,
"learning_rate": 3.502393251987776e-06,
"loss": 0.2897,
"mean_token_accuracy": 0.9173169553279876,
"num_tokens": 33592867.0,
"step": 960
},
{
"entropy": 0.33154296875,
"epoch": 1.5149136577708007,
"grad_norm": 3.5231854886347347,
"learning_rate": 3.3988163604716928e-06,
"loss": 0.3022,
"mean_token_accuracy": 0.91502086520195,
"num_tokens": 33764810.0,
"step": 965
},
{
"entropy": 0.3322265625,
"epoch": 1.5227629513343799,
"grad_norm": 3.9047550692209225,
"learning_rate": 3.296479651270502e-06,
"loss": 0.289,
"mean_token_accuracy": 0.9168073177337647,
"num_tokens": 33935483.0,
"step": 970
},
{
"entropy": 0.3173828125,
"epoch": 1.5306122448979593,
"grad_norm": 3.7792544290205647,
"learning_rate": 3.195402350659945e-06,
"loss": 0.2933,
"mean_token_accuracy": 0.916690468788147,
"num_tokens": 34111426.0,
"step": 975
},
{
"entropy": 0.3521484375,
"epoch": 1.5384615384615383,
"grad_norm": 3.6732078529565517,
"learning_rate": 3.0956034483072573e-06,
"loss": 0.3133,
"mean_token_accuracy": 0.909661453962326,
"num_tokens": 34290003.0,
"step": 980
},
{
"entropy": 0.3787109375,
"epoch": 1.5463108320251178,
"grad_norm": 5.190106114131474,
"learning_rate": 2.997101693703518e-06,
"loss": 0.3385,
"mean_token_accuracy": 0.9050460577011108,
"num_tokens": 34460879.0,
"step": 985
},
{
"entropy": 0.2978515625,
"epoch": 1.554160125588697,
"grad_norm": 3.8928764556590694,
"learning_rate": 2.8999155926411203e-06,
"loss": 0.2647,
"mean_token_accuracy": 0.9240784645080566,
"num_tokens": 34645393.0,
"step": 990
},
{
"entropy": 0.3353515625,
"epoch": 1.5620094191522762,
"grad_norm": 3.995014435930791,
"learning_rate": 2.8040634037370727e-06,
"loss": 0.305,
"mean_token_accuracy": 0.9108369588851929,
"num_tokens": 34819057.0,
"step": 995
},
{
"entropy": 0.33828125,
"epoch": 1.5698587127158556,
"grad_norm": 3.5439052316501423,
"learning_rate": 2.7095631350026585e-06,
"loss": 0.2977,
"mean_token_accuracy": 0.9142949402332305,
"num_tokens": 34993858.0,
"step": 1000
},
{
"entropy": 0.3111328125,
"epoch": 1.5777080062794349,
"grad_norm": 3.4692400046730576,
"learning_rate": 2.616432540460255e-06,
"loss": 0.2801,
"mean_token_accuracy": 0.9186800062656403,
"num_tokens": 35176584.0,
"step": 1005
},
{
"entropy": 0.348828125,
"epoch": 1.585557299843014,
"grad_norm": 4.36529770249698,
"learning_rate": 2.524689116807826e-06,
"loss": 0.306,
"mean_token_accuracy": 0.9119703054428101,
"num_tokens": 35358793.0,
"step": 1010
},
{
"entropy": 0.3302734375,
"epoch": 1.5934065934065935,
"grad_norm": 4.023141037723821,
"learning_rate": 2.4343501001317604e-06,
"loss": 0.296,
"mean_token_accuracy": 0.9139317333698272,
"num_tokens": 35538704.0,
"step": 1015
},
{
"entropy": 0.315625,
"epoch": 1.6012558869701727,
"grad_norm": 3.523923427899709,
"learning_rate": 2.345432462668702e-06,
"loss": 0.288,
"mean_token_accuracy": 0.917642080783844,
"num_tokens": 35716046.0,
"step": 1020
},
{
"entropy": 0.31044921875,
"epoch": 1.609105180533752,
"grad_norm": 3.644734085017535,
"learning_rate": 2.257952909616914e-06,
"loss": 0.2666,
"mean_token_accuracy": 0.9209400594234467,
"num_tokens": 35891208.0,
"step": 1025
},
{
"entropy": 0.3283203125,
"epoch": 1.6169544740973314,
"grad_norm": 3.3371550953729354,
"learning_rate": 2.1719278759978225e-06,
"loss": 0.2935,
"mean_token_accuracy": 0.9164785146713257,
"num_tokens": 36067493.0,
"step": 1030
},
{
"entropy": 0.3154296875,
"epoch": 1.6248037676609104,
"grad_norm": 4.576016784982279,
"learning_rate": 2.0873735235683535e-06,
"loss": 0.2867,
"mean_token_accuracy": 0.919123786687851,
"num_tokens": 36242711.0,
"step": 1035
},
{
"entropy": 0.330078125,
"epoch": 1.6326530612244898,
"grad_norm": 3.7915992206510256,
"learning_rate": 2.004305737784541e-06,
"loss": 0.306,
"mean_token_accuracy": 0.9124962151050567,
"num_tokens": 36414692.0,
"step": 1040
},
{
"entropy": 0.34404296875,
"epoch": 1.640502354788069,
"grad_norm": 3.4114214410740784,
"learning_rate": 1.922740124817113e-06,
"loss": 0.3324,
"mean_token_accuracy": 0.9155471563339234,
"num_tokens": 36588961.0,
"step": 1045
},
{
"entropy": 0.32578125,
"epoch": 1.6483516483516483,
"grad_norm": 3.6594471068690626,
"learning_rate": 1.8426920086195065e-06,
"loss": 0.2862,
"mean_token_accuracy": 0.9176535487174988,
"num_tokens": 36769165.0,
"step": 1050
},
{
"entropy": 0.33359375,
"epoch": 1.6562009419152277,
"grad_norm": 3.5169016563130806,
"learning_rate": 1.7641764280489081e-06,
"loss": 0.3011,
"mean_token_accuracy": 0.9151181995868682,
"num_tokens": 36937514.0,
"step": 1055
},
{
"entropy": 0.3498046875,
"epoch": 1.664050235478807,
"grad_norm": 3.7880404375931866,
"learning_rate": 1.6872081340408763e-06,
"loss": 0.3118,
"mean_token_accuracy": 0.9124642968177795,
"num_tokens": 37112141.0,
"step": 1060
},
{
"entropy": 0.3388671875,
"epoch": 1.6718995290423861,
"grad_norm": 3.698633480496351,
"learning_rate": 1.6118015868380387e-06,
"loss": 0.303,
"mean_token_accuracy": 0.9128694295883178,
"num_tokens": 37284315.0,
"step": 1065
},
{
"entropy": 0.3212890625,
"epoch": 1.6797488226059656,
"grad_norm": 3.3123418573768357,
"learning_rate": 1.5379709532733944e-06,
"loss": 0.2808,
"mean_token_accuracy": 0.9190841019153595,
"num_tokens": 37466201.0,
"step": 1070
},
{
"entropy": 0.3390625,
"epoch": 1.6875981161695446,
"grad_norm": 3.593562594089776,
"learning_rate": 1.4657301041087812e-06,
"loss": 0.3123,
"mean_token_accuracy": 0.9126930415630341,
"num_tokens": 37643306.0,
"step": 1075
},
{
"entropy": 0.311328125,
"epoch": 1.695447409733124,
"grad_norm": 3.3618127002719485,
"learning_rate": 1.395092611428902e-06,
"loss": 0.2776,
"mean_token_accuracy": 0.9204743444919586,
"num_tokens": 37818425.0,
"step": 1080
},
{
"entropy": 0.33984375,
"epoch": 1.7032967032967035,
"grad_norm": 3.6575930674463466,
"learning_rate": 1.3260717460915296e-06,
"loss": 0.3101,
"mean_token_accuracy": 0.9130974769592285,
"num_tokens": 37992256.0,
"step": 1085
},
{
"entropy": 0.326953125,
"epoch": 1.7111459968602825,
"grad_norm": 3.5686085325723558,
"learning_rate": 1.2586804752342596e-06,
"loss": 0.2819,
"mean_token_accuracy": 0.9184079587459564,
"num_tokens": 38169078.0,
"step": 1090
},
{
"entropy": 0.30869140625,
"epoch": 1.718995290423862,
"grad_norm": 3.520061945368019,
"learning_rate": 1.1929314598383423e-06,
"loss": 0.2821,
"mean_token_accuracy": 0.917450076341629,
"num_tokens": 38356668.0,
"step": 1095
},
{
"entropy": 0.306640625,
"epoch": 1.7268445839874411,
"grad_norm": 3.9895743717145584,
"learning_rate": 1.1288370523500303e-06,
"loss": 0.2636,
"mean_token_accuracy": 0.9235479176044464,
"num_tokens": 38523569.0,
"step": 1100
},
{
"entropy": 0.3361328125,
"epoch": 1.7346938775510203,
"grad_norm": 3.776146789812367,
"learning_rate": 1.0664092943598936e-06,
"loss": 0.3036,
"mean_token_accuracy": 0.9140356600284576,
"num_tokens": 38694147.0,
"step": 1105
},
{
"entropy": 0.316015625,
"epoch": 1.7425431711145998,
"grad_norm": 3.4444521360956735,
"learning_rate": 1.0056599143405244e-06,
"loss": 0.2799,
"mean_token_accuracy": 0.9198241591453552,
"num_tokens": 38867562.0,
"step": 1110
},
{
"entropy": 0.3056640625,
"epoch": 1.750392464678179,
"grad_norm": 3.478490953949651,
"learning_rate": 9.466003254430933e-07,
"loss": 0.2735,
"mean_token_accuracy": 0.9212567389011384,
"num_tokens": 39034512.0,
"step": 1115
},
{
"entropy": 0.32734375,
"epoch": 1.7582417582417582,
"grad_norm": 3.5207204633120157,
"learning_rate": 8.892416233531064e-07,
"loss": 0.2837,
"mean_token_accuracy": 0.9190377771854401,
"num_tokens": 39203376.0,
"step": 1120
},
{
"entropy": 0.33671875,
"epoch": 1.7660910518053377,
"grad_norm": 3.396775939311065,
"learning_rate": 8.335945842058524e-07,
"loss": 0.2894,
"mean_token_accuracy": 0.917039567232132,
"num_tokens": 39391730.0,
"step": 1125
},
{
"entropy": 0.31611328125,
"epoch": 1.7739403453689166,
"grad_norm": 3.633075296397874,
"learning_rate": 7.79669662561845e-07,
"loss": 0.2834,
"mean_token_accuracy": 0.9175218880176544,
"num_tokens": 39559203.0,
"step": 1130
},
{
"entropy": 0.3142578125,
"epoch": 1.781789638932496,
"grad_norm": 3.693691007663445,
"learning_rate": 7.274769894426992e-07,
"loss": 0.2841,
"mean_token_accuracy": 0.9189928412437439,
"num_tokens": 39727754.0,
"step": 1135
},
{
"entropy": 0.3185546875,
"epoch": 1.7896389324960753,
"grad_norm": 3.3504527473164267,
"learning_rate": 6.770263704277958e-07,
"loss": 0.2838,
"mean_token_accuracy": 0.9185858130455017,
"num_tokens": 39911621.0,
"step": 1140
},
{
"entropy": 0.3326171875,
"epoch": 1.7974882260596545,
"grad_norm": 4.048970301070765,
"learning_rate": 6.283272838120747e-07,
"loss": 0.2917,
"mean_token_accuracy": 0.9172453165054322,
"num_tokens": 40080024.0,
"step": 1145
},
{
"entropy": 0.3310546875,
"epoch": 1.805337519623234,
"grad_norm": 4.157221708254746,
"learning_rate": 5.813888788253153e-07,
"loss": 0.2858,
"mean_token_accuracy": 0.9179641664028168,
"num_tokens": 40259404.0,
"step": 1150
},
{
"entropy": 0.3177734375,
"epoch": 1.8131868131868132,
"grad_norm": 3.7177900235708328,
"learning_rate": 5.362199739132656e-07,
"loss": 0.2813,
"mean_token_accuracy": 0.9183613300323487,
"num_tokens": 40435057.0,
"step": 1155
},
{
"entropy": 0.36015625,
"epoch": 1.8210361067503924,
"grad_norm": 3.7466301147455314,
"learning_rate": 4.928290550808734e-07,
"loss": 0.3155,
"mean_token_accuracy": 0.9126826763153076,
"num_tokens": 40617015.0,
"step": 1160
},
{
"entropy": 0.337890625,
"epoch": 1.8288854003139718,
"grad_norm": 3.714640014839032,
"learning_rate": 4.512242742980155e-07,
"loss": 0.2992,
"mean_token_accuracy": 0.914109718799591,
"num_tokens": 40785015.0,
"step": 1165
},
{
"entropy": 0.30908203125,
"epoch": 1.836734693877551,
"grad_norm": 3.510391275134882,
"learning_rate": 4.114134479679543e-07,
"loss": 0.2734,
"mean_token_accuracy": 0.9234184563159943,
"num_tokens": 40977530.0,
"step": 1170
},
{
"entropy": 0.318359375,
"epoch": 1.8445839874411303,
"grad_norm": 3.6233154389122,
"learning_rate": 3.734040554588514e-07,
"loss": 0.2828,
"mean_token_accuracy": 0.9189339816570282,
"num_tokens": 41153483.0,
"step": 1175
},
{
"entropy": 0.3365234375,
"epoch": 1.8524332810047097,
"grad_norm": 3.993058357311181,
"learning_rate": 3.372032376986034e-07,
"loss": 0.3026,
"mean_token_accuracy": 0.9142104327678681,
"num_tokens": 41332210.0,
"step": 1180
},
{
"entropy": 0.39375,
"epoch": 1.8602825745682887,
"grad_norm": 4.328973024123323,
"learning_rate": 3.028177958332512e-07,
"loss": 0.3665,
"mean_token_accuracy": 0.8986062467098236,
"num_tokens": 41507869.0,
"step": 1185
},
{
"entropy": 0.34140625,
"epoch": 1.8681318681318682,
"grad_norm": 3.2104548151581347,
"learning_rate": 2.7025418994922835e-07,
"loss": 0.3053,
"mean_token_accuracy": 0.9121370255947113,
"num_tokens": 41690709.0,
"step": 1190
},
{
"entropy": 0.31875,
"epoch": 1.8759811616954474,
"grad_norm": 3.7412102895291426,
"learning_rate": 2.3951853785969535e-07,
"loss": 0.2721,
"mean_token_accuracy": 0.9221031606197357,
"num_tokens": 41863334.0,
"step": 1195
},
{
"entropy": 0.3087890625,
"epoch": 1.8838304552590266,
"grad_norm": 3.280442340689253,
"learning_rate": 2.106166139551602e-07,
"loss": 0.2706,
"mean_token_accuracy": 0.9225331664085388,
"num_tokens": 42034420.0,
"step": 1200
},
{
"entropy": 0.3197265625,
"epoch": 1.891679748822606,
"grad_norm": 4.156766829239504,
"learning_rate": 1.8355384811863274e-07,
"loss": 0.2933,
"mean_token_accuracy": 0.9180307269096375,
"num_tokens": 42205691.0,
"step": 1205
},
{
"entropy": 0.3162109375,
"epoch": 1.8995290423861853,
"grad_norm": 3.7746269581832683,
"learning_rate": 1.5833532470549862e-07,
"loss": 0.2794,
"mean_token_accuracy": 0.9198680520057678,
"num_tokens": 42379079.0,
"step": 1210
},
{
"entropy": 0.313671875,
"epoch": 1.9073783359497645,
"grad_norm": 3.6980881739964797,
"learning_rate": 1.349657815883032e-07,
"loss": 0.2771,
"mean_token_accuracy": 0.9206208467483521,
"num_tokens": 42561550.0,
"step": 1215
},
{
"entropy": 0.3236328125,
"epoch": 1.915227629513344,
"grad_norm": 3.6057672291053167,
"learning_rate": 1.134496092666415e-07,
"loss": 0.2907,
"mean_token_accuracy": 0.9175450682640076,
"num_tokens": 42733313.0,
"step": 1220
},
{
"entropy": 0.326171875,
"epoch": 1.9230769230769231,
"grad_norm": 4.7667736940463055,
"learning_rate": 9.379085004229571e-08,
"loss": 0.2992,
"mean_token_accuracy": 0.916048800945282,
"num_tokens": 42920454.0,
"step": 1225
},
{
"entropy": 0.3322265625,
"epoch": 1.9309262166405023,
"grad_norm": 3.326013461136122,
"learning_rate": 7.599319725980047e-08,
"loss": 0.2969,
"mean_token_accuracy": 0.9171286225318909,
"num_tokens": 43103243.0,
"step": 1230
},
{
"entropy": 0.319921875,
"epoch": 1.9387755102040818,
"grad_norm": 3.78897614241427,
"learning_rate": 6.005999461256684e-08,
"loss": 0.2754,
"mean_token_accuracy": 0.9204862177371979,
"num_tokens": 43268888.0,
"step": 1235
},
{
"entropy": 0.2892578125,
"epoch": 1.9466248037676608,
"grad_norm": 2.9548706636867004,
"learning_rate": 4.599423551468807e-08,
"loss": 0.2507,
"mean_token_accuracy": 0.9277023196220398,
"num_tokens": 43458666.0,
"step": 1240
},
{
"entropy": 0.32265625,
"epoch": 1.9544740973312402,
"grad_norm": 3.410711053990863,
"learning_rate": 3.379856253855951e-08,
"loss": 0.2941,
"mean_token_accuracy": 0.916301691532135,
"num_tokens": 43643099.0,
"step": 1245
},
{
"entropy": 0.353515625,
"epoch": 1.9623233908948194,
"grad_norm": 3.5072404677907123,
"learning_rate": 2.347526691841906e-08,
"loss": 0.3051,
"mean_token_accuracy": 0.911671257019043,
"num_tokens": 43824484.0,
"step": 1250
},
{
"entropy": 0.309375,
"epoch": 1.9701726844583987,
"grad_norm": 3.7842286783508015,
"learning_rate": 1.5026288119874833e-08,
"loss": 0.2791,
"mean_token_accuracy": 0.9217668533325195,
"num_tokens": 43998806.0,
"step": 1255
},
{
"entropy": 0.3134765625,
"epoch": 1.978021978021978,
"grad_norm": 3.5676523086191914,
"learning_rate": 8.453213475543287e-09,
"loss": 0.2865,
"mean_token_accuracy": 0.9178596436977386,
"num_tokens": 44188721.0,
"step": 1260
},
{
"entropy": 0.3376953125,
"epoch": 1.9858712715855573,
"grad_norm": 3.569512799557757,
"learning_rate": 3.757277886824451e-09,
"loss": 0.2936,
"mean_token_accuracy": 0.917294180393219,
"num_tokens": 44350631.0,
"step": 1265
},
{
"entropy": 0.29462890625,
"epoch": 1.9937205651491365,
"grad_norm": 3.519511879531829,
"learning_rate": 9.393635919041632e-10,
"loss": 0.2592,
"mean_token_accuracy": 0.9252937197685241,
"num_tokens": 44526708.0,
"step": 1270
},
{
"entropy": 0.320068359375,
"epoch": 2.0,
"mean_token_accuracy": 0.9173652082681656,
"num_tokens": 44672115.0,
"step": 1274,
"total_flos": 152753014702080.0,
"train_loss": 0.5267494187998809,
"train_runtime": 3760.0762,
"train_samples_per_second": 21.678,
"train_steps_per_second": 0.339
}
],
"logging_steps": 5,
"max_steps": 1274,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 152753014702080.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}