train_rte_789_1760637901 / trainer_state.json
rbelanec's picture
End of training
ea622bc verified
{
"best_global_step": 1122,
"best_metric": 0.10693030804395676,
"best_model_checkpoint": "saves_multiple/lora/llama-3-8b-instruct/train_rte_789_1760637901/checkpoint-1122",
"epoch": 20.0,
"eval_steps": 561,
"global_step": 11220,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008912655971479501,
"grad_norm": 0.10324911028146744,
"learning_rate": 1.7825311942959003e-07,
"loss": 0.2859,
"num_input_tokens_seen": 2848,
"step": 5
},
{
"epoch": 0.017825311942959002,
"grad_norm": 1.0517581701278687,
"learning_rate": 4.010695187165776e-07,
"loss": 0.098,
"num_input_tokens_seen": 6272,
"step": 10
},
{
"epoch": 0.026737967914438502,
"grad_norm": 0.4652565121650696,
"learning_rate": 6.238859180035651e-07,
"loss": 0.0333,
"num_input_tokens_seen": 9344,
"step": 15
},
{
"epoch": 0.035650623885918005,
"grad_norm": 3.560412645339966,
"learning_rate": 8.467023172905526e-07,
"loss": 0.2653,
"num_input_tokens_seen": 12672,
"step": 20
},
{
"epoch": 0.044563279857397504,
"grad_norm": 9.704721450805664,
"learning_rate": 1.0695187165775401e-06,
"loss": 0.6547,
"num_input_tokens_seen": 16000,
"step": 25
},
{
"epoch": 0.053475935828877004,
"grad_norm": 0.041648488491773605,
"learning_rate": 1.2923351158645277e-06,
"loss": 0.2366,
"num_input_tokens_seen": 19488,
"step": 30
},
{
"epoch": 0.062388591800356503,
"grad_norm": 4.751997470855713,
"learning_rate": 1.5151515151515152e-06,
"loss": 0.3026,
"num_input_tokens_seen": 23008,
"step": 35
},
{
"epoch": 0.07130124777183601,
"grad_norm": 5.809197902679443,
"learning_rate": 1.7379679144385028e-06,
"loss": 0.1454,
"num_input_tokens_seen": 25728,
"step": 40
},
{
"epoch": 0.08021390374331551,
"grad_norm": 4.5152974128723145,
"learning_rate": 1.96078431372549e-06,
"loss": 0.2345,
"num_input_tokens_seen": 28672,
"step": 45
},
{
"epoch": 0.08912655971479501,
"grad_norm": 6.366511821746826,
"learning_rate": 2.1836007130124777e-06,
"loss": 0.2825,
"num_input_tokens_seen": 31360,
"step": 50
},
{
"epoch": 0.09803921568627451,
"grad_norm": 0.06518510729074478,
"learning_rate": 2.4064171122994653e-06,
"loss": 0.3811,
"num_input_tokens_seen": 34432,
"step": 55
},
{
"epoch": 0.10695187165775401,
"grad_norm": 3.4774608612060547,
"learning_rate": 2.629233511586453e-06,
"loss": 0.4573,
"num_input_tokens_seen": 37920,
"step": 60
},
{
"epoch": 0.11586452762923351,
"grad_norm": 8.832063674926758,
"learning_rate": 2.8520499108734404e-06,
"loss": 0.3591,
"num_input_tokens_seen": 41344,
"step": 65
},
{
"epoch": 0.12477718360071301,
"grad_norm": 6.110069274902344,
"learning_rate": 3.074866310160428e-06,
"loss": 0.0523,
"num_input_tokens_seen": 44448,
"step": 70
},
{
"epoch": 0.13368983957219252,
"grad_norm": 0.27768051624298096,
"learning_rate": 3.297682709447415e-06,
"loss": 0.3264,
"num_input_tokens_seen": 48000,
"step": 75
},
{
"epoch": 0.14260249554367202,
"grad_norm": 6.778256416320801,
"learning_rate": 3.5204991087344027e-06,
"loss": 0.5562,
"num_input_tokens_seen": 50912,
"step": 80
},
{
"epoch": 0.15151515151515152,
"grad_norm": 0.3445241153240204,
"learning_rate": 3.7433155080213903e-06,
"loss": 0.1069,
"num_input_tokens_seen": 54272,
"step": 85
},
{
"epoch": 0.16042780748663102,
"grad_norm": 5.583451271057129,
"learning_rate": 3.966131907308377e-06,
"loss": 0.2782,
"num_input_tokens_seen": 57632,
"step": 90
},
{
"epoch": 0.16934046345811052,
"grad_norm": 4.155448913574219,
"learning_rate": 4.188948306595366e-06,
"loss": 0.1467,
"num_input_tokens_seen": 60736,
"step": 95
},
{
"epoch": 0.17825311942959002,
"grad_norm": 6.108593940734863,
"learning_rate": 4.411764705882353e-06,
"loss": 0.2713,
"num_input_tokens_seen": 64032,
"step": 100
},
{
"epoch": 0.18716577540106952,
"grad_norm": 1.6728438138961792,
"learning_rate": 4.6345811051693405e-06,
"loss": 0.4227,
"num_input_tokens_seen": 67424,
"step": 105
},
{
"epoch": 0.19607843137254902,
"grad_norm": 6.609097003936768,
"learning_rate": 4.8573975044563285e-06,
"loss": 0.1545,
"num_input_tokens_seen": 70048,
"step": 110
},
{
"epoch": 0.20499108734402852,
"grad_norm": 2.8476388454437256,
"learning_rate": 5.080213903743316e-06,
"loss": 0.1301,
"num_input_tokens_seen": 73376,
"step": 115
},
{
"epoch": 0.21390374331550802,
"grad_norm": 2.5001397132873535,
"learning_rate": 5.303030303030304e-06,
"loss": 0.2149,
"num_input_tokens_seen": 76224,
"step": 120
},
{
"epoch": 0.22281639928698752,
"grad_norm": 0.32793930172920227,
"learning_rate": 5.525846702317291e-06,
"loss": 0.0202,
"num_input_tokens_seen": 79712,
"step": 125
},
{
"epoch": 0.23172905525846701,
"grad_norm": 1.1511719226837158,
"learning_rate": 5.748663101604279e-06,
"loss": 0.1234,
"num_input_tokens_seen": 82336,
"step": 130
},
{
"epoch": 0.24064171122994651,
"grad_norm": 2.1722285747528076,
"learning_rate": 5.971479500891266e-06,
"loss": 0.2654,
"num_input_tokens_seen": 85376,
"step": 135
},
{
"epoch": 0.24955436720142601,
"grad_norm": 1.2259153127670288,
"learning_rate": 6.194295900178253e-06,
"loss": 0.1989,
"num_input_tokens_seen": 88992,
"step": 140
},
{
"epoch": 0.25846702317290554,
"grad_norm": 2.9561991691589355,
"learning_rate": 6.417112299465242e-06,
"loss": 0.2072,
"num_input_tokens_seen": 91840,
"step": 145
},
{
"epoch": 0.26737967914438504,
"grad_norm": 3.219242811203003,
"learning_rate": 6.639928698752229e-06,
"loss": 0.081,
"num_input_tokens_seen": 95424,
"step": 150
},
{
"epoch": 0.27629233511586454,
"grad_norm": 2.12544584274292,
"learning_rate": 6.862745098039216e-06,
"loss": 0.1405,
"num_input_tokens_seen": 98016,
"step": 155
},
{
"epoch": 0.28520499108734404,
"grad_norm": 2.4310574531555176,
"learning_rate": 7.085561497326204e-06,
"loss": 0.1359,
"num_input_tokens_seen": 101184,
"step": 160
},
{
"epoch": 0.29411764705882354,
"grad_norm": 1.650681495666504,
"learning_rate": 7.308377896613191e-06,
"loss": 0.1142,
"num_input_tokens_seen": 103776,
"step": 165
},
{
"epoch": 0.30303030303030304,
"grad_norm": 5.215426445007324,
"learning_rate": 7.531194295900179e-06,
"loss": 0.1139,
"num_input_tokens_seen": 107104,
"step": 170
},
{
"epoch": 0.31194295900178254,
"grad_norm": 3.0023438930511475,
"learning_rate": 7.754010695187166e-06,
"loss": 0.1606,
"num_input_tokens_seen": 109728,
"step": 175
},
{
"epoch": 0.32085561497326204,
"grad_norm": 0.5228773951530457,
"learning_rate": 7.976827094474154e-06,
"loss": 0.1457,
"num_input_tokens_seen": 112224,
"step": 180
},
{
"epoch": 0.32976827094474154,
"grad_norm": 1.5815906524658203,
"learning_rate": 8.19964349376114e-06,
"loss": 0.1591,
"num_input_tokens_seen": 115776,
"step": 185
},
{
"epoch": 0.33868092691622104,
"grad_norm": 1.2593677043914795,
"learning_rate": 8.42245989304813e-06,
"loss": 0.0762,
"num_input_tokens_seen": 118272,
"step": 190
},
{
"epoch": 0.34759358288770054,
"grad_norm": 3.1801679134368896,
"learning_rate": 8.645276292335117e-06,
"loss": 0.1485,
"num_input_tokens_seen": 121120,
"step": 195
},
{
"epoch": 0.35650623885918004,
"grad_norm": 13.186616897583008,
"learning_rate": 8.868092691622104e-06,
"loss": 0.1945,
"num_input_tokens_seen": 124096,
"step": 200
},
{
"epoch": 0.36541889483065954,
"grad_norm": 1.0450903177261353,
"learning_rate": 9.090909090909091e-06,
"loss": 0.2069,
"num_input_tokens_seen": 127328,
"step": 205
},
{
"epoch": 0.37433155080213903,
"grad_norm": 7.1771650314331055,
"learning_rate": 9.31372549019608e-06,
"loss": 0.1029,
"num_input_tokens_seen": 130400,
"step": 210
},
{
"epoch": 0.38324420677361853,
"grad_norm": 1.999472975730896,
"learning_rate": 9.536541889483067e-06,
"loss": 0.1051,
"num_input_tokens_seen": 134272,
"step": 215
},
{
"epoch": 0.39215686274509803,
"grad_norm": 0.8615163564682007,
"learning_rate": 9.759358288770054e-06,
"loss": 0.0401,
"num_input_tokens_seen": 136480,
"step": 220
},
{
"epoch": 0.40106951871657753,
"grad_norm": 1.6216832399368286,
"learning_rate": 9.982174688057041e-06,
"loss": 0.0511,
"num_input_tokens_seen": 139872,
"step": 225
},
{
"epoch": 0.40998217468805703,
"grad_norm": 1.0488457679748535,
"learning_rate": 1.0204991087344028e-05,
"loss": 0.0885,
"num_input_tokens_seen": 143200,
"step": 230
},
{
"epoch": 0.41889483065953653,
"grad_norm": 1.4865139722824097,
"learning_rate": 1.0427807486631017e-05,
"loss": 0.1056,
"num_input_tokens_seen": 146752,
"step": 235
},
{
"epoch": 0.42780748663101603,
"grad_norm": 8.77585506439209,
"learning_rate": 1.0650623885918004e-05,
"loss": 0.1464,
"num_input_tokens_seen": 149952,
"step": 240
},
{
"epoch": 0.43672014260249553,
"grad_norm": 5.2574238777160645,
"learning_rate": 1.0873440285204992e-05,
"loss": 0.1099,
"num_input_tokens_seen": 153184,
"step": 245
},
{
"epoch": 0.44563279857397503,
"grad_norm": 2.0072691440582275,
"learning_rate": 1.1096256684491979e-05,
"loss": 0.127,
"num_input_tokens_seen": 156864,
"step": 250
},
{
"epoch": 0.45454545454545453,
"grad_norm": 1.7638945579528809,
"learning_rate": 1.1319073083778966e-05,
"loss": 0.0674,
"num_input_tokens_seen": 159680,
"step": 255
},
{
"epoch": 0.46345811051693403,
"grad_norm": 1.7958506345748901,
"learning_rate": 1.1541889483065955e-05,
"loss": 0.0197,
"num_input_tokens_seen": 162912,
"step": 260
},
{
"epoch": 0.47237076648841353,
"grad_norm": 7.391216278076172,
"learning_rate": 1.1764705882352942e-05,
"loss": 0.0956,
"num_input_tokens_seen": 165632,
"step": 265
},
{
"epoch": 0.48128342245989303,
"grad_norm": 0.057393141090869904,
"learning_rate": 1.1987522281639929e-05,
"loss": 0.1021,
"num_input_tokens_seen": 168960,
"step": 270
},
{
"epoch": 0.49019607843137253,
"grad_norm": 9.154584884643555,
"learning_rate": 1.2210338680926916e-05,
"loss": 0.1136,
"num_input_tokens_seen": 171936,
"step": 275
},
{
"epoch": 0.49910873440285203,
"grad_norm": 2.9709866046905518,
"learning_rate": 1.2433155080213903e-05,
"loss": 0.0711,
"num_input_tokens_seen": 175200,
"step": 280
},
{
"epoch": 0.5080213903743316,
"grad_norm": 2.884493589401245,
"learning_rate": 1.2655971479500892e-05,
"loss": 0.0415,
"num_input_tokens_seen": 177728,
"step": 285
},
{
"epoch": 0.5169340463458111,
"grad_norm": 3.1335482597351074,
"learning_rate": 1.287878787878788e-05,
"loss": 0.0946,
"num_input_tokens_seen": 180768,
"step": 290
},
{
"epoch": 0.5258467023172906,
"grad_norm": 4.962701320648193,
"learning_rate": 1.3101604278074866e-05,
"loss": 0.0913,
"num_input_tokens_seen": 183584,
"step": 295
},
{
"epoch": 0.5347593582887701,
"grad_norm": 3.826408624649048,
"learning_rate": 1.3324420677361854e-05,
"loss": 0.0802,
"num_input_tokens_seen": 186304,
"step": 300
},
{
"epoch": 0.5436720142602496,
"grad_norm": 4.077633380889893,
"learning_rate": 1.3547237076648842e-05,
"loss": 0.0612,
"num_input_tokens_seen": 189472,
"step": 305
},
{
"epoch": 0.5525846702317291,
"grad_norm": 0.1405235230922699,
"learning_rate": 1.377005347593583e-05,
"loss": 0.0308,
"num_input_tokens_seen": 192384,
"step": 310
},
{
"epoch": 0.5614973262032086,
"grad_norm": 2.734724521636963,
"learning_rate": 1.3992869875222817e-05,
"loss": 0.0437,
"num_input_tokens_seen": 195328,
"step": 315
},
{
"epoch": 0.5704099821746881,
"grad_norm": 8.127835273742676,
"learning_rate": 1.4215686274509804e-05,
"loss": 0.3415,
"num_input_tokens_seen": 197888,
"step": 320
},
{
"epoch": 0.5793226381461676,
"grad_norm": 8.873647689819336,
"learning_rate": 1.4438502673796791e-05,
"loss": 0.0985,
"num_input_tokens_seen": 200672,
"step": 325
},
{
"epoch": 0.5882352941176471,
"grad_norm": 1.7725785970687866,
"learning_rate": 1.466131907308378e-05,
"loss": 0.1717,
"num_input_tokens_seen": 203904,
"step": 330
},
{
"epoch": 0.5971479500891266,
"grad_norm": 1.473873257637024,
"learning_rate": 1.4884135472370767e-05,
"loss": 0.0824,
"num_input_tokens_seen": 206688,
"step": 335
},
{
"epoch": 0.6060606060606061,
"grad_norm": 0.9841282963752747,
"learning_rate": 1.5106951871657754e-05,
"loss": 0.0436,
"num_input_tokens_seen": 208832,
"step": 340
},
{
"epoch": 0.6149732620320856,
"grad_norm": 4.264573097229004,
"learning_rate": 1.532976827094474e-05,
"loss": 0.1102,
"num_input_tokens_seen": 211488,
"step": 345
},
{
"epoch": 0.6238859180035651,
"grad_norm": 1.2473746538162231,
"learning_rate": 1.555258467023173e-05,
"loss": 0.0409,
"num_input_tokens_seen": 214080,
"step": 350
},
{
"epoch": 0.6327985739750446,
"grad_norm": 2.1777610778808594,
"learning_rate": 1.5775401069518716e-05,
"loss": 0.0346,
"num_input_tokens_seen": 217728,
"step": 355
},
{
"epoch": 0.6417112299465241,
"grad_norm": 0.04029938206076622,
"learning_rate": 1.5998217468805704e-05,
"loss": 0.0371,
"num_input_tokens_seen": 220640,
"step": 360
},
{
"epoch": 0.6506238859180036,
"grad_norm": 1.2787014245986938,
"learning_rate": 1.6221033868092693e-05,
"loss": 0.1806,
"num_input_tokens_seen": 223328,
"step": 365
},
{
"epoch": 0.6595365418894831,
"grad_norm": 6.66487979888916,
"learning_rate": 1.644385026737968e-05,
"loss": 0.0279,
"num_input_tokens_seen": 226336,
"step": 370
},
{
"epoch": 0.6684491978609626,
"grad_norm": 1.7244598865509033,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.1167,
"num_input_tokens_seen": 228896,
"step": 375
},
{
"epoch": 0.6773618538324421,
"grad_norm": 0.15285556018352509,
"learning_rate": 1.6889483065953653e-05,
"loss": 0.1697,
"num_input_tokens_seen": 231584,
"step": 380
},
{
"epoch": 0.6862745098039216,
"grad_norm": 7.496185779571533,
"learning_rate": 1.7112299465240642e-05,
"loss": 0.2399,
"num_input_tokens_seen": 235552,
"step": 385
},
{
"epoch": 0.6951871657754011,
"grad_norm": 2.4987404346466064,
"learning_rate": 1.733511586452763e-05,
"loss": 0.0541,
"num_input_tokens_seen": 238784,
"step": 390
},
{
"epoch": 0.7040998217468806,
"grad_norm": 0.9350168108940125,
"learning_rate": 1.7557932263814616e-05,
"loss": 0.1148,
"num_input_tokens_seen": 241728,
"step": 395
},
{
"epoch": 0.7130124777183601,
"grad_norm": 11.738545417785645,
"learning_rate": 1.7780748663101605e-05,
"loss": 0.1737,
"num_input_tokens_seen": 244544,
"step": 400
},
{
"epoch": 0.7219251336898396,
"grad_norm": 0.8024265170097351,
"learning_rate": 1.800356506238859e-05,
"loss": 0.0953,
"num_input_tokens_seen": 247008,
"step": 405
},
{
"epoch": 0.7308377896613191,
"grad_norm": 0.6156238317489624,
"learning_rate": 1.822638146167558e-05,
"loss": 0.0574,
"num_input_tokens_seen": 250368,
"step": 410
},
{
"epoch": 0.7397504456327986,
"grad_norm": 2.1485512256622314,
"learning_rate": 1.8449197860962568e-05,
"loss": 0.0385,
"num_input_tokens_seen": 253216,
"step": 415
},
{
"epoch": 0.7486631016042781,
"grad_norm": 0.7714303135871887,
"learning_rate": 1.8672014260249553e-05,
"loss": 0.0446,
"num_input_tokens_seen": 257216,
"step": 420
},
{
"epoch": 0.7575757575757576,
"grad_norm": 0.0683104544878006,
"learning_rate": 1.8894830659536542e-05,
"loss": 0.1535,
"num_input_tokens_seen": 261088,
"step": 425
},
{
"epoch": 0.7664884135472371,
"grad_norm": 0.10318350046873093,
"learning_rate": 1.9117647058823528e-05,
"loss": 0.057,
"num_input_tokens_seen": 264288,
"step": 430
},
{
"epoch": 0.7754010695187166,
"grad_norm": 3.8079121112823486,
"learning_rate": 1.9340463458110517e-05,
"loss": 0.1,
"num_input_tokens_seen": 266816,
"step": 435
},
{
"epoch": 0.7843137254901961,
"grad_norm": 6.305472373962402,
"learning_rate": 1.9563279857397505e-05,
"loss": 0.0466,
"num_input_tokens_seen": 269760,
"step": 440
},
{
"epoch": 0.7932263814616756,
"grad_norm": 0.969581663608551,
"learning_rate": 1.9786096256684494e-05,
"loss": 0.0307,
"num_input_tokens_seen": 272512,
"step": 445
},
{
"epoch": 0.8021390374331551,
"grad_norm": 0.7665776610374451,
"learning_rate": 2.0008912655971483e-05,
"loss": 0.1512,
"num_input_tokens_seen": 276256,
"step": 450
},
{
"epoch": 0.8110516934046346,
"grad_norm": 2.4583513736724854,
"learning_rate": 2.023172905525847e-05,
"loss": 0.0516,
"num_input_tokens_seen": 279456,
"step": 455
},
{
"epoch": 0.8199643493761141,
"grad_norm": 11.411388397216797,
"learning_rate": 2.0454545454545457e-05,
"loss": 0.1566,
"num_input_tokens_seen": 282496,
"step": 460
},
{
"epoch": 0.8288770053475936,
"grad_norm": 4.11398458480835,
"learning_rate": 2.0677361853832443e-05,
"loss": 0.139,
"num_input_tokens_seen": 286080,
"step": 465
},
{
"epoch": 0.8377896613190731,
"grad_norm": 0.013057722710072994,
"learning_rate": 2.090017825311943e-05,
"loss": 0.1,
"num_input_tokens_seen": 288640,
"step": 470
},
{
"epoch": 0.8467023172905526,
"grad_norm": 0.1136278584599495,
"learning_rate": 2.112299465240642e-05,
"loss": 0.0078,
"num_input_tokens_seen": 291840,
"step": 475
},
{
"epoch": 0.8556149732620321,
"grad_norm": 3.418334722518921,
"learning_rate": 2.1345811051693406e-05,
"loss": 0.0275,
"num_input_tokens_seen": 294496,
"step": 480
},
{
"epoch": 0.8645276292335116,
"grad_norm": 3.2661521434783936,
"learning_rate": 2.1568627450980395e-05,
"loss": 0.1015,
"num_input_tokens_seen": 297664,
"step": 485
},
{
"epoch": 0.8734402852049911,
"grad_norm": 4.954894542694092,
"learning_rate": 2.179144385026738e-05,
"loss": 0.0496,
"num_input_tokens_seen": 300608,
"step": 490
},
{
"epoch": 0.8823529411764706,
"grad_norm": 3.7999918460845947,
"learning_rate": 2.201426024955437e-05,
"loss": 0.2225,
"num_input_tokens_seen": 303808,
"step": 495
},
{
"epoch": 0.8912655971479501,
"grad_norm": 0.635353684425354,
"learning_rate": 2.2237076648841358e-05,
"loss": 0.0955,
"num_input_tokens_seen": 307904,
"step": 500
},
{
"epoch": 0.9001782531194296,
"grad_norm": 0.8217610120773315,
"learning_rate": 2.2459893048128343e-05,
"loss": 0.1362,
"num_input_tokens_seen": 311968,
"step": 505
},
{
"epoch": 0.9090909090909091,
"grad_norm": 1.324615716934204,
"learning_rate": 2.2682709447415332e-05,
"loss": 0.0888,
"num_input_tokens_seen": 315232,
"step": 510
},
{
"epoch": 0.9180035650623886,
"grad_norm": 0.03214741870760918,
"learning_rate": 2.2905525846702318e-05,
"loss": 0.0141,
"num_input_tokens_seen": 317792,
"step": 515
},
{
"epoch": 0.9269162210338681,
"grad_norm": 5.245635032653809,
"learning_rate": 2.3128342245989306e-05,
"loss": 0.0916,
"num_input_tokens_seen": 320160,
"step": 520
},
{
"epoch": 0.9358288770053476,
"grad_norm": 3.70466685295105,
"learning_rate": 2.3351158645276295e-05,
"loss": 0.1095,
"num_input_tokens_seen": 323680,
"step": 525
},
{
"epoch": 0.9447415329768271,
"grad_norm": 0.4552616477012634,
"learning_rate": 2.357397504456328e-05,
"loss": 0.0635,
"num_input_tokens_seen": 326464,
"step": 530
},
{
"epoch": 0.9536541889483066,
"grad_norm": 0.10007640719413757,
"learning_rate": 2.379679144385027e-05,
"loss": 0.0561,
"num_input_tokens_seen": 329632,
"step": 535
},
{
"epoch": 0.9625668449197861,
"grad_norm": 0.3329647183418274,
"learning_rate": 2.401960784313726e-05,
"loss": 0.0307,
"num_input_tokens_seen": 333344,
"step": 540
},
{
"epoch": 0.9714795008912656,
"grad_norm": 0.3274289071559906,
"learning_rate": 2.4242424242424244e-05,
"loss": 0.0247,
"num_input_tokens_seen": 337120,
"step": 545
},
{
"epoch": 0.9803921568627451,
"grad_norm": 0.05923886224627495,
"learning_rate": 2.4465240641711233e-05,
"loss": 0.002,
"num_input_tokens_seen": 340416,
"step": 550
},
{
"epoch": 0.9893048128342246,
"grad_norm": 4.643728733062744,
"learning_rate": 2.4688057040998218e-05,
"loss": 0.1167,
"num_input_tokens_seen": 344160,
"step": 555
},
{
"epoch": 0.9982174688057041,
"grad_norm": 0.12344362586736679,
"learning_rate": 2.4910873440285207e-05,
"loss": 0.0084,
"num_input_tokens_seen": 347744,
"step": 560
},
{
"epoch": 1.0,
"eval_loss": 0.1087946742773056,
"eval_runtime": 4.5784,
"eval_samples_per_second": 54.386,
"eval_steps_per_second": 13.76,
"num_input_tokens_seen": 347936,
"step": 561
},
{
"epoch": 1.0071301247771836,
"grad_norm": 8.679508209228516,
"learning_rate": 2.5133689839572196e-05,
"loss": 0.1602,
"num_input_tokens_seen": 350816,
"step": 565
},
{
"epoch": 1.0160427807486632,
"grad_norm": 4.490102291107178,
"learning_rate": 2.5356506238859178e-05,
"loss": 0.157,
"num_input_tokens_seen": 353824,
"step": 570
},
{
"epoch": 1.0249554367201426,
"grad_norm": 5.047804355621338,
"learning_rate": 2.557932263814617e-05,
"loss": 0.1128,
"num_input_tokens_seen": 356960,
"step": 575
},
{
"epoch": 1.0338680926916222,
"grad_norm": 1.7584697008132935,
"learning_rate": 2.5802139037433156e-05,
"loss": 0.151,
"num_input_tokens_seen": 360288,
"step": 580
},
{
"epoch": 1.0427807486631016,
"grad_norm": 6.616455554962158,
"learning_rate": 2.6024955436720144e-05,
"loss": 0.0656,
"num_input_tokens_seen": 363552,
"step": 585
},
{
"epoch": 1.0516934046345812,
"grad_norm": 0.09188894182443619,
"learning_rate": 2.624777183600713e-05,
"loss": 0.0211,
"num_input_tokens_seen": 366976,
"step": 590
},
{
"epoch": 1.0606060606060606,
"grad_norm": 5.818865776062012,
"learning_rate": 2.647058823529412e-05,
"loss": 0.1186,
"num_input_tokens_seen": 369952,
"step": 595
},
{
"epoch": 1.0695187165775402,
"grad_norm": 0.9368240833282471,
"learning_rate": 2.6693404634581104e-05,
"loss": 0.0526,
"num_input_tokens_seen": 373248,
"step": 600
},
{
"epoch": 1.0784313725490196,
"grad_norm": 4.885313034057617,
"learning_rate": 2.6916221033868093e-05,
"loss": 0.0932,
"num_input_tokens_seen": 376512,
"step": 605
},
{
"epoch": 1.0873440285204992,
"grad_norm": 0.019336791709065437,
"learning_rate": 2.713903743315508e-05,
"loss": 0.0464,
"num_input_tokens_seen": 379200,
"step": 610
},
{
"epoch": 1.0962566844919786,
"grad_norm": 6.228531360626221,
"learning_rate": 2.736185383244207e-05,
"loss": 0.118,
"num_input_tokens_seen": 382656,
"step": 615
},
{
"epoch": 1.1051693404634582,
"grad_norm": 8.659507751464844,
"learning_rate": 2.7584670231729053e-05,
"loss": 0.0363,
"num_input_tokens_seen": 384928,
"step": 620
},
{
"epoch": 1.1140819964349375,
"grad_norm": 5.95408821105957,
"learning_rate": 2.7807486631016045e-05,
"loss": 0.0361,
"num_input_tokens_seen": 387968,
"step": 625
},
{
"epoch": 1.1229946524064172,
"grad_norm": 1.6606276035308838,
"learning_rate": 2.803030303030303e-05,
"loss": 0.0721,
"num_input_tokens_seen": 390848,
"step": 630
},
{
"epoch": 1.1319073083778965,
"grad_norm": 0.025730885565280914,
"learning_rate": 2.825311942959002e-05,
"loss": 0.0982,
"num_input_tokens_seen": 393952,
"step": 635
},
{
"epoch": 1.1408199643493762,
"grad_norm": 0.515564501285553,
"learning_rate": 2.8475935828877005e-05,
"loss": 0.0538,
"num_input_tokens_seen": 397344,
"step": 640
},
{
"epoch": 1.1497326203208555,
"grad_norm": 1.605162262916565,
"learning_rate": 2.8698752228163994e-05,
"loss": 0.0718,
"num_input_tokens_seen": 400512,
"step": 645
},
{
"epoch": 1.1586452762923352,
"grad_norm": 4.248813629150391,
"learning_rate": 2.8921568627450986e-05,
"loss": 0.0573,
"num_input_tokens_seen": 403872,
"step": 650
},
{
"epoch": 1.1675579322638145,
"grad_norm": 4.230849266052246,
"learning_rate": 2.9144385026737968e-05,
"loss": 0.0858,
"num_input_tokens_seen": 406752,
"step": 655
},
{
"epoch": 1.1764705882352942,
"grad_norm": 0.38777992129325867,
"learning_rate": 2.936720142602496e-05,
"loss": 0.0643,
"num_input_tokens_seen": 410080,
"step": 660
},
{
"epoch": 1.1853832442067735,
"grad_norm": 8.435359001159668,
"learning_rate": 2.9590017825311946e-05,
"loss": 0.0513,
"num_input_tokens_seen": 413184,
"step": 665
},
{
"epoch": 1.1942959001782532,
"grad_norm": 0.2321002185344696,
"learning_rate": 2.9812834224598934e-05,
"loss": 0.0781,
"num_input_tokens_seen": 416224,
"step": 670
},
{
"epoch": 1.2032085561497325,
"grad_norm": 0.8981883525848389,
"learning_rate": 3.003565062388592e-05,
"loss": 0.0792,
"num_input_tokens_seen": 419776,
"step": 675
},
{
"epoch": 1.2121212121212122,
"grad_norm": 4.635167121887207,
"learning_rate": 3.025846702317291e-05,
"loss": 0.065,
"num_input_tokens_seen": 423008,
"step": 680
},
{
"epoch": 1.2210338680926915,
"grad_norm": 0.7758586406707764,
"learning_rate": 3.0481283422459894e-05,
"loss": 0.0854,
"num_input_tokens_seen": 426112,
"step": 685
},
{
"epoch": 1.2299465240641712,
"grad_norm": 0.11127970367670059,
"learning_rate": 3.0704099821746886e-05,
"loss": 0.1187,
"num_input_tokens_seen": 429376,
"step": 690
},
{
"epoch": 1.2388591800356505,
"grad_norm": 2.0615482330322266,
"learning_rate": 3.092691622103387e-05,
"loss": 0.0345,
"num_input_tokens_seen": 432480,
"step": 695
},
{
"epoch": 1.2477718360071302,
"grad_norm": 0.4409101605415344,
"learning_rate": 3.114973262032086e-05,
"loss": 0.0525,
"num_input_tokens_seen": 435392,
"step": 700
},
{
"epoch": 1.2566844919786098,
"grad_norm": 5.350732326507568,
"learning_rate": 3.137254901960784e-05,
"loss": 0.0874,
"num_input_tokens_seen": 438816,
"step": 705
},
{
"epoch": 1.2655971479500892,
"grad_norm": 1.308171033859253,
"learning_rate": 3.1595365418894835e-05,
"loss": 0.0303,
"num_input_tokens_seen": 441792,
"step": 710
},
{
"epoch": 1.2745098039215685,
"grad_norm": 1.4284766912460327,
"learning_rate": 3.181818181818182e-05,
"loss": 0.0395,
"num_input_tokens_seen": 445216,
"step": 715
},
{
"epoch": 1.2834224598930482,
"grad_norm": 2.7852184772491455,
"learning_rate": 3.204099821746881e-05,
"loss": 0.0206,
"num_input_tokens_seen": 447136,
"step": 720
},
{
"epoch": 1.2923351158645278,
"grad_norm": 0.03415602445602417,
"learning_rate": 3.226381461675579e-05,
"loss": 0.0704,
"num_input_tokens_seen": 449728,
"step": 725
},
{
"epoch": 1.3012477718360071,
"grad_norm": 0.20296916365623474,
"learning_rate": 3.2486631016042783e-05,
"loss": 0.0941,
"num_input_tokens_seen": 452704,
"step": 730
},
{
"epoch": 1.3101604278074865,
"grad_norm": 0.4193146824836731,
"learning_rate": 3.270944741532977e-05,
"loss": 0.0214,
"num_input_tokens_seen": 455488,
"step": 735
},
{
"epoch": 1.3190730837789661,
"grad_norm": 0.00828193873167038,
"learning_rate": 3.293226381461676e-05,
"loss": 0.0095,
"num_input_tokens_seen": 458592,
"step": 740
},
{
"epoch": 1.3279857397504458,
"grad_norm": 0.07712722569704056,
"learning_rate": 3.3155080213903747e-05,
"loss": 0.0574,
"num_input_tokens_seen": 461440,
"step": 745
},
{
"epoch": 1.3368983957219251,
"grad_norm": 6.220320224761963,
"learning_rate": 3.337789661319073e-05,
"loss": 0.1225,
"num_input_tokens_seen": 465184,
"step": 750
},
{
"epoch": 1.3458110516934045,
"grad_norm": 6.957297325134277,
"learning_rate": 3.360071301247772e-05,
"loss": 0.1108,
"num_input_tokens_seen": 468320,
"step": 755
},
{
"epoch": 1.3547237076648841,
"grad_norm": 0.24312429130077362,
"learning_rate": 3.382352941176471e-05,
"loss": 0.0746,
"num_input_tokens_seen": 471456,
"step": 760
},
{
"epoch": 1.3636363636363638,
"grad_norm": 4.236689567565918,
"learning_rate": 3.4046345811051695e-05,
"loss": 0.0884,
"num_input_tokens_seen": 474400,
"step": 765
},
{
"epoch": 1.3725490196078431,
"grad_norm": 1.9785674810409546,
"learning_rate": 3.426916221033869e-05,
"loss": 0.0374,
"num_input_tokens_seen": 476928,
"step": 770
},
{
"epoch": 1.3814616755793225,
"grad_norm": 3.5035243034362793,
"learning_rate": 3.4491978609625666e-05,
"loss": 0.0541,
"num_input_tokens_seen": 480192,
"step": 775
},
{
"epoch": 1.3903743315508021,
"grad_norm": 2.0421881675720215,
"learning_rate": 3.471479500891266e-05,
"loss": 0.0793,
"num_input_tokens_seen": 483296,
"step": 780
},
{
"epoch": 1.3992869875222818,
"grad_norm": 0.9459624886512756,
"learning_rate": 3.4937611408199644e-05,
"loss": 0.0154,
"num_input_tokens_seen": 486368,
"step": 785
},
{
"epoch": 1.4081996434937611,
"grad_norm": 8.17954158782959,
"learning_rate": 3.5160427807486636e-05,
"loss": 0.0469,
"num_input_tokens_seen": 489280,
"step": 790
},
{
"epoch": 1.4171122994652405,
"grad_norm": 0.35897743701934814,
"learning_rate": 3.538324420677362e-05,
"loss": 0.013,
"num_input_tokens_seen": 492704,
"step": 795
},
{
"epoch": 1.4260249554367201,
"grad_norm": 3.8780298233032227,
"learning_rate": 3.560606060606061e-05,
"loss": 0.033,
"num_input_tokens_seen": 495808,
"step": 800
},
{
"epoch": 1.4349376114081998,
"grad_norm": 0.00485863396897912,
"learning_rate": 3.582887700534759e-05,
"loss": 0.0335,
"num_input_tokens_seen": 499552,
"step": 805
},
{
"epoch": 1.4438502673796791,
"grad_norm": 0.05903235450387001,
"learning_rate": 3.6051693404634585e-05,
"loss": 0.1034,
"num_input_tokens_seen": 502592,
"step": 810
},
{
"epoch": 1.4527629233511585,
"grad_norm": 0.00699259340763092,
"learning_rate": 3.627450980392157e-05,
"loss": 0.0126,
"num_input_tokens_seen": 505408,
"step": 815
},
{
"epoch": 1.4616755793226381,
"grad_norm": 7.649658679962158,
"learning_rate": 3.649732620320856e-05,
"loss": 0.1106,
"num_input_tokens_seen": 507904,
"step": 820
},
{
"epoch": 1.4705882352941178,
"grad_norm": 0.009012856520712376,
"learning_rate": 3.672014260249554e-05,
"loss": 0.0749,
"num_input_tokens_seen": 510688,
"step": 825
},
{
"epoch": 1.4795008912655971,
"grad_norm": 0.829964280128479,
"learning_rate": 3.694295900178253e-05,
"loss": 0.0826,
"num_input_tokens_seen": 514240,
"step": 830
},
{
"epoch": 1.4884135472370765,
"grad_norm": 12.730154037475586,
"learning_rate": 3.716577540106952e-05,
"loss": 0.063,
"num_input_tokens_seen": 517568,
"step": 835
},
{
"epoch": 1.4973262032085561,
"grad_norm": 6.636897563934326,
"learning_rate": 3.738859180035651e-05,
"loss": 0.0932,
"num_input_tokens_seen": 520704,
"step": 840
},
{
"epoch": 1.5062388591800357,
"grad_norm": 5.837046146392822,
"learning_rate": 3.7611408199643496e-05,
"loss": 0.0382,
"num_input_tokens_seen": 524320,
"step": 845
},
{
"epoch": 1.5151515151515151,
"grad_norm": 2.597928285598755,
"learning_rate": 3.783422459893048e-05,
"loss": 0.0211,
"num_input_tokens_seen": 527104,
"step": 850
},
{
"epoch": 1.5240641711229945,
"grad_norm": 1.7771317958831787,
"learning_rate": 3.805704099821747e-05,
"loss": 0.0126,
"num_input_tokens_seen": 530624,
"step": 855
},
{
"epoch": 1.5329768270944741,
"grad_norm": 11.486368179321289,
"learning_rate": 3.827985739750446e-05,
"loss": 0.1684,
"num_input_tokens_seen": 533504,
"step": 860
},
{
"epoch": 1.5418894830659537,
"grad_norm": 0.2925088107585907,
"learning_rate": 3.8502673796791445e-05,
"loss": 0.0023,
"num_input_tokens_seen": 536992,
"step": 865
},
{
"epoch": 1.5508021390374331,
"grad_norm": 5.498477458953857,
"learning_rate": 3.872549019607844e-05,
"loss": 0.117,
"num_input_tokens_seen": 539840,
"step": 870
},
{
"epoch": 1.5597147950089125,
"grad_norm": 8.601882934570312,
"learning_rate": 3.894830659536542e-05,
"loss": 0.1269,
"num_input_tokens_seen": 543008,
"step": 875
},
{
"epoch": 1.5686274509803921,
"grad_norm": 0.4930436909198761,
"learning_rate": 3.917112299465241e-05,
"loss": 0.076,
"num_input_tokens_seen": 546144,
"step": 880
},
{
"epoch": 1.5775401069518717,
"grad_norm": 4.17765474319458,
"learning_rate": 3.939393939393939e-05,
"loss": 0.0449,
"num_input_tokens_seen": 549440,
"step": 885
},
{
"epoch": 1.5864527629233511,
"grad_norm": 0.14472661912441254,
"learning_rate": 3.9616755793226386e-05,
"loss": 0.0066,
"num_input_tokens_seen": 552608,
"step": 890
},
{
"epoch": 1.5953654188948305,
"grad_norm": 0.04021904617547989,
"learning_rate": 3.983957219251337e-05,
"loss": 0.0097,
"num_input_tokens_seen": 555424,
"step": 895
},
{
"epoch": 1.6042780748663101,
"grad_norm": 0.017523914575576782,
"learning_rate": 4.0062388591800356e-05,
"loss": 0.047,
"num_input_tokens_seen": 558720,
"step": 900
},
{
"epoch": 1.6131907308377897,
"grad_norm": 0.019056592136621475,
"learning_rate": 4.028520499108734e-05,
"loss": 0.0817,
"num_input_tokens_seen": 560992,
"step": 905
},
{
"epoch": 1.6221033868092691,
"grad_norm": 11.486791610717773,
"learning_rate": 4.0508021390374334e-05,
"loss": 0.0296,
"num_input_tokens_seen": 563808,
"step": 910
},
{
"epoch": 1.6310160427807485,
"grad_norm": 0.019042672589421272,
"learning_rate": 4.073083778966132e-05,
"loss": 0.0296,
"num_input_tokens_seen": 566752,
"step": 915
},
{
"epoch": 1.6399286987522281,
"grad_norm": 0.024380512535572052,
"learning_rate": 4.095365418894831e-05,
"loss": 0.0492,
"num_input_tokens_seen": 570048,
"step": 920
},
{
"epoch": 1.6488413547237077,
"grad_norm": 0.14979250729084015,
"learning_rate": 4.11764705882353e-05,
"loss": 0.0183,
"num_input_tokens_seen": 573312,
"step": 925
},
{
"epoch": 1.6577540106951871,
"grad_norm": 0.011035463772714138,
"learning_rate": 4.139928698752228e-05,
"loss": 0.0249,
"num_input_tokens_seen": 576160,
"step": 930
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.15351063013076782,
"learning_rate": 4.162210338680927e-05,
"loss": 0.0384,
"num_input_tokens_seen": 578976,
"step": 935
},
{
"epoch": 1.6755793226381461,
"grad_norm": 7.670846462249756,
"learning_rate": 4.184491978609626e-05,
"loss": 0.1065,
"num_input_tokens_seen": 581856,
"step": 940
},
{
"epoch": 1.6844919786096257,
"grad_norm": 0.29702576994895935,
"learning_rate": 4.2067736185383246e-05,
"loss": 0.068,
"num_input_tokens_seen": 584704,
"step": 945
},
{
"epoch": 1.6934046345811051,
"grad_norm": 0.04708877205848694,
"learning_rate": 4.229055258467023e-05,
"loss": 0.0205,
"num_input_tokens_seen": 587776,
"step": 950
},
{
"epoch": 1.7023172905525845,
"grad_norm": 1.0828979015350342,
"learning_rate": 4.251336898395722e-05,
"loss": 0.0402,
"num_input_tokens_seen": 590848,
"step": 955
},
{
"epoch": 1.7112299465240641,
"grad_norm": 13.534217834472656,
"learning_rate": 4.273618538324421e-05,
"loss": 0.2889,
"num_input_tokens_seen": 594048,
"step": 960
},
{
"epoch": 1.7201426024955437,
"grad_norm": 0.07279043644666672,
"learning_rate": 4.2959001782531194e-05,
"loss": 0.0494,
"num_input_tokens_seen": 596800,
"step": 965
},
{
"epoch": 1.7290552584670231,
"grad_norm": 6.436086177825928,
"learning_rate": 4.318181818181819e-05,
"loss": 0.0643,
"num_input_tokens_seen": 599552,
"step": 970
},
{
"epoch": 1.7379679144385025,
"grad_norm": 0.8411062955856323,
"learning_rate": 4.340463458110517e-05,
"loss": 0.0762,
"num_input_tokens_seen": 602816,
"step": 975
},
{
"epoch": 1.7468805704099821,
"grad_norm": 11.923518180847168,
"learning_rate": 4.362745098039216e-05,
"loss": 0.1196,
"num_input_tokens_seen": 605280,
"step": 980
},
{
"epoch": 1.7557932263814617,
"grad_norm": 2.0461044311523438,
"learning_rate": 4.385026737967914e-05,
"loss": 0.0464,
"num_input_tokens_seen": 608096,
"step": 985
},
{
"epoch": 1.7647058823529411,
"grad_norm": 0.1000722125172615,
"learning_rate": 4.4073083778966135e-05,
"loss": 0.0015,
"num_input_tokens_seen": 611328,
"step": 990
},
{
"epoch": 1.7736185383244205,
"grad_norm": 6.380837917327881,
"learning_rate": 4.429590017825312e-05,
"loss": 0.0903,
"num_input_tokens_seen": 614304,
"step": 995
},
{
"epoch": 1.7825311942959001,
"grad_norm": 2.4291083812713623,
"learning_rate": 4.4518716577540106e-05,
"loss": 0.0074,
"num_input_tokens_seen": 617952,
"step": 1000
},
{
"epoch": 1.7914438502673797,
"grad_norm": 11.327752113342285,
"learning_rate": 4.474153297682709e-05,
"loss": 0.0638,
"num_input_tokens_seen": 621120,
"step": 1005
},
{
"epoch": 1.8003565062388591,
"grad_norm": 0.16634012758731842,
"learning_rate": 4.4964349376114084e-05,
"loss": 0.0011,
"num_input_tokens_seen": 624416,
"step": 1010
},
{
"epoch": 1.8092691622103387,
"grad_norm": 8.74438762664795,
"learning_rate": 4.518716577540107e-05,
"loss": 0.0811,
"num_input_tokens_seen": 627040,
"step": 1015
},
{
"epoch": 1.8181818181818183,
"grad_norm": 0.05186864361166954,
"learning_rate": 4.540998217468806e-05,
"loss": 0.1683,
"num_input_tokens_seen": 630432,
"step": 1020
},
{
"epoch": 1.8270944741532977,
"grad_norm": 7.079963207244873,
"learning_rate": 4.563279857397505e-05,
"loss": 0.1726,
"num_input_tokens_seen": 633600,
"step": 1025
},
{
"epoch": 1.8360071301247771,
"grad_norm": 1.006164312362671,
"learning_rate": 4.585561497326203e-05,
"loss": 0.0053,
"num_input_tokens_seen": 636640,
"step": 1030
},
{
"epoch": 1.8449197860962567,
"grad_norm": 9.37168025970459,
"learning_rate": 4.607843137254902e-05,
"loss": 0.1272,
"num_input_tokens_seen": 639776,
"step": 1035
},
{
"epoch": 1.8538324420677363,
"grad_norm": 7.00251579284668,
"learning_rate": 4.630124777183601e-05,
"loss": 0.0867,
"num_input_tokens_seen": 643072,
"step": 1040
},
{
"epoch": 1.8627450980392157,
"grad_norm": 2.162777900695801,
"learning_rate": 4.6524064171123e-05,
"loss": 0.0372,
"num_input_tokens_seen": 646048,
"step": 1045
},
{
"epoch": 1.8716577540106951,
"grad_norm": 1.0629892349243164,
"learning_rate": 4.674688057040999e-05,
"loss": 0.05,
"num_input_tokens_seen": 649472,
"step": 1050
},
{
"epoch": 1.8805704099821747,
"grad_norm": 0.0947604700922966,
"learning_rate": 4.696969696969697e-05,
"loss": 0.0477,
"num_input_tokens_seen": 652800,
"step": 1055
},
{
"epoch": 1.8894830659536543,
"grad_norm": 0.02225646935403347,
"learning_rate": 4.719251336898396e-05,
"loss": 0.1286,
"num_input_tokens_seen": 655904,
"step": 1060
},
{
"epoch": 1.8983957219251337,
"grad_norm": 0.07933518290519714,
"learning_rate": 4.741532976827095e-05,
"loss": 0.0116,
"num_input_tokens_seen": 659072,
"step": 1065
},
{
"epoch": 1.9073083778966131,
"grad_norm": 4.901011943817139,
"learning_rate": 4.7638146167557936e-05,
"loss": 0.0222,
"num_input_tokens_seen": 662656,
"step": 1070
},
{
"epoch": 1.9162210338680927,
"grad_norm": 1.2279400825500488,
"learning_rate": 4.786096256684492e-05,
"loss": 0.0725,
"num_input_tokens_seen": 665856,
"step": 1075
},
{
"epoch": 1.9251336898395723,
"grad_norm": 6.082876205444336,
"learning_rate": 4.808377896613191e-05,
"loss": 0.226,
"num_input_tokens_seen": 669728,
"step": 1080
},
{
"epoch": 1.9340463458110517,
"grad_norm": 10.361590385437012,
"learning_rate": 4.83065953654189e-05,
"loss": 0.0448,
"num_input_tokens_seen": 672384,
"step": 1085
},
{
"epoch": 1.9429590017825311,
"grad_norm": 5.1663713455200195,
"learning_rate": 4.8529411764705885e-05,
"loss": 0.0127,
"num_input_tokens_seen": 675392,
"step": 1090
},
{
"epoch": 1.9518716577540107,
"grad_norm": 0.454103946685791,
"learning_rate": 4.875222816399288e-05,
"loss": 0.0531,
"num_input_tokens_seen": 678592,
"step": 1095
},
{
"epoch": 1.9607843137254903,
"grad_norm": 0.015812421217560768,
"learning_rate": 4.897504456327986e-05,
"loss": 0.0549,
"num_input_tokens_seen": 682080,
"step": 1100
},
{
"epoch": 1.9696969696969697,
"grad_norm": 7.40580940246582,
"learning_rate": 4.919786096256685e-05,
"loss": 0.0428,
"num_input_tokens_seen": 684448,
"step": 1105
},
{
"epoch": 1.9786096256684491,
"grad_norm": 5.290456771850586,
"learning_rate": 4.9420677361853833e-05,
"loss": 0.0128,
"num_input_tokens_seen": 688032,
"step": 1110
},
{
"epoch": 1.9875222816399287,
"grad_norm": 2.195844888687134,
"learning_rate": 4.9643493761140826e-05,
"loss": 0.0973,
"num_input_tokens_seen": 691520,
"step": 1115
},
{
"epoch": 1.9964349376114083,
"grad_norm": 0.004842815455049276,
"learning_rate": 4.986631016042781e-05,
"loss": 0.0807,
"num_input_tokens_seen": 694080,
"step": 1120
},
{
"epoch": 2.0,
"eval_loss": 0.10693030804395676,
"eval_runtime": 4.5835,
"eval_samples_per_second": 54.325,
"eval_steps_per_second": 13.745,
"num_input_tokens_seen": 694664,
"step": 1122
},
{
"epoch": 2.0053475935828877,
"grad_norm": 1.4178153276443481,
"learning_rate": 4.999999516051662e-05,
"loss": 0.0538,
"num_input_tokens_seen": 696360,
"step": 1125
},
{
"epoch": 2.014260249554367,
"grad_norm": 1.4418376684188843,
"learning_rate": 4.999994071635008e-05,
"loss": 0.0031,
"num_input_tokens_seen": 700456,
"step": 1130
},
{
"epoch": 2.0231729055258465,
"grad_norm": 0.04916919022798538,
"learning_rate": 4.999982577879495e-05,
"loss": 0.0009,
"num_input_tokens_seen": 703304,
"step": 1135
},
{
"epoch": 2.0320855614973263,
"grad_norm": 0.00426515331491828,
"learning_rate": 4.999965034812935e-05,
"loss": 0.0583,
"num_input_tokens_seen": 706472,
"step": 1140
},
{
"epoch": 2.0409982174688057,
"grad_norm": 0.01999180018901825,
"learning_rate": 4.9999414424777766e-05,
"loss": 0.0316,
"num_input_tokens_seen": 709512,
"step": 1145
},
{
"epoch": 2.049910873440285,
"grad_norm": 1.4245250225067139,
"learning_rate": 4.9999118009311084e-05,
"loss": 0.1014,
"num_input_tokens_seen": 712328,
"step": 1150
},
{
"epoch": 2.0588235294117645,
"grad_norm": 0.011511596851050854,
"learning_rate": 4.9998761102446554e-05,
"loss": 0.003,
"num_input_tokens_seen": 714888,
"step": 1155
},
{
"epoch": 2.0677361853832443,
"grad_norm": 20.848102569580078,
"learning_rate": 4.999834370504779e-05,
"loss": 0.0615,
"num_input_tokens_seen": 717192,
"step": 1160
},
{
"epoch": 2.0766488413547237,
"grad_norm": 4.06023645401001,
"learning_rate": 4.99978658181248e-05,
"loss": 0.0976,
"num_input_tokens_seen": 720616,
"step": 1165
},
{
"epoch": 2.085561497326203,
"grad_norm": 0.18716619908809662,
"learning_rate": 4.999732744283393e-05,
"loss": 0.0859,
"num_input_tokens_seen": 723400,
"step": 1170
},
{
"epoch": 2.0944741532976825,
"grad_norm": 0.1339133232831955,
"learning_rate": 4.999672858047791e-05,
"loss": 0.0051,
"num_input_tokens_seen": 726856,
"step": 1175
},
{
"epoch": 2.1033868092691623,
"grad_norm": 6.125833034515381,
"learning_rate": 4.999606923250585e-05,
"loss": 0.0108,
"num_input_tokens_seen": 730312,
"step": 1180
},
{
"epoch": 2.1122994652406417,
"grad_norm": 0.3494812846183777,
"learning_rate": 4.999534940051317e-05,
"loss": 0.0027,
"num_input_tokens_seen": 733480,
"step": 1185
},
{
"epoch": 2.121212121212121,
"grad_norm": 0.4654238224029541,
"learning_rate": 4.9994569086241716e-05,
"loss": 0.0633,
"num_input_tokens_seen": 736936,
"step": 1190
},
{
"epoch": 2.1301247771836005,
"grad_norm": 0.0284738652408123,
"learning_rate": 4.999372829157962e-05,
"loss": 0.0038,
"num_input_tokens_seen": 740328,
"step": 1195
},
{
"epoch": 2.1390374331550803,
"grad_norm": 0.025350507348775864,
"learning_rate": 4.9992827018561386e-05,
"loss": 0.0767,
"num_input_tokens_seen": 743496,
"step": 1200
},
{
"epoch": 2.1479500891265597,
"grad_norm": 12.299341201782227,
"learning_rate": 4.999186526936788e-05,
"loss": 0.0968,
"num_input_tokens_seen": 746888,
"step": 1205
},
{
"epoch": 2.156862745098039,
"grad_norm": 0.029630400240421295,
"learning_rate": 4.999084304632627e-05,
"loss": 0.034,
"num_input_tokens_seen": 749704,
"step": 1210
},
{
"epoch": 2.165775401069519,
"grad_norm": 1.3330607414245605,
"learning_rate": 4.9989760351910074e-05,
"loss": 0.067,
"num_input_tokens_seen": 753000,
"step": 1215
},
{
"epoch": 2.1746880570409983,
"grad_norm": 0.08385679125785828,
"learning_rate": 4.998861718873915e-05,
"loss": 0.0046,
"num_input_tokens_seen": 756104,
"step": 1220
},
{
"epoch": 2.1836007130124777,
"grad_norm": 14.622919082641602,
"learning_rate": 4.9987413559579636e-05,
"loss": 0.1344,
"num_input_tokens_seen": 759176,
"step": 1225
},
{
"epoch": 2.192513368983957,
"grad_norm": 0.14707021415233612,
"learning_rate": 4.9986149467344004e-05,
"loss": 0.0029,
"num_input_tokens_seen": 762248,
"step": 1230
},
{
"epoch": 2.2014260249554365,
"grad_norm": 0.8391829133033752,
"learning_rate": 4.998482491509104e-05,
"loss": 0.0056,
"num_input_tokens_seen": 765320,
"step": 1235
},
{
"epoch": 2.2103386809269163,
"grad_norm": 0.046552758663892746,
"learning_rate": 4.998343990602582e-05,
"loss": 0.031,
"num_input_tokens_seen": 768008,
"step": 1240
},
{
"epoch": 2.2192513368983957,
"grad_norm": 0.09806407988071442,
"learning_rate": 4.998199444349969e-05,
"loss": 0.051,
"num_input_tokens_seen": 771496,
"step": 1245
},
{
"epoch": 2.228163992869875,
"grad_norm": 0.853639543056488,
"learning_rate": 4.998048853101031e-05,
"loss": 0.1757,
"num_input_tokens_seen": 775048,
"step": 1250
},
{
"epoch": 2.237076648841355,
"grad_norm": 0.11520189791917801,
"learning_rate": 4.99789221722016e-05,
"loss": 0.0012,
"num_input_tokens_seen": 778312,
"step": 1255
},
{
"epoch": 2.2459893048128343,
"grad_norm": 0.007044652942568064,
"learning_rate": 4.997729537086373e-05,
"loss": 0.0165,
"num_input_tokens_seen": 781224,
"step": 1260
},
{
"epoch": 2.2549019607843137,
"grad_norm": 0.011247215792536736,
"learning_rate": 4.997560813093316e-05,
"loss": 0.0043,
"num_input_tokens_seen": 784520,
"step": 1265
},
{
"epoch": 2.263814616755793,
"grad_norm": 0.6388627290725708,
"learning_rate": 4.997386045649255e-05,
"loss": 0.002,
"num_input_tokens_seen": 787880,
"step": 1270
},
{
"epoch": 2.2727272727272725,
"grad_norm": 12.410165786743164,
"learning_rate": 4.9972052351770836e-05,
"loss": 0.1314,
"num_input_tokens_seen": 791240,
"step": 1275
},
{
"epoch": 2.2816399286987523,
"grad_norm": 0.06250201910734177,
"learning_rate": 4.997018382114316e-05,
"loss": 0.032,
"num_input_tokens_seen": 794408,
"step": 1280
},
{
"epoch": 2.2905525846702317,
"grad_norm": 0.0694412887096405,
"learning_rate": 4.996825486913088e-05,
"loss": 0.0025,
"num_input_tokens_seen": 797800,
"step": 1285
},
{
"epoch": 2.299465240641711,
"grad_norm": 0.05269337072968483,
"learning_rate": 4.996626550040157e-05,
"loss": 0.0549,
"num_input_tokens_seen": 801160,
"step": 1290
},
{
"epoch": 2.308377896613191,
"grad_norm": 0.044849712401628494,
"learning_rate": 4.9964215719768964e-05,
"loss": 0.0009,
"num_input_tokens_seen": 804008,
"step": 1295
},
{
"epoch": 2.3172905525846703,
"grad_norm": 0.7724558711051941,
"learning_rate": 4.9962105532193024e-05,
"loss": 0.0246,
"num_input_tokens_seen": 807080,
"step": 1300
},
{
"epoch": 2.3262032085561497,
"grad_norm": 5.825749397277832,
"learning_rate": 4.995993494277985e-05,
"loss": 0.0343,
"num_input_tokens_seen": 810056,
"step": 1305
},
{
"epoch": 2.335115864527629,
"grad_norm": 0.011077574454247952,
"learning_rate": 4.995770395678171e-05,
"loss": 0.0477,
"num_input_tokens_seen": 813544,
"step": 1310
},
{
"epoch": 2.344028520499109,
"grad_norm": 6.1359782218933105,
"learning_rate": 4.9955412579597004e-05,
"loss": 0.0576,
"num_input_tokens_seen": 816200,
"step": 1315
},
{
"epoch": 2.3529411764705883,
"grad_norm": 0.015752514824271202,
"learning_rate": 4.995306081677028e-05,
"loss": 0.001,
"num_input_tokens_seen": 818792,
"step": 1320
},
{
"epoch": 2.3618538324420677,
"grad_norm": 0.03618216887116432,
"learning_rate": 4.99506486739922e-05,
"loss": 0.0123,
"num_input_tokens_seen": 821736,
"step": 1325
},
{
"epoch": 2.370766488413547,
"grad_norm": 0.06211644038558006,
"learning_rate": 4.994817615709951e-05,
"loss": 0.0018,
"num_input_tokens_seen": 825032,
"step": 1330
},
{
"epoch": 2.379679144385027,
"grad_norm": 3.1051697731018066,
"learning_rate": 4.994564327207508e-05,
"loss": 0.2383,
"num_input_tokens_seen": 827912,
"step": 1335
},
{
"epoch": 2.3885918003565063,
"grad_norm": 0.001248644315637648,
"learning_rate": 4.9943050025047824e-05,
"loss": 0.0008,
"num_input_tokens_seen": 831080,
"step": 1340
},
{
"epoch": 2.3975044563279857,
"grad_norm": 20.717248916625977,
"learning_rate": 4.994039642229274e-05,
"loss": 0.0835,
"num_input_tokens_seen": 834440,
"step": 1345
},
{
"epoch": 2.406417112299465,
"grad_norm": 3.5113887786865234,
"learning_rate": 4.993768247023084e-05,
"loss": 0.185,
"num_input_tokens_seen": 837096,
"step": 1350
},
{
"epoch": 2.415329768270945,
"grad_norm": 0.20629923045635223,
"learning_rate": 4.9934908175429194e-05,
"loss": 0.0054,
"num_input_tokens_seen": 840520,
"step": 1355
},
{
"epoch": 2.4242424242424243,
"grad_norm": 0.030959784984588623,
"learning_rate": 4.993207354460089e-05,
"loss": 0.0024,
"num_input_tokens_seen": 843400,
"step": 1360
},
{
"epoch": 2.4331550802139037,
"grad_norm": 0.0362035296857357,
"learning_rate": 4.9929178584605e-05,
"loss": 0.0362,
"num_input_tokens_seen": 846088,
"step": 1365
},
{
"epoch": 2.442067736185383,
"grad_norm": 0.13232892751693726,
"learning_rate": 4.992622330244656e-05,
"loss": 0.0356,
"num_input_tokens_seen": 848776,
"step": 1370
},
{
"epoch": 2.450980392156863,
"grad_norm": 0.03575564920902252,
"learning_rate": 4.99232077052766e-05,
"loss": 0.0025,
"num_input_tokens_seen": 852328,
"step": 1375
},
{
"epoch": 2.4598930481283423,
"grad_norm": 6.295513153076172,
"learning_rate": 4.992013180039209e-05,
"loss": 0.0426,
"num_input_tokens_seen": 854952,
"step": 1380
},
{
"epoch": 2.4688057040998217,
"grad_norm": 0.1265345811843872,
"learning_rate": 4.991699559523591e-05,
"loss": 0.0016,
"num_input_tokens_seen": 858536,
"step": 1385
},
{
"epoch": 2.477718360071301,
"grad_norm": 6.578834533691406,
"learning_rate": 4.9913799097396877e-05,
"loss": 0.0049,
"num_input_tokens_seen": 861864,
"step": 1390
},
{
"epoch": 2.486631016042781,
"grad_norm": 0.011173945851624012,
"learning_rate": 4.9910542314609684e-05,
"loss": 0.0108,
"num_input_tokens_seen": 864104,
"step": 1395
},
{
"epoch": 2.4955436720142603,
"grad_norm": 6.777383327484131,
"learning_rate": 4.990722525475491e-05,
"loss": 0.0097,
"num_input_tokens_seen": 867336,
"step": 1400
},
{
"epoch": 2.5044563279857397,
"grad_norm": 0.019024401903152466,
"learning_rate": 4.990384792585897e-05,
"loss": 0.041,
"num_input_tokens_seen": 870376,
"step": 1405
},
{
"epoch": 2.5133689839572195,
"grad_norm": 0.026918886229395866,
"learning_rate": 4.990041033609413e-05,
"loss": 0.0134,
"num_input_tokens_seen": 872456,
"step": 1410
},
{
"epoch": 2.522281639928699,
"grad_norm": 0.0020757236052304506,
"learning_rate": 4.989691249377847e-05,
"loss": 0.0461,
"num_input_tokens_seen": 874888,
"step": 1415
},
{
"epoch": 2.5311942959001783,
"grad_norm": 0.0028826724737882614,
"learning_rate": 4.989335440737586e-05,
"loss": 0.1087,
"num_input_tokens_seen": 877992,
"step": 1420
},
{
"epoch": 2.5401069518716577,
"grad_norm": 0.02220228686928749,
"learning_rate": 4.9889736085495965e-05,
"loss": 0.0003,
"num_input_tokens_seen": 880776,
"step": 1425
},
{
"epoch": 2.549019607843137,
"grad_norm": 0.06741692870855331,
"learning_rate": 4.988605753689416e-05,
"loss": 0.1482,
"num_input_tokens_seen": 883944,
"step": 1430
},
{
"epoch": 2.557932263814617,
"grad_norm": 0.012142196297645569,
"learning_rate": 4.98823187704716e-05,
"loss": 0.0004,
"num_input_tokens_seen": 886632,
"step": 1435
},
{
"epoch": 2.5668449197860963,
"grad_norm": 0.021630438044667244,
"learning_rate": 4.9878519795275133e-05,
"loss": 0.0134,
"num_input_tokens_seen": 889448,
"step": 1440
},
{
"epoch": 2.5757575757575757,
"grad_norm": 16.687862396240234,
"learning_rate": 4.987466062049728e-05,
"loss": 0.021,
"num_input_tokens_seen": 892744,
"step": 1445
},
{
"epoch": 2.5846702317290555,
"grad_norm": 0.5119276642799377,
"learning_rate": 4.9870741255476266e-05,
"loss": 0.0019,
"num_input_tokens_seen": 895816,
"step": 1450
},
{
"epoch": 2.593582887700535,
"grad_norm": 0.02749607339501381,
"learning_rate": 4.986676170969593e-05,
"loss": 0.0562,
"num_input_tokens_seen": 899784,
"step": 1455
},
{
"epoch": 2.6024955436720143,
"grad_norm": 1.1509554386138916,
"learning_rate": 4.986272199278574e-05,
"loss": 0.0698,
"num_input_tokens_seen": 902984,
"step": 1460
},
{
"epoch": 2.6114081996434937,
"grad_norm": 0.04116099700331688,
"learning_rate": 4.985862211452077e-05,
"loss": 0.0216,
"num_input_tokens_seen": 905800,
"step": 1465
},
{
"epoch": 2.620320855614973,
"grad_norm": 0.025392096489667892,
"learning_rate": 4.985446208482166e-05,
"loss": 0.0026,
"num_input_tokens_seen": 908488,
"step": 1470
},
{
"epoch": 2.629233511586453,
"grad_norm": 0.03849076107144356,
"learning_rate": 4.985024191375462e-05,
"loss": 0.152,
"num_input_tokens_seen": 911528,
"step": 1475
},
{
"epoch": 2.6381461675579323,
"grad_norm": 3.8424088954925537,
"learning_rate": 4.984596161153136e-05,
"loss": 0.0847,
"num_input_tokens_seen": 914536,
"step": 1480
},
{
"epoch": 2.6470588235294117,
"grad_norm": 3.8528010845184326,
"learning_rate": 4.9841621188509105e-05,
"loss": 0.0324,
"num_input_tokens_seen": 917384,
"step": 1485
},
{
"epoch": 2.6559714795008915,
"grad_norm": 1.6923863887786865,
"learning_rate": 4.983722065519055e-05,
"loss": 0.0044,
"num_input_tokens_seen": 920072,
"step": 1490
},
{
"epoch": 2.664884135472371,
"grad_norm": 0.2570834457874298,
"learning_rate": 4.983276002222386e-05,
"loss": 0.066,
"num_input_tokens_seen": 923304,
"step": 1495
},
{
"epoch": 2.6737967914438503,
"grad_norm": 0.06823292374610901,
"learning_rate": 4.9828239300402605e-05,
"loss": 0.0289,
"num_input_tokens_seen": 925480,
"step": 1500
},
{
"epoch": 2.6827094474153297,
"grad_norm": 0.1330983191728592,
"learning_rate": 4.982365850066576e-05,
"loss": 0.018,
"num_input_tokens_seen": 928264,
"step": 1505
},
{
"epoch": 2.691622103386809,
"grad_norm": 0.988633394241333,
"learning_rate": 4.9819017634097685e-05,
"loss": 0.0038,
"num_input_tokens_seen": 931784,
"step": 1510
},
{
"epoch": 2.700534759358289,
"grad_norm": 0.1021723598241806,
"learning_rate": 4.981431671192807e-05,
"loss": 0.0014,
"num_input_tokens_seen": 935336,
"step": 1515
},
{
"epoch": 2.7094474153297683,
"grad_norm": 0.029408583417534828,
"learning_rate": 4.9809555745531934e-05,
"loss": 0.07,
"num_input_tokens_seen": 938472,
"step": 1520
},
{
"epoch": 2.7183600713012477,
"grad_norm": 2.873579978942871,
"learning_rate": 4.980473474642957e-05,
"loss": 0.1587,
"num_input_tokens_seen": 941288,
"step": 1525
},
{
"epoch": 2.7272727272727275,
"grad_norm": 0.38247668743133545,
"learning_rate": 4.979985372628657e-05,
"loss": 0.0555,
"num_input_tokens_seen": 944616,
"step": 1530
},
{
"epoch": 2.736185383244207,
"grad_norm": 0.10064245760440826,
"learning_rate": 4.979491269691372e-05,
"loss": 0.0012,
"num_input_tokens_seen": 947880,
"step": 1535
},
{
"epoch": 2.7450980392156863,
"grad_norm": 7.594188213348389,
"learning_rate": 4.978991167026705e-05,
"loss": 0.1035,
"num_input_tokens_seen": 950440,
"step": 1540
},
{
"epoch": 2.7540106951871657,
"grad_norm": 2.325514078140259,
"learning_rate": 4.9784850658447745e-05,
"loss": 0.0049,
"num_input_tokens_seen": 953576,
"step": 1545
},
{
"epoch": 2.762923351158645,
"grad_norm": 7.605442523956299,
"learning_rate": 4.9779729673702135e-05,
"loss": 0.1588,
"num_input_tokens_seen": 956456,
"step": 1550
},
{
"epoch": 2.771836007130125,
"grad_norm": 0.043773096054792404,
"learning_rate": 4.977454872842169e-05,
"loss": 0.003,
"num_input_tokens_seen": 959496,
"step": 1555
},
{
"epoch": 2.7807486631016043,
"grad_norm": 5.104632377624512,
"learning_rate": 4.9769307835142946e-05,
"loss": 0.0405,
"num_input_tokens_seen": 962440,
"step": 1560
},
{
"epoch": 2.7896613190730837,
"grad_norm": 1.4859371185302734,
"learning_rate": 4.9764007006547516e-05,
"loss": 0.0479,
"num_input_tokens_seen": 965576,
"step": 1565
},
{
"epoch": 2.7985739750445635,
"grad_norm": 0.03639693185687065,
"learning_rate": 4.975864625546204e-05,
"loss": 0.0031,
"num_input_tokens_seen": 969160,
"step": 1570
},
{
"epoch": 2.807486631016043,
"grad_norm": 3.4172000885009766,
"learning_rate": 4.975322559485814e-05,
"loss": 0.0214,
"num_input_tokens_seen": 972232,
"step": 1575
},
{
"epoch": 2.8163992869875223,
"grad_norm": 0.13606803119182587,
"learning_rate": 4.974774503785241e-05,
"loss": 0.0031,
"num_input_tokens_seen": 975592,
"step": 1580
},
{
"epoch": 2.8253119429590017,
"grad_norm": 0.018815385177731514,
"learning_rate": 4.974220459770639e-05,
"loss": 0.0165,
"num_input_tokens_seen": 979304,
"step": 1585
},
{
"epoch": 2.834224598930481,
"grad_norm": 0.00781914871186018,
"learning_rate": 4.9736604287826497e-05,
"loss": 0.0122,
"num_input_tokens_seen": 983176,
"step": 1590
},
{
"epoch": 2.843137254901961,
"grad_norm": 0.008310927078127861,
"learning_rate": 4.9730944121764045e-05,
"loss": 0.0007,
"num_input_tokens_seen": 986280,
"step": 1595
},
{
"epoch": 2.8520499108734403,
"grad_norm": 0.09141358733177185,
"learning_rate": 4.9725224113215164e-05,
"loss": 0.0173,
"num_input_tokens_seen": 989064,
"step": 1600
},
{
"epoch": 2.8609625668449197,
"grad_norm": 0.36811965703964233,
"learning_rate": 4.971944427602081e-05,
"loss": 0.0182,
"num_input_tokens_seen": 992456,
"step": 1605
},
{
"epoch": 2.8698752228163995,
"grad_norm": 0.006202696356922388,
"learning_rate": 4.971360462416667e-05,
"loss": 0.0006,
"num_input_tokens_seen": 995336,
"step": 1610
},
{
"epoch": 2.878787878787879,
"grad_norm": 0.01375326793640852,
"learning_rate": 4.97077051717832e-05,
"loss": 0.0812,
"num_input_tokens_seen": 998248,
"step": 1615
},
{
"epoch": 2.8877005347593583,
"grad_norm": 0.005168983247131109,
"learning_rate": 4.970174593314556e-05,
"loss": 0.0003,
"num_input_tokens_seen": 1001512,
"step": 1620
},
{
"epoch": 2.8966131907308377,
"grad_norm": 0.16203118860721588,
"learning_rate": 4.969572692267355e-05,
"loss": 0.0738,
"num_input_tokens_seen": 1004616,
"step": 1625
},
{
"epoch": 2.905525846702317,
"grad_norm": 0.006419321522116661,
"learning_rate": 4.968964815493162e-05,
"loss": 0.0614,
"num_input_tokens_seen": 1007336,
"step": 1630
},
{
"epoch": 2.914438502673797,
"grad_norm": 8.11766529083252,
"learning_rate": 4.968350964462883e-05,
"loss": 0.0619,
"num_input_tokens_seen": 1010120,
"step": 1635
},
{
"epoch": 2.9233511586452763,
"grad_norm": 4.019729137420654,
"learning_rate": 4.967731140661878e-05,
"loss": 0.0659,
"num_input_tokens_seen": 1013384,
"step": 1640
},
{
"epoch": 2.9322638146167557,
"grad_norm": 11.93336296081543,
"learning_rate": 4.9671053455899584e-05,
"loss": 0.0643,
"num_input_tokens_seen": 1016168,
"step": 1645
},
{
"epoch": 2.9411764705882355,
"grad_norm": 0.25298967957496643,
"learning_rate": 4.966473580761389e-05,
"loss": 0.0014,
"num_input_tokens_seen": 1019336,
"step": 1650
},
{
"epoch": 2.950089126559715,
"grad_norm": 8.617263793945312,
"learning_rate": 4.965835847704876e-05,
"loss": 0.0267,
"num_input_tokens_seen": 1021704,
"step": 1655
},
{
"epoch": 2.9590017825311943,
"grad_norm": 11.657393455505371,
"learning_rate": 4.965192147963568e-05,
"loss": 0.0074,
"num_input_tokens_seen": 1025352,
"step": 1660
},
{
"epoch": 2.9679144385026737,
"grad_norm": 2.9546124935150146,
"learning_rate": 4.9645424830950526e-05,
"loss": 0.0319,
"num_input_tokens_seen": 1028712,
"step": 1665
},
{
"epoch": 2.976827094474153,
"grad_norm": 0.027331123128533363,
"learning_rate": 4.963886854671351e-05,
"loss": 0.0657,
"num_input_tokens_seen": 1031592,
"step": 1670
},
{
"epoch": 2.985739750445633,
"grad_norm": 11.084074020385742,
"learning_rate": 4.963225264278914e-05,
"loss": 0.0646,
"num_input_tokens_seen": 1034728,
"step": 1675
},
{
"epoch": 2.9946524064171123,
"grad_norm": 9.177580833435059,
"learning_rate": 4.962557713518617e-05,
"loss": 0.0532,
"num_input_tokens_seen": 1038920,
"step": 1680
},
{
"epoch": 3.0,
"eval_loss": 0.11031711101531982,
"eval_runtime": 4.581,
"eval_samples_per_second": 54.356,
"eval_steps_per_second": 13.753,
"num_input_tokens_seen": 1039864,
"step": 1683
},
{
"epoch": 3.0035650623885917,
"grad_norm": 0.3040387034416199,
"learning_rate": 4.961884204005764e-05,
"loss": 0.3259,
"num_input_tokens_seen": 1041016,
"step": 1685
},
{
"epoch": 3.0124777183600715,
"grad_norm": 0.030186321586370468,
"learning_rate": 4.961204737370071e-05,
"loss": 0.0567,
"num_input_tokens_seen": 1043704,
"step": 1690
},
{
"epoch": 3.021390374331551,
"grad_norm": 14.996708869934082,
"learning_rate": 4.960519315255673e-05,
"loss": 0.0911,
"num_input_tokens_seen": 1046296,
"step": 1695
},
{
"epoch": 3.0303030303030303,
"grad_norm": 0.9796962141990662,
"learning_rate": 4.959827939321113e-05,
"loss": 0.047,
"num_input_tokens_seen": 1049528,
"step": 1700
},
{
"epoch": 3.0392156862745097,
"grad_norm": 0.07655226439237595,
"learning_rate": 4.959130611239343e-05,
"loss": 0.0008,
"num_input_tokens_seen": 1052408,
"step": 1705
},
{
"epoch": 3.0481283422459895,
"grad_norm": 0.0062500229105353355,
"learning_rate": 4.958427332697716e-05,
"loss": 0.0054,
"num_input_tokens_seen": 1056088,
"step": 1710
},
{
"epoch": 3.057040998217469,
"grad_norm": 15.905081748962402,
"learning_rate": 4.9577181053979836e-05,
"loss": 0.0085,
"num_input_tokens_seen": 1059544,
"step": 1715
},
{
"epoch": 3.0659536541889483,
"grad_norm": 1.3312292098999023,
"learning_rate": 4.957002931056293e-05,
"loss": 0.0019,
"num_input_tokens_seen": 1062776,
"step": 1720
},
{
"epoch": 3.0748663101604277,
"grad_norm": 0.00571996346116066,
"learning_rate": 4.956281811403181e-05,
"loss": 0.0007,
"num_input_tokens_seen": 1065432,
"step": 1725
},
{
"epoch": 3.0837789661319075,
"grad_norm": 0.04415470361709595,
"learning_rate": 4.955554748183571e-05,
"loss": 0.0007,
"num_input_tokens_seen": 1068312,
"step": 1730
},
{
"epoch": 3.092691622103387,
"grad_norm": 0.014133289456367493,
"learning_rate": 4.9548217431567665e-05,
"loss": 0.0037,
"num_input_tokens_seen": 1070616,
"step": 1735
},
{
"epoch": 3.1016042780748663,
"grad_norm": 0.006229729391634464,
"learning_rate": 4.954082798096452e-05,
"loss": 0.0002,
"num_input_tokens_seen": 1074200,
"step": 1740
},
{
"epoch": 3.1105169340463457,
"grad_norm": 0.0035163508728146553,
"learning_rate": 4.9533379147906825e-05,
"loss": 0.0137,
"num_input_tokens_seen": 1077656,
"step": 1745
},
{
"epoch": 3.1194295900178255,
"grad_norm": 0.008305568248033524,
"learning_rate": 4.952587095041882e-05,
"loss": 0.0004,
"num_input_tokens_seen": 1080440,
"step": 1750
},
{
"epoch": 3.128342245989305,
"grad_norm": 0.0018318976508453488,
"learning_rate": 4.9518303406668404e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1083288,
"step": 1755
},
{
"epoch": 3.1372549019607843,
"grad_norm": 0.0020866121631115675,
"learning_rate": 4.9510676534967085e-05,
"loss": 0.0002,
"num_input_tokens_seen": 1086616,
"step": 1760
},
{
"epoch": 3.1461675579322637,
"grad_norm": 3.221034526824951,
"learning_rate": 4.950299035376991e-05,
"loss": 0.0738,
"num_input_tokens_seen": 1090360,
"step": 1765
},
{
"epoch": 3.1550802139037435,
"grad_norm": 0.016158247366547585,
"learning_rate": 4.949524488167545e-05,
"loss": 0.0003,
"num_input_tokens_seen": 1093176,
"step": 1770
},
{
"epoch": 3.163992869875223,
"grad_norm": 0.021480686962604523,
"learning_rate": 4.9487440137425755e-05,
"loss": 0.0329,
"num_input_tokens_seen": 1096088,
"step": 1775
},
{
"epoch": 3.1729055258467023,
"grad_norm": 0.024992266669869423,
"learning_rate": 4.947957613990627e-05,
"loss": 0.0059,
"num_input_tokens_seen": 1098872,
"step": 1780
},
{
"epoch": 3.1818181818181817,
"grad_norm": 0.025295181199908257,
"learning_rate": 4.947165290814584e-05,
"loss": 0.0061,
"num_input_tokens_seen": 1102360,
"step": 1785
},
{
"epoch": 3.1907308377896615,
"grad_norm": 0.044885795563459396,
"learning_rate": 4.9463670461316644e-05,
"loss": 0.0273,
"num_input_tokens_seen": 1105592,
"step": 1790
},
{
"epoch": 3.199643493761141,
"grad_norm": 0.012321226298809052,
"learning_rate": 4.945562881873412e-05,
"loss": 0.0009,
"num_input_tokens_seen": 1109432,
"step": 1795
},
{
"epoch": 3.2085561497326203,
"grad_norm": 0.004980860278010368,
"learning_rate": 4.944752799985699e-05,
"loss": 0.0004,
"num_input_tokens_seen": 1112664,
"step": 1800
},
{
"epoch": 3.2174688057040997,
"grad_norm": 0.00631902227178216,
"learning_rate": 4.943936802428712e-05,
"loss": 0.0003,
"num_input_tokens_seen": 1115928,
"step": 1805
},
{
"epoch": 3.2263814616755795,
"grad_norm": 0.017392151057720184,
"learning_rate": 4.9431148911769534e-05,
"loss": 0.0003,
"num_input_tokens_seen": 1119576,
"step": 1810
},
{
"epoch": 3.235294117647059,
"grad_norm": 0.03878406435251236,
"learning_rate": 4.942287068219238e-05,
"loss": 0.0018,
"num_input_tokens_seen": 1123256,
"step": 1815
},
{
"epoch": 3.2442067736185383,
"grad_norm": 15.287306785583496,
"learning_rate": 4.941453335558681e-05,
"loss": 0.0258,
"num_input_tokens_seen": 1126872,
"step": 1820
},
{
"epoch": 3.2531194295900177,
"grad_norm": 24.802167892456055,
"learning_rate": 4.9406136952127015e-05,
"loss": 0.0088,
"num_input_tokens_seen": 1129496,
"step": 1825
},
{
"epoch": 3.2620320855614975,
"grad_norm": 0.0961998999118805,
"learning_rate": 4.9397681492130104e-05,
"loss": 0.0004,
"num_input_tokens_seen": 1132280,
"step": 1830
},
{
"epoch": 3.270944741532977,
"grad_norm": 0.00621894421055913,
"learning_rate": 4.9389166996056114e-05,
"loss": 0.0024,
"num_input_tokens_seen": 1135640,
"step": 1835
},
{
"epoch": 3.2798573975044563,
"grad_norm": 0.005441363900899887,
"learning_rate": 4.938059348450792e-05,
"loss": 0.0678,
"num_input_tokens_seen": 1138264,
"step": 1840
},
{
"epoch": 3.2887700534759357,
"grad_norm": 0.07161960005760193,
"learning_rate": 4.937196097823119e-05,
"loss": 0.0003,
"num_input_tokens_seen": 1141432,
"step": 1845
},
{
"epoch": 3.2976827094474155,
"grad_norm": 0.010849026031792164,
"learning_rate": 4.936326949811437e-05,
"loss": 0.0015,
"num_input_tokens_seen": 1143736,
"step": 1850
},
{
"epoch": 3.306595365418895,
"grad_norm": 0.01427000667899847,
"learning_rate": 4.93545190651886e-05,
"loss": 0.0004,
"num_input_tokens_seen": 1146392,
"step": 1855
},
{
"epoch": 3.3155080213903743,
"grad_norm": 19.148462295532227,
"learning_rate": 4.934570970062765e-05,
"loss": 0.0625,
"num_input_tokens_seen": 1149656,
"step": 1860
},
{
"epoch": 3.3244206773618536,
"grad_norm": 22.86386489868164,
"learning_rate": 4.93368414257479e-05,
"loss": 0.0068,
"num_input_tokens_seen": 1152696,
"step": 1865
},
{
"epoch": 3.3333333333333335,
"grad_norm": 1.5289530754089355,
"learning_rate": 4.932791426200829e-05,
"loss": 0.0015,
"num_input_tokens_seen": 1155480,
"step": 1870
},
{
"epoch": 3.342245989304813,
"grad_norm": 20.363109588623047,
"learning_rate": 4.931892823101024e-05,
"loss": 0.0252,
"num_input_tokens_seen": 1159288,
"step": 1875
},
{
"epoch": 3.3511586452762923,
"grad_norm": 6.934281826019287,
"learning_rate": 4.930988335449762e-05,
"loss": 0.0456,
"num_input_tokens_seen": 1161848,
"step": 1880
},
{
"epoch": 3.3600713012477716,
"grad_norm": 0.005279934033751488,
"learning_rate": 4.9300779654356706e-05,
"loss": 0.0003,
"num_input_tokens_seen": 1165048,
"step": 1885
},
{
"epoch": 3.3689839572192515,
"grad_norm": 0.005097201559692621,
"learning_rate": 4.929161715261608e-05,
"loss": 0.014,
"num_input_tokens_seen": 1167928,
"step": 1890
},
{
"epoch": 3.377896613190731,
"grad_norm": 0.0021505567710846663,
"learning_rate": 4.9282395871446626e-05,
"loss": 0.0048,
"num_input_tokens_seen": 1171928,
"step": 1895
},
{
"epoch": 3.3868092691622103,
"grad_norm": 0.08554159104824066,
"learning_rate": 4.927311583316148e-05,
"loss": 0.0915,
"num_input_tokens_seen": 1174200,
"step": 1900
},
{
"epoch": 3.3957219251336896,
"grad_norm": 0.024701252579689026,
"learning_rate": 4.92637770602159e-05,
"loss": 0.0749,
"num_input_tokens_seen": 1177368,
"step": 1905
},
{
"epoch": 3.4046345811051695,
"grad_norm": 0.5353766083717346,
"learning_rate": 4.925437957520733e-05,
"loss": 0.0578,
"num_input_tokens_seen": 1180600,
"step": 1910
},
{
"epoch": 3.413547237076649,
"grad_norm": 0.5619534850120544,
"learning_rate": 4.9244923400875245e-05,
"loss": 0.0067,
"num_input_tokens_seen": 1183416,
"step": 1915
},
{
"epoch": 3.4224598930481283,
"grad_norm": 0.03234964981675148,
"learning_rate": 4.923540856010113e-05,
"loss": 0.0009,
"num_input_tokens_seen": 1186904,
"step": 1920
},
{
"epoch": 3.431372549019608,
"grad_norm": 0.010062027722597122,
"learning_rate": 4.922583507590843e-05,
"loss": 0.0003,
"num_input_tokens_seen": 1190360,
"step": 1925
},
{
"epoch": 3.4402852049910875,
"grad_norm": 0.0606231763958931,
"learning_rate": 4.921620297146253e-05,
"loss": 0.0004,
"num_input_tokens_seen": 1193912,
"step": 1930
},
{
"epoch": 3.449197860962567,
"grad_norm": 0.0035294261761009693,
"learning_rate": 4.920651227007062e-05,
"loss": 0.0014,
"num_input_tokens_seen": 1197432,
"step": 1935
},
{
"epoch": 3.4581105169340463,
"grad_norm": 0.00476317061111331,
"learning_rate": 4.919676299518167e-05,
"loss": 0.0003,
"num_input_tokens_seen": 1200728,
"step": 1940
},
{
"epoch": 3.4670231729055256,
"grad_norm": 0.000967069179750979,
"learning_rate": 4.918695517038643e-05,
"loss": 0.0018,
"num_input_tokens_seen": 1204408,
"step": 1945
},
{
"epoch": 3.4759358288770055,
"grad_norm": 0.009276410564780235,
"learning_rate": 4.917708881941728e-05,
"loss": 0.0093,
"num_input_tokens_seen": 1207512,
"step": 1950
},
{
"epoch": 3.484848484848485,
"grad_norm": 0.0032905612606555223,
"learning_rate": 4.916716396614824e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1210520,
"step": 1955
},
{
"epoch": 3.4937611408199643,
"grad_norm": 0.00179088837467134,
"learning_rate": 4.91571806345949e-05,
"loss": 0.0002,
"num_input_tokens_seen": 1213784,
"step": 1960
},
{
"epoch": 3.502673796791444,
"grad_norm": 3.6233127117156982,
"learning_rate": 4.914713884891433e-05,
"loss": 0.0028,
"num_input_tokens_seen": 1216504,
"step": 1965
},
{
"epoch": 3.5115864527629235,
"grad_norm": 0.0005736930761486292,
"learning_rate": 4.913703863340504e-05,
"loss": 0.0004,
"num_input_tokens_seen": 1219160,
"step": 1970
},
{
"epoch": 3.520499108734403,
"grad_norm": 0.018492160364985466,
"learning_rate": 4.912688001250697e-05,
"loss": 0.0042,
"num_input_tokens_seen": 1222296,
"step": 1975
},
{
"epoch": 3.5294117647058822,
"grad_norm": 12.424592018127441,
"learning_rate": 4.9116663010801326e-05,
"loss": 0.1497,
"num_input_tokens_seen": 1225400,
"step": 1980
},
{
"epoch": 3.5383244206773616,
"grad_norm": 0.0016981695080175996,
"learning_rate": 4.910638765301062e-05,
"loss": 0.0003,
"num_input_tokens_seen": 1229112,
"step": 1985
},
{
"epoch": 3.5472370766488415,
"grad_norm": 0.30625298619270325,
"learning_rate": 4.909605396399856e-05,
"loss": 0.0615,
"num_input_tokens_seen": 1232088,
"step": 1990
},
{
"epoch": 3.556149732620321,
"grad_norm": 0.03002898208796978,
"learning_rate": 4.908566196876999e-05,
"loss": 0.006,
"num_input_tokens_seen": 1234680,
"step": 1995
},
{
"epoch": 3.5650623885918002,
"grad_norm": 0.7189759016036987,
"learning_rate": 4.9075211692470865e-05,
"loss": 0.0006,
"num_input_tokens_seen": 1238648,
"step": 2000
},
{
"epoch": 3.57397504456328,
"grad_norm": 0.020788084715604782,
"learning_rate": 4.906470316038814e-05,
"loss": 0.0003,
"num_input_tokens_seen": 1242104,
"step": 2005
},
{
"epoch": 3.5828877005347595,
"grad_norm": 0.000429228093707934,
"learning_rate": 4.9054136397949753e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1244472,
"step": 2010
},
{
"epoch": 3.591800356506239,
"grad_norm": 0.005330861080437899,
"learning_rate": 4.904351143072452e-05,
"loss": 0.0291,
"num_input_tokens_seen": 1247864,
"step": 2015
},
{
"epoch": 3.6007130124777182,
"grad_norm": 0.00216846214607358,
"learning_rate": 4.903282828442213e-05,
"loss": 0.0035,
"num_input_tokens_seen": 1251192,
"step": 2020
},
{
"epoch": 3.6096256684491976,
"grad_norm": 0.004735489841550589,
"learning_rate": 4.902208698489302e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1253688,
"step": 2025
},
{
"epoch": 3.6185383244206775,
"grad_norm": 0.0038757645525038242,
"learning_rate": 4.9011287558128366e-05,
"loss": 0.1682,
"num_input_tokens_seen": 1256728,
"step": 2030
},
{
"epoch": 3.627450980392157,
"grad_norm": 0.8156978487968445,
"learning_rate": 4.900043003025998e-05,
"loss": 0.0019,
"num_input_tokens_seen": 1259960,
"step": 2035
},
{
"epoch": 3.6363636363636362,
"grad_norm": 6.951355457305908,
"learning_rate": 4.898951442756027e-05,
"loss": 0.0179,
"num_input_tokens_seen": 1262136,
"step": 2040
},
{
"epoch": 3.645276292335116,
"grad_norm": 1.6026536226272583,
"learning_rate": 4.897854077644217e-05,
"loss": 0.0058,
"num_input_tokens_seen": 1264696,
"step": 2045
},
{
"epoch": 3.6541889483065955,
"grad_norm": 0.007580269128084183,
"learning_rate": 4.8967509103459084e-05,
"loss": 0.0022,
"num_input_tokens_seen": 1267768,
"step": 2050
},
{
"epoch": 3.663101604278075,
"grad_norm": 0.027493992820382118,
"learning_rate": 4.8956419435304804e-05,
"loss": 0.0008,
"num_input_tokens_seen": 1270776,
"step": 2055
},
{
"epoch": 3.6720142602495542,
"grad_norm": 0.008505471050739288,
"learning_rate": 4.894527179881345e-05,
"loss": 0.0004,
"num_input_tokens_seen": 1273848,
"step": 2060
},
{
"epoch": 3.6809269162210336,
"grad_norm": 0.011075139045715332,
"learning_rate": 4.893406622095943e-05,
"loss": 0.0412,
"num_input_tokens_seen": 1275896,
"step": 2065
},
{
"epoch": 3.6898395721925135,
"grad_norm": 0.018276818096637726,
"learning_rate": 4.8922802728857334e-05,
"loss": 0.1426,
"num_input_tokens_seen": 1278552,
"step": 2070
},
{
"epoch": 3.698752228163993,
"grad_norm": 0.0035789543762803078,
"learning_rate": 4.89114813497619e-05,
"loss": 0.0003,
"num_input_tokens_seen": 1281592,
"step": 2075
},
{
"epoch": 3.7076648841354722,
"grad_norm": 0.0011354973539710045,
"learning_rate": 4.890010211106795e-05,
"loss": 0.0006,
"num_input_tokens_seen": 1284504,
"step": 2080
},
{
"epoch": 3.716577540106952,
"grad_norm": 0.1557454615831375,
"learning_rate": 4.8888665040310273e-05,
"loss": 0.0069,
"num_input_tokens_seen": 1287256,
"step": 2085
},
{
"epoch": 3.7254901960784315,
"grad_norm": 0.017272913828492165,
"learning_rate": 4.887717016516363e-05,
"loss": 0.0002,
"num_input_tokens_seen": 1290808,
"step": 2090
},
{
"epoch": 3.734402852049911,
"grad_norm": 0.007790794596076012,
"learning_rate": 4.886561751344266e-05,
"loss": 0.062,
"num_input_tokens_seen": 1294040,
"step": 2095
},
{
"epoch": 3.7433155080213902,
"grad_norm": 0.0025972179137170315,
"learning_rate": 4.885400711310178e-05,
"loss": 0.0002,
"num_input_tokens_seen": 1296568,
"step": 2100
},
{
"epoch": 3.7522281639928696,
"grad_norm": 0.03971244767308235,
"learning_rate": 4.8842338992235146e-05,
"loss": 0.0119,
"num_input_tokens_seen": 1299704,
"step": 2105
},
{
"epoch": 3.7611408199643495,
"grad_norm": 0.014764413237571716,
"learning_rate": 4.883061317907661e-05,
"loss": 0.0003,
"num_input_tokens_seen": 1301816,
"step": 2110
},
{
"epoch": 3.770053475935829,
"grad_norm": 0.02033417299389839,
"learning_rate": 4.8818829701999596e-05,
"loss": 0.0446,
"num_input_tokens_seen": 1304728,
"step": 2115
},
{
"epoch": 3.7789661319073082,
"grad_norm": 0.0014599093701690435,
"learning_rate": 4.880698858951707e-05,
"loss": 0.0268,
"num_input_tokens_seen": 1307736,
"step": 2120
},
{
"epoch": 3.787878787878788,
"grad_norm": 0.03279818594455719,
"learning_rate": 4.879508987028146e-05,
"loss": 0.0444,
"num_input_tokens_seen": 1311320,
"step": 2125
},
{
"epoch": 3.7967914438502675,
"grad_norm": 23.06256103515625,
"learning_rate": 4.87831335730846e-05,
"loss": 0.0369,
"num_input_tokens_seen": 1314136,
"step": 2130
},
{
"epoch": 3.805704099821747,
"grad_norm": 0.00789332389831543,
"learning_rate": 4.877111972685762e-05,
"loss": 0.026,
"num_input_tokens_seen": 1317176,
"step": 2135
},
{
"epoch": 3.8146167557932262,
"grad_norm": 0.002600538544356823,
"learning_rate": 4.875904836067092e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1320632,
"step": 2140
},
{
"epoch": 3.8235294117647056,
"grad_norm": 0.18604746460914612,
"learning_rate": 4.874691950373409e-05,
"loss": 0.0002,
"num_input_tokens_seen": 1323512,
"step": 2145
},
{
"epoch": 3.8324420677361855,
"grad_norm": 0.0031476416625082493,
"learning_rate": 4.873473318539583e-05,
"loss": 0.0002,
"num_input_tokens_seen": 1327160,
"step": 2150
},
{
"epoch": 3.841354723707665,
"grad_norm": 0.0015608452958986163,
"learning_rate": 4.872248943514387e-05,
"loss": 0.0005,
"num_input_tokens_seen": 1329912,
"step": 2155
},
{
"epoch": 3.8502673796791442,
"grad_norm": 0.008163553662598133,
"learning_rate": 4.871018828260492e-05,
"loss": 0.007,
"num_input_tokens_seen": 1332760,
"step": 2160
},
{
"epoch": 3.859180035650624,
"grad_norm": 0.005292457994073629,
"learning_rate": 4.869782975754458e-05,
"loss": 0.0002,
"num_input_tokens_seen": 1335608,
"step": 2165
},
{
"epoch": 3.8680926916221035,
"grad_norm": 0.002038877457380295,
"learning_rate": 4.86854138898673e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1339064,
"step": 2170
},
{
"epoch": 3.877005347593583,
"grad_norm": 0.0013985374243929982,
"learning_rate": 4.867294070961625e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1342264,
"step": 2175
},
{
"epoch": 3.8859180035650622,
"grad_norm": 0.19727768003940582,
"learning_rate": 4.8660410246973306e-05,
"loss": 0.0004,
"num_input_tokens_seen": 1345496,
"step": 2180
},
{
"epoch": 3.8948306595365416,
"grad_norm": 0.15647095441818237,
"learning_rate": 4.8647822532258955e-05,
"loss": 0.0002,
"num_input_tokens_seen": 1347896,
"step": 2185
},
{
"epoch": 3.9037433155080214,
"grad_norm": 30.87330436706543,
"learning_rate": 4.86351775959322e-05,
"loss": 0.0068,
"num_input_tokens_seen": 1350808,
"step": 2190
},
{
"epoch": 3.912655971479501,
"grad_norm": 0.0015438764821738005,
"learning_rate": 4.8622475468590514e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1353080,
"step": 2195
},
{
"epoch": 3.9215686274509802,
"grad_norm": 0.001988581381738186,
"learning_rate": 4.8609716180969755e-05,
"loss": 0.0004,
"num_input_tokens_seen": 1356728,
"step": 2200
},
{
"epoch": 3.93048128342246,
"grad_norm": 0.002157883020117879,
"learning_rate": 4.859689976394412e-05,
"loss": 0.0002,
"num_input_tokens_seen": 1360152,
"step": 2205
},
{
"epoch": 3.9393939393939394,
"grad_norm": 0.0009718194487504661,
"learning_rate": 4.858402624852599e-05,
"loss": 0.001,
"num_input_tokens_seen": 1364152,
"step": 2210
},
{
"epoch": 3.948306595365419,
"grad_norm": 0.022542769089341164,
"learning_rate": 4.8571095665865976e-05,
"loss": 0.0866,
"num_input_tokens_seen": 1367512,
"step": 2215
},
{
"epoch": 3.9572192513368982,
"grad_norm": 0.001373310573399067,
"learning_rate": 4.855810804725271e-05,
"loss": 0.0,
"num_input_tokens_seen": 1370520,
"step": 2220
},
{
"epoch": 3.966131907308378,
"grad_norm": 0.014193962328135967,
"learning_rate": 4.854506342411289e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1373016,
"step": 2225
},
{
"epoch": 3.9750445632798574,
"grad_norm": 0.04215296730399132,
"learning_rate": 4.8531961828011124e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1375960,
"step": 2230
},
{
"epoch": 3.983957219251337,
"grad_norm": 0.0009914386318996549,
"learning_rate": 4.8518803290649885e-05,
"loss": 0.0439,
"num_input_tokens_seen": 1379160,
"step": 2235
},
{
"epoch": 3.9928698752228167,
"grad_norm": 0.0015359356766566634,
"learning_rate": 4.8505587843869425e-05,
"loss": 0.1269,
"num_input_tokens_seen": 1382744,
"step": 2240
},
{
"epoch": 4.0,
"eval_loss": 0.13699407875537872,
"eval_runtime": 4.5818,
"eval_samples_per_second": 54.345,
"eval_steps_per_second": 13.75,
"num_input_tokens_seen": 1384096,
"step": 2244
},
{
"epoch": 4.001782531194296,
"grad_norm": 0.003554535796865821,
"learning_rate": 4.849231551964771e-05,
"loss": 0.001,
"num_input_tokens_seen": 1384736,
"step": 2245
},
{
"epoch": 4.010695187165775,
"grad_norm": 0.006082756910473108,
"learning_rate": 4.847898635010033e-05,
"loss": 0.0011,
"num_input_tokens_seen": 1387552,
"step": 2250
},
{
"epoch": 4.019607843137255,
"grad_norm": 0.0006636533071286976,
"learning_rate": 4.846560036748043e-05,
"loss": 0.0026,
"num_input_tokens_seen": 1390944,
"step": 2255
},
{
"epoch": 4.028520499108734,
"grad_norm": 0.014104689471423626,
"learning_rate": 4.8452157604178626e-05,
"loss": 0.0008,
"num_input_tokens_seen": 1393728,
"step": 2260
},
{
"epoch": 4.037433155080214,
"grad_norm": 0.005312174558639526,
"learning_rate": 4.8438658092722914e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1396288,
"step": 2265
},
{
"epoch": 4.046345811051693,
"grad_norm": 0.0029452915769070387,
"learning_rate": 4.8425101865778634e-05,
"loss": 0.0,
"num_input_tokens_seen": 1399424,
"step": 2270
},
{
"epoch": 4.055258467023173,
"grad_norm": 0.0014865044504404068,
"learning_rate": 4.8411488956148344e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1402720,
"step": 2275
},
{
"epoch": 4.064171122994653,
"grad_norm": 0.0015569854294881225,
"learning_rate": 4.839781939677176e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1406208,
"step": 2280
},
{
"epoch": 4.073083778966132,
"grad_norm": 0.002135837683454156,
"learning_rate": 4.838409322072568e-05,
"loss": 0.0,
"num_input_tokens_seen": 1409952,
"step": 2285
},
{
"epoch": 4.081996434937611,
"grad_norm": 0.0058957538567483425,
"learning_rate": 4.8370310461223894e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1413056,
"step": 2290
},
{
"epoch": 4.090909090909091,
"grad_norm": 0.001857185736298561,
"learning_rate": 4.835647115161712e-05,
"loss": 0.0002,
"num_input_tokens_seen": 1415712,
"step": 2295
},
{
"epoch": 4.09982174688057,
"grad_norm": 0.0014896428911015391,
"learning_rate": 4.8342575325392916e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1418368,
"step": 2300
},
{
"epoch": 4.10873440285205,
"grad_norm": 0.002626370871439576,
"learning_rate": 4.832862301617557e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1421248,
"step": 2305
},
{
"epoch": 4.117647058823529,
"grad_norm": 0.0008361501968465745,
"learning_rate": 4.8314614257726076e-05,
"loss": 0.0,
"num_input_tokens_seen": 1424320,
"step": 2310
},
{
"epoch": 4.126559714795009,
"grad_norm": 0.0006268385332077742,
"learning_rate": 4.8300549083941985e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1427040,
"step": 2315
},
{
"epoch": 4.135472370766489,
"grad_norm": 0.0009334477363154292,
"learning_rate": 4.82864275288574e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1430048,
"step": 2320
},
{
"epoch": 4.144385026737968,
"grad_norm": 0.0012060723965987563,
"learning_rate": 4.827224962664282e-05,
"loss": 0.0169,
"num_input_tokens_seen": 1433376,
"step": 2325
},
{
"epoch": 4.153297682709447,
"grad_norm": 0.0006173542933538556,
"learning_rate": 4.8258015411605095e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1436416,
"step": 2330
},
{
"epoch": 4.162210338680927,
"grad_norm": 0.0023394590243697166,
"learning_rate": 4.824372491818735e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1439392,
"step": 2335
},
{
"epoch": 4.171122994652406,
"grad_norm": 0.02793467603623867,
"learning_rate": 4.822937818096888e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1442368,
"step": 2340
},
{
"epoch": 4.180035650623886,
"grad_norm": 0.006997761782258749,
"learning_rate": 4.821497523466508e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1445088,
"step": 2345
},
{
"epoch": 4.188948306595365,
"grad_norm": 0.0006095584249123931,
"learning_rate": 4.820051611412736e-05,
"loss": 0.0,
"num_input_tokens_seen": 1447904,
"step": 2350
},
{
"epoch": 4.197860962566845,
"grad_norm": 0.0007617850787937641,
"learning_rate": 4.8186000854343034e-05,
"loss": 0.0,
"num_input_tokens_seen": 1451616,
"step": 2355
},
{
"epoch": 4.206773618538325,
"grad_norm": 0.9482964277267456,
"learning_rate": 4.8171429490435285e-05,
"loss": 0.0003,
"num_input_tokens_seen": 1454976,
"step": 2360
},
{
"epoch": 4.215686274509804,
"grad_norm": 0.0003435949329286814,
"learning_rate": 4.815680205766304e-05,
"loss": 0.1751,
"num_input_tokens_seen": 1458624,
"step": 2365
},
{
"epoch": 4.224598930481283,
"grad_norm": 0.013822204433381557,
"learning_rate": 4.814211859142092e-05,
"loss": 0.0028,
"num_input_tokens_seen": 1462048,
"step": 2370
},
{
"epoch": 4.233511586452763,
"grad_norm": 0.024504275992512703,
"learning_rate": 4.812737912723908e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1465248,
"step": 2375
},
{
"epoch": 4.242424242424242,
"grad_norm": 0.00452042929828167,
"learning_rate": 4.811258370078324e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1467584,
"step": 2380
},
{
"epoch": 4.251336898395722,
"grad_norm": 2.1872692108154297,
"learning_rate": 4.809773234785449e-05,
"loss": 0.089,
"num_input_tokens_seen": 1470592,
"step": 2385
},
{
"epoch": 4.260249554367201,
"grad_norm": 2.717146158218384,
"learning_rate": 4.8082825104389264e-05,
"loss": 0.0027,
"num_input_tokens_seen": 1473696,
"step": 2390
},
{
"epoch": 4.269162210338681,
"grad_norm": 0.06081445887684822,
"learning_rate": 4.806786200645924e-05,
"loss": 0.0012,
"num_input_tokens_seen": 1477504,
"step": 2395
},
{
"epoch": 4.278074866310161,
"grad_norm": 0.05218374356627464,
"learning_rate": 4.8052843090271235e-05,
"loss": 0.0012,
"num_input_tokens_seen": 1481600,
"step": 2400
},
{
"epoch": 4.28698752228164,
"grad_norm": 0.012032375670969486,
"learning_rate": 4.803776839216715e-05,
"loss": 0.0014,
"num_input_tokens_seen": 1484768,
"step": 2405
},
{
"epoch": 4.295900178253119,
"grad_norm": 0.3018365204334259,
"learning_rate": 4.802263794862385e-05,
"loss": 0.0023,
"num_input_tokens_seen": 1488160,
"step": 2410
},
{
"epoch": 4.304812834224599,
"grad_norm": 0.006996179930865765,
"learning_rate": 4.8007451796253075e-05,
"loss": 0.0004,
"num_input_tokens_seen": 1491072,
"step": 2415
},
{
"epoch": 4.313725490196078,
"grad_norm": 0.006158452481031418,
"learning_rate": 4.7992209971801425e-05,
"loss": 0.0034,
"num_input_tokens_seen": 1494400,
"step": 2420
},
{
"epoch": 4.322638146167558,
"grad_norm": 0.005736066959798336,
"learning_rate": 4.797691251215014e-05,
"loss": 0.0002,
"num_input_tokens_seen": 1497504,
"step": 2425
},
{
"epoch": 4.331550802139038,
"grad_norm": 0.007838579826056957,
"learning_rate": 4.7961559454315126e-05,
"loss": 0.0003,
"num_input_tokens_seen": 1500928,
"step": 2430
},
{
"epoch": 4.340463458110517,
"grad_norm": 0.003752675373107195,
"learning_rate": 4.7946150835446805e-05,
"loss": 0.0002,
"num_input_tokens_seen": 1503936,
"step": 2435
},
{
"epoch": 4.349376114081997,
"grad_norm": 0.04063476249575615,
"learning_rate": 4.7930686692830064e-05,
"loss": 0.0002,
"num_input_tokens_seen": 1506624,
"step": 2440
},
{
"epoch": 4.358288770053476,
"grad_norm": 0.003907045815140009,
"learning_rate": 4.79151670638841e-05,
"loss": 0.0004,
"num_input_tokens_seen": 1509792,
"step": 2445
},
{
"epoch": 4.367201426024955,
"grad_norm": 0.0013991671148687601,
"learning_rate": 4.789959198616243e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1513024,
"step": 2450
},
{
"epoch": 4.376114081996435,
"grad_norm": 0.002485500182956457,
"learning_rate": 4.7883961497352686e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1515936,
"step": 2455
},
{
"epoch": 4.385026737967914,
"grad_norm": 0.00234747352078557,
"learning_rate": 4.786827563527663e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1519424,
"step": 2460
},
{
"epoch": 4.393939393939394,
"grad_norm": 0.003072569379583001,
"learning_rate": 4.785253443788997e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1521856,
"step": 2465
},
{
"epoch": 4.402852049910873,
"grad_norm": 0.001119230524636805,
"learning_rate": 4.783673794328234e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1525056,
"step": 2470
},
{
"epoch": 4.411764705882353,
"grad_norm": 0.003584908088669181,
"learning_rate": 4.7820886189677175e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1528640,
"step": 2475
},
{
"epoch": 4.420677361853833,
"grad_norm": 0.0011027141008526087,
"learning_rate": 4.780497921543161e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1531424,
"step": 2480
},
{
"epoch": 4.429590017825312,
"grad_norm": 0.0016615098575130105,
"learning_rate": 4.7789017059036413e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1534848,
"step": 2485
},
{
"epoch": 4.438502673796791,
"grad_norm": 0.0021322835236787796,
"learning_rate": 4.777299975911587e-05,
"loss": 0.0002,
"num_input_tokens_seen": 1537568,
"step": 2490
},
{
"epoch": 4.447415329768271,
"grad_norm": 0.0021837984677404165,
"learning_rate": 4.775692735442769e-05,
"loss": 0.0,
"num_input_tokens_seen": 1540192,
"step": 2495
},
{
"epoch": 4.45632798573975,
"grad_norm": 0.002096204087138176,
"learning_rate": 4.774079988386296e-05,
"loss": 0.0,
"num_input_tokens_seen": 1543552,
"step": 2500
},
{
"epoch": 4.46524064171123,
"grad_norm": 4.282310485839844,
"learning_rate": 4.772461738644597e-05,
"loss": 0.0018,
"num_input_tokens_seen": 1546368,
"step": 2505
},
{
"epoch": 4.47415329768271,
"grad_norm": 0.0036549854557961226,
"learning_rate": 4.7708379901334184e-05,
"loss": 0.0,
"num_input_tokens_seen": 1549440,
"step": 2510
},
{
"epoch": 4.483065953654189,
"grad_norm": 0.0010477956384420395,
"learning_rate": 4.76920874678181e-05,
"loss": 0.0,
"num_input_tokens_seen": 1552864,
"step": 2515
},
{
"epoch": 4.491978609625669,
"grad_norm": 0.005400074180215597,
"learning_rate": 4.767574012532122e-05,
"loss": 0.0,
"num_input_tokens_seen": 1555968,
"step": 2520
},
{
"epoch": 4.500891265597148,
"grad_norm": 0.0010502905352041125,
"learning_rate": 4.765933791339985e-05,
"loss": 0.0,
"num_input_tokens_seen": 1558464,
"step": 2525
},
{
"epoch": 4.509803921568627,
"grad_norm": 0.0023418355267494917,
"learning_rate": 4.7642880871743124e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1561472,
"step": 2530
},
{
"epoch": 4.518716577540107,
"grad_norm": 0.0029139232356101274,
"learning_rate": 4.762636904017281e-05,
"loss": 0.0,
"num_input_tokens_seen": 1564960,
"step": 2535
},
{
"epoch": 4.527629233511586,
"grad_norm": 0.001190957729704678,
"learning_rate": 4.760980245864329e-05,
"loss": 0.0,
"num_input_tokens_seen": 1567744,
"step": 2540
},
{
"epoch": 4.536541889483066,
"grad_norm": 0.009031428024172783,
"learning_rate": 4.759318116724138e-05,
"loss": 0.0,
"num_input_tokens_seen": 1571552,
"step": 2545
},
{
"epoch": 4.545454545454545,
"grad_norm": 0.003633075626567006,
"learning_rate": 4.757650520618632e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1575072,
"step": 2550
},
{
"epoch": 4.554367201426025,
"grad_norm": 0.005398147739470005,
"learning_rate": 4.755977461582961e-05,
"loss": 0.0598,
"num_input_tokens_seen": 1578272,
"step": 2555
},
{
"epoch": 4.563279857397505,
"grad_norm": 0.06160321831703186,
"learning_rate": 4.754298943665496e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1581504,
"step": 2560
},
{
"epoch": 4.572192513368984,
"grad_norm": 0.0008511216146871448,
"learning_rate": 4.752614970927817e-05,
"loss": 0.0,
"num_input_tokens_seen": 1584896,
"step": 2565
},
{
"epoch": 4.581105169340463,
"grad_norm": 0.0062278094701468945,
"learning_rate": 4.750925547444699e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1588384,
"step": 2570
},
{
"epoch": 4.590017825311943,
"grad_norm": 34.354591369628906,
"learning_rate": 4.749230677304114e-05,
"loss": 0.0169,
"num_input_tokens_seen": 1590976,
"step": 2575
},
{
"epoch": 4.598930481283422,
"grad_norm": 0.0010839140741154552,
"learning_rate": 4.7475303646072054e-05,
"loss": 0.0,
"num_input_tokens_seen": 1594144,
"step": 2580
},
{
"epoch": 4.607843137254902,
"grad_norm": 0.0006140993209555745,
"learning_rate": 4.7458246134682926e-05,
"loss": 0.0,
"num_input_tokens_seen": 1597664,
"step": 2585
},
{
"epoch": 4.616755793226382,
"grad_norm": 0.0008959631086327136,
"learning_rate": 4.744113428014851e-05,
"loss": 0.0,
"num_input_tokens_seen": 1601216,
"step": 2590
},
{
"epoch": 4.625668449197861,
"grad_norm": 0.010111883282661438,
"learning_rate": 4.7423968123875076e-05,
"loss": 0.0,
"num_input_tokens_seen": 1604288,
"step": 2595
},
{
"epoch": 4.634581105169341,
"grad_norm": 0.006966453976929188,
"learning_rate": 4.740674770740027e-05,
"loss": 0.0013,
"num_input_tokens_seen": 1607840,
"step": 2600
},
{
"epoch": 4.64349376114082,
"grad_norm": 0.006527449004352093,
"learning_rate": 4.738947307239305e-05,
"loss": 0.0987,
"num_input_tokens_seen": 1610752,
"step": 2605
},
{
"epoch": 4.652406417112299,
"grad_norm": 0.005451792385429144,
"learning_rate": 4.737214426065355e-05,
"loss": 0.0002,
"num_input_tokens_seen": 1613536,
"step": 2610
},
{
"epoch": 4.661319073083779,
"grad_norm": 0.015521319583058357,
"learning_rate": 4.735476131411304e-05,
"loss": 0.0007,
"num_input_tokens_seen": 1616416,
"step": 2615
},
{
"epoch": 4.670231729055258,
"grad_norm": 0.016396639868617058,
"learning_rate": 4.733732427483373e-05,
"loss": 0.0349,
"num_input_tokens_seen": 1619840,
"step": 2620
},
{
"epoch": 4.6791443850267385,
"grad_norm": 0.004544154740869999,
"learning_rate": 4.731983318500875e-05,
"loss": 0.0002,
"num_input_tokens_seen": 1622752,
"step": 2625
},
{
"epoch": 4.688057040998218,
"grad_norm": 0.009829314425587654,
"learning_rate": 4.730228808696201e-05,
"loss": 0.0004,
"num_input_tokens_seen": 1625120,
"step": 2630
},
{
"epoch": 4.696969696969697,
"grad_norm": 0.008239485323429108,
"learning_rate": 4.728468902314812e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1628896,
"step": 2635
},
{
"epoch": 4.705882352941177,
"grad_norm": 3.9137890338897705,
"learning_rate": 4.726703603615224e-05,
"loss": 0.0019,
"num_input_tokens_seen": 1632352,
"step": 2640
},
{
"epoch": 4.714795008912656,
"grad_norm": 0.002351184142753482,
"learning_rate": 4.724932916869005e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1635616,
"step": 2645
},
{
"epoch": 4.723707664884135,
"grad_norm": 0.02050667069852352,
"learning_rate": 4.7231568463607576e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1638304,
"step": 2650
},
{
"epoch": 4.732620320855615,
"grad_norm": 0.0012839463306590915,
"learning_rate": 4.721375396388113e-05,
"loss": 0.0474,
"num_input_tokens_seen": 1642496,
"step": 2655
},
{
"epoch": 4.741532976827094,
"grad_norm": 0.0013566635316237807,
"learning_rate": 4.719588571261721e-05,
"loss": 0.0,
"num_input_tokens_seen": 1645344,
"step": 2660
},
{
"epoch": 4.750445632798574,
"grad_norm": 0.0050344159826636314,
"learning_rate": 4.7177963753052345e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1647552,
"step": 2665
},
{
"epoch": 4.759358288770054,
"grad_norm": 0.0009222656372003257,
"learning_rate": 4.715998812855305e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1650528,
"step": 2670
},
{
"epoch": 4.768270944741533,
"grad_norm": 0.004403305239975452,
"learning_rate": 4.7141958882615665e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1654016,
"step": 2675
},
{
"epoch": 4.777183600713013,
"grad_norm": 0.0012677984777837992,
"learning_rate": 4.7123876058866315e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1656832,
"step": 2680
},
{
"epoch": 4.786096256684492,
"grad_norm": 0.00171990180388093,
"learning_rate": 4.710573970106076e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1659552,
"step": 2685
},
{
"epoch": 4.795008912655971,
"grad_norm": 10.418574333190918,
"learning_rate": 4.7087549853084286e-05,
"loss": 0.0027,
"num_input_tokens_seen": 1662944,
"step": 2690
},
{
"epoch": 4.803921568627451,
"grad_norm": 0.0004727148625534028,
"learning_rate": 4.706930655895163e-05,
"loss": 0.0,
"num_input_tokens_seen": 1665312,
"step": 2695
},
{
"epoch": 4.81283422459893,
"grad_norm": 0.000861368898767978,
"learning_rate": 4.7051009862806834e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1669312,
"step": 2700
},
{
"epoch": 4.8217468805704105,
"grad_norm": 0.0034621551167219877,
"learning_rate": 4.703265980892316e-05,
"loss": 0.0014,
"num_input_tokens_seen": 1672384,
"step": 2705
},
{
"epoch": 4.83065953654189,
"grad_norm": 0.0034683283884078264,
"learning_rate": 4.701425644170302e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1675552,
"step": 2710
},
{
"epoch": 4.839572192513369,
"grad_norm": 0.00047382700722664595,
"learning_rate": 4.699579980567776e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1678208,
"step": 2715
},
{
"epoch": 4.848484848484849,
"grad_norm": 0.007945065386593342,
"learning_rate": 4.697728994550771e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1681120,
"step": 2720
},
{
"epoch": 4.857397504456328,
"grad_norm": 0.0005510263727046549,
"learning_rate": 4.6958726905981906e-05,
"loss": 0.0,
"num_input_tokens_seen": 1684256,
"step": 2725
},
{
"epoch": 4.866310160427807,
"grad_norm": 0.0004645136359613389,
"learning_rate": 4.694011073201812e-05,
"loss": 0.0,
"num_input_tokens_seen": 1687328,
"step": 2730
},
{
"epoch": 4.875222816399287,
"grad_norm": 0.0005562491132877767,
"learning_rate": 4.6921441468662666e-05,
"loss": 0.0,
"num_input_tokens_seen": 1689824,
"step": 2735
},
{
"epoch": 4.884135472370766,
"grad_norm": 0.00042231017141602933,
"learning_rate": 4.6902719161090345e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1692896,
"step": 2740
},
{
"epoch": 4.893048128342246,
"grad_norm": 0.0009743515984155238,
"learning_rate": 4.688394385460428e-05,
"loss": 0.0,
"num_input_tokens_seen": 1695584,
"step": 2745
},
{
"epoch": 4.901960784313726,
"grad_norm": 0.00040473334956914186,
"learning_rate": 4.6865115594635866e-05,
"loss": 0.0,
"num_input_tokens_seen": 1698784,
"step": 2750
},
{
"epoch": 4.910873440285205,
"grad_norm": 0.0005721452180296183,
"learning_rate": 4.684623442674463e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1702144,
"step": 2755
},
{
"epoch": 4.919786096256685,
"grad_norm": 0.0032837442122399807,
"learning_rate": 4.682730039661809e-05,
"loss": 0.0,
"num_input_tokens_seen": 1704608,
"step": 2760
},
{
"epoch": 4.928698752228164,
"grad_norm": 0.0010922816582024097,
"learning_rate": 4.680831355007172e-05,
"loss": 0.0,
"num_input_tokens_seen": 1707584,
"step": 2765
},
{
"epoch": 4.937611408199643,
"grad_norm": 0.000615122087765485,
"learning_rate": 4.6789273933048766e-05,
"loss": 0.0458,
"num_input_tokens_seen": 1711552,
"step": 2770
},
{
"epoch": 4.946524064171123,
"grad_norm": 0.0006509011727757752,
"learning_rate": 4.677018159162018e-05,
"loss": 0.0,
"num_input_tokens_seen": 1714944,
"step": 2775
},
{
"epoch": 4.955436720142602,
"grad_norm": 0.0003759993414860219,
"learning_rate": 4.675103657198449e-05,
"loss": 0.0,
"num_input_tokens_seen": 1718336,
"step": 2780
},
{
"epoch": 4.9643493761140824,
"grad_norm": 0.09884090721607208,
"learning_rate": 4.6731838920467684e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1721728,
"step": 2785
},
{
"epoch": 4.973262032085562,
"grad_norm": 0.0024623344652354717,
"learning_rate": 4.6712588683523114e-05,
"loss": 0.0,
"num_input_tokens_seen": 1724352,
"step": 2790
},
{
"epoch": 4.982174688057041,
"grad_norm": 0.00023198811686597764,
"learning_rate": 4.669328590773139e-05,
"loss": 0.0,
"num_input_tokens_seen": 1727552,
"step": 2795
},
{
"epoch": 4.991087344028521,
"grad_norm": 0.000825934752356261,
"learning_rate": 4.66739306398002e-05,
"loss": 0.0,
"num_input_tokens_seen": 1730144,
"step": 2800
},
{
"epoch": 5.0,
"grad_norm": 0.0022655415814369917,
"learning_rate": 4.665452292656431e-05,
"loss": 0.0,
"num_input_tokens_seen": 1732712,
"step": 2805
},
{
"epoch": 5.0,
"eval_loss": 0.1608305424451828,
"eval_runtime": 4.5855,
"eval_samples_per_second": 54.302,
"eval_steps_per_second": 13.739,
"num_input_tokens_seen": 1732712,
"step": 2805
},
{
"epoch": 5.008912655971479,
"grad_norm": 7.032632129266858e-05,
"learning_rate": 4.6635062814985374e-05,
"loss": 0.0,
"num_input_tokens_seen": 1736840,
"step": 2810
},
{
"epoch": 5.017825311942959,
"grad_norm": 0.00010113899043062702,
"learning_rate": 4.6615550352151804e-05,
"loss": 0.0912,
"num_input_tokens_seen": 1739720,
"step": 2815
},
{
"epoch": 5.026737967914438,
"grad_norm": 0.008052799850702286,
"learning_rate": 4.659598558527872e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1742568,
"step": 2820
},
{
"epoch": 5.035650623885918,
"grad_norm": 0.004323708824813366,
"learning_rate": 4.6576368561707794e-05,
"loss": 0.0002,
"num_input_tokens_seen": 1746088,
"step": 2825
},
{
"epoch": 5.044563279857398,
"grad_norm": 0.0015223105438053608,
"learning_rate": 4.6556699328907154e-05,
"loss": 0.0078,
"num_input_tokens_seen": 1749224,
"step": 2830
},
{
"epoch": 5.053475935828877,
"grad_norm": 0.020057303830981255,
"learning_rate": 4.653697793447125e-05,
"loss": 0.0002,
"num_input_tokens_seen": 1752392,
"step": 2835
},
{
"epoch": 5.062388591800357,
"grad_norm": 0.0012005399912595749,
"learning_rate": 4.651720442612076e-05,
"loss": 0.0006,
"num_input_tokens_seen": 1755912,
"step": 2840
},
{
"epoch": 5.071301247771836,
"grad_norm": 0.005722702946513891,
"learning_rate": 4.6497378851702456e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1758792,
"step": 2845
},
{
"epoch": 5.080213903743315,
"grad_norm": 0.00282498961314559,
"learning_rate": 4.6477501259189086e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1761960,
"step": 2850
},
{
"epoch": 5.089126559714795,
"grad_norm": 0.002525976160541177,
"learning_rate": 4.64575716966793e-05,
"loss": 0.0002,
"num_input_tokens_seen": 1765064,
"step": 2855
},
{
"epoch": 5.098039215686274,
"grad_norm": 0.0041107297874987125,
"learning_rate": 4.643759021239747e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1767720,
"step": 2860
},
{
"epoch": 5.106951871657754,
"grad_norm": 0.004421675577759743,
"learning_rate": 4.6417556854693636e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1770312,
"step": 2865
},
{
"epoch": 5.115864527629234,
"grad_norm": 0.001751912641339004,
"learning_rate": 4.639747167204332e-05,
"loss": 0.0267,
"num_input_tokens_seen": 1773352,
"step": 2870
},
{
"epoch": 5.124777183600713,
"grad_norm": 0.004455928225070238,
"learning_rate": 4.6377334713047473e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1775720,
"step": 2875
},
{
"epoch": 5.133689839572193,
"grad_norm": 0.0021583575289696455,
"learning_rate": 4.635714602643234e-05,
"loss": 0.0007,
"num_input_tokens_seen": 1779368,
"step": 2880
},
{
"epoch": 5.142602495543672,
"grad_norm": 0.0018912540981546044,
"learning_rate": 4.633690566104929e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1782664,
"step": 2885
},
{
"epoch": 5.151515151515151,
"grad_norm": 0.0013182173715904355,
"learning_rate": 4.631661366587481e-05,
"loss": 0.0,
"num_input_tokens_seen": 1785480,
"step": 2890
},
{
"epoch": 5.160427807486631,
"grad_norm": 0.01736585982143879,
"learning_rate": 4.629627009001024e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1789032,
"step": 2895
},
{
"epoch": 5.16934046345811,
"grad_norm": 0.000722629192750901,
"learning_rate": 4.62758749826818e-05,
"loss": 0.0002,
"num_input_tokens_seen": 1792008,
"step": 2900
},
{
"epoch": 5.17825311942959,
"grad_norm": 0.0014722676714882255,
"learning_rate": 4.625542839324036e-05,
"loss": 0.0,
"num_input_tokens_seen": 1794984,
"step": 2905
},
{
"epoch": 5.18716577540107,
"grad_norm": 0.0006010913057252765,
"learning_rate": 4.623493037116137e-05,
"loss": 0.0,
"num_input_tokens_seen": 1798728,
"step": 2910
},
{
"epoch": 5.196078431372549,
"grad_norm": 0.0027942806482315063,
"learning_rate": 4.621438096604475e-05,
"loss": 0.0751,
"num_input_tokens_seen": 1801896,
"step": 2915
},
{
"epoch": 5.204991087344029,
"grad_norm": 0.014903007075190544,
"learning_rate": 4.6193780227614744e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1804424,
"step": 2920
},
{
"epoch": 5.213903743315508,
"grad_norm": 0.0002501923299860209,
"learning_rate": 4.61731282057198e-05,
"loss": 0.0,
"num_input_tokens_seen": 1806952,
"step": 2925
},
{
"epoch": 5.222816399286987,
"grad_norm": 0.2811971604824066,
"learning_rate": 4.6152424950332486e-05,
"loss": 0.0002,
"num_input_tokens_seen": 1810696,
"step": 2930
},
{
"epoch": 5.231729055258467,
"grad_norm": 0.001552988076582551,
"learning_rate": 4.613167051154931e-05,
"loss": 0.0,
"num_input_tokens_seen": 1813480,
"step": 2935
},
{
"epoch": 5.240641711229946,
"grad_norm": 0.0007487252005375922,
"learning_rate": 4.6110864939590644e-05,
"loss": 0.0,
"num_input_tokens_seen": 1816808,
"step": 2940
},
{
"epoch": 5.249554367201426,
"grad_norm": 0.0006787029560655355,
"learning_rate": 4.609000828480059e-05,
"loss": 0.0,
"num_input_tokens_seen": 1819816,
"step": 2945
},
{
"epoch": 5.258467023172906,
"grad_norm": 0.002901742234826088,
"learning_rate": 4.606910059764687e-05,
"loss": 0.0,
"num_input_tokens_seen": 1822408,
"step": 2950
},
{
"epoch": 5.267379679144385,
"grad_norm": 0.000984704471193254,
"learning_rate": 4.604814192872065e-05,
"loss": 0.0,
"num_input_tokens_seen": 1825800,
"step": 2955
},
{
"epoch": 5.276292335115865,
"grad_norm": 0.0003694720799103379,
"learning_rate": 4.602713232873651e-05,
"loss": 0.0,
"num_input_tokens_seen": 1828904,
"step": 2960
},
{
"epoch": 5.285204991087344,
"grad_norm": 0.00025981373619288206,
"learning_rate": 4.600607184853224e-05,
"loss": 0.0,
"num_input_tokens_seen": 1832168,
"step": 2965
},
{
"epoch": 5.294117647058823,
"grad_norm": 0.00047363084740936756,
"learning_rate": 4.5984960539068754e-05,
"loss": 0.0,
"num_input_tokens_seen": 1835944,
"step": 2970
},
{
"epoch": 5.303030303030303,
"grad_norm": 0.0005336463218554854,
"learning_rate": 4.596379845142995e-05,
"loss": 0.0068,
"num_input_tokens_seen": 1839240,
"step": 2975
},
{
"epoch": 5.311942959001782,
"grad_norm": 0.0069955759681761265,
"learning_rate": 4.594258563682262e-05,
"loss": 0.0,
"num_input_tokens_seen": 1843176,
"step": 2980
},
{
"epoch": 5.320855614973262,
"grad_norm": 0.004931231494992971,
"learning_rate": 4.592132214657628e-05,
"loss": 0.0,
"num_input_tokens_seen": 1846632,
"step": 2985
},
{
"epoch": 5.329768270944742,
"grad_norm": 0.00035663697053678334,
"learning_rate": 4.590000803214307e-05,
"loss": 0.0,
"num_input_tokens_seen": 1849928,
"step": 2990
},
{
"epoch": 5.338680926916221,
"grad_norm": 5.2898729336448014e-05,
"learning_rate": 4.5878643345097644e-05,
"loss": 0.0,
"num_input_tokens_seen": 1853000,
"step": 2995
},
{
"epoch": 5.347593582887701,
"grad_norm": 0.04160798713564873,
"learning_rate": 4.585722813713701e-05,
"loss": 0.0006,
"num_input_tokens_seen": 1856040,
"step": 3000
},
{
"epoch": 5.35650623885918,
"grad_norm": 0.003238047007471323,
"learning_rate": 4.583576246008043e-05,
"loss": 0.0,
"num_input_tokens_seen": 1859400,
"step": 3005
},
{
"epoch": 5.365418894830659,
"grad_norm": 0.0002586895425338298,
"learning_rate": 4.581424636586929e-05,
"loss": 0.0,
"num_input_tokens_seen": 1863048,
"step": 3010
},
{
"epoch": 5.374331550802139,
"grad_norm": 0.00018354503845330328,
"learning_rate": 4.579267990656697e-05,
"loss": 0.0,
"num_input_tokens_seen": 1866376,
"step": 3015
},
{
"epoch": 5.383244206773618,
"grad_norm": 0.00047791743418201804,
"learning_rate": 4.577106313435873e-05,
"loss": 0.0,
"num_input_tokens_seen": 1869192,
"step": 3020
},
{
"epoch": 5.392156862745098,
"grad_norm": 0.000250328826950863,
"learning_rate": 4.574939610155155e-05,
"loss": 0.0,
"num_input_tokens_seen": 1872616,
"step": 3025
},
{
"epoch": 5.401069518716578,
"grad_norm": 0.0004770901578012854,
"learning_rate": 4.5727678860574055e-05,
"loss": 0.0,
"num_input_tokens_seen": 1875496,
"step": 3030
},
{
"epoch": 5.409982174688057,
"grad_norm": 0.00023706883075647056,
"learning_rate": 4.570591146397635e-05,
"loss": 0.0,
"num_input_tokens_seen": 1878760,
"step": 3035
},
{
"epoch": 5.418894830659537,
"grad_norm": 0.0003681717498693615,
"learning_rate": 4.568409396442991e-05,
"loss": 0.0,
"num_input_tokens_seen": 1881704,
"step": 3040
},
{
"epoch": 5.427807486631016,
"grad_norm": 0.00028255791403353214,
"learning_rate": 4.566222641472742e-05,
"loss": 0.0,
"num_input_tokens_seen": 1884744,
"step": 3045
},
{
"epoch": 5.436720142602495,
"grad_norm": 9.651204163674265e-05,
"learning_rate": 4.564030886778271e-05,
"loss": 0.0,
"num_input_tokens_seen": 1887144,
"step": 3050
},
{
"epoch": 5.445632798573975,
"grad_norm": 0.0002257092419313267,
"learning_rate": 4.561834137663056e-05,
"loss": 0.0,
"num_input_tokens_seen": 1889896,
"step": 3055
},
{
"epoch": 5.454545454545454,
"grad_norm": 0.021864645183086395,
"learning_rate": 4.5596323994426626e-05,
"loss": 0.0,
"num_input_tokens_seen": 1893256,
"step": 3060
},
{
"epoch": 5.463458110516934,
"grad_norm": 0.00025736223324202,
"learning_rate": 4.557425677444727e-05,
"loss": 0.0,
"num_input_tokens_seen": 1896712,
"step": 3065
},
{
"epoch": 5.472370766488414,
"grad_norm": 0.0003680383670143783,
"learning_rate": 4.555213977008946e-05,
"loss": 0.0,
"num_input_tokens_seen": 1899240,
"step": 3070
},
{
"epoch": 5.481283422459893,
"grad_norm": 0.0001519775396445766,
"learning_rate": 4.5529973034870624e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1901608,
"step": 3075
},
{
"epoch": 5.490196078431373,
"grad_norm": 0.0001408984389854595,
"learning_rate": 4.550775662242852e-05,
"loss": 0.0,
"num_input_tokens_seen": 1903848,
"step": 3080
},
{
"epoch": 5.499108734402852,
"grad_norm": 0.0014439078513532877,
"learning_rate": 4.5485490586521116e-05,
"loss": 0.0,
"num_input_tokens_seen": 1907400,
"step": 3085
},
{
"epoch": 5.508021390374331,
"grad_norm": 0.00016677154053468257,
"learning_rate": 4.546317498102648e-05,
"loss": 0.0,
"num_input_tokens_seen": 1910248,
"step": 3090
},
{
"epoch": 5.516934046345811,
"grad_norm": 0.00015338376397266984,
"learning_rate": 4.544080985994258e-05,
"loss": 0.0,
"num_input_tokens_seen": 1913544,
"step": 3095
},
{
"epoch": 5.52584670231729,
"grad_norm": 0.0011399354552850127,
"learning_rate": 4.541839527738723e-05,
"loss": 0.0,
"num_input_tokens_seen": 1916104,
"step": 3100
},
{
"epoch": 5.53475935828877,
"grad_norm": 0.00014922766422387213,
"learning_rate": 4.539593128759792e-05,
"loss": 0.0,
"num_input_tokens_seen": 1919464,
"step": 3105
},
{
"epoch": 5.54367201426025,
"grad_norm": 0.0003580522316042334,
"learning_rate": 4.53734179449317e-05,
"loss": 0.0,
"num_input_tokens_seen": 1922152,
"step": 3110
},
{
"epoch": 5.552584670231729,
"grad_norm": 0.0010872179409489036,
"learning_rate": 4.535085530386503e-05,
"loss": 0.0,
"num_input_tokens_seen": 1925512,
"step": 3115
},
{
"epoch": 5.561497326203209,
"grad_norm": 0.0004132896719966084,
"learning_rate": 4.5328243418993665e-05,
"loss": 0.0,
"num_input_tokens_seen": 1928712,
"step": 3120
},
{
"epoch": 5.570409982174688,
"grad_norm": 0.0005687960074283183,
"learning_rate": 4.5305582345032514e-05,
"loss": 0.0,
"num_input_tokens_seen": 1931784,
"step": 3125
},
{
"epoch": 5.579322638146167,
"grad_norm": 0.00024347477301489562,
"learning_rate": 4.5282872136815516e-05,
"loss": 0.0,
"num_input_tokens_seen": 1934888,
"step": 3130
},
{
"epoch": 5.588235294117647,
"grad_norm": 0.06657089293003082,
"learning_rate": 4.526011284929549e-05,
"loss": 0.0,
"num_input_tokens_seen": 1937576,
"step": 3135
},
{
"epoch": 5.597147950089127,
"grad_norm": 0.00020688715449068695,
"learning_rate": 4.523730453754405e-05,
"loss": 0.0,
"num_input_tokens_seen": 1940392,
"step": 3140
},
{
"epoch": 5.606060606060606,
"grad_norm": 0.00039311559521593153,
"learning_rate": 4.521444725675137e-05,
"loss": 0.0,
"num_input_tokens_seen": 1944040,
"step": 3145
},
{
"epoch": 5.614973262032086,
"grad_norm": 0.0001793958363123238,
"learning_rate": 4.5191541062226186e-05,
"loss": 0.0,
"num_input_tokens_seen": 1946920,
"step": 3150
},
{
"epoch": 5.623885918003565,
"grad_norm": 0.0002715518930926919,
"learning_rate": 4.5168586009395555e-05,
"loss": 0.0001,
"num_input_tokens_seen": 1950856,
"step": 3155
},
{
"epoch": 5.632798573975045,
"grad_norm": 0.0005141710862517357,
"learning_rate": 4.514558215380476e-05,
"loss": 0.0,
"num_input_tokens_seen": 1954312,
"step": 3160
},
{
"epoch": 5.641711229946524,
"grad_norm": 0.00017174534150399268,
"learning_rate": 4.512252955111719e-05,
"loss": 0.0,
"num_input_tokens_seen": 1957480,
"step": 3165
},
{
"epoch": 5.650623885918003,
"grad_norm": 9.127831435762346e-05,
"learning_rate": 4.5099428257114175e-05,
"loss": 0.0,
"num_input_tokens_seen": 1960840,
"step": 3170
},
{
"epoch": 5.659536541889483,
"grad_norm": 0.0001852322748163715,
"learning_rate": 4.507627832769486e-05,
"loss": 0.0,
"num_input_tokens_seen": 1964328,
"step": 3175
},
{
"epoch": 5.668449197860962,
"grad_norm": 0.00013597046199720353,
"learning_rate": 4.50530798188761e-05,
"loss": 0.0,
"num_input_tokens_seen": 1966920,
"step": 3180
},
{
"epoch": 5.677361853832442,
"grad_norm": 9.64289138210006e-05,
"learning_rate": 4.502983278679227e-05,
"loss": 0.0,
"num_input_tokens_seen": 1970696,
"step": 3185
},
{
"epoch": 5.686274509803922,
"grad_norm": 0.0005941576091572642,
"learning_rate": 4.5006537287695186e-05,
"loss": 0.0,
"num_input_tokens_seen": 1973800,
"step": 3190
},
{
"epoch": 5.695187165775401,
"grad_norm": 0.0005220057209953666,
"learning_rate": 4.498319337795392e-05,
"loss": 0.0,
"num_input_tokens_seen": 1976744,
"step": 3195
},
{
"epoch": 5.704099821746881,
"grad_norm": 0.0002706492959987372,
"learning_rate": 4.495980111405471e-05,
"loss": 0.0,
"num_input_tokens_seen": 1980232,
"step": 3200
},
{
"epoch": 5.71301247771836,
"grad_norm": 0.00028276382363401353,
"learning_rate": 4.493636055260077e-05,
"loss": 0.0,
"num_input_tokens_seen": 1982984,
"step": 3205
},
{
"epoch": 5.721925133689839,
"grad_norm": 0.0006307061994448304,
"learning_rate": 4.491287175031218e-05,
"loss": 0.0,
"num_input_tokens_seen": 1986504,
"step": 3210
},
{
"epoch": 5.730837789661319,
"grad_norm": 0.00034339685225859284,
"learning_rate": 4.488933476402579e-05,
"loss": 0.0,
"num_input_tokens_seen": 1989384,
"step": 3215
},
{
"epoch": 5.739750445632799,
"grad_norm": 0.0002526050084270537,
"learning_rate": 4.4865749650695e-05,
"loss": 0.0,
"num_input_tokens_seen": 1992168,
"step": 3220
},
{
"epoch": 5.748663101604278,
"grad_norm": 0.0007074022432789207,
"learning_rate": 4.4842116467389696e-05,
"loss": 0.0,
"num_input_tokens_seen": 1996008,
"step": 3225
},
{
"epoch": 5.757575757575758,
"grad_norm": 0.0002924671280197799,
"learning_rate": 4.4818435271296054e-05,
"loss": 0.0,
"num_input_tokens_seen": 1999144,
"step": 3230
},
{
"epoch": 5.766488413547237,
"grad_norm": 9.665504330769181e-05,
"learning_rate": 4.4794706119716455e-05,
"loss": 0.0,
"num_input_tokens_seen": 2002504,
"step": 3235
},
{
"epoch": 5.775401069518717,
"grad_norm": 0.0006428760825656354,
"learning_rate": 4.47709290700693e-05,
"loss": 0.0,
"num_input_tokens_seen": 2004520,
"step": 3240
},
{
"epoch": 5.784313725490196,
"grad_norm": 0.00017735113215167075,
"learning_rate": 4.474710417988889e-05,
"loss": 0.0,
"num_input_tokens_seen": 2007368,
"step": 3245
},
{
"epoch": 5.793226381461675,
"grad_norm": 0.0002311782300239429,
"learning_rate": 4.4723231506825305e-05,
"loss": 0.0,
"num_input_tokens_seen": 2010600,
"step": 3250
},
{
"epoch": 5.802139037433155,
"grad_norm": 0.06747859716415405,
"learning_rate": 4.469931110864424e-05,
"loss": 0.0,
"num_input_tokens_seen": 2013928,
"step": 3255
},
{
"epoch": 5.811051693404634,
"grad_norm": 0.0002444138517603278,
"learning_rate": 4.4675343043226856e-05,
"loss": 0.0,
"num_input_tokens_seen": 2016968,
"step": 3260
},
{
"epoch": 5.819964349376114,
"grad_norm": 0.00021666847169399261,
"learning_rate": 4.465132736856969e-05,
"loss": 0.0,
"num_input_tokens_seen": 2019784,
"step": 3265
},
{
"epoch": 5.828877005347594,
"grad_norm": 5.657947258441709e-05,
"learning_rate": 4.462726414278444e-05,
"loss": 0.0,
"num_input_tokens_seen": 2022888,
"step": 3270
},
{
"epoch": 5.837789661319073,
"grad_norm": 0.00012392383359838277,
"learning_rate": 4.460315342409791e-05,
"loss": 0.0,
"num_input_tokens_seen": 2025512,
"step": 3275
},
{
"epoch": 5.846702317290553,
"grad_norm": 0.00028178084176033735,
"learning_rate": 4.457899527085178e-05,
"loss": 0.0,
"num_input_tokens_seen": 2028456,
"step": 3280
},
{
"epoch": 5.855614973262032,
"grad_norm": 0.0003765980654861778,
"learning_rate": 4.455478974150255e-05,
"loss": 0.0,
"num_input_tokens_seen": 2031048,
"step": 3285
},
{
"epoch": 5.864527629233511,
"grad_norm": 0.00023475800117012113,
"learning_rate": 4.453053689462131e-05,
"loss": 0.0,
"num_input_tokens_seen": 2034824,
"step": 3290
},
{
"epoch": 5.873440285204991,
"grad_norm": 0.00026866758707910776,
"learning_rate": 4.4506236788893706e-05,
"loss": 0.0,
"num_input_tokens_seen": 2037320,
"step": 3295
},
{
"epoch": 5.882352941176471,
"grad_norm": 0.0006037325947545469,
"learning_rate": 4.44818894831197e-05,
"loss": 0.0,
"num_input_tokens_seen": 2040392,
"step": 3300
},
{
"epoch": 5.89126559714795,
"grad_norm": 0.0043501826003193855,
"learning_rate": 4.4457495036213456e-05,
"loss": 0.0,
"num_input_tokens_seen": 2042952,
"step": 3305
},
{
"epoch": 5.90017825311943,
"grad_norm": 7.099113281583413e-05,
"learning_rate": 4.443305350720324e-05,
"loss": 0.0,
"num_input_tokens_seen": 2046664,
"step": 3310
},
{
"epoch": 5.909090909090909,
"grad_norm": 0.00012569865793921053,
"learning_rate": 4.440856495523122e-05,
"loss": 0.0,
"num_input_tokens_seen": 2049256,
"step": 3315
},
{
"epoch": 5.918003565062389,
"grad_norm": 0.00024479979765601456,
"learning_rate": 4.438402943955336e-05,
"loss": 0.0,
"num_input_tokens_seen": 2052488,
"step": 3320
},
{
"epoch": 5.926916221033868,
"grad_norm": 0.000506595300976187,
"learning_rate": 4.4359447019539264e-05,
"loss": 0.0,
"num_input_tokens_seen": 2055592,
"step": 3325
},
{
"epoch": 5.935828877005347,
"grad_norm": 0.0002587987983133644,
"learning_rate": 4.433481775467202e-05,
"loss": 0.0,
"num_input_tokens_seen": 2058952,
"step": 3330
},
{
"epoch": 5.944741532976827,
"grad_norm": 0.00153339805547148,
"learning_rate": 4.4310141704548094e-05,
"loss": 0.0,
"num_input_tokens_seen": 2061768,
"step": 3335
},
{
"epoch": 5.953654188948306,
"grad_norm": 0.0005379040958359838,
"learning_rate": 4.428541892887712e-05,
"loss": 0.0,
"num_input_tokens_seen": 2064936,
"step": 3340
},
{
"epoch": 5.962566844919786,
"grad_norm": 0.0003168246184941381,
"learning_rate": 4.426064948748183e-05,
"loss": 0.0,
"num_input_tokens_seen": 2068424,
"step": 3345
},
{
"epoch": 5.971479500891266,
"grad_norm": 0.00040106987580657005,
"learning_rate": 4.423583344029786e-05,
"loss": 0.0,
"num_input_tokens_seen": 2071528,
"step": 3350
},
{
"epoch": 5.980392156862745,
"grad_norm": 0.0001550520391901955,
"learning_rate": 4.4210970847373636e-05,
"loss": 0.0,
"num_input_tokens_seen": 2074600,
"step": 3355
},
{
"epoch": 5.989304812834225,
"grad_norm": 0.00010328602365916595,
"learning_rate": 4.4186061768870184e-05,
"loss": 0.0,
"num_input_tokens_seen": 2077544,
"step": 3360
},
{
"epoch": 5.998217468805704,
"grad_norm": 4.394166899146512e-05,
"learning_rate": 4.416110626506105e-05,
"loss": 0.0,
"num_input_tokens_seen": 2080008,
"step": 3365
},
{
"epoch": 6.0,
"eval_loss": 0.18822118639945984,
"eval_runtime": 4.5853,
"eval_samples_per_second": 54.303,
"eval_steps_per_second": 13.739,
"num_input_tokens_seen": 2080184,
"step": 3366
},
{
"epoch": 6.007130124777183,
"grad_norm": 8.21559369796887e-05,
"learning_rate": 4.4136104396332066e-05,
"loss": 0.0,
"num_input_tokens_seen": 2083224,
"step": 3370
},
{
"epoch": 6.016042780748663,
"grad_norm": 0.00019038471509702504,
"learning_rate": 4.4111056223181315e-05,
"loss": 0.0,
"num_input_tokens_seen": 2086008,
"step": 3375
},
{
"epoch": 6.024955436720143,
"grad_norm": 5.39618413313292e-05,
"learning_rate": 4.408596180621889e-05,
"loss": 0.0,
"num_input_tokens_seen": 2089880,
"step": 3380
},
{
"epoch": 6.033868092691622,
"grad_norm": 0.00011664695193758234,
"learning_rate": 4.406082120616677e-05,
"loss": 0.0,
"num_input_tokens_seen": 2093336,
"step": 3385
},
{
"epoch": 6.042780748663102,
"grad_norm": 0.00011814136814791709,
"learning_rate": 4.403563448385872e-05,
"loss": 0.0,
"num_input_tokens_seen": 2095800,
"step": 3390
},
{
"epoch": 6.051693404634581,
"grad_norm": 0.00034572480944916606,
"learning_rate": 4.401040170024009e-05,
"loss": 0.0,
"num_input_tokens_seen": 2099000,
"step": 3395
},
{
"epoch": 6.0606060606060606,
"grad_norm": 0.00010606726573314518,
"learning_rate": 4.398512291636768e-05,
"loss": 0.0,
"num_input_tokens_seen": 2102264,
"step": 3400
},
{
"epoch": 6.06951871657754,
"grad_norm": 0.00011406264820834622,
"learning_rate": 4.395979819340961e-05,
"loss": 0.0,
"num_input_tokens_seen": 2105048,
"step": 3405
},
{
"epoch": 6.078431372549019,
"grad_norm": 0.00042664259672164917,
"learning_rate": 4.3934427592645166e-05,
"loss": 0.0,
"num_input_tokens_seen": 2107480,
"step": 3410
},
{
"epoch": 6.087344028520499,
"grad_norm": 0.00013146565470378846,
"learning_rate": 4.390901117546463e-05,
"loss": 0.0,
"num_input_tokens_seen": 2109688,
"step": 3415
},
{
"epoch": 6.096256684491979,
"grad_norm": 0.00023358648468274623,
"learning_rate": 4.388354900336916e-05,
"loss": 0.0,
"num_input_tokens_seen": 2113464,
"step": 3420
},
{
"epoch": 6.105169340463458,
"grad_norm": 4.177187292953022e-05,
"learning_rate": 4.385804113797062e-05,
"loss": 0.0,
"num_input_tokens_seen": 2116408,
"step": 3425
},
{
"epoch": 6.114081996434938,
"grad_norm": 0.00010631237091729417,
"learning_rate": 4.3832487640991446e-05,
"loss": 0.0,
"num_input_tokens_seen": 2119768,
"step": 3430
},
{
"epoch": 6.122994652406417,
"grad_norm": 4.5598495489684865e-05,
"learning_rate": 4.3806888574264495e-05,
"loss": 0.0,
"num_input_tokens_seen": 2122680,
"step": 3435
},
{
"epoch": 6.1319073083778965,
"grad_norm": 7.063472730806097e-05,
"learning_rate": 4.378124399973287e-05,
"loss": 0.0,
"num_input_tokens_seen": 2125816,
"step": 3440
},
{
"epoch": 6.140819964349376,
"grad_norm": 0.00018453726079314947,
"learning_rate": 4.375555397944983e-05,
"loss": 0.0,
"num_input_tokens_seen": 2128472,
"step": 3445
},
{
"epoch": 6.149732620320855,
"grad_norm": 0.0016097086481750011,
"learning_rate": 4.372981857557856e-05,
"loss": 0.0,
"num_input_tokens_seen": 2130776,
"step": 3450
},
{
"epoch": 6.158645276292335,
"grad_norm": 0.00010451207344885916,
"learning_rate": 4.3704037850392085e-05,
"loss": 0.0,
"num_input_tokens_seen": 2134488,
"step": 3455
},
{
"epoch": 6.167557932263815,
"grad_norm": 0.00021727255079895258,
"learning_rate": 4.367821186627309e-05,
"loss": 0.0,
"num_input_tokens_seen": 2137528,
"step": 3460
},
{
"epoch": 6.176470588235294,
"grad_norm": 0.00022375237313099205,
"learning_rate": 4.365234068571377e-05,
"loss": 0.0,
"num_input_tokens_seen": 2141400,
"step": 3465
},
{
"epoch": 6.185383244206774,
"grad_norm": 0.00022818404249846935,
"learning_rate": 4.36264243713157e-05,
"loss": 0.0,
"num_input_tokens_seen": 2144792,
"step": 3470
},
{
"epoch": 6.194295900178253,
"grad_norm": 0.00013080937787890434,
"learning_rate": 4.360046298578965e-05,
"loss": 0.0,
"num_input_tokens_seen": 2148536,
"step": 3475
},
{
"epoch": 6.2032085561497325,
"grad_norm": 8.40335269458592e-05,
"learning_rate": 4.357445659195545e-05,
"loss": 0.0,
"num_input_tokens_seen": 2151736,
"step": 3480
},
{
"epoch": 6.212121212121212,
"grad_norm": 0.00010684873268473893,
"learning_rate": 4.354840525274185e-05,
"loss": 0.0,
"num_input_tokens_seen": 2154744,
"step": 3485
},
{
"epoch": 6.221033868092691,
"grad_norm": 0.00023128798056859523,
"learning_rate": 4.352230903118636e-05,
"loss": 0.0,
"num_input_tokens_seen": 2157784,
"step": 3490
},
{
"epoch": 6.229946524064171,
"grad_norm": 0.00011345902748871595,
"learning_rate": 4.3496167990435065e-05,
"loss": 0.0,
"num_input_tokens_seen": 2161112,
"step": 3495
},
{
"epoch": 6.238859180035651,
"grad_norm": 5.867306754225865e-05,
"learning_rate": 4.346998219374253e-05,
"loss": 0.0,
"num_input_tokens_seen": 2164056,
"step": 3500
},
{
"epoch": 6.24777183600713,
"grad_norm": 0.00016827252693474293,
"learning_rate": 4.344375170447162e-05,
"loss": 0.0,
"num_input_tokens_seen": 2167352,
"step": 3505
},
{
"epoch": 6.25668449197861,
"grad_norm": 0.0012681673979386687,
"learning_rate": 4.341747658609331e-05,
"loss": 0.0,
"num_input_tokens_seen": 2170008,
"step": 3510
},
{
"epoch": 6.265597147950089,
"grad_norm": 0.00011075458314735442,
"learning_rate": 4.3391156902186615e-05,
"loss": 0.0,
"num_input_tokens_seen": 2173912,
"step": 3515
},
{
"epoch": 6.2745098039215685,
"grad_norm": 0.0001631032646400854,
"learning_rate": 4.336479271643833e-05,
"loss": 0.0,
"num_input_tokens_seen": 2177208,
"step": 3520
},
{
"epoch": 6.283422459893048,
"grad_norm": 0.00010968335845973343,
"learning_rate": 4.333838409264299e-05,
"loss": 0.0,
"num_input_tokens_seen": 2180568,
"step": 3525
},
{
"epoch": 6.292335115864527,
"grad_norm": 0.00011602386803133413,
"learning_rate": 4.331193109470262e-05,
"loss": 0.0,
"num_input_tokens_seen": 2183640,
"step": 3530
},
{
"epoch": 6.301247771836007,
"grad_norm": 9.8639284260571e-05,
"learning_rate": 4.328543378662664e-05,
"loss": 0.0,
"num_input_tokens_seen": 2186840,
"step": 3535
},
{
"epoch": 6.310160427807487,
"grad_norm": 0.0004059053317178041,
"learning_rate": 4.3258892232531664e-05,
"loss": 0.0,
"num_input_tokens_seen": 2189400,
"step": 3540
},
{
"epoch": 6.319073083778966,
"grad_norm": 4.721015284303576e-05,
"learning_rate": 4.3232306496641396e-05,
"loss": 0.0,
"num_input_tokens_seen": 2192600,
"step": 3545
},
{
"epoch": 6.327985739750446,
"grad_norm": 0.00016030446568038315,
"learning_rate": 4.320567664328644e-05,
"loss": 0.0,
"num_input_tokens_seen": 2195480,
"step": 3550
},
{
"epoch": 6.336898395721925,
"grad_norm": 0.0020986474119126797,
"learning_rate": 4.317900273690415e-05,
"loss": 0.0,
"num_input_tokens_seen": 2198488,
"step": 3555
},
{
"epoch": 6.3458110516934045,
"grad_norm": 0.00016660125402268022,
"learning_rate": 4.315228484203848e-05,
"loss": 0.0,
"num_input_tokens_seen": 2201816,
"step": 3560
},
{
"epoch": 6.354723707664884,
"grad_norm": 6.600496999453753e-05,
"learning_rate": 4.312552302333982e-05,
"loss": 0.0,
"num_input_tokens_seen": 2205048,
"step": 3565
},
{
"epoch": 6.363636363636363,
"grad_norm": 7.55599103285931e-05,
"learning_rate": 4.3098717345564846e-05,
"loss": 0.0,
"num_input_tokens_seen": 2208408,
"step": 3570
},
{
"epoch": 6.372549019607844,
"grad_norm": 0.007104981690645218,
"learning_rate": 4.3071867873576364e-05,
"loss": 0.0,
"num_input_tokens_seen": 2211128,
"step": 3575
},
{
"epoch": 6.381461675579323,
"grad_norm": 0.00032805779483169317,
"learning_rate": 4.3044974672343164e-05,
"loss": 0.0,
"num_input_tokens_seen": 2214360,
"step": 3580
},
{
"epoch": 6.390374331550802,
"grad_norm": 8.772522414801642e-05,
"learning_rate": 4.301803780693982e-05,
"loss": 0.0,
"num_input_tokens_seen": 2217592,
"step": 3585
},
{
"epoch": 6.399286987522282,
"grad_norm": 7.108946010703221e-05,
"learning_rate": 4.299105734254657e-05,
"loss": 0.0,
"num_input_tokens_seen": 2220216,
"step": 3590
},
{
"epoch": 6.408199643493761,
"grad_norm": 0.0006188752595335245,
"learning_rate": 4.2964033344449174e-05,
"loss": 0.0,
"num_input_tokens_seen": 2223608,
"step": 3595
},
{
"epoch": 6.4171122994652405,
"grad_norm": 0.0004142081888858229,
"learning_rate": 4.293696587803871e-05,
"loss": 0.0,
"num_input_tokens_seen": 2226520,
"step": 3600
},
{
"epoch": 6.42602495543672,
"grad_norm": 6.948486407054588e-05,
"learning_rate": 4.290985500881143e-05,
"loss": 0.0,
"num_input_tokens_seen": 2229752,
"step": 3605
},
{
"epoch": 6.434937611408199,
"grad_norm": 0.00030549734947271645,
"learning_rate": 4.2882700802368644e-05,
"loss": 0.0,
"num_input_tokens_seen": 2232088,
"step": 3610
},
{
"epoch": 6.443850267379679,
"grad_norm": 8.570123463869095e-05,
"learning_rate": 4.285550332441651e-05,
"loss": 0.0,
"num_input_tokens_seen": 2235320,
"step": 3615
},
{
"epoch": 6.452762923351159,
"grad_norm": 7.770225056447089e-05,
"learning_rate": 4.282826264076587e-05,
"loss": 0.0,
"num_input_tokens_seen": 2238200,
"step": 3620
},
{
"epoch": 6.461675579322638,
"grad_norm": 0.0005393362371250987,
"learning_rate": 4.2800978817332136e-05,
"loss": 0.0,
"num_input_tokens_seen": 2241080,
"step": 3625
},
{
"epoch": 6.470588235294118,
"grad_norm": 0.0004659408878069371,
"learning_rate": 4.27736519201351e-05,
"loss": 0.0,
"num_input_tokens_seen": 2244280,
"step": 3630
},
{
"epoch": 6.479500891265597,
"grad_norm": 0.00013535030302591622,
"learning_rate": 4.27462820152988e-05,
"loss": 0.0,
"num_input_tokens_seen": 2247864,
"step": 3635
},
{
"epoch": 6.4884135472370765,
"grad_norm": 0.02784336358308792,
"learning_rate": 4.27188691690513e-05,
"loss": 0.0,
"num_input_tokens_seen": 2250520,
"step": 3640
},
{
"epoch": 6.497326203208556,
"grad_norm": 7.759433356113732e-05,
"learning_rate": 4.269141344772461e-05,
"loss": 0.0,
"num_input_tokens_seen": 2253016,
"step": 3645
},
{
"epoch": 6.506238859180035,
"grad_norm": 0.0012859473936259747,
"learning_rate": 4.2663914917754474e-05,
"loss": 0.0,
"num_input_tokens_seen": 2256536,
"step": 3650
},
{
"epoch": 6.515151515151516,
"grad_norm": 6.0505506553454325e-05,
"learning_rate": 4.263637364568021e-05,
"loss": 0.0,
"num_input_tokens_seen": 2259736,
"step": 3655
},
{
"epoch": 6.524064171122995,
"grad_norm": 0.00010412324627395719,
"learning_rate": 4.260878969814458e-05,
"loss": 0.0,
"num_input_tokens_seen": 2262776,
"step": 3660
},
{
"epoch": 6.532976827094474,
"grad_norm": 5.067836900707334e-05,
"learning_rate": 4.25811631418936e-05,
"loss": 0.0,
"num_input_tokens_seen": 2265944,
"step": 3665
},
{
"epoch": 6.541889483065954,
"grad_norm": 6.499042501673102e-05,
"learning_rate": 4.255349404377638e-05,
"loss": 0.0,
"num_input_tokens_seen": 2268824,
"step": 3670
},
{
"epoch": 6.550802139037433,
"grad_norm": 0.00021577828738372773,
"learning_rate": 4.252578247074499e-05,
"loss": 0.0,
"num_input_tokens_seen": 2271992,
"step": 3675
},
{
"epoch": 6.5597147950089125,
"grad_norm": 6.629472773056477e-05,
"learning_rate": 4.249802848985426e-05,
"loss": 0.0,
"num_input_tokens_seen": 2275320,
"step": 3680
},
{
"epoch": 6.568627450980392,
"grad_norm": 9.349354513688013e-05,
"learning_rate": 4.247023216826164e-05,
"loss": 0.0,
"num_input_tokens_seen": 2277784,
"step": 3685
},
{
"epoch": 6.577540106951871,
"grad_norm": 4.333916513132863e-05,
"learning_rate": 4.2442393573227046e-05,
"loss": 0.0,
"num_input_tokens_seen": 2280152,
"step": 3690
},
{
"epoch": 6.586452762923351,
"grad_norm": 7.514766184613109e-05,
"learning_rate": 4.241451277211268e-05,
"loss": 0.0,
"num_input_tokens_seen": 2283480,
"step": 3695
},
{
"epoch": 6.595365418894831,
"grad_norm": 0.0007041016942821443,
"learning_rate": 4.238658983238284e-05,
"loss": 0.0,
"num_input_tokens_seen": 2286328,
"step": 3700
},
{
"epoch": 6.60427807486631,
"grad_norm": 0.0005451409379020333,
"learning_rate": 4.2358624821603856e-05,
"loss": 0.0,
"num_input_tokens_seen": 2289112,
"step": 3705
},
{
"epoch": 6.61319073083779,
"grad_norm": 5.5295840866165236e-05,
"learning_rate": 4.2330617807443783e-05,
"loss": 0.0,
"num_input_tokens_seen": 2292312,
"step": 3710
},
{
"epoch": 6.622103386809269,
"grad_norm": 0.0012082583270967007,
"learning_rate": 4.2302568857672375e-05,
"loss": 0.0,
"num_input_tokens_seen": 2295576,
"step": 3715
},
{
"epoch": 6.6310160427807485,
"grad_norm": 0.0004467536637093872,
"learning_rate": 4.2274478040160823e-05,
"loss": 0.0,
"num_input_tokens_seen": 2298392,
"step": 3720
},
{
"epoch": 6.639928698752228,
"grad_norm": 3.002311132149771e-05,
"learning_rate": 4.224634542288163e-05,
"loss": 0.0,
"num_input_tokens_seen": 2301496,
"step": 3725
},
{
"epoch": 6.648841354723707,
"grad_norm": 0.00013791839592158794,
"learning_rate": 4.221817107390847e-05,
"loss": 0.0,
"num_input_tokens_seen": 2304120,
"step": 3730
},
{
"epoch": 6.657754010695188,
"grad_norm": 4.801477552973665e-05,
"learning_rate": 4.2189955061415965e-05,
"loss": 0.0,
"num_input_tokens_seen": 2307160,
"step": 3735
},
{
"epoch": 6.666666666666667,
"grad_norm": 0.00012627999240066856,
"learning_rate": 4.216169745367956e-05,
"loss": 0.0,
"num_input_tokens_seen": 2310648,
"step": 3740
},
{
"epoch": 6.675579322638146,
"grad_norm": 4.972055830876343e-05,
"learning_rate": 4.2133398319075366e-05,
"loss": 0.0,
"num_input_tokens_seen": 2313976,
"step": 3745
},
{
"epoch": 6.684491978609626,
"grad_norm": 0.0004118916403967887,
"learning_rate": 4.210505772607997e-05,
"loss": 0.0,
"num_input_tokens_seen": 2316984,
"step": 3750
},
{
"epoch": 6.693404634581105,
"grad_norm": 3.760185791179538e-05,
"learning_rate": 4.207667574327027e-05,
"loss": 0.0,
"num_input_tokens_seen": 2319576,
"step": 3755
},
{
"epoch": 6.7023172905525845,
"grad_norm": 5.7546676544006914e-05,
"learning_rate": 4.204825243932331e-05,
"loss": 0.0,
"num_input_tokens_seen": 2322936,
"step": 3760
},
{
"epoch": 6.711229946524064,
"grad_norm": 5.261941623757593e-05,
"learning_rate": 4.2019787883016145e-05,
"loss": 0.0,
"num_input_tokens_seen": 2325784,
"step": 3765
},
{
"epoch": 6.720142602495543,
"grad_norm": 3.78849363187328e-05,
"learning_rate": 4.199128214322564e-05,
"loss": 0.0,
"num_input_tokens_seen": 2328952,
"step": 3770
},
{
"epoch": 6.729055258467023,
"grad_norm": 5.117354521644302e-05,
"learning_rate": 4.1962735288928305e-05,
"loss": 0.0,
"num_input_tokens_seen": 2332088,
"step": 3775
},
{
"epoch": 6.737967914438503,
"grad_norm": 0.00011116250971099362,
"learning_rate": 4.193414738920014e-05,
"loss": 0.0,
"num_input_tokens_seen": 2335384,
"step": 3780
},
{
"epoch": 6.746880570409982,
"grad_norm": 0.0002807832497637719,
"learning_rate": 4.1905518513216466e-05,
"loss": 0.0,
"num_input_tokens_seen": 2338168,
"step": 3785
},
{
"epoch": 6.755793226381462,
"grad_norm": 2.829926052072551e-05,
"learning_rate": 4.187684873025176e-05,
"loss": 0.0,
"num_input_tokens_seen": 2341400,
"step": 3790
},
{
"epoch": 6.764705882352941,
"grad_norm": 9.852102084551007e-05,
"learning_rate": 4.184813810967947e-05,
"loss": 0.0,
"num_input_tokens_seen": 2344280,
"step": 3795
},
{
"epoch": 6.7736185383244205,
"grad_norm": 0.00025870383251458406,
"learning_rate": 4.181938672097189e-05,
"loss": 0.0,
"num_input_tokens_seen": 2347288,
"step": 3800
},
{
"epoch": 6.7825311942959,
"grad_norm": 7.604555139550939e-05,
"learning_rate": 4.1790594633699917e-05,
"loss": 0.0,
"num_input_tokens_seen": 2350232,
"step": 3805
},
{
"epoch": 6.791443850267379,
"grad_norm": 4.824809366255067e-05,
"learning_rate": 4.1761761917532974e-05,
"loss": 0.0,
"num_input_tokens_seen": 2352856,
"step": 3810
},
{
"epoch": 6.80035650623886,
"grad_norm": 8.800329669611529e-05,
"learning_rate": 4.173288864223876e-05,
"loss": 0.0,
"num_input_tokens_seen": 2356024,
"step": 3815
},
{
"epoch": 6.809269162210339,
"grad_norm": 8.250436803791672e-05,
"learning_rate": 4.170397487768314e-05,
"loss": 0.0,
"num_input_tokens_seen": 2359224,
"step": 3820
},
{
"epoch": 6.818181818181818,
"grad_norm": 3.099319656030275e-05,
"learning_rate": 4.1675020693829933e-05,
"loss": 0.0,
"num_input_tokens_seen": 2362744,
"step": 3825
},
{
"epoch": 6.827094474153298,
"grad_norm": 8.77875936566852e-05,
"learning_rate": 4.164602616074079e-05,
"loss": 0.0,
"num_input_tokens_seen": 2366072,
"step": 3830
},
{
"epoch": 6.836007130124777,
"grad_norm": 7.556960918009281e-05,
"learning_rate": 4.161699134857497e-05,
"loss": 0.0,
"num_input_tokens_seen": 2368600,
"step": 3835
},
{
"epoch": 6.8449197860962565,
"grad_norm": 0.0002625234774313867,
"learning_rate": 4.1587916327589205e-05,
"loss": 0.0,
"num_input_tokens_seen": 2372056,
"step": 3840
},
{
"epoch": 6.853832442067736,
"grad_norm": 0.00016840100579429418,
"learning_rate": 4.1558801168137526e-05,
"loss": 0.0,
"num_input_tokens_seen": 2374360,
"step": 3845
},
{
"epoch": 6.862745098039216,
"grad_norm": 7.558833749499172e-05,
"learning_rate": 4.152964594067108e-05,
"loss": 0.0,
"num_input_tokens_seen": 2376920,
"step": 3850
},
{
"epoch": 6.871657754010696,
"grad_norm": 3.281732278992422e-05,
"learning_rate": 4.150045071573798e-05,
"loss": 0.0,
"num_input_tokens_seen": 2380120,
"step": 3855
},
{
"epoch": 6.880570409982175,
"grad_norm": 0.00011538410763023421,
"learning_rate": 4.147121556398312e-05,
"loss": 0.0,
"num_input_tokens_seen": 2383448,
"step": 3860
},
{
"epoch": 6.889483065953654,
"grad_norm": 0.00011612474190769717,
"learning_rate": 4.1441940556148006e-05,
"loss": 0.0,
"num_input_tokens_seen": 2386520,
"step": 3865
},
{
"epoch": 6.898395721925134,
"grad_norm": 2.983517879329156e-05,
"learning_rate": 4.141262576307058e-05,
"loss": 0.0,
"num_input_tokens_seen": 2389688,
"step": 3870
},
{
"epoch": 6.907308377896613,
"grad_norm": 0.0001038245391100645,
"learning_rate": 4.138327125568505e-05,
"loss": 0.0,
"num_input_tokens_seen": 2392856,
"step": 3875
},
{
"epoch": 6.9162210338680925,
"grad_norm": 0.0001221461861860007,
"learning_rate": 4.1353877105021726e-05,
"loss": 0.0,
"num_input_tokens_seen": 2396088,
"step": 3880
},
{
"epoch": 6.925133689839572,
"grad_norm": 0.00037911630352027714,
"learning_rate": 4.1324443382206864e-05,
"loss": 0.0,
"num_input_tokens_seen": 2398840,
"step": 3885
},
{
"epoch": 6.934046345811051,
"grad_norm": 5.34054015588481e-05,
"learning_rate": 4.129497015846245e-05,
"loss": 0.0,
"num_input_tokens_seen": 2401528,
"step": 3890
},
{
"epoch": 6.942959001782532,
"grad_norm": 9.827831672737375e-05,
"learning_rate": 4.126545750510605e-05,
"loss": 0.0,
"num_input_tokens_seen": 2405016,
"step": 3895
},
{
"epoch": 6.951871657754011,
"grad_norm": 0.00012127986701671034,
"learning_rate": 4.123590549355067e-05,
"loss": 0.0,
"num_input_tokens_seen": 2408440,
"step": 3900
},
{
"epoch": 6.96078431372549,
"grad_norm": 5.946194869466126e-05,
"learning_rate": 4.1206314195304524e-05,
"loss": 0.0,
"num_input_tokens_seen": 2411864,
"step": 3905
},
{
"epoch": 6.96969696969697,
"grad_norm": 5.441272151074372e-05,
"learning_rate": 4.117668368197089e-05,
"loss": 0.0,
"num_input_tokens_seen": 2414904,
"step": 3910
},
{
"epoch": 6.978609625668449,
"grad_norm": 8.909405005397275e-05,
"learning_rate": 4.1147014025247954e-05,
"loss": 0.0,
"num_input_tokens_seen": 2418616,
"step": 3915
},
{
"epoch": 6.9875222816399285,
"grad_norm": 2.683120510482695e-05,
"learning_rate": 4.111730529692861e-05,
"loss": 0.0,
"num_input_tokens_seen": 2421976,
"step": 3920
},
{
"epoch": 6.996434937611408,
"grad_norm": 5.379191861720756e-05,
"learning_rate": 4.108755756890028e-05,
"loss": 0.0,
"num_input_tokens_seen": 2424152,
"step": 3925
},
{
"epoch": 7.0,
"eval_loss": 0.20610958337783813,
"eval_runtime": 4.5805,
"eval_samples_per_second": 54.361,
"eval_steps_per_second": 13.754,
"num_input_tokens_seen": 2425192,
"step": 3927
},
{
"epoch": 7.005347593582887,
"grad_norm": 4.8729463742347434e-05,
"learning_rate": 4.105777091314478e-05,
"loss": 0.0,
"num_input_tokens_seen": 2427720,
"step": 3930
},
{
"epoch": 7.0142602495543676,
"grad_norm": 3.457432831055485e-05,
"learning_rate": 4.102794540173812e-05,
"loss": 0.0,
"num_input_tokens_seen": 2431432,
"step": 3935
},
{
"epoch": 7.023172905525847,
"grad_norm": 0.0002528139157220721,
"learning_rate": 4.09980811068503e-05,
"loss": 0.0,
"num_input_tokens_seen": 2434088,
"step": 3940
},
{
"epoch": 7.032085561497326,
"grad_norm": 4.039124178234488e-05,
"learning_rate": 4.09681781007452e-05,
"loss": 0.0,
"num_input_tokens_seen": 2437192,
"step": 3945
},
{
"epoch": 7.040998217468806,
"grad_norm": 5.282991332933307e-05,
"learning_rate": 4.0938236455780364e-05,
"loss": 0.0,
"num_input_tokens_seen": 2439816,
"step": 3950
},
{
"epoch": 7.049910873440285,
"grad_norm": 0.00011657732102321461,
"learning_rate": 4.090825624440682e-05,
"loss": 0.0,
"num_input_tokens_seen": 2442632,
"step": 3955
},
{
"epoch": 7.0588235294117645,
"grad_norm": 0.00018239057681057602,
"learning_rate": 4.0878237539168915e-05,
"loss": 0.0,
"num_input_tokens_seen": 2445640,
"step": 3960
},
{
"epoch": 7.067736185383244,
"grad_norm": 4.836150765186176e-05,
"learning_rate": 4.084818041270416e-05,
"loss": 0.0,
"num_input_tokens_seen": 2448264,
"step": 3965
},
{
"epoch": 7.076648841354723,
"grad_norm": 8.737414464121684e-05,
"learning_rate": 4.081808493774302e-05,
"loss": 0.0,
"num_input_tokens_seen": 2450856,
"step": 3970
},
{
"epoch": 7.0855614973262036,
"grad_norm": 0.00010665191803127527,
"learning_rate": 4.0787951187108754e-05,
"loss": 0.0,
"num_input_tokens_seen": 2453640,
"step": 3975
},
{
"epoch": 7.094474153297683,
"grad_norm": 7.286696200026199e-05,
"learning_rate": 4.0757779233717255e-05,
"loss": 0.0,
"num_input_tokens_seen": 2456392,
"step": 3980
},
{
"epoch": 7.103386809269162,
"grad_norm": 3.0005081498529762e-05,
"learning_rate": 4.072756915057683e-05,
"loss": 0.0,
"num_input_tokens_seen": 2459656,
"step": 3985
},
{
"epoch": 7.112299465240642,
"grad_norm": 3.010162436112296e-05,
"learning_rate": 4.069732101078808e-05,
"loss": 0.0,
"num_input_tokens_seen": 2462632,
"step": 3990
},
{
"epoch": 7.121212121212121,
"grad_norm": 5.150728247826919e-05,
"learning_rate": 4.066703488754366e-05,
"loss": 0.0,
"num_input_tokens_seen": 2465480,
"step": 3995
},
{
"epoch": 7.1301247771836005,
"grad_norm": 4.586328941513784e-05,
"learning_rate": 4.063671085412817e-05,
"loss": 0.0,
"num_input_tokens_seen": 2468456,
"step": 4000
},
{
"epoch": 7.13903743315508,
"grad_norm": 0.0002808906720019877,
"learning_rate": 4.060634898391792e-05,
"loss": 0.0,
"num_input_tokens_seen": 2472328,
"step": 4005
},
{
"epoch": 7.14795008912656,
"grad_norm": 6.467744969995692e-05,
"learning_rate": 4.057594935038077e-05,
"loss": 0.0,
"num_input_tokens_seen": 2475944,
"step": 4010
},
{
"epoch": 7.1568627450980395,
"grad_norm": 4.673215516959317e-05,
"learning_rate": 4.054551202707597e-05,
"loss": 0.0,
"num_input_tokens_seen": 2478760,
"step": 4015
},
{
"epoch": 7.165775401069519,
"grad_norm": 3.4133565350202844e-05,
"learning_rate": 4.051503708765399e-05,
"loss": 0.0,
"num_input_tokens_seen": 2482408,
"step": 4020
},
{
"epoch": 7.174688057040998,
"grad_norm": 3.378849942237139e-05,
"learning_rate": 4.048452460585627e-05,
"loss": 0.0,
"num_input_tokens_seen": 2485416,
"step": 4025
},
{
"epoch": 7.183600713012478,
"grad_norm": 3.80839264835231e-05,
"learning_rate": 4.045397465551513e-05,
"loss": 0.0,
"num_input_tokens_seen": 2488712,
"step": 4030
},
{
"epoch": 7.192513368983957,
"grad_norm": 3.3100717701017857e-05,
"learning_rate": 4.042338731055356e-05,
"loss": 0.0,
"num_input_tokens_seen": 2491944,
"step": 4035
},
{
"epoch": 7.2014260249554365,
"grad_norm": 3.59532205038704e-05,
"learning_rate": 4.039276264498501e-05,
"loss": 0.0,
"num_input_tokens_seen": 2494568,
"step": 4040
},
{
"epoch": 7.210338680926916,
"grad_norm": 2.274327925988473e-05,
"learning_rate": 4.0362100732913246e-05,
"loss": 0.0,
"num_input_tokens_seen": 2497544,
"step": 4045
},
{
"epoch": 7.219251336898395,
"grad_norm": 2.8736509193549864e-05,
"learning_rate": 4.0331401648532166e-05,
"loss": 0.0,
"num_input_tokens_seen": 2500840,
"step": 4050
},
{
"epoch": 7.2281639928698755,
"grad_norm": 0.00025308437761850655,
"learning_rate": 4.030066546612562e-05,
"loss": 0.0,
"num_input_tokens_seen": 2504168,
"step": 4055
},
{
"epoch": 7.237076648841355,
"grad_norm": 2.420450618956238e-05,
"learning_rate": 4.02698922600672e-05,
"loss": 0.0,
"num_input_tokens_seen": 2507016,
"step": 4060
},
{
"epoch": 7.245989304812834,
"grad_norm": 4.901519059785642e-05,
"learning_rate": 4.0239082104820114e-05,
"loss": 0.0,
"num_input_tokens_seen": 2509672,
"step": 4065
},
{
"epoch": 7.254901960784314,
"grad_norm": 4.465365782380104e-05,
"learning_rate": 4.020823507493696e-05,
"loss": 0.0,
"num_input_tokens_seen": 2512584,
"step": 4070
},
{
"epoch": 7.263814616755793,
"grad_norm": 0.000388812884921208,
"learning_rate": 4.017735124505958e-05,
"loss": 0.0,
"num_input_tokens_seen": 2515592,
"step": 4075
},
{
"epoch": 7.2727272727272725,
"grad_norm": 2.8893498893012293e-05,
"learning_rate": 4.014643068991885e-05,
"loss": 0.0,
"num_input_tokens_seen": 2518664,
"step": 4080
},
{
"epoch": 7.281639928698752,
"grad_norm": 2.9145874577807263e-05,
"learning_rate": 4.0115473484334495e-05,
"loss": 0.0,
"num_input_tokens_seen": 2522184,
"step": 4085
},
{
"epoch": 7.290552584670232,
"grad_norm": 3.5548859159462154e-05,
"learning_rate": 4.008447970321497e-05,
"loss": 0.0,
"num_input_tokens_seen": 2525448,
"step": 4090
},
{
"epoch": 7.2994652406417115,
"grad_norm": 8.155805699061602e-05,
"learning_rate": 4.005344942155719e-05,
"loss": 0.0,
"num_input_tokens_seen": 2529160,
"step": 4095
},
{
"epoch": 7.308377896613191,
"grad_norm": 3.165958696627058e-05,
"learning_rate": 4.0022382714446415e-05,
"loss": 0.0,
"num_input_tokens_seen": 2532840,
"step": 4100
},
{
"epoch": 7.31729055258467,
"grad_norm": 3.817788092419505e-05,
"learning_rate": 3.9991279657056034e-05,
"loss": 0.0,
"num_input_tokens_seen": 2535400,
"step": 4105
},
{
"epoch": 7.32620320855615,
"grad_norm": 7.300837023649365e-05,
"learning_rate": 3.996014032464741e-05,
"loss": 0.0,
"num_input_tokens_seen": 2538888,
"step": 4110
},
{
"epoch": 7.335115864527629,
"grad_norm": 8.633810648461804e-05,
"learning_rate": 3.9928964792569655e-05,
"loss": 0.0,
"num_input_tokens_seen": 2542376,
"step": 4115
},
{
"epoch": 7.3440285204991085,
"grad_norm": 4.182643897365779e-05,
"learning_rate": 3.98977531362595e-05,
"loss": 0.0,
"num_input_tokens_seen": 2544712,
"step": 4120
},
{
"epoch": 7.352941176470588,
"grad_norm": 3.7395973777165636e-05,
"learning_rate": 3.9866505431241084e-05,
"loss": 0.0,
"num_input_tokens_seen": 2547784,
"step": 4125
},
{
"epoch": 7.361853832442068,
"grad_norm": 2.852731449820567e-05,
"learning_rate": 3.983522175312576e-05,
"loss": 0.0,
"num_input_tokens_seen": 2551272,
"step": 4130
},
{
"epoch": 7.3707664884135475,
"grad_norm": 0.00015264320245478302,
"learning_rate": 3.980390217761193e-05,
"loss": 0.0,
"num_input_tokens_seen": 2555176,
"step": 4135
},
{
"epoch": 7.379679144385027,
"grad_norm": 4.525822077994235e-05,
"learning_rate": 3.9772546780484874e-05,
"loss": 0.0,
"num_input_tokens_seen": 2559144,
"step": 4140
},
{
"epoch": 7.388591800356506,
"grad_norm": 2.5928084141924046e-05,
"learning_rate": 3.974115563761655e-05,
"loss": 0.0,
"num_input_tokens_seen": 2561640,
"step": 4145
},
{
"epoch": 7.397504456327986,
"grad_norm": 0.00020593231602106243,
"learning_rate": 3.970972882496537e-05,
"loss": 0.0,
"num_input_tokens_seen": 2564424,
"step": 4150
},
{
"epoch": 7.406417112299465,
"grad_norm": 4.3078893213532865e-05,
"learning_rate": 3.967826641857612e-05,
"loss": 0.0,
"num_input_tokens_seen": 2567048,
"step": 4155
},
{
"epoch": 7.4153297682709445,
"grad_norm": 0.0001659034751355648,
"learning_rate": 3.964676849457968e-05,
"loss": 0.0,
"num_input_tokens_seen": 2569736,
"step": 4160
},
{
"epoch": 7.424242424242424,
"grad_norm": 0.00016778869030531496,
"learning_rate": 3.961523512919286e-05,
"loss": 0.0,
"num_input_tokens_seen": 2573096,
"step": 4165
},
{
"epoch": 7.433155080213904,
"grad_norm": 2.2783213353250176e-05,
"learning_rate": 3.958366639871826e-05,
"loss": 0.0,
"num_input_tokens_seen": 2576232,
"step": 4170
},
{
"epoch": 7.4420677361853835,
"grad_norm": 9.495346603216603e-05,
"learning_rate": 3.955206237954404e-05,
"loss": 0.0,
"num_input_tokens_seen": 2579816,
"step": 4175
},
{
"epoch": 7.450980392156863,
"grad_norm": 0.00015493450337089598,
"learning_rate": 3.952042314814375e-05,
"loss": 0.0,
"num_input_tokens_seen": 2583240,
"step": 4180
},
{
"epoch": 7.459893048128342,
"grad_norm": 4.0488488593837246e-05,
"learning_rate": 3.9488748781076136e-05,
"loss": 0.0,
"num_input_tokens_seen": 2586536,
"step": 4185
},
{
"epoch": 7.468805704099822,
"grad_norm": 0.00010972293239319697,
"learning_rate": 3.9457039354984974e-05,
"loss": 0.0,
"num_input_tokens_seen": 2588712,
"step": 4190
},
{
"epoch": 7.477718360071301,
"grad_norm": 7.80750997364521e-05,
"learning_rate": 3.942529494659888e-05,
"loss": 0.0,
"num_input_tokens_seen": 2591368,
"step": 4195
},
{
"epoch": 7.4866310160427805,
"grad_norm": 3.180309067829512e-05,
"learning_rate": 3.93935156327311e-05,
"loss": 0.0,
"num_input_tokens_seen": 2594408,
"step": 4200
},
{
"epoch": 7.49554367201426,
"grad_norm": 2.1605219444609247e-05,
"learning_rate": 3.9361701490279355e-05,
"loss": 0.0,
"num_input_tokens_seen": 2597256,
"step": 4205
},
{
"epoch": 7.50445632798574,
"grad_norm": 4.804494892596267e-05,
"learning_rate": 3.9329852596225644e-05,
"loss": 0.0,
"num_input_tokens_seen": 2600136,
"step": 4210
},
{
"epoch": 7.5133689839572195,
"grad_norm": 3.224185638828203e-05,
"learning_rate": 3.929796902763604e-05,
"loss": 0.0,
"num_input_tokens_seen": 2603592,
"step": 4215
},
{
"epoch": 7.522281639928699,
"grad_norm": 3.549808025127277e-05,
"learning_rate": 3.926605086166054e-05,
"loss": 0.0,
"num_input_tokens_seen": 2606472,
"step": 4220
},
{
"epoch": 7.531194295900178,
"grad_norm": 2.821335510816425e-05,
"learning_rate": 3.923409817553284e-05,
"loss": 0.0,
"num_input_tokens_seen": 2609448,
"step": 4225
},
{
"epoch": 7.540106951871658,
"grad_norm": 0.00011925551370950416,
"learning_rate": 3.9202111046570175e-05,
"loss": 0.0,
"num_input_tokens_seen": 2612168,
"step": 4230
},
{
"epoch": 7.549019607843137,
"grad_norm": 4.418912067194469e-05,
"learning_rate": 3.917008955217314e-05,
"loss": 0.0,
"num_input_tokens_seen": 2615464,
"step": 4235
},
{
"epoch": 7.5579322638146165,
"grad_norm": 2.4098915673675947e-05,
"learning_rate": 3.9138033769825434e-05,
"loss": 0.0,
"num_input_tokens_seen": 2618088,
"step": 4240
},
{
"epoch": 7.566844919786096,
"grad_norm": 9.82450510491617e-05,
"learning_rate": 3.910594377709378e-05,
"loss": 0.0,
"num_input_tokens_seen": 2621064,
"step": 4245
},
{
"epoch": 7.575757575757576,
"grad_norm": 6.234718603082001e-05,
"learning_rate": 3.9073819651627654e-05,
"loss": 0.0,
"num_input_tokens_seen": 2624104,
"step": 4250
},
{
"epoch": 7.5846702317290555,
"grad_norm": 6.715167546644807e-05,
"learning_rate": 3.904166147115912e-05,
"loss": 0.0,
"num_input_tokens_seen": 2626984,
"step": 4255
},
{
"epoch": 7.593582887700535,
"grad_norm": 4.9607984692556784e-05,
"learning_rate": 3.9009469313502664e-05,
"loss": 0.0,
"num_input_tokens_seen": 2630440,
"step": 4260
},
{
"epoch": 7.602495543672014,
"grad_norm": 4.675610762205906e-05,
"learning_rate": 3.897724325655497e-05,
"loss": 0.0,
"num_input_tokens_seen": 2633192,
"step": 4265
},
{
"epoch": 7.611408199643494,
"grad_norm": 0.00048179851728491485,
"learning_rate": 3.8944983378294775e-05,
"loss": 0.0,
"num_input_tokens_seen": 2636584,
"step": 4270
},
{
"epoch": 7.620320855614973,
"grad_norm": 0.0001708936906652525,
"learning_rate": 3.8912689756782624e-05,
"loss": 0.0,
"num_input_tokens_seen": 2639560,
"step": 4275
},
{
"epoch": 7.6292335115864525,
"grad_norm": 0.0001099999135476537,
"learning_rate": 3.888036247016073e-05,
"loss": 0.0,
"num_input_tokens_seen": 2642792,
"step": 4280
},
{
"epoch": 7.638146167557933,
"grad_norm": 3.335474684718065e-05,
"learning_rate": 3.884800159665276e-05,
"loss": 0.0,
"num_input_tokens_seen": 2645064,
"step": 4285
},
{
"epoch": 7.647058823529412,
"grad_norm": 5.769542622147128e-05,
"learning_rate": 3.881560721456365e-05,
"loss": 0.0,
"num_input_tokens_seen": 2647912,
"step": 4290
},
{
"epoch": 7.6559714795008915,
"grad_norm": 2.4842016500770114e-05,
"learning_rate": 3.8783179402279454e-05,
"loss": 0.0,
"num_input_tokens_seen": 2651048,
"step": 4295
},
{
"epoch": 7.664884135472371,
"grad_norm": 8.799693750916049e-05,
"learning_rate": 3.8750718238267045e-05,
"loss": 0.0,
"num_input_tokens_seen": 2654312,
"step": 4300
},
{
"epoch": 7.67379679144385,
"grad_norm": 2.1094869225635193e-05,
"learning_rate": 3.871822380107407e-05,
"loss": 0.0,
"num_input_tokens_seen": 2658184,
"step": 4305
},
{
"epoch": 7.68270944741533,
"grad_norm": 4.154935959377326e-05,
"learning_rate": 3.868569616932865e-05,
"loss": 0.0,
"num_input_tokens_seen": 2661352,
"step": 4310
},
{
"epoch": 7.691622103386809,
"grad_norm": 0.0002805989934131503,
"learning_rate": 3.865313542173925e-05,
"loss": 0.0,
"num_input_tokens_seen": 2663944,
"step": 4315
},
{
"epoch": 7.7005347593582885,
"grad_norm": 0.00014767648826818913,
"learning_rate": 3.862054163709444e-05,
"loss": 0.0,
"num_input_tokens_seen": 2667496,
"step": 4320
},
{
"epoch": 7.709447415329768,
"grad_norm": 9.234154276782647e-05,
"learning_rate": 3.8587914894262754e-05,
"loss": 0.0,
"num_input_tokens_seen": 2670536,
"step": 4325
},
{
"epoch": 7.718360071301248,
"grad_norm": 0.0003653615422081202,
"learning_rate": 3.8555255272192456e-05,
"loss": 0.0,
"num_input_tokens_seen": 2674184,
"step": 4330
},
{
"epoch": 7.7272727272727275,
"grad_norm": 6.81550518493168e-05,
"learning_rate": 3.85225628499114e-05,
"loss": 0.0,
"num_input_tokens_seen": 2677224,
"step": 4335
},
{
"epoch": 7.736185383244207,
"grad_norm": 2.63045949395746e-05,
"learning_rate": 3.848983770652679e-05,
"loss": 0.0,
"num_input_tokens_seen": 2679880,
"step": 4340
},
{
"epoch": 7.745098039215686,
"grad_norm": 7.03669065842405e-05,
"learning_rate": 3.8457079921224994e-05,
"loss": 0.0,
"num_input_tokens_seen": 2682600,
"step": 4345
},
{
"epoch": 7.754010695187166,
"grad_norm": 3.367187309777364e-05,
"learning_rate": 3.842428957327138e-05,
"loss": 0.0,
"num_input_tokens_seen": 2685512,
"step": 4350
},
{
"epoch": 7.762923351158645,
"grad_norm": 0.00010811091487994418,
"learning_rate": 3.8391466742010105e-05,
"loss": 0.0,
"num_input_tokens_seen": 2688008,
"step": 4355
},
{
"epoch": 7.7718360071301245,
"grad_norm": 3.1085244700079784e-05,
"learning_rate": 3.835861150686393e-05,
"loss": 0.0,
"num_input_tokens_seen": 2690760,
"step": 4360
},
{
"epoch": 7.780748663101605,
"grad_norm": 6.103352643549442e-05,
"learning_rate": 3.8325723947334036e-05,
"loss": 0.0,
"num_input_tokens_seen": 2694856,
"step": 4365
},
{
"epoch": 7.789661319073084,
"grad_norm": 4.178649760433473e-05,
"learning_rate": 3.82928041429998e-05,
"loss": 0.0,
"num_input_tokens_seen": 2697928,
"step": 4370
},
{
"epoch": 7.7985739750445635,
"grad_norm": 7.200521213235334e-05,
"learning_rate": 3.825985217351862e-05,
"loss": 0.0,
"num_input_tokens_seen": 2701224,
"step": 4375
},
{
"epoch": 7.807486631016043,
"grad_norm": 0.00012286264973226935,
"learning_rate": 3.822686811862575e-05,
"loss": 0.0,
"num_input_tokens_seen": 2704104,
"step": 4380
},
{
"epoch": 7.816399286987522,
"grad_norm": 3.214297612430528e-05,
"learning_rate": 3.819385205813407e-05,
"loss": 0.0,
"num_input_tokens_seen": 2707528,
"step": 4385
},
{
"epoch": 7.825311942959002,
"grad_norm": 3.2054165785666555e-05,
"learning_rate": 3.81608040719339e-05,
"loss": 0.0,
"num_input_tokens_seen": 2710824,
"step": 4390
},
{
"epoch": 7.834224598930481,
"grad_norm": 4.8690933908801526e-05,
"learning_rate": 3.812772423999281e-05,
"loss": 0.0,
"num_input_tokens_seen": 2714280,
"step": 4395
},
{
"epoch": 7.8431372549019605,
"grad_norm": 3.2846561225596815e-05,
"learning_rate": 3.809461264235545e-05,
"loss": 0.0,
"num_input_tokens_seen": 2717416,
"step": 4400
},
{
"epoch": 7.85204991087344,
"grad_norm": 0.0007487075054086745,
"learning_rate": 3.806146935914331e-05,
"loss": 0.0,
"num_input_tokens_seen": 2720744,
"step": 4405
},
{
"epoch": 7.86096256684492,
"grad_norm": 0.00022331879881676286,
"learning_rate": 3.8028294470554565e-05,
"loss": 0.0,
"num_input_tokens_seen": 2723592,
"step": 4410
},
{
"epoch": 7.8698752228163995,
"grad_norm": 5.3162981203058735e-05,
"learning_rate": 3.799508805686386e-05,
"loss": 0.0,
"num_input_tokens_seen": 2725960,
"step": 4415
},
{
"epoch": 7.878787878787879,
"grad_norm": 1.9468281607259996e-05,
"learning_rate": 3.796185019842212e-05,
"loss": 0.0,
"num_input_tokens_seen": 2728936,
"step": 4420
},
{
"epoch": 7.887700534759358,
"grad_norm": 3.672900129458867e-05,
"learning_rate": 3.792858097565637e-05,
"loss": 0.0,
"num_input_tokens_seen": 2732264,
"step": 4425
},
{
"epoch": 7.896613190730838,
"grad_norm": 4.778657967108302e-05,
"learning_rate": 3.789528046906953e-05,
"loss": 0.0,
"num_input_tokens_seen": 2734408,
"step": 4430
},
{
"epoch": 7.905525846702317,
"grad_norm": 6.20819118921645e-05,
"learning_rate": 3.786194875924019e-05,
"loss": 0.0,
"num_input_tokens_seen": 2738184,
"step": 4435
},
{
"epoch": 7.9144385026737964,
"grad_norm": 0.00018136748985853046,
"learning_rate": 3.7828585926822466e-05,
"loss": 0.0,
"num_input_tokens_seen": 2742056,
"step": 4440
},
{
"epoch": 7.923351158645277,
"grad_norm": 2.4137643777066842e-05,
"learning_rate": 3.77951920525458e-05,
"loss": 0.0,
"num_input_tokens_seen": 2745512,
"step": 4445
},
{
"epoch": 7.932263814616756,
"grad_norm": 2.8110442144679837e-05,
"learning_rate": 3.776176721721472e-05,
"loss": 0.0,
"num_input_tokens_seen": 2749064,
"step": 4450
},
{
"epoch": 7.9411764705882355,
"grad_norm": 6.0824535466963425e-05,
"learning_rate": 3.772831150170868e-05,
"loss": 0.0,
"num_input_tokens_seen": 2751368,
"step": 4455
},
{
"epoch": 7.950089126559715,
"grad_norm": 5.143785892869346e-05,
"learning_rate": 3.769482498698185e-05,
"loss": 0.0,
"num_input_tokens_seen": 2753768,
"step": 4460
},
{
"epoch": 7.959001782531194,
"grad_norm": 5.6326894991798326e-05,
"learning_rate": 3.766130775406293e-05,
"loss": 0.0,
"num_input_tokens_seen": 2756712,
"step": 4465
},
{
"epoch": 7.967914438502674,
"grad_norm": 5.954270454822108e-05,
"learning_rate": 3.7627759884054955e-05,
"loss": 0.0,
"num_input_tokens_seen": 2759528,
"step": 4470
},
{
"epoch": 7.976827094474153,
"grad_norm": 4.2564461182337254e-05,
"learning_rate": 3.7594181458135105e-05,
"loss": 0.0,
"num_input_tokens_seen": 2763464,
"step": 4475
},
{
"epoch": 7.9857397504456324,
"grad_norm": 2.670029607543256e-05,
"learning_rate": 3.756057255755446e-05,
"loss": 0.0,
"num_input_tokens_seen": 2766696,
"step": 4480
},
{
"epoch": 7.994652406417112,
"grad_norm": 2.5107356123044156e-05,
"learning_rate": 3.752693326363789e-05,
"loss": 0.0,
"num_input_tokens_seen": 2770952,
"step": 4485
},
{
"epoch": 8.0,
"eval_loss": 0.21877846121788025,
"eval_runtime": 4.5861,
"eval_samples_per_second": 54.294,
"eval_steps_per_second": 13.737,
"num_input_tokens_seen": 2772384,
"step": 4488
},
{
"epoch": 8.003565062388592,
"grad_norm": 3.1642932299291715e-05,
"learning_rate": 3.749326365778376e-05,
"loss": 0.0,
"num_input_tokens_seen": 2773728,
"step": 4490
},
{
"epoch": 8.01247771836007,
"grad_norm": 0.0001709481730358675,
"learning_rate": 3.7459563821463816e-05,
"loss": 0.0,
"num_input_tokens_seen": 2776800,
"step": 4495
},
{
"epoch": 8.02139037433155,
"grad_norm": 2.5652454496594146e-05,
"learning_rate": 3.7425833836222944e-05,
"loss": 0.0,
"num_input_tokens_seen": 2779872,
"step": 4500
},
{
"epoch": 8.030303030303031,
"grad_norm": 3.859802382066846e-05,
"learning_rate": 3.739207378367898e-05,
"loss": 0.0,
"num_input_tokens_seen": 2782752,
"step": 4505
},
{
"epoch": 8.03921568627451,
"grad_norm": 2.2162514142110012e-05,
"learning_rate": 3.735828374552252e-05,
"loss": 0.0,
"num_input_tokens_seen": 2786144,
"step": 4510
},
{
"epoch": 8.04812834224599,
"grad_norm": 1.882631295302417e-05,
"learning_rate": 3.73244638035167e-05,
"loss": 0.0,
"num_input_tokens_seen": 2789024,
"step": 4515
},
{
"epoch": 8.057040998217468,
"grad_norm": 2.0714554921141826e-05,
"learning_rate": 3.7290614039497055e-05,
"loss": 0.0,
"num_input_tokens_seen": 2792096,
"step": 4520
},
{
"epoch": 8.065953654188949,
"grad_norm": 2.3915859856060706e-05,
"learning_rate": 3.7256734535371225e-05,
"loss": 0.0,
"num_input_tokens_seen": 2795424,
"step": 4525
},
{
"epoch": 8.074866310160427,
"grad_norm": 2.8331269277259707e-05,
"learning_rate": 3.722282537311887e-05,
"loss": 0.0,
"num_input_tokens_seen": 2798432,
"step": 4530
},
{
"epoch": 8.083778966131907,
"grad_norm": 2.6493120458326302e-05,
"learning_rate": 3.7188886634791374e-05,
"loss": 0.0,
"num_input_tokens_seen": 2801664,
"step": 4535
},
{
"epoch": 8.092691622103386,
"grad_norm": 4.4756827264791355e-05,
"learning_rate": 3.715491840251172e-05,
"loss": 0.0,
"num_input_tokens_seen": 2804640,
"step": 4540
},
{
"epoch": 8.101604278074866,
"grad_norm": 4.7346729843411595e-05,
"learning_rate": 3.712092075847423e-05,
"loss": 0.0,
"num_input_tokens_seen": 2807424,
"step": 4545
},
{
"epoch": 8.110516934046347,
"grad_norm": 5.598060306510888e-05,
"learning_rate": 3.708689378494441e-05,
"loss": 0.0,
"num_input_tokens_seen": 2810496,
"step": 4550
},
{
"epoch": 8.119429590017825,
"grad_norm": 3.0805953429080546e-05,
"learning_rate": 3.705283756425872e-05,
"loss": 0.0,
"num_input_tokens_seen": 2813632,
"step": 4555
},
{
"epoch": 8.128342245989305,
"grad_norm": 0.000747096084523946,
"learning_rate": 3.701875217882443e-05,
"loss": 0.0,
"num_input_tokens_seen": 2817152,
"step": 4560
},
{
"epoch": 8.137254901960784,
"grad_norm": 3.350910992594436e-05,
"learning_rate": 3.698463771111933e-05,
"loss": 0.0,
"num_input_tokens_seen": 2820960,
"step": 4565
},
{
"epoch": 8.146167557932264,
"grad_norm": 2.3410793801303953e-05,
"learning_rate": 3.695049424369162e-05,
"loss": 0.0,
"num_input_tokens_seen": 2823616,
"step": 4570
},
{
"epoch": 8.155080213903743,
"grad_norm": 2.2134110622573644e-05,
"learning_rate": 3.6916321859159655e-05,
"loss": 0.0,
"num_input_tokens_seen": 2827424,
"step": 4575
},
{
"epoch": 8.163992869875223,
"grad_norm": 2.254007813462522e-05,
"learning_rate": 3.6882120640211745e-05,
"loss": 0.0,
"num_input_tokens_seen": 2830368,
"step": 4580
},
{
"epoch": 8.172905525846703,
"grad_norm": 1.957350832526572e-05,
"learning_rate": 3.684789066960602e-05,
"loss": 0.0,
"num_input_tokens_seen": 2833984,
"step": 4585
},
{
"epoch": 8.181818181818182,
"grad_norm": 3.5591012419899926e-05,
"learning_rate": 3.6813632030170145e-05,
"loss": 0.0,
"num_input_tokens_seen": 2837312,
"step": 4590
},
{
"epoch": 8.190730837789662,
"grad_norm": 2.466439582349267e-05,
"learning_rate": 3.677934480480116e-05,
"loss": 0.0,
"num_input_tokens_seen": 2840448,
"step": 4595
},
{
"epoch": 8.19964349376114,
"grad_norm": 0.0002501521375961602,
"learning_rate": 3.674502907646527e-05,
"loss": 0.0,
"num_input_tokens_seen": 2843520,
"step": 4600
},
{
"epoch": 8.20855614973262,
"grad_norm": 0.0016060250345617533,
"learning_rate": 3.6710684928197674e-05,
"loss": 0.0,
"num_input_tokens_seen": 2846304,
"step": 4605
},
{
"epoch": 8.2174688057041,
"grad_norm": 1.862948738562409e-05,
"learning_rate": 3.667631244310232e-05,
"loss": 0.0,
"num_input_tokens_seen": 2849344,
"step": 4610
},
{
"epoch": 8.22638146167558,
"grad_norm": 2.1913916498306207e-05,
"learning_rate": 3.6641911704351734e-05,
"loss": 0.0,
"num_input_tokens_seen": 2852704,
"step": 4615
},
{
"epoch": 8.235294117647058,
"grad_norm": 0.0001454515295336023,
"learning_rate": 3.66074827951868e-05,
"loss": 0.0,
"num_input_tokens_seen": 2856032,
"step": 4620
},
{
"epoch": 8.244206773618538,
"grad_norm": 0.00010574868792900816,
"learning_rate": 3.657302579891657e-05,
"loss": 0.0,
"num_input_tokens_seen": 2858720,
"step": 4625
},
{
"epoch": 8.253119429590019,
"grad_norm": 4.3842454033438116e-05,
"learning_rate": 3.653854079891805e-05,
"loss": 0.0,
"num_input_tokens_seen": 2861824,
"step": 4630
},
{
"epoch": 8.262032085561497,
"grad_norm": 3.260843368479982e-05,
"learning_rate": 3.650402787863605e-05,
"loss": 0.0,
"num_input_tokens_seen": 2864992,
"step": 4635
},
{
"epoch": 8.270944741532977,
"grad_norm": 3.535457290126942e-05,
"learning_rate": 3.646948712158287e-05,
"loss": 0.0,
"num_input_tokens_seen": 2868736,
"step": 4640
},
{
"epoch": 8.279857397504456,
"grad_norm": 3.386079697520472e-05,
"learning_rate": 3.643491861133822e-05,
"loss": 0.0,
"num_input_tokens_seen": 2871552,
"step": 4645
},
{
"epoch": 8.288770053475936,
"grad_norm": 2.413662150502205e-05,
"learning_rate": 3.640032243154896e-05,
"loss": 0.0,
"num_input_tokens_seen": 2874976,
"step": 4650
},
{
"epoch": 8.297682709447415,
"grad_norm": 6.0050530009903014e-05,
"learning_rate": 3.636569866592889e-05,
"loss": 0.0,
"num_input_tokens_seen": 2877504,
"step": 4655
},
{
"epoch": 8.306595365418895,
"grad_norm": 2.1612911950796843e-05,
"learning_rate": 3.633104739825856e-05,
"loss": 0.0,
"num_input_tokens_seen": 2880096,
"step": 4660
},
{
"epoch": 8.315508021390375,
"grad_norm": 9.056284034159034e-05,
"learning_rate": 3.629636871238508e-05,
"loss": 0.0,
"num_input_tokens_seen": 2883040,
"step": 4665
},
{
"epoch": 8.324420677361854,
"grad_norm": 0.001587208709679544,
"learning_rate": 3.626166269222189e-05,
"loss": 0.0,
"num_input_tokens_seen": 2885888,
"step": 4670
},
{
"epoch": 8.333333333333334,
"grad_norm": 3.2096664654091e-05,
"learning_rate": 3.622692942174858e-05,
"loss": 0.0,
"num_input_tokens_seen": 2889472,
"step": 4675
},
{
"epoch": 8.342245989304812,
"grad_norm": 2.563195812399499e-05,
"learning_rate": 3.6192168985010685e-05,
"loss": 0.0,
"num_input_tokens_seen": 2892544,
"step": 4680
},
{
"epoch": 8.351158645276293,
"grad_norm": 1.978753243747633e-05,
"learning_rate": 3.6157381466119475e-05,
"loss": 0.0,
"num_input_tokens_seen": 2896288,
"step": 4685
},
{
"epoch": 8.360071301247771,
"grad_norm": 8.230508683482185e-05,
"learning_rate": 3.6122566949251724e-05,
"loss": 0.0,
"num_input_tokens_seen": 2899296,
"step": 4690
},
{
"epoch": 8.368983957219251,
"grad_norm": 2.73281129921088e-05,
"learning_rate": 3.6087725518649575e-05,
"loss": 0.0,
"num_input_tokens_seen": 2902208,
"step": 4695
},
{
"epoch": 8.37789661319073,
"grad_norm": 2.3481750758946873e-05,
"learning_rate": 3.6052857258620264e-05,
"loss": 0.0,
"num_input_tokens_seen": 2905472,
"step": 4700
},
{
"epoch": 8.38680926916221,
"grad_norm": 2.9414770324365236e-05,
"learning_rate": 3.6017962253535964e-05,
"loss": 0.0,
"num_input_tokens_seen": 2908320,
"step": 4705
},
{
"epoch": 8.39572192513369,
"grad_norm": 3.249278597650118e-05,
"learning_rate": 3.598304058783357e-05,
"loss": 0.0,
"num_input_tokens_seen": 2911104,
"step": 4710
},
{
"epoch": 8.404634581105169,
"grad_norm": 4.262402580934577e-05,
"learning_rate": 3.594809234601445e-05,
"loss": 0.0,
"num_input_tokens_seen": 2914400,
"step": 4715
},
{
"epoch": 8.41354723707665,
"grad_norm": 1.8245505998493172e-05,
"learning_rate": 3.5913117612644335e-05,
"loss": 0.0,
"num_input_tokens_seen": 2917824,
"step": 4720
},
{
"epoch": 8.422459893048128,
"grad_norm": 9.287180000683293e-05,
"learning_rate": 3.587811647235302e-05,
"loss": 0.0,
"num_input_tokens_seen": 2920800,
"step": 4725
},
{
"epoch": 8.431372549019608,
"grad_norm": 2.9012597224209458e-05,
"learning_rate": 3.5843089009834214e-05,
"loss": 0.0,
"num_input_tokens_seen": 2923744,
"step": 4730
},
{
"epoch": 8.440285204991087,
"grad_norm": 3.052468673558906e-05,
"learning_rate": 3.5808035309845305e-05,
"loss": 0.0,
"num_input_tokens_seen": 2926496,
"step": 4735
},
{
"epoch": 8.449197860962567,
"grad_norm": 0.0005738939507864416,
"learning_rate": 3.5772955457207183e-05,
"loss": 0.0,
"num_input_tokens_seen": 2928704,
"step": 4740
},
{
"epoch": 8.458110516934047,
"grad_norm": 3.2330990507034585e-05,
"learning_rate": 3.5737849536804016e-05,
"loss": 0.0,
"num_input_tokens_seen": 2931200,
"step": 4745
},
{
"epoch": 8.467023172905526,
"grad_norm": 0.0014580815332010388,
"learning_rate": 3.570271763358305e-05,
"loss": 0.0,
"num_input_tokens_seen": 2933568,
"step": 4750
},
{
"epoch": 8.475935828877006,
"grad_norm": 2.72096131084254e-05,
"learning_rate": 3.56675598325544e-05,
"loss": 0.0,
"num_input_tokens_seen": 2936672,
"step": 4755
},
{
"epoch": 8.484848484848484,
"grad_norm": 3.214793832739815e-05,
"learning_rate": 3.563237621879085e-05,
"loss": 0.0,
"num_input_tokens_seen": 2939104,
"step": 4760
},
{
"epoch": 8.493761140819965,
"grad_norm": 4.211974373902194e-05,
"learning_rate": 3.559716687742763e-05,
"loss": 0.0,
"num_input_tokens_seen": 2941440,
"step": 4765
},
{
"epoch": 8.502673796791443,
"grad_norm": 1.873859582701698e-05,
"learning_rate": 3.556193189366227e-05,
"loss": 0.0,
"num_input_tokens_seen": 2944704,
"step": 4770
},
{
"epoch": 8.511586452762923,
"grad_norm": 0.0002100716665154323,
"learning_rate": 3.5526671352754285e-05,
"loss": 0.0,
"num_input_tokens_seen": 2947680,
"step": 4775
},
{
"epoch": 8.520499108734402,
"grad_norm": 5.308235631673597e-05,
"learning_rate": 3.5491385340025055e-05,
"loss": 0.0,
"num_input_tokens_seen": 2951232,
"step": 4780
},
{
"epoch": 8.529411764705882,
"grad_norm": 5.459520980366506e-05,
"learning_rate": 3.545607394085763e-05,
"loss": 0.0,
"num_input_tokens_seen": 2954432,
"step": 4785
},
{
"epoch": 8.538324420677363,
"grad_norm": 9.727604629006237e-05,
"learning_rate": 3.542073724069644e-05,
"loss": 0.0,
"num_input_tokens_seen": 2958208,
"step": 4790
},
{
"epoch": 8.547237076648841,
"grad_norm": 8.639811130706221e-05,
"learning_rate": 3.5385375325047166e-05,
"loss": 0.0,
"num_input_tokens_seen": 2961152,
"step": 4795
},
{
"epoch": 8.556149732620321,
"grad_norm": 0.00015109198284335434,
"learning_rate": 3.5349988279476494e-05,
"loss": 0.0,
"num_input_tokens_seen": 2964704,
"step": 4800
},
{
"epoch": 8.5650623885918,
"grad_norm": 2.8421334718586877e-05,
"learning_rate": 3.5314576189611906e-05,
"loss": 0.0,
"num_input_tokens_seen": 2968064,
"step": 4805
},
{
"epoch": 8.57397504456328,
"grad_norm": 2.659242454683408e-05,
"learning_rate": 3.527913914114152e-05,
"loss": 0.0,
"num_input_tokens_seen": 2971552,
"step": 4810
},
{
"epoch": 8.582887700534759,
"grad_norm": 2.7345558919478208e-05,
"learning_rate": 3.524367721981381e-05,
"loss": 0.0,
"num_input_tokens_seen": 2974592,
"step": 4815
},
{
"epoch": 8.591800356506239,
"grad_norm": 4.943818203173578e-05,
"learning_rate": 3.520819051143747e-05,
"loss": 0.0,
"num_input_tokens_seen": 2977376,
"step": 4820
},
{
"epoch": 8.60071301247772,
"grad_norm": 7.073458982631564e-05,
"learning_rate": 3.517267910188112e-05,
"loss": 0.0,
"num_input_tokens_seen": 2980544,
"step": 4825
},
{
"epoch": 8.609625668449198,
"grad_norm": 1.9516908650984988e-05,
"learning_rate": 3.513714307707321e-05,
"loss": 0.0,
"num_input_tokens_seen": 2984352,
"step": 4830
},
{
"epoch": 8.618538324420678,
"grad_norm": 4.920436185784638e-05,
"learning_rate": 3.510158252300171e-05,
"loss": 0.0,
"num_input_tokens_seen": 2987328,
"step": 4835
},
{
"epoch": 8.627450980392156,
"grad_norm": 2.403493272140622e-05,
"learning_rate": 3.506599752571398e-05,
"loss": 0.0,
"num_input_tokens_seen": 2989888,
"step": 4840
},
{
"epoch": 8.636363636363637,
"grad_norm": 1.945541771419812e-05,
"learning_rate": 3.503038817131649e-05,
"loss": 0.0,
"num_input_tokens_seen": 2992960,
"step": 4845
},
{
"epoch": 8.645276292335115,
"grad_norm": 1.9857079678331502e-05,
"learning_rate": 3.499475454597467e-05,
"loss": 0.0,
"num_input_tokens_seen": 2995200,
"step": 4850
},
{
"epoch": 8.654188948306595,
"grad_norm": 2.849534575943835e-05,
"learning_rate": 3.495909673591268e-05,
"loss": 0.0,
"num_input_tokens_seen": 2997920,
"step": 4855
},
{
"epoch": 8.663101604278076,
"grad_norm": 9.624276572139934e-05,
"learning_rate": 3.492341482741319e-05,
"loss": 0.0,
"num_input_tokens_seen": 3001056,
"step": 4860
},
{
"epoch": 8.672014260249554,
"grad_norm": 4.670900307246484e-05,
"learning_rate": 3.488770890681718e-05,
"loss": 0.0,
"num_input_tokens_seen": 3004064,
"step": 4865
},
{
"epoch": 8.680926916221035,
"grad_norm": 5.9905400121351704e-05,
"learning_rate": 3.485197906052376e-05,
"loss": 0.0,
"num_input_tokens_seen": 3007456,
"step": 4870
},
{
"epoch": 8.689839572192513,
"grad_norm": 0.00032246284536086023,
"learning_rate": 3.4816225374989884e-05,
"loss": 0.0,
"num_input_tokens_seen": 3010688,
"step": 4875
},
{
"epoch": 8.698752228163993,
"grad_norm": 3.181647480232641e-05,
"learning_rate": 3.4780447936730245e-05,
"loss": 0.0,
"num_input_tokens_seen": 3014048,
"step": 4880
},
{
"epoch": 8.707664884135472,
"grad_norm": 2.5759185518836603e-05,
"learning_rate": 3.474464683231698e-05,
"loss": 0.0,
"num_input_tokens_seen": 3017056,
"step": 4885
},
{
"epoch": 8.716577540106952,
"grad_norm": 7.166628347476944e-05,
"learning_rate": 3.4708822148379514e-05,
"loss": 0.0,
"num_input_tokens_seen": 3020128,
"step": 4890
},
{
"epoch": 8.72549019607843,
"grad_norm": 1.9339231585036032e-05,
"learning_rate": 3.4672973971604285e-05,
"loss": 0.0,
"num_input_tokens_seen": 3023168,
"step": 4895
},
{
"epoch": 8.73440285204991,
"grad_norm": 1.5031446309876628e-05,
"learning_rate": 3.463710238873462e-05,
"loss": 0.0,
"num_input_tokens_seen": 3026560,
"step": 4900
},
{
"epoch": 8.743315508021391,
"grad_norm": 8.983182488009334e-05,
"learning_rate": 3.4601207486570476e-05,
"loss": 0.0,
"num_input_tokens_seen": 3030336,
"step": 4905
},
{
"epoch": 8.75222816399287,
"grad_norm": 0.00017463577387388796,
"learning_rate": 3.456528935196821e-05,
"loss": 0.0,
"num_input_tokens_seen": 3034016,
"step": 4910
},
{
"epoch": 8.76114081996435,
"grad_norm": 1.955249535967596e-05,
"learning_rate": 3.452934807184044e-05,
"loss": 0.0,
"num_input_tokens_seen": 3037024,
"step": 4915
},
{
"epoch": 8.770053475935828,
"grad_norm": 1.943771530932281e-05,
"learning_rate": 3.449338373315575e-05,
"loss": 0.0,
"num_input_tokens_seen": 3041184,
"step": 4920
},
{
"epoch": 8.778966131907309,
"grad_norm": 4.8170099034905434e-05,
"learning_rate": 3.4457396422938535e-05,
"loss": 0.0,
"num_input_tokens_seen": 3044064,
"step": 4925
},
{
"epoch": 8.787878787878787,
"grad_norm": 6.884737376822159e-05,
"learning_rate": 3.442138622826879e-05,
"loss": 0.0,
"num_input_tokens_seen": 3047712,
"step": 4930
},
{
"epoch": 8.796791443850267,
"grad_norm": 2.13040184462443e-05,
"learning_rate": 3.438535323628185e-05,
"loss": 0.0,
"num_input_tokens_seen": 3050304,
"step": 4935
},
{
"epoch": 8.805704099821746,
"grad_norm": 5.2907158533344045e-05,
"learning_rate": 3.434929753416824e-05,
"loss": 0.0,
"num_input_tokens_seen": 3052864,
"step": 4940
},
{
"epoch": 8.814616755793226,
"grad_norm": 2.187153404520359e-05,
"learning_rate": 3.431321920917343e-05,
"loss": 0.0,
"num_input_tokens_seen": 3056352,
"step": 4945
},
{
"epoch": 8.823529411764707,
"grad_norm": 1.702853296592366e-05,
"learning_rate": 3.427711834859764e-05,
"loss": 0.0,
"num_input_tokens_seen": 3059424,
"step": 4950
},
{
"epoch": 8.832442067736185,
"grad_norm": 1.7070908143068664e-05,
"learning_rate": 3.4240995039795606e-05,
"loss": 0.0,
"num_input_tokens_seen": 3062752,
"step": 4955
},
{
"epoch": 8.841354723707665,
"grad_norm": 1.8176526282331906e-05,
"learning_rate": 3.420484937017639e-05,
"loss": 0.0,
"num_input_tokens_seen": 3065120,
"step": 4960
},
{
"epoch": 8.850267379679144,
"grad_norm": 2.3698372388025746e-05,
"learning_rate": 3.416868142720316e-05,
"loss": 0.0,
"num_input_tokens_seen": 3067872,
"step": 4965
},
{
"epoch": 8.859180035650624,
"grad_norm": 3.496624412946403e-05,
"learning_rate": 3.413249129839298e-05,
"loss": 0.0,
"num_input_tokens_seen": 3071040,
"step": 4970
},
{
"epoch": 8.868092691622103,
"grad_norm": 2.3722381229163148e-05,
"learning_rate": 3.4096279071316606e-05,
"loss": 0.0,
"num_input_tokens_seen": 3074560,
"step": 4975
},
{
"epoch": 8.877005347593583,
"grad_norm": 0.00022836957941763103,
"learning_rate": 3.4060044833598255e-05,
"loss": 0.0,
"num_input_tokens_seen": 3077760,
"step": 4980
},
{
"epoch": 8.885918003565063,
"grad_norm": 0.0004945816472172737,
"learning_rate": 3.40237886729154e-05,
"loss": 0.0,
"num_input_tokens_seen": 3080512,
"step": 4985
},
{
"epoch": 8.894830659536542,
"grad_norm": 1.4382736480911262e-05,
"learning_rate": 3.398751067699858e-05,
"loss": 0.0,
"num_input_tokens_seen": 3083904,
"step": 4990
},
{
"epoch": 8.903743315508022,
"grad_norm": 1.5413037544931285e-05,
"learning_rate": 3.395121093363116e-05,
"loss": 0.0,
"num_input_tokens_seen": 3087264,
"step": 4995
},
{
"epoch": 8.9126559714795,
"grad_norm": 1.8121845641871914e-05,
"learning_rate": 3.3914889530649105e-05,
"loss": 0.0,
"num_input_tokens_seen": 3089536,
"step": 5000
},
{
"epoch": 8.92156862745098,
"grad_norm": 2.3494427296100184e-05,
"learning_rate": 3.387854655594085e-05,
"loss": 0.0,
"num_input_tokens_seen": 3092608,
"step": 5005
},
{
"epoch": 8.93048128342246,
"grad_norm": 1.5714915207354352e-05,
"learning_rate": 3.384218209744697e-05,
"loss": 0.0,
"num_input_tokens_seen": 3095968,
"step": 5010
},
{
"epoch": 8.93939393939394,
"grad_norm": 2.0781550119863823e-05,
"learning_rate": 3.3805796243160035e-05,
"loss": 0.0,
"num_input_tokens_seen": 3099584,
"step": 5015
},
{
"epoch": 8.94830659536542,
"grad_norm": 6.0617785493377596e-05,
"learning_rate": 3.376938908112443e-05,
"loss": 0.0,
"num_input_tokens_seen": 3102880,
"step": 5020
},
{
"epoch": 8.957219251336898,
"grad_norm": 1.7760969058144838e-05,
"learning_rate": 3.373296069943605e-05,
"loss": 0.0,
"num_input_tokens_seen": 3105888,
"step": 5025
},
{
"epoch": 8.966131907308379,
"grad_norm": 4.19039570260793e-05,
"learning_rate": 3.3696511186242144e-05,
"loss": 0.0,
"num_input_tokens_seen": 3108608,
"step": 5030
},
{
"epoch": 8.975044563279857,
"grad_norm": 1.3767701602773741e-05,
"learning_rate": 3.3660040629741114e-05,
"loss": 0.0,
"num_input_tokens_seen": 3111584,
"step": 5035
},
{
"epoch": 8.983957219251337,
"grad_norm": 1.66555073519703e-05,
"learning_rate": 3.3623549118182274e-05,
"loss": 0.0,
"num_input_tokens_seen": 3114976,
"step": 5040
},
{
"epoch": 8.992869875222816,
"grad_norm": 3.6471657949732617e-05,
"learning_rate": 3.358703673986564e-05,
"loss": 0.0,
"num_input_tokens_seen": 3117568,
"step": 5045
},
{
"epoch": 9.0,
"eval_loss": 0.22779197990894318,
"eval_runtime": 4.5839,
"eval_samples_per_second": 54.32,
"eval_steps_per_second": 13.744,
"num_input_tokens_seen": 3119968,
"step": 5049
},
{
"epoch": 9.001782531194296,
"grad_norm": 2.258282074762974e-05,
"learning_rate": 3.355050358314172e-05,
"loss": 0.0,
"num_input_tokens_seen": 3120544,
"step": 5050
},
{
"epoch": 9.010695187165775,
"grad_norm": 2.1495643522939645e-05,
"learning_rate": 3.3513949736411297e-05,
"loss": 0.0,
"num_input_tokens_seen": 3123360,
"step": 5055
},
{
"epoch": 9.019607843137255,
"grad_norm": 2.0894487533951178e-05,
"learning_rate": 3.347737528812523e-05,
"loss": 0.0,
"num_input_tokens_seen": 3126720,
"step": 5060
},
{
"epoch": 9.028520499108735,
"grad_norm": 1.4006104720465373e-05,
"learning_rate": 3.344078032678422e-05,
"loss": 0.0,
"num_input_tokens_seen": 3129760,
"step": 5065
},
{
"epoch": 9.037433155080214,
"grad_norm": 2.4866374587872997e-05,
"learning_rate": 3.340416494093861e-05,
"loss": 0.0,
"num_input_tokens_seen": 3132928,
"step": 5070
},
{
"epoch": 9.046345811051694,
"grad_norm": 2.3607737603015266e-05,
"learning_rate": 3.336752921918814e-05,
"loss": 0.0,
"num_input_tokens_seen": 3136224,
"step": 5075
},
{
"epoch": 9.055258467023172,
"grad_norm": 2.0439423678908497e-05,
"learning_rate": 3.33308732501818e-05,
"loss": 0.0,
"num_input_tokens_seen": 3139296,
"step": 5080
},
{
"epoch": 9.064171122994653,
"grad_norm": 2.25005169340875e-05,
"learning_rate": 3.329419712261754e-05,
"loss": 0.0,
"num_input_tokens_seen": 3142272,
"step": 5085
},
{
"epoch": 9.073083778966131,
"grad_norm": 0.00012200982018839568,
"learning_rate": 3.3257500925242106e-05,
"loss": 0.0,
"num_input_tokens_seen": 3145440,
"step": 5090
},
{
"epoch": 9.081996434937611,
"grad_norm": 0.00030202369089238346,
"learning_rate": 3.322078474685081e-05,
"loss": 0.0,
"num_input_tokens_seen": 3149280,
"step": 5095
},
{
"epoch": 9.090909090909092,
"grad_norm": 1.6156487617990933e-05,
"learning_rate": 3.3184048676287284e-05,
"loss": 0.0,
"num_input_tokens_seen": 3152480,
"step": 5100
},
{
"epoch": 9.09982174688057,
"grad_norm": 0.0004181478579994291,
"learning_rate": 3.314729280244332e-05,
"loss": 0.0,
"num_input_tokens_seen": 3156032,
"step": 5105
},
{
"epoch": 9.10873440285205,
"grad_norm": 1.4357733562064823e-05,
"learning_rate": 3.311051721425864e-05,
"loss": 0.0,
"num_input_tokens_seen": 3159488,
"step": 5110
},
{
"epoch": 9.117647058823529,
"grad_norm": 8.172341040335596e-05,
"learning_rate": 3.3073722000720644e-05,
"loss": 0.0,
"num_input_tokens_seen": 3162272,
"step": 5115
},
{
"epoch": 9.12655971479501,
"grad_norm": 2.1385545551311225e-05,
"learning_rate": 3.303690725086421e-05,
"loss": 0.0,
"num_input_tokens_seen": 3165184,
"step": 5120
},
{
"epoch": 9.135472370766488,
"grad_norm": 2.0441351807676256e-05,
"learning_rate": 3.300007305377153e-05,
"loss": 0.0,
"num_input_tokens_seen": 3167744,
"step": 5125
},
{
"epoch": 9.144385026737968,
"grad_norm": 1.8742101019597612e-05,
"learning_rate": 3.296321949857183e-05,
"loss": 0.0,
"num_input_tokens_seen": 3170464,
"step": 5130
},
{
"epoch": 9.153297682709447,
"grad_norm": 1.82445965037914e-05,
"learning_rate": 3.292634667444117e-05,
"loss": 0.0,
"num_input_tokens_seen": 3173952,
"step": 5135
},
{
"epoch": 9.162210338680927,
"grad_norm": 1.8075352272717282e-05,
"learning_rate": 3.288945467060226e-05,
"loss": 0.0,
"num_input_tokens_seen": 3177792,
"step": 5140
},
{
"epoch": 9.171122994652407,
"grad_norm": 5.484546272782609e-05,
"learning_rate": 3.285254357632418e-05,
"loss": 0.0,
"num_input_tokens_seen": 3180544,
"step": 5145
},
{
"epoch": 9.180035650623886,
"grad_norm": 2.4764098270679824e-05,
"learning_rate": 3.281561348092225e-05,
"loss": 0.0,
"num_input_tokens_seen": 3183808,
"step": 5150
},
{
"epoch": 9.188948306595366,
"grad_norm": 0.0002967322070617229,
"learning_rate": 3.277866447375774e-05,
"loss": 0.0,
"num_input_tokens_seen": 3186688,
"step": 5155
},
{
"epoch": 9.197860962566844,
"grad_norm": 1.93959513126174e-05,
"learning_rate": 3.274169664423768e-05,
"loss": 0.0,
"num_input_tokens_seen": 3190592,
"step": 5160
},
{
"epoch": 9.206773618538325,
"grad_norm": 1.803379927878268e-05,
"learning_rate": 3.270471008181466e-05,
"loss": 0.0,
"num_input_tokens_seen": 3193824,
"step": 5165
},
{
"epoch": 9.215686274509803,
"grad_norm": 0.0001877088361652568,
"learning_rate": 3.26677048759866e-05,
"loss": 0.0,
"num_input_tokens_seen": 3196416,
"step": 5170
},
{
"epoch": 9.224598930481283,
"grad_norm": 2.6759424144984223e-05,
"learning_rate": 3.26306811162965e-05,
"loss": 0.0,
"num_input_tokens_seen": 3199488,
"step": 5175
},
{
"epoch": 9.233511586452764,
"grad_norm": 2.7462174330139533e-05,
"learning_rate": 3.259363889233231e-05,
"loss": 0.0,
"num_input_tokens_seen": 3202080,
"step": 5180
},
{
"epoch": 9.242424242424242,
"grad_norm": 1.7499840396340005e-05,
"learning_rate": 3.255657829372662e-05,
"loss": 0.0,
"num_input_tokens_seen": 3205216,
"step": 5185
},
{
"epoch": 9.251336898395722,
"grad_norm": 3.1354134989669546e-05,
"learning_rate": 3.251949941015646e-05,
"loss": 0.0,
"num_input_tokens_seen": 3208096,
"step": 5190
},
{
"epoch": 9.260249554367201,
"grad_norm": 1.662471731833648e-05,
"learning_rate": 3.248240233134317e-05,
"loss": 0.0,
"num_input_tokens_seen": 3211136,
"step": 5195
},
{
"epoch": 9.269162210338681,
"grad_norm": 2.0622837837436236e-05,
"learning_rate": 3.2445287147052086e-05,
"loss": 0.0,
"num_input_tokens_seen": 3213952,
"step": 5200
},
{
"epoch": 9.27807486631016,
"grad_norm": 3.4175893233623356e-05,
"learning_rate": 3.240815394709234e-05,
"loss": 0.0,
"num_input_tokens_seen": 3217888,
"step": 5205
},
{
"epoch": 9.28698752228164,
"grad_norm": 5.586072074947879e-05,
"learning_rate": 3.237100282131665e-05,
"loss": 0.0,
"num_input_tokens_seen": 3221120,
"step": 5210
},
{
"epoch": 9.29590017825312,
"grad_norm": 2.8636835850193165e-05,
"learning_rate": 3.2333833859621153e-05,
"loss": 0.0,
"num_input_tokens_seen": 3224032,
"step": 5215
},
{
"epoch": 9.304812834224599,
"grad_norm": 9.697127825347707e-05,
"learning_rate": 3.2296647151945114e-05,
"loss": 0.0,
"num_input_tokens_seen": 3227072,
"step": 5220
},
{
"epoch": 9.313725490196079,
"grad_norm": 2.4525064873159863e-05,
"learning_rate": 3.225944278827074e-05,
"loss": 0.0,
"num_input_tokens_seen": 3230208,
"step": 5225
},
{
"epoch": 9.322638146167558,
"grad_norm": 1.6048215911723673e-05,
"learning_rate": 3.222222085862297e-05,
"loss": 0.0,
"num_input_tokens_seen": 3232736,
"step": 5230
},
{
"epoch": 9.331550802139038,
"grad_norm": 1.599715687916614e-05,
"learning_rate": 3.218498145306925e-05,
"loss": 0.0,
"num_input_tokens_seen": 3235808,
"step": 5235
},
{
"epoch": 9.340463458110516,
"grad_norm": 2.0266739738872275e-05,
"learning_rate": 3.21477246617193e-05,
"loss": 0.0,
"num_input_tokens_seen": 3239648,
"step": 5240
},
{
"epoch": 9.349376114081997,
"grad_norm": 1.4434924196393695e-05,
"learning_rate": 3.211045057472491e-05,
"loss": 0.0,
"num_input_tokens_seen": 3242848,
"step": 5245
},
{
"epoch": 9.358288770053475,
"grad_norm": 3.0308527129818685e-05,
"learning_rate": 3.207315928227974e-05,
"loss": 0.0,
"num_input_tokens_seen": 3245600,
"step": 5250
},
{
"epoch": 9.367201426024955,
"grad_norm": 1.3195046449254733e-05,
"learning_rate": 3.2035850874619055e-05,
"loss": 0.0,
"num_input_tokens_seen": 3249088,
"step": 5255
},
{
"epoch": 9.376114081996436,
"grad_norm": 3.300649404991418e-05,
"learning_rate": 3.199852544201955e-05,
"loss": 0.0,
"num_input_tokens_seen": 3252800,
"step": 5260
},
{
"epoch": 9.385026737967914,
"grad_norm": 2.0082863557036035e-05,
"learning_rate": 3.1961183074799143e-05,
"loss": 0.0,
"num_input_tokens_seen": 3255968,
"step": 5265
},
{
"epoch": 9.393939393939394,
"grad_norm": 2.281305569340475e-05,
"learning_rate": 3.192382386331667e-05,
"loss": 0.0,
"num_input_tokens_seen": 3258336,
"step": 5270
},
{
"epoch": 9.402852049910873,
"grad_norm": 4.2561547161312774e-05,
"learning_rate": 3.188644789797177e-05,
"loss": 0.0,
"num_input_tokens_seen": 3261440,
"step": 5275
},
{
"epoch": 9.411764705882353,
"grad_norm": 1.503498151578242e-05,
"learning_rate": 3.1849055269204604e-05,
"loss": 0.0,
"num_input_tokens_seen": 3264192,
"step": 5280
},
{
"epoch": 9.420677361853832,
"grad_norm": 4.1438135667704046e-05,
"learning_rate": 3.181164606749566e-05,
"loss": 0.0,
"num_input_tokens_seen": 3267328,
"step": 5285
},
{
"epoch": 9.429590017825312,
"grad_norm": 5.512424104381353e-05,
"learning_rate": 3.177422038336554e-05,
"loss": 0.0,
"num_input_tokens_seen": 3269984,
"step": 5290
},
{
"epoch": 9.43850267379679,
"grad_norm": 6.817230314482003e-05,
"learning_rate": 3.17367783073747e-05,
"loss": 0.0,
"num_input_tokens_seen": 3272960,
"step": 5295
},
{
"epoch": 9.44741532976827,
"grad_norm": 1.5790932593517937e-05,
"learning_rate": 3.169931993012328e-05,
"loss": 0.0,
"num_input_tokens_seen": 3275744,
"step": 5300
},
{
"epoch": 9.456327985739751,
"grad_norm": 1.866510501713492e-05,
"learning_rate": 3.166184534225087e-05,
"loss": 0.0,
"num_input_tokens_seen": 3278656,
"step": 5305
},
{
"epoch": 9.46524064171123,
"grad_norm": 0.00014416482008527964,
"learning_rate": 3.162435463443628e-05,
"loss": 0.0,
"num_input_tokens_seen": 3282272,
"step": 5310
},
{
"epoch": 9.47415329768271,
"grad_norm": 5.762660293839872e-05,
"learning_rate": 3.158684789739731e-05,
"loss": 0.0,
"num_input_tokens_seen": 3284992,
"step": 5315
},
{
"epoch": 9.483065953654188,
"grad_norm": 2.13386447285302e-05,
"learning_rate": 3.1549325221890575e-05,
"loss": 0.0,
"num_input_tokens_seen": 3288256,
"step": 5320
},
{
"epoch": 9.491978609625669,
"grad_norm": 2.8539412596728653e-05,
"learning_rate": 3.1511786698711224e-05,
"loss": 0.0,
"num_input_tokens_seen": 3291104,
"step": 5325
},
{
"epoch": 9.500891265597147,
"grad_norm": 2.0268642401788384e-05,
"learning_rate": 3.147423241869278e-05,
"loss": 0.0,
"num_input_tokens_seen": 3294336,
"step": 5330
},
{
"epoch": 9.509803921568627,
"grad_norm": 1.5605648513883352e-05,
"learning_rate": 3.1436662472706895e-05,
"loss": 0.0,
"num_input_tokens_seen": 3297568,
"step": 5335
},
{
"epoch": 9.518716577540108,
"grad_norm": 4.1615989175625145e-05,
"learning_rate": 3.139907695166311e-05,
"loss": 0.0,
"num_input_tokens_seen": 3300256,
"step": 5340
},
{
"epoch": 9.527629233511586,
"grad_norm": 4.602934859576635e-05,
"learning_rate": 3.1361475946508645e-05,
"loss": 0.0,
"num_input_tokens_seen": 3303648,
"step": 5345
},
{
"epoch": 9.536541889483066,
"grad_norm": 3.193959128111601e-05,
"learning_rate": 3.132385954822823e-05,
"loss": 0.0,
"num_input_tokens_seen": 3307264,
"step": 5350
},
{
"epoch": 9.545454545454545,
"grad_norm": 1.5343108316301368e-05,
"learning_rate": 3.128622784784381e-05,
"loss": 0.0,
"num_input_tokens_seen": 3310624,
"step": 5355
},
{
"epoch": 9.554367201426025,
"grad_norm": 2.4617331291665323e-05,
"learning_rate": 3.1248580936414354e-05,
"loss": 0.0,
"num_input_tokens_seen": 3313376,
"step": 5360
},
{
"epoch": 9.563279857397504,
"grad_norm": 6.980136095080525e-05,
"learning_rate": 3.1210918905035655e-05,
"loss": 0.0,
"num_input_tokens_seen": 3316576,
"step": 5365
},
{
"epoch": 9.572192513368984,
"grad_norm": 2.2588457795791328e-05,
"learning_rate": 3.117324184484008e-05,
"loss": 0.0,
"num_input_tokens_seen": 3319264,
"step": 5370
},
{
"epoch": 9.581105169340464,
"grad_norm": 1.3750898688158486e-05,
"learning_rate": 3.1135549846996384e-05,
"loss": 0.0,
"num_input_tokens_seen": 3321920,
"step": 5375
},
{
"epoch": 9.590017825311943,
"grad_norm": 2.8685486540780403e-05,
"learning_rate": 3.109784300270943e-05,
"loss": 0.0,
"num_input_tokens_seen": 3324832,
"step": 5380
},
{
"epoch": 9.598930481283423,
"grad_norm": 6.582101195817813e-05,
"learning_rate": 3.106012140322004e-05,
"loss": 0.0,
"num_input_tokens_seen": 3327616,
"step": 5385
},
{
"epoch": 9.607843137254902,
"grad_norm": 1.9889133909600787e-05,
"learning_rate": 3.102238513980471e-05,
"loss": 0.0,
"num_input_tokens_seen": 3329984,
"step": 5390
},
{
"epoch": 9.616755793226382,
"grad_norm": 3.023408498847857e-05,
"learning_rate": 3.098463430377544e-05,
"loss": 0.0,
"num_input_tokens_seen": 3333312,
"step": 5395
},
{
"epoch": 9.62566844919786,
"grad_norm": 1.3414460227068048e-05,
"learning_rate": 3.09468689864795e-05,
"loss": 0.0,
"num_input_tokens_seen": 3336384,
"step": 5400
},
{
"epoch": 9.63458110516934,
"grad_norm": 1.646774762775749e-05,
"learning_rate": 3.090908927929917e-05,
"loss": 0.0,
"num_input_tokens_seen": 3339520,
"step": 5405
},
{
"epoch": 9.643493761140821,
"grad_norm": 8.691311813890934e-05,
"learning_rate": 3.087129527365158e-05,
"loss": 0.0,
"num_input_tokens_seen": 3342528,
"step": 5410
},
{
"epoch": 9.6524064171123,
"grad_norm": 3.321999975014478e-05,
"learning_rate": 3.083348706098844e-05,
"loss": 0.0,
"num_input_tokens_seen": 3345536,
"step": 5415
},
{
"epoch": 9.66131907308378,
"grad_norm": 2.6839175916393287e-05,
"learning_rate": 3.0795664732795825e-05,
"loss": 0.0,
"num_input_tokens_seen": 3349184,
"step": 5420
},
{
"epoch": 9.670231729055258,
"grad_norm": 2.5091510906349868e-05,
"learning_rate": 3.075782838059402e-05,
"loss": 0.0,
"num_input_tokens_seen": 3351904,
"step": 5425
},
{
"epoch": 9.679144385026738,
"grad_norm": 0.000148052436998114,
"learning_rate": 3.071997809593719e-05,
"loss": 0.0,
"num_input_tokens_seen": 3355424,
"step": 5430
},
{
"epoch": 9.688057040998217,
"grad_norm": 3.8398997276090086e-05,
"learning_rate": 3.068211397041322e-05,
"loss": 0.0,
"num_input_tokens_seen": 3358208,
"step": 5435
},
{
"epoch": 9.696969696969697,
"grad_norm": 1.5549350791843608e-05,
"learning_rate": 3.064423609564352e-05,
"loss": 0.0,
"num_input_tokens_seen": 3361472,
"step": 5440
},
{
"epoch": 9.705882352941176,
"grad_norm": 5.20404391863849e-05,
"learning_rate": 3.060634456328273e-05,
"loss": 0.0,
"num_input_tokens_seen": 3364640,
"step": 5445
},
{
"epoch": 9.714795008912656,
"grad_norm": 0.00016949191922321916,
"learning_rate": 3.056843946501856e-05,
"loss": 0.0,
"num_input_tokens_seen": 3367520,
"step": 5450
},
{
"epoch": 9.723707664884136,
"grad_norm": 1.3725932149100117e-05,
"learning_rate": 3.053052089257154e-05,
"loss": 0.0,
"num_input_tokens_seen": 3370656,
"step": 5455
},
{
"epoch": 9.732620320855615,
"grad_norm": 1.540976700198371e-05,
"learning_rate": 3.0492588937694814e-05,
"loss": 0.0,
"num_input_tokens_seen": 3373984,
"step": 5460
},
{
"epoch": 9.741532976827095,
"grad_norm": 1.4343509064929094e-05,
"learning_rate": 3.0454643692173883e-05,
"loss": 0.0,
"num_input_tokens_seen": 3377312,
"step": 5465
},
{
"epoch": 9.750445632798574,
"grad_norm": 2.721649252634961e-05,
"learning_rate": 3.0416685247826443e-05,
"loss": 0.0,
"num_input_tokens_seen": 3380224,
"step": 5470
},
{
"epoch": 9.759358288770054,
"grad_norm": 2.0400497305672616e-05,
"learning_rate": 3.0378713696502097e-05,
"loss": 0.0,
"num_input_tokens_seen": 3383424,
"step": 5475
},
{
"epoch": 9.768270944741532,
"grad_norm": 2.7101803425466642e-05,
"learning_rate": 3.0340729130082175e-05,
"loss": 0.0,
"num_input_tokens_seen": 3387104,
"step": 5480
},
{
"epoch": 9.777183600713013,
"grad_norm": 1.8909197024186142e-05,
"learning_rate": 3.03027316404795e-05,
"loss": 0.0,
"num_input_tokens_seen": 3389792,
"step": 5485
},
{
"epoch": 9.786096256684491,
"grad_norm": 1.3240232874522917e-05,
"learning_rate": 3.026472131963817e-05,
"loss": 0.0,
"num_input_tokens_seen": 3392704,
"step": 5490
},
{
"epoch": 9.795008912655971,
"grad_norm": 4.4982502004131675e-05,
"learning_rate": 3.0226698259533332e-05,
"loss": 0.0,
"num_input_tokens_seen": 3396640,
"step": 5495
},
{
"epoch": 9.803921568627452,
"grad_norm": 2.582524757599458e-05,
"learning_rate": 3.0188662552170943e-05,
"loss": 0.0,
"num_input_tokens_seen": 3400256,
"step": 5500
},
{
"epoch": 9.81283422459893,
"grad_norm": 2.0919447706546634e-05,
"learning_rate": 3.0150614289587585e-05,
"loss": 0.0,
"num_input_tokens_seen": 3403456,
"step": 5505
},
{
"epoch": 9.82174688057041,
"grad_norm": 1.901478935906198e-05,
"learning_rate": 3.0112553563850197e-05,
"loss": 0.0,
"num_input_tokens_seen": 3406528,
"step": 5510
},
{
"epoch": 9.830659536541889,
"grad_norm": 3.364668009453453e-05,
"learning_rate": 3.0074480467055905e-05,
"loss": 0.0,
"num_input_tokens_seen": 3409280,
"step": 5515
},
{
"epoch": 9.83957219251337,
"grad_norm": 1.191617593576666e-05,
"learning_rate": 3.0036395091331743e-05,
"loss": 0.0,
"num_input_tokens_seen": 3412384,
"step": 5520
},
{
"epoch": 9.848484848484848,
"grad_norm": 0.00011046986764995381,
"learning_rate": 2.999829752883446e-05,
"loss": 0.0,
"num_input_tokens_seen": 3416384,
"step": 5525
},
{
"epoch": 9.857397504456328,
"grad_norm": 1.4666607057733927e-05,
"learning_rate": 2.996018787175031e-05,
"loss": 0.0,
"num_input_tokens_seen": 3418720,
"step": 5530
},
{
"epoch": 9.866310160427808,
"grad_norm": 4.578886000672355e-05,
"learning_rate": 2.9922066212294808e-05,
"loss": 0.0,
"num_input_tokens_seen": 3422240,
"step": 5535
},
{
"epoch": 9.875222816399287,
"grad_norm": 6.549031240865588e-05,
"learning_rate": 2.988393264271249e-05,
"loss": 0.0,
"num_input_tokens_seen": 3425408,
"step": 5540
},
{
"epoch": 9.884135472370767,
"grad_norm": 3.163235305692069e-05,
"learning_rate": 2.9845787255276753e-05,
"loss": 0.0,
"num_input_tokens_seen": 3427936,
"step": 5545
},
{
"epoch": 9.893048128342246,
"grad_norm": 7.271839422173798e-05,
"learning_rate": 2.980763014228955e-05,
"loss": 0.0,
"num_input_tokens_seen": 3431392,
"step": 5550
},
{
"epoch": 9.901960784313726,
"grad_norm": 1.4547945283993613e-05,
"learning_rate": 2.9769461396081216e-05,
"loss": 0.0,
"num_input_tokens_seen": 3434624,
"step": 5555
},
{
"epoch": 9.910873440285204,
"grad_norm": 2.0756122466991656e-05,
"learning_rate": 2.9731281109010256e-05,
"loss": 0.0,
"num_input_tokens_seen": 3437760,
"step": 5560
},
{
"epoch": 9.919786096256685,
"grad_norm": 3.5389031836530194e-05,
"learning_rate": 2.9693089373463083e-05,
"loss": 0.0,
"num_input_tokens_seen": 3440992,
"step": 5565
},
{
"epoch": 9.928698752228165,
"grad_norm": 1.5621943020960316e-05,
"learning_rate": 2.965488628185381e-05,
"loss": 0.0,
"num_input_tokens_seen": 3443936,
"step": 5570
},
{
"epoch": 9.937611408199643,
"grad_norm": 1.6809039152576588e-05,
"learning_rate": 2.9616671926624047e-05,
"loss": 0.0,
"num_input_tokens_seen": 3446208,
"step": 5575
},
{
"epoch": 9.946524064171124,
"grad_norm": 1.767824505805038e-05,
"learning_rate": 2.957844640024263e-05,
"loss": 0.0,
"num_input_tokens_seen": 3449152,
"step": 5580
},
{
"epoch": 9.955436720142602,
"grad_norm": 4.033447476103902e-05,
"learning_rate": 2.9540209795205458e-05,
"loss": 0.0,
"num_input_tokens_seen": 3452000,
"step": 5585
},
{
"epoch": 9.964349376114082,
"grad_norm": 5.009349843021482e-05,
"learning_rate": 2.9501962204035217e-05,
"loss": 0.0,
"num_input_tokens_seen": 3455712,
"step": 5590
},
{
"epoch": 9.973262032085561,
"grad_norm": 1.348033401882276e-05,
"learning_rate": 2.9463703719281187e-05,
"loss": 0.0,
"num_input_tokens_seen": 3458880,
"step": 5595
},
{
"epoch": 9.982174688057041,
"grad_norm": 2.4049759304034524e-05,
"learning_rate": 2.9425434433518985e-05,
"loss": 0.0,
"num_input_tokens_seen": 3461632,
"step": 5600
},
{
"epoch": 9.99108734402852,
"grad_norm": 1.4484941857517697e-05,
"learning_rate": 2.9387154439350406e-05,
"loss": 0.0,
"num_input_tokens_seen": 3464064,
"step": 5605
},
{
"epoch": 10.0,
"grad_norm": 1.7108382962760516e-05,
"learning_rate": 2.9348863829403117e-05,
"loss": 0.0,
"num_input_tokens_seen": 3466384,
"step": 5610
},
{
"epoch": 10.0,
"eval_loss": 0.23532113432884216,
"eval_runtime": 4.5836,
"eval_samples_per_second": 54.324,
"eval_steps_per_second": 13.745,
"num_input_tokens_seen": 3466384,
"step": 5610
},
{
"epoch": 10.00891265597148,
"grad_norm": 1.203343526867684e-05,
"learning_rate": 2.931056269633049e-05,
"loss": 0.0,
"num_input_tokens_seen": 3468944,
"step": 5615
},
{
"epoch": 10.017825311942959,
"grad_norm": 1.2630572200578172e-05,
"learning_rate": 2.9272251132811368e-05,
"loss": 0.0,
"num_input_tokens_seen": 3472144,
"step": 5620
},
{
"epoch": 10.026737967914439,
"grad_norm": 1.4490614375972655e-05,
"learning_rate": 2.9233929231549806e-05,
"loss": 0.0,
"num_input_tokens_seen": 3474608,
"step": 5625
},
{
"epoch": 10.035650623885918,
"grad_norm": 8.628850628156215e-05,
"learning_rate": 2.9195597085274893e-05,
"loss": 0.0,
"num_input_tokens_seen": 3477552,
"step": 5630
},
{
"epoch": 10.044563279857398,
"grad_norm": 3.0749939469387755e-05,
"learning_rate": 2.915725478674053e-05,
"loss": 0.0,
"num_input_tokens_seen": 3480592,
"step": 5635
},
{
"epoch": 10.053475935828876,
"grad_norm": 1.7167909390991554e-05,
"learning_rate": 2.9118902428725132e-05,
"loss": 0.0,
"num_input_tokens_seen": 3484656,
"step": 5640
},
{
"epoch": 10.062388591800357,
"grad_norm": 9.126300574280322e-05,
"learning_rate": 2.9080540104031485e-05,
"loss": 0.0,
"num_input_tokens_seen": 3487440,
"step": 5645
},
{
"epoch": 10.071301247771837,
"grad_norm": 2.4468277842970565e-05,
"learning_rate": 2.9042167905486506e-05,
"loss": 0.0,
"num_input_tokens_seen": 3490832,
"step": 5650
},
{
"epoch": 10.080213903743315,
"grad_norm": 1.621020601305645e-05,
"learning_rate": 2.9003785925940975e-05,
"loss": 0.0,
"num_input_tokens_seen": 3493424,
"step": 5655
},
{
"epoch": 10.089126559714796,
"grad_norm": 0.00030905677704140544,
"learning_rate": 2.896539425826935e-05,
"loss": 0.0,
"num_input_tokens_seen": 3497008,
"step": 5660
},
{
"epoch": 10.098039215686274,
"grad_norm": 1.6341586160706356e-05,
"learning_rate": 2.8926992995369556e-05,
"loss": 0.0,
"num_input_tokens_seen": 3500720,
"step": 5665
},
{
"epoch": 10.106951871657754,
"grad_norm": 1.4716282748850062e-05,
"learning_rate": 2.8888582230162688e-05,
"loss": 0.0,
"num_input_tokens_seen": 3503696,
"step": 5670
},
{
"epoch": 10.115864527629233,
"grad_norm": 1.3503812624549028e-05,
"learning_rate": 2.8850162055592866e-05,
"loss": 0.0,
"num_input_tokens_seen": 3506704,
"step": 5675
},
{
"epoch": 10.124777183600713,
"grad_norm": 0.000634489580988884,
"learning_rate": 2.8811732564626987e-05,
"loss": 0.0,
"num_input_tokens_seen": 3510352,
"step": 5680
},
{
"epoch": 10.133689839572192,
"grad_norm": 1.2275093467906117e-05,
"learning_rate": 2.8773293850254463e-05,
"loss": 0.0,
"num_input_tokens_seen": 3513360,
"step": 5685
},
{
"epoch": 10.142602495543672,
"grad_norm": 1.3238013707450591e-05,
"learning_rate": 2.8734846005487036e-05,
"loss": 0.0,
"num_input_tokens_seen": 3516528,
"step": 5690
},
{
"epoch": 10.151515151515152,
"grad_norm": 1.2889286153949797e-05,
"learning_rate": 2.8696389123358553e-05,
"loss": 0.0,
"num_input_tokens_seen": 3519664,
"step": 5695
},
{
"epoch": 10.16042780748663,
"grad_norm": 3.7722624256275594e-05,
"learning_rate": 2.865792329692472e-05,
"loss": 0.0,
"num_input_tokens_seen": 3522608,
"step": 5700
},
{
"epoch": 10.169340463458111,
"grad_norm": 1.4927626580174547e-05,
"learning_rate": 2.8619448619262874e-05,
"loss": 0.0,
"num_input_tokens_seen": 3525360,
"step": 5705
},
{
"epoch": 10.17825311942959,
"grad_norm": 1.6124704416142777e-05,
"learning_rate": 2.8580965183471792e-05,
"loss": 0.0,
"num_input_tokens_seen": 3528752,
"step": 5710
},
{
"epoch": 10.18716577540107,
"grad_norm": 1.633623833185993e-05,
"learning_rate": 2.854247308267142e-05,
"loss": 0.0,
"num_input_tokens_seen": 3532560,
"step": 5715
},
{
"epoch": 10.196078431372548,
"grad_norm": 0.0005267616361379623,
"learning_rate": 2.8503972410002693e-05,
"loss": 0.0,
"num_input_tokens_seen": 3535984,
"step": 5720
},
{
"epoch": 10.204991087344029,
"grad_norm": 0.00010131792805623263,
"learning_rate": 2.8465463258627283e-05,
"loss": 0.0,
"num_input_tokens_seen": 3538864,
"step": 5725
},
{
"epoch": 10.213903743315509,
"grad_norm": 2.3571072233607993e-05,
"learning_rate": 2.8426945721727366e-05,
"loss": 0.0,
"num_input_tokens_seen": 3541680,
"step": 5730
},
{
"epoch": 10.222816399286987,
"grad_norm": 1.532697024231311e-05,
"learning_rate": 2.838841989250541e-05,
"loss": 0.0,
"num_input_tokens_seen": 3544592,
"step": 5735
},
{
"epoch": 10.231729055258468,
"grad_norm": 1.6518961274414323e-05,
"learning_rate": 2.8349885864183955e-05,
"loss": 0.0,
"num_input_tokens_seen": 3548016,
"step": 5740
},
{
"epoch": 10.240641711229946,
"grad_norm": 2.0978848624508828e-05,
"learning_rate": 2.8311343730005397e-05,
"loss": 0.0,
"num_input_tokens_seen": 3550800,
"step": 5745
},
{
"epoch": 10.249554367201426,
"grad_norm": 1.3249901712697465e-05,
"learning_rate": 2.827279358323171e-05,
"loss": 0.0,
"num_input_tokens_seen": 3553840,
"step": 5750
},
{
"epoch": 10.258467023172905,
"grad_norm": 2.9433726012939587e-05,
"learning_rate": 2.823423551714429e-05,
"loss": 0.0,
"num_input_tokens_seen": 3557008,
"step": 5755
},
{
"epoch": 10.267379679144385,
"grad_norm": 6.580901390407234e-05,
"learning_rate": 2.819566962504367e-05,
"loss": 0.0,
"num_input_tokens_seen": 3560432,
"step": 5760
},
{
"epoch": 10.276292335115864,
"grad_norm": 1.4316668966785073e-05,
"learning_rate": 2.8157096000249334e-05,
"loss": 0.0,
"num_input_tokens_seen": 3563472,
"step": 5765
},
{
"epoch": 10.285204991087344,
"grad_norm": 2.383297214691993e-05,
"learning_rate": 2.8118514736099482e-05,
"loss": 0.0,
"num_input_tokens_seen": 3567088,
"step": 5770
},
{
"epoch": 10.294117647058824,
"grad_norm": 4.281475776224397e-05,
"learning_rate": 2.8079925925950784e-05,
"loss": 0.0,
"num_input_tokens_seen": 3569584,
"step": 5775
},
{
"epoch": 10.303030303030303,
"grad_norm": 2.8406180717865936e-05,
"learning_rate": 2.8041329663178173e-05,
"loss": 0.0,
"num_input_tokens_seen": 3572464,
"step": 5780
},
{
"epoch": 10.311942959001783,
"grad_norm": 1.3338190910872072e-05,
"learning_rate": 2.800272604117463e-05,
"loss": 0.0,
"num_input_tokens_seen": 3575120,
"step": 5785
},
{
"epoch": 10.320855614973262,
"grad_norm": 1.4634756553277839e-05,
"learning_rate": 2.7964115153350927e-05,
"loss": 0.0,
"num_input_tokens_seen": 3578128,
"step": 5790
},
{
"epoch": 10.329768270944742,
"grad_norm": 1.9146786144119687e-05,
"learning_rate": 2.7925497093135424e-05,
"loss": 0.0,
"num_input_tokens_seen": 3581648,
"step": 5795
},
{
"epoch": 10.33868092691622,
"grad_norm": 2.9808454200974666e-05,
"learning_rate": 2.7886871953973838e-05,
"loss": 0.0,
"num_input_tokens_seen": 3585264,
"step": 5800
},
{
"epoch": 10.3475935828877,
"grad_norm": 1.4228238796931691e-05,
"learning_rate": 2.7848239829329002e-05,
"loss": 0.0,
"num_input_tokens_seen": 3588048,
"step": 5805
},
{
"epoch": 10.35650623885918,
"grad_norm": 1.5599915059283376e-05,
"learning_rate": 2.7809600812680674e-05,
"loss": 0.0,
"num_input_tokens_seen": 3591216,
"step": 5810
},
{
"epoch": 10.36541889483066,
"grad_norm": 0.0007058135233819485,
"learning_rate": 2.7770954997525277e-05,
"loss": 0.0,
"num_input_tokens_seen": 3594800,
"step": 5815
},
{
"epoch": 10.37433155080214,
"grad_norm": 1.2978567610844038e-05,
"learning_rate": 2.7732302477375688e-05,
"loss": 0.0,
"num_input_tokens_seen": 3597264,
"step": 5820
},
{
"epoch": 10.383244206773618,
"grad_norm": 1.385369341733167e-05,
"learning_rate": 2.769364334576099e-05,
"loss": 0.0,
"num_input_tokens_seen": 3601136,
"step": 5825
},
{
"epoch": 10.392156862745098,
"grad_norm": 1.804764542612247e-05,
"learning_rate": 2.7654977696226292e-05,
"loss": 0.0,
"num_input_tokens_seen": 3604432,
"step": 5830
},
{
"epoch": 10.401069518716577,
"grad_norm": 1.2603826689883135e-05,
"learning_rate": 2.7616305622332466e-05,
"loss": 0.0,
"num_input_tokens_seen": 3607024,
"step": 5835
},
{
"epoch": 10.409982174688057,
"grad_norm": 1.2305051313887816e-05,
"learning_rate": 2.7577627217655916e-05,
"loss": 0.0,
"num_input_tokens_seen": 3610640,
"step": 5840
},
{
"epoch": 10.418894830659536,
"grad_norm": 1.807206353987567e-05,
"learning_rate": 2.7538942575788386e-05,
"loss": 0.0,
"num_input_tokens_seen": 3613424,
"step": 5845
},
{
"epoch": 10.427807486631016,
"grad_norm": 1.0147291504836176e-05,
"learning_rate": 2.7500251790336683e-05,
"loss": 0.0,
"num_input_tokens_seen": 3617008,
"step": 5850
},
{
"epoch": 10.436720142602496,
"grad_norm": 0.00023956519726198167,
"learning_rate": 2.7461554954922514e-05,
"loss": 0.0,
"num_input_tokens_seen": 3620048,
"step": 5855
},
{
"epoch": 10.445632798573975,
"grad_norm": 4.6874189138179645e-05,
"learning_rate": 2.7422852163182205e-05,
"loss": 0.0,
"num_input_tokens_seen": 3624080,
"step": 5860
},
{
"epoch": 10.454545454545455,
"grad_norm": 1.3826291251461953e-05,
"learning_rate": 2.7384143508766496e-05,
"loss": 0.0,
"num_input_tokens_seen": 3627664,
"step": 5865
},
{
"epoch": 10.463458110516934,
"grad_norm": 2.025993489951361e-05,
"learning_rate": 2.7345429085340314e-05,
"loss": 0.0,
"num_input_tokens_seen": 3630384,
"step": 5870
},
{
"epoch": 10.472370766488414,
"grad_norm": 1.1854433068947401e-05,
"learning_rate": 2.7306708986582553e-05,
"loss": 0.0,
"num_input_tokens_seen": 3633936,
"step": 5875
},
{
"epoch": 10.481283422459892,
"grad_norm": 1.50766263686819e-05,
"learning_rate": 2.7267983306185836e-05,
"loss": 0.0,
"num_input_tokens_seen": 3637552,
"step": 5880
},
{
"epoch": 10.490196078431373,
"grad_norm": 1.3261160347610712e-05,
"learning_rate": 2.722925213785628e-05,
"loss": 0.0,
"num_input_tokens_seen": 3640560,
"step": 5885
},
{
"epoch": 10.499108734402853,
"grad_norm": 4.532691673375666e-05,
"learning_rate": 2.7190515575313307e-05,
"loss": 0.0,
"num_input_tokens_seen": 3643920,
"step": 5890
},
{
"epoch": 10.508021390374331,
"grad_norm": 1.3621403013530653e-05,
"learning_rate": 2.7151773712289358e-05,
"loss": 0.0,
"num_input_tokens_seen": 3647088,
"step": 5895
},
{
"epoch": 10.516934046345812,
"grad_norm": 1.2079704902134836e-05,
"learning_rate": 2.711302664252973e-05,
"loss": 0.0,
"num_input_tokens_seen": 3650192,
"step": 5900
},
{
"epoch": 10.52584670231729,
"grad_norm": 1.122187950386433e-05,
"learning_rate": 2.707427445979232e-05,
"loss": 0.0,
"num_input_tokens_seen": 3653552,
"step": 5905
},
{
"epoch": 10.53475935828877,
"grad_norm": 2.3511658582719974e-05,
"learning_rate": 2.7035517257847358e-05,
"loss": 0.0,
"num_input_tokens_seen": 3655760,
"step": 5910
},
{
"epoch": 10.543672014260249,
"grad_norm": 1.2023041563224979e-05,
"learning_rate": 2.699675513047726e-05,
"loss": 0.0,
"num_input_tokens_seen": 3658640,
"step": 5915
},
{
"epoch": 10.55258467023173,
"grad_norm": 1.289930150960572e-05,
"learning_rate": 2.6957988171476344e-05,
"loss": 0.0,
"num_input_tokens_seen": 3661392,
"step": 5920
},
{
"epoch": 10.56149732620321,
"grad_norm": 3.980232577305287e-05,
"learning_rate": 2.691921647465062e-05,
"loss": 0.0,
"num_input_tokens_seen": 3665040,
"step": 5925
},
{
"epoch": 10.570409982174688,
"grad_norm": 7.775369158480316e-05,
"learning_rate": 2.6880440133817562e-05,
"loss": 0.0,
"num_input_tokens_seen": 3667824,
"step": 5930
},
{
"epoch": 10.579322638146168,
"grad_norm": 1.4728250789630692e-05,
"learning_rate": 2.684165924280589e-05,
"loss": 0.0,
"num_input_tokens_seen": 3671152,
"step": 5935
},
{
"epoch": 10.588235294117647,
"grad_norm": 1.8075328625855036e-05,
"learning_rate": 2.6802873895455317e-05,
"loss": 0.0,
"num_input_tokens_seen": 3674576,
"step": 5940
},
{
"epoch": 10.597147950089127,
"grad_norm": 4.87805918965023e-05,
"learning_rate": 2.676408418561635e-05,
"loss": 0.0,
"num_input_tokens_seen": 3677808,
"step": 5945
},
{
"epoch": 10.606060606060606,
"grad_norm": 6.62174352328293e-05,
"learning_rate": 2.672529020715006e-05,
"loss": 0.0,
"num_input_tokens_seen": 3681648,
"step": 5950
},
{
"epoch": 10.614973262032086,
"grad_norm": 2.1246925825835206e-05,
"learning_rate": 2.6686492053927837e-05,
"loss": 0.0,
"num_input_tokens_seen": 3684592,
"step": 5955
},
{
"epoch": 10.623885918003564,
"grad_norm": 2.8126887627877295e-05,
"learning_rate": 2.664768981983116e-05,
"loss": 0.0,
"num_input_tokens_seen": 3688144,
"step": 5960
},
{
"epoch": 10.632798573975045,
"grad_norm": 0.00012585851072799414,
"learning_rate": 2.660888359875141e-05,
"loss": 0.0,
"num_input_tokens_seen": 3690864,
"step": 5965
},
{
"epoch": 10.641711229946525,
"grad_norm": 1.5733108739368618e-05,
"learning_rate": 2.6570073484589607e-05,
"loss": 0.0,
"num_input_tokens_seen": 3693968,
"step": 5970
},
{
"epoch": 10.650623885918003,
"grad_norm": 1.848669307946693e-05,
"learning_rate": 2.6531259571256166e-05,
"loss": 0.0,
"num_input_tokens_seen": 3697520,
"step": 5975
},
{
"epoch": 10.659536541889484,
"grad_norm": 1.7645805201027542e-05,
"learning_rate": 2.649244195267074e-05,
"loss": 0.0,
"num_input_tokens_seen": 3700848,
"step": 5980
},
{
"epoch": 10.668449197860962,
"grad_norm": 1.3177233086025808e-05,
"learning_rate": 2.6453620722761896e-05,
"loss": 0.0,
"num_input_tokens_seen": 3703376,
"step": 5985
},
{
"epoch": 10.677361853832442,
"grad_norm": 1.3371476597967558e-05,
"learning_rate": 2.6414795975466987e-05,
"loss": 0.0,
"num_input_tokens_seen": 3705776,
"step": 5990
},
{
"epoch": 10.686274509803921,
"grad_norm": 1.0603682312648743e-05,
"learning_rate": 2.637596780473186e-05,
"loss": 0.0,
"num_input_tokens_seen": 3708592,
"step": 5995
},
{
"epoch": 10.695187165775401,
"grad_norm": 1.2599515684996732e-05,
"learning_rate": 2.633713630451063e-05,
"loss": 0.0,
"num_input_tokens_seen": 3712080,
"step": 6000
},
{
"epoch": 10.70409982174688,
"grad_norm": 9.517248145129997e-06,
"learning_rate": 2.6298301568765478e-05,
"loss": 0.0,
"num_input_tokens_seen": 3715120,
"step": 6005
},
{
"epoch": 10.71301247771836,
"grad_norm": 2.19264293264132e-05,
"learning_rate": 2.6259463691466423e-05,
"loss": 0.0,
"num_input_tokens_seen": 3717936,
"step": 6010
},
{
"epoch": 10.72192513368984,
"grad_norm": 3.806079621426761e-05,
"learning_rate": 2.622062276659109e-05,
"loss": 0.0,
"num_input_tokens_seen": 3720688,
"step": 6015
},
{
"epoch": 10.730837789661319,
"grad_norm": 1.4707470654684585e-05,
"learning_rate": 2.6181778888124454e-05,
"loss": 0.0,
"num_input_tokens_seen": 3723920,
"step": 6020
},
{
"epoch": 10.739750445632799,
"grad_norm": 1.9322525986353867e-05,
"learning_rate": 2.6142932150058657e-05,
"loss": 0.0,
"num_input_tokens_seen": 3727440,
"step": 6025
},
{
"epoch": 10.748663101604278,
"grad_norm": 2.9112976335454732e-05,
"learning_rate": 2.6104082646392754e-05,
"loss": 0.0,
"num_input_tokens_seen": 3730480,
"step": 6030
},
{
"epoch": 10.757575757575758,
"grad_norm": 1.0991312592523172e-05,
"learning_rate": 2.606523047113249e-05,
"loss": 0.0,
"num_input_tokens_seen": 3733392,
"step": 6035
},
{
"epoch": 10.766488413547236,
"grad_norm": 1.333354703092482e-05,
"learning_rate": 2.6026375718290086e-05,
"loss": 0.0,
"num_input_tokens_seen": 3736112,
"step": 6040
},
{
"epoch": 10.775401069518717,
"grad_norm": 2.259712891827803e-05,
"learning_rate": 2.5987518481883987e-05,
"loss": 0.0,
"num_input_tokens_seen": 3739696,
"step": 6045
},
{
"epoch": 10.784313725490197,
"grad_norm": 1.3553658391174395e-05,
"learning_rate": 2.5948658855938644e-05,
"loss": 0.0,
"num_input_tokens_seen": 3742576,
"step": 6050
},
{
"epoch": 10.793226381461675,
"grad_norm": 1.4138406186248176e-05,
"learning_rate": 2.5909796934484308e-05,
"loss": 0.0,
"num_input_tokens_seen": 3745360,
"step": 6055
},
{
"epoch": 10.802139037433156,
"grad_norm": 0.0002746598329395056,
"learning_rate": 2.587093281155677e-05,
"loss": 0.0,
"num_input_tokens_seen": 3748464,
"step": 6060
},
{
"epoch": 10.811051693404634,
"grad_norm": 1.3880183360015508e-05,
"learning_rate": 2.5832066581197162e-05,
"loss": 0.0,
"num_input_tokens_seen": 3751920,
"step": 6065
},
{
"epoch": 10.819964349376114,
"grad_norm": 1.6997690181597136e-05,
"learning_rate": 2.5793198337451696e-05,
"loss": 0.0,
"num_input_tokens_seen": 3755088,
"step": 6070
},
{
"epoch": 10.828877005347593,
"grad_norm": 1.9864462956320494e-05,
"learning_rate": 2.575432817437146e-05,
"loss": 0.0,
"num_input_tokens_seen": 3757648,
"step": 6075
},
{
"epoch": 10.837789661319073,
"grad_norm": 1.3433314961730503e-05,
"learning_rate": 2.571545618601221e-05,
"loss": 0.0,
"num_input_tokens_seen": 3760912,
"step": 6080
},
{
"epoch": 10.846702317290553,
"grad_norm": 0.00028330503846518695,
"learning_rate": 2.567658246643409e-05,
"loss": 0.0,
"num_input_tokens_seen": 3764144,
"step": 6085
},
{
"epoch": 10.855614973262032,
"grad_norm": 1.4432083844440058e-05,
"learning_rate": 2.5637707109701442e-05,
"loss": 0.0,
"num_input_tokens_seen": 3767088,
"step": 6090
},
{
"epoch": 10.864527629233512,
"grad_norm": 9.103316551772878e-05,
"learning_rate": 2.559883020988258e-05,
"loss": 0.0,
"num_input_tokens_seen": 3770992,
"step": 6095
},
{
"epoch": 10.87344028520499,
"grad_norm": 1.3692199900106061e-05,
"learning_rate": 2.5559951861049532e-05,
"loss": 0.0,
"num_input_tokens_seen": 3773744,
"step": 6100
},
{
"epoch": 10.882352941176471,
"grad_norm": 2.3006872652331367e-05,
"learning_rate": 2.552107215727785e-05,
"loss": 0.0,
"num_input_tokens_seen": 3777904,
"step": 6105
},
{
"epoch": 10.89126559714795,
"grad_norm": 0.0002752006403170526,
"learning_rate": 2.5482191192646365e-05,
"loss": 0.0,
"num_input_tokens_seen": 3780912,
"step": 6110
},
{
"epoch": 10.90017825311943,
"grad_norm": 9.974956810765434e-06,
"learning_rate": 2.544330906123694e-05,
"loss": 0.0,
"num_input_tokens_seen": 3784016,
"step": 6115
},
{
"epoch": 10.909090909090908,
"grad_norm": 2.1135621864232235e-05,
"learning_rate": 2.5404425857134285e-05,
"loss": 0.0,
"num_input_tokens_seen": 3787024,
"step": 6120
},
{
"epoch": 10.918003565062389,
"grad_norm": 1.515540589025477e-05,
"learning_rate": 2.536554167442568e-05,
"loss": 0.0,
"num_input_tokens_seen": 3789584,
"step": 6125
},
{
"epoch": 10.926916221033869,
"grad_norm": 1.2581827832036652e-05,
"learning_rate": 2.53266566072008e-05,
"loss": 0.0,
"num_input_tokens_seen": 3792720,
"step": 6130
},
{
"epoch": 10.935828877005347,
"grad_norm": 0.0006793736247345805,
"learning_rate": 2.5287770749551442e-05,
"loss": 0.0,
"num_input_tokens_seen": 3795376,
"step": 6135
},
{
"epoch": 10.944741532976828,
"grad_norm": 1.5780657122377306e-05,
"learning_rate": 2.5248884195571326e-05,
"loss": 0.0,
"num_input_tokens_seen": 3798480,
"step": 6140
},
{
"epoch": 10.953654188948306,
"grad_norm": 1.0621036381053273e-05,
"learning_rate": 2.5209997039355837e-05,
"loss": 0.0,
"num_input_tokens_seen": 3801808,
"step": 6145
},
{
"epoch": 10.962566844919786,
"grad_norm": 9.538345693727024e-06,
"learning_rate": 2.517110937500185e-05,
"loss": 0.0,
"num_input_tokens_seen": 3804848,
"step": 6150
},
{
"epoch": 10.971479500891265,
"grad_norm": 0.00019309086201246828,
"learning_rate": 2.5132221296607445e-05,
"loss": 0.0,
"num_input_tokens_seen": 3808208,
"step": 6155
},
{
"epoch": 10.980392156862745,
"grad_norm": 0.0001642105489736423,
"learning_rate": 2.509333289827171e-05,
"loss": 0.0,
"num_input_tokens_seen": 3810800,
"step": 6160
},
{
"epoch": 10.989304812834224,
"grad_norm": 1.139036157837836e-05,
"learning_rate": 2.5054444274094507e-05,
"loss": 0.0,
"num_input_tokens_seen": 3813584,
"step": 6165
},
{
"epoch": 10.998217468805704,
"grad_norm": 1.1301727681711782e-05,
"learning_rate": 2.5015555518176243e-05,
"loss": 0.0,
"num_input_tokens_seen": 3817040,
"step": 6170
},
{
"epoch": 11.0,
"eval_loss": 0.24064487218856812,
"eval_runtime": 4.585,
"eval_samples_per_second": 54.308,
"eval_steps_per_second": 13.741,
"num_input_tokens_seen": 3817120,
"step": 6171
},
{
"epoch": 11.007130124777184,
"grad_norm": 1.7638394638197497e-05,
"learning_rate": 2.4976666724617657e-05,
"loss": 0.0,
"num_input_tokens_seen": 3819872,
"step": 6175
},
{
"epoch": 11.016042780748663,
"grad_norm": 1.0595828825898934e-05,
"learning_rate": 2.493777798751956e-05,
"loss": 0.0,
"num_input_tokens_seen": 3823296,
"step": 6180
},
{
"epoch": 11.024955436720143,
"grad_norm": 3.515037678880617e-05,
"learning_rate": 2.489888940098263e-05,
"loss": 0.0,
"num_input_tokens_seen": 3826208,
"step": 6185
},
{
"epoch": 11.033868092691621,
"grad_norm": 1.735261503199581e-05,
"learning_rate": 2.4860001059107187e-05,
"loss": 0.0,
"num_input_tokens_seen": 3829760,
"step": 6190
},
{
"epoch": 11.042780748663102,
"grad_norm": 1.1009148693119641e-05,
"learning_rate": 2.4821113055992965e-05,
"loss": 0.0,
"num_input_tokens_seen": 3833216,
"step": 6195
},
{
"epoch": 11.05169340463458,
"grad_norm": 1.4147810361464508e-05,
"learning_rate": 2.478222548573887e-05,
"loss": 0.0,
"num_input_tokens_seen": 3835840,
"step": 6200
},
{
"epoch": 11.06060606060606,
"grad_norm": 1.4920704415999353e-05,
"learning_rate": 2.4743338442442755e-05,
"loss": 0.0,
"num_input_tokens_seen": 3838560,
"step": 6205
},
{
"epoch": 11.06951871657754,
"grad_norm": 1.137426897912519e-05,
"learning_rate": 2.4704452020201206e-05,
"loss": 0.0,
"num_input_tokens_seen": 3841792,
"step": 6210
},
{
"epoch": 11.07843137254902,
"grad_norm": 1.3182347174733877e-05,
"learning_rate": 2.4665566313109307e-05,
"loss": 0.0,
"num_input_tokens_seen": 3844352,
"step": 6215
},
{
"epoch": 11.0873440285205,
"grad_norm": 1.325432094745338e-05,
"learning_rate": 2.4626681415260393e-05,
"loss": 0.0,
"num_input_tokens_seen": 3847552,
"step": 6220
},
{
"epoch": 11.096256684491978,
"grad_norm": 1.2028423952870071e-05,
"learning_rate": 2.4587797420745883e-05,
"loss": 0.0,
"num_input_tokens_seen": 3850016,
"step": 6225
},
{
"epoch": 11.105169340463458,
"grad_norm": 1.213027280755341e-05,
"learning_rate": 2.4548914423654973e-05,
"loss": 0.0,
"num_input_tokens_seen": 3852512,
"step": 6230
},
{
"epoch": 11.114081996434937,
"grad_norm": 1.2425961358530913e-05,
"learning_rate": 2.4510032518074443e-05,
"loss": 0.0,
"num_input_tokens_seen": 3855872,
"step": 6235
},
{
"epoch": 11.122994652406417,
"grad_norm": 1.2018854249618016e-05,
"learning_rate": 2.4471151798088466e-05,
"loss": 0.0,
"num_input_tokens_seen": 3859648,
"step": 6240
},
{
"epoch": 11.131907308377897,
"grad_norm": 7.846080552553758e-05,
"learning_rate": 2.4432272357778314e-05,
"loss": 0.0,
"num_input_tokens_seen": 3862208,
"step": 6245
},
{
"epoch": 11.140819964349376,
"grad_norm": 1.3241695342003368e-05,
"learning_rate": 2.439339429122216e-05,
"loss": 0.0,
"num_input_tokens_seen": 3864672,
"step": 6250
},
{
"epoch": 11.149732620320856,
"grad_norm": 6.398496770998463e-05,
"learning_rate": 2.4354517692494895e-05,
"loss": 0.0,
"num_input_tokens_seen": 3867872,
"step": 6255
},
{
"epoch": 11.158645276292335,
"grad_norm": 1.46424317790661e-05,
"learning_rate": 2.431564265566781e-05,
"loss": 0.0,
"num_input_tokens_seen": 3871264,
"step": 6260
},
{
"epoch": 11.167557932263815,
"grad_norm": 1.3262514585221652e-05,
"learning_rate": 2.427676927480845e-05,
"loss": 0.0,
"num_input_tokens_seen": 3874464,
"step": 6265
},
{
"epoch": 11.176470588235293,
"grad_norm": 1.7728940292727202e-05,
"learning_rate": 2.4237897643980328e-05,
"loss": 0.0,
"num_input_tokens_seen": 3878016,
"step": 6270
},
{
"epoch": 11.185383244206774,
"grad_norm": 1.4908625416865107e-05,
"learning_rate": 2.4199027857242734e-05,
"loss": 0.0,
"num_input_tokens_seen": 3880832,
"step": 6275
},
{
"epoch": 11.194295900178252,
"grad_norm": 1.1908256055903621e-05,
"learning_rate": 2.41601600086505e-05,
"loss": 0.0,
"num_input_tokens_seen": 3884800,
"step": 6280
},
{
"epoch": 11.203208556149733,
"grad_norm": 1.2285045158932917e-05,
"learning_rate": 2.4121294192253764e-05,
"loss": 0.0,
"num_input_tokens_seen": 3887264,
"step": 6285
},
{
"epoch": 11.212121212121213,
"grad_norm": 1.382422215101542e-05,
"learning_rate": 2.4082430502097747e-05,
"loss": 0.0,
"num_input_tokens_seen": 3890304,
"step": 6290
},
{
"epoch": 11.221033868092691,
"grad_norm": 1.405794591846643e-05,
"learning_rate": 2.4043569032222526e-05,
"loss": 0.0,
"num_input_tokens_seen": 3893248,
"step": 6295
},
{
"epoch": 11.229946524064172,
"grad_norm": 1.7820608263718896e-05,
"learning_rate": 2.4004709876662795e-05,
"loss": 0.0,
"num_input_tokens_seen": 3896544,
"step": 6300
},
{
"epoch": 11.23885918003565,
"grad_norm": 1.839667856984306e-05,
"learning_rate": 2.396585312944767e-05,
"loss": 0.0,
"num_input_tokens_seen": 3899968,
"step": 6305
},
{
"epoch": 11.24777183600713,
"grad_norm": 3.20358740282245e-05,
"learning_rate": 2.3926998884600404e-05,
"loss": 0.0,
"num_input_tokens_seen": 3902912,
"step": 6310
},
{
"epoch": 11.256684491978609,
"grad_norm": 8.05861345725134e-05,
"learning_rate": 2.3888147236138245e-05,
"loss": 0.0,
"num_input_tokens_seen": 3905664,
"step": 6315
},
{
"epoch": 11.26559714795009,
"grad_norm": 1.0621994988468941e-05,
"learning_rate": 2.3849298278072118e-05,
"loss": 0.0,
"num_input_tokens_seen": 3909248,
"step": 6320
},
{
"epoch": 11.27450980392157,
"grad_norm": 1.65787641890347e-05,
"learning_rate": 2.3810452104406444e-05,
"loss": 0.0,
"num_input_tokens_seen": 3912128,
"step": 6325
},
{
"epoch": 11.283422459893048,
"grad_norm": 2.989449058077298e-05,
"learning_rate": 2.3771608809138926e-05,
"loss": 0.0,
"num_input_tokens_seen": 3914848,
"step": 6330
},
{
"epoch": 11.292335115864528,
"grad_norm": 7.710716454312205e-05,
"learning_rate": 2.3732768486260283e-05,
"loss": 0.0,
"num_input_tokens_seen": 3918528,
"step": 6335
},
{
"epoch": 11.301247771836007,
"grad_norm": 3.157812534482218e-05,
"learning_rate": 2.3693931229754036e-05,
"loss": 0.0,
"num_input_tokens_seen": 3921536,
"step": 6340
},
{
"epoch": 11.310160427807487,
"grad_norm": 3.4598073398228735e-05,
"learning_rate": 2.365509713359632e-05,
"loss": 0.0,
"num_input_tokens_seen": 3924352,
"step": 6345
},
{
"epoch": 11.319073083778965,
"grad_norm": 1.363915089314105e-05,
"learning_rate": 2.3616266291755582e-05,
"loss": 0.0,
"num_input_tokens_seen": 3926944,
"step": 6350
},
{
"epoch": 11.327985739750446,
"grad_norm": 9.564583706378471e-06,
"learning_rate": 2.3577438798192427e-05,
"loss": 0.0,
"num_input_tokens_seen": 3930528,
"step": 6355
},
{
"epoch": 11.336898395721924,
"grad_norm": 2.2686510419589467e-05,
"learning_rate": 2.3538614746859338e-05,
"loss": 0.0,
"num_input_tokens_seen": 3934144,
"step": 6360
},
{
"epoch": 11.345811051693405,
"grad_norm": 1.1965239536948502e-05,
"learning_rate": 2.349979423170047e-05,
"loss": 0.0,
"num_input_tokens_seen": 3937856,
"step": 6365
},
{
"epoch": 11.354723707664885,
"grad_norm": 1.0834000022441614e-05,
"learning_rate": 2.346097734665143e-05,
"loss": 0.0,
"num_input_tokens_seen": 3940704,
"step": 6370
},
{
"epoch": 11.363636363636363,
"grad_norm": 1.23635754789575e-05,
"learning_rate": 2.342216418563904e-05,
"loss": 0.0,
"num_input_tokens_seen": 3943008,
"step": 6375
},
{
"epoch": 11.372549019607844,
"grad_norm": 1.816690382838715e-05,
"learning_rate": 2.3383354842581106e-05,
"loss": 0.0,
"num_input_tokens_seen": 3945664,
"step": 6380
},
{
"epoch": 11.381461675579322,
"grad_norm": 1.2368334864731878e-05,
"learning_rate": 2.3344549411386203e-05,
"loss": 0.0,
"num_input_tokens_seen": 3948256,
"step": 6385
},
{
"epoch": 11.390374331550802,
"grad_norm": 2.607596616144292e-05,
"learning_rate": 2.330574798595342e-05,
"loss": 0.0,
"num_input_tokens_seen": 3951264,
"step": 6390
},
{
"epoch": 11.39928698752228,
"grad_norm": 1.1210274351469707e-05,
"learning_rate": 2.3266950660172183e-05,
"loss": 0.0,
"num_input_tokens_seen": 3954592,
"step": 6395
},
{
"epoch": 11.408199643493761,
"grad_norm": 1.4848555110802408e-05,
"learning_rate": 2.3228157527921966e-05,
"loss": 0.0,
"num_input_tokens_seen": 3958112,
"step": 6400
},
{
"epoch": 11.417112299465241,
"grad_norm": 1.3798643522022758e-05,
"learning_rate": 2.3189368683072134e-05,
"loss": 0.0,
"num_input_tokens_seen": 3960896,
"step": 6405
},
{
"epoch": 11.42602495543672,
"grad_norm": 8.388559763261583e-06,
"learning_rate": 2.3150584219481644e-05,
"loss": 0.0,
"num_input_tokens_seen": 3963232,
"step": 6410
},
{
"epoch": 11.4349376114082,
"grad_norm": 1.9333376258146018e-05,
"learning_rate": 2.3111804230998863e-05,
"loss": 0.0,
"num_input_tokens_seen": 3966752,
"step": 6415
},
{
"epoch": 11.443850267379679,
"grad_norm": 9.065933409146965e-05,
"learning_rate": 2.3073028811461335e-05,
"loss": 0.0,
"num_input_tokens_seen": 3970080,
"step": 6420
},
{
"epoch": 11.452762923351159,
"grad_norm": 1.0522752745600883e-05,
"learning_rate": 2.303425805469554e-05,
"loss": 0.0,
"num_input_tokens_seen": 3972928,
"step": 6425
},
{
"epoch": 11.461675579322637,
"grad_norm": 1.237884225702146e-05,
"learning_rate": 2.2995492054516672e-05,
"loss": 0.0,
"num_input_tokens_seen": 3976288,
"step": 6430
},
{
"epoch": 11.470588235294118,
"grad_norm": 1.0875227417272981e-05,
"learning_rate": 2.2956730904728436e-05,
"loss": 0.0,
"num_input_tokens_seen": 3979424,
"step": 6435
},
{
"epoch": 11.479500891265598,
"grad_norm": 0.00011285066284472123,
"learning_rate": 2.2917974699122775e-05,
"loss": 0.0,
"num_input_tokens_seen": 3982592,
"step": 6440
},
{
"epoch": 11.488413547237077,
"grad_norm": 3.4363692975603044e-05,
"learning_rate": 2.287922353147969e-05,
"loss": 0.0,
"num_input_tokens_seen": 3985504,
"step": 6445
},
{
"epoch": 11.497326203208557,
"grad_norm": 1.3565711924456991e-05,
"learning_rate": 2.2840477495566976e-05,
"loss": 0.0,
"num_input_tokens_seen": 3988992,
"step": 6450
},
{
"epoch": 11.506238859180035,
"grad_norm": 9.274257536162622e-06,
"learning_rate": 2.2801736685140012e-05,
"loss": 0.0,
"num_input_tokens_seen": 3992416,
"step": 6455
},
{
"epoch": 11.515151515151516,
"grad_norm": 1.225191317644203e-05,
"learning_rate": 2.276300119394153e-05,
"loss": 0.0,
"num_input_tokens_seen": 3994688,
"step": 6460
},
{
"epoch": 11.524064171122994,
"grad_norm": 1.2006966244371142e-05,
"learning_rate": 2.272427111570141e-05,
"loss": 0.0,
"num_input_tokens_seen": 3997632,
"step": 6465
},
{
"epoch": 11.532976827094474,
"grad_norm": 1.2051388694089837e-05,
"learning_rate": 2.2685546544136422e-05,
"loss": 0.0,
"num_input_tokens_seen": 4000512,
"step": 6470
},
{
"epoch": 11.541889483065953,
"grad_norm": 0.00017697972361929715,
"learning_rate": 2.2646827572950008e-05,
"loss": 0.0,
"num_input_tokens_seen": 4003296,
"step": 6475
},
{
"epoch": 11.550802139037433,
"grad_norm": 9.811637937673368e-06,
"learning_rate": 2.2608114295832053e-05,
"loss": 0.0,
"num_input_tokens_seen": 4006752,
"step": 6480
},
{
"epoch": 11.559714795008913,
"grad_norm": 9.908816537063103e-06,
"learning_rate": 2.256940680645868e-05,
"loss": 0.0,
"num_input_tokens_seen": 4009984,
"step": 6485
},
{
"epoch": 11.568627450980392,
"grad_norm": 1.0891917554545216e-05,
"learning_rate": 2.253070519849199e-05,
"loss": 0.0,
"num_input_tokens_seen": 4012960,
"step": 6490
},
{
"epoch": 11.577540106951872,
"grad_norm": 1.98553352674935e-05,
"learning_rate": 2.2492009565579876e-05,
"loss": 0.0,
"num_input_tokens_seen": 4016640,
"step": 6495
},
{
"epoch": 11.58645276292335,
"grad_norm": 8.628303476143628e-06,
"learning_rate": 2.2453320001355753e-05,
"loss": 0.0,
"num_input_tokens_seen": 4019648,
"step": 6500
},
{
"epoch": 11.595365418894831,
"grad_norm": 9.807788956095465e-06,
"learning_rate": 2.2414636599438345e-05,
"loss": 0.0,
"num_input_tokens_seen": 4021888,
"step": 6505
},
{
"epoch": 11.60427807486631,
"grad_norm": 1.3811519238515757e-05,
"learning_rate": 2.237595945343149e-05,
"loss": 0.0,
"num_input_tokens_seen": 4024480,
"step": 6510
},
{
"epoch": 11.61319073083779,
"grad_norm": 9.580502592143603e-06,
"learning_rate": 2.2337288656923874e-05,
"loss": 0.0,
"num_input_tokens_seen": 4027552,
"step": 6515
},
{
"epoch": 11.622103386809268,
"grad_norm": 1.2243661331012845e-05,
"learning_rate": 2.22986243034888e-05,
"loss": 0.0,
"num_input_tokens_seen": 4030720,
"step": 6520
},
{
"epoch": 11.631016042780749,
"grad_norm": 1.1290548172837589e-05,
"learning_rate": 2.2259966486684034e-05,
"loss": 0.0,
"num_input_tokens_seen": 4033248,
"step": 6525
},
{
"epoch": 11.639928698752229,
"grad_norm": 1.4833147361059673e-05,
"learning_rate": 2.222131530005146e-05,
"loss": 0.0,
"num_input_tokens_seen": 4036704,
"step": 6530
},
{
"epoch": 11.648841354723707,
"grad_norm": 2.9466349587892182e-05,
"learning_rate": 2.2182670837116975e-05,
"loss": 0.0,
"num_input_tokens_seen": 4040288,
"step": 6535
},
{
"epoch": 11.657754010695188,
"grad_norm": 1.0144281077373307e-05,
"learning_rate": 2.2144033191390168e-05,
"loss": 0.0,
"num_input_tokens_seen": 4043392,
"step": 6540
},
{
"epoch": 11.666666666666666,
"grad_norm": 8.521773452230264e-06,
"learning_rate": 2.2105402456364146e-05,
"loss": 0.0,
"num_input_tokens_seen": 4046752,
"step": 6545
},
{
"epoch": 11.675579322638146,
"grad_norm": 1.819172211980913e-05,
"learning_rate": 2.2066778725515283e-05,
"loss": 0.0,
"num_input_tokens_seen": 4049600,
"step": 6550
},
{
"epoch": 11.684491978609625,
"grad_norm": 1.8302695025340654e-05,
"learning_rate": 2.202816209230303e-05,
"loss": 0.0,
"num_input_tokens_seen": 4052416,
"step": 6555
},
{
"epoch": 11.693404634581105,
"grad_norm": 1.0395393474027514e-05,
"learning_rate": 2.1989552650169655e-05,
"loss": 0.0,
"num_input_tokens_seen": 4055616,
"step": 6560
},
{
"epoch": 11.702317290552585,
"grad_norm": 1.530360896140337e-05,
"learning_rate": 2.1950950492540003e-05,
"loss": 0.0,
"num_input_tokens_seen": 4059072,
"step": 6565
},
{
"epoch": 11.711229946524064,
"grad_norm": 1.1224491572647821e-05,
"learning_rate": 2.1912355712821316e-05,
"loss": 0.0,
"num_input_tokens_seen": 4061728,
"step": 6570
},
{
"epoch": 11.720142602495544,
"grad_norm": 1.2782220437657088e-05,
"learning_rate": 2.187376840440297e-05,
"loss": 0.0,
"num_input_tokens_seen": 4064832,
"step": 6575
},
{
"epoch": 11.729055258467023,
"grad_norm": 9.216395483235829e-06,
"learning_rate": 2.1835188660656267e-05,
"loss": 0.0,
"num_input_tokens_seen": 4068160,
"step": 6580
},
{
"epoch": 11.737967914438503,
"grad_norm": 1.4968473806220572e-05,
"learning_rate": 2.179661657493422e-05,
"loss": 0.0,
"num_input_tokens_seen": 4071392,
"step": 6585
},
{
"epoch": 11.746880570409981,
"grad_norm": 1.3059238881396595e-05,
"learning_rate": 2.1758052240571285e-05,
"loss": 0.0,
"num_input_tokens_seen": 4075456,
"step": 6590
},
{
"epoch": 11.755793226381462,
"grad_norm": 2.845875314960722e-05,
"learning_rate": 2.1719495750883172e-05,
"loss": 0.0,
"num_input_tokens_seen": 4078976,
"step": 6595
},
{
"epoch": 11.764705882352942,
"grad_norm": 1.1186496521986555e-05,
"learning_rate": 2.1680947199166624e-05,
"loss": 0.0,
"num_input_tokens_seen": 4082400,
"step": 6600
},
{
"epoch": 11.77361853832442,
"grad_norm": 2.289231815666426e-05,
"learning_rate": 2.1642406678699153e-05,
"loss": 0.0,
"num_input_tokens_seen": 4085536,
"step": 6605
},
{
"epoch": 11.7825311942959,
"grad_norm": 1.3052140275249258e-05,
"learning_rate": 2.1603874282738836e-05,
"loss": 0.0,
"num_input_tokens_seen": 4088192,
"step": 6610
},
{
"epoch": 11.79144385026738,
"grad_norm": 1.2246360711287707e-05,
"learning_rate": 2.156535010452413e-05,
"loss": 0.0,
"num_input_tokens_seen": 4091104,
"step": 6615
},
{
"epoch": 11.80035650623886,
"grad_norm": 1.011964377539698e-05,
"learning_rate": 2.152683423727355e-05,
"loss": 0.0,
"num_input_tokens_seen": 4093728,
"step": 6620
},
{
"epoch": 11.809269162210338,
"grad_norm": 4.648279718821868e-05,
"learning_rate": 2.148832677418556e-05,
"loss": 0.0,
"num_input_tokens_seen": 4096640,
"step": 6625
},
{
"epoch": 11.818181818181818,
"grad_norm": 8.568744306103326e-06,
"learning_rate": 2.1449827808438233e-05,
"loss": 0.0,
"num_input_tokens_seen": 4098944,
"step": 6630
},
{
"epoch": 11.827094474153299,
"grad_norm": 1.164458353741793e-05,
"learning_rate": 2.1411337433189123e-05,
"loss": 0.0,
"num_input_tokens_seen": 4101728,
"step": 6635
},
{
"epoch": 11.836007130124777,
"grad_norm": 0.0002633916446939111,
"learning_rate": 2.1372855741574954e-05,
"loss": 0.0,
"num_input_tokens_seen": 4104864,
"step": 6640
},
{
"epoch": 11.844919786096257,
"grad_norm": 1.1167186130478512e-05,
"learning_rate": 2.133438282671149e-05,
"loss": 0.0,
"num_input_tokens_seen": 4108416,
"step": 6645
},
{
"epoch": 11.853832442067736,
"grad_norm": 8.543814146833029e-06,
"learning_rate": 2.1295918781693232e-05,
"loss": 0.0,
"num_input_tokens_seen": 4111712,
"step": 6650
},
{
"epoch": 11.862745098039216,
"grad_norm": 1.109227287088288e-05,
"learning_rate": 2.12574636995932e-05,
"loss": 0.0,
"num_input_tokens_seen": 4115040,
"step": 6655
},
{
"epoch": 11.871657754010695,
"grad_norm": 9.523738299321849e-06,
"learning_rate": 2.121901767346276e-05,
"loss": 0.0,
"num_input_tokens_seen": 4118880,
"step": 6660
},
{
"epoch": 11.880570409982175,
"grad_norm": 1.0091476724483073e-05,
"learning_rate": 2.1180580796331324e-05,
"loss": 0.0,
"num_input_tokens_seen": 4121696,
"step": 6665
},
{
"epoch": 11.889483065953653,
"grad_norm": 9.299220255343243e-05,
"learning_rate": 2.114215316120622e-05,
"loss": 0.0,
"num_input_tokens_seen": 4125152,
"step": 6670
},
{
"epoch": 11.898395721925134,
"grad_norm": 8.668677764944732e-06,
"learning_rate": 2.1103734861072368e-05,
"loss": 0.0,
"num_input_tokens_seen": 4127680,
"step": 6675
},
{
"epoch": 11.907308377896614,
"grad_norm": 1.0266234312439337e-05,
"learning_rate": 2.106532598889212e-05,
"loss": 0.0,
"num_input_tokens_seen": 4131104,
"step": 6680
},
{
"epoch": 11.916221033868093,
"grad_norm": 1.0019150977313984e-05,
"learning_rate": 2.1026926637605008e-05,
"loss": 0.0,
"num_input_tokens_seen": 4133792,
"step": 6685
},
{
"epoch": 11.925133689839573,
"grad_norm": 9.016303920361679e-06,
"learning_rate": 2.098853690012752e-05,
"loss": 0.0,
"num_input_tokens_seen": 4137472,
"step": 6690
},
{
"epoch": 11.934046345811051,
"grad_norm": 1.0192444278800394e-05,
"learning_rate": 2.095015686935289e-05,
"loss": 0.0,
"num_input_tokens_seen": 4140288,
"step": 6695
},
{
"epoch": 11.942959001782532,
"grad_norm": 1.0827254300238565e-05,
"learning_rate": 2.0911786638150872e-05,
"loss": 0.0,
"num_input_tokens_seen": 4142848,
"step": 6700
},
{
"epoch": 11.95187165775401,
"grad_norm": 8.758339390624315e-06,
"learning_rate": 2.0873426299367502e-05,
"loss": 0.0,
"num_input_tokens_seen": 4145664,
"step": 6705
},
{
"epoch": 11.96078431372549,
"grad_norm": 1.169169172499096e-05,
"learning_rate": 2.0835075945824858e-05,
"loss": 0.0,
"num_input_tokens_seen": 4148352,
"step": 6710
},
{
"epoch": 11.969696969696969,
"grad_norm": 9.24233336263569e-06,
"learning_rate": 2.0796735670320888e-05,
"loss": 0.0,
"num_input_tokens_seen": 4151584,
"step": 6715
},
{
"epoch": 11.97860962566845,
"grad_norm": 1.0173745067731943e-05,
"learning_rate": 2.0758405565629135e-05,
"loss": 0.0,
"num_input_tokens_seen": 4154592,
"step": 6720
},
{
"epoch": 11.98752228163993,
"grad_norm": 1.4830648069619201e-05,
"learning_rate": 2.0720085724498526e-05,
"loss": 0.0,
"num_input_tokens_seen": 4158752,
"step": 6725
},
{
"epoch": 11.996434937611408,
"grad_norm": 2.1176798327360302e-05,
"learning_rate": 2.0681776239653177e-05,
"loss": 0.0,
"num_input_tokens_seen": 4162048,
"step": 6730
},
{
"epoch": 12.0,
"eval_loss": 0.25058555603027344,
"eval_runtime": 4.5876,
"eval_samples_per_second": 54.277,
"eval_steps_per_second": 13.733,
"num_input_tokens_seen": 4163160,
"step": 6732
},
{
"epoch": 12.005347593582888,
"grad_norm": 5.25148534507025e-05,
"learning_rate": 2.0643477203792126e-05,
"loss": 0.0,
"num_input_tokens_seen": 4165048,
"step": 6735
},
{
"epoch": 12.014260249554367,
"grad_norm": 1.4447410649154335e-05,
"learning_rate": 2.060518870958913e-05,
"loss": 0.0,
"num_input_tokens_seen": 4168184,
"step": 6740
},
{
"epoch": 12.023172905525847,
"grad_norm": 8.24018661660375e-06,
"learning_rate": 2.056691084969244e-05,
"loss": 0.0,
"num_input_tokens_seen": 4171864,
"step": 6745
},
{
"epoch": 12.032085561497325,
"grad_norm": 0.00023547218006569892,
"learning_rate": 2.052864371672457e-05,
"loss": 0.0,
"num_input_tokens_seen": 4174392,
"step": 6750
},
{
"epoch": 12.040998217468806,
"grad_norm": 1.2732047252939083e-05,
"learning_rate": 2.0490387403282077e-05,
"loss": 0.0,
"num_input_tokens_seen": 4177816,
"step": 6755
},
{
"epoch": 12.049910873440286,
"grad_norm": 9.282196515414398e-06,
"learning_rate": 2.045214200193535e-05,
"loss": 0.0,
"num_input_tokens_seen": 4180536,
"step": 6760
},
{
"epoch": 12.058823529411764,
"grad_norm": 1.1917635674763005e-05,
"learning_rate": 2.0413907605228372e-05,
"loss": 0.0,
"num_input_tokens_seen": 4183960,
"step": 6765
},
{
"epoch": 12.067736185383245,
"grad_norm": 1.1140574315504637e-05,
"learning_rate": 2.037568430567848e-05,
"loss": 0.0,
"num_input_tokens_seen": 4186936,
"step": 6770
},
{
"epoch": 12.076648841354723,
"grad_norm": 8.070183866948355e-06,
"learning_rate": 2.033747219577618e-05,
"loss": 0.0,
"num_input_tokens_seen": 4190360,
"step": 6775
},
{
"epoch": 12.085561497326204,
"grad_norm": 9.62817375693703e-06,
"learning_rate": 2.0299271367984873e-05,
"loss": 0.0,
"num_input_tokens_seen": 4193336,
"step": 6780
},
{
"epoch": 12.094474153297682,
"grad_norm": 5.264949140837416e-05,
"learning_rate": 2.0261081914740688e-05,
"loss": 0.0,
"num_input_tokens_seen": 4196312,
"step": 6785
},
{
"epoch": 12.103386809269162,
"grad_norm": 8.345319656655192e-06,
"learning_rate": 2.022290392845223e-05,
"loss": 0.0,
"num_input_tokens_seen": 4198808,
"step": 6790
},
{
"epoch": 12.112299465240643,
"grad_norm": 1.0408018169982824e-05,
"learning_rate": 2.018473750150035e-05,
"loss": 0.0,
"num_input_tokens_seen": 4201880,
"step": 6795
},
{
"epoch": 12.121212121212121,
"grad_norm": 8.212978173105512e-06,
"learning_rate": 2.0146582726237916e-05,
"loss": 0.0,
"num_input_tokens_seen": 4204376,
"step": 6800
},
{
"epoch": 12.130124777183601,
"grad_norm": 0.00011563662701519206,
"learning_rate": 2.010843969498961e-05,
"loss": 0.0,
"num_input_tokens_seen": 4207800,
"step": 6805
},
{
"epoch": 12.13903743315508,
"grad_norm": 8.69808627612656e-06,
"learning_rate": 2.0070308500051716e-05,
"loss": 0.0,
"num_input_tokens_seen": 4210520,
"step": 6810
},
{
"epoch": 12.14795008912656,
"grad_norm": 7.624462341482285e-06,
"learning_rate": 2.0032189233691834e-05,
"loss": 0.0,
"num_input_tokens_seen": 4213944,
"step": 6815
},
{
"epoch": 12.156862745098039,
"grad_norm": 3.42975981766358e-05,
"learning_rate": 1.999408198814876e-05,
"loss": 0.0,
"num_input_tokens_seen": 4216792,
"step": 6820
},
{
"epoch": 12.165775401069519,
"grad_norm": 2.3967681045178324e-05,
"learning_rate": 1.995598685563214e-05,
"loss": 0.0,
"num_input_tokens_seen": 4219896,
"step": 6825
},
{
"epoch": 12.174688057040997,
"grad_norm": 0.00024852409842424095,
"learning_rate": 1.9917903928322356e-05,
"loss": 0.0,
"num_input_tokens_seen": 4222264,
"step": 6830
},
{
"epoch": 12.183600713012478,
"grad_norm": 1.4906848264217842e-05,
"learning_rate": 1.9879833298370238e-05,
"loss": 0.0,
"num_input_tokens_seen": 4225656,
"step": 6835
},
{
"epoch": 12.192513368983958,
"grad_norm": 9.082517863134854e-06,
"learning_rate": 1.9841775057896855e-05,
"loss": 0.0,
"num_input_tokens_seen": 4229624,
"step": 6840
},
{
"epoch": 12.201426024955436,
"grad_norm": 8.315850209328346e-06,
"learning_rate": 1.9803729298993297e-05,
"loss": 0.0,
"num_input_tokens_seen": 4231992,
"step": 6845
},
{
"epoch": 12.210338680926917,
"grad_norm": 1.1928617823286913e-05,
"learning_rate": 1.9765696113720463e-05,
"loss": 0.0,
"num_input_tokens_seen": 4235416,
"step": 6850
},
{
"epoch": 12.219251336898395,
"grad_norm": 7.387813639070373e-06,
"learning_rate": 1.9727675594108834e-05,
"loss": 0.0,
"num_input_tokens_seen": 4238968,
"step": 6855
},
{
"epoch": 12.228163992869876,
"grad_norm": 1.2016014807159081e-05,
"learning_rate": 1.968966783215822e-05,
"loss": 0.0,
"num_input_tokens_seen": 4242200,
"step": 6860
},
{
"epoch": 12.237076648841354,
"grad_norm": 0.00013246844173409045,
"learning_rate": 1.965167291983757e-05,
"loss": 0.0,
"num_input_tokens_seen": 4244888,
"step": 6865
},
{
"epoch": 12.245989304812834,
"grad_norm": 6.707129796268418e-05,
"learning_rate": 1.961369094908474e-05,
"loss": 0.0,
"num_input_tokens_seen": 4247640,
"step": 6870
},
{
"epoch": 12.254901960784313,
"grad_norm": 1.8930944861494936e-05,
"learning_rate": 1.957572201180627e-05,
"loss": 0.0,
"num_input_tokens_seen": 4250680,
"step": 6875
},
{
"epoch": 12.263814616755793,
"grad_norm": 8.173775313480292e-06,
"learning_rate": 1.953776619987718e-05,
"loss": 0.0,
"num_input_tokens_seen": 4253976,
"step": 6880
},
{
"epoch": 12.272727272727273,
"grad_norm": 1.1115719644294586e-05,
"learning_rate": 1.949982360514071e-05,
"loss": 0.0,
"num_input_tokens_seen": 4257080,
"step": 6885
},
{
"epoch": 12.281639928698752,
"grad_norm": 2.634565498738084e-05,
"learning_rate": 1.946189431940812e-05,
"loss": 0.0,
"num_input_tokens_seen": 4260920,
"step": 6890
},
{
"epoch": 12.290552584670232,
"grad_norm": 1.156184862338705e-05,
"learning_rate": 1.9423978434458458e-05,
"loss": 0.0,
"num_input_tokens_seen": 4263800,
"step": 6895
},
{
"epoch": 12.29946524064171,
"grad_norm": 7.503647339035524e-06,
"learning_rate": 1.9386076042038372e-05,
"loss": 0.0,
"num_input_tokens_seen": 4267160,
"step": 6900
},
{
"epoch": 12.308377896613191,
"grad_norm": 1.15874208859168e-05,
"learning_rate": 1.934818723386183e-05,
"loss": 0.0,
"num_input_tokens_seen": 4270552,
"step": 6905
},
{
"epoch": 12.31729055258467,
"grad_norm": 8.49560365168145e-06,
"learning_rate": 1.9310312101609964e-05,
"loss": 0.0,
"num_input_tokens_seen": 4273112,
"step": 6910
},
{
"epoch": 12.32620320855615,
"grad_norm": 1.1358582014509011e-05,
"learning_rate": 1.927245073693078e-05,
"loss": 0.0,
"num_input_tokens_seen": 4276408,
"step": 6915
},
{
"epoch": 12.33511586452763,
"grad_norm": 5.10916106577497e-05,
"learning_rate": 1.9234603231438995e-05,
"loss": 0.0,
"num_input_tokens_seen": 4279192,
"step": 6920
},
{
"epoch": 12.344028520499108,
"grad_norm": 3.123953865724616e-05,
"learning_rate": 1.919676967671578e-05,
"loss": 0.0,
"num_input_tokens_seen": 4283096,
"step": 6925
},
{
"epoch": 12.352941176470589,
"grad_norm": 1.0834906788659282e-05,
"learning_rate": 1.9158950164308543e-05,
"loss": 0.0,
"num_input_tokens_seen": 4286008,
"step": 6930
},
{
"epoch": 12.361853832442067,
"grad_norm": 1.2189595508971252e-05,
"learning_rate": 1.912114478573071e-05,
"loss": 0.0,
"num_input_tokens_seen": 4289144,
"step": 6935
},
{
"epoch": 12.370766488413548,
"grad_norm": 7.232151983771473e-05,
"learning_rate": 1.9083353632461533e-05,
"loss": 0.0,
"num_input_tokens_seen": 4292120,
"step": 6940
},
{
"epoch": 12.379679144385026,
"grad_norm": 0.0005177415441721678,
"learning_rate": 1.9045576795945826e-05,
"loss": 0.0,
"num_input_tokens_seen": 4295192,
"step": 6945
},
{
"epoch": 12.388591800356506,
"grad_norm": 9.710914309835061e-05,
"learning_rate": 1.9007814367593755e-05,
"loss": 0.0,
"num_input_tokens_seen": 4298808,
"step": 6950
},
{
"epoch": 12.397504456327987,
"grad_norm": 1.163170236395672e-05,
"learning_rate": 1.8970066438780628e-05,
"loss": 0.0,
"num_input_tokens_seen": 4302136,
"step": 6955
},
{
"epoch": 12.406417112299465,
"grad_norm": 1.068214260158129e-05,
"learning_rate": 1.8932333100846654e-05,
"loss": 0.0,
"num_input_tokens_seen": 4305144,
"step": 6960
},
{
"epoch": 12.415329768270945,
"grad_norm": 1.1760239431168884e-05,
"learning_rate": 1.8894614445096758e-05,
"loss": 0.0,
"num_input_tokens_seen": 4308824,
"step": 6965
},
{
"epoch": 12.424242424242424,
"grad_norm": 8.910944416129496e-06,
"learning_rate": 1.8856910562800342e-05,
"loss": 0.0,
"num_input_tokens_seen": 4311960,
"step": 6970
},
{
"epoch": 12.433155080213904,
"grad_norm": 9.133290404861327e-06,
"learning_rate": 1.881922154519103e-05,
"loss": 0.0,
"num_input_tokens_seen": 4314488,
"step": 6975
},
{
"epoch": 12.442067736185383,
"grad_norm": 0.00017555024533066899,
"learning_rate": 1.8781547483466503e-05,
"loss": 0.0,
"num_input_tokens_seen": 4317336,
"step": 6980
},
{
"epoch": 12.450980392156863,
"grad_norm": 2.1916104742558673e-05,
"learning_rate": 1.874388846878823e-05,
"loss": 0.0,
"num_input_tokens_seen": 4320792,
"step": 6985
},
{
"epoch": 12.459893048128341,
"grad_norm": 8.021089342946652e-06,
"learning_rate": 1.8706244592281298e-05,
"loss": 0.0,
"num_input_tokens_seen": 4323544,
"step": 6990
},
{
"epoch": 12.468805704099822,
"grad_norm": 1.0469730113982223e-05,
"learning_rate": 1.8668615945034128e-05,
"loss": 0.0,
"num_input_tokens_seen": 4326264,
"step": 6995
},
{
"epoch": 12.477718360071302,
"grad_norm": 1.1207467650820035e-05,
"learning_rate": 1.863100261809834e-05,
"loss": 0.0,
"num_input_tokens_seen": 4329656,
"step": 7000
},
{
"epoch": 12.48663101604278,
"grad_norm": 1.2151757800893392e-05,
"learning_rate": 1.8593404702488437e-05,
"loss": 0.0,
"num_input_tokens_seen": 4332760,
"step": 7005
},
{
"epoch": 12.49554367201426,
"grad_norm": 9.998749192163814e-06,
"learning_rate": 1.855582228918165e-05,
"loss": 0.0,
"num_input_tokens_seen": 4335992,
"step": 7010
},
{
"epoch": 12.50445632798574,
"grad_norm": 7.701738468313124e-06,
"learning_rate": 1.8518255469117697e-05,
"loss": 0.0,
"num_input_tokens_seen": 4339160,
"step": 7015
},
{
"epoch": 12.51336898395722,
"grad_norm": 1.6190710084629245e-05,
"learning_rate": 1.8480704333198565e-05,
"loss": 0.0,
"num_input_tokens_seen": 4342136,
"step": 7020
},
{
"epoch": 12.522281639928698,
"grad_norm": 7.62514173402451e-05,
"learning_rate": 1.8443168972288272e-05,
"loss": 0.0,
"num_input_tokens_seen": 4345368,
"step": 7025
},
{
"epoch": 12.531194295900178,
"grad_norm": 8.554538908356335e-06,
"learning_rate": 1.84056494772127e-05,
"loss": 0.0,
"num_input_tokens_seen": 4348440,
"step": 7030
},
{
"epoch": 12.540106951871659,
"grad_norm": 8.511431951774284e-05,
"learning_rate": 1.8368145938759322e-05,
"loss": 0.0,
"num_input_tokens_seen": 4351064,
"step": 7035
},
{
"epoch": 12.549019607843137,
"grad_norm": 1.336482455371879e-05,
"learning_rate": 1.8330658447676986e-05,
"loss": 0.0,
"num_input_tokens_seen": 4354392,
"step": 7040
},
{
"epoch": 12.557932263814617,
"grad_norm": 8.30404314910993e-06,
"learning_rate": 1.829318709467573e-05,
"loss": 0.0,
"num_input_tokens_seen": 4357592,
"step": 7045
},
{
"epoch": 12.566844919786096,
"grad_norm": 8.932283890317194e-06,
"learning_rate": 1.8255731970426522e-05,
"loss": 0.0,
"num_input_tokens_seen": 4361464,
"step": 7050
},
{
"epoch": 12.575757575757576,
"grad_norm": 1.1647942301351577e-05,
"learning_rate": 1.8218293165561072e-05,
"loss": 0.0,
"num_input_tokens_seen": 4365144,
"step": 7055
},
{
"epoch": 12.584670231729055,
"grad_norm": 9.729025805427227e-06,
"learning_rate": 1.818087077067162e-05,
"loss": 0.0,
"num_input_tokens_seen": 4368504,
"step": 7060
},
{
"epoch": 12.593582887700535,
"grad_norm": 9.038580174092203e-05,
"learning_rate": 1.8143464876310673e-05,
"loss": 0.0,
"num_input_tokens_seen": 4371832,
"step": 7065
},
{
"epoch": 12.602495543672013,
"grad_norm": 1.3633638445753604e-05,
"learning_rate": 1.810607557299081e-05,
"loss": 0.0,
"num_input_tokens_seen": 4375576,
"step": 7070
},
{
"epoch": 12.611408199643494,
"grad_norm": 2.659759411471896e-05,
"learning_rate": 1.8068702951184475e-05,
"loss": 0.0,
"num_input_tokens_seen": 4378552,
"step": 7075
},
{
"epoch": 12.620320855614974,
"grad_norm": 8.272366540040821e-05,
"learning_rate": 1.8031347101323748e-05,
"loss": 0.0,
"num_input_tokens_seen": 4382424,
"step": 7080
},
{
"epoch": 12.629233511586452,
"grad_norm": 6.152382411528379e-05,
"learning_rate": 1.7994008113800102e-05,
"loss": 0.0,
"num_input_tokens_seen": 4386456,
"step": 7085
},
{
"epoch": 12.638146167557933,
"grad_norm": 1.3592002687801141e-05,
"learning_rate": 1.795668607896426e-05,
"loss": 0.0,
"num_input_tokens_seen": 4388824,
"step": 7090
},
{
"epoch": 12.647058823529411,
"grad_norm": 9.91694560070755e-06,
"learning_rate": 1.7919381087125868e-05,
"loss": 0.0,
"num_input_tokens_seen": 4392312,
"step": 7095
},
{
"epoch": 12.655971479500892,
"grad_norm": 8.959758815763053e-06,
"learning_rate": 1.7882093228553355e-05,
"loss": 0.0,
"num_input_tokens_seen": 4395928,
"step": 7100
},
{
"epoch": 12.66488413547237,
"grad_norm": 1.1833170901809353e-05,
"learning_rate": 1.78448225934737e-05,
"loss": 0.0,
"num_input_tokens_seen": 4398840,
"step": 7105
},
{
"epoch": 12.67379679144385,
"grad_norm": 7.207169801404234e-06,
"learning_rate": 1.7807569272072194e-05,
"loss": 0.0,
"num_input_tokens_seen": 4402360,
"step": 7110
},
{
"epoch": 12.68270944741533,
"grad_norm": 9.115967259276658e-06,
"learning_rate": 1.7770333354492225e-05,
"loss": 0.0,
"num_input_tokens_seen": 4405240,
"step": 7115
},
{
"epoch": 12.691622103386809,
"grad_norm": 8.961714229371864e-06,
"learning_rate": 1.7733114930835104e-05,
"loss": 0.0,
"num_input_tokens_seen": 4408696,
"step": 7120
},
{
"epoch": 12.70053475935829,
"grad_norm": 9.721734386403114e-06,
"learning_rate": 1.7695914091159765e-05,
"loss": 0.0,
"num_input_tokens_seen": 4411288,
"step": 7125
},
{
"epoch": 12.709447415329768,
"grad_norm": 2.042876985797193e-05,
"learning_rate": 1.765873092548263e-05,
"loss": 0.0,
"num_input_tokens_seen": 4413784,
"step": 7130
},
{
"epoch": 12.718360071301248,
"grad_norm": 9.558229066897184e-06,
"learning_rate": 1.762156552377734e-05,
"loss": 0.0,
"num_input_tokens_seen": 4416856,
"step": 7135
},
{
"epoch": 12.727272727272727,
"grad_norm": 0.00018919534340966493,
"learning_rate": 1.7584417975974534e-05,
"loss": 0.0,
"num_input_tokens_seen": 4420152,
"step": 7140
},
{
"epoch": 12.736185383244207,
"grad_norm": 7.95549203758128e-06,
"learning_rate": 1.7547288371961675e-05,
"loss": 0.0,
"num_input_tokens_seen": 4423064,
"step": 7145
},
{
"epoch": 12.745098039215687,
"grad_norm": 9.42739916354185e-06,
"learning_rate": 1.7510176801582818e-05,
"loss": 0.0,
"num_input_tokens_seen": 4426104,
"step": 7150
},
{
"epoch": 12.754010695187166,
"grad_norm": 3.628047488746233e-05,
"learning_rate": 1.7473083354638344e-05,
"loss": 0.0,
"num_input_tokens_seen": 4429912,
"step": 7155
},
{
"epoch": 12.762923351158646,
"grad_norm": 2.2449634343502112e-05,
"learning_rate": 1.7436008120884794e-05,
"loss": 0.0,
"num_input_tokens_seen": 4433016,
"step": 7160
},
{
"epoch": 12.771836007130124,
"grad_norm": 8.338784937222954e-06,
"learning_rate": 1.739895119003465e-05,
"loss": 0.0,
"num_input_tokens_seen": 4435768,
"step": 7165
},
{
"epoch": 12.780748663101605,
"grad_norm": 7.26561802366632e-06,
"learning_rate": 1.7361912651756098e-05,
"loss": 0.0,
"num_input_tokens_seen": 4438872,
"step": 7170
},
{
"epoch": 12.789661319073083,
"grad_norm": 7.64839023759123e-06,
"learning_rate": 1.7324892595672805e-05,
"loss": 0.0,
"num_input_tokens_seen": 4442104,
"step": 7175
},
{
"epoch": 12.798573975044564,
"grad_norm": 1.171323219750775e-05,
"learning_rate": 1.728789111136375e-05,
"loss": 0.0,
"num_input_tokens_seen": 4445208,
"step": 7180
},
{
"epoch": 12.807486631016042,
"grad_norm": 1.1664715202641673e-05,
"learning_rate": 1.7250908288362944e-05,
"loss": 0.0,
"num_input_tokens_seen": 4448184,
"step": 7185
},
{
"epoch": 12.816399286987522,
"grad_norm": 1.2254623470653314e-05,
"learning_rate": 1.7213944216159242e-05,
"loss": 0.0,
"num_input_tokens_seen": 4450808,
"step": 7190
},
{
"epoch": 12.825311942959003,
"grad_norm": 1.8988508600159548e-05,
"learning_rate": 1.7176998984196146e-05,
"loss": 0.0,
"num_input_tokens_seen": 4453432,
"step": 7195
},
{
"epoch": 12.834224598930481,
"grad_norm": 2.6997422537533566e-05,
"learning_rate": 1.7140072681871554e-05,
"loss": 0.0,
"num_input_tokens_seen": 4456216,
"step": 7200
},
{
"epoch": 12.843137254901961,
"grad_norm": 2.3871938537922688e-05,
"learning_rate": 1.7103165398537553e-05,
"loss": 0.0,
"num_input_tokens_seen": 4458808,
"step": 7205
},
{
"epoch": 12.85204991087344,
"grad_norm": 1.0611884135869332e-05,
"learning_rate": 1.7066277223500245e-05,
"loss": 0.0,
"num_input_tokens_seen": 4461848,
"step": 7210
},
{
"epoch": 12.86096256684492,
"grad_norm": 5.608840729109943e-05,
"learning_rate": 1.7029408246019447e-05,
"loss": 0.0,
"num_input_tokens_seen": 4464824,
"step": 7215
},
{
"epoch": 12.869875222816399,
"grad_norm": 3.4594344469951466e-05,
"learning_rate": 1.699255855530856e-05,
"loss": 0.0,
"num_input_tokens_seen": 4468248,
"step": 7220
},
{
"epoch": 12.878787878787879,
"grad_norm": 7.11159236743697e-06,
"learning_rate": 1.6955728240534305e-05,
"loss": 0.0,
"num_input_tokens_seen": 4470744,
"step": 7225
},
{
"epoch": 12.887700534759357,
"grad_norm": 9.122089068114292e-06,
"learning_rate": 1.6918917390816497e-05,
"loss": 0.0,
"num_input_tokens_seen": 4474264,
"step": 7230
},
{
"epoch": 12.896613190730838,
"grad_norm": 1.050134414981585e-05,
"learning_rate": 1.688212609522788e-05,
"loss": 0.0,
"num_input_tokens_seen": 4477272,
"step": 7235
},
{
"epoch": 12.905525846702318,
"grad_norm": 9.323571248387452e-06,
"learning_rate": 1.684535444279387e-05,
"loss": 0.0,
"num_input_tokens_seen": 4479832,
"step": 7240
},
{
"epoch": 12.914438502673796,
"grad_norm": 1.227039047080325e-05,
"learning_rate": 1.6808602522492357e-05,
"loss": 0.0,
"num_input_tokens_seen": 4482296,
"step": 7245
},
{
"epoch": 12.923351158645277,
"grad_norm": 1.0895531886490062e-05,
"learning_rate": 1.6771870423253472e-05,
"loss": 0.0,
"num_input_tokens_seen": 4484856,
"step": 7250
},
{
"epoch": 12.932263814616755,
"grad_norm": 7.429500783473486e-06,
"learning_rate": 1.673515823395939e-05,
"loss": 0.0,
"num_input_tokens_seen": 4487896,
"step": 7255
},
{
"epoch": 12.941176470588236,
"grad_norm": 8.657786565891001e-06,
"learning_rate": 1.6698466043444123e-05,
"loss": 0.0,
"num_input_tokens_seen": 4491576,
"step": 7260
},
{
"epoch": 12.950089126559714,
"grad_norm": 1.9946444808738306e-05,
"learning_rate": 1.6661793940493263e-05,
"loss": 0.0,
"num_input_tokens_seen": 4494904,
"step": 7265
},
{
"epoch": 12.959001782531194,
"grad_norm": 7.4231547841918655e-06,
"learning_rate": 1.6625142013843825e-05,
"loss": 0.0,
"num_input_tokens_seen": 4498456,
"step": 7270
},
{
"epoch": 12.967914438502675,
"grad_norm": 6.513627340609673e-06,
"learning_rate": 1.658851035218399e-05,
"loss": 0.0,
"num_input_tokens_seen": 4501720,
"step": 7275
},
{
"epoch": 12.976827094474153,
"grad_norm": 1.3617377589980606e-05,
"learning_rate": 1.6551899044152887e-05,
"loss": 0.0,
"num_input_tokens_seen": 4504600,
"step": 7280
},
{
"epoch": 12.985739750445633,
"grad_norm": 9.484362635703292e-06,
"learning_rate": 1.651530817834043e-05,
"loss": 0.0,
"num_input_tokens_seen": 4507704,
"step": 7285
},
{
"epoch": 12.994652406417112,
"grad_norm": 7.737330633972306e-06,
"learning_rate": 1.647873784328703e-05,
"loss": 0.0,
"num_input_tokens_seen": 4510360,
"step": 7290
},
{
"epoch": 13.0,
"eval_loss": 0.25558963418006897,
"eval_runtime": 4.5868,
"eval_samples_per_second": 54.286,
"eval_steps_per_second": 13.735,
"num_input_tokens_seen": 4511312,
"step": 7293
},
{
"epoch": 13.003565062388592,
"grad_norm": 7.779332008794881e-06,
"learning_rate": 1.644218812748343e-05,
"loss": 0.0,
"num_input_tokens_seen": 4512624,
"step": 7295
},
{
"epoch": 13.01247771836007,
"grad_norm": 9.883010534394998e-06,
"learning_rate": 1.6405659119370512e-05,
"loss": 0.0,
"num_input_tokens_seen": 4515984,
"step": 7300
},
{
"epoch": 13.02139037433155,
"grad_norm": 7.949201972223818e-06,
"learning_rate": 1.6369150907339005e-05,
"loss": 0.0,
"num_input_tokens_seen": 4519216,
"step": 7305
},
{
"epoch": 13.030303030303031,
"grad_norm": 1.067373978003161e-05,
"learning_rate": 1.6332663579729352e-05,
"loss": 0.0,
"num_input_tokens_seen": 4522000,
"step": 7310
},
{
"epoch": 13.03921568627451,
"grad_norm": 7.869979526731186e-06,
"learning_rate": 1.6296197224831435e-05,
"loss": 0.0,
"num_input_tokens_seen": 4524912,
"step": 7315
},
{
"epoch": 13.04812834224599,
"grad_norm": 7.198002549557714e-06,
"learning_rate": 1.6259751930884397e-05,
"loss": 0.0,
"num_input_tokens_seen": 4528560,
"step": 7320
},
{
"epoch": 13.057040998217468,
"grad_norm": 1.3213011698098853e-05,
"learning_rate": 1.622332778607642e-05,
"loss": 0.0,
"num_input_tokens_seen": 4530928,
"step": 7325
},
{
"epoch": 13.065953654188949,
"grad_norm": 6.953484444238711e-06,
"learning_rate": 1.618692487854452e-05,
"loss": 0.0,
"num_input_tokens_seen": 4534448,
"step": 7330
},
{
"epoch": 13.074866310160427,
"grad_norm": 9.440144822292496e-06,
"learning_rate": 1.615054329637431e-05,
"loss": 0.0,
"num_input_tokens_seen": 4537392,
"step": 7335
},
{
"epoch": 13.083778966131907,
"grad_norm": 7.443520644301316e-06,
"learning_rate": 1.6114183127599807e-05,
"loss": 0.0,
"num_input_tokens_seen": 4540048,
"step": 7340
},
{
"epoch": 13.092691622103386,
"grad_norm": 1.597183472767938e-05,
"learning_rate": 1.6077844460203206e-05,
"loss": 0.0,
"num_input_tokens_seen": 4543472,
"step": 7345
},
{
"epoch": 13.101604278074866,
"grad_norm": 9.358037459605839e-06,
"learning_rate": 1.6041527382114692e-05,
"loss": 0.0,
"num_input_tokens_seen": 4546544,
"step": 7350
},
{
"epoch": 13.110516934046347,
"grad_norm": 9.500805390416645e-06,
"learning_rate": 1.600523198121218e-05,
"loss": 0.0,
"num_input_tokens_seen": 4549744,
"step": 7355
},
{
"epoch": 13.119429590017825,
"grad_norm": 2.4797285732347518e-05,
"learning_rate": 1.5968958345321178e-05,
"loss": 0.0,
"num_input_tokens_seen": 4552880,
"step": 7360
},
{
"epoch": 13.128342245989305,
"grad_norm": 9.394189874001313e-06,
"learning_rate": 1.593270656221448e-05,
"loss": 0.0,
"num_input_tokens_seen": 4556112,
"step": 7365
},
{
"epoch": 13.137254901960784,
"grad_norm": 8.06232765171444e-06,
"learning_rate": 1.5896476719612023e-05,
"loss": 0.0,
"num_input_tokens_seen": 4559792,
"step": 7370
},
{
"epoch": 13.146167557932264,
"grad_norm": 7.0690566644771025e-06,
"learning_rate": 1.586026890518066e-05,
"loss": 0.0,
"num_input_tokens_seen": 4563376,
"step": 7375
},
{
"epoch": 13.155080213903743,
"grad_norm": 8.308110409416258e-06,
"learning_rate": 1.582408320653393e-05,
"loss": 0.0,
"num_input_tokens_seen": 4566704,
"step": 7380
},
{
"epoch": 13.163992869875223,
"grad_norm": 3.808155088336207e-05,
"learning_rate": 1.578791971123185e-05,
"loss": 0.0,
"num_input_tokens_seen": 4569712,
"step": 7385
},
{
"epoch": 13.172905525846703,
"grad_norm": 9.400876479048748e-06,
"learning_rate": 1.5751778506780748e-05,
"loss": 0.0,
"num_input_tokens_seen": 4573808,
"step": 7390
},
{
"epoch": 13.181818181818182,
"grad_norm": 7.752068086119834e-06,
"learning_rate": 1.5715659680632973e-05,
"loss": 0.0,
"num_input_tokens_seen": 4577680,
"step": 7395
},
{
"epoch": 13.190730837789662,
"grad_norm": 7.223582997539779e-06,
"learning_rate": 1.567956332018674e-05,
"loss": 0.0,
"num_input_tokens_seen": 4581488,
"step": 7400
},
{
"epoch": 13.19964349376114,
"grad_norm": 2.2543108570971526e-05,
"learning_rate": 1.564348951278591e-05,
"loss": 0.0,
"num_input_tokens_seen": 4584720,
"step": 7405
},
{
"epoch": 13.20855614973262,
"grad_norm": 8.04398314357968e-06,
"learning_rate": 1.560743834571975e-05,
"loss": 0.0,
"num_input_tokens_seen": 4587792,
"step": 7410
},
{
"epoch": 13.2174688057041,
"grad_norm": 1.0653552635631058e-05,
"learning_rate": 1.5571409906222765e-05,
"loss": 0.0,
"num_input_tokens_seen": 4590672,
"step": 7415
},
{
"epoch": 13.22638146167558,
"grad_norm": 2.178288923460059e-05,
"learning_rate": 1.5535404281474457e-05,
"loss": 0.0,
"num_input_tokens_seen": 4594064,
"step": 7420
},
{
"epoch": 13.235294117647058,
"grad_norm": 6.663993644906441e-06,
"learning_rate": 1.549942155859913e-05,
"loss": 0.0,
"num_input_tokens_seen": 4598352,
"step": 7425
},
{
"epoch": 13.244206773618538,
"grad_norm": 1.463762964704074e-05,
"learning_rate": 1.546346182466566e-05,
"loss": 0.0,
"num_input_tokens_seen": 4601968,
"step": 7430
},
{
"epoch": 13.253119429590019,
"grad_norm": 1.085971416614484e-05,
"learning_rate": 1.5427525166687288e-05,
"loss": 0.0,
"num_input_tokens_seen": 4605264,
"step": 7435
},
{
"epoch": 13.262032085561497,
"grad_norm": 7.035836915747495e-06,
"learning_rate": 1.5391611671621443e-05,
"loss": 0.0,
"num_input_tokens_seen": 4608272,
"step": 7440
},
{
"epoch": 13.270944741532977,
"grad_norm": 8.502263881382532e-06,
"learning_rate": 1.535572142636948e-05,
"loss": 0.0,
"num_input_tokens_seen": 4611664,
"step": 7445
},
{
"epoch": 13.279857397504456,
"grad_norm": 0.0001319638395216316,
"learning_rate": 1.531985451777652e-05,
"loss": 0.0,
"num_input_tokens_seen": 4614224,
"step": 7450
},
{
"epoch": 13.288770053475936,
"grad_norm": 1.2678749953920487e-05,
"learning_rate": 1.5284011032631197e-05,
"loss": 0.0,
"num_input_tokens_seen": 4617424,
"step": 7455
},
{
"epoch": 13.297682709447415,
"grad_norm": 8.696750228409655e-06,
"learning_rate": 1.5248191057665462e-05,
"loss": 0.0,
"num_input_tokens_seen": 4619920,
"step": 7460
},
{
"epoch": 13.306595365418895,
"grad_norm": 1.0347956958867144e-05,
"learning_rate": 1.5212394679554403e-05,
"loss": 0.0,
"num_input_tokens_seen": 4623184,
"step": 7465
},
{
"epoch": 13.315508021390375,
"grad_norm": 6.585772098333109e-06,
"learning_rate": 1.517662198491599e-05,
"loss": 0.0,
"num_input_tokens_seen": 4625552,
"step": 7470
},
{
"epoch": 13.324420677361854,
"grad_norm": 3.188840128132142e-05,
"learning_rate": 1.5140873060310872e-05,
"loss": 0.0,
"num_input_tokens_seen": 4629200,
"step": 7475
},
{
"epoch": 13.333333333333334,
"grad_norm": 1.206223987537669e-05,
"learning_rate": 1.5105147992242222e-05,
"loss": 0.0,
"num_input_tokens_seen": 4632592,
"step": 7480
},
{
"epoch": 13.342245989304812,
"grad_norm": 6.815862889197888e-06,
"learning_rate": 1.5069446867155446e-05,
"loss": 0.0,
"num_input_tokens_seen": 4636432,
"step": 7485
},
{
"epoch": 13.351158645276293,
"grad_norm": 8.916998922359198e-06,
"learning_rate": 1.5033769771438039e-05,
"loss": 0.0,
"num_input_tokens_seen": 4639664,
"step": 7490
},
{
"epoch": 13.360071301247771,
"grad_norm": 7.717449989286251e-06,
"learning_rate": 1.4998116791419342e-05,
"loss": 0.0,
"num_input_tokens_seen": 4642832,
"step": 7495
},
{
"epoch": 13.368983957219251,
"grad_norm": 7.5525867941905744e-06,
"learning_rate": 1.4962488013370329e-05,
"loss": 0.0,
"num_input_tokens_seen": 4646096,
"step": 7500
},
{
"epoch": 13.37789661319073,
"grad_norm": 7.96997392171761e-06,
"learning_rate": 1.492688352350344e-05,
"loss": 0.0,
"num_input_tokens_seen": 4648560,
"step": 7505
},
{
"epoch": 13.38680926916221,
"grad_norm": 7.938811904750764e-06,
"learning_rate": 1.4891303407972324e-05,
"loss": 0.0,
"num_input_tokens_seen": 4651920,
"step": 7510
},
{
"epoch": 13.39572192513369,
"grad_norm": 1.78286645677872e-05,
"learning_rate": 1.4855747752871657e-05,
"loss": 0.0,
"num_input_tokens_seen": 4654896,
"step": 7515
},
{
"epoch": 13.404634581105169,
"grad_norm": 6.955673597985879e-06,
"learning_rate": 1.4820216644236925e-05,
"loss": 0.0,
"num_input_tokens_seen": 4657872,
"step": 7520
},
{
"epoch": 13.41354723707665,
"grad_norm": 9.080586096388288e-06,
"learning_rate": 1.4784710168044213e-05,
"loss": 0.0,
"num_input_tokens_seen": 4660784,
"step": 7525
},
{
"epoch": 13.422459893048128,
"grad_norm": 2.6790774427354336e-05,
"learning_rate": 1.4749228410210017e-05,
"loss": 0.0,
"num_input_tokens_seen": 4664592,
"step": 7530
},
{
"epoch": 13.431372549019608,
"grad_norm": 6.748609393980587e-06,
"learning_rate": 1.4713771456590996e-05,
"loss": 0.0,
"num_input_tokens_seen": 4667664,
"step": 7535
},
{
"epoch": 13.440285204991087,
"grad_norm": 0.00010700252460082993,
"learning_rate": 1.4678339392983822e-05,
"loss": 0.0,
"num_input_tokens_seen": 4670032,
"step": 7540
},
{
"epoch": 13.449197860962567,
"grad_norm": 1.4548013496096246e-05,
"learning_rate": 1.4642932305124918e-05,
"loss": 0.0,
"num_input_tokens_seen": 4673648,
"step": 7545
},
{
"epoch": 13.458110516934047,
"grad_norm": 8.775423339102417e-06,
"learning_rate": 1.4607550278690262e-05,
"loss": 0.0,
"num_input_tokens_seen": 4676272,
"step": 7550
},
{
"epoch": 13.467023172905526,
"grad_norm": 9.47409171203617e-06,
"learning_rate": 1.4572193399295228e-05,
"loss": 0.0,
"num_input_tokens_seen": 4679664,
"step": 7555
},
{
"epoch": 13.475935828877006,
"grad_norm": 5.426706593425479e-06,
"learning_rate": 1.453686175249429e-05,
"loss": 0.0,
"num_input_tokens_seen": 4683376,
"step": 7560
},
{
"epoch": 13.484848484848484,
"grad_norm": 7.583116530440748e-05,
"learning_rate": 1.4501555423780899e-05,
"loss": 0.0,
"num_input_tokens_seen": 4686512,
"step": 7565
},
{
"epoch": 13.493761140819965,
"grad_norm": 3.9882790588308126e-05,
"learning_rate": 1.446627449858726e-05,
"loss": 0.0,
"num_input_tokens_seen": 4689680,
"step": 7570
},
{
"epoch": 13.502673796791443,
"grad_norm": 8.056603837758303e-05,
"learning_rate": 1.4431019062284057e-05,
"loss": 0.0,
"num_input_tokens_seen": 4693136,
"step": 7575
},
{
"epoch": 13.511586452762923,
"grad_norm": 1.796651486074552e-05,
"learning_rate": 1.4395789200180344e-05,
"loss": 0.0,
"num_input_tokens_seen": 4696752,
"step": 7580
},
{
"epoch": 13.520499108734402,
"grad_norm": 6.788226073695114e-06,
"learning_rate": 1.4360584997523252e-05,
"loss": 0.0,
"num_input_tokens_seen": 4699632,
"step": 7585
},
{
"epoch": 13.529411764705882,
"grad_norm": 8.783808880252764e-06,
"learning_rate": 1.4325406539497854e-05,
"loss": 0.0,
"num_input_tokens_seen": 4703152,
"step": 7590
},
{
"epoch": 13.538324420677363,
"grad_norm": 7.359179107879754e-06,
"learning_rate": 1.4290253911226919e-05,
"loss": 0.0,
"num_input_tokens_seen": 4706288,
"step": 7595
},
{
"epoch": 13.547237076648841,
"grad_norm": 5.950364084128523e-06,
"learning_rate": 1.425512719777071e-05,
"loss": 0.0,
"num_input_tokens_seen": 4709296,
"step": 7600
},
{
"epoch": 13.556149732620321,
"grad_norm": 8.201064702006988e-06,
"learning_rate": 1.4220026484126798e-05,
"loss": 0.0,
"num_input_tokens_seen": 4712272,
"step": 7605
},
{
"epoch": 13.5650623885918,
"grad_norm": 5.314827649272047e-05,
"learning_rate": 1.4184951855229805e-05,
"loss": 0.0,
"num_input_tokens_seen": 4715248,
"step": 7610
},
{
"epoch": 13.57397504456328,
"grad_norm": 3.716797073138878e-05,
"learning_rate": 1.414990339595127e-05,
"loss": 0.0,
"num_input_tokens_seen": 4717552,
"step": 7615
},
{
"epoch": 13.582887700534759,
"grad_norm": 7.371178980974946e-06,
"learning_rate": 1.411488119109941e-05,
"loss": 0.0,
"num_input_tokens_seen": 4721488,
"step": 7620
},
{
"epoch": 13.591800356506239,
"grad_norm": 8.358735613001045e-06,
"learning_rate": 1.4079885325418868e-05,
"loss": 0.0,
"num_input_tokens_seen": 4724560,
"step": 7625
},
{
"epoch": 13.60071301247772,
"grad_norm": 1.5506338968407363e-05,
"learning_rate": 1.4044915883590626e-05,
"loss": 0.0,
"num_input_tokens_seen": 4727408,
"step": 7630
},
{
"epoch": 13.609625668449198,
"grad_norm": 2.4985572963487357e-05,
"learning_rate": 1.4009972950231653e-05,
"loss": 0.0,
"num_input_tokens_seen": 4730672,
"step": 7635
},
{
"epoch": 13.618538324420678,
"grad_norm": 7.919730705907568e-06,
"learning_rate": 1.3975056609894819e-05,
"loss": 0.0,
"num_input_tokens_seen": 4734000,
"step": 7640
},
{
"epoch": 13.627450980392156,
"grad_norm": 7.953521162562538e-06,
"learning_rate": 1.3940166947068644e-05,
"loss": 0.0,
"num_input_tokens_seen": 4737136,
"step": 7645
},
{
"epoch": 13.636363636363637,
"grad_norm": 1.991149656532798e-05,
"learning_rate": 1.3905304046177065e-05,
"loss": 0.0,
"num_input_tokens_seen": 4739888,
"step": 7650
},
{
"epoch": 13.645276292335115,
"grad_norm": 1.0619149179547094e-05,
"learning_rate": 1.3870467991579284e-05,
"loss": 0.0,
"num_input_tokens_seen": 4742864,
"step": 7655
},
{
"epoch": 13.654188948306595,
"grad_norm": 6.993436727498192e-06,
"learning_rate": 1.3835658867569568e-05,
"loss": 0.0,
"num_input_tokens_seen": 4745808,
"step": 7660
},
{
"epoch": 13.663101604278076,
"grad_norm": 0.0001521021913504228,
"learning_rate": 1.3800876758376963e-05,
"loss": 0.0,
"num_input_tokens_seen": 4748592,
"step": 7665
},
{
"epoch": 13.672014260249554,
"grad_norm": 0.00011914921196876094,
"learning_rate": 1.3766121748165194e-05,
"loss": 0.0,
"num_input_tokens_seen": 4751184,
"step": 7670
},
{
"epoch": 13.680926916221035,
"grad_norm": 1.2205597158754244e-05,
"learning_rate": 1.3731393921032376e-05,
"loss": 0.0,
"num_input_tokens_seen": 4754288,
"step": 7675
},
{
"epoch": 13.689839572192513,
"grad_norm": 5.635377419821452e-06,
"learning_rate": 1.3696693361010871e-05,
"loss": 0.0,
"num_input_tokens_seen": 4757424,
"step": 7680
},
{
"epoch": 13.698752228163993,
"grad_norm": 1.6696209058864042e-05,
"learning_rate": 1.3662020152067061e-05,
"loss": 0.0,
"num_input_tokens_seen": 4761232,
"step": 7685
},
{
"epoch": 13.707664884135472,
"grad_norm": 8.318459549627732e-06,
"learning_rate": 1.362737437810114e-05,
"loss": 0.0,
"num_input_tokens_seen": 4764368,
"step": 7690
},
{
"epoch": 13.716577540106952,
"grad_norm": 4.31857151852455e-05,
"learning_rate": 1.3592756122946926e-05,
"loss": 0.0,
"num_input_tokens_seen": 4767472,
"step": 7695
},
{
"epoch": 13.72549019607843,
"grad_norm": 5.811690243717749e-06,
"learning_rate": 1.3558165470371623e-05,
"loss": 0.0,
"num_input_tokens_seen": 4770256,
"step": 7700
},
{
"epoch": 13.73440285204991,
"grad_norm": 1.465106015530182e-05,
"learning_rate": 1.3523602504075666e-05,
"loss": 0.0,
"num_input_tokens_seen": 4773296,
"step": 7705
},
{
"epoch": 13.743315508021391,
"grad_norm": 7.546026154159335e-06,
"learning_rate": 1.348906730769251e-05,
"loss": 0.0,
"num_input_tokens_seen": 4776400,
"step": 7710
},
{
"epoch": 13.75222816399287,
"grad_norm": 6.846257292636437e-06,
"learning_rate": 1.3454559964788355e-05,
"loss": 0.0,
"num_input_tokens_seen": 4780176,
"step": 7715
},
{
"epoch": 13.76114081996435,
"grad_norm": 6.956892320886254e-06,
"learning_rate": 1.3420080558862092e-05,
"loss": 0.0,
"num_input_tokens_seen": 4783536,
"step": 7720
},
{
"epoch": 13.770053475935828,
"grad_norm": 6.646995643677656e-06,
"learning_rate": 1.3385629173344927e-05,
"loss": 0.0,
"num_input_tokens_seen": 4786832,
"step": 7725
},
{
"epoch": 13.778966131907309,
"grad_norm": 8.099759725155309e-06,
"learning_rate": 1.335120589160031e-05,
"loss": 0.0,
"num_input_tokens_seen": 4789904,
"step": 7730
},
{
"epoch": 13.787878787878787,
"grad_norm": 8.979537597042508e-06,
"learning_rate": 1.3316810796923693e-05,
"loss": 0.0,
"num_input_tokens_seen": 4792848,
"step": 7735
},
{
"epoch": 13.796791443850267,
"grad_norm": 1.531767520646099e-05,
"learning_rate": 1.328244397254228e-05,
"loss": 0.0,
"num_input_tokens_seen": 4795888,
"step": 7740
},
{
"epoch": 13.805704099821746,
"grad_norm": 6.935872079338878e-06,
"learning_rate": 1.3248105501614897e-05,
"loss": 0.0,
"num_input_tokens_seen": 4798704,
"step": 7745
},
{
"epoch": 13.814616755793226,
"grad_norm": 7.608053238072898e-06,
"learning_rate": 1.3213795467231788e-05,
"loss": 0.0,
"num_input_tokens_seen": 4801744,
"step": 7750
},
{
"epoch": 13.823529411764707,
"grad_norm": 6.829433004895691e-06,
"learning_rate": 1.3179513952414332e-05,
"loss": 0.0,
"num_input_tokens_seen": 4804592,
"step": 7755
},
{
"epoch": 13.832442067736185,
"grad_norm": 1.5230633835017215e-05,
"learning_rate": 1.3145261040114944e-05,
"loss": 0.0,
"num_input_tokens_seen": 4807248,
"step": 7760
},
{
"epoch": 13.841354723707665,
"grad_norm": 6.265021966100903e-06,
"learning_rate": 1.3111036813216792e-05,
"loss": 0.0,
"num_input_tokens_seen": 4809776,
"step": 7765
},
{
"epoch": 13.850267379679144,
"grad_norm": 6.747121460648486e-06,
"learning_rate": 1.3076841354533658e-05,
"loss": 0.0,
"num_input_tokens_seen": 4813104,
"step": 7770
},
{
"epoch": 13.859180035650624,
"grad_norm": 7.226227353385184e-06,
"learning_rate": 1.3042674746809707e-05,
"loss": 0.0,
"num_input_tokens_seen": 4815984,
"step": 7775
},
{
"epoch": 13.868092691622103,
"grad_norm": 7.566853128082585e-06,
"learning_rate": 1.300853707271929e-05,
"loss": 0.0,
"num_input_tokens_seen": 4819312,
"step": 7780
},
{
"epoch": 13.877005347593583,
"grad_norm": 9.527244401397184e-06,
"learning_rate": 1.2974428414866752e-05,
"loss": 0.0,
"num_input_tokens_seen": 4822000,
"step": 7785
},
{
"epoch": 13.885918003565063,
"grad_norm": 7.2643560997676104e-06,
"learning_rate": 1.2940348855786208e-05,
"loss": 0.0,
"num_input_tokens_seen": 4825296,
"step": 7790
},
{
"epoch": 13.894830659536542,
"grad_norm": 6.629866220464464e-06,
"learning_rate": 1.2906298477941378e-05,
"loss": 0.0,
"num_input_tokens_seen": 4828848,
"step": 7795
},
{
"epoch": 13.903743315508022,
"grad_norm": 3.560426921467297e-05,
"learning_rate": 1.287227736372538e-05,
"loss": 0.0,
"num_input_tokens_seen": 4831696,
"step": 7800
},
{
"epoch": 13.9126559714795,
"grad_norm": 1.4213070244295523e-05,
"learning_rate": 1.2838285595460478e-05,
"loss": 0.0,
"num_input_tokens_seen": 4835152,
"step": 7805
},
{
"epoch": 13.92156862745098,
"grad_norm": 7.702679795329459e-06,
"learning_rate": 1.2804323255397996e-05,
"loss": 0.0,
"num_input_tokens_seen": 4837712,
"step": 7810
},
{
"epoch": 13.93048128342246,
"grad_norm": 7.759183972666506e-06,
"learning_rate": 1.2770390425717982e-05,
"loss": 0.0,
"num_input_tokens_seen": 4841264,
"step": 7815
},
{
"epoch": 13.93939393939394,
"grad_norm": 8.99410770216491e-06,
"learning_rate": 1.2736487188529112e-05,
"loss": 0.0,
"num_input_tokens_seen": 4844272,
"step": 7820
},
{
"epoch": 13.94830659536542,
"grad_norm": 1.2729369700537063e-05,
"learning_rate": 1.2702613625868459e-05,
"loss": 0.0,
"num_input_tokens_seen": 4847056,
"step": 7825
},
{
"epoch": 13.957219251336898,
"grad_norm": 7.109924354153918e-06,
"learning_rate": 1.2668769819701259e-05,
"loss": 0.0,
"num_input_tokens_seen": 4849232,
"step": 7830
},
{
"epoch": 13.966131907308379,
"grad_norm": 6.8033637035114225e-06,
"learning_rate": 1.2634955851920789e-05,
"loss": 0.0,
"num_input_tokens_seen": 4852272,
"step": 7835
},
{
"epoch": 13.975044563279857,
"grad_norm": 6.467252205766272e-06,
"learning_rate": 1.2601171804348085e-05,
"loss": 0.0,
"num_input_tokens_seen": 4855472,
"step": 7840
},
{
"epoch": 13.983957219251337,
"grad_norm": 6.207174010341987e-06,
"learning_rate": 1.2567417758731815e-05,
"loss": 0.0,
"num_input_tokens_seen": 4858064,
"step": 7845
},
{
"epoch": 13.992869875222816,
"grad_norm": 1.287949999095872e-05,
"learning_rate": 1.2533693796748041e-05,
"loss": 0.0,
"num_input_tokens_seen": 4860080,
"step": 7850
},
{
"epoch": 14.0,
"eval_loss": 0.2638137936592102,
"eval_runtime": 4.5897,
"eval_samples_per_second": 54.252,
"eval_steps_per_second": 13.726,
"num_input_tokens_seen": 4861864,
"step": 7854
},
{
"epoch": 14.001782531194296,
"grad_norm": 7.107454621291254e-06,
"learning_rate": 1.2500000000000006e-05,
"loss": 0.0,
"num_input_tokens_seen": 4862344,
"step": 7855
},
{
"epoch": 14.010695187165775,
"grad_norm": 0.00010611310426611453,
"learning_rate": 1.2466336450017981e-05,
"loss": 0.0,
"num_input_tokens_seen": 4865608,
"step": 7860
},
{
"epoch": 14.019607843137255,
"grad_norm": 1.0504892998142168e-05,
"learning_rate": 1.243270322825908e-05,
"loss": 0.0,
"num_input_tokens_seen": 4868296,
"step": 7865
},
{
"epoch": 14.028520499108735,
"grad_norm": 1.3590751223091502e-05,
"learning_rate": 1.2399100416106964e-05,
"loss": 0.0,
"num_input_tokens_seen": 4871432,
"step": 7870
},
{
"epoch": 14.037433155080214,
"grad_norm": 7.295515842997702e-06,
"learning_rate": 1.236552809487177e-05,
"loss": 0.0,
"num_input_tokens_seen": 4874472,
"step": 7875
},
{
"epoch": 14.046345811051694,
"grad_norm": 9.071957720152568e-06,
"learning_rate": 1.2331986345789806e-05,
"loss": 0.0,
"num_input_tokens_seen": 4877736,
"step": 7880
},
{
"epoch": 14.055258467023172,
"grad_norm": 1.2070072443748359e-05,
"learning_rate": 1.2298475250023439e-05,
"loss": 0.0,
"num_input_tokens_seen": 4881064,
"step": 7885
},
{
"epoch": 14.064171122994653,
"grad_norm": 5.530808721232461e-06,
"learning_rate": 1.2264994888660846e-05,
"loss": 0.0,
"num_input_tokens_seen": 4883816,
"step": 7890
},
{
"epoch": 14.073083778966131,
"grad_norm": 6.749080512236105e-06,
"learning_rate": 1.2231545342715847e-05,
"loss": 0.0,
"num_input_tokens_seen": 4886920,
"step": 7895
},
{
"epoch": 14.081996434937611,
"grad_norm": 6.1865684983786196e-06,
"learning_rate": 1.2198126693127693e-05,
"loss": 0.0,
"num_input_tokens_seen": 4890120,
"step": 7900
},
{
"epoch": 14.090909090909092,
"grad_norm": 7.120606824173592e-06,
"learning_rate": 1.2164739020760854e-05,
"loss": 0.0,
"num_input_tokens_seen": 4893000,
"step": 7905
},
{
"epoch": 14.09982174688057,
"grad_norm": 6.07286438025767e-06,
"learning_rate": 1.2131382406404864e-05,
"loss": 0.0,
"num_input_tokens_seen": 4896904,
"step": 7910
},
{
"epoch": 14.10873440285205,
"grad_norm": 6.876835414004745e-06,
"learning_rate": 1.2098056930774116e-05,
"loss": 0.0,
"num_input_tokens_seen": 4900328,
"step": 7915
},
{
"epoch": 14.117647058823529,
"grad_norm": 0.00021053240925539285,
"learning_rate": 1.2064762674507607e-05,
"loss": 0.0,
"num_input_tokens_seen": 4903592,
"step": 7920
},
{
"epoch": 14.12655971479501,
"grad_norm": 6.274164206843125e-06,
"learning_rate": 1.2031499718168859e-05,
"loss": 0.0,
"num_input_tokens_seen": 4906312,
"step": 7925
},
{
"epoch": 14.135472370766488,
"grad_norm": 8.332051038451027e-06,
"learning_rate": 1.1998268142245598e-05,
"loss": 0.0,
"num_input_tokens_seen": 4909352,
"step": 7930
},
{
"epoch": 14.144385026737968,
"grad_norm": 1.3166509233997203e-05,
"learning_rate": 1.1965068027149643e-05,
"loss": 0.0,
"num_input_tokens_seen": 4911688,
"step": 7935
},
{
"epoch": 14.153297682709447,
"grad_norm": 6.029657015460543e-05,
"learning_rate": 1.1931899453216697e-05,
"loss": 0.0,
"num_input_tokens_seen": 4914376,
"step": 7940
},
{
"epoch": 14.162210338680927,
"grad_norm": 9.119195965467952e-06,
"learning_rate": 1.189876250070611e-05,
"loss": 0.0,
"num_input_tokens_seen": 4917352,
"step": 7945
},
{
"epoch": 14.171122994652407,
"grad_norm": 2.1003546862630174e-05,
"learning_rate": 1.1865657249800738e-05,
"loss": 0.0,
"num_input_tokens_seen": 4921096,
"step": 7950
},
{
"epoch": 14.180035650623886,
"grad_norm": 6.048298928362783e-06,
"learning_rate": 1.1832583780606726e-05,
"loss": 0.0,
"num_input_tokens_seen": 4924040,
"step": 7955
},
{
"epoch": 14.188948306595366,
"grad_norm": 6.6788943513529375e-06,
"learning_rate": 1.1799542173153314e-05,
"loss": 0.0,
"num_input_tokens_seen": 4927208,
"step": 7960
},
{
"epoch": 14.197860962566844,
"grad_norm": 6.807384579587961e-06,
"learning_rate": 1.176653250739265e-05,
"loss": 0.0,
"num_input_tokens_seen": 4929928,
"step": 7965
},
{
"epoch": 14.206773618538325,
"grad_norm": 6.744728580088122e-06,
"learning_rate": 1.173355486319957e-05,
"loss": 0.0,
"num_input_tokens_seen": 4932328,
"step": 7970
},
{
"epoch": 14.215686274509803,
"grad_norm": 9.249639333575033e-06,
"learning_rate": 1.1700609320371448e-05,
"loss": 0.0,
"num_input_tokens_seen": 4935272,
"step": 7975
},
{
"epoch": 14.224598930481283,
"grad_norm": 7.17133025318617e-06,
"learning_rate": 1.1667695958627974e-05,
"loss": 0.0,
"num_input_tokens_seen": 4938504,
"step": 7980
},
{
"epoch": 14.233511586452764,
"grad_norm": 6.4170390032813884e-06,
"learning_rate": 1.1634814857610968e-05,
"loss": 0.0,
"num_input_tokens_seen": 4942472,
"step": 7985
},
{
"epoch": 14.242424242424242,
"grad_norm": 8.718321623746306e-06,
"learning_rate": 1.1601966096884198e-05,
"loss": 0.0,
"num_input_tokens_seen": 4945800,
"step": 7990
},
{
"epoch": 14.251336898395722,
"grad_norm": 6.421875241358066e-06,
"learning_rate": 1.1569149755933147e-05,
"loss": 0.0,
"num_input_tokens_seen": 4948840,
"step": 7995
},
{
"epoch": 14.260249554367201,
"grad_norm": 1.2632759535335936e-05,
"learning_rate": 1.1536365914164882e-05,
"loss": 0.0,
"num_input_tokens_seen": 4952520,
"step": 8000
},
{
"epoch": 14.269162210338681,
"grad_norm": 7.619096777489176e-06,
"learning_rate": 1.1503614650907821e-05,
"loss": 0.0,
"num_input_tokens_seen": 4955880,
"step": 8005
},
{
"epoch": 14.27807486631016,
"grad_norm": 1.3213113561505452e-05,
"learning_rate": 1.1470896045411525e-05,
"loss": 0.0,
"num_input_tokens_seen": 4958760,
"step": 8010
},
{
"epoch": 14.28698752228164,
"grad_norm": 7.3548239925003145e-06,
"learning_rate": 1.1438210176846592e-05,
"loss": 0.0,
"num_input_tokens_seen": 4962088,
"step": 8015
},
{
"epoch": 14.29590017825312,
"grad_norm": 7.781805834383704e-06,
"learning_rate": 1.1405557124304337e-05,
"loss": 0.0,
"num_input_tokens_seen": 4965416,
"step": 8020
},
{
"epoch": 14.304812834224599,
"grad_norm": 5.628372491628397e-06,
"learning_rate": 1.137293696679671e-05,
"loss": 0.0,
"num_input_tokens_seen": 4968168,
"step": 8025
},
{
"epoch": 14.313725490196079,
"grad_norm": 4.754801011586096e-06,
"learning_rate": 1.134034978325606e-05,
"loss": 0.0,
"num_input_tokens_seen": 4972200,
"step": 8030
},
{
"epoch": 14.322638146167558,
"grad_norm": 2.6338497264077887e-05,
"learning_rate": 1.1307795652534923e-05,
"loss": 0.0,
"num_input_tokens_seen": 4974696,
"step": 8035
},
{
"epoch": 14.331550802139038,
"grad_norm": 6.248131830943748e-05,
"learning_rate": 1.1275274653405885e-05,
"loss": 0.0,
"num_input_tokens_seen": 4977576,
"step": 8040
},
{
"epoch": 14.340463458110516,
"grad_norm": 6.767934792151209e-06,
"learning_rate": 1.1242786864561344e-05,
"loss": 0.0,
"num_input_tokens_seen": 4980712,
"step": 8045
},
{
"epoch": 14.349376114081997,
"grad_norm": 2.475587280059699e-05,
"learning_rate": 1.121033236461335e-05,
"loss": 0.0,
"num_input_tokens_seen": 4983976,
"step": 8050
},
{
"epoch": 14.358288770053475,
"grad_norm": 7.764555448375177e-06,
"learning_rate": 1.1177911232093403e-05,
"loss": 0.0,
"num_input_tokens_seen": 4987304,
"step": 8055
},
{
"epoch": 14.367201426024955,
"grad_norm": 1.8427512259222567e-05,
"learning_rate": 1.1145523545452235e-05,
"loss": 0.0,
"num_input_tokens_seen": 4990504,
"step": 8060
},
{
"epoch": 14.376114081996436,
"grad_norm": 6.077263151382795e-06,
"learning_rate": 1.1113169383059682e-05,
"loss": 0.0,
"num_input_tokens_seen": 4993032,
"step": 8065
},
{
"epoch": 14.385026737967914,
"grad_norm": 7.569628451165045e-06,
"learning_rate": 1.1080848823204445e-05,
"loss": 0.0,
"num_input_tokens_seen": 4995976,
"step": 8070
},
{
"epoch": 14.393939393939394,
"grad_norm": 8.515498848282732e-06,
"learning_rate": 1.1048561944093914e-05,
"loss": 0.0,
"num_input_tokens_seen": 4999624,
"step": 8075
},
{
"epoch": 14.402852049910873,
"grad_norm": 7.329627806029748e-06,
"learning_rate": 1.1016308823853996e-05,
"loss": 0.0,
"num_input_tokens_seen": 5001928,
"step": 8080
},
{
"epoch": 14.411764705882353,
"grad_norm": 0.00010971013398375362,
"learning_rate": 1.0984089540528878e-05,
"loss": 0.0,
"num_input_tokens_seen": 5005832,
"step": 8085
},
{
"epoch": 14.420677361853832,
"grad_norm": 5.8523328334558755e-06,
"learning_rate": 1.0951904172080896e-05,
"loss": 0.0,
"num_input_tokens_seen": 5008712,
"step": 8090
},
{
"epoch": 14.429590017825312,
"grad_norm": 5.586658062384231e-06,
"learning_rate": 1.0919752796390328e-05,
"loss": 0.0,
"num_input_tokens_seen": 5011720,
"step": 8095
},
{
"epoch": 14.43850267379679,
"grad_norm": 6.4028436099761166e-06,
"learning_rate": 1.0887635491255158e-05,
"loss": 0.0,
"num_input_tokens_seen": 5015240,
"step": 8100
},
{
"epoch": 14.44741532976827,
"grad_norm": 7.400541562674334e-06,
"learning_rate": 1.085555233439099e-05,
"loss": 0.0,
"num_input_tokens_seen": 5019080,
"step": 8105
},
{
"epoch": 14.456327985739751,
"grad_norm": 7.24855044609285e-06,
"learning_rate": 1.0823503403430734e-05,
"loss": 0.0,
"num_input_tokens_seen": 5023304,
"step": 8110
},
{
"epoch": 14.46524064171123,
"grad_norm": 5.581604000326479e-06,
"learning_rate": 1.0791488775924522e-05,
"loss": 0.0,
"num_input_tokens_seen": 5026728,
"step": 8115
},
{
"epoch": 14.47415329768271,
"grad_norm": 2.331694668100681e-05,
"learning_rate": 1.0759508529339479e-05,
"loss": 0.0,
"num_input_tokens_seen": 5029800,
"step": 8120
},
{
"epoch": 14.483065953654188,
"grad_norm": 1.1446799362602178e-05,
"learning_rate": 1.072756274105951e-05,
"loss": 0.0,
"num_input_tokens_seen": 5032776,
"step": 8125
},
{
"epoch": 14.491978609625669,
"grad_norm": 6.0211009440536145e-06,
"learning_rate": 1.0695651488385166e-05,
"loss": 0.0,
"num_input_tokens_seen": 5036264,
"step": 8130
},
{
"epoch": 14.500891265597147,
"grad_norm": 5.3520207075052895e-06,
"learning_rate": 1.0663774848533425e-05,
"loss": 0.0,
"num_input_tokens_seen": 5038824,
"step": 8135
},
{
"epoch": 14.509803921568627,
"grad_norm": 1.092964976123767e-05,
"learning_rate": 1.0631932898637503e-05,
"loss": 0.0,
"num_input_tokens_seen": 5041928,
"step": 8140
},
{
"epoch": 14.518716577540108,
"grad_norm": 0.00015407356841024011,
"learning_rate": 1.0600125715746695e-05,
"loss": 0.0,
"num_input_tokens_seen": 5045000,
"step": 8145
},
{
"epoch": 14.527629233511586,
"grad_norm": 5.3865824156673625e-05,
"learning_rate": 1.0568353376826134e-05,
"loss": 0.0,
"num_input_tokens_seen": 5047848,
"step": 8150
},
{
"epoch": 14.536541889483066,
"grad_norm": 5.537413926504087e-06,
"learning_rate": 1.0536615958756669e-05,
"loss": 0.0,
"num_input_tokens_seen": 5051400,
"step": 8155
},
{
"epoch": 14.545454545454545,
"grad_norm": 6.4448718148923945e-06,
"learning_rate": 1.050491353833464e-05,
"loss": 0.0,
"num_input_tokens_seen": 5054184,
"step": 8160
},
{
"epoch": 14.554367201426025,
"grad_norm": 5.720929948438425e-06,
"learning_rate": 1.0473246192271704e-05,
"loss": 0.0,
"num_input_tokens_seen": 5057064,
"step": 8165
},
{
"epoch": 14.563279857397504,
"grad_norm": 5.878225692868e-06,
"learning_rate": 1.0441613997194654e-05,
"loss": 0.0,
"num_input_tokens_seen": 5060232,
"step": 8170
},
{
"epoch": 14.572192513368984,
"grad_norm": 7.7972963481443e-06,
"learning_rate": 1.0410017029645203e-05,
"loss": 0.0,
"num_input_tokens_seen": 5062856,
"step": 8175
},
{
"epoch": 14.581105169340464,
"grad_norm": 0.0001732358941808343,
"learning_rate": 1.0378455366079843e-05,
"loss": 0.0,
"num_input_tokens_seen": 5065960,
"step": 8180
},
{
"epoch": 14.590017825311943,
"grad_norm": 1.0606602700136136e-05,
"learning_rate": 1.0346929082869641e-05,
"loss": 0.0,
"num_input_tokens_seen": 5068808,
"step": 8185
},
{
"epoch": 14.598930481283423,
"grad_norm": 9.733703336678445e-06,
"learning_rate": 1.0315438256300025e-05,
"loss": 0.0,
"num_input_tokens_seen": 5071848,
"step": 8190
},
{
"epoch": 14.607843137254902,
"grad_norm": 4.52782041975297e-05,
"learning_rate": 1.0283982962570682e-05,
"loss": 0.0,
"num_input_tokens_seen": 5075464,
"step": 8195
},
{
"epoch": 14.616755793226382,
"grad_norm": 6.3549205151502974e-06,
"learning_rate": 1.0252563277795254e-05,
"loss": 0.0,
"num_input_tokens_seen": 5077864,
"step": 8200
},
{
"epoch": 14.62566844919786,
"grad_norm": 5.957157554803416e-06,
"learning_rate": 1.0221179278001264e-05,
"loss": 0.0,
"num_input_tokens_seen": 5080264,
"step": 8205
},
{
"epoch": 14.63458110516934,
"grad_norm": 8.766522114456166e-06,
"learning_rate": 1.0189831039129876e-05,
"loss": 0.0,
"num_input_tokens_seen": 5083816,
"step": 8210
},
{
"epoch": 14.643493761140821,
"grad_norm": 7.778281542414334e-06,
"learning_rate": 1.0158518637035704e-05,
"loss": 0.0,
"num_input_tokens_seen": 5086984,
"step": 8215
},
{
"epoch": 14.6524064171123,
"grad_norm": 2.461150143062696e-05,
"learning_rate": 1.0127242147486668e-05,
"loss": 0.0,
"num_input_tokens_seen": 5089672,
"step": 8220
},
{
"epoch": 14.66131907308378,
"grad_norm": 1.6552379747736268e-05,
"learning_rate": 1.0096001646163777e-05,
"loss": 0.0,
"num_input_tokens_seen": 5092872,
"step": 8225
},
{
"epoch": 14.670231729055258,
"grad_norm": 4.991354580852203e-05,
"learning_rate": 1.0064797208660967e-05,
"loss": 0.0,
"num_input_tokens_seen": 5095752,
"step": 8230
},
{
"epoch": 14.679144385026738,
"grad_norm": 7.476746759493835e-06,
"learning_rate": 1.003362891048491e-05,
"loss": 0.0,
"num_input_tokens_seen": 5098728,
"step": 8235
},
{
"epoch": 14.688057040998217,
"grad_norm": 5.878478077647742e-06,
"learning_rate": 1.0002496827054805e-05,
"loss": 0.0,
"num_input_tokens_seen": 5101992,
"step": 8240
},
{
"epoch": 14.696969696969697,
"grad_norm": 3.9587441278854385e-05,
"learning_rate": 9.971401033702249e-06,
"loss": 0.0,
"num_input_tokens_seen": 5105288,
"step": 8245
},
{
"epoch": 14.705882352941176,
"grad_norm": 7.687859579164069e-06,
"learning_rate": 9.94034160567102e-06,
"loss": 0.0,
"num_input_tokens_seen": 5108776,
"step": 8250
},
{
"epoch": 14.714795008912656,
"grad_norm": 5.535346190299606e-06,
"learning_rate": 9.909318618116892e-06,
"loss": 0.0,
"num_input_tokens_seen": 5112168,
"step": 8255
},
{
"epoch": 14.723707664884136,
"grad_norm": 1.7241862224182114e-05,
"learning_rate": 9.87833214610748e-06,
"loss": 0.0,
"num_input_tokens_seen": 5115464,
"step": 8260
},
{
"epoch": 14.732620320855615,
"grad_norm": 6.819671853008913e-06,
"learning_rate": 9.847382264622016e-06,
"loss": 0.0,
"num_input_tokens_seen": 5118216,
"step": 8265
},
{
"epoch": 14.741532976827095,
"grad_norm": 5.388214503909694e-06,
"learning_rate": 9.81646904855121e-06,
"loss": 0.0,
"num_input_tokens_seen": 5120552,
"step": 8270
},
{
"epoch": 14.750445632798574,
"grad_norm": 1.2260394214536063e-05,
"learning_rate": 9.785592572697058e-06,
"loss": 0.0,
"num_input_tokens_seen": 5124008,
"step": 8275
},
{
"epoch": 14.759358288770054,
"grad_norm": 5.739399512094678e-06,
"learning_rate": 9.754752911772616e-06,
"loss": 0.0,
"num_input_tokens_seen": 5127624,
"step": 8280
},
{
"epoch": 14.768270944741532,
"grad_norm": 6.3006323216541205e-06,
"learning_rate": 9.723950140401922e-06,
"loss": 0.0,
"num_input_tokens_seen": 5130504,
"step": 8285
},
{
"epoch": 14.777183600713013,
"grad_norm": 5.688974397344282e-06,
"learning_rate": 9.693184333119681e-06,
"loss": 0.0,
"num_input_tokens_seen": 5133288,
"step": 8290
},
{
"epoch": 14.786096256684491,
"grad_norm": 5.98324504608172e-06,
"learning_rate": 9.662455564371203e-06,
"loss": 0.0,
"num_input_tokens_seen": 5136648,
"step": 8295
},
{
"epoch": 14.795008912655971,
"grad_norm": 5.788949692941969e-06,
"learning_rate": 9.631763908512164e-06,
"loss": 0.0,
"num_input_tokens_seen": 5139336,
"step": 8300
},
{
"epoch": 14.803921568627452,
"grad_norm": 5.5504601732536685e-06,
"learning_rate": 9.601109439808412e-06,
"loss": 0.0,
"num_input_tokens_seen": 5141832,
"step": 8305
},
{
"epoch": 14.81283422459893,
"grad_norm": 5.692533250112319e-06,
"learning_rate": 9.57049223243584e-06,
"loss": 0.0,
"num_input_tokens_seen": 5144712,
"step": 8310
},
{
"epoch": 14.82174688057041,
"grad_norm": 6.114484222052852e-06,
"learning_rate": 9.53991236048017e-06,
"loss": 0.0,
"num_input_tokens_seen": 5148232,
"step": 8315
},
{
"epoch": 14.830659536541889,
"grad_norm": 7.69770394981606e-06,
"learning_rate": 9.509369897936779e-06,
"loss": 0.0,
"num_input_tokens_seen": 5151080,
"step": 8320
},
{
"epoch": 14.83957219251337,
"grad_norm": 1.0251422281726263e-05,
"learning_rate": 9.478864918710534e-06,
"loss": 0.0,
"num_input_tokens_seen": 5153864,
"step": 8325
},
{
"epoch": 14.848484848484848,
"grad_norm": 2.408368163742125e-05,
"learning_rate": 9.448397496615574e-06,
"loss": 0.0,
"num_input_tokens_seen": 5157736,
"step": 8330
},
{
"epoch": 14.857397504456328,
"grad_norm": 6.29090072834515e-06,
"learning_rate": 9.417967705375186e-06,
"loss": 0.0,
"num_input_tokens_seen": 5160360,
"step": 8335
},
{
"epoch": 14.866310160427808,
"grad_norm": 8.154268471116666e-06,
"learning_rate": 9.387575618621597e-06,
"loss": 0.0,
"num_input_tokens_seen": 5163656,
"step": 8340
},
{
"epoch": 14.875222816399287,
"grad_norm": 8.839137080940418e-06,
"learning_rate": 9.357221309895786e-06,
"loss": 0.0,
"num_input_tokens_seen": 5167336,
"step": 8345
},
{
"epoch": 14.884135472370767,
"grad_norm": 7.0688442974642385e-06,
"learning_rate": 9.326904852647344e-06,
"loss": 0.0,
"num_input_tokens_seen": 5170408,
"step": 8350
},
{
"epoch": 14.893048128342246,
"grad_norm": 9.05954402696807e-06,
"learning_rate": 9.29662632023423e-06,
"loss": 0.0,
"num_input_tokens_seen": 5173192,
"step": 8355
},
{
"epoch": 14.901960784313726,
"grad_norm": 5.333501576387789e-06,
"learning_rate": 9.266385785922672e-06,
"loss": 0.0,
"num_input_tokens_seen": 5177160,
"step": 8360
},
{
"epoch": 14.910873440285204,
"grad_norm": 6.870981906104134e-06,
"learning_rate": 9.236183322886945e-06,
"loss": 0.0,
"num_input_tokens_seen": 5181064,
"step": 8365
},
{
"epoch": 14.919786096256685,
"grad_norm": 5.239366601017537e-06,
"learning_rate": 9.206019004209171e-06,
"loss": 0.0,
"num_input_tokens_seen": 5184264,
"step": 8370
},
{
"epoch": 14.928698752228165,
"grad_norm": 1.6661275367368944e-05,
"learning_rate": 9.175892902879232e-06,
"loss": 0.0,
"num_input_tokens_seen": 5187048,
"step": 8375
},
{
"epoch": 14.937611408199643,
"grad_norm": 8.331326171173714e-06,
"learning_rate": 9.145805091794473e-06,
"loss": 0.0,
"num_input_tokens_seen": 5189928,
"step": 8380
},
{
"epoch": 14.946524064171124,
"grad_norm": 8.945467925514095e-06,
"learning_rate": 9.115755643759621e-06,
"loss": 0.0,
"num_input_tokens_seen": 5192392,
"step": 8385
},
{
"epoch": 14.955436720142602,
"grad_norm": 1.2952526049048174e-05,
"learning_rate": 9.085744631486573e-06,
"loss": 0.0,
"num_input_tokens_seen": 5195720,
"step": 8390
},
{
"epoch": 14.964349376114082,
"grad_norm": 5.643107670039171e-06,
"learning_rate": 9.0557721275942e-06,
"loss": 0.0,
"num_input_tokens_seen": 5198536,
"step": 8395
},
{
"epoch": 14.973262032085561,
"grad_norm": 6.707574357278645e-06,
"learning_rate": 9.025838204608215e-06,
"loss": 0.0,
"num_input_tokens_seen": 5201288,
"step": 8400
},
{
"epoch": 14.982174688057041,
"grad_norm": 1.741484084050171e-05,
"learning_rate": 8.995942934960964e-06,
"loss": 0.0,
"num_input_tokens_seen": 5204520,
"step": 8405
},
{
"epoch": 14.99108734402852,
"grad_norm": 6.384299922501668e-06,
"learning_rate": 8.966086390991266e-06,
"loss": 0.0,
"num_input_tokens_seen": 5207688,
"step": 8410
},
{
"epoch": 15.0,
"grad_norm": 6.7942432906420436e-06,
"learning_rate": 8.936268644944246e-06,
"loss": 0.0,
"num_input_tokens_seen": 5210208,
"step": 8415
},
{
"epoch": 15.0,
"eval_loss": 0.26385584473609924,
"eval_runtime": 4.5846,
"eval_samples_per_second": 54.313,
"eval_steps_per_second": 13.742,
"num_input_tokens_seen": 5210208,
"step": 8415
},
{
"epoch": 15.00891265597148,
"grad_norm": 7.896741408330854e-06,
"learning_rate": 8.906489768971113e-06,
"loss": 0.0,
"num_input_tokens_seen": 5213600,
"step": 8420
},
{
"epoch": 15.017825311942959,
"grad_norm": 4.688414264819585e-05,
"learning_rate": 8.876749835129053e-06,
"loss": 0.0,
"num_input_tokens_seen": 5216704,
"step": 8425
},
{
"epoch": 15.026737967914439,
"grad_norm": 5.828882876812713e-06,
"learning_rate": 8.847048915381011e-06,
"loss": 0.0,
"num_input_tokens_seen": 5219456,
"step": 8430
},
{
"epoch": 15.035650623885918,
"grad_norm": 5.714399321732344e-06,
"learning_rate": 8.817387081595532e-06,
"loss": 0.0,
"num_input_tokens_seen": 5223008,
"step": 8435
},
{
"epoch": 15.044563279857398,
"grad_norm": 0.00012872931256424636,
"learning_rate": 8.787764405546584e-06,
"loss": 0.0,
"num_input_tokens_seen": 5226496,
"step": 8440
},
{
"epoch": 15.053475935828876,
"grad_norm": 8.361628715647385e-06,
"learning_rate": 8.758180958913362e-06,
"loss": 0.0,
"num_input_tokens_seen": 5229440,
"step": 8445
},
{
"epoch": 15.062388591800357,
"grad_norm": 5.478733783093048e-06,
"learning_rate": 8.728636813280163e-06,
"loss": 0.0,
"num_input_tokens_seen": 5232704,
"step": 8450
},
{
"epoch": 15.071301247771837,
"grad_norm": 1.3515757927962113e-05,
"learning_rate": 8.699132040136186e-06,
"loss": 0.0,
"num_input_tokens_seen": 5235168,
"step": 8455
},
{
"epoch": 15.080213903743315,
"grad_norm": 4.47141701442888e-06,
"learning_rate": 8.669666710875318e-06,
"loss": 0.0,
"num_input_tokens_seen": 5238656,
"step": 8460
},
{
"epoch": 15.089126559714796,
"grad_norm": 1.1040413482987788e-05,
"learning_rate": 8.640240896796074e-06,
"loss": 0.0,
"num_input_tokens_seen": 5242240,
"step": 8465
},
{
"epoch": 15.098039215686274,
"grad_norm": 8.942615750129335e-06,
"learning_rate": 8.61085466910128e-06,
"loss": 0.0,
"num_input_tokens_seen": 5245568,
"step": 8470
},
{
"epoch": 15.106951871657754,
"grad_norm": 1.3127410966262687e-05,
"learning_rate": 8.581508098898011e-06,
"loss": 0.0,
"num_input_tokens_seen": 5248384,
"step": 8475
},
{
"epoch": 15.115864527629233,
"grad_norm": 5.364198386814678e-06,
"learning_rate": 8.552201257197389e-06,
"loss": 0.0,
"num_input_tokens_seen": 5251744,
"step": 8480
},
{
"epoch": 15.124777183600713,
"grad_norm": 4.796311259269714e-06,
"learning_rate": 8.522934214914372e-06,
"loss": 0.0,
"num_input_tokens_seen": 5254592,
"step": 8485
},
{
"epoch": 15.133689839572192,
"grad_norm": 7.995293344720267e-06,
"learning_rate": 8.493707042867633e-06,
"loss": 0.0,
"num_input_tokens_seen": 5257312,
"step": 8490
},
{
"epoch": 15.142602495543672,
"grad_norm": 1.153885659732623e-05,
"learning_rate": 8.464519811779367e-06,
"loss": 0.0,
"num_input_tokens_seen": 5260320,
"step": 8495
},
{
"epoch": 15.151515151515152,
"grad_norm": 5.2959899221605156e-06,
"learning_rate": 8.43537259227513e-06,
"loss": 0.0,
"num_input_tokens_seen": 5263744,
"step": 8500
},
{
"epoch": 15.16042780748663,
"grad_norm": 5.650913590216078e-05,
"learning_rate": 8.406265454883649e-06,
"loss": 0.0,
"num_input_tokens_seen": 5267328,
"step": 8505
},
{
"epoch": 15.169340463458111,
"grad_norm": 8.47045703267213e-06,
"learning_rate": 8.37719847003666e-06,
"loss": 0.0,
"num_input_tokens_seen": 5270688,
"step": 8510
},
{
"epoch": 15.17825311942959,
"grad_norm": 6.704518909828039e-06,
"learning_rate": 8.348171708068747e-06,
"loss": 0.0,
"num_input_tokens_seen": 5273728,
"step": 8515
},
{
"epoch": 15.18716577540107,
"grad_norm": 5.129308192408644e-06,
"learning_rate": 8.31918523921717e-06,
"loss": 0.0,
"num_input_tokens_seen": 5276288,
"step": 8520
},
{
"epoch": 15.196078431372548,
"grad_norm": 4.8411061470687855e-06,
"learning_rate": 8.29023913362168e-06,
"loss": 0.0,
"num_input_tokens_seen": 5279136,
"step": 8525
},
{
"epoch": 15.204991087344029,
"grad_norm": 8.145854735630564e-06,
"learning_rate": 8.261333461324372e-06,
"loss": 0.0,
"num_input_tokens_seen": 5281376,
"step": 8530
},
{
"epoch": 15.213903743315509,
"grad_norm": 9.624214726500213e-06,
"learning_rate": 8.23246829226948e-06,
"loss": 0.0,
"num_input_tokens_seen": 5283584,
"step": 8535
},
{
"epoch": 15.222816399286987,
"grad_norm": 0.00012442510342225432,
"learning_rate": 8.203643696303255e-06,
"loss": 0.0,
"num_input_tokens_seen": 5286976,
"step": 8540
},
{
"epoch": 15.231729055258468,
"grad_norm": 4.386363798403181e-06,
"learning_rate": 8.174859743173765e-06,
"loss": 0.0,
"num_input_tokens_seen": 5290208,
"step": 8545
},
{
"epoch": 15.240641711229946,
"grad_norm": 9.525587302050553e-06,
"learning_rate": 8.146116502530709e-06,
"loss": 0.0,
"num_input_tokens_seen": 5293280,
"step": 8550
},
{
"epoch": 15.249554367201426,
"grad_norm": 7.593696409458062e-06,
"learning_rate": 8.117414043925322e-06,
"loss": 0.0,
"num_input_tokens_seen": 5296288,
"step": 8555
},
{
"epoch": 15.258467023172905,
"grad_norm": 1.2609904842975084e-05,
"learning_rate": 8.08875243681011e-06,
"loss": 0.0,
"num_input_tokens_seen": 5299488,
"step": 8560
},
{
"epoch": 15.267379679144385,
"grad_norm": 1.8711343727773055e-05,
"learning_rate": 8.06013175053875e-06,
"loss": 0.0,
"num_input_tokens_seen": 5302880,
"step": 8565
},
{
"epoch": 15.276292335115864,
"grad_norm": 5.2512627917167265e-06,
"learning_rate": 8.031552054365903e-06,
"loss": 0.0,
"num_input_tokens_seen": 5305792,
"step": 8570
},
{
"epoch": 15.285204991087344,
"grad_norm": 5.599639735009987e-06,
"learning_rate": 8.003013417447034e-06,
"loss": 0.0,
"num_input_tokens_seen": 5308384,
"step": 8575
},
{
"epoch": 15.294117647058824,
"grad_norm": 2.4499842766090296e-05,
"learning_rate": 7.974515908838259e-06,
"loss": 0.0,
"num_input_tokens_seen": 5310816,
"step": 8580
},
{
"epoch": 15.303030303030303,
"grad_norm": 5.107368451717775e-06,
"learning_rate": 7.94605959749618e-06,
"loss": 0.0,
"num_input_tokens_seen": 5313440,
"step": 8585
},
{
"epoch": 15.311942959001783,
"grad_norm": 4.877272203884786e-06,
"learning_rate": 7.917644552277708e-06,
"loss": 0.0,
"num_input_tokens_seen": 5316416,
"step": 8590
},
{
"epoch": 15.320855614973262,
"grad_norm": 6.525916433020029e-06,
"learning_rate": 7.889270841939908e-06,
"loss": 0.0,
"num_input_tokens_seen": 5318752,
"step": 8595
},
{
"epoch": 15.329768270944742,
"grad_norm": 6.321250566543313e-06,
"learning_rate": 7.860938535139805e-06,
"loss": 0.0,
"num_input_tokens_seen": 5322112,
"step": 8600
},
{
"epoch": 15.33868092691622,
"grad_norm": 5.054580469732173e-06,
"learning_rate": 7.832647700434257e-06,
"loss": 0.0,
"num_input_tokens_seen": 5325120,
"step": 8605
},
{
"epoch": 15.3475935828877,
"grad_norm": 5.5520063142466825e-06,
"learning_rate": 7.804398406279764e-06,
"loss": 0.0,
"num_input_tokens_seen": 5328960,
"step": 8610
},
{
"epoch": 15.35650623885918,
"grad_norm": 5.534681804419961e-06,
"learning_rate": 7.776190721032312e-06,
"loss": 0.0,
"num_input_tokens_seen": 5332064,
"step": 8615
},
{
"epoch": 15.36541889483066,
"grad_norm": 6.235063210624503e-06,
"learning_rate": 7.748024712947205e-06,
"loss": 0.0,
"num_input_tokens_seen": 5336000,
"step": 8620
},
{
"epoch": 15.37433155080214,
"grad_norm": 6.73488329994143e-06,
"learning_rate": 7.719900450178882e-06,
"loss": 0.0,
"num_input_tokens_seen": 5338944,
"step": 8625
},
{
"epoch": 15.383244206773618,
"grad_norm": 7.579846715088934e-05,
"learning_rate": 7.691818000780796e-06,
"loss": 0.0,
"num_input_tokens_seen": 5342176,
"step": 8630
},
{
"epoch": 15.392156862745098,
"grad_norm": 9.930758096743375e-06,
"learning_rate": 7.663777432705207e-06,
"loss": 0.0,
"num_input_tokens_seen": 5345376,
"step": 8635
},
{
"epoch": 15.401069518716577,
"grad_norm": 5.044625140726566e-06,
"learning_rate": 7.635778813803018e-06,
"loss": 0.0,
"num_input_tokens_seen": 5348544,
"step": 8640
},
{
"epoch": 15.409982174688057,
"grad_norm": 5.212654741626466e-06,
"learning_rate": 7.607822211823673e-06,
"loss": 0.0,
"num_input_tokens_seen": 5351904,
"step": 8645
},
{
"epoch": 15.418894830659536,
"grad_norm": 5.073397915111855e-06,
"learning_rate": 7.579907694414892e-06,
"loss": 0.0,
"num_input_tokens_seen": 5354560,
"step": 8650
},
{
"epoch": 15.427807486631016,
"grad_norm": 5.894227797398344e-06,
"learning_rate": 7.552035329122592e-06,
"loss": 0.0,
"num_input_tokens_seen": 5358368,
"step": 8655
},
{
"epoch": 15.436720142602496,
"grad_norm": 7.546342658315552e-06,
"learning_rate": 7.524205183390698e-06,
"loss": 0.0,
"num_input_tokens_seen": 5361856,
"step": 8660
},
{
"epoch": 15.445632798573975,
"grad_norm": 5.361746843846049e-06,
"learning_rate": 7.49641732456094e-06,
"loss": 0.0,
"num_input_tokens_seen": 5364704,
"step": 8665
},
{
"epoch": 15.454545454545455,
"grad_norm": 4.600000920618186e-06,
"learning_rate": 7.46867181987276e-06,
"loss": 0.0,
"num_input_tokens_seen": 5367200,
"step": 8670
},
{
"epoch": 15.463458110516934,
"grad_norm": 5.97817688685609e-06,
"learning_rate": 7.4409687364631e-06,
"loss": 0.0,
"num_input_tokens_seen": 5369920,
"step": 8675
},
{
"epoch": 15.472370766488414,
"grad_norm": 5.486062946147285e-06,
"learning_rate": 7.413308141366254e-06,
"loss": 0.0,
"num_input_tokens_seen": 5373728,
"step": 8680
},
{
"epoch": 15.481283422459892,
"grad_norm": 6.220386694621993e-06,
"learning_rate": 7.385690101513715e-06,
"loss": 0.0,
"num_input_tokens_seen": 5376736,
"step": 8685
},
{
"epoch": 15.490196078431373,
"grad_norm": 1.1094513865828048e-05,
"learning_rate": 7.358114683733977e-06,
"loss": 0.0,
"num_input_tokens_seen": 5379616,
"step": 8690
},
{
"epoch": 15.499108734402853,
"grad_norm": 6.627035872952547e-06,
"learning_rate": 7.330581954752427e-06,
"loss": 0.0,
"num_input_tokens_seen": 5383104,
"step": 8695
},
{
"epoch": 15.508021390374331,
"grad_norm": 3.4417731512803584e-05,
"learning_rate": 7.303091981191141e-06,
"loss": 0.0,
"num_input_tokens_seen": 5386688,
"step": 8700
},
{
"epoch": 15.516934046345812,
"grad_norm": 4.375489425001433e-06,
"learning_rate": 7.275644829568748e-06,
"loss": 0.0,
"num_input_tokens_seen": 5389760,
"step": 8705
},
{
"epoch": 15.52584670231729,
"grad_norm": 5.563930699281627e-06,
"learning_rate": 7.248240566300257e-06,
"loss": 0.0,
"num_input_tokens_seen": 5392928,
"step": 8710
},
{
"epoch": 15.53475935828877,
"grad_norm": 9.789599062060006e-06,
"learning_rate": 7.220879257696883e-06,
"loss": 0.0,
"num_input_tokens_seen": 5395104,
"step": 8715
},
{
"epoch": 15.543672014260249,
"grad_norm": 5.930739462201018e-06,
"learning_rate": 7.1935609699659236e-06,
"loss": 0.0,
"num_input_tokens_seen": 5398336,
"step": 8720
},
{
"epoch": 15.55258467023173,
"grad_norm": 8.74727174959844e-06,
"learning_rate": 7.166285769210568e-06,
"loss": 0.0,
"num_input_tokens_seen": 5401696,
"step": 8725
},
{
"epoch": 15.56149732620321,
"grad_norm": 4.707947937276913e-06,
"learning_rate": 7.139053721429728e-06,
"loss": 0.0,
"num_input_tokens_seen": 5404736,
"step": 8730
},
{
"epoch": 15.570409982174688,
"grad_norm": 7.5294760790711734e-06,
"learning_rate": 7.111864892517944e-06,
"loss": 0.0,
"num_input_tokens_seen": 5407904,
"step": 8735
},
{
"epoch": 15.579322638146168,
"grad_norm": 1.3512942132365424e-05,
"learning_rate": 7.0847193482651234e-06,
"loss": 0.0,
"num_input_tokens_seen": 5411296,
"step": 8740
},
{
"epoch": 15.588235294117647,
"grad_norm": 4.419291144586168e-06,
"learning_rate": 7.057617154356469e-06,
"loss": 0.0,
"num_input_tokens_seen": 5414592,
"step": 8745
},
{
"epoch": 15.597147950089127,
"grad_norm": 5.332386990630766e-06,
"learning_rate": 7.030558376372284e-06,
"loss": 0.0,
"num_input_tokens_seen": 5417504,
"step": 8750
},
{
"epoch": 15.606060606060606,
"grad_norm": 1.6384390619350597e-05,
"learning_rate": 7.0035430797877974e-06,
"loss": 0.0,
"num_input_tokens_seen": 5420256,
"step": 8755
},
{
"epoch": 15.614973262032086,
"grad_norm": 7.927361366455443e-06,
"learning_rate": 6.976571329973044e-06,
"loss": 0.0,
"num_input_tokens_seen": 5423264,
"step": 8760
},
{
"epoch": 15.623885918003564,
"grad_norm": 4.236151653458364e-05,
"learning_rate": 6.949643192192678e-06,
"loss": 0.0,
"num_input_tokens_seen": 5426528,
"step": 8765
},
{
"epoch": 15.632798573975045,
"grad_norm": 5.545013664232101e-06,
"learning_rate": 6.922758731605833e-06,
"loss": 0.0,
"num_input_tokens_seen": 5429600,
"step": 8770
},
{
"epoch": 15.641711229946525,
"grad_norm": 7.829821697669104e-05,
"learning_rate": 6.8959180132659475e-06,
"loss": 0.0,
"num_input_tokens_seen": 5432704,
"step": 8775
},
{
"epoch": 15.650623885918003,
"grad_norm": 2.0672809114330448e-05,
"learning_rate": 6.869121102120607e-06,
"loss": 0.0,
"num_input_tokens_seen": 5435968,
"step": 8780
},
{
"epoch": 15.659536541889484,
"grad_norm": 6.136841875559185e-06,
"learning_rate": 6.842368063011406e-06,
"loss": 0.0,
"num_input_tokens_seen": 5438496,
"step": 8785
},
{
"epoch": 15.668449197860962,
"grad_norm": 6.172531357151456e-06,
"learning_rate": 6.815658960673782e-06,
"loss": 0.0,
"num_input_tokens_seen": 5441536,
"step": 8790
},
{
"epoch": 15.677361853832442,
"grad_norm": 8.231077117670793e-06,
"learning_rate": 6.7889938597368505e-06,
"loss": 0.0,
"num_input_tokens_seen": 5444672,
"step": 8795
},
{
"epoch": 15.686274509803921,
"grad_norm": 7.102705694705946e-06,
"learning_rate": 6.762372824723265e-06,
"loss": 0.0,
"num_input_tokens_seen": 5447712,
"step": 8800
},
{
"epoch": 15.695187165775401,
"grad_norm": 5.1239135245850775e-06,
"learning_rate": 6.735795920049026e-06,
"loss": 0.0,
"num_input_tokens_seen": 5450720,
"step": 8805
},
{
"epoch": 15.70409982174688,
"grad_norm": 6.167297215142753e-06,
"learning_rate": 6.709263210023375e-06,
"loss": 0.0,
"num_input_tokens_seen": 5453760,
"step": 8810
},
{
"epoch": 15.71301247771836,
"grad_norm": 6.101850885897875e-06,
"learning_rate": 6.682774758848618e-06,
"loss": 0.0,
"num_input_tokens_seen": 5456256,
"step": 8815
},
{
"epoch": 15.72192513368984,
"grad_norm": 0.00011210032243980095,
"learning_rate": 6.656330630619925e-06,
"loss": 0.0,
"num_input_tokens_seen": 5459488,
"step": 8820
},
{
"epoch": 15.730837789661319,
"grad_norm": 5.509222319233231e-06,
"learning_rate": 6.629930889325278e-06,
"loss": 0.0,
"num_input_tokens_seen": 5462848,
"step": 8825
},
{
"epoch": 15.739750445632799,
"grad_norm": 5.822642378916498e-06,
"learning_rate": 6.603575598845196e-06,
"loss": 0.0,
"num_input_tokens_seen": 5465760,
"step": 8830
},
{
"epoch": 15.748663101604278,
"grad_norm": 5.75665217183996e-06,
"learning_rate": 6.577264822952675e-06,
"loss": 0.0,
"num_input_tokens_seen": 5469440,
"step": 8835
},
{
"epoch": 15.757575757575758,
"grad_norm": 3.2189564080908895e-05,
"learning_rate": 6.550998625312987e-06,
"loss": 0.0,
"num_input_tokens_seen": 5472064,
"step": 8840
},
{
"epoch": 15.766488413547236,
"grad_norm": 4.856089435634203e-05,
"learning_rate": 6.524777069483526e-06,
"loss": 0.0,
"num_input_tokens_seen": 5474784,
"step": 8845
},
{
"epoch": 15.775401069518717,
"grad_norm": 4.835144864046015e-06,
"learning_rate": 6.498600218913678e-06,
"loss": 0.0,
"num_input_tokens_seen": 5478048,
"step": 8850
},
{
"epoch": 15.784313725490197,
"grad_norm": 1.2231737855472602e-05,
"learning_rate": 6.472468136944648e-06,
"loss": 0.0,
"num_input_tokens_seen": 5481056,
"step": 8855
},
{
"epoch": 15.793226381461675,
"grad_norm": 5.054258053860394e-06,
"learning_rate": 6.446380886809314e-06,
"loss": 0.0,
"num_input_tokens_seen": 5484320,
"step": 8860
},
{
"epoch": 15.802139037433156,
"grad_norm": 5.58068131795153e-06,
"learning_rate": 6.420338531632078e-06,
"loss": 0.0,
"num_input_tokens_seen": 5487424,
"step": 8865
},
{
"epoch": 15.811051693404634,
"grad_norm": 6.2250314840639476e-06,
"learning_rate": 6.394341134428691e-06,
"loss": 0.0,
"num_input_tokens_seen": 5491456,
"step": 8870
},
{
"epoch": 15.819964349376114,
"grad_norm": 0.00017160887364298105,
"learning_rate": 6.368388758106133e-06,
"loss": 0.0,
"num_input_tokens_seen": 5493920,
"step": 8875
},
{
"epoch": 15.828877005347593,
"grad_norm": 4.926492692902684e-06,
"learning_rate": 6.342481465462441e-06,
"loss": 0.0,
"num_input_tokens_seen": 5496992,
"step": 8880
},
{
"epoch": 15.837789661319073,
"grad_norm": 6.23631831331295e-06,
"learning_rate": 6.316619319186562e-06,
"loss": 0.0,
"num_input_tokens_seen": 5500800,
"step": 8885
},
{
"epoch": 15.846702317290553,
"grad_norm": 7.705433745286427e-06,
"learning_rate": 6.290802381858202e-06,
"loss": 0.0,
"num_input_tokens_seen": 5504064,
"step": 8890
},
{
"epoch": 15.855614973262032,
"grad_norm": 2.909653449023608e-05,
"learning_rate": 6.265030715947659e-06,
"loss": 0.0,
"num_input_tokens_seen": 5507264,
"step": 8895
},
{
"epoch": 15.864527629233512,
"grad_norm": 5.82176107855048e-06,
"learning_rate": 6.2393043838157055e-06,
"loss": 0.0,
"num_input_tokens_seen": 5510176,
"step": 8900
},
{
"epoch": 15.87344028520499,
"grad_norm": 9.333507478004321e-06,
"learning_rate": 6.213623447713413e-06,
"loss": 0.0,
"num_input_tokens_seen": 5512672,
"step": 8905
},
{
"epoch": 15.882352941176471,
"grad_norm": 6.4683308664825745e-06,
"learning_rate": 6.1879879697819806e-06,
"loss": 0.0,
"num_input_tokens_seen": 5515936,
"step": 8910
},
{
"epoch": 15.89126559714795,
"grad_norm": 1.6260813936241902e-05,
"learning_rate": 6.162398012052664e-06,
"loss": 0.0,
"num_input_tokens_seen": 5518784,
"step": 8915
},
{
"epoch": 15.90017825311943,
"grad_norm": 2.4525292246835306e-05,
"learning_rate": 6.136853636446518e-06,
"loss": 0.0,
"num_input_tokens_seen": 5521888,
"step": 8920
},
{
"epoch": 15.909090909090908,
"grad_norm": 0.0001769509253790602,
"learning_rate": 6.11135490477433e-06,
"loss": 0.0,
"num_input_tokens_seen": 5524960,
"step": 8925
},
{
"epoch": 15.918003565062389,
"grad_norm": 5.651103492709808e-06,
"learning_rate": 6.085901878736442e-06,
"loss": 0.0,
"num_input_tokens_seen": 5528128,
"step": 8930
},
{
"epoch": 15.926916221033869,
"grad_norm": 5.721294201066485e-06,
"learning_rate": 6.060494619922575e-06,
"loss": 0.0,
"num_input_tokens_seen": 5530720,
"step": 8935
},
{
"epoch": 15.935828877005347,
"grad_norm": 4.7713056119391695e-06,
"learning_rate": 6.035133189811729e-06,
"loss": 0.0,
"num_input_tokens_seen": 5533888,
"step": 8940
},
{
"epoch": 15.944741532976828,
"grad_norm": 4.651044037018437e-06,
"learning_rate": 6.009817649772007e-06,
"loss": 0.0,
"num_input_tokens_seen": 5536768,
"step": 8945
},
{
"epoch": 15.953654188948306,
"grad_norm": 1.05441022242303e-05,
"learning_rate": 5.9845480610604635e-06,
"loss": 0.0,
"num_input_tokens_seen": 5539968,
"step": 8950
},
{
"epoch": 15.962566844919786,
"grad_norm": 1.0455855772306677e-05,
"learning_rate": 5.959324484822973e-06,
"loss": 0.0,
"num_input_tokens_seen": 5542752,
"step": 8955
},
{
"epoch": 15.971479500891265,
"grad_norm": 3.935343102057232e-06,
"learning_rate": 5.9341469820940495e-06,
"loss": 0.0,
"num_input_tokens_seen": 5545600,
"step": 8960
},
{
"epoch": 15.980392156862745,
"grad_norm": 7.889810149208643e-06,
"learning_rate": 5.909015613796745e-06,
"loss": 0.0,
"num_input_tokens_seen": 5549280,
"step": 8965
},
{
"epoch": 15.989304812834224,
"grad_norm": 3.854881015286082e-06,
"learning_rate": 5.883930440742466e-06,
"loss": 0.0,
"num_input_tokens_seen": 5553088,
"step": 8970
},
{
"epoch": 15.998217468805704,
"grad_norm": 9.84231064649066e-06,
"learning_rate": 5.858891523630844e-06,
"loss": 0.0,
"num_input_tokens_seen": 5555680,
"step": 8975
},
{
"epoch": 16.0,
"eval_loss": 0.2678108513355255,
"eval_runtime": 4.5876,
"eval_samples_per_second": 54.277,
"eval_steps_per_second": 13.733,
"num_input_tokens_seen": 5555776,
"step": 8976
},
{
"epoch": 16.007130124777184,
"grad_norm": 9.22131039260421e-06,
"learning_rate": 5.833898923049586e-06,
"loss": 0.0,
"num_input_tokens_seen": 5558240,
"step": 8980
},
{
"epoch": 16.016042780748663,
"grad_norm": 5.392693765315926e-06,
"learning_rate": 5.8089526994743014e-06,
"loss": 0.0,
"num_input_tokens_seen": 5560992,
"step": 8985
},
{
"epoch": 16.02495543672014,
"grad_norm": 1.1318848919472657e-05,
"learning_rate": 5.784052913268412e-06,
"loss": 0.0,
"num_input_tokens_seen": 5563872,
"step": 8990
},
{
"epoch": 16.033868092691623,
"grad_norm": 9.131179467658512e-06,
"learning_rate": 5.759199624682962e-06,
"loss": 0.0,
"num_input_tokens_seen": 5566400,
"step": 8995
},
{
"epoch": 16.0427807486631,
"grad_norm": 4.842033831664594e-06,
"learning_rate": 5.734392893856458e-06,
"loss": 0.0,
"num_input_tokens_seen": 5569024,
"step": 9000
},
{
"epoch": 16.05169340463458,
"grad_norm": 6.703641702188179e-06,
"learning_rate": 5.709632780814797e-06,
"loss": 0.0,
"num_input_tokens_seen": 5571648,
"step": 9005
},
{
"epoch": 16.060606060606062,
"grad_norm": 3.561503763194196e-05,
"learning_rate": 5.684919345471029e-06,
"loss": 0.0,
"num_input_tokens_seen": 5574656,
"step": 9010
},
{
"epoch": 16.06951871657754,
"grad_norm": 4.695339157478884e-06,
"learning_rate": 5.660252647625278e-06,
"loss": 0.0,
"num_input_tokens_seen": 5577536,
"step": 9015
},
{
"epoch": 16.07843137254902,
"grad_norm": 1.17061272248975e-05,
"learning_rate": 5.635632746964581e-06,
"loss": 0.0,
"num_input_tokens_seen": 5579968,
"step": 9020
},
{
"epoch": 16.087344028520498,
"grad_norm": 7.923229532025289e-06,
"learning_rate": 5.611059703062713e-06,
"loss": 0.0,
"num_input_tokens_seen": 5583008,
"step": 9025
},
{
"epoch": 16.09625668449198,
"grad_norm": 5.213600161368959e-06,
"learning_rate": 5.5865335753800875e-06,
"loss": 0.0,
"num_input_tokens_seen": 5586080,
"step": 9030
},
{
"epoch": 16.10516934046346,
"grad_norm": 8.186175364244264e-06,
"learning_rate": 5.562054423263591e-06,
"loss": 0.0,
"num_input_tokens_seen": 5589248,
"step": 9035
},
{
"epoch": 16.114081996434937,
"grad_norm": 5.150972810952226e-06,
"learning_rate": 5.537622305946436e-06,
"loss": 0.0,
"num_input_tokens_seen": 5591936,
"step": 9040
},
{
"epoch": 16.122994652406415,
"grad_norm": 1.3816433238389436e-05,
"learning_rate": 5.513237282548034e-06,
"loss": 0.0,
"num_input_tokens_seen": 5594304,
"step": 9045
},
{
"epoch": 16.131907308377897,
"grad_norm": 1.9653147319331765e-05,
"learning_rate": 5.4888994120738164e-06,
"loss": 0.0,
"num_input_tokens_seen": 5597376,
"step": 9050
},
{
"epoch": 16.140819964349376,
"grad_norm": 6.136602678452618e-06,
"learning_rate": 5.464608753415146e-06,
"loss": 0.0,
"num_input_tokens_seen": 5600896,
"step": 9055
},
{
"epoch": 16.149732620320854,
"grad_norm": 5.412967766460497e-06,
"learning_rate": 5.440365365349126e-06,
"loss": 0.0,
"num_input_tokens_seen": 5603552,
"step": 9060
},
{
"epoch": 16.158645276292336,
"grad_norm": 5.781025720352773e-06,
"learning_rate": 5.416169306538485e-06,
"loss": 0.0,
"num_input_tokens_seen": 5607264,
"step": 9065
},
{
"epoch": 16.167557932263815,
"grad_norm": 9.013300768856425e-06,
"learning_rate": 5.392020635531433e-06,
"loss": 0.0,
"num_input_tokens_seen": 5609984,
"step": 9070
},
{
"epoch": 16.176470588235293,
"grad_norm": 4.711997462436557e-06,
"learning_rate": 5.367919410761493e-06,
"loss": 0.0,
"num_input_tokens_seen": 5613472,
"step": 9075
},
{
"epoch": 16.185383244206772,
"grad_norm": 4.303492823964916e-05,
"learning_rate": 5.343865690547401e-06,
"loss": 0.0,
"num_input_tokens_seen": 5616608,
"step": 9080
},
{
"epoch": 16.194295900178254,
"grad_norm": 5.112724466016516e-05,
"learning_rate": 5.319859533092933e-06,
"loss": 0.0,
"num_input_tokens_seen": 5619104,
"step": 9085
},
{
"epoch": 16.203208556149733,
"grad_norm": 4.8787801461003255e-06,
"learning_rate": 5.295900996486782e-06,
"loss": 0.0,
"num_input_tokens_seen": 5622208,
"step": 9090
},
{
"epoch": 16.21212121212121,
"grad_norm": 1.4346080206451006e-05,
"learning_rate": 5.271990138702418e-06,
"loss": 0.0,
"num_input_tokens_seen": 5625056,
"step": 9095
},
{
"epoch": 16.221033868092693,
"grad_norm": 3.947042569052428e-05,
"learning_rate": 5.248127017597909e-06,
"loss": 0.0,
"num_input_tokens_seen": 5628608,
"step": 9100
},
{
"epoch": 16.22994652406417,
"grad_norm": 6.575951829290716e-06,
"learning_rate": 5.2243116909158475e-06,
"loss": 0.0,
"num_input_tokens_seen": 5631776,
"step": 9105
},
{
"epoch": 16.23885918003565,
"grad_norm": 5.081686140329111e-06,
"learning_rate": 5.200544216283168e-06,
"loss": 0.0,
"num_input_tokens_seen": 5634720,
"step": 9110
},
{
"epoch": 16.24777183600713,
"grad_norm": 7.938112503325101e-06,
"learning_rate": 5.17682465121099e-06,
"loss": 0.0,
"num_input_tokens_seen": 5638048,
"step": 9115
},
{
"epoch": 16.25668449197861,
"grad_norm": 6.122967079136288e-06,
"learning_rate": 5.153153053094551e-06,
"loss": 0.0,
"num_input_tokens_seen": 5641600,
"step": 9120
},
{
"epoch": 16.26559714795009,
"grad_norm": 1.0540902621869463e-05,
"learning_rate": 5.129529479212969e-06,
"loss": 0.0,
"num_input_tokens_seen": 5644832,
"step": 9125
},
{
"epoch": 16.274509803921568,
"grad_norm": 4.833938874071464e-06,
"learning_rate": 5.105953986729195e-06,
"loss": 0.0,
"num_input_tokens_seen": 5648192,
"step": 9130
},
{
"epoch": 16.28342245989305,
"grad_norm": 4.714757778856438e-06,
"learning_rate": 5.082426632689827e-06,
"loss": 0.0,
"num_input_tokens_seen": 5650912,
"step": 9135
},
{
"epoch": 16.292335115864528,
"grad_norm": 6.666777608188568e-06,
"learning_rate": 5.058947474024958e-06,
"loss": 0.0,
"num_input_tokens_seen": 5654016,
"step": 9140
},
{
"epoch": 16.301247771836007,
"grad_norm": 4.947042270941893e-06,
"learning_rate": 5.0355165675480845e-06,
"loss": 0.0,
"num_input_tokens_seen": 5657376,
"step": 9145
},
{
"epoch": 16.310160427807485,
"grad_norm": 3.160488631692715e-05,
"learning_rate": 5.01213396995594e-06,
"loss": 0.0,
"num_input_tokens_seen": 5660608,
"step": 9150
},
{
"epoch": 16.319073083778967,
"grad_norm": 4.728987278213026e-06,
"learning_rate": 4.988799737828362e-06,
"loss": 0.0,
"num_input_tokens_seen": 5663072,
"step": 9155
},
{
"epoch": 16.327985739750446,
"grad_norm": 4.394975348986918e-06,
"learning_rate": 4.96551392762816e-06,
"loss": 0.0,
"num_input_tokens_seen": 5666304,
"step": 9160
},
{
"epoch": 16.336898395721924,
"grad_norm": 5.376489752961788e-06,
"learning_rate": 4.94227659570096e-06,
"loss": 0.0,
"num_input_tokens_seen": 5668960,
"step": 9165
},
{
"epoch": 16.345811051693406,
"grad_norm": 5.486904228746425e-06,
"learning_rate": 4.9190877982750935e-06,
"loss": 0.0,
"num_input_tokens_seen": 5672032,
"step": 9170
},
{
"epoch": 16.354723707664885,
"grad_norm": 2.8458198357839137e-05,
"learning_rate": 4.8959475914614554e-06,
"loss": 0.0,
"num_input_tokens_seen": 5674912,
"step": 9175
},
{
"epoch": 16.363636363636363,
"grad_norm": 5.422547928901622e-06,
"learning_rate": 4.872856031253362e-06,
"loss": 0.0,
"num_input_tokens_seen": 5677984,
"step": 9180
},
{
"epoch": 16.372549019607842,
"grad_norm": 8.959475962910801e-06,
"learning_rate": 4.849813173526416e-06,
"loss": 0.0,
"num_input_tokens_seen": 5680320,
"step": 9185
},
{
"epoch": 16.381461675579324,
"grad_norm": 6.08643813393428e-06,
"learning_rate": 4.826819074038361e-06,
"loss": 0.0,
"num_input_tokens_seen": 5682688,
"step": 9190
},
{
"epoch": 16.390374331550802,
"grad_norm": 8.104920198093168e-06,
"learning_rate": 4.803873788428972e-06,
"loss": 0.0,
"num_input_tokens_seen": 5685664,
"step": 9195
},
{
"epoch": 16.39928698752228,
"grad_norm": 2.9293587431311607e-05,
"learning_rate": 4.780977372219916e-06,
"loss": 0.0,
"num_input_tokens_seen": 5688768,
"step": 9200
},
{
"epoch": 16.40819964349376,
"grad_norm": 7.23411494618631e-06,
"learning_rate": 4.758129880814574e-06,
"loss": 0.0,
"num_input_tokens_seen": 5692256,
"step": 9205
},
{
"epoch": 16.41711229946524,
"grad_norm": 4.782838459505001e-06,
"learning_rate": 4.735331369497992e-06,
"loss": 0.0,
"num_input_tokens_seen": 5695904,
"step": 9210
},
{
"epoch": 16.42602495543672,
"grad_norm": 5.022124696552055e-06,
"learning_rate": 4.712581893436646e-06,
"loss": 0.0,
"num_input_tokens_seen": 5698720,
"step": 9215
},
{
"epoch": 16.4349376114082,
"grad_norm": 4.0708955566515215e-06,
"learning_rate": 4.689881507678393e-06,
"loss": 0.0,
"num_input_tokens_seen": 5702208,
"step": 9220
},
{
"epoch": 16.44385026737968,
"grad_norm": 7.042349807306891e-06,
"learning_rate": 4.667230267152295e-06,
"loss": 0.0,
"num_input_tokens_seen": 5705344,
"step": 9225
},
{
"epoch": 16.45276292335116,
"grad_norm": 6.274646239035064e-06,
"learning_rate": 4.644628226668485e-06,
"loss": 0.0,
"num_input_tokens_seen": 5708960,
"step": 9230
},
{
"epoch": 16.461675579322637,
"grad_norm": 7.114686741260812e-05,
"learning_rate": 4.622075440918058e-06,
"loss": 0.0,
"num_input_tokens_seen": 5712128,
"step": 9235
},
{
"epoch": 16.470588235294116,
"grad_norm": 4.347176400187891e-06,
"learning_rate": 4.599571964472921e-06,
"loss": 0.0,
"num_input_tokens_seen": 5715552,
"step": 9240
},
{
"epoch": 16.479500891265598,
"grad_norm": 8.624338079243898e-06,
"learning_rate": 4.577117851785665e-06,
"loss": 0.0,
"num_input_tokens_seen": 5719488,
"step": 9245
},
{
"epoch": 16.488413547237077,
"grad_norm": 3.135059159831144e-05,
"learning_rate": 4.554713157189439e-06,
"loss": 0.0,
"num_input_tokens_seen": 5722560,
"step": 9250
},
{
"epoch": 16.497326203208555,
"grad_norm": 5.457145107357064e-06,
"learning_rate": 4.5323579348977966e-06,
"loss": 0.0,
"num_input_tokens_seen": 5726336,
"step": 9255
},
{
"epoch": 16.506238859180037,
"grad_norm": 5.291251000016928e-06,
"learning_rate": 4.510052239004597e-06,
"loss": 0.0,
"num_input_tokens_seen": 5729632,
"step": 9260
},
{
"epoch": 16.515151515151516,
"grad_norm": 6.33938861938077e-06,
"learning_rate": 4.487796123483856e-06,
"loss": 0.0,
"num_input_tokens_seen": 5732384,
"step": 9265
},
{
"epoch": 16.524064171122994,
"grad_norm": 1.3511777979147155e-05,
"learning_rate": 4.46558964218961e-06,
"loss": 0.0,
"num_input_tokens_seen": 5735328,
"step": 9270
},
{
"epoch": 16.532976827094473,
"grad_norm": 8.118163350445684e-06,
"learning_rate": 4.443432848855811e-06,
"loss": 0.0,
"num_input_tokens_seen": 5738016,
"step": 9275
},
{
"epoch": 16.541889483065955,
"grad_norm": 6.011026471242076e-06,
"learning_rate": 4.421325797096146e-06,
"loss": 0.0,
"num_input_tokens_seen": 5740896,
"step": 9280
},
{
"epoch": 16.550802139037433,
"grad_norm": 4.317985258239787e-06,
"learning_rate": 4.399268540403975e-06,
"loss": 0.0,
"num_input_tokens_seen": 5743936,
"step": 9285
},
{
"epoch": 16.55971479500891,
"grad_norm": 4.823016752197873e-06,
"learning_rate": 4.377261132152155e-06,
"loss": 0.0,
"num_input_tokens_seen": 5746464,
"step": 9290
},
{
"epoch": 16.568627450980394,
"grad_norm": 4.637930487660924e-06,
"learning_rate": 4.355303625592899e-06,
"loss": 0.0,
"num_input_tokens_seen": 5749376,
"step": 9295
},
{
"epoch": 16.577540106951872,
"grad_norm": 5.199408860789845e-06,
"learning_rate": 4.333396073857724e-06,
"loss": 0.0,
"num_input_tokens_seen": 5752480,
"step": 9300
},
{
"epoch": 16.58645276292335,
"grad_norm": 1.6364750990760513e-05,
"learning_rate": 4.311538529957213e-06,
"loss": 0.0,
"num_input_tokens_seen": 5755360,
"step": 9305
},
{
"epoch": 16.59536541889483,
"grad_norm": 5.023352514399448e-06,
"learning_rate": 4.289731046780973e-06,
"loss": 0.0,
"num_input_tokens_seen": 5758752,
"step": 9310
},
{
"epoch": 16.60427807486631,
"grad_norm": 5.1138937124051154e-05,
"learning_rate": 4.267973677097481e-06,
"loss": 0.0,
"num_input_tokens_seen": 5762144,
"step": 9315
},
{
"epoch": 16.61319073083779,
"grad_norm": 6.966163255128777e-06,
"learning_rate": 4.246266473553931e-06,
"loss": 0.0,
"num_input_tokens_seen": 5765760,
"step": 9320
},
{
"epoch": 16.62210338680927,
"grad_norm": 8.861073001753539e-05,
"learning_rate": 4.22460948867614e-06,
"loss": 0.0,
"num_input_tokens_seen": 5768704,
"step": 9325
},
{
"epoch": 16.63101604278075,
"grad_norm": 1.665574382059276e-05,
"learning_rate": 4.203002774868414e-06,
"loss": 0.0,
"num_input_tokens_seen": 5771808,
"step": 9330
},
{
"epoch": 16.63992869875223,
"grad_norm": 5.504481123352889e-06,
"learning_rate": 4.18144638441341e-06,
"loss": 0.0,
"num_input_tokens_seen": 5774912,
"step": 9335
},
{
"epoch": 16.648841354723707,
"grad_norm": 5.119895831739996e-06,
"learning_rate": 4.159940369472015e-06,
"loss": 0.0,
"num_input_tokens_seen": 5777792,
"step": 9340
},
{
"epoch": 16.657754010695186,
"grad_norm": 5.188813702261541e-06,
"learning_rate": 4.138484782083219e-06,
"loss": 0.0,
"num_input_tokens_seen": 5780256,
"step": 9345
},
{
"epoch": 16.666666666666668,
"grad_norm": 5.0006860874418635e-06,
"learning_rate": 4.11707967416399e-06,
"loss": 0.0,
"num_input_tokens_seen": 5784032,
"step": 9350
},
{
"epoch": 16.675579322638146,
"grad_norm": 1.523861828900408e-05,
"learning_rate": 4.095725097509157e-06,
"loss": 0.0,
"num_input_tokens_seen": 5788064,
"step": 9355
},
{
"epoch": 16.684491978609625,
"grad_norm": 4.191660536889685e-06,
"learning_rate": 4.0744211037912706e-06,
"loss": 0.0,
"num_input_tokens_seen": 5791584,
"step": 9360
},
{
"epoch": 16.693404634581107,
"grad_norm": 6.965089141885983e-06,
"learning_rate": 4.0531677445604846e-06,
"loss": 0.0,
"num_input_tokens_seen": 5795008,
"step": 9365
},
{
"epoch": 16.702317290552585,
"grad_norm": 4.43164481112035e-06,
"learning_rate": 4.031965071244423e-06,
"loss": 0.0,
"num_input_tokens_seen": 5798400,
"step": 9370
},
{
"epoch": 16.711229946524064,
"grad_norm": 4.396553777041845e-06,
"learning_rate": 4.010813135148073e-06,
"loss": 0.0,
"num_input_tokens_seen": 5801312,
"step": 9375
},
{
"epoch": 16.720142602495542,
"grad_norm": 9.282194696424995e-06,
"learning_rate": 3.9897119874536536e-06,
"loss": 0.0,
"num_input_tokens_seen": 5803936,
"step": 9380
},
{
"epoch": 16.729055258467024,
"grad_norm": 5.5876425903989e-06,
"learning_rate": 3.968661679220468e-06,
"loss": 0.0,
"num_input_tokens_seen": 5807232,
"step": 9385
},
{
"epoch": 16.737967914438503,
"grad_norm": 1.001564487523865e-05,
"learning_rate": 3.9476622613848356e-06,
"loss": 0.0,
"num_input_tokens_seen": 5810624,
"step": 9390
},
{
"epoch": 16.74688057040998,
"grad_norm": 1.018101738736732e-05,
"learning_rate": 3.9267137847599e-06,
"loss": 0.0,
"num_input_tokens_seen": 5813184,
"step": 9395
},
{
"epoch": 16.75579322638146,
"grad_norm": 4.13356701756129e-06,
"learning_rate": 3.905816300035559e-06,
"loss": 0.0,
"num_input_tokens_seen": 5815904,
"step": 9400
},
{
"epoch": 16.764705882352942,
"grad_norm": 5.294611128192628e-06,
"learning_rate": 3.884969857778325e-06,
"loss": 0.0,
"num_input_tokens_seen": 5819744,
"step": 9405
},
{
"epoch": 16.77361853832442,
"grad_norm": 6.165806553326547e-06,
"learning_rate": 3.864174508431187e-06,
"loss": 0.0,
"num_input_tokens_seen": 5822688,
"step": 9410
},
{
"epoch": 16.7825311942959,
"grad_norm": 1.4151906725601293e-05,
"learning_rate": 3.843430302313511e-06,
"loss": 0.0,
"num_input_tokens_seen": 5826144,
"step": 9415
},
{
"epoch": 16.79144385026738,
"grad_norm": 6.0542051869560964e-06,
"learning_rate": 3.822737289620909e-06,
"loss": 0.0,
"num_input_tokens_seen": 5830048,
"step": 9420
},
{
"epoch": 16.80035650623886,
"grad_norm": 9.024288374348544e-06,
"learning_rate": 3.8020955204251223e-06,
"loss": 0.0,
"num_input_tokens_seen": 5833760,
"step": 9425
},
{
"epoch": 16.809269162210338,
"grad_norm": 3.84986788048991e-06,
"learning_rate": 3.781505044673894e-06,
"loss": 0.0,
"num_input_tokens_seen": 5837440,
"step": 9430
},
{
"epoch": 16.818181818181817,
"grad_norm": 5.007711934013059e-06,
"learning_rate": 3.760965912190839e-06,
"loss": 0.0,
"num_input_tokens_seen": 5840480,
"step": 9435
},
{
"epoch": 16.8270944741533,
"grad_norm": 5.706263891624985e-06,
"learning_rate": 3.740478172675346e-06,
"loss": 0.0,
"num_input_tokens_seen": 5843520,
"step": 9440
},
{
"epoch": 16.836007130124777,
"grad_norm": 8.089661605481524e-06,
"learning_rate": 3.720041875702451e-06,
"loss": 0.0,
"num_input_tokens_seen": 5846464,
"step": 9445
},
{
"epoch": 16.844919786096256,
"grad_norm": 8.192110726668034e-06,
"learning_rate": 3.699657070722698e-06,
"loss": 0.0,
"num_input_tokens_seen": 5849088,
"step": 9450
},
{
"epoch": 16.853832442067738,
"grad_norm": 4.462923243409023e-06,
"learning_rate": 3.6793238070620517e-06,
"loss": 0.0,
"num_input_tokens_seen": 5852128,
"step": 9455
},
{
"epoch": 16.862745098039216,
"grad_norm": 1.4291997104010079e-05,
"learning_rate": 3.659042133921736e-06,
"loss": 0.0,
"num_input_tokens_seen": 5855488,
"step": 9460
},
{
"epoch": 16.871657754010695,
"grad_norm": 5.222723757469794e-06,
"learning_rate": 3.6388121003781613e-06,
"loss": 0.0,
"num_input_tokens_seen": 5858816,
"step": 9465
},
{
"epoch": 16.880570409982173,
"grad_norm": 5.399089786806144e-06,
"learning_rate": 3.6186337553827747e-06,
"loss": 0.0,
"num_input_tokens_seen": 5861856,
"step": 9470
},
{
"epoch": 16.889483065953655,
"grad_norm": 4.9572722673474345e-06,
"learning_rate": 3.5985071477619397e-06,
"loss": 0.0,
"num_input_tokens_seen": 5864928,
"step": 9475
},
{
"epoch": 16.898395721925134,
"grad_norm": 4.7666785576439e-06,
"learning_rate": 3.57843232621686e-06,
"loss": 0.0,
"num_input_tokens_seen": 5868128,
"step": 9480
},
{
"epoch": 16.907308377896612,
"grad_norm": 5.753831374022411e-06,
"learning_rate": 3.55840933932339e-06,
"loss": 0.0,
"num_input_tokens_seen": 5870624,
"step": 9485
},
{
"epoch": 16.916221033868094,
"grad_norm": 7.89822661317885e-06,
"learning_rate": 3.5384382355319877e-06,
"loss": 0.0,
"num_input_tokens_seen": 5873504,
"step": 9490
},
{
"epoch": 16.925133689839573,
"grad_norm": 1.078835157386493e-05,
"learning_rate": 3.5185190631675635e-06,
"loss": 0.0,
"num_input_tokens_seen": 5876448,
"step": 9495
},
{
"epoch": 16.93404634581105,
"grad_norm": 1.1036178875656333e-05,
"learning_rate": 3.498651870429345e-06,
"loss": 0.0,
"num_input_tokens_seen": 5879392,
"step": 9500
},
{
"epoch": 16.94295900178253,
"grad_norm": 4.848511252930621e-06,
"learning_rate": 3.478836705390809e-06,
"loss": 0.0,
"num_input_tokens_seen": 5882944,
"step": 9505
},
{
"epoch": 16.951871657754012,
"grad_norm": 4.952781637257431e-06,
"learning_rate": 3.4590736159995253e-06,
"loss": 0.0,
"num_input_tokens_seen": 5886592,
"step": 9510
},
{
"epoch": 16.96078431372549,
"grad_norm": 6.254635991354007e-06,
"learning_rate": 3.4393626500770574e-06,
"loss": 0.0,
"num_input_tokens_seen": 5890144,
"step": 9515
},
{
"epoch": 16.96969696969697,
"grad_norm": 4.650720711651957e-06,
"learning_rate": 3.4197038553188484e-06,
"loss": 0.0,
"num_input_tokens_seen": 5892928,
"step": 9520
},
{
"epoch": 16.97860962566845,
"grad_norm": 5.847327429364668e-06,
"learning_rate": 3.400097279294087e-06,
"loss": 0.0,
"num_input_tokens_seen": 5895104,
"step": 9525
},
{
"epoch": 16.98752228163993,
"grad_norm": 5.981655249343021e-06,
"learning_rate": 3.3805429694456185e-06,
"loss": 0.0,
"num_input_tokens_seen": 5898560,
"step": 9530
},
{
"epoch": 16.996434937611408,
"grad_norm": 5.227420388109749e-06,
"learning_rate": 3.3610409730898155e-06,
"loss": 0.0,
"num_input_tokens_seen": 5901152,
"step": 9535
},
{
"epoch": 17.0,
"eval_loss": 0.2681449055671692,
"eval_runtime": 4.5873,
"eval_samples_per_second": 54.28,
"eval_steps_per_second": 13.734,
"num_input_tokens_seen": 5902048,
"step": 9537
},
{
"epoch": 17.005347593582886,
"grad_norm": 4.638813152268995e-06,
"learning_rate": 3.341591337416461e-06,
"loss": 0.0,
"num_input_tokens_seen": 5904032,
"step": 9540
},
{
"epoch": 17.01426024955437,
"grad_norm": 2.263891656184569e-05,
"learning_rate": 3.3221941094886493e-06,
"loss": 0.0,
"num_input_tokens_seen": 5906880,
"step": 9545
},
{
"epoch": 17.023172905525847,
"grad_norm": 1.2489145774452481e-05,
"learning_rate": 3.3028493362426387e-06,
"loss": 0.0,
"num_input_tokens_seen": 5909344,
"step": 9550
},
{
"epoch": 17.032085561497325,
"grad_norm": 4.581550456350669e-06,
"learning_rate": 3.2835570644877854e-06,
"loss": 0.0,
"num_input_tokens_seen": 5912064,
"step": 9555
},
{
"epoch": 17.040998217468804,
"grad_norm": 5.454068741528317e-06,
"learning_rate": 3.2643173409063977e-06,
"loss": 0.0,
"num_input_tokens_seen": 5915808,
"step": 9560
},
{
"epoch": 17.049910873440286,
"grad_norm": 2.5720153644215316e-05,
"learning_rate": 3.2451302120536155e-06,
"loss": 0.0,
"num_input_tokens_seen": 5918656,
"step": 9565
},
{
"epoch": 17.058823529411764,
"grad_norm": 7.229024049593136e-06,
"learning_rate": 3.2259957243573474e-06,
"loss": 0.0,
"num_input_tokens_seen": 5921344,
"step": 9570
},
{
"epoch": 17.067736185383243,
"grad_norm": 4.824169081985019e-06,
"learning_rate": 3.206913924118085e-06,
"loss": 0.0,
"num_input_tokens_seen": 5924544,
"step": 9575
},
{
"epoch": 17.076648841354725,
"grad_norm": 4.6479017328238115e-06,
"learning_rate": 3.1878848575088576e-06,
"loss": 0.0,
"num_input_tokens_seen": 5927200,
"step": 9580
},
{
"epoch": 17.085561497326204,
"grad_norm": 4.2676165321609005e-06,
"learning_rate": 3.168908570575085e-06,
"loss": 0.0,
"num_input_tokens_seen": 5930848,
"step": 9585
},
{
"epoch": 17.094474153297682,
"grad_norm": 1.122180765378289e-05,
"learning_rate": 3.149985109234463e-06,
"loss": 0.0,
"num_input_tokens_seen": 5934048,
"step": 9590
},
{
"epoch": 17.10338680926916,
"grad_norm": 6.161967576190364e-06,
"learning_rate": 3.131114519276876e-06,
"loss": 0.0,
"num_input_tokens_seen": 5936864,
"step": 9595
},
{
"epoch": 17.112299465240643,
"grad_norm": 2.578197563707363e-05,
"learning_rate": 3.112296846364271e-06,
"loss": 0.0,
"num_input_tokens_seen": 5939936,
"step": 9600
},
{
"epoch": 17.12121212121212,
"grad_norm": 5.605054411716992e-06,
"learning_rate": 3.0935321360305468e-06,
"loss": 0.0,
"num_input_tokens_seen": 5943968,
"step": 9605
},
{
"epoch": 17.1301247771836,
"grad_norm": 5.6164799389080144e-06,
"learning_rate": 3.074820433681455e-06,
"loss": 0.0,
"num_input_tokens_seen": 5947104,
"step": 9610
},
{
"epoch": 17.13903743315508,
"grad_norm": 6.087178917368874e-06,
"learning_rate": 3.0561617845944633e-06,
"loss": 0.0,
"num_input_tokens_seen": 5950656,
"step": 9615
},
{
"epoch": 17.14795008912656,
"grad_norm": 8.51512868393911e-06,
"learning_rate": 3.037556233918684e-06,
"loss": 0.0,
"num_input_tokens_seen": 5954144,
"step": 9620
},
{
"epoch": 17.15686274509804,
"grad_norm": 6.7033947743766475e-06,
"learning_rate": 3.0190038266747184e-06,
"loss": 0.0,
"num_input_tokens_seen": 5957856,
"step": 9625
},
{
"epoch": 17.165775401069517,
"grad_norm": 6.372693405864993e-06,
"learning_rate": 3.0005046077546147e-06,
"loss": 0.0,
"num_input_tokens_seen": 5960384,
"step": 9630
},
{
"epoch": 17.174688057041,
"grad_norm": 8.521442396158818e-06,
"learning_rate": 2.9820586219216908e-06,
"loss": 0.0,
"num_input_tokens_seen": 5962976,
"step": 9635
},
{
"epoch": 17.183600713012478,
"grad_norm": 5.297032657836098e-06,
"learning_rate": 2.9636659138104513e-06,
"loss": 0.0,
"num_input_tokens_seen": 5965856,
"step": 9640
},
{
"epoch": 17.192513368983956,
"grad_norm": 0.00017949608445633203,
"learning_rate": 2.9453265279264954e-06,
"loss": 0.0,
"num_input_tokens_seen": 5968832,
"step": 9645
},
{
"epoch": 17.20142602495544,
"grad_norm": 6.363478860293981e-06,
"learning_rate": 2.9270405086464e-06,
"loss": 0.0,
"num_input_tokens_seen": 5971616,
"step": 9650
},
{
"epoch": 17.210338680926917,
"grad_norm": 0.0001252765068784356,
"learning_rate": 2.908807900217583e-06,
"loss": 0.0,
"num_input_tokens_seen": 5975296,
"step": 9655
},
{
"epoch": 17.219251336898395,
"grad_norm": 1.5092450666998047e-05,
"learning_rate": 2.8906287467582616e-06,
"loss": 0.0,
"num_input_tokens_seen": 5978560,
"step": 9660
},
{
"epoch": 17.228163992869874,
"grad_norm": 5.846444764756598e-06,
"learning_rate": 2.87250309225727e-06,
"loss": 0.0,
"num_input_tokens_seen": 5981792,
"step": 9665
},
{
"epoch": 17.237076648841356,
"grad_norm": 3.902230218955083e-06,
"learning_rate": 2.8544309805740023e-06,
"loss": 0.0,
"num_input_tokens_seen": 5984352,
"step": 9670
},
{
"epoch": 17.245989304812834,
"grad_norm": 3.8089124245743733e-06,
"learning_rate": 2.8364124554383057e-06,
"loss": 0.0,
"num_input_tokens_seen": 5987424,
"step": 9675
},
{
"epoch": 17.254901960784313,
"grad_norm": 4.704589628090616e-06,
"learning_rate": 2.8184475604503324e-06,
"loss": 0.0,
"num_input_tokens_seen": 5990720,
"step": 9680
},
{
"epoch": 17.263814616755795,
"grad_norm": 1.3694937479158398e-05,
"learning_rate": 2.8005363390804896e-06,
"loss": 0.0,
"num_input_tokens_seen": 5993792,
"step": 9685
},
{
"epoch": 17.272727272727273,
"grad_norm": 3.3423213608330116e-05,
"learning_rate": 2.782678834669297e-06,
"loss": 0.0,
"num_input_tokens_seen": 5997344,
"step": 9690
},
{
"epoch": 17.281639928698752,
"grad_norm": 4.3190329961362295e-06,
"learning_rate": 2.7648750904272964e-06,
"loss": 0.0,
"num_input_tokens_seen": 6000096,
"step": 9695
},
{
"epoch": 17.29055258467023,
"grad_norm": 3.917830326827243e-06,
"learning_rate": 2.747125149434948e-06,
"loss": 0.0,
"num_input_tokens_seen": 6003168,
"step": 9700
},
{
"epoch": 17.299465240641712,
"grad_norm": 2.409498847555369e-05,
"learning_rate": 2.7294290546425044e-06,
"loss": 0.0,
"num_input_tokens_seen": 6006944,
"step": 9705
},
{
"epoch": 17.30837789661319,
"grad_norm": 4.727301984530641e-06,
"learning_rate": 2.7117868488699517e-06,
"loss": 0.0,
"num_input_tokens_seen": 6009984,
"step": 9710
},
{
"epoch": 17.31729055258467,
"grad_norm": 4.537017048278358e-06,
"learning_rate": 2.6941985748068418e-06,
"loss": 0.0,
"num_input_tokens_seen": 6013344,
"step": 9715
},
{
"epoch": 17.32620320855615,
"grad_norm": 5.146333023731131e-06,
"learning_rate": 2.6766642750122666e-06,
"loss": 0.0,
"num_input_tokens_seen": 6016992,
"step": 9720
},
{
"epoch": 17.33511586452763,
"grad_norm": 2.907273665186949e-05,
"learning_rate": 2.659183991914696e-06,
"loss": 0.0,
"num_input_tokens_seen": 6020416,
"step": 9725
},
{
"epoch": 17.34402852049911,
"grad_norm": 3.90044351661345e-06,
"learning_rate": 2.641757767811881e-06,
"loss": 0.0,
"num_input_tokens_seen": 6023712,
"step": 9730
},
{
"epoch": 17.352941176470587,
"grad_norm": 7.279854798980523e-06,
"learning_rate": 2.624385644870783e-06,
"loss": 0.0,
"num_input_tokens_seen": 6026656,
"step": 9735
},
{
"epoch": 17.36185383244207,
"grad_norm": 1.2712700481642969e-05,
"learning_rate": 2.607067665127441e-06,
"loss": 0.0,
"num_input_tokens_seen": 6029568,
"step": 9740
},
{
"epoch": 17.370766488413548,
"grad_norm": 5.397015684138751e-06,
"learning_rate": 2.5898038704868818e-06,
"loss": 0.0,
"num_input_tokens_seen": 6033312,
"step": 9745
},
{
"epoch": 17.379679144385026,
"grad_norm": 5.237438472249778e-06,
"learning_rate": 2.5725943027230333e-06,
"loss": 0.0,
"num_input_tokens_seen": 6036128,
"step": 9750
},
{
"epoch": 17.388591800356505,
"grad_norm": 9.407116885995492e-05,
"learning_rate": 2.555439003478591e-06,
"loss": 0.0,
"num_input_tokens_seen": 6039040,
"step": 9755
},
{
"epoch": 17.397504456327987,
"grad_norm": 6.8480708250717726e-06,
"learning_rate": 2.538338014264938e-06,
"loss": 0.0,
"num_input_tokens_seen": 6042432,
"step": 9760
},
{
"epoch": 17.406417112299465,
"grad_norm": 4.484073087951401e-06,
"learning_rate": 2.521291376462051e-06,
"loss": 0.0,
"num_input_tokens_seen": 6046080,
"step": 9765
},
{
"epoch": 17.415329768270944,
"grad_norm": 4.551226084004156e-06,
"learning_rate": 2.5042991313183745e-06,
"loss": 0.0,
"num_input_tokens_seen": 6048352,
"step": 9770
},
{
"epoch": 17.424242424242426,
"grad_norm": 8.205198355426546e-06,
"learning_rate": 2.4873613199507514e-06,
"loss": 0.0,
"num_input_tokens_seen": 6050976,
"step": 9775
},
{
"epoch": 17.433155080213904,
"grad_norm": 5.023833182349335e-06,
"learning_rate": 2.470477983344299e-06,
"loss": 0.0,
"num_input_tokens_seen": 6054784,
"step": 9780
},
{
"epoch": 17.442067736185383,
"grad_norm": 4.796798293682514e-06,
"learning_rate": 2.4536491623523284e-06,
"loss": 0.0,
"num_input_tokens_seen": 6057440,
"step": 9785
},
{
"epoch": 17.45098039215686,
"grad_norm": 4.666772383643547e-06,
"learning_rate": 2.436874897696234e-06,
"loss": 0.0,
"num_input_tokens_seen": 6060768,
"step": 9790
},
{
"epoch": 17.459893048128343,
"grad_norm": 4.6128952817525715e-06,
"learning_rate": 2.42015522996539e-06,
"loss": 0.0,
"num_input_tokens_seen": 6064096,
"step": 9795
},
{
"epoch": 17.46880570409982,
"grad_norm": 4.935674041917082e-06,
"learning_rate": 2.403490199617073e-06,
"loss": 0.0,
"num_input_tokens_seen": 6067072,
"step": 9800
},
{
"epoch": 17.4777183600713,
"grad_norm": 6.165453669382259e-05,
"learning_rate": 2.3868798469763307e-06,
"loss": 0.0,
"num_input_tokens_seen": 6070016,
"step": 9805
},
{
"epoch": 17.486631016042782,
"grad_norm": 6.289596512942808e-06,
"learning_rate": 2.370324212235936e-06,
"loss": 0.0,
"num_input_tokens_seen": 6073664,
"step": 9810
},
{
"epoch": 17.49554367201426,
"grad_norm": 4.492193511396181e-06,
"learning_rate": 2.35382333545624e-06,
"loss": 0.0,
"num_input_tokens_seen": 6076896,
"step": 9815
},
{
"epoch": 17.50445632798574,
"grad_norm": 1.5470133803319186e-05,
"learning_rate": 2.3373772565650874e-06,
"loss": 0.0,
"num_input_tokens_seen": 6080000,
"step": 9820
},
{
"epoch": 17.513368983957218,
"grad_norm": 1.5124073797778692e-05,
"learning_rate": 2.3209860153577402e-06,
"loss": 0.0,
"num_input_tokens_seen": 6082784,
"step": 9825
},
{
"epoch": 17.5222816399287,
"grad_norm": 3.944990112358937e-06,
"learning_rate": 2.304649651496754e-06,
"loss": 0.0,
"num_input_tokens_seen": 6085440,
"step": 9830
},
{
"epoch": 17.53119429590018,
"grad_norm": 5.5363611863867845e-06,
"learning_rate": 2.2883682045119063e-06,
"loss": 0.0,
"num_input_tokens_seen": 6088672,
"step": 9835
},
{
"epoch": 17.540106951871657,
"grad_norm": 5.04318541061366e-06,
"learning_rate": 2.272141713800094e-06,
"loss": 0.0,
"num_input_tokens_seen": 6091680,
"step": 9840
},
{
"epoch": 17.54901960784314,
"grad_norm": 4.3036125134676695e-06,
"learning_rate": 2.255970218625217e-06,
"loss": 0.0,
"num_input_tokens_seen": 6094336,
"step": 9845
},
{
"epoch": 17.557932263814617,
"grad_norm": 1.092425827664556e-05,
"learning_rate": 2.2398537581181155e-06,
"loss": 0.0,
"num_input_tokens_seen": 6096704,
"step": 9850
},
{
"epoch": 17.566844919786096,
"grad_norm": 4.398175860842457e-06,
"learning_rate": 2.2237923712764535e-06,
"loss": 0.0,
"num_input_tokens_seen": 6099136,
"step": 9855
},
{
"epoch": 17.575757575757574,
"grad_norm": 0.00013597174256574363,
"learning_rate": 2.2077860969646285e-06,
"loss": 0.0,
"num_input_tokens_seen": 6102016,
"step": 9860
},
{
"epoch": 17.584670231729056,
"grad_norm": 9.326985491497908e-06,
"learning_rate": 2.191834973913684e-06,
"loss": 0.0,
"num_input_tokens_seen": 6105440,
"step": 9865
},
{
"epoch": 17.593582887700535,
"grad_norm": 3.933530479116598e-06,
"learning_rate": 2.1759390407212117e-06,
"loss": 0.0,
"num_input_tokens_seen": 6108768,
"step": 9870
},
{
"epoch": 17.602495543672013,
"grad_norm": 7.7144486567704e-06,
"learning_rate": 2.1600983358512574e-06,
"loss": 0.0,
"num_input_tokens_seen": 6111840,
"step": 9875
},
{
"epoch": 17.611408199643495,
"grad_norm": 4.5524284359999e-06,
"learning_rate": 2.144312897634232e-06,
"loss": 0.0,
"num_input_tokens_seen": 6114048,
"step": 9880
},
{
"epoch": 17.620320855614974,
"grad_norm": 7.663302312721498e-06,
"learning_rate": 2.1285827642668065e-06,
"loss": 0.0,
"num_input_tokens_seen": 6117440,
"step": 9885
},
{
"epoch": 17.629233511586452,
"grad_norm": 4.616276783053763e-06,
"learning_rate": 2.1129079738118423e-06,
"loss": 0.0,
"num_input_tokens_seen": 6120736,
"step": 9890
},
{
"epoch": 17.63814616755793,
"grad_norm": 8.89071452547796e-06,
"learning_rate": 2.09728856419826e-06,
"loss": 0.0,
"num_input_tokens_seen": 6123392,
"step": 9895
},
{
"epoch": 17.647058823529413,
"grad_norm": 4.064894710609224e-06,
"learning_rate": 2.0817245732210057e-06,
"loss": 0.0,
"num_input_tokens_seen": 6126848,
"step": 9900
},
{
"epoch": 17.65597147950089,
"grad_norm": 5.309177595336223e-06,
"learning_rate": 2.0662160385409108e-06,
"loss": 0.0,
"num_input_tokens_seen": 6130048,
"step": 9905
},
{
"epoch": 17.66488413547237,
"grad_norm": 5.220045750320423e-06,
"learning_rate": 2.050762997684605e-06,
"loss": 0.0,
"num_input_tokens_seen": 6133216,
"step": 9910
},
{
"epoch": 17.67379679144385,
"grad_norm": 4.825315954803955e-06,
"learning_rate": 2.0353654880444635e-06,
"loss": 0.0,
"num_input_tokens_seen": 6136480,
"step": 9915
},
{
"epoch": 17.68270944741533,
"grad_norm": 8.878021617420018e-06,
"learning_rate": 2.0200235468784636e-06,
"loss": 0.0,
"num_input_tokens_seen": 6139136,
"step": 9920
},
{
"epoch": 17.69162210338681,
"grad_norm": 1.6448260794277303e-05,
"learning_rate": 2.0047372113101344e-06,
"loss": 0.0,
"num_input_tokens_seen": 6142112,
"step": 9925
},
{
"epoch": 17.700534759358288,
"grad_norm": 3.952519364247564e-06,
"learning_rate": 1.9895065183284683e-06,
"loss": 0.0,
"num_input_tokens_seen": 6145632,
"step": 9930
},
{
"epoch": 17.70944741532977,
"grad_norm": 5.336608865036396e-06,
"learning_rate": 1.9743315047877853e-06,
"loss": 0.0,
"num_input_tokens_seen": 6149152,
"step": 9935
},
{
"epoch": 17.718360071301248,
"grad_norm": 8.08969070931198e-06,
"learning_rate": 1.9592122074077012e-06,
"loss": 0.0,
"num_input_tokens_seen": 6152512,
"step": 9940
},
{
"epoch": 17.727272727272727,
"grad_norm": 1.4897877917974256e-05,
"learning_rate": 1.9441486627729987e-06,
"loss": 0.0,
"num_input_tokens_seen": 6155232,
"step": 9945
},
{
"epoch": 17.736185383244205,
"grad_norm": 5.695238087355392e-06,
"learning_rate": 1.929140907333557e-06,
"loss": 0.0,
"num_input_tokens_seen": 6158752,
"step": 9950
},
{
"epoch": 17.745098039215687,
"grad_norm": 5.3530861805484165e-06,
"learning_rate": 1.914188977404269e-06,
"loss": 0.0,
"num_input_tokens_seen": 6161696,
"step": 9955
},
{
"epoch": 17.754010695187166,
"grad_norm": 3.5964576454716735e-06,
"learning_rate": 1.899292909164932e-06,
"loss": 0.0,
"num_input_tokens_seen": 6165440,
"step": 9960
},
{
"epoch": 17.762923351158644,
"grad_norm": 2.9325330615392886e-05,
"learning_rate": 1.884452738660178e-06,
"loss": 0.0,
"num_input_tokens_seen": 6168128,
"step": 9965
},
{
"epoch": 17.771836007130126,
"grad_norm": 4.651951712730806e-06,
"learning_rate": 1.8696685017993849e-06,
"loss": 0.0,
"num_input_tokens_seen": 6171456,
"step": 9970
},
{
"epoch": 17.780748663101605,
"grad_norm": 9.019388016895391e-06,
"learning_rate": 1.8549402343565698e-06,
"loss": 0.0,
"num_input_tokens_seen": 6174880,
"step": 9975
},
{
"epoch": 17.789661319073083,
"grad_norm": 1.0646539521985687e-05,
"learning_rate": 1.8402679719703442e-06,
"loss": 0.0,
"num_input_tokens_seen": 6177984,
"step": 9980
},
{
"epoch": 17.79857397504456,
"grad_norm": 3.674474555737106e-06,
"learning_rate": 1.825651750143767e-06,
"loss": 0.0,
"num_input_tokens_seen": 6180928,
"step": 9985
},
{
"epoch": 17.807486631016044,
"grad_norm": 2.7525751647772267e-05,
"learning_rate": 1.8110916042443332e-06,
"loss": 0.0,
"num_input_tokens_seen": 6183712,
"step": 9990
},
{
"epoch": 17.816399286987522,
"grad_norm": 5.357886038837023e-06,
"learning_rate": 1.7965875695038215e-06,
"loss": 0.0,
"num_input_tokens_seen": 6186976,
"step": 9995
},
{
"epoch": 17.825311942959,
"grad_norm": 2.7742196834878996e-05,
"learning_rate": 1.782139681018244e-06,
"loss": 0.0,
"num_input_tokens_seen": 6190880,
"step": 10000
},
{
"epoch": 17.834224598930483,
"grad_norm": 5.267481810733443e-06,
"learning_rate": 1.767747973747752e-06,
"loss": 0.0,
"num_input_tokens_seen": 6193824,
"step": 10005
},
{
"epoch": 17.84313725490196,
"grad_norm": 1.7488835510448553e-05,
"learning_rate": 1.7534124825165505e-06,
"loss": 0.0,
"num_input_tokens_seen": 6197056,
"step": 10010
},
{
"epoch": 17.85204991087344,
"grad_norm": 4.473508852242958e-06,
"learning_rate": 1.7391332420128193e-06,
"loss": 0.0,
"num_input_tokens_seen": 6199776,
"step": 10015
},
{
"epoch": 17.86096256684492,
"grad_norm": 3.7598254039039603e-06,
"learning_rate": 1.7249102867886392e-06,
"loss": 0.0,
"num_input_tokens_seen": 6203392,
"step": 10020
},
{
"epoch": 17.8698752228164,
"grad_norm": 9.932436114468146e-06,
"learning_rate": 1.7107436512598661e-06,
"loss": 0.0,
"num_input_tokens_seen": 6206656,
"step": 10025
},
{
"epoch": 17.87878787878788,
"grad_norm": 5.878880983800627e-06,
"learning_rate": 1.6966333697061049e-06,
"loss": 0.0,
"num_input_tokens_seen": 6209664,
"step": 10030
},
{
"epoch": 17.887700534759357,
"grad_norm": 4.809881829714868e-06,
"learning_rate": 1.6825794762705765e-06,
"loss": 0.0,
"num_input_tokens_seen": 6212320,
"step": 10035
},
{
"epoch": 17.89661319073084,
"grad_norm": 4.702781552623492e-06,
"learning_rate": 1.6685820049600703e-06,
"loss": 0.0,
"num_input_tokens_seen": 6215136,
"step": 10040
},
{
"epoch": 17.905525846702318,
"grad_norm": 4.891365733783459e-06,
"learning_rate": 1.6546409896448457e-06,
"loss": 0.0,
"num_input_tokens_seen": 6218816,
"step": 10045
},
{
"epoch": 17.914438502673796,
"grad_norm": 5.0554826884763315e-06,
"learning_rate": 1.6407564640585572e-06,
"loss": 0.0,
"num_input_tokens_seen": 6221440,
"step": 10050
},
{
"epoch": 17.923351158645275,
"grad_norm": 4.003755293524591e-06,
"learning_rate": 1.6269284617981607e-06,
"loss": 0.0,
"num_input_tokens_seen": 6224576,
"step": 10055
},
{
"epoch": 17.932263814616757,
"grad_norm": 5.1156189329049084e-06,
"learning_rate": 1.6131570163238436e-06,
"loss": 0.0,
"num_input_tokens_seen": 6227744,
"step": 10060
},
{
"epoch": 17.941176470588236,
"grad_norm": 4.8709184738982e-06,
"learning_rate": 1.5994421609589388e-06,
"loss": 0.0,
"num_input_tokens_seen": 6231136,
"step": 10065
},
{
"epoch": 17.950089126559714,
"grad_norm": 5.6951562328322325e-06,
"learning_rate": 1.5857839288898558e-06,
"loss": 0.0,
"num_input_tokens_seen": 6234048,
"step": 10070
},
{
"epoch": 17.959001782531196,
"grad_norm": 5.639751179842278e-06,
"learning_rate": 1.5721823531659712e-06,
"loss": 0.0,
"num_input_tokens_seen": 6237120,
"step": 10075
},
{
"epoch": 17.967914438502675,
"grad_norm": 8.619826257927343e-05,
"learning_rate": 1.558637466699589e-06,
"loss": 0.0,
"num_input_tokens_seen": 6240256,
"step": 10080
},
{
"epoch": 17.976827094474153,
"grad_norm": 5.539429821510566e-06,
"learning_rate": 1.5451493022658332e-06,
"loss": 0.0,
"num_input_tokens_seen": 6243488,
"step": 10085
},
{
"epoch": 17.98573975044563,
"grad_norm": 5.7802731134870555e-06,
"learning_rate": 1.5317178925025571e-06,
"loss": 0.0,
"num_input_tokens_seen": 6247040,
"step": 10090
},
{
"epoch": 17.994652406417114,
"grad_norm": 4.963373612554278e-06,
"learning_rate": 1.5183432699103134e-06,
"loss": 0.0,
"num_input_tokens_seen": 6250880,
"step": 10095
},
{
"epoch": 18.0,
"eval_loss": 0.2708573043346405,
"eval_runtime": 4.5899,
"eval_samples_per_second": 54.249,
"eval_steps_per_second": 13.726,
"num_input_tokens_seen": 6252128,
"step": 10098
},
{
"epoch": 18.003565062388592,
"grad_norm": 5.739175321650691e-06,
"learning_rate": 1.5050254668522168e-06,
"loss": 0.0,
"num_input_tokens_seen": 6253536,
"step": 10100
},
{
"epoch": 18.01247771836007,
"grad_norm": 4.788083060702775e-06,
"learning_rate": 1.4917645155539062e-06,
"loss": 0.0,
"num_input_tokens_seen": 6256864,
"step": 10105
},
{
"epoch": 18.02139037433155,
"grad_norm": 3.880075837514596e-06,
"learning_rate": 1.4785604481034638e-06,
"loss": 0.0,
"num_input_tokens_seen": 6259360,
"step": 10110
},
{
"epoch": 18.03030303030303,
"grad_norm": 4.699511464423267e-06,
"learning_rate": 1.465413296451304e-06,
"loss": 0.0,
"num_input_tokens_seen": 6261920,
"step": 10115
},
{
"epoch": 18.03921568627451,
"grad_norm": 4.53999336968991e-06,
"learning_rate": 1.4523230924101433e-06,
"loss": 0.0,
"num_input_tokens_seen": 6265280,
"step": 10120
},
{
"epoch": 18.048128342245988,
"grad_norm": 4.677222477766918e-06,
"learning_rate": 1.4392898676548777e-06,
"loss": 0.0,
"num_input_tokens_seen": 6267936,
"step": 10125
},
{
"epoch": 18.05704099821747,
"grad_norm": 5.001421413908247e-06,
"learning_rate": 1.4263136537225442e-06,
"loss": 0.0,
"num_input_tokens_seen": 6271552,
"step": 10130
},
{
"epoch": 18.06595365418895,
"grad_norm": 4.5485739974537864e-06,
"learning_rate": 1.4133944820122258e-06,
"loss": 0.0,
"num_input_tokens_seen": 6274272,
"step": 10135
},
{
"epoch": 18.074866310160427,
"grad_norm": 8.964995686255861e-06,
"learning_rate": 1.4005323837849721e-06,
"loss": 0.0,
"num_input_tokens_seen": 6277056,
"step": 10140
},
{
"epoch": 18.083778966131906,
"grad_norm": 5.683996278094128e-06,
"learning_rate": 1.38772739016374e-06,
"loss": 0.0,
"num_input_tokens_seen": 6279488,
"step": 10145
},
{
"epoch": 18.092691622103388,
"grad_norm": 3.7518773297051666e-06,
"learning_rate": 1.3749795321332887e-06,
"loss": 0.0,
"num_input_tokens_seen": 6282944,
"step": 10150
},
{
"epoch": 18.101604278074866,
"grad_norm": 4.472956788958982e-06,
"learning_rate": 1.3622888405401462e-06,
"loss": 0.0,
"num_input_tokens_seen": 6286464,
"step": 10155
},
{
"epoch": 18.110516934046345,
"grad_norm": 6.11408313488937e-06,
"learning_rate": 1.3496553460925042e-06,
"loss": 0.0,
"num_input_tokens_seen": 6289344,
"step": 10160
},
{
"epoch": 18.119429590017827,
"grad_norm": 7.724917850282509e-06,
"learning_rate": 1.3370790793601373e-06,
"loss": 0.0,
"num_input_tokens_seen": 6292160,
"step": 10165
},
{
"epoch": 18.128342245989305,
"grad_norm": 4.958530553267337e-06,
"learning_rate": 1.3245600707743749e-06,
"loss": 0.0,
"num_input_tokens_seen": 6294976,
"step": 10170
},
{
"epoch": 18.137254901960784,
"grad_norm": 4.44446959591005e-06,
"learning_rate": 1.3120983506279689e-06,
"loss": 0.0,
"num_input_tokens_seen": 6298848,
"step": 10175
},
{
"epoch": 18.146167557932262,
"grad_norm": 4.498652288020821e-06,
"learning_rate": 1.2996939490750564e-06,
"loss": 0.0,
"num_input_tokens_seen": 6301568,
"step": 10180
},
{
"epoch": 18.155080213903744,
"grad_norm": 1.0262559044349473e-05,
"learning_rate": 1.2873468961310892e-06,
"loss": 0.0,
"num_input_tokens_seen": 6303936,
"step": 10185
},
{
"epoch": 18.163992869875223,
"grad_norm": 4.685638941737125e-06,
"learning_rate": 1.2750572216727318e-06,
"loss": 0.0,
"num_input_tokens_seen": 6307040,
"step": 10190
},
{
"epoch": 18.1729055258467,
"grad_norm": 5.877223884453997e-06,
"learning_rate": 1.2628249554378135e-06,
"loss": 0.0,
"num_input_tokens_seen": 6310240,
"step": 10195
},
{
"epoch": 18.181818181818183,
"grad_norm": 4.965536390955094e-06,
"learning_rate": 1.2506501270252712e-06,
"loss": 0.0,
"num_input_tokens_seen": 6313664,
"step": 10200
},
{
"epoch": 18.190730837789662,
"grad_norm": 7.754924808978103e-06,
"learning_rate": 1.238532765895023e-06,
"loss": 0.0,
"num_input_tokens_seen": 6316768,
"step": 10205
},
{
"epoch": 18.19964349376114,
"grad_norm": 4.4843045543530025e-06,
"learning_rate": 1.2264729013679588e-06,
"loss": 0.0,
"num_input_tokens_seen": 6319264,
"step": 10210
},
{
"epoch": 18.20855614973262,
"grad_norm": 5.228135705692694e-06,
"learning_rate": 1.2144705626258217e-06,
"loss": 0.0,
"num_input_tokens_seen": 6321984,
"step": 10215
},
{
"epoch": 18.2174688057041,
"grad_norm": 7.139366971387062e-06,
"learning_rate": 1.202525778711172e-06,
"loss": 0.0,
"num_input_tokens_seen": 6325952,
"step": 10220
},
{
"epoch": 18.22638146167558,
"grad_norm": 4.869689291808754e-06,
"learning_rate": 1.1906385785272923e-06,
"loss": 0.0,
"num_input_tokens_seen": 6328896,
"step": 10225
},
{
"epoch": 18.235294117647058,
"grad_norm": 4.0212094063463155e-06,
"learning_rate": 1.1788089908381372e-06,
"loss": 0.0,
"num_input_tokens_seen": 6332256,
"step": 10230
},
{
"epoch": 18.24420677361854,
"grad_norm": 7.100913990143454e-06,
"learning_rate": 1.167037044268246e-06,
"loss": 0.0,
"num_input_tokens_seen": 6335104,
"step": 10235
},
{
"epoch": 18.25311942959002,
"grad_norm": 6.65205589029938e-05,
"learning_rate": 1.1553227673026801e-06,
"loss": 0.0,
"num_input_tokens_seen": 6338592,
"step": 10240
},
{
"epoch": 18.262032085561497,
"grad_norm": 2.6237210477120243e-05,
"learning_rate": 1.1436661882869626e-06,
"loss": 0.0,
"num_input_tokens_seen": 6341856,
"step": 10245
},
{
"epoch": 18.270944741532976,
"grad_norm": 7.856396223360207e-06,
"learning_rate": 1.1320673354270034e-06,
"loss": 0.0,
"num_input_tokens_seen": 6345280,
"step": 10250
},
{
"epoch": 18.279857397504458,
"grad_norm": 3.6295623431215063e-06,
"learning_rate": 1.1205262367890101e-06,
"loss": 0.0,
"num_input_tokens_seen": 6349344,
"step": 10255
},
{
"epoch": 18.288770053475936,
"grad_norm": 4.837954747927142e-06,
"learning_rate": 1.1090429202994746e-06,
"loss": 0.0,
"num_input_tokens_seen": 6353152,
"step": 10260
},
{
"epoch": 18.297682709447415,
"grad_norm": 8.854965926730074e-06,
"learning_rate": 1.097617413745039e-06,
"loss": 0.0,
"num_input_tokens_seen": 6355840,
"step": 10265
},
{
"epoch": 18.306595365418893,
"grad_norm": 5.455266000353731e-05,
"learning_rate": 1.0862497447724802e-06,
"loss": 0.0,
"num_input_tokens_seen": 6359008,
"step": 10270
},
{
"epoch": 18.315508021390375,
"grad_norm": 1.102537407859927e-05,
"learning_rate": 1.0749399408886141e-06,
"loss": 0.0,
"num_input_tokens_seen": 6361760,
"step": 10275
},
{
"epoch": 18.324420677361854,
"grad_norm": 4.134295977564761e-06,
"learning_rate": 1.063688029460233e-06,
"loss": 0.0,
"num_input_tokens_seen": 6364512,
"step": 10280
},
{
"epoch": 18.333333333333332,
"grad_norm": 8.992203220259398e-06,
"learning_rate": 1.0524940377140635e-06,
"loss": 0.0,
"num_input_tokens_seen": 6367104,
"step": 10285
},
{
"epoch": 18.342245989304814,
"grad_norm": 7.417367669404484e-06,
"learning_rate": 1.0413579927366635e-06,
"loss": 0.0,
"num_input_tokens_seen": 6370400,
"step": 10290
},
{
"epoch": 18.351158645276293,
"grad_norm": 2.561887595220469e-05,
"learning_rate": 1.030279921474378e-06,
"loss": 0.0,
"num_input_tokens_seen": 6373536,
"step": 10295
},
{
"epoch": 18.36007130124777,
"grad_norm": 2.4329236111952923e-05,
"learning_rate": 1.0192598507332785e-06,
"loss": 0.0,
"num_input_tokens_seen": 6377280,
"step": 10300
},
{
"epoch": 18.36898395721925,
"grad_norm": 6.050835509086028e-06,
"learning_rate": 1.0082978071790815e-06,
"loss": 0.0,
"num_input_tokens_seen": 6380640,
"step": 10305
},
{
"epoch": 18.37789661319073,
"grad_norm": 1.3629637578560505e-05,
"learning_rate": 9.973938173370972e-07,
"loss": 0.0,
"num_input_tokens_seen": 6383616,
"step": 10310
},
{
"epoch": 18.38680926916221,
"grad_norm": 8.69268988026306e-05,
"learning_rate": 9.865479075921642e-07,
"loss": 0.0,
"num_input_tokens_seen": 6386432,
"step": 10315
},
{
"epoch": 18.39572192513369,
"grad_norm": 4.355636974651134e-06,
"learning_rate": 9.757601041885694e-07,
"loss": 0.0,
"num_input_tokens_seen": 6389824,
"step": 10320
},
{
"epoch": 18.40463458110517,
"grad_norm": 9.561532351654023e-06,
"learning_rate": 9.650304332300159e-07,
"loss": 0.0,
"num_input_tokens_seen": 6392352,
"step": 10325
},
{
"epoch": 18.41354723707665,
"grad_norm": 3.7180716390139423e-06,
"learning_rate": 9.54358920679524e-07,
"loss": 0.0,
"num_input_tokens_seen": 6395936,
"step": 10330
},
{
"epoch": 18.422459893048128,
"grad_norm": 4.346656169218477e-06,
"learning_rate": 9.437455923593963e-07,
"loss": 0.0,
"num_input_tokens_seen": 6399136,
"step": 10335
},
{
"epoch": 18.431372549019606,
"grad_norm": 0.0001780799648258835,
"learning_rate": 9.331904739511399e-07,
"loss": 0.0,
"num_input_tokens_seen": 6402592,
"step": 10340
},
{
"epoch": 18.44028520499109,
"grad_norm": 6.550861144205555e-06,
"learning_rate": 9.226935909954104e-07,
"loss": 0.0,
"num_input_tokens_seen": 6405728,
"step": 10345
},
{
"epoch": 18.449197860962567,
"grad_norm": 4.070516297360882e-06,
"learning_rate": 9.12254968891954e-07,
"loss": 0.0,
"num_input_tokens_seen": 6409664,
"step": 10350
},
{
"epoch": 18.458110516934045,
"grad_norm": 5.320921445672866e-06,
"learning_rate": 9.018746328995298e-07,
"loss": 0.0,
"num_input_tokens_seen": 6412160,
"step": 10355
},
{
"epoch": 18.467023172905527,
"grad_norm": 3.3889532460307237e-06,
"learning_rate": 8.915526081358649e-07,
"loss": 0.0,
"num_input_tokens_seen": 6415072,
"step": 10360
},
{
"epoch": 18.475935828877006,
"grad_norm": 4.983634426025674e-06,
"learning_rate": 8.812889195775942e-07,
"loss": 0.0,
"num_input_tokens_seen": 6417952,
"step": 10365
},
{
"epoch": 18.484848484848484,
"grad_norm": 7.026436833257321e-06,
"learning_rate": 8.710835920601818e-07,
"loss": 0.0,
"num_input_tokens_seen": 6420960,
"step": 10370
},
{
"epoch": 18.493761140819963,
"grad_norm": 5.714975486625917e-06,
"learning_rate": 8.609366502778854e-07,
"loss": 0.0,
"num_input_tokens_seen": 6424352,
"step": 10375
},
{
"epoch": 18.502673796791445,
"grad_norm": 2.397276330157183e-05,
"learning_rate": 8.508481187836759e-07,
"loss": 0.0,
"num_input_tokens_seen": 6427776,
"step": 10380
},
{
"epoch": 18.511586452762923,
"grad_norm": 5.075999524706276e-06,
"learning_rate": 8.408180219891897e-07,
"loss": 0.0,
"num_input_tokens_seen": 6431296,
"step": 10385
},
{
"epoch": 18.520499108734402,
"grad_norm": 5.217380930844229e-06,
"learning_rate": 8.308463841646713e-07,
"loss": 0.0,
"num_input_tokens_seen": 6434464,
"step": 10390
},
{
"epoch": 18.529411764705884,
"grad_norm": 4.922795142192626e-06,
"learning_rate": 8.209332294388972e-07,
"loss": 0.0,
"num_input_tokens_seen": 6438016,
"step": 10395
},
{
"epoch": 18.538324420677363,
"grad_norm": 4.330814590502996e-06,
"learning_rate": 8.110785817991379e-07,
"loss": 0.0,
"num_input_tokens_seen": 6440608,
"step": 10400
},
{
"epoch": 18.54723707664884,
"grad_norm": 7.737668056506664e-06,
"learning_rate": 8.012824650910938e-07,
"loss": 0.0,
"num_input_tokens_seen": 6443552,
"step": 10405
},
{
"epoch": 18.55614973262032,
"grad_norm": 5.734651949751424e-06,
"learning_rate": 7.915449030188316e-07,
"loss": 0.0,
"num_input_tokens_seen": 6445952,
"step": 10410
},
{
"epoch": 18.5650623885918,
"grad_norm": 6.203586963238195e-06,
"learning_rate": 7.818659191447363e-07,
"loss": 0.0,
"num_input_tokens_seen": 6449536,
"step": 10415
},
{
"epoch": 18.57397504456328,
"grad_norm": 3.782967723964248e-06,
"learning_rate": 7.722455368894376e-07,
"loss": 0.0,
"num_input_tokens_seen": 6452960,
"step": 10420
},
{
"epoch": 18.58288770053476,
"grad_norm": 3.587515038816491e-06,
"learning_rate": 7.626837795317781e-07,
"loss": 0.0,
"num_input_tokens_seen": 6456384,
"step": 10425
},
{
"epoch": 18.59180035650624,
"grad_norm": 4.201141564408317e-06,
"learning_rate": 7.531806702087307e-07,
"loss": 0.0,
"num_input_tokens_seen": 6458944,
"step": 10430
},
{
"epoch": 18.60071301247772,
"grad_norm": 4.2174779082415625e-06,
"learning_rate": 7.437362319153651e-07,
"loss": 0.0,
"num_input_tokens_seen": 6461920,
"step": 10435
},
{
"epoch": 18.609625668449198,
"grad_norm": 4.0159065974876285e-06,
"learning_rate": 7.343504875047813e-07,
"loss": 0.0,
"num_input_tokens_seen": 6465536,
"step": 10440
},
{
"epoch": 18.618538324420676,
"grad_norm": 4.9373779802408535e-06,
"learning_rate": 7.250234596880456e-07,
"loss": 0.0,
"num_input_tokens_seen": 6469152,
"step": 10445
},
{
"epoch": 18.627450980392158,
"grad_norm": 4.6428826863120776e-06,
"learning_rate": 7.157551710341576e-07,
"loss": 0.0,
"num_input_tokens_seen": 6472896,
"step": 10450
},
{
"epoch": 18.636363636363637,
"grad_norm": 5.256363238004269e-06,
"learning_rate": 7.065456439699775e-07,
"loss": 0.0,
"num_input_tokens_seen": 6475136,
"step": 10455
},
{
"epoch": 18.645276292335115,
"grad_norm": 4.301908120396547e-06,
"learning_rate": 6.973949007801711e-07,
"loss": 0.0,
"num_input_tokens_seen": 6478368,
"step": 10460
},
{
"epoch": 18.654188948306594,
"grad_norm": 4.6390055103984196e-06,
"learning_rate": 6.883029636071819e-07,
"loss": 0.0,
"num_input_tokens_seen": 6481216,
"step": 10465
},
{
"epoch": 18.663101604278076,
"grad_norm": 1.0161958925891668e-05,
"learning_rate": 6.792698544511366e-07,
"loss": 0.0,
"num_input_tokens_seen": 6484416,
"step": 10470
},
{
"epoch": 18.672014260249554,
"grad_norm": 4.047020865982631e-06,
"learning_rate": 6.702955951698231e-07,
"loss": 0.0,
"num_input_tokens_seen": 6487360,
"step": 10475
},
{
"epoch": 18.680926916221033,
"grad_norm": 3.812608383668703e-06,
"learning_rate": 6.613802074786319e-07,
"loss": 0.0,
"num_input_tokens_seen": 6490208,
"step": 10480
},
{
"epoch": 18.689839572192515,
"grad_norm": 3.885810201609274e-06,
"learning_rate": 6.525237129504868e-07,
"loss": 0.0,
"num_input_tokens_seen": 6493376,
"step": 10485
},
{
"epoch": 18.698752228163993,
"grad_norm": 7.146949883463094e-06,
"learning_rate": 6.437261330158207e-07,
"loss": 0.0,
"num_input_tokens_seen": 6496064,
"step": 10490
},
{
"epoch": 18.707664884135472,
"grad_norm": 1.5320661987061612e-05,
"learning_rate": 6.349874889624962e-07,
"loss": 0.0,
"num_input_tokens_seen": 6499616,
"step": 10495
},
{
"epoch": 18.71657754010695,
"grad_norm": 4.966541837347904e-06,
"learning_rate": 6.263078019357716e-07,
"loss": 0.0,
"num_input_tokens_seen": 6502656,
"step": 10500
},
{
"epoch": 18.725490196078432,
"grad_norm": 8.445246749033686e-06,
"learning_rate": 6.176870929382489e-07,
"loss": 0.0,
"num_input_tokens_seen": 6505344,
"step": 10505
},
{
"epoch": 18.73440285204991,
"grad_norm": 3.86704778065905e-05,
"learning_rate": 6.091253828298088e-07,
"loss": 0.0,
"num_input_tokens_seen": 6508160,
"step": 10510
},
{
"epoch": 18.74331550802139,
"grad_norm": 3.769595195990405e-06,
"learning_rate": 6.006226923275738e-07,
"loss": 0.0,
"num_input_tokens_seen": 6511616,
"step": 10515
},
{
"epoch": 18.75222816399287,
"grad_norm": 8.503994467901066e-05,
"learning_rate": 5.921790420058582e-07,
"loss": 0.0,
"num_input_tokens_seen": 6514240,
"step": 10520
},
{
"epoch": 18.76114081996435,
"grad_norm": 4.197690032015089e-06,
"learning_rate": 5.837944522961075e-07,
"loss": 0.0,
"num_input_tokens_seen": 6517056,
"step": 10525
},
{
"epoch": 18.77005347593583,
"grad_norm": 3.576196377252927e-06,
"learning_rate": 5.754689434868677e-07,
"loss": 0.0,
"num_input_tokens_seen": 6520032,
"step": 10530
},
{
"epoch": 18.778966131907307,
"grad_norm": 7.368656497419579e-06,
"learning_rate": 5.672025357237071e-07,
"loss": 0.0,
"num_input_tokens_seen": 6522784,
"step": 10535
},
{
"epoch": 18.78787878787879,
"grad_norm": 3.6832218484050827e-06,
"learning_rate": 5.589952490091948e-07,
"loss": 0.0,
"num_input_tokens_seen": 6525632,
"step": 10540
},
{
"epoch": 18.796791443850267,
"grad_norm": 4.035174697492039e-06,
"learning_rate": 5.508471032028478e-07,
"loss": 0.0,
"num_input_tokens_seen": 6528832,
"step": 10545
},
{
"epoch": 18.805704099821746,
"grad_norm": 5.856224561284762e-06,
"learning_rate": 5.427581180210639e-07,
"loss": 0.0,
"num_input_tokens_seen": 6532416,
"step": 10550
},
{
"epoch": 18.814616755793228,
"grad_norm": 6.858808774268255e-05,
"learning_rate": 5.347283130371e-07,
"loss": 0.0,
"num_input_tokens_seen": 6535136,
"step": 10555
},
{
"epoch": 18.823529411764707,
"grad_norm": 4.181023086857749e-06,
"learning_rate": 5.267577076810026e-07,
"loss": 0.0,
"num_input_tokens_seen": 6538528,
"step": 10560
},
{
"epoch": 18.832442067736185,
"grad_norm": 4.6028135329834186e-06,
"learning_rate": 5.188463212395744e-07,
"loss": 0.0,
"num_input_tokens_seen": 6540992,
"step": 10565
},
{
"epoch": 18.841354723707664,
"grad_norm": 6.4640212258382235e-06,
"learning_rate": 5.1099417285633e-07,
"loss": 0.0,
"num_input_tokens_seen": 6543744,
"step": 10570
},
{
"epoch": 18.850267379679146,
"grad_norm": 4.091177288501058e-06,
"learning_rate": 5.032012815314291e-07,
"loss": 0.0,
"num_input_tokens_seen": 6546336,
"step": 10575
},
{
"epoch": 18.859180035650624,
"grad_norm": 2.7967815185547806e-05,
"learning_rate": 4.954676661216546e-07,
"loss": 0.0,
"num_input_tokens_seen": 6548864,
"step": 10580
},
{
"epoch": 18.868092691622103,
"grad_norm": 9.529613453196362e-05,
"learning_rate": 4.877933453403593e-07,
"loss": 0.0,
"num_input_tokens_seen": 6551808,
"step": 10585
},
{
"epoch": 18.87700534759358,
"grad_norm": 3.412982914596796e-05,
"learning_rate": 4.801783377574088e-07,
"loss": 0.0,
"num_input_tokens_seen": 6554496,
"step": 10590
},
{
"epoch": 18.885918003565063,
"grad_norm": 4.093836196261691e-06,
"learning_rate": 4.726226617991547e-07,
"loss": 0.0,
"num_input_tokens_seen": 6557312,
"step": 10595
},
{
"epoch": 18.89483065953654,
"grad_norm": 3.870322416332783e-06,
"learning_rate": 4.651263357483754e-07,
"loss": 0.0,
"num_input_tokens_seen": 6560768,
"step": 10600
},
{
"epoch": 18.90374331550802,
"grad_norm": 4.343833097664174e-06,
"learning_rate": 4.5768937774424146e-07,
"loss": 0.0,
"num_input_tokens_seen": 6563712,
"step": 10605
},
{
"epoch": 18.912655971479502,
"grad_norm": 4.011643795820419e-06,
"learning_rate": 4.5031180578226637e-07,
"loss": 0.0,
"num_input_tokens_seen": 6567328,
"step": 10610
},
{
"epoch": 18.92156862745098,
"grad_norm": 4.949677077092929e-06,
"learning_rate": 4.4299363771427015e-07,
"loss": 0.0,
"num_input_tokens_seen": 6570976,
"step": 10615
},
{
"epoch": 18.93048128342246,
"grad_norm": 7.535657459811773e-06,
"learning_rate": 4.357348912483211e-07,
"loss": 0.0,
"num_input_tokens_seen": 6574656,
"step": 10620
},
{
"epoch": 18.939393939393938,
"grad_norm": 6.099482106947107e-06,
"learning_rate": 4.2853558394871096e-07,
"loss": 0.0,
"num_input_tokens_seen": 6578048,
"step": 10625
},
{
"epoch": 18.94830659536542,
"grad_norm": 8.545129276171792e-06,
"learning_rate": 4.2139573323589643e-07,
"loss": 0.0,
"num_input_tokens_seen": 6580576,
"step": 10630
},
{
"epoch": 18.9572192513369,
"grad_norm": 3.974701485276455e-06,
"learning_rate": 4.1431535638647436e-07,
"loss": 0.0,
"num_input_tokens_seen": 6584416,
"step": 10635
},
{
"epoch": 18.966131907308377,
"grad_norm": 5.085218617750797e-06,
"learning_rate": 4.072944705331178e-07,
"loss": 0.0,
"num_input_tokens_seen": 6587488,
"step": 10640
},
{
"epoch": 18.97504456327986,
"grad_norm": 8.860148227540776e-06,
"learning_rate": 4.003330926645649e-07,
"loss": 0.0,
"num_input_tokens_seen": 6590208,
"step": 10645
},
{
"epoch": 18.983957219251337,
"grad_norm": 4.768772669194732e-06,
"learning_rate": 3.9343123962553853e-07,
"loss": 0.0,
"num_input_tokens_seen": 6593568,
"step": 10650
},
{
"epoch": 18.992869875222816,
"grad_norm": 4.617687864083564e-06,
"learning_rate": 3.865889281167406e-07,
"loss": 0.0,
"num_input_tokens_seen": 6596896,
"step": 10655
},
{
"epoch": 19.0,
"eval_loss": 0.27009719610214233,
"eval_runtime": 4.5878,
"eval_samples_per_second": 54.274,
"eval_steps_per_second": 13.732,
"num_input_tokens_seen": 6598768,
"step": 10659
},
{
"epoch": 19.001782531194294,
"grad_norm": 3.65996152140724e-06,
"learning_rate": 3.7980617469479953e-07,
"loss": 0.0,
"num_input_tokens_seen": 6599440,
"step": 10660
},
{
"epoch": 19.010695187165776,
"grad_norm": 3.970195393776521e-06,
"learning_rate": 3.730829957722171e-07,
"loss": 0.0,
"num_input_tokens_seen": 6602032,
"step": 10665
},
{
"epoch": 19.019607843137255,
"grad_norm": 4.120309768040897e-06,
"learning_rate": 3.6641940761735217e-07,
"loss": 0.0,
"num_input_tokens_seen": 6605616,
"step": 10670
},
{
"epoch": 19.028520499108733,
"grad_norm": 6.569275683432352e-06,
"learning_rate": 3.598154263543596e-07,
"loss": 0.0,
"num_input_tokens_seen": 6608720,
"step": 10675
},
{
"epoch": 19.037433155080215,
"grad_norm": 5.234503532847157e-06,
"learning_rate": 3.532710679631679e-07,
"loss": 0.0,
"num_input_tokens_seen": 6611664,
"step": 10680
},
{
"epoch": 19.046345811051694,
"grad_norm": 4.1348962440679315e-06,
"learning_rate": 3.467863482794348e-07,
"loss": 0.0,
"num_input_tokens_seen": 6614928,
"step": 10685
},
{
"epoch": 19.055258467023172,
"grad_norm": 4.554530278255697e-06,
"learning_rate": 3.4036128299449466e-07,
"loss": 0.0,
"num_input_tokens_seen": 6617328,
"step": 10690
},
{
"epoch": 19.06417112299465,
"grad_norm": 3.7013876408309443e-06,
"learning_rate": 3.3399588765535284e-07,
"loss": 0.0,
"num_input_tokens_seen": 6619984,
"step": 10695
},
{
"epoch": 19.073083778966133,
"grad_norm": 3.5750003917200956e-06,
"learning_rate": 3.276901776646135e-07,
"loss": 0.0,
"num_input_tokens_seen": 6623408,
"step": 10700
},
{
"epoch": 19.08199643493761,
"grad_norm": 3.6158824059384642e-06,
"learning_rate": 3.2144416828046307e-07,
"loss": 0.0,
"num_input_tokens_seen": 6626192,
"step": 10705
},
{
"epoch": 19.09090909090909,
"grad_norm": 5.441886969492771e-05,
"learning_rate": 3.1525787461663405e-07,
"loss": 0.0,
"num_input_tokens_seen": 6630096,
"step": 10710
},
{
"epoch": 19.099821746880572,
"grad_norm": 3.259347522543976e-06,
"learning_rate": 3.091313116423522e-07,
"loss": 0.0,
"num_input_tokens_seen": 6633360,
"step": 10715
},
{
"epoch": 19.10873440285205,
"grad_norm": 6.232783562154509e-06,
"learning_rate": 3.0306449418231464e-07,
"loss": 0.0,
"num_input_tokens_seen": 6636656,
"step": 10720
},
{
"epoch": 19.11764705882353,
"grad_norm": 4.463233381102327e-06,
"learning_rate": 2.9705743691665345e-07,
"loss": 0.0,
"num_input_tokens_seen": 6639184,
"step": 10725
},
{
"epoch": 19.126559714795007,
"grad_norm": 4.211536725051701e-05,
"learning_rate": 2.9111015438088583e-07,
"loss": 0.0,
"num_input_tokens_seen": 6642256,
"step": 10730
},
{
"epoch": 19.13547237076649,
"grad_norm": 4.517605702858418e-06,
"learning_rate": 2.852226609659059e-07,
"loss": 0.0,
"num_input_tokens_seen": 6645680,
"step": 10735
},
{
"epoch": 19.144385026737968,
"grad_norm": 3.99427972297417e-06,
"learning_rate": 2.793949709179178e-07,
"loss": 0.0,
"num_input_tokens_seen": 6648784,
"step": 10740
},
{
"epoch": 19.153297682709447,
"grad_norm": 4.534751042228891e-06,
"learning_rate": 2.7362709833842757e-07,
"loss": 0.0,
"num_input_tokens_seen": 6651728,
"step": 10745
},
{
"epoch": 19.16221033868093,
"grad_norm": 4.9152927203977015e-06,
"learning_rate": 2.679190571841933e-07,
"loss": 0.0,
"num_input_tokens_seen": 6654864,
"step": 10750
},
{
"epoch": 19.171122994652407,
"grad_norm": 4.616060323314741e-06,
"learning_rate": 2.62270861267197e-07,
"loss": 0.0,
"num_input_tokens_seen": 6657776,
"step": 10755
},
{
"epoch": 19.180035650623886,
"grad_norm": 8.252305269706994e-05,
"learning_rate": 2.566825242546117e-07,
"loss": 0.0,
"num_input_tokens_seen": 6661264,
"step": 10760
},
{
"epoch": 19.188948306595364,
"grad_norm": 4.335048288339749e-06,
"learning_rate": 2.511540596687678e-07,
"loss": 0.0,
"num_input_tokens_seen": 6665584,
"step": 10765
},
{
"epoch": 19.197860962566846,
"grad_norm": 7.008193279034458e-06,
"learning_rate": 2.456854808871201e-07,
"loss": 0.0,
"num_input_tokens_seen": 6669200,
"step": 10770
},
{
"epoch": 19.206773618538325,
"grad_norm": 4.312935288908193e-06,
"learning_rate": 2.4027680114221405e-07,
"loss": 0.0,
"num_input_tokens_seen": 6672624,
"step": 10775
},
{
"epoch": 19.215686274509803,
"grad_norm": 4.447610626812093e-05,
"learning_rate": 2.3492803352165303e-07,
"loss": 0.0,
"num_input_tokens_seen": 6675632,
"step": 10780
},
{
"epoch": 19.224598930481285,
"grad_norm": 3.66940571439045e-06,
"learning_rate": 2.2963919096807285e-07,
"loss": 0.0,
"num_input_tokens_seen": 6678352,
"step": 10785
},
{
"epoch": 19.233511586452764,
"grad_norm": 1.1107338650617748e-05,
"learning_rate": 2.244102862791031e-07,
"loss": 0.0,
"num_input_tokens_seen": 6681296,
"step": 10790
},
{
"epoch": 19.242424242424242,
"grad_norm": 4.122299287701026e-06,
"learning_rate": 2.1924133210734222e-07,
"loss": 0.0,
"num_input_tokens_seen": 6685200,
"step": 10795
},
{
"epoch": 19.25133689839572,
"grad_norm": 4.148144398641307e-06,
"learning_rate": 2.141323409603241e-07,
"loss": 0.0,
"num_input_tokens_seen": 6688304,
"step": 10800
},
{
"epoch": 19.260249554367203,
"grad_norm": 5.2304230848676525e-06,
"learning_rate": 2.0908332520047645e-07,
"loss": 0.0,
"num_input_tokens_seen": 6691024,
"step": 10805
},
{
"epoch": 19.26916221033868,
"grad_norm": 4.854452981817303e-06,
"learning_rate": 2.0409429704512096e-07,
"loss": 0.0,
"num_input_tokens_seen": 6693936,
"step": 10810
},
{
"epoch": 19.27807486631016,
"grad_norm": 1.8323980839340948e-05,
"learning_rate": 1.9916526856641193e-07,
"loss": 0.0,
"num_input_tokens_seen": 6697520,
"step": 10815
},
{
"epoch": 19.28698752228164,
"grad_norm": 5.771956693934044e-06,
"learning_rate": 1.9429625169131716e-07,
"loss": 0.0,
"num_input_tokens_seen": 6700880,
"step": 10820
},
{
"epoch": 19.29590017825312,
"grad_norm": 1.2314047125983052e-05,
"learning_rate": 1.8948725820160662e-07,
"loss": 0.0,
"num_input_tokens_seen": 6704432,
"step": 10825
},
{
"epoch": 19.3048128342246,
"grad_norm": 4.320734660723247e-06,
"learning_rate": 1.847382997337943e-07,
"loss": 0.0,
"num_input_tokens_seen": 6707568,
"step": 10830
},
{
"epoch": 19.313725490196077,
"grad_norm": 4.215276476315921e-06,
"learning_rate": 1.8004938777913537e-07,
"loss": 0.0,
"num_input_tokens_seen": 6710736,
"step": 10835
},
{
"epoch": 19.32263814616756,
"grad_norm": 6.504408247565152e-06,
"learning_rate": 1.754205336835818e-07,
"loss": 0.0,
"num_input_tokens_seen": 6714224,
"step": 10840
},
{
"epoch": 19.331550802139038,
"grad_norm": 1.6524629245395772e-05,
"learning_rate": 1.7085174864776287e-07,
"loss": 0.0,
"num_input_tokens_seen": 6717680,
"step": 10845
},
{
"epoch": 19.340463458110516,
"grad_norm": 3.6317417198006297e-06,
"learning_rate": 1.6634304372695474e-07,
"loss": 0.0,
"num_input_tokens_seen": 6720304,
"step": 10850
},
{
"epoch": 19.349376114081995,
"grad_norm": 1.5814599464647472e-05,
"learning_rate": 1.6189442983105817e-07,
"loss": 0.0,
"num_input_tokens_seen": 6723728,
"step": 10855
},
{
"epoch": 19.358288770053477,
"grad_norm": 5.51480798094417e-06,
"learning_rate": 1.5750591772456802e-07,
"loss": 0.0,
"num_input_tokens_seen": 6726320,
"step": 10860
},
{
"epoch": 19.367201426024955,
"grad_norm": 9.519346349406987e-06,
"learning_rate": 1.5317751802654823e-07,
"loss": 0.0,
"num_input_tokens_seen": 6729680,
"step": 10865
},
{
"epoch": 19.376114081996434,
"grad_norm": 5.400910595199093e-06,
"learning_rate": 1.489092412106069e-07,
"loss": 0.0,
"num_input_tokens_seen": 6732080,
"step": 10870
},
{
"epoch": 19.385026737967916,
"grad_norm": 8.067914677667432e-06,
"learning_rate": 1.447010976048685e-07,
"loss": 0.0,
"num_input_tokens_seen": 6734768,
"step": 10875
},
{
"epoch": 19.393939393939394,
"grad_norm": 6.035778824298177e-06,
"learning_rate": 1.4055309739195167e-07,
"loss": 0.0,
"num_input_tokens_seen": 6737168,
"step": 10880
},
{
"epoch": 19.402852049910873,
"grad_norm": 4.06550316256471e-06,
"learning_rate": 1.3646525060894422e-07,
"loss": 0.0,
"num_input_tokens_seen": 6739824,
"step": 10885
},
{
"epoch": 19.41176470588235,
"grad_norm": 4.2528931771812495e-06,
"learning_rate": 1.324375671473782e-07,
"loss": 0.0,
"num_input_tokens_seen": 6742832,
"step": 10890
},
{
"epoch": 19.420677361853834,
"grad_norm": 3.7699369386245962e-06,
"learning_rate": 1.2847005675320767e-07,
"loss": 0.0,
"num_input_tokens_seen": 6746192,
"step": 10895
},
{
"epoch": 19.429590017825312,
"grad_norm": 1.7903794287121855e-05,
"learning_rate": 1.2456272902677534e-07,
"loss": 0.0,
"num_input_tokens_seen": 6750064,
"step": 10900
},
{
"epoch": 19.43850267379679,
"grad_norm": 3.696606427183724e-06,
"learning_rate": 1.207155934228099e-07,
"loss": 0.0,
"num_input_tokens_seen": 6753968,
"step": 10905
},
{
"epoch": 19.447415329768273,
"grad_norm": 4.29191641160287e-06,
"learning_rate": 1.16928659250376e-07,
"loss": 0.0,
"num_input_tokens_seen": 6757200,
"step": 10910
},
{
"epoch": 19.45632798573975,
"grad_norm": 9.008747292682528e-05,
"learning_rate": 1.1320193567288529e-07,
"loss": 0.0,
"num_input_tokens_seen": 6760784,
"step": 10915
},
{
"epoch": 19.46524064171123,
"grad_norm": 7.780551095493138e-05,
"learning_rate": 1.0953543170803826e-07,
"loss": 0.0,
"num_input_tokens_seen": 6763760,
"step": 10920
},
{
"epoch": 19.474153297682708,
"grad_norm": 6.355798177537508e-06,
"learning_rate": 1.0592915622782418e-07,
"loss": 0.0,
"num_input_tokens_seen": 6766768,
"step": 10925
},
{
"epoch": 19.48306595365419,
"grad_norm": 8.294658073282335e-06,
"learning_rate": 1.0238311795850163e-07,
"loss": 0.0,
"num_input_tokens_seen": 6769616,
"step": 10930
},
{
"epoch": 19.49197860962567,
"grad_norm": 4.181761596555589e-06,
"learning_rate": 9.889732548056252e-08,
"loss": 0.0,
"num_input_tokens_seen": 6773968,
"step": 10935
},
{
"epoch": 19.500891265597147,
"grad_norm": 4.156318937020842e-06,
"learning_rate": 9.547178722872364e-08,
"loss": 0.0,
"num_input_tokens_seen": 6776752,
"step": 10940
},
{
"epoch": 19.509803921568626,
"grad_norm": 4.000894023192814e-06,
"learning_rate": 9.210651149190175e-08,
"loss": 0.0,
"num_input_tokens_seen": 6779632,
"step": 10945
},
{
"epoch": 19.518716577540108,
"grad_norm": 4.35921583630261e-06,
"learning_rate": 8.880150641319418e-08,
"loss": 0.0,
"num_input_tokens_seen": 6782800,
"step": 10950
},
{
"epoch": 19.527629233511586,
"grad_norm": 4.094060841453029e-06,
"learning_rate": 8.555677998985657e-08,
"loss": 0.0,
"num_input_tokens_seen": 6785648,
"step": 10955
},
{
"epoch": 19.536541889483065,
"grad_norm": 1.1936950613744557e-05,
"learning_rate": 8.23723400732862e-08,
"loss": 0.0,
"num_input_tokens_seen": 6788944,
"step": 10960
},
{
"epoch": 19.545454545454547,
"grad_norm": 5.246789442026056e-06,
"learning_rate": 7.924819436900821e-08,
"loss": 0.0,
"num_input_tokens_seen": 6792048,
"step": 10965
},
{
"epoch": 19.554367201426025,
"grad_norm": 3.7531422094616573e-06,
"learning_rate": 7.618435043664218e-08,
"loss": 0.0,
"num_input_tokens_seen": 6794896,
"step": 10970
},
{
"epoch": 19.563279857397504,
"grad_norm": 7.082822321535787e-06,
"learning_rate": 7.318081568990221e-08,
"loss": 0.0,
"num_input_tokens_seen": 6797936,
"step": 10975
},
{
"epoch": 19.572192513368982,
"grad_norm": 7.775455742375925e-06,
"learning_rate": 7.023759739656078e-08,
"loss": 0.0,
"num_input_tokens_seen": 6801328,
"step": 10980
},
{
"epoch": 19.581105169340464,
"grad_norm": 1.4494855349767022e-05,
"learning_rate": 6.735470267844879e-08,
"loss": 0.0,
"num_input_tokens_seen": 6805328,
"step": 10985
},
{
"epoch": 19.590017825311943,
"grad_norm": 0.0002640245365910232,
"learning_rate": 6.453213851142226e-08,
"loss": 0.0,
"num_input_tokens_seen": 6808176,
"step": 10990
},
{
"epoch": 19.59893048128342,
"grad_norm": 4.6502209443133324e-06,
"learning_rate": 6.176991172535673e-08,
"loss": 0.0,
"num_input_tokens_seen": 6811632,
"step": 10995
},
{
"epoch": 19.607843137254903,
"grad_norm": 2.5176357667078264e-05,
"learning_rate": 5.906802900412789e-08,
"loss": 0.0,
"num_input_tokens_seen": 6814864,
"step": 11000
},
{
"epoch": 19.616755793226382,
"grad_norm": 5.3292624215828255e-06,
"learning_rate": 5.642649688559487e-08,
"loss": 0.0,
"num_input_tokens_seen": 6817936,
"step": 11005
},
{
"epoch": 19.62566844919786,
"grad_norm": 6.279942317632958e-05,
"learning_rate": 5.384532176157808e-08,
"loss": 0.0,
"num_input_tokens_seen": 6820976,
"step": 11010
},
{
"epoch": 19.63458110516934,
"grad_norm": 3.641805233201012e-05,
"learning_rate": 5.132450987785364e-08,
"loss": 0.0,
"num_input_tokens_seen": 6824368,
"step": 11015
},
{
"epoch": 19.64349376114082,
"grad_norm": 4.100953447050415e-06,
"learning_rate": 4.8864067334136735e-08,
"loss": 0.0,
"num_input_tokens_seen": 6828240,
"step": 11020
},
{
"epoch": 19.6524064171123,
"grad_norm": 4.846000138059026e-06,
"learning_rate": 4.6464000084059376e-08,
"loss": 0.0,
"num_input_tokens_seen": 6831376,
"step": 11025
},
{
"epoch": 19.661319073083778,
"grad_norm": 3.7461518331838306e-06,
"learning_rate": 4.412431393516492e-08,
"loss": 0.0,
"num_input_tokens_seen": 6834064,
"step": 11030
},
{
"epoch": 19.67023172905526,
"grad_norm": 6.311793185886927e-06,
"learning_rate": 4.184501454888856e-08,
"loss": 0.0,
"num_input_tokens_seen": 6837136,
"step": 11035
},
{
"epoch": 19.67914438502674,
"grad_norm": 0.00011546228779479861,
"learning_rate": 3.9626107440543515e-08,
"loss": 0.0,
"num_input_tokens_seen": 6840048,
"step": 11040
},
{
"epoch": 19.688057040998217,
"grad_norm": 1.7715008652885444e-05,
"learning_rate": 3.746759797931265e-08,
"loss": 0.0,
"num_input_tokens_seen": 6843568,
"step": 11045
},
{
"epoch": 19.696969696969695,
"grad_norm": 4.317462298786268e-06,
"learning_rate": 3.536949138822909e-08,
"loss": 0.0,
"num_input_tokens_seen": 6847312,
"step": 11050
},
{
"epoch": 19.705882352941178,
"grad_norm": 4.139226803090423e-06,
"learning_rate": 3.333179274417064e-08,
"loss": 0.0,
"num_input_tokens_seen": 6849904,
"step": 11055
},
{
"epoch": 19.714795008912656,
"grad_norm": 8.816688932711259e-06,
"learning_rate": 3.135450697783482e-08,
"loss": 0.0,
"num_input_tokens_seen": 6853104,
"step": 11060
},
{
"epoch": 19.723707664884135,
"grad_norm": 6.18420881437487e-06,
"learning_rate": 2.943763887374995e-08,
"loss": 0.0,
"num_input_tokens_seen": 6856208,
"step": 11065
},
{
"epoch": 19.732620320855617,
"grad_norm": 4.002175501227612e-06,
"learning_rate": 2.7581193070233546e-08,
"loss": 0.0,
"num_input_tokens_seen": 6859440,
"step": 11070
},
{
"epoch": 19.741532976827095,
"grad_norm": 6.689347355859354e-05,
"learning_rate": 2.5785174059408947e-08,
"loss": 0.0,
"num_input_tokens_seen": 6862096,
"step": 11075
},
{
"epoch": 19.750445632798574,
"grad_norm": 7.351686235779198e-06,
"learning_rate": 2.4049586187174787e-08,
"loss": 0.0,
"num_input_tokens_seen": 6864848,
"step": 11080
},
{
"epoch": 19.759358288770052,
"grad_norm": 8.2574997577467e-06,
"learning_rate": 2.237443365320502e-08,
"loss": 0.0,
"num_input_tokens_seen": 6868336,
"step": 11085
},
{
"epoch": 19.768270944741534,
"grad_norm": 5.573724592977669e-06,
"learning_rate": 2.0759720510937773e-08,
"loss": 0.0,
"num_input_tokens_seen": 6870896,
"step": 11090
},
{
"epoch": 19.777183600713013,
"grad_norm": 3.6006058508064598e-06,
"learning_rate": 1.9205450667558743e-08,
"loss": 0.0,
"num_input_tokens_seen": 6873936,
"step": 11095
},
{
"epoch": 19.78609625668449,
"grad_norm": 1.860854354163166e-05,
"learning_rate": 1.7711627883998382e-08,
"loss": 0.0,
"num_input_tokens_seen": 6876784,
"step": 11100
},
{
"epoch": 19.795008912655973,
"grad_norm": 8.149945642799139e-06,
"learning_rate": 1.627825577492359e-08,
"loss": 0.0,
"num_input_tokens_seen": 6879568,
"step": 11105
},
{
"epoch": 19.80392156862745,
"grad_norm": 4.631604952010093e-06,
"learning_rate": 1.4905337808721053e-08,
"loss": 0.0,
"num_input_tokens_seen": 6882256,
"step": 11110
},
{
"epoch": 19.81283422459893,
"grad_norm": 7.1401186687580775e-06,
"learning_rate": 1.3592877307500029e-08,
"loss": 0.0,
"num_input_tokens_seen": 6885168,
"step": 11115
},
{
"epoch": 19.82174688057041,
"grad_norm": 4.165116024523741e-06,
"learning_rate": 1.2340877447072907e-08,
"loss": 0.0,
"num_input_tokens_seen": 6887856,
"step": 11120
},
{
"epoch": 19.83065953654189,
"grad_norm": 1.1358249139448162e-05,
"learning_rate": 1.114934125695799e-08,
"loss": 0.0,
"num_input_tokens_seen": 6891024,
"step": 11125
},
{
"epoch": 19.83957219251337,
"grad_norm": 5.638150923914509e-06,
"learning_rate": 1.001827162036284e-08,
"loss": 0.0,
"num_input_tokens_seen": 6893904,
"step": 11130
},
{
"epoch": 19.848484848484848,
"grad_norm": 4.670748694479698e-06,
"learning_rate": 8.947671274184277e-09,
"loss": 0.0,
"num_input_tokens_seen": 6896336,
"step": 11135
},
{
"epoch": 19.85739750445633,
"grad_norm": 3.5210650821682066e-05,
"learning_rate": 7.937542808997278e-09,
"loss": 0.0,
"num_input_tokens_seen": 6899120,
"step": 11140
},
{
"epoch": 19.86631016042781,
"grad_norm": 7.780026862747036e-06,
"learning_rate": 6.987888669052201e-09,
"loss": 0.0,
"num_input_tokens_seen": 6901808,
"step": 11145
},
{
"epoch": 19.875222816399287,
"grad_norm": 4.677411652664887e-06,
"learning_rate": 6.098711152266456e-09,
"loss": 0.0,
"num_input_tokens_seen": 6905200,
"step": 11150
},
{
"epoch": 19.884135472370765,
"grad_norm": 7.62782474339474e-06,
"learning_rate": 5.270012410216185e-09,
"loss": 0.0,
"num_input_tokens_seen": 6908272,
"step": 11155
},
{
"epoch": 19.893048128342247,
"grad_norm": 4.890515356237302e-06,
"learning_rate": 4.50179444814458e-09,
"loss": 0.0,
"num_input_tokens_seen": 6911760,
"step": 11160
},
{
"epoch": 19.901960784313726,
"grad_norm": 2.7669217161019333e-05,
"learning_rate": 3.794059124934135e-09,
"loss": 0.0,
"num_input_tokens_seen": 6915568,
"step": 11165
},
{
"epoch": 19.910873440285204,
"grad_norm": 1.4286442819866352e-05,
"learning_rate": 3.146808153123293e-09,
"loss": 0.0,
"num_input_tokens_seen": 6918832,
"step": 11170
},
{
"epoch": 19.919786096256683,
"grad_norm": 6.247079727472737e-05,
"learning_rate": 2.560043098895348e-09,
"loss": 0.0,
"num_input_tokens_seen": 6922288,
"step": 11175
},
{
"epoch": 19.928698752228165,
"grad_norm": 4.766968231706414e-06,
"learning_rate": 2.0337653820645673e-09,
"loss": 0.0,
"num_input_tokens_seen": 6925232,
"step": 11180
},
{
"epoch": 19.937611408199643,
"grad_norm": 4.658260877477005e-06,
"learning_rate": 1.5679762760900663e-09,
"loss": 0.0,
"num_input_tokens_seen": 6927792,
"step": 11185
},
{
"epoch": 19.946524064171122,
"grad_norm": 3.767223006434506e-06,
"learning_rate": 1.162676908059157e-09,
"loss": 0.0,
"num_input_tokens_seen": 6931024,
"step": 11190
},
{
"epoch": 19.955436720142604,
"grad_norm": 6.662194209638983e-06,
"learning_rate": 8.178682586928998e-10,
"loss": 0.0,
"num_input_tokens_seen": 6933616,
"step": 11195
},
{
"epoch": 19.964349376114082,
"grad_norm": 6.126639163994696e-06,
"learning_rate": 5.335511623377753e-10,
"loss": 0.0,
"num_input_tokens_seen": 6936592,
"step": 11200
},
{
"epoch": 19.97326203208556,
"grad_norm": 4.0635059121996164e-06,
"learning_rate": 3.0972630696846084e-10,
"loss": 0.0,
"num_input_tokens_seen": 6939024,
"step": 11205
},
{
"epoch": 19.98217468805704,
"grad_norm": 5.989404144202126e-06,
"learning_rate": 1.463942341850544e-10,
"loss": 0.0,
"num_input_tokens_seen": 6942032,
"step": 11210
},
{
"epoch": 19.99108734402852,
"grad_norm": 3.8322787077049725e-06,
"learning_rate": 4.35553392047483e-11,
"loss": 0.0,
"num_input_tokens_seen": 6944464,
"step": 11215
},
{
"epoch": 20.0,
"grad_norm": 3.420008397370111e-06,
"learning_rate": 1.2098708757068978e-12,
"loss": 0.0,
"num_input_tokens_seen": 6947288,
"step": 11220
},
{
"epoch": 20.0,
"eval_loss": 0.2706851363182068,
"eval_runtime": 4.5892,
"eval_samples_per_second": 54.258,
"eval_steps_per_second": 13.728,
"num_input_tokens_seen": 6947288,
"step": 11220
},
{
"epoch": 20.0,
"num_input_tokens_seen": 6947288,
"step": 11220,
"total_flos": 3.134451584829358e+17,
"train_loss": 0.01365312689491912,
"train_runtime": 2907.3969,
"train_samples_per_second": 15.416,
"train_steps_per_second": 3.859
}
],
"logging_steps": 5,
"max_steps": 11220,
"num_input_tokens_seen": 6947288,
"num_train_epochs": 20,
"save_steps": 561,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.134451584829358e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}