Qwen2.5-1.5B-Open-R1-Distill / trainer_state.json
zyl2023's picture
Model save
32111d0 verified
raw
history blame
447 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 10725,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002331002331002331,
"grad_norm": 2.2309207419995793,
"learning_rate": 4.655493482309125e-07,
"loss": 0.868,
"num_tokens": 1310720.0,
"step": 5
},
{
"epoch": 0.004662004662004662,
"grad_norm": 2.1118896401046383,
"learning_rate": 9.31098696461825e-07,
"loss": 0.8785,
"num_tokens": 2621440.0,
"step": 10
},
{
"epoch": 0.006993006993006993,
"grad_norm": 1.5940427530885184,
"learning_rate": 1.3966480446927373e-06,
"loss": 0.8483,
"num_tokens": 3932160.0,
"step": 15
},
{
"epoch": 0.009324009324009324,
"grad_norm": 1.2314768273947896,
"learning_rate": 1.86219739292365e-06,
"loss": 0.8081,
"num_tokens": 5242880.0,
"step": 20
},
{
"epoch": 0.011655011655011656,
"grad_norm": 1.1773776679892092,
"learning_rate": 2.3277467411545626e-06,
"loss": 0.81,
"num_tokens": 6553600.0,
"step": 25
},
{
"epoch": 0.013986013986013986,
"grad_norm": 0.956055714801497,
"learning_rate": 2.7932960893854746e-06,
"loss": 0.8035,
"num_tokens": 7864320.0,
"step": 30
},
{
"epoch": 0.016317016317016316,
"grad_norm": 0.7062278575164957,
"learning_rate": 3.2588454376163876e-06,
"loss": 0.7718,
"num_tokens": 9154672.0,
"step": 35
},
{
"epoch": 0.018648018648018648,
"grad_norm": 0.772524803320663,
"learning_rate": 3.7243947858473e-06,
"loss": 0.7217,
"num_tokens": 10465392.0,
"step": 40
},
{
"epoch": 0.02097902097902098,
"grad_norm": 0.5893728153785672,
"learning_rate": 4.189944134078212e-06,
"loss": 0.7274,
"num_tokens": 11776112.0,
"step": 45
},
{
"epoch": 0.023310023310023312,
"grad_norm": 0.5065797141230348,
"learning_rate": 4.655493482309125e-06,
"loss": 0.7234,
"num_tokens": 13086832.0,
"step": 50
},
{
"epoch": 0.02564102564102564,
"grad_norm": 0.48651817735023045,
"learning_rate": 5.121042830540038e-06,
"loss": 0.7267,
"num_tokens": 14397552.0,
"step": 55
},
{
"epoch": 0.027972027972027972,
"grad_norm": 0.4552730080037496,
"learning_rate": 5.586592178770949e-06,
"loss": 0.6927,
"num_tokens": 15708272.0,
"step": 60
},
{
"epoch": 0.030303030303030304,
"grad_norm": 0.46005771739470563,
"learning_rate": 6.052141527001862e-06,
"loss": 0.6771,
"num_tokens": 17018992.0,
"step": 65
},
{
"epoch": 0.03263403263403263,
"grad_norm": 0.4299236849439654,
"learning_rate": 6.517690875232775e-06,
"loss": 0.6663,
"num_tokens": 18329712.0,
"step": 70
},
{
"epoch": 0.03496503496503497,
"grad_norm": 0.4516492360855498,
"learning_rate": 6.983240223463687e-06,
"loss": 0.6653,
"num_tokens": 19640432.0,
"step": 75
},
{
"epoch": 0.037296037296037296,
"grad_norm": 0.4149060424751554,
"learning_rate": 7.4487895716946e-06,
"loss": 0.6742,
"num_tokens": 20951152.0,
"step": 80
},
{
"epoch": 0.039627039627039624,
"grad_norm": 0.4609610250520845,
"learning_rate": 7.914338919925513e-06,
"loss": 0.6415,
"num_tokens": 22261872.0,
"step": 85
},
{
"epoch": 0.04195804195804196,
"grad_norm": 0.43308223843066856,
"learning_rate": 8.379888268156424e-06,
"loss": 0.6748,
"num_tokens": 23572592.0,
"step": 90
},
{
"epoch": 0.04428904428904429,
"grad_norm": 0.4477929543500944,
"learning_rate": 8.845437616387337e-06,
"loss": 0.6767,
"num_tokens": 24883312.0,
"step": 95
},
{
"epoch": 0.046620046620046623,
"grad_norm": 0.43548039599877864,
"learning_rate": 9.31098696461825e-06,
"loss": 0.6415,
"num_tokens": 26194032.0,
"step": 100
},
{
"epoch": 0.04895104895104895,
"grad_norm": 0.49046398552976145,
"learning_rate": 9.776536312849161e-06,
"loss": 0.6372,
"num_tokens": 27504752.0,
"step": 105
},
{
"epoch": 0.05128205128205128,
"grad_norm": 0.4895199721021838,
"learning_rate": 1.0242085661080076e-05,
"loss": 0.6416,
"num_tokens": 28815472.0,
"step": 110
},
{
"epoch": 0.053613053613053616,
"grad_norm": 0.46201645981993494,
"learning_rate": 1.0707635009310987e-05,
"loss": 0.6493,
"num_tokens": 30126192.0,
"step": 115
},
{
"epoch": 0.055944055944055944,
"grad_norm": 0.46502125769455865,
"learning_rate": 1.1173184357541899e-05,
"loss": 0.6521,
"num_tokens": 31436912.0,
"step": 120
},
{
"epoch": 0.05827505827505827,
"grad_norm": 0.4531752952796131,
"learning_rate": 1.1638733705772813e-05,
"loss": 0.6332,
"num_tokens": 32747632.0,
"step": 125
},
{
"epoch": 0.06060606060606061,
"grad_norm": 0.4905266086400909,
"learning_rate": 1.2104283054003724e-05,
"loss": 0.6285,
"num_tokens": 34058352.0,
"step": 130
},
{
"epoch": 0.06293706293706294,
"grad_norm": 0.47150776268903466,
"learning_rate": 1.2569832402234637e-05,
"loss": 0.6219,
"num_tokens": 35355669.0,
"step": 135
},
{
"epoch": 0.06526806526806526,
"grad_norm": 0.46418902222985564,
"learning_rate": 1.303538175046555e-05,
"loss": 0.6285,
"num_tokens": 36666389.0,
"step": 140
},
{
"epoch": 0.0675990675990676,
"grad_norm": 0.5360827942508652,
"learning_rate": 1.3500931098696462e-05,
"loss": 0.6311,
"num_tokens": 37977109.0,
"step": 145
},
{
"epoch": 0.06993006993006994,
"grad_norm": 0.4574856423555115,
"learning_rate": 1.3966480446927374e-05,
"loss": 0.6265,
"num_tokens": 39287829.0,
"step": 150
},
{
"epoch": 0.07226107226107226,
"grad_norm": 0.509429199061911,
"learning_rate": 1.4432029795158286e-05,
"loss": 0.6212,
"num_tokens": 40598549.0,
"step": 155
},
{
"epoch": 0.07459207459207459,
"grad_norm": 0.47709542958562123,
"learning_rate": 1.48975791433892e-05,
"loss": 0.6225,
"num_tokens": 41909269.0,
"step": 160
},
{
"epoch": 0.07692307692307693,
"grad_norm": 0.49691106283476394,
"learning_rate": 1.5363128491620113e-05,
"loss": 0.6349,
"num_tokens": 43219989.0,
"step": 165
},
{
"epoch": 0.07925407925407925,
"grad_norm": 0.5573190120950272,
"learning_rate": 1.5828677839851026e-05,
"loss": 0.637,
"num_tokens": 44530709.0,
"step": 170
},
{
"epoch": 0.08158508158508158,
"grad_norm": 0.49003264776858235,
"learning_rate": 1.6294227188081936e-05,
"loss": 0.6199,
"num_tokens": 45841429.0,
"step": 175
},
{
"epoch": 0.08391608391608392,
"grad_norm": 0.48723276020731904,
"learning_rate": 1.675977653631285e-05,
"loss": 0.6223,
"num_tokens": 47152149.0,
"step": 180
},
{
"epoch": 0.08624708624708624,
"grad_norm": 0.5590552496737182,
"learning_rate": 1.7225325884543765e-05,
"loss": 0.6203,
"num_tokens": 48462869.0,
"step": 185
},
{
"epoch": 0.08857808857808858,
"grad_norm": 0.5019716267640132,
"learning_rate": 1.7690875232774675e-05,
"loss": 0.6099,
"num_tokens": 49773589.0,
"step": 190
},
{
"epoch": 0.09090909090909091,
"grad_norm": 0.48128033207729953,
"learning_rate": 1.8156424581005588e-05,
"loss": 0.591,
"num_tokens": 51058347.0,
"step": 195
},
{
"epoch": 0.09324009324009325,
"grad_norm": 0.48039162413529596,
"learning_rate": 1.86219739292365e-05,
"loss": 0.6084,
"num_tokens": 52353719.0,
"step": 200
},
{
"epoch": 0.09557109557109557,
"grad_norm": 0.48115713986962244,
"learning_rate": 1.9087523277467413e-05,
"loss": 0.6106,
"num_tokens": 53658551.0,
"step": 205
},
{
"epoch": 0.0979020979020979,
"grad_norm": 0.4870180321160556,
"learning_rate": 1.9553072625698323e-05,
"loss": 0.6173,
"num_tokens": 54952917.0,
"step": 210
},
{
"epoch": 0.10023310023310024,
"grad_norm": 0.5094823620103848,
"learning_rate": 2.001862197392924e-05,
"loss": 0.6098,
"num_tokens": 56263637.0,
"step": 215
},
{
"epoch": 0.10256410256410256,
"grad_norm": 0.5013089386317249,
"learning_rate": 2.0484171322160152e-05,
"loss": 0.6096,
"num_tokens": 57574357.0,
"step": 220
},
{
"epoch": 0.1048951048951049,
"grad_norm": 0.5232640161264629,
"learning_rate": 2.0949720670391062e-05,
"loss": 0.6201,
"num_tokens": 58885077.0,
"step": 225
},
{
"epoch": 0.10722610722610723,
"grad_norm": 0.550970201278118,
"learning_rate": 2.1415270018621975e-05,
"loss": 0.593,
"num_tokens": 60191231.0,
"step": 230
},
{
"epoch": 0.10955710955710955,
"grad_norm": 0.6029709013121757,
"learning_rate": 2.1880819366852888e-05,
"loss": 0.6074,
"num_tokens": 61491882.0,
"step": 235
},
{
"epoch": 0.11188811188811189,
"grad_norm": 0.5486381526391435,
"learning_rate": 2.2346368715083797e-05,
"loss": 0.6098,
"num_tokens": 62792124.0,
"step": 240
},
{
"epoch": 0.11421911421911422,
"grad_norm": 0.6116198358074656,
"learning_rate": 2.2811918063314713e-05,
"loss": 0.5992,
"num_tokens": 64102844.0,
"step": 245
},
{
"epoch": 0.11655011655011654,
"grad_norm": 0.6390765452401985,
"learning_rate": 2.3277467411545626e-05,
"loss": 0.5884,
"num_tokens": 65405812.0,
"step": 250
},
{
"epoch": 0.11888111888111888,
"grad_norm": 0.5592180145233874,
"learning_rate": 2.3743016759776536e-05,
"loss": 0.6115,
"num_tokens": 66716532.0,
"step": 255
},
{
"epoch": 0.12121212121212122,
"grad_norm": 0.5867882274444552,
"learning_rate": 2.420856610800745e-05,
"loss": 0.6322,
"num_tokens": 68027252.0,
"step": 260
},
{
"epoch": 0.12354312354312354,
"grad_norm": 0.4978541728996833,
"learning_rate": 2.4674115456238362e-05,
"loss": 0.6125,
"num_tokens": 69337972.0,
"step": 265
},
{
"epoch": 0.1258741258741259,
"grad_norm": 0.5176243116469658,
"learning_rate": 2.5139664804469275e-05,
"loss": 0.5889,
"num_tokens": 70648692.0,
"step": 270
},
{
"epoch": 0.1282051282051282,
"grad_norm": 0.5205723612646859,
"learning_rate": 2.5605214152700184e-05,
"loss": 0.5806,
"num_tokens": 71949722.0,
"step": 275
},
{
"epoch": 0.13053613053613053,
"grad_norm": 0.5105297626501796,
"learning_rate": 2.60707635009311e-05,
"loss": 0.5903,
"num_tokens": 73260442.0,
"step": 280
},
{
"epoch": 0.13286713286713286,
"grad_norm": 0.5983979159514128,
"learning_rate": 2.6536312849162014e-05,
"loss": 0.572,
"num_tokens": 74571162.0,
"step": 285
},
{
"epoch": 0.1351981351981352,
"grad_norm": 0.5344572430585598,
"learning_rate": 2.7001862197392923e-05,
"loss": 0.611,
"num_tokens": 75881882.0,
"step": 290
},
{
"epoch": 0.13752913752913754,
"grad_norm": 0.4971065663534153,
"learning_rate": 2.746741154562384e-05,
"loss": 0.584,
"num_tokens": 77192602.0,
"step": 295
},
{
"epoch": 0.13986013986013987,
"grad_norm": 0.47253775531322106,
"learning_rate": 2.793296089385475e-05,
"loss": 0.6098,
"num_tokens": 78503322.0,
"step": 300
},
{
"epoch": 0.14219114219114218,
"grad_norm": 0.5871484700261882,
"learning_rate": 2.8398510242085662e-05,
"loss": 0.5873,
"num_tokens": 79814042.0,
"step": 305
},
{
"epoch": 0.1445221445221445,
"grad_norm": 0.5397162657687059,
"learning_rate": 2.886405959031657e-05,
"loss": 0.5601,
"num_tokens": 81124762.0,
"step": 310
},
{
"epoch": 0.14685314685314685,
"grad_norm": 0.5003299556137542,
"learning_rate": 2.9329608938547488e-05,
"loss": 0.5765,
"num_tokens": 82416836.0,
"step": 315
},
{
"epoch": 0.14918414918414918,
"grad_norm": 0.5745464582996782,
"learning_rate": 2.97951582867784e-05,
"loss": 0.6041,
"num_tokens": 83727556.0,
"step": 320
},
{
"epoch": 0.15151515151515152,
"grad_norm": 0.6004282699112464,
"learning_rate": 3.026070763500931e-05,
"loss": 0.5826,
"num_tokens": 85038276.0,
"step": 325
},
{
"epoch": 0.15384615384615385,
"grad_norm": 0.6113197906754673,
"learning_rate": 3.0726256983240227e-05,
"loss": 0.599,
"num_tokens": 86348996.0,
"step": 330
},
{
"epoch": 0.1561771561771562,
"grad_norm": 0.5284281986815572,
"learning_rate": 3.1191806331471136e-05,
"loss": 0.594,
"num_tokens": 87657694.0,
"step": 335
},
{
"epoch": 0.1585081585081585,
"grad_norm": 0.6592854288333747,
"learning_rate": 3.165735567970205e-05,
"loss": 0.5971,
"num_tokens": 88959058.0,
"step": 340
},
{
"epoch": 0.16083916083916083,
"grad_norm": 0.6611677002486963,
"learning_rate": 3.212290502793296e-05,
"loss": 0.5869,
"num_tokens": 90269778.0,
"step": 345
},
{
"epoch": 0.16317016317016317,
"grad_norm": 0.5569884136454792,
"learning_rate": 3.258845437616387e-05,
"loss": 0.5721,
"num_tokens": 91580498.0,
"step": 350
},
{
"epoch": 0.1655011655011655,
"grad_norm": 0.5645452564927658,
"learning_rate": 3.305400372439479e-05,
"loss": 0.6015,
"num_tokens": 92891218.0,
"step": 355
},
{
"epoch": 0.16783216783216784,
"grad_norm": 0.6374940933275999,
"learning_rate": 3.35195530726257e-05,
"loss": 0.5844,
"num_tokens": 94201938.0,
"step": 360
},
{
"epoch": 0.17016317016317017,
"grad_norm": 0.6011646060068315,
"learning_rate": 3.3985102420856614e-05,
"loss": 0.5875,
"num_tokens": 95507207.0,
"step": 365
},
{
"epoch": 0.17249417249417248,
"grad_norm": 0.5884433838979773,
"learning_rate": 3.445065176908753e-05,
"loss": 0.5751,
"num_tokens": 96817927.0,
"step": 370
},
{
"epoch": 0.17482517482517482,
"grad_norm": 0.6051679815251396,
"learning_rate": 3.491620111731844e-05,
"loss": 0.5858,
"num_tokens": 98128647.0,
"step": 375
},
{
"epoch": 0.17715617715617715,
"grad_norm": 0.6049709145754255,
"learning_rate": 3.538175046554935e-05,
"loss": 0.5924,
"num_tokens": 99432200.0,
"step": 380
},
{
"epoch": 0.1794871794871795,
"grad_norm": 0.4705325840654263,
"learning_rate": 3.584729981378026e-05,
"loss": 0.5848,
"num_tokens": 100738580.0,
"step": 385
},
{
"epoch": 0.18181818181818182,
"grad_norm": 0.6888947351263296,
"learning_rate": 3.6312849162011175e-05,
"loss": 0.5765,
"num_tokens": 102049300.0,
"step": 390
},
{
"epoch": 0.18414918414918416,
"grad_norm": 0.5752502148380462,
"learning_rate": 3.6778398510242085e-05,
"loss": 0.5789,
"num_tokens": 103360020.0,
"step": 395
},
{
"epoch": 0.1864801864801865,
"grad_norm": 0.6432121990501041,
"learning_rate": 3.7243947858473e-05,
"loss": 0.5997,
"num_tokens": 104670740.0,
"step": 400
},
{
"epoch": 0.1888111888111888,
"grad_norm": 0.5905703187842473,
"learning_rate": 3.770949720670392e-05,
"loss": 0.5887,
"num_tokens": 105981460.0,
"step": 405
},
{
"epoch": 0.19114219114219114,
"grad_norm": 0.6634303714634665,
"learning_rate": 3.817504655493483e-05,
"loss": 0.5745,
"num_tokens": 107287613.0,
"step": 410
},
{
"epoch": 0.19347319347319347,
"grad_norm": 0.6069673456722715,
"learning_rate": 3.8640595903165736e-05,
"loss": 0.6034,
"num_tokens": 108598333.0,
"step": 415
},
{
"epoch": 0.1958041958041958,
"grad_norm": 0.6162680847616665,
"learning_rate": 3.9106145251396646e-05,
"loss": 0.5891,
"num_tokens": 109909053.0,
"step": 420
},
{
"epoch": 0.19813519813519814,
"grad_norm": 0.5857092916452535,
"learning_rate": 3.957169459962756e-05,
"loss": 0.5789,
"num_tokens": 111219773.0,
"step": 425
},
{
"epoch": 0.20046620046620048,
"grad_norm": 0.6647569324831807,
"learning_rate": 4.003724394785848e-05,
"loss": 0.606,
"num_tokens": 112530493.0,
"step": 430
},
{
"epoch": 0.20279720279720279,
"grad_norm": 0.5524426568425298,
"learning_rate": 4.050279329608939e-05,
"loss": 0.5825,
"num_tokens": 113841213.0,
"step": 435
},
{
"epoch": 0.20512820512820512,
"grad_norm": 0.5810994370146649,
"learning_rate": 4.0968342644320304e-05,
"loss": 0.6003,
"num_tokens": 115151933.0,
"step": 440
},
{
"epoch": 0.20745920745920746,
"grad_norm": 0.5641669807873702,
"learning_rate": 4.143389199255121e-05,
"loss": 0.5916,
"num_tokens": 116454721.0,
"step": 445
},
{
"epoch": 0.2097902097902098,
"grad_norm": 0.5150156412806575,
"learning_rate": 4.1899441340782123e-05,
"loss": 0.5836,
"num_tokens": 117765441.0,
"step": 450
},
{
"epoch": 0.21212121212121213,
"grad_norm": 0.47565520542426254,
"learning_rate": 4.236499068901304e-05,
"loss": 0.5875,
"num_tokens": 119076161.0,
"step": 455
},
{
"epoch": 0.21445221445221446,
"grad_norm": 0.5296345463311969,
"learning_rate": 4.283054003724395e-05,
"loss": 0.5875,
"num_tokens": 120376968.0,
"step": 460
},
{
"epoch": 0.21678321678321677,
"grad_norm": 0.5322255322866658,
"learning_rate": 4.3296089385474866e-05,
"loss": 0.5675,
"num_tokens": 121685949.0,
"step": 465
},
{
"epoch": 0.2191142191142191,
"grad_norm": 0.5124276326443321,
"learning_rate": 4.3761638733705775e-05,
"loss": 0.5834,
"num_tokens": 122996669.0,
"step": 470
},
{
"epoch": 0.22144522144522144,
"grad_norm": 0.5493568540280628,
"learning_rate": 4.4227188081936685e-05,
"loss": 0.5699,
"num_tokens": 124296690.0,
"step": 475
},
{
"epoch": 0.22377622377622378,
"grad_norm": 0.5080116188657589,
"learning_rate": 4.4692737430167594e-05,
"loss": 0.5859,
"num_tokens": 125607410.0,
"step": 480
},
{
"epoch": 0.2261072261072261,
"grad_norm": 0.4796592051544516,
"learning_rate": 4.515828677839851e-05,
"loss": 0.5707,
"num_tokens": 126904766.0,
"step": 485
},
{
"epoch": 0.22843822843822845,
"grad_norm": 0.6183745714886669,
"learning_rate": 4.562383612662943e-05,
"loss": 0.5921,
"num_tokens": 128215486.0,
"step": 490
},
{
"epoch": 0.23076923076923078,
"grad_norm": 0.5361760011420288,
"learning_rate": 4.6089385474860336e-05,
"loss": 0.5669,
"num_tokens": 129521193.0,
"step": 495
},
{
"epoch": 0.2331002331002331,
"grad_norm": 0.635330676879503,
"learning_rate": 4.655493482309125e-05,
"loss": 0.5884,
"num_tokens": 130831913.0,
"step": 500
},
{
"epoch": 0.23543123543123542,
"grad_norm": 0.7485046943209333,
"learning_rate": 4.702048417132216e-05,
"loss": 0.5685,
"num_tokens": 132142633.0,
"step": 505
},
{
"epoch": 0.23776223776223776,
"grad_norm": 0.48973907523525306,
"learning_rate": 4.748603351955307e-05,
"loss": 0.5748,
"num_tokens": 133447076.0,
"step": 510
},
{
"epoch": 0.2400932400932401,
"grad_norm": 0.501560835222231,
"learning_rate": 4.795158286778399e-05,
"loss": 0.591,
"num_tokens": 134742243.0,
"step": 515
},
{
"epoch": 0.24242424242424243,
"grad_norm": 0.5768660260790422,
"learning_rate": 4.84171322160149e-05,
"loss": 0.5918,
"num_tokens": 136036466.0,
"step": 520
},
{
"epoch": 0.24475524475524477,
"grad_norm": 0.5577461974387155,
"learning_rate": 4.8882681564245814e-05,
"loss": 0.5881,
"num_tokens": 137347186.0,
"step": 525
},
{
"epoch": 0.24708624708624707,
"grad_norm": 0.511973917006169,
"learning_rate": 4.9348230912476724e-05,
"loss": 0.5767,
"num_tokens": 138657906.0,
"step": 530
},
{
"epoch": 0.2494172494172494,
"grad_norm": 0.5058707953577455,
"learning_rate": 4.981378026070764e-05,
"loss": 0.5768,
"num_tokens": 139963192.0,
"step": 535
},
{
"epoch": 0.2517482517482518,
"grad_norm": 0.5486438929823766,
"learning_rate": 4.999999037242581e-05,
"loss": 0.5832,
"num_tokens": 141273912.0,
"step": 540
},
{
"epoch": 0.2540792540792541,
"grad_norm": 0.5099572474285374,
"learning_rate": 4.999993153728008e-05,
"loss": 0.5689,
"num_tokens": 142570091.0,
"step": 545
},
{
"epoch": 0.2564102564102564,
"grad_norm": 0.5750075114680016,
"learning_rate": 4.9999819215780634e-05,
"loss": 0.5811,
"num_tokens": 143880811.0,
"step": 550
},
{
"epoch": 0.25874125874125875,
"grad_norm": 0.6134226971370147,
"learning_rate": 4.9999653408194474e-05,
"loss": 0.5843,
"num_tokens": 145191531.0,
"step": 555
},
{
"epoch": 0.26107226107226106,
"grad_norm": 0.6314862123358536,
"learning_rate": 4.999943411491576e-05,
"loss": 0.5793,
"num_tokens": 146502251.0,
"step": 560
},
{
"epoch": 0.2634032634032634,
"grad_norm": 0.6889606086375162,
"learning_rate": 4.9999161336465794e-05,
"loss": 0.5702,
"num_tokens": 147812971.0,
"step": 565
},
{
"epoch": 0.26573426573426573,
"grad_norm": 0.6217160853993534,
"learning_rate": 4.999883507349302e-05,
"loss": 0.5774,
"num_tokens": 149113471.0,
"step": 570
},
{
"epoch": 0.2680652680652681,
"grad_norm": 0.8603569158115519,
"learning_rate": 4.9998455326773e-05,
"loss": 0.5723,
"num_tokens": 150424191.0,
"step": 575
},
{
"epoch": 0.2703962703962704,
"grad_norm": 0.5345785403032188,
"learning_rate": 4.9998022097208494e-05,
"loss": 0.5841,
"num_tokens": 151734911.0,
"step": 580
},
{
"epoch": 0.2727272727272727,
"grad_norm": 0.4897793835702214,
"learning_rate": 4.9997535385829355e-05,
"loss": 0.5847,
"num_tokens": 153045631.0,
"step": 585
},
{
"epoch": 0.27505827505827507,
"grad_norm": 0.6403908386621698,
"learning_rate": 4.9996995193792575e-05,
"loss": 0.5852,
"num_tokens": 154356351.0,
"step": 590
},
{
"epoch": 0.2773892773892774,
"grad_norm": 0.5209712741969316,
"learning_rate": 4.9996401522382285e-05,
"loss": 0.5581,
"num_tokens": 155667071.0,
"step": 595
},
{
"epoch": 0.27972027972027974,
"grad_norm": 0.4869182742809078,
"learning_rate": 4.9995754373009756e-05,
"loss": 0.5818,
"num_tokens": 156977308.0,
"step": 600
},
{
"epoch": 0.28205128205128205,
"grad_norm": 0.5217363999507327,
"learning_rate": 4.999505374721338e-05,
"loss": 0.568,
"num_tokens": 158288028.0,
"step": 605
},
{
"epoch": 0.28438228438228436,
"grad_norm": 0.5636140387216821,
"learning_rate": 4.999429964665866e-05,
"loss": 0.5685,
"num_tokens": 159598748.0,
"step": 610
},
{
"epoch": 0.2867132867132867,
"grad_norm": 0.4953249441318155,
"learning_rate": 4.999349207313823e-05,
"loss": 0.5569,
"num_tokens": 160909468.0,
"step": 615
},
{
"epoch": 0.289044289044289,
"grad_norm": 0.5870745218369171,
"learning_rate": 4.999263102857185e-05,
"loss": 0.5684,
"num_tokens": 162220188.0,
"step": 620
},
{
"epoch": 0.2913752913752914,
"grad_norm": 0.5796420642176587,
"learning_rate": 4.9991716515006354e-05,
"loss": 0.5908,
"num_tokens": 163517758.0,
"step": 625
},
{
"epoch": 0.2937062937062937,
"grad_norm": 0.5238869041431976,
"learning_rate": 4.9990748534615714e-05,
"loss": 0.5591,
"num_tokens": 164828478.0,
"step": 630
},
{
"epoch": 0.29603729603729606,
"grad_norm": 0.5863277078576777,
"learning_rate": 4.998972708970101e-05,
"loss": 0.5777,
"num_tokens": 166123691.0,
"step": 635
},
{
"epoch": 0.29836829836829837,
"grad_norm": 0.5171070705654046,
"learning_rate": 4.998865218269036e-05,
"loss": 0.5659,
"num_tokens": 167423794.0,
"step": 640
},
{
"epoch": 0.3006993006993007,
"grad_norm": 0.6049960425262351,
"learning_rate": 4.998752381613905e-05,
"loss": 0.5683,
"num_tokens": 168734514.0,
"step": 645
},
{
"epoch": 0.30303030303030304,
"grad_norm": 0.4913193380088962,
"learning_rate": 4.998634199272939e-05,
"loss": 0.5561,
"num_tokens": 170045234.0,
"step": 650
},
{
"epoch": 0.30536130536130535,
"grad_norm": 0.47269645394182036,
"learning_rate": 4.9985106715270786e-05,
"loss": 0.5509,
"num_tokens": 171355954.0,
"step": 655
},
{
"epoch": 0.3076923076923077,
"grad_norm": 0.5606565456686575,
"learning_rate": 4.99838179866997e-05,
"loss": 0.5639,
"num_tokens": 172657586.0,
"step": 660
},
{
"epoch": 0.31002331002331,
"grad_norm": 0.5304938940189576,
"learning_rate": 4.99824758100797e-05,
"loss": 0.5512,
"num_tokens": 173968306.0,
"step": 665
},
{
"epoch": 0.3123543123543124,
"grad_norm": 0.4909731279417892,
"learning_rate": 4.998108018860136e-05,
"loss": 0.5729,
"num_tokens": 175279026.0,
"step": 670
},
{
"epoch": 0.3146853146853147,
"grad_norm": 0.5316113973406738,
"learning_rate": 4.997963112558232e-05,
"loss": 0.5679,
"num_tokens": 176589746.0,
"step": 675
},
{
"epoch": 0.317016317016317,
"grad_norm": 0.5548933976438383,
"learning_rate": 4.9978128624467266e-05,
"loss": 0.5559,
"num_tokens": 177900466.0,
"step": 680
},
{
"epoch": 0.31934731934731936,
"grad_norm": 0.6354292278890509,
"learning_rate": 4.997657268882791e-05,
"loss": 0.569,
"num_tokens": 179211186.0,
"step": 685
},
{
"epoch": 0.32167832167832167,
"grad_norm": 0.5118276377254981,
"learning_rate": 4.9974963322362986e-05,
"loss": 0.575,
"num_tokens": 180509493.0,
"step": 690
},
{
"epoch": 0.32400932400932403,
"grad_norm": 0.5775683157667488,
"learning_rate": 4.997330052889826e-05,
"loss": 0.5627,
"num_tokens": 181820213.0,
"step": 695
},
{
"epoch": 0.32634032634032634,
"grad_norm": 0.5342512966303329,
"learning_rate": 4.9971584312386467e-05,
"loss": 0.5616,
"num_tokens": 183130933.0,
"step": 700
},
{
"epoch": 0.32867132867132864,
"grad_norm": 0.4679024175601337,
"learning_rate": 4.996981467690738e-05,
"loss": 0.5549,
"num_tokens": 184441653.0,
"step": 705
},
{
"epoch": 0.331002331002331,
"grad_norm": 0.46007737133219345,
"learning_rate": 4.9967991626667726e-05,
"loss": 0.5709,
"num_tokens": 185752373.0,
"step": 710
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.5781752361746175,
"learning_rate": 4.996611516600122e-05,
"loss": 0.5705,
"num_tokens": 187063093.0,
"step": 715
},
{
"epoch": 0.3356643356643357,
"grad_norm": 0.505350299094054,
"learning_rate": 4.996418529936855e-05,
"loss": 0.5488,
"num_tokens": 188373813.0,
"step": 720
},
{
"epoch": 0.337995337995338,
"grad_norm": 0.5228405284991805,
"learning_rate": 4.9962202031357356e-05,
"loss": 0.5719,
"num_tokens": 189684342.0,
"step": 725
},
{
"epoch": 0.34032634032634035,
"grad_norm": 0.5260336277261016,
"learning_rate": 4.996016536668221e-05,
"loss": 0.5723,
"num_tokens": 190995062.0,
"step": 730
},
{
"epoch": 0.34265734265734266,
"grad_norm": 0.5197547644486562,
"learning_rate": 4.9958075310184634e-05,
"loss": 0.5769,
"num_tokens": 192305782.0,
"step": 735
},
{
"epoch": 0.34498834498834496,
"grad_norm": 0.4760010257555004,
"learning_rate": 4.995593186683308e-05,
"loss": 0.5504,
"num_tokens": 193616502.0,
"step": 740
},
{
"epoch": 0.3473193473193473,
"grad_norm": 0.5832438782265436,
"learning_rate": 4.995373504172286e-05,
"loss": 0.5709,
"num_tokens": 194927222.0,
"step": 745
},
{
"epoch": 0.34965034965034963,
"grad_norm": 0.4426080176608608,
"learning_rate": 4.9951484840076246e-05,
"loss": 0.56,
"num_tokens": 196237942.0,
"step": 750
},
{
"epoch": 0.351981351981352,
"grad_norm": 0.5864066431584307,
"learning_rate": 4.9949181267242365e-05,
"loss": 0.5494,
"num_tokens": 197548662.0,
"step": 755
},
{
"epoch": 0.3543123543123543,
"grad_norm": 0.6479586979194639,
"learning_rate": 4.994682432869722e-05,
"loss": 0.548,
"num_tokens": 198859382.0,
"step": 760
},
{
"epoch": 0.35664335664335667,
"grad_norm": 0.5890555644210004,
"learning_rate": 4.994441403004366e-05,
"loss": 0.5513,
"num_tokens": 200170102.0,
"step": 765
},
{
"epoch": 0.358974358974359,
"grad_norm": 0.5160808856165031,
"learning_rate": 4.9941950377011424e-05,
"loss": 0.5554,
"num_tokens": 201480822.0,
"step": 770
},
{
"epoch": 0.3613053613053613,
"grad_norm": 0.46297814176374613,
"learning_rate": 4.993943337545703e-05,
"loss": 0.5607,
"num_tokens": 202791542.0,
"step": 775
},
{
"epoch": 0.36363636363636365,
"grad_norm": 0.5611193032764832,
"learning_rate": 4.993686303136385e-05,
"loss": 0.5539,
"num_tokens": 204102262.0,
"step": 780
},
{
"epoch": 0.36596736596736595,
"grad_norm": 0.5066631995519579,
"learning_rate": 4.9934239350842064e-05,
"loss": 0.5613,
"num_tokens": 205412982.0,
"step": 785
},
{
"epoch": 0.3682983682983683,
"grad_norm": 0.5659611475992192,
"learning_rate": 4.99315623401286e-05,
"loss": 0.5613,
"num_tokens": 206723702.0,
"step": 790
},
{
"epoch": 0.3706293706293706,
"grad_norm": 0.48541130012045497,
"learning_rate": 4.992883200558721e-05,
"loss": 0.5534,
"num_tokens": 208034422.0,
"step": 795
},
{
"epoch": 0.372960372960373,
"grad_norm": 0.5007321223818056,
"learning_rate": 4.992604835370838e-05,
"loss": 0.5676,
"num_tokens": 209345142.0,
"step": 800
},
{
"epoch": 0.3752913752913753,
"grad_norm": 0.49768356226116434,
"learning_rate": 4.992321139110935e-05,
"loss": 0.5628,
"num_tokens": 210655862.0,
"step": 805
},
{
"epoch": 0.3776223776223776,
"grad_norm": 0.49652952690828717,
"learning_rate": 4.992032112453409e-05,
"loss": 0.5602,
"num_tokens": 211966582.0,
"step": 810
},
{
"epoch": 0.37995337995337997,
"grad_norm": 0.4649469721716684,
"learning_rate": 4.9917377560853265e-05,
"loss": 0.5545,
"num_tokens": 213277302.0,
"step": 815
},
{
"epoch": 0.3822843822843823,
"grad_norm": 0.5360683146657782,
"learning_rate": 4.991438070706428e-05,
"loss": 0.5519,
"num_tokens": 214586037.0,
"step": 820
},
{
"epoch": 0.38461538461538464,
"grad_norm": 0.49705599435755293,
"learning_rate": 4.991133057029116e-05,
"loss": 0.5509,
"num_tokens": 215896757.0,
"step": 825
},
{
"epoch": 0.38694638694638694,
"grad_norm": 0.5283918826785868,
"learning_rate": 4.9908227157784645e-05,
"loss": 0.5391,
"num_tokens": 217207477.0,
"step": 830
},
{
"epoch": 0.38927738927738925,
"grad_norm": 0.5007115483504386,
"learning_rate": 4.9905070476922086e-05,
"loss": 0.5639,
"num_tokens": 218509916.0,
"step": 835
},
{
"epoch": 0.3916083916083916,
"grad_norm": 0.5498454232202491,
"learning_rate": 4.9901860535207486e-05,
"loss": 0.5705,
"num_tokens": 219820636.0,
"step": 840
},
{
"epoch": 0.3939393939393939,
"grad_norm": 0.48240740398938314,
"learning_rate": 4.9898597340271446e-05,
"loss": 0.5368,
"num_tokens": 221131356.0,
"step": 845
},
{
"epoch": 0.3962703962703963,
"grad_norm": 0.5056929824942472,
"learning_rate": 4.989528089987117e-05,
"loss": 0.5575,
"num_tokens": 222442076.0,
"step": 850
},
{
"epoch": 0.3986013986013986,
"grad_norm": 0.489254628262671,
"learning_rate": 4.989191122189042e-05,
"loss": 0.5493,
"num_tokens": 223752796.0,
"step": 855
},
{
"epoch": 0.40093240093240096,
"grad_norm": 0.48008396663558006,
"learning_rate": 4.988848831433952e-05,
"loss": 0.5428,
"num_tokens": 225063516.0,
"step": 860
},
{
"epoch": 0.40326340326340326,
"grad_norm": 0.49333444429559375,
"learning_rate": 4.9885012185355346e-05,
"loss": 0.5481,
"num_tokens": 226374236.0,
"step": 865
},
{
"epoch": 0.40559440559440557,
"grad_norm": 0.45124369244739004,
"learning_rate": 4.9881482843201266e-05,
"loss": 0.555,
"num_tokens": 227684956.0,
"step": 870
},
{
"epoch": 0.40792540792540793,
"grad_norm": 0.5496711964879529,
"learning_rate": 4.987790029626716e-05,
"loss": 0.5616,
"num_tokens": 228995676.0,
"step": 875
},
{
"epoch": 0.41025641025641024,
"grad_norm": 0.47265434607763146,
"learning_rate": 4.9874264553069376e-05,
"loss": 0.5386,
"num_tokens": 230306396.0,
"step": 880
},
{
"epoch": 0.4125874125874126,
"grad_norm": 0.5135497697717332,
"learning_rate": 4.987057562225074e-05,
"loss": 0.5603,
"num_tokens": 231617116.0,
"step": 885
},
{
"epoch": 0.4149184149184149,
"grad_norm": 0.4682122711297366,
"learning_rate": 4.986683351258048e-05,
"loss": 0.5445,
"num_tokens": 232927836.0,
"step": 890
},
{
"epoch": 0.4172494172494173,
"grad_norm": 0.4112633329315492,
"learning_rate": 4.986303823295427e-05,
"loss": 0.5426,
"num_tokens": 234238556.0,
"step": 895
},
{
"epoch": 0.4195804195804196,
"grad_norm": 0.402214665476133,
"learning_rate": 4.985918979239416e-05,
"loss": 0.5485,
"num_tokens": 235549276.0,
"step": 900
},
{
"epoch": 0.4219114219114219,
"grad_norm": 0.5455511517804665,
"learning_rate": 4.985528820004859e-05,
"loss": 0.557,
"num_tokens": 236859996.0,
"step": 905
},
{
"epoch": 0.42424242424242425,
"grad_norm": 0.47199199317580776,
"learning_rate": 4.9851333465192336e-05,
"loss": 0.5371,
"num_tokens": 238170716.0,
"step": 910
},
{
"epoch": 0.42657342657342656,
"grad_norm": 0.4776585972671657,
"learning_rate": 4.984732559722651e-05,
"loss": 0.555,
"num_tokens": 239481436.0,
"step": 915
},
{
"epoch": 0.4289044289044289,
"grad_norm": 0.5249113633053311,
"learning_rate": 4.984326460567852e-05,
"loss": 0.5629,
"num_tokens": 240792156.0,
"step": 920
},
{
"epoch": 0.43123543123543123,
"grad_norm": 0.5202213780622079,
"learning_rate": 4.9839150500202085e-05,
"loss": 0.5443,
"num_tokens": 242102876.0,
"step": 925
},
{
"epoch": 0.43356643356643354,
"grad_norm": 0.5456374996972472,
"learning_rate": 4.983498329057715e-05,
"loss": 0.5597,
"num_tokens": 243413596.0,
"step": 930
},
{
"epoch": 0.4358974358974359,
"grad_norm": 0.4380920966683162,
"learning_rate": 4.983076298670994e-05,
"loss": 0.5325,
"num_tokens": 244719166.0,
"step": 935
},
{
"epoch": 0.4382284382284382,
"grad_norm": 0.5542937783335792,
"learning_rate": 4.982648959863285e-05,
"loss": 0.5562,
"num_tokens": 246029886.0,
"step": 940
},
{
"epoch": 0.4405594405594406,
"grad_norm": 0.503124227495007,
"learning_rate": 4.982216313650448e-05,
"loss": 0.554,
"num_tokens": 247327205.0,
"step": 945
},
{
"epoch": 0.4428904428904429,
"grad_norm": 0.5660923842068369,
"learning_rate": 4.981778361060962e-05,
"loss": 0.5592,
"num_tokens": 248637925.0,
"step": 950
},
{
"epoch": 0.44522144522144524,
"grad_norm": 0.46034726240926843,
"learning_rate": 4.981335103135919e-05,
"loss": 0.5484,
"num_tokens": 249948645.0,
"step": 955
},
{
"epoch": 0.44755244755244755,
"grad_norm": 0.46499481242052637,
"learning_rate": 4.980886540929021e-05,
"loss": 0.5432,
"num_tokens": 251259365.0,
"step": 960
},
{
"epoch": 0.44988344988344986,
"grad_norm": 0.5139881063742346,
"learning_rate": 4.98043267550658e-05,
"loss": 0.5609,
"num_tokens": 252570085.0,
"step": 965
},
{
"epoch": 0.4522144522144522,
"grad_norm": 0.44368184026909635,
"learning_rate": 4.979973507947516e-05,
"loss": 0.5372,
"num_tokens": 253880805.0,
"step": 970
},
{
"epoch": 0.45454545454545453,
"grad_norm": 0.45865519207556255,
"learning_rate": 4.979509039343352e-05,
"loss": 0.559,
"num_tokens": 255191525.0,
"step": 975
},
{
"epoch": 0.4568764568764569,
"grad_norm": 0.5353248215200734,
"learning_rate": 4.9790392707982137e-05,
"loss": 0.5715,
"num_tokens": 256502245.0,
"step": 980
},
{
"epoch": 0.4592074592074592,
"grad_norm": 0.4173868436003061,
"learning_rate": 4.978564203428823e-05,
"loss": 0.5447,
"num_tokens": 257812965.0,
"step": 985
},
{
"epoch": 0.46153846153846156,
"grad_norm": 0.5116076701150912,
"learning_rate": 4.9780838383645007e-05,
"loss": 0.5551,
"num_tokens": 259123685.0,
"step": 990
},
{
"epoch": 0.46386946386946387,
"grad_norm": 0.46300754955199347,
"learning_rate": 4.977598176747161e-05,
"loss": 0.539,
"num_tokens": 260425724.0,
"step": 995
},
{
"epoch": 0.4662004662004662,
"grad_norm": 0.5076641753208481,
"learning_rate": 4.977107219731307e-05,
"loss": 0.5526,
"num_tokens": 261736444.0,
"step": 1000
},
{
"epoch": 0.46853146853146854,
"grad_norm": 0.47654605410882744,
"learning_rate": 4.9766109684840316e-05,
"loss": 0.5507,
"num_tokens": 263047164.0,
"step": 1005
},
{
"epoch": 0.47086247086247085,
"grad_norm": 0.47216898809671914,
"learning_rate": 4.9761094241850137e-05,
"loss": 0.5564,
"num_tokens": 264341546.0,
"step": 1010
},
{
"epoch": 0.4731934731934732,
"grad_norm": 0.522140635226013,
"learning_rate": 4.9756025880265124e-05,
"loss": 0.5583,
"num_tokens": 265652266.0,
"step": 1015
},
{
"epoch": 0.4755244755244755,
"grad_norm": 0.48172818261748473,
"learning_rate": 4.975090461213368e-05,
"loss": 0.5534,
"num_tokens": 266962986.0,
"step": 1020
},
{
"epoch": 0.47785547785547783,
"grad_norm": 0.43774729877649815,
"learning_rate": 4.9745730449629967e-05,
"loss": 0.5398,
"num_tokens": 268273706.0,
"step": 1025
},
{
"epoch": 0.4801864801864802,
"grad_norm": 0.45666010168372156,
"learning_rate": 4.9740503405053904e-05,
"loss": 0.558,
"num_tokens": 269584426.0,
"step": 1030
},
{
"epoch": 0.4825174825174825,
"grad_norm": 0.45349675110750093,
"learning_rate": 4.9735223490831104e-05,
"loss": 0.5558,
"num_tokens": 270895146.0,
"step": 1035
},
{
"epoch": 0.48484848484848486,
"grad_norm": 0.4811125107950563,
"learning_rate": 4.9729890719512875e-05,
"loss": 0.5332,
"num_tokens": 272205866.0,
"step": 1040
},
{
"epoch": 0.48717948717948717,
"grad_norm": 0.48607706780099336,
"learning_rate": 4.972450510377615e-05,
"loss": 0.5511,
"num_tokens": 273514547.0,
"step": 1045
},
{
"epoch": 0.48951048951048953,
"grad_norm": 0.4988992952928454,
"learning_rate": 4.971906665642351e-05,
"loss": 0.5509,
"num_tokens": 274825267.0,
"step": 1050
},
{
"epoch": 0.49184149184149184,
"grad_norm": 0.45178980549967424,
"learning_rate": 4.971357539038311e-05,
"loss": 0.5352,
"num_tokens": 276135501.0,
"step": 1055
},
{
"epoch": 0.49417249417249415,
"grad_norm": 0.4628244844080021,
"learning_rate": 4.970803131870867e-05,
"loss": 0.5576,
"num_tokens": 277446221.0,
"step": 1060
},
{
"epoch": 0.4965034965034965,
"grad_norm": 0.4968199670572577,
"learning_rate": 4.9702434454579435e-05,
"loss": 0.5302,
"num_tokens": 278751167.0,
"step": 1065
},
{
"epoch": 0.4988344988344988,
"grad_norm": 0.5350366333592432,
"learning_rate": 4.969678481130017e-05,
"loss": 0.5447,
"num_tokens": 280061887.0,
"step": 1070
},
{
"epoch": 0.5011655011655012,
"grad_norm": 0.4428901687128577,
"learning_rate": 4.9691082402301056e-05,
"loss": 0.5515,
"num_tokens": 281372607.0,
"step": 1075
},
{
"epoch": 0.5034965034965035,
"grad_norm": 0.4375390965832773,
"learning_rate": 4.9685327241137755e-05,
"loss": 0.5429,
"num_tokens": 282683327.0,
"step": 1080
},
{
"epoch": 0.5058275058275058,
"grad_norm": 0.4522043765274002,
"learning_rate": 4.967951934149132e-05,
"loss": 0.5603,
"num_tokens": 283980719.0,
"step": 1085
},
{
"epoch": 0.5081585081585082,
"grad_norm": 0.45069894232095004,
"learning_rate": 4.967365871716814e-05,
"loss": 0.5528,
"num_tokens": 285291439.0,
"step": 1090
},
{
"epoch": 0.5104895104895105,
"grad_norm": 0.4750493200396816,
"learning_rate": 4.9667745382099986e-05,
"loss": 0.533,
"num_tokens": 286602159.0,
"step": 1095
},
{
"epoch": 0.5128205128205128,
"grad_norm": 0.4549131138548312,
"learning_rate": 4.96617793503439e-05,
"loss": 0.5531,
"num_tokens": 287912879.0,
"step": 1100
},
{
"epoch": 0.5151515151515151,
"grad_norm": 0.5019297246106439,
"learning_rate": 4.9655760636082214e-05,
"loss": 0.5749,
"num_tokens": 289215063.0,
"step": 1105
},
{
"epoch": 0.5174825174825175,
"grad_norm": 0.4113411292647171,
"learning_rate": 4.964968925362248e-05,
"loss": 0.5372,
"num_tokens": 290525783.0,
"step": 1110
},
{
"epoch": 0.5198135198135199,
"grad_norm": 0.41397626069442495,
"learning_rate": 4.964356521739746e-05,
"loss": 0.5385,
"num_tokens": 291823567.0,
"step": 1115
},
{
"epoch": 0.5221445221445221,
"grad_norm": 0.44284424686828006,
"learning_rate": 4.9637388541965074e-05,
"loss": 0.5346,
"num_tokens": 293122806.0,
"step": 1120
},
{
"epoch": 0.5244755244755245,
"grad_norm": 0.47707957535562784,
"learning_rate": 4.9631159242008394e-05,
"loss": 0.5411,
"num_tokens": 294423907.0,
"step": 1125
},
{
"epoch": 0.5268065268065268,
"grad_norm": 0.5044770815753655,
"learning_rate": 4.9624877332335576e-05,
"loss": 0.5675,
"num_tokens": 295734627.0,
"step": 1130
},
{
"epoch": 0.5291375291375291,
"grad_norm": 0.5774114892004122,
"learning_rate": 4.9618542827879826e-05,
"loss": 0.5546,
"num_tokens": 297045347.0,
"step": 1135
},
{
"epoch": 0.5314685314685315,
"grad_norm": 0.4376278267424837,
"learning_rate": 4.9612155743699416e-05,
"loss": 0.5377,
"num_tokens": 298356067.0,
"step": 1140
},
{
"epoch": 0.5337995337995338,
"grad_norm": 0.5642480010358203,
"learning_rate": 4.960571609497756e-05,
"loss": 0.5576,
"num_tokens": 299666787.0,
"step": 1145
},
{
"epoch": 0.5361305361305362,
"grad_norm": 0.46779318091035216,
"learning_rate": 4.9599223897022474e-05,
"loss": 0.5292,
"num_tokens": 300977507.0,
"step": 1150
},
{
"epoch": 0.5384615384615384,
"grad_norm": 0.5065887547269632,
"learning_rate": 4.959267916526726e-05,
"loss": 0.5493,
"num_tokens": 302288227.0,
"step": 1155
},
{
"epoch": 0.5407925407925408,
"grad_norm": 0.4516510903602445,
"learning_rate": 4.958608191526992e-05,
"loss": 0.5392,
"num_tokens": 303598947.0,
"step": 1160
},
{
"epoch": 0.5431235431235432,
"grad_norm": 0.4860574849415324,
"learning_rate": 4.957943216271328e-05,
"loss": 0.5479,
"num_tokens": 304909667.0,
"step": 1165
},
{
"epoch": 0.5454545454545454,
"grad_norm": 0.4464946298053418,
"learning_rate": 4.9572729923405e-05,
"loss": 0.5459,
"num_tokens": 306213321.0,
"step": 1170
},
{
"epoch": 0.5477855477855478,
"grad_norm": 0.4873945912917641,
"learning_rate": 4.956597521327751e-05,
"loss": 0.5616,
"num_tokens": 307524041.0,
"step": 1175
},
{
"epoch": 0.5501165501165501,
"grad_norm": 0.4388018067262825,
"learning_rate": 4.955916804838794e-05,
"loss": 0.5423,
"num_tokens": 308834761.0,
"step": 1180
},
{
"epoch": 0.5524475524475524,
"grad_norm": 0.5105304916507707,
"learning_rate": 4.955230844491815e-05,
"loss": 0.5437,
"num_tokens": 310145481.0,
"step": 1185
},
{
"epoch": 0.5547785547785548,
"grad_norm": 0.4456654310545227,
"learning_rate": 4.954539641917464e-05,
"loss": 0.522,
"num_tokens": 311456201.0,
"step": 1190
},
{
"epoch": 0.5571095571095571,
"grad_norm": 0.4454183246164721,
"learning_rate": 4.953843198758853e-05,
"loss": 0.5404,
"num_tokens": 312766921.0,
"step": 1195
},
{
"epoch": 0.5594405594405595,
"grad_norm": 0.4348791044569224,
"learning_rate": 4.953141516671551e-05,
"loss": 0.543,
"num_tokens": 314077641.0,
"step": 1200
},
{
"epoch": 0.5617715617715617,
"grad_norm": 0.4380820952282363,
"learning_rate": 4.952434597323582e-05,
"loss": 0.5396,
"num_tokens": 315388361.0,
"step": 1205
},
{
"epoch": 0.5641025641025641,
"grad_norm": 0.5026501223667641,
"learning_rate": 4.9517224423954207e-05,
"loss": 0.5347,
"num_tokens": 316699081.0,
"step": 1210
},
{
"epoch": 0.5664335664335665,
"grad_norm": 0.44043504602766326,
"learning_rate": 4.951005053579985e-05,
"loss": 0.5339,
"num_tokens": 317998833.0,
"step": 1215
},
{
"epoch": 0.5687645687645687,
"grad_norm": 0.4743068657928212,
"learning_rate": 4.950282432582635e-05,
"loss": 0.5339,
"num_tokens": 319309553.0,
"step": 1220
},
{
"epoch": 0.5710955710955711,
"grad_norm": 0.47052451377323873,
"learning_rate": 4.9495545811211724e-05,
"loss": 0.5294,
"num_tokens": 320613226.0,
"step": 1225
},
{
"epoch": 0.5734265734265734,
"grad_norm": 0.4514576116741238,
"learning_rate": 4.948821500925829e-05,
"loss": 0.5322,
"num_tokens": 321923946.0,
"step": 1230
},
{
"epoch": 0.5757575757575758,
"grad_norm": 0.499267629747507,
"learning_rate": 4.948083193739267e-05,
"loss": 0.5288,
"num_tokens": 323234666.0,
"step": 1235
},
{
"epoch": 0.578088578088578,
"grad_norm": 0.41784291038069327,
"learning_rate": 4.947339661316574e-05,
"loss": 0.5412,
"num_tokens": 324545386.0,
"step": 1240
},
{
"epoch": 0.5804195804195804,
"grad_norm": 0.4039329790155304,
"learning_rate": 4.946590905425262e-05,
"loss": 0.5417,
"num_tokens": 325856106.0,
"step": 1245
},
{
"epoch": 0.5827505827505828,
"grad_norm": 0.47163444684620975,
"learning_rate": 4.9458369278452536e-05,
"loss": 0.5312,
"num_tokens": 327166826.0,
"step": 1250
},
{
"epoch": 0.585081585081585,
"grad_norm": 0.49628607779772915,
"learning_rate": 4.94507773036889e-05,
"loss": 0.5646,
"num_tokens": 328477546.0,
"step": 1255
},
{
"epoch": 0.5874125874125874,
"grad_norm": 0.47690969125101035,
"learning_rate": 4.9443133148009193e-05,
"loss": 0.5458,
"num_tokens": 329788266.0,
"step": 1260
},
{
"epoch": 0.5897435897435898,
"grad_norm": 0.5529225791199783,
"learning_rate": 4.943543682958494e-05,
"loss": 0.5515,
"num_tokens": 331098986.0,
"step": 1265
},
{
"epoch": 0.5920745920745921,
"grad_norm": 0.4570156847193979,
"learning_rate": 4.942768836671165e-05,
"loss": 0.5624,
"num_tokens": 332409706.0,
"step": 1270
},
{
"epoch": 0.5944055944055944,
"grad_norm": 0.4476694941858805,
"learning_rate": 4.941988777780881e-05,
"loss": 0.5278,
"num_tokens": 333720426.0,
"step": 1275
},
{
"epoch": 0.5967365967365967,
"grad_norm": 0.5092541231473139,
"learning_rate": 4.941203508141982e-05,
"loss": 0.541,
"num_tokens": 335031146.0,
"step": 1280
},
{
"epoch": 0.5990675990675991,
"grad_norm": 0.49506494033393816,
"learning_rate": 4.940413029621193e-05,
"loss": 0.5176,
"num_tokens": 336338731.0,
"step": 1285
},
{
"epoch": 0.6013986013986014,
"grad_norm": 0.4380679777553619,
"learning_rate": 4.939617344097622e-05,
"loss": 0.5303,
"num_tokens": 337649451.0,
"step": 1290
},
{
"epoch": 0.6037296037296037,
"grad_norm": 0.48586508532838385,
"learning_rate": 4.938816453462758e-05,
"loss": 0.536,
"num_tokens": 338960171.0,
"step": 1295
},
{
"epoch": 0.6060606060606061,
"grad_norm": 0.530127091401037,
"learning_rate": 4.9380103596204584e-05,
"loss": 0.5227,
"num_tokens": 340270891.0,
"step": 1300
},
{
"epoch": 0.6083916083916084,
"grad_norm": 0.4723471427584443,
"learning_rate": 4.9371990644869534e-05,
"loss": 0.5364,
"num_tokens": 341578590.0,
"step": 1305
},
{
"epoch": 0.6107226107226107,
"grad_norm": 0.4745273455014607,
"learning_rate": 4.936382569990837e-05,
"loss": 0.5294,
"num_tokens": 342889310.0,
"step": 1310
},
{
"epoch": 0.6130536130536131,
"grad_norm": 0.4155339618600452,
"learning_rate": 4.935560878073061e-05,
"loss": 0.5167,
"num_tokens": 344200030.0,
"step": 1315
},
{
"epoch": 0.6153846153846154,
"grad_norm": 0.4782114553800321,
"learning_rate": 4.934733990686934e-05,
"loss": 0.5185,
"num_tokens": 345504904.0,
"step": 1320
},
{
"epoch": 0.6177156177156177,
"grad_norm": 0.44089991433891135,
"learning_rate": 4.9339019097981155e-05,
"loss": 0.5533,
"num_tokens": 346815624.0,
"step": 1325
},
{
"epoch": 0.62004662004662,
"grad_norm": 0.43436432584179596,
"learning_rate": 4.933064637384611e-05,
"loss": 0.5159,
"num_tokens": 348126344.0,
"step": 1330
},
{
"epoch": 0.6223776223776224,
"grad_norm": 0.43330361810730905,
"learning_rate": 4.932222175436764e-05,
"loss": 0.5162,
"num_tokens": 349437064.0,
"step": 1335
},
{
"epoch": 0.6247086247086248,
"grad_norm": 0.49741869953347956,
"learning_rate": 4.9313745259572594e-05,
"loss": 0.539,
"num_tokens": 350734169.0,
"step": 1340
},
{
"epoch": 0.627039627039627,
"grad_norm": 0.4766273272639524,
"learning_rate": 4.93052169096111e-05,
"loss": 0.5331,
"num_tokens": 352031299.0,
"step": 1345
},
{
"epoch": 0.6293706293706294,
"grad_norm": 0.4354355175881609,
"learning_rate": 4.9296636724756576e-05,
"loss": 0.5616,
"num_tokens": 353342019.0,
"step": 1350
},
{
"epoch": 0.6317016317016317,
"grad_norm": 0.4509143900260654,
"learning_rate": 4.928800472540564e-05,
"loss": 0.5162,
"num_tokens": 354652739.0,
"step": 1355
},
{
"epoch": 0.634032634032634,
"grad_norm": 0.48892330783121585,
"learning_rate": 4.9279320932078114e-05,
"loss": 0.5432,
"num_tokens": 355956520.0,
"step": 1360
},
{
"epoch": 0.6363636363636364,
"grad_norm": 0.49979045808989403,
"learning_rate": 4.927058536541691e-05,
"loss": 0.5421,
"num_tokens": 357259308.0,
"step": 1365
},
{
"epoch": 0.6386946386946387,
"grad_norm": 0.4773067120764309,
"learning_rate": 4.926179804618805e-05,
"loss": 0.5232,
"num_tokens": 358570028.0,
"step": 1370
},
{
"epoch": 0.6410256410256411,
"grad_norm": 0.4807159216924898,
"learning_rate": 4.925295899528052e-05,
"loss": 0.5378,
"num_tokens": 359880748.0,
"step": 1375
},
{
"epoch": 0.6433566433566433,
"grad_norm": 0.5792009053156909,
"learning_rate": 4.924406823370637e-05,
"loss": 0.5505,
"num_tokens": 361191468.0,
"step": 1380
},
{
"epoch": 0.6456876456876457,
"grad_norm": 0.43269843498677835,
"learning_rate": 4.923512578260049e-05,
"loss": 0.5271,
"num_tokens": 362502188.0,
"step": 1385
},
{
"epoch": 0.6480186480186481,
"grad_norm": 0.487074695118609,
"learning_rate": 4.922613166322071e-05,
"loss": 0.524,
"num_tokens": 363812908.0,
"step": 1390
},
{
"epoch": 0.6503496503496503,
"grad_norm": 0.4550175361090717,
"learning_rate": 4.9217085896947636e-05,
"loss": 0.5314,
"num_tokens": 365123628.0,
"step": 1395
},
{
"epoch": 0.6526806526806527,
"grad_norm": 0.47974397984295003,
"learning_rate": 4.920798850528468e-05,
"loss": 0.5467,
"num_tokens": 366434348.0,
"step": 1400
},
{
"epoch": 0.655011655011655,
"grad_norm": 0.48000475609552506,
"learning_rate": 4.919883950985796e-05,
"loss": 0.5284,
"num_tokens": 367745068.0,
"step": 1405
},
{
"epoch": 0.6573426573426573,
"grad_norm": 0.6241647775105474,
"learning_rate": 4.918963893241628e-05,
"loss": 0.5464,
"num_tokens": 369055788.0,
"step": 1410
},
{
"epoch": 0.6596736596736597,
"grad_norm": 0.48926526915624463,
"learning_rate": 4.918038679483105e-05,
"loss": 0.5331,
"num_tokens": 370366508.0,
"step": 1415
},
{
"epoch": 0.662004662004662,
"grad_norm": 0.4456502453067459,
"learning_rate": 4.917108311909624e-05,
"loss": 0.5525,
"num_tokens": 371677228.0,
"step": 1420
},
{
"epoch": 0.6643356643356644,
"grad_norm": 0.4346779313930937,
"learning_rate": 4.916172792732838e-05,
"loss": 0.5191,
"num_tokens": 372987948.0,
"step": 1425
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.5028199232030454,
"learning_rate": 4.91523212417664e-05,
"loss": 0.5355,
"num_tokens": 374298668.0,
"step": 1430
},
{
"epoch": 0.668997668997669,
"grad_norm": 0.43127888280556725,
"learning_rate": 4.914286308477168e-05,
"loss": 0.5402,
"num_tokens": 375597975.0,
"step": 1435
},
{
"epoch": 0.6713286713286714,
"grad_norm": 0.42701713218021103,
"learning_rate": 4.913335347882795e-05,
"loss": 0.5436,
"num_tokens": 376908695.0,
"step": 1440
},
{
"epoch": 0.6736596736596736,
"grad_norm": 0.45471075660929255,
"learning_rate": 4.912379244654125e-05,
"loss": 0.5496,
"num_tokens": 378219415.0,
"step": 1445
},
{
"epoch": 0.675990675990676,
"grad_norm": 0.5036127947007575,
"learning_rate": 4.911418001063985e-05,
"loss": 0.5457,
"num_tokens": 379526026.0,
"step": 1450
},
{
"epoch": 0.6783216783216783,
"grad_norm": 0.6069326047714376,
"learning_rate": 4.910451619397421e-05,
"loss": 0.5532,
"num_tokens": 380835617.0,
"step": 1455
},
{
"epoch": 0.6806526806526807,
"grad_norm": 0.42021839267741845,
"learning_rate": 4.9094801019516987e-05,
"loss": 0.5302,
"num_tokens": 382146337.0,
"step": 1460
},
{
"epoch": 0.682983682983683,
"grad_norm": 0.42181592137530005,
"learning_rate": 4.908503451036285e-05,
"loss": 0.5395,
"num_tokens": 383457057.0,
"step": 1465
},
{
"epoch": 0.6853146853146853,
"grad_norm": 0.4819433448161508,
"learning_rate": 4.9075216689728545e-05,
"loss": 0.5232,
"num_tokens": 384767777.0,
"step": 1470
},
{
"epoch": 0.6876456876456877,
"grad_norm": 0.4360769334365848,
"learning_rate": 4.9065347580952795e-05,
"loss": 0.5419,
"num_tokens": 386078497.0,
"step": 1475
},
{
"epoch": 0.6899766899766899,
"grad_norm": 0.46623263345856697,
"learning_rate": 4.9055427207496216e-05,
"loss": 0.5327,
"num_tokens": 387384301.0,
"step": 1480
},
{
"epoch": 0.6923076923076923,
"grad_norm": 0.49755651823842795,
"learning_rate": 4.9045455592941325e-05,
"loss": 0.5313,
"num_tokens": 388695021.0,
"step": 1485
},
{
"epoch": 0.6946386946386947,
"grad_norm": 0.47799577567341445,
"learning_rate": 4.903543276099241e-05,
"loss": 0.5191,
"num_tokens": 390005741.0,
"step": 1490
},
{
"epoch": 0.696969696969697,
"grad_norm": 0.43014258037668707,
"learning_rate": 4.902535873547555e-05,
"loss": 0.5279,
"num_tokens": 391300707.0,
"step": 1495
},
{
"epoch": 0.6993006993006993,
"grad_norm": 0.43677160663867953,
"learning_rate": 4.901523354033849e-05,
"loss": 0.5239,
"num_tokens": 392611427.0,
"step": 1500
},
{
"epoch": 0.7016317016317016,
"grad_norm": 0.4314496324860873,
"learning_rate": 4.9005057199650624e-05,
"loss": 0.5507,
"num_tokens": 393915317.0,
"step": 1505
},
{
"epoch": 0.703962703962704,
"grad_norm": 0.5211544326381605,
"learning_rate": 4.8994829737602945e-05,
"loss": 0.5327,
"num_tokens": 395213883.0,
"step": 1510
},
{
"epoch": 0.7062937062937062,
"grad_norm": 0.4976803079849033,
"learning_rate": 4.8984551178507936e-05,
"loss": 0.5281,
"num_tokens": 396524603.0,
"step": 1515
},
{
"epoch": 0.7086247086247086,
"grad_norm": 0.48985310144346317,
"learning_rate": 4.897422154679959e-05,
"loss": 0.5285,
"num_tokens": 397835323.0,
"step": 1520
},
{
"epoch": 0.710955710955711,
"grad_norm": 0.5059108187782707,
"learning_rate": 4.896384086703327e-05,
"loss": 0.5221,
"num_tokens": 399146043.0,
"step": 1525
},
{
"epoch": 0.7132867132867133,
"grad_norm": 0.4641511467601387,
"learning_rate": 4.8953409163885706e-05,
"loss": 0.5263,
"num_tokens": 400443842.0,
"step": 1530
},
{
"epoch": 0.7156177156177156,
"grad_norm": 0.46382018249195767,
"learning_rate": 4.894292646215492e-05,
"loss": 0.5295,
"num_tokens": 401754562.0,
"step": 1535
},
{
"epoch": 0.717948717948718,
"grad_norm": 0.46293693643832434,
"learning_rate": 4.8932392786760174e-05,
"loss": 0.5311,
"num_tokens": 403065282.0,
"step": 1540
},
{
"epoch": 0.7202797202797203,
"grad_norm": 0.4416679999163852,
"learning_rate": 4.8921808162741875e-05,
"loss": 0.5316,
"num_tokens": 404376002.0,
"step": 1545
},
{
"epoch": 0.7226107226107226,
"grad_norm": 0.5431939837625993,
"learning_rate": 4.891117261526159e-05,
"loss": 0.5232,
"num_tokens": 405686722.0,
"step": 1550
},
{
"epoch": 0.7249417249417249,
"grad_norm": 0.41054749209427666,
"learning_rate": 4.890048616960189e-05,
"loss": 0.5272,
"num_tokens": 406997442.0,
"step": 1555
},
{
"epoch": 0.7272727272727273,
"grad_norm": 0.4379625629492077,
"learning_rate": 4.888974885116637e-05,
"loss": 0.5359,
"num_tokens": 408298322.0,
"step": 1560
},
{
"epoch": 0.7296037296037297,
"grad_norm": 0.48689004241017486,
"learning_rate": 4.887896068547957e-05,
"loss": 0.5469,
"num_tokens": 409609042.0,
"step": 1565
},
{
"epoch": 0.7319347319347319,
"grad_norm": 0.38565120780565504,
"learning_rate": 4.886812169818686e-05,
"loss": 0.5409,
"num_tokens": 410917246.0,
"step": 1570
},
{
"epoch": 0.7342657342657343,
"grad_norm": 0.42695613923544207,
"learning_rate": 4.8857231915054465e-05,
"loss": 0.5445,
"num_tokens": 412227966.0,
"step": 1575
},
{
"epoch": 0.7365967365967366,
"grad_norm": 0.381808559474973,
"learning_rate": 4.884629136196934e-05,
"loss": 0.5207,
"num_tokens": 413538686.0,
"step": 1580
},
{
"epoch": 0.7389277389277389,
"grad_norm": 0.44656354419841626,
"learning_rate": 4.8835300064939126e-05,
"loss": 0.5172,
"num_tokens": 414849406.0,
"step": 1585
},
{
"epoch": 0.7412587412587412,
"grad_norm": 0.4096295626157741,
"learning_rate": 4.88242580500921e-05,
"loss": 0.5255,
"num_tokens": 416160126.0,
"step": 1590
},
{
"epoch": 0.7435897435897436,
"grad_norm": 0.4147213001212058,
"learning_rate": 4.8813165343677106e-05,
"loss": 0.5426,
"num_tokens": 417470846.0,
"step": 1595
},
{
"epoch": 0.745920745920746,
"grad_norm": 0.5042968381378073,
"learning_rate": 4.8802021972063496e-05,
"loss": 0.5351,
"num_tokens": 418781566.0,
"step": 1600
},
{
"epoch": 0.7482517482517482,
"grad_norm": 0.4572941443823439,
"learning_rate": 4.879082796174104e-05,
"loss": 0.5267,
"num_tokens": 420090396.0,
"step": 1605
},
{
"epoch": 0.7505827505827506,
"grad_norm": 0.5253757414894156,
"learning_rate": 4.87795833393199e-05,
"loss": 0.5256,
"num_tokens": 421401116.0,
"step": 1610
},
{
"epoch": 0.752913752913753,
"grad_norm": 0.4314784400622872,
"learning_rate": 4.876828813153055e-05,
"loss": 0.52,
"num_tokens": 422711836.0,
"step": 1615
},
{
"epoch": 0.7552447552447552,
"grad_norm": 0.43464618943674793,
"learning_rate": 4.875694236522372e-05,
"loss": 0.5157,
"num_tokens": 424003342.0,
"step": 1620
},
{
"epoch": 0.7575757575757576,
"grad_norm": 0.43677420380768783,
"learning_rate": 4.8745546067370326e-05,
"loss": 0.5305,
"num_tokens": 425314062.0,
"step": 1625
},
{
"epoch": 0.7599067599067599,
"grad_norm": 0.4958407404835606,
"learning_rate": 4.873409926506139e-05,
"loss": 0.5362,
"num_tokens": 426624782.0,
"step": 1630
},
{
"epoch": 0.7622377622377622,
"grad_norm": 0.5242256465945466,
"learning_rate": 4.8722601985508024e-05,
"loss": 0.5369,
"num_tokens": 427935502.0,
"step": 1635
},
{
"epoch": 0.7645687645687645,
"grad_norm": 0.5675275645432154,
"learning_rate": 4.871105425604129e-05,
"loss": 0.5422,
"num_tokens": 429246222.0,
"step": 1640
},
{
"epoch": 0.7668997668997669,
"grad_norm": 0.5159660910808933,
"learning_rate": 4.869945610411222e-05,
"loss": 0.5379,
"num_tokens": 430556942.0,
"step": 1645
},
{
"epoch": 0.7692307692307693,
"grad_norm": 0.45981018020284387,
"learning_rate": 4.8687807557291684e-05,
"loss": 0.5233,
"num_tokens": 431867662.0,
"step": 1650
},
{
"epoch": 0.7715617715617715,
"grad_norm": 0.5178606410785168,
"learning_rate": 4.867610864327035e-05,
"loss": 0.517,
"num_tokens": 433176429.0,
"step": 1655
},
{
"epoch": 0.7738927738927739,
"grad_norm": 0.4153211549717526,
"learning_rate": 4.866435938985864e-05,
"loss": 0.521,
"num_tokens": 434474764.0,
"step": 1660
},
{
"epoch": 0.7762237762237763,
"grad_norm": 0.43474480248852926,
"learning_rate": 4.8652559824986614e-05,
"loss": 0.5149,
"num_tokens": 435785484.0,
"step": 1665
},
{
"epoch": 0.7785547785547785,
"grad_norm": 0.40937995874484956,
"learning_rate": 4.8640709976703955e-05,
"loss": 0.5255,
"num_tokens": 437096204.0,
"step": 1670
},
{
"epoch": 0.7808857808857809,
"grad_norm": 0.4465967325967671,
"learning_rate": 4.862880987317987e-05,
"loss": 0.5322,
"num_tokens": 438406924.0,
"step": 1675
},
{
"epoch": 0.7832167832167832,
"grad_norm": 0.49435400982888844,
"learning_rate": 4.8616859542703015e-05,
"loss": 0.5139,
"num_tokens": 439717644.0,
"step": 1680
},
{
"epoch": 0.7855477855477856,
"grad_norm": 0.45230753156297615,
"learning_rate": 4.860485901368146e-05,
"loss": 0.5204,
"num_tokens": 441012029.0,
"step": 1685
},
{
"epoch": 0.7878787878787878,
"grad_norm": 0.46302825612412485,
"learning_rate": 4.859280831464262e-05,
"loss": 0.5307,
"num_tokens": 442322749.0,
"step": 1690
},
{
"epoch": 0.7902097902097902,
"grad_norm": 0.4352069645722489,
"learning_rate": 4.858070747423315e-05,
"loss": 0.5293,
"num_tokens": 443633469.0,
"step": 1695
},
{
"epoch": 0.7925407925407926,
"grad_norm": 0.44776105300683866,
"learning_rate": 4.856855652121889e-05,
"loss": 0.5376,
"num_tokens": 444944189.0,
"step": 1700
},
{
"epoch": 0.7948717948717948,
"grad_norm": 0.5047765930651309,
"learning_rate": 4.855635548448485e-05,
"loss": 0.5266,
"num_tokens": 446254909.0,
"step": 1705
},
{
"epoch": 0.7972027972027972,
"grad_norm": 0.5902940318374286,
"learning_rate": 4.8544104393035064e-05,
"loss": 0.5548,
"num_tokens": 447549445.0,
"step": 1710
},
{
"epoch": 0.7995337995337995,
"grad_norm": 0.47765982040228566,
"learning_rate": 4.8531803275992564e-05,
"loss": 0.5234,
"num_tokens": 448860165.0,
"step": 1715
},
{
"epoch": 0.8018648018648019,
"grad_norm": 0.4683409566871783,
"learning_rate": 4.85194521625993e-05,
"loss": 0.5303,
"num_tokens": 450170885.0,
"step": 1720
},
{
"epoch": 0.8041958041958042,
"grad_norm": 0.3875193157237528,
"learning_rate": 4.850705108221607e-05,
"loss": 0.5243,
"num_tokens": 451481605.0,
"step": 1725
},
{
"epoch": 0.8065268065268065,
"grad_norm": 0.4090714505669396,
"learning_rate": 4.849460006432246e-05,
"loss": 0.5368,
"num_tokens": 452792325.0,
"step": 1730
},
{
"epoch": 0.8088578088578089,
"grad_norm": 0.41342814545426965,
"learning_rate": 4.848209913851676e-05,
"loss": 0.5367,
"num_tokens": 454103045.0,
"step": 1735
},
{
"epoch": 0.8111888111888111,
"grad_norm": 0.43002723270421467,
"learning_rate": 4.8469548334515895e-05,
"loss": 0.5128,
"num_tokens": 455413765.0,
"step": 1740
},
{
"epoch": 0.8135198135198135,
"grad_norm": 0.44536700540815066,
"learning_rate": 4.845694768215538e-05,
"loss": 0.5225,
"num_tokens": 456724485.0,
"step": 1745
},
{
"epoch": 0.8158508158508159,
"grad_norm": 0.41417616843988325,
"learning_rate": 4.844429721138921e-05,
"loss": 0.5179,
"num_tokens": 458035205.0,
"step": 1750
},
{
"epoch": 0.8181818181818182,
"grad_norm": 0.4335240853933784,
"learning_rate": 4.843159695228981e-05,
"loss": 0.5338,
"num_tokens": 459345925.0,
"step": 1755
},
{
"epoch": 0.8205128205128205,
"grad_norm": 0.4804531520417236,
"learning_rate": 4.841884693504796e-05,
"loss": 0.5301,
"num_tokens": 460656645.0,
"step": 1760
},
{
"epoch": 0.8228438228438228,
"grad_norm": 0.426207694644574,
"learning_rate": 4.8406047189972745e-05,
"loss": 0.512,
"num_tokens": 461967365.0,
"step": 1765
},
{
"epoch": 0.8251748251748252,
"grad_norm": 0.5170006552608741,
"learning_rate": 4.839319774749142e-05,
"loss": 0.5439,
"num_tokens": 463278085.0,
"step": 1770
},
{
"epoch": 0.8275058275058275,
"grad_norm": 0.41296058440902,
"learning_rate": 4.8380298638149414e-05,
"loss": 0.529,
"num_tokens": 464588805.0,
"step": 1775
},
{
"epoch": 0.8298368298368298,
"grad_norm": 0.4432327702578238,
"learning_rate": 4.8367349892610205e-05,
"loss": 0.5141,
"num_tokens": 465899525.0,
"step": 1780
},
{
"epoch": 0.8321678321678322,
"grad_norm": 0.4186473579130964,
"learning_rate": 4.8354351541655295e-05,
"loss": 0.5056,
"num_tokens": 467210245.0,
"step": 1785
},
{
"epoch": 0.8344988344988346,
"grad_norm": 0.4085487401716384,
"learning_rate": 4.834130361618407e-05,
"loss": 0.5201,
"num_tokens": 468520965.0,
"step": 1790
},
{
"epoch": 0.8368298368298368,
"grad_norm": 0.41696496844476233,
"learning_rate": 4.832820614721377e-05,
"loss": 0.5182,
"num_tokens": 469831685.0,
"step": 1795
},
{
"epoch": 0.8391608391608392,
"grad_norm": 0.49106045483752714,
"learning_rate": 4.8315059165879424e-05,
"loss": 0.5053,
"num_tokens": 471142405.0,
"step": 1800
},
{
"epoch": 0.8414918414918415,
"grad_norm": 0.48437176577007535,
"learning_rate": 4.830186270343375e-05,
"loss": 0.5168,
"num_tokens": 472453125.0,
"step": 1805
},
{
"epoch": 0.8438228438228438,
"grad_norm": 0.40932065591417777,
"learning_rate": 4.828861679124711e-05,
"loss": 0.5381,
"num_tokens": 473763845.0,
"step": 1810
},
{
"epoch": 0.8461538461538461,
"grad_norm": 0.5194673863667205,
"learning_rate": 4.827532146080738e-05,
"loss": 0.5299,
"num_tokens": 475074565.0,
"step": 1815
},
{
"epoch": 0.8484848484848485,
"grad_norm": 0.41189276085541665,
"learning_rate": 4.826197674371995e-05,
"loss": 0.5107,
"num_tokens": 476385285.0,
"step": 1820
},
{
"epoch": 0.8508158508158508,
"grad_norm": 0.41512648167613064,
"learning_rate": 4.8248582671707585e-05,
"loss": 0.5182,
"num_tokens": 477684249.0,
"step": 1825
},
{
"epoch": 0.8531468531468531,
"grad_norm": 0.39429940494764987,
"learning_rate": 4.8235139276610395e-05,
"loss": 0.527,
"num_tokens": 478994969.0,
"step": 1830
},
{
"epoch": 0.8554778554778555,
"grad_norm": 0.4269707121617,
"learning_rate": 4.8221646590385723e-05,
"loss": 0.5202,
"num_tokens": 480305689.0,
"step": 1835
},
{
"epoch": 0.8578088578088578,
"grad_norm": 0.4747259759226198,
"learning_rate": 4.8208104645108086e-05,
"loss": 0.5163,
"num_tokens": 481616409.0,
"step": 1840
},
{
"epoch": 0.8601398601398601,
"grad_norm": 0.4214636443982996,
"learning_rate": 4.819451347296912e-05,
"loss": 0.5202,
"num_tokens": 482927129.0,
"step": 1845
},
{
"epoch": 0.8624708624708625,
"grad_norm": 0.4097076386562582,
"learning_rate": 4.818087310627746e-05,
"loss": 0.5198,
"num_tokens": 484237849.0,
"step": 1850
},
{
"epoch": 0.8648018648018648,
"grad_norm": 0.4415899349026412,
"learning_rate": 4.816718357745869e-05,
"loss": 0.5116,
"num_tokens": 485544074.0,
"step": 1855
},
{
"epoch": 0.8671328671328671,
"grad_norm": 0.49520322041534987,
"learning_rate": 4.815344491905527e-05,
"loss": 0.5268,
"num_tokens": 486854794.0,
"step": 1860
},
{
"epoch": 0.8694638694638694,
"grad_norm": 0.4013730407728088,
"learning_rate": 4.813965716372644e-05,
"loss": 0.5357,
"num_tokens": 488165514.0,
"step": 1865
},
{
"epoch": 0.8717948717948718,
"grad_norm": 0.3971657654693432,
"learning_rate": 4.812582034424815e-05,
"loss": 0.5036,
"num_tokens": 489476234.0,
"step": 1870
},
{
"epoch": 0.8741258741258742,
"grad_norm": 0.4546086981413901,
"learning_rate": 4.811193449351301e-05,
"loss": 0.5185,
"num_tokens": 490786954.0,
"step": 1875
},
{
"epoch": 0.8764568764568764,
"grad_norm": 0.4514734870231076,
"learning_rate": 4.809799964453014e-05,
"loss": 0.5285,
"num_tokens": 492097674.0,
"step": 1880
},
{
"epoch": 0.8787878787878788,
"grad_norm": 0.486652363082405,
"learning_rate": 4.808401583042517e-05,
"loss": 0.5214,
"num_tokens": 493408394.0,
"step": 1885
},
{
"epoch": 0.8811188811188811,
"grad_norm": 0.38657514612617594,
"learning_rate": 4.806998308444014e-05,
"loss": 0.5285,
"num_tokens": 494719114.0,
"step": 1890
},
{
"epoch": 0.8834498834498834,
"grad_norm": 0.43457520597462723,
"learning_rate": 4.805590143993337e-05,
"loss": 0.5283,
"num_tokens": 496018186.0,
"step": 1895
},
{
"epoch": 0.8857808857808858,
"grad_norm": 0.45512971801887536,
"learning_rate": 4.804177093037947e-05,
"loss": 0.5167,
"num_tokens": 497312162.0,
"step": 1900
},
{
"epoch": 0.8881118881118881,
"grad_norm": 0.5864459417857114,
"learning_rate": 4.802759158936914e-05,
"loss": 0.507,
"num_tokens": 498622882.0,
"step": 1905
},
{
"epoch": 0.8904428904428905,
"grad_norm": 0.4119135091650743,
"learning_rate": 4.801336345060925e-05,
"loss": 0.5075,
"num_tokens": 499933602.0,
"step": 1910
},
{
"epoch": 0.8927738927738927,
"grad_norm": 0.5607982382740162,
"learning_rate": 4.79990865479226e-05,
"loss": 0.5317,
"num_tokens": 501244322.0,
"step": 1915
},
{
"epoch": 0.8951048951048951,
"grad_norm": 0.4424330844089448,
"learning_rate": 4.7984760915247945e-05,
"loss": 0.5024,
"num_tokens": 502555042.0,
"step": 1920
},
{
"epoch": 0.8974358974358975,
"grad_norm": 0.4047970520184837,
"learning_rate": 4.7970386586639867e-05,
"loss": 0.4966,
"num_tokens": 503865762.0,
"step": 1925
},
{
"epoch": 0.8997668997668997,
"grad_norm": 0.44771407799642704,
"learning_rate": 4.795596359626871e-05,
"loss": 0.5236,
"num_tokens": 505176482.0,
"step": 1930
},
{
"epoch": 0.9020979020979021,
"grad_norm": 0.5048948938413714,
"learning_rate": 4.794149197842051e-05,
"loss": 0.5179,
"num_tokens": 506479186.0,
"step": 1935
},
{
"epoch": 0.9044289044289044,
"grad_norm": 0.45102633005957127,
"learning_rate": 4.792697176749686e-05,
"loss": 0.5329,
"num_tokens": 507789906.0,
"step": 1940
},
{
"epoch": 0.9067599067599068,
"grad_norm": 0.47896436686520744,
"learning_rate": 4.791240299801492e-05,
"loss": 0.5144,
"num_tokens": 509100626.0,
"step": 1945
},
{
"epoch": 0.9090909090909091,
"grad_norm": 0.44024138392686507,
"learning_rate": 4.7897785704607244e-05,
"loss": 0.5319,
"num_tokens": 510411346.0,
"step": 1950
},
{
"epoch": 0.9114219114219114,
"grad_norm": 0.46492535601527907,
"learning_rate": 4.7883119922021744e-05,
"loss": 0.5005,
"num_tokens": 511720801.0,
"step": 1955
},
{
"epoch": 0.9137529137529138,
"grad_norm": 0.47305357877273235,
"learning_rate": 4.7868405685121614e-05,
"loss": 0.5058,
"num_tokens": 513031521.0,
"step": 1960
},
{
"epoch": 0.916083916083916,
"grad_norm": 0.5349540175928681,
"learning_rate": 4.7853643028885216e-05,
"loss": 0.5259,
"num_tokens": 514342241.0,
"step": 1965
},
{
"epoch": 0.9184149184149184,
"grad_norm": 0.4744283556978438,
"learning_rate": 4.783883198840601e-05,
"loss": 0.5247,
"num_tokens": 515652961.0,
"step": 1970
},
{
"epoch": 0.9207459207459208,
"grad_norm": 0.4233785441775157,
"learning_rate": 4.78239725988925e-05,
"loss": 0.5229,
"num_tokens": 516963681.0,
"step": 1975
},
{
"epoch": 0.9230769230769231,
"grad_norm": 0.48524455188818355,
"learning_rate": 4.78090648956681e-05,
"loss": 0.5176,
"num_tokens": 518274401.0,
"step": 1980
},
{
"epoch": 0.9254079254079254,
"grad_norm": 0.4955971872107986,
"learning_rate": 4.779410891417107e-05,
"loss": 0.517,
"num_tokens": 519585121.0,
"step": 1985
},
{
"epoch": 0.9277389277389277,
"grad_norm": 0.45326272641602827,
"learning_rate": 4.777910468995447e-05,
"loss": 0.525,
"num_tokens": 520895841.0,
"step": 1990
},
{
"epoch": 0.9300699300699301,
"grad_norm": 0.5154625413913212,
"learning_rate": 4.7764052258686e-05,
"loss": 0.5155,
"num_tokens": 522206561.0,
"step": 1995
},
{
"epoch": 0.9324009324009324,
"grad_norm": 0.5173600774002117,
"learning_rate": 4.774895165614799e-05,
"loss": 0.5368,
"num_tokens": 523517281.0,
"step": 2000
},
{
"epoch": 0.9347319347319347,
"grad_norm": 0.4394182635279032,
"learning_rate": 4.773380291823726e-05,
"loss": 0.5112,
"num_tokens": 524828001.0,
"step": 2005
},
{
"epoch": 0.9370629370629371,
"grad_norm": 0.46211117414103975,
"learning_rate": 4.7718606080965064e-05,
"loss": 0.5176,
"num_tokens": 526138721.0,
"step": 2010
},
{
"epoch": 0.9393939393939394,
"grad_norm": 0.44369729323375096,
"learning_rate": 4.770336118045701e-05,
"loss": 0.5202,
"num_tokens": 527449441.0,
"step": 2015
},
{
"epoch": 0.9417249417249417,
"grad_norm": 0.46786194576051876,
"learning_rate": 4.768806825295292e-05,
"loss": 0.5435,
"num_tokens": 528760161.0,
"step": 2020
},
{
"epoch": 0.9440559440559441,
"grad_norm": 0.45716691252844754,
"learning_rate": 4.7672727334806844e-05,
"loss": 0.5217,
"num_tokens": 530065986.0,
"step": 2025
},
{
"epoch": 0.9463869463869464,
"grad_norm": 0.4716782323029019,
"learning_rate": 4.765733846248685e-05,
"loss": 0.5093,
"num_tokens": 531376706.0,
"step": 2030
},
{
"epoch": 0.9487179487179487,
"grad_norm": 0.4809813652755708,
"learning_rate": 4.764190167257508e-05,
"loss": 0.5222,
"num_tokens": 532687426.0,
"step": 2035
},
{
"epoch": 0.951048951048951,
"grad_norm": 0.42126121038875564,
"learning_rate": 4.7626417001767495e-05,
"loss": 0.5105,
"num_tokens": 533998146.0,
"step": 2040
},
{
"epoch": 0.9533799533799534,
"grad_norm": 0.41522513599681043,
"learning_rate": 4.7610884486873947e-05,
"loss": 0.5056,
"num_tokens": 535308866.0,
"step": 2045
},
{
"epoch": 0.9557109557109557,
"grad_norm": 0.4622274108637101,
"learning_rate": 4.759530416481798e-05,
"loss": 0.5275,
"num_tokens": 536619586.0,
"step": 2050
},
{
"epoch": 0.958041958041958,
"grad_norm": 0.411035345439206,
"learning_rate": 4.757967607263681e-05,
"loss": 0.5172,
"num_tokens": 537916961.0,
"step": 2055
},
{
"epoch": 0.9603729603729604,
"grad_norm": 0.41117700376194166,
"learning_rate": 4.756400024748121e-05,
"loss": 0.5129,
"num_tokens": 539227681.0,
"step": 2060
},
{
"epoch": 0.9627039627039627,
"grad_norm": 0.4009672829244564,
"learning_rate": 4.75482767266154e-05,
"loss": 0.5233,
"num_tokens": 540538401.0,
"step": 2065
},
{
"epoch": 0.965034965034965,
"grad_norm": 0.38122118693492923,
"learning_rate": 4.7532505547417e-05,
"loss": 0.5142,
"num_tokens": 541849121.0,
"step": 2070
},
{
"epoch": 0.9673659673659674,
"grad_norm": 0.45914187153924546,
"learning_rate": 4.7516686747376926e-05,
"loss": 0.5085,
"num_tokens": 543159841.0,
"step": 2075
},
{
"epoch": 0.9696969696969697,
"grad_norm": 0.43269563864820926,
"learning_rate": 4.7500820364099287e-05,
"loss": 0.5108,
"num_tokens": 544470561.0,
"step": 2080
},
{
"epoch": 0.972027972027972,
"grad_norm": 0.5180648923614879,
"learning_rate": 4.74849064353013e-05,
"loss": 0.5101,
"num_tokens": 545781281.0,
"step": 2085
},
{
"epoch": 0.9743589743589743,
"grad_norm": 0.4167054274754852,
"learning_rate": 4.746894499881322e-05,
"loss": 0.5058,
"num_tokens": 547092001.0,
"step": 2090
},
{
"epoch": 0.9766899766899767,
"grad_norm": 0.4186229461075277,
"learning_rate": 4.745293609257822e-05,
"loss": 0.5063,
"num_tokens": 548394350.0,
"step": 2095
},
{
"epoch": 0.9790209790209791,
"grad_norm": 0.45822668262911204,
"learning_rate": 4.7436879754652345e-05,
"loss": 0.5252,
"num_tokens": 549691697.0,
"step": 2100
},
{
"epoch": 0.9813519813519813,
"grad_norm": 0.3764583775129133,
"learning_rate": 4.742077602320437e-05,
"loss": 0.5007,
"num_tokens": 551002417.0,
"step": 2105
},
{
"epoch": 0.9836829836829837,
"grad_norm": 0.39341524513084175,
"learning_rate": 4.7404624936515746e-05,
"loss": 0.5171,
"num_tokens": 552313137.0,
"step": 2110
},
{
"epoch": 0.986013986013986,
"grad_norm": 0.4199783806277457,
"learning_rate": 4.738842653298048e-05,
"loss": 0.5069,
"num_tokens": 553623857.0,
"step": 2115
},
{
"epoch": 0.9883449883449883,
"grad_norm": 0.4951470880188904,
"learning_rate": 4.737218085110506e-05,
"loss": 0.5139,
"num_tokens": 554934577.0,
"step": 2120
},
{
"epoch": 0.9906759906759907,
"grad_norm": 0.4660659693883058,
"learning_rate": 4.73558879295084e-05,
"loss": 0.5158,
"num_tokens": 556245297.0,
"step": 2125
},
{
"epoch": 0.993006993006993,
"grad_norm": 0.4274088119401731,
"learning_rate": 4.733954780692165e-05,
"loss": 0.5086,
"num_tokens": 557556017.0,
"step": 2130
},
{
"epoch": 0.9953379953379954,
"grad_norm": 0.3888459894818999,
"learning_rate": 4.732316052218822e-05,
"loss": 0.5214,
"num_tokens": 558866737.0,
"step": 2135
},
{
"epoch": 0.9976689976689976,
"grad_norm": 0.46590499158532944,
"learning_rate": 4.730672611426361e-05,
"loss": 0.4982,
"num_tokens": 560177457.0,
"step": 2140
},
{
"epoch": 1.0,
"grad_norm": 0.40795631542247435,
"learning_rate": 4.729024462221533e-05,
"loss": 0.5045,
"num_tokens": 561488177.0,
"step": 2145
},
{
"epoch": 1.0023310023310024,
"grad_norm": 0.4709448881516708,
"learning_rate": 4.727371608522284e-05,
"loss": 0.4741,
"num_tokens": 562798897.0,
"step": 2150
},
{
"epoch": 1.0046620046620047,
"grad_norm": 0.39954760886818896,
"learning_rate": 4.725714054257742e-05,
"loss": 0.4879,
"num_tokens": 564109617.0,
"step": 2155
},
{
"epoch": 1.006993006993007,
"grad_norm": 0.48984368311093457,
"learning_rate": 4.724051803368209e-05,
"loss": 0.4857,
"num_tokens": 565420337.0,
"step": 2160
},
{
"epoch": 1.0093240093240092,
"grad_norm": 0.46609742273249777,
"learning_rate": 4.7223848598051514e-05,
"loss": 0.4796,
"num_tokens": 566731057.0,
"step": 2165
},
{
"epoch": 1.0116550116550116,
"grad_norm": 0.36935474096318466,
"learning_rate": 4.720713227531193e-05,
"loss": 0.4696,
"num_tokens": 568041777.0,
"step": 2170
},
{
"epoch": 1.013986013986014,
"grad_norm": 0.3580596575113959,
"learning_rate": 4.719036910520102e-05,
"loss": 0.4624,
"num_tokens": 569352497.0,
"step": 2175
},
{
"epoch": 1.0163170163170163,
"grad_norm": 0.39328253702466387,
"learning_rate": 4.717355912756783e-05,
"loss": 0.4874,
"num_tokens": 570663217.0,
"step": 2180
},
{
"epoch": 1.0186480186480187,
"grad_norm": 0.43854519615893484,
"learning_rate": 4.715670238237267e-05,
"loss": 0.4921,
"num_tokens": 571946596.0,
"step": 2185
},
{
"epoch": 1.020979020979021,
"grad_norm": 0.4320886782336019,
"learning_rate": 4.713979890968704e-05,
"loss": 0.4726,
"num_tokens": 573254295.0,
"step": 2190
},
{
"epoch": 1.0233100233100234,
"grad_norm": 0.417052323523323,
"learning_rate": 4.712284874969351e-05,
"loss": 0.4761,
"num_tokens": 574555927.0,
"step": 2195
},
{
"epoch": 1.0256410256410255,
"grad_norm": 0.39493641603163543,
"learning_rate": 4.710585194268564e-05,
"loss": 0.4708,
"num_tokens": 575866647.0,
"step": 2200
},
{
"epoch": 1.027972027972028,
"grad_norm": 0.43644055780187835,
"learning_rate": 4.708880852906786e-05,
"loss": 0.4811,
"num_tokens": 577177367.0,
"step": 2205
},
{
"epoch": 1.0303030303030303,
"grad_norm": 0.44168949675712726,
"learning_rate": 4.707171854935542e-05,
"loss": 0.487,
"num_tokens": 578488087.0,
"step": 2210
},
{
"epoch": 1.0326340326340326,
"grad_norm": 0.4723407127116691,
"learning_rate": 4.705458204417426e-05,
"loss": 0.4752,
"num_tokens": 579798807.0,
"step": 2215
},
{
"epoch": 1.034965034965035,
"grad_norm": 0.5001904118319063,
"learning_rate": 4.703739905426089e-05,
"loss": 0.4641,
"num_tokens": 581109527.0,
"step": 2220
},
{
"epoch": 1.0372960372960374,
"grad_norm": 0.41134533047549643,
"learning_rate": 4.7020169620462363e-05,
"loss": 0.4888,
"num_tokens": 582420247.0,
"step": 2225
},
{
"epoch": 1.0396270396270397,
"grad_norm": 0.4299285336564839,
"learning_rate": 4.7002893783736104e-05,
"loss": 0.4663,
"num_tokens": 583730967.0,
"step": 2230
},
{
"epoch": 1.0419580419580419,
"grad_norm": 0.39597875355379963,
"learning_rate": 4.6985571585149876e-05,
"loss": 0.4913,
"num_tokens": 585031618.0,
"step": 2235
},
{
"epoch": 1.0442890442890442,
"grad_norm": 0.5200351424675955,
"learning_rate": 4.696820306588162e-05,
"loss": 0.4696,
"num_tokens": 586342338.0,
"step": 2240
},
{
"epoch": 1.0466200466200466,
"grad_norm": 0.45179809404395227,
"learning_rate": 4.6950788267219425e-05,
"loss": 0.479,
"num_tokens": 587653058.0,
"step": 2245
},
{
"epoch": 1.048951048951049,
"grad_norm": 0.8707060578758659,
"learning_rate": 4.6933327230561366e-05,
"loss": 0.4666,
"num_tokens": 588963778.0,
"step": 2250
},
{
"epoch": 1.0512820512820513,
"grad_norm": 0.4615963497950469,
"learning_rate": 4.691581999741544e-05,
"loss": 0.477,
"num_tokens": 590274498.0,
"step": 2255
},
{
"epoch": 1.0536130536130537,
"grad_norm": 0.44042568334028054,
"learning_rate": 4.689826660939947e-05,
"loss": 0.4835,
"num_tokens": 591579372.0,
"step": 2260
},
{
"epoch": 1.055944055944056,
"grad_norm": 0.4171686514533797,
"learning_rate": 4.6880667108241e-05,
"loss": 0.4755,
"num_tokens": 592882201.0,
"step": 2265
},
{
"epoch": 1.0582750582750582,
"grad_norm": 0.3747338047053368,
"learning_rate": 4.686302153577717e-05,
"loss": 0.4797,
"num_tokens": 594192921.0,
"step": 2270
},
{
"epoch": 1.0606060606060606,
"grad_norm": 0.45499821388190786,
"learning_rate": 4.6845329933954685e-05,
"loss": 0.4933,
"num_tokens": 595488293.0,
"step": 2275
},
{
"epoch": 1.062937062937063,
"grad_norm": 0.4457910657080489,
"learning_rate": 4.682759234482961e-05,
"loss": 0.4812,
"num_tokens": 596799013.0,
"step": 2280
},
{
"epoch": 1.0652680652680653,
"grad_norm": 0.48446830788970874,
"learning_rate": 4.680980881056736e-05,
"loss": 0.4807,
"num_tokens": 598097884.0,
"step": 2285
},
{
"epoch": 1.0675990675990676,
"grad_norm": 0.39552772687416615,
"learning_rate": 4.6791979373442594e-05,
"loss": 0.4788,
"num_tokens": 599408604.0,
"step": 2290
},
{
"epoch": 1.06993006993007,
"grad_norm": 0.4456624889263455,
"learning_rate": 4.6774104075839055e-05,
"loss": 0.4652,
"num_tokens": 600719324.0,
"step": 2295
},
{
"epoch": 1.0722610722610724,
"grad_norm": 0.4907163642978358,
"learning_rate": 4.6756182960249514e-05,
"loss": 0.4881,
"num_tokens": 602030044.0,
"step": 2300
},
{
"epoch": 1.0745920745920745,
"grad_norm": 0.4552923449742935,
"learning_rate": 4.6738216069275656e-05,
"loss": 0.4767,
"num_tokens": 603340764.0,
"step": 2305
},
{
"epoch": 1.0769230769230769,
"grad_norm": 0.42327123031082714,
"learning_rate": 4.6720203445628006e-05,
"loss": 0.4698,
"num_tokens": 604651484.0,
"step": 2310
},
{
"epoch": 1.0792540792540792,
"grad_norm": 0.42295921506423456,
"learning_rate": 4.6702145132125774e-05,
"loss": 0.4814,
"num_tokens": 605950791.0,
"step": 2315
},
{
"epoch": 1.0815850815850816,
"grad_norm": 0.4362001736140205,
"learning_rate": 4.668404117169679e-05,
"loss": 0.4859,
"num_tokens": 607261511.0,
"step": 2320
},
{
"epoch": 1.083916083916084,
"grad_norm": 0.404472977066644,
"learning_rate": 4.6665891607377415e-05,
"loss": 0.4841,
"num_tokens": 608572231.0,
"step": 2325
},
{
"epoch": 1.0862470862470863,
"grad_norm": 0.4154110483833513,
"learning_rate": 4.664769648231239e-05,
"loss": 0.4737,
"num_tokens": 609882951.0,
"step": 2330
},
{
"epoch": 1.0885780885780885,
"grad_norm": 0.4046042178116509,
"learning_rate": 4.662945583975478e-05,
"loss": 0.4874,
"num_tokens": 611193671.0,
"step": 2335
},
{
"epoch": 1.0909090909090908,
"grad_norm": 0.6093115629231195,
"learning_rate": 4.6611169723065854e-05,
"loss": 0.4522,
"num_tokens": 612504391.0,
"step": 2340
},
{
"epoch": 1.0932400932400932,
"grad_norm": 0.5011935199868325,
"learning_rate": 4.659283817571496e-05,
"loss": 0.4816,
"num_tokens": 613815111.0,
"step": 2345
},
{
"epoch": 1.0955710955710956,
"grad_norm": 0.4169395778521336,
"learning_rate": 4.657446124127948e-05,
"loss": 0.4807,
"num_tokens": 615125831.0,
"step": 2350
},
{
"epoch": 1.097902097902098,
"grad_norm": 0.41261114151322104,
"learning_rate": 4.655603896344465e-05,
"loss": 0.4881,
"num_tokens": 616436551.0,
"step": 2355
},
{
"epoch": 1.1002331002331003,
"grad_norm": 0.4380543995664826,
"learning_rate": 4.653757138600352e-05,
"loss": 0.4654,
"num_tokens": 617747271.0,
"step": 2360
},
{
"epoch": 1.1025641025641026,
"grad_norm": 0.4436608684626528,
"learning_rate": 4.651905855285682e-05,
"loss": 0.4568,
"num_tokens": 619057991.0,
"step": 2365
},
{
"epoch": 1.104895104895105,
"grad_norm": 0.41612907159233714,
"learning_rate": 4.650050050801285e-05,
"loss": 0.479,
"num_tokens": 620368711.0,
"step": 2370
},
{
"epoch": 1.1072261072261071,
"grad_norm": 0.4396241255197849,
"learning_rate": 4.64818972955874e-05,
"loss": 0.4708,
"num_tokens": 621679431.0,
"step": 2375
},
{
"epoch": 1.1095571095571095,
"grad_norm": 0.4231567577088472,
"learning_rate": 4.646324895980363e-05,
"loss": 0.476,
"num_tokens": 622990151.0,
"step": 2380
},
{
"epoch": 1.1118881118881119,
"grad_norm": 0.37510083417666035,
"learning_rate": 4.6444555544991965e-05,
"loss": 0.4699,
"num_tokens": 624300871.0,
"step": 2385
},
{
"epoch": 1.1142191142191142,
"grad_norm": 0.41354134914061674,
"learning_rate": 4.642581709558998e-05,
"loss": 0.4922,
"num_tokens": 625611591.0,
"step": 2390
},
{
"epoch": 1.1165501165501166,
"grad_norm": 0.4506054239813869,
"learning_rate": 4.640703365614233e-05,
"loss": 0.4777,
"num_tokens": 626922311.0,
"step": 2395
},
{
"epoch": 1.118881118881119,
"grad_norm": 0.49541627026643736,
"learning_rate": 4.6388205271300585e-05,
"loss": 0.4784,
"num_tokens": 628220877.0,
"step": 2400
},
{
"epoch": 1.121212121212121,
"grad_norm": 0.4457913773922344,
"learning_rate": 4.636933198582319e-05,
"loss": 0.4847,
"num_tokens": 629531597.0,
"step": 2405
},
{
"epoch": 1.1235431235431235,
"grad_norm": 0.4403563514498699,
"learning_rate": 4.63504138445753e-05,
"loss": 0.4913,
"num_tokens": 630842317.0,
"step": 2410
},
{
"epoch": 1.1258741258741258,
"grad_norm": 0.5039870137615442,
"learning_rate": 4.6331450892528725e-05,
"loss": 0.4767,
"num_tokens": 632153037.0,
"step": 2415
},
{
"epoch": 1.1282051282051282,
"grad_norm": 0.43430596728961307,
"learning_rate": 4.631244317476179e-05,
"loss": 0.4818,
"num_tokens": 633463757.0,
"step": 2420
},
{
"epoch": 1.1305361305361306,
"grad_norm": 0.4943774670951527,
"learning_rate": 4.6293390736459226e-05,
"loss": 0.4692,
"num_tokens": 634774477.0,
"step": 2425
},
{
"epoch": 1.132867132867133,
"grad_norm": 0.45784553779974424,
"learning_rate": 4.627429362291208e-05,
"loss": 0.4787,
"num_tokens": 636085197.0,
"step": 2430
},
{
"epoch": 1.1351981351981353,
"grad_norm": 0.47029067075996683,
"learning_rate": 4.62551518795176e-05,
"loss": 0.4722,
"num_tokens": 637395917.0,
"step": 2435
},
{
"epoch": 1.1375291375291376,
"grad_norm": 0.444979370543174,
"learning_rate": 4.623596555177913e-05,
"loss": 0.4613,
"num_tokens": 638706637.0,
"step": 2440
},
{
"epoch": 1.1398601398601398,
"grad_norm": 0.3767342879009302,
"learning_rate": 4.621673468530599e-05,
"loss": 0.4723,
"num_tokens": 640017357.0,
"step": 2445
},
{
"epoch": 1.1421911421911422,
"grad_norm": 0.4436410093654824,
"learning_rate": 4.6197459325813406e-05,
"loss": 0.4814,
"num_tokens": 641328077.0,
"step": 2450
},
{
"epoch": 1.1445221445221445,
"grad_norm": 0.45322324569299677,
"learning_rate": 4.617813951912231e-05,
"loss": 0.4648,
"num_tokens": 642637532.0,
"step": 2455
},
{
"epoch": 1.1468531468531469,
"grad_norm": 0.3889321859528221,
"learning_rate": 4.6158775311159357e-05,
"loss": 0.4776,
"num_tokens": 643948252.0,
"step": 2460
},
{
"epoch": 1.1491841491841492,
"grad_norm": 0.38162917158886384,
"learning_rate": 4.613936674795672e-05,
"loss": 0.4886,
"num_tokens": 645258972.0,
"step": 2465
},
{
"epoch": 1.1515151515151516,
"grad_norm": 0.4103235852443348,
"learning_rate": 4.611991387565202e-05,
"loss": 0.4854,
"num_tokens": 646569692.0,
"step": 2470
},
{
"epoch": 1.1538461538461537,
"grad_norm": 0.4463003624553352,
"learning_rate": 4.6100416740488204e-05,
"loss": 0.4682,
"num_tokens": 647864905.0,
"step": 2475
},
{
"epoch": 1.156177156177156,
"grad_norm": 1.1574168466987094,
"learning_rate": 4.608087538881344e-05,
"loss": 0.4912,
"num_tokens": 649175625.0,
"step": 2480
},
{
"epoch": 1.1585081585081585,
"grad_norm": 0.3575350970980845,
"learning_rate": 4.606128986708101e-05,
"loss": 0.4725,
"num_tokens": 650486345.0,
"step": 2485
},
{
"epoch": 1.1608391608391608,
"grad_norm": 0.4698297516737484,
"learning_rate": 4.604166022184921e-05,
"loss": 0.4818,
"num_tokens": 651772267.0,
"step": 2490
},
{
"epoch": 1.1631701631701632,
"grad_norm": 0.4518636492405959,
"learning_rate": 4.602198649978119e-05,
"loss": 0.4823,
"num_tokens": 653082987.0,
"step": 2495
},
{
"epoch": 1.1655011655011656,
"grad_norm": 0.4108073804989677,
"learning_rate": 4.600226874764491e-05,
"loss": 0.4718,
"num_tokens": 654390572.0,
"step": 2500
},
{
"epoch": 1.167832167832168,
"grad_norm": 0.42730405936868865,
"learning_rate": 4.598250701231299e-05,
"loss": 0.4621,
"num_tokens": 655701292.0,
"step": 2505
},
{
"epoch": 1.1701631701631703,
"grad_norm": 0.4294330624184933,
"learning_rate": 4.596270134076259e-05,
"loss": 0.4773,
"num_tokens": 657012012.0,
"step": 2510
},
{
"epoch": 1.1724941724941724,
"grad_norm": 0.4229790807053798,
"learning_rate": 4.594285178007534e-05,
"loss": 0.4889,
"num_tokens": 658322732.0,
"step": 2515
},
{
"epoch": 1.1748251748251748,
"grad_norm": 0.3704279315350088,
"learning_rate": 4.592295837743719e-05,
"loss": 0.4645,
"num_tokens": 659633452.0,
"step": 2520
},
{
"epoch": 1.1771561771561772,
"grad_norm": 0.4283412404900056,
"learning_rate": 4.590302118013829e-05,
"loss": 0.4722,
"num_tokens": 660944172.0,
"step": 2525
},
{
"epoch": 1.1794871794871795,
"grad_norm": 0.44943350837379137,
"learning_rate": 4.588304023557293e-05,
"loss": 0.5052,
"num_tokens": 662254892.0,
"step": 2530
},
{
"epoch": 1.1818181818181819,
"grad_norm": 0.3908539181988753,
"learning_rate": 4.586301559123939e-05,
"loss": 0.4688,
"num_tokens": 663565612.0,
"step": 2535
},
{
"epoch": 1.1841491841491842,
"grad_norm": 0.4160330010793307,
"learning_rate": 4.5842947294739815e-05,
"loss": 0.4725,
"num_tokens": 664876332.0,
"step": 2540
},
{
"epoch": 1.1864801864801864,
"grad_norm": 0.4360101739142026,
"learning_rate": 4.582283539378012e-05,
"loss": 0.4849,
"num_tokens": 666187052.0,
"step": 2545
},
{
"epoch": 1.1888111888111887,
"grad_norm": 0.4342183592761623,
"learning_rate": 4.580267993616991e-05,
"loss": 0.4825,
"num_tokens": 667497772.0,
"step": 2550
},
{
"epoch": 1.191142191142191,
"grad_norm": 0.39093089607584136,
"learning_rate": 4.578248096982227e-05,
"loss": 0.4577,
"num_tokens": 668808492.0,
"step": 2555
},
{
"epoch": 1.1934731934731935,
"grad_norm": 0.4265274502701168,
"learning_rate": 4.576223854275378e-05,
"loss": 0.4695,
"num_tokens": 670119212.0,
"step": 2560
},
{
"epoch": 1.1958041958041958,
"grad_norm": 0.5106783765028553,
"learning_rate": 4.574195270308428e-05,
"loss": 0.4596,
"num_tokens": 671429932.0,
"step": 2565
},
{
"epoch": 1.1981351981351982,
"grad_norm": 0.45465970639582615,
"learning_rate": 4.572162349903685e-05,
"loss": 0.4808,
"num_tokens": 672740652.0,
"step": 2570
},
{
"epoch": 1.2004662004662006,
"grad_norm": 0.4654417167187371,
"learning_rate": 4.570125097893762e-05,
"loss": 0.481,
"num_tokens": 674051372.0,
"step": 2575
},
{
"epoch": 1.2027972027972027,
"grad_norm": 0.41022644781405776,
"learning_rate": 4.568083519121572e-05,
"loss": 0.4741,
"num_tokens": 675362092.0,
"step": 2580
},
{
"epoch": 1.205128205128205,
"grad_norm": 0.43369798835654055,
"learning_rate": 4.566037618440313e-05,
"loss": 0.4842,
"num_tokens": 676670859.0,
"step": 2585
},
{
"epoch": 1.2074592074592074,
"grad_norm": 0.5470112563703007,
"learning_rate": 4.563987400713454e-05,
"loss": 0.4745,
"num_tokens": 677981579.0,
"step": 2590
},
{
"epoch": 1.2097902097902098,
"grad_norm": 0.4175250823314187,
"learning_rate": 4.561932870814729e-05,
"loss": 0.4714,
"num_tokens": 679282079.0,
"step": 2595
},
{
"epoch": 1.2121212121212122,
"grad_norm": 0.38110478817150606,
"learning_rate": 4.5598740336281225e-05,
"loss": 0.4675,
"num_tokens": 680592799.0,
"step": 2600
},
{
"epoch": 1.2144522144522145,
"grad_norm": 0.47620665151012714,
"learning_rate": 4.557810894047859e-05,
"loss": 0.4964,
"num_tokens": 681887165.0,
"step": 2605
},
{
"epoch": 1.2167832167832167,
"grad_norm": 0.4049180388899848,
"learning_rate": 4.555743456978388e-05,
"loss": 0.4744,
"num_tokens": 683197885.0,
"step": 2610
},
{
"epoch": 1.219114219114219,
"grad_norm": 0.4199713705201553,
"learning_rate": 4.553671727334378e-05,
"loss": 0.4786,
"num_tokens": 684508605.0,
"step": 2615
},
{
"epoch": 1.2214452214452214,
"grad_norm": 0.39770427056801977,
"learning_rate": 4.5515957100407e-05,
"loss": 0.4696,
"num_tokens": 685819325.0,
"step": 2620
},
{
"epoch": 1.2237762237762237,
"grad_norm": 0.4512802814267864,
"learning_rate": 4.5495154100324166e-05,
"loss": 0.4872,
"num_tokens": 687130045.0,
"step": 2625
},
{
"epoch": 1.2261072261072261,
"grad_norm": 0.42479431900627285,
"learning_rate": 4.547430832254773e-05,
"loss": 0.4706,
"num_tokens": 688440765.0,
"step": 2630
},
{
"epoch": 1.2284382284382285,
"grad_norm": 0.43515400124583126,
"learning_rate": 4.545341981663182e-05,
"loss": 0.4647,
"num_tokens": 689751485.0,
"step": 2635
},
{
"epoch": 1.2307692307692308,
"grad_norm": 0.392935620732648,
"learning_rate": 4.543248863223215e-05,
"loss": 0.4685,
"num_tokens": 691062205.0,
"step": 2640
},
{
"epoch": 1.2331002331002332,
"grad_norm": 0.46909934828902206,
"learning_rate": 4.541151481910589e-05,
"loss": 0.4717,
"num_tokens": 692372925.0,
"step": 2645
},
{
"epoch": 1.2354312354312353,
"grad_norm": 0.3944491634545739,
"learning_rate": 4.5390498427111525e-05,
"loss": 0.4873,
"num_tokens": 693683645.0,
"step": 2650
},
{
"epoch": 1.2377622377622377,
"grad_norm": 0.3542154968237942,
"learning_rate": 4.536943950620877e-05,
"loss": 0.4947,
"num_tokens": 694994365.0,
"step": 2655
},
{
"epoch": 1.24009324009324,
"grad_norm": 0.43496568575132033,
"learning_rate": 4.5348338106458446e-05,
"loss": 0.465,
"num_tokens": 696305085.0,
"step": 2660
},
{
"epoch": 1.2424242424242424,
"grad_norm": 0.38699131626194777,
"learning_rate": 4.532719427802234e-05,
"loss": 0.4752,
"num_tokens": 697607524.0,
"step": 2665
},
{
"epoch": 1.2447552447552448,
"grad_norm": 0.42606969233099046,
"learning_rate": 4.5306008071163105e-05,
"loss": 0.4965,
"num_tokens": 698918244.0,
"step": 2670
},
{
"epoch": 1.2470862470862472,
"grad_norm": 0.46040092500555474,
"learning_rate": 4.528477953624416e-05,
"loss": 0.4861,
"num_tokens": 700228964.0,
"step": 2675
},
{
"epoch": 1.2494172494172493,
"grad_norm": 0.40529545375633014,
"learning_rate": 4.526350872372949e-05,
"loss": 0.4576,
"num_tokens": 701539684.0,
"step": 2680
},
{
"epoch": 1.2517482517482517,
"grad_norm": 0.4181283911639419,
"learning_rate": 4.524219568418364e-05,
"loss": 0.4711,
"num_tokens": 702850404.0,
"step": 2685
},
{
"epoch": 1.254079254079254,
"grad_norm": 0.39553338610588684,
"learning_rate": 4.522084046827148e-05,
"loss": 0.476,
"num_tokens": 704161124.0,
"step": 2690
},
{
"epoch": 1.2564102564102564,
"grad_norm": 0.37450359234529307,
"learning_rate": 4.51994431267582e-05,
"loss": 0.4639,
"num_tokens": 705462856.0,
"step": 2695
},
{
"epoch": 1.2587412587412588,
"grad_norm": 0.4094194955768158,
"learning_rate": 4.5178003710509087e-05,
"loss": 0.4825,
"num_tokens": 706773576.0,
"step": 2700
},
{
"epoch": 1.2610722610722611,
"grad_norm": 0.4885076570757472,
"learning_rate": 4.515652227048946e-05,
"loss": 0.4737,
"num_tokens": 708084296.0,
"step": 2705
},
{
"epoch": 1.2634032634032635,
"grad_norm": 0.49289407887729214,
"learning_rate": 4.513499885776453e-05,
"loss": 0.4757,
"num_tokens": 709395016.0,
"step": 2710
},
{
"epoch": 1.2657342657342658,
"grad_norm": 0.3999863566296061,
"learning_rate": 4.511343352349931e-05,
"loss": 0.4839,
"num_tokens": 710690433.0,
"step": 2715
},
{
"epoch": 1.2680652680652682,
"grad_norm": 0.4305940675053385,
"learning_rate": 4.5091826318958434e-05,
"loss": 0.4744,
"num_tokens": 712001153.0,
"step": 2720
},
{
"epoch": 1.2703962703962703,
"grad_norm": 0.4514699940855802,
"learning_rate": 4.50701772955061e-05,
"loss": 0.4656,
"num_tokens": 713311873.0,
"step": 2725
},
{
"epoch": 1.2727272727272727,
"grad_norm": 0.3809637591571807,
"learning_rate": 4.5048486504605874e-05,
"loss": 0.4627,
"num_tokens": 714622593.0,
"step": 2730
},
{
"epoch": 1.275058275058275,
"grad_norm": 0.41819681647184476,
"learning_rate": 4.502675399782066e-05,
"loss": 0.4746,
"num_tokens": 715933313.0,
"step": 2735
},
{
"epoch": 1.2773892773892774,
"grad_norm": 0.4133255247787771,
"learning_rate": 4.5004979826812505e-05,
"loss": 0.4763,
"num_tokens": 717244033.0,
"step": 2740
},
{
"epoch": 1.2797202797202798,
"grad_norm": 0.4440690290427124,
"learning_rate": 4.498316404334249e-05,
"loss": 0.4857,
"num_tokens": 718554753.0,
"step": 2745
},
{
"epoch": 1.282051282051282,
"grad_norm": 0.41320454746022983,
"learning_rate": 4.4961306699270634e-05,
"loss": 0.4812,
"num_tokens": 719865473.0,
"step": 2750
},
{
"epoch": 1.2843822843822843,
"grad_norm": 0.37971285454946885,
"learning_rate": 4.4939407846555734e-05,
"loss": 0.4592,
"num_tokens": 721176193.0,
"step": 2755
},
{
"epoch": 1.2867132867132867,
"grad_norm": 0.38669049099636904,
"learning_rate": 4.49174675372553e-05,
"loss": 0.4808,
"num_tokens": 722486913.0,
"step": 2760
},
{
"epoch": 1.289044289044289,
"grad_norm": 0.38833499571078073,
"learning_rate": 4.489548582352533e-05,
"loss": 0.4648,
"num_tokens": 723781879.0,
"step": 2765
},
{
"epoch": 1.2913752913752914,
"grad_norm": 0.37993452738902034,
"learning_rate": 4.487346275762031e-05,
"loss": 0.468,
"num_tokens": 725092599.0,
"step": 2770
},
{
"epoch": 1.2937062937062938,
"grad_norm": 0.3673232976728133,
"learning_rate": 4.4851398391892976e-05,
"loss": 0.4648,
"num_tokens": 726403319.0,
"step": 2775
},
{
"epoch": 1.2960372960372961,
"grad_norm": 0.4116221051012755,
"learning_rate": 4.482929277879428e-05,
"loss": 0.4745,
"num_tokens": 727707400.0,
"step": 2780
},
{
"epoch": 1.2983682983682985,
"grad_norm": 0.4092445660610788,
"learning_rate": 4.4807145970873206e-05,
"loss": 0.4822,
"num_tokens": 729018120.0,
"step": 2785
},
{
"epoch": 1.3006993006993006,
"grad_norm": 0.4185448307140966,
"learning_rate": 4.4784958020776665e-05,
"loss": 0.4616,
"num_tokens": 730328840.0,
"step": 2790
},
{
"epoch": 1.303030303030303,
"grad_norm": 0.38479876763978926,
"learning_rate": 4.476272898124938e-05,
"loss": 0.4721,
"num_tokens": 731639560.0,
"step": 2795
},
{
"epoch": 1.3053613053613053,
"grad_norm": 0.46767133110126385,
"learning_rate": 4.474045890513374e-05,
"loss": 0.4752,
"num_tokens": 732950280.0,
"step": 2800
},
{
"epoch": 1.3076923076923077,
"grad_norm": 0.411649743246021,
"learning_rate": 4.4718147845369696e-05,
"loss": 0.4573,
"num_tokens": 734261000.0,
"step": 2805
},
{
"epoch": 1.31002331002331,
"grad_norm": 0.4316901135425129,
"learning_rate": 4.469579585499463e-05,
"loss": 0.4783,
"num_tokens": 735564553.0,
"step": 2810
},
{
"epoch": 1.3123543123543124,
"grad_norm": 0.40955334557870626,
"learning_rate": 4.467340298714319e-05,
"loss": 0.4883,
"num_tokens": 736875273.0,
"step": 2815
},
{
"epoch": 1.3146853146853146,
"grad_norm": 0.4080344761506793,
"learning_rate": 4.4650969295047236e-05,
"loss": 0.4832,
"num_tokens": 738185993.0,
"step": 2820
},
{
"epoch": 1.317016317016317,
"grad_norm": 0.4946836082141776,
"learning_rate": 4.462849483203566e-05,
"loss": 0.4761,
"num_tokens": 739475770.0,
"step": 2825
},
{
"epoch": 1.3193473193473193,
"grad_norm": 0.4584478564333903,
"learning_rate": 4.460597965153426e-05,
"loss": 0.4649,
"num_tokens": 740786490.0,
"step": 2830
},
{
"epoch": 1.3216783216783217,
"grad_norm": 0.41894692846730963,
"learning_rate": 4.458342380706566e-05,
"loss": 0.4809,
"num_tokens": 742097210.0,
"step": 2835
},
{
"epoch": 1.324009324009324,
"grad_norm": 0.4516243516797518,
"learning_rate": 4.456082735224911e-05,
"loss": 0.4703,
"num_tokens": 743407930.0,
"step": 2840
},
{
"epoch": 1.3263403263403264,
"grad_norm": 0.4254492278222537,
"learning_rate": 4.4538190340800426e-05,
"loss": 0.4793,
"num_tokens": 744718650.0,
"step": 2845
},
{
"epoch": 1.3286713286713288,
"grad_norm": 0.4137937243385849,
"learning_rate": 4.451551282653182e-05,
"loss": 0.48,
"num_tokens": 746029370.0,
"step": 2850
},
{
"epoch": 1.3310023310023311,
"grad_norm": 0.519752518142126,
"learning_rate": 4.449279486335179e-05,
"loss": 0.4736,
"num_tokens": 747340090.0,
"step": 2855
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.4353489605024039,
"learning_rate": 4.4470036505265e-05,
"loss": 0.4744,
"num_tokens": 748650810.0,
"step": 2860
},
{
"epoch": 1.3356643356643356,
"grad_norm": 0.4419316706152749,
"learning_rate": 4.444723780637212e-05,
"loss": 0.493,
"num_tokens": 749939934.0,
"step": 2865
},
{
"epoch": 1.337995337995338,
"grad_norm": 0.4901281137641553,
"learning_rate": 4.442439882086973e-05,
"loss": 0.4901,
"num_tokens": 751250654.0,
"step": 2870
},
{
"epoch": 1.3403263403263403,
"grad_norm": 0.4412679310378607,
"learning_rate": 4.440151960305017e-05,
"loss": 0.4725,
"num_tokens": 752561374.0,
"step": 2875
},
{
"epoch": 1.3426573426573427,
"grad_norm": 0.46469847523541835,
"learning_rate": 4.437860020730144e-05,
"loss": 0.4784,
"num_tokens": 753872094.0,
"step": 2880
},
{
"epoch": 1.3449883449883449,
"grad_norm": 0.4631294470988359,
"learning_rate": 4.4355640688107024e-05,
"loss": 0.4645,
"num_tokens": 755182814.0,
"step": 2885
},
{
"epoch": 1.3473193473193472,
"grad_norm": 0.4557090602532957,
"learning_rate": 4.43326411000458e-05,
"loss": 0.4713,
"num_tokens": 756493534.0,
"step": 2890
},
{
"epoch": 1.3496503496503496,
"grad_norm": 0.4386481997162464,
"learning_rate": 4.4309601497791894e-05,
"loss": 0.4733,
"num_tokens": 757804254.0,
"step": 2895
},
{
"epoch": 1.351981351981352,
"grad_norm": 0.4269351049606663,
"learning_rate": 4.428652193611454e-05,
"loss": 0.4692,
"num_tokens": 759114974.0,
"step": 2900
},
{
"epoch": 1.3543123543123543,
"grad_norm": 2.3425883033287866,
"learning_rate": 4.4263402469878015e-05,
"loss": 0.4567,
"num_tokens": 760425694.0,
"step": 2905
},
{
"epoch": 1.3566433566433567,
"grad_norm": 0.36072419381621385,
"learning_rate": 4.424024315404137e-05,
"loss": 0.4748,
"num_tokens": 761736414.0,
"step": 2910
},
{
"epoch": 1.358974358974359,
"grad_norm": 0.4249230241783985,
"learning_rate": 4.421704404365847e-05,
"loss": 0.4683,
"num_tokens": 763047134.0,
"step": 2915
},
{
"epoch": 1.3613053613053614,
"grad_norm": 0.4299202972649719,
"learning_rate": 4.4193805193877714e-05,
"loss": 0.4663,
"num_tokens": 764357854.0,
"step": 2920
},
{
"epoch": 1.3636363636363638,
"grad_norm": 0.3879041745135311,
"learning_rate": 4.4170526659942015e-05,
"loss": 0.4721,
"num_tokens": 765660038.0,
"step": 2925
},
{
"epoch": 1.365967365967366,
"grad_norm": 0.5484971150134953,
"learning_rate": 4.414720849718859e-05,
"loss": 0.4839,
"num_tokens": 766970758.0,
"step": 2930
},
{
"epoch": 1.3682983682983683,
"grad_norm": 0.40718901418820086,
"learning_rate": 4.412385076104889e-05,
"loss": 0.4667,
"num_tokens": 768281478.0,
"step": 2935
},
{
"epoch": 1.3706293706293706,
"grad_norm": 0.3876539740773812,
"learning_rate": 4.410045350704841e-05,
"loss": 0.4612,
"num_tokens": 769592198.0,
"step": 2940
},
{
"epoch": 1.372960372960373,
"grad_norm": 0.41309989970311883,
"learning_rate": 4.4077016790806604e-05,
"loss": 0.4705,
"num_tokens": 770902918.0,
"step": 2945
},
{
"epoch": 1.3752913752913754,
"grad_norm": 0.3744163790044172,
"learning_rate": 4.405354066803673e-05,
"loss": 0.4707,
"num_tokens": 772213638.0,
"step": 2950
},
{
"epoch": 1.3776223776223775,
"grad_norm": 0.480030813067916,
"learning_rate": 4.403002519454573e-05,
"loss": 0.489,
"num_tokens": 773524358.0,
"step": 2955
},
{
"epoch": 1.3799533799533799,
"grad_norm": 0.44969634064801967,
"learning_rate": 4.400647042623407e-05,
"loss": 0.4688,
"num_tokens": 774835078.0,
"step": 2960
},
{
"epoch": 1.3822843822843822,
"grad_norm": 0.4122214657313857,
"learning_rate": 4.398287641909564e-05,
"loss": 0.4521,
"num_tokens": 776145798.0,
"step": 2965
},
{
"epoch": 1.3846153846153846,
"grad_norm": 0.4180712738656679,
"learning_rate": 4.395924322921762e-05,
"loss": 0.471,
"num_tokens": 777456518.0,
"step": 2970
},
{
"epoch": 1.386946386946387,
"grad_norm": 0.49070370600152946,
"learning_rate": 4.393557091278031e-05,
"loss": 0.4844,
"num_tokens": 778766755.0,
"step": 2975
},
{
"epoch": 1.3892773892773893,
"grad_norm": 0.40125497971986246,
"learning_rate": 4.391185952605703e-05,
"loss": 0.4859,
"num_tokens": 780077475.0,
"step": 2980
},
{
"epoch": 1.3916083916083917,
"grad_norm": 0.4410310353900502,
"learning_rate": 4.3888109125413984e-05,
"loss": 0.4713,
"num_tokens": 781383628.0,
"step": 2985
},
{
"epoch": 1.393939393939394,
"grad_norm": 0.3981200505167726,
"learning_rate": 4.3864319767310116e-05,
"loss": 0.474,
"num_tokens": 782694348.0,
"step": 2990
},
{
"epoch": 1.3962703962703964,
"grad_norm": 0.4882530547316365,
"learning_rate": 4.384049150829697e-05,
"loss": 0.4907,
"num_tokens": 784003029.0,
"step": 2995
},
{
"epoch": 1.3986013986013985,
"grad_norm": 0.4625354000998703,
"learning_rate": 4.381662440501857e-05,
"loss": 0.4783,
"num_tokens": 785313749.0,
"step": 3000
},
{
"epoch": 1.400932400932401,
"grad_norm": 0.42320091867347914,
"learning_rate": 4.379271851421129e-05,
"loss": 0.4745,
"num_tokens": 786608916.0,
"step": 3005
},
{
"epoch": 1.4032634032634033,
"grad_norm": 0.34353265181156867,
"learning_rate": 4.3768773892703696e-05,
"loss": 0.4682,
"num_tokens": 787919636.0,
"step": 3010
},
{
"epoch": 1.4055944055944056,
"grad_norm": 0.41229838212196757,
"learning_rate": 4.374479059741643e-05,
"loss": 0.4903,
"num_tokens": 789230356.0,
"step": 3015
},
{
"epoch": 1.407925407925408,
"grad_norm": 0.3828081072364993,
"learning_rate": 4.372076868536206e-05,
"loss": 0.4685,
"num_tokens": 790541076.0,
"step": 3020
},
{
"epoch": 1.4102564102564101,
"grad_norm": 0.3782799956569614,
"learning_rate": 4.369670821364497e-05,
"loss": 0.4875,
"num_tokens": 791851796.0,
"step": 3025
},
{
"epoch": 1.4125874125874125,
"grad_norm": 0.38271818179702677,
"learning_rate": 4.3672609239461185e-05,
"loss": 0.472,
"num_tokens": 793162516.0,
"step": 3030
},
{
"epoch": 1.4149184149184149,
"grad_norm": 0.4432182142698113,
"learning_rate": 4.364847182009827e-05,
"loss": 0.4536,
"num_tokens": 794473236.0,
"step": 3035
},
{
"epoch": 1.4172494172494172,
"grad_norm": 0.4163541269383781,
"learning_rate": 4.362429601293519e-05,
"loss": 0.4674,
"num_tokens": 795783956.0,
"step": 3040
},
{
"epoch": 1.4195804195804196,
"grad_norm": 0.3963132427363548,
"learning_rate": 4.360008187544213e-05,
"loss": 0.4691,
"num_tokens": 797094676.0,
"step": 3045
},
{
"epoch": 1.421911421911422,
"grad_norm": 0.376804595278115,
"learning_rate": 4.357582946518045e-05,
"loss": 0.4638,
"num_tokens": 798392983.0,
"step": 3050
},
{
"epoch": 1.4242424242424243,
"grad_norm": 0.3809783523823445,
"learning_rate": 4.355153883980243e-05,
"loss": 0.4779,
"num_tokens": 799693790.0,
"step": 3055
},
{
"epoch": 1.4265734265734267,
"grad_norm": 0.3621884672482386,
"learning_rate": 4.3527210057051246e-05,
"loss": 0.4808,
"num_tokens": 801004510.0,
"step": 3060
},
{
"epoch": 1.428904428904429,
"grad_norm": 0.4816211455166946,
"learning_rate": 4.3502843174760736e-05,
"loss": 0.4627,
"num_tokens": 802315230.0,
"step": 3065
},
{
"epoch": 1.4312354312354312,
"grad_norm": 0.3686667678553204,
"learning_rate": 4.3478438250855344e-05,
"loss": 0.4781,
"num_tokens": 803625950.0,
"step": 3070
},
{
"epoch": 1.4335664335664335,
"grad_norm": 0.39139268593103854,
"learning_rate": 4.345399534334993e-05,
"loss": 0.4614,
"num_tokens": 804936670.0,
"step": 3075
},
{
"epoch": 1.435897435897436,
"grad_norm": 0.3682813822971589,
"learning_rate": 4.3429514510349636e-05,
"loss": 0.4698,
"num_tokens": 806247390.0,
"step": 3080
},
{
"epoch": 1.4382284382284383,
"grad_norm": 0.39577744941721116,
"learning_rate": 4.340499581004979e-05,
"loss": 0.4696,
"num_tokens": 807558110.0,
"step": 3085
},
{
"epoch": 1.4405594405594406,
"grad_norm": 0.39221995923827324,
"learning_rate": 4.33804393007357e-05,
"loss": 0.4575,
"num_tokens": 808854280.0,
"step": 3090
},
{
"epoch": 1.4428904428904428,
"grad_norm": 0.3469876104551282,
"learning_rate": 4.335584504078258e-05,
"loss": 0.4663,
"num_tokens": 810160671.0,
"step": 3095
},
{
"epoch": 1.4452214452214451,
"grad_norm": 0.38039731278638117,
"learning_rate": 4.333121308865539e-05,
"loss": 0.4656,
"num_tokens": 811471391.0,
"step": 3100
},
{
"epoch": 1.4475524475524475,
"grad_norm": 0.3871641009392114,
"learning_rate": 4.330654350290866e-05,
"loss": 0.4741,
"num_tokens": 812782111.0,
"step": 3105
},
{
"epoch": 1.4498834498834499,
"grad_norm": 0.3723812044477465,
"learning_rate": 4.328183634218641e-05,
"loss": 0.4616,
"num_tokens": 814092831.0,
"step": 3110
},
{
"epoch": 1.4522144522144522,
"grad_norm": 0.4159411274519776,
"learning_rate": 4.325709166522196e-05,
"loss": 0.4705,
"num_tokens": 815403551.0,
"step": 3115
},
{
"epoch": 1.4545454545454546,
"grad_norm": 0.3786619070365938,
"learning_rate": 4.3232309530837826e-05,
"loss": 0.4702,
"num_tokens": 816714271.0,
"step": 3120
},
{
"epoch": 1.456876456876457,
"grad_norm": 0.3629693685192925,
"learning_rate": 4.320748999794558e-05,
"loss": 0.4623,
"num_tokens": 818024991.0,
"step": 3125
},
{
"epoch": 1.4592074592074593,
"grad_norm": 0.3807642152047702,
"learning_rate": 4.3182633125545664e-05,
"loss": 0.4826,
"num_tokens": 819335711.0,
"step": 3130
},
{
"epoch": 1.4615384615384617,
"grad_norm": 0.36345106221694695,
"learning_rate": 4.3157738972727316e-05,
"loss": 0.4749,
"num_tokens": 820646431.0,
"step": 3135
},
{
"epoch": 1.4638694638694638,
"grad_norm": 0.3982169584509918,
"learning_rate": 4.3132807598668366e-05,
"loss": 0.4592,
"num_tokens": 821957151.0,
"step": 3140
},
{
"epoch": 1.4662004662004662,
"grad_norm": 0.3578005945222367,
"learning_rate": 4.310783906263515e-05,
"loss": 0.472,
"num_tokens": 823267871.0,
"step": 3145
},
{
"epoch": 1.4685314685314685,
"grad_norm": 0.44584910654927556,
"learning_rate": 4.3082833423982346e-05,
"loss": 0.4682,
"num_tokens": 824556476.0,
"step": 3150
},
{
"epoch": 1.470862470862471,
"grad_norm": 0.439322244082669,
"learning_rate": 4.3057790742152785e-05,
"loss": 0.4572,
"num_tokens": 825867196.0,
"step": 3155
},
{
"epoch": 1.4731934731934733,
"grad_norm": 0.3890285611037925,
"learning_rate": 4.3032711076677436e-05,
"loss": 0.4684,
"num_tokens": 827177916.0,
"step": 3160
},
{
"epoch": 1.4755244755244754,
"grad_norm": 0.4262221846633365,
"learning_rate": 4.3007594487175114e-05,
"loss": 0.4748,
"num_tokens": 828488636.0,
"step": 3165
},
{
"epoch": 1.4778554778554778,
"grad_norm": 0.3770399690427187,
"learning_rate": 4.298244103335244e-05,
"loss": 0.4597,
"num_tokens": 829799356.0,
"step": 3170
},
{
"epoch": 1.4801864801864801,
"grad_norm": 0.3524048177654436,
"learning_rate": 4.2957250775003664e-05,
"loss": 0.4814,
"num_tokens": 831095535.0,
"step": 3175
},
{
"epoch": 1.4825174825174825,
"grad_norm": 0.4087741944757923,
"learning_rate": 4.293202377201053e-05,
"loss": 0.4892,
"num_tokens": 832394592.0,
"step": 3180
},
{
"epoch": 1.4848484848484849,
"grad_norm": 0.3875644282355541,
"learning_rate": 4.290676008434214e-05,
"loss": 0.4817,
"num_tokens": 833705312.0,
"step": 3185
},
{
"epoch": 1.4871794871794872,
"grad_norm": 0.41275819372925243,
"learning_rate": 4.2881459772054764e-05,
"loss": 0.4705,
"num_tokens": 835016032.0,
"step": 3190
},
{
"epoch": 1.4895104895104896,
"grad_norm": 0.3565273655614255,
"learning_rate": 4.2856122895291767e-05,
"loss": 0.4539,
"num_tokens": 836326752.0,
"step": 3195
},
{
"epoch": 1.491841491841492,
"grad_norm": 0.4013967833750652,
"learning_rate": 4.2830749514283444e-05,
"loss": 0.471,
"num_tokens": 837637472.0,
"step": 3200
},
{
"epoch": 1.494172494172494,
"grad_norm": 0.44988006189556307,
"learning_rate": 4.280533968934683e-05,
"loss": 0.4737,
"num_tokens": 838948192.0,
"step": 3205
},
{
"epoch": 1.4965034965034965,
"grad_norm": 0.4128029879161445,
"learning_rate": 4.277989348088564e-05,
"loss": 0.4618,
"num_tokens": 840258912.0,
"step": 3210
},
{
"epoch": 1.4988344988344988,
"grad_norm": 0.3860230229659164,
"learning_rate": 4.275441094939002e-05,
"loss": 0.4772,
"num_tokens": 841569632.0,
"step": 3215
},
{
"epoch": 1.5011655011655012,
"grad_norm": 0.38330430061235765,
"learning_rate": 4.2728892155436524e-05,
"loss": 0.4655,
"num_tokens": 842875786.0,
"step": 3220
},
{
"epoch": 1.5034965034965035,
"grad_norm": 0.3583158818403601,
"learning_rate": 4.270333715968787e-05,
"loss": 0.4637,
"num_tokens": 844186506.0,
"step": 3225
},
{
"epoch": 1.5058275058275057,
"grad_norm": 0.4292683787844713,
"learning_rate": 4.267774602289285e-05,
"loss": 0.4513,
"num_tokens": 845490179.0,
"step": 3230
},
{
"epoch": 1.508158508158508,
"grad_norm": 0.4561169755057234,
"learning_rate": 4.265211880588617e-05,
"loss": 0.4575,
"num_tokens": 846800899.0,
"step": 3235
},
{
"epoch": 1.5104895104895104,
"grad_norm": 0.38223276760013997,
"learning_rate": 4.2626455569588274e-05,
"loss": 0.4591,
"num_tokens": 848111619.0,
"step": 3240
},
{
"epoch": 1.5128205128205128,
"grad_norm": 0.3989737822244612,
"learning_rate": 4.260075637500528e-05,
"loss": 0.4791,
"num_tokens": 849422339.0,
"step": 3245
},
{
"epoch": 1.5151515151515151,
"grad_norm": 0.39448654246499554,
"learning_rate": 4.257502128322875e-05,
"loss": 0.4697,
"num_tokens": 850733059.0,
"step": 3250
},
{
"epoch": 1.5174825174825175,
"grad_norm": 0.38135346811109977,
"learning_rate": 4.25492503554356e-05,
"loss": 0.4858,
"num_tokens": 852033301.0,
"step": 3255
},
{
"epoch": 1.5198135198135199,
"grad_norm": 0.4063492887591379,
"learning_rate": 4.252344365288791e-05,
"loss": 0.4558,
"num_tokens": 853344021.0,
"step": 3260
},
{
"epoch": 1.5221445221445222,
"grad_norm": 0.41694831501332164,
"learning_rate": 4.2497601236932836e-05,
"loss": 0.4695,
"num_tokens": 854654741.0,
"step": 3265
},
{
"epoch": 1.5244755244755246,
"grad_norm": 0.36492369915112965,
"learning_rate": 4.2471723169002404e-05,
"loss": 0.4656,
"num_tokens": 855965461.0,
"step": 3270
},
{
"epoch": 1.526806526806527,
"grad_norm": 0.4768438850647179,
"learning_rate": 4.244580951061341e-05,
"loss": 0.4628,
"num_tokens": 857276181.0,
"step": 3275
},
{
"epoch": 1.529137529137529,
"grad_norm": 0.364684932951632,
"learning_rate": 4.2419860323367236e-05,
"loss": 0.4789,
"num_tokens": 858586901.0,
"step": 3280
},
{
"epoch": 1.5314685314685315,
"grad_norm": 0.3723413634564425,
"learning_rate": 4.239387566894973e-05,
"loss": 0.4852,
"num_tokens": 859897621.0,
"step": 3285
},
{
"epoch": 1.5337995337995338,
"grad_norm": 0.4225249222913032,
"learning_rate": 4.2367855609131074e-05,
"loss": 0.479,
"num_tokens": 861194968.0,
"step": 3290
},
{
"epoch": 1.5361305361305362,
"grad_norm": 0.37715348099690094,
"learning_rate": 4.234180020576556e-05,
"loss": 0.4849,
"num_tokens": 862505688.0,
"step": 3295
},
{
"epoch": 1.5384615384615383,
"grad_norm": 0.38467493999417174,
"learning_rate": 4.231570952079157e-05,
"loss": 0.4664,
"num_tokens": 863814518.0,
"step": 3300
},
{
"epoch": 1.5407925407925407,
"grad_norm": 0.37660805921200846,
"learning_rate": 4.22895836162313e-05,
"loss": 0.4735,
"num_tokens": 865125238.0,
"step": 3305
},
{
"epoch": 1.543123543123543,
"grad_norm": 0.3576120232900732,
"learning_rate": 4.226342255419069e-05,
"loss": 0.4836,
"num_tokens": 866435958.0,
"step": 3310
},
{
"epoch": 1.5454545454545454,
"grad_norm": 0.35132235584289356,
"learning_rate": 4.2237226396859256e-05,
"loss": 0.4482,
"num_tokens": 867746678.0,
"step": 3315
},
{
"epoch": 1.5477855477855478,
"grad_norm": 0.44192479046805605,
"learning_rate": 4.2210995206509945e-05,
"loss": 0.4741,
"num_tokens": 869057398.0,
"step": 3320
},
{
"epoch": 1.5501165501165501,
"grad_norm": 0.46240430648199005,
"learning_rate": 4.218472904549897e-05,
"loss": 0.4685,
"num_tokens": 870368118.0,
"step": 3325
},
{
"epoch": 1.5524475524475525,
"grad_norm": 0.3991386786525242,
"learning_rate": 4.215842797626569e-05,
"loss": 0.4821,
"num_tokens": 871678838.0,
"step": 3330
},
{
"epoch": 1.5547785547785549,
"grad_norm": 0.4310594533956871,
"learning_rate": 4.2132092061332444e-05,
"loss": 0.4716,
"num_tokens": 872989558.0,
"step": 3335
},
{
"epoch": 1.5571095571095572,
"grad_norm": 0.4719206923402879,
"learning_rate": 4.21057213633044e-05,
"loss": 0.4636,
"num_tokens": 874287357.0,
"step": 3340
},
{
"epoch": 1.5594405594405596,
"grad_norm": 0.4174800382083453,
"learning_rate": 4.207931594486941e-05,
"loss": 0.4702,
"num_tokens": 875598077.0,
"step": 3345
},
{
"epoch": 1.5617715617715617,
"grad_norm": 0.4353294397787359,
"learning_rate": 4.205287586879788e-05,
"loss": 0.4731,
"num_tokens": 876908797.0,
"step": 3350
},
{
"epoch": 1.564102564102564,
"grad_norm": 0.4245486575857189,
"learning_rate": 4.202640119794258e-05,
"loss": 0.4897,
"num_tokens": 878219517.0,
"step": 3355
},
{
"epoch": 1.5664335664335665,
"grad_norm": 0.36829836620338285,
"learning_rate": 4.1999891995238525e-05,
"loss": 0.4713,
"num_tokens": 879530237.0,
"step": 3360
},
{
"epoch": 1.5687645687645686,
"grad_norm": 0.4572528057427085,
"learning_rate": 4.1973348323702834e-05,
"loss": 0.4839,
"num_tokens": 880840957.0,
"step": 3365
},
{
"epoch": 1.571095571095571,
"grad_norm": 0.4298611015712589,
"learning_rate": 4.1946770246434554e-05,
"loss": 0.4712,
"num_tokens": 882151677.0,
"step": 3370
},
{
"epoch": 1.5734265734265733,
"grad_norm": 0.4045036094575523,
"learning_rate": 4.19201578266145e-05,
"loss": 0.4762,
"num_tokens": 883462397.0,
"step": 3375
},
{
"epoch": 1.5757575757575757,
"grad_norm": 0.3508435597625087,
"learning_rate": 4.1893511127505155e-05,
"loss": 0.4771,
"num_tokens": 884773117.0,
"step": 3380
},
{
"epoch": 1.578088578088578,
"grad_norm": 0.34886439925605456,
"learning_rate": 4.186683021245048e-05,
"loss": 0.4667,
"num_tokens": 886083837.0,
"step": 3385
},
{
"epoch": 1.5804195804195804,
"grad_norm": 0.3763065049584587,
"learning_rate": 4.1840115144875784e-05,
"loss": 0.4802,
"num_tokens": 887394557.0,
"step": 3390
},
{
"epoch": 1.5827505827505828,
"grad_norm": 0.49328331391815927,
"learning_rate": 4.1813365988287536e-05,
"loss": 0.4842,
"num_tokens": 888691868.0,
"step": 3395
},
{
"epoch": 1.5850815850815851,
"grad_norm": 0.3986181417508742,
"learning_rate": 4.178658280627326e-05,
"loss": 0.484,
"num_tokens": 890002588.0,
"step": 3400
},
{
"epoch": 1.5874125874125875,
"grad_norm": 0.49931340147631254,
"learning_rate": 4.175976566250136e-05,
"loss": 0.484,
"num_tokens": 891313308.0,
"step": 3405
},
{
"epoch": 1.5897435897435899,
"grad_norm": 0.45433476497736364,
"learning_rate": 4.173291462072098e-05,
"loss": 0.4618,
"num_tokens": 892617089.0,
"step": 3410
},
{
"epoch": 1.5920745920745922,
"grad_norm": 0.384524852468567,
"learning_rate": 4.170602974476184e-05,
"loss": 0.468,
"num_tokens": 893927809.0,
"step": 3415
},
{
"epoch": 1.5944055944055944,
"grad_norm": 0.3807586219393433,
"learning_rate": 4.167911109853407e-05,
"loss": 0.4771,
"num_tokens": 895238529.0,
"step": 3420
},
{
"epoch": 1.5967365967365967,
"grad_norm": 0.4447754359804572,
"learning_rate": 4.1652158746028116e-05,
"loss": 0.4716,
"num_tokens": 896541317.0,
"step": 3425
},
{
"epoch": 1.599067599067599,
"grad_norm": 0.42809843952530335,
"learning_rate": 4.162517275131454e-05,
"loss": 0.4604,
"num_tokens": 897852037.0,
"step": 3430
},
{
"epoch": 1.6013986013986012,
"grad_norm": 0.38368322059871096,
"learning_rate": 4.159815317854384e-05,
"loss": 0.4722,
"num_tokens": 899157306.0,
"step": 3435
},
{
"epoch": 1.6037296037296036,
"grad_norm": 0.41349052270597325,
"learning_rate": 4.157110009194639e-05,
"loss": 0.4854,
"num_tokens": 900455090.0,
"step": 3440
},
{
"epoch": 1.606060606060606,
"grad_norm": 0.37303520873037516,
"learning_rate": 4.15440135558322e-05,
"loss": 0.4504,
"num_tokens": 901765810.0,
"step": 3445
},
{
"epoch": 1.6083916083916083,
"grad_norm": 0.35322625283306586,
"learning_rate": 4.151689363459078e-05,
"loss": 0.4829,
"num_tokens": 903076530.0,
"step": 3450
},
{
"epoch": 1.6107226107226107,
"grad_norm": 0.41613261144008123,
"learning_rate": 4.1489740392691054e-05,
"loss": 0.4642,
"num_tokens": 904387250.0,
"step": 3455
},
{
"epoch": 1.613053613053613,
"grad_norm": 0.36788455921816615,
"learning_rate": 4.1462553894681115e-05,
"loss": 0.4504,
"num_tokens": 905697970.0,
"step": 3460
},
{
"epoch": 1.6153846153846154,
"grad_norm": 0.3865061275286875,
"learning_rate": 4.1435334205188106e-05,
"loss": 0.4742,
"num_tokens": 907008690.0,
"step": 3465
},
{
"epoch": 1.6177156177156178,
"grad_norm": 0.4445705789673663,
"learning_rate": 4.1408081388918114e-05,
"loss": 0.4611,
"num_tokens": 908319410.0,
"step": 3470
},
{
"epoch": 1.6200466200466201,
"grad_norm": 0.3887208640873298,
"learning_rate": 4.138079551065593e-05,
"loss": 0.4561,
"num_tokens": 909630130.0,
"step": 3475
},
{
"epoch": 1.6223776223776225,
"grad_norm": 0.377258046210296,
"learning_rate": 4.135347663526496e-05,
"loss": 0.4745,
"num_tokens": 910940850.0,
"step": 3480
},
{
"epoch": 1.6247086247086249,
"grad_norm": 0.3678540515945634,
"learning_rate": 4.132612482768704e-05,
"loss": 0.4724,
"num_tokens": 912251570.0,
"step": 3485
},
{
"epoch": 1.627039627039627,
"grad_norm": 0.36120123975381446,
"learning_rate": 4.129874015294234e-05,
"loss": 0.4844,
"num_tokens": 913562290.0,
"step": 3490
},
{
"epoch": 1.6293706293706294,
"grad_norm": 0.409642194208649,
"learning_rate": 4.127132267612907e-05,
"loss": 0.4665,
"num_tokens": 914873010.0,
"step": 3495
},
{
"epoch": 1.6317016317016317,
"grad_norm": 0.34029196923903116,
"learning_rate": 4.1243872462423485e-05,
"loss": 0.4753,
"num_tokens": 916183730.0,
"step": 3500
},
{
"epoch": 1.6340326340326339,
"grad_norm": 0.407667262964334,
"learning_rate": 4.121638957707965e-05,
"loss": 0.4627,
"num_tokens": 917494450.0,
"step": 3505
},
{
"epoch": 1.6363636363636362,
"grad_norm": 0.35645199665718186,
"learning_rate": 4.118887408542927e-05,
"loss": 0.4705,
"num_tokens": 918791806.0,
"step": 3510
},
{
"epoch": 1.6386946386946386,
"grad_norm": 0.37596379435936367,
"learning_rate": 4.11613260528816e-05,
"loss": 0.4591,
"num_tokens": 920102526.0,
"step": 3515
},
{
"epoch": 1.641025641025641,
"grad_norm": 0.4320558743649901,
"learning_rate": 4.1133745544923236e-05,
"loss": 0.456,
"num_tokens": 921413246.0,
"step": 3520
},
{
"epoch": 1.6433566433566433,
"grad_norm": 0.4341288373141138,
"learning_rate": 4.1106132627117956e-05,
"loss": 0.4748,
"num_tokens": 922715595.0,
"step": 3525
},
{
"epoch": 1.6456876456876457,
"grad_norm": 0.41676180005636126,
"learning_rate": 4.107848736510659e-05,
"loss": 0.4575,
"num_tokens": 924026315.0,
"step": 3530
},
{
"epoch": 1.648018648018648,
"grad_norm": 0.40802471968176357,
"learning_rate": 4.105080982460687e-05,
"loss": 0.4628,
"num_tokens": 925337035.0,
"step": 3535
},
{
"epoch": 1.6503496503496504,
"grad_norm": 0.4507954091649136,
"learning_rate": 4.102310007141324e-05,
"loss": 0.4837,
"num_tokens": 926631017.0,
"step": 3540
},
{
"epoch": 1.6526806526806528,
"grad_norm": 0.4050589740560033,
"learning_rate": 4.0995358171396747e-05,
"loss": 0.4736,
"num_tokens": 927941737.0,
"step": 3545
},
{
"epoch": 1.6550116550116551,
"grad_norm": 0.42380553595370807,
"learning_rate": 4.0967584190504825e-05,
"loss": 0.4734,
"num_tokens": 929252457.0,
"step": 3550
},
{
"epoch": 1.6573426573426573,
"grad_norm": 0.3958904686940088,
"learning_rate": 4.0939778194761196e-05,
"loss": 0.488,
"num_tokens": 930563177.0,
"step": 3555
},
{
"epoch": 1.6596736596736597,
"grad_norm": 0.43394413515654184,
"learning_rate": 4.091194025026567e-05,
"loss": 0.4692,
"num_tokens": 931860496.0,
"step": 3560
},
{
"epoch": 1.662004662004662,
"grad_norm": 0.4589951627434292,
"learning_rate": 4.0884070423194007e-05,
"loss": 0.4805,
"num_tokens": 933171216.0,
"step": 3565
},
{
"epoch": 1.6643356643356644,
"grad_norm": 0.4150754775824352,
"learning_rate": 4.085616877979776e-05,
"loss": 0.4628,
"num_tokens": 934465598.0,
"step": 3570
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.40687005175901875,
"learning_rate": 4.0828235386404124e-05,
"loss": 0.4564,
"num_tokens": 935774296.0,
"step": 3575
},
{
"epoch": 1.6689976689976689,
"grad_norm": 0.3856327268150972,
"learning_rate": 4.0800270309415756e-05,
"loss": 0.4635,
"num_tokens": 937085016.0,
"step": 3580
},
{
"epoch": 1.6713286713286712,
"grad_norm": 0.43509202476957415,
"learning_rate": 4.077227361531063e-05,
"loss": 0.4708,
"num_tokens": 938395736.0,
"step": 3585
},
{
"epoch": 1.6736596736596736,
"grad_norm": 0.37887891871157525,
"learning_rate": 4.07442453706419e-05,
"loss": 0.4775,
"num_tokens": 939706456.0,
"step": 3590
},
{
"epoch": 1.675990675990676,
"grad_norm": 0.35092056020696705,
"learning_rate": 4.07161856420377e-05,
"loss": 0.4642,
"num_tokens": 941017176.0,
"step": 3595
},
{
"epoch": 1.6783216783216783,
"grad_norm": 0.3618238243762927,
"learning_rate": 4.068809449620101e-05,
"loss": 0.4728,
"num_tokens": 942319215.0,
"step": 3600
},
{
"epoch": 1.6806526806526807,
"grad_norm": 0.3850438304208137,
"learning_rate": 4.065997199990951e-05,
"loss": 0.4823,
"num_tokens": 943629935.0,
"step": 3605
},
{
"epoch": 1.682983682983683,
"grad_norm": 0.3981622077125061,
"learning_rate": 4.063181822001538e-05,
"loss": 0.4605,
"num_tokens": 944940655.0,
"step": 3610
},
{
"epoch": 1.6853146853146854,
"grad_norm": 0.4200393473429356,
"learning_rate": 4.060363322344518e-05,
"loss": 0.4816,
"num_tokens": 946251375.0,
"step": 3615
},
{
"epoch": 1.6876456876456878,
"grad_norm": 0.389339620745983,
"learning_rate": 4.05754170771997e-05,
"loss": 0.4714,
"num_tokens": 947562095.0,
"step": 3620
},
{
"epoch": 1.68997668997669,
"grad_norm": 0.4117742280678568,
"learning_rate": 4.054716984835372e-05,
"loss": 0.4695,
"num_tokens": 948872815.0,
"step": 3625
},
{
"epoch": 1.6923076923076923,
"grad_norm": 0.4245786229670795,
"learning_rate": 4.051889160405598e-05,
"loss": 0.4574,
"num_tokens": 950183535.0,
"step": 3630
},
{
"epoch": 1.6946386946386947,
"grad_norm": 0.4777093865988957,
"learning_rate": 4.0490582411528896e-05,
"loss": 0.4572,
"num_tokens": 951494255.0,
"step": 3635
},
{
"epoch": 1.696969696969697,
"grad_norm": 0.3924099164372027,
"learning_rate": 4.0462242338068476e-05,
"loss": 0.4651,
"num_tokens": 952804975.0,
"step": 3640
},
{
"epoch": 1.6993006993006992,
"grad_norm": 0.38987930954143907,
"learning_rate": 4.0433871451044136e-05,
"loss": 0.4873,
"num_tokens": 954115695.0,
"step": 3645
},
{
"epoch": 1.7016317016317015,
"grad_norm": 0.41461621362174245,
"learning_rate": 4.040546981789854e-05,
"loss": 0.4748,
"num_tokens": 955426415.0,
"step": 3650
},
{
"epoch": 1.7039627039627039,
"grad_norm": 0.3425572462390038,
"learning_rate": 4.0377037506147436e-05,
"loss": 0.4858,
"num_tokens": 956737135.0,
"step": 3655
},
{
"epoch": 1.7062937062937062,
"grad_norm": 0.35362235462359226,
"learning_rate": 4.0348574583379506e-05,
"loss": 0.4515,
"num_tokens": 958047855.0,
"step": 3660
},
{
"epoch": 1.7086247086247086,
"grad_norm": 0.4269439890212748,
"learning_rate": 4.032008111725619e-05,
"loss": 0.478,
"num_tokens": 959358575.0,
"step": 3665
},
{
"epoch": 1.710955710955711,
"grad_norm": 0.3975233814611868,
"learning_rate": 4.029155717551156e-05,
"loss": 0.4682,
"num_tokens": 960647821.0,
"step": 3670
},
{
"epoch": 1.7132867132867133,
"grad_norm": 0.42538111394289835,
"learning_rate": 4.026300282595211e-05,
"loss": 0.4821,
"num_tokens": 961958541.0,
"step": 3675
},
{
"epoch": 1.7156177156177157,
"grad_norm": 0.44368078414892886,
"learning_rate": 4.023441813645662e-05,
"loss": 0.4629,
"num_tokens": 963269261.0,
"step": 3680
},
{
"epoch": 1.717948717948718,
"grad_norm": 0.4096743909582884,
"learning_rate": 4.0205803174975996e-05,
"loss": 0.4678,
"num_tokens": 964579981.0,
"step": 3685
},
{
"epoch": 1.7202797202797204,
"grad_norm": 0.36149619103114544,
"learning_rate": 4.0177158009533136e-05,
"loss": 0.4661,
"num_tokens": 965885806.0,
"step": 3690
},
{
"epoch": 1.7226107226107226,
"grad_norm": 0.3883498094552971,
"learning_rate": 4.014848270822268e-05,
"loss": 0.4679,
"num_tokens": 967191092.0,
"step": 3695
},
{
"epoch": 1.724941724941725,
"grad_norm": 0.3874435629285386,
"learning_rate": 4.011977733921096e-05,
"loss": 0.4613,
"num_tokens": 968501812.0,
"step": 3700
},
{
"epoch": 1.7272727272727273,
"grad_norm": 0.40622823181165385,
"learning_rate": 4.009104197073575e-05,
"loss": 0.4813,
"num_tokens": 969796954.0,
"step": 3705
},
{
"epoch": 1.7296037296037297,
"grad_norm": 0.3576403196641578,
"learning_rate": 4.0062276671106154e-05,
"loss": 0.456,
"num_tokens": 971107674.0,
"step": 3710
},
{
"epoch": 1.7319347319347318,
"grad_norm": 0.42428688878098647,
"learning_rate": 4.0033481508702425e-05,
"loss": 0.4771,
"num_tokens": 972418394.0,
"step": 3715
},
{
"epoch": 1.7342657342657342,
"grad_norm": 0.398776005644926,
"learning_rate": 4.00046565519758e-05,
"loss": 0.4826,
"num_tokens": 973729114.0,
"step": 3720
},
{
"epoch": 1.7365967365967365,
"grad_norm": 0.3803669387697352,
"learning_rate": 3.997580186944835e-05,
"loss": 0.4817,
"num_tokens": 975039834.0,
"step": 3725
},
{
"epoch": 1.7389277389277389,
"grad_norm": 0.3668247169705026,
"learning_rate": 3.994691752971282e-05,
"loss": 0.4671,
"num_tokens": 976350554.0,
"step": 3730
},
{
"epoch": 1.7412587412587412,
"grad_norm": 0.3618934392527535,
"learning_rate": 3.991800360143241e-05,
"loss": 0.475,
"num_tokens": 977661274.0,
"step": 3735
},
{
"epoch": 1.7435897435897436,
"grad_norm": 0.3616956268703821,
"learning_rate": 3.988906015334073e-05,
"loss": 0.4595,
"num_tokens": 978971994.0,
"step": 3740
},
{
"epoch": 1.745920745920746,
"grad_norm": 0.3930597565056608,
"learning_rate": 3.986008725424148e-05,
"loss": 0.465,
"num_tokens": 980282714.0,
"step": 3745
},
{
"epoch": 1.7482517482517483,
"grad_norm": 0.39536789600867117,
"learning_rate": 3.983108497300844e-05,
"loss": 0.4701,
"num_tokens": 981585502.0,
"step": 3750
},
{
"epoch": 1.7505827505827507,
"grad_norm": 0.4099848875497283,
"learning_rate": 3.9802053378585205e-05,
"loss": 0.4751,
"num_tokens": 982896222.0,
"step": 3755
},
{
"epoch": 1.752913752913753,
"grad_norm": 0.3722702285436598,
"learning_rate": 3.977299253998504e-05,
"loss": 0.4738,
"num_tokens": 984206942.0,
"step": 3760
},
{
"epoch": 1.7552447552447552,
"grad_norm": 0.3970652311409931,
"learning_rate": 3.974390252629078e-05,
"loss": 0.4671,
"num_tokens": 985517662.0,
"step": 3765
},
{
"epoch": 1.7575757575757576,
"grad_norm": 0.38418584202250494,
"learning_rate": 3.971478340665455e-05,
"loss": 0.478,
"num_tokens": 986828382.0,
"step": 3770
},
{
"epoch": 1.75990675990676,
"grad_norm": 0.4897203405840543,
"learning_rate": 3.968563525029771e-05,
"loss": 0.4758,
"num_tokens": 988139102.0,
"step": 3775
},
{
"epoch": 1.762237762237762,
"grad_norm": 0.4708593559155846,
"learning_rate": 3.965645812651063e-05,
"loss": 0.4872,
"num_tokens": 989444809.0,
"step": 3780
},
{
"epoch": 1.7645687645687644,
"grad_norm": 0.4215417171128016,
"learning_rate": 3.9627252104652535e-05,
"loss": 0.4591,
"num_tokens": 990753790.0,
"step": 3785
},
{
"epoch": 1.7668997668997668,
"grad_norm": 0.40405280271269434,
"learning_rate": 3.959801725415136e-05,
"loss": 0.4648,
"num_tokens": 992054396.0,
"step": 3790
},
{
"epoch": 1.7692307692307692,
"grad_norm": 0.41393808447009156,
"learning_rate": 3.9568753644503566e-05,
"loss": 0.4587,
"num_tokens": 993352731.0,
"step": 3795
},
{
"epoch": 1.7715617715617715,
"grad_norm": 0.37732142695063003,
"learning_rate": 3.9539461345273956e-05,
"loss": 0.4737,
"num_tokens": 994656385.0,
"step": 3800
},
{
"epoch": 1.7738927738927739,
"grad_norm": 0.528280250373434,
"learning_rate": 3.951014042609559e-05,
"loss": 0.4702,
"num_tokens": 995967105.0,
"step": 3805
},
{
"epoch": 1.7762237762237763,
"grad_norm": 0.4213342625415021,
"learning_rate": 3.9480790956669486e-05,
"loss": 0.4791,
"num_tokens": 997267208.0,
"step": 3810
},
{
"epoch": 1.7785547785547786,
"grad_norm": 0.3698430883873798,
"learning_rate": 3.9451413006764604e-05,
"loss": 0.4653,
"num_tokens": 998577928.0,
"step": 3815
},
{
"epoch": 1.780885780885781,
"grad_norm": 0.414231394973144,
"learning_rate": 3.942200664621756e-05,
"loss": 0.4687,
"num_tokens": 999888162.0,
"step": 3820
},
{
"epoch": 1.7832167832167833,
"grad_norm": 0.4350040599172444,
"learning_rate": 3.939257194493253e-05,
"loss": 0.4513,
"num_tokens": 1001186063.0,
"step": 3825
},
{
"epoch": 1.7855477855477857,
"grad_norm": 0.4152803596705445,
"learning_rate": 3.936310897288104e-05,
"loss": 0.4562,
"num_tokens": 1002496783.0,
"step": 3830
},
{
"epoch": 1.7878787878787878,
"grad_norm": 0.41881419362638367,
"learning_rate": 3.933361780010185e-05,
"loss": 0.4646,
"num_tokens": 1003807503.0,
"step": 3835
},
{
"epoch": 1.7902097902097902,
"grad_norm": 0.38522642875173096,
"learning_rate": 3.930409849670073e-05,
"loss": 0.4596,
"num_tokens": 1005118223.0,
"step": 3840
},
{
"epoch": 1.7925407925407926,
"grad_norm": 0.34873890955647613,
"learning_rate": 3.927455113285035e-05,
"loss": 0.4559,
"num_tokens": 1006428943.0,
"step": 3845
},
{
"epoch": 1.7948717948717947,
"grad_norm": 0.39305628827987876,
"learning_rate": 3.924497577879005e-05,
"loss": 0.4647,
"num_tokens": 1007739663.0,
"step": 3850
},
{
"epoch": 1.797202797202797,
"grad_norm": 0.3854549232040684,
"learning_rate": 3.9215372504825735e-05,
"loss": 0.4737,
"num_tokens": 1009050383.0,
"step": 3855
},
{
"epoch": 1.7995337995337994,
"grad_norm": 0.40079016381501426,
"learning_rate": 3.9185741381329664e-05,
"loss": 0.4792,
"num_tokens": 1010361103.0,
"step": 3860
},
{
"epoch": 1.8018648018648018,
"grad_norm": 0.40266606077066175,
"learning_rate": 3.915608247874032e-05,
"loss": 0.487,
"num_tokens": 1011671823.0,
"step": 3865
},
{
"epoch": 1.8041958041958042,
"grad_norm": 0.3510993917665707,
"learning_rate": 3.912639586756221e-05,
"loss": 0.4514,
"num_tokens": 1012982543.0,
"step": 3870
},
{
"epoch": 1.8065268065268065,
"grad_norm": 0.3907134061685135,
"learning_rate": 3.9096681618365686e-05,
"loss": 0.447,
"num_tokens": 1014293263.0,
"step": 3875
},
{
"epoch": 1.808857808857809,
"grad_norm": 0.3708262682688497,
"learning_rate": 3.9066939801786836e-05,
"loss": 0.4765,
"num_tokens": 1015592603.0,
"step": 3880
},
{
"epoch": 1.8111888111888113,
"grad_norm": 0.44553625905509964,
"learning_rate": 3.903717048852728e-05,
"loss": 0.4709,
"num_tokens": 1016903323.0,
"step": 3885
},
{
"epoch": 1.8135198135198136,
"grad_norm": 0.5102907463594831,
"learning_rate": 3.900737374935396e-05,
"loss": 0.477,
"num_tokens": 1018214043.0,
"step": 3890
},
{
"epoch": 1.815850815850816,
"grad_norm": 0.4103843094724411,
"learning_rate": 3.897754965509908e-05,
"loss": 0.4557,
"num_tokens": 1019524763.0,
"step": 3895
},
{
"epoch": 1.8181818181818183,
"grad_norm": 0.3973998481652665,
"learning_rate": 3.8947698276659806e-05,
"loss": 0.4606,
"num_tokens": 1020835483.0,
"step": 3900
},
{
"epoch": 1.8205128205128205,
"grad_norm": 0.4071222548172448,
"learning_rate": 3.8917819684998215e-05,
"loss": 0.4734,
"num_tokens": 1022146203.0,
"step": 3905
},
{
"epoch": 1.8228438228438228,
"grad_norm": 0.38953013965021704,
"learning_rate": 3.888791395114103e-05,
"loss": 0.4481,
"num_tokens": 1023456923.0,
"step": 3910
},
{
"epoch": 1.8251748251748252,
"grad_norm": 0.34776150147238477,
"learning_rate": 3.885798114617954e-05,
"loss": 0.4653,
"num_tokens": 1024767643.0,
"step": 3915
},
{
"epoch": 1.8275058275058274,
"grad_norm": 0.43412541332007054,
"learning_rate": 3.8828021341269363e-05,
"loss": 0.4696,
"num_tokens": 1026078363.0,
"step": 3920
},
{
"epoch": 1.8298368298368297,
"grad_norm": 0.39408936405816397,
"learning_rate": 3.879803460763029e-05,
"loss": 0.471,
"num_tokens": 1027389083.0,
"step": 3925
},
{
"epoch": 1.832167832167832,
"grad_norm": 0.36162777389200856,
"learning_rate": 3.876802101654614e-05,
"loss": 0.4669,
"num_tokens": 1028699803.0,
"step": 3930
},
{
"epoch": 1.8344988344988344,
"grad_norm": 0.4060158965672199,
"learning_rate": 3.87379806393646e-05,
"loss": 0.4601,
"num_tokens": 1030010523.0,
"step": 3935
},
{
"epoch": 1.8368298368298368,
"grad_norm": 0.40896752020498217,
"learning_rate": 3.870791354749698e-05,
"loss": 0.4688,
"num_tokens": 1031319258.0,
"step": 3940
},
{
"epoch": 1.8391608391608392,
"grad_norm": 0.44359916481371037,
"learning_rate": 3.867781981241814e-05,
"loss": 0.4889,
"num_tokens": 1032629978.0,
"step": 3945
},
{
"epoch": 1.8414918414918415,
"grad_norm": 0.40506992750006354,
"learning_rate": 3.8647699505666265e-05,
"loss": 0.4477,
"num_tokens": 1033940698.0,
"step": 3950
},
{
"epoch": 1.843822843822844,
"grad_norm": 0.39121466623448464,
"learning_rate": 3.861755269884269e-05,
"loss": 0.462,
"num_tokens": 1035251418.0,
"step": 3955
},
{
"epoch": 1.8461538461538463,
"grad_norm": 0.3762364215121074,
"learning_rate": 3.8587379463611766e-05,
"loss": 0.4718,
"num_tokens": 1036562138.0,
"step": 3960
},
{
"epoch": 1.8484848484848486,
"grad_norm": 0.38276793618104127,
"learning_rate": 3.855717987170065e-05,
"loss": 0.4694,
"num_tokens": 1037868363.0,
"step": 3965
},
{
"epoch": 1.8508158508158508,
"grad_norm": 0.3993712479498845,
"learning_rate": 3.852695399489917e-05,
"loss": 0.4632,
"num_tokens": 1039179083.0,
"step": 3970
},
{
"epoch": 1.8531468531468531,
"grad_norm": 0.3653526687250586,
"learning_rate": 3.849670190505963e-05,
"loss": 0.458,
"num_tokens": 1040489803.0,
"step": 3975
},
{
"epoch": 1.8554778554778555,
"grad_norm": 0.35478532717038763,
"learning_rate": 3.846642367409663e-05,
"loss": 0.4773,
"num_tokens": 1041789042.0,
"step": 3980
},
{
"epoch": 1.8578088578088578,
"grad_norm": 0.4152595626409347,
"learning_rate": 3.843611937398695e-05,
"loss": 0.4734,
"num_tokens": 1043099762.0,
"step": 3985
},
{
"epoch": 1.86013986013986,
"grad_norm": 0.3697811329438798,
"learning_rate": 3.840578907676933e-05,
"loss": 0.4603,
"num_tokens": 1044410482.0,
"step": 3990
},
{
"epoch": 1.8624708624708624,
"grad_norm": 0.3673405078658897,
"learning_rate": 3.8375432854544265e-05,
"loss": 0.468,
"num_tokens": 1045721202.0,
"step": 3995
},
{
"epoch": 1.8648018648018647,
"grad_norm": 0.3642674759468013,
"learning_rate": 3.834505077947395e-05,
"loss": 0.4679,
"num_tokens": 1047031922.0,
"step": 4000
},
{
"epoch": 1.867132867132867,
"grad_norm": 0.39831494968982045,
"learning_rate": 3.831464292378199e-05,
"loss": 0.4603,
"num_tokens": 1048342642.0,
"step": 4005
},
{
"epoch": 1.8694638694638694,
"grad_norm": 0.47030492710960053,
"learning_rate": 3.828420935975328e-05,
"loss": 0.4718,
"num_tokens": 1049647085.0,
"step": 4010
},
{
"epoch": 1.8717948717948718,
"grad_norm": 0.34963450633338594,
"learning_rate": 3.825375015973383e-05,
"loss": 0.4582,
"num_tokens": 1050957805.0,
"step": 4015
},
{
"epoch": 1.8741258741258742,
"grad_norm": 0.38080688538607055,
"learning_rate": 3.822326539613061e-05,
"loss": 0.4686,
"num_tokens": 1052268525.0,
"step": 4020
},
{
"epoch": 1.8764568764568765,
"grad_norm": 0.3878968790543752,
"learning_rate": 3.819275514141134e-05,
"loss": 0.4718,
"num_tokens": 1053579245.0,
"step": 4025
},
{
"epoch": 1.878787878787879,
"grad_norm": 0.4444788783917722,
"learning_rate": 3.816221946810434e-05,
"loss": 0.449,
"num_tokens": 1054889965.0,
"step": 4030
},
{
"epoch": 1.8811188811188813,
"grad_norm": 0.3855436235980361,
"learning_rate": 3.813165844879835e-05,
"loss": 0.4663,
"num_tokens": 1056200685.0,
"step": 4035
},
{
"epoch": 1.8834498834498834,
"grad_norm": 0.41749033912855776,
"learning_rate": 3.8101072156142376e-05,
"loss": 0.4721,
"num_tokens": 1057507296.0,
"step": 4040
},
{
"epoch": 1.8857808857808858,
"grad_norm": 0.3325680793207379,
"learning_rate": 3.8070460662845495e-05,
"loss": 0.4685,
"num_tokens": 1058818016.0,
"step": 4045
},
{
"epoch": 1.8881118881118881,
"grad_norm": 0.36642497795228024,
"learning_rate": 3.80398240416767e-05,
"loss": 0.4513,
"num_tokens": 1060123820.0,
"step": 4050
},
{
"epoch": 1.8904428904428905,
"grad_norm": 0.3301418844159666,
"learning_rate": 3.800916236546468e-05,
"loss": 0.4734,
"num_tokens": 1061434540.0,
"step": 4055
},
{
"epoch": 1.8927738927738926,
"grad_norm": 0.34768445537128323,
"learning_rate": 3.797847570709775e-05,
"loss": 0.4723,
"num_tokens": 1062745260.0,
"step": 4060
},
{
"epoch": 1.895104895104895,
"grad_norm": 0.347789179412769,
"learning_rate": 3.794776413952354e-05,
"loss": 0.4626,
"num_tokens": 1064055980.0,
"step": 4065
},
{
"epoch": 1.8974358974358974,
"grad_norm": 0.3462234651976999,
"learning_rate": 3.7917027735748956e-05,
"loss": 0.4607,
"num_tokens": 1065366509.0,
"step": 4070
},
{
"epoch": 1.8997668997668997,
"grad_norm": 0.38708966741877066,
"learning_rate": 3.788626656883991e-05,
"loss": 0.4826,
"num_tokens": 1066677229.0,
"step": 4075
},
{
"epoch": 1.902097902097902,
"grad_norm": 0.38785225263682604,
"learning_rate": 3.785548071192117e-05,
"loss": 0.4663,
"num_tokens": 1067987949.0,
"step": 4080
},
{
"epoch": 1.9044289044289044,
"grad_norm": 0.43056969345933427,
"learning_rate": 3.782467023817623e-05,
"loss": 0.4647,
"num_tokens": 1069298669.0,
"step": 4085
},
{
"epoch": 1.9067599067599068,
"grad_norm": 0.3412833063088448,
"learning_rate": 3.7793835220847076e-05,
"loss": 0.4678,
"num_tokens": 1070609389.0,
"step": 4090
},
{
"epoch": 1.9090909090909092,
"grad_norm": 0.4184934857430221,
"learning_rate": 3.776297573323406e-05,
"loss": 0.474,
"num_tokens": 1071920109.0,
"step": 4095
},
{
"epoch": 1.9114219114219115,
"grad_norm": 0.44954773016733546,
"learning_rate": 3.7732091848695686e-05,
"loss": 0.4647,
"num_tokens": 1073230829.0,
"step": 4100
},
{
"epoch": 1.913752913752914,
"grad_norm": 0.4771060848896429,
"learning_rate": 3.770118364064846e-05,
"loss": 0.4743,
"num_tokens": 1074541549.0,
"step": 4105
},
{
"epoch": 1.916083916083916,
"grad_norm": 0.3945529571970546,
"learning_rate": 3.767025118256672e-05,
"loss": 0.4691,
"num_tokens": 1075852269.0,
"step": 4110
},
{
"epoch": 1.9184149184149184,
"grad_norm": 0.40375250793365847,
"learning_rate": 3.7639294547982416e-05,
"loss": 0.4699,
"num_tokens": 1077160473.0,
"step": 4115
},
{
"epoch": 1.9207459207459208,
"grad_norm": 0.3682666412768804,
"learning_rate": 3.760831381048503e-05,
"loss": 0.4396,
"num_tokens": 1078471193.0,
"step": 4120
},
{
"epoch": 1.9230769230769231,
"grad_norm": 0.40270380854681137,
"learning_rate": 3.757730904372127e-05,
"loss": 0.4655,
"num_tokens": 1079781913.0,
"step": 4125
},
{
"epoch": 1.9254079254079253,
"grad_norm": 0.4091540171139127,
"learning_rate": 3.754628032139502e-05,
"loss": 0.4676,
"num_tokens": 1081092633.0,
"step": 4130
},
{
"epoch": 1.9277389277389276,
"grad_norm": 0.37723565465509623,
"learning_rate": 3.75152277172671e-05,
"loss": 0.458,
"num_tokens": 1082403353.0,
"step": 4135
},
{
"epoch": 1.93006993006993,
"grad_norm": 0.35463116725754923,
"learning_rate": 3.7484151305155066e-05,
"loss": 0.4601,
"num_tokens": 1083697889.0,
"step": 4140
},
{
"epoch": 1.9324009324009324,
"grad_norm": 0.4440845165505597,
"learning_rate": 3.7453051158933124e-05,
"loss": 0.4635,
"num_tokens": 1084995264.0,
"step": 4145
},
{
"epoch": 1.9347319347319347,
"grad_norm": 0.3844563537675778,
"learning_rate": 3.742192735253186e-05,
"loss": 0.4486,
"num_tokens": 1086305984.0,
"step": 4150
},
{
"epoch": 1.937062937062937,
"grad_norm": 0.3653521354637281,
"learning_rate": 3.739077995993811e-05,
"loss": 0.4609,
"num_tokens": 1087616704.0,
"step": 4155
},
{
"epoch": 1.9393939393939394,
"grad_norm": 0.3338487174568724,
"learning_rate": 3.735960905519482e-05,
"loss": 0.4475,
"num_tokens": 1088926295.0,
"step": 4160
},
{
"epoch": 1.9417249417249418,
"grad_norm": 0.3624856781426835,
"learning_rate": 3.732841471240076e-05,
"loss": 0.4515,
"num_tokens": 1090237015.0,
"step": 4165
},
{
"epoch": 1.9440559440559442,
"grad_norm": 0.34674172363160854,
"learning_rate": 3.729719700571046e-05,
"loss": 0.4581,
"num_tokens": 1091547735.0,
"step": 4170
},
{
"epoch": 1.9463869463869465,
"grad_norm": 0.3634515054857927,
"learning_rate": 3.726595600933398e-05,
"loss": 0.4614,
"num_tokens": 1092858455.0,
"step": 4175
},
{
"epoch": 1.9487179487179487,
"grad_norm": 0.38834463875662345,
"learning_rate": 3.7234691797536746e-05,
"loss": 0.4655,
"num_tokens": 1094169175.0,
"step": 4180
},
{
"epoch": 1.951048951048951,
"grad_norm": 0.4040443266819196,
"learning_rate": 3.720340444463939e-05,
"loss": 0.4603,
"num_tokens": 1095479895.0,
"step": 4185
},
{
"epoch": 1.9533799533799534,
"grad_norm": 0.37150785338102815,
"learning_rate": 3.7172094025017504e-05,
"loss": 0.4644,
"num_tokens": 1096790615.0,
"step": 4190
},
{
"epoch": 1.9557109557109555,
"grad_norm": 0.38391418767198954,
"learning_rate": 3.714076061310157e-05,
"loss": 0.47,
"num_tokens": 1098101335.0,
"step": 4195
},
{
"epoch": 1.958041958041958,
"grad_norm": 0.342628715510528,
"learning_rate": 3.710940428337668e-05,
"loss": 0.4598,
"num_tokens": 1099412055.0,
"step": 4200
},
{
"epoch": 1.9603729603729603,
"grad_norm": 0.35615594682543533,
"learning_rate": 3.7078025110382455e-05,
"loss": 0.453,
"num_tokens": 1100722775.0,
"step": 4205
},
{
"epoch": 1.9627039627039626,
"grad_norm": 0.34603620652492556,
"learning_rate": 3.704662316871276e-05,
"loss": 0.4821,
"num_tokens": 1102033495.0,
"step": 4210
},
{
"epoch": 1.965034965034965,
"grad_norm": 0.3544349334013842,
"learning_rate": 3.7015198533015633e-05,
"loss": 0.4739,
"num_tokens": 1103344215.0,
"step": 4215
},
{
"epoch": 1.9673659673659674,
"grad_norm": 0.3717399478589991,
"learning_rate": 3.6983751277993045e-05,
"loss": 0.4683,
"num_tokens": 1104654935.0,
"step": 4220
},
{
"epoch": 1.9696969696969697,
"grad_norm": 0.3807350804066158,
"learning_rate": 3.6952281478400715e-05,
"loss": 0.4721,
"num_tokens": 1105965655.0,
"step": 4225
},
{
"epoch": 1.972027972027972,
"grad_norm": 0.36855589171035036,
"learning_rate": 3.692078920904799e-05,
"loss": 0.4701,
"num_tokens": 1107267769.0,
"step": 4230
},
{
"epoch": 1.9743589743589745,
"grad_norm": 0.31575827164633735,
"learning_rate": 3.688927454479763e-05,
"loss": 0.4512,
"num_tokens": 1108578489.0,
"step": 4235
},
{
"epoch": 1.9766899766899768,
"grad_norm": 0.4085486617161448,
"learning_rate": 3.6857737560565584e-05,
"loss": 0.4624,
"num_tokens": 1109889209.0,
"step": 4240
},
{
"epoch": 1.9790209790209792,
"grad_norm": 0.33727800612662295,
"learning_rate": 3.682617833132092e-05,
"loss": 0.4427,
"num_tokens": 1111190573.0,
"step": 4245
},
{
"epoch": 1.9813519813519813,
"grad_norm": 0.4098762469941274,
"learning_rate": 3.679459693208555e-05,
"loss": 0.4656,
"num_tokens": 1112501293.0,
"step": 4250
},
{
"epoch": 1.9836829836829837,
"grad_norm": 0.3985077104938252,
"learning_rate": 3.6762993437934094e-05,
"loss": 0.4585,
"num_tokens": 1113812013.0,
"step": 4255
},
{
"epoch": 1.986013986013986,
"grad_norm": 0.37492705859719844,
"learning_rate": 3.673136792399371e-05,
"loss": 0.4589,
"num_tokens": 1115112034.0,
"step": 4260
},
{
"epoch": 1.9883449883449882,
"grad_norm": 0.4337730376679739,
"learning_rate": 3.6699720465443885e-05,
"loss": 0.471,
"num_tokens": 1116422754.0,
"step": 4265
},
{
"epoch": 1.9906759906759905,
"grad_norm": 0.36853486357793097,
"learning_rate": 3.6668051137516275e-05,
"loss": 0.4793,
"num_tokens": 1117733474.0,
"step": 4270
},
{
"epoch": 1.993006993006993,
"grad_norm": 0.4408364026741728,
"learning_rate": 3.663636001549452e-05,
"loss": 0.4637,
"num_tokens": 1119044194.0,
"step": 4275
},
{
"epoch": 1.9953379953379953,
"grad_norm": 0.4221479195643412,
"learning_rate": 3.660464717471408e-05,
"loss": 0.4608,
"num_tokens": 1120354914.0,
"step": 4280
},
{
"epoch": 1.9976689976689976,
"grad_norm": 0.37516116056322824,
"learning_rate": 3.6572912690562045e-05,
"loss": 0.4605,
"num_tokens": 1121665634.0,
"step": 4285
},
{
"epoch": 2.0,
"grad_norm": 0.4487985158857049,
"learning_rate": 3.654115663847694e-05,
"loss": 0.4591,
"num_tokens": 1122976354.0,
"step": 4290
},
{
"epoch": 2.0023310023310024,
"grad_norm": 0.41022871529290744,
"learning_rate": 3.650937909394857e-05,
"loss": 0.4071,
"num_tokens": 1124287074.0,
"step": 4295
},
{
"epoch": 2.0046620046620047,
"grad_norm": 0.38634036595793075,
"learning_rate": 3.6477580132517833e-05,
"loss": 0.4082,
"num_tokens": 1125594773.0,
"step": 4300
},
{
"epoch": 2.006993006993007,
"grad_norm": 0.3650522398368059,
"learning_rate": 3.644575982977655e-05,
"loss": 0.4186,
"num_tokens": 1126905493.0,
"step": 4305
},
{
"epoch": 2.0093240093240095,
"grad_norm": 0.3719826700428957,
"learning_rate": 3.641391826136724e-05,
"loss": 0.4182,
"num_tokens": 1128216213.0,
"step": 4310
},
{
"epoch": 2.011655011655012,
"grad_norm": 0.36321886526329594,
"learning_rate": 3.6382055502983e-05,
"loss": 0.416,
"num_tokens": 1129526933.0,
"step": 4315
},
{
"epoch": 2.013986013986014,
"grad_norm": 0.38670247193489393,
"learning_rate": 3.63501716303673e-05,
"loss": 0.4081,
"num_tokens": 1130837653.0,
"step": 4320
},
{
"epoch": 2.016317016317016,
"grad_norm": 0.4267019144244097,
"learning_rate": 3.631826671931379e-05,
"loss": 0.4238,
"num_tokens": 1132148373.0,
"step": 4325
},
{
"epoch": 2.0186480186480185,
"grad_norm": 0.3862354868842951,
"learning_rate": 3.628634084566615e-05,
"loss": 0.4009,
"num_tokens": 1133459093.0,
"step": 4330
},
{
"epoch": 2.020979020979021,
"grad_norm": 0.3553448746301145,
"learning_rate": 3.625439408531787e-05,
"loss": 0.4141,
"num_tokens": 1134757659.0,
"step": 4335
},
{
"epoch": 2.023310023310023,
"grad_norm": 0.3988598508713757,
"learning_rate": 3.62224265142121e-05,
"loss": 0.4054,
"num_tokens": 1136052195.0,
"step": 4340
},
{
"epoch": 2.0256410256410255,
"grad_norm": 0.4052184398184166,
"learning_rate": 3.6190438208341484e-05,
"loss": 0.4113,
"num_tokens": 1137362915.0,
"step": 4345
},
{
"epoch": 2.027972027972028,
"grad_norm": 0.3848516871568244,
"learning_rate": 3.615842924374791e-05,
"loss": 0.4153,
"num_tokens": 1138673635.0,
"step": 4350
},
{
"epoch": 2.0303030303030303,
"grad_norm": 0.36445962395799664,
"learning_rate": 3.6126399696522413e-05,
"loss": 0.4067,
"num_tokens": 1139984355.0,
"step": 4355
},
{
"epoch": 2.0326340326340326,
"grad_norm": 0.4070066068730233,
"learning_rate": 3.609434964280495e-05,
"loss": 0.4114,
"num_tokens": 1141285456.0,
"step": 4360
},
{
"epoch": 2.034965034965035,
"grad_norm": 0.3864432857902367,
"learning_rate": 3.6062279158784205e-05,
"loss": 0.4047,
"num_tokens": 1142596176.0,
"step": 4365
},
{
"epoch": 2.0372960372960374,
"grad_norm": 0.3673852417907459,
"learning_rate": 3.603018832069744e-05,
"loss": 0.4178,
"num_tokens": 1143906896.0,
"step": 4370
},
{
"epoch": 2.0396270396270397,
"grad_norm": 0.37584930043751846,
"learning_rate": 3.599807720483034e-05,
"loss": 0.418,
"num_tokens": 1145217616.0,
"step": 4375
},
{
"epoch": 2.041958041958042,
"grad_norm": 0.36602579882002695,
"learning_rate": 3.5965945887516715e-05,
"loss": 0.4056,
"num_tokens": 1146528336.0,
"step": 4380
},
{
"epoch": 2.0442890442890445,
"grad_norm": 0.351202961985245,
"learning_rate": 3.593379444513848e-05,
"loss": 0.3902,
"num_tokens": 1147839056.0,
"step": 4385
},
{
"epoch": 2.046620046620047,
"grad_norm": 0.4105988426092805,
"learning_rate": 3.590162295412533e-05,
"loss": 0.3981,
"num_tokens": 1149142866.0,
"step": 4390
},
{
"epoch": 2.0489510489510487,
"grad_norm": 0.38250301137351694,
"learning_rate": 3.586943149095464e-05,
"loss": 0.4103,
"num_tokens": 1150453586.0,
"step": 4395
},
{
"epoch": 2.051282051282051,
"grad_norm": 0.3386603986144158,
"learning_rate": 3.5837220132151286e-05,
"loss": 0.4069,
"num_tokens": 1151764306.0,
"step": 4400
},
{
"epoch": 2.0536130536130535,
"grad_norm": 0.35936045689135193,
"learning_rate": 3.58049889542874e-05,
"loss": 0.4141,
"num_tokens": 1153075026.0,
"step": 4405
},
{
"epoch": 2.055944055944056,
"grad_norm": 0.35837033557087505,
"learning_rate": 3.577273803398225e-05,
"loss": 0.4302,
"num_tokens": 1154385746.0,
"step": 4410
},
{
"epoch": 2.058275058275058,
"grad_norm": 0.35713780316116617,
"learning_rate": 3.574046744790203e-05,
"loss": 0.4052,
"num_tokens": 1155696466.0,
"step": 4415
},
{
"epoch": 2.0606060606060606,
"grad_norm": 0.3834311499947858,
"learning_rate": 3.570817727275968e-05,
"loss": 0.4107,
"num_tokens": 1156996487.0,
"step": 4420
},
{
"epoch": 2.062937062937063,
"grad_norm": 0.37490633384840205,
"learning_rate": 3.567586758531471e-05,
"loss": 0.4154,
"num_tokens": 1158307207.0,
"step": 4425
},
{
"epoch": 2.0652680652680653,
"grad_norm": 0.3741096498424809,
"learning_rate": 3.5643538462373035e-05,
"loss": 0.403,
"num_tokens": 1159617927.0,
"step": 4430
},
{
"epoch": 2.0675990675990676,
"grad_norm": 0.37754550383970564,
"learning_rate": 3.561118998078673e-05,
"loss": 0.4057,
"num_tokens": 1160928647.0,
"step": 4435
},
{
"epoch": 2.06993006993007,
"grad_norm": 0.4251054782461779,
"learning_rate": 3.55788222174539e-05,
"loss": 0.4188,
"num_tokens": 1162239367.0,
"step": 4440
},
{
"epoch": 2.0722610722610724,
"grad_norm": 0.3266838827050116,
"learning_rate": 3.5546435249318535e-05,
"loss": 0.4088,
"num_tokens": 1163550087.0,
"step": 4445
},
{
"epoch": 2.0745920745920747,
"grad_norm": 0.37379003655649706,
"learning_rate": 3.551402915337021e-05,
"loss": 0.4075,
"num_tokens": 1164860807.0,
"step": 4450
},
{
"epoch": 2.076923076923077,
"grad_norm": 0.3572437557048024,
"learning_rate": 3.5481604006644e-05,
"loss": 0.4179,
"num_tokens": 1166171527.0,
"step": 4455
},
{
"epoch": 2.0792540792540795,
"grad_norm": 0.3859455958746963,
"learning_rate": 3.544915988622028e-05,
"loss": 0.4237,
"num_tokens": 1167482247.0,
"step": 4460
},
{
"epoch": 2.0815850815850814,
"grad_norm": 0.4603570748371342,
"learning_rate": 3.5416696869224504e-05,
"loss": 0.4286,
"num_tokens": 1168792967.0,
"step": 4465
},
{
"epoch": 2.0839160839160837,
"grad_norm": 0.41262993381270163,
"learning_rate": 3.538421503282707e-05,
"loss": 0.4136,
"num_tokens": 1170103687.0,
"step": 4470
},
{
"epoch": 2.086247086247086,
"grad_norm": 0.37590755119976543,
"learning_rate": 3.5351714454243096e-05,
"loss": 0.4251,
"num_tokens": 1171414407.0,
"step": 4475
},
{
"epoch": 2.0885780885780885,
"grad_norm": 0.3465536685672719,
"learning_rate": 3.531919521073225e-05,
"loss": 0.4157,
"num_tokens": 1172725127.0,
"step": 4480
},
{
"epoch": 2.090909090909091,
"grad_norm": 0.42464850126292525,
"learning_rate": 3.5286657379598586e-05,
"loss": 0.405,
"num_tokens": 1174035847.0,
"step": 4485
},
{
"epoch": 2.093240093240093,
"grad_norm": 0.3901689607224437,
"learning_rate": 3.5254101038190345e-05,
"loss": 0.4168,
"num_tokens": 1175346567.0,
"step": 4490
},
{
"epoch": 2.0955710955710956,
"grad_norm": 0.4360019158309819,
"learning_rate": 3.522152626389975e-05,
"loss": 0.4151,
"num_tokens": 1176657287.0,
"step": 4495
},
{
"epoch": 2.097902097902098,
"grad_norm": 0.3502067766511093,
"learning_rate": 3.5188933134162865e-05,
"loss": 0.4224,
"num_tokens": 1177968007.0,
"step": 4500
},
{
"epoch": 2.1002331002331003,
"grad_norm": 0.5275300365734853,
"learning_rate": 3.515632172645937e-05,
"loss": 0.4159,
"num_tokens": 1179278727.0,
"step": 4505
},
{
"epoch": 2.1025641025641026,
"grad_norm": 0.3789515598432233,
"learning_rate": 3.51236921183124e-05,
"loss": 0.4191,
"num_tokens": 1180589447.0,
"step": 4510
},
{
"epoch": 2.104895104895105,
"grad_norm": 0.365833048618729,
"learning_rate": 3.509104438728837e-05,
"loss": 0.4059,
"num_tokens": 1181900167.0,
"step": 4515
},
{
"epoch": 2.1072261072261074,
"grad_norm": 0.35537158917962436,
"learning_rate": 3.505837861099676e-05,
"loss": 0.4234,
"num_tokens": 1183210887.0,
"step": 4520
},
{
"epoch": 2.1095571095571097,
"grad_norm": 0.35631156133374037,
"learning_rate": 3.5025694867089945e-05,
"loss": 0.4111,
"num_tokens": 1184521607.0,
"step": 4525
},
{
"epoch": 2.111888111888112,
"grad_norm": 0.38809219895458275,
"learning_rate": 3.499299323326302e-05,
"loss": 0.421,
"num_tokens": 1185832327.0,
"step": 4530
},
{
"epoch": 2.114219114219114,
"grad_norm": 0.3497921359981421,
"learning_rate": 3.496027378725361e-05,
"loss": 0.407,
"num_tokens": 1187143047.0,
"step": 4535
},
{
"epoch": 2.1165501165501164,
"grad_norm": 0.40913669577154826,
"learning_rate": 3.492753660684167e-05,
"loss": 0.4033,
"num_tokens": 1188453767.0,
"step": 4540
},
{
"epoch": 2.1188811188811187,
"grad_norm": 0.39030245051407625,
"learning_rate": 3.489478176984934e-05,
"loss": 0.4217,
"num_tokens": 1189756471.0,
"step": 4545
},
{
"epoch": 2.121212121212121,
"grad_norm": 0.3443883610438411,
"learning_rate": 3.48620093541407e-05,
"loss": 0.4232,
"num_tokens": 1191067191.0,
"step": 4550
},
{
"epoch": 2.1235431235431235,
"grad_norm": 0.3841739344611504,
"learning_rate": 3.482921943762163e-05,
"loss": 0.4141,
"num_tokens": 1192377911.0,
"step": 4555
},
{
"epoch": 2.125874125874126,
"grad_norm": 0.3383108963469932,
"learning_rate": 3.479641209823964e-05,
"loss": 0.4092,
"num_tokens": 1193688631.0,
"step": 4560
},
{
"epoch": 2.128205128205128,
"grad_norm": 0.3760327145277456,
"learning_rate": 3.47635874139836e-05,
"loss": 0.4087,
"num_tokens": 1194999351.0,
"step": 4565
},
{
"epoch": 2.1305361305361306,
"grad_norm": 0.34503872631449223,
"learning_rate": 3.473074546288366e-05,
"loss": 0.4048,
"num_tokens": 1196310071.0,
"step": 4570
},
{
"epoch": 2.132867132867133,
"grad_norm": 0.3580599391213381,
"learning_rate": 3.4697886323010994e-05,
"loss": 0.4152,
"num_tokens": 1197608942.0,
"step": 4575
},
{
"epoch": 2.1351981351981353,
"grad_norm": 0.435723923799417,
"learning_rate": 3.466501007247764e-05,
"loss": 0.4286,
"num_tokens": 1198919662.0,
"step": 4580
},
{
"epoch": 2.1375291375291376,
"grad_norm": 0.40488968120953833,
"learning_rate": 3.4632116789436334e-05,
"loss": 0.4118,
"num_tokens": 1200230382.0,
"step": 4585
},
{
"epoch": 2.13986013986014,
"grad_norm": 0.36702045490517987,
"learning_rate": 3.459920655208027e-05,
"loss": 0.4118,
"num_tokens": 1201528689.0,
"step": 4590
},
{
"epoch": 2.1421911421911424,
"grad_norm": 0.36787552508826277,
"learning_rate": 3.456627943864295e-05,
"loss": 0.4184,
"num_tokens": 1202839409.0,
"step": 4595
},
{
"epoch": 2.1445221445221447,
"grad_norm": 0.3756630907032684,
"learning_rate": 3.453333552739801e-05,
"loss": 0.4053,
"num_tokens": 1204150129.0,
"step": 4600
},
{
"epoch": 2.1468531468531467,
"grad_norm": 0.3579470200059966,
"learning_rate": 3.4500374896658996e-05,
"loss": 0.4147,
"num_tokens": 1205460849.0,
"step": 4605
},
{
"epoch": 2.149184149184149,
"grad_norm": 0.3946329332145125,
"learning_rate": 3.446739762477922e-05,
"loss": 0.4207,
"num_tokens": 1206761500.0,
"step": 4610
},
{
"epoch": 2.1515151515151514,
"grad_norm": 0.3580747430804025,
"learning_rate": 3.4434403790151546e-05,
"loss": 0.3979,
"num_tokens": 1208065281.0,
"step": 4615
},
{
"epoch": 2.1538461538461537,
"grad_norm": 0.36202526630950443,
"learning_rate": 3.44013934712082e-05,
"loss": 0.4197,
"num_tokens": 1209376001.0,
"step": 4620
},
{
"epoch": 2.156177156177156,
"grad_norm": 0.3794229385793346,
"learning_rate": 3.4368366746420613e-05,
"loss": 0.4259,
"num_tokens": 1210686721.0,
"step": 4625
},
{
"epoch": 2.1585081585081585,
"grad_norm": 0.3356824344579626,
"learning_rate": 3.4335323694299205e-05,
"loss": 0.4168,
"num_tokens": 1211997441.0,
"step": 4630
},
{
"epoch": 2.160839160839161,
"grad_norm": 0.3749451829370277,
"learning_rate": 3.43022643933932e-05,
"loss": 0.4106,
"num_tokens": 1213308161.0,
"step": 4635
},
{
"epoch": 2.163170163170163,
"grad_norm": 0.4016684163697115,
"learning_rate": 3.426918892229046e-05,
"loss": 0.4098,
"num_tokens": 1214618881.0,
"step": 4640
},
{
"epoch": 2.1655011655011656,
"grad_norm": 0.4000860482831418,
"learning_rate": 3.423609735961729e-05,
"loss": 0.41,
"num_tokens": 1215929601.0,
"step": 4645
},
{
"epoch": 2.167832167832168,
"grad_norm": 0.4074946003687253,
"learning_rate": 3.420298978403824e-05,
"loss": 0.418,
"num_tokens": 1217240321.0,
"step": 4650
},
{
"epoch": 2.1701631701631703,
"grad_norm": 0.3696291402654651,
"learning_rate": 3.4169866274255926e-05,
"loss": 0.4149,
"num_tokens": 1218544211.0,
"step": 4655
},
{
"epoch": 2.1724941724941726,
"grad_norm": 0.38844912853019314,
"learning_rate": 3.413672690901084e-05,
"loss": 0.4059,
"num_tokens": 1219854931.0,
"step": 4660
},
{
"epoch": 2.174825174825175,
"grad_norm": 0.33716786832291057,
"learning_rate": 3.410357176708118e-05,
"loss": 0.4033,
"num_tokens": 1221165651.0,
"step": 4665
},
{
"epoch": 2.177156177156177,
"grad_norm": 0.36180594189314,
"learning_rate": 3.4070400927282616e-05,
"loss": 0.4134,
"num_tokens": 1222476371.0,
"step": 4670
},
{
"epoch": 2.1794871794871793,
"grad_norm": 0.34683197522568154,
"learning_rate": 3.403721446846818e-05,
"loss": 0.3892,
"num_tokens": 1223784575.0,
"step": 4675
},
{
"epoch": 2.1818181818181817,
"grad_norm": 0.3776362556179136,
"learning_rate": 3.400401246952798e-05,
"loss": 0.4259,
"num_tokens": 1225095104.0,
"step": 4680
},
{
"epoch": 2.184149184149184,
"grad_norm": 0.3518038107992216,
"learning_rate": 3.397079500938913e-05,
"loss": 0.4227,
"num_tokens": 1226405824.0,
"step": 4685
},
{
"epoch": 2.1864801864801864,
"grad_norm": 0.38008130799789136,
"learning_rate": 3.3937562167015444e-05,
"loss": 0.4192,
"num_tokens": 1227716544.0,
"step": 4690
},
{
"epoch": 2.1888111888111887,
"grad_norm": 0.410030351985677,
"learning_rate": 3.3904314021407306e-05,
"loss": 0.4187,
"num_tokens": 1229027264.0,
"step": 4695
},
{
"epoch": 2.191142191142191,
"grad_norm": 0.44730022868266656,
"learning_rate": 3.3871050651601526e-05,
"loss": 0.4035,
"num_tokens": 1230337984.0,
"step": 4700
},
{
"epoch": 2.1934731934731935,
"grad_norm": 0.4230202641947911,
"learning_rate": 3.383777213667104e-05,
"loss": 0.4354,
"num_tokens": 1231648704.0,
"step": 4705
},
{
"epoch": 2.195804195804196,
"grad_norm": 0.39299528264780836,
"learning_rate": 3.3804478555724836e-05,
"loss": 0.4189,
"num_tokens": 1232959424.0,
"step": 4710
},
{
"epoch": 2.198135198135198,
"grad_norm": 0.33696525877227246,
"learning_rate": 3.3771169987907694e-05,
"loss": 0.3992,
"num_tokens": 1234270144.0,
"step": 4715
},
{
"epoch": 2.2004662004662006,
"grad_norm": 0.3608578810758537,
"learning_rate": 3.373784651240003e-05,
"loss": 0.4138,
"num_tokens": 1235580864.0,
"step": 4720
},
{
"epoch": 2.202797202797203,
"grad_norm": 0.36875097820212976,
"learning_rate": 3.370450820841769e-05,
"loss": 0.4168,
"num_tokens": 1236891584.0,
"step": 4725
},
{
"epoch": 2.2051282051282053,
"grad_norm": 0.3857628385194161,
"learning_rate": 3.3671155155211775e-05,
"loss": 0.4126,
"num_tokens": 1238197409.0,
"step": 4730
},
{
"epoch": 2.2074592074592077,
"grad_norm": 0.36269590587156925,
"learning_rate": 3.363778743206844e-05,
"loss": 0.4124,
"num_tokens": 1239508129.0,
"step": 4735
},
{
"epoch": 2.20979020979021,
"grad_norm": 0.37684424909135084,
"learning_rate": 3.360440511830873e-05,
"loss": 0.4051,
"num_tokens": 1240818849.0,
"step": 4740
},
{
"epoch": 2.212121212121212,
"grad_norm": 0.33976488415925243,
"learning_rate": 3.3571008293288366e-05,
"loss": 0.4058,
"num_tokens": 1242129569.0,
"step": 4745
},
{
"epoch": 2.2144522144522143,
"grad_norm": 0.3390834642562686,
"learning_rate": 3.3537597036397555e-05,
"loss": 0.3954,
"num_tokens": 1243440289.0,
"step": 4750
},
{
"epoch": 2.2167832167832167,
"grad_norm": 0.3625798993977378,
"learning_rate": 3.35041714270608e-05,
"loss": 0.415,
"num_tokens": 1244751009.0,
"step": 4755
},
{
"epoch": 2.219114219114219,
"grad_norm": 0.3637371812561485,
"learning_rate": 3.3470731544736784e-05,
"loss": 0.4099,
"num_tokens": 1246061729.0,
"step": 4760
},
{
"epoch": 2.2214452214452214,
"grad_norm": 0.3469567003110868,
"learning_rate": 3.3437277468918046e-05,
"loss": 0.4205,
"num_tokens": 1247372449.0,
"step": 4765
},
{
"epoch": 2.2237762237762237,
"grad_norm": 0.3626483429391148,
"learning_rate": 3.3403809279130904e-05,
"loss": 0.4348,
"num_tokens": 1248679073.0,
"step": 4770
},
{
"epoch": 2.226107226107226,
"grad_norm": 0.34400225732618517,
"learning_rate": 3.337032705493522e-05,
"loss": 0.4088,
"num_tokens": 1249989793.0,
"step": 4775
},
{
"epoch": 2.2284382284382285,
"grad_norm": 0.42160458003014656,
"learning_rate": 3.333683087592421e-05,
"loss": 0.4182,
"num_tokens": 1251300513.0,
"step": 4780
},
{
"epoch": 2.230769230769231,
"grad_norm": 0.3628491977962132,
"learning_rate": 3.3303320821724285e-05,
"loss": 0.4263,
"num_tokens": 1252597120.0,
"step": 4785
},
{
"epoch": 2.233100233100233,
"grad_norm": 0.3646447806760171,
"learning_rate": 3.326979697199482e-05,
"loss": 0.4206,
"num_tokens": 1253907840.0,
"step": 4790
},
{
"epoch": 2.2354312354312356,
"grad_norm": 0.42668719182222326,
"learning_rate": 3.323625940642797e-05,
"loss": 0.4124,
"num_tokens": 1255218560.0,
"step": 4795
},
{
"epoch": 2.237762237762238,
"grad_norm": 0.38155232884871143,
"learning_rate": 3.320270820474856e-05,
"loss": 0.4019,
"num_tokens": 1256529280.0,
"step": 4800
},
{
"epoch": 2.2400932400932403,
"grad_norm": 0.3680246772827745,
"learning_rate": 3.316914344671374e-05,
"loss": 0.424,
"num_tokens": 1257840000.0,
"step": 4805
},
{
"epoch": 2.242424242424242,
"grad_norm": 0.3538991106150229,
"learning_rate": 3.313556521211296e-05,
"loss": 0.4171,
"num_tokens": 1259150720.0,
"step": 4810
},
{
"epoch": 2.2447552447552446,
"grad_norm": 0.3799814749588959,
"learning_rate": 3.310197358076767e-05,
"loss": 0.4089,
"num_tokens": 1260448949.0,
"step": 4815
},
{
"epoch": 2.247086247086247,
"grad_norm": 0.3534336331691378,
"learning_rate": 3.3068368632531166e-05,
"loss": 0.419,
"num_tokens": 1261759669.0,
"step": 4820
},
{
"epoch": 2.2494172494172493,
"grad_norm": 0.3948143146379037,
"learning_rate": 3.303475044728842e-05,
"loss": 0.4341,
"num_tokens": 1263070389.0,
"step": 4825
},
{
"epoch": 2.2517482517482517,
"grad_norm": 0.38620152575690947,
"learning_rate": 3.3001119104955856e-05,
"loss": 0.3993,
"num_tokens": 1264381109.0,
"step": 4830
},
{
"epoch": 2.254079254079254,
"grad_norm": 0.409856401580112,
"learning_rate": 3.296747468548117e-05,
"loss": 0.4284,
"num_tokens": 1265691829.0,
"step": 4835
},
{
"epoch": 2.2564102564102564,
"grad_norm": 0.3894321494198609,
"learning_rate": 3.2933817268843175e-05,
"loss": 0.4044,
"num_tokens": 1267002549.0,
"step": 4840
},
{
"epoch": 2.2587412587412588,
"grad_norm": 0.44554017981068295,
"learning_rate": 3.2900146935051535e-05,
"loss": 0.4046,
"num_tokens": 1268313269.0,
"step": 4845
},
{
"epoch": 2.261072261072261,
"grad_norm": 0.38537941934187847,
"learning_rate": 3.2866463764146647e-05,
"loss": 0.4088,
"num_tokens": 1269611053.0,
"step": 4850
},
{
"epoch": 2.2634032634032635,
"grad_norm": 0.34106108845050315,
"learning_rate": 3.2832767836199435e-05,
"loss": 0.4066,
"num_tokens": 1270921773.0,
"step": 4855
},
{
"epoch": 2.265734265734266,
"grad_norm": 0.37822769502708925,
"learning_rate": 3.279905923131112e-05,
"loss": 0.4352,
"num_tokens": 1272232493.0,
"step": 4860
},
{
"epoch": 2.268065268065268,
"grad_norm": 0.3717980750928343,
"learning_rate": 3.276533802961308e-05,
"loss": 0.4149,
"num_tokens": 1273533373.0,
"step": 4865
},
{
"epoch": 2.2703962703962706,
"grad_norm": 0.40049719958281904,
"learning_rate": 3.273160431126664e-05,
"loss": 0.4149,
"num_tokens": 1274825773.0,
"step": 4870
},
{
"epoch": 2.2727272727272725,
"grad_norm": 0.3492455161378825,
"learning_rate": 3.269785815646286e-05,
"loss": 0.4078,
"num_tokens": 1276121190.0,
"step": 4875
},
{
"epoch": 2.2750582750582753,
"grad_norm": 0.3434108328323875,
"learning_rate": 3.266409964542236e-05,
"loss": 0.4315,
"num_tokens": 1277431910.0,
"step": 4880
},
{
"epoch": 2.277389277389277,
"grad_norm": 0.34434993144818066,
"learning_rate": 3.263032885839517e-05,
"loss": 0.3986,
"num_tokens": 1278729221.0,
"step": 4885
},
{
"epoch": 2.2797202797202796,
"grad_norm": 0.32968474076252635,
"learning_rate": 3.2596545875660474e-05,
"loss": 0.4029,
"num_tokens": 1280039941.0,
"step": 4890
},
{
"epoch": 2.282051282051282,
"grad_norm": 0.3533987796359302,
"learning_rate": 3.256275077752644e-05,
"loss": 0.4132,
"num_tokens": 1281350661.0,
"step": 4895
},
{
"epoch": 2.2843822843822843,
"grad_norm": 0.36035857510185826,
"learning_rate": 3.2528943644330066e-05,
"loss": 0.4062,
"num_tokens": 1282661381.0,
"step": 4900
},
{
"epoch": 2.2867132867132867,
"grad_norm": 0.4102624567196318,
"learning_rate": 3.2495124556436935e-05,
"loss": 0.405,
"num_tokens": 1283972101.0,
"step": 4905
},
{
"epoch": 2.289044289044289,
"grad_norm": 0.32975812878362454,
"learning_rate": 3.246129359424105e-05,
"loss": 0.4183,
"num_tokens": 1285282821.0,
"step": 4910
},
{
"epoch": 2.2913752913752914,
"grad_norm": 0.3671230776462342,
"learning_rate": 3.2427450838164665e-05,
"loss": 0.4202,
"num_tokens": 1286593541.0,
"step": 4915
},
{
"epoch": 2.2937062937062938,
"grad_norm": 0.35487029443793194,
"learning_rate": 3.239359636865803e-05,
"loss": 0.4135,
"num_tokens": 1287904261.0,
"step": 4920
},
{
"epoch": 2.296037296037296,
"grad_norm": 0.35127658402893597,
"learning_rate": 3.235973026619928e-05,
"loss": 0.4119,
"num_tokens": 1289214981.0,
"step": 4925
},
{
"epoch": 2.2983682983682985,
"grad_norm": 0.34383367438253826,
"learning_rate": 3.2325852611294175e-05,
"loss": 0.4191,
"num_tokens": 1290517020.0,
"step": 4930
},
{
"epoch": 2.300699300699301,
"grad_norm": 0.3781283212650206,
"learning_rate": 3.229196348447595e-05,
"loss": 0.4133,
"num_tokens": 1291814376.0,
"step": 4935
},
{
"epoch": 2.303030303030303,
"grad_norm": 0.3797225808633411,
"learning_rate": 3.225806296630512e-05,
"loss": 0.4314,
"num_tokens": 1293125096.0,
"step": 4940
},
{
"epoch": 2.3053613053613056,
"grad_norm": 0.3820018986940426,
"learning_rate": 3.2224151137369244e-05,
"loss": 0.4089,
"num_tokens": 1294422895.0,
"step": 4945
},
{
"epoch": 2.3076923076923075,
"grad_norm": 0.3535499522376053,
"learning_rate": 3.219022807828282e-05,
"loss": 0.4105,
"num_tokens": 1295733615.0,
"step": 4950
},
{
"epoch": 2.31002331002331,
"grad_norm": 0.35691493546859565,
"learning_rate": 3.215629386968701e-05,
"loss": 0.4103,
"num_tokens": 1297044335.0,
"step": 4955
},
{
"epoch": 2.312354312354312,
"grad_norm": 0.3387110035916893,
"learning_rate": 3.212234859224946e-05,
"loss": 0.4284,
"num_tokens": 1298355055.0,
"step": 4960
},
{
"epoch": 2.3146853146853146,
"grad_norm": 0.3710615014269036,
"learning_rate": 3.208839232666419e-05,
"loss": 0.4101,
"num_tokens": 1299665775.0,
"step": 4965
},
{
"epoch": 2.317016317016317,
"grad_norm": 0.3373474545094395,
"learning_rate": 3.205442515365128e-05,
"loss": 0.4088,
"num_tokens": 1300976495.0,
"step": 4970
},
{
"epoch": 2.3193473193473193,
"grad_norm": 0.34943245534248407,
"learning_rate": 3.202044715395677e-05,
"loss": 0.4291,
"num_tokens": 1302287215.0,
"step": 4975
},
{
"epoch": 2.3216783216783217,
"grad_norm": 0.3617392320216389,
"learning_rate": 3.198645840835243e-05,
"loss": 0.4289,
"num_tokens": 1303590183.0,
"step": 4980
},
{
"epoch": 2.324009324009324,
"grad_norm": 0.37806058308011203,
"learning_rate": 3.195245899763559e-05,
"loss": 0.4104,
"num_tokens": 1304900903.0,
"step": 4985
},
{
"epoch": 2.3263403263403264,
"grad_norm": 0.3317302716295439,
"learning_rate": 3.1918449002628895e-05,
"loss": 0.4084,
"num_tokens": 1306211623.0,
"step": 4990
},
{
"epoch": 2.3286713286713288,
"grad_norm": 0.3435386897987483,
"learning_rate": 3.1884428504180186e-05,
"loss": 0.4135,
"num_tokens": 1307522343.0,
"step": 4995
},
{
"epoch": 2.331002331002331,
"grad_norm": 0.360164719000458,
"learning_rate": 3.185039758316226e-05,
"loss": 0.4115,
"num_tokens": 1308833063.0,
"step": 5000
},
{
"epoch": 2.3333333333333335,
"grad_norm": 0.3945377389793074,
"learning_rate": 3.1816356320472695e-05,
"loss": 0.4188,
"num_tokens": 1310143783.0,
"step": 5005
},
{
"epoch": 2.335664335664336,
"grad_norm": 0.4051939320818589,
"learning_rate": 3.178230479703364e-05,
"loss": 0.4203,
"num_tokens": 1311454503.0,
"step": 5010
},
{
"epoch": 2.3379953379953378,
"grad_norm": 0.35708148981072574,
"learning_rate": 3.174824309379166e-05,
"loss": 0.418,
"num_tokens": 1312765223.0,
"step": 5015
},
{
"epoch": 2.3403263403263406,
"grad_norm": 0.33742717843001446,
"learning_rate": 3.1714171291717486e-05,
"loss": 0.4084,
"num_tokens": 1314069304.0,
"step": 5020
},
{
"epoch": 2.3426573426573425,
"grad_norm": 0.3589509813194004,
"learning_rate": 3.168008947180588e-05,
"loss": 0.4045,
"num_tokens": 1315380024.0,
"step": 5025
},
{
"epoch": 2.344988344988345,
"grad_norm": 0.3508912054860665,
"learning_rate": 3.1645997715075426e-05,
"loss": 0.4033,
"num_tokens": 1316690744.0,
"step": 5030
},
{
"epoch": 2.347319347319347,
"grad_norm": 0.3295215331291118,
"learning_rate": 3.161189610256829e-05,
"loss": 0.4066,
"num_tokens": 1318001464.0,
"step": 5035
},
{
"epoch": 2.3496503496503496,
"grad_norm": 0.35230973624032774,
"learning_rate": 3.157778471535011e-05,
"loss": 0.417,
"num_tokens": 1319312184.0,
"step": 5040
},
{
"epoch": 2.351981351981352,
"grad_norm": 0.3884539975737978,
"learning_rate": 3.154366363450974e-05,
"loss": 0.4236,
"num_tokens": 1320618337.0,
"step": 5045
},
{
"epoch": 2.3543123543123543,
"grad_norm": 0.3698867601064244,
"learning_rate": 3.150953294115907e-05,
"loss": 0.4054,
"num_tokens": 1321929057.0,
"step": 5050
},
{
"epoch": 2.3566433566433567,
"grad_norm": 0.3652929150276016,
"learning_rate": 3.147539271643287e-05,
"loss": 0.4267,
"num_tokens": 1323239777.0,
"step": 5055
},
{
"epoch": 2.358974358974359,
"grad_norm": 0.35528129394096414,
"learning_rate": 3.1441243041488525e-05,
"loss": 0.4336,
"num_tokens": 1324550497.0,
"step": 5060
},
{
"epoch": 2.3613053613053614,
"grad_norm": 0.37876386202795675,
"learning_rate": 3.140708399750594e-05,
"loss": 0.425,
"num_tokens": 1325861217.0,
"step": 5065
},
{
"epoch": 2.3636363636363638,
"grad_norm": 0.35526472594618097,
"learning_rate": 3.1372915665687225e-05,
"loss": 0.4073,
"num_tokens": 1327171937.0,
"step": 5070
},
{
"epoch": 2.365967365967366,
"grad_norm": 0.3550032233081871,
"learning_rate": 3.133873812725662e-05,
"loss": 0.4078,
"num_tokens": 1328482657.0,
"step": 5075
},
{
"epoch": 2.3682983682983685,
"grad_norm": 0.3431998998307288,
"learning_rate": 3.130455146346024e-05,
"loss": 0.4105,
"num_tokens": 1329791355.0,
"step": 5080
},
{
"epoch": 2.370629370629371,
"grad_norm": 0.3608126350221179,
"learning_rate": 3.1270355755565886e-05,
"loss": 0.4262,
"num_tokens": 1331102075.0,
"step": 5085
},
{
"epoch": 2.3729603729603728,
"grad_norm": 0.3494733230503953,
"learning_rate": 3.123615108486286e-05,
"loss": 0.4238,
"num_tokens": 1332404189.0,
"step": 5090
},
{
"epoch": 2.375291375291375,
"grad_norm": 0.338730898909754,
"learning_rate": 3.120193753266175e-05,
"loss": 0.4191,
"num_tokens": 1333714909.0,
"step": 5095
},
{
"epoch": 2.3776223776223775,
"grad_norm": 0.402224903788031,
"learning_rate": 3.116771518029431e-05,
"loss": 0.4161,
"num_tokens": 1335025629.0,
"step": 5100
},
{
"epoch": 2.37995337995338,
"grad_norm": 0.36595610812881385,
"learning_rate": 3.113348410911316e-05,
"loss": 0.4081,
"num_tokens": 1336336349.0,
"step": 5105
},
{
"epoch": 2.382284382284382,
"grad_norm": 0.3880930752201236,
"learning_rate": 3.109924440049166e-05,
"loss": 0.4176,
"num_tokens": 1337640003.0,
"step": 5110
},
{
"epoch": 2.3846153846153846,
"grad_norm": 0.35480427992895003,
"learning_rate": 3.1064996135823736e-05,
"loss": 0.4143,
"num_tokens": 1338950723.0,
"step": 5115
},
{
"epoch": 2.386946386946387,
"grad_norm": 0.3439245249078291,
"learning_rate": 3.10307393965236e-05,
"loss": 0.4128,
"num_tokens": 1340261443.0,
"step": 5120
},
{
"epoch": 2.3892773892773893,
"grad_norm": 0.38182867678172666,
"learning_rate": 3.0996474264025654e-05,
"loss": 0.4112,
"num_tokens": 1341564973.0,
"step": 5125
},
{
"epoch": 2.3916083916083917,
"grad_norm": 0.3598389668576128,
"learning_rate": 3.096220081978423e-05,
"loss": 0.4156,
"num_tokens": 1342875693.0,
"step": 5130
},
{
"epoch": 2.393939393939394,
"grad_norm": 0.3860442296593736,
"learning_rate": 3.092791914527341e-05,
"loss": 0.4091,
"num_tokens": 1344186413.0,
"step": 5135
},
{
"epoch": 2.3962703962703964,
"grad_norm": 0.36500293825420654,
"learning_rate": 3.0893629321986874e-05,
"loss": 0.4121,
"num_tokens": 1345497133.0,
"step": 5140
},
{
"epoch": 2.3986013986013988,
"grad_norm": 0.3776001399782403,
"learning_rate": 3.085933143143765e-05,
"loss": 0.4306,
"num_tokens": 1346794525.0,
"step": 5145
},
{
"epoch": 2.400932400932401,
"grad_norm": 0.3760233460610802,
"learning_rate": 3.082502555515793e-05,
"loss": 0.4302,
"num_tokens": 1348105245.0,
"step": 5150
},
{
"epoch": 2.403263403263403,
"grad_norm": 0.37109726618618827,
"learning_rate": 3.079071177469892e-05,
"loss": 0.4021,
"num_tokens": 1349414836.0,
"step": 5155
},
{
"epoch": 2.4055944055944054,
"grad_norm": 0.3562308097398838,
"learning_rate": 3.07563901716306e-05,
"loss": 0.4198,
"num_tokens": 1350725556.0,
"step": 5160
},
{
"epoch": 2.4079254079254078,
"grad_norm": 0.37126907481875954,
"learning_rate": 3.072206082754154e-05,
"loss": 0.427,
"num_tokens": 1352022873.0,
"step": 5165
},
{
"epoch": 2.41025641025641,
"grad_norm": 0.3766487217257149,
"learning_rate": 3.068772382403873e-05,
"loss": 0.4231,
"num_tokens": 1353333593.0,
"step": 5170
},
{
"epoch": 2.4125874125874125,
"grad_norm": 0.36252558911615673,
"learning_rate": 3.065337924274735e-05,
"loss": 0.4113,
"num_tokens": 1354644313.0,
"step": 5175
},
{
"epoch": 2.414918414918415,
"grad_norm": 0.36974990717403,
"learning_rate": 3.06190271653106e-05,
"loss": 0.4289,
"num_tokens": 1355955033.0,
"step": 5180
},
{
"epoch": 2.417249417249417,
"grad_norm": 0.3383571793373514,
"learning_rate": 3.058466767338951e-05,
"loss": 0.4214,
"num_tokens": 1357265753.0,
"step": 5185
},
{
"epoch": 2.4195804195804196,
"grad_norm": 0.38036895795490944,
"learning_rate": 3.0550300848662704e-05,
"loss": 0.4207,
"num_tokens": 1358576473.0,
"step": 5190
},
{
"epoch": 2.421911421911422,
"grad_norm": 0.3505203016572691,
"learning_rate": 3.051592677282628e-05,
"loss": 0.4165,
"num_tokens": 1359887193.0,
"step": 5195
},
{
"epoch": 2.4242424242424243,
"grad_norm": 0.3640785191337429,
"learning_rate": 3.0481545527593546e-05,
"loss": 0.4272,
"num_tokens": 1361197913.0,
"step": 5200
},
{
"epoch": 2.4265734265734267,
"grad_norm": 0.35398548228202553,
"learning_rate": 3.0447157194694864e-05,
"loss": 0.4182,
"num_tokens": 1362508633.0,
"step": 5205
},
{
"epoch": 2.428904428904429,
"grad_norm": 0.36683505543529804,
"learning_rate": 3.041276185587743e-05,
"loss": 0.421,
"num_tokens": 1363819353.0,
"step": 5210
},
{
"epoch": 2.4312354312354314,
"grad_norm": 0.37273261132442753,
"learning_rate": 3.0378359592905097e-05,
"loss": 0.4004,
"num_tokens": 1365130073.0,
"step": 5215
},
{
"epoch": 2.4335664335664333,
"grad_norm": 0.3711753185510304,
"learning_rate": 3.0343950487558208e-05,
"loss": 0.4157,
"num_tokens": 1366440793.0,
"step": 5220
},
{
"epoch": 2.435897435897436,
"grad_norm": 0.3886512119876518,
"learning_rate": 3.030953462163334e-05,
"loss": 0.4203,
"num_tokens": 1367751513.0,
"step": 5225
},
{
"epoch": 2.438228438228438,
"grad_norm": 0.35270305999051793,
"learning_rate": 3.0275112076943145e-05,
"loss": 0.4039,
"num_tokens": 1369051755.0,
"step": 5230
},
{
"epoch": 2.4405594405594404,
"grad_norm": 0.3912345566826907,
"learning_rate": 3.0240682935316156e-05,
"loss": 0.4152,
"num_tokens": 1370362475.0,
"step": 5235
},
{
"epoch": 2.4428904428904428,
"grad_norm": 0.39812504543925764,
"learning_rate": 3.0206247278596594e-05,
"loss": 0.4252,
"num_tokens": 1371667349.0,
"step": 5240
},
{
"epoch": 2.445221445221445,
"grad_norm": 0.36495714706101673,
"learning_rate": 3.0171805188644163e-05,
"loss": 0.4262,
"num_tokens": 1372978069.0,
"step": 5245
},
{
"epoch": 2.4475524475524475,
"grad_norm": 0.32681284740431915,
"learning_rate": 3.013735674733385e-05,
"loss": 0.4091,
"num_tokens": 1374288789.0,
"step": 5250
},
{
"epoch": 2.44988344988345,
"grad_norm": 0.41435527791431637,
"learning_rate": 3.0102902036555765e-05,
"loss": 0.4153,
"num_tokens": 1375599509.0,
"step": 5255
},
{
"epoch": 2.4522144522144522,
"grad_norm": 0.347283922287271,
"learning_rate": 3.0068441138214886e-05,
"loss": 0.4092,
"num_tokens": 1376910229.0,
"step": 5260
},
{
"epoch": 2.4545454545454546,
"grad_norm": 0.38829537811571885,
"learning_rate": 3.0033974134230937e-05,
"loss": 0.4177,
"num_tokens": 1378220949.0,
"step": 5265
},
{
"epoch": 2.456876456876457,
"grad_norm": 0.3275166232456935,
"learning_rate": 2.9999501106538126e-05,
"loss": 0.4082,
"num_tokens": 1379531669.0,
"step": 5270
},
{
"epoch": 2.4592074592074593,
"grad_norm": 0.42514354355776585,
"learning_rate": 2.9965022137084997e-05,
"loss": 0.4056,
"num_tokens": 1380837473.0,
"step": 5275
},
{
"epoch": 2.4615384615384617,
"grad_norm": 0.37968747313444934,
"learning_rate": 2.993053730783422e-05,
"loss": 0.3987,
"num_tokens": 1382148193.0,
"step": 5280
},
{
"epoch": 2.463869463869464,
"grad_norm": 0.3560839943508509,
"learning_rate": 2.9896046700762398e-05,
"loss": 0.4136,
"num_tokens": 1383458913.0,
"step": 5285
},
{
"epoch": 2.4662004662004664,
"grad_norm": 0.36938187951856855,
"learning_rate": 2.9861550397859838e-05,
"loss": 0.4154,
"num_tokens": 1384769633.0,
"step": 5290
},
{
"epoch": 2.4685314685314683,
"grad_norm": 0.40308698065693266,
"learning_rate": 2.982704848113043e-05,
"loss": 0.425,
"num_tokens": 1386065812.0,
"step": 5295
},
{
"epoch": 2.4708624708624707,
"grad_norm": 0.3293971532786334,
"learning_rate": 2.9792541032591387e-05,
"loss": 0.4114,
"num_tokens": 1387376532.0,
"step": 5300
},
{
"epoch": 2.473193473193473,
"grad_norm": 0.3273718604110626,
"learning_rate": 2.975802813427307e-05,
"loss": 0.3997,
"num_tokens": 1388687252.0,
"step": 5305
},
{
"epoch": 2.4755244755244754,
"grad_norm": 0.35222934802235306,
"learning_rate": 2.9723509868218792e-05,
"loss": 0.4143,
"num_tokens": 1389997972.0,
"step": 5310
},
{
"epoch": 2.4778554778554778,
"grad_norm": 0.3606682109903173,
"learning_rate": 2.9688986316484636e-05,
"loss": 0.4158,
"num_tokens": 1391308692.0,
"step": 5315
},
{
"epoch": 2.48018648018648,
"grad_norm": 0.3851368043166362,
"learning_rate": 2.9654457561139254e-05,
"loss": 0.4204,
"num_tokens": 1392619412.0,
"step": 5320
},
{
"epoch": 2.4825174825174825,
"grad_norm": 0.35215808790406317,
"learning_rate": 2.961992368426366e-05,
"loss": 0.4112,
"num_tokens": 1393930132.0,
"step": 5325
},
{
"epoch": 2.484848484848485,
"grad_norm": 0.33407840882303375,
"learning_rate": 2.958538476795104e-05,
"loss": 0.4065,
"num_tokens": 1395232316.0,
"step": 5330
},
{
"epoch": 2.4871794871794872,
"grad_norm": 0.365448648296931,
"learning_rate": 2.9550840894306565e-05,
"loss": 0.4144,
"num_tokens": 1396543036.0,
"step": 5335
},
{
"epoch": 2.4895104895104896,
"grad_norm": 0.3516522718004562,
"learning_rate": 2.9516292145447187e-05,
"loss": 0.4036,
"num_tokens": 1397853756.0,
"step": 5340
},
{
"epoch": 2.491841491841492,
"grad_norm": 0.3964385897616128,
"learning_rate": 2.9481738603501464e-05,
"loss": 0.4145,
"num_tokens": 1399164476.0,
"step": 5345
},
{
"epoch": 2.4941724941724943,
"grad_norm": 0.3330290617721678,
"learning_rate": 2.9447180350609305e-05,
"loss": 0.4126,
"num_tokens": 1400475196.0,
"step": 5350
},
{
"epoch": 2.4965034965034967,
"grad_norm": 0.3620399941365234,
"learning_rate": 2.941261746892187e-05,
"loss": 0.4198,
"num_tokens": 1401785916.0,
"step": 5355
},
{
"epoch": 2.4988344988344986,
"grad_norm": 0.4031249657041131,
"learning_rate": 2.937805004060129e-05,
"loss": 0.3909,
"num_tokens": 1403096636.0,
"step": 5360
},
{
"epoch": 2.5011655011655014,
"grad_norm": 0.35297787174955664,
"learning_rate": 2.9343478147820515e-05,
"loss": 0.4161,
"num_tokens": 1404398368.0,
"step": 5365
},
{
"epoch": 2.5034965034965033,
"grad_norm": 0.3580720337699733,
"learning_rate": 2.9308901872763107e-05,
"loss": 0.4119,
"num_tokens": 1405709088.0,
"step": 5370
},
{
"epoch": 2.5058275058275057,
"grad_norm": 0.33634655420177195,
"learning_rate": 2.927432129762303e-05,
"loss": 0.4072,
"num_tokens": 1407019808.0,
"step": 5375
},
{
"epoch": 2.508158508158508,
"grad_norm": 0.4092817541410367,
"learning_rate": 2.923973650460451e-05,
"loss": 0.4249,
"num_tokens": 1408330528.0,
"step": 5380
},
{
"epoch": 2.5104895104895104,
"grad_norm": 0.386629174107386,
"learning_rate": 2.9205147575921748e-05,
"loss": 0.4001,
"num_tokens": 1409641248.0,
"step": 5385
},
{
"epoch": 2.5128205128205128,
"grad_norm": 0.38645028662583764,
"learning_rate": 2.917055459379881e-05,
"loss": 0.4201,
"num_tokens": 1410951968.0,
"step": 5390
},
{
"epoch": 2.515151515151515,
"grad_norm": 0.3870366504220812,
"learning_rate": 2.9135957640469407e-05,
"loss": 0.4013,
"num_tokens": 1412262688.0,
"step": 5395
},
{
"epoch": 2.5174825174825175,
"grad_norm": 0.3590149507926292,
"learning_rate": 2.9101356798176648e-05,
"loss": 0.4281,
"num_tokens": 1413573408.0,
"step": 5400
},
{
"epoch": 2.51981351981352,
"grad_norm": 0.3914639948997253,
"learning_rate": 2.9066752149172927e-05,
"loss": 0.4105,
"num_tokens": 1414884128.0,
"step": 5405
},
{
"epoch": 2.5221445221445222,
"grad_norm": 0.3459336521291884,
"learning_rate": 2.903214377571967e-05,
"loss": 0.4018,
"num_tokens": 1416194848.0,
"step": 5410
},
{
"epoch": 2.5244755244755246,
"grad_norm": 0.351270481877377,
"learning_rate": 2.8997531760087143e-05,
"loss": 0.4256,
"num_tokens": 1417505568.0,
"step": 5415
},
{
"epoch": 2.526806526806527,
"grad_norm": 0.3494010823054712,
"learning_rate": 2.896291618455431e-05,
"loss": 0.412,
"num_tokens": 1418816288.0,
"step": 5420
},
{
"epoch": 2.529137529137529,
"grad_norm": 0.3663146601610986,
"learning_rate": 2.8928297131408557e-05,
"loss": 0.4104,
"num_tokens": 1420127008.0,
"step": 5425
},
{
"epoch": 2.5314685314685317,
"grad_norm": 0.39872390845448125,
"learning_rate": 2.889367468294556e-05,
"loss": 0.4068,
"num_tokens": 1421437728.0,
"step": 5430
},
{
"epoch": 2.5337995337995336,
"grad_norm": 0.37140846588281806,
"learning_rate": 2.885904892146905e-05,
"loss": 0.4189,
"num_tokens": 1422748448.0,
"step": 5435
},
{
"epoch": 2.5361305361305364,
"grad_norm": 0.3657604671694247,
"learning_rate": 2.8824419929290665e-05,
"loss": 0.4191,
"num_tokens": 1424059168.0,
"step": 5440
},
{
"epoch": 2.5384615384615383,
"grad_norm": 0.3568520716840305,
"learning_rate": 2.878978778872968e-05,
"loss": 0.4201,
"num_tokens": 1425369888.0,
"step": 5445
},
{
"epoch": 2.5407925407925407,
"grad_norm": 0.39448531507467444,
"learning_rate": 2.8755152582112877e-05,
"loss": 0.4161,
"num_tokens": 1426680608.0,
"step": 5450
},
{
"epoch": 2.543123543123543,
"grad_norm": 0.34606657031969595,
"learning_rate": 2.8720514391774333e-05,
"loss": 0.4134,
"num_tokens": 1427991328.0,
"step": 5455
},
{
"epoch": 2.5454545454545454,
"grad_norm": 0.4053792494656858,
"learning_rate": 2.8685873300055206e-05,
"loss": 0.402,
"num_tokens": 1429302048.0,
"step": 5460
},
{
"epoch": 2.5477855477855478,
"grad_norm": 0.3798273899991343,
"learning_rate": 2.8651229389303556e-05,
"loss": 0.4133,
"num_tokens": 1430612768.0,
"step": 5465
},
{
"epoch": 2.55011655011655,
"grad_norm": 0.35682635054873424,
"learning_rate": 2.8616582741874143e-05,
"loss": 0.4117,
"num_tokens": 1431923488.0,
"step": 5470
},
{
"epoch": 2.5524475524475525,
"grad_norm": 0.3645622618000647,
"learning_rate": 2.8581933440128228e-05,
"loss": 0.4239,
"num_tokens": 1433234208.0,
"step": 5475
},
{
"epoch": 2.554778554778555,
"grad_norm": 0.3703457691159881,
"learning_rate": 2.8547281566433393e-05,
"loss": 0.4128,
"num_tokens": 1434539040.0,
"step": 5480
},
{
"epoch": 2.5571095571095572,
"grad_norm": 0.41270632575142685,
"learning_rate": 2.851262720316332e-05,
"loss": 0.4095,
"num_tokens": 1435849760.0,
"step": 5485
},
{
"epoch": 2.5594405594405596,
"grad_norm": 0.33721292116678364,
"learning_rate": 2.8477970432697625e-05,
"loss": 0.3976,
"num_tokens": 1437160480.0,
"step": 5490
},
{
"epoch": 2.561771561771562,
"grad_norm": 0.3503161770381493,
"learning_rate": 2.8443311337421642e-05,
"loss": 0.4228,
"num_tokens": 1438471200.0,
"step": 5495
},
{
"epoch": 2.564102564102564,
"grad_norm": 0.39093092792942047,
"learning_rate": 2.840864999972621e-05,
"loss": 0.4102,
"num_tokens": 1439768547.0,
"step": 5500
},
{
"epoch": 2.5664335664335667,
"grad_norm": 0.3855562639577423,
"learning_rate": 2.8373986502007522e-05,
"loss": 0.3962,
"num_tokens": 1441079267.0,
"step": 5505
},
{
"epoch": 2.5687645687645686,
"grad_norm": 0.39374706330512194,
"learning_rate": 2.833932092666692e-05,
"loss": 0.4168,
"num_tokens": 1442389987.0,
"step": 5510
},
{
"epoch": 2.571095571095571,
"grad_norm": 0.35307123842027394,
"learning_rate": 2.830465335611064e-05,
"loss": 0.4109,
"num_tokens": 1443700707.0,
"step": 5515
},
{
"epoch": 2.5734265734265733,
"grad_norm": 0.3531292824198428,
"learning_rate": 2.826998387274969e-05,
"loss": 0.401,
"num_tokens": 1445011427.0,
"step": 5520
},
{
"epoch": 2.5757575757575757,
"grad_norm": 0.35841271371937383,
"learning_rate": 2.8235312558999634e-05,
"loss": 0.3987,
"num_tokens": 1446322147.0,
"step": 5525
},
{
"epoch": 2.578088578088578,
"grad_norm": 0.36420778646479857,
"learning_rate": 2.820063949728035e-05,
"loss": 0.4004,
"num_tokens": 1447632867.0,
"step": 5530
},
{
"epoch": 2.5804195804195804,
"grad_norm": 0.34605617089524987,
"learning_rate": 2.8165964770015923e-05,
"loss": 0.4046,
"num_tokens": 1448943587.0,
"step": 5535
},
{
"epoch": 2.582750582750583,
"grad_norm": 0.3638468608455636,
"learning_rate": 2.8131288459634358e-05,
"loss": 0.4183,
"num_tokens": 1450254307.0,
"step": 5540
},
{
"epoch": 2.585081585081585,
"grad_norm": 0.33673390656126306,
"learning_rate": 2.8096610648567428e-05,
"loss": 0.4052,
"num_tokens": 1451549273.0,
"step": 5545
},
{
"epoch": 2.5874125874125875,
"grad_norm": 0.3450093834999504,
"learning_rate": 2.806193141925048e-05,
"loss": 0.4092,
"num_tokens": 1452852946.0,
"step": 5550
},
{
"epoch": 2.58974358974359,
"grad_norm": 0.33375024409509796,
"learning_rate": 2.8027250854122245e-05,
"loss": 0.4071,
"num_tokens": 1454163666.0,
"step": 5555
},
{
"epoch": 2.5920745920745922,
"grad_norm": 0.32434266094123126,
"learning_rate": 2.7992569035624612e-05,
"loss": 0.4088,
"num_tokens": 1455464696.0,
"step": 5560
},
{
"epoch": 2.594405594405594,
"grad_norm": 0.3625165606072977,
"learning_rate": 2.795788604620246e-05,
"loss": 0.4027,
"num_tokens": 1456775416.0,
"step": 5565
},
{
"epoch": 2.596736596736597,
"grad_norm": 0.34850576006189254,
"learning_rate": 2.7923201968303427e-05,
"loss": 0.4225,
"num_tokens": 1458086136.0,
"step": 5570
},
{
"epoch": 2.599067599067599,
"grad_norm": 0.31593994005170517,
"learning_rate": 2.788851688437777e-05,
"loss": 0.4014,
"num_tokens": 1459396856.0,
"step": 5575
},
{
"epoch": 2.6013986013986012,
"grad_norm": 0.34688524294847034,
"learning_rate": 2.785383087687813e-05,
"loss": 0.4172,
"num_tokens": 1460707576.0,
"step": 5580
},
{
"epoch": 2.6037296037296036,
"grad_norm": 0.3557511247705549,
"learning_rate": 2.781914402825933e-05,
"loss": 0.4143,
"num_tokens": 1462018296.0,
"step": 5585
},
{
"epoch": 2.606060606060606,
"grad_norm": 0.3387419936179967,
"learning_rate": 2.77844564209782e-05,
"loss": 0.4029,
"num_tokens": 1463329016.0,
"step": 5590
},
{
"epoch": 2.6083916083916083,
"grad_norm": 0.343630335231053,
"learning_rate": 2.77497681374934e-05,
"loss": 0.404,
"num_tokens": 1464639736.0,
"step": 5595
},
{
"epoch": 2.6107226107226107,
"grad_norm": 0.3332182207020839,
"learning_rate": 2.7715079260265124e-05,
"loss": 0.4006,
"num_tokens": 1465950456.0,
"step": 5600
},
{
"epoch": 2.613053613053613,
"grad_norm": 0.3643825631091346,
"learning_rate": 2.7680389871755064e-05,
"loss": 0.4097,
"num_tokens": 1467261176.0,
"step": 5605
},
{
"epoch": 2.6153846153846154,
"grad_norm": 0.3364764273544578,
"learning_rate": 2.7645700054426087e-05,
"loss": 0.4033,
"num_tokens": 1468571896.0,
"step": 5610
},
{
"epoch": 2.617715617715618,
"grad_norm": 0.34072785097660746,
"learning_rate": 2.7611009890742058e-05,
"loss": 0.4212,
"num_tokens": 1469882616.0,
"step": 5615
},
{
"epoch": 2.62004662004662,
"grad_norm": 0.3574149052487222,
"learning_rate": 2.757631946316771e-05,
"loss": 0.4154,
"num_tokens": 1471193336.0,
"step": 5620
},
{
"epoch": 2.6223776223776225,
"grad_norm": 0.32899936425520643,
"learning_rate": 2.754162885416837e-05,
"loss": 0.4135,
"num_tokens": 1472504056.0,
"step": 5625
},
{
"epoch": 2.624708624708625,
"grad_norm": 0.3263664126860434,
"learning_rate": 2.7506938146209816e-05,
"loss": 0.4048,
"num_tokens": 1473814776.0,
"step": 5630
},
{
"epoch": 2.6270396270396272,
"grad_norm": 0.35274896684534274,
"learning_rate": 2.7472247421758046e-05,
"loss": 0.3946,
"num_tokens": 1475125496.0,
"step": 5635
},
{
"epoch": 2.629370629370629,
"grad_norm": 0.3706959177174659,
"learning_rate": 2.743755676327911e-05,
"loss": 0.4116,
"num_tokens": 1476436216.0,
"step": 5640
},
{
"epoch": 2.631701631701632,
"grad_norm": 0.3391136391587131,
"learning_rate": 2.7402866253238896e-05,
"loss": 0.4028,
"num_tokens": 1477746936.0,
"step": 5645
},
{
"epoch": 2.634032634032634,
"grad_norm": 0.3880994944779498,
"learning_rate": 2.7368175974102938e-05,
"loss": 0.3995,
"num_tokens": 1479057656.0,
"step": 5650
},
{
"epoch": 2.6363636363636362,
"grad_norm": 0.3565909411799108,
"learning_rate": 2.7333486008336217e-05,
"loss": 0.4089,
"num_tokens": 1480368376.0,
"step": 5655
},
{
"epoch": 2.6386946386946386,
"grad_norm": 0.3229561588592343,
"learning_rate": 2.7298796438402986e-05,
"loss": 0.4108,
"num_tokens": 1481679096.0,
"step": 5660
},
{
"epoch": 2.641025641025641,
"grad_norm": 0.3393467174551697,
"learning_rate": 2.726410734676653e-05,
"loss": 0.4153,
"num_tokens": 1482989816.0,
"step": 5665
},
{
"epoch": 2.6433566433566433,
"grad_norm": 0.40533026810505063,
"learning_rate": 2.7229418815889023e-05,
"loss": 0.427,
"num_tokens": 1484300536.0,
"step": 5670
},
{
"epoch": 2.6456876456876457,
"grad_norm": 0.3493290537833516,
"learning_rate": 2.7194730928231292e-05,
"loss": 0.4233,
"num_tokens": 1485603324.0,
"step": 5675
},
{
"epoch": 2.648018648018648,
"grad_norm": 0.3293219859362695,
"learning_rate": 2.716004376625264e-05,
"loss": 0.4137,
"num_tokens": 1486914044.0,
"step": 5680
},
{
"epoch": 2.6503496503496504,
"grad_norm": 0.32917350717509924,
"learning_rate": 2.7125357412410634e-05,
"loss": 0.4112,
"num_tokens": 1488224764.0,
"step": 5685
},
{
"epoch": 2.652680652680653,
"grad_norm": 0.36778219939737017,
"learning_rate": 2.7090671949160945e-05,
"loss": 0.4151,
"num_tokens": 1489530334.0,
"step": 5690
},
{
"epoch": 2.655011655011655,
"grad_norm": 0.3520157165475074,
"learning_rate": 2.70559874589571e-05,
"loss": 0.4252,
"num_tokens": 1490841054.0,
"step": 5695
},
{
"epoch": 2.6573426573426575,
"grad_norm": 0.364402639873204,
"learning_rate": 2.7021304024250315e-05,
"loss": 0.415,
"num_tokens": 1492151774.0,
"step": 5700
},
{
"epoch": 2.6596736596736594,
"grad_norm": 0.3802755602559821,
"learning_rate": 2.698662172748933e-05,
"loss": 0.4084,
"num_tokens": 1493457481.0,
"step": 5705
},
{
"epoch": 2.6620046620046622,
"grad_norm": 0.3296000790473323,
"learning_rate": 2.695194065112014e-05,
"loss": 0.41,
"num_tokens": 1494759113.0,
"step": 5710
},
{
"epoch": 2.664335664335664,
"grad_norm": 0.3427344361950261,
"learning_rate": 2.6917260877585854e-05,
"loss": 0.4155,
"num_tokens": 1496069833.0,
"step": 5715
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.33635269462764017,
"learning_rate": 2.6882582489326485e-05,
"loss": 0.4175,
"num_tokens": 1497380553.0,
"step": 5720
},
{
"epoch": 2.668997668997669,
"grad_norm": 0.3931298913652689,
"learning_rate": 2.6847905568778753e-05,
"loss": 0.421,
"num_tokens": 1498670997.0,
"step": 5725
},
{
"epoch": 2.6713286713286712,
"grad_norm": 0.35218576187719325,
"learning_rate": 2.6813230198375887e-05,
"loss": 0.4072,
"num_tokens": 1499981717.0,
"step": 5730
},
{
"epoch": 2.6736596736596736,
"grad_norm": 0.35875764676123595,
"learning_rate": 2.6778556460547437e-05,
"loss": 0.4185,
"num_tokens": 1501292437.0,
"step": 5735
},
{
"epoch": 2.675990675990676,
"grad_norm": 0.36986425256659194,
"learning_rate": 2.6743884437719064e-05,
"loss": 0.4052,
"num_tokens": 1502603157.0,
"step": 5740
},
{
"epoch": 2.6783216783216783,
"grad_norm": 0.32481671854646,
"learning_rate": 2.6709214212312362e-05,
"loss": 0.4175,
"num_tokens": 1503913877.0,
"step": 5745
},
{
"epoch": 2.6806526806526807,
"grad_norm": 0.33341909508902867,
"learning_rate": 2.6674545866744627e-05,
"loss": 0.4095,
"num_tokens": 1505212841.0,
"step": 5750
},
{
"epoch": 2.682983682983683,
"grad_norm": 0.3247256800202665,
"learning_rate": 2.663987948342873e-05,
"loss": 0.3978,
"num_tokens": 1506523561.0,
"step": 5755
},
{
"epoch": 2.6853146853146854,
"grad_norm": 0.3693723379792467,
"learning_rate": 2.6605215144772844e-05,
"loss": 0.4031,
"num_tokens": 1507834281.0,
"step": 5760
},
{
"epoch": 2.687645687645688,
"grad_norm": 0.3363247073452088,
"learning_rate": 2.6570552933180275e-05,
"loss": 0.4096,
"num_tokens": 1509145001.0,
"step": 5765
},
{
"epoch": 2.6899766899766897,
"grad_norm": 0.3686548048692657,
"learning_rate": 2.6535892931049304e-05,
"loss": 0.3989,
"num_tokens": 1510455721.0,
"step": 5770
},
{
"epoch": 2.6923076923076925,
"grad_norm": 0.3590538341136808,
"learning_rate": 2.650123522077294e-05,
"loss": 0.4126,
"num_tokens": 1511756221.0,
"step": 5775
},
{
"epoch": 2.6946386946386944,
"grad_norm": 0.3939246015397987,
"learning_rate": 2.6466579884738745e-05,
"loss": 0.4127,
"num_tokens": 1513066941.0,
"step": 5780
},
{
"epoch": 2.6969696969696972,
"grad_norm": 0.36656521638788236,
"learning_rate": 2.6431927005328634e-05,
"loss": 0.4118,
"num_tokens": 1514377661.0,
"step": 5785
},
{
"epoch": 2.699300699300699,
"grad_norm": 0.3786768997732052,
"learning_rate": 2.6397276664918695e-05,
"loss": 0.4056,
"num_tokens": 1515688381.0,
"step": 5790
},
{
"epoch": 2.7016317016317015,
"grad_norm": 0.34250789196379766,
"learning_rate": 2.6362628945878982e-05,
"loss": 0.4237,
"num_tokens": 1516983122.0,
"step": 5795
},
{
"epoch": 2.703962703962704,
"grad_norm": 0.3879698601925601,
"learning_rate": 2.6327983930573275e-05,
"loss": 0.4194,
"num_tokens": 1518293842.0,
"step": 5800
},
{
"epoch": 2.7062937062937062,
"grad_norm": 0.34322275404532027,
"learning_rate": 2.629334170135899e-05,
"loss": 0.421,
"num_tokens": 1519604562.0,
"step": 5805
},
{
"epoch": 2.7086247086247086,
"grad_norm": 0.3274763994589238,
"learning_rate": 2.6258702340586888e-05,
"loss": 0.3991,
"num_tokens": 1520915282.0,
"step": 5810
},
{
"epoch": 2.710955710955711,
"grad_norm": 0.3313466614418822,
"learning_rate": 2.6224065930600895e-05,
"loss": 0.4114,
"num_tokens": 1522226002.0,
"step": 5815
},
{
"epoch": 2.7132867132867133,
"grad_norm": 0.3577956678121881,
"learning_rate": 2.6189432553737965e-05,
"loss": 0.4313,
"num_tokens": 1523536722.0,
"step": 5820
},
{
"epoch": 2.7156177156177157,
"grad_norm": 0.34556719467314156,
"learning_rate": 2.6154802292327795e-05,
"loss": 0.4179,
"num_tokens": 1524847442.0,
"step": 5825
},
{
"epoch": 2.717948717948718,
"grad_norm": 0.3422225730154786,
"learning_rate": 2.6120175228692705e-05,
"loss": 0.4224,
"num_tokens": 1526145343.0,
"step": 5830
},
{
"epoch": 2.7202797202797204,
"grad_norm": 0.3310291884231872,
"learning_rate": 2.608555144514741e-05,
"loss": 0.4104,
"num_tokens": 1527456063.0,
"step": 5835
},
{
"epoch": 2.722610722610723,
"grad_norm": 0.35527150456435,
"learning_rate": 2.6050931023998825e-05,
"loss": 0.4265,
"num_tokens": 1528762674.0,
"step": 5840
},
{
"epoch": 2.7249417249417247,
"grad_norm": 0.3810983867864767,
"learning_rate": 2.601631404754587e-05,
"loss": 0.4156,
"num_tokens": 1530058844.0,
"step": 5845
},
{
"epoch": 2.7272727272727275,
"grad_norm": 0.3920073125528213,
"learning_rate": 2.5981700598079267e-05,
"loss": 0.4202,
"num_tokens": 1531358947.0,
"step": 5850
},
{
"epoch": 2.7296037296037294,
"grad_norm": 0.3259366792942078,
"learning_rate": 2.594709075788138e-05,
"loss": 0.414,
"num_tokens": 1532660121.0,
"step": 5855
},
{
"epoch": 2.731934731934732,
"grad_norm": 0.37607877851481675,
"learning_rate": 2.5912484609225973e-05,
"loss": 0.4125,
"num_tokens": 1533961019.0,
"step": 5860
},
{
"epoch": 2.734265734265734,
"grad_norm": 0.361401132584504,
"learning_rate": 2.5877882234378027e-05,
"loss": 0.4149,
"num_tokens": 1535271739.0,
"step": 5865
},
{
"epoch": 2.7365967365967365,
"grad_norm": 0.3814825573434443,
"learning_rate": 2.584328371559358e-05,
"loss": 0.4171,
"num_tokens": 1536582459.0,
"step": 5870
},
{
"epoch": 2.738927738927739,
"grad_norm": 0.33196554110199666,
"learning_rate": 2.5808689135119484e-05,
"loss": 0.4198,
"num_tokens": 1537893179.0,
"step": 5875
},
{
"epoch": 2.7412587412587412,
"grad_norm": 0.3514505419081785,
"learning_rate": 2.577409857519323e-05,
"loss": 0.4116,
"num_tokens": 1539203899.0,
"step": 5880
},
{
"epoch": 2.7435897435897436,
"grad_norm": 0.36051816664386027,
"learning_rate": 2.573951211804274e-05,
"loss": 0.3955,
"num_tokens": 1540501469.0,
"step": 5885
},
{
"epoch": 2.745920745920746,
"grad_norm": 0.34431578542179486,
"learning_rate": 2.570492984588622e-05,
"loss": 0.4048,
"num_tokens": 1541807849.0,
"step": 5890
},
{
"epoch": 2.7482517482517483,
"grad_norm": 0.3482952718749119,
"learning_rate": 2.56703518409319e-05,
"loss": 0.421,
"num_tokens": 1543118569.0,
"step": 5895
},
{
"epoch": 2.7505827505827507,
"grad_norm": 0.3435903033996109,
"learning_rate": 2.5635778185377846e-05,
"loss": 0.4105,
"num_tokens": 1544429289.0,
"step": 5900
},
{
"epoch": 2.752913752913753,
"grad_norm": 0.3649183613089683,
"learning_rate": 2.5601208961411838e-05,
"loss": 0.4363,
"num_tokens": 1545740009.0,
"step": 5905
},
{
"epoch": 2.755244755244755,
"grad_norm": 0.33542477596319575,
"learning_rate": 2.556664425121108e-05,
"loss": 0.417,
"num_tokens": 1547050729.0,
"step": 5910
},
{
"epoch": 2.757575757575758,
"grad_norm": 0.3612431221112293,
"learning_rate": 2.5532084136942048e-05,
"loss": 0.4106,
"num_tokens": 1548361449.0,
"step": 5915
},
{
"epoch": 2.7599067599067597,
"grad_norm": 0.34457273412289285,
"learning_rate": 2.5497528700760333e-05,
"loss": 0.4111,
"num_tokens": 1549672169.0,
"step": 5920
},
{
"epoch": 2.762237762237762,
"grad_norm": 0.3643149061193825,
"learning_rate": 2.5462978024810347e-05,
"loss": 0.4007,
"num_tokens": 1550982889.0,
"step": 5925
},
{
"epoch": 2.7645687645687644,
"grad_norm": 0.33113278908341853,
"learning_rate": 2.5428432191225226e-05,
"loss": 0.4115,
"num_tokens": 1552293609.0,
"step": 5930
},
{
"epoch": 2.766899766899767,
"grad_norm": 0.324549859393571,
"learning_rate": 2.5393891282126576e-05,
"loss": 0.4147,
"num_tokens": 1553603846.0,
"step": 5935
},
{
"epoch": 2.769230769230769,
"grad_norm": 0.34583557264006637,
"learning_rate": 2.5359355379624317e-05,
"loss": 0.4159,
"num_tokens": 1554898212.0,
"step": 5940
},
{
"epoch": 2.7715617715617715,
"grad_norm": 0.3704574170132862,
"learning_rate": 2.532482456581644e-05,
"loss": 0.4187,
"num_tokens": 1556204437.0,
"step": 5945
},
{
"epoch": 2.773892773892774,
"grad_norm": 0.3427794333227205,
"learning_rate": 2.529029892278886e-05,
"loss": 0.4052,
"num_tokens": 1557515157.0,
"step": 5950
},
{
"epoch": 2.7762237762237763,
"grad_norm": 0.3533422903029431,
"learning_rate": 2.5255778532615194e-05,
"loss": 0.4092,
"num_tokens": 1558825877.0,
"step": 5955
},
{
"epoch": 2.7785547785547786,
"grad_norm": 0.3549142246233924,
"learning_rate": 2.5221263477356572e-05,
"loss": 0.4081,
"num_tokens": 1560136597.0,
"step": 5960
},
{
"epoch": 2.780885780885781,
"grad_norm": 0.38274212191913465,
"learning_rate": 2.5186753839061438e-05,
"loss": 0.4038,
"num_tokens": 1561447317.0,
"step": 5965
},
{
"epoch": 2.7832167832167833,
"grad_norm": 0.33786865946250155,
"learning_rate": 2.5152249699765367e-05,
"loss": 0.4018,
"num_tokens": 1562758037.0,
"step": 5970
},
{
"epoch": 2.7855477855477857,
"grad_norm": 0.37594129759116907,
"learning_rate": 2.5117751141490858e-05,
"loss": 0.4275,
"num_tokens": 1564068757.0,
"step": 5975
},
{
"epoch": 2.787878787878788,
"grad_norm": 0.3547780778843013,
"learning_rate": 2.5083258246247144e-05,
"loss": 0.4107,
"num_tokens": 1565366132.0,
"step": 5980
},
{
"epoch": 2.79020979020979,
"grad_norm": 0.336676677250395,
"learning_rate": 2.5048771096029976e-05,
"loss": 0.4228,
"num_tokens": 1566676852.0,
"step": 5985
},
{
"epoch": 2.792540792540793,
"grad_norm": 0.37105836604705056,
"learning_rate": 2.5014289772821486e-05,
"loss": 0.4141,
"num_tokens": 1567987572.0,
"step": 5990
},
{
"epoch": 2.7948717948717947,
"grad_norm": 0.3641465643333037,
"learning_rate": 2.4979814358589944e-05,
"loss": 0.4268,
"num_tokens": 1569298292.0,
"step": 5995
},
{
"epoch": 2.797202797202797,
"grad_norm": 0.39906084842919354,
"learning_rate": 2.494534493528952e-05,
"loss": 0.4249,
"num_tokens": 1570609012.0,
"step": 6000
},
{
"epoch": 2.7995337995337994,
"grad_norm": 0.33358892565041576,
"learning_rate": 2.491088158486024e-05,
"loss": 0.3972,
"num_tokens": 1571908251.0,
"step": 6005
},
{
"epoch": 2.801864801864802,
"grad_norm": 0.3597994521954223,
"learning_rate": 2.487642438922761e-05,
"loss": 0.3987,
"num_tokens": 1573218971.0,
"step": 6010
},
{
"epoch": 2.804195804195804,
"grad_norm": 0.32444840115208157,
"learning_rate": 2.484197343030253e-05,
"loss": 0.4103,
"num_tokens": 1574529691.0,
"step": 6015
},
{
"epoch": 2.8065268065268065,
"grad_norm": 0.3369493662408065,
"learning_rate": 2.48075287899811e-05,
"loss": 0.4077,
"num_tokens": 1575834151.0,
"step": 6020
},
{
"epoch": 2.808857808857809,
"grad_norm": 0.35772585365072584,
"learning_rate": 2.4773090550144366e-05,
"loss": 0.4176,
"num_tokens": 1577144871.0,
"step": 6025
},
{
"epoch": 2.8111888111888113,
"grad_norm": 0.36110473631062134,
"learning_rate": 2.473865879265817e-05,
"loss": 0.4253,
"num_tokens": 1578449314.0,
"step": 6030
},
{
"epoch": 2.8135198135198136,
"grad_norm": 0.33350627178900166,
"learning_rate": 2.470423359937295e-05,
"loss": 0.4142,
"num_tokens": 1579760034.0,
"step": 6035
},
{
"epoch": 2.815850815850816,
"grad_norm": 0.3597334739451086,
"learning_rate": 2.4669815052123534e-05,
"loss": 0.4125,
"num_tokens": 1581060841.0,
"step": 6040
},
{
"epoch": 2.8181818181818183,
"grad_norm": 0.39863743568305726,
"learning_rate": 2.463540323272896e-05,
"loss": 0.4161,
"num_tokens": 1582371561.0,
"step": 6045
},
{
"epoch": 2.8205128205128203,
"grad_norm": 0.3596208010960054,
"learning_rate": 2.4600998222992257e-05,
"loss": 0.4126,
"num_tokens": 1583682281.0,
"step": 6050
},
{
"epoch": 2.822843822843823,
"grad_norm": 0.3542098381141003,
"learning_rate": 2.456660010470028e-05,
"loss": 0.4164,
"num_tokens": 1584993001.0,
"step": 6055
},
{
"epoch": 2.825174825174825,
"grad_norm": 0.33689485990156454,
"learning_rate": 2.4532208959623488e-05,
"loss": 0.3965,
"num_tokens": 1586303721.0,
"step": 6060
},
{
"epoch": 2.8275058275058274,
"grad_norm": 0.36681615470748274,
"learning_rate": 2.4497824869515773e-05,
"loss": 0.4268,
"num_tokens": 1587614441.0,
"step": 6065
},
{
"epoch": 2.8298368298368297,
"grad_norm": 0.33853074905875574,
"learning_rate": 2.4463447916114273e-05,
"loss": 0.4105,
"num_tokens": 1588909813.0,
"step": 6070
},
{
"epoch": 2.832167832167832,
"grad_norm": 0.32550466329208894,
"learning_rate": 2.4429078181139127e-05,
"loss": 0.4083,
"num_tokens": 1590220533.0,
"step": 6075
},
{
"epoch": 2.8344988344988344,
"grad_norm": 0.3234109896716426,
"learning_rate": 2.439471574629333e-05,
"loss": 0.4162,
"num_tokens": 1591531253.0,
"step": 6080
},
{
"epoch": 2.836829836829837,
"grad_norm": 0.32857277817721126,
"learning_rate": 2.4360360693262524e-05,
"loss": 0.4077,
"num_tokens": 1592841973.0,
"step": 6085
},
{
"epoch": 2.839160839160839,
"grad_norm": 0.37570041632784523,
"learning_rate": 2.4326013103714813e-05,
"loss": 0.4081,
"num_tokens": 1594152693.0,
"step": 6090
},
{
"epoch": 2.8414918414918415,
"grad_norm": 0.36498566644592234,
"learning_rate": 2.4291673059300546e-05,
"loss": 0.4101,
"num_tokens": 1595463413.0,
"step": 6095
},
{
"epoch": 2.843822843822844,
"grad_norm": 0.33322954386185466,
"learning_rate": 2.4257340641652115e-05,
"loss": 0.4203,
"num_tokens": 1596774133.0,
"step": 6100
},
{
"epoch": 2.8461538461538463,
"grad_norm": 0.32749325978795546,
"learning_rate": 2.4223015932383842e-05,
"loss": 0.41,
"num_tokens": 1598084853.0,
"step": 6105
},
{
"epoch": 2.8484848484848486,
"grad_norm": 0.37491209566738504,
"learning_rate": 2.4188699013091665e-05,
"loss": 0.4288,
"num_tokens": 1599395573.0,
"step": 6110
},
{
"epoch": 2.8508158508158505,
"grad_norm": 0.34071317068951223,
"learning_rate": 2.4154389965353025e-05,
"loss": 0.4114,
"num_tokens": 1600702382.0,
"step": 6115
},
{
"epoch": 2.8531468531468533,
"grad_norm": 0.3564070027232447,
"learning_rate": 2.4120088870726675e-05,
"loss": 0.4269,
"num_tokens": 1602013102.0,
"step": 6120
},
{
"epoch": 2.8554778554778553,
"grad_norm": 0.37329758310001454,
"learning_rate": 2.408579581075242e-05,
"loss": 0.4311,
"num_tokens": 1603321783.0,
"step": 6125
},
{
"epoch": 2.857808857808858,
"grad_norm": 0.3300871433307017,
"learning_rate": 2.4051510866950987e-05,
"loss": 0.431,
"num_tokens": 1604632503.0,
"step": 6130
},
{
"epoch": 2.86013986013986,
"grad_norm": 0.33854849401601644,
"learning_rate": 2.4017234120823816e-05,
"loss": 0.4085,
"num_tokens": 1605943223.0,
"step": 6135
},
{
"epoch": 2.8624708624708624,
"grad_norm": 0.3397431033223637,
"learning_rate": 2.3982965653852845e-05,
"loss": 0.427,
"num_tokens": 1607238436.0,
"step": 6140
},
{
"epoch": 2.8648018648018647,
"grad_norm": 0.31163913161758244,
"learning_rate": 2.3948705547500346e-05,
"loss": 0.396,
"num_tokens": 1608549156.0,
"step": 6145
},
{
"epoch": 2.867132867132867,
"grad_norm": 0.36119797833011485,
"learning_rate": 2.391445388320869e-05,
"loss": 0.4219,
"num_tokens": 1609859876.0,
"step": 6150
},
{
"epoch": 2.8694638694638694,
"grad_norm": 0.35324575640930755,
"learning_rate": 2.388021074240021e-05,
"loss": 0.4045,
"num_tokens": 1611170596.0,
"step": 6155
},
{
"epoch": 2.871794871794872,
"grad_norm": 0.34168122235530196,
"learning_rate": 2.3845976206476962e-05,
"loss": 0.4119,
"num_tokens": 1612481316.0,
"step": 6160
},
{
"epoch": 2.874125874125874,
"grad_norm": 0.3267756311441551,
"learning_rate": 2.381175035682055e-05,
"loss": 0.4068,
"num_tokens": 1613780656.0,
"step": 6165
},
{
"epoch": 2.8764568764568765,
"grad_norm": 0.33042098367758244,
"learning_rate": 2.377753327479193e-05,
"loss": 0.4063,
"num_tokens": 1615091376.0,
"step": 6170
},
{
"epoch": 2.878787878787879,
"grad_norm": 0.3484728853535675,
"learning_rate": 2.374332504173121e-05,
"loss": 0.4062,
"num_tokens": 1616402096.0,
"step": 6175
},
{
"epoch": 2.8811188811188813,
"grad_norm": 0.3454455029684927,
"learning_rate": 2.3709125738957467e-05,
"loss": 0.4047,
"num_tokens": 1617712816.0,
"step": 6180
},
{
"epoch": 2.8834498834498836,
"grad_norm": 0.35256247777872834,
"learning_rate": 2.3674935447768547e-05,
"loss": 0.4092,
"num_tokens": 1619023536.0,
"step": 6185
},
{
"epoch": 2.8857808857808855,
"grad_norm": 0.34447264412116185,
"learning_rate": 2.3640754249440893e-05,
"loss": 0.4171,
"num_tokens": 1620320855.0,
"step": 6190
},
{
"epoch": 2.8881118881118883,
"grad_norm": 0.3300552790749895,
"learning_rate": 2.360658222522929e-05,
"loss": 0.404,
"num_tokens": 1621631575.0,
"step": 6195
},
{
"epoch": 2.8904428904428903,
"grad_norm": 0.3301478105077118,
"learning_rate": 2.357241945636674e-05,
"loss": 0.4041,
"num_tokens": 1622942295.0,
"step": 6200
},
{
"epoch": 2.8927738927738926,
"grad_norm": 0.3241286505711926,
"learning_rate": 2.3538266024064272e-05,
"loss": 0.4088,
"num_tokens": 1624236677.0,
"step": 6205
},
{
"epoch": 2.895104895104895,
"grad_norm": 0.3393889730654891,
"learning_rate": 2.350412200951066e-05,
"loss": 0.4045,
"num_tokens": 1625547397.0,
"step": 6210
},
{
"epoch": 2.8974358974358974,
"grad_norm": 0.3722971080352807,
"learning_rate": 2.346998749387233e-05,
"loss": 0.3972,
"num_tokens": 1626858117.0,
"step": 6215
},
{
"epoch": 2.8997668997668997,
"grad_norm": 0.3481952783229655,
"learning_rate": 2.3435862558293137e-05,
"loss": 0.4185,
"num_tokens": 1628168837.0,
"step": 6220
},
{
"epoch": 2.902097902097902,
"grad_norm": 0.3575311095008992,
"learning_rate": 2.3401747283894122e-05,
"loss": 0.4089,
"num_tokens": 1629479557.0,
"step": 6225
},
{
"epoch": 2.9044289044289044,
"grad_norm": 0.3213929542319523,
"learning_rate": 2.3367641751773388e-05,
"loss": 0.4044,
"num_tokens": 1630783572.0,
"step": 6230
},
{
"epoch": 2.906759906759907,
"grad_norm": 0.33775314492186553,
"learning_rate": 2.3333546043005877e-05,
"loss": 0.4117,
"num_tokens": 1632094292.0,
"step": 6235
},
{
"epoch": 2.909090909090909,
"grad_norm": 0.35583582490206767,
"learning_rate": 2.3299460238643178e-05,
"loss": 0.4191,
"num_tokens": 1633405012.0,
"step": 6240
},
{
"epoch": 2.9114219114219115,
"grad_norm": 0.3403055493539477,
"learning_rate": 2.3265384419713325e-05,
"loss": 0.4074,
"num_tokens": 1634715732.0,
"step": 6245
},
{
"epoch": 2.913752913752914,
"grad_norm": 0.32361564388685027,
"learning_rate": 2.3231318667220624e-05,
"loss": 0.4047,
"num_tokens": 1636021018.0,
"step": 6250
},
{
"epoch": 2.916083916083916,
"grad_norm": 0.36911990299906383,
"learning_rate": 2.3197263062145457e-05,
"loss": 0.3952,
"num_tokens": 1637331738.0,
"step": 6255
},
{
"epoch": 2.9184149184149186,
"grad_norm": 0.3529888818933417,
"learning_rate": 2.3163217685444067e-05,
"loss": 0.4037,
"num_tokens": 1638640505.0,
"step": 6260
},
{
"epoch": 2.9207459207459205,
"grad_norm": 0.3228114643758619,
"learning_rate": 2.312918261804839e-05,
"loss": 0.4039,
"num_tokens": 1639951225.0,
"step": 6265
},
{
"epoch": 2.9230769230769234,
"grad_norm": 0.37676066453773155,
"learning_rate": 2.3095157940865876e-05,
"loss": 0.408,
"num_tokens": 1641261945.0,
"step": 6270
},
{
"epoch": 2.9254079254079253,
"grad_norm": 0.3498311618012844,
"learning_rate": 2.3061143734779235e-05,
"loss": 0.4052,
"num_tokens": 1642572665.0,
"step": 6275
},
{
"epoch": 2.9277389277389276,
"grad_norm": 0.3793591470148143,
"learning_rate": 2.3027140080646313e-05,
"loss": 0.4059,
"num_tokens": 1643883385.0,
"step": 6280
},
{
"epoch": 2.93006993006993,
"grad_norm": 0.30604459853495847,
"learning_rate": 2.299314705929987e-05,
"loss": 0.4008,
"num_tokens": 1645194105.0,
"step": 6285
},
{
"epoch": 2.9324009324009324,
"grad_norm": 0.3471040950462663,
"learning_rate": 2.295916475154739e-05,
"loss": 0.4152,
"num_tokens": 1646501490.0,
"step": 6290
},
{
"epoch": 2.9347319347319347,
"grad_norm": 0.3497693139520713,
"learning_rate": 2.292519323817087e-05,
"loss": 0.4247,
"num_tokens": 1647803839.0,
"step": 6295
},
{
"epoch": 2.937062937062937,
"grad_norm": 0.37949081912769744,
"learning_rate": 2.2891232599926666e-05,
"loss": 0.4094,
"num_tokens": 1649114559.0,
"step": 6300
},
{
"epoch": 2.9393939393939394,
"grad_norm": 0.3885455375827252,
"learning_rate": 2.2857282917545285e-05,
"loss": 0.4188,
"num_tokens": 1650425279.0,
"step": 6305
},
{
"epoch": 2.941724941724942,
"grad_norm": 0.3477949975861951,
"learning_rate": 2.2823344271731184e-05,
"loss": 0.4176,
"num_tokens": 1651731670.0,
"step": 6310
},
{
"epoch": 2.944055944055944,
"grad_norm": 0.4148827072433097,
"learning_rate": 2.2789416743162567e-05,
"loss": 0.4097,
"num_tokens": 1653042390.0,
"step": 6315
},
{
"epoch": 2.9463869463869465,
"grad_norm": 0.34732540637824916,
"learning_rate": 2.275550041249124e-05,
"loss": 0.4112,
"num_tokens": 1654353110.0,
"step": 6320
},
{
"epoch": 2.948717948717949,
"grad_norm": 0.32571968477635,
"learning_rate": 2.272159536034238e-05,
"loss": 0.4127,
"num_tokens": 1655663830.0,
"step": 6325
},
{
"epoch": 2.951048951048951,
"grad_norm": 0.34646771087112505,
"learning_rate": 2.2687701667314327e-05,
"loss": 0.4042,
"num_tokens": 1656974550.0,
"step": 6330
},
{
"epoch": 2.9533799533799536,
"grad_norm": 0.3649182836596108,
"learning_rate": 2.2653819413978454e-05,
"loss": 0.3955,
"num_tokens": 1658285270.0,
"step": 6335
},
{
"epoch": 2.9557109557109555,
"grad_norm": 0.36915878506354805,
"learning_rate": 2.261994868087893e-05,
"loss": 0.4217,
"num_tokens": 1659595990.0,
"step": 6340
},
{
"epoch": 2.958041958041958,
"grad_norm": 0.34333413093504295,
"learning_rate": 2.258608954853252e-05,
"loss": 0.4019,
"num_tokens": 1660899543.0,
"step": 6345
},
{
"epoch": 2.9603729603729603,
"grad_norm": 0.3548268496609685,
"learning_rate": 2.2552242097428432e-05,
"loss": 0.4111,
"num_tokens": 1662198615.0,
"step": 6350
},
{
"epoch": 2.9627039627039626,
"grad_norm": 0.33885473039293307,
"learning_rate": 2.2518406408028108e-05,
"loss": 0.4136,
"num_tokens": 1663509335.0,
"step": 6355
},
{
"epoch": 2.965034965034965,
"grad_norm": 0.31938033848179553,
"learning_rate": 2.2484582560765012e-05,
"loss": 0.4059,
"num_tokens": 1664820055.0,
"step": 6360
},
{
"epoch": 2.9673659673659674,
"grad_norm": 0.33392158191585564,
"learning_rate": 2.245077063604446e-05,
"loss": 0.4011,
"num_tokens": 1666130775.0,
"step": 6365
},
{
"epoch": 2.9696969696969697,
"grad_norm": 0.31402160061675133,
"learning_rate": 2.241697071424345e-05,
"loss": 0.4065,
"num_tokens": 1667441495.0,
"step": 6370
},
{
"epoch": 2.972027972027972,
"grad_norm": 0.3762934050333541,
"learning_rate": 2.2383182875710424e-05,
"loss": 0.4137,
"num_tokens": 1668752215.0,
"step": 6375
},
{
"epoch": 2.9743589743589745,
"grad_norm": 0.34874468121941143,
"learning_rate": 2.23494072007651e-05,
"loss": 0.4089,
"num_tokens": 1670062935.0,
"step": 6380
},
{
"epoch": 2.976689976689977,
"grad_norm": 0.31091397228227735,
"learning_rate": 2.231564376969829e-05,
"loss": 0.4169,
"num_tokens": 1671372884.0,
"step": 6385
},
{
"epoch": 2.979020979020979,
"grad_norm": 0.34721881259745,
"learning_rate": 2.2281892662771703e-05,
"loss": 0.4073,
"num_tokens": 1672668051.0,
"step": 6390
},
{
"epoch": 2.981351981351981,
"grad_norm": 0.32893411375726883,
"learning_rate": 2.224815396021772e-05,
"loss": 0.4115,
"num_tokens": 1673978771.0,
"step": 6395
},
{
"epoch": 2.983682983682984,
"grad_norm": 0.3148206325892906,
"learning_rate": 2.221442774223929e-05,
"loss": 0.3946,
"num_tokens": 1675289491.0,
"step": 6400
},
{
"epoch": 2.986013986013986,
"grad_norm": 0.33342285443614306,
"learning_rate": 2.2180714089009652e-05,
"loss": 0.3874,
"num_tokens": 1676600211.0,
"step": 6405
},
{
"epoch": 2.988344988344988,
"grad_norm": 0.3144256371115021,
"learning_rate": 2.214701308067216e-05,
"loss": 0.4006,
"num_tokens": 1677910931.0,
"step": 6410
},
{
"epoch": 2.9906759906759905,
"grad_norm": 0.33559767396859935,
"learning_rate": 2.211332479734013e-05,
"loss": 0.4079,
"num_tokens": 1679221651.0,
"step": 6415
},
{
"epoch": 2.993006993006993,
"grad_norm": 0.3406414559616791,
"learning_rate": 2.207964931909663e-05,
"loss": 0.3905,
"num_tokens": 1680532371.0,
"step": 6420
},
{
"epoch": 2.9953379953379953,
"grad_norm": 0.34257530422603977,
"learning_rate": 2.2045986725994287e-05,
"loss": 0.4173,
"num_tokens": 1681843091.0,
"step": 6425
},
{
"epoch": 2.9976689976689976,
"grad_norm": 0.36116021112956004,
"learning_rate": 2.2012337098055086e-05,
"loss": 0.4182,
"num_tokens": 1683153811.0,
"step": 6430
},
{
"epoch": 3.0,
"grad_norm": 0.33421231681330704,
"learning_rate": 2.19787005152702e-05,
"loss": 0.3941,
"num_tokens": 1684464531.0,
"step": 6435
},
{
"epoch": 3.0023310023310024,
"grad_norm": 0.3393950285548961,
"learning_rate": 2.1945077057599804e-05,
"loss": 0.3565,
"num_tokens": 1685775251.0,
"step": 6440
},
{
"epoch": 3.0046620046620047,
"grad_norm": 0.3640517357882801,
"learning_rate": 2.191146680497284e-05,
"loss": 0.3643,
"num_tokens": 1687076615.0,
"step": 6445
},
{
"epoch": 3.006993006993007,
"grad_norm": 0.3644950262571567,
"learning_rate": 2.1877869837286896e-05,
"loss": 0.3655,
"num_tokens": 1688387335.0,
"step": 6450
},
{
"epoch": 3.0093240093240095,
"grad_norm": 0.36701056648807145,
"learning_rate": 2.1844286234407947e-05,
"loss": 0.3499,
"num_tokens": 1689698055.0,
"step": 6455
},
{
"epoch": 3.011655011655012,
"grad_norm": 0.3865682129879495,
"learning_rate": 2.181071607617022e-05,
"loss": 0.3712,
"num_tokens": 1691008775.0,
"step": 6460
},
{
"epoch": 3.013986013986014,
"grad_norm": 0.33558912130902724,
"learning_rate": 2.1777159442375967e-05,
"loss": 0.362,
"num_tokens": 1692319495.0,
"step": 6465
},
{
"epoch": 3.016317016317016,
"grad_norm": 0.3541099475736384,
"learning_rate": 2.1743616412795303e-05,
"loss": 0.3473,
"num_tokens": 1693630024.0,
"step": 6470
},
{
"epoch": 3.0186480186480185,
"grad_norm": 0.3374133971809518,
"learning_rate": 2.1710087067165998e-05,
"loss": 0.3659,
"num_tokens": 1694940744.0,
"step": 6475
},
{
"epoch": 3.020979020979021,
"grad_norm": 0.31181861303337255,
"learning_rate": 2.1676571485193282e-05,
"loss": 0.367,
"num_tokens": 1696244679.0,
"step": 6480
},
{
"epoch": 3.023310023310023,
"grad_norm": 0.34475890728017977,
"learning_rate": 2.1643069746549694e-05,
"loss": 0.3575,
"num_tokens": 1697555399.0,
"step": 6485
},
{
"epoch": 3.0256410256410255,
"grad_norm": 0.3467950380584418,
"learning_rate": 2.1609581930874835e-05,
"loss": 0.3531,
"num_tokens": 1698866119.0,
"step": 6490
},
{
"epoch": 3.027972027972028,
"grad_norm": 0.3620598508450458,
"learning_rate": 2.1576108117775205e-05,
"loss": 0.3685,
"num_tokens": 1700166619.0,
"step": 6495
},
{
"epoch": 3.0303030303030303,
"grad_norm": 0.32443886801232974,
"learning_rate": 2.154264838682407e-05,
"loss": 0.3438,
"num_tokens": 1701477339.0,
"step": 6500
},
{
"epoch": 3.0326340326340326,
"grad_norm": 0.3529808773793806,
"learning_rate": 2.1509202817561164e-05,
"loss": 0.3613,
"num_tokens": 1702788059.0,
"step": 6505
},
{
"epoch": 3.034965034965035,
"grad_norm": 0.34696686691787276,
"learning_rate": 2.1475771489492567e-05,
"loss": 0.3548,
"num_tokens": 1704098779.0,
"step": 6510
},
{
"epoch": 3.0372960372960374,
"grad_norm": 0.35892737626187393,
"learning_rate": 2.144235448209052e-05,
"loss": 0.3546,
"num_tokens": 1705409499.0,
"step": 6515
},
{
"epoch": 3.0396270396270397,
"grad_norm": 0.3442123945875795,
"learning_rate": 2.140895187479322e-05,
"loss": 0.3461,
"num_tokens": 1706707069.0,
"step": 6520
},
{
"epoch": 3.041958041958042,
"grad_norm": 0.3173856196639699,
"learning_rate": 2.137556374700463e-05,
"loss": 0.3513,
"num_tokens": 1708017789.0,
"step": 6525
},
{
"epoch": 3.0442890442890445,
"grad_norm": 0.34717882632048175,
"learning_rate": 2.1342190178094267e-05,
"loss": 0.3616,
"num_tokens": 1709324169.0,
"step": 6530
},
{
"epoch": 3.046620046620047,
"grad_norm": 0.36986993805338514,
"learning_rate": 2.1308831247397094e-05,
"loss": 0.3543,
"num_tokens": 1710634889.0,
"step": 6535
},
{
"epoch": 3.0489510489510487,
"grad_norm": 0.3537014861009382,
"learning_rate": 2.1275487034213227e-05,
"loss": 0.3434,
"num_tokens": 1711945609.0,
"step": 6540
},
{
"epoch": 3.051282051282051,
"grad_norm": 0.3753264100076296,
"learning_rate": 2.1242157617807807e-05,
"loss": 0.3509,
"num_tokens": 1713256329.0,
"step": 6545
},
{
"epoch": 3.0536130536130535,
"grad_norm": 0.3526114805968593,
"learning_rate": 2.1208843077410816e-05,
"loss": 0.3542,
"num_tokens": 1714567049.0,
"step": 6550
},
{
"epoch": 3.055944055944056,
"grad_norm": 0.32727730145454015,
"learning_rate": 2.117554349221687e-05,
"loss": 0.3536,
"num_tokens": 1715877769.0,
"step": 6555
},
{
"epoch": 3.058275058275058,
"grad_norm": 0.3437242965711344,
"learning_rate": 2.1142258941385012e-05,
"loss": 0.3525,
"num_tokens": 1717182601.0,
"step": 6560
},
{
"epoch": 3.0606060606060606,
"grad_norm": 0.33805241257757357,
"learning_rate": 2.1108989504038567e-05,
"loss": 0.3603,
"num_tokens": 1718493321.0,
"step": 6565
},
{
"epoch": 3.062937062937063,
"grad_norm": 0.3645037205678424,
"learning_rate": 2.1075735259264935e-05,
"loss": 0.3576,
"num_tokens": 1719804041.0,
"step": 6570
},
{
"epoch": 3.0652680652680653,
"grad_norm": 0.33019158555715583,
"learning_rate": 2.1042496286115383e-05,
"loss": 0.3455,
"num_tokens": 1721114761.0,
"step": 6575
},
{
"epoch": 3.0675990675990676,
"grad_norm": 0.35114211762624814,
"learning_rate": 2.100927266360487e-05,
"loss": 0.3624,
"num_tokens": 1722425481.0,
"step": 6580
},
{
"epoch": 3.06993006993007,
"grad_norm": 0.341780681859243,
"learning_rate": 2.0976064470711908e-05,
"loss": 0.3487,
"num_tokens": 1723733066.0,
"step": 6585
},
{
"epoch": 3.0722610722610724,
"grad_norm": 0.31101561100825387,
"learning_rate": 2.0942871786378283e-05,
"loss": 0.3424,
"num_tokens": 1725043786.0,
"step": 6590
},
{
"epoch": 3.0745920745920747,
"grad_norm": 0.3413708055501447,
"learning_rate": 2.090969468950892e-05,
"loss": 0.3449,
"num_tokens": 1726347459.0,
"step": 6595
},
{
"epoch": 3.076923076923077,
"grad_norm": 0.3631157917623202,
"learning_rate": 2.087653325897172e-05,
"loss": 0.3533,
"num_tokens": 1727658179.0,
"step": 6600
},
{
"epoch": 3.0792540792540795,
"grad_norm": 0.35244419005544364,
"learning_rate": 2.0843387573597324e-05,
"loss": 0.3594,
"num_tokens": 1728968899.0,
"step": 6605
},
{
"epoch": 3.0815850815850814,
"grad_norm": 0.3705036974511524,
"learning_rate": 2.0810257712178914e-05,
"loss": 0.3519,
"num_tokens": 1730278354.0,
"step": 6610
},
{
"epoch": 3.0839160839160837,
"grad_norm": 0.37121857812051107,
"learning_rate": 2.077714375347213e-05,
"loss": 0.344,
"num_tokens": 1731589074.0,
"step": 6615
},
{
"epoch": 3.086247086247086,
"grad_norm": 0.5821507533226011,
"learning_rate": 2.074404577619472e-05,
"loss": 0.3588,
"num_tokens": 1732899794.0,
"step": 6620
},
{
"epoch": 3.0885780885780885,
"grad_norm": 0.34294237084456936,
"learning_rate": 2.071096385902651e-05,
"loss": 0.3466,
"num_tokens": 1734195166.0,
"step": 6625
},
{
"epoch": 3.090909090909091,
"grad_norm": 0.3654999256904382,
"learning_rate": 2.067789808060911e-05,
"loss": 0.361,
"num_tokens": 1735505886.0,
"step": 6630
},
{
"epoch": 3.093240093240093,
"grad_norm": 0.33471915902491023,
"learning_rate": 2.064484851954579e-05,
"loss": 0.3542,
"num_tokens": 1736816606.0,
"step": 6635
},
{
"epoch": 3.0955710955710956,
"grad_norm": 0.3378622718593044,
"learning_rate": 2.061181525440124e-05,
"loss": 0.3542,
"num_tokens": 1738127326.0,
"step": 6640
},
{
"epoch": 3.097902097902098,
"grad_norm": 0.3179265953787293,
"learning_rate": 2.057879836370144e-05,
"loss": 0.3444,
"num_tokens": 1739438046.0,
"step": 6645
},
{
"epoch": 3.1002331002331003,
"grad_norm": 0.3616349714374049,
"learning_rate": 2.0545797925933437e-05,
"loss": 0.3502,
"num_tokens": 1740748766.0,
"step": 6650
},
{
"epoch": 3.1025641025641026,
"grad_norm": 0.3304854559937638,
"learning_rate": 2.0512814019545153e-05,
"loss": 0.3549,
"num_tokens": 1742059486.0,
"step": 6655
},
{
"epoch": 3.104895104895105,
"grad_norm": 0.3524011066513125,
"learning_rate": 2.047984672294521e-05,
"loss": 0.3465,
"num_tokens": 1743360293.0,
"step": 6660
},
{
"epoch": 3.1072261072261074,
"grad_norm": 0.35458352089191,
"learning_rate": 2.044689611450279e-05,
"loss": 0.3549,
"num_tokens": 1744671013.0,
"step": 6665
},
{
"epoch": 3.1095571095571097,
"grad_norm": 0.3269683114844527,
"learning_rate": 2.0413962272547343e-05,
"loss": 0.3686,
"num_tokens": 1745981733.0,
"step": 6670
},
{
"epoch": 3.111888111888112,
"grad_norm": 0.33666988209851806,
"learning_rate": 2.0381045275368504e-05,
"loss": 0.3569,
"num_tokens": 1747292453.0,
"step": 6675
},
{
"epoch": 3.114219114219114,
"grad_norm": 0.36146342082712163,
"learning_rate": 2.034814520121584e-05,
"loss": 0.3628,
"num_tokens": 1748603173.0,
"step": 6680
},
{
"epoch": 3.1165501165501164,
"grad_norm": 0.3358286209657946,
"learning_rate": 2.0315262128298713e-05,
"loss": 0.3503,
"num_tokens": 1749913893.0,
"step": 6685
},
{
"epoch": 3.1188811188811187,
"grad_norm": 0.335084467637873,
"learning_rate": 2.0282396134786052e-05,
"loss": 0.3654,
"num_tokens": 1751212459.0,
"step": 6690
},
{
"epoch": 3.121212121212121,
"grad_norm": 0.3438646934966656,
"learning_rate": 2.024954729880618e-05,
"loss": 0.3603,
"num_tokens": 1752511336.0,
"step": 6695
},
{
"epoch": 3.1235431235431235,
"grad_norm": 0.3361069117028752,
"learning_rate": 2.0216715698446665e-05,
"loss": 0.359,
"num_tokens": 1753807515.0,
"step": 6700
},
{
"epoch": 3.125874125874126,
"grad_norm": 0.34476799845177647,
"learning_rate": 2.0183901411754074e-05,
"loss": 0.3559,
"num_tokens": 1755118235.0,
"step": 6705
},
{
"epoch": 3.128205128205128,
"grad_norm": 0.3307136432579508,
"learning_rate": 2.01511045167338e-05,
"loss": 0.3595,
"num_tokens": 1756428955.0,
"step": 6710
},
{
"epoch": 3.1305361305361306,
"grad_norm": 0.38351409943354237,
"learning_rate": 2.011832509134996e-05,
"loss": 0.3515,
"num_tokens": 1757739675.0,
"step": 6715
},
{
"epoch": 3.132867132867133,
"grad_norm": 0.3085970573121658,
"learning_rate": 2.0085563213525065e-05,
"loss": 0.3622,
"num_tokens": 1759050395.0,
"step": 6720
},
{
"epoch": 3.1351981351981353,
"grad_norm": 0.33740198770023655,
"learning_rate": 2.005281896113997e-05,
"loss": 0.3564,
"num_tokens": 1760361115.0,
"step": 6725
},
{
"epoch": 3.1375291375291376,
"grad_norm": 0.3727057443139077,
"learning_rate": 2.0020092412033587e-05,
"loss": 0.3651,
"num_tokens": 1761671835.0,
"step": 6730
},
{
"epoch": 3.13986013986014,
"grad_norm": 0.3237016735245595,
"learning_rate": 1.9987383644002776e-05,
"loss": 0.355,
"num_tokens": 1762982555.0,
"step": 6735
},
{
"epoch": 3.1421911421911424,
"grad_norm": 0.3357114616276399,
"learning_rate": 1.995469273480212e-05,
"loss": 0.3566,
"num_tokens": 1764293275.0,
"step": 6740
},
{
"epoch": 3.1445221445221447,
"grad_norm": 0.3438598090126279,
"learning_rate": 1.9922019762143744e-05,
"loss": 0.3583,
"num_tokens": 1765603995.0,
"step": 6745
},
{
"epoch": 3.1468531468531467,
"grad_norm": 0.3456073924056793,
"learning_rate": 1.9889364803697137e-05,
"loss": 0.3781,
"num_tokens": 1766914715.0,
"step": 6750
},
{
"epoch": 3.149184149184149,
"grad_norm": 0.33427973566883806,
"learning_rate": 1.9856727937088955e-05,
"loss": 0.3451,
"num_tokens": 1768213787.0,
"step": 6755
},
{
"epoch": 3.1515151515151514,
"grad_norm": 0.3420541495199111,
"learning_rate": 1.9824109239902865e-05,
"loss": 0.3705,
"num_tokens": 1769519941.0,
"step": 6760
},
{
"epoch": 3.1538461538461537,
"grad_norm": 0.3396999634927412,
"learning_rate": 1.9791508789679337e-05,
"loss": 0.3563,
"num_tokens": 1770830661.0,
"step": 6765
},
{
"epoch": 3.156177156177156,
"grad_norm": 0.3580402097640208,
"learning_rate": 1.9758926663915455e-05,
"loss": 0.3635,
"num_tokens": 1772141381.0,
"step": 6770
},
{
"epoch": 3.1585081585081585,
"grad_norm": 0.3349041898550379,
"learning_rate": 1.9726362940064752e-05,
"loss": 0.3514,
"num_tokens": 1773452101.0,
"step": 6775
},
{
"epoch": 3.160839160839161,
"grad_norm": 0.33160721758218376,
"learning_rate": 1.9693817695537e-05,
"loss": 0.3556,
"num_tokens": 1774762821.0,
"step": 6780
},
{
"epoch": 3.163170163170163,
"grad_norm": 0.34077147929405477,
"learning_rate": 1.9661291007698062e-05,
"loss": 0.3549,
"num_tokens": 1776073541.0,
"step": 6785
},
{
"epoch": 3.1655011655011656,
"grad_norm": 0.350707492092592,
"learning_rate": 1.9628782953869696e-05,
"loss": 0.3575,
"num_tokens": 1777384261.0,
"step": 6790
},
{
"epoch": 3.167832167832168,
"grad_norm": 0.32637137207920947,
"learning_rate": 1.959629361132932e-05,
"loss": 0.3487,
"num_tokens": 1778681608.0,
"step": 6795
},
{
"epoch": 3.1701631701631703,
"grad_norm": 0.34823978144338197,
"learning_rate": 1.956382305730993e-05,
"loss": 0.3579,
"num_tokens": 1779992328.0,
"step": 6800
},
{
"epoch": 3.1724941724941726,
"grad_norm": 0.3231155957000068,
"learning_rate": 1.953137136899982e-05,
"loss": 0.3485,
"num_tokens": 1781303048.0,
"step": 6805
},
{
"epoch": 3.174825174825175,
"grad_norm": 0.32946568056880193,
"learning_rate": 1.9498938623542418e-05,
"loss": 0.3536,
"num_tokens": 1782611783.0,
"step": 6810
},
{
"epoch": 3.177156177156177,
"grad_norm": 0.33586077368245165,
"learning_rate": 1.94665248980362e-05,
"loss": 0.3476,
"num_tokens": 1783922503.0,
"step": 6815
},
{
"epoch": 3.1794871794871793,
"grad_norm": 0.345953994294241,
"learning_rate": 1.943413026953434e-05,
"loss": 0.3567,
"num_tokens": 1785233223.0,
"step": 6820
},
{
"epoch": 3.1818181818181817,
"grad_norm": 0.3350392695147026,
"learning_rate": 1.9401754815044665e-05,
"loss": 0.368,
"num_tokens": 1786536776.0,
"step": 6825
},
{
"epoch": 3.184149184149184,
"grad_norm": 0.34554062524489243,
"learning_rate": 1.9369398611529405e-05,
"loss": 0.3589,
"num_tokens": 1787847496.0,
"step": 6830
},
{
"epoch": 3.1864801864801864,
"grad_norm": 0.3554652710783725,
"learning_rate": 1.9337061735905038e-05,
"loss": 0.3516,
"num_tokens": 1789158216.0,
"step": 6835
},
{
"epoch": 3.1888111888111887,
"grad_norm": 0.3893789449502026,
"learning_rate": 1.930474426504209e-05,
"loss": 0.3631,
"num_tokens": 1790468936.0,
"step": 6840
},
{
"epoch": 3.191142191142191,
"grad_norm": 0.3556543352743324,
"learning_rate": 1.9272446275764954e-05,
"loss": 0.3733,
"num_tokens": 1791779656.0,
"step": 6845
},
{
"epoch": 3.1934731934731935,
"grad_norm": 0.35483552905034643,
"learning_rate": 1.924016784485172e-05,
"loss": 0.3609,
"num_tokens": 1793090376.0,
"step": 6850
},
{
"epoch": 3.195804195804196,
"grad_norm": 0.33949854975550836,
"learning_rate": 1.9207909049033972e-05,
"loss": 0.3584,
"num_tokens": 1794401096.0,
"step": 6855
},
{
"epoch": 3.198135198135198,
"grad_norm": 0.33640749306479956,
"learning_rate": 1.9175669964996636e-05,
"loss": 0.3633,
"num_tokens": 1795711816.0,
"step": 6860
},
{
"epoch": 3.2004662004662006,
"grad_norm": 0.3409998399187101,
"learning_rate": 1.9143450669377762e-05,
"loss": 0.3634,
"num_tokens": 1797022536.0,
"step": 6865
},
{
"epoch": 3.202797202797203,
"grad_norm": 0.3275911678319333,
"learning_rate": 1.9111251238768373e-05,
"loss": 0.3487,
"num_tokens": 1798317072.0,
"step": 6870
},
{
"epoch": 3.2051282051282053,
"grad_norm": 0.3348585258935862,
"learning_rate": 1.9079071749712262e-05,
"loss": 0.354,
"num_tokens": 1799619860.0,
"step": 6875
},
{
"epoch": 3.2074592074592077,
"grad_norm": 0.34340870474918594,
"learning_rate": 1.9046912278705815e-05,
"loss": 0.363,
"num_tokens": 1800930580.0,
"step": 6880
},
{
"epoch": 3.20979020979021,
"grad_norm": 0.3407672323919426,
"learning_rate": 1.901477290219784e-05,
"loss": 0.3573,
"num_tokens": 1802235866.0,
"step": 6885
},
{
"epoch": 3.212121212121212,
"grad_norm": 0.3586302213819541,
"learning_rate": 1.898265369658938e-05,
"loss": 0.3595,
"num_tokens": 1803546586.0,
"step": 6890
},
{
"epoch": 3.2144522144522143,
"grad_norm": 0.3352168444831923,
"learning_rate": 1.8950554738233495e-05,
"loss": 0.3547,
"num_tokens": 1804840952.0,
"step": 6895
},
{
"epoch": 3.2167832167832167,
"grad_norm": 0.33771864707994703,
"learning_rate": 1.8918476103435174e-05,
"loss": 0.3581,
"num_tokens": 1806151672.0,
"step": 6900
},
{
"epoch": 3.219114219114219,
"grad_norm": 0.32791228261887584,
"learning_rate": 1.888641786845102e-05,
"loss": 0.3475,
"num_tokens": 1807462392.0,
"step": 6905
},
{
"epoch": 3.2214452214452214,
"grad_norm": 0.3475449829583515,
"learning_rate": 1.8854380109489206e-05,
"loss": 0.3597,
"num_tokens": 1808759709.0,
"step": 6910
},
{
"epoch": 3.2237762237762237,
"grad_norm": 0.349286003745334,
"learning_rate": 1.88223629027092e-05,
"loss": 0.3702,
"num_tokens": 1810057084.0,
"step": 6915
},
{
"epoch": 3.226107226107226,
"grad_norm": 0.34279824899570205,
"learning_rate": 1.8790366324221616e-05,
"loss": 0.3572,
"num_tokens": 1811367804.0,
"step": 6920
},
{
"epoch": 3.2284382284382285,
"grad_norm": 0.3270543901207085,
"learning_rate": 1.8758390450088025e-05,
"loss": 0.3581,
"num_tokens": 1812678524.0,
"step": 6925
},
{
"epoch": 3.230769230769231,
"grad_norm": 0.3719710378237588,
"learning_rate": 1.8726435356320804e-05,
"loss": 0.3503,
"num_tokens": 1813989244.0,
"step": 6930
},
{
"epoch": 3.233100233100233,
"grad_norm": 0.34389192152825504,
"learning_rate": 1.8694501118882902e-05,
"loss": 0.3677,
"num_tokens": 1815299964.0,
"step": 6935
},
{
"epoch": 3.2354312354312356,
"grad_norm": 0.3206403097527311,
"learning_rate": 1.8662587813687704e-05,
"loss": 0.3698,
"num_tokens": 1816610684.0,
"step": 6940
},
{
"epoch": 3.237762237762238,
"grad_norm": 0.3333938729526916,
"learning_rate": 1.8630695516598832e-05,
"loss": 0.3517,
"num_tokens": 1817921404.0,
"step": 6945
},
{
"epoch": 3.2400932400932403,
"grad_norm": 0.3453920815805985,
"learning_rate": 1.8598824303429985e-05,
"loss": 0.3608,
"num_tokens": 1819232124.0,
"step": 6950
},
{
"epoch": 3.242424242424242,
"grad_norm": 0.34058628970443383,
"learning_rate": 1.8566974249944707e-05,
"loss": 0.356,
"num_tokens": 1820534563.0,
"step": 6955
},
{
"epoch": 3.2447552447552446,
"grad_norm": 0.3288432699435565,
"learning_rate": 1.8535145431856266e-05,
"loss": 0.3554,
"num_tokens": 1821845283.0,
"step": 6960
},
{
"epoch": 3.247086247086247,
"grad_norm": 0.35168833123437004,
"learning_rate": 1.8503337924827446e-05,
"loss": 0.3537,
"num_tokens": 1823156003.0,
"step": 6965
},
{
"epoch": 3.2494172494172493,
"grad_norm": 0.3392709841202291,
"learning_rate": 1.8471551804470372e-05,
"loss": 0.3557,
"num_tokens": 1824466723.0,
"step": 6970
},
{
"epoch": 3.2517482517482517,
"grad_norm": 0.3426042646396951,
"learning_rate": 1.8439787146346314e-05,
"loss": 0.3532,
"num_tokens": 1825777443.0,
"step": 6975
},
{
"epoch": 3.254079254079254,
"grad_norm": 0.3454602775337345,
"learning_rate": 1.8408044025965555e-05,
"loss": 0.3484,
"num_tokens": 1827081333.0,
"step": 6980
},
{
"epoch": 3.2564102564102564,
"grad_norm": 0.36506433832795027,
"learning_rate": 1.8376322518787144e-05,
"loss": 0.3621,
"num_tokens": 1828392053.0,
"step": 6985
},
{
"epoch": 3.2587412587412588,
"grad_norm": 0.36524454212892954,
"learning_rate": 1.8344622700218774e-05,
"loss": 0.3632,
"num_tokens": 1829702773.0,
"step": 6990
},
{
"epoch": 3.261072261072261,
"grad_norm": 0.3635612315820556,
"learning_rate": 1.831294464561655e-05,
"loss": 0.3577,
"num_tokens": 1831013493.0,
"step": 6995
},
{
"epoch": 3.2634032634032635,
"grad_norm": 0.3439969893709455,
"learning_rate": 1.8281288430284898e-05,
"loss": 0.3587,
"num_tokens": 1832324213.0,
"step": 7000
},
{
"epoch": 3.265734265734266,
"grad_norm": 0.3664905114985399,
"learning_rate": 1.8249654129476267e-05,
"loss": 0.3643,
"num_tokens": 1833634933.0,
"step": 7005
},
{
"epoch": 3.268065268065268,
"grad_norm": 0.3415502009175583,
"learning_rate": 1.8218041818391046e-05,
"loss": 0.3627,
"num_tokens": 1834945653.0,
"step": 7010
},
{
"epoch": 3.2703962703962706,
"grad_norm": 0.3416327517246984,
"learning_rate": 1.8186451572177348e-05,
"loss": 0.3581,
"num_tokens": 1836243452.0,
"step": 7015
},
{
"epoch": 3.2727272727272725,
"grad_norm": 0.34814377337071994,
"learning_rate": 1.8154883465930816e-05,
"loss": 0.3629,
"num_tokens": 1837547262.0,
"step": 7020
},
{
"epoch": 3.2750582750582753,
"grad_norm": 0.34272113854001507,
"learning_rate": 1.812333757469447e-05,
"loss": 0.3489,
"num_tokens": 1838857982.0,
"step": 7025
},
{
"epoch": 3.277389277389277,
"grad_norm": 0.35857308092533485,
"learning_rate": 1.8091813973458538e-05,
"loss": 0.3756,
"num_tokens": 1840156853.0,
"step": 7030
},
{
"epoch": 3.2797202797202796,
"grad_norm": 0.32955350777081055,
"learning_rate": 1.806031273716025e-05,
"loss": 0.3706,
"num_tokens": 1841467573.0,
"step": 7035
},
{
"epoch": 3.282051282051282,
"grad_norm": 0.3456971274249648,
"learning_rate": 1.802883394068366e-05,
"loss": 0.3567,
"num_tokens": 1842778293.0,
"step": 7040
},
{
"epoch": 3.2843822843822843,
"grad_norm": 0.3288793199678882,
"learning_rate": 1.7997377658859464e-05,
"loss": 0.3604,
"num_tokens": 1844089013.0,
"step": 7045
},
{
"epoch": 3.2867132867132867,
"grad_norm": 0.3251910644151693,
"learning_rate": 1.796594396646491e-05,
"loss": 0.3573,
"num_tokens": 1845399733.0,
"step": 7050
},
{
"epoch": 3.289044289044289,
"grad_norm": 0.32298843530806964,
"learning_rate": 1.7934532938223457e-05,
"loss": 0.368,
"num_tokens": 1846710453.0,
"step": 7055
},
{
"epoch": 3.2913752913752914,
"grad_norm": 0.3390547972284236,
"learning_rate": 1.7903144648804725e-05,
"loss": 0.3488,
"num_tokens": 1848021173.0,
"step": 7060
},
{
"epoch": 3.2937062937062938,
"grad_norm": 0.31322899510921975,
"learning_rate": 1.7871779172824316e-05,
"loss": 0.3567,
"num_tokens": 1849331893.0,
"step": 7065
},
{
"epoch": 3.296037296037296,
"grad_norm": 0.3248889681783109,
"learning_rate": 1.7840436584843536e-05,
"loss": 0.3569,
"num_tokens": 1850642613.0,
"step": 7070
},
{
"epoch": 3.2983682983682985,
"grad_norm": 0.3205616691475068,
"learning_rate": 1.780911695936931e-05,
"loss": 0.3479,
"num_tokens": 1851953333.0,
"step": 7075
},
{
"epoch": 3.300699300699301,
"grad_norm": 0.3382664447246441,
"learning_rate": 1.7777820370853988e-05,
"loss": 0.3602,
"num_tokens": 1853264053.0,
"step": 7080
},
{
"epoch": 3.303030303030303,
"grad_norm": 0.3535879556837902,
"learning_rate": 1.7746546893695148e-05,
"loss": 0.354,
"num_tokens": 1854574773.0,
"step": 7085
},
{
"epoch": 3.3053613053613056,
"grad_norm": 0.3688428341220924,
"learning_rate": 1.7715296602235427e-05,
"loss": 0.3568,
"num_tokens": 1855885493.0,
"step": 7090
},
{
"epoch": 3.3076923076923075,
"grad_norm": 0.34122421753248505,
"learning_rate": 1.768406957076234e-05,
"loss": 0.3659,
"num_tokens": 1857196213.0,
"step": 7095
},
{
"epoch": 3.31002331002331,
"grad_norm": 0.3474475995666852,
"learning_rate": 1.7652865873508134e-05,
"loss": 0.3658,
"num_tokens": 1858506933.0,
"step": 7100
},
{
"epoch": 3.312354312354312,
"grad_norm": 0.3120215313160503,
"learning_rate": 1.7621685584649543e-05,
"loss": 0.3636,
"num_tokens": 1859817653.0,
"step": 7105
},
{
"epoch": 3.3146853146853146,
"grad_norm": 0.34100285389129825,
"learning_rate": 1.7590528778307693e-05,
"loss": 0.3575,
"num_tokens": 1861128373.0,
"step": 7110
},
{
"epoch": 3.317016317016317,
"grad_norm": 0.3401139043980884,
"learning_rate": 1.7559395528547874e-05,
"loss": 0.3716,
"num_tokens": 1862423586.0,
"step": 7115
},
{
"epoch": 3.3193473193473193,
"grad_norm": 0.35077446828202064,
"learning_rate": 1.752828590937938e-05,
"loss": 0.3715,
"num_tokens": 1863734306.0,
"step": 7120
},
{
"epoch": 3.3216783216783217,
"grad_norm": 0.3890151970682224,
"learning_rate": 1.7497199994755313e-05,
"loss": 0.3625,
"num_tokens": 1865045026.0,
"step": 7125
},
{
"epoch": 3.324009324009324,
"grad_norm": 0.3357664208904343,
"learning_rate": 1.7466137858572467e-05,
"loss": 0.3565,
"num_tokens": 1866349041.0,
"step": 7130
},
{
"epoch": 3.3263403263403264,
"grad_norm": 0.3489477894369833,
"learning_rate": 1.743509957467107e-05,
"loss": 0.3615,
"num_tokens": 1867659761.0,
"step": 7135
},
{
"epoch": 3.3286713286713288,
"grad_norm": 0.32828301621058453,
"learning_rate": 1.740408521683465e-05,
"loss": 0.3456,
"num_tokens": 1868970481.0,
"step": 7140
},
{
"epoch": 3.331002331002331,
"grad_norm": 0.3585651282189203,
"learning_rate": 1.7373094858789905e-05,
"loss": 0.366,
"num_tokens": 1870281201.0,
"step": 7145
},
{
"epoch": 3.3333333333333335,
"grad_norm": 0.3665074740674538,
"learning_rate": 1.7342128574206428e-05,
"loss": 0.3575,
"num_tokens": 1871582933.0,
"step": 7150
},
{
"epoch": 3.335664335664336,
"grad_norm": 0.3250266204452436,
"learning_rate": 1.7311186436696597e-05,
"loss": 0.3478,
"num_tokens": 1872886587.0,
"step": 7155
},
{
"epoch": 3.3379953379953378,
"grad_norm": 0.3224181885863933,
"learning_rate": 1.7280268519815413e-05,
"loss": 0.349,
"num_tokens": 1874197307.0,
"step": 7160
},
{
"epoch": 3.3403263403263406,
"grad_norm": 0.3388758481335769,
"learning_rate": 1.7249374897060282e-05,
"loss": 0.3583,
"num_tokens": 1875508027.0,
"step": 7165
},
{
"epoch": 3.3426573426573425,
"grad_norm": 0.35014080427325917,
"learning_rate": 1.7218505641870846e-05,
"loss": 0.3644,
"num_tokens": 1876818747.0,
"step": 7170
},
{
"epoch": 3.344988344988345,
"grad_norm": 0.34625439089102006,
"learning_rate": 1.7187660827628844e-05,
"loss": 0.3544,
"num_tokens": 1878129467.0,
"step": 7175
},
{
"epoch": 3.347319347319347,
"grad_norm": 0.3322054073312721,
"learning_rate": 1.7156840527657915e-05,
"loss": 0.3507,
"num_tokens": 1879421830.0,
"step": 7180
},
{
"epoch": 3.3496503496503496,
"grad_norm": 0.31612604477115036,
"learning_rate": 1.712604481522339e-05,
"loss": 0.3527,
"num_tokens": 1880732550.0,
"step": 7185
},
{
"epoch": 3.351981351981352,
"grad_norm": 0.32854072716891647,
"learning_rate": 1.70952737635322e-05,
"loss": 0.3654,
"num_tokens": 1882043270.0,
"step": 7190
},
{
"epoch": 3.3543123543123543,
"grad_norm": 0.3377153380244728,
"learning_rate": 1.706452744573262e-05,
"loss": 0.3639,
"num_tokens": 1883353990.0,
"step": 7195
},
{
"epoch": 3.3566433566433567,
"grad_norm": 0.3368928740025494,
"learning_rate": 1.7033805934914126e-05,
"loss": 0.3615,
"num_tokens": 1884664710.0,
"step": 7200
},
{
"epoch": 3.358974358974359,
"grad_norm": 0.3423816860220159,
"learning_rate": 1.7003109304107245e-05,
"loss": 0.3521,
"num_tokens": 1885975430.0,
"step": 7205
},
{
"epoch": 3.3613053613053614,
"grad_norm": 0.31997739521276675,
"learning_rate": 1.697243762628334e-05,
"loss": 0.3576,
"num_tokens": 1887286150.0,
"step": 7210
},
{
"epoch": 3.3636363636363638,
"grad_norm": 0.31310634183618563,
"learning_rate": 1.6941790974354464e-05,
"loss": 0.3578,
"num_tokens": 1888596870.0,
"step": 7215
},
{
"epoch": 3.365967365967366,
"grad_norm": 0.32276140427338623,
"learning_rate": 1.6911169421173194e-05,
"loss": 0.3628,
"num_tokens": 1889907590.0,
"step": 7220
},
{
"epoch": 3.3682983682983685,
"grad_norm": 0.32607047003096234,
"learning_rate": 1.688057303953241e-05,
"loss": 0.3642,
"num_tokens": 1891218310.0,
"step": 7225
},
{
"epoch": 3.370629370629371,
"grad_norm": 0.34484157121118036,
"learning_rate": 1.6850001902165176e-05,
"loss": 0.3467,
"num_tokens": 1892529030.0,
"step": 7230
},
{
"epoch": 3.3729603729603728,
"grad_norm": 0.3407500682459095,
"learning_rate": 1.6819456081744558e-05,
"loss": 0.355,
"num_tokens": 1893829272.0,
"step": 7235
},
{
"epoch": 3.375291375291375,
"grad_norm": 0.34265345627298444,
"learning_rate": 1.6788935650883407e-05,
"loss": 0.3559,
"num_tokens": 1895139992.0,
"step": 7240
},
{
"epoch": 3.3776223776223775,
"grad_norm": 0.3256176164919399,
"learning_rate": 1.6758440682134235e-05,
"loss": 0.3537,
"num_tokens": 1896450712.0,
"step": 7245
},
{
"epoch": 3.37995337995338,
"grad_norm": 0.34858433343769213,
"learning_rate": 1.6727971247989045e-05,
"loss": 0.3573,
"num_tokens": 1897749047.0,
"step": 7250
},
{
"epoch": 3.382284382284382,
"grad_norm": 0.35644873419658263,
"learning_rate": 1.669752742087911e-05,
"loss": 0.356,
"num_tokens": 1899059767.0,
"step": 7255
},
{
"epoch": 3.3846153846153846,
"grad_norm": 0.3339145275176127,
"learning_rate": 1.6667109273174823e-05,
"loss": 0.3562,
"num_tokens": 1900370487.0,
"step": 7260
},
{
"epoch": 3.386946386946387,
"grad_norm": 0.3479027207281923,
"learning_rate": 1.6636716877185575e-05,
"loss": 0.3515,
"num_tokens": 1901681207.0,
"step": 7265
},
{
"epoch": 3.3892773892773893,
"grad_norm": 0.3373951962701607,
"learning_rate": 1.660635030515952e-05,
"loss": 0.3524,
"num_tokens": 1902991927.0,
"step": 7270
},
{
"epoch": 3.3916083916083917,
"grad_norm": 0.3252740139876212,
"learning_rate": 1.6576009629283402e-05,
"loss": 0.3585,
"num_tokens": 1904302647.0,
"step": 7275
},
{
"epoch": 3.393939393939394,
"grad_norm": 0.32879290977391473,
"learning_rate": 1.654569492168243e-05,
"loss": 0.3588,
"num_tokens": 1905613367.0,
"step": 7280
},
{
"epoch": 3.3962703962703964,
"grad_norm": 0.34714056963970763,
"learning_rate": 1.6515406254420085e-05,
"loss": 0.3614,
"num_tokens": 1906924087.0,
"step": 7285
},
{
"epoch": 3.3986013986013988,
"grad_norm": 0.32182206400603064,
"learning_rate": 1.6485143699497917e-05,
"loss": 0.3732,
"num_tokens": 1908234807.0,
"step": 7290
},
{
"epoch": 3.400932400932401,
"grad_norm": 0.3521858097420374,
"learning_rate": 1.6454907328855436e-05,
"loss": 0.3601,
"num_tokens": 1909539681.0,
"step": 7295
},
{
"epoch": 3.403263403263403,
"grad_norm": 0.3222892499746984,
"learning_rate": 1.6424697214369894e-05,
"loss": 0.3548,
"num_tokens": 1910850401.0,
"step": 7300
},
{
"epoch": 3.4055944055944054,
"grad_norm": 0.3361283433082892,
"learning_rate": 1.6394513427856117e-05,
"loss": 0.3627,
"num_tokens": 1912161121.0,
"step": 7305
},
{
"epoch": 3.4079254079254078,
"grad_norm": 0.34204868238681374,
"learning_rate": 1.6364356041066355e-05,
"loss": 0.3577,
"num_tokens": 1913471841.0,
"step": 7310
},
{
"epoch": 3.41025641025641,
"grad_norm": 0.3302716919114987,
"learning_rate": 1.633422512569011e-05,
"loss": 0.367,
"num_tokens": 1914782561.0,
"step": 7315
},
{
"epoch": 3.4125874125874125,
"grad_norm": 0.3333885988826235,
"learning_rate": 1.630412075335393e-05,
"loss": 0.3626,
"num_tokens": 1916093281.0,
"step": 7320
},
{
"epoch": 3.414918414918415,
"grad_norm": 0.34793721986248155,
"learning_rate": 1.627404299562129e-05,
"loss": 0.3613,
"num_tokens": 1917404001.0,
"step": 7325
},
{
"epoch": 3.417249417249417,
"grad_norm": 0.3338489940605626,
"learning_rate": 1.6243991923992404e-05,
"loss": 0.3577,
"num_tokens": 1918698383.0,
"step": 7330
},
{
"epoch": 3.4195804195804196,
"grad_norm": 0.33919384025686283,
"learning_rate": 1.6213967609904014e-05,
"loss": 0.3688,
"num_tokens": 1920009103.0,
"step": 7335
},
{
"epoch": 3.421911421911422,
"grad_norm": 0.32694232562377856,
"learning_rate": 1.6183970124729268e-05,
"loss": 0.3559,
"num_tokens": 1921319823.0,
"step": 7340
},
{
"epoch": 3.4242424242424243,
"grad_norm": 0.33584818314646175,
"learning_rate": 1.615399953977757e-05,
"loss": 0.3589,
"num_tokens": 1922630543.0,
"step": 7345
},
{
"epoch": 3.4265734265734267,
"grad_norm": 0.31756474094191617,
"learning_rate": 1.612405592629433e-05,
"loss": 0.3509,
"num_tokens": 1923941263.0,
"step": 7350
},
{
"epoch": 3.428904428904429,
"grad_norm": 0.3256749853351091,
"learning_rate": 1.6094139355460855e-05,
"loss": 0.3589,
"num_tokens": 1925251983.0,
"step": 7355
},
{
"epoch": 3.4312354312354314,
"grad_norm": 0.3440498427386265,
"learning_rate": 1.6064249898394205e-05,
"loss": 0.366,
"num_tokens": 1926562703.0,
"step": 7360
},
{
"epoch": 3.4335664335664333,
"grad_norm": 0.31653310321422085,
"learning_rate": 1.6034387626146936e-05,
"loss": 0.3644,
"num_tokens": 1927873423.0,
"step": 7365
},
{
"epoch": 3.435897435897436,
"grad_norm": 0.316076483824156,
"learning_rate": 1.6004552609706992e-05,
"loss": 0.3512,
"num_tokens": 1929184143.0,
"step": 7370
},
{
"epoch": 3.438228438228438,
"grad_norm": 0.32507348992734375,
"learning_rate": 1.5974744919997543e-05,
"loss": 0.3498,
"num_tokens": 1930494863.0,
"step": 7375
},
{
"epoch": 3.4405594405594404,
"grad_norm": 0.3302545729067321,
"learning_rate": 1.5944964627876795e-05,
"loss": 0.3665,
"num_tokens": 1931792174.0,
"step": 7380
},
{
"epoch": 3.4428904428904428,
"grad_norm": 0.32458383971347465,
"learning_rate": 1.5915211804137803e-05,
"loss": 0.3633,
"num_tokens": 1933102894.0,
"step": 7385
},
{
"epoch": 3.445221445221445,
"grad_norm": 0.34226951871442984,
"learning_rate": 1.5885486519508347e-05,
"loss": 0.3595,
"num_tokens": 1934413614.0,
"step": 7390
},
{
"epoch": 3.4475524475524475,
"grad_norm": 0.34897664385155186,
"learning_rate": 1.5855788844650744e-05,
"loss": 0.3594,
"num_tokens": 1935724334.0,
"step": 7395
},
{
"epoch": 3.44988344988345,
"grad_norm": 0.3366078374615375,
"learning_rate": 1.5826118850161653e-05,
"loss": 0.3551,
"num_tokens": 1937035054.0,
"step": 7400
},
{
"epoch": 3.4522144522144522,
"grad_norm": 0.36277795416572367,
"learning_rate": 1.5796476606571957e-05,
"loss": 0.3704,
"num_tokens": 1938345774.0,
"step": 7405
},
{
"epoch": 3.4545454545454546,
"grad_norm": 0.34336029415281094,
"learning_rate": 1.576686218434656e-05,
"loss": 0.3537,
"num_tokens": 1939646875.0,
"step": 7410
},
{
"epoch": 3.456876456876457,
"grad_norm": 0.3393838080049611,
"learning_rate": 1.5737275653884225e-05,
"loss": 0.3724,
"num_tokens": 1940957595.0,
"step": 7415
},
{
"epoch": 3.4592074592074593,
"grad_norm": 0.34552607264965096,
"learning_rate": 1.5707717085517427e-05,
"loss": 0.3535,
"num_tokens": 1942268315.0,
"step": 7420
},
{
"epoch": 3.4615384615384617,
"grad_norm": 0.3161448905098042,
"learning_rate": 1.567818654951214e-05,
"loss": 0.3551,
"num_tokens": 1943579035.0,
"step": 7425
},
{
"epoch": 3.463869463869464,
"grad_norm": 0.33578110841595465,
"learning_rate": 1.5648684116067737e-05,
"loss": 0.3737,
"num_tokens": 1944889755.0,
"step": 7430
},
{
"epoch": 3.4662004662004664,
"grad_norm": 0.3547622429809544,
"learning_rate": 1.5619209855316766e-05,
"loss": 0.3628,
"num_tokens": 1946184922.0,
"step": 7435
},
{
"epoch": 3.4685314685314683,
"grad_norm": 0.3257055842704097,
"learning_rate": 1.5589763837324794e-05,
"loss": 0.3557,
"num_tokens": 1947495642.0,
"step": 7440
},
{
"epoch": 3.4708624708624707,
"grad_norm": 0.3259822417139684,
"learning_rate": 1.5560346132090275e-05,
"loss": 0.3544,
"num_tokens": 1948806362.0,
"step": 7445
},
{
"epoch": 3.473193473193473,
"grad_norm": 0.34231123217132675,
"learning_rate": 1.5530956809544354e-05,
"loss": 0.3609,
"num_tokens": 1950117082.0,
"step": 7450
},
{
"epoch": 3.4755244755244754,
"grad_norm": 0.32649483544948477,
"learning_rate": 1.5501595939550674e-05,
"loss": 0.352,
"num_tokens": 1951427802.0,
"step": 7455
},
{
"epoch": 3.4778554778554778,
"grad_norm": 0.3538064723620207,
"learning_rate": 1.547226359190528e-05,
"loss": 0.3601,
"num_tokens": 1952738522.0,
"step": 7460
},
{
"epoch": 3.48018648018648,
"grad_norm": 0.34134701799353756,
"learning_rate": 1.544295983633639e-05,
"loss": 0.3543,
"num_tokens": 1954049242.0,
"step": 7465
},
{
"epoch": 3.4825174825174825,
"grad_norm": 0.3050547799836289,
"learning_rate": 1.5413684742504275e-05,
"loss": 0.3426,
"num_tokens": 1955359962.0,
"step": 7470
},
{
"epoch": 3.484848484848485,
"grad_norm": 0.33289101107690966,
"learning_rate": 1.538443838000104e-05,
"loss": 0.3555,
"num_tokens": 1956670682.0,
"step": 7475
},
{
"epoch": 3.4871794871794872,
"grad_norm": 0.3248102594895154,
"learning_rate": 1.5355220818350517e-05,
"loss": 0.3664,
"num_tokens": 1957981402.0,
"step": 7480
},
{
"epoch": 3.4895104895104896,
"grad_norm": 0.32624207166938474,
"learning_rate": 1.5326032127008077e-05,
"loss": 0.3627,
"num_tokens": 1959292122.0,
"step": 7485
},
{
"epoch": 3.491841491841492,
"grad_norm": 0.32138847441298957,
"learning_rate": 1.5296872375360434e-05,
"loss": 0.3596,
"num_tokens": 1960602842.0,
"step": 7490
},
{
"epoch": 3.4941724941724943,
"grad_norm": 0.3329259719385868,
"learning_rate": 1.526774163272553e-05,
"loss": 0.3713,
"num_tokens": 1961913562.0,
"step": 7495
},
{
"epoch": 3.4965034965034967,
"grad_norm": 0.3270763048933,
"learning_rate": 1.5238639968352346e-05,
"loss": 0.3605,
"num_tokens": 1963224282.0,
"step": 7500
},
{
"epoch": 3.4988344988344986,
"grad_norm": 0.33222559214722885,
"learning_rate": 1.520956745142072e-05,
"loss": 0.3557,
"num_tokens": 1964535002.0,
"step": 7505
},
{
"epoch": 3.5011655011655014,
"grad_norm": 0.31280758663110264,
"learning_rate": 1.518052415104122e-05,
"loss": 0.3525,
"num_tokens": 1965845722.0,
"step": 7510
},
{
"epoch": 3.5034965034965033,
"grad_norm": 0.3474298912782969,
"learning_rate": 1.5151510136254971e-05,
"loss": 0.3762,
"num_tokens": 1967156442.0,
"step": 7515
},
{
"epoch": 3.5058275058275057,
"grad_norm": 0.35950009162167107,
"learning_rate": 1.5122525476033448e-05,
"loss": 0.3629,
"num_tokens": 1968467162.0,
"step": 7520
},
{
"epoch": 3.508158508158508,
"grad_norm": 0.32565859243171547,
"learning_rate": 1.5093570239278348e-05,
"loss": 0.3567,
"num_tokens": 1969777882.0,
"step": 7525
},
{
"epoch": 3.5104895104895104,
"grad_norm": 0.34433837280720203,
"learning_rate": 1.5064644494821472e-05,
"loss": 0.3578,
"num_tokens": 1971088602.0,
"step": 7530
},
{
"epoch": 3.5128205128205128,
"grad_norm": 0.32951210301147243,
"learning_rate": 1.503574831142446e-05,
"loss": 0.3564,
"num_tokens": 1972384019.0,
"step": 7535
},
{
"epoch": 3.515151515151515,
"grad_norm": 0.3256591839058464,
"learning_rate": 1.5006881757778687e-05,
"loss": 0.3592,
"num_tokens": 1973681803.0,
"step": 7540
},
{
"epoch": 3.5174825174825175,
"grad_norm": 0.3271419894397293,
"learning_rate": 1.4978044902505133e-05,
"loss": 0.3569,
"num_tokens": 1974992523.0,
"step": 7545
},
{
"epoch": 3.51981351981352,
"grad_norm": 0.34506187438893227,
"learning_rate": 1.4949237814154132e-05,
"loss": 0.3484,
"num_tokens": 1976296304.0,
"step": 7550
},
{
"epoch": 3.5221445221445222,
"grad_norm": 0.34392115950662083,
"learning_rate": 1.4920460561205263e-05,
"loss": 0.3605,
"num_tokens": 1977607024.0,
"step": 7555
},
{
"epoch": 3.5244755244755246,
"grad_norm": 0.31870592682573046,
"learning_rate": 1.4891713212067223e-05,
"loss": 0.3539,
"num_tokens": 1978917744.0,
"step": 7560
},
{
"epoch": 3.526806526806527,
"grad_norm": 0.3369778884605504,
"learning_rate": 1.4862995835077582e-05,
"loss": 0.3616,
"num_tokens": 1980228464.0,
"step": 7565
},
{
"epoch": 3.529137529137529,
"grad_norm": 0.3181421663920499,
"learning_rate": 1.4834308498502652e-05,
"loss": 0.3586,
"num_tokens": 1981539184.0,
"step": 7570
},
{
"epoch": 3.5314685314685317,
"grad_norm": 0.3574276899464395,
"learning_rate": 1.480565127053737e-05,
"loss": 0.3432,
"num_tokens": 1982849904.0,
"step": 7575
},
{
"epoch": 3.5337995337995336,
"grad_norm": 0.35575215961743184,
"learning_rate": 1.4777024219305092e-05,
"loss": 0.3638,
"num_tokens": 1984160624.0,
"step": 7580
},
{
"epoch": 3.5361305361305364,
"grad_norm": 0.3463286942931235,
"learning_rate": 1.4748427412857407e-05,
"loss": 0.3687,
"num_tokens": 1985471344.0,
"step": 7585
},
{
"epoch": 3.5384615384615383,
"grad_norm": 0.3249769467333938,
"learning_rate": 1.4719860919174039e-05,
"loss": 0.3618,
"num_tokens": 1986782064.0,
"step": 7590
},
{
"epoch": 3.5407925407925407,
"grad_norm": 0.3489092698701824,
"learning_rate": 1.469132480616265e-05,
"loss": 0.3592,
"num_tokens": 1988092784.0,
"step": 7595
},
{
"epoch": 3.543123543123543,
"grad_norm": 0.33206107999422213,
"learning_rate": 1.4662819141658662e-05,
"loss": 0.3435,
"num_tokens": 1989403504.0,
"step": 7600
},
{
"epoch": 3.5454545454545454,
"grad_norm": 0.32926989102798976,
"learning_rate": 1.4634343993425132e-05,
"loss": 0.3598,
"num_tokens": 1990714224.0,
"step": 7605
},
{
"epoch": 3.5477855477855478,
"grad_norm": 0.3312078503555784,
"learning_rate": 1.4605899429152581e-05,
"loss": 0.366,
"num_tokens": 1992024944.0,
"step": 7610
},
{
"epoch": 3.55011655011655,
"grad_norm": 0.32101930069810486,
"learning_rate": 1.45774855164588e-05,
"loss": 0.3786,
"num_tokens": 1993335664.0,
"step": 7615
},
{
"epoch": 3.5524475524475525,
"grad_norm": 0.3270442973643097,
"learning_rate": 1.4549102322888739e-05,
"loss": 0.3522,
"num_tokens": 1994646384.0,
"step": 7620
},
{
"epoch": 3.554778554778555,
"grad_norm": 0.3410847452328196,
"learning_rate": 1.452074991591432e-05,
"loss": 0.3661,
"num_tokens": 1995957104.0,
"step": 7625
},
{
"epoch": 3.5571095571095572,
"grad_norm": 0.3311841562429209,
"learning_rate": 1.4492428362934269e-05,
"loss": 0.3644,
"num_tokens": 1997267824.0,
"step": 7630
},
{
"epoch": 3.5594405594405596,
"grad_norm": 0.34891575680597303,
"learning_rate": 1.4464137731273974e-05,
"loss": 0.3659,
"num_tokens": 1998564615.0,
"step": 7635
},
{
"epoch": 3.561771561771562,
"grad_norm": 0.3270565111830101,
"learning_rate": 1.4435878088185317e-05,
"loss": 0.3588,
"num_tokens": 1999861934.0,
"step": 7640
},
{
"epoch": 3.564102564102564,
"grad_norm": 0.3566229262833042,
"learning_rate": 1.440764950084652e-05,
"loss": 0.3651,
"num_tokens": 2001172654.0,
"step": 7645
},
{
"epoch": 3.5664335664335667,
"grad_norm": 0.34737324488773064,
"learning_rate": 1.4379452036361963e-05,
"loss": 0.3685,
"num_tokens": 2002483374.0,
"step": 7650
},
{
"epoch": 3.5687645687645686,
"grad_norm": 0.3479603277552613,
"learning_rate": 1.4351285761762057e-05,
"loss": 0.3603,
"num_tokens": 2003794094.0,
"step": 7655
},
{
"epoch": 3.571095571095571,
"grad_norm": 0.320114547771287,
"learning_rate": 1.4323150744003075e-05,
"loss": 0.3594,
"num_tokens": 2005104814.0,
"step": 7660
},
{
"epoch": 3.5734265734265733,
"grad_norm": 0.32862256230307296,
"learning_rate": 1.4295047049966958e-05,
"loss": 0.3605,
"num_tokens": 2006415534.0,
"step": 7665
},
{
"epoch": 3.5757575757575757,
"grad_norm": 0.35139510455853196,
"learning_rate": 1.4266974746461217e-05,
"loss": 0.3586,
"num_tokens": 2007726254.0,
"step": 7670
},
{
"epoch": 3.578088578088578,
"grad_norm": 0.33176820642996774,
"learning_rate": 1.4238933900218731e-05,
"loss": 0.3515,
"num_tokens": 2009033953.0,
"step": 7675
},
{
"epoch": 3.5804195804195804,
"grad_norm": 0.36171642550182254,
"learning_rate": 1.4210924577897583e-05,
"loss": 0.3604,
"num_tokens": 2010344673.0,
"step": 7680
},
{
"epoch": 3.582750582750583,
"grad_norm": 0.3374934342841691,
"learning_rate": 1.4182946846080952e-05,
"loss": 0.3545,
"num_tokens": 2011655393.0,
"step": 7685
},
{
"epoch": 3.585081585081585,
"grad_norm": 0.32415522876729075,
"learning_rate": 1.4155000771276878e-05,
"loss": 0.3434,
"num_tokens": 2012966113.0,
"step": 7690
},
{
"epoch": 3.5874125874125875,
"grad_norm": 0.3552935558976595,
"learning_rate": 1.4127086419918178e-05,
"loss": 0.366,
"num_tokens": 2014276833.0,
"step": 7695
},
{
"epoch": 3.58974358974359,
"grad_norm": 0.3306470072028632,
"learning_rate": 1.4099203858362262e-05,
"loss": 0.3608,
"num_tokens": 2015587553.0,
"step": 7700
},
{
"epoch": 3.5920745920745922,
"grad_norm": 0.32899882379271284,
"learning_rate": 1.4071353152890936e-05,
"loss": 0.3564,
"num_tokens": 2016898273.0,
"step": 7705
},
{
"epoch": 3.594405594405594,
"grad_norm": 0.32893163008661985,
"learning_rate": 1.4043534369710307e-05,
"loss": 0.3618,
"num_tokens": 2018208993.0,
"step": 7710
},
{
"epoch": 3.596736596736597,
"grad_norm": 0.33890514726280374,
"learning_rate": 1.4015747574950597e-05,
"loss": 0.3585,
"num_tokens": 2019519713.0,
"step": 7715
},
{
"epoch": 3.599067599067599,
"grad_norm": 0.35025561594471827,
"learning_rate": 1.3987992834665963e-05,
"loss": 0.3764,
"num_tokens": 2020817105.0,
"step": 7720
},
{
"epoch": 3.6013986013986012,
"grad_norm": 0.322499498679104,
"learning_rate": 1.3960270214834381e-05,
"loss": 0.3557,
"num_tokens": 2022127825.0,
"step": 7725
},
{
"epoch": 3.6037296037296036,
"grad_norm": 0.32465177989473437,
"learning_rate": 1.3932579781357477e-05,
"loss": 0.3528,
"num_tokens": 2023438545.0,
"step": 7730
},
{
"epoch": 3.606060606060606,
"grad_norm": 0.32534204421549856,
"learning_rate": 1.390492160006035e-05,
"loss": 0.3514,
"num_tokens": 2024749265.0,
"step": 7735
},
{
"epoch": 3.6083916083916083,
"grad_norm": 0.3348791455491572,
"learning_rate": 1.3877295736691408e-05,
"loss": 0.3548,
"num_tokens": 2026059985.0,
"step": 7740
},
{
"epoch": 3.6107226107226107,
"grad_norm": 0.30376460026082297,
"learning_rate": 1.3849702256922309e-05,
"loss": 0.3517,
"num_tokens": 2027370705.0,
"step": 7745
},
{
"epoch": 3.613053613053613,
"grad_norm": 0.3202796706000662,
"learning_rate": 1.3822141226347646e-05,
"loss": 0.3661,
"num_tokens": 2028678090.0,
"step": 7750
},
{
"epoch": 3.6153846153846154,
"grad_norm": 0.32249662232530607,
"learning_rate": 1.3794612710484905e-05,
"loss": 0.351,
"num_tokens": 2029988810.0,
"step": 7755
},
{
"epoch": 3.617715617715618,
"grad_norm": 0.3475421569483326,
"learning_rate": 1.3767116774774307e-05,
"loss": 0.3744,
"num_tokens": 2031288913.0,
"step": 7760
},
{
"epoch": 3.62004662004662,
"grad_norm": 0.33679545809282563,
"learning_rate": 1.3739653484578586e-05,
"loss": 0.3555,
"num_tokens": 2032589943.0,
"step": 7765
},
{
"epoch": 3.6223776223776225,
"grad_norm": 0.340971223695831,
"learning_rate": 1.3712222905182881e-05,
"loss": 0.3499,
"num_tokens": 2033900663.0,
"step": 7770
},
{
"epoch": 3.624708624708625,
"grad_norm": 0.3039493470993974,
"learning_rate": 1.3684825101794575e-05,
"loss": 0.3514,
"num_tokens": 2035211383.0,
"step": 7775
},
{
"epoch": 3.6270396270396272,
"grad_norm": 0.3352791563799734,
"learning_rate": 1.3657460139543155e-05,
"loss": 0.3626,
"num_tokens": 2036522103.0,
"step": 7780
},
{
"epoch": 3.629370629370629,
"grad_norm": 0.34252274886593165,
"learning_rate": 1.3630128083479998e-05,
"loss": 0.3504,
"num_tokens": 2037832823.0,
"step": 7785
},
{
"epoch": 3.631701631701632,
"grad_norm": 0.3513374308995934,
"learning_rate": 1.3602828998578293e-05,
"loss": 0.3684,
"num_tokens": 2039143543.0,
"step": 7790
},
{
"epoch": 3.634032634032634,
"grad_norm": 0.33422112695710476,
"learning_rate": 1.3575562949732845e-05,
"loss": 0.3584,
"num_tokens": 2040454263.0,
"step": 7795
},
{
"epoch": 3.6363636363636362,
"grad_norm": 0.35082577383897384,
"learning_rate": 1.3548330001759898e-05,
"loss": 0.3797,
"num_tokens": 2041764983.0,
"step": 7800
},
{
"epoch": 3.6386946386946386,
"grad_norm": 0.3188497601819274,
"learning_rate": 1.352113021939705e-05,
"loss": 0.3511,
"num_tokens": 2043075703.0,
"step": 7805
},
{
"epoch": 3.641025641025641,
"grad_norm": 0.34579309685383136,
"learning_rate": 1.3493963667303036e-05,
"loss": 0.3563,
"num_tokens": 2044386423.0,
"step": 7810
},
{
"epoch": 3.6433566433566433,
"grad_norm": 0.3507371852839906,
"learning_rate": 1.3466830410057588e-05,
"loss": 0.3416,
"num_tokens": 2045697143.0,
"step": 7815
},
{
"epoch": 3.6456876456876457,
"grad_norm": 0.3437622175939715,
"learning_rate": 1.343973051216131e-05,
"loss": 0.3652,
"num_tokens": 2047007863.0,
"step": 7820
},
{
"epoch": 3.648018648018648,
"grad_norm": 0.31415387046199733,
"learning_rate": 1.3412664038035507e-05,
"loss": 0.3619,
"num_tokens": 2048318583.0,
"step": 7825
},
{
"epoch": 3.6503496503496504,
"grad_norm": 0.3398023578909356,
"learning_rate": 1.338563105202201e-05,
"loss": 0.35,
"num_tokens": 2049629303.0,
"step": 7830
},
{
"epoch": 3.652680652680653,
"grad_norm": 0.3850082776958453,
"learning_rate": 1.3358631618383041e-05,
"loss": 0.3495,
"num_tokens": 2050940023.0,
"step": 7835
},
{
"epoch": 3.655011655011655,
"grad_norm": 0.3506882837431775,
"learning_rate": 1.3331665801301085e-05,
"loss": 0.3587,
"num_tokens": 2052245292.0,
"step": 7840
},
{
"epoch": 3.6573426573426575,
"grad_norm": 0.33043151182996516,
"learning_rate": 1.3304733664878714e-05,
"loss": 0.3757,
"num_tokens": 2053556012.0,
"step": 7845
},
{
"epoch": 3.6596736596736594,
"grad_norm": 0.32244585116058333,
"learning_rate": 1.32778352731384e-05,
"loss": 0.3674,
"num_tokens": 2054864779.0,
"step": 7850
},
{
"epoch": 3.6620046620046622,
"grad_norm": 0.34649671127745907,
"learning_rate": 1.3250970690022435e-05,
"loss": 0.3607,
"num_tokens": 2056173760.0,
"step": 7855
},
{
"epoch": 3.664335664335664,
"grad_norm": 0.3592288768686839,
"learning_rate": 1.3224139979392739e-05,
"loss": 0.3483,
"num_tokens": 2057484480.0,
"step": 7860
},
{
"epoch": 3.6666666666666665,
"grad_norm": 0.33267649430105906,
"learning_rate": 1.3197343205030677e-05,
"loss": 0.3601,
"num_tokens": 2058795200.0,
"step": 7865
},
{
"epoch": 3.668997668997669,
"grad_norm": 0.3155221547689689,
"learning_rate": 1.317058043063698e-05,
"loss": 0.3592,
"num_tokens": 2060105920.0,
"step": 7870
},
{
"epoch": 3.6713286713286712,
"grad_norm": 0.32504676536520444,
"learning_rate": 1.3143851719831545e-05,
"loss": 0.347,
"num_tokens": 2061416640.0,
"step": 7875
},
{
"epoch": 3.6736596736596736,
"grad_norm": 0.3390312427438953,
"learning_rate": 1.3117157136153275e-05,
"loss": 0.3598,
"num_tokens": 2062727360.0,
"step": 7880
},
{
"epoch": 3.675990675990676,
"grad_norm": 0.3272385721818141,
"learning_rate": 1.3090496743059963e-05,
"loss": 0.3457,
"num_tokens": 2064038080.0,
"step": 7885
},
{
"epoch": 3.6783216783216783,
"grad_norm": 0.3406171598088719,
"learning_rate": 1.3063870603928135e-05,
"loss": 0.3619,
"num_tokens": 2065348800.0,
"step": 7890
},
{
"epoch": 3.6806526806526807,
"grad_norm": 0.3199792712358228,
"learning_rate": 1.3037278782052863e-05,
"loss": 0.3676,
"num_tokens": 2066659520.0,
"step": 7895
},
{
"epoch": 3.682983682983683,
"grad_norm": 0.32349347623327407,
"learning_rate": 1.3010721340647672e-05,
"loss": 0.351,
"num_tokens": 2067970240.0,
"step": 7900
},
{
"epoch": 3.6853146853146854,
"grad_norm": 0.3197976897891316,
"learning_rate": 1.2984198342844317e-05,
"loss": 0.3507,
"num_tokens": 2069280960.0,
"step": 7905
},
{
"epoch": 3.687645687645688,
"grad_norm": 0.31153094945488263,
"learning_rate": 1.2957709851692709e-05,
"loss": 0.3531,
"num_tokens": 2070591680.0,
"step": 7910
},
{
"epoch": 3.6899766899766897,
"grad_norm": 0.34104591153105757,
"learning_rate": 1.293125593016073e-05,
"loss": 0.352,
"num_tokens": 2071902400.0,
"step": 7915
},
{
"epoch": 3.6923076923076925,
"grad_norm": 0.32756629305528456,
"learning_rate": 1.2904836641134058e-05,
"loss": 0.3609,
"num_tokens": 2073193557.0,
"step": 7920
},
{
"epoch": 3.6946386946386944,
"grad_norm": 0.32387204375058537,
"learning_rate": 1.2878452047416065e-05,
"loss": 0.3558,
"num_tokens": 2074504277.0,
"step": 7925
},
{
"epoch": 3.6969696969696972,
"grad_norm": 0.34675201672498257,
"learning_rate": 1.2852102211727648e-05,
"loss": 0.3616,
"num_tokens": 2075814997.0,
"step": 7930
},
{
"epoch": 3.699300699300699,
"grad_norm": 0.3247586444847732,
"learning_rate": 1.2825787196707059e-05,
"loss": 0.349,
"num_tokens": 2077125717.0,
"step": 7935
},
{
"epoch": 3.7016317016317015,
"grad_norm": 0.3156890734127022,
"learning_rate": 1.2799507064909787e-05,
"loss": 0.3533,
"num_tokens": 2078436437.0,
"step": 7940
},
{
"epoch": 3.703962703962704,
"grad_norm": 0.3227247560032221,
"learning_rate": 1.2773261878808413e-05,
"loss": 0.3466,
"num_tokens": 2079747157.0,
"step": 7945
},
{
"epoch": 3.7062937062937062,
"grad_norm": 0.3216324993277899,
"learning_rate": 1.2747051700792412e-05,
"loss": 0.3554,
"num_tokens": 2081057877.0,
"step": 7950
},
{
"epoch": 3.7086247086247086,
"grad_norm": 0.31249996903728416,
"learning_rate": 1.2720876593168052e-05,
"loss": 0.3492,
"num_tokens": 2082368597.0,
"step": 7955
},
{
"epoch": 3.710955710955711,
"grad_norm": 0.3188316644091349,
"learning_rate": 1.2694736618158249e-05,
"loss": 0.3458,
"num_tokens": 2083679317.0,
"step": 7960
},
{
"epoch": 3.7132867132867133,
"grad_norm": 0.3335771049219983,
"learning_rate": 1.2668631837902389e-05,
"loss": 0.3424,
"num_tokens": 2084990037.0,
"step": 7965
},
{
"epoch": 3.7156177156177157,
"grad_norm": 0.3499985089110688,
"learning_rate": 1.2642562314456185e-05,
"loss": 0.3534,
"num_tokens": 2086300757.0,
"step": 7970
},
{
"epoch": 3.717948717948718,
"grad_norm": 0.3410878602651583,
"learning_rate": 1.2616528109791554e-05,
"loss": 0.3659,
"num_tokens": 2087611477.0,
"step": 7975
},
{
"epoch": 3.7202797202797204,
"grad_norm": 0.3527728670467037,
"learning_rate": 1.259052928579646e-05,
"loss": 0.3591,
"num_tokens": 2088908084.0,
"step": 7980
},
{
"epoch": 3.722610722610723,
"grad_norm": 0.3510762979938676,
"learning_rate": 1.2564565904274722e-05,
"loss": 0.368,
"num_tokens": 2090210123.0,
"step": 7985
},
{
"epoch": 3.7249417249417247,
"grad_norm": 0.37404478878722747,
"learning_rate": 1.2538638026945954e-05,
"loss": 0.3647,
"num_tokens": 2091520843.0,
"step": 7990
},
{
"epoch": 3.7272727272727275,
"grad_norm": 0.3253947492356622,
"learning_rate": 1.2512745715445345e-05,
"loss": 0.3691,
"num_tokens": 2092831563.0,
"step": 7995
},
{
"epoch": 3.7296037296037294,
"grad_norm": 0.3730204049764418,
"learning_rate": 1.2486889031323528e-05,
"loss": 0.3568,
"num_tokens": 2094140244.0,
"step": 8000
},
{
"epoch": 3.731934731934732,
"grad_norm": 0.322985878009181,
"learning_rate": 1.2461068036046474e-05,
"loss": 0.3558,
"num_tokens": 2095450964.0,
"step": 8005
},
{
"epoch": 3.734265734265734,
"grad_norm": 0.3451139274796168,
"learning_rate": 1.2435282790995294e-05,
"loss": 0.3568,
"num_tokens": 2096761684.0,
"step": 8010
},
{
"epoch": 3.7365967365967365,
"grad_norm": 0.32232149066408405,
"learning_rate": 1.240953335746611e-05,
"loss": 0.3592,
"num_tokens": 2098072404.0,
"step": 8015
},
{
"epoch": 3.738927738927739,
"grad_norm": 0.33356015443779835,
"learning_rate": 1.2383819796669929e-05,
"loss": 0.3485,
"num_tokens": 2099383124.0,
"step": 8020
},
{
"epoch": 3.7412587412587412,
"grad_norm": 0.31783927007138674,
"learning_rate": 1.235814216973248e-05,
"loss": 0.362,
"num_tokens": 2100693844.0,
"step": 8025
},
{
"epoch": 3.7435897435897436,
"grad_norm": 0.3258650151987321,
"learning_rate": 1.2332500537694061e-05,
"loss": 0.3643,
"num_tokens": 2102004564.0,
"step": 8030
},
{
"epoch": 3.745920745920746,
"grad_norm": 0.31862835430157704,
"learning_rate": 1.2306894961509392e-05,
"loss": 0.3559,
"num_tokens": 2103315284.0,
"step": 8035
},
{
"epoch": 3.7482517482517483,
"grad_norm": 0.31403800945327537,
"learning_rate": 1.2281325502047526e-05,
"loss": 0.352,
"num_tokens": 2104610250.0,
"step": 8040
},
{
"epoch": 3.7505827505827507,
"grad_norm": 0.3364153402051559,
"learning_rate": 1.2255792220091623e-05,
"loss": 0.3605,
"num_tokens": 2105920970.0,
"step": 8045
},
{
"epoch": 3.752913752913753,
"grad_norm": 0.33818645363457694,
"learning_rate": 1.2230295176338843e-05,
"loss": 0.3528,
"num_tokens": 2107231690.0,
"step": 8050
},
{
"epoch": 3.755244755244755,
"grad_norm": 0.33218579315347985,
"learning_rate": 1.2204834431400218e-05,
"loss": 0.3646,
"num_tokens": 2108542410.0,
"step": 8055
},
{
"epoch": 3.757575757575758,
"grad_norm": 0.3150157899865806,
"learning_rate": 1.2179410045800486e-05,
"loss": 0.3678,
"num_tokens": 2109845114.0,
"step": 8060
},
{
"epoch": 3.7599067599067597,
"grad_norm": 0.3239476998553056,
"learning_rate": 1.2154022079977941e-05,
"loss": 0.364,
"num_tokens": 2111155834.0,
"step": 8065
},
{
"epoch": 3.762237762237762,
"grad_norm": 0.30771382308356826,
"learning_rate": 1.2128670594284317e-05,
"loss": 0.3656,
"num_tokens": 2112466554.0,
"step": 8070
},
{
"epoch": 3.7645687645687644,
"grad_norm": 0.36628760033281316,
"learning_rate": 1.2103355648984627e-05,
"loss": 0.3539,
"num_tokens": 2113777274.0,
"step": 8075
},
{
"epoch": 3.766899766899767,
"grad_norm": 0.3435307692569418,
"learning_rate": 1.2078077304256999e-05,
"loss": 0.3698,
"num_tokens": 2115087994.0,
"step": 8080
},
{
"epoch": 3.769230769230769,
"grad_norm": 0.31904278075717907,
"learning_rate": 1.2052835620192577e-05,
"loss": 0.3539,
"num_tokens": 2116398714.0,
"step": 8085
},
{
"epoch": 3.7715617715617715,
"grad_norm": 0.33066965750120453,
"learning_rate": 1.2027630656795365e-05,
"loss": 0.3608,
"num_tokens": 2117709434.0,
"step": 8090
},
{
"epoch": 3.773892773892774,
"grad_norm": 0.3254496733507665,
"learning_rate": 1.2002462473982034e-05,
"loss": 0.3646,
"num_tokens": 2119020154.0,
"step": 8095
},
{
"epoch": 3.7762237762237763,
"grad_norm": 0.318029815318303,
"learning_rate": 1.1977331131581872e-05,
"loss": 0.3643,
"num_tokens": 2120330874.0,
"step": 8100
},
{
"epoch": 3.7785547785547786,
"grad_norm": 0.33657956842752124,
"learning_rate": 1.1952236689336547e-05,
"loss": 0.3483,
"num_tokens": 2121641594.0,
"step": 8105
},
{
"epoch": 3.780885780885781,
"grad_norm": 0.339494433625539,
"learning_rate": 1.1927179206900036e-05,
"loss": 0.3624,
"num_tokens": 2122952314.0,
"step": 8110
},
{
"epoch": 3.7832167832167833,
"grad_norm": 0.32937653713889037,
"learning_rate": 1.1902158743838455e-05,
"loss": 0.3578,
"num_tokens": 2124263034.0,
"step": 8115
},
{
"epoch": 3.7855477855477857,
"grad_norm": 0.31184931579510083,
"learning_rate": 1.1877175359629895e-05,
"loss": 0.3515,
"num_tokens": 2125564666.0,
"step": 8120
},
{
"epoch": 3.787878787878788,
"grad_norm": 0.32358413426775373,
"learning_rate": 1.185222911366433e-05,
"loss": 0.3634,
"num_tokens": 2126875386.0,
"step": 8125
},
{
"epoch": 3.79020979020979,
"grad_norm": 0.34491635141056515,
"learning_rate": 1.1827320065243442e-05,
"loss": 0.3663,
"num_tokens": 2128186106.0,
"step": 8130
},
{
"epoch": 3.792540792540793,
"grad_norm": 0.3263014090650137,
"learning_rate": 1.1802448273580482e-05,
"loss": 0.3531,
"num_tokens": 2129496826.0,
"step": 8135
},
{
"epoch": 3.7948717948717947,
"grad_norm": 0.34055454076158553,
"learning_rate": 1.1777613797800132e-05,
"loss": 0.3526,
"num_tokens": 2130807546.0,
"step": 8140
},
{
"epoch": 3.797202797202797,
"grad_norm": 0.35098519401821454,
"learning_rate": 1.175281669693839e-05,
"loss": 0.3567,
"num_tokens": 2132118266.0,
"step": 8145
},
{
"epoch": 3.7995337995337994,
"grad_norm": 0.3266773596447305,
"learning_rate": 1.1728057029942377e-05,
"loss": 0.3531,
"num_tokens": 2133428986.0,
"step": 8150
},
{
"epoch": 3.801864801864802,
"grad_norm": 0.31360535728361383,
"learning_rate": 1.170333485567025e-05,
"loss": 0.3674,
"num_tokens": 2134739706.0,
"step": 8155
},
{
"epoch": 3.804195804195804,
"grad_norm": 0.31624940012549485,
"learning_rate": 1.1678650232891021e-05,
"loss": 0.3518,
"num_tokens": 2136050426.0,
"step": 8160
},
{
"epoch": 3.8065268065268065,
"grad_norm": 0.3141407909472399,
"learning_rate": 1.1654003220284459e-05,
"loss": 0.3619,
"num_tokens": 2137356817.0,
"step": 8165
},
{
"epoch": 3.808857808857809,
"grad_norm": 0.34368525998084476,
"learning_rate": 1.1629393876440894e-05,
"loss": 0.3526,
"num_tokens": 2138667537.0,
"step": 8170
},
{
"epoch": 3.8111888111888113,
"grad_norm": 0.33230532572557525,
"learning_rate": 1.1604822259861143e-05,
"loss": 0.3554,
"num_tokens": 2139978257.0,
"step": 8175
},
{
"epoch": 3.8135198135198136,
"grad_norm": 0.3285215453270888,
"learning_rate": 1.1580288428956326e-05,
"loss": 0.3545,
"num_tokens": 2141288977.0,
"step": 8180
},
{
"epoch": 3.815850815850816,
"grad_norm": 0.32253369632017526,
"learning_rate": 1.1555792442047727e-05,
"loss": 0.3545,
"num_tokens": 2142599697.0,
"step": 8185
},
{
"epoch": 3.8181818181818183,
"grad_norm": 0.31275107239667255,
"learning_rate": 1.1531334357366687e-05,
"loss": 0.3648,
"num_tokens": 2143910417.0,
"step": 8190
},
{
"epoch": 3.8205128205128203,
"grad_norm": 0.3371090455672606,
"learning_rate": 1.1506914233054449e-05,
"loss": 0.3548,
"num_tokens": 2145215363.0,
"step": 8195
},
{
"epoch": 3.822843822843823,
"grad_norm": 0.3330220005685937,
"learning_rate": 1.1482532127161987e-05,
"loss": 0.3682,
"num_tokens": 2146526083.0,
"step": 8200
},
{
"epoch": 3.825174825174825,
"grad_norm": 0.3163036289984921,
"learning_rate": 1.1458188097649931e-05,
"loss": 0.3652,
"num_tokens": 2147828267.0,
"step": 8205
},
{
"epoch": 3.8275058275058274,
"grad_norm": 0.31880150929387496,
"learning_rate": 1.143388220238839e-05,
"loss": 0.3612,
"num_tokens": 2149138987.0,
"step": 8210
},
{
"epoch": 3.8298368298368297,
"grad_norm": 0.319810402665015,
"learning_rate": 1.1409614499156807e-05,
"loss": 0.355,
"num_tokens": 2150449707.0,
"step": 8215
},
{
"epoch": 3.832167832167832,
"grad_norm": 0.3394499363463578,
"learning_rate": 1.138538504564384e-05,
"loss": 0.3543,
"num_tokens": 2151760427.0,
"step": 8220
},
{
"epoch": 3.8344988344988344,
"grad_norm": 0.3339998319196104,
"learning_rate": 1.1361193899447239e-05,
"loss": 0.3643,
"num_tokens": 2153071147.0,
"step": 8225
},
{
"epoch": 3.836829836829837,
"grad_norm": 0.3283698373871858,
"learning_rate": 1.1337041118073673e-05,
"loss": 0.365,
"num_tokens": 2154381867.0,
"step": 8230
},
{
"epoch": 3.839160839160839,
"grad_norm": 0.31803261230391683,
"learning_rate": 1.1312926758938598e-05,
"loss": 0.3542,
"num_tokens": 2155692587.0,
"step": 8235
},
{
"epoch": 3.8414918414918415,
"grad_norm": 0.32415297440890517,
"learning_rate": 1.1288850879366178e-05,
"loss": 0.3476,
"num_tokens": 2157003307.0,
"step": 8240
},
{
"epoch": 3.843822843822844,
"grad_norm": 0.32171582377815044,
"learning_rate": 1.1264813536589063e-05,
"loss": 0.3505,
"num_tokens": 2158302271.0,
"step": 8245
},
{
"epoch": 3.8461538461538463,
"grad_norm": 0.32594823868385486,
"learning_rate": 1.1240814787748294e-05,
"loss": 0.3542,
"num_tokens": 2159612991.0,
"step": 8250
},
{
"epoch": 3.8484848484848486,
"grad_norm": 0.3130567477371064,
"learning_rate": 1.1216854689893208e-05,
"loss": 0.3474,
"num_tokens": 2160923711.0,
"step": 8255
},
{
"epoch": 3.8508158508158505,
"grad_norm": 0.32884981317619555,
"learning_rate": 1.119293329998122e-05,
"loss": 0.3613,
"num_tokens": 2162234431.0,
"step": 8260
},
{
"epoch": 3.8531468531468533,
"grad_norm": 0.3182477239378319,
"learning_rate": 1.116905067487774e-05,
"loss": 0.3496,
"num_tokens": 2163545151.0,
"step": 8265
},
{
"epoch": 3.8554778554778553,
"grad_norm": 0.3238906779321128,
"learning_rate": 1.1145206871356035e-05,
"loss": 0.3755,
"num_tokens": 2164845172.0,
"step": 8270
},
{
"epoch": 3.857808857808858,
"grad_norm": 0.3230035270398852,
"learning_rate": 1.1121401946097089e-05,
"loss": 0.3579,
"num_tokens": 2166155892.0,
"step": 8275
},
{
"epoch": 3.86013986013986,
"grad_norm": 0.34202250350581626,
"learning_rate": 1.1097635955689447e-05,
"loss": 0.367,
"num_tokens": 2167466612.0,
"step": 8280
},
{
"epoch": 3.8624708624708624,
"grad_norm": 0.33282529022268353,
"learning_rate": 1.107390895662912e-05,
"loss": 0.3634,
"num_tokens": 2168777332.0,
"step": 8285
},
{
"epoch": 3.8648018648018647,
"grad_norm": 0.34181595087542227,
"learning_rate": 1.1050221005319422e-05,
"loss": 0.3674,
"num_tokens": 2170088052.0,
"step": 8290
},
{
"epoch": 3.867132867132867,
"grad_norm": 0.3036597141524631,
"learning_rate": 1.1026572158070831e-05,
"loss": 0.3555,
"num_tokens": 2171398772.0,
"step": 8295
},
{
"epoch": 3.8694638694638694,
"grad_norm": 0.32232320158504335,
"learning_rate": 1.1002962471100883e-05,
"loss": 0.3548,
"num_tokens": 2172709492.0,
"step": 8300
},
{
"epoch": 3.871794871794872,
"grad_norm": 0.3210640108188077,
"learning_rate": 1.0979392000534027e-05,
"loss": 0.3585,
"num_tokens": 2174020212.0,
"step": 8305
},
{
"epoch": 3.874125874125874,
"grad_norm": 0.3273031634760707,
"learning_rate": 1.0955860802401465e-05,
"loss": 0.3868,
"num_tokens": 2175330932.0,
"step": 8310
},
{
"epoch": 3.8764568764568765,
"grad_norm": 0.31087673967738866,
"learning_rate": 1.0932368932641074e-05,
"loss": 0.3553,
"num_tokens": 2176633046.0,
"step": 8315
},
{
"epoch": 3.878787878787879,
"grad_norm": 0.3289983288827823,
"learning_rate": 1.0908916447097199e-05,
"loss": 0.3661,
"num_tokens": 2177943766.0,
"step": 8320
},
{
"epoch": 3.8811188811188813,
"grad_norm": 0.35024193147589183,
"learning_rate": 1.0885503401520598e-05,
"loss": 0.3489,
"num_tokens": 2179254486.0,
"step": 8325
},
{
"epoch": 3.8834498834498836,
"grad_norm": 0.3317648553400327,
"learning_rate": 1.0862129851568261e-05,
"loss": 0.3525,
"num_tokens": 2180565206.0,
"step": 8330
},
{
"epoch": 3.8857808857808855,
"grad_norm": 0.34059702892010785,
"learning_rate": 1.0838795852803285e-05,
"loss": 0.3658,
"num_tokens": 2181875926.0,
"step": 8335
},
{
"epoch": 3.8881118881118883,
"grad_norm": 0.33839087803574835,
"learning_rate": 1.0815501460694752e-05,
"loss": 0.357,
"num_tokens": 2183186646.0,
"step": 8340
},
{
"epoch": 3.8904428904428903,
"grad_norm": 0.33882338228224024,
"learning_rate": 1.0792246730617587e-05,
"loss": 0.352,
"num_tokens": 2184497366.0,
"step": 8345
},
{
"epoch": 3.8927738927738926,
"grad_norm": 0.33693236579652286,
"learning_rate": 1.0769031717852435e-05,
"loss": 0.3518,
"num_tokens": 2185808086.0,
"step": 8350
},
{
"epoch": 3.895104895104895,
"grad_norm": 0.330046544608827,
"learning_rate": 1.0745856477585534e-05,
"loss": 0.3645,
"num_tokens": 2187118806.0,
"step": 8355
},
{
"epoch": 3.8974358974358974,
"grad_norm": 0.3214183099709155,
"learning_rate": 1.0722721064908554e-05,
"loss": 0.3602,
"num_tokens": 2188419457.0,
"step": 8360
},
{
"epoch": 3.8997668997668997,
"grad_norm": 0.3166655849628983,
"learning_rate": 1.0699625534818512e-05,
"loss": 0.3618,
"num_tokens": 2189730177.0,
"step": 8365
},
{
"epoch": 3.902097902097902,
"grad_norm": 0.31754371228858536,
"learning_rate": 1.0676569942217596e-05,
"loss": 0.3628,
"num_tokens": 2191040897.0,
"step": 8370
},
{
"epoch": 3.9044289044289044,
"grad_norm": 0.3060117009044129,
"learning_rate": 1.0653554341913072e-05,
"loss": 0.3535,
"num_tokens": 2192351617.0,
"step": 8375
},
{
"epoch": 3.906759906759907,
"grad_norm": 0.3435379540747769,
"learning_rate": 1.0630578788617131e-05,
"loss": 0.3642,
"num_tokens": 2193648973.0,
"step": 8380
},
{
"epoch": 3.909090909090909,
"grad_norm": 0.3389237512793272,
"learning_rate": 1.060764333694676e-05,
"loss": 0.3509,
"num_tokens": 2194959693.0,
"step": 8385
},
{
"epoch": 3.9114219114219115,
"grad_norm": 0.32842554162135046,
"learning_rate": 1.0584748041423623e-05,
"loss": 0.3556,
"num_tokens": 2196265846.0,
"step": 8390
},
{
"epoch": 3.913752913752914,
"grad_norm": 0.33837517172721177,
"learning_rate": 1.0561892956473932e-05,
"loss": 0.3573,
"num_tokens": 2197568195.0,
"step": 8395
},
{
"epoch": 3.916083916083916,
"grad_norm": 0.3218786208698996,
"learning_rate": 1.0539078136428294e-05,
"loss": 0.3634,
"num_tokens": 2198878915.0,
"step": 8400
},
{
"epoch": 3.9184149184149186,
"grad_norm": 0.351508799242857,
"learning_rate": 1.0516303635521606e-05,
"loss": 0.3753,
"num_tokens": 2200185526.0,
"step": 8405
},
{
"epoch": 3.9207459207459205,
"grad_norm": 0.34131167249228345,
"learning_rate": 1.0493569507892938e-05,
"loss": 0.3613,
"num_tokens": 2201496246.0,
"step": 8410
},
{
"epoch": 3.9230769230769234,
"grad_norm": 0.3318435319138198,
"learning_rate": 1.0470875807585354e-05,
"loss": 0.3572,
"num_tokens": 2202799214.0,
"step": 8415
},
{
"epoch": 3.9254079254079253,
"grad_norm": 0.33378030300757455,
"learning_rate": 1.0448222588545837e-05,
"loss": 0.3565,
"num_tokens": 2204109934.0,
"step": 8420
},
{
"epoch": 3.9277389277389276,
"grad_norm": 0.33527036359922735,
"learning_rate": 1.0425609904625137e-05,
"loss": 0.3599,
"num_tokens": 2205420654.0,
"step": 8425
},
{
"epoch": 3.93006993006993,
"grad_norm": 0.31167055961688644,
"learning_rate": 1.0403037809577636e-05,
"loss": 0.3581,
"num_tokens": 2206731374.0,
"step": 8430
},
{
"epoch": 3.9324009324009324,
"grad_norm": 0.3311994709997141,
"learning_rate": 1.0380506357061221e-05,
"loss": 0.3695,
"num_tokens": 2208042094.0,
"step": 8435
},
{
"epoch": 3.9347319347319347,
"grad_norm": 0.30544971351661804,
"learning_rate": 1.03580156006372e-05,
"loss": 0.3575,
"num_tokens": 2209352814.0,
"step": 8440
},
{
"epoch": 3.937062937062937,
"grad_norm": 0.3163147870974832,
"learning_rate": 1.0335565593770102e-05,
"loss": 0.3519,
"num_tokens": 2210663534.0,
"step": 8445
},
{
"epoch": 3.9393939393939394,
"grad_norm": 0.31310671731601936,
"learning_rate": 1.0313156389827596e-05,
"loss": 0.3589,
"num_tokens": 2211974254.0,
"step": 8450
},
{
"epoch": 3.941724941724942,
"grad_norm": 0.3359729029408067,
"learning_rate": 1.0290788042080375e-05,
"loss": 0.3617,
"num_tokens": 2213279287.0,
"step": 8455
},
{
"epoch": 3.944055944055944,
"grad_norm": 0.33715090925620084,
"learning_rate": 1.026846060370199e-05,
"loss": 0.3555,
"num_tokens": 2214584857.0,
"step": 8460
},
{
"epoch": 3.9463869463869465,
"grad_norm": 0.30902647410730066,
"learning_rate": 1.0246174127768738e-05,
"loss": 0.3595,
"num_tokens": 2215888814.0,
"step": 8465
},
{
"epoch": 3.948717948717949,
"grad_norm": 0.3291193720775341,
"learning_rate": 1.0223928667259556e-05,
"loss": 0.3673,
"num_tokens": 2217199534.0,
"step": 8470
},
{
"epoch": 3.951048951048951,
"grad_norm": 0.34291852278637736,
"learning_rate": 1.020172427505588e-05,
"loss": 0.3525,
"num_tokens": 2218509771.0,
"step": 8475
},
{
"epoch": 3.9533799533799536,
"grad_norm": 0.3561232943784015,
"learning_rate": 1.0179561003941507e-05,
"loss": 0.3538,
"num_tokens": 2219820491.0,
"step": 8480
},
{
"epoch": 3.9557109557109555,
"grad_norm": 0.3157272502198812,
"learning_rate": 1.0157438906602487e-05,
"loss": 0.3524,
"num_tokens": 2221130082.0,
"step": 8485
},
{
"epoch": 3.958041958041958,
"grad_norm": 0.33080441973925323,
"learning_rate": 1.0135358035627007e-05,
"loss": 0.3614,
"num_tokens": 2222424293.0,
"step": 8490
},
{
"epoch": 3.9603729603729603,
"grad_norm": 0.3289328935479798,
"learning_rate": 1.0113318443505226e-05,
"loss": 0.3659,
"num_tokens": 2223735013.0,
"step": 8495
},
{
"epoch": 3.9627039627039626,
"grad_norm": 0.31711304131356893,
"learning_rate": 1.0091320182629193e-05,
"loss": 0.3653,
"num_tokens": 2225045733.0,
"step": 8500
},
{
"epoch": 3.965034965034965,
"grad_norm": 0.323224468581729,
"learning_rate": 1.0069363305292708e-05,
"loss": 0.3628,
"num_tokens": 2226356453.0,
"step": 8505
},
{
"epoch": 3.9673659673659674,
"grad_norm": 0.3399766154632268,
"learning_rate": 1.0047447863691175e-05,
"loss": 0.3523,
"num_tokens": 2227667173.0,
"step": 8510
},
{
"epoch": 3.9696969696969697,
"grad_norm": 0.31305493281081237,
"learning_rate": 1.0025573909921515e-05,
"loss": 0.3553,
"num_tokens": 2228973398.0,
"step": 8515
},
{
"epoch": 3.972027972027972,
"grad_norm": 0.3312572614176095,
"learning_rate": 1.0003741495982034e-05,
"loss": 0.3563,
"num_tokens": 2230272637.0,
"step": 8520
},
{
"epoch": 3.9743589743589745,
"grad_norm": 0.3358237225226056,
"learning_rate": 9.981950673772256e-06,
"loss": 0.3611,
"num_tokens": 2231583357.0,
"step": 8525
},
{
"epoch": 3.976689976689977,
"grad_norm": 0.31567748638452275,
"learning_rate": 9.960201495092871e-06,
"loss": 0.37,
"num_tokens": 2232894077.0,
"step": 8530
},
{
"epoch": 3.979020979020979,
"grad_norm": 0.33864552842513596,
"learning_rate": 9.938494011645553e-06,
"loss": 0.3614,
"num_tokens": 2234204797.0,
"step": 8535
},
{
"epoch": 3.981351981351981,
"grad_norm": 0.3294973703926195,
"learning_rate": 9.916828275032868e-06,
"loss": 0.3585,
"num_tokens": 2235502698.0,
"step": 8540
},
{
"epoch": 3.983682983682984,
"grad_norm": 0.324896447348713,
"learning_rate": 9.895204336758132e-06,
"loss": 0.3539,
"num_tokens": 2236813418.0,
"step": 8545
},
{
"epoch": 3.986013986013986,
"grad_norm": 0.31758055886736325,
"learning_rate": 9.87362224822531e-06,
"loss": 0.3543,
"num_tokens": 2238114298.0,
"step": 8550
},
{
"epoch": 3.988344988344988,
"grad_norm": 0.34580160569991886,
"learning_rate": 9.85208206073889e-06,
"loss": 0.3552,
"num_tokens": 2239417127.0,
"step": 8555
},
{
"epoch": 3.9906759906759905,
"grad_norm": 0.3386420228574512,
"learning_rate": 9.830583825503725e-06,
"loss": 0.3521,
"num_tokens": 2240727847.0,
"step": 8560
},
{
"epoch": 3.993006993006993,
"grad_norm": 0.32993309995325326,
"learning_rate": 9.80912759362497e-06,
"loss": 0.3504,
"num_tokens": 2242038567.0,
"step": 8565
},
{
"epoch": 3.9953379953379953,
"grad_norm": 0.34047566030766635,
"learning_rate": 9.787713416107919e-06,
"loss": 0.3535,
"num_tokens": 2243349287.0,
"step": 8570
},
{
"epoch": 3.9976689976689976,
"grad_norm": 0.32231730842208467,
"learning_rate": 9.76634134385788e-06,
"loss": 0.3576,
"num_tokens": 2244653368.0,
"step": 8575
},
{
"epoch": 4.0,
"grad_norm": 0.3196697658342358,
"learning_rate": 9.745011427680106e-06,
"loss": 0.3417,
"num_tokens": 2245952708.0,
"step": 8580
},
{
"epoch": 4.002331002331002,
"grad_norm": 0.3154639847560265,
"learning_rate": 9.723723718279595e-06,
"loss": 0.3027,
"num_tokens": 2247263428.0,
"step": 8585
},
{
"epoch": 4.004662004662005,
"grad_norm": 0.3627171601549924,
"learning_rate": 9.702478266261042e-06,
"loss": 0.3105,
"num_tokens": 2248564079.0,
"step": 8590
},
{
"epoch": 4.006993006993007,
"grad_norm": 0.36469519862133226,
"learning_rate": 9.68127512212868e-06,
"loss": 0.3218,
"num_tokens": 2249874799.0,
"step": 8595
},
{
"epoch": 4.0093240093240095,
"grad_norm": 0.3315289199193423,
"learning_rate": 9.660114336286164e-06,
"loss": 0.3212,
"num_tokens": 2251185519.0,
"step": 8600
},
{
"epoch": 4.011655011655011,
"grad_norm": 0.35875417884768623,
"learning_rate": 9.638995959036456e-06,
"loss": 0.3109,
"num_tokens": 2252491344.0,
"step": 8605
},
{
"epoch": 4.013986013986014,
"grad_norm": 0.32867156865417013,
"learning_rate": 9.617920040581724e-06,
"loss": 0.303,
"num_tokens": 2253802064.0,
"step": 8610
},
{
"epoch": 4.016317016317016,
"grad_norm": 0.310349653790998,
"learning_rate": 9.596886631023169e-06,
"loss": 0.3094,
"num_tokens": 2255108675.0,
"step": 8615
},
{
"epoch": 4.018648018648019,
"grad_norm": 0.32806845035826643,
"learning_rate": 9.575895780360969e-06,
"loss": 0.3207,
"num_tokens": 2256419395.0,
"step": 8620
},
{
"epoch": 4.020979020979021,
"grad_norm": 0.3217206000624844,
"learning_rate": 9.55494753849413e-06,
"loss": 0.3088,
"num_tokens": 2257730115.0,
"step": 8625
},
{
"epoch": 4.023310023310024,
"grad_norm": 0.33329725907377766,
"learning_rate": 9.534041955220353e-06,
"loss": 0.309,
"num_tokens": 2259040835.0,
"step": 8630
},
{
"epoch": 4.0256410256410255,
"grad_norm": 0.3237761512977047,
"learning_rate": 9.513179080235933e-06,
"loss": 0.3108,
"num_tokens": 2260351555.0,
"step": 8635
},
{
"epoch": 4.027972027972028,
"grad_norm": 0.3301327078317335,
"learning_rate": 9.492358963135671e-06,
"loss": 0.3075,
"num_tokens": 2261662275.0,
"step": 8640
},
{
"epoch": 4.03030303030303,
"grad_norm": 0.3257522642348306,
"learning_rate": 9.47158165341269e-06,
"loss": 0.3167,
"num_tokens": 2262972995.0,
"step": 8645
},
{
"epoch": 4.032634032634032,
"grad_norm": 0.3400399129587691,
"learning_rate": 9.450847200458351e-06,
"loss": 0.3144,
"num_tokens": 2264283715.0,
"step": 8650
},
{
"epoch": 4.034965034965035,
"grad_norm": 0.3279323276345651,
"learning_rate": 9.430155653562176e-06,
"loss": 0.3138,
"num_tokens": 2265594435.0,
"step": 8655
},
{
"epoch": 4.037296037296037,
"grad_norm": 0.34371766089078787,
"learning_rate": 9.409507061911648e-06,
"loss": 0.3153,
"num_tokens": 2266897223.0,
"step": 8660
},
{
"epoch": 4.03962703962704,
"grad_norm": 0.3237833855664639,
"learning_rate": 9.38890147459216e-06,
"loss": 0.3141,
"num_tokens": 2268207943.0,
"step": 8665
},
{
"epoch": 4.041958041958042,
"grad_norm": 0.33288798285203314,
"learning_rate": 9.368338940586866e-06,
"loss": 0.3144,
"num_tokens": 2269518663.0,
"step": 8670
},
{
"epoch": 4.0442890442890445,
"grad_norm": 0.34404924053052394,
"learning_rate": 9.347819508776593e-06,
"loss": 0.3142,
"num_tokens": 2270829383.0,
"step": 8675
},
{
"epoch": 4.046620046620046,
"grad_norm": 0.3366069132240311,
"learning_rate": 9.327343227939677e-06,
"loss": 0.3118,
"num_tokens": 2272140103.0,
"step": 8680
},
{
"epoch": 4.048951048951049,
"grad_norm": 0.33022829494586375,
"learning_rate": 9.306910146751903e-06,
"loss": 0.3025,
"num_tokens": 2273448784.0,
"step": 8685
},
{
"epoch": 4.051282051282051,
"grad_norm": 0.3362236980144924,
"learning_rate": 9.286520313786359e-06,
"loss": 0.3062,
"num_tokens": 2274759504.0,
"step": 8690
},
{
"epoch": 4.053613053613054,
"grad_norm": 0.3375407896276986,
"learning_rate": 9.2661737775133e-06,
"loss": 0.3115,
"num_tokens": 2276070224.0,
"step": 8695
},
{
"epoch": 4.055944055944056,
"grad_norm": 0.3387055817635362,
"learning_rate": 9.245870586300086e-06,
"loss": 0.3076,
"num_tokens": 2277380944.0,
"step": 8700
},
{
"epoch": 4.058275058275059,
"grad_norm": 0.33597806666914465,
"learning_rate": 9.225610788411028e-06,
"loss": 0.3124,
"num_tokens": 2278683128.0,
"step": 8705
},
{
"epoch": 4.0606060606060606,
"grad_norm": 0.33166354222510536,
"learning_rate": 9.205394432007274e-06,
"loss": 0.3195,
"num_tokens": 2279993848.0,
"step": 8710
},
{
"epoch": 4.062937062937063,
"grad_norm": 0.340915400738789,
"learning_rate": 9.185221565146719e-06,
"loss": 0.3129,
"num_tokens": 2281304568.0,
"step": 8715
},
{
"epoch": 4.065268065268065,
"grad_norm": 0.324138237680919,
"learning_rate": 9.165092235783872e-06,
"loss": 0.3026,
"num_tokens": 2282615288.0,
"step": 8720
},
{
"epoch": 4.067599067599067,
"grad_norm": 0.32248302399372863,
"learning_rate": 9.145006491769734e-06,
"loss": 0.3131,
"num_tokens": 2283926008.0,
"step": 8725
},
{
"epoch": 4.06993006993007,
"grad_norm": 0.3300129820806774,
"learning_rate": 9.124964380851697e-06,
"loss": 0.3147,
"num_tokens": 2285236728.0,
"step": 8730
},
{
"epoch": 4.072261072261072,
"grad_norm": 0.3281535752862011,
"learning_rate": 9.104965950673457e-06,
"loss": 0.317,
"num_tokens": 2286547448.0,
"step": 8735
},
{
"epoch": 4.074592074592075,
"grad_norm": 0.33825429279702496,
"learning_rate": 9.085011248774844e-06,
"loss": 0.3056,
"num_tokens": 2287858168.0,
"step": 8740
},
{
"epoch": 4.076923076923077,
"grad_norm": 0.33028034320434174,
"learning_rate": 9.065100322591735e-06,
"loss": 0.3084,
"num_tokens": 2289164548.0,
"step": 8745
},
{
"epoch": 4.0792540792540795,
"grad_norm": 0.3353664679588956,
"learning_rate": 9.045233219455967e-06,
"loss": 0.3257,
"num_tokens": 2290475268.0,
"step": 8750
},
{
"epoch": 4.081585081585081,
"grad_norm": 0.3315814088066854,
"learning_rate": 9.025409986595191e-06,
"loss": 0.3131,
"num_tokens": 2291785988.0,
"step": 8755
},
{
"epoch": 4.083916083916084,
"grad_norm": 0.32829158816501314,
"learning_rate": 9.005630671132767e-06,
"loss": 0.3247,
"num_tokens": 2293090934.0,
"step": 8760
},
{
"epoch": 4.086247086247086,
"grad_norm": 0.3400820384105229,
"learning_rate": 8.985895320087657e-06,
"loss": 0.322,
"num_tokens": 2294401654.0,
"step": 8765
},
{
"epoch": 4.088578088578089,
"grad_norm": 0.3469249174397503,
"learning_rate": 8.96620398037432e-06,
"loss": 0.3204,
"num_tokens": 2295712374.0,
"step": 8770
},
{
"epoch": 4.090909090909091,
"grad_norm": 0.33092792284960687,
"learning_rate": 8.946556698802578e-06,
"loss": 0.3171,
"num_tokens": 2297023094.0,
"step": 8775
},
{
"epoch": 4.093240093240094,
"grad_norm": 0.34641277690350863,
"learning_rate": 8.926953522077528e-06,
"loss": 0.314,
"num_tokens": 2298333814.0,
"step": 8780
},
{
"epoch": 4.0955710955710956,
"grad_norm": 0.32438963991128505,
"learning_rate": 8.907394496799429e-06,
"loss": 0.3143,
"num_tokens": 2299637704.0,
"step": 8785
},
{
"epoch": 4.0979020979020975,
"grad_norm": 0.33556755134102456,
"learning_rate": 8.887879669463562e-06,
"loss": 0.3144,
"num_tokens": 2300946471.0,
"step": 8790
},
{
"epoch": 4.1002331002331,
"grad_norm": 0.3370515747236785,
"learning_rate": 8.868409086460167e-06,
"loss": 0.3138,
"num_tokens": 2302257191.0,
"step": 8795
},
{
"epoch": 4.102564102564102,
"grad_norm": 0.32539271406101317,
"learning_rate": 8.848982794074288e-06,
"loss": 0.3013,
"num_tokens": 2303567911.0,
"step": 8800
},
{
"epoch": 4.104895104895105,
"grad_norm": 0.3348268658945831,
"learning_rate": 8.829600838485691e-06,
"loss": 0.3191,
"num_tokens": 2304878631.0,
"step": 8805
},
{
"epoch": 4.107226107226107,
"grad_norm": 0.3254305192351051,
"learning_rate": 8.810263265768749e-06,
"loss": 0.3097,
"num_tokens": 2306189351.0,
"step": 8810
},
{
"epoch": 4.10955710955711,
"grad_norm": 0.3528490799980861,
"learning_rate": 8.790970121892318e-06,
"loss": 0.3144,
"num_tokens": 2307500071.0,
"step": 8815
},
{
"epoch": 4.111888111888112,
"grad_norm": 0.34560029308995477,
"learning_rate": 8.771721452719644e-06,
"loss": 0.329,
"num_tokens": 2308800878.0,
"step": 8820
},
{
"epoch": 4.1142191142191145,
"grad_norm": 0.34271459470922583,
"learning_rate": 8.752517304008263e-06,
"loss": 0.3179,
"num_tokens": 2310111598.0,
"step": 8825
},
{
"epoch": 4.116550116550116,
"grad_norm": 0.34444328829999327,
"learning_rate": 8.733357721409847e-06,
"loss": 0.3074,
"num_tokens": 2311422318.0,
"step": 8830
},
{
"epoch": 4.118881118881119,
"grad_norm": 0.3345099910245634,
"learning_rate": 8.714242750470155e-06,
"loss": 0.3169,
"num_tokens": 2312733038.0,
"step": 8835
},
{
"epoch": 4.121212121212121,
"grad_norm": 0.336105428464414,
"learning_rate": 8.695172436628885e-06,
"loss": 0.3074,
"num_tokens": 2314036848.0,
"step": 8840
},
{
"epoch": 4.123543123543124,
"grad_norm": 0.3432184731029786,
"learning_rate": 8.676146825219574e-06,
"loss": 0.3244,
"num_tokens": 2315347568.0,
"step": 8845
},
{
"epoch": 4.125874125874126,
"grad_norm": 0.3514286741500932,
"learning_rate": 8.657165961469496e-06,
"loss": 0.3122,
"num_tokens": 2316647401.0,
"step": 8850
},
{
"epoch": 4.128205128205128,
"grad_norm": 0.3464258867521915,
"learning_rate": 8.63822989049955e-06,
"loss": 0.3121,
"num_tokens": 2317958121.0,
"step": 8855
},
{
"epoch": 4.130536130536131,
"grad_norm": 0.33188673347599557,
"learning_rate": 8.619338657324167e-06,
"loss": 0.308,
"num_tokens": 2319268841.0,
"step": 8860
},
{
"epoch": 4.1328671328671325,
"grad_norm": 0.33368696164039957,
"learning_rate": 8.600492306851166e-06,
"loss": 0.3115,
"num_tokens": 2320579561.0,
"step": 8865
},
{
"epoch": 4.135198135198135,
"grad_norm": 0.3233544453202552,
"learning_rate": 8.581690883881696e-06,
"loss": 0.3185,
"num_tokens": 2321890281.0,
"step": 8870
},
{
"epoch": 4.137529137529137,
"grad_norm": 0.33173085265251867,
"learning_rate": 8.562934433110101e-06,
"loss": 0.3081,
"num_tokens": 2323194296.0,
"step": 8875
},
{
"epoch": 4.13986013986014,
"grad_norm": 0.32262540271377654,
"learning_rate": 8.544222999123798e-06,
"loss": 0.3099,
"num_tokens": 2324493603.0,
"step": 8880
},
{
"epoch": 4.142191142191142,
"grad_norm": 0.3267911687013554,
"learning_rate": 8.525556626403214e-06,
"loss": 0.3149,
"num_tokens": 2325804323.0,
"step": 8885
},
{
"epoch": 4.144522144522145,
"grad_norm": 0.3668716182370164,
"learning_rate": 8.506935359321655e-06,
"loss": 0.317,
"num_tokens": 2327101698.0,
"step": 8890
},
{
"epoch": 4.146853146853147,
"grad_norm": 0.3223134667321982,
"learning_rate": 8.488359242145182e-06,
"loss": 0.3086,
"num_tokens": 2328412418.0,
"step": 8895
},
{
"epoch": 4.1491841491841495,
"grad_norm": 0.31791906605050724,
"learning_rate": 8.469828319032555e-06,
"loss": 0.3112,
"num_tokens": 2329705306.0,
"step": 8900
},
{
"epoch": 4.151515151515151,
"grad_norm": 0.3238691360164819,
"learning_rate": 8.451342634035081e-06,
"loss": 0.312,
"num_tokens": 2331002662.0,
"step": 8905
},
{
"epoch": 4.153846153846154,
"grad_norm": 0.32796231037465196,
"learning_rate": 8.432902231096532e-06,
"loss": 0.318,
"num_tokens": 2332313382.0,
"step": 8910
},
{
"epoch": 4.156177156177156,
"grad_norm": 0.33796829228728653,
"learning_rate": 8.414507154053038e-06,
"loss": 0.309,
"num_tokens": 2333624102.0,
"step": 8915
},
{
"epoch": 4.158508158508159,
"grad_norm": 0.32836861040158594,
"learning_rate": 8.396157446632985e-06,
"loss": 0.3019,
"num_tokens": 2334934822.0,
"step": 8920
},
{
"epoch": 4.160839160839161,
"grad_norm": 0.32684570489397824,
"learning_rate": 8.3778531524569e-06,
"loss": 0.312,
"num_tokens": 2336245542.0,
"step": 8925
},
{
"epoch": 4.163170163170163,
"grad_norm": 0.3318787549687187,
"learning_rate": 8.359594315037348e-06,
"loss": 0.3202,
"num_tokens": 2337556262.0,
"step": 8930
},
{
"epoch": 4.165501165501166,
"grad_norm": 0.33111122882180744,
"learning_rate": 8.341380977778866e-06,
"loss": 0.3155,
"num_tokens": 2338853654.0,
"step": 8935
},
{
"epoch": 4.1678321678321675,
"grad_norm": 0.3224813447044692,
"learning_rate": 8.323213183977793e-06,
"loss": 0.3091,
"num_tokens": 2340164374.0,
"step": 8940
},
{
"epoch": 4.17016317016317,
"grad_norm": 0.3456964708575295,
"learning_rate": 8.305090976822214e-06,
"loss": 0.31,
"num_tokens": 2341462721.0,
"step": 8945
},
{
"epoch": 4.172494172494172,
"grad_norm": 0.33107924062669025,
"learning_rate": 8.287014399391866e-06,
"loss": 0.3207,
"num_tokens": 2342762061.0,
"step": 8950
},
{
"epoch": 4.174825174825175,
"grad_norm": 0.33431120526499053,
"learning_rate": 8.268983494657993e-06,
"loss": 0.3179,
"num_tokens": 2344072781.0,
"step": 8955
},
{
"epoch": 4.177156177156177,
"grad_norm": 0.32952875537848103,
"learning_rate": 8.250998305483268e-06,
"loss": 0.306,
"num_tokens": 2345376435.0,
"step": 8960
},
{
"epoch": 4.17948717948718,
"grad_norm": 0.3456893653270103,
"learning_rate": 8.233058874621704e-06,
"loss": 0.326,
"num_tokens": 2346687155.0,
"step": 8965
},
{
"epoch": 4.181818181818182,
"grad_norm": 0.3394277090932486,
"learning_rate": 8.215165244718532e-06,
"loss": 0.3144,
"num_tokens": 2347997875.0,
"step": 8970
},
{
"epoch": 4.1841491841491845,
"grad_norm": 0.33738822320988965,
"learning_rate": 8.197317458310092e-06,
"loss": 0.3092,
"num_tokens": 2349296210.0,
"step": 8975
},
{
"epoch": 4.186480186480186,
"grad_norm": 0.34369988163488063,
"learning_rate": 8.179515557823769e-06,
"loss": 0.3109,
"num_tokens": 2350606930.0,
"step": 8980
},
{
"epoch": 4.188811188811189,
"grad_norm": 0.33185490990267685,
"learning_rate": 8.161759585577863e-06,
"loss": 0.3222,
"num_tokens": 2351905894.0,
"step": 8985
},
{
"epoch": 4.191142191142191,
"grad_norm": 0.33437248830096067,
"learning_rate": 8.144049583781475e-06,
"loss": 0.313,
"num_tokens": 2353216614.0,
"step": 8990
},
{
"epoch": 4.193473193473194,
"grad_norm": 0.32606670299996504,
"learning_rate": 8.126385594534448e-06,
"loss": 0.3155,
"num_tokens": 2354527334.0,
"step": 8995
},
{
"epoch": 4.195804195804196,
"grad_norm": 0.354696304621865,
"learning_rate": 8.108767659827245e-06,
"loss": 0.3019,
"num_tokens": 2355838054.0,
"step": 9000
},
{
"epoch": 4.198135198135198,
"grad_norm": 0.30886261035523754,
"learning_rate": 8.09119582154083e-06,
"loss": 0.3125,
"num_tokens": 2357148774.0,
"step": 9005
},
{
"epoch": 4.200466200466201,
"grad_norm": 0.32186205072669777,
"learning_rate": 8.07367012144661e-06,
"loss": 0.3102,
"num_tokens": 2358459494.0,
"step": 9010
},
{
"epoch": 4.2027972027972025,
"grad_norm": 0.32692083529916577,
"learning_rate": 8.05619060120629e-06,
"loss": 0.3097,
"num_tokens": 2359770023.0,
"step": 9015
},
{
"epoch": 4.205128205128205,
"grad_norm": 0.3536074622682776,
"learning_rate": 8.038757302371816e-06,
"loss": 0.3124,
"num_tokens": 2361080743.0,
"step": 9020
},
{
"epoch": 4.207459207459207,
"grad_norm": 0.3320417618080785,
"learning_rate": 8.021370266385257e-06,
"loss": 0.3143,
"num_tokens": 2362384296.0,
"step": 9025
},
{
"epoch": 4.20979020979021,
"grad_norm": 0.33236316161184004,
"learning_rate": 8.004029534578694e-06,
"loss": 0.3202,
"num_tokens": 2363695016.0,
"step": 9030
},
{
"epoch": 4.212121212121212,
"grad_norm": 0.3282580657954475,
"learning_rate": 7.986735148174142e-06,
"loss": 0.3102,
"num_tokens": 2364989382.0,
"step": 9035
},
{
"epoch": 4.214452214452215,
"grad_norm": 0.3449624050121814,
"learning_rate": 7.969487148283451e-06,
"loss": 0.3222,
"num_tokens": 2366300102.0,
"step": 9040
},
{
"epoch": 4.216783216783217,
"grad_norm": 0.33015411004128303,
"learning_rate": 7.95228557590819e-06,
"loss": 0.3189,
"num_tokens": 2367610822.0,
"step": 9045
},
{
"epoch": 4.2191142191142195,
"grad_norm": 0.3482626079276277,
"learning_rate": 7.935130471939572e-06,
"loss": 0.319,
"num_tokens": 2368919557.0,
"step": 9050
},
{
"epoch": 4.221445221445221,
"grad_norm": 0.3313155428462869,
"learning_rate": 7.918021877158333e-06,
"loss": 0.3229,
"num_tokens": 2370230277.0,
"step": 9055
},
{
"epoch": 4.223776223776224,
"grad_norm": 0.3286320237256984,
"learning_rate": 7.900959832234667e-06,
"loss": 0.315,
"num_tokens": 2371540997.0,
"step": 9060
},
{
"epoch": 4.226107226107226,
"grad_norm": 0.32781843149335,
"learning_rate": 7.883944377728091e-06,
"loss": 0.3168,
"num_tokens": 2372851717.0,
"step": 9065
},
{
"epoch": 4.228438228438228,
"grad_norm": 0.3199378658171041,
"learning_rate": 7.866975554087384e-06,
"loss": 0.3196,
"num_tokens": 2374154156.0,
"step": 9070
},
{
"epoch": 4.230769230769231,
"grad_norm": 0.32318125321218416,
"learning_rate": 7.85005340165047e-06,
"loss": 0.3109,
"num_tokens": 2375464876.0,
"step": 9075
},
{
"epoch": 4.233100233100233,
"grad_norm": 0.32948239675038016,
"learning_rate": 7.833177960644318e-06,
"loss": 0.3149,
"num_tokens": 2376775596.0,
"step": 9080
},
{
"epoch": 4.235431235431236,
"grad_norm": 0.330342035385706,
"learning_rate": 7.816349271184873e-06,
"loss": 0.3228,
"num_tokens": 2378072943.0,
"step": 9085
},
{
"epoch": 4.2377622377622375,
"grad_norm": 0.3343981894720827,
"learning_rate": 7.79956737327693e-06,
"loss": 0.3272,
"num_tokens": 2379383663.0,
"step": 9090
},
{
"epoch": 4.24009324009324,
"grad_norm": 0.3370576768884812,
"learning_rate": 7.782832306814055e-06,
"loss": 0.3215,
"num_tokens": 2380694383.0,
"step": 9095
},
{
"epoch": 4.242424242424242,
"grad_norm": 0.34055936569913503,
"learning_rate": 7.766144111578488e-06,
"loss": 0.3154,
"num_tokens": 2382005103.0,
"step": 9100
},
{
"epoch": 4.244755244755245,
"grad_norm": 0.3250618537313876,
"learning_rate": 7.749502827241053e-06,
"loss": 0.3054,
"num_tokens": 2383315823.0,
"step": 9105
},
{
"epoch": 4.247086247086247,
"grad_norm": 0.33979822723341285,
"learning_rate": 7.732908493361054e-06,
"loss": 0.3131,
"num_tokens": 2384626543.0,
"step": 9110
},
{
"epoch": 4.24941724941725,
"grad_norm": 0.3448739247944029,
"learning_rate": 7.716361149386169e-06,
"loss": 0.3154,
"num_tokens": 2385937263.0,
"step": 9115
},
{
"epoch": 4.251748251748252,
"grad_norm": 0.3513942678682862,
"learning_rate": 7.69986083465241e-06,
"loss": 0.3161,
"num_tokens": 2387247983.0,
"step": 9120
},
{
"epoch": 4.2540792540792545,
"grad_norm": 0.33283021075110353,
"learning_rate": 7.68340758838396e-06,
"loss": 0.3138,
"num_tokens": 2388558703.0,
"step": 9125
},
{
"epoch": 4.256410256410256,
"grad_norm": 0.3241327933794044,
"learning_rate": 7.667001449693118e-06,
"loss": 0.3073,
"num_tokens": 2389869423.0,
"step": 9130
},
{
"epoch": 4.258741258741258,
"grad_norm": 0.33663247742640745,
"learning_rate": 7.650642457580216e-06,
"loss": 0.3245,
"num_tokens": 2391180143.0,
"step": 9135
},
{
"epoch": 4.261072261072261,
"grad_norm": 0.35167952703734645,
"learning_rate": 7.634330650933491e-06,
"loss": 0.318,
"num_tokens": 2392490863.0,
"step": 9140
},
{
"epoch": 4.263403263403263,
"grad_norm": 0.3458772358952002,
"learning_rate": 7.618066068529013e-06,
"loss": 0.3217,
"num_tokens": 2393801583.0,
"step": 9145
},
{
"epoch": 4.265734265734266,
"grad_norm": 0.33672532361513857,
"learning_rate": 7.601848749030614e-06,
"loss": 0.3153,
"num_tokens": 2395112303.0,
"step": 9150
},
{
"epoch": 4.268065268065268,
"grad_norm": 0.3412841871135003,
"learning_rate": 7.5856787309897485e-06,
"loss": 0.3129,
"num_tokens": 2396423023.0,
"step": 9155
},
{
"epoch": 4.270396270396271,
"grad_norm": 0.34739331638218984,
"learning_rate": 7.5695560528454335e-06,
"loss": 0.3268,
"num_tokens": 2397733743.0,
"step": 9160
},
{
"epoch": 4.2727272727272725,
"grad_norm": 0.32838660057211366,
"learning_rate": 7.553480752924152e-06,
"loss": 0.3176,
"num_tokens": 2399044463.0,
"step": 9165
},
{
"epoch": 4.275058275058275,
"grad_norm": 0.34100398627810036,
"learning_rate": 7.537452869439773e-06,
"loss": 0.3238,
"num_tokens": 2400355183.0,
"step": 9170
},
{
"epoch": 4.277389277389277,
"grad_norm": 0.34349089530086746,
"learning_rate": 7.521472440493424e-06,
"loss": 0.3241,
"num_tokens": 2401665903.0,
"step": 9175
},
{
"epoch": 4.27972027972028,
"grad_norm": 0.32237561114424745,
"learning_rate": 7.5055395040734375e-06,
"loss": 0.31,
"num_tokens": 2402964469.0,
"step": 9180
},
{
"epoch": 4.282051282051282,
"grad_norm": 0.34307510366114763,
"learning_rate": 7.489654098055261e-06,
"loss": 0.3307,
"num_tokens": 2404275189.0,
"step": 9185
},
{
"epoch": 4.284382284382285,
"grad_norm": 0.3541210255290945,
"learning_rate": 7.473816260201326e-06,
"loss": 0.3213,
"num_tokens": 2405585909.0,
"step": 9190
},
{
"epoch": 4.286713286713287,
"grad_norm": 0.33830233519487823,
"learning_rate": 7.458026028161005e-06,
"loss": 0.3171,
"num_tokens": 2406871865.0,
"step": 9195
},
{
"epoch": 4.2890442890442895,
"grad_norm": 0.3280850693643302,
"learning_rate": 7.442283439470503e-06,
"loss": 0.3234,
"num_tokens": 2408182585.0,
"step": 9200
},
{
"epoch": 4.291375291375291,
"grad_norm": 0.34250336046966473,
"learning_rate": 7.426588531552755e-06,
"loss": 0.3188,
"num_tokens": 2409488739.0,
"step": 9205
},
{
"epoch": 4.293706293706293,
"grad_norm": 0.33353326558959956,
"learning_rate": 7.4109413417173645e-06,
"loss": 0.3162,
"num_tokens": 2410799459.0,
"step": 9210
},
{
"epoch": 4.296037296037296,
"grad_norm": 0.32332388680356194,
"learning_rate": 7.3953419071604965e-06,
"loss": 0.3229,
"num_tokens": 2412110179.0,
"step": 9215
},
{
"epoch": 4.298368298368298,
"grad_norm": 0.3240810807807097,
"learning_rate": 7.379790264964787e-06,
"loss": 0.3071,
"num_tokens": 2413420899.0,
"step": 9220
},
{
"epoch": 4.300699300699301,
"grad_norm": 0.33571497946632756,
"learning_rate": 7.364286452099268e-06,
"loss": 0.3247,
"num_tokens": 2414731619.0,
"step": 9225
},
{
"epoch": 4.303030303030303,
"grad_norm": 0.3156999850446858,
"learning_rate": 7.348830505419266e-06,
"loss": 0.3078,
"num_tokens": 2416042339.0,
"step": 9230
},
{
"epoch": 4.305361305361306,
"grad_norm": 0.31560747105000503,
"learning_rate": 7.333422461666334e-06,
"loss": 0.3079,
"num_tokens": 2417352288.0,
"step": 9235
},
{
"epoch": 4.3076923076923075,
"grad_norm": 0.31764759996906916,
"learning_rate": 7.318062357468133e-06,
"loss": 0.3109,
"num_tokens": 2418663008.0,
"step": 9240
},
{
"epoch": 4.31002331002331,
"grad_norm": 0.33954512734132447,
"learning_rate": 7.302750229338377e-06,
"loss": 0.3141,
"num_tokens": 2419973728.0,
"step": 9245
},
{
"epoch": 4.312354312354312,
"grad_norm": 0.33905444936733614,
"learning_rate": 7.287486113676732e-06,
"loss": 0.3055,
"num_tokens": 2421284448.0,
"step": 9250
},
{
"epoch": 4.314685314685315,
"grad_norm": 0.3516319488679377,
"learning_rate": 7.272270046768719e-06,
"loss": 0.3229,
"num_tokens": 2422595168.0,
"step": 9255
},
{
"epoch": 4.317016317016317,
"grad_norm": 0.31543886841356666,
"learning_rate": 7.257102064785647e-06,
"loss": 0.3026,
"num_tokens": 2423905888.0,
"step": 9260
},
{
"epoch": 4.31934731934732,
"grad_norm": 0.33654187527085266,
"learning_rate": 7.241982203784521e-06,
"loss": 0.316,
"num_tokens": 2425216608.0,
"step": 9265
},
{
"epoch": 4.321678321678322,
"grad_norm": 0.3453335987898658,
"learning_rate": 7.226910499707942e-06,
"loss": 0.3213,
"num_tokens": 2426527328.0,
"step": 9270
},
{
"epoch": 4.3240093240093245,
"grad_norm": 0.31972916985922056,
"learning_rate": 7.211886988384051e-06,
"loss": 0.3141,
"num_tokens": 2427838048.0,
"step": 9275
},
{
"epoch": 4.326340326340326,
"grad_norm": 0.31446079269320515,
"learning_rate": 7.196911705526405e-06,
"loss": 0.3161,
"num_tokens": 2429148768.0,
"step": 9280
},
{
"epoch": 4.328671328671328,
"grad_norm": 0.31884066154984625,
"learning_rate": 7.181984686733929e-06,
"loss": 0.3059,
"num_tokens": 2430459488.0,
"step": 9285
},
{
"epoch": 4.331002331002331,
"grad_norm": 0.338313943294416,
"learning_rate": 7.167105967490818e-06,
"loss": 0.3104,
"num_tokens": 2431770208.0,
"step": 9290
},
{
"epoch": 4.333333333333333,
"grad_norm": 0.3434119449215798,
"learning_rate": 7.1522755831664345e-06,
"loss": 0.3128,
"num_tokens": 2433080928.0,
"step": 9295
},
{
"epoch": 4.335664335664336,
"grad_norm": 0.31426173762807547,
"learning_rate": 7.137493569015252e-06,
"loss": 0.3136,
"num_tokens": 2434388313.0,
"step": 9300
},
{
"epoch": 4.337995337995338,
"grad_norm": 0.33860929397786954,
"learning_rate": 7.122759960176764e-06,
"loss": 0.316,
"num_tokens": 2435699033.0,
"step": 9305
},
{
"epoch": 4.340326340326341,
"grad_norm": 0.32512758447513196,
"learning_rate": 7.108074791675377e-06,
"loss": 0.3276,
"num_tokens": 2437009753.0,
"step": 9310
},
{
"epoch": 4.3426573426573425,
"grad_norm": 0.3408691365223931,
"learning_rate": 7.093438098420364e-06,
"loss": 0.3111,
"num_tokens": 2438320473.0,
"step": 9315
},
{
"epoch": 4.344988344988345,
"grad_norm": 10.7223619115978,
"learning_rate": 7.078849915205761e-06,
"loss": 0.3984,
"num_tokens": 2439624916.0,
"step": 9320
},
{
"epoch": 4.347319347319347,
"grad_norm": 0.3465031175218188,
"learning_rate": 7.06431027671028e-06,
"loss": 0.3282,
"num_tokens": 2440935636.0,
"step": 9325
},
{
"epoch": 4.34965034965035,
"grad_norm": 0.34298399330314866,
"learning_rate": 7.049819217497229e-06,
"loss": 0.3151,
"num_tokens": 2442246356.0,
"step": 9330
},
{
"epoch": 4.351981351981352,
"grad_norm": 0.32774175918296494,
"learning_rate": 7.0353767720144585e-06,
"loss": 0.311,
"num_tokens": 2443557076.0,
"step": 9335
},
{
"epoch": 4.354312354312354,
"grad_norm": 0.3445401287963301,
"learning_rate": 7.020982974594234e-06,
"loss": 0.3177,
"num_tokens": 2444867796.0,
"step": 9340
},
{
"epoch": 4.356643356643357,
"grad_norm": 0.33840413843763606,
"learning_rate": 7.006637859453166e-06,
"loss": 0.3175,
"num_tokens": 2446166667.0,
"step": 9345
},
{
"epoch": 4.358974358974359,
"grad_norm": 0.3442185853976276,
"learning_rate": 6.99234146069218e-06,
"loss": 0.3285,
"num_tokens": 2447477387.0,
"step": 9350
},
{
"epoch": 4.361305361305361,
"grad_norm": 0.32304618994576395,
"learning_rate": 6.978093812296353e-06,
"loss": 0.3241,
"num_tokens": 2448788107.0,
"step": 9355
},
{
"epoch": 4.363636363636363,
"grad_norm": 0.3371003210685687,
"learning_rate": 6.963894948134886e-06,
"loss": 0.3153,
"num_tokens": 2450098827.0,
"step": 9360
},
{
"epoch": 4.365967365967366,
"grad_norm": 0.32770902047531997,
"learning_rate": 6.949744901961018e-06,
"loss": 0.3205,
"num_tokens": 2451409547.0,
"step": 9365
},
{
"epoch": 4.368298368298368,
"grad_norm": 0.3316976289164916,
"learning_rate": 6.935643707411941e-06,
"loss": 0.3181,
"num_tokens": 2452715351.0,
"step": 9370
},
{
"epoch": 4.370629370629371,
"grad_norm": 0.3297586670212026,
"learning_rate": 6.9215913980087e-06,
"loss": 0.3127,
"num_tokens": 2454026071.0,
"step": 9375
},
{
"epoch": 4.372960372960373,
"grad_norm": 0.34164744331202934,
"learning_rate": 6.907588007156147e-06,
"loss": 0.3167,
"num_tokens": 2455323641.0,
"step": 9380
},
{
"epoch": 4.375291375291376,
"grad_norm": 0.3325814425613154,
"learning_rate": 6.893633568142849e-06,
"loss": 0.3115,
"num_tokens": 2456634361.0,
"step": 9385
},
{
"epoch": 4.3776223776223775,
"grad_norm": 0.358839160865776,
"learning_rate": 6.87972811414099e-06,
"loss": 0.3007,
"num_tokens": 2457945081.0,
"step": 9390
},
{
"epoch": 4.37995337995338,
"grad_norm": 0.34252531902628414,
"learning_rate": 6.865871678206317e-06,
"loss": 0.3189,
"num_tokens": 2459255801.0,
"step": 9395
},
{
"epoch": 4.382284382284382,
"grad_norm": 0.3241607534033865,
"learning_rate": 6.85206429327806e-06,
"loss": 0.3063,
"num_tokens": 2460566521.0,
"step": 9400
},
{
"epoch": 4.384615384615385,
"grad_norm": 0.32915418617519726,
"learning_rate": 6.838305992178824e-06,
"loss": 0.3181,
"num_tokens": 2461877241.0,
"step": 9405
},
{
"epoch": 4.386946386946387,
"grad_norm": 0.3252694583787816,
"learning_rate": 6.824596807614559e-06,
"loss": 0.3115,
"num_tokens": 2463187961.0,
"step": 9410
},
{
"epoch": 4.389277389277389,
"grad_norm": 0.3293083654300197,
"learning_rate": 6.810936772174439e-06,
"loss": 0.3235,
"num_tokens": 2464495546.0,
"step": 9415
},
{
"epoch": 4.391608391608392,
"grad_norm": 0.3542236093862788,
"learning_rate": 6.797325918330806e-06,
"loss": 0.3032,
"num_tokens": 2465796046.0,
"step": 9420
},
{
"epoch": 4.393939393939394,
"grad_norm": 0.3330424585038265,
"learning_rate": 6.783764278439092e-06,
"loss": 0.3112,
"num_tokens": 2467106766.0,
"step": 9425
},
{
"epoch": 4.396270396270396,
"grad_norm": 0.34014850867321345,
"learning_rate": 6.77025188473773e-06,
"loss": 0.3108,
"num_tokens": 2468417486.0,
"step": 9430
},
{
"epoch": 4.398601398601398,
"grad_norm": 0.3325831324351841,
"learning_rate": 6.756788769348103e-06,
"loss": 0.3189,
"num_tokens": 2469728206.0,
"step": 9435
},
{
"epoch": 4.400932400932401,
"grad_norm": 0.33398045142731836,
"learning_rate": 6.743374964274427e-06,
"loss": 0.3212,
"num_tokens": 2471038926.0,
"step": 9440
},
{
"epoch": 4.403263403263403,
"grad_norm": 0.3149622560196894,
"learning_rate": 6.730010501403718e-06,
"loss": 0.3103,
"num_tokens": 2472349646.0,
"step": 9445
},
{
"epoch": 4.405594405594406,
"grad_norm": 0.3296632544724334,
"learning_rate": 6.716695412505688e-06,
"loss": 0.3141,
"num_tokens": 2473660366.0,
"step": 9450
},
{
"epoch": 4.407925407925408,
"grad_norm": 0.3266017858365408,
"learning_rate": 6.703429729232682e-06,
"loss": 0.3195,
"num_tokens": 2474971086.0,
"step": 9455
},
{
"epoch": 4.410256410256411,
"grad_norm": 0.3373805559516959,
"learning_rate": 6.690213483119595e-06,
"loss": 0.312,
"num_tokens": 2476281806.0,
"step": 9460
},
{
"epoch": 4.4125874125874125,
"grad_norm": 0.3366721023486427,
"learning_rate": 6.677046705583806e-06,
"loss": 0.3171,
"num_tokens": 2477592526.0,
"step": 9465
},
{
"epoch": 4.414918414918415,
"grad_norm": 0.3310996468709432,
"learning_rate": 6.663929427925095e-06,
"loss": 0.3054,
"num_tokens": 2478903246.0,
"step": 9470
},
{
"epoch": 4.417249417249417,
"grad_norm": 0.3237625603637502,
"learning_rate": 6.650861681325567e-06,
"loss": 0.3063,
"num_tokens": 2480204978.0,
"step": 9475
},
{
"epoch": 4.41958041958042,
"grad_norm": 0.35443174214884327,
"learning_rate": 6.6378434968495965e-06,
"loss": 0.3186,
"num_tokens": 2481515698.0,
"step": 9480
},
{
"epoch": 4.421911421911422,
"grad_norm": 0.34643582807007817,
"learning_rate": 6.624874905443726e-06,
"loss": 0.3104,
"num_tokens": 2482810080.0,
"step": 9485
},
{
"epoch": 4.424242424242424,
"grad_norm": 0.3353431088763468,
"learning_rate": 6.611955937936619e-06,
"loss": 0.3042,
"num_tokens": 2484095674.0,
"step": 9490
},
{
"epoch": 4.426573426573427,
"grad_norm": 0.3242056478567321,
"learning_rate": 6.599086625038957e-06,
"loss": 0.32,
"num_tokens": 2485406394.0,
"step": 9495
},
{
"epoch": 4.428904428904429,
"grad_norm": 0.3331241038387844,
"learning_rate": 6.586266997343402e-06,
"loss": 0.3078,
"num_tokens": 2486697670.0,
"step": 9500
},
{
"epoch": 4.431235431235431,
"grad_norm": 0.3288847500668807,
"learning_rate": 6.5734970853244985e-06,
"loss": 0.3095,
"num_tokens": 2488008390.0,
"step": 9505
},
{
"epoch": 4.433566433566433,
"grad_norm": 0.3444690042268666,
"learning_rate": 6.560776919338599e-06,
"loss": 0.3171,
"num_tokens": 2489319110.0,
"step": 9510
},
{
"epoch": 4.435897435897436,
"grad_norm": 0.3401045593755526,
"learning_rate": 6.5481065296238155e-06,
"loss": 0.3233,
"num_tokens": 2490621224.0,
"step": 9515
},
{
"epoch": 4.438228438228438,
"grad_norm": 0.32485207168584423,
"learning_rate": 6.535485946299927e-06,
"loss": 0.3,
"num_tokens": 2491931944.0,
"step": 9520
},
{
"epoch": 4.440559440559441,
"grad_norm": 0.3362671229182379,
"learning_rate": 6.5229151993683065e-06,
"loss": 0.3231,
"num_tokens": 2493242664.0,
"step": 9525
},
{
"epoch": 4.442890442890443,
"grad_norm": 0.3188938856125156,
"learning_rate": 6.5103943187118654e-06,
"loss": 0.3248,
"num_tokens": 2494553384.0,
"step": 9530
},
{
"epoch": 4.445221445221446,
"grad_norm": 0.3206644304295667,
"learning_rate": 6.49792333409498e-06,
"loss": 0.3193,
"num_tokens": 2495864104.0,
"step": 9535
},
{
"epoch": 4.4475524475524475,
"grad_norm": 0.32725720854706297,
"learning_rate": 6.485502275163401e-06,
"loss": 0.3128,
"num_tokens": 2497174824.0,
"step": 9540
},
{
"epoch": 4.449883449883449,
"grad_norm": 0.3377685213956361,
"learning_rate": 6.473131171444192e-06,
"loss": 0.3098,
"num_tokens": 2498485544.0,
"step": 9545
},
{
"epoch": 4.452214452214452,
"grad_norm": 0.3267131751428466,
"learning_rate": 6.460810052345697e-06,
"loss": 0.3122,
"num_tokens": 2499796264.0,
"step": 9550
},
{
"epoch": 4.454545454545454,
"grad_norm": 0.3369467611573572,
"learning_rate": 6.4485389471574025e-06,
"loss": 0.3121,
"num_tokens": 2501097144.0,
"step": 9555
},
{
"epoch": 4.456876456876457,
"grad_norm": 0.31471698424983463,
"learning_rate": 6.4363178850499115e-06,
"loss": 0.3114,
"num_tokens": 2502407864.0,
"step": 9560
},
{
"epoch": 4.459207459207459,
"grad_norm": 0.3182422570183859,
"learning_rate": 6.424146895074878e-06,
"loss": 0.3217,
"num_tokens": 2503718584.0,
"step": 9565
},
{
"epoch": 4.461538461538462,
"grad_norm": 0.32824460237041364,
"learning_rate": 6.41202600616492e-06,
"loss": 0.312,
"num_tokens": 2505029304.0,
"step": 9570
},
{
"epoch": 4.463869463869464,
"grad_norm": 0.3276454341355643,
"learning_rate": 6.399955247133547e-06,
"loss": 0.3233,
"num_tokens": 2506340024.0,
"step": 9575
},
{
"epoch": 4.466200466200466,
"grad_norm": 0.35551047742829733,
"learning_rate": 6.387934646675109e-06,
"loss": 0.3172,
"num_tokens": 2507650744.0,
"step": 9580
},
{
"epoch": 4.468531468531468,
"grad_norm": 0.33605570429689574,
"learning_rate": 6.375964233364725e-06,
"loss": 0.3353,
"num_tokens": 2508961464.0,
"step": 9585
},
{
"epoch": 4.470862470862471,
"grad_norm": 0.3172835951855472,
"learning_rate": 6.364044035658198e-06,
"loss": 0.3063,
"num_tokens": 2510272184.0,
"step": 9590
},
{
"epoch": 4.473193473193473,
"grad_norm": 0.3207971543619864,
"learning_rate": 6.352174081891969e-06,
"loss": 0.3132,
"num_tokens": 2511582904.0,
"step": 9595
},
{
"epoch": 4.475524475524476,
"grad_norm": 0.31751145509697243,
"learning_rate": 6.340354400283039e-06,
"loss": 0.3107,
"num_tokens": 2512893624.0,
"step": 9600
},
{
"epoch": 4.477855477855478,
"grad_norm": 0.3384772695617782,
"learning_rate": 6.328585018928896e-06,
"loss": 0.3239,
"num_tokens": 2514204344.0,
"step": 9605
},
{
"epoch": 4.480186480186481,
"grad_norm": 0.34136380265968547,
"learning_rate": 6.31686596580746e-06,
"loss": 0.3159,
"num_tokens": 2515515064.0,
"step": 9610
},
{
"epoch": 4.4825174825174825,
"grad_norm": 0.34265867835608826,
"learning_rate": 6.305197268777023e-06,
"loss": 0.3232,
"num_tokens": 2516804613.0,
"step": 9615
},
{
"epoch": 4.484848484848484,
"grad_norm": 0.3471232349713498,
"learning_rate": 6.293578955576149e-06,
"loss": 0.3162,
"num_tokens": 2518115333.0,
"step": 9620
},
{
"epoch": 4.487179487179487,
"grad_norm": 0.33307127442521534,
"learning_rate": 6.28201105382364e-06,
"loss": 0.3196,
"num_tokens": 2519426053.0,
"step": 9625
},
{
"epoch": 4.489510489510489,
"grad_norm": 0.33359074202000116,
"learning_rate": 6.2704935910184785e-06,
"loss": 0.3136,
"num_tokens": 2520736773.0,
"step": 9630
},
{
"epoch": 4.491841491841492,
"grad_norm": 0.3240069604325885,
"learning_rate": 6.259026594539719e-06,
"loss": 0.3188,
"num_tokens": 2522047493.0,
"step": 9635
},
{
"epoch": 4.494172494172494,
"grad_norm": 0.3388605125051464,
"learning_rate": 6.2476100916464585e-06,
"loss": 0.3154,
"num_tokens": 2523358213.0,
"step": 9640
},
{
"epoch": 4.496503496503497,
"grad_norm": 0.32541913779560644,
"learning_rate": 6.236244109477764e-06,
"loss": 0.3197,
"num_tokens": 2524663045.0,
"step": 9645
},
{
"epoch": 4.498834498834499,
"grad_norm": 0.33945252106405477,
"learning_rate": 6.224928675052609e-06,
"loss": 0.3211,
"num_tokens": 2525973765.0,
"step": 9650
},
{
"epoch": 4.501165501165501,
"grad_norm": 0.3476363419781912,
"learning_rate": 6.213663815269794e-06,
"loss": 0.3079,
"num_tokens": 2527279335.0,
"step": 9655
},
{
"epoch": 4.503496503496503,
"grad_norm": 0.33583724777887775,
"learning_rate": 6.202449556907903e-06,
"loss": 0.325,
"num_tokens": 2528590055.0,
"step": 9660
},
{
"epoch": 4.505827505827506,
"grad_norm": 0.3216459701800872,
"learning_rate": 6.191285926625236e-06,
"loss": 0.3106,
"num_tokens": 2529900775.0,
"step": 9665
},
{
"epoch": 4.508158508158508,
"grad_norm": 0.3346538262362633,
"learning_rate": 6.180172950959726e-06,
"loss": 0.3161,
"num_tokens": 2531211495.0,
"step": 9670
},
{
"epoch": 4.510489510489511,
"grad_norm": 0.33904717924175304,
"learning_rate": 6.169110656328905e-06,
"loss": 0.3256,
"num_tokens": 2532522215.0,
"step": 9675
},
{
"epoch": 4.512820512820513,
"grad_norm": 0.31774716560986643,
"learning_rate": 6.158099069029825e-06,
"loss": 0.3101,
"num_tokens": 2533832935.0,
"step": 9680
},
{
"epoch": 4.515151515151516,
"grad_norm": 0.3219903613779173,
"learning_rate": 6.147138215238987e-06,
"loss": 0.3175,
"num_tokens": 2535143655.0,
"step": 9685
},
{
"epoch": 4.5174825174825175,
"grad_norm": 0.3270086455520368,
"learning_rate": 6.136228121012301e-06,
"loss": 0.3025,
"num_tokens": 2536454375.0,
"step": 9690
},
{
"epoch": 4.519813519813519,
"grad_norm": 0.32790806083662694,
"learning_rate": 6.125368812285014e-06,
"loss": 0.324,
"num_tokens": 2537765095.0,
"step": 9695
},
{
"epoch": 4.522144522144522,
"grad_norm": 0.3374631769629436,
"learning_rate": 6.11456031487163e-06,
"loss": 0.3113,
"num_tokens": 2539075815.0,
"step": 9700
},
{
"epoch": 4.524475524475524,
"grad_norm": 0.3165604361693966,
"learning_rate": 6.103802654465887e-06,
"loss": 0.3189,
"num_tokens": 2540386535.0,
"step": 9705
},
{
"epoch": 4.526806526806527,
"grad_norm": 0.33329264324814467,
"learning_rate": 6.093095856640659e-06,
"loss": 0.3267,
"num_tokens": 2541697255.0,
"step": 9710
},
{
"epoch": 4.529137529137529,
"grad_norm": 0.318747910398136,
"learning_rate": 6.082439946847914e-06,
"loss": 0.3152,
"num_tokens": 2543007975.0,
"step": 9715
},
{
"epoch": 4.531468531468532,
"grad_norm": 0.3395245128133879,
"learning_rate": 6.0718349504186596e-06,
"loss": 0.3177,
"num_tokens": 2544305294.0,
"step": 9720
},
{
"epoch": 4.533799533799534,
"grad_norm": 0.3234654794073021,
"learning_rate": 6.061280892562856e-06,
"loss": 0.313,
"num_tokens": 2545614275.0,
"step": 9725
},
{
"epoch": 4.536130536130536,
"grad_norm": 0.3230472732102473,
"learning_rate": 6.050777798369387e-06,
"loss": 0.3145,
"num_tokens": 2546924995.0,
"step": 9730
},
{
"epoch": 4.538461538461538,
"grad_norm": 0.32753670861955114,
"learning_rate": 6.040325692805984e-06,
"loss": 0.3119,
"num_tokens": 2548235715.0,
"step": 9735
},
{
"epoch": 4.540792540792541,
"grad_norm": 0.3312438687676759,
"learning_rate": 6.029924600719165e-06,
"loss": 0.3168,
"num_tokens": 2549546435.0,
"step": 9740
},
{
"epoch": 4.543123543123543,
"grad_norm": 0.34207396319559835,
"learning_rate": 6.019574546834186e-06,
"loss": 0.329,
"num_tokens": 2550857155.0,
"step": 9745
},
{
"epoch": 4.545454545454545,
"grad_norm": 0.32384193950890633,
"learning_rate": 6.009275555754967e-06,
"loss": 0.3133,
"num_tokens": 2552167875.0,
"step": 9750
},
{
"epoch": 4.547785547785548,
"grad_norm": 0.33222986120067743,
"learning_rate": 5.999027651964054e-06,
"loss": 0.3178,
"num_tokens": 2553478595.0,
"step": 9755
},
{
"epoch": 4.550116550116551,
"grad_norm": 0.3394591599285521,
"learning_rate": 5.988830859822541e-06,
"loss": 0.3106,
"num_tokens": 2554789315.0,
"step": 9760
},
{
"epoch": 4.5524475524475525,
"grad_norm": 0.3273894065028813,
"learning_rate": 5.978685203570021e-06,
"loss": 0.3109,
"num_tokens": 2556100035.0,
"step": 9765
},
{
"epoch": 4.554778554778554,
"grad_norm": 0.3441663270198807,
"learning_rate": 5.968590707324535e-06,
"loss": 0.3214,
"num_tokens": 2557410755.0,
"step": 9770
},
{
"epoch": 4.557109557109557,
"grad_norm": 0.3305110868532999,
"learning_rate": 5.958547395082498e-06,
"loss": 0.3214,
"num_tokens": 2558721475.0,
"step": 9775
},
{
"epoch": 4.559440559440559,
"grad_norm": 0.3318319061404925,
"learning_rate": 5.948555290718658e-06,
"loss": 0.3203,
"num_tokens": 2560032195.0,
"step": 9780
},
{
"epoch": 4.561771561771562,
"grad_norm": 0.3143112654235783,
"learning_rate": 5.938614417986035e-06,
"loss": 0.3238,
"num_tokens": 2561342915.0,
"step": 9785
},
{
"epoch": 4.564102564102564,
"grad_norm": 0.33345941184977745,
"learning_rate": 5.928724800515848e-06,
"loss": 0.3143,
"num_tokens": 2562653635.0,
"step": 9790
},
{
"epoch": 4.566433566433567,
"grad_norm": 0.34686197219373827,
"learning_rate": 5.91888646181749e-06,
"loss": 0.3137,
"num_tokens": 2563948407.0,
"step": 9795
},
{
"epoch": 4.568764568764569,
"grad_norm": 0.33117506317785395,
"learning_rate": 5.909099425278451e-06,
"loss": 0.32,
"num_tokens": 2565259127.0,
"step": 9800
},
{
"epoch": 4.571095571095571,
"grad_norm": 0.3435052314461775,
"learning_rate": 5.899363714164259e-06,
"loss": 0.3148,
"num_tokens": 2566569847.0,
"step": 9805
},
{
"epoch": 4.573426573426573,
"grad_norm": 0.3511413949641888,
"learning_rate": 5.889679351618435e-06,
"loss": 0.3239,
"num_tokens": 2567880567.0,
"step": 9810
},
{
"epoch": 4.575757575757576,
"grad_norm": 0.3277981153602279,
"learning_rate": 5.880046360662442e-06,
"loss": 0.319,
"num_tokens": 2569191287.0,
"step": 9815
},
{
"epoch": 4.578088578088578,
"grad_norm": 0.34041898778575075,
"learning_rate": 5.870464764195621e-06,
"loss": 0.3117,
"num_tokens": 2570502007.0,
"step": 9820
},
{
"epoch": 4.58041958041958,
"grad_norm": 0.3243193311955865,
"learning_rate": 5.8609345849951275e-06,
"loss": 0.312,
"num_tokens": 2571812727.0,
"step": 9825
},
{
"epoch": 4.582750582750583,
"grad_norm": 0.3310866370673146,
"learning_rate": 5.851455845715912e-06,
"loss": 0.3109,
"num_tokens": 2573123447.0,
"step": 9830
},
{
"epoch": 4.585081585081585,
"grad_norm": 0.31513987342632316,
"learning_rate": 5.842028568890624e-06,
"loss": 0.3069,
"num_tokens": 2574434167.0,
"step": 9835
},
{
"epoch": 4.5874125874125875,
"grad_norm": 0.3275061402794141,
"learning_rate": 5.832652776929576e-06,
"loss": 0.3048,
"num_tokens": 2575744887.0,
"step": 9840
},
{
"epoch": 4.589743589743589,
"grad_norm": 0.33530078415240044,
"learning_rate": 5.823328492120709e-06,
"loss": 0.3205,
"num_tokens": 2577055607.0,
"step": 9845
},
{
"epoch": 4.592074592074592,
"grad_norm": 0.3482685265210104,
"learning_rate": 5.814055736629512e-06,
"loss": 0.3222,
"num_tokens": 2578366327.0,
"step": 9850
},
{
"epoch": 4.594405594405594,
"grad_norm": 0.3537893890025293,
"learning_rate": 5.804834532498973e-06,
"loss": 0.3125,
"num_tokens": 2579662506.0,
"step": 9855
},
{
"epoch": 4.596736596736597,
"grad_norm": 0.3358652150747448,
"learning_rate": 5.795664901649546e-06,
"loss": 0.3123,
"num_tokens": 2580973226.0,
"step": 9860
},
{
"epoch": 4.599067599067599,
"grad_norm": 0.35598478994424876,
"learning_rate": 5.78654686587908e-06,
"loss": 0.3333,
"num_tokens": 2582283946.0,
"step": 9865
},
{
"epoch": 4.601398601398602,
"grad_norm": 0.331525987147412,
"learning_rate": 5.777480446862771e-06,
"loss": 0.3199,
"num_tokens": 2583594666.0,
"step": 9870
},
{
"epoch": 4.603729603729604,
"grad_norm": 0.32178290426004424,
"learning_rate": 5.768465666153116e-06,
"loss": 0.3289,
"num_tokens": 2584905386.0,
"step": 9875
},
{
"epoch": 4.606060606060606,
"grad_norm": 0.30393892105397075,
"learning_rate": 5.759502545179865e-06,
"loss": 0.3076,
"num_tokens": 2586200553.0,
"step": 9880
},
{
"epoch": 4.608391608391608,
"grad_norm": 0.32968332333326905,
"learning_rate": 5.750591105249945e-06,
"loss": 0.3105,
"num_tokens": 2587511273.0,
"step": 9885
},
{
"epoch": 4.610722610722611,
"grad_norm": 0.3231640929433455,
"learning_rate": 5.741731367547445e-06,
"loss": 0.3175,
"num_tokens": 2588821993.0,
"step": 9890
},
{
"epoch": 4.613053613053613,
"grad_norm": 0.31411918328374727,
"learning_rate": 5.732923353133545e-06,
"loss": 0.3102,
"num_tokens": 2590132713.0,
"step": 9895
},
{
"epoch": 4.615384615384615,
"grad_norm": 0.335002070080694,
"learning_rate": 5.724167082946466e-06,
"loss": 0.3225,
"num_tokens": 2591443433.0,
"step": 9900
},
{
"epoch": 4.617715617715618,
"grad_norm": 0.3229052741900776,
"learning_rate": 5.715462577801427e-06,
"loss": 0.3156,
"num_tokens": 2592754153.0,
"step": 9905
},
{
"epoch": 4.62004662004662,
"grad_norm": 0.3297660710371224,
"learning_rate": 5.706809858390583e-06,
"loss": 0.3276,
"num_tokens": 2594060378.0,
"step": 9910
},
{
"epoch": 4.6223776223776225,
"grad_norm": 0.3155783908720556,
"learning_rate": 5.698208945283e-06,
"loss": 0.2992,
"num_tokens": 2595371098.0,
"step": 9915
},
{
"epoch": 4.624708624708624,
"grad_norm": 0.3193903370909751,
"learning_rate": 5.689659858924586e-06,
"loss": 0.3068,
"num_tokens": 2596681818.0,
"step": 9920
},
{
"epoch": 4.627039627039627,
"grad_norm": 0.33774842672012795,
"learning_rate": 5.6811626196380385e-06,
"loss": 0.3121,
"num_tokens": 2597992538.0,
"step": 9925
},
{
"epoch": 4.629370629370629,
"grad_norm": 0.33740225064211665,
"learning_rate": 5.672717247622816e-06,
"loss": 0.3102,
"num_tokens": 2599303258.0,
"step": 9930
},
{
"epoch": 4.631701631701632,
"grad_norm": 0.3373159671469985,
"learning_rate": 5.664323762955072e-06,
"loss": 0.3221,
"num_tokens": 2600613978.0,
"step": 9935
},
{
"epoch": 4.634032634032634,
"grad_norm": 0.35331673034205946,
"learning_rate": 5.655982185587621e-06,
"loss": 0.3184,
"num_tokens": 2601924698.0,
"step": 9940
},
{
"epoch": 4.636363636363637,
"grad_norm": 0.34075562342066007,
"learning_rate": 5.647692535349884e-06,
"loss": 0.3176,
"num_tokens": 2603235418.0,
"step": 9945
},
{
"epoch": 4.638694638694639,
"grad_norm": 0.3272741009052787,
"learning_rate": 5.6394548319478325e-06,
"loss": 0.308,
"num_tokens": 2604546138.0,
"step": 9950
},
{
"epoch": 4.641025641025641,
"grad_norm": 0.3353878917152132,
"learning_rate": 5.631269094963962e-06,
"loss": 0.3132,
"num_tokens": 2605856858.0,
"step": 9955
},
{
"epoch": 4.643356643356643,
"grad_norm": 0.335095370367202,
"learning_rate": 5.623135343857232e-06,
"loss": 0.3179,
"num_tokens": 2607167578.0,
"step": 9960
},
{
"epoch": 4.645687645687646,
"grad_norm": 0.3448209805418296,
"learning_rate": 5.615053597963018e-06,
"loss": 0.3266,
"num_tokens": 2608468942.0,
"step": 9965
},
{
"epoch": 4.648018648018648,
"grad_norm": 0.3365763346832491,
"learning_rate": 5.607023876493075e-06,
"loss": 0.3251,
"num_tokens": 2609779662.0,
"step": 9970
},
{
"epoch": 4.65034965034965,
"grad_norm": 0.3371348358061654,
"learning_rate": 5.59904619853548e-06,
"loss": 0.314,
"num_tokens": 2611090382.0,
"step": 9975
},
{
"epoch": 4.652680652680653,
"grad_norm": 0.3211402055694574,
"learning_rate": 5.591120583054602e-06,
"loss": 0.3172,
"num_tokens": 2612401102.0,
"step": 9980
},
{
"epoch": 4.655011655011655,
"grad_norm": 0.32650824383942834,
"learning_rate": 5.583247048891042e-06,
"loss": 0.3177,
"num_tokens": 2613704775.0,
"step": 9985
},
{
"epoch": 4.6573426573426575,
"grad_norm": 0.33015275188111703,
"learning_rate": 5.575425614761597e-06,
"loss": 0.3105,
"num_tokens": 2615005017.0,
"step": 9990
},
{
"epoch": 4.659673659673659,
"grad_norm": 0.3350066530847012,
"learning_rate": 5.567656299259212e-06,
"loss": 0.3179,
"num_tokens": 2616315737.0,
"step": 9995
},
{
"epoch": 4.662004662004662,
"grad_norm": 0.3296673905861499,
"learning_rate": 5.559939120852936e-06,
"loss": 0.3183,
"num_tokens": 2617626457.0,
"step": 10000
},
{
"epoch": 4.664335664335664,
"grad_norm": 0.3314605447902211,
"learning_rate": 5.552274097887879e-06,
"loss": 0.311,
"num_tokens": 2618937177.0,
"step": 10005
},
{
"epoch": 4.666666666666667,
"grad_norm": 0.31963246919644556,
"learning_rate": 5.544661248585172e-06,
"loss": 0.3148,
"num_tokens": 2620238809.0,
"step": 10010
},
{
"epoch": 4.668997668997669,
"grad_norm": 0.32606186826503414,
"learning_rate": 5.537100591041915e-06,
"loss": 0.3197,
"num_tokens": 2621537881.0,
"step": 10015
},
{
"epoch": 4.671328671328672,
"grad_norm": 0.3127362641909877,
"learning_rate": 5.529592143231142e-06,
"loss": 0.3187,
"num_tokens": 2622848601.0,
"step": 10020
},
{
"epoch": 4.673659673659674,
"grad_norm": 0.34140879351797326,
"learning_rate": 5.522135923001767e-06,
"loss": 0.3129,
"num_tokens": 2624159321.0,
"step": 10025
},
{
"epoch": 4.6759906759906755,
"grad_norm": 0.32642677770519635,
"learning_rate": 5.514731948078565e-06,
"loss": 0.3089,
"num_tokens": 2625454738.0,
"step": 10030
},
{
"epoch": 4.678321678321678,
"grad_norm": 0.329034389428116,
"learning_rate": 5.5073802360621035e-06,
"loss": 0.315,
"num_tokens": 2626765458.0,
"step": 10035
},
{
"epoch": 4.680652680652681,
"grad_norm": 0.33163073393602516,
"learning_rate": 5.50008080442871e-06,
"loss": 0.3146,
"num_tokens": 2628076178.0,
"step": 10040
},
{
"epoch": 4.682983682983683,
"grad_norm": 0.3272067629094256,
"learning_rate": 5.492833670530445e-06,
"loss": 0.3173,
"num_tokens": 2629386898.0,
"step": 10045
},
{
"epoch": 4.685314685314685,
"grad_norm": 0.3144574455080941,
"learning_rate": 5.485638851595033e-06,
"loss": 0.3054,
"num_tokens": 2630697618.0,
"step": 10050
},
{
"epoch": 4.687645687645688,
"grad_norm": 0.33627454004902,
"learning_rate": 5.478496364725844e-06,
"loss": 0.3188,
"num_tokens": 2632008338.0,
"step": 10055
},
{
"epoch": 4.68997668997669,
"grad_norm": 0.3353988218499314,
"learning_rate": 5.471406226901843e-06,
"loss": 0.3178,
"num_tokens": 2633315147.0,
"step": 10060
},
{
"epoch": 4.6923076923076925,
"grad_norm": 0.34081231503644693,
"learning_rate": 5.464368454977559e-06,
"loss": 0.3181,
"num_tokens": 2634625867.0,
"step": 10065
},
{
"epoch": 4.694638694638694,
"grad_norm": 0.32001347894247395,
"learning_rate": 5.457383065683023e-06,
"loss": 0.3094,
"num_tokens": 2635936587.0,
"step": 10070
},
{
"epoch": 4.696969696969697,
"grad_norm": 0.33209202640372304,
"learning_rate": 5.450450075623761e-06,
"loss": 0.3203,
"num_tokens": 2637233194.0,
"step": 10075
},
{
"epoch": 4.699300699300699,
"grad_norm": 0.3444659685507314,
"learning_rate": 5.443569501280724e-06,
"loss": 0.3298,
"num_tokens": 2638543914.0,
"step": 10080
},
{
"epoch": 4.701631701631702,
"grad_norm": 0.32572981399860407,
"learning_rate": 5.436741359010265e-06,
"loss": 0.3145,
"num_tokens": 2639854634.0,
"step": 10085
},
{
"epoch": 4.703962703962704,
"grad_norm": 0.3265125453077855,
"learning_rate": 5.429965665044099e-06,
"loss": 0.3113,
"num_tokens": 2641165354.0,
"step": 10090
},
{
"epoch": 4.706293706293707,
"grad_norm": 0.32908645785435287,
"learning_rate": 5.4232424354892605e-06,
"loss": 0.3259,
"num_tokens": 2642476074.0,
"step": 10095
},
{
"epoch": 4.708624708624709,
"grad_norm": 0.32471263137765566,
"learning_rate": 5.4165716863280626e-06,
"loss": 0.3148,
"num_tokens": 2643786794.0,
"step": 10100
},
{
"epoch": 4.7109557109557105,
"grad_norm": 0.33045093010828397,
"learning_rate": 5.409953433418071e-06,
"loss": 0.3265,
"num_tokens": 2645097514.0,
"step": 10105
},
{
"epoch": 4.713286713286713,
"grad_norm": 0.3293031597544229,
"learning_rate": 5.403387692492053e-06,
"loss": 0.312,
"num_tokens": 2646390978.0,
"step": 10110
},
{
"epoch": 4.715617715617715,
"grad_norm": 0.33516744422096806,
"learning_rate": 5.396874479157943e-06,
"loss": 0.3169,
"num_tokens": 2647689285.0,
"step": 10115
},
{
"epoch": 4.717948717948718,
"grad_norm": 0.3351308591346485,
"learning_rate": 5.39041380889882e-06,
"loss": 0.3235,
"num_tokens": 2648996984.0,
"step": 10120
},
{
"epoch": 4.72027972027972,
"grad_norm": 0.3345027011060605,
"learning_rate": 5.384005697072842e-06,
"loss": 0.308,
"num_tokens": 2650307704.0,
"step": 10125
},
{
"epoch": 4.722610722610723,
"grad_norm": 0.326143155782565,
"learning_rate": 5.377650158913239e-06,
"loss": 0.3272,
"num_tokens": 2651618424.0,
"step": 10130
},
{
"epoch": 4.724941724941725,
"grad_norm": 0.3382309775663869,
"learning_rate": 5.371347209528259e-06,
"loss": 0.3201,
"num_tokens": 2652929144.0,
"step": 10135
},
{
"epoch": 4.7272727272727275,
"grad_norm": 0.3229354536782934,
"learning_rate": 5.365096863901139e-06,
"loss": 0.317,
"num_tokens": 2654239864.0,
"step": 10140
},
{
"epoch": 4.729603729603729,
"grad_norm": 0.3265549491709372,
"learning_rate": 5.3588991368900655e-06,
"loss": 0.3197,
"num_tokens": 2655550584.0,
"step": 10145
},
{
"epoch": 4.731934731934732,
"grad_norm": 0.3317883087071852,
"learning_rate": 5.352754043228138e-06,
"loss": 0.3105,
"num_tokens": 2656861304.0,
"step": 10150
},
{
"epoch": 4.734265734265734,
"grad_norm": 0.30886908963659704,
"learning_rate": 5.346661597523347e-06,
"loss": 0.3183,
"num_tokens": 2658172024.0,
"step": 10155
},
{
"epoch": 4.736596736596737,
"grad_norm": 0.3265593067229369,
"learning_rate": 5.340621814258523e-06,
"loss": 0.3113,
"num_tokens": 2659482744.0,
"step": 10160
},
{
"epoch": 4.738927738927739,
"grad_norm": 0.34583140436007476,
"learning_rate": 5.334634707791303e-06,
"loss": 0.3193,
"num_tokens": 2660789135.0,
"step": 10165
},
{
"epoch": 4.741258741258742,
"grad_norm": 0.33571097006056455,
"learning_rate": 5.328700292354117e-06,
"loss": 0.3122,
"num_tokens": 2662099855.0,
"step": 10170
},
{
"epoch": 4.743589743589744,
"grad_norm": 0.3233278556899711,
"learning_rate": 5.322818582054123e-06,
"loss": 0.3159,
"num_tokens": 2663410575.0,
"step": 10175
},
{
"epoch": 4.7459207459207455,
"grad_norm": 0.3317303155316829,
"learning_rate": 5.316989590873196e-06,
"loss": 0.3194,
"num_tokens": 2664721295.0,
"step": 10180
},
{
"epoch": 4.748251748251748,
"grad_norm": 0.33625182489617234,
"learning_rate": 5.311213332667893e-06,
"loss": 0.3163,
"num_tokens": 2666032015.0,
"step": 10185
},
{
"epoch": 4.75058275058275,
"grad_norm": 0.3356074033954618,
"learning_rate": 5.305489821169408e-06,
"loss": 0.3078,
"num_tokens": 2667342735.0,
"step": 10190
},
{
"epoch": 4.752913752913753,
"grad_norm": 0.34585034308778323,
"learning_rate": 5.2998190699835485e-06,
"loss": 0.3257,
"num_tokens": 2668653455.0,
"step": 10195
},
{
"epoch": 4.755244755244755,
"grad_norm": 0.3657646741180601,
"learning_rate": 5.2942010925907074e-06,
"loss": 0.3309,
"num_tokens": 2669964175.0,
"step": 10200
},
{
"epoch": 4.757575757575758,
"grad_norm": 0.3300884821473932,
"learning_rate": 5.288635902345814e-06,
"loss": 0.3172,
"num_tokens": 2671274895.0,
"step": 10205
},
{
"epoch": 4.75990675990676,
"grad_norm": 0.3148848991416642,
"learning_rate": 5.283123512478321e-06,
"loss": 0.3097,
"num_tokens": 2672585615.0,
"step": 10210
},
{
"epoch": 4.7622377622377625,
"grad_norm": 0.3248022910063405,
"learning_rate": 5.2776639360921664e-06,
"loss": 0.3113,
"num_tokens": 2673881785.0,
"step": 10215
},
{
"epoch": 4.764568764568764,
"grad_norm": 0.32808387177263987,
"learning_rate": 5.272257186165733e-06,
"loss": 0.3208,
"num_tokens": 2675192505.0,
"step": 10220
},
{
"epoch": 4.766899766899767,
"grad_norm": 0.3269866573020375,
"learning_rate": 5.26690327555183e-06,
"loss": 0.3149,
"num_tokens": 2676503225.0,
"step": 10225
},
{
"epoch": 4.769230769230769,
"grad_norm": 0.332961069358366,
"learning_rate": 5.261602216977668e-06,
"loss": 0.3145,
"num_tokens": 2677813945.0,
"step": 10230
},
{
"epoch": 4.771561771561771,
"grad_norm": 0.33357108039723987,
"learning_rate": 5.256354023044799e-06,
"loss": 0.324,
"num_tokens": 2679124665.0,
"step": 10235
},
{
"epoch": 4.773892773892774,
"grad_norm": 0.3288868211446766,
"learning_rate": 5.251158706229117e-06,
"loss": 0.318,
"num_tokens": 2680435385.0,
"step": 10240
},
{
"epoch": 4.776223776223777,
"grad_norm": 0.3295445331080107,
"learning_rate": 5.246016278880824e-06,
"loss": 0.3233,
"num_tokens": 2681746105.0,
"step": 10245
},
{
"epoch": 4.778554778554779,
"grad_norm": 0.3456197462845134,
"learning_rate": 5.240926753224386e-06,
"loss": 0.3186,
"num_tokens": 2683056825.0,
"step": 10250
},
{
"epoch": 4.7808857808857805,
"grad_norm": 0.3204798461467708,
"learning_rate": 5.235890141358512e-06,
"loss": 0.3118,
"num_tokens": 2684367545.0,
"step": 10255
},
{
"epoch": 4.783216783216783,
"grad_norm": 0.330521485739032,
"learning_rate": 5.230906455256126e-06,
"loss": 0.319,
"num_tokens": 2685678265.0,
"step": 10260
},
{
"epoch": 4.785547785547785,
"grad_norm": 0.32924346560709944,
"learning_rate": 5.225975706764347e-06,
"loss": 0.3112,
"num_tokens": 2686988499.0,
"step": 10265
},
{
"epoch": 4.787878787878788,
"grad_norm": 0.3368638495877885,
"learning_rate": 5.221097907604436e-06,
"loss": 0.3194,
"num_tokens": 2688299219.0,
"step": 10270
},
{
"epoch": 4.79020979020979,
"grad_norm": 0.3263693614554864,
"learning_rate": 5.216273069371794e-06,
"loss": 0.3189,
"num_tokens": 2689609939.0,
"step": 10275
},
{
"epoch": 4.792540792540793,
"grad_norm": 0.334167060120909,
"learning_rate": 5.211501203535926e-06,
"loss": 0.316,
"num_tokens": 2690918769.0,
"step": 10280
},
{
"epoch": 4.794871794871795,
"grad_norm": 0.33700365497537554,
"learning_rate": 5.2067823214404076e-06,
"loss": 0.3136,
"num_tokens": 2692229489.0,
"step": 10285
},
{
"epoch": 4.7972027972027975,
"grad_norm": 0.3550316959254366,
"learning_rate": 5.2021164343028615e-06,
"loss": 0.3226,
"num_tokens": 2693540209.0,
"step": 10290
},
{
"epoch": 4.799533799533799,
"grad_norm": 0.32560252408951285,
"learning_rate": 5.1975035532149374e-06,
"loss": 0.3153,
"num_tokens": 2694850929.0,
"step": 10295
},
{
"epoch": 4.801864801864802,
"grad_norm": 0.33773170010312276,
"learning_rate": 5.192943689142276e-06,
"loss": 0.3197,
"num_tokens": 2696161649.0,
"step": 10300
},
{
"epoch": 4.804195804195804,
"grad_norm": 0.32245870964120255,
"learning_rate": 5.188436852924488e-06,
"loss": 0.3096,
"num_tokens": 2697472369.0,
"step": 10305
},
{
"epoch": 4.806526806526806,
"grad_norm": 0.3157741766261794,
"learning_rate": 5.183983055275129e-06,
"loss": 0.318,
"num_tokens": 2698783089.0,
"step": 10310
},
{
"epoch": 4.808857808857809,
"grad_norm": 0.3230046443910475,
"learning_rate": 5.17958230678167e-06,
"loss": 0.3038,
"num_tokens": 2700093809.0,
"step": 10315
},
{
"epoch": 4.811188811188811,
"grad_norm": 0.32531156650812953,
"learning_rate": 5.175234617905471e-06,
"loss": 0.3056,
"num_tokens": 2701404529.0,
"step": 10320
},
{
"epoch": 4.813519813519814,
"grad_norm": 0.3377022386596855,
"learning_rate": 5.170939998981775e-06,
"loss": 0.3138,
"num_tokens": 2702706568.0,
"step": 10325
},
{
"epoch": 4.8158508158508155,
"grad_norm": 0.3440675929974891,
"learning_rate": 5.16669846021965e-06,
"loss": 0.3208,
"num_tokens": 2704003885.0,
"step": 10330
},
{
"epoch": 4.818181818181818,
"grad_norm": 0.3293134125006892,
"learning_rate": 5.162510011701991e-06,
"loss": 0.313,
"num_tokens": 2705310038.0,
"step": 10335
},
{
"epoch": 4.82051282051282,
"grad_norm": 0.34589478246737115,
"learning_rate": 5.15837466338549e-06,
"loss": 0.3227,
"num_tokens": 2706605004.0,
"step": 10340
},
{
"epoch": 4.822843822843823,
"grad_norm": 0.3250195530849573,
"learning_rate": 5.15429242510061e-06,
"loss": 0.3095,
"num_tokens": 2707915724.0,
"step": 10345
},
{
"epoch": 4.825174825174825,
"grad_norm": 0.33813235349784826,
"learning_rate": 5.150263306551556e-06,
"loss": 0.3176,
"num_tokens": 2709226444.0,
"step": 10350
},
{
"epoch": 4.827505827505828,
"grad_norm": 0.3342791833320397,
"learning_rate": 5.146287317316262e-06,
"loss": 0.3177,
"num_tokens": 2710537164.0,
"step": 10355
},
{
"epoch": 4.82983682983683,
"grad_norm": 0.32303166071349837,
"learning_rate": 5.1423644668463695e-06,
"loss": 0.3127,
"num_tokens": 2711847884.0,
"step": 10360
},
{
"epoch": 4.8321678321678325,
"grad_norm": 0.34342920997262655,
"learning_rate": 5.138494764467189e-06,
"loss": 0.3207,
"num_tokens": 2713158604.0,
"step": 10365
},
{
"epoch": 4.834498834498834,
"grad_norm": 0.3370264711785083,
"learning_rate": 5.134678219377695e-06,
"loss": 0.3169,
"num_tokens": 2714469324.0,
"step": 10370
},
{
"epoch": 4.836829836829837,
"grad_norm": 0.33019492580209653,
"learning_rate": 5.1309148406505e-06,
"loss": 0.319,
"num_tokens": 2715780044.0,
"step": 10375
},
{
"epoch": 4.839160839160839,
"grad_norm": 0.3296889225888387,
"learning_rate": 5.127204637231821e-06,
"loss": 0.3096,
"num_tokens": 2717090764.0,
"step": 10380
},
{
"epoch": 4.841491841491841,
"grad_norm": 0.3170653884338263,
"learning_rate": 5.12354761794148e-06,
"loss": 0.3151,
"num_tokens": 2718401484.0,
"step": 10385
},
{
"epoch": 4.843822843822844,
"grad_norm": 0.3171594099332368,
"learning_rate": 5.1199437914728596e-06,
"loss": 0.3121,
"num_tokens": 2719712204.0,
"step": 10390
},
{
"epoch": 4.846153846153846,
"grad_norm": 0.333775671862216,
"learning_rate": 5.116393166392901e-06,
"loss": 0.3082,
"num_tokens": 2721022924.0,
"step": 10395
},
{
"epoch": 4.848484848484849,
"grad_norm": 0.34403397168216404,
"learning_rate": 5.112895751142073e-06,
"loss": 0.3231,
"num_tokens": 2722333644.0,
"step": 10400
},
{
"epoch": 4.8508158508158505,
"grad_norm": 0.31993219706477866,
"learning_rate": 5.109451554034357e-06,
"loss": 0.3184,
"num_tokens": 2723644364.0,
"step": 10405
},
{
"epoch": 4.853146853146853,
"grad_norm": 0.343409768713205,
"learning_rate": 5.1060605832572235e-06,
"loss": 0.3171,
"num_tokens": 2724955084.0,
"step": 10410
},
{
"epoch": 4.855477855477855,
"grad_norm": 0.32246016600388927,
"learning_rate": 5.102722846871616e-06,
"loss": 0.3084,
"num_tokens": 2726265804.0,
"step": 10415
},
{
"epoch": 4.857808857808858,
"grad_norm": 0.3418584388038729,
"learning_rate": 5.099438352811931e-06,
"loss": 0.3302,
"num_tokens": 2727576524.0,
"step": 10420
},
{
"epoch": 4.86013986013986,
"grad_norm": 0.3242160539074971,
"learning_rate": 5.0962071088859935e-06,
"loss": 0.3102,
"num_tokens": 2728887244.0,
"step": 10425
},
{
"epoch": 4.862470862470863,
"grad_norm": 0.3149813322054902,
"learning_rate": 5.093029122775049e-06,
"loss": 0.3071,
"num_tokens": 2730197964.0,
"step": 10430
},
{
"epoch": 4.864801864801865,
"grad_norm": 0.3334336110289972,
"learning_rate": 5.08990440203374e-06,
"loss": 0.3203,
"num_tokens": 2731508684.0,
"step": 10435
},
{
"epoch": 4.867132867132867,
"grad_norm": 0.32299813348426,
"learning_rate": 5.086832954090082e-06,
"loss": 0.313,
"num_tokens": 2732819404.0,
"step": 10440
},
{
"epoch": 4.869463869463869,
"grad_norm": 0.3364027385588916,
"learning_rate": 5.083814786245458e-06,
"loss": 0.3179,
"num_tokens": 2734130124.0,
"step": 10445
},
{
"epoch": 4.871794871794872,
"grad_norm": 0.3656168760454608,
"learning_rate": 5.080849905674588e-06,
"loss": 0.3201,
"num_tokens": 2735440844.0,
"step": 10450
},
{
"epoch": 4.874125874125874,
"grad_norm": 0.33683461461528713,
"learning_rate": 5.077938319425526e-06,
"loss": 0.3186,
"num_tokens": 2736751564.0,
"step": 10455
},
{
"epoch": 4.876456876456876,
"grad_norm": 0.3300837602076609,
"learning_rate": 5.075080034419631e-06,
"loss": 0.3262,
"num_tokens": 2738062284.0,
"step": 10460
},
{
"epoch": 4.878787878787879,
"grad_norm": 0.348047283341893,
"learning_rate": 5.072275057451558e-06,
"loss": 0.3164,
"num_tokens": 2739356820.0,
"step": 10465
},
{
"epoch": 4.881118881118881,
"grad_norm": 0.32910332567884476,
"learning_rate": 5.0695233951892345e-06,
"loss": 0.3107,
"num_tokens": 2740662089.0,
"step": 10470
},
{
"epoch": 4.883449883449884,
"grad_norm": 0.3200833744056691,
"learning_rate": 5.066825054173854e-06,
"loss": 0.3117,
"num_tokens": 2741972809.0,
"step": 10475
},
{
"epoch": 4.8857808857808855,
"grad_norm": 0.3341587779600143,
"learning_rate": 5.064180040819858e-06,
"loss": 0.3179,
"num_tokens": 2743283529.0,
"step": 10480
},
{
"epoch": 4.888111888111888,
"grad_norm": 0.33148410077095414,
"learning_rate": 5.0615883614149136e-06,
"loss": 0.3172,
"num_tokens": 2744594249.0,
"step": 10485
},
{
"epoch": 4.89044289044289,
"grad_norm": 0.3272484124935763,
"learning_rate": 5.059050022119904e-06,
"loss": 0.3165,
"num_tokens": 2745887369.0,
"step": 10490
},
{
"epoch": 4.892773892773893,
"grad_norm": 0.3209922201282976,
"learning_rate": 5.056565028968916e-06,
"loss": 0.3154,
"num_tokens": 2747198089.0,
"step": 10495
},
{
"epoch": 4.895104895104895,
"grad_norm": 0.3269414783389918,
"learning_rate": 5.05413338786922e-06,
"loss": 0.312,
"num_tokens": 2748508809.0,
"step": 10500
},
{
"epoch": 4.897435897435898,
"grad_norm": 0.31618238859555264,
"learning_rate": 5.051755104601264e-06,
"loss": 0.3143,
"num_tokens": 2749819529.0,
"step": 10505
},
{
"epoch": 4.8997668997669,
"grad_norm": 0.32153457363047355,
"learning_rate": 5.049430184818651e-06,
"loss": 0.3224,
"num_tokens": 2751117328.0,
"step": 10510
},
{
"epoch": 4.902097902097902,
"grad_norm": 0.31465978988545573,
"learning_rate": 5.047158634048129e-06,
"loss": 0.3177,
"num_tokens": 2752420296.0,
"step": 10515
},
{
"epoch": 4.9044289044289044,
"grad_norm": 0.31044680611687897,
"learning_rate": 5.044940457689581e-06,
"loss": 0.3105,
"num_tokens": 2753731016.0,
"step": 10520
},
{
"epoch": 4.906759906759907,
"grad_norm": 0.31811560483359963,
"learning_rate": 5.042775661016008e-06,
"loss": 0.3169,
"num_tokens": 2755041736.0,
"step": 10525
},
{
"epoch": 4.909090909090909,
"grad_norm": 0.31742684514452485,
"learning_rate": 5.040664249173518e-06,
"loss": 0.305,
"num_tokens": 2756352456.0,
"step": 10530
},
{
"epoch": 4.911421911421911,
"grad_norm": 0.33606166438372637,
"learning_rate": 5.038606227181312e-06,
"loss": 0.3182,
"num_tokens": 2757663176.0,
"step": 10535
},
{
"epoch": 4.913752913752914,
"grad_norm": 0.3205242654431987,
"learning_rate": 5.0366015999316775e-06,
"loss": 0.3147,
"num_tokens": 2758973413.0,
"step": 10540
},
{
"epoch": 4.916083916083916,
"grad_norm": 0.3427922994912874,
"learning_rate": 5.034650372189974e-06,
"loss": 0.3125,
"num_tokens": 2760284133.0,
"step": 10545
},
{
"epoch": 4.918414918414919,
"grad_norm": 0.3310997932717806,
"learning_rate": 5.0327525485946135e-06,
"loss": 0.3184,
"num_tokens": 2761592831.0,
"step": 10550
},
{
"epoch": 4.9207459207459205,
"grad_norm": 0.3393685987917944,
"learning_rate": 5.030908133657063e-06,
"loss": 0.3156,
"num_tokens": 2762895660.0,
"step": 10555
},
{
"epoch": 4.923076923076923,
"grad_norm": 0.3322488130315273,
"learning_rate": 5.029117131761826e-06,
"loss": 0.3213,
"num_tokens": 2764206380.0,
"step": 10560
},
{
"epoch": 4.925407925407925,
"grad_norm": 0.35642657501632585,
"learning_rate": 5.027379547166436e-06,
"loss": 0.3173,
"num_tokens": 2765517100.0,
"step": 10565
},
{
"epoch": 4.927738927738928,
"grad_norm": 0.33722387753318867,
"learning_rate": 5.025695384001438e-06,
"loss": 0.3297,
"num_tokens": 2766827820.0,
"step": 10570
},
{
"epoch": 4.93006993006993,
"grad_norm": 0.32350013184482923,
"learning_rate": 5.02406464627039e-06,
"loss": 0.3141,
"num_tokens": 2768138540.0,
"step": 10575
},
{
"epoch": 4.932400932400933,
"grad_norm": 0.31985329238680343,
"learning_rate": 5.0224873378498475e-06,
"loss": 0.3103,
"num_tokens": 2769449260.0,
"step": 10580
},
{
"epoch": 4.934731934731935,
"grad_norm": 0.33320437693468646,
"learning_rate": 5.0209634624893535e-06,
"loss": 0.316,
"num_tokens": 2770759980.0,
"step": 10585
},
{
"epoch": 4.937062937062937,
"grad_norm": 0.34199930181650334,
"learning_rate": 5.0194930238114344e-06,
"loss": 0.3165,
"num_tokens": 2772070700.0,
"step": 10590
},
{
"epoch": 4.9393939393939394,
"grad_norm": 0.33840903211030815,
"learning_rate": 5.01807602531158e-06,
"loss": 0.3279,
"num_tokens": 2773381420.0,
"step": 10595
},
{
"epoch": 4.941724941724941,
"grad_norm": 0.33970146942955454,
"learning_rate": 5.016712470358254e-06,
"loss": 0.3243,
"num_tokens": 2774692140.0,
"step": 10600
},
{
"epoch": 4.944055944055944,
"grad_norm": 0.32417078423146617,
"learning_rate": 5.015402362192865e-06,
"loss": 0.3095,
"num_tokens": 2776002860.0,
"step": 10605
},
{
"epoch": 4.946386946386946,
"grad_norm": 0.31477911467606806,
"learning_rate": 5.0141457039297765e-06,
"loss": 0.3152,
"num_tokens": 2777313580.0,
"step": 10610
},
{
"epoch": 4.948717948717949,
"grad_norm": 0.3382420127269151,
"learning_rate": 5.012942498556292e-06,
"loss": 0.3145,
"num_tokens": 2778624300.0,
"step": 10615
},
{
"epoch": 4.951048951048951,
"grad_norm": 0.31746762640712656,
"learning_rate": 5.011792748932641e-06,
"loss": 0.3067,
"num_tokens": 2779935020.0,
"step": 10620
},
{
"epoch": 4.953379953379954,
"grad_norm": 0.3281133514868718,
"learning_rate": 5.010696457791986e-06,
"loss": 0.3132,
"num_tokens": 2781245740.0,
"step": 10625
},
{
"epoch": 4.9557109557109555,
"grad_norm": 0.3239412806584807,
"learning_rate": 5.009653627740407e-06,
"loss": 0.3212,
"num_tokens": 2782556460.0,
"step": 10630
},
{
"epoch": 4.958041958041958,
"grad_norm": 0.3368010167927629,
"learning_rate": 5.008664261256898e-06,
"loss": 0.3145,
"num_tokens": 2783867180.0,
"step": 10635
},
{
"epoch": 4.96037296037296,
"grad_norm": 0.34491073030538105,
"learning_rate": 5.007728360693355e-06,
"loss": 0.3176,
"num_tokens": 2785177900.0,
"step": 10640
},
{
"epoch": 4.962703962703963,
"grad_norm": 0.3510601956916241,
"learning_rate": 5.006845928274586e-06,
"loss": 0.3187,
"num_tokens": 2786488620.0,
"step": 10645
},
{
"epoch": 4.965034965034965,
"grad_norm": 0.3337835159331755,
"learning_rate": 5.006016966098288e-06,
"loss": 0.314,
"num_tokens": 2787799340.0,
"step": 10650
},
{
"epoch": 4.967365967365968,
"grad_norm": 0.3172585463158374,
"learning_rate": 5.005241476135051e-06,
"loss": 0.317,
"num_tokens": 2789110060.0,
"step": 10655
},
{
"epoch": 4.96969696969697,
"grad_norm": 0.32635822297017036,
"learning_rate": 5.004519460228356e-06,
"loss": 0.3102,
"num_tokens": 2790420780.0,
"step": 10660
},
{
"epoch": 4.972027972027972,
"grad_norm": 0.34214637134938164,
"learning_rate": 5.003850920094564e-06,
"loss": 0.3127,
"num_tokens": 2791731500.0,
"step": 10665
},
{
"epoch": 4.9743589743589745,
"grad_norm": 0.31863256082976954,
"learning_rate": 5.00323585732291e-06,
"loss": 0.3037,
"num_tokens": 2793042220.0,
"step": 10670
},
{
"epoch": 4.976689976689976,
"grad_norm": 0.33066360911088155,
"learning_rate": 5.00267427337551e-06,
"loss": 0.3155,
"num_tokens": 2794352940.0,
"step": 10675
},
{
"epoch": 4.979020979020979,
"grad_norm": 0.32620437111284734,
"learning_rate": 5.002166169587351e-06,
"loss": 0.3171,
"num_tokens": 2795657814.0,
"step": 10680
},
{
"epoch": 4.981351981351981,
"grad_norm": 0.33430055011282034,
"learning_rate": 5.001711547166285e-06,
"loss": 0.3189,
"num_tokens": 2796968534.0,
"step": 10685
},
{
"epoch": 4.983682983682984,
"grad_norm": 0.3224437309450024,
"learning_rate": 5.001310407193031e-06,
"loss": 0.3223,
"num_tokens": 2798279254.0,
"step": 10690
},
{
"epoch": 4.986013986013986,
"grad_norm": 0.32410671370691807,
"learning_rate": 5.000962750621168e-06,
"loss": 0.3311,
"num_tokens": 2799589974.0,
"step": 10695
},
{
"epoch": 4.988344988344989,
"grad_norm": 0.3235106959855654,
"learning_rate": 5.0006685782771445e-06,
"loss": 0.3132,
"num_tokens": 2800887285.0,
"step": 10700
},
{
"epoch": 4.9906759906759905,
"grad_norm": 0.3305908727408153,
"learning_rate": 5.000427890860252e-06,
"loss": 0.3113,
"num_tokens": 2802198005.0,
"step": 10705
},
{
"epoch": 4.993006993006993,
"grad_norm": 0.33403825665973846,
"learning_rate": 5.000240688942652e-06,
"loss": 0.3186,
"num_tokens": 2803508725.0,
"step": 10710
},
{
"epoch": 4.995337995337995,
"grad_norm": 0.3278267015029189,
"learning_rate": 5.000106972969358e-06,
"loss": 0.3166,
"num_tokens": 2804819445.0,
"step": 10715
},
{
"epoch": 4.997668997668997,
"grad_norm": 0.3295223914621982,
"learning_rate": 5.000026743258234e-06,
"loss": 0.3119,
"num_tokens": 2806130165.0,
"step": 10720
},
{
"epoch": 5.0,
"grad_norm": 0.3242625515113468,
"learning_rate": 5e-06,
"loss": 0.3108,
"num_tokens": 2807440885.0,
"step": 10725
},
{
"epoch": 5.0,
"step": 10725,
"total_flos": 2444245755494400.0,
"train_loss": 0.42360519842668015,
"train_runtime": 82605.1286,
"train_samples_per_second": 2.077,
"train_steps_per_second": 0.13
}
],
"logging_steps": 5,
"max_steps": 10725,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2444245755494400.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}