{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9983102399459276,
"eval_steps": 500,
"global_step": 2217,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.013518080432578574,
"grad_norm": 3.0995258158495695,
"learning_rate": 5e-06,
"loss": 0.845,
"step": 10
},
{
"epoch": 0.027036160865157147,
"grad_norm": 2.4721955543947596,
"learning_rate": 5e-06,
"loss": 0.6931,
"step": 20
},
{
"epoch": 0.04055424129773572,
"grad_norm": 2.8908078365820544,
"learning_rate": 5e-06,
"loss": 0.6512,
"step": 30
},
{
"epoch": 0.054072321730314295,
"grad_norm": 2.128304353555308,
"learning_rate": 5e-06,
"loss": 0.6546,
"step": 40
},
{
"epoch": 0.06759040216289287,
"grad_norm": 1.8583110367716618,
"learning_rate": 5e-06,
"loss": 0.6376,
"step": 50
},
{
"epoch": 0.08110848259547145,
"grad_norm": 1.9572074992289452,
"learning_rate": 5e-06,
"loss": 0.6293,
"step": 60
},
{
"epoch": 0.09462656302805002,
"grad_norm": 2.1439310974732906,
"learning_rate": 5e-06,
"loss": 0.6303,
"step": 70
},
{
"epoch": 0.10814464346062859,
"grad_norm": 2.1027938324907276,
"learning_rate": 5e-06,
"loss": 0.6263,
"step": 80
},
{
"epoch": 0.12166272389320716,
"grad_norm": 2.0994621637558613,
"learning_rate": 5e-06,
"loss": 0.6197,
"step": 90
},
{
"epoch": 0.13518080432578575,
"grad_norm": 2.3558311705586434,
"learning_rate": 5e-06,
"loss": 0.6137,
"step": 100
},
{
"epoch": 0.14869888475836432,
"grad_norm": 3.6078661884191936,
"learning_rate": 5e-06,
"loss": 0.6143,
"step": 110
},
{
"epoch": 0.1622169651909429,
"grad_norm": 2.0098953064632252,
"learning_rate": 5e-06,
"loss": 0.6119,
"step": 120
},
{
"epoch": 0.17573504562352146,
"grad_norm": 1.7892961602329431,
"learning_rate": 5e-06,
"loss": 0.6082,
"step": 130
},
{
"epoch": 0.18925312605610004,
"grad_norm": 2.495233222827848,
"learning_rate": 5e-06,
"loss": 0.6105,
"step": 140
},
{
"epoch": 0.2027712064886786,
"grad_norm": 2.73669002831843,
"learning_rate": 5e-06,
"loss": 0.6005,
"step": 150
},
{
"epoch": 0.21628928692125718,
"grad_norm": 2.25755467972834,
"learning_rate": 5e-06,
"loss": 0.6041,
"step": 160
},
{
"epoch": 0.22980736735383575,
"grad_norm": 2.2899488859758317,
"learning_rate": 5e-06,
"loss": 0.6042,
"step": 170
},
{
"epoch": 0.24332544778641432,
"grad_norm": 1.9924897200956881,
"learning_rate": 5e-06,
"loss": 0.5942,
"step": 180
},
{
"epoch": 0.2568435282189929,
"grad_norm": 2.0832289534715596,
"learning_rate": 5e-06,
"loss": 0.6008,
"step": 190
},
{
"epoch": 0.2703616086515715,
"grad_norm": 1.9923009880039881,
"learning_rate": 5e-06,
"loss": 0.6039,
"step": 200
},
{
"epoch": 0.28387968908415007,
"grad_norm": 1.6840960209729652,
"learning_rate": 5e-06,
"loss": 0.6056,
"step": 210
},
{
"epoch": 0.29739776951672864,
"grad_norm": 1.644653139908748,
"learning_rate": 5e-06,
"loss": 0.5976,
"step": 220
},
{
"epoch": 0.3109158499493072,
"grad_norm": 1.815796136594004,
"learning_rate": 5e-06,
"loss": 0.5983,
"step": 230
},
{
"epoch": 0.3244339303818858,
"grad_norm": 2.0476220349655736,
"learning_rate": 5e-06,
"loss": 0.5985,
"step": 240
},
{
"epoch": 0.33795201081446435,
"grad_norm": 1.826188487172147,
"learning_rate": 5e-06,
"loss": 0.5991,
"step": 250
},
{
"epoch": 0.3514700912470429,
"grad_norm": 1.8779821757544928,
"learning_rate": 5e-06,
"loss": 0.6011,
"step": 260
},
{
"epoch": 0.3649881716796215,
"grad_norm": 1.546297402705703,
"learning_rate": 5e-06,
"loss": 0.5991,
"step": 270
},
{
"epoch": 0.37850625211220007,
"grad_norm": 1.9100992643242412,
"learning_rate": 5e-06,
"loss": 0.6016,
"step": 280
},
{
"epoch": 0.39202433254477864,
"grad_norm": 1.633188543050389,
"learning_rate": 5e-06,
"loss": 0.5931,
"step": 290
},
{
"epoch": 0.4055424129773572,
"grad_norm": 1.7612843300351624,
"learning_rate": 5e-06,
"loss": 0.5912,
"step": 300
},
{
"epoch": 0.4190604934099358,
"grad_norm": 1.5609583873755621,
"learning_rate": 5e-06,
"loss": 0.5983,
"step": 310
},
{
"epoch": 0.43257857384251436,
"grad_norm": 1.6780024100547228,
"learning_rate": 5e-06,
"loss": 0.5899,
"step": 320
},
{
"epoch": 0.44609665427509293,
"grad_norm": 2.142609724069825,
"learning_rate": 5e-06,
"loss": 0.5905,
"step": 330
},
{
"epoch": 0.4596147347076715,
"grad_norm": 1.7302040609097082,
"learning_rate": 5e-06,
"loss": 0.5977,
"step": 340
},
{
"epoch": 0.4731328151402501,
"grad_norm": 1.6276169982986228,
"learning_rate": 5e-06,
"loss": 0.5883,
"step": 350
},
{
"epoch": 0.48665089557282865,
"grad_norm": 1.6710934957701402,
"learning_rate": 5e-06,
"loss": 0.5974,
"step": 360
},
{
"epoch": 0.5001689760054072,
"grad_norm": 1.767516047841104,
"learning_rate": 5e-06,
"loss": 0.5912,
"step": 370
},
{
"epoch": 0.5136870564379858,
"grad_norm": 1.6695942153323693,
"learning_rate": 5e-06,
"loss": 0.5891,
"step": 380
},
{
"epoch": 0.5272051368705644,
"grad_norm": 1.7527726211773795,
"learning_rate": 5e-06,
"loss": 0.5908,
"step": 390
},
{
"epoch": 0.540723217303143,
"grad_norm": 1.651084341663377,
"learning_rate": 5e-06,
"loss": 0.5858,
"step": 400
},
{
"epoch": 0.5542412977357215,
"grad_norm": 1.7390619949343342,
"learning_rate": 5e-06,
"loss": 0.595,
"step": 410
},
{
"epoch": 0.5677593781683001,
"grad_norm": 1.4988408072021968,
"learning_rate": 5e-06,
"loss": 0.5846,
"step": 420
},
{
"epoch": 0.5812774586008786,
"grad_norm": 1.573068476680162,
"learning_rate": 5e-06,
"loss": 0.5814,
"step": 430
},
{
"epoch": 0.5947955390334573,
"grad_norm": 1.6120372463584451,
"learning_rate": 5e-06,
"loss": 0.5899,
"step": 440
},
{
"epoch": 0.6083136194660358,
"grad_norm": 1.6274467183788262,
"learning_rate": 5e-06,
"loss": 0.59,
"step": 450
},
{
"epoch": 0.6218316998986144,
"grad_norm": 1.7252328462432982,
"learning_rate": 5e-06,
"loss": 0.5854,
"step": 460
},
{
"epoch": 0.6353497803311929,
"grad_norm": 1.737699797028079,
"learning_rate": 5e-06,
"loss": 0.5806,
"step": 470
},
{
"epoch": 0.6488678607637716,
"grad_norm": 1.5370682500918078,
"learning_rate": 5e-06,
"loss": 0.5881,
"step": 480
},
{
"epoch": 0.6623859411963501,
"grad_norm": 1.7740843576068728,
"learning_rate": 5e-06,
"loss": 0.5806,
"step": 490
},
{
"epoch": 0.6759040216289287,
"grad_norm": 1.5652522580101986,
"learning_rate": 5e-06,
"loss": 0.572,
"step": 500
},
{
"epoch": 0.6894221020615072,
"grad_norm": 1.5568021887696903,
"learning_rate": 5e-06,
"loss": 0.58,
"step": 510
},
{
"epoch": 0.7029401824940859,
"grad_norm": 1.5175784957924134,
"learning_rate": 5e-06,
"loss": 0.579,
"step": 520
},
{
"epoch": 0.7164582629266644,
"grad_norm": 1.4295771272533815,
"learning_rate": 5e-06,
"loss": 0.5871,
"step": 530
},
{
"epoch": 0.729976343359243,
"grad_norm": 1.7204826587382944,
"learning_rate": 5e-06,
"loss": 0.5817,
"step": 540
},
{
"epoch": 0.7434944237918215,
"grad_norm": 1.577838279477792,
"learning_rate": 5e-06,
"loss": 0.5779,
"step": 550
},
{
"epoch": 0.7570125042244001,
"grad_norm": 1.6528962259307287,
"learning_rate": 5e-06,
"loss": 0.5812,
"step": 560
},
{
"epoch": 0.7705305846569787,
"grad_norm": 1.6042870663862332,
"learning_rate": 5e-06,
"loss": 0.5786,
"step": 570
},
{
"epoch": 0.7840486650895573,
"grad_norm": 1.709986377305198,
"learning_rate": 5e-06,
"loss": 0.5793,
"step": 580
},
{
"epoch": 0.7975667455221359,
"grad_norm": 1.7924690518428852,
"learning_rate": 5e-06,
"loss": 0.5803,
"step": 590
},
{
"epoch": 0.8110848259547144,
"grad_norm": 1.5055046913979044,
"learning_rate": 5e-06,
"loss": 0.5768,
"step": 600
},
{
"epoch": 0.824602906387293,
"grad_norm": 1.4964520058827533,
"learning_rate": 5e-06,
"loss": 0.5759,
"step": 610
},
{
"epoch": 0.8381209868198716,
"grad_norm": 1.6720625396672169,
"learning_rate": 5e-06,
"loss": 0.5754,
"step": 620
},
{
"epoch": 0.8516390672524502,
"grad_norm": 1.5463769829403606,
"learning_rate": 5e-06,
"loss": 0.5786,
"step": 630
},
{
"epoch": 0.8651571476850287,
"grad_norm": 1.4669224733461368,
"learning_rate": 5e-06,
"loss": 0.5708,
"step": 640
},
{
"epoch": 0.8786752281176073,
"grad_norm": 1.520619999962285,
"learning_rate": 5e-06,
"loss": 0.574,
"step": 650
},
{
"epoch": 0.8921933085501859,
"grad_norm": 1.5914892508820981,
"learning_rate": 5e-06,
"loss": 0.5783,
"step": 660
},
{
"epoch": 0.9057113889827645,
"grad_norm": 1.8577143073111821,
"learning_rate": 5e-06,
"loss": 0.5829,
"step": 670
},
{
"epoch": 0.919229469415343,
"grad_norm": 1.5962074409528684,
"learning_rate": 5e-06,
"loss": 0.5838,
"step": 680
},
{
"epoch": 0.9327475498479216,
"grad_norm": 1.446689597838186,
"learning_rate": 5e-06,
"loss": 0.5756,
"step": 690
},
{
"epoch": 0.9462656302805001,
"grad_norm": 1.510508928886103,
"learning_rate": 5e-06,
"loss": 0.5738,
"step": 700
},
{
"epoch": 0.9597837107130788,
"grad_norm": 1.4607211586070719,
"learning_rate": 5e-06,
"loss": 0.5757,
"step": 710
},
{
"epoch": 0.9733017911456573,
"grad_norm": 1.4221599670931588,
"learning_rate": 5e-06,
"loss": 0.5735,
"step": 720
},
{
"epoch": 0.9868198715782359,
"grad_norm": 1.484579352264901,
"learning_rate": 5e-06,
"loss": 0.5772,
"step": 730
},
{
"epoch": 0.9989861439675566,
"eval_loss": 0.1434517651796341,
"eval_runtime": 379.3166,
"eval_samples_per_second": 26.271,
"eval_steps_per_second": 0.411,
"step": 739
},
{
"epoch": 1.0010138560324433,
"grad_norm": 2.7421566702088738,
"learning_rate": 5e-06,
"loss": 0.5708,
"step": 740
},
{
"epoch": 1.014531936465022,
"grad_norm": 1.8529103798230466,
"learning_rate": 5e-06,
"loss": 0.4629,
"step": 750
},
{
"epoch": 1.0280500168976006,
"grad_norm": 1.67580460086375,
"learning_rate": 5e-06,
"loss": 0.46,
"step": 760
},
{
"epoch": 1.041568097330179,
"grad_norm": 1.640470300013924,
"learning_rate": 5e-06,
"loss": 0.4562,
"step": 770
},
{
"epoch": 1.0550861777627576,
"grad_norm": 1.6699136465078195,
"learning_rate": 5e-06,
"loss": 0.4536,
"step": 780
},
{
"epoch": 1.0686042581953363,
"grad_norm": 1.7807842688633373,
"learning_rate": 5e-06,
"loss": 0.4662,
"step": 790
},
{
"epoch": 1.0821223386279148,
"grad_norm": 1.6145188030618127,
"learning_rate": 5e-06,
"loss": 0.4587,
"step": 800
},
{
"epoch": 1.0956404190604934,
"grad_norm": 1.6719771276304467,
"learning_rate": 5e-06,
"loss": 0.4581,
"step": 810
},
{
"epoch": 1.1091584994930719,
"grad_norm": 1.479688368315656,
"learning_rate": 5e-06,
"loss": 0.456,
"step": 820
},
{
"epoch": 1.1226765799256506,
"grad_norm": 1.5743910150201328,
"learning_rate": 5e-06,
"loss": 0.4604,
"step": 830
},
{
"epoch": 1.1361946603582291,
"grad_norm": 1.5023364194883289,
"learning_rate": 5e-06,
"loss": 0.4652,
"step": 840
},
{
"epoch": 1.1497127407908077,
"grad_norm": 1.7167217243377628,
"learning_rate": 5e-06,
"loss": 0.472,
"step": 850
},
{
"epoch": 1.1632308212233862,
"grad_norm": 1.5221498447986286,
"learning_rate": 5e-06,
"loss": 0.4665,
"step": 860
},
{
"epoch": 1.176748901655965,
"grad_norm": 1.7809772273502287,
"learning_rate": 5e-06,
"loss": 0.4656,
"step": 870
},
{
"epoch": 1.1902669820885434,
"grad_norm": 1.7250751373229314,
"learning_rate": 5e-06,
"loss": 0.4581,
"step": 880
},
{
"epoch": 1.203785062521122,
"grad_norm": 1.861343520779267,
"learning_rate": 5e-06,
"loss": 0.4636,
"step": 890
},
{
"epoch": 1.2173031429537007,
"grad_norm": 2.0625018676728715,
"learning_rate": 5e-06,
"loss": 0.4659,
"step": 900
},
{
"epoch": 1.2308212233862792,
"grad_norm": 1.551721464148321,
"learning_rate": 5e-06,
"loss": 0.4629,
"step": 910
},
{
"epoch": 1.2443393038188577,
"grad_norm": 1.7884538726855719,
"learning_rate": 5e-06,
"loss": 0.4707,
"step": 920
},
{
"epoch": 1.2578573842514362,
"grad_norm": 1.6748195476683634,
"learning_rate": 5e-06,
"loss": 0.4711,
"step": 930
},
{
"epoch": 1.2713754646840147,
"grad_norm": 1.7623689240351639,
"learning_rate": 5e-06,
"loss": 0.4649,
"step": 940
},
{
"epoch": 1.2848935451165935,
"grad_norm": 1.7011947665040341,
"learning_rate": 5e-06,
"loss": 0.4634,
"step": 950
},
{
"epoch": 1.298411625549172,
"grad_norm": 1.5930792336293036,
"learning_rate": 5e-06,
"loss": 0.4646,
"step": 960
},
{
"epoch": 1.3119297059817505,
"grad_norm": 1.727734986402252,
"learning_rate": 5e-06,
"loss": 0.4593,
"step": 970
},
{
"epoch": 1.3254477864143293,
"grad_norm": 1.5431155855105338,
"learning_rate": 5e-06,
"loss": 0.4686,
"step": 980
},
{
"epoch": 1.3389658668469078,
"grad_norm": 1.701221393426559,
"learning_rate": 5e-06,
"loss": 0.4647,
"step": 990
},
{
"epoch": 1.3524839472794863,
"grad_norm": 1.5946440906826853,
"learning_rate": 5e-06,
"loss": 0.474,
"step": 1000
},
{
"epoch": 1.3660020277120648,
"grad_norm": 1.570462820909487,
"learning_rate": 5e-06,
"loss": 0.4715,
"step": 1010
},
{
"epoch": 1.3795201081446435,
"grad_norm": 1.5515085427705777,
"learning_rate": 5e-06,
"loss": 0.4658,
"step": 1020
},
{
"epoch": 1.393038188577222,
"grad_norm": 1.612110437850543,
"learning_rate": 5e-06,
"loss": 0.475,
"step": 1030
},
{
"epoch": 1.4065562690098006,
"grad_norm": 1.48793819280309,
"learning_rate": 5e-06,
"loss": 0.4711,
"step": 1040
},
{
"epoch": 1.420074349442379,
"grad_norm": 1.5900270531941814,
"learning_rate": 5e-06,
"loss": 0.466,
"step": 1050
},
{
"epoch": 1.4335924298749578,
"grad_norm": 1.625411671224631,
"learning_rate": 5e-06,
"loss": 0.4655,
"step": 1060
},
{
"epoch": 1.4471105103075363,
"grad_norm": 1.7287593338346183,
"learning_rate": 5e-06,
"loss": 0.4718,
"step": 1070
},
{
"epoch": 1.4606285907401149,
"grad_norm": 1.5782208709005707,
"learning_rate": 5e-06,
"loss": 0.4711,
"step": 1080
},
{
"epoch": 1.4741466711726936,
"grad_norm": 1.7240525193349314,
"learning_rate": 5e-06,
"loss": 0.4683,
"step": 1090
},
{
"epoch": 1.4876647516052721,
"grad_norm": 1.7400342420780646,
"learning_rate": 5e-06,
"loss": 0.4695,
"step": 1100
},
{
"epoch": 1.5011828320378506,
"grad_norm": 1.5363707121370902,
"learning_rate": 5e-06,
"loss": 0.4724,
"step": 1110
},
{
"epoch": 1.5147009124704292,
"grad_norm": 1.7266066662849726,
"learning_rate": 5e-06,
"loss": 0.4709,
"step": 1120
},
{
"epoch": 1.5282189929030077,
"grad_norm": 1.7190911907364863,
"learning_rate": 5e-06,
"loss": 0.4645,
"step": 1130
},
{
"epoch": 1.5417370733355864,
"grad_norm": 1.606217517626092,
"learning_rate": 5e-06,
"loss": 0.4702,
"step": 1140
},
{
"epoch": 1.555255153768165,
"grad_norm": 1.766144995012523,
"learning_rate": 5e-06,
"loss": 0.4739,
"step": 1150
},
{
"epoch": 1.5687732342007434,
"grad_norm": 1.5926253751713118,
"learning_rate": 5e-06,
"loss": 0.4672,
"step": 1160
},
{
"epoch": 1.5822913146333222,
"grad_norm": 1.709203191355986,
"learning_rate": 5e-06,
"loss": 0.4731,
"step": 1170
},
{
"epoch": 1.5958093950659007,
"grad_norm": 1.5574946245332464,
"learning_rate": 5e-06,
"loss": 0.4703,
"step": 1180
},
{
"epoch": 1.6093274754984792,
"grad_norm": 1.584669174527815,
"learning_rate": 5e-06,
"loss": 0.4705,
"step": 1190
},
{
"epoch": 1.622845555931058,
"grad_norm": 1.5042749804205873,
"learning_rate": 5e-06,
"loss": 0.47,
"step": 1200
},
{
"epoch": 1.6363636363636362,
"grad_norm": 1.6609143139694778,
"learning_rate": 5e-06,
"loss": 0.4677,
"step": 1210
},
{
"epoch": 1.649881716796215,
"grad_norm": 1.7891835221599115,
"learning_rate": 5e-06,
"loss": 0.4719,
"step": 1220
},
{
"epoch": 1.6633997972287935,
"grad_norm": 1.5476583418714311,
"learning_rate": 5e-06,
"loss": 0.4721,
"step": 1230
},
{
"epoch": 1.676917877661372,
"grad_norm": 1.6549785438500684,
"learning_rate": 5e-06,
"loss": 0.4751,
"step": 1240
},
{
"epoch": 1.6904359580939508,
"grad_norm": 1.6039460840462256,
"learning_rate": 5e-06,
"loss": 0.4845,
"step": 1250
},
{
"epoch": 1.7039540385265293,
"grad_norm": 1.6896161846830133,
"learning_rate": 5e-06,
"loss": 0.4729,
"step": 1260
},
{
"epoch": 1.7174721189591078,
"grad_norm": 1.5618414410556232,
"learning_rate": 5e-06,
"loss": 0.4776,
"step": 1270
},
{
"epoch": 1.7309901993916865,
"grad_norm": 1.5952861695756622,
"learning_rate": 5e-06,
"loss": 0.4779,
"step": 1280
},
{
"epoch": 1.7445082798242648,
"grad_norm": 1.699598571695824,
"learning_rate": 5e-06,
"loss": 0.4751,
"step": 1290
},
{
"epoch": 1.7580263602568436,
"grad_norm": 1.809834320799901,
"learning_rate": 5e-06,
"loss": 0.4739,
"step": 1300
},
{
"epoch": 1.771544440689422,
"grad_norm": 1.6219739662720212,
"learning_rate": 5e-06,
"loss": 0.473,
"step": 1310
},
{
"epoch": 1.7850625211220006,
"grad_norm": 1.6762754772871185,
"learning_rate": 5e-06,
"loss": 0.4721,
"step": 1320
},
{
"epoch": 1.7985806015545793,
"grad_norm": 1.6645833857767827,
"learning_rate": 5e-06,
"loss": 0.4755,
"step": 1330
},
{
"epoch": 1.8120986819871578,
"grad_norm": 1.5125797777292427,
"learning_rate": 5e-06,
"loss": 0.4721,
"step": 1340
},
{
"epoch": 1.8256167624197364,
"grad_norm": 1.4872727963351564,
"learning_rate": 5e-06,
"loss": 0.477,
"step": 1350
},
{
"epoch": 1.839134842852315,
"grad_norm": 1.5871439872439899,
"learning_rate": 5e-06,
"loss": 0.4765,
"step": 1360
},
{
"epoch": 1.8526529232848934,
"grad_norm": 1.592006128597771,
"learning_rate": 5e-06,
"loss": 0.4746,
"step": 1370
},
{
"epoch": 1.8661710037174721,
"grad_norm": 1.598997771987625,
"learning_rate": 5e-06,
"loss": 0.4781,
"step": 1380
},
{
"epoch": 1.8796890841500506,
"grad_norm": 1.5928725680036628,
"learning_rate": 5e-06,
"loss": 0.4803,
"step": 1390
},
{
"epoch": 1.8932071645826292,
"grad_norm": 1.623893547015882,
"learning_rate": 5e-06,
"loss": 0.4752,
"step": 1400
},
{
"epoch": 1.906725245015208,
"grad_norm": 1.7940114937618843,
"learning_rate": 5e-06,
"loss": 0.4836,
"step": 1410
},
{
"epoch": 1.9202433254477864,
"grad_norm": 1.5205185910384997,
"learning_rate": 5e-06,
"loss": 0.4809,
"step": 1420
},
{
"epoch": 1.933761405880365,
"grad_norm": 1.6080341693065385,
"learning_rate": 5e-06,
"loss": 0.481,
"step": 1430
},
{
"epoch": 1.9472794863129437,
"grad_norm": 1.5206917589511617,
"learning_rate": 5e-06,
"loss": 0.4711,
"step": 1440
},
{
"epoch": 1.9607975667455222,
"grad_norm": 1.6175809292420489,
"learning_rate": 5e-06,
"loss": 0.4759,
"step": 1450
},
{
"epoch": 1.9743156471781007,
"grad_norm": 1.5775797663419633,
"learning_rate": 5e-06,
"loss": 0.4699,
"step": 1460
},
{
"epoch": 1.9878337276106794,
"grad_norm": 1.6961433441398912,
"learning_rate": 5e-06,
"loss": 0.4737,
"step": 1470
},
{
"epoch": 1.9986481919567423,
"eval_loss": 0.14539429545402527,
"eval_runtime": 380.8674,
"eval_samples_per_second": 26.164,
"eval_steps_per_second": 0.41,
"step": 1478
},
{
"epoch": 2.0020277120648866,
"grad_norm": 3.8413511660769193,
"learning_rate": 5e-06,
"loss": 0.4622,
"step": 1480
},
{
"epoch": 2.0155457924974653,
"grad_norm": 2.289274121046122,
"learning_rate": 5e-06,
"loss": 0.3511,
"step": 1490
},
{
"epoch": 2.029063872930044,
"grad_norm": 1.9108409569758165,
"learning_rate": 5e-06,
"loss": 0.3338,
"step": 1500
},
{
"epoch": 2.0425819533626224,
"grad_norm": 1.7936328744279362,
"learning_rate": 5e-06,
"loss": 0.3366,
"step": 1510
},
{
"epoch": 2.056100033795201,
"grad_norm": 1.9158569170297433,
"learning_rate": 5e-06,
"loss": 0.3439,
"step": 1520
},
{
"epoch": 2.06961811422778,
"grad_norm": 1.791129292519035,
"learning_rate": 5e-06,
"loss": 0.3392,
"step": 1530
},
{
"epoch": 2.083136194660358,
"grad_norm": 2.0098239614151026,
"learning_rate": 5e-06,
"loss": 0.3392,
"step": 1540
},
{
"epoch": 2.096654275092937,
"grad_norm": 1.7766198397906945,
"learning_rate": 5e-06,
"loss": 0.3389,
"step": 1550
},
{
"epoch": 2.110172355525515,
"grad_norm": 2.08318641104621,
"learning_rate": 5e-06,
"loss": 0.3431,
"step": 1560
},
{
"epoch": 2.123690435958094,
"grad_norm": 2.2228561770595667,
"learning_rate": 5e-06,
"loss": 0.3395,
"step": 1570
},
{
"epoch": 2.1372085163906727,
"grad_norm": 1.9214094936994222,
"learning_rate": 5e-06,
"loss": 0.3366,
"step": 1580
},
{
"epoch": 2.150726596823251,
"grad_norm": 1.8535859994144672,
"learning_rate": 5e-06,
"loss": 0.3451,
"step": 1590
},
{
"epoch": 2.1642446772558297,
"grad_norm": 1.8759119457837454,
"learning_rate": 5e-06,
"loss": 0.3469,
"step": 1600
},
{
"epoch": 2.1777627576884084,
"grad_norm": 1.9517825046828854,
"learning_rate": 5e-06,
"loss": 0.3455,
"step": 1610
},
{
"epoch": 2.1912808381209867,
"grad_norm": 1.951414687627,
"learning_rate": 5e-06,
"loss": 0.3433,
"step": 1620
},
{
"epoch": 2.2047989185535655,
"grad_norm": 1.8390753103711273,
"learning_rate": 5e-06,
"loss": 0.3394,
"step": 1630
},
{
"epoch": 2.2183169989861438,
"grad_norm": 1.852336735941585,
"learning_rate": 5e-06,
"loss": 0.3482,
"step": 1640
},
{
"epoch": 2.2318350794187225,
"grad_norm": 2.104974336142616,
"learning_rate": 5e-06,
"loss": 0.3546,
"step": 1650
},
{
"epoch": 2.2453531598513012,
"grad_norm": 1.9022953700727612,
"learning_rate": 5e-06,
"loss": 0.344,
"step": 1660
},
{
"epoch": 2.2588712402838795,
"grad_norm": 1.912675084485768,
"learning_rate": 5e-06,
"loss": 0.3491,
"step": 1670
},
{
"epoch": 2.2723893207164583,
"grad_norm": 1.766645490672379,
"learning_rate": 5e-06,
"loss": 0.3531,
"step": 1680
},
{
"epoch": 2.285907401149037,
"grad_norm": 1.864712721003459,
"learning_rate": 5e-06,
"loss": 0.346,
"step": 1690
},
{
"epoch": 2.2994254815816153,
"grad_norm": 2.007832977417507,
"learning_rate": 5e-06,
"loss": 0.3478,
"step": 1700
},
{
"epoch": 2.312943562014194,
"grad_norm": 1.948790808762209,
"learning_rate": 5e-06,
"loss": 0.3485,
"step": 1710
},
{
"epoch": 2.3264616424467723,
"grad_norm": 2.0217128757867293,
"learning_rate": 5e-06,
"loss": 0.3517,
"step": 1720
},
{
"epoch": 2.339979722879351,
"grad_norm": 2.144727796810822,
"learning_rate": 5e-06,
"loss": 0.3555,
"step": 1730
},
{
"epoch": 2.35349780331193,
"grad_norm": 1.8087253456008405,
"learning_rate": 5e-06,
"loss": 0.3559,
"step": 1740
},
{
"epoch": 2.367015883744508,
"grad_norm": 1.838290701763884,
"learning_rate": 5e-06,
"loss": 0.3546,
"step": 1750
},
{
"epoch": 2.380533964177087,
"grad_norm": 1.775768613839116,
"learning_rate": 5e-06,
"loss": 0.3551,
"step": 1760
},
{
"epoch": 2.3940520446096656,
"grad_norm": 1.912762130191922,
"learning_rate": 5e-06,
"loss": 0.3547,
"step": 1770
},
{
"epoch": 2.407570125042244,
"grad_norm": 1.7693808777695688,
"learning_rate": 5e-06,
"loss": 0.3554,
"step": 1780
},
{
"epoch": 2.4210882054748226,
"grad_norm": 2.062101118297791,
"learning_rate": 5e-06,
"loss": 0.3539,
"step": 1790
},
{
"epoch": 2.4346062859074014,
"grad_norm": 1.7936240593829114,
"learning_rate": 5e-06,
"loss": 0.3565,
"step": 1800
},
{
"epoch": 2.4481243663399797,
"grad_norm": 1.8990616257107005,
"learning_rate": 5e-06,
"loss": 0.3506,
"step": 1810
},
{
"epoch": 2.4616424467725584,
"grad_norm": 1.8897003912341879,
"learning_rate": 5e-06,
"loss": 0.3539,
"step": 1820
},
{
"epoch": 2.4751605272051367,
"grad_norm": 1.9638380799816073,
"learning_rate": 5e-06,
"loss": 0.3531,
"step": 1830
},
{
"epoch": 2.4886786076377154,
"grad_norm": 1.7974113469484045,
"learning_rate": 5e-06,
"loss": 0.354,
"step": 1840
},
{
"epoch": 2.502196688070294,
"grad_norm": 1.825033766196877,
"learning_rate": 5e-06,
"loss": 0.3575,
"step": 1850
},
{
"epoch": 2.5157147685028725,
"grad_norm": 1.8251675734409782,
"learning_rate": 5e-06,
"loss": 0.3518,
"step": 1860
},
{
"epoch": 2.529232848935451,
"grad_norm": 2.0787239988149397,
"learning_rate": 5e-06,
"loss": 0.3633,
"step": 1870
},
{
"epoch": 2.5427509293680295,
"grad_norm": 1.7869028110036567,
"learning_rate": 5e-06,
"loss": 0.3576,
"step": 1880
},
{
"epoch": 2.5562690098006082,
"grad_norm": 1.7932422035286009,
"learning_rate": 5e-06,
"loss": 0.3589,
"step": 1890
},
{
"epoch": 2.569787090233187,
"grad_norm": 1.8125739600297648,
"learning_rate": 5e-06,
"loss": 0.3571,
"step": 1900
},
{
"epoch": 2.5833051706657653,
"grad_norm": 1.8667683506155952,
"learning_rate": 5e-06,
"loss": 0.3612,
"step": 1910
},
{
"epoch": 2.596823251098344,
"grad_norm": 1.8639012162902293,
"learning_rate": 5e-06,
"loss": 0.3598,
"step": 1920
},
{
"epoch": 2.6103413315309227,
"grad_norm": 2.0202670438057924,
"learning_rate": 5e-06,
"loss": 0.3612,
"step": 1930
},
{
"epoch": 2.623859411963501,
"grad_norm": 1.8193852600474405,
"learning_rate": 5e-06,
"loss": 0.3604,
"step": 1940
},
{
"epoch": 2.6373774923960798,
"grad_norm": 1.8662033138623173,
"learning_rate": 5e-06,
"loss": 0.3646,
"step": 1950
},
{
"epoch": 2.6508955728286585,
"grad_norm": 1.9796354345767144,
"learning_rate": 5e-06,
"loss": 0.3633,
"step": 1960
},
{
"epoch": 2.664413653261237,
"grad_norm": 1.9455704972640486,
"learning_rate": 5e-06,
"loss": 0.3622,
"step": 1970
},
{
"epoch": 2.6779317336938155,
"grad_norm": 1.6977674417388293,
"learning_rate": 5e-06,
"loss": 0.3661,
"step": 1980
},
{
"epoch": 2.6914498141263943,
"grad_norm": 1.7297060050258417,
"learning_rate": 5e-06,
"loss": 0.353,
"step": 1990
},
{
"epoch": 2.7049678945589726,
"grad_norm": 1.982039686598582,
"learning_rate": 5e-06,
"loss": 0.3616,
"step": 2000
},
{
"epoch": 2.7184859749915513,
"grad_norm": 1.9841967443753195,
"learning_rate": 5e-06,
"loss": 0.3626,
"step": 2010
},
{
"epoch": 2.7320040554241296,
"grad_norm": 1.9857692866519607,
"learning_rate": 5e-06,
"loss": 0.3634,
"step": 2020
},
{
"epoch": 2.7455221358567083,
"grad_norm": 1.9757117083253184,
"learning_rate": 5e-06,
"loss": 0.3697,
"step": 2030
},
{
"epoch": 2.759040216289287,
"grad_norm": 1.8782922508939865,
"learning_rate": 5e-06,
"loss": 0.3652,
"step": 2040
},
{
"epoch": 2.7725582967218654,
"grad_norm": 1.9035541906463247,
"learning_rate": 5e-06,
"loss": 0.3599,
"step": 2050
},
{
"epoch": 2.786076377154444,
"grad_norm": 1.951655639577415,
"learning_rate": 5e-06,
"loss": 0.369,
"step": 2060
},
{
"epoch": 2.7995944575870224,
"grad_norm": 1.7412018691630233,
"learning_rate": 5e-06,
"loss": 0.3638,
"step": 2070
},
{
"epoch": 2.813112538019601,
"grad_norm": 1.8554265727819574,
"learning_rate": 5e-06,
"loss": 0.3679,
"step": 2080
},
{
"epoch": 2.82663061845218,
"grad_norm": 1.9678242485081974,
"learning_rate": 5e-06,
"loss": 0.3637,
"step": 2090
},
{
"epoch": 2.840148698884758,
"grad_norm": 1.9578034139617126,
"learning_rate": 5e-06,
"loss": 0.3653,
"step": 2100
},
{
"epoch": 2.853666779317337,
"grad_norm": 2.016045119436882,
"learning_rate": 5e-06,
"loss": 0.3616,
"step": 2110
},
{
"epoch": 2.8671848597499157,
"grad_norm": 1.9377074982514926,
"learning_rate": 5e-06,
"loss": 0.3615,
"step": 2120
},
{
"epoch": 2.880702940182494,
"grad_norm": 2.005341596622271,
"learning_rate": 5e-06,
"loss": 0.3693,
"step": 2130
},
{
"epoch": 2.8942210206150727,
"grad_norm": 1.823360962532199,
"learning_rate": 5e-06,
"loss": 0.3686,
"step": 2140
},
{
"epoch": 2.9077391010476514,
"grad_norm": 1.7689506770881627,
"learning_rate": 5e-06,
"loss": 0.3667,
"step": 2150
},
{
"epoch": 2.9212571814802297,
"grad_norm": 1.899310672863797,
"learning_rate": 5e-06,
"loss": 0.3645,
"step": 2160
},
{
"epoch": 2.9347752619128085,
"grad_norm": 1.8348517557022244,
"learning_rate": 5e-06,
"loss": 0.3646,
"step": 2170
},
{
"epoch": 2.948293342345387,
"grad_norm": 1.8637043447496588,
"learning_rate": 5e-06,
"loss": 0.3702,
"step": 2180
},
{
"epoch": 2.9618114227779655,
"grad_norm": 1.8308571241634983,
"learning_rate": 5e-06,
"loss": 0.3634,
"step": 2190
},
{
"epoch": 2.9753295032105442,
"grad_norm": 1.9515551374723294,
"learning_rate": 5e-06,
"loss": 0.3733,
"step": 2200
},
{
"epoch": 2.9888475836431225,
"grad_norm": 1.9021705828666453,
"learning_rate": 5e-06,
"loss": 0.3691,
"step": 2210
},
{
"epoch": 2.9983102399459276,
"eval_loss": 0.16083495318889618,
"eval_runtime": 379.909,
"eval_samples_per_second": 26.23,
"eval_steps_per_second": 0.411,
"step": 2217
},
{
"epoch": 2.9983102399459276,
"step": 2217,
"total_flos": 1856569406914560.0,
"train_loss": 0.47475808259329927,
"train_runtime": 63259.7159,
"train_samples_per_second": 8.979,
"train_steps_per_second": 0.035
}
],
"logging_steps": 10,
"max_steps": 2217,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1856569406914560.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}