ltg_norbert3-small / trainer_state.json
Veggissss's picture
Training in progress, step 100
1b0450c verified
{
"best_global_step": 3700,
"best_metric": 0.03170738369226456,
"best_model_checkpoint": "trained/ltg/norbert3-small\\checkpoint-3700",
"epoch": 7.2265625,
"eval_steps": 100,
"global_step": 3700,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01953125,
"grad_norm": 7.950218677520752,
"learning_rate": 8.789062500000001e-06,
"loss": 22.0208,
"step": 10
},
{
"epoch": 0.0390625,
"grad_norm": 6.9703545570373535,
"learning_rate": 1.85546875e-05,
"loss": 21.3936,
"step": 20
},
{
"epoch": 0.05859375,
"grad_norm": 5.963114261627197,
"learning_rate": 2.83203125e-05,
"loss": 20.3377,
"step": 30
},
{
"epoch": 0.078125,
"grad_norm": 5.306910514831543,
"learning_rate": 3.80859375e-05,
"loss": 19.0723,
"step": 40
},
{
"epoch": 0.09765625,
"grad_norm": 5.844285488128662,
"learning_rate": 4.78515625e-05,
"loss": 17.5429,
"step": 50
},
{
"epoch": 0.1171875,
"grad_norm": 5.66201639175415,
"learning_rate": 5.7617187500000004e-05,
"loss": 16.1115,
"step": 60
},
{
"epoch": 0.13671875,
"grad_norm": 5.144637584686279,
"learning_rate": 6.73828125e-05,
"loss": 14.6307,
"step": 70
},
{
"epoch": 0.15625,
"grad_norm": 5.500018119812012,
"learning_rate": 7.71484375e-05,
"loss": 13.097,
"step": 80
},
{
"epoch": 0.17578125,
"grad_norm": 5.008663654327393,
"learning_rate": 8.69140625e-05,
"loss": 11.5314,
"step": 90
},
{
"epoch": 0.1953125,
"grad_norm": 4.854718208312988,
"learning_rate": 9.66796875e-05,
"loss": 10.3316,
"step": 100
},
{
"epoch": 0.1953125,
"eval_loss": 4.540501594543457,
"eval_runtime": 34.552,
"eval_samples_per_second": 118.546,
"eval_steps_per_second": 7.409,
"step": 100
},
{
"epoch": 0.21484375,
"grad_norm": 4.839401721954346,
"learning_rate": 0.0001064453125,
"loss": 8.8709,
"step": 110
},
{
"epoch": 0.234375,
"grad_norm": 4.494856357574463,
"learning_rate": 0.00011621093750000001,
"loss": 7.8886,
"step": 120
},
{
"epoch": 0.25390625,
"grad_norm": 4.555051326751709,
"learning_rate": 0.0001259765625,
"loss": 6.8872,
"step": 130
},
{
"epoch": 0.2734375,
"grad_norm": 4.32931661605835,
"learning_rate": 0.0001357421875,
"loss": 5.8908,
"step": 140
},
{
"epoch": 0.29296875,
"grad_norm": 3.7109575271606445,
"learning_rate": 0.0001455078125,
"loss": 5.2308,
"step": 150
},
{
"epoch": 0.3125,
"grad_norm": 4.085555076599121,
"learning_rate": 0.0001552734375,
"loss": 4.5431,
"step": 160
},
{
"epoch": 0.33203125,
"grad_norm": 3.3191423416137695,
"learning_rate": 0.0001650390625,
"loss": 4.0313,
"step": 170
},
{
"epoch": 0.3515625,
"grad_norm": 4.100945949554443,
"learning_rate": 0.0001748046875,
"loss": 3.7451,
"step": 180
},
{
"epoch": 0.37109375,
"grad_norm": 4.261106014251709,
"learning_rate": 0.0001845703125,
"loss": 3.4457,
"step": 190
},
{
"epoch": 0.390625,
"grad_norm": 3.0237832069396973,
"learning_rate": 0.0001943359375,
"loss": 3.1331,
"step": 200
},
{
"epoch": 0.390625,
"eval_loss": 1.426753282546997,
"eval_runtime": 34.579,
"eval_samples_per_second": 118.453,
"eval_steps_per_second": 7.403,
"step": 200
},
{
"epoch": 0.41015625,
"grad_norm": 3.6745340824127197,
"learning_rate": 0.0002041015625,
"loss": 2.8997,
"step": 210
},
{
"epoch": 0.4296875,
"grad_norm": 4.417777061462402,
"learning_rate": 0.0002138671875,
"loss": 2.81,
"step": 220
},
{
"epoch": 0.44921875,
"grad_norm": 5.513803482055664,
"learning_rate": 0.0002236328125,
"loss": 2.5776,
"step": 230
},
{
"epoch": 0.46875,
"grad_norm": 3.619678020477295,
"learning_rate": 0.00023339843750000002,
"loss": 2.4229,
"step": 240
},
{
"epoch": 0.48828125,
"grad_norm": 3.636021137237549,
"learning_rate": 0.0002431640625,
"loss": 2.3517,
"step": 250
},
{
"epoch": 0.5078125,
"grad_norm": 5.008062839508057,
"learning_rate": 0.0002529296875,
"loss": 2.1649,
"step": 260
},
{
"epoch": 0.52734375,
"grad_norm": 4.469094276428223,
"learning_rate": 0.0002626953125,
"loss": 1.9232,
"step": 270
},
{
"epoch": 0.546875,
"grad_norm": 3.6506378650665283,
"learning_rate": 0.0002724609375,
"loss": 1.922,
"step": 280
},
{
"epoch": 0.56640625,
"grad_norm": 2.868321418762207,
"learning_rate": 0.0002822265625,
"loss": 1.9761,
"step": 290
},
{
"epoch": 0.5859375,
"grad_norm": 3.0979578495025635,
"learning_rate": 0.0002919921875,
"loss": 1.6869,
"step": 300
},
{
"epoch": 0.5859375,
"eval_loss": 0.7856405973434448,
"eval_runtime": 34.54,
"eval_samples_per_second": 118.587,
"eval_steps_per_second": 7.412,
"step": 300
},
{
"epoch": 0.60546875,
"grad_norm": 4.681755542755127,
"learning_rate": 0.0003017578125,
"loss": 1.7185,
"step": 310
},
{
"epoch": 0.625,
"grad_norm": 2.84851336479187,
"learning_rate": 0.0003115234375,
"loss": 1.6253,
"step": 320
},
{
"epoch": 0.64453125,
"grad_norm": 2.8802149295806885,
"learning_rate": 0.0003212890625,
"loss": 1.4382,
"step": 330
},
{
"epoch": 0.6640625,
"grad_norm": 3.137683391571045,
"learning_rate": 0.0003310546875,
"loss": 1.4573,
"step": 340
},
{
"epoch": 0.68359375,
"grad_norm": 2.84382963180542,
"learning_rate": 0.00034082031250000003,
"loss": 1.2982,
"step": 350
},
{
"epoch": 0.703125,
"grad_norm": 2.7262027263641357,
"learning_rate": 0.0003505859375,
"loss": 1.3976,
"step": 360
},
{
"epoch": 0.72265625,
"grad_norm": 2.5905797481536865,
"learning_rate": 0.0003603515625,
"loss": 1.3751,
"step": 370
},
{
"epoch": 0.7421875,
"grad_norm": 2.3549230098724365,
"learning_rate": 0.0003701171875,
"loss": 1.3441,
"step": 380
},
{
"epoch": 0.76171875,
"grad_norm": 3.1322712898254395,
"learning_rate": 0.0003798828125,
"loss": 1.2995,
"step": 390
},
{
"epoch": 0.78125,
"grad_norm": 3.1575980186462402,
"learning_rate": 0.0003896484375,
"loss": 1.2435,
"step": 400
},
{
"epoch": 0.78125,
"eval_loss": 0.5356224179267883,
"eval_runtime": 34.903,
"eval_samples_per_second": 117.354,
"eval_steps_per_second": 7.335,
"step": 400
},
{
"epoch": 0.80078125,
"grad_norm": 2.9619956016540527,
"learning_rate": 0.00039941406250000003,
"loss": 1.158,
"step": 410
},
{
"epoch": 0.8203125,
"grad_norm": 2.5613064765930176,
"learning_rate": 0.0004091796875,
"loss": 1.0989,
"step": 420
},
{
"epoch": 0.83984375,
"grad_norm": 3.0814015865325928,
"learning_rate": 0.0004189453125,
"loss": 1.0123,
"step": 430
},
{
"epoch": 0.859375,
"grad_norm": 2.1889028549194336,
"learning_rate": 0.0004287109375,
"loss": 1.109,
"step": 440
},
{
"epoch": 0.87890625,
"grad_norm": 2.4778764247894287,
"learning_rate": 0.0004384765625,
"loss": 1.0337,
"step": 450
},
{
"epoch": 0.8984375,
"grad_norm": 2.1767776012420654,
"learning_rate": 0.0004482421875,
"loss": 1.0537,
"step": 460
},
{
"epoch": 0.91796875,
"grad_norm": 2.518540382385254,
"learning_rate": 0.00045800781250000003,
"loss": 1.0118,
"step": 470
},
{
"epoch": 0.9375,
"grad_norm": 2.429670810699463,
"learning_rate": 0.00046777343750000004,
"loss": 0.9931,
"step": 480
},
{
"epoch": 0.95703125,
"grad_norm": 2.2620222568511963,
"learning_rate": 0.0004775390625,
"loss": 0.9221,
"step": 490
},
{
"epoch": 0.9765625,
"grad_norm": 2.0658576488494873,
"learning_rate": 0.0004873046875,
"loss": 0.9128,
"step": 500
},
{
"epoch": 0.9765625,
"eval_loss": 0.42379331588745117,
"eval_runtime": 34.733,
"eval_samples_per_second": 117.928,
"eval_steps_per_second": 7.371,
"step": 500
},
{
"epoch": 0.99609375,
"grad_norm": 2.622636079788208,
"learning_rate": 0.0004970703125,
"loss": 0.9074,
"step": 510
},
{
"epoch": 1.015625,
"grad_norm": 2.956885576248169,
"learning_rate": 0.0004999971530484696,
"loss": 0.9474,
"step": 520
},
{
"epoch": 1.03515625,
"grad_norm": 2.8692688941955566,
"learning_rate": 0.0004999832089521691,
"loss": 0.9159,
"step": 530
},
{
"epoch": 1.0546875,
"grad_norm": 2.4971773624420166,
"learning_rate": 0.0004999576454489559,
"loss": 0.8008,
"step": 540
},
{
"epoch": 1.07421875,
"grad_norm": 1.7604514360427856,
"learning_rate": 0.0004999204637270404,
"loss": 0.8161,
"step": 550
},
{
"epoch": 1.09375,
"grad_norm": 2.117351531982422,
"learning_rate": 0.0004998716655146573,
"loss": 0.9061,
"step": 560
},
{
"epoch": 1.11328125,
"grad_norm": 1.9122625589370728,
"learning_rate": 0.0004998112530799839,
"loss": 0.8696,
"step": 570
},
{
"epoch": 1.1328125,
"grad_norm": 2.0043485164642334,
"learning_rate": 0.0004997392292310354,
"loss": 0.7629,
"step": 580
},
{
"epoch": 1.15234375,
"grad_norm": 1.8798938989639282,
"learning_rate": 0.0004996555973155344,
"loss": 0.7976,
"step": 590
},
{
"epoch": 1.171875,
"grad_norm": 2.1096367835998535,
"learning_rate": 0.0004995603612207548,
"loss": 0.7728,
"step": 600
},
{
"epoch": 1.171875,
"eval_loss": 0.34791266918182373,
"eval_runtime": 34.889,
"eval_samples_per_second": 117.401,
"eval_steps_per_second": 7.338,
"step": 600
},
{
"epoch": 1.19140625,
"grad_norm": 1.8750674724578857,
"learning_rate": 0.000499453525373342,
"loss": 0.7348,
"step": 610
},
{
"epoch": 1.2109375,
"grad_norm": 1.9086616039276123,
"learning_rate": 0.0004993350947391059,
"loss": 0.7259,
"step": 620
},
{
"epoch": 1.23046875,
"grad_norm": 2.0031375885009766,
"learning_rate": 0.0004992050748227915,
"loss": 0.6752,
"step": 630
},
{
"epoch": 1.25,
"grad_norm": 2.394740104675293,
"learning_rate": 0.0004990634716678217,
"loss": 0.7237,
"step": 640
},
{
"epoch": 1.26953125,
"grad_norm": 1.9420517683029175,
"learning_rate": 0.0004989102918560172,
"loss": 0.6634,
"step": 650
},
{
"epoch": 1.2890625,
"grad_norm": 2.5375707149505615,
"learning_rate": 0.0004987455425072907,
"loss": 0.866,
"step": 660
},
{
"epoch": 1.30859375,
"grad_norm": 2.0575597286224365,
"learning_rate": 0.0004985692312793153,
"loss": 0.5999,
"step": 670
},
{
"epoch": 1.328125,
"grad_norm": 1.7922422885894775,
"learning_rate": 0.000498381366367169,
"loss": 0.5935,
"step": 680
},
{
"epoch": 1.34765625,
"grad_norm": 2.0383851528167725,
"learning_rate": 0.0004981819565029539,
"loss": 0.6418,
"step": 690
},
{
"epoch": 1.3671875,
"grad_norm": 1.6255093812942505,
"learning_rate": 0.0004979710109553896,
"loss": 0.6444,
"step": 700
},
{
"epoch": 1.3671875,
"eval_loss": 0.2871336340904236,
"eval_runtime": 34.917,
"eval_samples_per_second": 117.307,
"eval_steps_per_second": 7.332,
"step": 700
},
{
"epoch": 1.38671875,
"grad_norm": 1.7046557664871216,
"learning_rate": 0.0004977485395293836,
"loss": 0.6661,
"step": 710
},
{
"epoch": 1.40625,
"grad_norm": 2.547611951828003,
"learning_rate": 0.0004975145525655744,
"loss": 0.5595,
"step": 720
},
{
"epoch": 1.42578125,
"grad_norm": 1.8859753608703613,
"learning_rate": 0.0004972690609398512,
"loss": 0.5831,
"step": 730
},
{
"epoch": 1.4453125,
"grad_norm": 1.7141295671463013,
"learning_rate": 0.0004970120760628492,
"loss": 0.5981,
"step": 740
},
{
"epoch": 1.46484375,
"grad_norm": 1.5137165784835815,
"learning_rate": 0.0004967436098794177,
"loss": 0.6364,
"step": 750
},
{
"epoch": 1.484375,
"grad_norm": 2.534536600112915,
"learning_rate": 0.0004964636748680664,
"loss": 0.54,
"step": 760
},
{
"epoch": 1.50390625,
"grad_norm": 1.795462727546692,
"learning_rate": 0.0004961722840403843,
"loss": 0.533,
"step": 770
},
{
"epoch": 1.5234375,
"grad_norm": 2.0635874271392822,
"learning_rate": 0.0004958694509404355,
"loss": 0.593,
"step": 780
},
{
"epoch": 1.54296875,
"grad_norm": 1.7885998487472534,
"learning_rate": 0.0004955551896441295,
"loss": 0.6076,
"step": 790
},
{
"epoch": 1.5625,
"grad_norm": 2.2840216159820557,
"learning_rate": 0.0004952295147585667,
"loss": 0.5232,
"step": 800
},
{
"epoch": 1.5625,
"eval_loss": 0.24195311963558197,
"eval_runtime": 34.654,
"eval_samples_per_second": 118.197,
"eval_steps_per_second": 7.387,
"step": 800
},
{
"epoch": 1.58203125,
"grad_norm": 1.3189480304718018,
"learning_rate": 0.0004948924414213601,
"loss": 0.4159,
"step": 810
},
{
"epoch": 1.6015625,
"grad_norm": 1.859397530555725,
"learning_rate": 0.000494543985299931,
"loss": 0.4731,
"step": 820
},
{
"epoch": 1.62109375,
"grad_norm": 1.8662714958190918,
"learning_rate": 0.0004941841625907811,
"loss": 0.5586,
"step": 830
},
{
"epoch": 1.640625,
"grad_norm": 1.637846827507019,
"learning_rate": 0.0004938129900187393,
"loss": 0.4765,
"step": 840
},
{
"epoch": 1.66015625,
"grad_norm": 1.739134430885315,
"learning_rate": 0.0004934304848361855,
"loss": 0.492,
"step": 850
},
{
"epoch": 1.6796875,
"grad_norm": 1.5579233169555664,
"learning_rate": 0.0004930366648222467,
"loss": 0.5734,
"step": 860
},
{
"epoch": 1.69921875,
"grad_norm": 1.7096999883651733,
"learning_rate": 0.0004926315482819728,
"loss": 0.55,
"step": 870
},
{
"epoch": 1.71875,
"grad_norm": 1.415420651435852,
"learning_rate": 0.0004922151540454839,
"loss": 0.4812,
"step": 880
},
{
"epoch": 1.73828125,
"grad_norm": 1.2610892057418823,
"learning_rate": 0.0004917875014670963,
"loss": 0.4289,
"step": 890
},
{
"epoch": 1.7578125,
"grad_norm": 1.3743246793746948,
"learning_rate": 0.0004913486104244223,
"loss": 0.4662,
"step": 900
},
{
"epoch": 1.7578125,
"eval_loss": 0.21815192699432373,
"eval_runtime": 34.764,
"eval_samples_per_second": 117.823,
"eval_steps_per_second": 7.364,
"step": 900
},
{
"epoch": 1.77734375,
"grad_norm": 1.5094037055969238,
"learning_rate": 0.0004908985013174468,
"loss": 0.4558,
"step": 910
},
{
"epoch": 1.796875,
"grad_norm": 1.7575606107711792,
"learning_rate": 0.000490437195067578,
"loss": 0.4556,
"step": 920
},
{
"epoch": 1.81640625,
"grad_norm": 1.3195754289627075,
"learning_rate": 0.0004899647131166763,
"loss": 0.4727,
"step": 930
},
{
"epoch": 1.8359375,
"grad_norm": 1.6985130310058594,
"learning_rate": 0.0004894810774260572,
"loss": 0.5443,
"step": 940
},
{
"epoch": 1.85546875,
"grad_norm": 1.709730625152588,
"learning_rate": 0.0004889863104754697,
"loss": 0.556,
"step": 950
},
{
"epoch": 1.875,
"grad_norm": 1.666263461112976,
"learning_rate": 0.0004884804352620526,
"loss": 0.4635,
"step": 960
},
{
"epoch": 1.89453125,
"grad_norm": 1.3932074308395386,
"learning_rate": 0.00048796347529926517,
"loss": 0.4252,
"step": 970
},
{
"epoch": 1.9140625,
"grad_norm": 2.161454677581787,
"learning_rate": 0.0004874354546157936,
"loss": 0.4579,
"step": 980
},
{
"epoch": 1.93359375,
"grad_norm": 1.7341850996017456,
"learning_rate": 0.0004868963977544353,
"loss": 0.4311,
"step": 990
},
{
"epoch": 1.953125,
"grad_norm": 1.5829185247421265,
"learning_rate": 0.00048634632977095704,
"loss": 0.3864,
"step": 1000
},
{
"epoch": 1.953125,
"eval_loss": 0.19052913784980774,
"eval_runtime": 34.761,
"eval_samples_per_second": 117.833,
"eval_steps_per_second": 7.365,
"step": 1000
},
{
"epoch": 1.97265625,
"grad_norm": 1.101081132888794,
"learning_rate": 0.000485785276232931,
"loss": 0.3874,
"step": 1010
},
{
"epoch": 1.9921875,
"grad_norm": 1.6454037427902222,
"learning_rate": 0.0004852132632185461,
"loss": 0.4326,
"step": 1020
},
{
"epoch": 2.01171875,
"grad_norm": 1.2393088340759277,
"learning_rate": 0.000484630317315396,
"loss": 0.4382,
"step": 1030
},
{
"epoch": 2.03125,
"grad_norm": 1.658333420753479,
"learning_rate": 0.0004840364656192433,
"loss": 0.4206,
"step": 1040
},
{
"epoch": 2.05078125,
"grad_norm": 1.313991665840149,
"learning_rate": 0.0004834317357327597,
"loss": 0.3975,
"step": 1050
},
{
"epoch": 2.0703125,
"grad_norm": 1.3297566175460815,
"learning_rate": 0.00048281615576424374,
"loss": 0.3958,
"step": 1060
},
{
"epoch": 2.08984375,
"grad_norm": 1.1647272109985352,
"learning_rate": 0.00048218975432631365,
"loss": 0.377,
"step": 1070
},
{
"epoch": 2.109375,
"grad_norm": 1.5755267143249512,
"learning_rate": 0.00048155256053457785,
"loss": 0.3417,
"step": 1080
},
{
"epoch": 2.12890625,
"grad_norm": 2.3393445014953613,
"learning_rate": 0.00048090460400628123,
"loss": 0.4214,
"step": 1090
},
{
"epoch": 2.1484375,
"grad_norm": 1.5548704862594604,
"learning_rate": 0.0004802459148589289,
"loss": 0.4284,
"step": 1100
},
{
"epoch": 2.1484375,
"eval_loss": 0.18148231506347656,
"eval_runtime": 34.931,
"eval_samples_per_second": 117.26,
"eval_steps_per_second": 7.329,
"step": 1100
},
{
"epoch": 2.16796875,
"grad_norm": 1.8194355964660645,
"learning_rate": 0.00047957652370888616,
"loss": 0.4066,
"step": 1110
},
{
"epoch": 2.1875,
"grad_norm": 1.922125220298767,
"learning_rate": 0.0004788964616699554,
"loss": 0.3621,
"step": 1120
},
{
"epoch": 2.20703125,
"grad_norm": 1.7716784477233887,
"learning_rate": 0.0004782057603519297,
"loss": 0.3802,
"step": 1130
},
{
"epoch": 2.2265625,
"grad_norm": 1.233404278755188,
"learning_rate": 0.0004775044518591242,
"loss": 0.3952,
"step": 1140
},
{
"epoch": 2.24609375,
"grad_norm": 1.3415420055389404,
"learning_rate": 0.00047679256878888315,
"loss": 0.4005,
"step": 1150
},
{
"epoch": 2.265625,
"grad_norm": 1.4483531713485718,
"learning_rate": 0.00047607014423006527,
"loss": 0.3756,
"step": 1160
},
{
"epoch": 2.28515625,
"grad_norm": 1.1628267765045166,
"learning_rate": 0.0004753372117615055,
"loss": 0.3908,
"step": 1170
},
{
"epoch": 2.3046875,
"grad_norm": 1.5845067501068115,
"learning_rate": 0.00047459380545045426,
"loss": 0.3727,
"step": 1180
},
{
"epoch": 2.32421875,
"grad_norm": 1.2387616634368896,
"learning_rate": 0.00047383995985099414,
"loss": 0.3489,
"step": 1190
},
{
"epoch": 2.34375,
"grad_norm": 1.4392259120941162,
"learning_rate": 0.0004730757100024336,
"loss": 0.3804,
"step": 1200
},
{
"epoch": 2.34375,
"eval_loss": 0.1616286039352417,
"eval_runtime": 34.8,
"eval_samples_per_second": 117.701,
"eval_steps_per_second": 7.356,
"step": 1200
},
{
"epoch": 2.36328125,
"grad_norm": 1.6787796020507812,
"learning_rate": 0.0004723010914276783,
"loss": 0.3817,
"step": 1210
},
{
"epoch": 2.3828125,
"grad_norm": 1.2978942394256592,
"learning_rate": 0.0004715161401315803,
"loss": 0.3436,
"step": 1220
},
{
"epoch": 2.40234375,
"grad_norm": 1.933402180671692,
"learning_rate": 0.000470720892599264,
"loss": 0.3104,
"step": 1230
},
{
"epoch": 2.421875,
"grad_norm": 1.8381954431533813,
"learning_rate": 0.00046991538579443096,
"loss": 0.3381,
"step": 1240
},
{
"epoch": 2.44140625,
"grad_norm": 1.3875645399093628,
"learning_rate": 0.0004690996571576409,
"loss": 0.3334,
"step": 1250
},
{
"epoch": 2.4609375,
"grad_norm": 1.373177170753479,
"learning_rate": 0.0004682737446045725,
"loss": 0.2988,
"step": 1260
},
{
"epoch": 2.48046875,
"grad_norm": 1.131691336631775,
"learning_rate": 0.00046743768652426015,
"loss": 0.323,
"step": 1270
},
{
"epoch": 2.5,
"grad_norm": 1.4642337560653687,
"learning_rate": 0.00046659152177731003,
"loss": 0.3457,
"step": 1280
},
{
"epoch": 2.51953125,
"grad_norm": 1.3850147724151611,
"learning_rate": 0.00046573528969409374,
"loss": 0.3136,
"step": 1290
},
{
"epoch": 2.5390625,
"grad_norm": 1.4759999513626099,
"learning_rate": 0.0004648690300729203,
"loss": 0.3233,
"step": 1300
},
{
"epoch": 2.5390625,
"eval_loss": 0.1541527956724167,
"eval_runtime": 34.809,
"eval_samples_per_second": 117.671,
"eval_steps_per_second": 7.354,
"step": 1300
},
{
"epoch": 2.55859375,
"grad_norm": 1.1843773126602173,
"learning_rate": 0.0004639927831781862,
"loss": 0.3037,
"step": 1310
},
{
"epoch": 2.578125,
"grad_norm": 1.3902676105499268,
"learning_rate": 0.0004631065897385037,
"loss": 0.313,
"step": 1320
},
{
"epoch": 2.59765625,
"grad_norm": 1.2758233547210693,
"learning_rate": 0.0004622104909448082,
"loss": 0.3129,
"step": 1330
},
{
"epoch": 2.6171875,
"grad_norm": 1.7758818864822388,
"learning_rate": 0.0004613045284484432,
"loss": 0.3269,
"step": 1340
},
{
"epoch": 2.63671875,
"grad_norm": 1.3369204998016357,
"learning_rate": 0.00046038874435922465,
"loss": 0.3199,
"step": 1350
},
{
"epoch": 2.65625,
"grad_norm": 1.5773146152496338,
"learning_rate": 0.0004594631812434832,
"loss": 0.3204,
"step": 1360
},
{
"epoch": 2.67578125,
"grad_norm": 1.4648780822753906,
"learning_rate": 0.0004585278821220863,
"loss": 0.2864,
"step": 1370
},
{
"epoch": 2.6953125,
"grad_norm": 1.6221363544464111,
"learning_rate": 0.00045758289046843813,
"loss": 0.34,
"step": 1380
},
{
"epoch": 2.71484375,
"grad_norm": 1.1161458492279053,
"learning_rate": 0.00045662825020645895,
"loss": 0.2723,
"step": 1390
},
{
"epoch": 2.734375,
"grad_norm": 1.5751299858093262,
"learning_rate": 0.0004556640057085436,
"loss": 0.3149,
"step": 1400
},
{
"epoch": 2.734375,
"eval_loss": 0.14088299870491028,
"eval_runtime": 34.861,
"eval_samples_per_second": 117.495,
"eval_steps_per_second": 7.343,
"step": 1400
},
{
"epoch": 2.75390625,
"grad_norm": 1.2796870470046997,
"learning_rate": 0.00045469020179349917,
"loss": 0.3107,
"step": 1410
},
{
"epoch": 2.7734375,
"grad_norm": 1.2906869649887085,
"learning_rate": 0.00045370688372446146,
"loss": 0.3022,
"step": 1420
},
{
"epoch": 2.79296875,
"grad_norm": 1.2449394464492798,
"learning_rate": 0.0004527140972067911,
"loss": 0.271,
"step": 1430
},
{
"epoch": 2.8125,
"grad_norm": 0.9955422282218933,
"learning_rate": 0.00045171188838594986,
"loss": 0.2902,
"step": 1440
},
{
"epoch": 2.83203125,
"grad_norm": 1.6944818496704102,
"learning_rate": 0.0004507003038453546,
"loss": 0.288,
"step": 1450
},
{
"epoch": 2.8515625,
"grad_norm": 1.4964483976364136,
"learning_rate": 0.00044967939060421307,
"loss": 0.3125,
"step": 1460
},
{
"epoch": 2.87109375,
"grad_norm": 1.1946748495101929,
"learning_rate": 0.0004486491961153379,
"loss": 0.3261,
"step": 1470
},
{
"epoch": 2.890625,
"grad_norm": 1.3839339017868042,
"learning_rate": 0.00044760976826294097,
"loss": 0.3375,
"step": 1480
},
{
"epoch": 2.91015625,
"grad_norm": 1.3183151483535767,
"learning_rate": 0.00044656115536040797,
"loss": 0.3032,
"step": 1490
},
{
"epoch": 2.9296875,
"grad_norm": 1.4515591859817505,
"learning_rate": 0.00044550340614805256,
"loss": 0.2747,
"step": 1500
},
{
"epoch": 2.9296875,
"eval_loss": 0.12650033831596375,
"eval_runtime": 34.752,
"eval_samples_per_second": 117.864,
"eval_steps_per_second": 7.366,
"step": 1500
},
{
"epoch": 2.94921875,
"grad_norm": 1.1999162435531616,
"learning_rate": 0.0004444365697908509,
"loss": 0.2887,
"step": 1510
},
{
"epoch": 2.96875,
"grad_norm": 1.201749563217163,
"learning_rate": 0.00044336069587615635,
"loss": 0.2875,
"step": 1520
},
{
"epoch": 2.98828125,
"grad_norm": 1.2322083711624146,
"learning_rate": 0.00044227583441139496,
"loss": 0.2716,
"step": 1530
},
{
"epoch": 3.0078125,
"grad_norm": 1.1172407865524292,
"learning_rate": 0.00044118203582174057,
"loss": 0.2435,
"step": 1540
},
{
"epoch": 3.02734375,
"grad_norm": 1.0245709419250488,
"learning_rate": 0.00044007935094777156,
"loss": 0.2701,
"step": 1550
},
{
"epoch": 3.046875,
"grad_norm": 1.523379921913147,
"learning_rate": 0.00043896783104310734,
"loss": 0.2365,
"step": 1560
},
{
"epoch": 3.06640625,
"grad_norm": 0.9000476598739624,
"learning_rate": 0.00043784752777202595,
"loss": 0.2712,
"step": 1570
},
{
"epoch": 3.0859375,
"grad_norm": 1.0866985321044922,
"learning_rate": 0.00043671849320706335,
"loss": 0.3082,
"step": 1580
},
{
"epoch": 3.10546875,
"grad_norm": 1.2548413276672363,
"learning_rate": 0.00043558077982659216,
"loss": 0.207,
"step": 1590
},
{
"epoch": 3.125,
"grad_norm": 1.4354357719421387,
"learning_rate": 0.000434434440512383,
"loss": 0.239,
"step": 1600
},
{
"epoch": 3.125,
"eval_loss": 0.12245145440101624,
"eval_runtime": 34.447,
"eval_samples_per_second": 118.907,
"eval_steps_per_second": 7.432,
"step": 1600
},
{
"epoch": 3.14453125,
"grad_norm": 1.0551234483718872,
"learning_rate": 0.0004332795285471465,
"loss": 0.2292,
"step": 1610
},
{
"epoch": 3.1640625,
"grad_norm": 1.2726539373397827,
"learning_rate": 0.00043211609761205626,
"loss": 0.2304,
"step": 1620
},
{
"epoch": 3.18359375,
"grad_norm": 1.4743865728378296,
"learning_rate": 0.0004309442017842543,
"loss": 0.2784,
"step": 1630
},
{
"epoch": 3.203125,
"grad_norm": 1.5050513744354248,
"learning_rate": 0.0004297638955343368,
"loss": 0.3108,
"step": 1640
},
{
"epoch": 3.22265625,
"grad_norm": 1.5667752027511597,
"learning_rate": 0.0004285752337238231,
"loss": 0.2638,
"step": 1650
},
{
"epoch": 3.2421875,
"grad_norm": 1.187193751335144,
"learning_rate": 0.0004273782716026049,
"loss": 0.2541,
"step": 1660
},
{
"epoch": 3.26171875,
"grad_norm": 1.0397928953170776,
"learning_rate": 0.0004261730648063788,
"loss": 0.2217,
"step": 1670
},
{
"epoch": 3.28125,
"grad_norm": 0.8323363065719604,
"learning_rate": 0.00042495966935405995,
"loss": 0.2599,
"step": 1680
},
{
"epoch": 3.30078125,
"grad_norm": 1.1350178718566895,
"learning_rate": 0.00042373814164517833,
"loss": 0.2313,
"step": 1690
},
{
"epoch": 3.3203125,
"grad_norm": 1.0103367567062378,
"learning_rate": 0.00042250853845725745,
"loss": 0.2296,
"step": 1700
},
{
"epoch": 3.3203125,
"eval_loss": 0.11100158095359802,
"eval_runtime": 34.453,
"eval_samples_per_second": 118.887,
"eval_steps_per_second": 7.43,
"step": 1700
},
{
"epoch": 3.33984375,
"grad_norm": 1.268911361694336,
"learning_rate": 0.0004212709169431751,
"loss": 0.2437,
"step": 1710
},
{
"epoch": 3.359375,
"grad_norm": 1.5205929279327393,
"learning_rate": 0.0004200253346285068,
"loss": 0.2649,
"step": 1720
},
{
"epoch": 3.37890625,
"grad_norm": 1.3827295303344727,
"learning_rate": 0.0004187718494088521,
"loss": 0.2552,
"step": 1730
},
{
"epoch": 3.3984375,
"grad_norm": 0.8661313652992249,
"learning_rate": 0.0004175105195471435,
"loss": 0.2318,
"step": 1740
},
{
"epoch": 3.41796875,
"grad_norm": 1.2733081579208374,
"learning_rate": 0.0004162414036709383,
"loss": 0.2276,
"step": 1750
},
{
"epoch": 3.4375,
"grad_norm": 1.0363296270370483,
"learning_rate": 0.0004149645607696936,
"loss": 0.1892,
"step": 1760
},
{
"epoch": 3.45703125,
"grad_norm": 0.9911412000656128,
"learning_rate": 0.0004136800501920245,
"loss": 0.2341,
"step": 1770
},
{
"epoch": 3.4765625,
"grad_norm": 1.0639911890029907,
"learning_rate": 0.00041238793164294536,
"loss": 0.2346,
"step": 1780
},
{
"epoch": 3.49609375,
"grad_norm": 1.0140990018844604,
"learning_rate": 0.0004110882651810948,
"loss": 0.2557,
"step": 1790
},
{
"epoch": 3.515625,
"grad_norm": 1.5262864828109741,
"learning_rate": 0.00040978111121594396,
"loss": 0.2314,
"step": 1800
},
{
"epoch": 3.515625,
"eval_loss": 0.10740732401609421,
"eval_runtime": 34.468,
"eval_samples_per_second": 118.835,
"eval_steps_per_second": 7.427,
"step": 1800
},
{
"epoch": 3.53515625,
"grad_norm": 1.5407931804656982,
"learning_rate": 0.00040846653050498897,
"loss": 0.2093,
"step": 1810
},
{
"epoch": 3.5546875,
"grad_norm": 0.7513458728790283,
"learning_rate": 0.0004071445841509264,
"loss": 0.2161,
"step": 1820
},
{
"epoch": 3.57421875,
"grad_norm": 1.2236207723617554,
"learning_rate": 0.00040581533359881374,
"loss": 0.2536,
"step": 1830
},
{
"epoch": 3.59375,
"grad_norm": 1.3843364715576172,
"learning_rate": 0.0004044788406332128,
"loss": 0.2095,
"step": 1840
},
{
"epoch": 3.61328125,
"grad_norm": 1.372381329536438,
"learning_rate": 0.0004031351673753184,
"loss": 0.2136,
"step": 1850
},
{
"epoch": 3.6328125,
"grad_norm": 1.251421570777893,
"learning_rate": 0.00040178437628007055,
"loss": 0.2403,
"step": 1860
},
{
"epoch": 3.65234375,
"grad_norm": 1.3957655429840088,
"learning_rate": 0.0004004265301332518,
"loss": 0.2149,
"step": 1870
},
{
"epoch": 3.671875,
"grad_norm": 1.109490990638733,
"learning_rate": 0.00039906169204856877,
"loss": 0.2212,
"step": 1880
},
{
"epoch": 3.69140625,
"grad_norm": 1.0680170059204102,
"learning_rate": 0.0003976899254647186,
"loss": 0.1957,
"step": 1890
},
{
"epoch": 3.7109375,
"grad_norm": 1.177930235862732,
"learning_rate": 0.00039631129414244016,
"loss": 0.2181,
"step": 1900
},
{
"epoch": 3.7109375,
"eval_loss": 0.09738427400588989,
"eval_runtime": 34.465,
"eval_samples_per_second": 118.845,
"eval_steps_per_second": 7.428,
"step": 1900
},
{
"epoch": 3.73046875,
"grad_norm": 1.2667044401168823,
"learning_rate": 0.00039492586216155056,
"loss": 0.2459,
"step": 1910
},
{
"epoch": 3.75,
"grad_norm": 0.9396888017654419,
"learning_rate": 0.0003935336939179668,
"loss": 0.1969,
"step": 1920
},
{
"epoch": 3.76953125,
"grad_norm": 1.1412925720214844,
"learning_rate": 0.0003921348541207122,
"loss": 0.1768,
"step": 1930
},
{
"epoch": 3.7890625,
"grad_norm": 1.208653211593628,
"learning_rate": 0.0003907294077889089,
"loss": 0.197,
"step": 1940
},
{
"epoch": 3.80859375,
"grad_norm": 1.0788354873657227,
"learning_rate": 0.00038931742024875585,
"loss": 0.1923,
"step": 1950
},
{
"epoch": 3.828125,
"grad_norm": 1.0677030086517334,
"learning_rate": 0.00038789895713049207,
"loss": 0.2269,
"step": 1960
},
{
"epoch": 3.84765625,
"grad_norm": 1.1341768503189087,
"learning_rate": 0.00038647408436534646,
"loss": 0.1879,
"step": 1970
},
{
"epoch": 3.8671875,
"grad_norm": 0.8455312848091125,
"learning_rate": 0.0003850428681824732,
"loss": 0.2181,
"step": 1980
},
{
"epoch": 3.88671875,
"grad_norm": 1.1865947246551514,
"learning_rate": 0.00038360537510587315,
"loss": 0.2105,
"step": 1990
},
{
"epoch": 3.90625,
"grad_norm": 1.2523059844970703,
"learning_rate": 0.0003821616719513017,
"loss": 0.1949,
"step": 2000
},
{
"epoch": 3.90625,
"eval_loss": 0.08832964301109314,
"eval_runtime": 34.454,
"eval_samples_per_second": 118.883,
"eval_steps_per_second": 7.43,
"step": 2000
},
{
"epoch": 3.92578125,
"grad_norm": 1.1952605247497559,
"learning_rate": 0.00038071182582316364,
"loss": 0.2248,
"step": 2010
},
{
"epoch": 3.9453125,
"grad_norm": 1.3617719411849976,
"learning_rate": 0.00037925590411139377,
"loss": 0.2039,
"step": 2020
},
{
"epoch": 3.96484375,
"grad_norm": 1.457497000694275,
"learning_rate": 0.0003777939744883243,
"loss": 0.2065,
"step": 2030
},
{
"epoch": 3.984375,
"grad_norm": 1.2239354848861694,
"learning_rate": 0.0003763261049055399,
"loss": 0.1916,
"step": 2040
},
{
"epoch": 4.00390625,
"grad_norm": 0.7479121088981628,
"learning_rate": 0.00037485236359071885,
"loss": 0.1767,
"step": 2050
},
{
"epoch": 4.0234375,
"grad_norm": 0.7983143329620361,
"learning_rate": 0.0003733728190444621,
"loss": 0.1865,
"step": 2060
},
{
"epoch": 4.04296875,
"grad_norm": 0.8446422219276428,
"learning_rate": 0.000371887540037109,
"loss": 0.1498,
"step": 2070
},
{
"epoch": 4.0625,
"grad_norm": 1.9428411722183228,
"learning_rate": 0.0003703965956055411,
"loss": 0.236,
"step": 2080
},
{
"epoch": 4.08203125,
"grad_norm": 1.1161248683929443,
"learning_rate": 0.00036890005504997296,
"loss": 0.2052,
"step": 2090
},
{
"epoch": 4.1015625,
"grad_norm": 1.3190343379974365,
"learning_rate": 0.0003673979879307314,
"loss": 0.1805,
"step": 2100
},
{
"epoch": 4.1015625,
"eval_loss": 0.08478689938783646,
"eval_runtime": 34.46,
"eval_samples_per_second": 118.862,
"eval_steps_per_second": 7.429,
"step": 2100
},
{
"epoch": 4.12109375,
"grad_norm": 0.820060670375824,
"learning_rate": 0.00036589046406502166,
"loss": 0.212,
"step": 2110
},
{
"epoch": 4.140625,
"grad_norm": 1.2379734516143799,
"learning_rate": 0.0003643775535236832,
"loss": 0.1904,
"step": 2120
},
{
"epoch": 4.16015625,
"grad_norm": 0.763465940952301,
"learning_rate": 0.0003628593266279316,
"loss": 0.1798,
"step": 2130
},
{
"epoch": 4.1796875,
"grad_norm": 1.0594629049301147,
"learning_rate": 0.00036133585394609104,
"loss": 0.2282,
"step": 2140
},
{
"epoch": 4.19921875,
"grad_norm": 1.0510274171829224,
"learning_rate": 0.0003598072062903137,
"loss": 0.1604,
"step": 2150
},
{
"epoch": 4.21875,
"grad_norm": 0.7347229719161987,
"learning_rate": 0.000358273454713288,
"loss": 0.1874,
"step": 2160
},
{
"epoch": 4.23828125,
"grad_norm": 1.2081701755523682,
"learning_rate": 0.0003567346705049371,
"loss": 0.1761,
"step": 2170
},
{
"epoch": 4.2578125,
"grad_norm": 0.8199229836463928,
"learning_rate": 0.0003551909251891041,
"loss": 0.1742,
"step": 2180
},
{
"epoch": 4.27734375,
"grad_norm": 0.9021660089492798,
"learning_rate": 0.0003536422905202286,
"loss": 0.1469,
"step": 2190
},
{
"epoch": 4.296875,
"grad_norm": 1.1995556354522705,
"learning_rate": 0.00035208883848001027,
"loss": 0.1564,
"step": 2200
},
{
"epoch": 4.296875,
"eval_loss": 0.08129081130027771,
"eval_runtime": 34.451,
"eval_samples_per_second": 118.893,
"eval_steps_per_second": 7.431,
"step": 2200
},
{
"epoch": 4.31640625,
"grad_norm": 1.4361320734024048,
"learning_rate": 0.00035053064127406466,
"loss": 0.1818,
"step": 2210
},
{
"epoch": 4.3359375,
"grad_norm": 1.2441810369491577,
"learning_rate": 0.0003489677713285655,
"loss": 0.1815,
"step": 2220
},
{
"epoch": 4.35546875,
"grad_norm": 1.0656682252883911,
"learning_rate": 0.0003474003012868793,
"loss": 0.1546,
"step": 2230
},
{
"epoch": 4.375,
"grad_norm": 0.9471537470817566,
"learning_rate": 0.00034582830400618834,
"loss": 0.1666,
"step": 2240
},
{
"epoch": 4.39453125,
"grad_norm": 0.9355669617652893,
"learning_rate": 0.0003442518525541046,
"loss": 0.1624,
"step": 2250
},
{
"epoch": 4.4140625,
"grad_norm": 0.8041836619377136,
"learning_rate": 0.0003426710202052729,
"loss": 0.1739,
"step": 2260
},
{
"epoch": 4.43359375,
"grad_norm": 0.9458361864089966,
"learning_rate": 0.0003410858804379658,
"loss": 0.1664,
"step": 2270
},
{
"epoch": 4.453125,
"grad_norm": 0.9826671481132507,
"learning_rate": 0.0003394965069306677,
"loss": 0.1644,
"step": 2280
},
{
"epoch": 4.47265625,
"grad_norm": 0.9595081210136414,
"learning_rate": 0.00033790297355865037,
"loss": 0.1791,
"step": 2290
},
{
"epoch": 4.4921875,
"grad_norm": 1.212173342704773,
"learning_rate": 0.00033630535439053933,
"loss": 0.1888,
"step": 2300
},
{
"epoch": 4.4921875,
"eval_loss": 0.07625420391559601,
"eval_runtime": 34.457,
"eval_samples_per_second": 118.873,
"eval_steps_per_second": 7.43,
"step": 2300
},
{
"epoch": 4.51171875,
"grad_norm": 1.1109684705734253,
"learning_rate": 0.0003347037236848709,
"loss": 0.175,
"step": 2310
},
{
"epoch": 4.53125,
"grad_norm": 0.680024266242981,
"learning_rate": 0.00033309815588664077,
"loss": 0.1532,
"step": 2320
},
{
"epoch": 4.55078125,
"grad_norm": 1.1742795705795288,
"learning_rate": 0.0003314887256238435,
"loss": 0.1887,
"step": 2330
},
{
"epoch": 4.5703125,
"grad_norm": 0.9640453457832336,
"learning_rate": 0.00032987550770400393,
"loss": 0.1294,
"step": 2340
},
{
"epoch": 4.58984375,
"grad_norm": 1.4085016250610352,
"learning_rate": 0.0003282585771107001,
"loss": 0.1663,
"step": 2350
},
{
"epoch": 4.609375,
"grad_norm": 0.9533945322036743,
"learning_rate": 0.0003266380090000779,
"loss": 0.1543,
"step": 2360
},
{
"epoch": 4.62890625,
"grad_norm": 1.1523878574371338,
"learning_rate": 0.00032501387869735774,
"loss": 0.1616,
"step": 2370
},
{
"epoch": 4.6484375,
"grad_norm": 1.5500876903533936,
"learning_rate": 0.0003233862616933333,
"loss": 0.1601,
"step": 2380
},
{
"epoch": 4.66796875,
"grad_norm": 0.9692633152008057,
"learning_rate": 0.0003217552336408628,
"loss": 0.1531,
"step": 2390
},
{
"epoch": 4.6875,
"grad_norm": 0.9365313649177551,
"learning_rate": 0.00032012087035135264,
"loss": 0.1531,
"step": 2400
},
{
"epoch": 4.6875,
"eval_loss": 0.07005032151937485,
"eval_runtime": 34.454,
"eval_samples_per_second": 118.883,
"eval_steps_per_second": 7.43,
"step": 2400
},
{
"epoch": 4.70703125,
"grad_norm": 0.956684410572052,
"learning_rate": 0.0003184832477912334,
"loss": 0.1503,
"step": 2410
},
{
"epoch": 4.7265625,
"grad_norm": 1.1487888097763062,
"learning_rate": 0.00031684244207842905,
"loss": 0.1415,
"step": 2420
},
{
"epoch": 4.74609375,
"grad_norm": 1.2333308458328247,
"learning_rate": 0.0003151985294788189,
"loss": 0.1753,
"step": 2430
},
{
"epoch": 4.765625,
"grad_norm": 1.0322210788726807,
"learning_rate": 0.0003135515864026927,
"loss": 0.1398,
"step": 2440
},
{
"epoch": 4.78515625,
"grad_norm": 1.2911465167999268,
"learning_rate": 0.0003119016894011991,
"loss": 0.1412,
"step": 2450
},
{
"epoch": 4.8046875,
"grad_norm": 1.3067231178283691,
"learning_rate": 0.00031024891516278713,
"loss": 0.1636,
"step": 2460
},
{
"epoch": 4.82421875,
"grad_norm": 0.6816452741622925,
"learning_rate": 0.00030859334050964226,
"loss": 0.1645,
"step": 2470
},
{
"epoch": 4.84375,
"grad_norm": 1.6802979707717896,
"learning_rate": 0.0003069350423941152,
"loss": 0.1606,
"step": 2480
},
{
"epoch": 4.86328125,
"grad_norm": 0.8826921582221985,
"learning_rate": 0.00030527409789514524,
"loss": 0.1364,
"step": 2490
},
{
"epoch": 4.8828125,
"grad_norm": 0.714592456817627,
"learning_rate": 0.0003036105842146775,
"loss": 0.1444,
"step": 2500
},
{
"epoch": 4.8828125,
"eval_loss": 0.06955922394990921,
"eval_runtime": 34.453,
"eval_samples_per_second": 118.887,
"eval_steps_per_second": 7.43,
"step": 2500
},
{
"epoch": 4.90234375,
"grad_norm": 0.8907870650291443,
"learning_rate": 0.0003019445786740747,
"loss": 0.1408,
"step": 2510
},
{
"epoch": 4.921875,
"grad_norm": 0.8849253058433533,
"learning_rate": 0.000300276158710523,
"loss": 0.1243,
"step": 2520
},
{
"epoch": 4.94140625,
"grad_norm": 1.1217460632324219,
"learning_rate": 0.00029860540187343277,
"loss": 0.1611,
"step": 2530
},
{
"epoch": 4.9609375,
"grad_norm": 0.7618215084075928,
"learning_rate": 0.00029693238582083407,
"loss": 0.1344,
"step": 2540
},
{
"epoch": 4.98046875,
"grad_norm": 0.904474675655365,
"learning_rate": 0.0002952571883157669,
"loss": 0.1409,
"step": 2550
},
{
"epoch": 5.0,
"grad_norm": 1.0269676446914673,
"learning_rate": 0.0002935798872226668,
"loss": 0.1477,
"step": 2560
},
{
"epoch": 5.01953125,
"grad_norm": 0.8338671326637268,
"learning_rate": 0.0002919005605037458,
"loss": 0.1469,
"step": 2570
},
{
"epoch": 5.0390625,
"grad_norm": 0.5822446346282959,
"learning_rate": 0.00029021928621536834,
"loss": 0.1225,
"step": 2580
},
{
"epoch": 5.05859375,
"grad_norm": 1.279576063156128,
"learning_rate": 0.00028853614250442356,
"loss": 0.1375,
"step": 2590
},
{
"epoch": 5.078125,
"grad_norm": 0.9247716069221497,
"learning_rate": 0.0002868512076046925,
"loss": 0.126,
"step": 2600
},
{
"epoch": 5.078125,
"eval_loss": 0.06358367204666138,
"eval_runtime": 34.464,
"eval_samples_per_second": 118.849,
"eval_steps_per_second": 7.428,
"step": 2600
},
{
"epoch": 5.09765625,
"grad_norm": 0.9483351111412048,
"learning_rate": 0.0002851645598332123,
"loss": 0.1334,
"step": 2610
},
{
"epoch": 5.1171875,
"grad_norm": 0.7646822929382324,
"learning_rate": 0.00028347627758663543,
"loss": 0.1175,
"step": 2620
},
{
"epoch": 5.13671875,
"grad_norm": 0.5134221315383911,
"learning_rate": 0.00028178643933758613,
"loss": 0.1182,
"step": 2630
},
{
"epoch": 5.15625,
"grad_norm": 1.063459873199463,
"learning_rate": 0.00028009512363101266,
"loss": 0.1415,
"step": 2640
},
{
"epoch": 5.17578125,
"grad_norm": 0.9747726917266846,
"learning_rate": 0.0002784024090805367,
"loss": 0.1591,
"step": 2650
},
{
"epoch": 5.1953125,
"grad_norm": 1.0984761714935303,
"learning_rate": 0.00027670837436479927,
"loss": 0.1461,
"step": 2660
},
{
"epoch": 5.21484375,
"grad_norm": 0.5246152877807617,
"learning_rate": 0.0002750130982238036,
"loss": 0.1102,
"step": 2670
},
{
"epoch": 5.234375,
"grad_norm": 1.721552848815918,
"learning_rate": 0.0002733166594552554,
"loss": 0.1362,
"step": 2680
},
{
"epoch": 5.25390625,
"grad_norm": 0.5377854108810425,
"learning_rate": 0.0002716191369109,
"loss": 0.1305,
"step": 2690
},
{
"epoch": 5.2734375,
"grad_norm": 1.294967532157898,
"learning_rate": 0.00026992060949285754,
"loss": 0.1604,
"step": 2700
},
{
"epoch": 5.2734375,
"eval_loss": 0.05843832343816757,
"eval_runtime": 34.482,
"eval_samples_per_second": 118.787,
"eval_steps_per_second": 7.424,
"step": 2700
},
{
"epoch": 5.29296875,
"grad_norm": 1.295292854309082,
"learning_rate": 0.0002682211561499555,
"loss": 0.1355,
"step": 2710
},
{
"epoch": 5.3125,
"grad_norm": 0.5737702250480652,
"learning_rate": 0.000266520855874059,
"loss": 0.1295,
"step": 2720
},
{
"epoch": 5.33203125,
"grad_norm": 1.1641753911972046,
"learning_rate": 0.00026481978769639917,
"loss": 0.1261,
"step": 2730
},
{
"epoch": 5.3515625,
"grad_norm": 0.966760516166687,
"learning_rate": 0.0002631180306838999,
"loss": 0.1168,
"step": 2740
},
{
"epoch": 5.37109375,
"grad_norm": 1.2680935859680176,
"learning_rate": 0.0002614156639355026,
"loss": 0.1511,
"step": 2750
},
{
"epoch": 5.390625,
"grad_norm": 1.0339317321777344,
"learning_rate": 0.00025971276657848965,
"loss": 0.1239,
"step": 2760
},
{
"epoch": 5.41015625,
"grad_norm": 0.655948281288147,
"learning_rate": 0.0002580094177648064,
"loss": 0.1134,
"step": 2770
},
{
"epoch": 5.4296875,
"grad_norm": 0.9367031455039978,
"learning_rate": 0.00025630569666738233,
"loss": 0.12,
"step": 2780
},
{
"epoch": 5.44921875,
"grad_norm": 1.1288447380065918,
"learning_rate": 0.0002546016824764512,
"loss": 0.1317,
"step": 2790
},
{
"epoch": 5.46875,
"grad_norm": 1.4184249639511108,
"learning_rate": 0.0002528974543958697,
"loss": 0.1453,
"step": 2800
},
{
"epoch": 5.46875,
"eval_loss": 0.054754838347435,
"eval_runtime": 34.455,
"eval_samples_per_second": 118.88,
"eval_steps_per_second": 7.43,
"step": 2800
},
{
"epoch": 5.48828125,
"grad_norm": 1.135767936706543,
"learning_rate": 0.00025119309163943614,
"loss": 0.0988,
"step": 2810
},
{
"epoch": 5.5078125,
"grad_norm": 0.49979278445243835,
"learning_rate": 0.00024948867342720904,
"loss": 0.1069,
"step": 2820
},
{
"epoch": 5.52734375,
"grad_norm": 1.0691176652908325,
"learning_rate": 0.00024778427898182416,
"loss": 0.1215,
"step": 2830
},
{
"epoch": 5.546875,
"grad_norm": 1.043531894683838,
"learning_rate": 0.000246079987524813,
"loss": 0.1227,
"step": 2840
},
{
"epoch": 5.56640625,
"grad_norm": 0.6733863949775696,
"learning_rate": 0.00024437587827291963,
"loss": 0.1079,
"step": 2850
},
{
"epoch": 5.5859375,
"grad_norm": 0.5758414268493652,
"learning_rate": 0.00024267203043441945,
"loss": 0.1132,
"step": 2860
},
{
"epoch": 5.60546875,
"grad_norm": 0.5270745158195496,
"learning_rate": 0.00024096852320543686,
"loss": 0.115,
"step": 2870
},
{
"epoch": 5.625,
"grad_norm": 0.5830528140068054,
"learning_rate": 0.0002392654357662648,
"loss": 0.0911,
"step": 2880
},
{
"epoch": 5.64453125,
"grad_norm": 1.0041881799697876,
"learning_rate": 0.0002375628472776838,
"loss": 0.1322,
"step": 2890
},
{
"epoch": 5.6640625,
"grad_norm": 0.8648679256439209,
"learning_rate": 0.00023586083687728284,
"loss": 0.1263,
"step": 2900
},
{
"epoch": 5.6640625,
"eval_loss": 0.04941609501838684,
"eval_runtime": 34.47,
"eval_samples_per_second": 118.828,
"eval_steps_per_second": 7.427,
"step": 2900
},
{
"epoch": 5.68359375,
"grad_norm": 0.8996643424034119,
"learning_rate": 0.0002341594836757811,
"loss": 0.1196,
"step": 2910
},
{
"epoch": 5.703125,
"grad_norm": 0.7420564293861389,
"learning_rate": 0.00023245886675335038,
"loss": 0.1106,
"step": 2920
},
{
"epoch": 5.72265625,
"grad_norm": 0.639359176158905,
"learning_rate": 0.00023075906515594003,
"loss": 0.1063,
"step": 2930
},
{
"epoch": 5.7421875,
"grad_norm": 0.8854117393493652,
"learning_rate": 0.00022906015789160212,
"loss": 0.1121,
"step": 2940
},
{
"epoch": 5.76171875,
"grad_norm": 1.648057222366333,
"learning_rate": 0.0002273622239268197,
"loss": 0.105,
"step": 2950
},
{
"epoch": 5.78125,
"grad_norm": 0.8455070853233337,
"learning_rate": 0.000225665342182836,
"loss": 0.1343,
"step": 2960
},
{
"epoch": 5.80078125,
"grad_norm": 0.6193717122077942,
"learning_rate": 0.00022396959153198634,
"loss": 0.0955,
"step": 2970
},
{
"epoch": 5.8203125,
"grad_norm": 1.3201427459716797,
"learning_rate": 0.00022227505079403193,
"loss": 0.1114,
"step": 2980
},
{
"epoch": 5.83984375,
"grad_norm": 1.1538575887680054,
"learning_rate": 0.00022058179873249623,
"loss": 0.1,
"step": 2990
},
{
"epoch": 5.859375,
"grad_norm": 0.9225913286209106,
"learning_rate": 0.00021888991405100426,
"loss": 0.109,
"step": 3000
},
{
"epoch": 5.859375,
"eval_loss": 0.04830887168645859,
"eval_runtime": 34.463,
"eval_samples_per_second": 118.852,
"eval_steps_per_second": 7.428,
"step": 3000
},
{
"epoch": 5.87890625,
"grad_norm": 1.0470666885375977,
"learning_rate": 0.00021719947538962386,
"loss": 0.1331,
"step": 3010
},
{
"epoch": 5.8984375,
"grad_norm": 1.0696215629577637,
"learning_rate": 0.00021551056132121125,
"loss": 0.0929,
"step": 3020
},
{
"epoch": 5.91796875,
"grad_norm": 0.6320193409919739,
"learning_rate": 0.000213823250347758,
"loss": 0.0871,
"step": 3030
},
{
"epoch": 5.9375,
"grad_norm": 0.5821089148521423,
"learning_rate": 0.0002121376208967428,
"loss": 0.0957,
"step": 3040
},
{
"epoch": 5.95703125,
"grad_norm": 0.6666136980056763,
"learning_rate": 0.00021045375131748589,
"loss": 0.1102,
"step": 3050
},
{
"epoch": 5.9765625,
"grad_norm": 0.5259461998939514,
"learning_rate": 0.00020877171987750752,
"loss": 0.093,
"step": 3060
},
{
"epoch": 5.99609375,
"grad_norm": 1.2559717893600464,
"learning_rate": 0.0002070916047588896,
"loss": 0.0905,
"step": 3070
},
{
"epoch": 6.015625,
"grad_norm": 0.9847208261489868,
"learning_rate": 0.00020541348405464185,
"loss": 0.112,
"step": 3080
},
{
"epoch": 6.03515625,
"grad_norm": 0.6805605292320251,
"learning_rate": 0.00020373743576507269,
"loss": 0.0977,
"step": 3090
},
{
"epoch": 6.0546875,
"grad_norm": 1.0333281755447388,
"learning_rate": 0.00020206353779416252,
"loss": 0.1017,
"step": 3100
},
{
"epoch": 6.0546875,
"eval_loss": 0.04791799187660217,
"eval_runtime": 34.46,
"eval_samples_per_second": 118.862,
"eval_steps_per_second": 7.429,
"step": 3100
},
{
"epoch": 6.07421875,
"grad_norm": 0.6116499900817871,
"learning_rate": 0.00020039186794594394,
"loss": 0.1053,
"step": 3110
},
{
"epoch": 6.09375,
"grad_norm": 0.6878781318664551,
"learning_rate": 0.00019872250392088402,
"loss": 0.1066,
"step": 3120
},
{
"epoch": 6.11328125,
"grad_norm": 1.0378636121749878,
"learning_rate": 0.00019705552331227412,
"loss": 0.0992,
"step": 3130
},
{
"epoch": 6.1328125,
"grad_norm": 0.9008749127388,
"learning_rate": 0.00019539100360262208,
"loss": 0.1091,
"step": 3140
},
{
"epoch": 6.15234375,
"grad_norm": 0.7476337552070618,
"learning_rate": 0.00019372902216005183,
"loss": 0.1359,
"step": 3150
},
{
"epoch": 6.171875,
"grad_norm": 0.4607301354408264,
"learning_rate": 0.00019206965623470626,
"loss": 0.1225,
"step": 3160
},
{
"epoch": 6.19140625,
"grad_norm": 0.7025051712989807,
"learning_rate": 0.0001904129829551572,
"loss": 0.0941,
"step": 3170
},
{
"epoch": 6.2109375,
"grad_norm": 0.9943326711654663,
"learning_rate": 0.00018875907932482062,
"loss": 0.0909,
"step": 3180
},
{
"epoch": 6.23046875,
"grad_norm": 1.1952718496322632,
"learning_rate": 0.0001871080222183766,
"loss": 0.1054,
"step": 3190
},
{
"epoch": 6.25,
"grad_norm": 1.0064172744750977,
"learning_rate": 0.00018545988837819703,
"loss": 0.0698,
"step": 3200
},
{
"epoch": 6.25,
"eval_loss": 0.04147997871041298,
"eval_runtime": 34.463,
"eval_samples_per_second": 118.852,
"eval_steps_per_second": 7.428,
"step": 3200
},
{
"epoch": 6.26953125,
"grad_norm": 0.7319508790969849,
"learning_rate": 0.00018381475441077793,
"loss": 0.0826,
"step": 3210
},
{
"epoch": 6.2890625,
"grad_norm": 0.6343052387237549,
"learning_rate": 0.00018217269678317936,
"loss": 0.0748,
"step": 3220
},
{
"epoch": 6.30859375,
"grad_norm": 1.0118874311447144,
"learning_rate": 0.00018053379181947032,
"loss": 0.085,
"step": 3230
},
{
"epoch": 6.328125,
"grad_norm": 0.6920039653778076,
"learning_rate": 0.00017889811569718207,
"loss": 0.0818,
"step": 3240
},
{
"epoch": 6.34765625,
"grad_norm": 0.9968111515045166,
"learning_rate": 0.0001772657444437666,
"loss": 0.0892,
"step": 3250
},
{
"epoch": 6.3671875,
"grad_norm": 0.7734740376472473,
"learning_rate": 0.00017563675393306313,
"loss": 0.0868,
"step": 3260
},
{
"epoch": 6.38671875,
"grad_norm": 0.8593881130218506,
"learning_rate": 0.0001740112198817717,
"loss": 0.0842,
"step": 3270
},
{
"epoch": 6.40625,
"grad_norm": 0.5908093452453613,
"learning_rate": 0.00017238921784593325,
"loss": 0.0905,
"step": 3280
},
{
"epoch": 6.42578125,
"grad_norm": 0.8824208378791809,
"learning_rate": 0.0001707708232174181,
"loss": 0.068,
"step": 3290
},
{
"epoch": 6.4453125,
"grad_norm": 1.5253270864486694,
"learning_rate": 0.0001691561112204215,
"loss": 0.1058,
"step": 3300
},
{
"epoch": 6.4453125,
"eval_loss": 0.03964650630950928,
"eval_runtime": 34.469,
"eval_samples_per_second": 118.831,
"eval_steps_per_second": 7.427,
"step": 3300
},
{
"epoch": 6.46484375,
"grad_norm": 0.8700875043869019,
"learning_rate": 0.0001675451569079674,
"loss": 0.084,
"step": 3310
},
{
"epoch": 6.484375,
"grad_norm": 0.4273681938648224,
"learning_rate": 0.00016593803515841955,
"loss": 0.0958,
"step": 3320
},
{
"epoch": 6.50390625,
"grad_norm": 0.6970112919807434,
"learning_rate": 0.00016433482067200144,
"loss": 0.0838,
"step": 3330
},
{
"epoch": 6.5234375,
"grad_norm": 0.7514935731887817,
"learning_rate": 0.000162735587967324,
"loss": 0.0901,
"step": 3340
},
{
"epoch": 6.54296875,
"grad_norm": 0.5686305165290833,
"learning_rate": 0.00016114041137792185,
"loss": 0.0874,
"step": 3350
},
{
"epoch": 6.5625,
"grad_norm": 0.7676323056221008,
"learning_rate": 0.00015954936504879863,
"loss": 0.0815,
"step": 3360
},
{
"epoch": 6.58203125,
"grad_norm": 0.7747044563293457,
"learning_rate": 0.00015796252293298006,
"loss": 0.0805,
"step": 3370
},
{
"epoch": 6.6015625,
"grad_norm": 0.5802006125450134,
"learning_rate": 0.0001563799587880771,
"loss": 0.0759,
"step": 3380
},
{
"epoch": 6.62109375,
"grad_norm": 0.6963436603546143,
"learning_rate": 0.00015480174617285713,
"loss": 0.0883,
"step": 3390
},
{
"epoch": 6.640625,
"grad_norm": 0.6564157009124756,
"learning_rate": 0.00015322795844382566,
"loss": 0.082,
"step": 3400
},
{
"epoch": 6.640625,
"eval_loss": 0.03754296153783798,
"eval_runtime": 34.623,
"eval_samples_per_second": 118.303,
"eval_steps_per_second": 7.394,
"step": 3400
},
{
"epoch": 6.66015625,
"grad_norm": 0.6527138352394104,
"learning_rate": 0.00015165866875181566,
"loss": 0.0966,
"step": 3410
},
{
"epoch": 6.6796875,
"grad_norm": 0.7239379286766052,
"learning_rate": 0.00015009395003858834,
"loss": 0.1032,
"step": 3420
},
{
"epoch": 6.69921875,
"grad_norm": 0.9023746252059937,
"learning_rate": 0.00014853387503344212,
"loss": 0.0896,
"step": 3430
},
{
"epoch": 6.71875,
"grad_norm": 0.7207731604576111,
"learning_rate": 0.00014697851624983243,
"loss": 0.0858,
"step": 3440
},
{
"epoch": 6.73828125,
"grad_norm": 0.8441604971885681,
"learning_rate": 0.00014542794598200147,
"loss": 0.0794,
"step": 3450
},
{
"epoch": 6.7578125,
"grad_norm": 1.3690321445465088,
"learning_rate": 0.00014388223630161734,
"loss": 0.0774,
"step": 3460
},
{
"epoch": 6.77734375,
"grad_norm": 0.8731696009635925,
"learning_rate": 0.00014234145905442445,
"loss": 0.0784,
"step": 3470
},
{
"epoch": 6.796875,
"grad_norm": 0.5190788507461548,
"learning_rate": 0.00014080568585690407,
"loss": 0.0767,
"step": 3480
},
{
"epoch": 6.81640625,
"grad_norm": 0.9499125480651855,
"learning_rate": 0.0001392749880929456,
"loss": 0.0922,
"step": 3490
},
{
"epoch": 6.8359375,
"grad_norm": 0.5663052201271057,
"learning_rate": 0.000137749436910528,
"loss": 0.082,
"step": 3500
},
{
"epoch": 6.8359375,
"eval_loss": 0.03372948616743088,
"eval_runtime": 34.565,
"eval_samples_per_second": 118.501,
"eval_steps_per_second": 7.406,
"step": 3500
},
{
"epoch": 6.85546875,
"grad_norm": 0.8701801896095276,
"learning_rate": 0.0001362291032184141,
"loss": 0.0773,
"step": 3510
},
{
"epoch": 6.875,
"grad_norm": 0.4705177843570709,
"learning_rate": 0.00013471405768285299,
"loss": 0.0755,
"step": 3520
},
{
"epoch": 6.89453125,
"grad_norm": 0.5040585398674011,
"learning_rate": 0.0001332043707242969,
"loss": 0.0727,
"step": 3530
},
{
"epoch": 6.9140625,
"grad_norm": 0.6496409773826599,
"learning_rate": 0.00013170011251412717,
"loss": 0.0904,
"step": 3540
},
{
"epoch": 6.93359375,
"grad_norm": 1.0118383169174194,
"learning_rate": 0.00013020135297139257,
"loss": 0.0832,
"step": 3550
},
{
"epoch": 6.953125,
"grad_norm": 0.7414892911911011,
"learning_rate": 0.00012870816175956034,
"loss": 0.0859,
"step": 3560
},
{
"epoch": 6.97265625,
"grad_norm": 0.41224896907806396,
"learning_rate": 0.00012722060828327693,
"loss": 0.0825,
"step": 3570
},
{
"epoch": 6.9921875,
"grad_norm": 1.4074021577835083,
"learning_rate": 0.00012573876168514282,
"loss": 0.0875,
"step": 3580
},
{
"epoch": 7.01171875,
"grad_norm": 0.7603104710578918,
"learning_rate": 0.00012426269084249856,
"loss": 0.0635,
"step": 3590
},
{
"epoch": 7.03125,
"grad_norm": 0.3152506649494171,
"learning_rate": 0.00012279246436422322,
"loss": 0.0763,
"step": 3600
},
{
"epoch": 7.03125,
"eval_loss": 0.032879240810871124,
"eval_runtime": 34.561,
"eval_samples_per_second": 118.515,
"eval_steps_per_second": 7.407,
"step": 3600
},
{
"epoch": 7.05078125,
"grad_norm": 0.8169571161270142,
"learning_rate": 0.00012132815058754557,
"loss": 0.0681,
"step": 3610
},
{
"epoch": 7.0703125,
"grad_norm": 0.9279233813285828,
"learning_rate": 0.00011986981757486717,
"loss": 0.0829,
"step": 3620
},
{
"epoch": 7.08984375,
"grad_norm": 0.7519627809524536,
"learning_rate": 0.00011841753311059967,
"loss": 0.0613,
"step": 3630
},
{
"epoch": 7.109375,
"grad_norm": 0.5586460828781128,
"learning_rate": 0.00011697136469801334,
"loss": 0.0702,
"step": 3640
},
{
"epoch": 7.12890625,
"grad_norm": 0.3905629813671112,
"learning_rate": 0.00011553137955609993,
"loss": 0.0711,
"step": 3650
},
{
"epoch": 7.1484375,
"grad_norm": 1.6302764415740967,
"learning_rate": 0.00011409764461644814,
"loss": 0.0971,
"step": 3660
},
{
"epoch": 7.16796875,
"grad_norm": 0.31183406710624695,
"learning_rate": 0.00011267022652013256,
"loss": 0.0612,
"step": 3670
},
{
"epoch": 7.1875,
"grad_norm": 0.8089606761932373,
"learning_rate": 0.00011124919161461592,
"loss": 0.0608,
"step": 3680
},
{
"epoch": 7.20703125,
"grad_norm": 0.4437573552131653,
"learning_rate": 0.00010983460595066602,
"loss": 0.0639,
"step": 3690
},
{
"epoch": 7.2265625,
"grad_norm": 0.5053586363792419,
"learning_rate": 0.00010842653527928447,
"loss": 0.0811,
"step": 3700
},
{
"epoch": 7.2265625,
"eval_loss": 0.03170738369226456,
"eval_runtime": 35.205,
"eval_samples_per_second": 116.347,
"eval_steps_per_second": 7.272,
"step": 3700
}
],
"logging_steps": 10,
"max_steps": 5120,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 100,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 2,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.4810141121097728e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}