{
"best_global_step": 2000,
"best_metric": 0.3999578198909343,
"best_model_checkpoint": "./SALAMA_NEWMEDTTTT/checkpoint-2000",
"epoch": 1.0976948408342482,
"eval_steps": 2000,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005488474204171241,
"grad_norm": 1.0394881963729858,
"learning_rate": 1.8e-07,
"loss": 0.0043,
"step": 10
},
{
"epoch": 0.010976948408342482,
"grad_norm": 0.25432130694389343,
"learning_rate": 3.8e-07,
"loss": 0.0027,
"step": 20
},
{
"epoch": 0.01646542261251372,
"grad_norm": 0.6018465161323547,
"learning_rate": 5.800000000000001e-07,
"loss": 0.0037,
"step": 30
},
{
"epoch": 0.021953896816684963,
"grad_norm": 0.07274393737316132,
"learning_rate": 7.8e-07,
"loss": 0.0034,
"step": 40
},
{
"epoch": 0.027442371020856202,
"grad_norm": 1.1111565828323364,
"learning_rate": 9.800000000000001e-07,
"loss": 0.0032,
"step": 50
},
{
"epoch": 0.03293084522502744,
"grad_norm": 2.1740646362304688,
"learning_rate": 1.1800000000000001e-06,
"loss": 0.0086,
"step": 60
},
{
"epoch": 0.038419319429198684,
"grad_norm": 1.1649271249771118,
"learning_rate": 1.3800000000000001e-06,
"loss": 0.0049,
"step": 70
},
{
"epoch": 0.043907793633369926,
"grad_norm": 1.0835011005401611,
"learning_rate": 1.5800000000000001e-06,
"loss": 0.0036,
"step": 80
},
{
"epoch": 0.04939626783754116,
"grad_norm": 2.567765474319458,
"learning_rate": 1.7800000000000001e-06,
"loss": 0.0072,
"step": 90
},
{
"epoch": 0.054884742041712405,
"grad_norm": 0.5648300647735596,
"learning_rate": 1.98e-06,
"loss": 0.0033,
"step": 100
},
{
"epoch": 0.06037321624588365,
"grad_norm": 0.5851211547851562,
"learning_rate": 2.1800000000000003e-06,
"loss": 0.0042,
"step": 110
},
{
"epoch": 0.06586169045005488,
"grad_norm": 0.40879732370376587,
"learning_rate": 2.38e-06,
"loss": 0.004,
"step": 120
},
{
"epoch": 0.07135016465422613,
"grad_norm": 0.36008283495903015,
"learning_rate": 2.5800000000000003e-06,
"loss": 0.0037,
"step": 130
},
{
"epoch": 0.07683863885839737,
"grad_norm": 0.07423322647809982,
"learning_rate": 2.7800000000000005e-06,
"loss": 0.0075,
"step": 140
},
{
"epoch": 0.08232711306256861,
"grad_norm": 1.0768777132034302,
"learning_rate": 2.9800000000000003e-06,
"loss": 0.0067,
"step": 150
},
{
"epoch": 0.08781558726673985,
"grad_norm": 0.29102134704589844,
"learning_rate": 3.1800000000000005e-06,
"loss": 0.0035,
"step": 160
},
{
"epoch": 0.09330406147091108,
"grad_norm": 0.5590409636497498,
"learning_rate": 3.3800000000000007e-06,
"loss": 0.0031,
"step": 170
},
{
"epoch": 0.09879253567508232,
"grad_norm": 0.4114173948764801,
"learning_rate": 3.58e-06,
"loss": 0.0038,
"step": 180
},
{
"epoch": 0.10428100987925357,
"grad_norm": 0.9015783667564392,
"learning_rate": 3.7800000000000002e-06,
"loss": 0.004,
"step": 190
},
{
"epoch": 0.10976948408342481,
"grad_norm": 0.26067736744880676,
"learning_rate": 3.980000000000001e-06,
"loss": 0.0101,
"step": 200
},
{
"epoch": 0.11525795828759605,
"grad_norm": 0.819459080696106,
"learning_rate": 4.18e-06,
"loss": 0.0043,
"step": 210
},
{
"epoch": 0.1207464324917673,
"grad_norm": 0.9547446966171265,
"learning_rate": 4.38e-06,
"loss": 0.0078,
"step": 220
},
{
"epoch": 0.12623490669593854,
"grad_norm": 0.6792054772377014,
"learning_rate": 4.58e-06,
"loss": 0.0058,
"step": 230
},
{
"epoch": 0.13172338090010977,
"grad_norm": 0.04598504304885864,
"learning_rate": 4.78e-06,
"loss": 0.0058,
"step": 240
},
{
"epoch": 0.13721185510428102,
"grad_norm": 0.977815568447113,
"learning_rate": 4.980000000000001e-06,
"loss": 0.0065,
"step": 250
},
{
"epoch": 0.14270032930845225,
"grad_norm": 1.0802408456802368,
"learning_rate": 5.18e-06,
"loss": 0.0086,
"step": 260
},
{
"epoch": 0.14818880351262348,
"grad_norm": 0.30211061239242554,
"learning_rate": 5.380000000000001e-06,
"loss": 0.0045,
"step": 270
},
{
"epoch": 0.15367727771679474,
"grad_norm": 1.0189473628997803,
"learning_rate": 5.580000000000001e-06,
"loss": 0.0035,
"step": 280
},
{
"epoch": 0.15916575192096596,
"grad_norm": 1.2080388069152832,
"learning_rate": 5.78e-06,
"loss": 0.0054,
"step": 290
},
{
"epoch": 0.16465422612513722,
"grad_norm": 0.7697501182556152,
"learning_rate": 5.98e-06,
"loss": 0.0074,
"step": 300
},
{
"epoch": 0.17014270032930845,
"grad_norm": 0.23319111764431,
"learning_rate": 6.18e-06,
"loss": 0.0075,
"step": 310
},
{
"epoch": 0.1756311745334797,
"grad_norm": 1.1132267713546753,
"learning_rate": 6.380000000000001e-06,
"loss": 0.006,
"step": 320
},
{
"epoch": 0.18111964873765093,
"grad_norm": 0.9462475776672363,
"learning_rate": 6.5800000000000005e-06,
"loss": 0.0082,
"step": 330
},
{
"epoch": 0.18660812294182216,
"grad_norm": 0.6547773480415344,
"learning_rate": 6.780000000000001e-06,
"loss": 0.0064,
"step": 340
},
{
"epoch": 0.19209659714599342,
"grad_norm": 1.4683443307876587,
"learning_rate": 6.98e-06,
"loss": 0.0059,
"step": 350
},
{
"epoch": 0.19758507135016465,
"grad_norm": 0.6405034065246582,
"learning_rate": 7.180000000000001e-06,
"loss": 0.0096,
"step": 360
},
{
"epoch": 0.2030735455543359,
"grad_norm": 1.1234091520309448,
"learning_rate": 7.3800000000000005e-06,
"loss": 0.0099,
"step": 370
},
{
"epoch": 0.20856201975850713,
"grad_norm": 0.9663105607032776,
"learning_rate": 7.58e-06,
"loss": 0.0092,
"step": 380
},
{
"epoch": 0.21405049396267836,
"grad_norm": 0.7793697714805603,
"learning_rate": 7.78e-06,
"loss": 0.0081,
"step": 390
},
{
"epoch": 0.21953896816684962,
"grad_norm": 0.7131162285804749,
"learning_rate": 7.980000000000002e-06,
"loss": 0.0084,
"step": 400
},
{
"epoch": 0.22502744237102085,
"grad_norm": 1.2374234199523926,
"learning_rate": 8.18e-06,
"loss": 0.0106,
"step": 410
},
{
"epoch": 0.2305159165751921,
"grad_norm": 1.7101589441299438,
"learning_rate": 8.380000000000001e-06,
"loss": 0.0106,
"step": 420
},
{
"epoch": 0.23600439077936333,
"grad_norm": 1.1548316478729248,
"learning_rate": 8.580000000000001e-06,
"loss": 0.0078,
"step": 430
},
{
"epoch": 0.2414928649835346,
"grad_norm": 0.6724960803985596,
"learning_rate": 8.78e-06,
"loss": 0.0069,
"step": 440
},
{
"epoch": 0.24698133918770582,
"grad_norm": 1.403664469718933,
"learning_rate": 8.98e-06,
"loss": 0.0094,
"step": 450
},
{
"epoch": 0.2524698133918771,
"grad_norm": 1.1001019477844238,
"learning_rate": 9.180000000000002e-06,
"loss": 0.0107,
"step": 460
},
{
"epoch": 0.2579582875960483,
"grad_norm": 1.0355250835418701,
"learning_rate": 9.38e-06,
"loss": 0.0081,
"step": 470
},
{
"epoch": 0.26344676180021953,
"grad_norm": 1.619025707244873,
"learning_rate": 9.58e-06,
"loss": 0.0134,
"step": 480
},
{
"epoch": 0.2689352360043908,
"grad_norm": 1.4473015069961548,
"learning_rate": 9.780000000000001e-06,
"loss": 0.0119,
"step": 490
},
{
"epoch": 0.27442371020856204,
"grad_norm": 1.3764768838882446,
"learning_rate": 9.980000000000001e-06,
"loss": 0.0086,
"step": 500
},
{
"epoch": 0.27991218441273324,
"grad_norm": 1.75978422164917,
"learning_rate": 9.971374045801527e-06,
"loss": 0.0106,
"step": 510
},
{
"epoch": 0.2854006586169045,
"grad_norm": 2.658644914627075,
"learning_rate": 9.939567430025446e-06,
"loss": 0.0146,
"step": 520
},
{
"epoch": 0.29088913282107576,
"grad_norm": 0.3355913758277893,
"learning_rate": 9.907760814249365e-06,
"loss": 0.0134,
"step": 530
},
{
"epoch": 0.29637760702524696,
"grad_norm": 1.7025257349014282,
"learning_rate": 9.875954198473283e-06,
"loss": 0.0151,
"step": 540
},
{
"epoch": 0.3018660812294182,
"grad_norm": 1.6538467407226562,
"learning_rate": 9.844147582697202e-06,
"loss": 0.0147,
"step": 550
},
{
"epoch": 0.30735455543358947,
"grad_norm": 1.4546349048614502,
"learning_rate": 9.81234096692112e-06,
"loss": 0.0181,
"step": 560
},
{
"epoch": 0.31284302963776073,
"grad_norm": 1.5585579872131348,
"learning_rate": 9.780534351145039e-06,
"loss": 0.0163,
"step": 570
},
{
"epoch": 0.31833150384193193,
"grad_norm": 1.1905714273452759,
"learning_rate": 9.748727735368957e-06,
"loss": 0.0158,
"step": 580
},
{
"epoch": 0.3238199780461032,
"grad_norm": 1.6334969997406006,
"learning_rate": 9.716921119592876e-06,
"loss": 0.0128,
"step": 590
},
{
"epoch": 0.32930845225027444,
"grad_norm": 1.060271143913269,
"learning_rate": 9.685114503816794e-06,
"loss": 0.018,
"step": 600
},
{
"epoch": 0.33479692645444564,
"grad_norm": 1.6735498905181885,
"learning_rate": 9.653307888040713e-06,
"loss": 0.0114,
"step": 610
},
{
"epoch": 0.3402854006586169,
"grad_norm": 1.7198753356933594,
"learning_rate": 9.621501272264631e-06,
"loss": 0.0156,
"step": 620
},
{
"epoch": 0.34577387486278816,
"grad_norm": 0.7011512517929077,
"learning_rate": 9.58969465648855e-06,
"loss": 0.0124,
"step": 630
},
{
"epoch": 0.3512623490669594,
"grad_norm": 1.9055498838424683,
"learning_rate": 9.557888040712468e-06,
"loss": 0.0177,
"step": 640
},
{
"epoch": 0.3567508232711306,
"grad_norm": 1.77641761302948,
"learning_rate": 9.526081424936387e-06,
"loss": 0.0114,
"step": 650
},
{
"epoch": 0.36223929747530187,
"grad_norm": 2.173353910446167,
"learning_rate": 9.494274809160307e-06,
"loss": 0.0187,
"step": 660
},
{
"epoch": 0.3677277716794731,
"grad_norm": 1.061390995979309,
"learning_rate": 9.462468193384224e-06,
"loss": 0.0132,
"step": 670
},
{
"epoch": 0.3732162458836443,
"grad_norm": 0.8496463298797607,
"learning_rate": 9.430661577608143e-06,
"loss": 0.0136,
"step": 680
},
{
"epoch": 0.3787047200878156,
"grad_norm": 1.2099004983901978,
"learning_rate": 9.398854961832063e-06,
"loss": 0.0109,
"step": 690
},
{
"epoch": 0.38419319429198684,
"grad_norm": 1.3495599031448364,
"learning_rate": 9.36704834605598e-06,
"loss": 0.0153,
"step": 700
},
{
"epoch": 0.3896816684961581,
"grad_norm": 0.764531135559082,
"learning_rate": 9.3352417302799e-06,
"loss": 0.0073,
"step": 710
},
{
"epoch": 0.3951701427003293,
"grad_norm": 2.1928865909576416,
"learning_rate": 9.303435114503817e-06,
"loss": 0.0135,
"step": 720
},
{
"epoch": 0.40065861690450055,
"grad_norm": 1.8005603551864624,
"learning_rate": 9.271628498727735e-06,
"loss": 0.0187,
"step": 730
},
{
"epoch": 0.4061470911086718,
"grad_norm": 1.2742944955825806,
"learning_rate": 9.239821882951655e-06,
"loss": 0.0089,
"step": 740
},
{
"epoch": 0.411635565312843,
"grad_norm": 1.6193122863769531,
"learning_rate": 9.208015267175572e-06,
"loss": 0.0152,
"step": 750
},
{
"epoch": 0.41712403951701427,
"grad_norm": 1.4442307949066162,
"learning_rate": 9.176208651399493e-06,
"loss": 0.0162,
"step": 760
},
{
"epoch": 0.4226125137211855,
"grad_norm": 0.9129316806793213,
"learning_rate": 9.144402035623411e-06,
"loss": 0.0151,
"step": 770
},
{
"epoch": 0.4281009879253567,
"grad_norm": 1.479588270187378,
"learning_rate": 9.112595419847328e-06,
"loss": 0.014,
"step": 780
},
{
"epoch": 0.433589462129528,
"grad_norm": 1.5315167903900146,
"learning_rate": 9.080788804071248e-06,
"loss": 0.0123,
"step": 790
},
{
"epoch": 0.43907793633369924,
"grad_norm": 2.470548391342163,
"learning_rate": 9.048982188295165e-06,
"loss": 0.0112,
"step": 800
},
{
"epoch": 0.4445664105378705,
"grad_norm": 1.5762847661972046,
"learning_rate": 9.017175572519085e-06,
"loss": 0.0163,
"step": 810
},
{
"epoch": 0.4500548847420417,
"grad_norm": 1.4822980165481567,
"learning_rate": 8.985368956743004e-06,
"loss": 0.0145,
"step": 820
},
{
"epoch": 0.45554335894621295,
"grad_norm": 2.682856798171997,
"learning_rate": 8.95356234096692e-06,
"loss": 0.0141,
"step": 830
},
{
"epoch": 0.4610318331503842,
"grad_norm": 1.2349945306777954,
"learning_rate": 8.92175572519084e-06,
"loss": 0.0148,
"step": 840
},
{
"epoch": 0.4665203073545554,
"grad_norm": 3.259676694869995,
"learning_rate": 8.88994910941476e-06,
"loss": 0.0212,
"step": 850
},
{
"epoch": 0.47200878155872666,
"grad_norm": 1.4975826740264893,
"learning_rate": 8.858142493638678e-06,
"loss": 0.0112,
"step": 860
},
{
"epoch": 0.4774972557628979,
"grad_norm": 2.8876535892486572,
"learning_rate": 8.826335877862596e-06,
"loss": 0.0168,
"step": 870
},
{
"epoch": 0.4829857299670692,
"grad_norm": 2.306791305541992,
"learning_rate": 8.794529262086515e-06,
"loss": 0.0192,
"step": 880
},
{
"epoch": 0.4884742041712404,
"grad_norm": 0.6873131394386292,
"learning_rate": 8.762722646310434e-06,
"loss": 0.0144,
"step": 890
},
{
"epoch": 0.49396267837541163,
"grad_norm": 3.158386468887329,
"learning_rate": 8.730916030534352e-06,
"loss": 0.0131,
"step": 900
},
{
"epoch": 0.4994511525795829,
"grad_norm": 0.8878953456878662,
"learning_rate": 8.69910941475827e-06,
"loss": 0.0173,
"step": 910
},
{
"epoch": 0.5049396267837541,
"grad_norm": 1.9014732837677002,
"learning_rate": 8.667302798982189e-06,
"loss": 0.0112,
"step": 920
},
{
"epoch": 0.5104281009879253,
"grad_norm": 1.7305513620376587,
"learning_rate": 8.635496183206108e-06,
"loss": 0.0137,
"step": 930
},
{
"epoch": 0.5159165751920965,
"grad_norm": 1.7590184211730957,
"learning_rate": 8.603689567430026e-06,
"loss": 0.0126,
"step": 940
},
{
"epoch": 0.5214050493962679,
"grad_norm": 1.3747210502624512,
"learning_rate": 8.571882951653945e-06,
"loss": 0.0156,
"step": 950
},
{
"epoch": 0.5268935236004391,
"grad_norm": 1.0799747705459595,
"learning_rate": 8.540076335877863e-06,
"loss": 0.0101,
"step": 960
},
{
"epoch": 0.5323819978046103,
"grad_norm": 0.8307255506515503,
"learning_rate": 8.508269720101782e-06,
"loss": 0.0145,
"step": 970
},
{
"epoch": 0.5378704720087816,
"grad_norm": 1.852042317390442,
"learning_rate": 8.4764631043257e-06,
"loss": 0.0115,
"step": 980
},
{
"epoch": 0.5433589462129528,
"grad_norm": 2.150557279586792,
"learning_rate": 8.444656488549619e-06,
"loss": 0.0107,
"step": 990
},
{
"epoch": 0.5488474204171241,
"grad_norm": 0.7547608613967896,
"learning_rate": 8.412849872773537e-06,
"loss": 0.0119,
"step": 1000
},
{
"epoch": 0.5543358946212953,
"grad_norm": 1.4302098751068115,
"learning_rate": 8.381043256997456e-06,
"loss": 0.0134,
"step": 1010
},
{
"epoch": 0.5598243688254665,
"grad_norm": 2.210999011993408,
"learning_rate": 8.349236641221374e-06,
"loss": 0.0109,
"step": 1020
},
{
"epoch": 0.5653128430296378,
"grad_norm": 3.0575549602508545,
"learning_rate": 8.317430025445293e-06,
"loss": 0.0288,
"step": 1030
},
{
"epoch": 0.570801317233809,
"grad_norm": 1.2066882848739624,
"learning_rate": 8.285623409669212e-06,
"loss": 0.0109,
"step": 1040
},
{
"epoch": 0.5762897914379802,
"grad_norm": 0.9596546292304993,
"learning_rate": 8.25381679389313e-06,
"loss": 0.0154,
"step": 1050
},
{
"epoch": 0.5817782656421515,
"grad_norm": 1.2375856637954712,
"learning_rate": 8.222010178117049e-06,
"loss": 0.0117,
"step": 1060
},
{
"epoch": 0.5872667398463227,
"grad_norm": 1.287665605545044,
"learning_rate": 8.190203562340969e-06,
"loss": 0.0113,
"step": 1070
},
{
"epoch": 0.5927552140504939,
"grad_norm": 1.2491388320922852,
"learning_rate": 8.158396946564886e-06,
"loss": 0.0131,
"step": 1080
},
{
"epoch": 0.5982436882546652,
"grad_norm": 1.8166123628616333,
"learning_rate": 8.126590330788804e-06,
"loss": 0.0135,
"step": 1090
},
{
"epoch": 0.6037321624588364,
"grad_norm": 0.9061824679374695,
"learning_rate": 8.094783715012723e-06,
"loss": 0.0123,
"step": 1100
},
{
"epoch": 0.6092206366630076,
"grad_norm": 1.2774139642715454,
"learning_rate": 8.062977099236641e-06,
"loss": 0.0118,
"step": 1110
},
{
"epoch": 0.6147091108671789,
"grad_norm": 1.7925004959106445,
"learning_rate": 8.031170483460562e-06,
"loss": 0.014,
"step": 1120
},
{
"epoch": 0.6201975850713501,
"grad_norm": 1.256042242050171,
"learning_rate": 7.999363867684478e-06,
"loss": 0.0174,
"step": 1130
},
{
"epoch": 0.6256860592755215,
"grad_norm": 1.2440769672393799,
"learning_rate": 7.967557251908397e-06,
"loss": 0.0114,
"step": 1140
},
{
"epoch": 0.6311745334796927,
"grad_norm": 1.6593252420425415,
"learning_rate": 7.935750636132317e-06,
"loss": 0.0119,
"step": 1150
},
{
"epoch": 0.6366630076838639,
"grad_norm": 1.7107939720153809,
"learning_rate": 7.903944020356234e-06,
"loss": 0.014,
"step": 1160
},
{
"epoch": 0.6421514818880352,
"grad_norm": 1.2454367876052856,
"learning_rate": 7.872137404580154e-06,
"loss": 0.0126,
"step": 1170
},
{
"epoch": 0.6476399560922064,
"grad_norm": 1.0048370361328125,
"learning_rate": 7.840330788804071e-06,
"loss": 0.0113,
"step": 1180
},
{
"epoch": 0.6531284302963776,
"grad_norm": 4.3503098487854,
"learning_rate": 7.80852417302799e-06,
"loss": 0.01,
"step": 1190
},
{
"epoch": 0.6586169045005489,
"grad_norm": 2.078575611114502,
"learning_rate": 7.77671755725191e-06,
"loss": 0.0131,
"step": 1200
},
{
"epoch": 0.6641053787047201,
"grad_norm": 2.2236897945404053,
"learning_rate": 7.744910941475827e-06,
"loss": 0.0143,
"step": 1210
},
{
"epoch": 0.6695938529088913,
"grad_norm": 2.2201192378997803,
"learning_rate": 7.713104325699747e-06,
"loss": 0.0098,
"step": 1220
},
{
"epoch": 0.6750823271130626,
"grad_norm": 1.5262202024459839,
"learning_rate": 7.681297709923665e-06,
"loss": 0.0163,
"step": 1230
},
{
"epoch": 0.6805708013172338,
"grad_norm": 0.6526926755905151,
"learning_rate": 7.649491094147582e-06,
"loss": 0.0093,
"step": 1240
},
{
"epoch": 0.686059275521405,
"grad_norm": 0.6294535994529724,
"learning_rate": 7.6176844783715025e-06,
"loss": 0.013,
"step": 1250
},
{
"epoch": 0.6915477497255763,
"grad_norm": 0.6937686800956726,
"learning_rate": 7.58587786259542e-06,
"loss": 0.0121,
"step": 1260
},
{
"epoch": 0.6970362239297475,
"grad_norm": 1.6241185665130615,
"learning_rate": 7.554071246819339e-06,
"loss": 0.0146,
"step": 1270
},
{
"epoch": 0.7025246981339188,
"grad_norm": 1.467155933380127,
"learning_rate": 7.522264631043258e-06,
"loss": 0.0131,
"step": 1280
},
{
"epoch": 0.70801317233809,
"grad_norm": 1.753973126411438,
"learning_rate": 7.490458015267176e-06,
"loss": 0.014,
"step": 1290
},
{
"epoch": 0.7135016465422612,
"grad_norm": 1.4710702896118164,
"learning_rate": 7.458651399491095e-06,
"loss": 0.0103,
"step": 1300
},
{
"epoch": 0.7189901207464325,
"grad_norm": 2.0423262119293213,
"learning_rate": 7.426844783715014e-06,
"loss": 0.0107,
"step": 1310
},
{
"epoch": 0.7244785949506037,
"grad_norm": 1.1584227085113525,
"learning_rate": 7.395038167938931e-06,
"loss": 0.0099,
"step": 1320
},
{
"epoch": 0.7299670691547749,
"grad_norm": 1.1535860300064087,
"learning_rate": 7.363231552162851e-06,
"loss": 0.0113,
"step": 1330
},
{
"epoch": 0.7354555433589463,
"grad_norm": 0.7290008664131165,
"learning_rate": 7.331424936386769e-06,
"loss": 0.0111,
"step": 1340
},
{
"epoch": 0.7409440175631175,
"grad_norm": 0.7790582776069641,
"learning_rate": 7.299618320610688e-06,
"loss": 0.0067,
"step": 1350
},
{
"epoch": 0.7464324917672887,
"grad_norm": 1.8725967407226562,
"learning_rate": 7.267811704834606e-06,
"loss": 0.0132,
"step": 1360
},
{
"epoch": 0.75192096597146,
"grad_norm": 2.039541721343994,
"learning_rate": 7.236005089058524e-06,
"loss": 0.0186,
"step": 1370
},
{
"epoch": 0.7574094401756312,
"grad_norm": 1.802741527557373,
"learning_rate": 7.204198473282443e-06,
"loss": 0.0127,
"step": 1380
},
{
"epoch": 0.7628979143798024,
"grad_norm": 1.0849511623382568,
"learning_rate": 7.172391857506362e-06,
"loss": 0.0156,
"step": 1390
},
{
"epoch": 0.7683863885839737,
"grad_norm": 1.2373745441436768,
"learning_rate": 7.1405852417302805e-06,
"loss": 0.0168,
"step": 1400
},
{
"epoch": 0.7738748627881449,
"grad_norm": 1.8411822319030762,
"learning_rate": 7.108778625954199e-06,
"loss": 0.0139,
"step": 1410
},
{
"epoch": 0.7793633369923162,
"grad_norm": 2.8104448318481445,
"learning_rate": 7.076972010178118e-06,
"loss": 0.0206,
"step": 1420
},
{
"epoch": 0.7848518111964874,
"grad_norm": 0.9695596098899841,
"learning_rate": 7.045165394402036e-06,
"loss": 0.0123,
"step": 1430
},
{
"epoch": 0.7903402854006586,
"grad_norm": 1.6235179901123047,
"learning_rate": 7.013358778625955e-06,
"loss": 0.0115,
"step": 1440
},
{
"epoch": 0.7958287596048299,
"grad_norm": 1.1207462549209595,
"learning_rate": 6.981552162849873e-06,
"loss": 0.0097,
"step": 1450
},
{
"epoch": 0.8013172338090011,
"grad_norm": 1.1788724660873413,
"learning_rate": 6.949745547073792e-06,
"loss": 0.0095,
"step": 1460
},
{
"epoch": 0.8068057080131723,
"grad_norm": 2.085524320602417,
"learning_rate": 6.917938931297711e-06,
"loss": 0.0136,
"step": 1470
},
{
"epoch": 0.8122941822173436,
"grad_norm": 1.6332577466964722,
"learning_rate": 6.886132315521629e-06,
"loss": 0.0102,
"step": 1480
},
{
"epoch": 0.8177826564215148,
"grad_norm": 1.769086241722107,
"learning_rate": 6.854325699745547e-06,
"loss": 0.0118,
"step": 1490
},
{
"epoch": 0.823271130625686,
"grad_norm": 1.046510934829712,
"learning_rate": 6.822519083969467e-06,
"loss": 0.0094,
"step": 1500
},
{
"epoch": 0.8287596048298573,
"grad_norm": 1.5111862421035767,
"learning_rate": 6.790712468193384e-06,
"loss": 0.0143,
"step": 1510
},
{
"epoch": 0.8342480790340285,
"grad_norm": 1.3604211807250977,
"learning_rate": 6.758905852417304e-06,
"loss": 0.0138,
"step": 1520
},
{
"epoch": 0.8397365532381997,
"grad_norm": 0.9713101387023926,
"learning_rate": 6.727099236641222e-06,
"loss": 0.01,
"step": 1530
},
{
"epoch": 0.845225027442371,
"grad_norm": 1.2814525365829468,
"learning_rate": 6.69529262086514e-06,
"loss": 0.0084,
"step": 1540
},
{
"epoch": 0.8507135016465422,
"grad_norm": 0.9360769391059875,
"learning_rate": 6.663486005089059e-06,
"loss": 0.0095,
"step": 1550
},
{
"epoch": 0.8562019758507134,
"grad_norm": 2.029505491256714,
"learning_rate": 6.631679389312977e-06,
"loss": 0.012,
"step": 1560
},
{
"epoch": 0.8616904500548848,
"grad_norm": 1.2836129665374756,
"learning_rate": 6.599872773536896e-06,
"loss": 0.0178,
"step": 1570
},
{
"epoch": 0.867178924259056,
"grad_norm": 1.5491465330123901,
"learning_rate": 6.568066157760815e-06,
"loss": 0.0121,
"step": 1580
},
{
"epoch": 0.8726673984632273,
"grad_norm": 1.215768575668335,
"learning_rate": 6.536259541984733e-06,
"loss": 0.0167,
"step": 1590
},
{
"epoch": 0.8781558726673985,
"grad_norm": 1.0636669397354126,
"learning_rate": 6.504452926208652e-06,
"loss": 0.0094,
"step": 1600
},
{
"epoch": 0.8836443468715697,
"grad_norm": 1.4701627492904663,
"learning_rate": 6.4726463104325706e-06,
"loss": 0.0124,
"step": 1610
},
{
"epoch": 0.889132821075741,
"grad_norm": 1.176419734954834,
"learning_rate": 6.440839694656489e-06,
"loss": 0.0123,
"step": 1620
},
{
"epoch": 0.8946212952799122,
"grad_norm": 2.032910108566284,
"learning_rate": 6.409033078880408e-06,
"loss": 0.0114,
"step": 1630
},
{
"epoch": 0.9001097694840834,
"grad_norm": 1.0917820930480957,
"learning_rate": 6.377226463104325e-06,
"loss": 0.0107,
"step": 1640
},
{
"epoch": 0.9055982436882547,
"grad_norm": 1.4592185020446777,
"learning_rate": 6.345419847328245e-06,
"loss": 0.0128,
"step": 1650
},
{
"epoch": 0.9110867178924259,
"grad_norm": 1.2474491596221924,
"learning_rate": 6.313613231552164e-06,
"loss": 0.0122,
"step": 1660
},
{
"epoch": 0.9165751920965971,
"grad_norm": 1.5561631917953491,
"learning_rate": 6.281806615776082e-06,
"loss": 0.0107,
"step": 1670
},
{
"epoch": 0.9220636663007684,
"grad_norm": 0.8761013746261597,
"learning_rate": 6.25e-06,
"loss": 0.0068,
"step": 1680
},
{
"epoch": 0.9275521405049396,
"grad_norm": 2.1419386863708496,
"learning_rate": 6.21819338422392e-06,
"loss": 0.0147,
"step": 1690
},
{
"epoch": 0.9330406147091108,
"grad_norm": 1.0107790231704712,
"learning_rate": 6.186386768447837e-06,
"loss": 0.0075,
"step": 1700
},
{
"epoch": 0.9385290889132821,
"grad_norm": 0.9932330846786499,
"learning_rate": 6.154580152671757e-06,
"loss": 0.0079,
"step": 1710
},
{
"epoch": 0.9440175631174533,
"grad_norm": 1.2500951290130615,
"learning_rate": 6.122773536895675e-06,
"loss": 0.0108,
"step": 1720
},
{
"epoch": 0.9495060373216246,
"grad_norm": 1.5545804500579834,
"learning_rate": 6.090966921119593e-06,
"loss": 0.0104,
"step": 1730
},
{
"epoch": 0.9549945115257958,
"grad_norm": 1.4742019176483154,
"learning_rate": 6.059160305343512e-06,
"loss": 0.0139,
"step": 1740
},
{
"epoch": 0.960482985729967,
"grad_norm": 0.8499981760978699,
"learning_rate": 6.02735368956743e-06,
"loss": 0.0073,
"step": 1750
},
{
"epoch": 0.9659714599341384,
"grad_norm": 0.7065290808677673,
"learning_rate": 5.9955470737913494e-06,
"loss": 0.0074,
"step": 1760
},
{
"epoch": 0.9714599341383096,
"grad_norm": 1.6678274869918823,
"learning_rate": 5.963740458015268e-06,
"loss": 0.0098,
"step": 1770
},
{
"epoch": 0.9769484083424808,
"grad_norm": 1.185567855834961,
"learning_rate": 5.931933842239186e-06,
"loss": 0.0118,
"step": 1780
},
{
"epoch": 0.9824368825466521,
"grad_norm": 1.7147798538208008,
"learning_rate": 5.900127226463105e-06,
"loss": 0.012,
"step": 1790
},
{
"epoch": 0.9879253567508233,
"grad_norm": 2.5320818424224854,
"learning_rate": 5.8683206106870236e-06,
"loss": 0.0059,
"step": 1800
},
{
"epoch": 0.9934138309549945,
"grad_norm": 1.0351759195327759,
"learning_rate": 5.836513994910942e-06,
"loss": 0.0087,
"step": 1810
},
{
"epoch": 0.9989023051591658,
"grad_norm": 1.2726657390594482,
"learning_rate": 5.804707379134861e-06,
"loss": 0.0133,
"step": 1820
},
{
"epoch": 1.004390779363337,
"grad_norm": 0.4543689489364624,
"learning_rate": 5.772900763358778e-06,
"loss": 0.0043,
"step": 1830
},
{
"epoch": 1.0098792535675083,
"grad_norm": 2.0367791652679443,
"learning_rate": 5.741094147582698e-06,
"loss": 0.0044,
"step": 1840
},
{
"epoch": 1.0153677277716795,
"grad_norm": 0.6520805358886719,
"learning_rate": 5.709287531806616e-06,
"loss": 0.004,
"step": 1850
},
{
"epoch": 1.0208562019758507,
"grad_norm": 0.8149614930152893,
"learning_rate": 5.677480916030535e-06,
"loss": 0.0032,
"step": 1860
},
{
"epoch": 1.026344676180022,
"grad_norm": 0.4136104881763458,
"learning_rate": 5.645674300254453e-06,
"loss": 0.0036,
"step": 1870
},
{
"epoch": 1.031833150384193,
"grad_norm": 1.050353765487671,
"learning_rate": 5.613867684478373e-06,
"loss": 0.0045,
"step": 1880
},
{
"epoch": 1.0373216245883645,
"grad_norm": 2.067906379699707,
"learning_rate": 5.58206106870229e-06,
"loss": 0.0037,
"step": 1890
},
{
"epoch": 1.0428100987925357,
"grad_norm": 0.31829890608787537,
"learning_rate": 5.550254452926209e-06,
"loss": 0.0044,
"step": 1900
},
{
"epoch": 1.048298572996707,
"grad_norm": 0.434925377368927,
"learning_rate": 5.518447837150128e-06,
"loss": 0.0027,
"step": 1910
},
{
"epoch": 1.0537870472008781,
"grad_norm": 1.5393106937408447,
"learning_rate": 5.486641221374046e-06,
"loss": 0.0043,
"step": 1920
},
{
"epoch": 1.0592755214050493,
"grad_norm": 0.3788773715496063,
"learning_rate": 5.454834605597965e-06,
"loss": 0.002,
"step": 1930
},
{
"epoch": 1.0647639956092205,
"grad_norm": 0.29814398288726807,
"learning_rate": 5.423027989821883e-06,
"loss": 0.0042,
"step": 1940
},
{
"epoch": 1.070252469813392,
"grad_norm": 0.24681848287582397,
"learning_rate": 5.391221374045802e-06,
"loss": 0.0049,
"step": 1950
},
{
"epoch": 1.0757409440175631,
"grad_norm": 0.11974932998418808,
"learning_rate": 5.359414758269721e-06,
"loss": 0.0032,
"step": 1960
},
{
"epoch": 1.0812294182217344,
"grad_norm": 1.4361236095428467,
"learning_rate": 5.327608142493639e-06,
"loss": 0.0028,
"step": 1970
},
{
"epoch": 1.0867178924259056,
"grad_norm": 0.645820140838623,
"learning_rate": 5.295801526717558e-06,
"loss": 0.0024,
"step": 1980
},
{
"epoch": 1.0922063666300768,
"grad_norm": 0.14708861708641052,
"learning_rate": 5.2639949109414766e-06,
"loss": 0.0017,
"step": 1990
},
{
"epoch": 1.0976948408342482,
"grad_norm": 0.40531185269355774,
"learning_rate": 5.232188295165394e-06,
"loss": 0.0032,
"step": 2000
},
{
"epoch": 1.0976948408342482,
"eval_loss": 0.0047075627371668816,
"eval_runtime": 10648.9323,
"eval_samples_per_second": 1.369,
"eval_steps_per_second": 0.171,
"eval_wer": 0.3999578198909343,
"step": 2000
}
],
"logging_steps": 10,
"max_steps": 3644,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.531565226655744e+19,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}