{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 2094,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.009551098376313277,
"grad_norm": 2.2569775581359863,
"learning_rate": 1.9914040114613182e-05,
"loss": 1.0939,
"step": 10
},
{
"epoch": 0.019102196752626553,
"grad_norm": 2.2780814170837402,
"learning_rate": 1.981852913085005e-05,
"loss": 1.0778,
"step": 20
},
{
"epoch": 0.02865329512893983,
"grad_norm": 6.2301554679870605,
"learning_rate": 1.9723018147086915e-05,
"loss": 0.9625,
"step": 30
},
{
"epoch": 0.038204393505253106,
"grad_norm": 24.95392417907715,
"learning_rate": 1.9627507163323785e-05,
"loss": 0.8937,
"step": 40
},
{
"epoch": 0.04775549188156638,
"grad_norm": 3.5438709259033203,
"learning_rate": 1.9531996179560652e-05,
"loss": 0.7843,
"step": 50
},
{
"epoch": 0.05730659025787966,
"grad_norm": 3.102586269378662,
"learning_rate": 1.943648519579752e-05,
"loss": 0.6911,
"step": 60
},
{
"epoch": 0.06685768863419293,
"grad_norm": 1.7375000715255737,
"learning_rate": 1.9340974212034385e-05,
"loss": 0.5772,
"step": 70
},
{
"epoch": 0.07640878701050621,
"grad_norm": 4.719320774078369,
"learning_rate": 1.9245463228271252e-05,
"loss": 0.5894,
"step": 80
},
{
"epoch": 0.08595988538681948,
"grad_norm": 1.8283956050872803,
"learning_rate": 1.9149952244508122e-05,
"loss": 0.405,
"step": 90
},
{
"epoch": 0.09551098376313276,
"grad_norm": 27.354843139648438,
"learning_rate": 1.905444126074499e-05,
"loss": 0.4705,
"step": 100
},
{
"epoch": 0.10506208213944604,
"grad_norm": 4.766273021697998,
"learning_rate": 1.8958930276981855e-05,
"loss": 0.3644,
"step": 110
},
{
"epoch": 0.11461318051575932,
"grad_norm": 4.26953125,
"learning_rate": 1.8863419293218722e-05,
"loss": 0.2284,
"step": 120
},
{
"epoch": 0.12416427889207259,
"grad_norm": 0.7594988942146301,
"learning_rate": 1.876790830945559e-05,
"loss": 0.2435,
"step": 130
},
{
"epoch": 0.13371537726838587,
"grad_norm": 1.365365743637085,
"learning_rate": 1.8672397325692455e-05,
"loss": 0.2184,
"step": 140
},
{
"epoch": 0.14326647564469913,
"grad_norm": 9.257822036743164,
"learning_rate": 1.857688634192932e-05,
"loss": 0.2248,
"step": 150
},
{
"epoch": 0.15281757402101243,
"grad_norm": 0.5103208422660828,
"learning_rate": 1.848137535816619e-05,
"loss": 0.243,
"step": 160
},
{
"epoch": 0.1623686723973257,
"grad_norm": null,
"learning_rate": 1.8385864374403058e-05,
"loss": 0.2442,
"step": 170
},
{
"epoch": 0.17191977077363896,
"grad_norm": 0.6302635669708252,
"learning_rate": 1.8290353390639925e-05,
"loss": 0.4167,
"step": 180
},
{
"epoch": 0.18147086914995225,
"grad_norm": 4.381824493408203,
"learning_rate": 1.819484240687679e-05,
"loss": 0.2861,
"step": 190
},
{
"epoch": 0.19102196752626552,
"grad_norm": 0.35794690251350403,
"learning_rate": 1.8099331423113658e-05,
"loss": 0.1059,
"step": 200
},
{
"epoch": 0.20057306590257878,
"grad_norm": 0.4472959637641907,
"learning_rate": 1.8003820439350528e-05,
"loss": 0.0896,
"step": 210
},
{
"epoch": 0.21012416427889208,
"grad_norm": 0.2821422517299652,
"learning_rate": 1.7908309455587395e-05,
"loss": 0.0837,
"step": 220
},
{
"epoch": 0.21967526265520534,
"grad_norm": 26.638248443603516,
"learning_rate": 1.781279847182426e-05,
"loss": 0.1115,
"step": 230
},
{
"epoch": 0.22922636103151864,
"grad_norm": 1.5657349824905396,
"learning_rate": 1.7717287488061128e-05,
"loss": 0.1829,
"step": 240
},
{
"epoch": 0.2387774594078319,
"grad_norm": 0.2336825728416443,
"learning_rate": 1.7621776504297995e-05,
"loss": 0.0556,
"step": 250
},
{
"epoch": 0.24832855778414517,
"grad_norm": 4.114994525909424,
"learning_rate": 1.752626552053486e-05,
"loss": 0.1532,
"step": 260
},
{
"epoch": 0.25787965616045844,
"grad_norm": 13.862896919250488,
"learning_rate": 1.743075453677173e-05,
"loss": 0.2155,
"step": 270
},
{
"epoch": 0.26743075453677173,
"grad_norm": 0.21343673765659332,
"learning_rate": 1.7335243553008598e-05,
"loss": 0.0374,
"step": 280
},
{
"epoch": 0.276981852913085,
"grad_norm": 0.1753835529088974,
"learning_rate": 1.7239732569245464e-05,
"loss": 0.108,
"step": 290
},
{
"epoch": 0.28653295128939826,
"grad_norm": 0.24574324488639832,
"learning_rate": 1.714422158548233e-05,
"loss": 0.1175,
"step": 300
},
{
"epoch": 0.29608404966571156,
"grad_norm": 23.268014907836914,
"learning_rate": 1.7048710601719198e-05,
"loss": 0.1532,
"step": 310
},
{
"epoch": 0.30563514804202485,
"grad_norm": 0.1591091752052307,
"learning_rate": 1.6953199617956068e-05,
"loss": 0.0994,
"step": 320
},
{
"epoch": 0.3151862464183381,
"grad_norm": 0.17243853211402893,
"learning_rate": 1.6857688634192934e-05,
"loss": 0.083,
"step": 330
},
{
"epoch": 0.3247373447946514,
"grad_norm": 2.6030852794647217,
"learning_rate": 1.67621776504298e-05,
"loss": 0.2142,
"step": 340
},
{
"epoch": 0.3342884431709647,
"grad_norm": 0.15623551607131958,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.0821,
"step": 350
},
{
"epoch": 0.3438395415472779,
"grad_norm": 1.7919316291809082,
"learning_rate": 1.6571155682903534e-05,
"loss": 0.1883,
"step": 360
},
{
"epoch": 0.3533906399235912,
"grad_norm": 0.2467084378004074,
"learning_rate": 1.6475644699140404e-05,
"loss": 0.083,
"step": 370
},
{
"epoch": 0.3629417382999045,
"grad_norm": 1.9838752746582031,
"learning_rate": 1.6380133715377267e-05,
"loss": 0.0919,
"step": 380
},
{
"epoch": 0.37249283667621774,
"grad_norm": 0.13745100796222687,
"learning_rate": 1.6284622731614137e-05,
"loss": 0.2578,
"step": 390
},
{
"epoch": 0.38204393505253104,
"grad_norm": 23.14975929260254,
"learning_rate": 1.6189111747851004e-05,
"loss": 0.0609,
"step": 400
},
{
"epoch": 0.39159503342884433,
"grad_norm": 0.16147832572460175,
"learning_rate": 1.609360076408787e-05,
"loss": 0.1865,
"step": 410
},
{
"epoch": 0.40114613180515757,
"grad_norm": 8.0061616897583,
"learning_rate": 1.599808978032474e-05,
"loss": 0.1216,
"step": 420
},
{
"epoch": 0.41069723018147086,
"grad_norm": 1.0904359817504883,
"learning_rate": 1.5902578796561604e-05,
"loss": 0.137,
"step": 430
},
{
"epoch": 0.42024832855778416,
"grad_norm": 2.3162689208984375,
"learning_rate": 1.5807067812798474e-05,
"loss": 0.1358,
"step": 440
},
{
"epoch": 0.4297994269340974,
"grad_norm": 0.11886035650968552,
"learning_rate": 1.571155682903534e-05,
"loss": 0.0733,
"step": 450
},
{
"epoch": 0.4393505253104107,
"grad_norm": 7.155008792877197,
"learning_rate": 1.5616045845272207e-05,
"loss": 0.1238,
"step": 460
},
{
"epoch": 0.448901623686724,
"grad_norm": 0.11069323867559433,
"learning_rate": 1.5520534861509077e-05,
"loss": 0.1987,
"step": 470
},
{
"epoch": 0.4584527220630373,
"grad_norm": 0.10282690078020096,
"learning_rate": 1.542502387774594e-05,
"loss": 0.091,
"step": 480
},
{
"epoch": 0.4680038204393505,
"grad_norm": 0.1403094232082367,
"learning_rate": 1.532951289398281e-05,
"loss": 0.1213,
"step": 490
},
{
"epoch": 0.4775549188156638,
"grad_norm": 0.1331929713487625,
"learning_rate": 1.5234001910219675e-05,
"loss": 0.0154,
"step": 500
},
{
"epoch": 0.4871060171919771,
"grad_norm": 0.11407709866762161,
"learning_rate": 1.5138490926456543e-05,
"loss": 0.1221,
"step": 510
},
{
"epoch": 0.49665711556829034,
"grad_norm": 0.5473654270172119,
"learning_rate": 1.5042979942693412e-05,
"loss": 0.1143,
"step": 520
},
{
"epoch": 0.5062082139446036,
"grad_norm": 0.11086848378181458,
"learning_rate": 1.4947468958930278e-05,
"loss": 0.0141,
"step": 530
},
{
"epoch": 0.5157593123209169,
"grad_norm": 78.81968688964844,
"learning_rate": 1.4851957975167147e-05,
"loss": 0.1552,
"step": 540
},
{
"epoch": 0.5253104106972302,
"grad_norm": 6.011876106262207,
"learning_rate": 1.4756446991404012e-05,
"loss": 0.1232,
"step": 550
},
{
"epoch": 0.5348615090735435,
"grad_norm": 0.15130910277366638,
"learning_rate": 1.466093600764088e-05,
"loss": 0.1164,
"step": 560
},
{
"epoch": 0.5444126074498568,
"grad_norm": 0.0998225063085556,
"learning_rate": 1.4565425023877747e-05,
"loss": 0.1248,
"step": 570
},
{
"epoch": 0.55396370582617,
"grad_norm": 0.09413418173789978,
"learning_rate": 1.4469914040114615e-05,
"loss": 0.1258,
"step": 580
},
{
"epoch": 0.5635148042024832,
"grad_norm": 30.505067825317383,
"learning_rate": 1.4374403056351483e-05,
"loss": 0.0788,
"step": 590
},
{
"epoch": 0.5730659025787965,
"grad_norm": 0.10750491917133331,
"learning_rate": 1.4278892072588348e-05,
"loss": 0.1026,
"step": 600
},
{
"epoch": 0.5826170009551098,
"grad_norm": 0.08975467830896378,
"learning_rate": 1.4183381088825216e-05,
"loss": 0.2408,
"step": 610
},
{
"epoch": 0.5921680993314231,
"grad_norm": 0.12342803925275803,
"learning_rate": 1.4087870105062083e-05,
"loss": 0.0717,
"step": 620
},
{
"epoch": 0.6017191977077364,
"grad_norm": 0.07816806435585022,
"learning_rate": 1.3992359121298951e-05,
"loss": 0.1131,
"step": 630
},
{
"epoch": 0.6112702960840497,
"grad_norm": 0.06454802304506302,
"learning_rate": 1.389684813753582e-05,
"loss": 0.0096,
"step": 640
},
{
"epoch": 0.620821394460363,
"grad_norm": 0.08184290677309036,
"learning_rate": 1.3801337153772685e-05,
"loss": 0.1253,
"step": 650
},
{
"epoch": 0.6303724928366762,
"grad_norm": 0.07238755375146866,
"learning_rate": 1.3705826170009553e-05,
"loss": 0.0805,
"step": 660
},
{
"epoch": 0.6399235912129895,
"grad_norm": 1.8935270309448242,
"learning_rate": 1.361031518624642e-05,
"loss": 0.1435,
"step": 670
},
{
"epoch": 0.6494746895893028,
"grad_norm": 17.36036491394043,
"learning_rate": 1.3514804202483288e-05,
"loss": 0.0125,
"step": 680
},
{
"epoch": 0.6590257879656161,
"grad_norm": 0.07514392584562302,
"learning_rate": 1.3419293218720153e-05,
"loss": 0.0106,
"step": 690
},
{
"epoch": 0.6685768863419294,
"grad_norm": 4.228085041046143,
"learning_rate": 1.3323782234957021e-05,
"loss": 0.2031,
"step": 700
},
{
"epoch": 0.6781279847182426,
"grad_norm": 0.21097038686275482,
"learning_rate": 1.322827125119389e-05,
"loss": 0.1307,
"step": 710
},
{
"epoch": 0.6876790830945558,
"grad_norm": 0.10752805322408676,
"learning_rate": 1.3132760267430756e-05,
"loss": 0.0989,
"step": 720
},
{
"epoch": 0.6972301814708691,
"grad_norm": 51.11091995239258,
"learning_rate": 1.3037249283667624e-05,
"loss": 0.1572,
"step": 730
},
{
"epoch": 0.7067812798471824,
"grad_norm": 0.08639833331108093,
"learning_rate": 1.2941738299904489e-05,
"loss": 0.0508,
"step": 740
},
{
"epoch": 0.7163323782234957,
"grad_norm": 0.09963525086641312,
"learning_rate": 1.2846227316141357e-05,
"loss": 0.0763,
"step": 750
},
{
"epoch": 0.725883476599809,
"grad_norm": 0.0683053508400917,
"learning_rate": 1.2750716332378224e-05,
"loss": 0.0567,
"step": 760
},
{
"epoch": 0.7354345749761223,
"grad_norm": 40.62727737426758,
"learning_rate": 1.2655205348615092e-05,
"loss": 0.2471,
"step": 770
},
{
"epoch": 0.7449856733524355,
"grad_norm": 0.07331220805644989,
"learning_rate": 1.2559694364851959e-05,
"loss": 0.1743,
"step": 780
},
{
"epoch": 0.7545367717287488,
"grad_norm": 0.06410760432481766,
"learning_rate": 1.2464183381088826e-05,
"loss": 0.0624,
"step": 790
},
{
"epoch": 0.7640878701050621,
"grad_norm": 0.11088142544031143,
"learning_rate": 1.2368672397325694e-05,
"loss": 0.0087,
"step": 800
},
{
"epoch": 0.7736389684813754,
"grad_norm": 0.05074993893504143,
"learning_rate": 1.227316141356256e-05,
"loss": 0.1333,
"step": 810
},
{
"epoch": 0.7831900668576887,
"grad_norm": 0.05052105337381363,
"learning_rate": 1.2177650429799429e-05,
"loss": 0.0829,
"step": 820
},
{
"epoch": 0.792741165234002,
"grad_norm": 0.06240995600819588,
"learning_rate": 1.2082139446036295e-05,
"loss": 0.0074,
"step": 830
},
{
"epoch": 0.8022922636103151,
"grad_norm": 0.06128745898604393,
"learning_rate": 1.1986628462273162e-05,
"loss": 0.0705,
"step": 840
},
{
"epoch": 0.8118433619866284,
"grad_norm": 4.010462760925293,
"learning_rate": 1.189111747851003e-05,
"loss": 0.1351,
"step": 850
},
{
"epoch": 0.8213944603629417,
"grad_norm": 0.07143828272819519,
"learning_rate": 1.1795606494746897e-05,
"loss": 0.0514,
"step": 860
},
{
"epoch": 0.830945558739255,
"grad_norm": 0.06396259367465973,
"learning_rate": 1.1700095510983764e-05,
"loss": 0.0713,
"step": 870
},
{
"epoch": 0.8404966571155683,
"grad_norm": 14.529672622680664,
"learning_rate": 1.160458452722063e-05,
"loss": 0.0513,
"step": 880
},
{
"epoch": 0.8500477554918816,
"grad_norm": 1.4704737663269043,
"learning_rate": 1.1509073543457498e-05,
"loss": 0.353,
"step": 890
},
{
"epoch": 0.8595988538681948,
"grad_norm": 0.06813743710517883,
"learning_rate": 1.1413562559694367e-05,
"loss": 0.144,
"step": 900
},
{
"epoch": 0.8691499522445081,
"grad_norm": 0.7163823843002319,
"learning_rate": 1.1318051575931233e-05,
"loss": 0.0415,
"step": 910
},
{
"epoch": 0.8787010506208214,
"grad_norm": 0.05734021216630936,
"learning_rate": 1.12225405921681e-05,
"loss": 0.1911,
"step": 920
},
{
"epoch": 0.8882521489971347,
"grad_norm": 0.06162785366177559,
"learning_rate": 1.1127029608404967e-05,
"loss": 0.0711,
"step": 930
},
{
"epoch": 0.897803247373448,
"grad_norm": 0.1327008605003357,
"learning_rate": 1.1031518624641835e-05,
"loss": 0.058,
"step": 940
},
{
"epoch": 0.9073543457497613,
"grad_norm": 0.05325314775109291,
"learning_rate": 1.0936007640878703e-05,
"loss": 0.0376,
"step": 950
},
{
"epoch": 0.9169054441260746,
"grad_norm": 0.7295445799827576,
"learning_rate": 1.0840496657115568e-05,
"loss": 0.083,
"step": 960
},
{
"epoch": 0.9264565425023877,
"grad_norm": 0.0540502592921257,
"learning_rate": 1.0744985673352436e-05,
"loss": 0.1386,
"step": 970
},
{
"epoch": 0.936007640878701,
"grad_norm": 0.1747369021177292,
"learning_rate": 1.0649474689589303e-05,
"loss": 0.0063,
"step": 980
},
{
"epoch": 0.9455587392550143,
"grad_norm": 0.04095704108476639,
"learning_rate": 1.0553963705826171e-05,
"loss": 0.0961,
"step": 990
},
{
"epoch": 0.9551098376313276,
"grad_norm": 1.786160945892334,
"learning_rate": 1.0458452722063038e-05,
"loss": 0.2077,
"step": 1000
},
{
"epoch": 0.9646609360076409,
"grad_norm": 0.057904984802007675,
"learning_rate": 1.0362941738299905e-05,
"loss": 0.1009,
"step": 1010
},
{
"epoch": 0.9742120343839542,
"grad_norm": 0.04530341923236847,
"learning_rate": 1.0267430754536773e-05,
"loss": 0.0077,
"step": 1020
},
{
"epoch": 0.9837631327602674,
"grad_norm": 0.04884221404790878,
"learning_rate": 1.017191977077364e-05,
"loss": 0.0804,
"step": 1030
},
{
"epoch": 0.9933142311365807,
"grad_norm": 5.464759349822998,
"learning_rate": 1.0076408787010508e-05,
"loss": 0.1108,
"step": 1040
},
{
"epoch": 1.0,
"eval_loss": 0.10391418635845184,
"eval_runtime": 1.2229,
"eval_samples_per_second": 760.514,
"eval_steps_per_second": 95.678,
"step": 1047
},
{
"epoch": 1.002865329512894,
"grad_norm": 0.054792579263448715,
"learning_rate": 9.980897803247374e-06,
"loss": 0.0723,
"step": 1050
},
{
"epoch": 1.0124164278892072,
"grad_norm": 2.5259604454040527,
"learning_rate": 9.885386819484241e-06,
"loss": 0.074,
"step": 1060
},
{
"epoch": 1.0219675262655206,
"grad_norm": 0.06726188212633133,
"learning_rate": 9.78987583572111e-06,
"loss": 0.0709,
"step": 1070
},
{
"epoch": 1.0315186246418337,
"grad_norm": 0.05034675449132919,
"learning_rate": 9.694364851957976e-06,
"loss": 0.0072,
"step": 1080
},
{
"epoch": 1.0410697230181472,
"grad_norm": 1.9011842012405396,
"learning_rate": 9.598853868194843e-06,
"loss": 0.0803,
"step": 1090
},
{
"epoch": 1.0506208213944603,
"grad_norm": 0.05382240563631058,
"learning_rate": 9.50334288443171e-06,
"loss": 0.0247,
"step": 1100
},
{
"epoch": 1.0601719197707737,
"grad_norm": 0.22710120677947998,
"learning_rate": 9.407831900668578e-06,
"loss": 0.0061,
"step": 1110
},
{
"epoch": 1.069723018147087,
"grad_norm": 0.042488373816013336,
"learning_rate": 9.312320916905446e-06,
"loss": 0.1183,
"step": 1120
},
{
"epoch": 1.0792741165234,
"grad_norm": 2.047455072402954,
"learning_rate": 9.216809933142312e-06,
"loss": 0.0765,
"step": 1130
},
{
"epoch": 1.0888252148997135,
"grad_norm": 61.58501052856445,
"learning_rate": 9.121298949379179e-06,
"loss": 0.2333,
"step": 1140
},
{
"epoch": 1.0983763132760267,
"grad_norm": 0.06981759518384933,
"learning_rate": 9.025787965616046e-06,
"loss": 0.2329,
"step": 1150
},
{
"epoch": 1.10792741165234,
"grad_norm": 0.09120076149702072,
"learning_rate": 8.930276981852914e-06,
"loss": 0.0239,
"step": 1160
},
{
"epoch": 1.1174785100286533,
"grad_norm": 2.0684778690338135,
"learning_rate": 8.834765998089782e-06,
"loss": 0.0902,
"step": 1170
},
{
"epoch": 1.1270296084049667,
"grad_norm": 0.08632172644138336,
"learning_rate": 8.739255014326649e-06,
"loss": 0.1149,
"step": 1180
},
{
"epoch": 1.1365807067812799,
"grad_norm": 0.6690634489059448,
"learning_rate": 8.643744030563516e-06,
"loss": 0.0071,
"step": 1190
},
{
"epoch": 1.146131805157593,
"grad_norm": 0.09388808161020279,
"learning_rate": 8.548233046800382e-06,
"loss": 0.0717,
"step": 1200
},
{
"epoch": 1.1556829035339065,
"grad_norm": 0.06623850017786026,
"learning_rate": 8.45272206303725e-06,
"loss": 0.0866,
"step": 1210
},
{
"epoch": 1.1652340019102196,
"grad_norm": 0.05635674670338631,
"learning_rate": 8.357211079274117e-06,
"loss": 0.0914,
"step": 1220
},
{
"epoch": 1.174785100286533,
"grad_norm": 0.0588347390294075,
"learning_rate": 8.261700095510985e-06,
"loss": 0.0461,
"step": 1230
},
{
"epoch": 1.1843361986628462,
"grad_norm": 0.03934504836797714,
"learning_rate": 8.166189111747852e-06,
"loss": 0.1019,
"step": 1240
},
{
"epoch": 1.1938872970391594,
"grad_norm": 0.051371876150369644,
"learning_rate": 8.070678127984719e-06,
"loss": 0.016,
"step": 1250
},
{
"epoch": 1.2034383954154728,
"grad_norm": 0.061191458255052567,
"learning_rate": 7.975167144221587e-06,
"loss": 0.0077,
"step": 1260
},
{
"epoch": 1.212989493791786,
"grad_norm": 0.0495857410132885,
"learning_rate": 7.879656160458454e-06,
"loss": 0.0051,
"step": 1270
},
{
"epoch": 1.2225405921680994,
"grad_norm": 0.04128009453415871,
"learning_rate": 7.78414517669532e-06,
"loss": 0.317,
"step": 1280
},
{
"epoch": 1.2320916905444126,
"grad_norm": 0.03453819081187248,
"learning_rate": 7.688634192932188e-06,
"loss": 0.0608,
"step": 1290
},
{
"epoch": 1.2416427889207258,
"grad_norm": 0.03668952360749245,
"learning_rate": 7.593123209169055e-06,
"loss": 0.0264,
"step": 1300
},
{
"epoch": 1.2511938872970392,
"grad_norm": 0.03199330344796181,
"learning_rate": 7.4976122254059225e-06,
"loss": 0.0047,
"step": 1310
},
{
"epoch": 1.2607449856733524,
"grad_norm": 1.9187037944793701,
"learning_rate": 7.402101241642789e-06,
"loss": 0.1462,
"step": 1320
},
{
"epoch": 1.2702960840496658,
"grad_norm": 0.04411700740456581,
"learning_rate": 7.306590257879657e-06,
"loss": 0.0955,
"step": 1330
},
{
"epoch": 1.279847182425979,
"grad_norm": 0.03471948206424713,
"learning_rate": 7.211079274116523e-06,
"loss": 0.0062,
"step": 1340
},
{
"epoch": 1.2893982808022924,
"grad_norm": 0.042391568422317505,
"learning_rate": 7.115568290353391e-06,
"loss": 0.006,
"step": 1350
},
{
"epoch": 1.2989493791786055,
"grad_norm": 0.04176805168390274,
"learning_rate": 7.020057306590259e-06,
"loss": 0.1142,
"step": 1360
},
{
"epoch": 1.3085004775549187,
"grad_norm": 0.06825416535139084,
"learning_rate": 6.924546322827126e-06,
"loss": 0.1209,
"step": 1370
},
{
"epoch": 1.3180515759312321,
"grad_norm": 0.04017266258597374,
"learning_rate": 6.829035339063993e-06,
"loss": 0.0045,
"step": 1380
},
{
"epoch": 1.3276026743075453,
"grad_norm": 0.0732770562171936,
"learning_rate": 6.73352435530086e-06,
"loss": 0.1188,
"step": 1390
},
{
"epoch": 1.3371537726838587,
"grad_norm": 0.04319130256772041,
"learning_rate": 6.638013371537727e-06,
"loss": 0.007,
"step": 1400
},
{
"epoch": 1.346704871060172,
"grad_norm": 0.08936483412981033,
"learning_rate": 6.542502387774594e-06,
"loss": 0.0118,
"step": 1410
},
{
"epoch": 1.3562559694364853,
"grad_norm": 0.035478316247463226,
"learning_rate": 6.446991404011462e-06,
"loss": 0.0048,
"step": 1420
},
{
"epoch": 1.3658070678127985,
"grad_norm": 0.10484705865383148,
"learning_rate": 6.3514804202483295e-06,
"loss": 0.0056,
"step": 1430
},
{
"epoch": 1.3753581661891117,
"grad_norm": 0.03144150972366333,
"learning_rate": 6.255969436485196e-06,
"loss": 0.0758,
"step": 1440
},
{
"epoch": 1.384909264565425,
"grad_norm": 0.12434408813714981,
"learning_rate": 6.160458452722064e-06,
"loss": 0.0049,
"step": 1450
},
{
"epoch": 1.3944603629417383,
"grad_norm": 3.348506212234497,
"learning_rate": 6.06494746895893e-06,
"loss": 0.0309,
"step": 1460
},
{
"epoch": 1.4040114613180517,
"grad_norm": 0.03951037675142288,
"learning_rate": 5.969436485195798e-06,
"loss": 0.0538,
"step": 1470
},
{
"epoch": 1.4135625596943648,
"grad_norm": 0.04269490763545036,
"learning_rate": 5.873925501432666e-06,
"loss": 0.0044,
"step": 1480
},
{
"epoch": 1.4231136580706782,
"grad_norm": 0.760600745677948,
"learning_rate": 5.778414517669533e-06,
"loss": 0.0973,
"step": 1490
},
{
"epoch": 1.4326647564469914,
"grad_norm": 0.03149113059043884,
"learning_rate": 5.6829035339064e-06,
"loss": 0.0346,
"step": 1500
},
{
"epoch": 1.4422158548233046,
"grad_norm": 0.05680393800139427,
"learning_rate": 5.587392550143267e-06,
"loss": 0.0043,
"step": 1510
},
{
"epoch": 1.451766953199618,
"grad_norm": 0.03370094299316406,
"learning_rate": 5.491881566380134e-06,
"loss": 0.1295,
"step": 1520
},
{
"epoch": 1.4613180515759312,
"grad_norm": 0.045079704374074936,
"learning_rate": 5.396370582617001e-06,
"loss": 0.0736,
"step": 1530
},
{
"epoch": 1.4708691499522444,
"grad_norm": 0.7708030343055725,
"learning_rate": 5.300859598853869e-06,
"loss": 0.0042,
"step": 1540
},
{
"epoch": 1.4804202483285578,
"grad_norm": 0.04070596769452095,
"learning_rate": 5.2053486150907365e-06,
"loss": 0.0301,
"step": 1550
},
{
"epoch": 1.4899713467048712,
"grad_norm": 0.033276643604040146,
"learning_rate": 5.109837631327603e-06,
"loss": 0.0696,
"step": 1560
},
{
"epoch": 1.4995224450811844,
"grad_norm": 0.04143739864230156,
"learning_rate": 5.014326647564471e-06,
"loss": 0.0753,
"step": 1570
},
{
"epoch": 1.5090735434574976,
"grad_norm": 0.032175932079553604,
"learning_rate": 4.918815663801337e-06,
"loss": 0.1394,
"step": 1580
},
{
"epoch": 1.518624641833811,
"grad_norm": 0.045357052236795425,
"learning_rate": 4.823304680038205e-06,
"loss": 0.0066,
"step": 1590
},
{
"epoch": 1.5281757402101241,
"grad_norm": 0.04212405905127525,
"learning_rate": 4.727793696275072e-06,
"loss": 0.0758,
"step": 1600
},
{
"epoch": 1.5377268385864373,
"grad_norm": 0.06627684831619263,
"learning_rate": 4.632282712511939e-06,
"loss": 0.0945,
"step": 1610
},
{
"epoch": 1.5472779369627507,
"grad_norm": 0.04194959998130798,
"learning_rate": 4.536771728748807e-06,
"loss": 0.0046,
"step": 1620
},
{
"epoch": 1.5568290353390641,
"grad_norm": 0.04786338284611702,
"learning_rate": 4.441260744985674e-06,
"loss": 0.0045,
"step": 1630
},
{
"epoch": 1.5663801337153773,
"grad_norm": 0.030701184645295143,
"learning_rate": 4.345749761222541e-06,
"loss": 0.0699,
"step": 1640
},
{
"epoch": 1.5759312320916905,
"grad_norm": 0.043966639786958694,
"learning_rate": 4.250238777459409e-06,
"loss": 0.122,
"step": 1650
},
{
"epoch": 1.585482330468004,
"grad_norm": 0.04325714334845543,
"learning_rate": 4.154727793696275e-06,
"loss": 0.0051,
"step": 1660
},
{
"epoch": 1.595033428844317,
"grad_norm": 0.03839458152651787,
"learning_rate": 4.059216809933143e-06,
"loss": 0.0041,
"step": 1670
},
{
"epoch": 1.6045845272206303,
"grad_norm": 0.02976052649319172,
"learning_rate": 3.96370582617001e-06,
"loss": 0.0563,
"step": 1680
},
{
"epoch": 1.6141356255969437,
"grad_norm": 55.89206314086914,
"learning_rate": 3.868194842406877e-06,
"loss": 0.0258,
"step": 1690
},
{
"epoch": 1.623686723973257,
"grad_norm": 0.057242073118686676,
"learning_rate": 3.772683858643744e-06,
"loss": 0.0042,
"step": 1700
},
{
"epoch": 1.63323782234957,
"grad_norm": 0.0714183896780014,
"learning_rate": 3.6771728748806117e-06,
"loss": 0.005,
"step": 1710
},
{
"epoch": 1.6427889207258835,
"grad_norm": 0.0358208492398262,
"learning_rate": 3.5816618911174787e-06,
"loss": 0.0739,
"step": 1720
},
{
"epoch": 1.6523400191021969,
"grad_norm": 0.039776891469955444,
"learning_rate": 3.4861509073543457e-06,
"loss": 0.0685,
"step": 1730
},
{
"epoch": 1.66189111747851,
"grad_norm": 0.03394331783056259,
"learning_rate": 3.3906399235912136e-06,
"loss": 0.0794,
"step": 1740
},
{
"epoch": 1.6714422158548232,
"grad_norm": 0.031364619731903076,
"learning_rate": 3.2951289398280806e-06,
"loss": 0.118,
"step": 1750
},
{
"epoch": 1.6809933142311366,
"grad_norm": 2.0534205436706543,
"learning_rate": 3.1996179560649477e-06,
"loss": 0.1878,
"step": 1760
},
{
"epoch": 1.6905444126074498,
"grad_norm": 0.038952384144067764,
"learning_rate": 3.104106972301815e-06,
"loss": 0.0738,
"step": 1770
},
{
"epoch": 1.700095510983763,
"grad_norm": 0.029340583831071854,
"learning_rate": 3.008595988538682e-06,
"loss": 0.005,
"step": 1780
},
{
"epoch": 1.7096466093600764,
"grad_norm": 0.04813091456890106,
"learning_rate": 2.9130850047755492e-06,
"loss": 0.0675,
"step": 1790
},
{
"epoch": 1.7191977077363898,
"grad_norm": 0.05901302769780159,
"learning_rate": 2.8175740210124163e-06,
"loss": 0.0479,
"step": 1800
},
{
"epoch": 1.728748806112703,
"grad_norm": 0.044173464179039,
"learning_rate": 2.722063037249284e-06,
"loss": 0.0423,
"step": 1810
},
{
"epoch": 1.7382999044890162,
"grad_norm": 0.03839905560016632,
"learning_rate": 2.626552053486151e-06,
"loss": 0.1425,
"step": 1820
},
{
"epoch": 1.7478510028653296,
"grad_norm": 0.059842657297849655,
"learning_rate": 2.5310410697230182e-06,
"loss": 0.0042,
"step": 1830
},
{
"epoch": 1.7574021012416428,
"grad_norm": 0.24521498382091522,
"learning_rate": 2.4355300859598857e-06,
"loss": 0.0525,
"step": 1840
},
{
"epoch": 1.766953199617956,
"grad_norm": 0.5483675003051758,
"learning_rate": 2.3400191021967527e-06,
"loss": 0.0752,
"step": 1850
},
{
"epoch": 1.7765042979942693,
"grad_norm": 0.036952149122953415,
"learning_rate": 2.24450811843362e-06,
"loss": 0.1074,
"step": 1860
},
{
"epoch": 1.7860553963705827,
"grad_norm": 0.8135057091712952,
"learning_rate": 2.1489971346704872e-06,
"loss": 0.0048,
"step": 1870
},
{
"epoch": 1.795606494746896,
"grad_norm": 0.06449055671691895,
"learning_rate": 2.0534861509073547e-06,
"loss": 0.1038,
"step": 1880
},
{
"epoch": 1.8051575931232091,
"grad_norm": 107.41304779052734,
"learning_rate": 1.9579751671442217e-06,
"loss": 0.0318,
"step": 1890
},
{
"epoch": 1.8147086914995225,
"grad_norm": 0.04807087033987045,
"learning_rate": 1.862464183381089e-06,
"loss": 0.1689,
"step": 1900
},
{
"epoch": 1.8242597898758357,
"grad_norm": 0.04132077470421791,
"learning_rate": 1.7669531996179562e-06,
"loss": 0.0041,
"step": 1910
},
{
"epoch": 1.8338108882521489,
"grad_norm": 0.06250818073749542,
"learning_rate": 1.6714422158548235e-06,
"loss": 0.0052,
"step": 1920
},
{
"epoch": 1.8433619866284623,
"grad_norm": 83.07594299316406,
"learning_rate": 1.5759312320916905e-06,
"loss": 0.174,
"step": 1930
},
{
"epoch": 1.8529130850047757,
"grad_norm": 0.04571348428726196,
"learning_rate": 1.480420248328558e-06,
"loss": 0.0049,
"step": 1940
},
{
"epoch": 1.8624641833810889,
"grad_norm": 0.04580092057585716,
"learning_rate": 1.3849092645654252e-06,
"loss": 0.0638,
"step": 1950
},
{
"epoch": 1.872015281757402,
"grad_norm": 0.04569645971059799,
"learning_rate": 1.2893982808022922e-06,
"loss": 0.0551,
"step": 1960
},
{
"epoch": 1.8815663801337155,
"grad_norm": 0.049619242548942566,
"learning_rate": 1.1938872970391597e-06,
"loss": 0.0436,
"step": 1970
},
{
"epoch": 1.8911174785100286,
"grad_norm": 0.059808436781167984,
"learning_rate": 1.0983763132760267e-06,
"loss": 0.0653,
"step": 1980
},
{
"epoch": 1.9006685768863418,
"grad_norm": 0.038019582629203796,
"learning_rate": 1.002865329512894e-06,
"loss": 0.0632,
"step": 1990
},
{
"epoch": 1.9102196752626552,
"grad_norm": 0.037652261555194855,
"learning_rate": 9.073543457497613e-07,
"loss": 0.0676,
"step": 2000
},
{
"epoch": 1.9197707736389686,
"grad_norm": 0.0339687243103981,
"learning_rate": 8.118433619866285e-07,
"loss": 0.0048,
"step": 2010
},
{
"epoch": 1.9293218720152816,
"grad_norm": 0.043730951845645905,
"learning_rate": 7.163323782234957e-07,
"loss": 0.005,
"step": 2020
},
{
"epoch": 1.938872970391595,
"grad_norm": 0.044012073427438736,
"learning_rate": 6.20821394460363e-07,
"loss": 0.0738,
"step": 2030
},
{
"epoch": 1.9484240687679084,
"grad_norm": 0.34402212500572205,
"learning_rate": 5.253104106972302e-07,
"loss": 0.0062,
"step": 2040
},
{
"epoch": 1.9579751671442216,
"grad_norm": 0.06250176578760147,
"learning_rate": 4.2979942693409743e-07,
"loss": 0.0665,
"step": 2050
},
{
"epoch": 1.9675262655205348,
"grad_norm": 0.06154881417751312,
"learning_rate": 3.342884431709647e-07,
"loss": 0.0266,
"step": 2060
},
{
"epoch": 1.9770773638968482,
"grad_norm": 0.038224026560783386,
"learning_rate": 2.3877745940783193e-07,
"loss": 0.0039,
"step": 2070
},
{
"epoch": 1.9866284622731614,
"grad_norm": 0.668195903301239,
"learning_rate": 1.4326647564469915e-07,
"loss": 0.0162,
"step": 2080
},
{
"epoch": 1.9961795606494745,
"grad_norm": 0.048277534544467926,
"learning_rate": 4.775549188156639e-08,
"loss": 0.0049,
"step": 2090
},
{
"epoch": 2.0,
"eval_loss": 0.10492703318595886,
"eval_runtime": 1.2317,
"eval_samples_per_second": 755.069,
"eval_steps_per_second": 94.993,
"step": 2094
}
],
"logging_steps": 10,
"max_steps": 2094,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4403992385931264.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}