gpt-2-time-recognition / trainer_state.json
namesarnav's picture
Upload folder using huggingface_hub
c83d097 verified
{
"best_global_step": 5000,
"best_metric": 0.790944661822247,
"best_model_checkpoint": "./results/checkpoint-5000",
"epoch": 3.0,
"eval_steps": 250,
"global_step": 5340,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0056179775280898875,
"grad_norm": 37.83753967285156,
"learning_rate": 3.6e-07,
"loss": 0.5628,
"step": 10
},
{
"epoch": 0.011235955056179775,
"grad_norm": 18.735244750976562,
"learning_rate": 7.6e-07,
"loss": 0.5392,
"step": 20
},
{
"epoch": 0.016853932584269662,
"grad_norm": 27.133329391479492,
"learning_rate": 1.1600000000000001e-06,
"loss": 0.4695,
"step": 30
},
{
"epoch": 0.02247191011235955,
"grad_norm": 18.776302337646484,
"learning_rate": 1.56e-06,
"loss": 0.5545,
"step": 40
},
{
"epoch": 0.028089887640449437,
"grad_norm": 23.711435317993164,
"learning_rate": 1.9600000000000003e-06,
"loss": 0.5058,
"step": 50
},
{
"epoch": 0.033707865168539325,
"grad_norm": 21.259469985961914,
"learning_rate": 2.3600000000000003e-06,
"loss": 0.4333,
"step": 60
},
{
"epoch": 0.03932584269662921,
"grad_norm": 26.772079467773438,
"learning_rate": 2.7600000000000003e-06,
"loss": 0.4378,
"step": 70
},
{
"epoch": 0.0449438202247191,
"grad_norm": 18.497085571289062,
"learning_rate": 3.1600000000000002e-06,
"loss": 0.4262,
"step": 80
},
{
"epoch": 0.05056179775280899,
"grad_norm": 15.191393852233887,
"learning_rate": 3.5600000000000002e-06,
"loss": 0.3317,
"step": 90
},
{
"epoch": 0.056179775280898875,
"grad_norm": 14.540416717529297,
"learning_rate": 3.96e-06,
"loss": 0.3036,
"step": 100
},
{
"epoch": 0.06179775280898876,
"grad_norm": 7.341688632965088,
"learning_rate": 4.360000000000001e-06,
"loss": 0.315,
"step": 110
},
{
"epoch": 0.06741573033707865,
"grad_norm": 8.017271041870117,
"learning_rate": 4.76e-06,
"loss": 0.3194,
"step": 120
},
{
"epoch": 0.07303370786516854,
"grad_norm": 8.424580574035645,
"learning_rate": 5.1600000000000006e-06,
"loss": 0.2942,
"step": 130
},
{
"epoch": 0.07865168539325842,
"grad_norm": 3.8868699073791504,
"learning_rate": 5.560000000000001e-06,
"loss": 0.2451,
"step": 140
},
{
"epoch": 0.08426966292134831,
"grad_norm": 4.187905311584473,
"learning_rate": 5.9600000000000005e-06,
"loss": 0.2187,
"step": 150
},
{
"epoch": 0.0898876404494382,
"grad_norm": 3.5908498764038086,
"learning_rate": 6.360000000000001e-06,
"loss": 0.1697,
"step": 160
},
{
"epoch": 0.09550561797752809,
"grad_norm": 2.1136908531188965,
"learning_rate": 6.760000000000001e-06,
"loss": 0.1783,
"step": 170
},
{
"epoch": 0.10112359550561797,
"grad_norm": 2.29725980758667,
"learning_rate": 7.16e-06,
"loss": 0.161,
"step": 180
},
{
"epoch": 0.10674157303370786,
"grad_norm": 1.8517777919769287,
"learning_rate": 7.5600000000000005e-06,
"loss": 0.0987,
"step": 190
},
{
"epoch": 0.11235955056179775,
"grad_norm": 3.0938291549682617,
"learning_rate": 7.960000000000002e-06,
"loss": 0.1304,
"step": 200
},
{
"epoch": 0.11797752808988764,
"grad_norm": 1.1395810842514038,
"learning_rate": 8.36e-06,
"loss": 0.0922,
"step": 210
},
{
"epoch": 0.12359550561797752,
"grad_norm": 1.0684705972671509,
"learning_rate": 8.76e-06,
"loss": 0.0657,
"step": 220
},
{
"epoch": 0.12921348314606743,
"grad_norm": 1.4070613384246826,
"learning_rate": 9.16e-06,
"loss": 0.1045,
"step": 230
},
{
"epoch": 0.1348314606741573,
"grad_norm": 1.3161265850067139,
"learning_rate": 9.56e-06,
"loss": 0.0942,
"step": 240
},
{
"epoch": 0.1404494382022472,
"grad_norm": 1.0167522430419922,
"learning_rate": 9.960000000000001e-06,
"loss": 0.0972,
"step": 250
},
{
"epoch": 0.1404494382022472,
"eval_f1": 0.5152618314197704,
"eval_loss": 0.0704234316945076,
"eval_precision": 0.4922418405564473,
"eval_recall": 0.5405405405405406,
"eval_runtime": 28.0049,
"eval_samples_per_second": 140.333,
"eval_steps_per_second": 4.392,
"step": 250
},
{
"epoch": 0.14606741573033707,
"grad_norm": 0.8816568851470947,
"learning_rate": 1.036e-05,
"loss": 0.0733,
"step": 260
},
{
"epoch": 0.15168539325842698,
"grad_norm": 1.3037965297698975,
"learning_rate": 1.0760000000000002e-05,
"loss": 0.0576,
"step": 270
},
{
"epoch": 0.15730337078651685,
"grad_norm": 0.5775825381278992,
"learning_rate": 1.1160000000000002e-05,
"loss": 0.0648,
"step": 280
},
{
"epoch": 0.16292134831460675,
"grad_norm": 0.48878130316734314,
"learning_rate": 1.156e-05,
"loss": 0.0566,
"step": 290
},
{
"epoch": 0.16853932584269662,
"grad_norm": 1.132553219795227,
"learning_rate": 1.196e-05,
"loss": 0.0665,
"step": 300
},
{
"epoch": 0.17415730337078653,
"grad_norm": 3.24303936958313,
"learning_rate": 1.236e-05,
"loss": 0.0756,
"step": 310
},
{
"epoch": 0.1797752808988764,
"grad_norm": 1.5602407455444336,
"learning_rate": 1.2760000000000001e-05,
"loss": 0.0588,
"step": 320
},
{
"epoch": 0.1853932584269663,
"grad_norm": 0.718769371509552,
"learning_rate": 1.3160000000000001e-05,
"loss": 0.065,
"step": 330
},
{
"epoch": 0.19101123595505617,
"grad_norm": 0.5588079690933228,
"learning_rate": 1.3560000000000002e-05,
"loss": 0.0655,
"step": 340
},
{
"epoch": 0.19662921348314608,
"grad_norm": 0.7195122838020325,
"learning_rate": 1.396e-05,
"loss": 0.0547,
"step": 350
},
{
"epoch": 0.20224719101123595,
"grad_norm": 0.3126872777938843,
"learning_rate": 1.4360000000000001e-05,
"loss": 0.0399,
"step": 360
},
{
"epoch": 0.20786516853932585,
"grad_norm": 0.4424603581428528,
"learning_rate": 1.4760000000000001e-05,
"loss": 0.0822,
"step": 370
},
{
"epoch": 0.21348314606741572,
"grad_norm": 0.514411449432373,
"learning_rate": 1.516e-05,
"loss": 0.058,
"step": 380
},
{
"epoch": 0.21910112359550563,
"grad_norm": 0.8999485969543457,
"learning_rate": 1.556e-05,
"loss": 0.0696,
"step": 390
},
{
"epoch": 0.2247191011235955,
"grad_norm": 0.376237154006958,
"learning_rate": 1.5960000000000003e-05,
"loss": 0.0431,
"step": 400
},
{
"epoch": 0.2303370786516854,
"grad_norm": 0.3700561821460724,
"learning_rate": 1.636e-05,
"loss": 0.0355,
"step": 410
},
{
"epoch": 0.23595505617977527,
"grad_norm": 0.6543495059013367,
"learning_rate": 1.6760000000000002e-05,
"loss": 0.0539,
"step": 420
},
{
"epoch": 0.24157303370786518,
"grad_norm": 0.6010973453521729,
"learning_rate": 1.7160000000000002e-05,
"loss": 0.038,
"step": 430
},
{
"epoch": 0.24719101123595505,
"grad_norm": 0.723072350025177,
"learning_rate": 1.756e-05,
"loss": 0.0452,
"step": 440
},
{
"epoch": 0.25280898876404495,
"grad_norm": 0.7555616497993469,
"learning_rate": 1.796e-05,
"loss": 0.037,
"step": 450
},
{
"epoch": 0.25842696629213485,
"grad_norm": 0.9911591410636902,
"learning_rate": 1.8360000000000004e-05,
"loss": 0.054,
"step": 460
},
{
"epoch": 0.2640449438202247,
"grad_norm": 0.38651639223098755,
"learning_rate": 1.876e-05,
"loss": 0.0761,
"step": 470
},
{
"epoch": 0.2696629213483146,
"grad_norm": 0.461109459400177,
"learning_rate": 1.916e-05,
"loss": 0.0487,
"step": 480
},
{
"epoch": 0.2752808988764045,
"grad_norm": 0.5480913519859314,
"learning_rate": 1.9560000000000002e-05,
"loss": 0.0688,
"step": 490
},
{
"epoch": 0.2808988764044944,
"grad_norm": 0.5451459288597107,
"learning_rate": 1.9960000000000002e-05,
"loss": 0.0385,
"step": 500
},
{
"epoch": 0.2808988764044944,
"eval_f1": 0.6540447504302925,
"eval_loss": 0.040211886167526245,
"eval_precision": 0.6390134529147982,
"eval_recall": 0.6698002350176263,
"eval_runtime": 27.5658,
"eval_samples_per_second": 142.568,
"eval_steps_per_second": 4.462,
"step": 500
},
{
"epoch": 0.28651685393258425,
"grad_norm": 0.4356236159801483,
"learning_rate": 1.9962809917355374e-05,
"loss": 0.0509,
"step": 510
},
{
"epoch": 0.29213483146067415,
"grad_norm": 0.39902523159980774,
"learning_rate": 1.9921487603305786e-05,
"loss": 0.0419,
"step": 520
},
{
"epoch": 0.29775280898876405,
"grad_norm": 0.6652863025665283,
"learning_rate": 1.98801652892562e-05,
"loss": 0.0321,
"step": 530
},
{
"epoch": 0.30337078651685395,
"grad_norm": 0.6239707469940186,
"learning_rate": 1.9838842975206615e-05,
"loss": 0.0499,
"step": 540
},
{
"epoch": 0.3089887640449438,
"grad_norm": 0.6594592332839966,
"learning_rate": 1.9797520661157025e-05,
"loss": 0.0526,
"step": 550
},
{
"epoch": 0.3146067415730337,
"grad_norm": 1.5072342157363892,
"learning_rate": 1.975619834710744e-05,
"loss": 0.0493,
"step": 560
},
{
"epoch": 0.3202247191011236,
"grad_norm": 0.5700026154518127,
"learning_rate": 1.9714876033057854e-05,
"loss": 0.0492,
"step": 570
},
{
"epoch": 0.3258426966292135,
"grad_norm": 5.261657238006592,
"learning_rate": 1.9673553719008267e-05,
"loss": 0.0709,
"step": 580
},
{
"epoch": 0.33146067415730335,
"grad_norm": 0.6704587936401367,
"learning_rate": 1.963223140495868e-05,
"loss": 0.0417,
"step": 590
},
{
"epoch": 0.33707865168539325,
"grad_norm": 0.7397480607032776,
"learning_rate": 1.9590909090909092e-05,
"loss": 0.0501,
"step": 600
},
{
"epoch": 0.34269662921348315,
"grad_norm": 1.0854114294052124,
"learning_rate": 1.9549586776859505e-05,
"loss": 0.0397,
"step": 610
},
{
"epoch": 0.34831460674157305,
"grad_norm": 0.7938683032989502,
"learning_rate": 1.9508264462809918e-05,
"loss": 0.0344,
"step": 620
},
{
"epoch": 0.3539325842696629,
"grad_norm": 0.5936018228530884,
"learning_rate": 1.9466942148760334e-05,
"loss": 0.0428,
"step": 630
},
{
"epoch": 0.3595505617977528,
"grad_norm": 0.37717097997665405,
"learning_rate": 1.9425619834710743e-05,
"loss": 0.0457,
"step": 640
},
{
"epoch": 0.3651685393258427,
"grad_norm": 0.5075058937072754,
"learning_rate": 1.938429752066116e-05,
"loss": 0.0345,
"step": 650
},
{
"epoch": 0.3707865168539326,
"grad_norm": 1.0874806642532349,
"learning_rate": 1.9342975206611572e-05,
"loss": 0.0406,
"step": 660
},
{
"epoch": 0.37640449438202245,
"grad_norm": 0.7143360376358032,
"learning_rate": 1.9301652892561985e-05,
"loss": 0.0392,
"step": 670
},
{
"epoch": 0.38202247191011235,
"grad_norm": 1.010405421257019,
"learning_rate": 1.9260330578512398e-05,
"loss": 0.0489,
"step": 680
},
{
"epoch": 0.38764044943820225,
"grad_norm": 0.44814518094062805,
"learning_rate": 1.921900826446281e-05,
"loss": 0.0344,
"step": 690
},
{
"epoch": 0.39325842696629215,
"grad_norm": 0.5441784262657166,
"learning_rate": 1.9177685950413224e-05,
"loss": 0.0423,
"step": 700
},
{
"epoch": 0.398876404494382,
"grad_norm": 0.4021058678627014,
"learning_rate": 1.9136363636363636e-05,
"loss": 0.0432,
"step": 710
},
{
"epoch": 0.4044943820224719,
"grad_norm": 0.34465810656547546,
"learning_rate": 1.9095041322314053e-05,
"loss": 0.0428,
"step": 720
},
{
"epoch": 0.4101123595505618,
"grad_norm": 0.14510081708431244,
"learning_rate": 1.9053719008264465e-05,
"loss": 0.0295,
"step": 730
},
{
"epoch": 0.4157303370786517,
"grad_norm": 0.330493688583374,
"learning_rate": 1.9012396694214878e-05,
"loss": 0.0305,
"step": 740
},
{
"epoch": 0.42134831460674155,
"grad_norm": 0.21993879973888397,
"learning_rate": 1.897107438016529e-05,
"loss": 0.0334,
"step": 750
},
{
"epoch": 0.42134831460674155,
"eval_f1": 0.7062535857716581,
"eval_loss": 0.03633005544543266,
"eval_precision": 0.6900224215246636,
"eval_recall": 0.7232667450058754,
"eval_runtime": 28.2949,
"eval_samples_per_second": 138.895,
"eval_steps_per_second": 4.347,
"step": 750
},
{
"epoch": 0.42696629213483145,
"grad_norm": 0.623182475566864,
"learning_rate": 1.8929752066115704e-05,
"loss": 0.0434,
"step": 760
},
{
"epoch": 0.43258426966292135,
"grad_norm": 1.1757659912109375,
"learning_rate": 1.8888429752066117e-05,
"loss": 0.0588,
"step": 770
},
{
"epoch": 0.43820224719101125,
"grad_norm": 0.37911972403526306,
"learning_rate": 1.884710743801653e-05,
"loss": 0.0293,
"step": 780
},
{
"epoch": 0.4438202247191011,
"grad_norm": 0.671259343624115,
"learning_rate": 1.8805785123966946e-05,
"loss": 0.034,
"step": 790
},
{
"epoch": 0.449438202247191,
"grad_norm": 0.46087324619293213,
"learning_rate": 1.8764462809917355e-05,
"loss": 0.0339,
"step": 800
},
{
"epoch": 0.4550561797752809,
"grad_norm": 0.33612871170043945,
"learning_rate": 1.872314049586777e-05,
"loss": 0.0358,
"step": 810
},
{
"epoch": 0.4606741573033708,
"grad_norm": 0.30411240458488464,
"learning_rate": 1.8681818181818184e-05,
"loss": 0.0467,
"step": 820
},
{
"epoch": 0.46629213483146065,
"grad_norm": 0.26056644320487976,
"learning_rate": 1.8640495867768597e-05,
"loss": 0.0404,
"step": 830
},
{
"epoch": 0.47191011235955055,
"grad_norm": 0.7425059676170349,
"learning_rate": 1.859917355371901e-05,
"loss": 0.0519,
"step": 840
},
{
"epoch": 0.47752808988764045,
"grad_norm": 0.20595073699951172,
"learning_rate": 1.8557851239669422e-05,
"loss": 0.0373,
"step": 850
},
{
"epoch": 0.48314606741573035,
"grad_norm": 0.706141471862793,
"learning_rate": 1.8516528925619835e-05,
"loss": 0.0427,
"step": 860
},
{
"epoch": 0.4887640449438202,
"grad_norm": 0.27146199345588684,
"learning_rate": 1.8475206611570248e-05,
"loss": 0.0406,
"step": 870
},
{
"epoch": 0.4943820224719101,
"grad_norm": 0.40938302874565125,
"learning_rate": 1.8433884297520664e-05,
"loss": 0.0373,
"step": 880
},
{
"epoch": 0.5,
"grad_norm": 0.43672415614128113,
"learning_rate": 1.8392561983471077e-05,
"loss": 0.0613,
"step": 890
},
{
"epoch": 0.5056179775280899,
"grad_norm": 0.3265509009361267,
"learning_rate": 1.835123966942149e-05,
"loss": 0.0433,
"step": 900
},
{
"epoch": 0.5112359550561798,
"grad_norm": 0.33285167813301086,
"learning_rate": 1.8309917355371903e-05,
"loss": 0.0435,
"step": 910
},
{
"epoch": 0.5168539325842697,
"grad_norm": 0.46705156564712524,
"learning_rate": 1.8268595041322316e-05,
"loss": 0.0316,
"step": 920
},
{
"epoch": 0.5224719101123596,
"grad_norm": 0.34717079997062683,
"learning_rate": 1.822727272727273e-05,
"loss": 0.0432,
"step": 930
},
{
"epoch": 0.5280898876404494,
"grad_norm": 0.6267735362052917,
"learning_rate": 1.818595041322314e-05,
"loss": 0.0391,
"step": 940
},
{
"epoch": 0.5337078651685393,
"grad_norm": 0.29985982179641724,
"learning_rate": 1.8144628099173557e-05,
"loss": 0.0314,
"step": 950
},
{
"epoch": 0.5393258426966292,
"grad_norm": 0.5540522933006287,
"learning_rate": 1.8103305785123967e-05,
"loss": 0.0434,
"step": 960
},
{
"epoch": 0.5449438202247191,
"grad_norm": 1.0504409074783325,
"learning_rate": 1.8061983471074383e-05,
"loss": 0.0369,
"step": 970
},
{
"epoch": 0.550561797752809,
"grad_norm": 0.1691039800643921,
"learning_rate": 1.8020661157024796e-05,
"loss": 0.0288,
"step": 980
},
{
"epoch": 0.5561797752808989,
"grad_norm": 0.20762376487255096,
"learning_rate": 1.797933884297521e-05,
"loss": 0.021,
"step": 990
},
{
"epoch": 0.5617977528089888,
"grad_norm": 0.5355738997459412,
"learning_rate": 1.793801652892562e-05,
"loss": 0.0335,
"step": 1000
},
{
"epoch": 0.5617977528089888,
"eval_f1": 0.7190332326283987,
"eval_loss": 0.03450481593608856,
"eval_precision": 0.6750902527075813,
"eval_recall": 0.7690951821386603,
"eval_runtime": 32.9589,
"eval_samples_per_second": 119.239,
"eval_steps_per_second": 3.732,
"step": 1000
},
{
"epoch": 0.5674157303370787,
"grad_norm": 0.2527494430541992,
"learning_rate": 1.7896694214876034e-05,
"loss": 0.0336,
"step": 1010
},
{
"epoch": 0.5730337078651685,
"grad_norm": 0.21574370563030243,
"learning_rate": 1.7855371900826447e-05,
"loss": 0.0401,
"step": 1020
},
{
"epoch": 0.5786516853932584,
"grad_norm": 0.3417276442050934,
"learning_rate": 1.781404958677686e-05,
"loss": 0.0268,
"step": 1030
},
{
"epoch": 0.5842696629213483,
"grad_norm": 0.3619694411754608,
"learning_rate": 1.7772727272727276e-05,
"loss": 0.052,
"step": 1040
},
{
"epoch": 0.5898876404494382,
"grad_norm": 0.4339875876903534,
"learning_rate": 1.7731404958677685e-05,
"loss": 0.0409,
"step": 1050
},
{
"epoch": 0.5955056179775281,
"grad_norm": 0.31473612785339355,
"learning_rate": 1.76900826446281e-05,
"loss": 0.0433,
"step": 1060
},
{
"epoch": 0.601123595505618,
"grad_norm": 0.3506283760070801,
"learning_rate": 1.7648760330578514e-05,
"loss": 0.0345,
"step": 1070
},
{
"epoch": 0.6067415730337079,
"grad_norm": 0.31608250737190247,
"learning_rate": 1.7607438016528927e-05,
"loss": 0.0373,
"step": 1080
},
{
"epoch": 0.6123595505617978,
"grad_norm": 0.9107354879379272,
"learning_rate": 1.756611570247934e-05,
"loss": 0.0325,
"step": 1090
},
{
"epoch": 0.6179775280898876,
"grad_norm": 1.0891706943511963,
"learning_rate": 1.7524793388429753e-05,
"loss": 0.0424,
"step": 1100
},
{
"epoch": 0.6235955056179775,
"grad_norm": 0.716555118560791,
"learning_rate": 1.7483471074380166e-05,
"loss": 0.0332,
"step": 1110
},
{
"epoch": 0.6292134831460674,
"grad_norm": 0.33739280700683594,
"learning_rate": 1.744214876033058e-05,
"loss": 0.029,
"step": 1120
},
{
"epoch": 0.6348314606741573,
"grad_norm": 0.25892215967178345,
"learning_rate": 1.7400826446280995e-05,
"loss": 0.0205,
"step": 1130
},
{
"epoch": 0.6404494382022472,
"grad_norm": 0.6945488452911377,
"learning_rate": 1.7359504132231407e-05,
"loss": 0.0262,
"step": 1140
},
{
"epoch": 0.6460674157303371,
"grad_norm": 0.511842668056488,
"learning_rate": 1.731818181818182e-05,
"loss": 0.0379,
"step": 1150
},
{
"epoch": 0.651685393258427,
"grad_norm": 4.446812152862549,
"learning_rate": 1.7276859504132233e-05,
"loss": 0.0473,
"step": 1160
},
{
"epoch": 0.6573033707865169,
"grad_norm": 0.35227376222610474,
"learning_rate": 1.7235537190082646e-05,
"loss": 0.0397,
"step": 1170
},
{
"epoch": 0.6629213483146067,
"grad_norm": 0.489005446434021,
"learning_rate": 1.719421487603306e-05,
"loss": 0.033,
"step": 1180
},
{
"epoch": 0.6685393258426966,
"grad_norm": 0.20524592697620392,
"learning_rate": 1.715289256198347e-05,
"loss": 0.0384,
"step": 1190
},
{
"epoch": 0.6741573033707865,
"grad_norm": 0.2928679287433624,
"learning_rate": 1.7111570247933888e-05,
"loss": 0.0308,
"step": 1200
},
{
"epoch": 0.6797752808988764,
"grad_norm": 0.3542841970920563,
"learning_rate": 1.7070247933884297e-05,
"loss": 0.0411,
"step": 1210
},
{
"epoch": 0.6853932584269663,
"grad_norm": 0.39853426814079285,
"learning_rate": 1.7028925619834713e-05,
"loss": 0.0332,
"step": 1220
},
{
"epoch": 0.6910112359550562,
"grad_norm": 0.7216328382492065,
"learning_rate": 1.6987603305785126e-05,
"loss": 0.0283,
"step": 1230
},
{
"epoch": 0.6966292134831461,
"grad_norm": 0.5684111714363098,
"learning_rate": 1.694628099173554e-05,
"loss": 0.0368,
"step": 1240
},
{
"epoch": 0.702247191011236,
"grad_norm": 0.2529934048652649,
"learning_rate": 1.690495867768595e-05,
"loss": 0.0329,
"step": 1250
},
{
"epoch": 0.702247191011236,
"eval_f1": 0.7409268565047459,
"eval_loss": 0.03298617899417877,
"eval_precision": 0.7058510638297872,
"eval_recall": 0.7796709753231492,
"eval_runtime": 32.8203,
"eval_samples_per_second": 119.743,
"eval_steps_per_second": 3.748,
"step": 1250
},
{
"epoch": 0.7078651685393258,
"grad_norm": 0.9135796427726746,
"learning_rate": 1.6863636363636364e-05,
"loss": 0.0296,
"step": 1260
},
{
"epoch": 0.7134831460674157,
"grad_norm": 0.4056197702884674,
"learning_rate": 1.6822314049586777e-05,
"loss": 0.0324,
"step": 1270
},
{
"epoch": 0.7191011235955056,
"grad_norm": 0.36684852838516235,
"learning_rate": 1.678099173553719e-05,
"loss": 0.0322,
"step": 1280
},
{
"epoch": 0.7247191011235955,
"grad_norm": 0.2179303914308548,
"learning_rate": 1.6739669421487606e-05,
"loss": 0.0208,
"step": 1290
},
{
"epoch": 0.7303370786516854,
"grad_norm": 1.9967374801635742,
"learning_rate": 1.669834710743802e-05,
"loss": 0.0344,
"step": 1300
},
{
"epoch": 0.7359550561797753,
"grad_norm": 0.5518152713775635,
"learning_rate": 1.6657024793388432e-05,
"loss": 0.033,
"step": 1310
},
{
"epoch": 0.7415730337078652,
"grad_norm": 0.8944317698478699,
"learning_rate": 1.6615702479338845e-05,
"loss": 0.0336,
"step": 1320
},
{
"epoch": 0.7471910112359551,
"grad_norm": 1.0821423530578613,
"learning_rate": 1.6574380165289258e-05,
"loss": 0.0359,
"step": 1330
},
{
"epoch": 0.7528089887640449,
"grad_norm": 0.30826064944267273,
"learning_rate": 1.653305785123967e-05,
"loss": 0.0229,
"step": 1340
},
{
"epoch": 0.7584269662921348,
"grad_norm": 0.12171895056962967,
"learning_rate": 1.6491735537190083e-05,
"loss": 0.0325,
"step": 1350
},
{
"epoch": 0.7640449438202247,
"grad_norm": 0.304765909910202,
"learning_rate": 1.64504132231405e-05,
"loss": 0.0422,
"step": 1360
},
{
"epoch": 0.7696629213483146,
"grad_norm": 0.2779518961906433,
"learning_rate": 1.640909090909091e-05,
"loss": 0.0241,
"step": 1370
},
{
"epoch": 0.7752808988764045,
"grad_norm": 0.5539456605911255,
"learning_rate": 1.6367768595041325e-05,
"loss": 0.0338,
"step": 1380
},
{
"epoch": 0.7808988764044944,
"grad_norm": 0.3549717664718628,
"learning_rate": 1.6326446280991738e-05,
"loss": 0.0379,
"step": 1390
},
{
"epoch": 0.7865168539325843,
"grad_norm": 2.0511608123779297,
"learning_rate": 1.628512396694215e-05,
"loss": 0.0432,
"step": 1400
},
{
"epoch": 0.7921348314606742,
"grad_norm": 0.6535409688949585,
"learning_rate": 1.6243801652892563e-05,
"loss": 0.0263,
"step": 1410
},
{
"epoch": 0.797752808988764,
"grad_norm": 0.5641036629676819,
"learning_rate": 1.6202479338842976e-05,
"loss": 0.0444,
"step": 1420
},
{
"epoch": 0.8033707865168539,
"grad_norm": 0.2755410373210907,
"learning_rate": 1.616115702479339e-05,
"loss": 0.0382,
"step": 1430
},
{
"epoch": 0.8089887640449438,
"grad_norm": 0.48516207933425903,
"learning_rate": 1.6119834710743802e-05,
"loss": 0.0436,
"step": 1440
},
{
"epoch": 0.8146067415730337,
"grad_norm": 0.2544703781604767,
"learning_rate": 1.6078512396694218e-05,
"loss": 0.0259,
"step": 1450
},
{
"epoch": 0.8202247191011236,
"grad_norm": 0.38116955757141113,
"learning_rate": 1.603719008264463e-05,
"loss": 0.0363,
"step": 1460
},
{
"epoch": 0.8258426966292135,
"grad_norm": 0.1579870879650116,
"learning_rate": 1.5995867768595044e-05,
"loss": 0.0312,
"step": 1470
},
{
"epoch": 0.8314606741573034,
"grad_norm": 0.31899958848953247,
"learning_rate": 1.5954545454545456e-05,
"loss": 0.0328,
"step": 1480
},
{
"epoch": 0.8370786516853933,
"grad_norm": 0.2832179367542267,
"learning_rate": 1.591322314049587e-05,
"loss": 0.0321,
"step": 1490
},
{
"epoch": 0.8426966292134831,
"grad_norm": 0.43299055099487305,
"learning_rate": 1.5871900826446282e-05,
"loss": 0.0366,
"step": 1500
},
{
"epoch": 0.8426966292134831,
"eval_f1": 0.7306652244456463,
"eval_loss": 0.03298529237508774,
"eval_precision": 0.6768537074148296,
"eval_recall": 0.7937720329024677,
"eval_runtime": 33.137,
"eval_samples_per_second": 118.599,
"eval_steps_per_second": 3.712,
"step": 1500
},
{
"epoch": 0.848314606741573,
"grad_norm": 0.5625007748603821,
"learning_rate": 1.5830578512396695e-05,
"loss": 0.0256,
"step": 1510
},
{
"epoch": 0.8539325842696629,
"grad_norm": 0.5187763571739197,
"learning_rate": 1.5789256198347108e-05,
"loss": 0.0328,
"step": 1520
},
{
"epoch": 0.8595505617977528,
"grad_norm": 0.4886994957923889,
"learning_rate": 1.574793388429752e-05,
"loss": 0.0547,
"step": 1530
},
{
"epoch": 0.8651685393258427,
"grad_norm": 0.5035399198532104,
"learning_rate": 1.5706611570247937e-05,
"loss": 0.0387,
"step": 1540
},
{
"epoch": 0.8707865168539326,
"grad_norm": 0.8656401634216309,
"learning_rate": 1.566528925619835e-05,
"loss": 0.0335,
"step": 1550
},
{
"epoch": 0.8764044943820225,
"grad_norm": 0.5307649970054626,
"learning_rate": 1.562396694214876e-05,
"loss": 0.0287,
"step": 1560
},
{
"epoch": 0.8820224719101124,
"grad_norm": 0.47168630361557007,
"learning_rate": 1.5582644628099175e-05,
"loss": 0.0368,
"step": 1570
},
{
"epoch": 0.8876404494382022,
"grad_norm": 0.38605886697769165,
"learning_rate": 1.5541322314049588e-05,
"loss": 0.0277,
"step": 1580
},
{
"epoch": 0.8932584269662921,
"grad_norm": 0.18422210216522217,
"learning_rate": 1.55e-05,
"loss": 0.03,
"step": 1590
},
{
"epoch": 0.898876404494382,
"grad_norm": 0.5309029221534729,
"learning_rate": 1.5458677685950413e-05,
"loss": 0.0361,
"step": 1600
},
{
"epoch": 0.9044943820224719,
"grad_norm": 0.13399222493171692,
"learning_rate": 1.541735537190083e-05,
"loss": 0.0324,
"step": 1610
},
{
"epoch": 0.9101123595505618,
"grad_norm": 0.441811740398407,
"learning_rate": 1.537603305785124e-05,
"loss": 0.0314,
"step": 1620
},
{
"epoch": 0.9157303370786517,
"grad_norm": 0.4198859930038452,
"learning_rate": 1.5334710743801655e-05,
"loss": 0.0405,
"step": 1630
},
{
"epoch": 0.9213483146067416,
"grad_norm": 1.2370645999908447,
"learning_rate": 1.5293388429752068e-05,
"loss": 0.0436,
"step": 1640
},
{
"epoch": 0.9269662921348315,
"grad_norm": 0.31183677911758423,
"learning_rate": 1.525206611570248e-05,
"loss": 0.0311,
"step": 1650
},
{
"epoch": 0.9325842696629213,
"grad_norm": 0.6841112375259399,
"learning_rate": 1.5210743801652894e-05,
"loss": 0.0263,
"step": 1660
},
{
"epoch": 0.9382022471910112,
"grad_norm": 0.744534969329834,
"learning_rate": 1.5169421487603307e-05,
"loss": 0.0434,
"step": 1670
},
{
"epoch": 0.9438202247191011,
"grad_norm": 0.5860826969146729,
"learning_rate": 1.5128099173553721e-05,
"loss": 0.0309,
"step": 1680
},
{
"epoch": 0.949438202247191,
"grad_norm": 0.30711379647254944,
"learning_rate": 1.5086776859504134e-05,
"loss": 0.0349,
"step": 1690
},
{
"epoch": 0.9550561797752809,
"grad_norm": 0.46106937527656555,
"learning_rate": 1.5045454545454548e-05,
"loss": 0.0238,
"step": 1700
},
{
"epoch": 0.9606741573033708,
"grad_norm": 0.7039032578468323,
"learning_rate": 1.500413223140496e-05,
"loss": 0.0495,
"step": 1710
},
{
"epoch": 0.9662921348314607,
"grad_norm": 0.5371297597885132,
"learning_rate": 1.4962809917355372e-05,
"loss": 0.024,
"step": 1720
},
{
"epoch": 0.9719101123595506,
"grad_norm": 0.18199366331100464,
"learning_rate": 1.4921487603305787e-05,
"loss": 0.0294,
"step": 1730
},
{
"epoch": 0.9775280898876404,
"grad_norm": 0.4313061535358429,
"learning_rate": 1.48801652892562e-05,
"loss": 0.0314,
"step": 1740
},
{
"epoch": 0.9831460674157303,
"grad_norm": 0.633383572101593,
"learning_rate": 1.4838842975206614e-05,
"loss": 0.0307,
"step": 1750
},
{
"epoch": 0.9831460674157303,
"eval_f1": 0.7417628836947339,
"eval_loss": 0.02960079535841942,
"eval_precision": 0.7122769064359114,
"eval_recall": 0.7737955346650999,
"eval_runtime": 33.4177,
"eval_samples_per_second": 117.602,
"eval_steps_per_second": 3.681,
"step": 1750
},
{
"epoch": 0.9887640449438202,
"grad_norm": 0.36454710364341736,
"learning_rate": 1.4797520661157025e-05,
"loss": 0.0314,
"step": 1760
},
{
"epoch": 0.9943820224719101,
"grad_norm": 0.4246029555797577,
"learning_rate": 1.475619834710744e-05,
"loss": 0.0262,
"step": 1770
},
{
"epoch": 1.0,
"grad_norm": 0.33757057785987854,
"learning_rate": 1.4714876033057852e-05,
"loss": 0.028,
"step": 1780
},
{
"epoch": 1.0056179775280898,
"grad_norm": 0.30218544602394104,
"learning_rate": 1.4673553719008267e-05,
"loss": 0.0185,
"step": 1790
},
{
"epoch": 1.0112359550561798,
"grad_norm": 0.4351919889450073,
"learning_rate": 1.4632231404958678e-05,
"loss": 0.0235,
"step": 1800
},
{
"epoch": 1.0168539325842696,
"grad_norm": 1.160334587097168,
"learning_rate": 1.4590909090909091e-05,
"loss": 0.0348,
"step": 1810
},
{
"epoch": 1.0224719101123596,
"grad_norm": 0.324881911277771,
"learning_rate": 1.4549586776859505e-05,
"loss": 0.0287,
"step": 1820
},
{
"epoch": 1.0280898876404494,
"grad_norm": 0.21142716705799103,
"learning_rate": 1.4508264462809918e-05,
"loss": 0.0421,
"step": 1830
},
{
"epoch": 1.0337078651685394,
"grad_norm": 0.14528660476207733,
"learning_rate": 1.4466942148760333e-05,
"loss": 0.0281,
"step": 1840
},
{
"epoch": 1.0393258426966292,
"grad_norm": 0.33770596981048584,
"learning_rate": 1.4425619834710744e-05,
"loss": 0.0365,
"step": 1850
},
{
"epoch": 1.0449438202247192,
"grad_norm": 0.3050376772880554,
"learning_rate": 1.4384297520661158e-05,
"loss": 0.0253,
"step": 1860
},
{
"epoch": 1.050561797752809,
"grad_norm": 0.706710696220398,
"learning_rate": 1.4342975206611571e-05,
"loss": 0.0245,
"step": 1870
},
{
"epoch": 1.0561797752808988,
"grad_norm": 0.23844891786575317,
"learning_rate": 1.4301652892561986e-05,
"loss": 0.02,
"step": 1880
},
{
"epoch": 1.0617977528089888,
"grad_norm": 0.4411591589450836,
"learning_rate": 1.4260330578512398e-05,
"loss": 0.0251,
"step": 1890
},
{
"epoch": 1.0674157303370786,
"grad_norm": 0.23174303770065308,
"learning_rate": 1.421900826446281e-05,
"loss": 0.0203,
"step": 1900
},
{
"epoch": 1.0730337078651686,
"grad_norm": 1.1798287630081177,
"learning_rate": 1.4177685950413224e-05,
"loss": 0.0356,
"step": 1910
},
{
"epoch": 1.0786516853932584,
"grad_norm": 0.33650219440460205,
"learning_rate": 1.4136363636363637e-05,
"loss": 0.0265,
"step": 1920
},
{
"epoch": 1.0842696629213484,
"grad_norm": 0.21325266361236572,
"learning_rate": 1.4095041322314051e-05,
"loss": 0.0183,
"step": 1930
},
{
"epoch": 1.0898876404494382,
"grad_norm": 0.4521240293979645,
"learning_rate": 1.4053719008264464e-05,
"loss": 0.0268,
"step": 1940
},
{
"epoch": 1.095505617977528,
"grad_norm": 0.4072856307029724,
"learning_rate": 1.4012396694214879e-05,
"loss": 0.0402,
"step": 1950
},
{
"epoch": 1.101123595505618,
"grad_norm": 0.47774896025657654,
"learning_rate": 1.397107438016529e-05,
"loss": 0.0419,
"step": 1960
},
{
"epoch": 1.1067415730337078,
"grad_norm": 0.4504724442958832,
"learning_rate": 1.3929752066115703e-05,
"loss": 0.0273,
"step": 1970
},
{
"epoch": 1.1123595505617978,
"grad_norm": 0.4287577271461487,
"learning_rate": 1.3888429752066117e-05,
"loss": 0.0367,
"step": 1980
},
{
"epoch": 1.1179775280898876,
"grad_norm": 0.37722307443618774,
"learning_rate": 1.384710743801653e-05,
"loss": 0.0239,
"step": 1990
},
{
"epoch": 1.1235955056179776,
"grad_norm": 0.22570855915546417,
"learning_rate": 1.3805785123966944e-05,
"loss": 0.0224,
"step": 2000
},
{
"epoch": 1.1235955056179776,
"eval_f1": 0.7568022440392708,
"eval_loss": 0.02905386872589588,
"eval_precision": 0.7241009125067096,
"eval_recall": 0.7925969447708578,
"eval_runtime": 28.6809,
"eval_samples_per_second": 137.025,
"eval_steps_per_second": 4.289,
"step": 2000
},
{
"epoch": 1.1292134831460674,
"grad_norm": 0.728447437286377,
"learning_rate": 1.3764462809917355e-05,
"loss": 0.022,
"step": 2010
},
{
"epoch": 1.1348314606741572,
"grad_norm": 0.10380419343709946,
"learning_rate": 1.372314049586777e-05,
"loss": 0.0181,
"step": 2020
},
{
"epoch": 1.1404494382022472,
"grad_norm": 0.38085779547691345,
"learning_rate": 1.3681818181818183e-05,
"loss": 0.0308,
"step": 2030
},
{
"epoch": 1.146067415730337,
"grad_norm": 0.5472461581230164,
"learning_rate": 1.3640495867768597e-05,
"loss": 0.0391,
"step": 2040
},
{
"epoch": 1.151685393258427,
"grad_norm": 0.4686163663864136,
"learning_rate": 1.359917355371901e-05,
"loss": 0.0259,
"step": 2050
},
{
"epoch": 1.1573033707865168,
"grad_norm": 0.5144456028938293,
"learning_rate": 1.3557851239669421e-05,
"loss": 0.0252,
"step": 2060
},
{
"epoch": 1.1629213483146068,
"grad_norm": 0.5633528232574463,
"learning_rate": 1.3516528925619836e-05,
"loss": 0.0309,
"step": 2070
},
{
"epoch": 1.1685393258426966,
"grad_norm": 0.23379996418952942,
"learning_rate": 1.3475206611570249e-05,
"loss": 0.0206,
"step": 2080
},
{
"epoch": 1.1741573033707866,
"grad_norm": 1.475098729133606,
"learning_rate": 1.3433884297520663e-05,
"loss": 0.0399,
"step": 2090
},
{
"epoch": 1.1797752808988764,
"grad_norm": 0.6528710126876831,
"learning_rate": 1.3392561983471076e-05,
"loss": 0.0442,
"step": 2100
},
{
"epoch": 1.1853932584269664,
"grad_norm": 0.5824321508407593,
"learning_rate": 1.335123966942149e-05,
"loss": 0.0314,
"step": 2110
},
{
"epoch": 1.1910112359550562,
"grad_norm": 0.8506532311439514,
"learning_rate": 1.3309917355371901e-05,
"loss": 0.0245,
"step": 2120
},
{
"epoch": 1.196629213483146,
"grad_norm": 0.7218942046165466,
"learning_rate": 1.3268595041322314e-05,
"loss": 0.0299,
"step": 2130
},
{
"epoch": 1.202247191011236,
"grad_norm": 0.2650141716003418,
"learning_rate": 1.3227272727272729e-05,
"loss": 0.0235,
"step": 2140
},
{
"epoch": 1.2078651685393258,
"grad_norm": 0.1138467863202095,
"learning_rate": 1.3185950413223142e-05,
"loss": 0.0311,
"step": 2150
},
{
"epoch": 1.2134831460674158,
"grad_norm": 0.3181060254573822,
"learning_rate": 1.3144628099173556e-05,
"loss": 0.0321,
"step": 2160
},
{
"epoch": 1.2191011235955056,
"grad_norm": 0.2905648648738861,
"learning_rate": 1.3103305785123967e-05,
"loss": 0.0241,
"step": 2170
},
{
"epoch": 1.2247191011235956,
"grad_norm": 0.18806235492229462,
"learning_rate": 1.3061983471074382e-05,
"loss": 0.0233,
"step": 2180
},
{
"epoch": 1.2303370786516854,
"grad_norm": 0.3551190495491028,
"learning_rate": 1.3020661157024794e-05,
"loss": 0.0348,
"step": 2190
},
{
"epoch": 1.2359550561797752,
"grad_norm": 0.2311626672744751,
"learning_rate": 1.2979338842975209e-05,
"loss": 0.0352,
"step": 2200
},
{
"epoch": 1.2415730337078652,
"grad_norm": 0.41256803274154663,
"learning_rate": 1.2938016528925622e-05,
"loss": 0.0334,
"step": 2210
},
{
"epoch": 1.247191011235955,
"grad_norm": 0.23223748803138733,
"learning_rate": 1.2896694214876033e-05,
"loss": 0.0285,
"step": 2220
},
{
"epoch": 1.252808988764045,
"grad_norm": 0.2413235753774643,
"learning_rate": 1.2855371900826447e-05,
"loss": 0.0193,
"step": 2230
},
{
"epoch": 1.2584269662921348,
"grad_norm": 0.5607805252075195,
"learning_rate": 1.281404958677686e-05,
"loss": 0.022,
"step": 2240
},
{
"epoch": 1.2640449438202248,
"grad_norm": 0.1772303432226181,
"learning_rate": 1.2772727272727275e-05,
"loss": 0.0331,
"step": 2250
},
{
"epoch": 1.2640449438202248,
"eval_f1": 0.760931289040318,
"eval_loss": 0.028664391487836838,
"eval_precision": 0.7362637362637363,
"eval_recall": 0.7873090481786134,
"eval_runtime": 28.3945,
"eval_samples_per_second": 138.407,
"eval_steps_per_second": 4.332,
"step": 2250
},
{
"epoch": 1.2696629213483146,
"grad_norm": 0.2562411427497864,
"learning_rate": 1.2731404958677686e-05,
"loss": 0.0295,
"step": 2260
},
{
"epoch": 1.2752808988764044,
"grad_norm": 0.13123421370983124,
"learning_rate": 1.26900826446281e-05,
"loss": 0.0306,
"step": 2270
},
{
"epoch": 1.2808988764044944,
"grad_norm": 0.1774195283651352,
"learning_rate": 1.2648760330578513e-05,
"loss": 0.0221,
"step": 2280
},
{
"epoch": 1.2865168539325842,
"grad_norm": 0.21137966215610504,
"learning_rate": 1.2607438016528926e-05,
"loss": 0.024,
"step": 2290
},
{
"epoch": 1.2921348314606742,
"grad_norm": 0.3009016513824463,
"learning_rate": 1.256611570247934e-05,
"loss": 0.024,
"step": 2300
},
{
"epoch": 1.297752808988764,
"grad_norm": 1.0978662967681885,
"learning_rate": 1.2524793388429752e-05,
"loss": 0.0285,
"step": 2310
},
{
"epoch": 1.303370786516854,
"grad_norm": 0.17469504475593567,
"learning_rate": 1.2483471074380166e-05,
"loss": 0.0242,
"step": 2320
},
{
"epoch": 1.3089887640449438,
"grad_norm": 0.227843776345253,
"learning_rate": 1.2442148760330579e-05,
"loss": 0.0248,
"step": 2330
},
{
"epoch": 1.3146067415730336,
"grad_norm": 0.2597135901451111,
"learning_rate": 1.2400826446280993e-05,
"loss": 0.0184,
"step": 2340
},
{
"epoch": 1.3202247191011236,
"grad_norm": 0.3049301207065582,
"learning_rate": 1.2359504132231406e-05,
"loss": 0.0352,
"step": 2350
},
{
"epoch": 1.3258426966292136,
"grad_norm": 0.6089704036712646,
"learning_rate": 1.231818181818182e-05,
"loss": 0.0267,
"step": 2360
},
{
"epoch": 1.3314606741573034,
"grad_norm": 0.29360252618789673,
"learning_rate": 1.2276859504132232e-05,
"loss": 0.042,
"step": 2370
},
{
"epoch": 1.3370786516853932,
"grad_norm": 0.21009930968284607,
"learning_rate": 1.2235537190082645e-05,
"loss": 0.0242,
"step": 2380
},
{
"epoch": 1.3426966292134832,
"grad_norm": 0.3115074932575226,
"learning_rate": 1.2194214876033059e-05,
"loss": 0.022,
"step": 2390
},
{
"epoch": 1.348314606741573,
"grad_norm": 0.5562008023262024,
"learning_rate": 1.2152892561983472e-05,
"loss": 0.0355,
"step": 2400
},
{
"epoch": 1.3539325842696628,
"grad_norm": 0.5737345218658447,
"learning_rate": 1.2111570247933886e-05,
"loss": 0.0268,
"step": 2410
},
{
"epoch": 1.3595505617977528,
"grad_norm": 0.10395320504903793,
"learning_rate": 1.2070247933884298e-05,
"loss": 0.0139,
"step": 2420
},
{
"epoch": 1.3651685393258428,
"grad_norm": 0.3552614450454712,
"learning_rate": 1.2028925619834712e-05,
"loss": 0.0332,
"step": 2430
},
{
"epoch": 1.3707865168539326,
"grad_norm": 0.46564894914627075,
"learning_rate": 1.1987603305785125e-05,
"loss": 0.0295,
"step": 2440
},
{
"epoch": 1.3764044943820224,
"grad_norm": 0.16816848516464233,
"learning_rate": 1.1946280991735538e-05,
"loss": 0.0368,
"step": 2450
},
{
"epoch": 1.3820224719101124,
"grad_norm": 0.6996704936027527,
"learning_rate": 1.1904958677685952e-05,
"loss": 0.0274,
"step": 2460
},
{
"epoch": 1.3876404494382022,
"grad_norm": 0.14424335956573486,
"learning_rate": 1.1863636363636363e-05,
"loss": 0.0256,
"step": 2470
},
{
"epoch": 1.3932584269662922,
"grad_norm": 0.287166565656662,
"learning_rate": 1.1822314049586778e-05,
"loss": 0.0251,
"step": 2480
},
{
"epoch": 1.398876404494382,
"grad_norm": 0.31948205828666687,
"learning_rate": 1.178099173553719e-05,
"loss": 0.0325,
"step": 2490
},
{
"epoch": 1.404494382022472,
"grad_norm": 0.5256792902946472,
"learning_rate": 1.1739669421487605e-05,
"loss": 0.028,
"step": 2500
},
{
"epoch": 1.404494382022472,
"eval_f1": 0.7634016278417064,
"eval_loss": 0.027553008869290352,
"eval_precision": 0.7307898979043524,
"eval_recall": 0.799059929494712,
"eval_runtime": 28.5911,
"eval_samples_per_second": 137.455,
"eval_steps_per_second": 4.302,
"step": 2500
},
{
"epoch": 1.4101123595505618,
"grad_norm": 0.47131842374801636,
"learning_rate": 1.1698347107438018e-05,
"loss": 0.0224,
"step": 2510
},
{
"epoch": 1.4157303370786516,
"grad_norm": 0.45454141497612,
"learning_rate": 1.1657024793388432e-05,
"loss": 0.0212,
"step": 2520
},
{
"epoch": 1.4213483146067416,
"grad_norm": 0.542829155921936,
"learning_rate": 1.1615702479338843e-05,
"loss": 0.0362,
"step": 2530
},
{
"epoch": 1.4269662921348314,
"grad_norm": 0.2345789521932602,
"learning_rate": 1.1574380165289256e-05,
"loss": 0.0325,
"step": 2540
},
{
"epoch": 1.4325842696629214,
"grad_norm": 0.3538082540035248,
"learning_rate": 1.153305785123967e-05,
"loss": 0.0319,
"step": 2550
},
{
"epoch": 1.4382022471910112,
"grad_norm": 0.30572107434272766,
"learning_rate": 1.1491735537190084e-05,
"loss": 0.0258,
"step": 2560
},
{
"epoch": 1.4438202247191012,
"grad_norm": 0.6635351777076721,
"learning_rate": 1.1450413223140498e-05,
"loss": 0.0184,
"step": 2570
},
{
"epoch": 1.449438202247191,
"grad_norm": 0.15726110339164734,
"learning_rate": 1.140909090909091e-05,
"loss": 0.0285,
"step": 2580
},
{
"epoch": 1.4550561797752808,
"grad_norm": 0.46479687094688416,
"learning_rate": 1.1367768595041324e-05,
"loss": 0.0282,
"step": 2590
},
{
"epoch": 1.4606741573033708,
"grad_norm": 0.3639979064464569,
"learning_rate": 1.1326446280991737e-05,
"loss": 0.022,
"step": 2600
},
{
"epoch": 1.4662921348314606,
"grad_norm": 0.2872810959815979,
"learning_rate": 1.128512396694215e-05,
"loss": 0.0234,
"step": 2610
},
{
"epoch": 1.4719101123595506,
"grad_norm": 0.30308109521865845,
"learning_rate": 1.1243801652892564e-05,
"loss": 0.0258,
"step": 2620
},
{
"epoch": 1.4775280898876404,
"grad_norm": 0.1484086513519287,
"learning_rate": 1.1202479338842975e-05,
"loss": 0.0244,
"step": 2630
},
{
"epoch": 1.4831460674157304,
"grad_norm": 0.2520122230052948,
"learning_rate": 1.116115702479339e-05,
"loss": 0.0269,
"step": 2640
},
{
"epoch": 1.4887640449438202,
"grad_norm": 0.4813540577888489,
"learning_rate": 1.1119834710743802e-05,
"loss": 0.0295,
"step": 2650
},
{
"epoch": 1.49438202247191,
"grad_norm": 0.2896086871623993,
"learning_rate": 1.1078512396694217e-05,
"loss": 0.0302,
"step": 2660
},
{
"epoch": 1.5,
"grad_norm": 1.093878984451294,
"learning_rate": 1.103719008264463e-05,
"loss": 0.023,
"step": 2670
},
{
"epoch": 1.50561797752809,
"grad_norm": 0.21631434559822083,
"learning_rate": 1.0995867768595044e-05,
"loss": 0.0276,
"step": 2680
},
{
"epoch": 1.5112359550561798,
"grad_norm": 0.19131970405578613,
"learning_rate": 1.0954545454545455e-05,
"loss": 0.0255,
"step": 2690
},
{
"epoch": 1.5168539325842696,
"grad_norm": 0.38947793841362,
"learning_rate": 1.0913223140495868e-05,
"loss": 0.044,
"step": 2700
},
{
"epoch": 1.5224719101123596,
"grad_norm": 0.2285880446434021,
"learning_rate": 1.0871900826446282e-05,
"loss": 0.0203,
"step": 2710
},
{
"epoch": 1.5280898876404494,
"grad_norm": 0.32444268465042114,
"learning_rate": 1.0830578512396695e-05,
"loss": 0.0349,
"step": 2720
},
{
"epoch": 1.5337078651685392,
"grad_norm": 0.19382227957248688,
"learning_rate": 1.0789256198347108e-05,
"loss": 0.0217,
"step": 2730
},
{
"epoch": 1.5393258426966292,
"grad_norm": 0.5336052775382996,
"learning_rate": 1.0747933884297521e-05,
"loss": 0.0278,
"step": 2740
},
{
"epoch": 1.5449438202247192,
"grad_norm": 0.33937588334083557,
"learning_rate": 1.0706611570247935e-05,
"loss": 0.0308,
"step": 2750
},
{
"epoch": 1.5449438202247192,
"eval_f1": 0.775,
"eval_loss": 0.027629304677248,
"eval_precision": 0.7349841938883035,
"eval_recall": 0.8196239717978848,
"eval_runtime": 28.5578,
"eval_samples_per_second": 137.616,
"eval_steps_per_second": 4.307,
"step": 2750
},
{
"epoch": 1.550561797752809,
"grad_norm": 0.5089320540428162,
"learning_rate": 1.0665289256198348e-05,
"loss": 0.0417,
"step": 2760
},
{
"epoch": 1.5561797752808988,
"grad_norm": 0.3134651780128479,
"learning_rate": 1.062396694214876e-05,
"loss": 0.0194,
"step": 2770
},
{
"epoch": 1.5617977528089888,
"grad_norm": 0.27301183342933655,
"learning_rate": 1.0582644628099174e-05,
"loss": 0.0232,
"step": 2780
},
{
"epoch": 1.5674157303370788,
"grad_norm": 0.4512588679790497,
"learning_rate": 1.0541322314049587e-05,
"loss": 0.0279,
"step": 2790
},
{
"epoch": 1.5730337078651684,
"grad_norm": 0.0741652399301529,
"learning_rate": 1.0500000000000001e-05,
"loss": 0.0304,
"step": 2800
},
{
"epoch": 1.5786516853932584,
"grad_norm": 0.5196810960769653,
"learning_rate": 1.0458677685950414e-05,
"loss": 0.0191,
"step": 2810
},
{
"epoch": 1.5842696629213484,
"grad_norm": 0.6707150340080261,
"learning_rate": 1.0417355371900828e-05,
"loss": 0.0223,
"step": 2820
},
{
"epoch": 1.5898876404494382,
"grad_norm": 0.10301195830106735,
"learning_rate": 1.037603305785124e-05,
"loss": 0.0256,
"step": 2830
},
{
"epoch": 1.595505617977528,
"grad_norm": 0.41582268476486206,
"learning_rate": 1.0334710743801654e-05,
"loss": 0.0211,
"step": 2840
},
{
"epoch": 1.601123595505618,
"grad_norm": 0.07321290671825409,
"learning_rate": 1.0293388429752067e-05,
"loss": 0.0206,
"step": 2850
},
{
"epoch": 1.606741573033708,
"grad_norm": 0.1504063755273819,
"learning_rate": 1.025206611570248e-05,
"loss": 0.0209,
"step": 2860
},
{
"epoch": 1.6123595505617978,
"grad_norm": 0.18410630524158478,
"learning_rate": 1.0210743801652894e-05,
"loss": 0.0224,
"step": 2870
},
{
"epoch": 1.6179775280898876,
"grad_norm": 0.30197209119796753,
"learning_rate": 1.0169421487603305e-05,
"loss": 0.037,
"step": 2880
},
{
"epoch": 1.6235955056179776,
"grad_norm": 0.35574042797088623,
"learning_rate": 1.012809917355372e-05,
"loss": 0.0369,
"step": 2890
},
{
"epoch": 1.6292134831460674,
"grad_norm": 0.6198880076408386,
"learning_rate": 1.0086776859504133e-05,
"loss": 0.0289,
"step": 2900
},
{
"epoch": 1.6348314606741572,
"grad_norm": 0.282822847366333,
"learning_rate": 1.0045454545454547e-05,
"loss": 0.0237,
"step": 2910
},
{
"epoch": 1.6404494382022472,
"grad_norm": 0.4256187677383423,
"learning_rate": 1.000413223140496e-05,
"loss": 0.0262,
"step": 2920
},
{
"epoch": 1.6460674157303372,
"grad_norm": 0.5048171877861023,
"learning_rate": 9.962809917355373e-06,
"loss": 0.019,
"step": 2930
},
{
"epoch": 1.651685393258427,
"grad_norm": 0.3408055603504181,
"learning_rate": 9.921487603305785e-06,
"loss": 0.0154,
"step": 2940
},
{
"epoch": 1.6573033707865168,
"grad_norm": 0.8848744630813599,
"learning_rate": 9.8801652892562e-06,
"loss": 0.0242,
"step": 2950
},
{
"epoch": 1.6629213483146068,
"grad_norm": 0.5149729251861572,
"learning_rate": 9.838842975206613e-06,
"loss": 0.0348,
"step": 2960
},
{
"epoch": 1.6685393258426966,
"grad_norm": 0.9948667287826538,
"learning_rate": 9.797520661157026e-06,
"loss": 0.0329,
"step": 2970
},
{
"epoch": 1.6741573033707864,
"grad_norm": 0.5616855621337891,
"learning_rate": 9.756198347107438e-06,
"loss": 0.0218,
"step": 2980
},
{
"epoch": 1.6797752808988764,
"grad_norm": 0.23508839309215546,
"learning_rate": 9.714876033057851e-06,
"loss": 0.0218,
"step": 2990
},
{
"epoch": 1.6853932584269664,
"grad_norm": 1.1748582124710083,
"learning_rate": 9.673553719008266e-06,
"loss": 0.0207,
"step": 3000
},
{
"epoch": 1.6853932584269664,
"eval_f1": 0.774390243902439,
"eval_loss": 0.028053877875208855,
"eval_precision": 0.7329485834207765,
"eval_recall": 0.8207990599294948,
"eval_runtime": 28.6894,
"eval_samples_per_second": 136.985,
"eval_steps_per_second": 4.287,
"step": 3000
},
{
"epoch": 1.6910112359550562,
"grad_norm": 0.5501458048820496,
"learning_rate": 9.632231404958679e-06,
"loss": 0.0447,
"step": 3010
},
{
"epoch": 1.696629213483146,
"grad_norm": 0.2999919056892395,
"learning_rate": 9.590909090909091e-06,
"loss": 0.0251,
"step": 3020
},
{
"epoch": 1.702247191011236,
"grad_norm": 0.3625814914703369,
"learning_rate": 9.549586776859506e-06,
"loss": 0.022,
"step": 3030
},
{
"epoch": 1.7078651685393258,
"grad_norm": 0.3906262516975403,
"learning_rate": 9.508264462809919e-06,
"loss": 0.0263,
"step": 3040
},
{
"epoch": 1.7134831460674156,
"grad_norm": 0.37318113446235657,
"learning_rate": 9.466942148760331e-06,
"loss": 0.0283,
"step": 3050
},
{
"epoch": 1.7191011235955056,
"grad_norm": 0.2729286849498749,
"learning_rate": 9.425619834710744e-06,
"loss": 0.0293,
"step": 3060
},
{
"epoch": 1.7247191011235956,
"grad_norm": 0.2665950059890747,
"learning_rate": 9.384297520661157e-06,
"loss": 0.0264,
"step": 3070
},
{
"epoch": 1.7303370786516854,
"grad_norm": 0.3190416991710663,
"learning_rate": 9.342975206611572e-06,
"loss": 0.028,
"step": 3080
},
{
"epoch": 1.7359550561797752,
"grad_norm": 0.1753334254026413,
"learning_rate": 9.301652892561984e-06,
"loss": 0.0182,
"step": 3090
},
{
"epoch": 1.7415730337078652,
"grad_norm": 0.20632904767990112,
"learning_rate": 9.260330578512397e-06,
"loss": 0.0276,
"step": 3100
},
{
"epoch": 1.7471910112359552,
"grad_norm": 0.9477939605712891,
"learning_rate": 9.219008264462812e-06,
"loss": 0.0144,
"step": 3110
},
{
"epoch": 1.7528089887640448,
"grad_norm": 0.09771013259887695,
"learning_rate": 9.177685950413224e-06,
"loss": 0.0391,
"step": 3120
},
{
"epoch": 1.7584269662921348,
"grad_norm": 0.46412840485572815,
"learning_rate": 9.136363636363637e-06,
"loss": 0.0301,
"step": 3130
},
{
"epoch": 1.7640449438202248,
"grad_norm": 0.29480355978012085,
"learning_rate": 9.09504132231405e-06,
"loss": 0.0248,
"step": 3140
},
{
"epoch": 1.7696629213483146,
"grad_norm": 0.2871951758861542,
"learning_rate": 9.053719008264463e-06,
"loss": 0.0256,
"step": 3150
},
{
"epoch": 1.7752808988764044,
"grad_norm": 0.6297323107719421,
"learning_rate": 9.012396694214877e-06,
"loss": 0.0366,
"step": 3160
},
{
"epoch": 1.7808988764044944,
"grad_norm": 0.16934314370155334,
"learning_rate": 8.97107438016529e-06,
"loss": 0.0301,
"step": 3170
},
{
"epoch": 1.7865168539325844,
"grad_norm": 0.33454304933547974,
"learning_rate": 8.929752066115703e-06,
"loss": 0.0198,
"step": 3180
},
{
"epoch": 1.7921348314606742,
"grad_norm": 2.09133243560791,
"learning_rate": 8.888429752066118e-06,
"loss": 0.0395,
"step": 3190
},
{
"epoch": 1.797752808988764,
"grad_norm": 0.31659209728240967,
"learning_rate": 8.84710743801653e-06,
"loss": 0.0289,
"step": 3200
},
{
"epoch": 1.803370786516854,
"grad_norm": 0.44267770648002625,
"learning_rate": 8.805785123966943e-06,
"loss": 0.0218,
"step": 3210
},
{
"epoch": 1.8089887640449438,
"grad_norm": 0.615263044834137,
"learning_rate": 8.764462809917356e-06,
"loss": 0.0186,
"step": 3220
},
{
"epoch": 1.8146067415730336,
"grad_norm": 0.5823076367378235,
"learning_rate": 8.723140495867769e-06,
"loss": 0.0274,
"step": 3230
},
{
"epoch": 1.8202247191011236,
"grad_norm": 0.7892521619796753,
"learning_rate": 8.681818181818182e-06,
"loss": 0.0268,
"step": 3240
},
{
"epoch": 1.8258426966292136,
"grad_norm": 0.285401314496994,
"learning_rate": 8.640495867768596e-06,
"loss": 0.0227,
"step": 3250
},
{
"epoch": 1.8258426966292136,
"eval_f1": 0.7772403982930299,
"eval_loss": 0.027760878205299377,
"eval_precision": 0.7534473248758963,
"eval_recall": 0.8025851938895417,
"eval_runtime": 29.8401,
"eval_samples_per_second": 131.702,
"eval_steps_per_second": 4.122,
"step": 3250
},
{
"epoch": 1.8314606741573034,
"grad_norm": 0.387917160987854,
"learning_rate": 8.599173553719009e-06,
"loss": 0.029,
"step": 3260
},
{
"epoch": 1.8370786516853932,
"grad_norm": 0.9873700141906738,
"learning_rate": 8.557851239669422e-06,
"loss": 0.031,
"step": 3270
},
{
"epoch": 1.8426966292134832,
"grad_norm": 1.048519253730774,
"learning_rate": 8.516528925619836e-06,
"loss": 0.0401,
"step": 3280
},
{
"epoch": 1.848314606741573,
"grad_norm": 0.311575323343277,
"learning_rate": 8.475206611570249e-06,
"loss": 0.0296,
"step": 3290
},
{
"epoch": 1.8539325842696628,
"grad_norm": 0.5239911675453186,
"learning_rate": 8.433884297520662e-06,
"loss": 0.0264,
"step": 3300
},
{
"epoch": 1.8595505617977528,
"grad_norm": 0.28296995162963867,
"learning_rate": 8.392561983471075e-06,
"loss": 0.0179,
"step": 3310
},
{
"epoch": 1.8651685393258428,
"grad_norm": 0.19286566972732544,
"learning_rate": 8.351239669421487e-06,
"loss": 0.0251,
"step": 3320
},
{
"epoch": 1.8707865168539326,
"grad_norm": 0.19410869479179382,
"learning_rate": 8.309917355371902e-06,
"loss": 0.0225,
"step": 3330
},
{
"epoch": 1.8764044943820224,
"grad_norm": 0.253444641828537,
"learning_rate": 8.268595041322315e-06,
"loss": 0.0238,
"step": 3340
},
{
"epoch": 1.8820224719101124,
"grad_norm": 0.9381386041641235,
"learning_rate": 8.227272727272728e-06,
"loss": 0.0169,
"step": 3350
},
{
"epoch": 1.8876404494382022,
"grad_norm": 0.33516111969947815,
"learning_rate": 8.185950413223142e-06,
"loss": 0.016,
"step": 3360
},
{
"epoch": 1.893258426966292,
"grad_norm": 1.103043556213379,
"learning_rate": 8.144628099173555e-06,
"loss": 0.0236,
"step": 3370
},
{
"epoch": 1.898876404494382,
"grad_norm": 0.5476918816566467,
"learning_rate": 8.103305785123968e-06,
"loss": 0.0358,
"step": 3380
},
{
"epoch": 1.904494382022472,
"grad_norm": 0.23972313106060028,
"learning_rate": 8.06198347107438e-06,
"loss": 0.0141,
"step": 3390
},
{
"epoch": 1.9101123595505618,
"grad_norm": 0.3573669493198395,
"learning_rate": 8.020661157024793e-06,
"loss": 0.0221,
"step": 3400
},
{
"epoch": 1.9157303370786516,
"grad_norm": 0.9739108085632324,
"learning_rate": 7.979338842975208e-06,
"loss": 0.0284,
"step": 3410
},
{
"epoch": 1.9213483146067416,
"grad_norm": 0.7444821000099182,
"learning_rate": 7.93801652892562e-06,
"loss": 0.0391,
"step": 3420
},
{
"epoch": 1.9269662921348316,
"grad_norm": 0.3812079131603241,
"learning_rate": 7.896694214876033e-06,
"loss": 0.0285,
"step": 3430
},
{
"epoch": 1.9325842696629212,
"grad_norm": 0.5086238980293274,
"learning_rate": 7.855371900826448e-06,
"loss": 0.0227,
"step": 3440
},
{
"epoch": 1.9382022471910112,
"grad_norm": 0.32544034719467163,
"learning_rate": 7.81404958677686e-06,
"loss": 0.0181,
"step": 3450
},
{
"epoch": 1.9438202247191012,
"grad_norm": 1.2971950769424438,
"learning_rate": 7.772727272727273e-06,
"loss": 0.0422,
"step": 3460
},
{
"epoch": 1.949438202247191,
"grad_norm": 0.2870737910270691,
"learning_rate": 7.731404958677686e-06,
"loss": 0.024,
"step": 3470
},
{
"epoch": 1.9550561797752808,
"grad_norm": 0.3665461540222168,
"learning_rate": 7.690082644628099e-06,
"loss": 0.0222,
"step": 3480
},
{
"epoch": 1.9606741573033708,
"grad_norm": 0.3672967255115509,
"learning_rate": 7.648760330578514e-06,
"loss": 0.0332,
"step": 3490
},
{
"epoch": 1.9662921348314608,
"grad_norm": 0.8471900224685669,
"learning_rate": 7.607438016528926e-06,
"loss": 0.0427,
"step": 3500
},
{
"epoch": 1.9662921348314608,
"eval_f1": 0.782293986636971,
"eval_loss": 0.026905611157417297,
"eval_precision": 0.7433862433862434,
"eval_recall": 0.8254994124559342,
"eval_runtime": 27.5479,
"eval_samples_per_second": 142.66,
"eval_steps_per_second": 4.465,
"step": 3500
},
{
"epoch": 1.9719101123595506,
"grad_norm": 0.32538992166519165,
"learning_rate": 7.56611570247934e-06,
"loss": 0.0269,
"step": 3510
},
{
"epoch": 1.9775280898876404,
"grad_norm": 0.10877460241317749,
"learning_rate": 7.524793388429753e-06,
"loss": 0.0212,
"step": 3520
},
{
"epoch": 1.9831460674157304,
"grad_norm": 0.0975649505853653,
"learning_rate": 7.4834710743801665e-06,
"loss": 0.0232,
"step": 3530
},
{
"epoch": 1.9887640449438202,
"grad_norm": 0.32634270191192627,
"learning_rate": 7.4421487603305785e-06,
"loss": 0.0217,
"step": 3540
},
{
"epoch": 1.99438202247191,
"grad_norm": 0.10080187767744064,
"learning_rate": 7.400826446280992e-06,
"loss": 0.0299,
"step": 3550
},
{
"epoch": 2.0,
"grad_norm": 0.27767449617385864,
"learning_rate": 7.359504132231406e-06,
"loss": 0.026,
"step": 3560
},
{
"epoch": 2.00561797752809,
"grad_norm": 0.2725074291229248,
"learning_rate": 7.3181818181818186e-06,
"loss": 0.0243,
"step": 3570
},
{
"epoch": 2.0112359550561796,
"grad_norm": 0.41963133215904236,
"learning_rate": 7.276859504132232e-06,
"loss": 0.027,
"step": 3580
},
{
"epoch": 2.0168539325842696,
"grad_norm": 0.3473270535469055,
"learning_rate": 7.235537190082645e-06,
"loss": 0.0238,
"step": 3590
},
{
"epoch": 2.0224719101123596,
"grad_norm": 0.38403695821762085,
"learning_rate": 7.194214876033059e-06,
"loss": 0.0261,
"step": 3600
},
{
"epoch": 2.0280898876404496,
"grad_norm": 0.42964357137680054,
"learning_rate": 7.152892561983472e-06,
"loss": 0.0343,
"step": 3610
},
{
"epoch": 2.033707865168539,
"grad_norm": 0.5099667310714722,
"learning_rate": 7.111570247933884e-06,
"loss": 0.0209,
"step": 3620
},
{
"epoch": 2.039325842696629,
"grad_norm": 0.34141868352890015,
"learning_rate": 7.070247933884298e-06,
"loss": 0.0295,
"step": 3630
},
{
"epoch": 2.044943820224719,
"grad_norm": 0.08186814934015274,
"learning_rate": 7.028925619834711e-06,
"loss": 0.021,
"step": 3640
},
{
"epoch": 2.050561797752809,
"grad_norm": 0.15661613643169403,
"learning_rate": 6.987603305785124e-06,
"loss": 0.0183,
"step": 3650
},
{
"epoch": 2.056179775280899,
"grad_norm": 0.24707892537117004,
"learning_rate": 6.946280991735538e-06,
"loss": 0.0204,
"step": 3660
},
{
"epoch": 2.061797752808989,
"grad_norm": 0.1336873322725296,
"learning_rate": 6.904958677685951e-06,
"loss": 0.0186,
"step": 3670
},
{
"epoch": 2.067415730337079,
"grad_norm": 0.07359451055526733,
"learning_rate": 6.8636363636363645e-06,
"loss": 0.0295,
"step": 3680
},
{
"epoch": 2.0730337078651684,
"grad_norm": 0.8857179880142212,
"learning_rate": 6.822314049586778e-06,
"loss": 0.0185,
"step": 3690
},
{
"epoch": 2.0786516853932584,
"grad_norm": 0.5854625105857849,
"learning_rate": 6.78099173553719e-06,
"loss": 0.0202,
"step": 3700
},
{
"epoch": 2.0842696629213484,
"grad_norm": 0.3949134647846222,
"learning_rate": 6.739669421487604e-06,
"loss": 0.0277,
"step": 3710
},
{
"epoch": 2.0898876404494384,
"grad_norm": 1.1485154628753662,
"learning_rate": 6.698347107438017e-06,
"loss": 0.0191,
"step": 3720
},
{
"epoch": 2.095505617977528,
"grad_norm": 0.5519204139709473,
"learning_rate": 6.65702479338843e-06,
"loss": 0.0245,
"step": 3730
},
{
"epoch": 2.101123595505618,
"grad_norm": 0.6175654530525208,
"learning_rate": 6.615702479338844e-06,
"loss": 0.0353,
"step": 3740
},
{
"epoch": 2.106741573033708,
"grad_norm": 0.23454013466835022,
"learning_rate": 6.574380165289257e-06,
"loss": 0.0366,
"step": 3750
},
{
"epoch": 2.106741573033708,
"eval_f1": 0.7762276785714285,
"eval_loss": 0.026860907673835754,
"eval_precision": 0.7391073326248672,
"eval_recall": 0.8172737955346651,
"eval_runtime": 27.5328,
"eval_samples_per_second": 142.739,
"eval_steps_per_second": 4.467,
"step": 3750
},
{
"epoch": 2.1123595505617976,
"grad_norm": 0.7361034750938416,
"learning_rate": 6.53305785123967e-06,
"loss": 0.0334,
"step": 3760
},
{
"epoch": 2.1179775280898876,
"grad_norm": 0.6150330901145935,
"learning_rate": 6.491735537190084e-06,
"loss": 0.0153,
"step": 3770
},
{
"epoch": 2.1235955056179776,
"grad_norm": 0.45887675881385803,
"learning_rate": 6.450413223140496e-06,
"loss": 0.0193,
"step": 3780
},
{
"epoch": 2.1292134831460676,
"grad_norm": 0.658532440662384,
"learning_rate": 6.40909090909091e-06,
"loss": 0.017,
"step": 3790
},
{
"epoch": 2.134831460674157,
"grad_norm": 0.3698727786540985,
"learning_rate": 6.3677685950413224e-06,
"loss": 0.0154,
"step": 3800
},
{
"epoch": 2.140449438202247,
"grad_norm": 0.3840799033641815,
"learning_rate": 6.326446280991736e-06,
"loss": 0.0185,
"step": 3810
},
{
"epoch": 2.146067415730337,
"grad_norm": 0.5709568858146667,
"learning_rate": 6.285123966942149e-06,
"loss": 0.0166,
"step": 3820
},
{
"epoch": 2.151685393258427,
"grad_norm": 0.5471051335334778,
"learning_rate": 6.2438016528925626e-06,
"loss": 0.0284,
"step": 3830
},
{
"epoch": 2.157303370786517,
"grad_norm": 0.2647755444049835,
"learning_rate": 6.202479338842976e-06,
"loss": 0.0266,
"step": 3840
},
{
"epoch": 2.162921348314607,
"grad_norm": 0.8098225593566895,
"learning_rate": 6.161157024793389e-06,
"loss": 0.0324,
"step": 3850
},
{
"epoch": 2.168539325842697,
"grad_norm": 0.060193296521902084,
"learning_rate": 6.119834710743802e-06,
"loss": 0.0198,
"step": 3860
},
{
"epoch": 2.1741573033707864,
"grad_norm": 0.29814398288726807,
"learning_rate": 6.078512396694215e-06,
"loss": 0.022,
"step": 3870
},
{
"epoch": 2.1797752808988764,
"grad_norm": 0.4286690652370453,
"learning_rate": 6.037190082644628e-06,
"loss": 0.0268,
"step": 3880
},
{
"epoch": 2.1853932584269664,
"grad_norm": 0.29490065574645996,
"learning_rate": 5.995867768595042e-06,
"loss": 0.0276,
"step": 3890
},
{
"epoch": 2.191011235955056,
"grad_norm": 0.19046035408973694,
"learning_rate": 5.954545454545455e-06,
"loss": 0.0166,
"step": 3900
},
{
"epoch": 2.196629213483146,
"grad_norm": 0.17454463243484497,
"learning_rate": 5.913223140495868e-06,
"loss": 0.0248,
"step": 3910
},
{
"epoch": 2.202247191011236,
"grad_norm": 0.4883466362953186,
"learning_rate": 5.871900826446282e-06,
"loss": 0.0227,
"step": 3920
},
{
"epoch": 2.207865168539326,
"grad_norm": 0.4298355281352997,
"learning_rate": 5.830578512396695e-06,
"loss": 0.0193,
"step": 3930
},
{
"epoch": 2.2134831460674156,
"grad_norm": 0.348326176404953,
"learning_rate": 5.789256198347108e-06,
"loss": 0.0197,
"step": 3940
},
{
"epoch": 2.2191011235955056,
"grad_norm": 0.25717148184776306,
"learning_rate": 5.7479338842975205e-06,
"loss": 0.0191,
"step": 3950
},
{
"epoch": 2.2247191011235956,
"grad_norm": 0.217723548412323,
"learning_rate": 5.706611570247934e-06,
"loss": 0.0132,
"step": 3960
},
{
"epoch": 2.2303370786516856,
"grad_norm": 0.1498790681362152,
"learning_rate": 5.665289256198348e-06,
"loss": 0.0193,
"step": 3970
},
{
"epoch": 2.235955056179775,
"grad_norm": 0.30186113715171814,
"learning_rate": 5.623966942148761e-06,
"loss": 0.0157,
"step": 3980
},
{
"epoch": 2.241573033707865,
"grad_norm": 0.5343513488769531,
"learning_rate": 5.582644628099174e-06,
"loss": 0.0238,
"step": 3990
},
{
"epoch": 2.247191011235955,
"grad_norm": 0.05514984950423241,
"learning_rate": 5.541322314049588e-06,
"loss": 0.0232,
"step": 4000
},
{
"epoch": 2.247191011235955,
"eval_f1": 0.7822648969217735,
"eval_loss": 0.027265124022960663,
"eval_precision": 0.7531266992930941,
"eval_recall": 0.8137485311398355,
"eval_runtime": 27.5955,
"eval_samples_per_second": 142.414,
"eval_steps_per_second": 4.457,
"step": 4000
},
{
"epoch": 2.252808988764045,
"grad_norm": 0.28251585364341736,
"learning_rate": 5.500000000000001e-06,
"loss": 0.0171,
"step": 4010
},
{
"epoch": 2.258426966292135,
"grad_norm": 0.5261769890785217,
"learning_rate": 5.4586776859504135e-06,
"loss": 0.0333,
"step": 4020
},
{
"epoch": 2.264044943820225,
"grad_norm": 0.5585474967956543,
"learning_rate": 5.417355371900826e-06,
"loss": 0.0226,
"step": 4030
},
{
"epoch": 2.2696629213483144,
"grad_norm": 0.3328685760498047,
"learning_rate": 5.37603305785124e-06,
"loss": 0.026,
"step": 4040
},
{
"epoch": 2.2752808988764044,
"grad_norm": 0.16690856218338013,
"learning_rate": 5.334710743801654e-06,
"loss": 0.0231,
"step": 4050
},
{
"epoch": 2.2808988764044944,
"grad_norm": 0.2629925608634949,
"learning_rate": 5.2933884297520664e-06,
"loss": 0.0209,
"step": 4060
},
{
"epoch": 2.2865168539325844,
"grad_norm": 0.21847395598888397,
"learning_rate": 5.25206611570248e-06,
"loss": 0.0214,
"step": 4070
},
{
"epoch": 2.292134831460674,
"grad_norm": 0.4954984188079834,
"learning_rate": 5.210743801652893e-06,
"loss": 0.0224,
"step": 4080
},
{
"epoch": 2.297752808988764,
"grad_norm": 0.19115880131721497,
"learning_rate": 5.1694214876033065e-06,
"loss": 0.0294,
"step": 4090
},
{
"epoch": 2.303370786516854,
"grad_norm": 0.9964483976364136,
"learning_rate": 5.1280991735537185e-06,
"loss": 0.0264,
"step": 4100
},
{
"epoch": 2.308988764044944,
"grad_norm": 0.4354044795036316,
"learning_rate": 5.086776859504132e-06,
"loss": 0.0159,
"step": 4110
},
{
"epoch": 2.3146067415730336,
"grad_norm": 0.6680939793586731,
"learning_rate": 5.045454545454546e-06,
"loss": 0.0297,
"step": 4120
},
{
"epoch": 2.3202247191011236,
"grad_norm": 0.39689382910728455,
"learning_rate": 5.004132231404959e-06,
"loss": 0.0234,
"step": 4130
},
{
"epoch": 2.3258426966292136,
"grad_norm": 0.23396579921245575,
"learning_rate": 4.962809917355372e-06,
"loss": 0.0267,
"step": 4140
},
{
"epoch": 2.331460674157303,
"grad_norm": 0.19987128674983978,
"learning_rate": 4.921487603305786e-06,
"loss": 0.0158,
"step": 4150
},
{
"epoch": 2.337078651685393,
"grad_norm": 0.3705327808856964,
"learning_rate": 4.880165289256199e-06,
"loss": 0.0213,
"step": 4160
},
{
"epoch": 2.342696629213483,
"grad_norm": 0.1622433066368103,
"learning_rate": 4.8388429752066115e-06,
"loss": 0.0216,
"step": 4170
},
{
"epoch": 2.348314606741573,
"grad_norm": 0.1764877438545227,
"learning_rate": 4.797520661157025e-06,
"loss": 0.0202,
"step": 4180
},
{
"epoch": 2.353932584269663,
"grad_norm": 0.612802267074585,
"learning_rate": 4.756198347107439e-06,
"loss": 0.0296,
"step": 4190
},
{
"epoch": 2.359550561797753,
"grad_norm": 0.29788005352020264,
"learning_rate": 4.714876033057852e-06,
"loss": 0.0163,
"step": 4200
},
{
"epoch": 2.365168539325843,
"grad_norm": 0.9954004883766174,
"learning_rate": 4.6735537190082645e-06,
"loss": 0.0236,
"step": 4210
},
{
"epoch": 2.370786516853933,
"grad_norm": 0.6486590504646301,
"learning_rate": 4.632231404958678e-06,
"loss": 0.0218,
"step": 4220
},
{
"epoch": 2.3764044943820224,
"grad_norm": 0.3950415551662445,
"learning_rate": 4.590909090909092e-06,
"loss": 0.0264,
"step": 4230
},
{
"epoch": 2.3820224719101124,
"grad_norm": 0.8847681879997253,
"learning_rate": 4.549586776859505e-06,
"loss": 0.0192,
"step": 4240
},
{
"epoch": 2.3876404494382024,
"grad_norm": 1.5672686100006104,
"learning_rate": 4.508264462809917e-06,
"loss": 0.0395,
"step": 4250
},
{
"epoch": 2.3876404494382024,
"eval_f1": 0.7883418222976797,
"eval_loss": 0.026355577632784843,
"eval_precision": 0.7603711790393013,
"eval_recall": 0.818448883666275,
"eval_runtime": 27.5406,
"eval_samples_per_second": 142.699,
"eval_steps_per_second": 4.466,
"step": 4250
},
{
"epoch": 2.393258426966292,
"grad_norm": 0.16888077557086945,
"learning_rate": 4.466942148760331e-06,
"loss": 0.0189,
"step": 4260
},
{
"epoch": 2.398876404494382,
"grad_norm": 0.2687821686267853,
"learning_rate": 4.425619834710745e-06,
"loss": 0.0234,
"step": 4270
},
{
"epoch": 2.404494382022472,
"grad_norm": 0.22109387814998627,
"learning_rate": 4.3842975206611575e-06,
"loss": 0.0196,
"step": 4280
},
{
"epoch": 2.4101123595505616,
"grad_norm": 0.36069610714912415,
"learning_rate": 4.34297520661157e-06,
"loss": 0.0212,
"step": 4290
},
{
"epoch": 2.4157303370786516,
"grad_norm": 0.2530086636543274,
"learning_rate": 4.301652892561984e-06,
"loss": 0.0244,
"step": 4300
},
{
"epoch": 2.4213483146067416,
"grad_norm": 0.5015830397605896,
"learning_rate": 4.260330578512397e-06,
"loss": 0.0226,
"step": 4310
},
{
"epoch": 2.4269662921348316,
"grad_norm": 0.18124614655971527,
"learning_rate": 4.21900826446281e-06,
"loss": 0.0202,
"step": 4320
},
{
"epoch": 2.432584269662921,
"grad_norm": 0.2587762176990509,
"learning_rate": 4.177685950413223e-06,
"loss": 0.0157,
"step": 4330
},
{
"epoch": 2.438202247191011,
"grad_norm": 0.42097729444503784,
"learning_rate": 4.136363636363637e-06,
"loss": 0.0265,
"step": 4340
},
{
"epoch": 2.443820224719101,
"grad_norm": 0.2626970410346985,
"learning_rate": 4.09504132231405e-06,
"loss": 0.0163,
"step": 4350
},
{
"epoch": 2.449438202247191,
"grad_norm": 0.42754292488098145,
"learning_rate": 4.053719008264463e-06,
"loss": 0.0281,
"step": 4360
},
{
"epoch": 2.455056179775281,
"grad_norm": 0.32999956607818604,
"learning_rate": 4.012396694214876e-06,
"loss": 0.0206,
"step": 4370
},
{
"epoch": 2.460674157303371,
"grad_norm": 0.26131534576416016,
"learning_rate": 3.97107438016529e-06,
"loss": 0.0204,
"step": 4380
},
{
"epoch": 2.466292134831461,
"grad_norm": 0.6028513312339783,
"learning_rate": 3.929752066115703e-06,
"loss": 0.0237,
"step": 4390
},
{
"epoch": 2.4719101123595504,
"grad_norm": 0.42414769530296326,
"learning_rate": 3.888429752066116e-06,
"loss": 0.0233,
"step": 4400
},
{
"epoch": 2.4775280898876404,
"grad_norm": 0.21420839428901672,
"learning_rate": 3.847107438016529e-06,
"loss": 0.0179,
"step": 4410
},
{
"epoch": 2.4831460674157304,
"grad_norm": 0.13455547392368317,
"learning_rate": 3.8057851239669423e-06,
"loss": 0.0305,
"step": 4420
},
{
"epoch": 2.48876404494382,
"grad_norm": 0.2912025451660156,
"learning_rate": 3.764462809917356e-06,
"loss": 0.0239,
"step": 4430
},
{
"epoch": 2.49438202247191,
"grad_norm": 0.2664025127887726,
"learning_rate": 3.723140495867769e-06,
"loss": 0.0253,
"step": 4440
},
{
"epoch": 2.5,
"grad_norm": 0.22163937985897064,
"learning_rate": 3.681818181818182e-06,
"loss": 0.0235,
"step": 4450
},
{
"epoch": 2.50561797752809,
"grad_norm": 0.8055572509765625,
"learning_rate": 3.6404958677685952e-06,
"loss": 0.0208,
"step": 4460
},
{
"epoch": 2.51123595505618,
"grad_norm": 0.42651161551475525,
"learning_rate": 3.599173553719009e-06,
"loss": 0.0262,
"step": 4470
},
{
"epoch": 2.5168539325842696,
"grad_norm": 0.29703494906425476,
"learning_rate": 3.557851239669422e-06,
"loss": 0.0145,
"step": 4480
},
{
"epoch": 2.5224719101123596,
"grad_norm": 0.2598767578601837,
"learning_rate": 3.516528925619835e-06,
"loss": 0.0188,
"step": 4490
},
{
"epoch": 2.5280898876404496,
"grad_norm": 0.39948177337646484,
"learning_rate": 3.475206611570248e-06,
"loss": 0.0231,
"step": 4500
},
{
"epoch": 2.5280898876404496,
"eval_f1": 0.7884074282498592,
"eval_loss": 0.02689271606504917,
"eval_precision": 0.7564794816414687,
"eval_recall": 0.8231492361927144,
"eval_runtime": 27.545,
"eval_samples_per_second": 142.676,
"eval_steps_per_second": 4.465,
"step": 4500
},
{
"epoch": 2.533707865168539,
"grad_norm": 0.36148425936698914,
"learning_rate": 3.4338842975206614e-06,
"loss": 0.0221,
"step": 4510
},
{
"epoch": 2.539325842696629,
"grad_norm": 0.05635352060198784,
"learning_rate": 3.392561983471075e-06,
"loss": 0.0115,
"step": 4520
},
{
"epoch": 2.544943820224719,
"grad_norm": 0.15450328588485718,
"learning_rate": 3.351239669421488e-06,
"loss": 0.0142,
"step": 4530
},
{
"epoch": 2.550561797752809,
"grad_norm": 0.7420428395271301,
"learning_rate": 3.309917355371901e-06,
"loss": 0.0242,
"step": 4540
},
{
"epoch": 2.556179775280899,
"grad_norm": 0.21072138845920563,
"learning_rate": 3.2685950413223143e-06,
"loss": 0.0154,
"step": 4550
},
{
"epoch": 2.561797752808989,
"grad_norm": 0.41004472970962524,
"learning_rate": 3.227272727272728e-06,
"loss": 0.027,
"step": 4560
},
{
"epoch": 2.567415730337079,
"grad_norm": 0.2680492401123047,
"learning_rate": 3.1859504132231408e-06,
"loss": 0.0328,
"step": 4570
},
{
"epoch": 2.5730337078651684,
"grad_norm": 0.2726670503616333,
"learning_rate": 3.144628099173554e-06,
"loss": 0.0208,
"step": 4580
},
{
"epoch": 2.5786516853932584,
"grad_norm": 0.07600165903568268,
"learning_rate": 3.1033057851239672e-06,
"loss": 0.0275,
"step": 4590
},
{
"epoch": 2.5842696629213484,
"grad_norm": 0.4380427598953247,
"learning_rate": 3.0619834710743804e-06,
"loss": 0.0441,
"step": 4600
},
{
"epoch": 2.5898876404494384,
"grad_norm": 1.735329270362854,
"learning_rate": 3.0206611570247932e-06,
"loss": 0.0298,
"step": 4610
},
{
"epoch": 2.595505617977528,
"grad_norm": 0.2738408148288727,
"learning_rate": 2.979338842975207e-06,
"loss": 0.0175,
"step": 4620
},
{
"epoch": 2.601123595505618,
"grad_norm": 0.46852007508277893,
"learning_rate": 2.93801652892562e-06,
"loss": 0.0166,
"step": 4630
},
{
"epoch": 2.606741573033708,
"grad_norm": 0.13703812658786774,
"learning_rate": 2.8966942148760334e-06,
"loss": 0.0298,
"step": 4640
},
{
"epoch": 2.6123595505617976,
"grad_norm": 0.3436073958873749,
"learning_rate": 2.855371900826446e-06,
"loss": 0.0203,
"step": 4650
},
{
"epoch": 2.6179775280898876,
"grad_norm": 0.5731604695320129,
"learning_rate": 2.81404958677686e-06,
"loss": 0.0166,
"step": 4660
},
{
"epoch": 2.6235955056179776,
"grad_norm": 0.3746987283229828,
"learning_rate": 2.772727272727273e-06,
"loss": 0.0223,
"step": 4670
},
{
"epoch": 2.629213483146067,
"grad_norm": 0.30169859528541565,
"learning_rate": 2.7314049586776863e-06,
"loss": 0.0277,
"step": 4680
},
{
"epoch": 2.634831460674157,
"grad_norm": 0.2838500738143921,
"learning_rate": 2.690082644628099e-06,
"loss": 0.0176,
"step": 4690
},
{
"epoch": 2.640449438202247,
"grad_norm": 0.14983642101287842,
"learning_rate": 2.6487603305785127e-06,
"loss": 0.0218,
"step": 4700
},
{
"epoch": 2.646067415730337,
"grad_norm": 0.28590282797813416,
"learning_rate": 2.607438016528926e-06,
"loss": 0.0248,
"step": 4710
},
{
"epoch": 2.6516853932584272,
"grad_norm": 0.16218622028827667,
"learning_rate": 2.566115702479339e-06,
"loss": 0.0192,
"step": 4720
},
{
"epoch": 2.657303370786517,
"grad_norm": 0.2653314471244812,
"learning_rate": 2.524793388429752e-06,
"loss": 0.0224,
"step": 4730
},
{
"epoch": 2.662921348314607,
"grad_norm": 0.19694961607456207,
"learning_rate": 2.4834710743801652e-06,
"loss": 0.0163,
"step": 4740
},
{
"epoch": 2.668539325842697,
"grad_norm": 0.396456241607666,
"learning_rate": 2.442148760330579e-06,
"loss": 0.0184,
"step": 4750
},
{
"epoch": 2.668539325842697,
"eval_f1": 0.7902769926512153,
"eval_loss": 0.0268043614923954,
"eval_precision": 0.761437908496732,
"eval_recall": 0.8213866039952996,
"eval_runtime": 27.5363,
"eval_samples_per_second": 142.721,
"eval_steps_per_second": 4.467,
"step": 4750
},
{
"epoch": 2.6741573033707864,
"grad_norm": 0.17081989347934723,
"learning_rate": 2.4008264462809917e-06,
"loss": 0.0165,
"step": 4760
},
{
"epoch": 2.6797752808988764,
"grad_norm": 0.248233824968338,
"learning_rate": 2.3595041322314054e-06,
"loss": 0.0165,
"step": 4770
},
{
"epoch": 2.6853932584269664,
"grad_norm": 0.5152795910835266,
"learning_rate": 2.318181818181818e-06,
"loss": 0.0232,
"step": 4780
},
{
"epoch": 2.691011235955056,
"grad_norm": 0.3962818384170532,
"learning_rate": 2.276859504132232e-06,
"loss": 0.0165,
"step": 4790
},
{
"epoch": 2.696629213483146,
"grad_norm": 0.8177412152290344,
"learning_rate": 2.2355371900826446e-06,
"loss": 0.0237,
"step": 4800
},
{
"epoch": 2.702247191011236,
"grad_norm": 0.3415655791759491,
"learning_rate": 2.1942148760330583e-06,
"loss": 0.0293,
"step": 4810
},
{
"epoch": 2.7078651685393256,
"grad_norm": 0.39049091935157776,
"learning_rate": 2.152892561983471e-06,
"loss": 0.0151,
"step": 4820
},
{
"epoch": 2.7134831460674156,
"grad_norm": 0.25125929713249207,
"learning_rate": 2.1115702479338847e-06,
"loss": 0.0246,
"step": 4830
},
{
"epoch": 2.7191011235955056,
"grad_norm": 0.3042113482952118,
"learning_rate": 2.0702479338842975e-06,
"loss": 0.0131,
"step": 4840
},
{
"epoch": 2.7247191011235956,
"grad_norm": 0.3315228223800659,
"learning_rate": 2.0289256198347108e-06,
"loss": 0.0355,
"step": 4850
},
{
"epoch": 2.7303370786516856,
"grad_norm": 0.7296513319015503,
"learning_rate": 1.987603305785124e-06,
"loss": 0.0167,
"step": 4860
},
{
"epoch": 2.735955056179775,
"grad_norm": 0.1494741439819336,
"learning_rate": 1.9462809917355372e-06,
"loss": 0.0158,
"step": 4870
},
{
"epoch": 2.741573033707865,
"grad_norm": 1.2954223155975342,
"learning_rate": 1.9049586776859505e-06,
"loss": 0.0243,
"step": 4880
},
{
"epoch": 2.747191011235955,
"grad_norm": 0.5072038769721985,
"learning_rate": 1.863636363636364e-06,
"loss": 0.0257,
"step": 4890
},
{
"epoch": 2.752808988764045,
"grad_norm": 0.3296859562397003,
"learning_rate": 1.822314049586777e-06,
"loss": 0.0198,
"step": 4900
},
{
"epoch": 2.758426966292135,
"grad_norm": 0.11316215991973877,
"learning_rate": 1.7809917355371904e-06,
"loss": 0.0185,
"step": 4910
},
{
"epoch": 2.764044943820225,
"grad_norm": 0.15724743902683258,
"learning_rate": 1.7396694214876034e-06,
"loss": 0.0156,
"step": 4920
},
{
"epoch": 2.7696629213483144,
"grad_norm": 0.20462727546691895,
"learning_rate": 1.6983471074380168e-06,
"loss": 0.0225,
"step": 4930
},
{
"epoch": 2.7752808988764044,
"grad_norm": 0.3503582179546356,
"learning_rate": 1.6570247933884298e-06,
"loss": 0.0191,
"step": 4940
},
{
"epoch": 2.7808988764044944,
"grad_norm": 0.3289487361907959,
"learning_rate": 1.615702479338843e-06,
"loss": 0.0241,
"step": 4950
},
{
"epoch": 2.7865168539325844,
"grad_norm": 0.3454688787460327,
"learning_rate": 1.5743801652892563e-06,
"loss": 0.0154,
"step": 4960
},
{
"epoch": 2.7921348314606744,
"grad_norm": 0.07760006934404373,
"learning_rate": 1.5330578512396695e-06,
"loss": 0.0124,
"step": 4970
},
{
"epoch": 2.797752808988764,
"grad_norm": 0.6230753064155579,
"learning_rate": 1.4917355371900828e-06,
"loss": 0.0198,
"step": 4980
},
{
"epoch": 2.803370786516854,
"grad_norm": 0.399117648601532,
"learning_rate": 1.450413223140496e-06,
"loss": 0.022,
"step": 4990
},
{
"epoch": 2.808988764044944,
"grad_norm": 0.24584145843982697,
"learning_rate": 1.409090909090909e-06,
"loss": 0.0199,
"step": 5000
},
{
"epoch": 2.808988764044944,
"eval_f1": 0.790944661822247,
"eval_loss": 0.02688935212790966,
"eval_precision": 0.7542643923240938,
"eval_recall": 0.8313748531139835,
"eval_runtime": 27.5368,
"eval_samples_per_second": 142.718,
"eval_steps_per_second": 4.467,
"step": 5000
},
{
"epoch": 2.8146067415730336,
"grad_norm": 0.27072691917419434,
"learning_rate": 1.3677685950413225e-06,
"loss": 0.0147,
"step": 5010
},
{
"epoch": 2.8202247191011236,
"grad_norm": 0.2784029543399811,
"learning_rate": 1.3264462809917355e-06,
"loss": 0.0365,
"step": 5020
},
{
"epoch": 2.8258426966292136,
"grad_norm": 0.3331715166568756,
"learning_rate": 1.285123966942149e-06,
"loss": 0.0161,
"step": 5030
},
{
"epoch": 2.831460674157303,
"grad_norm": 0.432522177696228,
"learning_rate": 1.2438016528925622e-06,
"loss": 0.0199,
"step": 5040
},
{
"epoch": 2.837078651685393,
"grad_norm": 0.22144052386283875,
"learning_rate": 1.2024793388429754e-06,
"loss": 0.0218,
"step": 5050
},
{
"epoch": 2.842696629213483,
"grad_norm": 0.30622825026512146,
"learning_rate": 1.1611570247933886e-06,
"loss": 0.0235,
"step": 5060
},
{
"epoch": 2.8483146067415728,
"grad_norm": 0.29084160923957825,
"learning_rate": 1.1198347107438018e-06,
"loss": 0.0136,
"step": 5070
},
{
"epoch": 2.853932584269663,
"grad_norm": 0.4032902717590332,
"learning_rate": 1.078512396694215e-06,
"loss": 0.0169,
"step": 5080
},
{
"epoch": 2.859550561797753,
"grad_norm": 0.2889857292175293,
"learning_rate": 1.0371900826446283e-06,
"loss": 0.0248,
"step": 5090
},
{
"epoch": 2.865168539325843,
"grad_norm": 0.40357986092567444,
"learning_rate": 9.958677685950415e-07,
"loss": 0.0242,
"step": 5100
},
{
"epoch": 2.870786516853933,
"grad_norm": 0.2112303227186203,
"learning_rate": 9.545454545454548e-07,
"loss": 0.0295,
"step": 5110
},
{
"epoch": 2.8764044943820224,
"grad_norm": 0.12360019236803055,
"learning_rate": 9.132231404958679e-07,
"loss": 0.0315,
"step": 5120
},
{
"epoch": 2.8820224719101124,
"grad_norm": 0.37716934084892273,
"learning_rate": 8.719008264462811e-07,
"loss": 0.0181,
"step": 5130
},
{
"epoch": 2.8876404494382024,
"grad_norm": 0.3970218002796173,
"learning_rate": 8.305785123966943e-07,
"loss": 0.0219,
"step": 5140
},
{
"epoch": 2.893258426966292,
"grad_norm": 0.32977351546287537,
"learning_rate": 7.892561983471076e-07,
"loss": 0.0232,
"step": 5150
},
{
"epoch": 2.898876404494382,
"grad_norm": 1.0419588088989258,
"learning_rate": 7.479338842975208e-07,
"loss": 0.0296,
"step": 5160
},
{
"epoch": 2.904494382022472,
"grad_norm": 0.6743459105491638,
"learning_rate": 7.066115702479339e-07,
"loss": 0.0232,
"step": 5170
},
{
"epoch": 2.9101123595505616,
"grad_norm": 0.37269556522369385,
"learning_rate": 6.652892561983472e-07,
"loss": 0.0246,
"step": 5180
},
{
"epoch": 2.9157303370786516,
"grad_norm": 0.29990169405937195,
"learning_rate": 6.239669421487604e-07,
"loss": 0.0195,
"step": 5190
},
{
"epoch": 2.9213483146067416,
"grad_norm": 0.31683966517448425,
"learning_rate": 5.826446280991736e-07,
"loss": 0.0184,
"step": 5200
},
{
"epoch": 2.9269662921348316,
"grad_norm": 0.3079575002193451,
"learning_rate": 5.413223140495869e-07,
"loss": 0.0207,
"step": 5210
},
{
"epoch": 2.932584269662921,
"grad_norm": 0.4728926718235016,
"learning_rate": 5.000000000000001e-07,
"loss": 0.0227,
"step": 5220
},
{
"epoch": 2.938202247191011,
"grad_norm": 0.3681657910346985,
"learning_rate": 4.586776859504133e-07,
"loss": 0.0165,
"step": 5230
},
{
"epoch": 2.943820224719101,
"grad_norm": 0.2549396753311157,
"learning_rate": 4.173553719008265e-07,
"loss": 0.019,
"step": 5240
},
{
"epoch": 2.949438202247191,
"grad_norm": 0.36846932768821716,
"learning_rate": 3.760330578512397e-07,
"loss": 0.0245,
"step": 5250
},
{
"epoch": 2.949438202247191,
"eval_f1": 0.7906976744186046,
"eval_loss": 0.02675323560833931,
"eval_precision": 0.7557579003749331,
"eval_recall": 0.8290246768507638,
"eval_runtime": 28.1644,
"eval_samples_per_second": 139.538,
"eval_steps_per_second": 4.367,
"step": 5250
},
{
"epoch": 2.955056179775281,
"grad_norm": 0.3710382282733917,
"learning_rate": 3.3471074380165295e-07,
"loss": 0.0188,
"step": 5260
},
{
"epoch": 2.960674157303371,
"grad_norm": 0.36082541942596436,
"learning_rate": 2.9338842975206613e-07,
"loss": 0.027,
"step": 5270
},
{
"epoch": 2.966292134831461,
"grad_norm": 0.573581874370575,
"learning_rate": 2.5206611570247936e-07,
"loss": 0.0183,
"step": 5280
},
{
"epoch": 2.9719101123595504,
"grad_norm": 0.30395030975341797,
"learning_rate": 2.1074380165289256e-07,
"loss": 0.0232,
"step": 5290
},
{
"epoch": 2.9775280898876404,
"grad_norm": 0.38595783710479736,
"learning_rate": 1.694214876033058e-07,
"loss": 0.0173,
"step": 5300
},
{
"epoch": 2.9831460674157304,
"grad_norm": 0.25339680910110474,
"learning_rate": 1.2809917355371902e-07,
"loss": 0.0167,
"step": 5310
},
{
"epoch": 2.98876404494382,
"grad_norm": 0.41702669858932495,
"learning_rate": 8.677685950413224e-08,
"loss": 0.0205,
"step": 5320
},
{
"epoch": 2.99438202247191,
"grad_norm": 0.18185748159885406,
"learning_rate": 4.545454545454546e-08,
"loss": 0.0151,
"step": 5330
},
{
"epoch": 3.0,
"grad_norm": 0.7380737662315369,
"learning_rate": 4.132231404958678e-09,
"loss": 0.0193,
"step": 5340
}
],
"logging_steps": 10,
"max_steps": 5340,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 250,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 1
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5580565286704128.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}