CaffeineThief's picture
Upload saved model files
db02f52 verified
{
"best_global_step": 545,
"best_metric": 0.5921985815602837,
"best_model_checkpoint": "./cysecbert-ttp-bert-base_data/checkpoint-545",
"epoch": 5.0,
"eval_steps": 500,
"global_step": 545,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.009174311926605505,
"grad_norm": 46530.15234375,
"learning_rate": 0.0,
"loss": 0.7023,
"step": 1
},
{
"epoch": 0.01834862385321101,
"grad_norm": 49571.35546875,
"learning_rate": 5.000000000000001e-07,
"loss": 0.7006,
"step": 2
},
{
"epoch": 0.027522935779816515,
"grad_norm": 45885.55078125,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.7033,
"step": 3
},
{
"epoch": 0.03669724770642202,
"grad_norm": 43333.73046875,
"learning_rate": 1.5e-06,
"loss": 0.7014,
"step": 4
},
{
"epoch": 0.045871559633027525,
"grad_norm": 44340.41796875,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.6992,
"step": 5
},
{
"epoch": 0.05504587155963303,
"grad_norm": 48282.54296875,
"learning_rate": 2.5e-06,
"loss": 0.6924,
"step": 6
},
{
"epoch": 0.06422018348623854,
"grad_norm": 48561.56640625,
"learning_rate": 3e-06,
"loss": 0.6948,
"step": 7
},
{
"epoch": 0.07339449541284404,
"grad_norm": 43580.078125,
"learning_rate": 3.5000000000000004e-06,
"loss": 0.6916,
"step": 8
},
{
"epoch": 0.08256880733944955,
"grad_norm": 46552.28515625,
"learning_rate": 4.000000000000001e-06,
"loss": 0.6854,
"step": 9
},
{
"epoch": 0.09174311926605505,
"grad_norm": 49042.15234375,
"learning_rate": 4.5e-06,
"loss": 0.6822,
"step": 10
},
{
"epoch": 0.10091743119266056,
"grad_norm": 97274.8515625,
"learning_rate": 5e-06,
"loss": 0.6788,
"step": 11
},
{
"epoch": 0.11009174311926606,
"grad_norm": 55310.375,
"learning_rate": 5.500000000000001e-06,
"loss": 0.6781,
"step": 12
},
{
"epoch": 0.11926605504587157,
"grad_norm": 70317.6328125,
"learning_rate": 6e-06,
"loss": 0.6754,
"step": 13
},
{
"epoch": 0.12844036697247707,
"grad_norm": 46825.9765625,
"learning_rate": 6.5000000000000004e-06,
"loss": 0.6626,
"step": 14
},
{
"epoch": 0.13761467889908258,
"grad_norm": 50745.15625,
"learning_rate": 7.000000000000001e-06,
"loss": 0.655,
"step": 15
},
{
"epoch": 0.14678899082568808,
"grad_norm": 51342.453125,
"learning_rate": 7.5e-06,
"loss": 0.6491,
"step": 16
},
{
"epoch": 0.1559633027522936,
"grad_norm": 51815.421875,
"learning_rate": 8.000000000000001e-06,
"loss": 0.6464,
"step": 17
},
{
"epoch": 0.1651376146788991,
"grad_norm": 54610.65234375,
"learning_rate": 8.500000000000002e-06,
"loss": 0.6298,
"step": 18
},
{
"epoch": 0.1743119266055046,
"grad_norm": 61936.31640625,
"learning_rate": 9e-06,
"loss": 0.623,
"step": 19
},
{
"epoch": 0.1834862385321101,
"grad_norm": 56021.69921875,
"learning_rate": 9.5e-06,
"loss": 0.6156,
"step": 20
},
{
"epoch": 0.1926605504587156,
"grad_norm": 60806.10546875,
"learning_rate": 1e-05,
"loss": 0.5981,
"step": 21
},
{
"epoch": 0.2018348623853211,
"grad_norm": 58582.4140625,
"learning_rate": 1.05e-05,
"loss": 0.5869,
"step": 22
},
{
"epoch": 0.21100917431192662,
"grad_norm": 59099.16015625,
"learning_rate": 1.1000000000000001e-05,
"loss": 0.5732,
"step": 23
},
{
"epoch": 0.22018348623853212,
"grad_norm": 52971.81640625,
"learning_rate": 1.1500000000000002e-05,
"loss": 0.5587,
"step": 24
},
{
"epoch": 0.22935779816513763,
"grad_norm": 51724.43359375,
"learning_rate": 1.2e-05,
"loss": 0.5477,
"step": 25
},
{
"epoch": 0.23853211009174313,
"grad_norm": 58133.453125,
"learning_rate": 1.25e-05,
"loss": 0.5422,
"step": 26
},
{
"epoch": 0.24770642201834864,
"grad_norm": 54134.3359375,
"learning_rate": 1.3000000000000001e-05,
"loss": 0.5319,
"step": 27
},
{
"epoch": 0.25688073394495414,
"grad_norm": 78195.7890625,
"learning_rate": 1.3500000000000001e-05,
"loss": 0.5259,
"step": 28
},
{
"epoch": 0.26605504587155965,
"grad_norm": 47713.20703125,
"learning_rate": 1.4000000000000001e-05,
"loss": 0.5105,
"step": 29
},
{
"epoch": 0.27522935779816515,
"grad_norm": 45838.0703125,
"learning_rate": 1.45e-05,
"loss": 0.499,
"step": 30
},
{
"epoch": 0.28440366972477066,
"grad_norm": 46735.58203125,
"learning_rate": 1.5e-05,
"loss": 0.4914,
"step": 31
},
{
"epoch": 0.29357798165137616,
"grad_norm": 45432.8828125,
"learning_rate": 1.55e-05,
"loss": 0.4783,
"step": 32
},
{
"epoch": 0.30275229357798167,
"grad_norm": 46758.58984375,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.4758,
"step": 33
},
{
"epoch": 0.3119266055045872,
"grad_norm": 45245.82421875,
"learning_rate": 1.65e-05,
"loss": 0.4608,
"step": 34
},
{
"epoch": 0.3211009174311927,
"grad_norm": 43336.75390625,
"learning_rate": 1.7000000000000003e-05,
"loss": 0.4567,
"step": 35
},
{
"epoch": 0.3302752293577982,
"grad_norm": 41445.90625,
"learning_rate": 1.75e-05,
"loss": 0.4465,
"step": 36
},
{
"epoch": 0.3394495412844037,
"grad_norm": 42278.80078125,
"learning_rate": 1.8e-05,
"loss": 0.4395,
"step": 37
},
{
"epoch": 0.3486238532110092,
"grad_norm": 42039.28515625,
"learning_rate": 1.85e-05,
"loss": 0.4289,
"step": 38
},
{
"epoch": 0.3577981651376147,
"grad_norm": 41411.61328125,
"learning_rate": 1.9e-05,
"loss": 0.423,
"step": 39
},
{
"epoch": 0.3669724770642202,
"grad_norm": 40133.84375,
"learning_rate": 1.9500000000000003e-05,
"loss": 0.4191,
"step": 40
},
{
"epoch": 0.3761467889908257,
"grad_norm": 39758.8828125,
"learning_rate": 2e-05,
"loss": 0.4071,
"step": 41
},
{
"epoch": 0.3853211009174312,
"grad_norm": 38760.36328125,
"learning_rate": 2.05e-05,
"loss": 0.3996,
"step": 42
},
{
"epoch": 0.3944954128440367,
"grad_norm": 38552.80078125,
"learning_rate": 2.1e-05,
"loss": 0.3918,
"step": 43
},
{
"epoch": 0.4036697247706422,
"grad_norm": 38561.7578125,
"learning_rate": 2.15e-05,
"loss": 0.3865,
"step": 44
},
{
"epoch": 0.41284403669724773,
"grad_norm": 37616.85546875,
"learning_rate": 2.2000000000000003e-05,
"loss": 0.3794,
"step": 45
},
{
"epoch": 0.42201834862385323,
"grad_norm": 37784.78515625,
"learning_rate": 2.25e-05,
"loss": 0.3745,
"step": 46
},
{
"epoch": 0.43119266055045874,
"grad_norm": 36332.9140625,
"learning_rate": 2.3000000000000003e-05,
"loss": 0.3711,
"step": 47
},
{
"epoch": 0.44036697247706424,
"grad_norm": 38899.73046875,
"learning_rate": 2.35e-05,
"loss": 0.3643,
"step": 48
},
{
"epoch": 0.44954128440366975,
"grad_norm": 36174.9765625,
"learning_rate": 2.4e-05,
"loss": 0.3575,
"step": 49
},
{
"epoch": 0.45871559633027525,
"grad_norm": 35699.83203125,
"learning_rate": 2.45e-05,
"loss": 0.347,
"step": 50
},
{
"epoch": 0.46788990825688076,
"grad_norm": 35562.3671875,
"learning_rate": 2.5e-05,
"loss": 0.3438,
"step": 51
},
{
"epoch": 0.47706422018348627,
"grad_norm": 35428.0625,
"learning_rate": 2.5500000000000003e-05,
"loss": 0.3325,
"step": 52
},
{
"epoch": 0.48623853211009177,
"grad_norm": 34396.80078125,
"learning_rate": 2.6000000000000002e-05,
"loss": 0.3302,
"step": 53
},
{
"epoch": 0.4954128440366973,
"grad_norm": 38376.41796875,
"learning_rate": 2.6500000000000004e-05,
"loss": 0.3218,
"step": 54
},
{
"epoch": 0.5045871559633027,
"grad_norm": 33996.23046875,
"learning_rate": 2.7000000000000002e-05,
"loss": 0.3131,
"step": 55
},
{
"epoch": 0.5137614678899083,
"grad_norm": 33070.55859375,
"learning_rate": 2.7500000000000004e-05,
"loss": 0.3129,
"step": 56
},
{
"epoch": 0.5229357798165137,
"grad_norm": 33200.44140625,
"learning_rate": 2.8000000000000003e-05,
"loss": 0.3003,
"step": 57
},
{
"epoch": 0.5321100917431193,
"grad_norm": 33099.1015625,
"learning_rate": 2.8499999999999998e-05,
"loss": 0.2904,
"step": 58
},
{
"epoch": 0.5412844036697247,
"grad_norm": 31352.298828125,
"learning_rate": 2.9e-05,
"loss": 0.2911,
"step": 59
},
{
"epoch": 0.5504587155963303,
"grad_norm": 31965.087890625,
"learning_rate": 2.95e-05,
"loss": 0.2815,
"step": 60
},
{
"epoch": 0.5596330275229358,
"grad_norm": 30810.849609375,
"learning_rate": 3e-05,
"loss": 0.2818,
"step": 61
},
{
"epoch": 0.5688073394495413,
"grad_norm": 30034.779296875,
"learning_rate": 3.05e-05,
"loss": 0.2731,
"step": 62
},
{
"epoch": 0.5779816513761468,
"grad_norm": 30009.12109375,
"learning_rate": 3.1e-05,
"loss": 0.266,
"step": 63
},
{
"epoch": 0.5871559633027523,
"grad_norm": 29305.173828125,
"learning_rate": 3.15e-05,
"loss": 0.2609,
"step": 64
},
{
"epoch": 0.5963302752293578,
"grad_norm": 29081.853515625,
"learning_rate": 3.2000000000000005e-05,
"loss": 0.2543,
"step": 65
},
{
"epoch": 0.6055045871559633,
"grad_norm": 28217.021484375,
"learning_rate": 3.2500000000000004e-05,
"loss": 0.252,
"step": 66
},
{
"epoch": 0.6146788990825688,
"grad_norm": 26448.12890625,
"learning_rate": 3.3e-05,
"loss": 0.2565,
"step": 67
},
{
"epoch": 0.6238532110091743,
"grad_norm": 27198.80859375,
"learning_rate": 3.35e-05,
"loss": 0.2342,
"step": 68
},
{
"epoch": 0.6330275229357798,
"grad_norm": 26946.30859375,
"learning_rate": 3.4000000000000007e-05,
"loss": 0.2273,
"step": 69
},
{
"epoch": 0.6422018348623854,
"grad_norm": 26236.7265625,
"learning_rate": 3.45e-05,
"loss": 0.2219,
"step": 70
},
{
"epoch": 0.6513761467889908,
"grad_norm": 25535.818359375,
"learning_rate": 3.5e-05,
"loss": 0.2197,
"step": 71
},
{
"epoch": 0.6605504587155964,
"grad_norm": 24871.8515625,
"learning_rate": 3.55e-05,
"loss": 0.2144,
"step": 72
},
{
"epoch": 0.6697247706422018,
"grad_norm": 24484.55078125,
"learning_rate": 3.6e-05,
"loss": 0.2066,
"step": 73
},
{
"epoch": 0.6788990825688074,
"grad_norm": 23655.677734375,
"learning_rate": 3.65e-05,
"loss": 0.206,
"step": 74
},
{
"epoch": 0.6880733944954128,
"grad_norm": 23129.076171875,
"learning_rate": 3.7e-05,
"loss": 0.1979,
"step": 75
},
{
"epoch": 0.6972477064220184,
"grad_norm": 21871.90625,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.1973,
"step": 76
},
{
"epoch": 0.7064220183486238,
"grad_norm": 21561.58984375,
"learning_rate": 3.8e-05,
"loss": 0.1935,
"step": 77
},
{
"epoch": 0.7155963302752294,
"grad_norm": 22210.29296875,
"learning_rate": 3.85e-05,
"loss": 0.1848,
"step": 78
},
{
"epoch": 0.7247706422018348,
"grad_norm": 20555.775390625,
"learning_rate": 3.9000000000000006e-05,
"loss": 0.1801,
"step": 79
},
{
"epoch": 0.7339449541284404,
"grad_norm": 20406.75,
"learning_rate": 3.9500000000000005e-05,
"loss": 0.1746,
"step": 80
},
{
"epoch": 0.7431192660550459,
"grad_norm": 19150.931640625,
"learning_rate": 4e-05,
"loss": 0.1688,
"step": 81
},
{
"epoch": 0.7522935779816514,
"grad_norm": 18422.65625,
"learning_rate": 4.05e-05,
"loss": 0.1661,
"step": 82
},
{
"epoch": 0.7614678899082569,
"grad_norm": 16746.20703125,
"learning_rate": 4.1e-05,
"loss": 0.1711,
"step": 83
},
{
"epoch": 0.7706422018348624,
"grad_norm": 18229.41015625,
"learning_rate": 4.15e-05,
"loss": 0.1646,
"step": 84
},
{
"epoch": 0.7798165137614679,
"grad_norm": 17156.267578125,
"learning_rate": 4.2e-05,
"loss": 0.1588,
"step": 85
},
{
"epoch": 0.7889908256880734,
"grad_norm": 15970.611328125,
"learning_rate": 4.25e-05,
"loss": 0.1547,
"step": 86
},
{
"epoch": 0.7981651376146789,
"grad_norm": 16997.103515625,
"learning_rate": 4.3e-05,
"loss": 0.1493,
"step": 87
},
{
"epoch": 0.8073394495412844,
"grad_norm": 15529.58984375,
"learning_rate": 4.35e-05,
"loss": 0.1439,
"step": 88
},
{
"epoch": 0.8165137614678899,
"grad_norm": 15099.9052734375,
"learning_rate": 4.4000000000000006e-05,
"loss": 0.1432,
"step": 89
},
{
"epoch": 0.8256880733944955,
"grad_norm": 14261.3232421875,
"learning_rate": 4.4500000000000004e-05,
"loss": 0.1396,
"step": 90
},
{
"epoch": 0.8348623853211009,
"grad_norm": 13915.87890625,
"learning_rate": 4.5e-05,
"loss": 0.1317,
"step": 91
},
{
"epoch": 0.8440366972477065,
"grad_norm": 13518.7568359375,
"learning_rate": 4.55e-05,
"loss": 0.1289,
"step": 92
},
{
"epoch": 0.8532110091743119,
"grad_norm": 12170.1865234375,
"learning_rate": 4.600000000000001e-05,
"loss": 0.1371,
"step": 93
},
{
"epoch": 0.8623853211009175,
"grad_norm": 11745.9775390625,
"learning_rate": 4.6500000000000005e-05,
"loss": 0.1434,
"step": 94
},
{
"epoch": 0.8715596330275229,
"grad_norm": 12265.5556640625,
"learning_rate": 4.7e-05,
"loss": 0.1343,
"step": 95
},
{
"epoch": 0.8807339449541285,
"grad_norm": 11386.3291015625,
"learning_rate": 4.75e-05,
"loss": 0.1317,
"step": 96
},
{
"epoch": 0.8899082568807339,
"grad_norm": 10869.4599609375,
"learning_rate": 4.8e-05,
"loss": 0.1295,
"step": 97
},
{
"epoch": 0.8990825688073395,
"grad_norm": 10752.25,
"learning_rate": 4.85e-05,
"loss": 0.124,
"step": 98
},
{
"epoch": 0.908256880733945,
"grad_norm": 9687.6376953125,
"learning_rate": 4.9e-05,
"loss": 0.1271,
"step": 99
},
{
"epoch": 0.9174311926605505,
"grad_norm": 9697.15234375,
"learning_rate": 4.9500000000000004e-05,
"loss": 0.1194,
"step": 100
},
{
"epoch": 0.926605504587156,
"grad_norm": 9423.21875,
"learning_rate": 5e-05,
"loss": 0.1176,
"step": 101
},
{
"epoch": 0.9357798165137615,
"grad_norm": 10253.8564453125,
"learning_rate": 4.994949494949495e-05,
"loss": 0.121,
"step": 102
},
{
"epoch": 0.944954128440367,
"grad_norm": 9756.7646484375,
"learning_rate": 4.98989898989899e-05,
"loss": 0.1173,
"step": 103
},
{
"epoch": 0.9541284403669725,
"grad_norm": 8482.177734375,
"learning_rate": 4.984848484848485e-05,
"loss": 0.1233,
"step": 104
},
{
"epoch": 0.963302752293578,
"grad_norm": 7755.4892578125,
"learning_rate": 4.97979797979798e-05,
"loss": 0.1251,
"step": 105
},
{
"epoch": 0.9724770642201835,
"grad_norm": 8553.6689453125,
"learning_rate": 4.974747474747475e-05,
"loss": 0.1287,
"step": 106
},
{
"epoch": 0.981651376146789,
"grad_norm": 8794.58984375,
"learning_rate": 4.9696969696969694e-05,
"loss": 0.1178,
"step": 107
},
{
"epoch": 0.9908256880733946,
"grad_norm": 8048.97509765625,
"learning_rate": 4.964646464646465e-05,
"loss": 0.1112,
"step": 108
},
{
"epoch": 1.0,
"grad_norm": 24349.96484375,
"learning_rate": 4.9595959595959594e-05,
"loss": 0.112,
"step": 109
},
{
"epoch": 1.0,
"eval_f1_macro": 0.0,
"eval_f1_micro": 0.0,
"eval_loss": 0.1133684441447258,
"eval_precision": 0.0,
"eval_recall": 0.0,
"eval_runtime": 0.8578,
"eval_samples_per_second": 336.892,
"eval_steps_per_second": 15.154,
"step": 109
},
{
"epoch": 1.0091743119266054,
"grad_norm": 7603.02587890625,
"learning_rate": 4.9545454545454553e-05,
"loss": 0.1133,
"step": 110
},
{
"epoch": 1.018348623853211,
"grad_norm": 8602.8974609375,
"learning_rate": 4.94949494949495e-05,
"loss": 0.1192,
"step": 111
},
{
"epoch": 1.0275229357798166,
"grad_norm": 8388.9609375,
"learning_rate": 4.9444444444444446e-05,
"loss": 0.1246,
"step": 112
},
{
"epoch": 1.036697247706422,
"grad_norm": 9368.150390625,
"learning_rate": 4.93939393939394e-05,
"loss": 0.1089,
"step": 113
},
{
"epoch": 1.0458715596330275,
"grad_norm": 7053.083984375,
"learning_rate": 4.9343434343434346e-05,
"loss": 0.1042,
"step": 114
},
{
"epoch": 1.0550458715596331,
"grad_norm": 9539.36328125,
"learning_rate": 4.92929292929293e-05,
"loss": 0.1049,
"step": 115
},
{
"epoch": 1.0642201834862386,
"grad_norm": 6956.2763671875,
"learning_rate": 4.9242424242424245e-05,
"loss": 0.109,
"step": 116
},
{
"epoch": 1.073394495412844,
"grad_norm": 6597.9580078125,
"learning_rate": 4.919191919191919e-05,
"loss": 0.1124,
"step": 117
},
{
"epoch": 1.0825688073394495,
"grad_norm": 8124.76123046875,
"learning_rate": 4.9141414141414145e-05,
"loss": 0.1226,
"step": 118
},
{
"epoch": 1.091743119266055,
"grad_norm": 7030.619140625,
"learning_rate": 4.909090909090909e-05,
"loss": 0.0989,
"step": 119
},
{
"epoch": 1.1009174311926606,
"grad_norm": 9377.66796875,
"learning_rate": 4.9040404040404044e-05,
"loss": 0.1038,
"step": 120
},
{
"epoch": 1.110091743119266,
"grad_norm": 9298.802734375,
"learning_rate": 4.898989898989899e-05,
"loss": 0.1216,
"step": 121
},
{
"epoch": 1.1192660550458715,
"grad_norm": 7350.28369140625,
"learning_rate": 4.8939393939393944e-05,
"loss": 0.0915,
"step": 122
},
{
"epoch": 1.1284403669724772,
"grad_norm": 8066.943359375,
"learning_rate": 4.888888888888889e-05,
"loss": 0.1291,
"step": 123
},
{
"epoch": 1.1376146788990826,
"grad_norm": 8210.3095703125,
"learning_rate": 4.8838383838383836e-05,
"loss": 0.1196,
"step": 124
},
{
"epoch": 1.146788990825688,
"grad_norm": 7568.0234375,
"learning_rate": 4.878787878787879e-05,
"loss": 0.1037,
"step": 125
},
{
"epoch": 1.1559633027522935,
"grad_norm": 6394.8896484375,
"learning_rate": 4.8737373737373736e-05,
"loss": 0.0961,
"step": 126
},
{
"epoch": 1.165137614678899,
"grad_norm": 7549.9951171875,
"learning_rate": 4.868686868686869e-05,
"loss": 0.1075,
"step": 127
},
{
"epoch": 1.1743119266055047,
"grad_norm": 7983.36865234375,
"learning_rate": 4.863636363636364e-05,
"loss": 0.1055,
"step": 128
},
{
"epoch": 1.18348623853211,
"grad_norm": 7613.1455078125,
"learning_rate": 4.858585858585859e-05,
"loss": 0.1097,
"step": 129
},
{
"epoch": 1.1926605504587156,
"grad_norm": 7754.9228515625,
"learning_rate": 4.853535353535354e-05,
"loss": 0.1157,
"step": 130
},
{
"epoch": 1.2018348623853212,
"grad_norm": 8360.388671875,
"learning_rate": 4.848484848484849e-05,
"loss": 0.1019,
"step": 131
},
{
"epoch": 1.2110091743119267,
"grad_norm": 8300.9169921875,
"learning_rate": 4.843434343434344e-05,
"loss": 0.1098,
"step": 132
},
{
"epoch": 1.2201834862385321,
"grad_norm": 7554.8017578125,
"learning_rate": 4.838383838383839e-05,
"loss": 0.1075,
"step": 133
},
{
"epoch": 1.2293577981651376,
"grad_norm": 6510.427734375,
"learning_rate": 4.8333333333333334e-05,
"loss": 0.1041,
"step": 134
},
{
"epoch": 1.238532110091743,
"grad_norm": 5649.77978515625,
"learning_rate": 4.828282828282829e-05,
"loss": 0.1012,
"step": 135
},
{
"epoch": 1.2477064220183487,
"grad_norm": 7662.76513671875,
"learning_rate": 4.823232323232323e-05,
"loss": 0.1357,
"step": 136
},
{
"epoch": 1.2568807339449541,
"grad_norm": 7261.087890625,
"learning_rate": 4.8181818181818186e-05,
"loss": 0.1149,
"step": 137
},
{
"epoch": 1.2660550458715596,
"grad_norm": 6818.208984375,
"learning_rate": 4.813131313131313e-05,
"loss": 0.1067,
"step": 138
},
{
"epoch": 1.2752293577981653,
"grad_norm": 8839.0810546875,
"learning_rate": 4.808080808080808e-05,
"loss": 0.089,
"step": 139
},
{
"epoch": 1.2844036697247707,
"grad_norm": 8774.0615234375,
"learning_rate": 4.803030303030303e-05,
"loss": 0.1013,
"step": 140
},
{
"epoch": 1.2935779816513762,
"grad_norm": 7869.935546875,
"learning_rate": 4.797979797979798e-05,
"loss": 0.1187,
"step": 141
},
{
"epoch": 1.3027522935779816,
"grad_norm": 9808.5986328125,
"learning_rate": 4.792929292929293e-05,
"loss": 0.13,
"step": 142
},
{
"epoch": 1.311926605504587,
"grad_norm": 7122.400390625,
"learning_rate": 4.787878787878788e-05,
"loss": 0.1089,
"step": 143
},
{
"epoch": 1.3211009174311927,
"grad_norm": 6575.39892578125,
"learning_rate": 4.782828282828283e-05,
"loss": 0.1011,
"step": 144
},
{
"epoch": 1.3302752293577982,
"grad_norm": 8656.4345703125,
"learning_rate": 4.7777777777777784e-05,
"loss": 0.0998,
"step": 145
},
{
"epoch": 1.3394495412844036,
"grad_norm": 7247.53466796875,
"learning_rate": 4.772727272727273e-05,
"loss": 0.1255,
"step": 146
},
{
"epoch": 1.3486238532110093,
"grad_norm": 7125.67822265625,
"learning_rate": 4.7676767676767684e-05,
"loss": 0.1123,
"step": 147
},
{
"epoch": 1.3577981651376148,
"grad_norm": 7758.84375,
"learning_rate": 4.762626262626263e-05,
"loss": 0.1138,
"step": 148
},
{
"epoch": 1.3669724770642202,
"grad_norm": 6958.53173828125,
"learning_rate": 4.7575757575757576e-05,
"loss": 0.0968,
"step": 149
},
{
"epoch": 1.3761467889908257,
"grad_norm": 7246.337890625,
"learning_rate": 4.752525252525253e-05,
"loss": 0.0924,
"step": 150
},
{
"epoch": 1.385321100917431,
"grad_norm": 7635.5615234375,
"learning_rate": 4.7474747474747476e-05,
"loss": 0.1152,
"step": 151
},
{
"epoch": 1.3944954128440368,
"grad_norm": 7913.57080078125,
"learning_rate": 4.742424242424243e-05,
"loss": 0.0951,
"step": 152
},
{
"epoch": 1.4036697247706422,
"grad_norm": 8531.388671875,
"learning_rate": 4.7373737373737375e-05,
"loss": 0.1161,
"step": 153
},
{
"epoch": 1.4128440366972477,
"grad_norm": 9724.3291015625,
"learning_rate": 4.732323232323232e-05,
"loss": 0.1253,
"step": 154
},
{
"epoch": 1.4220183486238533,
"grad_norm": 6726.74365234375,
"learning_rate": 4.7272727272727275e-05,
"loss": 0.1037,
"step": 155
},
{
"epoch": 1.4311926605504588,
"grad_norm": 6435.69970703125,
"learning_rate": 4.722222222222222e-05,
"loss": 0.1036,
"step": 156
},
{
"epoch": 1.4403669724770642,
"grad_norm": 6851.59814453125,
"learning_rate": 4.7171717171717174e-05,
"loss": 0.0954,
"step": 157
},
{
"epoch": 1.4495412844036697,
"grad_norm": 7785.31640625,
"learning_rate": 4.712121212121212e-05,
"loss": 0.0996,
"step": 158
},
{
"epoch": 1.4587155963302751,
"grad_norm": 6317.5576171875,
"learning_rate": 4.7070707070707074e-05,
"loss": 0.0977,
"step": 159
},
{
"epoch": 1.4678899082568808,
"grad_norm": 8045.7197265625,
"learning_rate": 4.702020202020202e-05,
"loss": 0.1101,
"step": 160
},
{
"epoch": 1.4770642201834863,
"grad_norm": 7386.67529296875,
"learning_rate": 4.696969696969697e-05,
"loss": 0.1137,
"step": 161
},
{
"epoch": 1.4862385321100917,
"grad_norm": 8017.5791015625,
"learning_rate": 4.6919191919191926e-05,
"loss": 0.0959,
"step": 162
},
{
"epoch": 1.4954128440366974,
"grad_norm": 7198.42138671875,
"learning_rate": 4.686868686868687e-05,
"loss": 0.1005,
"step": 163
},
{
"epoch": 1.5045871559633026,
"grad_norm": 9139.7900390625,
"learning_rate": 4.681818181818182e-05,
"loss": 0.0918,
"step": 164
},
{
"epoch": 1.5137614678899083,
"grad_norm": 6384.1640625,
"learning_rate": 4.676767676767677e-05,
"loss": 0.0948,
"step": 165
},
{
"epoch": 1.5229357798165137,
"grad_norm": 5053.224609375,
"learning_rate": 4.671717171717172e-05,
"loss": 0.1056,
"step": 166
},
{
"epoch": 1.5321100917431192,
"grad_norm": 9213.654296875,
"learning_rate": 4.666666666666667e-05,
"loss": 0.0931,
"step": 167
},
{
"epoch": 1.5412844036697249,
"grad_norm": 7414.76171875,
"learning_rate": 4.661616161616162e-05,
"loss": 0.1123,
"step": 168
},
{
"epoch": 1.5504587155963303,
"grad_norm": 6406.48583984375,
"learning_rate": 4.656565656565657e-05,
"loss": 0.0992,
"step": 169
},
{
"epoch": 1.5596330275229358,
"grad_norm": 7213.58837890625,
"learning_rate": 4.651515151515152e-05,
"loss": 0.0952,
"step": 170
},
{
"epoch": 1.5688073394495414,
"grad_norm": 7537.673828125,
"learning_rate": 4.6464646464646464e-05,
"loss": 0.0961,
"step": 171
},
{
"epoch": 1.5779816513761467,
"grad_norm": 7876.294921875,
"learning_rate": 4.641414141414142e-05,
"loss": 0.1308,
"step": 172
},
{
"epoch": 1.5871559633027523,
"grad_norm": 9550.125,
"learning_rate": 4.636363636363636e-05,
"loss": 0.0941,
"step": 173
},
{
"epoch": 1.5963302752293578,
"grad_norm": 6364.3330078125,
"learning_rate": 4.6313131313131316e-05,
"loss": 0.0976,
"step": 174
},
{
"epoch": 1.6055045871559632,
"grad_norm": 6976.2138671875,
"learning_rate": 4.626262626262626e-05,
"loss": 0.091,
"step": 175
},
{
"epoch": 1.614678899082569,
"grad_norm": 7259.40234375,
"learning_rate": 4.621212121212121e-05,
"loss": 0.1096,
"step": 176
},
{
"epoch": 1.6238532110091743,
"grad_norm": 8176.20849609375,
"learning_rate": 4.616161616161616e-05,
"loss": 0.1029,
"step": 177
},
{
"epoch": 1.6330275229357798,
"grad_norm": 5628.04345703125,
"learning_rate": 4.6111111111111115e-05,
"loss": 0.1008,
"step": 178
},
{
"epoch": 1.6422018348623855,
"grad_norm": 6802.91064453125,
"learning_rate": 4.606060606060607e-05,
"loss": 0.1014,
"step": 179
},
{
"epoch": 1.6513761467889907,
"grad_norm": 14422.4482421875,
"learning_rate": 4.6010101010101015e-05,
"loss": 0.0912,
"step": 180
},
{
"epoch": 1.6605504587155964,
"grad_norm": 8855.7744140625,
"learning_rate": 4.595959595959596e-05,
"loss": 0.0983,
"step": 181
},
{
"epoch": 1.6697247706422018,
"grad_norm": 7411.93603515625,
"learning_rate": 4.5909090909090914e-05,
"loss": 0.1099,
"step": 182
},
{
"epoch": 1.6788990825688073,
"grad_norm": 8138.41748046875,
"learning_rate": 4.585858585858586e-05,
"loss": 0.1054,
"step": 183
},
{
"epoch": 1.688073394495413,
"grad_norm": 6157.80908203125,
"learning_rate": 4.5808080808080814e-05,
"loss": 0.1054,
"step": 184
},
{
"epoch": 1.6972477064220184,
"grad_norm": 6791.07421875,
"learning_rate": 4.575757575757576e-05,
"loss": 0.1022,
"step": 185
},
{
"epoch": 1.7064220183486238,
"grad_norm": 9991.0537109375,
"learning_rate": 4.5707070707070706e-05,
"loss": 0.1131,
"step": 186
},
{
"epoch": 1.7155963302752295,
"grad_norm": 7240.28955078125,
"learning_rate": 4.565656565656566e-05,
"loss": 0.1005,
"step": 187
},
{
"epoch": 1.7247706422018347,
"grad_norm": 6442.6259765625,
"learning_rate": 4.5606060606060606e-05,
"loss": 0.1015,
"step": 188
},
{
"epoch": 1.7339449541284404,
"grad_norm": 8573.3955078125,
"learning_rate": 4.555555555555556e-05,
"loss": 0.1176,
"step": 189
},
{
"epoch": 1.7431192660550459,
"grad_norm": 7330.88525390625,
"learning_rate": 4.5505050505050505e-05,
"loss": 0.0923,
"step": 190
},
{
"epoch": 1.7522935779816513,
"grad_norm": 7870.916015625,
"learning_rate": 4.545454545454546e-05,
"loss": 0.1113,
"step": 191
},
{
"epoch": 1.761467889908257,
"grad_norm": 7789.8564453125,
"learning_rate": 4.5404040404040405e-05,
"loss": 0.1062,
"step": 192
},
{
"epoch": 1.7706422018348624,
"grad_norm": 7983.59326171875,
"learning_rate": 4.535353535353535e-05,
"loss": 0.1078,
"step": 193
},
{
"epoch": 1.7798165137614679,
"grad_norm": 6533.43994140625,
"learning_rate": 4.5303030303030304e-05,
"loss": 0.1033,
"step": 194
},
{
"epoch": 1.7889908256880735,
"grad_norm": 7317.1318359375,
"learning_rate": 4.525252525252526e-05,
"loss": 0.1106,
"step": 195
},
{
"epoch": 1.7981651376146788,
"grad_norm": 7829.7880859375,
"learning_rate": 4.5202020202020204e-05,
"loss": 0.1163,
"step": 196
},
{
"epoch": 1.8073394495412844,
"grad_norm": 7524.41357421875,
"learning_rate": 4.515151515151516e-05,
"loss": 0.1043,
"step": 197
},
{
"epoch": 1.81651376146789,
"grad_norm": 7487.89892578125,
"learning_rate": 4.51010101010101e-05,
"loss": 0.1091,
"step": 198
},
{
"epoch": 1.8256880733944953,
"grad_norm": 6964.3154296875,
"learning_rate": 4.5050505050505056e-05,
"loss": 0.0986,
"step": 199
},
{
"epoch": 1.834862385321101,
"grad_norm": 7087.05029296875,
"learning_rate": 4.5e-05,
"loss": 0.0997,
"step": 200
},
{
"epoch": 1.8440366972477065,
"grad_norm": 7148.7578125,
"learning_rate": 4.494949494949495e-05,
"loss": 0.0989,
"step": 201
},
{
"epoch": 1.853211009174312,
"grad_norm": 7850.2470703125,
"learning_rate": 4.48989898989899e-05,
"loss": 0.0957,
"step": 202
},
{
"epoch": 1.8623853211009176,
"grad_norm": 7752.841796875,
"learning_rate": 4.484848484848485e-05,
"loss": 0.1292,
"step": 203
},
{
"epoch": 1.8715596330275228,
"grad_norm": 6098.35595703125,
"learning_rate": 4.47979797979798e-05,
"loss": 0.1001,
"step": 204
},
{
"epoch": 1.8807339449541285,
"grad_norm": 7094.8173828125,
"learning_rate": 4.474747474747475e-05,
"loss": 0.1286,
"step": 205
},
{
"epoch": 1.889908256880734,
"grad_norm": 7456.2265625,
"learning_rate": 4.46969696969697e-05,
"loss": 0.1256,
"step": 206
},
{
"epoch": 1.8990825688073394,
"grad_norm": 8136.77197265625,
"learning_rate": 4.464646464646465e-05,
"loss": 0.1014,
"step": 207
},
{
"epoch": 1.908256880733945,
"grad_norm": 7520.1650390625,
"learning_rate": 4.4595959595959594e-05,
"loss": 0.0919,
"step": 208
},
{
"epoch": 1.9174311926605505,
"grad_norm": 8064.79296875,
"learning_rate": 4.454545454545455e-05,
"loss": 0.0985,
"step": 209
},
{
"epoch": 1.926605504587156,
"grad_norm": 6153.19677734375,
"learning_rate": 4.4494949494949493e-05,
"loss": 0.1101,
"step": 210
},
{
"epoch": 1.9357798165137616,
"grad_norm": 8089.42431640625,
"learning_rate": 4.4444444444444447e-05,
"loss": 0.0988,
"step": 211
},
{
"epoch": 1.9449541284403669,
"grad_norm": 8096.140625,
"learning_rate": 4.43939393939394e-05,
"loss": 0.0992,
"step": 212
},
{
"epoch": 1.9541284403669725,
"grad_norm": 7865.43798828125,
"learning_rate": 4.4343434343434346e-05,
"loss": 0.0945,
"step": 213
},
{
"epoch": 1.963302752293578,
"grad_norm": 6380.67724609375,
"learning_rate": 4.42929292929293e-05,
"loss": 0.1073,
"step": 214
},
{
"epoch": 1.9724770642201834,
"grad_norm": 8548.4345703125,
"learning_rate": 4.4242424242424246e-05,
"loss": 0.1121,
"step": 215
},
{
"epoch": 1.981651376146789,
"grad_norm": 8465.8466796875,
"learning_rate": 4.41919191919192e-05,
"loss": 0.1078,
"step": 216
},
{
"epoch": 1.9908256880733946,
"grad_norm": 8595.419921875,
"learning_rate": 4.4141414141414145e-05,
"loss": 0.0924,
"step": 217
},
{
"epoch": 2.0,
"grad_norm": 22229.833984375,
"learning_rate": 4.409090909090909e-05,
"loss": 0.0785,
"step": 218
},
{
"epoch": 2.0,
"eval_f1_macro": 0.0,
"eval_f1_micro": 0.0,
"eval_loss": 0.10520372539758682,
"eval_precision": 0.0,
"eval_recall": 0.0,
"eval_runtime": 0.8127,
"eval_samples_per_second": 355.611,
"eval_steps_per_second": 15.996,
"step": 218
},
{
"epoch": 2.0091743119266057,
"grad_norm": 8562.740234375,
"learning_rate": 4.4040404040404044e-05,
"loss": 0.1152,
"step": 219
},
{
"epoch": 2.018348623853211,
"grad_norm": 7770.8720703125,
"learning_rate": 4.398989898989899e-05,
"loss": 0.0957,
"step": 220
},
{
"epoch": 2.0275229357798166,
"grad_norm": 6170.166015625,
"learning_rate": 4.3939393939393944e-05,
"loss": 0.1027,
"step": 221
},
{
"epoch": 2.036697247706422,
"grad_norm": 10314.91796875,
"learning_rate": 4.388888888888889e-05,
"loss": 0.1166,
"step": 222
},
{
"epoch": 2.0458715596330275,
"grad_norm": 7144.62109375,
"learning_rate": 4.383838383838384e-05,
"loss": 0.1065,
"step": 223
},
{
"epoch": 2.055045871559633,
"grad_norm": 7398.2109375,
"learning_rate": 4.378787878787879e-05,
"loss": 0.1165,
"step": 224
},
{
"epoch": 2.0642201834862384,
"grad_norm": 7307.375,
"learning_rate": 4.3737373737373736e-05,
"loss": 0.0941,
"step": 225
},
{
"epoch": 2.073394495412844,
"grad_norm": 8046.33837890625,
"learning_rate": 4.368686868686869e-05,
"loss": 0.1092,
"step": 226
},
{
"epoch": 2.0825688073394497,
"grad_norm": 8320.74609375,
"learning_rate": 4.3636363636363636e-05,
"loss": 0.0983,
"step": 227
},
{
"epoch": 2.091743119266055,
"grad_norm": 10485.9384765625,
"learning_rate": 4.358585858585859e-05,
"loss": 0.0981,
"step": 228
},
{
"epoch": 2.1009174311926606,
"grad_norm": 7908.02734375,
"learning_rate": 4.3535353535353535e-05,
"loss": 0.1021,
"step": 229
},
{
"epoch": 2.1100917431192663,
"grad_norm": 7627.345703125,
"learning_rate": 4.348484848484849e-05,
"loss": 0.106,
"step": 230
},
{
"epoch": 2.1192660550458715,
"grad_norm": 7474.5263671875,
"learning_rate": 4.343434343434344e-05,
"loss": 0.1156,
"step": 231
},
{
"epoch": 2.128440366972477,
"grad_norm": 6541.48828125,
"learning_rate": 4.338383838383839e-05,
"loss": 0.0974,
"step": 232
},
{
"epoch": 2.1376146788990824,
"grad_norm": 10176.2197265625,
"learning_rate": 4.3333333333333334e-05,
"loss": 0.1172,
"step": 233
},
{
"epoch": 2.146788990825688,
"grad_norm": 10230.0478515625,
"learning_rate": 4.328282828282829e-05,
"loss": 0.1089,
"step": 234
},
{
"epoch": 2.1559633027522938,
"grad_norm": 7409.98583984375,
"learning_rate": 4.3232323232323234e-05,
"loss": 0.095,
"step": 235
},
{
"epoch": 2.165137614678899,
"grad_norm": 9656.2216796875,
"learning_rate": 4.318181818181819e-05,
"loss": 0.0862,
"step": 236
},
{
"epoch": 2.1743119266055047,
"grad_norm": 10004.51171875,
"learning_rate": 4.313131313131313e-05,
"loss": 0.1019,
"step": 237
},
{
"epoch": 2.18348623853211,
"grad_norm": 10041.599609375,
"learning_rate": 4.308080808080808e-05,
"loss": 0.0828,
"step": 238
},
{
"epoch": 2.1926605504587156,
"grad_norm": 10838.517578125,
"learning_rate": 4.303030303030303e-05,
"loss": 0.1093,
"step": 239
},
{
"epoch": 2.2018348623853212,
"grad_norm": 7099.97265625,
"learning_rate": 4.297979797979798e-05,
"loss": 0.0962,
"step": 240
},
{
"epoch": 2.2110091743119265,
"grad_norm": 8438.68359375,
"learning_rate": 4.292929292929293e-05,
"loss": 0.1095,
"step": 241
},
{
"epoch": 2.220183486238532,
"grad_norm": 7405.43359375,
"learning_rate": 4.287878787878788e-05,
"loss": 0.1035,
"step": 242
},
{
"epoch": 2.229357798165138,
"grad_norm": 9133.9833984375,
"learning_rate": 4.282828282828283e-05,
"loss": 0.0925,
"step": 243
},
{
"epoch": 2.238532110091743,
"grad_norm": 7998.142578125,
"learning_rate": 4.277777777777778e-05,
"loss": 0.0998,
"step": 244
},
{
"epoch": 2.2477064220183487,
"grad_norm": 9984.765625,
"learning_rate": 4.2727272727272724e-05,
"loss": 0.1027,
"step": 245
},
{
"epoch": 2.2568807339449544,
"grad_norm": 8999.0556640625,
"learning_rate": 4.267676767676768e-05,
"loss": 0.1177,
"step": 246
},
{
"epoch": 2.2660550458715596,
"grad_norm": 7668.86962890625,
"learning_rate": 4.262626262626263e-05,
"loss": 0.0946,
"step": 247
},
{
"epoch": 2.2752293577981653,
"grad_norm": 7349.47998046875,
"learning_rate": 4.257575757575758e-05,
"loss": 0.1066,
"step": 248
},
{
"epoch": 2.2844036697247705,
"grad_norm": 8360.69921875,
"learning_rate": 4.252525252525253e-05,
"loss": 0.1062,
"step": 249
},
{
"epoch": 2.293577981651376,
"grad_norm": 8717.70703125,
"learning_rate": 4.2474747474747476e-05,
"loss": 0.0907,
"step": 250
},
{
"epoch": 2.302752293577982,
"grad_norm": 8706.5791015625,
"learning_rate": 4.242424242424243e-05,
"loss": 0.0973,
"step": 251
},
{
"epoch": 2.311926605504587,
"grad_norm": 8500.1220703125,
"learning_rate": 4.2373737373737376e-05,
"loss": 0.1022,
"step": 252
},
{
"epoch": 2.3211009174311927,
"grad_norm": 10551.7021484375,
"learning_rate": 4.232323232323233e-05,
"loss": 0.0979,
"step": 253
},
{
"epoch": 2.330275229357798,
"grad_norm": 8305.2802734375,
"learning_rate": 4.2272727272727275e-05,
"loss": 0.0926,
"step": 254
},
{
"epoch": 2.3394495412844036,
"grad_norm": 10128.9423828125,
"learning_rate": 4.222222222222222e-05,
"loss": 0.1018,
"step": 255
},
{
"epoch": 2.3486238532110093,
"grad_norm": 9000.6689453125,
"learning_rate": 4.2171717171717175e-05,
"loss": 0.091,
"step": 256
},
{
"epoch": 2.3577981651376145,
"grad_norm": 8849.099609375,
"learning_rate": 4.212121212121212e-05,
"loss": 0.0993,
"step": 257
},
{
"epoch": 2.36697247706422,
"grad_norm": 6667.78564453125,
"learning_rate": 4.2070707070707074e-05,
"loss": 0.1044,
"step": 258
},
{
"epoch": 2.376146788990826,
"grad_norm": 7288.4970703125,
"learning_rate": 4.202020202020202e-05,
"loss": 0.0966,
"step": 259
},
{
"epoch": 2.385321100917431,
"grad_norm": 7155.8310546875,
"learning_rate": 4.196969696969697e-05,
"loss": 0.1018,
"step": 260
},
{
"epoch": 2.3944954128440368,
"grad_norm": 6328.56396484375,
"learning_rate": 4.191919191919192e-05,
"loss": 0.0862,
"step": 261
},
{
"epoch": 2.4036697247706424,
"grad_norm": 8594.3603515625,
"learning_rate": 4.1868686868686866e-05,
"loss": 0.1028,
"step": 262
},
{
"epoch": 2.4128440366972477,
"grad_norm": 8077.544921875,
"learning_rate": 4.181818181818182e-05,
"loss": 0.1044,
"step": 263
},
{
"epoch": 2.4220183486238533,
"grad_norm": 6332.14453125,
"learning_rate": 4.176767676767677e-05,
"loss": 0.0902,
"step": 264
},
{
"epoch": 2.4311926605504586,
"grad_norm": 7677.5009765625,
"learning_rate": 4.171717171717172e-05,
"loss": 0.098,
"step": 265
},
{
"epoch": 2.4403669724770642,
"grad_norm": 7953.89501953125,
"learning_rate": 4.166666666666667e-05,
"loss": 0.0809,
"step": 266
},
{
"epoch": 2.44954128440367,
"grad_norm": 6055.19287109375,
"learning_rate": 4.161616161616162e-05,
"loss": 0.0814,
"step": 267
},
{
"epoch": 2.458715596330275,
"grad_norm": 6427.98046875,
"learning_rate": 4.156565656565657e-05,
"loss": 0.0994,
"step": 268
},
{
"epoch": 2.467889908256881,
"grad_norm": 8880.2197265625,
"learning_rate": 4.151515151515152e-05,
"loss": 0.0804,
"step": 269
},
{
"epoch": 2.477064220183486,
"grad_norm": 7500.67578125,
"learning_rate": 4.1464646464646464e-05,
"loss": 0.0997,
"step": 270
},
{
"epoch": 2.4862385321100917,
"grad_norm": 7541.759765625,
"learning_rate": 4.141414141414142e-05,
"loss": 0.0901,
"step": 271
},
{
"epoch": 2.4954128440366974,
"grad_norm": 6423.1025390625,
"learning_rate": 4.1363636363636364e-05,
"loss": 0.089,
"step": 272
},
{
"epoch": 2.5045871559633026,
"grad_norm": 6384.140625,
"learning_rate": 4.131313131313132e-05,
"loss": 0.0848,
"step": 273
},
{
"epoch": 2.5137614678899083,
"grad_norm": 6418.369140625,
"learning_rate": 4.126262626262626e-05,
"loss": 0.0845,
"step": 274
},
{
"epoch": 2.522935779816514,
"grad_norm": 6441.12060546875,
"learning_rate": 4.1212121212121216e-05,
"loss": 0.0855,
"step": 275
},
{
"epoch": 2.532110091743119,
"grad_norm": 6891.9501953125,
"learning_rate": 4.116161616161616e-05,
"loss": 0.0798,
"step": 276
},
{
"epoch": 2.541284403669725,
"grad_norm": 8036.39306640625,
"learning_rate": 4.111111111111111e-05,
"loss": 0.1083,
"step": 277
},
{
"epoch": 2.5504587155963305,
"grad_norm": 7460.015625,
"learning_rate": 4.106060606060606e-05,
"loss": 0.0934,
"step": 278
},
{
"epoch": 2.5596330275229358,
"grad_norm": 7343.33154296875,
"learning_rate": 4.101010101010101e-05,
"loss": 0.0951,
"step": 279
},
{
"epoch": 2.5688073394495414,
"grad_norm": 9859.599609375,
"learning_rate": 4.095959595959596e-05,
"loss": 0.0849,
"step": 280
},
{
"epoch": 2.5779816513761467,
"grad_norm": 8394.8212890625,
"learning_rate": 4.0909090909090915e-05,
"loss": 0.079,
"step": 281
},
{
"epoch": 2.5871559633027523,
"grad_norm": 8307.279296875,
"learning_rate": 4.085858585858586e-05,
"loss": 0.0985,
"step": 282
},
{
"epoch": 2.5963302752293576,
"grad_norm": 7505.14794921875,
"learning_rate": 4.0808080808080814e-05,
"loss": 0.0932,
"step": 283
},
{
"epoch": 2.6055045871559632,
"grad_norm": 6129.16552734375,
"learning_rate": 4.075757575757576e-05,
"loss": 0.0797,
"step": 284
},
{
"epoch": 2.614678899082569,
"grad_norm": 6896.5908203125,
"learning_rate": 4.070707070707071e-05,
"loss": 0.0891,
"step": 285
},
{
"epoch": 2.623853211009174,
"grad_norm": 8686.771484375,
"learning_rate": 4.065656565656566e-05,
"loss": 0.0838,
"step": 286
},
{
"epoch": 2.63302752293578,
"grad_norm": 6954.4072265625,
"learning_rate": 4.0606060606060606e-05,
"loss": 0.0976,
"step": 287
},
{
"epoch": 2.6422018348623855,
"grad_norm": 8336.8720703125,
"learning_rate": 4.055555555555556e-05,
"loss": 0.1339,
"step": 288
},
{
"epoch": 2.6513761467889907,
"grad_norm": 6729.21044921875,
"learning_rate": 4.0505050505050506e-05,
"loss": 0.0897,
"step": 289
},
{
"epoch": 2.6605504587155964,
"grad_norm": 8650.7490234375,
"learning_rate": 4.045454545454546e-05,
"loss": 0.1034,
"step": 290
},
{
"epoch": 2.669724770642202,
"grad_norm": 8123.34228515625,
"learning_rate": 4.0404040404040405e-05,
"loss": 0.0752,
"step": 291
},
{
"epoch": 2.6788990825688073,
"grad_norm": 7077.0859375,
"learning_rate": 4.035353535353535e-05,
"loss": 0.0889,
"step": 292
},
{
"epoch": 2.688073394495413,
"grad_norm": 8160.107421875,
"learning_rate": 4.0303030303030305e-05,
"loss": 0.1067,
"step": 293
},
{
"epoch": 2.6972477064220186,
"grad_norm": 11080.52734375,
"learning_rate": 4.025252525252525e-05,
"loss": 0.0984,
"step": 294
},
{
"epoch": 2.706422018348624,
"grad_norm": 8655.8173828125,
"learning_rate": 4.0202020202020204e-05,
"loss": 0.0977,
"step": 295
},
{
"epoch": 2.7155963302752295,
"grad_norm": 9417.9384765625,
"learning_rate": 4.015151515151515e-05,
"loss": 0.0863,
"step": 296
},
{
"epoch": 2.7247706422018347,
"grad_norm": 7852.45361328125,
"learning_rate": 4.01010101010101e-05,
"loss": 0.0865,
"step": 297
},
{
"epoch": 2.7339449541284404,
"grad_norm": 8051.74609375,
"learning_rate": 4.005050505050506e-05,
"loss": 0.11,
"step": 298
},
{
"epoch": 2.7431192660550456,
"grad_norm": 7042.64013671875,
"learning_rate": 4e-05,
"loss": 0.0805,
"step": 299
},
{
"epoch": 2.7522935779816513,
"grad_norm": 7486.0947265625,
"learning_rate": 3.9949494949494956e-05,
"loss": 0.0999,
"step": 300
},
{
"epoch": 2.761467889908257,
"grad_norm": 8937.4580078125,
"learning_rate": 3.98989898989899e-05,
"loss": 0.0936,
"step": 301
},
{
"epoch": 2.770642201834862,
"grad_norm": 7193.6708984375,
"learning_rate": 3.984848484848485e-05,
"loss": 0.0743,
"step": 302
},
{
"epoch": 2.779816513761468,
"grad_norm": 7543.75341796875,
"learning_rate": 3.97979797979798e-05,
"loss": 0.0994,
"step": 303
},
{
"epoch": 2.7889908256880735,
"grad_norm": 8849.693359375,
"learning_rate": 3.974747474747475e-05,
"loss": 0.0891,
"step": 304
},
{
"epoch": 2.7981651376146788,
"grad_norm": 6587.58837890625,
"learning_rate": 3.96969696969697e-05,
"loss": 0.0921,
"step": 305
},
{
"epoch": 2.8073394495412844,
"grad_norm": 7597.34521484375,
"learning_rate": 3.964646464646465e-05,
"loss": 0.079,
"step": 306
},
{
"epoch": 2.81651376146789,
"grad_norm": 8688.705078125,
"learning_rate": 3.9595959595959594e-05,
"loss": 0.1014,
"step": 307
},
{
"epoch": 2.8256880733944953,
"grad_norm": 14665.2021484375,
"learning_rate": 3.954545454545455e-05,
"loss": 0.1354,
"step": 308
},
{
"epoch": 2.834862385321101,
"grad_norm": 10845.7890625,
"learning_rate": 3.9494949494949494e-05,
"loss": 0.1076,
"step": 309
},
{
"epoch": 2.8440366972477067,
"grad_norm": 10935.6669921875,
"learning_rate": 3.944444444444445e-05,
"loss": 0.1054,
"step": 310
},
{
"epoch": 2.853211009174312,
"grad_norm": 11541.736328125,
"learning_rate": 3.939393939393939e-05,
"loss": 0.0943,
"step": 311
},
{
"epoch": 2.8623853211009176,
"grad_norm": 6934.3125,
"learning_rate": 3.9343434343434346e-05,
"loss": 0.1018,
"step": 312
},
{
"epoch": 2.871559633027523,
"grad_norm": 8040.87939453125,
"learning_rate": 3.929292929292929e-05,
"loss": 0.0926,
"step": 313
},
{
"epoch": 2.8807339449541285,
"grad_norm": 6876.23876953125,
"learning_rate": 3.924242424242424e-05,
"loss": 0.0806,
"step": 314
},
{
"epoch": 2.8899082568807337,
"grad_norm": 8227.6435546875,
"learning_rate": 3.91919191919192e-05,
"loss": 0.0888,
"step": 315
},
{
"epoch": 2.8990825688073394,
"grad_norm": 9191.8701171875,
"learning_rate": 3.9141414141414145e-05,
"loss": 0.0982,
"step": 316
},
{
"epoch": 2.908256880733945,
"grad_norm": 8231.9013671875,
"learning_rate": 3.909090909090909e-05,
"loss": 0.0851,
"step": 317
},
{
"epoch": 2.9174311926605503,
"grad_norm": 8839.322265625,
"learning_rate": 3.9040404040404045e-05,
"loss": 0.0757,
"step": 318
},
{
"epoch": 2.926605504587156,
"grad_norm": 8270.8525390625,
"learning_rate": 3.898989898989899e-05,
"loss": 0.1015,
"step": 319
},
{
"epoch": 2.9357798165137616,
"grad_norm": 12873.2529296875,
"learning_rate": 3.8939393939393944e-05,
"loss": 0.0871,
"step": 320
},
{
"epoch": 2.944954128440367,
"grad_norm": 7598.33837890625,
"learning_rate": 3.888888888888889e-05,
"loss": 0.0884,
"step": 321
},
{
"epoch": 2.9541284403669725,
"grad_norm": 8909.1279296875,
"learning_rate": 3.8838383838383844e-05,
"loss": 0.102,
"step": 322
},
{
"epoch": 2.963302752293578,
"grad_norm": 9611.2451171875,
"learning_rate": 3.878787878787879e-05,
"loss": 0.094,
"step": 323
},
{
"epoch": 2.9724770642201834,
"grad_norm": 8551.978515625,
"learning_rate": 3.8737373737373737e-05,
"loss": 0.0857,
"step": 324
},
{
"epoch": 2.981651376146789,
"grad_norm": 7810.45703125,
"learning_rate": 3.868686868686869e-05,
"loss": 0.0854,
"step": 325
},
{
"epoch": 2.9908256880733948,
"grad_norm": 8891.1826171875,
"learning_rate": 3.8636363636363636e-05,
"loss": 0.101,
"step": 326
},
{
"epoch": 3.0,
"grad_norm": 16918.001953125,
"learning_rate": 3.858585858585859e-05,
"loss": 0.0625,
"step": 327
},
{
"epoch": 3.0,
"eval_f1_macro": 0.05806976361668135,
"eval_f1_micro": 0.37575757575757573,
"eval_loss": 0.0906587690114975,
"eval_precision": 0.7380952380952381,
"eval_recall": 0.25203252032520324,
"eval_runtime": 0.8046,
"eval_samples_per_second": 359.182,
"eval_steps_per_second": 16.157,
"step": 327
},
{
"epoch": 3.0091743119266057,
"grad_norm": 8289.6953125,
"learning_rate": 3.8535353535353536e-05,
"loss": 0.0924,
"step": 328
},
{
"epoch": 3.018348623853211,
"grad_norm": 13682.880859375,
"learning_rate": 3.848484848484848e-05,
"loss": 0.0939,
"step": 329
},
{
"epoch": 3.0275229357798166,
"grad_norm": 8464.88671875,
"learning_rate": 3.8434343434343435e-05,
"loss": 0.0784,
"step": 330
},
{
"epoch": 3.036697247706422,
"grad_norm": 8201.7705078125,
"learning_rate": 3.838383838383838e-05,
"loss": 0.0749,
"step": 331
},
{
"epoch": 3.0458715596330275,
"grad_norm": 9458.033203125,
"learning_rate": 3.8333333333333334e-05,
"loss": 0.1053,
"step": 332
},
{
"epoch": 3.055045871559633,
"grad_norm": 10062.501953125,
"learning_rate": 3.828282828282829e-05,
"loss": 0.0665,
"step": 333
},
{
"epoch": 3.0642201834862384,
"grad_norm": 7707.68359375,
"learning_rate": 3.8232323232323234e-05,
"loss": 0.0785,
"step": 334
},
{
"epoch": 3.073394495412844,
"grad_norm": 12023.5595703125,
"learning_rate": 3.818181818181819e-05,
"loss": 0.0952,
"step": 335
},
{
"epoch": 3.0825688073394497,
"grad_norm": 6632.79931640625,
"learning_rate": 3.8131313131313133e-05,
"loss": 0.0861,
"step": 336
},
{
"epoch": 3.091743119266055,
"grad_norm": 7961.10400390625,
"learning_rate": 3.8080808080808087e-05,
"loss": 0.0942,
"step": 337
},
{
"epoch": 3.1009174311926606,
"grad_norm": 7544.1826171875,
"learning_rate": 3.803030303030303e-05,
"loss": 0.0829,
"step": 338
},
{
"epoch": 3.1100917431192663,
"grad_norm": 12524.0947265625,
"learning_rate": 3.797979797979798e-05,
"loss": 0.0895,
"step": 339
},
{
"epoch": 3.1192660550458715,
"grad_norm": 15579.77734375,
"learning_rate": 3.792929292929293e-05,
"loss": 0.1132,
"step": 340
},
{
"epoch": 3.128440366972477,
"grad_norm": 7564.0458984375,
"learning_rate": 3.787878787878788e-05,
"loss": 0.0909,
"step": 341
},
{
"epoch": 3.1376146788990824,
"grad_norm": 8664.2548828125,
"learning_rate": 3.782828282828283e-05,
"loss": 0.0784,
"step": 342
},
{
"epoch": 3.146788990825688,
"grad_norm": 8917.251953125,
"learning_rate": 3.777777777777778e-05,
"loss": 0.0817,
"step": 343
},
{
"epoch": 3.1559633027522938,
"grad_norm": 8527.2666015625,
"learning_rate": 3.7727272727272725e-05,
"loss": 0.0852,
"step": 344
},
{
"epoch": 3.165137614678899,
"grad_norm": 7334.591796875,
"learning_rate": 3.767676767676768e-05,
"loss": 0.079,
"step": 345
},
{
"epoch": 3.1743119266055047,
"grad_norm": 7469.1240234375,
"learning_rate": 3.7626262626262624e-05,
"loss": 0.0742,
"step": 346
},
{
"epoch": 3.18348623853211,
"grad_norm": 10801.150390625,
"learning_rate": 3.757575757575758e-05,
"loss": 0.0915,
"step": 347
},
{
"epoch": 3.1926605504587156,
"grad_norm": 7579.3603515625,
"learning_rate": 3.7525252525252524e-05,
"loss": 0.0881,
"step": 348
},
{
"epoch": 3.2018348623853212,
"grad_norm": 9901.044921875,
"learning_rate": 3.747474747474748e-05,
"loss": 0.0823,
"step": 349
},
{
"epoch": 3.2110091743119265,
"grad_norm": 10092.9248046875,
"learning_rate": 3.742424242424243e-05,
"loss": 0.0788,
"step": 350
},
{
"epoch": 3.220183486238532,
"grad_norm": 7686.80859375,
"learning_rate": 3.7373737373737376e-05,
"loss": 0.0828,
"step": 351
},
{
"epoch": 3.229357798165138,
"grad_norm": 7244.4970703125,
"learning_rate": 3.732323232323233e-05,
"loss": 0.0943,
"step": 352
},
{
"epoch": 3.238532110091743,
"grad_norm": 7361.87353515625,
"learning_rate": 3.7272727272727276e-05,
"loss": 0.0811,
"step": 353
},
{
"epoch": 3.2477064220183487,
"grad_norm": 9312.623046875,
"learning_rate": 3.722222222222222e-05,
"loss": 0.0843,
"step": 354
},
{
"epoch": 3.2568807339449544,
"grad_norm": 9429.6171875,
"learning_rate": 3.7171717171717175e-05,
"loss": 0.0856,
"step": 355
},
{
"epoch": 3.2660550458715596,
"grad_norm": 7771.47705078125,
"learning_rate": 3.712121212121212e-05,
"loss": 0.0815,
"step": 356
},
{
"epoch": 3.2752293577981653,
"grad_norm": 6975.99609375,
"learning_rate": 3.7070707070707075e-05,
"loss": 0.0842,
"step": 357
},
{
"epoch": 3.2844036697247705,
"grad_norm": 9537.3935546875,
"learning_rate": 3.702020202020202e-05,
"loss": 0.0913,
"step": 358
},
{
"epoch": 3.293577981651376,
"grad_norm": 8734.8681640625,
"learning_rate": 3.6969696969696974e-05,
"loss": 0.0777,
"step": 359
},
{
"epoch": 3.302752293577982,
"grad_norm": 9178.66015625,
"learning_rate": 3.691919191919192e-05,
"loss": 0.0833,
"step": 360
},
{
"epoch": 3.311926605504587,
"grad_norm": 10208.962890625,
"learning_rate": 3.686868686868687e-05,
"loss": 0.0715,
"step": 361
},
{
"epoch": 3.3211009174311927,
"grad_norm": 6356.3134765625,
"learning_rate": 3.681818181818182e-05,
"loss": 0.0803,
"step": 362
},
{
"epoch": 3.330275229357798,
"grad_norm": 11730.859375,
"learning_rate": 3.6767676767676766e-05,
"loss": 0.0782,
"step": 363
},
{
"epoch": 3.3394495412844036,
"grad_norm": 7810.03369140625,
"learning_rate": 3.671717171717172e-05,
"loss": 0.0733,
"step": 364
},
{
"epoch": 3.3486238532110093,
"grad_norm": 8370.4697265625,
"learning_rate": 3.6666666666666666e-05,
"loss": 0.0786,
"step": 365
},
{
"epoch": 3.3577981651376145,
"grad_norm": 8111.5361328125,
"learning_rate": 3.661616161616162e-05,
"loss": 0.0894,
"step": 366
},
{
"epoch": 3.36697247706422,
"grad_norm": 8765.7822265625,
"learning_rate": 3.656565656565657e-05,
"loss": 0.0904,
"step": 367
},
{
"epoch": 3.376146788990826,
"grad_norm": 18947.57421875,
"learning_rate": 3.651515151515152e-05,
"loss": 0.1098,
"step": 368
},
{
"epoch": 3.385321100917431,
"grad_norm": 7882.3369140625,
"learning_rate": 3.6464646464646465e-05,
"loss": 0.0892,
"step": 369
},
{
"epoch": 3.3944954128440368,
"grad_norm": 7774.26025390625,
"learning_rate": 3.641414141414142e-05,
"loss": 0.0775,
"step": 370
},
{
"epoch": 3.4036697247706424,
"grad_norm": 7653.77880859375,
"learning_rate": 3.6363636363636364e-05,
"loss": 0.0732,
"step": 371
},
{
"epoch": 3.4128440366972477,
"grad_norm": 7157.17529296875,
"learning_rate": 3.631313131313132e-05,
"loss": 0.0787,
"step": 372
},
{
"epoch": 3.4220183486238533,
"grad_norm": 8637.5361328125,
"learning_rate": 3.6262626262626264e-05,
"loss": 0.0811,
"step": 373
},
{
"epoch": 3.4311926605504586,
"grad_norm": 9387.2333984375,
"learning_rate": 3.621212121212122e-05,
"loss": 0.0914,
"step": 374
},
{
"epoch": 3.4403669724770642,
"grad_norm": 6949.630859375,
"learning_rate": 3.616161616161616e-05,
"loss": 0.0693,
"step": 375
},
{
"epoch": 3.44954128440367,
"grad_norm": 9442.3046875,
"learning_rate": 3.611111111111111e-05,
"loss": 0.0833,
"step": 376
},
{
"epoch": 3.458715596330275,
"grad_norm": 10019.27734375,
"learning_rate": 3.606060606060606e-05,
"loss": 0.0941,
"step": 377
},
{
"epoch": 3.467889908256881,
"grad_norm": 8413.140625,
"learning_rate": 3.601010101010101e-05,
"loss": 0.0788,
"step": 378
},
{
"epoch": 3.477064220183486,
"grad_norm": 8428.1181640625,
"learning_rate": 3.595959595959596e-05,
"loss": 0.0999,
"step": 379
},
{
"epoch": 3.4862385321100917,
"grad_norm": 8074.10400390625,
"learning_rate": 3.590909090909091e-05,
"loss": 0.0803,
"step": 380
},
{
"epoch": 3.4954128440366974,
"grad_norm": 9101.3134765625,
"learning_rate": 3.5858585858585855e-05,
"loss": 0.0679,
"step": 381
},
{
"epoch": 3.5045871559633026,
"grad_norm": 7441.87255859375,
"learning_rate": 3.580808080808081e-05,
"loss": 0.0721,
"step": 382
},
{
"epoch": 3.5137614678899083,
"grad_norm": 7897.45458984375,
"learning_rate": 3.575757575757576e-05,
"loss": 0.0905,
"step": 383
},
{
"epoch": 3.522935779816514,
"grad_norm": 8986.716796875,
"learning_rate": 3.5707070707070714e-05,
"loss": 0.0715,
"step": 384
},
{
"epoch": 3.532110091743119,
"grad_norm": 6766.3271484375,
"learning_rate": 3.565656565656566e-05,
"loss": 0.0647,
"step": 385
},
{
"epoch": 3.541284403669725,
"grad_norm": 9427.990234375,
"learning_rate": 3.560606060606061e-05,
"loss": 0.0822,
"step": 386
},
{
"epoch": 3.5504587155963305,
"grad_norm": 10683.732421875,
"learning_rate": 3.555555555555556e-05,
"loss": 0.0738,
"step": 387
},
{
"epoch": 3.5596330275229358,
"grad_norm": 14099.65625,
"learning_rate": 3.5505050505050506e-05,
"loss": 0.0832,
"step": 388
},
{
"epoch": 3.5688073394495414,
"grad_norm": 10605.873046875,
"learning_rate": 3.545454545454546e-05,
"loss": 0.0621,
"step": 389
},
{
"epoch": 3.5779816513761467,
"grad_norm": 7646.0703125,
"learning_rate": 3.5404040404040406e-05,
"loss": 0.0759,
"step": 390
},
{
"epoch": 3.5871559633027523,
"grad_norm": 12968.7607421875,
"learning_rate": 3.535353535353535e-05,
"loss": 0.0724,
"step": 391
},
{
"epoch": 3.5963302752293576,
"grad_norm": 11772.3857421875,
"learning_rate": 3.5303030303030305e-05,
"loss": 0.0881,
"step": 392
},
{
"epoch": 3.6055045871559632,
"grad_norm": 7031.99755859375,
"learning_rate": 3.525252525252525e-05,
"loss": 0.0636,
"step": 393
},
{
"epoch": 3.614678899082569,
"grad_norm": 8371.5205078125,
"learning_rate": 3.5202020202020205e-05,
"loss": 0.0686,
"step": 394
},
{
"epoch": 3.623853211009174,
"grad_norm": 7546.57666015625,
"learning_rate": 3.515151515151515e-05,
"loss": 0.0715,
"step": 395
},
{
"epoch": 3.63302752293578,
"grad_norm": 9094.708984375,
"learning_rate": 3.5101010101010104e-05,
"loss": 0.0778,
"step": 396
},
{
"epoch": 3.6422018348623855,
"grad_norm": 8982.2861328125,
"learning_rate": 3.505050505050505e-05,
"loss": 0.0724,
"step": 397
},
{
"epoch": 3.6513761467889907,
"grad_norm": 10044.265625,
"learning_rate": 3.5e-05,
"loss": 0.1019,
"step": 398
},
{
"epoch": 3.6605504587155964,
"grad_norm": 10931.083984375,
"learning_rate": 3.494949494949495e-05,
"loss": 0.0732,
"step": 399
},
{
"epoch": 3.669724770642202,
"grad_norm": 10163.0771484375,
"learning_rate": 3.48989898989899e-05,
"loss": 0.0899,
"step": 400
},
{
"epoch": 3.6788990825688073,
"grad_norm": 7464.64111328125,
"learning_rate": 3.484848484848485e-05,
"loss": 0.0802,
"step": 401
},
{
"epoch": 3.688073394495413,
"grad_norm": 12514.2763671875,
"learning_rate": 3.47979797979798e-05,
"loss": 0.0773,
"step": 402
},
{
"epoch": 3.6972477064220186,
"grad_norm": 9357.0361328125,
"learning_rate": 3.474747474747475e-05,
"loss": 0.0916,
"step": 403
},
{
"epoch": 3.706422018348624,
"grad_norm": 9900.4990234375,
"learning_rate": 3.46969696969697e-05,
"loss": 0.0679,
"step": 404
},
{
"epoch": 3.7155963302752295,
"grad_norm": 7504.73876953125,
"learning_rate": 3.464646464646465e-05,
"loss": 0.0646,
"step": 405
},
{
"epoch": 3.7247706422018347,
"grad_norm": 9507.84765625,
"learning_rate": 3.45959595959596e-05,
"loss": 0.0855,
"step": 406
},
{
"epoch": 3.7339449541284404,
"grad_norm": 7167.67041015625,
"learning_rate": 3.454545454545455e-05,
"loss": 0.0727,
"step": 407
},
{
"epoch": 3.7431192660550456,
"grad_norm": 8691.91796875,
"learning_rate": 3.4494949494949494e-05,
"loss": 0.0809,
"step": 408
},
{
"epoch": 3.7522935779816513,
"grad_norm": 7374.8330078125,
"learning_rate": 3.444444444444445e-05,
"loss": 0.0763,
"step": 409
},
{
"epoch": 3.761467889908257,
"grad_norm": 6899.1455078125,
"learning_rate": 3.4393939393939394e-05,
"loss": 0.0699,
"step": 410
},
{
"epoch": 3.770642201834862,
"grad_norm": 8236.75,
"learning_rate": 3.434343434343435e-05,
"loss": 0.0773,
"step": 411
},
{
"epoch": 3.779816513761468,
"grad_norm": 7490.56787109375,
"learning_rate": 3.429292929292929e-05,
"loss": 0.0705,
"step": 412
},
{
"epoch": 3.7889908256880735,
"grad_norm": 7241.3564453125,
"learning_rate": 3.424242424242424e-05,
"loss": 0.0636,
"step": 413
},
{
"epoch": 3.7981651376146788,
"grad_norm": 7502.1787109375,
"learning_rate": 3.419191919191919e-05,
"loss": 0.069,
"step": 414
},
{
"epoch": 3.8073394495412844,
"grad_norm": 10924.54296875,
"learning_rate": 3.414141414141414e-05,
"loss": 0.09,
"step": 415
},
{
"epoch": 3.81651376146789,
"grad_norm": 9266.40625,
"learning_rate": 3.409090909090909e-05,
"loss": 0.073,
"step": 416
},
{
"epoch": 3.8256880733944953,
"grad_norm": 10273.2900390625,
"learning_rate": 3.4040404040404045e-05,
"loss": 0.0951,
"step": 417
},
{
"epoch": 3.834862385321101,
"grad_norm": 6867.47802734375,
"learning_rate": 3.398989898989899e-05,
"loss": 0.0702,
"step": 418
},
{
"epoch": 3.8440366972477067,
"grad_norm": 8106.5517578125,
"learning_rate": 3.3939393939393945e-05,
"loss": 0.075,
"step": 419
},
{
"epoch": 3.853211009174312,
"grad_norm": 7965.34033203125,
"learning_rate": 3.388888888888889e-05,
"loss": 0.0695,
"step": 420
},
{
"epoch": 3.8623853211009176,
"grad_norm": 8276.9873046875,
"learning_rate": 3.3838383838383844e-05,
"loss": 0.0837,
"step": 421
},
{
"epoch": 3.871559633027523,
"grad_norm": 13724.3798828125,
"learning_rate": 3.378787878787879e-05,
"loss": 0.0807,
"step": 422
},
{
"epoch": 3.8807339449541285,
"grad_norm": 8130.3525390625,
"learning_rate": 3.373737373737374e-05,
"loss": 0.0793,
"step": 423
},
{
"epoch": 3.8899082568807337,
"grad_norm": 8378.513671875,
"learning_rate": 3.368686868686869e-05,
"loss": 0.0669,
"step": 424
},
{
"epoch": 3.8990825688073394,
"grad_norm": 7662.083984375,
"learning_rate": 3.3636363636363636e-05,
"loss": 0.0755,
"step": 425
},
{
"epoch": 3.908256880733945,
"grad_norm": 8806.6005859375,
"learning_rate": 3.358585858585859e-05,
"loss": 0.0735,
"step": 426
},
{
"epoch": 3.9174311926605503,
"grad_norm": 8842.380859375,
"learning_rate": 3.3535353535353536e-05,
"loss": 0.0666,
"step": 427
},
{
"epoch": 3.926605504587156,
"grad_norm": 8317.888671875,
"learning_rate": 3.348484848484848e-05,
"loss": 0.0848,
"step": 428
},
{
"epoch": 3.9357798165137616,
"grad_norm": 8649.2880859375,
"learning_rate": 3.3434343434343435e-05,
"loss": 0.0843,
"step": 429
},
{
"epoch": 3.944954128440367,
"grad_norm": 10749.7431640625,
"learning_rate": 3.338383838383838e-05,
"loss": 0.0915,
"step": 430
},
{
"epoch": 3.9541284403669725,
"grad_norm": 11217.1533203125,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.0833,
"step": 431
},
{
"epoch": 3.963302752293578,
"grad_norm": 7443.095703125,
"learning_rate": 3.328282828282828e-05,
"loss": 0.0763,
"step": 432
},
{
"epoch": 3.9724770642201834,
"grad_norm": 7243.30859375,
"learning_rate": 3.3232323232323234e-05,
"loss": 0.068,
"step": 433
},
{
"epoch": 3.981651376146789,
"grad_norm": 7107.99658203125,
"learning_rate": 3.318181818181819e-05,
"loss": 0.0749,
"step": 434
},
{
"epoch": 3.9908256880733948,
"grad_norm": 12792.3837890625,
"learning_rate": 3.3131313131313134e-05,
"loss": 0.0799,
"step": 435
},
{
"epoch": 4.0,
"grad_norm": 16352.5830078125,
"learning_rate": 3.308080808080809e-05,
"loss": 0.0644,
"step": 436
},
{
"epoch": 4.0,
"eval_f1_macro": 0.12554329514843507,
"eval_f1_micro": 0.4918032786885246,
"eval_loss": 0.07779007405042648,
"eval_precision": 0.75,
"eval_recall": 0.36585365853658536,
"eval_runtime": 0.8052,
"eval_samples_per_second": 358.932,
"eval_steps_per_second": 16.146,
"step": 436
},
{
"epoch": 4.009174311926605,
"grad_norm": 8458.75,
"learning_rate": 3.303030303030303e-05,
"loss": 0.0692,
"step": 437
},
{
"epoch": 4.018348623853211,
"grad_norm": 13337.8359375,
"learning_rate": 3.297979797979798e-05,
"loss": 0.0755,
"step": 438
},
{
"epoch": 4.027522935779817,
"grad_norm": 8051.86865234375,
"learning_rate": 3.292929292929293e-05,
"loss": 0.0578,
"step": 439
},
{
"epoch": 4.036697247706422,
"grad_norm": 10501.3466796875,
"learning_rate": 3.287878787878788e-05,
"loss": 0.0765,
"step": 440
},
{
"epoch": 4.045871559633028,
"grad_norm": 8340.0751953125,
"learning_rate": 3.282828282828283e-05,
"loss": 0.0625,
"step": 441
},
{
"epoch": 4.055045871559633,
"grad_norm": 7503.3642578125,
"learning_rate": 3.277777777777778e-05,
"loss": 0.0677,
"step": 442
},
{
"epoch": 4.064220183486238,
"grad_norm": 6723.236328125,
"learning_rate": 3.272727272727273e-05,
"loss": 0.0635,
"step": 443
},
{
"epoch": 4.073394495412844,
"grad_norm": 7500.09033203125,
"learning_rate": 3.267676767676768e-05,
"loss": 0.0745,
"step": 444
},
{
"epoch": 4.08256880733945,
"grad_norm": 7736.67333984375,
"learning_rate": 3.2626262626262624e-05,
"loss": 0.0644,
"step": 445
},
{
"epoch": 4.091743119266055,
"grad_norm": 11797.1162109375,
"learning_rate": 3.257575757575758e-05,
"loss": 0.0781,
"step": 446
},
{
"epoch": 4.10091743119266,
"grad_norm": 6644.9296875,
"learning_rate": 3.2525252525252524e-05,
"loss": 0.0583,
"step": 447
},
{
"epoch": 4.110091743119266,
"grad_norm": 6017.3408203125,
"learning_rate": 3.247474747474748e-05,
"loss": 0.0584,
"step": 448
},
{
"epoch": 4.1192660550458715,
"grad_norm": 7825.36865234375,
"learning_rate": 3.2424242424242423e-05,
"loss": 0.0539,
"step": 449
},
{
"epoch": 4.128440366972477,
"grad_norm": 9209.607421875,
"learning_rate": 3.237373737373737e-05,
"loss": 0.0661,
"step": 450
},
{
"epoch": 4.137614678899083,
"grad_norm": 7857.53271484375,
"learning_rate": 3.232323232323233e-05,
"loss": 0.0696,
"step": 451
},
{
"epoch": 4.146788990825688,
"grad_norm": 7992.9599609375,
"learning_rate": 3.2272727272727276e-05,
"loss": 0.06,
"step": 452
},
{
"epoch": 4.155963302752293,
"grad_norm": 8909.8310546875,
"learning_rate": 3.222222222222223e-05,
"loss": 0.0661,
"step": 453
},
{
"epoch": 4.165137614678899,
"grad_norm": 6850.833984375,
"learning_rate": 3.2171717171717176e-05,
"loss": 0.0686,
"step": 454
},
{
"epoch": 4.174311926605505,
"grad_norm": 6964.1201171875,
"learning_rate": 3.212121212121212e-05,
"loss": 0.0625,
"step": 455
},
{
"epoch": 4.18348623853211,
"grad_norm": 7593.55615234375,
"learning_rate": 3.2070707070707075e-05,
"loss": 0.0694,
"step": 456
},
{
"epoch": 4.192660550458716,
"grad_norm": 9576.828125,
"learning_rate": 3.202020202020202e-05,
"loss": 0.075,
"step": 457
},
{
"epoch": 4.201834862385321,
"grad_norm": 8686.0986328125,
"learning_rate": 3.1969696969696974e-05,
"loss": 0.0758,
"step": 458
},
{
"epoch": 4.2110091743119265,
"grad_norm": 8695.9306640625,
"learning_rate": 3.191919191919192e-05,
"loss": 0.0447,
"step": 459
},
{
"epoch": 4.220183486238533,
"grad_norm": 9451.28125,
"learning_rate": 3.186868686868687e-05,
"loss": 0.0538,
"step": 460
},
{
"epoch": 4.229357798165138,
"grad_norm": 6842.013671875,
"learning_rate": 3.181818181818182e-05,
"loss": 0.0815,
"step": 461
},
{
"epoch": 4.238532110091743,
"grad_norm": 6817.18994140625,
"learning_rate": 3.176767676767677e-05,
"loss": 0.0576,
"step": 462
},
{
"epoch": 4.247706422018348,
"grad_norm": 7361.92578125,
"learning_rate": 3.171717171717172e-05,
"loss": 0.0629,
"step": 463
},
{
"epoch": 4.256880733944954,
"grad_norm": 9081.8603515625,
"learning_rate": 3.1666666666666666e-05,
"loss": 0.0671,
"step": 464
},
{
"epoch": 4.26605504587156,
"grad_norm": 9699.37109375,
"learning_rate": 3.161616161616161e-05,
"loss": 0.0743,
"step": 465
},
{
"epoch": 4.275229357798165,
"grad_norm": 9937.20703125,
"learning_rate": 3.1565656565656566e-05,
"loss": 0.076,
"step": 466
},
{
"epoch": 4.284403669724771,
"grad_norm": 12852.361328125,
"learning_rate": 3.151515151515151e-05,
"loss": 0.0838,
"step": 467
},
{
"epoch": 4.293577981651376,
"grad_norm": 10356.1337890625,
"learning_rate": 3.146464646464647e-05,
"loss": 0.0727,
"step": 468
},
{
"epoch": 4.302752293577981,
"grad_norm": 13259.0107421875,
"learning_rate": 3.141414141414142e-05,
"loss": 0.0609,
"step": 469
},
{
"epoch": 4.3119266055045875,
"grad_norm": 9083.2255859375,
"learning_rate": 3.1363636363636365e-05,
"loss": 0.0731,
"step": 470
},
{
"epoch": 4.321100917431193,
"grad_norm": 9072.19921875,
"learning_rate": 3.131313131313132e-05,
"loss": 0.0634,
"step": 471
},
{
"epoch": 4.330275229357798,
"grad_norm": 7686.09765625,
"learning_rate": 3.1262626262626264e-05,
"loss": 0.0671,
"step": 472
},
{
"epoch": 4.339449541284404,
"grad_norm": 6557.51953125,
"learning_rate": 3.121212121212122e-05,
"loss": 0.0575,
"step": 473
},
{
"epoch": 4.348623853211009,
"grad_norm": 7452.078125,
"learning_rate": 3.1161616161616164e-05,
"loss": 0.0616,
"step": 474
},
{
"epoch": 4.3577981651376145,
"grad_norm": 10433.482421875,
"learning_rate": 3.111111111111111e-05,
"loss": 0.0807,
"step": 475
},
{
"epoch": 4.36697247706422,
"grad_norm": 7201.25927734375,
"learning_rate": 3.106060606060606e-05,
"loss": 0.0814,
"step": 476
},
{
"epoch": 4.376146788990826,
"grad_norm": 8066.92333984375,
"learning_rate": 3.101010101010101e-05,
"loss": 0.0615,
"step": 477
},
{
"epoch": 4.385321100917431,
"grad_norm": 10633.607421875,
"learning_rate": 3.095959595959596e-05,
"loss": 0.0732,
"step": 478
},
{
"epoch": 4.394495412844036,
"grad_norm": 12350.8046875,
"learning_rate": 3.090909090909091e-05,
"loss": 0.0784,
"step": 479
},
{
"epoch": 4.4036697247706424,
"grad_norm": 10261.6748046875,
"learning_rate": 3.085858585858586e-05,
"loss": 0.0643,
"step": 480
},
{
"epoch": 4.412844036697248,
"grad_norm": 19839.958984375,
"learning_rate": 3.080808080808081e-05,
"loss": 0.0885,
"step": 481
},
{
"epoch": 4.422018348623853,
"grad_norm": 7739.05615234375,
"learning_rate": 3.0757575757575755e-05,
"loss": 0.0529,
"step": 482
},
{
"epoch": 4.431192660550459,
"grad_norm": 7615.5048828125,
"learning_rate": 3.070707070707071e-05,
"loss": 0.0663,
"step": 483
},
{
"epoch": 4.440366972477064,
"grad_norm": 11299.6220703125,
"learning_rate": 3.0656565656565654e-05,
"loss": 0.0475,
"step": 484
},
{
"epoch": 4.4495412844036695,
"grad_norm": 11667.0,
"learning_rate": 3.060606060606061e-05,
"loss": 0.0598,
"step": 485
},
{
"epoch": 4.458715596330276,
"grad_norm": 14411.435546875,
"learning_rate": 3.055555555555556e-05,
"loss": 0.0651,
"step": 486
},
{
"epoch": 4.467889908256881,
"grad_norm": 9918.951171875,
"learning_rate": 3.050505050505051e-05,
"loss": 0.087,
"step": 487
},
{
"epoch": 4.477064220183486,
"grad_norm": 19711.619140625,
"learning_rate": 3.0454545454545456e-05,
"loss": 0.0764,
"step": 488
},
{
"epoch": 4.486238532110092,
"grad_norm": 8031.31591796875,
"learning_rate": 3.0404040404040406e-05,
"loss": 0.0589,
"step": 489
},
{
"epoch": 4.495412844036697,
"grad_norm": 9987.06640625,
"learning_rate": 3.0353535353535356e-05,
"loss": 0.0717,
"step": 490
},
{
"epoch": 4.504587155963303,
"grad_norm": 8235.3388671875,
"learning_rate": 3.0303030303030306e-05,
"loss": 0.0604,
"step": 491
},
{
"epoch": 4.513761467889909,
"grad_norm": 8193.5595703125,
"learning_rate": 3.0252525252525255e-05,
"loss": 0.059,
"step": 492
},
{
"epoch": 4.522935779816514,
"grad_norm": 9168.8310546875,
"learning_rate": 3.0202020202020205e-05,
"loss": 0.0681,
"step": 493
},
{
"epoch": 4.532110091743119,
"grad_norm": 7615.90234375,
"learning_rate": 3.015151515151515e-05,
"loss": 0.0701,
"step": 494
},
{
"epoch": 4.541284403669724,
"grad_norm": 9336.4892578125,
"learning_rate": 3.01010101010101e-05,
"loss": 0.0774,
"step": 495
},
{
"epoch": 4.5504587155963305,
"grad_norm": 12258.80078125,
"learning_rate": 3.005050505050505e-05,
"loss": 0.0688,
"step": 496
},
{
"epoch": 4.559633027522936,
"grad_norm": 8392.525390625,
"learning_rate": 3e-05,
"loss": 0.0564,
"step": 497
},
{
"epoch": 4.568807339449541,
"grad_norm": 9604.12890625,
"learning_rate": 2.994949494949495e-05,
"loss": 0.064,
"step": 498
},
{
"epoch": 4.577981651376147,
"grad_norm": 9194.5859375,
"learning_rate": 2.98989898989899e-05,
"loss": 0.0636,
"step": 499
},
{
"epoch": 4.587155963302752,
"grad_norm": 9700.25390625,
"learning_rate": 2.9848484848484847e-05,
"loss": 0.0679,
"step": 500
},
{
"epoch": 4.5963302752293576,
"grad_norm": 8867.0146484375,
"learning_rate": 2.9797979797979796e-05,
"loss": 0.0701,
"step": 501
},
{
"epoch": 4.605504587155964,
"grad_norm": 10309.2646484375,
"learning_rate": 2.9747474747474746e-05,
"loss": 0.0691,
"step": 502
},
{
"epoch": 4.614678899082569,
"grad_norm": 8310.7587890625,
"learning_rate": 2.96969696969697e-05,
"loss": 0.0661,
"step": 503
},
{
"epoch": 4.623853211009174,
"grad_norm": 8526.0302734375,
"learning_rate": 2.964646464646465e-05,
"loss": 0.0629,
"step": 504
},
{
"epoch": 4.63302752293578,
"grad_norm": 7720.8388671875,
"learning_rate": 2.95959595959596e-05,
"loss": 0.068,
"step": 505
},
{
"epoch": 4.6422018348623855,
"grad_norm": 7792.38330078125,
"learning_rate": 2.954545454545455e-05,
"loss": 0.0655,
"step": 506
},
{
"epoch": 4.651376146788991,
"grad_norm": 9584.6845703125,
"learning_rate": 2.9494949494949498e-05,
"loss": 0.0703,
"step": 507
},
{
"epoch": 4.660550458715596,
"grad_norm": 9758.236328125,
"learning_rate": 2.9444444444444448e-05,
"loss": 0.0725,
"step": 508
},
{
"epoch": 4.669724770642202,
"grad_norm": 12375.326171875,
"learning_rate": 2.9393939393939394e-05,
"loss": 0.0644,
"step": 509
},
{
"epoch": 4.678899082568807,
"grad_norm": 10054.2705078125,
"learning_rate": 2.9343434343434344e-05,
"loss": 0.0462,
"step": 510
},
{
"epoch": 4.6880733944954125,
"grad_norm": 14444.8173828125,
"learning_rate": 2.9292929292929294e-05,
"loss": 0.0693,
"step": 511
},
{
"epoch": 4.697247706422019,
"grad_norm": 9948.7880859375,
"learning_rate": 2.9242424242424243e-05,
"loss": 0.0576,
"step": 512
},
{
"epoch": 4.706422018348624,
"grad_norm": 8357.00390625,
"learning_rate": 2.9191919191919193e-05,
"loss": 0.081,
"step": 513
},
{
"epoch": 4.715596330275229,
"grad_norm": 9313.916015625,
"learning_rate": 2.9141414141414143e-05,
"loss": 0.068,
"step": 514
},
{
"epoch": 4.724770642201835,
"grad_norm": 10014.9775390625,
"learning_rate": 2.909090909090909e-05,
"loss": 0.0672,
"step": 515
},
{
"epoch": 4.73394495412844,
"grad_norm": 10116.134765625,
"learning_rate": 2.904040404040404e-05,
"loss": 0.0668,
"step": 516
},
{
"epoch": 4.743119266055046,
"grad_norm": 7578.20703125,
"learning_rate": 2.898989898989899e-05,
"loss": 0.048,
"step": 517
},
{
"epoch": 4.752293577981652,
"grad_norm": 8988.373046875,
"learning_rate": 2.893939393939394e-05,
"loss": 0.0628,
"step": 518
},
{
"epoch": 4.761467889908257,
"grad_norm": 10147.2666015625,
"learning_rate": 2.8888888888888888e-05,
"loss": 0.0641,
"step": 519
},
{
"epoch": 4.770642201834862,
"grad_norm": 7943.02294921875,
"learning_rate": 2.883838383838384e-05,
"loss": 0.0557,
"step": 520
},
{
"epoch": 4.779816513761467,
"grad_norm": 8308.49609375,
"learning_rate": 2.878787878787879e-05,
"loss": 0.0617,
"step": 521
},
{
"epoch": 4.7889908256880735,
"grad_norm": 12425.205078125,
"learning_rate": 2.873737373737374e-05,
"loss": 0.0631,
"step": 522
},
{
"epoch": 4.798165137614679,
"grad_norm": 6772.55224609375,
"learning_rate": 2.868686868686869e-05,
"loss": 0.0535,
"step": 523
},
{
"epoch": 4.807339449541285,
"grad_norm": 10277.5205078125,
"learning_rate": 2.863636363636364e-05,
"loss": 0.0745,
"step": 524
},
{
"epoch": 4.81651376146789,
"grad_norm": 10549.9482421875,
"learning_rate": 2.8585858585858587e-05,
"loss": 0.0758,
"step": 525
},
{
"epoch": 4.825688073394495,
"grad_norm": 8067.88330078125,
"learning_rate": 2.8535353535353536e-05,
"loss": 0.0693,
"step": 526
},
{
"epoch": 4.834862385321101,
"grad_norm": 9647.41796875,
"learning_rate": 2.8484848484848486e-05,
"loss": 0.0523,
"step": 527
},
{
"epoch": 4.844036697247707,
"grad_norm": 7713.77587890625,
"learning_rate": 2.8434343434343436e-05,
"loss": 0.0561,
"step": 528
},
{
"epoch": 4.853211009174312,
"grad_norm": 7126.24658203125,
"learning_rate": 2.8383838383838386e-05,
"loss": 0.0602,
"step": 529
},
{
"epoch": 4.862385321100917,
"grad_norm": 9366.20703125,
"learning_rate": 2.8333333333333335e-05,
"loss": 0.0506,
"step": 530
},
{
"epoch": 4.871559633027523,
"grad_norm": 11138.2060546875,
"learning_rate": 2.8282828282828282e-05,
"loss": 0.0507,
"step": 531
},
{
"epoch": 4.8807339449541285,
"grad_norm": 8674.6513671875,
"learning_rate": 2.823232323232323e-05,
"loss": 0.0611,
"step": 532
},
{
"epoch": 4.889908256880734,
"grad_norm": 9256.6474609375,
"learning_rate": 2.818181818181818e-05,
"loss": 0.0641,
"step": 533
},
{
"epoch": 4.89908256880734,
"grad_norm": 11366.2158203125,
"learning_rate": 2.813131313131313e-05,
"loss": 0.0705,
"step": 534
},
{
"epoch": 4.908256880733945,
"grad_norm": 9709.6484375,
"learning_rate": 2.808080808080808e-05,
"loss": 0.0676,
"step": 535
},
{
"epoch": 4.91743119266055,
"grad_norm": 7580.935546875,
"learning_rate": 2.803030303030303e-05,
"loss": 0.0518,
"step": 536
},
{
"epoch": 4.926605504587156,
"grad_norm": 11798.3857421875,
"learning_rate": 2.7979797979797984e-05,
"loss": 0.0657,
"step": 537
},
{
"epoch": 4.935779816513762,
"grad_norm": 7368.62841796875,
"learning_rate": 2.7929292929292933e-05,
"loss": 0.0644,
"step": 538
},
{
"epoch": 4.944954128440367,
"grad_norm": 6701.8466796875,
"learning_rate": 2.7878787878787883e-05,
"loss": 0.0515,
"step": 539
},
{
"epoch": 4.954128440366972,
"grad_norm": 8849.685546875,
"learning_rate": 2.7828282828282833e-05,
"loss": 0.0547,
"step": 540
},
{
"epoch": 4.963302752293578,
"grad_norm": 9179.5751953125,
"learning_rate": 2.777777777777778e-05,
"loss": 0.0542,
"step": 541
},
{
"epoch": 4.972477064220183,
"grad_norm": 14458.0810546875,
"learning_rate": 2.772727272727273e-05,
"loss": 0.067,
"step": 542
},
{
"epoch": 4.981651376146789,
"grad_norm": 9508.0732421875,
"learning_rate": 2.767676767676768e-05,
"loss": 0.0666,
"step": 543
},
{
"epoch": 4.990825688073395,
"grad_norm": 9082.1591796875,
"learning_rate": 2.762626262626263e-05,
"loss": 0.0591,
"step": 544
},
{
"epoch": 5.0,
"grad_norm": 38391.5390625,
"learning_rate": 2.7575757575757578e-05,
"loss": 0.0782,
"step": 545
},
{
"epoch": 5.0,
"eval_f1_macro": 0.2588916080981378,
"eval_f1_micro": 0.5921985815602837,
"eval_loss": 0.06743249297142029,
"eval_precision": 0.8564102564102564,
"eval_recall": 0.45257452574525747,
"eval_runtime": 0.8048,
"eval_samples_per_second": 359.083,
"eval_steps_per_second": 16.153,
"step": 545
}
],
"logging_steps": 1,
"max_steps": 1090,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3415337226086400.0,
"train_batch_size": 24,
"trial_name": null,
"trial_params": null
}