ft-phi-3.5-mini-instruct / trainer_state.json
rshwndsz's picture
Add files using upload-large-folder tool
70edc46 verified
{
"best_metric": 0.8978955572876072,
"best_model_checkpoint": "./results/finetunes/20250205-121158__microsoft_Phi-3.5-mini-instruct__ft/checkpoint-1792",
"epoch": 0.13208520675167687,
"eval_steps": 16,
"global_step": 1792,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0001474165253924965,
"grad_norm": 112.13977813720703,
"learning_rate": 0.00012128399488167067,
"loss": 2.0334,
"step": 2
},
{
"epoch": 0.000294833050784993,
"grad_norm": 47.18525695800781,
"learning_rate": 0.00012128399457682722,
"loss": 0.4295,
"step": 4
},
{
"epoch": 0.0004422495761774895,
"grad_norm": 46.89369583129883,
"learning_rate": 0.0001212839940687548,
"loss": 1.793,
"step": 6
},
{
"epoch": 0.000589666101569986,
"grad_norm": 90.68251037597656,
"learning_rate": 0.00012128399335745342,
"loss": 1.582,
"step": 8
},
{
"epoch": 0.0007370826269624825,
"grad_norm": 10.48133373260498,
"learning_rate": 0.00012128399244292309,
"loss": 1.152,
"step": 10
},
{
"epoch": 0.000884499152354979,
"grad_norm": 57.58028030395508,
"learning_rate": 0.00012128399132516379,
"loss": 0.8417,
"step": 12
},
{
"epoch": 0.0010319156777474755,
"grad_norm": 24.7613468170166,
"learning_rate": 0.00012128399000417552,
"loss": 0.6337,
"step": 14
},
{
"epoch": 0.001179332203139972,
"grad_norm": 5.995689868927002,
"learning_rate": 0.00012128398847995831,
"loss": 0.29,
"step": 16
},
{
"epoch": 0.001179332203139972,
"eval_1_ratio_diff": -0.12081060015588468,
"eval_accuracy": 0.6360093530787218,
"eval_f1": 0.5856255545696539,
"eval_loss": 0.7121835350990295,
"eval_precision": 0.6790123456790124,
"eval_recall": 0.514820592823713,
"eval_runtime": 1440.0319,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 16
},
{
"epoch": 0.0013267487285324685,
"grad_norm": 20.756057739257812,
"learning_rate": 0.00012128398675251216,
"loss": 0.4541,
"step": 18
},
{
"epoch": 0.001474165253924965,
"grad_norm": 49.25767135620117,
"learning_rate": 0.00012128398482183706,
"loss": 1.1751,
"step": 20
},
{
"epoch": 0.0016215817793174615,
"grad_norm": 10.73904037475586,
"learning_rate": 0.00012128398268793303,
"loss": 0.2334,
"step": 22
},
{
"epoch": 0.001768998304709958,
"grad_norm": 3.5153348445892334,
"learning_rate": 0.00012128398035080009,
"loss": 0.8965,
"step": 24
},
{
"epoch": 0.0019164148301024544,
"grad_norm": 117.84137725830078,
"learning_rate": 0.0001212839778104382,
"loss": 2.9108,
"step": 26
},
{
"epoch": 0.002063831355494951,
"grad_norm": 108.86376190185547,
"learning_rate": 0.00012128397506684742,
"loss": 2.1317,
"step": 28
},
{
"epoch": 0.0022112478808874476,
"grad_norm": 19.305322647094727,
"learning_rate": 0.00012128397212002774,
"loss": 0.2653,
"step": 30
},
{
"epoch": 0.002358664406279944,
"grad_norm": 46.865966796875,
"learning_rate": 0.00012128396896997918,
"loss": 2.2461,
"step": 32
},
{
"epoch": 0.002358664406279944,
"eval_1_ratio_diff": -0.49961028838659394,
"eval_accuracy": 0.5003897116134061,
"eval_f1": 0.0,
"eval_loss": 1.7971160411834717,
"eval_precision": 0.0,
"eval_recall": 0.0,
"eval_runtime": 1438.1269,
"eval_samples_per_second": 0.892,
"eval_steps_per_second": 0.446,
"step": 32
},
{
"epoch": 0.0025060809316724405,
"grad_norm": 55.90218734741211,
"learning_rate": 0.00012128396561670172,
"loss": 1.0773,
"step": 34
},
{
"epoch": 0.002653497457064937,
"grad_norm": 8.257821083068848,
"learning_rate": 0.0001212839620601954,
"loss": 0.7481,
"step": 36
},
{
"epoch": 0.0028009139824574335,
"grad_norm": 11.776910781860352,
"learning_rate": 0.00012128395830046022,
"loss": 0.0906,
"step": 38
},
{
"epoch": 0.00294833050784993,
"grad_norm": 115.57841491699219,
"learning_rate": 0.00012128395433749618,
"loss": 3.0851,
"step": 40
},
{
"epoch": 0.0030957470332424264,
"grad_norm": 5.130585193634033,
"learning_rate": 0.00012128395017130333,
"loss": 0.9399,
"step": 42
},
{
"epoch": 0.003243163558634923,
"grad_norm": 43.877689361572266,
"learning_rate": 0.00012128394580188166,
"loss": 0.9284,
"step": 44
},
{
"epoch": 0.0033905800840274194,
"grad_norm": 48.76664733886719,
"learning_rate": 0.00012128394122923118,
"loss": 0.5431,
"step": 46
},
{
"epoch": 0.003537996609419916,
"grad_norm": 33.9229736328125,
"learning_rate": 0.00012128393645335193,
"loss": 0.6688,
"step": 48
},
{
"epoch": 0.003537996609419916,
"eval_1_ratio_diff": -0.09353078721745911,
"eval_accuracy": 0.764614185502728,
"eval_f1": 0.7401032702237521,
"eval_loss": 0.49912577867507935,
"eval_precision": 0.8253358925143954,
"eval_recall": 0.6708268330733229,
"eval_runtime": 1439.1521,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 48
},
{
"epoch": 0.0036854131348124123,
"grad_norm": 20.783430099487305,
"learning_rate": 0.00012128393147424389,
"loss": 0.7502,
"step": 50
},
{
"epoch": 0.003832829660204909,
"grad_norm": 28.81708335876465,
"learning_rate": 0.0001212839262919071,
"loss": 0.8271,
"step": 52
},
{
"epoch": 0.003980246185597405,
"grad_norm": 58.47079086303711,
"learning_rate": 0.00012128392090634156,
"loss": 1.0213,
"step": 54
},
{
"epoch": 0.004127662710989902,
"grad_norm": 107.4663314819336,
"learning_rate": 0.00012128391531754733,
"loss": 1.6449,
"step": 56
},
{
"epoch": 0.004275079236382398,
"grad_norm": 21.926761627197266,
"learning_rate": 0.00012128390952552436,
"loss": 1.5282,
"step": 58
},
{
"epoch": 0.004422495761774895,
"grad_norm": 108.13206481933594,
"learning_rate": 0.00012128390353027275,
"loss": 1.2688,
"step": 60
},
{
"epoch": 0.004569912287167391,
"grad_norm": 85.27387237548828,
"learning_rate": 0.00012128389733179246,
"loss": 1.4725,
"step": 62
},
{
"epoch": 0.004717328812559888,
"grad_norm": 3.8993313312530518,
"learning_rate": 0.00012128389093008353,
"loss": 0.1737,
"step": 64
},
{
"epoch": 0.004717328812559888,
"eval_1_ratio_diff": 0.05378020265003891,
"eval_accuracy": 0.7809820732657833,
"eval_f1": 0.7920059215396003,
"eval_loss": 0.4972352981567383,
"eval_precision": 0.7535211267605634,
"eval_recall": 0.8346333853354134,
"eval_runtime": 1439.2432,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 64
},
{
"epoch": 0.004864745337952384,
"grad_norm": 1.8932548761367798,
"learning_rate": 0.00012128388432514599,
"loss": 0.1574,
"step": 66
},
{
"epoch": 0.005012161863344881,
"grad_norm": 31.92827606201172,
"learning_rate": 0.00012128387751697984,
"loss": 0.2239,
"step": 68
},
{
"epoch": 0.005159578388737377,
"grad_norm": 57.11052703857422,
"learning_rate": 0.00012128387050558512,
"loss": 1.4278,
"step": 70
},
{
"epoch": 0.005306994914129874,
"grad_norm": 0.45575767755508423,
"learning_rate": 0.00012128386329096184,
"loss": 2.7855,
"step": 72
},
{
"epoch": 0.00545441143952237,
"grad_norm": 71.60086059570312,
"learning_rate": 0.00012128385587311005,
"loss": 1.4066,
"step": 74
},
{
"epoch": 0.005601827964914867,
"grad_norm": 0.1034606546163559,
"learning_rate": 0.00012128384825202977,
"loss": 2.1198,
"step": 76
},
{
"epoch": 0.005749244490307364,
"grad_norm": 0.3067642152309418,
"learning_rate": 0.00012128384042772098,
"loss": 0.0126,
"step": 78
},
{
"epoch": 0.00589666101569986,
"grad_norm": 63.32870101928711,
"learning_rate": 0.00012128383240018376,
"loss": 1.4007,
"step": 80
},
{
"epoch": 0.00589666101569986,
"eval_1_ratio_diff": 0.04130943102104445,
"eval_accuracy": 0.7653936087295401,
"eval_f1": 0.7745318352059926,
"eval_loss": 1.208424687385559,
"eval_precision": 0.7449567723342939,
"eval_recall": 0.8065522620904836,
"eval_runtime": 1438.9869,
"eval_samples_per_second": 0.892,
"eval_steps_per_second": 0.446,
"step": 80
},
{
"epoch": 0.006044077541092357,
"grad_norm": 0.002626498695462942,
"learning_rate": 0.00012128382416941812,
"loss": 0.003,
"step": 82
},
{
"epoch": 0.006191494066484853,
"grad_norm": 78.83605194091797,
"learning_rate": 0.00012128381573542408,
"loss": 1.7103,
"step": 84
},
{
"epoch": 0.00633891059187735,
"grad_norm": 0.04237201437354088,
"learning_rate": 0.00012128380709820168,
"loss": 0.0184,
"step": 86
},
{
"epoch": 0.006486327117269846,
"grad_norm": 57.11608123779297,
"learning_rate": 0.00012128379825775094,
"loss": 0.3886,
"step": 88
},
{
"epoch": 0.006633743642662343,
"grad_norm": 71.66314697265625,
"learning_rate": 0.00012128378921407189,
"loss": 1.0122,
"step": 90
},
{
"epoch": 0.006781160168054839,
"grad_norm": 60.63711166381836,
"learning_rate": 0.00012128377996716456,
"loss": 2.2072,
"step": 92
},
{
"epoch": 0.006928576693447336,
"grad_norm": 64.88410186767578,
"learning_rate": 0.00012128377051702896,
"loss": 1.7641,
"step": 94
},
{
"epoch": 0.007075993218839832,
"grad_norm": 15.290694236755371,
"learning_rate": 0.00012128376086366519,
"loss": 0.2084,
"step": 96
},
{
"epoch": 0.007075993218839832,
"eval_1_ratio_diff": -0.07794232268121593,
"eval_accuracy": 0.764614185502728,
"eval_f1": 0.7445008460236887,
"eval_loss": 0.6278901100158691,
"eval_precision": 0.8133086876155268,
"eval_recall": 0.6864274570982839,
"eval_runtime": 1439.7986,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 96
},
{
"epoch": 0.007223409744232329,
"grad_norm": 4.278674125671387,
"learning_rate": 0.00012128375100707322,
"loss": 0.0205,
"step": 98
},
{
"epoch": 0.007370826269624825,
"grad_norm": 12.730552673339844,
"learning_rate": 0.00012128374094725308,
"loss": 0.0596,
"step": 100
},
{
"epoch": 0.007518242795017322,
"grad_norm": 0.03387758880853653,
"learning_rate": 0.00012128373068420486,
"loss": 1.1734,
"step": 102
},
{
"epoch": 0.007665659320409818,
"grad_norm": 0.002689527813345194,
"learning_rate": 0.00012128372021792852,
"loss": 0.016,
"step": 104
},
{
"epoch": 0.007813075845802315,
"grad_norm": 46.29806900024414,
"learning_rate": 0.00012128370954842415,
"loss": 3.8453,
"step": 106
},
{
"epoch": 0.00796049237119481,
"grad_norm": 65.56766510009766,
"learning_rate": 0.00012128369867569178,
"loss": 3.0592,
"step": 108
},
{
"epoch": 0.008107908896587307,
"grad_norm": 67.830322265625,
"learning_rate": 0.00012128368759973141,
"loss": 1.5232,
"step": 110
},
{
"epoch": 0.008255325421979804,
"grad_norm": 1.828292965888977,
"learning_rate": 0.00012128367632054312,
"loss": 0.899,
"step": 112
},
{
"epoch": 0.008255325421979804,
"eval_1_ratio_diff": -0.24707716289945442,
"eval_accuracy": 0.6952455183164459,
"eval_f1": 0.5948186528497409,
"eval_loss": 1.2687604427337646,
"eval_precision": 0.8858024691358025,
"eval_recall": 0.44773790951638065,
"eval_runtime": 1440.6646,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 112
},
{
"epoch": 0.0084027419473723,
"grad_norm": 2.445478916168213,
"learning_rate": 0.00012128366483812693,
"loss": 1.3983,
"step": 114
},
{
"epoch": 0.008550158472764796,
"grad_norm": 0.8839952349662781,
"learning_rate": 0.00012128365315248287,
"loss": 2.515,
"step": 116
},
{
"epoch": 0.008697574998157294,
"grad_norm": 20.67784881591797,
"learning_rate": 0.000121283641263611,
"loss": 1.5722,
"step": 118
},
{
"epoch": 0.00884499152354979,
"grad_norm": 1.1078622341156006,
"learning_rate": 0.00012128362917151136,
"loss": 0.0058,
"step": 120
},
{
"epoch": 0.008992408048942286,
"grad_norm": 52.540367126464844,
"learning_rate": 0.00012128361687618396,
"loss": 2.8601,
"step": 122
},
{
"epoch": 0.009139824574334782,
"grad_norm": 40.01364517211914,
"learning_rate": 0.00012128360437762885,
"loss": 0.6845,
"step": 124
},
{
"epoch": 0.00928724109972728,
"grad_norm": 4.011626243591309,
"learning_rate": 0.00012128359167584609,
"loss": 0.6806,
"step": 126
},
{
"epoch": 0.009434657625119776,
"grad_norm": 12.99624252319336,
"learning_rate": 0.00012128357877083573,
"loss": 0.8965,
"step": 128
},
{
"epoch": 0.009434657625119776,
"eval_1_ratio_diff": 0.33982852689010135,
"eval_accuracy": 0.6492595479345284,
"eval_f1": 0.7380675203725262,
"eval_loss": 0.9785400629043579,
"eval_precision": 0.5886722376973074,
"eval_recall": 0.9890795631825273,
"eval_runtime": 1440.0679,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 128
},
{
"epoch": 0.009582074150512272,
"grad_norm": 34.23851013183594,
"learning_rate": 0.00012128356566259777,
"loss": 0.3434,
"step": 130
},
{
"epoch": 0.009729490675904768,
"grad_norm": 66.7353286743164,
"learning_rate": 0.0001212835523511323,
"loss": 0.475,
"step": 132
},
{
"epoch": 0.009876907201297266,
"grad_norm": 56.82964324951172,
"learning_rate": 0.00012128353883643935,
"loss": 0.7709,
"step": 134
},
{
"epoch": 0.010024323726689762,
"grad_norm": 34.38500213623047,
"learning_rate": 0.00012128352511851894,
"loss": 0.7302,
"step": 136
},
{
"epoch": 0.010171740252082258,
"grad_norm": 106.88589477539062,
"learning_rate": 0.00012128351119737116,
"loss": 1.332,
"step": 138
},
{
"epoch": 0.010319156777474754,
"grad_norm": 85.7337875366211,
"learning_rate": 0.00012128349707299602,
"loss": 1.6342,
"step": 140
},
{
"epoch": 0.010466573302867252,
"grad_norm": 4.05411958694458,
"learning_rate": 0.00012128348274539358,
"loss": 0.0673,
"step": 142
},
{
"epoch": 0.010613989828259748,
"grad_norm": 2.334378719329834,
"learning_rate": 0.0001212834682145639,
"loss": 0.0332,
"step": 144
},
{
"epoch": 0.010613989828259748,
"eval_1_ratio_diff": -0.2346063912704599,
"eval_accuracy": 0.7014809041309431,
"eval_f1": 0.6095820591233435,
"eval_loss": 1.218570351600647,
"eval_precision": 0.8794117647058823,
"eval_recall": 0.4664586583463339,
"eval_runtime": 1440.6194,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 144
},
{
"epoch": 0.010761406353652244,
"grad_norm": 1.3649911880493164,
"learning_rate": 0.00012128345348050701,
"loss": 0.985,
"step": 146
},
{
"epoch": 0.01090882287904474,
"grad_norm": 6.569690227508545,
"learning_rate": 0.00012128343854322297,
"loss": 0.0316,
"step": 148
},
{
"epoch": 0.011056239404437238,
"grad_norm": 50.96843719482422,
"learning_rate": 0.00012128342340271183,
"loss": 2.5112,
"step": 150
},
{
"epoch": 0.011203655929829734,
"grad_norm": 46.42570877075195,
"learning_rate": 0.00012128340805897364,
"loss": 2.5907,
"step": 152
},
{
"epoch": 0.01135107245522223,
"grad_norm": 35.919315338134766,
"learning_rate": 0.00012128339251200845,
"loss": 0.5731,
"step": 154
},
{
"epoch": 0.011498488980614728,
"grad_norm": 0.33857831358909607,
"learning_rate": 0.0001212833767618163,
"loss": 0.0029,
"step": 156
},
{
"epoch": 0.011645905506007224,
"grad_norm": 0.6119909286499023,
"learning_rate": 0.00012128336080839724,
"loss": 0.0036,
"step": 158
},
{
"epoch": 0.01179332203139972,
"grad_norm": 34.078514099121094,
"learning_rate": 0.00012128334465175136,
"loss": 3.0454,
"step": 160
},
{
"epoch": 0.01179332203139972,
"eval_1_ratio_diff": -0.05222135619641466,
"eval_accuracy": 0.8106001558846454,
"eval_f1": 0.8,
"eval_loss": 0.9759823083877563,
"eval_precision": 0.8466898954703833,
"eval_recall": 0.7581903276131046,
"eval_runtime": 1440.5068,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 160
},
{
"epoch": 0.011940738556792216,
"grad_norm": 0.10960781574249268,
"learning_rate": 0.0001212833282918787,
"loss": 0.0036,
"step": 162
},
{
"epoch": 0.012088155082184714,
"grad_norm": 0.12220565974712372,
"learning_rate": 0.0001212833117287793,
"loss": 0.0025,
"step": 164
},
{
"epoch": 0.01223557160757721,
"grad_norm": 127.77825164794922,
"learning_rate": 0.00012128329496245321,
"loss": 2.7251,
"step": 166
},
{
"epoch": 0.012382988132969706,
"grad_norm": 65.698486328125,
"learning_rate": 0.0001212832779929005,
"loss": 0.4867,
"step": 168
},
{
"epoch": 0.012530404658362202,
"grad_norm": 37.85614013671875,
"learning_rate": 0.00012128326082012124,
"loss": 0.2097,
"step": 170
},
{
"epoch": 0.0126778211837547,
"grad_norm": 12.939319610595703,
"learning_rate": 0.00012128324344411546,
"loss": 1.4561,
"step": 172
},
{
"epoch": 0.012825237709147196,
"grad_norm": 81.24678039550781,
"learning_rate": 0.00012128322586488326,
"loss": 1.1304,
"step": 174
},
{
"epoch": 0.012972654234539692,
"grad_norm": 58.61750030517578,
"learning_rate": 0.00012128320808242463,
"loss": 0.9005,
"step": 176
},
{
"epoch": 0.012972654234539692,
"eval_1_ratio_diff": 0.2533125487139517,
"eval_accuracy": 0.7186282151208107,
"eval_f1": 0.7753578095830741,
"eval_loss": 0.8996144533157349,
"eval_precision": 0.644927536231884,
"eval_recall": 0.9719188767550702,
"eval_runtime": 1439.76,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 176
},
{
"epoch": 0.013120070759932188,
"grad_norm": 60.69062805175781,
"learning_rate": 0.00012128319009673968,
"loss": 1.4957,
"step": 178
},
{
"epoch": 0.013267487285324685,
"grad_norm": 6.7324652671813965,
"learning_rate": 0.00012128317190782848,
"loss": 0.2882,
"step": 180
},
{
"epoch": 0.013414903810717181,
"grad_norm": 0.18422821164131165,
"learning_rate": 0.00012128315351569106,
"loss": 0.5841,
"step": 182
},
{
"epoch": 0.013562320336109678,
"grad_norm": 106.35135650634766,
"learning_rate": 0.00012128313492032748,
"loss": 1.3522,
"step": 184
},
{
"epoch": 0.013709736861502174,
"grad_norm": 35.63379669189453,
"learning_rate": 0.00012128311612173782,
"loss": 1.237,
"step": 186
},
{
"epoch": 0.013857153386894671,
"grad_norm": 83.5736312866211,
"learning_rate": 0.00012128309711992214,
"loss": 1.3351,
"step": 188
},
{
"epoch": 0.014004569912287167,
"grad_norm": 97.8160400390625,
"learning_rate": 0.0001212830779148805,
"loss": 1.6019,
"step": 190
},
{
"epoch": 0.014151986437679663,
"grad_norm": 2.5867555141448975,
"learning_rate": 0.00012128305850661298,
"loss": 0.0897,
"step": 192
},
{
"epoch": 0.014151986437679663,
"eval_1_ratio_diff": 0.24863600935307872,
"eval_accuracy": 0.7295401402961809,
"eval_f1": 0.783260462211118,
"eval_loss": 1.138918161392212,
"eval_precision": 0.653125,
"eval_recall": 0.9781591263650546,
"eval_runtime": 1440.7407,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 192
},
{
"epoch": 0.014299402963072161,
"grad_norm": 71.02184295654297,
"learning_rate": 0.00012128303889511963,
"loss": 1.3708,
"step": 194
},
{
"epoch": 0.014446819488464657,
"grad_norm": 0.5830493569374084,
"learning_rate": 0.0001212830190804005,
"loss": 3.0855,
"step": 196
},
{
"epoch": 0.014594236013857153,
"grad_norm": 63.9030876159668,
"learning_rate": 0.00012128299906245568,
"loss": 1.6675,
"step": 198
},
{
"epoch": 0.01474165253924965,
"grad_norm": 0.18025726079940796,
"learning_rate": 0.00012128297884128523,
"loss": 0.1379,
"step": 200
},
{
"epoch": 0.014889069064642147,
"grad_norm": 0.8397954702377319,
"learning_rate": 0.00012128295841688921,
"loss": 1.528,
"step": 202
},
{
"epoch": 0.015036485590034643,
"grad_norm": 78.28919219970703,
"learning_rate": 0.0001212829377892677,
"loss": 1.2677,
"step": 204
},
{
"epoch": 0.01518390211542714,
"grad_norm": 5.996486186981201,
"learning_rate": 0.00012128291695842078,
"loss": 1.205,
"step": 206
},
{
"epoch": 0.015331318640819635,
"grad_norm": 1.2115447521209717,
"learning_rate": 0.0001212828959243485,
"loss": 0.0076,
"step": 208
},
{
"epoch": 0.015331318640819635,
"eval_1_ratio_diff": 0.03351519875292286,
"eval_accuracy": 0.8402182385035074,
"eval_f1": 0.8452830188679246,
"eval_loss": 0.5696436166763306,
"eval_precision": 0.8187134502923976,
"eval_recall": 0.8736349453978159,
"eval_runtime": 1440.7431,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 208
},
{
"epoch": 0.015478735166212133,
"grad_norm": 41.47733688354492,
"learning_rate": 0.00012128287468705092,
"loss": 1.0424,
"step": 210
},
{
"epoch": 0.01562615169160463,
"grad_norm": 13.133481979370117,
"learning_rate": 0.00012128285324652816,
"loss": 0.0602,
"step": 212
},
{
"epoch": 0.015773568216997127,
"grad_norm": 14.336326599121094,
"learning_rate": 0.00012128283160278022,
"loss": 0.0887,
"step": 214
},
{
"epoch": 0.01592098474238962,
"grad_norm": 2.6840479373931885,
"learning_rate": 0.00012128280975580723,
"loss": 0.0105,
"step": 216
},
{
"epoch": 0.01606840126778212,
"grad_norm": 0.026224393397569656,
"learning_rate": 0.00012128278770560924,
"loss": 0.0006,
"step": 218
},
{
"epoch": 0.016215817793174613,
"grad_norm": 0.0356808602809906,
"learning_rate": 0.00012128276545218633,
"loss": 1.6274,
"step": 220
},
{
"epoch": 0.01636323431856711,
"grad_norm": 0.03703249245882034,
"learning_rate": 0.00012128274299553858,
"loss": 1.6564,
"step": 222
},
{
"epoch": 0.01651065084395961,
"grad_norm": 0.23091621696949005,
"learning_rate": 0.00012128272033566606,
"loss": 0.0017,
"step": 224
},
{
"epoch": 0.01651065084395961,
"eval_1_ratio_diff": 0.10210444271239283,
"eval_accuracy": 0.8106001558846454,
"eval_f1": 0.8280254777070064,
"eval_loss": 1.4256943464279175,
"eval_precision": 0.7577720207253886,
"eval_recall": 0.9126365054602185,
"eval_runtime": 1440.6468,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 224
},
{
"epoch": 0.016658067369352103,
"grad_norm": 0.2899627983570099,
"learning_rate": 0.00012128269747256883,
"loss": 0.0048,
"step": 226
},
{
"epoch": 0.0168054838947446,
"grad_norm": 138.98680114746094,
"learning_rate": 0.00012128267440624699,
"loss": 7.0607,
"step": 228
},
{
"epoch": 0.0169529004201371,
"grad_norm": 64.21833801269531,
"learning_rate": 0.0001212826511367006,
"loss": 2.4323,
"step": 230
},
{
"epoch": 0.017100316945529593,
"grad_norm": 69.21852111816406,
"learning_rate": 0.00012128262766392974,
"loss": 3.8941,
"step": 232
},
{
"epoch": 0.01724773347092209,
"grad_norm": 0.6788825988769531,
"learning_rate": 0.00012128260398793452,
"loss": 0.0033,
"step": 234
},
{
"epoch": 0.01739514999631459,
"grad_norm": 0.5503783822059631,
"learning_rate": 0.000121282580108715,
"loss": 0.0089,
"step": 236
},
{
"epoch": 0.017542566521707083,
"grad_norm": 1.4736528396606445,
"learning_rate": 0.00012128255602627122,
"loss": 0.6923,
"step": 238
},
{
"epoch": 0.01768998304709958,
"grad_norm": 0.052145253866910934,
"learning_rate": 0.0001212825317406033,
"loss": 0.003,
"step": 240
},
{
"epoch": 0.01768998304709958,
"eval_1_ratio_diff": 0.05689789555728764,
"eval_accuracy": 0.8667186282151208,
"eval_f1": 0.8738007380073801,
"eval_loss": 0.5649486184120178,
"eval_precision": 0.8291316526610645,
"eval_recall": 0.9235569422776911,
"eval_runtime": 1440.858,
"eval_samples_per_second": 0.89,
"eval_steps_per_second": 0.446,
"step": 240
},
{
"epoch": 0.017837399572492075,
"grad_norm": 33.907466888427734,
"learning_rate": 0.00012128250725171133,
"loss": 1.0754,
"step": 242
},
{
"epoch": 0.017984816097884573,
"grad_norm": 1.5523881912231445,
"learning_rate": 0.00012128248255959539,
"loss": 2.2872,
"step": 244
},
{
"epoch": 0.01813223262327707,
"grad_norm": 0.45814594626426697,
"learning_rate": 0.00012128245766425553,
"loss": 0.0082,
"step": 246
},
{
"epoch": 0.018279649148669565,
"grad_norm": 63.94032669067383,
"learning_rate": 0.00012128243256569185,
"loss": 1.7641,
"step": 248
},
{
"epoch": 0.018427065674062063,
"grad_norm": 0.17571286857128143,
"learning_rate": 0.00012128240726390445,
"loss": 0.0017,
"step": 250
},
{
"epoch": 0.01857448219945456,
"grad_norm": 0.08677598834037781,
"learning_rate": 0.0001212823817588934,
"loss": 2.0446,
"step": 252
},
{
"epoch": 0.018721898724847055,
"grad_norm": 0.06298824399709702,
"learning_rate": 0.00012128235605065879,
"loss": 0.0031,
"step": 254
},
{
"epoch": 0.018869315250239552,
"grad_norm": 0.04490824043750763,
"learning_rate": 0.00012128233013920071,
"loss": 0.0016,
"step": 256
},
{
"epoch": 0.018869315250239552,
"eval_1_ratio_diff": 0.26032735775526106,
"eval_accuracy": 0.7272018706157444,
"eval_f1": 0.7834158415841584,
"eval_loss": 1.7306467294692993,
"eval_precision": 0.6492307692307693,
"eval_recall": 0.9875195007800313,
"eval_runtime": 1441.1243,
"eval_samples_per_second": 0.89,
"eval_steps_per_second": 0.445,
"step": 256
},
{
"epoch": 0.019016731775632047,
"grad_norm": 64.88382720947266,
"learning_rate": 0.00012128230402451925,
"loss": 1.4818,
"step": 258
},
{
"epoch": 0.019164148301024544,
"grad_norm": 0.04304850101470947,
"learning_rate": 0.00012128227770661447,
"loss": 0.0006,
"step": 260
},
{
"epoch": 0.019311564826417042,
"grad_norm": 99.55477142333984,
"learning_rate": 0.00012128225118548648,
"loss": 1.3041,
"step": 262
},
{
"epoch": 0.019458981351809537,
"grad_norm": 64.24674987792969,
"learning_rate": 0.00012128222446113537,
"loss": 3.4221,
"step": 264
},
{
"epoch": 0.019606397877202034,
"grad_norm": 1.130561351776123,
"learning_rate": 0.00012128219753356123,
"loss": 0.0047,
"step": 266
},
{
"epoch": 0.019753814402594532,
"grad_norm": 60.320674896240234,
"learning_rate": 0.00012128217040276413,
"loss": 0.7215,
"step": 268
},
{
"epoch": 0.019901230927987026,
"grad_norm": 56.348636627197266,
"learning_rate": 0.0001212821430687442,
"loss": 3.0486,
"step": 270
},
{
"epoch": 0.020048647453379524,
"grad_norm": 4.682687759399414,
"learning_rate": 0.0001212821155315015,
"loss": 0.0195,
"step": 272
},
{
"epoch": 0.020048647453379524,
"eval_1_ratio_diff": -0.07170693686671864,
"eval_accuracy": 0.8035853468433359,
"eval_f1": 0.788235294117647,
"eval_loss": 0.7957486510276794,
"eval_precision": 0.8542805100182149,
"eval_recall": 0.7316692667706708,
"eval_runtime": 1438.4097,
"eval_samples_per_second": 0.892,
"eval_steps_per_second": 0.446,
"step": 272
},
{
"epoch": 0.020196063978772022,
"grad_norm": 0.11813419312238693,
"learning_rate": 0.00012128208779103613,
"loss": 0.1104,
"step": 274
},
{
"epoch": 0.020343480504164516,
"grad_norm": 61.332427978515625,
"learning_rate": 0.0001212820598473482,
"loss": 0.8622,
"step": 276
},
{
"epoch": 0.020490897029557014,
"grad_norm": 9.628612518310547,
"learning_rate": 0.00012128203170043776,
"loss": 0.0682,
"step": 278
},
{
"epoch": 0.02063831355494951,
"grad_norm": 59.6220703125,
"learning_rate": 0.00012128200335030495,
"loss": 0.7833,
"step": 280
},
{
"epoch": 0.020785730080342006,
"grad_norm": 1.084692358970642,
"learning_rate": 0.00012128197479694983,
"loss": 1.5881,
"step": 282
},
{
"epoch": 0.020933146605734504,
"grad_norm": 0.44916099309921265,
"learning_rate": 0.00012128194604037253,
"loss": 0.0187,
"step": 284
},
{
"epoch": 0.021080563131126998,
"grad_norm": 0.11146622151136398,
"learning_rate": 0.00012128191708057311,
"loss": 0.0025,
"step": 286
},
{
"epoch": 0.021227979656519496,
"grad_norm": 0.05726571008563042,
"learning_rate": 0.00012128188791755172,
"loss": 0.0004,
"step": 288
},
{
"epoch": 0.021227979656519496,
"eval_1_ratio_diff": 0.09119251753702262,
"eval_accuracy": 0.8277474668745128,
"eval_f1": 0.8420300214438885,
"eval_loss": 1.1355745792388916,
"eval_precision": 0.7770448548812665,
"eval_recall": 0.9188767550702028,
"eval_runtime": 1440.4727,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 288
},
{
"epoch": 0.021375396181911994,
"grad_norm": 63.95652770996094,
"learning_rate": 0.0001212818585513084,
"loss": 2.2186,
"step": 290
},
{
"epoch": 0.021522812707304488,
"grad_norm": 0.041420936584472656,
"learning_rate": 0.00012128182898184326,
"loss": 2.2755,
"step": 292
},
{
"epoch": 0.021670229232696986,
"grad_norm": 0.19315005838871002,
"learning_rate": 0.00012128179920915643,
"loss": 1.7156,
"step": 294
},
{
"epoch": 0.02181764575808948,
"grad_norm": 0.06642986834049225,
"learning_rate": 0.00012128176923324799,
"loss": 0.0021,
"step": 296
},
{
"epoch": 0.021965062283481978,
"grad_norm": 0.22619064152240753,
"learning_rate": 0.00012128173905411805,
"loss": 1.2636,
"step": 298
},
{
"epoch": 0.022112478808874476,
"grad_norm": 0.30320611596107483,
"learning_rate": 0.00012128170867176669,
"loss": 0.0031,
"step": 300
},
{
"epoch": 0.02225989533426697,
"grad_norm": 62.3597412109375,
"learning_rate": 0.00012128167808619403,
"loss": 1.3432,
"step": 302
},
{
"epoch": 0.022407311859659468,
"grad_norm": 63.980323791503906,
"learning_rate": 0.00012128164729740015,
"loss": 0.8526,
"step": 304
},
{
"epoch": 0.022407311859659468,
"eval_1_ratio_diff": 0.15354637568199536,
"eval_accuracy": 0.8028059236165238,
"eval_f1": 0.8289384719405003,
"eval_loss": 0.781088650226593,
"eval_precision": 0.7315035799522673,
"eval_recall": 0.9563182527301092,
"eval_runtime": 1439.8087,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 304
},
{
"epoch": 0.022554728385051966,
"grad_norm": 0.5441477298736572,
"learning_rate": 0.0001212816163053852,
"loss": 0.023,
"step": 306
},
{
"epoch": 0.02270214491044446,
"grad_norm": 60.2026252746582,
"learning_rate": 0.00012128158511014924,
"loss": 0.4811,
"step": 308
},
{
"epoch": 0.022849561435836958,
"grad_norm": 3.5183231830596924,
"learning_rate": 0.00012128155371169238,
"loss": 0.0164,
"step": 310
},
{
"epoch": 0.022996977961229455,
"grad_norm": 49.883365631103516,
"learning_rate": 0.00012128152211001475,
"loss": 2.6559,
"step": 312
},
{
"epoch": 0.02314439448662195,
"grad_norm": 0.21442897617816925,
"learning_rate": 0.00012128149030511643,
"loss": 1.0737,
"step": 314
},
{
"epoch": 0.023291811012014448,
"grad_norm": 66.95639038085938,
"learning_rate": 0.00012128145829699753,
"loss": 2.2649,
"step": 316
},
{
"epoch": 0.023439227537406942,
"grad_norm": 41.275150299072266,
"learning_rate": 0.00012128142608565818,
"loss": 1.4307,
"step": 318
},
{
"epoch": 0.02358664406279944,
"grad_norm": 60.39665603637695,
"learning_rate": 0.00012128139367109845,
"loss": 0.8912,
"step": 320
},
{
"epoch": 0.02358664406279944,
"eval_1_ratio_diff": 0.15666406858924398,
"eval_accuracy": 0.7903351519875292,
"eval_f1": 0.8186109238031019,
"eval_loss": 0.6988638043403625,
"eval_precision": 0.7209026128266033,
"eval_recall": 0.9469578783151326,
"eval_runtime": 1440.1147,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 320
},
{
"epoch": 0.023734060588191937,
"grad_norm": 0.26957735419273376,
"learning_rate": 0.0001212813610533185,
"loss": 0.0109,
"step": 322
},
{
"epoch": 0.02388147711358443,
"grad_norm": 1.1442532539367676,
"learning_rate": 0.00012128132823231837,
"loss": 0.8164,
"step": 324
},
{
"epoch": 0.02402889363897693,
"grad_norm": 2.7633121013641357,
"learning_rate": 0.00012128129520809825,
"loss": 0.0146,
"step": 326
},
{
"epoch": 0.024176310164369427,
"grad_norm": 103.85281372070312,
"learning_rate": 0.00012128126198065819,
"loss": 2.8926,
"step": 328
},
{
"epoch": 0.02432372668976192,
"grad_norm": 4.870635032653809,
"learning_rate": 0.00012128122854999832,
"loss": 0.0289,
"step": 330
},
{
"epoch": 0.02447114321515442,
"grad_norm": 0.17178401350975037,
"learning_rate": 0.00012128119491611876,
"loss": 0.7425,
"step": 332
},
{
"epoch": 0.024618559740546914,
"grad_norm": 37.24171447753906,
"learning_rate": 0.00012128116107901961,
"loss": 3.577,
"step": 334
},
{
"epoch": 0.02476597626593941,
"grad_norm": 12.520587921142578,
"learning_rate": 0.00012128112703870099,
"loss": 0.0673,
"step": 336
},
{
"epoch": 0.02476597626593941,
"eval_1_ratio_diff": -0.025720966484801266,
"eval_accuracy": 0.857365549493375,
"eval_f1": 0.8534827862289832,
"eval_loss": 0.4316674470901489,
"eval_precision": 0.8766447368421053,
"eval_recall": 0.8315132605304212,
"eval_runtime": 1440.3285,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 336
},
{
"epoch": 0.02491339279133191,
"grad_norm": 41.28479766845703,
"learning_rate": 0.00012128109279516303,
"loss": 0.2896,
"step": 338
},
{
"epoch": 0.025060809316724404,
"grad_norm": 6.806232452392578,
"learning_rate": 0.00012128105834840581,
"loss": 0.0378,
"step": 340
},
{
"epoch": 0.0252082258421169,
"grad_norm": 2.091874361038208,
"learning_rate": 0.00012128102369842947,
"loss": 0.0118,
"step": 342
},
{
"epoch": 0.0253556423675094,
"grad_norm": 57.055580139160156,
"learning_rate": 0.00012128098884523412,
"loss": 0.6633,
"step": 344
},
{
"epoch": 0.025503058892901893,
"grad_norm": 59.19140625,
"learning_rate": 0.00012128095378881987,
"loss": 0.4166,
"step": 346
},
{
"epoch": 0.02565047541829439,
"grad_norm": 0.08690566569566727,
"learning_rate": 0.00012128091852918686,
"loss": 0.0041,
"step": 348
},
{
"epoch": 0.02579789194368689,
"grad_norm": 0.4953851103782654,
"learning_rate": 0.00012128088306633519,
"loss": 0.0058,
"step": 350
},
{
"epoch": 0.025945308469079383,
"grad_norm": 0.8310350179672241,
"learning_rate": 0.00012128084740026497,
"loss": 0.0115,
"step": 352
},
{
"epoch": 0.025945308469079383,
"eval_1_ratio_diff": -0.05455962587685115,
"eval_accuracy": 0.8659392049883087,
"eval_f1": 0.858085808580858,
"eval_loss": 0.6554389595985413,
"eval_precision": 0.9106830122591943,
"eval_recall": 0.8112324492979719,
"eval_runtime": 1441.1917,
"eval_samples_per_second": 0.89,
"eval_steps_per_second": 0.445,
"step": 352
},
{
"epoch": 0.02609272499447188,
"grad_norm": 75.97391510009766,
"learning_rate": 0.00012128081153097633,
"loss": 1.0946,
"step": 354
},
{
"epoch": 0.026240141519864375,
"grad_norm": 0.1318621188402176,
"learning_rate": 0.0001212807754584694,
"loss": 0.0013,
"step": 356
},
{
"epoch": 0.026387558045256873,
"grad_norm": 0.07249584794044495,
"learning_rate": 0.0001212807391827443,
"loss": 0.2854,
"step": 358
},
{
"epoch": 0.02653497457064937,
"grad_norm": 23.931421279907227,
"learning_rate": 0.00012128070270380113,
"loss": 0.0587,
"step": 360
},
{
"epoch": 0.026682391096041865,
"grad_norm": 228.77931213378906,
"learning_rate": 0.00012128066602164004,
"loss": 0.6358,
"step": 362
},
{
"epoch": 0.026829807621434363,
"grad_norm": 0.020578529685735703,
"learning_rate": 0.00012128062913626113,
"loss": 0.0003,
"step": 364
},
{
"epoch": 0.02697722414682686,
"grad_norm": 0.044141389429569244,
"learning_rate": 0.00012128059204766453,
"loss": 0.0003,
"step": 366
},
{
"epoch": 0.027124640672219355,
"grad_norm": 35.83491516113281,
"learning_rate": 0.00012128055475585035,
"loss": 2.1523,
"step": 368
},
{
"epoch": 0.027124640672219355,
"eval_1_ratio_diff": -0.044427123928293066,
"eval_accuracy": 0.8604832424006236,
"eval_f1": 0.8538775510204082,
"eval_loss": 1.1068644523620605,
"eval_precision": 0.8955479452054794,
"eval_recall": 0.8159126365054602,
"eval_runtime": 1440.348,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 368
},
{
"epoch": 0.027272057197611853,
"grad_norm": 0.004144140053540468,
"learning_rate": 0.00012128051726081876,
"loss": 0.0,
"step": 370
},
{
"epoch": 0.027419473723004347,
"grad_norm": 0.0015425934689119458,
"learning_rate": 0.00012128047956256984,
"loss": 0.0002,
"step": 372
},
{
"epoch": 0.027566890248396845,
"grad_norm": 36.42764663696289,
"learning_rate": 0.00012128044166110374,
"loss": 2.8486,
"step": 374
},
{
"epoch": 0.027714306773789343,
"grad_norm": 0.6206398010253906,
"learning_rate": 0.00012128040355642058,
"loss": 2.924,
"step": 376
},
{
"epoch": 0.027861723299181837,
"grad_norm": 97.60330963134766,
"learning_rate": 0.00012128036524852049,
"loss": 1.9209,
"step": 378
},
{
"epoch": 0.028009139824574335,
"grad_norm": 2.1615848541259766,
"learning_rate": 0.0001212803267374036,
"loss": 0.0215,
"step": 380
},
{
"epoch": 0.028156556349966833,
"grad_norm": 41.35491180419922,
"learning_rate": 0.00012128028802307003,
"loss": 0.8105,
"step": 382
},
{
"epoch": 0.028303972875359327,
"grad_norm": 39.422916412353516,
"learning_rate": 0.00012128024910551992,
"loss": 1.131,
"step": 384
},
{
"epoch": 0.028303972875359327,
"eval_1_ratio_diff": -0.3904910366328917,
"eval_accuracy": 0.5876851130163678,
"eval_f1": 0.322663252240717,
"eval_loss": 1.1657379865646362,
"eval_precision": 0.9,
"eval_recall": 0.19656786271450857,
"eval_runtime": 1441.4939,
"eval_samples_per_second": 0.89,
"eval_steps_per_second": 0.445,
"step": 384
},
{
"epoch": 0.028451389400751825,
"grad_norm": 1.7290548086166382,
"learning_rate": 0.0001212802099847534,
"loss": 0.0986,
"step": 386
},
{
"epoch": 0.028598805926144322,
"grad_norm": 40.167484283447266,
"learning_rate": 0.00012128017066077058,
"loss": 1.1352,
"step": 388
},
{
"epoch": 0.028746222451536817,
"grad_norm": 36.6862678527832,
"learning_rate": 0.00012128013113357162,
"loss": 2.6405,
"step": 390
},
{
"epoch": 0.028893638976929314,
"grad_norm": 1.1684958934783936,
"learning_rate": 0.00012128009140315665,
"loss": 1.1565,
"step": 392
},
{
"epoch": 0.02904105550232181,
"grad_norm": 28.306957244873047,
"learning_rate": 0.00012128005146952578,
"loss": 1.6548,
"step": 394
},
{
"epoch": 0.029188472027714307,
"grad_norm": 18.64267349243164,
"learning_rate": 0.00012128001133267917,
"loss": 1.1205,
"step": 396
},
{
"epoch": 0.029335888553106804,
"grad_norm": 7.279528617858887,
"learning_rate": 0.00012127997099261693,
"loss": 0.6742,
"step": 398
},
{
"epoch": 0.0294833050784993,
"grad_norm": 41.569854736328125,
"learning_rate": 0.00012127993044933921,
"loss": 0.6977,
"step": 400
},
{
"epoch": 0.0294833050784993,
"eval_1_ratio_diff": -0.2704598597038192,
"eval_accuracy": 0.6344505066250974,
"eval_f1": 0.4983957219251337,
"eval_loss": 0.6263108849525452,
"eval_precision": 0.7925170068027211,
"eval_recall": 0.36349453978159124,
"eval_runtime": 1441.3891,
"eval_samples_per_second": 0.89,
"eval_steps_per_second": 0.445,
"step": 400
},
{
"epoch": 0.029630721603891796,
"grad_norm": 14.647398948669434,
"learning_rate": 0.00012127988970284616,
"loss": 0.4508,
"step": 402
},
{
"epoch": 0.029778138129284294,
"grad_norm": 21.75971221923828,
"learning_rate": 0.00012127984875313788,
"loss": 0.6282,
"step": 404
},
{
"epoch": 0.02992555465467679,
"grad_norm": 32.292236328125,
"learning_rate": 0.00012127980760021456,
"loss": 1.0279,
"step": 406
},
{
"epoch": 0.030072971180069286,
"grad_norm": 59.10111999511719,
"learning_rate": 0.00012127976624407626,
"loss": 1.7322,
"step": 408
},
{
"epoch": 0.03022038770546178,
"grad_norm": 56.45620346069336,
"learning_rate": 0.00012127972468472319,
"loss": 2.3399,
"step": 410
},
{
"epoch": 0.03036780423085428,
"grad_norm": 33.3152961730957,
"learning_rate": 0.00012127968292215546,
"loss": 1.1374,
"step": 412
},
{
"epoch": 0.030515220756246776,
"grad_norm": 9.003528594970703,
"learning_rate": 0.00012127964095637322,
"loss": 0.531,
"step": 414
},
{
"epoch": 0.03066263728163927,
"grad_norm": 11.181624412536621,
"learning_rate": 0.00012127959878737659,
"loss": 0.167,
"step": 416
},
{
"epoch": 0.03066263728163927,
"eval_1_ratio_diff": -0.49961028838659394,
"eval_accuracy": 0.5003897116134061,
"eval_f1": 0.0,
"eval_loss": 0.9164891839027405,
"eval_precision": 0.0,
"eval_recall": 0.0,
"eval_runtime": 1441.9045,
"eval_samples_per_second": 0.89,
"eval_steps_per_second": 0.445,
"step": 416
},
{
"epoch": 0.030810053807031768,
"grad_norm": 81.1378173828125,
"learning_rate": 0.00012127955641516573,
"loss": 1.5427,
"step": 418
},
{
"epoch": 0.030957470332424266,
"grad_norm": 40.89067840576172,
"learning_rate": 0.00012127951383974079,
"loss": 0.8105,
"step": 420
},
{
"epoch": 0.03110488685781676,
"grad_norm": 0.7650836706161499,
"learning_rate": 0.00012127947106110188,
"loss": 0.8716,
"step": 422
},
{
"epoch": 0.03125230338320926,
"grad_norm": 41.49223709106445,
"learning_rate": 0.00012127942807924917,
"loss": 3.4998,
"step": 424
},
{
"epoch": 0.031399719908601756,
"grad_norm": 0.12294773012399673,
"learning_rate": 0.00012127938489418281,
"loss": 1.8698,
"step": 426
},
{
"epoch": 0.031547136433994254,
"grad_norm": 35.12305450439453,
"learning_rate": 0.00012127934150590295,
"loss": 1.6532,
"step": 428
},
{
"epoch": 0.031694552959386744,
"grad_norm": 27.799177169799805,
"learning_rate": 0.00012127929791440968,
"loss": 0.5514,
"step": 430
},
{
"epoch": 0.03184196948477924,
"grad_norm": 24.18194580078125,
"learning_rate": 0.00012127925411970319,
"loss": 0.6588,
"step": 432
},
{
"epoch": 0.03184196948477924,
"eval_1_ratio_diff": 0.0615744349181605,
"eval_accuracy": 0.8074824629773967,
"eval_f1": 0.8185157972079353,
"eval_loss": 0.46238815784454346,
"eval_precision": 0.7736111111111111,
"eval_recall": 0.8689547581903276,
"eval_runtime": 1441.3065,
"eval_samples_per_second": 0.89,
"eval_steps_per_second": 0.445,
"step": 432
},
{
"epoch": 0.03198938601017174,
"grad_norm": 39.476436614990234,
"learning_rate": 0.00012127921012178362,
"loss": 0.5056,
"step": 434
},
{
"epoch": 0.03213680253556424,
"grad_norm": 17.45188331604004,
"learning_rate": 0.00012127916592065112,
"loss": 1.9197,
"step": 436
},
{
"epoch": 0.032284219060956736,
"grad_norm": 37.614906311035156,
"learning_rate": 0.00012127912151630586,
"loss": 1.4371,
"step": 438
},
{
"epoch": 0.032431635586349226,
"grad_norm": 6.937824726104736,
"learning_rate": 0.00012127907690874794,
"loss": 0.1527,
"step": 440
},
{
"epoch": 0.032579052111741724,
"grad_norm": 1.9573392868041992,
"learning_rate": 0.00012127903209797754,
"loss": 0.0619,
"step": 442
},
{
"epoch": 0.03272646863713422,
"grad_norm": 5.234042167663574,
"learning_rate": 0.00012127898708399481,
"loss": 0.0308,
"step": 444
},
{
"epoch": 0.03287388516252672,
"grad_norm": 19.76664161682129,
"learning_rate": 0.00012127894186679988,
"loss": 2.5914,
"step": 446
},
{
"epoch": 0.03302130168791922,
"grad_norm": 48.643428802490234,
"learning_rate": 0.00012127889644639293,
"loss": 3.5738,
"step": 448
},
{
"epoch": 0.03302130168791922,
"eval_1_ratio_diff": -0.4964925954793453,
"eval_accuracy": 0.5035074045206547,
"eval_f1": 0.012403100775193798,
"eval_loss": 2.0848419666290283,
"eval_precision": 1.0,
"eval_recall": 0.0062402496099844,
"eval_runtime": 1441.7896,
"eval_samples_per_second": 0.89,
"eval_steps_per_second": 0.445,
"step": 448
},
{
"epoch": 0.033168718213311715,
"grad_norm": 41.91992950439453,
"learning_rate": 0.0001212788508227741,
"loss": 3.656,
"step": 450
},
{
"epoch": 0.033316134738704206,
"grad_norm": 58.21712112426758,
"learning_rate": 0.00012127880499594355,
"loss": 2.5973,
"step": 452
},
{
"epoch": 0.033463551264096704,
"grad_norm": 14.196877479553223,
"learning_rate": 0.00012127875896590141,
"loss": 0.9817,
"step": 454
},
{
"epoch": 0.0336109677894892,
"grad_norm": 21.982349395751953,
"learning_rate": 0.00012127871273264783,
"loss": 0.6516,
"step": 456
},
{
"epoch": 0.0337583843148817,
"grad_norm": 26.360563278198242,
"learning_rate": 0.00012127866629618302,
"loss": 0.5606,
"step": 458
},
{
"epoch": 0.0339058008402742,
"grad_norm": 15.224770545959473,
"learning_rate": 0.00012127861965650708,
"loss": 0.4791,
"step": 460
},
{
"epoch": 0.03405321736566669,
"grad_norm": 40.95515441894531,
"learning_rate": 0.0001212785728136202,
"loss": 0.8481,
"step": 462
},
{
"epoch": 0.034200633891059186,
"grad_norm": 0.4365566670894623,
"learning_rate": 0.00012127852576752252,
"loss": 0.2475,
"step": 464
},
{
"epoch": 0.034200633891059186,
"eval_1_ratio_diff": 0.2938425565081839,
"eval_accuracy": 0.6890101325019485,
"eval_f1": 0.759493670886076,
"eval_loss": 0.8622868061065674,
"eval_precision": 0.618860510805501,
"eval_recall": 0.982839313572543,
"eval_runtime": 1441.2401,
"eval_samples_per_second": 0.89,
"eval_steps_per_second": 0.445,
"step": 464
},
{
"epoch": 0.034348050416451684,
"grad_norm": 15.40101432800293,
"learning_rate": 0.0001212784785182142,
"loss": 0.6156,
"step": 466
},
{
"epoch": 0.03449546694184418,
"grad_norm": 5.0568013191223145,
"learning_rate": 0.00012127843106569541,
"loss": 0.4877,
"step": 468
},
{
"epoch": 0.03464288346723668,
"grad_norm": 0.277358740568161,
"learning_rate": 0.00012127838340996629,
"loss": 0.6857,
"step": 470
},
{
"epoch": 0.03479029999262918,
"grad_norm": 0.04443424195051193,
"learning_rate": 0.00012127833555102701,
"loss": 0.0286,
"step": 472
},
{
"epoch": 0.03493771651802167,
"grad_norm": 35.34669876098633,
"learning_rate": 0.00012127828748887773,
"loss": 1.7842,
"step": 474
},
{
"epoch": 0.035085133043414166,
"grad_norm": 0.08662135899066925,
"learning_rate": 0.00012127823922351861,
"loss": 0.0011,
"step": 476
},
{
"epoch": 0.03523254956880666,
"grad_norm": 0.021065138280391693,
"learning_rate": 0.00012127819075494979,
"loss": 0.0013,
"step": 478
},
{
"epoch": 0.03537996609419916,
"grad_norm": 39.31500244140625,
"learning_rate": 0.00012127814208317148,
"loss": 1.3799,
"step": 480
},
{
"epoch": 0.03537996609419916,
"eval_1_ratio_diff": 0.03975058456742009,
"eval_accuracy": 0.838659392049883,
"eval_f1": 0.8447111777944486,
"eval_loss": 0.967132568359375,
"eval_precision": 0.8135838150289018,
"eval_recall": 0.8783151326053042,
"eval_runtime": 1441.5685,
"eval_samples_per_second": 0.89,
"eval_steps_per_second": 0.445,
"step": 480
},
{
"epoch": 0.03552738261959166,
"grad_norm": 0.021114541217684746,
"learning_rate": 0.0001212780932081838,
"loss": 0.0022,
"step": 482
},
{
"epoch": 0.03567479914498415,
"grad_norm": 0.15021076798439026,
"learning_rate": 0.00012127804412998695,
"loss": 0.0023,
"step": 484
},
{
"epoch": 0.03582221567037665,
"grad_norm": 0.017235957086086273,
"learning_rate": 0.00012127799484858106,
"loss": 0.0157,
"step": 486
},
{
"epoch": 0.035969632195769145,
"grad_norm": 0.07619292289018631,
"learning_rate": 0.00012127794536396632,
"loss": 0.0006,
"step": 488
},
{
"epoch": 0.03611704872116164,
"grad_norm": 0.35548681020736694,
"learning_rate": 0.0001212778956761429,
"loss": 0.0025,
"step": 490
},
{
"epoch": 0.03626446524655414,
"grad_norm": 0.019310960546135902,
"learning_rate": 0.00012127784578511092,
"loss": 0.0006,
"step": 492
},
{
"epoch": 0.03641188177194664,
"grad_norm": 0.0059149437583982944,
"learning_rate": 0.00012127779569087061,
"loss": 0.0222,
"step": 494
},
{
"epoch": 0.03655929829733913,
"grad_norm": 0.0023440527729690075,
"learning_rate": 0.00012127774539342209,
"loss": 2.0713,
"step": 496
},
{
"epoch": 0.03655929829733913,
"eval_1_ratio_diff": 0.0615744349181605,
"eval_accuracy": 0.848012470771629,
"eval_f1": 0.8567229977957385,
"eval_loss": 1.1258606910705566,
"eval_precision": 0.8097222222222222,
"eval_recall": 0.9095163806552262,
"eval_runtime": 1442.1776,
"eval_samples_per_second": 0.89,
"eval_steps_per_second": 0.445,
"step": 496
},
{
"epoch": 0.03670671482273163,
"grad_norm": 0.4357898235321045,
"learning_rate": 0.00012127769489276555,
"loss": 0.0017,
"step": 498
},
{
"epoch": 0.036854131348124125,
"grad_norm": 0.0051942430436611176,
"learning_rate": 0.00012127764418890117,
"loss": 0.0001,
"step": 500
},
{
"epoch": 0.03700154787351662,
"grad_norm": 0.048877667635679245,
"learning_rate": 0.0001212775932818291,
"loss": 1.0276,
"step": 502
},
{
"epoch": 0.03714896439890912,
"grad_norm": 0.030356034636497498,
"learning_rate": 0.00012127754217154949,
"loss": 2.3301,
"step": 504
},
{
"epoch": 0.03729638092430161,
"grad_norm": 0.06719710677862167,
"learning_rate": 0.00012127749085806257,
"loss": 0.0008,
"step": 506
},
{
"epoch": 0.03744379744969411,
"grad_norm": 0.8071137070655823,
"learning_rate": 0.00012127743934136846,
"loss": 0.0034,
"step": 508
},
{
"epoch": 0.03759121397508661,
"grad_norm": 66.58085632324219,
"learning_rate": 0.00012127738762146735,
"loss": 2.0918,
"step": 510
},
{
"epoch": 0.037738630500479105,
"grad_norm": 0.5617576241493225,
"learning_rate": 0.00012127733569835943,
"loss": 0.004,
"step": 512
},
{
"epoch": 0.037738630500479105,
"eval_1_ratio_diff": 0.13795791114575218,
"eval_accuracy": 0.8152766952455183,
"eval_f1": 0.8375599725839616,
"eval_loss": 1.003125548362732,
"eval_precision": 0.7469437652811736,
"eval_recall": 0.953198127925117,
"eval_runtime": 1441.8288,
"eval_samples_per_second": 0.89,
"eval_steps_per_second": 0.445,
"step": 512
},
{
"epoch": 0.0378860470258716,
"grad_norm": 0.019583938643336296,
"learning_rate": 0.00012127728357204487,
"loss": 0.0029,
"step": 514
},
{
"epoch": 0.03803346355126409,
"grad_norm": 66.44640350341797,
"learning_rate": 0.00012127723124252383,
"loss": 1.346,
"step": 516
},
{
"epoch": 0.03818088007665659,
"grad_norm": 0.05073532462120056,
"learning_rate": 0.00012127717870979647,
"loss": 1.726,
"step": 518
},
{
"epoch": 0.03832829660204909,
"grad_norm": 0.008476372808218002,
"learning_rate": 0.000121277125973863,
"loss": 0.0002,
"step": 520
},
{
"epoch": 0.03847571312744159,
"grad_norm": 78.07063293457031,
"learning_rate": 0.00012127707303472356,
"loss": 3.8118,
"step": 522
},
{
"epoch": 0.038623129652834085,
"grad_norm": 37.921451568603516,
"learning_rate": 0.00012127701989237836,
"loss": 3.374,
"step": 524
},
{
"epoch": 0.03877054617822658,
"grad_norm": 38.97615432739258,
"learning_rate": 0.0001212769665468276,
"loss": 1.849,
"step": 526
},
{
"epoch": 0.03891796270361907,
"grad_norm": 1.3990278244018555,
"learning_rate": 0.0001212769129980714,
"loss": 0.2307,
"step": 528
},
{
"epoch": 0.03891796270361907,
"eval_1_ratio_diff": -0.002338269680436489,
"eval_accuracy": 0.8511301636788776,
"eval_f1": 0.8506645817044566,
"eval_loss": 0.5836467742919922,
"eval_precision": 0.8526645768025078,
"eval_recall": 0.8486739469578783,
"eval_runtime": 1442.6344,
"eval_samples_per_second": 0.889,
"eval_steps_per_second": 0.445,
"step": 528
},
{
"epoch": 0.03906537922901157,
"grad_norm": 0.5216283798217773,
"learning_rate": 0.00012127685924610997,
"loss": 0.0092,
"step": 530
},
{
"epoch": 0.03921279575440407,
"grad_norm": 0.716465950012207,
"learning_rate": 0.00012127680529094349,
"loss": 0.0057,
"step": 532
},
{
"epoch": 0.039360212279796566,
"grad_norm": 0.17090915143489838,
"learning_rate": 0.00012127675113257214,
"loss": 0.0031,
"step": 534
},
{
"epoch": 0.039507628805189064,
"grad_norm": 62.14753723144531,
"learning_rate": 0.00012127669677099608,
"loss": 1.6501,
"step": 536
},
{
"epoch": 0.039655045330581555,
"grad_norm": 35.18620681762695,
"learning_rate": 0.00012127664220621553,
"loss": 0.8287,
"step": 538
},
{
"epoch": 0.03980246185597405,
"grad_norm": 34.50994873046875,
"learning_rate": 0.00012127658743823064,
"loss": 2.5161,
"step": 540
},
{
"epoch": 0.03994987838136655,
"grad_norm": 0.9479020237922668,
"learning_rate": 0.00012127653246704162,
"loss": 0.0155,
"step": 542
},
{
"epoch": 0.04009729490675905,
"grad_norm": 0.040624819695949554,
"learning_rate": 0.00012127647729264862,
"loss": 1.536,
"step": 544
},
{
"epoch": 0.04009729490675905,
"eval_1_ratio_diff": -0.008573655494933774,
"eval_accuracy": 0.8791893998441154,
"eval_f1": 0.8780487804878049,
"eval_loss": 0.49514248967170715,
"eval_precision": 0.8857142857142857,
"eval_recall": 0.8705148205928237,
"eval_runtime": 1441.6693,
"eval_samples_per_second": 0.89,
"eval_steps_per_second": 0.445,
"step": 544
},
{
"epoch": 0.040244711432151546,
"grad_norm": 0.08040345460176468,
"learning_rate": 0.00012127642191505187,
"loss": 0.0205,
"step": 546
},
{
"epoch": 0.040392127957544044,
"grad_norm": 58.783809661865234,
"learning_rate": 0.00012127636633425152,
"loss": 1.1192,
"step": 548
},
{
"epoch": 0.040539544482936535,
"grad_norm": 0.25617870688438416,
"learning_rate": 0.00012127631055024779,
"loss": 1.0263,
"step": 550
},
{
"epoch": 0.04068696100832903,
"grad_norm": 46.056339263916016,
"learning_rate": 0.00012127625456304081,
"loss": 1.1183,
"step": 552
},
{
"epoch": 0.04083437753372153,
"grad_norm": 0.17480018734931946,
"learning_rate": 0.00012127619837263082,
"loss": 0.0055,
"step": 554
},
{
"epoch": 0.04098179405911403,
"grad_norm": 0.37528491020202637,
"learning_rate": 0.000121276141979018,
"loss": 0.0032,
"step": 556
},
{
"epoch": 0.041129210584506526,
"grad_norm": 0.35542991757392883,
"learning_rate": 0.00012127608538220252,
"loss": 0.9512,
"step": 558
},
{
"epoch": 0.04127662710989902,
"grad_norm": 0.08831676840782166,
"learning_rate": 0.00012127602858218457,
"loss": 0.0184,
"step": 560
},
{
"epoch": 0.04127662710989902,
"eval_1_ratio_diff": 0.12860483242400622,
"eval_accuracy": 0.8402182385035074,
"eval_f1": 0.8583275742916379,
"eval_loss": 1.0018821954727173,
"eval_precision": 0.7704714640198511,
"eval_recall": 0.968798751950078,
"eval_runtime": 1442.4789,
"eval_samples_per_second": 0.889,
"eval_steps_per_second": 0.445,
"step": 560
},
{
"epoch": 0.041424043635291515,
"grad_norm": 46.25735092163086,
"learning_rate": 0.00012127597157896437,
"loss": 0.6495,
"step": 562
},
{
"epoch": 0.04157146016068401,
"grad_norm": 58.521575927734375,
"learning_rate": 0.00012127591437254209,
"loss": 1.4757,
"step": 564
},
{
"epoch": 0.04171887668607651,
"grad_norm": 0.3296540379524231,
"learning_rate": 0.0001212758569629179,
"loss": 2.2725,
"step": 566
},
{
"epoch": 0.04186629321146901,
"grad_norm": 0.03395453095436096,
"learning_rate": 0.00012127579935009204,
"loss": 0.0006,
"step": 568
},
{
"epoch": 0.042013709736861506,
"grad_norm": 0.02328958362340927,
"learning_rate": 0.00012127574153406467,
"loss": 0.0004,
"step": 570
},
{
"epoch": 0.042161126262253996,
"grad_norm": 58.99131774902344,
"learning_rate": 0.000121275683514836,
"loss": 2.0081,
"step": 572
},
{
"epoch": 0.042308542787646494,
"grad_norm": 0.9085908532142639,
"learning_rate": 0.0001212756252924062,
"loss": 0.006,
"step": 574
},
{
"epoch": 0.04245595931303899,
"grad_norm": 0.5718927383422852,
"learning_rate": 0.00012127556686677549,
"loss": 2.0144,
"step": 576
},
{
"epoch": 0.04245595931303899,
"eval_1_ratio_diff": 0.22291504286827746,
"eval_accuracy": 0.7443491816056118,
"eval_f1": 0.7908163265306123,
"eval_loss": 0.9025093913078308,
"eval_precision": 0.668824163969795,
"eval_recall": 0.9672386895475819,
"eval_runtime": 1442.0314,
"eval_samples_per_second": 0.89,
"eval_steps_per_second": 0.445,
"step": 576
},
{
"epoch": 0.04260337583843149,
"grad_norm": 0.4269089698791504,
"learning_rate": 0.00012127550823794406,
"loss": 1.8595,
"step": 578
},
{
"epoch": 0.04275079236382399,
"grad_norm": 1.5817714929580688,
"learning_rate": 0.00012127544940591211,
"loss": 0.4153,
"step": 580
},
{
"epoch": 0.04289820888921648,
"grad_norm": 56.673728942871094,
"learning_rate": 0.00012127539037067981,
"loss": 1.8132,
"step": 582
},
{
"epoch": 0.043045625414608976,
"grad_norm": 4.291464805603027,
"learning_rate": 0.0001212753311322474,
"loss": 0.3818,
"step": 584
},
{
"epoch": 0.043193041940001474,
"grad_norm": 18.92963981628418,
"learning_rate": 0.00012127527169061505,
"loss": 0.0941,
"step": 586
},
{
"epoch": 0.04334045846539397,
"grad_norm": 27.108686447143555,
"learning_rate": 0.00012127521204578297,
"loss": 0.1314,
"step": 588
},
{
"epoch": 0.04348787499078647,
"grad_norm": 33.73942184448242,
"learning_rate": 0.00012127515219775134,
"loss": 0.1772,
"step": 590
},
{
"epoch": 0.04363529151617896,
"grad_norm": 52.08650588989258,
"learning_rate": 0.00012127509214652041,
"loss": 0.4505,
"step": 592
},
{
"epoch": 0.04363529151617896,
"eval_1_ratio_diff": -0.1184723304754482,
"eval_accuracy": 0.798908807482463,
"eval_f1": 0.7716814159292036,
"eval_loss": 0.7536761164665222,
"eval_precision": 0.8916155419222904,
"eval_recall": 0.6801872074882995,
"eval_runtime": 1442.1268,
"eval_samples_per_second": 0.89,
"eval_steps_per_second": 0.445,
"step": 592
},
{
"epoch": 0.04378270804157146,
"grad_norm": 0.05625031143426895,
"learning_rate": 0.00012127503189209032,
"loss": 0.3175,
"step": 594
},
{
"epoch": 0.043930124566963956,
"grad_norm": 0.10953383892774582,
"learning_rate": 0.0001212749714344613,
"loss": 0.0059,
"step": 596
},
{
"epoch": 0.044077541092356454,
"grad_norm": 71.34505462646484,
"learning_rate": 0.00012127491077363357,
"loss": 0.5113,
"step": 598
},
{
"epoch": 0.04422495761774895,
"grad_norm": 0.012292311526834965,
"learning_rate": 0.00012127484990960732,
"loss": 0.0008,
"step": 600
},
{
"epoch": 0.04437237414314145,
"grad_norm": 0.010139914229512215,
"learning_rate": 0.00012127478884238274,
"loss": 0.0002,
"step": 602
},
{
"epoch": 0.04451979066853394,
"grad_norm": 58.99741744995117,
"learning_rate": 0.00012127472757196004,
"loss": 3.6273,
"step": 604
},
{
"epoch": 0.04466720719392644,
"grad_norm": 56.25634765625,
"learning_rate": 0.00012127466609833943,
"loss": 3.663,
"step": 606
},
{
"epoch": 0.044814623719318936,
"grad_norm": 56.98939895629883,
"learning_rate": 0.00012127460442152114,
"loss": 1.6247,
"step": 608
},
{
"epoch": 0.044814623719318936,
"eval_1_ratio_diff": -0.03273577552611068,
"eval_accuracy": 0.8456742010911925,
"eval_f1": 0.8403225806451613,
"eval_loss": 0.6838305592536926,
"eval_precision": 0.8697829716193656,
"eval_recall": 0.8127925117004681,
"eval_runtime": 1441.7869,
"eval_samples_per_second": 0.89,
"eval_steps_per_second": 0.445,
"step": 608
},
{
"epoch": 0.04496204024471143,
"grad_norm": 41.00777053833008,
"learning_rate": 0.00012127454254150532,
"loss": 3.2637,
"step": 610
},
{
"epoch": 0.04510945677010393,
"grad_norm": 15.958291053771973,
"learning_rate": 0.00012127448045829223,
"loss": 0.0749,
"step": 612
},
{
"epoch": 0.04525687329549642,
"grad_norm": 52.62068176269531,
"learning_rate": 0.00012127441817188204,
"loss": 1.1452,
"step": 614
},
{
"epoch": 0.04540428982088892,
"grad_norm": 0.8104878067970276,
"learning_rate": 0.00012127435568227499,
"loss": 0.0086,
"step": 616
},
{
"epoch": 0.04555170634628142,
"grad_norm": 6.7712883949279785,
"learning_rate": 0.00012127429298947129,
"loss": 0.035,
"step": 618
},
{
"epoch": 0.045699122871673915,
"grad_norm": 1.2900152206420898,
"learning_rate": 0.00012127423009347112,
"loss": 0.0133,
"step": 620
},
{
"epoch": 0.04584653939706641,
"grad_norm": 0.5468306541442871,
"learning_rate": 0.00012127416699427471,
"loss": 0.0066,
"step": 622
},
{
"epoch": 0.04599395592245891,
"grad_norm": 0.16869762539863586,
"learning_rate": 0.00012127410369188226,
"loss": 0.0026,
"step": 624
},
{
"epoch": 0.04599395592245891,
"eval_1_ratio_diff": 0.015588464536243185,
"eval_accuracy": 0.8862042088854248,
"eval_f1": 0.8878648233486943,
"eval_loss": 0.570717990398407,
"eval_precision": 0.8744326777609682,
"eval_recall": 0.9017160686427457,
"eval_runtime": 1441.498,
"eval_samples_per_second": 0.89,
"eval_steps_per_second": 0.445,
"step": 624
},
{
"epoch": 0.0461413724478514,
"grad_norm": 0.20163878798484802,
"learning_rate": 0.00012127404018629401,
"loss": 0.0013,
"step": 626
},
{
"epoch": 0.0462887889732439,
"grad_norm": 0.1430014669895172,
"learning_rate": 0.00012127397647751014,
"loss": 0.0016,
"step": 628
},
{
"epoch": 0.0464362054986364,
"grad_norm": 60.50364303588867,
"learning_rate": 0.00012127391256553088,
"loss": 1.6526,
"step": 630
},
{
"epoch": 0.046583622024028895,
"grad_norm": 0.009336289949715137,
"learning_rate": 0.00012127384845035646,
"loss": 0.0005,
"step": 632
},
{
"epoch": 0.04673103854942139,
"grad_norm": 0.02924017794430256,
"learning_rate": 0.00012127378413198706,
"loss": 2.0099,
"step": 634
},
{
"epoch": 0.046878455074813884,
"grad_norm": 0.1369701325893402,
"learning_rate": 0.00012127371961042292,
"loss": 1.9002,
"step": 636
},
{
"epoch": 0.04702587160020638,
"grad_norm": 77.09698486328125,
"learning_rate": 0.00012127365488566423,
"loss": 1.0021,
"step": 638
},
{
"epoch": 0.04717328812559888,
"grad_norm": 4.486428260803223,
"learning_rate": 0.00012127358995771124,
"loss": 1.8971,
"step": 640
},
{
"epoch": 0.04717328812559888,
"eval_1_ratio_diff": -0.17225253312548716,
"eval_accuracy": 0.7669524551831645,
"eval_f1": 0.7181903864278982,
"eval_loss": 1.282883644104004,
"eval_precision": 0.9071428571428571,
"eval_recall": 0.594383775351014,
"eval_runtime": 1441.6631,
"eval_samples_per_second": 0.89,
"eval_steps_per_second": 0.445,
"step": 640
},
{
"epoch": 0.04732070465099138,
"grad_norm": 0.3835877478122711,
"learning_rate": 0.00012127352482656414,
"loss": 1.5125,
"step": 642
},
{
"epoch": 0.047468121176383875,
"grad_norm": 0.3453172445297241,
"learning_rate": 0.00012127345949222316,
"loss": 1.4256,
"step": 644
},
{
"epoch": 0.04761553770177637,
"grad_norm": 56.087467193603516,
"learning_rate": 0.00012127339395468855,
"loss": 1.389,
"step": 646
},
{
"epoch": 0.04776295422716886,
"grad_norm": 39.20930099487305,
"learning_rate": 0.00012127332821396047,
"loss": 2.2849,
"step": 648
},
{
"epoch": 0.04791037075256136,
"grad_norm": 2.4249165058135986,
"learning_rate": 0.00012127326227003918,
"loss": 0.0286,
"step": 650
},
{
"epoch": 0.04805778727795386,
"grad_norm": 1.4587557315826416,
"learning_rate": 0.0001212731961229249,
"loss": 0.775,
"step": 652
},
{
"epoch": 0.04820520380334636,
"grad_norm": 45.33637237548828,
"learning_rate": 0.00012127312977261783,
"loss": 0.2852,
"step": 654
},
{
"epoch": 0.048352620328738855,
"grad_norm": 0.07065322250127792,
"learning_rate": 0.0001212730632191182,
"loss": 0.7127,
"step": 656
},
{
"epoch": 0.048352620328738855,
"eval_1_ratio_diff": -0.05455962587685115,
"eval_accuracy": 0.8487918939984411,
"eval_f1": 0.8399339933993399,
"eval_loss": 0.5940015316009521,
"eval_precision": 0.8914185639229422,
"eval_recall": 0.7940717628705148,
"eval_runtime": 1441.517,
"eval_samples_per_second": 0.89,
"eval_steps_per_second": 0.445,
"step": 656
},
{
"epoch": 0.048500036854131345,
"grad_norm": 35.70323181152344,
"learning_rate": 0.00012127299646242624,
"loss": 0.6816,
"step": 658
},
{
"epoch": 0.04864745337952384,
"grad_norm": 1.4870625734329224,
"learning_rate": 0.00012127292950254218,
"loss": 0.1488,
"step": 660
},
{
"epoch": 0.04879486990491634,
"grad_norm": 0.6423426866531372,
"learning_rate": 0.00012127286233946625,
"loss": 0.0136,
"step": 662
},
{
"epoch": 0.04894228643030884,
"grad_norm": 0.3320056200027466,
"learning_rate": 0.00012127279497319864,
"loss": 0.0058,
"step": 664
},
{
"epoch": 0.049089702955701336,
"grad_norm": 4.33368444442749,
"learning_rate": 0.00012127272740373959,
"loss": 0.5196,
"step": 666
},
{
"epoch": 0.04923711948109383,
"grad_norm": 71.66387939453125,
"learning_rate": 0.00012127265963108935,
"loss": 2.7961,
"step": 668
},
{
"epoch": 0.049384536006486325,
"grad_norm": 94.96151733398438,
"learning_rate": 0.00012127259165524814,
"loss": 3.8152,
"step": 670
},
{
"epoch": 0.04953195253187882,
"grad_norm": 39.40300369262695,
"learning_rate": 0.00012127252347621616,
"loss": 1.1659,
"step": 672
},
{
"epoch": 0.04953195253187882,
"eval_1_ratio_diff": -0.11301636788776309,
"eval_accuracy": 0.779423226812159,
"eval_f1": 0.751099384344767,
"eval_loss": 1.093988060951233,
"eval_precision": 0.8608870967741935,
"eval_recall": 0.6661466458658346,
"eval_runtime": 1441.6444,
"eval_samples_per_second": 0.89,
"eval_steps_per_second": 0.445,
"step": 672
},
{
"epoch": 0.04967936905727132,
"grad_norm": 142.39564514160156,
"learning_rate": 0.00012127245509399365,
"loss": 1.9772,
"step": 674
},
{
"epoch": 0.04982678558266382,
"grad_norm": 0.43099793791770935,
"learning_rate": 0.00012127238650858088,
"loss": 0.0056,
"step": 676
},
{
"epoch": 0.049974202108056316,
"grad_norm": 0.22017613053321838,
"learning_rate": 0.00012127231771997801,
"loss": 0.0026,
"step": 678
},
{
"epoch": 0.05012161863344881,
"grad_norm": 0.06024312227964401,
"learning_rate": 0.00012127224872818532,
"loss": 1.4556,
"step": 680
},
{
"epoch": 0.050269035158841305,
"grad_norm": 30.382848739624023,
"learning_rate": 0.00012127217953320302,
"loss": 0.959,
"step": 682
},
{
"epoch": 0.0504164516842338,
"grad_norm": 0.12178266048431396,
"learning_rate": 0.00012127211013503136,
"loss": 0.0025,
"step": 684
},
{
"epoch": 0.0505638682096263,
"grad_norm": 0.2670276165008545,
"learning_rate": 0.00012127204053367056,
"loss": 0.0059,
"step": 686
},
{
"epoch": 0.0507112847350188,
"grad_norm": 0.7420686483383179,
"learning_rate": 0.00012127197072912085,
"loss": 0.0205,
"step": 688
},
{
"epoch": 0.0507112847350188,
"eval_1_ratio_diff": -0.05300077942322684,
"eval_accuracy": 0.828526890101325,
"eval_f1": 0.8187808896210873,
"eval_loss": 0.5867729783058167,
"eval_precision": 0.8673647469458988,
"eval_recall": 0.7753510140405616,
"eval_runtime": 1441.426,
"eval_samples_per_second": 0.89,
"eval_steps_per_second": 0.445,
"step": 688
},
{
"epoch": 0.05085870126041129,
"grad_norm": 69.81874084472656,
"learning_rate": 0.00012127190072138247,
"loss": 0.853,
"step": 690
},
{
"epoch": 0.05100611778580379,
"grad_norm": 0.51251220703125,
"learning_rate": 0.00012127183051045567,
"loss": 0.037,
"step": 692
},
{
"epoch": 0.051153534311196285,
"grad_norm": 32.83553695678711,
"learning_rate": 0.00012127176009634066,
"loss": 1.7711,
"step": 694
},
{
"epoch": 0.05130095083658878,
"grad_norm": 0.029091738164424896,
"learning_rate": 0.00012127168947903768,
"loss": 0.0006,
"step": 696
},
{
"epoch": 0.05144836736198128,
"grad_norm": 59.94422912597656,
"learning_rate": 0.00012127161865854698,
"loss": 1.6607,
"step": 698
},
{
"epoch": 0.05159578388737378,
"grad_norm": 60.350067138671875,
"learning_rate": 0.00012127154763486877,
"loss": 2.055,
"step": 700
},
{
"epoch": 0.05174320041276627,
"grad_norm": 0.08221148699522018,
"learning_rate": 0.00012127147640800332,
"loss": 1.6475,
"step": 702
},
{
"epoch": 0.051890616938158766,
"grad_norm": 39.905357360839844,
"learning_rate": 0.00012127140497795086,
"loss": 1.2104,
"step": 704
},
{
"epoch": 0.051890616938158766,
"eval_1_ratio_diff": -0.0919719407638348,
"eval_accuracy": 0.8316445830085737,
"eval_f1": 0.8144329896907216,
"eval_loss": 0.7349568605422974,
"eval_precision": 0.9063097514340345,
"eval_recall": 0.7394695787831513,
"eval_runtime": 1441.7333,
"eval_samples_per_second": 0.89,
"eval_steps_per_second": 0.445,
"step": 704
},
{
"epoch": 0.052038033463551264,
"grad_norm": 4.226317882537842,
"learning_rate": 0.00012127133334471161,
"loss": 0.4275,
"step": 706
},
{
"epoch": 0.05218544998894376,
"grad_norm": 0.05035168305039406,
"learning_rate": 0.00012127126150828585,
"loss": 1.3166,
"step": 708
},
{
"epoch": 0.05233286651433626,
"grad_norm": 0.25760674476623535,
"learning_rate": 0.00012127118946867378,
"loss": 0.0081,
"step": 710
},
{
"epoch": 0.05248028303972875,
"grad_norm": 36.74332809448242,
"learning_rate": 0.00012127111722587565,
"loss": 1.1506,
"step": 712
},
{
"epoch": 0.05262769956512125,
"grad_norm": 36.16116714477539,
"learning_rate": 0.00012127104477989172,
"loss": 1.2632,
"step": 714
},
{
"epoch": 0.052775116090513746,
"grad_norm": 37.083343505859375,
"learning_rate": 0.00012127097213072223,
"loss": 1.8408,
"step": 716
},
{
"epoch": 0.052922532615906244,
"grad_norm": 3.0497827529907227,
"learning_rate": 0.0001212708992783674,
"loss": 0.0247,
"step": 718
},
{
"epoch": 0.05306994914129874,
"grad_norm": 4.117802619934082,
"learning_rate": 0.00012127082622282751,
"loss": 0.0342,
"step": 720
},
{
"epoch": 0.05306994914129874,
"eval_1_ratio_diff": -0.08183943881527672,
"eval_accuracy": 0.8121590023382697,
"eval_f1": 0.7952421410365336,
"eval_loss": 0.5786097645759583,
"eval_precision": 0.8731343283582089,
"eval_recall": 0.7301092043681747,
"eval_runtime": 1442.0276,
"eval_samples_per_second": 0.89,
"eval_steps_per_second": 0.445,
"step": 720
},
{
"epoch": 0.05321736566669124,
"grad_norm": 0.633588969707489,
"learning_rate": 0.00012127075296410277,
"loss": 0.0056,
"step": 722
},
{
"epoch": 0.05336478219208373,
"grad_norm": 36.505218505859375,
"learning_rate": 0.00012127067950219344,
"loss": 0.7263,
"step": 724
},
{
"epoch": 0.05351219871747623,
"grad_norm": 0.6716632843017578,
"learning_rate": 0.00012127060583709976,
"loss": 0.0045,
"step": 726
},
{
"epoch": 0.053659615242868726,
"grad_norm": 36.19940948486328,
"learning_rate": 0.000121270531968822,
"loss": 0.377,
"step": 728
},
{
"epoch": 0.053807031768261224,
"grad_norm": 81.25736236572266,
"learning_rate": 0.00012127045789736038,
"loss": 0.6006,
"step": 730
},
{
"epoch": 0.05395444829365372,
"grad_norm": 29.044986724853516,
"learning_rate": 0.00012127038362271517,
"loss": 1.1609,
"step": 732
},
{
"epoch": 0.05410186481904621,
"grad_norm": 0.1593562811613083,
"learning_rate": 0.0001212703091448866,
"loss": 0.0055,
"step": 734
},
{
"epoch": 0.05424928134443871,
"grad_norm": 7.988092422485352,
"learning_rate": 0.00012127023446387492,
"loss": 0.0238,
"step": 736
},
{
"epoch": 0.05424928134443871,
"eval_1_ratio_diff": -0.07092751363990646,
"eval_accuracy": 0.838659392049883,
"eval_f1": 0.8261964735516373,
"eval_loss": 0.6887457370758057,
"eval_precision": 0.8945454545454545,
"eval_recall": 0.7675507020280812,
"eval_runtime": 1441.3319,
"eval_samples_per_second": 0.89,
"eval_steps_per_second": 0.445,
"step": 736
},
{
"epoch": 0.05439669786983121,
"grad_norm": 58.56552505493164,
"learning_rate": 0.00012127015957968041,
"loss": 2.3194,
"step": 738
},
{
"epoch": 0.054544114395223706,
"grad_norm": 0.37152421474456787,
"learning_rate": 0.00012127008449230329,
"loss": 0.0029,
"step": 740
},
{
"epoch": 0.0546915309206162,
"grad_norm": 33.52932357788086,
"learning_rate": 0.00012127000920174381,
"loss": 1.1549,
"step": 742
},
{
"epoch": 0.054838947446008694,
"grad_norm": 0.02616913430392742,
"learning_rate": 0.00012126993370800224,
"loss": 0.0021,
"step": 744
},
{
"epoch": 0.05498636397140119,
"grad_norm": 36.83317565917969,
"learning_rate": 0.00012126985801107882,
"loss": 1.2016,
"step": 746
},
{
"epoch": 0.05513378049679369,
"grad_norm": 0.006011671852320433,
"learning_rate": 0.00012126978211097381,
"loss": 2.834,
"step": 748
},
{
"epoch": 0.05528119702218619,
"grad_norm": 58.966102600097656,
"learning_rate": 0.00012126970600768747,
"loss": 2.0661,
"step": 750
},
{
"epoch": 0.055428613547578685,
"grad_norm": 57.80133819580078,
"learning_rate": 0.00012126962970122005,
"loss": 1.2417,
"step": 752
},
{
"epoch": 0.055428613547578685,
"eval_1_ratio_diff": 0.09508963367108336,
"eval_accuracy": 0.8503507404520655,
"eval_f1": 0.8632478632478633,
"eval_loss": 0.7459388375282288,
"eval_precision": 0.7942332896461337,
"eval_recall": 0.9453978159126365,
"eval_runtime": 1440.976,
"eval_samples_per_second": 0.89,
"eval_steps_per_second": 0.446,
"step": 752
},
{
"epoch": 0.05557603007297118,
"grad_norm": 0.10538947582244873,
"learning_rate": 0.00012126955319157181,
"loss": 1.5568,
"step": 754
},
{
"epoch": 0.055723446598363674,
"grad_norm": 0.3577294647693634,
"learning_rate": 0.000121269476478743,
"loss": 1.3633,
"step": 756
},
{
"epoch": 0.05587086312375617,
"grad_norm": 111.04033660888672,
"learning_rate": 0.00012126939956273387,
"loss": 1.5691,
"step": 758
},
{
"epoch": 0.05601827964914867,
"grad_norm": 8.450987815856934,
"learning_rate": 0.00012126932244354469,
"loss": 0.6036,
"step": 760
},
{
"epoch": 0.05616569617454117,
"grad_norm": 6.646569728851318,
"learning_rate": 0.00012126924512117572,
"loss": 0.0554,
"step": 762
},
{
"epoch": 0.056313112699933665,
"grad_norm": 10.05777359008789,
"learning_rate": 0.00012126916759562719,
"loss": 0.0507,
"step": 764
},
{
"epoch": 0.056460529225326156,
"grad_norm": 1.5429670810699463,
"learning_rate": 0.00012126908986689941,
"loss": 0.5476,
"step": 766
},
{
"epoch": 0.056607945750718654,
"grad_norm": 0.7471988201141357,
"learning_rate": 0.0001212690119349926,
"loss": 0.0357,
"step": 768
},
{
"epoch": 0.056607945750718654,
"eval_1_ratio_diff": -0.03273577552611068,
"eval_accuracy": 0.8534684333593141,
"eval_f1": 0.8483870967741935,
"eval_loss": 0.5041674971580505,
"eval_precision": 0.8781302170283807,
"eval_recall": 0.8205928237129485,
"eval_runtime": 1441.5634,
"eval_samples_per_second": 0.89,
"eval_steps_per_second": 0.445,
"step": 768
},
{
"epoch": 0.05675536227611115,
"grad_norm": 25.423622131347656,
"learning_rate": 0.00012126893379990705,
"loss": 0.0991,
"step": 770
},
{
"epoch": 0.05690277880150365,
"grad_norm": 8.131854057312012,
"learning_rate": 0.00012126885546164299,
"loss": 0.0467,
"step": 772
},
{
"epoch": 0.05705019532689615,
"grad_norm": 0.7007619738578796,
"learning_rate": 0.00012126877692020069,
"loss": 0.0319,
"step": 774
},
{
"epoch": 0.057197611852288645,
"grad_norm": 0.0242279302328825,
"learning_rate": 0.00012126869817558045,
"loss": 0.5106,
"step": 776
},
{
"epoch": 0.057345028377681136,
"grad_norm": 1.126301646232605,
"learning_rate": 0.00012126861922778249,
"loss": 0.0068,
"step": 778
},
{
"epoch": 0.05749244490307363,
"grad_norm": 2.2255496978759766,
"learning_rate": 0.0001212685400768071,
"loss": 0.0125,
"step": 780
},
{
"epoch": 0.05763986142846613,
"grad_norm": 53.08203125,
"learning_rate": 0.00012126846072265453,
"loss": 3.4784,
"step": 782
},
{
"epoch": 0.05778727795385863,
"grad_norm": 53.75185012817383,
"learning_rate": 0.00012126838116532506,
"loss": 5.3382,
"step": 784
},
{
"epoch": 0.05778727795385863,
"eval_1_ratio_diff": 0.024162120031176904,
"eval_accuracy": 0.8074824629773967,
"eval_f1": 0.8118811881188119,
"eval_loss": 1.1864495277404785,
"eval_precision": 0.7931547619047619,
"eval_recall": 0.8315132605304212,
"eval_runtime": 1441.6478,
"eval_samples_per_second": 0.89,
"eval_steps_per_second": 0.445,
"step": 784
},
{
"epoch": 0.05793469447925113,
"grad_norm": 20.95121955871582,
"learning_rate": 0.00012126830140481893,
"loss": 3.3432,
"step": 786
},
{
"epoch": 0.05808211100464362,
"grad_norm": 49.42118453979492,
"learning_rate": 0.00012126822144113646,
"loss": 0.762,
"step": 788
},
{
"epoch": 0.058229527530036115,
"grad_norm": 16.03618812561035,
"learning_rate": 0.00012126814127427784,
"loss": 0.1045,
"step": 790
},
{
"epoch": 0.05837694405542861,
"grad_norm": 34.1168212890625,
"learning_rate": 0.00012126806090424342,
"loss": 3.1091,
"step": 792
},
{
"epoch": 0.05852436058082111,
"grad_norm": 34.757083892822266,
"learning_rate": 0.00012126798033103342,
"loss": 2.0632,
"step": 794
},
{
"epoch": 0.05867177710621361,
"grad_norm": 1.412405014038086,
"learning_rate": 0.00012126789955464813,
"loss": 0.2568,
"step": 796
},
{
"epoch": 0.058819193631606106,
"grad_norm": 55.76416015625,
"learning_rate": 0.00012126781857508779,
"loss": 0.497,
"step": 798
},
{
"epoch": 0.0589666101569986,
"grad_norm": 0.3345389664173126,
"learning_rate": 0.00012126773739235272,
"loss": 0.0121,
"step": 800
},
{
"epoch": 0.0589666101569986,
"eval_1_ratio_diff": 0.06703039750584561,
"eval_accuracy": 0.8363211223694466,
"eval_f1": 0.8464912280701754,
"eval_loss": 0.7451047897338867,
"eval_precision": 0.796423658872077,
"eval_recall": 0.9032761310452418,
"eval_runtime": 1440.5179,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 800
},
{
"epoch": 0.059114026682391095,
"grad_norm": 0.15297777950763702,
"learning_rate": 0.00012126765600644314,
"loss": 0.0082,
"step": 802
},
{
"epoch": 0.05926144320778359,
"grad_norm": 0.19080302119255066,
"learning_rate": 0.00012126757441735937,
"loss": 0.0026,
"step": 804
},
{
"epoch": 0.05940885973317609,
"grad_norm": 0.43317776918411255,
"learning_rate": 0.00012126749262510164,
"loss": 0.0055,
"step": 806
},
{
"epoch": 0.05955627625856859,
"grad_norm": 6.4003984334704e-06,
"learning_rate": 0.00012126741062967027,
"loss": 0.0031,
"step": 808
},
{
"epoch": 0.05970369278396108,
"grad_norm": 0.03450751677155495,
"learning_rate": 0.00012126732843106551,
"loss": 0.0052,
"step": 810
},
{
"epoch": 0.05985110930935358,
"grad_norm": 52.02117156982422,
"learning_rate": 0.00012126724602928764,
"loss": 5.0919,
"step": 812
},
{
"epoch": 0.059998525834746075,
"grad_norm": 50.249900817871094,
"learning_rate": 0.00012126716342433692,
"loss": 4.0749,
"step": 814
},
{
"epoch": 0.06014594236013857,
"grad_norm": 0.0067368014715611935,
"learning_rate": 0.00012126708061621366,
"loss": 0.0001,
"step": 816
},
{
"epoch": 0.06014594236013857,
"eval_1_ratio_diff": 0.06469212782540923,
"eval_accuracy": 0.8651597817614964,
"eval_f1": 0.8732600732600733,
"eval_loss": 0.9449532628059387,
"eval_precision": 0.8232044198895028,
"eval_recall": 0.9297971918876755,
"eval_runtime": 1440.6727,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 816
},
{
"epoch": 0.06029335888553107,
"grad_norm": 44.037471771240234,
"learning_rate": 0.00012126699760491808,
"loss": 2.1184,
"step": 818
},
{
"epoch": 0.06044077541092356,
"grad_norm": 31.20966148376465,
"learning_rate": 0.00012126691439045052,
"loss": 2.3532,
"step": 820
},
{
"epoch": 0.06058819193631606,
"grad_norm": 1.108382225036621,
"learning_rate": 0.00012126683097281125,
"loss": 0.0093,
"step": 822
},
{
"epoch": 0.06073560846170856,
"grad_norm": 1.2753050327301025,
"learning_rate": 0.0001212667473520005,
"loss": 0.011,
"step": 824
},
{
"epoch": 0.060883024987101055,
"grad_norm": 4.512105941772461,
"learning_rate": 0.00012126666352801861,
"loss": 0.0212,
"step": 826
},
{
"epoch": 0.06103044151249355,
"grad_norm": 0.3488874137401581,
"learning_rate": 0.00012126657950086582,
"loss": 1.2435,
"step": 828
},
{
"epoch": 0.06117785803788605,
"grad_norm": 0.11297665536403656,
"learning_rate": 0.00012126649527054243,
"loss": 0.0027,
"step": 830
},
{
"epoch": 0.06132527456327854,
"grad_norm": 0.07631942629814148,
"learning_rate": 0.00012126641083704874,
"loss": 0.0032,
"step": 832
},
{
"epoch": 0.06132527456327854,
"eval_1_ratio_diff": 0.08885424785658613,
"eval_accuracy": 0.8799688230709275,
"eval_f1": 0.8896848137535817,
"eval_loss": 0.7254036068916321,
"eval_precision": 0.8225165562913908,
"eval_recall": 0.968798751950078,
"eval_runtime": 1440.5593,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 832
},
{
"epoch": 0.06147269108867104,
"grad_norm": 41.37874984741211,
"learning_rate": 0.00012126632620038498,
"loss": 1.3108,
"step": 834
},
{
"epoch": 0.061620107614063536,
"grad_norm": 55.71513366699219,
"learning_rate": 0.00012126624136055149,
"loss": 1.7068,
"step": 836
},
{
"epoch": 0.061767524139456034,
"grad_norm": 1.5174663066864014,
"learning_rate": 0.0001212661563175485,
"loss": 0.0173,
"step": 838
},
{
"epoch": 0.06191494066484853,
"grad_norm": 0.10353035479784012,
"learning_rate": 0.00012126607107137636,
"loss": 1.2081,
"step": 840
},
{
"epoch": 0.06206235719024102,
"grad_norm": 0.05997217819094658,
"learning_rate": 0.00012126598562203531,
"loss": 1.8296,
"step": 842
},
{
"epoch": 0.06220977371563352,
"grad_norm": 0.17887941002845764,
"learning_rate": 0.00012126589996952563,
"loss": 0.0016,
"step": 844
},
{
"epoch": 0.06235719024102602,
"grad_norm": 0.08932141214609146,
"learning_rate": 0.00012126581411384764,
"loss": 1.5849,
"step": 846
},
{
"epoch": 0.06250460676641852,
"grad_norm": 41.82356643676758,
"learning_rate": 0.0001212657280550016,
"loss": 1.2425,
"step": 848
},
{
"epoch": 0.06250460676641852,
"eval_1_ratio_diff": 0.09586905689789549,
"eval_accuracy": 0.8542478565861262,
"eval_f1": 0.8669039145907473,
"eval_loss": 0.6706948280334473,
"eval_precision": 0.7971204188481675,
"eval_recall": 0.9500780031201248,
"eval_runtime": 1441.8264,
"eval_samples_per_second": 0.89,
"eval_steps_per_second": 0.445,
"step": 848
},
{
"epoch": 0.06265202329181101,
"grad_norm": 45.45724105834961,
"learning_rate": 0.00012126564179298783,
"loss": 0.7189,
"step": 850
},
{
"epoch": 0.06279943981720351,
"grad_norm": 1.7170765399932861,
"learning_rate": 0.00012126555532780658,
"loss": 0.014,
"step": 852
},
{
"epoch": 0.062946856342596,
"grad_norm": 1.8296376466751099,
"learning_rate": 0.00012126546865945818,
"loss": 0.0249,
"step": 854
},
{
"epoch": 0.06309427286798851,
"grad_norm": 28.392093658447266,
"learning_rate": 0.00012126538178794288,
"loss": 2.1082,
"step": 856
},
{
"epoch": 0.063241689393381,
"grad_norm": 0.024955546483397484,
"learning_rate": 0.00012126529471326101,
"loss": 0.0026,
"step": 858
},
{
"epoch": 0.06338910591877349,
"grad_norm": 29.141136169433594,
"learning_rate": 0.00012126520743541283,
"loss": 1.5827,
"step": 860
},
{
"epoch": 0.063536522444166,
"grad_norm": 0.3031620383262634,
"learning_rate": 0.00012126511995439865,
"loss": 1.4029,
"step": 862
},
{
"epoch": 0.06368393896955848,
"grad_norm": 0.2821040451526642,
"learning_rate": 0.00012126503227021874,
"loss": 1.3931,
"step": 864
},
{
"epoch": 0.06368393896955848,
"eval_1_ratio_diff": -0.04053000779423227,
"eval_accuracy": 0.8581449727201871,
"eval_f1": 0.8520325203252033,
"eval_loss": 0.5900216102600098,
"eval_precision": 0.8896434634974533,
"eval_recall": 0.8174726989079563,
"eval_runtime": 1439.3591,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 864
},
{
"epoch": 0.06383135549495099,
"grad_norm": 4.8987860679626465,
"learning_rate": 0.00012126494438287343,
"loss": 0.0265,
"step": 866
},
{
"epoch": 0.06397877202034348,
"grad_norm": 0.27837908267974854,
"learning_rate": 0.000121264856292363,
"loss": 0.0093,
"step": 868
},
{
"epoch": 0.06412618854573597,
"grad_norm": 0.5379538536071777,
"learning_rate": 0.00012126476799868773,
"loss": 0.0095,
"step": 870
},
{
"epoch": 0.06427360507112848,
"grad_norm": 23.87804412841797,
"learning_rate": 0.00012126467950184793,
"loss": 0.8342,
"step": 872
},
{
"epoch": 0.06442102159652097,
"grad_norm": 1.2284973859786987,
"learning_rate": 0.0001212645908018439,
"loss": 0.0162,
"step": 874
},
{
"epoch": 0.06456843812191347,
"grad_norm": 36.555442810058594,
"learning_rate": 0.00012126450189867592,
"loss": 2.2561,
"step": 876
},
{
"epoch": 0.06471585464730596,
"grad_norm": 24.54311180114746,
"learning_rate": 0.00012126441279234432,
"loss": 3.1743,
"step": 878
},
{
"epoch": 0.06486327117269845,
"grad_norm": 0.13615825772285461,
"learning_rate": 0.00012126432348284936,
"loss": 0.0021,
"step": 880
},
{
"epoch": 0.06486327117269845,
"eval_1_ratio_diff": -0.2899454403741232,
"eval_accuracy": 0.6975837879968823,
"eval_f1": 0.5736263736263736,
"eval_loss": 1.3224732875823975,
"eval_precision": 0.9702602230483272,
"eval_recall": 0.40717628705148207,
"eval_runtime": 1440.3311,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 880
},
{
"epoch": 0.06501068769809096,
"grad_norm": 3.8478543758392334,
"learning_rate": 0.00012126423397019136,
"loss": 0.021,
"step": 882
},
{
"epoch": 0.06515810422348345,
"grad_norm": 0.08823257684707642,
"learning_rate": 0.00012126414425437062,
"loss": 0.0016,
"step": 884
},
{
"epoch": 0.06530552074887595,
"grad_norm": 27.02589988708496,
"learning_rate": 0.00012126405433538744,
"loss": 2.9462,
"step": 886
},
{
"epoch": 0.06545293727426844,
"grad_norm": 24.244503021240234,
"learning_rate": 0.00012126396421324212,
"loss": 0.8423,
"step": 888
},
{
"epoch": 0.06560035379966095,
"grad_norm": 0.3652421236038208,
"learning_rate": 0.00012126387388793495,
"loss": 0.0081,
"step": 890
},
{
"epoch": 0.06574777032505344,
"grad_norm": 22.919225692749023,
"learning_rate": 0.00012126378335946625,
"loss": 1.1268,
"step": 892
},
{
"epoch": 0.06589518685044593,
"grad_norm": 0.18866397440433502,
"learning_rate": 0.00012126369262783633,
"loss": 1.8645,
"step": 894
},
{
"epoch": 0.06604260337583844,
"grad_norm": 2.1540791988372803,
"learning_rate": 0.00012126360169304547,
"loss": 0.036,
"step": 896
},
{
"epoch": 0.06604260337583844,
"eval_1_ratio_diff": 0.017147310989867437,
"eval_accuracy": 0.8862042088854248,
"eval_f1": 0.8880368098159509,
"eval_loss": 0.49060943722724915,
"eval_precision": 0.8733031674208145,
"eval_recall": 0.9032761310452418,
"eval_runtime": 1439.5517,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 896
},
{
"epoch": 0.06619001990123093,
"grad_norm": 0.7136353850364685,
"learning_rate": 0.00012126351055509399,
"loss": 1.4136,
"step": 898
},
{
"epoch": 0.06633743642662343,
"grad_norm": 1.6063231229782104,
"learning_rate": 0.00012126341921398221,
"loss": 0.0358,
"step": 900
},
{
"epoch": 0.06648485295201592,
"grad_norm": 4.673253536224365,
"learning_rate": 0.00012126332766971038,
"loss": 0.0494,
"step": 902
},
{
"epoch": 0.06663226947740841,
"grad_norm": 0.21607956290245056,
"learning_rate": 0.00012126323592227886,
"loss": 0.0053,
"step": 904
},
{
"epoch": 0.06677968600280092,
"grad_norm": 32.70335006713867,
"learning_rate": 0.00012126314397168796,
"loss": 1.5106,
"step": 906
},
{
"epoch": 0.06692710252819341,
"grad_norm": 38.56415557861328,
"learning_rate": 0.00012126305181793794,
"loss": 0.8798,
"step": 908
},
{
"epoch": 0.06707451905358591,
"grad_norm": 0.018692007288336754,
"learning_rate": 0.00012126295946102917,
"loss": 0.0004,
"step": 910
},
{
"epoch": 0.0672219355789784,
"grad_norm": 22.49344825744629,
"learning_rate": 0.00012126286690096191,
"loss": 0.9364,
"step": 912
},
{
"epoch": 0.0672219355789784,
"eval_1_ratio_diff": -0.18706157443491817,
"eval_accuracy": 0.7802026500389712,
"eval_f1": 0.7293666026871402,
"eval_loss": 1.4581658840179443,
"eval_precision": 0.9476309226932669,
"eval_recall": 0.592823712948518,
"eval_runtime": 1440.0947,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 912
},
{
"epoch": 0.0673693521043709,
"grad_norm": 182.47938537597656,
"learning_rate": 0.00012126277413773649,
"loss": 1.0293,
"step": 914
},
{
"epoch": 0.0675167686297634,
"grad_norm": 4.0591816902160645,
"learning_rate": 0.00012126268117135323,
"loss": 0.0308,
"step": 916
},
{
"epoch": 0.06766418515515589,
"grad_norm": 6.380730628967285,
"learning_rate": 0.00012126258800181242,
"loss": 1.2327,
"step": 918
},
{
"epoch": 0.0678116016805484,
"grad_norm": 31.462488174438477,
"learning_rate": 0.00012126249462911438,
"loss": 0.8761,
"step": 920
},
{
"epoch": 0.06795901820594089,
"grad_norm": 0.0057801539078354836,
"learning_rate": 0.00012126240105325944,
"loss": 0.0077,
"step": 922
},
{
"epoch": 0.06810643473133338,
"grad_norm": 1.9855010509490967,
"learning_rate": 0.0001212623072742479,
"loss": 0.0239,
"step": 924
},
{
"epoch": 0.06825385125672588,
"grad_norm": 0.4254453480243683,
"learning_rate": 0.00012126221329208006,
"loss": 0.0032,
"step": 926
},
{
"epoch": 0.06840126778211837,
"grad_norm": 0.502257227897644,
"learning_rate": 0.00012126211910675626,
"loss": 0.0103,
"step": 928
},
{
"epoch": 0.06840126778211837,
"eval_1_ratio_diff": 0.026500389711613392,
"eval_accuracy": 0.8768511301636789,
"eval_f1": 0.8799392097264438,
"eval_loss": 0.7347307205200195,
"eval_precision": 0.8577777777777778,
"eval_recall": 0.9032761310452418,
"eval_runtime": 1440.0126,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 928
},
{
"epoch": 0.06854868430751088,
"grad_norm": 0.6557896733283997,
"learning_rate": 0.00012126202471827679,
"loss": 0.003,
"step": 930
},
{
"epoch": 0.06869610083290337,
"grad_norm": 0.020085789263248444,
"learning_rate": 0.00012126193012664201,
"loss": 0.0004,
"step": 932
},
{
"epoch": 0.06884351735829586,
"grad_norm": 0.006013574078679085,
"learning_rate": 0.00012126183533185218,
"loss": 0.0001,
"step": 934
},
{
"epoch": 0.06899093388368836,
"grad_norm": 252.07472229003906,
"learning_rate": 0.00012126174033390767,
"loss": 0.8075,
"step": 936
},
{
"epoch": 0.06913835040908085,
"grad_norm": 0.002460025018081069,
"learning_rate": 0.00012126164513280875,
"loss": 0.0011,
"step": 938
},
{
"epoch": 0.06928576693447336,
"grad_norm": 0.01136123575270176,
"learning_rate": 0.00012126154972855578,
"loss": 0.0003,
"step": 940
},
{
"epoch": 0.06943318345986585,
"grad_norm": 37.112640380859375,
"learning_rate": 0.00012126145412114907,
"loss": 2.9468,
"step": 942
},
{
"epoch": 0.06958059998525835,
"grad_norm": 0.006933971308171749,
"learning_rate": 0.00012126135831058891,
"loss": 0.0001,
"step": 944
},
{
"epoch": 0.06958059998525835,
"eval_1_ratio_diff": 0.2478565861262666,
"eval_accuracy": 0.7443491816056118,
"eval_f1": 0.795,
"eval_loss": 2.4577670097351074,
"eval_precision": 0.6631908237747653,
"eval_recall": 0.9921996879875195,
"eval_runtime": 1439.5508,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 944
},
{
"epoch": 0.06972801651065084,
"grad_norm": 36.87862777709961,
"learning_rate": 0.00012126126229687566,
"loss": 5.0295,
"step": 946
},
{
"epoch": 0.06987543303604334,
"grad_norm": 70.17023468017578,
"learning_rate": 0.00012126116608000961,
"loss": 4.0308,
"step": 948
},
{
"epoch": 0.07002284956143584,
"grad_norm": 37.03538513183594,
"learning_rate": 0.00012126106965999112,
"loss": 1.8733,
"step": 950
},
{
"epoch": 0.07017026608682833,
"grad_norm": 66.47712707519531,
"learning_rate": 0.00012126097303682048,
"loss": 4.2016,
"step": 952
},
{
"epoch": 0.07031768261222084,
"grad_norm": 29.390884399414062,
"learning_rate": 0.00012126087621049803,
"loss": 1.9788,
"step": 954
},
{
"epoch": 0.07046509913761333,
"grad_norm": 10.997523307800293,
"learning_rate": 0.00012126077918102409,
"loss": 0.1381,
"step": 956
},
{
"epoch": 0.07061251566300582,
"grad_norm": 38.46750259399414,
"learning_rate": 0.00012126068194839898,
"loss": 0.8822,
"step": 958
},
{
"epoch": 0.07075993218839832,
"grad_norm": 18.62594985961914,
"learning_rate": 0.00012126058451262304,
"loss": 0.3758,
"step": 960
},
{
"epoch": 0.07075993218839832,
"eval_1_ratio_diff": -0.4505066250974279,
"eval_accuracy": 0.5494933749025721,
"eval_f1": 0.17897727272727273,
"eval_loss": 0.5460181832313538,
"eval_precision": 1.0,
"eval_recall": 0.09828393135725429,
"eval_runtime": 1440.4539,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 960
},
{
"epoch": 0.07090734871379081,
"grad_norm": 21.68712615966797,
"learning_rate": 0.00012126048687369658,
"loss": 0.3891,
"step": 962
},
{
"epoch": 0.07105476523918332,
"grad_norm": 7.1598124504089355,
"learning_rate": 0.00012126038903161995,
"loss": 0.3555,
"step": 964
},
{
"epoch": 0.07120218176457581,
"grad_norm": 28.80471420288086,
"learning_rate": 0.00012126029098639344,
"loss": 0.9078,
"step": 966
},
{
"epoch": 0.0713495982899683,
"grad_norm": 18.606401443481445,
"learning_rate": 0.00012126019273801743,
"loss": 0.2927,
"step": 968
},
{
"epoch": 0.0714970148153608,
"grad_norm": 21.51089859008789,
"learning_rate": 0.0001212600942864922,
"loss": 0.6348,
"step": 970
},
{
"epoch": 0.0716444313407533,
"grad_norm": 4.713807582855225,
"learning_rate": 0.00012125999563181809,
"loss": 0.5351,
"step": 972
},
{
"epoch": 0.0717918478661458,
"grad_norm": 11.428181648254395,
"learning_rate": 0.00012125989677399546,
"loss": 0.2465,
"step": 974
},
{
"epoch": 0.07193926439153829,
"grad_norm": 13.697668075561523,
"learning_rate": 0.00012125979771302464,
"loss": 0.1411,
"step": 976
},
{
"epoch": 0.07193926439153829,
"eval_1_ratio_diff": 0.03897116134060796,
"eval_accuracy": 0.8846453624318005,
"eval_f1": 0.8888888888888888,
"eval_loss": 0.38700371980667114,
"eval_precision": 0.8567293777134588,
"eval_recall": 0.9235569422776911,
"eval_runtime": 1440.5564,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 976
},
{
"epoch": 0.07208668091693078,
"grad_norm": 3.2123868465423584,
"learning_rate": 0.0001212596984489059,
"loss": 0.0295,
"step": 978
},
{
"epoch": 0.07223409744232329,
"grad_norm": 0.026355383917689323,
"learning_rate": 0.00012125959898163965,
"loss": 0.0005,
"step": 980
},
{
"epoch": 0.07238151396771578,
"grad_norm": 0.10228274017572403,
"learning_rate": 0.00012125949931122618,
"loss": 0.002,
"step": 982
},
{
"epoch": 0.07252893049310828,
"grad_norm": 0.031520161777734756,
"learning_rate": 0.00012125939943766583,
"loss": 0.0008,
"step": 984
},
{
"epoch": 0.07267634701850077,
"grad_norm": 0.1047026515007019,
"learning_rate": 0.00012125929936095894,
"loss": 0.0009,
"step": 986
},
{
"epoch": 0.07282376354389328,
"grad_norm": 30.88459587097168,
"learning_rate": 0.00012125919908110585,
"loss": 2.267,
"step": 988
},
{
"epoch": 0.07297118006928577,
"grad_norm": 0.029362376779317856,
"learning_rate": 0.0001212590985981069,
"loss": 0.0003,
"step": 990
},
{
"epoch": 0.07311859659467826,
"grad_norm": 0.2791018784046173,
"learning_rate": 0.0001212589979119624,
"loss": 0.0017,
"step": 992
},
{
"epoch": 0.07311859659467826,
"eval_1_ratio_diff": 0.05378020265003891,
"eval_accuracy": 0.877630553390491,
"eval_f1": 0.8837897853441895,
"eval_loss": 0.7231972813606262,
"eval_precision": 0.8408450704225352,
"eval_recall": 0.9313572542901716,
"eval_runtime": 1440.0578,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 992
},
{
"epoch": 0.07326601312007076,
"grad_norm": 0.056903205811977386,
"learning_rate": 0.00012125889702267272,
"loss": 0.0007,
"step": 994
},
{
"epoch": 0.07341342964546325,
"grad_norm": 0.015094200149178505,
"learning_rate": 0.00012125879593023818,
"loss": 0.0002,
"step": 996
},
{
"epoch": 0.07356084617085576,
"grad_norm": 0.6008047461509705,
"learning_rate": 0.00012125869463465912,
"loss": 0.0045,
"step": 998
},
{
"epoch": 0.07370826269624825,
"grad_norm": 0.6626961827278137,
"learning_rate": 0.00012125859313593587,
"loss": 0.004,
"step": 1000
},
{
"epoch": 0.07385567922164074,
"grad_norm": 0.009313930757343769,
"learning_rate": 0.0001212584914340688,
"loss": 0.0002,
"step": 1002
},
{
"epoch": 0.07400309574703325,
"grad_norm": 0.01076335646212101,
"learning_rate": 0.00012125838952905822,
"loss": 0.0004,
"step": 1004
},
{
"epoch": 0.07415051227242574,
"grad_norm": 0.008014670573174953,
"learning_rate": 0.00012125828742090447,
"loss": 0.0001,
"step": 1006
},
{
"epoch": 0.07429792879781824,
"grad_norm": 33.344932556152344,
"learning_rate": 0.00012125818510960795,
"loss": 2.0841,
"step": 1008
},
{
"epoch": 0.07429792879781824,
"eval_1_ratio_diff": 0.08261886204208879,
"eval_accuracy": 0.8643803585346843,
"eval_f1": 0.8746397694524496,
"eval_loss": 0.954525887966156,
"eval_precision": 0.8125836680053548,
"eval_recall": 0.9469578783151326,
"eval_runtime": 1439.8312,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 1008
},
{
"epoch": 0.07444534532321073,
"grad_norm": 0.009119726717472076,
"learning_rate": 0.00012125808259516893,
"loss": 2.2253,
"step": 1010
},
{
"epoch": 0.07459276184860322,
"grad_norm": 0.08696369081735611,
"learning_rate": 0.00012125797987758778,
"loss": 0.0009,
"step": 1012
},
{
"epoch": 0.07474017837399573,
"grad_norm": 26.136661529541016,
"learning_rate": 0.00012125787695686484,
"loss": 1.5774,
"step": 1014
},
{
"epoch": 0.07488759489938822,
"grad_norm": 32.24976348876953,
"learning_rate": 0.00012125777383300048,
"loss": 1.1735,
"step": 1016
},
{
"epoch": 0.07503501142478072,
"grad_norm": 0.5457736253738403,
"learning_rate": 0.00012125767050599501,
"loss": 0.0112,
"step": 1018
},
{
"epoch": 0.07518242795017321,
"grad_norm": 0.7166759967803955,
"learning_rate": 0.0001212575669758488,
"loss": 0.2859,
"step": 1020
},
{
"epoch": 0.0753298444755657,
"grad_norm": 0.32718005776405334,
"learning_rate": 0.00012125746324256221,
"loss": 1.5148,
"step": 1022
},
{
"epoch": 0.07547726100095821,
"grad_norm": 88.85284423828125,
"learning_rate": 0.00012125735930613554,
"loss": 2.6444,
"step": 1024
},
{
"epoch": 0.07547726100095821,
"eval_1_ratio_diff": 0.024162120031176904,
"eval_accuracy": 0.8838659392049883,
"eval_f1": 0.8865194211728865,
"eval_loss": 0.48203912377357483,
"eval_precision": 0.8660714285714286,
"eval_recall": 0.9079563182527302,
"eval_runtime": 1439.7938,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 1024
},
{
"epoch": 0.0756246775263507,
"grad_norm": 0.2224024385213852,
"learning_rate": 0.00012125725516656918,
"loss": 0.0068,
"step": 1026
},
{
"epoch": 0.0757720940517432,
"grad_norm": 0.2110309898853302,
"learning_rate": 0.00012125715082386346,
"loss": 0.0036,
"step": 1028
},
{
"epoch": 0.0759195105771357,
"grad_norm": 0.20480689406394958,
"learning_rate": 0.00012125704627801874,
"loss": 0.0101,
"step": 1030
},
{
"epoch": 0.07606692710252819,
"grad_norm": 24.321718215942383,
"learning_rate": 0.00012125694152903538,
"loss": 2.3569,
"step": 1032
},
{
"epoch": 0.07621434362792069,
"grad_norm": 0.3324243426322937,
"learning_rate": 0.00012125683657691368,
"loss": 0.0101,
"step": 1034
},
{
"epoch": 0.07636176015331318,
"grad_norm": 1.0518757104873657,
"learning_rate": 0.00012125673142165406,
"loss": 0.013,
"step": 1036
},
{
"epoch": 0.07650917667870569,
"grad_norm": 25.96786880493164,
"learning_rate": 0.00012125662606325683,
"loss": 1.3031,
"step": 1038
},
{
"epoch": 0.07665659320409818,
"grad_norm": 0.12808893620967865,
"learning_rate": 0.00012125652050172236,
"loss": 0.0051,
"step": 1040
},
{
"epoch": 0.07665659320409818,
"eval_1_ratio_diff": -0.05689789555728764,
"eval_accuracy": 0.8698363211223694,
"eval_f1": 0.8618693134822167,
"eval_loss": 0.5904788970947266,
"eval_precision": 0.9172535211267606,
"eval_recall": 0.8127925117004681,
"eval_runtime": 1440.2519,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 1040
},
{
"epoch": 0.07680400972949068,
"grad_norm": 30.74445343017578,
"learning_rate": 0.00012125641473705098,
"loss": 1.479,
"step": 1042
},
{
"epoch": 0.07695142625488317,
"grad_norm": 0.33330148458480835,
"learning_rate": 0.00012125630876924309,
"loss": 2.7544,
"step": 1044
},
{
"epoch": 0.07709884278027566,
"grad_norm": 0.4779714047908783,
"learning_rate": 0.00012125620259829898,
"loss": 0.0059,
"step": 1046
},
{
"epoch": 0.07724625930566817,
"grad_norm": 0.4376041889190674,
"learning_rate": 0.00012125609622421907,
"loss": 0.0273,
"step": 1048
},
{
"epoch": 0.07739367583106066,
"grad_norm": 0.14147210121154785,
"learning_rate": 0.00012125598964700367,
"loss": 1.3617,
"step": 1050
},
{
"epoch": 0.07754109235645316,
"grad_norm": 20.751298904418945,
"learning_rate": 0.00012125588286665319,
"loss": 2.4864,
"step": 1052
},
{
"epoch": 0.07768850888184566,
"grad_norm": 0.3589191138744354,
"learning_rate": 0.00012125577588316793,
"loss": 0.0102,
"step": 1054
},
{
"epoch": 0.07783592540723815,
"grad_norm": 31.519622802734375,
"learning_rate": 0.00012125566869654828,
"loss": 1.8161,
"step": 1056
},
{
"epoch": 0.07783592540723815,
"eval_1_ratio_diff": -0.05455962587685115,
"eval_accuracy": 0.8472330475448169,
"eval_f1": 0.8382838283828383,
"eval_loss": 0.5423593521118164,
"eval_precision": 0.8896672504378283,
"eval_recall": 0.7925117004680188,
"eval_runtime": 1440.6162,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 1056
},
{
"epoch": 0.07798334193263065,
"grad_norm": 29.618946075439453,
"learning_rate": 0.00012125556130679457,
"loss": 0.9249,
"step": 1058
},
{
"epoch": 0.07813075845802314,
"grad_norm": 24.92931365966797,
"learning_rate": 0.0001212554537139072,
"loss": 1.3237,
"step": 1060
},
{
"epoch": 0.07827817498341565,
"grad_norm": 6.922366142272949,
"learning_rate": 0.00012125534591788653,
"loss": 0.0954,
"step": 1062
},
{
"epoch": 0.07842559150880814,
"grad_norm": 1.4033849239349365,
"learning_rate": 0.00012125523791873287,
"loss": 0.1059,
"step": 1064
},
{
"epoch": 0.07857300803420063,
"grad_norm": 0.5430750846862793,
"learning_rate": 0.00012125512971644664,
"loss": 0.0167,
"step": 1066
},
{
"epoch": 0.07872042455959313,
"grad_norm": 25.31169319152832,
"learning_rate": 0.00012125502131102817,
"loss": 1.4498,
"step": 1068
},
{
"epoch": 0.07886784108498562,
"grad_norm": 24.96006965637207,
"learning_rate": 0.00012125491270247783,
"loss": 1.3258,
"step": 1070
},
{
"epoch": 0.07901525761037813,
"grad_norm": 1.3635300397872925,
"learning_rate": 0.000121254803890796,
"loss": 0.0339,
"step": 1072
},
{
"epoch": 0.07901525761037813,
"eval_1_ratio_diff": -0.029618082618862063,
"eval_accuracy": 0.8581449727201871,
"eval_f1": 0.8536977491961415,
"eval_loss": 0.6108663082122803,
"eval_precision": 0.8805970149253731,
"eval_recall": 0.828393135725429,
"eval_runtime": 1440.3267,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 1072
},
{
"epoch": 0.07916267413577062,
"grad_norm": 41.34056091308594,
"learning_rate": 0.00012125469487598301,
"loss": 1.665,
"step": 1074
},
{
"epoch": 0.07931009066116311,
"grad_norm": 0.8467972278594971,
"learning_rate": 0.00012125458565803925,
"loss": 0.0102,
"step": 1076
},
{
"epoch": 0.07945750718655561,
"grad_norm": 0.09642868489027023,
"learning_rate": 0.00012125447623696508,
"loss": 0.0053,
"step": 1078
},
{
"epoch": 0.0796049237119481,
"grad_norm": 0.06861916184425354,
"learning_rate": 0.00012125436661276089,
"loss": 0.5701,
"step": 1080
},
{
"epoch": 0.07975234023734061,
"grad_norm": 22.139467239379883,
"learning_rate": 0.000121254256785427,
"loss": 1.088,
"step": 1082
},
{
"epoch": 0.0798997567627331,
"grad_norm": 0.037754353135824203,
"learning_rate": 0.00012125414675496381,
"loss": 0.0046,
"step": 1084
},
{
"epoch": 0.08004717328812559,
"grad_norm": 32.785037994384766,
"learning_rate": 0.00012125403652137169,
"loss": 0.956,
"step": 1086
},
{
"epoch": 0.0801945898135181,
"grad_norm": 21.96536636352539,
"learning_rate": 0.000121253926084651,
"loss": 3.1959,
"step": 1088
},
{
"epoch": 0.0801945898135181,
"eval_1_ratio_diff": -0.04364770070148094,
"eval_accuracy": 0.8659392049883087,
"eval_f1": 0.8597063621533442,
"eval_loss": 0.6232408285140991,
"eval_precision": 0.9008547008547009,
"eval_recall": 0.8221528861154446,
"eval_runtime": 1440.7525,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 1088
},
{
"epoch": 0.08034200633891059,
"grad_norm": 0.09767896682024002,
"learning_rate": 0.00012125381544480211,
"loss": 1.9769,
"step": 1090
},
{
"epoch": 0.08048942286430309,
"grad_norm": 4.43467378616333,
"learning_rate": 0.0001212537046018254,
"loss": 1.8357,
"step": 1092
},
{
"epoch": 0.08063683938969558,
"grad_norm": 22.267379760742188,
"learning_rate": 0.00012125359355572121,
"loss": 0.9555,
"step": 1094
},
{
"epoch": 0.08078425591508809,
"grad_norm": 0.7281066179275513,
"learning_rate": 0.00012125348230648997,
"loss": 0.0391,
"step": 1096
},
{
"epoch": 0.08093167244048058,
"grad_norm": 0.4542294442653656,
"learning_rate": 0.000121253370854132,
"loss": 0.9409,
"step": 1098
},
{
"epoch": 0.08107908896587307,
"grad_norm": 22.38312339782715,
"learning_rate": 0.0001212532591986477,
"loss": 1.0271,
"step": 1100
},
{
"epoch": 0.08122650549126557,
"grad_norm": 26.339080810546875,
"learning_rate": 0.00012125314734003743,
"loss": 1.4919,
"step": 1102
},
{
"epoch": 0.08137392201665807,
"grad_norm": 3.4797956943511963,
"learning_rate": 0.00012125303527830157,
"loss": 0.0342,
"step": 1104
},
{
"epoch": 0.08137392201665807,
"eval_1_ratio_diff": -0.15354637568199536,
"eval_accuracy": 0.7887763055339049,
"eval_f1": 0.7502304147465437,
"eval_loss": 0.8048840761184692,
"eval_precision": 0.9166666666666666,
"eval_recall": 0.6349453978159126,
"eval_runtime": 1440.2595,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 1104
},
{
"epoch": 0.08152133854205057,
"grad_norm": 27.387937545776367,
"learning_rate": 0.0001212529230134405,
"loss": 2.2878,
"step": 1106
},
{
"epoch": 0.08166875506744306,
"grad_norm": 0.27098074555397034,
"learning_rate": 0.00012125281054545459,
"loss": 0.0115,
"step": 1108
},
{
"epoch": 0.08181617159283555,
"grad_norm": 0.17622074484825134,
"learning_rate": 0.00012125269787434425,
"loss": 0.9066,
"step": 1110
},
{
"epoch": 0.08196358811822806,
"grad_norm": 13.168516159057617,
"learning_rate": 0.00012125258500010979,
"loss": 1.0468,
"step": 1112
},
{
"epoch": 0.08211100464362055,
"grad_norm": 15.512298583984375,
"learning_rate": 0.00012125247192275165,
"loss": 0.1845,
"step": 1114
},
{
"epoch": 0.08225842116901305,
"grad_norm": 31.888328552246094,
"learning_rate": 0.00012125235864227018,
"loss": 0.3136,
"step": 1116
},
{
"epoch": 0.08240583769440554,
"grad_norm": 9.891843795776367,
"learning_rate": 0.00012125224515866574,
"loss": 0.8436,
"step": 1118
},
{
"epoch": 0.08255325421979803,
"grad_norm": 46.16787338256836,
"learning_rate": 0.00012125213147193877,
"loss": 2.5811,
"step": 1120
},
{
"epoch": 0.08255325421979803,
"eval_1_ratio_diff": 0.07560405300077944,
"eval_accuracy": 0.848012470771629,
"eval_f1": 0.8585931834662799,
"eval_loss": 0.6410078406333923,
"eval_precision": 0.8021680216802168,
"eval_recall": 0.9235569422776911,
"eval_runtime": 1440.0548,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 1120
},
{
"epoch": 0.08270067074519054,
"grad_norm": 7.604285717010498,
"learning_rate": 0.00012125201758208962,
"loss": 1.1177,
"step": 1122
},
{
"epoch": 0.08284808727058303,
"grad_norm": 0.39338427782058716,
"learning_rate": 0.00012125190348911864,
"loss": 1.5911,
"step": 1124
},
{
"epoch": 0.08299550379597553,
"grad_norm": 36.94788360595703,
"learning_rate": 0.00012125178919302626,
"loss": 1.3629,
"step": 1126
},
{
"epoch": 0.08314292032136802,
"grad_norm": 0.8372169137001038,
"learning_rate": 0.00012125167469381283,
"loss": 0.0102,
"step": 1128
},
{
"epoch": 0.08329033684676052,
"grad_norm": 0.12225531786680222,
"learning_rate": 0.00012125155999147876,
"loss": 0.0043,
"step": 1130
},
{
"epoch": 0.08343775337215302,
"grad_norm": 62.011695861816406,
"learning_rate": 0.0001212514450860244,
"loss": 1.6697,
"step": 1132
},
{
"epoch": 0.08358516989754551,
"grad_norm": 0.02834857441484928,
"learning_rate": 0.00012125132997745018,
"loss": 0.0044,
"step": 1134
},
{
"epoch": 0.08373258642293802,
"grad_norm": 0.007508635055273771,
"learning_rate": 0.00012125121466575647,
"loss": 0.0067,
"step": 1136
},
{
"epoch": 0.08373258642293802,
"eval_1_ratio_diff": -0.059236165237724125,
"eval_accuracy": 0.8534684333593141,
"eval_f1": 0.8441127694859039,
"eval_loss": 0.8524520993232727,
"eval_precision": 0.9008849557522124,
"eval_recall": 0.7940717628705148,
"eval_runtime": 1440.1118,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 1136
},
{
"epoch": 0.0838800029483305,
"grad_norm": 70.38623046875,
"learning_rate": 0.00012125109915094362,
"loss": 2.694,
"step": 1138
},
{
"epoch": 0.08402741947372301,
"grad_norm": 1.548732042312622,
"learning_rate": 0.00012125098343301206,
"loss": 0.0213,
"step": 1140
},
{
"epoch": 0.0841748359991155,
"grad_norm": 1.2770323753356934,
"learning_rate": 0.00012125086751196217,
"loss": 0.0099,
"step": 1142
},
{
"epoch": 0.08432225252450799,
"grad_norm": 30.610591888427734,
"learning_rate": 0.00012125075138779432,
"loss": 2.0352,
"step": 1144
},
{
"epoch": 0.0844696690499005,
"grad_norm": 0.8128361701965332,
"learning_rate": 0.0001212506350605089,
"loss": 1.0719,
"step": 1146
},
{
"epoch": 0.08461708557529299,
"grad_norm": 1.6853057146072388,
"learning_rate": 0.00012125051853010634,
"loss": 0.0092,
"step": 1148
},
{
"epoch": 0.0847645021006855,
"grad_norm": 39.670047760009766,
"learning_rate": 0.000121250401796587,
"loss": 1.7653,
"step": 1150
},
{
"epoch": 0.08491191862607798,
"grad_norm": 36.04311752319336,
"learning_rate": 0.00012125028485995127,
"loss": 1.3473,
"step": 1152
},
{
"epoch": 0.08491191862607798,
"eval_1_ratio_diff": 0.03897116134060796,
"eval_accuracy": 0.8737334372564303,
"eval_f1": 0.8783783783783784,
"eval_loss": 0.6749188899993896,
"eval_precision": 0.8465991316931982,
"eval_recall": 0.9126365054602185,
"eval_runtime": 1440.4215,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 1152
},
{
"epoch": 0.08505933515147047,
"grad_norm": 0.17764577269554138,
"learning_rate": 0.00012125016772019952,
"loss": 0.0023,
"step": 1154
},
{
"epoch": 0.08520675167686298,
"grad_norm": 0.3527587652206421,
"learning_rate": 0.0001212500503773322,
"loss": 0.0055,
"step": 1156
},
{
"epoch": 0.08535416820225547,
"grad_norm": 0.1379138082265854,
"learning_rate": 0.00012124993283134963,
"loss": 1.6429,
"step": 1158
},
{
"epoch": 0.08550158472764798,
"grad_norm": 0.14264832437038422,
"learning_rate": 0.0001212498150822523,
"loss": 0.0089,
"step": 1160
},
{
"epoch": 0.08564900125304047,
"grad_norm": 30.086095809936523,
"learning_rate": 0.00012124969713004051,
"loss": 2.4261,
"step": 1162
},
{
"epoch": 0.08579641777843296,
"grad_norm": 0.26527953147888184,
"learning_rate": 0.00012124957897471469,
"loss": 0.6917,
"step": 1164
},
{
"epoch": 0.08594383430382546,
"grad_norm": 8.70952320098877,
"learning_rate": 0.00012124946061627526,
"loss": 0.0826,
"step": 1166
},
{
"epoch": 0.08609125082921795,
"grad_norm": 0.031940966844558716,
"learning_rate": 0.0001212493420547226,
"loss": 0.0008,
"step": 1168
},
{
"epoch": 0.08609125082921795,
"eval_1_ratio_diff": -0.010132501948558081,
"eval_accuracy": 0.8636009353078722,
"eval_f1": 0.8620961386918834,
"eval_loss": 0.5565428137779236,
"eval_precision": 0.8710191082802548,
"eval_recall": 0.8533541341653667,
"eval_runtime": 1440.6772,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 1168
},
{
"epoch": 0.08623866735461046,
"grad_norm": 1.1032943725585938,
"learning_rate": 0.0001212492232900571,
"loss": 0.0135,
"step": 1170
},
{
"epoch": 0.08638608388000295,
"grad_norm": 0.6731190085411072,
"learning_rate": 0.00012124910432227916,
"loss": 0.0145,
"step": 1172
},
{
"epoch": 0.08653350040539544,
"grad_norm": 0.2941815257072449,
"learning_rate": 0.00012124898515138918,
"loss": 0.005,
"step": 1174
},
{
"epoch": 0.08668091693078794,
"grad_norm": 0.060058582574129105,
"learning_rate": 0.00012124886577738757,
"loss": 0.0024,
"step": 1176
},
{
"epoch": 0.08682833345618043,
"grad_norm": 0.029819436371326447,
"learning_rate": 0.0001212487462002747,
"loss": 0.0015,
"step": 1178
},
{
"epoch": 0.08697574998157294,
"grad_norm": 0.1549704670906067,
"learning_rate": 0.000121248626420051,
"loss": 0.0023,
"step": 1180
},
{
"epoch": 0.08712316650696543,
"grad_norm": 1.1005401611328125,
"learning_rate": 0.00012124850643671686,
"loss": 0.0065,
"step": 1182
},
{
"epoch": 0.08727058303235792,
"grad_norm": 200.2630157470703,
"learning_rate": 0.00012124838625027271,
"loss": 0.7416,
"step": 1184
},
{
"epoch": 0.08727058303235792,
"eval_1_ratio_diff": 0.04520654715510519,
"eval_accuracy": 0.8752922837100545,
"eval_f1": 0.8805970149253731,
"eval_loss": 0.8647755980491638,
"eval_precision": 0.844062947067239,
"eval_recall": 0.9204368174726989,
"eval_runtime": 1441.0897,
"eval_samples_per_second": 0.89,
"eval_steps_per_second": 0.445,
"step": 1184
},
{
"epoch": 0.08741799955775043,
"grad_norm": 0.012469271197915077,
"learning_rate": 0.0001212482658607189,
"loss": 0.0003,
"step": 1186
},
{
"epoch": 0.08756541608314292,
"grad_norm": 0.017095841467380524,
"learning_rate": 0.00012124814526805586,
"loss": 0.0003,
"step": 1188
},
{
"epoch": 0.08771283260853542,
"grad_norm": 23.186222076416016,
"learning_rate": 0.00012124802447228401,
"loss": 2.0149,
"step": 1190
},
{
"epoch": 0.08786024913392791,
"grad_norm": 0.010486994870007038,
"learning_rate": 0.00012124790347340374,
"loss": 0.0006,
"step": 1192
},
{
"epoch": 0.08800766565932042,
"grad_norm": 40.754051208496094,
"learning_rate": 0.00012124778227141545,
"loss": 2.8077,
"step": 1194
},
{
"epoch": 0.08815508218471291,
"grad_norm": 0.08611409366130829,
"learning_rate": 0.00012124766086631955,
"loss": 0.0013,
"step": 1196
},
{
"epoch": 0.0883024987101054,
"grad_norm": 0.28396108746528625,
"learning_rate": 0.00012124753925811646,
"loss": 2.2785,
"step": 1198
},
{
"epoch": 0.0884499152354979,
"grad_norm": 0.03215723857283592,
"learning_rate": 0.00012124741744680656,
"loss": 0.0026,
"step": 1200
},
{
"epoch": 0.0884499152354979,
"eval_1_ratio_diff": 0.04598597038191732,
"eval_accuracy": 0.877630553390491,
"eval_f1": 0.8829231916480239,
"eval_loss": 0.7880816459655762,
"eval_precision": 0.8457142857142858,
"eval_recall": 0.9235569422776911,
"eval_runtime": 1441.3958,
"eval_samples_per_second": 0.89,
"eval_steps_per_second": 0.445,
"step": 1200
},
{
"epoch": 0.0885973317608904,
"grad_norm": 0.03621472418308258,
"learning_rate": 0.00012124729543239029,
"loss": 0.7748,
"step": 1202
},
{
"epoch": 0.0887447482862829,
"grad_norm": 0.09097783267498016,
"learning_rate": 0.00012124717321486803,
"loss": 1.8821,
"step": 1204
},
{
"epoch": 0.08889216481167539,
"grad_norm": 0.3395259976387024,
"learning_rate": 0.00012124705079424022,
"loss": 0.0073,
"step": 1206
},
{
"epoch": 0.08903958133706788,
"grad_norm": 0.04736631363630295,
"learning_rate": 0.00012124692817050723,
"loss": 1.567,
"step": 1208
},
{
"epoch": 0.08918699786246038,
"grad_norm": 0.08807298541069031,
"learning_rate": 0.00012124680534366952,
"loss": 0.0014,
"step": 1210
},
{
"epoch": 0.08933441438785288,
"grad_norm": 0.05549991875886917,
"learning_rate": 0.00012124668231372745,
"loss": 0.0021,
"step": 1212
},
{
"epoch": 0.08948183091324538,
"grad_norm": 0.06815358251333237,
"learning_rate": 0.00012124655908068146,
"loss": 0.9174,
"step": 1214
},
{
"epoch": 0.08962924743863787,
"grad_norm": 0.03639994189143181,
"learning_rate": 0.00012124643564453199,
"loss": 0.0199,
"step": 1216
},
{
"epoch": 0.08962924743863787,
"eval_1_ratio_diff": 0.020265003897116163,
"eval_accuracy": 0.8628215120810601,
"eval_f1": 0.8654434250764526,
"eval_loss": 0.648876965045929,
"eval_precision": 0.848575712143928,
"eval_recall": 0.8829953198127926,
"eval_runtime": 1441.2405,
"eval_samples_per_second": 0.89,
"eval_steps_per_second": 0.445,
"step": 1216
},
{
"epoch": 0.08977666396403036,
"grad_norm": 23.13437271118164,
"learning_rate": 0.00012124631200527941,
"loss": 1.6889,
"step": 1218
},
{
"epoch": 0.08992408048942287,
"grad_norm": 0.2734740674495697,
"learning_rate": 0.00012124618816292414,
"loss": 0.0059,
"step": 1220
},
{
"epoch": 0.09007149701481536,
"grad_norm": 12.39369010925293,
"learning_rate": 0.00012124606411746661,
"loss": 0.0533,
"step": 1222
},
{
"epoch": 0.09021891354020786,
"grad_norm": 0.036048658192157745,
"learning_rate": 0.00012124593986890722,
"loss": 0.0011,
"step": 1224
},
{
"epoch": 0.09036633006560035,
"grad_norm": 0.3171124756336212,
"learning_rate": 0.00012124581541724642,
"loss": 1.5207,
"step": 1226
},
{
"epoch": 0.09051374659099284,
"grad_norm": 4.317696571350098,
"learning_rate": 0.00012124569076248459,
"loss": 1.5358,
"step": 1228
},
{
"epoch": 0.09066116311638535,
"grad_norm": 0.12044669687747955,
"learning_rate": 0.00012124556590462215,
"loss": 0.0053,
"step": 1230
},
{
"epoch": 0.09080857964177784,
"grad_norm": 0.21298988163471222,
"learning_rate": 0.00012124544084365953,
"loss": 0.0081,
"step": 1232
},
{
"epoch": 0.09080857964177784,
"eval_1_ratio_diff": 0.009353078721745844,
"eval_accuracy": 0.8222915042868277,
"eval_f1": 0.8238021638330757,
"eval_loss": 0.7862046360969543,
"eval_precision": 0.8162327718223583,
"eval_recall": 0.8315132605304212,
"eval_runtime": 1439.2896,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 1232
},
{
"epoch": 0.09095599616717034,
"grad_norm": 20.541194915771484,
"learning_rate": 0.00012124531557959717,
"loss": 1.194,
"step": 1234
},
{
"epoch": 0.09110341269256284,
"grad_norm": 0.2897285223007202,
"learning_rate": 0.00012124519011243545,
"loss": 0.8952,
"step": 1236
},
{
"epoch": 0.09125082921795533,
"grad_norm": 0.08111666887998581,
"learning_rate": 0.0001212450644421748,
"loss": 0.006,
"step": 1238
},
{
"epoch": 0.09139824574334783,
"grad_norm": 0.4867294430732727,
"learning_rate": 0.00012124493856881568,
"loss": 1.7795,
"step": 1240
},
{
"epoch": 0.09154566226874032,
"grad_norm": 0.9198406934738159,
"learning_rate": 0.00012124481249235846,
"loss": 0.0259,
"step": 1242
},
{
"epoch": 0.09169307879413283,
"grad_norm": 0.08149991929531097,
"learning_rate": 0.0001212446862128036,
"loss": 1.2016,
"step": 1244
},
{
"epoch": 0.09184049531952532,
"grad_norm": 0.1457146853208542,
"learning_rate": 0.0001212445597301515,
"loss": 0.9302,
"step": 1246
},
{
"epoch": 0.09198791184491782,
"grad_norm": 0.24497820436954498,
"learning_rate": 0.00012124443304440259,
"loss": 0.0051,
"step": 1248
},
{
"epoch": 0.09198791184491782,
"eval_1_ratio_diff": 0.02260327357755254,
"eval_accuracy": 0.8339828526890102,
"eval_f1": 0.8375286041189931,
"eval_loss": 0.747604489326477,
"eval_precision": 0.8194029850746268,
"eval_recall": 0.8564742589703588,
"eval_runtime": 1440.6099,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 1248
},
{
"epoch": 0.09213532837031031,
"grad_norm": 0.10772482305765152,
"learning_rate": 0.0001212443061555573,
"loss": 0.0032,
"step": 1250
},
{
"epoch": 0.0922827448957028,
"grad_norm": 3.8056480884552,
"learning_rate": 0.00012124417906361605,
"loss": 0.838,
"step": 1252
},
{
"epoch": 0.09243016142109531,
"grad_norm": 21.590364456176758,
"learning_rate": 0.00012124405176857927,
"loss": 2.5474,
"step": 1254
},
{
"epoch": 0.0925775779464878,
"grad_norm": 21.33682632446289,
"learning_rate": 0.00012124392427044737,
"loss": 2.7454,
"step": 1256
},
{
"epoch": 0.0927249944718803,
"grad_norm": 0.21534398198127747,
"learning_rate": 0.00012124379656922081,
"loss": 0.0068,
"step": 1258
},
{
"epoch": 0.0928724109972728,
"grad_norm": 20.76007843017578,
"learning_rate": 0.0001212436686649,
"loss": 1.2547,
"step": 1260
},
{
"epoch": 0.09301982752266529,
"grad_norm": 20.636024475097656,
"learning_rate": 0.00012124354055748535,
"loss": 1.5976,
"step": 1262
},
{
"epoch": 0.09316724404805779,
"grad_norm": 2.3518083095550537,
"learning_rate": 0.00012124341224697731,
"loss": 0.0369,
"step": 1264
},
{
"epoch": 0.09316724404805779,
"eval_1_ratio_diff": -0.03117692907248637,
"eval_accuracy": 0.8565861262665627,
"eval_f1": 0.8518518518518519,
"eval_loss": 0.43984636664390564,
"eval_precision": 0.8801996672212978,
"eval_recall": 0.8252730109204368,
"eval_runtime": 1440.9991,
"eval_samples_per_second": 0.89,
"eval_steps_per_second": 0.446,
"step": 1264
},
{
"epoch": 0.09331466057345028,
"grad_norm": 1.4304808378219604,
"learning_rate": 0.0001212432837333763,
"loss": 0.03,
"step": 1266
},
{
"epoch": 0.09346207709884279,
"grad_norm": 0.6885532736778259,
"learning_rate": 0.00012124315501668278,
"loss": 0.7603,
"step": 1268
},
{
"epoch": 0.09360949362423528,
"grad_norm": 0.5777420997619629,
"learning_rate": 0.00012124302609689715,
"loss": 1.1026,
"step": 1270
},
{
"epoch": 0.09375691014962777,
"grad_norm": 1.5885238647460938,
"learning_rate": 0.00012124289697401986,
"loss": 0.041,
"step": 1272
},
{
"epoch": 0.09390432667502027,
"grad_norm": 0.37640276551246643,
"learning_rate": 0.00012124276764805132,
"loss": 0.0182,
"step": 1274
},
{
"epoch": 0.09405174320041276,
"grad_norm": 25.54754066467285,
"learning_rate": 0.00012124263811899196,
"loss": 1.2952,
"step": 1276
},
{
"epoch": 0.09419915972580527,
"grad_norm": 41.04960632324219,
"learning_rate": 0.00012124250838684226,
"loss": 2.126,
"step": 1278
},
{
"epoch": 0.09434657625119776,
"grad_norm": 0.16556452214717865,
"learning_rate": 0.00012124237845160263,
"loss": 0.0078,
"step": 1280
},
{
"epoch": 0.09434657625119776,
"eval_1_ratio_diff": -0.08573655494933752,
"eval_accuracy": 0.8487918939984411,
"eval_f1": 0.8344709897610921,
"eval_loss": 0.6657168865203857,
"eval_precision": 0.9209039548022598,
"eval_recall": 0.7628705148205929,
"eval_runtime": 1440.6129,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 1280
},
{
"epoch": 0.09449399277659025,
"grad_norm": 17.6622314453125,
"learning_rate": 0.00012124224831327347,
"loss": 0.1561,
"step": 1282
},
{
"epoch": 0.09464140930198275,
"grad_norm": 0.15980716049671173,
"learning_rate": 0.00012124211797185528,
"loss": 0.0042,
"step": 1284
},
{
"epoch": 0.09478882582737524,
"grad_norm": 0.04221845418214798,
"learning_rate": 0.00012124198742734845,
"loss": 1.4535,
"step": 1286
},
{
"epoch": 0.09493624235276775,
"grad_norm": 0.056126296520233154,
"learning_rate": 0.00012124185667975342,
"loss": 0.0031,
"step": 1288
},
{
"epoch": 0.09508365887816024,
"grad_norm": 0.08041621744632721,
"learning_rate": 0.00012124172572907067,
"loss": 0.0018,
"step": 1290
},
{
"epoch": 0.09523107540355275,
"grad_norm": 28.64826011657715,
"learning_rate": 0.00012124159457530059,
"loss": 1.6516,
"step": 1292
},
{
"epoch": 0.09537849192894524,
"grad_norm": 0.31489408016204834,
"learning_rate": 0.00012124146321844365,
"loss": 0.0038,
"step": 1294
},
{
"epoch": 0.09552590845433773,
"grad_norm": 1.7656670808792114,
"learning_rate": 0.00012124133165850026,
"loss": 0.0131,
"step": 1296
},
{
"epoch": 0.09552590845433773,
"eval_1_ratio_diff": 0.04832424006235381,
"eval_accuracy": 0.8176149649259548,
"eval_f1": 0.8258928571428571,
"eval_loss": 0.8926898241043091,
"eval_precision": 0.7894736842105263,
"eval_recall": 0.8658346333853354,
"eval_runtime": 1440.8824,
"eval_samples_per_second": 0.89,
"eval_steps_per_second": 0.446,
"step": 1296
},
{
"epoch": 0.09567332497973023,
"grad_norm": 0.3228819668292999,
"learning_rate": 0.00012124119989547089,
"loss": 0.8356,
"step": 1298
},
{
"epoch": 0.09582074150512272,
"grad_norm": 58.03204345703125,
"learning_rate": 0.00012124106792935597,
"loss": 0.5161,
"step": 1300
},
{
"epoch": 0.09596815803051523,
"grad_norm": 0.094666488468647,
"learning_rate": 0.00012124093576015595,
"loss": 0.0014,
"step": 1302
},
{
"epoch": 0.09611557455590772,
"grad_norm": 0.054852358996868134,
"learning_rate": 0.00012124080338787127,
"loss": 0.0025,
"step": 1304
},
{
"epoch": 0.09626299108130021,
"grad_norm": 2.4614083766937256,
"learning_rate": 0.00012124067081250235,
"loss": 0.0231,
"step": 1306
},
{
"epoch": 0.09641040760669271,
"grad_norm": 0.13067440688610077,
"learning_rate": 0.00012124053803404966,
"loss": 0.0019,
"step": 1308
},
{
"epoch": 0.0965578241320852,
"grad_norm": 0.05831296741962433,
"learning_rate": 0.00012124040505251365,
"loss": 1.1599,
"step": 1310
},
{
"epoch": 0.09670524065747771,
"grad_norm": 22.675302505493164,
"learning_rate": 0.00012124027186789477,
"loss": 1.7971,
"step": 1312
},
{
"epoch": 0.09670524065747771,
"eval_1_ratio_diff": -0.07638347622759162,
"eval_accuracy": 0.8207326578332035,
"eval_f1": 0.8057432432432432,
"eval_loss": 0.9711058735847473,
"eval_precision": 0.8784530386740331,
"eval_recall": 0.7441497659906396,
"eval_runtime": 1440.5355,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 1312
},
{
"epoch": 0.0968526571828702,
"grad_norm": 245.76840209960938,
"learning_rate": 0.00012124013848019342,
"loss": 2.4617,
"step": 1314
},
{
"epoch": 0.09700007370826269,
"grad_norm": 21.968021392822266,
"learning_rate": 0.00012124000488941008,
"loss": 1.4503,
"step": 1316
},
{
"epoch": 0.0971474902336552,
"grad_norm": 0.03653848171234131,
"learning_rate": 0.00012123987109554522,
"loss": 0.0015,
"step": 1318
},
{
"epoch": 0.09729490675904769,
"grad_norm": 0.16115568578243256,
"learning_rate": 0.00012123973709859925,
"loss": 0.0201,
"step": 1320
},
{
"epoch": 0.09744232328444019,
"grad_norm": 34.74784851074219,
"learning_rate": 0.00012123960289857264,
"loss": 1.092,
"step": 1322
},
{
"epoch": 0.09758973980983268,
"grad_norm": 17.326068878173828,
"learning_rate": 0.00012123946849546582,
"loss": 0.0826,
"step": 1324
},
{
"epoch": 0.09773715633522517,
"grad_norm": 22.532522201538086,
"learning_rate": 0.00012123933388927926,
"loss": 2.0905,
"step": 1326
},
{
"epoch": 0.09788457286061768,
"grad_norm": 0.09820098429918289,
"learning_rate": 0.0001212391990800134,
"loss": 0.002,
"step": 1328
},
{
"epoch": 0.09788457286061768,
"eval_1_ratio_diff": 0.05689789555728764,
"eval_accuracy": 0.8106001558846454,
"eval_f1": 0.8206642066420664,
"eval_loss": 0.7345473170280457,
"eval_precision": 0.7787114845938375,
"eval_recall": 0.8673946957878315,
"eval_runtime": 1439.7279,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 1328
},
{
"epoch": 0.09803198938601017,
"grad_norm": 0.11757276207208633,
"learning_rate": 0.00012123906406766871,
"loss": 0.0079,
"step": 1330
},
{
"epoch": 0.09817940591140267,
"grad_norm": 24.76763153076172,
"learning_rate": 0.00012123892885224563,
"loss": 1.3389,
"step": 1332
},
{
"epoch": 0.09832682243679516,
"grad_norm": 0.0959400087594986,
"learning_rate": 0.0001212387934337446,
"loss": 0.9421,
"step": 1334
},
{
"epoch": 0.09847423896218765,
"grad_norm": 0.3935282826423645,
"learning_rate": 0.00012123865781216609,
"loss": 0.0104,
"step": 1336
},
{
"epoch": 0.09862165548758016,
"grad_norm": 22.505558013916016,
"learning_rate": 0.00012123852198751054,
"loss": 0.7555,
"step": 1338
},
{
"epoch": 0.09876907201297265,
"grad_norm": 1.3673774003982544,
"learning_rate": 0.00012123838595977844,
"loss": 0.0409,
"step": 1340
},
{
"epoch": 0.09891648853836515,
"grad_norm": 0.6889051198959351,
"learning_rate": 0.0001212382497289702,
"loss": 0.0269,
"step": 1342
},
{
"epoch": 0.09906390506375765,
"grad_norm": 0.2218835949897766,
"learning_rate": 0.0001212381132950863,
"loss": 0.9572,
"step": 1344
},
{
"epoch": 0.09906390506375765,
"eval_1_ratio_diff": 0.07560405300077944,
"eval_accuracy": 0.8277474668745128,
"eval_f1": 0.8397389412617839,
"eval_loss": 0.7541435360908508,
"eval_precision": 0.7845528455284553,
"eval_recall": 0.9032761310452418,
"eval_runtime": 1440.149,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 1344
},
{
"epoch": 0.09921132158915015,
"grad_norm": 0.08860000967979431,
"learning_rate": 0.0001212379766581272,
"loss": 0.0038,
"step": 1346
},
{
"epoch": 0.09935873811454264,
"grad_norm": 0.1549777388572693,
"learning_rate": 0.00012123783981809338,
"loss": 0.6904,
"step": 1348
},
{
"epoch": 0.09950615463993513,
"grad_norm": 0.4857753813266754,
"learning_rate": 0.00012123770277498524,
"loss": 0.005,
"step": 1350
},
{
"epoch": 0.09965357116532764,
"grad_norm": 0.5475670099258423,
"learning_rate": 0.00012123756552880328,
"loss": 0.0057,
"step": 1352
},
{
"epoch": 0.09980098769072013,
"grad_norm": 0.8644952178001404,
"learning_rate": 0.00012123742807954794,
"loss": 2.7045,
"step": 1354
},
{
"epoch": 0.09994840421611263,
"grad_norm": 0.15051943063735962,
"learning_rate": 0.0001212372904272197,
"loss": 0.7707,
"step": 1356
},
{
"epoch": 0.10009582074150512,
"grad_norm": 0.04434569925069809,
"learning_rate": 0.00012123715257181902,
"loss": 0.0007,
"step": 1358
},
{
"epoch": 0.10024323726689761,
"grad_norm": 0.03767779842019081,
"learning_rate": 0.00012123701451334634,
"loss": 1.7987,
"step": 1360
},
{
"epoch": 0.10024323726689761,
"eval_1_ratio_diff": 0.10054559625876847,
"eval_accuracy": 0.8589243959469992,
"eval_f1": 0.8717221828490432,
"eval_loss": 0.7392542958259583,
"eval_precision": 0.7987012987012987,
"eval_recall": 0.9594383775351014,
"eval_runtime": 1439.9484,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 1360
},
{
"epoch": 0.10039065379229012,
"grad_norm": 0.032404810190200806,
"learning_rate": 0.00012123687625180216,
"loss": 1.3724,
"step": 1362
},
{
"epoch": 0.10053807031768261,
"grad_norm": 0.02649116888642311,
"learning_rate": 0.00012123673778718691,
"loss": 1.3162,
"step": 1364
},
{
"epoch": 0.10068548684307511,
"grad_norm": 0.120023712515831,
"learning_rate": 0.00012123659911950106,
"loss": 0.0026,
"step": 1366
},
{
"epoch": 0.1008329033684676,
"grad_norm": 0.28818804025650024,
"learning_rate": 0.00012123646024874507,
"loss": 0.0048,
"step": 1368
},
{
"epoch": 0.1009803198938601,
"grad_norm": 0.5911560654640198,
"learning_rate": 0.00012123632117491944,
"loss": 0.0142,
"step": 1370
},
{
"epoch": 0.1011277364192526,
"grad_norm": 22.85379409790039,
"learning_rate": 0.00012123618189802459,
"loss": 1.8439,
"step": 1372
},
{
"epoch": 0.10127515294464509,
"grad_norm": 0.37168049812316895,
"learning_rate": 0.00012123604241806102,
"loss": 0.0065,
"step": 1374
},
{
"epoch": 0.1014225694700376,
"grad_norm": 0.10927151888608932,
"learning_rate": 0.00012123590273502919,
"loss": 1.1801,
"step": 1376
},
{
"epoch": 0.1014225694700376,
"eval_1_ratio_diff": 0.021823850350740415,
"eval_accuracy": 0.8862042088854248,
"eval_f1": 0.8885496183206106,
"eval_loss": 0.5426926612854004,
"eval_precision": 0.8699551569506726,
"eval_recall": 0.9079563182527302,
"eval_runtime": 1440.2334,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 1376
},
{
"epoch": 0.10156998599543009,
"grad_norm": 0.038460321724414825,
"learning_rate": 0.00012123576284892955,
"loss": 0.0101,
"step": 1378
},
{
"epoch": 0.10171740252082258,
"grad_norm": 25.498838424682617,
"learning_rate": 0.00012123562275976258,
"loss": 1.3981,
"step": 1380
},
{
"epoch": 0.10186481904621508,
"grad_norm": 159.9862060546875,
"learning_rate": 0.00012123548246752878,
"loss": 1.2495,
"step": 1382
},
{
"epoch": 0.10201223557160757,
"grad_norm": 0.06094611436128616,
"learning_rate": 0.00012123534197222857,
"loss": 0.0046,
"step": 1384
},
{
"epoch": 0.10215965209700008,
"grad_norm": 26.12101173400879,
"learning_rate": 0.00012123520127386245,
"loss": 1.3714,
"step": 1386
},
{
"epoch": 0.10230706862239257,
"grad_norm": 48.13339614868164,
"learning_rate": 0.00012123506037243086,
"loss": 0.0869,
"step": 1388
},
{
"epoch": 0.10245448514778506,
"grad_norm": 0.5880022644996643,
"learning_rate": 0.00012123491926793433,
"loss": 0.6204,
"step": 1390
},
{
"epoch": 0.10260190167317756,
"grad_norm": 24.889034271240234,
"learning_rate": 0.00012123477796037328,
"loss": 0.9381,
"step": 1392
},
{
"epoch": 0.10260190167317756,
"eval_1_ratio_diff": 0.09664848012470773,
"eval_accuracy": 0.8487918939984411,
"eval_f1": 0.8620199146514936,
"eval_loss": 0.5980536937713623,
"eval_precision": 0.792156862745098,
"eval_recall": 0.9453978159126365,
"eval_runtime": 1440.6605,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 1392
},
{
"epoch": 0.10274931819857006,
"grad_norm": 23.767898559570312,
"learning_rate": 0.00012123463644974822,
"loss": 1.3434,
"step": 1394
},
{
"epoch": 0.10289673472396256,
"grad_norm": 0.05240378528833389,
"learning_rate": 0.0001212344947360596,
"loss": 0.006,
"step": 1396
},
{
"epoch": 0.10304415124935505,
"grad_norm": 0.05574984475970268,
"learning_rate": 0.00012123435281930789,
"loss": 0.0062,
"step": 1398
},
{
"epoch": 0.10319156777474756,
"grad_norm": 25.049999237060547,
"learning_rate": 0.00012123421069949359,
"loss": 0.7515,
"step": 1400
},
{
"epoch": 0.10333898430014005,
"grad_norm": 0.6514810919761658,
"learning_rate": 0.00012123406837661717,
"loss": 0.0286,
"step": 1402
},
{
"epoch": 0.10348640082553254,
"grad_norm": 25.315319061279297,
"learning_rate": 0.00012123392585067908,
"loss": 0.6189,
"step": 1404
},
{
"epoch": 0.10363381735092504,
"grad_norm": 24.714847564697266,
"learning_rate": 0.00012123378312167983,
"loss": 0.7992,
"step": 1406
},
{
"epoch": 0.10378123387631753,
"grad_norm": 21.79236602783203,
"learning_rate": 0.00012123364018961989,
"loss": 1.8653,
"step": 1408
},
{
"epoch": 0.10378123387631753,
"eval_1_ratio_diff": -0.018706157443491855,
"eval_accuracy": 0.8752922837100545,
"eval_f1": 0.8728139904610492,
"eval_loss": 0.573785662651062,
"eval_precision": 0.8897893030794165,
"eval_recall": 0.8564742589703588,
"eval_runtime": 1440.8628,
"eval_samples_per_second": 0.89,
"eval_steps_per_second": 0.446,
"step": 1408
},
{
"epoch": 0.10392865040171004,
"grad_norm": 21.346384048461914,
"learning_rate": 0.00012123349705449974,
"loss": 1.923,
"step": 1410
},
{
"epoch": 0.10407606692710253,
"grad_norm": 10.506868362426758,
"learning_rate": 0.00012123335371631985,
"loss": 0.5301,
"step": 1412
},
{
"epoch": 0.10422348345249502,
"grad_norm": 1.1288862228393555,
"learning_rate": 0.00012123321017508069,
"loss": 0.0411,
"step": 1414
},
{
"epoch": 0.10437089997788752,
"grad_norm": 0.11825437843799591,
"learning_rate": 0.00012123306643078279,
"loss": 0.0026,
"step": 1416
},
{
"epoch": 0.10451831650328001,
"grad_norm": 0.14662548899650574,
"learning_rate": 0.00012123292248342657,
"loss": 1.3863,
"step": 1418
},
{
"epoch": 0.10466573302867252,
"grad_norm": 1.1349258422851562,
"learning_rate": 0.00012123277833301255,
"loss": 0.0148,
"step": 1420
},
{
"epoch": 0.10481314955406501,
"grad_norm": 20.21559715270996,
"learning_rate": 0.00012123263397954121,
"loss": 2.3576,
"step": 1422
},
{
"epoch": 0.1049605660794575,
"grad_norm": 27.789064407348633,
"learning_rate": 0.00012123248942301302,
"loss": 1.3553,
"step": 1424
},
{
"epoch": 0.1049605660794575,
"eval_1_ratio_diff": -0.18082618862042088,
"eval_accuracy": 0.7833203429462198,
"eval_f1": 0.7352380952380952,
"eval_loss": 0.8213497400283813,
"eval_precision": 0.9437652811735942,
"eval_recall": 0.6021840873634945,
"eval_runtime": 1440.567,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 1424
},
{
"epoch": 0.10510798260485,
"grad_norm": 0.472825288772583,
"learning_rate": 0.00012123234466342849,
"loss": 0.0161,
"step": 1426
},
{
"epoch": 0.1052553991302425,
"grad_norm": 44.982635498046875,
"learning_rate": 0.00012123219970078806,
"loss": 0.216,
"step": 1428
},
{
"epoch": 0.105402815655635,
"grad_norm": 20.85587501525879,
"learning_rate": 0.00012123205453509228,
"loss": 1.7555,
"step": 1430
},
{
"epoch": 0.10555023218102749,
"grad_norm": 19.432729721069336,
"learning_rate": 0.00012123190916634158,
"loss": 0.9614,
"step": 1432
},
{
"epoch": 0.10569764870641998,
"grad_norm": 1.2885982990264893,
"learning_rate": 0.00012123176359453646,
"loss": 0.7221,
"step": 1434
},
{
"epoch": 0.10584506523181249,
"grad_norm": 39.255924224853516,
"learning_rate": 0.00012123161781967742,
"loss": 0.7135,
"step": 1436
},
{
"epoch": 0.10599248175720498,
"grad_norm": 1.8398678302764893,
"learning_rate": 0.00012123147184176495,
"loss": 1.7681,
"step": 1438
},
{
"epoch": 0.10613989828259748,
"grad_norm": 0.04480309039354324,
"learning_rate": 0.00012123132566079952,
"loss": 0.0198,
"step": 1440
},
{
"epoch": 0.10613989828259748,
"eval_1_ratio_diff": 0.013250194855806696,
"eval_accuracy": 0.8495713172252534,
"eval_f1": 0.8514241724403387,
"eval_loss": 0.5520654916763306,
"eval_precision": 0.8404255319148937,
"eval_recall": 0.8627145085803433,
"eval_runtime": 1441.2669,
"eval_samples_per_second": 0.89,
"eval_steps_per_second": 0.445,
"step": 1440
},
{
"epoch": 0.10628731480798997,
"grad_norm": 0.10228992253541946,
"learning_rate": 0.00012123117927678164,
"loss": 0.0767,
"step": 1442
},
{
"epoch": 0.10643473133338248,
"grad_norm": 0.14043979346752167,
"learning_rate": 0.0001212310326897118,
"loss": 0.062,
"step": 1444
},
{
"epoch": 0.10658214785877497,
"grad_norm": 18.650835037231445,
"learning_rate": 0.00012123088589959048,
"loss": 0.5735,
"step": 1446
},
{
"epoch": 0.10672956438416746,
"grad_norm": 18.65635871887207,
"learning_rate": 0.00012123073890641816,
"loss": 0.697,
"step": 1448
},
{
"epoch": 0.10687698090955997,
"grad_norm": 24.889253616333008,
"learning_rate": 0.00012123059171019538,
"loss": 1.1449,
"step": 1450
},
{
"epoch": 0.10702439743495246,
"grad_norm": 0.32461315393447876,
"learning_rate": 0.00012123044431092258,
"loss": 0.0108,
"step": 1452
},
{
"epoch": 0.10717181396034496,
"grad_norm": 0.195255309343338,
"learning_rate": 0.00012123029670860029,
"loss": 0.0082,
"step": 1454
},
{
"epoch": 0.10731923048573745,
"grad_norm": 0.3942672312259674,
"learning_rate": 0.00012123014890322897,
"loss": 0.0278,
"step": 1456
},
{
"epoch": 0.10731923048573745,
"eval_1_ratio_diff": -0.05455962587685115,
"eval_accuracy": 0.8487918939984411,
"eval_f1": 0.8399339933993399,
"eval_loss": 0.6235100626945496,
"eval_precision": 0.8914185639229422,
"eval_recall": 0.7940717628705148,
"eval_runtime": 1441.051,
"eval_samples_per_second": 0.89,
"eval_steps_per_second": 0.446,
"step": 1456
},
{
"epoch": 0.10746664701112994,
"grad_norm": 20.8675537109375,
"learning_rate": 0.00012123000089480917,
"loss": 2.0488,
"step": 1458
},
{
"epoch": 0.10761406353652245,
"grad_norm": 19.674894332885742,
"learning_rate": 0.00012122985268334132,
"loss": 0.9135,
"step": 1460
},
{
"epoch": 0.10776148006191494,
"grad_norm": 0.16670210659503937,
"learning_rate": 0.00012122970426882597,
"loss": 0.0074,
"step": 1462
},
{
"epoch": 0.10790889658730744,
"grad_norm": 20.293106079101562,
"learning_rate": 0.00012122955565126358,
"loss": 1.0217,
"step": 1464
},
{
"epoch": 0.10805631311269993,
"grad_norm": 0.6973972916603088,
"learning_rate": 0.00012122940683065467,
"loss": 0.9069,
"step": 1466
},
{
"epoch": 0.10820372963809242,
"grad_norm": 25.440162658691406,
"learning_rate": 0.00012122925780699975,
"loss": 1.5865,
"step": 1468
},
{
"epoch": 0.10835114616348493,
"grad_norm": 4.310685157775879,
"learning_rate": 0.00012122910858029928,
"loss": 0.4176,
"step": 1470
},
{
"epoch": 0.10849856268887742,
"grad_norm": 0.3989110291004181,
"learning_rate": 0.00012122895915055379,
"loss": 1.2954,
"step": 1472
},
{
"epoch": 0.10849856268887742,
"eval_1_ratio_diff": 0.021823850350740415,
"eval_accuracy": 0.8581449727201871,
"eval_f1": 0.8610687022900764,
"eval_loss": 0.4833138585090637,
"eval_precision": 0.8430493273542601,
"eval_recall": 0.8798751950078003,
"eval_runtime": 1440.7462,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 1472
},
{
"epoch": 0.10864597921426992,
"grad_norm": 0.3485046923160553,
"learning_rate": 0.00012122880951776379,
"loss": 0.0092,
"step": 1474
},
{
"epoch": 0.10879339573966242,
"grad_norm": 31.38138198852539,
"learning_rate": 0.00012122865968192974,
"loss": 2.2038,
"step": 1476
},
{
"epoch": 0.1089408122650549,
"grad_norm": 0.1756962537765503,
"learning_rate": 0.00012122850964305218,
"loss": 0.0039,
"step": 1478
},
{
"epoch": 0.10908822879044741,
"grad_norm": 0.4892203211784363,
"learning_rate": 0.0001212283594011316,
"loss": 1.2883,
"step": 1480
},
{
"epoch": 0.1092356453158399,
"grad_norm": 0.38502997159957886,
"learning_rate": 0.00012122820895616849,
"loss": 0.015,
"step": 1482
},
{
"epoch": 0.1093830618412324,
"grad_norm": 0.3273461163043976,
"learning_rate": 0.00012122805830816339,
"loss": 0.0328,
"step": 1484
},
{
"epoch": 0.1095304783666249,
"grad_norm": 53.52883529663086,
"learning_rate": 0.00012122790745711678,
"loss": 1.4843,
"step": 1486
},
{
"epoch": 0.10967789489201739,
"grad_norm": 0.2854032814502716,
"learning_rate": 0.00012122775640302914,
"loss": 0.0227,
"step": 1488
},
{
"epoch": 0.10967789489201739,
"eval_1_ratio_diff": 0.014809041309431059,
"eval_accuracy": 0.8448947778643804,
"eval_f1": 0.8470407378939278,
"eval_loss": 0.6297035217285156,
"eval_precision": 0.8348484848484848,
"eval_recall": 0.859594383775351,
"eval_runtime": 1441.3865,
"eval_samples_per_second": 0.89,
"eval_steps_per_second": 0.445,
"step": 1488
},
{
"epoch": 0.1098253114174099,
"grad_norm": 0.2311754673719406,
"learning_rate": 0.00012122760514590104,
"loss": 0.0063,
"step": 1490
},
{
"epoch": 0.10997272794280238,
"grad_norm": 21.77858543395996,
"learning_rate": 0.00012122745368573293,
"loss": 1.6042,
"step": 1492
},
{
"epoch": 0.11012014446819489,
"grad_norm": 0.12185559421777725,
"learning_rate": 0.00012122730202252534,
"loss": 0.0054,
"step": 1494
},
{
"epoch": 0.11026756099358738,
"grad_norm": 0.07674361765384674,
"learning_rate": 0.00012122715015627879,
"loss": 1.2277,
"step": 1496
},
{
"epoch": 0.11041497751897988,
"grad_norm": 1.0588175058364868,
"learning_rate": 0.00012122699808699376,
"loss": 0.0121,
"step": 1498
},
{
"epoch": 0.11056239404437238,
"grad_norm": 148.854248046875,
"learning_rate": 0.00012122684581467078,
"loss": 1.6651,
"step": 1500
},
{
"epoch": 0.11070981056976487,
"grad_norm": 0.07673851400613785,
"learning_rate": 0.00012122669333931036,
"loss": 0.0037,
"step": 1502
},
{
"epoch": 0.11085722709515737,
"grad_norm": 0.14825621247291565,
"learning_rate": 0.00012122654066091301,
"loss": 0.0033,
"step": 1504
},
{
"epoch": 0.11085722709515737,
"eval_1_ratio_diff": 0.0,
"eval_accuracy": 0.8394388152766953,
"eval_f1": 0.8393135725429017,
"eval_loss": 0.7106738686561584,
"eval_precision": 0.8393135725429017,
"eval_recall": 0.8393135725429017,
"eval_runtime": 1440.7668,
"eval_samples_per_second": 0.89,
"eval_steps_per_second": 0.446,
"step": 1504
},
{
"epoch": 0.11100464362054986,
"grad_norm": 0.3325727880001068,
"learning_rate": 0.00012122638777947923,
"loss": 0.0043,
"step": 1506
},
{
"epoch": 0.11115206014594237,
"grad_norm": 0.16898727416992188,
"learning_rate": 0.00012122623469500956,
"loss": 1.3778,
"step": 1508
},
{
"epoch": 0.11129947667133486,
"grad_norm": 24.855741500854492,
"learning_rate": 0.00012122608140750447,
"loss": 1.1577,
"step": 1510
},
{
"epoch": 0.11144689319672735,
"grad_norm": 0.15268811583518982,
"learning_rate": 0.0001212259279169645,
"loss": 0.0057,
"step": 1512
},
{
"epoch": 0.11159430972211985,
"grad_norm": 37.5292854309082,
"learning_rate": 0.00012122577422339017,
"loss": 2.6301,
"step": 1514
},
{
"epoch": 0.11174172624751234,
"grad_norm": 0.23876796662807465,
"learning_rate": 0.000121225620326782,
"loss": 0.0067,
"step": 1516
},
{
"epoch": 0.11188914277290485,
"grad_norm": 0.14355158805847168,
"learning_rate": 0.00012122546622714046,
"loss": 0.0082,
"step": 1518
},
{
"epoch": 0.11203655929829734,
"grad_norm": 0.14837191998958588,
"learning_rate": 0.00012122531192446613,
"loss": 1.1954,
"step": 1520
},
{
"epoch": 0.11203655929829734,
"eval_1_ratio_diff": -0.006235385814497285,
"eval_accuracy": 0.8456742010911925,
"eval_f1": 0.8445839874411303,
"eval_loss": 0.5836101174354553,
"eval_precision": 0.8499210110584519,
"eval_recall": 0.8393135725429017,
"eval_runtime": 1440.5652,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 1520
},
{
"epoch": 0.11218397582368983,
"grad_norm": 1.0671629905700684,
"learning_rate": 0.0001212251574187595,
"loss": 0.0128,
"step": 1522
},
{
"epoch": 0.11233139234908233,
"grad_norm": 22.311914443969727,
"learning_rate": 0.00012122500271002106,
"loss": 1.1378,
"step": 1524
},
{
"epoch": 0.11247880887447483,
"grad_norm": 24.98206329345703,
"learning_rate": 0.00012122484779825135,
"loss": 1.4429,
"step": 1526
},
{
"epoch": 0.11262622539986733,
"grad_norm": 0.10400061309337616,
"learning_rate": 0.00012122469268345093,
"loss": 0.8205,
"step": 1528
},
{
"epoch": 0.11277364192525982,
"grad_norm": 0.1311234086751938,
"learning_rate": 0.00012122453736562024,
"loss": 0.0052,
"step": 1530
},
{
"epoch": 0.11292105845065231,
"grad_norm": 24.459693908691406,
"learning_rate": 0.00012122438184475986,
"loss": 0.8169,
"step": 1532
},
{
"epoch": 0.11306847497604482,
"grad_norm": 0.6599878072738647,
"learning_rate": 0.0001212242261208703,
"loss": 0.0172,
"step": 1534
},
{
"epoch": 0.11321589150143731,
"grad_norm": 0.7011798024177551,
"learning_rate": 0.00012122407019395205,
"loss": 0.0101,
"step": 1536
},
{
"epoch": 0.11321589150143731,
"eval_1_ratio_diff": -0.011691348402182389,
"eval_accuracy": 0.852689010132502,
"eval_f1": 0.850828729281768,
"eval_loss": 0.616263747215271,
"eval_precision": 0.8610223642172524,
"eval_recall": 0.8408736349453978,
"eval_runtime": 1440.4808,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 1536
},
{
"epoch": 0.11336330802682981,
"grad_norm": 0.11136188358068466,
"learning_rate": 0.00012122391406400568,
"loss": 0.0043,
"step": 1538
},
{
"epoch": 0.1135107245522223,
"grad_norm": 0.09410673379898071,
"learning_rate": 0.00012122375773103169,
"loss": 0.0029,
"step": 1540
},
{
"epoch": 0.1136581410776148,
"grad_norm": 0.0886264443397522,
"learning_rate": 0.00012122360119503061,
"loss": 0.0027,
"step": 1542
},
{
"epoch": 0.1138055576030073,
"grad_norm": 0.06019139662384987,
"learning_rate": 0.00012122344445600295,
"loss": 0.0012,
"step": 1544
},
{
"epoch": 0.11395297412839979,
"grad_norm": 24.27945327758789,
"learning_rate": 0.00012122328751394924,
"loss": 1.2476,
"step": 1546
},
{
"epoch": 0.1141003906537923,
"grad_norm": 0.07040827721357346,
"learning_rate": 0.00012122313036887001,
"loss": 1.05,
"step": 1548
},
{
"epoch": 0.11424780717918478,
"grad_norm": 21.743165969848633,
"learning_rate": 0.00012122297302076579,
"loss": 3.2561,
"step": 1550
},
{
"epoch": 0.11439522370457729,
"grad_norm": 0.21815018355846405,
"learning_rate": 0.00012122281546963711,
"loss": 0.0085,
"step": 1552
},
{
"epoch": 0.11439522370457729,
"eval_1_ratio_diff": 0.05845674201091189,
"eval_accuracy": 0.8620420888542478,
"eval_f1": 0.8695652173913043,
"eval_loss": 0.5588727593421936,
"eval_precision": 0.8240223463687151,
"eval_recall": 0.9204368174726989,
"eval_runtime": 1440.4266,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 1552
},
{
"epoch": 0.11454264022996978,
"grad_norm": 0.3978158235549927,
"learning_rate": 0.0001212226577154845,
"loss": 0.0087,
"step": 1554
},
{
"epoch": 0.11469005675536227,
"grad_norm": 0.07042258977890015,
"learning_rate": 0.00012122249975830848,
"loss": 0.0021,
"step": 1556
},
{
"epoch": 0.11483747328075478,
"grad_norm": 0.16607695817947388,
"learning_rate": 0.00012122234159810957,
"loss": 0.0024,
"step": 1558
},
{
"epoch": 0.11498488980614727,
"grad_norm": 0.11605281382799149,
"learning_rate": 0.00012122218323488832,
"loss": 0.0026,
"step": 1560
},
{
"epoch": 0.11513230633153977,
"grad_norm": 24.77876091003418,
"learning_rate": 0.00012122202466864525,
"loss": 1.4127,
"step": 1562
},
{
"epoch": 0.11527972285693226,
"grad_norm": 0.17567309737205505,
"learning_rate": 0.00012122186589938088,
"loss": 0.0037,
"step": 1564
},
{
"epoch": 0.11542713938232475,
"grad_norm": 0.19481156766414642,
"learning_rate": 0.00012122170692709576,
"loss": 0.6267,
"step": 1566
},
{
"epoch": 0.11557455590771726,
"grad_norm": 24.115211486816406,
"learning_rate": 0.00012122154775179043,
"loss": 0.8964,
"step": 1568
},
{
"epoch": 0.11557455590771726,
"eval_1_ratio_diff": 0.03975058456742009,
"eval_accuracy": 0.8713951675759938,
"eval_f1": 0.8762190547636909,
"eval_loss": 0.5382638573646545,
"eval_precision": 0.8439306358381503,
"eval_recall": 0.9110764430577223,
"eval_runtime": 1441.1253,
"eval_samples_per_second": 0.89,
"eval_steps_per_second": 0.445,
"step": 1568
},
{
"epoch": 0.11572197243310975,
"grad_norm": 0.140619158744812,
"learning_rate": 0.0001212213883734654,
"loss": 0.0054,
"step": 1570
},
{
"epoch": 0.11586938895850225,
"grad_norm": 0.12547695636749268,
"learning_rate": 0.00012122122879212122,
"loss": 0.3549,
"step": 1572
},
{
"epoch": 0.11601680548389474,
"grad_norm": 0.12592053413391113,
"learning_rate": 0.00012122106900775843,
"loss": 0.0105,
"step": 1574
},
{
"epoch": 0.11616422200928724,
"grad_norm": 0.11613775789737701,
"learning_rate": 0.00012122090902037755,
"loss": 0.0044,
"step": 1576
},
{
"epoch": 0.11631163853467974,
"grad_norm": 0.06327944993972778,
"learning_rate": 0.00012122074882997911,
"loss": 0.0052,
"step": 1578
},
{
"epoch": 0.11645905506007223,
"grad_norm": 0.26552170515060425,
"learning_rate": 0.00012122058843656367,
"loss": 0.0049,
"step": 1580
},
{
"epoch": 0.11660647158546474,
"grad_norm": 0.05181106925010681,
"learning_rate": 0.00012122042784013175,
"loss": 0.8965,
"step": 1582
},
{
"epoch": 0.11675388811085723,
"grad_norm": 0.07022108882665634,
"learning_rate": 0.0001212202670406839,
"loss": 1.4149,
"step": 1584
},
{
"epoch": 0.11675388811085723,
"eval_1_ratio_diff": 0.014029618082618822,
"eval_accuracy": 0.8565861262665627,
"eval_f1": 0.8584615384615385,
"eval_loss": 0.6103407144546509,
"eval_precision": 0.8467374810318664,
"eval_recall": 0.8705148205928237,
"eval_runtime": 1439.8898,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 1584
},
{
"epoch": 0.11690130463624972,
"grad_norm": 0.20126762986183167,
"learning_rate": 0.00012122010603822065,
"loss": 0.0077,
"step": 1586
},
{
"epoch": 0.11704872116164222,
"grad_norm": 0.09971367567777634,
"learning_rate": 0.00012121994483274255,
"loss": 0.0049,
"step": 1588
},
{
"epoch": 0.11719613768703471,
"grad_norm": 0.06467089802026749,
"learning_rate": 0.00012121978342425012,
"loss": 0.005,
"step": 1590
},
{
"epoch": 0.11734355421242722,
"grad_norm": 0.06981782615184784,
"learning_rate": 0.00012121962181274392,
"loss": 0.0028,
"step": 1592
},
{
"epoch": 0.11749097073781971,
"grad_norm": 0.12012193351984024,
"learning_rate": 0.00012121945999822448,
"loss": 0.0022,
"step": 1594
},
{
"epoch": 0.11763838726321221,
"grad_norm": 24.71665382385254,
"learning_rate": 0.00012121929798069236,
"loss": 1.756,
"step": 1596
},
{
"epoch": 0.1177858037886047,
"grad_norm": 0.31951653957366943,
"learning_rate": 0.0001212191357601481,
"loss": 0.004,
"step": 1598
},
{
"epoch": 0.1179332203139972,
"grad_norm": 0.03907225281000137,
"learning_rate": 0.0001212189733365922,
"loss": 0.0018,
"step": 1600
},
{
"epoch": 0.1179332203139972,
"eval_1_ratio_diff": 0.003117692907248615,
"eval_accuracy": 0.8628215120810601,
"eval_f1": 0.8631415241057543,
"eval_loss": 0.6979319453239441,
"eval_precision": 0.8604651162790697,
"eval_recall": 0.8658346333853354,
"eval_runtime": 1440.2188,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 1600
},
{
"epoch": 0.1180806368393897,
"grad_norm": 0.14489419758319855,
"learning_rate": 0.00012121881071002525,
"loss": 0.004,
"step": 1602
},
{
"epoch": 0.11822805336478219,
"grad_norm": 0.02964833378791809,
"learning_rate": 0.00012121864788044781,
"loss": 0.0014,
"step": 1604
},
{
"epoch": 0.1183754698901747,
"grad_norm": 0.1308467835187912,
"learning_rate": 0.00012121848484786039,
"loss": 1.2428,
"step": 1606
},
{
"epoch": 0.11852288641556719,
"grad_norm": 0.012196216732263565,
"learning_rate": 0.00012121832161226353,
"loss": 0.0039,
"step": 1608
},
{
"epoch": 0.11867030294095968,
"grad_norm": 26.82729721069336,
"learning_rate": 0.0001212181581736578,
"loss": 0.9557,
"step": 1610
},
{
"epoch": 0.11881771946635218,
"grad_norm": 55.06840515136719,
"learning_rate": 0.00012121799453204374,
"loss": 1.341,
"step": 1612
},
{
"epoch": 0.11896513599174467,
"grad_norm": 0.10571928322315216,
"learning_rate": 0.0001212178306874219,
"loss": 0.0018,
"step": 1614
},
{
"epoch": 0.11911255251713718,
"grad_norm": 23.6888427734375,
"learning_rate": 0.00012121766663979284,
"loss": 2.8349,
"step": 1616
},
{
"epoch": 0.11911255251713718,
"eval_1_ratio_diff": 0.001558846453624252,
"eval_accuracy": 0.8612626656274357,
"eval_f1": 0.8613707165109035,
"eval_loss": 0.6912267804145813,
"eval_precision": 0.8600311041990669,
"eval_recall": 0.8627145085803433,
"eval_runtime": 1439.8805,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 1616
},
{
"epoch": 0.11925996904252967,
"grad_norm": 0.013893804512917995,
"learning_rate": 0.00012121750238915708,
"loss": 0.0039,
"step": 1618
},
{
"epoch": 0.11940738556792216,
"grad_norm": 0.0326993353664875,
"learning_rate": 0.00012121733793551521,
"loss": 0.0071,
"step": 1620
},
{
"epoch": 0.11955480209331466,
"grad_norm": 0.021896235644817352,
"learning_rate": 0.00012121717327886775,
"loss": 0.4694,
"step": 1622
},
{
"epoch": 0.11970221861870715,
"grad_norm": 2.5759835243225098,
"learning_rate": 0.00012121700841921524,
"loss": 0.8411,
"step": 1624
},
{
"epoch": 0.11984963514409966,
"grad_norm": 6.512516021728516,
"learning_rate": 0.00012121684335655828,
"loss": 1.2897,
"step": 1626
},
{
"epoch": 0.11999705166949215,
"grad_norm": 1.0826752185821533,
"learning_rate": 0.00012121667809089738,
"loss": 0.067,
"step": 1628
},
{
"epoch": 0.12014446819488464,
"grad_norm": 0.5020477771759033,
"learning_rate": 0.00012121651262223313,
"loss": 0.0061,
"step": 1630
},
{
"epoch": 0.12029188472027715,
"grad_norm": 1.0385483503341675,
"learning_rate": 0.00012121634695056605,
"loss": 0.0162,
"step": 1632
},
{
"epoch": 0.12029188472027715,
"eval_1_ratio_diff": 0.024162120031176904,
"eval_accuracy": 0.8854247856586126,
"eval_f1": 0.8880426504188881,
"eval_loss": 0.4667970538139343,
"eval_precision": 0.8675595238095238,
"eval_recall": 0.9095163806552262,
"eval_runtime": 1440.04,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 1632
},
{
"epoch": 0.12043930124566964,
"grad_norm": 0.3257231116294861,
"learning_rate": 0.00012121618107589671,
"loss": 0.0073,
"step": 1634
},
{
"epoch": 0.12058671777106214,
"grad_norm": 0.17591340839862823,
"learning_rate": 0.00012121601499822568,
"loss": 0.7197,
"step": 1636
},
{
"epoch": 0.12073413429645463,
"grad_norm": 20.77132797241211,
"learning_rate": 0.0001212158487175535,
"loss": 1.5072,
"step": 1638
},
{
"epoch": 0.12088155082184712,
"grad_norm": 0.013665467500686646,
"learning_rate": 0.00012121568223388071,
"loss": 0.0014,
"step": 1640
},
{
"epoch": 0.12102896734723963,
"grad_norm": 0.368145614862442,
"learning_rate": 0.00012121551554720792,
"loss": 1.0871,
"step": 1642
},
{
"epoch": 0.12117638387263212,
"grad_norm": 0.2764877378940582,
"learning_rate": 0.00012121534865753563,
"loss": 0.0044,
"step": 1644
},
{
"epoch": 0.12132380039802462,
"grad_norm": 0.15803444385528564,
"learning_rate": 0.00012121518156486446,
"loss": 0.0058,
"step": 1646
},
{
"epoch": 0.12147121692341711,
"grad_norm": 21.269418716430664,
"learning_rate": 0.0001212150142691949,
"loss": 1.5637,
"step": 1648
},
{
"epoch": 0.12147121692341711,
"eval_1_ratio_diff": -0.03117692907248637,
"eval_accuracy": 0.8877630553390491,
"eval_f1": 0.8840579710144928,
"eval_loss": 0.5637651681900024,
"eval_precision": 0.913477537437604,
"eval_recall": 0.8564742589703588,
"eval_runtime": 1440.5567,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 1648
},
{
"epoch": 0.12161863344880962,
"grad_norm": 21.965253829956055,
"learning_rate": 0.00012121484677052757,
"loss": 0.9775,
"step": 1650
},
{
"epoch": 0.12176604997420211,
"grad_norm": 5.706968307495117,
"learning_rate": 0.000121214679068863,
"loss": 1.2593,
"step": 1652
},
{
"epoch": 0.1219134664995946,
"grad_norm": 78.91386413574219,
"learning_rate": 0.00012121451116420174,
"loss": 1.8529,
"step": 1654
},
{
"epoch": 0.1220608830249871,
"grad_norm": 20.03242301940918,
"learning_rate": 0.00012121434305654442,
"loss": 3.822,
"step": 1656
},
{
"epoch": 0.1222082995503796,
"grad_norm": 18.92554473876953,
"learning_rate": 0.00012121417474589151,
"loss": 1.7478,
"step": 1658
},
{
"epoch": 0.1223557160757721,
"grad_norm": 18.513463973999023,
"learning_rate": 0.00012121400623224365,
"loss": 0.9207,
"step": 1660
},
{
"epoch": 0.12250313260116459,
"grad_norm": 2.1414077281951904,
"learning_rate": 0.00012121383751560137,
"loss": 0.0559,
"step": 1662
},
{
"epoch": 0.12265054912655708,
"grad_norm": 1.9082714319229126,
"learning_rate": 0.00012121366859596523,
"loss": 0.0867,
"step": 1664
},
{
"epoch": 0.12265054912655708,
"eval_1_ratio_diff": -0.11223694466095091,
"eval_accuracy": 0.8316445830085737,
"eval_f1": 0.8101933216168717,
"eval_loss": 0.46373099088668823,
"eval_precision": 0.9275653923541247,
"eval_recall": 0.719188767550702,
"eval_runtime": 1439.8045,
"eval_samples_per_second": 0.891,
"eval_steps_per_second": 0.446,
"step": 1664
},
{
"epoch": 0.12279796565194959,
"grad_norm": 17.66658592224121,
"learning_rate": 0.0001212134994733358,
"loss": 0.694,
"step": 1666
},
{
"epoch": 0.12294538217734208,
"grad_norm": 0.5736209750175476,
"learning_rate": 0.00012121333014771369,
"loss": 0.5414,
"step": 1668
},
{
"epoch": 0.12309279870273458,
"grad_norm": 16.726125717163086,
"learning_rate": 0.0001212131606190994,
"loss": 2.7414,
"step": 1670
},
{
"epoch": 0.12324021522812707,
"grad_norm": 1.1649620532989502,
"learning_rate": 0.00012121299088749353,
"loss": 0.0285,
"step": 1672
},
{
"epoch": 0.12338763175351956,
"grad_norm": 18.4560604095459,
"learning_rate": 0.00012121282095289665,
"loss": 0.9068,
"step": 1674
},
{
"epoch": 0.12353504827891207,
"grad_norm": 0.3899083137512207,
"learning_rate": 0.00012121265081530934,
"loss": 0.0192,
"step": 1676
},
{
"epoch": 0.12368246480430456,
"grad_norm": 0.6309532523155212,
"learning_rate": 0.00012121248047473215,
"loss": 0.0398,
"step": 1678
},
{
"epoch": 0.12382988132969706,
"grad_norm": 25.81404685974121,
"learning_rate": 0.00012121230993116564,
"loss": 0.9268,
"step": 1680
},
{
"epoch": 0.12382988132969706,
"eval_1_ratio_diff": -0.05222135619641466,
"eval_accuracy": 0.8620420888542478,
"eval_f1": 0.854320987654321,
"eval_loss": 0.49660980701446533,
"eval_precision": 0.9041811846689896,
"eval_recall": 0.8096723868954758,
"eval_runtime": 1439.1088,
"eval_samples_per_second": 0.892,
"eval_steps_per_second": 0.446,
"step": 1680
},
{
"epoch": 0.12397729785508955,
"grad_norm": 0.17194198071956635,
"learning_rate": 0.00012121213918461043,
"loss": 0.0091,
"step": 1682
},
{
"epoch": 0.12412471438048205,
"grad_norm": 0.1233774870634079,
"learning_rate": 0.00012121196823506704,
"loss": 0.033,
"step": 1684
},
{
"epoch": 0.12427213090587455,
"grad_norm": 0.12911829352378845,
"learning_rate": 0.00012121179708253609,
"loss": 0.9894,
"step": 1686
},
{
"epoch": 0.12441954743126704,
"grad_norm": 12.796908378601074,
"learning_rate": 0.00012121162572701811,
"loss": 0.2167,
"step": 1688
},
{
"epoch": 0.12456696395665955,
"grad_norm": 19.411853790283203,
"learning_rate": 0.0001212114541685137,
"loss": 1.1343,
"step": 1690
},
{
"epoch": 0.12471438048205204,
"grad_norm": 2.125748872756958,
"learning_rate": 0.00012121128240702341,
"loss": 0.0167,
"step": 1692
},
{
"epoch": 0.12486179700744453,
"grad_norm": 0.23534013330936432,
"learning_rate": 0.00012121111044254785,
"loss": 0.0099,
"step": 1694
},
{
"epoch": 0.12500921353283703,
"grad_norm": 0.2723231911659241,
"learning_rate": 0.00012121093827508758,
"loss": 0.0222,
"step": 1696
},
{
"epoch": 0.12500921353283703,
"eval_1_ratio_diff": -0.05144193296960253,
"eval_accuracy": 0.8721745908028059,
"eval_f1": 0.8651315789473685,
"eval_loss": 0.608511209487915,
"eval_precision": 0.9147826086956522,
"eval_recall": 0.8205928237129485,
"eval_runtime": 1438.9329,
"eval_samples_per_second": 0.892,
"eval_steps_per_second": 0.446,
"step": 1696
},
{
"epoch": 0.12515663005822952,
"grad_norm": 0.1575896292924881,
"learning_rate": 0.00012121076590464316,
"loss": 0.0045,
"step": 1698
},
{
"epoch": 0.12530404658362201,
"grad_norm": 25.341609954833984,
"learning_rate": 0.00012121059333121521,
"loss": 2.9943,
"step": 1700
},
{
"epoch": 0.12545146310901453,
"grad_norm": 0.13375264406204224,
"learning_rate": 0.00012121042055480427,
"loss": 0.0033,
"step": 1702
},
{
"epoch": 0.12559887963440702,
"grad_norm": 0.06750854849815369,
"learning_rate": 0.00012121024757541094,
"loss": 0.0024,
"step": 1704
},
{
"epoch": 0.12574629615979951,
"grad_norm": 0.05674993619322777,
"learning_rate": 0.00012121007439303577,
"loss": 1.2325,
"step": 1706
},
{
"epoch": 0.125893712685192,
"grad_norm": 0.06746107339859009,
"learning_rate": 0.00012120990100767938,
"loss": 0.0016,
"step": 1708
},
{
"epoch": 0.1260411292105845,
"grad_norm": 18.890642166137695,
"learning_rate": 0.00012120972741934233,
"loss": 1.5509,
"step": 1710
},
{
"epoch": 0.12618854573597701,
"grad_norm": 0.0601690337061882,
"learning_rate": 0.00012120955362802522,
"loss": 0.0042,
"step": 1712
},
{
"epoch": 0.12618854573597701,
"eval_1_ratio_diff": -0.21278254091971943,
"eval_accuracy": 0.7575993764614185,
"eval_f1": 0.6917740336967294,
"eval_loss": 1.260048747062683,
"eval_precision": 0.9483695652173914,
"eval_recall": 0.5444617784711389,
"eval_runtime": 1438.8451,
"eval_samples_per_second": 0.892,
"eval_steps_per_second": 0.446,
"step": 1712
},
{
"epoch": 0.1263359622613695,
"grad_norm": 31.62714385986328,
"learning_rate": 0.00012120937963372859,
"loss": 2.3397,
"step": 1714
},
{
"epoch": 0.126483378786762,
"grad_norm": 0.09423007071018219,
"learning_rate": 0.00012120920543645306,
"loss": 0.0056,
"step": 1716
},
{
"epoch": 0.1266307953121545,
"grad_norm": 18.73729133605957,
"learning_rate": 0.0001212090310361992,
"loss": 1.3417,
"step": 1718
},
{
"epoch": 0.12677821183754698,
"grad_norm": 0.16277751326560974,
"learning_rate": 0.0001212088564329676,
"loss": 0.0088,
"step": 1720
},
{
"epoch": 0.1269256283629395,
"grad_norm": 18.30181884765625,
"learning_rate": 0.00012120868162675886,
"loss": 0.966,
"step": 1722
},
{
"epoch": 0.127073044888332,
"grad_norm": 0.3613678812980652,
"learning_rate": 0.00012120850661757353,
"loss": 1.0053,
"step": 1724
},
{
"epoch": 0.12722046141372448,
"grad_norm": 0.7345402836799622,
"learning_rate": 0.00012120833140541222,
"loss": 1.4195,
"step": 1726
},
{
"epoch": 0.12736787793911697,
"grad_norm": 1.3485078811645508,
"learning_rate": 0.00012120815599027552,
"loss": 0.0247,
"step": 1728
},
{
"epoch": 0.12736787793911697,
"eval_1_ratio_diff": -0.04130943102104445,
"eval_accuracy": 0.8651597817614964,
"eval_f1": 0.8592351505288853,
"eval_loss": 0.4965825080871582,
"eval_precision": 0.8979591836734694,
"eval_recall": 0.8237129485179407,
"eval_runtime": 1438.3328,
"eval_samples_per_second": 0.892,
"eval_steps_per_second": 0.446,
"step": 1728
},
{
"epoch": 0.12751529446450946,
"grad_norm": 17.94972801208496,
"learning_rate": 0.000121207980372164,
"loss": 0.7518,
"step": 1730
},
{
"epoch": 0.12766271098990198,
"grad_norm": 1.6150920391082764,
"learning_rate": 0.00012120780455107827,
"loss": 0.0328,
"step": 1732
},
{
"epoch": 0.12781012751529447,
"grad_norm": 0.22876843810081482,
"learning_rate": 0.00012120762852701892,
"loss": 0.0105,
"step": 1734
},
{
"epoch": 0.12795754404068696,
"grad_norm": 0.1126691922545433,
"learning_rate": 0.0001212074522999865,
"loss": 0.0038,
"step": 1736
},
{
"epoch": 0.12810496056607945,
"grad_norm": 0.5277115702629089,
"learning_rate": 0.00012120727586998164,
"loss": 0.0094,
"step": 1738
},
{
"epoch": 0.12825237709147194,
"grad_norm": 0.11928611248731613,
"learning_rate": 0.00012120709923700492,
"loss": 0.0054,
"step": 1740
},
{
"epoch": 0.12839979361686446,
"grad_norm": 22.84393310546875,
"learning_rate": 0.00012120692240105693,
"loss": 1.7358,
"step": 1742
},
{
"epoch": 0.12854721014225695,
"grad_norm": 0.08426441997289658,
"learning_rate": 0.0001212067453621383,
"loss": 0.0029,
"step": 1744
},
{
"epoch": 0.12854721014225695,
"eval_1_ratio_diff": -0.014029618082618878,
"eval_accuracy": 0.8784099766173032,
"eval_f1": 0.8765822784810127,
"eval_loss": 0.6492618322372437,
"eval_precision": 0.8892455858747994,
"eval_recall": 0.8642745709828393,
"eval_runtime": 1438.7827,
"eval_samples_per_second": 0.892,
"eval_steps_per_second": 0.446,
"step": 1744
},
{
"epoch": 0.12869462666764944,
"grad_norm": 22.079143524169922,
"learning_rate": 0.00012120656812024955,
"loss": 1.2809,
"step": 1746
},
{
"epoch": 0.12884204319304193,
"grad_norm": 21.899768829345703,
"learning_rate": 0.00012120639067539131,
"loss": 3.0657,
"step": 1748
},
{
"epoch": 0.12898945971843442,
"grad_norm": 0.1824941784143448,
"learning_rate": 0.0001212062130275642,
"loss": 0.0032,
"step": 1750
},
{
"epoch": 0.12913687624382694,
"grad_norm": 0.1769951432943344,
"learning_rate": 0.00012120603517676877,
"loss": 1.2614,
"step": 1752
},
{
"epoch": 0.12928429276921943,
"grad_norm": 21.305864334106445,
"learning_rate": 0.00012120585712300566,
"loss": 1.0725,
"step": 1754
},
{
"epoch": 0.12943170929461192,
"grad_norm": 0.44233354926109314,
"learning_rate": 0.00012120567886627544,
"loss": 0.9641,
"step": 1756
},
{
"epoch": 0.12957912582000441,
"grad_norm": 0.2779258191585541,
"learning_rate": 0.00012120550040657871,
"loss": 0.0096,
"step": 1758
},
{
"epoch": 0.1297265423453969,
"grad_norm": 22.293994903564453,
"learning_rate": 0.00012120532174391606,
"loss": 0.9558,
"step": 1760
},
{
"epoch": 0.1297265423453969,
"eval_1_ratio_diff": 0.031956352299298496,
"eval_accuracy": 0.8901013250194856,
"eval_f1": 0.8934240362811792,
"eval_loss": 0.45321086049079895,
"eval_precision": 0.8665689149560117,
"eval_recall": 0.921996879875195,
"eval_runtime": 1438.3028,
"eval_samples_per_second": 0.892,
"eval_steps_per_second": 0.446,
"step": 1760
},
{
"epoch": 0.12987395887078942,
"grad_norm": 0.15532580018043518,
"learning_rate": 0.00012120514287828811,
"loss": 0.0082,
"step": 1762
},
{
"epoch": 0.13002137539618192,
"grad_norm": 27.137800216674805,
"learning_rate": 0.00012120496380969545,
"loss": 0.8253,
"step": 1764
},
{
"epoch": 0.1301687919215744,
"grad_norm": 0.12127237766981125,
"learning_rate": 0.00012120478453813868,
"loss": 0.007,
"step": 1766
},
{
"epoch": 0.1303162084469669,
"grad_norm": 0.12471210211515427,
"learning_rate": 0.00012120460506361839,
"loss": 0.0118,
"step": 1768
},
{
"epoch": 0.1304636249723594,
"grad_norm": 45.0229377746582,
"learning_rate": 0.0001212044253861352,
"loss": 3.5846,
"step": 1770
},
{
"epoch": 0.1306110414977519,
"grad_norm": 0.4128153622150421,
"learning_rate": 0.0001212042455056897,
"loss": 0.0073,
"step": 1772
},
{
"epoch": 0.1307584580231444,
"grad_norm": 0.40481987595558167,
"learning_rate": 0.0001212040654222825,
"loss": 0.0072,
"step": 1774
},
{
"epoch": 0.1309058745485369,
"grad_norm": 0.11055589467287064,
"learning_rate": 0.00012120388513591419,
"loss": 1.0826,
"step": 1776
},
{
"epoch": 0.1309058745485369,
"eval_1_ratio_diff": 0.1200311769290725,
"eval_accuracy": 0.8332034294621979,
"eval_f1": 0.850974930362117,
"eval_loss": 0.6285108923912048,
"eval_precision": 0.7685534591194969,
"eval_recall": 0.953198127925117,
"eval_runtime": 1438.3285,
"eval_samples_per_second": 0.892,
"eval_steps_per_second": 0.446,
"step": 1776
},
{
"epoch": 0.13105329107392938,
"grad_norm": 0.12451104074716568,
"learning_rate": 0.0001212037046465854,
"loss": 1.0074,
"step": 1778
},
{
"epoch": 0.1312007075993219,
"grad_norm": 0.27884507179260254,
"learning_rate": 0.0001212035239542967,
"loss": 0.0129,
"step": 1780
},
{
"epoch": 0.1313481241247144,
"grad_norm": 0.992557168006897,
"learning_rate": 0.00012120334305904872,
"loss": 1.4174,
"step": 1782
},
{
"epoch": 0.13149554065010688,
"grad_norm": 0.9067917466163635,
"learning_rate": 0.00012120316196084206,
"loss": 1.435,
"step": 1784
},
{
"epoch": 0.13164295717549937,
"grad_norm": 20.08501625061035,
"learning_rate": 0.00012120298065967733,
"loss": 1.7277,
"step": 1786
},
{
"epoch": 0.13179037370089186,
"grad_norm": 0.20194768905639648,
"learning_rate": 0.00012120279915555515,
"loss": 0.005,
"step": 1788
},
{
"epoch": 0.13193779022628438,
"grad_norm": 0.29110512137413025,
"learning_rate": 0.0001212026174484761,
"loss": 0.0065,
"step": 1790
},
{
"epoch": 0.13208520675167687,
"grad_norm": 0.3067338764667511,
"learning_rate": 0.00012120243553844079,
"loss": 0.006,
"step": 1792
},
{
"epoch": 0.13208520675167687,
"eval_1_ratio_diff": -0.002338269680436489,
"eval_accuracy": 0.8978955572876072,
"eval_f1": 0.8975762314308053,
"eval_loss": 0.42508459091186523,
"eval_precision": 0.8996865203761756,
"eval_recall": 0.8954758190327613,
"eval_runtime": 1439.0957,
"eval_samples_per_second": 0.892,
"eval_steps_per_second": 0.446,
"step": 1792
}
],
"logging_steps": 2,
"max_steps": 108536,
"num_input_tokens_seen": 0,
"num_train_epochs": 8,
"save_steps": 64,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 1000,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.5936070605815808e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}