{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9975440032746623,
"eval_steps": 500,
"global_step": 1220,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 0.3300960063934326,
"learning_rate": 2.9999999999999997e-05,
"loss": 0.9966,
"step": 1
},
{
"epoch": 0.0,
"grad_norm": 0.4113194942474365,
"learning_rate": 5.9999999999999995e-05,
"loss": 1.1253,
"step": 2
},
{
"epoch": 0.0,
"grad_norm": 0.2486647665500641,
"learning_rate": 8.999999999999999e-05,
"loss": 1.0721,
"step": 3
},
{
"epoch": 0.01,
"grad_norm": 0.2249160259962082,
"learning_rate": 0.00011999999999999999,
"loss": 0.9033,
"step": 4
},
{
"epoch": 0.01,
"grad_norm": 0.3706735074520111,
"learning_rate": 0.00015,
"loss": 1.0498,
"step": 5
},
{
"epoch": 0.01,
"grad_norm": 0.28104931116104126,
"learning_rate": 0.00017999999999999998,
"loss": 0.9108,
"step": 6
},
{
"epoch": 0.01,
"grad_norm": 0.27497801184654236,
"learning_rate": 0.00020999999999999998,
"loss": 0.9038,
"step": 7
},
{
"epoch": 0.01,
"grad_norm": 0.30283215641975403,
"learning_rate": 0.00023999999999999998,
"loss": 0.8605,
"step": 8
},
{
"epoch": 0.01,
"grad_norm": 0.33457252383232117,
"learning_rate": 0.00027,
"loss": 0.9049,
"step": 9
},
{
"epoch": 0.02,
"grad_norm": 0.37725692987442017,
"learning_rate": 0.0003,
"loss": 0.772,
"step": 10
},
{
"epoch": 0.02,
"grad_norm": 0.27986466884613037,
"learning_rate": 0.00029975206611570246,
"loss": 0.7666,
"step": 11
},
{
"epoch": 0.02,
"grad_norm": 0.30687034130096436,
"learning_rate": 0.00029950413223140494,
"loss": 0.8312,
"step": 12
},
{
"epoch": 0.02,
"grad_norm": 0.3321741819381714,
"learning_rate": 0.0002992561983471074,
"loss": 0.8308,
"step": 13
},
{
"epoch": 0.02,
"grad_norm": 0.29080134630203247,
"learning_rate": 0.0002990082644628099,
"loss": 0.7597,
"step": 14
},
{
"epoch": 0.02,
"grad_norm": 0.33823856711387634,
"learning_rate": 0.0002987603305785124,
"loss": 0.8693,
"step": 15
},
{
"epoch": 0.03,
"grad_norm": 0.3461182117462158,
"learning_rate": 0.0002985123966942149,
"loss": 1.0571,
"step": 16
},
{
"epoch": 0.03,
"grad_norm": 0.22306275367736816,
"learning_rate": 0.0002982644628099173,
"loss": 0.7706,
"step": 17
},
{
"epoch": 0.03,
"grad_norm": 154.4940643310547,
"learning_rate": 0.0002980165289256198,
"loss": 2.6519,
"step": 18
},
{
"epoch": 0.03,
"grad_norm": 0.22956405580043793,
"learning_rate": 0.00029776859504132227,
"loss": 0.6897,
"step": 19
},
{
"epoch": 0.03,
"grad_norm": 0.25711989402770996,
"learning_rate": 0.00029752066115702476,
"loss": 0.7338,
"step": 20
},
{
"epoch": 0.03,
"grad_norm": 0.2565441131591797,
"learning_rate": 0.00029727272727272724,
"loss": 0.8211,
"step": 21
},
{
"epoch": 0.04,
"grad_norm": 0.2437434047460556,
"learning_rate": 0.0002970247933884297,
"loss": 0.8027,
"step": 22
},
{
"epoch": 0.04,
"grad_norm": 0.21284469962120056,
"learning_rate": 0.0002967768595041322,
"loss": 0.7944,
"step": 23
},
{
"epoch": 0.04,
"grad_norm": 0.23338356614112854,
"learning_rate": 0.0002965289256198347,
"loss": 0.7696,
"step": 24
},
{
"epoch": 0.04,
"grad_norm": 0.25512659549713135,
"learning_rate": 0.0002962809917355372,
"loss": 0.7693,
"step": 25
},
{
"epoch": 0.04,
"grad_norm": 0.19500921666622162,
"learning_rate": 0.0002960330578512396,
"loss": 0.7599,
"step": 26
},
{
"epoch": 0.04,
"grad_norm": 0.2554054260253906,
"learning_rate": 0.00029578512396694214,
"loss": 0.966,
"step": 27
},
{
"epoch": 0.05,
"grad_norm": 0.17682747542858124,
"learning_rate": 0.0002955371900826446,
"loss": 0.676,
"step": 28
},
{
"epoch": 0.05,
"grad_norm": 0.20516635477542877,
"learning_rate": 0.0002952892561983471,
"loss": 0.8144,
"step": 29
},
{
"epoch": 0.05,
"grad_norm": 0.3275119662284851,
"learning_rate": 0.0002950413223140496,
"loss": 0.7704,
"step": 30
},
{
"epoch": 0.05,
"grad_norm": 0.22231778502464294,
"learning_rate": 0.000294793388429752,
"loss": 0.7614,
"step": 31
},
{
"epoch": 0.05,
"grad_norm": 0.17065812647342682,
"learning_rate": 0.0002945454545454545,
"loss": 0.5634,
"step": 32
},
{
"epoch": 0.05,
"grad_norm": 0.1771956831216812,
"learning_rate": 0.000294297520661157,
"loss": 0.7607,
"step": 33
},
{
"epoch": 0.06,
"grad_norm": 0.26693442463874817,
"learning_rate": 0.00029404958677685947,
"loss": 0.8171,
"step": 34
},
{
"epoch": 0.06,
"grad_norm": 1.409070611000061,
"learning_rate": 0.00029380165289256196,
"loss": 0.7791,
"step": 35
},
{
"epoch": 0.06,
"grad_norm": 0.20727217197418213,
"learning_rate": 0.00029355371900826444,
"loss": 0.7357,
"step": 36
},
{
"epoch": 0.06,
"grad_norm": 0.2145707905292511,
"learning_rate": 0.0002933057851239669,
"loss": 0.8458,
"step": 37
},
{
"epoch": 0.06,
"grad_norm": 0.2068527340888977,
"learning_rate": 0.0002930578512396694,
"loss": 0.78,
"step": 38
},
{
"epoch": 0.06,
"grad_norm": 0.22432388365268707,
"learning_rate": 0.00029280991735537184,
"loss": 0.8523,
"step": 39
},
{
"epoch": 0.07,
"grad_norm": 0.19982610642910004,
"learning_rate": 0.0002925619834710743,
"loss": 0.7372,
"step": 40
},
{
"epoch": 0.07,
"grad_norm": 6.248472213745117,
"learning_rate": 0.00029231404958677686,
"loss": 0.7399,
"step": 41
},
{
"epoch": 0.07,
"grad_norm": 0.2269737422466278,
"learning_rate": 0.00029206611570247934,
"loss": 0.7842,
"step": 42
},
{
"epoch": 0.07,
"grad_norm": 0.23117898404598236,
"learning_rate": 0.0002918181818181818,
"loss": 0.7111,
"step": 43
},
{
"epoch": 0.07,
"grad_norm": 0.22466522455215454,
"learning_rate": 0.00029157024793388425,
"loss": 0.8979,
"step": 44
},
{
"epoch": 0.07,
"grad_norm": 0.20770332217216492,
"learning_rate": 0.00029132231404958674,
"loss": 0.774,
"step": 45
},
{
"epoch": 0.08,
"grad_norm": 0.2376495748758316,
"learning_rate": 0.0002910743801652892,
"loss": 0.7216,
"step": 46
},
{
"epoch": 0.08,
"grad_norm": 0.2470778226852417,
"learning_rate": 0.0002908264462809917,
"loss": 0.7369,
"step": 47
},
{
"epoch": 0.08,
"grad_norm": 0.23465900123119354,
"learning_rate": 0.0002905785123966942,
"loss": 0.7528,
"step": 48
},
{
"epoch": 0.08,
"grad_norm": 0.5718627572059631,
"learning_rate": 0.00029033057851239667,
"loss": 0.7535,
"step": 49
},
{
"epoch": 0.08,
"grad_norm": 0.21493370831012726,
"learning_rate": 0.00029008264462809916,
"loss": 0.8593,
"step": 50
},
{
"epoch": 0.08,
"grad_norm": 0.21197210252285004,
"learning_rate": 0.00028983471074380164,
"loss": 0.8013,
"step": 51
},
{
"epoch": 0.09,
"grad_norm": 0.20836398005485535,
"learning_rate": 0.0002895867768595041,
"loss": 0.7905,
"step": 52
},
{
"epoch": 0.09,
"grad_norm": 0.2096678912639618,
"learning_rate": 0.00028933884297520655,
"loss": 0.6754,
"step": 53
},
{
"epoch": 0.09,
"grad_norm": 0.25898435711860657,
"learning_rate": 0.00028909090909090904,
"loss": 0.7725,
"step": 54
},
{
"epoch": 0.09,
"grad_norm": 0.23370735347270966,
"learning_rate": 0.0002888429752066116,
"loss": 0.7007,
"step": 55
},
{
"epoch": 0.09,
"grad_norm": 0.23006942868232727,
"learning_rate": 0.00028859504132231406,
"loss": 0.7534,
"step": 56
},
{
"epoch": 0.09,
"grad_norm": 0.20855402946472168,
"learning_rate": 0.0002883471074380165,
"loss": 0.9491,
"step": 57
},
{
"epoch": 0.09,
"grad_norm": 0.24340493977069855,
"learning_rate": 0.00028809917355371897,
"loss": 0.8089,
"step": 58
},
{
"epoch": 0.1,
"grad_norm": 0.20169466733932495,
"learning_rate": 0.00028785123966942145,
"loss": 0.64,
"step": 59
},
{
"epoch": 0.1,
"grad_norm": 0.23272906243801117,
"learning_rate": 0.00028760330578512394,
"loss": 0.8456,
"step": 60
},
{
"epoch": 0.1,
"grad_norm": 0.1767100691795349,
"learning_rate": 0.0002873553719008264,
"loss": 0.6686,
"step": 61
},
{
"epoch": 0.1,
"grad_norm": 0.24511106312274933,
"learning_rate": 0.0002871074380165289,
"loss": 0.6998,
"step": 62
},
{
"epoch": 0.1,
"grad_norm": 0.22284479439258575,
"learning_rate": 0.0002868595041322314,
"loss": 0.6699,
"step": 63
},
{
"epoch": 0.1,
"grad_norm": 0.21842750906944275,
"learning_rate": 0.00028661157024793387,
"loss": 0.7413,
"step": 64
},
{
"epoch": 0.11,
"grad_norm": 0.2669163644313812,
"learning_rate": 0.00028636363636363636,
"loss": 0.931,
"step": 65
},
{
"epoch": 0.11,
"grad_norm": 0.1864808052778244,
"learning_rate": 0.0002861157024793388,
"loss": 0.5652,
"step": 66
},
{
"epoch": 0.11,
"grad_norm": 0.18369853496551514,
"learning_rate": 0.00028586776859504127,
"loss": 0.6847,
"step": 67
},
{
"epoch": 0.11,
"grad_norm": 0.22353056073188782,
"learning_rate": 0.00028561983471074375,
"loss": 0.598,
"step": 68
},
{
"epoch": 0.11,
"grad_norm": 0.20269523561000824,
"learning_rate": 0.0002853719008264463,
"loss": 0.8688,
"step": 69
},
{
"epoch": 0.11,
"grad_norm": 0.2291198968887329,
"learning_rate": 0.0002851239669421488,
"loss": 0.7535,
"step": 70
},
{
"epoch": 0.12,
"grad_norm": 0.22033120691776276,
"learning_rate": 0.0002848760330578512,
"loss": 0.8377,
"step": 71
},
{
"epoch": 0.12,
"grad_norm": 0.2687983214855194,
"learning_rate": 0.0002846280991735537,
"loss": 0.6926,
"step": 72
},
{
"epoch": 0.12,
"grad_norm": 0.1933681070804596,
"learning_rate": 0.00028438016528925617,
"loss": 0.6276,
"step": 73
},
{
"epoch": 0.12,
"grad_norm": 0.2820705473423004,
"learning_rate": 0.00028413223140495865,
"loss": 0.848,
"step": 74
},
{
"epoch": 0.12,
"grad_norm": 0.19532324373722076,
"learning_rate": 0.00028388429752066114,
"loss": 0.6198,
"step": 75
},
{
"epoch": 0.12,
"grad_norm": 0.25057846307754517,
"learning_rate": 0.0002836363636363636,
"loss": 0.6838,
"step": 76
},
{
"epoch": 0.13,
"grad_norm": 0.2168462574481964,
"learning_rate": 0.0002833884297520661,
"loss": 0.7885,
"step": 77
},
{
"epoch": 0.13,
"grad_norm": 0.2106674313545227,
"learning_rate": 0.0002831404958677686,
"loss": 0.6757,
"step": 78
},
{
"epoch": 0.13,
"grad_norm": 0.24460363388061523,
"learning_rate": 0.000282892561983471,
"loss": 0.7414,
"step": 79
},
{
"epoch": 0.13,
"grad_norm": 0.3706071078777313,
"learning_rate": 0.0002826446280991735,
"loss": 0.621,
"step": 80
},
{
"epoch": 0.13,
"grad_norm": 0.2251998782157898,
"learning_rate": 0.000282396694214876,
"loss": 0.7453,
"step": 81
},
{
"epoch": 0.13,
"grad_norm": 0.24521738290786743,
"learning_rate": 0.00028214876033057847,
"loss": 0.6985,
"step": 82
},
{
"epoch": 0.14,
"grad_norm": 0.2262742966413498,
"learning_rate": 0.000281900826446281,
"loss": 0.6316,
"step": 83
},
{
"epoch": 0.14,
"grad_norm": 0.19723354279994965,
"learning_rate": 0.00028165289256198344,
"loss": 0.4798,
"step": 84
},
{
"epoch": 0.14,
"grad_norm": 0.20684833824634552,
"learning_rate": 0.0002814049586776859,
"loss": 0.7993,
"step": 85
},
{
"epoch": 0.14,
"grad_norm": 0.19534814357757568,
"learning_rate": 0.0002811570247933884,
"loss": 0.7735,
"step": 86
},
{
"epoch": 0.14,
"grad_norm": 0.2585545480251312,
"learning_rate": 0.0002809090909090909,
"loss": 0.8126,
"step": 87
},
{
"epoch": 0.14,
"grad_norm": 0.2510583996772766,
"learning_rate": 0.00028066115702479337,
"loss": 0.6973,
"step": 88
},
{
"epoch": 0.15,
"grad_norm": 0.1884051263332367,
"learning_rate": 0.00028041322314049585,
"loss": 0.701,
"step": 89
},
{
"epoch": 0.15,
"grad_norm": 0.2526257038116455,
"learning_rate": 0.00028016528925619834,
"loss": 0.7132,
"step": 90
},
{
"epoch": 0.15,
"grad_norm": 0.200734481215477,
"learning_rate": 0.0002799173553719008,
"loss": 0.7024,
"step": 91
},
{
"epoch": 0.15,
"grad_norm": 0.2404022514820099,
"learning_rate": 0.0002796694214876033,
"loss": 0.704,
"step": 92
},
{
"epoch": 0.15,
"grad_norm": 0.23063871264457703,
"learning_rate": 0.00027942148760330573,
"loss": 0.6312,
"step": 93
},
{
"epoch": 0.15,
"grad_norm": 0.1759747564792633,
"learning_rate": 0.0002791735537190082,
"loss": 0.6577,
"step": 94
},
{
"epoch": 0.16,
"grad_norm": 0.2009582370519638,
"learning_rate": 0.0002789256198347107,
"loss": 0.8036,
"step": 95
},
{
"epoch": 0.16,
"grad_norm": 0.2200164943933487,
"learning_rate": 0.0002786776859504132,
"loss": 0.7101,
"step": 96
},
{
"epoch": 0.16,
"grad_norm": 0.19693537056446075,
"learning_rate": 0.00027842975206611567,
"loss": 0.6221,
"step": 97
},
{
"epoch": 0.16,
"grad_norm": 0.23269779980182648,
"learning_rate": 0.00027818181818181815,
"loss": 0.8264,
"step": 98
},
{
"epoch": 0.16,
"grad_norm": 0.2440226823091507,
"learning_rate": 0.00027793388429752064,
"loss": 0.8051,
"step": 99
},
{
"epoch": 0.16,
"grad_norm": 0.2307034134864807,
"learning_rate": 0.0002776859504132231,
"loss": 0.631,
"step": 100
},
{
"epoch": 0.17,
"grad_norm": 0.2530567944049835,
"learning_rate": 0.0002774380165289256,
"loss": 0.8616,
"step": 101
},
{
"epoch": 0.17,
"grad_norm": 0.2808806300163269,
"learning_rate": 0.0002771900826446281,
"loss": 0.8333,
"step": 102
},
{
"epoch": 0.17,
"grad_norm": 0.20667941868305206,
"learning_rate": 0.00027694214876033057,
"loss": 0.7212,
"step": 103
},
{
"epoch": 0.17,
"grad_norm": 0.17540781199932098,
"learning_rate": 0.00027669421487603305,
"loss": 0.5964,
"step": 104
},
{
"epoch": 0.17,
"grad_norm": 0.2526637613773346,
"learning_rate": 0.00027644628099173554,
"loss": 0.6868,
"step": 105
},
{
"epoch": 0.17,
"grad_norm": 0.2137339860200882,
"learning_rate": 0.00027619834710743797,
"loss": 0.6155,
"step": 106
},
{
"epoch": 0.18,
"grad_norm": 0.21061092615127563,
"learning_rate": 0.00027595041322314045,
"loss": 0.813,
"step": 107
},
{
"epoch": 0.18,
"grad_norm": 0.21619191765785217,
"learning_rate": 0.00027570247933884293,
"loss": 0.8046,
"step": 108
},
{
"epoch": 0.18,
"grad_norm": 0.2212170660495758,
"learning_rate": 0.0002754545454545454,
"loss": 0.6706,
"step": 109
},
{
"epoch": 0.18,
"grad_norm": 0.23427413403987885,
"learning_rate": 0.0002752066115702479,
"loss": 0.7152,
"step": 110
},
{
"epoch": 0.18,
"grad_norm": 0.20566123723983765,
"learning_rate": 0.0002749586776859504,
"loss": 0.6568,
"step": 111
},
{
"epoch": 0.18,
"grad_norm": 0.22977930307388306,
"learning_rate": 0.00027471074380165287,
"loss": 0.7832,
"step": 112
},
{
"epoch": 0.19,
"grad_norm": 0.28307485580444336,
"learning_rate": 0.00027446280991735535,
"loss": 0.7446,
"step": 113
},
{
"epoch": 0.19,
"grad_norm": 0.19567596912384033,
"learning_rate": 0.00027421487603305784,
"loss": 0.6394,
"step": 114
},
{
"epoch": 0.19,
"grad_norm": 0.24577689170837402,
"learning_rate": 0.0002739669421487603,
"loss": 0.6389,
"step": 115
},
{
"epoch": 0.19,
"grad_norm": 0.2180463820695877,
"learning_rate": 0.0002737190082644628,
"loss": 0.7814,
"step": 116
},
{
"epoch": 0.19,
"grad_norm": 0.19546380639076233,
"learning_rate": 0.0002734710743801653,
"loss": 0.8312,
"step": 117
},
{
"epoch": 0.19,
"grad_norm": 0.22698360681533813,
"learning_rate": 0.00027322314049586777,
"loss": 0.7443,
"step": 118
},
{
"epoch": 0.19,
"grad_norm": 0.22987066209316254,
"learning_rate": 0.0002729752066115702,
"loss": 0.7839,
"step": 119
},
{
"epoch": 0.2,
"grad_norm": 0.20548178255558014,
"learning_rate": 0.0002727272727272727,
"loss": 0.7805,
"step": 120
},
{
"epoch": 0.2,
"grad_norm": 0.2477702796459198,
"learning_rate": 0.00027247933884297517,
"loss": 0.5694,
"step": 121
},
{
"epoch": 0.2,
"grad_norm": 0.20593340694904327,
"learning_rate": 0.00027223140495867765,
"loss": 0.6479,
"step": 122
},
{
"epoch": 0.2,
"grad_norm": 0.23635917901992798,
"learning_rate": 0.00027198347107438013,
"loss": 0.8107,
"step": 123
},
{
"epoch": 0.2,
"grad_norm": 0.25808119773864746,
"learning_rate": 0.0002717355371900826,
"loss": 0.7876,
"step": 124
},
{
"epoch": 0.2,
"grad_norm": 0.22156469523906708,
"learning_rate": 0.0002714876033057851,
"loss": 0.7261,
"step": 125
},
{
"epoch": 0.21,
"grad_norm": 0.19892215728759766,
"learning_rate": 0.0002712396694214876,
"loss": 0.6874,
"step": 126
},
{
"epoch": 0.21,
"grad_norm": 0.24936752021312714,
"learning_rate": 0.00027099173553719007,
"loss": 0.6155,
"step": 127
},
{
"epoch": 0.21,
"grad_norm": 0.23287539184093475,
"learning_rate": 0.0002707438016528925,
"loss": 0.602,
"step": 128
},
{
"epoch": 0.21,
"grad_norm": 0.2086639404296875,
"learning_rate": 0.00027049586776859504,
"loss": 0.7198,
"step": 129
},
{
"epoch": 0.21,
"grad_norm": 0.24974922835826874,
"learning_rate": 0.0002702479338842975,
"loss": 0.6873,
"step": 130
},
{
"epoch": 0.21,
"grad_norm": 0.2066827118396759,
"learning_rate": 0.00027,
"loss": 0.5821,
"step": 131
},
{
"epoch": 0.22,
"grad_norm": 0.28004395961761475,
"learning_rate": 0.0002697520661157025,
"loss": 0.7864,
"step": 132
},
{
"epoch": 0.22,
"grad_norm": 0.22391608357429504,
"learning_rate": 0.0002695041322314049,
"loss": 0.6773,
"step": 133
},
{
"epoch": 0.22,
"grad_norm": 0.2821199297904968,
"learning_rate": 0.0002692561983471074,
"loss": 0.6806,
"step": 134
},
{
"epoch": 0.22,
"grad_norm": 0.21736428141593933,
"learning_rate": 0.0002690082644628099,
"loss": 0.6662,
"step": 135
},
{
"epoch": 0.22,
"grad_norm": 0.23889939486980438,
"learning_rate": 0.00026876033057851237,
"loss": 0.6356,
"step": 136
},
{
"epoch": 0.22,
"grad_norm": 0.21096719801425934,
"learning_rate": 0.00026851239669421485,
"loss": 0.6762,
"step": 137
},
{
"epoch": 0.23,
"grad_norm": 0.22622421383857727,
"learning_rate": 0.00026826446280991733,
"loss": 0.8085,
"step": 138
},
{
"epoch": 0.23,
"grad_norm": 0.19824957847595215,
"learning_rate": 0.0002680165289256198,
"loss": 0.6031,
"step": 139
},
{
"epoch": 0.23,
"grad_norm": 0.24482691287994385,
"learning_rate": 0.0002677685950413223,
"loss": 0.6649,
"step": 140
},
{
"epoch": 0.23,
"grad_norm": 0.21291929483413696,
"learning_rate": 0.0002675206611570248,
"loss": 0.6671,
"step": 141
},
{
"epoch": 0.23,
"grad_norm": 0.2202674299478531,
"learning_rate": 0.0002672727272727272,
"loss": 0.6469,
"step": 142
},
{
"epoch": 0.23,
"grad_norm": 0.23572632670402527,
"learning_rate": 0.0002670247933884297,
"loss": 0.7377,
"step": 143
},
{
"epoch": 0.24,
"grad_norm": 0.2051907777786255,
"learning_rate": 0.00026677685950413224,
"loss": 0.6217,
"step": 144
},
{
"epoch": 0.24,
"grad_norm": 0.23270072042942047,
"learning_rate": 0.0002665289256198347,
"loss": 0.7933,
"step": 145
},
{
"epoch": 0.24,
"grad_norm": 0.20652809739112854,
"learning_rate": 0.00026628099173553715,
"loss": 0.6007,
"step": 146
},
{
"epoch": 0.24,
"grad_norm": 0.23084674775600433,
"learning_rate": 0.00026603305785123963,
"loss": 0.701,
"step": 147
},
{
"epoch": 0.24,
"grad_norm": 0.25663891434669495,
"learning_rate": 0.0002657851239669421,
"loss": 0.7271,
"step": 148
},
{
"epoch": 0.24,
"grad_norm": 0.25880497694015503,
"learning_rate": 0.0002655371900826446,
"loss": 0.6562,
"step": 149
},
{
"epoch": 0.25,
"grad_norm": 0.19349205493927002,
"learning_rate": 0.0002652892561983471,
"loss": 0.5016,
"step": 150
},
{
"epoch": 0.25,
"grad_norm": 0.2401740401983261,
"learning_rate": 0.00026504132231404957,
"loss": 0.6978,
"step": 151
},
{
"epoch": 0.25,
"grad_norm": 0.19495394825935364,
"learning_rate": 0.00026479338842975205,
"loss": 0.5562,
"step": 152
},
{
"epoch": 0.25,
"grad_norm": 0.21485286951065063,
"learning_rate": 0.00026454545454545453,
"loss": 0.7847,
"step": 153
},
{
"epoch": 0.25,
"grad_norm": 0.241348534822464,
"learning_rate": 0.000264297520661157,
"loss": 0.7513,
"step": 154
},
{
"epoch": 0.25,
"grad_norm": 0.3316986858844757,
"learning_rate": 0.00026404958677685945,
"loss": 0.664,
"step": 155
},
{
"epoch": 0.26,
"grad_norm": 0.2419958859682083,
"learning_rate": 0.00026380165289256193,
"loss": 0.7322,
"step": 156
},
{
"epoch": 0.26,
"grad_norm": 0.2868640124797821,
"learning_rate": 0.0002635537190082644,
"loss": 0.7004,
"step": 157
},
{
"epoch": 0.26,
"grad_norm": 0.24806949496269226,
"learning_rate": 0.00026330578512396695,
"loss": 0.6497,
"step": 158
},
{
"epoch": 0.26,
"grad_norm": 0.23873400688171387,
"learning_rate": 0.00026305785123966944,
"loss": 0.7543,
"step": 159
},
{
"epoch": 0.26,
"grad_norm": 0.2480355203151703,
"learning_rate": 0.00026280991735537187,
"loss": 0.6048,
"step": 160
},
{
"epoch": 0.26,
"grad_norm": 0.2619112730026245,
"learning_rate": 0.00026256198347107435,
"loss": 0.762,
"step": 161
},
{
"epoch": 0.27,
"grad_norm": 0.22763262689113617,
"learning_rate": 0.00026231404958677683,
"loss": 0.6557,
"step": 162
},
{
"epoch": 0.27,
"grad_norm": 0.3291528522968292,
"learning_rate": 0.0002620661157024793,
"loss": 0.7059,
"step": 163
},
{
"epoch": 0.27,
"grad_norm": 0.2959338426589966,
"learning_rate": 0.0002618181818181818,
"loss": 0.6622,
"step": 164
},
{
"epoch": 0.27,
"grad_norm": 0.23001112043857574,
"learning_rate": 0.0002615702479338843,
"loss": 0.6465,
"step": 165
},
{
"epoch": 0.27,
"grad_norm": 0.1998877376317978,
"learning_rate": 0.00026132231404958677,
"loss": 0.666,
"step": 166
},
{
"epoch": 0.27,
"grad_norm": 0.23009613156318665,
"learning_rate": 0.00026107438016528925,
"loss": 0.8793,
"step": 167
},
{
"epoch": 0.28,
"grad_norm": 0.24525685608386993,
"learning_rate": 0.0002608264462809917,
"loss": 0.8009,
"step": 168
},
{
"epoch": 0.28,
"grad_norm": 0.21605077385902405,
"learning_rate": 0.00026057851239669416,
"loss": 0.5459,
"step": 169
},
{
"epoch": 0.28,
"grad_norm": 0.2576725482940674,
"learning_rate": 0.00026033057851239665,
"loss": 0.6818,
"step": 170
},
{
"epoch": 0.28,
"grad_norm": 0.23385170102119446,
"learning_rate": 0.00026008264462809913,
"loss": 0.7559,
"step": 171
},
{
"epoch": 0.28,
"grad_norm": 0.1973017454147339,
"learning_rate": 0.00025983471074380167,
"loss": 0.6798,
"step": 172
},
{
"epoch": 0.28,
"grad_norm": 0.22262559831142426,
"learning_rate": 0.0002595867768595041,
"loss": 0.5566,
"step": 173
},
{
"epoch": 0.28,
"grad_norm": 0.23010462522506714,
"learning_rate": 0.0002593388429752066,
"loss": 0.7101,
"step": 174
},
{
"epoch": 0.29,
"grad_norm": 0.21676452457904816,
"learning_rate": 0.00025909090909090907,
"loss": 0.7038,
"step": 175
},
{
"epoch": 0.29,
"grad_norm": 0.22475261986255646,
"learning_rate": 0.00025884297520661155,
"loss": 0.7812,
"step": 176
},
{
"epoch": 0.29,
"grad_norm": 0.28893202543258667,
"learning_rate": 0.00025859504132231403,
"loss": 0.5925,
"step": 177
},
{
"epoch": 0.29,
"grad_norm": 0.22777552902698517,
"learning_rate": 0.0002583471074380165,
"loss": 0.7319,
"step": 178
},
{
"epoch": 0.29,
"grad_norm": 0.2287953644990921,
"learning_rate": 0.000258099173553719,
"loss": 0.7775,
"step": 179
},
{
"epoch": 0.29,
"grad_norm": 0.2049843668937683,
"learning_rate": 0.0002578512396694215,
"loss": 0.7448,
"step": 180
},
{
"epoch": 0.3,
"grad_norm": 0.22585280239582062,
"learning_rate": 0.00025760330578512397,
"loss": 0.59,
"step": 181
},
{
"epoch": 0.3,
"grad_norm": 0.23159150779247284,
"learning_rate": 0.0002573553719008264,
"loss": 0.737,
"step": 182
},
{
"epoch": 0.3,
"grad_norm": 0.3393082320690155,
"learning_rate": 0.0002571074380165289,
"loss": 0.6948,
"step": 183
},
{
"epoch": 0.3,
"grad_norm": 0.2345617413520813,
"learning_rate": 0.00025685950413223136,
"loss": 0.6351,
"step": 184
},
{
"epoch": 0.3,
"grad_norm": 0.23474591970443726,
"learning_rate": 0.00025661157024793385,
"loss": 0.6643,
"step": 185
},
{
"epoch": 0.3,
"grad_norm": 0.2473030984401703,
"learning_rate": 0.00025636363636363633,
"loss": 0.7663,
"step": 186
},
{
"epoch": 0.31,
"grad_norm": 0.2971685230731964,
"learning_rate": 0.0002561157024793388,
"loss": 0.7449,
"step": 187
},
{
"epoch": 0.31,
"grad_norm": 0.2745087742805481,
"learning_rate": 0.0002558677685950413,
"loss": 0.6125,
"step": 188
},
{
"epoch": 0.31,
"grad_norm": 0.23520545661449432,
"learning_rate": 0.0002556198347107438,
"loss": 0.573,
"step": 189
},
{
"epoch": 0.31,
"grad_norm": 0.2955464720726013,
"learning_rate": 0.00025537190082644627,
"loss": 0.5315,
"step": 190
},
{
"epoch": 0.31,
"grad_norm": 0.23987281322479248,
"learning_rate": 0.00025512396694214875,
"loss": 0.5636,
"step": 191
},
{
"epoch": 0.31,
"grad_norm": 0.24263744056224823,
"learning_rate": 0.00025487603305785123,
"loss": 0.6047,
"step": 192
},
{
"epoch": 0.32,
"grad_norm": 0.26061922311782837,
"learning_rate": 0.0002546280991735537,
"loss": 0.7812,
"step": 193
},
{
"epoch": 0.32,
"grad_norm": 0.2458687126636505,
"learning_rate": 0.0002543801652892562,
"loss": 0.58,
"step": 194
},
{
"epoch": 0.32,
"grad_norm": 0.24598994851112366,
"learning_rate": 0.00025413223140495863,
"loss": 0.7432,
"step": 195
},
{
"epoch": 0.32,
"grad_norm": 0.248992919921875,
"learning_rate": 0.0002538842975206611,
"loss": 0.6953,
"step": 196
},
{
"epoch": 0.32,
"grad_norm": 0.2518531382083893,
"learning_rate": 0.0002536363636363636,
"loss": 0.6707,
"step": 197
},
{
"epoch": 0.32,
"grad_norm": 0.23844210803508759,
"learning_rate": 0.0002533884297520661,
"loss": 0.6285,
"step": 198
},
{
"epoch": 0.33,
"grad_norm": 0.21948237717151642,
"learning_rate": 0.00025314049586776856,
"loss": 0.6859,
"step": 199
},
{
"epoch": 0.33,
"grad_norm": 0.2003835141658783,
"learning_rate": 0.00025289256198347105,
"loss": 0.6305,
"step": 200
},
{
"epoch": 0.33,
"grad_norm": 0.23421582579612732,
"learning_rate": 0.00025264462809917353,
"loss": 0.7164,
"step": 201
},
{
"epoch": 0.33,
"grad_norm": 0.22344104945659637,
"learning_rate": 0.000252396694214876,
"loss": 0.6498,
"step": 202
},
{
"epoch": 0.33,
"grad_norm": 0.17792212963104248,
"learning_rate": 0.0002521487603305785,
"loss": 0.614,
"step": 203
},
{
"epoch": 0.33,
"grad_norm": 0.217886820435524,
"learning_rate": 0.000251900826446281,
"loss": 0.7033,
"step": 204
},
{
"epoch": 0.34,
"grad_norm": 0.209726020693779,
"learning_rate": 0.00025165289256198347,
"loss": 0.5913,
"step": 205
},
{
"epoch": 0.34,
"grad_norm": 0.2401910424232483,
"learning_rate": 0.00025140495867768595,
"loss": 0.6405,
"step": 206
},
{
"epoch": 0.34,
"grad_norm": 0.21315626800060272,
"learning_rate": 0.00025115702479338843,
"loss": 0.7369,
"step": 207
},
{
"epoch": 0.34,
"grad_norm": 0.20102320611476898,
"learning_rate": 0.00025090909090909086,
"loss": 0.6245,
"step": 208
},
{
"epoch": 0.34,
"grad_norm": 0.20447981357574463,
"learning_rate": 0.00025066115702479335,
"loss": 0.5423,
"step": 209
},
{
"epoch": 0.34,
"grad_norm": 0.24979281425476074,
"learning_rate": 0.00025041322314049583,
"loss": 0.8078,
"step": 210
},
{
"epoch": 0.35,
"grad_norm": 0.20141547918319702,
"learning_rate": 0.0002501652892561983,
"loss": 0.7386,
"step": 211
},
{
"epoch": 0.35,
"grad_norm": 0.2538990378379822,
"learning_rate": 0.0002499173553719008,
"loss": 0.7219,
"step": 212
},
{
"epoch": 0.35,
"grad_norm": 0.2613961100578308,
"learning_rate": 0.0002496694214876033,
"loss": 0.7903,
"step": 213
},
{
"epoch": 0.35,
"grad_norm": 0.24777857959270477,
"learning_rate": 0.00024942148760330576,
"loss": 0.664,
"step": 214
},
{
"epoch": 0.35,
"grad_norm": 0.21958425641059875,
"learning_rate": 0.00024917355371900825,
"loss": 0.6755,
"step": 215
},
{
"epoch": 0.35,
"grad_norm": 0.2218528538942337,
"learning_rate": 0.00024892561983471073,
"loss": 0.5568,
"step": 216
},
{
"epoch": 0.36,
"grad_norm": 0.23632755875587463,
"learning_rate": 0.00024867768595041316,
"loss": 0.6858,
"step": 217
},
{
"epoch": 0.36,
"grad_norm": 0.2641279697418213,
"learning_rate": 0.0002484297520661157,
"loss": 0.7783,
"step": 218
},
{
"epoch": 0.36,
"grad_norm": 0.3147680163383484,
"learning_rate": 0.0002481818181818182,
"loss": 0.662,
"step": 219
},
{
"epoch": 0.36,
"grad_norm": 0.27947697043418884,
"learning_rate": 0.00024793388429752067,
"loss": 0.6477,
"step": 220
},
{
"epoch": 0.36,
"grad_norm": 0.2297278195619583,
"learning_rate": 0.00024768595041322315,
"loss": 0.5895,
"step": 221
},
{
"epoch": 0.36,
"grad_norm": 0.23085851967334747,
"learning_rate": 0.0002474380165289256,
"loss": 0.5806,
"step": 222
},
{
"epoch": 0.37,
"grad_norm": 0.19654251635074615,
"learning_rate": 0.00024719008264462806,
"loss": 0.5942,
"step": 223
},
{
"epoch": 0.37,
"grad_norm": 0.2467166632413864,
"learning_rate": 0.00024694214876033055,
"loss": 0.5059,
"step": 224
},
{
"epoch": 0.37,
"grad_norm": 0.22614917159080505,
"learning_rate": 0.00024669421487603303,
"loss": 0.643,
"step": 225
},
{
"epoch": 0.37,
"grad_norm": 0.2622920274734497,
"learning_rate": 0.0002464462809917355,
"loss": 0.6257,
"step": 226
},
{
"epoch": 0.37,
"grad_norm": 0.21843163669109344,
"learning_rate": 0.000246198347107438,
"loss": 0.6057,
"step": 227
},
{
"epoch": 0.37,
"grad_norm": 0.2294640988111496,
"learning_rate": 0.0002459504132231405,
"loss": 0.6876,
"step": 228
},
{
"epoch": 0.37,
"grad_norm": 0.1791463941335678,
"learning_rate": 0.00024570247933884296,
"loss": 0.5348,
"step": 229
},
{
"epoch": 0.38,
"grad_norm": 0.17243699729442596,
"learning_rate": 0.00024545454545454545,
"loss": 0.5966,
"step": 230
},
{
"epoch": 0.38,
"grad_norm": 0.22769273817539215,
"learning_rate": 0.0002452066115702479,
"loss": 0.7912,
"step": 231
},
{
"epoch": 0.38,
"grad_norm": 0.2325255423784256,
"learning_rate": 0.0002449586776859504,
"loss": 0.7441,
"step": 232
},
{
"epoch": 0.38,
"grad_norm": 0.24277740716934204,
"learning_rate": 0.0002447107438016529,
"loss": 0.6653,
"step": 233
},
{
"epoch": 0.38,
"grad_norm": 0.21596141159534454,
"learning_rate": 0.0002444628099173554,
"loss": 0.6668,
"step": 234
},
{
"epoch": 0.38,
"grad_norm": 0.20814135670661926,
"learning_rate": 0.0002442148760330578,
"loss": 0.6306,
"step": 235
},
{
"epoch": 0.39,
"grad_norm": 0.25570017099380493,
"learning_rate": 0.0002439669421487603,
"loss": 0.6524,
"step": 236
},
{
"epoch": 0.39,
"grad_norm": 0.2502390146255493,
"learning_rate": 0.00024371900826446278,
"loss": 0.6048,
"step": 237
},
{
"epoch": 0.39,
"grad_norm": 0.23688243329524994,
"learning_rate": 0.0002434710743801653,
"loss": 0.568,
"step": 238
},
{
"epoch": 0.39,
"grad_norm": 0.21041709184646606,
"learning_rate": 0.00024322314049586777,
"loss": 0.6908,
"step": 239
},
{
"epoch": 0.39,
"grad_norm": 0.21656759083271027,
"learning_rate": 0.00024297520661157023,
"loss": 0.4993,
"step": 240
},
{
"epoch": 0.39,
"grad_norm": 0.25133028626441956,
"learning_rate": 0.0002427272727272727,
"loss": 0.718,
"step": 241
},
{
"epoch": 0.4,
"grad_norm": 0.22228790819644928,
"learning_rate": 0.0002424793388429752,
"loss": 0.6146,
"step": 242
},
{
"epoch": 0.4,
"grad_norm": 0.26273205876350403,
"learning_rate": 0.00024223140495867768,
"loss": 0.7459,
"step": 243
},
{
"epoch": 0.4,
"grad_norm": 0.2156606763601303,
"learning_rate": 0.00024198347107438014,
"loss": 0.6692,
"step": 244
},
{
"epoch": 0.4,
"grad_norm": 0.2075020670890808,
"learning_rate": 0.00024173553719008262,
"loss": 0.6427,
"step": 245
},
{
"epoch": 0.4,
"grad_norm": 0.25821176171302795,
"learning_rate": 0.0002414876033057851,
"loss": 0.7964,
"step": 246
},
{
"epoch": 0.4,
"grad_norm": 0.23016126453876495,
"learning_rate": 0.0002412396694214876,
"loss": 0.536,
"step": 247
},
{
"epoch": 0.41,
"grad_norm": 0.23115016520023346,
"learning_rate": 0.00024099173553719004,
"loss": 0.6053,
"step": 248
},
{
"epoch": 0.41,
"grad_norm": 0.18249157071113586,
"learning_rate": 0.00024074380165289253,
"loss": 0.6574,
"step": 249
},
{
"epoch": 0.41,
"grad_norm": 0.28391778469085693,
"learning_rate": 0.000240495867768595,
"loss": 0.7152,
"step": 250
},
{
"epoch": 0.41,
"grad_norm": 0.2581539452075958,
"learning_rate": 0.0002402479338842975,
"loss": 0.8476,
"step": 251
},
{
"epoch": 0.41,
"grad_norm": 0.2304867058992386,
"learning_rate": 0.00023999999999999998,
"loss": 0.5781,
"step": 252
},
{
"epoch": 0.41,
"grad_norm": 0.239717036485672,
"learning_rate": 0.00023975206611570244,
"loss": 0.6543,
"step": 253
},
{
"epoch": 0.42,
"grad_norm": 0.22493794560432434,
"learning_rate": 0.00023950413223140495,
"loss": 0.7048,
"step": 254
},
{
"epoch": 0.42,
"grad_norm": 0.22085991501808167,
"learning_rate": 0.00023925619834710743,
"loss": 0.5572,
"step": 255
},
{
"epoch": 0.42,
"grad_norm": 0.35917988419532776,
"learning_rate": 0.0002390082644628099,
"loss": 0.8485,
"step": 256
},
{
"epoch": 0.42,
"grad_norm": 0.28269943594932556,
"learning_rate": 0.00023876033057851237,
"loss": 0.5732,
"step": 257
},
{
"epoch": 0.42,
"grad_norm": 0.26313093304634094,
"learning_rate": 0.00023851239669421485,
"loss": 0.8212,
"step": 258
},
{
"epoch": 0.42,
"grad_norm": 0.30286532640457153,
"learning_rate": 0.00023826446280991734,
"loss": 0.5878,
"step": 259
},
{
"epoch": 0.43,
"grad_norm": 0.22270837426185608,
"learning_rate": 0.00023801652892561982,
"loss": 0.6933,
"step": 260
},
{
"epoch": 0.43,
"grad_norm": 0.29011014103889465,
"learning_rate": 0.0002377685950413223,
"loss": 0.6188,
"step": 261
},
{
"epoch": 0.43,
"grad_norm": 0.2390982061624527,
"learning_rate": 0.00023752066115702476,
"loss": 0.6426,
"step": 262
},
{
"epoch": 0.43,
"grad_norm": 0.3416346609592438,
"learning_rate": 0.00023727272727272724,
"loss": 0.8845,
"step": 263
},
{
"epoch": 0.43,
"grad_norm": 0.25051388144493103,
"learning_rate": 0.00023702479338842973,
"loss": 0.7286,
"step": 264
},
{
"epoch": 0.43,
"grad_norm": 0.2497546523809433,
"learning_rate": 0.0002367768595041322,
"loss": 0.6027,
"step": 265
},
{
"epoch": 0.44,
"grad_norm": 0.23835037648677826,
"learning_rate": 0.00023652892561983467,
"loss": 0.7052,
"step": 266
},
{
"epoch": 0.44,
"grad_norm": 0.22467398643493652,
"learning_rate": 0.00023628099173553715,
"loss": 0.5806,
"step": 267
},
{
"epoch": 0.44,
"grad_norm": 0.2663390338420868,
"learning_rate": 0.00023603305785123964,
"loss": 0.6943,
"step": 268
},
{
"epoch": 0.44,
"grad_norm": 0.22997191548347473,
"learning_rate": 0.00023578512396694215,
"loss": 0.6411,
"step": 269
},
{
"epoch": 0.44,
"grad_norm": 0.23266558349132538,
"learning_rate": 0.00023553719008264463,
"loss": 0.6068,
"step": 270
},
{
"epoch": 0.44,
"grad_norm": 0.2304474264383316,
"learning_rate": 0.00023528925619834709,
"loss": 0.6427,
"step": 271
},
{
"epoch": 0.45,
"grad_norm": 0.28231826424598694,
"learning_rate": 0.00023504132231404957,
"loss": 0.8011,
"step": 272
},
{
"epoch": 0.45,
"grad_norm": 0.28013259172439575,
"learning_rate": 0.00023479338842975205,
"loss": 0.5988,
"step": 273
},
{
"epoch": 0.45,
"grad_norm": 0.22702372074127197,
"learning_rate": 0.00023454545454545454,
"loss": 0.6737,
"step": 274
},
{
"epoch": 0.45,
"grad_norm": 0.27958643436431885,
"learning_rate": 0.000234297520661157,
"loss": 0.6621,
"step": 275
},
{
"epoch": 0.45,
"grad_norm": 0.23902451992034912,
"learning_rate": 0.00023404958677685948,
"loss": 0.6525,
"step": 276
},
{
"epoch": 0.45,
"grad_norm": 0.2778523564338684,
"learning_rate": 0.00023380165289256196,
"loss": 0.6697,
"step": 277
},
{
"epoch": 0.46,
"grad_norm": 0.2382276952266693,
"learning_rate": 0.00023355371900826444,
"loss": 0.6281,
"step": 278
},
{
"epoch": 0.46,
"grad_norm": 0.24487091600894928,
"learning_rate": 0.00023330578512396693,
"loss": 0.6842,
"step": 279
},
{
"epoch": 0.46,
"grad_norm": 0.2063397765159607,
"learning_rate": 0.00023305785123966938,
"loss": 0.6554,
"step": 280
},
{
"epoch": 0.46,
"grad_norm": 0.21523278951644897,
"learning_rate": 0.00023280991735537187,
"loss": 0.632,
"step": 281
},
{
"epoch": 0.46,
"grad_norm": 0.2420080006122589,
"learning_rate": 0.00023256198347107435,
"loss": 0.6001,
"step": 282
},
{
"epoch": 0.46,
"grad_norm": 0.2390110194683075,
"learning_rate": 0.00023231404958677686,
"loss": 0.5648,
"step": 283
},
{
"epoch": 0.47,
"grad_norm": 0.24080687761306763,
"learning_rate": 0.0002320661157024793,
"loss": 0.86,
"step": 284
},
{
"epoch": 0.47,
"grad_norm": 0.29456445574760437,
"learning_rate": 0.0002318181818181818,
"loss": 0.7418,
"step": 285
},
{
"epoch": 0.47,
"grad_norm": 0.23326683044433594,
"learning_rate": 0.00023157024793388429,
"loss": 0.6967,
"step": 286
},
{
"epoch": 0.47,
"grad_norm": 0.20866093039512634,
"learning_rate": 0.00023132231404958677,
"loss": 0.5205,
"step": 287
},
{
"epoch": 0.47,
"grad_norm": 0.3158474266529083,
"learning_rate": 0.00023107438016528925,
"loss": 0.7879,
"step": 288
},
{
"epoch": 0.47,
"grad_norm": 0.2730140686035156,
"learning_rate": 0.0002308264462809917,
"loss": 0.7292,
"step": 289
},
{
"epoch": 0.47,
"grad_norm": 0.25384965538978577,
"learning_rate": 0.0002305785123966942,
"loss": 0.7258,
"step": 290
},
{
"epoch": 0.48,
"grad_norm": 0.20765069127082825,
"learning_rate": 0.00023033057851239668,
"loss": 0.7108,
"step": 291
},
{
"epoch": 0.48,
"grad_norm": 0.25662195682525635,
"learning_rate": 0.00023008264462809916,
"loss": 0.7473,
"step": 292
},
{
"epoch": 0.48,
"grad_norm": 0.300243616104126,
"learning_rate": 0.00022983471074380162,
"loss": 0.6902,
"step": 293
},
{
"epoch": 0.48,
"grad_norm": 0.23513919115066528,
"learning_rate": 0.0002295867768595041,
"loss": 0.5888,
"step": 294
},
{
"epoch": 0.48,
"grad_norm": 0.2077571451663971,
"learning_rate": 0.00022933884297520658,
"loss": 0.6256,
"step": 295
},
{
"epoch": 0.48,
"grad_norm": 0.266201376914978,
"learning_rate": 0.00022909090909090907,
"loss": 0.6913,
"step": 296
},
{
"epoch": 0.49,
"grad_norm": 0.2239614725112915,
"learning_rate": 0.00022884297520661152,
"loss": 0.7369,
"step": 297
},
{
"epoch": 0.49,
"grad_norm": 0.21509824693202972,
"learning_rate": 0.000228595041322314,
"loss": 0.4445,
"step": 298
},
{
"epoch": 0.49,
"grad_norm": 0.21956239640712738,
"learning_rate": 0.00022834710743801652,
"loss": 0.6732,
"step": 299
},
{
"epoch": 0.49,
"grad_norm": 0.18832357227802277,
"learning_rate": 0.000228099173553719,
"loss": 0.6808,
"step": 300
},
{
"epoch": 0.49,
"grad_norm": 0.21115505695343018,
"learning_rate": 0.0002278512396694215,
"loss": 0.5323,
"step": 301
},
{
"epoch": 0.49,
"grad_norm": 0.23715418577194214,
"learning_rate": 0.00022760330578512394,
"loss": 0.8333,
"step": 302
},
{
"epoch": 0.5,
"grad_norm": 0.29385048151016235,
"learning_rate": 0.00022735537190082643,
"loss": 0.6,
"step": 303
},
{
"epoch": 0.5,
"grad_norm": 0.26947689056396484,
"learning_rate": 0.0002271074380165289,
"loss": 0.8788,
"step": 304
},
{
"epoch": 0.5,
"grad_norm": 0.2778269946575165,
"learning_rate": 0.0002268595041322314,
"loss": 0.7073,
"step": 305
},
{
"epoch": 0.5,
"grad_norm": 0.20938479900360107,
"learning_rate": 0.00022661157024793385,
"loss": 0.6422,
"step": 306
},
{
"epoch": 0.5,
"grad_norm": 0.2777106761932373,
"learning_rate": 0.00022636363636363633,
"loss": 0.7495,
"step": 307
},
{
"epoch": 0.5,
"grad_norm": 0.20872819423675537,
"learning_rate": 0.00022611570247933882,
"loss": 0.6492,
"step": 308
},
{
"epoch": 0.51,
"grad_norm": 0.2752722501754761,
"learning_rate": 0.0002258677685950413,
"loss": 0.6014,
"step": 309
},
{
"epoch": 0.51,
"grad_norm": 0.24615786969661713,
"learning_rate": 0.00022561983471074378,
"loss": 0.6287,
"step": 310
},
{
"epoch": 0.51,
"grad_norm": 0.24146385490894318,
"learning_rate": 0.00022537190082644624,
"loss": 0.6151,
"step": 311
},
{
"epoch": 0.51,
"grad_norm": 0.24762235581874847,
"learning_rate": 0.00022512396694214872,
"loss": 0.6377,
"step": 312
},
{
"epoch": 0.51,
"grad_norm": 0.24630331993103027,
"learning_rate": 0.00022487603305785124,
"loss": 0.7255,
"step": 313
},
{
"epoch": 0.51,
"grad_norm": 0.2922554612159729,
"learning_rate": 0.00022462809917355372,
"loss": 0.6645,
"step": 314
},
{
"epoch": 0.52,
"grad_norm": 0.21686063706874847,
"learning_rate": 0.00022438016528925618,
"loss": 0.5606,
"step": 315
},
{
"epoch": 0.52,
"grad_norm": 0.2216208428144455,
"learning_rate": 0.00022413223140495866,
"loss": 0.5126,
"step": 316
},
{
"epoch": 0.52,
"grad_norm": 0.25635436177253723,
"learning_rate": 0.00022388429752066114,
"loss": 0.7387,
"step": 317
},
{
"epoch": 0.52,
"grad_norm": 0.2786000669002533,
"learning_rate": 0.00022363636363636363,
"loss": 0.5941,
"step": 318
},
{
"epoch": 0.52,
"grad_norm": 0.26092806458473206,
"learning_rate": 0.0002233884297520661,
"loss": 0.7851,
"step": 319
},
{
"epoch": 0.52,
"grad_norm": 0.23881889879703522,
"learning_rate": 0.00022314049586776857,
"loss": 0.598,
"step": 320
},
{
"epoch": 0.53,
"grad_norm": 0.23304526507854462,
"learning_rate": 0.00022289256198347105,
"loss": 0.7165,
"step": 321
},
{
"epoch": 0.53,
"grad_norm": 0.2340225875377655,
"learning_rate": 0.00022264462809917353,
"loss": 0.6608,
"step": 322
},
{
"epoch": 0.53,
"grad_norm": 0.31176140904426575,
"learning_rate": 0.00022239669421487602,
"loss": 0.6711,
"step": 323
},
{
"epoch": 0.53,
"grad_norm": 0.23832640051841736,
"learning_rate": 0.00022214876033057847,
"loss": 0.732,
"step": 324
},
{
"epoch": 0.53,
"grad_norm": 0.28845977783203125,
"learning_rate": 0.00022190082644628096,
"loss": 0.7968,
"step": 325
},
{
"epoch": 0.53,
"grad_norm": 0.1978536993265152,
"learning_rate": 0.00022165289256198344,
"loss": 0.6592,
"step": 326
},
{
"epoch": 0.54,
"grad_norm": 0.26940053701400757,
"learning_rate": 0.00022140495867768595,
"loss": 0.7953,
"step": 327
},
{
"epoch": 0.54,
"grad_norm": 0.20393389463424683,
"learning_rate": 0.00022115702479338844,
"loss": 0.4871,
"step": 328
},
{
"epoch": 0.54,
"grad_norm": 0.27152347564697266,
"learning_rate": 0.0002209090909090909,
"loss": 0.5583,
"step": 329
},
{
"epoch": 0.54,
"grad_norm": 0.2883144021034241,
"learning_rate": 0.00022066115702479338,
"loss": 0.6156,
"step": 330
},
{
"epoch": 0.54,
"grad_norm": 0.1987351030111313,
"learning_rate": 0.00022041322314049586,
"loss": 0.5196,
"step": 331
},
{
"epoch": 0.54,
"grad_norm": 0.2651583254337311,
"learning_rate": 0.00022016528925619834,
"loss": 0.6099,
"step": 332
},
{
"epoch": 0.55,
"grad_norm": 0.2574511468410492,
"learning_rate": 0.0002199173553719008,
"loss": 0.6925,
"step": 333
},
{
"epoch": 0.55,
"grad_norm": 0.27730292081832886,
"learning_rate": 0.00021966942148760328,
"loss": 0.6752,
"step": 334
},
{
"epoch": 0.55,
"grad_norm": 0.2001207172870636,
"learning_rate": 0.00021942148760330577,
"loss": 0.75,
"step": 335
},
{
"epoch": 0.55,
"grad_norm": 0.24222363531589508,
"learning_rate": 0.00021917355371900825,
"loss": 0.6364,
"step": 336
},
{
"epoch": 0.55,
"grad_norm": 0.26326724886894226,
"learning_rate": 0.0002189256198347107,
"loss": 0.673,
"step": 337
},
{
"epoch": 0.55,
"grad_norm": 0.2272881418466568,
"learning_rate": 0.0002186776859504132,
"loss": 0.561,
"step": 338
},
{
"epoch": 0.56,
"grad_norm": 0.24880024790763855,
"learning_rate": 0.00021842975206611567,
"loss": 0.5552,
"step": 339
},
{
"epoch": 0.56,
"grad_norm": 0.2593706548213959,
"learning_rate": 0.00021818181818181816,
"loss": 0.5417,
"step": 340
},
{
"epoch": 0.56,
"grad_norm": 0.19063642621040344,
"learning_rate": 0.00021793388429752067,
"loss": 0.5694,
"step": 341
},
{
"epoch": 0.56,
"grad_norm": 0.2146475464105606,
"learning_rate": 0.0002176859504132231,
"loss": 0.4314,
"step": 342
},
{
"epoch": 0.56,
"grad_norm": 0.25150927901268005,
"learning_rate": 0.0002174380165289256,
"loss": 0.631,
"step": 343
},
{
"epoch": 0.56,
"grad_norm": 0.2753889858722687,
"learning_rate": 0.0002171900826446281,
"loss": 0.6859,
"step": 344
},
{
"epoch": 0.56,
"grad_norm": 0.20773079991340637,
"learning_rate": 0.00021694214876033058,
"loss": 0.7515,
"step": 345
},
{
"epoch": 0.57,
"grad_norm": 0.2547062635421753,
"learning_rate": 0.00021669421487603303,
"loss": 0.7582,
"step": 346
},
{
"epoch": 0.57,
"grad_norm": 0.24687208235263824,
"learning_rate": 0.00021644628099173552,
"loss": 0.5865,
"step": 347
},
{
"epoch": 0.57,
"grad_norm": 0.24116279184818268,
"learning_rate": 0.000216198347107438,
"loss": 0.4841,
"step": 348
},
{
"epoch": 0.57,
"grad_norm": 0.2270282804965973,
"learning_rate": 0.00021595041322314048,
"loss": 0.5933,
"step": 349
},
{
"epoch": 0.57,
"grad_norm": 0.21436922252178192,
"learning_rate": 0.00021570247933884297,
"loss": 0.6959,
"step": 350
},
{
"epoch": 0.57,
"grad_norm": 0.25802701711654663,
"learning_rate": 0.00021545454545454542,
"loss": 0.729,
"step": 351
},
{
"epoch": 0.58,
"grad_norm": 0.23808260262012482,
"learning_rate": 0.0002152066115702479,
"loss": 0.6346,
"step": 352
},
{
"epoch": 0.58,
"grad_norm": 0.23161651194095612,
"learning_rate": 0.0002149586776859504,
"loss": 0.6459,
"step": 353
},
{
"epoch": 0.58,
"grad_norm": 0.2442287802696228,
"learning_rate": 0.00021471074380165287,
"loss": 0.6803,
"step": 354
},
{
"epoch": 0.58,
"grad_norm": 0.19150683283805847,
"learning_rate": 0.00021446280991735533,
"loss": 0.4375,
"step": 355
},
{
"epoch": 0.58,
"grad_norm": 0.23142127692699432,
"learning_rate": 0.00021421487603305781,
"loss": 0.5505,
"step": 356
},
{
"epoch": 0.58,
"grad_norm": 0.22447548806667328,
"learning_rate": 0.00021396694214876033,
"loss": 0.6368,
"step": 357
},
{
"epoch": 0.59,
"grad_norm": 0.25168758630752563,
"learning_rate": 0.0002137190082644628,
"loss": 0.6322,
"step": 358
},
{
"epoch": 0.59,
"grad_norm": 0.25538235902786255,
"learning_rate": 0.0002134710743801653,
"loss": 0.5317,
"step": 359
},
{
"epoch": 0.59,
"grad_norm": 0.2565425634384155,
"learning_rate": 0.00021322314049586775,
"loss": 0.6261,
"step": 360
},
{
"epoch": 0.59,
"grad_norm": 0.25399863719940186,
"learning_rate": 0.00021297520661157023,
"loss": 0.596,
"step": 361
},
{
"epoch": 0.59,
"grad_norm": 0.27143988013267517,
"learning_rate": 0.00021272727272727272,
"loss": 0.6691,
"step": 362
},
{
"epoch": 0.59,
"grad_norm": 0.2387736439704895,
"learning_rate": 0.0002124793388429752,
"loss": 0.5288,
"step": 363
},
{
"epoch": 0.6,
"grad_norm": 0.2549780607223511,
"learning_rate": 0.00021223140495867766,
"loss": 0.7455,
"step": 364
},
{
"epoch": 0.6,
"grad_norm": 0.2740858793258667,
"learning_rate": 0.00021198347107438014,
"loss": 0.4921,
"step": 365
},
{
"epoch": 0.6,
"grad_norm": 0.25273847579956055,
"learning_rate": 0.00021173553719008262,
"loss": 0.7965,
"step": 366
},
{
"epoch": 0.6,
"grad_norm": 0.25858959555625916,
"learning_rate": 0.0002114876033057851,
"loss": 0.7303,
"step": 367
},
{
"epoch": 0.6,
"grad_norm": 0.2599296271800995,
"learning_rate": 0.0002112396694214876,
"loss": 0.6342,
"step": 368
},
{
"epoch": 0.6,
"grad_norm": 0.21084599196910858,
"learning_rate": 0.00021099173553719005,
"loss": 0.633,
"step": 369
},
{
"epoch": 0.61,
"grad_norm": 0.24272632598876953,
"learning_rate": 0.00021074380165289253,
"loss": 0.6213,
"step": 370
},
{
"epoch": 0.61,
"grad_norm": 0.26323699951171875,
"learning_rate": 0.00021049586776859501,
"loss": 0.563,
"step": 371
},
{
"epoch": 0.61,
"grad_norm": 0.20646587014198303,
"learning_rate": 0.00021024793388429753,
"loss": 0.6248,
"step": 372
},
{
"epoch": 0.61,
"grad_norm": 0.21778297424316406,
"learning_rate": 0.00020999999999999998,
"loss": 0.7186,
"step": 373
},
{
"epoch": 0.61,
"grad_norm": 0.21315112709999084,
"learning_rate": 0.00020975206611570247,
"loss": 0.5961,
"step": 374
},
{
"epoch": 0.61,
"grad_norm": 0.20787106454372406,
"learning_rate": 0.00020950413223140495,
"loss": 0.5917,
"step": 375
},
{
"epoch": 0.62,
"grad_norm": 0.23541009426116943,
"learning_rate": 0.00020925619834710743,
"loss": 0.7803,
"step": 376
},
{
"epoch": 0.62,
"grad_norm": 0.22649626433849335,
"learning_rate": 0.00020900826446280992,
"loss": 0.5895,
"step": 377
},
{
"epoch": 0.62,
"grad_norm": 0.23644742369651794,
"learning_rate": 0.00020876033057851237,
"loss": 0.6656,
"step": 378
},
{
"epoch": 0.62,
"grad_norm": 0.22934262454509735,
"learning_rate": 0.00020851239669421486,
"loss": 0.5933,
"step": 379
},
{
"epoch": 0.62,
"grad_norm": 0.289989709854126,
"learning_rate": 0.00020826446280991734,
"loss": 0.6852,
"step": 380
},
{
"epoch": 0.62,
"grad_norm": 0.24489325284957886,
"learning_rate": 0.00020801652892561982,
"loss": 0.5546,
"step": 381
},
{
"epoch": 0.63,
"grad_norm": 0.27165278792381287,
"learning_rate": 0.00020776859504132228,
"loss": 0.6845,
"step": 382
},
{
"epoch": 0.63,
"grad_norm": 0.19467370212078094,
"learning_rate": 0.00020752066115702476,
"loss": 0.5587,
"step": 383
},
{
"epoch": 0.63,
"grad_norm": 0.27320200204849243,
"learning_rate": 0.00020727272727272725,
"loss": 0.7144,
"step": 384
},
{
"epoch": 0.63,
"grad_norm": 0.28100526332855225,
"learning_rate": 0.00020702479338842973,
"loss": 0.6914,
"step": 385
},
{
"epoch": 0.63,
"grad_norm": 0.3059975504875183,
"learning_rate": 0.0002067768595041322,
"loss": 0.6075,
"step": 386
},
{
"epoch": 0.63,
"grad_norm": 0.24904222786426544,
"learning_rate": 0.00020652892561983467,
"loss": 0.5543,
"step": 387
},
{
"epoch": 0.64,
"grad_norm": 0.24768255650997162,
"learning_rate": 0.00020628099173553718,
"loss": 0.607,
"step": 388
},
{
"epoch": 0.64,
"grad_norm": 0.25083738565444946,
"learning_rate": 0.00020603305785123967,
"loss": 0.7961,
"step": 389
},
{
"epoch": 0.64,
"grad_norm": 0.26338303089141846,
"learning_rate": 0.00020578512396694215,
"loss": 0.6467,
"step": 390
},
{
"epoch": 0.64,
"grad_norm": 0.25761598348617554,
"learning_rate": 0.0002055371900826446,
"loss": 0.5891,
"step": 391
},
{
"epoch": 0.64,
"grad_norm": 0.2616937756538391,
"learning_rate": 0.0002052892561983471,
"loss": 0.5706,
"step": 392
},
{
"epoch": 0.64,
"grad_norm": 0.18980839848518372,
"learning_rate": 0.00020504132231404957,
"loss": 0.4479,
"step": 393
},
{
"epoch": 0.65,
"grad_norm": 0.250431627035141,
"learning_rate": 0.00020479338842975206,
"loss": 0.6006,
"step": 394
},
{
"epoch": 0.65,
"grad_norm": 0.2146655172109604,
"learning_rate": 0.0002045454545454545,
"loss": 0.7113,
"step": 395
},
{
"epoch": 0.65,
"grad_norm": 0.2195209115743637,
"learning_rate": 0.000204297520661157,
"loss": 0.5354,
"step": 396
},
{
"epoch": 0.65,
"grad_norm": 0.24879257380962372,
"learning_rate": 0.00020404958677685948,
"loss": 0.5478,
"step": 397
},
{
"epoch": 0.65,
"grad_norm": 0.27159082889556885,
"learning_rate": 0.00020380165289256196,
"loss": 0.7681,
"step": 398
},
{
"epoch": 0.65,
"grad_norm": 0.20614947378635406,
"learning_rate": 0.00020355371900826445,
"loss": 0.6357,
"step": 399
},
{
"epoch": 0.65,
"grad_norm": 0.25690051913261414,
"learning_rate": 0.0002033057851239669,
"loss": 0.5731,
"step": 400
},
{
"epoch": 0.66,
"grad_norm": 0.24473583698272705,
"learning_rate": 0.0002030578512396694,
"loss": 0.6784,
"step": 401
},
{
"epoch": 0.66,
"grad_norm": 0.32395297288894653,
"learning_rate": 0.0002028099173553719,
"loss": 0.7118,
"step": 402
},
{
"epoch": 0.66,
"grad_norm": 0.2975274324417114,
"learning_rate": 0.00020256198347107438,
"loss": 0.6504,
"step": 403
},
{
"epoch": 0.66,
"grad_norm": 0.2652553915977478,
"learning_rate": 0.00020231404958677684,
"loss": 0.6986,
"step": 404
},
{
"epoch": 0.66,
"grad_norm": 0.29475778341293335,
"learning_rate": 0.00020206611570247932,
"loss": 0.6525,
"step": 405
},
{
"epoch": 0.66,
"grad_norm": 0.24549973011016846,
"learning_rate": 0.0002018181818181818,
"loss": 0.5408,
"step": 406
},
{
"epoch": 0.67,
"grad_norm": 0.2181435376405716,
"learning_rate": 0.0002015702479338843,
"loss": 0.6146,
"step": 407
},
{
"epoch": 0.67,
"grad_norm": 0.2682584226131439,
"learning_rate": 0.00020132231404958677,
"loss": 0.6368,
"step": 408
},
{
"epoch": 0.67,
"grad_norm": 0.2641114592552185,
"learning_rate": 0.00020107438016528923,
"loss": 0.51,
"step": 409
},
{
"epoch": 0.67,
"grad_norm": 0.27871838212013245,
"learning_rate": 0.0002008264462809917,
"loss": 0.7269,
"step": 410
},
{
"epoch": 0.67,
"grad_norm": 0.23890569806098938,
"learning_rate": 0.0002005785123966942,
"loss": 0.6444,
"step": 411
},
{
"epoch": 0.67,
"grad_norm": 0.2451583445072174,
"learning_rate": 0.00020033057851239668,
"loss": 0.5806,
"step": 412
},
{
"epoch": 0.68,
"grad_norm": 0.2743864953517914,
"learning_rate": 0.00020008264462809914,
"loss": 0.6305,
"step": 413
},
{
"epoch": 0.68,
"grad_norm": 0.2626914978027344,
"learning_rate": 0.00019983471074380162,
"loss": 0.5765,
"step": 414
},
{
"epoch": 0.68,
"grad_norm": 0.2874875068664551,
"learning_rate": 0.0001995867768595041,
"loss": 0.5928,
"step": 415
},
{
"epoch": 0.68,
"grad_norm": 0.30499163269996643,
"learning_rate": 0.00019933884297520661,
"loss": 0.6271,
"step": 416
},
{
"epoch": 0.68,
"grad_norm": 0.30474454164505005,
"learning_rate": 0.0001990909090909091,
"loss": 0.6755,
"step": 417
},
{
"epoch": 0.68,
"grad_norm": 0.1819755882024765,
"learning_rate": 0.00019884297520661155,
"loss": 0.394,
"step": 418
},
{
"epoch": 0.69,
"grad_norm": 0.25470343232154846,
"learning_rate": 0.00019859504132231404,
"loss": 0.7121,
"step": 419
},
{
"epoch": 0.69,
"grad_norm": 0.26749151945114136,
"learning_rate": 0.00019834710743801652,
"loss": 0.6487,
"step": 420
},
{
"epoch": 0.69,
"grad_norm": 0.20643912255764008,
"learning_rate": 0.000198099173553719,
"loss": 0.4585,
"step": 421
},
{
"epoch": 0.69,
"grad_norm": 0.2576930522918701,
"learning_rate": 0.00019785123966942146,
"loss": 0.5235,
"step": 422
},
{
"epoch": 0.69,
"grad_norm": 0.2899012863636017,
"learning_rate": 0.00019760330578512395,
"loss": 0.6292,
"step": 423
},
{
"epoch": 0.69,
"grad_norm": 0.2541065216064453,
"learning_rate": 0.00019735537190082643,
"loss": 0.648,
"step": 424
},
{
"epoch": 0.7,
"grad_norm": 0.24382047355175018,
"learning_rate": 0.0001971074380165289,
"loss": 0.5939,
"step": 425
},
{
"epoch": 0.7,
"grad_norm": 0.22931940853595734,
"learning_rate": 0.00019685950413223137,
"loss": 0.6812,
"step": 426
},
{
"epoch": 0.7,
"grad_norm": 0.2592567205429077,
"learning_rate": 0.00019661157024793385,
"loss": 0.69,
"step": 427
},
{
"epoch": 0.7,
"grad_norm": 0.2516980767250061,
"learning_rate": 0.00019636363636363634,
"loss": 0.5707,
"step": 428
},
{
"epoch": 0.7,
"grad_norm": 0.23515059053897858,
"learning_rate": 0.00019611570247933882,
"loss": 0.6739,
"step": 429
},
{
"epoch": 0.7,
"grad_norm": 0.24742184579372406,
"learning_rate": 0.00019586776859504133,
"loss": 0.6761,
"step": 430
},
{
"epoch": 0.71,
"grad_norm": 0.26232922077178955,
"learning_rate": 0.00019561983471074376,
"loss": 0.7071,
"step": 431
},
{
"epoch": 0.71,
"grad_norm": 0.2853042781352997,
"learning_rate": 0.00019537190082644627,
"loss": 0.7667,
"step": 432
},
{
"epoch": 0.71,
"grad_norm": 0.251169353723526,
"learning_rate": 0.00019512396694214875,
"loss": 0.6518,
"step": 433
},
{
"epoch": 0.71,
"grad_norm": 0.2321665734052658,
"learning_rate": 0.00019487603305785124,
"loss": 0.4377,
"step": 434
},
{
"epoch": 0.71,
"grad_norm": 0.25216928124427795,
"learning_rate": 0.0001946280991735537,
"loss": 0.7173,
"step": 435
},
{
"epoch": 0.71,
"grad_norm": 0.19498330354690552,
"learning_rate": 0.00019438016528925618,
"loss": 0.5584,
"step": 436
},
{
"epoch": 0.72,
"grad_norm": 0.32786309719085693,
"learning_rate": 0.00019413223140495866,
"loss": 0.6583,
"step": 437
},
{
"epoch": 0.72,
"grad_norm": 0.25834760069847107,
"learning_rate": 0.00019388429752066115,
"loss": 0.4957,
"step": 438
},
{
"epoch": 0.72,
"grad_norm": 0.3462083041667938,
"learning_rate": 0.00019363636363636363,
"loss": 0.5205,
"step": 439
},
{
"epoch": 0.72,
"grad_norm": 0.27106693387031555,
"learning_rate": 0.00019338842975206609,
"loss": 0.6803,
"step": 440
},
{
"epoch": 0.72,
"grad_norm": 0.28165388107299805,
"learning_rate": 0.00019314049586776857,
"loss": 0.7049,
"step": 441
},
{
"epoch": 0.72,
"grad_norm": 0.20732273161411285,
"learning_rate": 0.00019289256198347105,
"loss": 0.6407,
"step": 442
},
{
"epoch": 0.73,
"grad_norm": 0.2609116733074188,
"learning_rate": 0.00019264462809917354,
"loss": 0.5377,
"step": 443
},
{
"epoch": 0.73,
"grad_norm": 0.2561998963356018,
"learning_rate": 0.000192396694214876,
"loss": 0.6212,
"step": 444
},
{
"epoch": 0.73,
"grad_norm": 0.27699044346809387,
"learning_rate": 0.00019214876033057848,
"loss": 0.5482,
"step": 445
},
{
"epoch": 0.73,
"grad_norm": 0.2426328808069229,
"learning_rate": 0.000191900826446281,
"loss": 0.6444,
"step": 446
},
{
"epoch": 0.73,
"grad_norm": 0.26187026500701904,
"learning_rate": 0.00019165289256198347,
"loss": 0.5443,
"step": 447
},
{
"epoch": 0.73,
"grad_norm": 0.2719630002975464,
"learning_rate": 0.00019140495867768595,
"loss": 0.6886,
"step": 448
},
{
"epoch": 0.74,
"grad_norm": 0.18477971851825714,
"learning_rate": 0.0001911570247933884,
"loss": 0.5292,
"step": 449
},
{
"epoch": 0.74,
"grad_norm": 0.2144313007593155,
"learning_rate": 0.0001909090909090909,
"loss": 0.4613,
"step": 450
},
{
"epoch": 0.74,
"grad_norm": 0.2580784857273102,
"learning_rate": 0.00019066115702479338,
"loss": 0.5606,
"step": 451
},
{
"epoch": 0.74,
"grad_norm": 0.3073588013648987,
"learning_rate": 0.00019041322314049586,
"loss": 0.6123,
"step": 452
},
{
"epoch": 0.74,
"grad_norm": 0.21787844598293304,
"learning_rate": 0.00019016528925619832,
"loss": 0.5939,
"step": 453
},
{
"epoch": 0.74,
"grad_norm": 0.255750447511673,
"learning_rate": 0.0001899173553719008,
"loss": 0.5739,
"step": 454
},
{
"epoch": 0.74,
"grad_norm": 0.24147820472717285,
"learning_rate": 0.00018966942148760329,
"loss": 0.6026,
"step": 455
},
{
"epoch": 0.75,
"grad_norm": 0.26172590255737305,
"learning_rate": 0.00018942148760330577,
"loss": 0.5166,
"step": 456
},
{
"epoch": 0.75,
"grad_norm": 0.2710455358028412,
"learning_rate": 0.00018917355371900825,
"loss": 0.6429,
"step": 457
},
{
"epoch": 0.75,
"grad_norm": 0.1971074640750885,
"learning_rate": 0.0001889256198347107,
"loss": 0.4799,
"step": 458
},
{
"epoch": 0.75,
"grad_norm": 0.23394368588924408,
"learning_rate": 0.0001886776859504132,
"loss": 0.5491,
"step": 459
},
{
"epoch": 0.75,
"grad_norm": 0.22820048034191132,
"learning_rate": 0.0001884297520661157,
"loss": 0.5343,
"step": 460
},
{
"epoch": 0.75,
"grad_norm": 0.23169974982738495,
"learning_rate": 0.0001881818181818182,
"loss": 0.5852,
"step": 461
},
{
"epoch": 0.76,
"grad_norm": 0.24015003442764282,
"learning_rate": 0.00018793388429752064,
"loss": 0.6209,
"step": 462
},
{
"epoch": 0.76,
"grad_norm": 0.2230776697397232,
"learning_rate": 0.00018768595041322313,
"loss": 0.6296,
"step": 463
},
{
"epoch": 0.76,
"grad_norm": 0.2518354654312134,
"learning_rate": 0.0001874380165289256,
"loss": 0.6167,
"step": 464
},
{
"epoch": 0.76,
"grad_norm": 0.338256299495697,
"learning_rate": 0.0001871900826446281,
"loss": 0.6512,
"step": 465
},
{
"epoch": 0.76,
"grad_norm": 0.23796728253364563,
"learning_rate": 0.00018694214876033055,
"loss": 0.8155,
"step": 466
},
{
"epoch": 0.76,
"grad_norm": 0.31516361236572266,
"learning_rate": 0.00018669421487603303,
"loss": 0.8023,
"step": 467
},
{
"epoch": 0.77,
"grad_norm": 0.2371574491262436,
"learning_rate": 0.00018644628099173552,
"loss": 0.5613,
"step": 468
},
{
"epoch": 0.77,
"grad_norm": 0.2822033762931824,
"learning_rate": 0.000186198347107438,
"loss": 0.5549,
"step": 469
},
{
"epoch": 0.77,
"grad_norm": 0.25953295826911926,
"learning_rate": 0.00018595041322314049,
"loss": 0.6199,
"step": 470
},
{
"epoch": 0.77,
"grad_norm": 0.2478639930486679,
"learning_rate": 0.00018570247933884294,
"loss": 0.5806,
"step": 471
},
{
"epoch": 0.77,
"grad_norm": 0.2439350187778473,
"learning_rate": 0.00018545454545454543,
"loss": 0.6222,
"step": 472
},
{
"epoch": 0.77,
"grad_norm": 0.24993474781513214,
"learning_rate": 0.0001852066115702479,
"loss": 0.6048,
"step": 473
},
{
"epoch": 0.78,
"grad_norm": 0.24781496822834015,
"learning_rate": 0.00018495867768595042,
"loss": 0.5941,
"step": 474
},
{
"epoch": 0.78,
"grad_norm": 0.1847202032804489,
"learning_rate": 0.00018471074380165285,
"loss": 0.609,
"step": 475
},
{
"epoch": 0.78,
"grad_norm": 0.21596528589725494,
"learning_rate": 0.00018446280991735536,
"loss": 0.4457,
"step": 476
},
{
"epoch": 0.78,
"grad_norm": 0.240879625082016,
"learning_rate": 0.00018421487603305784,
"loss": 0.6118,
"step": 477
},
{
"epoch": 0.78,
"grad_norm": 0.2898111641407013,
"learning_rate": 0.00018396694214876033,
"loss": 0.7725,
"step": 478
},
{
"epoch": 0.78,
"grad_norm": 0.27428382635116577,
"learning_rate": 0.0001837190082644628,
"loss": 0.5366,
"step": 479
},
{
"epoch": 0.79,
"grad_norm": 0.23467296361923218,
"learning_rate": 0.00018347107438016527,
"loss": 0.6018,
"step": 480
},
{
"epoch": 0.79,
"grad_norm": 0.2190561592578888,
"learning_rate": 0.00018322314049586775,
"loss": 0.5249,
"step": 481
},
{
"epoch": 0.79,
"grad_norm": 0.2240625023841858,
"learning_rate": 0.00018297520661157024,
"loss": 0.6891,
"step": 482
},
{
"epoch": 0.79,
"grad_norm": 0.24726848304271698,
"learning_rate": 0.00018272727272727272,
"loss": 0.5545,
"step": 483
},
{
"epoch": 0.79,
"grad_norm": 0.3318251371383667,
"learning_rate": 0.00018247933884297518,
"loss": 0.4809,
"step": 484
},
{
"epoch": 0.79,
"grad_norm": 0.2396695613861084,
"learning_rate": 0.00018223140495867766,
"loss": 0.4942,
"step": 485
},
{
"epoch": 0.8,
"grad_norm": 0.25009942054748535,
"learning_rate": 0.00018198347107438014,
"loss": 0.7381,
"step": 486
},
{
"epoch": 0.8,
"grad_norm": 0.22655311226844788,
"learning_rate": 0.00018173553719008263,
"loss": 0.4729,
"step": 487
},
{
"epoch": 0.8,
"grad_norm": 0.23187695443630219,
"learning_rate": 0.0001814876033057851,
"loss": 0.5719,
"step": 488
},
{
"epoch": 0.8,
"grad_norm": 0.2703653573989868,
"learning_rate": 0.00018123966942148757,
"loss": 0.6031,
"step": 489
},
{
"epoch": 0.8,
"grad_norm": 0.2207796424627304,
"learning_rate": 0.00018099173553719008,
"loss": 0.5361,
"step": 490
},
{
"epoch": 0.8,
"grad_norm": 0.24914169311523438,
"learning_rate": 0.00018074380165289256,
"loss": 0.6547,
"step": 491
},
{
"epoch": 0.81,
"grad_norm": 0.2714746594429016,
"learning_rate": 0.00018049586776859504,
"loss": 0.5702,
"step": 492
},
{
"epoch": 0.81,
"grad_norm": 0.3201580047607422,
"learning_rate": 0.0001802479338842975,
"loss": 0.6119,
"step": 493
},
{
"epoch": 0.81,
"grad_norm": 0.2548397183418274,
"learning_rate": 0.00017999999999999998,
"loss": 0.5251,
"step": 494
},
{
"epoch": 0.81,
"grad_norm": 0.28669115900993347,
"learning_rate": 0.00017975206611570247,
"loss": 0.5773,
"step": 495
},
{
"epoch": 0.81,
"grad_norm": 0.26253971457481384,
"learning_rate": 0.00017950413223140495,
"loss": 0.6504,
"step": 496
},
{
"epoch": 0.81,
"grad_norm": 0.22113384306430817,
"learning_rate": 0.00017925619834710744,
"loss": 0.4741,
"step": 497
},
{
"epoch": 0.82,
"grad_norm": 0.261636346578598,
"learning_rate": 0.0001790082644628099,
"loss": 0.6241,
"step": 498
},
{
"epoch": 0.82,
"grad_norm": 0.1780402809381485,
"learning_rate": 0.00017876033057851238,
"loss": 0.5207,
"step": 499
},
{
"epoch": 0.82,
"grad_norm": 0.26149195432662964,
"learning_rate": 0.00017851239669421486,
"loss": 0.5872,
"step": 500
},
{
"epoch": 0.82,
"grad_norm": 0.26113009452819824,
"learning_rate": 0.00017826446280991734,
"loss": 0.6163,
"step": 501
},
{
"epoch": 0.82,
"grad_norm": 0.21397502720355988,
"learning_rate": 0.0001780165289256198,
"loss": 0.479,
"step": 502
},
{
"epoch": 0.82,
"grad_norm": 0.21250088512897491,
"learning_rate": 0.00017776859504132228,
"loss": 0.6978,
"step": 503
},
{
"epoch": 0.83,
"grad_norm": 0.2556426525115967,
"learning_rate": 0.00017752066115702477,
"loss": 0.6128,
"step": 504
},
{
"epoch": 0.83,
"grad_norm": 0.24139715731143951,
"learning_rate": 0.00017727272727272728,
"loss": 0.5066,
"step": 505
},
{
"epoch": 0.83,
"grad_norm": 0.23671215772628784,
"learning_rate": 0.00017702479338842976,
"loss": 0.5183,
"step": 506
},
{
"epoch": 0.83,
"grad_norm": 0.23494285345077515,
"learning_rate": 0.00017677685950413222,
"loss": 0.5181,
"step": 507
},
{
"epoch": 0.83,
"grad_norm": 0.2547609806060791,
"learning_rate": 0.0001765289256198347,
"loss": 0.5406,
"step": 508
},
{
"epoch": 0.83,
"grad_norm": 0.3042651414871216,
"learning_rate": 0.00017628099173553718,
"loss": 0.5551,
"step": 509
},
{
"epoch": 0.84,
"grad_norm": 0.22910748422145844,
"learning_rate": 0.00017603305785123967,
"loss": 0.6373,
"step": 510
},
{
"epoch": 0.84,
"grad_norm": 0.19777967035770416,
"learning_rate": 0.00017578512396694212,
"loss": 0.5471,
"step": 511
},
{
"epoch": 0.84,
"grad_norm": 0.31034502387046814,
"learning_rate": 0.0001755371900826446,
"loss": 0.7017,
"step": 512
},
{
"epoch": 0.84,
"grad_norm": 0.3504410684108734,
"learning_rate": 0.0001752892561983471,
"loss": 0.7208,
"step": 513
},
{
"epoch": 0.84,
"grad_norm": 0.24271292984485626,
"learning_rate": 0.00017504132231404958,
"loss": 0.5563,
"step": 514
},
{
"epoch": 0.84,
"grad_norm": 0.27147865295410156,
"learning_rate": 0.00017479338842975203,
"loss": 0.5869,
"step": 515
},
{
"epoch": 0.84,
"grad_norm": 0.2976628839969635,
"learning_rate": 0.00017454545454545452,
"loss": 0.5471,
"step": 516
},
{
"epoch": 0.85,
"grad_norm": 0.28489646315574646,
"learning_rate": 0.000174297520661157,
"loss": 0.6053,
"step": 517
},
{
"epoch": 0.85,
"grad_norm": 0.30020108819007874,
"learning_rate": 0.00017404958677685948,
"loss": 0.6178,
"step": 518
},
{
"epoch": 0.85,
"grad_norm": 0.23986253142356873,
"learning_rate": 0.000173801652892562,
"loss": 0.5896,
"step": 519
},
{
"epoch": 0.85,
"grad_norm": 0.2667832374572754,
"learning_rate": 0.00017355371900826442,
"loss": 0.5375,
"step": 520
},
{
"epoch": 0.85,
"grad_norm": 0.22176356613636017,
"learning_rate": 0.00017330578512396693,
"loss": 0.5723,
"step": 521
},
{
"epoch": 0.85,
"grad_norm": 0.263257771730423,
"learning_rate": 0.00017305785123966942,
"loss": 0.7317,
"step": 522
},
{
"epoch": 0.86,
"grad_norm": 0.24838753044605255,
"learning_rate": 0.0001728099173553719,
"loss": 0.5849,
"step": 523
},
{
"epoch": 0.86,
"grad_norm": 0.24839664995670319,
"learning_rate": 0.00017256198347107436,
"loss": 0.6678,
"step": 524
},
{
"epoch": 0.86,
"grad_norm": 0.2849573493003845,
"learning_rate": 0.00017231404958677684,
"loss": 0.7144,
"step": 525
},
{
"epoch": 0.86,
"grad_norm": 0.26900768280029297,
"learning_rate": 0.00017206611570247932,
"loss": 0.5156,
"step": 526
},
{
"epoch": 0.86,
"grad_norm": 0.2212425172328949,
"learning_rate": 0.0001718181818181818,
"loss": 0.4551,
"step": 527
},
{
"epoch": 0.86,
"grad_norm": 0.2066129595041275,
"learning_rate": 0.0001715702479338843,
"loss": 0.4193,
"step": 528
},
{
"epoch": 0.87,
"grad_norm": 0.2838365137577057,
"learning_rate": 0.00017132231404958675,
"loss": 0.6078,
"step": 529
},
{
"epoch": 0.87,
"grad_norm": 0.239679753780365,
"learning_rate": 0.00017107438016528923,
"loss": 0.616,
"step": 530
},
{
"epoch": 0.87,
"grad_norm": 0.23269398510456085,
"learning_rate": 0.00017082644628099172,
"loss": 0.542,
"step": 531
},
{
"epoch": 0.87,
"grad_norm": 0.23838558793067932,
"learning_rate": 0.0001705785123966942,
"loss": 0.5147,
"step": 532
},
{
"epoch": 0.87,
"grad_norm": 0.2819415330886841,
"learning_rate": 0.00017033057851239666,
"loss": 0.6437,
"step": 533
},
{
"epoch": 0.87,
"grad_norm": 0.243398055434227,
"learning_rate": 0.00017008264462809914,
"loss": 0.6611,
"step": 534
},
{
"epoch": 0.88,
"grad_norm": 0.22569122910499573,
"learning_rate": 0.00016983471074380165,
"loss": 0.3979,
"step": 535
},
{
"epoch": 0.88,
"grad_norm": 0.33265820145606995,
"learning_rate": 0.00016958677685950413,
"loss": 0.6005,
"step": 536
},
{
"epoch": 0.88,
"grad_norm": 0.26828673481941223,
"learning_rate": 0.00016933884297520662,
"loss": 0.608,
"step": 537
},
{
"epoch": 0.88,
"grad_norm": 0.24439513683319092,
"learning_rate": 0.00016909090909090907,
"loss": 0.5572,
"step": 538
},
{
"epoch": 0.88,
"grad_norm": 0.22491876780986786,
"learning_rate": 0.00016884297520661156,
"loss": 0.7226,
"step": 539
},
{
"epoch": 0.88,
"grad_norm": 0.24468480050563812,
"learning_rate": 0.00016859504132231404,
"loss": 0.4582,
"step": 540
},
{
"epoch": 0.89,
"grad_norm": 0.23392945528030396,
"learning_rate": 0.00016834710743801652,
"loss": 0.6477,
"step": 541
},
{
"epoch": 0.89,
"grad_norm": 0.27548858523368835,
"learning_rate": 0.00016809917355371898,
"loss": 0.5846,
"step": 542
},
{
"epoch": 0.89,
"grad_norm": 0.2861180603504181,
"learning_rate": 0.00016785123966942146,
"loss": 0.6412,
"step": 543
},
{
"epoch": 0.89,
"grad_norm": 0.24700766801834106,
"learning_rate": 0.00016760330578512395,
"loss": 0.6947,
"step": 544
},
{
"epoch": 0.89,
"grad_norm": 0.2600953280925751,
"learning_rate": 0.00016735537190082643,
"loss": 0.6165,
"step": 545
},
{
"epoch": 0.89,
"grad_norm": 0.26876646280288696,
"learning_rate": 0.00016710743801652892,
"loss": 0.6855,
"step": 546
},
{
"epoch": 0.9,
"grad_norm": 0.26161080598831177,
"learning_rate": 0.00016685950413223137,
"loss": 0.5066,
"step": 547
},
{
"epoch": 0.9,
"grad_norm": 0.25190046429634094,
"learning_rate": 0.00016661157024793386,
"loss": 0.5902,
"step": 548
},
{
"epoch": 0.9,
"grad_norm": 0.25269225239753723,
"learning_rate": 0.00016636363636363637,
"loss": 0.7017,
"step": 549
},
{
"epoch": 0.9,
"grad_norm": 0.28042706847190857,
"learning_rate": 0.00016611570247933885,
"loss": 0.6264,
"step": 550
},
{
"epoch": 0.9,
"grad_norm": 0.2767360508441925,
"learning_rate": 0.0001658677685950413,
"loss": 0.7562,
"step": 551
},
{
"epoch": 0.9,
"grad_norm": 0.2771216034889221,
"learning_rate": 0.0001656198347107438,
"loss": 0.5333,
"step": 552
},
{
"epoch": 0.91,
"grad_norm": 0.189210906624794,
"learning_rate": 0.00016537190082644627,
"loss": 0.5378,
"step": 553
},
{
"epoch": 0.91,
"grad_norm": 0.22517065703868866,
"learning_rate": 0.00016512396694214876,
"loss": 0.5292,
"step": 554
},
{
"epoch": 0.91,
"grad_norm": 0.2390165776014328,
"learning_rate": 0.00016487603305785121,
"loss": 0.4407,
"step": 555
},
{
"epoch": 0.91,
"grad_norm": 0.21548262238502502,
"learning_rate": 0.0001646280991735537,
"loss": 0.4504,
"step": 556
},
{
"epoch": 0.91,
"grad_norm": 0.20831167697906494,
"learning_rate": 0.00016438016528925618,
"loss": 0.6848,
"step": 557
},
{
"epoch": 0.91,
"grad_norm": 0.271257609128952,
"learning_rate": 0.00016413223140495866,
"loss": 0.535,
"step": 558
},
{
"epoch": 0.92,
"grad_norm": 0.32008254528045654,
"learning_rate": 0.00016388429752066115,
"loss": 0.5107,
"step": 559
},
{
"epoch": 0.92,
"grad_norm": 0.34058302640914917,
"learning_rate": 0.0001636363636363636,
"loss": 0.5708,
"step": 560
},
{
"epoch": 0.92,
"grad_norm": 0.28070059418678284,
"learning_rate": 0.0001633884297520661,
"loss": 0.5086,
"step": 561
},
{
"epoch": 0.92,
"grad_norm": 0.25487688183784485,
"learning_rate": 0.00016314049586776857,
"loss": 0.5184,
"step": 562
},
{
"epoch": 0.92,
"grad_norm": 0.3240332007408142,
"learning_rate": 0.00016289256198347108,
"loss": 0.6774,
"step": 563
},
{
"epoch": 0.92,
"grad_norm": 0.30744409561157227,
"learning_rate": 0.0001626446280991735,
"loss": 0.5314,
"step": 564
},
{
"epoch": 0.93,
"grad_norm": 0.25220754742622375,
"learning_rate": 0.00016239669421487602,
"loss": 0.6308,
"step": 565
},
{
"epoch": 0.93,
"grad_norm": 0.29116958379745483,
"learning_rate": 0.0001621487603305785,
"loss": 0.5685,
"step": 566
},
{
"epoch": 0.93,
"grad_norm": 0.23250073194503784,
"learning_rate": 0.000161900826446281,
"loss": 0.4318,
"step": 567
},
{
"epoch": 0.93,
"grad_norm": 0.2808091640472412,
"learning_rate": 0.00016165289256198347,
"loss": 0.6313,
"step": 568
},
{
"epoch": 0.93,
"grad_norm": 0.2711193561553955,
"learning_rate": 0.00016140495867768593,
"loss": 0.4651,
"step": 569
},
{
"epoch": 0.93,
"grad_norm": 0.29540935158729553,
"learning_rate": 0.00016115702479338841,
"loss": 0.6663,
"step": 570
},
{
"epoch": 0.93,
"grad_norm": 0.23418714106082916,
"learning_rate": 0.0001609090909090909,
"loss": 0.448,
"step": 571
},
{
"epoch": 0.94,
"grad_norm": 0.21675793826580048,
"learning_rate": 0.00016066115702479338,
"loss": 0.5034,
"step": 572
},
{
"epoch": 0.94,
"grad_norm": 0.22451865673065186,
"learning_rate": 0.00016041322314049584,
"loss": 0.4476,
"step": 573
},
{
"epoch": 0.94,
"grad_norm": 0.26300856471061707,
"learning_rate": 0.00016016528925619832,
"loss": 0.6646,
"step": 574
},
{
"epoch": 0.94,
"grad_norm": 0.3377116918563843,
"learning_rate": 0.0001599173553719008,
"loss": 0.6029,
"step": 575
},
{
"epoch": 0.94,
"grad_norm": 0.23391880095005035,
"learning_rate": 0.0001596694214876033,
"loss": 0.6277,
"step": 576
},
{
"epoch": 0.94,
"grad_norm": 0.19620922207832336,
"learning_rate": 0.0001594214876033058,
"loss": 0.4638,
"step": 577
},
{
"epoch": 0.95,
"grad_norm": 0.22981096804141998,
"learning_rate": 0.00015917355371900823,
"loss": 0.5826,
"step": 578
},
{
"epoch": 0.95,
"grad_norm": 0.34321555495262146,
"learning_rate": 0.00015892561983471074,
"loss": 0.5618,
"step": 579
},
{
"epoch": 0.95,
"grad_norm": 0.28461968898773193,
"learning_rate": 0.00015867768595041322,
"loss": 0.5129,
"step": 580
},
{
"epoch": 0.95,
"grad_norm": 0.24368269741535187,
"learning_rate": 0.0001584297520661157,
"loss": 0.5866,
"step": 581
},
{
"epoch": 0.95,
"grad_norm": 0.282255083322525,
"learning_rate": 0.00015818181818181816,
"loss": 0.6274,
"step": 582
},
{
"epoch": 0.95,
"grad_norm": 0.26298072934150696,
"learning_rate": 0.00015793388429752065,
"loss": 0.5187,
"step": 583
},
{
"epoch": 0.96,
"grad_norm": 0.2671455144882202,
"learning_rate": 0.00015768595041322313,
"loss": 0.6878,
"step": 584
},
{
"epoch": 0.96,
"grad_norm": 0.2681390643119812,
"learning_rate": 0.00015743801652892561,
"loss": 0.5469,
"step": 585
},
{
"epoch": 0.96,
"grad_norm": 0.38484248518943787,
"learning_rate": 0.0001571900826446281,
"loss": 0.6364,
"step": 586
},
{
"epoch": 0.96,
"grad_norm": 0.23353587090969086,
"learning_rate": 0.00015694214876033055,
"loss": 0.4844,
"step": 587
},
{
"epoch": 0.96,
"grad_norm": 0.29452502727508545,
"learning_rate": 0.00015669421487603304,
"loss": 0.5059,
"step": 588
},
{
"epoch": 0.96,
"grad_norm": 0.2460879236459732,
"learning_rate": 0.00015644628099173552,
"loss": 0.6495,
"step": 589
},
{
"epoch": 0.97,
"grad_norm": 0.30693721771240234,
"learning_rate": 0.000156198347107438,
"loss": 0.5165,
"step": 590
},
{
"epoch": 0.97,
"grad_norm": 0.2171495109796524,
"learning_rate": 0.00015595041322314046,
"loss": 0.6172,
"step": 591
},
{
"epoch": 0.97,
"grad_norm": 0.24301984906196594,
"learning_rate": 0.00015570247933884294,
"loss": 0.6786,
"step": 592
},
{
"epoch": 0.97,
"grad_norm": 0.2288222461938858,
"learning_rate": 0.00015545454545454546,
"loss": 0.5669,
"step": 593
},
{
"epoch": 0.97,
"grad_norm": 0.2407921552658081,
"learning_rate": 0.00015520661157024794,
"loss": 0.5968,
"step": 594
},
{
"epoch": 0.97,
"grad_norm": 0.2591527998447418,
"learning_rate": 0.0001549586776859504,
"loss": 0.544,
"step": 595
},
{
"epoch": 0.98,
"grad_norm": 0.25770679116249084,
"learning_rate": 0.00015471074380165288,
"loss": 0.7177,
"step": 596
},
{
"epoch": 0.98,
"grad_norm": 0.2528848648071289,
"learning_rate": 0.00015446280991735536,
"loss": 0.4703,
"step": 597
},
{
"epoch": 0.98,
"grad_norm": 0.24993537366390228,
"learning_rate": 0.00015421487603305785,
"loss": 0.6003,
"step": 598
},
{
"epoch": 0.98,
"grad_norm": 0.25807908177375793,
"learning_rate": 0.00015396694214876033,
"loss": 0.465,
"step": 599
},
{
"epoch": 0.98,
"grad_norm": 0.3142452836036682,
"learning_rate": 0.0001537190082644628,
"loss": 0.6122,
"step": 600
},
{
"epoch": 0.98,
"grad_norm": 0.27111849188804626,
"learning_rate": 0.00015347107438016527,
"loss": 0.5962,
"step": 601
},
{
"epoch": 0.99,
"grad_norm": 0.28503674268722534,
"learning_rate": 0.00015322314049586775,
"loss": 0.6667,
"step": 602
},
{
"epoch": 0.99,
"grad_norm": 0.27074381709098816,
"learning_rate": 0.00015297520661157024,
"loss": 0.6115,
"step": 603
},
{
"epoch": 0.99,
"grad_norm": 0.25918465852737427,
"learning_rate": 0.0001527272727272727,
"loss": 0.4483,
"step": 604
},
{
"epoch": 0.99,
"grad_norm": 0.24476633965969086,
"learning_rate": 0.00015247933884297518,
"loss": 0.6501,
"step": 605
},
{
"epoch": 0.99,
"grad_norm": 0.21205200254917145,
"learning_rate": 0.00015223140495867766,
"loss": 0.3914,
"step": 606
},
{
"epoch": 0.99,
"grad_norm": 0.25496751070022583,
"learning_rate": 0.00015198347107438017,
"loss": 0.5335,
"step": 607
},
{
"epoch": 1.0,
"grad_norm": 0.27991780638694763,
"learning_rate": 0.00015173553719008266,
"loss": 0.6083,
"step": 608
},
{
"epoch": 1.0,
"grad_norm": 0.23995639383792877,
"learning_rate": 0.0001514876033057851,
"loss": 0.55,
"step": 609
},
{
"epoch": 1.0,
"grad_norm": 0.2349666953086853,
"learning_rate": 0.0001512396694214876,
"loss": 0.7054,
"step": 610
},
{
"epoch": 1.0,
"grad_norm": 0.27498871088027954,
"learning_rate": 0.00015099173553719008,
"loss": 0.55,
"step": 611
},
{
"epoch": 1.0,
"grad_norm": 0.21346105635166168,
"learning_rate": 0.00015074380165289256,
"loss": 0.3467,
"step": 612
},
{
"epoch": 1.0,
"grad_norm": 0.2638354003429413,
"learning_rate": 0.00015049586776859502,
"loss": 0.5624,
"step": 613
},
{
"epoch": 1.01,
"grad_norm": 0.2751975953578949,
"learning_rate": 0.0001502479338842975,
"loss": 0.3814,
"step": 614
},
{
"epoch": 1.01,
"grad_norm": 0.225106880068779,
"learning_rate": 0.00015,
"loss": 0.479,
"step": 615
},
{
"epoch": 1.01,
"grad_norm": 0.22013232111930847,
"learning_rate": 0.00014975206611570247,
"loss": 0.5672,
"step": 616
},
{
"epoch": 1.01,
"grad_norm": 0.21252033114433289,
"learning_rate": 0.00014950413223140495,
"loss": 0.546,
"step": 617
},
{
"epoch": 1.01,
"grad_norm": 0.2847185432910919,
"learning_rate": 0.00014925619834710744,
"loss": 0.4434,
"step": 618
},
{
"epoch": 1.01,
"grad_norm": 0.25599631667137146,
"learning_rate": 0.0001490082644628099,
"loss": 0.4713,
"step": 619
},
{
"epoch": 1.02,
"grad_norm": 0.2719402611255646,
"learning_rate": 0.00014876033057851238,
"loss": 0.4475,
"step": 620
},
{
"epoch": 1.02,
"grad_norm": 0.26454958319664,
"learning_rate": 0.00014851239669421486,
"loss": 0.4515,
"step": 621
},
{
"epoch": 1.02,
"grad_norm": 0.39801672101020813,
"learning_rate": 0.00014826446280991735,
"loss": 0.4647,
"step": 622
},
{
"epoch": 1.02,
"grad_norm": 0.3378361463546753,
"learning_rate": 0.0001480165289256198,
"loss": 0.4414,
"step": 623
},
{
"epoch": 1.02,
"grad_norm": 0.3039036989212036,
"learning_rate": 0.0001477685950413223,
"loss": 0.5634,
"step": 624
},
{
"epoch": 1.02,
"grad_norm": 0.3506157398223877,
"learning_rate": 0.0001475206611570248,
"loss": 0.5001,
"step": 625
},
{
"epoch": 1.02,
"grad_norm": 0.2508845925331116,
"learning_rate": 0.00014727272727272725,
"loss": 0.3379,
"step": 626
},
{
"epoch": 1.03,
"grad_norm": 0.26913216710090637,
"learning_rate": 0.00014702479338842974,
"loss": 0.4575,
"step": 627
},
{
"epoch": 1.03,
"grad_norm": 0.329659640789032,
"learning_rate": 0.00014677685950413222,
"loss": 0.437,
"step": 628
},
{
"epoch": 1.03,
"grad_norm": 0.2972075343132019,
"learning_rate": 0.0001465289256198347,
"loss": 0.5048,
"step": 629
},
{
"epoch": 1.03,
"grad_norm": 0.3184354603290558,
"learning_rate": 0.00014628099173553716,
"loss": 0.4374,
"step": 630
},
{
"epoch": 1.03,
"grad_norm": 0.3377355635166168,
"learning_rate": 0.00014603305785123967,
"loss": 0.4946,
"step": 631
},
{
"epoch": 1.03,
"grad_norm": 0.29106801748275757,
"learning_rate": 0.00014578512396694213,
"loss": 0.5414,
"step": 632
},
{
"epoch": 1.04,
"grad_norm": 0.22808948159217834,
"learning_rate": 0.0001455371900826446,
"loss": 0.3739,
"step": 633
},
{
"epoch": 1.04,
"grad_norm": 0.27818021178245544,
"learning_rate": 0.0001452892561983471,
"loss": 0.4172,
"step": 634
},
{
"epoch": 1.04,
"grad_norm": 0.25634923577308655,
"learning_rate": 0.00014504132231404958,
"loss": 0.4293,
"step": 635
},
{
"epoch": 1.04,
"grad_norm": 0.30696937441825867,
"learning_rate": 0.00014479338842975206,
"loss": 0.4454,
"step": 636
},
{
"epoch": 1.04,
"grad_norm": 0.26105087995529175,
"learning_rate": 0.00014454545454545452,
"loss": 0.2978,
"step": 637
},
{
"epoch": 1.04,
"grad_norm": 0.3100634515285492,
"learning_rate": 0.00014429752066115703,
"loss": 0.4499,
"step": 638
},
{
"epoch": 1.05,
"grad_norm": 0.27640992403030396,
"learning_rate": 0.00014404958677685949,
"loss": 0.3837,
"step": 639
},
{
"epoch": 1.05,
"grad_norm": 0.24559038877487183,
"learning_rate": 0.00014380165289256197,
"loss": 0.3347,
"step": 640
},
{
"epoch": 1.05,
"grad_norm": 0.2920415699481964,
"learning_rate": 0.00014355371900826445,
"loss": 0.4333,
"step": 641
},
{
"epoch": 1.05,
"grad_norm": 0.3147384226322174,
"learning_rate": 0.00014330578512396694,
"loss": 0.4385,
"step": 642
},
{
"epoch": 1.05,
"grad_norm": 0.35469138622283936,
"learning_rate": 0.0001430578512396694,
"loss": 0.5442,
"step": 643
},
{
"epoch": 1.05,
"grad_norm": 0.2619563043117523,
"learning_rate": 0.00014280991735537188,
"loss": 0.3837,
"step": 644
},
{
"epoch": 1.06,
"grad_norm": 0.32273221015930176,
"learning_rate": 0.0001425619834710744,
"loss": 0.4946,
"step": 645
},
{
"epoch": 1.06,
"grad_norm": 0.2692110538482666,
"learning_rate": 0.00014231404958677684,
"loss": 0.4683,
"step": 646
},
{
"epoch": 1.06,
"grad_norm": 0.35255464911460876,
"learning_rate": 0.00014206611570247933,
"loss": 0.5456,
"step": 647
},
{
"epoch": 1.06,
"grad_norm": 0.29768630862236023,
"learning_rate": 0.0001418181818181818,
"loss": 0.3394,
"step": 648
},
{
"epoch": 1.06,
"grad_norm": 0.30738797783851624,
"learning_rate": 0.0001415702479338843,
"loss": 0.3583,
"step": 649
},
{
"epoch": 1.06,
"grad_norm": 0.33226314187049866,
"learning_rate": 0.00014132231404958675,
"loss": 0.4477,
"step": 650
},
{
"epoch": 1.07,
"grad_norm": 0.2842199504375458,
"learning_rate": 0.00014107438016528923,
"loss": 0.4454,
"step": 651
},
{
"epoch": 1.07,
"grad_norm": 0.28207266330718994,
"learning_rate": 0.00014082644628099172,
"loss": 0.3665,
"step": 652
},
{
"epoch": 1.07,
"grad_norm": 0.2228500097990036,
"learning_rate": 0.0001405785123966942,
"loss": 0.3446,
"step": 653
},
{
"epoch": 1.07,
"grad_norm": 0.2969403564929962,
"learning_rate": 0.00014033057851239669,
"loss": 0.377,
"step": 654
},
{
"epoch": 1.07,
"grad_norm": 0.28087565302848816,
"learning_rate": 0.00014008264462809917,
"loss": 0.3683,
"step": 655
},
{
"epoch": 1.07,
"grad_norm": 0.27268192172050476,
"learning_rate": 0.00013983471074380165,
"loss": 0.427,
"step": 656
},
{
"epoch": 1.08,
"grad_norm": 0.339070588350296,
"learning_rate": 0.0001395867768595041,
"loss": 0.4887,
"step": 657
},
{
"epoch": 1.08,
"grad_norm": 0.3170423209667206,
"learning_rate": 0.0001393388429752066,
"loss": 0.5097,
"step": 658
},
{
"epoch": 1.08,
"grad_norm": 0.3114936947822571,
"learning_rate": 0.00013909090909090908,
"loss": 0.4587,
"step": 659
},
{
"epoch": 1.08,
"grad_norm": 0.28112486004829407,
"learning_rate": 0.00013884297520661156,
"loss": 0.4781,
"step": 660
},
{
"epoch": 1.08,
"grad_norm": 0.28116974234580994,
"learning_rate": 0.00013859504132231404,
"loss": 0.3546,
"step": 661
},
{
"epoch": 1.08,
"grad_norm": 0.25061559677124023,
"learning_rate": 0.00013834710743801653,
"loss": 0.4512,
"step": 662
},
{
"epoch": 1.09,
"grad_norm": 0.29854199290275574,
"learning_rate": 0.00013809917355371898,
"loss": 0.6068,
"step": 663
},
{
"epoch": 1.09,
"grad_norm": 0.2901363670825958,
"learning_rate": 0.00013785123966942147,
"loss": 0.3667,
"step": 664
},
{
"epoch": 1.09,
"grad_norm": 0.29766595363616943,
"learning_rate": 0.00013760330578512395,
"loss": 0.5194,
"step": 665
},
{
"epoch": 1.09,
"grad_norm": 0.2765616476535797,
"learning_rate": 0.00013735537190082643,
"loss": 0.5079,
"step": 666
},
{
"epoch": 1.09,
"grad_norm": 0.27531540393829346,
"learning_rate": 0.00013710743801652892,
"loss": 0.4423,
"step": 667
},
{
"epoch": 1.09,
"grad_norm": 0.3063349425792694,
"learning_rate": 0.0001368595041322314,
"loss": 0.4666,
"step": 668
},
{
"epoch": 1.1,
"grad_norm": 0.24519848823547363,
"learning_rate": 0.00013661157024793389,
"loss": 0.2995,
"step": 669
},
{
"epoch": 1.1,
"grad_norm": 0.4366275370121002,
"learning_rate": 0.00013636363636363634,
"loss": 0.4961,
"step": 670
},
{
"epoch": 1.1,
"grad_norm": 0.28639987111091614,
"learning_rate": 0.00013611570247933883,
"loss": 0.5015,
"step": 671
},
{
"epoch": 1.1,
"grad_norm": 0.2763878107070923,
"learning_rate": 0.0001358677685950413,
"loss": 0.4883,
"step": 672
},
{
"epoch": 1.1,
"grad_norm": 0.5589582324028015,
"learning_rate": 0.0001356198347107438,
"loss": 0.5072,
"step": 673
},
{
"epoch": 1.1,
"grad_norm": 0.238887220621109,
"learning_rate": 0.00013537190082644625,
"loss": 0.411,
"step": 674
},
{
"epoch": 1.11,
"grad_norm": 0.2899521589279175,
"learning_rate": 0.00013512396694214876,
"loss": 0.3478,
"step": 675
},
{
"epoch": 1.11,
"grad_norm": 0.30960512161254883,
"learning_rate": 0.00013487603305785124,
"loss": 0.5058,
"step": 676
},
{
"epoch": 1.11,
"grad_norm": 0.33305928111076355,
"learning_rate": 0.0001346280991735537,
"loss": 0.4528,
"step": 677
},
{
"epoch": 1.11,
"grad_norm": 0.33324292302131653,
"learning_rate": 0.00013438016528925618,
"loss": 0.3523,
"step": 678
},
{
"epoch": 1.11,
"grad_norm": 0.25855520367622375,
"learning_rate": 0.00013413223140495867,
"loss": 0.4257,
"step": 679
},
{
"epoch": 1.11,
"grad_norm": 0.36000239849090576,
"learning_rate": 0.00013388429752066115,
"loss": 0.4963,
"step": 680
},
{
"epoch": 1.12,
"grad_norm": 0.30540961027145386,
"learning_rate": 0.0001336363636363636,
"loss": 0.4706,
"step": 681
},
{
"epoch": 1.12,
"grad_norm": 0.2791118025779724,
"learning_rate": 0.00013338842975206612,
"loss": 0.4543,
"step": 682
},
{
"epoch": 1.12,
"grad_norm": 0.37401753664016724,
"learning_rate": 0.00013314049586776857,
"loss": 0.5614,
"step": 683
},
{
"epoch": 1.12,
"grad_norm": 0.2772528827190399,
"learning_rate": 0.00013289256198347106,
"loss": 0.3881,
"step": 684
},
{
"epoch": 1.12,
"grad_norm": 0.29219475388526917,
"learning_rate": 0.00013264462809917354,
"loss": 0.5418,
"step": 685
},
{
"epoch": 1.12,
"grad_norm": 0.3255159258842468,
"learning_rate": 0.00013239669421487603,
"loss": 0.4669,
"step": 686
},
{
"epoch": 1.12,
"grad_norm": 0.2640572488307953,
"learning_rate": 0.0001321487603305785,
"loss": 0.4156,
"step": 687
},
{
"epoch": 1.13,
"grad_norm": 0.2618845999240875,
"learning_rate": 0.00013190082644628097,
"loss": 0.3537,
"step": 688
},
{
"epoch": 1.13,
"grad_norm": 0.27396076917648315,
"learning_rate": 0.00013165289256198348,
"loss": 0.4391,
"step": 689
},
{
"epoch": 1.13,
"grad_norm": 0.5098498463630676,
"learning_rate": 0.00013140495867768593,
"loss": 0.3863,
"step": 690
},
{
"epoch": 1.13,
"grad_norm": 0.31764644384384155,
"learning_rate": 0.00013115702479338842,
"loss": 0.3874,
"step": 691
},
{
"epoch": 1.13,
"grad_norm": 0.28738152980804443,
"learning_rate": 0.0001309090909090909,
"loss": 0.3209,
"step": 692
},
{
"epoch": 1.13,
"grad_norm": 0.32756757736206055,
"learning_rate": 0.00013066115702479338,
"loss": 0.4614,
"step": 693
},
{
"epoch": 1.14,
"grad_norm": 0.27650028467178345,
"learning_rate": 0.00013041322314049584,
"loss": 0.4717,
"step": 694
},
{
"epoch": 1.14,
"grad_norm": 0.33100056648254395,
"learning_rate": 0.00013016528925619832,
"loss": 0.4317,
"step": 695
},
{
"epoch": 1.14,
"grad_norm": 0.3200342357158661,
"learning_rate": 0.00012991735537190083,
"loss": 0.4494,
"step": 696
},
{
"epoch": 1.14,
"grad_norm": 0.29615214467048645,
"learning_rate": 0.0001296694214876033,
"loss": 0.3786,
"step": 697
},
{
"epoch": 1.14,
"grad_norm": 0.278094619512558,
"learning_rate": 0.00012942148760330577,
"loss": 0.4484,
"step": 698
},
{
"epoch": 1.14,
"grad_norm": 0.32800769805908203,
"learning_rate": 0.00012917355371900826,
"loss": 0.4635,
"step": 699
},
{
"epoch": 1.15,
"grad_norm": 0.3319619596004486,
"learning_rate": 0.00012892561983471074,
"loss": 0.5001,
"step": 700
},
{
"epoch": 1.15,
"grad_norm": 0.2818608283996582,
"learning_rate": 0.0001286776859504132,
"loss": 0.3536,
"step": 701
},
{
"epoch": 1.15,
"grad_norm": 0.28644126653671265,
"learning_rate": 0.00012842975206611568,
"loss": 0.4168,
"step": 702
},
{
"epoch": 1.15,
"grad_norm": 0.2802482545375824,
"learning_rate": 0.00012818181818181817,
"loss": 0.3918,
"step": 703
},
{
"epoch": 1.15,
"grad_norm": 0.21232947707176208,
"learning_rate": 0.00012793388429752065,
"loss": 0.3218,
"step": 704
},
{
"epoch": 1.15,
"grad_norm": 0.36512815952301025,
"learning_rate": 0.00012768595041322313,
"loss": 0.4566,
"step": 705
},
{
"epoch": 1.16,
"grad_norm": 0.26876160502433777,
"learning_rate": 0.00012743801652892562,
"loss": 0.4394,
"step": 706
},
{
"epoch": 1.16,
"grad_norm": 0.3757662773132324,
"learning_rate": 0.0001271900826446281,
"loss": 0.574,
"step": 707
},
{
"epoch": 1.16,
"grad_norm": 0.3161550760269165,
"learning_rate": 0.00012694214876033056,
"loss": 0.4524,
"step": 708
},
{
"epoch": 1.16,
"grad_norm": 0.31256961822509766,
"learning_rate": 0.00012669421487603304,
"loss": 0.4332,
"step": 709
},
{
"epoch": 1.16,
"grad_norm": 0.3122079074382782,
"learning_rate": 0.00012644628099173552,
"loss": 0.5669,
"step": 710
},
{
"epoch": 1.16,
"grad_norm": 0.33779048919677734,
"learning_rate": 0.000126198347107438,
"loss": 0.515,
"step": 711
},
{
"epoch": 1.17,
"grad_norm": 0.38516169786453247,
"learning_rate": 0.0001259504132231405,
"loss": 0.5502,
"step": 712
},
{
"epoch": 1.17,
"grad_norm": 0.2803480625152588,
"learning_rate": 0.00012570247933884297,
"loss": 0.404,
"step": 713
},
{
"epoch": 1.17,
"grad_norm": 0.31674399971961975,
"learning_rate": 0.00012545454545454543,
"loss": 0.4403,
"step": 714
},
{
"epoch": 1.17,
"grad_norm": 0.3029496669769287,
"learning_rate": 0.00012520661157024791,
"loss": 0.372,
"step": 715
},
{
"epoch": 1.17,
"grad_norm": 0.22542959451675415,
"learning_rate": 0.0001249586776859504,
"loss": 0.355,
"step": 716
},
{
"epoch": 1.17,
"grad_norm": 0.32029619812965393,
"learning_rate": 0.00012471074380165288,
"loss": 0.4845,
"step": 717
},
{
"epoch": 1.18,
"grad_norm": 0.34882861375808716,
"learning_rate": 0.00012446280991735537,
"loss": 0.4184,
"step": 718
},
{
"epoch": 1.18,
"grad_norm": 0.3319970667362213,
"learning_rate": 0.00012421487603305785,
"loss": 0.5733,
"step": 719
},
{
"epoch": 1.18,
"grad_norm": 0.2770652770996094,
"learning_rate": 0.00012396694214876033,
"loss": 0.4296,
"step": 720
},
{
"epoch": 1.18,
"grad_norm": 0.3109978437423706,
"learning_rate": 0.0001237190082644628,
"loss": 0.3757,
"step": 721
},
{
"epoch": 1.18,
"grad_norm": 0.23606395721435547,
"learning_rate": 0.00012347107438016527,
"loss": 0.2713,
"step": 722
},
{
"epoch": 1.18,
"grad_norm": 0.304574579000473,
"learning_rate": 0.00012322314049586776,
"loss": 0.4451,
"step": 723
},
{
"epoch": 1.19,
"grad_norm": 0.31314462423324585,
"learning_rate": 0.00012297520661157024,
"loss": 0.493,
"step": 724
},
{
"epoch": 1.19,
"grad_norm": 0.32014840841293335,
"learning_rate": 0.00012272727272727272,
"loss": 0.3784,
"step": 725
},
{
"epoch": 1.19,
"grad_norm": 0.29856279492378235,
"learning_rate": 0.0001224793388429752,
"loss": 0.581,
"step": 726
},
{
"epoch": 1.19,
"grad_norm": 0.30951863527297974,
"learning_rate": 0.0001222314049586777,
"loss": 0.4851,
"step": 727
},
{
"epoch": 1.19,
"grad_norm": 0.264663428068161,
"learning_rate": 0.00012198347107438015,
"loss": 0.431,
"step": 728
},
{
"epoch": 1.19,
"grad_norm": 0.3092226982116699,
"learning_rate": 0.00012173553719008264,
"loss": 0.4553,
"step": 729
},
{
"epoch": 1.2,
"grad_norm": 0.33568286895751953,
"learning_rate": 0.00012148760330578511,
"loss": 0.4894,
"step": 730
},
{
"epoch": 1.2,
"grad_norm": 0.2966444492340088,
"learning_rate": 0.0001212396694214876,
"loss": 0.3855,
"step": 731
},
{
"epoch": 1.2,
"grad_norm": 0.2829122841358185,
"learning_rate": 0.00012099173553719007,
"loss": 0.5328,
"step": 732
},
{
"epoch": 1.2,
"grad_norm": 0.31785663962364197,
"learning_rate": 0.00012074380165289255,
"loss": 0.4142,
"step": 733
},
{
"epoch": 1.2,
"grad_norm": 0.2983114719390869,
"learning_rate": 0.00012049586776859502,
"loss": 0.4168,
"step": 734
},
{
"epoch": 1.2,
"grad_norm": 0.2514868378639221,
"learning_rate": 0.0001202479338842975,
"loss": 0.4728,
"step": 735
},
{
"epoch": 1.21,
"grad_norm": 0.2959445118904114,
"learning_rate": 0.00011999999999999999,
"loss": 0.458,
"step": 736
},
{
"epoch": 1.21,
"grad_norm": 0.31830325722694397,
"learning_rate": 0.00011975206611570247,
"loss": 0.5035,
"step": 737
},
{
"epoch": 1.21,
"grad_norm": 0.31181418895721436,
"learning_rate": 0.00011950413223140496,
"loss": 0.3776,
"step": 738
},
{
"epoch": 1.21,
"grad_norm": 0.3027549684047699,
"learning_rate": 0.00011925619834710743,
"loss": 0.4483,
"step": 739
},
{
"epoch": 1.21,
"grad_norm": 0.28026890754699707,
"learning_rate": 0.00011900826446280991,
"loss": 0.4236,
"step": 740
},
{
"epoch": 1.21,
"grad_norm": 0.29137665033340454,
"learning_rate": 0.00011876033057851238,
"loss": 0.3615,
"step": 741
},
{
"epoch": 1.21,
"grad_norm": 0.282008558511734,
"learning_rate": 0.00011851239669421486,
"loss": 0.4335,
"step": 742
},
{
"epoch": 1.22,
"grad_norm": 0.297736793756485,
"learning_rate": 0.00011826446280991733,
"loss": 0.4945,
"step": 743
},
{
"epoch": 1.22,
"grad_norm": 0.3276868164539337,
"learning_rate": 0.00011801652892561982,
"loss": 0.5379,
"step": 744
},
{
"epoch": 1.22,
"grad_norm": 0.3510095179080963,
"learning_rate": 0.00011776859504132231,
"loss": 0.3589,
"step": 745
},
{
"epoch": 1.22,
"grad_norm": 0.29952242970466614,
"learning_rate": 0.00011752066115702478,
"loss": 0.3805,
"step": 746
},
{
"epoch": 1.22,
"grad_norm": 0.220473513007164,
"learning_rate": 0.00011727272727272727,
"loss": 0.3978,
"step": 747
},
{
"epoch": 1.22,
"grad_norm": 0.30668944120407104,
"learning_rate": 0.00011702479338842974,
"loss": 0.3577,
"step": 748
},
{
"epoch": 1.23,
"grad_norm": 0.3152049779891968,
"learning_rate": 0.00011677685950413222,
"loss": 0.5186,
"step": 749
},
{
"epoch": 1.23,
"grad_norm": 0.17376375198364258,
"learning_rate": 0.00011652892561983469,
"loss": 0.32,
"step": 750
},
{
"epoch": 1.23,
"grad_norm": 0.32847121357917786,
"learning_rate": 0.00011628099173553718,
"loss": 0.5403,
"step": 751
},
{
"epoch": 1.23,
"grad_norm": 0.28821662068367004,
"learning_rate": 0.00011603305785123965,
"loss": 0.3516,
"step": 752
},
{
"epoch": 1.23,
"grad_norm": 0.23324501514434814,
"learning_rate": 0.00011578512396694214,
"loss": 0.3398,
"step": 753
},
{
"epoch": 1.23,
"grad_norm": 0.2897385060787201,
"learning_rate": 0.00011553719008264463,
"loss": 0.3775,
"step": 754
},
{
"epoch": 1.24,
"grad_norm": 0.33701419830322266,
"learning_rate": 0.0001152892561983471,
"loss": 0.5225,
"step": 755
},
{
"epoch": 1.24,
"grad_norm": 0.3228382468223572,
"learning_rate": 0.00011504132231404958,
"loss": 0.4384,
"step": 756
},
{
"epoch": 1.24,
"grad_norm": 0.24733024835586548,
"learning_rate": 0.00011479338842975205,
"loss": 0.2883,
"step": 757
},
{
"epoch": 1.24,
"grad_norm": 0.2824367880821228,
"learning_rate": 0.00011454545454545453,
"loss": 0.3141,
"step": 758
},
{
"epoch": 1.24,
"grad_norm": 0.27844521403312683,
"learning_rate": 0.000114297520661157,
"loss": 0.3327,
"step": 759
},
{
"epoch": 1.24,
"grad_norm": 0.26114732027053833,
"learning_rate": 0.0001140495867768595,
"loss": 0.4071,
"step": 760
},
{
"epoch": 1.25,
"grad_norm": 0.34284186363220215,
"learning_rate": 0.00011380165289256197,
"loss": 0.4619,
"step": 761
},
{
"epoch": 1.25,
"grad_norm": 0.2463303506374359,
"learning_rate": 0.00011355371900826446,
"loss": 0.3038,
"step": 762
},
{
"epoch": 1.25,
"grad_norm": 0.26452890038490295,
"learning_rate": 0.00011330578512396693,
"loss": 0.3603,
"step": 763
},
{
"epoch": 1.25,
"grad_norm": 0.27888497710227966,
"learning_rate": 0.00011305785123966941,
"loss": 0.5109,
"step": 764
},
{
"epoch": 1.25,
"grad_norm": 0.3039766252040863,
"learning_rate": 0.00011280991735537189,
"loss": 0.5377,
"step": 765
},
{
"epoch": 1.25,
"grad_norm": 0.28995901346206665,
"learning_rate": 0.00011256198347107436,
"loss": 0.4797,
"step": 766
},
{
"epoch": 1.26,
"grad_norm": 0.3420790135860443,
"learning_rate": 0.00011231404958677686,
"loss": 0.5209,
"step": 767
},
{
"epoch": 1.26,
"grad_norm": 0.33119046688079834,
"learning_rate": 0.00011206611570247933,
"loss": 0.3709,
"step": 768
},
{
"epoch": 1.26,
"grad_norm": 0.3408135175704956,
"learning_rate": 0.00011181818181818181,
"loss": 0.4389,
"step": 769
},
{
"epoch": 1.26,
"grad_norm": 0.29120129346847534,
"learning_rate": 0.00011157024793388428,
"loss": 0.4327,
"step": 770
},
{
"epoch": 1.26,
"grad_norm": 0.32718029618263245,
"learning_rate": 0.00011132231404958677,
"loss": 0.4859,
"step": 771
},
{
"epoch": 1.26,
"grad_norm": 0.34422147274017334,
"learning_rate": 0.00011107438016528924,
"loss": 0.5184,
"step": 772
},
{
"epoch": 1.27,
"grad_norm": 0.330323189496994,
"learning_rate": 0.00011082644628099172,
"loss": 0.4322,
"step": 773
},
{
"epoch": 1.27,
"grad_norm": 0.3218427002429962,
"learning_rate": 0.00011057851239669422,
"loss": 0.4129,
"step": 774
},
{
"epoch": 1.27,
"grad_norm": 0.2976725995540619,
"learning_rate": 0.00011033057851239669,
"loss": 0.5039,
"step": 775
},
{
"epoch": 1.27,
"grad_norm": 0.32841789722442627,
"learning_rate": 0.00011008264462809917,
"loss": 0.4718,
"step": 776
},
{
"epoch": 1.27,
"grad_norm": 0.32977914810180664,
"learning_rate": 0.00010983471074380164,
"loss": 0.4248,
"step": 777
},
{
"epoch": 1.27,
"grad_norm": 0.2632751166820526,
"learning_rate": 0.00010958677685950413,
"loss": 0.3458,
"step": 778
},
{
"epoch": 1.28,
"grad_norm": 0.33028510212898254,
"learning_rate": 0.0001093388429752066,
"loss": 0.4884,
"step": 779
},
{
"epoch": 1.28,
"grad_norm": 0.30288752913475037,
"learning_rate": 0.00010909090909090908,
"loss": 0.3776,
"step": 780
},
{
"epoch": 1.28,
"grad_norm": 0.32292476296424866,
"learning_rate": 0.00010884297520661155,
"loss": 0.392,
"step": 781
},
{
"epoch": 1.28,
"grad_norm": 0.31956765055656433,
"learning_rate": 0.00010859504132231405,
"loss": 0.3308,
"step": 782
},
{
"epoch": 1.28,
"grad_norm": 0.280553936958313,
"learning_rate": 0.00010834710743801652,
"loss": 0.5806,
"step": 783
},
{
"epoch": 1.28,
"grad_norm": 0.35859328508377075,
"learning_rate": 0.000108099173553719,
"loss": 0.5059,
"step": 784
},
{
"epoch": 1.29,
"grad_norm": 0.2944432497024536,
"learning_rate": 0.00010785123966942148,
"loss": 0.5132,
"step": 785
},
{
"epoch": 1.29,
"grad_norm": 0.27504968643188477,
"learning_rate": 0.00010760330578512395,
"loss": 0.3741,
"step": 786
},
{
"epoch": 1.29,
"grad_norm": 0.29401764273643494,
"learning_rate": 0.00010735537190082644,
"loss": 0.4992,
"step": 787
},
{
"epoch": 1.29,
"grad_norm": 0.30569151043891907,
"learning_rate": 0.00010710743801652891,
"loss": 0.5029,
"step": 788
},
{
"epoch": 1.29,
"grad_norm": 0.28654801845550537,
"learning_rate": 0.0001068595041322314,
"loss": 0.4618,
"step": 789
},
{
"epoch": 1.29,
"grad_norm": 0.26424363255500793,
"learning_rate": 0.00010661157024793387,
"loss": 0.3929,
"step": 790
},
{
"epoch": 1.3,
"grad_norm": 0.28117212653160095,
"learning_rate": 0.00010636363636363636,
"loss": 0.5116,
"step": 791
},
{
"epoch": 1.3,
"grad_norm": 0.28402891755104065,
"learning_rate": 0.00010611570247933883,
"loss": 0.3758,
"step": 792
},
{
"epoch": 1.3,
"grad_norm": 0.32903602719306946,
"learning_rate": 0.00010586776859504131,
"loss": 0.3594,
"step": 793
},
{
"epoch": 1.3,
"grad_norm": 0.4285104274749756,
"learning_rate": 0.0001056198347107438,
"loss": 0.3007,
"step": 794
},
{
"epoch": 1.3,
"grad_norm": 0.27649369835853577,
"learning_rate": 0.00010537190082644627,
"loss": 0.342,
"step": 795
},
{
"epoch": 1.3,
"grad_norm": 0.3094039261341095,
"learning_rate": 0.00010512396694214876,
"loss": 0.4452,
"step": 796
},
{
"epoch": 1.3,
"grad_norm": 0.32547199726104736,
"learning_rate": 0.00010487603305785123,
"loss": 0.4274,
"step": 797
},
{
"epoch": 1.31,
"grad_norm": 0.30244141817092896,
"learning_rate": 0.00010462809917355372,
"loss": 0.393,
"step": 798
},
{
"epoch": 1.31,
"grad_norm": 0.3018583655357361,
"learning_rate": 0.00010438016528925619,
"loss": 0.4012,
"step": 799
},
{
"epoch": 1.31,
"grad_norm": 0.36397960782051086,
"learning_rate": 0.00010413223140495867,
"loss": 0.5231,
"step": 800
},
{
"epoch": 1.31,
"grad_norm": 0.3178517520427704,
"learning_rate": 0.00010388429752066114,
"loss": 0.4036,
"step": 801
},
{
"epoch": 1.31,
"grad_norm": 0.34640219807624817,
"learning_rate": 0.00010363636363636362,
"loss": 0.4717,
"step": 802
},
{
"epoch": 1.31,
"grad_norm": 0.302775114774704,
"learning_rate": 0.0001033884297520661,
"loss": 0.4207,
"step": 803
},
{
"epoch": 1.32,
"grad_norm": 0.30845245718955994,
"learning_rate": 0.00010314049586776859,
"loss": 0.3976,
"step": 804
},
{
"epoch": 1.32,
"grad_norm": 0.2689266502857208,
"learning_rate": 0.00010289256198347107,
"loss": 0.3777,
"step": 805
},
{
"epoch": 1.32,
"grad_norm": 0.33539149165153503,
"learning_rate": 0.00010264462809917354,
"loss": 0.3896,
"step": 806
},
{
"epoch": 1.32,
"grad_norm": 0.2548604905605316,
"learning_rate": 0.00010239669421487603,
"loss": 0.4026,
"step": 807
},
{
"epoch": 1.32,
"grad_norm": 0.5050720572471619,
"learning_rate": 0.0001021487603305785,
"loss": 0.4008,
"step": 808
},
{
"epoch": 1.32,
"grad_norm": 0.2518717646598816,
"learning_rate": 0.00010190082644628098,
"loss": 0.348,
"step": 809
},
{
"epoch": 1.33,
"grad_norm": 0.39397895336151123,
"learning_rate": 0.00010165289256198345,
"loss": 0.5369,
"step": 810
},
{
"epoch": 1.33,
"grad_norm": 0.3471471965312958,
"learning_rate": 0.00010140495867768595,
"loss": 0.5272,
"step": 811
},
{
"epoch": 1.33,
"grad_norm": 0.4147883355617523,
"learning_rate": 0.00010115702479338842,
"loss": 0.427,
"step": 812
},
{
"epoch": 1.33,
"grad_norm": 0.2932160794734955,
"learning_rate": 0.0001009090909090909,
"loss": 0.3274,
"step": 813
},
{
"epoch": 1.33,
"grad_norm": 0.28647059202194214,
"learning_rate": 0.00010066115702479339,
"loss": 0.3346,
"step": 814
},
{
"epoch": 1.33,
"grad_norm": 0.28154057264328003,
"learning_rate": 0.00010041322314049586,
"loss": 0.3785,
"step": 815
},
{
"epoch": 1.34,
"grad_norm": 0.25706711411476135,
"learning_rate": 0.00010016528925619834,
"loss": 0.3261,
"step": 816
},
{
"epoch": 1.34,
"grad_norm": 0.3318668603897095,
"learning_rate": 9.991735537190081e-05,
"loss": 0.4362,
"step": 817
},
{
"epoch": 1.34,
"grad_norm": 0.33185282349586487,
"learning_rate": 9.966942148760331e-05,
"loss": 0.5219,
"step": 818
},
{
"epoch": 1.34,
"grad_norm": 0.2683846056461334,
"learning_rate": 9.942148760330578e-05,
"loss": 0.3657,
"step": 819
},
{
"epoch": 1.34,
"grad_norm": 0.2643420100212097,
"learning_rate": 9.917355371900826e-05,
"loss": 0.4697,
"step": 820
},
{
"epoch": 1.34,
"grad_norm": 0.32440856099128723,
"learning_rate": 9.892561983471073e-05,
"loss": 0.5572,
"step": 821
},
{
"epoch": 1.35,
"grad_norm": 0.22183597087860107,
"learning_rate": 9.867768595041321e-05,
"loss": 0.3379,
"step": 822
},
{
"epoch": 1.35,
"grad_norm": 0.26266101002693176,
"learning_rate": 9.842975206611568e-05,
"loss": 0.439,
"step": 823
},
{
"epoch": 1.35,
"grad_norm": 0.2978360950946808,
"learning_rate": 9.818181818181817e-05,
"loss": 0.4654,
"step": 824
},
{
"epoch": 1.35,
"grad_norm": 0.2713984251022339,
"learning_rate": 9.793388429752067e-05,
"loss": 0.2983,
"step": 825
},
{
"epoch": 1.35,
"grad_norm": 0.2561984956264496,
"learning_rate": 9.768595041322314e-05,
"loss": 0.3381,
"step": 826
},
{
"epoch": 1.35,
"grad_norm": 0.2766323983669281,
"learning_rate": 9.743801652892562e-05,
"loss": 0.4167,
"step": 827
},
{
"epoch": 1.36,
"grad_norm": 0.33810022473335266,
"learning_rate": 9.719008264462809e-05,
"loss": 0.3793,
"step": 828
},
{
"epoch": 1.36,
"grad_norm": 0.3332251310348511,
"learning_rate": 9.694214876033057e-05,
"loss": 0.5517,
"step": 829
},
{
"epoch": 1.36,
"grad_norm": 0.2713959515094757,
"learning_rate": 9.669421487603304e-05,
"loss": 0.3583,
"step": 830
},
{
"epoch": 1.36,
"grad_norm": 0.2778157889842987,
"learning_rate": 9.644628099173553e-05,
"loss": 0.3089,
"step": 831
},
{
"epoch": 1.36,
"grad_norm": 0.33538392186164856,
"learning_rate": 9.6198347107438e-05,
"loss": 0.3776,
"step": 832
},
{
"epoch": 1.36,
"grad_norm": 0.32728123664855957,
"learning_rate": 9.59504132231405e-05,
"loss": 0.434,
"step": 833
},
{
"epoch": 1.37,
"grad_norm": 0.30630162358283997,
"learning_rate": 9.570247933884298e-05,
"loss": 0.3913,
"step": 834
},
{
"epoch": 1.37,
"grad_norm": 0.2960034906864166,
"learning_rate": 9.545454545454545e-05,
"loss": 0.4368,
"step": 835
},
{
"epoch": 1.37,
"grad_norm": 0.35711923241615295,
"learning_rate": 9.520661157024793e-05,
"loss": 0.399,
"step": 836
},
{
"epoch": 1.37,
"grad_norm": 0.30195897817611694,
"learning_rate": 9.49586776859504e-05,
"loss": 0.4421,
"step": 837
},
{
"epoch": 1.37,
"grad_norm": 0.3220643401145935,
"learning_rate": 9.471074380165288e-05,
"loss": 0.3441,
"step": 838
},
{
"epoch": 1.37,
"grad_norm": 0.3709239661693573,
"learning_rate": 9.446280991735535e-05,
"loss": 0.4095,
"step": 839
},
{
"epoch": 1.38,
"grad_norm": 0.40360063314437866,
"learning_rate": 9.421487603305785e-05,
"loss": 0.5692,
"step": 840
},
{
"epoch": 1.38,
"grad_norm": 0.32428041100502014,
"learning_rate": 9.396694214876032e-05,
"loss": 0.4306,
"step": 841
},
{
"epoch": 1.38,
"grad_norm": 0.2750518321990967,
"learning_rate": 9.37190082644628e-05,
"loss": 0.3905,
"step": 842
},
{
"epoch": 1.38,
"grad_norm": 0.331478476524353,
"learning_rate": 9.347107438016528e-05,
"loss": 0.6008,
"step": 843
},
{
"epoch": 1.38,
"grad_norm": 0.3165242671966553,
"learning_rate": 9.322314049586776e-05,
"loss": 0.4624,
"step": 844
},
{
"epoch": 1.38,
"grad_norm": 0.26457470655441284,
"learning_rate": 9.297520661157024e-05,
"loss": 0.4462,
"step": 845
},
{
"epoch": 1.39,
"grad_norm": 0.3557126522064209,
"learning_rate": 9.272727272727271e-05,
"loss": 0.5737,
"step": 846
},
{
"epoch": 1.39,
"grad_norm": 0.3306926488876343,
"learning_rate": 9.247933884297521e-05,
"loss": 0.4597,
"step": 847
},
{
"epoch": 1.39,
"grad_norm": 0.24906127154827118,
"learning_rate": 9.223140495867768e-05,
"loss": 0.378,
"step": 848
},
{
"epoch": 1.39,
"grad_norm": 0.29440054297447205,
"learning_rate": 9.198347107438016e-05,
"loss": 0.4562,
"step": 849
},
{
"epoch": 1.39,
"grad_norm": 0.34878161549568176,
"learning_rate": 9.173553719008263e-05,
"loss": 0.4546,
"step": 850
},
{
"epoch": 1.39,
"grad_norm": 0.3725307583808899,
"learning_rate": 9.148760330578512e-05,
"loss": 0.4119,
"step": 851
},
{
"epoch": 1.4,
"grad_norm": 0.30648747086524963,
"learning_rate": 9.123966942148759e-05,
"loss": 0.4428,
"step": 852
},
{
"epoch": 1.4,
"grad_norm": 0.2755535840988159,
"learning_rate": 9.099173553719007e-05,
"loss": 0.3592,
"step": 853
},
{
"epoch": 1.4,
"grad_norm": 0.2802577614784241,
"learning_rate": 9.074380165289255e-05,
"loss": 0.472,
"step": 854
},
{
"epoch": 1.4,
"grad_norm": 0.28871360421180725,
"learning_rate": 9.049586776859504e-05,
"loss": 0.4532,
"step": 855
},
{
"epoch": 1.4,
"grad_norm": 0.37071362137794495,
"learning_rate": 9.024793388429752e-05,
"loss": 0.3426,
"step": 856
},
{
"epoch": 1.4,
"grad_norm": 0.30081430077552795,
"learning_rate": 8.999999999999999e-05,
"loss": 0.4069,
"step": 857
},
{
"epoch": 1.4,
"grad_norm": 0.3186596930027008,
"learning_rate": 8.975206611570248e-05,
"loss": 0.4997,
"step": 858
},
{
"epoch": 1.41,
"grad_norm": 0.286479115486145,
"learning_rate": 8.950413223140495e-05,
"loss": 0.3902,
"step": 859
},
{
"epoch": 1.41,
"grad_norm": 0.3457258939743042,
"learning_rate": 8.925619834710743e-05,
"loss": 0.4339,
"step": 860
},
{
"epoch": 1.41,
"grad_norm": 0.30513113737106323,
"learning_rate": 8.90082644628099e-05,
"loss": 0.3414,
"step": 861
},
{
"epoch": 1.41,
"grad_norm": 0.30697953701019287,
"learning_rate": 8.876033057851238e-05,
"loss": 0.4657,
"step": 862
},
{
"epoch": 1.41,
"grad_norm": 0.3395203649997711,
"learning_rate": 8.851239669421488e-05,
"loss": 0.3945,
"step": 863
},
{
"epoch": 1.41,
"grad_norm": 0.43322789669036865,
"learning_rate": 8.826446280991735e-05,
"loss": 0.5337,
"step": 864
},
{
"epoch": 1.42,
"grad_norm": 0.3421814739704132,
"learning_rate": 8.801652892561983e-05,
"loss": 0.4481,
"step": 865
},
{
"epoch": 1.42,
"grad_norm": 0.24497461318969727,
"learning_rate": 8.77685950413223e-05,
"loss": 0.4199,
"step": 866
},
{
"epoch": 1.42,
"grad_norm": 0.3835270404815674,
"learning_rate": 8.752066115702479e-05,
"loss": 0.5534,
"step": 867
},
{
"epoch": 1.42,
"grad_norm": 0.3144569396972656,
"learning_rate": 8.727272727272726e-05,
"loss": 0.4563,
"step": 868
},
{
"epoch": 1.42,
"grad_norm": 0.2757865786552429,
"learning_rate": 8.702479338842974e-05,
"loss": 0.4241,
"step": 869
},
{
"epoch": 1.42,
"grad_norm": 0.28413090109825134,
"learning_rate": 8.677685950413221e-05,
"loss": 0.3484,
"step": 870
},
{
"epoch": 1.43,
"grad_norm": 0.27918362617492676,
"learning_rate": 8.652892561983471e-05,
"loss": 0.4133,
"step": 871
},
{
"epoch": 1.43,
"grad_norm": 0.3901917040348053,
"learning_rate": 8.628099173553718e-05,
"loss": 0.4755,
"step": 872
},
{
"epoch": 1.43,
"grad_norm": 0.34810692071914673,
"learning_rate": 8.603305785123966e-05,
"loss": 0.4516,
"step": 873
},
{
"epoch": 1.43,
"grad_norm": 0.3317393958568573,
"learning_rate": 8.578512396694215e-05,
"loss": 0.4995,
"step": 874
},
{
"epoch": 1.43,
"grad_norm": 0.26235052943229675,
"learning_rate": 8.553719008264462e-05,
"loss": 0.3348,
"step": 875
},
{
"epoch": 1.43,
"grad_norm": 0.2735447585582733,
"learning_rate": 8.52892561983471e-05,
"loss": 0.2932,
"step": 876
},
{
"epoch": 1.44,
"grad_norm": 0.30968329310417175,
"learning_rate": 8.504132231404957e-05,
"loss": 0.3783,
"step": 877
},
{
"epoch": 1.44,
"grad_norm": 0.30193984508514404,
"learning_rate": 8.479338842975207e-05,
"loss": 0.4357,
"step": 878
},
{
"epoch": 1.44,
"grad_norm": 0.3407258987426758,
"learning_rate": 8.454545454545454e-05,
"loss": 0.4821,
"step": 879
},
{
"epoch": 1.44,
"grad_norm": 0.28090009093284607,
"learning_rate": 8.429752066115702e-05,
"loss": 0.4158,
"step": 880
},
{
"epoch": 1.44,
"grad_norm": 0.2898884415626526,
"learning_rate": 8.404958677685949e-05,
"loss": 0.3091,
"step": 881
},
{
"epoch": 1.44,
"grad_norm": 0.31658637523651123,
"learning_rate": 8.380165289256197e-05,
"loss": 0.3773,
"step": 882
},
{
"epoch": 1.45,
"grad_norm": 0.2722189724445343,
"learning_rate": 8.355371900826446e-05,
"loss": 0.4483,
"step": 883
},
{
"epoch": 1.45,
"grad_norm": 0.23621954023838043,
"learning_rate": 8.330578512396693e-05,
"loss": 0.3112,
"step": 884
},
{
"epoch": 1.45,
"grad_norm": 0.3659461438655853,
"learning_rate": 8.305785123966942e-05,
"loss": 0.4507,
"step": 885
},
{
"epoch": 1.45,
"grad_norm": 0.3253099322319031,
"learning_rate": 8.28099173553719e-05,
"loss": 0.4854,
"step": 886
},
{
"epoch": 1.45,
"grad_norm": 0.3201637864112854,
"learning_rate": 8.256198347107438e-05,
"loss": 0.5687,
"step": 887
},
{
"epoch": 1.45,
"grad_norm": 0.4112270772457123,
"learning_rate": 8.231404958677685e-05,
"loss": 0.3742,
"step": 888
},
{
"epoch": 1.46,
"grad_norm": 0.3146194517612457,
"learning_rate": 8.206611570247933e-05,
"loss": 0.4869,
"step": 889
},
{
"epoch": 1.46,
"grad_norm": 0.34321263432502747,
"learning_rate": 8.18181818181818e-05,
"loss": 0.5154,
"step": 890
},
{
"epoch": 1.46,
"grad_norm": 0.2986968159675598,
"learning_rate": 8.157024793388429e-05,
"loss": 0.647,
"step": 891
},
{
"epoch": 1.46,
"grad_norm": 0.3427133858203888,
"learning_rate": 8.132231404958676e-05,
"loss": 0.3912,
"step": 892
},
{
"epoch": 1.46,
"grad_norm": 0.3434309661388397,
"learning_rate": 8.107438016528925e-05,
"loss": 0.51,
"step": 893
},
{
"epoch": 1.46,
"grad_norm": 0.32024991512298584,
"learning_rate": 8.082644628099174e-05,
"loss": 0.387,
"step": 894
},
{
"epoch": 1.47,
"grad_norm": 0.2961815595626831,
"learning_rate": 8.057851239669421e-05,
"loss": 0.3909,
"step": 895
},
{
"epoch": 1.47,
"grad_norm": 0.3219030201435089,
"learning_rate": 8.033057851239669e-05,
"loss": 0.3911,
"step": 896
},
{
"epoch": 1.47,
"grad_norm": 0.2776000201702118,
"learning_rate": 8.008264462809916e-05,
"loss": 0.3625,
"step": 897
},
{
"epoch": 1.47,
"grad_norm": 0.31484290957450867,
"learning_rate": 7.983471074380164e-05,
"loss": 0.6162,
"step": 898
},
{
"epoch": 1.47,
"grad_norm": 0.2789134085178375,
"learning_rate": 7.958677685950411e-05,
"loss": 0.3199,
"step": 899
},
{
"epoch": 1.47,
"grad_norm": 0.27821627259254456,
"learning_rate": 7.933884297520661e-05,
"loss": 0.4295,
"step": 900
},
{
"epoch": 1.48,
"grad_norm": 0.3022254705429077,
"learning_rate": 7.909090909090908e-05,
"loss": 0.309,
"step": 901
},
{
"epoch": 1.48,
"grad_norm": 0.24830293655395508,
"learning_rate": 7.884297520661157e-05,
"loss": 0.3833,
"step": 902
},
{
"epoch": 1.48,
"grad_norm": 0.31184327602386475,
"learning_rate": 7.859504132231405e-05,
"loss": 0.3715,
"step": 903
},
{
"epoch": 1.48,
"grad_norm": 0.2993053197860718,
"learning_rate": 7.834710743801652e-05,
"loss": 0.3825,
"step": 904
},
{
"epoch": 1.48,
"grad_norm": 0.3385005295276642,
"learning_rate": 7.8099173553719e-05,
"loss": 0.4868,
"step": 905
},
{
"epoch": 1.48,
"grad_norm": 0.26812323927879333,
"learning_rate": 7.785123966942147e-05,
"loss": 0.2925,
"step": 906
},
{
"epoch": 1.49,
"grad_norm": 0.3275848925113678,
"learning_rate": 7.760330578512397e-05,
"loss": 0.3657,
"step": 907
},
{
"epoch": 1.49,
"grad_norm": 0.2972089350223541,
"learning_rate": 7.735537190082644e-05,
"loss": 0.4396,
"step": 908
},
{
"epoch": 1.49,
"grad_norm": 0.27619728446006775,
"learning_rate": 7.710743801652892e-05,
"loss": 0.3946,
"step": 909
},
{
"epoch": 1.49,
"grad_norm": 0.30436667799949646,
"learning_rate": 7.68595041322314e-05,
"loss": 0.4177,
"step": 910
},
{
"epoch": 1.49,
"grad_norm": 0.2652393877506256,
"learning_rate": 7.661157024793388e-05,
"loss": 0.3165,
"step": 911
},
{
"epoch": 1.49,
"grad_norm": 0.28303712606430054,
"learning_rate": 7.636363636363635e-05,
"loss": 0.4829,
"step": 912
},
{
"epoch": 1.49,
"grad_norm": 0.33964964747428894,
"learning_rate": 7.611570247933883e-05,
"loss": 0.5043,
"step": 913
},
{
"epoch": 1.5,
"grad_norm": 0.2591302692890167,
"learning_rate": 7.586776859504133e-05,
"loss": 0.3814,
"step": 914
},
{
"epoch": 1.5,
"grad_norm": 0.3488747179508209,
"learning_rate": 7.56198347107438e-05,
"loss": 0.5233,
"step": 915
},
{
"epoch": 1.5,
"grad_norm": 0.29015597701072693,
"learning_rate": 7.537190082644628e-05,
"loss": 0.4672,
"step": 916
},
{
"epoch": 1.5,
"grad_norm": 0.31618839502334595,
"learning_rate": 7.512396694214875e-05,
"loss": 0.4538,
"step": 917
},
{
"epoch": 1.5,
"grad_norm": 0.35049545764923096,
"learning_rate": 7.487603305785124e-05,
"loss": 0.4089,
"step": 918
},
{
"epoch": 1.5,
"grad_norm": 0.34093132615089417,
"learning_rate": 7.462809917355372e-05,
"loss": 0.4306,
"step": 919
},
{
"epoch": 1.51,
"grad_norm": 0.30601584911346436,
"learning_rate": 7.438016528925619e-05,
"loss": 0.4396,
"step": 920
},
{
"epoch": 1.51,
"grad_norm": 0.45013612508773804,
"learning_rate": 7.413223140495867e-05,
"loss": 0.4477,
"step": 921
},
{
"epoch": 1.51,
"grad_norm": 0.30486834049224854,
"learning_rate": 7.388429752066116e-05,
"loss": 0.3777,
"step": 922
},
{
"epoch": 1.51,
"grad_norm": 0.3926061689853668,
"learning_rate": 7.363636363636363e-05,
"loss": 0.3532,
"step": 923
},
{
"epoch": 1.51,
"grad_norm": 0.3843371272087097,
"learning_rate": 7.338842975206611e-05,
"loss": 0.5182,
"step": 924
},
{
"epoch": 1.51,
"grad_norm": 0.30922451615333557,
"learning_rate": 7.314049586776858e-05,
"loss": 0.4361,
"step": 925
},
{
"epoch": 1.52,
"grad_norm": 0.3367323875427246,
"learning_rate": 7.289256198347106e-05,
"loss": 0.3809,
"step": 926
},
{
"epoch": 1.52,
"grad_norm": 0.39369019865989685,
"learning_rate": 7.264462809917355e-05,
"loss": 0.3623,
"step": 927
},
{
"epoch": 1.52,
"grad_norm": 0.3159162104129791,
"learning_rate": 7.239669421487603e-05,
"loss": 0.5059,
"step": 928
},
{
"epoch": 1.52,
"grad_norm": 0.34716740250587463,
"learning_rate": 7.214876033057851e-05,
"loss": 0.4201,
"step": 929
},
{
"epoch": 1.52,
"grad_norm": 0.20480923354625702,
"learning_rate": 7.190082644628098e-05,
"loss": 0.2699,
"step": 930
},
{
"epoch": 1.52,
"grad_norm": 0.3518913686275482,
"learning_rate": 7.165289256198347e-05,
"loss": 0.5337,
"step": 931
},
{
"epoch": 1.53,
"grad_norm": 0.28605952858924866,
"learning_rate": 7.140495867768594e-05,
"loss": 0.44,
"step": 932
},
{
"epoch": 1.53,
"grad_norm": 0.28229033946990967,
"learning_rate": 7.115702479338842e-05,
"loss": 0.3534,
"step": 933
},
{
"epoch": 1.53,
"grad_norm": 0.3456754684448242,
"learning_rate": 7.09090909090909e-05,
"loss": 0.3952,
"step": 934
},
{
"epoch": 1.53,
"grad_norm": 0.27707159519195557,
"learning_rate": 7.066115702479338e-05,
"loss": 0.3667,
"step": 935
},
{
"epoch": 1.53,
"grad_norm": 0.2811780273914337,
"learning_rate": 7.041322314049586e-05,
"loss": 0.3954,
"step": 936
},
{
"epoch": 1.53,
"grad_norm": 0.3099793493747711,
"learning_rate": 7.016528925619834e-05,
"loss": 0.441,
"step": 937
},
{
"epoch": 1.54,
"grad_norm": 0.4153590500354767,
"learning_rate": 6.991735537190083e-05,
"loss": 0.4462,
"step": 938
},
{
"epoch": 1.54,
"grad_norm": 0.2945801615715027,
"learning_rate": 6.96694214876033e-05,
"loss": 0.4535,
"step": 939
},
{
"epoch": 1.54,
"grad_norm": 0.2930592894554138,
"learning_rate": 6.942148760330578e-05,
"loss": 0.5566,
"step": 940
},
{
"epoch": 1.54,
"grad_norm": 0.3034913241863251,
"learning_rate": 6.917355371900826e-05,
"loss": 0.4695,
"step": 941
},
{
"epoch": 1.54,
"grad_norm": 0.3054913878440857,
"learning_rate": 6.892561983471073e-05,
"loss": 0.3921,
"step": 942
},
{
"epoch": 1.54,
"grad_norm": 0.3297981917858124,
"learning_rate": 6.867768595041322e-05,
"loss": 0.5057,
"step": 943
},
{
"epoch": 1.55,
"grad_norm": 0.23640452325344086,
"learning_rate": 6.84297520661157e-05,
"loss": 0.329,
"step": 944
},
{
"epoch": 1.55,
"grad_norm": 0.2970188856124878,
"learning_rate": 6.818181818181817e-05,
"loss": 0.4376,
"step": 945
},
{
"epoch": 1.55,
"grad_norm": 0.3243064880371094,
"learning_rate": 6.793388429752065e-05,
"loss": 0.4922,
"step": 946
},
{
"epoch": 1.55,
"grad_norm": 0.4473859667778015,
"learning_rate": 6.768595041322312e-05,
"loss": 0.5245,
"step": 947
},
{
"epoch": 1.55,
"grad_norm": 0.2901310622692108,
"learning_rate": 6.743801652892562e-05,
"loss": 0.4996,
"step": 948
},
{
"epoch": 1.55,
"grad_norm": 0.3633457124233246,
"learning_rate": 6.719008264462809e-05,
"loss": 0.4669,
"step": 949
},
{
"epoch": 1.56,
"grad_norm": 0.33570581674575806,
"learning_rate": 6.694214876033058e-05,
"loss": 0.404,
"step": 950
},
{
"epoch": 1.56,
"grad_norm": 0.26466354727745056,
"learning_rate": 6.669421487603306e-05,
"loss": 0.2881,
"step": 951
},
{
"epoch": 1.56,
"grad_norm": 0.29028353095054626,
"learning_rate": 6.644628099173553e-05,
"loss": 0.3607,
"step": 952
},
{
"epoch": 1.56,
"grad_norm": 0.2878669798374176,
"learning_rate": 6.619834710743801e-05,
"loss": 0.4415,
"step": 953
},
{
"epoch": 1.56,
"grad_norm": 0.33260804414749146,
"learning_rate": 6.595041322314048e-05,
"loss": 0.4424,
"step": 954
},
{
"epoch": 1.56,
"grad_norm": 0.3135119378566742,
"learning_rate": 6.570247933884297e-05,
"loss": 0.4276,
"step": 955
},
{
"epoch": 1.57,
"grad_norm": 0.2714795470237732,
"learning_rate": 6.545454545454545e-05,
"loss": 0.2789,
"step": 956
},
{
"epoch": 1.57,
"grad_norm": 0.3564438819885254,
"learning_rate": 6.520661157024792e-05,
"loss": 0.4683,
"step": 957
},
{
"epoch": 1.57,
"grad_norm": 0.3303399682044983,
"learning_rate": 6.495867768595042e-05,
"loss": 0.4657,
"step": 958
},
{
"epoch": 1.57,
"grad_norm": 0.30086350440979004,
"learning_rate": 6.471074380165289e-05,
"loss": 0.3296,
"step": 959
},
{
"epoch": 1.57,
"grad_norm": 0.34699100255966187,
"learning_rate": 6.446280991735537e-05,
"loss": 0.3543,
"step": 960
},
{
"epoch": 1.57,
"grad_norm": 0.326579213142395,
"learning_rate": 6.421487603305784e-05,
"loss": 0.4001,
"step": 961
},
{
"epoch": 1.58,
"grad_norm": 0.3462665379047394,
"learning_rate": 6.396694214876032e-05,
"loss": 0.3999,
"step": 962
},
{
"epoch": 1.58,
"grad_norm": 0.3408821225166321,
"learning_rate": 6.371900826446281e-05,
"loss": 0.3614,
"step": 963
},
{
"epoch": 1.58,
"grad_norm": 0.3061428666114807,
"learning_rate": 6.347107438016528e-05,
"loss": 0.4127,
"step": 964
},
{
"epoch": 1.58,
"grad_norm": 0.30745938420295715,
"learning_rate": 6.322314049586776e-05,
"loss": 0.3965,
"step": 965
},
{
"epoch": 1.58,
"grad_norm": 0.33782872557640076,
"learning_rate": 6.297520661157025e-05,
"loss": 0.5026,
"step": 966
},
{
"epoch": 1.58,
"grad_norm": 0.3501698076725006,
"learning_rate": 6.272727272727272e-05,
"loss": 0.4731,
"step": 967
},
{
"epoch": 1.58,
"grad_norm": 0.3578520119190216,
"learning_rate": 6.24793388429752e-05,
"loss": 0.4302,
"step": 968
},
{
"epoch": 1.59,
"grad_norm": 0.30132660269737244,
"learning_rate": 6.223140495867768e-05,
"loss": 0.3784,
"step": 969
},
{
"epoch": 1.59,
"grad_norm": 0.29198774695396423,
"learning_rate": 6.198347107438017e-05,
"loss": 0.396,
"step": 970
},
{
"epoch": 1.59,
"grad_norm": 0.3028549551963806,
"learning_rate": 6.173553719008264e-05,
"loss": 0.3531,
"step": 971
},
{
"epoch": 1.59,
"grad_norm": 0.3193860352039337,
"learning_rate": 6.148760330578512e-05,
"loss": 0.5261,
"step": 972
},
{
"epoch": 1.59,
"grad_norm": 0.330228716135025,
"learning_rate": 6.12396694214876e-05,
"loss": 0.3853,
"step": 973
},
{
"epoch": 1.59,
"grad_norm": 0.2856347858905792,
"learning_rate": 6.0991735537190074e-05,
"loss": 0.4543,
"step": 974
},
{
"epoch": 1.6,
"grad_norm": 0.3663886487483978,
"learning_rate": 6.074380165289256e-05,
"loss": 0.3821,
"step": 975
},
{
"epoch": 1.6,
"grad_norm": 0.3297857642173767,
"learning_rate": 6.0495867768595034e-05,
"loss": 0.4504,
"step": 976
},
{
"epoch": 1.6,
"grad_norm": 0.29853883385658264,
"learning_rate": 6.024793388429751e-05,
"loss": 0.3528,
"step": 977
},
{
"epoch": 1.6,
"grad_norm": 0.3246425986289978,
"learning_rate": 5.9999999999999995e-05,
"loss": 0.3986,
"step": 978
},
{
"epoch": 1.6,
"grad_norm": 0.3537238836288452,
"learning_rate": 5.975206611570248e-05,
"loss": 0.3776,
"step": 979
},
{
"epoch": 1.6,
"grad_norm": 0.2915757894515991,
"learning_rate": 5.9504132231404955e-05,
"loss": 0.2895,
"step": 980
},
{
"epoch": 1.61,
"grad_norm": 0.30707284808158875,
"learning_rate": 5.925619834710743e-05,
"loss": 0.3238,
"step": 981
},
{
"epoch": 1.61,
"grad_norm": 0.301845520734787,
"learning_rate": 5.900826446280991e-05,
"loss": 0.4031,
"step": 982
},
{
"epoch": 1.61,
"grad_norm": 0.24002347886562347,
"learning_rate": 5.876033057851239e-05,
"loss": 0.3477,
"step": 983
},
{
"epoch": 1.61,
"grad_norm": 0.3008634150028229,
"learning_rate": 5.851239669421487e-05,
"loss": 0.4595,
"step": 984
},
{
"epoch": 1.61,
"grad_norm": 0.32416027784347534,
"learning_rate": 5.8264462809917346e-05,
"loss": 0.403,
"step": 985
},
{
"epoch": 1.61,
"grad_norm": 0.3158760368824005,
"learning_rate": 5.801652892561982e-05,
"loss": 0.305,
"step": 986
},
{
"epoch": 1.62,
"grad_norm": 0.33743736147880554,
"learning_rate": 5.7768595041322313e-05,
"loss": 0.4867,
"step": 987
},
{
"epoch": 1.62,
"grad_norm": 0.3402981460094452,
"learning_rate": 5.752066115702479e-05,
"loss": 0.3982,
"step": 988
},
{
"epoch": 1.62,
"grad_norm": 0.3389660716056824,
"learning_rate": 5.727272727272727e-05,
"loss": 0.4311,
"step": 989
},
{
"epoch": 1.62,
"grad_norm": 0.47749587893486023,
"learning_rate": 5.702479338842975e-05,
"loss": 0.3775,
"step": 990
},
{
"epoch": 1.62,
"grad_norm": 0.27538084983825684,
"learning_rate": 5.677685950413223e-05,
"loss": 0.3568,
"step": 991
},
{
"epoch": 1.62,
"grad_norm": 0.33023789525032043,
"learning_rate": 5.6528925619834704e-05,
"loss": 0.4225,
"step": 992
},
{
"epoch": 1.63,
"grad_norm": 0.28135445713996887,
"learning_rate": 5.628099173553718e-05,
"loss": 0.3658,
"step": 993
},
{
"epoch": 1.63,
"grad_norm": 0.3511416018009186,
"learning_rate": 5.6033057851239665e-05,
"loss": 0.3928,
"step": 994
},
{
"epoch": 1.63,
"grad_norm": 0.2987925708293915,
"learning_rate": 5.578512396694214e-05,
"loss": 0.4015,
"step": 995
},
{
"epoch": 1.63,
"grad_norm": 0.3340010344982147,
"learning_rate": 5.553719008264462e-05,
"loss": 0.4566,
"step": 996
},
{
"epoch": 1.63,
"grad_norm": 0.23461014032363892,
"learning_rate": 5.528925619834711e-05,
"loss": 0.3556,
"step": 997
},
{
"epoch": 1.63,
"grad_norm": 0.3425525724887848,
"learning_rate": 5.5041322314049586e-05,
"loss": 0.3736,
"step": 998
},
{
"epoch": 1.64,
"grad_norm": 0.33320698142051697,
"learning_rate": 5.479338842975206e-05,
"loss": 0.3926,
"step": 999
},
{
"epoch": 1.64,
"grad_norm": 0.26936790347099304,
"learning_rate": 5.454545454545454e-05,
"loss": 0.3587,
"step": 1000
},
{
"epoch": 1.64,
"grad_norm": 0.322934091091156,
"learning_rate": 5.429752066115702e-05,
"loss": 0.3119,
"step": 1001
},
{
"epoch": 1.64,
"grad_norm": 0.3295484483242035,
"learning_rate": 5.40495867768595e-05,
"loss": 0.3257,
"step": 1002
},
{
"epoch": 1.64,
"grad_norm": 0.2893584370613098,
"learning_rate": 5.380165289256198e-05,
"loss": 0.3451,
"step": 1003
},
{
"epoch": 1.64,
"grad_norm": 0.3215138912200928,
"learning_rate": 5.3553719008264454e-05,
"loss": 0.4104,
"step": 1004
},
{
"epoch": 1.65,
"grad_norm": 0.19545914232730865,
"learning_rate": 5.330578512396694e-05,
"loss": 0.2245,
"step": 1005
},
{
"epoch": 1.65,
"grad_norm": 0.2952648103237152,
"learning_rate": 5.3057851239669414e-05,
"loss": 0.3393,
"step": 1006
},
{
"epoch": 1.65,
"grad_norm": 0.34105175733566284,
"learning_rate": 5.28099173553719e-05,
"loss": 0.519,
"step": 1007
},
{
"epoch": 1.65,
"grad_norm": 0.3435216546058655,
"learning_rate": 5.256198347107438e-05,
"loss": 0.4968,
"step": 1008
},
{
"epoch": 1.65,
"grad_norm": 0.29052355885505676,
"learning_rate": 5.231404958677686e-05,
"loss": 0.4419,
"step": 1009
},
{
"epoch": 1.65,
"grad_norm": 0.3326230049133301,
"learning_rate": 5.2066115702479335e-05,
"loss": 0.4461,
"step": 1010
},
{
"epoch": 1.66,
"grad_norm": 0.35595494508743286,
"learning_rate": 5.181818181818181e-05,
"loss": 0.4886,
"step": 1011
},
{
"epoch": 1.66,
"grad_norm": 0.3467525541782379,
"learning_rate": 5.1570247933884295e-05,
"loss": 0.4671,
"step": 1012
},
{
"epoch": 1.66,
"grad_norm": 0.29460448026657104,
"learning_rate": 5.132231404958677e-05,
"loss": 0.3872,
"step": 1013
},
{
"epoch": 1.66,
"grad_norm": 0.273575097322464,
"learning_rate": 5.107438016528925e-05,
"loss": 0.3603,
"step": 1014
},
{
"epoch": 1.66,
"grad_norm": 0.3603818416595459,
"learning_rate": 5.0826446280991726e-05,
"loss": 0.3539,
"step": 1015
},
{
"epoch": 1.66,
"grad_norm": 0.31469517946243286,
"learning_rate": 5.057851239669421e-05,
"loss": 0.3988,
"step": 1016
},
{
"epoch": 1.67,
"grad_norm": 0.3218969702720642,
"learning_rate": 5.033057851239669e-05,
"loss": 0.4366,
"step": 1017
},
{
"epoch": 1.67,
"grad_norm": 0.34077420830726624,
"learning_rate": 5.008264462809917e-05,
"loss": 0.4248,
"step": 1018
},
{
"epoch": 1.67,
"grad_norm": 0.322591096162796,
"learning_rate": 4.9834710743801654e-05,
"loss": 0.5081,
"step": 1019
},
{
"epoch": 1.67,
"grad_norm": 0.35607361793518066,
"learning_rate": 4.958677685950413e-05,
"loss": 0.3596,
"step": 1020
},
{
"epoch": 1.67,
"grad_norm": 0.2865798771381378,
"learning_rate": 4.933884297520661e-05,
"loss": 0.2703,
"step": 1021
},
{
"epoch": 1.67,
"grad_norm": 0.30387502908706665,
"learning_rate": 4.9090909090909084e-05,
"loss": 0.3051,
"step": 1022
},
{
"epoch": 1.67,
"grad_norm": 0.3474448323249817,
"learning_rate": 4.884297520661157e-05,
"loss": 0.2851,
"step": 1023
},
{
"epoch": 1.68,
"grad_norm": 0.3696686625480652,
"learning_rate": 4.8595041322314045e-05,
"loss": 0.4403,
"step": 1024
},
{
"epoch": 1.68,
"grad_norm": 0.33602291345596313,
"learning_rate": 4.834710743801652e-05,
"loss": 0.4134,
"step": 1025
},
{
"epoch": 1.68,
"grad_norm": 0.27331918478012085,
"learning_rate": 4.8099173553719e-05,
"loss": 0.3303,
"step": 1026
},
{
"epoch": 1.68,
"grad_norm": 0.3705825209617615,
"learning_rate": 4.785123966942149e-05,
"loss": 0.3411,
"step": 1027
},
{
"epoch": 1.68,
"grad_norm": 0.4541082978248596,
"learning_rate": 4.7603305785123966e-05,
"loss": 0.4263,
"step": 1028
},
{
"epoch": 1.68,
"grad_norm": 0.29885897040367126,
"learning_rate": 4.735537190082644e-05,
"loss": 0.5602,
"step": 1029
},
{
"epoch": 1.69,
"grad_norm": 0.35169675946235657,
"learning_rate": 4.7107438016528926e-05,
"loss": 0.4409,
"step": 1030
},
{
"epoch": 1.69,
"grad_norm": 0.41590291261672974,
"learning_rate": 4.68595041322314e-05,
"loss": 0.4355,
"step": 1031
},
{
"epoch": 1.69,
"grad_norm": 0.33613288402557373,
"learning_rate": 4.661157024793388e-05,
"loss": 0.4399,
"step": 1032
},
{
"epoch": 1.69,
"grad_norm": 0.3519938886165619,
"learning_rate": 4.6363636363636356e-05,
"loss": 0.4464,
"step": 1033
},
{
"epoch": 1.69,
"grad_norm": 0.2981269359588623,
"learning_rate": 4.611570247933884e-05,
"loss": 0.3667,
"step": 1034
},
{
"epoch": 1.69,
"grad_norm": 0.32030418515205383,
"learning_rate": 4.586776859504132e-05,
"loss": 0.3759,
"step": 1035
},
{
"epoch": 1.7,
"grad_norm": 0.39815372228622437,
"learning_rate": 4.5619834710743794e-05,
"loss": 0.3259,
"step": 1036
},
{
"epoch": 1.7,
"grad_norm": 0.33106112480163574,
"learning_rate": 4.537190082644628e-05,
"loss": 0.4985,
"step": 1037
},
{
"epoch": 1.7,
"grad_norm": 0.3748137950897217,
"learning_rate": 4.512396694214876e-05,
"loss": 0.5177,
"step": 1038
},
{
"epoch": 1.7,
"grad_norm": 0.31328514218330383,
"learning_rate": 4.487603305785124e-05,
"loss": 0.3406,
"step": 1039
},
{
"epoch": 1.7,
"grad_norm": 0.35391247272491455,
"learning_rate": 4.4628099173553715e-05,
"loss": 0.4216,
"step": 1040
},
{
"epoch": 1.7,
"grad_norm": 0.37352749705314636,
"learning_rate": 4.438016528925619e-05,
"loss": 0.4936,
"step": 1041
},
{
"epoch": 1.71,
"grad_norm": 0.20523978769779205,
"learning_rate": 4.4132231404958675e-05,
"loss": 0.2241,
"step": 1042
},
{
"epoch": 1.71,
"grad_norm": 0.26052072644233704,
"learning_rate": 4.388429752066115e-05,
"loss": 0.352,
"step": 1043
},
{
"epoch": 1.71,
"grad_norm": 0.30189159512519836,
"learning_rate": 4.363636363636363e-05,
"loss": 0.3956,
"step": 1044
},
{
"epoch": 1.71,
"grad_norm": 0.28206998109817505,
"learning_rate": 4.3388429752066106e-05,
"loss": 0.3073,
"step": 1045
},
{
"epoch": 1.71,
"grad_norm": 0.3497346341609955,
"learning_rate": 4.314049586776859e-05,
"loss": 0.4544,
"step": 1046
},
{
"epoch": 1.71,
"grad_norm": 0.31490492820739746,
"learning_rate": 4.289256198347107e-05,
"loss": 0.4809,
"step": 1047
},
{
"epoch": 1.72,
"grad_norm": 0.26548659801483154,
"learning_rate": 4.264462809917355e-05,
"loss": 0.3189,
"step": 1048
},
{
"epoch": 1.72,
"grad_norm": 0.40890252590179443,
"learning_rate": 4.239669421487603e-05,
"loss": 0.4825,
"step": 1049
},
{
"epoch": 1.72,
"grad_norm": 0.392419695854187,
"learning_rate": 4.214876033057851e-05,
"loss": 0.3518,
"step": 1050
},
{
"epoch": 1.72,
"grad_norm": 0.3267776370048523,
"learning_rate": 4.190082644628099e-05,
"loss": 0.5964,
"step": 1051
},
{
"epoch": 1.72,
"grad_norm": 0.29872927069664,
"learning_rate": 4.1652892561983464e-05,
"loss": 0.3496,
"step": 1052
},
{
"epoch": 1.72,
"grad_norm": 0.3140263259410858,
"learning_rate": 4.140495867768595e-05,
"loss": 0.3496,
"step": 1053
},
{
"epoch": 1.73,
"grad_norm": 0.35923945903778076,
"learning_rate": 4.1157024793388424e-05,
"loss": 0.4328,
"step": 1054
},
{
"epoch": 1.73,
"grad_norm": 0.24899311363697052,
"learning_rate": 4.09090909090909e-05,
"loss": 0.3662,
"step": 1055
},
{
"epoch": 1.73,
"grad_norm": 0.300325870513916,
"learning_rate": 4.066115702479338e-05,
"loss": 0.3714,
"step": 1056
},
{
"epoch": 1.73,
"grad_norm": 0.26927053928375244,
"learning_rate": 4.041322314049587e-05,
"loss": 0.3518,
"step": 1057
},
{
"epoch": 1.73,
"grad_norm": 0.28170421719551086,
"learning_rate": 4.0165289256198345e-05,
"loss": 0.4214,
"step": 1058
},
{
"epoch": 1.73,
"grad_norm": 0.3097275197505951,
"learning_rate": 3.991735537190082e-05,
"loss": 0.3387,
"step": 1059
},
{
"epoch": 1.74,
"grad_norm": 0.36259180307388306,
"learning_rate": 3.9669421487603306e-05,
"loss": 0.4968,
"step": 1060
},
{
"epoch": 1.74,
"grad_norm": 0.3555668592453003,
"learning_rate": 3.942148760330578e-05,
"loss": 0.4415,
"step": 1061
},
{
"epoch": 1.74,
"grad_norm": 0.2894740104675293,
"learning_rate": 3.917355371900826e-05,
"loss": 0.3911,
"step": 1062
},
{
"epoch": 1.74,
"grad_norm": 0.3361656665802002,
"learning_rate": 3.8925619834710736e-05,
"loss": 0.4286,
"step": 1063
},
{
"epoch": 1.74,
"grad_norm": 0.33269697427749634,
"learning_rate": 3.867768595041322e-05,
"loss": 0.5162,
"step": 1064
},
{
"epoch": 1.74,
"grad_norm": 0.3324260711669922,
"learning_rate": 3.84297520661157e-05,
"loss": 0.4073,
"step": 1065
},
{
"epoch": 1.75,
"grad_norm": 0.3037840723991394,
"learning_rate": 3.8181818181818174e-05,
"loss": 0.4084,
"step": 1066
},
{
"epoch": 1.75,
"grad_norm": 0.29843100905418396,
"learning_rate": 3.7933884297520664e-05,
"loss": 0.4028,
"step": 1067
},
{
"epoch": 1.75,
"grad_norm": 0.24433061480522156,
"learning_rate": 3.768595041322314e-05,
"loss": 0.3769,
"step": 1068
},
{
"epoch": 1.75,
"grad_norm": 0.31540754437446594,
"learning_rate": 3.743801652892562e-05,
"loss": 0.4006,
"step": 1069
},
{
"epoch": 1.75,
"grad_norm": 0.3915780186653137,
"learning_rate": 3.7190082644628094e-05,
"loss": 0.3859,
"step": 1070
},
{
"epoch": 1.75,
"grad_norm": 0.7843402028083801,
"learning_rate": 3.694214876033058e-05,
"loss": 0.4284,
"step": 1071
},
{
"epoch": 1.76,
"grad_norm": 0.3000487685203552,
"learning_rate": 3.6694214876033055e-05,
"loss": 0.6066,
"step": 1072
},
{
"epoch": 1.76,
"grad_norm": 0.2342897206544876,
"learning_rate": 3.644628099173553e-05,
"loss": 0.3012,
"step": 1073
},
{
"epoch": 1.76,
"grad_norm": 0.3100823760032654,
"learning_rate": 3.6198347107438015e-05,
"loss": 0.4236,
"step": 1074
},
{
"epoch": 1.76,
"grad_norm": 0.3442421853542328,
"learning_rate": 3.595041322314049e-05,
"loss": 0.4716,
"step": 1075
},
{
"epoch": 1.76,
"grad_norm": 0.2785506546497345,
"learning_rate": 3.570247933884297e-05,
"loss": 0.307,
"step": 1076
},
{
"epoch": 1.76,
"grad_norm": 0.333635151386261,
"learning_rate": 3.545454545454545e-05,
"loss": 0.4521,
"step": 1077
},
{
"epoch": 1.77,
"grad_norm": 0.3365010619163513,
"learning_rate": 3.520661157024793e-05,
"loss": 0.4522,
"step": 1078
},
{
"epoch": 1.77,
"grad_norm": 0.31510964035987854,
"learning_rate": 3.495867768595041e-05,
"loss": 0.4101,
"step": 1079
},
{
"epoch": 1.77,
"grad_norm": 0.2939818501472473,
"learning_rate": 3.471074380165289e-05,
"loss": 0.378,
"step": 1080
},
{
"epoch": 1.77,
"grad_norm": 0.33073171973228455,
"learning_rate": 3.446280991735537e-05,
"loss": 0.4319,
"step": 1081
},
{
"epoch": 1.77,
"grad_norm": 0.306769460439682,
"learning_rate": 3.421487603305785e-05,
"loss": 0.4584,
"step": 1082
},
{
"epoch": 1.77,
"grad_norm": 0.3151317536830902,
"learning_rate": 3.396694214876033e-05,
"loss": 0.3202,
"step": 1083
},
{
"epoch": 1.77,
"grad_norm": 0.313348650932312,
"learning_rate": 3.371900826446281e-05,
"loss": 0.4051,
"step": 1084
},
{
"epoch": 1.78,
"grad_norm": 0.3377431333065033,
"learning_rate": 3.347107438016529e-05,
"loss": 0.3842,
"step": 1085
},
{
"epoch": 1.78,
"grad_norm": 0.31378257274627686,
"learning_rate": 3.3223140495867765e-05,
"loss": 0.377,
"step": 1086
},
{
"epoch": 1.78,
"grad_norm": 0.31627315282821655,
"learning_rate": 3.297520661157024e-05,
"loss": 0.4278,
"step": 1087
},
{
"epoch": 1.78,
"grad_norm": 0.2957272529602051,
"learning_rate": 3.2727272727272725e-05,
"loss": 0.3384,
"step": 1088
},
{
"epoch": 1.78,
"grad_norm": 0.3261624872684479,
"learning_rate": 3.247933884297521e-05,
"loss": 0.4525,
"step": 1089
},
{
"epoch": 1.78,
"grad_norm": 0.28680557012557983,
"learning_rate": 3.2231404958677685e-05,
"loss": 0.3627,
"step": 1090
},
{
"epoch": 1.79,
"grad_norm": 0.29543063044548035,
"learning_rate": 3.198347107438016e-05,
"loss": 0.2922,
"step": 1091
},
{
"epoch": 1.79,
"grad_norm": 0.3554795980453491,
"learning_rate": 3.173553719008264e-05,
"loss": 0.4692,
"step": 1092
},
{
"epoch": 1.79,
"grad_norm": 0.28728747367858887,
"learning_rate": 3.148760330578512e-05,
"loss": 0.2595,
"step": 1093
},
{
"epoch": 1.79,
"grad_norm": 0.3099517524242401,
"learning_rate": 3.12396694214876e-05,
"loss": 0.3912,
"step": 1094
},
{
"epoch": 1.79,
"grad_norm": 0.3173176050186157,
"learning_rate": 3.099173553719008e-05,
"loss": 0.4186,
"step": 1095
},
{
"epoch": 1.79,
"grad_norm": 0.3445116877555847,
"learning_rate": 3.074380165289256e-05,
"loss": 0.4966,
"step": 1096
},
{
"epoch": 1.8,
"grad_norm": 0.32030245661735535,
"learning_rate": 3.0495867768595037e-05,
"loss": 0.4671,
"step": 1097
},
{
"epoch": 1.8,
"grad_norm": 0.3321797847747803,
"learning_rate": 3.0247933884297517e-05,
"loss": 0.4265,
"step": 1098
},
{
"epoch": 1.8,
"grad_norm": 0.36085036396980286,
"learning_rate": 2.9999999999999997e-05,
"loss": 0.4324,
"step": 1099
},
{
"epoch": 1.8,
"grad_norm": 0.2999497950077057,
"learning_rate": 2.9752066115702478e-05,
"loss": 0.3173,
"step": 1100
},
{
"epoch": 1.8,
"grad_norm": 0.31063607335090637,
"learning_rate": 2.9504132231404954e-05,
"loss": 0.4941,
"step": 1101
},
{
"epoch": 1.8,
"grad_norm": 0.2864468991756439,
"learning_rate": 2.9256198347107435e-05,
"loss": 0.4309,
"step": 1102
},
{
"epoch": 1.81,
"grad_norm": 0.2904879152774811,
"learning_rate": 2.900826446280991e-05,
"loss": 0.4782,
"step": 1103
},
{
"epoch": 1.81,
"grad_norm": 0.31169822812080383,
"learning_rate": 2.8760330578512395e-05,
"loss": 0.4881,
"step": 1104
},
{
"epoch": 1.81,
"grad_norm": 0.3462170660495758,
"learning_rate": 2.8512396694214875e-05,
"loss": 0.3551,
"step": 1105
},
{
"epoch": 1.81,
"grad_norm": 0.3066549301147461,
"learning_rate": 2.8264462809917352e-05,
"loss": 0.4522,
"step": 1106
},
{
"epoch": 1.81,
"grad_norm": 0.33785369992256165,
"learning_rate": 2.8016528925619832e-05,
"loss": 0.3763,
"step": 1107
},
{
"epoch": 1.81,
"grad_norm": 0.2975507378578186,
"learning_rate": 2.776859504132231e-05,
"loss": 0.3193,
"step": 1108
},
{
"epoch": 1.82,
"grad_norm": 0.31934845447540283,
"learning_rate": 2.7520661157024793e-05,
"loss": 0.2994,
"step": 1109
},
{
"epoch": 1.82,
"grad_norm": 0.29450473189353943,
"learning_rate": 2.727272727272727e-05,
"loss": 0.4279,
"step": 1110
},
{
"epoch": 1.82,
"grad_norm": 0.3054717779159546,
"learning_rate": 2.702479338842975e-05,
"loss": 0.4687,
"step": 1111
},
{
"epoch": 1.82,
"grad_norm": 0.32938167452812195,
"learning_rate": 2.6776859504132227e-05,
"loss": 0.4815,
"step": 1112
},
{
"epoch": 1.82,
"grad_norm": 0.2678495943546295,
"learning_rate": 2.6528925619834707e-05,
"loss": 0.3116,
"step": 1113
},
{
"epoch": 1.82,
"grad_norm": 0.26357004046440125,
"learning_rate": 2.628099173553719e-05,
"loss": 0.3286,
"step": 1114
},
{
"epoch": 1.83,
"grad_norm": 0.3359578251838684,
"learning_rate": 2.6033057851239667e-05,
"loss": 0.4137,
"step": 1115
},
{
"epoch": 1.83,
"grad_norm": 0.3395717442035675,
"learning_rate": 2.5785123966942148e-05,
"loss": 0.3812,
"step": 1116
},
{
"epoch": 1.83,
"grad_norm": 0.29891693592071533,
"learning_rate": 2.5537190082644625e-05,
"loss": 0.2989,
"step": 1117
},
{
"epoch": 1.83,
"grad_norm": 0.402649462223053,
"learning_rate": 2.5289256198347105e-05,
"loss": 0.4333,
"step": 1118
},
{
"epoch": 1.83,
"grad_norm": 0.3397662341594696,
"learning_rate": 2.5041322314049585e-05,
"loss": 0.4188,
"step": 1119
},
{
"epoch": 1.83,
"grad_norm": 0.33743607997894287,
"learning_rate": 2.4793388429752065e-05,
"loss": 0.5309,
"step": 1120
},
{
"epoch": 1.84,
"grad_norm": 0.3248274624347687,
"learning_rate": 2.4545454545454542e-05,
"loss": 0.3905,
"step": 1121
},
{
"epoch": 1.84,
"grad_norm": 0.3567257821559906,
"learning_rate": 2.4297520661157022e-05,
"loss": 0.4107,
"step": 1122
},
{
"epoch": 1.84,
"grad_norm": 0.4383893311023712,
"learning_rate": 2.40495867768595e-05,
"loss": 0.5024,
"step": 1123
},
{
"epoch": 1.84,
"grad_norm": 0.2777807414531708,
"learning_rate": 2.3801652892561983e-05,
"loss": 0.3289,
"step": 1124
},
{
"epoch": 1.84,
"grad_norm": 0.3409118950366974,
"learning_rate": 2.3553719008264463e-05,
"loss": 0.5199,
"step": 1125
},
{
"epoch": 1.84,
"grad_norm": 0.3060845732688904,
"learning_rate": 2.330578512396694e-05,
"loss": 0.412,
"step": 1126
},
{
"epoch": 1.85,
"grad_norm": 0.3366425335407257,
"learning_rate": 2.305785123966942e-05,
"loss": 0.484,
"step": 1127
},
{
"epoch": 1.85,
"grad_norm": 0.36060798168182373,
"learning_rate": 2.2809917355371897e-05,
"loss": 0.5543,
"step": 1128
},
{
"epoch": 1.85,
"grad_norm": 0.25729015469551086,
"learning_rate": 2.256198347107438e-05,
"loss": 0.2763,
"step": 1129
},
{
"epoch": 1.85,
"grad_norm": 0.2890430688858032,
"learning_rate": 2.2314049586776857e-05,
"loss": 0.3762,
"step": 1130
},
{
"epoch": 1.85,
"grad_norm": 0.31579041481018066,
"learning_rate": 2.2066115702479338e-05,
"loss": 0.396,
"step": 1131
},
{
"epoch": 1.85,
"grad_norm": 0.3136342763900757,
"learning_rate": 2.1818181818181814e-05,
"loss": 0.4134,
"step": 1132
},
{
"epoch": 1.86,
"grad_norm": 0.37239784002304077,
"learning_rate": 2.1570247933884295e-05,
"loss": 0.4666,
"step": 1133
},
{
"epoch": 1.86,
"grad_norm": 0.2847795784473419,
"learning_rate": 2.1322314049586775e-05,
"loss": 0.3481,
"step": 1134
},
{
"epoch": 1.86,
"grad_norm": 0.27870920300483704,
"learning_rate": 2.1074380165289255e-05,
"loss": 0.2669,
"step": 1135
},
{
"epoch": 1.86,
"grad_norm": 0.2700231969356537,
"learning_rate": 2.0826446280991732e-05,
"loss": 0.2798,
"step": 1136
},
{
"epoch": 1.86,
"grad_norm": 0.3257925510406494,
"learning_rate": 2.0578512396694212e-05,
"loss": 0.4931,
"step": 1137
},
{
"epoch": 1.86,
"grad_norm": 0.2964242994785309,
"learning_rate": 2.033057851239669e-05,
"loss": 0.429,
"step": 1138
},
{
"epoch": 1.86,
"grad_norm": 0.32561832666397095,
"learning_rate": 2.0082644628099173e-05,
"loss": 0.3467,
"step": 1139
},
{
"epoch": 1.87,
"grad_norm": 0.27957382798194885,
"learning_rate": 1.9834710743801653e-05,
"loss": 0.2686,
"step": 1140
},
{
"epoch": 1.87,
"grad_norm": 0.3476884663105011,
"learning_rate": 1.958677685950413e-05,
"loss": 0.4814,
"step": 1141
},
{
"epoch": 1.87,
"grad_norm": 0.2950107753276825,
"learning_rate": 1.933884297520661e-05,
"loss": 0.3578,
"step": 1142
},
{
"epoch": 1.87,
"grad_norm": 0.30689096450805664,
"learning_rate": 1.9090909090909087e-05,
"loss": 0.3725,
"step": 1143
},
{
"epoch": 1.87,
"grad_norm": 0.430915504693985,
"learning_rate": 1.884297520661157e-05,
"loss": 0.4766,
"step": 1144
},
{
"epoch": 1.87,
"grad_norm": 0.3086168169975281,
"learning_rate": 1.8595041322314047e-05,
"loss": 0.5506,
"step": 1145
},
{
"epoch": 1.88,
"grad_norm": 0.3441203534603119,
"learning_rate": 1.8347107438016527e-05,
"loss": 0.4251,
"step": 1146
},
{
"epoch": 1.88,
"grad_norm": 0.2828252613544464,
"learning_rate": 1.8099173553719008e-05,
"loss": 0.3,
"step": 1147
},
{
"epoch": 1.88,
"grad_norm": 0.33563023805618286,
"learning_rate": 1.7851239669421485e-05,
"loss": 0.4082,
"step": 1148
},
{
"epoch": 1.88,
"grad_norm": 0.33100175857543945,
"learning_rate": 1.7603305785123965e-05,
"loss": 0.5853,
"step": 1149
},
{
"epoch": 1.88,
"grad_norm": 0.3554556369781494,
"learning_rate": 1.7355371900826445e-05,
"loss": 0.5677,
"step": 1150
},
{
"epoch": 1.88,
"grad_norm": 0.32995131611824036,
"learning_rate": 1.7107438016528925e-05,
"loss": 0.3315,
"step": 1151
},
{
"epoch": 1.89,
"grad_norm": 0.3160393238067627,
"learning_rate": 1.6859504132231405e-05,
"loss": 0.3632,
"step": 1152
},
{
"epoch": 1.89,
"grad_norm": 0.3632807433605194,
"learning_rate": 1.6611570247933882e-05,
"loss": 0.4053,
"step": 1153
},
{
"epoch": 1.89,
"grad_norm": 0.2931605279445648,
"learning_rate": 1.6363636363636363e-05,
"loss": 0.358,
"step": 1154
},
{
"epoch": 1.89,
"grad_norm": 0.32687610387802124,
"learning_rate": 1.6115702479338843e-05,
"loss": 0.4584,
"step": 1155
},
{
"epoch": 1.89,
"grad_norm": 0.3283078074455261,
"learning_rate": 1.586776859504132e-05,
"loss": 0.3818,
"step": 1156
},
{
"epoch": 1.89,
"grad_norm": 0.31993189454078674,
"learning_rate": 1.56198347107438e-05,
"loss": 0.3714,
"step": 1157
},
{
"epoch": 1.9,
"grad_norm": 0.2674204409122467,
"learning_rate": 1.537190082644628e-05,
"loss": 0.3943,
"step": 1158
},
{
"epoch": 1.9,
"grad_norm": 0.3968242406845093,
"learning_rate": 1.5123966942148759e-05,
"loss": 0.4465,
"step": 1159
},
{
"epoch": 1.9,
"grad_norm": 0.2870213985443115,
"learning_rate": 1.4876033057851239e-05,
"loss": 0.3616,
"step": 1160
},
{
"epoch": 1.9,
"grad_norm": 0.29502633213996887,
"learning_rate": 1.4628099173553717e-05,
"loss": 0.4112,
"step": 1161
},
{
"epoch": 1.9,
"grad_norm": 0.36414459347724915,
"learning_rate": 1.4380165289256198e-05,
"loss": 0.3928,
"step": 1162
},
{
"epoch": 1.9,
"grad_norm": 0.274940550327301,
"learning_rate": 1.4132231404958676e-05,
"loss": 0.3971,
"step": 1163
},
{
"epoch": 1.91,
"grad_norm": 0.3382115364074707,
"learning_rate": 1.3884297520661155e-05,
"loss": 0.339,
"step": 1164
},
{
"epoch": 1.91,
"grad_norm": 0.32059189677238464,
"learning_rate": 1.3636363636363635e-05,
"loss": 0.4632,
"step": 1165
},
{
"epoch": 1.91,
"grad_norm": 0.40788954496383667,
"learning_rate": 1.3388429752066113e-05,
"loss": 0.4729,
"step": 1166
},
{
"epoch": 1.91,
"grad_norm": 0.4415609836578369,
"learning_rate": 1.3140495867768595e-05,
"loss": 0.4311,
"step": 1167
},
{
"epoch": 1.91,
"grad_norm": 0.29439279437065125,
"learning_rate": 1.2892561983471074e-05,
"loss": 0.3428,
"step": 1168
},
{
"epoch": 1.91,
"grad_norm": 0.38421952724456787,
"learning_rate": 1.2644628099173552e-05,
"loss": 0.5504,
"step": 1169
},
{
"epoch": 1.92,
"grad_norm": 0.2757047116756439,
"learning_rate": 1.2396694214876033e-05,
"loss": 0.3488,
"step": 1170
},
{
"epoch": 1.92,
"grad_norm": 0.27029332518577576,
"learning_rate": 1.2148760330578511e-05,
"loss": 0.3922,
"step": 1171
},
{
"epoch": 1.92,
"grad_norm": 0.29828086495399475,
"learning_rate": 1.1900826446280991e-05,
"loss": 0.3484,
"step": 1172
},
{
"epoch": 1.92,
"grad_norm": 0.3248095214366913,
"learning_rate": 1.165289256198347e-05,
"loss": 0.4166,
"step": 1173
},
{
"epoch": 1.92,
"grad_norm": 0.3183375895023346,
"learning_rate": 1.1404958677685948e-05,
"loss": 0.4207,
"step": 1174
},
{
"epoch": 1.92,
"grad_norm": 0.38209760189056396,
"learning_rate": 1.1157024793388429e-05,
"loss": 0.4136,
"step": 1175
},
{
"epoch": 1.93,
"grad_norm": 0.31191781163215637,
"learning_rate": 1.0909090909090907e-05,
"loss": 0.3821,
"step": 1176
},
{
"epoch": 1.93,
"grad_norm": 0.3147072494029999,
"learning_rate": 1.0661157024793387e-05,
"loss": 0.2973,
"step": 1177
},
{
"epoch": 1.93,
"grad_norm": 0.346629798412323,
"learning_rate": 1.0413223140495866e-05,
"loss": 0.5924,
"step": 1178
},
{
"epoch": 1.93,
"grad_norm": 0.30329591035842896,
"learning_rate": 1.0165289256198345e-05,
"loss": 0.4802,
"step": 1179
},
{
"epoch": 1.93,
"grad_norm": 0.3608144521713257,
"learning_rate": 9.917355371900826e-06,
"loss": 0.4187,
"step": 1180
},
{
"epoch": 1.93,
"grad_norm": 0.3330174684524536,
"learning_rate": 9.669421487603305e-06,
"loss": 0.4585,
"step": 1181
},
{
"epoch": 1.94,
"grad_norm": 0.2880091071128845,
"learning_rate": 9.421487603305785e-06,
"loss": 0.3926,
"step": 1182
},
{
"epoch": 1.94,
"grad_norm": 0.2711026668548584,
"learning_rate": 9.173553719008264e-06,
"loss": 0.3128,
"step": 1183
},
{
"epoch": 1.94,
"grad_norm": 0.3472573161125183,
"learning_rate": 8.925619834710742e-06,
"loss": 0.3626,
"step": 1184
},
{
"epoch": 1.94,
"grad_norm": 0.29903772473335266,
"learning_rate": 8.677685950413222e-06,
"loss": 0.3778,
"step": 1185
},
{
"epoch": 1.94,
"grad_norm": 0.309654176235199,
"learning_rate": 8.429752066115703e-06,
"loss": 0.3965,
"step": 1186
},
{
"epoch": 1.94,
"grad_norm": 0.3163444399833679,
"learning_rate": 8.181818181818181e-06,
"loss": 0.3207,
"step": 1187
},
{
"epoch": 1.95,
"grad_norm": 0.3754628300666809,
"learning_rate": 7.93388429752066e-06,
"loss": 0.3954,
"step": 1188
},
{
"epoch": 1.95,
"grad_norm": 0.2967177629470825,
"learning_rate": 7.68595041322314e-06,
"loss": 0.4092,
"step": 1189
},
{
"epoch": 1.95,
"grad_norm": 0.37930914759635925,
"learning_rate": 7.438016528925619e-06,
"loss": 0.5038,
"step": 1190
},
{
"epoch": 1.95,
"grad_norm": 0.31978312134742737,
"learning_rate": 7.190082644628099e-06,
"loss": 0.3039,
"step": 1191
},
{
"epoch": 1.95,
"grad_norm": 0.34556475281715393,
"learning_rate": 6.942148760330577e-06,
"loss": 0.3749,
"step": 1192
},
{
"epoch": 1.95,
"grad_norm": 0.33958449959754944,
"learning_rate": 6.694214876033057e-06,
"loss": 0.4974,
"step": 1193
},
{
"epoch": 1.95,
"grad_norm": 0.34213709831237793,
"learning_rate": 6.446280991735537e-06,
"loss": 0.4874,
"step": 1194
},
{
"epoch": 1.96,
"grad_norm": 0.3194979131221771,
"learning_rate": 6.198347107438016e-06,
"loss": 0.4415,
"step": 1195
},
{
"epoch": 1.96,
"grad_norm": 0.3170003890991211,
"learning_rate": 5.950413223140496e-06,
"loss": 0.299,
"step": 1196
},
{
"epoch": 1.96,
"grad_norm": 0.35796797275543213,
"learning_rate": 5.702479338842974e-06,
"loss": 0.4516,
"step": 1197
},
{
"epoch": 1.96,
"grad_norm": 0.36410433053970337,
"learning_rate": 5.454545454545454e-06,
"loss": 0.3137,
"step": 1198
},
{
"epoch": 1.96,
"grad_norm": 0.27563753724098206,
"learning_rate": 5.206611570247933e-06,
"loss": 0.3465,
"step": 1199
},
{
"epoch": 1.96,
"grad_norm": 0.3430056869983673,
"learning_rate": 4.958677685950413e-06,
"loss": 0.5325,
"step": 1200
},
{
"epoch": 1.97,
"grad_norm": 0.3032241463661194,
"learning_rate": 4.710743801652893e-06,
"loss": 0.3802,
"step": 1201
},
{
"epoch": 1.97,
"grad_norm": 0.3008878231048584,
"learning_rate": 4.462809917355371e-06,
"loss": 0.3674,
"step": 1202
},
{
"epoch": 1.97,
"grad_norm": 0.34465453028678894,
"learning_rate": 4.214876033057851e-06,
"loss": 0.3465,
"step": 1203
},
{
"epoch": 1.97,
"grad_norm": 0.3217530846595764,
"learning_rate": 3.96694214876033e-06,
"loss": 0.395,
"step": 1204
},
{
"epoch": 1.97,
"grad_norm": 0.3256390690803528,
"learning_rate": 3.7190082644628097e-06,
"loss": 0.2928,
"step": 1205
},
{
"epoch": 1.97,
"grad_norm": 0.404376357793808,
"learning_rate": 3.4710743801652887e-06,
"loss": 0.5579,
"step": 1206
},
{
"epoch": 1.98,
"grad_norm": 0.2786218822002411,
"learning_rate": 3.2231404958677685e-06,
"loss": 0.3842,
"step": 1207
},
{
"epoch": 1.98,
"grad_norm": 0.339501291513443,
"learning_rate": 2.975206611570248e-06,
"loss": 0.4061,
"step": 1208
},
{
"epoch": 1.98,
"grad_norm": 0.3386409878730774,
"learning_rate": 2.727272727272727e-06,
"loss": 0.3452,
"step": 1209
},
{
"epoch": 1.98,
"grad_norm": 0.36449265480041504,
"learning_rate": 2.4793388429752066e-06,
"loss": 0.3769,
"step": 1210
},
{
"epoch": 1.98,
"grad_norm": 0.3336932361125946,
"learning_rate": 2.2314049586776856e-06,
"loss": 0.4361,
"step": 1211
},
{
"epoch": 1.98,
"grad_norm": 0.28075236082077026,
"learning_rate": 1.983471074380165e-06,
"loss": 0.3614,
"step": 1212
},
{
"epoch": 1.99,
"grad_norm": 0.31337854266166687,
"learning_rate": 1.7355371900826443e-06,
"loss": 0.37,
"step": 1213
},
{
"epoch": 1.99,
"grad_norm": 0.3034374415874481,
"learning_rate": 1.487603305785124e-06,
"loss": 0.274,
"step": 1214
},
{
"epoch": 1.99,
"grad_norm": 0.3485061526298523,
"learning_rate": 1.2396694214876033e-06,
"loss": 0.425,
"step": 1215
},
{
"epoch": 1.99,
"grad_norm": 0.24720066785812378,
"learning_rate": 9.917355371900825e-07,
"loss": 0.307,
"step": 1216
},
{
"epoch": 1.99,
"grad_norm": 0.2727121412754059,
"learning_rate": 7.43801652892562e-07,
"loss": 0.2991,
"step": 1217
},
{
"epoch": 1.99,
"grad_norm": 0.33211690187454224,
"learning_rate": 4.958677685950412e-07,
"loss": 0.5309,
"step": 1218
},
{
"epoch": 2.0,
"grad_norm": 0.328895628452301,
"learning_rate": 2.479338842975206e-07,
"loss": 0.3547,
"step": 1219
},
{
"epoch": 2.0,
"grad_norm": 0.2642543315887451,
"learning_rate": 0.0,
"loss": 0.3047,
"step": 1220
}
],
"logging_steps": 1,
"max_steps": 1220,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 610,
"total_flos": 2.626577866972938e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}