{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.012523314681588,
"eval_steps": 500,
"global_step": 950,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 0.057669635862112045,
"learning_rate": 1.0638297872340426e-07,
"loss": 1.3494,
"step": 1
},
{
"epoch": 0.0,
"grad_norm": 0.06152823567390442,
"learning_rate": 2.1276595744680852e-07,
"loss": 1.2781,
"step": 2
},
{
"epoch": 0.0,
"grad_norm": 0.05398479849100113,
"learning_rate": 3.1914893617021275e-07,
"loss": 1.3861,
"step": 3
},
{
"epoch": 0.0,
"grad_norm": 0.059882670640945435,
"learning_rate": 4.2553191489361704e-07,
"loss": 1.4557,
"step": 4
},
{
"epoch": 0.01,
"grad_norm": 0.059738870710134506,
"learning_rate": 5.319148936170213e-07,
"loss": 1.3505,
"step": 5
},
{
"epoch": 0.01,
"grad_norm": 0.05607615038752556,
"learning_rate": 6.382978723404255e-07,
"loss": 1.4366,
"step": 6
},
{
"epoch": 0.01,
"grad_norm": 0.05640924349427223,
"learning_rate": 7.446808510638298e-07,
"loss": 1.3647,
"step": 7
},
{
"epoch": 0.01,
"grad_norm": 0.05529299005866051,
"learning_rate": 8.510638297872341e-07,
"loss": 1.3731,
"step": 8
},
{
"epoch": 0.01,
"grad_norm": 0.05329303443431854,
"learning_rate": 9.574468085106382e-07,
"loss": 1.4332,
"step": 9
},
{
"epoch": 0.01,
"grad_norm": 0.06044170632958412,
"learning_rate": 1.0638297872340427e-06,
"loss": 1.4435,
"step": 10
},
{
"epoch": 0.01,
"grad_norm": 0.05790272355079651,
"learning_rate": 1.170212765957447e-06,
"loss": 1.3862,
"step": 11
},
{
"epoch": 0.01,
"grad_norm": 0.06205734983086586,
"learning_rate": 1.276595744680851e-06,
"loss": 1.4129,
"step": 12
},
{
"epoch": 0.01,
"grad_norm": 0.059065915644168854,
"learning_rate": 1.3829787234042553e-06,
"loss": 1.3791,
"step": 13
},
{
"epoch": 0.01,
"grad_norm": 0.05682244524359703,
"learning_rate": 1.4893617021276596e-06,
"loss": 1.431,
"step": 14
},
{
"epoch": 0.02,
"grad_norm": 0.059100136160850525,
"learning_rate": 1.5957446808510639e-06,
"loss": 1.4507,
"step": 15
},
{
"epoch": 0.02,
"grad_norm": 0.059931278228759766,
"learning_rate": 1.7021276595744682e-06,
"loss": 1.3852,
"step": 16
},
{
"epoch": 0.02,
"grad_norm": 0.056699033826589584,
"learning_rate": 1.8085106382978722e-06,
"loss": 1.2552,
"step": 17
},
{
"epoch": 0.02,
"grad_norm": 0.05666350945830345,
"learning_rate": 1.9148936170212763e-06,
"loss": 1.5264,
"step": 18
},
{
"epoch": 0.02,
"grad_norm": 0.05692203715443611,
"learning_rate": 2.021276595744681e-06,
"loss": 1.4234,
"step": 19
},
{
"epoch": 0.02,
"grad_norm": 0.06246646121144295,
"learning_rate": 2.1276595744680853e-06,
"loss": 1.3362,
"step": 20
},
{
"epoch": 0.02,
"grad_norm": 0.056722771376371384,
"learning_rate": 2.2340425531914894e-06,
"loss": 1.3446,
"step": 21
},
{
"epoch": 0.02,
"grad_norm": 0.05918258801102638,
"learning_rate": 2.340425531914894e-06,
"loss": 1.4613,
"step": 22
},
{
"epoch": 0.02,
"grad_norm": 0.05318083241581917,
"learning_rate": 2.446808510638298e-06,
"loss": 1.4447,
"step": 23
},
{
"epoch": 0.03,
"grad_norm": 0.0610308013856411,
"learning_rate": 2.553191489361702e-06,
"loss": 1.3705,
"step": 24
},
{
"epoch": 0.03,
"grad_norm": 0.0567488893866539,
"learning_rate": 2.6595744680851065e-06,
"loss": 1.4548,
"step": 25
},
{
"epoch": 0.03,
"grad_norm": 0.06045207753777504,
"learning_rate": 2.7659574468085106e-06,
"loss": 1.389,
"step": 26
},
{
"epoch": 0.03,
"grad_norm": 0.05329489707946777,
"learning_rate": 2.872340425531915e-06,
"loss": 1.3524,
"step": 27
},
{
"epoch": 0.03,
"grad_norm": 0.07088607549667358,
"learning_rate": 2.978723404255319e-06,
"loss": 1.2576,
"step": 28
},
{
"epoch": 0.03,
"grad_norm": 0.07728853821754456,
"learning_rate": 3.0851063829787233e-06,
"loss": 1.4285,
"step": 29
},
{
"epoch": 0.03,
"grad_norm": 0.0550098791718483,
"learning_rate": 3.1914893617021277e-06,
"loss": 1.2073,
"step": 30
},
{
"epoch": 0.03,
"grad_norm": 0.0542742982506752,
"learning_rate": 3.2978723404255322e-06,
"loss": 1.354,
"step": 31
},
{
"epoch": 0.03,
"grad_norm": 0.06096404418349266,
"learning_rate": 3.4042553191489363e-06,
"loss": 1.2414,
"step": 32
},
{
"epoch": 0.04,
"grad_norm": 0.062333572655916214,
"learning_rate": 3.5106382978723404e-06,
"loss": 1.4806,
"step": 33
},
{
"epoch": 0.04,
"grad_norm": 0.05783746764063835,
"learning_rate": 3.6170212765957445e-06,
"loss": 1.4405,
"step": 34
},
{
"epoch": 0.04,
"grad_norm": 0.095641128718853,
"learning_rate": 3.723404255319149e-06,
"loss": 1.4705,
"step": 35
},
{
"epoch": 0.04,
"grad_norm": 0.14163658022880554,
"learning_rate": 3.829787234042553e-06,
"loss": 1.4584,
"step": 36
},
{
"epoch": 0.04,
"grad_norm": 0.05600857362151146,
"learning_rate": 3.936170212765957e-06,
"loss": 1.4418,
"step": 37
},
{
"epoch": 0.04,
"grad_norm": 0.05181832239031792,
"learning_rate": 4.042553191489362e-06,
"loss": 1.3885,
"step": 38
},
{
"epoch": 0.04,
"grad_norm": 0.09394165873527527,
"learning_rate": 4.148936170212766e-06,
"loss": 1.4062,
"step": 39
},
{
"epoch": 0.04,
"grad_norm": 0.0698341354727745,
"learning_rate": 4.255319148936171e-06,
"loss": 1.2806,
"step": 40
},
{
"epoch": 0.04,
"grad_norm": 0.055212073028087616,
"learning_rate": 4.361702127659574e-06,
"loss": 1.4028,
"step": 41
},
{
"epoch": 0.04,
"grad_norm": 0.05782864987850189,
"learning_rate": 4.468085106382979e-06,
"loss": 1.4935,
"step": 42
},
{
"epoch": 0.05,
"grad_norm": 0.05666356906294823,
"learning_rate": 4.574468085106383e-06,
"loss": 1.435,
"step": 43
},
{
"epoch": 0.05,
"grad_norm": 0.05617048591375351,
"learning_rate": 4.680851063829788e-06,
"loss": 1.2891,
"step": 44
},
{
"epoch": 0.05,
"grad_norm": 0.05038372054696083,
"learning_rate": 4.787234042553192e-06,
"loss": 1.4356,
"step": 45
},
{
"epoch": 0.05,
"grad_norm": 0.049356039613485336,
"learning_rate": 4.893617021276596e-06,
"loss": 1.3012,
"step": 46
},
{
"epoch": 0.05,
"grad_norm": 0.05181947723031044,
"learning_rate": 4.9999999999999996e-06,
"loss": 1.3487,
"step": 47
},
{
"epoch": 0.05,
"grad_norm": 0.05446089804172516,
"learning_rate": 5.106382978723404e-06,
"loss": 1.4829,
"step": 48
},
{
"epoch": 0.05,
"grad_norm": 0.05051864683628082,
"learning_rate": 5.2127659574468086e-06,
"loss": 1.3265,
"step": 49
},
{
"epoch": 0.05,
"grad_norm": 0.05824195221066475,
"learning_rate": 5.319148936170213e-06,
"loss": 1.3334,
"step": 50
},
{
"epoch": 0.05,
"grad_norm": 0.05690138414502144,
"learning_rate": 5.4255319148936176e-06,
"loss": 1.4572,
"step": 51
},
{
"epoch": 0.06,
"grad_norm": 0.09080282598733902,
"learning_rate": 5.531914893617021e-06,
"loss": 1.4261,
"step": 52
},
{
"epoch": 0.06,
"grad_norm": 0.04653813689947128,
"learning_rate": 5.638297872340426e-06,
"loss": 1.262,
"step": 53
},
{
"epoch": 0.06,
"grad_norm": 0.044598598033189774,
"learning_rate": 5.74468085106383e-06,
"loss": 1.3706,
"step": 54
},
{
"epoch": 0.06,
"grad_norm": 0.04525616019964218,
"learning_rate": 5.851063829787235e-06,
"loss": 1.2848,
"step": 55
},
{
"epoch": 0.06,
"grad_norm": 0.05448417738080025,
"learning_rate": 5.957446808510638e-06,
"loss": 1.3936,
"step": 56
},
{
"epoch": 0.06,
"grad_norm": 0.04820968583226204,
"learning_rate": 6.063829787234042e-06,
"loss": 1.3226,
"step": 57
},
{
"epoch": 0.06,
"grad_norm": 0.052025895565748215,
"learning_rate": 6.1702127659574465e-06,
"loss": 1.3372,
"step": 58
},
{
"epoch": 0.06,
"grad_norm": 0.046200189739465714,
"learning_rate": 6.276595744680851e-06,
"loss": 1.4202,
"step": 59
},
{
"epoch": 0.06,
"grad_norm": 0.05124253034591675,
"learning_rate": 6.3829787234042555e-06,
"loss": 1.3549,
"step": 60
},
{
"epoch": 0.07,
"grad_norm": 0.04417189210653305,
"learning_rate": 6.48936170212766e-06,
"loss": 1.4315,
"step": 61
},
{
"epoch": 0.07,
"grad_norm": 0.05024256929755211,
"learning_rate": 6.5957446808510645e-06,
"loss": 1.3213,
"step": 62
},
{
"epoch": 0.07,
"grad_norm": 0.04305976629257202,
"learning_rate": 6.702127659574468e-06,
"loss": 1.1938,
"step": 63
},
{
"epoch": 0.07,
"grad_norm": 0.04368586093187332,
"learning_rate": 6.808510638297873e-06,
"loss": 1.2782,
"step": 64
},
{
"epoch": 0.07,
"grad_norm": 0.04419870302081108,
"learning_rate": 6.914893617021277e-06,
"loss": 1.2692,
"step": 65
},
{
"epoch": 0.07,
"grad_norm": 0.04923289269208908,
"learning_rate": 7.021276595744681e-06,
"loss": 1.4176,
"step": 66
},
{
"epoch": 0.07,
"grad_norm": 0.05031334236264229,
"learning_rate": 7.127659574468085e-06,
"loss": 1.4667,
"step": 67
},
{
"epoch": 0.07,
"grad_norm": 0.05887551233172417,
"learning_rate": 7.234042553191489e-06,
"loss": 1.365,
"step": 68
},
{
"epoch": 0.07,
"grad_norm": 0.04077250510454178,
"learning_rate": 7.3404255319148934e-06,
"loss": 1.263,
"step": 69
},
{
"epoch": 0.07,
"grad_norm": 0.046896953135728836,
"learning_rate": 7.446808510638298e-06,
"loss": 1.2821,
"step": 70
},
{
"epoch": 0.08,
"grad_norm": 0.045666612684726715,
"learning_rate": 7.553191489361702e-06,
"loss": 1.4069,
"step": 71
},
{
"epoch": 0.08,
"grad_norm": 0.07492675632238388,
"learning_rate": 7.659574468085105e-06,
"loss": 1.2269,
"step": 72
},
{
"epoch": 0.08,
"grad_norm": 0.05555059388279915,
"learning_rate": 7.76595744680851e-06,
"loss": 1.4084,
"step": 73
},
{
"epoch": 0.08,
"grad_norm": 0.04331756755709648,
"learning_rate": 7.872340425531914e-06,
"loss": 1.3114,
"step": 74
},
{
"epoch": 0.08,
"grad_norm": 0.0964915007352829,
"learning_rate": 7.978723404255319e-06,
"loss": 1.3633,
"step": 75
},
{
"epoch": 0.08,
"grad_norm": 0.046700503677129745,
"learning_rate": 8.085106382978723e-06,
"loss": 1.3688,
"step": 76
},
{
"epoch": 0.08,
"grad_norm": 0.04673081636428833,
"learning_rate": 8.191489361702128e-06,
"loss": 1.2467,
"step": 77
},
{
"epoch": 0.08,
"grad_norm": 0.04381676763296127,
"learning_rate": 8.297872340425532e-06,
"loss": 1.2566,
"step": 78
},
{
"epoch": 0.08,
"grad_norm": 0.05993415787816048,
"learning_rate": 8.404255319148937e-06,
"loss": 1.3618,
"step": 79
},
{
"epoch": 0.09,
"grad_norm": 0.06440860033035278,
"learning_rate": 8.510638297872341e-06,
"loss": 1.5144,
"step": 80
},
{
"epoch": 0.09,
"grad_norm": 0.040967535227537155,
"learning_rate": 8.617021276595746e-06,
"loss": 1.2927,
"step": 81
},
{
"epoch": 0.09,
"grad_norm": 0.04743165895342827,
"learning_rate": 8.723404255319149e-06,
"loss": 1.3762,
"step": 82
},
{
"epoch": 0.09,
"grad_norm": 0.04562428966164589,
"learning_rate": 8.829787234042553e-06,
"loss": 1.377,
"step": 83
},
{
"epoch": 0.09,
"grad_norm": 0.051328569650650024,
"learning_rate": 8.936170212765958e-06,
"loss": 1.2928,
"step": 84
},
{
"epoch": 0.09,
"grad_norm": 0.05074065551161766,
"learning_rate": 9.042553191489362e-06,
"loss": 1.2354,
"step": 85
},
{
"epoch": 0.09,
"grad_norm": 0.04352608695626259,
"learning_rate": 9.148936170212767e-06,
"loss": 1.4465,
"step": 86
},
{
"epoch": 0.09,
"grad_norm": 0.04803245887160301,
"learning_rate": 9.255319148936171e-06,
"loss": 1.3891,
"step": 87
},
{
"epoch": 0.09,
"grad_norm": 0.08481187373399734,
"learning_rate": 9.361702127659576e-06,
"loss": 1.1974,
"step": 88
},
{
"epoch": 0.09,
"grad_norm": 0.0441882386803627,
"learning_rate": 9.46808510638298e-06,
"loss": 1.3792,
"step": 89
},
{
"epoch": 0.1,
"grad_norm": 0.04781670495867729,
"learning_rate": 9.574468085106385e-06,
"loss": 1.3,
"step": 90
},
{
"epoch": 0.1,
"grad_norm": 0.04570171609520912,
"learning_rate": 9.680851063829787e-06,
"loss": 1.3352,
"step": 91
},
{
"epoch": 0.1,
"grad_norm": 0.041407499462366104,
"learning_rate": 9.787234042553192e-06,
"loss": 1.2453,
"step": 92
},
{
"epoch": 0.1,
"grad_norm": 0.06641850620508194,
"learning_rate": 9.893617021276595e-06,
"loss": 1.2,
"step": 93
},
{
"epoch": 0.1,
"grad_norm": 0.04800207167863846,
"learning_rate": 9.999999999999999e-06,
"loss": 1.3938,
"step": 94
},
{
"epoch": 0.1,
"grad_norm": 0.04421789571642876,
"learning_rate": 1.0106382978723404e-05,
"loss": 1.3387,
"step": 95
},
{
"epoch": 0.1,
"grad_norm": 0.04799410328269005,
"learning_rate": 1.0212765957446808e-05,
"loss": 1.3857,
"step": 96
},
{
"epoch": 0.1,
"grad_norm": 0.0477750189602375,
"learning_rate": 1.0319148936170213e-05,
"loss": 1.3585,
"step": 97
},
{
"epoch": 0.1,
"grad_norm": 0.042658887803554535,
"learning_rate": 1.0425531914893617e-05,
"loss": 1.2802,
"step": 98
},
{
"epoch": 0.11,
"grad_norm": 0.046312738209962845,
"learning_rate": 1.0531914893617022e-05,
"loss": 1.3663,
"step": 99
},
{
"epoch": 0.11,
"grad_norm": 0.04404019936919212,
"learning_rate": 1.0638297872340426e-05,
"loss": 1.3354,
"step": 100
},
{
"epoch": 0.11,
"grad_norm": 0.055406101047992706,
"learning_rate": 1.074468085106383e-05,
"loss": 1.2967,
"step": 101
},
{
"epoch": 0.11,
"grad_norm": 0.045367974787950516,
"learning_rate": 1.0851063829787235e-05,
"loss": 1.2914,
"step": 102
},
{
"epoch": 0.11,
"grad_norm": 0.049990568310022354,
"learning_rate": 1.095744680851064e-05,
"loss": 1.1151,
"step": 103
},
{
"epoch": 0.11,
"grad_norm": 0.04687273129820824,
"learning_rate": 1.1063829787234042e-05,
"loss": 1.2756,
"step": 104
},
{
"epoch": 0.11,
"grad_norm": 0.04907204583287239,
"learning_rate": 1.1170212765957447e-05,
"loss": 1.3726,
"step": 105
},
{
"epoch": 0.11,
"grad_norm": 0.057853613048791885,
"learning_rate": 1.1276595744680851e-05,
"loss": 1.2599,
"step": 106
},
{
"epoch": 0.11,
"grad_norm": 0.0487029105424881,
"learning_rate": 1.1382978723404256e-05,
"loss": 1.2803,
"step": 107
},
{
"epoch": 0.12,
"grad_norm": 0.049311500042676926,
"learning_rate": 1.148936170212766e-05,
"loss": 1.3391,
"step": 108
},
{
"epoch": 0.12,
"grad_norm": 0.04935484379529953,
"learning_rate": 1.1595744680851065e-05,
"loss": 1.3807,
"step": 109
},
{
"epoch": 0.12,
"grad_norm": 0.04239289090037346,
"learning_rate": 1.170212765957447e-05,
"loss": 1.2443,
"step": 110
},
{
"epoch": 0.12,
"grad_norm": 0.076308473944664,
"learning_rate": 1.1808510638297872e-05,
"loss": 1.3879,
"step": 111
},
{
"epoch": 0.12,
"grad_norm": 0.048640284687280655,
"learning_rate": 1.1914893617021277e-05,
"loss": 1.3674,
"step": 112
},
{
"epoch": 0.12,
"grad_norm": 0.04677354916930199,
"learning_rate": 1.2021276595744681e-05,
"loss": 1.3282,
"step": 113
},
{
"epoch": 0.12,
"grad_norm": 0.04633704200387001,
"learning_rate": 1.2127659574468084e-05,
"loss": 1.3136,
"step": 114
},
{
"epoch": 0.12,
"grad_norm": 0.05674600228667259,
"learning_rate": 1.2234042553191489e-05,
"loss": 1.4119,
"step": 115
},
{
"epoch": 0.12,
"grad_norm": 0.052234843373298645,
"learning_rate": 1.2340425531914893e-05,
"loss": 1.3089,
"step": 116
},
{
"epoch": 0.12,
"grad_norm": 0.043746188282966614,
"learning_rate": 1.2446808510638298e-05,
"loss": 1.3544,
"step": 117
},
{
"epoch": 0.13,
"grad_norm": 0.044916700571775436,
"learning_rate": 1.2553191489361702e-05,
"loss": 1.2777,
"step": 118
},
{
"epoch": 0.13,
"grad_norm": 0.05204184353351593,
"learning_rate": 1.2659574468085106e-05,
"loss": 1.3396,
"step": 119
},
{
"epoch": 0.13,
"grad_norm": 0.04302500560879707,
"learning_rate": 1.2765957446808511e-05,
"loss": 1.3921,
"step": 120
},
{
"epoch": 0.13,
"grad_norm": 0.06135503947734833,
"learning_rate": 1.2872340425531915e-05,
"loss": 1.2857,
"step": 121
},
{
"epoch": 0.13,
"grad_norm": 0.05298823118209839,
"learning_rate": 1.297872340425532e-05,
"loss": 1.3892,
"step": 122
},
{
"epoch": 0.13,
"grad_norm": 0.09373245388269424,
"learning_rate": 1.3085106382978724e-05,
"loss": 1.3407,
"step": 123
},
{
"epoch": 0.13,
"grad_norm": 0.0466972291469574,
"learning_rate": 1.3191489361702129e-05,
"loss": 1.3431,
"step": 124
},
{
"epoch": 0.13,
"grad_norm": 0.04748416692018509,
"learning_rate": 1.3297872340425532e-05,
"loss": 1.3532,
"step": 125
},
{
"epoch": 0.13,
"grad_norm": 0.04710518568754196,
"learning_rate": 1.3404255319148936e-05,
"loss": 1.325,
"step": 126
},
{
"epoch": 0.14,
"grad_norm": 0.04562179371714592,
"learning_rate": 1.351063829787234e-05,
"loss": 1.3433,
"step": 127
},
{
"epoch": 0.14,
"grad_norm": 0.0475505031645298,
"learning_rate": 1.3617021276595745e-05,
"loss": 1.3037,
"step": 128
},
{
"epoch": 0.14,
"grad_norm": 0.045639630407094955,
"learning_rate": 1.372340425531915e-05,
"loss": 1.3,
"step": 129
},
{
"epoch": 0.14,
"grad_norm": 0.04853609576821327,
"learning_rate": 1.3829787234042554e-05,
"loss": 1.4097,
"step": 130
},
{
"epoch": 0.14,
"grad_norm": 0.04848809540271759,
"learning_rate": 1.3936170212765957e-05,
"loss": 1.1995,
"step": 131
},
{
"epoch": 0.14,
"grad_norm": 0.0436336025595665,
"learning_rate": 1.4042553191489362e-05,
"loss": 1.0676,
"step": 132
},
{
"epoch": 0.14,
"grad_norm": 0.05467860400676727,
"learning_rate": 1.4148936170212766e-05,
"loss": 1.428,
"step": 133
},
{
"epoch": 0.14,
"grad_norm": 0.05164318531751633,
"learning_rate": 1.425531914893617e-05,
"loss": 1.3411,
"step": 134
},
{
"epoch": 0.14,
"grad_norm": 0.04806946590542793,
"learning_rate": 1.4361702127659575e-05,
"loss": 1.2856,
"step": 135
},
{
"epoch": 0.14,
"grad_norm": 0.04568091407418251,
"learning_rate": 1.4468085106382978e-05,
"loss": 1.2643,
"step": 136
},
{
"epoch": 0.15,
"grad_norm": 0.051061000674963,
"learning_rate": 1.4574468085106382e-05,
"loss": 1.2959,
"step": 137
},
{
"epoch": 0.15,
"grad_norm": 0.043656568974256516,
"learning_rate": 1.4680851063829787e-05,
"loss": 1.2584,
"step": 138
},
{
"epoch": 0.15,
"grad_norm": 0.04155721887946129,
"learning_rate": 1.4787234042553191e-05,
"loss": 1.3342,
"step": 139
},
{
"epoch": 0.15,
"grad_norm": 0.05964464321732521,
"learning_rate": 1.4893617021276596e-05,
"loss": 1.284,
"step": 140
},
{
"epoch": 0.15,
"grad_norm": 0.04124300926923752,
"learning_rate": 1.5e-05,
"loss": 1.2437,
"step": 141
},
{
"epoch": 0.15,
"grad_norm": 0.055146049708127975,
"learning_rate": 1.5106382978723403e-05,
"loss": 1.5472,
"step": 142
},
{
"epoch": 0.15,
"grad_norm": 0.0521329827606678,
"learning_rate": 1.521276595744681e-05,
"loss": 1.2676,
"step": 143
},
{
"epoch": 0.15,
"grad_norm": 0.046129606664180756,
"learning_rate": 1.531914893617021e-05,
"loss": 1.3367,
"step": 144
},
{
"epoch": 0.15,
"grad_norm": 0.044848017394542694,
"learning_rate": 1.5425531914893617e-05,
"loss": 1.352,
"step": 145
},
{
"epoch": 0.16,
"grad_norm": 0.05782546475529671,
"learning_rate": 1.553191489361702e-05,
"loss": 1.1799,
"step": 146
},
{
"epoch": 0.16,
"grad_norm": 0.05274609848856926,
"learning_rate": 1.5638297872340426e-05,
"loss": 1.4095,
"step": 147
},
{
"epoch": 0.16,
"grad_norm": 0.047185566276311874,
"learning_rate": 1.574468085106383e-05,
"loss": 1.2185,
"step": 148
},
{
"epoch": 0.16,
"grad_norm": 0.048585060983896255,
"learning_rate": 1.5851063829787235e-05,
"loss": 1.3097,
"step": 149
},
{
"epoch": 0.16,
"grad_norm": 0.05114852264523506,
"learning_rate": 1.5957446808510637e-05,
"loss": 1.3263,
"step": 150
},
{
"epoch": 0.16,
"grad_norm": 0.04557744786143303,
"learning_rate": 1.6063829787234044e-05,
"loss": 1.4026,
"step": 151
},
{
"epoch": 0.16,
"grad_norm": 0.04844217747449875,
"learning_rate": 1.6170212765957446e-05,
"loss": 1.2965,
"step": 152
},
{
"epoch": 0.16,
"grad_norm": 0.06304433941841125,
"learning_rate": 1.627659574468085e-05,
"loss": 1.3728,
"step": 153
},
{
"epoch": 0.16,
"grad_norm": 0.11255922168493271,
"learning_rate": 1.6382978723404255e-05,
"loss": 1.2606,
"step": 154
},
{
"epoch": 0.17,
"grad_norm": 0.056885555386543274,
"learning_rate": 1.6489361702127658e-05,
"loss": 1.3312,
"step": 155
},
{
"epoch": 0.17,
"grad_norm": 0.07175802439451218,
"learning_rate": 1.6595744680851064e-05,
"loss": 1.3512,
"step": 156
},
{
"epoch": 0.17,
"grad_norm": 0.049801841378211975,
"learning_rate": 1.6702127659574467e-05,
"loss": 1.3083,
"step": 157
},
{
"epoch": 0.17,
"grad_norm": 0.05776335299015045,
"learning_rate": 1.6808510638297873e-05,
"loss": 1.3987,
"step": 158
},
{
"epoch": 0.17,
"grad_norm": 0.051518019288778305,
"learning_rate": 1.6914893617021276e-05,
"loss": 1.4337,
"step": 159
},
{
"epoch": 0.17,
"grad_norm": 0.0506494864821434,
"learning_rate": 1.7021276595744682e-05,
"loss": 1.3609,
"step": 160
},
{
"epoch": 0.17,
"grad_norm": 0.04913006350398064,
"learning_rate": 1.7127659574468085e-05,
"loss": 1.3788,
"step": 161
},
{
"epoch": 0.17,
"grad_norm": 0.0492931567132473,
"learning_rate": 1.723404255319149e-05,
"loss": 1.2924,
"step": 162
},
{
"epoch": 0.17,
"grad_norm": 0.05355142429471016,
"learning_rate": 1.7340425531914894e-05,
"loss": 1.3872,
"step": 163
},
{
"epoch": 0.17,
"grad_norm": 0.0524597205221653,
"learning_rate": 1.7446808510638297e-05,
"loss": 1.4147,
"step": 164
},
{
"epoch": 0.18,
"grad_norm": 0.049367666244506836,
"learning_rate": 1.7553191489361703e-05,
"loss": 1.3175,
"step": 165
},
{
"epoch": 0.18,
"grad_norm": 0.05368790030479431,
"learning_rate": 1.7659574468085106e-05,
"loss": 1.3939,
"step": 166
},
{
"epoch": 0.18,
"grad_norm": 0.047138139605522156,
"learning_rate": 1.7765957446808512e-05,
"loss": 1.3383,
"step": 167
},
{
"epoch": 0.18,
"grad_norm": 0.05449504777789116,
"learning_rate": 1.7872340425531915e-05,
"loss": 1.3612,
"step": 168
},
{
"epoch": 0.18,
"grad_norm": 0.0647950991988182,
"learning_rate": 1.797872340425532e-05,
"loss": 1.2121,
"step": 169
},
{
"epoch": 0.18,
"grad_norm": 0.05256028473377228,
"learning_rate": 1.8085106382978724e-05,
"loss": 1.4302,
"step": 170
},
{
"epoch": 0.18,
"grad_norm": 0.052294984459877014,
"learning_rate": 1.819148936170213e-05,
"loss": 1.3529,
"step": 171
},
{
"epoch": 0.18,
"grad_norm": 0.043925654143095016,
"learning_rate": 1.8297872340425533e-05,
"loss": 1.1719,
"step": 172
},
{
"epoch": 0.18,
"grad_norm": 0.04635035991668701,
"learning_rate": 1.840425531914894e-05,
"loss": 1.3229,
"step": 173
},
{
"epoch": 0.19,
"grad_norm": 0.07445945590734482,
"learning_rate": 1.8510638297872342e-05,
"loss": 1.2615,
"step": 174
},
{
"epoch": 0.19,
"grad_norm": 0.050731562077999115,
"learning_rate": 1.8617021276595745e-05,
"loss": 1.3552,
"step": 175
},
{
"epoch": 0.19,
"grad_norm": 0.04691868647933006,
"learning_rate": 1.872340425531915e-05,
"loss": 1.1959,
"step": 176
},
{
"epoch": 0.19,
"grad_norm": 0.047465287148952484,
"learning_rate": 1.8829787234042554e-05,
"loss": 1.3113,
"step": 177
},
{
"epoch": 0.19,
"grad_norm": 0.05117448791861534,
"learning_rate": 1.893617021276596e-05,
"loss": 1.3717,
"step": 178
},
{
"epoch": 0.19,
"grad_norm": 0.0473572202026844,
"learning_rate": 1.9042553191489363e-05,
"loss": 1.3947,
"step": 179
},
{
"epoch": 0.19,
"grad_norm": 0.05099477618932724,
"learning_rate": 1.914893617021277e-05,
"loss": 1.4049,
"step": 180
},
{
"epoch": 0.19,
"grad_norm": 0.04812943935394287,
"learning_rate": 1.9255319148936172e-05,
"loss": 1.3586,
"step": 181
},
{
"epoch": 0.19,
"grad_norm": 0.05050328001379967,
"learning_rate": 1.9361702127659575e-05,
"loss": 1.2463,
"step": 182
},
{
"epoch": 0.2,
"grad_norm": 0.05110020935535431,
"learning_rate": 1.9468085106382977e-05,
"loss": 1.2487,
"step": 183
},
{
"epoch": 0.2,
"grad_norm": 0.05224141478538513,
"learning_rate": 1.9574468085106384e-05,
"loss": 1.3602,
"step": 184
},
{
"epoch": 0.2,
"grad_norm": 0.05101168900728226,
"learning_rate": 1.9680851063829786e-05,
"loss": 1.429,
"step": 185
},
{
"epoch": 0.2,
"grad_norm": 0.09453223645687103,
"learning_rate": 1.978723404255319e-05,
"loss": 1.2619,
"step": 186
},
{
"epoch": 0.2,
"grad_norm": 0.060608815401792526,
"learning_rate": 1.9893617021276595e-05,
"loss": 1.3107,
"step": 187
},
{
"epoch": 0.2,
"grad_norm": 0.044588133692741394,
"learning_rate": 1.9999999999999998e-05,
"loss": 1.3813,
"step": 188
},
{
"epoch": 0.2,
"grad_norm": 0.0486648753285408,
"learning_rate": 2.0106382978723404e-05,
"loss": 1.3399,
"step": 189
},
{
"epoch": 0.2,
"grad_norm": 0.052965641021728516,
"learning_rate": 2.0212765957446807e-05,
"loss": 1.3557,
"step": 190
},
{
"epoch": 0.2,
"grad_norm": 0.059409502893686295,
"learning_rate": 2.0319148936170213e-05,
"loss": 1.3735,
"step": 191
},
{
"epoch": 0.2,
"grad_norm": 0.05749582126736641,
"learning_rate": 2.0425531914893616e-05,
"loss": 1.2828,
"step": 192
},
{
"epoch": 0.21,
"grad_norm": 0.04642318934202194,
"learning_rate": 2.0531914893617022e-05,
"loss": 1.3082,
"step": 193
},
{
"epoch": 0.21,
"grad_norm": 0.04926323518157005,
"learning_rate": 2.0638297872340425e-05,
"loss": 1.2715,
"step": 194
},
{
"epoch": 0.21,
"grad_norm": 0.05380849912762642,
"learning_rate": 2.074468085106383e-05,
"loss": 1.4502,
"step": 195
},
{
"epoch": 0.21,
"grad_norm": 0.0523720309138298,
"learning_rate": 2.0851063829787234e-05,
"loss": 1.4332,
"step": 196
},
{
"epoch": 0.21,
"grad_norm": 0.04891609400510788,
"learning_rate": 2.0957446808510637e-05,
"loss": 1.2524,
"step": 197
},
{
"epoch": 0.21,
"grad_norm": 0.06308029592037201,
"learning_rate": 2.1063829787234043e-05,
"loss": 1.4187,
"step": 198
},
{
"epoch": 0.21,
"grad_norm": 0.05856901407241821,
"learning_rate": 2.1170212765957446e-05,
"loss": 1.2604,
"step": 199
},
{
"epoch": 0.21,
"grad_norm": 0.048220206052064896,
"learning_rate": 2.1276595744680852e-05,
"loss": 1.3664,
"step": 200
},
{
"epoch": 0.21,
"grad_norm": 0.048834629356861115,
"learning_rate": 2.1382978723404255e-05,
"loss": 1.1301,
"step": 201
},
{
"epoch": 0.22,
"grad_norm": 0.05079879239201546,
"learning_rate": 2.148936170212766e-05,
"loss": 1.3391,
"step": 202
},
{
"epoch": 0.22,
"grad_norm": 0.04985832795500755,
"learning_rate": 2.1595744680851064e-05,
"loss": 1.2989,
"step": 203
},
{
"epoch": 0.22,
"grad_norm": 0.04496655985713005,
"learning_rate": 2.170212765957447e-05,
"loss": 1.2832,
"step": 204
},
{
"epoch": 0.22,
"grad_norm": 0.048497602343559265,
"learning_rate": 2.1808510638297873e-05,
"loss": 1.2237,
"step": 205
},
{
"epoch": 0.22,
"grad_norm": 0.050113365054130554,
"learning_rate": 2.191489361702128e-05,
"loss": 1.3618,
"step": 206
},
{
"epoch": 0.22,
"grad_norm": 0.04853734374046326,
"learning_rate": 2.2021276595744682e-05,
"loss": 1.2066,
"step": 207
},
{
"epoch": 0.22,
"grad_norm": 0.047260671854019165,
"learning_rate": 2.2127659574468085e-05,
"loss": 1.2672,
"step": 208
},
{
"epoch": 0.22,
"grad_norm": 0.0458863228559494,
"learning_rate": 2.223404255319149e-05,
"loss": 1.2686,
"step": 209
},
{
"epoch": 0.22,
"grad_norm": 0.0532243587076664,
"learning_rate": 2.2340425531914894e-05,
"loss": 1.4068,
"step": 210
},
{
"epoch": 0.22,
"grad_norm": 0.05112733691930771,
"learning_rate": 2.24468085106383e-05,
"loss": 1.4665,
"step": 211
},
{
"epoch": 0.23,
"grad_norm": 0.05383682623505592,
"learning_rate": 2.2553191489361703e-05,
"loss": 1.3952,
"step": 212
},
{
"epoch": 0.23,
"grad_norm": 0.05613754689693451,
"learning_rate": 2.265957446808511e-05,
"loss": 1.2767,
"step": 213
},
{
"epoch": 0.23,
"grad_norm": 0.04550475999712944,
"learning_rate": 2.2765957446808512e-05,
"loss": 1.237,
"step": 214
},
{
"epoch": 0.23,
"grad_norm": 0.04769672453403473,
"learning_rate": 2.2872340425531918e-05,
"loss": 1.4378,
"step": 215
},
{
"epoch": 0.23,
"grad_norm": 1.0261180400848389,
"learning_rate": 2.297872340425532e-05,
"loss": 1.2374,
"step": 216
},
{
"epoch": 0.23,
"grad_norm": 0.04573334380984306,
"learning_rate": 2.3085106382978724e-05,
"loss": 1.2826,
"step": 217
},
{
"epoch": 0.23,
"grad_norm": 0.04903516545891762,
"learning_rate": 2.319148936170213e-05,
"loss": 1.313,
"step": 218
},
{
"epoch": 0.23,
"grad_norm": 0.049122072756290436,
"learning_rate": 2.3297872340425533e-05,
"loss": 1.4223,
"step": 219
},
{
"epoch": 0.23,
"grad_norm": 0.04724901542067528,
"learning_rate": 2.340425531914894e-05,
"loss": 1.2758,
"step": 220
},
{
"epoch": 0.24,
"grad_norm": 0.04624621570110321,
"learning_rate": 2.351063829787234e-05,
"loss": 1.2517,
"step": 221
},
{
"epoch": 0.24,
"grad_norm": 0.056727565824985504,
"learning_rate": 2.3617021276595744e-05,
"loss": 1.3039,
"step": 222
},
{
"epoch": 0.24,
"grad_norm": 0.05122361332178116,
"learning_rate": 2.3723404255319147e-05,
"loss": 1.3883,
"step": 223
},
{
"epoch": 0.24,
"grad_norm": 0.05158834904432297,
"learning_rate": 2.3829787234042553e-05,
"loss": 1.4112,
"step": 224
},
{
"epoch": 0.24,
"grad_norm": 0.07292402535676956,
"learning_rate": 2.3936170212765956e-05,
"loss": 1.3428,
"step": 225
},
{
"epoch": 0.24,
"grad_norm": 0.05974160134792328,
"learning_rate": 2.4042553191489362e-05,
"loss": 1.201,
"step": 226
},
{
"epoch": 0.24,
"grad_norm": 0.05300895869731903,
"learning_rate": 2.4148936170212765e-05,
"loss": 1.3143,
"step": 227
},
{
"epoch": 0.24,
"grad_norm": 0.045893993228673935,
"learning_rate": 2.4255319148936168e-05,
"loss": 1.3017,
"step": 228
},
{
"epoch": 0.24,
"grad_norm": 0.051792554557323456,
"learning_rate": 2.4361702127659574e-05,
"loss": 1.382,
"step": 229
},
{
"epoch": 0.25,
"grad_norm": 0.10382523387670517,
"learning_rate": 2.4468085106382977e-05,
"loss": 1.2969,
"step": 230
},
{
"epoch": 0.25,
"grad_norm": 0.052977245301008224,
"learning_rate": 2.4574468085106383e-05,
"loss": 1.3849,
"step": 231
},
{
"epoch": 0.25,
"grad_norm": 0.04847870394587517,
"learning_rate": 2.4680851063829786e-05,
"loss": 1.2976,
"step": 232
},
{
"epoch": 0.25,
"grad_norm": 0.07622654736042023,
"learning_rate": 2.4787234042553192e-05,
"loss": 1.4746,
"step": 233
},
{
"epoch": 0.25,
"grad_norm": 0.051023781299591064,
"learning_rate": 2.4893617021276595e-05,
"loss": 1.3735,
"step": 234
},
{
"epoch": 0.25,
"grad_norm": 0.0486944243311882,
"learning_rate": 2.5e-05,
"loss": 1.3675,
"step": 235
},
{
"epoch": 0.25,
"grad_norm": 0.048643093556165695,
"learning_rate": 2.5106382978723404e-05,
"loss": 1.2665,
"step": 236
},
{
"epoch": 0.25,
"grad_norm": 0.051694802939891815,
"learning_rate": 2.521276595744681e-05,
"loss": 1.4357,
"step": 237
},
{
"epoch": 0.25,
"grad_norm": 0.051911093294620514,
"learning_rate": 2.5319148936170213e-05,
"loss": 1.303,
"step": 238
},
{
"epoch": 0.25,
"grad_norm": 0.044154971837997437,
"learning_rate": 2.5425531914893616e-05,
"loss": 1.207,
"step": 239
},
{
"epoch": 0.26,
"grad_norm": 0.048809949308633804,
"learning_rate": 2.5531914893617022e-05,
"loss": 1.2401,
"step": 240
},
{
"epoch": 0.26,
"grad_norm": 0.040964074432849884,
"learning_rate": 2.5638297872340425e-05,
"loss": 1.1855,
"step": 241
},
{
"epoch": 0.26,
"grad_norm": 0.0428529791533947,
"learning_rate": 2.574468085106383e-05,
"loss": 1.2966,
"step": 242
},
{
"epoch": 0.26,
"grad_norm": 0.04508029296994209,
"learning_rate": 2.5851063829787234e-05,
"loss": 1.2393,
"step": 243
},
{
"epoch": 0.26,
"grad_norm": 0.0446944423019886,
"learning_rate": 2.595744680851064e-05,
"loss": 1.249,
"step": 244
},
{
"epoch": 0.26,
"grad_norm": 0.0411514937877655,
"learning_rate": 2.6063829787234043e-05,
"loss": 1.2409,
"step": 245
},
{
"epoch": 0.26,
"grad_norm": 0.04906069114804268,
"learning_rate": 2.617021276595745e-05,
"loss": 1.4407,
"step": 246
},
{
"epoch": 0.26,
"grad_norm": 0.043277911841869354,
"learning_rate": 2.6276595744680852e-05,
"loss": 1.2552,
"step": 247
},
{
"epoch": 0.26,
"grad_norm": 0.045710548758506775,
"learning_rate": 2.6382978723404258e-05,
"loss": 1.3415,
"step": 248
},
{
"epoch": 0.27,
"grad_norm": 0.04261719062924385,
"learning_rate": 2.648936170212766e-05,
"loss": 1.3518,
"step": 249
},
{
"epoch": 0.27,
"grad_norm": 0.044100042432546616,
"learning_rate": 2.6595744680851064e-05,
"loss": 1.3098,
"step": 250
},
{
"epoch": 0.27,
"grad_norm": 0.04507607966661453,
"learning_rate": 2.670212765957447e-05,
"loss": 1.1852,
"step": 251
},
{
"epoch": 0.27,
"grad_norm": 0.04363155737519264,
"learning_rate": 2.6808510638297873e-05,
"loss": 1.2703,
"step": 252
},
{
"epoch": 0.27,
"grad_norm": 0.07022202759981155,
"learning_rate": 2.691489361702128e-05,
"loss": 1.175,
"step": 253
},
{
"epoch": 0.27,
"grad_norm": 0.050820302218198776,
"learning_rate": 2.702127659574468e-05,
"loss": 1.3806,
"step": 254
},
{
"epoch": 0.27,
"grad_norm": 0.04110841080546379,
"learning_rate": 2.7127659574468088e-05,
"loss": 1.1963,
"step": 255
},
{
"epoch": 0.27,
"grad_norm": 0.04145009443163872,
"learning_rate": 2.723404255319149e-05,
"loss": 1.2588,
"step": 256
},
{
"epoch": 0.27,
"grad_norm": 0.04492926597595215,
"learning_rate": 2.7340425531914897e-05,
"loss": 1.1934,
"step": 257
},
{
"epoch": 0.27,
"grad_norm": 0.05001668259501457,
"learning_rate": 2.74468085106383e-05,
"loss": 1.3063,
"step": 258
},
{
"epoch": 0.28,
"grad_norm": 0.05600470304489136,
"learning_rate": 2.7553191489361706e-05,
"loss": 1.303,
"step": 259
},
{
"epoch": 0.28,
"grad_norm": 0.04361084848642349,
"learning_rate": 2.765957446808511e-05,
"loss": 1.2889,
"step": 260
},
{
"epoch": 0.28,
"grad_norm": 0.043035976588726044,
"learning_rate": 2.776595744680851e-05,
"loss": 1.2992,
"step": 261
},
{
"epoch": 0.28,
"grad_norm": 0.044470012187957764,
"learning_rate": 2.7872340425531914e-05,
"loss": 1.4082,
"step": 262
},
{
"epoch": 0.28,
"grad_norm": 0.043607208877801895,
"learning_rate": 2.7978723404255317e-05,
"loss": 1.3521,
"step": 263
},
{
"epoch": 0.28,
"grad_norm": 0.047062911093235016,
"learning_rate": 2.8085106382978723e-05,
"loss": 1.2293,
"step": 264
},
{
"epoch": 0.28,
"grad_norm": 0.051996584981679916,
"learning_rate": 2.8191489361702126e-05,
"loss": 1.21,
"step": 265
},
{
"epoch": 0.28,
"grad_norm": 0.055774882435798645,
"learning_rate": 2.8297872340425532e-05,
"loss": 1.3027,
"step": 266
},
{
"epoch": 0.28,
"grad_norm": 0.04664234817028046,
"learning_rate": 2.8404255319148935e-05,
"loss": 1.2192,
"step": 267
},
{
"epoch": 0.29,
"grad_norm": 0.05001696199178696,
"learning_rate": 2.851063829787234e-05,
"loss": 1.2731,
"step": 268
},
{
"epoch": 0.29,
"grad_norm": 0.04934161901473999,
"learning_rate": 2.8617021276595744e-05,
"loss": 1.267,
"step": 269
},
{
"epoch": 0.29,
"grad_norm": 0.05439780652523041,
"learning_rate": 2.872340425531915e-05,
"loss": 1.3526,
"step": 270
},
{
"epoch": 0.29,
"grad_norm": 0.04861316457390785,
"learning_rate": 2.8829787234042553e-05,
"loss": 1.3071,
"step": 271
},
{
"epoch": 0.29,
"grad_norm": 0.04321487993001938,
"learning_rate": 2.8936170212765956e-05,
"loss": 1.2382,
"step": 272
},
{
"epoch": 0.29,
"grad_norm": 0.07793135941028595,
"learning_rate": 2.9042553191489362e-05,
"loss": 1.2424,
"step": 273
},
{
"epoch": 0.29,
"grad_norm": 0.04725024476647377,
"learning_rate": 2.9148936170212765e-05,
"loss": 1.2969,
"step": 274
},
{
"epoch": 0.29,
"grad_norm": 0.04626401886343956,
"learning_rate": 2.925531914893617e-05,
"loss": 1.3019,
"step": 275
},
{
"epoch": 0.29,
"grad_norm": 0.045404914766550064,
"learning_rate": 2.9361702127659574e-05,
"loss": 1.2824,
"step": 276
},
{
"epoch": 0.3,
"grad_norm": 0.05153006315231323,
"learning_rate": 2.946808510638298e-05,
"loss": 1.3043,
"step": 277
},
{
"epoch": 0.3,
"grad_norm": 0.04886719956994057,
"learning_rate": 2.9574468085106383e-05,
"loss": 1.3016,
"step": 278
},
{
"epoch": 0.3,
"grad_norm": 0.045437101274728775,
"learning_rate": 2.968085106382979e-05,
"loss": 1.3027,
"step": 279
},
{
"epoch": 0.3,
"grad_norm": 0.044898632913827896,
"learning_rate": 2.9787234042553192e-05,
"loss": 1.2955,
"step": 280
},
{
"epoch": 0.3,
"grad_norm": 0.15321741998195648,
"learning_rate": 2.9893617021276598e-05,
"loss": 1.3549,
"step": 281
},
{
"epoch": 0.3,
"grad_norm": 0.04944868013262749,
"learning_rate": 3e-05,
"loss": 1.2074,
"step": 282
},
{
"epoch": 0.3,
"grad_norm": 0.04308634251356125,
"learning_rate": 2.9999988453946903e-05,
"loss": 1.2587,
"step": 283
},
{
"epoch": 0.3,
"grad_norm": 0.04539733752608299,
"learning_rate": 2.9999953815805386e-05,
"loss": 1.2209,
"step": 284
},
{
"epoch": 0.3,
"grad_norm": 0.05106598138809204,
"learning_rate": 2.9999896085628773e-05,
"loss": 1.2978,
"step": 285
},
{
"epoch": 0.3,
"grad_norm": 0.04330745339393616,
"learning_rate": 2.9999815263505937e-05,
"loss": 1.3341,
"step": 286
},
{
"epoch": 0.31,
"grad_norm": 0.49190011620521545,
"learning_rate": 2.999971134956131e-05,
"loss": 1.1967,
"step": 287
},
{
"epoch": 0.31,
"grad_norm": 0.12518596649169922,
"learning_rate": 2.9999584343954855e-05,
"loss": 1.3994,
"step": 288
},
{
"epoch": 0.31,
"grad_norm": 0.0452754944562912,
"learning_rate": 2.9999434246882094e-05,
"loss": 1.3459,
"step": 289
},
{
"epoch": 0.31,
"grad_norm": 0.15973900258541107,
"learning_rate": 2.9999261058574106e-05,
"loss": 1.2315,
"step": 290
},
{
"epoch": 0.31,
"grad_norm": 0.05226750299334526,
"learning_rate": 2.99990647792975e-05,
"loss": 1.3226,
"step": 291
},
{
"epoch": 0.31,
"grad_norm": 0.05565487965941429,
"learning_rate": 2.999884540935445e-05,
"loss": 1.274,
"step": 292
},
{
"epoch": 0.31,
"grad_norm": 0.042020637542009354,
"learning_rate": 2.9998602949082663e-05,
"loss": 1.2769,
"step": 293
},
{
"epoch": 0.31,
"grad_norm": 0.04224269464612007,
"learning_rate": 2.999833739885541e-05,
"loss": 1.2758,
"step": 294
},
{
"epoch": 0.31,
"grad_norm": 0.04438871517777443,
"learning_rate": 2.999804875908149e-05,
"loss": 1.2992,
"step": 295
},
{
"epoch": 0.32,
"grad_norm": 0.04211125522851944,
"learning_rate": 2.999773703020526e-05,
"loss": 1.2551,
"step": 296
},
{
"epoch": 0.32,
"grad_norm": 0.044215280562639236,
"learning_rate": 2.999740221270662e-05,
"loss": 1.3433,
"step": 297
},
{
"epoch": 0.32,
"grad_norm": 0.0452650748193264,
"learning_rate": 2.999704430710101e-05,
"loss": 1.0829,
"step": 298
},
{
"epoch": 0.32,
"grad_norm": 0.04780622571706772,
"learning_rate": 2.9996663313939412e-05,
"loss": 1.3217,
"step": 299
},
{
"epoch": 0.32,
"grad_norm": 0.042652346193790436,
"learning_rate": 2.999625923380837e-05,
"loss": 1.2241,
"step": 300
},
{
"epoch": 0.32,
"grad_norm": 0.038346268236637115,
"learning_rate": 2.9995832067329933e-05,
"loss": 1.2186,
"step": 301
},
{
"epoch": 0.32,
"grad_norm": 0.04913242533802986,
"learning_rate": 2.9995381815161732e-05,
"loss": 1.3639,
"step": 302
},
{
"epoch": 0.32,
"grad_norm": 0.046195752918720245,
"learning_rate": 2.9994908477996913e-05,
"loss": 1.2807,
"step": 303
},
{
"epoch": 0.32,
"grad_norm": 0.04986129701137543,
"learning_rate": 2.9994412056564157e-05,
"loss": 1.3036,
"step": 304
},
{
"epoch": 0.33,
"grad_norm": 0.053075432777404785,
"learning_rate": 2.9993892551627702e-05,
"loss": 1.3456,
"step": 305
},
{
"epoch": 0.33,
"grad_norm": 0.04712017998099327,
"learning_rate": 2.9993349963987306e-05,
"loss": 1.4103,
"step": 306
},
{
"epoch": 0.33,
"grad_norm": 0.0960259810090065,
"learning_rate": 2.9992784294478277e-05,
"loss": 1.2689,
"step": 307
},
{
"epoch": 0.33,
"grad_norm": 0.04377806931734085,
"learning_rate": 2.9992195543971437e-05,
"loss": 1.2946,
"step": 308
},
{
"epoch": 0.33,
"grad_norm": 0.04737775772809982,
"learning_rate": 2.999158371337316e-05,
"loss": 1.3504,
"step": 309
},
{
"epoch": 0.33,
"grad_norm": 0.04784572497010231,
"learning_rate": 2.9990948803625344e-05,
"loss": 1.217,
"step": 310
},
{
"epoch": 0.33,
"grad_norm": 0.046987585723400116,
"learning_rate": 2.999029081570541e-05,
"loss": 1.1544,
"step": 311
},
{
"epoch": 0.33,
"grad_norm": 0.04943583905696869,
"learning_rate": 2.9989609750626313e-05,
"loss": 1.2561,
"step": 312
},
{
"epoch": 0.33,
"grad_norm": 0.04215755686163902,
"learning_rate": 2.998890560943654e-05,
"loss": 1.2639,
"step": 313
},
{
"epoch": 0.33,
"grad_norm": 0.04213941469788551,
"learning_rate": 2.99881783932201e-05,
"loss": 1.2206,
"step": 314
},
{
"epoch": 0.34,
"grad_norm": 0.057730745524168015,
"learning_rate": 2.9987428103096507e-05,
"loss": 1.3025,
"step": 315
},
{
"epoch": 0.34,
"grad_norm": 0.04681975021958351,
"learning_rate": 2.9986654740220835e-05,
"loss": 1.3503,
"step": 316
},
{
"epoch": 0.34,
"grad_norm": 0.047414880245923996,
"learning_rate": 2.9985858305783643e-05,
"loss": 1.3505,
"step": 317
},
{
"epoch": 0.34,
"grad_norm": 0.042863670736551285,
"learning_rate": 2.998503880101102e-05,
"loss": 1.2928,
"step": 318
},
{
"epoch": 0.34,
"grad_norm": 0.041847921907901764,
"learning_rate": 2.998419622716458e-05,
"loss": 1.2902,
"step": 319
},
{
"epoch": 0.34,
"grad_norm": 0.04203863441944122,
"learning_rate": 2.998333058554144e-05,
"loss": 1.226,
"step": 320
},
{
"epoch": 0.34,
"grad_norm": 0.045996423810720444,
"learning_rate": 2.9982441877474225e-05,
"loss": 1.2747,
"step": 321
},
{
"epoch": 0.34,
"grad_norm": 0.04503284767270088,
"learning_rate": 2.9981530104331087e-05,
"loss": 1.3563,
"step": 322
},
{
"epoch": 0.34,
"grad_norm": 0.04348743334412575,
"learning_rate": 2.9980595267515677e-05,
"loss": 1.3476,
"step": 323
},
{
"epoch": 0.35,
"grad_norm": 0.05177181586623192,
"learning_rate": 2.9979637368467143e-05,
"loss": 1.2985,
"step": 324
},
{
"epoch": 0.35,
"grad_norm": 0.04224792867898941,
"learning_rate": 2.9978656408660157e-05,
"loss": 1.263,
"step": 325
},
{
"epoch": 0.35,
"grad_norm": 0.04062522202730179,
"learning_rate": 2.9977652389604867e-05,
"loss": 1.2936,
"step": 326
},
{
"epoch": 0.35,
"grad_norm": 0.04481014609336853,
"learning_rate": 2.9976625312846952e-05,
"loss": 1.3739,
"step": 327
},
{
"epoch": 0.35,
"grad_norm": 0.04340618476271629,
"learning_rate": 2.9975575179967552e-05,
"loss": 1.3147,
"step": 328
},
{
"epoch": 0.35,
"grad_norm": 0.038771748542785645,
"learning_rate": 2.9974501992583333e-05,
"loss": 1.1779,
"step": 329
},
{
"epoch": 0.35,
"grad_norm": 0.04300400987267494,
"learning_rate": 2.9973405752346424e-05,
"loss": 1.32,
"step": 330
},
{
"epoch": 0.35,
"grad_norm": 0.04594357684254646,
"learning_rate": 2.9972286460944477e-05,
"loss": 1.2634,
"step": 331
},
{
"epoch": 0.35,
"grad_norm": 0.041061241179704666,
"learning_rate": 2.997114412010059e-05,
"loss": 1.2628,
"step": 332
},
{
"epoch": 0.35,
"grad_norm": 0.04096174240112305,
"learning_rate": 2.9969978731573384e-05,
"loss": 1.2161,
"step": 333
},
{
"epoch": 0.36,
"grad_norm": 0.051395904272794724,
"learning_rate": 2.996879029715694e-05,
"loss": 1.3317,
"step": 334
},
{
"epoch": 0.36,
"grad_norm": 0.04332401603460312,
"learning_rate": 2.9967578818680817e-05,
"loss": 1.1812,
"step": 335
},
{
"epoch": 0.36,
"grad_norm": 0.04285382851958275,
"learning_rate": 2.9966344298010055e-05,
"loss": 1.3631,
"step": 336
},
{
"epoch": 0.36,
"grad_norm": 0.04322006553411484,
"learning_rate": 2.996508673704517e-05,
"loss": 1.3598,
"step": 337
},
{
"epoch": 0.36,
"grad_norm": 0.04882935434579849,
"learning_rate": 2.9963806137722145e-05,
"loss": 1.3121,
"step": 338
},
{
"epoch": 0.36,
"grad_norm": 0.04319699481129646,
"learning_rate": 2.996250250201242e-05,
"loss": 1.1114,
"step": 339
},
{
"epoch": 0.36,
"grad_norm": 0.044990669935941696,
"learning_rate": 2.996117583192292e-05,
"loss": 1.4712,
"step": 340
},
{
"epoch": 0.36,
"grad_norm": 0.04074972867965698,
"learning_rate": 2.995982612949601e-05,
"loss": 1.1548,
"step": 341
},
{
"epoch": 0.36,
"grad_norm": 0.042052388191223145,
"learning_rate": 2.9958453396809524e-05,
"loss": 1.2864,
"step": 342
},
{
"epoch": 0.37,
"grad_norm": 0.04926493391394615,
"learning_rate": 2.995705763597675e-05,
"loss": 1.295,
"step": 343
},
{
"epoch": 0.37,
"grad_norm": 0.04231850057840347,
"learning_rate": 2.9955638849146422e-05,
"loss": 1.2497,
"step": 344
},
{
"epoch": 0.37,
"grad_norm": 0.04407385364174843,
"learning_rate": 2.9954197038502727e-05,
"loss": 1.3868,
"step": 345
},
{
"epoch": 0.37,
"grad_norm": 0.04645228013396263,
"learning_rate": 2.9952732206265295e-05,
"loss": 1.2736,
"step": 346
},
{
"epoch": 0.37,
"grad_norm": 0.042599406093358994,
"learning_rate": 2.9951244354689195e-05,
"loss": 1.3913,
"step": 347
},
{
"epoch": 0.37,
"grad_norm": 0.04068687558174133,
"learning_rate": 2.994973348606494e-05,
"loss": 1.267,
"step": 348
},
{
"epoch": 0.37,
"grad_norm": 0.039476945996284485,
"learning_rate": 2.9948199602718463e-05,
"loss": 1.3569,
"step": 349
},
{
"epoch": 0.37,
"grad_norm": 0.041935890913009644,
"learning_rate": 2.9946642707011144e-05,
"loss": 1.0696,
"step": 350
},
{
"epoch": 0.37,
"grad_norm": 0.042615581303834915,
"learning_rate": 2.9945062801339784e-05,
"loss": 1.2675,
"step": 351
},
{
"epoch": 0.38,
"grad_norm": 0.03951689228415489,
"learning_rate": 2.9943459888136607e-05,
"loss": 1.2978,
"step": 352
},
{
"epoch": 0.38,
"grad_norm": 0.042145539075136185,
"learning_rate": 2.994183396986925e-05,
"loss": 1.3425,
"step": 353
},
{
"epoch": 0.38,
"grad_norm": 0.04591543972492218,
"learning_rate": 2.994018504904078e-05,
"loss": 1.2591,
"step": 354
},
{
"epoch": 0.38,
"grad_norm": 0.042830970138311386,
"learning_rate": 2.993851312818965e-05,
"loss": 1.3181,
"step": 355
},
{
"epoch": 0.38,
"grad_norm": 0.04242956265807152,
"learning_rate": 2.9936818209889764e-05,
"loss": 1.3664,
"step": 356
},
{
"epoch": 0.38,
"grad_norm": 0.04106505587697029,
"learning_rate": 2.993510029675038e-05,
"loss": 1.2306,
"step": 357
},
{
"epoch": 0.38,
"grad_norm": 0.04205963388085365,
"learning_rate": 2.9933359391416197e-05,
"loss": 1.3389,
"step": 358
},
{
"epoch": 0.38,
"grad_norm": 0.043333835899829865,
"learning_rate": 2.9931595496567285e-05,
"loss": 1.3347,
"step": 359
},
{
"epoch": 0.38,
"grad_norm": 0.03695542365312576,
"learning_rate": 2.9929808614919114e-05,
"loss": 1.3666,
"step": 360
},
{
"epoch": 0.38,
"grad_norm": 0.03952499479055405,
"learning_rate": 2.9927998749222546e-05,
"loss": 1.346,
"step": 361
},
{
"epoch": 0.39,
"grad_norm": 0.0432700589299202,
"learning_rate": 2.9926165902263814e-05,
"loss": 1.4122,
"step": 362
},
{
"epoch": 0.39,
"grad_norm": 0.04118409752845764,
"learning_rate": 2.992431007686455e-05,
"loss": 1.3356,
"step": 363
},
{
"epoch": 0.39,
"grad_norm": 0.041710883378982544,
"learning_rate": 2.9922431275881736e-05,
"loss": 1.2396,
"step": 364
},
{
"epoch": 0.39,
"grad_norm": 0.040193233639001846,
"learning_rate": 2.9920529502207744e-05,
"loss": 1.3486,
"step": 365
},
{
"epoch": 0.39,
"grad_norm": 0.07698454707860947,
"learning_rate": 2.9918604758770298e-05,
"loss": 1.351,
"step": 366
},
{
"epoch": 0.39,
"grad_norm": 0.04122081398963928,
"learning_rate": 2.9916657048532498e-05,
"loss": 1.2852,
"step": 367
},
{
"epoch": 0.39,
"grad_norm": 0.0415961854159832,
"learning_rate": 2.991468637449279e-05,
"loss": 1.1954,
"step": 368
},
{
"epoch": 0.39,
"grad_norm": 0.05483356490731239,
"learning_rate": 2.9912692739684973e-05,
"loss": 1.1881,
"step": 369
},
{
"epoch": 0.39,
"grad_norm": 0.043232712894678116,
"learning_rate": 2.9910676147178194e-05,
"loss": 1.3717,
"step": 370
},
{
"epoch": 0.4,
"grad_norm": 0.04087051749229431,
"learning_rate": 2.990863660007695e-05,
"loss": 1.2881,
"step": 371
},
{
"epoch": 0.4,
"grad_norm": 0.0421091727912426,
"learning_rate": 2.9906574101521068e-05,
"loss": 1.3797,
"step": 372
},
{
"epoch": 0.4,
"grad_norm": 0.0437094122171402,
"learning_rate": 2.9904488654685706e-05,
"loss": 1.3868,
"step": 373
},
{
"epoch": 0.4,
"grad_norm": 0.044396668672561646,
"learning_rate": 2.990238026278136e-05,
"loss": 1.3332,
"step": 374
},
{
"epoch": 0.4,
"grad_norm": 0.0429987758398056,
"learning_rate": 2.990024892905384e-05,
"loss": 1.3638,
"step": 375
},
{
"epoch": 0.4,
"grad_norm": 0.05373203381896019,
"learning_rate": 2.9898094656784283e-05,
"loss": 1.274,
"step": 376
},
{
"epoch": 0.4,
"grad_norm": 0.04591381922364235,
"learning_rate": 2.9895917449289128e-05,
"loss": 1.2414,
"step": 377
},
{
"epoch": 0.4,
"grad_norm": 0.04005320370197296,
"learning_rate": 2.9893717309920134e-05,
"loss": 1.3568,
"step": 378
},
{
"epoch": 0.4,
"grad_norm": 0.042381271719932556,
"learning_rate": 2.989149424206436e-05,
"loss": 1.3165,
"step": 379
},
{
"epoch": 0.41,
"grad_norm": 0.04066908732056618,
"learning_rate": 2.9889248249144153e-05,
"loss": 1.1674,
"step": 380
},
{
"epoch": 0.41,
"grad_norm": 0.06059327349066734,
"learning_rate": 2.9886979334617167e-05,
"loss": 1.3155,
"step": 381
},
{
"epoch": 0.41,
"grad_norm": 0.05215556547045708,
"learning_rate": 2.9884687501976336e-05,
"loss": 1.2876,
"step": 382
},
{
"epoch": 0.41,
"grad_norm": 0.039729043841362,
"learning_rate": 2.9882372754749867e-05,
"loss": 1.3197,
"step": 383
},
{
"epoch": 0.41,
"grad_norm": 0.04095631465315819,
"learning_rate": 2.9880035096501265e-05,
"loss": 1.2877,
"step": 384
},
{
"epoch": 0.41,
"grad_norm": 0.04893979802727699,
"learning_rate": 2.9877674530829286e-05,
"loss": 1.2932,
"step": 385
},
{
"epoch": 0.41,
"grad_norm": 0.043521177023649216,
"learning_rate": 2.987529106136796e-05,
"loss": 1.359,
"step": 386
},
{
"epoch": 0.41,
"grad_norm": 0.04460636153817177,
"learning_rate": 2.9872884691786576e-05,
"loss": 1.3171,
"step": 387
},
{
"epoch": 0.41,
"grad_norm": 0.039205264300107956,
"learning_rate": 2.9870455425789678e-05,
"loss": 1.1949,
"step": 388
},
{
"epoch": 0.41,
"grad_norm": 0.04290665313601494,
"learning_rate": 2.986800326711706e-05,
"loss": 1.3771,
"step": 389
},
{
"epoch": 0.42,
"grad_norm": 0.04245166853070259,
"learning_rate": 2.9865528219543747e-05,
"loss": 1.3134,
"step": 390
},
{
"epoch": 0.42,
"grad_norm": 0.04090409353375435,
"learning_rate": 2.9863030286880017e-05,
"loss": 1.387,
"step": 391
},
{
"epoch": 0.42,
"grad_norm": 0.04054385796189308,
"learning_rate": 2.986050947297137e-05,
"loss": 1.2733,
"step": 392
},
{
"epoch": 0.42,
"grad_norm": 0.04850127920508385,
"learning_rate": 2.985796578169853e-05,
"loss": 1.2143,
"step": 393
},
{
"epoch": 0.42,
"grad_norm": 0.05087016895413399,
"learning_rate": 2.9855399216977453e-05,
"loss": 1.3101,
"step": 394
},
{
"epoch": 0.42,
"grad_norm": 0.044336311519145966,
"learning_rate": 2.9852809782759285e-05,
"loss": 1.2811,
"step": 395
},
{
"epoch": 0.42,
"grad_norm": 0.04210364446043968,
"learning_rate": 2.9850197483030397e-05,
"loss": 1.2509,
"step": 396
},
{
"epoch": 0.42,
"grad_norm": 0.043686628341674805,
"learning_rate": 2.9847562321812358e-05,
"loss": 1.262,
"step": 397
},
{
"epoch": 0.42,
"grad_norm": 0.04465370252728462,
"learning_rate": 2.9844904303161925e-05,
"loss": 1.2717,
"step": 398
},
{
"epoch": 0.43,
"grad_norm": 0.041322916746139526,
"learning_rate": 2.9842223431171056e-05,
"loss": 1.3261,
"step": 399
},
{
"epoch": 0.43,
"grad_norm": 0.05307582765817642,
"learning_rate": 2.9839519709966875e-05,
"loss": 1.229,
"step": 400
},
{
"epoch": 0.43,
"grad_norm": 0.05377936363220215,
"learning_rate": 2.9836793143711692e-05,
"loss": 1.3681,
"step": 401
},
{
"epoch": 0.43,
"grad_norm": 0.06358564645051956,
"learning_rate": 2.9834043736602985e-05,
"loss": 1.3573,
"step": 402
},
{
"epoch": 0.43,
"grad_norm": 0.046525027602910995,
"learning_rate": 2.9831271492873396e-05,
"loss": 1.1367,
"step": 403
},
{
"epoch": 0.43,
"grad_norm": 0.04199456423521042,
"learning_rate": 2.982847641679072e-05,
"loss": 1.2746,
"step": 404
},
{
"epoch": 0.43,
"grad_norm": 0.04503254592418671,
"learning_rate": 2.9825658512657902e-05,
"loss": 1.2289,
"step": 405
},
{
"epoch": 0.43,
"grad_norm": 0.0457291416823864,
"learning_rate": 2.982281778481303e-05,
"loss": 1.2515,
"step": 406
},
{
"epoch": 0.43,
"grad_norm": 0.04370247200131416,
"learning_rate": 2.9819954237629333e-05,
"loss": 1.1892,
"step": 407
},
{
"epoch": 0.43,
"grad_norm": 0.04227456450462341,
"learning_rate": 2.9817067875515165e-05,
"loss": 1.3287,
"step": 408
},
{
"epoch": 0.44,
"grad_norm": 0.04234752431511879,
"learning_rate": 2.981415870291401e-05,
"loss": 1.3267,
"step": 409
},
{
"epoch": 0.44,
"grad_norm": 0.04369445517659187,
"learning_rate": 2.981122672430445e-05,
"loss": 1.2835,
"step": 410
},
{
"epoch": 0.44,
"grad_norm": 0.05714486539363861,
"learning_rate": 2.9808271944200208e-05,
"loss": 1.3951,
"step": 411
},
{
"epoch": 0.44,
"grad_norm": 0.04107912257313728,
"learning_rate": 2.980529436715007e-05,
"loss": 1.272,
"step": 412
},
{
"epoch": 0.44,
"grad_norm": 0.040028270334005356,
"learning_rate": 2.980229399773795e-05,
"loss": 1.2165,
"step": 413
},
{
"epoch": 0.44,
"grad_norm": 0.03916673734784126,
"learning_rate": 2.9799270840582838e-05,
"loss": 1.1649,
"step": 414
},
{
"epoch": 0.44,
"grad_norm": 0.03947027027606964,
"learning_rate": 2.97962249003388e-05,
"loss": 1.1785,
"step": 415
},
{
"epoch": 0.44,
"grad_norm": 0.10322597622871399,
"learning_rate": 2.979315618169499e-05,
"loss": 1.1446,
"step": 416
},
{
"epoch": 0.44,
"grad_norm": 0.044616565108299255,
"learning_rate": 2.9790064689375605e-05,
"loss": 1.2937,
"step": 417
},
{
"epoch": 0.45,
"grad_norm": 0.0792105421423912,
"learning_rate": 2.9786950428139926e-05,
"loss": 1.0531,
"step": 418
},
{
"epoch": 0.45,
"grad_norm": 0.042875826358795166,
"learning_rate": 2.978381340278228e-05,
"loss": 1.4458,
"step": 419
},
{
"epoch": 0.45,
"grad_norm": 0.042151302099227905,
"learning_rate": 2.9780653618132026e-05,
"loss": 1.4125,
"step": 420
},
{
"epoch": 0.45,
"grad_norm": 0.0390373058617115,
"learning_rate": 2.9777471079053573e-05,
"loss": 1.2743,
"step": 421
},
{
"epoch": 0.45,
"grad_norm": 0.04105671867728233,
"learning_rate": 2.977426579044636e-05,
"loss": 1.3484,
"step": 422
},
{
"epoch": 0.45,
"grad_norm": 0.04211370646953583,
"learning_rate": 2.977103775724484e-05,
"loss": 1.3602,
"step": 423
},
{
"epoch": 0.45,
"grad_norm": 0.07791785150766373,
"learning_rate": 2.9767786984418484e-05,
"loss": 1.1127,
"step": 424
},
{
"epoch": 0.45,
"grad_norm": 0.03947106748819351,
"learning_rate": 2.9764513476971783e-05,
"loss": 1.489,
"step": 425
},
{
"epoch": 0.45,
"grad_norm": 0.04182416945695877,
"learning_rate": 2.9761217239944202e-05,
"loss": 1.2798,
"step": 426
},
{
"epoch": 0.46,
"grad_norm": 0.0393020324409008,
"learning_rate": 2.9757898278410216e-05,
"loss": 1.2509,
"step": 427
},
{
"epoch": 0.46,
"grad_norm": 0.042079027742147446,
"learning_rate": 2.975455659747928e-05,
"loss": 1.2902,
"step": 428
},
{
"epoch": 0.46,
"grad_norm": 0.0440564788877964,
"learning_rate": 2.9751192202295824e-05,
"loss": 1.4684,
"step": 429
},
{
"epoch": 0.46,
"grad_norm": 0.041536420583724976,
"learning_rate": 2.9747805098039246e-05,
"loss": 1.1899,
"step": 430
},
{
"epoch": 0.46,
"grad_norm": 0.05359746143221855,
"learning_rate": 2.9744395289923903e-05,
"loss": 1.1661,
"step": 431
},
{
"epoch": 0.46,
"grad_norm": 0.037684116512537,
"learning_rate": 2.974096278319911e-05,
"loss": 1.301,
"step": 432
},
{
"epoch": 0.46,
"grad_norm": 0.038028784096241,
"learning_rate": 2.9737507583149116e-05,
"loss": 1.3669,
"step": 433
},
{
"epoch": 0.46,
"grad_norm": 0.04109985753893852,
"learning_rate": 2.973402969509311e-05,
"loss": 1.3343,
"step": 434
},
{
"epoch": 0.46,
"grad_norm": 0.04338208958506584,
"learning_rate": 2.973052912438521e-05,
"loss": 1.2158,
"step": 435
},
{
"epoch": 0.46,
"grad_norm": 0.06308567523956299,
"learning_rate": 2.9727005876414452e-05,
"loss": 1.2786,
"step": 436
},
{
"epoch": 0.47,
"grad_norm": 0.06322115659713745,
"learning_rate": 2.972345995660479e-05,
"loss": 1.2336,
"step": 437
},
{
"epoch": 0.47,
"grad_norm": 0.0424451045691967,
"learning_rate": 2.9719891370415072e-05,
"loss": 1.2459,
"step": 438
},
{
"epoch": 0.47,
"grad_norm": 0.044482551515102386,
"learning_rate": 2.9716300123339034e-05,
"loss": 1.2846,
"step": 439
},
{
"epoch": 0.47,
"grad_norm": 0.039096757769584656,
"learning_rate": 2.9712686220905318e-05,
"loss": 1.4319,
"step": 440
},
{
"epoch": 0.47,
"grad_norm": 0.3719871938228607,
"learning_rate": 2.9709049668677425e-05,
"loss": 1.326,
"step": 441
},
{
"epoch": 0.47,
"grad_norm": 0.040621835738420486,
"learning_rate": 2.9705390472253738e-05,
"loss": 1.3761,
"step": 442
},
{
"epoch": 0.47,
"grad_norm": 0.03975391760468483,
"learning_rate": 2.9701708637267487e-05,
"loss": 1.3475,
"step": 443
},
{
"epoch": 0.47,
"grad_norm": 0.04256337508559227,
"learning_rate": 2.9698004169386762e-05,
"loss": 1.3495,
"step": 444
},
{
"epoch": 0.47,
"grad_norm": 0.04694630578160286,
"learning_rate": 2.969427707431449e-05,
"loss": 1.3478,
"step": 445
},
{
"epoch": 0.48,
"grad_norm": 0.04910700023174286,
"learning_rate": 2.9690527357788452e-05,
"loss": 1.2657,
"step": 446
},
{
"epoch": 0.48,
"grad_norm": 0.04067717120051384,
"learning_rate": 2.9686755025581224e-05,
"loss": 1.3389,
"step": 447
},
{
"epoch": 0.48,
"grad_norm": 0.04208545386791229,
"learning_rate": 2.9682960083500214e-05,
"loss": 1.3061,
"step": 448
},
{
"epoch": 0.48,
"grad_norm": 0.04172629490494728,
"learning_rate": 2.9679142537387636e-05,
"loss": 1.252,
"step": 449
},
{
"epoch": 0.48,
"grad_norm": 0.04157547652721405,
"learning_rate": 2.9675302393120506e-05,
"loss": 1.2367,
"step": 450
},
{
"epoch": 0.48,
"grad_norm": 0.04133530333638191,
"learning_rate": 2.9671439656610622e-05,
"loss": 1.2163,
"step": 451
},
{
"epoch": 0.48,
"grad_norm": 0.03688955307006836,
"learning_rate": 2.966755433380457e-05,
"loss": 1.057,
"step": 452
},
{
"epoch": 0.48,
"grad_norm": 0.041086386889219284,
"learning_rate": 2.9663646430683695e-05,
"loss": 1.2647,
"step": 453
},
{
"epoch": 0.48,
"grad_norm": 0.04206252843141556,
"learning_rate": 2.9659715953264114e-05,
"loss": 1.2702,
"step": 454
},
{
"epoch": 0.48,
"grad_norm": 0.044997621327638626,
"learning_rate": 2.9655762907596695e-05,
"loss": 1.2934,
"step": 455
},
{
"epoch": 0.49,
"grad_norm": 0.04272051900625229,
"learning_rate": 2.9651787299767044e-05,
"loss": 1.16,
"step": 456
},
{
"epoch": 0.49,
"grad_norm": 0.04372408613562584,
"learning_rate": 2.9647789135895514e-05,
"loss": 1.3399,
"step": 457
},
{
"epoch": 0.49,
"grad_norm": 0.04171840101480484,
"learning_rate": 2.9643768422137167e-05,
"loss": 1.1995,
"step": 458
},
{
"epoch": 0.49,
"grad_norm": 0.04140634834766388,
"learning_rate": 2.963972516468179e-05,
"loss": 1.2467,
"step": 459
},
{
"epoch": 0.49,
"grad_norm": 0.041341882199048996,
"learning_rate": 2.9635659369753865e-05,
"loss": 1.2195,
"step": 460
},
{
"epoch": 0.49,
"grad_norm": 0.044502872973680496,
"learning_rate": 2.963157104361258e-05,
"loss": 1.1808,
"step": 461
},
{
"epoch": 0.49,
"grad_norm": 0.04876202344894409,
"learning_rate": 2.9627460192551806e-05,
"loss": 1.3945,
"step": 462
},
{
"epoch": 0.49,
"grad_norm": 0.0470486581325531,
"learning_rate": 2.9623326822900094e-05,
"loss": 1.3613,
"step": 463
},
{
"epoch": 0.49,
"grad_norm": 0.04317512735724449,
"learning_rate": 2.9619170941020652e-05,
"loss": 1.3771,
"step": 464
},
{
"epoch": 0.5,
"grad_norm": 0.046641379594802856,
"learning_rate": 2.9614992553311356e-05,
"loss": 1.5062,
"step": 465
},
{
"epoch": 0.5,
"grad_norm": 0.04240740090608597,
"learning_rate": 2.9610791666204715e-05,
"loss": 1.228,
"step": 466
},
{
"epoch": 0.5,
"grad_norm": 0.04536912590265274,
"learning_rate": 2.9606568286167897e-05,
"loss": 1.4184,
"step": 467
},
{
"epoch": 0.5,
"grad_norm": 0.04742661491036415,
"learning_rate": 2.960232241970268e-05,
"loss": 1.2567,
"step": 468
},
{
"epoch": 0.5,
"grad_norm": 0.041507788002491,
"learning_rate": 2.959805407334546e-05,
"loss": 1.2605,
"step": 469
},
{
"epoch": 0.5,
"grad_norm": 0.04272027313709259,
"learning_rate": 2.959376325366725e-05,
"loss": 1.2577,
"step": 470
},
{
"epoch": 0.5,
"grad_norm": 0.04529990255832672,
"learning_rate": 2.9589449967273647e-05,
"loss": 1.2425,
"step": 471
},
{
"epoch": 0.5,
"grad_norm": 0.044238731265068054,
"learning_rate": 2.9585114220804848e-05,
"loss": 1.3398,
"step": 472
},
{
"epoch": 0.5,
"grad_norm": 0.04481234773993492,
"learning_rate": 2.9580756020935615e-05,
"loss": 1.1884,
"step": 473
},
{
"epoch": 0.51,
"grad_norm": 0.04729278013110161,
"learning_rate": 2.957637537437529e-05,
"loss": 1.3259,
"step": 474
},
{
"epoch": 0.51,
"grad_norm": 0.05300765484571457,
"learning_rate": 2.9571972287867767e-05,
"loss": 1.3226,
"step": 475
},
{
"epoch": 0.51,
"grad_norm": 0.04174305126070976,
"learning_rate": 2.9567546768191463e-05,
"loss": 1.0939,
"step": 476
},
{
"epoch": 0.51,
"grad_norm": 0.043156612664461136,
"learning_rate": 2.956309882215937e-05,
"loss": 1.3715,
"step": 477
},
{
"epoch": 0.51,
"grad_norm": 0.0387759804725647,
"learning_rate": 2.955862845661897e-05,
"loss": 1.198,
"step": 478
},
{
"epoch": 0.51,
"grad_norm": 0.08106935769319534,
"learning_rate": 2.9554135678452284e-05,
"loss": 1.3499,
"step": 479
},
{
"epoch": 0.51,
"grad_norm": 0.04314654693007469,
"learning_rate": 2.9549620494575816e-05,
"loss": 1.1616,
"step": 480
},
{
"epoch": 0.51,
"grad_norm": 0.04383983090519905,
"learning_rate": 2.954508291194058e-05,
"loss": 1.3067,
"step": 481
},
{
"epoch": 0.51,
"grad_norm": 0.03872944414615631,
"learning_rate": 2.954052293753206e-05,
"loss": 1.2056,
"step": 482
},
{
"epoch": 0.51,
"grad_norm": 0.04128652438521385,
"learning_rate": 2.953594057837023e-05,
"loss": 1.164,
"step": 483
},
{
"epoch": 0.52,
"grad_norm": 0.043227825313806534,
"learning_rate": 2.9531335841509495e-05,
"loss": 1.3105,
"step": 484
},
{
"epoch": 0.52,
"grad_norm": 0.03934094309806824,
"learning_rate": 2.952670873403873e-05,
"loss": 1.2328,
"step": 485
},
{
"epoch": 0.52,
"grad_norm": 0.050571732223033905,
"learning_rate": 2.952205926308125e-05,
"loss": 1.1993,
"step": 486
},
{
"epoch": 0.52,
"grad_norm": 0.044990718364715576,
"learning_rate": 2.9517387435794796e-05,
"loss": 1.3368,
"step": 487
},
{
"epoch": 0.52,
"grad_norm": 0.043135274201631546,
"learning_rate": 2.9512693259371518e-05,
"loss": 1.3259,
"step": 488
},
{
"epoch": 0.52,
"grad_norm": 0.04202147200703621,
"learning_rate": 2.950797674103798e-05,
"loss": 1.3862,
"step": 489
},
{
"epoch": 0.52,
"grad_norm": 0.044921282678842545,
"learning_rate": 2.9503237888055136e-05,
"loss": 1.3632,
"step": 490
},
{
"epoch": 0.52,
"grad_norm": 0.041155070066452026,
"learning_rate": 2.9498476707718328e-05,
"loss": 1.304,
"step": 491
},
{
"epoch": 0.52,
"grad_norm": 0.0547085665166378,
"learning_rate": 2.9493693207357266e-05,
"loss": 1.3055,
"step": 492
},
{
"epoch": 0.53,
"grad_norm": 0.04637569189071655,
"learning_rate": 2.9488887394336025e-05,
"loss": 1.264,
"step": 493
},
{
"epoch": 0.53,
"grad_norm": 0.0417875237762928,
"learning_rate": 2.9484059276053027e-05,
"loss": 1.4294,
"step": 494
},
{
"epoch": 0.53,
"grad_norm": 0.04684137552976608,
"learning_rate": 2.9479208859941034e-05,
"loss": 1.3123,
"step": 495
},
{
"epoch": 0.53,
"grad_norm": 0.04218194633722305,
"learning_rate": 2.9474336153467135e-05,
"loss": 1.1791,
"step": 496
},
{
"epoch": 0.53,
"grad_norm": 0.037852074950933456,
"learning_rate": 2.946944116413273e-05,
"loss": 1.2582,
"step": 497
},
{
"epoch": 0.53,
"grad_norm": 0.039469506591558456,
"learning_rate": 2.946452389947353e-05,
"loss": 1.1058,
"step": 498
},
{
"epoch": 0.53,
"grad_norm": 0.043612875044345856,
"learning_rate": 2.9459584367059533e-05,
"loss": 1.2935,
"step": 499
},
{
"epoch": 0.53,
"grad_norm": 0.04316055774688721,
"learning_rate": 2.9454622574495022e-05,
"loss": 1.3011,
"step": 500
},
{
"epoch": 0.53,
"grad_norm": 0.04812432825565338,
"learning_rate": 2.9449638529418544e-05,
"loss": 1.2616,
"step": 501
},
{
"epoch": 0.54,
"grad_norm": 0.11042577773332596,
"learning_rate": 2.9444632239502906e-05,
"loss": 1.301,
"step": 502
},
{
"epoch": 0.54,
"grad_norm": 0.046350039541721344,
"learning_rate": 2.9439603712455163e-05,
"loss": 1.4199,
"step": 503
},
{
"epoch": 0.54,
"grad_norm": 0.04246099293231964,
"learning_rate": 2.943455295601659e-05,
"loss": 1.2141,
"step": 504
},
{
"epoch": 0.54,
"grad_norm": 0.041246578097343445,
"learning_rate": 2.9429479977962712e-05,
"loss": 1.3512,
"step": 505
},
{
"epoch": 0.54,
"grad_norm": 0.042336758226156235,
"learning_rate": 2.942438478610323e-05,
"loss": 1.3296,
"step": 506
},
{
"epoch": 0.54,
"grad_norm": 0.041006408631801605,
"learning_rate": 2.941926738828206e-05,
"loss": 1.3967,
"step": 507
},
{
"epoch": 0.54,
"grad_norm": 0.04907152056694031,
"learning_rate": 2.9414127792377314e-05,
"loss": 1.323,
"step": 508
},
{
"epoch": 0.54,
"grad_norm": 0.03816520795226097,
"learning_rate": 2.9408966006301247e-05,
"loss": 1.3492,
"step": 509
},
{
"epoch": 0.54,
"grad_norm": 0.03879634290933609,
"learning_rate": 2.9403782038000306e-05,
"loss": 1.2649,
"step": 510
},
{
"epoch": 0.54,
"grad_norm": 0.042063359171152115,
"learning_rate": 2.939857589545507e-05,
"loss": 1.3584,
"step": 511
},
{
"epoch": 0.55,
"grad_norm": 0.04285269230604172,
"learning_rate": 2.9393347586680255e-05,
"loss": 1.2769,
"step": 512
},
{
"epoch": 0.55,
"grad_norm": 0.12035181373357773,
"learning_rate": 2.938809711972471e-05,
"loss": 1.347,
"step": 513
},
{
"epoch": 0.55,
"grad_norm": 0.05274464190006256,
"learning_rate": 2.9382824502671392e-05,
"loss": 1.3312,
"step": 514
},
{
"epoch": 0.55,
"grad_norm": 0.10297731310129166,
"learning_rate": 2.937752974363736e-05,
"loss": 1.339,
"step": 515
},
{
"epoch": 0.55,
"grad_norm": 0.10403720289468765,
"learning_rate": 2.9372212850773742e-05,
"loss": 1.3299,
"step": 516
},
{
"epoch": 0.55,
"grad_norm": 0.0409269854426384,
"learning_rate": 2.9366873832265766e-05,
"loss": 1.2374,
"step": 517
},
{
"epoch": 0.55,
"grad_norm": 0.13820002973079681,
"learning_rate": 2.9361512696332714e-05,
"loss": 1.2301,
"step": 518
},
{
"epoch": 0.55,
"grad_norm": 0.05118661746382713,
"learning_rate": 2.9356129451227903e-05,
"loss": 1.2539,
"step": 519
},
{
"epoch": 0.55,
"grad_norm": 0.039746690541505814,
"learning_rate": 2.9350724105238703e-05,
"loss": 1.335,
"step": 520
},
{
"epoch": 0.56,
"grad_norm": 0.04060789570212364,
"learning_rate": 2.9345296666686505e-05,
"loss": 1.3324,
"step": 521
},
{
"epoch": 0.56,
"grad_norm": 0.03841068223118782,
"learning_rate": 2.9339847143926705e-05,
"loss": 1.1333,
"step": 522
},
{
"epoch": 0.56,
"grad_norm": 0.04431808739900589,
"learning_rate": 2.93343755453487e-05,
"loss": 1.279,
"step": 523
},
{
"epoch": 0.56,
"grad_norm": 0.03678007051348686,
"learning_rate": 2.932888187937587e-05,
"loss": 1.1657,
"step": 524
},
{
"epoch": 0.56,
"grad_norm": 0.06775739043951035,
"learning_rate": 2.9323366154465584e-05,
"loss": 1.3119,
"step": 525
},
{
"epoch": 0.56,
"grad_norm": 0.03947743773460388,
"learning_rate": 2.9317828379109137e-05,
"loss": 1.2451,
"step": 526
},
{
"epoch": 0.56,
"grad_norm": 0.0774097591638565,
"learning_rate": 2.9312268561831797e-05,
"loss": 1.3464,
"step": 527
},
{
"epoch": 0.56,
"grad_norm": 0.04033540561795235,
"learning_rate": 2.9306686711192752e-05,
"loss": 1.2757,
"step": 528
},
{
"epoch": 0.56,
"grad_norm": 0.04514104872941971,
"learning_rate": 2.9301082835785123e-05,
"loss": 1.286,
"step": 529
},
{
"epoch": 0.56,
"grad_norm": 0.04130253940820694,
"learning_rate": 2.9295456944235928e-05,
"loss": 1.3865,
"step": 530
},
{
"epoch": 0.57,
"grad_norm": 0.0434497706592083,
"learning_rate": 2.9289809045206067e-05,
"loss": 1.2949,
"step": 531
},
{
"epoch": 0.57,
"grad_norm": 0.05484382435679436,
"learning_rate": 2.928413914739035e-05,
"loss": 1.3318,
"step": 532
},
{
"epoch": 0.57,
"grad_norm": 0.0377834215760231,
"learning_rate": 2.9278447259517423e-05,
"loss": 1.2115,
"step": 533
},
{
"epoch": 0.57,
"grad_norm": 0.04128464683890343,
"learning_rate": 2.92727333903498e-05,
"loss": 1.3297,
"step": 534
},
{
"epoch": 0.57,
"grad_norm": 0.044725801795721054,
"learning_rate": 2.9266997548683838e-05,
"loss": 1.3444,
"step": 535
},
{
"epoch": 0.57,
"grad_norm": 0.04274127259850502,
"learning_rate": 2.9261239743349708e-05,
"loss": 1.4422,
"step": 536
},
{
"epoch": 0.57,
"grad_norm": 0.04571430757641792,
"learning_rate": 2.9255459983211406e-05,
"loss": 1.368,
"step": 537
},
{
"epoch": 0.57,
"grad_norm": 0.043202176690101624,
"learning_rate": 2.924965827716672e-05,
"loss": 1.3105,
"step": 538
},
{
"epoch": 0.57,
"grad_norm": 0.04540957510471344,
"learning_rate": 2.924383463414722e-05,
"loss": 1.2472,
"step": 539
},
{
"epoch": 0.58,
"grad_norm": 0.0444292277097702,
"learning_rate": 2.9237989063118253e-05,
"loss": 1.2525,
"step": 540
},
{
"epoch": 0.58,
"grad_norm": 0.04204078018665314,
"learning_rate": 2.9232121573078923e-05,
"loss": 1.3316,
"step": 541
},
{
"epoch": 0.58,
"grad_norm": 0.045940667390823364,
"learning_rate": 2.922623217306208e-05,
"loss": 1.3357,
"step": 542
},
{
"epoch": 0.58,
"grad_norm": 0.06495653092861176,
"learning_rate": 2.9220320872134298e-05,
"loss": 1.2616,
"step": 543
},
{
"epoch": 0.58,
"grad_norm": 0.04345276579260826,
"learning_rate": 2.9214387679395868e-05,
"loss": 1.3768,
"step": 544
},
{
"epoch": 0.58,
"grad_norm": 0.04214997962117195,
"learning_rate": 2.9208432603980784e-05,
"loss": 1.2542,
"step": 545
},
{
"epoch": 0.58,
"grad_norm": 0.04196178540587425,
"learning_rate": 2.9202455655056732e-05,
"loss": 1.2082,
"step": 546
},
{
"epoch": 0.58,
"grad_norm": 0.04009537026286125,
"learning_rate": 2.9196456841825064e-05,
"loss": 1.337,
"step": 547
},
{
"epoch": 0.58,
"grad_norm": 0.049930717796087265,
"learning_rate": 2.9190436173520797e-05,
"loss": 1.2476,
"step": 548
},
{
"epoch": 0.59,
"grad_norm": 0.04278023913502693,
"learning_rate": 2.9184393659412597e-05,
"loss": 1.2621,
"step": 549
},
{
"epoch": 0.59,
"grad_norm": 0.04328072443604469,
"learning_rate": 2.9178329308802745e-05,
"loss": 1.2938,
"step": 550
},
{
"epoch": 0.59,
"grad_norm": 0.049271196126937866,
"learning_rate": 2.9172243131027163e-05,
"loss": 1.3036,
"step": 551
},
{
"epoch": 0.59,
"grad_norm": 0.03971061483025551,
"learning_rate": 2.9166135135455348e-05,
"loss": 1.3597,
"step": 552
},
{
"epoch": 0.59,
"grad_norm": 0.03890816867351532,
"learning_rate": 2.916000533149041e-05,
"loss": 1.305,
"step": 553
},
{
"epoch": 0.59,
"grad_norm": 0.03783348947763443,
"learning_rate": 2.9153853728569013e-05,
"loss": 1.2693,
"step": 554
},
{
"epoch": 0.59,
"grad_norm": 0.0400649756193161,
"learning_rate": 2.9147680336161394e-05,
"loss": 1.3958,
"step": 555
},
{
"epoch": 0.59,
"grad_norm": 0.05036576837301254,
"learning_rate": 2.9141485163771328e-05,
"loss": 1.3374,
"step": 556
},
{
"epoch": 0.59,
"grad_norm": 0.039989981800317764,
"learning_rate": 2.913526822093611e-05,
"loss": 1.2158,
"step": 557
},
{
"epoch": 0.59,
"grad_norm": 0.03950633481144905,
"learning_rate": 2.912902951722658e-05,
"loss": 1.2969,
"step": 558
},
{
"epoch": 0.6,
"grad_norm": 0.03998475894331932,
"learning_rate": 2.9122769062247042e-05,
"loss": 1.2929,
"step": 559
},
{
"epoch": 0.6,
"grad_norm": 0.041236512362957,
"learning_rate": 2.9116486865635305e-05,
"loss": 1.345,
"step": 560
},
{
"epoch": 0.6,
"grad_norm": 0.03818591311573982,
"learning_rate": 2.9110182937062655e-05,
"loss": 1.442,
"step": 561
},
{
"epoch": 0.6,
"grad_norm": 0.06457041203975677,
"learning_rate": 2.9103857286233815e-05,
"loss": 1.2847,
"step": 562
},
{
"epoch": 0.6,
"grad_norm": 0.06327484548091888,
"learning_rate": 2.909750992288696e-05,
"loss": 1.2155,
"step": 563
},
{
"epoch": 0.6,
"grad_norm": 0.04206790402531624,
"learning_rate": 2.909114085679369e-05,
"loss": 1.3996,
"step": 564
},
{
"epoch": 0.6,
"grad_norm": 0.0427677147090435,
"learning_rate": 2.9084750097759013e-05,
"loss": 1.3078,
"step": 565
},
{
"epoch": 0.6,
"grad_norm": 0.043656881898641586,
"learning_rate": 2.9078337655621347e-05,
"loss": 1.3627,
"step": 566
},
{
"epoch": 0.6,
"grad_norm": 0.04029693827033043,
"learning_rate": 2.907190354025246e-05,
"loss": 1.2636,
"step": 567
},
{
"epoch": 0.61,
"grad_norm": 0.04099896177649498,
"learning_rate": 2.9065447761557514e-05,
"loss": 1.2378,
"step": 568
},
{
"epoch": 0.61,
"grad_norm": 0.04414224252104759,
"learning_rate": 2.9058970329475012e-05,
"loss": 1.3569,
"step": 569
},
{
"epoch": 0.61,
"grad_norm": 0.04171021282672882,
"learning_rate": 2.9052471253976782e-05,
"loss": 1.2692,
"step": 570
},
{
"epoch": 0.61,
"grad_norm": 0.03992126137018204,
"learning_rate": 2.904595054506799e-05,
"loss": 1.2591,
"step": 571
},
{
"epoch": 0.61,
"grad_norm": 0.04168523848056793,
"learning_rate": 2.9039408212787094e-05,
"loss": 1.3033,
"step": 572
},
{
"epoch": 0.61,
"grad_norm": 0.049589138478040695,
"learning_rate": 2.9032844267205838e-05,
"loss": 1.2772,
"step": 573
},
{
"epoch": 0.61,
"grad_norm": 0.03980257362127304,
"learning_rate": 2.9026258718429245e-05,
"loss": 1.1404,
"step": 574
},
{
"epoch": 0.61,
"grad_norm": 0.05382708087563515,
"learning_rate": 2.9019651576595597e-05,
"loss": 1.2493,
"step": 575
},
{
"epoch": 0.61,
"grad_norm": 0.04534833878278732,
"learning_rate": 2.9013022851876416e-05,
"loss": 1.429,
"step": 576
},
{
"epoch": 0.61,
"grad_norm": 0.03764050453901291,
"learning_rate": 2.9006372554476445e-05,
"loss": 1.0558,
"step": 577
},
{
"epoch": 0.62,
"grad_norm": 0.04251871630549431,
"learning_rate": 2.8999700694633654e-05,
"loss": 1.324,
"step": 578
},
{
"epoch": 0.62,
"grad_norm": 0.04109210520982742,
"learning_rate": 2.899300728261918e-05,
"loss": 1.3635,
"step": 579
},
{
"epoch": 0.62,
"grad_norm": 0.03659521043300629,
"learning_rate": 2.898629232873736e-05,
"loss": 1.2116,
"step": 580
},
{
"epoch": 0.62,
"grad_norm": 0.04072749614715576,
"learning_rate": 2.89795558433257e-05,
"loss": 1.3335,
"step": 581
},
{
"epoch": 0.62,
"grad_norm": 0.03850967437028885,
"learning_rate": 2.897279783675483e-05,
"loss": 1.2563,
"step": 582
},
{
"epoch": 0.62,
"grad_norm": 0.042142994701862335,
"learning_rate": 2.8966018319428524e-05,
"loss": 1.4424,
"step": 583
},
{
"epoch": 0.62,
"grad_norm": 0.04661838337779045,
"learning_rate": 2.8959217301783682e-05,
"loss": 1.3586,
"step": 584
},
{
"epoch": 0.62,
"grad_norm": 0.042438358068466187,
"learning_rate": 2.8952394794290284e-05,
"loss": 1.2736,
"step": 585
},
{
"epoch": 0.62,
"grad_norm": 0.03861173614859581,
"learning_rate": 2.8945550807451395e-05,
"loss": 1.2671,
"step": 586
},
{
"epoch": 0.63,
"grad_norm": 0.044916752725839615,
"learning_rate": 2.8938685351803168e-05,
"loss": 1.2626,
"step": 587
},
{
"epoch": 0.63,
"grad_norm": 0.043335918337106705,
"learning_rate": 2.8931798437914778e-05,
"loss": 1.1822,
"step": 588
},
{
"epoch": 0.63,
"grad_norm": 0.040797822177410126,
"learning_rate": 2.892489007638846e-05,
"loss": 1.1828,
"step": 589
},
{
"epoch": 0.63,
"grad_norm": 0.04138658568263054,
"learning_rate": 2.8917960277859442e-05,
"loss": 1.2201,
"step": 590
},
{
"epoch": 0.63,
"grad_norm": 0.042210813611745834,
"learning_rate": 2.891100905299598e-05,
"loss": 1.3068,
"step": 591
},
{
"epoch": 0.63,
"grad_norm": 0.03918137401342392,
"learning_rate": 2.8904036412499297e-05,
"loss": 1.2342,
"step": 592
},
{
"epoch": 0.63,
"grad_norm": 0.04338767006993294,
"learning_rate": 2.8897042367103588e-05,
"loss": 1.3416,
"step": 593
},
{
"epoch": 0.63,
"grad_norm": 0.042658790946006775,
"learning_rate": 2.8890026927576e-05,
"loss": 1.3262,
"step": 594
},
{
"epoch": 0.63,
"grad_norm": 0.040876757353544235,
"learning_rate": 2.8882990104716624e-05,
"loss": 1.3446,
"step": 595
},
{
"epoch": 0.64,
"grad_norm": 0.04136212170124054,
"learning_rate": 2.8875931909358462e-05,
"loss": 1.3195,
"step": 596
},
{
"epoch": 0.64,
"grad_norm": 0.04199006408452988,
"learning_rate": 2.886885235236742e-05,
"loss": 1.3002,
"step": 597
},
{
"epoch": 0.64,
"grad_norm": 0.04258933663368225,
"learning_rate": 2.886175144464229e-05,
"loss": 1.2437,
"step": 598
},
{
"epoch": 0.64,
"grad_norm": 0.051209740340709686,
"learning_rate": 2.885462919711473e-05,
"loss": 1.3638,
"step": 599
},
{
"epoch": 0.64,
"grad_norm": 0.05017193779349327,
"learning_rate": 2.884748562074926e-05,
"loss": 1.3595,
"step": 600
},
{
"epoch": 0.64,
"grad_norm": 0.04402180016040802,
"learning_rate": 2.8840320726543226e-05,
"loss": 1.2947,
"step": 601
},
{
"epoch": 0.64,
"grad_norm": 0.04097672924399376,
"learning_rate": 2.883313452552679e-05,
"loss": 1.238,
"step": 602
},
{
"epoch": 0.64,
"grad_norm": 0.04349937289953232,
"learning_rate": 2.8825927028762923e-05,
"loss": 1.2293,
"step": 603
},
{
"epoch": 0.64,
"grad_norm": 0.03766897693276405,
"learning_rate": 2.881869824734738e-05,
"loss": 1.3303,
"step": 604
},
{
"epoch": 0.64,
"grad_norm": 0.041030097752809525,
"learning_rate": 2.8811448192408675e-05,
"loss": 1.3331,
"step": 605
},
{
"epoch": 0.65,
"grad_norm": 0.0407574400305748,
"learning_rate": 2.880417687510808e-05,
"loss": 1.1861,
"step": 606
},
{
"epoch": 0.65,
"grad_norm": 0.0400300994515419,
"learning_rate": 2.8796884306639596e-05,
"loss": 1.3008,
"step": 607
},
{
"epoch": 0.65,
"grad_norm": 0.03695922717452049,
"learning_rate": 2.8789570498229937e-05,
"loss": 1.2157,
"step": 608
},
{
"epoch": 0.65,
"grad_norm": 0.039951398968696594,
"learning_rate": 2.878223546113853e-05,
"loss": 1.2931,
"step": 609
},
{
"epoch": 0.65,
"grad_norm": 0.041708268225193024,
"learning_rate": 2.877487920665746e-05,
"loss": 1.3288,
"step": 610
},
{
"epoch": 0.65,
"grad_norm": 0.15182824432849884,
"learning_rate": 2.8767501746111494e-05,
"loss": 1.2683,
"step": 611
},
{
"epoch": 0.65,
"grad_norm": 0.042990490794181824,
"learning_rate": 2.876010309085804e-05,
"loss": 1.2458,
"step": 612
},
{
"epoch": 0.65,
"grad_norm": 0.04103465750813484,
"learning_rate": 2.8752683252287134e-05,
"loss": 1.2582,
"step": 613
},
{
"epoch": 0.65,
"grad_norm": 0.05002060532569885,
"learning_rate": 2.8745242241821413e-05,
"loss": 1.3105,
"step": 614
},
{
"epoch": 0.66,
"grad_norm": 0.043134111911058426,
"learning_rate": 2.873778007091613e-05,
"loss": 1.1705,
"step": 615
},
{
"epoch": 0.66,
"grad_norm": 0.05832618102431297,
"learning_rate": 2.8730296751059087e-05,
"loss": 1.2837,
"step": 616
},
{
"epoch": 0.66,
"grad_norm": 0.0403001494705677,
"learning_rate": 2.872279229377067e-05,
"loss": 1.3912,
"step": 617
},
{
"epoch": 0.66,
"grad_norm": 0.03862776979804039,
"learning_rate": 2.87152667106038e-05,
"loss": 1.206,
"step": 618
},
{
"epoch": 0.66,
"grad_norm": 0.03989758342504501,
"learning_rate": 2.8707720013143896e-05,
"loss": 1.296,
"step": 619
},
{
"epoch": 0.66,
"grad_norm": 0.04268000274896622,
"learning_rate": 2.870015221300891e-05,
"loss": 1.298,
"step": 620
},
{
"epoch": 0.66,
"grad_norm": 0.043824777007102966,
"learning_rate": 2.8692563321849277e-05,
"loss": 1.2559,
"step": 621
},
{
"epoch": 0.66,
"grad_norm": 0.04446446895599365,
"learning_rate": 2.8684953351347883e-05,
"loss": 1.1997,
"step": 622
},
{
"epoch": 0.66,
"grad_norm": 0.04489293321967125,
"learning_rate": 2.8677322313220093e-05,
"loss": 1.2014,
"step": 623
},
{
"epoch": 0.67,
"grad_norm": 0.03992080315947533,
"learning_rate": 2.8669670219213674e-05,
"loss": 1.3665,
"step": 624
},
{
"epoch": 0.67,
"grad_norm": 0.04235706850886345,
"learning_rate": 2.866199708110884e-05,
"loss": 1.2692,
"step": 625
},
{
"epoch": 0.67,
"grad_norm": 0.0407961942255497,
"learning_rate": 2.8654302910718173e-05,
"loss": 1.3089,
"step": 626
},
{
"epoch": 0.67,
"grad_norm": 0.04869687184691429,
"learning_rate": 2.8646587719886653e-05,
"loss": 1.4737,
"step": 627
},
{
"epoch": 0.67,
"grad_norm": 0.045265767723321915,
"learning_rate": 2.863885152049161e-05,
"loss": 1.2694,
"step": 628
},
{
"epoch": 0.67,
"grad_norm": 0.04179368540644646,
"learning_rate": 2.863109432444272e-05,
"loss": 1.2708,
"step": 629
},
{
"epoch": 0.67,
"grad_norm": 0.044434092938899994,
"learning_rate": 2.862331614368199e-05,
"loss": 1.2315,
"step": 630
},
{
"epoch": 0.67,
"grad_norm": 0.04100622236728668,
"learning_rate": 2.8615516990183715e-05,
"loss": 1.2529,
"step": 631
},
{
"epoch": 0.67,
"grad_norm": 0.04165005311369896,
"learning_rate": 2.860769687595449e-05,
"loss": 1.316,
"step": 632
},
{
"epoch": 0.67,
"grad_norm": 0.04114099219441414,
"learning_rate": 2.859985581303318e-05,
"loss": 1.2013,
"step": 633
},
{
"epoch": 0.68,
"grad_norm": 0.038906700909137726,
"learning_rate": 2.859199381349089e-05,
"loss": 1.2939,
"step": 634
},
{
"epoch": 0.68,
"grad_norm": 0.04231356829404831,
"learning_rate": 2.8584110889430968e-05,
"loss": 1.3255,
"step": 635
},
{
"epoch": 0.68,
"grad_norm": 0.040223341435194016,
"learning_rate": 2.857620705298896e-05,
"loss": 1.1812,
"step": 636
},
{
"epoch": 0.68,
"grad_norm": 0.041090674698352814,
"learning_rate": 2.8568282316332623e-05,
"loss": 1.2755,
"step": 637
},
{
"epoch": 0.68,
"grad_norm": 0.0405619814991951,
"learning_rate": 2.8560336691661873e-05,
"loss": 1.2817,
"step": 638
},
{
"epoch": 0.68,
"grad_norm": 0.041612740606069565,
"learning_rate": 2.85523701912088e-05,
"loss": 1.4021,
"step": 639
},
{
"epoch": 0.68,
"grad_norm": 0.047294363379478455,
"learning_rate": 2.8544382827237616e-05,
"loss": 1.3847,
"step": 640
},
{
"epoch": 0.68,
"grad_norm": 0.040402382612228394,
"learning_rate": 2.853637461204466e-05,
"loss": 1.3138,
"step": 641
},
{
"epoch": 0.68,
"grad_norm": 0.045728690922260284,
"learning_rate": 2.8528345557958365e-05,
"loss": 1.3433,
"step": 642
},
{
"epoch": 0.69,
"grad_norm": 0.04862399771809578,
"learning_rate": 2.8520295677339256e-05,
"loss": 1.4913,
"step": 643
},
{
"epoch": 0.69,
"grad_norm": 0.04778144508600235,
"learning_rate": 2.851222498257991e-05,
"loss": 1.1898,
"step": 644
},
{
"epoch": 0.69,
"grad_norm": 0.054490551352500916,
"learning_rate": 2.850413348610495e-05,
"loss": 1.3287,
"step": 645
},
{
"epoch": 0.69,
"grad_norm": 0.04370054230093956,
"learning_rate": 2.8496021200371018e-05,
"loss": 1.3576,
"step": 646
},
{
"epoch": 0.69,
"grad_norm": 0.04136567935347557,
"learning_rate": 2.848788813786677e-05,
"loss": 1.3145,
"step": 647
},
{
"epoch": 0.69,
"grad_norm": 0.042698897421360016,
"learning_rate": 2.847973431111284e-05,
"loss": 1.2438,
"step": 648
},
{
"epoch": 0.69,
"grad_norm": 0.03822973743081093,
"learning_rate": 2.847155973266183e-05,
"loss": 1.2262,
"step": 649
},
{
"epoch": 0.69,
"grad_norm": 0.040024396032094955,
"learning_rate": 2.8463364415098295e-05,
"loss": 1.3446,
"step": 650
},
{
"epoch": 0.69,
"grad_norm": 0.04169702157378197,
"learning_rate": 2.84551483710387e-05,
"loss": 1.3185,
"step": 651
},
{
"epoch": 0.69,
"grad_norm": 0.062234602868556976,
"learning_rate": 2.8446911613131437e-05,
"loss": 1.366,
"step": 652
},
{
"epoch": 0.7,
"grad_norm": 0.044699717313051224,
"learning_rate": 2.843865415405678e-05,
"loss": 1.1966,
"step": 653
},
{
"epoch": 0.7,
"grad_norm": 0.049964308738708496,
"learning_rate": 2.8430376006526862e-05,
"loss": 1.3174,
"step": 654
},
{
"epoch": 0.7,
"grad_norm": 0.039814963936805725,
"learning_rate": 2.8422077183285686e-05,
"loss": 1.3091,
"step": 655
},
{
"epoch": 0.7,
"grad_norm": 0.03822264447808266,
"learning_rate": 2.841375769710906e-05,
"loss": 1.2779,
"step": 656
},
{
"epoch": 0.7,
"grad_norm": 0.04611162468791008,
"learning_rate": 2.8405417560804618e-05,
"loss": 1.2708,
"step": 657
},
{
"epoch": 0.7,
"grad_norm": 0.04191526770591736,
"learning_rate": 2.8397056787211787e-05,
"loss": 1.3307,
"step": 658
},
{
"epoch": 0.7,
"grad_norm": 0.04210788011550903,
"learning_rate": 2.838867538920175e-05,
"loss": 1.4363,
"step": 659
},
{
"epoch": 0.7,
"grad_norm": 0.04391501471400261,
"learning_rate": 2.8380273379677463e-05,
"loss": 1.2504,
"step": 660
},
{
"epoch": 0.7,
"grad_norm": 0.04508218541741371,
"learning_rate": 2.837185077157358e-05,
"loss": 1.2052,
"step": 661
},
{
"epoch": 0.71,
"grad_norm": 0.03969848155975342,
"learning_rate": 2.8363407577856498e-05,
"loss": 1.2945,
"step": 662
},
{
"epoch": 0.71,
"grad_norm": 0.04233105480670929,
"learning_rate": 2.835494381152429e-05,
"loss": 1.3685,
"step": 663
},
{
"epoch": 0.71,
"grad_norm": 0.0404694527387619,
"learning_rate": 2.83464594856067e-05,
"loss": 1.1128,
"step": 664
},
{
"epoch": 0.71,
"grad_norm": 0.04120095074176788,
"learning_rate": 2.8337954613165124e-05,
"loss": 1.4247,
"step": 665
},
{
"epoch": 0.71,
"grad_norm": 0.0430976077914238,
"learning_rate": 2.832942920729259e-05,
"loss": 1.2779,
"step": 666
},
{
"epoch": 0.71,
"grad_norm": 0.046013325452804565,
"learning_rate": 2.8320883281113744e-05,
"loss": 1.3943,
"step": 667
},
{
"epoch": 0.71,
"grad_norm": 0.0432153195142746,
"learning_rate": 2.8312316847784805e-05,
"loss": 1.3161,
"step": 668
},
{
"epoch": 0.71,
"grad_norm": 0.04044407978653908,
"learning_rate": 2.8303729920493578e-05,
"loss": 1.1947,
"step": 669
},
{
"epoch": 0.71,
"grad_norm": 0.04066836088895798,
"learning_rate": 2.8295122512459412e-05,
"loss": 1.307,
"step": 670
},
{
"epoch": 0.72,
"grad_norm": 0.039611902087926865,
"learning_rate": 2.8286494636933182e-05,
"loss": 1.2142,
"step": 671
},
{
"epoch": 0.72,
"grad_norm": 0.03812957555055618,
"learning_rate": 2.827784630719728e-05,
"loss": 1.2966,
"step": 672
},
{
"epoch": 0.72,
"grad_norm": 0.03867647051811218,
"learning_rate": 2.8269177536565578e-05,
"loss": 1.3149,
"step": 673
},
{
"epoch": 0.72,
"grad_norm": 0.038965556770563126,
"learning_rate": 2.8260488338383424e-05,
"loss": 1.2024,
"step": 674
},
{
"epoch": 0.72,
"grad_norm": 0.040611233562231064,
"learning_rate": 2.825177872602761e-05,
"loss": 1.3236,
"step": 675
},
{
"epoch": 0.72,
"grad_norm": 0.04156762361526489,
"learning_rate": 2.8243048712906356e-05,
"loss": 1.2006,
"step": 676
},
{
"epoch": 0.72,
"grad_norm": 0.04537190496921539,
"learning_rate": 2.8234298312459287e-05,
"loss": 1.2474,
"step": 677
},
{
"epoch": 0.72,
"grad_norm": 0.04531079903244972,
"learning_rate": 2.8225527538157413e-05,
"loss": 1.2898,
"step": 678
},
{
"epoch": 0.72,
"grad_norm": 0.0478559210896492,
"learning_rate": 2.8216736403503117e-05,
"loss": 1.2698,
"step": 679
},
{
"epoch": 0.72,
"grad_norm": 0.03838729113340378,
"learning_rate": 2.8207924922030116e-05,
"loss": 1.244,
"step": 680
},
{
"epoch": 0.73,
"grad_norm": 0.04377003014087677,
"learning_rate": 2.819909310730345e-05,
"loss": 1.2137,
"step": 681
},
{
"epoch": 0.73,
"grad_norm": 0.05953631177544594,
"learning_rate": 2.8190240972919474e-05,
"loss": 1.306,
"step": 682
},
{
"epoch": 0.73,
"grad_norm": 0.05924424156546593,
"learning_rate": 2.8181368532505812e-05,
"loss": 1.2539,
"step": 683
},
{
"epoch": 0.73,
"grad_norm": 0.04990265145897865,
"learning_rate": 2.8172475799721353e-05,
"loss": 1.23,
"step": 684
},
{
"epoch": 0.73,
"grad_norm": 0.05186406522989273,
"learning_rate": 2.816356278825623e-05,
"loss": 1.2383,
"step": 685
},
{
"epoch": 0.73,
"grad_norm": 0.04090893268585205,
"learning_rate": 2.8154629511831784e-05,
"loss": 1.2833,
"step": 686
},
{
"epoch": 0.73,
"grad_norm": 0.04050731286406517,
"learning_rate": 2.814567598420056e-05,
"loss": 1.206,
"step": 687
},
{
"epoch": 0.73,
"grad_norm": 0.044532258063554764,
"learning_rate": 2.8136702219146285e-05,
"loss": 1.4248,
"step": 688
},
{
"epoch": 0.73,
"grad_norm": 0.040547750890254974,
"learning_rate": 2.8127708230483825e-05,
"loss": 1.219,
"step": 689
},
{
"epoch": 0.74,
"grad_norm": 0.07601740211248398,
"learning_rate": 2.81186940320592e-05,
"loss": 1.2666,
"step": 690
},
{
"epoch": 0.74,
"grad_norm": 0.04106530919671059,
"learning_rate": 2.8109659637749525e-05,
"loss": 1.2695,
"step": 691
},
{
"epoch": 0.74,
"grad_norm": 0.04745073616504669,
"learning_rate": 2.8100605061463015e-05,
"loss": 1.4064,
"step": 692
},
{
"epoch": 0.74,
"grad_norm": 0.0383027046918869,
"learning_rate": 2.8091530317138953e-05,
"loss": 1.2877,
"step": 693
},
{
"epoch": 0.74,
"grad_norm": 0.04280005767941475,
"learning_rate": 2.808243541874767e-05,
"loss": 1.2139,
"step": 694
},
{
"epoch": 0.74,
"grad_norm": 0.0441480427980423,
"learning_rate": 2.807332038029052e-05,
"loss": 1.3107,
"step": 695
},
{
"epoch": 0.74,
"grad_norm": 0.04255714640021324,
"learning_rate": 2.806418521579987e-05,
"loss": 1.1937,
"step": 696
},
{
"epoch": 0.74,
"grad_norm": 0.040505584329366684,
"learning_rate": 2.8055029939339055e-05,
"loss": 1.2599,
"step": 697
},
{
"epoch": 0.74,
"grad_norm": 0.043979816138744354,
"learning_rate": 2.80458545650024e-05,
"loss": 1.2452,
"step": 698
},
{
"epoch": 0.75,
"grad_norm": 0.039587393403053284,
"learning_rate": 2.8036659106915145e-05,
"loss": 1.285,
"step": 699
},
{
"epoch": 0.75,
"grad_norm": 0.03848938271403313,
"learning_rate": 2.802744357923345e-05,
"loss": 1.3721,
"step": 700
},
{
"epoch": 0.75,
"grad_norm": 0.35720014572143555,
"learning_rate": 2.8018207996144388e-05,
"loss": 1.3332,
"step": 701
},
{
"epoch": 0.75,
"grad_norm": 0.03978364169597626,
"learning_rate": 2.8008952371865886e-05,
"loss": 1.1509,
"step": 702
},
{
"epoch": 0.75,
"grad_norm": 0.03576143831014633,
"learning_rate": 2.7999676720646744e-05,
"loss": 1.2826,
"step": 703
},
{
"epoch": 0.75,
"grad_norm": 0.040340177714824677,
"learning_rate": 2.7990381056766583e-05,
"loss": 1.3036,
"step": 704
},
{
"epoch": 0.75,
"grad_norm": 0.04206692427396774,
"learning_rate": 2.7981065394535824e-05,
"loss": 1.2688,
"step": 705
},
{
"epoch": 0.75,
"grad_norm": 0.044807758182287216,
"learning_rate": 2.7971729748295697e-05,
"loss": 1.0828,
"step": 706
},
{
"epoch": 0.75,
"grad_norm": 0.037925321608781815,
"learning_rate": 2.7962374132418176e-05,
"loss": 1.2166,
"step": 707
},
{
"epoch": 0.75,
"grad_norm": 0.0471733994781971,
"learning_rate": 2.795299856130599e-05,
"loss": 1.183,
"step": 708
},
{
"epoch": 0.76,
"grad_norm": 0.03962987661361694,
"learning_rate": 2.7943603049392578e-05,
"loss": 1.2063,
"step": 709
},
{
"epoch": 0.76,
"grad_norm": 0.04328012466430664,
"learning_rate": 2.7934187611142093e-05,
"loss": 1.3816,
"step": 710
},
{
"epoch": 0.76,
"grad_norm": 0.04300890862941742,
"learning_rate": 2.792475226104935e-05,
"loss": 1.3163,
"step": 711
},
{
"epoch": 0.76,
"grad_norm": 0.04583253711462021,
"learning_rate": 2.7915297013639828e-05,
"loss": 1.4251,
"step": 712
},
{
"epoch": 0.76,
"grad_norm": 0.04772812873125076,
"learning_rate": 2.790582188346962e-05,
"loss": 1.2981,
"step": 713
},
{
"epoch": 0.76,
"grad_norm": 0.04397529363632202,
"learning_rate": 2.789632688512545e-05,
"loss": 1.2911,
"step": 714
},
{
"epoch": 0.76,
"grad_norm": 0.04336230084300041,
"learning_rate": 2.7886812033224618e-05,
"loss": 1.3388,
"step": 715
},
{
"epoch": 0.76,
"grad_norm": 0.04407043755054474,
"learning_rate": 2.787727734241499e-05,
"loss": 1.1775,
"step": 716
},
{
"epoch": 0.76,
"grad_norm": 0.03971351310610771,
"learning_rate": 2.7867722827374964e-05,
"loss": 1.3216,
"step": 717
},
{
"epoch": 0.77,
"grad_norm": 0.043983425945043564,
"learning_rate": 2.7858148502813477e-05,
"loss": 1.4081,
"step": 718
},
{
"epoch": 0.77,
"grad_norm": 0.035951223224401474,
"learning_rate": 2.784855438346994e-05,
"loss": 1.3331,
"step": 719
},
{
"epoch": 0.77,
"grad_norm": 0.04295084998011589,
"learning_rate": 2.783894048411425e-05,
"loss": 1.3059,
"step": 720
},
{
"epoch": 0.77,
"grad_norm": 0.047317974269390106,
"learning_rate": 2.7829306819546756e-05,
"loss": 1.2763,
"step": 721
},
{
"epoch": 0.77,
"grad_norm": 0.03932027146220207,
"learning_rate": 2.781965340459823e-05,
"loss": 1.2098,
"step": 722
},
{
"epoch": 0.77,
"grad_norm": 0.03825077414512634,
"learning_rate": 2.780998025412985e-05,
"loss": 1.2286,
"step": 723
},
{
"epoch": 0.77,
"grad_norm": 0.04540243372321129,
"learning_rate": 2.780028738303318e-05,
"loss": 1.36,
"step": 724
},
{
"epoch": 0.77,
"grad_norm": 0.04947693273425102,
"learning_rate": 2.7790574806230137e-05,
"loss": 1.3462,
"step": 725
},
{
"epoch": 0.77,
"grad_norm": 0.040425993502140045,
"learning_rate": 2.7780842538672983e-05,
"loss": 1.2426,
"step": 726
},
{
"epoch": 0.77,
"grad_norm": 0.04652491956949234,
"learning_rate": 2.777109059534428e-05,
"loss": 1.2174,
"step": 727
},
{
"epoch": 0.78,
"grad_norm": 0.04260154813528061,
"learning_rate": 2.77613189912569e-05,
"loss": 1.3217,
"step": 728
},
{
"epoch": 0.78,
"grad_norm": 0.04189547896385193,
"learning_rate": 2.775152774145396e-05,
"loss": 1.3517,
"step": 729
},
{
"epoch": 0.78,
"grad_norm": 0.05027705430984497,
"learning_rate": 2.7741716861008838e-05,
"loss": 1.2322,
"step": 730
},
{
"epoch": 0.78,
"grad_norm": 0.04095868766307831,
"learning_rate": 2.7731886365025128e-05,
"loss": 1.2756,
"step": 731
},
{
"epoch": 0.78,
"grad_norm": 0.042236629873514175,
"learning_rate": 2.7722036268636613e-05,
"loss": 1.3117,
"step": 732
},
{
"epoch": 0.78,
"grad_norm": 0.03990009054541588,
"learning_rate": 2.771216658700727e-05,
"loss": 1.22,
"step": 733
},
{
"epoch": 0.78,
"grad_norm": 0.04218020290136337,
"learning_rate": 2.77022773353312e-05,
"loss": 1.1763,
"step": 734
},
{
"epoch": 0.78,
"grad_norm": 0.03973948955535889,
"learning_rate": 2.769236852883266e-05,
"loss": 1.2557,
"step": 735
},
{
"epoch": 0.78,
"grad_norm": 0.04438718408346176,
"learning_rate": 2.7682440182765987e-05,
"loss": 1.2702,
"step": 736
},
{
"epoch": 0.79,
"grad_norm": 0.04357394203543663,
"learning_rate": 2.767249231241562e-05,
"loss": 1.4582,
"step": 737
},
{
"epoch": 0.79,
"grad_norm": 0.03988838940858841,
"learning_rate": 2.766252493309603e-05,
"loss": 1.2441,
"step": 738
},
{
"epoch": 0.79,
"grad_norm": 0.040849536657333374,
"learning_rate": 2.7652538060151747e-05,
"loss": 1.1811,
"step": 739
},
{
"epoch": 0.79,
"grad_norm": 0.04130591079592705,
"learning_rate": 2.7642531708957297e-05,
"loss": 1.392,
"step": 740
},
{
"epoch": 0.79,
"grad_norm": 0.03944481164216995,
"learning_rate": 2.7632505894917194e-05,
"loss": 1.278,
"step": 741
},
{
"epoch": 0.79,
"grad_norm": 0.04051590338349342,
"learning_rate": 2.7622460633465915e-05,
"loss": 1.2217,
"step": 742
},
{
"epoch": 0.79,
"grad_norm": 0.048737745732069016,
"learning_rate": 2.7612395940067875e-05,
"loss": 1.364,
"step": 743
},
{
"epoch": 0.79,
"grad_norm": 0.03939497843384743,
"learning_rate": 2.760231183021741e-05,
"loss": 1.2369,
"step": 744
},
{
"epoch": 0.79,
"grad_norm": 0.044156353920698166,
"learning_rate": 2.759220831943874e-05,
"loss": 1.2601,
"step": 745
},
{
"epoch": 0.8,
"grad_norm": 0.04156497120857239,
"learning_rate": 2.7582085423285952e-05,
"loss": 1.3323,
"step": 746
},
{
"epoch": 0.8,
"grad_norm": 0.049551285803318024,
"learning_rate": 2.757194315734298e-05,
"loss": 1.3135,
"step": 747
},
{
"epoch": 0.8,
"grad_norm": 0.0678929015994072,
"learning_rate": 2.756178153722358e-05,
"loss": 1.331,
"step": 748
},
{
"epoch": 0.8,
"grad_norm": 0.04068392515182495,
"learning_rate": 2.7551600578571298e-05,
"loss": 1.3646,
"step": 749
},
{
"epoch": 0.8,
"grad_norm": 0.044063862413167953,
"learning_rate": 2.7541400297059452e-05,
"loss": 1.2513,
"step": 750
},
{
"epoch": 0.8,
"grad_norm": 0.040134869515895844,
"learning_rate": 2.75311807083911e-05,
"loss": 1.2778,
"step": 751
},
{
"epoch": 0.8,
"grad_norm": 0.04928048327565193,
"learning_rate": 2.7520941828299043e-05,
"loss": 1.4237,
"step": 752
},
{
"epoch": 0.8,
"grad_norm": 0.1714377999305725,
"learning_rate": 2.751068367254576e-05,
"loss": 1.1875,
"step": 753
},
{
"epoch": 0.8,
"grad_norm": 0.040971677750349045,
"learning_rate": 2.7500406256923418e-05,
"loss": 1.2199,
"step": 754
},
{
"epoch": 0.8,
"grad_norm": 0.042567458003759384,
"learning_rate": 2.749010959725382e-05,
"loss": 1.3554,
"step": 755
},
{
"epoch": 0.81,
"grad_norm": 0.04325272887945175,
"learning_rate": 2.747979370938841e-05,
"loss": 1.19,
"step": 756
},
{
"epoch": 0.81,
"grad_norm": 0.04391823336482048,
"learning_rate": 2.746945860920823e-05,
"loss": 1.3019,
"step": 757
},
{
"epoch": 0.81,
"grad_norm": 0.041555944830179214,
"learning_rate": 2.7459104312623886e-05,
"loss": 1.2084,
"step": 758
},
{
"epoch": 0.81,
"grad_norm": 0.042668092995882034,
"learning_rate": 2.7448730835575552e-05,
"loss": 1.222,
"step": 759
},
{
"epoch": 0.81,
"grad_norm": 0.0391099713742733,
"learning_rate": 2.7438338194032922e-05,
"loss": 1.2251,
"step": 760
},
{
"epoch": 0.81,
"grad_norm": 0.0442223884165287,
"learning_rate": 2.7427926403995193e-05,
"loss": 1.1863,
"step": 761
},
{
"epoch": 0.81,
"grad_norm": 0.04271350055932999,
"learning_rate": 2.7417495481491047e-05,
"loss": 1.362,
"step": 762
},
{
"epoch": 0.81,
"grad_norm": 0.04445281997323036,
"learning_rate": 2.7407045442578608e-05,
"loss": 1.3202,
"step": 763
},
{
"epoch": 0.81,
"grad_norm": 0.038975391536951065,
"learning_rate": 2.7396576303345445e-05,
"loss": 1.2612,
"step": 764
},
{
"epoch": 0.82,
"grad_norm": 0.04713955149054527,
"learning_rate": 2.7386088079908515e-05,
"loss": 1.2862,
"step": 765
},
{
"epoch": 0.82,
"grad_norm": 0.04461180418729782,
"learning_rate": 2.7375580788414172e-05,
"loss": 1.3267,
"step": 766
},
{
"epoch": 0.82,
"grad_norm": 0.039272554218769073,
"learning_rate": 2.7365054445038104e-05,
"loss": 1.1617,
"step": 767
},
{
"epoch": 0.82,
"grad_norm": 0.043195609003305435,
"learning_rate": 2.7354509065985352e-05,
"loss": 1.3224,
"step": 768
},
{
"epoch": 0.82,
"grad_norm": 0.04246910288929939,
"learning_rate": 2.734394466749024e-05,
"loss": 1.3336,
"step": 769
},
{
"epoch": 0.82,
"grad_norm": 0.061883531510829926,
"learning_rate": 2.733336126581639e-05,
"loss": 1.3105,
"step": 770
},
{
"epoch": 0.82,
"grad_norm": 0.040977708995342255,
"learning_rate": 2.7322758877256666e-05,
"loss": 1.2908,
"step": 771
},
{
"epoch": 0.82,
"grad_norm": 0.04068838432431221,
"learning_rate": 2.7312137518133164e-05,
"loss": 1.3416,
"step": 772
},
{
"epoch": 0.82,
"grad_norm": 0.05024155229330063,
"learning_rate": 2.73014972047972e-05,
"loss": 1.2781,
"step": 773
},
{
"epoch": 0.82,
"grad_norm": 0.038461074233055115,
"learning_rate": 2.7290837953629243e-05,
"loss": 1.1326,
"step": 774
},
{
"epoch": 0.83,
"grad_norm": 0.04922636225819588,
"learning_rate": 2.728015978103894e-05,
"loss": 1.2539,
"step": 775
},
{
"epoch": 0.83,
"grad_norm": 0.04049984738230705,
"learning_rate": 2.726946270346505e-05,
"loss": 1.1698,
"step": 776
},
{
"epoch": 0.83,
"grad_norm": 0.036820389330387115,
"learning_rate": 2.725874673737545e-05,
"loss": 1.2235,
"step": 777
},
{
"epoch": 0.83,
"grad_norm": 0.043402861803770065,
"learning_rate": 2.724801189926708e-05,
"loss": 1.3451,
"step": 778
},
{
"epoch": 0.83,
"grad_norm": 0.04600697383284569,
"learning_rate": 2.7237258205665955e-05,
"loss": 1.265,
"step": 779
},
{
"epoch": 0.83,
"grad_norm": 0.042860984802246094,
"learning_rate": 2.7226485673127088e-05,
"loss": 1.4376,
"step": 780
},
{
"epoch": 0.83,
"grad_norm": 0.08648369461297989,
"learning_rate": 2.7215694318234525e-05,
"loss": 1.2199,
"step": 781
},
{
"epoch": 0.83,
"grad_norm": 0.04271225258708,
"learning_rate": 2.720488415760126e-05,
"loss": 1.2322,
"step": 782
},
{
"epoch": 0.83,
"grad_norm": 0.038085825741291046,
"learning_rate": 2.719405520786926e-05,
"loss": 1.1582,
"step": 783
},
{
"epoch": 0.84,
"grad_norm": 0.04303895682096481,
"learning_rate": 2.7183207485709404e-05,
"loss": 1.4118,
"step": 784
},
{
"epoch": 0.84,
"grad_norm": 0.040101755410432816,
"learning_rate": 2.7172341007821485e-05,
"loss": 1.2151,
"step": 785
},
{
"epoch": 0.84,
"grad_norm": 0.041162651032209396,
"learning_rate": 2.716145579093415e-05,
"loss": 1.2997,
"step": 786
},
{
"epoch": 0.84,
"grad_norm": 0.05283331498503685,
"learning_rate": 2.7150551851804904e-05,
"loss": 1.3426,
"step": 787
},
{
"epoch": 0.84,
"grad_norm": 0.040588632225990295,
"learning_rate": 2.713962920722008e-05,
"loss": 1.1467,
"step": 788
},
{
"epoch": 0.84,
"grad_norm": 0.041328929364681244,
"learning_rate": 2.7128687873994807e-05,
"loss": 1.2999,
"step": 789
},
{
"epoch": 0.84,
"grad_norm": 0.04245878756046295,
"learning_rate": 2.7117727868972968e-05,
"loss": 1.2076,
"step": 790
},
{
"epoch": 0.84,
"grad_norm": 0.04110388457775116,
"learning_rate": 2.7106749209027216e-05,
"loss": 1.3715,
"step": 791
},
{
"epoch": 0.84,
"grad_norm": 0.04127703979611397,
"learning_rate": 2.70957519110589e-05,
"loss": 1.1791,
"step": 792
},
{
"epoch": 0.85,
"grad_norm": 0.03841520473361015,
"learning_rate": 2.7084735991998077e-05,
"loss": 1.2702,
"step": 793
},
{
"epoch": 0.85,
"grad_norm": 0.04112560302019119,
"learning_rate": 2.707370146880346e-05,
"loss": 1.2306,
"step": 794
},
{
"epoch": 0.85,
"grad_norm": 0.03800290822982788,
"learning_rate": 2.7062648358462417e-05,
"loss": 1.3471,
"step": 795
},
{
"epoch": 0.85,
"grad_norm": 0.05236433446407318,
"learning_rate": 2.705157667799091e-05,
"loss": 1.2621,
"step": 796
},
{
"epoch": 0.85,
"grad_norm": 0.044191982597112656,
"learning_rate": 2.7040486444433506e-05,
"loss": 1.1561,
"step": 797
},
{
"epoch": 0.85,
"grad_norm": 0.04605194926261902,
"learning_rate": 2.7029377674863332e-05,
"loss": 1.3653,
"step": 798
},
{
"epoch": 0.85,
"grad_norm": 0.04338742420077324,
"learning_rate": 2.7018250386382036e-05,
"loss": 1.104,
"step": 799
},
{
"epoch": 0.85,
"grad_norm": 0.04334155097603798,
"learning_rate": 2.70071045961198e-05,
"loss": 1.3779,
"step": 800
},
{
"epoch": 0.85,
"grad_norm": 0.03772331029176712,
"learning_rate": 2.699594032123527e-05,
"loss": 1.3273,
"step": 801
},
{
"epoch": 0.85,
"grad_norm": 0.05215068534016609,
"learning_rate": 2.6984757578915546e-05,
"loss": 1.3153,
"step": 802
},
{
"epoch": 0.86,
"grad_norm": 0.04011908918619156,
"learning_rate": 2.6973556386376178e-05,
"loss": 1.337,
"step": 803
},
{
"epoch": 0.86,
"grad_norm": 0.04127516970038414,
"learning_rate": 2.6962336760861107e-05,
"loss": 1.2966,
"step": 804
},
{
"epoch": 0.86,
"grad_norm": 0.04114411026239395,
"learning_rate": 2.6951098719642643e-05,
"loss": 1.3454,
"step": 805
},
{
"epoch": 0.86,
"grad_norm": 0.04113148897886276,
"learning_rate": 2.693984228002146e-05,
"loss": 1.3322,
"step": 806
},
{
"epoch": 0.86,
"grad_norm": 0.04927441105246544,
"learning_rate": 2.6928567459326558e-05,
"loss": 1.3766,
"step": 807
},
{
"epoch": 0.86,
"grad_norm": 0.038145359605550766,
"learning_rate": 2.6917274274915215e-05,
"loss": 1.1307,
"step": 808
},
{
"epoch": 0.86,
"grad_norm": 0.04447488114237785,
"learning_rate": 2.6905962744173002e-05,
"loss": 1.1212,
"step": 809
},
{
"epoch": 0.86,
"grad_norm": 0.08171934634447098,
"learning_rate": 2.689463288451372e-05,
"loss": 1.3832,
"step": 810
},
{
"epoch": 0.86,
"grad_norm": 0.0424637608230114,
"learning_rate": 2.6883284713379388e-05,
"loss": 1.3069,
"step": 811
},
{
"epoch": 0.87,
"grad_norm": 0.03858843818306923,
"learning_rate": 2.687191824824022e-05,
"loss": 1.2404,
"step": 812
},
{
"epoch": 0.87,
"grad_norm": 0.038555946201086044,
"learning_rate": 2.686053350659459e-05,
"loss": 1.2822,
"step": 813
},
{
"epoch": 0.87,
"grad_norm": 0.046382464468479156,
"learning_rate": 2.6849130505969014e-05,
"loss": 1.3299,
"step": 814
},
{
"epoch": 0.87,
"grad_norm": 0.04165438562631607,
"learning_rate": 2.6837709263918102e-05,
"loss": 1.3089,
"step": 815
},
{
"epoch": 0.87,
"grad_norm": 0.05100713670253754,
"learning_rate": 2.6826269798024566e-05,
"loss": 1.2938,
"step": 816
},
{
"epoch": 0.87,
"grad_norm": 0.04596945270895958,
"learning_rate": 2.6814812125899154e-05,
"loss": 1.2895,
"step": 817
},
{
"epoch": 0.87,
"grad_norm": 0.03950990363955498,
"learning_rate": 2.680333626518066e-05,
"loss": 1.3278,
"step": 818
},
{
"epoch": 0.87,
"grad_norm": 0.04206983745098114,
"learning_rate": 2.679184223353587e-05,
"loss": 1.2307,
"step": 819
},
{
"epoch": 0.87,
"grad_norm": 0.042084578424692154,
"learning_rate": 2.678033004865954e-05,
"loss": 1.2185,
"step": 820
},
{
"epoch": 0.88,
"grad_norm": 0.038785211741924286,
"learning_rate": 2.6768799728274372e-05,
"loss": 1.1745,
"step": 821
},
{
"epoch": 0.88,
"grad_norm": 0.039074115455150604,
"learning_rate": 2.6757251290131002e-05,
"loss": 1.2186,
"step": 822
},
{
"epoch": 0.88,
"grad_norm": 0.05322974920272827,
"learning_rate": 2.6745684752007943e-05,
"loss": 1.313,
"step": 823
},
{
"epoch": 0.88,
"grad_norm": 0.1969892978668213,
"learning_rate": 2.673410013171157e-05,
"loss": 1.2945,
"step": 824
},
{
"epoch": 0.88,
"grad_norm": 0.03685871139168739,
"learning_rate": 2.6722497447076114e-05,
"loss": 1.263,
"step": 825
},
{
"epoch": 0.88,
"grad_norm": 0.042687300592660904,
"learning_rate": 2.671087671596359e-05,
"loss": 1.387,
"step": 826
},
{
"epoch": 0.88,
"grad_norm": 0.043946314603090286,
"learning_rate": 2.6699237956263817e-05,
"loss": 1.292,
"step": 827
},
{
"epoch": 0.88,
"grad_norm": 0.04511501267552376,
"learning_rate": 2.6687581185894363e-05,
"loss": 1.2041,
"step": 828
},
{
"epoch": 0.88,
"grad_norm": 0.04132469370961189,
"learning_rate": 2.6675906422800514e-05,
"loss": 1.2325,
"step": 829
},
{
"epoch": 0.88,
"grad_norm": 0.06625653058290482,
"learning_rate": 2.6664213684955267e-05,
"loss": 1.3227,
"step": 830
},
{
"epoch": 0.89,
"grad_norm": 0.17180198431015015,
"learning_rate": 2.6652502990359272e-05,
"loss": 1.2708,
"step": 831
},
{
"epoch": 0.89,
"grad_norm": 0.04059137776494026,
"learning_rate": 2.6640774357040846e-05,
"loss": 1.2888,
"step": 832
},
{
"epoch": 0.89,
"grad_norm": 0.045602891594171524,
"learning_rate": 2.6629027803055917e-05,
"loss": 1.1677,
"step": 833
},
{
"epoch": 0.89,
"grad_norm": 0.039703212678432465,
"learning_rate": 2.6617263346487987e-05,
"loss": 1.3742,
"step": 834
},
{
"epoch": 0.89,
"grad_norm": 0.04089050367474556,
"learning_rate": 2.660548100544813e-05,
"loss": 1.3612,
"step": 835
},
{
"epoch": 0.89,
"grad_norm": 0.04322800040245056,
"learning_rate": 2.6593680798074952e-05,
"loss": 1.2789,
"step": 836
},
{
"epoch": 0.89,
"grad_norm": 0.04172592982649803,
"learning_rate": 2.6581862742534563e-05,
"loss": 1.3865,
"step": 837
},
{
"epoch": 0.89,
"grad_norm": 0.04252813756465912,
"learning_rate": 2.657002685702055e-05,
"loss": 1.3192,
"step": 838
},
{
"epoch": 0.89,
"grad_norm": 0.03941137343645096,
"learning_rate": 2.6558173159753946e-05,
"loss": 1.3576,
"step": 839
},
{
"epoch": 0.9,
"grad_norm": 0.04537597671151161,
"learning_rate": 2.6546301668983206e-05,
"loss": 1.3712,
"step": 840
},
{
"epoch": 0.9,
"grad_norm": 0.041427433490753174,
"learning_rate": 2.653441240298418e-05,
"loss": 1.1632,
"step": 841
},
{
"epoch": 0.9,
"grad_norm": 0.05885668843984604,
"learning_rate": 2.6522505380060078e-05,
"loss": 1.2416,
"step": 842
},
{
"epoch": 0.9,
"grad_norm": 0.043777357786893845,
"learning_rate": 2.6510580618541458e-05,
"loss": 1.3483,
"step": 843
},
{
"epoch": 0.9,
"grad_norm": 0.039015110582113266,
"learning_rate": 2.6498638136786166e-05,
"loss": 1.2819,
"step": 844
},
{
"epoch": 0.9,
"grad_norm": 0.040871817618608475,
"learning_rate": 2.6486677953179344e-05,
"loss": 1.2825,
"step": 845
},
{
"epoch": 0.9,
"grad_norm": 0.04270527511835098,
"learning_rate": 2.6474700086133384e-05,
"loss": 1.1776,
"step": 846
},
{
"epoch": 0.9,
"grad_norm": 0.046877775341272354,
"learning_rate": 2.6462704554087894e-05,
"loss": 1.2799,
"step": 847
},
{
"epoch": 0.9,
"grad_norm": 0.037062957882881165,
"learning_rate": 2.645069137550968e-05,
"loss": 1.2094,
"step": 848
},
{
"epoch": 0.9,
"grad_norm": 0.04165401682257652,
"learning_rate": 2.643866056889272e-05,
"loss": 1.3647,
"step": 849
},
{
"epoch": 0.91,
"grad_norm": 0.04253152012825012,
"learning_rate": 2.6426612152758118e-05,
"loss": 1.3101,
"step": 850
},
{
"epoch": 0.91,
"grad_norm": 0.039812587201595306,
"learning_rate": 2.6414546145654097e-05,
"loss": 1.2479,
"step": 851
},
{
"epoch": 0.91,
"grad_norm": 0.040445294231176376,
"learning_rate": 2.640246256615596e-05,
"loss": 1.23,
"step": 852
},
{
"epoch": 0.91,
"grad_norm": 0.04047653079032898,
"learning_rate": 2.6390361432866058e-05,
"loss": 1.2417,
"step": 853
},
{
"epoch": 0.91,
"grad_norm": 0.04961085692048073,
"learning_rate": 2.6378242764413773e-05,
"loss": 1.2508,
"step": 854
},
{
"epoch": 0.91,
"grad_norm": 0.0414389967918396,
"learning_rate": 2.6366106579455468e-05,
"loss": 1.2624,
"step": 855
},
{
"epoch": 0.91,
"grad_norm": 0.03844548389315605,
"learning_rate": 2.635395289667449e-05,
"loss": 1.2981,
"step": 856
},
{
"epoch": 0.91,
"grad_norm": 0.04521048441529274,
"learning_rate": 2.6341781734781106e-05,
"loss": 1.2127,
"step": 857
},
{
"epoch": 0.91,
"grad_norm": 0.04535103589296341,
"learning_rate": 2.6329593112512508e-05,
"loss": 1.3261,
"step": 858
},
{
"epoch": 0.92,
"grad_norm": 0.04055513069033623,
"learning_rate": 2.6317387048632757e-05,
"loss": 1.2221,
"step": 859
},
{
"epoch": 0.92,
"grad_norm": 0.04123299941420555,
"learning_rate": 2.6305163561932773e-05,
"loss": 1.4202,
"step": 860
},
{
"epoch": 0.92,
"grad_norm": 0.053416475653648376,
"learning_rate": 2.629292267123028e-05,
"loss": 1.253,
"step": 861
},
{
"epoch": 0.92,
"grad_norm": 0.041178278625011444,
"learning_rate": 2.628066439536982e-05,
"loss": 1.0999,
"step": 862
},
{
"epoch": 0.92,
"grad_norm": 0.04176180437207222,
"learning_rate": 2.6268388753222677e-05,
"loss": 1.1518,
"step": 863
},
{
"epoch": 0.92,
"grad_norm": 0.04024680331349373,
"learning_rate": 2.6256095763686895e-05,
"loss": 1.2264,
"step": 864
},
{
"epoch": 0.92,
"grad_norm": 0.04161696508526802,
"learning_rate": 2.6243785445687192e-05,
"loss": 1.3583,
"step": 865
},
{
"epoch": 0.92,
"grad_norm": 0.046228162944316864,
"learning_rate": 2.6231457818174986e-05,
"loss": 1.2576,
"step": 866
},
{
"epoch": 0.92,
"grad_norm": 0.03873226419091225,
"learning_rate": 2.6219112900128337e-05,
"loss": 1.1708,
"step": 867
},
{
"epoch": 0.93,
"grad_norm": 0.04049040004611015,
"learning_rate": 2.6206750710551922e-05,
"loss": 1.2502,
"step": 868
},
{
"epoch": 0.93,
"grad_norm": 0.04042857140302658,
"learning_rate": 2.6194371268477008e-05,
"loss": 1.1942,
"step": 869
},
{
"epoch": 0.93,
"grad_norm": 0.040148042142391205,
"learning_rate": 2.6181974592961417e-05,
"loss": 1.1096,
"step": 870
},
{
"epoch": 0.93,
"grad_norm": 0.041213199496269226,
"learning_rate": 2.616956070308951e-05,
"loss": 1.3469,
"step": 871
},
{
"epoch": 0.93,
"grad_norm": 0.0426071472465992,
"learning_rate": 2.615712961797214e-05,
"loss": 1.2508,
"step": 872
},
{
"epoch": 0.93,
"grad_norm": 0.04406768083572388,
"learning_rate": 2.6144681356746647e-05,
"loss": 1.3422,
"step": 873
},
{
"epoch": 0.93,
"grad_norm": 0.039021048694849014,
"learning_rate": 2.6132215938576787e-05,
"loss": 1.248,
"step": 874
},
{
"epoch": 0.93,
"grad_norm": 0.04162931442260742,
"learning_rate": 2.6119733382652755e-05,
"loss": 1.3637,
"step": 875
},
{
"epoch": 0.93,
"grad_norm": 0.03739694878458977,
"learning_rate": 2.6107233708191108e-05,
"loss": 1.2575,
"step": 876
},
{
"epoch": 0.93,
"grad_norm": 0.04426296800374985,
"learning_rate": 2.6094716934434784e-05,
"loss": 1.2268,
"step": 877
},
{
"epoch": 0.94,
"grad_norm": 0.041838765144348145,
"learning_rate": 2.608218308065301e-05,
"loss": 1.2094,
"step": 878
},
{
"epoch": 0.94,
"grad_norm": 0.04208039864897728,
"learning_rate": 2.606963216614133e-05,
"loss": 1.3097,
"step": 879
},
{
"epoch": 0.94,
"grad_norm": 0.0421212799847126,
"learning_rate": 2.6057064210221556e-05,
"loss": 1.3256,
"step": 880
},
{
"epoch": 0.94,
"grad_norm": 0.04151439294219017,
"learning_rate": 2.6044479232241713e-05,
"loss": 1.2748,
"step": 881
},
{
"epoch": 0.94,
"grad_norm": 0.048513032495975494,
"learning_rate": 2.6031877251576054e-05,
"loss": 1.2447,
"step": 882
},
{
"epoch": 0.94,
"grad_norm": 0.04105342924594879,
"learning_rate": 2.6019258287624988e-05,
"loss": 1.1613,
"step": 883
},
{
"epoch": 0.94,
"grad_norm": 0.044454991817474365,
"learning_rate": 2.600662235981509e-05,
"loss": 1.1713,
"step": 884
},
{
"epoch": 0.94,
"grad_norm": 0.07957032322883606,
"learning_rate": 2.599396948759903e-05,
"loss": 1.4649,
"step": 885
},
{
"epoch": 0.94,
"grad_norm": 0.07526232302188873,
"learning_rate": 2.598129969045558e-05,
"loss": 1.1874,
"step": 886
},
{
"epoch": 0.95,
"grad_norm": 0.04056404158473015,
"learning_rate": 2.5968612987889553e-05,
"loss": 1.1964,
"step": 887
},
{
"epoch": 0.95,
"grad_norm": 0.043638139963150024,
"learning_rate": 2.5955909399431798e-05,
"loss": 1.2819,
"step": 888
},
{
"epoch": 0.95,
"grad_norm": 0.041582487523555756,
"learning_rate": 2.594318894463916e-05,
"loss": 1.2305,
"step": 889
},
{
"epoch": 0.95,
"grad_norm": 0.041533682495355606,
"learning_rate": 2.5930451643094435e-05,
"loss": 1.2939,
"step": 890
},
{
"epoch": 0.95,
"grad_norm": 0.03732079640030861,
"learning_rate": 2.5917697514406374e-05,
"loss": 1.2324,
"step": 891
},
{
"epoch": 0.95,
"grad_norm": 0.04289107024669647,
"learning_rate": 2.5904926578209617e-05,
"loss": 1.3801,
"step": 892
},
{
"epoch": 0.95,
"grad_norm": 0.04154540225863457,
"learning_rate": 2.589213885416469e-05,
"loss": 1.2219,
"step": 893
},
{
"epoch": 0.95,
"grad_norm": 0.04077089950442314,
"learning_rate": 2.5879334361957955e-05,
"loss": 1.1392,
"step": 894
},
{
"epoch": 0.95,
"grad_norm": 0.04148285463452339,
"learning_rate": 2.5866513121301592e-05,
"loss": 1.3283,
"step": 895
},
{
"epoch": 0.95,
"grad_norm": 0.04197124391794205,
"learning_rate": 2.5853675151933565e-05,
"loss": 1.2432,
"step": 896
},
{
"epoch": 0.96,
"grad_norm": 0.04120796546339989,
"learning_rate": 2.584082047361759e-05,
"loss": 1.3758,
"step": 897
},
{
"epoch": 0.96,
"grad_norm": 0.05007264018058777,
"learning_rate": 2.5827949106143113e-05,
"loss": 1.3027,
"step": 898
},
{
"epoch": 0.96,
"grad_norm": 0.03969128802418709,
"learning_rate": 2.5815061069325252e-05,
"loss": 1.366,
"step": 899
},
{
"epoch": 0.96,
"grad_norm": 0.05790838971734047,
"learning_rate": 2.5802156383004817e-05,
"loss": 1.2677,
"step": 900
},
{
"epoch": 0.96,
"grad_norm": 0.057674333453178406,
"learning_rate": 2.5789235067048224e-05,
"loss": 1.3836,
"step": 901
},
{
"epoch": 0.96,
"grad_norm": 0.03906143456697464,
"learning_rate": 2.57762971413475e-05,
"loss": 1.2964,
"step": 902
},
{
"epoch": 0.96,
"grad_norm": 0.05400891602039337,
"learning_rate": 2.576334262582025e-05,
"loss": 1.261,
"step": 903
},
{
"epoch": 0.96,
"grad_norm": 0.04630432277917862,
"learning_rate": 2.57503715404096e-05,
"loss": 1.3345,
"step": 904
},
{
"epoch": 0.96,
"grad_norm": 0.039165519177913666,
"learning_rate": 2.5737383905084207e-05,
"loss": 1.2549,
"step": 905
},
{
"epoch": 0.97,
"grad_norm": 0.16873140633106232,
"learning_rate": 2.572437973983818e-05,
"loss": 1.365,
"step": 906
},
{
"epoch": 0.97,
"grad_norm": 0.03847840055823326,
"learning_rate": 2.5711359064691105e-05,
"loss": 1.2175,
"step": 907
},
{
"epoch": 0.97,
"grad_norm": 0.0415102019906044,
"learning_rate": 2.569832189968796e-05,
"loss": 1.2817,
"step": 908
},
{
"epoch": 0.97,
"grad_norm": 0.04224313050508499,
"learning_rate": 2.5685268264899117e-05,
"loss": 1.4298,
"step": 909
},
{
"epoch": 0.97,
"grad_norm": 0.042650867253541946,
"learning_rate": 2.567219818042031e-05,
"loss": 1.2426,
"step": 910
},
{
"epoch": 0.97,
"grad_norm": 0.04199182987213135,
"learning_rate": 2.5659111666372593e-05,
"loss": 1.1769,
"step": 911
},
{
"epoch": 0.97,
"grad_norm": 0.041494037955999374,
"learning_rate": 2.5646008742902305e-05,
"loss": 1.3261,
"step": 912
},
{
"epoch": 0.97,
"grad_norm": 0.03892706334590912,
"learning_rate": 2.5632889430181054e-05,
"loss": 1.1609,
"step": 913
},
{
"epoch": 0.97,
"grad_norm": 0.046930767595767975,
"learning_rate": 2.561975374840568e-05,
"loss": 1.2276,
"step": 914
},
{
"epoch": 0.98,
"grad_norm": 0.038510628044605255,
"learning_rate": 2.5606601717798212e-05,
"loss": 1.2124,
"step": 915
},
{
"epoch": 0.98,
"grad_norm": 0.04090854153037071,
"learning_rate": 2.5593433358605867e-05,
"loss": 1.2333,
"step": 916
},
{
"epoch": 0.98,
"grad_norm": 0.042670849710702896,
"learning_rate": 2.558024869110098e-05,
"loss": 1.2648,
"step": 917
},
{
"epoch": 0.98,
"grad_norm": 0.042589254677295685,
"learning_rate": 2.556704773558101e-05,
"loss": 1.2574,
"step": 918
},
{
"epoch": 0.98,
"grad_norm": 0.04135293513536453,
"learning_rate": 2.555383051236847e-05,
"loss": 1.2974,
"step": 919
},
{
"epoch": 0.98,
"grad_norm": 0.03972950950264931,
"learning_rate": 2.554059704181093e-05,
"loss": 1.3323,
"step": 920
},
{
"epoch": 0.98,
"grad_norm": 0.03985543176531792,
"learning_rate": 2.5527347344280977e-05,
"loss": 1.2946,
"step": 921
},
{
"epoch": 0.98,
"grad_norm": 0.04313001036643982,
"learning_rate": 2.5514081440176173e-05,
"loss": 1.3231,
"step": 922
},
{
"epoch": 0.98,
"grad_norm": 0.04007202759385109,
"learning_rate": 2.5500799349919023e-05,
"loss": 1.2983,
"step": 923
},
{
"epoch": 0.98,
"grad_norm": 0.03902252018451691,
"learning_rate": 2.5487501093956956e-05,
"loss": 1.2326,
"step": 924
},
{
"epoch": 0.99,
"grad_norm": 0.04481017589569092,
"learning_rate": 2.5474186692762294e-05,
"loss": 1.2874,
"step": 925
},
{
"epoch": 0.99,
"grad_norm": 0.04088298976421356,
"learning_rate": 2.5460856166832204e-05,
"loss": 1.2841,
"step": 926
},
{
"epoch": 0.99,
"grad_norm": 0.03811021149158478,
"learning_rate": 2.544750953668868e-05,
"loss": 1.214,
"step": 927
},
{
"epoch": 0.99,
"grad_norm": 0.03895227238535881,
"learning_rate": 2.543414682287851e-05,
"loss": 1.3366,
"step": 928
},
{
"epoch": 0.99,
"grad_norm": 0.04189634323120117,
"learning_rate": 2.542076804597324e-05,
"loss": 1.436,
"step": 929
},
{
"epoch": 0.99,
"grad_norm": 0.043176136910915375,
"learning_rate": 2.540737322656915e-05,
"loss": 1.2532,
"step": 930
},
{
"epoch": 0.99,
"grad_norm": 0.03896043822169304,
"learning_rate": 2.539396238528721e-05,
"loss": 1.2106,
"step": 931
},
{
"epoch": 0.99,
"grad_norm": 0.03971550986170769,
"learning_rate": 2.5380535542773052e-05,
"loss": 1.2685,
"step": 932
},
{
"epoch": 0.99,
"grad_norm": 0.04015112668275833,
"learning_rate": 2.5367092719696957e-05,
"loss": 1.2589,
"step": 933
},
{
"epoch": 1.0,
"grad_norm": 0.052997201681137085,
"learning_rate": 2.53536339367538e-05,
"loss": 1.25,
"step": 934
},
{
"epoch": 1.0,
"grad_norm": 0.03955504670739174,
"learning_rate": 2.5340159214663007e-05,
"loss": 1.3744,
"step": 935
},
{
"epoch": 1.0,
"grad_norm": 0.0384516641497612,
"learning_rate": 2.532666857416858e-05,
"loss": 1.1435,
"step": 936
},
{
"epoch": 1.0,
"grad_norm": 0.0435960479080677,
"learning_rate": 2.531316203603899e-05,
"loss": 1.2677,
"step": 937
},
{
"epoch": 1.0,
"grad_norm": 0.0423857718706131,
"learning_rate": 2.529963962106721e-05,
"loss": 1.2528,
"step": 938
},
{
"epoch": 1.0,
"grad_norm": 0.05119800567626953,
"learning_rate": 2.5286101350070638e-05,
"loss": 1.2663,
"step": 939
},
{
"epoch": 1.0,
"grad_norm": 0.04668194055557251,
"learning_rate": 2.5272547243891076e-05,
"loss": 1.2601,
"step": 940
},
{
"epoch": 1.0,
"grad_norm": 0.04023146629333496,
"learning_rate": 2.525897732339473e-05,
"loss": 1.3211,
"step": 941
},
{
"epoch": 1.0,
"grad_norm": 0.04063122346997261,
"learning_rate": 2.524539160947213e-05,
"loss": 1.2688,
"step": 942
},
{
"epoch": 1.01,
"grad_norm": 0.04180854186415672,
"learning_rate": 2.523179012303813e-05,
"loss": 1.3655,
"step": 943
},
{
"epoch": 1.01,
"grad_norm": 0.04253614321351051,
"learning_rate": 2.5218172885031854e-05,
"loss": 1.1455,
"step": 944
},
{
"epoch": 1.01,
"grad_norm": 0.03980337083339691,
"learning_rate": 2.520453991641669e-05,
"loss": 1.1863,
"step": 945
},
{
"epoch": 1.01,
"grad_norm": 0.03819122910499573,
"learning_rate": 2.519089123818023e-05,
"loss": 1.2007,
"step": 946
},
{
"epoch": 1.01,
"grad_norm": 0.03928643837571144,
"learning_rate": 2.517722687133426e-05,
"loss": 1.084,
"step": 947
},
{
"epoch": 1.01,
"grad_norm": 0.04488477110862732,
"learning_rate": 2.5163546836914705e-05,
"loss": 1.2667,
"step": 948
},
{
"epoch": 1.01,
"grad_norm": 0.0637240782380104,
"learning_rate": 2.5149851155981626e-05,
"loss": 1.2081,
"step": 949
},
{
"epoch": 1.01,
"grad_norm": 0.03988213464617729,
"learning_rate": 2.5136139849619164e-05,
"loss": 1.3405,
"step": 950
}
],
"logging_steps": 1.0,
"max_steps": 2814,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 50,
"total_flos": 1.9321548867807216e+19,
"train_batch_size": 3,
"trial_name": null,
"trial_params": null
}