ViCA-base-5p / trainer_state.json
nkkbr's picture
Initial commit
3c6d455
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.050008343995104856,
"eval_steps": 500,
"global_step": 899,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 31.671595567958246,
"learning_rate": 1.851851851851852e-08,
"loss": 0.9836,
"step": 1
},
{
"epoch": 0.0,
"grad_norm": 21.590568341208314,
"learning_rate": 3.703703703703704e-08,
"loss": 0.6951,
"step": 2
},
{
"epoch": 0.0,
"grad_norm": 33.43736925901718,
"learning_rate": 5.555555555555556e-08,
"loss": 1.0552,
"step": 3
},
{
"epoch": 0.0,
"grad_norm": 25.76674648270905,
"learning_rate": 7.407407407407409e-08,
"loss": 0.9979,
"step": 4
},
{
"epoch": 0.0,
"grad_norm": 29.142254478570287,
"learning_rate": 9.259259259259259e-08,
"loss": 1.1524,
"step": 5
},
{
"epoch": 0.0,
"grad_norm": 128.44118605473278,
"learning_rate": 1.1111111111111112e-07,
"loss": 0.9627,
"step": 6
},
{
"epoch": 0.0,
"grad_norm": 22.844065944506312,
"learning_rate": 1.2962962962962964e-07,
"loss": 0.7732,
"step": 7
},
{
"epoch": 0.0,
"grad_norm": 26.662421778739866,
"learning_rate": 1.4814814814814817e-07,
"loss": 0.9373,
"step": 8
},
{
"epoch": 0.0,
"grad_norm": 24.8894090181831,
"learning_rate": 1.6666666666666668e-07,
"loss": 1.1089,
"step": 9
},
{
"epoch": 0.0,
"grad_norm": 21.578880921091358,
"learning_rate": 1.8518518518518518e-07,
"loss": 0.6824,
"step": 10
},
{
"epoch": 0.0,
"grad_norm": 28.74029164802737,
"learning_rate": 2.0370370370370374e-07,
"loss": 0.8942,
"step": 11
},
{
"epoch": 0.0,
"grad_norm": 22.14964568326191,
"learning_rate": 2.2222222222222224e-07,
"loss": 0.8041,
"step": 12
},
{
"epoch": 0.0,
"grad_norm": 32.9657912531208,
"learning_rate": 2.4074074074074075e-07,
"loss": 1.2601,
"step": 13
},
{
"epoch": 0.0,
"grad_norm": 17.145724154465036,
"learning_rate": 2.592592592592593e-07,
"loss": 0.7905,
"step": 14
},
{
"epoch": 0.0,
"grad_norm": 24.26835111004837,
"learning_rate": 2.7777777777777776e-07,
"loss": 0.9586,
"step": 15
},
{
"epoch": 0.0,
"grad_norm": 22.011607835789768,
"learning_rate": 2.9629629629629634e-07,
"loss": 0.8789,
"step": 16
},
{
"epoch": 0.0,
"grad_norm": 30.838211196101994,
"learning_rate": 3.148148148148148e-07,
"loss": 0.9217,
"step": 17
},
{
"epoch": 0.0,
"grad_norm": 15.955309084125279,
"learning_rate": 3.3333333333333335e-07,
"loss": 0.911,
"step": 18
},
{
"epoch": 0.0,
"grad_norm": 24.344206309908216,
"learning_rate": 3.518518518518519e-07,
"loss": 0.9155,
"step": 19
},
{
"epoch": 0.0,
"grad_norm": 32.92989138430358,
"learning_rate": 3.7037037037037036e-07,
"loss": 0.8828,
"step": 20
},
{
"epoch": 0.0,
"grad_norm": 22.41057192779469,
"learning_rate": 3.8888888888888895e-07,
"loss": 0.8469,
"step": 21
},
{
"epoch": 0.0,
"grad_norm": 17.46446762413705,
"learning_rate": 4.074074074074075e-07,
"loss": 0.6855,
"step": 22
},
{
"epoch": 0.0,
"grad_norm": 15.433059856938662,
"learning_rate": 4.2592592592592596e-07,
"loss": 0.7404,
"step": 23
},
{
"epoch": 0.0,
"grad_norm": 10.728638383519646,
"learning_rate": 4.444444444444445e-07,
"loss": 0.6503,
"step": 24
},
{
"epoch": 0.0,
"grad_norm": 12.213215386669342,
"learning_rate": 4.6296296296296297e-07,
"loss": 0.8609,
"step": 25
},
{
"epoch": 0.0,
"grad_norm": 18.317938299817154,
"learning_rate": 4.814814814814815e-07,
"loss": 0.8072,
"step": 26
},
{
"epoch": 0.0,
"grad_norm": 19.438962335560756,
"learning_rate": 5.000000000000001e-07,
"loss": 0.5746,
"step": 27
},
{
"epoch": 0.0,
"grad_norm": 23.19453080923279,
"learning_rate": 5.185185185185186e-07,
"loss": 0.8071,
"step": 28
},
{
"epoch": 0.0,
"grad_norm": 17.74196002097125,
"learning_rate": 5.37037037037037e-07,
"loss": 0.6765,
"step": 29
},
{
"epoch": 0.0,
"grad_norm": 13.827465942539503,
"learning_rate": 5.555555555555555e-07,
"loss": 0.7021,
"step": 30
},
{
"epoch": 0.0,
"grad_norm": 15.488021836868372,
"learning_rate": 5.740740740740741e-07,
"loss": 0.4757,
"step": 31
},
{
"epoch": 0.0,
"grad_norm": 13.20162265514205,
"learning_rate": 5.925925925925927e-07,
"loss": 0.5341,
"step": 32
},
{
"epoch": 0.0,
"grad_norm": 9.110026681402434,
"learning_rate": 6.111111111111112e-07,
"loss": 0.4415,
"step": 33
},
{
"epoch": 0.0,
"grad_norm": 9.830477386012433,
"learning_rate": 6.296296296296296e-07,
"loss": 0.6151,
"step": 34
},
{
"epoch": 0.0,
"grad_norm": 11.622213091116294,
"learning_rate": 6.481481481481481e-07,
"loss": 0.4269,
"step": 35
},
{
"epoch": 0.0,
"grad_norm": 8.207427780107848,
"learning_rate": 6.666666666666667e-07,
"loss": 0.3635,
"step": 36
},
{
"epoch": 0.0,
"grad_norm": 7.836791093758771,
"learning_rate": 6.851851851851853e-07,
"loss": 0.6096,
"step": 37
},
{
"epoch": 0.0,
"grad_norm": 7.193647061430912,
"learning_rate": 7.037037037037038e-07,
"loss": 0.4741,
"step": 38
},
{
"epoch": 0.0,
"grad_norm": 9.476468858625738,
"learning_rate": 7.222222222222222e-07,
"loss": 0.3655,
"step": 39
},
{
"epoch": 0.0,
"grad_norm": 7.157861986137023,
"learning_rate": 7.407407407407407e-07,
"loss": 0.4635,
"step": 40
},
{
"epoch": 0.0,
"grad_norm": 7.246782509697589,
"learning_rate": 7.592592592592593e-07,
"loss": 0.54,
"step": 41
},
{
"epoch": 0.0,
"grad_norm": 4.533392537772903,
"learning_rate": 7.777777777777779e-07,
"loss": 0.1987,
"step": 42
},
{
"epoch": 0.0,
"grad_norm": 6.082186481942415,
"learning_rate": 7.962962962962964e-07,
"loss": 0.3954,
"step": 43
},
{
"epoch": 0.0,
"grad_norm": 4.686018035593464,
"learning_rate": 8.14814814814815e-07,
"loss": 0.2148,
"step": 44
},
{
"epoch": 0.0,
"grad_norm": 5.285244734447013,
"learning_rate": 8.333333333333333e-07,
"loss": 0.312,
"step": 45
},
{
"epoch": 0.0,
"grad_norm": 5.426938300900884,
"learning_rate": 8.518518518518519e-07,
"loss": 0.2549,
"step": 46
},
{
"epoch": 0.0,
"grad_norm": 6.275553478285126,
"learning_rate": 8.703703703703705e-07,
"loss": 0.3357,
"step": 47
},
{
"epoch": 0.0,
"grad_norm": 5.135064616590709,
"learning_rate": 8.88888888888889e-07,
"loss": 0.1967,
"step": 48
},
{
"epoch": 0.0,
"grad_norm": 4.486540108880125,
"learning_rate": 9.074074074074076e-07,
"loss": 0.3064,
"step": 49
},
{
"epoch": 0.0,
"grad_norm": 6.190915869898936,
"learning_rate": 9.259259259259259e-07,
"loss": 0.4127,
"step": 50
},
{
"epoch": 0.0,
"grad_norm": 6.080158746554302,
"learning_rate": 9.444444444444445e-07,
"loss": 0.2437,
"step": 51
},
{
"epoch": 0.0,
"grad_norm": 5.0751084508724,
"learning_rate": 9.62962962962963e-07,
"loss": 0.4358,
"step": 52
},
{
"epoch": 0.0,
"grad_norm": 7.434165335252677,
"learning_rate": 9.814814814814816e-07,
"loss": 0.4203,
"step": 53
},
{
"epoch": 0.0,
"grad_norm": 5.253355134896159,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.3269,
"step": 54
},
{
"epoch": 0.0,
"grad_norm": 4.243061853720817,
"learning_rate": 1.0185185185185185e-06,
"loss": 0.1813,
"step": 55
},
{
"epoch": 0.0,
"grad_norm": 4.856767967025552,
"learning_rate": 1.0370370370370371e-06,
"loss": 0.3047,
"step": 56
},
{
"epoch": 0.0,
"grad_norm": 5.127255350560153,
"learning_rate": 1.0555555555555557e-06,
"loss": 0.2576,
"step": 57
},
{
"epoch": 0.0,
"grad_norm": 4.346136439290515,
"learning_rate": 1.074074074074074e-06,
"loss": 0.2684,
"step": 58
},
{
"epoch": 0.0,
"grad_norm": 3.4920134020938165,
"learning_rate": 1.0925925925925927e-06,
"loss": 0.194,
"step": 59
},
{
"epoch": 0.0,
"grad_norm": 4.647235144260062,
"learning_rate": 1.111111111111111e-06,
"loss": 0.224,
"step": 60
},
{
"epoch": 0.0,
"grad_norm": 5.519727656169724,
"learning_rate": 1.1296296296296296e-06,
"loss": 0.2936,
"step": 61
},
{
"epoch": 0.0,
"grad_norm": 8.295031154084294,
"learning_rate": 1.1481481481481482e-06,
"loss": 0.3286,
"step": 62
},
{
"epoch": 0.0,
"grad_norm": 5.722549697873489,
"learning_rate": 1.1666666666666668e-06,
"loss": 0.4155,
"step": 63
},
{
"epoch": 0.0,
"grad_norm": 5.875671754133045,
"learning_rate": 1.1851851851851854e-06,
"loss": 0.378,
"step": 64
},
{
"epoch": 0.0,
"grad_norm": 6.792663865708492,
"learning_rate": 1.2037037037037037e-06,
"loss": 0.2497,
"step": 65
},
{
"epoch": 0.0,
"grad_norm": 7.20501181162727,
"learning_rate": 1.2222222222222223e-06,
"loss": 0.3962,
"step": 66
},
{
"epoch": 0.0,
"grad_norm": 4.794105811510582,
"learning_rate": 1.240740740740741e-06,
"loss": 0.2524,
"step": 67
},
{
"epoch": 0.0,
"grad_norm": 5.064103025788011,
"learning_rate": 1.2592592592592593e-06,
"loss": 0.3846,
"step": 68
},
{
"epoch": 0.0,
"grad_norm": 3.452696967890651,
"learning_rate": 1.2777777777777779e-06,
"loss": 0.2413,
"step": 69
},
{
"epoch": 0.0,
"grad_norm": 2.9764029793033213,
"learning_rate": 1.2962962962962962e-06,
"loss": 0.1858,
"step": 70
},
{
"epoch": 0.0,
"grad_norm": 4.887938203216541,
"learning_rate": 1.3148148148148148e-06,
"loss": 0.2745,
"step": 71
},
{
"epoch": 0.0,
"grad_norm": 5.134226727564933,
"learning_rate": 1.3333333333333334e-06,
"loss": 0.2885,
"step": 72
},
{
"epoch": 0.0,
"grad_norm": 5.590512921407683,
"learning_rate": 1.351851851851852e-06,
"loss": 0.2419,
"step": 73
},
{
"epoch": 0.0,
"grad_norm": 4.674717092878751,
"learning_rate": 1.3703703703703706e-06,
"loss": 0.3462,
"step": 74
},
{
"epoch": 0.0,
"grad_norm": 4.509220346401417,
"learning_rate": 1.3888888888888892e-06,
"loss": 0.1586,
"step": 75
},
{
"epoch": 0.0,
"grad_norm": 3.997696481472079,
"learning_rate": 1.4074074074074075e-06,
"loss": 0.2175,
"step": 76
},
{
"epoch": 0.0,
"grad_norm": 4.966583739405523,
"learning_rate": 1.4259259259259261e-06,
"loss": 0.3962,
"step": 77
},
{
"epoch": 0.0,
"grad_norm": 5.093276066175207,
"learning_rate": 1.4444444444444445e-06,
"loss": 0.3756,
"step": 78
},
{
"epoch": 0.0,
"grad_norm": 6.335763630204078,
"learning_rate": 1.4629629629629629e-06,
"loss": 0.2768,
"step": 79
},
{
"epoch": 0.0,
"grad_norm": 5.52584594274486,
"learning_rate": 1.4814814814814815e-06,
"loss": 0.2227,
"step": 80
},
{
"epoch": 0.0,
"grad_norm": 4.292995296600986,
"learning_rate": 1.5e-06,
"loss": 0.2931,
"step": 81
},
{
"epoch": 0.0,
"grad_norm": 3.3855191650524232,
"learning_rate": 1.5185185185185186e-06,
"loss": 0.2184,
"step": 82
},
{
"epoch": 0.0,
"grad_norm": 6.680707851356181,
"learning_rate": 1.5370370370370372e-06,
"loss": 0.3002,
"step": 83
},
{
"epoch": 0.0,
"grad_norm": 4.157140363964968,
"learning_rate": 1.5555555555555558e-06,
"loss": 0.2268,
"step": 84
},
{
"epoch": 0.0,
"grad_norm": 2.9320222630351624,
"learning_rate": 1.5740740740740742e-06,
"loss": 0.2046,
"step": 85
},
{
"epoch": 0.0,
"grad_norm": 4.54889007696515,
"learning_rate": 1.5925925925925927e-06,
"loss": 0.2168,
"step": 86
},
{
"epoch": 0.0,
"grad_norm": 4.0441417227247705,
"learning_rate": 1.6111111111111113e-06,
"loss": 0.2149,
"step": 87
},
{
"epoch": 0.0,
"grad_norm": 5.7596791320081,
"learning_rate": 1.62962962962963e-06,
"loss": 0.2557,
"step": 88
},
{
"epoch": 0.0,
"grad_norm": 4.4753129689831885,
"learning_rate": 1.648148148148148e-06,
"loss": 0.2086,
"step": 89
},
{
"epoch": 0.01,
"grad_norm": 4.625168152922665,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.3106,
"step": 90
},
{
"epoch": 0.01,
"grad_norm": 4.778566667141804,
"learning_rate": 1.6851851851851852e-06,
"loss": 0.236,
"step": 91
},
{
"epoch": 0.01,
"grad_norm": 3.4927683097649407,
"learning_rate": 1.7037037037037038e-06,
"loss": 0.228,
"step": 92
},
{
"epoch": 0.01,
"grad_norm": 5.238504380858114,
"learning_rate": 1.7222222222222224e-06,
"loss": 0.2445,
"step": 93
},
{
"epoch": 0.01,
"grad_norm": 5.26550356218346,
"learning_rate": 1.740740740740741e-06,
"loss": 0.335,
"step": 94
},
{
"epoch": 0.01,
"grad_norm": 4.084517792100619,
"learning_rate": 1.7592592592592594e-06,
"loss": 0.2466,
"step": 95
},
{
"epoch": 0.01,
"grad_norm": 4.059980476806942,
"learning_rate": 1.777777777777778e-06,
"loss": 0.2352,
"step": 96
},
{
"epoch": 0.01,
"grad_norm": 5.618509680814076,
"learning_rate": 1.7962962962962965e-06,
"loss": 0.2983,
"step": 97
},
{
"epoch": 0.01,
"grad_norm": 3.2752739027377102,
"learning_rate": 1.8148148148148151e-06,
"loss": 0.2138,
"step": 98
},
{
"epoch": 0.01,
"grad_norm": 4.531959879286411,
"learning_rate": 1.8333333333333333e-06,
"loss": 0.2109,
"step": 99
},
{
"epoch": 0.01,
"grad_norm": 3.719039633234832,
"learning_rate": 1.8518518518518519e-06,
"loss": 0.2742,
"step": 100
},
{
"epoch": 0.01,
"grad_norm": 4.791294915525922,
"learning_rate": 1.8703703703703705e-06,
"loss": 0.2604,
"step": 101
},
{
"epoch": 0.01,
"grad_norm": 4.386141106337409,
"learning_rate": 1.888888888888889e-06,
"loss": 0.2207,
"step": 102
},
{
"epoch": 0.01,
"grad_norm": 3.693752709660448,
"learning_rate": 1.9074074074074076e-06,
"loss": 0.1844,
"step": 103
},
{
"epoch": 0.01,
"grad_norm": 3.539126599807116,
"learning_rate": 1.925925925925926e-06,
"loss": 0.2236,
"step": 104
},
{
"epoch": 0.01,
"grad_norm": 5.701328883111506,
"learning_rate": 1.944444444444445e-06,
"loss": 0.2436,
"step": 105
},
{
"epoch": 0.01,
"grad_norm": 3.49144964082344,
"learning_rate": 1.962962962962963e-06,
"loss": 0.2266,
"step": 106
},
{
"epoch": 0.01,
"grad_norm": 3.6072130209912188,
"learning_rate": 1.9814814814814815e-06,
"loss": 0.2317,
"step": 107
},
{
"epoch": 0.01,
"grad_norm": 3.0365130068969792,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.2223,
"step": 108
},
{
"epoch": 0.01,
"grad_norm": 13.354029999428898,
"learning_rate": 2.0185185185185187e-06,
"loss": 0.2359,
"step": 109
},
{
"epoch": 0.01,
"grad_norm": 4.866215102048313,
"learning_rate": 2.037037037037037e-06,
"loss": 0.2178,
"step": 110
},
{
"epoch": 0.01,
"grad_norm": 4.2919768835495775,
"learning_rate": 2.0555555555555555e-06,
"loss": 0.2053,
"step": 111
},
{
"epoch": 0.01,
"grad_norm": 3.0247999172403386,
"learning_rate": 2.0740740740740742e-06,
"loss": 0.2116,
"step": 112
},
{
"epoch": 0.01,
"grad_norm": 4.873767819329223,
"learning_rate": 2.0925925925925926e-06,
"loss": 0.2542,
"step": 113
},
{
"epoch": 0.01,
"grad_norm": 3.290100400539165,
"learning_rate": 2.1111111111111114e-06,
"loss": 0.2353,
"step": 114
},
{
"epoch": 0.01,
"grad_norm": 4.751597713000032,
"learning_rate": 2.1296296296296298e-06,
"loss": 0.2315,
"step": 115
},
{
"epoch": 0.01,
"grad_norm": 4.473212528201221,
"learning_rate": 2.148148148148148e-06,
"loss": 0.2496,
"step": 116
},
{
"epoch": 0.01,
"grad_norm": 5.783412858689247,
"learning_rate": 2.166666666666667e-06,
"loss": 0.2554,
"step": 117
},
{
"epoch": 0.01,
"grad_norm": 4.699102842656633,
"learning_rate": 2.1851851851851853e-06,
"loss": 0.2953,
"step": 118
},
{
"epoch": 0.01,
"grad_norm": 3.1020105552376935,
"learning_rate": 2.203703703703704e-06,
"loss": 0.2232,
"step": 119
},
{
"epoch": 0.01,
"grad_norm": 4.231270542404818,
"learning_rate": 2.222222222222222e-06,
"loss": 0.2855,
"step": 120
},
{
"epoch": 0.01,
"grad_norm": 5.1755404319160485,
"learning_rate": 2.240740740740741e-06,
"loss": 0.2506,
"step": 121
},
{
"epoch": 0.01,
"grad_norm": 2.832933353259535,
"learning_rate": 2.2592592592592592e-06,
"loss": 0.2209,
"step": 122
},
{
"epoch": 0.01,
"grad_norm": 3.3747496600378026,
"learning_rate": 2.277777777777778e-06,
"loss": 0.2641,
"step": 123
},
{
"epoch": 0.01,
"grad_norm": 3.1614197231660426,
"learning_rate": 2.2962962962962964e-06,
"loss": 0.2591,
"step": 124
},
{
"epoch": 0.01,
"grad_norm": 3.15164363985477,
"learning_rate": 2.314814814814815e-06,
"loss": 0.2195,
"step": 125
},
{
"epoch": 0.01,
"grad_norm": 3.1520741854423773,
"learning_rate": 2.3333333333333336e-06,
"loss": 0.2327,
"step": 126
},
{
"epoch": 0.01,
"grad_norm": 3.612537600311694,
"learning_rate": 2.351851851851852e-06,
"loss": 0.2544,
"step": 127
},
{
"epoch": 0.01,
"grad_norm": 2.658574949293894,
"learning_rate": 2.3703703703703707e-06,
"loss": 0.1689,
"step": 128
},
{
"epoch": 0.01,
"grad_norm": 2.521522811253935,
"learning_rate": 2.388888888888889e-06,
"loss": 0.2293,
"step": 129
},
{
"epoch": 0.01,
"grad_norm": 5.564732601796836,
"learning_rate": 2.4074074074074075e-06,
"loss": 0.2667,
"step": 130
},
{
"epoch": 0.01,
"grad_norm": 2.44070512393425,
"learning_rate": 2.425925925925926e-06,
"loss": 0.2093,
"step": 131
},
{
"epoch": 0.01,
"grad_norm": 3.7275314191060787,
"learning_rate": 2.4444444444444447e-06,
"loss": 0.2531,
"step": 132
},
{
"epoch": 0.01,
"grad_norm": 3.682061533051656,
"learning_rate": 2.462962962962963e-06,
"loss": 0.2493,
"step": 133
},
{
"epoch": 0.01,
"grad_norm": 2.951520736342321,
"learning_rate": 2.481481481481482e-06,
"loss": 0.2651,
"step": 134
},
{
"epoch": 0.01,
"grad_norm": 3.1993013028962376,
"learning_rate": 2.5e-06,
"loss": 0.2193,
"step": 135
},
{
"epoch": 0.01,
"grad_norm": 3.4100755941983794,
"learning_rate": 2.5185185185185186e-06,
"loss": 0.2403,
"step": 136
},
{
"epoch": 0.01,
"grad_norm": 3.3821935109396386,
"learning_rate": 2.5370370370370374e-06,
"loss": 0.2758,
"step": 137
},
{
"epoch": 0.01,
"grad_norm": 2.6543322877297006,
"learning_rate": 2.5555555555555557e-06,
"loss": 0.2147,
"step": 138
},
{
"epoch": 0.01,
"grad_norm": 3.416779857404282,
"learning_rate": 2.5740740740740745e-06,
"loss": 0.2329,
"step": 139
},
{
"epoch": 0.01,
"grad_norm": 4.095895132385728,
"learning_rate": 2.5925925925925925e-06,
"loss": 0.2328,
"step": 140
},
{
"epoch": 0.01,
"grad_norm": 2.802919951884446,
"learning_rate": 2.6111111111111113e-06,
"loss": 0.2641,
"step": 141
},
{
"epoch": 0.01,
"grad_norm": 3.3297063178072643,
"learning_rate": 2.6296296296296297e-06,
"loss": 0.237,
"step": 142
},
{
"epoch": 0.01,
"grad_norm": 3.369922845676044,
"learning_rate": 2.6481481481481485e-06,
"loss": 0.2538,
"step": 143
},
{
"epoch": 0.01,
"grad_norm": 3.7670644169639695,
"learning_rate": 2.666666666666667e-06,
"loss": 0.2539,
"step": 144
},
{
"epoch": 0.01,
"grad_norm": 3.4216695924186804,
"learning_rate": 2.6851851851851856e-06,
"loss": 0.2053,
"step": 145
},
{
"epoch": 0.01,
"grad_norm": 3.221839005854741,
"learning_rate": 2.703703703703704e-06,
"loss": 0.2127,
"step": 146
},
{
"epoch": 0.01,
"grad_norm": 3.0235480946266806,
"learning_rate": 2.7222222222222224e-06,
"loss": 0.2106,
"step": 147
},
{
"epoch": 0.01,
"grad_norm": 2.834709595232568,
"learning_rate": 2.740740740740741e-06,
"loss": 0.1946,
"step": 148
},
{
"epoch": 0.01,
"grad_norm": 3.015178509009479,
"learning_rate": 2.759259259259259e-06,
"loss": 0.2388,
"step": 149
},
{
"epoch": 0.01,
"grad_norm": 4.409130426513994,
"learning_rate": 2.7777777777777783e-06,
"loss": 0.2415,
"step": 150
},
{
"epoch": 0.01,
"grad_norm": 3.3739352577710386,
"learning_rate": 2.7962962962962963e-06,
"loss": 0.2457,
"step": 151
},
{
"epoch": 0.01,
"grad_norm": 3.4098241760142383,
"learning_rate": 2.814814814814815e-06,
"loss": 0.251,
"step": 152
},
{
"epoch": 0.01,
"grad_norm": 3.333754001338976,
"learning_rate": 2.8333333333333335e-06,
"loss": 0.1801,
"step": 153
},
{
"epoch": 0.01,
"grad_norm": 3.318077097848955,
"learning_rate": 2.8518518518518522e-06,
"loss": 0.2458,
"step": 154
},
{
"epoch": 0.01,
"grad_norm": 2.9466885051594103,
"learning_rate": 2.8703703703703706e-06,
"loss": 0.2406,
"step": 155
},
{
"epoch": 0.01,
"grad_norm": 2.731062708528279,
"learning_rate": 2.888888888888889e-06,
"loss": 0.2001,
"step": 156
},
{
"epoch": 0.01,
"grad_norm": 2.9272308425569653,
"learning_rate": 2.907407407407408e-06,
"loss": 0.2369,
"step": 157
},
{
"epoch": 0.01,
"grad_norm": 2.481414362366478,
"learning_rate": 2.9259259259259257e-06,
"loss": 0.1691,
"step": 158
},
{
"epoch": 0.01,
"grad_norm": 3.0329718074790053,
"learning_rate": 2.944444444444445e-06,
"loss": 0.2387,
"step": 159
},
{
"epoch": 0.01,
"grad_norm": 3.391297972382365,
"learning_rate": 2.962962962962963e-06,
"loss": 0.2509,
"step": 160
},
{
"epoch": 0.01,
"grad_norm": 3.569378052344309,
"learning_rate": 2.9814814814814817e-06,
"loss": 0.2017,
"step": 161
},
{
"epoch": 0.01,
"grad_norm": 3.7110963646864787,
"learning_rate": 3e-06,
"loss": 0.2179,
"step": 162
},
{
"epoch": 0.01,
"grad_norm": 2.246489859773125,
"learning_rate": 3.018518518518519e-06,
"loss": 0.1956,
"step": 163
},
{
"epoch": 0.01,
"grad_norm": 4.009805700489273,
"learning_rate": 3.0370370370370372e-06,
"loss": 0.2621,
"step": 164
},
{
"epoch": 0.01,
"grad_norm": 6.603600398514934,
"learning_rate": 3.055555555555556e-06,
"loss": 0.2286,
"step": 165
},
{
"epoch": 0.01,
"grad_norm": 3.2140092102588227,
"learning_rate": 3.0740740740740744e-06,
"loss": 0.1985,
"step": 166
},
{
"epoch": 0.01,
"grad_norm": 3.129528788571764,
"learning_rate": 3.0925925925925928e-06,
"loss": 0.2672,
"step": 167
},
{
"epoch": 0.01,
"grad_norm": 4.012024850134178,
"learning_rate": 3.1111111111111116e-06,
"loss": 0.217,
"step": 168
},
{
"epoch": 0.01,
"grad_norm": 7.820458104991282,
"learning_rate": 3.1296296296296295e-06,
"loss": 0.301,
"step": 169
},
{
"epoch": 0.01,
"grad_norm": 3.0860558209491344,
"learning_rate": 3.1481481481481483e-06,
"loss": 0.3091,
"step": 170
},
{
"epoch": 0.01,
"grad_norm": 2.7755722534108647,
"learning_rate": 3.1666666666666667e-06,
"loss": 0.2226,
"step": 171
},
{
"epoch": 0.01,
"grad_norm": 2.5247435245938923,
"learning_rate": 3.1851851851851855e-06,
"loss": 0.1661,
"step": 172
},
{
"epoch": 0.01,
"grad_norm": 3.01004084492849,
"learning_rate": 3.203703703703704e-06,
"loss": 0.21,
"step": 173
},
{
"epoch": 0.01,
"grad_norm": 4.228753627219633,
"learning_rate": 3.2222222222222227e-06,
"loss": 0.2477,
"step": 174
},
{
"epoch": 0.01,
"grad_norm": 3.1752871871583532,
"learning_rate": 3.240740740740741e-06,
"loss": 0.2079,
"step": 175
},
{
"epoch": 0.01,
"grad_norm": 3.152560617770134,
"learning_rate": 3.25925925925926e-06,
"loss": 0.2665,
"step": 176
},
{
"epoch": 0.01,
"grad_norm": 2.961029589788596,
"learning_rate": 3.277777777777778e-06,
"loss": 0.1568,
"step": 177
},
{
"epoch": 0.01,
"grad_norm": 4.709782042755141,
"learning_rate": 3.296296296296296e-06,
"loss": 0.2532,
"step": 178
},
{
"epoch": 0.01,
"grad_norm": 3.8365927717296775,
"learning_rate": 3.314814814814815e-06,
"loss": 0.2034,
"step": 179
},
{
"epoch": 0.01,
"grad_norm": 3.693611221159195,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.2731,
"step": 180
},
{
"epoch": 0.01,
"grad_norm": 3.9986957942154158,
"learning_rate": 3.351851851851852e-06,
"loss": 0.2597,
"step": 181
},
{
"epoch": 0.01,
"grad_norm": 3.2919659156622347,
"learning_rate": 3.3703703703703705e-06,
"loss": 0.1949,
"step": 182
},
{
"epoch": 0.01,
"grad_norm": 2.9341893207437875,
"learning_rate": 3.3888888888888893e-06,
"loss": 0.2386,
"step": 183
},
{
"epoch": 0.01,
"grad_norm": 5.102536761269667,
"learning_rate": 3.4074074074074077e-06,
"loss": 0.2774,
"step": 184
},
{
"epoch": 0.01,
"grad_norm": 3.4536820389601535,
"learning_rate": 3.4259259259259265e-06,
"loss": 0.1826,
"step": 185
},
{
"epoch": 0.01,
"grad_norm": 3.0300989423466786,
"learning_rate": 3.444444444444445e-06,
"loss": 0.2446,
"step": 186
},
{
"epoch": 0.01,
"grad_norm": 3.4403011526481473,
"learning_rate": 3.4629629629629628e-06,
"loss": 0.1901,
"step": 187
},
{
"epoch": 0.01,
"grad_norm": 3.6775091247134197,
"learning_rate": 3.481481481481482e-06,
"loss": 0.2674,
"step": 188
},
{
"epoch": 0.01,
"grad_norm": 4.391028178699074,
"learning_rate": 3.5e-06,
"loss": 0.2829,
"step": 189
},
{
"epoch": 0.01,
"grad_norm": 2.423366192150436,
"learning_rate": 3.5185185185185187e-06,
"loss": 0.1824,
"step": 190
},
{
"epoch": 0.01,
"grad_norm": 3.122177437193447,
"learning_rate": 3.537037037037037e-06,
"loss": 0.2371,
"step": 191
},
{
"epoch": 0.01,
"grad_norm": 3.45841158448972,
"learning_rate": 3.555555555555556e-06,
"loss": 0.2292,
"step": 192
},
{
"epoch": 0.01,
"grad_norm": 3.4469803213869747,
"learning_rate": 3.5740740740740743e-06,
"loss": 0.1955,
"step": 193
},
{
"epoch": 0.01,
"grad_norm": 2.9771563520791173,
"learning_rate": 3.592592592592593e-06,
"loss": 0.2377,
"step": 194
},
{
"epoch": 0.01,
"grad_norm": 3.8796906691318185,
"learning_rate": 3.6111111111111115e-06,
"loss": 0.2551,
"step": 195
},
{
"epoch": 0.01,
"grad_norm": 3.468676128116348,
"learning_rate": 3.6296296296296302e-06,
"loss": 0.2039,
"step": 196
},
{
"epoch": 0.01,
"grad_norm": 2.958880079590501,
"learning_rate": 3.6481481481481486e-06,
"loss": 0.1795,
"step": 197
},
{
"epoch": 0.01,
"grad_norm": 2.6200401360225767,
"learning_rate": 3.6666666666666666e-06,
"loss": 0.232,
"step": 198
},
{
"epoch": 0.01,
"grad_norm": 4.983293081071033,
"learning_rate": 3.6851851851851854e-06,
"loss": 0.1908,
"step": 199
},
{
"epoch": 0.01,
"grad_norm": 3.0050236938894415,
"learning_rate": 3.7037037037037037e-06,
"loss": 0.2286,
"step": 200
},
{
"epoch": 0.01,
"grad_norm": 3.3306140725673634,
"learning_rate": 3.7222222222222225e-06,
"loss": 0.2619,
"step": 201
},
{
"epoch": 0.01,
"grad_norm": 3.4938522953418802,
"learning_rate": 3.740740740740741e-06,
"loss": 0.1959,
"step": 202
},
{
"epoch": 0.01,
"grad_norm": 3.4177239728199855,
"learning_rate": 3.7592592592592597e-06,
"loss": 0.2236,
"step": 203
},
{
"epoch": 0.01,
"grad_norm": 3.7652783689393297,
"learning_rate": 3.777777777777778e-06,
"loss": 0.2085,
"step": 204
},
{
"epoch": 0.01,
"grad_norm": 3.6604435338767782,
"learning_rate": 3.796296296296297e-06,
"loss": 0.261,
"step": 205
},
{
"epoch": 0.01,
"grad_norm": 3.113607597716548,
"learning_rate": 3.814814814814815e-06,
"loss": 0.2424,
"step": 206
},
{
"epoch": 0.01,
"grad_norm": 4.4771669455747585,
"learning_rate": 3.833333333333334e-06,
"loss": 0.2283,
"step": 207
},
{
"epoch": 0.01,
"grad_norm": 3.1589659628880478,
"learning_rate": 3.851851851851852e-06,
"loss": 0.2219,
"step": 208
},
{
"epoch": 0.01,
"grad_norm": 2.7645376675755027,
"learning_rate": 3.87037037037037e-06,
"loss": 0.2135,
"step": 209
},
{
"epoch": 0.01,
"grad_norm": 2.980265299201392,
"learning_rate": 3.88888888888889e-06,
"loss": 0.2686,
"step": 210
},
{
"epoch": 0.01,
"grad_norm": 3.1181423283269614,
"learning_rate": 3.907407407407408e-06,
"loss": 0.223,
"step": 211
},
{
"epoch": 0.01,
"grad_norm": 2.42736974480654,
"learning_rate": 3.925925925925926e-06,
"loss": 0.187,
"step": 212
},
{
"epoch": 0.01,
"grad_norm": 1.8743075840685295,
"learning_rate": 3.944444444444445e-06,
"loss": 0.1818,
"step": 213
},
{
"epoch": 0.01,
"grad_norm": 2.343322079058555,
"learning_rate": 3.962962962962963e-06,
"loss": 0.2371,
"step": 214
},
{
"epoch": 0.01,
"grad_norm": 2.081694878178825,
"learning_rate": 3.9814814814814814e-06,
"loss": 0.2267,
"step": 215
},
{
"epoch": 0.01,
"grad_norm": 4.3576806169853315,
"learning_rate": 4.000000000000001e-06,
"loss": 0.2567,
"step": 216
},
{
"epoch": 0.01,
"grad_norm": 3.4811683148674706,
"learning_rate": 4.018518518518519e-06,
"loss": 0.2124,
"step": 217
},
{
"epoch": 0.01,
"grad_norm": 3.5712780387693677,
"learning_rate": 4.037037037037037e-06,
"loss": 0.175,
"step": 218
},
{
"epoch": 0.01,
"grad_norm": 3.10431155350948,
"learning_rate": 4.055555555555556e-06,
"loss": 0.2544,
"step": 219
},
{
"epoch": 0.01,
"grad_norm": 3.4035525621721305,
"learning_rate": 4.074074074074074e-06,
"loss": 0.2443,
"step": 220
},
{
"epoch": 0.01,
"grad_norm": 2.497804840268957,
"learning_rate": 4.092592592592593e-06,
"loss": 0.2103,
"step": 221
},
{
"epoch": 0.01,
"grad_norm": 3.453502720676144,
"learning_rate": 4.111111111111111e-06,
"loss": 0.1928,
"step": 222
},
{
"epoch": 0.01,
"grad_norm": 2.5319715107359095,
"learning_rate": 4.12962962962963e-06,
"loss": 0.3045,
"step": 223
},
{
"epoch": 0.01,
"grad_norm": 3.652891605515027,
"learning_rate": 4.1481481481481485e-06,
"loss": 0.2066,
"step": 224
},
{
"epoch": 0.01,
"grad_norm": 5.197667005680618,
"learning_rate": 4.166666666666667e-06,
"loss": 0.2867,
"step": 225
},
{
"epoch": 0.01,
"grad_norm": 3.1852861083681456,
"learning_rate": 4.185185185185185e-06,
"loss": 0.2962,
"step": 226
},
{
"epoch": 0.01,
"grad_norm": 3.309751449945396,
"learning_rate": 4.2037037037037045e-06,
"loss": 0.2399,
"step": 227
},
{
"epoch": 0.01,
"grad_norm": 5.662600750900519,
"learning_rate": 4.222222222222223e-06,
"loss": 0.1835,
"step": 228
},
{
"epoch": 0.01,
"grad_norm": 3.1627381367040655,
"learning_rate": 4.240740740740741e-06,
"loss": 0.2256,
"step": 229
},
{
"epoch": 0.01,
"grad_norm": 2.859414772632586,
"learning_rate": 4.2592592592592596e-06,
"loss": 0.2474,
"step": 230
},
{
"epoch": 0.01,
"grad_norm": 3.5930782685464924,
"learning_rate": 4.277777777777778e-06,
"loss": 0.2853,
"step": 231
},
{
"epoch": 0.01,
"grad_norm": 2.3413935831851864,
"learning_rate": 4.296296296296296e-06,
"loss": 0.2001,
"step": 232
},
{
"epoch": 0.01,
"grad_norm": 2.1343012773284142,
"learning_rate": 4.314814814814815e-06,
"loss": 0.1442,
"step": 233
},
{
"epoch": 0.01,
"grad_norm": 4.447325533939575,
"learning_rate": 4.333333333333334e-06,
"loss": 0.279,
"step": 234
},
{
"epoch": 0.01,
"grad_norm": 2.0281971364645366,
"learning_rate": 4.351851851851852e-06,
"loss": 0.1831,
"step": 235
},
{
"epoch": 0.01,
"grad_norm": 2.399177738362128,
"learning_rate": 4.370370370370371e-06,
"loss": 0.2159,
"step": 236
},
{
"epoch": 0.01,
"grad_norm": 3.9459614890346613,
"learning_rate": 4.388888888888889e-06,
"loss": 0.2585,
"step": 237
},
{
"epoch": 0.01,
"grad_norm": 4.2484957894089685,
"learning_rate": 4.407407407407408e-06,
"loss": 0.2717,
"step": 238
},
{
"epoch": 0.01,
"grad_norm": 3.671599268198715,
"learning_rate": 4.425925925925927e-06,
"loss": 0.2394,
"step": 239
},
{
"epoch": 0.01,
"grad_norm": 3.005918803380453,
"learning_rate": 4.444444444444444e-06,
"loss": 0.2524,
"step": 240
},
{
"epoch": 0.01,
"grad_norm": 2.6518617846617962,
"learning_rate": 4.462962962962963e-06,
"loss": 0.2379,
"step": 241
},
{
"epoch": 0.01,
"grad_norm": 2.437999918608316,
"learning_rate": 4.481481481481482e-06,
"loss": 0.2195,
"step": 242
},
{
"epoch": 0.01,
"grad_norm": 2.6583787466273825,
"learning_rate": 4.5e-06,
"loss": 0.1729,
"step": 243
},
{
"epoch": 0.01,
"grad_norm": 2.402951258317827,
"learning_rate": 4.5185185185185185e-06,
"loss": 0.2234,
"step": 244
},
{
"epoch": 0.01,
"grad_norm": 2.7857996129322897,
"learning_rate": 4.537037037037038e-06,
"loss": 0.2547,
"step": 245
},
{
"epoch": 0.01,
"grad_norm": 2.49488576274901,
"learning_rate": 4.555555555555556e-06,
"loss": 0.2407,
"step": 246
},
{
"epoch": 0.01,
"grad_norm": 1.868135271583363,
"learning_rate": 4.5740740740740745e-06,
"loss": 0.2208,
"step": 247
},
{
"epoch": 0.01,
"grad_norm": 2.5562428105913084,
"learning_rate": 4.592592592592593e-06,
"loss": 0.2166,
"step": 248
},
{
"epoch": 0.01,
"grad_norm": 2.58224768027302,
"learning_rate": 4.611111111111112e-06,
"loss": 0.2492,
"step": 249
},
{
"epoch": 0.01,
"grad_norm": 2.3433573075972296,
"learning_rate": 4.62962962962963e-06,
"loss": 0.1936,
"step": 250
},
{
"epoch": 0.01,
"grad_norm": 2.8626887969261507,
"learning_rate": 4.648148148148148e-06,
"loss": 0.2315,
"step": 251
},
{
"epoch": 0.01,
"grad_norm": 2.5816386699164062,
"learning_rate": 4.666666666666667e-06,
"loss": 0.2163,
"step": 252
},
{
"epoch": 0.01,
"grad_norm": 2.3400604453390237,
"learning_rate": 4.6851851851851855e-06,
"loss": 0.2611,
"step": 253
},
{
"epoch": 0.01,
"grad_norm": 1.8458488911384965,
"learning_rate": 4.703703703703704e-06,
"loss": 0.2489,
"step": 254
},
{
"epoch": 0.01,
"grad_norm": 2.522444603513024,
"learning_rate": 4.722222222222222e-06,
"loss": 0.2122,
"step": 255
},
{
"epoch": 0.01,
"grad_norm": 2.3783927325650613,
"learning_rate": 4.7407407407407415e-06,
"loss": 0.1981,
"step": 256
},
{
"epoch": 0.01,
"grad_norm": 3.019533613828299,
"learning_rate": 4.75925925925926e-06,
"loss": 0.2384,
"step": 257
},
{
"epoch": 0.01,
"grad_norm": 2.918759132999941,
"learning_rate": 4.777777777777778e-06,
"loss": 0.1788,
"step": 258
},
{
"epoch": 0.01,
"grad_norm": 2.4610427470635274,
"learning_rate": 4.796296296296297e-06,
"loss": 0.2268,
"step": 259
},
{
"epoch": 0.01,
"grad_norm": 2.7848761312000536,
"learning_rate": 4.814814814814815e-06,
"loss": 0.2325,
"step": 260
},
{
"epoch": 0.01,
"grad_norm": 3.4191520473690815,
"learning_rate": 4.833333333333333e-06,
"loss": 0.2172,
"step": 261
},
{
"epoch": 0.01,
"grad_norm": 3.103758968658333,
"learning_rate": 4.851851851851852e-06,
"loss": 0.2478,
"step": 262
},
{
"epoch": 0.01,
"grad_norm": 2.5482504471380185,
"learning_rate": 4.870370370370371e-06,
"loss": 0.2497,
"step": 263
},
{
"epoch": 0.01,
"grad_norm": 3.4810407020892766,
"learning_rate": 4.888888888888889e-06,
"loss": 0.1416,
"step": 264
},
{
"epoch": 0.01,
"grad_norm": 3.3881716312252794,
"learning_rate": 4.907407407407408e-06,
"loss": 0.2587,
"step": 265
},
{
"epoch": 0.01,
"grad_norm": 2.3292168482752666,
"learning_rate": 4.925925925925926e-06,
"loss": 0.2184,
"step": 266
},
{
"epoch": 0.01,
"grad_norm": 4.639069216123649,
"learning_rate": 4.944444444444445e-06,
"loss": 0.2131,
"step": 267
},
{
"epoch": 0.01,
"grad_norm": 2.775550585330974,
"learning_rate": 4.962962962962964e-06,
"loss": 0.1981,
"step": 268
},
{
"epoch": 0.01,
"grad_norm": 2.6776742740622606,
"learning_rate": 4.981481481481482e-06,
"loss": 0.18,
"step": 269
},
{
"epoch": 0.02,
"grad_norm": 4.811276614758999,
"learning_rate": 5e-06,
"loss": 0.2509,
"step": 270
},
{
"epoch": 0.02,
"grad_norm": 3.316089340902278,
"learning_rate": 5.01851851851852e-06,
"loss": 0.267,
"step": 271
},
{
"epoch": 0.02,
"grad_norm": 2.212839461001947,
"learning_rate": 5.037037037037037e-06,
"loss": 0.2075,
"step": 272
},
{
"epoch": 0.02,
"grad_norm": 1.9776739041544913,
"learning_rate": 5.0555555555555555e-06,
"loss": 0.1698,
"step": 273
},
{
"epoch": 0.02,
"grad_norm": 3.1890958082449243,
"learning_rate": 5.074074074074075e-06,
"loss": 0.2372,
"step": 274
},
{
"epoch": 0.02,
"grad_norm": 2.947525306217703,
"learning_rate": 5.092592592592593e-06,
"loss": 0.2561,
"step": 275
},
{
"epoch": 0.02,
"grad_norm": 4.297391853458588,
"learning_rate": 5.1111111111111115e-06,
"loss": 0.2481,
"step": 276
},
{
"epoch": 0.02,
"grad_norm": 3.322602754472144,
"learning_rate": 5.12962962962963e-06,
"loss": 0.2631,
"step": 277
},
{
"epoch": 0.02,
"grad_norm": 3.050443252029387,
"learning_rate": 5.148148148148149e-06,
"loss": 0.2261,
"step": 278
},
{
"epoch": 0.02,
"grad_norm": 1.8973907756153363,
"learning_rate": 5.1666666666666675e-06,
"loss": 0.2428,
"step": 279
},
{
"epoch": 0.02,
"grad_norm": 4.0422139591962765,
"learning_rate": 5.185185185185185e-06,
"loss": 0.237,
"step": 280
},
{
"epoch": 0.02,
"grad_norm": 3.3841461829891855,
"learning_rate": 5.203703703703704e-06,
"loss": 0.2163,
"step": 281
},
{
"epoch": 0.02,
"grad_norm": 2.5529633837409493,
"learning_rate": 5.2222222222222226e-06,
"loss": 0.1871,
"step": 282
},
{
"epoch": 0.02,
"grad_norm": 3.3624447392598986,
"learning_rate": 5.240740740740741e-06,
"loss": 0.2218,
"step": 283
},
{
"epoch": 0.02,
"grad_norm": 2.2956439501496666,
"learning_rate": 5.259259259259259e-06,
"loss": 0.2417,
"step": 284
},
{
"epoch": 0.02,
"grad_norm": 2.297097785680975,
"learning_rate": 5.2777777777777785e-06,
"loss": 0.2266,
"step": 285
},
{
"epoch": 0.02,
"grad_norm": 1.7470113660102002,
"learning_rate": 5.296296296296297e-06,
"loss": 0.191,
"step": 286
},
{
"epoch": 0.02,
"grad_norm": 2.3084465857781824,
"learning_rate": 5.314814814814815e-06,
"loss": 0.2177,
"step": 287
},
{
"epoch": 0.02,
"grad_norm": 2.7920950850278485,
"learning_rate": 5.333333333333334e-06,
"loss": 0.2735,
"step": 288
},
{
"epoch": 0.02,
"grad_norm": 5.370310217840505,
"learning_rate": 5.351851851851853e-06,
"loss": 0.2444,
"step": 289
},
{
"epoch": 0.02,
"grad_norm": 5.418966695111219,
"learning_rate": 5.370370370370371e-06,
"loss": 0.2502,
"step": 290
},
{
"epoch": 0.02,
"grad_norm": 5.1345778728764495,
"learning_rate": 5.388888888888889e-06,
"loss": 0.2527,
"step": 291
},
{
"epoch": 0.02,
"grad_norm": 4.809604219423627,
"learning_rate": 5.407407407407408e-06,
"loss": 0.309,
"step": 292
},
{
"epoch": 0.02,
"grad_norm": 2.1252464263745963,
"learning_rate": 5.425925925925926e-06,
"loss": 0.1791,
"step": 293
},
{
"epoch": 0.02,
"grad_norm": 2.382409471290975,
"learning_rate": 5.444444444444445e-06,
"loss": 0.1995,
"step": 294
},
{
"epoch": 0.02,
"grad_norm": 2.78654941635765,
"learning_rate": 5.462962962962963e-06,
"loss": 0.2433,
"step": 295
},
{
"epoch": 0.02,
"grad_norm": 4.162243895055201,
"learning_rate": 5.481481481481482e-06,
"loss": 0.2002,
"step": 296
},
{
"epoch": 0.02,
"grad_norm": 3.020240729949975,
"learning_rate": 5.500000000000001e-06,
"loss": 0.2292,
"step": 297
},
{
"epoch": 0.02,
"grad_norm": 5.455860053216916,
"learning_rate": 5.518518518518518e-06,
"loss": 0.2278,
"step": 298
},
{
"epoch": 0.02,
"grad_norm": 2.0276007075998423,
"learning_rate": 5.5370370370370374e-06,
"loss": 0.2324,
"step": 299
},
{
"epoch": 0.02,
"grad_norm": 2.6593618957429763,
"learning_rate": 5.555555555555557e-06,
"loss": 0.2855,
"step": 300
},
{
"epoch": 0.02,
"grad_norm": 3.3040564094529894,
"learning_rate": 5.574074074074075e-06,
"loss": 0.2893,
"step": 301
},
{
"epoch": 0.02,
"grad_norm": 2.0887879744067774,
"learning_rate": 5.5925925925925926e-06,
"loss": 0.2725,
"step": 302
},
{
"epoch": 0.02,
"grad_norm": 2.256352232891461,
"learning_rate": 5.611111111111112e-06,
"loss": 0.2021,
"step": 303
},
{
"epoch": 0.02,
"grad_norm": 2.5862358935281637,
"learning_rate": 5.62962962962963e-06,
"loss": 0.2192,
"step": 304
},
{
"epoch": 0.02,
"grad_norm": 2.253995397778198,
"learning_rate": 5.6481481481481485e-06,
"loss": 0.2372,
"step": 305
},
{
"epoch": 0.02,
"grad_norm": 2.7795628348705796,
"learning_rate": 5.666666666666667e-06,
"loss": 0.2946,
"step": 306
},
{
"epoch": 0.02,
"grad_norm": 1.6514569015090887,
"learning_rate": 5.685185185185186e-06,
"loss": 0.2368,
"step": 307
},
{
"epoch": 0.02,
"grad_norm": 2.7901136196750858,
"learning_rate": 5.7037037037037045e-06,
"loss": 0.2328,
"step": 308
},
{
"epoch": 0.02,
"grad_norm": 2.929473431713666,
"learning_rate": 5.722222222222222e-06,
"loss": 0.2245,
"step": 309
},
{
"epoch": 0.02,
"grad_norm": 2.7336612641838127,
"learning_rate": 5.740740740740741e-06,
"loss": 0.2357,
"step": 310
},
{
"epoch": 0.02,
"grad_norm": 2.833964247208147,
"learning_rate": 5.75925925925926e-06,
"loss": 0.2282,
"step": 311
},
{
"epoch": 0.02,
"grad_norm": 3.404103521479665,
"learning_rate": 5.777777777777778e-06,
"loss": 0.2043,
"step": 312
},
{
"epoch": 0.02,
"grad_norm": 2.4901435985101377,
"learning_rate": 5.796296296296296e-06,
"loss": 0.1827,
"step": 313
},
{
"epoch": 0.02,
"grad_norm": 2.4801730482808333,
"learning_rate": 5.814814814814816e-06,
"loss": 0.2216,
"step": 314
},
{
"epoch": 0.02,
"grad_norm": 3.1306132066915424,
"learning_rate": 5.833333333333334e-06,
"loss": 0.2295,
"step": 315
},
{
"epoch": 0.02,
"grad_norm": 2.1971949417613255,
"learning_rate": 5.8518518518518515e-06,
"loss": 0.2182,
"step": 316
},
{
"epoch": 0.02,
"grad_norm": 2.0099244050381335,
"learning_rate": 5.870370370370371e-06,
"loss": 0.2124,
"step": 317
},
{
"epoch": 0.02,
"grad_norm": 10.956187648563132,
"learning_rate": 5.88888888888889e-06,
"loss": 0.2244,
"step": 318
},
{
"epoch": 0.02,
"grad_norm": 2.237500079384062,
"learning_rate": 5.907407407407408e-06,
"loss": 0.2017,
"step": 319
},
{
"epoch": 0.02,
"grad_norm": 2.7735158452533324,
"learning_rate": 5.925925925925926e-06,
"loss": 0.2866,
"step": 320
},
{
"epoch": 0.02,
"grad_norm": 3.0634574172078675,
"learning_rate": 5.944444444444445e-06,
"loss": 0.2174,
"step": 321
},
{
"epoch": 0.02,
"grad_norm": 2.082726231968373,
"learning_rate": 5.962962962962963e-06,
"loss": 0.221,
"step": 322
},
{
"epoch": 0.02,
"grad_norm": 2.5734571016094367,
"learning_rate": 5.981481481481482e-06,
"loss": 0.2007,
"step": 323
},
{
"epoch": 0.02,
"grad_norm": 2.6433284734602704,
"learning_rate": 6e-06,
"loss": 0.2226,
"step": 324
},
{
"epoch": 0.02,
"grad_norm": 1.9103685818787577,
"learning_rate": 6.018518518518519e-06,
"loss": 0.2193,
"step": 325
},
{
"epoch": 0.02,
"grad_norm": 2.3024757052308895,
"learning_rate": 6.037037037037038e-06,
"loss": 0.2581,
"step": 326
},
{
"epoch": 0.02,
"grad_norm": 3.275150115088954,
"learning_rate": 6.055555555555555e-06,
"loss": 0.2276,
"step": 327
},
{
"epoch": 0.02,
"grad_norm": 2.9732833180868155,
"learning_rate": 6.0740740740740745e-06,
"loss": 0.2077,
"step": 328
},
{
"epoch": 0.02,
"grad_norm": 1.7655336812127842,
"learning_rate": 6.092592592592593e-06,
"loss": 0.2234,
"step": 329
},
{
"epoch": 0.02,
"grad_norm": 2.8087553526925393,
"learning_rate": 6.111111111111112e-06,
"loss": 0.247,
"step": 330
},
{
"epoch": 0.02,
"grad_norm": 2.171279420768631,
"learning_rate": 6.12962962962963e-06,
"loss": 0.2527,
"step": 331
},
{
"epoch": 0.02,
"grad_norm": 2.4271539806076436,
"learning_rate": 6.148148148148149e-06,
"loss": 0.1873,
"step": 332
},
{
"epoch": 0.02,
"grad_norm": 2.2561373188486256,
"learning_rate": 6.166666666666667e-06,
"loss": 0.2243,
"step": 333
},
{
"epoch": 0.02,
"grad_norm": 2.0341046960919478,
"learning_rate": 6.1851851851851856e-06,
"loss": 0.263,
"step": 334
},
{
"epoch": 0.02,
"grad_norm": 2.2655957647607807,
"learning_rate": 6.203703703703704e-06,
"loss": 0.2711,
"step": 335
},
{
"epoch": 0.02,
"grad_norm": 1.8712782002364605,
"learning_rate": 6.222222222222223e-06,
"loss": 0.2466,
"step": 336
},
{
"epoch": 0.02,
"grad_norm": 1.3390799589142879,
"learning_rate": 6.2407407407407415e-06,
"loss": 0.1433,
"step": 337
},
{
"epoch": 0.02,
"grad_norm": 3.373163217946868,
"learning_rate": 6.259259259259259e-06,
"loss": 0.1725,
"step": 338
},
{
"epoch": 0.02,
"grad_norm": 2.394808070251205,
"learning_rate": 6.277777777777778e-06,
"loss": 0.2197,
"step": 339
},
{
"epoch": 0.02,
"grad_norm": 2.755518848297966,
"learning_rate": 6.296296296296297e-06,
"loss": 0.272,
"step": 340
},
{
"epoch": 0.02,
"grad_norm": 1.8779923323935852,
"learning_rate": 6.314814814814816e-06,
"loss": 0.2232,
"step": 341
},
{
"epoch": 0.02,
"grad_norm": 2.6353096651260834,
"learning_rate": 6.333333333333333e-06,
"loss": 0.2798,
"step": 342
},
{
"epoch": 0.02,
"grad_norm": 2.763581485415892,
"learning_rate": 6.351851851851853e-06,
"loss": 0.1904,
"step": 343
},
{
"epoch": 0.02,
"grad_norm": 2.758094713722052,
"learning_rate": 6.370370370370371e-06,
"loss": 0.2441,
"step": 344
},
{
"epoch": 0.02,
"grad_norm": 1.9172636589026844,
"learning_rate": 6.3888888888888885e-06,
"loss": 0.2762,
"step": 345
},
{
"epoch": 0.02,
"grad_norm": 2.394315538755901,
"learning_rate": 6.407407407407408e-06,
"loss": 0.2145,
"step": 346
},
{
"epoch": 0.02,
"grad_norm": 2.4548469501382484,
"learning_rate": 6.425925925925927e-06,
"loss": 0.1939,
"step": 347
},
{
"epoch": 0.02,
"grad_norm": 2.2452509086512755,
"learning_rate": 6.444444444444445e-06,
"loss": 0.2615,
"step": 348
},
{
"epoch": 0.02,
"grad_norm": 3.098802544447235,
"learning_rate": 6.462962962962963e-06,
"loss": 0.2461,
"step": 349
},
{
"epoch": 0.02,
"grad_norm": 2.616470147506204,
"learning_rate": 6.481481481481482e-06,
"loss": 0.2117,
"step": 350
},
{
"epoch": 0.02,
"grad_norm": 2.2334691592329525,
"learning_rate": 6.5000000000000004e-06,
"loss": 0.2131,
"step": 351
},
{
"epoch": 0.02,
"grad_norm": 2.5589799662286956,
"learning_rate": 6.51851851851852e-06,
"loss": 0.242,
"step": 352
},
{
"epoch": 0.02,
"grad_norm": 1.659120456475795,
"learning_rate": 6.537037037037037e-06,
"loss": 0.1708,
"step": 353
},
{
"epoch": 0.02,
"grad_norm": 3.0372077950479426,
"learning_rate": 6.555555555555556e-06,
"loss": 0.2708,
"step": 354
},
{
"epoch": 0.02,
"grad_norm": 3.1775196661741996,
"learning_rate": 6.574074074074075e-06,
"loss": 0.2874,
"step": 355
},
{
"epoch": 0.02,
"grad_norm": 2.830893410296079,
"learning_rate": 6.592592592592592e-06,
"loss": 0.2716,
"step": 356
},
{
"epoch": 0.02,
"grad_norm": 3.7230523009836536,
"learning_rate": 6.6111111111111115e-06,
"loss": 0.1982,
"step": 357
},
{
"epoch": 0.02,
"grad_norm": 1.893792415370512,
"learning_rate": 6.62962962962963e-06,
"loss": 0.1864,
"step": 358
},
{
"epoch": 0.02,
"grad_norm": 2.666844618467573,
"learning_rate": 6.648148148148149e-06,
"loss": 0.2428,
"step": 359
},
{
"epoch": 0.02,
"grad_norm": 3.608780254558276,
"learning_rate": 6.666666666666667e-06,
"loss": 0.2006,
"step": 360
},
{
"epoch": 0.02,
"grad_norm": 3.302008298067494,
"learning_rate": 6.685185185185186e-06,
"loss": 0.201,
"step": 361
},
{
"epoch": 0.02,
"grad_norm": 2.7545141320246036,
"learning_rate": 6.703703703703704e-06,
"loss": 0.2023,
"step": 362
},
{
"epoch": 0.02,
"grad_norm": 3.310552294341296,
"learning_rate": 6.7222222222222235e-06,
"loss": 0.2524,
"step": 363
},
{
"epoch": 0.02,
"grad_norm": 10.135064668152676,
"learning_rate": 6.740740740740741e-06,
"loss": 0.3195,
"step": 364
},
{
"epoch": 0.02,
"grad_norm": 2.9020320884158277,
"learning_rate": 6.75925925925926e-06,
"loss": 0.2071,
"step": 365
},
{
"epoch": 0.02,
"grad_norm": 2.27841075156875,
"learning_rate": 6.777777777777779e-06,
"loss": 0.2421,
"step": 366
},
{
"epoch": 0.02,
"grad_norm": 3.4100360302965322,
"learning_rate": 6.796296296296296e-06,
"loss": 0.2714,
"step": 367
},
{
"epoch": 0.02,
"grad_norm": 2.5363251000308034,
"learning_rate": 6.814814814814815e-06,
"loss": 0.244,
"step": 368
},
{
"epoch": 0.02,
"grad_norm": 2.592834724808257,
"learning_rate": 6.833333333333334e-06,
"loss": 0.1717,
"step": 369
},
{
"epoch": 0.02,
"grad_norm": 2.667653937916638,
"learning_rate": 6.851851851851853e-06,
"loss": 0.1939,
"step": 370
},
{
"epoch": 0.02,
"grad_norm": 2.4491773781331814,
"learning_rate": 6.8703703703703704e-06,
"loss": 0.2883,
"step": 371
},
{
"epoch": 0.02,
"grad_norm": 2.491279484468612,
"learning_rate": 6.88888888888889e-06,
"loss": 0.284,
"step": 372
},
{
"epoch": 0.02,
"grad_norm": 3.0752719398369,
"learning_rate": 6.907407407407408e-06,
"loss": 0.2466,
"step": 373
},
{
"epoch": 0.02,
"grad_norm": 2.9643781997595835,
"learning_rate": 6.9259259259259256e-06,
"loss": 0.2628,
"step": 374
},
{
"epoch": 0.02,
"grad_norm": 2.0006146510934153,
"learning_rate": 6.944444444444445e-06,
"loss": 0.1835,
"step": 375
},
{
"epoch": 0.02,
"grad_norm": 3.8235231337722313,
"learning_rate": 6.962962962962964e-06,
"loss": 0.3059,
"step": 376
},
{
"epoch": 0.02,
"grad_norm": 2.05096254546488,
"learning_rate": 6.981481481481482e-06,
"loss": 0.2287,
"step": 377
},
{
"epoch": 0.02,
"grad_norm": 2.4587295800172235,
"learning_rate": 7e-06,
"loss": 0.2654,
"step": 378
},
{
"epoch": 0.02,
"grad_norm": 2.6549597748171516,
"learning_rate": 7.018518518518519e-06,
"loss": 0.3164,
"step": 379
},
{
"epoch": 0.02,
"grad_norm": 1.9038346967059707,
"learning_rate": 7.0370370370370375e-06,
"loss": 0.2488,
"step": 380
},
{
"epoch": 0.02,
"grad_norm": 2.3697078179976567,
"learning_rate": 7.055555555555557e-06,
"loss": 0.2037,
"step": 381
},
{
"epoch": 0.02,
"grad_norm": 2.0954183152096797,
"learning_rate": 7.074074074074074e-06,
"loss": 0.1585,
"step": 382
},
{
"epoch": 0.02,
"grad_norm": 2.2018921290610343,
"learning_rate": 7.0925925925925935e-06,
"loss": 0.1813,
"step": 383
},
{
"epoch": 0.02,
"grad_norm": 1.5869216684012442,
"learning_rate": 7.111111111111112e-06,
"loss": 0.2381,
"step": 384
},
{
"epoch": 0.02,
"grad_norm": 1.5440442501750655,
"learning_rate": 7.129629629629629e-06,
"loss": 0.1969,
"step": 385
},
{
"epoch": 0.02,
"grad_norm": 2.2971557912660665,
"learning_rate": 7.1481481481481486e-06,
"loss": 0.2147,
"step": 386
},
{
"epoch": 0.02,
"grad_norm": 2.550293529670386,
"learning_rate": 7.166666666666667e-06,
"loss": 0.1814,
"step": 387
},
{
"epoch": 0.02,
"grad_norm": 1.8004297380922656,
"learning_rate": 7.185185185185186e-06,
"loss": 0.2686,
"step": 388
},
{
"epoch": 0.02,
"grad_norm": 2.1173214359781203,
"learning_rate": 7.203703703703704e-06,
"loss": 0.2166,
"step": 389
},
{
"epoch": 0.02,
"grad_norm": 1.518227044480827,
"learning_rate": 7.222222222222223e-06,
"loss": 0.1604,
"step": 390
},
{
"epoch": 0.02,
"grad_norm": 1.8852826175316413,
"learning_rate": 7.240740740740741e-06,
"loss": 0.1795,
"step": 391
},
{
"epoch": 0.02,
"grad_norm": 2.30584778108288,
"learning_rate": 7.2592592592592605e-06,
"loss": 0.2376,
"step": 392
},
{
"epoch": 0.02,
"grad_norm": 1.5760050357974753,
"learning_rate": 7.277777777777778e-06,
"loss": 0.1886,
"step": 393
},
{
"epoch": 0.02,
"grad_norm": 1.6085933249919047,
"learning_rate": 7.296296296296297e-06,
"loss": 0.1625,
"step": 394
},
{
"epoch": 0.02,
"grad_norm": 2.8386472676550367,
"learning_rate": 7.314814814814816e-06,
"loss": 0.2517,
"step": 395
},
{
"epoch": 0.02,
"grad_norm": 2.6820386575918245,
"learning_rate": 7.333333333333333e-06,
"loss": 0.253,
"step": 396
},
{
"epoch": 0.02,
"grad_norm": 2.7885866053003387,
"learning_rate": 7.351851851851852e-06,
"loss": 0.2369,
"step": 397
},
{
"epoch": 0.02,
"grad_norm": 2.2231783865351793,
"learning_rate": 7.370370370370371e-06,
"loss": 0.2185,
"step": 398
},
{
"epoch": 0.02,
"grad_norm": 2.86403769874136,
"learning_rate": 7.38888888888889e-06,
"loss": 0.191,
"step": 399
},
{
"epoch": 0.02,
"grad_norm": 2.6112782053430785,
"learning_rate": 7.4074074074074075e-06,
"loss": 0.2541,
"step": 400
},
{
"epoch": 0.02,
"grad_norm": 2.1721369907794355,
"learning_rate": 7.425925925925927e-06,
"loss": 0.2131,
"step": 401
},
{
"epoch": 0.02,
"grad_norm": 2.3261842511133697,
"learning_rate": 7.444444444444445e-06,
"loss": 0.2542,
"step": 402
},
{
"epoch": 0.02,
"grad_norm": 2.4321492348789997,
"learning_rate": 7.462962962962964e-06,
"loss": 0.2691,
"step": 403
},
{
"epoch": 0.02,
"grad_norm": 1.564152501781726,
"learning_rate": 7.481481481481482e-06,
"loss": 0.15,
"step": 404
},
{
"epoch": 0.02,
"grad_norm": 2.345816845324918,
"learning_rate": 7.500000000000001e-06,
"loss": 0.2501,
"step": 405
},
{
"epoch": 0.02,
"grad_norm": 2.015424997549943,
"learning_rate": 7.518518518518519e-06,
"loss": 0.2193,
"step": 406
},
{
"epoch": 0.02,
"grad_norm": 2.6450364209319477,
"learning_rate": 7.537037037037037e-06,
"loss": 0.2726,
"step": 407
},
{
"epoch": 0.02,
"grad_norm": 1.665537149427856,
"learning_rate": 7.555555555555556e-06,
"loss": 0.1735,
"step": 408
},
{
"epoch": 0.02,
"grad_norm": 2.318529079431767,
"learning_rate": 7.5740740740740745e-06,
"loss": 0.2626,
"step": 409
},
{
"epoch": 0.02,
"grad_norm": 1.9981876819623996,
"learning_rate": 7.592592592592594e-06,
"loss": 0.2393,
"step": 410
},
{
"epoch": 0.02,
"grad_norm": 2.0379069550854196,
"learning_rate": 7.611111111111111e-06,
"loss": 0.2069,
"step": 411
},
{
"epoch": 0.02,
"grad_norm": 1.9110119780954062,
"learning_rate": 7.62962962962963e-06,
"loss": 0.231,
"step": 412
},
{
"epoch": 0.02,
"grad_norm": 1.6627143090730263,
"learning_rate": 7.64814814814815e-06,
"loss": 0.2442,
"step": 413
},
{
"epoch": 0.02,
"grad_norm": 2.3799133298824615,
"learning_rate": 7.666666666666667e-06,
"loss": 0.281,
"step": 414
},
{
"epoch": 0.02,
"grad_norm": 1.5833113318773921,
"learning_rate": 7.685185185185185e-06,
"loss": 0.221,
"step": 415
},
{
"epoch": 0.02,
"grad_norm": 1.9057489268142138,
"learning_rate": 7.703703703703704e-06,
"loss": 0.2688,
"step": 416
},
{
"epoch": 0.02,
"grad_norm": 1.8121203361980647,
"learning_rate": 7.722222222222223e-06,
"loss": 0.2347,
"step": 417
},
{
"epoch": 0.02,
"grad_norm": 3.0687752410914997,
"learning_rate": 7.74074074074074e-06,
"loss": 0.2464,
"step": 418
},
{
"epoch": 0.02,
"grad_norm": 1.8131616737058565,
"learning_rate": 7.75925925925926e-06,
"loss": 0.251,
"step": 419
},
{
"epoch": 0.02,
"grad_norm": 1.9091346676877106,
"learning_rate": 7.77777777777778e-06,
"loss": 0.2727,
"step": 420
},
{
"epoch": 0.02,
"grad_norm": 2.46372123585346,
"learning_rate": 7.796296296296297e-06,
"loss": 0.287,
"step": 421
},
{
"epoch": 0.02,
"grad_norm": 2.8823349203033533,
"learning_rate": 7.814814814814816e-06,
"loss": 0.2292,
"step": 422
},
{
"epoch": 0.02,
"grad_norm": 1.4743929849468036,
"learning_rate": 7.833333333333333e-06,
"loss": 0.2142,
"step": 423
},
{
"epoch": 0.02,
"grad_norm": 2.2897706352334475,
"learning_rate": 7.851851851851853e-06,
"loss": 0.2657,
"step": 424
},
{
"epoch": 0.02,
"grad_norm": 1.6830692137954337,
"learning_rate": 7.870370370370372e-06,
"loss": 0.2479,
"step": 425
},
{
"epoch": 0.02,
"grad_norm": 1.5407121550492335,
"learning_rate": 7.88888888888889e-06,
"loss": 0.2246,
"step": 426
},
{
"epoch": 0.02,
"grad_norm": 2.412774183574061,
"learning_rate": 7.907407407407409e-06,
"loss": 0.2763,
"step": 427
},
{
"epoch": 0.02,
"grad_norm": 1.964345565494054,
"learning_rate": 7.925925925925926e-06,
"loss": 0.2587,
"step": 428
},
{
"epoch": 0.02,
"grad_norm": 2.0450440983284652,
"learning_rate": 7.944444444444445e-06,
"loss": 0.1765,
"step": 429
},
{
"epoch": 0.02,
"grad_norm": 1.7867699472742486,
"learning_rate": 7.962962962962963e-06,
"loss": 0.1863,
"step": 430
},
{
"epoch": 0.02,
"grad_norm": 2.5398855665291156,
"learning_rate": 7.981481481481482e-06,
"loss": 0.3173,
"step": 431
},
{
"epoch": 0.02,
"grad_norm": 2.046668493613578,
"learning_rate": 8.000000000000001e-06,
"loss": 0.2758,
"step": 432
},
{
"epoch": 0.02,
"grad_norm": 1.6390293672556615,
"learning_rate": 8.018518518518519e-06,
"loss": 0.2715,
"step": 433
},
{
"epoch": 0.02,
"grad_norm": 2.4504855224681408,
"learning_rate": 8.037037037037038e-06,
"loss": 0.2455,
"step": 434
},
{
"epoch": 0.02,
"grad_norm": 2.599909976400243,
"learning_rate": 8.055555555555557e-06,
"loss": 0.2278,
"step": 435
},
{
"epoch": 0.02,
"grad_norm": 2.234139456704803,
"learning_rate": 8.074074074074075e-06,
"loss": 0.2529,
"step": 436
},
{
"epoch": 0.02,
"grad_norm": 1.7070969104141578,
"learning_rate": 8.092592592592592e-06,
"loss": 0.2011,
"step": 437
},
{
"epoch": 0.02,
"grad_norm": 1.8321578382667267,
"learning_rate": 8.111111111111112e-06,
"loss": 0.2051,
"step": 438
},
{
"epoch": 0.02,
"grad_norm": 1.9908165571547292,
"learning_rate": 8.12962962962963e-06,
"loss": 0.1968,
"step": 439
},
{
"epoch": 0.02,
"grad_norm": 2.4006332821096077,
"learning_rate": 8.148148148148148e-06,
"loss": 0.2538,
"step": 440
},
{
"epoch": 0.02,
"grad_norm": 2.6387770354396216,
"learning_rate": 8.166666666666668e-06,
"loss": 0.1666,
"step": 441
},
{
"epoch": 0.02,
"grad_norm": 2.292311358079653,
"learning_rate": 8.185185185185187e-06,
"loss": 0.2268,
"step": 442
},
{
"epoch": 0.02,
"grad_norm": 1.9779324740690347,
"learning_rate": 8.203703703703704e-06,
"loss": 0.2034,
"step": 443
},
{
"epoch": 0.02,
"grad_norm": 1.9459054495158559,
"learning_rate": 8.222222222222222e-06,
"loss": 0.1963,
"step": 444
},
{
"epoch": 0.02,
"grad_norm": 2.727768704983483,
"learning_rate": 8.240740740740741e-06,
"loss": 0.248,
"step": 445
},
{
"epoch": 0.02,
"grad_norm": 1.762347648339636,
"learning_rate": 8.25925925925926e-06,
"loss": 0.2705,
"step": 446
},
{
"epoch": 0.02,
"grad_norm": 2.0538712972668325,
"learning_rate": 8.277777777777778e-06,
"loss": 0.2157,
"step": 447
},
{
"epoch": 0.02,
"grad_norm": 2.8976656510690924,
"learning_rate": 8.296296296296297e-06,
"loss": 0.2878,
"step": 448
},
{
"epoch": 0.02,
"grad_norm": 1.917974747844571,
"learning_rate": 8.314814814814816e-06,
"loss": 0.2703,
"step": 449
},
{
"epoch": 0.03,
"grad_norm": 2.225709818814864,
"learning_rate": 8.333333333333334e-06,
"loss": 0.2594,
"step": 450
},
{
"epoch": 0.03,
"grad_norm": 2.1943687433117205,
"learning_rate": 8.351851851851851e-06,
"loss": 0.2184,
"step": 451
},
{
"epoch": 0.03,
"grad_norm": 1.918405805463593,
"learning_rate": 8.37037037037037e-06,
"loss": 0.2855,
"step": 452
},
{
"epoch": 0.03,
"grad_norm": 1.723128091977472,
"learning_rate": 8.38888888888889e-06,
"loss": 0.1916,
"step": 453
},
{
"epoch": 0.03,
"grad_norm": 1.958157567411811,
"learning_rate": 8.407407407407409e-06,
"loss": 0.1901,
"step": 454
},
{
"epoch": 0.03,
"grad_norm": 1.4763912525479064,
"learning_rate": 8.425925925925926e-06,
"loss": 0.273,
"step": 455
},
{
"epoch": 0.03,
"grad_norm": 1.5981461116737503,
"learning_rate": 8.444444444444446e-06,
"loss": 0.2516,
"step": 456
},
{
"epoch": 0.03,
"grad_norm": 1.1663828771437823,
"learning_rate": 8.462962962962963e-06,
"loss": 0.1716,
"step": 457
},
{
"epoch": 0.03,
"grad_norm": 1.4781190549476864,
"learning_rate": 8.481481481481482e-06,
"loss": 0.2412,
"step": 458
},
{
"epoch": 0.03,
"grad_norm": 1.6193928401336815,
"learning_rate": 8.5e-06,
"loss": 0.2716,
"step": 459
},
{
"epoch": 0.03,
"grad_norm": 1.6339004624552358,
"learning_rate": 8.518518518518519e-06,
"loss": 0.1968,
"step": 460
},
{
"epoch": 0.03,
"grad_norm": 1.260078140056335,
"learning_rate": 8.537037037037038e-06,
"loss": 0.2227,
"step": 461
},
{
"epoch": 0.03,
"grad_norm": 1.7074133840265566,
"learning_rate": 8.555555555555556e-06,
"loss": 0.219,
"step": 462
},
{
"epoch": 0.03,
"grad_norm": 1.5611418448064829,
"learning_rate": 8.574074074074075e-06,
"loss": 0.1963,
"step": 463
},
{
"epoch": 0.03,
"grad_norm": 1.6694050753250427,
"learning_rate": 8.592592592592593e-06,
"loss": 0.205,
"step": 464
},
{
"epoch": 0.03,
"grad_norm": 1.5774347382027096,
"learning_rate": 8.611111111111112e-06,
"loss": 0.2314,
"step": 465
},
{
"epoch": 0.03,
"grad_norm": 1.707162173333124,
"learning_rate": 8.62962962962963e-06,
"loss": 0.2333,
"step": 466
},
{
"epoch": 0.03,
"grad_norm": 1.586900107102287,
"learning_rate": 8.648148148148149e-06,
"loss": 0.1828,
"step": 467
},
{
"epoch": 0.03,
"grad_norm": 1.8205988008032974,
"learning_rate": 8.666666666666668e-06,
"loss": 0.2781,
"step": 468
},
{
"epoch": 0.03,
"grad_norm": 2.011013273961918,
"learning_rate": 8.685185185185185e-06,
"loss": 0.2364,
"step": 469
},
{
"epoch": 0.03,
"grad_norm": 2.347424978140611,
"learning_rate": 8.703703703703705e-06,
"loss": 0.2951,
"step": 470
},
{
"epoch": 0.03,
"grad_norm": 1.5832688941725674,
"learning_rate": 8.722222222222224e-06,
"loss": 0.1645,
"step": 471
},
{
"epoch": 0.03,
"grad_norm": 1.0163279823952933,
"learning_rate": 8.740740740740741e-06,
"loss": 0.2116,
"step": 472
},
{
"epoch": 0.03,
"grad_norm": 1.599090443091723,
"learning_rate": 8.759259259259259e-06,
"loss": 0.2338,
"step": 473
},
{
"epoch": 0.03,
"grad_norm": 1.5651329244699874,
"learning_rate": 8.777777777777778e-06,
"loss": 0.2141,
"step": 474
},
{
"epoch": 0.03,
"grad_norm": 1.644723516265944,
"learning_rate": 8.796296296296297e-06,
"loss": 0.2205,
"step": 475
},
{
"epoch": 0.03,
"grad_norm": 1.5697078465922618,
"learning_rate": 8.814814814814817e-06,
"loss": 0.1948,
"step": 476
},
{
"epoch": 0.03,
"grad_norm": 1.4385062614115038,
"learning_rate": 8.833333333333334e-06,
"loss": 0.2373,
"step": 477
},
{
"epoch": 0.03,
"grad_norm": 1.911017504582339,
"learning_rate": 8.851851851851853e-06,
"loss": 0.205,
"step": 478
},
{
"epoch": 0.03,
"grad_norm": 2.069808868113705,
"learning_rate": 8.87037037037037e-06,
"loss": 0.2354,
"step": 479
},
{
"epoch": 0.03,
"grad_norm": 2.159608690490603,
"learning_rate": 8.888888888888888e-06,
"loss": 0.2083,
"step": 480
},
{
"epoch": 0.03,
"grad_norm": 1.4701455010284599,
"learning_rate": 8.907407407407408e-06,
"loss": 0.2135,
"step": 481
},
{
"epoch": 0.03,
"grad_norm": 1.8772804379750925,
"learning_rate": 8.925925925925927e-06,
"loss": 0.1787,
"step": 482
},
{
"epoch": 0.03,
"grad_norm": 1.3222927854219297,
"learning_rate": 8.944444444444446e-06,
"loss": 0.2602,
"step": 483
},
{
"epoch": 0.03,
"grad_norm": 1.9651963465511846,
"learning_rate": 8.962962962962963e-06,
"loss": 0.1747,
"step": 484
},
{
"epoch": 0.03,
"grad_norm": 1.9266834331682001,
"learning_rate": 8.981481481481483e-06,
"loss": 0.1919,
"step": 485
},
{
"epoch": 0.03,
"grad_norm": 1.3278082610152648,
"learning_rate": 9e-06,
"loss": 0.2168,
"step": 486
},
{
"epoch": 0.03,
"grad_norm": 2.0527498297286506,
"learning_rate": 9.01851851851852e-06,
"loss": 0.2822,
"step": 487
},
{
"epoch": 0.03,
"grad_norm": 1.4196503456996559,
"learning_rate": 9.037037037037037e-06,
"loss": 0.2282,
"step": 488
},
{
"epoch": 0.03,
"grad_norm": 1.7355838892748336,
"learning_rate": 9.055555555555556e-06,
"loss": 0.2613,
"step": 489
},
{
"epoch": 0.03,
"grad_norm": 2.3124993169628887,
"learning_rate": 9.074074074074075e-06,
"loss": 0.2076,
"step": 490
},
{
"epoch": 0.03,
"grad_norm": 1.55742161398934,
"learning_rate": 9.092592592592593e-06,
"loss": 0.1866,
"step": 491
},
{
"epoch": 0.03,
"grad_norm": 4.12557886859245,
"learning_rate": 9.111111111111112e-06,
"loss": 0.1925,
"step": 492
},
{
"epoch": 0.03,
"grad_norm": 2.1746743501029098,
"learning_rate": 9.12962962962963e-06,
"loss": 0.1506,
"step": 493
},
{
"epoch": 0.03,
"grad_norm": 2.524886716057685,
"learning_rate": 9.148148148148149e-06,
"loss": 0.2345,
"step": 494
},
{
"epoch": 0.03,
"grad_norm": 2.179892451724111,
"learning_rate": 9.166666666666666e-06,
"loss": 0.2059,
"step": 495
},
{
"epoch": 0.03,
"grad_norm": 1.913187582169457,
"learning_rate": 9.185185185185186e-06,
"loss": 0.2595,
"step": 496
},
{
"epoch": 0.03,
"grad_norm": 1.6506808324873237,
"learning_rate": 9.203703703703705e-06,
"loss": 0.2009,
"step": 497
},
{
"epoch": 0.03,
"grad_norm": 7.353623914997798,
"learning_rate": 9.222222222222224e-06,
"loss": 0.2359,
"step": 498
},
{
"epoch": 0.03,
"grad_norm": 1.73397872452955,
"learning_rate": 9.240740740740742e-06,
"loss": 0.1601,
"step": 499
},
{
"epoch": 0.03,
"grad_norm": 2.363800054935029,
"learning_rate": 9.25925925925926e-06,
"loss": 0.2178,
"step": 500
},
{
"epoch": 0.03,
"grad_norm": 3.174926117165057,
"learning_rate": 9.277777777777778e-06,
"loss": 0.2262,
"step": 501
},
{
"epoch": 0.03,
"grad_norm": 2.6754495456894998,
"learning_rate": 9.296296296296296e-06,
"loss": 0.2648,
"step": 502
},
{
"epoch": 0.03,
"grad_norm": 2.1212048896630415,
"learning_rate": 9.314814814814815e-06,
"loss": 0.3119,
"step": 503
},
{
"epoch": 0.03,
"grad_norm": 1.845141499967797,
"learning_rate": 9.333333333333334e-06,
"loss": 0.214,
"step": 504
},
{
"epoch": 0.03,
"grad_norm": 1.6034717530580787,
"learning_rate": 9.351851851851854e-06,
"loss": 0.1861,
"step": 505
},
{
"epoch": 0.03,
"grad_norm": 1.494876834777987,
"learning_rate": 9.370370370370371e-06,
"loss": 0.1951,
"step": 506
},
{
"epoch": 0.03,
"grad_norm": 2.567749735780204,
"learning_rate": 9.38888888888889e-06,
"loss": 0.2589,
"step": 507
},
{
"epoch": 0.03,
"grad_norm": 2.156161180339284,
"learning_rate": 9.407407407407408e-06,
"loss": 0.2119,
"step": 508
},
{
"epoch": 0.03,
"grad_norm": 2.1757434074631155,
"learning_rate": 9.425925925925925e-06,
"loss": 0.2806,
"step": 509
},
{
"epoch": 0.03,
"grad_norm": 1.6893535871344756,
"learning_rate": 9.444444444444445e-06,
"loss": 0.2379,
"step": 510
},
{
"epoch": 0.03,
"grad_norm": 1.7028334459930194,
"learning_rate": 9.462962962962964e-06,
"loss": 0.2342,
"step": 511
},
{
"epoch": 0.03,
"grad_norm": 2.573367833466672,
"learning_rate": 9.481481481481483e-06,
"loss": 0.2728,
"step": 512
},
{
"epoch": 0.03,
"grad_norm": 1.629665728283705,
"learning_rate": 9.5e-06,
"loss": 0.1883,
"step": 513
},
{
"epoch": 0.03,
"grad_norm": 5.469608392104702,
"learning_rate": 9.51851851851852e-06,
"loss": 0.2699,
"step": 514
},
{
"epoch": 0.03,
"grad_norm": 2.922868529122991,
"learning_rate": 9.537037037037037e-06,
"loss": 0.2474,
"step": 515
},
{
"epoch": 0.03,
"grad_norm": 2.383420583648189,
"learning_rate": 9.555555555555556e-06,
"loss": 0.2799,
"step": 516
},
{
"epoch": 0.03,
"grad_norm": 1.7284155714312717,
"learning_rate": 9.574074074074074e-06,
"loss": 0.2199,
"step": 517
},
{
"epoch": 0.03,
"grad_norm": 1.8042763638037695,
"learning_rate": 9.592592592592593e-06,
"loss": 0.2357,
"step": 518
},
{
"epoch": 0.03,
"grad_norm": 1.8240537997703765,
"learning_rate": 9.611111111111112e-06,
"loss": 0.2571,
"step": 519
},
{
"epoch": 0.03,
"grad_norm": 2.240434759867715,
"learning_rate": 9.62962962962963e-06,
"loss": 0.2571,
"step": 520
},
{
"epoch": 0.03,
"grad_norm": 2.2212595515134352,
"learning_rate": 9.64814814814815e-06,
"loss": 0.2805,
"step": 521
},
{
"epoch": 0.03,
"grad_norm": 1.6706904432909366,
"learning_rate": 9.666666666666667e-06,
"loss": 0.2818,
"step": 522
},
{
"epoch": 0.03,
"grad_norm": 1.882622112644092,
"learning_rate": 9.685185185185186e-06,
"loss": 0.2546,
"step": 523
},
{
"epoch": 0.03,
"grad_norm": 2.003335137994384,
"learning_rate": 9.703703703703703e-06,
"loss": 0.2216,
"step": 524
},
{
"epoch": 0.03,
"grad_norm": 1.8173690992162865,
"learning_rate": 9.722222222222223e-06,
"loss": 0.2595,
"step": 525
},
{
"epoch": 0.03,
"grad_norm": 2.4188288081930738,
"learning_rate": 9.740740740740742e-06,
"loss": 0.2453,
"step": 526
},
{
"epoch": 0.03,
"grad_norm": 3.275836038529327,
"learning_rate": 9.759259259259261e-06,
"loss": 0.2442,
"step": 527
},
{
"epoch": 0.03,
"grad_norm": 1.3602762550601097,
"learning_rate": 9.777777777777779e-06,
"loss": 0.2221,
"step": 528
},
{
"epoch": 0.03,
"grad_norm": 1.4099576775919647,
"learning_rate": 9.796296296296298e-06,
"loss": 0.2381,
"step": 529
},
{
"epoch": 0.03,
"grad_norm": 1.948210621819492,
"learning_rate": 9.814814814814815e-06,
"loss": 0.2942,
"step": 530
},
{
"epoch": 0.03,
"grad_norm": 1.6352317839316701,
"learning_rate": 9.833333333333333e-06,
"loss": 0.2261,
"step": 531
},
{
"epoch": 0.03,
"grad_norm": 2.07004143541843,
"learning_rate": 9.851851851851852e-06,
"loss": 0.1908,
"step": 532
},
{
"epoch": 0.03,
"grad_norm": 2.15105594941232,
"learning_rate": 9.870370370370371e-06,
"loss": 0.2878,
"step": 533
},
{
"epoch": 0.03,
"grad_norm": 2.0665011897929353,
"learning_rate": 9.88888888888889e-06,
"loss": 0.2648,
"step": 534
},
{
"epoch": 0.03,
"grad_norm": 1.3177102461777124,
"learning_rate": 9.907407407407408e-06,
"loss": 0.1964,
"step": 535
},
{
"epoch": 0.03,
"grad_norm": 1.6617003227758134,
"learning_rate": 9.925925925925927e-06,
"loss": 0.2061,
"step": 536
},
{
"epoch": 0.03,
"grad_norm": 1.8712692178466874,
"learning_rate": 9.944444444444445e-06,
"loss": 0.2097,
"step": 537
},
{
"epoch": 0.03,
"grad_norm": 1.2364405901770519,
"learning_rate": 9.962962962962964e-06,
"loss": 0.2007,
"step": 538
},
{
"epoch": 0.03,
"grad_norm": 1.60292206138748,
"learning_rate": 9.981481481481482e-06,
"loss": 0.2364,
"step": 539
},
{
"epoch": 0.03,
"grad_norm": 1.8072231920574293,
"learning_rate": 1e-05,
"loss": 0.2238,
"step": 540
},
{
"epoch": 0.03,
"grad_norm": 2.4814728889599436,
"learning_rate": 9.999999918848563e-06,
"loss": 0.2314,
"step": 541
},
{
"epoch": 0.03,
"grad_norm": 1.7318830824208753,
"learning_rate": 9.999999675394253e-06,
"loss": 0.2389,
"step": 542
},
{
"epoch": 0.03,
"grad_norm": 1.3036697895542924,
"learning_rate": 9.999999269637079e-06,
"loss": 0.1491,
"step": 543
},
{
"epoch": 0.03,
"grad_norm": 1.8513482106621912,
"learning_rate": 9.999998701577052e-06,
"loss": 0.2016,
"step": 544
},
{
"epoch": 0.03,
"grad_norm": 2.2647752911372168,
"learning_rate": 9.999997971214192e-06,
"loss": 0.2791,
"step": 545
},
{
"epoch": 0.03,
"grad_norm": 2.897363047140287,
"learning_rate": 9.999997078548524e-06,
"loss": 0.2325,
"step": 546
},
{
"epoch": 0.03,
"grad_norm": 1.5670415512729239,
"learning_rate": 9.999996023580074e-06,
"loss": 0.2251,
"step": 547
},
{
"epoch": 0.03,
"grad_norm": 2.6066296463318857,
"learning_rate": 9.99999480630888e-06,
"loss": 0.2611,
"step": 548
},
{
"epoch": 0.03,
"grad_norm": 1.4653422867881003,
"learning_rate": 9.999993426734977e-06,
"loss": 0.2573,
"step": 549
},
{
"epoch": 0.03,
"grad_norm": 1.5471928587812813,
"learning_rate": 9.999991884858413e-06,
"loss": 0.2317,
"step": 550
},
{
"epoch": 0.03,
"grad_norm": 2.1788321358326717,
"learning_rate": 9.999990180679238e-06,
"loss": 0.2661,
"step": 551
},
{
"epoch": 0.03,
"grad_norm": 1.9518145246539516,
"learning_rate": 9.999988314197507e-06,
"loss": 0.1772,
"step": 552
},
{
"epoch": 0.03,
"grad_norm": 1.6427564762279832,
"learning_rate": 9.999986285413278e-06,
"loss": 0.2232,
"step": 553
},
{
"epoch": 0.03,
"grad_norm": 1.7725487637369426,
"learning_rate": 9.99998409432662e-06,
"loss": 0.1818,
"step": 554
},
{
"epoch": 0.03,
"grad_norm": 1.7108703016558175,
"learning_rate": 9.999981740937604e-06,
"loss": 0.1876,
"step": 555
},
{
"epoch": 0.03,
"grad_norm": 1.8711769627044077,
"learning_rate": 9.999979225246305e-06,
"loss": 0.3121,
"step": 556
},
{
"epoch": 0.03,
"grad_norm": 2.7045900117922645,
"learning_rate": 9.999976547252805e-06,
"loss": 0.1916,
"step": 557
},
{
"epoch": 0.03,
"grad_norm": 2.048128523173138,
"learning_rate": 9.999973706957191e-06,
"loss": 0.2373,
"step": 558
},
{
"epoch": 0.03,
"grad_norm": 1.9221990901329062,
"learning_rate": 9.999970704359555e-06,
"loss": 0.2004,
"step": 559
},
{
"epoch": 0.03,
"grad_norm": 2.1596146848014603,
"learning_rate": 9.999967539459996e-06,
"loss": 0.2664,
"step": 560
},
{
"epoch": 0.03,
"grad_norm": 1.2490925057696007,
"learning_rate": 9.999964212258612e-06,
"loss": 0.1911,
"step": 561
},
{
"epoch": 0.03,
"grad_norm": 1.7084579310879742,
"learning_rate": 9.999960722755519e-06,
"loss": 0.2211,
"step": 562
},
{
"epoch": 0.03,
"grad_norm": 1.8416896032362209,
"learning_rate": 9.999957070950824e-06,
"loss": 0.2526,
"step": 563
},
{
"epoch": 0.03,
"grad_norm": 1.7889661492301776,
"learning_rate": 9.999953256844646e-06,
"loss": 0.2224,
"step": 564
},
{
"epoch": 0.03,
"grad_norm": 1.8237790858313632,
"learning_rate": 9.999949280437111e-06,
"loss": 0.1634,
"step": 565
},
{
"epoch": 0.03,
"grad_norm": 1.4797735469173663,
"learning_rate": 9.999945141728348e-06,
"loss": 0.2205,
"step": 566
},
{
"epoch": 0.03,
"grad_norm": 2.4426823569233567,
"learning_rate": 9.99994084071849e-06,
"loss": 0.3041,
"step": 567
},
{
"epoch": 0.03,
"grad_norm": 1.743574684996697,
"learning_rate": 9.999936377407677e-06,
"loss": 0.2238,
"step": 568
},
{
"epoch": 0.03,
"grad_norm": 2.032513000231454,
"learning_rate": 9.999931751796055e-06,
"loss": 0.1836,
"step": 569
},
{
"epoch": 0.03,
"grad_norm": 2.299920431087586,
"learning_rate": 9.99992696388377e-06,
"loss": 0.2829,
"step": 570
},
{
"epoch": 0.03,
"grad_norm": 1.456484249618141,
"learning_rate": 9.999922013670984e-06,
"loss": 0.1619,
"step": 571
},
{
"epoch": 0.03,
"grad_norm": 1.8247965197914737,
"learning_rate": 9.999916901157852e-06,
"loss": 0.2324,
"step": 572
},
{
"epoch": 0.03,
"grad_norm": 1.8103485101634382,
"learning_rate": 9.999911626344544e-06,
"loss": 0.2591,
"step": 573
},
{
"epoch": 0.03,
"grad_norm": 1.4997873254847325,
"learning_rate": 9.999906189231228e-06,
"loss": 0.2105,
"step": 574
},
{
"epoch": 0.03,
"grad_norm": 1.6577781968518213,
"learning_rate": 9.999900589818083e-06,
"loss": 0.2225,
"step": 575
},
{
"epoch": 0.03,
"grad_norm": 1.0842225165438453,
"learning_rate": 9.99989482810529e-06,
"loss": 0.1728,
"step": 576
},
{
"epoch": 0.03,
"grad_norm": 1.4611910074773522,
"learning_rate": 9.999888904093035e-06,
"loss": 0.193,
"step": 577
},
{
"epoch": 0.03,
"grad_norm": 1.2211246342575959,
"learning_rate": 9.99988281778151e-06,
"loss": 0.1969,
"step": 578
},
{
"epoch": 0.03,
"grad_norm": 1.8041481599306388,
"learning_rate": 9.999876569170916e-06,
"loss": 0.291,
"step": 579
},
{
"epoch": 0.03,
"grad_norm": 2.436450340993971,
"learning_rate": 9.99987015826145e-06,
"loss": 0.3033,
"step": 580
},
{
"epoch": 0.03,
"grad_norm": 1.5594826042435612,
"learning_rate": 9.999863585053327e-06,
"loss": 0.1409,
"step": 581
},
{
"epoch": 0.03,
"grad_norm": 1.9152897928877035,
"learning_rate": 9.999856849546757e-06,
"loss": 0.267,
"step": 582
},
{
"epoch": 0.03,
"grad_norm": 1.6390096341345133,
"learning_rate": 9.999849951741955e-06,
"loss": 0.2145,
"step": 583
},
{
"epoch": 0.03,
"grad_norm": 1.903619272035446,
"learning_rate": 9.999842891639151e-06,
"loss": 0.2076,
"step": 584
},
{
"epoch": 0.03,
"grad_norm": 2.055727158980303,
"learning_rate": 9.999835669238571e-06,
"loss": 0.2839,
"step": 585
},
{
"epoch": 0.03,
"grad_norm": 1.464435144020495,
"learning_rate": 9.999828284540451e-06,
"loss": 0.178,
"step": 586
},
{
"epoch": 0.03,
"grad_norm": 1.2701477725215002,
"learning_rate": 9.99982073754503e-06,
"loss": 0.2349,
"step": 587
},
{
"epoch": 0.03,
"grad_norm": 1.1983313414618901,
"learning_rate": 9.999813028252551e-06,
"loss": 0.2407,
"step": 588
},
{
"epoch": 0.03,
"grad_norm": 1.4233421299492355,
"learning_rate": 9.999805156663267e-06,
"loss": 0.2286,
"step": 589
},
{
"epoch": 0.03,
"grad_norm": 1.4703401938433902,
"learning_rate": 9.999797122777433e-06,
"loss": 0.1695,
"step": 590
},
{
"epoch": 0.03,
"grad_norm": 1.2974353988467726,
"learning_rate": 9.999788926595309e-06,
"loss": 0.2403,
"step": 591
},
{
"epoch": 0.03,
"grad_norm": 1.754992726258613,
"learning_rate": 9.99978056811716e-06,
"loss": 0.2765,
"step": 592
},
{
"epoch": 0.03,
"grad_norm": 2.109167652182334,
"learning_rate": 9.999772047343259e-06,
"loss": 0.2608,
"step": 593
},
{
"epoch": 0.03,
"grad_norm": 1.7035748167716884,
"learning_rate": 9.999763364273884e-06,
"loss": 0.2052,
"step": 594
},
{
"epoch": 0.03,
"grad_norm": 1.569435155660177,
"learning_rate": 9.999754518909314e-06,
"loss": 0.2147,
"step": 595
},
{
"epoch": 0.03,
"grad_norm": 1.4123044469514268,
"learning_rate": 9.999745511249837e-06,
"loss": 0.2548,
"step": 596
},
{
"epoch": 0.03,
"grad_norm": 1.2550973923976543,
"learning_rate": 9.999736341295746e-06,
"loss": 0.2043,
"step": 597
},
{
"epoch": 0.03,
"grad_norm": 2.0765398045800105,
"learning_rate": 9.999727009047337e-06,
"loss": 0.2845,
"step": 598
},
{
"epoch": 0.03,
"grad_norm": 2.6420925960269117,
"learning_rate": 9.999717514504917e-06,
"loss": 0.2908,
"step": 599
},
{
"epoch": 0.03,
"grad_norm": 1.5955704665399983,
"learning_rate": 9.99970785766879e-06,
"loss": 0.204,
"step": 600
},
{
"epoch": 0.03,
"grad_norm": 1.9013730574616672,
"learning_rate": 9.99969803853927e-06,
"loss": 0.2616,
"step": 601
},
{
"epoch": 0.03,
"grad_norm": 1.300116200022206,
"learning_rate": 9.999688057116679e-06,
"loss": 0.1658,
"step": 602
},
{
"epoch": 0.03,
"grad_norm": 2.1232774516287902,
"learning_rate": 9.999677913401335e-06,
"loss": 0.2344,
"step": 603
},
{
"epoch": 0.03,
"grad_norm": 1.7319872675478898,
"learning_rate": 9.999667607393576e-06,
"loss": 0.2378,
"step": 604
},
{
"epoch": 0.03,
"grad_norm": 1.4052874396755204,
"learning_rate": 9.999657139093727e-06,
"loss": 0.2119,
"step": 605
},
{
"epoch": 0.03,
"grad_norm": 1.8178640032969893,
"learning_rate": 9.999646508502134e-06,
"loss": 0.2131,
"step": 606
},
{
"epoch": 0.03,
"grad_norm": 2.824809053601352,
"learning_rate": 9.99963571561914e-06,
"loss": 0.2898,
"step": 607
},
{
"epoch": 0.03,
"grad_norm": 18.329759841943982,
"learning_rate": 9.999624760445095e-06,
"loss": 0.3558,
"step": 608
},
{
"epoch": 0.03,
"grad_norm": 4.959488207961797,
"learning_rate": 9.999613642980358e-06,
"loss": 0.2528,
"step": 609
},
{
"epoch": 0.03,
"grad_norm": 10.810279662319726,
"learning_rate": 9.999602363225284e-06,
"loss": 0.2493,
"step": 610
},
{
"epoch": 0.03,
"grad_norm": 2.752698701205404,
"learning_rate": 9.999590921180244e-06,
"loss": 0.276,
"step": 611
},
{
"epoch": 0.03,
"grad_norm": 43.669199514580086,
"learning_rate": 9.999579316845607e-06,
"loss": 0.2679,
"step": 612
},
{
"epoch": 0.03,
"grad_norm": 20.58903500382208,
"learning_rate": 9.99956755022175e-06,
"loss": 0.3086,
"step": 613
},
{
"epoch": 0.03,
"grad_norm": 17.772749385717987,
"learning_rate": 9.999555621309057e-06,
"loss": 0.2392,
"step": 614
},
{
"epoch": 0.03,
"grad_norm": 39.173913440838724,
"learning_rate": 9.999543530107912e-06,
"loss": 0.3374,
"step": 615
},
{
"epoch": 0.03,
"grad_norm": 12.994662509599983,
"learning_rate": 9.99953127661871e-06,
"loss": 0.3013,
"step": 616
},
{
"epoch": 0.03,
"grad_norm": 81.81398140634857,
"learning_rate": 9.999518860841847e-06,
"loss": 0.3234,
"step": 617
},
{
"epoch": 0.03,
"grad_norm": 41.446118041182025,
"learning_rate": 9.999506282777727e-06,
"loss": 0.3536,
"step": 618
},
{
"epoch": 0.03,
"grad_norm": 29.228570897145367,
"learning_rate": 9.999493542426759e-06,
"loss": 0.4052,
"step": 619
},
{
"epoch": 0.03,
"grad_norm": 15.15354187357157,
"learning_rate": 9.999480639789355e-06,
"loss": 0.2878,
"step": 620
},
{
"epoch": 0.03,
"grad_norm": 6.612631786742226,
"learning_rate": 9.999467574865935e-06,
"loss": 0.2893,
"step": 621
},
{
"epoch": 0.03,
"grad_norm": 24.43898009674187,
"learning_rate": 9.999454347656923e-06,
"loss": 0.3064,
"step": 622
},
{
"epoch": 0.03,
"grad_norm": 14.64972989090464,
"learning_rate": 9.999440958162747e-06,
"loss": 0.3189,
"step": 623
},
{
"epoch": 0.03,
"grad_norm": 3.2441817068017054,
"learning_rate": 9.999427406383843e-06,
"loss": 0.2497,
"step": 624
},
{
"epoch": 0.03,
"grad_norm": 6.36137774749413,
"learning_rate": 9.99941369232065e-06,
"loss": 0.304,
"step": 625
},
{
"epoch": 0.03,
"grad_norm": 5.610408422117223,
"learning_rate": 9.999399815973615e-06,
"loss": 0.2792,
"step": 626
},
{
"epoch": 0.03,
"grad_norm": 2.644214887304561,
"learning_rate": 9.999385777343185e-06,
"loss": 0.2389,
"step": 627
},
{
"epoch": 0.03,
"grad_norm": 4.995091961958238,
"learning_rate": 9.999371576429819e-06,
"loss": 0.1999,
"step": 628
},
{
"epoch": 0.03,
"grad_norm": 9.846007654361046,
"learning_rate": 9.999357213233978e-06,
"loss": 0.32,
"step": 629
},
{
"epoch": 0.04,
"grad_norm": 14.346899613459236,
"learning_rate": 9.999342687756126e-06,
"loss": 0.229,
"step": 630
},
{
"epoch": 0.04,
"grad_norm": 1.7492375075104223,
"learning_rate": 9.999327999996735e-06,
"loss": 0.2714,
"step": 631
},
{
"epoch": 0.04,
"grad_norm": 1.713373966378423,
"learning_rate": 9.999313149956282e-06,
"loss": 0.2408,
"step": 632
},
{
"epoch": 0.04,
"grad_norm": 1.5228538250927752,
"learning_rate": 9.99929813763525e-06,
"loss": 0.2474,
"step": 633
},
{
"epoch": 0.04,
"grad_norm": 1.5658819838583038,
"learning_rate": 9.999282963034126e-06,
"loss": 0.2009,
"step": 634
},
{
"epoch": 0.04,
"grad_norm": 1.3916280188471633,
"learning_rate": 9.999267626153401e-06,
"loss": 0.2009,
"step": 635
},
{
"epoch": 0.04,
"grad_norm": 2.7101261122944824,
"learning_rate": 9.999252126993574e-06,
"loss": 0.2652,
"step": 636
},
{
"epoch": 0.04,
"grad_norm": 1.7148812535384157,
"learning_rate": 9.999236465555152e-06,
"loss": 0.2354,
"step": 637
},
{
"epoch": 0.04,
"grad_norm": 2.0234141624607167,
"learning_rate": 9.999220641838637e-06,
"loss": 0.2267,
"step": 638
},
{
"epoch": 0.04,
"grad_norm": 1.3750651300784082,
"learning_rate": 9.999204655844543e-06,
"loss": 0.2218,
"step": 639
},
{
"epoch": 0.04,
"grad_norm": 1.7894919600454031,
"learning_rate": 9.999188507573394e-06,
"loss": 0.2294,
"step": 640
},
{
"epoch": 0.04,
"grad_norm": 1.253131242210937,
"learning_rate": 9.99917219702571e-06,
"loss": 0.2007,
"step": 641
},
{
"epoch": 0.04,
"grad_norm": 1.7178947835267584,
"learning_rate": 9.999155724202022e-06,
"loss": 0.2208,
"step": 642
},
{
"epoch": 0.04,
"grad_norm": 1.4817840701843323,
"learning_rate": 9.999139089102866e-06,
"loss": 0.1922,
"step": 643
},
{
"epoch": 0.04,
"grad_norm": 2.2511555432970494,
"learning_rate": 9.999122291728778e-06,
"loss": 0.2371,
"step": 644
},
{
"epoch": 0.04,
"grad_norm": 1.3644123822672882,
"learning_rate": 9.999105332080306e-06,
"loss": 0.1903,
"step": 645
},
{
"epoch": 0.04,
"grad_norm": 1.2690162200868096,
"learning_rate": 9.999088210158001e-06,
"loss": 0.217,
"step": 646
},
{
"epoch": 0.04,
"grad_norm": 1.7579445153572213,
"learning_rate": 9.999070925962418e-06,
"loss": 0.237,
"step": 647
},
{
"epoch": 0.04,
"grad_norm": 2.7747401066383834,
"learning_rate": 9.999053479494118e-06,
"loss": 0.2233,
"step": 648
},
{
"epoch": 0.04,
"grad_norm": 1.625083673597082,
"learning_rate": 9.999035870753669e-06,
"loss": 0.2504,
"step": 649
},
{
"epoch": 0.04,
"grad_norm": 2.455600206556279,
"learning_rate": 9.99901809974164e-06,
"loss": 0.3197,
"step": 650
},
{
"epoch": 0.04,
"grad_norm": 1.3696478585684608,
"learning_rate": 9.999000166458607e-06,
"loss": 0.1852,
"step": 651
},
{
"epoch": 0.04,
"grad_norm": 2.9877473640982473,
"learning_rate": 9.998982070905155e-06,
"loss": 0.2738,
"step": 652
},
{
"epoch": 0.04,
"grad_norm": 1.5267221376121667,
"learning_rate": 9.99896381308187e-06,
"loss": 0.2145,
"step": 653
},
{
"epoch": 0.04,
"grad_norm": 2.62593829324627,
"learning_rate": 9.998945392989347e-06,
"loss": 0.2533,
"step": 654
},
{
"epoch": 0.04,
"grad_norm": 2.001065789798318,
"learning_rate": 9.99892681062818e-06,
"loss": 0.2617,
"step": 655
},
{
"epoch": 0.04,
"grad_norm": 2.603263756945268,
"learning_rate": 9.998908065998975e-06,
"loss": 0.2039,
"step": 656
},
{
"epoch": 0.04,
"grad_norm": 1.956740344387672,
"learning_rate": 9.998889159102339e-06,
"loss": 0.2122,
"step": 657
},
{
"epoch": 0.04,
"grad_norm": 1.584203293391321,
"learning_rate": 9.998870089938886e-06,
"loss": 0.1865,
"step": 658
},
{
"epoch": 0.04,
"grad_norm": 2.053990771737211,
"learning_rate": 9.998850858509237e-06,
"loss": 0.2554,
"step": 659
},
{
"epoch": 0.04,
"grad_norm": 2.405207222241233,
"learning_rate": 9.998831464814014e-06,
"loss": 0.254,
"step": 660
},
{
"epoch": 0.04,
"grad_norm": 2.808261957851011,
"learning_rate": 9.998811908853847e-06,
"loss": 0.2548,
"step": 661
},
{
"epoch": 0.04,
"grad_norm": 1.5276001400400812,
"learning_rate": 9.998792190629369e-06,
"loss": 0.2019,
"step": 662
},
{
"epoch": 0.04,
"grad_norm": 2.2381691796438554,
"learning_rate": 9.998772310141225e-06,
"loss": 0.2737,
"step": 663
},
{
"epoch": 0.04,
"grad_norm": 2.413871066209702,
"learning_rate": 9.998752267390055e-06,
"loss": 0.2401,
"step": 664
},
{
"epoch": 0.04,
"grad_norm": 1.229324476334034,
"learning_rate": 9.998732062376513e-06,
"loss": 0.1934,
"step": 665
},
{
"epoch": 0.04,
"grad_norm": 1.7869477329215326,
"learning_rate": 9.998711695101254e-06,
"loss": 0.2307,
"step": 666
},
{
"epoch": 0.04,
"grad_norm": 1.2130204037635122,
"learning_rate": 9.998691165564938e-06,
"loss": 0.222,
"step": 667
},
{
"epoch": 0.04,
"grad_norm": 1.5596636826017594,
"learning_rate": 9.998670473768232e-06,
"loss": 0.2044,
"step": 668
},
{
"epoch": 0.04,
"grad_norm": 1.3948911007574005,
"learning_rate": 9.998649619711808e-06,
"loss": 0.2502,
"step": 669
},
{
"epoch": 0.04,
"grad_norm": 2.359095121040132,
"learning_rate": 9.998628603396345e-06,
"loss": 0.2524,
"step": 670
},
{
"epoch": 0.04,
"grad_norm": 1.7274161942575648,
"learning_rate": 9.998607424822522e-06,
"loss": 0.2523,
"step": 671
},
{
"epoch": 0.04,
"grad_norm": 3.43300454353923,
"learning_rate": 9.998586083991028e-06,
"loss": 0.2306,
"step": 672
},
{
"epoch": 0.04,
"grad_norm": 2.1139468627339784,
"learning_rate": 9.998564580902555e-06,
"loss": 0.2472,
"step": 673
},
{
"epoch": 0.04,
"grad_norm": 2.1596330005523834,
"learning_rate": 9.9985429155578e-06,
"loss": 0.2363,
"step": 674
},
{
"epoch": 0.04,
"grad_norm": 2.280091409544863,
"learning_rate": 9.998521087957468e-06,
"loss": 0.2438,
"step": 675
},
{
"epoch": 0.04,
"grad_norm": 1.9700134628218318,
"learning_rate": 9.998499098102268e-06,
"loss": 0.2778,
"step": 676
},
{
"epoch": 0.04,
"grad_norm": 2.06506438214983,
"learning_rate": 9.998476945992913e-06,
"loss": 0.2987,
"step": 677
},
{
"epoch": 0.04,
"grad_norm": 1.8740205073967353,
"learning_rate": 9.998454631630123e-06,
"loss": 0.2418,
"step": 678
},
{
"epoch": 0.04,
"grad_norm": 1.8872574773562758,
"learning_rate": 9.998432155014619e-06,
"loss": 0.2837,
"step": 679
},
{
"epoch": 0.04,
"grad_norm": 1.4657243828131423,
"learning_rate": 9.998409516147135e-06,
"loss": 0.1898,
"step": 680
},
{
"epoch": 0.04,
"grad_norm": 2.3584560221400532,
"learning_rate": 9.998386715028403e-06,
"loss": 0.2326,
"step": 681
},
{
"epoch": 0.04,
"grad_norm": 1.8820926350629803,
"learning_rate": 9.998363751659163e-06,
"loss": 0.1695,
"step": 682
},
{
"epoch": 0.04,
"grad_norm": 2.9141438654363108,
"learning_rate": 9.998340626040162e-06,
"loss": 0.2036,
"step": 683
},
{
"epoch": 0.04,
"grad_norm": 1.7084189934789658,
"learning_rate": 9.99831733817215e-06,
"loss": 0.2271,
"step": 684
},
{
"epoch": 0.04,
"grad_norm": 1.3658176248622982,
"learning_rate": 9.998293888055882e-06,
"loss": 0.2688,
"step": 685
},
{
"epoch": 0.04,
"grad_norm": 1.4298539742233574,
"learning_rate": 9.998270275692123e-06,
"loss": 0.2012,
"step": 686
},
{
"epoch": 0.04,
"grad_norm": 2.1142214501053354,
"learning_rate": 9.998246501081635e-06,
"loss": 0.2675,
"step": 687
},
{
"epoch": 0.04,
"grad_norm": 1.7491912345971081,
"learning_rate": 9.998222564225191e-06,
"loss": 0.2313,
"step": 688
},
{
"epoch": 0.04,
"grad_norm": 1.9556946882636561,
"learning_rate": 9.998198465123569e-06,
"loss": 0.2405,
"step": 689
},
{
"epoch": 0.04,
"grad_norm": 1.733282763422793,
"learning_rate": 9.998174203777549e-06,
"loss": 0.2521,
"step": 690
},
{
"epoch": 0.04,
"grad_norm": 1.5037118810910108,
"learning_rate": 9.998149780187922e-06,
"loss": 0.1822,
"step": 691
},
{
"epoch": 0.04,
"grad_norm": 2.9634766172075806,
"learning_rate": 9.998125194355479e-06,
"loss": 0.2111,
"step": 692
},
{
"epoch": 0.04,
"grad_norm": 1.5280912623747476,
"learning_rate": 9.998100446281018e-06,
"loss": 0.2512,
"step": 693
},
{
"epoch": 0.04,
"grad_norm": 1.7715209591251764,
"learning_rate": 9.998075535965342e-06,
"loss": 0.2257,
"step": 694
},
{
"epoch": 0.04,
"grad_norm": 1.9308633117820866,
"learning_rate": 9.998050463409259e-06,
"loss": 0.2138,
"step": 695
},
{
"epoch": 0.04,
"grad_norm": 1.2120358669477806,
"learning_rate": 9.998025228613586e-06,
"loss": 0.2171,
"step": 696
},
{
"epoch": 0.04,
"grad_norm": 3.948725910167626,
"learning_rate": 9.997999831579138e-06,
"loss": 0.2203,
"step": 697
},
{
"epoch": 0.04,
"grad_norm": 1.3921277895023731,
"learning_rate": 9.997974272306742e-06,
"loss": 0.2406,
"step": 698
},
{
"epoch": 0.04,
"grad_norm": 1.2562402812026698,
"learning_rate": 9.997948550797227e-06,
"loss": 0.165,
"step": 699
},
{
"epoch": 0.04,
"grad_norm": 1.610502840874703,
"learning_rate": 9.997922667051429e-06,
"loss": 0.2431,
"step": 700
},
{
"epoch": 0.04,
"grad_norm": 1.276328494150188,
"learning_rate": 9.997896621070187e-06,
"loss": 0.1982,
"step": 701
},
{
"epoch": 0.04,
"grad_norm": 2.1114722669380392,
"learning_rate": 9.997870412854347e-06,
"loss": 0.2066,
"step": 702
},
{
"epoch": 0.04,
"grad_norm": 2.78201244918868,
"learning_rate": 9.997844042404758e-06,
"loss": 0.2583,
"step": 703
},
{
"epoch": 0.04,
"grad_norm": 1.535084620835797,
"learning_rate": 9.997817509722279e-06,
"loss": 0.2257,
"step": 704
},
{
"epoch": 0.04,
"grad_norm": 1.8874994325321177,
"learning_rate": 9.997790814807769e-06,
"loss": 0.2009,
"step": 705
},
{
"epoch": 0.04,
"grad_norm": 1.6047968049973862,
"learning_rate": 9.997763957662094e-06,
"loss": 0.2602,
"step": 706
},
{
"epoch": 0.04,
"grad_norm": 1.7570357757703434,
"learning_rate": 9.997736938286129e-06,
"loss": 0.2485,
"step": 707
},
{
"epoch": 0.04,
"grad_norm": 3.2229389968292472,
"learning_rate": 9.997709756680746e-06,
"loss": 0.2572,
"step": 708
},
{
"epoch": 0.04,
"grad_norm": 1.8492538247369952,
"learning_rate": 9.997682412846835e-06,
"loss": 0.2053,
"step": 709
},
{
"epoch": 0.04,
"grad_norm": 1.7469708897898746,
"learning_rate": 9.997654906785274e-06,
"loss": 0.2146,
"step": 710
},
{
"epoch": 0.04,
"grad_norm": 2.2291057918531525,
"learning_rate": 9.997627238496964e-06,
"loss": 0.2099,
"step": 711
},
{
"epoch": 0.04,
"grad_norm": 1.3402541063235494,
"learning_rate": 9.9975994079828e-06,
"loss": 0.1807,
"step": 712
},
{
"epoch": 0.04,
"grad_norm": 1.922145502701153,
"learning_rate": 9.997571415243684e-06,
"loss": 0.236,
"step": 713
},
{
"epoch": 0.04,
"grad_norm": 1.4461143048070895,
"learning_rate": 9.997543260280527e-06,
"loss": 0.1837,
"step": 714
},
{
"epoch": 0.04,
"grad_norm": 1.3119016202921865,
"learning_rate": 9.997514943094243e-06,
"loss": 0.2211,
"step": 715
},
{
"epoch": 0.04,
"grad_norm": 5.0686545931001525,
"learning_rate": 9.997486463685748e-06,
"loss": 0.1679,
"step": 716
},
{
"epoch": 0.04,
"grad_norm": 2.8344728665347683,
"learning_rate": 9.99745782205597e-06,
"loss": 0.2857,
"step": 717
},
{
"epoch": 0.04,
"grad_norm": 2.4148963426375163,
"learning_rate": 9.997429018205838e-06,
"loss": 0.246,
"step": 718
},
{
"epoch": 0.04,
"grad_norm": 2.3087572791826942,
"learning_rate": 9.997400052136284e-06,
"loss": 0.2206,
"step": 719
},
{
"epoch": 0.04,
"grad_norm": 1.8846658472718998,
"learning_rate": 9.997370923848253e-06,
"loss": 0.2204,
"step": 720
},
{
"epoch": 0.04,
"grad_norm": 2.4933567951325903,
"learning_rate": 9.997341633342685e-06,
"loss": 0.1732,
"step": 721
},
{
"epoch": 0.04,
"grad_norm": 1.4171284742165549,
"learning_rate": 9.997312180620538e-06,
"loss": 0.1865,
"step": 722
},
{
"epoch": 0.04,
"grad_norm": 1.7608028746274034,
"learning_rate": 9.99728256568276e-06,
"loss": 0.2024,
"step": 723
},
{
"epoch": 0.04,
"grad_norm": 2.12302828221747,
"learning_rate": 9.997252788530318e-06,
"loss": 0.2497,
"step": 724
},
{
"epoch": 0.04,
"grad_norm": 26.926509571172836,
"learning_rate": 9.997222849164179e-06,
"loss": 0.2529,
"step": 725
},
{
"epoch": 0.04,
"grad_norm": 9.988579785507007,
"learning_rate": 9.997192747585309e-06,
"loss": 0.2325,
"step": 726
},
{
"epoch": 0.04,
"grad_norm": 2.073780616996417,
"learning_rate": 9.997162483794692e-06,
"loss": 0.2371,
"step": 727
},
{
"epoch": 0.04,
"grad_norm": 83.88128606717183,
"learning_rate": 9.997132057793306e-06,
"loss": 0.2815,
"step": 728
},
{
"epoch": 0.04,
"grad_norm": 1.6923507763339276,
"learning_rate": 9.99710146958214e-06,
"loss": 0.205,
"step": 729
},
{
"epoch": 0.04,
"grad_norm": 1.6437592540596508,
"learning_rate": 9.997070719162185e-06,
"loss": 0.2277,
"step": 730
},
{
"epoch": 0.04,
"grad_norm": 2.1464775022860074,
"learning_rate": 9.997039806534441e-06,
"loss": 0.2652,
"step": 731
},
{
"epoch": 0.04,
"grad_norm": 1.7918314370547077,
"learning_rate": 9.997008731699914e-06,
"loss": 0.1765,
"step": 732
},
{
"epoch": 0.04,
"grad_norm": 1.1126204754928446,
"learning_rate": 9.99697749465961e-06,
"loss": 0.2301,
"step": 733
},
{
"epoch": 0.04,
"grad_norm": 1.9214132517978426,
"learning_rate": 9.99694609541454e-06,
"loss": 0.2226,
"step": 734
},
{
"epoch": 0.04,
"grad_norm": 1.3356586968469206,
"learning_rate": 9.996914533965729e-06,
"loss": 0.2596,
"step": 735
},
{
"epoch": 0.04,
"grad_norm": 2.049688458611854,
"learning_rate": 9.996882810314198e-06,
"loss": 0.2259,
"step": 736
},
{
"epoch": 0.04,
"grad_norm": 2.5737350773575582,
"learning_rate": 9.996850924460977e-06,
"loss": 0.2308,
"step": 737
},
{
"epoch": 0.04,
"grad_norm": 1.5406058289958093,
"learning_rate": 9.996818876407101e-06,
"loss": 0.2375,
"step": 738
},
{
"epoch": 0.04,
"grad_norm": 1.538738234726375,
"learning_rate": 9.996786666153612e-06,
"loss": 0.2012,
"step": 739
},
{
"epoch": 0.04,
"grad_norm": 1.453470927255759,
"learning_rate": 9.996754293701556e-06,
"loss": 0.2343,
"step": 740
},
{
"epoch": 0.04,
"grad_norm": 1.6459359356329737,
"learning_rate": 9.99672175905198e-06,
"loss": 0.2573,
"step": 741
},
{
"epoch": 0.04,
"grad_norm": 1.7620574473342865,
"learning_rate": 9.996689062205942e-06,
"loss": 0.2457,
"step": 742
},
{
"epoch": 0.04,
"grad_norm": 1.7957641319017301,
"learning_rate": 9.996656203164505e-06,
"loss": 0.2719,
"step": 743
},
{
"epoch": 0.04,
"grad_norm": 1.5850265357732467,
"learning_rate": 9.996623181928735e-06,
"loss": 0.2108,
"step": 744
},
{
"epoch": 0.04,
"grad_norm": 6.063883358148109,
"learning_rate": 9.996589998499702e-06,
"loss": 0.2384,
"step": 745
},
{
"epoch": 0.04,
"grad_norm": 1.7691083322695482,
"learning_rate": 9.996556652878486e-06,
"loss": 0.233,
"step": 746
},
{
"epoch": 0.04,
"grad_norm": 1.8960295111073695,
"learning_rate": 9.996523145066165e-06,
"loss": 0.1974,
"step": 747
},
{
"epoch": 0.04,
"grad_norm": 1.6525652491229723,
"learning_rate": 9.996489475063832e-06,
"loss": 0.2616,
"step": 748
},
{
"epoch": 0.04,
"grad_norm": 1.6148126159936937,
"learning_rate": 9.996455642872578e-06,
"loss": 0.1963,
"step": 749
},
{
"epoch": 0.04,
"grad_norm": 1.7864077623918613,
"learning_rate": 9.996421648493499e-06,
"loss": 0.1906,
"step": 750
},
{
"epoch": 0.04,
"grad_norm": 1.830951504661101,
"learning_rate": 9.996387491927699e-06,
"loss": 0.251,
"step": 751
},
{
"epoch": 0.04,
"grad_norm": 1.1138158384711583,
"learning_rate": 9.99635317317629e-06,
"loss": 0.1347,
"step": 752
},
{
"epoch": 0.04,
"grad_norm": 1.9654294424616947,
"learning_rate": 9.996318692240384e-06,
"loss": 0.2566,
"step": 753
},
{
"epoch": 0.04,
"grad_norm": 1.4760254016795331,
"learning_rate": 9.996284049121098e-06,
"loss": 0.2391,
"step": 754
},
{
"epoch": 0.04,
"grad_norm": 1.5792199953262465,
"learning_rate": 9.99624924381956e-06,
"loss": 0.2341,
"step": 755
},
{
"epoch": 0.04,
"grad_norm": 9.185073220631361,
"learning_rate": 9.9962142763369e-06,
"loss": 0.2156,
"step": 756
},
{
"epoch": 0.04,
"grad_norm": 2.3936048959223153,
"learning_rate": 9.99617914667425e-06,
"loss": 0.263,
"step": 757
},
{
"epoch": 0.04,
"grad_norm": 1.7616782892152645,
"learning_rate": 9.996143854832752e-06,
"loss": 0.2508,
"step": 758
},
{
"epoch": 0.04,
"grad_norm": 1.72638969937547,
"learning_rate": 9.99610840081355e-06,
"loss": 0.2191,
"step": 759
},
{
"epoch": 0.04,
"grad_norm": 2.205201257894953,
"learning_rate": 9.996072784617799e-06,
"loss": 0.2754,
"step": 760
},
{
"epoch": 0.04,
"grad_norm": 2.3208722296787774,
"learning_rate": 9.996037006246651e-06,
"loss": 0.2637,
"step": 761
},
{
"epoch": 0.04,
"grad_norm": 1.4094917998582825,
"learning_rate": 9.99600106570127e-06,
"loss": 0.1922,
"step": 762
},
{
"epoch": 0.04,
"grad_norm": 1.4576681294561826,
"learning_rate": 9.99596496298282e-06,
"loss": 0.212,
"step": 763
},
{
"epoch": 0.04,
"grad_norm": 4.372567508758709,
"learning_rate": 9.995928698092475e-06,
"loss": 0.1416,
"step": 764
},
{
"epoch": 0.04,
"grad_norm": 1.5621205047855777,
"learning_rate": 9.99589227103141e-06,
"loss": 0.171,
"step": 765
},
{
"epoch": 0.04,
"grad_norm": 1.4635162543628049,
"learning_rate": 9.995855681800813e-06,
"loss": 0.241,
"step": 766
},
{
"epoch": 0.04,
"grad_norm": 2.308129291318679,
"learning_rate": 9.995818930401865e-06,
"loss": 0.252,
"step": 767
},
{
"epoch": 0.04,
"grad_norm": 1.6108515734099502,
"learning_rate": 9.995782016835762e-06,
"loss": 0.2097,
"step": 768
},
{
"epoch": 0.04,
"grad_norm": 1.5092952335991863,
"learning_rate": 9.995744941103702e-06,
"loss": 0.2199,
"step": 769
},
{
"epoch": 0.04,
"grad_norm": 1.5698018121380966,
"learning_rate": 9.995707703206889e-06,
"loss": 0.2054,
"step": 770
},
{
"epoch": 0.04,
"grad_norm": 2.0919332949536926,
"learning_rate": 9.99567030314653e-06,
"loss": 0.2476,
"step": 771
},
{
"epoch": 0.04,
"grad_norm": 1.7983041221681502,
"learning_rate": 9.995632740923841e-06,
"loss": 0.2442,
"step": 772
},
{
"epoch": 0.04,
"grad_norm": 1.8123833026807628,
"learning_rate": 9.99559501654004e-06,
"loss": 0.2489,
"step": 773
},
{
"epoch": 0.04,
"grad_norm": 1.4794587109495483,
"learning_rate": 9.995557129996354e-06,
"loss": 0.2479,
"step": 774
},
{
"epoch": 0.04,
"grad_norm": 2.0845419800043787,
"learning_rate": 9.995519081294009e-06,
"loss": 0.2502,
"step": 775
},
{
"epoch": 0.04,
"grad_norm": 2.218294768577398,
"learning_rate": 9.995480870434241e-06,
"loss": 0.2272,
"step": 776
},
{
"epoch": 0.04,
"grad_norm": 1.3143656873032976,
"learning_rate": 9.995442497418295e-06,
"loss": 0.2182,
"step": 777
},
{
"epoch": 0.04,
"grad_norm": 2.029352181849417,
"learning_rate": 9.995403962247411e-06,
"loss": 0.2909,
"step": 778
},
{
"epoch": 0.04,
"grad_norm": 1.0203498024423483,
"learning_rate": 9.995365264922841e-06,
"loss": 0.2256,
"step": 779
},
{
"epoch": 0.04,
"grad_norm": 2.1115985241987407,
"learning_rate": 9.995326405445842e-06,
"loss": 0.2991,
"step": 780
},
{
"epoch": 0.04,
"grad_norm": 2.0550399013924285,
"learning_rate": 9.995287383817676e-06,
"loss": 0.2447,
"step": 781
},
{
"epoch": 0.04,
"grad_norm": 1.2567733646813406,
"learning_rate": 9.99524820003961e-06,
"loss": 0.1906,
"step": 782
},
{
"epoch": 0.04,
"grad_norm": 1.5716726618425232,
"learning_rate": 9.995208854112915e-06,
"loss": 0.2231,
"step": 783
},
{
"epoch": 0.04,
"grad_norm": 1.1434299163118575,
"learning_rate": 9.995169346038867e-06,
"loss": 0.2275,
"step": 784
},
{
"epoch": 0.04,
"grad_norm": 1.7054133877612792,
"learning_rate": 9.995129675818748e-06,
"loss": 0.2149,
"step": 785
},
{
"epoch": 0.04,
"grad_norm": 1.6301066003282731,
"learning_rate": 9.995089843453849e-06,
"loss": 0.2611,
"step": 786
},
{
"epoch": 0.04,
"grad_norm": 1.9479567545745378,
"learning_rate": 9.995049848945462e-06,
"loss": 0.2367,
"step": 787
},
{
"epoch": 0.04,
"grad_norm": 1.7888615018469156,
"learning_rate": 9.995009692294884e-06,
"loss": 0.2558,
"step": 788
},
{
"epoch": 0.04,
"grad_norm": 1.1355459858878072,
"learning_rate": 9.994969373503419e-06,
"loss": 0.1715,
"step": 789
},
{
"epoch": 0.04,
"grad_norm": 1.4047483851102234,
"learning_rate": 9.994928892572376e-06,
"loss": 0.187,
"step": 790
},
{
"epoch": 0.04,
"grad_norm": 1.3005230908061722,
"learning_rate": 9.994888249503069e-06,
"loss": 0.2253,
"step": 791
},
{
"epoch": 0.04,
"grad_norm": 1.3781808222820444,
"learning_rate": 9.994847444296816e-06,
"loss": 0.2364,
"step": 792
},
{
"epoch": 0.04,
"grad_norm": 1.1397672458882684,
"learning_rate": 9.994806476954943e-06,
"loss": 0.1481,
"step": 793
},
{
"epoch": 0.04,
"grad_norm": 1.147034264635771,
"learning_rate": 9.99476534747878e-06,
"loss": 0.2296,
"step": 794
},
{
"epoch": 0.04,
"grad_norm": 1.2264614868482613,
"learning_rate": 9.994724055869663e-06,
"loss": 0.2105,
"step": 795
},
{
"epoch": 0.04,
"grad_norm": 1.2980497193169003,
"learning_rate": 9.994682602128929e-06,
"loss": 0.2495,
"step": 796
},
{
"epoch": 0.04,
"grad_norm": 1.2960108727647346,
"learning_rate": 9.994640986257927e-06,
"loss": 0.1762,
"step": 797
},
{
"epoch": 0.04,
"grad_norm": 1.6657532801801207,
"learning_rate": 9.994599208258005e-06,
"loss": 0.2725,
"step": 798
},
{
"epoch": 0.04,
"grad_norm": 1.3637181759625494,
"learning_rate": 9.99455726813052e-06,
"loss": 0.281,
"step": 799
},
{
"epoch": 0.04,
"grad_norm": 1.004064282176184,
"learning_rate": 9.994515165876837e-06,
"loss": 0.1306,
"step": 800
},
{
"epoch": 0.04,
"grad_norm": 1.160079748026799,
"learning_rate": 9.994472901498318e-06,
"loss": 0.2201,
"step": 801
},
{
"epoch": 0.04,
"grad_norm": 1.6866050484001025,
"learning_rate": 9.994430474996337e-06,
"loss": 0.1441,
"step": 802
},
{
"epoch": 0.04,
"grad_norm": 1.2357125188438258,
"learning_rate": 9.994387886372269e-06,
"loss": 0.213,
"step": 803
},
{
"epoch": 0.04,
"grad_norm": 1.3157086179306139,
"learning_rate": 9.994345135627498e-06,
"loss": 0.1935,
"step": 804
},
{
"epoch": 0.04,
"grad_norm": 2.088398720819187,
"learning_rate": 9.994302222763415e-06,
"loss": 0.2422,
"step": 805
},
{
"epoch": 0.04,
"grad_norm": 1.501032900823252,
"learning_rate": 9.99425914778141e-06,
"loss": 0.2431,
"step": 806
},
{
"epoch": 0.04,
"grad_norm": 1.9161484426335658,
"learning_rate": 9.994215910682878e-06,
"loss": 0.2639,
"step": 807
},
{
"epoch": 0.04,
"grad_norm": 1.4251877267893638,
"learning_rate": 9.994172511469227e-06,
"loss": 0.1807,
"step": 808
},
{
"epoch": 0.05,
"grad_norm": 1.585305987844208,
"learning_rate": 9.994128950141866e-06,
"loss": 0.1572,
"step": 809
},
{
"epoch": 0.05,
"grad_norm": 1.568296769722774,
"learning_rate": 9.994085226702206e-06,
"loss": 0.218,
"step": 810
},
{
"epoch": 0.05,
"grad_norm": 4.279281497790149,
"learning_rate": 9.994041341151667e-06,
"loss": 0.2671,
"step": 811
},
{
"epoch": 0.05,
"grad_norm": 1.6708094329583227,
"learning_rate": 9.993997293491674e-06,
"loss": 0.2519,
"step": 812
},
{
"epoch": 0.05,
"grad_norm": 1.8452941885547565,
"learning_rate": 9.99395308372366e-06,
"loss": 0.2501,
"step": 813
},
{
"epoch": 0.05,
"grad_norm": 1.313985562878818,
"learning_rate": 9.993908711849055e-06,
"loss": 0.2488,
"step": 814
},
{
"epoch": 0.05,
"grad_norm": 1.329512446431142,
"learning_rate": 9.993864177869302e-06,
"loss": 0.2596,
"step": 815
},
{
"epoch": 0.05,
"grad_norm": 1.2622987695407983,
"learning_rate": 9.993819481785846e-06,
"loss": 0.1941,
"step": 816
},
{
"epoch": 0.05,
"grad_norm": 1.629491924865182,
"learning_rate": 9.993774623600138e-06,
"loss": 0.129,
"step": 817
},
{
"epoch": 0.05,
"grad_norm": 0.8711916222778915,
"learning_rate": 9.993729603313633e-06,
"loss": 0.2262,
"step": 818
},
{
"epoch": 0.05,
"grad_norm": 1.445170470295729,
"learning_rate": 9.993684420927795e-06,
"loss": 0.2237,
"step": 819
},
{
"epoch": 0.05,
"grad_norm": 2.5140740488568305,
"learning_rate": 9.993639076444089e-06,
"loss": 0.3494,
"step": 820
},
{
"epoch": 0.05,
"grad_norm": 1.9510408076807002,
"learning_rate": 9.993593569863987e-06,
"loss": 0.2524,
"step": 821
},
{
"epoch": 0.05,
"grad_norm": 1.8622112240714863,
"learning_rate": 9.993547901188966e-06,
"loss": 0.2383,
"step": 822
},
{
"epoch": 0.05,
"grad_norm": 1.7631780539261597,
"learning_rate": 9.993502070420509e-06,
"loss": 0.2936,
"step": 823
},
{
"epoch": 0.05,
"grad_norm": 2.1521535190488015,
"learning_rate": 9.993456077560101e-06,
"loss": 0.2377,
"step": 824
},
{
"epoch": 0.05,
"grad_norm": 1.4783259714859684,
"learning_rate": 9.993409922609241e-06,
"loss": 0.215,
"step": 825
},
{
"epoch": 0.05,
"grad_norm": 2.625916769383279,
"learning_rate": 9.993363605569421e-06,
"loss": 0.3517,
"step": 826
},
{
"epoch": 0.05,
"grad_norm": 1.901423682190051,
"learning_rate": 9.993317126442149e-06,
"loss": 0.2951,
"step": 827
},
{
"epoch": 0.05,
"grad_norm": 1.843750507144535,
"learning_rate": 9.993270485228929e-06,
"loss": 0.2428,
"step": 828
},
{
"epoch": 0.05,
"grad_norm": 1.410532408497231,
"learning_rate": 9.993223681931278e-06,
"loss": 0.2409,
"step": 829
},
{
"epoch": 0.05,
"grad_norm": 1.3757206981467418,
"learning_rate": 9.993176716550717e-06,
"loss": 0.2084,
"step": 830
},
{
"epoch": 0.05,
"grad_norm": 1.3048156115741543,
"learning_rate": 9.993129589088769e-06,
"loss": 0.2426,
"step": 831
},
{
"epoch": 0.05,
"grad_norm": 1.866408569674609,
"learning_rate": 9.993082299546961e-06,
"loss": 0.2548,
"step": 832
},
{
"epoch": 0.05,
"grad_norm": 1.7268773095065082,
"learning_rate": 9.99303484792683e-06,
"loss": 0.217,
"step": 833
},
{
"epoch": 0.05,
"grad_norm": 1.6728730899029136,
"learning_rate": 9.992987234229918e-06,
"loss": 0.211,
"step": 834
},
{
"epoch": 0.05,
"grad_norm": 1.4376750015422304,
"learning_rate": 9.992939458457769e-06,
"loss": 0.2419,
"step": 835
},
{
"epoch": 0.05,
"grad_norm": 0.9700654850317781,
"learning_rate": 9.992891520611934e-06,
"loss": 0.203,
"step": 836
},
{
"epoch": 0.05,
"grad_norm": 1.5474833703380089,
"learning_rate": 9.99284342069397e-06,
"loss": 0.1894,
"step": 837
},
{
"epoch": 0.05,
"grad_norm": 1.2430915601614154,
"learning_rate": 9.992795158705434e-06,
"loss": 0.229,
"step": 838
},
{
"epoch": 0.05,
"grad_norm": 1.1066378706181776,
"learning_rate": 9.9927467346479e-06,
"loss": 0.138,
"step": 839
},
{
"epoch": 0.05,
"grad_norm": 1.2442485052912813,
"learning_rate": 9.992698148522934e-06,
"loss": 0.2188,
"step": 840
},
{
"epoch": 0.05,
"grad_norm": 1.503172641026925,
"learning_rate": 9.992649400332117e-06,
"loss": 0.1977,
"step": 841
},
{
"epoch": 0.05,
"grad_norm": 1.3095615053840808,
"learning_rate": 9.992600490077027e-06,
"loss": 0.2181,
"step": 842
},
{
"epoch": 0.05,
"grad_norm": 1.4209163953033057,
"learning_rate": 9.992551417759256e-06,
"loss": 0.2142,
"step": 843
},
{
"epoch": 0.05,
"grad_norm": 1.8985850478946014,
"learning_rate": 9.992502183380395e-06,
"loss": 0.231,
"step": 844
},
{
"epoch": 0.05,
"grad_norm": 1.4331381850919158,
"learning_rate": 9.992452786942041e-06,
"loss": 0.2471,
"step": 845
},
{
"epoch": 0.05,
"grad_norm": 1.4496724917035175,
"learning_rate": 9.9924032284458e-06,
"loss": 0.1898,
"step": 846
},
{
"epoch": 0.05,
"grad_norm": 1.0682342637360658,
"learning_rate": 9.992353507893279e-06,
"loss": 0.1589,
"step": 847
},
{
"epoch": 0.05,
"grad_norm": 1.811711592041193,
"learning_rate": 9.992303625286092e-06,
"loss": 0.2533,
"step": 848
},
{
"epoch": 0.05,
"grad_norm": 1.5553208671215994,
"learning_rate": 9.992253580625858e-06,
"loss": 0.2926,
"step": 849
},
{
"epoch": 0.05,
"grad_norm": 1.3665783765002752,
"learning_rate": 9.992203373914203e-06,
"loss": 0.2051,
"step": 850
},
{
"epoch": 0.05,
"grad_norm": 2.278038370450238,
"learning_rate": 9.992153005152754e-06,
"loss": 0.2622,
"step": 851
},
{
"epoch": 0.05,
"grad_norm": 2.244775285609426,
"learning_rate": 9.99210247434315e-06,
"loss": 0.2412,
"step": 852
},
{
"epoch": 0.05,
"grad_norm": 1.460435046526099,
"learning_rate": 9.992051781487029e-06,
"loss": 0.1935,
"step": 853
},
{
"epoch": 0.05,
"grad_norm": 1.3709198713766988,
"learning_rate": 9.992000926586035e-06,
"loss": 0.1716,
"step": 854
},
{
"epoch": 0.05,
"grad_norm": 1.486706223187014,
"learning_rate": 9.99194990964182e-06,
"loss": 0.2501,
"step": 855
},
{
"epoch": 0.05,
"grad_norm": 1.5583334970814147,
"learning_rate": 9.991898730656043e-06,
"loss": 0.1993,
"step": 856
},
{
"epoch": 0.05,
"grad_norm": 1.437353572436606,
"learning_rate": 9.99184738963036e-06,
"loss": 0.2311,
"step": 857
},
{
"epoch": 0.05,
"grad_norm": 1.677794414329712,
"learning_rate": 9.991795886566443e-06,
"loss": 0.2295,
"step": 858
},
{
"epoch": 0.05,
"grad_norm": 1.9166710040874402,
"learning_rate": 9.991744221465958e-06,
"loss": 0.2475,
"step": 859
},
{
"epoch": 0.05,
"grad_norm": 1.8914227024451598,
"learning_rate": 9.991692394330586e-06,
"loss": 0.2385,
"step": 860
},
{
"epoch": 0.05,
"grad_norm": 1.3891599678615987,
"learning_rate": 9.991640405162009e-06,
"loss": 0.2042,
"step": 861
},
{
"epoch": 0.05,
"grad_norm": 4.536812150590555,
"learning_rate": 9.991588253961914e-06,
"loss": 0.2438,
"step": 862
},
{
"epoch": 0.05,
"grad_norm": 1.8795405722864267,
"learning_rate": 9.991535940731993e-06,
"loss": 0.2164,
"step": 863
},
{
"epoch": 0.05,
"grad_norm": 1.8151272218590928,
"learning_rate": 9.991483465473945e-06,
"loss": 0.2886,
"step": 864
},
{
"epoch": 0.05,
"grad_norm": 1.9047040318216781,
"learning_rate": 9.991430828189476e-06,
"loss": 0.218,
"step": 865
},
{
"epoch": 0.05,
"grad_norm": 1.5089580256101758,
"learning_rate": 9.99137802888029e-06,
"loss": 0.238,
"step": 866
},
{
"epoch": 0.05,
"grad_norm": 1.5526845456171137,
"learning_rate": 9.991325067548103e-06,
"loss": 0.2379,
"step": 867
},
{
"epoch": 0.05,
"grad_norm": 1.8013273763946662,
"learning_rate": 9.991271944194634e-06,
"loss": 0.2145,
"step": 868
},
{
"epoch": 0.05,
"grad_norm": 1.8404494582374002,
"learning_rate": 9.991218658821609e-06,
"loss": 0.1939,
"step": 869
},
{
"epoch": 0.05,
"grad_norm": 2.08758554825834,
"learning_rate": 9.991165211430753e-06,
"loss": 0.2482,
"step": 870
},
{
"epoch": 0.05,
"grad_norm": 3.7042416204901096,
"learning_rate": 9.991111602023807e-06,
"loss": 0.2697,
"step": 871
},
{
"epoch": 0.05,
"grad_norm": 1.766875578430197,
"learning_rate": 9.991057830602508e-06,
"loss": 0.2023,
"step": 872
},
{
"epoch": 0.05,
"grad_norm": 2.1795710221482967,
"learning_rate": 9.9910038971686e-06,
"loss": 0.2272,
"step": 873
},
{
"epoch": 0.05,
"grad_norm": 1.6079584572606918,
"learning_rate": 9.990949801723837e-06,
"loss": 0.1936,
"step": 874
},
{
"epoch": 0.05,
"grad_norm": 0.9629808232771345,
"learning_rate": 9.990895544269973e-06,
"loss": 0.214,
"step": 875
},
{
"epoch": 0.05,
"grad_norm": 0.9922395499161029,
"learning_rate": 9.99084112480877e-06,
"loss": 0.1953,
"step": 876
},
{
"epoch": 0.05,
"grad_norm": 1.7868187756680873,
"learning_rate": 9.990786543341994e-06,
"loss": 0.268,
"step": 877
},
{
"epoch": 0.05,
"grad_norm": 1.4733962873437458,
"learning_rate": 9.990731799871415e-06,
"loss": 0.185,
"step": 878
},
{
"epoch": 0.05,
"grad_norm": 1.1905698216535727,
"learning_rate": 9.990676894398815e-06,
"loss": 0.1686,
"step": 879
},
{
"epoch": 0.05,
"grad_norm": 1.429458474747954,
"learning_rate": 9.990621826925972e-06,
"loss": 0.1919,
"step": 880
},
{
"epoch": 0.05,
"grad_norm": 2.214006551656187,
"learning_rate": 9.990566597454673e-06,
"loss": 0.2069,
"step": 881
},
{
"epoch": 0.05,
"grad_norm": 0.9883021684174255,
"learning_rate": 9.990511205986716e-06,
"loss": 0.1796,
"step": 882
},
{
"epoch": 0.05,
"grad_norm": 1.6791241432970971,
"learning_rate": 9.990455652523893e-06,
"loss": 0.3108,
"step": 883
},
{
"epoch": 0.05,
"grad_norm": 1.6095545191777498,
"learning_rate": 9.99039993706801e-06,
"loss": 0.2519,
"step": 884
},
{
"epoch": 0.05,
"grad_norm": 1.3335942051042562,
"learning_rate": 9.990344059620876e-06,
"loss": 0.2722,
"step": 885
},
{
"epoch": 0.05,
"grad_norm": 1.1723544475154293,
"learning_rate": 9.990288020184303e-06,
"loss": 0.2113,
"step": 886
},
{
"epoch": 0.05,
"grad_norm": 1.5290073057293918,
"learning_rate": 9.990231818760112e-06,
"loss": 0.2109,
"step": 887
},
{
"epoch": 0.05,
"grad_norm": 3.4602109607153175,
"learning_rate": 9.990175455350126e-06,
"loss": 0.2651,
"step": 888
},
{
"epoch": 0.05,
"grad_norm": 1.1566241374285202,
"learning_rate": 9.990118929956175e-06,
"loss": 0.2727,
"step": 889
},
{
"epoch": 0.05,
"grad_norm": 2.052044855199194,
"learning_rate": 9.990062242580096e-06,
"loss": 0.2241,
"step": 890
},
{
"epoch": 0.05,
"grad_norm": 1.4177814076146151,
"learning_rate": 9.990005393223726e-06,
"loss": 0.2151,
"step": 891
},
{
"epoch": 0.05,
"grad_norm": 1.7672564303364384,
"learning_rate": 9.989948381888913e-06,
"loss": 0.2271,
"step": 892
},
{
"epoch": 0.05,
"grad_norm": 1.3500005649194595,
"learning_rate": 9.989891208577502e-06,
"loss": 0.1935,
"step": 893
},
{
"epoch": 0.05,
"grad_norm": 1.641037130816496,
"learning_rate": 9.989833873291357e-06,
"loss": 0.2766,
"step": 894
},
{
"epoch": 0.05,
"grad_norm": 1.7856500556157282,
"learning_rate": 9.989776376032333e-06,
"loss": 0.2342,
"step": 895
},
{
"epoch": 0.05,
"grad_norm": 1.2303187324263358,
"learning_rate": 9.989718716802299e-06,
"loss": 0.2207,
"step": 896
},
{
"epoch": 0.05,
"grad_norm": 1.407066620705677,
"learning_rate": 9.989660895603127e-06,
"loss": 0.2123,
"step": 897
},
{
"epoch": 0.05,
"grad_norm": 1.2960471216727876,
"learning_rate": 9.989602912436692e-06,
"loss": 0.2201,
"step": 898
},
{
"epoch": 0.05,
"grad_norm": 1.8542379372718563,
"learning_rate": 9.989544767304877e-06,
"loss": 0.2902,
"step": 899
}
],
"logging_steps": 1.0,
"max_steps": 17977,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 899,
"total_flos": 52192960905216.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}