{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.6598482349059717,
"eval_steps": 500,
"global_step": 3000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00021994941163532388,
"grad_norm": 0.7960259914398193,
"learning_rate": 8e-05,
"loss": 2.2745,
"step": 1
},
{
"epoch": 0.00043989882327064776,
"grad_norm": 0.3960127830505371,
"learning_rate": 8e-05,
"loss": 1.9155,
"step": 2
},
{
"epoch": 0.0006598482349059716,
"grad_norm": 0.3869185745716095,
"learning_rate": 8e-05,
"loss": 1.8754,
"step": 3
},
{
"epoch": 0.0008797976465412955,
"grad_norm": 0.33234134316444397,
"learning_rate": 8e-05,
"loss": 1.9126,
"step": 4
},
{
"epoch": 0.0010997470581766194,
"grad_norm": 0.3670472502708435,
"learning_rate": 8e-05,
"loss": 1.9912,
"step": 5
},
{
"epoch": 0.0013196964698119432,
"grad_norm": 0.32942938804626465,
"learning_rate": 8e-05,
"loss": 1.8746,
"step": 6
},
{
"epoch": 0.001539645881447267,
"grad_norm": 0.3588086664676666,
"learning_rate": 8e-05,
"loss": 1.9545,
"step": 7
},
{
"epoch": 0.001759595293082591,
"grad_norm": 0.33002492785453796,
"learning_rate": 8e-05,
"loss": 1.8781,
"step": 8
},
{
"epoch": 0.001979544704717915,
"grad_norm": 0.3024381101131439,
"learning_rate": 8e-05,
"loss": 1.8859,
"step": 9
},
{
"epoch": 0.002199494116353239,
"grad_norm": 0.3224199712276459,
"learning_rate": 8e-05,
"loss": 1.829,
"step": 10
},
{
"epoch": 0.0024194435279885624,
"grad_norm": 0.31481102108955383,
"learning_rate": 8e-05,
"loss": 1.817,
"step": 11
},
{
"epoch": 0.0026393929396238865,
"grad_norm": 0.3078259825706482,
"learning_rate": 8e-05,
"loss": 1.9631,
"step": 12
},
{
"epoch": 0.0028593423512592105,
"grad_norm": 0.3141743540763855,
"learning_rate": 8e-05,
"loss": 1.8879,
"step": 13
},
{
"epoch": 0.003079291762894534,
"grad_norm": 0.29574745893478394,
"learning_rate": 8e-05,
"loss": 1.799,
"step": 14
},
{
"epoch": 0.003299241174529858,
"grad_norm": 0.3095031976699829,
"learning_rate": 8e-05,
"loss": 1.8741,
"step": 15
},
{
"epoch": 0.003519190586165182,
"grad_norm": 0.28804337978363037,
"learning_rate": 8e-05,
"loss": 1.9931,
"step": 16
},
{
"epoch": 0.0037391399978005057,
"grad_norm": 0.25137585401535034,
"learning_rate": 8e-05,
"loss": 1.6762,
"step": 17
},
{
"epoch": 0.00395908940943583,
"grad_norm": 0.28565698862075806,
"learning_rate": 8e-05,
"loss": 1.8489,
"step": 18
},
{
"epoch": 0.004179038821071153,
"grad_norm": 0.2877500355243683,
"learning_rate": 8e-05,
"loss": 1.8871,
"step": 19
},
{
"epoch": 0.004398988232706478,
"grad_norm": 0.28803154826164246,
"learning_rate": 8e-05,
"loss": 1.6956,
"step": 20
},
{
"epoch": 0.004618937644341801,
"grad_norm": 0.32161301374435425,
"learning_rate": 8e-05,
"loss": 1.8243,
"step": 21
},
{
"epoch": 0.004838887055977125,
"grad_norm": 0.2959391176700592,
"learning_rate": 8e-05,
"loss": 1.8991,
"step": 22
},
{
"epoch": 0.005058836467612449,
"grad_norm": 0.3021189868450165,
"learning_rate": 8e-05,
"loss": 1.9975,
"step": 23
},
{
"epoch": 0.005278785879247773,
"grad_norm": 0.2793104946613312,
"learning_rate": 8e-05,
"loss": 1.8792,
"step": 24
},
{
"epoch": 0.0054987352908830965,
"grad_norm": 0.2658381760120392,
"learning_rate": 8e-05,
"loss": 1.6467,
"step": 25
},
{
"epoch": 0.005718684702518421,
"grad_norm": 0.2793010175228119,
"learning_rate": 8e-05,
"loss": 1.7479,
"step": 26
},
{
"epoch": 0.0059386341141537445,
"grad_norm": 0.2800044119358063,
"learning_rate": 8e-05,
"loss": 1.7885,
"step": 27
},
{
"epoch": 0.006158583525789068,
"grad_norm": 0.2864585220813751,
"learning_rate": 8e-05,
"loss": 1.9257,
"step": 28
},
{
"epoch": 0.006378532937424393,
"grad_norm": 0.301496684551239,
"learning_rate": 8e-05,
"loss": 1.8586,
"step": 29
},
{
"epoch": 0.006598482349059716,
"grad_norm": 0.2858293354511261,
"learning_rate": 8e-05,
"loss": 1.8541,
"step": 30
},
{
"epoch": 0.00681843176069504,
"grad_norm": 0.31271278858184814,
"learning_rate": 8e-05,
"loss": 1.8774,
"step": 31
},
{
"epoch": 0.007038381172330364,
"grad_norm": 0.30428266525268555,
"learning_rate": 8e-05,
"loss": 1.8464,
"step": 32
},
{
"epoch": 0.007258330583965688,
"grad_norm": 0.26637139916419983,
"learning_rate": 8e-05,
"loss": 1.7896,
"step": 33
},
{
"epoch": 0.007478279995601011,
"grad_norm": 0.2802716791629791,
"learning_rate": 8e-05,
"loss": 1.9534,
"step": 34
},
{
"epoch": 0.007698229407236336,
"grad_norm": 0.35580113530158997,
"learning_rate": 8e-05,
"loss": 1.8236,
"step": 35
},
{
"epoch": 0.00791817881887166,
"grad_norm": 0.2794848382472992,
"learning_rate": 8e-05,
"loss": 1.8089,
"step": 36
},
{
"epoch": 0.008138128230506983,
"grad_norm": 0.27942711114883423,
"learning_rate": 8e-05,
"loss": 1.7725,
"step": 37
},
{
"epoch": 0.008358077642142307,
"grad_norm": 0.2882610857486725,
"learning_rate": 8e-05,
"loss": 1.8279,
"step": 38
},
{
"epoch": 0.008578027053777632,
"grad_norm": 0.29375842213630676,
"learning_rate": 8e-05,
"loss": 2.0123,
"step": 39
},
{
"epoch": 0.008797976465412955,
"grad_norm": 0.26120567321777344,
"learning_rate": 8e-05,
"loss": 1.6731,
"step": 40
},
{
"epoch": 0.009017925877048279,
"grad_norm": 0.25272971391677856,
"learning_rate": 8e-05,
"loss": 1.5723,
"step": 41
},
{
"epoch": 0.009237875288683603,
"grad_norm": 0.30548569560050964,
"learning_rate": 8e-05,
"loss": 1.9269,
"step": 42
},
{
"epoch": 0.009457824700318926,
"grad_norm": 0.2714739441871643,
"learning_rate": 8e-05,
"loss": 1.6715,
"step": 43
},
{
"epoch": 0.00967777411195425,
"grad_norm": 0.3086313009262085,
"learning_rate": 8e-05,
"loss": 1.8903,
"step": 44
},
{
"epoch": 0.009897723523589575,
"grad_norm": 0.28676554560661316,
"learning_rate": 8e-05,
"loss": 1.8257,
"step": 45
},
{
"epoch": 0.010117672935224899,
"grad_norm": 0.2898331880569458,
"learning_rate": 8e-05,
"loss": 1.822,
"step": 46
},
{
"epoch": 0.010337622346860222,
"grad_norm": 0.2887754440307617,
"learning_rate": 8e-05,
"loss": 1.7629,
"step": 47
},
{
"epoch": 0.010557571758495546,
"grad_norm": 0.28026437759399414,
"learning_rate": 8e-05,
"loss": 1.8874,
"step": 48
},
{
"epoch": 0.01077752117013087,
"grad_norm": 0.29256439208984375,
"learning_rate": 8e-05,
"loss": 1.9169,
"step": 49
},
{
"epoch": 0.010997470581766193,
"grad_norm": 0.29388460516929626,
"learning_rate": 8e-05,
"loss": 1.8341,
"step": 50
},
{
"epoch": 0.011217419993401518,
"grad_norm": 0.29456326365470886,
"learning_rate": 8e-05,
"loss": 1.7088,
"step": 51
},
{
"epoch": 0.011437369405036842,
"grad_norm": 0.2810533046722412,
"learning_rate": 8e-05,
"loss": 1.8564,
"step": 52
},
{
"epoch": 0.011657318816672166,
"grad_norm": 0.3049224019050598,
"learning_rate": 8e-05,
"loss": 2.0114,
"step": 53
},
{
"epoch": 0.011877268228307489,
"grad_norm": 0.347817987203598,
"learning_rate": 8e-05,
"loss": 1.6936,
"step": 54
},
{
"epoch": 0.012097217639942813,
"grad_norm": 0.28999242186546326,
"learning_rate": 8e-05,
"loss": 1.5852,
"step": 55
},
{
"epoch": 0.012317167051578136,
"grad_norm": 0.32856103777885437,
"learning_rate": 8e-05,
"loss": 1.8249,
"step": 56
},
{
"epoch": 0.012537116463213462,
"grad_norm": 0.3450610339641571,
"learning_rate": 8e-05,
"loss": 1.9309,
"step": 57
},
{
"epoch": 0.012757065874848785,
"grad_norm": 0.27445971965789795,
"learning_rate": 8e-05,
"loss": 1.8153,
"step": 58
},
{
"epoch": 0.012977015286484109,
"grad_norm": 0.28595077991485596,
"learning_rate": 8e-05,
"loss": 1.8061,
"step": 59
},
{
"epoch": 0.013196964698119432,
"grad_norm": 0.2909082770347595,
"learning_rate": 8e-05,
"loss": 1.7793,
"step": 60
},
{
"epoch": 0.013416914109754756,
"grad_norm": 0.28822049498558044,
"learning_rate": 8e-05,
"loss": 1.7218,
"step": 61
},
{
"epoch": 0.01363686352139008,
"grad_norm": 0.29159948229789734,
"learning_rate": 8e-05,
"loss": 1.7985,
"step": 62
},
{
"epoch": 0.013856812933025405,
"grad_norm": 0.29802417755126953,
"learning_rate": 8e-05,
"loss": 1.8903,
"step": 63
},
{
"epoch": 0.014076762344660728,
"grad_norm": 0.29128944873809814,
"learning_rate": 8e-05,
"loss": 1.7928,
"step": 64
},
{
"epoch": 0.014296711756296052,
"grad_norm": 0.3093227446079254,
"learning_rate": 8e-05,
"loss": 1.8409,
"step": 65
},
{
"epoch": 0.014516661167931376,
"grad_norm": 0.2688956558704376,
"learning_rate": 8e-05,
"loss": 1.6136,
"step": 66
},
{
"epoch": 0.0147366105795667,
"grad_norm": 0.316579133272171,
"learning_rate": 8e-05,
"loss": 1.8153,
"step": 67
},
{
"epoch": 0.014956559991202023,
"grad_norm": 0.30234795808792114,
"learning_rate": 8e-05,
"loss": 1.7311,
"step": 68
},
{
"epoch": 0.015176509402837348,
"grad_norm": 0.2790556848049164,
"learning_rate": 8e-05,
"loss": 1.7714,
"step": 69
},
{
"epoch": 0.015396458814472672,
"grad_norm": 0.29012972116470337,
"learning_rate": 8e-05,
"loss": 1.7528,
"step": 70
},
{
"epoch": 0.015616408226107995,
"grad_norm": 0.28507527709007263,
"learning_rate": 8e-05,
"loss": 1.6452,
"step": 71
},
{
"epoch": 0.01583635763774332,
"grad_norm": 0.28862133622169495,
"learning_rate": 8e-05,
"loss": 1.7473,
"step": 72
},
{
"epoch": 0.016056307049378642,
"grad_norm": 0.2726048231124878,
"learning_rate": 8e-05,
"loss": 1.7519,
"step": 73
},
{
"epoch": 0.016276256461013966,
"grad_norm": 0.26808786392211914,
"learning_rate": 8e-05,
"loss": 1.6332,
"step": 74
},
{
"epoch": 0.01649620587264929,
"grad_norm": 0.32144519686698914,
"learning_rate": 8e-05,
"loss": 1.7115,
"step": 75
},
{
"epoch": 0.016716155284284613,
"grad_norm": 0.26930421590805054,
"learning_rate": 8e-05,
"loss": 1.7854,
"step": 76
},
{
"epoch": 0.016936104695919937,
"grad_norm": 0.29462486505508423,
"learning_rate": 8e-05,
"loss": 1.6919,
"step": 77
},
{
"epoch": 0.017156054107555264,
"grad_norm": 0.2780003249645233,
"learning_rate": 8e-05,
"loss": 1.6355,
"step": 78
},
{
"epoch": 0.017376003519190587,
"grad_norm": 0.29219016432762146,
"learning_rate": 8e-05,
"loss": 1.883,
"step": 79
},
{
"epoch": 0.01759595293082591,
"grad_norm": 0.2893241047859192,
"learning_rate": 8e-05,
"loss": 1.8548,
"step": 80
},
{
"epoch": 0.017815902342461234,
"grad_norm": 0.283512145280838,
"learning_rate": 8e-05,
"loss": 1.79,
"step": 81
},
{
"epoch": 0.018035851754096558,
"grad_norm": 0.2679024040699005,
"learning_rate": 8e-05,
"loss": 1.5866,
"step": 82
},
{
"epoch": 0.01825580116573188,
"grad_norm": 0.2892123758792877,
"learning_rate": 8e-05,
"loss": 1.9033,
"step": 83
},
{
"epoch": 0.018475750577367205,
"grad_norm": 0.2680201530456543,
"learning_rate": 8e-05,
"loss": 1.8557,
"step": 84
},
{
"epoch": 0.01869569998900253,
"grad_norm": 0.30922645330429077,
"learning_rate": 8e-05,
"loss": 1.8885,
"step": 85
},
{
"epoch": 0.018915649400637852,
"grad_norm": 0.2735271751880646,
"learning_rate": 8e-05,
"loss": 1.8765,
"step": 86
},
{
"epoch": 0.019135598812273176,
"grad_norm": 0.28639712929725647,
"learning_rate": 8e-05,
"loss": 1.9429,
"step": 87
},
{
"epoch": 0.0193555482239085,
"grad_norm": 0.28437235951423645,
"learning_rate": 8e-05,
"loss": 1.8405,
"step": 88
},
{
"epoch": 0.019575497635543827,
"grad_norm": 0.276517778635025,
"learning_rate": 8e-05,
"loss": 1.7496,
"step": 89
},
{
"epoch": 0.01979544704717915,
"grad_norm": 0.273404598236084,
"learning_rate": 8e-05,
"loss": 1.704,
"step": 90
},
{
"epoch": 0.020015396458814474,
"grad_norm": 0.2707740366458893,
"learning_rate": 8e-05,
"loss": 1.8274,
"step": 91
},
{
"epoch": 0.020235345870449797,
"grad_norm": 0.26880595088005066,
"learning_rate": 8e-05,
"loss": 1.7695,
"step": 92
},
{
"epoch": 0.02045529528208512,
"grad_norm": 0.28712528944015503,
"learning_rate": 8e-05,
"loss": 1.9436,
"step": 93
},
{
"epoch": 0.020675244693720445,
"grad_norm": 0.26633599400520325,
"learning_rate": 8e-05,
"loss": 1.7877,
"step": 94
},
{
"epoch": 0.020895194105355768,
"grad_norm": 0.2843431532382965,
"learning_rate": 8e-05,
"loss": 1.8389,
"step": 95
},
{
"epoch": 0.02111514351699109,
"grad_norm": 0.2597465515136719,
"learning_rate": 8e-05,
"loss": 1.7047,
"step": 96
},
{
"epoch": 0.021335092928626415,
"grad_norm": 0.2804902493953705,
"learning_rate": 8e-05,
"loss": 1.9375,
"step": 97
},
{
"epoch": 0.02155504234026174,
"grad_norm": 0.2825285792350769,
"learning_rate": 8e-05,
"loss": 1.8348,
"step": 98
},
{
"epoch": 0.021774991751897062,
"grad_norm": 0.26459112763404846,
"learning_rate": 8e-05,
"loss": 1.7416,
"step": 99
},
{
"epoch": 0.021994941163532386,
"grad_norm": 0.28523096442222595,
"learning_rate": 8e-05,
"loss": 1.9202,
"step": 100
},
{
"epoch": 0.022214890575167713,
"grad_norm": 0.2679818570613861,
"learning_rate": 8e-05,
"loss": 1.6741,
"step": 101
},
{
"epoch": 0.022434839986803037,
"grad_norm": 0.2798464894294739,
"learning_rate": 8e-05,
"loss": 1.6622,
"step": 102
},
{
"epoch": 0.02265478939843836,
"grad_norm": 0.2826269567012787,
"learning_rate": 8e-05,
"loss": 1.7577,
"step": 103
},
{
"epoch": 0.022874738810073684,
"grad_norm": 0.3859495222568512,
"learning_rate": 8e-05,
"loss": 1.9705,
"step": 104
},
{
"epoch": 0.023094688221709007,
"grad_norm": 0.2766650319099426,
"learning_rate": 8e-05,
"loss": 1.7706,
"step": 105
},
{
"epoch": 0.02331463763334433,
"grad_norm": 0.2804067134857178,
"learning_rate": 8e-05,
"loss": 1.8007,
"step": 106
},
{
"epoch": 0.023534587044979655,
"grad_norm": 0.27818629145622253,
"learning_rate": 8e-05,
"loss": 1.7913,
"step": 107
},
{
"epoch": 0.023754536456614978,
"grad_norm": 0.2697458267211914,
"learning_rate": 8e-05,
"loss": 1.8458,
"step": 108
},
{
"epoch": 0.023974485868250302,
"grad_norm": 0.28805410861968994,
"learning_rate": 8e-05,
"loss": 1.7543,
"step": 109
},
{
"epoch": 0.024194435279885625,
"grad_norm": 0.28452396392822266,
"learning_rate": 8e-05,
"loss": 1.8499,
"step": 110
},
{
"epoch": 0.02441438469152095,
"grad_norm": 0.2837978005409241,
"learning_rate": 8e-05,
"loss": 1.797,
"step": 111
},
{
"epoch": 0.024634334103156273,
"grad_norm": 0.2965853810310364,
"learning_rate": 8e-05,
"loss": 1.7988,
"step": 112
},
{
"epoch": 0.0248542835147916,
"grad_norm": 0.28529393672943115,
"learning_rate": 8e-05,
"loss": 1.7886,
"step": 113
},
{
"epoch": 0.025074232926426923,
"grad_norm": 0.285199910402298,
"learning_rate": 8e-05,
"loss": 1.9112,
"step": 114
},
{
"epoch": 0.025294182338062247,
"grad_norm": 0.286316454410553,
"learning_rate": 8e-05,
"loss": 1.6735,
"step": 115
},
{
"epoch": 0.02551413174969757,
"grad_norm": 0.2648874819278717,
"learning_rate": 8e-05,
"loss": 1.5333,
"step": 116
},
{
"epoch": 0.025734081161332894,
"grad_norm": 0.2834017276763916,
"learning_rate": 8e-05,
"loss": 1.7524,
"step": 117
},
{
"epoch": 0.025954030572968217,
"grad_norm": 0.27846938371658325,
"learning_rate": 8e-05,
"loss": 1.8448,
"step": 118
},
{
"epoch": 0.02617397998460354,
"grad_norm": 0.3278025984764099,
"learning_rate": 8e-05,
"loss": 1.9158,
"step": 119
},
{
"epoch": 0.026393929396238865,
"grad_norm": 0.30259498953819275,
"learning_rate": 8e-05,
"loss": 1.7897,
"step": 120
},
{
"epoch": 0.026613878807874188,
"grad_norm": 0.27566099166870117,
"learning_rate": 8e-05,
"loss": 1.682,
"step": 121
},
{
"epoch": 0.026833828219509512,
"grad_norm": 0.2959173321723938,
"learning_rate": 8e-05,
"loss": 1.9032,
"step": 122
},
{
"epoch": 0.027053777631144835,
"grad_norm": 0.29449525475502014,
"learning_rate": 8e-05,
"loss": 1.6174,
"step": 123
},
{
"epoch": 0.02727372704278016,
"grad_norm": 0.3012568950653076,
"learning_rate": 8e-05,
"loss": 1.6817,
"step": 124
},
{
"epoch": 0.027493676454415486,
"grad_norm": 0.29086676239967346,
"learning_rate": 8e-05,
"loss": 1.833,
"step": 125
},
{
"epoch": 0.02771362586605081,
"grad_norm": 0.2756067216396332,
"learning_rate": 8e-05,
"loss": 1.7807,
"step": 126
},
{
"epoch": 0.027933575277686133,
"grad_norm": 0.3420695662498474,
"learning_rate": 8e-05,
"loss": 1.8652,
"step": 127
},
{
"epoch": 0.028153524689321457,
"grad_norm": 0.2899749279022217,
"learning_rate": 8e-05,
"loss": 1.7199,
"step": 128
},
{
"epoch": 0.02837347410095678,
"grad_norm": 0.274718701839447,
"learning_rate": 8e-05,
"loss": 1.7322,
"step": 129
},
{
"epoch": 0.028593423512592104,
"grad_norm": 0.3784034848213196,
"learning_rate": 8e-05,
"loss": 1.8917,
"step": 130
},
{
"epoch": 0.028813372924227428,
"grad_norm": 0.2814437448978424,
"learning_rate": 8e-05,
"loss": 1.726,
"step": 131
},
{
"epoch": 0.02903332233586275,
"grad_norm": 0.287701815366745,
"learning_rate": 8e-05,
"loss": 1.8166,
"step": 132
},
{
"epoch": 0.029253271747498075,
"grad_norm": 0.28487101197242737,
"learning_rate": 8e-05,
"loss": 1.7183,
"step": 133
},
{
"epoch": 0.0294732211591334,
"grad_norm": 0.27141597867012024,
"learning_rate": 8e-05,
"loss": 1.7436,
"step": 134
},
{
"epoch": 0.029693170570768722,
"grad_norm": 0.2708652913570404,
"learning_rate": 8e-05,
"loss": 1.8116,
"step": 135
},
{
"epoch": 0.029913119982404045,
"grad_norm": 0.2789991796016693,
"learning_rate": 8e-05,
"loss": 1.7942,
"step": 136
},
{
"epoch": 0.030133069394039372,
"grad_norm": 0.3053725063800812,
"learning_rate": 8e-05,
"loss": 1.8508,
"step": 137
},
{
"epoch": 0.030353018805674696,
"grad_norm": 0.30432772636413574,
"learning_rate": 8e-05,
"loss": 1.8129,
"step": 138
},
{
"epoch": 0.03057296821731002,
"grad_norm": 0.2873070240020752,
"learning_rate": 8e-05,
"loss": 1.8713,
"step": 139
},
{
"epoch": 0.030792917628945343,
"grad_norm": 0.2777135968208313,
"learning_rate": 8e-05,
"loss": 1.7065,
"step": 140
},
{
"epoch": 0.031012867040580667,
"grad_norm": 0.29774004220962524,
"learning_rate": 8e-05,
"loss": 1.6471,
"step": 141
},
{
"epoch": 0.03123281645221599,
"grad_norm": 0.2803782522678375,
"learning_rate": 8e-05,
"loss": 1.6992,
"step": 142
},
{
"epoch": 0.03145276586385132,
"grad_norm": 0.2777007818222046,
"learning_rate": 8e-05,
"loss": 1.8398,
"step": 143
},
{
"epoch": 0.03167271527548664,
"grad_norm": 0.26938894391059875,
"learning_rate": 8e-05,
"loss": 1.6082,
"step": 144
},
{
"epoch": 0.031892664687121965,
"grad_norm": 0.2934747338294983,
"learning_rate": 8e-05,
"loss": 1.6929,
"step": 145
},
{
"epoch": 0.032112614098757285,
"grad_norm": 0.2687772214412689,
"learning_rate": 8e-05,
"loss": 1.6472,
"step": 146
},
{
"epoch": 0.03233256351039261,
"grad_norm": 0.2758256793022156,
"learning_rate": 8e-05,
"loss": 1.7128,
"step": 147
},
{
"epoch": 0.03255251292202793,
"grad_norm": 0.26065707206726074,
"learning_rate": 8e-05,
"loss": 1.7108,
"step": 148
},
{
"epoch": 0.03277246233366326,
"grad_norm": 0.31668898463249207,
"learning_rate": 8e-05,
"loss": 1.9365,
"step": 149
},
{
"epoch": 0.03299241174529858,
"grad_norm": 0.2915947437286377,
"learning_rate": 8e-05,
"loss": 1.855,
"step": 150
},
{
"epoch": 0.033212361156933906,
"grad_norm": 0.2741534113883972,
"learning_rate": 8e-05,
"loss": 1.735,
"step": 151
},
{
"epoch": 0.033432310568569226,
"grad_norm": 0.300800085067749,
"learning_rate": 8e-05,
"loss": 1.7161,
"step": 152
},
{
"epoch": 0.03365225998020455,
"grad_norm": 0.26691076159477234,
"learning_rate": 8e-05,
"loss": 1.665,
"step": 153
},
{
"epoch": 0.03387220939183987,
"grad_norm": 0.2605098485946655,
"learning_rate": 8e-05,
"loss": 1.7288,
"step": 154
},
{
"epoch": 0.0340921588034752,
"grad_norm": 0.2728619873523712,
"learning_rate": 8e-05,
"loss": 1.7237,
"step": 155
},
{
"epoch": 0.03431210821511053,
"grad_norm": 0.29627877473831177,
"learning_rate": 8e-05,
"loss": 1.8024,
"step": 156
},
{
"epoch": 0.03453205762674585,
"grad_norm": 0.27106964588165283,
"learning_rate": 8e-05,
"loss": 1.8166,
"step": 157
},
{
"epoch": 0.034752007038381175,
"grad_norm": 0.26806893944740295,
"learning_rate": 8e-05,
"loss": 1.7061,
"step": 158
},
{
"epoch": 0.034971956450016495,
"grad_norm": 0.2509767413139343,
"learning_rate": 8e-05,
"loss": 1.6897,
"step": 159
},
{
"epoch": 0.03519190586165182,
"grad_norm": 0.34342750906944275,
"learning_rate": 8e-05,
"loss": 1.7151,
"step": 160
},
{
"epoch": 0.03541185527328714,
"grad_norm": 0.27948594093322754,
"learning_rate": 8e-05,
"loss": 1.6574,
"step": 161
},
{
"epoch": 0.03563180468492247,
"grad_norm": 0.28651687502861023,
"learning_rate": 8e-05,
"loss": 1.839,
"step": 162
},
{
"epoch": 0.03585175409655779,
"grad_norm": 0.2787701189517975,
"learning_rate": 8e-05,
"loss": 1.8146,
"step": 163
},
{
"epoch": 0.036071703508193116,
"grad_norm": 0.2596721351146698,
"learning_rate": 8e-05,
"loss": 1.6088,
"step": 164
},
{
"epoch": 0.036291652919828436,
"grad_norm": 0.2630285322666168,
"learning_rate": 8e-05,
"loss": 1.6941,
"step": 165
},
{
"epoch": 0.03651160233146376,
"grad_norm": 0.30072465538978577,
"learning_rate": 8e-05,
"loss": 1.8684,
"step": 166
},
{
"epoch": 0.03673155174309909,
"grad_norm": 0.2789234519004822,
"learning_rate": 8e-05,
"loss": 1.9136,
"step": 167
},
{
"epoch": 0.03695150115473441,
"grad_norm": 0.25597283244132996,
"learning_rate": 8e-05,
"loss": 1.669,
"step": 168
},
{
"epoch": 0.03717145056636974,
"grad_norm": 0.30354219675064087,
"learning_rate": 8e-05,
"loss": 1.7845,
"step": 169
},
{
"epoch": 0.03739139997800506,
"grad_norm": 0.26998043060302734,
"learning_rate": 8e-05,
"loss": 1.6626,
"step": 170
},
{
"epoch": 0.037611349389640385,
"grad_norm": 0.27418825030326843,
"learning_rate": 8e-05,
"loss": 1.6444,
"step": 171
},
{
"epoch": 0.037831298801275705,
"grad_norm": 0.2858507037162781,
"learning_rate": 8e-05,
"loss": 1.8584,
"step": 172
},
{
"epoch": 0.03805124821291103,
"grad_norm": 0.26513633131980896,
"learning_rate": 8e-05,
"loss": 1.7107,
"step": 173
},
{
"epoch": 0.03827119762454635,
"grad_norm": 0.3162567913532257,
"learning_rate": 8e-05,
"loss": 1.7153,
"step": 174
},
{
"epoch": 0.03849114703618168,
"grad_norm": 0.28961601853370667,
"learning_rate": 8e-05,
"loss": 1.8455,
"step": 175
},
{
"epoch": 0.038711096447817,
"grad_norm": 0.29676249623298645,
"learning_rate": 8e-05,
"loss": 1.9303,
"step": 176
},
{
"epoch": 0.038931045859452326,
"grad_norm": 0.2863664925098419,
"learning_rate": 8e-05,
"loss": 1.6975,
"step": 177
},
{
"epoch": 0.03915099527108765,
"grad_norm": 0.2715422213077545,
"learning_rate": 8e-05,
"loss": 1.5472,
"step": 178
},
{
"epoch": 0.03937094468272297,
"grad_norm": 0.2740415036678314,
"learning_rate": 8e-05,
"loss": 1.7113,
"step": 179
},
{
"epoch": 0.0395908940943583,
"grad_norm": 0.29612302780151367,
"learning_rate": 8e-05,
"loss": 1.8689,
"step": 180
},
{
"epoch": 0.03981084350599362,
"grad_norm": 0.26745903491973877,
"learning_rate": 8e-05,
"loss": 1.6076,
"step": 181
},
{
"epoch": 0.04003079291762895,
"grad_norm": 0.296695739030838,
"learning_rate": 8e-05,
"loss": 1.846,
"step": 182
},
{
"epoch": 0.04025074232926427,
"grad_norm": 0.27626705169677734,
"learning_rate": 8e-05,
"loss": 1.8103,
"step": 183
},
{
"epoch": 0.040470691740899595,
"grad_norm": 0.2597677409648895,
"learning_rate": 8e-05,
"loss": 1.6432,
"step": 184
},
{
"epoch": 0.040690641152534915,
"grad_norm": 0.2738899290561676,
"learning_rate": 8e-05,
"loss": 1.8351,
"step": 185
},
{
"epoch": 0.04091059056417024,
"grad_norm": 0.2683742344379425,
"learning_rate": 8e-05,
"loss": 1.6453,
"step": 186
},
{
"epoch": 0.04113053997580556,
"grad_norm": 0.28722816705703735,
"learning_rate": 8e-05,
"loss": 1.7685,
"step": 187
},
{
"epoch": 0.04135048938744089,
"grad_norm": 0.2851015627384186,
"learning_rate": 8e-05,
"loss": 1.8464,
"step": 188
},
{
"epoch": 0.04157043879907621,
"grad_norm": 0.2630920112133026,
"learning_rate": 8e-05,
"loss": 1.7176,
"step": 189
},
{
"epoch": 0.041790388210711536,
"grad_norm": 0.2678779661655426,
"learning_rate": 8e-05,
"loss": 1.671,
"step": 190
},
{
"epoch": 0.04201033762234686,
"grad_norm": 0.27810946106910706,
"learning_rate": 8e-05,
"loss": 1.6467,
"step": 191
},
{
"epoch": 0.04223028703398218,
"grad_norm": 0.2831014394760132,
"learning_rate": 8e-05,
"loss": 1.8784,
"step": 192
},
{
"epoch": 0.04245023644561751,
"grad_norm": 0.2643384635448456,
"learning_rate": 8e-05,
"loss": 1.6239,
"step": 193
},
{
"epoch": 0.04267018585725283,
"grad_norm": 0.27143070101737976,
"learning_rate": 8e-05,
"loss": 1.8012,
"step": 194
},
{
"epoch": 0.04289013526888816,
"grad_norm": 0.28524088859558105,
"learning_rate": 8e-05,
"loss": 1.7534,
"step": 195
},
{
"epoch": 0.04311008468052348,
"grad_norm": 0.27226153016090393,
"learning_rate": 8e-05,
"loss": 1.847,
"step": 196
},
{
"epoch": 0.043330034092158805,
"grad_norm": 0.27042534947395325,
"learning_rate": 8e-05,
"loss": 1.698,
"step": 197
},
{
"epoch": 0.043549983503794125,
"grad_norm": 0.2673223912715912,
"learning_rate": 8e-05,
"loss": 1.7825,
"step": 198
},
{
"epoch": 0.04376993291542945,
"grad_norm": 0.26485180854797363,
"learning_rate": 8e-05,
"loss": 1.7755,
"step": 199
},
{
"epoch": 0.04398988232706477,
"grad_norm": 0.26945164799690247,
"learning_rate": 8e-05,
"loss": 1.8612,
"step": 200
},
{
"epoch": 0.0442098317387001,
"grad_norm": 0.30337756872177124,
"learning_rate": 8e-05,
"loss": 1.8556,
"step": 201
},
{
"epoch": 0.044429781150335426,
"grad_norm": 0.26593855023384094,
"learning_rate": 8e-05,
"loss": 1.7633,
"step": 202
},
{
"epoch": 0.044649730561970746,
"grad_norm": 0.26703208684921265,
"learning_rate": 8e-05,
"loss": 1.7787,
"step": 203
},
{
"epoch": 0.04486967997360607,
"grad_norm": 0.2799319922924042,
"learning_rate": 8e-05,
"loss": 1.8946,
"step": 204
},
{
"epoch": 0.04508962938524139,
"grad_norm": 0.261406809091568,
"learning_rate": 8e-05,
"loss": 1.714,
"step": 205
},
{
"epoch": 0.04530957879687672,
"grad_norm": 0.30923140048980713,
"learning_rate": 8e-05,
"loss": 1.9953,
"step": 206
},
{
"epoch": 0.04552952820851204,
"grad_norm": 0.28189903497695923,
"learning_rate": 8e-05,
"loss": 1.8068,
"step": 207
},
{
"epoch": 0.04574947762014737,
"grad_norm": 0.28659504652023315,
"learning_rate": 8e-05,
"loss": 1.7961,
"step": 208
},
{
"epoch": 0.04596942703178269,
"grad_norm": 0.27828094363212585,
"learning_rate": 8e-05,
"loss": 1.6398,
"step": 209
},
{
"epoch": 0.046189376443418015,
"grad_norm": 0.2826248109340668,
"learning_rate": 8e-05,
"loss": 1.8442,
"step": 210
},
{
"epoch": 0.046409325855053335,
"grad_norm": 0.2596709430217743,
"learning_rate": 8e-05,
"loss": 1.7269,
"step": 211
},
{
"epoch": 0.04662927526668866,
"grad_norm": 0.26883357763290405,
"learning_rate": 8e-05,
"loss": 1.7396,
"step": 212
},
{
"epoch": 0.04684922467832398,
"grad_norm": 0.2834852933883667,
"learning_rate": 8e-05,
"loss": 1.6992,
"step": 213
},
{
"epoch": 0.04706917408995931,
"grad_norm": 0.30232125520706177,
"learning_rate": 8e-05,
"loss": 1.8216,
"step": 214
},
{
"epoch": 0.047289123501594636,
"grad_norm": 0.2887151539325714,
"learning_rate": 8e-05,
"loss": 1.5633,
"step": 215
},
{
"epoch": 0.047509072913229956,
"grad_norm": 0.27171874046325684,
"learning_rate": 8e-05,
"loss": 1.8272,
"step": 216
},
{
"epoch": 0.04772902232486528,
"grad_norm": 0.35441088676452637,
"learning_rate": 8e-05,
"loss": 1.8308,
"step": 217
},
{
"epoch": 0.047948971736500604,
"grad_norm": 0.28351160883903503,
"learning_rate": 8e-05,
"loss": 1.8697,
"step": 218
},
{
"epoch": 0.04816892114813593,
"grad_norm": 0.26361364126205444,
"learning_rate": 8e-05,
"loss": 1.7044,
"step": 219
},
{
"epoch": 0.04838887055977125,
"grad_norm": 0.2720041871070862,
"learning_rate": 8e-05,
"loss": 1.7718,
"step": 220
},
{
"epoch": 0.04860881997140658,
"grad_norm": 0.28131023049354553,
"learning_rate": 8e-05,
"loss": 1.8066,
"step": 221
},
{
"epoch": 0.0488287693830419,
"grad_norm": 0.2640543580055237,
"learning_rate": 8e-05,
"loss": 1.69,
"step": 222
},
{
"epoch": 0.049048718794677225,
"grad_norm": 0.26101046800613403,
"learning_rate": 8e-05,
"loss": 1.6372,
"step": 223
},
{
"epoch": 0.049268668206312545,
"grad_norm": 0.3021651804447174,
"learning_rate": 8e-05,
"loss": 1.8528,
"step": 224
},
{
"epoch": 0.04948861761794787,
"grad_norm": 0.2655261158943176,
"learning_rate": 8e-05,
"loss": 1.7406,
"step": 225
},
{
"epoch": 0.0497085670295832,
"grad_norm": 0.2873914837837219,
"learning_rate": 8e-05,
"loss": 1.7643,
"step": 226
},
{
"epoch": 0.04992851644121852,
"grad_norm": 0.31813880801200867,
"learning_rate": 8e-05,
"loss": 1.8645,
"step": 227
},
{
"epoch": 0.050148465852853846,
"grad_norm": 0.2996014654636383,
"learning_rate": 8e-05,
"loss": 1.6685,
"step": 228
},
{
"epoch": 0.050368415264489166,
"grad_norm": 0.2837509512901306,
"learning_rate": 8e-05,
"loss": 1.9227,
"step": 229
},
{
"epoch": 0.05058836467612449,
"grad_norm": 0.29532885551452637,
"learning_rate": 8e-05,
"loss": 1.9073,
"step": 230
},
{
"epoch": 0.050808314087759814,
"grad_norm": 0.285295307636261,
"learning_rate": 8e-05,
"loss": 1.8248,
"step": 231
},
{
"epoch": 0.05102826349939514,
"grad_norm": 0.26331770420074463,
"learning_rate": 8e-05,
"loss": 1.7146,
"step": 232
},
{
"epoch": 0.05124821291103046,
"grad_norm": 0.24956567585468292,
"learning_rate": 8e-05,
"loss": 1.5574,
"step": 233
},
{
"epoch": 0.05146816232266579,
"grad_norm": 0.27515965700149536,
"learning_rate": 8e-05,
"loss": 1.7854,
"step": 234
},
{
"epoch": 0.05168811173430111,
"grad_norm": 0.28268730640411377,
"learning_rate": 8e-05,
"loss": 1.8294,
"step": 235
},
{
"epoch": 0.051908061145936435,
"grad_norm": 0.25420427322387695,
"learning_rate": 8e-05,
"loss": 1.6735,
"step": 236
},
{
"epoch": 0.052128010557571755,
"grad_norm": 0.2869463860988617,
"learning_rate": 8e-05,
"loss": 1.808,
"step": 237
},
{
"epoch": 0.05234795996920708,
"grad_norm": 0.2574792206287384,
"learning_rate": 8e-05,
"loss": 1.7563,
"step": 238
},
{
"epoch": 0.05256790938084241,
"grad_norm": 0.26652273535728455,
"learning_rate": 8e-05,
"loss": 1.743,
"step": 239
},
{
"epoch": 0.05278785879247773,
"grad_norm": 0.2956235408782959,
"learning_rate": 8e-05,
"loss": 1.9169,
"step": 240
},
{
"epoch": 0.053007808204113056,
"grad_norm": 0.274142861366272,
"learning_rate": 8e-05,
"loss": 1.8321,
"step": 241
},
{
"epoch": 0.053227757615748376,
"grad_norm": 0.27525436878204346,
"learning_rate": 8e-05,
"loss": 1.8206,
"step": 242
},
{
"epoch": 0.053447707027383703,
"grad_norm": 0.26323091983795166,
"learning_rate": 8e-05,
"loss": 1.7574,
"step": 243
},
{
"epoch": 0.053667656439019024,
"grad_norm": 0.28554126620292664,
"learning_rate": 8e-05,
"loss": 1.9293,
"step": 244
},
{
"epoch": 0.05388760585065435,
"grad_norm": 0.2651476562023163,
"learning_rate": 8e-05,
"loss": 1.808,
"step": 245
},
{
"epoch": 0.05410755526228967,
"grad_norm": 0.27941837906837463,
"learning_rate": 8e-05,
"loss": 1.7838,
"step": 246
},
{
"epoch": 0.054327504673925,
"grad_norm": 0.26575711369514465,
"learning_rate": 8e-05,
"loss": 1.6117,
"step": 247
},
{
"epoch": 0.05454745408556032,
"grad_norm": 0.2620556354522705,
"learning_rate": 8e-05,
"loss": 1.7703,
"step": 248
},
{
"epoch": 0.054767403497195645,
"grad_norm": 0.2782936990261078,
"learning_rate": 8e-05,
"loss": 1.753,
"step": 249
},
{
"epoch": 0.05498735290883097,
"grad_norm": 0.28347843885421753,
"learning_rate": 8e-05,
"loss": 1.8365,
"step": 250
},
{
"epoch": 0.05520730232046629,
"grad_norm": 0.2740314304828644,
"learning_rate": 8e-05,
"loss": 1.7448,
"step": 251
},
{
"epoch": 0.05542725173210162,
"grad_norm": 0.2779199779033661,
"learning_rate": 8e-05,
"loss": 1.8025,
"step": 252
},
{
"epoch": 0.05564720114373694,
"grad_norm": 0.27700838446617126,
"learning_rate": 8e-05,
"loss": 1.6368,
"step": 253
},
{
"epoch": 0.055867150555372266,
"grad_norm": 0.2753797173500061,
"learning_rate": 8e-05,
"loss": 1.7058,
"step": 254
},
{
"epoch": 0.056087099967007586,
"grad_norm": 0.2677604556083679,
"learning_rate": 8e-05,
"loss": 1.772,
"step": 255
},
{
"epoch": 0.056307049378642914,
"grad_norm": 0.291358083486557,
"learning_rate": 8e-05,
"loss": 1.7229,
"step": 256
},
{
"epoch": 0.056526998790278234,
"grad_norm": 0.2605611979961395,
"learning_rate": 8e-05,
"loss": 1.6654,
"step": 257
},
{
"epoch": 0.05674694820191356,
"grad_norm": 0.2726796865463257,
"learning_rate": 8e-05,
"loss": 1.8524,
"step": 258
},
{
"epoch": 0.05696689761354888,
"grad_norm": 0.2769307494163513,
"learning_rate": 8e-05,
"loss": 1.913,
"step": 259
},
{
"epoch": 0.05718684702518421,
"grad_norm": 0.27163514494895935,
"learning_rate": 8e-05,
"loss": 1.7076,
"step": 260
},
{
"epoch": 0.057406796436819535,
"grad_norm": 0.27037522196769714,
"learning_rate": 8e-05,
"loss": 1.7461,
"step": 261
},
{
"epoch": 0.057626745848454855,
"grad_norm": 0.2570153772830963,
"learning_rate": 8e-05,
"loss": 1.6714,
"step": 262
},
{
"epoch": 0.05784669526009018,
"grad_norm": 0.2802227735519409,
"learning_rate": 8e-05,
"loss": 1.6782,
"step": 263
},
{
"epoch": 0.0580666446717255,
"grad_norm": 0.293969064950943,
"learning_rate": 8e-05,
"loss": 1.6253,
"step": 264
},
{
"epoch": 0.05828659408336083,
"grad_norm": 0.28199446201324463,
"learning_rate": 8e-05,
"loss": 1.791,
"step": 265
},
{
"epoch": 0.05850654349499615,
"grad_norm": 0.3037835657596588,
"learning_rate": 8e-05,
"loss": 1.8553,
"step": 266
},
{
"epoch": 0.058726492906631476,
"grad_norm": 0.2814860939979553,
"learning_rate": 8e-05,
"loss": 1.7237,
"step": 267
},
{
"epoch": 0.0589464423182668,
"grad_norm": 0.29769864678382874,
"learning_rate": 8e-05,
"loss": 1.8635,
"step": 268
},
{
"epoch": 0.059166391729902124,
"grad_norm": 0.26650169491767883,
"learning_rate": 8e-05,
"loss": 1.8173,
"step": 269
},
{
"epoch": 0.059386341141537444,
"grad_norm": 0.29682958126068115,
"learning_rate": 8e-05,
"loss": 1.6548,
"step": 270
},
{
"epoch": 0.05960629055317277,
"grad_norm": 0.2702498137950897,
"learning_rate": 8e-05,
"loss": 1.6022,
"step": 271
},
{
"epoch": 0.05982623996480809,
"grad_norm": 0.2940424680709839,
"learning_rate": 8e-05,
"loss": 1.7955,
"step": 272
},
{
"epoch": 0.06004618937644342,
"grad_norm": 0.2655317485332489,
"learning_rate": 8e-05,
"loss": 1.786,
"step": 273
},
{
"epoch": 0.060266138788078745,
"grad_norm": 0.28093400597572327,
"learning_rate": 8e-05,
"loss": 1.9798,
"step": 274
},
{
"epoch": 0.060486088199714065,
"grad_norm": 0.2635514736175537,
"learning_rate": 8e-05,
"loss": 1.6737,
"step": 275
},
{
"epoch": 0.06070603761134939,
"grad_norm": 0.2648226320743561,
"learning_rate": 8e-05,
"loss": 1.8771,
"step": 276
},
{
"epoch": 0.06092598702298471,
"grad_norm": 0.2934603691101074,
"learning_rate": 8e-05,
"loss": 1.4751,
"step": 277
},
{
"epoch": 0.06114593643462004,
"grad_norm": 0.26369500160217285,
"learning_rate": 8e-05,
"loss": 1.7832,
"step": 278
},
{
"epoch": 0.06136588584625536,
"grad_norm": 0.26159989833831787,
"learning_rate": 8e-05,
"loss": 1.7276,
"step": 279
},
{
"epoch": 0.061585835257890686,
"grad_norm": 0.2826705873012543,
"learning_rate": 8e-05,
"loss": 1.8767,
"step": 280
},
{
"epoch": 0.06180578466952601,
"grad_norm": 0.2911459505558014,
"learning_rate": 8e-05,
"loss": 1.7795,
"step": 281
},
{
"epoch": 0.062025734081161334,
"grad_norm": 0.27846869826316833,
"learning_rate": 8e-05,
"loss": 1.838,
"step": 282
},
{
"epoch": 0.062245683492796654,
"grad_norm": 0.33195585012435913,
"learning_rate": 8e-05,
"loss": 1.8576,
"step": 283
},
{
"epoch": 0.06246563290443198,
"grad_norm": 0.26306337118148804,
"learning_rate": 8e-05,
"loss": 1.7202,
"step": 284
},
{
"epoch": 0.0626855823160673,
"grad_norm": 0.2703022360801697,
"learning_rate": 8e-05,
"loss": 1.6962,
"step": 285
},
{
"epoch": 0.06290553172770263,
"grad_norm": 0.2754605710506439,
"learning_rate": 8e-05,
"loss": 1.6468,
"step": 286
},
{
"epoch": 0.06312548113933796,
"grad_norm": 0.2995694577693939,
"learning_rate": 8e-05,
"loss": 1.9298,
"step": 287
},
{
"epoch": 0.06334543055097328,
"grad_norm": 0.27501800656318665,
"learning_rate": 8e-05,
"loss": 1.8152,
"step": 288
},
{
"epoch": 0.0635653799626086,
"grad_norm": 0.2668202519416809,
"learning_rate": 8e-05,
"loss": 1.8809,
"step": 289
},
{
"epoch": 0.06378532937424393,
"grad_norm": 0.26209571957588196,
"learning_rate": 8e-05,
"loss": 1.4927,
"step": 290
},
{
"epoch": 0.06400527878587925,
"grad_norm": 0.35276591777801514,
"learning_rate": 8e-05,
"loss": 1.9654,
"step": 291
},
{
"epoch": 0.06422522819751457,
"grad_norm": 0.26070040464401245,
"learning_rate": 8e-05,
"loss": 1.7332,
"step": 292
},
{
"epoch": 0.06444517760914989,
"grad_norm": 0.26518604159355164,
"learning_rate": 8e-05,
"loss": 1.6867,
"step": 293
},
{
"epoch": 0.06466512702078522,
"grad_norm": 0.28992095589637756,
"learning_rate": 8e-05,
"loss": 1.7498,
"step": 294
},
{
"epoch": 0.06488507643242054,
"grad_norm": 0.27465108036994934,
"learning_rate": 8e-05,
"loss": 1.6095,
"step": 295
},
{
"epoch": 0.06510502584405586,
"grad_norm": 0.2841359078884125,
"learning_rate": 8e-05,
"loss": 1.6869,
"step": 296
},
{
"epoch": 0.0653249752556912,
"grad_norm": 0.28873759508132935,
"learning_rate": 8e-05,
"loss": 1.7954,
"step": 297
},
{
"epoch": 0.06554492466732652,
"grad_norm": 0.2542605698108673,
"learning_rate": 8e-05,
"loss": 1.6075,
"step": 298
},
{
"epoch": 0.06576487407896184,
"grad_norm": 0.270823210477829,
"learning_rate": 8e-05,
"loss": 1.7238,
"step": 299
},
{
"epoch": 0.06598482349059716,
"grad_norm": 0.2610267102718353,
"learning_rate": 8e-05,
"loss": 1.697,
"step": 300
},
{
"epoch": 0.06620477290223249,
"grad_norm": 0.28088685870170593,
"learning_rate": 8e-05,
"loss": 1.6806,
"step": 301
},
{
"epoch": 0.06642472231386781,
"grad_norm": 0.2656930088996887,
"learning_rate": 8e-05,
"loss": 1.8744,
"step": 302
},
{
"epoch": 0.06664467172550313,
"grad_norm": 0.2721637189388275,
"learning_rate": 8e-05,
"loss": 1.6903,
"step": 303
},
{
"epoch": 0.06686462113713845,
"grad_norm": 0.2612883746623993,
"learning_rate": 8e-05,
"loss": 1.7444,
"step": 304
},
{
"epoch": 0.06708457054877379,
"grad_norm": 0.2533530592918396,
"learning_rate": 8e-05,
"loss": 1.6427,
"step": 305
},
{
"epoch": 0.0673045199604091,
"grad_norm": 0.27200043201446533,
"learning_rate": 8e-05,
"loss": 1.769,
"step": 306
},
{
"epoch": 0.06752446937204443,
"grad_norm": 0.2626403272151947,
"learning_rate": 8e-05,
"loss": 1.64,
"step": 307
},
{
"epoch": 0.06774441878367975,
"grad_norm": 0.3720408082008362,
"learning_rate": 8e-05,
"loss": 1.9055,
"step": 308
},
{
"epoch": 0.06796436819531508,
"grad_norm": 0.2745527923107147,
"learning_rate": 8e-05,
"loss": 1.7844,
"step": 309
},
{
"epoch": 0.0681843176069504,
"grad_norm": 0.2568323612213135,
"learning_rate": 8e-05,
"loss": 1.6728,
"step": 310
},
{
"epoch": 0.06840426701858572,
"grad_norm": 0.2704140543937683,
"learning_rate": 8e-05,
"loss": 1.7685,
"step": 311
},
{
"epoch": 0.06862421643022105,
"grad_norm": 0.27828502655029297,
"learning_rate": 8e-05,
"loss": 1.7957,
"step": 312
},
{
"epoch": 0.06884416584185638,
"grad_norm": 0.2951858341693878,
"learning_rate": 8e-05,
"loss": 1.7709,
"step": 313
},
{
"epoch": 0.0690641152534917,
"grad_norm": 0.2756475806236267,
"learning_rate": 8e-05,
"loss": 1.6348,
"step": 314
},
{
"epoch": 0.06928406466512702,
"grad_norm": 0.2913607954978943,
"learning_rate": 8e-05,
"loss": 1.7888,
"step": 315
},
{
"epoch": 0.06950401407676235,
"grad_norm": 0.2798636853694916,
"learning_rate": 8e-05,
"loss": 1.7806,
"step": 316
},
{
"epoch": 0.06972396348839767,
"grad_norm": 0.27596554160118103,
"learning_rate": 8e-05,
"loss": 1.7458,
"step": 317
},
{
"epoch": 0.06994391290003299,
"grad_norm": 0.26655322313308716,
"learning_rate": 8e-05,
"loss": 1.5985,
"step": 318
},
{
"epoch": 0.07016386231166831,
"grad_norm": 0.2731332778930664,
"learning_rate": 8e-05,
"loss": 1.5995,
"step": 319
},
{
"epoch": 0.07038381172330364,
"grad_norm": 0.2769210934638977,
"learning_rate": 8e-05,
"loss": 1.6748,
"step": 320
},
{
"epoch": 0.07060376113493896,
"grad_norm": 0.290889173746109,
"learning_rate": 8e-05,
"loss": 1.9427,
"step": 321
},
{
"epoch": 0.07082371054657428,
"grad_norm": 0.2911258339881897,
"learning_rate": 8e-05,
"loss": 1.7723,
"step": 322
},
{
"epoch": 0.07104365995820962,
"grad_norm": 0.301992267370224,
"learning_rate": 8e-05,
"loss": 1.7772,
"step": 323
},
{
"epoch": 0.07126360936984494,
"grad_norm": 0.3023516535758972,
"learning_rate": 8e-05,
"loss": 1.8363,
"step": 324
},
{
"epoch": 0.07148355878148026,
"grad_norm": 0.3058542013168335,
"learning_rate": 8e-05,
"loss": 1.8762,
"step": 325
},
{
"epoch": 0.07170350819311558,
"grad_norm": 0.3215092718601227,
"learning_rate": 8e-05,
"loss": 1.7265,
"step": 326
},
{
"epoch": 0.07192345760475091,
"grad_norm": 0.2762998342514038,
"learning_rate": 8e-05,
"loss": 1.6361,
"step": 327
},
{
"epoch": 0.07214340701638623,
"grad_norm": 0.258635014295578,
"learning_rate": 8e-05,
"loss": 1.7031,
"step": 328
},
{
"epoch": 0.07236335642802155,
"grad_norm": 0.27160710096359253,
"learning_rate": 8e-05,
"loss": 1.6759,
"step": 329
},
{
"epoch": 0.07258330583965687,
"grad_norm": 0.31089314818382263,
"learning_rate": 8e-05,
"loss": 1.8141,
"step": 330
},
{
"epoch": 0.0728032552512922,
"grad_norm": 0.3026575744152069,
"learning_rate": 8e-05,
"loss": 1.9513,
"step": 331
},
{
"epoch": 0.07302320466292753,
"grad_norm": 0.2692122161388397,
"learning_rate": 8e-05,
"loss": 1.8277,
"step": 332
},
{
"epoch": 0.07324315407456285,
"grad_norm": 0.27460286021232605,
"learning_rate": 8e-05,
"loss": 1.6426,
"step": 333
},
{
"epoch": 0.07346310348619818,
"grad_norm": 0.2557325065135956,
"learning_rate": 8e-05,
"loss": 1.6418,
"step": 334
},
{
"epoch": 0.0736830528978335,
"grad_norm": 0.28074318170547485,
"learning_rate": 8e-05,
"loss": 1.79,
"step": 335
},
{
"epoch": 0.07390300230946882,
"grad_norm": 0.28538671135902405,
"learning_rate": 8e-05,
"loss": 1.7363,
"step": 336
},
{
"epoch": 0.07412295172110414,
"grad_norm": 0.27379995584487915,
"learning_rate": 8e-05,
"loss": 1.7881,
"step": 337
},
{
"epoch": 0.07434290113273948,
"grad_norm": 0.2628316283226013,
"learning_rate": 8e-05,
"loss": 1.745,
"step": 338
},
{
"epoch": 0.0745628505443748,
"grad_norm": 0.2573058009147644,
"learning_rate": 8e-05,
"loss": 1.7997,
"step": 339
},
{
"epoch": 0.07478279995601012,
"grad_norm": 0.31905651092529297,
"learning_rate": 8e-05,
"loss": 1.8125,
"step": 340
},
{
"epoch": 0.07500274936764544,
"grad_norm": 0.2501446604728699,
"learning_rate": 8e-05,
"loss": 1.557,
"step": 341
},
{
"epoch": 0.07522269877928077,
"grad_norm": 0.26969289779663086,
"learning_rate": 8e-05,
"loss": 1.7819,
"step": 342
},
{
"epoch": 0.07544264819091609,
"grad_norm": 0.28457415103912354,
"learning_rate": 8e-05,
"loss": 1.7682,
"step": 343
},
{
"epoch": 0.07566259760255141,
"grad_norm": 0.27833452820777893,
"learning_rate": 8e-05,
"loss": 1.8436,
"step": 344
},
{
"epoch": 0.07588254701418674,
"grad_norm": 0.2574867010116577,
"learning_rate": 8e-05,
"loss": 1.7196,
"step": 345
},
{
"epoch": 0.07610249642582206,
"grad_norm": 0.30035245418548584,
"learning_rate": 8e-05,
"loss": 1.7159,
"step": 346
},
{
"epoch": 0.07632244583745738,
"grad_norm": 0.284169465303421,
"learning_rate": 8e-05,
"loss": 1.7238,
"step": 347
},
{
"epoch": 0.0765423952490927,
"grad_norm": 0.257168173789978,
"learning_rate": 8e-05,
"loss": 1.8531,
"step": 348
},
{
"epoch": 0.07676234466072804,
"grad_norm": 0.2611413300037384,
"learning_rate": 8e-05,
"loss": 1.7753,
"step": 349
},
{
"epoch": 0.07698229407236336,
"grad_norm": 0.26592132449150085,
"learning_rate": 8e-05,
"loss": 1.7557,
"step": 350
},
{
"epoch": 0.07720224348399868,
"grad_norm": 0.27427396178245544,
"learning_rate": 8e-05,
"loss": 1.8699,
"step": 351
},
{
"epoch": 0.077422192895634,
"grad_norm": 0.27014485001564026,
"learning_rate": 8e-05,
"loss": 1.816,
"step": 352
},
{
"epoch": 0.07764214230726933,
"grad_norm": 0.27720019221305847,
"learning_rate": 8e-05,
"loss": 1.9601,
"step": 353
},
{
"epoch": 0.07786209171890465,
"grad_norm": 0.3222314417362213,
"learning_rate": 8e-05,
"loss": 1.6726,
"step": 354
},
{
"epoch": 0.07808204113053997,
"grad_norm": 0.2675410211086273,
"learning_rate": 8e-05,
"loss": 1.7113,
"step": 355
},
{
"epoch": 0.0783019905421753,
"grad_norm": 0.2902251183986664,
"learning_rate": 8e-05,
"loss": 1.7734,
"step": 356
},
{
"epoch": 0.07852193995381063,
"grad_norm": 0.2985514998435974,
"learning_rate": 8e-05,
"loss": 1.9182,
"step": 357
},
{
"epoch": 0.07874188936544595,
"grad_norm": 0.30351343750953674,
"learning_rate": 8e-05,
"loss": 1.7795,
"step": 358
},
{
"epoch": 0.07896183877708127,
"grad_norm": 0.2885829210281372,
"learning_rate": 8e-05,
"loss": 1.8054,
"step": 359
},
{
"epoch": 0.0791817881887166,
"grad_norm": 0.273366242647171,
"learning_rate": 8e-05,
"loss": 1.7903,
"step": 360
},
{
"epoch": 0.07940173760035192,
"grad_norm": 0.2959200441837311,
"learning_rate": 8e-05,
"loss": 1.9163,
"step": 361
},
{
"epoch": 0.07962168701198724,
"grad_norm": 0.2587856948375702,
"learning_rate": 8e-05,
"loss": 1.5969,
"step": 362
},
{
"epoch": 0.07984163642362256,
"grad_norm": 0.27777665853500366,
"learning_rate": 8e-05,
"loss": 1.8769,
"step": 363
},
{
"epoch": 0.0800615858352579,
"grad_norm": 0.2635156512260437,
"learning_rate": 8e-05,
"loss": 1.8236,
"step": 364
},
{
"epoch": 0.08028153524689322,
"grad_norm": 0.26534774899482727,
"learning_rate": 8e-05,
"loss": 1.6824,
"step": 365
},
{
"epoch": 0.08050148465852854,
"grad_norm": 0.26372772455215454,
"learning_rate": 8e-05,
"loss": 1.517,
"step": 366
},
{
"epoch": 0.08072143407016386,
"grad_norm": 0.2707895338535309,
"learning_rate": 8e-05,
"loss": 1.6757,
"step": 367
},
{
"epoch": 0.08094138348179919,
"grad_norm": 0.2712070345878601,
"learning_rate": 8e-05,
"loss": 1.7261,
"step": 368
},
{
"epoch": 0.08116133289343451,
"grad_norm": 0.2870525121688843,
"learning_rate": 8e-05,
"loss": 1.6337,
"step": 369
},
{
"epoch": 0.08138128230506983,
"grad_norm": 0.30548396706581116,
"learning_rate": 8e-05,
"loss": 1.8733,
"step": 370
},
{
"epoch": 0.08160123171670516,
"grad_norm": 0.2853962182998657,
"learning_rate": 8e-05,
"loss": 1.7938,
"step": 371
},
{
"epoch": 0.08182118112834048,
"grad_norm": 0.2716579735279083,
"learning_rate": 8e-05,
"loss": 1.6733,
"step": 372
},
{
"epoch": 0.0820411305399758,
"grad_norm": 0.3110131025314331,
"learning_rate": 8e-05,
"loss": 1.8554,
"step": 373
},
{
"epoch": 0.08226107995161112,
"grad_norm": 0.28003835678100586,
"learning_rate": 8e-05,
"loss": 1.8032,
"step": 374
},
{
"epoch": 0.08248102936324646,
"grad_norm": 0.28504347801208496,
"learning_rate": 8e-05,
"loss": 1.942,
"step": 375
},
{
"epoch": 0.08270097877488178,
"grad_norm": 0.2593232989311218,
"learning_rate": 8e-05,
"loss": 1.4993,
"step": 376
},
{
"epoch": 0.0829209281865171,
"grad_norm": 0.35680094361305237,
"learning_rate": 8e-05,
"loss": 1.8997,
"step": 377
},
{
"epoch": 0.08314087759815242,
"grad_norm": 0.2747777998447418,
"learning_rate": 8e-05,
"loss": 1.7364,
"step": 378
},
{
"epoch": 0.08336082700978775,
"grad_norm": 0.26816287636756897,
"learning_rate": 8e-05,
"loss": 1.7011,
"step": 379
},
{
"epoch": 0.08358077642142307,
"grad_norm": 0.31877851486206055,
"learning_rate": 8e-05,
"loss": 1.6131,
"step": 380
},
{
"epoch": 0.08380072583305839,
"grad_norm": 0.2845601737499237,
"learning_rate": 8e-05,
"loss": 1.6544,
"step": 381
},
{
"epoch": 0.08402067524469373,
"grad_norm": 0.27758803963661194,
"learning_rate": 8e-05,
"loss": 1.8891,
"step": 382
},
{
"epoch": 0.08424062465632905,
"grad_norm": 0.2832657992839813,
"learning_rate": 8e-05,
"loss": 1.7505,
"step": 383
},
{
"epoch": 0.08446057406796437,
"grad_norm": 0.2901705801486969,
"learning_rate": 8e-05,
"loss": 1.7501,
"step": 384
},
{
"epoch": 0.08468052347959969,
"grad_norm": 0.31189531087875366,
"learning_rate": 8e-05,
"loss": 1.8132,
"step": 385
},
{
"epoch": 0.08490047289123502,
"grad_norm": 0.27582603693008423,
"learning_rate": 8e-05,
"loss": 1.7693,
"step": 386
},
{
"epoch": 0.08512042230287034,
"grad_norm": 0.3030100464820862,
"learning_rate": 8e-05,
"loss": 1.7327,
"step": 387
},
{
"epoch": 0.08534037171450566,
"grad_norm": 0.26879045367240906,
"learning_rate": 8e-05,
"loss": 1.6614,
"step": 388
},
{
"epoch": 0.08556032112614098,
"grad_norm": 0.29507508873939514,
"learning_rate": 8e-05,
"loss": 1.9483,
"step": 389
},
{
"epoch": 0.08578027053777632,
"grad_norm": 0.27386122941970825,
"learning_rate": 8e-05,
"loss": 1.8974,
"step": 390
},
{
"epoch": 0.08600021994941164,
"grad_norm": 0.27103161811828613,
"learning_rate": 8e-05,
"loss": 1.7579,
"step": 391
},
{
"epoch": 0.08622016936104696,
"grad_norm": 0.3045141100883484,
"learning_rate": 8e-05,
"loss": 1.8175,
"step": 392
},
{
"epoch": 0.08644011877268229,
"grad_norm": 0.29032695293426514,
"learning_rate": 8e-05,
"loss": 1.7493,
"step": 393
},
{
"epoch": 0.08666006818431761,
"grad_norm": 0.27853158116340637,
"learning_rate": 8e-05,
"loss": 1.7297,
"step": 394
},
{
"epoch": 0.08688001759595293,
"grad_norm": 0.3007650375366211,
"learning_rate": 8e-05,
"loss": 1.6736,
"step": 395
},
{
"epoch": 0.08709996700758825,
"grad_norm": 0.28009670972824097,
"learning_rate": 8e-05,
"loss": 1.9539,
"step": 396
},
{
"epoch": 0.08731991641922358,
"grad_norm": 0.2512955665588379,
"learning_rate": 8e-05,
"loss": 1.6362,
"step": 397
},
{
"epoch": 0.0875398658308589,
"grad_norm": 0.297489732503891,
"learning_rate": 8e-05,
"loss": 1.9097,
"step": 398
},
{
"epoch": 0.08775981524249422,
"grad_norm": 0.2735532522201538,
"learning_rate": 8e-05,
"loss": 1.8348,
"step": 399
},
{
"epoch": 0.08797976465412954,
"grad_norm": 0.2559053897857666,
"learning_rate": 8e-05,
"loss": 1.685,
"step": 400
},
{
"epoch": 0.08819971406576488,
"grad_norm": 0.27982097864151,
"learning_rate": 8e-05,
"loss": 1.6801,
"step": 401
},
{
"epoch": 0.0884196634774002,
"grad_norm": 0.26066988706588745,
"learning_rate": 8e-05,
"loss": 1.7732,
"step": 402
},
{
"epoch": 0.08863961288903552,
"grad_norm": 0.26763463020324707,
"learning_rate": 8e-05,
"loss": 1.7214,
"step": 403
},
{
"epoch": 0.08885956230067085,
"grad_norm": 0.2795925736427307,
"learning_rate": 8e-05,
"loss": 1.8387,
"step": 404
},
{
"epoch": 0.08907951171230617,
"grad_norm": 0.266305148601532,
"learning_rate": 8e-05,
"loss": 1.6515,
"step": 405
},
{
"epoch": 0.08929946112394149,
"grad_norm": 0.27049583196640015,
"learning_rate": 8e-05,
"loss": 1.7824,
"step": 406
},
{
"epoch": 0.08951941053557681,
"grad_norm": 0.2959458529949188,
"learning_rate": 8e-05,
"loss": 1.8766,
"step": 407
},
{
"epoch": 0.08973935994721215,
"grad_norm": 0.28563347458839417,
"learning_rate": 8e-05,
"loss": 1.8618,
"step": 408
},
{
"epoch": 0.08995930935884747,
"grad_norm": 0.2840110659599304,
"learning_rate": 8e-05,
"loss": 1.6834,
"step": 409
},
{
"epoch": 0.09017925877048279,
"grad_norm": 0.25303247570991516,
"learning_rate": 8e-05,
"loss": 1.6477,
"step": 410
},
{
"epoch": 0.09039920818211811,
"grad_norm": 0.27236899733543396,
"learning_rate": 8e-05,
"loss": 1.7004,
"step": 411
},
{
"epoch": 0.09061915759375344,
"grad_norm": 0.2795659899711609,
"learning_rate": 8e-05,
"loss": 1.7492,
"step": 412
},
{
"epoch": 0.09083910700538876,
"grad_norm": 0.26019132137298584,
"learning_rate": 8e-05,
"loss": 1.691,
"step": 413
},
{
"epoch": 0.09105905641702408,
"grad_norm": 0.26624274253845215,
"learning_rate": 8e-05,
"loss": 1.7001,
"step": 414
},
{
"epoch": 0.09127900582865942,
"grad_norm": 0.2661585509777069,
"learning_rate": 8e-05,
"loss": 1.6762,
"step": 415
},
{
"epoch": 0.09149895524029474,
"grad_norm": 0.2719002068042755,
"learning_rate": 8e-05,
"loss": 1.6915,
"step": 416
},
{
"epoch": 0.09171890465193006,
"grad_norm": 0.24670244753360748,
"learning_rate": 8e-05,
"loss": 1.5598,
"step": 417
},
{
"epoch": 0.09193885406356538,
"grad_norm": 0.2550405263900757,
"learning_rate": 8e-05,
"loss": 1.4817,
"step": 418
},
{
"epoch": 0.09215880347520071,
"grad_norm": 0.26272761821746826,
"learning_rate": 8e-05,
"loss": 1.7016,
"step": 419
},
{
"epoch": 0.09237875288683603,
"grad_norm": 0.2673632502555847,
"learning_rate": 8e-05,
"loss": 1.7626,
"step": 420
},
{
"epoch": 0.09259870229847135,
"grad_norm": 0.25949448347091675,
"learning_rate": 8e-05,
"loss": 1.6273,
"step": 421
},
{
"epoch": 0.09281865171010667,
"grad_norm": 0.27953028678894043,
"learning_rate": 8e-05,
"loss": 1.8843,
"step": 422
},
{
"epoch": 0.093038601121742,
"grad_norm": 0.2534630298614502,
"learning_rate": 8e-05,
"loss": 1.7305,
"step": 423
},
{
"epoch": 0.09325855053337732,
"grad_norm": 0.2573072910308838,
"learning_rate": 8e-05,
"loss": 1.6397,
"step": 424
},
{
"epoch": 0.09347849994501264,
"grad_norm": 0.2604135572910309,
"learning_rate": 8e-05,
"loss": 1.6696,
"step": 425
},
{
"epoch": 0.09369844935664796,
"grad_norm": 0.25805628299713135,
"learning_rate": 8e-05,
"loss": 1.6441,
"step": 426
},
{
"epoch": 0.0939183987682833,
"grad_norm": 0.2935563027858734,
"learning_rate": 8e-05,
"loss": 1.6475,
"step": 427
},
{
"epoch": 0.09413834817991862,
"grad_norm": 0.25222933292388916,
"learning_rate": 8e-05,
"loss": 1.727,
"step": 428
},
{
"epoch": 0.09435829759155394,
"grad_norm": 0.2593076527118683,
"learning_rate": 8e-05,
"loss": 1.7066,
"step": 429
},
{
"epoch": 0.09457824700318927,
"grad_norm": 0.25259336829185486,
"learning_rate": 8e-05,
"loss": 1.6821,
"step": 430
},
{
"epoch": 0.09479819641482459,
"grad_norm": 0.2512541115283966,
"learning_rate": 8e-05,
"loss": 1.5923,
"step": 431
},
{
"epoch": 0.09501814582645991,
"grad_norm": 0.2711183726787567,
"learning_rate": 8e-05,
"loss": 1.755,
"step": 432
},
{
"epoch": 0.09523809523809523,
"grad_norm": 0.2782961130142212,
"learning_rate": 8e-05,
"loss": 1.8914,
"step": 433
},
{
"epoch": 0.09545804464973057,
"grad_norm": 0.25964146852493286,
"learning_rate": 8e-05,
"loss": 1.6588,
"step": 434
},
{
"epoch": 0.09567799406136589,
"grad_norm": 0.27077510952949524,
"learning_rate": 8e-05,
"loss": 1.753,
"step": 435
},
{
"epoch": 0.09589794347300121,
"grad_norm": 0.2923937141895294,
"learning_rate": 8e-05,
"loss": 1.8218,
"step": 436
},
{
"epoch": 0.09611789288463653,
"grad_norm": 0.2513190805912018,
"learning_rate": 8e-05,
"loss": 1.6232,
"step": 437
},
{
"epoch": 0.09633784229627186,
"grad_norm": 0.28531181812286377,
"learning_rate": 8e-05,
"loss": 1.7199,
"step": 438
},
{
"epoch": 0.09655779170790718,
"grad_norm": 0.302020400762558,
"learning_rate": 8e-05,
"loss": 1.8359,
"step": 439
},
{
"epoch": 0.0967777411195425,
"grad_norm": 0.28001338243484497,
"learning_rate": 8e-05,
"loss": 1.8434,
"step": 440
},
{
"epoch": 0.09699769053117784,
"grad_norm": 0.2990663945674896,
"learning_rate": 8e-05,
"loss": 1.6995,
"step": 441
},
{
"epoch": 0.09721763994281316,
"grad_norm": 0.266197144985199,
"learning_rate": 8e-05,
"loss": 1.6195,
"step": 442
},
{
"epoch": 0.09743758935444848,
"grad_norm": 0.28108519315719604,
"learning_rate": 8e-05,
"loss": 1.8108,
"step": 443
},
{
"epoch": 0.0976575387660838,
"grad_norm": 0.26744788885116577,
"learning_rate": 8e-05,
"loss": 1.6497,
"step": 444
},
{
"epoch": 0.09787748817771913,
"grad_norm": 0.28030574321746826,
"learning_rate": 8e-05,
"loss": 1.8143,
"step": 445
},
{
"epoch": 0.09809743758935445,
"grad_norm": 0.27872079610824585,
"learning_rate": 8e-05,
"loss": 1.6319,
"step": 446
},
{
"epoch": 0.09831738700098977,
"grad_norm": 0.2816067039966583,
"learning_rate": 8e-05,
"loss": 1.8385,
"step": 447
},
{
"epoch": 0.09853733641262509,
"grad_norm": 0.25677627325057983,
"learning_rate": 8e-05,
"loss": 1.6885,
"step": 448
},
{
"epoch": 0.09875728582426042,
"grad_norm": 0.276569128036499,
"learning_rate": 8e-05,
"loss": 1.7652,
"step": 449
},
{
"epoch": 0.09897723523589574,
"grad_norm": 0.2765633463859558,
"learning_rate": 8e-05,
"loss": 1.7763,
"step": 450
},
{
"epoch": 0.09919718464753106,
"grad_norm": 0.27050015330314636,
"learning_rate": 8e-05,
"loss": 1.6459,
"step": 451
},
{
"epoch": 0.0994171340591664,
"grad_norm": 0.2552846372127533,
"learning_rate": 8e-05,
"loss": 1.6877,
"step": 452
},
{
"epoch": 0.09963708347080172,
"grad_norm": 0.2653469741344452,
"learning_rate": 8e-05,
"loss": 1.6536,
"step": 453
},
{
"epoch": 0.09985703288243704,
"grad_norm": 0.28801941871643066,
"learning_rate": 8e-05,
"loss": 1.7643,
"step": 454
},
{
"epoch": 0.10007698229407236,
"grad_norm": 0.2930269241333008,
"learning_rate": 8e-05,
"loss": 1.7766,
"step": 455
},
{
"epoch": 0.10029693170570769,
"grad_norm": 0.2718334496021271,
"learning_rate": 8e-05,
"loss": 1.7347,
"step": 456
},
{
"epoch": 0.10051688111734301,
"grad_norm": 0.2807629704475403,
"learning_rate": 8e-05,
"loss": 1.7245,
"step": 457
},
{
"epoch": 0.10073683052897833,
"grad_norm": 0.2801489531993866,
"learning_rate": 8e-05,
"loss": 1.7854,
"step": 458
},
{
"epoch": 0.10095677994061365,
"grad_norm": 0.2616996765136719,
"learning_rate": 8e-05,
"loss": 1.6179,
"step": 459
},
{
"epoch": 0.10117672935224899,
"grad_norm": 0.2626480758190155,
"learning_rate": 8e-05,
"loss": 1.7475,
"step": 460
},
{
"epoch": 0.10139667876388431,
"grad_norm": 0.27338841557502747,
"learning_rate": 8e-05,
"loss": 1.8972,
"step": 461
},
{
"epoch": 0.10161662817551963,
"grad_norm": 0.2695038616657257,
"learning_rate": 8e-05,
"loss": 1.7279,
"step": 462
},
{
"epoch": 0.10183657758715496,
"grad_norm": 0.25614050030708313,
"learning_rate": 8e-05,
"loss": 1.6,
"step": 463
},
{
"epoch": 0.10205652699879028,
"grad_norm": 0.2722180187702179,
"learning_rate": 8e-05,
"loss": 1.9241,
"step": 464
},
{
"epoch": 0.1022764764104256,
"grad_norm": 0.2580203115940094,
"learning_rate": 8e-05,
"loss": 1.693,
"step": 465
},
{
"epoch": 0.10249642582206092,
"grad_norm": 0.2848857641220093,
"learning_rate": 8e-05,
"loss": 1.9072,
"step": 466
},
{
"epoch": 0.10271637523369626,
"grad_norm": 0.2783052325248718,
"learning_rate": 8e-05,
"loss": 1.9102,
"step": 467
},
{
"epoch": 0.10293632464533158,
"grad_norm": 0.279695987701416,
"learning_rate": 8e-05,
"loss": 1.7491,
"step": 468
},
{
"epoch": 0.1031562740569669,
"grad_norm": 0.2493034154176712,
"learning_rate": 8e-05,
"loss": 1.6789,
"step": 469
},
{
"epoch": 0.10337622346860222,
"grad_norm": 0.2751196622848511,
"learning_rate": 8e-05,
"loss": 1.8132,
"step": 470
},
{
"epoch": 0.10359617288023755,
"grad_norm": 0.2739677131175995,
"learning_rate": 8e-05,
"loss": 1.7945,
"step": 471
},
{
"epoch": 0.10381612229187287,
"grad_norm": 0.30357351899147034,
"learning_rate": 8e-05,
"loss": 1.9113,
"step": 472
},
{
"epoch": 0.10403607170350819,
"grad_norm": 0.2646970748901367,
"learning_rate": 8e-05,
"loss": 1.811,
"step": 473
},
{
"epoch": 0.10425602111514351,
"grad_norm": 0.2626940608024597,
"learning_rate": 8e-05,
"loss": 1.6911,
"step": 474
},
{
"epoch": 0.10447597052677884,
"grad_norm": 0.2613508701324463,
"learning_rate": 8e-05,
"loss": 1.7209,
"step": 475
},
{
"epoch": 0.10469591993841416,
"grad_norm": 0.2609264552593231,
"learning_rate": 8e-05,
"loss": 1.6303,
"step": 476
},
{
"epoch": 0.10491586935004948,
"grad_norm": 0.2549975514411926,
"learning_rate": 8e-05,
"loss": 1.7769,
"step": 477
},
{
"epoch": 0.10513581876168482,
"grad_norm": 0.2742570638656616,
"learning_rate": 8e-05,
"loss": 1.8101,
"step": 478
},
{
"epoch": 0.10535576817332014,
"grad_norm": 0.267070472240448,
"learning_rate": 8e-05,
"loss": 1.787,
"step": 479
},
{
"epoch": 0.10557571758495546,
"grad_norm": 0.2735085189342499,
"learning_rate": 8e-05,
"loss": 1.8112,
"step": 480
},
{
"epoch": 0.10579566699659078,
"grad_norm": 0.260111540555954,
"learning_rate": 8e-05,
"loss": 1.6926,
"step": 481
},
{
"epoch": 0.10601561640822611,
"grad_norm": 0.26309284567832947,
"learning_rate": 8e-05,
"loss": 1.778,
"step": 482
},
{
"epoch": 0.10623556581986143,
"grad_norm": 0.2658458948135376,
"learning_rate": 8e-05,
"loss": 1.7179,
"step": 483
},
{
"epoch": 0.10645551523149675,
"grad_norm": 0.27498647570610046,
"learning_rate": 8e-05,
"loss": 1.6689,
"step": 484
},
{
"epoch": 0.10667546464313207,
"grad_norm": 0.2658367156982422,
"learning_rate": 8e-05,
"loss": 1.6786,
"step": 485
},
{
"epoch": 0.10689541405476741,
"grad_norm": 0.26023292541503906,
"learning_rate": 8e-05,
"loss": 1.6995,
"step": 486
},
{
"epoch": 0.10711536346640273,
"grad_norm": 0.25749459862709045,
"learning_rate": 8e-05,
"loss": 1.6614,
"step": 487
},
{
"epoch": 0.10733531287803805,
"grad_norm": 0.26305267214775085,
"learning_rate": 8e-05,
"loss": 1.6838,
"step": 488
},
{
"epoch": 0.10755526228967338,
"grad_norm": 0.25277695059776306,
"learning_rate": 8e-05,
"loss": 1.6975,
"step": 489
},
{
"epoch": 0.1077752117013087,
"grad_norm": 0.2584420144557953,
"learning_rate": 8e-05,
"loss": 1.7434,
"step": 490
},
{
"epoch": 0.10799516111294402,
"grad_norm": 0.28107360005378723,
"learning_rate": 8e-05,
"loss": 1.8037,
"step": 491
},
{
"epoch": 0.10821511052457934,
"grad_norm": 0.553341269493103,
"learning_rate": 8e-05,
"loss": 1.8896,
"step": 492
},
{
"epoch": 0.10843505993621468,
"grad_norm": 0.2718677222728729,
"learning_rate": 8e-05,
"loss": 1.6646,
"step": 493
},
{
"epoch": 0.10865500934785,
"grad_norm": 0.27301734685897827,
"learning_rate": 8e-05,
"loss": 1.6663,
"step": 494
},
{
"epoch": 0.10887495875948532,
"grad_norm": 0.26952439546585083,
"learning_rate": 8e-05,
"loss": 1.7228,
"step": 495
},
{
"epoch": 0.10909490817112064,
"grad_norm": 0.3017599582672119,
"learning_rate": 8e-05,
"loss": 1.7936,
"step": 496
},
{
"epoch": 0.10931485758275597,
"grad_norm": 0.2676602303981781,
"learning_rate": 8e-05,
"loss": 1.7861,
"step": 497
},
{
"epoch": 0.10953480699439129,
"grad_norm": 0.27192267775535583,
"learning_rate": 8e-05,
"loss": 1.8032,
"step": 498
},
{
"epoch": 0.10975475640602661,
"grad_norm": 0.2807183861732483,
"learning_rate": 8e-05,
"loss": 1.6331,
"step": 499
},
{
"epoch": 0.10997470581766194,
"grad_norm": 0.2652963399887085,
"learning_rate": 8e-05,
"loss": 1.6231,
"step": 500
},
{
"epoch": 0.11019465522929726,
"grad_norm": 0.26010751724243164,
"learning_rate": 8e-05,
"loss": 1.729,
"step": 501
},
{
"epoch": 0.11041460464093258,
"grad_norm": 0.29573148488998413,
"learning_rate": 8e-05,
"loss": 1.8082,
"step": 502
},
{
"epoch": 0.1106345540525679,
"grad_norm": 0.28008025884628296,
"learning_rate": 8e-05,
"loss": 1.6829,
"step": 503
},
{
"epoch": 0.11085450346420324,
"grad_norm": 0.3029135763645172,
"learning_rate": 8e-05,
"loss": 1.7699,
"step": 504
},
{
"epoch": 0.11107445287583856,
"grad_norm": 0.2821674346923828,
"learning_rate": 8e-05,
"loss": 1.7337,
"step": 505
},
{
"epoch": 0.11129440228747388,
"grad_norm": 0.274880975484848,
"learning_rate": 8e-05,
"loss": 1.7973,
"step": 506
},
{
"epoch": 0.1115143516991092,
"grad_norm": 0.28885796666145325,
"learning_rate": 8e-05,
"loss": 1.7756,
"step": 507
},
{
"epoch": 0.11173430111074453,
"grad_norm": 0.2744079530239105,
"learning_rate": 8e-05,
"loss": 1.7991,
"step": 508
},
{
"epoch": 0.11195425052237985,
"grad_norm": 0.2645000219345093,
"learning_rate": 8e-05,
"loss": 1.6566,
"step": 509
},
{
"epoch": 0.11217419993401517,
"grad_norm": 0.2640466094017029,
"learning_rate": 8e-05,
"loss": 1.6649,
"step": 510
},
{
"epoch": 0.11239414934565051,
"grad_norm": 0.2965867817401886,
"learning_rate": 8e-05,
"loss": 1.7733,
"step": 511
},
{
"epoch": 0.11261409875728583,
"grad_norm": 0.2533203661441803,
"learning_rate": 8e-05,
"loss": 1.7194,
"step": 512
},
{
"epoch": 0.11283404816892115,
"grad_norm": 0.261994868516922,
"learning_rate": 8e-05,
"loss": 1.7387,
"step": 513
},
{
"epoch": 0.11305399758055647,
"grad_norm": 0.2868165969848633,
"learning_rate": 8e-05,
"loss": 1.7444,
"step": 514
},
{
"epoch": 0.1132739469921918,
"grad_norm": 0.2836281657218933,
"learning_rate": 8e-05,
"loss": 1.6507,
"step": 515
},
{
"epoch": 0.11349389640382712,
"grad_norm": 0.28675276041030884,
"learning_rate": 8e-05,
"loss": 1.7054,
"step": 516
},
{
"epoch": 0.11371384581546244,
"grad_norm": 0.2745465040206909,
"learning_rate": 8e-05,
"loss": 1.77,
"step": 517
},
{
"epoch": 0.11393379522709776,
"grad_norm": 0.27250972390174866,
"learning_rate": 8e-05,
"loss": 1.9102,
"step": 518
},
{
"epoch": 0.1141537446387331,
"grad_norm": 0.2781262695789337,
"learning_rate": 8e-05,
"loss": 1.8126,
"step": 519
},
{
"epoch": 0.11437369405036842,
"grad_norm": 0.2691183388233185,
"learning_rate": 8e-05,
"loss": 1.5978,
"step": 520
},
{
"epoch": 0.11459364346200374,
"grad_norm": 0.29496780037879944,
"learning_rate": 8e-05,
"loss": 1.9214,
"step": 521
},
{
"epoch": 0.11481359287363907,
"grad_norm": 0.27725401520729065,
"learning_rate": 8e-05,
"loss": 1.8722,
"step": 522
},
{
"epoch": 0.11503354228527439,
"grad_norm": 0.28819364309310913,
"learning_rate": 8e-05,
"loss": 1.6739,
"step": 523
},
{
"epoch": 0.11525349169690971,
"grad_norm": 0.278857946395874,
"learning_rate": 8e-05,
"loss": 1.8137,
"step": 524
},
{
"epoch": 0.11547344110854503,
"grad_norm": 0.26911258697509766,
"learning_rate": 8e-05,
"loss": 1.7123,
"step": 525
},
{
"epoch": 0.11569339052018036,
"grad_norm": 0.2656850814819336,
"learning_rate": 8e-05,
"loss": 1.8124,
"step": 526
},
{
"epoch": 0.11591333993181568,
"grad_norm": 0.26521819829940796,
"learning_rate": 8e-05,
"loss": 1.8188,
"step": 527
},
{
"epoch": 0.116133289343451,
"grad_norm": 0.2821720540523529,
"learning_rate": 8e-05,
"loss": 1.7607,
"step": 528
},
{
"epoch": 0.11635323875508632,
"grad_norm": 0.294612854719162,
"learning_rate": 8e-05,
"loss": 1.8142,
"step": 529
},
{
"epoch": 0.11657318816672166,
"grad_norm": 0.29858094453811646,
"learning_rate": 8e-05,
"loss": 1.8795,
"step": 530
},
{
"epoch": 0.11679313757835698,
"grad_norm": 0.2726878821849823,
"learning_rate": 8e-05,
"loss": 1.7988,
"step": 531
},
{
"epoch": 0.1170130869899923,
"grad_norm": 0.2651258111000061,
"learning_rate": 8e-05,
"loss": 1.8106,
"step": 532
},
{
"epoch": 0.11723303640162762,
"grad_norm": 0.2681291997432709,
"learning_rate": 8e-05,
"loss": 1.6692,
"step": 533
},
{
"epoch": 0.11745298581326295,
"grad_norm": 0.2641060948371887,
"learning_rate": 8e-05,
"loss": 1.6479,
"step": 534
},
{
"epoch": 0.11767293522489827,
"grad_norm": 0.2850191593170166,
"learning_rate": 8e-05,
"loss": 1.7337,
"step": 535
},
{
"epoch": 0.1178928846365336,
"grad_norm": 0.2718667685985565,
"learning_rate": 8e-05,
"loss": 1.7069,
"step": 536
},
{
"epoch": 0.11811283404816893,
"grad_norm": 0.27950581908226013,
"learning_rate": 8e-05,
"loss": 1.83,
"step": 537
},
{
"epoch": 0.11833278345980425,
"grad_norm": 0.26720213890075684,
"learning_rate": 8e-05,
"loss": 1.6787,
"step": 538
},
{
"epoch": 0.11855273287143957,
"grad_norm": 0.25440508127212524,
"learning_rate": 8e-05,
"loss": 1.5966,
"step": 539
},
{
"epoch": 0.11877268228307489,
"grad_norm": 0.2716729938983917,
"learning_rate": 8e-05,
"loss": 1.793,
"step": 540
},
{
"epoch": 0.11899263169471022,
"grad_norm": 0.26204821467399597,
"learning_rate": 8e-05,
"loss": 1.5882,
"step": 541
},
{
"epoch": 0.11921258110634554,
"grad_norm": 0.2756775915622711,
"learning_rate": 8e-05,
"loss": 1.7529,
"step": 542
},
{
"epoch": 0.11943253051798086,
"grad_norm": 0.27235740423202515,
"learning_rate": 8e-05,
"loss": 1.7607,
"step": 543
},
{
"epoch": 0.11965247992961618,
"grad_norm": 0.27712538838386536,
"learning_rate": 8e-05,
"loss": 1.7504,
"step": 544
},
{
"epoch": 0.11987242934125152,
"grad_norm": 0.27800193428993225,
"learning_rate": 8e-05,
"loss": 1.7421,
"step": 545
},
{
"epoch": 0.12009237875288684,
"grad_norm": 0.27911701798439026,
"learning_rate": 8e-05,
"loss": 1.6683,
"step": 546
},
{
"epoch": 0.12031232816452216,
"grad_norm": 0.27643364667892456,
"learning_rate": 8e-05,
"loss": 1.6393,
"step": 547
},
{
"epoch": 0.12053227757615749,
"grad_norm": 0.25785166025161743,
"learning_rate": 8e-05,
"loss": 1.641,
"step": 548
},
{
"epoch": 0.12075222698779281,
"grad_norm": 0.2791956067085266,
"learning_rate": 8e-05,
"loss": 1.7585,
"step": 549
},
{
"epoch": 0.12097217639942813,
"grad_norm": 0.28245967626571655,
"learning_rate": 8e-05,
"loss": 1.8716,
"step": 550
},
{
"epoch": 0.12119212581106345,
"grad_norm": 0.27160346508026123,
"learning_rate": 8e-05,
"loss": 1.7023,
"step": 551
},
{
"epoch": 0.12141207522269878,
"grad_norm": 0.2670506536960602,
"learning_rate": 8e-05,
"loss": 1.5844,
"step": 552
},
{
"epoch": 0.1216320246343341,
"grad_norm": 0.2762441337108612,
"learning_rate": 8e-05,
"loss": 1.7286,
"step": 553
},
{
"epoch": 0.12185197404596942,
"grad_norm": 0.29608720541000366,
"learning_rate": 8e-05,
"loss": 1.7875,
"step": 554
},
{
"epoch": 0.12207192345760474,
"grad_norm": 0.2847777307033539,
"learning_rate": 8e-05,
"loss": 1.7388,
"step": 555
},
{
"epoch": 0.12229187286924008,
"grad_norm": 0.2769443988800049,
"learning_rate": 8e-05,
"loss": 1.8129,
"step": 556
},
{
"epoch": 0.1225118222808754,
"grad_norm": 0.27490487694740295,
"learning_rate": 8e-05,
"loss": 1.678,
"step": 557
},
{
"epoch": 0.12273177169251072,
"grad_norm": 0.2851822078227997,
"learning_rate": 8e-05,
"loss": 1.8268,
"step": 558
},
{
"epoch": 0.12295172110414605,
"grad_norm": 0.31336653232574463,
"learning_rate": 8e-05,
"loss": 1.8247,
"step": 559
},
{
"epoch": 0.12317167051578137,
"grad_norm": 0.26455923914909363,
"learning_rate": 8e-05,
"loss": 1.5548,
"step": 560
},
{
"epoch": 0.12339161992741669,
"grad_norm": 0.2750054597854614,
"learning_rate": 8e-05,
"loss": 1.7912,
"step": 561
},
{
"epoch": 0.12361156933905201,
"grad_norm": 0.28016433119773865,
"learning_rate": 8e-05,
"loss": 1.7367,
"step": 562
},
{
"epoch": 0.12383151875068735,
"grad_norm": 0.30594533681869507,
"learning_rate": 8e-05,
"loss": 1.7959,
"step": 563
},
{
"epoch": 0.12405146816232267,
"grad_norm": 0.2753421664237976,
"learning_rate": 8e-05,
"loss": 1.6714,
"step": 564
},
{
"epoch": 0.12427141757395799,
"grad_norm": 0.3309609889984131,
"learning_rate": 8e-05,
"loss": 1.7632,
"step": 565
},
{
"epoch": 0.12449136698559331,
"grad_norm": 0.3116569221019745,
"learning_rate": 8e-05,
"loss": 1.8312,
"step": 566
},
{
"epoch": 0.12471131639722864,
"grad_norm": 0.27756184339523315,
"learning_rate": 8e-05,
"loss": 1.6622,
"step": 567
},
{
"epoch": 0.12493126580886396,
"grad_norm": 0.2740349769592285,
"learning_rate": 8e-05,
"loss": 1.7015,
"step": 568
},
{
"epoch": 0.1251512152204993,
"grad_norm": 0.2696126401424408,
"learning_rate": 8e-05,
"loss": 1.6063,
"step": 569
},
{
"epoch": 0.1253711646321346,
"grad_norm": 0.29191461205482483,
"learning_rate": 8e-05,
"loss": 1.8429,
"step": 570
},
{
"epoch": 0.12559111404376994,
"grad_norm": 0.2984013855457306,
"learning_rate": 8e-05,
"loss": 1.8194,
"step": 571
},
{
"epoch": 0.12581106345540527,
"grad_norm": 0.27315613627433777,
"learning_rate": 8e-05,
"loss": 1.7027,
"step": 572
},
{
"epoch": 0.12603101286704058,
"grad_norm": 0.28547149896621704,
"learning_rate": 8e-05,
"loss": 1.694,
"step": 573
},
{
"epoch": 0.1262509622786759,
"grad_norm": 0.26458805799484253,
"learning_rate": 8e-05,
"loss": 1.7978,
"step": 574
},
{
"epoch": 0.12647091169031122,
"grad_norm": 0.29676830768585205,
"learning_rate": 8e-05,
"loss": 1.8295,
"step": 575
},
{
"epoch": 0.12669086110194655,
"grad_norm": 0.28077611327171326,
"learning_rate": 8e-05,
"loss": 1.7711,
"step": 576
},
{
"epoch": 0.12691081051358188,
"grad_norm": 0.256736159324646,
"learning_rate": 8e-05,
"loss": 1.5371,
"step": 577
},
{
"epoch": 0.1271307599252172,
"grad_norm": 0.2888578474521637,
"learning_rate": 8e-05,
"loss": 1.7532,
"step": 578
},
{
"epoch": 0.12735070933685252,
"grad_norm": 0.29349133372306824,
"learning_rate": 8e-05,
"loss": 1.856,
"step": 579
},
{
"epoch": 0.12757065874848786,
"grad_norm": 0.2626110911369324,
"learning_rate": 8e-05,
"loss": 1.5482,
"step": 580
},
{
"epoch": 0.12779060816012316,
"grad_norm": 0.2715248167514801,
"learning_rate": 8e-05,
"loss": 1.7003,
"step": 581
},
{
"epoch": 0.1280105575717585,
"grad_norm": 0.2800534963607788,
"learning_rate": 8e-05,
"loss": 1.7065,
"step": 582
},
{
"epoch": 0.12823050698339383,
"grad_norm": 0.3190186619758606,
"learning_rate": 8e-05,
"loss": 1.8099,
"step": 583
},
{
"epoch": 0.12845045639502914,
"grad_norm": 0.2689470648765564,
"learning_rate": 8e-05,
"loss": 1.7824,
"step": 584
},
{
"epoch": 0.12867040580666447,
"grad_norm": 0.2715473473072052,
"learning_rate": 8e-05,
"loss": 1.7721,
"step": 585
},
{
"epoch": 0.12889035521829978,
"grad_norm": 0.27956798672676086,
"learning_rate": 8e-05,
"loss": 1.7888,
"step": 586
},
{
"epoch": 0.1291103046299351,
"grad_norm": 0.2842330038547516,
"learning_rate": 8e-05,
"loss": 1.7131,
"step": 587
},
{
"epoch": 0.12933025404157045,
"grad_norm": 0.2888692021369934,
"learning_rate": 8e-05,
"loss": 1.7509,
"step": 588
},
{
"epoch": 0.12955020345320575,
"grad_norm": 0.27673423290252686,
"learning_rate": 8e-05,
"loss": 1.7235,
"step": 589
},
{
"epoch": 0.1297701528648411,
"grad_norm": 0.26007330417633057,
"learning_rate": 8e-05,
"loss": 1.7157,
"step": 590
},
{
"epoch": 0.12999010227647642,
"grad_norm": 0.27521616220474243,
"learning_rate": 8e-05,
"loss": 1.7519,
"step": 591
},
{
"epoch": 0.13021005168811173,
"grad_norm": 0.2753496766090393,
"learning_rate": 8e-05,
"loss": 1.6956,
"step": 592
},
{
"epoch": 0.13043000109974706,
"grad_norm": 0.25559505820274353,
"learning_rate": 8e-05,
"loss": 1.522,
"step": 593
},
{
"epoch": 0.1306499505113824,
"grad_norm": 0.26815375685691833,
"learning_rate": 8e-05,
"loss": 1.7658,
"step": 594
},
{
"epoch": 0.1308698999230177,
"grad_norm": 0.26870042085647583,
"learning_rate": 8e-05,
"loss": 1.779,
"step": 595
},
{
"epoch": 0.13108984933465304,
"grad_norm": 0.27346327900886536,
"learning_rate": 8e-05,
"loss": 1.7397,
"step": 596
},
{
"epoch": 0.13130979874628834,
"grad_norm": 0.26674172282218933,
"learning_rate": 8e-05,
"loss": 1.906,
"step": 597
},
{
"epoch": 0.13152974815792368,
"grad_norm": 0.266916960477829,
"learning_rate": 8e-05,
"loss": 1.6896,
"step": 598
},
{
"epoch": 0.131749697569559,
"grad_norm": 0.2620035707950592,
"learning_rate": 8e-05,
"loss": 1.8032,
"step": 599
},
{
"epoch": 0.13196964698119432,
"grad_norm": 0.2721168100833893,
"learning_rate": 8e-05,
"loss": 1.7992,
"step": 600
},
{
"epoch": 0.13218959639282965,
"grad_norm": 0.2902929186820984,
"learning_rate": 8e-05,
"loss": 1.8392,
"step": 601
},
{
"epoch": 0.13240954580446498,
"grad_norm": 0.267459899187088,
"learning_rate": 8e-05,
"loss": 1.8469,
"step": 602
},
{
"epoch": 0.1326294952161003,
"grad_norm": 0.25643131136894226,
"learning_rate": 8e-05,
"loss": 1.5562,
"step": 603
},
{
"epoch": 0.13284944462773562,
"grad_norm": 0.2919185757637024,
"learning_rate": 8e-05,
"loss": 1.7108,
"step": 604
},
{
"epoch": 0.13306939403937096,
"grad_norm": 0.2631925046443939,
"learning_rate": 8e-05,
"loss": 1.4344,
"step": 605
},
{
"epoch": 0.13328934345100626,
"grad_norm": 0.2710738182067871,
"learning_rate": 8e-05,
"loss": 1.6774,
"step": 606
},
{
"epoch": 0.1335092928626416,
"grad_norm": 0.2641798257827759,
"learning_rate": 8e-05,
"loss": 1.8032,
"step": 607
},
{
"epoch": 0.1337292422742769,
"grad_norm": 0.2571311891078949,
"learning_rate": 8e-05,
"loss": 1.63,
"step": 608
},
{
"epoch": 0.13394919168591224,
"grad_norm": 0.24528057873249054,
"learning_rate": 8e-05,
"loss": 1.4576,
"step": 609
},
{
"epoch": 0.13416914109754757,
"grad_norm": 0.270641028881073,
"learning_rate": 8e-05,
"loss": 1.7896,
"step": 610
},
{
"epoch": 0.13438909050918288,
"grad_norm": 0.2723008990287781,
"learning_rate": 8e-05,
"loss": 1.7894,
"step": 611
},
{
"epoch": 0.1346090399208182,
"grad_norm": 0.26487669348716736,
"learning_rate": 8e-05,
"loss": 1.7646,
"step": 612
},
{
"epoch": 0.13482898933245355,
"grad_norm": 0.26771143078804016,
"learning_rate": 8e-05,
"loss": 1.8015,
"step": 613
},
{
"epoch": 0.13504893874408885,
"grad_norm": 0.2585919499397278,
"learning_rate": 8e-05,
"loss": 1.6487,
"step": 614
},
{
"epoch": 0.1352688881557242,
"grad_norm": 0.28161996603012085,
"learning_rate": 8e-05,
"loss": 1.7813,
"step": 615
},
{
"epoch": 0.1354888375673595,
"grad_norm": 0.25246456265449524,
"learning_rate": 8e-05,
"loss": 1.5549,
"step": 616
},
{
"epoch": 0.13570878697899483,
"grad_norm": 0.2803630530834198,
"learning_rate": 8e-05,
"loss": 1.7545,
"step": 617
},
{
"epoch": 0.13592873639063016,
"grad_norm": 0.2587769031524658,
"learning_rate": 8e-05,
"loss": 1.6755,
"step": 618
},
{
"epoch": 0.13614868580226547,
"grad_norm": 0.2890148162841797,
"learning_rate": 8e-05,
"loss": 1.9753,
"step": 619
},
{
"epoch": 0.1363686352139008,
"grad_norm": 0.2924948036670685,
"learning_rate": 8e-05,
"loss": 1.7611,
"step": 620
},
{
"epoch": 0.13658858462553614,
"grad_norm": 0.2594594359397888,
"learning_rate": 8e-05,
"loss": 1.6945,
"step": 621
},
{
"epoch": 0.13680853403717144,
"grad_norm": 0.2853068709373474,
"learning_rate": 8e-05,
"loss": 1.8637,
"step": 622
},
{
"epoch": 0.13702848344880678,
"grad_norm": 0.2696111798286438,
"learning_rate": 8e-05,
"loss": 1.7777,
"step": 623
},
{
"epoch": 0.1372484328604421,
"grad_norm": 0.3137861490249634,
"learning_rate": 8e-05,
"loss": 1.8799,
"step": 624
},
{
"epoch": 0.13746838227207742,
"grad_norm": 0.25645750761032104,
"learning_rate": 8e-05,
"loss": 1.5023,
"step": 625
},
{
"epoch": 0.13768833168371275,
"grad_norm": 0.29853489995002747,
"learning_rate": 8e-05,
"loss": 1.9131,
"step": 626
},
{
"epoch": 0.13790828109534806,
"grad_norm": 0.2653225362300873,
"learning_rate": 8e-05,
"loss": 1.6835,
"step": 627
},
{
"epoch": 0.1381282305069834,
"grad_norm": 0.26686328649520874,
"learning_rate": 8e-05,
"loss": 1.7667,
"step": 628
},
{
"epoch": 0.13834817991861872,
"grad_norm": 0.26114073395729065,
"learning_rate": 8e-05,
"loss": 1.6925,
"step": 629
},
{
"epoch": 0.13856812933025403,
"grad_norm": 0.2520682215690613,
"learning_rate": 8e-05,
"loss": 1.6065,
"step": 630
},
{
"epoch": 0.13878807874188936,
"grad_norm": 0.2676456868648529,
"learning_rate": 8e-05,
"loss": 1.7353,
"step": 631
},
{
"epoch": 0.1390080281535247,
"grad_norm": 0.2525452673435211,
"learning_rate": 8e-05,
"loss": 1.5993,
"step": 632
},
{
"epoch": 0.13922797756516,
"grad_norm": 0.25620371103286743,
"learning_rate": 8e-05,
"loss": 1.7188,
"step": 633
},
{
"epoch": 0.13944792697679534,
"grad_norm": 0.4071904420852661,
"learning_rate": 8e-05,
"loss": 1.9348,
"step": 634
},
{
"epoch": 0.13966787638843067,
"grad_norm": 0.2656376361846924,
"learning_rate": 8e-05,
"loss": 1.7833,
"step": 635
},
{
"epoch": 0.13988782580006598,
"grad_norm": 0.25558993220329285,
"learning_rate": 8e-05,
"loss": 1.715,
"step": 636
},
{
"epoch": 0.1401077752117013,
"grad_norm": 0.28318601846694946,
"learning_rate": 8e-05,
"loss": 1.8012,
"step": 637
},
{
"epoch": 0.14032772462333662,
"grad_norm": 0.2558564245700836,
"learning_rate": 8e-05,
"loss": 1.5802,
"step": 638
},
{
"epoch": 0.14054767403497195,
"grad_norm": 0.26874974370002747,
"learning_rate": 8e-05,
"loss": 1.7884,
"step": 639
},
{
"epoch": 0.1407676234466073,
"grad_norm": 0.2960795760154724,
"learning_rate": 8e-05,
"loss": 1.7884,
"step": 640
},
{
"epoch": 0.1409875728582426,
"grad_norm": 0.3098964989185333,
"learning_rate": 8e-05,
"loss": 1.8844,
"step": 641
},
{
"epoch": 0.14120752226987793,
"grad_norm": 0.2819165885448456,
"learning_rate": 8e-05,
"loss": 1.7111,
"step": 642
},
{
"epoch": 0.14142747168151326,
"grad_norm": 0.26352617144584656,
"learning_rate": 8e-05,
"loss": 1.7337,
"step": 643
},
{
"epoch": 0.14164742109314857,
"grad_norm": 0.2622654139995575,
"learning_rate": 8e-05,
"loss": 1.7284,
"step": 644
},
{
"epoch": 0.1418673705047839,
"grad_norm": 0.2793010473251343,
"learning_rate": 8e-05,
"loss": 1.8534,
"step": 645
},
{
"epoch": 0.14208731991641924,
"grad_norm": 0.27972397208213806,
"learning_rate": 8e-05,
"loss": 1.7658,
"step": 646
},
{
"epoch": 0.14230726932805454,
"grad_norm": 0.25940972566604614,
"learning_rate": 8e-05,
"loss": 1.6676,
"step": 647
},
{
"epoch": 0.14252721873968988,
"grad_norm": 0.29578897356987,
"learning_rate": 8e-05,
"loss": 1.8002,
"step": 648
},
{
"epoch": 0.14274716815132518,
"grad_norm": 0.2577681541442871,
"learning_rate": 8e-05,
"loss": 1.6154,
"step": 649
},
{
"epoch": 0.14296711756296052,
"grad_norm": 0.2615002989768982,
"learning_rate": 8e-05,
"loss": 1.7539,
"step": 650
},
{
"epoch": 0.14318706697459585,
"grad_norm": 0.26044437289237976,
"learning_rate": 8e-05,
"loss": 1.5284,
"step": 651
},
{
"epoch": 0.14340701638623116,
"grad_norm": 0.28386443853378296,
"learning_rate": 8e-05,
"loss": 1.7188,
"step": 652
},
{
"epoch": 0.1436269657978665,
"grad_norm": 0.2579086124897003,
"learning_rate": 8e-05,
"loss": 1.6758,
"step": 653
},
{
"epoch": 0.14384691520950182,
"grad_norm": 0.263192743062973,
"learning_rate": 8e-05,
"loss": 1.7013,
"step": 654
},
{
"epoch": 0.14406686462113713,
"grad_norm": 0.26551106572151184,
"learning_rate": 8e-05,
"loss": 1.7314,
"step": 655
},
{
"epoch": 0.14428681403277246,
"grad_norm": 0.26143091917037964,
"learning_rate": 8e-05,
"loss": 1.7041,
"step": 656
},
{
"epoch": 0.1445067634444078,
"grad_norm": 0.26432663202285767,
"learning_rate": 8e-05,
"loss": 1.601,
"step": 657
},
{
"epoch": 0.1447267128560431,
"grad_norm": 0.2831920087337494,
"learning_rate": 8e-05,
"loss": 1.8573,
"step": 658
},
{
"epoch": 0.14494666226767844,
"grad_norm": 0.3045855462551117,
"learning_rate": 8e-05,
"loss": 1.7853,
"step": 659
},
{
"epoch": 0.14516661167931375,
"grad_norm": 0.28249257802963257,
"learning_rate": 8e-05,
"loss": 1.7525,
"step": 660
},
{
"epoch": 0.14538656109094908,
"grad_norm": 0.27501189708709717,
"learning_rate": 8e-05,
"loss": 1.6939,
"step": 661
},
{
"epoch": 0.1456065105025844,
"grad_norm": 0.28419750928878784,
"learning_rate": 8e-05,
"loss": 1.837,
"step": 662
},
{
"epoch": 0.14582645991421972,
"grad_norm": 0.28872454166412354,
"learning_rate": 8e-05,
"loss": 1.623,
"step": 663
},
{
"epoch": 0.14604640932585505,
"grad_norm": 0.2926316559314728,
"learning_rate": 8e-05,
"loss": 1.7438,
"step": 664
},
{
"epoch": 0.1462663587374904,
"grad_norm": 0.2716543972492218,
"learning_rate": 8e-05,
"loss": 1.8925,
"step": 665
},
{
"epoch": 0.1464863081491257,
"grad_norm": 0.2707289159297943,
"learning_rate": 8e-05,
"loss": 1.8218,
"step": 666
},
{
"epoch": 0.14670625756076103,
"grad_norm": 0.2609579265117645,
"learning_rate": 8e-05,
"loss": 1.4612,
"step": 667
},
{
"epoch": 0.14692620697239636,
"grad_norm": 0.2958548367023468,
"learning_rate": 8e-05,
"loss": 1.6191,
"step": 668
},
{
"epoch": 0.14714615638403167,
"grad_norm": 0.2585492730140686,
"learning_rate": 8e-05,
"loss": 1.7161,
"step": 669
},
{
"epoch": 0.147366105795667,
"grad_norm": 0.2637808322906494,
"learning_rate": 8e-05,
"loss": 1.6534,
"step": 670
},
{
"epoch": 0.1475860552073023,
"grad_norm": 0.2885671854019165,
"learning_rate": 8e-05,
"loss": 1.7663,
"step": 671
},
{
"epoch": 0.14780600461893764,
"grad_norm": 0.27028244733810425,
"learning_rate": 8e-05,
"loss": 1.7718,
"step": 672
},
{
"epoch": 0.14802595403057298,
"grad_norm": 0.27723586559295654,
"learning_rate": 8e-05,
"loss": 1.7762,
"step": 673
},
{
"epoch": 0.14824590344220828,
"grad_norm": 0.26336848735809326,
"learning_rate": 8e-05,
"loss": 1.6114,
"step": 674
},
{
"epoch": 0.14846585285384362,
"grad_norm": 0.26031750440597534,
"learning_rate": 8e-05,
"loss": 1.7259,
"step": 675
},
{
"epoch": 0.14868580226547895,
"grad_norm": 0.30176040530204773,
"learning_rate": 8e-05,
"loss": 1.7007,
"step": 676
},
{
"epoch": 0.14890575167711426,
"grad_norm": 0.25952771306037903,
"learning_rate": 8e-05,
"loss": 1.6573,
"step": 677
},
{
"epoch": 0.1491257010887496,
"grad_norm": 0.2727009356021881,
"learning_rate": 8e-05,
"loss": 1.7725,
"step": 678
},
{
"epoch": 0.14934565050038492,
"grad_norm": 0.26398420333862305,
"learning_rate": 8e-05,
"loss": 1.7245,
"step": 679
},
{
"epoch": 0.14956559991202023,
"grad_norm": 0.273967981338501,
"learning_rate": 8e-05,
"loss": 1.7231,
"step": 680
},
{
"epoch": 0.14978554932365556,
"grad_norm": 0.27241724729537964,
"learning_rate": 8e-05,
"loss": 1.6896,
"step": 681
},
{
"epoch": 0.15000549873529087,
"grad_norm": 0.26996085047721863,
"learning_rate": 8e-05,
"loss": 1.6767,
"step": 682
},
{
"epoch": 0.1502254481469262,
"grad_norm": 0.27165672183036804,
"learning_rate": 8e-05,
"loss": 1.7747,
"step": 683
},
{
"epoch": 0.15044539755856154,
"grad_norm": 0.26840028166770935,
"learning_rate": 8e-05,
"loss": 1.7616,
"step": 684
},
{
"epoch": 0.15066534697019685,
"grad_norm": 0.27101555466651917,
"learning_rate": 8e-05,
"loss": 1.622,
"step": 685
},
{
"epoch": 0.15088529638183218,
"grad_norm": 0.2691043019294739,
"learning_rate": 8e-05,
"loss": 1.7514,
"step": 686
},
{
"epoch": 0.1511052457934675,
"grad_norm": 0.2926357090473175,
"learning_rate": 8e-05,
"loss": 1.6953,
"step": 687
},
{
"epoch": 0.15132519520510282,
"grad_norm": 0.2730226516723633,
"learning_rate": 8e-05,
"loss": 1.6286,
"step": 688
},
{
"epoch": 0.15154514461673815,
"grad_norm": 0.2618841826915741,
"learning_rate": 8e-05,
"loss": 1.7194,
"step": 689
},
{
"epoch": 0.1517650940283735,
"grad_norm": 0.2584119737148285,
"learning_rate": 8e-05,
"loss": 1.6032,
"step": 690
},
{
"epoch": 0.1519850434400088,
"grad_norm": 0.26063093543052673,
"learning_rate": 8e-05,
"loss": 1.63,
"step": 691
},
{
"epoch": 0.15220499285164413,
"grad_norm": 0.267938494682312,
"learning_rate": 8e-05,
"loss": 1.7087,
"step": 692
},
{
"epoch": 0.15242494226327943,
"grad_norm": 0.2709169089794159,
"learning_rate": 8e-05,
"loss": 1.6663,
"step": 693
},
{
"epoch": 0.15264489167491477,
"grad_norm": 0.3015836775302887,
"learning_rate": 8e-05,
"loss": 1.6797,
"step": 694
},
{
"epoch": 0.1528648410865501,
"grad_norm": 0.27824944257736206,
"learning_rate": 8e-05,
"loss": 1.7972,
"step": 695
},
{
"epoch": 0.1530847904981854,
"grad_norm": 0.31089073419570923,
"learning_rate": 8e-05,
"loss": 1.7352,
"step": 696
},
{
"epoch": 0.15330473990982074,
"grad_norm": 0.2804546654224396,
"learning_rate": 8e-05,
"loss": 1.6898,
"step": 697
},
{
"epoch": 0.15352468932145608,
"grad_norm": 0.2804514765739441,
"learning_rate": 8e-05,
"loss": 1.8409,
"step": 698
},
{
"epoch": 0.15374463873309138,
"grad_norm": 0.31666815280914307,
"learning_rate": 8e-05,
"loss": 1.6569,
"step": 699
},
{
"epoch": 0.15396458814472672,
"grad_norm": 0.2846215069293976,
"learning_rate": 8e-05,
"loss": 1.8081,
"step": 700
},
{
"epoch": 0.15418453755636205,
"grad_norm": 0.2656068801879883,
"learning_rate": 8e-05,
"loss": 1.5747,
"step": 701
},
{
"epoch": 0.15440448696799736,
"grad_norm": 0.2633317708969116,
"learning_rate": 8e-05,
"loss": 1.6027,
"step": 702
},
{
"epoch": 0.1546244363796327,
"grad_norm": 0.2669740319252014,
"learning_rate": 8e-05,
"loss": 1.6964,
"step": 703
},
{
"epoch": 0.154844385791268,
"grad_norm": 0.2878497540950775,
"learning_rate": 8e-05,
"loss": 1.677,
"step": 704
},
{
"epoch": 0.15506433520290333,
"grad_norm": 0.2624325156211853,
"learning_rate": 8e-05,
"loss": 1.6247,
"step": 705
},
{
"epoch": 0.15528428461453866,
"grad_norm": 0.2894291579723358,
"learning_rate": 8e-05,
"loss": 1.7271,
"step": 706
},
{
"epoch": 0.15550423402617397,
"grad_norm": 0.2924456298351288,
"learning_rate": 8e-05,
"loss": 1.7475,
"step": 707
},
{
"epoch": 0.1557241834378093,
"grad_norm": 0.2519112229347229,
"learning_rate": 8e-05,
"loss": 1.6306,
"step": 708
},
{
"epoch": 0.15594413284944464,
"grad_norm": 0.2831405699253082,
"learning_rate": 8e-05,
"loss": 1.7571,
"step": 709
},
{
"epoch": 0.15616408226107995,
"grad_norm": 0.2804257273674011,
"learning_rate": 8e-05,
"loss": 1.6721,
"step": 710
},
{
"epoch": 0.15638403167271528,
"grad_norm": 0.27130362391471863,
"learning_rate": 8e-05,
"loss": 1.7451,
"step": 711
},
{
"epoch": 0.1566039810843506,
"grad_norm": 0.27843937277793884,
"learning_rate": 8e-05,
"loss": 1.7187,
"step": 712
},
{
"epoch": 0.15682393049598592,
"grad_norm": 0.26205387711524963,
"learning_rate": 8e-05,
"loss": 1.7667,
"step": 713
},
{
"epoch": 0.15704387990762125,
"grad_norm": 0.25978967547416687,
"learning_rate": 8e-05,
"loss": 1.6595,
"step": 714
},
{
"epoch": 0.15726382931925656,
"grad_norm": 0.26331478357315063,
"learning_rate": 8e-05,
"loss": 1.8067,
"step": 715
},
{
"epoch": 0.1574837787308919,
"grad_norm": 0.26023924350738525,
"learning_rate": 8e-05,
"loss": 1.8533,
"step": 716
},
{
"epoch": 0.15770372814252723,
"grad_norm": 0.27147844433784485,
"learning_rate": 8e-05,
"loss": 1.6309,
"step": 717
},
{
"epoch": 0.15792367755416253,
"grad_norm": 0.286035418510437,
"learning_rate": 8e-05,
"loss": 1.72,
"step": 718
},
{
"epoch": 0.15814362696579787,
"grad_norm": 0.3167229890823364,
"learning_rate": 8e-05,
"loss": 1.9007,
"step": 719
},
{
"epoch": 0.1583635763774332,
"grad_norm": 0.283975750207901,
"learning_rate": 8e-05,
"loss": 1.6662,
"step": 720
},
{
"epoch": 0.1585835257890685,
"grad_norm": 0.2812137007713318,
"learning_rate": 8e-05,
"loss": 1.7651,
"step": 721
},
{
"epoch": 0.15880347520070384,
"grad_norm": 0.2737642526626587,
"learning_rate": 8e-05,
"loss": 1.7679,
"step": 722
},
{
"epoch": 0.15902342461233915,
"grad_norm": 0.30812978744506836,
"learning_rate": 8e-05,
"loss": 1.8408,
"step": 723
},
{
"epoch": 0.15924337402397448,
"grad_norm": 0.27026352286338806,
"learning_rate": 8e-05,
"loss": 1.7362,
"step": 724
},
{
"epoch": 0.15946332343560982,
"grad_norm": 0.2788861393928528,
"learning_rate": 8e-05,
"loss": 1.8371,
"step": 725
},
{
"epoch": 0.15968327284724512,
"grad_norm": 0.2623996138572693,
"learning_rate": 8e-05,
"loss": 1.5855,
"step": 726
},
{
"epoch": 0.15990322225888046,
"grad_norm": 0.2764820158481598,
"learning_rate": 8e-05,
"loss": 1.8185,
"step": 727
},
{
"epoch": 0.1601231716705158,
"grad_norm": 0.27394816279411316,
"learning_rate": 8e-05,
"loss": 1.641,
"step": 728
},
{
"epoch": 0.1603431210821511,
"grad_norm": 0.2726307511329651,
"learning_rate": 8e-05,
"loss": 1.6128,
"step": 729
},
{
"epoch": 0.16056307049378643,
"grad_norm": 0.28221258521080017,
"learning_rate": 8e-05,
"loss": 1.8413,
"step": 730
},
{
"epoch": 0.16078301990542176,
"grad_norm": 0.2649543881416321,
"learning_rate": 8e-05,
"loss": 1.5707,
"step": 731
},
{
"epoch": 0.16100296931705707,
"grad_norm": 0.2659435570240021,
"learning_rate": 8e-05,
"loss": 1.6761,
"step": 732
},
{
"epoch": 0.1612229187286924,
"grad_norm": 0.3131570518016815,
"learning_rate": 8e-05,
"loss": 1.9439,
"step": 733
},
{
"epoch": 0.1614428681403277,
"grad_norm": 0.263069748878479,
"learning_rate": 8e-05,
"loss": 1.7069,
"step": 734
},
{
"epoch": 0.16166281755196305,
"grad_norm": 0.2708505392074585,
"learning_rate": 8e-05,
"loss": 1.8031,
"step": 735
},
{
"epoch": 0.16188276696359838,
"grad_norm": 0.26446613669395447,
"learning_rate": 8e-05,
"loss": 1.6419,
"step": 736
},
{
"epoch": 0.16210271637523369,
"grad_norm": 0.27720367908477783,
"learning_rate": 8e-05,
"loss": 1.8291,
"step": 737
},
{
"epoch": 0.16232266578686902,
"grad_norm": 0.25950226187705994,
"learning_rate": 8e-05,
"loss": 1.7498,
"step": 738
},
{
"epoch": 0.16254261519850435,
"grad_norm": 0.25445327162742615,
"learning_rate": 8e-05,
"loss": 1.6804,
"step": 739
},
{
"epoch": 0.16276256461013966,
"grad_norm": 0.2868766784667969,
"learning_rate": 8e-05,
"loss": 1.8058,
"step": 740
},
{
"epoch": 0.162982514021775,
"grad_norm": 0.2775559425354004,
"learning_rate": 8e-05,
"loss": 1.7971,
"step": 741
},
{
"epoch": 0.16320246343341033,
"grad_norm": 0.2822381556034088,
"learning_rate": 8e-05,
"loss": 1.7294,
"step": 742
},
{
"epoch": 0.16342241284504563,
"grad_norm": 0.26617857813835144,
"learning_rate": 8e-05,
"loss": 1.8011,
"step": 743
},
{
"epoch": 0.16364236225668097,
"grad_norm": 0.25615090131759644,
"learning_rate": 8e-05,
"loss": 1.6328,
"step": 744
},
{
"epoch": 0.16386231166831627,
"grad_norm": 0.25831338763237,
"learning_rate": 8e-05,
"loss": 1.6174,
"step": 745
},
{
"epoch": 0.1640822610799516,
"grad_norm": 0.2707291543483734,
"learning_rate": 8e-05,
"loss": 1.8217,
"step": 746
},
{
"epoch": 0.16430221049158694,
"grad_norm": 0.3028862774372101,
"learning_rate": 8e-05,
"loss": 1.5852,
"step": 747
},
{
"epoch": 0.16452215990322225,
"grad_norm": 0.26598575711250305,
"learning_rate": 8e-05,
"loss": 1.7213,
"step": 748
},
{
"epoch": 0.16474210931485758,
"grad_norm": 0.27408871054649353,
"learning_rate": 8e-05,
"loss": 1.7109,
"step": 749
},
{
"epoch": 0.16496205872649292,
"grad_norm": 0.27065837383270264,
"learning_rate": 8e-05,
"loss": 1.6696,
"step": 750
},
{
"epoch": 0.16518200813812822,
"grad_norm": 0.2721879184246063,
"learning_rate": 8e-05,
"loss": 1.7055,
"step": 751
},
{
"epoch": 0.16540195754976356,
"grad_norm": 0.29569125175476074,
"learning_rate": 8e-05,
"loss": 1.5921,
"step": 752
},
{
"epoch": 0.1656219069613989,
"grad_norm": 0.28580978512763977,
"learning_rate": 8e-05,
"loss": 1.7518,
"step": 753
},
{
"epoch": 0.1658418563730342,
"grad_norm": 0.2869469225406647,
"learning_rate": 8e-05,
"loss": 1.8164,
"step": 754
},
{
"epoch": 0.16606180578466953,
"grad_norm": 0.2796071171760559,
"learning_rate": 8e-05,
"loss": 1.8325,
"step": 755
},
{
"epoch": 0.16628175519630484,
"grad_norm": 0.27365031838417053,
"learning_rate": 8e-05,
"loss": 1.7287,
"step": 756
},
{
"epoch": 0.16650170460794017,
"grad_norm": 0.2524491846561432,
"learning_rate": 8e-05,
"loss": 1.5379,
"step": 757
},
{
"epoch": 0.1667216540195755,
"grad_norm": 0.259860634803772,
"learning_rate": 8e-05,
"loss": 1.5204,
"step": 758
},
{
"epoch": 0.1669416034312108,
"grad_norm": 0.2714100182056427,
"learning_rate": 8e-05,
"loss": 1.7245,
"step": 759
},
{
"epoch": 0.16716155284284615,
"grad_norm": 0.2729417383670807,
"learning_rate": 8e-05,
"loss": 1.6889,
"step": 760
},
{
"epoch": 0.16738150225448148,
"grad_norm": 0.2753896415233612,
"learning_rate": 8e-05,
"loss": 1.7345,
"step": 761
},
{
"epoch": 0.16760145166611679,
"grad_norm": 0.2830727994441986,
"learning_rate": 8e-05,
"loss": 1.6884,
"step": 762
},
{
"epoch": 0.16782140107775212,
"grad_norm": 0.27818116545677185,
"learning_rate": 8e-05,
"loss": 1.7819,
"step": 763
},
{
"epoch": 0.16804135048938745,
"grad_norm": 0.2601570785045624,
"learning_rate": 8e-05,
"loss": 1.6323,
"step": 764
},
{
"epoch": 0.16826129990102276,
"grad_norm": 0.2638706564903259,
"learning_rate": 8e-05,
"loss": 1.5957,
"step": 765
},
{
"epoch": 0.1684812493126581,
"grad_norm": 0.2798631489276886,
"learning_rate": 8e-05,
"loss": 1.7946,
"step": 766
},
{
"epoch": 0.1687011987242934,
"grad_norm": 0.2975100874900818,
"learning_rate": 8e-05,
"loss": 1.871,
"step": 767
},
{
"epoch": 0.16892114813592873,
"grad_norm": 0.28308364748954773,
"learning_rate": 8e-05,
"loss": 1.7184,
"step": 768
},
{
"epoch": 0.16914109754756407,
"grad_norm": 0.2594911456108093,
"learning_rate": 8e-05,
"loss": 1.5867,
"step": 769
},
{
"epoch": 0.16936104695919937,
"grad_norm": 0.27594470977783203,
"learning_rate": 8e-05,
"loss": 1.7722,
"step": 770
},
{
"epoch": 0.1695809963708347,
"grad_norm": 0.2783298194408417,
"learning_rate": 8e-05,
"loss": 1.7891,
"step": 771
},
{
"epoch": 0.16980094578247004,
"grad_norm": 0.2863733172416687,
"learning_rate": 8e-05,
"loss": 1.6274,
"step": 772
},
{
"epoch": 0.17002089519410535,
"grad_norm": 0.27953147888183594,
"learning_rate": 8e-05,
"loss": 1.7287,
"step": 773
},
{
"epoch": 0.17024084460574068,
"grad_norm": 0.2736772894859314,
"learning_rate": 8e-05,
"loss": 1.6802,
"step": 774
},
{
"epoch": 0.17046079401737602,
"grad_norm": 0.27663713693618774,
"learning_rate": 8e-05,
"loss": 1.6607,
"step": 775
},
{
"epoch": 0.17068074342901132,
"grad_norm": 0.3064086437225342,
"learning_rate": 8e-05,
"loss": 1.8244,
"step": 776
},
{
"epoch": 0.17090069284064666,
"grad_norm": 0.29848581552505493,
"learning_rate": 8e-05,
"loss": 1.7702,
"step": 777
},
{
"epoch": 0.17112064225228196,
"grad_norm": 0.3101220726966858,
"learning_rate": 8e-05,
"loss": 1.7714,
"step": 778
},
{
"epoch": 0.1713405916639173,
"grad_norm": 0.2754581868648529,
"learning_rate": 8e-05,
"loss": 1.6367,
"step": 779
},
{
"epoch": 0.17156054107555263,
"grad_norm": 0.2706362307071686,
"learning_rate": 8e-05,
"loss": 1.6236,
"step": 780
},
{
"epoch": 0.17178049048718794,
"grad_norm": 0.29135438799858093,
"learning_rate": 8e-05,
"loss": 1.8478,
"step": 781
},
{
"epoch": 0.17200043989882327,
"grad_norm": 0.2751868963241577,
"learning_rate": 8e-05,
"loss": 1.751,
"step": 782
},
{
"epoch": 0.1722203893104586,
"grad_norm": 0.2871004045009613,
"learning_rate": 8e-05,
"loss": 1.6793,
"step": 783
},
{
"epoch": 0.1724403387220939,
"grad_norm": 0.31024861335754395,
"learning_rate": 8e-05,
"loss": 1.7419,
"step": 784
},
{
"epoch": 0.17266028813372924,
"grad_norm": 0.2917722165584564,
"learning_rate": 8e-05,
"loss": 1.8913,
"step": 785
},
{
"epoch": 0.17288023754536458,
"grad_norm": 0.25443291664123535,
"learning_rate": 8e-05,
"loss": 1.6991,
"step": 786
},
{
"epoch": 0.17310018695699989,
"grad_norm": 0.2827921211719513,
"learning_rate": 8e-05,
"loss": 1.8408,
"step": 787
},
{
"epoch": 0.17332013636863522,
"grad_norm": 0.26190435886383057,
"learning_rate": 8e-05,
"loss": 1.6841,
"step": 788
},
{
"epoch": 0.17354008578027053,
"grad_norm": 0.31557098031044006,
"learning_rate": 8e-05,
"loss": 1.8838,
"step": 789
},
{
"epoch": 0.17376003519190586,
"grad_norm": 0.27622002363204956,
"learning_rate": 8e-05,
"loss": 1.6558,
"step": 790
},
{
"epoch": 0.1739799846035412,
"grad_norm": 0.3161294758319855,
"learning_rate": 8e-05,
"loss": 1.5771,
"step": 791
},
{
"epoch": 0.1741999340151765,
"grad_norm": 0.3014603555202484,
"learning_rate": 8e-05,
"loss": 1.7742,
"step": 792
},
{
"epoch": 0.17441988342681183,
"grad_norm": 0.24996457993984222,
"learning_rate": 8e-05,
"loss": 1.5667,
"step": 793
},
{
"epoch": 0.17463983283844717,
"grad_norm": 0.29180648922920227,
"learning_rate": 8e-05,
"loss": 1.7703,
"step": 794
},
{
"epoch": 0.17485978225008247,
"grad_norm": 0.26707547903060913,
"learning_rate": 8e-05,
"loss": 1.7964,
"step": 795
},
{
"epoch": 0.1750797316617178,
"grad_norm": 0.24924349784851074,
"learning_rate": 8e-05,
"loss": 1.6619,
"step": 796
},
{
"epoch": 0.17529968107335314,
"grad_norm": 0.29872292280197144,
"learning_rate": 8e-05,
"loss": 1.8561,
"step": 797
},
{
"epoch": 0.17551963048498845,
"grad_norm": 0.2770175337791443,
"learning_rate": 8e-05,
"loss": 1.6352,
"step": 798
},
{
"epoch": 0.17573957989662378,
"grad_norm": 0.26890453696250916,
"learning_rate": 8e-05,
"loss": 1.885,
"step": 799
},
{
"epoch": 0.1759595293082591,
"grad_norm": 0.2830483317375183,
"learning_rate": 8e-05,
"loss": 1.6029,
"step": 800
},
{
"epoch": 0.17617947871989442,
"grad_norm": 0.27421921491622925,
"learning_rate": 8e-05,
"loss": 1.6845,
"step": 801
},
{
"epoch": 0.17639942813152976,
"grad_norm": 0.29273220896720886,
"learning_rate": 8e-05,
"loss": 1.8135,
"step": 802
},
{
"epoch": 0.17661937754316506,
"grad_norm": 0.2675575315952301,
"learning_rate": 8e-05,
"loss": 1.571,
"step": 803
},
{
"epoch": 0.1768393269548004,
"grad_norm": 0.2821138799190521,
"learning_rate": 8e-05,
"loss": 1.9244,
"step": 804
},
{
"epoch": 0.17705927636643573,
"grad_norm": 0.28082311153411865,
"learning_rate": 8e-05,
"loss": 1.7395,
"step": 805
},
{
"epoch": 0.17727922577807104,
"grad_norm": 0.27897313237190247,
"learning_rate": 8e-05,
"loss": 1.6347,
"step": 806
},
{
"epoch": 0.17749917518970637,
"grad_norm": 0.27358707785606384,
"learning_rate": 8e-05,
"loss": 1.7643,
"step": 807
},
{
"epoch": 0.1777191246013417,
"grad_norm": 0.284059077501297,
"learning_rate": 8e-05,
"loss": 1.5789,
"step": 808
},
{
"epoch": 0.177939074012977,
"grad_norm": 0.26125824451446533,
"learning_rate": 8e-05,
"loss": 1.7029,
"step": 809
},
{
"epoch": 0.17815902342461234,
"grad_norm": 0.26438888907432556,
"learning_rate": 8e-05,
"loss": 1.6424,
"step": 810
},
{
"epoch": 0.17837897283624765,
"grad_norm": 0.2746163010597229,
"learning_rate": 8e-05,
"loss": 1.7992,
"step": 811
},
{
"epoch": 0.17859892224788299,
"grad_norm": 0.27717527747154236,
"learning_rate": 8e-05,
"loss": 1.7603,
"step": 812
},
{
"epoch": 0.17881887165951832,
"grad_norm": 0.28336596488952637,
"learning_rate": 8e-05,
"loss": 1.7133,
"step": 813
},
{
"epoch": 0.17903882107115363,
"grad_norm": 0.2701306939125061,
"learning_rate": 8e-05,
"loss": 1.7724,
"step": 814
},
{
"epoch": 0.17925877048278896,
"grad_norm": 0.2807336449623108,
"learning_rate": 8e-05,
"loss": 1.7832,
"step": 815
},
{
"epoch": 0.1794787198944243,
"grad_norm": 0.2847912907600403,
"learning_rate": 8e-05,
"loss": 1.7042,
"step": 816
},
{
"epoch": 0.1796986693060596,
"grad_norm": 0.2836345434188843,
"learning_rate": 8e-05,
"loss": 1.8506,
"step": 817
},
{
"epoch": 0.17991861871769493,
"grad_norm": 0.30620551109313965,
"learning_rate": 8e-05,
"loss": 1.7695,
"step": 818
},
{
"epoch": 0.18013856812933027,
"grad_norm": 0.2698993980884552,
"learning_rate": 8e-05,
"loss": 1.6388,
"step": 819
},
{
"epoch": 0.18035851754096557,
"grad_norm": 0.2937266528606415,
"learning_rate": 8e-05,
"loss": 1.8648,
"step": 820
},
{
"epoch": 0.1805784669526009,
"grad_norm": 0.2661988139152527,
"learning_rate": 8e-05,
"loss": 1.7563,
"step": 821
},
{
"epoch": 0.18079841636423621,
"grad_norm": 0.2944018840789795,
"learning_rate": 8e-05,
"loss": 1.882,
"step": 822
},
{
"epoch": 0.18101836577587155,
"grad_norm": 0.2774435579776764,
"learning_rate": 8e-05,
"loss": 1.8117,
"step": 823
},
{
"epoch": 0.18123831518750688,
"grad_norm": 0.27865204215049744,
"learning_rate": 8e-05,
"loss": 1.8815,
"step": 824
},
{
"epoch": 0.1814582645991422,
"grad_norm": 0.26444011926651,
"learning_rate": 8e-05,
"loss": 1.5844,
"step": 825
},
{
"epoch": 0.18167821401077752,
"grad_norm": 0.27044716477394104,
"learning_rate": 8e-05,
"loss": 1.7403,
"step": 826
},
{
"epoch": 0.18189816342241286,
"grad_norm": 0.28727805614471436,
"learning_rate": 8e-05,
"loss": 1.8556,
"step": 827
},
{
"epoch": 0.18211811283404816,
"grad_norm": 0.26131972670555115,
"learning_rate": 8e-05,
"loss": 1.7727,
"step": 828
},
{
"epoch": 0.1823380622456835,
"grad_norm": 0.269638329744339,
"learning_rate": 8e-05,
"loss": 1.6795,
"step": 829
},
{
"epoch": 0.18255801165731883,
"grad_norm": 0.2671653628349304,
"learning_rate": 8e-05,
"loss": 1.6811,
"step": 830
},
{
"epoch": 0.18277796106895414,
"grad_norm": 0.2659014165401459,
"learning_rate": 8e-05,
"loss": 1.7166,
"step": 831
},
{
"epoch": 0.18299791048058947,
"grad_norm": 0.2719801962375641,
"learning_rate": 8e-05,
"loss": 1.6938,
"step": 832
},
{
"epoch": 0.18321785989222478,
"grad_norm": 0.3272366225719452,
"learning_rate": 8e-05,
"loss": 1.8213,
"step": 833
},
{
"epoch": 0.1834378093038601,
"grad_norm": 0.2635113000869751,
"learning_rate": 8e-05,
"loss": 1.6291,
"step": 834
},
{
"epoch": 0.18365775871549544,
"grad_norm": 0.29401281476020813,
"learning_rate": 8e-05,
"loss": 1.8234,
"step": 835
},
{
"epoch": 0.18387770812713075,
"grad_norm": 0.29188451170921326,
"learning_rate": 8e-05,
"loss": 1.7359,
"step": 836
},
{
"epoch": 0.18409765753876609,
"grad_norm": 0.2688080072402954,
"learning_rate": 8e-05,
"loss": 1.7088,
"step": 837
},
{
"epoch": 0.18431760695040142,
"grad_norm": 0.27907344698905945,
"learning_rate": 8e-05,
"loss": 1.6762,
"step": 838
},
{
"epoch": 0.18453755636203673,
"grad_norm": 0.2875908315181732,
"learning_rate": 8e-05,
"loss": 1.7612,
"step": 839
},
{
"epoch": 0.18475750577367206,
"grad_norm": 0.2683177888393402,
"learning_rate": 8e-05,
"loss": 1.5965,
"step": 840
},
{
"epoch": 0.18497745518530737,
"grad_norm": 0.29948660731315613,
"learning_rate": 8e-05,
"loss": 1.7358,
"step": 841
},
{
"epoch": 0.1851974045969427,
"grad_norm": 0.28153204917907715,
"learning_rate": 8e-05,
"loss": 1.8089,
"step": 842
},
{
"epoch": 0.18541735400857803,
"grad_norm": 0.29185283184051514,
"learning_rate": 8e-05,
"loss": 1.8357,
"step": 843
},
{
"epoch": 0.18563730342021334,
"grad_norm": 0.27565860748291016,
"learning_rate": 8e-05,
"loss": 1.9565,
"step": 844
},
{
"epoch": 0.18585725283184867,
"grad_norm": 0.2811479866504669,
"learning_rate": 8e-05,
"loss": 1.8493,
"step": 845
},
{
"epoch": 0.186077202243484,
"grad_norm": 0.271893173456192,
"learning_rate": 8e-05,
"loss": 1.7622,
"step": 846
},
{
"epoch": 0.18629715165511931,
"grad_norm": 0.26383113861083984,
"learning_rate": 8e-05,
"loss": 1.7392,
"step": 847
},
{
"epoch": 0.18651710106675465,
"grad_norm": 0.2863881289958954,
"learning_rate": 8e-05,
"loss": 1.7367,
"step": 848
},
{
"epoch": 0.18673705047838998,
"grad_norm": 0.28036433458328247,
"learning_rate": 8e-05,
"loss": 1.6587,
"step": 849
},
{
"epoch": 0.1869569998900253,
"grad_norm": 0.2938581705093384,
"learning_rate": 8e-05,
"loss": 1.7411,
"step": 850
},
{
"epoch": 0.18717694930166062,
"grad_norm": 0.27487799525260925,
"learning_rate": 8e-05,
"loss": 1.8054,
"step": 851
},
{
"epoch": 0.18739689871329593,
"grad_norm": 0.2693670690059662,
"learning_rate": 8e-05,
"loss": 1.7361,
"step": 852
},
{
"epoch": 0.18761684812493126,
"grad_norm": 0.2999705970287323,
"learning_rate": 8e-05,
"loss": 1.909,
"step": 853
},
{
"epoch": 0.1878367975365666,
"grad_norm": 0.28235265612602234,
"learning_rate": 8e-05,
"loss": 1.8611,
"step": 854
},
{
"epoch": 0.1880567469482019,
"grad_norm": 0.28417298197746277,
"learning_rate": 8e-05,
"loss": 1.683,
"step": 855
},
{
"epoch": 0.18827669635983724,
"grad_norm": 0.2697356045246124,
"learning_rate": 8e-05,
"loss": 1.7138,
"step": 856
},
{
"epoch": 0.18849664577147257,
"grad_norm": 0.26900357007980347,
"learning_rate": 8e-05,
"loss": 1.5579,
"step": 857
},
{
"epoch": 0.18871659518310788,
"grad_norm": 0.259941041469574,
"learning_rate": 8e-05,
"loss": 1.7106,
"step": 858
},
{
"epoch": 0.1889365445947432,
"grad_norm": 0.26958781480789185,
"learning_rate": 8e-05,
"loss": 1.6454,
"step": 859
},
{
"epoch": 0.18915649400637854,
"grad_norm": 0.26425305008888245,
"learning_rate": 8e-05,
"loss": 1.6408,
"step": 860
},
{
"epoch": 0.18937644341801385,
"grad_norm": 0.26996907591819763,
"learning_rate": 8e-05,
"loss": 1.6949,
"step": 861
},
{
"epoch": 0.18959639282964919,
"grad_norm": 0.25882837176322937,
"learning_rate": 8e-05,
"loss": 1.6142,
"step": 862
},
{
"epoch": 0.1898163422412845,
"grad_norm": 0.28000783920288086,
"learning_rate": 8e-05,
"loss": 1.8007,
"step": 863
},
{
"epoch": 0.19003629165291983,
"grad_norm": 0.2744222581386566,
"learning_rate": 8e-05,
"loss": 1.6604,
"step": 864
},
{
"epoch": 0.19025624106455516,
"grad_norm": 0.2791576683521271,
"learning_rate": 8e-05,
"loss": 1.7061,
"step": 865
},
{
"epoch": 0.19047619047619047,
"grad_norm": 0.27878084778785706,
"learning_rate": 8e-05,
"loss": 1.8604,
"step": 866
},
{
"epoch": 0.1906961398878258,
"grad_norm": 0.3818608820438385,
"learning_rate": 8e-05,
"loss": 1.8616,
"step": 867
},
{
"epoch": 0.19091608929946113,
"grad_norm": 0.27952665090560913,
"learning_rate": 8e-05,
"loss": 1.7616,
"step": 868
},
{
"epoch": 0.19113603871109644,
"grad_norm": 0.2711832523345947,
"learning_rate": 8e-05,
"loss": 1.7974,
"step": 869
},
{
"epoch": 0.19135598812273177,
"grad_norm": 0.2572176456451416,
"learning_rate": 8e-05,
"loss": 1.584,
"step": 870
},
{
"epoch": 0.1915759375343671,
"grad_norm": 0.2847760319709778,
"learning_rate": 8e-05,
"loss": 1.8598,
"step": 871
},
{
"epoch": 0.19179588694600241,
"grad_norm": 0.29798731207847595,
"learning_rate": 8e-05,
"loss": 1.6689,
"step": 872
},
{
"epoch": 0.19201583635763775,
"grad_norm": 0.2674097716808319,
"learning_rate": 8e-05,
"loss": 1.6694,
"step": 873
},
{
"epoch": 0.19223578576927305,
"grad_norm": 0.27707335352897644,
"learning_rate": 8e-05,
"loss": 1.7251,
"step": 874
},
{
"epoch": 0.1924557351809084,
"grad_norm": 0.2801666259765625,
"learning_rate": 8e-05,
"loss": 1.7251,
"step": 875
},
{
"epoch": 0.19267568459254372,
"grad_norm": 0.2656191885471344,
"learning_rate": 8e-05,
"loss": 1.6232,
"step": 876
},
{
"epoch": 0.19289563400417903,
"grad_norm": 0.2588733732700348,
"learning_rate": 8e-05,
"loss": 1.7595,
"step": 877
},
{
"epoch": 0.19311558341581436,
"grad_norm": 0.2999958097934723,
"learning_rate": 8e-05,
"loss": 1.9095,
"step": 878
},
{
"epoch": 0.1933355328274497,
"grad_norm": 0.27143120765686035,
"learning_rate": 8e-05,
"loss": 1.6698,
"step": 879
},
{
"epoch": 0.193555482239085,
"grad_norm": 0.29155731201171875,
"learning_rate": 8e-05,
"loss": 1.6437,
"step": 880
},
{
"epoch": 0.19377543165072034,
"grad_norm": 0.26307716965675354,
"learning_rate": 8e-05,
"loss": 1.6161,
"step": 881
},
{
"epoch": 0.19399538106235567,
"grad_norm": 0.27041196823120117,
"learning_rate": 8e-05,
"loss": 1.5374,
"step": 882
},
{
"epoch": 0.19421533047399098,
"grad_norm": 0.2752692699432373,
"learning_rate": 8e-05,
"loss": 1.6543,
"step": 883
},
{
"epoch": 0.1944352798856263,
"grad_norm": 0.2883388102054596,
"learning_rate": 8e-05,
"loss": 1.7503,
"step": 884
},
{
"epoch": 0.19465522929726162,
"grad_norm": 0.27332282066345215,
"learning_rate": 8e-05,
"loss": 1.8456,
"step": 885
},
{
"epoch": 0.19487517870889695,
"grad_norm": 0.26226627826690674,
"learning_rate": 8e-05,
"loss": 1.2577,
"step": 886
},
{
"epoch": 0.19509512812053229,
"grad_norm": 0.2709749639034271,
"learning_rate": 8e-05,
"loss": 1.7854,
"step": 887
},
{
"epoch": 0.1953150775321676,
"grad_norm": 0.28380879759788513,
"learning_rate": 8e-05,
"loss": 1.7151,
"step": 888
},
{
"epoch": 0.19553502694380293,
"grad_norm": 0.2702254354953766,
"learning_rate": 8e-05,
"loss": 1.6779,
"step": 889
},
{
"epoch": 0.19575497635543826,
"grad_norm": 0.2620486617088318,
"learning_rate": 8e-05,
"loss": 1.7166,
"step": 890
},
{
"epoch": 0.19597492576707357,
"grad_norm": 0.27195873856544495,
"learning_rate": 8e-05,
"loss": 1.7263,
"step": 891
},
{
"epoch": 0.1961948751787089,
"grad_norm": 0.2719867527484894,
"learning_rate": 8e-05,
"loss": 1.6982,
"step": 892
},
{
"epoch": 0.19641482459034423,
"grad_norm": 0.27889111638069153,
"learning_rate": 8e-05,
"loss": 1.7726,
"step": 893
},
{
"epoch": 0.19663477400197954,
"grad_norm": 0.2745397686958313,
"learning_rate": 8e-05,
"loss": 1.7737,
"step": 894
},
{
"epoch": 0.19685472341361487,
"grad_norm": 0.2698670029640198,
"learning_rate": 8e-05,
"loss": 1.7149,
"step": 895
},
{
"epoch": 0.19707467282525018,
"grad_norm": 0.27113667130470276,
"learning_rate": 8e-05,
"loss": 1.7887,
"step": 896
},
{
"epoch": 0.19729462223688551,
"grad_norm": 0.2772979140281677,
"learning_rate": 8e-05,
"loss": 1.8163,
"step": 897
},
{
"epoch": 0.19751457164852085,
"grad_norm": 0.2757657766342163,
"learning_rate": 8e-05,
"loss": 1.636,
"step": 898
},
{
"epoch": 0.19773452106015615,
"grad_norm": 0.26945242285728455,
"learning_rate": 8e-05,
"loss": 1.7639,
"step": 899
},
{
"epoch": 0.1979544704717915,
"grad_norm": 0.27328991889953613,
"learning_rate": 8e-05,
"loss": 1.8421,
"step": 900
},
{
"epoch": 0.19817441988342682,
"grad_norm": 0.2721468210220337,
"learning_rate": 8e-05,
"loss": 1.6926,
"step": 901
},
{
"epoch": 0.19839436929506213,
"grad_norm": 0.2633766233921051,
"learning_rate": 8e-05,
"loss": 1.7629,
"step": 902
},
{
"epoch": 0.19861431870669746,
"grad_norm": 0.26183879375457764,
"learning_rate": 8e-05,
"loss": 1.7149,
"step": 903
},
{
"epoch": 0.1988342681183328,
"grad_norm": 0.2837960422039032,
"learning_rate": 8e-05,
"loss": 1.7923,
"step": 904
},
{
"epoch": 0.1990542175299681,
"grad_norm": 0.30745571851730347,
"learning_rate": 8e-05,
"loss": 1.8315,
"step": 905
},
{
"epoch": 0.19927416694160344,
"grad_norm": 0.2734341323375702,
"learning_rate": 8e-05,
"loss": 1.7424,
"step": 906
},
{
"epoch": 0.19949411635323874,
"grad_norm": 0.2613460123538971,
"learning_rate": 8e-05,
"loss": 1.6445,
"step": 907
},
{
"epoch": 0.19971406576487408,
"grad_norm": 0.27867522835731506,
"learning_rate": 8e-05,
"loss": 1.7075,
"step": 908
},
{
"epoch": 0.1999340151765094,
"grad_norm": 0.269789457321167,
"learning_rate": 8e-05,
"loss": 1.6801,
"step": 909
},
{
"epoch": 0.20015396458814472,
"grad_norm": 0.2684427797794342,
"learning_rate": 8e-05,
"loss": 1.6862,
"step": 910
},
{
"epoch": 0.20037391399978005,
"grad_norm": 0.2929883897304535,
"learning_rate": 8e-05,
"loss": 1.7972,
"step": 911
},
{
"epoch": 0.20059386341141539,
"grad_norm": 0.2757764756679535,
"learning_rate": 8e-05,
"loss": 1.6198,
"step": 912
},
{
"epoch": 0.2008138128230507,
"grad_norm": 0.28071129322052,
"learning_rate": 8e-05,
"loss": 1.8268,
"step": 913
},
{
"epoch": 0.20103376223468603,
"grad_norm": 0.2964448928833008,
"learning_rate": 8e-05,
"loss": 1.7806,
"step": 914
},
{
"epoch": 0.20125371164632136,
"grad_norm": 0.2682490050792694,
"learning_rate": 8e-05,
"loss": 1.7016,
"step": 915
},
{
"epoch": 0.20147366105795667,
"grad_norm": 0.2838338613510132,
"learning_rate": 8e-05,
"loss": 1.7402,
"step": 916
},
{
"epoch": 0.201693610469592,
"grad_norm": 0.27621790766716003,
"learning_rate": 8e-05,
"loss": 1.7442,
"step": 917
},
{
"epoch": 0.2019135598812273,
"grad_norm": 0.29265734553337097,
"learning_rate": 8e-05,
"loss": 1.6924,
"step": 918
},
{
"epoch": 0.20213350929286264,
"grad_norm": 0.27404630184173584,
"learning_rate": 8e-05,
"loss": 1.7312,
"step": 919
},
{
"epoch": 0.20235345870449797,
"grad_norm": 0.2742730975151062,
"learning_rate": 8e-05,
"loss": 1.8645,
"step": 920
},
{
"epoch": 0.20257340811613328,
"grad_norm": 0.28536343574523926,
"learning_rate": 8e-05,
"loss": 1.717,
"step": 921
},
{
"epoch": 0.20279335752776861,
"grad_norm": 0.28739288449287415,
"learning_rate": 8e-05,
"loss": 1.8356,
"step": 922
},
{
"epoch": 0.20301330693940395,
"grad_norm": 0.2650564908981323,
"learning_rate": 8e-05,
"loss": 1.717,
"step": 923
},
{
"epoch": 0.20323325635103925,
"grad_norm": 0.28638410568237305,
"learning_rate": 8e-05,
"loss": 1.8822,
"step": 924
},
{
"epoch": 0.2034532057626746,
"grad_norm": 0.25474488735198975,
"learning_rate": 8e-05,
"loss": 1.5533,
"step": 925
},
{
"epoch": 0.20367315517430992,
"grad_norm": 0.2719588279724121,
"learning_rate": 8e-05,
"loss": 1.7887,
"step": 926
},
{
"epoch": 0.20389310458594523,
"grad_norm": 0.2572193741798401,
"learning_rate": 8e-05,
"loss": 1.5735,
"step": 927
},
{
"epoch": 0.20411305399758056,
"grad_norm": 0.2975933253765106,
"learning_rate": 8e-05,
"loss": 1.7875,
"step": 928
},
{
"epoch": 0.20433300340921587,
"grad_norm": 0.2562117874622345,
"learning_rate": 8e-05,
"loss": 1.5977,
"step": 929
},
{
"epoch": 0.2045529528208512,
"grad_norm": 0.2524821162223816,
"learning_rate": 8e-05,
"loss": 1.6356,
"step": 930
},
{
"epoch": 0.20477290223248654,
"grad_norm": 0.2621130347251892,
"learning_rate": 8e-05,
"loss": 1.6082,
"step": 931
},
{
"epoch": 0.20499285164412184,
"grad_norm": 0.27930355072021484,
"learning_rate": 8e-05,
"loss": 1.7618,
"step": 932
},
{
"epoch": 0.20521280105575718,
"grad_norm": 0.29147934913635254,
"learning_rate": 8e-05,
"loss": 1.8223,
"step": 933
},
{
"epoch": 0.2054327504673925,
"grad_norm": 0.2584928870201111,
"learning_rate": 8e-05,
"loss": 1.6916,
"step": 934
},
{
"epoch": 0.20565269987902782,
"grad_norm": 0.27299705147743225,
"learning_rate": 8e-05,
"loss": 1.5535,
"step": 935
},
{
"epoch": 0.20587264929066315,
"grad_norm": 0.2682443857192993,
"learning_rate": 8e-05,
"loss": 1.7119,
"step": 936
},
{
"epoch": 0.20609259870229849,
"grad_norm": 0.29716598987579346,
"learning_rate": 8e-05,
"loss": 2.0561,
"step": 937
},
{
"epoch": 0.2063125481139338,
"grad_norm": 0.27801281213760376,
"learning_rate": 8e-05,
"loss": 1.7605,
"step": 938
},
{
"epoch": 0.20653249752556913,
"grad_norm": 0.26767662167549133,
"learning_rate": 8e-05,
"loss": 1.6462,
"step": 939
},
{
"epoch": 0.20675244693720443,
"grad_norm": 0.27354639768600464,
"learning_rate": 8e-05,
"loss": 1.7241,
"step": 940
},
{
"epoch": 0.20697239634883977,
"grad_norm": 0.2684631049633026,
"learning_rate": 8e-05,
"loss": 1.8066,
"step": 941
},
{
"epoch": 0.2071923457604751,
"grad_norm": 0.27846816182136536,
"learning_rate": 8e-05,
"loss": 1.7553,
"step": 942
},
{
"epoch": 0.2074122951721104,
"grad_norm": 0.2820284366607666,
"learning_rate": 8e-05,
"loss": 1.8302,
"step": 943
},
{
"epoch": 0.20763224458374574,
"grad_norm": 0.28080835938453674,
"learning_rate": 8e-05,
"loss": 1.7544,
"step": 944
},
{
"epoch": 0.20785219399538107,
"grad_norm": 0.28095102310180664,
"learning_rate": 8e-05,
"loss": 1.7271,
"step": 945
},
{
"epoch": 0.20807214340701638,
"grad_norm": 0.27856144309043884,
"learning_rate": 8e-05,
"loss": 1.8441,
"step": 946
},
{
"epoch": 0.20829209281865171,
"grad_norm": 0.27816230058670044,
"learning_rate": 8e-05,
"loss": 1.981,
"step": 947
},
{
"epoch": 0.20851204223028702,
"grad_norm": 0.2954215705394745,
"learning_rate": 8e-05,
"loss": 1.8001,
"step": 948
},
{
"epoch": 0.20873199164192235,
"grad_norm": 0.24413350224494934,
"learning_rate": 8e-05,
"loss": 1.5436,
"step": 949
},
{
"epoch": 0.2089519410535577,
"grad_norm": 0.2849874198436737,
"learning_rate": 8e-05,
"loss": 1.7027,
"step": 950
},
{
"epoch": 0.209171890465193,
"grad_norm": 0.2710252106189728,
"learning_rate": 8e-05,
"loss": 1.7222,
"step": 951
},
{
"epoch": 0.20939183987682833,
"grad_norm": 0.2557348608970642,
"learning_rate": 8e-05,
"loss": 1.6469,
"step": 952
},
{
"epoch": 0.20961178928846366,
"grad_norm": 0.2688618004322052,
"learning_rate": 8e-05,
"loss": 1.6471,
"step": 953
},
{
"epoch": 0.20983173870009897,
"grad_norm": 0.28641626238822937,
"learning_rate": 8e-05,
"loss": 1.8796,
"step": 954
},
{
"epoch": 0.2100516881117343,
"grad_norm": 0.2582222521305084,
"learning_rate": 8e-05,
"loss": 1.5961,
"step": 955
},
{
"epoch": 0.21027163752336964,
"grad_norm": 0.2615504562854767,
"learning_rate": 8e-05,
"loss": 1.6668,
"step": 956
},
{
"epoch": 0.21049158693500494,
"grad_norm": 0.2669670879840851,
"learning_rate": 8e-05,
"loss": 1.663,
"step": 957
},
{
"epoch": 0.21071153634664028,
"grad_norm": 0.2649092972278595,
"learning_rate": 8e-05,
"loss": 1.5377,
"step": 958
},
{
"epoch": 0.21093148575827558,
"grad_norm": 0.2936461865901947,
"learning_rate": 8e-05,
"loss": 1.5659,
"step": 959
},
{
"epoch": 0.21115143516991092,
"grad_norm": 0.2878846824169159,
"learning_rate": 8e-05,
"loss": 1.8567,
"step": 960
},
{
"epoch": 0.21137138458154625,
"grad_norm": 0.2928799092769623,
"learning_rate": 8e-05,
"loss": 1.8423,
"step": 961
},
{
"epoch": 0.21159133399318156,
"grad_norm": 0.2641200125217438,
"learning_rate": 8e-05,
"loss": 1.6403,
"step": 962
},
{
"epoch": 0.2118112834048169,
"grad_norm": 0.26553985476493835,
"learning_rate": 8e-05,
"loss": 1.7436,
"step": 963
},
{
"epoch": 0.21203123281645223,
"grad_norm": 0.25616276264190674,
"learning_rate": 8e-05,
"loss": 1.5959,
"step": 964
},
{
"epoch": 0.21225118222808753,
"grad_norm": 0.29729175567626953,
"learning_rate": 8e-05,
"loss": 1.7164,
"step": 965
},
{
"epoch": 0.21247113163972287,
"grad_norm": 0.2739759683609009,
"learning_rate": 8e-05,
"loss": 1.7797,
"step": 966
},
{
"epoch": 0.2126910810513582,
"grad_norm": 0.2686353921890259,
"learning_rate": 8e-05,
"loss": 1.6974,
"step": 967
},
{
"epoch": 0.2129110304629935,
"grad_norm": 0.261820912361145,
"learning_rate": 8e-05,
"loss": 1.6864,
"step": 968
},
{
"epoch": 0.21313097987462884,
"grad_norm": 0.26877105236053467,
"learning_rate": 8e-05,
"loss": 1.6164,
"step": 969
},
{
"epoch": 0.21335092928626415,
"grad_norm": 0.2555043399333954,
"learning_rate": 8e-05,
"loss": 1.6898,
"step": 970
},
{
"epoch": 0.21357087869789948,
"grad_norm": 0.28584909439086914,
"learning_rate": 8e-05,
"loss": 1.9125,
"step": 971
},
{
"epoch": 0.21379082810953481,
"grad_norm": 0.2830945551395416,
"learning_rate": 8e-05,
"loss": 1.6416,
"step": 972
},
{
"epoch": 0.21401077752117012,
"grad_norm": 0.27979904413223267,
"learning_rate": 8e-05,
"loss": 1.8355,
"step": 973
},
{
"epoch": 0.21423072693280545,
"grad_norm": 0.2672286033630371,
"learning_rate": 8e-05,
"loss": 1.685,
"step": 974
},
{
"epoch": 0.2144506763444408,
"grad_norm": 0.26699069142341614,
"learning_rate": 8e-05,
"loss": 1.5951,
"step": 975
},
{
"epoch": 0.2146706257560761,
"grad_norm": 0.2720418870449066,
"learning_rate": 8e-05,
"loss": 1.7558,
"step": 976
},
{
"epoch": 0.21489057516771143,
"grad_norm": 0.26792463660240173,
"learning_rate": 8e-05,
"loss": 1.7407,
"step": 977
},
{
"epoch": 0.21511052457934676,
"grad_norm": 0.2763652503490448,
"learning_rate": 8e-05,
"loss": 1.7525,
"step": 978
},
{
"epoch": 0.21533047399098207,
"grad_norm": 0.2952554225921631,
"learning_rate": 8e-05,
"loss": 1.6535,
"step": 979
},
{
"epoch": 0.2155504234026174,
"grad_norm": 0.24981874227523804,
"learning_rate": 8e-05,
"loss": 1.6055,
"step": 980
},
{
"epoch": 0.2157703728142527,
"grad_norm": 0.29071807861328125,
"learning_rate": 8e-05,
"loss": 1.7461,
"step": 981
},
{
"epoch": 0.21599032222588804,
"grad_norm": 0.26875782012939453,
"learning_rate": 8e-05,
"loss": 1.5809,
"step": 982
},
{
"epoch": 0.21621027163752338,
"grad_norm": 0.2519072890281677,
"learning_rate": 8e-05,
"loss": 1.7001,
"step": 983
},
{
"epoch": 0.21643022104915868,
"grad_norm": 0.2748781144618988,
"learning_rate": 8e-05,
"loss": 1.8367,
"step": 984
},
{
"epoch": 0.21665017046079402,
"grad_norm": 0.274047315120697,
"learning_rate": 8e-05,
"loss": 1.7698,
"step": 985
},
{
"epoch": 0.21687011987242935,
"grad_norm": 0.2614712119102478,
"learning_rate": 8e-05,
"loss": 1.5411,
"step": 986
},
{
"epoch": 0.21709006928406466,
"grad_norm": 0.2714536190032959,
"learning_rate": 8e-05,
"loss": 1.6058,
"step": 987
},
{
"epoch": 0.2173100186957,
"grad_norm": 0.28763729333877563,
"learning_rate": 8e-05,
"loss": 1.6711,
"step": 988
},
{
"epoch": 0.21752996810733533,
"grad_norm": 0.26780402660369873,
"learning_rate": 8e-05,
"loss": 1.549,
"step": 989
},
{
"epoch": 0.21774991751897063,
"grad_norm": 0.28782159090042114,
"learning_rate": 8e-05,
"loss": 1.8305,
"step": 990
},
{
"epoch": 0.21796986693060597,
"grad_norm": 0.2859013080596924,
"learning_rate": 8e-05,
"loss": 1.7794,
"step": 991
},
{
"epoch": 0.21818981634224127,
"grad_norm": 0.2893369197845459,
"learning_rate": 8e-05,
"loss": 1.6284,
"step": 992
},
{
"epoch": 0.2184097657538766,
"grad_norm": 0.2809627652168274,
"learning_rate": 8e-05,
"loss": 1.6401,
"step": 993
},
{
"epoch": 0.21862971516551194,
"grad_norm": 0.2700895667076111,
"learning_rate": 8e-05,
"loss": 1.7153,
"step": 994
},
{
"epoch": 0.21884966457714725,
"grad_norm": 0.26506903767585754,
"learning_rate": 8e-05,
"loss": 1.4688,
"step": 995
},
{
"epoch": 0.21906961398878258,
"grad_norm": 0.28202024102211,
"learning_rate": 8e-05,
"loss": 1.7009,
"step": 996
},
{
"epoch": 0.21928956340041791,
"grad_norm": 0.2625409960746765,
"learning_rate": 8e-05,
"loss": 1.6491,
"step": 997
},
{
"epoch": 0.21950951281205322,
"grad_norm": 0.29967787861824036,
"learning_rate": 8e-05,
"loss": 1.7231,
"step": 998
},
{
"epoch": 0.21972946222368855,
"grad_norm": 0.2992357909679413,
"learning_rate": 8e-05,
"loss": 1.7028,
"step": 999
},
{
"epoch": 0.2199494116353239,
"grad_norm": 0.28712475299835205,
"learning_rate": 8e-05,
"loss": 1.763,
"step": 1000
},
{
"epoch": 0.2201693610469592,
"grad_norm": 0.26186901330947876,
"learning_rate": 8e-05,
"loss": 1.5695,
"step": 1001
},
{
"epoch": 0.22038931045859453,
"grad_norm": 0.2897952198982239,
"learning_rate": 8e-05,
"loss": 1.6303,
"step": 1002
},
{
"epoch": 0.22060925987022983,
"grad_norm": 0.2761494815349579,
"learning_rate": 8e-05,
"loss": 1.7448,
"step": 1003
},
{
"epoch": 0.22082920928186517,
"grad_norm": 0.2604154944419861,
"learning_rate": 8e-05,
"loss": 1.53,
"step": 1004
},
{
"epoch": 0.2210491586935005,
"grad_norm": 0.2897418737411499,
"learning_rate": 8e-05,
"loss": 1.7639,
"step": 1005
},
{
"epoch": 0.2212691081051358,
"grad_norm": 0.28289687633514404,
"learning_rate": 8e-05,
"loss": 1.7202,
"step": 1006
},
{
"epoch": 0.22148905751677114,
"grad_norm": 0.26917099952697754,
"learning_rate": 8e-05,
"loss": 1.7183,
"step": 1007
},
{
"epoch": 0.22170900692840648,
"grad_norm": 0.26708024740219116,
"learning_rate": 8e-05,
"loss": 1.636,
"step": 1008
},
{
"epoch": 0.22192895634004178,
"grad_norm": 0.2759459316730499,
"learning_rate": 8e-05,
"loss": 1.6537,
"step": 1009
},
{
"epoch": 0.22214890575167712,
"grad_norm": 0.3040393590927124,
"learning_rate": 8e-05,
"loss": 1.7849,
"step": 1010
},
{
"epoch": 0.22236885516331245,
"grad_norm": 0.2729750871658325,
"learning_rate": 8e-05,
"loss": 1.8199,
"step": 1011
},
{
"epoch": 0.22258880457494776,
"grad_norm": 0.28002965450286865,
"learning_rate": 8e-05,
"loss": 1.8286,
"step": 1012
},
{
"epoch": 0.2228087539865831,
"grad_norm": 0.27389100193977356,
"learning_rate": 8e-05,
"loss": 1.6472,
"step": 1013
},
{
"epoch": 0.2230287033982184,
"grad_norm": 0.2610195279121399,
"learning_rate": 8e-05,
"loss": 1.6096,
"step": 1014
},
{
"epoch": 0.22324865280985373,
"grad_norm": 0.2683162987232208,
"learning_rate": 8e-05,
"loss": 1.6477,
"step": 1015
},
{
"epoch": 0.22346860222148907,
"grad_norm": 0.26524773240089417,
"learning_rate": 8e-05,
"loss": 1.6224,
"step": 1016
},
{
"epoch": 0.22368855163312437,
"grad_norm": 0.26295366883277893,
"learning_rate": 8e-05,
"loss": 1.7316,
"step": 1017
},
{
"epoch": 0.2239085010447597,
"grad_norm": 0.2837565243244171,
"learning_rate": 8e-05,
"loss": 1.9452,
"step": 1018
},
{
"epoch": 0.22412845045639504,
"grad_norm": 0.28365132212638855,
"learning_rate": 8e-05,
"loss": 1.6577,
"step": 1019
},
{
"epoch": 0.22434839986803035,
"grad_norm": 0.2736522853374481,
"learning_rate": 8e-05,
"loss": 1.6644,
"step": 1020
},
{
"epoch": 0.22456834927966568,
"grad_norm": 0.2878374755382538,
"learning_rate": 8e-05,
"loss": 1.5844,
"step": 1021
},
{
"epoch": 0.22478829869130101,
"grad_norm": 0.28223422169685364,
"learning_rate": 8e-05,
"loss": 1.881,
"step": 1022
},
{
"epoch": 0.22500824810293632,
"grad_norm": 0.26408734917640686,
"learning_rate": 8e-05,
"loss": 1.6201,
"step": 1023
},
{
"epoch": 0.22522819751457165,
"grad_norm": 0.28506824374198914,
"learning_rate": 8e-05,
"loss": 1.8146,
"step": 1024
},
{
"epoch": 0.22544814692620696,
"grad_norm": 0.2808188796043396,
"learning_rate": 8e-05,
"loss": 1.8394,
"step": 1025
},
{
"epoch": 0.2256680963378423,
"grad_norm": 0.2950645387172699,
"learning_rate": 8e-05,
"loss": 1.7993,
"step": 1026
},
{
"epoch": 0.22588804574947763,
"grad_norm": 0.27935850620269775,
"learning_rate": 8e-05,
"loss": 1.6506,
"step": 1027
},
{
"epoch": 0.22610799516111293,
"grad_norm": 0.2576957643032074,
"learning_rate": 8e-05,
"loss": 1.6987,
"step": 1028
},
{
"epoch": 0.22632794457274827,
"grad_norm": 0.2719384729862213,
"learning_rate": 8e-05,
"loss": 1.6407,
"step": 1029
},
{
"epoch": 0.2265478939843836,
"grad_norm": 0.25457167625427246,
"learning_rate": 8e-05,
"loss": 1.6877,
"step": 1030
},
{
"epoch": 0.2267678433960189,
"grad_norm": 0.2758035659790039,
"learning_rate": 8e-05,
"loss": 1.6739,
"step": 1031
},
{
"epoch": 0.22698779280765424,
"grad_norm": 0.27135321497917175,
"learning_rate": 8e-05,
"loss": 1.7124,
"step": 1032
},
{
"epoch": 0.22720774221928958,
"grad_norm": 0.2675740420818329,
"learning_rate": 8e-05,
"loss": 1.7857,
"step": 1033
},
{
"epoch": 0.22742769163092488,
"grad_norm": 0.28627943992614746,
"learning_rate": 8e-05,
"loss": 1.7012,
"step": 1034
},
{
"epoch": 0.22764764104256022,
"grad_norm": 0.2710109353065491,
"learning_rate": 8e-05,
"loss": 1.6463,
"step": 1035
},
{
"epoch": 0.22786759045419552,
"grad_norm": 0.27190473675727844,
"learning_rate": 8e-05,
"loss": 1.7288,
"step": 1036
},
{
"epoch": 0.22808753986583086,
"grad_norm": 0.2503564953804016,
"learning_rate": 8e-05,
"loss": 1.566,
"step": 1037
},
{
"epoch": 0.2283074892774662,
"grad_norm": 0.26503992080688477,
"learning_rate": 8e-05,
"loss": 1.7034,
"step": 1038
},
{
"epoch": 0.2285274386891015,
"grad_norm": 0.29445260763168335,
"learning_rate": 8e-05,
"loss": 1.6739,
"step": 1039
},
{
"epoch": 0.22874738810073683,
"grad_norm": 0.25705471634864807,
"learning_rate": 8e-05,
"loss": 1.6503,
"step": 1040
},
{
"epoch": 0.22896733751237217,
"grad_norm": 0.27109014987945557,
"learning_rate": 8e-05,
"loss": 1.8045,
"step": 1041
},
{
"epoch": 0.22918728692400747,
"grad_norm": 0.2972055673599243,
"learning_rate": 8e-05,
"loss": 1.6439,
"step": 1042
},
{
"epoch": 0.2294072363356428,
"grad_norm": 0.27126485109329224,
"learning_rate": 8e-05,
"loss": 1.672,
"step": 1043
},
{
"epoch": 0.22962718574727814,
"grad_norm": 0.2731145918369293,
"learning_rate": 8e-05,
"loss": 1.7795,
"step": 1044
},
{
"epoch": 0.22984713515891345,
"grad_norm": 0.2768365442752838,
"learning_rate": 8e-05,
"loss": 1.7145,
"step": 1045
},
{
"epoch": 0.23006708457054878,
"grad_norm": 0.2606940269470215,
"learning_rate": 8e-05,
"loss": 1.6169,
"step": 1046
},
{
"epoch": 0.2302870339821841,
"grad_norm": 0.2898729741573334,
"learning_rate": 8e-05,
"loss": 1.7315,
"step": 1047
},
{
"epoch": 0.23050698339381942,
"grad_norm": 0.2772413194179535,
"learning_rate": 8e-05,
"loss": 1.8632,
"step": 1048
},
{
"epoch": 0.23072693280545475,
"grad_norm": 0.25808605551719666,
"learning_rate": 8e-05,
"loss": 1.6626,
"step": 1049
},
{
"epoch": 0.23094688221709006,
"grad_norm": 0.2727161645889282,
"learning_rate": 8e-05,
"loss": 1.7848,
"step": 1050
},
{
"epoch": 0.2311668316287254,
"grad_norm": 0.25677087903022766,
"learning_rate": 8e-05,
"loss": 1.6168,
"step": 1051
},
{
"epoch": 0.23138678104036073,
"grad_norm": 0.2761050760746002,
"learning_rate": 8e-05,
"loss": 1.8615,
"step": 1052
},
{
"epoch": 0.23160673045199603,
"grad_norm": 0.2862778604030609,
"learning_rate": 8e-05,
"loss": 1.8728,
"step": 1053
},
{
"epoch": 0.23182667986363137,
"grad_norm": 0.27526941895484924,
"learning_rate": 8e-05,
"loss": 1.7627,
"step": 1054
},
{
"epoch": 0.23204662927526667,
"grad_norm": 0.2932235896587372,
"learning_rate": 8e-05,
"loss": 1.8539,
"step": 1055
},
{
"epoch": 0.232266578686902,
"grad_norm": 0.2770839035511017,
"learning_rate": 8e-05,
"loss": 1.7393,
"step": 1056
},
{
"epoch": 0.23248652809853734,
"grad_norm": 0.2741580307483673,
"learning_rate": 8e-05,
"loss": 1.6076,
"step": 1057
},
{
"epoch": 0.23270647751017265,
"grad_norm": 0.2788783311843872,
"learning_rate": 8e-05,
"loss": 1.7615,
"step": 1058
},
{
"epoch": 0.23292642692180798,
"grad_norm": 0.28565406799316406,
"learning_rate": 8e-05,
"loss": 1.6266,
"step": 1059
},
{
"epoch": 0.23314637633344332,
"grad_norm": 0.26543545722961426,
"learning_rate": 8e-05,
"loss": 1.7192,
"step": 1060
},
{
"epoch": 0.23336632574507862,
"grad_norm": 0.2770478129386902,
"learning_rate": 8e-05,
"loss": 1.8056,
"step": 1061
},
{
"epoch": 0.23358627515671396,
"grad_norm": 0.27805015444755554,
"learning_rate": 8e-05,
"loss": 1.6735,
"step": 1062
},
{
"epoch": 0.2338062245683493,
"grad_norm": 0.309862345457077,
"learning_rate": 8e-05,
"loss": 1.7235,
"step": 1063
},
{
"epoch": 0.2340261739799846,
"grad_norm": 0.27140697836875916,
"learning_rate": 8e-05,
"loss": 1.6883,
"step": 1064
},
{
"epoch": 0.23424612339161993,
"grad_norm": 0.3052090108394623,
"learning_rate": 8e-05,
"loss": 1.8792,
"step": 1065
},
{
"epoch": 0.23446607280325524,
"grad_norm": 0.2995065450668335,
"learning_rate": 8e-05,
"loss": 1.6632,
"step": 1066
},
{
"epoch": 0.23468602221489057,
"grad_norm": 0.2782532870769501,
"learning_rate": 8e-05,
"loss": 1.7395,
"step": 1067
},
{
"epoch": 0.2349059716265259,
"grad_norm": 0.28436902165412903,
"learning_rate": 8e-05,
"loss": 1.8416,
"step": 1068
},
{
"epoch": 0.2351259210381612,
"grad_norm": 0.2740377187728882,
"learning_rate": 8e-05,
"loss": 1.9026,
"step": 1069
},
{
"epoch": 0.23534587044979655,
"grad_norm": 0.2978285849094391,
"learning_rate": 8e-05,
"loss": 1.7277,
"step": 1070
},
{
"epoch": 0.23556581986143188,
"grad_norm": 0.27265986800193787,
"learning_rate": 8e-05,
"loss": 1.7376,
"step": 1071
},
{
"epoch": 0.2357857692730672,
"grad_norm": 0.24915599822998047,
"learning_rate": 8e-05,
"loss": 1.6151,
"step": 1072
},
{
"epoch": 0.23600571868470252,
"grad_norm": 0.28203171491622925,
"learning_rate": 8e-05,
"loss": 1.7713,
"step": 1073
},
{
"epoch": 0.23622566809633785,
"grad_norm": 0.278793066740036,
"learning_rate": 8e-05,
"loss": 1.6717,
"step": 1074
},
{
"epoch": 0.23644561750797316,
"grad_norm": 0.2760609984397888,
"learning_rate": 8e-05,
"loss": 1.5866,
"step": 1075
},
{
"epoch": 0.2366655669196085,
"grad_norm": 0.2726036012172699,
"learning_rate": 8e-05,
"loss": 1.6774,
"step": 1076
},
{
"epoch": 0.2368855163312438,
"grad_norm": 0.27443891763687134,
"learning_rate": 8e-05,
"loss": 1.7615,
"step": 1077
},
{
"epoch": 0.23710546574287913,
"grad_norm": 0.2818880081176758,
"learning_rate": 8e-05,
"loss": 1.7433,
"step": 1078
},
{
"epoch": 0.23732541515451447,
"grad_norm": 0.2646252512931824,
"learning_rate": 8e-05,
"loss": 1.5498,
"step": 1079
},
{
"epoch": 0.23754536456614977,
"grad_norm": 0.2964784502983093,
"learning_rate": 8e-05,
"loss": 1.6162,
"step": 1080
},
{
"epoch": 0.2377653139777851,
"grad_norm": 0.3044411242008209,
"learning_rate": 8e-05,
"loss": 1.7395,
"step": 1081
},
{
"epoch": 0.23798526338942044,
"grad_norm": 0.28679221868515015,
"learning_rate": 8e-05,
"loss": 1.8126,
"step": 1082
},
{
"epoch": 0.23820521280105575,
"grad_norm": 0.26326417922973633,
"learning_rate": 8e-05,
"loss": 1.6451,
"step": 1083
},
{
"epoch": 0.23842516221269108,
"grad_norm": 0.28527480363845825,
"learning_rate": 8e-05,
"loss": 1.8442,
"step": 1084
},
{
"epoch": 0.23864511162432642,
"grad_norm": 0.28897759318351746,
"learning_rate": 8e-05,
"loss": 1.8224,
"step": 1085
},
{
"epoch": 0.23886506103596172,
"grad_norm": 0.2955721616744995,
"learning_rate": 8e-05,
"loss": 1.7304,
"step": 1086
},
{
"epoch": 0.23908501044759706,
"grad_norm": 0.26267075538635254,
"learning_rate": 8e-05,
"loss": 1.67,
"step": 1087
},
{
"epoch": 0.23930495985923236,
"grad_norm": 0.27105912566185,
"learning_rate": 8e-05,
"loss": 1.7461,
"step": 1088
},
{
"epoch": 0.2395249092708677,
"grad_norm": 0.26483941078186035,
"learning_rate": 8e-05,
"loss": 1.6215,
"step": 1089
},
{
"epoch": 0.23974485868250303,
"grad_norm": 0.2804373800754547,
"learning_rate": 8e-05,
"loss": 1.6618,
"step": 1090
},
{
"epoch": 0.23996480809413834,
"grad_norm": 0.26146185398101807,
"learning_rate": 8e-05,
"loss": 1.6641,
"step": 1091
},
{
"epoch": 0.24018475750577367,
"grad_norm": 0.2839837372303009,
"learning_rate": 8e-05,
"loss": 1.5898,
"step": 1092
},
{
"epoch": 0.240404706917409,
"grad_norm": 0.26833322644233704,
"learning_rate": 8e-05,
"loss": 1.8341,
"step": 1093
},
{
"epoch": 0.2406246563290443,
"grad_norm": 0.2779574394226074,
"learning_rate": 8e-05,
"loss": 1.7142,
"step": 1094
},
{
"epoch": 0.24084460574067965,
"grad_norm": 0.2821759879589081,
"learning_rate": 8e-05,
"loss": 1.7261,
"step": 1095
},
{
"epoch": 0.24106455515231498,
"grad_norm": 0.2849150002002716,
"learning_rate": 8e-05,
"loss": 1.6834,
"step": 1096
},
{
"epoch": 0.24128450456395029,
"grad_norm": 0.277148574590683,
"learning_rate": 8e-05,
"loss": 1.5617,
"step": 1097
},
{
"epoch": 0.24150445397558562,
"grad_norm": 0.28307756781578064,
"learning_rate": 8e-05,
"loss": 1.8104,
"step": 1098
},
{
"epoch": 0.24172440338722093,
"grad_norm": 0.28540289402008057,
"learning_rate": 8e-05,
"loss": 1.7331,
"step": 1099
},
{
"epoch": 0.24194435279885626,
"grad_norm": 0.277544766664505,
"learning_rate": 8e-05,
"loss": 1.762,
"step": 1100
},
{
"epoch": 0.2421643022104916,
"grad_norm": 0.259435772895813,
"learning_rate": 8e-05,
"loss": 1.5474,
"step": 1101
},
{
"epoch": 0.2423842516221269,
"grad_norm": 0.2759372591972351,
"learning_rate": 8e-05,
"loss": 1.6535,
"step": 1102
},
{
"epoch": 0.24260420103376223,
"grad_norm": 0.27163347601890564,
"learning_rate": 8e-05,
"loss": 1.6035,
"step": 1103
},
{
"epoch": 0.24282415044539757,
"grad_norm": 0.26722922921180725,
"learning_rate": 8e-05,
"loss": 1.7607,
"step": 1104
},
{
"epoch": 0.24304409985703287,
"grad_norm": 0.2925039529800415,
"learning_rate": 8e-05,
"loss": 1.6441,
"step": 1105
},
{
"epoch": 0.2432640492686682,
"grad_norm": 0.271672785282135,
"learning_rate": 8e-05,
"loss": 1.658,
"step": 1106
},
{
"epoch": 0.24348399868030354,
"grad_norm": 0.2827896773815155,
"learning_rate": 8e-05,
"loss": 1.6258,
"step": 1107
},
{
"epoch": 0.24370394809193885,
"grad_norm": 0.2732497751712799,
"learning_rate": 8e-05,
"loss": 1.6379,
"step": 1108
},
{
"epoch": 0.24392389750357418,
"grad_norm": 0.28081193566322327,
"learning_rate": 8e-05,
"loss": 1.7901,
"step": 1109
},
{
"epoch": 0.2441438469152095,
"grad_norm": 0.2799675762653351,
"learning_rate": 8e-05,
"loss": 1.8323,
"step": 1110
},
{
"epoch": 0.24436379632684482,
"grad_norm": 0.2677648961544037,
"learning_rate": 8e-05,
"loss": 1.7372,
"step": 1111
},
{
"epoch": 0.24458374573848016,
"grad_norm": 0.2644648551940918,
"learning_rate": 8e-05,
"loss": 1.6594,
"step": 1112
},
{
"epoch": 0.24480369515011546,
"grad_norm": 0.2704750895500183,
"learning_rate": 8e-05,
"loss": 1.706,
"step": 1113
},
{
"epoch": 0.2450236445617508,
"grad_norm": 0.2762587368488312,
"learning_rate": 8e-05,
"loss": 1.7445,
"step": 1114
},
{
"epoch": 0.24524359397338613,
"grad_norm": 0.2578018307685852,
"learning_rate": 8e-05,
"loss": 1.5707,
"step": 1115
},
{
"epoch": 0.24546354338502144,
"grad_norm": 0.2892129719257355,
"learning_rate": 8e-05,
"loss": 1.805,
"step": 1116
},
{
"epoch": 0.24568349279665677,
"grad_norm": 0.2868081033229828,
"learning_rate": 8e-05,
"loss": 1.7756,
"step": 1117
},
{
"epoch": 0.2459034422082921,
"grad_norm": 0.2820534110069275,
"learning_rate": 8e-05,
"loss": 1.7826,
"step": 1118
},
{
"epoch": 0.2461233916199274,
"grad_norm": 0.2824958264827728,
"learning_rate": 8e-05,
"loss": 1.6752,
"step": 1119
},
{
"epoch": 0.24634334103156275,
"grad_norm": 0.2782610356807709,
"learning_rate": 8e-05,
"loss": 1.7536,
"step": 1120
},
{
"epoch": 0.24656329044319805,
"grad_norm": 0.27147912979125977,
"learning_rate": 8e-05,
"loss": 1.6783,
"step": 1121
},
{
"epoch": 0.24678323985483339,
"grad_norm": 0.2740795612335205,
"learning_rate": 8e-05,
"loss": 1.7702,
"step": 1122
},
{
"epoch": 0.24700318926646872,
"grad_norm": 0.2922619879245758,
"learning_rate": 8e-05,
"loss": 1.8204,
"step": 1123
},
{
"epoch": 0.24722313867810403,
"grad_norm": 0.2872619926929474,
"learning_rate": 8e-05,
"loss": 1.714,
"step": 1124
},
{
"epoch": 0.24744308808973936,
"grad_norm": 0.27333369851112366,
"learning_rate": 8e-05,
"loss": 1.6575,
"step": 1125
},
{
"epoch": 0.2476630375013747,
"grad_norm": 0.28192320466041565,
"learning_rate": 8e-05,
"loss": 1.7221,
"step": 1126
},
{
"epoch": 0.24788298691301,
"grad_norm": 0.26607248187065125,
"learning_rate": 8e-05,
"loss": 1.7262,
"step": 1127
},
{
"epoch": 0.24810293632464533,
"grad_norm": 0.279690682888031,
"learning_rate": 8e-05,
"loss": 1.7004,
"step": 1128
},
{
"epoch": 0.24832288573628067,
"grad_norm": 0.27289190888404846,
"learning_rate": 8e-05,
"loss": 1.6916,
"step": 1129
},
{
"epoch": 0.24854283514791597,
"grad_norm": 0.27388349175453186,
"learning_rate": 8e-05,
"loss": 1.6656,
"step": 1130
},
{
"epoch": 0.2487627845595513,
"grad_norm": 0.2912501096725464,
"learning_rate": 8e-05,
"loss": 1.8086,
"step": 1131
},
{
"epoch": 0.24898273397118661,
"grad_norm": 0.2999799847602844,
"learning_rate": 8e-05,
"loss": 1.7659,
"step": 1132
},
{
"epoch": 0.24920268338282195,
"grad_norm": 0.262207955121994,
"learning_rate": 8e-05,
"loss": 1.6581,
"step": 1133
},
{
"epoch": 0.24942263279445728,
"grad_norm": 0.2571624517440796,
"learning_rate": 8e-05,
"loss": 1.6509,
"step": 1134
},
{
"epoch": 0.2496425822060926,
"grad_norm": 0.26213690638542175,
"learning_rate": 8e-05,
"loss": 1.6044,
"step": 1135
},
{
"epoch": 0.24986253161772792,
"grad_norm": 0.2870398461818695,
"learning_rate": 8e-05,
"loss": 1.6678,
"step": 1136
},
{
"epoch": 0.25008248102936326,
"grad_norm": 0.2672583758831024,
"learning_rate": 8e-05,
"loss": 1.6563,
"step": 1137
},
{
"epoch": 0.2503024304409986,
"grad_norm": 0.29864680767059326,
"learning_rate": 8e-05,
"loss": 1.858,
"step": 1138
},
{
"epoch": 0.25052237985263387,
"grad_norm": 0.3096907436847687,
"learning_rate": 8e-05,
"loss": 1.7731,
"step": 1139
},
{
"epoch": 0.2507423292642692,
"grad_norm": 0.2668014466762543,
"learning_rate": 8e-05,
"loss": 1.6173,
"step": 1140
},
{
"epoch": 0.25096227867590454,
"grad_norm": 0.275074303150177,
"learning_rate": 8e-05,
"loss": 1.704,
"step": 1141
},
{
"epoch": 0.25118222808753987,
"grad_norm": 0.29657119512557983,
"learning_rate": 8e-05,
"loss": 1.9789,
"step": 1142
},
{
"epoch": 0.2514021774991752,
"grad_norm": 0.26117807626724243,
"learning_rate": 8e-05,
"loss": 1.6815,
"step": 1143
},
{
"epoch": 0.25162212691081054,
"grad_norm": 0.2738019824028015,
"learning_rate": 8e-05,
"loss": 1.7031,
"step": 1144
},
{
"epoch": 0.2518420763224458,
"grad_norm": 0.27922967076301575,
"learning_rate": 8e-05,
"loss": 1.7914,
"step": 1145
},
{
"epoch": 0.25206202573408115,
"grad_norm": 0.2876172661781311,
"learning_rate": 8e-05,
"loss": 1.721,
"step": 1146
},
{
"epoch": 0.2522819751457165,
"grad_norm": 0.28017961978912354,
"learning_rate": 8e-05,
"loss": 1.6731,
"step": 1147
},
{
"epoch": 0.2525019245573518,
"grad_norm": 0.2898389399051666,
"learning_rate": 8e-05,
"loss": 1.8749,
"step": 1148
},
{
"epoch": 0.25272187396898715,
"grad_norm": 0.2742408812046051,
"learning_rate": 8e-05,
"loss": 1.6811,
"step": 1149
},
{
"epoch": 0.25294182338062243,
"grad_norm": 0.2806207835674286,
"learning_rate": 8e-05,
"loss": 1.7082,
"step": 1150
},
{
"epoch": 0.25316177279225777,
"grad_norm": 0.27871328592300415,
"learning_rate": 8e-05,
"loss": 1.7142,
"step": 1151
},
{
"epoch": 0.2533817222038931,
"grad_norm": 0.2792799472808838,
"learning_rate": 8e-05,
"loss": 1.5703,
"step": 1152
},
{
"epoch": 0.25360167161552843,
"grad_norm": 0.27358901500701904,
"learning_rate": 8e-05,
"loss": 1.7576,
"step": 1153
},
{
"epoch": 0.25382162102716377,
"grad_norm": 0.26983192563056946,
"learning_rate": 8e-05,
"loss": 1.6646,
"step": 1154
},
{
"epoch": 0.2540415704387991,
"grad_norm": 0.2711959183216095,
"learning_rate": 8e-05,
"loss": 1.7698,
"step": 1155
},
{
"epoch": 0.2542615198504344,
"grad_norm": 0.28412333130836487,
"learning_rate": 8e-05,
"loss": 1.7446,
"step": 1156
},
{
"epoch": 0.2544814692620697,
"grad_norm": 0.2698575258255005,
"learning_rate": 8e-05,
"loss": 1.6861,
"step": 1157
},
{
"epoch": 0.25470141867370505,
"grad_norm": 0.2806732952594757,
"learning_rate": 8e-05,
"loss": 1.7308,
"step": 1158
},
{
"epoch": 0.2549213680853404,
"grad_norm": 0.2715948522090912,
"learning_rate": 8e-05,
"loss": 1.852,
"step": 1159
},
{
"epoch": 0.2551413174969757,
"grad_norm": 0.33048170804977417,
"learning_rate": 8e-05,
"loss": 1.881,
"step": 1160
},
{
"epoch": 0.255361266908611,
"grad_norm": 0.27907994389533997,
"learning_rate": 8e-05,
"loss": 1.6501,
"step": 1161
},
{
"epoch": 0.25558121632024633,
"grad_norm": 0.2747988998889923,
"learning_rate": 8e-05,
"loss": 1.7265,
"step": 1162
},
{
"epoch": 0.25580116573188166,
"grad_norm": 0.28321677446365356,
"learning_rate": 8e-05,
"loss": 1.8602,
"step": 1163
},
{
"epoch": 0.256021115143517,
"grad_norm": 0.2695465683937073,
"learning_rate": 8e-05,
"loss": 1.6091,
"step": 1164
},
{
"epoch": 0.25624106455515233,
"grad_norm": 0.272135466337204,
"learning_rate": 8e-05,
"loss": 1.6236,
"step": 1165
},
{
"epoch": 0.25646101396678767,
"grad_norm": 0.2715020775794983,
"learning_rate": 8e-05,
"loss": 1.674,
"step": 1166
},
{
"epoch": 0.25668096337842294,
"grad_norm": 0.2879820764064789,
"learning_rate": 8e-05,
"loss": 1.8393,
"step": 1167
},
{
"epoch": 0.2569009127900583,
"grad_norm": 0.2616657018661499,
"learning_rate": 8e-05,
"loss": 1.6391,
"step": 1168
},
{
"epoch": 0.2571208622016936,
"grad_norm": 0.2558441460132599,
"learning_rate": 8e-05,
"loss": 1.606,
"step": 1169
},
{
"epoch": 0.25734081161332895,
"grad_norm": 0.26944512128829956,
"learning_rate": 8e-05,
"loss": 1.7288,
"step": 1170
},
{
"epoch": 0.2575607610249643,
"grad_norm": 0.26958367228507996,
"learning_rate": 8e-05,
"loss": 1.6233,
"step": 1171
},
{
"epoch": 0.25778071043659956,
"grad_norm": 0.29003527760505676,
"learning_rate": 8e-05,
"loss": 1.777,
"step": 1172
},
{
"epoch": 0.2580006598482349,
"grad_norm": 0.2677457630634308,
"learning_rate": 8e-05,
"loss": 1.6835,
"step": 1173
},
{
"epoch": 0.2582206092598702,
"grad_norm": 0.28062689304351807,
"learning_rate": 8e-05,
"loss": 1.726,
"step": 1174
},
{
"epoch": 0.25844055867150556,
"grad_norm": 0.26764920353889465,
"learning_rate": 8e-05,
"loss": 1.6575,
"step": 1175
},
{
"epoch": 0.2586605080831409,
"grad_norm": 0.28183332085609436,
"learning_rate": 8e-05,
"loss": 1.784,
"step": 1176
},
{
"epoch": 0.25888045749477623,
"grad_norm": 0.25718390941619873,
"learning_rate": 8e-05,
"loss": 1.6317,
"step": 1177
},
{
"epoch": 0.2591004069064115,
"grad_norm": 0.25523149967193604,
"learning_rate": 8e-05,
"loss": 1.5634,
"step": 1178
},
{
"epoch": 0.25932035631804684,
"grad_norm": 0.2539874315261841,
"learning_rate": 8e-05,
"loss": 1.5878,
"step": 1179
},
{
"epoch": 0.2595403057296822,
"grad_norm": 0.2868393659591675,
"learning_rate": 8e-05,
"loss": 1.7301,
"step": 1180
},
{
"epoch": 0.2597602551413175,
"grad_norm": 0.27819645404815674,
"learning_rate": 8e-05,
"loss": 1.6895,
"step": 1181
},
{
"epoch": 0.25998020455295284,
"grad_norm": 0.27499255537986755,
"learning_rate": 8e-05,
"loss": 1.7462,
"step": 1182
},
{
"epoch": 0.2602001539645881,
"grad_norm": 0.2858695685863495,
"learning_rate": 8e-05,
"loss": 1.8199,
"step": 1183
},
{
"epoch": 0.26042010337622346,
"grad_norm": 0.2646760642528534,
"learning_rate": 8e-05,
"loss": 1.6597,
"step": 1184
},
{
"epoch": 0.2606400527878588,
"grad_norm": 0.2831268310546875,
"learning_rate": 8e-05,
"loss": 1.8383,
"step": 1185
},
{
"epoch": 0.2608600021994941,
"grad_norm": 0.2593746483325958,
"learning_rate": 8e-05,
"loss": 1.6115,
"step": 1186
},
{
"epoch": 0.26107995161112946,
"grad_norm": 0.26519641280174255,
"learning_rate": 8e-05,
"loss": 1.5959,
"step": 1187
},
{
"epoch": 0.2612999010227648,
"grad_norm": 0.2733252942562103,
"learning_rate": 8e-05,
"loss": 1.6318,
"step": 1188
},
{
"epoch": 0.26151985043440007,
"grad_norm": 0.27299511432647705,
"learning_rate": 8e-05,
"loss": 1.7313,
"step": 1189
},
{
"epoch": 0.2617397998460354,
"grad_norm": 0.2684955894947052,
"learning_rate": 8e-05,
"loss": 1.5826,
"step": 1190
},
{
"epoch": 0.26195974925767074,
"grad_norm": 0.2747553586959839,
"learning_rate": 8e-05,
"loss": 1.7008,
"step": 1191
},
{
"epoch": 0.26217969866930607,
"grad_norm": 0.26033639907836914,
"learning_rate": 8e-05,
"loss": 1.5571,
"step": 1192
},
{
"epoch": 0.2623996480809414,
"grad_norm": 0.2640804350376129,
"learning_rate": 8e-05,
"loss": 1.5317,
"step": 1193
},
{
"epoch": 0.2626195974925767,
"grad_norm": 0.27063700556755066,
"learning_rate": 8e-05,
"loss": 1.5501,
"step": 1194
},
{
"epoch": 0.262839546904212,
"grad_norm": 0.2677111029624939,
"learning_rate": 8e-05,
"loss": 1.5894,
"step": 1195
},
{
"epoch": 0.26305949631584735,
"grad_norm": 0.28144168853759766,
"learning_rate": 8e-05,
"loss": 1.7496,
"step": 1196
},
{
"epoch": 0.2632794457274827,
"grad_norm": 0.2602388858795166,
"learning_rate": 8e-05,
"loss": 1.571,
"step": 1197
},
{
"epoch": 0.263499395139118,
"grad_norm": 0.2941505014896393,
"learning_rate": 8e-05,
"loss": 1.6692,
"step": 1198
},
{
"epoch": 0.26371934455075335,
"grad_norm": 0.264433354139328,
"learning_rate": 8e-05,
"loss": 1.6922,
"step": 1199
},
{
"epoch": 0.26393929396238863,
"grad_norm": 0.25587090849876404,
"learning_rate": 8e-05,
"loss": 1.5599,
"step": 1200
},
{
"epoch": 0.26415924337402397,
"grad_norm": 0.3012869358062744,
"learning_rate": 8e-05,
"loss": 1.9195,
"step": 1201
},
{
"epoch": 0.2643791927856593,
"grad_norm": 0.2762719392776489,
"learning_rate": 8e-05,
"loss": 1.898,
"step": 1202
},
{
"epoch": 0.26459914219729463,
"grad_norm": 0.2701188325881958,
"learning_rate": 8e-05,
"loss": 1.7312,
"step": 1203
},
{
"epoch": 0.26481909160892997,
"grad_norm": 0.29665982723236084,
"learning_rate": 8e-05,
"loss": 1.8089,
"step": 1204
},
{
"epoch": 0.26503904102056525,
"grad_norm": 0.26700517535209656,
"learning_rate": 8e-05,
"loss": 1.8401,
"step": 1205
},
{
"epoch": 0.2652589904322006,
"grad_norm": 0.2828493118286133,
"learning_rate": 8e-05,
"loss": 1.8622,
"step": 1206
},
{
"epoch": 0.2654789398438359,
"grad_norm": 0.2746271789073944,
"learning_rate": 8e-05,
"loss": 1.6521,
"step": 1207
},
{
"epoch": 0.26569888925547125,
"grad_norm": 0.2882270812988281,
"learning_rate": 8e-05,
"loss": 1.7168,
"step": 1208
},
{
"epoch": 0.2659188386671066,
"grad_norm": 0.29784512519836426,
"learning_rate": 8e-05,
"loss": 1.6968,
"step": 1209
},
{
"epoch": 0.2661387880787419,
"grad_norm": 0.2807427942752838,
"learning_rate": 8e-05,
"loss": 1.6004,
"step": 1210
},
{
"epoch": 0.2663587374903772,
"grad_norm": 0.2956424951553345,
"learning_rate": 8e-05,
"loss": 1.8325,
"step": 1211
},
{
"epoch": 0.26657868690201253,
"grad_norm": 0.2647739350795746,
"learning_rate": 8e-05,
"loss": 1.6391,
"step": 1212
},
{
"epoch": 0.26679863631364786,
"grad_norm": 0.2955171465873718,
"learning_rate": 8e-05,
"loss": 1.7893,
"step": 1213
},
{
"epoch": 0.2670185857252832,
"grad_norm": 0.27241894602775574,
"learning_rate": 8e-05,
"loss": 1.781,
"step": 1214
},
{
"epoch": 0.26723853513691853,
"grad_norm": 0.2841251492500305,
"learning_rate": 8e-05,
"loss": 1.8612,
"step": 1215
},
{
"epoch": 0.2674584845485538,
"grad_norm": 0.327891081571579,
"learning_rate": 8e-05,
"loss": 1.844,
"step": 1216
},
{
"epoch": 0.26767843396018914,
"grad_norm": 0.26434099674224854,
"learning_rate": 8e-05,
"loss": 1.6325,
"step": 1217
},
{
"epoch": 0.2678983833718245,
"grad_norm": 0.2868417799472809,
"learning_rate": 8e-05,
"loss": 1.7087,
"step": 1218
},
{
"epoch": 0.2681183327834598,
"grad_norm": 0.27408069372177124,
"learning_rate": 8e-05,
"loss": 1.6006,
"step": 1219
},
{
"epoch": 0.26833828219509515,
"grad_norm": 0.2697390019893646,
"learning_rate": 8e-05,
"loss": 1.6833,
"step": 1220
},
{
"epoch": 0.2685582316067304,
"grad_norm": 0.27598559856414795,
"learning_rate": 8e-05,
"loss": 1.7192,
"step": 1221
},
{
"epoch": 0.26877818101836576,
"grad_norm": 0.26871007680892944,
"learning_rate": 8e-05,
"loss": 1.6301,
"step": 1222
},
{
"epoch": 0.2689981304300011,
"grad_norm": 0.2739337980747223,
"learning_rate": 8e-05,
"loss": 1.6828,
"step": 1223
},
{
"epoch": 0.2692180798416364,
"grad_norm": 0.286530464887619,
"learning_rate": 8e-05,
"loss": 1.6484,
"step": 1224
},
{
"epoch": 0.26943802925327176,
"grad_norm": 0.27509886026382446,
"learning_rate": 8e-05,
"loss": 1.6647,
"step": 1225
},
{
"epoch": 0.2696579786649071,
"grad_norm": 0.2916969358921051,
"learning_rate": 8e-05,
"loss": 1.7908,
"step": 1226
},
{
"epoch": 0.2698779280765424,
"grad_norm": 0.26566174626350403,
"learning_rate": 8e-05,
"loss": 1.6075,
"step": 1227
},
{
"epoch": 0.2700978774881777,
"grad_norm": 0.27648022770881653,
"learning_rate": 8e-05,
"loss": 1.7536,
"step": 1228
},
{
"epoch": 0.27031782689981304,
"grad_norm": 0.27313023805618286,
"learning_rate": 8e-05,
"loss": 1.6978,
"step": 1229
},
{
"epoch": 0.2705377763114484,
"grad_norm": 0.2755061388015747,
"learning_rate": 8e-05,
"loss": 1.7196,
"step": 1230
},
{
"epoch": 0.2707577257230837,
"grad_norm": 0.25907769799232483,
"learning_rate": 8e-05,
"loss": 1.5518,
"step": 1231
},
{
"epoch": 0.270977675134719,
"grad_norm": 0.26485681533813477,
"learning_rate": 8e-05,
"loss": 1.5053,
"step": 1232
},
{
"epoch": 0.2711976245463543,
"grad_norm": 0.27980178594589233,
"learning_rate": 8e-05,
"loss": 1.7824,
"step": 1233
},
{
"epoch": 0.27141757395798966,
"grad_norm": 0.2750954329967499,
"learning_rate": 8e-05,
"loss": 1.6973,
"step": 1234
},
{
"epoch": 0.271637523369625,
"grad_norm": 0.27367594838142395,
"learning_rate": 8e-05,
"loss": 1.6691,
"step": 1235
},
{
"epoch": 0.2718574727812603,
"grad_norm": 0.27089521288871765,
"learning_rate": 8e-05,
"loss": 1.7532,
"step": 1236
},
{
"epoch": 0.27207742219289566,
"grad_norm": 0.30656641721725464,
"learning_rate": 8e-05,
"loss": 1.8411,
"step": 1237
},
{
"epoch": 0.27229737160453094,
"grad_norm": 0.25732672214508057,
"learning_rate": 8e-05,
"loss": 1.5599,
"step": 1238
},
{
"epoch": 0.27251732101616627,
"grad_norm": 0.2643807828426361,
"learning_rate": 8e-05,
"loss": 1.6654,
"step": 1239
},
{
"epoch": 0.2727372704278016,
"grad_norm": 0.2703326344490051,
"learning_rate": 8e-05,
"loss": 1.594,
"step": 1240
},
{
"epoch": 0.27295721983943694,
"grad_norm": 0.27907243371009827,
"learning_rate": 8e-05,
"loss": 1.7531,
"step": 1241
},
{
"epoch": 0.27317716925107227,
"grad_norm": 0.2482902854681015,
"learning_rate": 8e-05,
"loss": 1.3586,
"step": 1242
},
{
"epoch": 0.27339711866270755,
"grad_norm": 0.2879469394683838,
"learning_rate": 8e-05,
"loss": 1.76,
"step": 1243
},
{
"epoch": 0.2736170680743429,
"grad_norm": 0.26334571838378906,
"learning_rate": 8e-05,
"loss": 1.536,
"step": 1244
},
{
"epoch": 0.2738370174859782,
"grad_norm": 0.27328065037727356,
"learning_rate": 8e-05,
"loss": 1.7199,
"step": 1245
},
{
"epoch": 0.27405696689761355,
"grad_norm": 0.27392926812171936,
"learning_rate": 8e-05,
"loss": 1.7731,
"step": 1246
},
{
"epoch": 0.2742769163092489,
"grad_norm": 0.29755476117134094,
"learning_rate": 8e-05,
"loss": 1.7184,
"step": 1247
},
{
"epoch": 0.2744968657208842,
"grad_norm": 0.29554107785224915,
"learning_rate": 8e-05,
"loss": 1.7442,
"step": 1248
},
{
"epoch": 0.2747168151325195,
"grad_norm": 0.2562367618083954,
"learning_rate": 8e-05,
"loss": 1.63,
"step": 1249
},
{
"epoch": 0.27493676454415483,
"grad_norm": 0.27746453881263733,
"learning_rate": 8e-05,
"loss": 1.7396,
"step": 1250
},
{
"epoch": 0.27515671395579017,
"grad_norm": 0.2747843265533447,
"learning_rate": 8e-05,
"loss": 1.6628,
"step": 1251
},
{
"epoch": 0.2753766633674255,
"grad_norm": 0.2650463581085205,
"learning_rate": 8e-05,
"loss": 1.6409,
"step": 1252
},
{
"epoch": 0.27559661277906083,
"grad_norm": 0.30537328124046326,
"learning_rate": 8e-05,
"loss": 1.4927,
"step": 1253
},
{
"epoch": 0.2758165621906961,
"grad_norm": 0.26015424728393555,
"learning_rate": 8e-05,
"loss": 1.718,
"step": 1254
},
{
"epoch": 0.27603651160233145,
"grad_norm": 0.2512992322444916,
"learning_rate": 8e-05,
"loss": 1.4757,
"step": 1255
},
{
"epoch": 0.2762564610139668,
"grad_norm": 0.28478461503982544,
"learning_rate": 8e-05,
"loss": 1.9081,
"step": 1256
},
{
"epoch": 0.2764764104256021,
"grad_norm": 0.28490516543388367,
"learning_rate": 8e-05,
"loss": 1.8495,
"step": 1257
},
{
"epoch": 0.27669635983723745,
"grad_norm": 0.2758481204509735,
"learning_rate": 8e-05,
"loss": 1.767,
"step": 1258
},
{
"epoch": 0.2769163092488728,
"grad_norm": 0.28743213415145874,
"learning_rate": 8e-05,
"loss": 1.6548,
"step": 1259
},
{
"epoch": 0.27713625866050806,
"grad_norm": 0.2738385796546936,
"learning_rate": 8e-05,
"loss": 1.5616,
"step": 1260
},
{
"epoch": 0.2773562080721434,
"grad_norm": 0.27758583426475525,
"learning_rate": 8e-05,
"loss": 1.7793,
"step": 1261
},
{
"epoch": 0.27757615748377873,
"grad_norm": 0.2830480635166168,
"learning_rate": 8e-05,
"loss": 1.8048,
"step": 1262
},
{
"epoch": 0.27779610689541406,
"grad_norm": 0.296036034822464,
"learning_rate": 8e-05,
"loss": 1.7844,
"step": 1263
},
{
"epoch": 0.2780160563070494,
"grad_norm": 0.28651297092437744,
"learning_rate": 8e-05,
"loss": 1.7239,
"step": 1264
},
{
"epoch": 0.2782360057186847,
"grad_norm": 0.2826116979122162,
"learning_rate": 8e-05,
"loss": 1.8415,
"step": 1265
},
{
"epoch": 0.27845595513032,
"grad_norm": 0.27445724606513977,
"learning_rate": 8e-05,
"loss": 1.6738,
"step": 1266
},
{
"epoch": 0.27867590454195534,
"grad_norm": 0.28153640031814575,
"learning_rate": 8e-05,
"loss": 1.6519,
"step": 1267
},
{
"epoch": 0.2788958539535907,
"grad_norm": 0.27389946579933167,
"learning_rate": 8e-05,
"loss": 1.681,
"step": 1268
},
{
"epoch": 0.279115803365226,
"grad_norm": 0.2639203667640686,
"learning_rate": 8e-05,
"loss": 1.6398,
"step": 1269
},
{
"epoch": 0.27933575277686135,
"grad_norm": 0.2787509560585022,
"learning_rate": 8e-05,
"loss": 1.7199,
"step": 1270
},
{
"epoch": 0.2795557021884966,
"grad_norm": 0.28468430042266846,
"learning_rate": 8e-05,
"loss": 1.8668,
"step": 1271
},
{
"epoch": 0.27977565160013196,
"grad_norm": 0.2907005250453949,
"learning_rate": 8e-05,
"loss": 1.9328,
"step": 1272
},
{
"epoch": 0.2799956010117673,
"grad_norm": 0.2607463300228119,
"learning_rate": 8e-05,
"loss": 1.5958,
"step": 1273
},
{
"epoch": 0.2802155504234026,
"grad_norm": 0.2695181965827942,
"learning_rate": 8e-05,
"loss": 1.6708,
"step": 1274
},
{
"epoch": 0.28043549983503796,
"grad_norm": 0.28671538829803467,
"learning_rate": 8e-05,
"loss": 1.7736,
"step": 1275
},
{
"epoch": 0.28065544924667324,
"grad_norm": 0.3246489465236664,
"learning_rate": 8e-05,
"loss": 1.8145,
"step": 1276
},
{
"epoch": 0.2808753986583086,
"grad_norm": 0.2879314720630646,
"learning_rate": 8e-05,
"loss": 1.782,
"step": 1277
},
{
"epoch": 0.2810953480699439,
"grad_norm": 0.27141574025154114,
"learning_rate": 8e-05,
"loss": 1.8069,
"step": 1278
},
{
"epoch": 0.28131529748157924,
"grad_norm": 0.2893892228603363,
"learning_rate": 8e-05,
"loss": 1.7893,
"step": 1279
},
{
"epoch": 0.2815352468932146,
"grad_norm": 0.2985538840293884,
"learning_rate": 8e-05,
"loss": 1.7804,
"step": 1280
},
{
"epoch": 0.2817551963048499,
"grad_norm": 0.2664276957511902,
"learning_rate": 8e-05,
"loss": 1.6785,
"step": 1281
},
{
"epoch": 0.2819751457164852,
"grad_norm": 0.3002198040485382,
"learning_rate": 8e-05,
"loss": 1.6109,
"step": 1282
},
{
"epoch": 0.2821950951281205,
"grad_norm": 0.27687907218933105,
"learning_rate": 8e-05,
"loss": 1.6322,
"step": 1283
},
{
"epoch": 0.28241504453975586,
"grad_norm": 0.28822144865989685,
"learning_rate": 8e-05,
"loss": 1.6785,
"step": 1284
},
{
"epoch": 0.2826349939513912,
"grad_norm": 0.2801685333251953,
"learning_rate": 8e-05,
"loss": 1.69,
"step": 1285
},
{
"epoch": 0.2828549433630265,
"grad_norm": 0.27876734733581543,
"learning_rate": 8e-05,
"loss": 1.6442,
"step": 1286
},
{
"epoch": 0.2830748927746618,
"grad_norm": 0.2990095317363739,
"learning_rate": 8e-05,
"loss": 1.7439,
"step": 1287
},
{
"epoch": 0.28329484218629714,
"grad_norm": 0.2710682451725006,
"learning_rate": 8e-05,
"loss": 1.6908,
"step": 1288
},
{
"epoch": 0.28351479159793247,
"grad_norm": 0.2922731935977936,
"learning_rate": 8e-05,
"loss": 1.8361,
"step": 1289
},
{
"epoch": 0.2837347410095678,
"grad_norm": 0.2638223171234131,
"learning_rate": 8e-05,
"loss": 1.6233,
"step": 1290
},
{
"epoch": 0.28395469042120314,
"grad_norm": 0.27564552426338196,
"learning_rate": 8e-05,
"loss": 1.7624,
"step": 1291
},
{
"epoch": 0.28417463983283847,
"grad_norm": 0.28238940238952637,
"learning_rate": 8e-05,
"loss": 1.8649,
"step": 1292
},
{
"epoch": 0.28439458924447375,
"grad_norm": 0.27798035740852356,
"learning_rate": 8e-05,
"loss": 1.7877,
"step": 1293
},
{
"epoch": 0.2846145386561091,
"grad_norm": 0.29618534445762634,
"learning_rate": 8e-05,
"loss": 1.816,
"step": 1294
},
{
"epoch": 0.2848344880677444,
"grad_norm": 0.27669045329093933,
"learning_rate": 8e-05,
"loss": 1.7014,
"step": 1295
},
{
"epoch": 0.28505443747937975,
"grad_norm": 0.27973508834838867,
"learning_rate": 8e-05,
"loss": 1.7491,
"step": 1296
},
{
"epoch": 0.2852743868910151,
"grad_norm": 0.28833356499671936,
"learning_rate": 8e-05,
"loss": 1.6948,
"step": 1297
},
{
"epoch": 0.28549433630265036,
"grad_norm": 0.2751030921936035,
"learning_rate": 8e-05,
"loss": 1.6846,
"step": 1298
},
{
"epoch": 0.2857142857142857,
"grad_norm": 0.2766781449317932,
"learning_rate": 8e-05,
"loss": 1.5442,
"step": 1299
},
{
"epoch": 0.28593423512592103,
"grad_norm": 0.29664894938468933,
"learning_rate": 8e-05,
"loss": 1.6884,
"step": 1300
},
{
"epoch": 0.28615418453755637,
"grad_norm": 0.2771795392036438,
"learning_rate": 8e-05,
"loss": 1.6479,
"step": 1301
},
{
"epoch": 0.2863741339491917,
"grad_norm": 0.2623322904109955,
"learning_rate": 8e-05,
"loss": 1.5803,
"step": 1302
},
{
"epoch": 0.28659408336082703,
"grad_norm": 0.2821153998374939,
"learning_rate": 8e-05,
"loss": 1.7758,
"step": 1303
},
{
"epoch": 0.2868140327724623,
"grad_norm": 0.29058384895324707,
"learning_rate": 8e-05,
"loss": 1.7244,
"step": 1304
},
{
"epoch": 0.28703398218409765,
"grad_norm": 0.2811940312385559,
"learning_rate": 8e-05,
"loss": 1.6708,
"step": 1305
},
{
"epoch": 0.287253931595733,
"grad_norm": 0.2773367762565613,
"learning_rate": 8e-05,
"loss": 1.7857,
"step": 1306
},
{
"epoch": 0.2874738810073683,
"grad_norm": 0.2689999043941498,
"learning_rate": 8e-05,
"loss": 1.7432,
"step": 1307
},
{
"epoch": 0.28769383041900365,
"grad_norm": 0.26896870136260986,
"learning_rate": 8e-05,
"loss": 1.6389,
"step": 1308
},
{
"epoch": 0.2879137798306389,
"grad_norm": 0.2981964349746704,
"learning_rate": 8e-05,
"loss": 1.8771,
"step": 1309
},
{
"epoch": 0.28813372924227426,
"grad_norm": 0.2872856855392456,
"learning_rate": 8e-05,
"loss": 1.785,
"step": 1310
},
{
"epoch": 0.2883536786539096,
"grad_norm": 0.3186649680137634,
"learning_rate": 8e-05,
"loss": 1.9051,
"step": 1311
},
{
"epoch": 0.28857362806554493,
"grad_norm": 0.2802119255065918,
"learning_rate": 8e-05,
"loss": 1.6532,
"step": 1312
},
{
"epoch": 0.28879357747718026,
"grad_norm": 0.2864134907722473,
"learning_rate": 8e-05,
"loss": 1.7373,
"step": 1313
},
{
"epoch": 0.2890135268888156,
"grad_norm": 0.2739737331867218,
"learning_rate": 8e-05,
"loss": 1.5365,
"step": 1314
},
{
"epoch": 0.2892334763004509,
"grad_norm": 0.2707555294036865,
"learning_rate": 8e-05,
"loss": 1.6516,
"step": 1315
},
{
"epoch": 0.2894534257120862,
"grad_norm": 0.2895212173461914,
"learning_rate": 8e-05,
"loss": 1.5634,
"step": 1316
},
{
"epoch": 0.28967337512372154,
"grad_norm": 0.26424047350883484,
"learning_rate": 8e-05,
"loss": 1.6543,
"step": 1317
},
{
"epoch": 0.2898933245353569,
"grad_norm": 0.26237159967422485,
"learning_rate": 8e-05,
"loss": 1.6814,
"step": 1318
},
{
"epoch": 0.2901132739469922,
"grad_norm": 0.27964159846305847,
"learning_rate": 8e-05,
"loss": 1.7505,
"step": 1319
},
{
"epoch": 0.2903332233586275,
"grad_norm": 0.27128270268440247,
"learning_rate": 8e-05,
"loss": 1.7229,
"step": 1320
},
{
"epoch": 0.2905531727702628,
"grad_norm": 0.3012688159942627,
"learning_rate": 8e-05,
"loss": 1.6851,
"step": 1321
},
{
"epoch": 0.29077312218189816,
"grad_norm": 0.2725695073604584,
"learning_rate": 8e-05,
"loss": 1.6552,
"step": 1322
},
{
"epoch": 0.2909930715935335,
"grad_norm": 0.2855455279350281,
"learning_rate": 8e-05,
"loss": 1.7779,
"step": 1323
},
{
"epoch": 0.2912130210051688,
"grad_norm": 0.2906174659729004,
"learning_rate": 8e-05,
"loss": 1.8209,
"step": 1324
},
{
"epoch": 0.29143297041680416,
"grad_norm": 0.26015472412109375,
"learning_rate": 8e-05,
"loss": 1.5403,
"step": 1325
},
{
"epoch": 0.29165291982843944,
"grad_norm": 0.29065820574760437,
"learning_rate": 8e-05,
"loss": 1.8499,
"step": 1326
},
{
"epoch": 0.2918728692400748,
"grad_norm": 0.28715917468070984,
"learning_rate": 8e-05,
"loss": 1.854,
"step": 1327
},
{
"epoch": 0.2920928186517101,
"grad_norm": 0.26932859420776367,
"learning_rate": 8e-05,
"loss": 1.6254,
"step": 1328
},
{
"epoch": 0.29231276806334544,
"grad_norm": 0.2757404148578644,
"learning_rate": 8e-05,
"loss": 1.6077,
"step": 1329
},
{
"epoch": 0.2925327174749808,
"grad_norm": 0.26532551646232605,
"learning_rate": 8e-05,
"loss": 1.6551,
"step": 1330
},
{
"epoch": 0.29275266688661605,
"grad_norm": 0.2754289209842682,
"learning_rate": 8e-05,
"loss": 1.7276,
"step": 1331
},
{
"epoch": 0.2929726162982514,
"grad_norm": 0.290568470954895,
"learning_rate": 8e-05,
"loss": 1.7622,
"step": 1332
},
{
"epoch": 0.2931925657098867,
"grad_norm": 0.3045903742313385,
"learning_rate": 8e-05,
"loss": 1.7937,
"step": 1333
},
{
"epoch": 0.29341251512152206,
"grad_norm": 0.2594483196735382,
"learning_rate": 8e-05,
"loss": 1.6163,
"step": 1334
},
{
"epoch": 0.2936324645331574,
"grad_norm": 0.3054102957248688,
"learning_rate": 8e-05,
"loss": 1.7767,
"step": 1335
},
{
"epoch": 0.2938524139447927,
"grad_norm": 0.27347666025161743,
"learning_rate": 8e-05,
"loss": 1.682,
"step": 1336
},
{
"epoch": 0.294072363356428,
"grad_norm": 0.2639494836330414,
"learning_rate": 8e-05,
"loss": 1.4616,
"step": 1337
},
{
"epoch": 0.29429231276806334,
"grad_norm": 0.2842942178249359,
"learning_rate": 8e-05,
"loss": 1.7625,
"step": 1338
},
{
"epoch": 0.29451226217969867,
"grad_norm": 0.2895960509777069,
"learning_rate": 8e-05,
"loss": 1.7127,
"step": 1339
},
{
"epoch": 0.294732211591334,
"grad_norm": 0.2836678624153137,
"learning_rate": 8e-05,
"loss": 1.7765,
"step": 1340
},
{
"epoch": 0.29495216100296934,
"grad_norm": 0.26315444707870483,
"learning_rate": 8e-05,
"loss": 1.6592,
"step": 1341
},
{
"epoch": 0.2951721104146046,
"grad_norm": 0.2601313591003418,
"learning_rate": 8e-05,
"loss": 1.5803,
"step": 1342
},
{
"epoch": 0.29539205982623995,
"grad_norm": 0.28084784746170044,
"learning_rate": 8e-05,
"loss": 1.6172,
"step": 1343
},
{
"epoch": 0.2956120092378753,
"grad_norm": 0.27707698941230774,
"learning_rate": 8e-05,
"loss": 1.6774,
"step": 1344
},
{
"epoch": 0.2958319586495106,
"grad_norm": 0.28750407695770264,
"learning_rate": 8e-05,
"loss": 1.7775,
"step": 1345
},
{
"epoch": 0.29605190806114595,
"grad_norm": 0.27315664291381836,
"learning_rate": 8e-05,
"loss": 1.6578,
"step": 1346
},
{
"epoch": 0.2962718574727813,
"grad_norm": 0.26131486892700195,
"learning_rate": 8e-05,
"loss": 1.6429,
"step": 1347
},
{
"epoch": 0.29649180688441656,
"grad_norm": 0.27198976278305054,
"learning_rate": 8e-05,
"loss": 1.6594,
"step": 1348
},
{
"epoch": 0.2967117562960519,
"grad_norm": 0.2785218060016632,
"learning_rate": 8e-05,
"loss": 1.6959,
"step": 1349
},
{
"epoch": 0.29693170570768723,
"grad_norm": 0.26987215876579285,
"learning_rate": 8e-05,
"loss": 1.6561,
"step": 1350
},
{
"epoch": 0.29715165511932257,
"grad_norm": 0.2634013295173645,
"learning_rate": 8e-05,
"loss": 1.6817,
"step": 1351
},
{
"epoch": 0.2973716045309579,
"grad_norm": 0.2584557831287384,
"learning_rate": 8e-05,
"loss": 1.5104,
"step": 1352
},
{
"epoch": 0.2975915539425932,
"grad_norm": 0.28787991404533386,
"learning_rate": 8e-05,
"loss": 1.8217,
"step": 1353
},
{
"epoch": 0.2978115033542285,
"grad_norm": 0.5047094225883484,
"learning_rate": 8e-05,
"loss": 1.7733,
"step": 1354
},
{
"epoch": 0.29803145276586385,
"grad_norm": 0.26776471734046936,
"learning_rate": 8e-05,
"loss": 1.6961,
"step": 1355
},
{
"epoch": 0.2982514021774992,
"grad_norm": 0.30351778864860535,
"learning_rate": 8e-05,
"loss": 1.7104,
"step": 1356
},
{
"epoch": 0.2984713515891345,
"grad_norm": 0.27889010310173035,
"learning_rate": 8e-05,
"loss": 1.7276,
"step": 1357
},
{
"epoch": 0.29869130100076985,
"grad_norm": 0.2656184136867523,
"learning_rate": 8e-05,
"loss": 1.7438,
"step": 1358
},
{
"epoch": 0.2989112504124051,
"grad_norm": 0.27338340878486633,
"learning_rate": 8e-05,
"loss": 1.7526,
"step": 1359
},
{
"epoch": 0.29913119982404046,
"grad_norm": 0.3266398310661316,
"learning_rate": 8e-05,
"loss": 1.8091,
"step": 1360
},
{
"epoch": 0.2993511492356758,
"grad_norm": 0.309469997882843,
"learning_rate": 8e-05,
"loss": 2.0485,
"step": 1361
},
{
"epoch": 0.29957109864731113,
"grad_norm": 0.2768929600715637,
"learning_rate": 8e-05,
"loss": 1.7977,
"step": 1362
},
{
"epoch": 0.29979104805894646,
"grad_norm": 0.27685433626174927,
"learning_rate": 8e-05,
"loss": 1.5712,
"step": 1363
},
{
"epoch": 0.30001099747058174,
"grad_norm": 0.26404622197151184,
"learning_rate": 8e-05,
"loss": 1.6639,
"step": 1364
},
{
"epoch": 0.3002309468822171,
"grad_norm": 0.2719237208366394,
"learning_rate": 8e-05,
"loss": 1.788,
"step": 1365
},
{
"epoch": 0.3004508962938524,
"grad_norm": 0.27983394265174866,
"learning_rate": 8e-05,
"loss": 1.7361,
"step": 1366
},
{
"epoch": 0.30067084570548774,
"grad_norm": 0.2673875689506531,
"learning_rate": 8e-05,
"loss": 1.6288,
"step": 1367
},
{
"epoch": 0.3008907951171231,
"grad_norm": 0.2850426435470581,
"learning_rate": 8e-05,
"loss": 1.8328,
"step": 1368
},
{
"epoch": 0.3011107445287584,
"grad_norm": 0.2577967345714569,
"learning_rate": 8e-05,
"loss": 1.6267,
"step": 1369
},
{
"epoch": 0.3013306939403937,
"grad_norm": 0.276094913482666,
"learning_rate": 8e-05,
"loss": 1.7673,
"step": 1370
},
{
"epoch": 0.301550643352029,
"grad_norm": 0.2834344208240509,
"learning_rate": 8e-05,
"loss": 1.6692,
"step": 1371
},
{
"epoch": 0.30177059276366436,
"grad_norm": 0.2617560029029846,
"learning_rate": 8e-05,
"loss": 1.7734,
"step": 1372
},
{
"epoch": 0.3019905421752997,
"grad_norm": 0.27122870087623596,
"learning_rate": 8e-05,
"loss": 1.6988,
"step": 1373
},
{
"epoch": 0.302210491586935,
"grad_norm": 0.26526594161987305,
"learning_rate": 8e-05,
"loss": 1.7459,
"step": 1374
},
{
"epoch": 0.3024304409985703,
"grad_norm": 0.2893051207065582,
"learning_rate": 8e-05,
"loss": 1.8214,
"step": 1375
},
{
"epoch": 0.30265039041020564,
"grad_norm": 0.2735356092453003,
"learning_rate": 8e-05,
"loss": 1.8437,
"step": 1376
},
{
"epoch": 0.302870339821841,
"grad_norm": 0.2743459939956665,
"learning_rate": 8e-05,
"loss": 1.8365,
"step": 1377
},
{
"epoch": 0.3030902892334763,
"grad_norm": 0.28047019243240356,
"learning_rate": 8e-05,
"loss": 1.6143,
"step": 1378
},
{
"epoch": 0.30331023864511164,
"grad_norm": 0.268197238445282,
"learning_rate": 8e-05,
"loss": 1.591,
"step": 1379
},
{
"epoch": 0.303530188056747,
"grad_norm": 0.2890843451023102,
"learning_rate": 8e-05,
"loss": 1.7757,
"step": 1380
},
{
"epoch": 0.30375013746838225,
"grad_norm": 0.2765072286128998,
"learning_rate": 8e-05,
"loss": 1.6363,
"step": 1381
},
{
"epoch": 0.3039700868800176,
"grad_norm": 0.290147602558136,
"learning_rate": 8e-05,
"loss": 1.7615,
"step": 1382
},
{
"epoch": 0.3041900362916529,
"grad_norm": 0.2721220850944519,
"learning_rate": 8e-05,
"loss": 1.7101,
"step": 1383
},
{
"epoch": 0.30440998570328826,
"grad_norm": 0.27125662565231323,
"learning_rate": 8e-05,
"loss": 1.7291,
"step": 1384
},
{
"epoch": 0.3046299351149236,
"grad_norm": 0.2594304084777832,
"learning_rate": 8e-05,
"loss": 1.6754,
"step": 1385
},
{
"epoch": 0.30484988452655887,
"grad_norm": 0.28582707047462463,
"learning_rate": 8e-05,
"loss": 1.6808,
"step": 1386
},
{
"epoch": 0.3050698339381942,
"grad_norm": 0.2853895425796509,
"learning_rate": 8e-05,
"loss": 1.779,
"step": 1387
},
{
"epoch": 0.30528978334982954,
"grad_norm": 0.2580530345439911,
"learning_rate": 8e-05,
"loss": 1.6316,
"step": 1388
},
{
"epoch": 0.30550973276146487,
"grad_norm": 0.2793220281600952,
"learning_rate": 8e-05,
"loss": 1.7326,
"step": 1389
},
{
"epoch": 0.3057296821731002,
"grad_norm": 0.2672085165977478,
"learning_rate": 8e-05,
"loss": 1.6544,
"step": 1390
},
{
"epoch": 0.30594963158473554,
"grad_norm": 0.27718111872673035,
"learning_rate": 8e-05,
"loss": 1.6307,
"step": 1391
},
{
"epoch": 0.3061695809963708,
"grad_norm": 0.29295554757118225,
"learning_rate": 8e-05,
"loss": 1.502,
"step": 1392
},
{
"epoch": 0.30638953040800615,
"grad_norm": 0.2840512990951538,
"learning_rate": 8e-05,
"loss": 1.6326,
"step": 1393
},
{
"epoch": 0.3066094798196415,
"grad_norm": 0.2897029519081116,
"learning_rate": 8e-05,
"loss": 1.7543,
"step": 1394
},
{
"epoch": 0.3068294292312768,
"grad_norm": 0.28060710430145264,
"learning_rate": 8e-05,
"loss": 1.7227,
"step": 1395
},
{
"epoch": 0.30704937864291215,
"grad_norm": 0.27874305844306946,
"learning_rate": 8e-05,
"loss": 1.6639,
"step": 1396
},
{
"epoch": 0.30726932805454743,
"grad_norm": 0.2679193615913391,
"learning_rate": 8e-05,
"loss": 1.7226,
"step": 1397
},
{
"epoch": 0.30748927746618276,
"grad_norm": 0.2769779562950134,
"learning_rate": 8e-05,
"loss": 1.6384,
"step": 1398
},
{
"epoch": 0.3077092268778181,
"grad_norm": 0.26620879769325256,
"learning_rate": 8e-05,
"loss": 1.7134,
"step": 1399
},
{
"epoch": 0.30792917628945343,
"grad_norm": 0.277423620223999,
"learning_rate": 8e-05,
"loss": 1.7376,
"step": 1400
},
{
"epoch": 0.30814912570108877,
"grad_norm": 0.2629416882991791,
"learning_rate": 8e-05,
"loss": 1.598,
"step": 1401
},
{
"epoch": 0.3083690751127241,
"grad_norm": 0.2844812572002411,
"learning_rate": 8e-05,
"loss": 1.7067,
"step": 1402
},
{
"epoch": 0.3085890245243594,
"grad_norm": 0.2731526494026184,
"learning_rate": 8e-05,
"loss": 1.8571,
"step": 1403
},
{
"epoch": 0.3088089739359947,
"grad_norm": 0.287438303232193,
"learning_rate": 8e-05,
"loss": 1.7612,
"step": 1404
},
{
"epoch": 0.30902892334763005,
"grad_norm": 0.266718327999115,
"learning_rate": 8e-05,
"loss": 1.6106,
"step": 1405
},
{
"epoch": 0.3092488727592654,
"grad_norm": 0.28080686926841736,
"learning_rate": 8e-05,
"loss": 1.8281,
"step": 1406
},
{
"epoch": 0.3094688221709007,
"grad_norm": 0.27558308839797974,
"learning_rate": 8e-05,
"loss": 1.8677,
"step": 1407
},
{
"epoch": 0.309688771582536,
"grad_norm": 0.2798183262348175,
"learning_rate": 8e-05,
"loss": 1.7867,
"step": 1408
},
{
"epoch": 0.3099087209941713,
"grad_norm": 0.25823187828063965,
"learning_rate": 8e-05,
"loss": 1.6743,
"step": 1409
},
{
"epoch": 0.31012867040580666,
"grad_norm": 0.27356335520744324,
"learning_rate": 8e-05,
"loss": 1.7039,
"step": 1410
},
{
"epoch": 0.310348619817442,
"grad_norm": 0.2842661440372467,
"learning_rate": 8e-05,
"loss": 1.7046,
"step": 1411
},
{
"epoch": 0.31056856922907733,
"grad_norm": 0.2561197876930237,
"learning_rate": 8e-05,
"loss": 1.4887,
"step": 1412
},
{
"epoch": 0.31078851864071266,
"grad_norm": 0.2851184904575348,
"learning_rate": 8e-05,
"loss": 1.7074,
"step": 1413
},
{
"epoch": 0.31100846805234794,
"grad_norm": 0.2655506432056427,
"learning_rate": 8e-05,
"loss": 1.6049,
"step": 1414
},
{
"epoch": 0.3112284174639833,
"grad_norm": 0.26412099599838257,
"learning_rate": 8e-05,
"loss": 1.6052,
"step": 1415
},
{
"epoch": 0.3114483668756186,
"grad_norm": 0.3026227056980133,
"learning_rate": 8e-05,
"loss": 1.7085,
"step": 1416
},
{
"epoch": 0.31166831628725394,
"grad_norm": 0.28821703791618347,
"learning_rate": 8e-05,
"loss": 1.7573,
"step": 1417
},
{
"epoch": 0.3118882656988893,
"grad_norm": 0.26806455850601196,
"learning_rate": 8e-05,
"loss": 1.7136,
"step": 1418
},
{
"epoch": 0.31210821511052456,
"grad_norm": 0.28336799144744873,
"learning_rate": 8e-05,
"loss": 1.8445,
"step": 1419
},
{
"epoch": 0.3123281645221599,
"grad_norm": 0.2772139012813568,
"learning_rate": 8e-05,
"loss": 1.692,
"step": 1420
},
{
"epoch": 0.3125481139337952,
"grad_norm": 0.2815256714820862,
"learning_rate": 8e-05,
"loss": 1.77,
"step": 1421
},
{
"epoch": 0.31276806334543056,
"grad_norm": 0.4029920697212219,
"learning_rate": 8e-05,
"loss": 1.8103,
"step": 1422
},
{
"epoch": 0.3129880127570659,
"grad_norm": 0.2677610218524933,
"learning_rate": 8e-05,
"loss": 1.5898,
"step": 1423
},
{
"epoch": 0.3132079621687012,
"grad_norm": 0.2605397701263428,
"learning_rate": 8e-05,
"loss": 1.5735,
"step": 1424
},
{
"epoch": 0.3134279115803365,
"grad_norm": 0.2831586003303528,
"learning_rate": 8e-05,
"loss": 1.6641,
"step": 1425
},
{
"epoch": 0.31364786099197184,
"grad_norm": 0.2746485471725464,
"learning_rate": 8e-05,
"loss": 1.618,
"step": 1426
},
{
"epoch": 0.3138678104036072,
"grad_norm": 0.283342182636261,
"learning_rate": 8e-05,
"loss": 1.6963,
"step": 1427
},
{
"epoch": 0.3140877598152425,
"grad_norm": 0.27635300159454346,
"learning_rate": 8e-05,
"loss": 1.6911,
"step": 1428
},
{
"epoch": 0.31430770922687784,
"grad_norm": 0.2719132900238037,
"learning_rate": 8e-05,
"loss": 1.7063,
"step": 1429
},
{
"epoch": 0.3145276586385131,
"grad_norm": 0.27162256836891174,
"learning_rate": 8e-05,
"loss": 1.6397,
"step": 1430
},
{
"epoch": 0.31474760805014845,
"grad_norm": 0.2934938073158264,
"learning_rate": 8e-05,
"loss": 1.7555,
"step": 1431
},
{
"epoch": 0.3149675574617838,
"grad_norm": 0.3060123920440674,
"learning_rate": 8e-05,
"loss": 1.642,
"step": 1432
},
{
"epoch": 0.3151875068734191,
"grad_norm": 0.280846506357193,
"learning_rate": 8e-05,
"loss": 1.6805,
"step": 1433
},
{
"epoch": 0.31540745628505445,
"grad_norm": 0.2768997550010681,
"learning_rate": 8e-05,
"loss": 1.7359,
"step": 1434
},
{
"epoch": 0.3156274056966898,
"grad_norm": 0.29172810912132263,
"learning_rate": 8e-05,
"loss": 1.821,
"step": 1435
},
{
"epoch": 0.31584735510832507,
"grad_norm": 0.30742648243904114,
"learning_rate": 8e-05,
"loss": 1.8198,
"step": 1436
},
{
"epoch": 0.3160673045199604,
"grad_norm": 0.2889997065067291,
"learning_rate": 8e-05,
"loss": 1.6733,
"step": 1437
},
{
"epoch": 0.31628725393159574,
"grad_norm": 0.2859675884246826,
"learning_rate": 8e-05,
"loss": 1.7655,
"step": 1438
},
{
"epoch": 0.31650720334323107,
"grad_norm": 0.2926831543445587,
"learning_rate": 8e-05,
"loss": 1.7871,
"step": 1439
},
{
"epoch": 0.3167271527548664,
"grad_norm": 0.28924524784088135,
"learning_rate": 8e-05,
"loss": 1.665,
"step": 1440
},
{
"epoch": 0.3169471021665017,
"grad_norm": 0.2940097749233246,
"learning_rate": 8e-05,
"loss": 1.8364,
"step": 1441
},
{
"epoch": 0.317167051578137,
"grad_norm": 0.2923974096775055,
"learning_rate": 8e-05,
"loss": 1.8071,
"step": 1442
},
{
"epoch": 0.31738700098977235,
"grad_norm": 0.28991878032684326,
"learning_rate": 8e-05,
"loss": 1.7445,
"step": 1443
},
{
"epoch": 0.3176069504014077,
"grad_norm": 0.283600777387619,
"learning_rate": 8e-05,
"loss": 1.8043,
"step": 1444
},
{
"epoch": 0.317826899813043,
"grad_norm": 0.3082323372364044,
"learning_rate": 8e-05,
"loss": 1.7858,
"step": 1445
},
{
"epoch": 0.3180468492246783,
"grad_norm": 0.28433462977409363,
"learning_rate": 8e-05,
"loss": 1.6911,
"step": 1446
},
{
"epoch": 0.31826679863631363,
"grad_norm": 0.27776578068733215,
"learning_rate": 8e-05,
"loss": 1.7212,
"step": 1447
},
{
"epoch": 0.31848674804794896,
"grad_norm": 0.29395151138305664,
"learning_rate": 8e-05,
"loss": 1.7221,
"step": 1448
},
{
"epoch": 0.3187066974595843,
"grad_norm": 0.27507245540618896,
"learning_rate": 8e-05,
"loss": 1.7358,
"step": 1449
},
{
"epoch": 0.31892664687121963,
"grad_norm": 0.25614190101623535,
"learning_rate": 8e-05,
"loss": 1.5138,
"step": 1450
},
{
"epoch": 0.31914659628285497,
"grad_norm": 0.2908024489879608,
"learning_rate": 8e-05,
"loss": 1.756,
"step": 1451
},
{
"epoch": 0.31936654569449024,
"grad_norm": 0.2729463577270508,
"learning_rate": 8e-05,
"loss": 1.5542,
"step": 1452
},
{
"epoch": 0.3195864951061256,
"grad_norm": 0.27094194293022156,
"learning_rate": 8e-05,
"loss": 1.5917,
"step": 1453
},
{
"epoch": 0.3198064445177609,
"grad_norm": 0.28125494718551636,
"learning_rate": 8e-05,
"loss": 1.6584,
"step": 1454
},
{
"epoch": 0.32002639392939625,
"grad_norm": 0.29033198952674866,
"learning_rate": 8e-05,
"loss": 1.7332,
"step": 1455
},
{
"epoch": 0.3202463433410316,
"grad_norm": 0.26570284366607666,
"learning_rate": 8e-05,
"loss": 1.6159,
"step": 1456
},
{
"epoch": 0.32046629275266686,
"grad_norm": 0.307412713766098,
"learning_rate": 8e-05,
"loss": 1.7351,
"step": 1457
},
{
"epoch": 0.3206862421643022,
"grad_norm": 0.29387474060058594,
"learning_rate": 8e-05,
"loss": 1.9386,
"step": 1458
},
{
"epoch": 0.3209061915759375,
"grad_norm": 0.26545315980911255,
"learning_rate": 8e-05,
"loss": 1.6343,
"step": 1459
},
{
"epoch": 0.32112614098757286,
"grad_norm": 0.279238224029541,
"learning_rate": 8e-05,
"loss": 1.6245,
"step": 1460
},
{
"epoch": 0.3213460903992082,
"grad_norm": 0.2766862213611603,
"learning_rate": 8e-05,
"loss": 1.7135,
"step": 1461
},
{
"epoch": 0.32156603981084353,
"grad_norm": 0.2705351412296295,
"learning_rate": 8e-05,
"loss": 1.6526,
"step": 1462
},
{
"epoch": 0.3217859892224788,
"grad_norm": 0.27870967984199524,
"learning_rate": 8e-05,
"loss": 1.6512,
"step": 1463
},
{
"epoch": 0.32200593863411414,
"grad_norm": 0.284407377243042,
"learning_rate": 8e-05,
"loss": 1.755,
"step": 1464
},
{
"epoch": 0.3222258880457495,
"grad_norm": 0.2897641062736511,
"learning_rate": 8e-05,
"loss": 1.8383,
"step": 1465
},
{
"epoch": 0.3224458374573848,
"grad_norm": 0.2667568624019623,
"learning_rate": 8e-05,
"loss": 1.6989,
"step": 1466
},
{
"epoch": 0.32266578686902014,
"grad_norm": 0.26580294966697693,
"learning_rate": 8e-05,
"loss": 1.5895,
"step": 1467
},
{
"epoch": 0.3228857362806554,
"grad_norm": 0.26188549399375916,
"learning_rate": 8e-05,
"loss": 1.5799,
"step": 1468
},
{
"epoch": 0.32310568569229076,
"grad_norm": 0.27703747153282166,
"learning_rate": 8e-05,
"loss": 1.8306,
"step": 1469
},
{
"epoch": 0.3233256351039261,
"grad_norm": 0.27643802762031555,
"learning_rate": 8e-05,
"loss": 1.6864,
"step": 1470
},
{
"epoch": 0.3235455845155614,
"grad_norm": 0.27216553688049316,
"learning_rate": 8e-05,
"loss": 1.6006,
"step": 1471
},
{
"epoch": 0.32376553392719676,
"grad_norm": 0.2984940707683563,
"learning_rate": 8e-05,
"loss": 1.7548,
"step": 1472
},
{
"epoch": 0.3239854833388321,
"grad_norm": 0.30579298734664917,
"learning_rate": 8e-05,
"loss": 1.8307,
"step": 1473
},
{
"epoch": 0.32420543275046737,
"grad_norm": 0.27524709701538086,
"learning_rate": 8e-05,
"loss": 1.6134,
"step": 1474
},
{
"epoch": 0.3244253821621027,
"grad_norm": 0.2788650393486023,
"learning_rate": 8e-05,
"loss": 1.8194,
"step": 1475
},
{
"epoch": 0.32464533157373804,
"grad_norm": 0.28263744711875916,
"learning_rate": 8e-05,
"loss": 1.7633,
"step": 1476
},
{
"epoch": 0.3248652809853734,
"grad_norm": 0.30234408378601074,
"learning_rate": 8e-05,
"loss": 1.6057,
"step": 1477
},
{
"epoch": 0.3250852303970087,
"grad_norm": 0.2820134162902832,
"learning_rate": 8e-05,
"loss": 1.6913,
"step": 1478
},
{
"epoch": 0.325305179808644,
"grad_norm": 0.28929245471954346,
"learning_rate": 8e-05,
"loss": 1.9538,
"step": 1479
},
{
"epoch": 0.3255251292202793,
"grad_norm": 0.26399463415145874,
"learning_rate": 8e-05,
"loss": 1.7309,
"step": 1480
},
{
"epoch": 0.32574507863191465,
"grad_norm": 0.2722630202770233,
"learning_rate": 8e-05,
"loss": 1.5595,
"step": 1481
},
{
"epoch": 0.32596502804355,
"grad_norm": 0.2759261727333069,
"learning_rate": 8e-05,
"loss": 1.6272,
"step": 1482
},
{
"epoch": 0.3261849774551853,
"grad_norm": 0.28047022223472595,
"learning_rate": 8e-05,
"loss": 1.6933,
"step": 1483
},
{
"epoch": 0.32640492686682065,
"grad_norm": 0.2835995554924011,
"learning_rate": 8e-05,
"loss": 1.7165,
"step": 1484
},
{
"epoch": 0.32662487627845593,
"grad_norm": 0.28965097665786743,
"learning_rate": 8e-05,
"loss": 1.6399,
"step": 1485
},
{
"epoch": 0.32684482569009127,
"grad_norm": 0.2729817032814026,
"learning_rate": 8e-05,
"loss": 1.7397,
"step": 1486
},
{
"epoch": 0.3270647751017266,
"grad_norm": 0.26809874176979065,
"learning_rate": 8e-05,
"loss": 1.6954,
"step": 1487
},
{
"epoch": 0.32728472451336194,
"grad_norm": 0.29766684770584106,
"learning_rate": 8e-05,
"loss": 1.6947,
"step": 1488
},
{
"epoch": 0.32750467392499727,
"grad_norm": 0.27032670378685,
"learning_rate": 8e-05,
"loss": 1.8036,
"step": 1489
},
{
"epoch": 0.32772462333663255,
"grad_norm": 0.2694716453552246,
"learning_rate": 8e-05,
"loss": 1.6856,
"step": 1490
},
{
"epoch": 0.3279445727482679,
"grad_norm": 0.27968841791152954,
"learning_rate": 8e-05,
"loss": 1.7466,
"step": 1491
},
{
"epoch": 0.3281645221599032,
"grad_norm": 0.2956348955631256,
"learning_rate": 8e-05,
"loss": 1.833,
"step": 1492
},
{
"epoch": 0.32838447157153855,
"grad_norm": 0.27069491147994995,
"learning_rate": 8e-05,
"loss": 1.715,
"step": 1493
},
{
"epoch": 0.3286044209831739,
"grad_norm": 0.26747795939445496,
"learning_rate": 8e-05,
"loss": 1.6663,
"step": 1494
},
{
"epoch": 0.3288243703948092,
"grad_norm": 0.2619915008544922,
"learning_rate": 8e-05,
"loss": 1.6503,
"step": 1495
},
{
"epoch": 0.3290443198064445,
"grad_norm": 0.2720276117324829,
"learning_rate": 8e-05,
"loss": 1.7174,
"step": 1496
},
{
"epoch": 0.32926426921807983,
"grad_norm": 0.26874253153800964,
"learning_rate": 8e-05,
"loss": 1.6503,
"step": 1497
},
{
"epoch": 0.32948421862971516,
"grad_norm": 0.28397336602211,
"learning_rate": 8e-05,
"loss": 1.67,
"step": 1498
},
{
"epoch": 0.3297041680413505,
"grad_norm": 0.2544403076171875,
"learning_rate": 8e-05,
"loss": 1.5153,
"step": 1499
},
{
"epoch": 0.32992411745298583,
"grad_norm": 0.2819180488586426,
"learning_rate": 8e-05,
"loss": 1.6704,
"step": 1500
},
{
"epoch": 0.3301440668646211,
"grad_norm": 0.28150951862335205,
"learning_rate": 8e-05,
"loss": 1.8451,
"step": 1501
},
{
"epoch": 0.33036401627625644,
"grad_norm": 0.27396339178085327,
"learning_rate": 8e-05,
"loss": 1.7631,
"step": 1502
},
{
"epoch": 0.3305839656878918,
"grad_norm": 0.2954351007938385,
"learning_rate": 8e-05,
"loss": 1.8101,
"step": 1503
},
{
"epoch": 0.3308039150995271,
"grad_norm": 0.27129319310188293,
"learning_rate": 8e-05,
"loss": 1.6484,
"step": 1504
},
{
"epoch": 0.33102386451116245,
"grad_norm": 0.27612754702568054,
"learning_rate": 8e-05,
"loss": 1.6178,
"step": 1505
},
{
"epoch": 0.3312438139227978,
"grad_norm": 0.26097655296325684,
"learning_rate": 8e-05,
"loss": 1.5781,
"step": 1506
},
{
"epoch": 0.33146376333443306,
"grad_norm": 0.2704753577709198,
"learning_rate": 8e-05,
"loss": 1.6919,
"step": 1507
},
{
"epoch": 0.3316837127460684,
"grad_norm": 0.26866593956947327,
"learning_rate": 8e-05,
"loss": 1.6795,
"step": 1508
},
{
"epoch": 0.3319036621577037,
"grad_norm": 0.31797948479652405,
"learning_rate": 8e-05,
"loss": 1.7511,
"step": 1509
},
{
"epoch": 0.33212361156933906,
"grad_norm": 0.29456841945648193,
"learning_rate": 8e-05,
"loss": 1.7041,
"step": 1510
},
{
"epoch": 0.3323435609809744,
"grad_norm": 0.28345033526420593,
"learning_rate": 8e-05,
"loss": 1.7499,
"step": 1511
},
{
"epoch": 0.3325635103926097,
"grad_norm": 0.28679129481315613,
"learning_rate": 8e-05,
"loss": 1.8304,
"step": 1512
},
{
"epoch": 0.332783459804245,
"grad_norm": 0.2799399793148041,
"learning_rate": 8e-05,
"loss": 1.6461,
"step": 1513
},
{
"epoch": 0.33300340921588034,
"grad_norm": 0.3234422206878662,
"learning_rate": 8e-05,
"loss": 1.5622,
"step": 1514
},
{
"epoch": 0.3332233586275157,
"grad_norm": 0.27786344289779663,
"learning_rate": 8e-05,
"loss": 1.6718,
"step": 1515
},
{
"epoch": 0.333443308039151,
"grad_norm": 0.27040839195251465,
"learning_rate": 8e-05,
"loss": 1.7428,
"step": 1516
},
{
"epoch": 0.33366325745078634,
"grad_norm": 0.2837252616882324,
"learning_rate": 8e-05,
"loss": 1.6929,
"step": 1517
},
{
"epoch": 0.3338832068624216,
"grad_norm": 0.27352792024612427,
"learning_rate": 8e-05,
"loss": 1.7804,
"step": 1518
},
{
"epoch": 0.33410315627405696,
"grad_norm": 0.27237218618392944,
"learning_rate": 8e-05,
"loss": 1.7652,
"step": 1519
},
{
"epoch": 0.3343231056856923,
"grad_norm": 0.3166270852088928,
"learning_rate": 8e-05,
"loss": 1.6363,
"step": 1520
},
{
"epoch": 0.3345430550973276,
"grad_norm": 0.2650817930698395,
"learning_rate": 8e-05,
"loss": 1.6954,
"step": 1521
},
{
"epoch": 0.33476300450896296,
"grad_norm": 0.2907481789588928,
"learning_rate": 8e-05,
"loss": 1.809,
"step": 1522
},
{
"epoch": 0.33498295392059824,
"grad_norm": 0.2754502296447754,
"learning_rate": 8e-05,
"loss": 1.8143,
"step": 1523
},
{
"epoch": 0.33520290333223357,
"grad_norm": 0.2890012264251709,
"learning_rate": 8e-05,
"loss": 1.6603,
"step": 1524
},
{
"epoch": 0.3354228527438689,
"grad_norm": 0.271720826625824,
"learning_rate": 8e-05,
"loss": 1.7186,
"step": 1525
},
{
"epoch": 0.33564280215550424,
"grad_norm": 0.2845331132411957,
"learning_rate": 8e-05,
"loss": 1.7739,
"step": 1526
},
{
"epoch": 0.3358627515671396,
"grad_norm": 0.2787776291370392,
"learning_rate": 8e-05,
"loss": 1.6146,
"step": 1527
},
{
"epoch": 0.3360827009787749,
"grad_norm": 0.2612919211387634,
"learning_rate": 8e-05,
"loss": 1.5575,
"step": 1528
},
{
"epoch": 0.3363026503904102,
"grad_norm": 0.279220849275589,
"learning_rate": 8e-05,
"loss": 1.7661,
"step": 1529
},
{
"epoch": 0.3365225998020455,
"grad_norm": 0.2812168300151825,
"learning_rate": 8e-05,
"loss": 1.7011,
"step": 1530
},
{
"epoch": 0.33674254921368085,
"grad_norm": 0.28216826915740967,
"learning_rate": 8e-05,
"loss": 1.7856,
"step": 1531
},
{
"epoch": 0.3369624986253162,
"grad_norm": 0.279895156621933,
"learning_rate": 8e-05,
"loss": 1.6793,
"step": 1532
},
{
"epoch": 0.3371824480369515,
"grad_norm": 0.2694056034088135,
"learning_rate": 8e-05,
"loss": 1.6289,
"step": 1533
},
{
"epoch": 0.3374023974485868,
"grad_norm": 0.2692592740058899,
"learning_rate": 8e-05,
"loss": 1.5595,
"step": 1534
},
{
"epoch": 0.33762234686022213,
"grad_norm": 0.32149383425712585,
"learning_rate": 8e-05,
"loss": 1.6667,
"step": 1535
},
{
"epoch": 0.33784229627185747,
"grad_norm": 0.28884437680244446,
"learning_rate": 8e-05,
"loss": 1.7836,
"step": 1536
},
{
"epoch": 0.3380622456834928,
"grad_norm": 0.276017963886261,
"learning_rate": 8e-05,
"loss": 1.712,
"step": 1537
},
{
"epoch": 0.33828219509512814,
"grad_norm": 0.26901450753211975,
"learning_rate": 8e-05,
"loss": 1.6442,
"step": 1538
},
{
"epoch": 0.33850214450676347,
"grad_norm": 0.29827412962913513,
"learning_rate": 8e-05,
"loss": 1.7619,
"step": 1539
},
{
"epoch": 0.33872209391839875,
"grad_norm": 0.2763231098651886,
"learning_rate": 8e-05,
"loss": 1.6344,
"step": 1540
},
{
"epoch": 0.3389420433300341,
"grad_norm": 0.26493677496910095,
"learning_rate": 8e-05,
"loss": 1.6964,
"step": 1541
},
{
"epoch": 0.3391619927416694,
"grad_norm": 0.2956371605396271,
"learning_rate": 8e-05,
"loss": 1.7328,
"step": 1542
},
{
"epoch": 0.33938194215330475,
"grad_norm": 0.2845339775085449,
"learning_rate": 8e-05,
"loss": 1.6477,
"step": 1543
},
{
"epoch": 0.3396018915649401,
"grad_norm": 0.29501214623451233,
"learning_rate": 8e-05,
"loss": 1.7951,
"step": 1544
},
{
"epoch": 0.33982184097657536,
"grad_norm": 0.2859644591808319,
"learning_rate": 8e-05,
"loss": 1.6607,
"step": 1545
},
{
"epoch": 0.3400417903882107,
"grad_norm": 0.2733168303966522,
"learning_rate": 8e-05,
"loss": 1.6397,
"step": 1546
},
{
"epoch": 0.34026173979984603,
"grad_norm": 0.2580598294734955,
"learning_rate": 8e-05,
"loss": 1.5692,
"step": 1547
},
{
"epoch": 0.34048168921148136,
"grad_norm": 0.3042803406715393,
"learning_rate": 8e-05,
"loss": 1.7063,
"step": 1548
},
{
"epoch": 0.3407016386231167,
"grad_norm": 0.2833859324455261,
"learning_rate": 8e-05,
"loss": 1.7531,
"step": 1549
},
{
"epoch": 0.34092158803475203,
"grad_norm": 0.259620726108551,
"learning_rate": 8e-05,
"loss": 1.6179,
"step": 1550
},
{
"epoch": 0.3411415374463873,
"grad_norm": 0.268355131149292,
"learning_rate": 8e-05,
"loss": 1.6009,
"step": 1551
},
{
"epoch": 0.34136148685802264,
"grad_norm": 0.2858780324459076,
"learning_rate": 8e-05,
"loss": 1.7033,
"step": 1552
},
{
"epoch": 0.341581436269658,
"grad_norm": 0.2777354121208191,
"learning_rate": 8e-05,
"loss": 1.7615,
"step": 1553
},
{
"epoch": 0.3418013856812933,
"grad_norm": 0.27899524569511414,
"learning_rate": 8e-05,
"loss": 1.6684,
"step": 1554
},
{
"epoch": 0.34202133509292865,
"grad_norm": 0.3156200349330902,
"learning_rate": 8e-05,
"loss": 1.5658,
"step": 1555
},
{
"epoch": 0.3422412845045639,
"grad_norm": 0.27549582719802856,
"learning_rate": 8e-05,
"loss": 1.692,
"step": 1556
},
{
"epoch": 0.34246123391619926,
"grad_norm": 0.27770310640335083,
"learning_rate": 8e-05,
"loss": 1.5891,
"step": 1557
},
{
"epoch": 0.3426811833278346,
"grad_norm": 0.28138646483421326,
"learning_rate": 8e-05,
"loss": 1.5949,
"step": 1558
},
{
"epoch": 0.3429011327394699,
"grad_norm": 0.2790684998035431,
"learning_rate": 8e-05,
"loss": 1.6371,
"step": 1559
},
{
"epoch": 0.34312108215110526,
"grad_norm": 0.303230345249176,
"learning_rate": 8e-05,
"loss": 1.7416,
"step": 1560
},
{
"epoch": 0.3433410315627406,
"grad_norm": 0.26891767978668213,
"learning_rate": 8e-05,
"loss": 1.8044,
"step": 1561
},
{
"epoch": 0.3435609809743759,
"grad_norm": 0.2734631896018982,
"learning_rate": 8e-05,
"loss": 1.7171,
"step": 1562
},
{
"epoch": 0.3437809303860112,
"grad_norm": 0.29556018114089966,
"learning_rate": 8e-05,
"loss": 1.9085,
"step": 1563
},
{
"epoch": 0.34400087979764654,
"grad_norm": 0.26478004455566406,
"learning_rate": 8e-05,
"loss": 1.6153,
"step": 1564
},
{
"epoch": 0.3442208292092819,
"grad_norm": 0.27655404806137085,
"learning_rate": 8e-05,
"loss": 1.7384,
"step": 1565
},
{
"epoch": 0.3444407786209172,
"grad_norm": 0.2902698218822479,
"learning_rate": 8e-05,
"loss": 1.6589,
"step": 1566
},
{
"epoch": 0.3446607280325525,
"grad_norm": 0.2857147455215454,
"learning_rate": 8e-05,
"loss": 1.6598,
"step": 1567
},
{
"epoch": 0.3448806774441878,
"grad_norm": 0.28339943289756775,
"learning_rate": 8e-05,
"loss": 1.7356,
"step": 1568
},
{
"epoch": 0.34510062685582316,
"grad_norm": 0.29340776801109314,
"learning_rate": 8e-05,
"loss": 1.8316,
"step": 1569
},
{
"epoch": 0.3453205762674585,
"grad_norm": 0.26669397950172424,
"learning_rate": 8e-05,
"loss": 1.6803,
"step": 1570
},
{
"epoch": 0.3455405256790938,
"grad_norm": 0.28508248925209045,
"learning_rate": 8e-05,
"loss": 1.7702,
"step": 1571
},
{
"epoch": 0.34576047509072916,
"grad_norm": 0.25610047578811646,
"learning_rate": 8e-05,
"loss": 1.6343,
"step": 1572
},
{
"epoch": 0.34598042450236444,
"grad_norm": 0.2758273482322693,
"learning_rate": 8e-05,
"loss": 1.7875,
"step": 1573
},
{
"epoch": 0.34620037391399977,
"grad_norm": 0.2674688398838043,
"learning_rate": 8e-05,
"loss": 1.6804,
"step": 1574
},
{
"epoch": 0.3464203233256351,
"grad_norm": 0.2796163558959961,
"learning_rate": 8e-05,
"loss": 1.6135,
"step": 1575
},
{
"epoch": 0.34664027273727044,
"grad_norm": 0.26260775327682495,
"learning_rate": 8e-05,
"loss": 1.6752,
"step": 1576
},
{
"epoch": 0.3468602221489058,
"grad_norm": 0.2897137403488159,
"learning_rate": 8e-05,
"loss": 1.6743,
"step": 1577
},
{
"epoch": 0.34708017156054105,
"grad_norm": 0.27681732177734375,
"learning_rate": 8e-05,
"loss": 1.6436,
"step": 1578
},
{
"epoch": 0.3473001209721764,
"grad_norm": 0.2694265842437744,
"learning_rate": 8e-05,
"loss": 1.6343,
"step": 1579
},
{
"epoch": 0.3475200703838117,
"grad_norm": 0.28179508447647095,
"learning_rate": 8e-05,
"loss": 1.676,
"step": 1580
},
{
"epoch": 0.34774001979544705,
"grad_norm": 0.29600057005882263,
"learning_rate": 8e-05,
"loss": 1.786,
"step": 1581
},
{
"epoch": 0.3479599692070824,
"grad_norm": 0.28932616114616394,
"learning_rate": 8e-05,
"loss": 1.7151,
"step": 1582
},
{
"epoch": 0.3481799186187177,
"grad_norm": 0.2912417948246002,
"learning_rate": 8e-05,
"loss": 1.7788,
"step": 1583
},
{
"epoch": 0.348399868030353,
"grad_norm": 0.2844431698322296,
"learning_rate": 8e-05,
"loss": 1.5585,
"step": 1584
},
{
"epoch": 0.34861981744198833,
"grad_norm": 0.2916630804538727,
"learning_rate": 8e-05,
"loss": 1.7484,
"step": 1585
},
{
"epoch": 0.34883976685362367,
"grad_norm": 0.2785089612007141,
"learning_rate": 8e-05,
"loss": 1.61,
"step": 1586
},
{
"epoch": 0.349059716265259,
"grad_norm": 0.2777422368526459,
"learning_rate": 8e-05,
"loss": 1.7183,
"step": 1587
},
{
"epoch": 0.34927966567689434,
"grad_norm": 0.28772565722465515,
"learning_rate": 8e-05,
"loss": 1.7161,
"step": 1588
},
{
"epoch": 0.3494996150885296,
"grad_norm": 0.28452831506729126,
"learning_rate": 8e-05,
"loss": 1.8004,
"step": 1589
},
{
"epoch": 0.34971956450016495,
"grad_norm": 0.2837449014186859,
"learning_rate": 8e-05,
"loss": 1.7992,
"step": 1590
},
{
"epoch": 0.3499395139118003,
"grad_norm": 0.2874920666217804,
"learning_rate": 8e-05,
"loss": 1.6408,
"step": 1591
},
{
"epoch": 0.3501594633234356,
"grad_norm": 0.26615065336227417,
"learning_rate": 8e-05,
"loss": 1.658,
"step": 1592
},
{
"epoch": 0.35037941273507095,
"grad_norm": 0.27493569254875183,
"learning_rate": 8e-05,
"loss": 1.6843,
"step": 1593
},
{
"epoch": 0.3505993621467063,
"grad_norm": 0.291886568069458,
"learning_rate": 8e-05,
"loss": 1.7683,
"step": 1594
},
{
"epoch": 0.35081931155834156,
"grad_norm": 0.2868814468383789,
"learning_rate": 8e-05,
"loss": 1.7825,
"step": 1595
},
{
"epoch": 0.3510392609699769,
"grad_norm": 0.30988067388534546,
"learning_rate": 8e-05,
"loss": 1.7511,
"step": 1596
},
{
"epoch": 0.35125921038161223,
"grad_norm": 0.2746553122997284,
"learning_rate": 8e-05,
"loss": 1.6298,
"step": 1597
},
{
"epoch": 0.35147915979324756,
"grad_norm": 0.3013536036014557,
"learning_rate": 8e-05,
"loss": 1.7883,
"step": 1598
},
{
"epoch": 0.3516991092048829,
"grad_norm": 0.2906748056411743,
"learning_rate": 8e-05,
"loss": 1.5819,
"step": 1599
},
{
"epoch": 0.3519190586165182,
"grad_norm": 0.28082364797592163,
"learning_rate": 8e-05,
"loss": 1.7525,
"step": 1600
},
{
"epoch": 0.3521390080281535,
"grad_norm": 0.28713324666023254,
"learning_rate": 8e-05,
"loss": 1.79,
"step": 1601
},
{
"epoch": 0.35235895743978884,
"grad_norm": 0.2819896638393402,
"learning_rate": 8e-05,
"loss": 1.6514,
"step": 1602
},
{
"epoch": 0.3525789068514242,
"grad_norm": 0.27669310569763184,
"learning_rate": 8e-05,
"loss": 1.5888,
"step": 1603
},
{
"epoch": 0.3527988562630595,
"grad_norm": 0.2873641848564148,
"learning_rate": 8e-05,
"loss": 1.8206,
"step": 1604
},
{
"epoch": 0.35301880567469485,
"grad_norm": 0.28426647186279297,
"learning_rate": 8e-05,
"loss": 1.7736,
"step": 1605
},
{
"epoch": 0.3532387550863301,
"grad_norm": 0.2733590602874756,
"learning_rate": 8e-05,
"loss": 1.6653,
"step": 1606
},
{
"epoch": 0.35345870449796546,
"grad_norm": 0.26751479506492615,
"learning_rate": 8e-05,
"loss": 1.5841,
"step": 1607
},
{
"epoch": 0.3536786539096008,
"grad_norm": 0.2767663598060608,
"learning_rate": 8e-05,
"loss": 1.6859,
"step": 1608
},
{
"epoch": 0.3538986033212361,
"grad_norm": 0.28359255194664,
"learning_rate": 8e-05,
"loss": 1.8799,
"step": 1609
},
{
"epoch": 0.35411855273287146,
"grad_norm": 0.27551594376564026,
"learning_rate": 8e-05,
"loss": 1.6429,
"step": 1610
},
{
"epoch": 0.35433850214450674,
"grad_norm": 0.26260972023010254,
"learning_rate": 8e-05,
"loss": 1.6068,
"step": 1611
},
{
"epoch": 0.3545584515561421,
"grad_norm": 0.2778937518596649,
"learning_rate": 8e-05,
"loss": 1.8057,
"step": 1612
},
{
"epoch": 0.3547784009677774,
"grad_norm": 0.27607765793800354,
"learning_rate": 8e-05,
"loss": 1.7439,
"step": 1613
},
{
"epoch": 0.35499835037941274,
"grad_norm": 0.2628287076950073,
"learning_rate": 8e-05,
"loss": 1.6916,
"step": 1614
},
{
"epoch": 0.3552182997910481,
"grad_norm": 0.2767592966556549,
"learning_rate": 8e-05,
"loss": 1.7185,
"step": 1615
},
{
"epoch": 0.3554382492026834,
"grad_norm": 0.2666943669319153,
"learning_rate": 8e-05,
"loss": 1.7351,
"step": 1616
},
{
"epoch": 0.3556581986143187,
"grad_norm": 0.28780093789100647,
"learning_rate": 8e-05,
"loss": 1.757,
"step": 1617
},
{
"epoch": 0.355878148025954,
"grad_norm": 0.30761584639549255,
"learning_rate": 8e-05,
"loss": 1.8096,
"step": 1618
},
{
"epoch": 0.35609809743758936,
"grad_norm": 0.2926090359687805,
"learning_rate": 8e-05,
"loss": 1.8609,
"step": 1619
},
{
"epoch": 0.3563180468492247,
"grad_norm": 0.27546852827072144,
"learning_rate": 8e-05,
"loss": 1.6422,
"step": 1620
},
{
"epoch": 0.35653799626086,
"grad_norm": 0.28559309244155884,
"learning_rate": 8e-05,
"loss": 1.8225,
"step": 1621
},
{
"epoch": 0.3567579456724953,
"grad_norm": 0.2804494798183441,
"learning_rate": 8e-05,
"loss": 1.9108,
"step": 1622
},
{
"epoch": 0.35697789508413064,
"grad_norm": 0.2643645703792572,
"learning_rate": 8e-05,
"loss": 1.5462,
"step": 1623
},
{
"epoch": 0.35719784449576597,
"grad_norm": 0.2888531982898712,
"learning_rate": 8e-05,
"loss": 1.701,
"step": 1624
},
{
"epoch": 0.3574177939074013,
"grad_norm": 0.28601035475730896,
"learning_rate": 8e-05,
"loss": 1.628,
"step": 1625
},
{
"epoch": 0.35763774331903664,
"grad_norm": 0.2877524197101593,
"learning_rate": 8e-05,
"loss": 1.8403,
"step": 1626
},
{
"epoch": 0.357857692730672,
"grad_norm": 0.2658945918083191,
"learning_rate": 8e-05,
"loss": 1.4552,
"step": 1627
},
{
"epoch": 0.35807764214230725,
"grad_norm": 0.2911885976791382,
"learning_rate": 8e-05,
"loss": 1.7753,
"step": 1628
},
{
"epoch": 0.3582975915539426,
"grad_norm": 0.29072439670562744,
"learning_rate": 8e-05,
"loss": 1.7229,
"step": 1629
},
{
"epoch": 0.3585175409655779,
"grad_norm": 0.29961150884628296,
"learning_rate": 8e-05,
"loss": 1.7694,
"step": 1630
},
{
"epoch": 0.35873749037721325,
"grad_norm": 0.2760653793811798,
"learning_rate": 8e-05,
"loss": 1.719,
"step": 1631
},
{
"epoch": 0.3589574397888486,
"grad_norm": 0.2739832103252411,
"learning_rate": 8e-05,
"loss": 1.7367,
"step": 1632
},
{
"epoch": 0.35917738920048387,
"grad_norm": 0.2669771611690521,
"learning_rate": 8e-05,
"loss": 1.5306,
"step": 1633
},
{
"epoch": 0.3593973386121192,
"grad_norm": 0.2744583189487457,
"learning_rate": 8e-05,
"loss": 1.5713,
"step": 1634
},
{
"epoch": 0.35961728802375453,
"grad_norm": 0.2943086326122284,
"learning_rate": 8e-05,
"loss": 1.6569,
"step": 1635
},
{
"epoch": 0.35983723743538987,
"grad_norm": 0.2873243987560272,
"learning_rate": 8e-05,
"loss": 1.6864,
"step": 1636
},
{
"epoch": 0.3600571868470252,
"grad_norm": 0.27217867970466614,
"learning_rate": 8e-05,
"loss": 1.7519,
"step": 1637
},
{
"epoch": 0.36027713625866054,
"grad_norm": 0.28656938672065735,
"learning_rate": 8e-05,
"loss": 1.7892,
"step": 1638
},
{
"epoch": 0.3604970856702958,
"grad_norm": 0.2876884937286377,
"learning_rate": 8e-05,
"loss": 1.6709,
"step": 1639
},
{
"epoch": 0.36071703508193115,
"grad_norm": 0.2873481512069702,
"learning_rate": 8e-05,
"loss": 1.8336,
"step": 1640
},
{
"epoch": 0.3609369844935665,
"grad_norm": 0.28285419940948486,
"learning_rate": 8e-05,
"loss": 1.5887,
"step": 1641
},
{
"epoch": 0.3611569339052018,
"grad_norm": 0.2624582052230835,
"learning_rate": 8e-05,
"loss": 1.6248,
"step": 1642
},
{
"epoch": 0.36137688331683715,
"grad_norm": 0.2794424891471863,
"learning_rate": 8e-05,
"loss": 1.7191,
"step": 1643
},
{
"epoch": 0.36159683272847243,
"grad_norm": 0.2890479862689972,
"learning_rate": 8e-05,
"loss": 1.905,
"step": 1644
},
{
"epoch": 0.36181678214010776,
"grad_norm": 0.28444570302963257,
"learning_rate": 8e-05,
"loss": 1.6948,
"step": 1645
},
{
"epoch": 0.3620367315517431,
"grad_norm": 0.27037203311920166,
"learning_rate": 8e-05,
"loss": 1.6245,
"step": 1646
},
{
"epoch": 0.36225668096337843,
"grad_norm": 0.2864437699317932,
"learning_rate": 8e-05,
"loss": 1.688,
"step": 1647
},
{
"epoch": 0.36247663037501376,
"grad_norm": 0.27912065386772156,
"learning_rate": 8e-05,
"loss": 1.6056,
"step": 1648
},
{
"epoch": 0.3626965797866491,
"grad_norm": 0.26467230916023254,
"learning_rate": 8e-05,
"loss": 1.5786,
"step": 1649
},
{
"epoch": 0.3629165291982844,
"grad_norm": 0.2793690264225006,
"learning_rate": 8e-05,
"loss": 1.6003,
"step": 1650
},
{
"epoch": 0.3631364786099197,
"grad_norm": 0.288629949092865,
"learning_rate": 8e-05,
"loss": 1.6752,
"step": 1651
},
{
"epoch": 0.36335642802155504,
"grad_norm": 0.283195823431015,
"learning_rate": 8e-05,
"loss": 1.6854,
"step": 1652
},
{
"epoch": 0.3635763774331904,
"grad_norm": 0.2929665446281433,
"learning_rate": 8e-05,
"loss": 1.7191,
"step": 1653
},
{
"epoch": 0.3637963268448257,
"grad_norm": 0.28676289319992065,
"learning_rate": 8e-05,
"loss": 1.6959,
"step": 1654
},
{
"epoch": 0.364016276256461,
"grad_norm": 0.264635294675827,
"learning_rate": 8e-05,
"loss": 1.6232,
"step": 1655
},
{
"epoch": 0.3642362256680963,
"grad_norm": 0.2763380706310272,
"learning_rate": 8e-05,
"loss": 1.7631,
"step": 1656
},
{
"epoch": 0.36445617507973166,
"grad_norm": 0.2624233365058899,
"learning_rate": 8e-05,
"loss": 1.6635,
"step": 1657
},
{
"epoch": 0.364676124491367,
"grad_norm": 0.2564058303833008,
"learning_rate": 8e-05,
"loss": 1.4745,
"step": 1658
},
{
"epoch": 0.3648960739030023,
"grad_norm": 0.2966236174106598,
"learning_rate": 8e-05,
"loss": 1.6892,
"step": 1659
},
{
"epoch": 0.36511602331463766,
"grad_norm": 0.30588555335998535,
"learning_rate": 8e-05,
"loss": 1.6884,
"step": 1660
},
{
"epoch": 0.36533597272627294,
"grad_norm": 0.2692076861858368,
"learning_rate": 8e-05,
"loss": 1.7158,
"step": 1661
},
{
"epoch": 0.3655559221379083,
"grad_norm": 0.29388558864593506,
"learning_rate": 8e-05,
"loss": 1.7133,
"step": 1662
},
{
"epoch": 0.3657758715495436,
"grad_norm": 0.28685635328292847,
"learning_rate": 8e-05,
"loss": 1.7444,
"step": 1663
},
{
"epoch": 0.36599582096117894,
"grad_norm": 0.2885795831680298,
"learning_rate": 8e-05,
"loss": 1.7537,
"step": 1664
},
{
"epoch": 0.3662157703728143,
"grad_norm": 0.3066631853580475,
"learning_rate": 8e-05,
"loss": 1.7843,
"step": 1665
},
{
"epoch": 0.36643571978444955,
"grad_norm": 0.31112298369407654,
"learning_rate": 8e-05,
"loss": 1.4934,
"step": 1666
},
{
"epoch": 0.3666556691960849,
"grad_norm": 0.2751656472682953,
"learning_rate": 8e-05,
"loss": 1.7463,
"step": 1667
},
{
"epoch": 0.3668756186077202,
"grad_norm": 0.2834889590740204,
"learning_rate": 8e-05,
"loss": 1.7757,
"step": 1668
},
{
"epoch": 0.36709556801935556,
"grad_norm": 0.2778145968914032,
"learning_rate": 8e-05,
"loss": 1.7423,
"step": 1669
},
{
"epoch": 0.3673155174309909,
"grad_norm": 0.32161521911621094,
"learning_rate": 8e-05,
"loss": 1.8312,
"step": 1670
},
{
"epoch": 0.36753546684262617,
"grad_norm": 0.27995115518569946,
"learning_rate": 8e-05,
"loss": 1.7694,
"step": 1671
},
{
"epoch": 0.3677554162542615,
"grad_norm": 0.27701541781425476,
"learning_rate": 8e-05,
"loss": 1.8054,
"step": 1672
},
{
"epoch": 0.36797536566589684,
"grad_norm": 0.2757355570793152,
"learning_rate": 8e-05,
"loss": 1.7036,
"step": 1673
},
{
"epoch": 0.36819531507753217,
"grad_norm": 0.27305907011032104,
"learning_rate": 8e-05,
"loss": 1.6627,
"step": 1674
},
{
"epoch": 0.3684152644891675,
"grad_norm": 0.299679696559906,
"learning_rate": 8e-05,
"loss": 1.7552,
"step": 1675
},
{
"epoch": 0.36863521390080284,
"grad_norm": 0.2728777825832367,
"learning_rate": 8e-05,
"loss": 1.7649,
"step": 1676
},
{
"epoch": 0.3688551633124381,
"grad_norm": 0.26330089569091797,
"learning_rate": 8e-05,
"loss": 1.5887,
"step": 1677
},
{
"epoch": 0.36907511272407345,
"grad_norm": 0.2850317060947418,
"learning_rate": 8e-05,
"loss": 1.6255,
"step": 1678
},
{
"epoch": 0.3692950621357088,
"grad_norm": 0.2784862220287323,
"learning_rate": 8e-05,
"loss": 1.7123,
"step": 1679
},
{
"epoch": 0.3695150115473441,
"grad_norm": 0.284298300743103,
"learning_rate": 8e-05,
"loss": 1.5809,
"step": 1680
},
{
"epoch": 0.36973496095897945,
"grad_norm": 0.2725334167480469,
"learning_rate": 8e-05,
"loss": 1.7037,
"step": 1681
},
{
"epoch": 0.36995491037061473,
"grad_norm": 0.2760758399963379,
"learning_rate": 8e-05,
"loss": 1.6827,
"step": 1682
},
{
"epoch": 0.37017485978225007,
"grad_norm": 0.2661541700363159,
"learning_rate": 8e-05,
"loss": 1.7042,
"step": 1683
},
{
"epoch": 0.3703948091938854,
"grad_norm": 0.27737516164779663,
"learning_rate": 8e-05,
"loss": 1.7689,
"step": 1684
},
{
"epoch": 0.37061475860552073,
"grad_norm": 0.2607424259185791,
"learning_rate": 8e-05,
"loss": 1.6356,
"step": 1685
},
{
"epoch": 0.37083470801715607,
"grad_norm": 0.2802969217300415,
"learning_rate": 8e-05,
"loss": 1.7004,
"step": 1686
},
{
"epoch": 0.3710546574287914,
"grad_norm": 0.2660817801952362,
"learning_rate": 8e-05,
"loss": 1.5539,
"step": 1687
},
{
"epoch": 0.3712746068404267,
"grad_norm": 0.27867192029953003,
"learning_rate": 8e-05,
"loss": 1.6531,
"step": 1688
},
{
"epoch": 0.371494556252062,
"grad_norm": 0.27857083082199097,
"learning_rate": 8e-05,
"loss": 1.8023,
"step": 1689
},
{
"epoch": 0.37171450566369735,
"grad_norm": 0.2689161002635956,
"learning_rate": 8e-05,
"loss": 1.7601,
"step": 1690
},
{
"epoch": 0.3719344550753327,
"grad_norm": 0.297826886177063,
"learning_rate": 8e-05,
"loss": 1.7627,
"step": 1691
},
{
"epoch": 0.372154404486968,
"grad_norm": 0.2592705190181732,
"learning_rate": 8e-05,
"loss": 1.7132,
"step": 1692
},
{
"epoch": 0.3723743538986033,
"grad_norm": 0.28288522362709045,
"learning_rate": 8e-05,
"loss": 1.7604,
"step": 1693
},
{
"epoch": 0.37259430331023863,
"grad_norm": 0.30823859572410583,
"learning_rate": 8e-05,
"loss": 1.8563,
"step": 1694
},
{
"epoch": 0.37281425272187396,
"grad_norm": 0.27835527062416077,
"learning_rate": 8e-05,
"loss": 1.6816,
"step": 1695
},
{
"epoch": 0.3730342021335093,
"grad_norm": 0.2626672089099884,
"learning_rate": 8e-05,
"loss": 1.6185,
"step": 1696
},
{
"epoch": 0.37325415154514463,
"grad_norm": 0.2489227056503296,
"learning_rate": 8e-05,
"loss": 1.6119,
"step": 1697
},
{
"epoch": 0.37347410095677996,
"grad_norm": 0.28637897968292236,
"learning_rate": 8e-05,
"loss": 1.6695,
"step": 1698
},
{
"epoch": 0.37369405036841524,
"grad_norm": 0.27077022194862366,
"learning_rate": 8e-05,
"loss": 1.6095,
"step": 1699
},
{
"epoch": 0.3739139997800506,
"grad_norm": 0.32049357891082764,
"learning_rate": 8e-05,
"loss": 1.875,
"step": 1700
},
{
"epoch": 0.3741339491916859,
"grad_norm": 0.2890382707118988,
"learning_rate": 8e-05,
"loss": 1.7129,
"step": 1701
},
{
"epoch": 0.37435389860332124,
"grad_norm": 0.2785224914550781,
"learning_rate": 8e-05,
"loss": 1.7162,
"step": 1702
},
{
"epoch": 0.3745738480149566,
"grad_norm": 0.2685299217700958,
"learning_rate": 8e-05,
"loss": 1.7358,
"step": 1703
},
{
"epoch": 0.37479379742659186,
"grad_norm": 0.2840120494365692,
"learning_rate": 8e-05,
"loss": 1.9123,
"step": 1704
},
{
"epoch": 0.3750137468382272,
"grad_norm": 0.27426856756210327,
"learning_rate": 8e-05,
"loss": 1.7144,
"step": 1705
},
{
"epoch": 0.3752336962498625,
"grad_norm": 0.2707318663597107,
"learning_rate": 8e-05,
"loss": 1.6961,
"step": 1706
},
{
"epoch": 0.37545364566149786,
"grad_norm": 0.3059745728969574,
"learning_rate": 8e-05,
"loss": 1.7491,
"step": 1707
},
{
"epoch": 0.3756735950731332,
"grad_norm": 0.27109962701797485,
"learning_rate": 8e-05,
"loss": 1.6515,
"step": 1708
},
{
"epoch": 0.3758935444847685,
"grad_norm": 0.26874709129333496,
"learning_rate": 8e-05,
"loss": 1.7119,
"step": 1709
},
{
"epoch": 0.3761134938964038,
"grad_norm": 0.27959340810775757,
"learning_rate": 8e-05,
"loss": 1.5449,
"step": 1710
},
{
"epoch": 0.37633344330803914,
"grad_norm": 0.284386545419693,
"learning_rate": 8e-05,
"loss": 1.8336,
"step": 1711
},
{
"epoch": 0.3765533927196745,
"grad_norm": 0.27861231565475464,
"learning_rate": 8e-05,
"loss": 1.7547,
"step": 1712
},
{
"epoch": 0.3767733421313098,
"grad_norm": 0.26845625042915344,
"learning_rate": 8e-05,
"loss": 1.6838,
"step": 1713
},
{
"epoch": 0.37699329154294514,
"grad_norm": 0.31240981817245483,
"learning_rate": 8e-05,
"loss": 1.7489,
"step": 1714
},
{
"epoch": 0.3772132409545804,
"grad_norm": 0.2878013253211975,
"learning_rate": 8e-05,
"loss": 1.7533,
"step": 1715
},
{
"epoch": 0.37743319036621575,
"grad_norm": 0.27676892280578613,
"learning_rate": 8e-05,
"loss": 1.6218,
"step": 1716
},
{
"epoch": 0.3776531397778511,
"grad_norm": 0.2782065272331238,
"learning_rate": 8e-05,
"loss": 1.6311,
"step": 1717
},
{
"epoch": 0.3778730891894864,
"grad_norm": 0.2829797863960266,
"learning_rate": 8e-05,
"loss": 1.5863,
"step": 1718
},
{
"epoch": 0.37809303860112176,
"grad_norm": 0.2851261794567108,
"learning_rate": 8e-05,
"loss": 1.8365,
"step": 1719
},
{
"epoch": 0.3783129880127571,
"grad_norm": 0.2844488322734833,
"learning_rate": 8e-05,
"loss": 1.7765,
"step": 1720
},
{
"epoch": 0.37853293742439237,
"grad_norm": 0.2976120412349701,
"learning_rate": 8e-05,
"loss": 1.7334,
"step": 1721
},
{
"epoch": 0.3787528868360277,
"grad_norm": 0.27947840094566345,
"learning_rate": 8e-05,
"loss": 1.6641,
"step": 1722
},
{
"epoch": 0.37897283624766304,
"grad_norm": 0.2986278831958771,
"learning_rate": 8e-05,
"loss": 1.8201,
"step": 1723
},
{
"epoch": 0.37919278565929837,
"grad_norm": 0.26200374960899353,
"learning_rate": 8e-05,
"loss": 1.5835,
"step": 1724
},
{
"epoch": 0.3794127350709337,
"grad_norm": 0.2846388816833496,
"learning_rate": 8e-05,
"loss": 1.7863,
"step": 1725
},
{
"epoch": 0.379632684482569,
"grad_norm": 0.2809320390224457,
"learning_rate": 8e-05,
"loss": 1.6667,
"step": 1726
},
{
"epoch": 0.3798526338942043,
"grad_norm": 0.28523099422454834,
"learning_rate": 8e-05,
"loss": 1.6647,
"step": 1727
},
{
"epoch": 0.38007258330583965,
"grad_norm": 0.2719436287879944,
"learning_rate": 8e-05,
"loss": 1.6,
"step": 1728
},
{
"epoch": 0.380292532717475,
"grad_norm": 0.2762429118156433,
"learning_rate": 8e-05,
"loss": 1.6888,
"step": 1729
},
{
"epoch": 0.3805124821291103,
"grad_norm": 0.30161863565444946,
"learning_rate": 8e-05,
"loss": 1.6659,
"step": 1730
},
{
"epoch": 0.38073243154074565,
"grad_norm": 0.27962687611579895,
"learning_rate": 8e-05,
"loss": 1.629,
"step": 1731
},
{
"epoch": 0.38095238095238093,
"grad_norm": 0.27580323815345764,
"learning_rate": 8e-05,
"loss": 1.689,
"step": 1732
},
{
"epoch": 0.38117233036401627,
"grad_norm": 0.2676113545894623,
"learning_rate": 8e-05,
"loss": 1.7195,
"step": 1733
},
{
"epoch": 0.3813922797756516,
"grad_norm": 0.27840152382850647,
"learning_rate": 8e-05,
"loss": 1.6433,
"step": 1734
},
{
"epoch": 0.38161222918728693,
"grad_norm": 0.27100005745887756,
"learning_rate": 8e-05,
"loss": 1.6517,
"step": 1735
},
{
"epoch": 0.38183217859892227,
"grad_norm": 0.2874828577041626,
"learning_rate": 8e-05,
"loss": 1.9139,
"step": 1736
},
{
"epoch": 0.38205212801055755,
"grad_norm": 0.2685931324958801,
"learning_rate": 8e-05,
"loss": 1.7373,
"step": 1737
},
{
"epoch": 0.3822720774221929,
"grad_norm": 0.2895548641681671,
"learning_rate": 8e-05,
"loss": 1.7828,
"step": 1738
},
{
"epoch": 0.3824920268338282,
"grad_norm": 0.29109206795692444,
"learning_rate": 8e-05,
"loss": 1.6347,
"step": 1739
},
{
"epoch": 0.38271197624546355,
"grad_norm": 0.2804923951625824,
"learning_rate": 8e-05,
"loss": 1.5978,
"step": 1740
},
{
"epoch": 0.3829319256570989,
"grad_norm": 0.2829732894897461,
"learning_rate": 8e-05,
"loss": 1.6271,
"step": 1741
},
{
"epoch": 0.3831518750687342,
"grad_norm": 0.28979840874671936,
"learning_rate": 8e-05,
"loss": 1.7244,
"step": 1742
},
{
"epoch": 0.3833718244803695,
"grad_norm": 0.30159792304039,
"learning_rate": 8e-05,
"loss": 1.8074,
"step": 1743
},
{
"epoch": 0.38359177389200483,
"grad_norm": 0.28228580951690674,
"learning_rate": 8e-05,
"loss": 1.6669,
"step": 1744
},
{
"epoch": 0.38381172330364016,
"grad_norm": 0.27950945496559143,
"learning_rate": 8e-05,
"loss": 1.6583,
"step": 1745
},
{
"epoch": 0.3840316727152755,
"grad_norm": 0.2708896994590759,
"learning_rate": 8e-05,
"loss": 1.5793,
"step": 1746
},
{
"epoch": 0.38425162212691083,
"grad_norm": 0.27368029952049255,
"learning_rate": 8e-05,
"loss": 1.6371,
"step": 1747
},
{
"epoch": 0.3844715715385461,
"grad_norm": 0.27621379494667053,
"learning_rate": 8e-05,
"loss": 1.5737,
"step": 1748
},
{
"epoch": 0.38469152095018144,
"grad_norm": 0.27143922448158264,
"learning_rate": 8e-05,
"loss": 1.7289,
"step": 1749
},
{
"epoch": 0.3849114703618168,
"grad_norm": 0.28887274861335754,
"learning_rate": 8e-05,
"loss": 1.7262,
"step": 1750
},
{
"epoch": 0.3851314197734521,
"grad_norm": 0.26516541838645935,
"learning_rate": 8e-05,
"loss": 1.6358,
"step": 1751
},
{
"epoch": 0.38535136918508744,
"grad_norm": 0.31475701928138733,
"learning_rate": 8e-05,
"loss": 1.8599,
"step": 1752
},
{
"epoch": 0.3855713185967228,
"grad_norm": 0.27711552381515503,
"learning_rate": 8e-05,
"loss": 1.632,
"step": 1753
},
{
"epoch": 0.38579126800835806,
"grad_norm": 0.27542901039123535,
"learning_rate": 8e-05,
"loss": 1.6741,
"step": 1754
},
{
"epoch": 0.3860112174199934,
"grad_norm": 0.2941054701805115,
"learning_rate": 8e-05,
"loss": 1.6396,
"step": 1755
},
{
"epoch": 0.3862311668316287,
"grad_norm": 0.27836698293685913,
"learning_rate": 8e-05,
"loss": 1.5689,
"step": 1756
},
{
"epoch": 0.38645111624326406,
"grad_norm": 0.29147645831108093,
"learning_rate": 8e-05,
"loss": 1.7523,
"step": 1757
},
{
"epoch": 0.3866710656548994,
"grad_norm": 0.30084285140037537,
"learning_rate": 8e-05,
"loss": 1.6932,
"step": 1758
},
{
"epoch": 0.38689101506653467,
"grad_norm": 0.2850727140903473,
"learning_rate": 8e-05,
"loss": 1.7276,
"step": 1759
},
{
"epoch": 0.38711096447817,
"grad_norm": 0.27011391520500183,
"learning_rate": 8e-05,
"loss": 1.6635,
"step": 1760
},
{
"epoch": 0.38733091388980534,
"grad_norm": 0.28682348132133484,
"learning_rate": 8e-05,
"loss": 1.7642,
"step": 1761
},
{
"epoch": 0.3875508633014407,
"grad_norm": 0.27676117420196533,
"learning_rate": 8e-05,
"loss": 1.7406,
"step": 1762
},
{
"epoch": 0.387770812713076,
"grad_norm": 0.2654523551464081,
"learning_rate": 8e-05,
"loss": 1.6484,
"step": 1763
},
{
"epoch": 0.38799076212471134,
"grad_norm": 0.28026026487350464,
"learning_rate": 8e-05,
"loss": 1.6714,
"step": 1764
},
{
"epoch": 0.3882107115363466,
"grad_norm": 0.3003789782524109,
"learning_rate": 8e-05,
"loss": 1.8121,
"step": 1765
},
{
"epoch": 0.38843066094798195,
"grad_norm": 0.35523107647895813,
"learning_rate": 8e-05,
"loss": 1.9299,
"step": 1766
},
{
"epoch": 0.3886506103596173,
"grad_norm": 0.26844245195388794,
"learning_rate": 8e-05,
"loss": 1.6358,
"step": 1767
},
{
"epoch": 0.3888705597712526,
"grad_norm": 0.27308356761932373,
"learning_rate": 8e-05,
"loss": 1.6104,
"step": 1768
},
{
"epoch": 0.38909050918288796,
"grad_norm": 0.2775373160839081,
"learning_rate": 8e-05,
"loss": 1.5679,
"step": 1769
},
{
"epoch": 0.38931045859452323,
"grad_norm": 0.29753705859184265,
"learning_rate": 8e-05,
"loss": 1.7678,
"step": 1770
},
{
"epoch": 0.38953040800615857,
"grad_norm": 0.2798722982406616,
"learning_rate": 8e-05,
"loss": 1.7034,
"step": 1771
},
{
"epoch": 0.3897503574177939,
"grad_norm": 0.2842818796634674,
"learning_rate": 8e-05,
"loss": 1.727,
"step": 1772
},
{
"epoch": 0.38997030682942924,
"grad_norm": 0.27555832266807556,
"learning_rate": 8e-05,
"loss": 1.7438,
"step": 1773
},
{
"epoch": 0.39019025624106457,
"grad_norm": 0.2824547588825226,
"learning_rate": 8e-05,
"loss": 1.5733,
"step": 1774
},
{
"epoch": 0.3904102056526999,
"grad_norm": 0.2658035159111023,
"learning_rate": 8e-05,
"loss": 1.5997,
"step": 1775
},
{
"epoch": 0.3906301550643352,
"grad_norm": 0.27601394057273865,
"learning_rate": 8e-05,
"loss": 1.7025,
"step": 1776
},
{
"epoch": 0.3908501044759705,
"grad_norm": 0.2990022897720337,
"learning_rate": 8e-05,
"loss": 1.8278,
"step": 1777
},
{
"epoch": 0.39107005388760585,
"grad_norm": 0.29378873109817505,
"learning_rate": 8e-05,
"loss": 1.7549,
"step": 1778
},
{
"epoch": 0.3912900032992412,
"grad_norm": 0.29202136397361755,
"learning_rate": 8e-05,
"loss": 1.6602,
"step": 1779
},
{
"epoch": 0.3915099527108765,
"grad_norm": 0.28191903233528137,
"learning_rate": 8e-05,
"loss": 1.6618,
"step": 1780
},
{
"epoch": 0.3917299021225118,
"grad_norm": 0.26916682720184326,
"learning_rate": 8e-05,
"loss": 1.6523,
"step": 1781
},
{
"epoch": 0.39194985153414713,
"grad_norm": 0.2886850833892822,
"learning_rate": 8e-05,
"loss": 1.5752,
"step": 1782
},
{
"epoch": 0.39216980094578247,
"grad_norm": 0.2749246656894684,
"learning_rate": 8e-05,
"loss": 1.6731,
"step": 1783
},
{
"epoch": 0.3923897503574178,
"grad_norm": 0.28945374488830566,
"learning_rate": 8e-05,
"loss": 1.7404,
"step": 1784
},
{
"epoch": 0.39260969976905313,
"grad_norm": 0.27297836542129517,
"learning_rate": 8e-05,
"loss": 1.7176,
"step": 1785
},
{
"epoch": 0.39282964918068847,
"grad_norm": 0.2738782465457916,
"learning_rate": 8e-05,
"loss": 1.6844,
"step": 1786
},
{
"epoch": 0.39304959859232375,
"grad_norm": 0.2897050082683563,
"learning_rate": 8e-05,
"loss": 1.6522,
"step": 1787
},
{
"epoch": 0.3932695480039591,
"grad_norm": 0.31031668186187744,
"learning_rate": 8e-05,
"loss": 1.8216,
"step": 1788
},
{
"epoch": 0.3934894974155944,
"grad_norm": 0.2869516909122467,
"learning_rate": 8e-05,
"loss": 1.7598,
"step": 1789
},
{
"epoch": 0.39370944682722975,
"grad_norm": 0.3080596625804901,
"learning_rate": 8e-05,
"loss": 1.7645,
"step": 1790
},
{
"epoch": 0.3939293962388651,
"grad_norm": 0.27992716431617737,
"learning_rate": 8e-05,
"loss": 1.8373,
"step": 1791
},
{
"epoch": 0.39414934565050036,
"grad_norm": 0.2761777341365814,
"learning_rate": 8e-05,
"loss": 1.6767,
"step": 1792
},
{
"epoch": 0.3943692950621357,
"grad_norm": 0.30193084478378296,
"learning_rate": 8e-05,
"loss": 1.7394,
"step": 1793
},
{
"epoch": 0.39458924447377103,
"grad_norm": 0.29375529289245605,
"learning_rate": 8e-05,
"loss": 1.761,
"step": 1794
},
{
"epoch": 0.39480919388540636,
"grad_norm": 0.32190364599227905,
"learning_rate": 8e-05,
"loss": 1.7045,
"step": 1795
},
{
"epoch": 0.3950291432970417,
"grad_norm": 0.27505311369895935,
"learning_rate": 8e-05,
"loss": 1.6678,
"step": 1796
},
{
"epoch": 0.39524909270867703,
"grad_norm": 0.28678107261657715,
"learning_rate": 8e-05,
"loss": 1.7155,
"step": 1797
},
{
"epoch": 0.3954690421203123,
"grad_norm": 0.28372088074684143,
"learning_rate": 8e-05,
"loss": 1.8833,
"step": 1798
},
{
"epoch": 0.39568899153194764,
"grad_norm": 0.27803388237953186,
"learning_rate": 8e-05,
"loss": 1.7125,
"step": 1799
},
{
"epoch": 0.395908940943583,
"grad_norm": 0.278728187084198,
"learning_rate": 8e-05,
"loss": 1.7019,
"step": 1800
},
{
"epoch": 0.3961288903552183,
"grad_norm": 0.29563671350479126,
"learning_rate": 8e-05,
"loss": 1.6588,
"step": 1801
},
{
"epoch": 0.39634883976685364,
"grad_norm": 0.35585105419158936,
"learning_rate": 8e-05,
"loss": 1.8864,
"step": 1802
},
{
"epoch": 0.3965687891784889,
"grad_norm": 0.27399691939353943,
"learning_rate": 8e-05,
"loss": 1.6222,
"step": 1803
},
{
"epoch": 0.39678873859012426,
"grad_norm": 0.2557234764099121,
"learning_rate": 8e-05,
"loss": 1.4835,
"step": 1804
},
{
"epoch": 0.3970086880017596,
"grad_norm": 0.2929818332195282,
"learning_rate": 8e-05,
"loss": 1.7737,
"step": 1805
},
{
"epoch": 0.3972286374133949,
"grad_norm": 0.279729425907135,
"learning_rate": 8e-05,
"loss": 1.6073,
"step": 1806
},
{
"epoch": 0.39744858682503026,
"grad_norm": 0.2622847259044647,
"learning_rate": 8e-05,
"loss": 1.5368,
"step": 1807
},
{
"epoch": 0.3976685362366656,
"grad_norm": 0.2704038619995117,
"learning_rate": 8e-05,
"loss": 1.5809,
"step": 1808
},
{
"epoch": 0.39788848564830087,
"grad_norm": 0.2785516679286957,
"learning_rate": 8e-05,
"loss": 1.6416,
"step": 1809
},
{
"epoch": 0.3981084350599362,
"grad_norm": 0.29611852765083313,
"learning_rate": 8e-05,
"loss": 1.7186,
"step": 1810
},
{
"epoch": 0.39832838447157154,
"grad_norm": 0.28127896785736084,
"learning_rate": 8e-05,
"loss": 1.6248,
"step": 1811
},
{
"epoch": 0.3985483338832069,
"grad_norm": 0.2746615707874298,
"learning_rate": 8e-05,
"loss": 1.7029,
"step": 1812
},
{
"epoch": 0.3987682832948422,
"grad_norm": 0.2650880515575409,
"learning_rate": 8e-05,
"loss": 1.7828,
"step": 1813
},
{
"epoch": 0.3989882327064775,
"grad_norm": 0.28278401494026184,
"learning_rate": 8e-05,
"loss": 1.6383,
"step": 1814
},
{
"epoch": 0.3992081821181128,
"grad_norm": 0.2749755382537842,
"learning_rate": 8e-05,
"loss": 1.7551,
"step": 1815
},
{
"epoch": 0.39942813152974815,
"grad_norm": 0.26788878440856934,
"learning_rate": 8e-05,
"loss": 1.5206,
"step": 1816
},
{
"epoch": 0.3996480809413835,
"grad_norm": 0.28166842460632324,
"learning_rate": 8e-05,
"loss": 1.7134,
"step": 1817
},
{
"epoch": 0.3998680303530188,
"grad_norm": 0.2781674563884735,
"learning_rate": 8e-05,
"loss": 1.7864,
"step": 1818
},
{
"epoch": 0.40008797976465416,
"grad_norm": 0.2810186445713043,
"learning_rate": 8e-05,
"loss": 1.7017,
"step": 1819
},
{
"epoch": 0.40030792917628943,
"grad_norm": 0.2872167229652405,
"learning_rate": 8e-05,
"loss": 1.563,
"step": 1820
},
{
"epoch": 0.40052787858792477,
"grad_norm": 0.2864447832107544,
"learning_rate": 8e-05,
"loss": 1.8261,
"step": 1821
},
{
"epoch": 0.4007478279995601,
"grad_norm": 0.2633639872074127,
"learning_rate": 8e-05,
"loss": 1.5864,
"step": 1822
},
{
"epoch": 0.40096777741119544,
"grad_norm": 0.3556326925754547,
"learning_rate": 8e-05,
"loss": 1.5781,
"step": 1823
},
{
"epoch": 0.40118772682283077,
"grad_norm": 0.2832813560962677,
"learning_rate": 8e-05,
"loss": 1.7145,
"step": 1824
},
{
"epoch": 0.40140767623446605,
"grad_norm": 0.2862699627876282,
"learning_rate": 8e-05,
"loss": 1.7753,
"step": 1825
},
{
"epoch": 0.4016276256461014,
"grad_norm": 0.3274460732936859,
"learning_rate": 8e-05,
"loss": 1.7979,
"step": 1826
},
{
"epoch": 0.4018475750577367,
"grad_norm": 0.277118444442749,
"learning_rate": 8e-05,
"loss": 1.6089,
"step": 1827
},
{
"epoch": 0.40206752446937205,
"grad_norm": 0.278337687253952,
"learning_rate": 8e-05,
"loss": 1.5788,
"step": 1828
},
{
"epoch": 0.4022874738810074,
"grad_norm": 0.28072914481163025,
"learning_rate": 8e-05,
"loss": 1.6735,
"step": 1829
},
{
"epoch": 0.4025074232926427,
"grad_norm": 0.2815505564212799,
"learning_rate": 8e-05,
"loss": 1.7593,
"step": 1830
},
{
"epoch": 0.402727372704278,
"grad_norm": 0.2957006096839905,
"learning_rate": 8e-05,
"loss": 1.865,
"step": 1831
},
{
"epoch": 0.40294732211591333,
"grad_norm": 0.3079582452774048,
"learning_rate": 8e-05,
"loss": 1.7421,
"step": 1832
},
{
"epoch": 0.40316727152754867,
"grad_norm": 0.2924387454986572,
"learning_rate": 8e-05,
"loss": 1.7462,
"step": 1833
},
{
"epoch": 0.403387220939184,
"grad_norm": 0.28879454731941223,
"learning_rate": 8e-05,
"loss": 1.7433,
"step": 1834
},
{
"epoch": 0.40360717035081933,
"grad_norm": 0.27446237206459045,
"learning_rate": 8e-05,
"loss": 1.5869,
"step": 1835
},
{
"epoch": 0.4038271197624546,
"grad_norm": 0.3164878487586975,
"learning_rate": 8e-05,
"loss": 1.7505,
"step": 1836
},
{
"epoch": 0.40404706917408995,
"grad_norm": 0.25979530811309814,
"learning_rate": 8e-05,
"loss": 1.6001,
"step": 1837
},
{
"epoch": 0.4042670185857253,
"grad_norm": 0.30625709891319275,
"learning_rate": 8e-05,
"loss": 1.7907,
"step": 1838
},
{
"epoch": 0.4044869679973606,
"grad_norm": 0.27351540327072144,
"learning_rate": 8e-05,
"loss": 1.5835,
"step": 1839
},
{
"epoch": 0.40470691740899595,
"grad_norm": 0.302372545003891,
"learning_rate": 8e-05,
"loss": 1.7821,
"step": 1840
},
{
"epoch": 0.4049268668206313,
"grad_norm": 0.2910183370113373,
"learning_rate": 8e-05,
"loss": 1.7993,
"step": 1841
},
{
"epoch": 0.40514681623226656,
"grad_norm": 0.2934883236885071,
"learning_rate": 8e-05,
"loss": 1.5928,
"step": 1842
},
{
"epoch": 0.4053667656439019,
"grad_norm": 0.2586327791213989,
"learning_rate": 8e-05,
"loss": 1.5714,
"step": 1843
},
{
"epoch": 0.40558671505553723,
"grad_norm": 0.27952027320861816,
"learning_rate": 8e-05,
"loss": 1.8168,
"step": 1844
},
{
"epoch": 0.40580666446717256,
"grad_norm": 0.2987437844276428,
"learning_rate": 8e-05,
"loss": 1.555,
"step": 1845
},
{
"epoch": 0.4060266138788079,
"grad_norm": 0.29165002703666687,
"learning_rate": 8e-05,
"loss": 1.6865,
"step": 1846
},
{
"epoch": 0.4062465632904432,
"grad_norm": 0.2825503945350647,
"learning_rate": 8e-05,
"loss": 1.8814,
"step": 1847
},
{
"epoch": 0.4064665127020785,
"grad_norm": 0.27995482087135315,
"learning_rate": 8e-05,
"loss": 1.6897,
"step": 1848
},
{
"epoch": 0.40668646211371384,
"grad_norm": 0.2735064923763275,
"learning_rate": 8e-05,
"loss": 1.7279,
"step": 1849
},
{
"epoch": 0.4069064115253492,
"grad_norm": 0.2850511074066162,
"learning_rate": 8e-05,
"loss": 1.6459,
"step": 1850
},
{
"epoch": 0.4071263609369845,
"grad_norm": 0.3000599145889282,
"learning_rate": 8e-05,
"loss": 1.7301,
"step": 1851
},
{
"epoch": 0.40734631034861984,
"grad_norm": 0.2768002152442932,
"learning_rate": 8e-05,
"loss": 1.5748,
"step": 1852
},
{
"epoch": 0.4075662597602551,
"grad_norm": 0.26737141609191895,
"learning_rate": 8e-05,
"loss": 1.5895,
"step": 1853
},
{
"epoch": 0.40778620917189046,
"grad_norm": 0.26408424973487854,
"learning_rate": 8e-05,
"loss": 1.5611,
"step": 1854
},
{
"epoch": 0.4080061585835258,
"grad_norm": 0.2646276354789734,
"learning_rate": 8e-05,
"loss": 1.5865,
"step": 1855
},
{
"epoch": 0.4082261079951611,
"grad_norm": 0.27871212363243103,
"learning_rate": 8e-05,
"loss": 1.8202,
"step": 1856
},
{
"epoch": 0.40844605740679646,
"grad_norm": 0.3234533965587616,
"learning_rate": 8e-05,
"loss": 1.8213,
"step": 1857
},
{
"epoch": 0.40866600681843174,
"grad_norm": 0.2705099284648895,
"learning_rate": 8e-05,
"loss": 1.6637,
"step": 1858
},
{
"epoch": 0.40888595623006707,
"grad_norm": 0.28647711873054504,
"learning_rate": 8e-05,
"loss": 1.7396,
"step": 1859
},
{
"epoch": 0.4091059056417024,
"grad_norm": 0.2812083959579468,
"learning_rate": 8e-05,
"loss": 1.5663,
"step": 1860
},
{
"epoch": 0.40932585505333774,
"grad_norm": 0.2818193733692169,
"learning_rate": 8e-05,
"loss": 1.6073,
"step": 1861
},
{
"epoch": 0.4095458044649731,
"grad_norm": 0.29906994104385376,
"learning_rate": 8e-05,
"loss": 1.7061,
"step": 1862
},
{
"epoch": 0.4097657538766084,
"grad_norm": 0.27941465377807617,
"learning_rate": 8e-05,
"loss": 1.7282,
"step": 1863
},
{
"epoch": 0.4099857032882437,
"grad_norm": 0.27629899978637695,
"learning_rate": 8e-05,
"loss": 1.6879,
"step": 1864
},
{
"epoch": 0.410205652699879,
"grad_norm": 0.2792319059371948,
"learning_rate": 8e-05,
"loss": 1.7196,
"step": 1865
},
{
"epoch": 0.41042560211151435,
"grad_norm": 0.2763090431690216,
"learning_rate": 8e-05,
"loss": 1.673,
"step": 1866
},
{
"epoch": 0.4106455515231497,
"grad_norm": 0.2930999994277954,
"learning_rate": 8e-05,
"loss": 1.6919,
"step": 1867
},
{
"epoch": 0.410865500934785,
"grad_norm": 0.2748461365699768,
"learning_rate": 8e-05,
"loss": 1.7553,
"step": 1868
},
{
"epoch": 0.4110854503464203,
"grad_norm": 0.2742187976837158,
"learning_rate": 8e-05,
"loss": 1.6786,
"step": 1869
},
{
"epoch": 0.41130539975805563,
"grad_norm": 0.3050731420516968,
"learning_rate": 8e-05,
"loss": 1.4902,
"step": 1870
},
{
"epoch": 0.41152534916969097,
"grad_norm": 0.29456627368927,
"learning_rate": 8e-05,
"loss": 1.758,
"step": 1871
},
{
"epoch": 0.4117452985813263,
"grad_norm": 0.2844219505786896,
"learning_rate": 8e-05,
"loss": 1.6206,
"step": 1872
},
{
"epoch": 0.41196524799296164,
"grad_norm": 0.28889915347099304,
"learning_rate": 8e-05,
"loss": 1.6907,
"step": 1873
},
{
"epoch": 0.41218519740459697,
"grad_norm": 0.27245181798934937,
"learning_rate": 8e-05,
"loss": 1.6749,
"step": 1874
},
{
"epoch": 0.41240514681623225,
"grad_norm": 0.2927252948284149,
"learning_rate": 8e-05,
"loss": 1.6382,
"step": 1875
},
{
"epoch": 0.4126250962278676,
"grad_norm": 0.27153030037879944,
"learning_rate": 8e-05,
"loss": 1.6011,
"step": 1876
},
{
"epoch": 0.4128450456395029,
"grad_norm": 0.2807110846042633,
"learning_rate": 8e-05,
"loss": 1.7126,
"step": 1877
},
{
"epoch": 0.41306499505113825,
"grad_norm": 0.27375784516334534,
"learning_rate": 8e-05,
"loss": 1.7443,
"step": 1878
},
{
"epoch": 0.4132849444627736,
"grad_norm": 0.27330929040908813,
"learning_rate": 8e-05,
"loss": 1.6305,
"step": 1879
},
{
"epoch": 0.41350489387440886,
"grad_norm": 0.27126336097717285,
"learning_rate": 8e-05,
"loss": 1.6688,
"step": 1880
},
{
"epoch": 0.4137248432860442,
"grad_norm": 0.2768147885799408,
"learning_rate": 8e-05,
"loss": 1.7274,
"step": 1881
},
{
"epoch": 0.41394479269767953,
"grad_norm": 0.2686031460762024,
"learning_rate": 8e-05,
"loss": 1.6445,
"step": 1882
},
{
"epoch": 0.41416474210931487,
"grad_norm": 0.27737778425216675,
"learning_rate": 8e-05,
"loss": 1.5226,
"step": 1883
},
{
"epoch": 0.4143846915209502,
"grad_norm": 0.2761901319026947,
"learning_rate": 8e-05,
"loss": 1.6884,
"step": 1884
},
{
"epoch": 0.41460464093258553,
"grad_norm": 0.28609856963157654,
"learning_rate": 8e-05,
"loss": 1.7719,
"step": 1885
},
{
"epoch": 0.4148245903442208,
"grad_norm": 0.2904943525791168,
"learning_rate": 8e-05,
"loss": 1.6979,
"step": 1886
},
{
"epoch": 0.41504453975585615,
"grad_norm": 0.3016435503959656,
"learning_rate": 8e-05,
"loss": 1.7912,
"step": 1887
},
{
"epoch": 0.4152644891674915,
"grad_norm": 0.27562782168388367,
"learning_rate": 8e-05,
"loss": 1.5822,
"step": 1888
},
{
"epoch": 0.4154844385791268,
"grad_norm": 0.2841348648071289,
"learning_rate": 8e-05,
"loss": 1.7524,
"step": 1889
},
{
"epoch": 0.41570438799076215,
"grad_norm": 0.26393935084342957,
"learning_rate": 8e-05,
"loss": 1.6219,
"step": 1890
},
{
"epoch": 0.4159243374023974,
"grad_norm": 0.2792678773403168,
"learning_rate": 8e-05,
"loss": 1.7243,
"step": 1891
},
{
"epoch": 0.41614428681403276,
"grad_norm": 0.291425496339798,
"learning_rate": 8e-05,
"loss": 1.7499,
"step": 1892
},
{
"epoch": 0.4163642362256681,
"grad_norm": 0.2737634778022766,
"learning_rate": 8e-05,
"loss": 1.6565,
"step": 1893
},
{
"epoch": 0.41658418563730343,
"grad_norm": 0.26807767152786255,
"learning_rate": 8e-05,
"loss": 1.6149,
"step": 1894
},
{
"epoch": 0.41680413504893876,
"grad_norm": 0.28826507925987244,
"learning_rate": 8e-05,
"loss": 1.6857,
"step": 1895
},
{
"epoch": 0.41702408446057404,
"grad_norm": 0.27604466676712036,
"learning_rate": 8e-05,
"loss": 1.7689,
"step": 1896
},
{
"epoch": 0.4172440338722094,
"grad_norm": 0.27355703711509705,
"learning_rate": 8e-05,
"loss": 1.7254,
"step": 1897
},
{
"epoch": 0.4174639832838447,
"grad_norm": 0.26692044734954834,
"learning_rate": 8e-05,
"loss": 1.6372,
"step": 1898
},
{
"epoch": 0.41768393269548004,
"grad_norm": 0.27527916431427,
"learning_rate": 8e-05,
"loss": 1.6913,
"step": 1899
},
{
"epoch": 0.4179038821071154,
"grad_norm": 0.26881837844848633,
"learning_rate": 8e-05,
"loss": 1.6663,
"step": 1900
},
{
"epoch": 0.4181238315187507,
"grad_norm": 0.27977946400642395,
"learning_rate": 8e-05,
"loss": 1.8117,
"step": 1901
},
{
"epoch": 0.418343780930386,
"grad_norm": 0.2958911955356598,
"learning_rate": 8e-05,
"loss": 1.6603,
"step": 1902
},
{
"epoch": 0.4185637303420213,
"grad_norm": 0.2845151424407959,
"learning_rate": 8e-05,
"loss": 1.7517,
"step": 1903
},
{
"epoch": 0.41878367975365666,
"grad_norm": 0.2804581820964813,
"learning_rate": 8e-05,
"loss": 1.765,
"step": 1904
},
{
"epoch": 0.419003629165292,
"grad_norm": 0.29568520188331604,
"learning_rate": 8e-05,
"loss": 1.6995,
"step": 1905
},
{
"epoch": 0.4192235785769273,
"grad_norm": 0.303100049495697,
"learning_rate": 8e-05,
"loss": 1.77,
"step": 1906
},
{
"epoch": 0.4194435279885626,
"grad_norm": 0.26847636699676514,
"learning_rate": 8e-05,
"loss": 1.6964,
"step": 1907
},
{
"epoch": 0.41966347740019794,
"grad_norm": 0.2791590094566345,
"learning_rate": 8e-05,
"loss": 1.5912,
"step": 1908
},
{
"epoch": 0.41988342681183327,
"grad_norm": 0.2687268555164337,
"learning_rate": 8e-05,
"loss": 1.6163,
"step": 1909
},
{
"epoch": 0.4201033762234686,
"grad_norm": 0.29087433218955994,
"learning_rate": 8e-05,
"loss": 1.6588,
"step": 1910
},
{
"epoch": 0.42032332563510394,
"grad_norm": 0.29639971256256104,
"learning_rate": 8e-05,
"loss": 1.6911,
"step": 1911
},
{
"epoch": 0.4205432750467393,
"grad_norm": 0.27669841051101685,
"learning_rate": 8e-05,
"loss": 1.7078,
"step": 1912
},
{
"epoch": 0.42076322445837455,
"grad_norm": 0.2851327955722809,
"learning_rate": 8e-05,
"loss": 1.7217,
"step": 1913
},
{
"epoch": 0.4209831738700099,
"grad_norm": 0.27069011330604553,
"learning_rate": 8e-05,
"loss": 1.7076,
"step": 1914
},
{
"epoch": 0.4212031232816452,
"grad_norm": 0.26195240020751953,
"learning_rate": 8e-05,
"loss": 1.6647,
"step": 1915
},
{
"epoch": 0.42142307269328055,
"grad_norm": 0.3046209216117859,
"learning_rate": 8e-05,
"loss": 1.5303,
"step": 1916
},
{
"epoch": 0.4216430221049159,
"grad_norm": 0.29437899589538574,
"learning_rate": 8e-05,
"loss": 1.6589,
"step": 1917
},
{
"epoch": 0.42186297151655117,
"grad_norm": 0.2954728603363037,
"learning_rate": 8e-05,
"loss": 1.777,
"step": 1918
},
{
"epoch": 0.4220829209281865,
"grad_norm": 0.2612738609313965,
"learning_rate": 8e-05,
"loss": 1.5668,
"step": 1919
},
{
"epoch": 0.42230287033982183,
"grad_norm": 0.3015122413635254,
"learning_rate": 8e-05,
"loss": 1.7861,
"step": 1920
},
{
"epoch": 0.42252281975145717,
"grad_norm": 0.3785838484764099,
"learning_rate": 8e-05,
"loss": 1.8979,
"step": 1921
},
{
"epoch": 0.4227427691630925,
"grad_norm": 0.2849038541316986,
"learning_rate": 8e-05,
"loss": 1.6891,
"step": 1922
},
{
"epoch": 0.42296271857472784,
"grad_norm": 0.278728723526001,
"learning_rate": 8e-05,
"loss": 1.7891,
"step": 1923
},
{
"epoch": 0.4231826679863631,
"grad_norm": 0.27032172679901123,
"learning_rate": 8e-05,
"loss": 1.6963,
"step": 1924
},
{
"epoch": 0.42340261739799845,
"grad_norm": 0.2731832265853882,
"learning_rate": 8e-05,
"loss": 1.632,
"step": 1925
},
{
"epoch": 0.4236225668096338,
"grad_norm": 0.30378425121307373,
"learning_rate": 8e-05,
"loss": 1.7823,
"step": 1926
},
{
"epoch": 0.4238425162212691,
"grad_norm": 0.27693971991539,
"learning_rate": 8e-05,
"loss": 1.483,
"step": 1927
},
{
"epoch": 0.42406246563290445,
"grad_norm": 0.2719477415084839,
"learning_rate": 8e-05,
"loss": 1.6708,
"step": 1928
},
{
"epoch": 0.42428241504453973,
"grad_norm": 0.26625335216522217,
"learning_rate": 8e-05,
"loss": 1.4946,
"step": 1929
},
{
"epoch": 0.42450236445617506,
"grad_norm": 0.2843473553657532,
"learning_rate": 8e-05,
"loss": 1.722,
"step": 1930
},
{
"epoch": 0.4247223138678104,
"grad_norm": 0.3453083336353302,
"learning_rate": 8e-05,
"loss": 1.7238,
"step": 1931
},
{
"epoch": 0.42494226327944573,
"grad_norm": 0.25626078248023987,
"learning_rate": 8e-05,
"loss": 1.4706,
"step": 1932
},
{
"epoch": 0.42516221269108107,
"grad_norm": 0.2908123731613159,
"learning_rate": 8e-05,
"loss": 1.7105,
"step": 1933
},
{
"epoch": 0.4253821621027164,
"grad_norm": 0.33517104387283325,
"learning_rate": 8e-05,
"loss": 1.8023,
"step": 1934
},
{
"epoch": 0.4256021115143517,
"grad_norm": 0.28047069907188416,
"learning_rate": 8e-05,
"loss": 1.6266,
"step": 1935
},
{
"epoch": 0.425822060925987,
"grad_norm": 0.2778942584991455,
"learning_rate": 8e-05,
"loss": 1.6866,
"step": 1936
},
{
"epoch": 0.42604201033762235,
"grad_norm": 0.3038877248764038,
"learning_rate": 8e-05,
"loss": 1.6075,
"step": 1937
},
{
"epoch": 0.4262619597492577,
"grad_norm": 0.2814297378063202,
"learning_rate": 8e-05,
"loss": 1.5939,
"step": 1938
},
{
"epoch": 0.426481909160893,
"grad_norm": 0.27854403853416443,
"learning_rate": 8e-05,
"loss": 1.5943,
"step": 1939
},
{
"epoch": 0.4267018585725283,
"grad_norm": 0.2924019694328308,
"learning_rate": 8e-05,
"loss": 1.8193,
"step": 1940
},
{
"epoch": 0.4269218079841636,
"grad_norm": 0.2862766683101654,
"learning_rate": 8e-05,
"loss": 1.6065,
"step": 1941
},
{
"epoch": 0.42714175739579896,
"grad_norm": 0.2696346342563629,
"learning_rate": 8e-05,
"loss": 1.5343,
"step": 1942
},
{
"epoch": 0.4273617068074343,
"grad_norm": 0.2578338384628296,
"learning_rate": 8e-05,
"loss": 1.5786,
"step": 1943
},
{
"epoch": 0.42758165621906963,
"grad_norm": 0.28594937920570374,
"learning_rate": 8e-05,
"loss": 1.725,
"step": 1944
},
{
"epoch": 0.42780160563070496,
"grad_norm": 0.2808282971382141,
"learning_rate": 8e-05,
"loss": 1.7951,
"step": 1945
},
{
"epoch": 0.42802155504234024,
"grad_norm": 0.32533401250839233,
"learning_rate": 8e-05,
"loss": 1.9645,
"step": 1946
},
{
"epoch": 0.4282415044539756,
"grad_norm": 0.2737642228603363,
"learning_rate": 8e-05,
"loss": 1.6243,
"step": 1947
},
{
"epoch": 0.4284614538656109,
"grad_norm": 0.2885657250881195,
"learning_rate": 8e-05,
"loss": 1.7338,
"step": 1948
},
{
"epoch": 0.42868140327724624,
"grad_norm": 0.2788100242614746,
"learning_rate": 8e-05,
"loss": 1.76,
"step": 1949
},
{
"epoch": 0.4289013526888816,
"grad_norm": 0.2899073362350464,
"learning_rate": 8e-05,
"loss": 1.7739,
"step": 1950
},
{
"epoch": 0.42912130210051685,
"grad_norm": 0.2874782681465149,
"learning_rate": 8e-05,
"loss": 1.8283,
"step": 1951
},
{
"epoch": 0.4293412515121522,
"grad_norm": 0.2757413685321808,
"learning_rate": 8e-05,
"loss": 1.641,
"step": 1952
},
{
"epoch": 0.4295612009237875,
"grad_norm": 0.2811121940612793,
"learning_rate": 8e-05,
"loss": 1.7231,
"step": 1953
},
{
"epoch": 0.42978115033542286,
"grad_norm": 0.3400493860244751,
"learning_rate": 8e-05,
"loss": 1.8431,
"step": 1954
},
{
"epoch": 0.4300010997470582,
"grad_norm": 0.29006627202033997,
"learning_rate": 8e-05,
"loss": 1.7438,
"step": 1955
},
{
"epoch": 0.4302210491586935,
"grad_norm": 0.30233392119407654,
"learning_rate": 8e-05,
"loss": 1.6603,
"step": 1956
},
{
"epoch": 0.4304409985703288,
"grad_norm": 0.2921263873577118,
"learning_rate": 8e-05,
"loss": 1.6604,
"step": 1957
},
{
"epoch": 0.43066094798196414,
"grad_norm": 0.27695250511169434,
"learning_rate": 8e-05,
"loss": 1.7536,
"step": 1958
},
{
"epoch": 0.43088089739359947,
"grad_norm": 0.2827337980270386,
"learning_rate": 8e-05,
"loss": 1.6324,
"step": 1959
},
{
"epoch": 0.4311008468052348,
"grad_norm": 0.27993375062942505,
"learning_rate": 8e-05,
"loss": 1.7168,
"step": 1960
},
{
"epoch": 0.43132079621687014,
"grad_norm": 0.2801220417022705,
"learning_rate": 8e-05,
"loss": 1.705,
"step": 1961
},
{
"epoch": 0.4315407456285054,
"grad_norm": 0.27520567178726196,
"learning_rate": 8e-05,
"loss": 1.664,
"step": 1962
},
{
"epoch": 0.43176069504014075,
"grad_norm": 0.26910632848739624,
"learning_rate": 8e-05,
"loss": 1.3616,
"step": 1963
},
{
"epoch": 0.4319806444517761,
"grad_norm": 0.27770352363586426,
"learning_rate": 8e-05,
"loss": 1.6689,
"step": 1964
},
{
"epoch": 0.4322005938634114,
"grad_norm": 0.27606719732284546,
"learning_rate": 8e-05,
"loss": 1.6644,
"step": 1965
},
{
"epoch": 0.43242054327504675,
"grad_norm": 0.27787330746650696,
"learning_rate": 8e-05,
"loss": 1.8854,
"step": 1966
},
{
"epoch": 0.4326404926866821,
"grad_norm": 0.26479870080947876,
"learning_rate": 8e-05,
"loss": 1.5904,
"step": 1967
},
{
"epoch": 0.43286044209831737,
"grad_norm": 0.27598053216934204,
"learning_rate": 8e-05,
"loss": 1.5666,
"step": 1968
},
{
"epoch": 0.4330803915099527,
"grad_norm": 0.27461937069892883,
"learning_rate": 8e-05,
"loss": 1.5487,
"step": 1969
},
{
"epoch": 0.43330034092158803,
"grad_norm": 0.2928270399570465,
"learning_rate": 8e-05,
"loss": 1.8173,
"step": 1970
},
{
"epoch": 0.43352029033322337,
"grad_norm": 0.30754199624061584,
"learning_rate": 8e-05,
"loss": 1.6762,
"step": 1971
},
{
"epoch": 0.4337402397448587,
"grad_norm": 0.2676936089992523,
"learning_rate": 8e-05,
"loss": 1.7314,
"step": 1972
},
{
"epoch": 0.433960189156494,
"grad_norm": 0.2919710576534271,
"learning_rate": 8e-05,
"loss": 1.8586,
"step": 1973
},
{
"epoch": 0.4341801385681293,
"grad_norm": 0.28165963292121887,
"learning_rate": 8e-05,
"loss": 1.7943,
"step": 1974
},
{
"epoch": 0.43440008797976465,
"grad_norm": 0.2700537443161011,
"learning_rate": 8e-05,
"loss": 1.613,
"step": 1975
},
{
"epoch": 0.4346200373914,
"grad_norm": 0.26830658316612244,
"learning_rate": 8e-05,
"loss": 1.5854,
"step": 1976
},
{
"epoch": 0.4348399868030353,
"grad_norm": 0.28799256682395935,
"learning_rate": 8e-05,
"loss": 1.8246,
"step": 1977
},
{
"epoch": 0.43505993621467065,
"grad_norm": 0.27226150035858154,
"learning_rate": 8e-05,
"loss": 1.6252,
"step": 1978
},
{
"epoch": 0.43527988562630593,
"grad_norm": 0.2646162807941437,
"learning_rate": 8e-05,
"loss": 1.5699,
"step": 1979
},
{
"epoch": 0.43549983503794126,
"grad_norm": 0.27331140637397766,
"learning_rate": 8e-05,
"loss": 1.6893,
"step": 1980
},
{
"epoch": 0.4357197844495766,
"grad_norm": 0.26996269822120667,
"learning_rate": 8e-05,
"loss": 1.6004,
"step": 1981
},
{
"epoch": 0.43593973386121193,
"grad_norm": 0.29484307765960693,
"learning_rate": 8e-05,
"loss": 1.551,
"step": 1982
},
{
"epoch": 0.43615968327284727,
"grad_norm": 0.28224268555641174,
"learning_rate": 8e-05,
"loss": 1.6898,
"step": 1983
},
{
"epoch": 0.43637963268448254,
"grad_norm": 0.26172178983688354,
"learning_rate": 8e-05,
"loss": 1.4375,
"step": 1984
},
{
"epoch": 0.4365995820961179,
"grad_norm": 0.2603735029697418,
"learning_rate": 8e-05,
"loss": 1.4528,
"step": 1985
},
{
"epoch": 0.4368195315077532,
"grad_norm": 0.30643707513809204,
"learning_rate": 8e-05,
"loss": 1.688,
"step": 1986
},
{
"epoch": 0.43703948091938855,
"grad_norm": 0.2951216995716095,
"learning_rate": 8e-05,
"loss": 1.7769,
"step": 1987
},
{
"epoch": 0.4372594303310239,
"grad_norm": 0.2939329445362091,
"learning_rate": 8e-05,
"loss": 1.8161,
"step": 1988
},
{
"epoch": 0.4374793797426592,
"grad_norm": 0.27539846301078796,
"learning_rate": 8e-05,
"loss": 1.6019,
"step": 1989
},
{
"epoch": 0.4376993291542945,
"grad_norm": 0.2770693898200989,
"learning_rate": 8e-05,
"loss": 1.5972,
"step": 1990
},
{
"epoch": 0.4379192785659298,
"grad_norm": 0.2832552492618561,
"learning_rate": 8e-05,
"loss": 1.7467,
"step": 1991
},
{
"epoch": 0.43813922797756516,
"grad_norm": 0.2983148992061615,
"learning_rate": 8e-05,
"loss": 1.7181,
"step": 1992
},
{
"epoch": 0.4383591773892005,
"grad_norm": 0.2829340994358063,
"learning_rate": 8e-05,
"loss": 1.5984,
"step": 1993
},
{
"epoch": 0.43857912680083583,
"grad_norm": 0.2857687473297119,
"learning_rate": 8e-05,
"loss": 1.6471,
"step": 1994
},
{
"epoch": 0.4387990762124711,
"grad_norm": 0.2669824957847595,
"learning_rate": 8e-05,
"loss": 1.6215,
"step": 1995
},
{
"epoch": 0.43901902562410644,
"grad_norm": 0.28832894563674927,
"learning_rate": 8e-05,
"loss": 1.6884,
"step": 1996
},
{
"epoch": 0.4392389750357418,
"grad_norm": 0.2919970154762268,
"learning_rate": 8e-05,
"loss": 1.7462,
"step": 1997
},
{
"epoch": 0.4394589244473771,
"grad_norm": 0.2998509109020233,
"learning_rate": 8e-05,
"loss": 1.7219,
"step": 1998
},
{
"epoch": 0.43967887385901244,
"grad_norm": 0.2780647575855255,
"learning_rate": 8e-05,
"loss": 1.8219,
"step": 1999
},
{
"epoch": 0.4398988232706478,
"grad_norm": 0.2833268940448761,
"learning_rate": 8e-05,
"loss": 1.6873,
"step": 2000
},
{
"epoch": 0.44011877268228305,
"grad_norm": 0.2802470624446869,
"learning_rate": 8e-05,
"loss": 1.7249,
"step": 2001
},
{
"epoch": 0.4403387220939184,
"grad_norm": 0.2767699658870697,
"learning_rate": 8e-05,
"loss": 1.6789,
"step": 2002
},
{
"epoch": 0.4405586715055537,
"grad_norm": 0.28534451127052307,
"learning_rate": 8e-05,
"loss": 1.6269,
"step": 2003
},
{
"epoch": 0.44077862091718906,
"grad_norm": 0.28716540336608887,
"learning_rate": 8e-05,
"loss": 1.8097,
"step": 2004
},
{
"epoch": 0.4409985703288244,
"grad_norm": 0.27516409754753113,
"learning_rate": 8e-05,
"loss": 1.837,
"step": 2005
},
{
"epoch": 0.44121851974045967,
"grad_norm": 0.2910866439342499,
"learning_rate": 8e-05,
"loss": 1.61,
"step": 2006
},
{
"epoch": 0.441438469152095,
"grad_norm": 0.2851128578186035,
"learning_rate": 8e-05,
"loss": 1.6939,
"step": 2007
},
{
"epoch": 0.44165841856373034,
"grad_norm": 0.2709331214427948,
"learning_rate": 8e-05,
"loss": 1.6352,
"step": 2008
},
{
"epoch": 0.44187836797536567,
"grad_norm": 0.28077712655067444,
"learning_rate": 8e-05,
"loss": 1.6119,
"step": 2009
},
{
"epoch": 0.442098317387001,
"grad_norm": 0.2804681956768036,
"learning_rate": 8e-05,
"loss": 1.706,
"step": 2010
},
{
"epoch": 0.44231826679863634,
"grad_norm": 0.28305575251579285,
"learning_rate": 8e-05,
"loss": 1.7501,
"step": 2011
},
{
"epoch": 0.4425382162102716,
"grad_norm": 0.30372944474220276,
"learning_rate": 8e-05,
"loss": 1.69,
"step": 2012
},
{
"epoch": 0.44275816562190695,
"grad_norm": 0.2695739269256592,
"learning_rate": 8e-05,
"loss": 1.4976,
"step": 2013
},
{
"epoch": 0.4429781150335423,
"grad_norm": 0.27175822854042053,
"learning_rate": 8e-05,
"loss": 1.6941,
"step": 2014
},
{
"epoch": 0.4431980644451776,
"grad_norm": 0.2786177396774292,
"learning_rate": 8e-05,
"loss": 1.719,
"step": 2015
},
{
"epoch": 0.44341801385681295,
"grad_norm": 0.26625001430511475,
"learning_rate": 8e-05,
"loss": 1.6197,
"step": 2016
},
{
"epoch": 0.44363796326844823,
"grad_norm": 0.29516807198524475,
"learning_rate": 8e-05,
"loss": 1.7305,
"step": 2017
},
{
"epoch": 0.44385791268008357,
"grad_norm": 0.39562076330184937,
"learning_rate": 8e-05,
"loss": 1.783,
"step": 2018
},
{
"epoch": 0.4440778620917189,
"grad_norm": 0.27659425139427185,
"learning_rate": 8e-05,
"loss": 1.6948,
"step": 2019
},
{
"epoch": 0.44429781150335423,
"grad_norm": 0.2787366211414337,
"learning_rate": 8e-05,
"loss": 1.6237,
"step": 2020
},
{
"epoch": 0.44451776091498957,
"grad_norm": 0.27939459681510925,
"learning_rate": 8e-05,
"loss": 1.7628,
"step": 2021
},
{
"epoch": 0.4447377103266249,
"grad_norm": 0.27395081520080566,
"learning_rate": 8e-05,
"loss": 1.5619,
"step": 2022
},
{
"epoch": 0.4449576597382602,
"grad_norm": 0.28255096077919006,
"learning_rate": 8e-05,
"loss": 1.7556,
"step": 2023
},
{
"epoch": 0.4451776091498955,
"grad_norm": 0.2922489643096924,
"learning_rate": 8e-05,
"loss": 1.624,
"step": 2024
},
{
"epoch": 0.44539755856153085,
"grad_norm": 0.4039583206176758,
"learning_rate": 8e-05,
"loss": 1.507,
"step": 2025
},
{
"epoch": 0.4456175079731662,
"grad_norm": 0.28025928139686584,
"learning_rate": 8e-05,
"loss": 1.8057,
"step": 2026
},
{
"epoch": 0.4458374573848015,
"grad_norm": 0.2777588963508606,
"learning_rate": 8e-05,
"loss": 1.6104,
"step": 2027
},
{
"epoch": 0.4460574067964368,
"grad_norm": 0.2689501941204071,
"learning_rate": 8e-05,
"loss": 1.5339,
"step": 2028
},
{
"epoch": 0.44627735620807213,
"grad_norm": 0.28021785616874695,
"learning_rate": 8e-05,
"loss": 1.7819,
"step": 2029
},
{
"epoch": 0.44649730561970746,
"grad_norm": 0.26980918645858765,
"learning_rate": 8e-05,
"loss": 1.5995,
"step": 2030
},
{
"epoch": 0.4467172550313428,
"grad_norm": 0.293047696352005,
"learning_rate": 8e-05,
"loss": 1.8571,
"step": 2031
},
{
"epoch": 0.44693720444297813,
"grad_norm": 0.2841939330101013,
"learning_rate": 8e-05,
"loss": 1.5794,
"step": 2032
},
{
"epoch": 0.44715715385461346,
"grad_norm": 0.2845712900161743,
"learning_rate": 8e-05,
"loss": 1.6635,
"step": 2033
},
{
"epoch": 0.44737710326624874,
"grad_norm": 0.27919885516166687,
"learning_rate": 8e-05,
"loss": 1.7094,
"step": 2034
},
{
"epoch": 0.4475970526778841,
"grad_norm": 0.30076712369918823,
"learning_rate": 8e-05,
"loss": 1.6989,
"step": 2035
},
{
"epoch": 0.4478170020895194,
"grad_norm": 0.2666080892086029,
"learning_rate": 8e-05,
"loss": 1.689,
"step": 2036
},
{
"epoch": 0.44803695150115475,
"grad_norm": 0.29133087396621704,
"learning_rate": 8e-05,
"loss": 1.8559,
"step": 2037
},
{
"epoch": 0.4482569009127901,
"grad_norm": 0.28610843420028687,
"learning_rate": 8e-05,
"loss": 1.6556,
"step": 2038
},
{
"epoch": 0.44847685032442536,
"grad_norm": 0.26724278926849365,
"learning_rate": 8e-05,
"loss": 1.6104,
"step": 2039
},
{
"epoch": 0.4486967997360607,
"grad_norm": 0.2724173665046692,
"learning_rate": 8e-05,
"loss": 1.645,
"step": 2040
},
{
"epoch": 0.448916749147696,
"grad_norm": 0.26570823788642883,
"learning_rate": 8e-05,
"loss": 1.4611,
"step": 2041
},
{
"epoch": 0.44913669855933136,
"grad_norm": 0.29498788714408875,
"learning_rate": 8e-05,
"loss": 1.8787,
"step": 2042
},
{
"epoch": 0.4493566479709667,
"grad_norm": 0.28211459517478943,
"learning_rate": 8e-05,
"loss": 1.6062,
"step": 2043
},
{
"epoch": 0.44957659738260203,
"grad_norm": 0.3148192763328552,
"learning_rate": 8e-05,
"loss": 1.8734,
"step": 2044
},
{
"epoch": 0.4497965467942373,
"grad_norm": 0.27721115946769714,
"learning_rate": 8e-05,
"loss": 1.7274,
"step": 2045
},
{
"epoch": 0.45001649620587264,
"grad_norm": 0.29178541898727417,
"learning_rate": 8e-05,
"loss": 1.7568,
"step": 2046
},
{
"epoch": 0.450236445617508,
"grad_norm": 0.27845948934555054,
"learning_rate": 8e-05,
"loss": 1.6615,
"step": 2047
},
{
"epoch": 0.4504563950291433,
"grad_norm": 0.2741856873035431,
"learning_rate": 8e-05,
"loss": 1.5667,
"step": 2048
},
{
"epoch": 0.45067634444077864,
"grad_norm": 0.28572753071784973,
"learning_rate": 8e-05,
"loss": 1.7115,
"step": 2049
},
{
"epoch": 0.4508962938524139,
"grad_norm": 0.2769505977630615,
"learning_rate": 8e-05,
"loss": 1.6297,
"step": 2050
},
{
"epoch": 0.45111624326404925,
"grad_norm": 0.27633893489837646,
"learning_rate": 8e-05,
"loss": 1.6455,
"step": 2051
},
{
"epoch": 0.4513361926756846,
"grad_norm": 0.28455862402915955,
"learning_rate": 8e-05,
"loss": 1.7496,
"step": 2052
},
{
"epoch": 0.4515561420873199,
"grad_norm": 0.2920532524585724,
"learning_rate": 8e-05,
"loss": 1.7671,
"step": 2053
},
{
"epoch": 0.45177609149895526,
"grad_norm": 0.27528077363967896,
"learning_rate": 8e-05,
"loss": 1.6535,
"step": 2054
},
{
"epoch": 0.4519960409105906,
"grad_norm": 0.2949519157409668,
"learning_rate": 8e-05,
"loss": 1.6796,
"step": 2055
},
{
"epoch": 0.45221599032222587,
"grad_norm": 0.2740989923477173,
"learning_rate": 8e-05,
"loss": 1.4341,
"step": 2056
},
{
"epoch": 0.4524359397338612,
"grad_norm": 0.31732696294784546,
"learning_rate": 8e-05,
"loss": 1.717,
"step": 2057
},
{
"epoch": 0.45265588914549654,
"grad_norm": 0.2747776210308075,
"learning_rate": 8e-05,
"loss": 1.7162,
"step": 2058
},
{
"epoch": 0.45287583855713187,
"grad_norm": 0.3037000596523285,
"learning_rate": 8e-05,
"loss": 1.6399,
"step": 2059
},
{
"epoch": 0.4530957879687672,
"grad_norm": 0.29499107599258423,
"learning_rate": 8e-05,
"loss": 1.6306,
"step": 2060
},
{
"epoch": 0.4533157373804025,
"grad_norm": 0.2890235483646393,
"learning_rate": 8e-05,
"loss": 1.8048,
"step": 2061
},
{
"epoch": 0.4535356867920378,
"grad_norm": 0.28515708446502686,
"learning_rate": 8e-05,
"loss": 1.843,
"step": 2062
},
{
"epoch": 0.45375563620367315,
"grad_norm": 0.28525930643081665,
"learning_rate": 8e-05,
"loss": 1.6568,
"step": 2063
},
{
"epoch": 0.4539755856153085,
"grad_norm": 0.27117711305618286,
"learning_rate": 8e-05,
"loss": 1.6605,
"step": 2064
},
{
"epoch": 0.4541955350269438,
"grad_norm": 0.2771861255168915,
"learning_rate": 8e-05,
"loss": 1.7553,
"step": 2065
},
{
"epoch": 0.45441548443857915,
"grad_norm": 0.29263827204704285,
"learning_rate": 8e-05,
"loss": 1.7919,
"step": 2066
},
{
"epoch": 0.45463543385021443,
"grad_norm": 0.2785603702068329,
"learning_rate": 8e-05,
"loss": 1.6686,
"step": 2067
},
{
"epoch": 0.45485538326184977,
"grad_norm": 0.2752209007740021,
"learning_rate": 8e-05,
"loss": 1.5984,
"step": 2068
},
{
"epoch": 0.4550753326734851,
"grad_norm": 0.279784232378006,
"learning_rate": 8e-05,
"loss": 1.6903,
"step": 2069
},
{
"epoch": 0.45529528208512043,
"grad_norm": 0.2957722246646881,
"learning_rate": 8e-05,
"loss": 1.8176,
"step": 2070
},
{
"epoch": 0.45551523149675577,
"grad_norm": 0.2798726260662079,
"learning_rate": 8e-05,
"loss": 1.6637,
"step": 2071
},
{
"epoch": 0.45573518090839105,
"grad_norm": 0.26509538292884827,
"learning_rate": 8e-05,
"loss": 1.5034,
"step": 2072
},
{
"epoch": 0.4559551303200264,
"grad_norm": 0.2984442114830017,
"learning_rate": 8e-05,
"loss": 1.8225,
"step": 2073
},
{
"epoch": 0.4561750797316617,
"grad_norm": 0.28242239356040955,
"learning_rate": 8e-05,
"loss": 1.6707,
"step": 2074
},
{
"epoch": 0.45639502914329705,
"grad_norm": 0.2722650468349457,
"learning_rate": 8e-05,
"loss": 1.5723,
"step": 2075
},
{
"epoch": 0.4566149785549324,
"grad_norm": 0.25942909717559814,
"learning_rate": 8e-05,
"loss": 1.5417,
"step": 2076
},
{
"epoch": 0.4568349279665677,
"grad_norm": 0.2782632112503052,
"learning_rate": 8e-05,
"loss": 1.5857,
"step": 2077
},
{
"epoch": 0.457054877378203,
"grad_norm": 0.28298354148864746,
"learning_rate": 8e-05,
"loss": 1.6124,
"step": 2078
},
{
"epoch": 0.45727482678983833,
"grad_norm": 0.2920227348804474,
"learning_rate": 8e-05,
"loss": 1.6247,
"step": 2079
},
{
"epoch": 0.45749477620147366,
"grad_norm": 0.30804532766342163,
"learning_rate": 8e-05,
"loss": 1.8689,
"step": 2080
},
{
"epoch": 0.457714725613109,
"grad_norm": 0.2759280204772949,
"learning_rate": 8e-05,
"loss": 1.6366,
"step": 2081
},
{
"epoch": 0.45793467502474433,
"grad_norm": 0.27967414259910583,
"learning_rate": 8e-05,
"loss": 1.6391,
"step": 2082
},
{
"epoch": 0.4581546244363796,
"grad_norm": 0.30624908208847046,
"learning_rate": 8e-05,
"loss": 1.8188,
"step": 2083
},
{
"epoch": 0.45837457384801494,
"grad_norm": 0.2747632563114166,
"learning_rate": 8e-05,
"loss": 1.6394,
"step": 2084
},
{
"epoch": 0.4585945232596503,
"grad_norm": 0.29921606183052063,
"learning_rate": 8e-05,
"loss": 1.6264,
"step": 2085
},
{
"epoch": 0.4588144726712856,
"grad_norm": 0.27374643087387085,
"learning_rate": 8e-05,
"loss": 1.5955,
"step": 2086
},
{
"epoch": 0.45903442208292095,
"grad_norm": 0.2804218530654907,
"learning_rate": 8e-05,
"loss": 1.6936,
"step": 2087
},
{
"epoch": 0.4592543714945563,
"grad_norm": 0.288095623254776,
"learning_rate": 8e-05,
"loss": 1.658,
"step": 2088
},
{
"epoch": 0.45947432090619156,
"grad_norm": 0.2622469961643219,
"learning_rate": 8e-05,
"loss": 1.5468,
"step": 2089
},
{
"epoch": 0.4596942703178269,
"grad_norm": 0.298968106508255,
"learning_rate": 8e-05,
"loss": 1.8415,
"step": 2090
},
{
"epoch": 0.4599142197294622,
"grad_norm": 0.27127620577812195,
"learning_rate": 8e-05,
"loss": 1.5154,
"step": 2091
},
{
"epoch": 0.46013416914109756,
"grad_norm": 0.3025810122489929,
"learning_rate": 8e-05,
"loss": 1.7251,
"step": 2092
},
{
"epoch": 0.4603541185527329,
"grad_norm": 0.2805241346359253,
"learning_rate": 8e-05,
"loss": 1.786,
"step": 2093
},
{
"epoch": 0.4605740679643682,
"grad_norm": 0.28292620182037354,
"learning_rate": 8e-05,
"loss": 1.6205,
"step": 2094
},
{
"epoch": 0.4607940173760035,
"grad_norm": 0.27890294790267944,
"learning_rate": 8e-05,
"loss": 1.6182,
"step": 2095
},
{
"epoch": 0.46101396678763884,
"grad_norm": 0.2704887390136719,
"learning_rate": 8e-05,
"loss": 1.643,
"step": 2096
},
{
"epoch": 0.4612339161992742,
"grad_norm": 0.27034714818000793,
"learning_rate": 8e-05,
"loss": 1.6918,
"step": 2097
},
{
"epoch": 0.4614538656109095,
"grad_norm": 0.2763729691505432,
"learning_rate": 8e-05,
"loss": 1.6218,
"step": 2098
},
{
"epoch": 0.46167381502254484,
"grad_norm": 0.28457143902778625,
"learning_rate": 8e-05,
"loss": 1.5602,
"step": 2099
},
{
"epoch": 0.4618937644341801,
"grad_norm": 0.3102862536907196,
"learning_rate": 8e-05,
"loss": 1.6946,
"step": 2100
},
{
"epoch": 0.46211371384581545,
"grad_norm": 0.2817099690437317,
"learning_rate": 8e-05,
"loss": 1.69,
"step": 2101
},
{
"epoch": 0.4623336632574508,
"grad_norm": 0.2816404104232788,
"learning_rate": 8e-05,
"loss": 1.6651,
"step": 2102
},
{
"epoch": 0.4625536126690861,
"grad_norm": 0.2756252884864807,
"learning_rate": 8e-05,
"loss": 1.6935,
"step": 2103
},
{
"epoch": 0.46277356208072146,
"grad_norm": 0.28443071246147156,
"learning_rate": 8e-05,
"loss": 1.8278,
"step": 2104
},
{
"epoch": 0.46299351149235674,
"grad_norm": 0.2955114543437958,
"learning_rate": 8e-05,
"loss": 1.7654,
"step": 2105
},
{
"epoch": 0.46321346090399207,
"grad_norm": 0.30527764558792114,
"learning_rate": 8e-05,
"loss": 1.921,
"step": 2106
},
{
"epoch": 0.4634334103156274,
"grad_norm": 0.28985050320625305,
"learning_rate": 8e-05,
"loss": 1.7637,
"step": 2107
},
{
"epoch": 0.46365335972726274,
"grad_norm": 0.2904118299484253,
"learning_rate": 8e-05,
"loss": 1.7645,
"step": 2108
},
{
"epoch": 0.46387330913889807,
"grad_norm": 0.3137964606285095,
"learning_rate": 8e-05,
"loss": 1.8301,
"step": 2109
},
{
"epoch": 0.46409325855053335,
"grad_norm": 0.2634297013282776,
"learning_rate": 8e-05,
"loss": 1.5916,
"step": 2110
},
{
"epoch": 0.4643132079621687,
"grad_norm": 0.26435586810112,
"learning_rate": 8e-05,
"loss": 1.5805,
"step": 2111
},
{
"epoch": 0.464533157373804,
"grad_norm": 0.2845149040222168,
"learning_rate": 8e-05,
"loss": 1.6894,
"step": 2112
},
{
"epoch": 0.46475310678543935,
"grad_norm": 0.3034592568874359,
"learning_rate": 8e-05,
"loss": 1.7122,
"step": 2113
},
{
"epoch": 0.4649730561970747,
"grad_norm": 0.2862027585506439,
"learning_rate": 8e-05,
"loss": 1.5886,
"step": 2114
},
{
"epoch": 0.46519300560871,
"grad_norm": 0.2709789574146271,
"learning_rate": 8e-05,
"loss": 1.6112,
"step": 2115
},
{
"epoch": 0.4654129550203453,
"grad_norm": 0.3048953115940094,
"learning_rate": 8e-05,
"loss": 1.718,
"step": 2116
},
{
"epoch": 0.46563290443198063,
"grad_norm": 0.295149564743042,
"learning_rate": 8e-05,
"loss": 1.7339,
"step": 2117
},
{
"epoch": 0.46585285384361597,
"grad_norm": 0.27533626556396484,
"learning_rate": 8e-05,
"loss": 1.6143,
"step": 2118
},
{
"epoch": 0.4660728032552513,
"grad_norm": 0.30383235216140747,
"learning_rate": 8e-05,
"loss": 1.7154,
"step": 2119
},
{
"epoch": 0.46629275266688663,
"grad_norm": 0.2834450304508209,
"learning_rate": 8e-05,
"loss": 1.6892,
"step": 2120
},
{
"epoch": 0.4665127020785219,
"grad_norm": 0.27407264709472656,
"learning_rate": 8e-05,
"loss": 1.7138,
"step": 2121
},
{
"epoch": 0.46673265149015725,
"grad_norm": 0.28417688608169556,
"learning_rate": 8e-05,
"loss": 1.7416,
"step": 2122
},
{
"epoch": 0.4669526009017926,
"grad_norm": 0.2915797233581543,
"learning_rate": 8e-05,
"loss": 1.7104,
"step": 2123
},
{
"epoch": 0.4671725503134279,
"grad_norm": 0.29155269265174866,
"learning_rate": 8e-05,
"loss": 1.7374,
"step": 2124
},
{
"epoch": 0.46739249972506325,
"grad_norm": 0.27683204412460327,
"learning_rate": 8e-05,
"loss": 1.5669,
"step": 2125
},
{
"epoch": 0.4676124491366986,
"grad_norm": 0.2835148870944977,
"learning_rate": 8e-05,
"loss": 1.7426,
"step": 2126
},
{
"epoch": 0.46783239854833386,
"grad_norm": 0.27906641364097595,
"learning_rate": 8e-05,
"loss": 1.6297,
"step": 2127
},
{
"epoch": 0.4680523479599692,
"grad_norm": 0.28407955169677734,
"learning_rate": 8e-05,
"loss": 1.7351,
"step": 2128
},
{
"epoch": 0.46827229737160453,
"grad_norm": 0.2793600261211395,
"learning_rate": 8e-05,
"loss": 1.5786,
"step": 2129
},
{
"epoch": 0.46849224678323986,
"grad_norm": 0.2806802988052368,
"learning_rate": 8e-05,
"loss": 1.7466,
"step": 2130
},
{
"epoch": 0.4687121961948752,
"grad_norm": 0.30251967906951904,
"learning_rate": 8e-05,
"loss": 1.6703,
"step": 2131
},
{
"epoch": 0.4689321456065105,
"grad_norm": 0.275473415851593,
"learning_rate": 8e-05,
"loss": 1.5881,
"step": 2132
},
{
"epoch": 0.4691520950181458,
"grad_norm": 0.28925517201423645,
"learning_rate": 8e-05,
"loss": 1.7369,
"step": 2133
},
{
"epoch": 0.46937204442978114,
"grad_norm": 0.26768866181373596,
"learning_rate": 8e-05,
"loss": 1.5469,
"step": 2134
},
{
"epoch": 0.4695919938414165,
"grad_norm": 0.272969514131546,
"learning_rate": 8e-05,
"loss": 1.6235,
"step": 2135
},
{
"epoch": 0.4698119432530518,
"grad_norm": 0.39006346464157104,
"learning_rate": 8e-05,
"loss": 1.6943,
"step": 2136
},
{
"epoch": 0.47003189266468715,
"grad_norm": 0.2898694574832916,
"learning_rate": 8e-05,
"loss": 1.7196,
"step": 2137
},
{
"epoch": 0.4702518420763224,
"grad_norm": 0.28824204206466675,
"learning_rate": 8e-05,
"loss": 1.6807,
"step": 2138
},
{
"epoch": 0.47047179148795776,
"grad_norm": 0.3024749755859375,
"learning_rate": 8e-05,
"loss": 1.7435,
"step": 2139
},
{
"epoch": 0.4706917408995931,
"grad_norm": 0.2894933521747589,
"learning_rate": 8e-05,
"loss": 1.8047,
"step": 2140
},
{
"epoch": 0.4709116903112284,
"grad_norm": 0.2900967299938202,
"learning_rate": 8e-05,
"loss": 1.6536,
"step": 2141
},
{
"epoch": 0.47113163972286376,
"grad_norm": 0.2727701961994171,
"learning_rate": 8e-05,
"loss": 1.6187,
"step": 2142
},
{
"epoch": 0.47135158913449904,
"grad_norm": 0.2630798816680908,
"learning_rate": 8e-05,
"loss": 1.5558,
"step": 2143
},
{
"epoch": 0.4715715385461344,
"grad_norm": 0.2755641043186188,
"learning_rate": 8e-05,
"loss": 1.6247,
"step": 2144
},
{
"epoch": 0.4717914879577697,
"grad_norm": 0.26855289936065674,
"learning_rate": 8e-05,
"loss": 1.6908,
"step": 2145
},
{
"epoch": 0.47201143736940504,
"grad_norm": 0.26333558559417725,
"learning_rate": 8e-05,
"loss": 1.6467,
"step": 2146
},
{
"epoch": 0.4722313867810404,
"grad_norm": 0.2696126103401184,
"learning_rate": 8e-05,
"loss": 1.646,
"step": 2147
},
{
"epoch": 0.4724513361926757,
"grad_norm": 0.2838461101055145,
"learning_rate": 8e-05,
"loss": 1.7269,
"step": 2148
},
{
"epoch": 0.472671285604311,
"grad_norm": 0.27359622716903687,
"learning_rate": 8e-05,
"loss": 1.6578,
"step": 2149
},
{
"epoch": 0.4728912350159463,
"grad_norm": 0.28489992022514343,
"learning_rate": 8e-05,
"loss": 1.7451,
"step": 2150
},
{
"epoch": 0.47311118442758165,
"grad_norm": 0.30069005489349365,
"learning_rate": 8e-05,
"loss": 1.7789,
"step": 2151
},
{
"epoch": 0.473331133839217,
"grad_norm": 0.2787550091743469,
"learning_rate": 8e-05,
"loss": 1.8532,
"step": 2152
},
{
"epoch": 0.4735510832508523,
"grad_norm": 0.28521937131881714,
"learning_rate": 8e-05,
"loss": 1.6202,
"step": 2153
},
{
"epoch": 0.4737710326624876,
"grad_norm": 0.27512073516845703,
"learning_rate": 8e-05,
"loss": 1.6982,
"step": 2154
},
{
"epoch": 0.47399098207412294,
"grad_norm": 0.28500398993492126,
"learning_rate": 8e-05,
"loss": 1.7262,
"step": 2155
},
{
"epoch": 0.47421093148575827,
"grad_norm": 0.2889910340309143,
"learning_rate": 8e-05,
"loss": 1.7001,
"step": 2156
},
{
"epoch": 0.4744308808973936,
"grad_norm": 0.2868637144565582,
"learning_rate": 8e-05,
"loss": 1.6569,
"step": 2157
},
{
"epoch": 0.47465083030902894,
"grad_norm": 0.27974042296409607,
"learning_rate": 8e-05,
"loss": 1.7308,
"step": 2158
},
{
"epoch": 0.47487077972066427,
"grad_norm": 0.2812412977218628,
"learning_rate": 8e-05,
"loss": 1.6586,
"step": 2159
},
{
"epoch": 0.47509072913229955,
"grad_norm": 0.27973487973213196,
"learning_rate": 8e-05,
"loss": 1.7688,
"step": 2160
},
{
"epoch": 0.4753106785439349,
"grad_norm": 0.2852223515510559,
"learning_rate": 8e-05,
"loss": 1.7773,
"step": 2161
},
{
"epoch": 0.4755306279555702,
"grad_norm": 0.2702232301235199,
"learning_rate": 8e-05,
"loss": 1.5842,
"step": 2162
},
{
"epoch": 0.47575057736720555,
"grad_norm": 0.26885986328125,
"learning_rate": 8e-05,
"loss": 1.6179,
"step": 2163
},
{
"epoch": 0.4759705267788409,
"grad_norm": 0.26561740040779114,
"learning_rate": 8e-05,
"loss": 1.6105,
"step": 2164
},
{
"epoch": 0.47619047619047616,
"grad_norm": 0.3136744499206543,
"learning_rate": 8e-05,
"loss": 1.7263,
"step": 2165
},
{
"epoch": 0.4764104256021115,
"grad_norm": 0.29980844259262085,
"learning_rate": 8e-05,
"loss": 1.6428,
"step": 2166
},
{
"epoch": 0.47663037501374683,
"grad_norm": 0.2782588303089142,
"learning_rate": 8e-05,
"loss": 1.5905,
"step": 2167
},
{
"epoch": 0.47685032442538217,
"grad_norm": 0.3134911358356476,
"learning_rate": 8e-05,
"loss": 1.7437,
"step": 2168
},
{
"epoch": 0.4770702738370175,
"grad_norm": 0.28305792808532715,
"learning_rate": 8e-05,
"loss": 1.6187,
"step": 2169
},
{
"epoch": 0.47729022324865283,
"grad_norm": 0.2741806209087372,
"learning_rate": 8e-05,
"loss": 1.7017,
"step": 2170
},
{
"epoch": 0.4775101726602881,
"grad_norm": 0.2861132323741913,
"learning_rate": 8e-05,
"loss": 1.7167,
"step": 2171
},
{
"epoch": 0.47773012207192345,
"grad_norm": 0.2796178162097931,
"learning_rate": 8e-05,
"loss": 1.7747,
"step": 2172
},
{
"epoch": 0.4779500714835588,
"grad_norm": 0.3019583821296692,
"learning_rate": 8e-05,
"loss": 1.9244,
"step": 2173
},
{
"epoch": 0.4781700208951941,
"grad_norm": 0.2874825894832611,
"learning_rate": 8e-05,
"loss": 1.676,
"step": 2174
},
{
"epoch": 0.47838997030682945,
"grad_norm": 0.2864963412284851,
"learning_rate": 8e-05,
"loss": 1.8289,
"step": 2175
},
{
"epoch": 0.4786099197184647,
"grad_norm": 0.3347536623477936,
"learning_rate": 8e-05,
"loss": 1.8204,
"step": 2176
},
{
"epoch": 0.47882986913010006,
"grad_norm": 0.2859993577003479,
"learning_rate": 8e-05,
"loss": 1.5472,
"step": 2177
},
{
"epoch": 0.4790498185417354,
"grad_norm": 0.2972160875797272,
"learning_rate": 8e-05,
"loss": 1.6577,
"step": 2178
},
{
"epoch": 0.47926976795337073,
"grad_norm": 0.26402032375335693,
"learning_rate": 8e-05,
"loss": 1.4936,
"step": 2179
},
{
"epoch": 0.47948971736500606,
"grad_norm": 0.28069886565208435,
"learning_rate": 8e-05,
"loss": 1.8158,
"step": 2180
},
{
"epoch": 0.4797096667766414,
"grad_norm": 0.2630525529384613,
"learning_rate": 8e-05,
"loss": 1.5814,
"step": 2181
},
{
"epoch": 0.4799296161882767,
"grad_norm": 0.2999349534511566,
"learning_rate": 8e-05,
"loss": 1.7879,
"step": 2182
},
{
"epoch": 0.480149565599912,
"grad_norm": 0.28722846508026123,
"learning_rate": 8e-05,
"loss": 1.7143,
"step": 2183
},
{
"epoch": 0.48036951501154734,
"grad_norm": 0.2746049165725708,
"learning_rate": 8e-05,
"loss": 1.5197,
"step": 2184
},
{
"epoch": 0.4805894644231827,
"grad_norm": 0.2719694972038269,
"learning_rate": 8e-05,
"loss": 1.6642,
"step": 2185
},
{
"epoch": 0.480809413834818,
"grad_norm": 0.286636084318161,
"learning_rate": 8e-05,
"loss": 1.6885,
"step": 2186
},
{
"epoch": 0.4810293632464533,
"grad_norm": 0.3076469600200653,
"learning_rate": 8e-05,
"loss": 1.7681,
"step": 2187
},
{
"epoch": 0.4812493126580886,
"grad_norm": 0.3075680434703827,
"learning_rate": 8e-05,
"loss": 1.8007,
"step": 2188
},
{
"epoch": 0.48146926206972396,
"grad_norm": 0.2793465852737427,
"learning_rate": 8e-05,
"loss": 1.6552,
"step": 2189
},
{
"epoch": 0.4816892114813593,
"grad_norm": 0.2886781692504883,
"learning_rate": 8e-05,
"loss": 1.7827,
"step": 2190
},
{
"epoch": 0.4819091608929946,
"grad_norm": 0.27432283759117126,
"learning_rate": 8e-05,
"loss": 1.7724,
"step": 2191
},
{
"epoch": 0.48212911030462996,
"grad_norm": 0.26780393719673157,
"learning_rate": 8e-05,
"loss": 1.4613,
"step": 2192
},
{
"epoch": 0.48234905971626524,
"grad_norm": 0.27178165316581726,
"learning_rate": 8e-05,
"loss": 1.6439,
"step": 2193
},
{
"epoch": 0.48256900912790057,
"grad_norm": 0.27651992440223694,
"learning_rate": 8e-05,
"loss": 1.6672,
"step": 2194
},
{
"epoch": 0.4827889585395359,
"grad_norm": 0.26919931173324585,
"learning_rate": 8e-05,
"loss": 1.594,
"step": 2195
},
{
"epoch": 0.48300890795117124,
"grad_norm": 0.267678439617157,
"learning_rate": 8e-05,
"loss": 1.5852,
"step": 2196
},
{
"epoch": 0.4832288573628066,
"grad_norm": 0.2895921766757965,
"learning_rate": 8e-05,
"loss": 1.6456,
"step": 2197
},
{
"epoch": 0.48344880677444185,
"grad_norm": 0.27052855491638184,
"learning_rate": 8e-05,
"loss": 1.5393,
"step": 2198
},
{
"epoch": 0.4836687561860772,
"grad_norm": 0.2793048024177551,
"learning_rate": 8e-05,
"loss": 1.7006,
"step": 2199
},
{
"epoch": 0.4838887055977125,
"grad_norm": 0.2838841676712036,
"learning_rate": 8e-05,
"loss": 1.6547,
"step": 2200
},
{
"epoch": 0.48410865500934785,
"grad_norm": 0.29814717173576355,
"learning_rate": 8e-05,
"loss": 1.7392,
"step": 2201
},
{
"epoch": 0.4843286044209832,
"grad_norm": 0.28823426365852356,
"learning_rate": 8e-05,
"loss": 1.6434,
"step": 2202
},
{
"epoch": 0.4845485538326185,
"grad_norm": 0.2645476460456848,
"learning_rate": 8e-05,
"loss": 1.5663,
"step": 2203
},
{
"epoch": 0.4847685032442538,
"grad_norm": 0.3046651780605316,
"learning_rate": 8e-05,
"loss": 1.5941,
"step": 2204
},
{
"epoch": 0.48498845265588914,
"grad_norm": 0.28958046436309814,
"learning_rate": 8e-05,
"loss": 1.7348,
"step": 2205
},
{
"epoch": 0.48520840206752447,
"grad_norm": 0.25703537464141846,
"learning_rate": 8e-05,
"loss": 1.5712,
"step": 2206
},
{
"epoch": 0.4854283514791598,
"grad_norm": 0.2969980537891388,
"learning_rate": 8e-05,
"loss": 1.7966,
"step": 2207
},
{
"epoch": 0.48564830089079514,
"grad_norm": 0.27638381719589233,
"learning_rate": 8e-05,
"loss": 1.6478,
"step": 2208
},
{
"epoch": 0.4858682503024304,
"grad_norm": 0.283682644367218,
"learning_rate": 8e-05,
"loss": 1.6524,
"step": 2209
},
{
"epoch": 0.48608819971406575,
"grad_norm": 0.2837259769439697,
"learning_rate": 8e-05,
"loss": 1.7302,
"step": 2210
},
{
"epoch": 0.4863081491257011,
"grad_norm": 0.3042410612106323,
"learning_rate": 8e-05,
"loss": 1.5969,
"step": 2211
},
{
"epoch": 0.4865280985373364,
"grad_norm": 0.2819627821445465,
"learning_rate": 8e-05,
"loss": 1.5621,
"step": 2212
},
{
"epoch": 0.48674804794897175,
"grad_norm": 0.28049173951148987,
"learning_rate": 8e-05,
"loss": 1.6831,
"step": 2213
},
{
"epoch": 0.4869679973606071,
"grad_norm": 0.29762500524520874,
"learning_rate": 8e-05,
"loss": 1.7883,
"step": 2214
},
{
"epoch": 0.48718794677224236,
"grad_norm": 0.3022189140319824,
"learning_rate": 8e-05,
"loss": 1.5744,
"step": 2215
},
{
"epoch": 0.4874078961838777,
"grad_norm": 0.28249025344848633,
"learning_rate": 8e-05,
"loss": 1.6514,
"step": 2216
},
{
"epoch": 0.48762784559551303,
"grad_norm": 0.3398612141609192,
"learning_rate": 8e-05,
"loss": 1.4215,
"step": 2217
},
{
"epoch": 0.48784779500714837,
"grad_norm": 0.28481197357177734,
"learning_rate": 8e-05,
"loss": 1.7065,
"step": 2218
},
{
"epoch": 0.4880677444187837,
"grad_norm": 0.3076950013637543,
"learning_rate": 8e-05,
"loss": 1.7665,
"step": 2219
},
{
"epoch": 0.488287693830419,
"grad_norm": 0.28533896803855896,
"learning_rate": 8e-05,
"loss": 1.6156,
"step": 2220
},
{
"epoch": 0.4885076432420543,
"grad_norm": 0.2940129041671753,
"learning_rate": 8e-05,
"loss": 1.6957,
"step": 2221
},
{
"epoch": 0.48872759265368965,
"grad_norm": 0.30342915654182434,
"learning_rate": 8e-05,
"loss": 1.6014,
"step": 2222
},
{
"epoch": 0.488947542065325,
"grad_norm": 0.2851787507534027,
"learning_rate": 8e-05,
"loss": 1.8816,
"step": 2223
},
{
"epoch": 0.4891674914769603,
"grad_norm": 0.3813328146934509,
"learning_rate": 8e-05,
"loss": 1.6608,
"step": 2224
},
{
"epoch": 0.48938744088859565,
"grad_norm": 0.329373300075531,
"learning_rate": 8e-05,
"loss": 1.7455,
"step": 2225
},
{
"epoch": 0.4896073903002309,
"grad_norm": 0.2751460373401642,
"learning_rate": 8e-05,
"loss": 1.6854,
"step": 2226
},
{
"epoch": 0.48982733971186626,
"grad_norm": 0.283033549785614,
"learning_rate": 8e-05,
"loss": 1.7018,
"step": 2227
},
{
"epoch": 0.4900472891235016,
"grad_norm": 0.2869894504547119,
"learning_rate": 8e-05,
"loss": 1.7062,
"step": 2228
},
{
"epoch": 0.49026723853513693,
"grad_norm": 0.2895123064517975,
"learning_rate": 8e-05,
"loss": 1.625,
"step": 2229
},
{
"epoch": 0.49048718794677226,
"grad_norm": 0.28288763761520386,
"learning_rate": 8e-05,
"loss": 1.6901,
"step": 2230
},
{
"epoch": 0.49070713735840754,
"grad_norm": 0.28054291009902954,
"learning_rate": 8e-05,
"loss": 1.7782,
"step": 2231
},
{
"epoch": 0.4909270867700429,
"grad_norm": 0.2727196216583252,
"learning_rate": 8e-05,
"loss": 1.531,
"step": 2232
},
{
"epoch": 0.4911470361816782,
"grad_norm": 0.28192493319511414,
"learning_rate": 8e-05,
"loss": 1.7144,
"step": 2233
},
{
"epoch": 0.49136698559331354,
"grad_norm": 0.28133469820022583,
"learning_rate": 8e-05,
"loss": 1.6448,
"step": 2234
},
{
"epoch": 0.4915869350049489,
"grad_norm": 0.3175356686115265,
"learning_rate": 8e-05,
"loss": 1.648,
"step": 2235
},
{
"epoch": 0.4918068844165842,
"grad_norm": 0.2990395128726959,
"learning_rate": 8e-05,
"loss": 1.6871,
"step": 2236
},
{
"epoch": 0.4920268338282195,
"grad_norm": 0.3121372163295746,
"learning_rate": 8e-05,
"loss": 1.7671,
"step": 2237
},
{
"epoch": 0.4922467832398548,
"grad_norm": 0.2772499918937683,
"learning_rate": 8e-05,
"loss": 1.674,
"step": 2238
},
{
"epoch": 0.49246673265149016,
"grad_norm": 0.2990477383136749,
"learning_rate": 8e-05,
"loss": 1.8202,
"step": 2239
},
{
"epoch": 0.4926866820631255,
"grad_norm": 0.28028279542922974,
"learning_rate": 8e-05,
"loss": 1.7255,
"step": 2240
},
{
"epoch": 0.4929066314747608,
"grad_norm": 0.28883370757102966,
"learning_rate": 8e-05,
"loss": 1.6102,
"step": 2241
},
{
"epoch": 0.4931265808863961,
"grad_norm": 0.2775261700153351,
"learning_rate": 8e-05,
"loss": 1.8572,
"step": 2242
},
{
"epoch": 0.49334653029803144,
"grad_norm": 0.2821192443370819,
"learning_rate": 8e-05,
"loss": 1.8068,
"step": 2243
},
{
"epoch": 0.49356647970966677,
"grad_norm": 0.29555544257164,
"learning_rate": 8e-05,
"loss": 1.6875,
"step": 2244
},
{
"epoch": 0.4937864291213021,
"grad_norm": 0.28827783465385437,
"learning_rate": 8e-05,
"loss": 1.7123,
"step": 2245
},
{
"epoch": 0.49400637853293744,
"grad_norm": 0.27672290802001953,
"learning_rate": 8e-05,
"loss": 1.4739,
"step": 2246
},
{
"epoch": 0.4942263279445728,
"grad_norm": 0.27185946702957153,
"learning_rate": 8e-05,
"loss": 1.7748,
"step": 2247
},
{
"epoch": 0.49444627735620805,
"grad_norm": 0.2972213327884674,
"learning_rate": 8e-05,
"loss": 1.7122,
"step": 2248
},
{
"epoch": 0.4946662267678434,
"grad_norm": 0.30817538499832153,
"learning_rate": 8e-05,
"loss": 1.926,
"step": 2249
},
{
"epoch": 0.4948861761794787,
"grad_norm": 0.2821509838104248,
"learning_rate": 8e-05,
"loss": 1.7608,
"step": 2250
},
{
"epoch": 0.49510612559111405,
"grad_norm": 0.29807963967323303,
"learning_rate": 8e-05,
"loss": 1.8233,
"step": 2251
},
{
"epoch": 0.4953260750027494,
"grad_norm": 0.29549431800842285,
"learning_rate": 8e-05,
"loss": 1.7963,
"step": 2252
},
{
"epoch": 0.49554602441438467,
"grad_norm": 0.3025868535041809,
"learning_rate": 8e-05,
"loss": 1.7597,
"step": 2253
},
{
"epoch": 0.49576597382602,
"grad_norm": 0.30950862169265747,
"learning_rate": 8e-05,
"loss": 1.8901,
"step": 2254
},
{
"epoch": 0.49598592323765534,
"grad_norm": 0.3357299864292145,
"learning_rate": 8e-05,
"loss": 1.7692,
"step": 2255
},
{
"epoch": 0.49620587264929067,
"grad_norm": 0.2873973548412323,
"learning_rate": 8e-05,
"loss": 1.6934,
"step": 2256
},
{
"epoch": 0.496425822060926,
"grad_norm": 0.2997465431690216,
"learning_rate": 8e-05,
"loss": 1.5939,
"step": 2257
},
{
"epoch": 0.49664577147256134,
"grad_norm": 0.269217312335968,
"learning_rate": 8e-05,
"loss": 1.5687,
"step": 2258
},
{
"epoch": 0.4968657208841966,
"grad_norm": 0.27386826276779175,
"learning_rate": 8e-05,
"loss": 1.6858,
"step": 2259
},
{
"epoch": 0.49708567029583195,
"grad_norm": 0.2911466658115387,
"learning_rate": 8e-05,
"loss": 1.701,
"step": 2260
},
{
"epoch": 0.4973056197074673,
"grad_norm": 0.2837962508201599,
"learning_rate": 8e-05,
"loss": 1.6886,
"step": 2261
},
{
"epoch": 0.4975255691191026,
"grad_norm": 0.3071229159832001,
"learning_rate": 8e-05,
"loss": 1.73,
"step": 2262
},
{
"epoch": 0.49774551853073795,
"grad_norm": 0.303252249956131,
"learning_rate": 8e-05,
"loss": 1.7693,
"step": 2263
},
{
"epoch": 0.49796546794237323,
"grad_norm": 0.2802221179008484,
"learning_rate": 8e-05,
"loss": 1.6394,
"step": 2264
},
{
"epoch": 0.49818541735400856,
"grad_norm": 0.28856000304222107,
"learning_rate": 8e-05,
"loss": 1.6035,
"step": 2265
},
{
"epoch": 0.4984053667656439,
"grad_norm": 0.28943875432014465,
"learning_rate": 8e-05,
"loss": 1.7989,
"step": 2266
},
{
"epoch": 0.49862531617727923,
"grad_norm": 0.26969149708747864,
"learning_rate": 8e-05,
"loss": 1.6607,
"step": 2267
},
{
"epoch": 0.49884526558891457,
"grad_norm": 0.311819851398468,
"learning_rate": 8e-05,
"loss": 1.7912,
"step": 2268
},
{
"epoch": 0.4990652150005499,
"grad_norm": 0.296274334192276,
"learning_rate": 8e-05,
"loss": 1.6953,
"step": 2269
},
{
"epoch": 0.4992851644121852,
"grad_norm": 0.26551195979118347,
"learning_rate": 8e-05,
"loss": 1.3549,
"step": 2270
},
{
"epoch": 0.4995051138238205,
"grad_norm": 0.28540030121803284,
"learning_rate": 8e-05,
"loss": 1.7142,
"step": 2271
},
{
"epoch": 0.49972506323545585,
"grad_norm": 0.2834233045578003,
"learning_rate": 8e-05,
"loss": 1.7293,
"step": 2272
},
{
"epoch": 0.4999450126470912,
"grad_norm": 0.34650975465774536,
"learning_rate": 8e-05,
"loss": 1.7652,
"step": 2273
},
{
"epoch": 0.5001649620587265,
"grad_norm": 0.2988453507423401,
"learning_rate": 8e-05,
"loss": 1.8895,
"step": 2274
},
{
"epoch": 0.5003849114703618,
"grad_norm": 0.2912983000278473,
"learning_rate": 8e-05,
"loss": 1.6431,
"step": 2275
},
{
"epoch": 0.5006048608819972,
"grad_norm": 0.28406545519828796,
"learning_rate": 8e-05,
"loss": 1.7805,
"step": 2276
},
{
"epoch": 0.5008248102936325,
"grad_norm": 0.2748315632343292,
"learning_rate": 8e-05,
"loss": 1.5514,
"step": 2277
},
{
"epoch": 0.5010447597052677,
"grad_norm": 0.3016912341117859,
"learning_rate": 8e-05,
"loss": 1.6271,
"step": 2278
},
{
"epoch": 0.5012647091169031,
"grad_norm": 0.3006996512413025,
"learning_rate": 8e-05,
"loss": 1.7195,
"step": 2279
},
{
"epoch": 0.5014846585285384,
"grad_norm": 0.27598950266838074,
"learning_rate": 8e-05,
"loss": 1.4975,
"step": 2280
},
{
"epoch": 0.5017046079401738,
"grad_norm": 0.2810399830341339,
"learning_rate": 8e-05,
"loss": 1.6498,
"step": 2281
},
{
"epoch": 0.5019245573518091,
"grad_norm": 0.3018679916858673,
"learning_rate": 8e-05,
"loss": 1.6711,
"step": 2282
},
{
"epoch": 0.5021445067634444,
"grad_norm": 0.2889658510684967,
"learning_rate": 8e-05,
"loss": 1.7318,
"step": 2283
},
{
"epoch": 0.5023644561750797,
"grad_norm": 0.28475597500801086,
"learning_rate": 8e-05,
"loss": 1.7573,
"step": 2284
},
{
"epoch": 0.502584405586715,
"grad_norm": 0.3001987338066101,
"learning_rate": 8e-05,
"loss": 1.8463,
"step": 2285
},
{
"epoch": 0.5028043549983504,
"grad_norm": 0.273773193359375,
"learning_rate": 8e-05,
"loss": 1.6465,
"step": 2286
},
{
"epoch": 0.5030243044099857,
"grad_norm": 0.34727615118026733,
"learning_rate": 8e-05,
"loss": 1.9431,
"step": 2287
},
{
"epoch": 0.5032442538216211,
"grad_norm": 0.2818615138530731,
"learning_rate": 8e-05,
"loss": 1.6113,
"step": 2288
},
{
"epoch": 0.5034642032332564,
"grad_norm": 0.26619333028793335,
"learning_rate": 8e-05,
"loss": 1.5167,
"step": 2289
},
{
"epoch": 0.5036841526448916,
"grad_norm": 0.2748722434043884,
"learning_rate": 8e-05,
"loss": 1.6384,
"step": 2290
},
{
"epoch": 0.503904102056527,
"grad_norm": 0.29873546957969666,
"learning_rate": 8e-05,
"loss": 1.7309,
"step": 2291
},
{
"epoch": 0.5041240514681623,
"grad_norm": 0.28378361463546753,
"learning_rate": 8e-05,
"loss": 1.6788,
"step": 2292
},
{
"epoch": 0.5043440008797977,
"grad_norm": 0.28786107897758484,
"learning_rate": 8e-05,
"loss": 1.7281,
"step": 2293
},
{
"epoch": 0.504563950291433,
"grad_norm": 0.2831546366214752,
"learning_rate": 8e-05,
"loss": 1.7644,
"step": 2294
},
{
"epoch": 0.5047838997030683,
"grad_norm": 0.28964316844940186,
"learning_rate": 8e-05,
"loss": 1.6409,
"step": 2295
},
{
"epoch": 0.5050038491147036,
"grad_norm": 0.2778918743133545,
"learning_rate": 8e-05,
"loss": 1.6078,
"step": 2296
},
{
"epoch": 0.5052237985263389,
"grad_norm": 0.2749491035938263,
"learning_rate": 8e-05,
"loss": 1.668,
"step": 2297
},
{
"epoch": 0.5054437479379743,
"grad_norm": 0.2856389880180359,
"learning_rate": 8e-05,
"loss": 1.7052,
"step": 2298
},
{
"epoch": 0.5056636973496096,
"grad_norm": 0.28082379698753357,
"learning_rate": 8e-05,
"loss": 1.6627,
"step": 2299
},
{
"epoch": 0.5058836467612449,
"grad_norm": 0.27894240617752075,
"learning_rate": 8e-05,
"loss": 1.6408,
"step": 2300
},
{
"epoch": 0.5061035961728803,
"grad_norm": 0.31712663173675537,
"learning_rate": 8e-05,
"loss": 1.6148,
"step": 2301
},
{
"epoch": 0.5063235455845155,
"grad_norm": 0.28600454330444336,
"learning_rate": 8e-05,
"loss": 1.7695,
"step": 2302
},
{
"epoch": 0.5065434949961509,
"grad_norm": 0.3285694718360901,
"learning_rate": 8e-05,
"loss": 1.7882,
"step": 2303
},
{
"epoch": 0.5067634444077862,
"grad_norm": 0.27823877334594727,
"learning_rate": 8e-05,
"loss": 1.4648,
"step": 2304
},
{
"epoch": 0.5069833938194215,
"grad_norm": 0.3157597482204437,
"learning_rate": 8e-05,
"loss": 1.8441,
"step": 2305
},
{
"epoch": 0.5072033432310569,
"grad_norm": 0.2913108170032501,
"learning_rate": 8e-05,
"loss": 1.7584,
"step": 2306
},
{
"epoch": 0.5074232926426921,
"grad_norm": 0.28753626346588135,
"learning_rate": 8e-05,
"loss": 1.7045,
"step": 2307
},
{
"epoch": 0.5076432420543275,
"grad_norm": 0.2981377840042114,
"learning_rate": 8e-05,
"loss": 1.54,
"step": 2308
},
{
"epoch": 0.5078631914659628,
"grad_norm": 0.2911180853843689,
"learning_rate": 8e-05,
"loss": 1.8266,
"step": 2309
},
{
"epoch": 0.5080831408775982,
"grad_norm": 0.2862488031387329,
"learning_rate": 8e-05,
"loss": 1.7503,
"step": 2310
},
{
"epoch": 0.5083030902892335,
"grad_norm": 0.3015568256378174,
"learning_rate": 8e-05,
"loss": 1.8209,
"step": 2311
},
{
"epoch": 0.5085230397008688,
"grad_norm": 0.27230823040008545,
"learning_rate": 8e-05,
"loss": 1.7977,
"step": 2312
},
{
"epoch": 0.5087429891125042,
"grad_norm": 0.27871981263160706,
"learning_rate": 8e-05,
"loss": 1.5971,
"step": 2313
},
{
"epoch": 0.5089629385241394,
"grad_norm": 0.2884436249732971,
"learning_rate": 8e-05,
"loss": 1.7726,
"step": 2314
},
{
"epoch": 0.5091828879357748,
"grad_norm": 0.2778714895248413,
"learning_rate": 8e-05,
"loss": 1.7104,
"step": 2315
},
{
"epoch": 0.5094028373474101,
"grad_norm": 0.30202385783195496,
"learning_rate": 8e-05,
"loss": 1.658,
"step": 2316
},
{
"epoch": 0.5096227867590454,
"grad_norm": 0.2785106599330902,
"learning_rate": 8e-05,
"loss": 1.717,
"step": 2317
},
{
"epoch": 0.5098427361706808,
"grad_norm": 0.28846096992492676,
"learning_rate": 8e-05,
"loss": 1.7456,
"step": 2318
},
{
"epoch": 0.510062685582316,
"grad_norm": 0.27753984928131104,
"learning_rate": 8e-05,
"loss": 1.6569,
"step": 2319
},
{
"epoch": 0.5102826349939514,
"grad_norm": 0.2834784686565399,
"learning_rate": 8e-05,
"loss": 1.6348,
"step": 2320
},
{
"epoch": 0.5105025844055867,
"grad_norm": 0.27789169549942017,
"learning_rate": 8e-05,
"loss": 1.5586,
"step": 2321
},
{
"epoch": 0.510722533817222,
"grad_norm": 0.28466710448265076,
"learning_rate": 8e-05,
"loss": 1.6793,
"step": 2322
},
{
"epoch": 0.5109424832288574,
"grad_norm": 0.2759189009666443,
"learning_rate": 8e-05,
"loss": 1.5311,
"step": 2323
},
{
"epoch": 0.5111624326404927,
"grad_norm": 0.2931334674358368,
"learning_rate": 8e-05,
"loss": 1.7258,
"step": 2324
},
{
"epoch": 0.511382382052128,
"grad_norm": 0.2740546464920044,
"learning_rate": 8e-05,
"loss": 1.558,
"step": 2325
},
{
"epoch": 0.5116023314637633,
"grad_norm": 0.29584407806396484,
"learning_rate": 8e-05,
"loss": 1.7777,
"step": 2326
},
{
"epoch": 0.5118222808753986,
"grad_norm": 0.2948019504547119,
"learning_rate": 8e-05,
"loss": 1.7722,
"step": 2327
},
{
"epoch": 0.512042230287034,
"grad_norm": 0.27409225702285767,
"learning_rate": 8e-05,
"loss": 1.6524,
"step": 2328
},
{
"epoch": 0.5122621796986693,
"grad_norm": 0.26278048753738403,
"learning_rate": 8e-05,
"loss": 1.5945,
"step": 2329
},
{
"epoch": 0.5124821291103047,
"grad_norm": 0.29483261704444885,
"learning_rate": 8e-05,
"loss": 1.8132,
"step": 2330
},
{
"epoch": 0.5127020785219399,
"grad_norm": 0.27037349343299866,
"learning_rate": 8e-05,
"loss": 1.5657,
"step": 2331
},
{
"epoch": 0.5129220279335753,
"grad_norm": 0.2826361060142517,
"learning_rate": 8e-05,
"loss": 1.6955,
"step": 2332
},
{
"epoch": 0.5131419773452106,
"grad_norm": 0.2957696318626404,
"learning_rate": 8e-05,
"loss": 1.814,
"step": 2333
},
{
"epoch": 0.5133619267568459,
"grad_norm": 0.2752826511859894,
"learning_rate": 8e-05,
"loss": 1.636,
"step": 2334
},
{
"epoch": 0.5135818761684813,
"grad_norm": 0.28523313999176025,
"learning_rate": 8e-05,
"loss": 1.5444,
"step": 2335
},
{
"epoch": 0.5138018255801166,
"grad_norm": 0.286304235458374,
"learning_rate": 8e-05,
"loss": 1.4665,
"step": 2336
},
{
"epoch": 0.514021774991752,
"grad_norm": 0.28738734126091003,
"learning_rate": 8e-05,
"loss": 1.6802,
"step": 2337
},
{
"epoch": 0.5142417244033872,
"grad_norm": 0.2669237554073334,
"learning_rate": 8e-05,
"loss": 1.6011,
"step": 2338
},
{
"epoch": 0.5144616738150225,
"grad_norm": 0.274325430393219,
"learning_rate": 8e-05,
"loss": 1.6248,
"step": 2339
},
{
"epoch": 0.5146816232266579,
"grad_norm": 0.2798522710800171,
"learning_rate": 8e-05,
"loss": 1.7636,
"step": 2340
},
{
"epoch": 0.5149015726382932,
"grad_norm": 0.27266305685043335,
"learning_rate": 8e-05,
"loss": 1.6434,
"step": 2341
},
{
"epoch": 0.5151215220499286,
"grad_norm": 0.2740791440010071,
"learning_rate": 8e-05,
"loss": 1.7084,
"step": 2342
},
{
"epoch": 0.5153414714615638,
"grad_norm": 0.28098320960998535,
"learning_rate": 8e-05,
"loss": 1.6356,
"step": 2343
},
{
"epoch": 0.5155614208731991,
"grad_norm": 0.2760515809059143,
"learning_rate": 8e-05,
"loss": 1.661,
"step": 2344
},
{
"epoch": 0.5157813702848345,
"grad_norm": 0.27894532680511475,
"learning_rate": 8e-05,
"loss": 1.6794,
"step": 2345
},
{
"epoch": 0.5160013196964698,
"grad_norm": 0.2972679138183594,
"learning_rate": 8e-05,
"loss": 1.6943,
"step": 2346
},
{
"epoch": 0.5162212691081052,
"grad_norm": 0.3100125789642334,
"learning_rate": 8e-05,
"loss": 1.7214,
"step": 2347
},
{
"epoch": 0.5164412185197405,
"grad_norm": 0.2743578255176544,
"learning_rate": 8e-05,
"loss": 1.7021,
"step": 2348
},
{
"epoch": 0.5166611679313757,
"grad_norm": 0.29266777634620667,
"learning_rate": 8e-05,
"loss": 1.7585,
"step": 2349
},
{
"epoch": 0.5168811173430111,
"grad_norm": 0.2791600227355957,
"learning_rate": 8e-05,
"loss": 1.7012,
"step": 2350
},
{
"epoch": 0.5171010667546464,
"grad_norm": 0.28535401821136475,
"learning_rate": 8e-05,
"loss": 1.6695,
"step": 2351
},
{
"epoch": 0.5173210161662818,
"grad_norm": 0.2860865592956543,
"learning_rate": 8e-05,
"loss": 1.7419,
"step": 2352
},
{
"epoch": 0.5175409655779171,
"grad_norm": 0.27693790197372437,
"learning_rate": 8e-05,
"loss": 1.5459,
"step": 2353
},
{
"epoch": 0.5177609149895525,
"grad_norm": 0.2858433723449707,
"learning_rate": 8e-05,
"loss": 1.799,
"step": 2354
},
{
"epoch": 0.5179808644011877,
"grad_norm": 0.30761632323265076,
"learning_rate": 8e-05,
"loss": 1.5971,
"step": 2355
},
{
"epoch": 0.518200813812823,
"grad_norm": 0.2943046987056732,
"learning_rate": 8e-05,
"loss": 1.7399,
"step": 2356
},
{
"epoch": 0.5184207632244584,
"grad_norm": 0.2748922109603882,
"learning_rate": 8e-05,
"loss": 1.7202,
"step": 2357
},
{
"epoch": 0.5186407126360937,
"grad_norm": 0.2818071246147156,
"learning_rate": 8e-05,
"loss": 1.5918,
"step": 2358
},
{
"epoch": 0.5188606620477291,
"grad_norm": 0.28235137462615967,
"learning_rate": 8e-05,
"loss": 1.5728,
"step": 2359
},
{
"epoch": 0.5190806114593643,
"grad_norm": 0.27833959460258484,
"learning_rate": 8e-05,
"loss": 1.5966,
"step": 2360
},
{
"epoch": 0.5193005608709996,
"grad_norm": 0.2731468677520752,
"learning_rate": 8e-05,
"loss": 1.6518,
"step": 2361
},
{
"epoch": 0.519520510282635,
"grad_norm": 0.2777821719646454,
"learning_rate": 8e-05,
"loss": 1.6885,
"step": 2362
},
{
"epoch": 0.5197404596942703,
"grad_norm": 0.2685951590538025,
"learning_rate": 8e-05,
"loss": 1.6315,
"step": 2363
},
{
"epoch": 0.5199604091059057,
"grad_norm": 0.3087875545024872,
"learning_rate": 8e-05,
"loss": 1.7525,
"step": 2364
},
{
"epoch": 0.520180358517541,
"grad_norm": 0.2693195044994354,
"learning_rate": 8e-05,
"loss": 1.5993,
"step": 2365
},
{
"epoch": 0.5204003079291762,
"grad_norm": 0.2832968235015869,
"learning_rate": 8e-05,
"loss": 1.5798,
"step": 2366
},
{
"epoch": 0.5206202573408116,
"grad_norm": 0.2873738706111908,
"learning_rate": 8e-05,
"loss": 1.7373,
"step": 2367
},
{
"epoch": 0.5208402067524469,
"grad_norm": 0.2888682782649994,
"learning_rate": 8e-05,
"loss": 1.6995,
"step": 2368
},
{
"epoch": 0.5210601561640823,
"grad_norm": 0.2788809537887573,
"learning_rate": 8e-05,
"loss": 1.5928,
"step": 2369
},
{
"epoch": 0.5212801055757176,
"grad_norm": 0.28021880984306335,
"learning_rate": 8e-05,
"loss": 1.6692,
"step": 2370
},
{
"epoch": 0.5215000549873529,
"grad_norm": 0.3041718006134033,
"learning_rate": 8e-05,
"loss": 1.7378,
"step": 2371
},
{
"epoch": 0.5217200043989882,
"grad_norm": 0.2774748206138611,
"learning_rate": 8e-05,
"loss": 1.5802,
"step": 2372
},
{
"epoch": 0.5219399538106235,
"grad_norm": 0.2876451015472412,
"learning_rate": 8e-05,
"loss": 1.8057,
"step": 2373
},
{
"epoch": 0.5221599032222589,
"grad_norm": 0.2740166485309601,
"learning_rate": 8e-05,
"loss": 1.6694,
"step": 2374
},
{
"epoch": 0.5223798526338942,
"grad_norm": 0.288555771112442,
"learning_rate": 8e-05,
"loss": 1.792,
"step": 2375
},
{
"epoch": 0.5225998020455296,
"grad_norm": 0.2777664065361023,
"learning_rate": 8e-05,
"loss": 1.6781,
"step": 2376
},
{
"epoch": 0.5228197514571649,
"grad_norm": 0.27108079195022583,
"learning_rate": 8e-05,
"loss": 1.4881,
"step": 2377
},
{
"epoch": 0.5230397008688001,
"grad_norm": 0.2909669578075409,
"learning_rate": 8e-05,
"loss": 1.7174,
"step": 2378
},
{
"epoch": 0.5232596502804355,
"grad_norm": 0.2978494167327881,
"learning_rate": 8e-05,
"loss": 1.8641,
"step": 2379
},
{
"epoch": 0.5234795996920708,
"grad_norm": 0.2649437487125397,
"learning_rate": 8e-05,
"loss": 1.473,
"step": 2380
},
{
"epoch": 0.5236995491037062,
"grad_norm": 0.28939372301101685,
"learning_rate": 8e-05,
"loss": 1.5567,
"step": 2381
},
{
"epoch": 0.5239194985153415,
"grad_norm": 0.2740820646286011,
"learning_rate": 8e-05,
"loss": 1.6321,
"step": 2382
},
{
"epoch": 0.5241394479269768,
"grad_norm": 0.28426122665405273,
"learning_rate": 8e-05,
"loss": 1.5952,
"step": 2383
},
{
"epoch": 0.5243593973386121,
"grad_norm": 0.28176257014274597,
"learning_rate": 8e-05,
"loss": 1.6231,
"step": 2384
},
{
"epoch": 0.5245793467502474,
"grad_norm": 0.29681360721588135,
"learning_rate": 8e-05,
"loss": 1.8203,
"step": 2385
},
{
"epoch": 0.5247992961618828,
"grad_norm": 0.272658109664917,
"learning_rate": 8e-05,
"loss": 1.6942,
"step": 2386
},
{
"epoch": 0.5250192455735181,
"grad_norm": 0.27786141633987427,
"learning_rate": 8e-05,
"loss": 1.6081,
"step": 2387
},
{
"epoch": 0.5252391949851534,
"grad_norm": 0.2938309609889984,
"learning_rate": 8e-05,
"loss": 1.7454,
"step": 2388
},
{
"epoch": 0.5254591443967888,
"grad_norm": 0.2710343599319458,
"learning_rate": 8e-05,
"loss": 1.6391,
"step": 2389
},
{
"epoch": 0.525679093808424,
"grad_norm": 0.2757870554924011,
"learning_rate": 8e-05,
"loss": 1.6526,
"step": 2390
},
{
"epoch": 0.5258990432200594,
"grad_norm": 0.2581859827041626,
"learning_rate": 8e-05,
"loss": 1.4344,
"step": 2391
},
{
"epoch": 0.5261189926316947,
"grad_norm": 0.2732166647911072,
"learning_rate": 8e-05,
"loss": 1.5987,
"step": 2392
},
{
"epoch": 0.52633894204333,
"grad_norm": 0.2859753370285034,
"learning_rate": 8e-05,
"loss": 1.6654,
"step": 2393
},
{
"epoch": 0.5265588914549654,
"grad_norm": 0.2680748701095581,
"learning_rate": 8e-05,
"loss": 1.5764,
"step": 2394
},
{
"epoch": 0.5267788408666007,
"grad_norm": 0.2866816818714142,
"learning_rate": 8e-05,
"loss": 1.7725,
"step": 2395
},
{
"epoch": 0.526998790278236,
"grad_norm": 0.27792397141456604,
"learning_rate": 8e-05,
"loss": 1.5932,
"step": 2396
},
{
"epoch": 0.5272187396898713,
"grad_norm": 0.26985982060432434,
"learning_rate": 8e-05,
"loss": 1.5955,
"step": 2397
},
{
"epoch": 0.5274386891015067,
"grad_norm": 0.28183454275131226,
"learning_rate": 8e-05,
"loss": 1.553,
"step": 2398
},
{
"epoch": 0.527658638513142,
"grad_norm": 0.29282763600349426,
"learning_rate": 8e-05,
"loss": 1.7858,
"step": 2399
},
{
"epoch": 0.5278785879247773,
"grad_norm": 0.30619367957115173,
"learning_rate": 8e-05,
"loss": 1.718,
"step": 2400
},
{
"epoch": 0.5280985373364127,
"grad_norm": 0.26707130670547485,
"learning_rate": 8e-05,
"loss": 1.5899,
"step": 2401
},
{
"epoch": 0.5283184867480479,
"grad_norm": 0.3182383179664612,
"learning_rate": 8e-05,
"loss": 1.7268,
"step": 2402
},
{
"epoch": 0.5285384361596833,
"grad_norm": 0.3178313374519348,
"learning_rate": 8e-05,
"loss": 1.7282,
"step": 2403
},
{
"epoch": 0.5287583855713186,
"grad_norm": 0.26504799723625183,
"learning_rate": 8e-05,
"loss": 1.6046,
"step": 2404
},
{
"epoch": 0.5289783349829539,
"grad_norm": 0.2749512195587158,
"learning_rate": 8e-05,
"loss": 1.6251,
"step": 2405
},
{
"epoch": 0.5291982843945893,
"grad_norm": 0.27312803268432617,
"learning_rate": 8e-05,
"loss": 1.746,
"step": 2406
},
{
"epoch": 0.5294182338062245,
"grad_norm": 0.26339027285575867,
"learning_rate": 8e-05,
"loss": 1.5214,
"step": 2407
},
{
"epoch": 0.5296381832178599,
"grad_norm": 0.28254935145378113,
"learning_rate": 8e-05,
"loss": 1.629,
"step": 2408
},
{
"epoch": 0.5298581326294952,
"grad_norm": 0.2761283218860626,
"learning_rate": 8e-05,
"loss": 1.7202,
"step": 2409
},
{
"epoch": 0.5300780820411305,
"grad_norm": 0.27570095658302307,
"learning_rate": 8e-05,
"loss": 1.7042,
"step": 2410
},
{
"epoch": 0.5302980314527659,
"grad_norm": 0.2886349856853485,
"learning_rate": 8e-05,
"loss": 1.7923,
"step": 2411
},
{
"epoch": 0.5305179808644012,
"grad_norm": 0.29611504077911377,
"learning_rate": 8e-05,
"loss": 1.7219,
"step": 2412
},
{
"epoch": 0.5307379302760366,
"grad_norm": 0.28122174739837646,
"learning_rate": 8e-05,
"loss": 1.6888,
"step": 2413
},
{
"epoch": 0.5309578796876718,
"grad_norm": 0.2690391540527344,
"learning_rate": 8e-05,
"loss": 1.2745,
"step": 2414
},
{
"epoch": 0.5311778290993071,
"grad_norm": 0.2676471173763275,
"learning_rate": 8e-05,
"loss": 1.6422,
"step": 2415
},
{
"epoch": 0.5313977785109425,
"grad_norm": 0.2947712540626526,
"learning_rate": 8e-05,
"loss": 1.654,
"step": 2416
},
{
"epoch": 0.5316177279225778,
"grad_norm": 0.27766644954681396,
"learning_rate": 8e-05,
"loss": 1.6208,
"step": 2417
},
{
"epoch": 0.5318376773342132,
"grad_norm": 0.28579944372177124,
"learning_rate": 8e-05,
"loss": 1.8171,
"step": 2418
},
{
"epoch": 0.5320576267458484,
"grad_norm": 0.2734217345714569,
"learning_rate": 8e-05,
"loss": 1.658,
"step": 2419
},
{
"epoch": 0.5322775761574838,
"grad_norm": 0.28343021869659424,
"learning_rate": 8e-05,
"loss": 1.6291,
"step": 2420
},
{
"epoch": 0.5324975255691191,
"grad_norm": 0.2881801128387451,
"learning_rate": 8e-05,
"loss": 1.5212,
"step": 2421
},
{
"epoch": 0.5327174749807544,
"grad_norm": 0.27267688512802124,
"learning_rate": 8e-05,
"loss": 1.6599,
"step": 2422
},
{
"epoch": 0.5329374243923898,
"grad_norm": 0.29100489616394043,
"learning_rate": 8e-05,
"loss": 1.6636,
"step": 2423
},
{
"epoch": 0.5331573738040251,
"grad_norm": 0.301812082529068,
"learning_rate": 8e-05,
"loss": 1.9097,
"step": 2424
},
{
"epoch": 0.5333773232156604,
"grad_norm": 0.2864093482494354,
"learning_rate": 8e-05,
"loss": 1.6535,
"step": 2425
},
{
"epoch": 0.5335972726272957,
"grad_norm": 0.28721320629119873,
"learning_rate": 8e-05,
"loss": 1.7307,
"step": 2426
},
{
"epoch": 0.533817222038931,
"grad_norm": 0.3100323975086212,
"learning_rate": 8e-05,
"loss": 1.7155,
"step": 2427
},
{
"epoch": 0.5340371714505664,
"grad_norm": 0.2595236301422119,
"learning_rate": 8e-05,
"loss": 1.4525,
"step": 2428
},
{
"epoch": 0.5342571208622017,
"grad_norm": 0.27981269359588623,
"learning_rate": 8e-05,
"loss": 1.6821,
"step": 2429
},
{
"epoch": 0.5344770702738371,
"grad_norm": 0.28523892164230347,
"learning_rate": 8e-05,
"loss": 1.6213,
"step": 2430
},
{
"epoch": 0.5346970196854723,
"grad_norm": 0.2951820194721222,
"learning_rate": 8e-05,
"loss": 1.7798,
"step": 2431
},
{
"epoch": 0.5349169690971076,
"grad_norm": 0.27744752168655396,
"learning_rate": 8e-05,
"loss": 1.664,
"step": 2432
},
{
"epoch": 0.535136918508743,
"grad_norm": 0.2700327932834625,
"learning_rate": 8e-05,
"loss": 1.6476,
"step": 2433
},
{
"epoch": 0.5353568679203783,
"grad_norm": 0.3043116331100464,
"learning_rate": 8e-05,
"loss": 1.8377,
"step": 2434
},
{
"epoch": 0.5355768173320137,
"grad_norm": 0.2886519730091095,
"learning_rate": 8e-05,
"loss": 1.7098,
"step": 2435
},
{
"epoch": 0.535796766743649,
"grad_norm": 0.28121626377105713,
"learning_rate": 8e-05,
"loss": 1.5902,
"step": 2436
},
{
"epoch": 0.5360167161552842,
"grad_norm": 0.28657859563827515,
"learning_rate": 8e-05,
"loss": 1.6769,
"step": 2437
},
{
"epoch": 0.5362366655669196,
"grad_norm": 0.3111754059791565,
"learning_rate": 8e-05,
"loss": 1.8352,
"step": 2438
},
{
"epoch": 0.5364566149785549,
"grad_norm": 0.27172762155532837,
"learning_rate": 8e-05,
"loss": 1.5897,
"step": 2439
},
{
"epoch": 0.5366765643901903,
"grad_norm": 0.28469017148017883,
"learning_rate": 8e-05,
"loss": 1.6976,
"step": 2440
},
{
"epoch": 0.5368965138018256,
"grad_norm": 0.29801180958747864,
"learning_rate": 8e-05,
"loss": 1.7811,
"step": 2441
},
{
"epoch": 0.5371164632134608,
"grad_norm": 0.2860267758369446,
"learning_rate": 8e-05,
"loss": 1.7026,
"step": 2442
},
{
"epoch": 0.5373364126250962,
"grad_norm": 0.3069910705089569,
"learning_rate": 8e-05,
"loss": 1.823,
"step": 2443
},
{
"epoch": 0.5375563620367315,
"grad_norm": 0.29847028851509094,
"learning_rate": 8e-05,
"loss": 1.6314,
"step": 2444
},
{
"epoch": 0.5377763114483669,
"grad_norm": 0.2970685660839081,
"learning_rate": 8e-05,
"loss": 1.8591,
"step": 2445
},
{
"epoch": 0.5379962608600022,
"grad_norm": 0.28767916560173035,
"learning_rate": 8e-05,
"loss": 1.6244,
"step": 2446
},
{
"epoch": 0.5382162102716376,
"grad_norm": 0.2858954966068268,
"learning_rate": 8e-05,
"loss": 1.6002,
"step": 2447
},
{
"epoch": 0.5384361596832729,
"grad_norm": 0.25083082914352417,
"learning_rate": 8e-05,
"loss": 1.4772,
"step": 2448
},
{
"epoch": 0.5386561090949081,
"grad_norm": 0.28772327303886414,
"learning_rate": 8e-05,
"loss": 1.7885,
"step": 2449
},
{
"epoch": 0.5388760585065435,
"grad_norm": 0.300503671169281,
"learning_rate": 8e-05,
"loss": 1.8074,
"step": 2450
},
{
"epoch": 0.5390960079181788,
"grad_norm": 0.29243797063827515,
"learning_rate": 8e-05,
"loss": 1.7033,
"step": 2451
},
{
"epoch": 0.5393159573298142,
"grad_norm": 0.28921830654144287,
"learning_rate": 8e-05,
"loss": 1.734,
"step": 2452
},
{
"epoch": 0.5395359067414495,
"grad_norm": 0.2754501700401306,
"learning_rate": 8e-05,
"loss": 1.577,
"step": 2453
},
{
"epoch": 0.5397558561530847,
"grad_norm": 0.26824522018432617,
"learning_rate": 8e-05,
"loss": 1.6434,
"step": 2454
},
{
"epoch": 0.5399758055647201,
"grad_norm": 0.26851388812065125,
"learning_rate": 8e-05,
"loss": 1.6706,
"step": 2455
},
{
"epoch": 0.5401957549763554,
"grad_norm": 0.2697846293449402,
"learning_rate": 8e-05,
"loss": 1.6052,
"step": 2456
},
{
"epoch": 0.5404157043879908,
"grad_norm": 0.27774059772491455,
"learning_rate": 8e-05,
"loss": 1.663,
"step": 2457
},
{
"epoch": 0.5406356537996261,
"grad_norm": 0.2799103558063507,
"learning_rate": 8e-05,
"loss": 1.9412,
"step": 2458
},
{
"epoch": 0.5408556032112614,
"grad_norm": 0.2874007523059845,
"learning_rate": 8e-05,
"loss": 1.6366,
"step": 2459
},
{
"epoch": 0.5410755526228967,
"grad_norm": 0.29054176807403564,
"learning_rate": 8e-05,
"loss": 1.7546,
"step": 2460
},
{
"epoch": 0.541295502034532,
"grad_norm": 0.29359421133995056,
"learning_rate": 8e-05,
"loss": 1.8274,
"step": 2461
},
{
"epoch": 0.5415154514461674,
"grad_norm": 0.29589033126831055,
"learning_rate": 8e-05,
"loss": 1.8619,
"step": 2462
},
{
"epoch": 0.5417354008578027,
"grad_norm": 0.2997150421142578,
"learning_rate": 8e-05,
"loss": 1.7685,
"step": 2463
},
{
"epoch": 0.541955350269438,
"grad_norm": 0.2759319543838501,
"learning_rate": 8e-05,
"loss": 1.5652,
"step": 2464
},
{
"epoch": 0.5421752996810734,
"grad_norm": 0.27741214632987976,
"learning_rate": 8e-05,
"loss": 1.6247,
"step": 2465
},
{
"epoch": 0.5423952490927086,
"grad_norm": 0.29365673661231995,
"learning_rate": 8e-05,
"loss": 1.7768,
"step": 2466
},
{
"epoch": 0.542615198504344,
"grad_norm": 0.2897026836872101,
"learning_rate": 8e-05,
"loss": 1.7435,
"step": 2467
},
{
"epoch": 0.5428351479159793,
"grad_norm": 0.2963312566280365,
"learning_rate": 8e-05,
"loss": 1.7792,
"step": 2468
},
{
"epoch": 0.5430550973276147,
"grad_norm": 0.3142043948173523,
"learning_rate": 8e-05,
"loss": 1.8348,
"step": 2469
},
{
"epoch": 0.54327504673925,
"grad_norm": 0.28869184851646423,
"learning_rate": 8e-05,
"loss": 1.6916,
"step": 2470
},
{
"epoch": 0.5434949961508853,
"grad_norm": 0.27220281958580017,
"learning_rate": 8e-05,
"loss": 1.5963,
"step": 2471
},
{
"epoch": 0.5437149455625206,
"grad_norm": 0.3002524971961975,
"learning_rate": 8e-05,
"loss": 1.7516,
"step": 2472
},
{
"epoch": 0.5439348949741559,
"grad_norm": 0.27016308903694153,
"learning_rate": 8e-05,
"loss": 1.5655,
"step": 2473
},
{
"epoch": 0.5441548443857913,
"grad_norm": 0.2886146903038025,
"learning_rate": 8e-05,
"loss": 1.6563,
"step": 2474
},
{
"epoch": 0.5443747937974266,
"grad_norm": 0.2743261158466339,
"learning_rate": 8e-05,
"loss": 1.7916,
"step": 2475
},
{
"epoch": 0.5445947432090619,
"grad_norm": 0.27933475375175476,
"learning_rate": 8e-05,
"loss": 1.7397,
"step": 2476
},
{
"epoch": 0.5448146926206973,
"grad_norm": 0.2805885672569275,
"learning_rate": 8e-05,
"loss": 1.6608,
"step": 2477
},
{
"epoch": 0.5450346420323325,
"grad_norm": 0.26985716819763184,
"learning_rate": 8e-05,
"loss": 1.6551,
"step": 2478
},
{
"epoch": 0.5452545914439679,
"grad_norm": 0.2778765857219696,
"learning_rate": 8e-05,
"loss": 1.6792,
"step": 2479
},
{
"epoch": 0.5454745408556032,
"grad_norm": 0.27623313665390015,
"learning_rate": 8e-05,
"loss": 1.6867,
"step": 2480
},
{
"epoch": 0.5456944902672385,
"grad_norm": 0.27185389399528503,
"learning_rate": 8e-05,
"loss": 1.6184,
"step": 2481
},
{
"epoch": 0.5459144396788739,
"grad_norm": 0.29302138090133667,
"learning_rate": 8e-05,
"loss": 1.7075,
"step": 2482
},
{
"epoch": 0.5461343890905092,
"grad_norm": 0.26639193296432495,
"learning_rate": 8e-05,
"loss": 1.6138,
"step": 2483
},
{
"epoch": 0.5463543385021445,
"grad_norm": 0.28048211336135864,
"learning_rate": 8e-05,
"loss": 1.6672,
"step": 2484
},
{
"epoch": 0.5465742879137798,
"grad_norm": 0.2844570577144623,
"learning_rate": 8e-05,
"loss": 1.6888,
"step": 2485
},
{
"epoch": 0.5467942373254151,
"grad_norm": 0.2801128923892975,
"learning_rate": 8e-05,
"loss": 1.8506,
"step": 2486
},
{
"epoch": 0.5470141867370505,
"grad_norm": 0.2718241810798645,
"learning_rate": 8e-05,
"loss": 1.6105,
"step": 2487
},
{
"epoch": 0.5472341361486858,
"grad_norm": 0.28759825229644775,
"learning_rate": 8e-05,
"loss": 1.7449,
"step": 2488
},
{
"epoch": 0.5474540855603212,
"grad_norm": 0.29218876361846924,
"learning_rate": 8e-05,
"loss": 1.8732,
"step": 2489
},
{
"epoch": 0.5476740349719564,
"grad_norm": 0.29760751128196716,
"learning_rate": 8e-05,
"loss": 1.7804,
"step": 2490
},
{
"epoch": 0.5478939843835918,
"grad_norm": 0.28636956214904785,
"learning_rate": 8e-05,
"loss": 1.6994,
"step": 2491
},
{
"epoch": 0.5481139337952271,
"grad_norm": 0.2892046570777893,
"learning_rate": 8e-05,
"loss": 1.759,
"step": 2492
},
{
"epoch": 0.5483338832068624,
"grad_norm": 0.280556857585907,
"learning_rate": 8e-05,
"loss": 1.8084,
"step": 2493
},
{
"epoch": 0.5485538326184978,
"grad_norm": 0.2733471691608429,
"learning_rate": 8e-05,
"loss": 1.7293,
"step": 2494
},
{
"epoch": 0.548773782030133,
"grad_norm": 0.2813643515110016,
"learning_rate": 8e-05,
"loss": 1.6178,
"step": 2495
},
{
"epoch": 0.5489937314417684,
"grad_norm": 0.27255943417549133,
"learning_rate": 8e-05,
"loss": 1.6827,
"step": 2496
},
{
"epoch": 0.5492136808534037,
"grad_norm": 0.2690375745296478,
"learning_rate": 8e-05,
"loss": 1.684,
"step": 2497
},
{
"epoch": 0.549433630265039,
"grad_norm": 0.30036401748657227,
"learning_rate": 8e-05,
"loss": 1.8676,
"step": 2498
},
{
"epoch": 0.5496535796766744,
"grad_norm": 0.27924251556396484,
"learning_rate": 8e-05,
"loss": 1.6619,
"step": 2499
},
{
"epoch": 0.5498735290883097,
"grad_norm": 0.2792947590351105,
"learning_rate": 8e-05,
"loss": 1.6121,
"step": 2500
},
{
"epoch": 0.550093478499945,
"grad_norm": 0.27976930141448975,
"learning_rate": 8e-05,
"loss": 1.5815,
"step": 2501
},
{
"epoch": 0.5503134279115803,
"grad_norm": 0.28429850935935974,
"learning_rate": 8e-05,
"loss": 1.6713,
"step": 2502
},
{
"epoch": 0.5505333773232156,
"grad_norm": 0.2669944763183594,
"learning_rate": 8e-05,
"loss": 1.5065,
"step": 2503
},
{
"epoch": 0.550753326734851,
"grad_norm": 0.2846994400024414,
"learning_rate": 8e-05,
"loss": 1.7238,
"step": 2504
},
{
"epoch": 0.5509732761464863,
"grad_norm": 0.27598071098327637,
"learning_rate": 8e-05,
"loss": 1.5364,
"step": 2505
},
{
"epoch": 0.5511932255581217,
"grad_norm": 0.27275460958480835,
"learning_rate": 8e-05,
"loss": 1.6171,
"step": 2506
},
{
"epoch": 0.551413174969757,
"grad_norm": 0.2846895456314087,
"learning_rate": 8e-05,
"loss": 1.7082,
"step": 2507
},
{
"epoch": 0.5516331243813922,
"grad_norm": 0.3010547161102295,
"learning_rate": 8e-05,
"loss": 1.7946,
"step": 2508
},
{
"epoch": 0.5518530737930276,
"grad_norm": 0.28405773639678955,
"learning_rate": 8e-05,
"loss": 1.6063,
"step": 2509
},
{
"epoch": 0.5520730232046629,
"grad_norm": 0.2855536639690399,
"learning_rate": 8e-05,
"loss": 1.6138,
"step": 2510
},
{
"epoch": 0.5522929726162983,
"grad_norm": 0.2949456572532654,
"learning_rate": 8e-05,
"loss": 1.7734,
"step": 2511
},
{
"epoch": 0.5525129220279336,
"grad_norm": 0.31665512919425964,
"learning_rate": 8e-05,
"loss": 1.7163,
"step": 2512
},
{
"epoch": 0.552732871439569,
"grad_norm": 0.2881389260292053,
"learning_rate": 8e-05,
"loss": 1.5617,
"step": 2513
},
{
"epoch": 0.5529528208512042,
"grad_norm": 0.26758721470832825,
"learning_rate": 8e-05,
"loss": 1.6714,
"step": 2514
},
{
"epoch": 0.5531727702628395,
"grad_norm": 0.29549580812454224,
"learning_rate": 8e-05,
"loss": 1.6516,
"step": 2515
},
{
"epoch": 0.5533927196744749,
"grad_norm": 0.2811340391635895,
"learning_rate": 8e-05,
"loss": 1.6026,
"step": 2516
},
{
"epoch": 0.5536126690861102,
"grad_norm": 0.2837204039096832,
"learning_rate": 8e-05,
"loss": 1.8413,
"step": 2517
},
{
"epoch": 0.5538326184977456,
"grad_norm": 0.276216983795166,
"learning_rate": 8e-05,
"loss": 1.6671,
"step": 2518
},
{
"epoch": 0.5540525679093808,
"grad_norm": 0.2781767249107361,
"learning_rate": 8e-05,
"loss": 1.5886,
"step": 2519
},
{
"epoch": 0.5542725173210161,
"grad_norm": 0.2830861210823059,
"learning_rate": 8e-05,
"loss": 1.6097,
"step": 2520
},
{
"epoch": 0.5544924667326515,
"grad_norm": 0.2746805250644684,
"learning_rate": 8e-05,
"loss": 1.7153,
"step": 2521
},
{
"epoch": 0.5547124161442868,
"grad_norm": 0.2781994640827179,
"learning_rate": 8e-05,
"loss": 1.6597,
"step": 2522
},
{
"epoch": 0.5549323655559222,
"grad_norm": 0.2919979393482208,
"learning_rate": 8e-05,
"loss": 1.715,
"step": 2523
},
{
"epoch": 0.5551523149675575,
"grad_norm": 0.27563661336898804,
"learning_rate": 8e-05,
"loss": 1.6129,
"step": 2524
},
{
"epoch": 0.5553722643791927,
"grad_norm": 0.3070942163467407,
"learning_rate": 8e-05,
"loss": 1.875,
"step": 2525
},
{
"epoch": 0.5555922137908281,
"grad_norm": 0.278039813041687,
"learning_rate": 8e-05,
"loss": 1.6894,
"step": 2526
},
{
"epoch": 0.5558121632024634,
"grad_norm": 0.2709571421146393,
"learning_rate": 8e-05,
"loss": 1.5268,
"step": 2527
},
{
"epoch": 0.5560321126140988,
"grad_norm": 0.27659523487091064,
"learning_rate": 8e-05,
"loss": 1.5817,
"step": 2528
},
{
"epoch": 0.5562520620257341,
"grad_norm": 0.33376970887184143,
"learning_rate": 8e-05,
"loss": 1.6275,
"step": 2529
},
{
"epoch": 0.5564720114373694,
"grad_norm": 0.28663134574890137,
"learning_rate": 8e-05,
"loss": 1.6497,
"step": 2530
},
{
"epoch": 0.5566919608490047,
"grad_norm": 0.27400556206703186,
"learning_rate": 8e-05,
"loss": 1.6768,
"step": 2531
},
{
"epoch": 0.55691191026064,
"grad_norm": 0.3359694182872772,
"learning_rate": 8e-05,
"loss": 1.7628,
"step": 2532
},
{
"epoch": 0.5571318596722754,
"grad_norm": 0.3009445071220398,
"learning_rate": 8e-05,
"loss": 1.6368,
"step": 2533
},
{
"epoch": 0.5573518090839107,
"grad_norm": 0.2951606512069702,
"learning_rate": 8e-05,
"loss": 1.6468,
"step": 2534
},
{
"epoch": 0.5575717584955461,
"grad_norm": 0.298835426568985,
"learning_rate": 8e-05,
"loss": 1.679,
"step": 2535
},
{
"epoch": 0.5577917079071814,
"grad_norm": 0.29196399450302124,
"learning_rate": 8e-05,
"loss": 1.6865,
"step": 2536
},
{
"epoch": 0.5580116573188166,
"grad_norm": 0.3057127296924591,
"learning_rate": 8e-05,
"loss": 1.7979,
"step": 2537
},
{
"epoch": 0.558231606730452,
"grad_norm": 0.3170565664768219,
"learning_rate": 8e-05,
"loss": 1.7815,
"step": 2538
},
{
"epoch": 0.5584515561420873,
"grad_norm": 0.28287273645401,
"learning_rate": 8e-05,
"loss": 1.7151,
"step": 2539
},
{
"epoch": 0.5586715055537227,
"grad_norm": 0.30313780903816223,
"learning_rate": 8e-05,
"loss": 1.6501,
"step": 2540
},
{
"epoch": 0.558891454965358,
"grad_norm": 0.28195586800575256,
"learning_rate": 8e-05,
"loss": 1.7281,
"step": 2541
},
{
"epoch": 0.5591114043769932,
"grad_norm": 0.2734014391899109,
"learning_rate": 8e-05,
"loss": 1.6504,
"step": 2542
},
{
"epoch": 0.5593313537886286,
"grad_norm": 0.28178513050079346,
"learning_rate": 8e-05,
"loss": 1.5946,
"step": 2543
},
{
"epoch": 0.5595513032002639,
"grad_norm": 0.2800062894821167,
"learning_rate": 8e-05,
"loss": 1.7498,
"step": 2544
},
{
"epoch": 0.5597712526118993,
"grad_norm": 0.28368762135505676,
"learning_rate": 8e-05,
"loss": 1.6732,
"step": 2545
},
{
"epoch": 0.5599912020235346,
"grad_norm": 0.3069396913051605,
"learning_rate": 8e-05,
"loss": 1.7647,
"step": 2546
},
{
"epoch": 0.5602111514351699,
"grad_norm": 0.27336394786834717,
"learning_rate": 8e-05,
"loss": 1.4557,
"step": 2547
},
{
"epoch": 0.5604311008468053,
"grad_norm": 0.28363245725631714,
"learning_rate": 8e-05,
"loss": 1.7301,
"step": 2548
},
{
"epoch": 0.5606510502584405,
"grad_norm": 0.3097067177295685,
"learning_rate": 8e-05,
"loss": 1.7322,
"step": 2549
},
{
"epoch": 0.5608709996700759,
"grad_norm": 0.28125154972076416,
"learning_rate": 8e-05,
"loss": 1.5551,
"step": 2550
},
{
"epoch": 0.5610909490817112,
"grad_norm": 0.3111821413040161,
"learning_rate": 8e-05,
"loss": 1.8965,
"step": 2551
},
{
"epoch": 0.5613108984933465,
"grad_norm": 0.2920529842376709,
"learning_rate": 8e-05,
"loss": 1.6835,
"step": 2552
},
{
"epoch": 0.5615308479049819,
"grad_norm": 0.27278631925582886,
"learning_rate": 8e-05,
"loss": 1.7175,
"step": 2553
},
{
"epoch": 0.5617507973166171,
"grad_norm": 0.2742355763912201,
"learning_rate": 8e-05,
"loss": 1.6745,
"step": 2554
},
{
"epoch": 0.5619707467282525,
"grad_norm": 0.2675003707408905,
"learning_rate": 8e-05,
"loss": 1.5771,
"step": 2555
},
{
"epoch": 0.5621906961398878,
"grad_norm": 0.2805350422859192,
"learning_rate": 8e-05,
"loss": 1.7503,
"step": 2556
},
{
"epoch": 0.5624106455515232,
"grad_norm": 0.27205830812454224,
"learning_rate": 8e-05,
"loss": 1.6591,
"step": 2557
},
{
"epoch": 0.5626305949631585,
"grad_norm": 0.26984983682632446,
"learning_rate": 8e-05,
"loss": 1.6351,
"step": 2558
},
{
"epoch": 0.5628505443747938,
"grad_norm": 0.3067481517791748,
"learning_rate": 8e-05,
"loss": 1.5304,
"step": 2559
},
{
"epoch": 0.5630704937864291,
"grad_norm": 0.28945624828338623,
"learning_rate": 8e-05,
"loss": 1.7099,
"step": 2560
},
{
"epoch": 0.5632904431980644,
"grad_norm": 0.269144743680954,
"learning_rate": 8e-05,
"loss": 1.557,
"step": 2561
},
{
"epoch": 0.5635103926096998,
"grad_norm": 0.329520583152771,
"learning_rate": 8e-05,
"loss": 1.7867,
"step": 2562
},
{
"epoch": 0.5637303420213351,
"grad_norm": 0.35944700241088867,
"learning_rate": 8e-05,
"loss": 1.8146,
"step": 2563
},
{
"epoch": 0.5639502914329704,
"grad_norm": 0.30693116784095764,
"learning_rate": 8e-05,
"loss": 1.7709,
"step": 2564
},
{
"epoch": 0.5641702408446058,
"grad_norm": 0.31814631819725037,
"learning_rate": 8e-05,
"loss": 1.8679,
"step": 2565
},
{
"epoch": 0.564390190256241,
"grad_norm": 0.2988479435443878,
"learning_rate": 8e-05,
"loss": 1.7675,
"step": 2566
},
{
"epoch": 0.5646101396678764,
"grad_norm": 0.2955850064754486,
"learning_rate": 8e-05,
"loss": 1.7082,
"step": 2567
},
{
"epoch": 0.5648300890795117,
"grad_norm": 0.27773404121398926,
"learning_rate": 8e-05,
"loss": 1.5378,
"step": 2568
},
{
"epoch": 0.565050038491147,
"grad_norm": 0.2847524583339691,
"learning_rate": 8e-05,
"loss": 1.792,
"step": 2569
},
{
"epoch": 0.5652699879027824,
"grad_norm": 0.29024967551231384,
"learning_rate": 8e-05,
"loss": 1.6185,
"step": 2570
},
{
"epoch": 0.5654899373144177,
"grad_norm": 0.27534323930740356,
"learning_rate": 8e-05,
"loss": 1.7044,
"step": 2571
},
{
"epoch": 0.565709886726053,
"grad_norm": 0.28059902787208557,
"learning_rate": 8e-05,
"loss": 1.7199,
"step": 2572
},
{
"epoch": 0.5659298361376883,
"grad_norm": 0.29140958189964294,
"learning_rate": 8e-05,
"loss": 1.6534,
"step": 2573
},
{
"epoch": 0.5661497855493236,
"grad_norm": 0.303821861743927,
"learning_rate": 8e-05,
"loss": 1.7163,
"step": 2574
},
{
"epoch": 0.566369734960959,
"grad_norm": 0.3073093295097351,
"learning_rate": 8e-05,
"loss": 1.7885,
"step": 2575
},
{
"epoch": 0.5665896843725943,
"grad_norm": 0.2976214289665222,
"learning_rate": 8e-05,
"loss": 1.7059,
"step": 2576
},
{
"epoch": 0.5668096337842297,
"grad_norm": 0.3081284761428833,
"learning_rate": 8e-05,
"loss": 1.5529,
"step": 2577
},
{
"epoch": 0.5670295831958649,
"grad_norm": 0.2893354594707489,
"learning_rate": 8e-05,
"loss": 1.8564,
"step": 2578
},
{
"epoch": 0.5672495326075003,
"grad_norm": 0.2904176115989685,
"learning_rate": 8e-05,
"loss": 1.6903,
"step": 2579
},
{
"epoch": 0.5674694820191356,
"grad_norm": 0.2907819449901581,
"learning_rate": 8e-05,
"loss": 1.5663,
"step": 2580
},
{
"epoch": 0.5676894314307709,
"grad_norm": 0.27938172221183777,
"learning_rate": 8e-05,
"loss": 1.6031,
"step": 2581
},
{
"epoch": 0.5679093808424063,
"grad_norm": 0.28864786028862,
"learning_rate": 8e-05,
"loss": 1.666,
"step": 2582
},
{
"epoch": 0.5681293302540416,
"grad_norm": 0.29587891697883606,
"learning_rate": 8e-05,
"loss": 1.7545,
"step": 2583
},
{
"epoch": 0.5683492796656769,
"grad_norm": 0.26541203260421753,
"learning_rate": 8e-05,
"loss": 1.6059,
"step": 2584
},
{
"epoch": 0.5685692290773122,
"grad_norm": 0.2819576561450958,
"learning_rate": 8e-05,
"loss": 1.7227,
"step": 2585
},
{
"epoch": 0.5687891784889475,
"grad_norm": 0.2920463979244232,
"learning_rate": 8e-05,
"loss": 1.7553,
"step": 2586
},
{
"epoch": 0.5690091279005829,
"grad_norm": 0.29490089416503906,
"learning_rate": 8e-05,
"loss": 1.7117,
"step": 2587
},
{
"epoch": 0.5692290773122182,
"grad_norm": 0.29847970604896545,
"learning_rate": 8e-05,
"loss": 1.8931,
"step": 2588
},
{
"epoch": 0.5694490267238536,
"grad_norm": 0.28575995564460754,
"learning_rate": 8e-05,
"loss": 1.595,
"step": 2589
},
{
"epoch": 0.5696689761354888,
"grad_norm": 0.28053271770477295,
"learning_rate": 8e-05,
"loss": 1.6089,
"step": 2590
},
{
"epoch": 0.5698889255471241,
"grad_norm": 0.27538979053497314,
"learning_rate": 8e-05,
"loss": 1.7808,
"step": 2591
},
{
"epoch": 0.5701088749587595,
"grad_norm": 0.2819748520851135,
"learning_rate": 8e-05,
"loss": 1.7355,
"step": 2592
},
{
"epoch": 0.5703288243703948,
"grad_norm": 0.3023085594177246,
"learning_rate": 8e-05,
"loss": 1.759,
"step": 2593
},
{
"epoch": 0.5705487737820302,
"grad_norm": 0.28369995951652527,
"learning_rate": 8e-05,
"loss": 1.7796,
"step": 2594
},
{
"epoch": 0.5707687231936655,
"grad_norm": 0.27062156796455383,
"learning_rate": 8e-05,
"loss": 1.6206,
"step": 2595
},
{
"epoch": 0.5709886726053007,
"grad_norm": 0.2928752303123474,
"learning_rate": 8e-05,
"loss": 1.7578,
"step": 2596
},
{
"epoch": 0.5712086220169361,
"grad_norm": 0.28366369009017944,
"learning_rate": 8e-05,
"loss": 1.6367,
"step": 2597
},
{
"epoch": 0.5714285714285714,
"grad_norm": 0.2794798016548157,
"learning_rate": 8e-05,
"loss": 1.7006,
"step": 2598
},
{
"epoch": 0.5716485208402068,
"grad_norm": 0.278814435005188,
"learning_rate": 8e-05,
"loss": 1.6883,
"step": 2599
},
{
"epoch": 0.5718684702518421,
"grad_norm": 0.28789058327674866,
"learning_rate": 8e-05,
"loss": 1.76,
"step": 2600
},
{
"epoch": 0.5720884196634775,
"grad_norm": 0.289120614528656,
"learning_rate": 8e-05,
"loss": 1.5449,
"step": 2601
},
{
"epoch": 0.5723083690751127,
"grad_norm": 0.27491265535354614,
"learning_rate": 8e-05,
"loss": 1.6287,
"step": 2602
},
{
"epoch": 0.572528318486748,
"grad_norm": 0.2837536931037903,
"learning_rate": 8e-05,
"loss": 1.6618,
"step": 2603
},
{
"epoch": 0.5727482678983834,
"grad_norm": 0.27386194467544556,
"learning_rate": 8e-05,
"loss": 1.5815,
"step": 2604
},
{
"epoch": 0.5729682173100187,
"grad_norm": 0.2818918228149414,
"learning_rate": 8e-05,
"loss": 1.511,
"step": 2605
},
{
"epoch": 0.5731881667216541,
"grad_norm": 0.29329514503479004,
"learning_rate": 8e-05,
"loss": 1.5494,
"step": 2606
},
{
"epoch": 0.5734081161332893,
"grad_norm": 0.29942408204078674,
"learning_rate": 8e-05,
"loss": 1.8049,
"step": 2607
},
{
"epoch": 0.5736280655449246,
"grad_norm": 0.30527159571647644,
"learning_rate": 8e-05,
"loss": 1.8735,
"step": 2608
},
{
"epoch": 0.57384801495656,
"grad_norm": 0.2842453122138977,
"learning_rate": 8e-05,
"loss": 1.7124,
"step": 2609
},
{
"epoch": 0.5740679643681953,
"grad_norm": 0.3305295407772064,
"learning_rate": 8e-05,
"loss": 1.6555,
"step": 2610
},
{
"epoch": 0.5742879137798307,
"grad_norm": 0.28134140372276306,
"learning_rate": 8e-05,
"loss": 1.639,
"step": 2611
},
{
"epoch": 0.574507863191466,
"grad_norm": 0.2862444818019867,
"learning_rate": 8e-05,
"loss": 1.7949,
"step": 2612
},
{
"epoch": 0.5747278126031012,
"grad_norm": 0.3089071214199066,
"learning_rate": 8e-05,
"loss": 1.7782,
"step": 2613
},
{
"epoch": 0.5749477620147366,
"grad_norm": 0.3113284111022949,
"learning_rate": 8e-05,
"loss": 1.5588,
"step": 2614
},
{
"epoch": 0.5751677114263719,
"grad_norm": 0.2865052819252014,
"learning_rate": 8e-05,
"loss": 1.6919,
"step": 2615
},
{
"epoch": 0.5753876608380073,
"grad_norm": 0.26997220516204834,
"learning_rate": 8e-05,
"loss": 1.6853,
"step": 2616
},
{
"epoch": 0.5756076102496426,
"grad_norm": 0.3056239187717438,
"learning_rate": 8e-05,
"loss": 1.7971,
"step": 2617
},
{
"epoch": 0.5758275596612779,
"grad_norm": 0.3041035234928131,
"learning_rate": 8e-05,
"loss": 1.7889,
"step": 2618
},
{
"epoch": 0.5760475090729132,
"grad_norm": 0.2829764187335968,
"learning_rate": 8e-05,
"loss": 1.6371,
"step": 2619
},
{
"epoch": 0.5762674584845485,
"grad_norm": 0.29050111770629883,
"learning_rate": 8e-05,
"loss": 1.8909,
"step": 2620
},
{
"epoch": 0.5764874078961839,
"grad_norm": 0.29888811707496643,
"learning_rate": 8e-05,
"loss": 1.6974,
"step": 2621
},
{
"epoch": 0.5767073573078192,
"grad_norm": 0.3193587362766266,
"learning_rate": 8e-05,
"loss": 1.7083,
"step": 2622
},
{
"epoch": 0.5769273067194546,
"grad_norm": 0.2855699360370636,
"learning_rate": 8e-05,
"loss": 1.6106,
"step": 2623
},
{
"epoch": 0.5771472561310899,
"grad_norm": 0.29608815908432007,
"learning_rate": 8e-05,
"loss": 1.7813,
"step": 2624
},
{
"epoch": 0.5773672055427251,
"grad_norm": 0.2846873700618744,
"learning_rate": 8e-05,
"loss": 1.6164,
"step": 2625
},
{
"epoch": 0.5775871549543605,
"grad_norm": 0.3074873983860016,
"learning_rate": 8e-05,
"loss": 1.7278,
"step": 2626
},
{
"epoch": 0.5778071043659958,
"grad_norm": 0.3016159236431122,
"learning_rate": 8e-05,
"loss": 1.6286,
"step": 2627
},
{
"epoch": 0.5780270537776312,
"grad_norm": 0.28926798701286316,
"learning_rate": 8e-05,
"loss": 1.746,
"step": 2628
},
{
"epoch": 0.5782470031892665,
"grad_norm": 0.3222711682319641,
"learning_rate": 8e-05,
"loss": 1.8108,
"step": 2629
},
{
"epoch": 0.5784669526009018,
"grad_norm": 0.30052945017814636,
"learning_rate": 8e-05,
"loss": 1.7374,
"step": 2630
},
{
"epoch": 0.5786869020125371,
"grad_norm": 0.2880706787109375,
"learning_rate": 8e-05,
"loss": 1.627,
"step": 2631
},
{
"epoch": 0.5789068514241724,
"grad_norm": 0.30028629302978516,
"learning_rate": 8e-05,
"loss": 1.8345,
"step": 2632
},
{
"epoch": 0.5791268008358078,
"grad_norm": 0.3164263665676117,
"learning_rate": 8e-05,
"loss": 1.9713,
"step": 2633
},
{
"epoch": 0.5793467502474431,
"grad_norm": 0.294114887714386,
"learning_rate": 8e-05,
"loss": 1.6083,
"step": 2634
},
{
"epoch": 0.5795666996590784,
"grad_norm": 0.31809002161026,
"learning_rate": 8e-05,
"loss": 1.794,
"step": 2635
},
{
"epoch": 0.5797866490707138,
"grad_norm": 0.3005049526691437,
"learning_rate": 8e-05,
"loss": 1.469,
"step": 2636
},
{
"epoch": 0.580006598482349,
"grad_norm": 0.2874310314655304,
"learning_rate": 8e-05,
"loss": 1.7345,
"step": 2637
},
{
"epoch": 0.5802265478939844,
"grad_norm": 0.295523077249527,
"learning_rate": 8e-05,
"loss": 1.6995,
"step": 2638
},
{
"epoch": 0.5804464973056197,
"grad_norm": 0.29120928049087524,
"learning_rate": 8e-05,
"loss": 1.7736,
"step": 2639
},
{
"epoch": 0.580666446717255,
"grad_norm": 0.2916790246963501,
"learning_rate": 8e-05,
"loss": 1.74,
"step": 2640
},
{
"epoch": 0.5808863961288904,
"grad_norm": 0.285230427980423,
"learning_rate": 8e-05,
"loss": 1.7685,
"step": 2641
},
{
"epoch": 0.5811063455405256,
"grad_norm": 0.2743189334869385,
"learning_rate": 8e-05,
"loss": 1.6751,
"step": 2642
},
{
"epoch": 0.581326294952161,
"grad_norm": 0.2997332811355591,
"learning_rate": 8e-05,
"loss": 1.5959,
"step": 2643
},
{
"epoch": 0.5815462443637963,
"grad_norm": 0.28394201397895813,
"learning_rate": 8e-05,
"loss": 1.6288,
"step": 2644
},
{
"epoch": 0.5817661937754317,
"grad_norm": 0.2787470519542694,
"learning_rate": 8e-05,
"loss": 1.7496,
"step": 2645
},
{
"epoch": 0.581986143187067,
"grad_norm": 0.2853599488735199,
"learning_rate": 8e-05,
"loss": 1.6439,
"step": 2646
},
{
"epoch": 0.5822060925987023,
"grad_norm": 0.2939299941062927,
"learning_rate": 8e-05,
"loss": 1.7293,
"step": 2647
},
{
"epoch": 0.5824260420103377,
"grad_norm": 0.27831408381462097,
"learning_rate": 8e-05,
"loss": 1.6748,
"step": 2648
},
{
"epoch": 0.5826459914219729,
"grad_norm": 0.296762615442276,
"learning_rate": 8e-05,
"loss": 1.6735,
"step": 2649
},
{
"epoch": 0.5828659408336083,
"grad_norm": 0.27961719036102295,
"learning_rate": 8e-05,
"loss": 1.6837,
"step": 2650
},
{
"epoch": 0.5830858902452436,
"grad_norm": 0.27915704250335693,
"learning_rate": 8e-05,
"loss": 1.6745,
"step": 2651
},
{
"epoch": 0.5833058396568789,
"grad_norm": 0.273799329996109,
"learning_rate": 8e-05,
"loss": 1.5609,
"step": 2652
},
{
"epoch": 0.5835257890685143,
"grad_norm": 0.287383109331131,
"learning_rate": 8e-05,
"loss": 1.7569,
"step": 2653
},
{
"epoch": 0.5837457384801495,
"grad_norm": 0.27745500206947327,
"learning_rate": 8e-05,
"loss": 1.6093,
"step": 2654
},
{
"epoch": 0.5839656878917849,
"grad_norm": 0.2954557240009308,
"learning_rate": 8e-05,
"loss": 1.788,
"step": 2655
},
{
"epoch": 0.5841856373034202,
"grad_norm": 0.28464850783348083,
"learning_rate": 8e-05,
"loss": 1.7079,
"step": 2656
},
{
"epoch": 0.5844055867150555,
"grad_norm": 0.27475497126579285,
"learning_rate": 8e-05,
"loss": 1.6137,
"step": 2657
},
{
"epoch": 0.5846255361266909,
"grad_norm": 0.27928462624549866,
"learning_rate": 8e-05,
"loss": 1.5776,
"step": 2658
},
{
"epoch": 0.5848454855383262,
"grad_norm": 0.2889251708984375,
"learning_rate": 8e-05,
"loss": 1.7871,
"step": 2659
},
{
"epoch": 0.5850654349499615,
"grad_norm": 0.29489466547966003,
"learning_rate": 8e-05,
"loss": 1.7299,
"step": 2660
},
{
"epoch": 0.5852853843615968,
"grad_norm": 0.27761825919151306,
"learning_rate": 8e-05,
"loss": 1.6772,
"step": 2661
},
{
"epoch": 0.5855053337732321,
"grad_norm": 0.2886674702167511,
"learning_rate": 8e-05,
"loss": 1.6718,
"step": 2662
},
{
"epoch": 0.5857252831848675,
"grad_norm": 0.2736080586910248,
"learning_rate": 8e-05,
"loss": 1.5834,
"step": 2663
},
{
"epoch": 0.5859452325965028,
"grad_norm": 0.29493847489356995,
"learning_rate": 8e-05,
"loss": 1.7123,
"step": 2664
},
{
"epoch": 0.5861651820081382,
"grad_norm": 0.2919282615184784,
"learning_rate": 8e-05,
"loss": 1.7192,
"step": 2665
},
{
"epoch": 0.5863851314197734,
"grad_norm": 0.2883647680282593,
"learning_rate": 8e-05,
"loss": 1.6271,
"step": 2666
},
{
"epoch": 0.5866050808314087,
"grad_norm": 0.2852446734905243,
"learning_rate": 8e-05,
"loss": 1.7295,
"step": 2667
},
{
"epoch": 0.5868250302430441,
"grad_norm": 0.3113778531551361,
"learning_rate": 8e-05,
"loss": 1.6605,
"step": 2668
},
{
"epoch": 0.5870449796546794,
"grad_norm": 0.2629379630088806,
"learning_rate": 8e-05,
"loss": 1.3457,
"step": 2669
},
{
"epoch": 0.5872649290663148,
"grad_norm": 0.28648287057876587,
"learning_rate": 8e-05,
"loss": 1.7137,
"step": 2670
},
{
"epoch": 0.5874848784779501,
"grad_norm": 0.30140426754951477,
"learning_rate": 8e-05,
"loss": 1.7612,
"step": 2671
},
{
"epoch": 0.5877048278895854,
"grad_norm": 0.29059261083602905,
"learning_rate": 8e-05,
"loss": 1.8526,
"step": 2672
},
{
"epoch": 0.5879247773012207,
"grad_norm": 0.2913878560066223,
"learning_rate": 8e-05,
"loss": 1.7214,
"step": 2673
},
{
"epoch": 0.588144726712856,
"grad_norm": 0.3046487271785736,
"learning_rate": 8e-05,
"loss": 1.8342,
"step": 2674
},
{
"epoch": 0.5883646761244914,
"grad_norm": 0.2699670195579529,
"learning_rate": 8e-05,
"loss": 1.6057,
"step": 2675
},
{
"epoch": 0.5885846255361267,
"grad_norm": 0.2722747027873993,
"learning_rate": 8e-05,
"loss": 1.5067,
"step": 2676
},
{
"epoch": 0.5888045749477621,
"grad_norm": 0.27758973836898804,
"learning_rate": 8e-05,
"loss": 1.613,
"step": 2677
},
{
"epoch": 0.5890245243593973,
"grad_norm": 0.30234992504119873,
"learning_rate": 8e-05,
"loss": 1.7266,
"step": 2678
},
{
"epoch": 0.5892444737710326,
"grad_norm": 0.3146234452724457,
"learning_rate": 8e-05,
"loss": 1.6588,
"step": 2679
},
{
"epoch": 0.589464423182668,
"grad_norm": 0.2867683470249176,
"learning_rate": 8e-05,
"loss": 1.6726,
"step": 2680
},
{
"epoch": 0.5896843725943033,
"grad_norm": 0.28295040130615234,
"learning_rate": 8e-05,
"loss": 1.7338,
"step": 2681
},
{
"epoch": 0.5899043220059387,
"grad_norm": 0.28655725717544556,
"learning_rate": 8e-05,
"loss": 1.6791,
"step": 2682
},
{
"epoch": 0.590124271417574,
"grad_norm": 0.2897862493991852,
"learning_rate": 8e-05,
"loss": 1.7127,
"step": 2683
},
{
"epoch": 0.5903442208292092,
"grad_norm": 0.278427392244339,
"learning_rate": 8e-05,
"loss": 1.7166,
"step": 2684
},
{
"epoch": 0.5905641702408446,
"grad_norm": 0.28383758664131165,
"learning_rate": 8e-05,
"loss": 1.8498,
"step": 2685
},
{
"epoch": 0.5907841196524799,
"grad_norm": 0.2690020501613617,
"learning_rate": 8e-05,
"loss": 1.5887,
"step": 2686
},
{
"epoch": 0.5910040690641153,
"grad_norm": 0.2910546362400055,
"learning_rate": 8e-05,
"loss": 1.7525,
"step": 2687
},
{
"epoch": 0.5912240184757506,
"grad_norm": 0.2932651937007904,
"learning_rate": 8e-05,
"loss": 1.6113,
"step": 2688
},
{
"epoch": 0.5914439678873858,
"grad_norm": 0.275622695684433,
"learning_rate": 8e-05,
"loss": 1.6322,
"step": 2689
},
{
"epoch": 0.5916639172990212,
"grad_norm": 0.2838039696216583,
"learning_rate": 8e-05,
"loss": 1.8021,
"step": 2690
},
{
"epoch": 0.5918838667106565,
"grad_norm": 0.290005087852478,
"learning_rate": 8e-05,
"loss": 1.6378,
"step": 2691
},
{
"epoch": 0.5921038161222919,
"grad_norm": 0.2730334401130676,
"learning_rate": 8e-05,
"loss": 1.6665,
"step": 2692
},
{
"epoch": 0.5923237655339272,
"grad_norm": 0.27828192710876465,
"learning_rate": 8e-05,
"loss": 1.6905,
"step": 2693
},
{
"epoch": 0.5925437149455626,
"grad_norm": 0.26481491327285767,
"learning_rate": 8e-05,
"loss": 1.5056,
"step": 2694
},
{
"epoch": 0.5927636643571979,
"grad_norm": 0.2684583365917206,
"learning_rate": 8e-05,
"loss": 1.4181,
"step": 2695
},
{
"epoch": 0.5929836137688331,
"grad_norm": 0.2848527431488037,
"learning_rate": 8e-05,
"loss": 1.6243,
"step": 2696
},
{
"epoch": 0.5932035631804685,
"grad_norm": 0.2943567931652069,
"learning_rate": 8e-05,
"loss": 1.7296,
"step": 2697
},
{
"epoch": 0.5934235125921038,
"grad_norm": 0.2790435552597046,
"learning_rate": 8e-05,
"loss": 1.611,
"step": 2698
},
{
"epoch": 0.5936434620037392,
"grad_norm": 0.3020678460597992,
"learning_rate": 8e-05,
"loss": 1.8619,
"step": 2699
},
{
"epoch": 0.5938634114153745,
"grad_norm": 0.2809624969959259,
"learning_rate": 8e-05,
"loss": 1.7253,
"step": 2700
},
{
"epoch": 0.5940833608270097,
"grad_norm": 0.2655926048755646,
"learning_rate": 8e-05,
"loss": 1.5514,
"step": 2701
},
{
"epoch": 0.5943033102386451,
"grad_norm": 0.28663522005081177,
"learning_rate": 8e-05,
"loss": 1.6708,
"step": 2702
},
{
"epoch": 0.5945232596502804,
"grad_norm": 0.28419819474220276,
"learning_rate": 8e-05,
"loss": 1.6551,
"step": 2703
},
{
"epoch": 0.5947432090619158,
"grad_norm": 0.29084041714668274,
"learning_rate": 8e-05,
"loss": 1.7509,
"step": 2704
},
{
"epoch": 0.5949631584735511,
"grad_norm": 0.27892929315567017,
"learning_rate": 8e-05,
"loss": 1.6507,
"step": 2705
},
{
"epoch": 0.5951831078851864,
"grad_norm": 0.29692748188972473,
"learning_rate": 8e-05,
"loss": 1.8667,
"step": 2706
},
{
"epoch": 0.5954030572968217,
"grad_norm": 0.2867085933685303,
"learning_rate": 8e-05,
"loss": 1.6157,
"step": 2707
},
{
"epoch": 0.595623006708457,
"grad_norm": 0.29867735505104065,
"learning_rate": 8e-05,
"loss": 1.7813,
"step": 2708
},
{
"epoch": 0.5958429561200924,
"grad_norm": 0.28061944246292114,
"learning_rate": 8e-05,
"loss": 1.6157,
"step": 2709
},
{
"epoch": 0.5960629055317277,
"grad_norm": 0.2807196080684662,
"learning_rate": 8e-05,
"loss": 1.5745,
"step": 2710
},
{
"epoch": 0.596282854943363,
"grad_norm": 0.2854728698730469,
"learning_rate": 8e-05,
"loss": 1.78,
"step": 2711
},
{
"epoch": 0.5965028043549984,
"grad_norm": 0.2980540990829468,
"learning_rate": 8e-05,
"loss": 1.8421,
"step": 2712
},
{
"epoch": 0.5967227537666336,
"grad_norm": 0.2892910838127136,
"learning_rate": 8e-05,
"loss": 1.6555,
"step": 2713
},
{
"epoch": 0.596942703178269,
"grad_norm": 0.2773078680038452,
"learning_rate": 8e-05,
"loss": 1.609,
"step": 2714
},
{
"epoch": 0.5971626525899043,
"grad_norm": 0.29283806681632996,
"learning_rate": 8e-05,
"loss": 1.7398,
"step": 2715
},
{
"epoch": 0.5973826020015397,
"grad_norm": 0.2872734069824219,
"learning_rate": 8e-05,
"loss": 1.6864,
"step": 2716
},
{
"epoch": 0.597602551413175,
"grad_norm": 0.26770031452178955,
"learning_rate": 8e-05,
"loss": 1.5877,
"step": 2717
},
{
"epoch": 0.5978225008248103,
"grad_norm": 0.2958748936653137,
"learning_rate": 8e-05,
"loss": 1.6565,
"step": 2718
},
{
"epoch": 0.5980424502364456,
"grad_norm": 0.30203044414520264,
"learning_rate": 8e-05,
"loss": 1.7878,
"step": 2719
},
{
"epoch": 0.5982623996480809,
"grad_norm": 0.29320842027664185,
"learning_rate": 8e-05,
"loss": 1.5706,
"step": 2720
},
{
"epoch": 0.5984823490597163,
"grad_norm": 0.29835638403892517,
"learning_rate": 8e-05,
"loss": 1.7271,
"step": 2721
},
{
"epoch": 0.5987022984713516,
"grad_norm": 0.36251741647720337,
"learning_rate": 8e-05,
"loss": 1.5832,
"step": 2722
},
{
"epoch": 0.5989222478829869,
"grad_norm": 0.28875645995140076,
"learning_rate": 8e-05,
"loss": 1.8148,
"step": 2723
},
{
"epoch": 0.5991421972946223,
"grad_norm": 0.27607399225234985,
"learning_rate": 8e-05,
"loss": 1.6024,
"step": 2724
},
{
"epoch": 0.5993621467062575,
"grad_norm": 0.290351539850235,
"learning_rate": 8e-05,
"loss": 1.7287,
"step": 2725
},
{
"epoch": 0.5995820961178929,
"grad_norm": 0.28432413935661316,
"learning_rate": 8e-05,
"loss": 1.8603,
"step": 2726
},
{
"epoch": 0.5998020455295282,
"grad_norm": 0.2780609130859375,
"learning_rate": 8e-05,
"loss": 1.6839,
"step": 2727
},
{
"epoch": 0.6000219949411635,
"grad_norm": 0.31952062249183655,
"learning_rate": 8e-05,
"loss": 1.585,
"step": 2728
},
{
"epoch": 0.6002419443527989,
"grad_norm": 0.2631243169307709,
"learning_rate": 8e-05,
"loss": 1.6074,
"step": 2729
},
{
"epoch": 0.6004618937644342,
"grad_norm": 0.28518691658973694,
"learning_rate": 8e-05,
"loss": 1.6944,
"step": 2730
},
{
"epoch": 0.6006818431760695,
"grad_norm": 0.29021504521369934,
"learning_rate": 8e-05,
"loss": 1.5919,
"step": 2731
},
{
"epoch": 0.6009017925877048,
"grad_norm": 0.2772546410560608,
"learning_rate": 8e-05,
"loss": 1.6372,
"step": 2732
},
{
"epoch": 0.6011217419993401,
"grad_norm": 0.27938538789749146,
"learning_rate": 8e-05,
"loss": 1.7311,
"step": 2733
},
{
"epoch": 0.6013416914109755,
"grad_norm": 0.2936658561229706,
"learning_rate": 8e-05,
"loss": 1.695,
"step": 2734
},
{
"epoch": 0.6015616408226108,
"grad_norm": 0.2893039286136627,
"learning_rate": 8e-05,
"loss": 1.6837,
"step": 2735
},
{
"epoch": 0.6017815902342462,
"grad_norm": 0.28634974360466003,
"learning_rate": 8e-05,
"loss": 1.6155,
"step": 2736
},
{
"epoch": 0.6020015396458814,
"grad_norm": 0.2868409752845764,
"learning_rate": 8e-05,
"loss": 1.5901,
"step": 2737
},
{
"epoch": 0.6022214890575168,
"grad_norm": 0.28888818621635437,
"learning_rate": 8e-05,
"loss": 1.6951,
"step": 2738
},
{
"epoch": 0.6024414384691521,
"grad_norm": 0.2881872355937958,
"learning_rate": 8e-05,
"loss": 1.7648,
"step": 2739
},
{
"epoch": 0.6026613878807874,
"grad_norm": 0.29601114988327026,
"learning_rate": 8e-05,
"loss": 1.6499,
"step": 2740
},
{
"epoch": 0.6028813372924228,
"grad_norm": 0.28861135244369507,
"learning_rate": 8e-05,
"loss": 1.6896,
"step": 2741
},
{
"epoch": 0.603101286704058,
"grad_norm": 0.30852892994880676,
"learning_rate": 8e-05,
"loss": 1.8539,
"step": 2742
},
{
"epoch": 0.6033212361156934,
"grad_norm": 0.2659029960632324,
"learning_rate": 8e-05,
"loss": 1.5581,
"step": 2743
},
{
"epoch": 0.6035411855273287,
"grad_norm": 0.2938629686832428,
"learning_rate": 8e-05,
"loss": 1.6893,
"step": 2744
},
{
"epoch": 0.603761134938964,
"grad_norm": 0.3215024769306183,
"learning_rate": 8e-05,
"loss": 1.809,
"step": 2745
},
{
"epoch": 0.6039810843505994,
"grad_norm": 0.3320122957229614,
"learning_rate": 8e-05,
"loss": 1.7137,
"step": 2746
},
{
"epoch": 0.6042010337622347,
"grad_norm": 0.2901141047477722,
"learning_rate": 8e-05,
"loss": 1.7675,
"step": 2747
},
{
"epoch": 0.60442098317387,
"grad_norm": 0.2905901074409485,
"learning_rate": 8e-05,
"loss": 1.7454,
"step": 2748
},
{
"epoch": 0.6046409325855053,
"grad_norm": 0.2820628583431244,
"learning_rate": 8e-05,
"loss": 1.7143,
"step": 2749
},
{
"epoch": 0.6048608819971406,
"grad_norm": 0.29754188656806946,
"learning_rate": 8e-05,
"loss": 1.6957,
"step": 2750
},
{
"epoch": 0.605080831408776,
"grad_norm": 0.28644484281539917,
"learning_rate": 8e-05,
"loss": 1.864,
"step": 2751
},
{
"epoch": 0.6053007808204113,
"grad_norm": 0.2816253900527954,
"learning_rate": 8e-05,
"loss": 1.6956,
"step": 2752
},
{
"epoch": 0.6055207302320467,
"grad_norm": 0.27785420417785645,
"learning_rate": 8e-05,
"loss": 1.6618,
"step": 2753
},
{
"epoch": 0.605740679643682,
"grad_norm": 0.2993432283401489,
"learning_rate": 8e-05,
"loss": 1.6782,
"step": 2754
},
{
"epoch": 0.6059606290553172,
"grad_norm": 0.2837073802947998,
"learning_rate": 8e-05,
"loss": 1.6096,
"step": 2755
},
{
"epoch": 0.6061805784669526,
"grad_norm": 0.2930501401424408,
"learning_rate": 8e-05,
"loss": 1.7036,
"step": 2756
},
{
"epoch": 0.6064005278785879,
"grad_norm": 0.2830953299999237,
"learning_rate": 8e-05,
"loss": 1.7393,
"step": 2757
},
{
"epoch": 0.6066204772902233,
"grad_norm": 0.3069010078907013,
"learning_rate": 8e-05,
"loss": 1.5636,
"step": 2758
},
{
"epoch": 0.6068404267018586,
"grad_norm": 0.2761766314506531,
"learning_rate": 8e-05,
"loss": 1.7233,
"step": 2759
},
{
"epoch": 0.607060376113494,
"grad_norm": 0.28254058957099915,
"learning_rate": 8e-05,
"loss": 1.7132,
"step": 2760
},
{
"epoch": 0.6072803255251292,
"grad_norm": 0.27911651134490967,
"learning_rate": 8e-05,
"loss": 1.7782,
"step": 2761
},
{
"epoch": 0.6075002749367645,
"grad_norm": 0.2875358462333679,
"learning_rate": 8e-05,
"loss": 1.6974,
"step": 2762
},
{
"epoch": 0.6077202243483999,
"grad_norm": 0.28940457105636597,
"learning_rate": 8e-05,
"loss": 1.6274,
"step": 2763
},
{
"epoch": 0.6079401737600352,
"grad_norm": 0.27163782715797424,
"learning_rate": 8e-05,
"loss": 1.6507,
"step": 2764
},
{
"epoch": 0.6081601231716706,
"grad_norm": 0.2914412021636963,
"learning_rate": 8e-05,
"loss": 1.6826,
"step": 2765
},
{
"epoch": 0.6083800725833058,
"grad_norm": 0.31414681673049927,
"learning_rate": 8e-05,
"loss": 1.8466,
"step": 2766
},
{
"epoch": 0.6086000219949411,
"grad_norm": 0.3015105426311493,
"learning_rate": 8e-05,
"loss": 1.599,
"step": 2767
},
{
"epoch": 0.6088199714065765,
"grad_norm": 0.27743127942085266,
"learning_rate": 8e-05,
"loss": 1.5278,
"step": 2768
},
{
"epoch": 0.6090399208182118,
"grad_norm": 0.2868049442768097,
"learning_rate": 8e-05,
"loss": 1.7008,
"step": 2769
},
{
"epoch": 0.6092598702298472,
"grad_norm": 0.2832272946834564,
"learning_rate": 8e-05,
"loss": 1.6368,
"step": 2770
},
{
"epoch": 0.6094798196414825,
"grad_norm": 0.28054770827293396,
"learning_rate": 8e-05,
"loss": 1.6475,
"step": 2771
},
{
"epoch": 0.6096997690531177,
"grad_norm": 0.28185421228408813,
"learning_rate": 8e-05,
"loss": 1.7731,
"step": 2772
},
{
"epoch": 0.6099197184647531,
"grad_norm": 0.2819845676422119,
"learning_rate": 8e-05,
"loss": 1.6824,
"step": 2773
},
{
"epoch": 0.6101396678763884,
"grad_norm": 0.2764539420604706,
"learning_rate": 8e-05,
"loss": 1.7001,
"step": 2774
},
{
"epoch": 0.6103596172880238,
"grad_norm": 0.30475977063179016,
"learning_rate": 8e-05,
"loss": 1.8297,
"step": 2775
},
{
"epoch": 0.6105795666996591,
"grad_norm": 0.2848237454891205,
"learning_rate": 8e-05,
"loss": 1.7453,
"step": 2776
},
{
"epoch": 0.6107995161112943,
"grad_norm": 0.28268033266067505,
"learning_rate": 8e-05,
"loss": 1.5241,
"step": 2777
},
{
"epoch": 0.6110194655229297,
"grad_norm": 0.27673062682151794,
"learning_rate": 8e-05,
"loss": 1.5273,
"step": 2778
},
{
"epoch": 0.611239414934565,
"grad_norm": 0.28202882409095764,
"learning_rate": 8e-05,
"loss": 1.5769,
"step": 2779
},
{
"epoch": 0.6114593643462004,
"grad_norm": 0.28480303287506104,
"learning_rate": 8e-05,
"loss": 1.7456,
"step": 2780
},
{
"epoch": 0.6116793137578357,
"grad_norm": 0.3028055727481842,
"learning_rate": 8e-05,
"loss": 1.6652,
"step": 2781
},
{
"epoch": 0.6118992631694711,
"grad_norm": 0.28677237033843994,
"learning_rate": 8e-05,
"loss": 1.6877,
"step": 2782
},
{
"epoch": 0.6121192125811064,
"grad_norm": 0.3057413399219513,
"learning_rate": 8e-05,
"loss": 1.9811,
"step": 2783
},
{
"epoch": 0.6123391619927416,
"grad_norm": 0.2802276313304901,
"learning_rate": 8e-05,
"loss": 1.5965,
"step": 2784
},
{
"epoch": 0.612559111404377,
"grad_norm": 0.27934229373931885,
"learning_rate": 8e-05,
"loss": 1.5959,
"step": 2785
},
{
"epoch": 0.6127790608160123,
"grad_norm": 0.2864493429660797,
"learning_rate": 8e-05,
"loss": 1.6289,
"step": 2786
},
{
"epoch": 0.6129990102276477,
"grad_norm": 0.26668915152549744,
"learning_rate": 8e-05,
"loss": 1.4584,
"step": 2787
},
{
"epoch": 0.613218959639283,
"grad_norm": 0.28092291951179504,
"learning_rate": 8e-05,
"loss": 1.6641,
"step": 2788
},
{
"epoch": 0.6134389090509182,
"grad_norm": 0.2933676242828369,
"learning_rate": 8e-05,
"loss": 1.9453,
"step": 2789
},
{
"epoch": 0.6136588584625536,
"grad_norm": 0.31618431210517883,
"learning_rate": 8e-05,
"loss": 1.594,
"step": 2790
},
{
"epoch": 0.6138788078741889,
"grad_norm": 0.28090760111808777,
"learning_rate": 8e-05,
"loss": 1.7531,
"step": 2791
},
{
"epoch": 0.6140987572858243,
"grad_norm": 0.3137405216693878,
"learning_rate": 8e-05,
"loss": 1.7243,
"step": 2792
},
{
"epoch": 0.6143187066974596,
"grad_norm": 0.2949986755847931,
"learning_rate": 8e-05,
"loss": 1.7581,
"step": 2793
},
{
"epoch": 0.6145386561090949,
"grad_norm": 0.28396037220954895,
"learning_rate": 8e-05,
"loss": 1.6995,
"step": 2794
},
{
"epoch": 0.6147586055207303,
"grad_norm": 0.26976051926612854,
"learning_rate": 8e-05,
"loss": 1.654,
"step": 2795
},
{
"epoch": 0.6149785549323655,
"grad_norm": 0.27323633432388306,
"learning_rate": 8e-05,
"loss": 1.6944,
"step": 2796
},
{
"epoch": 0.6151985043440009,
"grad_norm": 0.29849350452423096,
"learning_rate": 8e-05,
"loss": 1.6127,
"step": 2797
},
{
"epoch": 0.6154184537556362,
"grad_norm": 0.28575918078422546,
"learning_rate": 8e-05,
"loss": 1.7579,
"step": 2798
},
{
"epoch": 0.6156384031672715,
"grad_norm": 0.26723456382751465,
"learning_rate": 8e-05,
"loss": 1.5461,
"step": 2799
},
{
"epoch": 0.6158583525789069,
"grad_norm": 0.29076528549194336,
"learning_rate": 8e-05,
"loss": 1.7001,
"step": 2800
},
{
"epoch": 0.6160783019905421,
"grad_norm": 0.27913492918014526,
"learning_rate": 8e-05,
"loss": 1.3878,
"step": 2801
},
{
"epoch": 0.6162982514021775,
"grad_norm": 0.2841816246509552,
"learning_rate": 8e-05,
"loss": 1.6871,
"step": 2802
},
{
"epoch": 0.6165182008138128,
"grad_norm": 0.26845458149909973,
"learning_rate": 8e-05,
"loss": 1.5518,
"step": 2803
},
{
"epoch": 0.6167381502254482,
"grad_norm": 0.30308809876441956,
"learning_rate": 8e-05,
"loss": 1.7735,
"step": 2804
},
{
"epoch": 0.6169580996370835,
"grad_norm": 0.2812938690185547,
"learning_rate": 8e-05,
"loss": 1.7043,
"step": 2805
},
{
"epoch": 0.6171780490487188,
"grad_norm": 0.27101054787635803,
"learning_rate": 8e-05,
"loss": 1.6156,
"step": 2806
},
{
"epoch": 0.6173979984603541,
"grad_norm": 0.2900649607181549,
"learning_rate": 8e-05,
"loss": 1.7119,
"step": 2807
},
{
"epoch": 0.6176179478719894,
"grad_norm": 0.3011523187160492,
"learning_rate": 8e-05,
"loss": 1.7085,
"step": 2808
},
{
"epoch": 0.6178378972836248,
"grad_norm": 0.2845047116279602,
"learning_rate": 8e-05,
"loss": 1.6691,
"step": 2809
},
{
"epoch": 0.6180578466952601,
"grad_norm": 0.31060662865638733,
"learning_rate": 8e-05,
"loss": 1.831,
"step": 2810
},
{
"epoch": 0.6182777961068954,
"grad_norm": 0.27987706661224365,
"learning_rate": 8e-05,
"loss": 1.4881,
"step": 2811
},
{
"epoch": 0.6184977455185308,
"grad_norm": 0.3197080194950104,
"learning_rate": 8e-05,
"loss": 1.7112,
"step": 2812
},
{
"epoch": 0.618717694930166,
"grad_norm": 0.31402066349983215,
"learning_rate": 8e-05,
"loss": 1.6535,
"step": 2813
},
{
"epoch": 0.6189376443418014,
"grad_norm": 0.303529292345047,
"learning_rate": 8e-05,
"loss": 1.6563,
"step": 2814
},
{
"epoch": 0.6191575937534367,
"grad_norm": 0.26674556732177734,
"learning_rate": 8e-05,
"loss": 1.5202,
"step": 2815
},
{
"epoch": 0.619377543165072,
"grad_norm": 0.30466997623443604,
"learning_rate": 8e-05,
"loss": 1.6014,
"step": 2816
},
{
"epoch": 0.6195974925767074,
"grad_norm": 0.2991195619106293,
"learning_rate": 8e-05,
"loss": 1.765,
"step": 2817
},
{
"epoch": 0.6198174419883427,
"grad_norm": 0.30000337958335876,
"learning_rate": 8e-05,
"loss": 1.7794,
"step": 2818
},
{
"epoch": 0.620037391399978,
"grad_norm": 0.29237842559814453,
"learning_rate": 8e-05,
"loss": 1.8753,
"step": 2819
},
{
"epoch": 0.6202573408116133,
"grad_norm": 0.2896344065666199,
"learning_rate": 8e-05,
"loss": 1.6137,
"step": 2820
},
{
"epoch": 0.6204772902232486,
"grad_norm": 0.34269601106643677,
"learning_rate": 8e-05,
"loss": 1.7693,
"step": 2821
},
{
"epoch": 0.620697239634884,
"grad_norm": 0.30044153332710266,
"learning_rate": 8e-05,
"loss": 1.7286,
"step": 2822
},
{
"epoch": 0.6209171890465193,
"grad_norm": 0.2616185247898102,
"learning_rate": 8e-05,
"loss": 1.5373,
"step": 2823
},
{
"epoch": 0.6211371384581547,
"grad_norm": 0.3217238485813141,
"learning_rate": 8e-05,
"loss": 1.7681,
"step": 2824
},
{
"epoch": 0.6213570878697899,
"grad_norm": 0.284446120262146,
"learning_rate": 8e-05,
"loss": 1.5597,
"step": 2825
},
{
"epoch": 0.6215770372814253,
"grad_norm": 0.28698036074638367,
"learning_rate": 8e-05,
"loss": 1.6844,
"step": 2826
},
{
"epoch": 0.6217969866930606,
"grad_norm": 0.2828524708747864,
"learning_rate": 8e-05,
"loss": 1.7142,
"step": 2827
},
{
"epoch": 0.6220169361046959,
"grad_norm": 0.3004125952720642,
"learning_rate": 8e-05,
"loss": 1.6812,
"step": 2828
},
{
"epoch": 0.6222368855163313,
"grad_norm": 0.30438825488090515,
"learning_rate": 8e-05,
"loss": 1.6696,
"step": 2829
},
{
"epoch": 0.6224568349279666,
"grad_norm": 0.2654431164264679,
"learning_rate": 8e-05,
"loss": 1.5656,
"step": 2830
},
{
"epoch": 0.6226767843396019,
"grad_norm": 0.28561410307884216,
"learning_rate": 8e-05,
"loss": 1.5924,
"step": 2831
},
{
"epoch": 0.6228967337512372,
"grad_norm": 0.29075953364372253,
"learning_rate": 8e-05,
"loss": 1.6026,
"step": 2832
},
{
"epoch": 0.6231166831628725,
"grad_norm": 0.3002355098724365,
"learning_rate": 8e-05,
"loss": 1.7783,
"step": 2833
},
{
"epoch": 0.6233366325745079,
"grad_norm": 0.2757151424884796,
"learning_rate": 8e-05,
"loss": 1.6404,
"step": 2834
},
{
"epoch": 0.6235565819861432,
"grad_norm": 0.28108781576156616,
"learning_rate": 8e-05,
"loss": 1.6727,
"step": 2835
},
{
"epoch": 0.6237765313977786,
"grad_norm": 0.31818297505378723,
"learning_rate": 8e-05,
"loss": 1.8859,
"step": 2836
},
{
"epoch": 0.6239964808094138,
"grad_norm": 0.30283015966415405,
"learning_rate": 8e-05,
"loss": 1.6967,
"step": 2837
},
{
"epoch": 0.6242164302210491,
"grad_norm": 0.28991082310676575,
"learning_rate": 8e-05,
"loss": 1.7564,
"step": 2838
},
{
"epoch": 0.6244363796326845,
"grad_norm": 0.27985113859176636,
"learning_rate": 8e-05,
"loss": 1.645,
"step": 2839
},
{
"epoch": 0.6246563290443198,
"grad_norm": 0.28599318861961365,
"learning_rate": 8e-05,
"loss": 1.7069,
"step": 2840
},
{
"epoch": 0.6248762784559552,
"grad_norm": 0.291962593793869,
"learning_rate": 8e-05,
"loss": 1.7303,
"step": 2841
},
{
"epoch": 0.6250962278675904,
"grad_norm": 0.2977605164051056,
"learning_rate": 8e-05,
"loss": 1.6251,
"step": 2842
},
{
"epoch": 0.6253161772792257,
"grad_norm": 0.280979186296463,
"learning_rate": 8e-05,
"loss": 1.6132,
"step": 2843
},
{
"epoch": 0.6255361266908611,
"grad_norm": 0.30565154552459717,
"learning_rate": 8e-05,
"loss": 1.8351,
"step": 2844
},
{
"epoch": 0.6257560761024964,
"grad_norm": 0.2870398759841919,
"learning_rate": 8e-05,
"loss": 1.7169,
"step": 2845
},
{
"epoch": 0.6259760255141318,
"grad_norm": 0.2740568518638611,
"learning_rate": 8e-05,
"loss": 1.6666,
"step": 2846
},
{
"epoch": 0.6261959749257671,
"grad_norm": 0.27255693078041077,
"learning_rate": 8e-05,
"loss": 1.6112,
"step": 2847
},
{
"epoch": 0.6264159243374025,
"grad_norm": 0.2785317003726959,
"learning_rate": 8e-05,
"loss": 1.6532,
"step": 2848
},
{
"epoch": 0.6266358737490377,
"grad_norm": 0.2979902923107147,
"learning_rate": 8e-05,
"loss": 1.7981,
"step": 2849
},
{
"epoch": 0.626855823160673,
"grad_norm": 0.29625701904296875,
"learning_rate": 8e-05,
"loss": 1.7212,
"step": 2850
},
{
"epoch": 0.6270757725723084,
"grad_norm": 0.2768239676952362,
"learning_rate": 8e-05,
"loss": 1.6358,
"step": 2851
},
{
"epoch": 0.6272957219839437,
"grad_norm": 0.2931036055088043,
"learning_rate": 8e-05,
"loss": 1.7728,
"step": 2852
},
{
"epoch": 0.6275156713955791,
"grad_norm": 0.2883271872997284,
"learning_rate": 8e-05,
"loss": 1.6403,
"step": 2853
},
{
"epoch": 0.6277356208072143,
"grad_norm": 0.31137219071388245,
"learning_rate": 8e-05,
"loss": 1.6454,
"step": 2854
},
{
"epoch": 0.6279555702188496,
"grad_norm": 0.3026840388774872,
"learning_rate": 8e-05,
"loss": 1.6534,
"step": 2855
},
{
"epoch": 0.628175519630485,
"grad_norm": 0.2950657606124878,
"learning_rate": 8e-05,
"loss": 1.7298,
"step": 2856
},
{
"epoch": 0.6283954690421203,
"grad_norm": 0.29347553849220276,
"learning_rate": 8e-05,
"loss": 1.58,
"step": 2857
},
{
"epoch": 0.6286154184537557,
"grad_norm": 0.28075262904167175,
"learning_rate": 8e-05,
"loss": 1.6262,
"step": 2858
},
{
"epoch": 0.628835367865391,
"grad_norm": 0.26556506752967834,
"learning_rate": 8e-05,
"loss": 1.5666,
"step": 2859
},
{
"epoch": 0.6290553172770262,
"grad_norm": 0.28918468952178955,
"learning_rate": 8e-05,
"loss": 1.6918,
"step": 2860
},
{
"epoch": 0.6292752666886616,
"grad_norm": 0.2816839814186096,
"learning_rate": 8e-05,
"loss": 1.6387,
"step": 2861
},
{
"epoch": 0.6294952161002969,
"grad_norm": 0.2819633185863495,
"learning_rate": 8e-05,
"loss": 1.6945,
"step": 2862
},
{
"epoch": 0.6297151655119323,
"grad_norm": 0.2847195863723755,
"learning_rate": 8e-05,
"loss": 1.744,
"step": 2863
},
{
"epoch": 0.6299351149235676,
"grad_norm": 0.2706061899662018,
"learning_rate": 8e-05,
"loss": 1.6246,
"step": 2864
},
{
"epoch": 0.6301550643352029,
"grad_norm": 0.281125545501709,
"learning_rate": 8e-05,
"loss": 1.49,
"step": 2865
},
{
"epoch": 0.6303750137468382,
"grad_norm": 0.2861780822277069,
"learning_rate": 8e-05,
"loss": 1.7375,
"step": 2866
},
{
"epoch": 0.6305949631584735,
"grad_norm": 0.2654918134212494,
"learning_rate": 8e-05,
"loss": 1.5997,
"step": 2867
},
{
"epoch": 0.6308149125701089,
"grad_norm": 0.29169219732284546,
"learning_rate": 8e-05,
"loss": 1.8921,
"step": 2868
},
{
"epoch": 0.6310348619817442,
"grad_norm": 0.2858426570892334,
"learning_rate": 8e-05,
"loss": 1.8006,
"step": 2869
},
{
"epoch": 0.6312548113933796,
"grad_norm": 0.2712969183921814,
"learning_rate": 8e-05,
"loss": 1.5761,
"step": 2870
},
{
"epoch": 0.6314747608050149,
"grad_norm": 0.28961536288261414,
"learning_rate": 8e-05,
"loss": 1.618,
"step": 2871
},
{
"epoch": 0.6316947102166501,
"grad_norm": 0.2879860997200012,
"learning_rate": 8e-05,
"loss": 1.6116,
"step": 2872
},
{
"epoch": 0.6319146596282855,
"grad_norm": 0.3009500801563263,
"learning_rate": 8e-05,
"loss": 1.8099,
"step": 2873
},
{
"epoch": 0.6321346090399208,
"grad_norm": 0.3012961149215698,
"learning_rate": 8e-05,
"loss": 1.8123,
"step": 2874
},
{
"epoch": 0.6323545584515562,
"grad_norm": 0.27382341027259827,
"learning_rate": 8e-05,
"loss": 1.5486,
"step": 2875
},
{
"epoch": 0.6325745078631915,
"grad_norm": 0.45538756251335144,
"learning_rate": 8e-05,
"loss": 1.7382,
"step": 2876
},
{
"epoch": 0.6327944572748267,
"grad_norm": 0.27454543113708496,
"learning_rate": 8e-05,
"loss": 1.6488,
"step": 2877
},
{
"epoch": 0.6330144066864621,
"grad_norm": 0.28111204504966736,
"learning_rate": 8e-05,
"loss": 1.7262,
"step": 2878
},
{
"epoch": 0.6332343560980974,
"grad_norm": 0.2855817675590515,
"learning_rate": 8e-05,
"loss": 1.6841,
"step": 2879
},
{
"epoch": 0.6334543055097328,
"grad_norm": 0.3017145097255707,
"learning_rate": 8e-05,
"loss": 1.759,
"step": 2880
},
{
"epoch": 0.6336742549213681,
"grad_norm": 0.27578651905059814,
"learning_rate": 8e-05,
"loss": 1.5341,
"step": 2881
},
{
"epoch": 0.6338942043330034,
"grad_norm": 0.28522011637687683,
"learning_rate": 8e-05,
"loss": 1.5042,
"step": 2882
},
{
"epoch": 0.6341141537446388,
"grad_norm": 0.28013676404953003,
"learning_rate": 8e-05,
"loss": 1.591,
"step": 2883
},
{
"epoch": 0.634334103156274,
"grad_norm": 0.30440640449523926,
"learning_rate": 8e-05,
"loss": 1.6698,
"step": 2884
},
{
"epoch": 0.6345540525679094,
"grad_norm": 0.28555527329444885,
"learning_rate": 8e-05,
"loss": 1.6199,
"step": 2885
},
{
"epoch": 0.6347740019795447,
"grad_norm": 0.31451916694641113,
"learning_rate": 8e-05,
"loss": 1.6717,
"step": 2886
},
{
"epoch": 0.63499395139118,
"grad_norm": 0.3116842806339264,
"learning_rate": 8e-05,
"loss": 1.7043,
"step": 2887
},
{
"epoch": 0.6352139008028154,
"grad_norm": 0.30441299080848694,
"learning_rate": 8e-05,
"loss": 1.6953,
"step": 2888
},
{
"epoch": 0.6354338502144506,
"grad_norm": 0.2890806496143341,
"learning_rate": 8e-05,
"loss": 1.7363,
"step": 2889
},
{
"epoch": 0.635653799626086,
"grad_norm": 0.2715187072753906,
"learning_rate": 8e-05,
"loss": 1.5412,
"step": 2890
},
{
"epoch": 0.6358737490377213,
"grad_norm": 0.32213905453681946,
"learning_rate": 8e-05,
"loss": 1.7735,
"step": 2891
},
{
"epoch": 0.6360936984493566,
"grad_norm": 0.28850191831588745,
"learning_rate": 8e-05,
"loss": 1.5156,
"step": 2892
},
{
"epoch": 0.636313647860992,
"grad_norm": 0.2934744358062744,
"learning_rate": 8e-05,
"loss": 1.6822,
"step": 2893
},
{
"epoch": 0.6365335972726273,
"grad_norm": 0.29068851470947266,
"learning_rate": 8e-05,
"loss": 1.7723,
"step": 2894
},
{
"epoch": 0.6367535466842627,
"grad_norm": 0.28490251302719116,
"learning_rate": 8e-05,
"loss": 1.7666,
"step": 2895
},
{
"epoch": 0.6369734960958979,
"grad_norm": 0.28677448630332947,
"learning_rate": 8e-05,
"loss": 1.6792,
"step": 2896
},
{
"epoch": 0.6371934455075333,
"grad_norm": 0.29424387216567993,
"learning_rate": 8e-05,
"loss": 1.6224,
"step": 2897
},
{
"epoch": 0.6374133949191686,
"grad_norm": 0.2872456908226013,
"learning_rate": 8e-05,
"loss": 1.6677,
"step": 2898
},
{
"epoch": 0.6376333443308039,
"grad_norm": 0.26886799931526184,
"learning_rate": 8e-05,
"loss": 1.5086,
"step": 2899
},
{
"epoch": 0.6378532937424393,
"grad_norm": 0.2737233638763428,
"learning_rate": 8e-05,
"loss": 1.6832,
"step": 2900
},
{
"epoch": 0.6380732431540745,
"grad_norm": 0.2912994623184204,
"learning_rate": 8e-05,
"loss": 1.5945,
"step": 2901
},
{
"epoch": 0.6382931925657099,
"grad_norm": 0.2800372540950775,
"learning_rate": 8e-05,
"loss": 1.5661,
"step": 2902
},
{
"epoch": 0.6385131419773452,
"grad_norm": 0.3248150050640106,
"learning_rate": 8e-05,
"loss": 1.6098,
"step": 2903
},
{
"epoch": 0.6387330913889805,
"grad_norm": 0.27953609824180603,
"learning_rate": 8e-05,
"loss": 1.7255,
"step": 2904
},
{
"epoch": 0.6389530408006159,
"grad_norm": 0.276395708322525,
"learning_rate": 8e-05,
"loss": 1.6782,
"step": 2905
},
{
"epoch": 0.6391729902122512,
"grad_norm": 0.2754693031311035,
"learning_rate": 8e-05,
"loss": 1.5135,
"step": 2906
},
{
"epoch": 0.6393929396238865,
"grad_norm": 0.2756873369216919,
"learning_rate": 8e-05,
"loss": 1.6208,
"step": 2907
},
{
"epoch": 0.6396128890355218,
"grad_norm": 0.3032161593437195,
"learning_rate": 8e-05,
"loss": 1.7675,
"step": 2908
},
{
"epoch": 0.6398328384471571,
"grad_norm": 0.2915925085544586,
"learning_rate": 8e-05,
"loss": 1.6899,
"step": 2909
},
{
"epoch": 0.6400527878587925,
"grad_norm": 0.28415006399154663,
"learning_rate": 8e-05,
"loss": 1.7218,
"step": 2910
},
{
"epoch": 0.6402727372704278,
"grad_norm": 0.3023785352706909,
"learning_rate": 8e-05,
"loss": 1.7959,
"step": 2911
},
{
"epoch": 0.6404926866820632,
"grad_norm": 0.2656283974647522,
"learning_rate": 8e-05,
"loss": 1.6287,
"step": 2912
},
{
"epoch": 0.6407126360936984,
"grad_norm": 0.2835081219673157,
"learning_rate": 8e-05,
"loss": 1.6597,
"step": 2913
},
{
"epoch": 0.6409325855053337,
"grad_norm": 0.2756771743297577,
"learning_rate": 8e-05,
"loss": 1.625,
"step": 2914
},
{
"epoch": 0.6411525349169691,
"grad_norm": 0.283149778842926,
"learning_rate": 8e-05,
"loss": 1.662,
"step": 2915
},
{
"epoch": 0.6413724843286044,
"grad_norm": 0.28902921080589294,
"learning_rate": 8e-05,
"loss": 1.4862,
"step": 2916
},
{
"epoch": 0.6415924337402398,
"grad_norm": 0.28932076692581177,
"learning_rate": 8e-05,
"loss": 1.6109,
"step": 2917
},
{
"epoch": 0.641812383151875,
"grad_norm": 0.30964934825897217,
"learning_rate": 8e-05,
"loss": 1.7021,
"step": 2918
},
{
"epoch": 0.6420323325635104,
"grad_norm": 0.28258854150772095,
"learning_rate": 8e-05,
"loss": 1.6536,
"step": 2919
},
{
"epoch": 0.6422522819751457,
"grad_norm": 0.2939313054084778,
"learning_rate": 8e-05,
"loss": 1.6299,
"step": 2920
},
{
"epoch": 0.642472231386781,
"grad_norm": 0.31722190976142883,
"learning_rate": 8e-05,
"loss": 1.6789,
"step": 2921
},
{
"epoch": 0.6426921807984164,
"grad_norm": 0.29024428129196167,
"learning_rate": 8e-05,
"loss": 1.7429,
"step": 2922
},
{
"epoch": 0.6429121302100517,
"grad_norm": 0.2716485559940338,
"learning_rate": 8e-05,
"loss": 1.619,
"step": 2923
},
{
"epoch": 0.6431320796216871,
"grad_norm": 0.2986311614513397,
"learning_rate": 8e-05,
"loss": 1.7574,
"step": 2924
},
{
"epoch": 0.6433520290333223,
"grad_norm": 0.29542550444602966,
"learning_rate": 8e-05,
"loss": 1.7322,
"step": 2925
},
{
"epoch": 0.6435719784449576,
"grad_norm": 0.27215078473091125,
"learning_rate": 8e-05,
"loss": 1.5544,
"step": 2926
},
{
"epoch": 0.643791927856593,
"grad_norm": 0.29105404019355774,
"learning_rate": 8e-05,
"loss": 1.7499,
"step": 2927
},
{
"epoch": 0.6440118772682283,
"grad_norm": 0.2990782558917999,
"learning_rate": 8e-05,
"loss": 1.7987,
"step": 2928
},
{
"epoch": 0.6442318266798637,
"grad_norm": 0.27296003699302673,
"learning_rate": 8e-05,
"loss": 1.6271,
"step": 2929
},
{
"epoch": 0.644451776091499,
"grad_norm": 0.27194517850875854,
"learning_rate": 8e-05,
"loss": 1.5178,
"step": 2930
},
{
"epoch": 0.6446717255031342,
"grad_norm": 0.2720150053501129,
"learning_rate": 8e-05,
"loss": 1.5665,
"step": 2931
},
{
"epoch": 0.6448916749147696,
"grad_norm": 0.2825513780117035,
"learning_rate": 8e-05,
"loss": 1.6276,
"step": 2932
},
{
"epoch": 0.6451116243264049,
"grad_norm": 0.2869420349597931,
"learning_rate": 8e-05,
"loss": 1.5306,
"step": 2933
},
{
"epoch": 0.6453315737380403,
"grad_norm": 0.2829979360103607,
"learning_rate": 8e-05,
"loss": 1.7132,
"step": 2934
},
{
"epoch": 0.6455515231496756,
"grad_norm": 0.28047260642051697,
"learning_rate": 8e-05,
"loss": 1.5932,
"step": 2935
},
{
"epoch": 0.6457714725613108,
"grad_norm": 0.287765234708786,
"learning_rate": 8e-05,
"loss": 1.6029,
"step": 2936
},
{
"epoch": 0.6459914219729462,
"grad_norm": 0.2858487665653229,
"learning_rate": 8e-05,
"loss": 1.5959,
"step": 2937
},
{
"epoch": 0.6462113713845815,
"grad_norm": 0.29041311144828796,
"learning_rate": 8e-05,
"loss": 1.6348,
"step": 2938
},
{
"epoch": 0.6464313207962169,
"grad_norm": 0.2873425781726837,
"learning_rate": 8e-05,
"loss": 1.6514,
"step": 2939
},
{
"epoch": 0.6466512702078522,
"grad_norm": 0.2767978310585022,
"learning_rate": 8e-05,
"loss": 1.5471,
"step": 2940
},
{
"epoch": 0.6468712196194876,
"grad_norm": 0.27061501145362854,
"learning_rate": 8e-05,
"loss": 1.5439,
"step": 2941
},
{
"epoch": 0.6470911690311228,
"grad_norm": 0.2724677324295044,
"learning_rate": 8e-05,
"loss": 1.6808,
"step": 2942
},
{
"epoch": 0.6473111184427581,
"grad_norm": 0.2804121971130371,
"learning_rate": 8e-05,
"loss": 1.7923,
"step": 2943
},
{
"epoch": 0.6475310678543935,
"grad_norm": 0.2881599962711334,
"learning_rate": 8e-05,
"loss": 1.745,
"step": 2944
},
{
"epoch": 0.6477510172660288,
"grad_norm": 0.3064921200275421,
"learning_rate": 8e-05,
"loss": 1.7387,
"step": 2945
},
{
"epoch": 0.6479709666776642,
"grad_norm": 0.2994825541973114,
"learning_rate": 8e-05,
"loss": 1.7143,
"step": 2946
},
{
"epoch": 0.6481909160892995,
"grad_norm": 0.29315468668937683,
"learning_rate": 8e-05,
"loss": 1.7712,
"step": 2947
},
{
"epoch": 0.6484108655009347,
"grad_norm": 0.2923111915588379,
"learning_rate": 8e-05,
"loss": 1.6861,
"step": 2948
},
{
"epoch": 0.6486308149125701,
"grad_norm": 0.2861957848072052,
"learning_rate": 8e-05,
"loss": 1.5951,
"step": 2949
},
{
"epoch": 0.6488507643242054,
"grad_norm": 0.2978787422180176,
"learning_rate": 8e-05,
"loss": 1.6617,
"step": 2950
},
{
"epoch": 0.6490707137358408,
"grad_norm": 0.28596314787864685,
"learning_rate": 8e-05,
"loss": 1.7623,
"step": 2951
},
{
"epoch": 0.6492906631474761,
"grad_norm": 0.29754844307899475,
"learning_rate": 8e-05,
"loss": 1.6497,
"step": 2952
},
{
"epoch": 0.6495106125591114,
"grad_norm": 0.2931132912635803,
"learning_rate": 8e-05,
"loss": 1.5972,
"step": 2953
},
{
"epoch": 0.6497305619707467,
"grad_norm": 0.2667228877544403,
"learning_rate": 8e-05,
"loss": 1.5598,
"step": 2954
},
{
"epoch": 0.649950511382382,
"grad_norm": 0.2866271436214447,
"learning_rate": 8e-05,
"loss": 1.5908,
"step": 2955
},
{
"epoch": 0.6501704607940174,
"grad_norm": 0.28429698944091797,
"learning_rate": 8e-05,
"loss": 1.6582,
"step": 2956
},
{
"epoch": 0.6503904102056527,
"grad_norm": 0.27636975049972534,
"learning_rate": 8e-05,
"loss": 1.6778,
"step": 2957
},
{
"epoch": 0.650610359617288,
"grad_norm": 0.3005516529083252,
"learning_rate": 8e-05,
"loss": 1.7191,
"step": 2958
},
{
"epoch": 0.6508303090289234,
"grad_norm": 0.28478094935417175,
"learning_rate": 8e-05,
"loss": 1.5651,
"step": 2959
},
{
"epoch": 0.6510502584405586,
"grad_norm": 0.2879832088947296,
"learning_rate": 8e-05,
"loss": 1.7055,
"step": 2960
},
{
"epoch": 0.651270207852194,
"grad_norm": 0.2899249196052551,
"learning_rate": 8e-05,
"loss": 1.6682,
"step": 2961
},
{
"epoch": 0.6514901572638293,
"grad_norm": 0.26806798577308655,
"learning_rate": 8e-05,
"loss": 1.6962,
"step": 2962
},
{
"epoch": 0.6517101066754647,
"grad_norm": 0.2929481565952301,
"learning_rate": 8e-05,
"loss": 1.6388,
"step": 2963
},
{
"epoch": 0.6519300560871,
"grad_norm": 0.2920469641685486,
"learning_rate": 8e-05,
"loss": 1.8584,
"step": 2964
},
{
"epoch": 0.6521500054987353,
"grad_norm": 0.285696417093277,
"learning_rate": 8e-05,
"loss": 1.6491,
"step": 2965
},
{
"epoch": 0.6523699549103706,
"grad_norm": 0.2991807162761688,
"learning_rate": 8e-05,
"loss": 1.6565,
"step": 2966
},
{
"epoch": 0.6525899043220059,
"grad_norm": 0.27987217903137207,
"learning_rate": 8e-05,
"loss": 1.6274,
"step": 2967
},
{
"epoch": 0.6528098537336413,
"grad_norm": 0.2810576856136322,
"learning_rate": 8e-05,
"loss": 1.5294,
"step": 2968
},
{
"epoch": 0.6530298031452766,
"grad_norm": 0.2755715847015381,
"learning_rate": 8e-05,
"loss": 1.6319,
"step": 2969
},
{
"epoch": 0.6532497525569119,
"grad_norm": 0.3041331171989441,
"learning_rate": 8e-05,
"loss": 1.5308,
"step": 2970
},
{
"epoch": 0.6534697019685473,
"grad_norm": 0.2858032286167145,
"learning_rate": 8e-05,
"loss": 1.6557,
"step": 2971
},
{
"epoch": 0.6536896513801825,
"grad_norm": 0.3001968562602997,
"learning_rate": 8e-05,
"loss": 1.6841,
"step": 2972
},
{
"epoch": 0.6539096007918179,
"grad_norm": 0.29567384719848633,
"learning_rate": 8e-05,
"loss": 1.6329,
"step": 2973
},
{
"epoch": 0.6541295502034532,
"grad_norm": 0.29874905943870544,
"learning_rate": 8e-05,
"loss": 1.7178,
"step": 2974
},
{
"epoch": 0.6543494996150885,
"grad_norm": 0.34721627831459045,
"learning_rate": 8e-05,
"loss": 1.8192,
"step": 2975
},
{
"epoch": 0.6545694490267239,
"grad_norm": 0.2965874671936035,
"learning_rate": 8e-05,
"loss": 1.8156,
"step": 2976
},
{
"epoch": 0.6547893984383591,
"grad_norm": 0.2710880637168884,
"learning_rate": 8e-05,
"loss": 1.5101,
"step": 2977
},
{
"epoch": 0.6550093478499945,
"grad_norm": 0.2852049171924591,
"learning_rate": 8e-05,
"loss": 1.5835,
"step": 2978
},
{
"epoch": 0.6552292972616298,
"grad_norm": 0.2898092567920685,
"learning_rate": 8e-05,
"loss": 1.599,
"step": 2979
},
{
"epoch": 0.6554492466732651,
"grad_norm": 0.2880117893218994,
"learning_rate": 8e-05,
"loss": 1.5904,
"step": 2980
},
{
"epoch": 0.6556691960849005,
"grad_norm": 0.2850951850414276,
"learning_rate": 8e-05,
"loss": 1.5551,
"step": 2981
},
{
"epoch": 0.6558891454965358,
"grad_norm": 0.292883038520813,
"learning_rate": 8e-05,
"loss": 1.5587,
"step": 2982
},
{
"epoch": 0.6561090949081712,
"grad_norm": 0.3050660490989685,
"learning_rate": 8e-05,
"loss": 1.7904,
"step": 2983
},
{
"epoch": 0.6563290443198064,
"grad_norm": 0.31059566140174866,
"learning_rate": 8e-05,
"loss": 1.7716,
"step": 2984
},
{
"epoch": 0.6565489937314418,
"grad_norm": 0.33118265867233276,
"learning_rate": 8e-05,
"loss": 1.7878,
"step": 2985
},
{
"epoch": 0.6567689431430771,
"grad_norm": 0.27835318446159363,
"learning_rate": 8e-05,
"loss": 1.7069,
"step": 2986
},
{
"epoch": 0.6569888925547124,
"grad_norm": 0.33706921339035034,
"learning_rate": 8e-05,
"loss": 1.7135,
"step": 2987
},
{
"epoch": 0.6572088419663478,
"grad_norm": 0.3115323781967163,
"learning_rate": 8e-05,
"loss": 1.7396,
"step": 2988
},
{
"epoch": 0.657428791377983,
"grad_norm": 0.2827862501144409,
"learning_rate": 8e-05,
"loss": 1.6889,
"step": 2989
},
{
"epoch": 0.6576487407896184,
"grad_norm": 0.29057440161705017,
"learning_rate": 8e-05,
"loss": 1.7208,
"step": 2990
},
{
"epoch": 0.6578686902012537,
"grad_norm": 0.2977316081523895,
"learning_rate": 8e-05,
"loss": 1.7192,
"step": 2991
},
{
"epoch": 0.658088639612889,
"grad_norm": 0.296475887298584,
"learning_rate": 8e-05,
"loss": 1.6815,
"step": 2992
},
{
"epoch": 0.6583085890245244,
"grad_norm": 0.29579752683639526,
"learning_rate": 8e-05,
"loss": 1.781,
"step": 2993
},
{
"epoch": 0.6585285384361597,
"grad_norm": 0.2853552997112274,
"learning_rate": 8e-05,
"loss": 1.69,
"step": 2994
},
{
"epoch": 0.658748487847795,
"grad_norm": 0.2831558883190155,
"learning_rate": 8e-05,
"loss": 1.7118,
"step": 2995
},
{
"epoch": 0.6589684372594303,
"grad_norm": 0.30975469946861267,
"learning_rate": 8e-05,
"loss": 1.7153,
"step": 2996
},
{
"epoch": 0.6591883866710656,
"grad_norm": 0.287047803401947,
"learning_rate": 8e-05,
"loss": 1.7241,
"step": 2997
},
{
"epoch": 0.659408336082701,
"grad_norm": 0.2812976837158203,
"learning_rate": 8e-05,
"loss": 1.6536,
"step": 2998
},
{
"epoch": 0.6596282854943363,
"grad_norm": 0.2794138491153717,
"learning_rate": 8e-05,
"loss": 1.6176,
"step": 2999
},
{
"epoch": 0.6598482349059717,
"grad_norm": 0.2949649691581726,
"learning_rate": 8e-05,
"loss": 1.6499,
"step": 3000
}
],
"logging_steps": 1,
"max_steps": 4546,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.65692592029696e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}