9b-61 / trainer_state.json
furproxy's picture
Upload folder using huggingface_hub
d28b21e verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.0,
"eval_steps": 500,
"global_step": 928,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008620689655172414,
"grad_norm": 0.2490028440952301,
"learning_rate": 8.510638297872341e-07,
"loss": 3.7051703929901123,
"step": 2
},
{
"epoch": 0.017241379310344827,
"grad_norm": 0.9354268908500671,
"learning_rate": 2.553191489361702e-06,
"loss": 2.5432679653167725,
"step": 4
},
{
"epoch": 0.02586206896551724,
"grad_norm": 1.1884268522262573,
"learning_rate": 4.255319148936171e-06,
"loss": 2.127363681793213,
"step": 6
},
{
"epoch": 0.034482758620689655,
"grad_norm": 0.21074028313159943,
"learning_rate": 5.957446808510638e-06,
"loss": 1.8431488275527954,
"step": 8
},
{
"epoch": 0.04310344827586207,
"grad_norm": 1.0560111999511719,
"learning_rate": 7.659574468085107e-06,
"loss": 1.2361581325531006,
"step": 10
},
{
"epoch": 0.05172413793103448,
"grad_norm": 0.1471565067768097,
"learning_rate": 9.361702127659576e-06,
"loss": 1.7195181846618652,
"step": 12
},
{
"epoch": 0.0603448275862069,
"grad_norm": 0.28380081057548523,
"learning_rate": 1.1063829787234044e-05,
"loss": 1.7923121452331543,
"step": 14
},
{
"epoch": 0.06896551724137931,
"grad_norm": 0.16648316383361816,
"learning_rate": 1.2765957446808513e-05,
"loss": 0.7930053472518921,
"step": 16
},
{
"epoch": 0.07758620689655173,
"grad_norm": 0.662397027015686,
"learning_rate": 1.4468085106382981e-05,
"loss": 0.8174023032188416,
"step": 18
},
{
"epoch": 0.08620689655172414,
"grad_norm": 0.13494855165481567,
"learning_rate": 1.6170212765957446e-05,
"loss": 1.5101033449172974,
"step": 20
},
{
"epoch": 0.09482758620689655,
"grad_norm": 0.1281285285949707,
"learning_rate": 1.7872340425531915e-05,
"loss": 1.6496508121490479,
"step": 22
},
{
"epoch": 0.10344827586206896,
"grad_norm": 0.19361676275730133,
"learning_rate": 1.9574468085106384e-05,
"loss": 1.5436862707138062,
"step": 24
},
{
"epoch": 0.11206896551724138,
"grad_norm": 0.26801514625549316,
"learning_rate": 2.1276595744680852e-05,
"loss": 1.258086919784546,
"step": 26
},
{
"epoch": 0.1206896551724138,
"grad_norm": 0.6263072490692139,
"learning_rate": 2.2978723404255324e-05,
"loss": 0.4653662145137787,
"step": 28
},
{
"epoch": 0.12931034482758622,
"grad_norm": 0.09440683573484421,
"learning_rate": 2.468085106382979e-05,
"loss": 1.5492061376571655,
"step": 30
},
{
"epoch": 0.13793103448275862,
"grad_norm": 0.18837958574295044,
"learning_rate": 2.6382978723404255e-05,
"loss": 1.1802101135253906,
"step": 32
},
{
"epoch": 0.14655172413793102,
"grad_norm": 0.13221898674964905,
"learning_rate": 2.8085106382978727e-05,
"loss": 1.5317002534866333,
"step": 34
},
{
"epoch": 0.15517241379310345,
"grad_norm": 0.09329993277788162,
"learning_rate": 2.9787234042553192e-05,
"loss": 1.4935221672058105,
"step": 36
},
{
"epoch": 0.16379310344827586,
"grad_norm": 0.1460646688938141,
"learning_rate": 3.1489361702127664e-05,
"loss": 1.4348247051239014,
"step": 38
},
{
"epoch": 0.1724137931034483,
"grad_norm": 0.11409169435501099,
"learning_rate": 3.319148936170213e-05,
"loss": 1.6168015003204346,
"step": 40
},
{
"epoch": 0.1810344827586207,
"grad_norm": 0.212503582239151,
"learning_rate": 3.48936170212766e-05,
"loss": 1.7296580076217651,
"step": 42
},
{
"epoch": 0.1896551724137931,
"grad_norm": 0.2593461275100708,
"learning_rate": 3.6595744680851066e-05,
"loss": 1.0080050230026245,
"step": 44
},
{
"epoch": 0.19827586206896552,
"grad_norm": 0.15509171783924103,
"learning_rate": 3.829787234042554e-05,
"loss": 1.2960833311080933,
"step": 46
},
{
"epoch": 0.20689655172413793,
"grad_norm": 0.11303484439849854,
"learning_rate": 4e-05,
"loss": 1.463219404220581,
"step": 48
},
{
"epoch": 0.21551724137931033,
"grad_norm": 0.10671091824769974,
"learning_rate": 3.999954222867108e-05,
"loss": 1.5820165872573853,
"step": 50
},
{
"epoch": 0.22413793103448276,
"grad_norm": 0.2280566394329071,
"learning_rate": 3.999816893796815e-05,
"loss": 0.7817078232765198,
"step": 52
},
{
"epoch": 0.23275862068965517,
"grad_norm": 0.0907350480556488,
"learning_rate": 3.9995880197741576e-05,
"loss": 1.5489472150802612,
"step": 54
},
{
"epoch": 0.2413793103448276,
"grad_norm": 0.11344298720359802,
"learning_rate": 3.999267612440463e-05,
"loss": 1.447698950767517,
"step": 56
},
{
"epoch": 0.25,
"grad_norm": 0.14231091737747192,
"learning_rate": 3.9988556880927647e-05,
"loss": 1.4835537672042847,
"step": 58
},
{
"epoch": 0.25862068965517243,
"grad_norm": 0.10744752734899521,
"learning_rate": 3.998352267682969e-05,
"loss": 1.4630577564239502,
"step": 60
},
{
"epoch": 0.2672413793103448,
"grad_norm": 0.33425694704055786,
"learning_rate": 3.99775737681679e-05,
"loss": 1.2140964269638062,
"step": 62
},
{
"epoch": 0.27586206896551724,
"grad_norm": 0.0645914226770401,
"learning_rate": 3.9970710457524474e-05,
"loss": 1.4031065702438354,
"step": 64
},
{
"epoch": 0.28448275862068967,
"grad_norm": 0.07510636001825333,
"learning_rate": 3.9962933093991296e-05,
"loss": 1.3692433834075928,
"step": 66
},
{
"epoch": 0.29310344827586204,
"grad_norm": 0.06527955085039139,
"learning_rate": 3.995424207315214e-05,
"loss": 1.3801196813583374,
"step": 68
},
{
"epoch": 0.3017241379310345,
"grad_norm": 0.12786391377449036,
"learning_rate": 3.994463783706259e-05,
"loss": 1.151434063911438,
"step": 70
},
{
"epoch": 0.3103448275862069,
"grad_norm": 0.19329607486724854,
"learning_rate": 3.9934120874227505e-05,
"loss": 1.0419366359710693,
"step": 72
},
{
"epoch": 0.31896551724137934,
"grad_norm": 0.08128409832715988,
"learning_rate": 3.992269171957624e-05,
"loss": 1.2281261682510376,
"step": 74
},
{
"epoch": 0.3275862068965517,
"grad_norm": 0.1377028524875641,
"learning_rate": 3.991035095443538e-05,
"loss": 1.3645248413085938,
"step": 76
},
{
"epoch": 0.33620689655172414,
"grad_norm": 0.05881645902991295,
"learning_rate": 3.9897099206499204e-05,
"loss": 1.2278920412063599,
"step": 78
},
{
"epoch": 0.3448275862068966,
"grad_norm": 0.15875652432441711,
"learning_rate": 3.9882937149797735e-05,
"loss": 1.0160530805587769,
"step": 80
},
{
"epoch": 0.35344827586206895,
"grad_norm": 0.09680726379156113,
"learning_rate": 3.986786550466246e-05,
"loss": 1.4256402254104614,
"step": 82
},
{
"epoch": 0.3620689655172414,
"grad_norm": 0.30841922760009766,
"learning_rate": 3.985188503768975e-05,
"loss": 1.318834900856018,
"step": 84
},
{
"epoch": 0.3706896551724138,
"grad_norm": 0.14286209642887115,
"learning_rate": 3.983499656170176e-05,
"loss": 1.474804162979126,
"step": 86
},
{
"epoch": 0.3793103448275862,
"grad_norm": 0.11407710611820221,
"learning_rate": 3.981720093570517e-05,
"loss": 1.3328633308410645,
"step": 88
},
{
"epoch": 0.3879310344827586,
"grad_norm": 0.4702969193458557,
"learning_rate": 3.9798499064847466e-05,
"loss": 1.1703399419784546,
"step": 90
},
{
"epoch": 0.39655172413793105,
"grad_norm": 0.13214579224586487,
"learning_rate": 3.9778891900370905e-05,
"loss": 1.0334569215774536,
"step": 92
},
{
"epoch": 0.4051724137931034,
"grad_norm": 0.17201748490333557,
"learning_rate": 3.9758380439564117e-05,
"loss": 1.1891283988952637,
"step": 94
},
{
"epoch": 0.41379310344827586,
"grad_norm": 0.2221606969833374,
"learning_rate": 3.97369657257114e-05,
"loss": 1.4113942384719849,
"step": 96
},
{
"epoch": 0.4224137931034483,
"grad_norm": 0.39266011118888855,
"learning_rate": 3.9714648848039655e-05,
"loss": 1.2900447845458984,
"step": 98
},
{
"epoch": 0.43103448275862066,
"grad_norm": 0.09964156895875931,
"learning_rate": 3.969143094166295e-05,
"loss": 1.270521640777588,
"step": 100
},
{
"epoch": 0.4396551724137931,
"grad_norm": 0.5467281937599182,
"learning_rate": 3.966731318752484e-05,
"loss": 1.2587134838104248,
"step": 102
},
{
"epoch": 0.4482758620689655,
"grad_norm": 0.10468795150518417,
"learning_rate": 3.964229681233825e-05,
"loss": 1.3480840921401978,
"step": 104
},
{
"epoch": 0.45689655172413796,
"grad_norm": 0.14980582892894745,
"learning_rate": 3.961638308852309e-05,
"loss": 1.0994645357131958,
"step": 106
},
{
"epoch": 0.46551724137931033,
"grad_norm": 0.22398428618907928,
"learning_rate": 3.958957333414157e-05,
"loss": 1.233306646347046,
"step": 108
},
{
"epoch": 0.47413793103448276,
"grad_norm": 0.20820066332817078,
"learning_rate": 3.9561868912831135e-05,
"loss": 1.2420070171356201,
"step": 110
},
{
"epoch": 0.4827586206896552,
"grad_norm": 0.07252290099859238,
"learning_rate": 3.953327123373506e-05,
"loss": 1.5031483173370361,
"step": 112
},
{
"epoch": 0.49137931034482757,
"grad_norm": 0.30114564299583435,
"learning_rate": 3.950378175143088e-05,
"loss": 1.2730351686477661,
"step": 114
},
{
"epoch": 0.5,
"grad_norm": 0.5594906806945801,
"learning_rate": 3.947340196585631e-05,
"loss": 1.0227445363998413,
"step": 116
},
{
"epoch": 0.5086206896551724,
"grad_norm": 0.06979751586914062,
"learning_rate": 3.944213342223299e-05,
"loss": 1.3545396327972412,
"step": 118
},
{
"epoch": 0.5172413793103449,
"grad_norm": 0.13030360639095306,
"learning_rate": 3.9409977710987896e-05,
"loss": 1.332112431526184,
"step": 120
},
{
"epoch": 0.5258620689655172,
"grad_norm": 0.0672382041811943,
"learning_rate": 3.937693646767245e-05,
"loss": 1.3639230728149414,
"step": 122
},
{
"epoch": 0.5344827586206896,
"grad_norm": 0.13222964107990265,
"learning_rate": 3.9343011372879275e-05,
"loss": 1.5418974161148071,
"step": 124
},
{
"epoch": 0.5431034482758621,
"grad_norm": 0.09776262193918228,
"learning_rate": 3.930820415215681e-05,
"loss": 1.6416376829147339,
"step": 126
},
{
"epoch": 0.5517241379310345,
"grad_norm": 0.058448825031518936,
"learning_rate": 3.927251657592146e-05,
"loss": 1.1251301765441895,
"step": 128
},
{
"epoch": 0.5603448275862069,
"grad_norm": 0.052552610635757446,
"learning_rate": 3.923595045936757e-05,
"loss": 1.3253697156906128,
"step": 130
},
{
"epoch": 0.5689655172413793,
"grad_norm": 0.19469662010669708,
"learning_rate": 3.919850766237512e-05,
"loss": 1.263968586921692,
"step": 132
},
{
"epoch": 0.5775862068965517,
"grad_norm": 0.17169688642024994,
"learning_rate": 3.9160190089415106e-05,
"loss": 0.7878425717353821,
"step": 134
},
{
"epoch": 0.5862068965517241,
"grad_norm": 0.07661473006010056,
"learning_rate": 3.912099968945268e-05,
"loss": 1.0626349449157715,
"step": 136
},
{
"epoch": 0.5948275862068966,
"grad_norm": 0.04046183452010155,
"learning_rate": 3.908093845584798e-05,
"loss": 1.05846107006073,
"step": 138
},
{
"epoch": 0.603448275862069,
"grad_norm": 0.09312810003757477,
"learning_rate": 3.9040008426254824e-05,
"loss": 1.0136967897415161,
"step": 140
},
{
"epoch": 0.6120689655172413,
"grad_norm": 0.06341353058815002,
"learning_rate": 3.8998211682516976e-05,
"loss": 1.0979063510894775,
"step": 142
},
{
"epoch": 0.6206896551724138,
"grad_norm": 0.07724281400442123,
"learning_rate": 3.895555035056233e-05,
"loss": 1.2308785915374756,
"step": 144
},
{
"epoch": 0.6293103448275862,
"grad_norm": 0.06727743148803711,
"learning_rate": 3.891202660029474e-05,
"loss": 1.3009754419326782,
"step": 146
},
{
"epoch": 0.6379310344827587,
"grad_norm": 0.06898898631334305,
"learning_rate": 3.886764264548363e-05,
"loss": 1.3296358585357666,
"step": 148
},
{
"epoch": 0.646551724137931,
"grad_norm": 0.09349878877401352,
"learning_rate": 3.882240074365145e-05,
"loss": 1.2398273944854736,
"step": 150
},
{
"epoch": 0.6551724137931034,
"grad_norm": 0.17715542018413544,
"learning_rate": 3.8776303195958814e-05,
"loss": 0.9495888948440552,
"step": 152
},
{
"epoch": 0.6637931034482759,
"grad_norm": 0.7314029932022095,
"learning_rate": 3.872935234708747e-05,
"loss": 1.1250660419464111,
"step": 154
},
{
"epoch": 0.6724137931034483,
"grad_norm": 0.24987009167671204,
"learning_rate": 3.868155058512102e-05,
"loss": 1.2095718383789062,
"step": 156
},
{
"epoch": 0.6810344827586207,
"grad_norm": 0.14576859772205353,
"learning_rate": 3.8632900341423464e-05,
"loss": 1.295078992843628,
"step": 158
},
{
"epoch": 0.6896551724137931,
"grad_norm": 0.11293840408325195,
"learning_rate": 3.858340409051558e-05,
"loss": 1.338647723197937,
"step": 160
},
{
"epoch": 0.6982758620689655,
"grad_norm": 0.14143511652946472,
"learning_rate": 3.853306434994895e-05,
"loss": 1.6283466815948486,
"step": 162
},
{
"epoch": 0.7068965517241379,
"grad_norm": 0.06157878786325455,
"learning_rate": 3.848188368017803e-05,
"loss": 1.055685043334961,
"step": 164
},
{
"epoch": 0.7155172413793104,
"grad_norm": 0.10017464309930801,
"learning_rate": 3.8429864684429846e-05,
"loss": 1.3115266561508179,
"step": 166
},
{
"epoch": 0.7241379310344828,
"grad_norm": 0.1262677162885666,
"learning_rate": 3.837701000857159e-05,
"loss": 1.3648704290390015,
"step": 168
},
{
"epoch": 0.7327586206896551,
"grad_norm": 0.1431019902229309,
"learning_rate": 3.832332234097606e-05,
"loss": 1.2736293077468872,
"step": 170
},
{
"epoch": 0.7413793103448276,
"grad_norm": 0.3130134046077728,
"learning_rate": 3.8268804412384936e-05,
"loss": 1.2950979471206665,
"step": 172
},
{
"epoch": 0.75,
"grad_norm": 0.10647283494472504,
"learning_rate": 3.821345899576982e-05,
"loss": 0.9605190753936768,
"step": 174
},
{
"epoch": 0.7586206896551724,
"grad_norm": 0.16715826094150543,
"learning_rate": 3.815728890619127e-05,
"loss": 0.7583910226821899,
"step": 176
},
{
"epoch": 0.7672413793103449,
"grad_norm": 0.21642223000526428,
"learning_rate": 3.8100297000655566e-05,
"loss": 1.3208308219909668,
"step": 178
},
{
"epoch": 0.7758620689655172,
"grad_norm": 0.09079600870609283,
"learning_rate": 3.804248617796941e-05,
"loss": 1.3078787326812744,
"step": 180
},
{
"epoch": 0.7844827586206896,
"grad_norm": 0.08445187658071518,
"learning_rate": 3.798385937859249e-05,
"loss": 1.0150703191757202,
"step": 182
},
{
"epoch": 0.7931034482758621,
"grad_norm": 0.07777632772922516,
"learning_rate": 3.79244195844879e-05,
"loss": 0.8019794225692749,
"step": 184
},
{
"epoch": 0.8017241379310345,
"grad_norm": 0.0645889863371849,
"learning_rate": 3.7864169818970465e-05,
"loss": 1.423434853553772,
"step": 186
},
{
"epoch": 0.8103448275862069,
"grad_norm": 0.14517554640769958,
"learning_rate": 3.7803113146553e-05,
"loss": 1.5573757886886597,
"step": 188
},
{
"epoch": 0.8189655172413793,
"grad_norm": 0.08607929199934006,
"learning_rate": 3.774125267279041e-05,
"loss": 1.2926381826400757,
"step": 190
},
{
"epoch": 0.8275862068965517,
"grad_norm": 0.07077501714229584,
"learning_rate": 3.767859154412171e-05,
"loss": 1.2611286640167236,
"step": 192
},
{
"epoch": 0.8362068965517241,
"grad_norm": 0.08681857585906982,
"learning_rate": 3.7615132947710036e-05,
"loss": 1.3005847930908203,
"step": 194
},
{
"epoch": 0.8448275862068966,
"grad_norm": 0.15100322663784027,
"learning_rate": 3.755088011128049e-05,
"loss": 1.3176685571670532,
"step": 196
},
{
"epoch": 0.853448275862069,
"grad_norm": 0.1908247470855713,
"learning_rate": 3.7485836302956016e-05,
"loss": 1.2926079034805298,
"step": 198
},
{
"epoch": 0.8620689655172413,
"grad_norm": 0.11072229593992233,
"learning_rate": 3.7420004831091105e-05,
"loss": 1.290125846862793,
"step": 200
},
{
"epoch": 0.8706896551724138,
"grad_norm": 0.10266255587339401,
"learning_rate": 3.735338904410358e-05,
"loss": 1.333167552947998,
"step": 202
},
{
"epoch": 0.8793103448275862,
"grad_norm": 0.12212225794792175,
"learning_rate": 3.728599233030425e-05,
"loss": 0.894460916519165,
"step": 204
},
{
"epoch": 0.8879310344827587,
"grad_norm": 0.07256551086902618,
"learning_rate": 3.72178181177246e-05,
"loss": 1.2725472450256348,
"step": 206
},
{
"epoch": 0.896551724137931,
"grad_norm": 0.12780705094337463,
"learning_rate": 3.714886987394238e-05,
"loss": 1.3160998821258545,
"step": 208
},
{
"epoch": 0.9051724137931034,
"grad_norm": 0.9440551400184631,
"learning_rate": 3.70791511059053e-05,
"loss": 0.6522892117500305,
"step": 210
},
{
"epoch": 0.9137931034482759,
"grad_norm": 0.09754825383424759,
"learning_rate": 3.700866535975256e-05,
"loss": 0.9885504245758057,
"step": 212
},
{
"epoch": 0.9224137931034483,
"grad_norm": 0.4700559675693512,
"learning_rate": 3.69374162206346e-05,
"loss": 1.2210713624954224,
"step": 214
},
{
"epoch": 0.9310344827586207,
"grad_norm": 0.15046778321266174,
"learning_rate": 3.6865407312530635e-05,
"loss": 1.2765154838562012,
"step": 216
},
{
"epoch": 0.9396551724137931,
"grad_norm": 0.11940351128578186,
"learning_rate": 3.67926422980644e-05,
"loss": 1.0038096904754639,
"step": 218
},
{
"epoch": 0.9482758620689655,
"grad_norm": 0.11115628480911255,
"learning_rate": 3.671912487831783e-05,
"loss": 0.9949377179145813,
"step": 220
},
{
"epoch": 0.9568965517241379,
"grad_norm": 0.07952730357646942,
"learning_rate": 3.664485879264279e-05,
"loss": 1.3989291191101074,
"step": 222
},
{
"epoch": 0.9655172413793104,
"grad_norm": 0.09136626124382019,
"learning_rate": 3.656984781847094e-05,
"loss": 0.9785476922988892,
"step": 224
},
{
"epoch": 0.9741379310344828,
"grad_norm": 0.057783424854278564,
"learning_rate": 3.649409577112152e-05,
"loss": 0.9384239315986633,
"step": 226
},
{
"epoch": 0.9827586206896551,
"grad_norm": 0.07152694463729858,
"learning_rate": 3.641760650360736e-05,
"loss": 1.2639554738998413,
"step": 228
},
{
"epoch": 0.9913793103448276,
"grad_norm": 0.33899354934692383,
"learning_rate": 3.634038390643886e-05,
"loss": 0.9682251811027527,
"step": 230
},
{
"epoch": 1.0,
"grad_norm": 0.07940968871116638,
"learning_rate": 3.626243190742613e-05,
"loss": 0.870396614074707,
"step": 232
},
{
"epoch": 1.0086206896551724,
"grad_norm": 0.12955711781978607,
"learning_rate": 3.618375447147918e-05,
"loss": 0.9028921723365784,
"step": 234
},
{
"epoch": 1.0172413793103448,
"grad_norm": 0.18779002130031586,
"learning_rate": 3.6104355600406284e-05,
"loss": 0.7830209136009216,
"step": 236
},
{
"epoch": 1.0258620689655173,
"grad_norm": 0.9097674489021301,
"learning_rate": 3.6024239332710415e-05,
"loss": 0.8674835562705994,
"step": 238
},
{
"epoch": 1.0344827586206897,
"grad_norm": 0.27647626399993896,
"learning_rate": 3.5943409743383826e-05,
"loss": 0.6074855327606201,
"step": 240
},
{
"epoch": 1.043103448275862,
"grad_norm": 0.11918103694915771,
"learning_rate": 3.586187094370079e-05,
"loss": 1.2056649923324585,
"step": 242
},
{
"epoch": 1.0517241379310345,
"grad_norm": 0.11876146495342255,
"learning_rate": 3.577962708100851e-05,
"loss": 0.39286842942237854,
"step": 244
},
{
"epoch": 1.0603448275862069,
"grad_norm": 0.12174484878778458,
"learning_rate": 3.569668233851613e-05,
"loss": 0.9662060141563416,
"step": 246
},
{
"epoch": 1.0689655172413792,
"grad_norm": 0.06474713236093521,
"learning_rate": 3.561304093508198e-05,
"loss": 0.9487460851669312,
"step": 248
},
{
"epoch": 1.0775862068965518,
"grad_norm": 0.1427253633737564,
"learning_rate": 3.552870712499898e-05,
"loss": 0.8610017895698547,
"step": 250
},
{
"epoch": 1.0862068965517242,
"grad_norm": 0.1900222897529602,
"learning_rate": 3.54436851977783e-05,
"loss": 0.448039710521698,
"step": 252
},
{
"epoch": 1.0948275862068966,
"grad_norm": 0.22907602787017822,
"learning_rate": 3.535797947793111e-05,
"loss": 0.8761284351348877,
"step": 254
},
{
"epoch": 1.103448275862069,
"grad_norm": 0.15932975709438324,
"learning_rate": 3.527159432474865e-05,
"loss": 0.8125666975975037,
"step": 256
},
{
"epoch": 1.1120689655172413,
"grad_norm": 0.349263459444046,
"learning_rate": 3.518453413208053e-05,
"loss": 0.6569501757621765,
"step": 258
},
{
"epoch": 1.1206896551724137,
"grad_norm": 0.14833390712738037,
"learning_rate": 3.509680332811121e-05,
"loss": 0.7028253674507141,
"step": 260
},
{
"epoch": 1.1293103448275863,
"grad_norm": 0.05770875886082649,
"learning_rate": 3.5008406375134756e-05,
"loss": 1.1632713079452515,
"step": 262
},
{
"epoch": 1.1379310344827587,
"grad_norm": 0.055273283272981644,
"learning_rate": 3.491934776932791e-05,
"loss": 0.8371788263320923,
"step": 264
},
{
"epoch": 1.146551724137931,
"grad_norm": 0.05901844799518585,
"learning_rate": 3.482963204052139e-05,
"loss": 1.0003291368484497,
"step": 266
},
{
"epoch": 1.1551724137931034,
"grad_norm": 0.06095067784190178,
"learning_rate": 3.473926375196943e-05,
"loss": 0.9226457476615906,
"step": 268
},
{
"epoch": 1.1637931034482758,
"grad_norm": 0.07438337802886963,
"learning_rate": 3.464824750011779e-05,
"loss": 1.1149680614471436,
"step": 270
},
{
"epoch": 1.1724137931034484,
"grad_norm": 0.06853963434696198,
"learning_rate": 3.455658791436985e-05,
"loss": 1.0302170515060425,
"step": 272
},
{
"epoch": 1.1810344827586208,
"grad_norm": 0.0973813459277153,
"learning_rate": 3.446428965685121e-05,
"loss": 0.7569156289100647,
"step": 274
},
{
"epoch": 1.1896551724137931,
"grad_norm": 0.056327857077121735,
"learning_rate": 3.437135742217254e-05,
"loss": 0.3794441223144531,
"step": 276
},
{
"epoch": 1.1982758620689655,
"grad_norm": 0.07669582962989807,
"learning_rate": 3.427779593719079e-05,
"loss": 1.1280944347381592,
"step": 278
},
{
"epoch": 1.206896551724138,
"grad_norm": 0.058203186839818954,
"learning_rate": 3.4183609960768764e-05,
"loss": 0.9517163634300232,
"step": 280
},
{
"epoch": 1.2155172413793103,
"grad_norm": 0.06986816227436066,
"learning_rate": 3.4088804283533094e-05,
"loss": 0.6671708822250366,
"step": 282
},
{
"epoch": 1.2241379310344827,
"grad_norm": 0.08936108648777008,
"learning_rate": 3.399338372763055e-05,
"loss": 0.7694864869117737,
"step": 284
},
{
"epoch": 1.2327586206896552,
"grad_norm": 0.12140902131795883,
"learning_rate": 3.389735314648274e-05,
"loss": 0.8068587183952332,
"step": 286
},
{
"epoch": 1.2413793103448276,
"grad_norm": 0.04297681525349617,
"learning_rate": 3.380071742453931e-05,
"loss": 0.40287792682647705,
"step": 288
},
{
"epoch": 1.25,
"grad_norm": 0.11908482015132904,
"learning_rate": 3.370348147702949e-05,
"loss": 1.0401684045791626,
"step": 290
},
{
"epoch": 1.2586206896551724,
"grad_norm": 0.057489216327667236,
"learning_rate": 3.360565024971202e-05,
"loss": 0.8889655470848083,
"step": 292
},
{
"epoch": 1.2672413793103448,
"grad_norm": 0.15609467029571533,
"learning_rate": 3.350722871862368e-05,
"loss": 0.9757481813430786,
"step": 294
},
{
"epoch": 1.2758620689655173,
"grad_norm": 0.1248452365398407,
"learning_rate": 3.340822188982616e-05,
"loss": 0.7736673355102539,
"step": 296
},
{
"epoch": 1.2844827586206897,
"grad_norm": 0.09071607887744904,
"learning_rate": 3.330863479915138e-05,
"loss": 1.0164954662322998,
"step": 298
},
{
"epoch": 1.293103448275862,
"grad_norm": 0.06744378060102463,
"learning_rate": 3.320847251194546e-05,
"loss": 0.9475960731506348,
"step": 300
},
{
"epoch": 1.3017241379310345,
"grad_norm": 0.07189597189426422,
"learning_rate": 3.310774012281099e-05,
"loss": 0.6825069785118103,
"step": 302
},
{
"epoch": 1.3103448275862069,
"grad_norm": 0.07518645375967026,
"learning_rate": 3.300644275534793e-05,
"loss": 0.5717735290527344,
"step": 304
},
{
"epoch": 1.3189655172413794,
"grad_norm": 0.09223438799381256,
"learning_rate": 3.290458556189299e-05,
"loss": 1.3711295127868652,
"step": 306
},
{
"epoch": 1.3275862068965516,
"grad_norm": 0.14958783984184265,
"learning_rate": 3.2802173723257604e-05,
"loss": 0.6421374082565308,
"step": 308
},
{
"epoch": 1.3362068965517242,
"grad_norm": 0.1238432452082634,
"learning_rate": 3.2699212448464385e-05,
"loss": 0.9758880734443665,
"step": 310
},
{
"epoch": 1.3448275862068966,
"grad_norm": 0.06866496056318283,
"learning_rate": 3.259570697448217e-05,
"loss": 0.9329778552055359,
"step": 312
},
{
"epoch": 1.353448275862069,
"grad_norm": 0.10072822868824005,
"learning_rate": 3.249166256595967e-05,
"loss": 1.2179062366485596,
"step": 314
},
{
"epoch": 1.3620689655172413,
"grad_norm": 0.06878109276294708,
"learning_rate": 3.2387084514957675e-05,
"loss": 1.3471888303756714,
"step": 316
},
{
"epoch": 1.3706896551724137,
"grad_norm": 0.06524922698736191,
"learning_rate": 3.2281978140679894e-05,
"loss": 0.9441757202148438,
"step": 318
},
{
"epoch": 1.3793103448275863,
"grad_norm": 0.11429349333047867,
"learning_rate": 3.21763487892024e-05,
"loss": 0.7498874664306641,
"step": 320
},
{
"epoch": 1.3879310344827587,
"grad_norm": 0.07464733719825745,
"learning_rate": 3.207020183320171e-05,
"loss": 1.4824918508529663,
"step": 322
},
{
"epoch": 1.396551724137931,
"grad_norm": 0.11774388700723648,
"learning_rate": 3.196354267168149e-05,
"loss": 0.49022743105888367,
"step": 324
},
{
"epoch": 1.4051724137931034,
"grad_norm": 0.16186490654945374,
"learning_rate": 3.185637672969799e-05,
"loss": 0.6543675661087036,
"step": 326
},
{
"epoch": 1.4137931034482758,
"grad_norm": 0.10584386438131332,
"learning_rate": 3.1748709458084045e-05,
"loss": 0.8541685342788696,
"step": 328
},
{
"epoch": 1.4224137931034484,
"grad_norm": 0.07407426834106445,
"learning_rate": 3.1640546333171894e-05,
"loss": 0.7656717300415039,
"step": 330
},
{
"epoch": 1.4310344827586206,
"grad_norm": 0.16052280366420746,
"learning_rate": 3.153189285651458e-05,
"loss": 0.6957482695579529,
"step": 332
},
{
"epoch": 1.4396551724137931,
"grad_norm": 0.13904230296611786,
"learning_rate": 3.142275455460614e-05,
"loss": 0.6638420224189758,
"step": 334
},
{
"epoch": 1.4482758620689655,
"grad_norm": 0.11371087282896042,
"learning_rate": 3.131313697860053e-05,
"loss": 0.7661845088005066,
"step": 336
},
{
"epoch": 1.456896551724138,
"grad_norm": 0.11575423926115036,
"learning_rate": 3.120304570402924e-05,
"loss": 1.1160173416137695,
"step": 338
},
{
"epoch": 1.4655172413793103,
"grad_norm": 0.22061830759048462,
"learning_rate": 3.1092486330517714e-05,
"loss": 1.384441614151001,
"step": 340
},
{
"epoch": 1.4741379310344827,
"grad_norm": 0.12608060240745544,
"learning_rate": 3.098146448150055e-05,
"loss": 1.145660638809204,
"step": 342
},
{
"epoch": 1.4827586206896552,
"grad_norm": 0.06820492446422577,
"learning_rate": 3.086998580393547e-05,
"loss": 0.9891381859779358,
"step": 344
},
{
"epoch": 1.4913793103448276,
"grad_norm": 0.11383876949548721,
"learning_rate": 3.075805596801605e-05,
"loss": 0.6093174815177917,
"step": 346
},
{
"epoch": 1.5,
"grad_norm": 0.2013673037290573,
"learning_rate": 3.0645680666883374e-05,
"loss": 0.9298641681671143,
"step": 348
},
{
"epoch": 1.5086206896551724,
"grad_norm": 0.08500847220420837,
"learning_rate": 3.053286561633644e-05,
"loss": 0.9974504113197327,
"step": 350
},
{
"epoch": 1.5172413793103448,
"grad_norm": 0.14812250435352325,
"learning_rate": 3.041961655454143e-05,
"loss": 0.9739059209823608,
"step": 352
},
{
"epoch": 1.5258620689655173,
"grad_norm": 0.11965472251176834,
"learning_rate": 3.030593924173984e-05,
"loss": 1.133984088897705,
"step": 354
},
{
"epoch": 1.5344827586206895,
"grad_norm": 0.36424365639686584,
"learning_rate": 3.0191839459955514e-05,
"loss": 0.8807175755500793,
"step": 356
},
{
"epoch": 1.543103448275862,
"grad_norm": 0.05107448622584343,
"learning_rate": 3.0077323012700534e-05,
"loss": 0.8361281156539917,
"step": 358
},
{
"epoch": 1.5517241379310345,
"grad_norm": 0.09036049991846085,
"learning_rate": 2.996239572468003e-05,
"loss": 1.2387166023254395,
"step": 360
},
{
"epoch": 1.5603448275862069,
"grad_norm": 0.06331617385149002,
"learning_rate": 2.984706344149595e-05,
"loss": 1.0467900037765503,
"step": 362
},
{
"epoch": 1.5689655172413794,
"grad_norm": 0.06433523446321487,
"learning_rate": 2.9731332029349667e-05,
"loss": 1.0626113414764404,
"step": 364
},
{
"epoch": 1.5775862068965516,
"grad_norm": 0.09752818942070007,
"learning_rate": 2.961520737474367e-05,
"loss": 1.0128107070922852,
"step": 366
},
{
"epoch": 1.5862068965517242,
"grad_norm": 0.05285457894206047,
"learning_rate": 2.9498695384182123e-05,
"loss": 0.9877223968505859,
"step": 368
},
{
"epoch": 1.5948275862068966,
"grad_norm": 0.05934653803706169,
"learning_rate": 2.9381801983870435e-05,
"loss": 0.9603118300437927,
"step": 370
},
{
"epoch": 1.603448275862069,
"grad_norm": 0.22097375988960266,
"learning_rate": 2.9264533119413866e-05,
"loss": 1.081476092338562,
"step": 372
},
{
"epoch": 1.6120689655172413,
"grad_norm": 0.10628407448530197,
"learning_rate": 2.914689475551506e-05,
"loss": 0.7714329957962036,
"step": 374
},
{
"epoch": 1.6206896551724137,
"grad_norm": 0.10955756157636642,
"learning_rate": 2.902889287567072e-05,
"loss": 0.9913143515586853,
"step": 376
},
{
"epoch": 1.6293103448275863,
"grad_norm": 0.07451241463422775,
"learning_rate": 2.8910533481867195e-05,
"loss": 1.1765313148498535,
"step": 378
},
{
"epoch": 1.6379310344827587,
"grad_norm": 0.07359088957309723,
"learning_rate": 2.879182259427528e-05,
"loss": 0.7655573487281799,
"step": 380
},
{
"epoch": 1.646551724137931,
"grad_norm": 0.13642138242721558,
"learning_rate": 2.8672766250943947e-05,
"loss": 1.3452657461166382,
"step": 382
},
{
"epoch": 1.6551724137931034,
"grad_norm": 0.08765345811843872,
"learning_rate": 2.8553370507493246e-05,
"loss": 0.9972445964813232,
"step": 384
},
{
"epoch": 1.6637931034482758,
"grad_norm": 0.0989682674407959,
"learning_rate": 2.8433641436806306e-05,
"loss": 0.8845785856246948,
"step": 386
},
{
"epoch": 1.6724137931034484,
"grad_norm": 0.06875207275152206,
"learning_rate": 2.8313585128720444e-05,
"loss": 1.3110713958740234,
"step": 388
},
{
"epoch": 1.6810344827586206,
"grad_norm": 0.13957612216472626,
"learning_rate": 2.8193207689717393e-05,
"loss": 0.8128502368927002,
"step": 390
},
{
"epoch": 1.6896551724137931,
"grad_norm": 0.6921377778053284,
"learning_rate": 2.807251524261275e-05,
"loss": 0.6244351863861084,
"step": 392
},
{
"epoch": 1.6982758620689655,
"grad_norm": 0.30923035740852356,
"learning_rate": 2.7951513926244484e-05,
"loss": 1.127506136894226,
"step": 394
},
{
"epoch": 1.706896551724138,
"grad_norm": 0.0620148703455925,
"learning_rate": 2.7830209895160764e-05,
"loss": 1.042289137840271,
"step": 396
},
{
"epoch": 1.7155172413793105,
"grad_norm": 0.16145341098308563,
"learning_rate": 2.770860931930687e-05,
"loss": 1.0570330619812012,
"step": 398
},
{
"epoch": 1.7241379310344827,
"grad_norm": 0.09267118573188782,
"learning_rate": 2.7586718383711367e-05,
"loss": 0.9959380626678467,
"step": 400
},
{
"epoch": 1.7327586206896552,
"grad_norm": 0.07319535315036774,
"learning_rate": 2.7464543288171558e-05,
"loss": 1.0200254917144775,
"step": 402
},
{
"epoch": 1.7413793103448276,
"grad_norm": 0.055158186703920364,
"learning_rate": 2.7342090246938076e-05,
"loss": 0.6205574870109558,
"step": 404
},
{
"epoch": 1.75,
"grad_norm": 0.07343259453773499,
"learning_rate": 2.721936548839887e-05,
"loss": 0.8922735452651978,
"step": 406
},
{
"epoch": 1.7586206896551724,
"grad_norm": 0.06107189506292343,
"learning_rate": 2.709637525476236e-05,
"loss": 0.6991145014762878,
"step": 408
},
{
"epoch": 1.7672413793103448,
"grad_norm": 0.0519319549202919,
"learning_rate": 2.697312580173995e-05,
"loss": 0.8093492984771729,
"step": 410
},
{
"epoch": 1.7758620689655173,
"grad_norm": 0.07292782515287399,
"learning_rate": 2.684962339822785e-05,
"loss": 0.7507970929145813,
"step": 412
},
{
"epoch": 1.7844827586206895,
"grad_norm": 0.07456238567829132,
"learning_rate": 2.672587432598823e-05,
"loss": 0.5883830189704895,
"step": 414
},
{
"epoch": 1.793103448275862,
"grad_norm": 0.11243204772472382,
"learning_rate": 2.6601884879329653e-05,
"loss": 0.7915773391723633,
"step": 416
},
{
"epoch": 1.8017241379310345,
"grad_norm": 0.07653719186782837,
"learning_rate": 2.6477661364786996e-05,
"loss": 1.0269769430160522,
"step": 418
},
{
"epoch": 1.8103448275862069,
"grad_norm": 0.14341171085834503,
"learning_rate": 2.635321010080062e-05,
"loss": 1.053789496421814,
"step": 420
},
{
"epoch": 1.8189655172413794,
"grad_norm": 0.12033911049365997,
"learning_rate": 2.6228537417395034e-05,
"loss": 1.158492088317871,
"step": 422
},
{
"epoch": 1.8275862068965516,
"grad_norm": 0.047955527901649475,
"learning_rate": 2.61036496558569e-05,
"loss": 0.9592758417129517,
"step": 424
},
{
"epoch": 1.8362068965517242,
"grad_norm": 0.088678739964962,
"learning_rate": 2.59785531684125e-05,
"loss": 0.6086317300796509,
"step": 426
},
{
"epoch": 1.8448275862068966,
"grad_norm": 0.07942725718021393,
"learning_rate": 2.585325431790464e-05,
"loss": 1.0528879165649414,
"step": 428
},
{
"epoch": 1.853448275862069,
"grad_norm": 0.0694958046078682,
"learning_rate": 2.572775947746903e-05,
"loss": 1.0576783418655396,
"step": 430
},
{
"epoch": 1.8620689655172413,
"grad_norm": 0.17858955264091492,
"learning_rate": 2.5602075030210096e-05,
"loss": 0.9204137325286865,
"step": 432
},
{
"epoch": 1.8706896551724137,
"grad_norm": 0.296277791261673,
"learning_rate": 2.5476207368876334e-05,
"loss": 1.114011287689209,
"step": 434
},
{
"epoch": 1.8793103448275863,
"grad_norm": 0.07735295593738556,
"learning_rate": 2.535016289553514e-05,
"loss": 0.7933326363563538,
"step": 436
},
{
"epoch": 1.8879310344827587,
"grad_norm": 0.12477041035890579,
"learning_rate": 2.5223948021247197e-05,
"loss": 0.9807726144790649,
"step": 438
},
{
"epoch": 1.896551724137931,
"grad_norm": 0.09196372330188751,
"learning_rate": 2.509756916574035e-05,
"loss": 1.0345503091812134,
"step": 440
},
{
"epoch": 1.9051724137931034,
"grad_norm": 0.06840290129184723,
"learning_rate": 2.4971032757083123e-05,
"loss": 1.1201728582382202,
"step": 442
},
{
"epoch": 1.9137931034482758,
"grad_norm": 0.11144451051950455,
"learning_rate": 2.4844345231357734e-05,
"loss": 0.28341731429100037,
"step": 444
},
{
"epoch": 1.9224137931034484,
"grad_norm": 0.14570969343185425,
"learning_rate": 2.4717513032332736e-05,
"loss": 0.7789583206176758,
"step": 446
},
{
"epoch": 1.9310344827586206,
"grad_norm": 0.05790058895945549,
"learning_rate": 2.4590542611135274e-05,
"loss": 1.012285590171814,
"step": 448
},
{
"epoch": 1.9396551724137931,
"grad_norm": 0.05153496563434601,
"learning_rate": 2.446344042592295e-05,
"loss": 1.0196033716201782,
"step": 450
},
{
"epoch": 1.9482758620689655,
"grad_norm": 0.057060956954956055,
"learning_rate": 2.433621294155535e-05,
"loss": 0.8052966594696045,
"step": 452
},
{
"epoch": 1.956896551724138,
"grad_norm": 0.0602966733276844,
"learning_rate": 2.420886662926521e-05,
"loss": 0.9915321469306946,
"step": 454
},
{
"epoch": 1.9655172413793105,
"grad_norm": 0.07094614952802658,
"learning_rate": 2.4081407966329256e-05,
"loss": 0.9689676761627197,
"step": 456
},
{
"epoch": 1.9741379310344827,
"grad_norm": 0.08627466857433319,
"learning_rate": 2.3953843435738775e-05,
"loss": 0.41972166299819946,
"step": 458
},
{
"epoch": 1.9827586206896552,
"grad_norm": 0.10626411437988281,
"learning_rate": 2.3826179525869836e-05,
"loss": 1.1633706092834473,
"step": 460
},
{
"epoch": 1.9913793103448276,
"grad_norm": 0.15631678700447083,
"learning_rate": 2.36984227301533e-05,
"loss": 0.7487952709197998,
"step": 462
},
{
"epoch": 2.0,
"grad_norm": 0.16628113389015198,
"learning_rate": 2.3570579546744504e-05,
"loss": 0.8847077488899231,
"step": 464
},
{
"epoch": 2.0086206896551726,
"grad_norm": 0.06411660462617874,
"learning_rate": 2.3442656478192794e-05,
"loss": 0.484560489654541,
"step": 466
},
{
"epoch": 2.0172413793103448,
"grad_norm": 0.3941573202610016,
"learning_rate": 2.331466003111073e-05,
"loss": 0.6984850764274597,
"step": 468
},
{
"epoch": 2.0258620689655173,
"grad_norm": 0.044237978756427765,
"learning_rate": 2.318659671584316e-05,
"loss": 0.4863373935222626,
"step": 470
},
{
"epoch": 2.0344827586206895,
"grad_norm": 0.0645633190870285,
"learning_rate": 2.305847304613609e-05,
"loss": 0.4588513970375061,
"step": 472
},
{
"epoch": 2.043103448275862,
"grad_norm": 0.05587729066610336,
"learning_rate": 2.293029553880536e-05,
"loss": 0.4486234486103058,
"step": 474
},
{
"epoch": 2.0517241379310347,
"grad_norm": 0.06679260730743408,
"learning_rate": 2.280207071340517e-05,
"loss": 0.5298870205879211,
"step": 476
},
{
"epoch": 2.060344827586207,
"grad_norm": 0.08075322210788727,
"learning_rate": 2.26738050918965e-05,
"loss": 0.4382156729698181,
"step": 478
},
{
"epoch": 2.0689655172413794,
"grad_norm": 0.06546280533075333,
"learning_rate": 2.2545505198315346e-05,
"loss": 0.5762298107147217,
"step": 480
},
{
"epoch": 2.0775862068965516,
"grad_norm": 0.11915218830108643,
"learning_rate": 2.2417177558440907e-05,
"loss": 0.36859992146492004,
"step": 482
},
{
"epoch": 2.086206896551724,
"grad_norm": 0.22198820114135742,
"learning_rate": 2.2288828699463652e-05,
"loss": 0.5293700098991394,
"step": 484
},
{
"epoch": 2.0948275862068964,
"grad_norm": 0.0842965617775917,
"learning_rate": 2.2160465149653337e-05,
"loss": 0.49147215485572815,
"step": 486
},
{
"epoch": 2.103448275862069,
"grad_norm": 0.11753598600625992,
"learning_rate": 2.203209343802692e-05,
"loss": 0.5180780291557312,
"step": 488
},
{
"epoch": 2.1120689655172415,
"grad_norm": 0.37540075182914734,
"learning_rate": 2.1903720094016537e-05,
"loss": 0.581203818321228,
"step": 490
},
{
"epoch": 2.1206896551724137,
"grad_norm": 0.062044426798820496,
"learning_rate": 2.1775351647137323e-05,
"loss": 0.4889185130596161,
"step": 492
},
{
"epoch": 2.1293103448275863,
"grad_norm": 0.07434380799531937,
"learning_rate": 2.1646994626655332e-05,
"loss": 0.6391059756278992,
"step": 494
},
{
"epoch": 2.1379310344827585,
"grad_norm": 0.10223301500082016,
"learning_rate": 2.151865556125544e-05,
"loss": 0.6237853169441223,
"step": 496
},
{
"epoch": 2.146551724137931,
"grad_norm": 0.14267216622829437,
"learning_rate": 2.1390340978709254e-05,
"loss": 0.36577755212783813,
"step": 498
},
{
"epoch": 2.1551724137931036,
"grad_norm": 0.13929963111877441,
"learning_rate": 2.1262057405543115e-05,
"loss": 0.49633127450942993,
"step": 500
},
{
"epoch": 2.163793103448276,
"grad_norm": 0.05517968162894249,
"learning_rate": 2.1133811366706097e-05,
"loss": 0.38259175419807434,
"step": 502
},
{
"epoch": 2.1724137931034484,
"grad_norm": 0.058835044503211975,
"learning_rate": 2.100560938523817e-05,
"loss": 0.4427034258842468,
"step": 504
},
{
"epoch": 2.1810344827586206,
"grad_norm": 0.15045633912086487,
"learning_rate": 2.0877457981938364e-05,
"loss": 0.6942803263664246,
"step": 506
},
{
"epoch": 2.189655172413793,
"grad_norm": 2.292686700820923,
"learning_rate": 2.074936367503317e-05,
"loss": 0.5671365261077881,
"step": 508
},
{
"epoch": 2.1982758620689653,
"grad_norm": 0.046695832163095474,
"learning_rate": 2.0621332979844904e-05,
"loss": 0.6063480377197266,
"step": 510
},
{
"epoch": 2.206896551724138,
"grad_norm": 0.16905461251735687,
"learning_rate": 2.0493372408460425e-05,
"loss": 0.6027957201004028,
"step": 512
},
{
"epoch": 2.2155172413793105,
"grad_norm": 0.06160572171211243,
"learning_rate": 2.0365488469399795e-05,
"loss": 0.6078309416770935,
"step": 514
},
{
"epoch": 2.2241379310344827,
"grad_norm": 0.07821284979581833,
"learning_rate": 2.0237687667285345e-05,
"loss": 0.3304949402809143,
"step": 516
},
{
"epoch": 2.2327586206896552,
"grad_norm": 0.34748536348342896,
"learning_rate": 2.010997650251072e-05,
"loss": 0.12825970351696014,
"step": 518
},
{
"epoch": 2.2413793103448274,
"grad_norm": 0.11893010139465332,
"learning_rate": 1.9982361470910342e-05,
"loss": 0.1828547865152359,
"step": 520
},
{
"epoch": 2.25,
"grad_norm": 0.12491466104984283,
"learning_rate": 1.9854849063428926e-05,
"loss": 0.6522985696792603,
"step": 522
},
{
"epoch": 2.2586206896551726,
"grad_norm": 0.15903355181217194,
"learning_rate": 1.9727445765791405e-05,
"loss": 0.47932472825050354,
"step": 524
},
{
"epoch": 2.2672413793103448,
"grad_norm": 0.09779471158981323,
"learning_rate": 1.9600158058172974e-05,
"loss": 0.4181676208972931,
"step": 526
},
{
"epoch": 2.2758620689655173,
"grad_norm": 0.07378951460123062,
"learning_rate": 1.9472992414869534e-05,
"loss": 0.46739447116851807,
"step": 528
},
{
"epoch": 2.2844827586206895,
"grad_norm": 0.04063527286052704,
"learning_rate": 1.9345955303968365e-05,
"loss": 0.38251054286956787,
"step": 530
},
{
"epoch": 2.293103448275862,
"grad_norm": 0.08258794993162155,
"learning_rate": 1.9219053187019144e-05,
"loss": 0.4366922080516815,
"step": 532
},
{
"epoch": 2.3017241379310347,
"grad_norm": 0.09015543758869171,
"learning_rate": 1.909229251870528e-05,
"loss": 0.4965798556804657,
"step": 534
},
{
"epoch": 2.310344827586207,
"grad_norm": 0.08743222802877426,
"learning_rate": 1.8965679746515628e-05,
"loss": 0.43146276473999023,
"step": 536
},
{
"epoch": 2.3189655172413794,
"grad_norm": 0.084476038813591,
"learning_rate": 1.88392213104165e-05,
"loss": 0.2771337330341339,
"step": 538
},
{
"epoch": 2.3275862068965516,
"grad_norm": 0.07576002180576324,
"learning_rate": 1.8712923642524175e-05,
"loss": 0.36878013610839844,
"step": 540
},
{
"epoch": 2.336206896551724,
"grad_norm": 0.10497633367776871,
"learning_rate": 1.858679316677767e-05,
"loss": 0.6058629751205444,
"step": 542
},
{
"epoch": 2.344827586206897,
"grad_norm": 0.13856923580169678,
"learning_rate": 1.8460836298612056e-05,
"loss": 0.6428977251052856,
"step": 544
},
{
"epoch": 2.353448275862069,
"grad_norm": 0.1172226220369339,
"learning_rate": 1.8335059444632078e-05,
"loss": 0.2821408212184906,
"step": 546
},
{
"epoch": 2.3620689655172415,
"grad_norm": 0.1798970252275467,
"learning_rate": 1.820946900228639e-05,
"loss": 0.8290093541145325,
"step": 548
},
{
"epoch": 2.3706896551724137,
"grad_norm": 0.2738807499408722,
"learning_rate": 1.808407135954204e-05,
"loss": 0.5475698709487915,
"step": 550
},
{
"epoch": 2.3793103448275863,
"grad_norm": 0.20505401492118835,
"learning_rate": 1.7958872894559666e-05,
"loss": 0.6245191693305969,
"step": 552
},
{
"epoch": 2.3879310344827585,
"grad_norm": 0.05477019026875496,
"learning_rate": 1.7833879975368994e-05,
"loss": 0.5108689665794373,
"step": 554
},
{
"epoch": 2.396551724137931,
"grad_norm": 0.09034960716962814,
"learning_rate": 1.7709098959545015e-05,
"loss": 0.5519805550575256,
"step": 556
},
{
"epoch": 2.405172413793103,
"grad_norm": 0.1560261845588684,
"learning_rate": 1.758453619388453e-05,
"loss": 0.4397192597389221,
"step": 558
},
{
"epoch": 2.413793103448276,
"grad_norm": 0.118907131254673,
"learning_rate": 1.7460198014083424e-05,
"loss": 0.38739266991615295,
"step": 560
},
{
"epoch": 2.4224137931034484,
"grad_norm": 0.23784895241260529,
"learning_rate": 1.733609074441433e-05,
"loss": 0.5064358711242676,
"step": 562
},
{
"epoch": 2.4310344827586206,
"grad_norm": 0.09993483871221542,
"learning_rate": 1.7212220697405003e-05,
"loss": 0.540324330329895,
"step": 564
},
{
"epoch": 2.439655172413793,
"grad_norm": 0.7780280113220215,
"learning_rate": 1.7088594173517225e-05,
"loss": 0.5431786179542542,
"step": 566
},
{
"epoch": 2.4482758620689653,
"grad_norm": 0.14646178483963013,
"learning_rate": 1.6965217460826345e-05,
"loss": 0.3365917205810547,
"step": 568
},
{
"epoch": 2.456896551724138,
"grad_norm": 0.07466763257980347,
"learning_rate": 1.6842096834701443e-05,
"loss": 0.6636412739753723,
"step": 570
},
{
"epoch": 2.4655172413793105,
"grad_norm": 0.3850714862346649,
"learning_rate": 1.6719238557486143e-05,
"loss": 0.3930183947086334,
"step": 572
},
{
"epoch": 2.4741379310344827,
"grad_norm": 0.12653613090515137,
"learning_rate": 1.6596648878180088e-05,
"loss": 0.4772527813911438,
"step": 574
},
{
"epoch": 2.4827586206896552,
"grad_norm": 0.10766978561878204,
"learning_rate": 1.647433403212112e-05,
"loss": 0.6689369082450867,
"step": 576
},
{
"epoch": 2.4913793103448274,
"grad_norm": 0.1643172800540924,
"learning_rate": 1.635230024066807e-05,
"loss": 0.5050515532493591,
"step": 578
},
{
"epoch": 2.5,
"grad_norm": 0.06176433712244034,
"learning_rate": 1.6230553710884373e-05,
"loss": 0.6936325430870056,
"step": 580
},
{
"epoch": 2.5086206896551726,
"grad_norm": 0.17540457844734192,
"learning_rate": 1.610910063522233e-05,
"loss": 0.5566367506980896,
"step": 582
},
{
"epoch": 2.5172413793103448,
"grad_norm": 0.09146937727928162,
"learning_rate": 1.598794719120816e-05,
"loss": 0.5264196991920471,
"step": 584
},
{
"epoch": 2.5258620689655173,
"grad_norm": 0.08665334433317184,
"learning_rate": 1.5867099541127737e-05,
"loss": 0.4999127686023712,
"step": 586
},
{
"epoch": 2.5344827586206895,
"grad_norm": 0.05140522122383118,
"learning_rate": 1.5746563831713236e-05,
"loss": 0.5660111308097839,
"step": 588
},
{
"epoch": 2.543103448275862,
"grad_norm": 0.08618345856666565,
"learning_rate": 1.56263461938304e-05,
"loss": 0.7160353064537048,
"step": 590
},
{
"epoch": 2.5517241379310347,
"grad_norm": 0.05319703742861748,
"learning_rate": 1.5506452742166796e-05,
"loss": 0.575738251209259,
"step": 592
},
{
"epoch": 2.560344827586207,
"grad_norm": 0.29011279344558716,
"learning_rate": 1.5386889574920692e-05,
"loss": 0.35511380434036255,
"step": 594
},
{
"epoch": 2.5689655172413794,
"grad_norm": 0.07296542078256607,
"learning_rate": 1.5267662773491e-05,
"loss": 0.40391749143600464,
"step": 596
},
{
"epoch": 2.5775862068965516,
"grad_norm": 0.09713292866945267,
"learning_rate": 1.514877840216785e-05,
"loss": 0.5037810802459717,
"step": 598
},
{
"epoch": 2.586206896551724,
"grad_norm": 0.1726667881011963,
"learning_rate": 1.5030242507824215e-05,
"loss": 0.6312216520309448,
"step": 600
},
{
"epoch": 2.594827586206897,
"grad_norm": 0.0342765673995018,
"learning_rate": 1.4912061119608292e-05,
"loss": 0.39456382393836975,
"step": 602
},
{
"epoch": 2.603448275862069,
"grad_norm": 0.45015275478363037,
"learning_rate": 1.4794240248636885e-05,
"loss": 0.5595788359642029,
"step": 604
},
{
"epoch": 2.612068965517241,
"grad_norm": 0.10634768009185791,
"learning_rate": 1.4676785887689614e-05,
"loss": 0.41876575350761414,
"step": 606
},
{
"epoch": 2.6206896551724137,
"grad_norm": 0.06522602587938309,
"learning_rate": 1.4559704010904145e-05,
"loss": 0.6346225142478943,
"step": 608
},
{
"epoch": 2.6293103448275863,
"grad_norm": 0.24831700325012207,
"learning_rate": 1.444300057347229e-05,
"loss": 0.5777739882469177,
"step": 610
},
{
"epoch": 2.637931034482759,
"grad_norm": 0.06677041202783585,
"learning_rate": 1.432668151133712e-05,
"loss": 0.5916672945022583,
"step": 612
},
{
"epoch": 2.646551724137931,
"grad_norm": 0.09093949943780899,
"learning_rate": 1.4210752740891032e-05,
"loss": 0.5175487995147705,
"step": 614
},
{
"epoch": 2.655172413793103,
"grad_norm": 0.1291448175907135,
"learning_rate": 1.4095220158674851e-05,
"loss": 0.37486380338668823,
"step": 616
},
{
"epoch": 2.663793103448276,
"grad_norm": 0.10089799761772156,
"learning_rate": 1.3980089641077864e-05,
"loss": 0.5902385115623474,
"step": 618
},
{
"epoch": 2.6724137931034484,
"grad_norm": 0.3151969611644745,
"learning_rate": 1.3865367044038972e-05,
"loss": 0.3626130223274231,
"step": 620
},
{
"epoch": 2.6810344827586206,
"grad_norm": 0.10858116298913956,
"learning_rate": 1.3751058202748815e-05,
"loss": 0.6260622143745422,
"step": 622
},
{
"epoch": 2.689655172413793,
"grad_norm": 0.09145694226026535,
"learning_rate": 1.3637168931352952e-05,
"loss": 0.3847617506980896,
"step": 624
},
{
"epoch": 2.6982758620689653,
"grad_norm": 0.10181720554828644,
"learning_rate": 1.3523705022656194e-05,
"loss": 0.5213911533355713,
"step": 626
},
{
"epoch": 2.706896551724138,
"grad_norm": 0.07265552878379822,
"learning_rate": 1.3410672247827887e-05,
"loss": 0.3843521475791931,
"step": 628
},
{
"epoch": 2.7155172413793105,
"grad_norm": 0.06394084542989731,
"learning_rate": 1.3298076356108431e-05,
"loss": 0.7390468716621399,
"step": 630
},
{
"epoch": 2.7241379310344827,
"grad_norm": 0.08277060091495514,
"learning_rate": 1.318592307451683e-05,
"loss": 0.3152429461479187,
"step": 632
},
{
"epoch": 2.7327586206896552,
"grad_norm": 0.06954030692577362,
"learning_rate": 1.307421810755938e-05,
"loss": 0.5903550982475281,
"step": 634
},
{
"epoch": 2.7413793103448274,
"grad_norm": 0.14430810511112213,
"learning_rate": 1.296296713693956e-05,
"loss": 0.4196533262729645,
"step": 636
},
{
"epoch": 2.75,
"grad_norm": 0.049837417900562286,
"learning_rate": 1.2852175821268977e-05,
"loss": 0.5849826335906982,
"step": 638
},
{
"epoch": 2.7586206896551726,
"grad_norm": 0.16439993679523468,
"learning_rate": 1.274184979577963e-05,
"loss": 0.40721848607063293,
"step": 640
},
{
"epoch": 2.7672413793103448,
"grad_norm": 0.15708234906196594,
"learning_rate": 1.2631994672037205e-05,
"loss": 0.5138668417930603,
"step": 642
},
{
"epoch": 2.7758620689655173,
"grad_norm": 0.0595339760184288,
"learning_rate": 1.2522616037655713e-05,
"loss": 0.6097421646118164,
"step": 644
},
{
"epoch": 2.7844827586206895,
"grad_norm": 0.14719434082508087,
"learning_rate": 1.2413719456013231e-05,
"loss": 0.5522211194038391,
"step": 646
},
{
"epoch": 2.793103448275862,
"grad_norm": 0.06864980608224869,
"learning_rate": 1.2305310465968985e-05,
"loss": 0.3453619182109833,
"step": 648
},
{
"epoch": 2.8017241379310347,
"grad_norm": 0.05219966545701027,
"learning_rate": 1.2197394581581561e-05,
"loss": 0.7121859788894653,
"step": 650
},
{
"epoch": 2.810344827586207,
"grad_norm": 0.24679023027420044,
"learning_rate": 1.2089977291828512e-05,
"loss": 0.7990239262580872,
"step": 652
},
{
"epoch": 2.8189655172413794,
"grad_norm": 0.15024927258491516,
"learning_rate": 1.1983064060327098e-05,
"loss": 0.6081220507621765,
"step": 654
},
{
"epoch": 2.8275862068965516,
"grad_norm": 0.05443995073437691,
"learning_rate": 1.187666032505645e-05,
"loss": 0.43975335359573364,
"step": 656
},
{
"epoch": 2.836206896551724,
"grad_norm": 0.05697048828005791,
"learning_rate": 1.1770771498080921e-05,
"loss": 0.6137202978134155,
"step": 658
},
{
"epoch": 2.844827586206897,
"grad_norm": 0.11451619118452072,
"learning_rate": 1.1665402965274866e-05,
"loss": 0.20562584698200226,
"step": 660
},
{
"epoch": 2.853448275862069,
"grad_norm": 0.22301547229290009,
"learning_rate": 1.1560560086048632e-05,
"loss": 0.42035165429115295,
"step": 662
},
{
"epoch": 2.862068965517241,
"grad_norm": 0.15491816401481628,
"learning_rate": 1.1456248193076027e-05,
"loss": 0.6786882877349854,
"step": 664
},
{
"epoch": 2.8706896551724137,
"grad_norm": 0.06417909264564514,
"learning_rate": 1.1352472592023026e-05,
"loss": 0.34481775760650635,
"step": 666
},
{
"epoch": 2.8793103448275863,
"grad_norm": 0.2559848129749298,
"learning_rate": 1.1249238561277957e-05,
"loss": 0.37077146768569946,
"step": 668
},
{
"epoch": 2.887931034482759,
"grad_norm": 0.07367434352636337,
"learning_rate": 1.1146551351682962e-05,
"loss": 0.6234573125839233,
"step": 670
},
{
"epoch": 2.896551724137931,
"grad_norm": 0.13318119943141937,
"learning_rate": 1.1044416186266985e-05,
"loss": 0.43646591901779175,
"step": 672
},
{
"epoch": 2.905172413793103,
"grad_norm": 0.04189766198396683,
"learning_rate": 1.0942838259980065e-05,
"loss": 0.6099374890327454,
"step": 674
},
{
"epoch": 2.913793103448276,
"grad_norm": 0.16093385219573975,
"learning_rate": 1.0841822739429131e-05,
"loss": 0.5961918830871582,
"step": 676
},
{
"epoch": 2.9224137931034484,
"grad_norm": 0.05338941141963005,
"learning_rate": 1.0741374762615181e-05,
"loss": 0.5247670412063599,
"step": 678
},
{
"epoch": 2.9310344827586206,
"grad_norm": 0.06662659347057343,
"learning_rate": 1.0641499438671994e-05,
"loss": 0.4245750606060028,
"step": 680
},
{
"epoch": 2.939655172413793,
"grad_norm": 0.03824161738157272,
"learning_rate": 1.054220184760619e-05,
"loss": 0.21983936429023743,
"step": 682
},
{
"epoch": 2.9482758620689653,
"grad_norm": 0.061386823654174805,
"learning_rate": 1.0443487040038919e-05,
"loss": 0.3854738771915436,
"step": 684
},
{
"epoch": 2.956896551724138,
"grad_norm": 0.06032966449856758,
"learning_rate": 1.0345360036948912e-05,
"loss": 0.6782163381576538,
"step": 686
},
{
"epoch": 2.9655172413793105,
"grad_norm": 0.06708291918039322,
"learning_rate": 1.0247825829417132e-05,
"loss": 0.5401458740234375,
"step": 688
},
{
"epoch": 2.9741379310344827,
"grad_norm": 0.0782044380903244,
"learning_rate": 1.0150889378372878e-05,
"loss": 0.7114209532737732,
"step": 690
},
{
"epoch": 2.9827586206896552,
"grad_norm": 0.06770720332860947,
"learning_rate": 1.00545556143415e-05,
"loss": 0.660466730594635,
"step": 692
},
{
"epoch": 2.9913793103448274,
"grad_norm": 0.07091684639453888,
"learning_rate": 9.958829437193558e-06,
"loss": 0.4320341944694519,
"step": 694
},
{
"epoch": 3.0,
"grad_norm": 0.06834368407726288,
"learning_rate": 9.863715715895658e-06,
"loss": 0.6856396198272705,
"step": 696
},
{
"epoch": 3.0086206896551726,
"grad_norm": 0.03995652124285698,
"learning_rate": 9.769219288262745e-06,
"loss": 0.16509434580802917,
"step": 698
},
{
"epoch": 3.0172413793103448,
"grad_norm": 0.043883178383111954,
"learning_rate": 9.675344960712074e-06,
"loss": 0.29928964376449585,
"step": 700
},
{
"epoch": 3.0258620689655173,
"grad_norm": 0.0733269527554512,
"learning_rate": 9.582097508018724e-06,
"loss": 0.25162428617477417,
"step": 702
},
{
"epoch": 3.0344827586206895,
"grad_norm": 0.12920475006103516,
"learning_rate": 9.489481673072723e-06,
"loss": 0.3514169454574585,
"step": 704
},
{
"epoch": 3.043103448275862,
"grad_norm": 0.017986657097935677,
"learning_rate": 9.397502166637837e-06,
"loss": 0.07074951380491257,
"step": 706
},
{
"epoch": 3.0517241379310347,
"grad_norm": 0.07337481528520584,
"learning_rate": 9.30616366711195e-06,
"loss": 0.20599356293678284,
"step": 708
},
{
"epoch": 3.060344827586207,
"grad_norm": 0.03576648607850075,
"learning_rate": 9.21547082028908e-06,
"loss": 0.11480194330215454,
"step": 710
},
{
"epoch": 3.0689655172413794,
"grad_norm": 0.38087305426597595,
"learning_rate": 9.125428239123133e-06,
"loss": 0.26979854702949524,
"step": 712
},
{
"epoch": 3.0775862068965516,
"grad_norm": 0.0725908949971199,
"learning_rate": 9.036040503493213e-06,
"loss": 0.42210009694099426,
"step": 714
},
{
"epoch": 3.086206896551724,
"grad_norm": 0.14822497963905334,
"learning_rate": 8.947312159970725e-06,
"loss": 0.1675470620393753,
"step": 716
},
{
"epoch": 3.0948275862068964,
"grad_norm": 0.08073808997869492,
"learning_rate": 8.859247721588064e-06,
"loss": 0.20833522081375122,
"step": 718
},
{
"epoch": 3.103448275862069,
"grad_norm": 0.057046178728342056,
"learning_rate": 8.77185166760914e-06,
"loss": 0.16950953006744385,
"step": 720
},
{
"epoch": 3.1120689655172415,
"grad_norm": 0.10354648530483246,
"learning_rate": 8.685128443301465e-06,
"loss": 0.12641456723213196,
"step": 722
},
{
"epoch": 3.1206896551724137,
"grad_norm": 0.05845208466053009,
"learning_rate": 8.599082459710125e-06,
"loss": 0.13568329811096191,
"step": 724
},
{
"epoch": 3.1293103448275863,
"grad_norm": 0.04908813536167145,
"learning_rate": 8.513718093433354e-06,
"loss": 0.21239104866981506,
"step": 726
},
{
"epoch": 3.1379310344827585,
"grad_norm": 0.13193517923355103,
"learning_rate": 8.42903968639999e-06,
"loss": 0.2763456702232361,
"step": 728
},
{
"epoch": 3.146551724137931,
"grad_norm": 0.03571261465549469,
"learning_rate": 8.345051545648565e-06,
"loss": 0.12836386263370514,
"step": 730
},
{
"epoch": 3.1551724137931036,
"grad_norm": 0.06112167611718178,
"learning_rate": 8.261757943108296e-06,
"loss": 0.16560682654380798,
"step": 732
},
{
"epoch": 3.163793103448276,
"grad_norm": 0.0860171988606453,
"learning_rate": 8.179163115381737e-06,
"loss": 0.2081730216741562,
"step": 734
},
{
"epoch": 3.1724137931034484,
"grad_norm": 0.03247256577014923,
"learning_rate": 8.097271263529346e-06,
"loss": 0.14392191171646118,
"step": 736
},
{
"epoch": 3.1810344827586206,
"grad_norm": 0.0918356403708458,
"learning_rate": 8.016086552855771e-06,
"loss": 0.15577132999897003,
"step": 738
},
{
"epoch": 3.189655172413793,
"grad_norm": 0.06287133693695068,
"learning_rate": 7.935613112698003e-06,
"loss": 0.0789552852511406,
"step": 740
},
{
"epoch": 3.1982758620689653,
"grad_norm": 0.03986852988600731,
"learning_rate": 7.855855036215328e-06,
"loss": 0.10101716220378876,
"step": 742
},
{
"epoch": 3.206896551724138,
"grad_norm": 0.13693907856941223,
"learning_rate": 7.776816380181165e-06,
"loss": 0.1658182144165039,
"step": 744
},
{
"epoch": 3.2155172413793105,
"grad_norm": 0.14548790454864502,
"learning_rate": 7.698501164776679e-06,
"loss": 0.19248032569885254,
"step": 746
},
{
"epoch": 3.2241379310344827,
"grad_norm": 0.05582420900464058,
"learning_rate": 7.620913373386356e-06,
"loss": 0.21470694243907928,
"step": 748
},
{
"epoch": 3.2327586206896552,
"grad_norm": 0.04277574643492699,
"learning_rate": 7.5440569523953315e-06,
"loss": 0.15740104019641876,
"step": 750
},
{
"epoch": 3.2413793103448274,
"grad_norm": 0.14733938872814178,
"learning_rate": 7.467935810988729e-06,
"loss": 0.18646365404129028,
"step": 752
},
{
"epoch": 3.25,
"grad_norm": 0.06095249578356743,
"learning_rate": 7.392553820952764e-06,
"loss": 0.22709967195987701,
"step": 754
},
{
"epoch": 3.2586206896551726,
"grad_norm": 0.04888584464788437,
"learning_rate": 7.317914816477865e-06,
"loss": 0.1782107949256897,
"step": 756
},
{
"epoch": 3.2672413793103448,
"grad_norm": 0.2761983275413513,
"learning_rate": 7.244022593963609e-06,
"loss": 0.19192323088645935,
"step": 758
},
{
"epoch": 3.2758620689655173,
"grad_norm": 0.041269440203905106,
"learning_rate": 7.170880911825657e-06,
"loss": 0.13779321312904358,
"step": 760
},
{
"epoch": 3.2844827586206895,
"grad_norm": 0.2219523787498474,
"learning_rate": 7.098493490304566e-06,
"loss": 0.24427469074726105,
"step": 762
},
{
"epoch": 3.293103448275862,
"grad_norm": 0.7461491227149963,
"learning_rate": 7.026864011276575e-06,
"loss": 0.32002437114715576,
"step": 764
},
{
"epoch": 3.3017241379310347,
"grad_norm": 0.100465789437294,
"learning_rate": 6.955996118066326e-06,
"loss": 0.11214806139469147,
"step": 766
},
{
"epoch": 3.310344827586207,
"grad_norm": 0.06019704416394234,
"learning_rate": 6.8858934152615646e-06,
"loss": 0.1987936794757843,
"step": 768
},
{
"epoch": 3.3189655172413794,
"grad_norm": 0.12379293888807297,
"learning_rate": 6.816559468529773e-06,
"loss": 0.058321211487054825,
"step": 770
},
{
"epoch": 3.3275862068965516,
"grad_norm": 0.3285755515098572,
"learning_rate": 6.747997804436846e-06,
"loss": 0.08903615176677704,
"step": 772
},
{
"epoch": 3.336206896551724,
"grad_norm": 0.11563495546579361,
"learning_rate": 6.680211910267665e-06,
"loss": 0.35364535450935364,
"step": 774
},
{
"epoch": 3.344827586206897,
"grad_norm": 0.07364711910486221,
"learning_rate": 6.613205233848783e-06,
"loss": 0.20209553837776184,
"step": 776
},
{
"epoch": 3.353448275862069,
"grad_norm": 0.0495804026722908,
"learning_rate": 6.546981183373009e-06,
"loss": 0.19359779357910156,
"step": 778
},
{
"epoch": 3.3620689655172415,
"grad_norm": 0.13539589941501617,
"learning_rate": 6.481543127226073e-06,
"loss": 0.28171947598457336,
"step": 780
},
{
"epoch": 3.3706896551724137,
"grad_norm": 0.07525072246789932,
"learning_rate": 6.4168943938153e-06,
"loss": 0.1644493192434311,
"step": 782
},
{
"epoch": 3.3793103448275863,
"grad_norm": 0.04455971717834473,
"learning_rate": 6.353038271400319e-06,
"loss": 0.17818251252174377,
"step": 784
},
{
"epoch": 3.3879310344827585,
"grad_norm": 0.04888049513101578,
"learning_rate": 6.289978007925791e-06,
"loss": 0.08893375098705292,
"step": 786
},
{
"epoch": 3.396551724137931,
"grad_norm": 0.0592099204659462,
"learning_rate": 6.227716810856235e-06,
"loss": 0.16159863770008087,
"step": 788
},
{
"epoch": 3.405172413793103,
"grad_norm": 0.05931266024708748,
"learning_rate": 6.1662578470128595e-06,
"loss": 0.19914919137954712,
"step": 790
},
{
"epoch": 3.413793103448276,
"grad_norm": 0.05199761316180229,
"learning_rate": 6.105604242412507e-06,
"loss": 0.1833517998456955,
"step": 792
},
{
"epoch": 3.4224137931034484,
"grad_norm": 0.07053744047880173,
"learning_rate": 6.0457590821086364e-06,
"loss": 0.1568892002105713,
"step": 794
},
{
"epoch": 3.4310344827586206,
"grad_norm": 0.11103974282741547,
"learning_rate": 5.9867254100344305e-06,
"loss": 0.5605343580245972,
"step": 796
},
{
"epoch": 3.439655172413793,
"grad_norm": 0.1462671458721161,
"learning_rate": 5.92850622884794e-06,
"loss": 0.2542985677719116,
"step": 798
},
{
"epoch": 3.4482758620689653,
"grad_norm": 0.07662937045097351,
"learning_rate": 5.871104499779383e-06,
"loss": 0.3042844533920288,
"step": 800
},
{
"epoch": 3.456896551724138,
"grad_norm": 0.40208032727241516,
"learning_rate": 5.814523142480514e-06,
"loss": 0.23688863217830658,
"step": 802
},
{
"epoch": 3.4655172413793105,
"grad_norm": 0.0428071990609169,
"learning_rate": 5.758765034876124e-06,
"loss": 0.1413598358631134,
"step": 804
},
{
"epoch": 3.4741379310344827,
"grad_norm": 0.18820738792419434,
"learning_rate": 5.703833013017659e-06,
"loss": 0.26621344685554504,
"step": 806
},
{
"epoch": 3.4827586206896552,
"grad_norm": 0.0474395789206028,
"learning_rate": 5.649729870938974e-06,
"loss": 0.1856929361820221,
"step": 808
},
{
"epoch": 3.4913793103448274,
"grad_norm": 0.041845474392175674,
"learning_rate": 5.596458360514197e-06,
"loss": 0.11116787791252136,
"step": 810
},
{
"epoch": 3.5,
"grad_norm": 0.06532273441553116,
"learning_rate": 5.544021191317797e-06,
"loss": 0.2585083842277527,
"step": 812
},
{
"epoch": 3.5086206896551726,
"grad_norm": 0.17792125046253204,
"learning_rate": 5.492421030486723e-06,
"loss": 0.24390508234500885,
"step": 814
},
{
"epoch": 3.5172413793103448,
"grad_norm": 0.06345438957214355,
"learning_rate": 5.441660502584782e-06,
"loss": 0.19690856337547302,
"step": 816
},
{
"epoch": 3.5258620689655173,
"grad_norm": 0.06546366214752197,
"learning_rate": 5.391742189469118e-06,
"loss": 0.18222372233867645,
"step": 818
},
{
"epoch": 3.5344827586206895,
"grad_norm": 0.06039542332291603,
"learning_rate": 5.342668630158901e-06,
"loss": 0.14991328120231628,
"step": 820
},
{
"epoch": 3.543103448275862,
"grad_norm": 0.08110994100570679,
"learning_rate": 5.294442320706179e-06,
"loss": 0.12025367468595505,
"step": 822
},
{
"epoch": 3.5517241379310347,
"grad_norm": 0.04069434478878975,
"learning_rate": 5.247065714068933e-06,
"loss": 0.0922561064362526,
"step": 824
},
{
"epoch": 3.560344827586207,
"grad_norm": 0.012679479084908962,
"learning_rate": 5.200541219986286e-06,
"loss": 0.03818206116557121,
"step": 826
},
{
"epoch": 3.5689655172413794,
"grad_norm": 0.22782403230667114,
"learning_rate": 5.1548712048559655e-06,
"loss": 0.2238304615020752,
"step": 828
},
{
"epoch": 3.5775862068965516,
"grad_norm": 0.05799900367856026,
"learning_rate": 5.110057991613912e-06,
"loss": 0.1549633890390396,
"step": 830
},
{
"epoch": 3.586206896551724,
"grad_norm": 0.05750008672475815,
"learning_rate": 5.0661038596161515e-06,
"loss": 0.14927032589912415,
"step": 832
},
{
"epoch": 3.594827586206897,
"grad_norm": 0.24478773772716522,
"learning_rate": 5.023011044522834e-06,
"loss": 0.2999204397201538,
"step": 834
},
{
"epoch": 3.603448275862069,
"grad_norm": 0.07759716361761093,
"learning_rate": 4.980781738184549e-06,
"loss": 0.20024727284908295,
"step": 836
},
{
"epoch": 3.612068965517241,
"grad_norm": 0.0670485645532608,
"learning_rate": 4.939418088530811e-06,
"loss": 0.13863810896873474,
"step": 838
},
{
"epoch": 3.6206896551724137,
"grad_norm": 0.12198883295059204,
"learning_rate": 4.898922199460831e-06,
"loss": 0.18534965813159943,
"step": 840
},
{
"epoch": 3.6293103448275863,
"grad_norm": 0.1402168571949005,
"learning_rate": 4.859296130736489e-06,
"loss": 0.15518294274806976,
"step": 842
},
{
"epoch": 3.637931034482759,
"grad_norm": 0.06257359683513641,
"learning_rate": 4.820541897877585e-06,
"loss": 0.23298737406730652,
"step": 844
},
{
"epoch": 3.646551724137931,
"grad_norm": 0.11765491217374802,
"learning_rate": 4.782661472059298e-06,
"loss": 0.264419823884964,
"step": 846
},
{
"epoch": 3.655172413793103,
"grad_norm": 0.03430064767599106,
"learning_rate": 4.745656780011951e-06,
"loss": 0.15973711013793945,
"step": 848
},
{
"epoch": 3.663793103448276,
"grad_norm": 0.03395868092775345,
"learning_rate": 4.709529703922993e-06,
"loss": 0.17208503186702728,
"step": 850
},
{
"epoch": 3.6724137931034484,
"grad_norm": 0.08469868451356888,
"learning_rate": 4.674282081341271e-06,
"loss": 0.19475609064102173,
"step": 852
},
{
"epoch": 3.6810344827586206,
"grad_norm": 0.06020957604050636,
"learning_rate": 4.639915705083572e-06,
"loss": 0.1562570333480835,
"step": 854
},
{
"epoch": 3.689655172413793,
"grad_norm": 0.07793577015399933,
"learning_rate": 4.606432323143412e-06,
"loss": 0.15900962054729462,
"step": 856
},
{
"epoch": 3.6982758620689653,
"grad_norm": 0.13332881033420563,
"learning_rate": 4.573833638602159e-06,
"loss": 0.22381483018398285,
"step": 858
},
{
"epoch": 3.706896551724138,
"grad_norm": 0.02578054927289486,
"learning_rate": 4.542121309542383e-06,
"loss": 0.09598782658576965,
"step": 860
},
{
"epoch": 3.7155172413793105,
"grad_norm": 0.15609142184257507,
"learning_rate": 4.511296948963527e-06,
"loss": 0.19943147897720337,
"step": 862
},
{
"epoch": 3.7241379310344827,
"grad_norm": 0.01152154989540577,
"learning_rate": 4.4813621246998765e-06,
"loss": 0.07272744178771973,
"step": 864
},
{
"epoch": 3.7327586206896552,
"grad_norm": 0.06737919896841049,
"learning_rate": 4.45231835934079e-06,
"loss": 0.1303609311580658,
"step": 866
},
{
"epoch": 3.7413793103448274,
"grad_norm": 0.2695164084434509,
"learning_rate": 4.424167130153277e-06,
"loss": 0.18073761463165283,
"step": 868
},
{
"epoch": 3.75,
"grad_norm": 0.044187769293785095,
"learning_rate": 4.396909869006847e-06,
"loss": 0.12275875359773636,
"step": 870
},
{
"epoch": 3.7586206896551726,
"grad_norm": 0.008070076815783978,
"learning_rate": 4.3705479623006866e-06,
"loss": 0.06019383668899536,
"step": 872
},
{
"epoch": 3.7672413793103448,
"grad_norm": 0.06783478707075119,
"learning_rate": 4.345082750893132e-06,
"loss": 0.10059908032417297,
"step": 874
},
{
"epoch": 3.7758620689655173,
"grad_norm": 0.10238350927829742,
"learning_rate": 4.320515530033487e-06,
"loss": 0.30081015825271606,
"step": 876
},
{
"epoch": 3.7844827586206895,
"grad_norm": 0.14017876982688904,
"learning_rate": 4.296847549296115e-06,
"loss": 0.30149415135383606,
"step": 878
},
{
"epoch": 3.793103448275862,
"grad_norm": 0.06336027383804321,
"learning_rate": 4.274080012516909e-06,
"loss": 0.13996456563472748,
"step": 880
},
{
"epoch": 3.8017241379310347,
"grad_norm": 0.04175444692373276,
"learning_rate": 4.2522140777320404e-06,
"loss": 0.09511252492666245,
"step": 882
},
{
"epoch": 3.810344827586207,
"grad_norm": 0.05849481746554375,
"learning_rate": 4.23125085711907e-06,
"loss": 0.23412549495697021,
"step": 884
},
{
"epoch": 3.8189655172413794,
"grad_norm": 0.06667976826429367,
"learning_rate": 4.21119141694037e-06,
"loss": 0.160780131816864,
"step": 886
},
{
"epoch": 3.8275862068965516,
"grad_norm": 0.056150369346141815,
"learning_rate": 4.192036777488896e-06,
"loss": 0.11835036426782608,
"step": 888
},
{
"epoch": 3.836206896551724,
"grad_norm": 0.12212974578142166,
"learning_rate": 4.173787913036284e-06,
"loss": 0.11370360106229782,
"step": 890
},
{
"epoch": 3.844827586206897,
"grad_norm": 0.08489777147769928,
"learning_rate": 4.156445751783308e-06,
"loss": 0.17437399923801422,
"step": 892
},
{
"epoch": 3.853448275862069,
"grad_norm": 0.03076266683638096,
"learning_rate": 4.140011175812656e-06,
"loss": 0.15946733951568604,
"step": 894
},
{
"epoch": 3.862068965517241,
"grad_norm": 0.044967204332351685,
"learning_rate": 4.124485021044069e-06,
"loss": 0.16160649061203003,
"step": 896
},
{
"epoch": 3.8706896551724137,
"grad_norm": 0.06540035456418991,
"learning_rate": 4.1098680771918245e-06,
"loss": 0.13039463758468628,
"step": 898
},
{
"epoch": 3.8793103448275863,
"grad_norm": 0.0530594103038311,
"learning_rate": 4.096161087724573e-06,
"loss": 0.16959071159362793,
"step": 900
},
{
"epoch": 3.887931034482759,
"grad_norm": 0.15922929346561432,
"learning_rate": 4.0833647498275085e-06,
"loss": 0.20945216715335846,
"step": 902
},
{
"epoch": 3.896551724137931,
"grad_norm": 0.06301749497652054,
"learning_rate": 4.07147971436692e-06,
"loss": 0.22212789952754974,
"step": 904
},
{
"epoch": 3.905172413793103,
"grad_norm": 0.15187880396842957,
"learning_rate": 4.060506585857085e-06,
"loss": 0.21481694281101227,
"step": 906
},
{
"epoch": 3.913793103448276,
"grad_norm": 0.32235345244407654,
"learning_rate": 4.0504459224295174e-06,
"loss": 0.16184020042419434,
"step": 908
},
{
"epoch": 3.9224137931034484,
"grad_norm": 0.07125352323055267,
"learning_rate": 4.041298235804577e-06,
"loss": 0.1316578984260559,
"step": 910
},
{
"epoch": 3.9310344827586206,
"grad_norm": 0.04981033504009247,
"learning_rate": 4.0330639912654516e-06,
"loss": 0.11852114647626877,
"step": 912
},
{
"epoch": 3.939655172413793,
"grad_norm": 0.11605612933635712,
"learning_rate": 4.02574360763448e-06,
"loss": 0.16175302863121033,
"step": 914
},
{
"epoch": 3.9482758620689653,
"grad_norm": 0.05728490650653839,
"learning_rate": 4.019337457251857e-06,
"loss": 0.16411718726158142,
"step": 916
},
{
"epoch": 3.956896551724138,
"grad_norm": 0.08225003629922867,
"learning_rate": 4.013845865956692e-06,
"loss": 0.22733992338180542,
"step": 918
},
{
"epoch": 3.9655172413793105,
"grad_norm": 0.2269326150417328,
"learning_rate": 4.00926911307043e-06,
"loss": 0.1820860654115677,
"step": 920
},
{
"epoch": 3.9741379310344827,
"grad_norm": 0.06667070835828781,
"learning_rate": 4.005607431382659e-06,
"loss": 0.15438126027584076,
"step": 922
},
{
"epoch": 3.9827586206896552,
"grad_norm": 0.05220466107130051,
"learning_rate": 4.002861007139253e-06,
"loss": 0.17508190870285034,
"step": 924
},
{
"epoch": 3.9913793103448274,
"grad_norm": 0.07188162952661514,
"learning_rate": 4.001029980032909e-06,
"loss": 0.1996261328458786,
"step": 926
},
{
"epoch": 4.0,
"grad_norm": 0.3331678509712219,
"learning_rate": 4.000114443196044e-06,
"loss": 0.2806675136089325,
"step": 928
},
{
"epoch": 4.0,
"step": 928,
"total_flos": 3.61934117404672e+18,
"train_loss": 0.7229323635552207,
"train_runtime": 32622.9814,
"train_samples_per_second": 1.707,
"train_steps_per_second": 0.028
}
],
"logging_steps": 2,
"max_steps": 928,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 99999,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.61934117404672e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}