1epoch_CuratedThoughts / trainer_state.json
sedrickkeh's picture
End of training
d21d7d1 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.999288256227758,
"eval_steps": 500,
"global_step": 936,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0010676156583629894,
"grad_norm": 6.073639869689941,
"learning_rate": 1.0638297872340426e-07,
"loss": 0.8861,
"step": 1
},
{
"epoch": 0.002135231316725979,
"grad_norm": 5.964370250701904,
"learning_rate": 2.1276595744680852e-07,
"loss": 0.8784,
"step": 2
},
{
"epoch": 0.003202846975088968,
"grad_norm": 5.9699530601501465,
"learning_rate": 3.1914893617021275e-07,
"loss": 0.9081,
"step": 3
},
{
"epoch": 0.004270462633451958,
"grad_norm": 6.123816013336182,
"learning_rate": 4.2553191489361704e-07,
"loss": 0.923,
"step": 4
},
{
"epoch": 0.005338078291814947,
"grad_norm": 6.031068801879883,
"learning_rate": 5.319148936170213e-07,
"loss": 0.8834,
"step": 5
},
{
"epoch": 0.006405693950177936,
"grad_norm": 5.705842018127441,
"learning_rate": 6.382978723404255e-07,
"loss": 0.8708,
"step": 6
},
{
"epoch": 0.007473309608540925,
"grad_norm": 5.794719696044922,
"learning_rate": 7.446808510638298e-07,
"loss": 0.8466,
"step": 7
},
{
"epoch": 0.008540925266903915,
"grad_norm": 5.5866618156433105,
"learning_rate": 8.510638297872341e-07,
"loss": 0.8647,
"step": 8
},
{
"epoch": 0.009608540925266904,
"grad_norm": 5.529083251953125,
"learning_rate": 9.574468085106384e-07,
"loss": 0.8451,
"step": 9
},
{
"epoch": 0.010676156583629894,
"grad_norm": 5.221846580505371,
"learning_rate": 1.0638297872340427e-06,
"loss": 0.8676,
"step": 10
},
{
"epoch": 0.011743772241992882,
"grad_norm": 4.504139423370361,
"learning_rate": 1.170212765957447e-06,
"loss": 0.8504,
"step": 11
},
{
"epoch": 0.012811387900355872,
"grad_norm": 4.460880756378174,
"learning_rate": 1.276595744680851e-06,
"loss": 0.85,
"step": 12
},
{
"epoch": 0.013879003558718862,
"grad_norm": 4.31349515914917,
"learning_rate": 1.3829787234042555e-06,
"loss": 0.8676,
"step": 13
},
{
"epoch": 0.01494661921708185,
"grad_norm": 2.5565595626831055,
"learning_rate": 1.4893617021276596e-06,
"loss": 0.8101,
"step": 14
},
{
"epoch": 0.01601423487544484,
"grad_norm": 2.412811040878296,
"learning_rate": 1.595744680851064e-06,
"loss": 0.7941,
"step": 15
},
{
"epoch": 0.01708185053380783,
"grad_norm": 2.3634886741638184,
"learning_rate": 1.7021276595744682e-06,
"loss": 0.7672,
"step": 16
},
{
"epoch": 0.018149466192170817,
"grad_norm": 2.1130712032318115,
"learning_rate": 1.8085106382978727e-06,
"loss": 0.7964,
"step": 17
},
{
"epoch": 0.019217081850533807,
"grad_norm": 1.9730169773101807,
"learning_rate": 1.9148936170212767e-06,
"loss": 0.7533,
"step": 18
},
{
"epoch": 0.020284697508896797,
"grad_norm": 2.553852081298828,
"learning_rate": 2.021276595744681e-06,
"loss": 0.7736,
"step": 19
},
{
"epoch": 0.021352313167259787,
"grad_norm": 3.0640649795532227,
"learning_rate": 2.1276595744680853e-06,
"loss": 0.7367,
"step": 20
},
{
"epoch": 0.022419928825622777,
"grad_norm": 3.216430902481079,
"learning_rate": 2.2340425531914894e-06,
"loss": 0.7172,
"step": 21
},
{
"epoch": 0.023487544483985764,
"grad_norm": 3.115963935852051,
"learning_rate": 2.340425531914894e-06,
"loss": 0.722,
"step": 22
},
{
"epoch": 0.024555160142348754,
"grad_norm": 2.9504928588867188,
"learning_rate": 2.446808510638298e-06,
"loss": 0.7265,
"step": 23
},
{
"epoch": 0.025622775800711744,
"grad_norm": 2.530184030532837,
"learning_rate": 2.553191489361702e-06,
"loss": 0.7168,
"step": 24
},
{
"epoch": 0.026690391459074734,
"grad_norm": 2.288259506225586,
"learning_rate": 2.6595744680851065e-06,
"loss": 0.7317,
"step": 25
},
{
"epoch": 0.027758007117437724,
"grad_norm": 1.7040042877197266,
"learning_rate": 2.765957446808511e-06,
"loss": 0.7177,
"step": 26
},
{
"epoch": 0.02882562277580071,
"grad_norm": 1.2607332468032837,
"learning_rate": 2.8723404255319155e-06,
"loss": 0.6854,
"step": 27
},
{
"epoch": 0.0298932384341637,
"grad_norm": 1.2629083395004272,
"learning_rate": 2.978723404255319e-06,
"loss": 0.6923,
"step": 28
},
{
"epoch": 0.03096085409252669,
"grad_norm": 1.2417811155319214,
"learning_rate": 3.0851063829787237e-06,
"loss": 0.6781,
"step": 29
},
{
"epoch": 0.03202846975088968,
"grad_norm": 1.1789079904556274,
"learning_rate": 3.191489361702128e-06,
"loss": 0.6828,
"step": 30
},
{
"epoch": 0.03309608540925267,
"grad_norm": 1.0594401359558105,
"learning_rate": 3.297872340425532e-06,
"loss": 0.6351,
"step": 31
},
{
"epoch": 0.03416370106761566,
"grad_norm": 1.0672959089279175,
"learning_rate": 3.4042553191489363e-06,
"loss": 0.6465,
"step": 32
},
{
"epoch": 0.03523131672597865,
"grad_norm": 0.9413697123527527,
"learning_rate": 3.510638297872341e-06,
"loss": 0.6452,
"step": 33
},
{
"epoch": 0.036298932384341634,
"grad_norm": 0.8563829064369202,
"learning_rate": 3.6170212765957453e-06,
"loss": 0.6293,
"step": 34
},
{
"epoch": 0.037366548042704624,
"grad_norm": 0.7698128819465637,
"learning_rate": 3.723404255319149e-06,
"loss": 0.6168,
"step": 35
},
{
"epoch": 0.038434163701067614,
"grad_norm": 0.9001553654670715,
"learning_rate": 3.8297872340425535e-06,
"loss": 0.6169,
"step": 36
},
{
"epoch": 0.039501779359430604,
"grad_norm": 1.0053948163986206,
"learning_rate": 3.936170212765958e-06,
"loss": 0.6171,
"step": 37
},
{
"epoch": 0.040569395017793594,
"grad_norm": 1.0858631134033203,
"learning_rate": 4.042553191489362e-06,
"loss": 0.651,
"step": 38
},
{
"epoch": 0.041637010676156584,
"grad_norm": 0.7921259999275208,
"learning_rate": 4.148936170212766e-06,
"loss": 0.6447,
"step": 39
},
{
"epoch": 0.042704626334519574,
"grad_norm": 0.6242907643318176,
"learning_rate": 4.255319148936171e-06,
"loss": 0.5966,
"step": 40
},
{
"epoch": 0.043772241992882564,
"grad_norm": 0.6965751051902771,
"learning_rate": 4.361702127659575e-06,
"loss": 0.5909,
"step": 41
},
{
"epoch": 0.044839857651245554,
"grad_norm": 0.7059489488601685,
"learning_rate": 4.468085106382979e-06,
"loss": 0.607,
"step": 42
},
{
"epoch": 0.045907473309608544,
"grad_norm": 0.6813053488731384,
"learning_rate": 4.574468085106383e-06,
"loss": 0.5946,
"step": 43
},
{
"epoch": 0.04697508896797153,
"grad_norm": 0.6372105479240417,
"learning_rate": 4.680851063829788e-06,
"loss": 0.5817,
"step": 44
},
{
"epoch": 0.04804270462633452,
"grad_norm": 0.5932055711746216,
"learning_rate": 4.787234042553192e-06,
"loss": 0.5916,
"step": 45
},
{
"epoch": 0.04911032028469751,
"grad_norm": 0.5361504554748535,
"learning_rate": 4.893617021276596e-06,
"loss": 0.6004,
"step": 46
},
{
"epoch": 0.0501779359430605,
"grad_norm": 0.5340394973754883,
"learning_rate": 5e-06,
"loss": 0.5828,
"step": 47
},
{
"epoch": 0.05124555160142349,
"grad_norm": 0.7338045835494995,
"learning_rate": 5.106382978723404e-06,
"loss": 0.622,
"step": 48
},
{
"epoch": 0.05231316725978648,
"grad_norm": 0.6164150834083557,
"learning_rate": 5.212765957446809e-06,
"loss": 0.5792,
"step": 49
},
{
"epoch": 0.05338078291814947,
"grad_norm": 0.5131736397743225,
"learning_rate": 5.319148936170213e-06,
"loss": 0.5769,
"step": 50
},
{
"epoch": 0.05444839857651246,
"grad_norm": 0.5779881477355957,
"learning_rate": 5.425531914893617e-06,
"loss": 0.6085,
"step": 51
},
{
"epoch": 0.05551601423487545,
"grad_norm": 0.652091383934021,
"learning_rate": 5.531914893617022e-06,
"loss": 0.6063,
"step": 52
},
{
"epoch": 0.05658362989323843,
"grad_norm": 0.5679817199707031,
"learning_rate": 5.638297872340426e-06,
"loss": 0.5642,
"step": 53
},
{
"epoch": 0.05765124555160142,
"grad_norm": 0.564539909362793,
"learning_rate": 5.744680851063831e-06,
"loss": 0.5881,
"step": 54
},
{
"epoch": 0.05871886120996441,
"grad_norm": 0.5641509294509888,
"learning_rate": 5.851063829787235e-06,
"loss": 0.575,
"step": 55
},
{
"epoch": 0.0597864768683274,
"grad_norm": 0.5606682300567627,
"learning_rate": 5.957446808510638e-06,
"loss": 0.5476,
"step": 56
},
{
"epoch": 0.06085409252669039,
"grad_norm": 0.573742151260376,
"learning_rate": 6.063829787234044e-06,
"loss": 0.5857,
"step": 57
},
{
"epoch": 0.06192170818505338,
"grad_norm": 0.574886679649353,
"learning_rate": 6.170212765957447e-06,
"loss": 0.5638,
"step": 58
},
{
"epoch": 0.06298932384341636,
"grad_norm": 0.5414725542068481,
"learning_rate": 6.276595744680851e-06,
"loss": 0.5927,
"step": 59
},
{
"epoch": 0.06405693950177936,
"grad_norm": 0.5933969020843506,
"learning_rate": 6.382978723404256e-06,
"loss": 0.55,
"step": 60
},
{
"epoch": 0.06512455516014234,
"grad_norm": 0.4871617257595062,
"learning_rate": 6.48936170212766e-06,
"loss": 0.5433,
"step": 61
},
{
"epoch": 0.06619217081850534,
"grad_norm": 0.55656498670578,
"learning_rate": 6.595744680851064e-06,
"loss": 0.5599,
"step": 62
},
{
"epoch": 0.06725978647686832,
"grad_norm": 0.5835067629814148,
"learning_rate": 6.702127659574469e-06,
"loss": 0.5549,
"step": 63
},
{
"epoch": 0.06832740213523132,
"grad_norm": 0.4503428637981415,
"learning_rate": 6.808510638297873e-06,
"loss": 0.573,
"step": 64
},
{
"epoch": 0.0693950177935943,
"grad_norm": 0.5671048164367676,
"learning_rate": 6.914893617021278e-06,
"loss": 0.5565,
"step": 65
},
{
"epoch": 0.0704626334519573,
"grad_norm": 0.4813767075538635,
"learning_rate": 7.021276595744682e-06,
"loss": 0.575,
"step": 66
},
{
"epoch": 0.07153024911032028,
"grad_norm": 0.5659592151641846,
"learning_rate": 7.127659574468085e-06,
"loss": 0.5181,
"step": 67
},
{
"epoch": 0.07259786476868327,
"grad_norm": 0.5178795456886292,
"learning_rate": 7.234042553191491e-06,
"loss": 0.5566,
"step": 68
},
{
"epoch": 0.07366548042704626,
"grad_norm": 0.5484414100646973,
"learning_rate": 7.340425531914894e-06,
"loss": 0.5409,
"step": 69
},
{
"epoch": 0.07473309608540925,
"grad_norm": 0.5428405404090881,
"learning_rate": 7.446808510638298e-06,
"loss": 0.5962,
"step": 70
},
{
"epoch": 0.07580071174377224,
"grad_norm": 0.4844944179058075,
"learning_rate": 7.553191489361703e-06,
"loss": 0.571,
"step": 71
},
{
"epoch": 0.07686832740213523,
"grad_norm": 0.590509831905365,
"learning_rate": 7.659574468085107e-06,
"loss": 0.5348,
"step": 72
},
{
"epoch": 0.07793594306049823,
"grad_norm": 0.4901210367679596,
"learning_rate": 7.765957446808511e-06,
"loss": 0.5472,
"step": 73
},
{
"epoch": 0.07900355871886121,
"grad_norm": 0.4863327443599701,
"learning_rate": 7.872340425531916e-06,
"loss": 0.5249,
"step": 74
},
{
"epoch": 0.0800711743772242,
"grad_norm": 0.5700340270996094,
"learning_rate": 7.97872340425532e-06,
"loss": 0.5875,
"step": 75
},
{
"epoch": 0.08113879003558719,
"grad_norm": 0.4684799015522003,
"learning_rate": 8.085106382978723e-06,
"loss": 0.5293,
"step": 76
},
{
"epoch": 0.08220640569395017,
"grad_norm": 0.6248029470443726,
"learning_rate": 8.191489361702128e-06,
"loss": 0.5562,
"step": 77
},
{
"epoch": 0.08327402135231317,
"grad_norm": 0.5804619789123535,
"learning_rate": 8.297872340425532e-06,
"loss": 0.544,
"step": 78
},
{
"epoch": 0.08434163701067615,
"grad_norm": 0.4794471561908722,
"learning_rate": 8.404255319148937e-06,
"loss": 0.5629,
"step": 79
},
{
"epoch": 0.08540925266903915,
"grad_norm": 0.5686808824539185,
"learning_rate": 8.510638297872341e-06,
"loss": 0.5605,
"step": 80
},
{
"epoch": 0.08647686832740213,
"grad_norm": 0.6154677867889404,
"learning_rate": 8.617021276595746e-06,
"loss": 0.5632,
"step": 81
},
{
"epoch": 0.08754448398576513,
"grad_norm": 0.599371612071991,
"learning_rate": 8.72340425531915e-06,
"loss": 0.5118,
"step": 82
},
{
"epoch": 0.08861209964412811,
"grad_norm": 0.6383739709854126,
"learning_rate": 8.829787234042555e-06,
"loss": 0.5584,
"step": 83
},
{
"epoch": 0.08967971530249111,
"grad_norm": 0.6114341616630554,
"learning_rate": 8.936170212765958e-06,
"loss": 0.5475,
"step": 84
},
{
"epoch": 0.09074733096085409,
"grad_norm": 0.5422399044036865,
"learning_rate": 9.042553191489362e-06,
"loss": 0.5356,
"step": 85
},
{
"epoch": 0.09181494661921709,
"grad_norm": 0.5314775109291077,
"learning_rate": 9.148936170212767e-06,
"loss": 0.5241,
"step": 86
},
{
"epoch": 0.09288256227758007,
"grad_norm": 0.592779278755188,
"learning_rate": 9.255319148936171e-06,
"loss": 0.5339,
"step": 87
},
{
"epoch": 0.09395017793594305,
"grad_norm": 0.6122350096702576,
"learning_rate": 9.361702127659576e-06,
"loss": 0.5434,
"step": 88
},
{
"epoch": 0.09501779359430605,
"grad_norm": 0.5250897407531738,
"learning_rate": 9.46808510638298e-06,
"loss": 0.5459,
"step": 89
},
{
"epoch": 0.09608540925266904,
"grad_norm": 0.592778205871582,
"learning_rate": 9.574468085106385e-06,
"loss": 0.5578,
"step": 90
},
{
"epoch": 0.09715302491103203,
"grad_norm": 0.47788286209106445,
"learning_rate": 9.680851063829787e-06,
"loss": 0.5471,
"step": 91
},
{
"epoch": 0.09822064056939502,
"grad_norm": 0.5285771489143372,
"learning_rate": 9.787234042553192e-06,
"loss": 0.5281,
"step": 92
},
{
"epoch": 0.09928825622775801,
"grad_norm": 0.47819527983665466,
"learning_rate": 9.893617021276596e-06,
"loss": 0.5269,
"step": 93
},
{
"epoch": 0.100355871886121,
"grad_norm": 0.48683878779411316,
"learning_rate": 1e-05,
"loss": 0.5286,
"step": 94
},
{
"epoch": 0.10142348754448399,
"grad_norm": 0.4826238751411438,
"learning_rate": 9.999965197129365e-06,
"loss": 0.5338,
"step": 95
},
{
"epoch": 0.10249110320284698,
"grad_norm": 0.5617088675498962,
"learning_rate": 9.999860789001947e-06,
"loss": 0.5396,
"step": 96
},
{
"epoch": 0.10355871886120996,
"grad_norm": 0.46838635206222534,
"learning_rate": 9.999686777071233e-06,
"loss": 0.5162,
"step": 97
},
{
"epoch": 0.10462633451957296,
"grad_norm": 0.5251678228378296,
"learning_rate": 9.999443163759669e-06,
"loss": 0.5158,
"step": 98
},
{
"epoch": 0.10569395017793594,
"grad_norm": 0.4968458116054535,
"learning_rate": 9.999129952458628e-06,
"loss": 0.5348,
"step": 99
},
{
"epoch": 0.10676156583629894,
"grad_norm": 0.5481524467468262,
"learning_rate": 9.998747147528375e-06,
"loss": 0.5245,
"step": 100
},
{
"epoch": 0.10782918149466192,
"grad_norm": 0.495601624250412,
"learning_rate": 9.998294754297992e-06,
"loss": 0.4902,
"step": 101
},
{
"epoch": 0.10889679715302492,
"grad_norm": 0.5450451374053955,
"learning_rate": 9.997772779065312e-06,
"loss": 0.5487,
"step": 102
},
{
"epoch": 0.1099644128113879,
"grad_norm": 0.5144025087356567,
"learning_rate": 9.997181229096831e-06,
"loss": 0.5088,
"step": 103
},
{
"epoch": 0.1110320284697509,
"grad_norm": 0.5595222115516663,
"learning_rate": 9.996520112627602e-06,
"loss": 0.5327,
"step": 104
},
{
"epoch": 0.11209964412811388,
"grad_norm": 0.6185320615768433,
"learning_rate": 9.995789438861128e-06,
"loss": 0.529,
"step": 105
},
{
"epoch": 0.11316725978647686,
"grad_norm": 0.5363855957984924,
"learning_rate": 9.994989217969224e-06,
"loss": 0.5566,
"step": 106
},
{
"epoch": 0.11423487544483986,
"grad_norm": 0.598929762840271,
"learning_rate": 9.994119461091885e-06,
"loss": 0.5024,
"step": 107
},
{
"epoch": 0.11530249110320284,
"grad_norm": 0.49212321639060974,
"learning_rate": 9.993180180337126e-06,
"loss": 0.5165,
"step": 108
},
{
"epoch": 0.11637010676156584,
"grad_norm": 0.5540521740913391,
"learning_rate": 9.992171388780814e-06,
"loss": 0.545,
"step": 109
},
{
"epoch": 0.11743772241992882,
"grad_norm": 0.45905211567878723,
"learning_rate": 9.991093100466482e-06,
"loss": 0.4976,
"step": 110
},
{
"epoch": 0.11850533807829182,
"grad_norm": 0.6659161448478699,
"learning_rate": 9.989945330405146e-06,
"loss": 0.5297,
"step": 111
},
{
"epoch": 0.1195729537366548,
"grad_norm": 0.5286558866500854,
"learning_rate": 9.988728094575082e-06,
"loss": 0.5029,
"step": 112
},
{
"epoch": 0.1206405693950178,
"grad_norm": 0.615146279335022,
"learning_rate": 9.98744140992161e-06,
"loss": 0.4897,
"step": 113
},
{
"epoch": 0.12170818505338078,
"grad_norm": 0.5887618064880371,
"learning_rate": 9.986085294356858e-06,
"loss": 0.5802,
"step": 114
},
{
"epoch": 0.12277580071174377,
"grad_norm": 0.5688331723213196,
"learning_rate": 9.98465976675951e-06,
"loss": 0.5015,
"step": 115
},
{
"epoch": 0.12384341637010676,
"grad_norm": 0.548701822757721,
"learning_rate": 9.983164846974549e-06,
"loss": 0.5154,
"step": 116
},
{
"epoch": 0.12491103202846975,
"grad_norm": 0.5815207362174988,
"learning_rate": 9.981600555812975e-06,
"loss": 0.5134,
"step": 117
},
{
"epoch": 0.12597864768683273,
"grad_norm": 0.5351188778877258,
"learning_rate": 9.979966915051517e-06,
"loss": 0.4912,
"step": 118
},
{
"epoch": 0.12704626334519573,
"grad_norm": 0.5330350399017334,
"learning_rate": 9.978263947432331e-06,
"loss": 0.5283,
"step": 119
},
{
"epoch": 0.12811387900355872,
"grad_norm": 0.5946553349494934,
"learning_rate": 9.976491676662679e-06,
"loss": 0.5365,
"step": 120
},
{
"epoch": 0.12918149466192172,
"grad_norm": 0.4562559723854065,
"learning_rate": 9.974650127414609e-06,
"loss": 0.5257,
"step": 121
},
{
"epoch": 0.1302491103202847,
"grad_norm": 0.6122528910636902,
"learning_rate": 9.972739325324596e-06,
"loss": 0.5068,
"step": 122
},
{
"epoch": 0.13131672597864769,
"grad_norm": 0.5213277339935303,
"learning_rate": 9.970759296993205e-06,
"loss": 0.4931,
"step": 123
},
{
"epoch": 0.13238434163701068,
"grad_norm": 0.5232741236686707,
"learning_rate": 9.968710069984699e-06,
"loss": 0.5208,
"step": 124
},
{
"epoch": 0.13345195729537365,
"grad_norm": 0.5642791390419006,
"learning_rate": 9.966591672826674e-06,
"loss": 0.5277,
"step": 125
},
{
"epoch": 0.13451957295373665,
"grad_norm": 0.5566534996032715,
"learning_rate": 9.964404135009649e-06,
"loss": 0.5485,
"step": 126
},
{
"epoch": 0.13558718861209965,
"grad_norm": 0.512524425983429,
"learning_rate": 9.962147486986664e-06,
"loss": 0.5566,
"step": 127
},
{
"epoch": 0.13665480427046264,
"grad_norm": 0.5945698618888855,
"learning_rate": 9.959821760172849e-06,
"loss": 0.5323,
"step": 128
},
{
"epoch": 0.1377224199288256,
"grad_norm": 0.519640326499939,
"learning_rate": 9.957426986944994e-06,
"loss": 0.5002,
"step": 129
},
{
"epoch": 0.1387900355871886,
"grad_norm": 0.5728665590286255,
"learning_rate": 9.95496320064109e-06,
"loss": 0.4939,
"step": 130
},
{
"epoch": 0.1398576512455516,
"grad_norm": 0.5529624819755554,
"learning_rate": 9.952430435559873e-06,
"loss": 0.4971,
"step": 131
},
{
"epoch": 0.1409252669039146,
"grad_norm": 0.5441509485244751,
"learning_rate": 9.94982872696034e-06,
"loss": 0.5287,
"step": 132
},
{
"epoch": 0.14199288256227757,
"grad_norm": 0.5497064590454102,
"learning_rate": 9.947158111061263e-06,
"loss": 0.5063,
"step": 133
},
{
"epoch": 0.14306049822064057,
"grad_norm": 0.5112823843955994,
"learning_rate": 9.94441862504068e-06,
"loss": 0.5122,
"step": 134
},
{
"epoch": 0.14412811387900357,
"grad_norm": 0.588614284992218,
"learning_rate": 9.941610307035385e-06,
"loss": 0.5098,
"step": 135
},
{
"epoch": 0.14519572953736654,
"grad_norm": 0.4941340386867523,
"learning_rate": 9.938733196140386e-06,
"loss": 0.5082,
"step": 136
},
{
"epoch": 0.14626334519572953,
"grad_norm": 0.566385805606842,
"learning_rate": 9.935787332408375e-06,
"loss": 0.4837,
"step": 137
},
{
"epoch": 0.14733096085409253,
"grad_norm": 0.5272175073623657,
"learning_rate": 9.932772756849152e-06,
"loss": 0.5014,
"step": 138
},
{
"epoch": 0.14839857651245553,
"grad_norm": 0.5560716986656189,
"learning_rate": 9.929689511429075e-06,
"loss": 0.5203,
"step": 139
},
{
"epoch": 0.1494661921708185,
"grad_norm": 0.6580199003219604,
"learning_rate": 9.926537639070457e-06,
"loss": 0.5088,
"step": 140
},
{
"epoch": 0.1505338078291815,
"grad_norm": 0.5059327483177185,
"learning_rate": 9.923317183650985e-06,
"loss": 0.5139,
"step": 141
},
{
"epoch": 0.1516014234875445,
"grad_norm": 0.54743891954422,
"learning_rate": 9.92002819000309e-06,
"loss": 0.5079,
"step": 142
},
{
"epoch": 0.1526690391459075,
"grad_norm": 0.5422698259353638,
"learning_rate": 9.916670703913345e-06,
"loss": 0.5176,
"step": 143
},
{
"epoch": 0.15373665480427046,
"grad_norm": 0.5230839848518372,
"learning_rate": 9.913244772121811e-06,
"loss": 0.5,
"step": 144
},
{
"epoch": 0.15480427046263345,
"grad_norm": 0.4665907025337219,
"learning_rate": 9.90975044232139e-06,
"loss": 0.5053,
"step": 145
},
{
"epoch": 0.15587188612099645,
"grad_norm": 0.6438184380531311,
"learning_rate": 9.90618776315717e-06,
"loss": 0.5266,
"step": 146
},
{
"epoch": 0.15693950177935942,
"grad_norm": 0.4731660485267639,
"learning_rate": 9.902556784225729e-06,
"loss": 0.4834,
"step": 147
},
{
"epoch": 0.15800711743772242,
"grad_norm": 0.5352628827095032,
"learning_rate": 9.898857556074469e-06,
"loss": 0.5099,
"step": 148
},
{
"epoch": 0.1590747330960854,
"grad_norm": 0.5462168455123901,
"learning_rate": 9.895090130200889e-06,
"loss": 0.5195,
"step": 149
},
{
"epoch": 0.1601423487544484,
"grad_norm": 0.4784468710422516,
"learning_rate": 9.891254559051886e-06,
"loss": 0.5173,
"step": 150
},
{
"epoch": 0.16120996441281138,
"grad_norm": 0.498097687959671,
"learning_rate": 9.887350896023015e-06,
"loss": 0.5065,
"step": 151
},
{
"epoch": 0.16227758007117438,
"grad_norm": 0.4685448706150055,
"learning_rate": 9.883379195457747e-06,
"loss": 0.4687,
"step": 152
},
{
"epoch": 0.16334519572953737,
"grad_norm": 0.5289403200149536,
"learning_rate": 9.879339512646714e-06,
"loss": 0.5154,
"step": 153
},
{
"epoch": 0.16441281138790034,
"grad_norm": 0.5611624121665955,
"learning_rate": 9.875231903826936e-06,
"loss": 0.5305,
"step": 154
},
{
"epoch": 0.16548042704626334,
"grad_norm": 0.5301553010940552,
"learning_rate": 9.871056426181052e-06,
"loss": 0.5316,
"step": 155
},
{
"epoch": 0.16654804270462634,
"grad_norm": 0.6103717684745789,
"learning_rate": 9.8668131378365e-06,
"loss": 0.5185,
"step": 156
},
{
"epoch": 0.16761565836298933,
"grad_norm": 0.5421136021614075,
"learning_rate": 9.862502097864726e-06,
"loss": 0.518,
"step": 157
},
{
"epoch": 0.1686832740213523,
"grad_norm": 0.5964006185531616,
"learning_rate": 9.858123366280358e-06,
"loss": 0.5073,
"step": 158
},
{
"epoch": 0.1697508896797153,
"grad_norm": 0.5336704254150391,
"learning_rate": 9.853677004040368e-06,
"loss": 0.5178,
"step": 159
},
{
"epoch": 0.1708185053380783,
"grad_norm": 0.5406745076179504,
"learning_rate": 9.849163073043223e-06,
"loss": 0.5046,
"step": 160
},
{
"epoch": 0.1718861209964413,
"grad_norm": 0.5165396928787231,
"learning_rate": 9.844581636128025e-06,
"loss": 0.497,
"step": 161
},
{
"epoch": 0.17295373665480426,
"grad_norm": 0.5873040556907654,
"learning_rate": 9.83993275707364e-06,
"loss": 0.5206,
"step": 162
},
{
"epoch": 0.17402135231316726,
"grad_norm": 0.5253546237945557,
"learning_rate": 9.835216500597797e-06,
"loss": 0.515,
"step": 163
},
{
"epoch": 0.17508896797153026,
"grad_norm": 0.5812190175056458,
"learning_rate": 9.830432932356207e-06,
"loss": 0.5192,
"step": 164
},
{
"epoch": 0.17615658362989323,
"grad_norm": 0.4591503143310547,
"learning_rate": 9.82558211894163e-06,
"loss": 0.5025,
"step": 165
},
{
"epoch": 0.17722419928825622,
"grad_norm": 0.6000644564628601,
"learning_rate": 9.820664127882958e-06,
"loss": 0.4817,
"step": 166
},
{
"epoch": 0.17829181494661922,
"grad_norm": 0.5603543519973755,
"learning_rate": 9.815679027644273e-06,
"loss": 0.4793,
"step": 167
},
{
"epoch": 0.17935943060498222,
"grad_norm": 0.5255252122879028,
"learning_rate": 9.8106268876239e-06,
"loss": 0.494,
"step": 168
},
{
"epoch": 0.1804270462633452,
"grad_norm": 0.6187337040901184,
"learning_rate": 9.805507778153423e-06,
"loss": 0.5069,
"step": 169
},
{
"epoch": 0.18149466192170818,
"grad_norm": 0.5259950160980225,
"learning_rate": 9.800321770496726e-06,
"loss": 0.5192,
"step": 170
},
{
"epoch": 0.18256227758007118,
"grad_norm": 0.5141558051109314,
"learning_rate": 9.79506893684899e-06,
"loss": 0.488,
"step": 171
},
{
"epoch": 0.18362989323843418,
"grad_norm": 0.5964564681053162,
"learning_rate": 9.789749350335693e-06,
"loss": 0.501,
"step": 172
},
{
"epoch": 0.18469750889679715,
"grad_norm": 0.5745047330856323,
"learning_rate": 9.784363085011587e-06,
"loss": 0.5174,
"step": 173
},
{
"epoch": 0.18576512455516014,
"grad_norm": 0.44916895031929016,
"learning_rate": 9.778910215859666e-06,
"loss": 0.4964,
"step": 174
},
{
"epoch": 0.18683274021352314,
"grad_norm": 0.5034676790237427,
"learning_rate": 9.773390818790136e-06,
"loss": 0.4729,
"step": 175
},
{
"epoch": 0.1879003558718861,
"grad_norm": 0.5329164266586304,
"learning_rate": 9.767804970639338e-06,
"loss": 0.4945,
"step": 176
},
{
"epoch": 0.1889679715302491,
"grad_norm": 0.4742647409439087,
"learning_rate": 9.762152749168693e-06,
"loss": 0.5445,
"step": 177
},
{
"epoch": 0.1900355871886121,
"grad_norm": 0.4610464572906494,
"learning_rate": 9.756434233063616e-06,
"loss": 0.4924,
"step": 178
},
{
"epoch": 0.1911032028469751,
"grad_norm": 0.5255376696586609,
"learning_rate": 9.750649501932414e-06,
"loss": 0.5241,
"step": 179
},
{
"epoch": 0.19217081850533807,
"grad_norm": 0.5016917586326599,
"learning_rate": 9.744798636305189e-06,
"loss": 0.5058,
"step": 180
},
{
"epoch": 0.19323843416370107,
"grad_norm": 0.6196140646934509,
"learning_rate": 9.738881717632709e-06,
"loss": 0.5042,
"step": 181
},
{
"epoch": 0.19430604982206406,
"grad_norm": 0.5428318977355957,
"learning_rate": 9.732898828285273e-06,
"loss": 0.5129,
"step": 182
},
{
"epoch": 0.19537366548042703,
"grad_norm": 0.5006230473518372,
"learning_rate": 9.726850051551575e-06,
"loss": 0.4631,
"step": 183
},
{
"epoch": 0.19644128113879003,
"grad_norm": 0.5109187960624695,
"learning_rate": 9.72073547163753e-06,
"loss": 0.4773,
"step": 184
},
{
"epoch": 0.19750889679715303,
"grad_norm": 0.5989903807640076,
"learning_rate": 9.714555173665112e-06,
"loss": 0.5078,
"step": 185
},
{
"epoch": 0.19857651245551602,
"grad_norm": 0.5101140737533569,
"learning_rate": 9.708309243671167e-06,
"loss": 0.5248,
"step": 186
},
{
"epoch": 0.199644128113879,
"grad_norm": 0.4500106871128082,
"learning_rate": 9.701997768606209e-06,
"loss": 0.4814,
"step": 187
},
{
"epoch": 0.200711743772242,
"grad_norm": 0.5334274172782898,
"learning_rate": 9.695620836333219e-06,
"loss": 0.4939,
"step": 188
},
{
"epoch": 0.201779359430605,
"grad_norm": 0.5067172050476074,
"learning_rate": 9.68917853562642e-06,
"loss": 0.5177,
"step": 189
},
{
"epoch": 0.20284697508896798,
"grad_norm": 0.5605948567390442,
"learning_rate": 9.68267095617003e-06,
"loss": 0.5021,
"step": 190
},
{
"epoch": 0.20391459074733095,
"grad_norm": 0.536536455154419,
"learning_rate": 9.676098188557032e-06,
"loss": 0.4814,
"step": 191
},
{
"epoch": 0.20498220640569395,
"grad_norm": 0.5245672464370728,
"learning_rate": 9.669460324287899e-06,
"loss": 0.4853,
"step": 192
},
{
"epoch": 0.20604982206405695,
"grad_norm": 0.6165151596069336,
"learning_rate": 9.662757455769317e-06,
"loss": 0.4744,
"step": 193
},
{
"epoch": 0.20711743772241992,
"grad_norm": 0.5017523169517517,
"learning_rate": 9.655989676312918e-06,
"loss": 0.5089,
"step": 194
},
{
"epoch": 0.20818505338078291,
"grad_norm": 0.6126395463943481,
"learning_rate": 9.649157080133962e-06,
"loss": 0.5089,
"step": 195
},
{
"epoch": 0.2092526690391459,
"grad_norm": 0.520261824131012,
"learning_rate": 9.642259762350034e-06,
"loss": 0.4986,
"step": 196
},
{
"epoch": 0.2103202846975089,
"grad_norm": 0.5865549445152283,
"learning_rate": 9.635297818979715e-06,
"loss": 0.5336,
"step": 197
},
{
"epoch": 0.21138790035587188,
"grad_norm": 0.5497699975967407,
"learning_rate": 9.628271346941252e-06,
"loss": 0.5195,
"step": 198
},
{
"epoch": 0.21245551601423487,
"grad_norm": 0.5673022866249084,
"learning_rate": 9.621180444051206e-06,
"loss": 0.5036,
"step": 199
},
{
"epoch": 0.21352313167259787,
"grad_norm": 0.5429431796073914,
"learning_rate": 9.614025209023084e-06,
"loss": 0.5244,
"step": 200
},
{
"epoch": 0.21459074733096084,
"grad_norm": 0.5560723543167114,
"learning_rate": 9.606805741465977e-06,
"loss": 0.5,
"step": 201
},
{
"epoch": 0.21565836298932384,
"grad_norm": 0.5545246005058289,
"learning_rate": 9.59952214188316e-06,
"loss": 0.4939,
"step": 202
},
{
"epoch": 0.21672597864768683,
"grad_norm": 0.6207299828529358,
"learning_rate": 9.592174511670704e-06,
"loss": 0.5191,
"step": 203
},
{
"epoch": 0.21779359430604983,
"grad_norm": 0.5119560360908508,
"learning_rate": 9.58476295311606e-06,
"loss": 0.4974,
"step": 204
},
{
"epoch": 0.2188612099644128,
"grad_norm": 0.5543833374977112,
"learning_rate": 9.577287569396632e-06,
"loss": 0.4777,
"step": 205
},
{
"epoch": 0.2199288256227758,
"grad_norm": 0.5279098153114319,
"learning_rate": 9.569748464578343e-06,
"loss": 0.5012,
"step": 206
},
{
"epoch": 0.2209964412811388,
"grad_norm": 0.5337633490562439,
"learning_rate": 9.562145743614193e-06,
"loss": 0.4872,
"step": 207
},
{
"epoch": 0.2220640569395018,
"grad_norm": 0.534850001335144,
"learning_rate": 9.554479512342785e-06,
"loss": 0.4928,
"step": 208
},
{
"epoch": 0.22313167259786476,
"grad_norm": 0.48084014654159546,
"learning_rate": 9.54674987748686e-06,
"loss": 0.4863,
"step": 209
},
{
"epoch": 0.22419928825622776,
"grad_norm": 0.6603854298591614,
"learning_rate": 9.538956946651816e-06,
"loss": 0.5256,
"step": 210
},
{
"epoch": 0.22526690391459075,
"grad_norm": 0.5027628540992737,
"learning_rate": 9.531100828324191e-06,
"loss": 0.5022,
"step": 211
},
{
"epoch": 0.22633451957295372,
"grad_norm": 0.5168050527572632,
"learning_rate": 9.52318163187018e-06,
"loss": 0.4878,
"step": 212
},
{
"epoch": 0.22740213523131672,
"grad_norm": 0.5213115215301514,
"learning_rate": 9.515199467534086e-06,
"loss": 0.527,
"step": 213
},
{
"epoch": 0.22846975088967972,
"grad_norm": 0.49242091178894043,
"learning_rate": 9.507154446436806e-06,
"loss": 0.4916,
"step": 214
},
{
"epoch": 0.22953736654804271,
"grad_norm": 0.5438655018806458,
"learning_rate": 9.499046680574267e-06,
"loss": 0.4751,
"step": 215
},
{
"epoch": 0.23060498220640568,
"grad_norm": 0.5265784859657288,
"learning_rate": 9.490876282815884e-06,
"loss": 0.4805,
"step": 216
},
{
"epoch": 0.23167259786476868,
"grad_norm": 0.5454720258712769,
"learning_rate": 9.482643366902972e-06,
"loss": 0.5312,
"step": 217
},
{
"epoch": 0.23274021352313168,
"grad_norm": 0.6158825159072876,
"learning_rate": 9.474348047447177e-06,
"loss": 0.5209,
"step": 218
},
{
"epoch": 0.23380782918149468,
"grad_norm": 0.49415621161460876,
"learning_rate": 9.465990439928868e-06,
"loss": 0.4835,
"step": 219
},
{
"epoch": 0.23487544483985764,
"grad_norm": 0.5915224552154541,
"learning_rate": 9.457570660695542e-06,
"loss": 0.486,
"step": 220
},
{
"epoch": 0.23594306049822064,
"grad_norm": 0.5715787410736084,
"learning_rate": 9.449088826960187e-06,
"loss": 0.4949,
"step": 221
},
{
"epoch": 0.23701067615658364,
"grad_norm": 0.6309436559677124,
"learning_rate": 9.440545056799677e-06,
"loss": 0.5237,
"step": 222
},
{
"epoch": 0.2380782918149466,
"grad_norm": 0.6128714084625244,
"learning_rate": 9.431939469153096e-06,
"loss": 0.4709,
"step": 223
},
{
"epoch": 0.2391459074733096,
"grad_norm": 0.5757558345794678,
"learning_rate": 9.423272183820109e-06,
"loss": 0.5063,
"step": 224
},
{
"epoch": 0.2402135231316726,
"grad_norm": 0.5617343187332153,
"learning_rate": 9.41454332145928e-06,
"loss": 0.4868,
"step": 225
},
{
"epoch": 0.2412811387900356,
"grad_norm": 0.5010789036750793,
"learning_rate": 9.405753003586396e-06,
"loss": 0.5037,
"step": 226
},
{
"epoch": 0.24234875444839857,
"grad_norm": 0.49613580107688904,
"learning_rate": 9.396901352572771e-06,
"loss": 0.4892,
"step": 227
},
{
"epoch": 0.24341637010676156,
"grad_norm": 0.6001424789428711,
"learning_rate": 9.387988491643558e-06,
"loss": 0.5054,
"step": 228
},
{
"epoch": 0.24448398576512456,
"grad_norm": 0.5321950316429138,
"learning_rate": 9.379014544876011e-06,
"loss": 0.5082,
"step": 229
},
{
"epoch": 0.24555160142348753,
"grad_norm": 0.5619071125984192,
"learning_rate": 9.369979637197774e-06,
"loss": 0.5071,
"step": 230
},
{
"epoch": 0.24661921708185053,
"grad_norm": 0.4910016357898712,
"learning_rate": 9.360883894385137e-06,
"loss": 0.4774,
"step": 231
},
{
"epoch": 0.24768683274021353,
"grad_norm": 0.5721420645713806,
"learning_rate": 9.351727443061284e-06,
"loss": 0.4978,
"step": 232
},
{
"epoch": 0.24875444839857652,
"grad_norm": 0.5795683264732361,
"learning_rate": 9.342510410694529e-06,
"loss": 0.5085,
"step": 233
},
{
"epoch": 0.2498220640569395,
"grad_norm": 0.5288822054862976,
"learning_rate": 9.33323292559655e-06,
"loss": 0.4864,
"step": 234
},
{
"epoch": 0.2508896797153025,
"grad_norm": 0.5554943680763245,
"learning_rate": 9.323895116920591e-06,
"loss": 0.4998,
"step": 235
},
{
"epoch": 0.25195729537366546,
"grad_norm": 0.5668061971664429,
"learning_rate": 9.31449711465967e-06,
"loss": 0.477,
"step": 236
},
{
"epoch": 0.25302491103202845,
"grad_norm": 0.5568402409553528,
"learning_rate": 9.305039049644772e-06,
"loss": 0.5175,
"step": 237
},
{
"epoch": 0.25409252669039145,
"grad_norm": 0.5518472790718079,
"learning_rate": 9.29552105354302e-06,
"loss": 0.4745,
"step": 238
},
{
"epoch": 0.25516014234875445,
"grad_norm": 0.6117028594017029,
"learning_rate": 9.28594325885585e-06,
"loss": 0.5148,
"step": 239
},
{
"epoch": 0.25622775800711745,
"grad_norm": 0.5180391073226929,
"learning_rate": 9.27630579891716e-06,
"loss": 0.4837,
"step": 240
},
{
"epoch": 0.25729537366548044,
"grad_norm": 0.5480329990386963,
"learning_rate": 9.266608807891459e-06,
"loss": 0.501,
"step": 241
},
{
"epoch": 0.25836298932384344,
"grad_norm": 0.4984034597873688,
"learning_rate": 9.256852420771999e-06,
"loss": 0.4946,
"step": 242
},
{
"epoch": 0.2594306049822064,
"grad_norm": 0.5542665123939514,
"learning_rate": 9.24703677337889e-06,
"loss": 0.4815,
"step": 243
},
{
"epoch": 0.2604982206405694,
"grad_norm": 0.4799808859825134,
"learning_rate": 9.237162002357214e-06,
"loss": 0.4838,
"step": 244
},
{
"epoch": 0.2615658362989324,
"grad_norm": 0.49428969621658325,
"learning_rate": 9.227228245175127e-06,
"loss": 0.4865,
"step": 245
},
{
"epoch": 0.26263345195729537,
"grad_norm": 0.4643561840057373,
"learning_rate": 9.217235640121927e-06,
"loss": 0.4722,
"step": 246
},
{
"epoch": 0.26370106761565837,
"grad_norm": 0.48922228813171387,
"learning_rate": 9.207184326306155e-06,
"loss": 0.5024,
"step": 247
},
{
"epoch": 0.26476868327402137,
"grad_norm": 0.5658605098724365,
"learning_rate": 9.197074443653643e-06,
"loss": 0.4878,
"step": 248
},
{
"epoch": 0.26583629893238436,
"grad_norm": 0.4450552761554718,
"learning_rate": 9.186906132905563e-06,
"loss": 0.4621,
"step": 249
},
{
"epoch": 0.2669039145907473,
"grad_norm": 0.5502617955207825,
"learning_rate": 9.176679535616477e-06,
"loss": 0.4836,
"step": 250
},
{
"epoch": 0.2679715302491103,
"grad_norm": 0.4923563599586487,
"learning_rate": 9.166394794152363e-06,
"loss": 0.5166,
"step": 251
},
{
"epoch": 0.2690391459074733,
"grad_norm": 0.5220004916191101,
"learning_rate": 9.156052051688633e-06,
"loss": 0.464,
"step": 252
},
{
"epoch": 0.2701067615658363,
"grad_norm": 0.5749658942222595,
"learning_rate": 9.145651452208133e-06,
"loss": 0.493,
"step": 253
},
{
"epoch": 0.2711743772241993,
"grad_norm": 0.4788929522037506,
"learning_rate": 9.135193140499155e-06,
"loss": 0.506,
"step": 254
},
{
"epoch": 0.2722419928825623,
"grad_norm": 0.5826008915901184,
"learning_rate": 9.124677262153405e-06,
"loss": 0.481,
"step": 255
},
{
"epoch": 0.2733096085409253,
"grad_norm": 0.5467514395713806,
"learning_rate": 9.114103963563986e-06,
"loss": 0.4821,
"step": 256
},
{
"epoch": 0.2743772241992883,
"grad_norm": 0.5301008224487305,
"learning_rate": 9.103473391923354e-06,
"loss": 0.4727,
"step": 257
},
{
"epoch": 0.2754448398576512,
"grad_norm": 0.5102054476737976,
"learning_rate": 9.092785695221271e-06,
"loss": 0.4828,
"step": 258
},
{
"epoch": 0.2765124555160142,
"grad_norm": 0.6430336236953735,
"learning_rate": 9.08204102224275e-06,
"loss": 0.4909,
"step": 259
},
{
"epoch": 0.2775800711743772,
"grad_norm": 0.5367814898490906,
"learning_rate": 9.071239522565978e-06,
"loss": 0.4805,
"step": 260
},
{
"epoch": 0.2786476868327402,
"grad_norm": 0.561622142791748,
"learning_rate": 9.06038134656023e-06,
"loss": 0.4643,
"step": 261
},
{
"epoch": 0.2797153024911032,
"grad_norm": 0.5907300710678101,
"learning_rate": 9.049466645383785e-06,
"loss": 0.5223,
"step": 262
},
{
"epoch": 0.2807829181494662,
"grad_norm": 0.5875605940818787,
"learning_rate": 9.038495570981814e-06,
"loss": 0.4932,
"step": 263
},
{
"epoch": 0.2818505338078292,
"grad_norm": 0.5611529350280762,
"learning_rate": 9.027468276084274e-06,
"loss": 0.4901,
"step": 264
},
{
"epoch": 0.28291814946619215,
"grad_norm": 0.6816518902778625,
"learning_rate": 9.016384914203771e-06,
"loss": 0.5165,
"step": 265
},
{
"epoch": 0.28398576512455515,
"grad_norm": 0.68822181224823,
"learning_rate": 9.00524563963343e-06,
"loss": 0.4756,
"step": 266
},
{
"epoch": 0.28505338078291814,
"grad_norm": 0.5975049138069153,
"learning_rate": 8.99405060744474e-06,
"loss": 0.4945,
"step": 267
},
{
"epoch": 0.28612099644128114,
"grad_norm": 0.7125190496444702,
"learning_rate": 8.982799973485407e-06,
"loss": 0.4962,
"step": 268
},
{
"epoch": 0.28718861209964414,
"grad_norm": 0.6332557201385498,
"learning_rate": 8.971493894377174e-06,
"loss": 0.4869,
"step": 269
},
{
"epoch": 0.28825622775800713,
"grad_norm": 0.5689089894294739,
"learning_rate": 8.960132527513642e-06,
"loss": 0.5099,
"step": 270
},
{
"epoch": 0.28932384341637013,
"grad_norm": 0.5326068997383118,
"learning_rate": 8.94871603105809e-06,
"loss": 0.4912,
"step": 271
},
{
"epoch": 0.29039145907473307,
"grad_norm": 0.5300759077072144,
"learning_rate": 8.937244563941248e-06,
"loss": 0.5066,
"step": 272
},
{
"epoch": 0.29145907473309607,
"grad_norm": 0.5240178108215332,
"learning_rate": 8.925718285859118e-06,
"loss": 0.5005,
"step": 273
},
{
"epoch": 0.29252669039145907,
"grad_norm": 0.47631746530532837,
"learning_rate": 8.914137357270723e-06,
"loss": 0.5008,
"step": 274
},
{
"epoch": 0.29359430604982206,
"grad_norm": 0.49288827180862427,
"learning_rate": 8.902501939395887e-06,
"loss": 0.4866,
"step": 275
},
{
"epoch": 0.29466192170818506,
"grad_norm": 0.5662288069725037,
"learning_rate": 8.890812194212987e-06,
"loss": 0.5421,
"step": 276
},
{
"epoch": 0.29572953736654806,
"grad_norm": 0.4656676650047302,
"learning_rate": 8.879068284456702e-06,
"loss": 0.4997,
"step": 277
},
{
"epoch": 0.29679715302491105,
"grad_norm": 0.5733962059020996,
"learning_rate": 8.867270373615735e-06,
"loss": 0.501,
"step": 278
},
{
"epoch": 0.297864768683274,
"grad_norm": 0.5234590172767639,
"learning_rate": 8.855418625930556e-06,
"loss": 0.4848,
"step": 279
},
{
"epoch": 0.298932384341637,
"grad_norm": 0.5430875420570374,
"learning_rate": 8.8435132063911e-06,
"loss": 0.5157,
"step": 280
},
{
"epoch": 0.3,
"grad_norm": 0.5232681035995483,
"learning_rate": 8.83155428073448e-06,
"loss": 0.4854,
"step": 281
},
{
"epoch": 0.301067615658363,
"grad_norm": 0.5697162747383118,
"learning_rate": 8.81954201544267e-06,
"loss": 0.4928,
"step": 282
},
{
"epoch": 0.302135231316726,
"grad_norm": 0.462223619222641,
"learning_rate": 8.8074765777402e-06,
"loss": 0.4856,
"step": 283
},
{
"epoch": 0.303202846975089,
"grad_norm": 0.518064022064209,
"learning_rate": 8.79535813559181e-06,
"loss": 0.5049,
"step": 284
},
{
"epoch": 0.304270462633452,
"grad_norm": 0.46611088514328003,
"learning_rate": 8.783186857700137e-06,
"loss": 0.4837,
"step": 285
},
{
"epoch": 0.305338078291815,
"grad_norm": 0.519318699836731,
"learning_rate": 8.77096291350334e-06,
"loss": 0.4947,
"step": 286
},
{
"epoch": 0.3064056939501779,
"grad_norm": 0.46297067403793335,
"learning_rate": 8.75868647317276e-06,
"loss": 0.4985,
"step": 287
},
{
"epoch": 0.3074733096085409,
"grad_norm": 0.4645700752735138,
"learning_rate": 8.746357707610544e-06,
"loss": 0.4659,
"step": 288
},
{
"epoch": 0.3085409252669039,
"grad_norm": 0.463349848985672,
"learning_rate": 8.733976788447265e-06,
"loss": 0.5017,
"step": 289
},
{
"epoch": 0.3096085409252669,
"grad_norm": 0.5248959064483643,
"learning_rate": 8.721543888039534e-06,
"loss": 0.5002,
"step": 290
},
{
"epoch": 0.3106761565836299,
"grad_norm": 0.514178991317749,
"learning_rate": 8.709059179467598e-06,
"loss": 0.4693,
"step": 291
},
{
"epoch": 0.3117437722419929,
"grad_norm": 0.49520182609558105,
"learning_rate": 8.69652283653294e-06,
"loss": 0.4899,
"step": 292
},
{
"epoch": 0.3128113879003559,
"grad_norm": 0.4822703003883362,
"learning_rate": 8.683935033755848e-06,
"loss": 0.4944,
"step": 293
},
{
"epoch": 0.31387900355871884,
"grad_norm": 0.5043975710868835,
"learning_rate": 8.671295946372989e-06,
"loss": 0.4725,
"step": 294
},
{
"epoch": 0.31494661921708184,
"grad_norm": 0.4966917932033539,
"learning_rate": 8.658605750334972e-06,
"loss": 0.4764,
"step": 295
},
{
"epoch": 0.31601423487544483,
"grad_norm": 0.48992806673049927,
"learning_rate": 8.6458646223039e-06,
"loss": 0.4899,
"step": 296
},
{
"epoch": 0.31708185053380783,
"grad_norm": 0.5476608872413635,
"learning_rate": 8.6330727396509e-06,
"loss": 0.488,
"step": 297
},
{
"epoch": 0.3181494661921708,
"grad_norm": 0.6118818521499634,
"learning_rate": 8.620230280453672e-06,
"loss": 0.5071,
"step": 298
},
{
"epoch": 0.3192170818505338,
"grad_norm": 0.45298174023628235,
"learning_rate": 8.607337423493996e-06,
"loss": 0.471,
"step": 299
},
{
"epoch": 0.3202846975088968,
"grad_norm": 0.5458585023880005,
"learning_rate": 8.594394348255239e-06,
"loss": 0.5012,
"step": 300
},
{
"epoch": 0.32135231316725976,
"grad_norm": 0.5509236454963684,
"learning_rate": 8.581401234919873e-06,
"loss": 0.497,
"step": 301
},
{
"epoch": 0.32241992882562276,
"grad_norm": 0.520375669002533,
"learning_rate": 8.568358264366958e-06,
"loss": 0.4948,
"step": 302
},
{
"epoch": 0.32348754448398576,
"grad_norm": 0.46900251507759094,
"learning_rate": 8.555265618169615e-06,
"loss": 0.4987,
"step": 303
},
{
"epoch": 0.32455516014234875,
"grad_norm": 0.44442543387413025,
"learning_rate": 8.542123478592518e-06,
"loss": 0.4898,
"step": 304
},
{
"epoch": 0.32562277580071175,
"grad_norm": 0.43564245104789734,
"learning_rate": 8.528932028589337e-06,
"loss": 0.4587,
"step": 305
},
{
"epoch": 0.32669039145907475,
"grad_norm": 0.6464988589286804,
"learning_rate": 8.515691451800206e-06,
"loss": 0.511,
"step": 306
},
{
"epoch": 0.32775800711743774,
"grad_norm": 0.485740602016449,
"learning_rate": 8.502401932549154e-06,
"loss": 0.4917,
"step": 307
},
{
"epoch": 0.3288256227758007,
"grad_norm": 0.5098385214805603,
"learning_rate": 8.489063655841552e-06,
"loss": 0.4796,
"step": 308
},
{
"epoch": 0.3298932384341637,
"grad_norm": 0.5981292724609375,
"learning_rate": 8.475676807361526e-06,
"loss": 0.5112,
"step": 309
},
{
"epoch": 0.3309608540925267,
"grad_norm": 0.499467670917511,
"learning_rate": 8.462241573469378e-06,
"loss": 0.4924,
"step": 310
},
{
"epoch": 0.3320284697508897,
"grad_norm": 0.5141733884811401,
"learning_rate": 8.448758141198991e-06,
"loss": 0.4856,
"step": 311
},
{
"epoch": 0.3330960854092527,
"grad_norm": 0.49083369970321655,
"learning_rate": 8.435226698255228e-06,
"loss": 0.4927,
"step": 312
},
{
"epoch": 0.33416370106761567,
"grad_norm": 0.5083484053611755,
"learning_rate": 8.421647433011306e-06,
"loss": 0.4963,
"step": 313
},
{
"epoch": 0.33523131672597867,
"grad_norm": 0.5530070066452026,
"learning_rate": 8.408020534506195e-06,
"loss": 0.5088,
"step": 314
},
{
"epoch": 0.33629893238434166,
"grad_norm": 0.5097641944885254,
"learning_rate": 8.394346192441967e-06,
"loss": 0.4999,
"step": 315
},
{
"epoch": 0.3373665480427046,
"grad_norm": 0.5912004709243774,
"learning_rate": 8.380624597181165e-06,
"loss": 0.5071,
"step": 316
},
{
"epoch": 0.3384341637010676,
"grad_norm": 0.5386204123497009,
"learning_rate": 8.366855939744152e-06,
"loss": 0.5018,
"step": 317
},
{
"epoch": 0.3395017793594306,
"grad_norm": 0.6744493246078491,
"learning_rate": 8.353040411806449e-06,
"loss": 0.5036,
"step": 318
},
{
"epoch": 0.3405693950177936,
"grad_norm": 0.5442379117012024,
"learning_rate": 8.339178205696067e-06,
"loss": 0.5192,
"step": 319
},
{
"epoch": 0.3416370106761566,
"grad_norm": 0.4711393117904663,
"learning_rate": 8.325269514390835e-06,
"loss": 0.4805,
"step": 320
},
{
"epoch": 0.3427046263345196,
"grad_norm": 0.5519885420799255,
"learning_rate": 8.311314531515707e-06,
"loss": 0.4606,
"step": 321
},
{
"epoch": 0.3437722419928826,
"grad_norm": 0.48979809880256653,
"learning_rate": 8.297313451340064e-06,
"loss": 0.4683,
"step": 322
},
{
"epoch": 0.34483985765124553,
"grad_norm": 0.45639723539352417,
"learning_rate": 8.283266468775024e-06,
"loss": 0.4899,
"step": 323
},
{
"epoch": 0.3459074733096085,
"grad_norm": 0.559330940246582,
"learning_rate": 8.269173779370712e-06,
"loss": 0.4993,
"step": 324
},
{
"epoch": 0.3469750889679715,
"grad_norm": 0.5538395047187805,
"learning_rate": 8.255035579313545e-06,
"loss": 0.4826,
"step": 325
},
{
"epoch": 0.3480427046263345,
"grad_norm": 0.5136542320251465,
"learning_rate": 8.240852065423507e-06,
"loss": 0.4979,
"step": 326
},
{
"epoch": 0.3491103202846975,
"grad_norm": 0.5399389863014221,
"learning_rate": 8.226623435151389e-06,
"loss": 0.4782,
"step": 327
},
{
"epoch": 0.3501779359430605,
"grad_norm": 0.535988450050354,
"learning_rate": 8.21234988657607e-06,
"loss": 0.507,
"step": 328
},
{
"epoch": 0.3512455516014235,
"grad_norm": 0.4826440215110779,
"learning_rate": 8.198031618401733e-06,
"loss": 0.4858,
"step": 329
},
{
"epoch": 0.35231316725978645,
"grad_norm": 0.541845440864563,
"learning_rate": 8.183668829955111e-06,
"loss": 0.4436,
"step": 330
},
{
"epoch": 0.35338078291814945,
"grad_norm": 0.5265049338340759,
"learning_rate": 8.169261721182715e-06,
"loss": 0.4608,
"step": 331
},
{
"epoch": 0.35444839857651245,
"grad_norm": 0.5588465332984924,
"learning_rate": 8.154810492648038e-06,
"loss": 0.5055,
"step": 332
},
{
"epoch": 0.35551601423487544,
"grad_norm": 0.4561479091644287,
"learning_rate": 8.140315345528778e-06,
"loss": 0.4939,
"step": 333
},
{
"epoch": 0.35658362989323844,
"grad_norm": 0.4961983263492584,
"learning_rate": 8.125776481614025e-06,
"loss": 0.5079,
"step": 334
},
{
"epoch": 0.35765124555160144,
"grad_norm": 0.4646869897842407,
"learning_rate": 8.111194103301461e-06,
"loss": 0.4641,
"step": 335
},
{
"epoch": 0.35871886120996443,
"grad_norm": 0.5100634694099426,
"learning_rate": 8.096568413594533e-06,
"loss": 0.5032,
"step": 336
},
{
"epoch": 0.3597864768683274,
"grad_norm": 0.5835485458374023,
"learning_rate": 8.081899616099638e-06,
"loss": 0.4585,
"step": 337
},
{
"epoch": 0.3608540925266904,
"grad_norm": 0.39481019973754883,
"learning_rate": 8.067187915023283e-06,
"loss": 0.5012,
"step": 338
},
{
"epoch": 0.36192170818505337,
"grad_norm": 0.55184006690979,
"learning_rate": 8.052433515169235e-06,
"loss": 0.4703,
"step": 339
},
{
"epoch": 0.36298932384341637,
"grad_norm": 0.471427321434021,
"learning_rate": 8.037636621935686e-06,
"loss": 0.478,
"step": 340
},
{
"epoch": 0.36405693950177936,
"grad_norm": 0.47815489768981934,
"learning_rate": 8.022797441312376e-06,
"loss": 0.4687,
"step": 341
},
{
"epoch": 0.36512455516014236,
"grad_norm": 0.48546668887138367,
"learning_rate": 8.007916179877742e-06,
"loss": 0.5058,
"step": 342
},
{
"epoch": 0.36619217081850536,
"grad_norm": 0.48870334029197693,
"learning_rate": 7.99299304479603e-06,
"loss": 0.4874,
"step": 343
},
{
"epoch": 0.36725978647686836,
"grad_norm": 0.4691154658794403,
"learning_rate": 7.978028243814416e-06,
"loss": 0.4834,
"step": 344
},
{
"epoch": 0.3683274021352313,
"grad_norm": 0.49752214550971985,
"learning_rate": 7.96302198526011e-06,
"loss": 0.4959,
"step": 345
},
{
"epoch": 0.3693950177935943,
"grad_norm": 0.5052193403244019,
"learning_rate": 7.947974478037468e-06,
"loss": 0.4817,
"step": 346
},
{
"epoch": 0.3704626334519573,
"grad_norm": 0.5181514620780945,
"learning_rate": 7.932885931625063e-06,
"loss": 0.4578,
"step": 347
},
{
"epoch": 0.3715302491103203,
"grad_norm": 0.482715368270874,
"learning_rate": 7.917756556072792e-06,
"loss": 0.4587,
"step": 348
},
{
"epoch": 0.3725978647686833,
"grad_norm": 0.4123336672782898,
"learning_rate": 7.902586561998928e-06,
"loss": 0.454,
"step": 349
},
{
"epoch": 0.3736654804270463,
"grad_norm": 0.5441368222236633,
"learning_rate": 7.887376160587214e-06,
"loss": 0.4759,
"step": 350
},
{
"epoch": 0.3747330960854093,
"grad_norm": 0.48946669697761536,
"learning_rate": 7.8721255635839e-06,
"loss": 0.4942,
"step": 351
},
{
"epoch": 0.3758007117437722,
"grad_norm": 0.4664275348186493,
"learning_rate": 7.85683498329481e-06,
"loss": 0.4775,
"step": 352
},
{
"epoch": 0.3768683274021352,
"grad_norm": 0.5947299599647522,
"learning_rate": 7.841504632582378e-06,
"loss": 0.4781,
"step": 353
},
{
"epoch": 0.3779359430604982,
"grad_norm": 0.43907010555267334,
"learning_rate": 7.826134724862687e-06,
"loss": 0.4785,
"step": 354
},
{
"epoch": 0.3790035587188612,
"grad_norm": 0.49153631925582886,
"learning_rate": 7.810725474102504e-06,
"loss": 0.4623,
"step": 355
},
{
"epoch": 0.3800711743772242,
"grad_norm": 0.5009203553199768,
"learning_rate": 7.795277094816292e-06,
"loss": 0.4878,
"step": 356
},
{
"epoch": 0.3811387900355872,
"grad_norm": 0.5319011211395264,
"learning_rate": 7.779789802063229e-06,
"loss": 0.4535,
"step": 357
},
{
"epoch": 0.3822064056939502,
"grad_norm": 0.5173964500427246,
"learning_rate": 7.764263811444214e-06,
"loss": 0.4956,
"step": 358
},
{
"epoch": 0.38327402135231314,
"grad_norm": 0.4726311266422272,
"learning_rate": 7.748699339098864e-06,
"loss": 0.4771,
"step": 359
},
{
"epoch": 0.38434163701067614,
"grad_norm": 0.5030087232589722,
"learning_rate": 7.733096601702508e-06,
"loss": 0.4995,
"step": 360
},
{
"epoch": 0.38540925266903914,
"grad_norm": 0.4362412989139557,
"learning_rate": 7.717455816463161e-06,
"loss": 0.483,
"step": 361
},
{
"epoch": 0.38647686832740213,
"grad_norm": 0.45854416489601135,
"learning_rate": 7.70177720111852e-06,
"loss": 0.4828,
"step": 362
},
{
"epoch": 0.38754448398576513,
"grad_norm": 0.4099372327327728,
"learning_rate": 7.68606097393291e-06,
"loss": 0.4601,
"step": 363
},
{
"epoch": 0.38861209964412813,
"grad_norm": 0.5316334962844849,
"learning_rate": 7.67030735369426e-06,
"loss": 0.5109,
"step": 364
},
{
"epoch": 0.3896797153024911,
"grad_norm": 0.5196130871772766,
"learning_rate": 7.654516559711053e-06,
"loss": 0.4849,
"step": 365
},
{
"epoch": 0.39074733096085407,
"grad_norm": 0.4411613941192627,
"learning_rate": 7.638688811809274e-06,
"loss": 0.4807,
"step": 366
},
{
"epoch": 0.39181494661921706,
"grad_norm": 0.508170485496521,
"learning_rate": 7.622824330329345e-06,
"loss": 0.4694,
"step": 367
},
{
"epoch": 0.39288256227758006,
"grad_norm": 0.42211753129959106,
"learning_rate": 7.6069233361230696e-06,
"loss": 0.4573,
"step": 368
},
{
"epoch": 0.39395017793594306,
"grad_norm": 0.4601055085659027,
"learning_rate": 7.590986050550542e-06,
"loss": 0.4752,
"step": 369
},
{
"epoch": 0.39501779359430605,
"grad_norm": 0.4858173131942749,
"learning_rate": 7.575012695477076e-06,
"loss": 0.4706,
"step": 370
},
{
"epoch": 0.39608540925266905,
"grad_norm": 0.42238175868988037,
"learning_rate": 7.55900349327012e-06,
"loss": 0.4828,
"step": 371
},
{
"epoch": 0.39715302491103205,
"grad_norm": 0.4975998103618622,
"learning_rate": 7.542958666796149e-06,
"loss": 0.4884,
"step": 372
},
{
"epoch": 0.398220640569395,
"grad_norm": 0.48582613468170166,
"learning_rate": 7.526878439417572e-06,
"loss": 0.4961,
"step": 373
},
{
"epoch": 0.399288256227758,
"grad_norm": 0.4576529562473297,
"learning_rate": 7.510763034989616e-06,
"loss": 0.4311,
"step": 374
},
{
"epoch": 0.400355871886121,
"grad_norm": 0.48702099919319153,
"learning_rate": 7.494612677857218e-06,
"loss": 0.4955,
"step": 375
},
{
"epoch": 0.401423487544484,
"grad_norm": 0.4474165737628937,
"learning_rate": 7.478427592851894e-06,
"loss": 0.4615,
"step": 376
},
{
"epoch": 0.402491103202847,
"grad_norm": 0.4888235032558441,
"learning_rate": 7.462208005288609e-06,
"loss": 0.4711,
"step": 377
},
{
"epoch": 0.40355871886121,
"grad_norm": 0.5036333799362183,
"learning_rate": 7.44595414096265e-06,
"loss": 0.4885,
"step": 378
},
{
"epoch": 0.40462633451957297,
"grad_norm": 0.4840095639228821,
"learning_rate": 7.429666226146468e-06,
"loss": 0.4932,
"step": 379
},
{
"epoch": 0.40569395017793597,
"grad_norm": 0.4943961203098297,
"learning_rate": 7.413344487586542e-06,
"loss": 0.4874,
"step": 380
},
{
"epoch": 0.4067615658362989,
"grad_norm": 0.535376250743866,
"learning_rate": 7.396989152500215e-06,
"loss": 0.4982,
"step": 381
},
{
"epoch": 0.4078291814946619,
"grad_norm": 0.4504840672016144,
"learning_rate": 7.380600448572532e-06,
"loss": 0.436,
"step": 382
},
{
"epoch": 0.4088967971530249,
"grad_norm": 0.5047032833099365,
"learning_rate": 7.364178603953066e-06,
"loss": 0.4702,
"step": 383
},
{
"epoch": 0.4099644128113879,
"grad_norm": 0.4717814028263092,
"learning_rate": 7.347723847252756e-06,
"loss": 0.4783,
"step": 384
},
{
"epoch": 0.4110320284697509,
"grad_norm": 0.4659929871559143,
"learning_rate": 7.331236407540704e-06,
"loss": 0.4612,
"step": 385
},
{
"epoch": 0.4120996441281139,
"grad_norm": 0.47856637835502625,
"learning_rate": 7.314716514341007e-06,
"loss": 0.4766,
"step": 386
},
{
"epoch": 0.4131672597864769,
"grad_norm": 0.4641667902469635,
"learning_rate": 7.298164397629545e-06,
"loss": 0.4708,
"step": 387
},
{
"epoch": 0.41423487544483983,
"grad_norm": 0.5396067500114441,
"learning_rate": 7.28158028783079e-06,
"loss": 0.4809,
"step": 388
},
{
"epoch": 0.41530249110320283,
"grad_norm": 0.5329163670539856,
"learning_rate": 7.2649644158145925e-06,
"loss": 0.4829,
"step": 389
},
{
"epoch": 0.41637010676156583,
"grad_norm": 0.450914204120636,
"learning_rate": 7.248317012892969e-06,
"loss": 0.4527,
"step": 390
},
{
"epoch": 0.4174377224199288,
"grad_norm": 0.5790780782699585,
"learning_rate": 7.231638310816888e-06,
"loss": 0.4893,
"step": 391
},
{
"epoch": 0.4185053380782918,
"grad_norm": 0.5594152212142944,
"learning_rate": 7.214928541773027e-06,
"loss": 0.4794,
"step": 392
},
{
"epoch": 0.4195729537366548,
"grad_norm": 0.46533674001693726,
"learning_rate": 7.198187938380565e-06,
"loss": 0.466,
"step": 393
},
{
"epoch": 0.4206405693950178,
"grad_norm": 0.5824273228645325,
"learning_rate": 7.1814167336879195e-06,
"loss": 0.4833,
"step": 394
},
{
"epoch": 0.42170818505338076,
"grad_norm": 0.4478416442871094,
"learning_rate": 7.164615161169518e-06,
"loss": 0.5013,
"step": 395
},
{
"epoch": 0.42277580071174375,
"grad_norm": 0.6205080151557922,
"learning_rate": 7.147783454722545e-06,
"loss": 0.4905,
"step": 396
},
{
"epoch": 0.42384341637010675,
"grad_norm": 0.4739533066749573,
"learning_rate": 7.130921848663678e-06,
"loss": 0.4834,
"step": 397
},
{
"epoch": 0.42491103202846975,
"grad_norm": 0.4931207597255707,
"learning_rate": 7.1140305777258355e-06,
"loss": 0.5142,
"step": 398
},
{
"epoch": 0.42597864768683275,
"grad_norm": 0.5040392279624939,
"learning_rate": 7.097109877054906e-06,
"loss": 0.4679,
"step": 399
},
{
"epoch": 0.42704626334519574,
"grad_norm": 0.4795084595680237,
"learning_rate": 7.080159982206471e-06,
"loss": 0.4869,
"step": 400
},
{
"epoch": 0.42811387900355874,
"grad_norm": 0.5009298920631409,
"learning_rate": 7.06318112914253e-06,
"loss": 0.4786,
"step": 401
},
{
"epoch": 0.4291814946619217,
"grad_norm": 0.5377593040466309,
"learning_rate": 7.046173554228213e-06,
"loss": 0.4968,
"step": 402
},
{
"epoch": 0.4302491103202847,
"grad_norm": 0.4396429657936096,
"learning_rate": 7.029137494228491e-06,
"loss": 0.5166,
"step": 403
},
{
"epoch": 0.4313167259786477,
"grad_norm": 0.4758850336074829,
"learning_rate": 7.012073186304885e-06,
"loss": 0.4896,
"step": 404
},
{
"epoch": 0.43238434163701067,
"grad_norm": 0.4826003611087799,
"learning_rate": 6.994980868012151e-06,
"loss": 0.5043,
"step": 405
},
{
"epoch": 0.43345195729537367,
"grad_norm": 0.4461214244365692,
"learning_rate": 6.9778607772949894e-06,
"loss": 0.4657,
"step": 406
},
{
"epoch": 0.43451957295373667,
"grad_norm": 0.4717596769332886,
"learning_rate": 6.9607131524847175e-06,
"loss": 0.4889,
"step": 407
},
{
"epoch": 0.43558718861209966,
"grad_norm": 0.41522154211997986,
"learning_rate": 6.943538232295965e-06,
"loss": 0.4716,
"step": 408
},
{
"epoch": 0.43665480427046266,
"grad_norm": 0.5176120400428772,
"learning_rate": 6.926336255823341e-06,
"loss": 0.4855,
"step": 409
},
{
"epoch": 0.4377224199288256,
"grad_norm": 0.4708162248134613,
"learning_rate": 6.909107462538113e-06,
"loss": 0.4839,
"step": 410
},
{
"epoch": 0.4387900355871886,
"grad_norm": 0.39738133549690247,
"learning_rate": 6.891852092284863e-06,
"loss": 0.4911,
"step": 411
},
{
"epoch": 0.4398576512455516,
"grad_norm": 0.4732625186443329,
"learning_rate": 6.874570385278161e-06,
"loss": 0.4938,
"step": 412
},
{
"epoch": 0.4409252669039146,
"grad_norm": 0.5151704549789429,
"learning_rate": 6.857262582099209e-06,
"loss": 0.504,
"step": 413
},
{
"epoch": 0.4419928825622776,
"grad_norm": 0.49842819571495056,
"learning_rate": 6.839928923692505e-06,
"loss": 0.5116,
"step": 414
},
{
"epoch": 0.4430604982206406,
"grad_norm": 0.4782036244869232,
"learning_rate": 6.822569651362475e-06,
"loss": 0.4888,
"step": 415
},
{
"epoch": 0.4441281138790036,
"grad_norm": 0.4534831941127777,
"learning_rate": 6.805185006770125e-06,
"loss": 0.4548,
"step": 416
},
{
"epoch": 0.4451957295373665,
"grad_norm": 0.5043431520462036,
"learning_rate": 6.787775231929666e-06,
"loss": 0.5011,
"step": 417
},
{
"epoch": 0.4462633451957295,
"grad_norm": 0.47425511479377747,
"learning_rate": 6.7703405692051585e-06,
"loss": 0.4861,
"step": 418
},
{
"epoch": 0.4473309608540925,
"grad_norm": 0.4268990159034729,
"learning_rate": 6.752881261307125e-06,
"loss": 0.4773,
"step": 419
},
{
"epoch": 0.4483985765124555,
"grad_norm": 0.459902822971344,
"learning_rate": 6.735397551289179e-06,
"loss": 0.4815,
"step": 420
},
{
"epoch": 0.4494661921708185,
"grad_norm": 0.5495928525924683,
"learning_rate": 6.717889682544641e-06,
"loss": 0.5039,
"step": 421
},
{
"epoch": 0.4505338078291815,
"grad_norm": 0.4123859703540802,
"learning_rate": 6.700357898803146e-06,
"loss": 0.487,
"step": 422
},
{
"epoch": 0.4516014234875445,
"grad_norm": 0.44671013951301575,
"learning_rate": 6.6828024441272554e-06,
"loss": 0.4913,
"step": 423
},
{
"epoch": 0.45266903914590745,
"grad_norm": 0.4648853838443756,
"learning_rate": 6.665223562909058e-06,
"loss": 0.4852,
"step": 424
},
{
"epoch": 0.45373665480427045,
"grad_norm": 0.494157075881958,
"learning_rate": 6.647621499866762e-06,
"loss": 0.4851,
"step": 425
},
{
"epoch": 0.45480427046263344,
"grad_norm": 0.5244255661964417,
"learning_rate": 6.629996500041299e-06,
"loss": 0.4945,
"step": 426
},
{
"epoch": 0.45587188612099644,
"grad_norm": 0.48558488488197327,
"learning_rate": 6.612348808792904e-06,
"loss": 0.4829,
"step": 427
},
{
"epoch": 0.45693950177935944,
"grad_norm": 0.46548742055892944,
"learning_rate": 6.5946786717977026e-06,
"loss": 0.5057,
"step": 428
},
{
"epoch": 0.45800711743772243,
"grad_norm": 0.49716660380363464,
"learning_rate": 6.576986335044292e-06,
"loss": 0.4682,
"step": 429
},
{
"epoch": 0.45907473309608543,
"grad_norm": 0.427898108959198,
"learning_rate": 6.5592720448303174e-06,
"loss": 0.4922,
"step": 430
},
{
"epoch": 0.46014234875444837,
"grad_norm": 0.46051132678985596,
"learning_rate": 6.541536047759034e-06,
"loss": 0.4756,
"step": 431
},
{
"epoch": 0.46120996441281137,
"grad_norm": 0.4844045639038086,
"learning_rate": 6.523778590735892e-06,
"loss": 0.5199,
"step": 432
},
{
"epoch": 0.46227758007117437,
"grad_norm": 0.4431370496749878,
"learning_rate": 6.5059999209650795e-06,
"loss": 0.4744,
"step": 433
},
{
"epoch": 0.46334519572953736,
"grad_norm": 0.4615848958492279,
"learning_rate": 6.488200285946094e-06,
"loss": 0.4459,
"step": 434
},
{
"epoch": 0.46441281138790036,
"grad_norm": 0.5034524202346802,
"learning_rate": 6.470379933470296e-06,
"loss": 0.4859,
"step": 435
},
{
"epoch": 0.46548042704626336,
"grad_norm": 0.42077118158340454,
"learning_rate": 6.452539111617454e-06,
"loss": 0.4703,
"step": 436
},
{
"epoch": 0.46654804270462635,
"grad_norm": 0.5283306241035461,
"learning_rate": 6.434678068752293e-06,
"loss": 0.4733,
"step": 437
},
{
"epoch": 0.46761565836298935,
"grad_norm": 0.48218491673469543,
"learning_rate": 6.416797053521039e-06,
"loss": 0.4779,
"step": 438
},
{
"epoch": 0.4686832740213523,
"grad_norm": 0.4461103677749634,
"learning_rate": 6.398896314847954e-06,
"loss": 0.4851,
"step": 439
},
{
"epoch": 0.4697508896797153,
"grad_norm": 0.47475722432136536,
"learning_rate": 6.380976101931879e-06,
"loss": 0.4747,
"step": 440
},
{
"epoch": 0.4708185053380783,
"grad_norm": 0.4456132650375366,
"learning_rate": 6.363036664242751e-06,
"loss": 0.4364,
"step": 441
},
{
"epoch": 0.4718861209964413,
"grad_norm": 0.4457268714904785,
"learning_rate": 6.345078251518144e-06,
"loss": 0.4487,
"step": 442
},
{
"epoch": 0.4729537366548043,
"grad_norm": 0.4818935990333557,
"learning_rate": 6.327101113759783e-06,
"loss": 0.5008,
"step": 443
},
{
"epoch": 0.4740213523131673,
"grad_norm": 0.44397759437561035,
"learning_rate": 6.3091055012300675e-06,
"loss": 0.4546,
"step": 444
},
{
"epoch": 0.4750889679715303,
"grad_norm": 0.4248422086238861,
"learning_rate": 6.291091664448589e-06,
"loss": 0.4797,
"step": 445
},
{
"epoch": 0.4761565836298932,
"grad_norm": 0.48325735330581665,
"learning_rate": 6.273059854188636e-06,
"loss": 0.4949,
"step": 446
},
{
"epoch": 0.4772241992882562,
"grad_norm": 0.44900190830230713,
"learning_rate": 6.25501032147372e-06,
"loss": 0.4731,
"step": 447
},
{
"epoch": 0.4782918149466192,
"grad_norm": 0.4795812964439392,
"learning_rate": 6.236943317574054e-06,
"loss": 0.466,
"step": 448
},
{
"epoch": 0.4793594306049822,
"grad_norm": 0.49573490023612976,
"learning_rate": 6.218859094003082e-06,
"loss": 0.4884,
"step": 449
},
{
"epoch": 0.4804270462633452,
"grad_norm": 0.40788835287094116,
"learning_rate": 6.200757902513962e-06,
"loss": 0.4572,
"step": 450
},
{
"epoch": 0.4814946619217082,
"grad_norm": 0.44407787919044495,
"learning_rate": 6.182639995096061e-06,
"loss": 0.5016,
"step": 451
},
{
"epoch": 0.4825622775800712,
"grad_norm": 0.43770918250083923,
"learning_rate": 6.164505623971458e-06,
"loss": 0.4699,
"step": 452
},
{
"epoch": 0.48362989323843414,
"grad_norm": 0.41643866896629333,
"learning_rate": 6.146355041591419e-06,
"loss": 0.4783,
"step": 453
},
{
"epoch": 0.48469750889679714,
"grad_norm": 0.44599294662475586,
"learning_rate": 6.128188500632892e-06,
"loss": 0.4764,
"step": 454
},
{
"epoch": 0.48576512455516013,
"grad_norm": 0.4716036319732666,
"learning_rate": 6.11000625399499e-06,
"loss": 0.4683,
"step": 455
},
{
"epoch": 0.48683274021352313,
"grad_norm": 0.49038171768188477,
"learning_rate": 6.091808554795462e-06,
"loss": 0.4716,
"step": 456
},
{
"epoch": 0.4879003558718861,
"grad_norm": 0.43345335125923157,
"learning_rate": 6.073595656367175e-06,
"loss": 0.4742,
"step": 457
},
{
"epoch": 0.4889679715302491,
"grad_norm": 0.4429580569267273,
"learning_rate": 6.055367812254592e-06,
"loss": 0.4951,
"step": 458
},
{
"epoch": 0.4900355871886121,
"grad_norm": 0.510330319404602,
"learning_rate": 6.037125276210229e-06,
"loss": 0.4771,
"step": 459
},
{
"epoch": 0.49110320284697506,
"grad_norm": 0.42020678520202637,
"learning_rate": 6.0188683021911394e-06,
"loss": 0.4939,
"step": 460
},
{
"epoch": 0.49217081850533806,
"grad_norm": 0.45770880579948425,
"learning_rate": 6.000597144355361e-06,
"loss": 0.4931,
"step": 461
},
{
"epoch": 0.49323843416370106,
"grad_norm": 0.3960902690887451,
"learning_rate": 5.982312057058392e-06,
"loss": 0.4706,
"step": 462
},
{
"epoch": 0.49430604982206405,
"grad_norm": 0.5214159488677979,
"learning_rate": 5.964013294849646e-06,
"loss": 0.4777,
"step": 463
},
{
"epoch": 0.49537366548042705,
"grad_norm": 0.43403932452201843,
"learning_rate": 5.9457011124689025e-06,
"loss": 0.4688,
"step": 464
},
{
"epoch": 0.49644128113879005,
"grad_norm": 0.4650368392467499,
"learning_rate": 5.927375764842766e-06,
"loss": 0.467,
"step": 465
},
{
"epoch": 0.49750889679715304,
"grad_norm": 0.4884885847568512,
"learning_rate": 5.9090375070811215e-06,
"loss": 0.4872,
"step": 466
},
{
"epoch": 0.49857651245551604,
"grad_norm": 0.4051380157470703,
"learning_rate": 5.890686594473571e-06,
"loss": 0.4685,
"step": 467
},
{
"epoch": 0.499644128113879,
"grad_norm": 0.443988561630249,
"learning_rate": 5.872323282485889e-06,
"loss": 0.4981,
"step": 468
},
{
"epoch": 0.500711743772242,
"grad_norm": 0.444369375705719,
"learning_rate": 5.853947826756465e-06,
"loss": 0.465,
"step": 469
},
{
"epoch": 0.501779359430605,
"grad_norm": 0.42520400881767273,
"learning_rate": 5.835560483092743e-06,
"loss": 0.484,
"step": 470
},
{
"epoch": 0.5028469750889679,
"grad_norm": 0.45270073413848877,
"learning_rate": 5.8171615074676615e-06,
"loss": 0.4886,
"step": 471
},
{
"epoch": 0.5039145907473309,
"grad_norm": 0.47045156359672546,
"learning_rate": 5.798751156016085e-06,
"loss": 0.4733,
"step": 472
},
{
"epoch": 0.5049822064056939,
"grad_norm": 0.49486202001571655,
"learning_rate": 5.780329685031247e-06,
"loss": 0.4799,
"step": 473
},
{
"epoch": 0.5060498220640569,
"grad_norm": 0.495645135641098,
"learning_rate": 5.7618973509611755e-06,
"loss": 0.483,
"step": 474
},
{
"epoch": 0.5071174377224199,
"grad_norm": 0.4936763048171997,
"learning_rate": 5.743454410405126e-06,
"loss": 0.495,
"step": 475
},
{
"epoch": 0.5081850533807829,
"grad_norm": 0.3758457899093628,
"learning_rate": 5.72500112011001e-06,
"loss": 0.4616,
"step": 476
},
{
"epoch": 0.5092526690391459,
"grad_norm": 0.45885196328163147,
"learning_rate": 5.706537736966814e-06,
"loss": 0.4808,
"step": 477
},
{
"epoch": 0.5103202846975089,
"grad_norm": 0.46448948979377747,
"learning_rate": 5.688064518007036e-06,
"loss": 0.495,
"step": 478
},
{
"epoch": 0.5113879003558719,
"grad_norm": 0.43365931510925293,
"learning_rate": 5.669581720399094e-06,
"loss": 0.4811,
"step": 479
},
{
"epoch": 0.5124555160142349,
"grad_norm": 0.4774491786956787,
"learning_rate": 5.651089601444752e-06,
"loss": 0.4794,
"step": 480
},
{
"epoch": 0.5135231316725979,
"grad_norm": 0.48784658312797546,
"learning_rate": 5.632588418575542e-06,
"loss": 0.4799,
"step": 481
},
{
"epoch": 0.5145907473309609,
"grad_norm": 0.49221184849739075,
"learning_rate": 5.614078429349172e-06,
"loss": 0.4921,
"step": 482
},
{
"epoch": 0.5156583629893239,
"grad_norm": 0.45859670639038086,
"learning_rate": 5.5955598914459465e-06,
"loss": 0.481,
"step": 483
},
{
"epoch": 0.5167259786476869,
"grad_norm": 0.5400739908218384,
"learning_rate": 5.577033062665179e-06,
"loss": 0.4904,
"step": 484
},
{
"epoch": 0.5177935943060499,
"grad_norm": 0.4636092483997345,
"learning_rate": 5.558498200921597e-06,
"loss": 0.4493,
"step": 485
},
{
"epoch": 0.5188612099644128,
"grad_norm": 0.4300142228603363,
"learning_rate": 5.53995556424176e-06,
"loss": 0.4679,
"step": 486
},
{
"epoch": 0.5199288256227758,
"grad_norm": 0.4838177561759949,
"learning_rate": 5.521405410760462e-06,
"loss": 0.4625,
"step": 487
},
{
"epoch": 0.5209964412811388,
"grad_norm": 0.47749972343444824,
"learning_rate": 5.50284799871714e-06,
"loss": 0.4488,
"step": 488
},
{
"epoch": 0.5220640569395018,
"grad_norm": 0.4752497673034668,
"learning_rate": 5.484283586452279e-06,
"loss": 0.5103,
"step": 489
},
{
"epoch": 0.5231316725978647,
"grad_norm": 0.463785856962204,
"learning_rate": 5.465712432403812e-06,
"loss": 0.4624,
"step": 490
},
{
"epoch": 0.5241992882562277,
"grad_norm": 0.4545430839061737,
"learning_rate": 5.447134795103531e-06,
"loss": 0.4719,
"step": 491
},
{
"epoch": 0.5252669039145907,
"grad_norm": 0.41529300808906555,
"learning_rate": 5.428550933173476e-06,
"loss": 0.4708,
"step": 492
},
{
"epoch": 0.5263345195729537,
"grad_norm": 0.4432843327522278,
"learning_rate": 5.409961105322347e-06,
"loss": 0.4675,
"step": 493
},
{
"epoch": 0.5274021352313167,
"grad_norm": 0.48375219106674194,
"learning_rate": 5.391365570341893e-06,
"loss": 0.4847,
"step": 494
},
{
"epoch": 0.5284697508896797,
"grad_norm": 0.40725329518318176,
"learning_rate": 5.372764587103309e-06,
"loss": 0.4477,
"step": 495
},
{
"epoch": 0.5295373665480427,
"grad_norm": 0.4455367624759674,
"learning_rate": 5.3541584145536475e-06,
"loss": 0.4819,
"step": 496
},
{
"epoch": 0.5306049822064057,
"grad_norm": 0.45164966583251953,
"learning_rate": 5.335547311712188e-06,
"loss": 0.4642,
"step": 497
},
{
"epoch": 0.5316725978647687,
"grad_norm": 0.49471500515937805,
"learning_rate": 5.3169315376668566e-06,
"loss": 0.4823,
"step": 498
},
{
"epoch": 0.5327402135231317,
"grad_norm": 0.4399643838405609,
"learning_rate": 5.2983113515706045e-06,
"loss": 0.4819,
"step": 499
},
{
"epoch": 0.5338078291814946,
"grad_norm": 0.5442211627960205,
"learning_rate": 5.279687012637798e-06,
"loss": 0.4677,
"step": 500
},
{
"epoch": 0.5348754448398576,
"grad_norm": 0.45155906677246094,
"learning_rate": 5.2610587801406256e-06,
"loss": 0.4878,
"step": 501
},
{
"epoch": 0.5359430604982206,
"grad_norm": 0.5040996670722961,
"learning_rate": 5.242426913405471e-06,
"loss": 0.482,
"step": 502
},
{
"epoch": 0.5370106761565836,
"grad_norm": 0.4809477925300598,
"learning_rate": 5.223791671809314e-06,
"loss": 0.4697,
"step": 503
},
{
"epoch": 0.5380782918149466,
"grad_norm": 0.5347772240638733,
"learning_rate": 5.2051533147761155e-06,
"loss": 0.4574,
"step": 504
},
{
"epoch": 0.5391459074733096,
"grad_norm": 0.48782646656036377,
"learning_rate": 5.186512101773206e-06,
"loss": 0.4747,
"step": 505
},
{
"epoch": 0.5402135231316726,
"grad_norm": 0.46589890122413635,
"learning_rate": 5.167868292307679e-06,
"loss": 0.4814,
"step": 506
},
{
"epoch": 0.5412811387900356,
"grad_norm": 0.5483913421630859,
"learning_rate": 5.149222145922765e-06,
"loss": 0.4807,
"step": 507
},
{
"epoch": 0.5423487544483986,
"grad_norm": 0.4894302487373352,
"learning_rate": 5.130573922194236e-06,
"loss": 0.4881,
"step": 508
},
{
"epoch": 0.5434163701067616,
"grad_norm": 0.41597887873649597,
"learning_rate": 5.111923880726779e-06,
"loss": 0.4766,
"step": 509
},
{
"epoch": 0.5444839857651246,
"grad_norm": 0.5004387497901917,
"learning_rate": 5.093272281150383e-06,
"loss": 0.4656,
"step": 510
},
{
"epoch": 0.5455516014234876,
"grad_norm": 0.4926692843437195,
"learning_rate": 5.074619383116733e-06,
"loss": 0.4579,
"step": 511
},
{
"epoch": 0.5466192170818506,
"grad_norm": 0.4545387029647827,
"learning_rate": 5.05596544629559e-06,
"loss": 0.4748,
"step": 512
},
{
"epoch": 0.5476868327402136,
"grad_norm": 0.4115523397922516,
"learning_rate": 5.03731073037117e-06,
"loss": 0.4856,
"step": 513
},
{
"epoch": 0.5487544483985766,
"grad_norm": 0.4478975236415863,
"learning_rate": 5.018655495038542e-06,
"loss": 0.4728,
"step": 514
},
{
"epoch": 0.5498220640569395,
"grad_norm": 0.4211094081401825,
"learning_rate": 5e-06,
"loss": 0.4614,
"step": 515
},
{
"epoch": 0.5508896797153024,
"grad_norm": 0.45692694187164307,
"learning_rate": 4.981344504961459e-06,
"loss": 0.4711,
"step": 516
},
{
"epoch": 0.5519572953736654,
"grad_norm": 0.41460829973220825,
"learning_rate": 4.962689269628832e-06,
"loss": 0.467,
"step": 517
},
{
"epoch": 0.5530249110320284,
"grad_norm": 0.4291308522224426,
"learning_rate": 4.944034553704412e-06,
"loss": 0.4901,
"step": 518
},
{
"epoch": 0.5540925266903914,
"grad_norm": 0.4243936538696289,
"learning_rate": 4.9253806168832685e-06,
"loss": 0.4966,
"step": 519
},
{
"epoch": 0.5551601423487544,
"grad_norm": 0.42817196249961853,
"learning_rate": 4.906727718849619e-06,
"loss": 0.46,
"step": 520
},
{
"epoch": 0.5562277580071174,
"grad_norm": 0.419493168592453,
"learning_rate": 4.888076119273223e-06,
"loss": 0.4788,
"step": 521
},
{
"epoch": 0.5572953736654804,
"grad_norm": 0.4330461919307709,
"learning_rate": 4.8694260778057655e-06,
"loss": 0.4875,
"step": 522
},
{
"epoch": 0.5583629893238434,
"grad_norm": 0.44664815068244934,
"learning_rate": 4.850777854077235e-06,
"loss": 0.4645,
"step": 523
},
{
"epoch": 0.5594306049822064,
"grad_norm": 0.4051723778247833,
"learning_rate": 4.832131707692322e-06,
"loss": 0.4596,
"step": 524
},
{
"epoch": 0.5604982206405694,
"grad_norm": 0.4329952895641327,
"learning_rate": 4.813487898226794e-06,
"loss": 0.4652,
"step": 525
},
{
"epoch": 0.5615658362989324,
"grad_norm": 0.4694920480251312,
"learning_rate": 4.7948466852238844e-06,
"loss": 0.4751,
"step": 526
},
{
"epoch": 0.5626334519572954,
"grad_norm": 0.42388251423835754,
"learning_rate": 4.7762083281906864e-06,
"loss": 0.457,
"step": 527
},
{
"epoch": 0.5637010676156584,
"grad_norm": 0.4349200129508972,
"learning_rate": 4.757573086594529e-06,
"loss": 0.4655,
"step": 528
},
{
"epoch": 0.5647686832740213,
"grad_norm": 0.42134931683540344,
"learning_rate": 4.738941219859375e-06,
"loss": 0.4806,
"step": 529
},
{
"epoch": 0.5658362989323843,
"grad_norm": 0.4428733289241791,
"learning_rate": 4.720312987362204e-06,
"loss": 0.4581,
"step": 530
},
{
"epoch": 0.5669039145907473,
"grad_norm": 0.4444166123867035,
"learning_rate": 4.701688648429399e-06,
"loss": 0.4592,
"step": 531
},
{
"epoch": 0.5679715302491103,
"grad_norm": 0.37343311309814453,
"learning_rate": 4.683068462333144e-06,
"loss": 0.4742,
"step": 532
},
{
"epoch": 0.5690391459074733,
"grad_norm": 0.4210268557071686,
"learning_rate": 4.6644526882878145e-06,
"loss": 0.4853,
"step": 533
},
{
"epoch": 0.5701067615658363,
"grad_norm": 0.46207836270332336,
"learning_rate": 4.645841585446356e-06,
"loss": 0.4698,
"step": 534
},
{
"epoch": 0.5711743772241993,
"grad_norm": 0.469249963760376,
"learning_rate": 4.6272354128966924e-06,
"loss": 0.4578,
"step": 535
},
{
"epoch": 0.5722419928825623,
"grad_norm": 0.45976918935775757,
"learning_rate": 4.6086344296581095e-06,
"loss": 0.4904,
"step": 536
},
{
"epoch": 0.5733096085409253,
"grad_norm": 0.4256848096847534,
"learning_rate": 4.590038894677653e-06,
"loss": 0.4615,
"step": 537
},
{
"epoch": 0.5743772241992883,
"grad_norm": 0.4688819348812103,
"learning_rate": 4.5714490668265245e-06,
"loss": 0.4806,
"step": 538
},
{
"epoch": 0.5754448398576513,
"grad_norm": 0.44800180196762085,
"learning_rate": 4.55286520489647e-06,
"loss": 0.4964,
"step": 539
},
{
"epoch": 0.5765124555160143,
"grad_norm": 0.42129072546958923,
"learning_rate": 4.534287567596189e-06,
"loss": 0.4693,
"step": 540
},
{
"epoch": 0.5775800711743773,
"grad_norm": 0.41702598333358765,
"learning_rate": 4.515716413547722e-06,
"loss": 0.4818,
"step": 541
},
{
"epoch": 0.5786476868327403,
"grad_norm": 0.46011829376220703,
"learning_rate": 4.497152001282861e-06,
"loss": 0.4997,
"step": 542
},
{
"epoch": 0.5797153024911033,
"grad_norm": 0.4624707102775574,
"learning_rate": 4.478594589239539e-06,
"loss": 0.5302,
"step": 543
},
{
"epoch": 0.5807829181494661,
"grad_norm": 0.4008091688156128,
"learning_rate": 4.460044435758241e-06,
"loss": 0.4739,
"step": 544
},
{
"epoch": 0.5818505338078291,
"grad_norm": 0.41396379470825195,
"learning_rate": 4.441501799078405e-06,
"loss": 0.4919,
"step": 545
},
{
"epoch": 0.5829181494661921,
"grad_norm": 0.4313451945781708,
"learning_rate": 4.4229669373348225e-06,
"loss": 0.4872,
"step": 546
},
{
"epoch": 0.5839857651245551,
"grad_norm": 0.41983485221862793,
"learning_rate": 4.404440108554055e-06,
"loss": 0.492,
"step": 547
},
{
"epoch": 0.5850533807829181,
"grad_norm": 0.4576341211795807,
"learning_rate": 4.3859215706508295e-06,
"loss": 0.4676,
"step": 548
},
{
"epoch": 0.5861209964412811,
"grad_norm": 0.4995148181915283,
"learning_rate": 4.3674115814244595e-06,
"loss": 0.5213,
"step": 549
},
{
"epoch": 0.5871886120996441,
"grad_norm": 0.45019280910491943,
"learning_rate": 4.348910398555249e-06,
"loss": 0.4792,
"step": 550
},
{
"epoch": 0.5882562277580071,
"grad_norm": 0.4817792475223541,
"learning_rate": 4.330418279600907e-06,
"loss": 0.4968,
"step": 551
},
{
"epoch": 0.5893238434163701,
"grad_norm": 0.4851461946964264,
"learning_rate": 4.311935481992965e-06,
"loss": 0.4855,
"step": 552
},
{
"epoch": 0.5903914590747331,
"grad_norm": 0.3923802673816681,
"learning_rate": 4.2934622630331855e-06,
"loss": 0.4775,
"step": 553
},
{
"epoch": 0.5914590747330961,
"grad_norm": 0.46750932931900024,
"learning_rate": 4.274998879889991e-06,
"loss": 0.4677,
"step": 554
},
{
"epoch": 0.5925266903914591,
"grad_norm": 0.456074982881546,
"learning_rate": 4.2565455895948745e-06,
"loss": 0.4664,
"step": 555
},
{
"epoch": 0.5935943060498221,
"grad_norm": 0.5120862126350403,
"learning_rate": 4.238102649038825e-06,
"loss": 0.4831,
"step": 556
},
{
"epoch": 0.5946619217081851,
"grad_norm": 0.3992975950241089,
"learning_rate": 4.219670314968754e-06,
"loss": 0.4801,
"step": 557
},
{
"epoch": 0.595729537366548,
"grad_norm": 0.4378175437450409,
"learning_rate": 4.2012488439839185e-06,
"loss": 0.4724,
"step": 558
},
{
"epoch": 0.596797153024911,
"grad_norm": 0.4528578221797943,
"learning_rate": 4.182838492532342e-06,
"loss": 0.449,
"step": 559
},
{
"epoch": 0.597864768683274,
"grad_norm": 0.46270951628685,
"learning_rate": 4.164439516907258e-06,
"loss": 0.483,
"step": 560
},
{
"epoch": 0.598932384341637,
"grad_norm": 0.4630880057811737,
"learning_rate": 4.146052173243538e-06,
"loss": 0.4694,
"step": 561
},
{
"epoch": 0.6,
"grad_norm": 0.44219690561294556,
"learning_rate": 4.127676717514114e-06,
"loss": 0.5014,
"step": 562
},
{
"epoch": 0.601067615658363,
"grad_norm": 0.41033241152763367,
"learning_rate": 4.109313405526433e-06,
"loss": 0.4957,
"step": 563
},
{
"epoch": 0.602135231316726,
"grad_norm": 0.4031945765018463,
"learning_rate": 4.090962492918881e-06,
"loss": 0.4555,
"step": 564
},
{
"epoch": 0.603202846975089,
"grad_norm": 0.5318504571914673,
"learning_rate": 4.072624235157234e-06,
"loss": 0.4834,
"step": 565
},
{
"epoch": 0.604270462633452,
"grad_norm": 0.39881038665771484,
"learning_rate": 4.054298887531099e-06,
"loss": 0.4665,
"step": 566
},
{
"epoch": 0.605338078291815,
"grad_norm": 0.4450599253177643,
"learning_rate": 4.035986705150355e-06,
"loss": 0.4724,
"step": 567
},
{
"epoch": 0.606405693950178,
"grad_norm": 0.43651196360588074,
"learning_rate": 4.017687942941609e-06,
"loss": 0.5019,
"step": 568
},
{
"epoch": 0.607473309608541,
"grad_norm": 0.41378405690193176,
"learning_rate": 3.9994028556446404e-06,
"loss": 0.4942,
"step": 569
},
{
"epoch": 0.608540925266904,
"grad_norm": 0.45917779207229614,
"learning_rate": 3.981131697808862e-06,
"loss": 0.476,
"step": 570
},
{
"epoch": 0.609608540925267,
"grad_norm": 0.3615592420101166,
"learning_rate": 3.9628747237897715e-06,
"loss": 0.4678,
"step": 571
},
{
"epoch": 0.61067615658363,
"grad_norm": 0.4037294387817383,
"learning_rate": 3.94463218774541e-06,
"loss": 0.4717,
"step": 572
},
{
"epoch": 0.6117437722419928,
"grad_norm": 0.4695199429988861,
"learning_rate": 3.926404343632826e-06,
"loss": 0.4758,
"step": 573
},
{
"epoch": 0.6128113879003558,
"grad_norm": 0.45771774649620056,
"learning_rate": 3.90819144520454e-06,
"loss": 0.4931,
"step": 574
},
{
"epoch": 0.6138790035587188,
"grad_norm": 0.36958596110343933,
"learning_rate": 3.889993746005011e-06,
"loss": 0.4487,
"step": 575
},
{
"epoch": 0.6149466192170818,
"grad_norm": 0.4408724904060364,
"learning_rate": 3.8718114993671086e-06,
"loss": 0.4563,
"step": 576
},
{
"epoch": 0.6160142348754448,
"grad_norm": 0.4657142162322998,
"learning_rate": 3.853644958408582e-06,
"loss": 0.4743,
"step": 577
},
{
"epoch": 0.6170818505338078,
"grad_norm": 0.42271625995635986,
"learning_rate": 3.835494376028544e-06,
"loss": 0.494,
"step": 578
},
{
"epoch": 0.6181494661921708,
"grad_norm": 0.4289335608482361,
"learning_rate": 3.817360004903939e-06,
"loss": 0.4617,
"step": 579
},
{
"epoch": 0.6192170818505338,
"grad_norm": 0.4913620352745056,
"learning_rate": 3.799242097486038e-06,
"loss": 0.4606,
"step": 580
},
{
"epoch": 0.6202846975088968,
"grad_norm": 0.4116392731666565,
"learning_rate": 3.7811409059969177e-06,
"loss": 0.4623,
"step": 581
},
{
"epoch": 0.6213523131672598,
"grad_norm": 0.4178345799446106,
"learning_rate": 3.7630566824259456e-06,
"loss": 0.5072,
"step": 582
},
{
"epoch": 0.6224199288256228,
"grad_norm": 0.4855571687221527,
"learning_rate": 3.7449896785262817e-06,
"loss": 0.4737,
"step": 583
},
{
"epoch": 0.6234875444839858,
"grad_norm": 0.46843597292900085,
"learning_rate": 3.726940145811363e-06,
"loss": 0.4703,
"step": 584
},
{
"epoch": 0.6245551601423488,
"grad_norm": 0.38505470752716064,
"learning_rate": 3.708908335551412e-06,
"loss": 0.4872,
"step": 585
},
{
"epoch": 0.6256227758007118,
"grad_norm": 0.42972132563591003,
"learning_rate": 3.6908944987699346e-06,
"loss": 0.4792,
"step": 586
},
{
"epoch": 0.6266903914590747,
"grad_norm": 0.5449157357215881,
"learning_rate": 3.67289888624022e-06,
"loss": 0.5233,
"step": 587
},
{
"epoch": 0.6277580071174377,
"grad_norm": 0.4144046902656555,
"learning_rate": 3.6549217484818576e-06,
"loss": 0.4798,
"step": 588
},
{
"epoch": 0.6288256227758007,
"grad_norm": 0.42087435722351074,
"learning_rate": 3.6369633357572514e-06,
"loss": 0.4573,
"step": 589
},
{
"epoch": 0.6298932384341637,
"grad_norm": 0.42363405227661133,
"learning_rate": 3.6190238980681235e-06,
"loss": 0.4652,
"step": 590
},
{
"epoch": 0.6309608540925267,
"grad_norm": 0.4286684989929199,
"learning_rate": 3.6011036851520465e-06,
"loss": 0.4637,
"step": 591
},
{
"epoch": 0.6320284697508897,
"grad_norm": 0.4206468462944031,
"learning_rate": 3.583202946478963e-06,
"loss": 0.4761,
"step": 592
},
{
"epoch": 0.6330960854092527,
"grad_norm": 0.3857564926147461,
"learning_rate": 3.5653219312477085e-06,
"loss": 0.4771,
"step": 593
},
{
"epoch": 0.6341637010676157,
"grad_norm": 0.42064541578292847,
"learning_rate": 3.5474608883825475e-06,
"loss": 0.499,
"step": 594
},
{
"epoch": 0.6352313167259787,
"grad_norm": 0.35660263895988464,
"learning_rate": 3.529620066529704e-06,
"loss": 0.4626,
"step": 595
},
{
"epoch": 0.6362989323843417,
"grad_norm": 0.4862718880176544,
"learning_rate": 3.5117997140539073e-06,
"loss": 0.5183,
"step": 596
},
{
"epoch": 0.6373665480427047,
"grad_norm": 0.44003114104270935,
"learning_rate": 3.4940000790349226e-06,
"loss": 0.4649,
"step": 597
},
{
"epoch": 0.6384341637010676,
"grad_norm": 0.38733163475990295,
"learning_rate": 3.47622140926411e-06,
"loss": 0.4378,
"step": 598
},
{
"epoch": 0.6395017793594306,
"grad_norm": 0.41804930567741394,
"learning_rate": 3.458463952240967e-06,
"loss": 0.4664,
"step": 599
},
{
"epoch": 0.6405693950177936,
"grad_norm": 0.44392499327659607,
"learning_rate": 3.4407279551696846e-06,
"loss": 0.4655,
"step": 600
},
{
"epoch": 0.6416370106761566,
"grad_norm": 0.39837706089019775,
"learning_rate": 3.4230136649557087e-06,
"loss": 0.4701,
"step": 601
},
{
"epoch": 0.6427046263345195,
"grad_norm": 0.41694167256355286,
"learning_rate": 3.4053213282022983e-06,
"loss": 0.4716,
"step": 602
},
{
"epoch": 0.6437722419928825,
"grad_norm": 0.41177675127983093,
"learning_rate": 3.387651191207097e-06,
"loss": 0.4802,
"step": 603
},
{
"epoch": 0.6448398576512455,
"grad_norm": 0.4301503300666809,
"learning_rate": 3.370003499958703e-06,
"loss": 0.4546,
"step": 604
},
{
"epoch": 0.6459074733096085,
"grad_norm": 0.37474584579467773,
"learning_rate": 3.352378500133239e-06,
"loss": 0.477,
"step": 605
},
{
"epoch": 0.6469750889679715,
"grad_norm": 0.4323018491268158,
"learning_rate": 3.334776437090944e-06,
"loss": 0.4656,
"step": 606
},
{
"epoch": 0.6480427046263345,
"grad_norm": 0.41514450311660767,
"learning_rate": 3.317197555872745e-06,
"loss": 0.4198,
"step": 607
},
{
"epoch": 0.6491103202846975,
"grad_norm": 0.3888489007949829,
"learning_rate": 3.2996421011968546e-06,
"loss": 0.4535,
"step": 608
},
{
"epoch": 0.6501779359430605,
"grad_norm": 0.4327705204486847,
"learning_rate": 3.28211031745536e-06,
"loss": 0.4983,
"step": 609
},
{
"epoch": 0.6512455516014235,
"grad_norm": 0.4344913065433502,
"learning_rate": 3.264602448710822e-06,
"loss": 0.4947,
"step": 610
},
{
"epoch": 0.6523131672597865,
"grad_norm": 0.3991352915763855,
"learning_rate": 3.2471187386928766e-06,
"loss": 0.4805,
"step": 611
},
{
"epoch": 0.6533807829181495,
"grad_norm": 0.39237743616104126,
"learning_rate": 3.2296594307948428e-06,
"loss": 0.4891,
"step": 612
},
{
"epoch": 0.6544483985765125,
"grad_norm": 0.40971338748931885,
"learning_rate": 3.212224768070334e-06,
"loss": 0.453,
"step": 613
},
{
"epoch": 0.6555160142348755,
"grad_norm": 0.41694802045822144,
"learning_rate": 3.194814993229878e-06,
"loss": 0.4718,
"step": 614
},
{
"epoch": 0.6565836298932385,
"grad_norm": 0.429420530796051,
"learning_rate": 3.177430348637527e-06,
"loss": 0.4929,
"step": 615
},
{
"epoch": 0.6576512455516014,
"grad_norm": 0.39137008786201477,
"learning_rate": 3.1600710763074972e-06,
"loss": 0.4672,
"step": 616
},
{
"epoch": 0.6587188612099644,
"grad_norm": 0.40734052658081055,
"learning_rate": 3.142737417900793e-06,
"loss": 0.4999,
"step": 617
},
{
"epoch": 0.6597864768683274,
"grad_norm": 0.36672934889793396,
"learning_rate": 3.125429614721842e-06,
"loss": 0.466,
"step": 618
},
{
"epoch": 0.6608540925266904,
"grad_norm": 0.36196407675743103,
"learning_rate": 3.1081479077151387e-06,
"loss": 0.4425,
"step": 619
},
{
"epoch": 0.6619217081850534,
"grad_norm": 0.3950616717338562,
"learning_rate": 3.090892537461889e-06,
"loss": 0.4726,
"step": 620
},
{
"epoch": 0.6629893238434164,
"grad_norm": 0.42815542221069336,
"learning_rate": 3.0736637441766594e-06,
"loss": 0.4753,
"step": 621
},
{
"epoch": 0.6640569395017794,
"grad_norm": 0.3979141116142273,
"learning_rate": 3.056461767704037e-06,
"loss": 0.4799,
"step": 622
},
{
"epoch": 0.6651245551601424,
"grad_norm": 0.37764808535575867,
"learning_rate": 3.039286847515284e-06,
"loss": 0.4752,
"step": 623
},
{
"epoch": 0.6661921708185053,
"grad_norm": 0.38070034980773926,
"learning_rate": 3.0221392227050126e-06,
"loss": 0.4782,
"step": 624
},
{
"epoch": 0.6672597864768683,
"grad_norm": 0.40708160400390625,
"learning_rate": 3.00501913198785e-06,
"loss": 0.4533,
"step": 625
},
{
"epoch": 0.6683274021352313,
"grad_norm": 0.41723665595054626,
"learning_rate": 2.9879268136951163e-06,
"loss": 0.4827,
"step": 626
},
{
"epoch": 0.6693950177935943,
"grad_norm": 0.41290441155433655,
"learning_rate": 2.970862505771509e-06,
"loss": 0.4443,
"step": 627
},
{
"epoch": 0.6704626334519573,
"grad_norm": 0.4340071678161621,
"learning_rate": 2.953826445771788e-06,
"loss": 0.48,
"step": 628
},
{
"epoch": 0.6715302491103203,
"grad_norm": 0.35983264446258545,
"learning_rate": 2.9368188708574706e-06,
"loss": 0.4415,
"step": 629
},
{
"epoch": 0.6725978647686833,
"grad_norm": 0.3808664381504059,
"learning_rate": 2.9198400177935303e-06,
"loss": 0.4683,
"step": 630
},
{
"epoch": 0.6736654804270462,
"grad_norm": 0.3902174234390259,
"learning_rate": 2.902890122945096e-06,
"loss": 0.4984,
"step": 631
},
{
"epoch": 0.6747330960854092,
"grad_norm": 0.38310402631759644,
"learning_rate": 2.8859694222741653e-06,
"loss": 0.5024,
"step": 632
},
{
"epoch": 0.6758007117437722,
"grad_norm": 0.407287061214447,
"learning_rate": 2.869078151336323e-06,
"loss": 0.474,
"step": 633
},
{
"epoch": 0.6768683274021352,
"grad_norm": 0.38502153754234314,
"learning_rate": 2.852216545277456e-06,
"loss": 0.4786,
"step": 634
},
{
"epoch": 0.6779359430604982,
"grad_norm": 0.3770993649959564,
"learning_rate": 2.835384838830481e-06,
"loss": 0.4876,
"step": 635
},
{
"epoch": 0.6790035587188612,
"grad_norm": 0.3542179763317108,
"learning_rate": 2.8185832663120817e-06,
"loss": 0.4748,
"step": 636
},
{
"epoch": 0.6800711743772242,
"grad_norm": 0.3789761960506439,
"learning_rate": 2.8018120616194356e-06,
"loss": 0.4936,
"step": 637
},
{
"epoch": 0.6811387900355872,
"grad_norm": 0.3685765564441681,
"learning_rate": 2.785071458226972e-06,
"loss": 0.4749,
"step": 638
},
{
"epoch": 0.6822064056939502,
"grad_norm": 0.35032930970191956,
"learning_rate": 2.768361689183113e-06,
"loss": 0.4439,
"step": 639
},
{
"epoch": 0.6832740213523132,
"grad_norm": 0.3704805374145508,
"learning_rate": 2.7516829871070295e-06,
"loss": 0.4622,
"step": 640
},
{
"epoch": 0.6843416370106762,
"grad_norm": 0.3895471692085266,
"learning_rate": 2.735035584185409e-06,
"loss": 0.4826,
"step": 641
},
{
"epoch": 0.6854092526690392,
"grad_norm": 0.38154760003089905,
"learning_rate": 2.718419712169213e-06,
"loss": 0.4544,
"step": 642
},
{
"epoch": 0.6864768683274022,
"grad_norm": 0.3842725157737732,
"learning_rate": 2.7018356023704574e-06,
"loss": 0.4961,
"step": 643
},
{
"epoch": 0.6875444839857652,
"grad_norm": 0.3748033940792084,
"learning_rate": 2.685283485658995e-06,
"loss": 0.4642,
"step": 644
},
{
"epoch": 0.6886120996441281,
"grad_norm": 0.36127620935440063,
"learning_rate": 2.668763592459297e-06,
"loss": 0.4591,
"step": 645
},
{
"epoch": 0.6896797153024911,
"grad_norm": 0.3470078110694885,
"learning_rate": 2.6522761527472464e-06,
"loss": 0.4559,
"step": 646
},
{
"epoch": 0.6907473309608541,
"grad_norm": 0.38986238837242126,
"learning_rate": 2.6358213960469357e-06,
"loss": 0.4881,
"step": 647
},
{
"epoch": 0.691814946619217,
"grad_norm": 0.3686830699443817,
"learning_rate": 2.6193995514274705e-06,
"loss": 0.4754,
"step": 648
},
{
"epoch": 0.69288256227758,
"grad_norm": 0.36502450704574585,
"learning_rate": 2.6030108474997854e-06,
"loss": 0.4739,
"step": 649
},
{
"epoch": 0.693950177935943,
"grad_norm": 0.3817600905895233,
"learning_rate": 2.586655512413458e-06,
"loss": 0.4406,
"step": 650
},
{
"epoch": 0.695017793594306,
"grad_norm": 0.3689401149749756,
"learning_rate": 2.5703337738535324e-06,
"loss": 0.4461,
"step": 651
},
{
"epoch": 0.696085409252669,
"grad_norm": 0.39199331402778625,
"learning_rate": 2.554045859037353e-06,
"loss": 0.4631,
"step": 652
},
{
"epoch": 0.697153024911032,
"grad_norm": 0.35440245270729065,
"learning_rate": 2.5377919947113917e-06,
"loss": 0.4523,
"step": 653
},
{
"epoch": 0.698220640569395,
"grad_norm": 0.3502133786678314,
"learning_rate": 2.521572407148107e-06,
"loss": 0.4592,
"step": 654
},
{
"epoch": 0.699288256227758,
"grad_norm": 0.37463781237602234,
"learning_rate": 2.505387322142782e-06,
"loss": 0.4719,
"step": 655
},
{
"epoch": 0.700355871886121,
"grad_norm": 0.391875296831131,
"learning_rate": 2.4892369650103837e-06,
"loss": 0.4656,
"step": 656
},
{
"epoch": 0.701423487544484,
"grad_norm": 0.3908476233482361,
"learning_rate": 2.4731215605824304e-06,
"loss": 0.4962,
"step": 657
},
{
"epoch": 0.702491103202847,
"grad_norm": 0.3876582086086273,
"learning_rate": 2.4570413332038523e-06,
"loss": 0.4776,
"step": 658
},
{
"epoch": 0.70355871886121,
"grad_norm": 0.36631351709365845,
"learning_rate": 2.440996506729881e-06,
"loss": 0.4603,
"step": 659
},
{
"epoch": 0.7046263345195729,
"grad_norm": 0.40206146240234375,
"learning_rate": 2.4249873045229244e-06,
"loss": 0.456,
"step": 660
},
{
"epoch": 0.7056939501779359,
"grad_norm": 0.3983338475227356,
"learning_rate": 2.4090139494494596e-06,
"loss": 0.4664,
"step": 661
},
{
"epoch": 0.7067615658362989,
"grad_norm": 0.37411966919898987,
"learning_rate": 2.3930766638769325e-06,
"loss": 0.4738,
"step": 662
},
{
"epoch": 0.7078291814946619,
"grad_norm": 0.42127808928489685,
"learning_rate": 2.3771756696706553e-06,
"loss": 0.4782,
"step": 663
},
{
"epoch": 0.7088967971530249,
"grad_norm": 0.4144476354122162,
"learning_rate": 2.3613111881907273e-06,
"loss": 0.4737,
"step": 664
},
{
"epoch": 0.7099644128113879,
"grad_norm": 0.40858951210975647,
"learning_rate": 2.345483440288947e-06,
"loss": 0.4516,
"step": 665
},
{
"epoch": 0.7110320284697509,
"grad_norm": 0.3829437792301178,
"learning_rate": 2.3296926463057396e-06,
"loss": 0.4509,
"step": 666
},
{
"epoch": 0.7120996441281139,
"grad_norm": 0.37926656007766724,
"learning_rate": 2.313939026067091e-06,
"loss": 0.4628,
"step": 667
},
{
"epoch": 0.7131672597864769,
"grad_norm": 0.36293280124664307,
"learning_rate": 2.29822279888148e-06,
"loss": 0.4454,
"step": 668
},
{
"epoch": 0.7142348754448399,
"grad_norm": 0.40881264209747314,
"learning_rate": 2.2825441835368377e-06,
"loss": 0.4754,
"step": 669
},
{
"epoch": 0.7153024911032029,
"grad_norm": 0.3915267884731293,
"learning_rate": 2.2669033982974946e-06,
"loss": 0.4869,
"step": 670
},
{
"epoch": 0.7163701067615659,
"grad_norm": 0.34906652569770813,
"learning_rate": 2.2513006609011365e-06,
"loss": 0.4686,
"step": 671
},
{
"epoch": 0.7174377224199289,
"grad_norm": 0.4089764952659607,
"learning_rate": 2.235736188555787e-06,
"loss": 0.4766,
"step": 672
},
{
"epoch": 0.7185053380782919,
"grad_norm": 0.35783180594444275,
"learning_rate": 2.2202101979367735e-06,
"loss": 0.4816,
"step": 673
},
{
"epoch": 0.7195729537366548,
"grad_norm": 0.3813284635543823,
"learning_rate": 2.2047229051837107e-06,
"loss": 0.5012,
"step": 674
},
{
"epoch": 0.7206405693950177,
"grad_norm": 0.34306350350379944,
"learning_rate": 2.189274525897498e-06,
"loss": 0.5031,
"step": 675
},
{
"epoch": 0.7217081850533807,
"grad_norm": 0.3745080530643463,
"learning_rate": 2.173865275137314e-06,
"loss": 0.4705,
"step": 676
},
{
"epoch": 0.7227758007117437,
"grad_norm": 0.3763768672943115,
"learning_rate": 2.158495367417625e-06,
"loss": 0.4748,
"step": 677
},
{
"epoch": 0.7238434163701067,
"grad_norm": 0.3732641935348511,
"learning_rate": 2.143165016705192e-06,
"loss": 0.485,
"step": 678
},
{
"epoch": 0.7249110320284697,
"grad_norm": 0.41331830620765686,
"learning_rate": 2.1278744364161007e-06,
"loss": 0.5154,
"step": 679
},
{
"epoch": 0.7259786476868327,
"grad_norm": 0.3479762077331543,
"learning_rate": 2.1126238394127868e-06,
"loss": 0.4668,
"step": 680
},
{
"epoch": 0.7270462633451957,
"grad_norm": 0.3638448417186737,
"learning_rate": 2.0974134380010726e-06,
"loss": 0.479,
"step": 681
},
{
"epoch": 0.7281138790035587,
"grad_norm": 0.3845721185207367,
"learning_rate": 2.082243443927212e-06,
"loss": 0.4757,
"step": 682
},
{
"epoch": 0.7291814946619217,
"grad_norm": 0.3670172095298767,
"learning_rate": 2.0671140683749386e-06,
"loss": 0.4841,
"step": 683
},
{
"epoch": 0.7302491103202847,
"grad_norm": 0.4244895279407501,
"learning_rate": 2.052025521962534e-06,
"loss": 0.4964,
"step": 684
},
{
"epoch": 0.7313167259786477,
"grad_norm": 0.37311217188835144,
"learning_rate": 2.03697801473989e-06,
"loss": 0.4675,
"step": 685
},
{
"epoch": 0.7323843416370107,
"grad_norm": 0.40401390194892883,
"learning_rate": 2.0219717561855857e-06,
"loss": 0.4787,
"step": 686
},
{
"epoch": 0.7334519572953737,
"grad_norm": 0.4272782802581787,
"learning_rate": 2.0070069552039722e-06,
"loss": 0.4704,
"step": 687
},
{
"epoch": 0.7345195729537367,
"grad_norm": 0.41608813405036926,
"learning_rate": 1.992083820122259e-06,
"loss": 0.4982,
"step": 688
},
{
"epoch": 0.7355871886120996,
"grad_norm": 0.38170090317726135,
"learning_rate": 1.9772025586876252e-06,
"loss": 0.468,
"step": 689
},
{
"epoch": 0.7366548042704626,
"grad_norm": 0.40004512667655945,
"learning_rate": 1.962363378064316e-06,
"loss": 0.4606,
"step": 690
},
{
"epoch": 0.7377224199288256,
"grad_norm": 0.3620181679725647,
"learning_rate": 1.947566484830765e-06,
"loss": 0.4608,
"step": 691
},
{
"epoch": 0.7387900355871886,
"grad_norm": 0.378568559885025,
"learning_rate": 1.9328120849767198e-06,
"loss": 0.4974,
"step": 692
},
{
"epoch": 0.7398576512455516,
"grad_norm": 0.4036838412284851,
"learning_rate": 1.9181003839003627e-06,
"loss": 0.4859,
"step": 693
},
{
"epoch": 0.7409252669039146,
"grad_norm": 0.3742115795612335,
"learning_rate": 1.9034315864054682e-06,
"loss": 0.445,
"step": 694
},
{
"epoch": 0.7419928825622776,
"grad_norm": 0.45754826068878174,
"learning_rate": 1.8888058966985407e-06,
"loss": 0.4882,
"step": 695
},
{
"epoch": 0.7430604982206406,
"grad_norm": 0.3731890320777893,
"learning_rate": 1.8742235183859747e-06,
"loss": 0.4656,
"step": 696
},
{
"epoch": 0.7441281138790036,
"grad_norm": 0.35599714517593384,
"learning_rate": 1.8596846544712233e-06,
"loss": 0.4508,
"step": 697
},
{
"epoch": 0.7451957295373666,
"grad_norm": 0.3616451621055603,
"learning_rate": 1.8451895073519643e-06,
"loss": 0.4636,
"step": 698
},
{
"epoch": 0.7462633451957296,
"grad_norm": 0.36844977736473083,
"learning_rate": 1.8307382788172877e-06,
"loss": 0.4858,
"step": 699
},
{
"epoch": 0.7473309608540926,
"grad_norm": 0.37101319432258606,
"learning_rate": 1.8163311700448899e-06,
"loss": 0.4542,
"step": 700
},
{
"epoch": 0.7483985765124556,
"grad_norm": 0.34689757227897644,
"learning_rate": 1.8019683815982691e-06,
"loss": 0.4336,
"step": 701
},
{
"epoch": 0.7494661921708186,
"grad_norm": 0.36886388063430786,
"learning_rate": 1.7876501134239316e-06,
"loss": 0.4688,
"step": 702
},
{
"epoch": 0.7505338078291814,
"grad_norm": 0.40008699893951416,
"learning_rate": 1.7733765648486134e-06,
"loss": 0.4842,
"step": 703
},
{
"epoch": 0.7516014234875444,
"grad_norm": 0.3825279772281647,
"learning_rate": 1.7591479345764972e-06,
"loss": 0.4843,
"step": 704
},
{
"epoch": 0.7526690391459074,
"grad_norm": 0.33588531613349915,
"learning_rate": 1.7449644206864564e-06,
"loss": 0.4673,
"step": 705
},
{
"epoch": 0.7537366548042704,
"grad_norm": 0.3410935699939728,
"learning_rate": 1.7308262206292898e-06,
"loss": 0.4593,
"step": 706
},
{
"epoch": 0.7548042704626334,
"grad_norm": 0.3688999116420746,
"learning_rate": 1.7167335312249766e-06,
"loss": 0.4669,
"step": 707
},
{
"epoch": 0.7558718861209964,
"grad_norm": 0.3429146409034729,
"learning_rate": 1.7026865486599375e-06,
"loss": 0.4686,
"step": 708
},
{
"epoch": 0.7569395017793594,
"grad_norm": 0.3735763728618622,
"learning_rate": 1.6886854684842962e-06,
"loss": 0.4414,
"step": 709
},
{
"epoch": 0.7580071174377224,
"grad_norm": 0.3942524790763855,
"learning_rate": 1.6747304856091662e-06,
"loss": 0.4921,
"step": 710
},
{
"epoch": 0.7590747330960854,
"grad_norm": 0.3756312131881714,
"learning_rate": 1.660821794303934e-06,
"loss": 0.4729,
"step": 711
},
{
"epoch": 0.7601423487544484,
"grad_norm": 0.3681127727031708,
"learning_rate": 1.6469595881935523e-06,
"loss": 0.4657,
"step": 712
},
{
"epoch": 0.7612099644128114,
"grad_norm": 0.3375697135925293,
"learning_rate": 1.6331440602558501e-06,
"loss": 0.46,
"step": 713
},
{
"epoch": 0.7622775800711744,
"grad_norm": 0.3568233549594879,
"learning_rate": 1.6193754028188363e-06,
"loss": 0.4758,
"step": 714
},
{
"epoch": 0.7633451957295374,
"grad_norm": 0.3790285587310791,
"learning_rate": 1.6056538075580342e-06,
"loss": 0.4669,
"step": 715
},
{
"epoch": 0.7644128113879004,
"grad_norm": 0.3637920022010803,
"learning_rate": 1.591979465493806e-06,
"loss": 0.4688,
"step": 716
},
{
"epoch": 0.7654804270462633,
"grad_norm": 0.36419907212257385,
"learning_rate": 1.5783525669886934e-06,
"loss": 0.4705,
"step": 717
},
{
"epoch": 0.7665480427046263,
"grad_norm": 0.40734171867370605,
"learning_rate": 1.5647733017447741e-06,
"loss": 0.4984,
"step": 718
},
{
"epoch": 0.7676156583629893,
"grad_norm": 0.3663610816001892,
"learning_rate": 1.5512418588010086e-06,
"loss": 0.4833,
"step": 719
},
{
"epoch": 0.7686832740213523,
"grad_norm": 0.3341020345687866,
"learning_rate": 1.5377584265306222e-06,
"loss": 0.4512,
"step": 720
},
{
"epoch": 0.7697508896797153,
"grad_norm": 0.3661962151527405,
"learning_rate": 1.5243231926384744e-06,
"loss": 0.4722,
"step": 721
},
{
"epoch": 0.7708185053380783,
"grad_norm": 0.36284494400024414,
"learning_rate": 1.510936344158448e-06,
"loss": 0.4475,
"step": 722
},
{
"epoch": 0.7718861209964413,
"grad_norm": 0.3552328646183014,
"learning_rate": 1.4975980674508472e-06,
"loss": 0.4568,
"step": 723
},
{
"epoch": 0.7729537366548043,
"grad_norm": 0.3626512885093689,
"learning_rate": 1.484308548199796e-06,
"loss": 0.4832,
"step": 724
},
{
"epoch": 0.7740213523131673,
"grad_norm": 0.35424965620040894,
"learning_rate": 1.4710679714106635e-06,
"loss": 0.4741,
"step": 725
},
{
"epoch": 0.7750889679715303,
"grad_norm": 0.3459206223487854,
"learning_rate": 1.4578765214074842e-06,
"loss": 0.4401,
"step": 726
},
{
"epoch": 0.7761565836298933,
"grad_norm": 0.38151949644088745,
"learning_rate": 1.444734381830386e-06,
"loss": 0.459,
"step": 727
},
{
"epoch": 0.7772241992882563,
"grad_norm": 0.3755812346935272,
"learning_rate": 1.4316417356330441e-06,
"loss": 0.4612,
"step": 728
},
{
"epoch": 0.7782918149466193,
"grad_norm": 0.3835029900074005,
"learning_rate": 1.4185987650801286e-06,
"loss": 0.4556,
"step": 729
},
{
"epoch": 0.7793594306049823,
"grad_norm": 0.3661644756793976,
"learning_rate": 1.4056056517447637e-06,
"loss": 0.491,
"step": 730
},
{
"epoch": 0.7804270462633452,
"grad_norm": 0.3670632541179657,
"learning_rate": 1.392662576506007e-06,
"loss": 0.4821,
"step": 731
},
{
"epoch": 0.7814946619217081,
"grad_norm": 0.3170434832572937,
"learning_rate": 1.3797697195463278e-06,
"loss": 0.4571,
"step": 732
},
{
"epoch": 0.7825622775800711,
"grad_norm": 0.37059327960014343,
"learning_rate": 1.3669272603491002e-06,
"loss": 0.472,
"step": 733
},
{
"epoch": 0.7836298932384341,
"grad_norm": 0.3722604513168335,
"learning_rate": 1.3541353776961035e-06,
"loss": 0.4716,
"step": 734
},
{
"epoch": 0.7846975088967971,
"grad_norm": 0.3722414970397949,
"learning_rate": 1.3413942496650301e-06,
"loss": 0.4824,
"step": 735
},
{
"epoch": 0.7857651245551601,
"grad_norm": 0.3409653306007385,
"learning_rate": 1.3287040536270135e-06,
"loss": 0.4605,
"step": 736
},
{
"epoch": 0.7868327402135231,
"grad_norm": 0.3402983546257019,
"learning_rate": 1.3160649662441532e-06,
"loss": 0.4756,
"step": 737
},
{
"epoch": 0.7879003558718861,
"grad_norm": 0.34389257431030273,
"learning_rate": 1.30347716346706e-06,
"loss": 0.491,
"step": 738
},
{
"epoch": 0.7889679715302491,
"grad_norm": 0.35902342200279236,
"learning_rate": 1.290940820532403e-06,
"loss": 0.4962,
"step": 739
},
{
"epoch": 0.7900355871886121,
"grad_norm": 0.3977390229701996,
"learning_rate": 1.2784561119604683e-06,
"loss": 0.4772,
"step": 740
},
{
"epoch": 0.7911032028469751,
"grad_norm": 0.3474990427494049,
"learning_rate": 1.266023211552736e-06,
"loss": 0.4722,
"step": 741
},
{
"epoch": 0.7921708185053381,
"grad_norm": 0.3343373239040375,
"learning_rate": 1.2536422923894565e-06,
"loss": 0.4693,
"step": 742
},
{
"epoch": 0.7932384341637011,
"grad_norm": 0.3417350649833679,
"learning_rate": 1.2413135268272403e-06,
"loss": 0.4557,
"step": 743
},
{
"epoch": 0.7943060498220641,
"grad_norm": 0.35946568846702576,
"learning_rate": 1.2290370864966623e-06,
"loss": 0.4719,
"step": 744
},
{
"epoch": 0.7953736654804271,
"grad_norm": 0.3475436866283417,
"learning_rate": 1.2168131422998653e-06,
"loss": 0.4822,
"step": 745
},
{
"epoch": 0.79644128113879,
"grad_norm": 0.34873461723327637,
"learning_rate": 1.2046418644081904e-06,
"loss": 0.469,
"step": 746
},
{
"epoch": 0.797508896797153,
"grad_norm": 0.37052375078201294,
"learning_rate": 1.192523422259802e-06,
"loss": 0.4926,
"step": 747
},
{
"epoch": 0.798576512455516,
"grad_norm": 0.40255382657051086,
"learning_rate": 1.1804579845573288e-06,
"loss": 0.4759,
"step": 748
},
{
"epoch": 0.799644128113879,
"grad_norm": 0.35330265760421753,
"learning_rate": 1.1684457192655207e-06,
"loss": 0.4904,
"step": 749
},
{
"epoch": 0.800711743772242,
"grad_norm": 0.34803614020347595,
"learning_rate": 1.156486793608899e-06,
"loss": 0.4786,
"step": 750
},
{
"epoch": 0.801779359430605,
"grad_norm": 0.3456575572490692,
"learning_rate": 1.144581374069444e-06,
"loss": 0.4493,
"step": 751
},
{
"epoch": 0.802846975088968,
"grad_norm": 0.3717256188392639,
"learning_rate": 1.1327296263842653e-06,
"loss": 0.4414,
"step": 752
},
{
"epoch": 0.803914590747331,
"grad_norm": 0.35381019115448,
"learning_rate": 1.120931715543299e-06,
"loss": 0.4235,
"step": 753
},
{
"epoch": 0.804982206405694,
"grad_norm": 0.37933510541915894,
"learning_rate": 1.1091878057870137e-06,
"loss": 0.4721,
"step": 754
},
{
"epoch": 0.806049822064057,
"grad_norm": 0.3646122217178345,
"learning_rate": 1.0974980606041152e-06,
"loss": 0.4799,
"step": 755
},
{
"epoch": 0.80711743772242,
"grad_norm": 0.34057337045669556,
"learning_rate": 1.0858626427292796e-06,
"loss": 0.4549,
"step": 756
},
{
"epoch": 0.808185053380783,
"grad_norm": 0.35293883085250854,
"learning_rate": 1.074281714140884e-06,
"loss": 0.4939,
"step": 757
},
{
"epoch": 0.8092526690391459,
"grad_norm": 0.35031718015670776,
"learning_rate": 1.0627554360587533e-06,
"loss": 0.4707,
"step": 758
},
{
"epoch": 0.8103202846975089,
"grad_norm": 0.36572709679603577,
"learning_rate": 1.0512839689419124e-06,
"loss": 0.473,
"step": 759
},
{
"epoch": 0.8113879003558719,
"grad_norm": 0.3748714327812195,
"learning_rate": 1.0398674724863584e-06,
"loss": 0.4637,
"step": 760
},
{
"epoch": 0.8124555160142348,
"grad_norm": 0.35171017050743103,
"learning_rate": 1.0285061056228273e-06,
"loss": 0.4651,
"step": 761
},
{
"epoch": 0.8135231316725978,
"grad_norm": 0.3429271876811981,
"learning_rate": 1.0172000265145938e-06,
"loss": 0.484,
"step": 762
},
{
"epoch": 0.8145907473309608,
"grad_norm": 0.33533966541290283,
"learning_rate": 1.0059493925552604e-06,
"loss": 0.4695,
"step": 763
},
{
"epoch": 0.8156583629893238,
"grad_norm": 0.40435880422592163,
"learning_rate": 9.947543603665711e-07,
"loss": 0.4899,
"step": 764
},
{
"epoch": 0.8167259786476868,
"grad_norm": 0.35124266147613525,
"learning_rate": 9.836150857962296e-07,
"loss": 0.4515,
"step": 765
},
{
"epoch": 0.8177935943060498,
"grad_norm": 0.3399527370929718,
"learning_rate": 9.72531723915726e-07,
"loss": 0.4715,
"step": 766
},
{
"epoch": 0.8188612099644128,
"grad_norm": 0.35184887051582336,
"learning_rate": 9.615044290181863e-07,
"loss": 0.4596,
"step": 767
},
{
"epoch": 0.8199288256227758,
"grad_norm": 0.39761510491371155,
"learning_rate": 9.505333546162171e-07,
"loss": 0.4723,
"step": 768
},
{
"epoch": 0.8209964412811388,
"grad_norm": 0.39027488231658936,
"learning_rate": 9.396186534397711e-07,
"loss": 0.4922,
"step": 769
},
{
"epoch": 0.8220640569395018,
"grad_norm": 0.3433700203895569,
"learning_rate": 9.287604774340236e-07,
"loss": 0.5038,
"step": 770
},
{
"epoch": 0.8231316725978648,
"grad_norm": 0.37167125940322876,
"learning_rate": 9.179589777572496e-07,
"loss": 0.4837,
"step": 771
},
{
"epoch": 0.8241992882562278,
"grad_norm": 0.3502262532711029,
"learning_rate": 9.07214304778729e-07,
"loss": 0.4589,
"step": 772
},
{
"epoch": 0.8252669039145908,
"grad_norm": 0.3304504454135895,
"learning_rate": 8.965266080766471e-07,
"loss": 0.4718,
"step": 773
},
{
"epoch": 0.8263345195729538,
"grad_norm": 0.3667429983615875,
"learning_rate": 8.858960364360142e-07,
"loss": 0.4946,
"step": 774
},
{
"epoch": 0.8274021352313167,
"grad_norm": 0.38245144486427307,
"learning_rate": 8.753227378465956e-07,
"loss": 0.4551,
"step": 775
},
{
"epoch": 0.8284697508896797,
"grad_norm": 0.3940775394439697,
"learning_rate": 8.648068595008458e-07,
"loss": 0.4492,
"step": 776
},
{
"epoch": 0.8295373665480427,
"grad_norm": 0.3834594488143921,
"learning_rate": 8.543485477918672e-07,
"loss": 0.4642,
"step": 777
},
{
"epoch": 0.8306049822064057,
"grad_norm": 0.35629889369010925,
"learning_rate": 8.439479483113683e-07,
"loss": 0.4479,
"step": 778
},
{
"epoch": 0.8316725978647687,
"grad_norm": 0.38858264684677124,
"learning_rate": 8.336052058476374e-07,
"loss": 0.4774,
"step": 779
},
{
"epoch": 0.8327402135231317,
"grad_norm": 0.3532935380935669,
"learning_rate": 8.233204643835235e-07,
"loss": 0.4941,
"step": 780
},
{
"epoch": 0.8338078291814947,
"grad_norm": 0.34240975975990295,
"learning_rate": 8.130938670944377e-07,
"loss": 0.4695,
"step": 781
},
{
"epoch": 0.8348754448398576,
"grad_norm": 0.3459632396697998,
"learning_rate": 8.029255563463589e-07,
"loss": 0.4913,
"step": 782
},
{
"epoch": 0.8359430604982206,
"grad_norm": 0.3634418547153473,
"learning_rate": 7.928156736938458e-07,
"loss": 0.4515,
"step": 783
},
{
"epoch": 0.8370106761565836,
"grad_norm": 0.3613983690738678,
"learning_rate": 7.827643598780748e-07,
"loss": 0.4654,
"step": 784
},
{
"epoch": 0.8380782918149466,
"grad_norm": 0.35897570848464966,
"learning_rate": 7.72771754824877e-07,
"loss": 0.4687,
"step": 785
},
{
"epoch": 0.8391459074733096,
"grad_norm": 0.3587648570537567,
"learning_rate": 7.628379976427868e-07,
"loss": 0.4862,
"step": 786
},
{
"epoch": 0.8402135231316726,
"grad_norm": 0.3191976249217987,
"learning_rate": 7.529632266211112e-07,
"loss": 0.4501,
"step": 787
},
{
"epoch": 0.8412811387900356,
"grad_norm": 0.36679914593696594,
"learning_rate": 7.431475792280018e-07,
"loss": 0.4668,
"step": 788
},
{
"epoch": 0.8423487544483986,
"grad_norm": 0.34455785155296326,
"learning_rate": 7.333911921085418e-07,
"loss": 0.4621,
"step": 789
},
{
"epoch": 0.8434163701067615,
"grad_norm": 0.326860249042511,
"learning_rate": 7.23694201082843e-07,
"loss": 0.4336,
"step": 790
},
{
"epoch": 0.8444839857651245,
"grad_norm": 0.34012243151664734,
"learning_rate": 7.140567411441529e-07,
"loss": 0.4742,
"step": 791
},
{
"epoch": 0.8455516014234875,
"grad_norm": 0.3265022933483124,
"learning_rate": 7.044789464569817e-07,
"loss": 0.4561,
"step": 792
},
{
"epoch": 0.8466192170818505,
"grad_norm": 0.36320456862449646,
"learning_rate": 6.94960950355229e-07,
"loss": 0.4812,
"step": 793
},
{
"epoch": 0.8476868327402135,
"grad_norm": 0.3391510546207428,
"learning_rate": 6.855028853403295e-07,
"loss": 0.4705,
"step": 794
},
{
"epoch": 0.8487544483985765,
"grad_norm": 0.34259894490242004,
"learning_rate": 6.761048830794098e-07,
"loss": 0.4889,
"step": 795
},
{
"epoch": 0.8498220640569395,
"grad_norm": 0.37212345004081726,
"learning_rate": 6.667670744034498e-07,
"loss": 0.4869,
"step": 796
},
{
"epoch": 0.8508896797153025,
"grad_norm": 0.34556707739830017,
"learning_rate": 6.574895893054711e-07,
"loss": 0.507,
"step": 797
},
{
"epoch": 0.8519572953736655,
"grad_norm": 0.3274592459201813,
"learning_rate": 6.482725569387171e-07,
"loss": 0.4678,
"step": 798
},
{
"epoch": 0.8530249110320285,
"grad_norm": 0.3611302971839905,
"learning_rate": 6.391161056148637e-07,
"loss": 0.4792,
"step": 799
},
{
"epoch": 0.8540925266903915,
"grad_norm": 0.36246782541275024,
"learning_rate": 6.300203628022272e-07,
"loss": 0.4867,
"step": 800
},
{
"epoch": 0.8551601423487545,
"grad_norm": 0.37306517362594604,
"learning_rate": 6.209854551239902e-07,
"loss": 0.4795,
"step": 801
},
{
"epoch": 0.8562277580071175,
"grad_norm": 0.3382475972175598,
"learning_rate": 6.120115083564432e-07,
"loss": 0.437,
"step": 802
},
{
"epoch": 0.8572953736654805,
"grad_norm": 0.33135363459587097,
"learning_rate": 6.030986474272288e-07,
"loss": 0.4715,
"step": 803
},
{
"epoch": 0.8583629893238434,
"grad_norm": 0.3751276135444641,
"learning_rate": 5.942469964136055e-07,
"loss": 0.4808,
"step": 804
},
{
"epoch": 0.8594306049822064,
"grad_norm": 0.3444526493549347,
"learning_rate": 5.854566785407212e-07,
"loss": 0.4636,
"step": 805
},
{
"epoch": 0.8604982206405694,
"grad_norm": 0.3411964178085327,
"learning_rate": 5.767278161798912e-07,
"loss": 0.4396,
"step": 806
},
{
"epoch": 0.8615658362989324,
"grad_norm": 0.33897465467453003,
"learning_rate": 5.680605308469045e-07,
"loss": 0.4747,
"step": 807
},
{
"epoch": 0.8626334519572953,
"grad_norm": 0.3335667550563812,
"learning_rate": 5.594549432003244e-07,
"loss": 0.4449,
"step": 808
},
{
"epoch": 0.8637010676156583,
"grad_norm": 0.35737693309783936,
"learning_rate": 5.509111730398125e-07,
"loss": 0.4713,
"step": 809
},
{
"epoch": 0.8647686832740213,
"grad_norm": 0.3388114869594574,
"learning_rate": 5.42429339304461e-07,
"loss": 0.4657,
"step": 810
},
{
"epoch": 0.8658362989323843,
"grad_norm": 0.36129823327064514,
"learning_rate": 5.340095600711343e-07,
"loss": 0.4817,
"step": 811
},
{
"epoch": 0.8669039145907473,
"grad_norm": 0.32177579402923584,
"learning_rate": 5.256519525528254e-07,
"loss": 0.486,
"step": 812
},
{
"epoch": 0.8679715302491103,
"grad_norm": 0.351857453584671,
"learning_rate": 5.173566330970286e-07,
"loss": 0.4574,
"step": 813
},
{
"epoch": 0.8690391459074733,
"grad_norm": 0.34854841232299805,
"learning_rate": 5.091237171841173e-07,
"loss": 0.4603,
"step": 814
},
{
"epoch": 0.8701067615658363,
"grad_norm": 0.36344021558761597,
"learning_rate": 5.009533194257332e-07,
"loss": 0.4655,
"step": 815
},
{
"epoch": 0.8711743772241993,
"grad_norm": 0.3396829068660736,
"learning_rate": 4.92845553563196e-07,
"loss": 0.4776,
"step": 816
},
{
"epoch": 0.8722419928825623,
"grad_norm": 0.35948312282562256,
"learning_rate": 4.848005324659144e-07,
"loss": 0.4879,
"step": 817
},
{
"epoch": 0.8733096085409253,
"grad_norm": 0.3061416447162628,
"learning_rate": 4.768183681298211e-07,
"loss": 0.4335,
"step": 818
},
{
"epoch": 0.8743772241992882,
"grad_norm": 0.35700371861457825,
"learning_rate": 4.6889917167580903e-07,
"loss": 0.4757,
"step": 819
},
{
"epoch": 0.8754448398576512,
"grad_norm": 0.3622047007083893,
"learning_rate": 4.6104305334818577e-07,
"loss": 0.4792,
"step": 820
},
{
"epoch": 0.8765124555160142,
"grad_norm": 0.3496834337711334,
"learning_rate": 4.532501225131408e-07,
"loss": 0.4821,
"step": 821
},
{
"epoch": 0.8775800711743772,
"grad_norm": 0.3690001666545868,
"learning_rate": 4.455204876572172e-07,
"loss": 0.4789,
"step": 822
},
{
"epoch": 0.8786476868327402,
"grad_norm": 0.320921391248703,
"learning_rate": 4.3785425638580847e-07,
"loss": 0.4792,
"step": 823
},
{
"epoch": 0.8797153024911032,
"grad_norm": 0.320486843585968,
"learning_rate": 4.3025153542165744e-07,
"loss": 0.4707,
"step": 824
},
{
"epoch": 0.8807829181494662,
"grad_norm": 0.34054213762283325,
"learning_rate": 4.2271243060336976e-07,
"loss": 0.466,
"step": 825
},
{
"epoch": 0.8818505338078292,
"grad_norm": 0.33979448676109314,
"learning_rate": 4.1523704688394176e-07,
"loss": 0.4562,
"step": 826
},
{
"epoch": 0.8829181494661922,
"grad_norm": 0.3499307632446289,
"learning_rate": 4.0782548832929646e-07,
"loss": 0.4653,
"step": 827
},
{
"epoch": 0.8839857651245552,
"grad_norm": 0.3363668620586395,
"learning_rate": 4.0047785811684116e-07,
"loss": 0.451,
"step": 828
},
{
"epoch": 0.8850533807829182,
"grad_norm": 0.3545955717563629,
"learning_rate": 3.931942585340243e-07,
"loss": 0.4769,
"step": 829
},
{
"epoch": 0.8861209964412812,
"grad_norm": 0.38059499859809875,
"learning_rate": 3.8597479097691626e-07,
"loss": 0.4946,
"step": 830
},
{
"epoch": 0.8871886120996442,
"grad_norm": 0.36763712763786316,
"learning_rate": 3.788195559487956e-07,
"loss": 0.4554,
"step": 831
},
{
"epoch": 0.8882562277580072,
"grad_norm": 0.3406812250614166,
"learning_rate": 3.717286530587483e-07,
"loss": 0.4469,
"step": 832
},
{
"epoch": 0.88932384341637,
"grad_norm": 0.3243533670902252,
"learning_rate": 3.6470218102028607e-07,
"loss": 0.4283,
"step": 833
},
{
"epoch": 0.890391459074733,
"grad_norm": 0.34269580245018005,
"learning_rate": 3.577402376499672e-07,
"loss": 0.4439,
"step": 834
},
{
"epoch": 0.891459074733096,
"grad_norm": 0.3207905888557434,
"learning_rate": 3.508429198660379e-07,
"loss": 0.4562,
"step": 835
},
{
"epoch": 0.892526690391459,
"grad_norm": 0.34439972043037415,
"learning_rate": 3.440103236870823e-07,
"loss": 0.4705,
"step": 836
},
{
"epoch": 0.893594306049822,
"grad_norm": 0.34473180770874023,
"learning_rate": 3.372425442306837e-07,
"loss": 0.4747,
"step": 837
},
{
"epoch": 0.894661921708185,
"grad_norm": 0.3256348669528961,
"learning_rate": 3.3053967571210375e-07,
"loss": 0.4461,
"step": 838
},
{
"epoch": 0.895729537366548,
"grad_norm": 0.3985244929790497,
"learning_rate": 3.2390181144296815e-07,
"loss": 0.5082,
"step": 839
},
{
"epoch": 0.896797153024911,
"grad_norm": 0.3613927364349365,
"learning_rate": 3.1732904382996975e-07,
"loss": 0.4701,
"step": 840
},
{
"epoch": 0.897864768683274,
"grad_norm": 0.34537020325660706,
"learning_rate": 3.108214643735813e-07,
"loss": 0.4683,
"step": 841
},
{
"epoch": 0.898932384341637,
"grad_norm": 0.3321053385734558,
"learning_rate": 3.04379163666782e-07,
"loss": 0.4609,
"step": 842
},
{
"epoch": 0.9,
"grad_norm": 0.3575867712497711,
"learning_rate": 2.98002231393793e-07,
"loss": 0.4571,
"step": 843
},
{
"epoch": 0.901067615658363,
"grad_norm": 0.34244054555892944,
"learning_rate": 2.916907563288357e-07,
"loss": 0.4798,
"step": 844
},
{
"epoch": 0.902135231316726,
"grad_norm": 0.3529646694660187,
"learning_rate": 2.854448263348891e-07,
"loss": 0.4861,
"step": 845
},
{
"epoch": 0.903202846975089,
"grad_norm": 0.3687219023704529,
"learning_rate": 2.792645283624712e-07,
"loss": 0.4688,
"step": 846
},
{
"epoch": 0.904270462633452,
"grad_norm": 0.3753871023654938,
"learning_rate": 2.7314994844842623e-07,
"loss": 0.4744,
"step": 847
},
{
"epoch": 0.9053380782918149,
"grad_norm": 0.31958386301994324,
"learning_rate": 2.671011717147276e-07,
"loss": 0.4667,
"step": 848
},
{
"epoch": 0.9064056939501779,
"grad_norm": 0.34946852922439575,
"learning_rate": 2.611182823672931e-07,
"loss": 0.4763,
"step": 849
},
{
"epoch": 0.9074733096085409,
"grad_norm": 0.32950615882873535,
"learning_rate": 2.5520136369481194e-07,
"loss": 0.4814,
"step": 850
},
{
"epoch": 0.9085409252669039,
"grad_norm": 0.35894763469696045,
"learning_rate": 2.493504980675865e-07,
"loss": 0.4643,
"step": 851
},
{
"epoch": 0.9096085409252669,
"grad_norm": 0.3291400671005249,
"learning_rate": 2.4356576693638555e-07,
"loss": 0.4618,
"step": 852
},
{
"epoch": 0.9106761565836299,
"grad_norm": 0.37263375520706177,
"learning_rate": 2.3784725083130678e-07,
"loss": 0.4765,
"step": 853
},
{
"epoch": 0.9117437722419929,
"grad_norm": 0.346204936504364,
"learning_rate": 2.3219502936066228e-07,
"loss": 0.4684,
"step": 854
},
{
"epoch": 0.9128113879003559,
"grad_norm": 0.3411816656589508,
"learning_rate": 2.266091812098642e-07,
"loss": 0.4665,
"step": 855
},
{
"epoch": 0.9138790035587189,
"grad_norm": 0.39055925607681274,
"learning_rate": 2.210897841403331e-07,
"loss": 0.4752,
"step": 856
},
{
"epoch": 0.9149466192170819,
"grad_norm": 0.3565220236778259,
"learning_rate": 2.1563691498841465e-07,
"loss": 0.4269,
"step": 857
},
{
"epoch": 0.9160142348754449,
"grad_norm": 0.3413132131099701,
"learning_rate": 2.1025064966430697e-07,
"loss": 0.4569,
"step": 858
},
{
"epoch": 0.9170818505338079,
"grad_norm": 0.3309057354927063,
"learning_rate": 2.0493106315100987e-07,
"loss": 0.458,
"step": 859
},
{
"epoch": 0.9181494661921709,
"grad_norm": 0.34652769565582275,
"learning_rate": 1.9967822950327453e-07,
"loss": 0.4765,
"step": 860
},
{
"epoch": 0.9192170818505339,
"grad_norm": 0.34112629294395447,
"learning_rate": 1.944922218465778e-07,
"loss": 0.4573,
"step": 861
},
{
"epoch": 0.9202846975088967,
"grad_norm": 0.35271042585372925,
"learning_rate": 1.8937311237610168e-07,
"loss": 0.4485,
"step": 862
},
{
"epoch": 0.9213523131672597,
"grad_norm": 0.3256247639656067,
"learning_rate": 1.8432097235572655e-07,
"loss": 0.4762,
"step": 863
},
{
"epoch": 0.9224199288256227,
"grad_norm": 0.33354514837265015,
"learning_rate": 1.793358721170435e-07,
"loss": 0.4764,
"step": 864
},
{
"epoch": 0.9234875444839857,
"grad_norm": 0.36107322573661804,
"learning_rate": 1.7441788105837133e-07,
"loss": 0.4675,
"step": 865
},
{
"epoch": 0.9245551601423487,
"grad_norm": 0.36529216170310974,
"learning_rate": 1.6956706764379438e-07,
"loss": 0.4663,
"step": 866
},
{
"epoch": 0.9256227758007117,
"grad_norm": 0.387478232383728,
"learning_rate": 1.6478349940220294e-07,
"loss": 0.4626,
"step": 867
},
{
"epoch": 0.9266903914590747,
"grad_norm": 0.33364489674568176,
"learning_rate": 1.6006724292636166e-07,
"loss": 0.4636,
"step": 868
},
{
"epoch": 0.9277580071174377,
"grad_norm": 0.3466110825538635,
"learning_rate": 1.5541836387197528e-07,
"loss": 0.4599,
"step": 869
},
{
"epoch": 0.9288256227758007,
"grad_norm": 0.3230234384536743,
"learning_rate": 1.508369269567783e-07,
"loss": 0.4628,
"step": 870
},
{
"epoch": 0.9298932384341637,
"grad_norm": 0.34074848890304565,
"learning_rate": 1.4632299595963294e-07,
"loss": 0.5069,
"step": 871
},
{
"epoch": 0.9309608540925267,
"grad_norm": 0.3386795222759247,
"learning_rate": 1.418766337196431e-07,
"loss": 0.4608,
"step": 872
},
{
"epoch": 0.9320284697508897,
"grad_norm": 0.35128504037857056,
"learning_rate": 1.374979021352757e-07,
"loss": 0.4636,
"step": 873
},
{
"epoch": 0.9330960854092527,
"grad_norm": 0.3676503598690033,
"learning_rate": 1.3318686216350241e-07,
"loss": 0.5208,
"step": 874
},
{
"epoch": 0.9341637010676157,
"grad_norm": 0.3126872479915619,
"learning_rate": 1.2894357381894984e-07,
"loss": 0.4391,
"step": 875
},
{
"epoch": 0.9352313167259787,
"grad_norm": 0.32956287264823914,
"learning_rate": 1.2476809617306408e-07,
"loss": 0.4585,
"step": 876
},
{
"epoch": 0.9362989323843416,
"grad_norm": 0.3517782688140869,
"learning_rate": 1.206604873532885e-07,
"loss": 0.4854,
"step": 877
},
{
"epoch": 0.9373665480427046,
"grad_norm": 0.37964025139808655,
"learning_rate": 1.166208045422551e-07,
"loss": 0.4637,
"step": 878
},
{
"epoch": 0.9384341637010676,
"grad_norm": 0.3676295876502991,
"learning_rate": 1.1264910397698614e-07,
"loss": 0.4779,
"step": 879
},
{
"epoch": 0.9395017793594306,
"grad_norm": 0.3368426561355591,
"learning_rate": 1.0874544094811424e-07,
"loss": 0.4594,
"step": 880
},
{
"epoch": 0.9405693950177936,
"grad_norm": 0.35099488496780396,
"learning_rate": 1.0490986979911189e-07,
"loss": 0.4465,
"step": 881
},
{
"epoch": 0.9416370106761566,
"grad_norm": 0.3372381925582886,
"learning_rate": 1.0114244392553318e-07,
"loss": 0.4676,
"step": 882
},
{
"epoch": 0.9427046263345196,
"grad_norm": 0.33478647470474243,
"learning_rate": 9.744321577427218e-08,
"loss": 0.4646,
"step": 883
},
{
"epoch": 0.9437722419928826,
"grad_norm": 0.359651654958725,
"learning_rate": 9.381223684283291e-08,
"loss": 0.4623,
"step": 884
},
{
"epoch": 0.9448398576512456,
"grad_norm": 0.3485049605369568,
"learning_rate": 9.024955767861054e-08,
"loss": 0.4644,
"step": 885
},
{
"epoch": 0.9459074733096086,
"grad_norm": 0.3376001715660095,
"learning_rate": 8.675522787819023e-08,
"loss": 0.4353,
"step": 886
},
{
"epoch": 0.9469750889679716,
"grad_norm": 0.3420683741569519,
"learning_rate": 8.332929608665553e-08,
"loss": 0.4268,
"step": 887
},
{
"epoch": 0.9480427046263346,
"grad_norm": 0.33343154191970825,
"learning_rate": 7.997180999691101e-08,
"loss": 0.4584,
"step": 888
},
{
"epoch": 0.9491103202846976,
"grad_norm": 0.32583436369895935,
"learning_rate": 7.668281634901686e-08,
"loss": 0.4625,
"step": 889
},
{
"epoch": 0.9501779359430605,
"grad_norm": 0.33751362562179565,
"learning_rate": 7.346236092954318e-08,
"loss": 0.46,
"step": 890
},
{
"epoch": 0.9512455516014234,
"grad_norm": 0.329089879989624,
"learning_rate": 7.031048857092604e-08,
"loss": 0.4536,
"step": 891
},
{
"epoch": 0.9523131672597864,
"grad_norm": 0.3587329685688019,
"learning_rate": 6.722724315084805e-08,
"loss": 0.4767,
"step": 892
},
{
"epoch": 0.9533807829181494,
"grad_norm": 0.3284720480442047,
"learning_rate": 6.421266759162659e-08,
"loss": 0.4491,
"step": 893
},
{
"epoch": 0.9544483985765124,
"grad_norm": 0.3474853038787842,
"learning_rate": 6.12668038596137e-08,
"loss": 0.461,
"step": 894
},
{
"epoch": 0.9555160142348754,
"grad_norm": 0.31584909558296204,
"learning_rate": 5.838969296461605e-08,
"loss": 0.4404,
"step": 895
},
{
"epoch": 0.9565836298932384,
"grad_norm": 0.3233558237552643,
"learning_rate": 5.5581374959320366e-08,
"loss": 0.453,
"step": 896
},
{
"epoch": 0.9576512455516014,
"grad_norm": 0.3326091468334198,
"learning_rate": 5.2841888938738314e-08,
"loss": 0.4507,
"step": 897
},
{
"epoch": 0.9587188612099644,
"grad_norm": 0.35787636041641235,
"learning_rate": 5.017127303966085e-08,
"loss": 0.4835,
"step": 898
},
{
"epoch": 0.9597864768683274,
"grad_norm": 0.33811962604522705,
"learning_rate": 4.7569564440128055e-08,
"loss": 0.4424,
"step": 899
},
{
"epoch": 0.9608540925266904,
"grad_norm": 0.3393824100494385,
"learning_rate": 4.50367993589107e-08,
"loss": 0.4733,
"step": 900
},
{
"epoch": 0.9619217081850534,
"grad_norm": 0.3107040822505951,
"learning_rate": 4.257301305500672e-08,
"loss": 0.4453,
"step": 901
},
{
"epoch": 0.9629893238434164,
"grad_norm": 0.336866557598114,
"learning_rate": 4.0178239827151077e-08,
"loss": 0.4726,
"step": 902
},
{
"epoch": 0.9640569395017794,
"grad_norm": 0.33734455704689026,
"learning_rate": 3.785251301333726e-08,
"loss": 0.4609,
"step": 903
},
{
"epoch": 0.9651245551601424,
"grad_norm": 0.3539280593395233,
"learning_rate": 3.559586499035206e-08,
"loss": 0.4769,
"step": 904
},
{
"epoch": 0.9661921708185054,
"grad_norm": 0.3523450791835785,
"learning_rate": 3.340832717332765e-08,
"loss": 0.4701,
"step": 905
},
{
"epoch": 0.9672597864768683,
"grad_norm": 0.33282899856567383,
"learning_rate": 3.128993001530245e-08,
"loss": 0.4708,
"step": 906
},
{
"epoch": 0.9683274021352313,
"grad_norm": 0.32930752635002136,
"learning_rate": 2.9240703006797044e-08,
"loss": 0.479,
"step": 907
},
{
"epoch": 0.9693950177935943,
"grad_norm": 0.3469620943069458,
"learning_rate": 2.7260674675404498e-08,
"loss": 0.4634,
"step": 908
},
{
"epoch": 0.9704626334519573,
"grad_norm": 0.3198079466819763,
"learning_rate": 2.5349872585392898e-08,
"loss": 0.4599,
"step": 909
},
{
"epoch": 0.9715302491103203,
"grad_norm": 0.353118896484375,
"learning_rate": 2.3508323337321225e-08,
"loss": 0.4737,
"step": 910
},
{
"epoch": 0.9725978647686833,
"grad_norm": 0.3712465763092041,
"learning_rate": 2.1736052567670195e-08,
"loss": 0.4837,
"step": 911
},
{
"epoch": 0.9736654804270463,
"grad_norm": 0.32326361536979675,
"learning_rate": 2.0033084948483104e-08,
"loss": 0.4392,
"step": 912
},
{
"epoch": 0.9747330960854093,
"grad_norm": 0.3160242736339569,
"learning_rate": 1.8399444187024995e-08,
"loss": 0.4778,
"step": 913
},
{
"epoch": 0.9758007117437723,
"grad_norm": 0.3436198830604553,
"learning_rate": 1.6835153025451246e-08,
"loss": 0.4531,
"step": 914
},
{
"epoch": 0.9768683274021353,
"grad_norm": 0.3504072427749634,
"learning_rate": 1.534023324049061e-08,
"loss": 0.458,
"step": 915
},
{
"epoch": 0.9779359430604982,
"grad_norm": 0.3485338091850281,
"learning_rate": 1.3914705643143788e-08,
"loss": 0.4405,
"step": 916
},
{
"epoch": 0.9790035587188612,
"grad_norm": 0.3303447961807251,
"learning_rate": 1.2558590078390886e-08,
"loss": 0.4714,
"step": 917
},
{
"epoch": 0.9800711743772242,
"grad_norm": 0.31479117274284363,
"learning_rate": 1.1271905424918294e-08,
"loss": 0.4798,
"step": 918
},
{
"epoch": 0.9811387900355872,
"grad_norm": 0.31873440742492676,
"learning_rate": 1.0054669594853905e-08,
"loss": 0.4448,
"step": 919
},
{
"epoch": 0.9822064056939501,
"grad_norm": 0.35837072134017944,
"learning_rate": 8.906899533517866e-09,
"loss": 0.4583,
"step": 920
},
{
"epoch": 0.9832740213523131,
"grad_norm": 0.33332061767578125,
"learning_rate": 7.828611219187765e-09,
"loss": 0.4802,
"step": 921
},
{
"epoch": 0.9843416370106761,
"grad_norm": 0.3198853135108948,
"learning_rate": 6.819819662874372e-09,
"loss": 0.448,
"step": 922
},
{
"epoch": 0.9854092526690391,
"grad_norm": 0.35830771923065186,
"learning_rate": 5.88053890811513e-09,
"loss": 0.494,
"step": 923
},
{
"epoch": 0.9864768683274021,
"grad_norm": 0.3175657093524933,
"learning_rate": 5.0107820307770945e-09,
"loss": 0.4736,
"step": 924
},
{
"epoch": 0.9875444839857651,
"grad_norm": 0.3394733965396881,
"learning_rate": 4.210561138873193e-09,
"loss": 0.4817,
"step": 925
},
{
"epoch": 0.9886120996441281,
"grad_norm": 0.33911213278770447,
"learning_rate": 3.4798873723984604e-09,
"loss": 0.4372,
"step": 926
},
{
"epoch": 0.9896797153024911,
"grad_norm": 0.34297052025794983,
"learning_rate": 2.818770903170176e-09,
"loss": 0.4613,
"step": 927
},
{
"epoch": 0.9907473309608541,
"grad_norm": 0.3289053738117218,
"learning_rate": 2.2272209346885233e-09,
"loss": 0.4486,
"step": 928
},
{
"epoch": 0.9918149466192171,
"grad_norm": 0.3399280905723572,
"learning_rate": 1.7052457020089175e-09,
"loss": 0.4625,
"step": 929
},
{
"epoch": 0.9928825622775801,
"grad_norm": 0.3539047837257385,
"learning_rate": 1.2528524716259872e-09,
"loss": 0.4833,
"step": 930
},
{
"epoch": 0.9939501779359431,
"grad_norm": 0.34324121475219727,
"learning_rate": 8.700475413719877e-10,
"loss": 0.4721,
"step": 931
},
{
"epoch": 0.9950177935943061,
"grad_norm": 0.34528717398643494,
"learning_rate": 5.568362403318706e-10,
"loss": 0.4494,
"step": 932
},
{
"epoch": 0.9960854092526691,
"grad_norm": 0.313473105430603,
"learning_rate": 3.132229287666766e-10,
"loss": 0.4741,
"step": 933
},
{
"epoch": 0.9971530249110321,
"grad_norm": 0.35655054450035095,
"learning_rate": 1.3921099805302985e-10,
"loss": 0.4796,
"step": 934
},
{
"epoch": 0.998220640569395,
"grad_norm": 0.3347904086112976,
"learning_rate": 3.480287063706289e-11,
"loss": 0.4614,
"step": 935
},
{
"epoch": 0.999288256227758,
"grad_norm": 0.3562867343425751,
"learning_rate": 0.0,
"loss": 0.4649,
"step": 936
},
{
"epoch": 0.999288256227758,
"step": 936,
"total_flos": 1106993868636160.0,
"train_loss": 0.49742555408141553,
"train_runtime": 30683.4136,
"train_samples_per_second": 2.93,
"train_steps_per_second": 0.031
}
],
"logging_steps": 1.0,
"max_steps": 936,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1106993868636160.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}