{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.1822489520685256,
"eval_steps": 500,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 9.112447603426281e-05,
"grad_norm": 4.810983180999756,
"learning_rate": 4.999999897557577e-05,
"loss": 3.2064,
"step": 1
},
{
"epoch": 0.00018224895206852561,
"grad_norm": 1.9742110967636108,
"learning_rate": 4.999999590230316e-05,
"loss": 2.9028,
"step": 2
},
{
"epoch": 0.0002733734281027884,
"grad_norm": 3.905468463897705,
"learning_rate": 4.99999907801824e-05,
"loss": 3.1475,
"step": 3
},
{
"epoch": 0.00036449790413705123,
"grad_norm": 3.2750675678253174,
"learning_rate": 4.9999983609213935e-05,
"loss": 3.1676,
"step": 4
},
{
"epoch": 0.000455622380171314,
"grad_norm": 2.567809581756592,
"learning_rate": 4.999997438939835e-05,
"loss": 2.3521,
"step": 5
},
{
"epoch": 0.0005467468562055768,
"grad_norm": 2.246769905090332,
"learning_rate": 4.9999963120736396e-05,
"loss": 3.0339,
"step": 6
},
{
"epoch": 0.0006378713322398396,
"grad_norm": 3.4692740440368652,
"learning_rate": 4.999994980322899e-05,
"loss": 2.579,
"step": 7
},
{
"epoch": 0.0007289958082741025,
"grad_norm": 3.0380406379699707,
"learning_rate": 4.999993443687723e-05,
"loss": 3.035,
"step": 8
},
{
"epoch": 0.0008201202843083652,
"grad_norm": 4.001114368438721,
"learning_rate": 4.999991702168238e-05,
"loss": 3.0391,
"step": 9
},
{
"epoch": 0.000911244760342628,
"grad_norm": 2.6958892345428467,
"learning_rate": 4.9999897557645856e-05,
"loss": 3.2347,
"step": 10
},
{
"epoch": 0.0010023692363768909,
"grad_norm": 3.562798500061035,
"learning_rate": 4.9999876044769266e-05,
"loss": 3.0267,
"step": 11
},
{
"epoch": 0.0010934937124111536,
"grad_norm": 3.4214489459991455,
"learning_rate": 4.999985248305436e-05,
"loss": 3.0557,
"step": 12
},
{
"epoch": 0.0011846181884454165,
"grad_norm": 5.271027565002441,
"learning_rate": 4.9999826872503085e-05,
"loss": 3.191,
"step": 13
},
{
"epoch": 0.0012757426644796792,
"grad_norm": 4.908616542816162,
"learning_rate": 4.999979921311753e-05,
"loss": 3.1134,
"step": 14
},
{
"epoch": 0.001366867140513942,
"grad_norm": 3.8159241676330566,
"learning_rate": 4.999976950489995e-05,
"loss": 3.3288,
"step": 15
},
{
"epoch": 0.001457991616548205,
"grad_norm": 4.082758903503418,
"learning_rate": 4.99997377478528e-05,
"loss": 3.1537,
"step": 16
},
{
"epoch": 0.0015491160925824676,
"grad_norm": 3.051913022994995,
"learning_rate": 4.999970394197867e-05,
"loss": 3.2477,
"step": 17
},
{
"epoch": 0.0016402405686167304,
"grad_norm": 4.618947505950928,
"learning_rate": 4.9999668087280336e-05,
"loss": 3.1326,
"step": 18
},
{
"epoch": 0.0017313650446509933,
"grad_norm": 3.8966140747070312,
"learning_rate": 4.9999630183760745e-05,
"loss": 3.0284,
"step": 19
},
{
"epoch": 0.001822489520685256,
"grad_norm": 4.619442462921143,
"learning_rate": 4.999959023142298e-05,
"loss": 3.3404,
"step": 20
},
{
"epoch": 0.0019136139967195188,
"grad_norm": 4.453152656555176,
"learning_rate": 4.999954823027034e-05,
"loss": 3.1907,
"step": 21
},
{
"epoch": 0.0020047384727537817,
"grad_norm": 2.996080160140991,
"learning_rate": 4.999950418030625e-05,
"loss": 3.32,
"step": 22
},
{
"epoch": 0.0020958629487880446,
"grad_norm": 3.222342014312744,
"learning_rate": 4.999945808153433e-05,
"loss": 3.3387,
"step": 23
},
{
"epoch": 0.002186987424822307,
"grad_norm": 2.4748733043670654,
"learning_rate": 4.9999409933958354e-05,
"loss": 3.2016,
"step": 24
},
{
"epoch": 0.00227811190085657,
"grad_norm": 2.91965913772583,
"learning_rate": 4.9999359737582266e-05,
"loss": 3.0563,
"step": 25
},
{
"epoch": 0.002369236376890833,
"grad_norm": 3.890470027923584,
"learning_rate": 4.9999307492410176e-05,
"loss": 2.9124,
"step": 26
},
{
"epoch": 0.0024603608529250955,
"grad_norm": 2.584951877593994,
"learning_rate": 4.999925319844638e-05,
"loss": 3.1967,
"step": 27
},
{
"epoch": 0.0025514853289593585,
"grad_norm": 3.9780001640319824,
"learning_rate": 4.999919685569532e-05,
"loss": 3.279,
"step": 28
},
{
"epoch": 0.0026426098049936214,
"grad_norm": 2.990048885345459,
"learning_rate": 4.999913846416161e-05,
"loss": 3.1481,
"step": 29
},
{
"epoch": 0.002733734281027884,
"grad_norm": 3.9399380683898926,
"learning_rate": 4.999907802385003e-05,
"loss": 2.9727,
"step": 30
},
{
"epoch": 0.002824858757062147,
"grad_norm": 3.745975971221924,
"learning_rate": 4.999901553476555e-05,
"loss": 2.8796,
"step": 31
},
{
"epoch": 0.00291598323309641,
"grad_norm": 3.4274938106536865,
"learning_rate": 4.999895099691328e-05,
"loss": 3.0988,
"step": 32
},
{
"epoch": 0.0030071077091306723,
"grad_norm": 2.9808709621429443,
"learning_rate": 4.999888441029852e-05,
"loss": 3.0458,
"step": 33
},
{
"epoch": 0.0030982321851649353,
"grad_norm": 2.1409685611724854,
"learning_rate": 4.9998815774926714e-05,
"loss": 3.0208,
"step": 34
},
{
"epoch": 0.0031893566611991982,
"grad_norm": 2.318962335586548,
"learning_rate": 4.9998745090803486e-05,
"loss": 3.0808,
"step": 35
},
{
"epoch": 0.0032804811372334607,
"grad_norm": 3.4923746585845947,
"learning_rate": 4.999867235793464e-05,
"loss": 3.2533,
"step": 36
},
{
"epoch": 0.0033716056132677237,
"grad_norm": 2.893838882446289,
"learning_rate": 4.9998597576326135e-05,
"loss": 3.1789,
"step": 37
},
{
"epoch": 0.0034627300893019866,
"grad_norm": 2.9843909740448,
"learning_rate": 4.999852074598409e-05,
"loss": 3.088,
"step": 38
},
{
"epoch": 0.003553854565336249,
"grad_norm": 4.7274041175842285,
"learning_rate": 4.99984418669148e-05,
"loss": 3.0058,
"step": 39
},
{
"epoch": 0.003644979041370512,
"grad_norm": 4.613773822784424,
"learning_rate": 4.999836093912475e-05,
"loss": 3.0773,
"step": 40
},
{
"epoch": 0.003736103517404775,
"grad_norm": 5.928118705749512,
"learning_rate": 4.9998277962620556e-05,
"loss": 3.8138,
"step": 41
},
{
"epoch": 0.0038272279934390375,
"grad_norm": 3.0459301471710205,
"learning_rate": 4.9998192937409015e-05,
"loss": 3.1967,
"step": 42
},
{
"epoch": 0.0039183524694733005,
"grad_norm": 3.857424736022949,
"learning_rate": 4.999810586349711e-05,
"loss": 3.2375,
"step": 43
},
{
"epoch": 0.004009476945507563,
"grad_norm": 2.038663387298584,
"learning_rate": 4.999801674089197e-05,
"loss": 3.0122,
"step": 44
},
{
"epoch": 0.004100601421541826,
"grad_norm": 2.0778284072875977,
"learning_rate": 4.999792556960089e-05,
"loss": 3.0134,
"step": 45
},
{
"epoch": 0.004191725897576089,
"grad_norm": 3.187390089035034,
"learning_rate": 4.999783234963136e-05,
"loss": 2.9639,
"step": 46
},
{
"epoch": 0.004282850373610351,
"grad_norm": 4.144068717956543,
"learning_rate": 4.9997737080991005e-05,
"loss": 3.0554,
"step": 47
},
{
"epoch": 0.004373974849644614,
"grad_norm": 3.5001306533813477,
"learning_rate": 4.999763976368763e-05,
"loss": 3.0371,
"step": 48
},
{
"epoch": 0.004465099325678877,
"grad_norm": 3.6111905574798584,
"learning_rate": 4.9997540397729226e-05,
"loss": 2.9927,
"step": 49
},
{
"epoch": 0.00455622380171314,
"grad_norm": 2.989147901535034,
"learning_rate": 4.999743898312392e-05,
"loss": 3.1585,
"step": 50
},
{
"epoch": 0.004647348277747403,
"grad_norm": 3.8651845455169678,
"learning_rate": 4.9997335519880036e-05,
"loss": 2.4887,
"step": 51
},
{
"epoch": 0.004738472753781666,
"grad_norm": 3.4701950550079346,
"learning_rate": 4.9997230008006045e-05,
"loss": 3.0551,
"step": 52
},
{
"epoch": 0.004829597229815928,
"grad_norm": 2.621978998184204,
"learning_rate": 4.99971224475106e-05,
"loss": 3.1943,
"step": 53
},
{
"epoch": 0.004920721705850191,
"grad_norm": 2.872795343399048,
"learning_rate": 4.999701283840252e-05,
"loss": 2.8501,
"step": 54
},
{
"epoch": 0.005011846181884454,
"grad_norm": 2.817953586578369,
"learning_rate": 4.9996901180690774e-05,
"loss": 2.9754,
"step": 55
},
{
"epoch": 0.005102970657918717,
"grad_norm": 1.9297401905059814,
"learning_rate": 4.999678747438452e-05,
"loss": 3.034,
"step": 56
},
{
"epoch": 0.00519409513395298,
"grad_norm": 3.435847520828247,
"learning_rate": 4.9996671719493085e-05,
"loss": 3.1619,
"step": 57
},
{
"epoch": 0.005285219609987243,
"grad_norm": 1.9299631118774414,
"learning_rate": 4.999655391602594e-05,
"loss": 3.0437,
"step": 58
},
{
"epoch": 0.005376344086021506,
"grad_norm": 3.256587266921997,
"learning_rate": 4.999643406399275e-05,
"loss": 3.0342,
"step": 59
},
{
"epoch": 0.005467468562055768,
"grad_norm": 2.301542282104492,
"learning_rate": 4.999631216340333e-05,
"loss": 3.0987,
"step": 60
},
{
"epoch": 0.005558593038090031,
"grad_norm": 4.333310127258301,
"learning_rate": 4.999618821426768e-05,
"loss": 2.9449,
"step": 61
},
{
"epoch": 0.005649717514124294,
"grad_norm": 1.8521463871002197,
"learning_rate": 4.999606221659595e-05,
"loss": 2.991,
"step": 62
},
{
"epoch": 0.005740841990158557,
"grad_norm": 3.421117067337036,
"learning_rate": 4.999593417039847e-05,
"loss": 3.1391,
"step": 63
},
{
"epoch": 0.00583196646619282,
"grad_norm": 3.233793020248413,
"learning_rate": 4.999580407568573e-05,
"loss": 3.0962,
"step": 64
},
{
"epoch": 0.005923090942227083,
"grad_norm": 3.7364771366119385,
"learning_rate": 4.99956719324684e-05,
"loss": 3.1224,
"step": 65
},
{
"epoch": 0.006014215418261345,
"grad_norm": 3.039381980895996,
"learning_rate": 4.9995537740757296e-05,
"loss": 3.0738,
"step": 66
},
{
"epoch": 0.006105339894295608,
"grad_norm": 2.6636710166931152,
"learning_rate": 4.999540150056343e-05,
"loss": 2.7519,
"step": 67
},
{
"epoch": 0.0061964643703298706,
"grad_norm": 4.360434532165527,
"learning_rate": 4.999526321189796e-05,
"loss": 3.3508,
"step": 68
},
{
"epoch": 0.0062875888463641335,
"grad_norm": 3.0914595127105713,
"learning_rate": 4.999512287477222e-05,
"loss": 3.1953,
"step": 69
},
{
"epoch": 0.0063787133223983964,
"grad_norm": 2.340858221054077,
"learning_rate": 4.999498048919771e-05,
"loss": 3.0567,
"step": 70
},
{
"epoch": 0.006469837798432659,
"grad_norm": 2.7261271476745605,
"learning_rate": 4.9994836055186104e-05,
"loss": 2.9738,
"step": 71
},
{
"epoch": 0.0065609622744669215,
"grad_norm": 4.343580722808838,
"learning_rate": 4.999468957274923e-05,
"loss": 2.1352,
"step": 72
},
{
"epoch": 0.006652086750501184,
"grad_norm": 4.178849220275879,
"learning_rate": 4.9994541041899104e-05,
"loss": 2.9654,
"step": 73
},
{
"epoch": 0.006743211226535447,
"grad_norm": 2.4153099060058594,
"learning_rate": 4.999439046264789e-05,
"loss": 3.0347,
"step": 74
},
{
"epoch": 0.00683433570256971,
"grad_norm": 3.150313377380371,
"learning_rate": 4.999423783500793e-05,
"loss": 3.0065,
"step": 75
},
{
"epoch": 0.006925460178603973,
"grad_norm": 3.502631902694702,
"learning_rate": 4.999408315899173e-05,
"loss": 3.0649,
"step": 76
},
{
"epoch": 0.007016584654638236,
"grad_norm": 3.1519196033477783,
"learning_rate": 4.999392643461198e-05,
"loss": 2.5478,
"step": 77
},
{
"epoch": 0.007107709130672498,
"grad_norm": 3.289674758911133,
"learning_rate": 4.9993767661881505e-05,
"loss": 2.9503,
"step": 78
},
{
"epoch": 0.007198833606706761,
"grad_norm": 2.8465538024902344,
"learning_rate": 4.999360684081333e-05,
"loss": 2.7604,
"step": 79
},
{
"epoch": 0.007289958082741024,
"grad_norm": 1.927045464515686,
"learning_rate": 4.999344397142064e-05,
"loss": 3.2912,
"step": 80
},
{
"epoch": 0.007381082558775287,
"grad_norm": 3.03855037689209,
"learning_rate": 4.9993279053716767e-05,
"loss": 2.8311,
"step": 81
},
{
"epoch": 0.00747220703480955,
"grad_norm": 3.128101348876953,
"learning_rate": 4.9993112087715236e-05,
"loss": 3.2222,
"step": 82
},
{
"epoch": 0.007563331510843813,
"grad_norm": 2.3521552085876465,
"learning_rate": 4.999294307342972e-05,
"loss": 3.0606,
"step": 83
},
{
"epoch": 0.007654455986878075,
"grad_norm": 4.49812650680542,
"learning_rate": 4.999277201087409e-05,
"loss": 2.5668,
"step": 84
},
{
"epoch": 0.007745580462912338,
"grad_norm": 1.8955475091934204,
"learning_rate": 4.999259890006235e-05,
"loss": 2.9524,
"step": 85
},
{
"epoch": 0.007836704938946601,
"grad_norm": 2.4218337535858154,
"learning_rate": 4.999242374100869e-05,
"loss": 2.7087,
"step": 86
},
{
"epoch": 0.007927829414980863,
"grad_norm": 2.254610538482666,
"learning_rate": 4.999224653372747e-05,
"loss": 3.0396,
"step": 87
},
{
"epoch": 0.008018953891015127,
"grad_norm": 3.1680707931518555,
"learning_rate": 4.99920672782332e-05,
"loss": 3.0425,
"step": 88
},
{
"epoch": 0.008110078367049389,
"grad_norm": 3.0137205123901367,
"learning_rate": 4.999188597454059e-05,
"loss": 2.9103,
"step": 89
},
{
"epoch": 0.008201202843083653,
"grad_norm": 3.345235824584961,
"learning_rate": 4.999170262266448e-05,
"loss": 3.1356,
"step": 90
},
{
"epoch": 0.008292327319117915,
"grad_norm": 3.2153403759002686,
"learning_rate": 4.999151722261991e-05,
"loss": 3.3007,
"step": 91
},
{
"epoch": 0.008383451795152179,
"grad_norm": 2.3512165546417236,
"learning_rate": 4.999132977442207e-05,
"loss": 3.0219,
"step": 92
},
{
"epoch": 0.00847457627118644,
"grad_norm": 2.2550835609436035,
"learning_rate": 4.9991140278086316e-05,
"loss": 3.1043,
"step": 93
},
{
"epoch": 0.008565700747220703,
"grad_norm": 4.217181205749512,
"learning_rate": 4.9990948733628186e-05,
"loss": 3.1672,
"step": 94
},
{
"epoch": 0.008656825223254967,
"grad_norm": 3.7786593437194824,
"learning_rate": 4.9990755141063376e-05,
"loss": 3.051,
"step": 95
},
{
"epoch": 0.008747949699289229,
"grad_norm": 2.935464859008789,
"learning_rate": 4.999055950040775e-05,
"loss": 2.7657,
"step": 96
},
{
"epoch": 0.008839074175323492,
"grad_norm": 2.1169307231903076,
"learning_rate": 4.999036181167733e-05,
"loss": 2.951,
"step": 97
},
{
"epoch": 0.008930198651357754,
"grad_norm": 3.458928108215332,
"learning_rate": 4.999016207488835e-05,
"loss": 2.7861,
"step": 98
},
{
"epoch": 0.009021323127392018,
"grad_norm": 3.1938233375549316,
"learning_rate": 4.998996029005715e-05,
"loss": 2.782,
"step": 99
},
{
"epoch": 0.00911244760342628,
"grad_norm": 1.885495901107788,
"learning_rate": 4.998975645720027e-05,
"loss": 2.9136,
"step": 100
},
{
"epoch": 0.009203572079460542,
"grad_norm": 3.273291826248169,
"learning_rate": 4.998955057633442e-05,
"loss": 3.2322,
"step": 101
},
{
"epoch": 0.009294696555494806,
"grad_norm": 1.6428008079528809,
"learning_rate": 4.998934264747648e-05,
"loss": 2.7924,
"step": 102
},
{
"epoch": 0.009385821031529068,
"grad_norm": 3.5032806396484375,
"learning_rate": 4.9989132670643486e-05,
"loss": 2.8194,
"step": 103
},
{
"epoch": 0.009476945507563332,
"grad_norm": 2.517979145050049,
"learning_rate": 4.998892064585264e-05,
"loss": 2.5998,
"step": 104
},
{
"epoch": 0.009568069983597594,
"grad_norm": 2.546837329864502,
"learning_rate": 4.9988706573121324e-05,
"loss": 3.0157,
"step": 105
},
{
"epoch": 0.009659194459631856,
"grad_norm": 3.4117698669433594,
"learning_rate": 4.9988490452467074e-05,
"loss": 3.1974,
"step": 106
},
{
"epoch": 0.00975031893566612,
"grad_norm": 3.3978612422943115,
"learning_rate": 4.9988272283907616e-05,
"loss": 2.4654,
"step": 107
},
{
"epoch": 0.009841443411700382,
"grad_norm": 2.8470406532287598,
"learning_rate": 4.998805206746082e-05,
"loss": 2.7727,
"step": 108
},
{
"epoch": 0.009932567887734646,
"grad_norm": 2.559325695037842,
"learning_rate": 4.998782980314474e-05,
"loss": 2.9934,
"step": 109
},
{
"epoch": 0.010023692363768908,
"grad_norm": 2.0657799243927,
"learning_rate": 4.998760549097758e-05,
"loss": 3.084,
"step": 110
},
{
"epoch": 0.010114816839803172,
"grad_norm": 4.179584980010986,
"learning_rate": 4.998737913097773e-05,
"loss": 2.7185,
"step": 111
},
{
"epoch": 0.010205941315837434,
"grad_norm": 3.370600700378418,
"learning_rate": 4.998715072316375e-05,
"loss": 3.3553,
"step": 112
},
{
"epoch": 0.010297065791871696,
"grad_norm": 3.263859987258911,
"learning_rate": 4.998692026755435e-05,
"loss": 3.2411,
"step": 113
},
{
"epoch": 0.01038819026790596,
"grad_norm": 3.004282236099243,
"learning_rate": 4.998668776416842e-05,
"loss": 3.2068,
"step": 114
},
{
"epoch": 0.010479314743940222,
"grad_norm": 4.776451110839844,
"learning_rate": 4.9986453213024996e-05,
"loss": 2.931,
"step": 115
},
{
"epoch": 0.010570439219974486,
"grad_norm": 3.8841757774353027,
"learning_rate": 4.9986216614143335e-05,
"loss": 2.6147,
"step": 116
},
{
"epoch": 0.010661563696008748,
"grad_norm": 3.1611170768737793,
"learning_rate": 4.9985977967542794e-05,
"loss": 3.2392,
"step": 117
},
{
"epoch": 0.010752688172043012,
"grad_norm": 3.0968759059906006,
"learning_rate": 4.998573727324295e-05,
"loss": 3.0568,
"step": 118
},
{
"epoch": 0.010843812648077274,
"grad_norm": 2.2086093425750732,
"learning_rate": 4.998549453126353e-05,
"loss": 3.0838,
"step": 119
},
{
"epoch": 0.010934937124111536,
"grad_norm": 3.466169834136963,
"learning_rate": 4.998524974162442e-05,
"loss": 2.9381,
"step": 120
},
{
"epoch": 0.0110260616001458,
"grad_norm": 3.442246437072754,
"learning_rate": 4.998500290434568e-05,
"loss": 2.8684,
"step": 121
},
{
"epoch": 0.011117186076180062,
"grad_norm": 3.4763424396514893,
"learning_rate": 4.998475401944754e-05,
"loss": 3.1428,
"step": 122
},
{
"epoch": 0.011208310552214325,
"grad_norm": 3.302222728729248,
"learning_rate": 4.9984503086950416e-05,
"loss": 2.9401,
"step": 123
},
{
"epoch": 0.011299435028248588,
"grad_norm": 2.623926877975464,
"learning_rate": 4.998425010687484e-05,
"loss": 3.0758,
"step": 124
},
{
"epoch": 0.01139055950428285,
"grad_norm": 3.527374744415283,
"learning_rate": 4.998399507924157e-05,
"loss": 2.7145,
"step": 125
},
{
"epoch": 0.011481683980317113,
"grad_norm": 2.3806726932525635,
"learning_rate": 4.9983738004071495e-05,
"loss": 2.7357,
"step": 126
},
{
"epoch": 0.011572808456351375,
"grad_norm": 3.3078513145446777,
"learning_rate": 4.998347888138569e-05,
"loss": 3.0489,
"step": 127
},
{
"epoch": 0.01166393293238564,
"grad_norm": 2.8850951194763184,
"learning_rate": 4.9983217711205386e-05,
"loss": 3.0792,
"step": 128
},
{
"epoch": 0.011755057408419901,
"grad_norm": 1.8585487604141235,
"learning_rate": 4.998295449355199e-05,
"loss": 2.9322,
"step": 129
},
{
"epoch": 0.011846181884454165,
"grad_norm": 4.037771701812744,
"learning_rate": 4.9982689228447064e-05,
"loss": 2.9084,
"step": 130
},
{
"epoch": 0.011937306360488427,
"grad_norm": 2.485532283782959,
"learning_rate": 4.998242191591237e-05,
"loss": 2.8812,
"step": 131
},
{
"epoch": 0.01202843083652269,
"grad_norm": 2.1966612339019775,
"learning_rate": 4.9982152555969786e-05,
"loss": 3.1189,
"step": 132
},
{
"epoch": 0.012119555312556953,
"grad_norm": 1.8410605192184448,
"learning_rate": 4.9981881148641405e-05,
"loss": 3.0395,
"step": 133
},
{
"epoch": 0.012210679788591215,
"grad_norm": 2.812716007232666,
"learning_rate": 4.998160769394947e-05,
"loss": 3.0484,
"step": 134
},
{
"epoch": 0.012301804264625479,
"grad_norm": 3.6690826416015625,
"learning_rate": 4.99813321919164e-05,
"loss": 2.9871,
"step": 135
},
{
"epoch": 0.012392928740659741,
"grad_norm": 4.960578918457031,
"learning_rate": 4.998105464256475e-05,
"loss": 2.1866,
"step": 136
},
{
"epoch": 0.012484053216694003,
"grad_norm": 3.6404716968536377,
"learning_rate": 4.998077504591728e-05,
"loss": 3.0983,
"step": 137
},
{
"epoch": 0.012575177692728267,
"grad_norm": 3.0544800758361816,
"learning_rate": 4.9980493401996905e-05,
"loss": 3.0374,
"step": 138
},
{
"epoch": 0.012666302168762529,
"grad_norm": 2.765801191329956,
"learning_rate": 4.99802097108267e-05,
"loss": 2.8467,
"step": 139
},
{
"epoch": 0.012757426644796793,
"grad_norm": 2.434861898422241,
"learning_rate": 4.997992397242992e-05,
"loss": 2.9516,
"step": 140
},
{
"epoch": 0.012848551120831055,
"grad_norm": 3.1218740940093994,
"learning_rate": 4.997963618682998e-05,
"loss": 3.0314,
"step": 141
},
{
"epoch": 0.012939675596865319,
"grad_norm": 1.805385947227478,
"learning_rate": 4.997934635405047e-05,
"loss": 2.9169,
"step": 142
},
{
"epoch": 0.01303080007289958,
"grad_norm": 3.3232510089874268,
"learning_rate": 4.9979054474115144e-05,
"loss": 3.2397,
"step": 143
},
{
"epoch": 0.013121924548933843,
"grad_norm": 2.7170846462249756,
"learning_rate": 4.9978760547047915e-05,
"loss": 3.1618,
"step": 144
},
{
"epoch": 0.013213049024968107,
"grad_norm": 5.612462520599365,
"learning_rate": 4.9978464572872876e-05,
"loss": 3.7115,
"step": 145
},
{
"epoch": 0.013304173501002369,
"grad_norm": 3.0545284748077393,
"learning_rate": 4.997816655161428e-05,
"loss": 2.8899,
"step": 146
},
{
"epoch": 0.013395297977036633,
"grad_norm": 2.2899675369262695,
"learning_rate": 4.9977866483296544e-05,
"loss": 3.01,
"step": 147
},
{
"epoch": 0.013486422453070895,
"grad_norm": 1.921761393547058,
"learning_rate": 4.997756436794428e-05,
"loss": 2.9777,
"step": 148
},
{
"epoch": 0.013577546929105158,
"grad_norm": 3.515000104904175,
"learning_rate": 4.997726020558223e-05,
"loss": 2.9246,
"step": 149
},
{
"epoch": 0.01366867140513942,
"grad_norm": 1.8955378532409668,
"learning_rate": 4.997695399623533e-05,
"loss": 2.9843,
"step": 150
},
{
"epoch": 0.013759795881173683,
"grad_norm": 2.8406405448913574,
"learning_rate": 4.9976645739928675e-05,
"loss": 2.819,
"step": 151
},
{
"epoch": 0.013850920357207946,
"grad_norm": 2.192831039428711,
"learning_rate": 4.9976335436687525e-05,
"loss": 2.9843,
"step": 152
},
{
"epoch": 0.013942044833242209,
"grad_norm": 2.079318046569824,
"learning_rate": 4.997602308653731e-05,
"loss": 2.9742,
"step": 153
},
{
"epoch": 0.014033169309276472,
"grad_norm": 2.436514139175415,
"learning_rate": 4.997570868950363e-05,
"loss": 2.8987,
"step": 154
},
{
"epoch": 0.014124293785310734,
"grad_norm": 2.0050177574157715,
"learning_rate": 4.9975392245612254e-05,
"loss": 2.9146,
"step": 155
},
{
"epoch": 0.014215418261344996,
"grad_norm": 2.7386975288391113,
"learning_rate": 4.99750737548891e-05,
"loss": 3.106,
"step": 156
},
{
"epoch": 0.01430654273737926,
"grad_norm": 3.8090169429779053,
"learning_rate": 4.9974753217360295e-05,
"loss": 2.5344,
"step": 157
},
{
"epoch": 0.014397667213413522,
"grad_norm": 1.7630343437194824,
"learning_rate": 4.9974430633052085e-05,
"loss": 2.6978,
"step": 158
},
{
"epoch": 0.014488791689447786,
"grad_norm": 2.020094156265259,
"learning_rate": 4.9974106001990923e-05,
"loss": 3.0851,
"step": 159
},
{
"epoch": 0.014579916165482048,
"grad_norm": 2.589174270629883,
"learning_rate": 4.997377932420341e-05,
"loss": 2.8161,
"step": 160
},
{
"epoch": 0.014671040641516312,
"grad_norm": 3.2339718341827393,
"learning_rate": 4.997345059971631e-05,
"loss": 2.6908,
"step": 161
},
{
"epoch": 0.014762165117550574,
"grad_norm": 2.0948266983032227,
"learning_rate": 4.997311982855657e-05,
"loss": 2.9472,
"step": 162
},
{
"epoch": 0.014853289593584836,
"grad_norm": 2.4752578735351562,
"learning_rate": 4.99727870107513e-05,
"loss": 3.1334,
"step": 163
},
{
"epoch": 0.0149444140696191,
"grad_norm": 3.5128135681152344,
"learning_rate": 4.997245214632778e-05,
"loss": 2.8477,
"step": 164
},
{
"epoch": 0.015035538545653362,
"grad_norm": 2.700103282928467,
"learning_rate": 4.997211523531344e-05,
"loss": 3.0588,
"step": 165
},
{
"epoch": 0.015126663021687626,
"grad_norm": 4.285271644592285,
"learning_rate": 4.9971776277735906e-05,
"loss": 2.7581,
"step": 166
},
{
"epoch": 0.015217787497721888,
"grad_norm": 3.8157570362091064,
"learning_rate": 4.997143527362294e-05,
"loss": 3.0998,
"step": 167
},
{
"epoch": 0.01530891197375615,
"grad_norm": 3.245560646057129,
"learning_rate": 4.99710922230025e-05,
"loss": 3.0689,
"step": 168
},
{
"epoch": 0.015400036449790414,
"grad_norm": 3.2004127502441406,
"learning_rate": 4.9970747125902694e-05,
"loss": 2.8422,
"step": 169
},
{
"epoch": 0.015491160925824676,
"grad_norm": 3.4759035110473633,
"learning_rate": 4.997039998235181e-05,
"loss": 3.1256,
"step": 170
},
{
"epoch": 0.01558228540185894,
"grad_norm": 2.497690200805664,
"learning_rate": 4.99700507923783e-05,
"loss": 2.9096,
"step": 171
},
{
"epoch": 0.015673409877893202,
"grad_norm": 2.1334710121154785,
"learning_rate": 4.996969955601078e-05,
"loss": 2.9044,
"step": 172
},
{
"epoch": 0.015764534353927464,
"grad_norm": 2.300891160964966,
"learning_rate": 4.9969346273278025e-05,
"loss": 2.8908,
"step": 173
},
{
"epoch": 0.015855658829961726,
"grad_norm": 2.0513505935668945,
"learning_rate": 4.996899094420901e-05,
"loss": 2.8817,
"step": 174
},
{
"epoch": 0.01594678330599599,
"grad_norm": 2.8278379440307617,
"learning_rate": 4.996863356883282e-05,
"loss": 3.1366,
"step": 175
},
{
"epoch": 0.016037907782030254,
"grad_norm": 3.43684720993042,
"learning_rate": 4.996827414717878e-05,
"loss": 3.0606,
"step": 176
},
{
"epoch": 0.016129032258064516,
"grad_norm": 3.0732672214508057,
"learning_rate": 4.9967912679276316e-05,
"loss": 2.843,
"step": 177
},
{
"epoch": 0.016220156734098778,
"grad_norm": 3.4920480251312256,
"learning_rate": 4.996754916515508e-05,
"loss": 2.8969,
"step": 178
},
{
"epoch": 0.016311281210133043,
"grad_norm": 3.4001033306121826,
"learning_rate": 4.996718360484485e-05,
"loss": 2.8035,
"step": 179
},
{
"epoch": 0.016402405686167305,
"grad_norm": 1.7016775608062744,
"learning_rate": 4.9966815998375575e-05,
"loss": 2.7236,
"step": 180
},
{
"epoch": 0.016493530162201567,
"grad_norm": 3.0281150341033936,
"learning_rate": 4.99664463457774e-05,
"loss": 3.2244,
"step": 181
},
{
"epoch": 0.01658465463823583,
"grad_norm": 3.8529109954833984,
"learning_rate": 4.9966074647080606e-05,
"loss": 2.7765,
"step": 182
},
{
"epoch": 0.01667577911427009,
"grad_norm": 4.410470008850098,
"learning_rate": 4.996570090231566e-05,
"loss": 3.2551,
"step": 183
},
{
"epoch": 0.016766903590304357,
"grad_norm": 2.621544122695923,
"learning_rate": 4.99653251115132e-05,
"loss": 2.8792,
"step": 184
},
{
"epoch": 0.01685802806633862,
"grad_norm": 3.7127602100372314,
"learning_rate": 4.9964947274704e-05,
"loss": 2.4034,
"step": 185
},
{
"epoch": 0.01694915254237288,
"grad_norm": 3.494060754776001,
"learning_rate": 4.996456739191905e-05,
"loss": 2.697,
"step": 186
},
{
"epoch": 0.017040277018407143,
"grad_norm": 2.872035264968872,
"learning_rate": 4.9964185463189475e-05,
"loss": 3.0493,
"step": 187
},
{
"epoch": 0.017131401494441405,
"grad_norm": 3.452061414718628,
"learning_rate": 4.996380148854657e-05,
"loss": 3.1065,
"step": 188
},
{
"epoch": 0.01722252597047567,
"grad_norm": 2.105959415435791,
"learning_rate": 4.996341546802181e-05,
"loss": 2.9128,
"step": 189
},
{
"epoch": 0.017313650446509933,
"grad_norm": 3.6331939697265625,
"learning_rate": 4.996302740164683e-05,
"loss": 2.7058,
"step": 190
},
{
"epoch": 0.017404774922544195,
"grad_norm": 2.7457187175750732,
"learning_rate": 4.996263728945343e-05,
"loss": 2.7081,
"step": 191
},
{
"epoch": 0.017495899398578457,
"grad_norm": 2.6010935306549072,
"learning_rate": 4.9962245131473585e-05,
"loss": 2.6168,
"step": 192
},
{
"epoch": 0.01758702387461272,
"grad_norm": 3.7223236560821533,
"learning_rate": 4.996185092773943e-05,
"loss": 2.9499,
"step": 193
},
{
"epoch": 0.017678148350646985,
"grad_norm": 3.707608938217163,
"learning_rate": 4.996145467828327e-05,
"loss": 2.4925,
"step": 194
},
{
"epoch": 0.017769272826681247,
"grad_norm": 2.3394830226898193,
"learning_rate": 4.996105638313759e-05,
"loss": 2.965,
"step": 195
},
{
"epoch": 0.01786039730271551,
"grad_norm": 3.4291019439697266,
"learning_rate": 4.9960656042335016e-05,
"loss": 2.9025,
"step": 196
},
{
"epoch": 0.01795152177874977,
"grad_norm": 3.2778756618499756,
"learning_rate": 4.9960253655908374e-05,
"loss": 2.7247,
"step": 197
},
{
"epoch": 0.018042646254784037,
"grad_norm": 2.798750877380371,
"learning_rate": 4.995984922389063e-05,
"loss": 2.6033,
"step": 198
},
{
"epoch": 0.0181337707308183,
"grad_norm": 3.6283669471740723,
"learning_rate": 4.995944274631493e-05,
"loss": 2.6426,
"step": 199
},
{
"epoch": 0.01822489520685256,
"grad_norm": 3.634934902191162,
"learning_rate": 4.9959034223214595e-05,
"loss": 2.8299,
"step": 200
},
{
"epoch": 0.018316019682886823,
"grad_norm": 4.604448318481445,
"learning_rate": 4.99586236546231e-05,
"loss": 2.9753,
"step": 201
},
{
"epoch": 0.018407144158921085,
"grad_norm": 3.651458501815796,
"learning_rate": 4.995821104057409e-05,
"loss": 2.5728,
"step": 202
},
{
"epoch": 0.01849826863495535,
"grad_norm": 2.339756727218628,
"learning_rate": 4.9957796381101374e-05,
"loss": 3.1176,
"step": 203
},
{
"epoch": 0.018589393110989613,
"grad_norm": 3.3692636489868164,
"learning_rate": 4.9957379676238945e-05,
"loss": 3.2663,
"step": 204
},
{
"epoch": 0.018680517587023875,
"grad_norm": 4.322436809539795,
"learning_rate": 4.995696092602096e-05,
"loss": 3.0394,
"step": 205
},
{
"epoch": 0.018771642063058137,
"grad_norm": 3.796675205230713,
"learning_rate": 4.995654013048172e-05,
"loss": 2.8621,
"step": 206
},
{
"epoch": 0.0188627665390924,
"grad_norm": 2.7833352088928223,
"learning_rate": 4.995611728965571e-05,
"loss": 2.8098,
"step": 207
},
{
"epoch": 0.018953891015126664,
"grad_norm": 3.21239972114563,
"learning_rate": 4.995569240357761e-05,
"loss": 2.7568,
"step": 208
},
{
"epoch": 0.019045015491160926,
"grad_norm": 1.92779541015625,
"learning_rate": 4.995526547228222e-05,
"loss": 2.9617,
"step": 209
},
{
"epoch": 0.01913613996719519,
"grad_norm": 1.4563132524490356,
"learning_rate": 4.9954836495804525e-05,
"loss": 2.7798,
"step": 210
},
{
"epoch": 0.01922726444322945,
"grad_norm": 2.3598124980926514,
"learning_rate": 4.99544054741797e-05,
"loss": 2.7242,
"step": 211
},
{
"epoch": 0.019318388919263713,
"grad_norm": 3.2183914184570312,
"learning_rate": 4.995397240744305e-05,
"loss": 2.725,
"step": 212
},
{
"epoch": 0.019409513395297978,
"grad_norm": 3.6406822204589844,
"learning_rate": 4.9953537295630074e-05,
"loss": 2.9159,
"step": 213
},
{
"epoch": 0.01950063787133224,
"grad_norm": 3.356208086013794,
"learning_rate": 4.995310013877643e-05,
"loss": 2.3788,
"step": 214
},
{
"epoch": 0.019591762347366502,
"grad_norm": 3.275792360305786,
"learning_rate": 4.9952660936917953e-05,
"loss": 3.2876,
"step": 215
},
{
"epoch": 0.019682886823400764,
"grad_norm": 3.7224249839782715,
"learning_rate": 4.995221969009063e-05,
"loss": 2.4127,
"step": 216
},
{
"epoch": 0.01977401129943503,
"grad_norm": 2.694688081741333,
"learning_rate": 4.995177639833062e-05,
"loss": 2.685,
"step": 217
},
{
"epoch": 0.019865135775469292,
"grad_norm": 2.4117233753204346,
"learning_rate": 4.995133106167425e-05,
"loss": 2.8893,
"step": 218
},
{
"epoch": 0.019956260251503554,
"grad_norm": 2.3335092067718506,
"learning_rate": 4.995088368015804e-05,
"loss": 2.9949,
"step": 219
},
{
"epoch": 0.020047384727537816,
"grad_norm": 2.5341763496398926,
"learning_rate": 4.995043425381862e-05,
"loss": 2.7349,
"step": 220
},
{
"epoch": 0.020138509203572078,
"grad_norm": 2.1053128242492676,
"learning_rate": 4.994998278269286e-05,
"loss": 2.872,
"step": 221
},
{
"epoch": 0.020229633679606344,
"grad_norm": 4.247159957885742,
"learning_rate": 4.9949529266817716e-05,
"loss": 2.8256,
"step": 222
},
{
"epoch": 0.020320758155640606,
"grad_norm": 2.9789533615112305,
"learning_rate": 4.9949073706230395e-05,
"loss": 2.9463,
"step": 223
},
{
"epoch": 0.020411882631674868,
"grad_norm": 4.879419326782227,
"learning_rate": 4.994861610096821e-05,
"loss": 3.0509,
"step": 224
},
{
"epoch": 0.02050300710770913,
"grad_norm": 2.939770221710205,
"learning_rate": 4.994815645106867e-05,
"loss": 2.6338,
"step": 225
},
{
"epoch": 0.020594131583743392,
"grad_norm": 4.3412861824035645,
"learning_rate": 4.994769475656945e-05,
"loss": 3.0108,
"step": 226
},
{
"epoch": 0.020685256059777658,
"grad_norm": 2.51426100730896,
"learning_rate": 4.994723101750838e-05,
"loss": 3.0526,
"step": 227
},
{
"epoch": 0.02077638053581192,
"grad_norm": 3.505676031112671,
"learning_rate": 4.994676523392347e-05,
"loss": 2.7056,
"step": 228
},
{
"epoch": 0.020867505011846182,
"grad_norm": 4.260951519012451,
"learning_rate": 4.994629740585289e-05,
"loss": 3.1183,
"step": 229
},
{
"epoch": 0.020958629487880444,
"grad_norm": 2.5711376667022705,
"learning_rate": 4.994582753333498e-05,
"loss": 3.0857,
"step": 230
},
{
"epoch": 0.021049753963914706,
"grad_norm": 2.166073799133301,
"learning_rate": 4.994535561640824e-05,
"loss": 2.8542,
"step": 231
},
{
"epoch": 0.02114087843994897,
"grad_norm": 1.8539170026779175,
"learning_rate": 4.9944881655111366e-05,
"loss": 3.0293,
"step": 232
},
{
"epoch": 0.021232002915983234,
"grad_norm": 1.9353551864624023,
"learning_rate": 4.994440564948318e-05,
"loss": 2.8262,
"step": 233
},
{
"epoch": 0.021323127392017496,
"grad_norm": 3.0932233333587646,
"learning_rate": 4.994392759956271e-05,
"loss": 2.8178,
"step": 234
},
{
"epoch": 0.021414251868051758,
"grad_norm": 4.661931991577148,
"learning_rate": 4.994344750538913e-05,
"loss": 3.0574,
"step": 235
},
{
"epoch": 0.021505376344086023,
"grad_norm": 3.5828258991241455,
"learning_rate": 4.994296536700177e-05,
"loss": 2.9585,
"step": 236
},
{
"epoch": 0.021596500820120285,
"grad_norm": 3.237452983856201,
"learning_rate": 4.994248118444016e-05,
"loss": 3.2907,
"step": 237
},
{
"epoch": 0.021687625296154547,
"grad_norm": 1.7712563276290894,
"learning_rate": 4.9941994957743976e-05,
"loss": 2.81,
"step": 238
},
{
"epoch": 0.02177874977218881,
"grad_norm": 4.197923183441162,
"learning_rate": 4.9941506686953064e-05,
"loss": 2.8537,
"step": 239
},
{
"epoch": 0.02186987424822307,
"grad_norm": 4.0791239738464355,
"learning_rate": 4.994101637210744e-05,
"loss": 3.1615,
"step": 240
},
{
"epoch": 0.021960998724257337,
"grad_norm": 1.8465322256088257,
"learning_rate": 4.994052401324729e-05,
"loss": 2.8619,
"step": 241
},
{
"epoch": 0.0220521232002916,
"grad_norm": 3.064363956451416,
"learning_rate": 4.9940029610412964e-05,
"loss": 2.9496,
"step": 242
},
{
"epoch": 0.02214324767632586,
"grad_norm": 5.682558536529541,
"learning_rate": 4.993953316364498e-05,
"loss": 2.9704,
"step": 243
},
{
"epoch": 0.022234372152360123,
"grad_norm": 3.240743637084961,
"learning_rate": 4.993903467298402e-05,
"loss": 2.9054,
"step": 244
},
{
"epoch": 0.022325496628394385,
"grad_norm": 3.0782744884490967,
"learning_rate": 4.993853413847094e-05,
"loss": 2.328,
"step": 245
},
{
"epoch": 0.02241662110442865,
"grad_norm": 3.2922260761260986,
"learning_rate": 4.993803156014677e-05,
"loss": 2.9641,
"step": 246
},
{
"epoch": 0.022507745580462913,
"grad_norm": 3.023947238922119,
"learning_rate": 4.993752693805268e-05,
"loss": 2.8576,
"step": 247
},
{
"epoch": 0.022598870056497175,
"grad_norm": 3.0554006099700928,
"learning_rate": 4.993702027223004e-05,
"loss": 2.7858,
"step": 248
},
{
"epoch": 0.022689994532531437,
"grad_norm": 3.693634271621704,
"learning_rate": 4.9936511562720364e-05,
"loss": 2.7147,
"step": 249
},
{
"epoch": 0.0227811190085657,
"grad_norm": 3.201390027999878,
"learning_rate": 4.993600080956535e-05,
"loss": 1.9882,
"step": 250
},
{
"epoch": 0.022872243484599965,
"grad_norm": 1.9936951398849487,
"learning_rate": 4.993548801280686e-05,
"loss": 3.1181,
"step": 251
},
{
"epoch": 0.022963367960634227,
"grad_norm": 3.260298490524292,
"learning_rate": 4.993497317248691e-05,
"loss": 3.1898,
"step": 252
},
{
"epoch": 0.02305449243666849,
"grad_norm": 4.4110517501831055,
"learning_rate": 4.9934456288647694e-05,
"loss": 2.8244,
"step": 253
},
{
"epoch": 0.02314561691270275,
"grad_norm": 2.4315178394317627,
"learning_rate": 4.993393736133157e-05,
"loss": 2.4183,
"step": 254
},
{
"epoch": 0.023236741388737013,
"grad_norm": 3.644021511077881,
"learning_rate": 4.993341639058108e-05,
"loss": 2.654,
"step": 255
},
{
"epoch": 0.02332786586477128,
"grad_norm": 5.052486896514893,
"learning_rate": 4.99328933764389e-05,
"loss": 3.4613,
"step": 256
},
{
"epoch": 0.02341899034080554,
"grad_norm": 2.375401020050049,
"learning_rate": 4.993236831894792e-05,
"loss": 2.9367,
"step": 257
},
{
"epoch": 0.023510114816839803,
"grad_norm": 2.3803887367248535,
"learning_rate": 4.993184121815114e-05,
"loss": 2.9791,
"step": 258
},
{
"epoch": 0.023601239292874065,
"grad_norm": 3.324200391769409,
"learning_rate": 4.993131207409179e-05,
"loss": 2.8913,
"step": 259
},
{
"epoch": 0.02369236376890833,
"grad_norm": 2.2855703830718994,
"learning_rate": 4.99307808868132e-05,
"loss": 3.1401,
"step": 260
},
{
"epoch": 0.023783488244942592,
"grad_norm": 3.8498096466064453,
"learning_rate": 4.9930247656358926e-05,
"loss": 2.7348,
"step": 261
},
{
"epoch": 0.023874612720976855,
"grad_norm": 2.989870071411133,
"learning_rate": 4.992971238277266e-05,
"loss": 2.7794,
"step": 262
},
{
"epoch": 0.023965737197011117,
"grad_norm": 1.9267282485961914,
"learning_rate": 4.9929175066098285e-05,
"loss": 2.7822,
"step": 263
},
{
"epoch": 0.02405686167304538,
"grad_norm": 3.4673049449920654,
"learning_rate": 4.992863570637981e-05,
"loss": 2.7716,
"step": 264
},
{
"epoch": 0.024147986149079644,
"grad_norm": 4.030117034912109,
"learning_rate": 4.9928094303661465e-05,
"loss": 2.3569,
"step": 265
},
{
"epoch": 0.024239110625113906,
"grad_norm": 2.8260750770568848,
"learning_rate": 4.99275508579876e-05,
"loss": 2.9893,
"step": 266
},
{
"epoch": 0.02433023510114817,
"grad_norm": 3.653055429458618,
"learning_rate": 4.9927005369402756e-05,
"loss": 3.1263,
"step": 267
},
{
"epoch": 0.02442135957718243,
"grad_norm": 2.9457454681396484,
"learning_rate": 4.992645783795165e-05,
"loss": 3.0073,
"step": 268
},
{
"epoch": 0.024512484053216693,
"grad_norm": 1.6144747734069824,
"learning_rate": 4.992590826367913e-05,
"loss": 2.7446,
"step": 269
},
{
"epoch": 0.024603608529250958,
"grad_norm": 3.5470383167266846,
"learning_rate": 4.992535664663027e-05,
"loss": 2.7096,
"step": 270
},
{
"epoch": 0.02469473300528522,
"grad_norm": 2.77581787109375,
"learning_rate": 4.992480298685025e-05,
"loss": 2.7322,
"step": 271
},
{
"epoch": 0.024785857481319482,
"grad_norm": 3.7434020042419434,
"learning_rate": 4.992424728438445e-05,
"loss": 2.9467,
"step": 272
},
{
"epoch": 0.024876981957353744,
"grad_norm": 2.4266579151153564,
"learning_rate": 4.992368953927842e-05,
"loss": 2.5198,
"step": 273
},
{
"epoch": 0.024968106433388006,
"grad_norm": 3.2962489128112793,
"learning_rate": 4.9923129751577866e-05,
"loss": 3.1596,
"step": 274
},
{
"epoch": 0.025059230909422272,
"grad_norm": 7.043239116668701,
"learning_rate": 4.9922567921328665e-05,
"loss": 3.488,
"step": 275
},
{
"epoch": 0.025150355385456534,
"grad_norm": 2.4639480113983154,
"learning_rate": 4.992200404857686e-05,
"loss": 2.4812,
"step": 276
},
{
"epoch": 0.025241479861490796,
"grad_norm": 1.857424020767212,
"learning_rate": 4.9921438133368655e-05,
"loss": 2.8176,
"step": 277
},
{
"epoch": 0.025332604337525058,
"grad_norm": 4.494309425354004,
"learning_rate": 4.992087017575044e-05,
"loss": 3.1054,
"step": 278
},
{
"epoch": 0.025423728813559324,
"grad_norm": 4.276947975158691,
"learning_rate": 4.992030017576875e-05,
"loss": 2.1909,
"step": 279
},
{
"epoch": 0.025514853289593586,
"grad_norm": 4.083865642547607,
"learning_rate": 4.991972813347032e-05,
"loss": 2.9479,
"step": 280
},
{
"epoch": 0.025605977765627848,
"grad_norm": 2.375821828842163,
"learning_rate": 4.991915404890201e-05,
"loss": 2.7618,
"step": 281
},
{
"epoch": 0.02569710224166211,
"grad_norm": 3.873976707458496,
"learning_rate": 4.9918577922110875e-05,
"loss": 3.0425,
"step": 282
},
{
"epoch": 0.025788226717696372,
"grad_norm": 2.8449337482452393,
"learning_rate": 4.991799975314414e-05,
"loss": 2.8696,
"step": 283
},
{
"epoch": 0.025879351193730638,
"grad_norm": 4.654402732849121,
"learning_rate": 4.991741954204917e-05,
"loss": 2.7261,
"step": 284
},
{
"epoch": 0.0259704756697649,
"grad_norm": 2.0715343952178955,
"learning_rate": 4.991683728887353e-05,
"loss": 2.9878,
"step": 285
},
{
"epoch": 0.02606160014579916,
"grad_norm": 1.749382734298706,
"learning_rate": 4.991625299366494e-05,
"loss": 2.8315,
"step": 286
},
{
"epoch": 0.026152724621833424,
"grad_norm": 2.439875841140747,
"learning_rate": 4.991566665647127e-05,
"loss": 2.7062,
"step": 287
},
{
"epoch": 0.026243849097867686,
"grad_norm": 3.324129581451416,
"learning_rate": 4.991507827734058e-05,
"loss": 3.0118,
"step": 288
},
{
"epoch": 0.02633497357390195,
"grad_norm": 3.6669745445251465,
"learning_rate": 4.99144878563211e-05,
"loss": 2.8931,
"step": 289
},
{
"epoch": 0.026426098049936213,
"grad_norm": 3.859177350997925,
"learning_rate": 4.99138953934612e-05,
"loss": 1.8732,
"step": 290
},
{
"epoch": 0.026517222525970476,
"grad_norm": 3.1230263710021973,
"learning_rate": 4.991330088880945e-05,
"loss": 3.0979,
"step": 291
},
{
"epoch": 0.026608347002004738,
"grad_norm": 6.740344524383545,
"learning_rate": 4.9912704342414565e-05,
"loss": 3.7346,
"step": 292
},
{
"epoch": 0.026699471478039,
"grad_norm": 2.1976635456085205,
"learning_rate": 4.9912105754325435e-05,
"loss": 2.9069,
"step": 293
},
{
"epoch": 0.026790595954073265,
"grad_norm": 3.5679171085357666,
"learning_rate": 4.991150512459111e-05,
"loss": 3.3382,
"step": 294
},
{
"epoch": 0.026881720430107527,
"grad_norm": 2.9355552196502686,
"learning_rate": 4.9910902453260824e-05,
"loss": 2.7731,
"step": 295
},
{
"epoch": 0.02697284490614179,
"grad_norm": 3.473723888397217,
"learning_rate": 4.991029774038397e-05,
"loss": 2.995,
"step": 296
},
{
"epoch": 0.02706396938217605,
"grad_norm": 3.0785019397735596,
"learning_rate": 4.9909690986010095e-05,
"loss": 2.7569,
"step": 297
},
{
"epoch": 0.027155093858210317,
"grad_norm": 2.493504762649536,
"learning_rate": 4.990908219018894e-05,
"loss": 2.9982,
"step": 298
},
{
"epoch": 0.02724621833424458,
"grad_norm": 2.2580089569091797,
"learning_rate": 4.990847135297038e-05,
"loss": 2.5264,
"step": 299
},
{
"epoch": 0.02733734281027884,
"grad_norm": 2.80513858795166,
"learning_rate": 4.990785847440449e-05,
"loss": 2.6634,
"step": 300
},
{
"epoch": 0.027428467286313103,
"grad_norm": 2.0804553031921387,
"learning_rate": 4.9907243554541486e-05,
"loss": 2.9459,
"step": 301
},
{
"epoch": 0.027519591762347365,
"grad_norm": 3.045847177505493,
"learning_rate": 4.9906626593431776e-05,
"loss": 2.611,
"step": 302
},
{
"epoch": 0.02761071623838163,
"grad_norm": 2.9146780967712402,
"learning_rate": 4.9906007591125915e-05,
"loss": 2.7193,
"step": 303
},
{
"epoch": 0.027701840714415893,
"grad_norm": 2.089606761932373,
"learning_rate": 4.990538654767464e-05,
"loss": 2.9351,
"step": 304
},
{
"epoch": 0.027792965190450155,
"grad_norm": 4.3602471351623535,
"learning_rate": 4.9904763463128826e-05,
"loss": 2.4889,
"step": 305
},
{
"epoch": 0.027884089666484417,
"grad_norm": 2.540752410888672,
"learning_rate": 4.9904138337539566e-05,
"loss": 3.1232,
"step": 306
},
{
"epoch": 0.02797521414251868,
"grad_norm": 3.7896921634674072,
"learning_rate": 4.9903511170958074e-05,
"loss": 3.2954,
"step": 307
},
{
"epoch": 0.028066338618552945,
"grad_norm": 2.222221612930298,
"learning_rate": 4.990288196343575e-05,
"loss": 2.8993,
"step": 308
},
{
"epoch": 0.028157463094587207,
"grad_norm": 3.169107437133789,
"learning_rate": 4.990225071502418e-05,
"loss": 2.6121,
"step": 309
},
{
"epoch": 0.02824858757062147,
"grad_norm": 2.2904651165008545,
"learning_rate": 4.9901617425775067e-05,
"loss": 3.0291,
"step": 310
},
{
"epoch": 0.02833971204665573,
"grad_norm": 2.019195556640625,
"learning_rate": 4.990098209574033e-05,
"loss": 2.8676,
"step": 311
},
{
"epoch": 0.028430836522689993,
"grad_norm": 3.177980661392212,
"learning_rate": 4.9900344724972024e-05,
"loss": 2.7502,
"step": 312
},
{
"epoch": 0.02852196099872426,
"grad_norm": 4.665421485900879,
"learning_rate": 4.989970531352241e-05,
"loss": 2.7847,
"step": 313
},
{
"epoch": 0.02861308547475852,
"grad_norm": 3.270128011703491,
"learning_rate": 4.9899063861443854e-05,
"loss": 2.2844,
"step": 314
},
{
"epoch": 0.028704209950792783,
"grad_norm": 4.186242580413818,
"learning_rate": 4.989842036878895e-05,
"loss": 3.1222,
"step": 315
},
{
"epoch": 0.028795334426827045,
"grad_norm": 3.346442937850952,
"learning_rate": 4.989777483561043e-05,
"loss": 3.1899,
"step": 316
},
{
"epoch": 0.028886458902861307,
"grad_norm": 4.0838541984558105,
"learning_rate": 4.9897127261961196e-05,
"loss": 2.7816,
"step": 317
},
{
"epoch": 0.028977583378895572,
"grad_norm": 2.0978620052337646,
"learning_rate": 4.989647764789432e-05,
"loss": 2.9921,
"step": 318
},
{
"epoch": 0.029068707854929834,
"grad_norm": 2.830665349960327,
"learning_rate": 4.989582599346304e-05,
"loss": 1.9849,
"step": 319
},
{
"epoch": 0.029159832330964097,
"grad_norm": 2.3662290573120117,
"learning_rate": 4.989517229872076e-05,
"loss": 3.0048,
"step": 320
},
{
"epoch": 0.02925095680699836,
"grad_norm": 3.844538927078247,
"learning_rate": 4.989451656372106e-05,
"loss": 2.8494,
"step": 321
},
{
"epoch": 0.029342081283032624,
"grad_norm": 6.312561511993408,
"learning_rate": 4.989385878851767e-05,
"loss": 2.9627,
"step": 322
},
{
"epoch": 0.029433205759066886,
"grad_norm": 3.0908846855163574,
"learning_rate": 4.98931989731645e-05,
"loss": 2.974,
"step": 323
},
{
"epoch": 0.02952433023510115,
"grad_norm": 2.5474183559417725,
"learning_rate": 4.989253711771563e-05,
"loss": 2.8485,
"step": 324
},
{
"epoch": 0.02961545471113541,
"grad_norm": 2.3893277645111084,
"learning_rate": 4.98918732222253e-05,
"loss": 2.8714,
"step": 325
},
{
"epoch": 0.029706579187169672,
"grad_norm": 2.8498189449310303,
"learning_rate": 4.989120728674792e-05,
"loss": 2.567,
"step": 326
},
{
"epoch": 0.029797703663203938,
"grad_norm": 3.217600107192993,
"learning_rate": 4.989053931133806e-05,
"loss": 2.7916,
"step": 327
},
{
"epoch": 0.0298888281392382,
"grad_norm": 3.3806464672088623,
"learning_rate": 4.988986929605047e-05,
"loss": 2.6764,
"step": 328
},
{
"epoch": 0.029979952615272462,
"grad_norm": 3.6862692832946777,
"learning_rate": 4.988919724094005e-05,
"loss": 2.8749,
"step": 329
},
{
"epoch": 0.030071077091306724,
"grad_norm": 2.995811939239502,
"learning_rate": 4.9888523146061885e-05,
"loss": 3.0403,
"step": 330
},
{
"epoch": 0.030162201567340986,
"grad_norm": 2.922734498977661,
"learning_rate": 4.988784701147122e-05,
"loss": 2.792,
"step": 331
},
{
"epoch": 0.030253326043375252,
"grad_norm": 2.79219651222229,
"learning_rate": 4.988716883722348e-05,
"loss": 2.9411,
"step": 332
},
{
"epoch": 0.030344450519409514,
"grad_norm": 4.6752495765686035,
"learning_rate": 4.9886488623374214e-05,
"loss": 3.1196,
"step": 333
},
{
"epoch": 0.030435574995443776,
"grad_norm": 2.8725225925445557,
"learning_rate": 4.988580636997918e-05,
"loss": 2.7962,
"step": 334
},
{
"epoch": 0.030526699471478038,
"grad_norm": 3.8501832485198975,
"learning_rate": 4.9885122077094304e-05,
"loss": 2.6243,
"step": 335
},
{
"epoch": 0.0306178239475123,
"grad_norm": 2.141561985015869,
"learning_rate": 4.988443574477566e-05,
"loss": 3.0925,
"step": 336
},
{
"epoch": 0.030708948423546566,
"grad_norm": 4.128694534301758,
"learning_rate": 4.988374737307949e-05,
"loss": 2.985,
"step": 337
},
{
"epoch": 0.030800072899580828,
"grad_norm": 2.3043148517608643,
"learning_rate": 4.9883056962062213e-05,
"loss": 2.9108,
"step": 338
},
{
"epoch": 0.03089119737561509,
"grad_norm": 2.1815946102142334,
"learning_rate": 4.9882364511780414e-05,
"loss": 2.9105,
"step": 339
},
{
"epoch": 0.030982321851649352,
"grad_norm": 3.693887710571289,
"learning_rate": 4.9881670022290836e-05,
"loss": 2.8508,
"step": 340
},
{
"epoch": 0.031073446327683617,
"grad_norm": 3.411998748779297,
"learning_rate": 4.9880973493650394e-05,
"loss": 3.1107,
"step": 341
},
{
"epoch": 0.03116457080371788,
"grad_norm": 3.362623453140259,
"learning_rate": 4.9880274925916183e-05,
"loss": 2.5887,
"step": 342
},
{
"epoch": 0.03125569527975214,
"grad_norm": 2.2149298191070557,
"learning_rate": 4.987957431914544e-05,
"loss": 2.9536,
"step": 343
},
{
"epoch": 0.031346819755786404,
"grad_norm": 4.359997749328613,
"learning_rate": 4.9878871673395586e-05,
"loss": 3.2973,
"step": 344
},
{
"epoch": 0.03143794423182067,
"grad_norm": 4.717598915100098,
"learning_rate": 4.987816698872421e-05,
"loss": 1.9499,
"step": 345
},
{
"epoch": 0.03152906870785493,
"grad_norm": 3.7561376094818115,
"learning_rate": 4.9877460265189064e-05,
"loss": 2.6827,
"step": 346
},
{
"epoch": 0.03162019318388919,
"grad_norm": 3.771298408508301,
"learning_rate": 4.9876751502848064e-05,
"loss": 2.5356,
"step": 347
},
{
"epoch": 0.03171131765992345,
"grad_norm": 3.1962294578552246,
"learning_rate": 4.9876040701759286e-05,
"loss": 2.7013,
"step": 348
},
{
"epoch": 0.03180244213595772,
"grad_norm": 4.098245143890381,
"learning_rate": 4.9875327861981006e-05,
"loss": 2.8326,
"step": 349
},
{
"epoch": 0.03189356661199198,
"grad_norm": 2.9347190856933594,
"learning_rate": 4.9874612983571614e-05,
"loss": 2.6662,
"step": 350
},
{
"epoch": 0.03198469108802624,
"grad_norm": 3.6982884407043457,
"learning_rate": 4.9873896066589725e-05,
"loss": 3.0266,
"step": 351
},
{
"epoch": 0.03207581556406051,
"grad_norm": 3.387371778488159,
"learning_rate": 4.987317711109408e-05,
"loss": 2.7588,
"step": 352
},
{
"epoch": 0.03216694004009477,
"grad_norm": 4.407079219818115,
"learning_rate": 4.9872456117143607e-05,
"loss": 3.1829,
"step": 353
},
{
"epoch": 0.03225806451612903,
"grad_norm": 3.283046007156372,
"learning_rate": 4.987173308479738e-05,
"loss": 1.7902,
"step": 354
},
{
"epoch": 0.0323491889921633,
"grad_norm": 3.132047653198242,
"learning_rate": 4.987100801411467e-05,
"loss": 2.5832,
"step": 355
},
{
"epoch": 0.032440313468197556,
"grad_norm": 3.831251859664917,
"learning_rate": 4.9870280905154886e-05,
"loss": 2.804,
"step": 356
},
{
"epoch": 0.03253143794423182,
"grad_norm": 4.598052024841309,
"learning_rate": 4.986955175797763e-05,
"loss": 2.3582,
"step": 357
},
{
"epoch": 0.03262256242026609,
"grad_norm": 3.8844218254089355,
"learning_rate": 4.986882057264266e-05,
"loss": 2.5354,
"step": 358
},
{
"epoch": 0.032713686896300345,
"grad_norm": 3.4492027759552,
"learning_rate": 4.986808734920988e-05,
"loss": 2.952,
"step": 359
},
{
"epoch": 0.03280481137233461,
"grad_norm": 3.0955710411071777,
"learning_rate": 4.9867352087739405e-05,
"loss": 2.7746,
"step": 360
},
{
"epoch": 0.03289593584836887,
"grad_norm": 2.497973680496216,
"learning_rate": 4.986661478829147e-05,
"loss": 2.9674,
"step": 361
},
{
"epoch": 0.032987060324403135,
"grad_norm": 4.156097412109375,
"learning_rate": 4.986587545092651e-05,
"loss": 3.0256,
"step": 362
},
{
"epoch": 0.0330781848004374,
"grad_norm": 2.2689907550811768,
"learning_rate": 4.986513407570513e-05,
"loss": 2.888,
"step": 363
},
{
"epoch": 0.03316930927647166,
"grad_norm": 5.262107849121094,
"learning_rate": 4.986439066268807e-05,
"loss": 3.0598,
"step": 364
},
{
"epoch": 0.033260433752505925,
"grad_norm": 2.9902825355529785,
"learning_rate": 4.9863645211936254e-05,
"loss": 3.0898,
"step": 365
},
{
"epoch": 0.03335155822854018,
"grad_norm": 1.6874420642852783,
"learning_rate": 4.986289772351079e-05,
"loss": 2.7454,
"step": 366
},
{
"epoch": 0.03344268270457445,
"grad_norm": 3.7136764526367188,
"learning_rate": 4.986214819747293e-05,
"loss": 2.9232,
"step": 367
},
{
"epoch": 0.033533807180608714,
"grad_norm": 2.174556255340576,
"learning_rate": 4.986139663388409e-05,
"loss": 2.9913,
"step": 368
},
{
"epoch": 0.03362493165664297,
"grad_norm": 3.348562240600586,
"learning_rate": 4.986064303280588e-05,
"loss": 3.0657,
"step": 369
},
{
"epoch": 0.03371605613267724,
"grad_norm": 2.8100669384002686,
"learning_rate": 4.9859887394300055e-05,
"loss": 2.8318,
"step": 370
},
{
"epoch": 0.0338071806087115,
"grad_norm": 3.4799964427948,
"learning_rate": 4.9859129718428536e-05,
"loss": 2.8546,
"step": 371
},
{
"epoch": 0.03389830508474576,
"grad_norm": 3.3681960105895996,
"learning_rate": 4.985837000525343e-05,
"loss": 2.4564,
"step": 372
},
{
"epoch": 0.03398942956078003,
"grad_norm": 3.261798620223999,
"learning_rate": 4.985760825483699e-05,
"loss": 2.6388,
"step": 373
},
{
"epoch": 0.03408055403681429,
"grad_norm": 2.5790557861328125,
"learning_rate": 4.985684446724165e-05,
"loss": 2.9411,
"step": 374
},
{
"epoch": 0.03417167851284855,
"grad_norm": 3.773350715637207,
"learning_rate": 4.985607864252999e-05,
"loss": 2.8002,
"step": 375
},
{
"epoch": 0.03426280298888281,
"grad_norm": 3.27166485786438,
"learning_rate": 4.9855310780764794e-05,
"loss": 2.7819,
"step": 376
},
{
"epoch": 0.034353927464917076,
"grad_norm": 3.0358798503875732,
"learning_rate": 4.985454088200898e-05,
"loss": 2.918,
"step": 377
},
{
"epoch": 0.03444505194095134,
"grad_norm": 2.6746315956115723,
"learning_rate": 4.985376894632564e-05,
"loss": 2.6342,
"step": 378
},
{
"epoch": 0.0345361764169856,
"grad_norm": 3.8095743656158447,
"learning_rate": 4.985299497377805e-05,
"loss": 2.7959,
"step": 379
},
{
"epoch": 0.034627300893019866,
"grad_norm": 4.985168933868408,
"learning_rate": 4.985221896442963e-05,
"loss": 2.5482,
"step": 380
},
{
"epoch": 0.034718425369054125,
"grad_norm": 2.8703434467315674,
"learning_rate": 4.9851440918343985e-05,
"loss": 3.3517,
"step": 381
},
{
"epoch": 0.03480954984508839,
"grad_norm": 3.232175827026367,
"learning_rate": 4.985066083558486e-05,
"loss": 2.5102,
"step": 382
},
{
"epoch": 0.034900674321122656,
"grad_norm": 2.980302095413208,
"learning_rate": 4.9849878716216215e-05,
"loss": 2.7518,
"step": 383
},
{
"epoch": 0.034991798797156914,
"grad_norm": 2.309779167175293,
"learning_rate": 4.9849094560302124e-05,
"loss": 2.9199,
"step": 384
},
{
"epoch": 0.03508292327319118,
"grad_norm": 3.3376975059509277,
"learning_rate": 4.984830836790686e-05,
"loss": 2.9014,
"step": 385
},
{
"epoch": 0.03517404774922544,
"grad_norm": 3.1377711296081543,
"learning_rate": 4.984752013909485e-05,
"loss": 3.1008,
"step": 386
},
{
"epoch": 0.035265172225259704,
"grad_norm": 3.804525136947632,
"learning_rate": 4.9846729873930706e-05,
"loss": 2.6573,
"step": 387
},
{
"epoch": 0.03535629670129397,
"grad_norm": 2.8694443702697754,
"learning_rate": 4.984593757247918e-05,
"loss": 2.8805,
"step": 388
},
{
"epoch": 0.03544742117732823,
"grad_norm": 2.8578102588653564,
"learning_rate": 4.9845143234805216e-05,
"loss": 2.6417,
"step": 389
},
{
"epoch": 0.035538545653362494,
"grad_norm": 1.6129778623580933,
"learning_rate": 4.9844346860973896e-05,
"loss": 2.5198,
"step": 390
},
{
"epoch": 0.03562967012939676,
"grad_norm": 3.4736506938934326,
"learning_rate": 4.98435484510505e-05,
"loss": 2.0989,
"step": 391
},
{
"epoch": 0.03572079460543102,
"grad_norm": 2.6138317584991455,
"learning_rate": 4.984274800510046e-05,
"loss": 2.9925,
"step": 392
},
{
"epoch": 0.035811919081465284,
"grad_norm": 4.978623867034912,
"learning_rate": 4.984194552318936e-05,
"loss": 2.3243,
"step": 393
},
{
"epoch": 0.03590304355749954,
"grad_norm": 1.921769142150879,
"learning_rate": 4.984114100538299e-05,
"loss": 2.95,
"step": 394
},
{
"epoch": 0.03599416803353381,
"grad_norm": 3.2645325660705566,
"learning_rate": 4.984033445174727e-05,
"loss": 2.4694,
"step": 395
},
{
"epoch": 0.03608529250956807,
"grad_norm": 2.63525390625,
"learning_rate": 4.9839525862348304e-05,
"loss": 2.4872,
"step": 396
},
{
"epoch": 0.03617641698560233,
"grad_norm": 3.7979824542999268,
"learning_rate": 4.983871523725235e-05,
"loss": 2.399,
"step": 397
},
{
"epoch": 0.0362675414616366,
"grad_norm": 3.695284128189087,
"learning_rate": 4.983790257652585e-05,
"loss": 3.2067,
"step": 398
},
{
"epoch": 0.036358665937670856,
"grad_norm": 4.359871864318848,
"learning_rate": 4.9837087880235414e-05,
"loss": 2.5611,
"step": 399
},
{
"epoch": 0.03644979041370512,
"grad_norm": 3.501504898071289,
"learning_rate": 4.983627114844779e-05,
"loss": 2.2389,
"step": 400
},
{
"epoch": 0.03654091488973939,
"grad_norm": 3.637976884841919,
"learning_rate": 4.983545238122993e-05,
"loss": 2.4473,
"step": 401
},
{
"epoch": 0.036632039365773646,
"grad_norm": 4.534337043762207,
"learning_rate": 4.983463157864893e-05,
"loss": 3.0207,
"step": 402
},
{
"epoch": 0.03672316384180791,
"grad_norm": 4.795605659484863,
"learning_rate": 4.983380874077204e-05,
"loss": 2.7606,
"step": 403
},
{
"epoch": 0.03681428831784217,
"grad_norm": 2.7595343589782715,
"learning_rate": 4.983298386766672e-05,
"loss": 3.0888,
"step": 404
},
{
"epoch": 0.036905412793876435,
"grad_norm": 4.718511581420898,
"learning_rate": 4.983215695940057e-05,
"loss": 2.9122,
"step": 405
},
{
"epoch": 0.0369965372699107,
"grad_norm": 3.5803143978118896,
"learning_rate": 4.9831328016041335e-05,
"loss": 2.9831,
"step": 406
},
{
"epoch": 0.03708766174594496,
"grad_norm": 3.076673984527588,
"learning_rate": 4.983049703765697e-05,
"loss": 2.1278,
"step": 407
},
{
"epoch": 0.037178786221979225,
"grad_norm": 2.236630439758301,
"learning_rate": 4.9829664024315575e-05,
"loss": 3.0333,
"step": 408
},
{
"epoch": 0.037269910698013484,
"grad_norm": 3.2108254432678223,
"learning_rate": 4.982882897608542e-05,
"loss": 2.612,
"step": 409
},
{
"epoch": 0.03736103517404775,
"grad_norm": 4.408178806304932,
"learning_rate": 4.982799189303493e-05,
"loss": 2.7848,
"step": 410
},
{
"epoch": 0.037452159650082015,
"grad_norm": 3.5266246795654297,
"learning_rate": 4.9827152775232714e-05,
"loss": 2.8434,
"step": 411
},
{
"epoch": 0.03754328412611627,
"grad_norm": 5.857843399047852,
"learning_rate": 4.982631162274753e-05,
"loss": 2.9037,
"step": 412
},
{
"epoch": 0.03763440860215054,
"grad_norm": 2.262103796005249,
"learning_rate": 4.982546843564834e-05,
"loss": 2.9968,
"step": 413
},
{
"epoch": 0.0377255330781848,
"grad_norm": 4.400634288787842,
"learning_rate": 4.982462321400423e-05,
"loss": 2.9611,
"step": 414
},
{
"epoch": 0.03781665755421906,
"grad_norm": 1.9334667921066284,
"learning_rate": 4.982377595788447e-05,
"loss": 2.8443,
"step": 415
},
{
"epoch": 0.03790778203025333,
"grad_norm": 3.3641562461853027,
"learning_rate": 4.9822926667358494e-05,
"loss": 2.8691,
"step": 416
},
{
"epoch": 0.03799890650628759,
"grad_norm": 3.93518328666687,
"learning_rate": 4.98220753424959e-05,
"loss": 2.7432,
"step": 417
},
{
"epoch": 0.03809003098232185,
"grad_norm": 3.3891704082489014,
"learning_rate": 4.982122198336647e-05,
"loss": 3.1939,
"step": 418
},
{
"epoch": 0.03818115545835611,
"grad_norm": 2.4883248805999756,
"learning_rate": 4.982036659004014e-05,
"loss": 2.8615,
"step": 419
},
{
"epoch": 0.03827227993439038,
"grad_norm": 2.802154064178467,
"learning_rate": 4.9819509162587e-05,
"loss": 3.0744,
"step": 420
},
{
"epoch": 0.03836340441042464,
"grad_norm": 4.077276706695557,
"learning_rate": 4.981864970107733e-05,
"loss": 2.6231,
"step": 421
},
{
"epoch": 0.0384545288864589,
"grad_norm": 3.337916135787964,
"learning_rate": 4.981778820558156e-05,
"loss": 2.8649,
"step": 422
},
{
"epoch": 0.03854565336249317,
"grad_norm": 3.7240352630615234,
"learning_rate": 4.98169246761703e-05,
"loss": 3.0618,
"step": 423
},
{
"epoch": 0.038636777838527425,
"grad_norm": 1.917153239250183,
"learning_rate": 4.981605911291432e-05,
"loss": 3.0332,
"step": 424
},
{
"epoch": 0.03872790231456169,
"grad_norm": 3.458873987197876,
"learning_rate": 4.981519151588454e-05,
"loss": 2.77,
"step": 425
},
{
"epoch": 0.038819026790595956,
"grad_norm": 1.8529995679855347,
"learning_rate": 4.9814321885152085e-05,
"loss": 2.8618,
"step": 426
},
{
"epoch": 0.038910151266630215,
"grad_norm": 2.460031509399414,
"learning_rate": 4.981345022078821e-05,
"loss": 2.8602,
"step": 427
},
{
"epoch": 0.03900127574266448,
"grad_norm": 3.4180796146392822,
"learning_rate": 4.981257652286436e-05,
"loss": 2.6805,
"step": 428
},
{
"epoch": 0.03909240021869874,
"grad_norm": 3.609208345413208,
"learning_rate": 4.981170079145213e-05,
"loss": 2.8997,
"step": 429
},
{
"epoch": 0.039183524694733005,
"grad_norm": 4.643214225769043,
"learning_rate": 4.981082302662329e-05,
"loss": 3.1809,
"step": 430
},
{
"epoch": 0.03927464917076727,
"grad_norm": 3.2911436557769775,
"learning_rate": 4.980994322844979e-05,
"loss": 2.982,
"step": 431
},
{
"epoch": 0.03936577364680153,
"grad_norm": 3.3252878189086914,
"learning_rate": 4.980906139700372e-05,
"loss": 3.1547,
"step": 432
},
{
"epoch": 0.039456898122835794,
"grad_norm": 2.907970666885376,
"learning_rate": 4.980817753235735e-05,
"loss": 3.1306,
"step": 433
},
{
"epoch": 0.03954802259887006,
"grad_norm": 2.1587812900543213,
"learning_rate": 4.980729163458312e-05,
"loss": 3.1019,
"step": 434
},
{
"epoch": 0.03963914707490432,
"grad_norm": 2.8087103366851807,
"learning_rate": 4.9806403703753624e-05,
"loss": 3.092,
"step": 435
},
{
"epoch": 0.039730271550938584,
"grad_norm": 3.1665287017822266,
"learning_rate": 4.980551373994164e-05,
"loss": 2.8803,
"step": 436
},
{
"epoch": 0.03982139602697284,
"grad_norm": 3.015409231185913,
"learning_rate": 4.980462174322011e-05,
"loss": 3.0123,
"step": 437
},
{
"epoch": 0.03991252050300711,
"grad_norm": 6.592101573944092,
"learning_rate": 4.980372771366213e-05,
"loss": 3.0624,
"step": 438
},
{
"epoch": 0.040003644979041374,
"grad_norm": 2.965012788772583,
"learning_rate": 4.980283165134097e-05,
"loss": 3.0143,
"step": 439
},
{
"epoch": 0.04009476945507563,
"grad_norm": 1.9055732488632202,
"learning_rate": 4.980193355633006e-05,
"loss": 2.9194,
"step": 440
},
{
"epoch": 0.0401858939311099,
"grad_norm": 2.0998048782348633,
"learning_rate": 4.980103342870301e-05,
"loss": 2.6373,
"step": 441
},
{
"epoch": 0.040277018407144156,
"grad_norm": 1.6069968938827515,
"learning_rate": 4.980013126853358e-05,
"loss": 2.4255,
"step": 442
},
{
"epoch": 0.04036814288317842,
"grad_norm": 2.8056509494781494,
"learning_rate": 4.9799227075895714e-05,
"loss": 2.4602,
"step": 443
},
{
"epoch": 0.04045926735921269,
"grad_norm": 2.874981641769409,
"learning_rate": 4.979832085086352e-05,
"loss": 2.6648,
"step": 444
},
{
"epoch": 0.040550391835246946,
"grad_norm": 4.484694004058838,
"learning_rate": 4.979741259351125e-05,
"loss": 3.4229,
"step": 445
},
{
"epoch": 0.04064151631128121,
"grad_norm": 3.391519546508789,
"learning_rate": 4.979650230391335e-05,
"loss": 2.7623,
"step": 446
},
{
"epoch": 0.04073264078731547,
"grad_norm": 3.0873026847839355,
"learning_rate": 4.979558998214442e-05,
"loss": 2.5945,
"step": 447
},
{
"epoch": 0.040823765263349736,
"grad_norm": 2.205383777618408,
"learning_rate": 4.979467562827923e-05,
"loss": 2.9883,
"step": 448
},
{
"epoch": 0.040914889739384,
"grad_norm": 3.873610019683838,
"learning_rate": 4.979375924239271e-05,
"loss": 3.1105,
"step": 449
},
{
"epoch": 0.04100601421541826,
"grad_norm": 2.3900392055511475,
"learning_rate": 4.979284082455996e-05,
"loss": 3.0243,
"step": 450
},
{
"epoch": 0.041097138691452526,
"grad_norm": 3.442873477935791,
"learning_rate": 4.979192037485626e-05,
"loss": 2.9851,
"step": 451
},
{
"epoch": 0.041188263167486784,
"grad_norm": 2.941448450088501,
"learning_rate": 4.979099789335703e-05,
"loss": 2.648,
"step": 452
},
{
"epoch": 0.04127938764352105,
"grad_norm": 2.6990129947662354,
"learning_rate": 4.979007338013788e-05,
"loss": 2.765,
"step": 453
},
{
"epoch": 0.041370512119555315,
"grad_norm": 2.8547112941741943,
"learning_rate": 4.978914683527458e-05,
"loss": 2.6851,
"step": 454
},
{
"epoch": 0.041461636595589574,
"grad_norm": 1.7790765762329102,
"learning_rate": 4.978821825884306e-05,
"loss": 2.897,
"step": 455
},
{
"epoch": 0.04155276107162384,
"grad_norm": 3.662311553955078,
"learning_rate": 4.978728765091941e-05,
"loss": 2.3728,
"step": 456
},
{
"epoch": 0.0416438855476581,
"grad_norm": 3.8467187881469727,
"learning_rate": 4.978635501157991e-05,
"loss": 2.6218,
"step": 457
},
{
"epoch": 0.041735010023692364,
"grad_norm": 3.8221049308776855,
"learning_rate": 4.978542034090099e-05,
"loss": 2.816,
"step": 458
},
{
"epoch": 0.04182613449972663,
"grad_norm": 4.882442951202393,
"learning_rate": 4.9784483638959254e-05,
"loss": 2.9258,
"step": 459
},
{
"epoch": 0.04191725897576089,
"grad_norm": 2.9506404399871826,
"learning_rate": 4.978354490583146e-05,
"loss": 2.9122,
"step": 460
},
{
"epoch": 0.04200838345179515,
"grad_norm": 3.596014976501465,
"learning_rate": 4.978260414159455e-05,
"loss": 2.9199,
"step": 461
},
{
"epoch": 0.04209950792782941,
"grad_norm": 2.6444709300994873,
"learning_rate": 4.978166134632562e-05,
"loss": 2.9953,
"step": 462
},
{
"epoch": 0.04219063240386368,
"grad_norm": 3.1696035861968994,
"learning_rate": 4.978071652010193e-05,
"loss": 3.0076,
"step": 463
},
{
"epoch": 0.04228175687989794,
"grad_norm": 1.9968247413635254,
"learning_rate": 4.977976966300092e-05,
"loss": 2.8129,
"step": 464
},
{
"epoch": 0.0423728813559322,
"grad_norm": 3.2720398902893066,
"learning_rate": 4.9778820775100185e-05,
"loss": 3.01,
"step": 465
},
{
"epoch": 0.04246400583196647,
"grad_norm": 3.994105577468872,
"learning_rate": 4.9777869856477485e-05,
"loss": 2.5902,
"step": 466
},
{
"epoch": 0.042555130308000726,
"grad_norm": 3.4857635498046875,
"learning_rate": 4.977691690721076e-05,
"loss": 2.8994,
"step": 467
},
{
"epoch": 0.04264625478403499,
"grad_norm": 3.149409294128418,
"learning_rate": 4.977596192737811e-05,
"loss": 2.7171,
"step": 468
},
{
"epoch": 0.04273737926006926,
"grad_norm": 3.7346558570861816,
"learning_rate": 4.9775004917057786e-05,
"loss": 3.0065,
"step": 469
},
{
"epoch": 0.042828503736103515,
"grad_norm": 4.905463695526123,
"learning_rate": 4.977404587632824e-05,
"loss": 2.5023,
"step": 470
},
{
"epoch": 0.04291962821213778,
"grad_norm": 3.6714894771575928,
"learning_rate": 4.9773084805268045e-05,
"loss": 2.1294,
"step": 471
},
{
"epoch": 0.043010752688172046,
"grad_norm": 2.8621413707733154,
"learning_rate": 4.977212170395598e-05,
"loss": 2.8082,
"step": 472
},
{
"epoch": 0.043101877164206305,
"grad_norm": 2.972755193710327,
"learning_rate": 4.9771156572470966e-05,
"loss": 2.8794,
"step": 473
},
{
"epoch": 0.04319300164024057,
"grad_norm": 1.98660409450531,
"learning_rate": 4.97701894108921e-05,
"loss": 2.9151,
"step": 474
},
{
"epoch": 0.04328412611627483,
"grad_norm": 3.258434534072876,
"learning_rate": 4.9769220219298666e-05,
"loss": 2.8181,
"step": 475
},
{
"epoch": 0.043375250592309095,
"grad_norm": 2.2653932571411133,
"learning_rate": 4.9768248997770063e-05,
"loss": 3.051,
"step": 476
},
{
"epoch": 0.04346637506834336,
"grad_norm": 4.378432273864746,
"learning_rate": 4.97672757463859e-05,
"loss": 2.7252,
"step": 477
},
{
"epoch": 0.04355749954437762,
"grad_norm": 5.332482814788818,
"learning_rate": 4.976630046522594e-05,
"loss": 2.9468,
"step": 478
},
{
"epoch": 0.043648624020411884,
"grad_norm": 2.8985297679901123,
"learning_rate": 4.9765323154370114e-05,
"loss": 2.782,
"step": 479
},
{
"epoch": 0.04373974849644614,
"grad_norm": 4.3086137771606445,
"learning_rate": 4.976434381389851e-05,
"loss": 3.0355,
"step": 480
},
{
"epoch": 0.04383087297248041,
"grad_norm": 4.562249660491943,
"learning_rate": 4.976336244389138e-05,
"loss": 2.6953,
"step": 481
},
{
"epoch": 0.043921997448514674,
"grad_norm": 3.0404744148254395,
"learning_rate": 4.9762379044429174e-05,
"loss": 2.9207,
"step": 482
},
{
"epoch": 0.04401312192454893,
"grad_norm": 2.2012205123901367,
"learning_rate": 4.9761393615592465e-05,
"loss": 2.8434,
"step": 483
},
{
"epoch": 0.0441042464005832,
"grad_norm": 2.0561892986297607,
"learning_rate": 4.9760406157462024e-05,
"loss": 2.7924,
"step": 484
},
{
"epoch": 0.04419537087661746,
"grad_norm": 7.559401988983154,
"learning_rate": 4.975941667011877e-05,
"loss": 2.9499,
"step": 485
},
{
"epoch": 0.04428649535265172,
"grad_norm": 3.387563943862915,
"learning_rate": 4.9758425153643804e-05,
"loss": 2.935,
"step": 486
},
{
"epoch": 0.04437761982868599,
"grad_norm": 3.8939521312713623,
"learning_rate": 4.975743160811839e-05,
"loss": 3.1682,
"step": 487
},
{
"epoch": 0.04446874430472025,
"grad_norm": 4.640250205993652,
"learning_rate": 4.975643603362393e-05,
"loss": 3.1556,
"step": 488
},
{
"epoch": 0.04455986878075451,
"grad_norm": 2.0823843479156494,
"learning_rate": 4.975543843024203e-05,
"loss": 2.9787,
"step": 489
},
{
"epoch": 0.04465099325678877,
"grad_norm": 3.4816336631774902,
"learning_rate": 4.975443879805445e-05,
"loss": 3.2459,
"step": 490
},
{
"epoch": 0.044742117732823036,
"grad_norm": 4.104045867919922,
"learning_rate": 4.97534371371431e-05,
"loss": 2.8059,
"step": 491
},
{
"epoch": 0.0448332422088573,
"grad_norm": 5.515511989593506,
"learning_rate": 4.9752433447590084e-05,
"loss": 3.98,
"step": 492
},
{
"epoch": 0.04492436668489156,
"grad_norm": 3.279454231262207,
"learning_rate": 4.975142772947766e-05,
"loss": 3.1761,
"step": 493
},
{
"epoch": 0.045015491160925826,
"grad_norm": 4.770516395568848,
"learning_rate": 4.975041998288824e-05,
"loss": 3.169,
"step": 494
},
{
"epoch": 0.045106615636960085,
"grad_norm": 4.414744853973389,
"learning_rate": 4.9749410207904416e-05,
"loss": 3.3102,
"step": 495
},
{
"epoch": 0.04519774011299435,
"grad_norm": 5.29826021194458,
"learning_rate": 4.974839840460895e-05,
"loss": 3.1497,
"step": 496
},
{
"epoch": 0.045288864589028616,
"grad_norm": 3.6789612770080566,
"learning_rate": 4.974738457308475e-05,
"loss": 3.1284,
"step": 497
},
{
"epoch": 0.045379989065062874,
"grad_norm": 2.8582963943481445,
"learning_rate": 4.974636871341492e-05,
"loss": 3.0947,
"step": 498
},
{
"epoch": 0.04547111354109714,
"grad_norm": 3.199500799179077,
"learning_rate": 4.97453508256827e-05,
"loss": 3.2182,
"step": 499
},
{
"epoch": 0.0455622380171314,
"grad_norm": 3.2001290321350098,
"learning_rate": 4.9744330909971506e-05,
"loss": 3.39,
"step": 500
},
{
"epoch": 0.045653362493165664,
"grad_norm": 2.053631544113159,
"learning_rate": 4.9743308966364945e-05,
"loss": 3.2044,
"step": 501
},
{
"epoch": 0.04574448696919993,
"grad_norm": 3.347944498062134,
"learning_rate": 4.9742284994946756e-05,
"loss": 3.3669,
"step": 502
},
{
"epoch": 0.04583561144523419,
"grad_norm": 4.525710105895996,
"learning_rate": 4.974125899580086e-05,
"loss": 3.661,
"step": 503
},
{
"epoch": 0.045926735921268454,
"grad_norm": 3.3910508155822754,
"learning_rate": 4.974023096901133e-05,
"loss": 3.3766,
"step": 504
},
{
"epoch": 0.04601786039730271,
"grad_norm": 4.090876579284668,
"learning_rate": 4.973920091466243e-05,
"loss": 3.4127,
"step": 505
},
{
"epoch": 0.04610898487333698,
"grad_norm": 3.523660898208618,
"learning_rate": 4.973816883283858e-05,
"loss": 3.4372,
"step": 506
},
{
"epoch": 0.04620010934937124,
"grad_norm": 3.0479791164398193,
"learning_rate": 4.9737134723624355e-05,
"loss": 3.7846,
"step": 507
},
{
"epoch": 0.0462912338254055,
"grad_norm": 1.953797459602356,
"learning_rate": 4.973609858710451e-05,
"loss": 3.1759,
"step": 508
},
{
"epoch": 0.04638235830143977,
"grad_norm": 3.4201056957244873,
"learning_rate": 4.973506042336395e-05,
"loss": 3.7478,
"step": 509
},
{
"epoch": 0.046473482777474026,
"grad_norm": 4.29667854309082,
"learning_rate": 4.9734020232487766e-05,
"loss": 2.8707,
"step": 510
},
{
"epoch": 0.04656460725350829,
"grad_norm": 3.7853410243988037,
"learning_rate": 4.973297801456121e-05,
"loss": 3.1466,
"step": 511
},
{
"epoch": 0.04665573172954256,
"grad_norm": 3.550114870071411,
"learning_rate": 4.973193376966968e-05,
"loss": 3.5852,
"step": 512
},
{
"epoch": 0.046746856205576816,
"grad_norm": 2.124420642852783,
"learning_rate": 4.9730887497898766e-05,
"loss": 3.3123,
"step": 513
},
{
"epoch": 0.04683798068161108,
"grad_norm": 5.119099140167236,
"learning_rate": 4.9729839199334215e-05,
"loss": 3.0232,
"step": 514
},
{
"epoch": 0.04692910515764535,
"grad_norm": 1.975109338760376,
"learning_rate": 4.9728788874061936e-05,
"loss": 3.2217,
"step": 515
},
{
"epoch": 0.047020229633679606,
"grad_norm": 2.8429102897644043,
"learning_rate": 4.9727736522168016e-05,
"loss": 3.5701,
"step": 516
},
{
"epoch": 0.04711135410971387,
"grad_norm": 2.3540258407592773,
"learning_rate": 4.972668214373869e-05,
"loss": 3.1378,
"step": 517
},
{
"epoch": 0.04720247858574813,
"grad_norm": 1.4242401123046875,
"learning_rate": 4.972562573886037e-05,
"loss": 3.0776,
"step": 518
},
{
"epoch": 0.047293603061782395,
"grad_norm": 2.1467764377593994,
"learning_rate": 4.972456730761963e-05,
"loss": 3.1369,
"step": 519
},
{
"epoch": 0.04738472753781666,
"grad_norm": 1.5915725231170654,
"learning_rate": 4.972350685010322e-05,
"loss": 3.0371,
"step": 520
},
{
"epoch": 0.04747585201385092,
"grad_norm": 3.358949899673462,
"learning_rate": 4.972244436639804e-05,
"loss": 3.225,
"step": 521
},
{
"epoch": 0.047566976489885185,
"grad_norm": 2.1057491302490234,
"learning_rate": 4.972137985659117e-05,
"loss": 3.2478,
"step": 522
},
{
"epoch": 0.047658100965919444,
"grad_norm": 1.7229844331741333,
"learning_rate": 4.9720313320769854e-05,
"loss": 3.1378,
"step": 523
},
{
"epoch": 0.04774922544195371,
"grad_norm": 4.693446636199951,
"learning_rate": 4.971924475902149e-05,
"loss": 3.6186,
"step": 524
},
{
"epoch": 0.047840349917987975,
"grad_norm": 2.06803822517395,
"learning_rate": 4.971817417143366e-05,
"loss": 3.3783,
"step": 525
},
{
"epoch": 0.04793147439402223,
"grad_norm": 1.9626054763793945,
"learning_rate": 4.971710155809409e-05,
"loss": 3.1188,
"step": 526
},
{
"epoch": 0.0480225988700565,
"grad_norm": 5.128235340118408,
"learning_rate": 4.9716026919090705e-05,
"loss": 2.7121,
"step": 527
},
{
"epoch": 0.04811372334609076,
"grad_norm": 1.7978111505508423,
"learning_rate": 4.971495025451156e-05,
"loss": 3.2038,
"step": 528
},
{
"epoch": 0.04820484782212502,
"grad_norm": 2.185279130935669,
"learning_rate": 4.971387156444489e-05,
"loss": 3.2331,
"step": 529
},
{
"epoch": 0.04829597229815929,
"grad_norm": 2.912400007247925,
"learning_rate": 4.971279084897912e-05,
"loss": 3.3246,
"step": 530
},
{
"epoch": 0.04838709677419355,
"grad_norm": 2.451106548309326,
"learning_rate": 4.971170810820279e-05,
"loss": 3.2453,
"step": 531
},
{
"epoch": 0.04847822125022781,
"grad_norm": 3.062638282775879,
"learning_rate": 4.9710623342204646e-05,
"loss": 3.1878,
"step": 532
},
{
"epoch": 0.04856934572626207,
"grad_norm": 4.197722434997559,
"learning_rate": 4.97095365510736e-05,
"loss": 3.5296,
"step": 533
},
{
"epoch": 0.04866047020229634,
"grad_norm": 5.098058700561523,
"learning_rate": 4.97084477348987e-05,
"loss": 4.8548,
"step": 534
},
{
"epoch": 0.0487515946783306,
"grad_norm": 2.903400182723999,
"learning_rate": 4.9707356893769194e-05,
"loss": 3.2411,
"step": 535
},
{
"epoch": 0.04884271915436486,
"grad_norm": 3.2839369773864746,
"learning_rate": 4.970626402777447e-05,
"loss": 3.5929,
"step": 536
},
{
"epoch": 0.048933843630399126,
"grad_norm": 2.5680384635925293,
"learning_rate": 4.970516913700411e-05,
"loss": 3.4145,
"step": 537
},
{
"epoch": 0.049024968106433385,
"grad_norm": 2.191035509109497,
"learning_rate": 4.9704072221547824e-05,
"loss": 3.3342,
"step": 538
},
{
"epoch": 0.04911609258246765,
"grad_norm": 4.748012065887451,
"learning_rate": 4.970297328149551e-05,
"loss": 3.2415,
"step": 539
},
{
"epoch": 0.049207217058501916,
"grad_norm": 3.8018529415130615,
"learning_rate": 4.970187231693725e-05,
"loss": 3.5361,
"step": 540
},
{
"epoch": 0.049298341534536175,
"grad_norm": 3.128706216812134,
"learning_rate": 4.970076932796326e-05,
"loss": 3.2684,
"step": 541
},
{
"epoch": 0.04938946601057044,
"grad_norm": 2.324488401412964,
"learning_rate": 4.969966431466393e-05,
"loss": 3.143,
"step": 542
},
{
"epoch": 0.0494805904866047,
"grad_norm": 4.210093975067139,
"learning_rate": 4.969855727712982e-05,
"loss": 3.2956,
"step": 543
},
{
"epoch": 0.049571714962638964,
"grad_norm": 2.552192449569702,
"learning_rate": 4.969744821545166e-05,
"loss": 3.52,
"step": 544
},
{
"epoch": 0.04966283943867323,
"grad_norm": 3.2044615745544434,
"learning_rate": 4.9696337129720346e-05,
"loss": 3.5377,
"step": 545
},
{
"epoch": 0.04975396391470749,
"grad_norm": 3.112868547439575,
"learning_rate": 4.969522402002693e-05,
"loss": 3.6119,
"step": 546
},
{
"epoch": 0.049845088390741754,
"grad_norm": 2.6442015171051025,
"learning_rate": 4.969410888646264e-05,
"loss": 3.3822,
"step": 547
},
{
"epoch": 0.04993621286677601,
"grad_norm": 4.498085975646973,
"learning_rate": 4.969299172911887e-05,
"loss": 3.6879,
"step": 548
},
{
"epoch": 0.05002733734281028,
"grad_norm": 4.011466979980469,
"learning_rate": 4.969187254808715e-05,
"loss": 3.55,
"step": 549
},
{
"epoch": 0.050118461818844544,
"grad_norm": 4.503020286560059,
"learning_rate": 4.969075134345924e-05,
"loss": 3.6521,
"step": 550
},
{
"epoch": 0.0502095862948788,
"grad_norm": 2.6148521900177,
"learning_rate": 4.9689628115326994e-05,
"loss": 3.0139,
"step": 551
},
{
"epoch": 0.05030071077091307,
"grad_norm": 2.978384017944336,
"learning_rate": 4.9688502863782484e-05,
"loss": 3.0377,
"step": 552
},
{
"epoch": 0.050391835246947334,
"grad_norm": 2.7750418186187744,
"learning_rate": 4.9687375588917925e-05,
"loss": 3.5164,
"step": 553
},
{
"epoch": 0.05048295972298159,
"grad_norm": 3.843651056289673,
"learning_rate": 4.96862462908257e-05,
"loss": 3.3884,
"step": 554
},
{
"epoch": 0.05057408419901586,
"grad_norm": 2.7308478355407715,
"learning_rate": 4.968511496959835e-05,
"loss": 3.5797,
"step": 555
},
{
"epoch": 0.050665208675050116,
"grad_norm": 3.842102289199829,
"learning_rate": 4.968398162532861e-05,
"loss": 3.537,
"step": 556
},
{
"epoch": 0.05075633315108438,
"grad_norm": 4.337724685668945,
"learning_rate": 4.968284625810935e-05,
"loss": 3.3409,
"step": 557
},
{
"epoch": 0.05084745762711865,
"grad_norm": 4.96396017074585,
"learning_rate": 4.9681708868033616e-05,
"loss": 3.6793,
"step": 558
},
{
"epoch": 0.050938582103152906,
"grad_norm": 2.3788044452667236,
"learning_rate": 4.9680569455194634e-05,
"loss": 3.3635,
"step": 559
},
{
"epoch": 0.05102970657918717,
"grad_norm": 3.7733585834503174,
"learning_rate": 4.967942801968577e-05,
"loss": 3.3418,
"step": 560
},
{
"epoch": 0.05112083105522143,
"grad_norm": 2.370511293411255,
"learning_rate": 4.9678284561600575e-05,
"loss": 3.3194,
"step": 561
},
{
"epoch": 0.051211955531255696,
"grad_norm": 2.5621142387390137,
"learning_rate": 4.9677139081032754e-05,
"loss": 3.2493,
"step": 562
},
{
"epoch": 0.05130308000728996,
"grad_norm": 2.7476348876953125,
"learning_rate": 4.96759915780762e-05,
"loss": 3.5329,
"step": 563
},
{
"epoch": 0.05139420448332422,
"grad_norm": 2.813443422317505,
"learning_rate": 4.9674842052824934e-05,
"loss": 3.4796,
"step": 564
},
{
"epoch": 0.051485328959358485,
"grad_norm": 2.8847901821136475,
"learning_rate": 4.967369050537317e-05,
"loss": 3.1291,
"step": 565
},
{
"epoch": 0.051576453435392744,
"grad_norm": 2.9698753356933594,
"learning_rate": 4.96725369358153e-05,
"loss": 3.4165,
"step": 566
},
{
"epoch": 0.05166757791142701,
"grad_norm": 4.517305374145508,
"learning_rate": 4.9671381344245846e-05,
"loss": 3.1617,
"step": 567
},
{
"epoch": 0.051758702387461275,
"grad_norm": 2.740018129348755,
"learning_rate": 4.9670223730759515e-05,
"loss": 3.2746,
"step": 568
},
{
"epoch": 0.051849826863495534,
"grad_norm": 3.409982681274414,
"learning_rate": 4.966906409545118e-05,
"loss": 3.1811,
"step": 569
},
{
"epoch": 0.0519409513395298,
"grad_norm": 5.607142925262451,
"learning_rate": 4.9667902438415876e-05,
"loss": 3.5238,
"step": 570
},
{
"epoch": 0.05203207581556406,
"grad_norm": 3.8322174549102783,
"learning_rate": 4.966673875974881e-05,
"loss": 3.4367,
"step": 571
},
{
"epoch": 0.05212320029159832,
"grad_norm": 4.471311569213867,
"learning_rate": 4.9665573059545346e-05,
"loss": 3.3449,
"step": 572
},
{
"epoch": 0.05221432476763259,
"grad_norm": 2.2863168716430664,
"learning_rate": 4.966440533790102e-05,
"loss": 3.4894,
"step": 573
},
{
"epoch": 0.05230544924366685,
"grad_norm": 3.153233289718628,
"learning_rate": 4.966323559491153e-05,
"loss": 3.0977,
"step": 574
},
{
"epoch": 0.05239657371970111,
"grad_norm": 3.909994602203369,
"learning_rate": 4.9662063830672735e-05,
"loss": 3.3355,
"step": 575
},
{
"epoch": 0.05248769819573537,
"grad_norm": 3.271372079849243,
"learning_rate": 4.966089004528068e-05,
"loss": 2.9002,
"step": 576
},
{
"epoch": 0.05257882267176964,
"grad_norm": 3.8702852725982666,
"learning_rate": 4.965971423883155e-05,
"loss": 3.2961,
"step": 577
},
{
"epoch": 0.0526699471478039,
"grad_norm": 2.3711459636688232,
"learning_rate": 4.965853641142171e-05,
"loss": 3.3116,
"step": 578
},
{
"epoch": 0.05276107162383816,
"grad_norm": 1.8924840688705444,
"learning_rate": 4.965735656314769e-05,
"loss": 3.2245,
"step": 579
},
{
"epoch": 0.05285219609987243,
"grad_norm": 2.0791070461273193,
"learning_rate": 4.9656174694106186e-05,
"loss": 3.2511,
"step": 580
},
{
"epoch": 0.052943320575906685,
"grad_norm": 3.904510259628296,
"learning_rate": 4.9654990804394045e-05,
"loss": 2.3235,
"step": 581
},
{
"epoch": 0.05303444505194095,
"grad_norm": 2.213655710220337,
"learning_rate": 4.9653804894108294e-05,
"loss": 3.1096,
"step": 582
},
{
"epoch": 0.05312556952797522,
"grad_norm": 2.9452335834503174,
"learning_rate": 4.965261696334613e-05,
"loss": 3.2722,
"step": 583
},
{
"epoch": 0.053216694004009475,
"grad_norm": 4.414241790771484,
"learning_rate": 4.965142701220491e-05,
"loss": 3.0292,
"step": 584
},
{
"epoch": 0.05330781848004374,
"grad_norm": 3.2087106704711914,
"learning_rate": 4.965023504078215e-05,
"loss": 2.8201,
"step": 585
},
{
"epoch": 0.053398942956078,
"grad_norm": 2.4379191398620605,
"learning_rate": 4.964904104917554e-05,
"loss": 2.8888,
"step": 586
},
{
"epoch": 0.053490067432112265,
"grad_norm": 3.739722728729248,
"learning_rate": 4.964784503748293e-05,
"loss": 3.3616,
"step": 587
},
{
"epoch": 0.05358119190814653,
"grad_norm": 3.223637104034424,
"learning_rate": 4.9646647005802333e-05,
"loss": 3.1671,
"step": 588
},
{
"epoch": 0.05367231638418079,
"grad_norm": 2.3534278869628906,
"learning_rate": 4.9645446954231936e-05,
"loss": 3.1879,
"step": 589
},
{
"epoch": 0.053763440860215055,
"grad_norm": 2.515484094619751,
"learning_rate": 4.964424488287009e-05,
"loss": 3.3132,
"step": 590
},
{
"epoch": 0.05385456533624931,
"grad_norm": 3.4469587802886963,
"learning_rate": 4.964304079181532e-05,
"loss": 3.0847,
"step": 591
},
{
"epoch": 0.05394568981228358,
"grad_norm": 3.8803462982177734,
"learning_rate": 4.964183468116629e-05,
"loss": 3.5651,
"step": 592
},
{
"epoch": 0.054036814288317844,
"grad_norm": 3.385795831680298,
"learning_rate": 4.9640626551021846e-05,
"loss": 3.4961,
"step": 593
},
{
"epoch": 0.0541279387643521,
"grad_norm": 2.582401990890503,
"learning_rate": 4.9639416401481e-05,
"loss": 3.5228,
"step": 594
},
{
"epoch": 0.05421906324038637,
"grad_norm": 4.507946014404297,
"learning_rate": 4.9638204232642945e-05,
"loss": 4.4401,
"step": 595
},
{
"epoch": 0.054310187716420634,
"grad_norm": 1.9351048469543457,
"learning_rate": 4.9636990044607e-05,
"loss": 3.2011,
"step": 596
},
{
"epoch": 0.05440131219245489,
"grad_norm": 3.384533166885376,
"learning_rate": 4.9635773837472686e-05,
"loss": 3.3884,
"step": 597
},
{
"epoch": 0.05449243666848916,
"grad_norm": 2.9044880867004395,
"learning_rate": 4.963455561133967e-05,
"loss": 3.0926,
"step": 598
},
{
"epoch": 0.05458356114452342,
"grad_norm": 3.2804243564605713,
"learning_rate": 4.96333353663078e-05,
"loss": 3.2308,
"step": 599
},
{
"epoch": 0.05467468562055768,
"grad_norm": 2.781702756881714,
"learning_rate": 4.9632113102477066e-05,
"loss": 3.2847,
"step": 600
},
{
"epoch": 0.05476581009659195,
"grad_norm": 1.6848156452178955,
"learning_rate": 4.963088881994764e-05,
"loss": 3.0225,
"step": 601
},
{
"epoch": 0.054856934572626206,
"grad_norm": 2.8916399478912354,
"learning_rate": 4.962966251881987e-05,
"loss": 3.2519,
"step": 602
},
{
"epoch": 0.05494805904866047,
"grad_norm": 5.240880489349365,
"learning_rate": 4.962843419919424e-05,
"loss": 4.7183,
"step": 603
},
{
"epoch": 0.05503918352469473,
"grad_norm": 4.498986721038818,
"learning_rate": 4.962720386117143e-05,
"loss": 4.3066,
"step": 604
},
{
"epoch": 0.055130308000728996,
"grad_norm": 3.2168045043945312,
"learning_rate": 4.962597150485226e-05,
"loss": 3.2982,
"step": 605
},
{
"epoch": 0.05522143247676326,
"grad_norm": 4.392197132110596,
"learning_rate": 4.962473713033773e-05,
"loss": 3.4625,
"step": 606
},
{
"epoch": 0.05531255695279752,
"grad_norm": 2.5749943256378174,
"learning_rate": 4.9623500737729e-05,
"loss": 3.2704,
"step": 607
},
{
"epoch": 0.055403681428831786,
"grad_norm": 2.41140079498291,
"learning_rate": 4.96222623271274e-05,
"loss": 2.9841,
"step": 608
},
{
"epoch": 0.055494805904866044,
"grad_norm": 2.583230972290039,
"learning_rate": 4.962102189863442e-05,
"loss": 3.3146,
"step": 609
},
{
"epoch": 0.05558593038090031,
"grad_norm": 2.78155779838562,
"learning_rate": 4.9619779452351736e-05,
"loss": 3.2742,
"step": 610
},
{
"epoch": 0.055677054856934576,
"grad_norm": 2.217174768447876,
"learning_rate": 4.9618534988381136e-05,
"loss": 3.2342,
"step": 611
},
{
"epoch": 0.055768179332968834,
"grad_norm": 3.625903367996216,
"learning_rate": 4.9617288506824635e-05,
"loss": 3.3274,
"step": 612
},
{
"epoch": 0.0558593038090031,
"grad_norm": 3.1227903366088867,
"learning_rate": 4.961604000778438e-05,
"loss": 3.4848,
"step": 613
},
{
"epoch": 0.05595042828503736,
"grad_norm": 3.104952812194824,
"learning_rate": 4.961478949136269e-05,
"loss": 3.0356,
"step": 614
},
{
"epoch": 0.056041552761071624,
"grad_norm": 1.903206467628479,
"learning_rate": 4.961353695766206e-05,
"loss": 3.18,
"step": 615
},
{
"epoch": 0.05613267723710589,
"grad_norm": 3.809479236602783,
"learning_rate": 4.961228240678512e-05,
"loss": 2.2945,
"step": 616
},
{
"epoch": 0.05622380171314015,
"grad_norm": 1.6982084512710571,
"learning_rate": 4.961102583883469e-05,
"loss": 3.097,
"step": 617
},
{
"epoch": 0.056314926189174414,
"grad_norm": 2.1733193397521973,
"learning_rate": 4.960976725391376e-05,
"loss": 3.1999,
"step": 618
},
{
"epoch": 0.05640605066520867,
"grad_norm": 1.8412457704544067,
"learning_rate": 4.960850665212548e-05,
"loss": 3.1114,
"step": 619
},
{
"epoch": 0.05649717514124294,
"grad_norm": 3.182962417602539,
"learning_rate": 4.9607244033573156e-05,
"loss": 3.5988,
"step": 620
},
{
"epoch": 0.0565882996172772,
"grad_norm": 1.3487292528152466,
"learning_rate": 4.960597939836025e-05,
"loss": 2.8788,
"step": 621
},
{
"epoch": 0.05667942409331146,
"grad_norm": 1.620439052581787,
"learning_rate": 4.960471274659042e-05,
"loss": 3.1345,
"step": 622
},
{
"epoch": 0.05677054856934573,
"grad_norm": 3.5589683055877686,
"learning_rate": 4.9603444078367475e-05,
"loss": 4.3533,
"step": 623
},
{
"epoch": 0.056861673045379986,
"grad_norm": 3.436901330947876,
"learning_rate": 4.960217339379537e-05,
"loss": 3.3396,
"step": 624
},
{
"epoch": 0.05695279752141425,
"grad_norm": 2.2627270221710205,
"learning_rate": 4.960090069297827e-05,
"loss": 3.1737,
"step": 625
},
{
"epoch": 0.05704392199744852,
"grad_norm": 1.921242594718933,
"learning_rate": 4.9599625976020446e-05,
"loss": 3.1868,
"step": 626
},
{
"epoch": 0.057135046473482776,
"grad_norm": 4.410575866699219,
"learning_rate": 4.9598349243026394e-05,
"loss": 3.0296,
"step": 627
},
{
"epoch": 0.05722617094951704,
"grad_norm": 1.6845407485961914,
"learning_rate": 4.959707049410073e-05,
"loss": 3.1197,
"step": 628
},
{
"epoch": 0.0573172954255513,
"grad_norm": 2.5050525665283203,
"learning_rate": 4.9595789729348263e-05,
"loss": 3.5628,
"step": 629
},
{
"epoch": 0.057408419901585565,
"grad_norm": 1.3204115629196167,
"learning_rate": 4.9594506948873945e-05,
"loss": 3.0306,
"step": 630
},
{
"epoch": 0.05749954437761983,
"grad_norm": 2.239015579223633,
"learning_rate": 4.9593222152782916e-05,
"loss": 3.204,
"step": 631
},
{
"epoch": 0.05759066885365409,
"grad_norm": 3.869081497192383,
"learning_rate": 4.9591935341180464e-05,
"loss": 3.3636,
"step": 632
},
{
"epoch": 0.057681793329688355,
"grad_norm": 2.7307801246643066,
"learning_rate": 4.959064651417204e-05,
"loss": 3.1173,
"step": 633
},
{
"epoch": 0.057772917805722614,
"grad_norm": 1.6466689109802246,
"learning_rate": 4.9589355671863295e-05,
"loss": 3.0867,
"step": 634
},
{
"epoch": 0.05786404228175688,
"grad_norm": 3.217461109161377,
"learning_rate": 4.9588062814359996e-05,
"loss": 3.4342,
"step": 635
},
{
"epoch": 0.057955166757791145,
"grad_norm": 3.0771045684814453,
"learning_rate": 4.958676794176811e-05,
"loss": 3.5423,
"step": 636
},
{
"epoch": 0.0580462912338254,
"grad_norm": 4.25571346282959,
"learning_rate": 4.958547105419374e-05,
"loss": 3.4624,
"step": 637
},
{
"epoch": 0.05813741570985967,
"grad_norm": 2.3620123863220215,
"learning_rate": 4.958417215174318e-05,
"loss": 3.1722,
"step": 638
},
{
"epoch": 0.058228540185893934,
"grad_norm": 2.8196592330932617,
"learning_rate": 4.958287123452289e-05,
"loss": 3.4062,
"step": 639
},
{
"epoch": 0.05831966466192819,
"grad_norm": 2.7436940670013428,
"learning_rate": 4.958156830263948e-05,
"loss": 3.3989,
"step": 640
},
{
"epoch": 0.05841078913796246,
"grad_norm": 3.0599286556243896,
"learning_rate": 4.958026335619972e-05,
"loss": 2.8931,
"step": 641
},
{
"epoch": 0.05850191361399672,
"grad_norm": 1.745510458946228,
"learning_rate": 4.957895639531056e-05,
"loss": 3.0919,
"step": 642
},
{
"epoch": 0.05859303809003098,
"grad_norm": 3.2269985675811768,
"learning_rate": 4.957764742007912e-05,
"loss": 3.264,
"step": 643
},
{
"epoch": 0.05868416256606525,
"grad_norm": 2.882855176925659,
"learning_rate": 4.957633643061267e-05,
"loss": 3.3968,
"step": 644
},
{
"epoch": 0.05877528704209951,
"grad_norm": 3.923797130584717,
"learning_rate": 4.9575023427018645e-05,
"loss": 3.1769,
"step": 645
},
{
"epoch": 0.05886641151813377,
"grad_norm": 1.8065714836120605,
"learning_rate": 4.9573708409404665e-05,
"loss": 3.1882,
"step": 646
},
{
"epoch": 0.05895753599416803,
"grad_norm": 2.9968409538269043,
"learning_rate": 4.957239137787848e-05,
"loss": 3.5354,
"step": 647
},
{
"epoch": 0.0590486604702023,
"grad_norm": 2.20745587348938,
"learning_rate": 4.957107233254805e-05,
"loss": 3.2484,
"step": 648
},
{
"epoch": 0.05913978494623656,
"grad_norm": 3.963139057159424,
"learning_rate": 4.9569751273521454e-05,
"loss": 4.0875,
"step": 649
},
{
"epoch": 0.05923090942227082,
"grad_norm": 3.877814531326294,
"learning_rate": 4.956842820090697e-05,
"loss": 4.6056,
"step": 650
},
{
"epoch": 0.059322033898305086,
"grad_norm": 3.671600818634033,
"learning_rate": 4.956710311481303e-05,
"loss": 3.2384,
"step": 651
},
{
"epoch": 0.059413158374339345,
"grad_norm": 3.7011606693267822,
"learning_rate": 4.956577601534822e-05,
"loss": 3.5084,
"step": 652
},
{
"epoch": 0.05950428285037361,
"grad_norm": 1.532228708267212,
"learning_rate": 4.956444690262131e-05,
"loss": 3.1647,
"step": 653
},
{
"epoch": 0.059595407326407876,
"grad_norm": 2.7572762966156006,
"learning_rate": 4.956311577674123e-05,
"loss": 3.1905,
"step": 654
},
{
"epoch": 0.059686531802442135,
"grad_norm": 2.9054203033447266,
"learning_rate": 4.956178263781706e-05,
"loss": 3.45,
"step": 655
},
{
"epoch": 0.0597776562784764,
"grad_norm": 2.3471450805664062,
"learning_rate": 4.9560447485958065e-05,
"loss": 3.4931,
"step": 656
},
{
"epoch": 0.05986878075451066,
"grad_norm": 3.769836664199829,
"learning_rate": 4.955911032127365e-05,
"loss": 3.5662,
"step": 657
},
{
"epoch": 0.059959905230544924,
"grad_norm": 2.2852306365966797,
"learning_rate": 4.955777114387342e-05,
"loss": 3.3788,
"step": 658
},
{
"epoch": 0.06005102970657919,
"grad_norm": 2.383812665939331,
"learning_rate": 4.9556429953867124e-05,
"loss": 3.0372,
"step": 659
},
{
"epoch": 0.06014215418261345,
"grad_norm": 4.165440559387207,
"learning_rate": 4.9555086751364666e-05,
"loss": 4.6025,
"step": 660
},
{
"epoch": 0.060233278658647714,
"grad_norm": 2.7133188247680664,
"learning_rate": 4.955374153647613e-05,
"loss": 3.4581,
"step": 661
},
{
"epoch": 0.06032440313468197,
"grad_norm": 2.894537925720215,
"learning_rate": 4.955239430931177e-05,
"loss": 3.2701,
"step": 662
},
{
"epoch": 0.06041552761071624,
"grad_norm": 2.548617362976074,
"learning_rate": 4.955104506998199e-05,
"loss": 3.5049,
"step": 663
},
{
"epoch": 0.060506652086750504,
"grad_norm": 1.5685203075408936,
"learning_rate": 4.9549693818597365e-05,
"loss": 3.0841,
"step": 664
},
{
"epoch": 0.06059777656278476,
"grad_norm": 3.0904016494750977,
"learning_rate": 4.954834055526864e-05,
"loss": 3.1755,
"step": 665
},
{
"epoch": 0.06068890103881903,
"grad_norm": 2.390272855758667,
"learning_rate": 4.954698528010671e-05,
"loss": 3.0737,
"step": 666
},
{
"epoch": 0.060780025514853286,
"grad_norm": 2.9072399139404297,
"learning_rate": 4.954562799322266e-05,
"loss": 3.426,
"step": 667
},
{
"epoch": 0.06087114999088755,
"grad_norm": 4.058100700378418,
"learning_rate": 4.9544268694727714e-05,
"loss": 3.303,
"step": 668
},
{
"epoch": 0.06096227446692182,
"grad_norm": 2.9293103218078613,
"learning_rate": 4.9542907384733277e-05,
"loss": 3.5832,
"step": 669
},
{
"epoch": 0.061053398942956076,
"grad_norm": 3.660994529724121,
"learning_rate": 4.9541544063350916e-05,
"loss": 3.4459,
"step": 670
},
{
"epoch": 0.06114452341899034,
"grad_norm": 1.5523614883422852,
"learning_rate": 4.954017873069235e-05,
"loss": 3.2077,
"step": 671
},
{
"epoch": 0.0612356478950246,
"grad_norm": 3.498552083969116,
"learning_rate": 4.953881138686948e-05,
"loss": 3.3605,
"step": 672
},
{
"epoch": 0.061326772371058866,
"grad_norm": 1.5058245658874512,
"learning_rate": 4.953744203199437e-05,
"loss": 2.9763,
"step": 673
},
{
"epoch": 0.06141789684709313,
"grad_norm": 3.1755263805389404,
"learning_rate": 4.9536070666179236e-05,
"loss": 3.2367,
"step": 674
},
{
"epoch": 0.06150902132312739,
"grad_norm": 4.108938694000244,
"learning_rate": 4.953469728953647e-05,
"loss": 3.3441,
"step": 675
},
{
"epoch": 0.061600145799161656,
"grad_norm": 2.8655855655670166,
"learning_rate": 4.9533321902178634e-05,
"loss": 3.4349,
"step": 676
},
{
"epoch": 0.06169127027519592,
"grad_norm": 1.3662402629852295,
"learning_rate": 4.953194450421843e-05,
"loss": 3.052,
"step": 677
},
{
"epoch": 0.06178239475123018,
"grad_norm": 3.1429646015167236,
"learning_rate": 4.9530565095768744e-05,
"loss": 3.3275,
"step": 678
},
{
"epoch": 0.061873519227264445,
"grad_norm": 2.9324707984924316,
"learning_rate": 4.952918367694264e-05,
"loss": 2.9269,
"step": 679
},
{
"epoch": 0.061964643703298704,
"grad_norm": 1.5480892658233643,
"learning_rate": 4.952780024785331e-05,
"loss": 3.1536,
"step": 680
},
{
"epoch": 0.06205576817933297,
"grad_norm": 3.543039321899414,
"learning_rate": 4.9526414808614154e-05,
"loss": 4.6384,
"step": 681
},
{
"epoch": 0.062146892655367235,
"grad_norm": 3.411710739135742,
"learning_rate": 4.9525027359338696e-05,
"loss": 3.9351,
"step": 682
},
{
"epoch": 0.062238017131401493,
"grad_norm": 4.721251487731934,
"learning_rate": 4.952363790014064e-05,
"loss": 2.8042,
"step": 683
},
{
"epoch": 0.06232914160743576,
"grad_norm": 2.0499167442321777,
"learning_rate": 4.952224643113388e-05,
"loss": 3.0585,
"step": 684
},
{
"epoch": 0.06242026608347002,
"grad_norm": 2.8441872596740723,
"learning_rate": 4.9520852952432426e-05,
"loss": 3.6444,
"step": 685
},
{
"epoch": 0.06251139055950428,
"grad_norm": 3.619750499725342,
"learning_rate": 4.9519457464150496e-05,
"loss": 3.1606,
"step": 686
},
{
"epoch": 0.06260251503553854,
"grad_norm": 3.968768358230591,
"learning_rate": 4.951805996640245e-05,
"loss": 3.4814,
"step": 687
},
{
"epoch": 0.06269363951157281,
"grad_norm": 2.2831945419311523,
"learning_rate": 4.9516660459302827e-05,
"loss": 3.1286,
"step": 688
},
{
"epoch": 0.06278476398760707,
"grad_norm": 4.824334144592285,
"learning_rate": 4.9515258942966315e-05,
"loss": 3.2481,
"step": 689
},
{
"epoch": 0.06287588846364134,
"grad_norm": 3.6594996452331543,
"learning_rate": 4.951385541750777e-05,
"loss": 2.9803,
"step": 690
},
{
"epoch": 0.06296701293967559,
"grad_norm": 4.994578838348389,
"learning_rate": 4.951244988304221e-05,
"loss": 2.7129,
"step": 691
},
{
"epoch": 0.06305813741570986,
"grad_norm": 3.323155403137207,
"learning_rate": 4.9511042339684846e-05,
"loss": 3.1815,
"step": 692
},
{
"epoch": 0.06314926189174412,
"grad_norm": 3.1093785762786865,
"learning_rate": 4.950963278755102e-05,
"loss": 3.32,
"step": 693
},
{
"epoch": 0.06324038636777839,
"grad_norm": 2.944016456604004,
"learning_rate": 4.950822122675625e-05,
"loss": 3.1048,
"step": 694
},
{
"epoch": 0.06333151084381265,
"grad_norm": 1.7950677871704102,
"learning_rate": 4.950680765741622e-05,
"loss": 3.1967,
"step": 695
},
{
"epoch": 0.0634226353198469,
"grad_norm": 2.822021961212158,
"learning_rate": 4.950539207964677e-05,
"loss": 2.8707,
"step": 696
},
{
"epoch": 0.06351375979588117,
"grad_norm": 3.3690731525421143,
"learning_rate": 4.950397449356392e-05,
"loss": 3.277,
"step": 697
},
{
"epoch": 0.06360488427191544,
"grad_norm": 3.850304126739502,
"learning_rate": 4.9502554899283845e-05,
"loss": 3.3475,
"step": 698
},
{
"epoch": 0.0636960087479497,
"grad_norm": 3.161121129989624,
"learning_rate": 4.9501133296922897e-05,
"loss": 2.9737,
"step": 699
},
{
"epoch": 0.06378713322398397,
"grad_norm": 1.786126971244812,
"learning_rate": 4.949970968659757e-05,
"loss": 3.1964,
"step": 700
},
{
"epoch": 0.06387825770001823,
"grad_norm": 2.7346184253692627,
"learning_rate": 4.949828406842453e-05,
"loss": 3.3159,
"step": 701
},
{
"epoch": 0.06396938217605248,
"grad_norm": 3.1189444065093994,
"learning_rate": 4.9496856442520623e-05,
"loss": 3.3436,
"step": 702
},
{
"epoch": 0.06406050665208675,
"grad_norm": 1.5578367710113525,
"learning_rate": 4.949542680900284e-05,
"loss": 3.093,
"step": 703
},
{
"epoch": 0.06415163112812101,
"grad_norm": 2.1540582180023193,
"learning_rate": 4.9493995167988355e-05,
"loss": 3.2466,
"step": 704
},
{
"epoch": 0.06424275560415528,
"grad_norm": 3.0360822677612305,
"learning_rate": 4.949256151959449e-05,
"loss": 2.8197,
"step": 705
},
{
"epoch": 0.06433388008018955,
"grad_norm": 5.320927143096924,
"learning_rate": 4.9491125863938735e-05,
"loss": 3.4521,
"step": 706
},
{
"epoch": 0.0644250045562238,
"grad_norm": 3.2697649002075195,
"learning_rate": 4.948968820113875e-05,
"loss": 3.3638,
"step": 707
},
{
"epoch": 0.06451612903225806,
"grad_norm": 2.503573417663574,
"learning_rate": 4.948824853131236e-05,
"loss": 3.3617,
"step": 708
},
{
"epoch": 0.06460725350829233,
"grad_norm": 1.5332664251327515,
"learning_rate": 4.948680685457756e-05,
"loss": 3.0453,
"step": 709
},
{
"epoch": 0.0646983779843266,
"grad_norm": 2.2039079666137695,
"learning_rate": 4.948536317105248e-05,
"loss": 3.3395,
"step": 710
},
{
"epoch": 0.06478950246036086,
"grad_norm": 2.7374370098114014,
"learning_rate": 4.948391748085545e-05,
"loss": 3.3615,
"step": 711
},
{
"epoch": 0.06488062693639511,
"grad_norm": 2.1900599002838135,
"learning_rate": 4.948246978410495e-05,
"loss": 3.2993,
"step": 712
},
{
"epoch": 0.06497175141242938,
"grad_norm": 1.8759992122650146,
"learning_rate": 4.948102008091962e-05,
"loss": 3.1306,
"step": 713
},
{
"epoch": 0.06506287588846364,
"grad_norm": 2.68538498878479,
"learning_rate": 4.9479568371418274e-05,
"loss": 3.3092,
"step": 714
},
{
"epoch": 0.06515400036449791,
"grad_norm": 2.6900649070739746,
"learning_rate": 4.947811465571988e-05,
"loss": 3.2655,
"step": 715
},
{
"epoch": 0.06524512484053217,
"grad_norm": 2.2141432762145996,
"learning_rate": 4.947665893394357e-05,
"loss": 3.165,
"step": 716
},
{
"epoch": 0.06533624931656642,
"grad_norm": 2.6907012462615967,
"learning_rate": 4.947520120620865e-05,
"loss": 3.1901,
"step": 717
},
{
"epoch": 0.06542737379260069,
"grad_norm": 2.0056562423706055,
"learning_rate": 4.9473741472634606e-05,
"loss": 3.2852,
"step": 718
},
{
"epoch": 0.06551849826863496,
"grad_norm": 1.5069571733474731,
"learning_rate": 4.947227973334104e-05,
"loss": 3.0089,
"step": 719
},
{
"epoch": 0.06560962274466922,
"grad_norm": 3.0702645778656006,
"learning_rate": 4.947081598844777e-05,
"loss": 3.191,
"step": 720
},
{
"epoch": 0.06570074722070349,
"grad_norm": 3.3552052974700928,
"learning_rate": 4.946935023807474e-05,
"loss": 4.4409,
"step": 721
},
{
"epoch": 0.06579187169673774,
"grad_norm": 5.324817180633545,
"learning_rate": 4.946788248234209e-05,
"loss": 3.2887,
"step": 722
},
{
"epoch": 0.065882996172772,
"grad_norm": 1.67562735080719,
"learning_rate": 4.9466412721370084e-05,
"loss": 3.1257,
"step": 723
},
{
"epoch": 0.06597412064880627,
"grad_norm": 3.6829192638397217,
"learning_rate": 4.9464940955279195e-05,
"loss": 3.0727,
"step": 724
},
{
"epoch": 0.06606524512484054,
"grad_norm": 2.184438705444336,
"learning_rate": 4.946346718419004e-05,
"loss": 3.2543,
"step": 725
},
{
"epoch": 0.0661563696008748,
"grad_norm": 1.7749693393707275,
"learning_rate": 4.9461991408223386e-05,
"loss": 3.107,
"step": 726
},
{
"epoch": 0.06624749407690905,
"grad_norm": 3.877955675125122,
"learning_rate": 4.946051362750018e-05,
"loss": 3.0837,
"step": 727
},
{
"epoch": 0.06633861855294332,
"grad_norm": 2.6731202602386475,
"learning_rate": 4.9459033842141554e-05,
"loss": 2.9075,
"step": 728
},
{
"epoch": 0.06642974302897758,
"grad_norm": 2.0825181007385254,
"learning_rate": 4.9457552052268764e-05,
"loss": 3.2235,
"step": 729
},
{
"epoch": 0.06652086750501185,
"grad_norm": 3.4630510807037354,
"learning_rate": 4.945606825800325e-05,
"loss": 4.399,
"step": 730
},
{
"epoch": 0.06661199198104611,
"grad_norm": 1.572504997253418,
"learning_rate": 4.9454582459466615e-05,
"loss": 2.988,
"step": 731
},
{
"epoch": 0.06670311645708037,
"grad_norm": 3.3033382892608643,
"learning_rate": 4.945309465678063e-05,
"loss": 3.272,
"step": 732
},
{
"epoch": 0.06679424093311463,
"grad_norm": 1.9305294752120972,
"learning_rate": 4.945160485006722e-05,
"loss": 3.1268,
"step": 733
},
{
"epoch": 0.0668853654091489,
"grad_norm": 2.0527968406677246,
"learning_rate": 4.9450113039448484e-05,
"loss": 3.1133,
"step": 734
},
{
"epoch": 0.06697648988518316,
"grad_norm": 3.5695364475250244,
"learning_rate": 4.944861922504669e-05,
"loss": 3.2998,
"step": 735
},
{
"epoch": 0.06706761436121743,
"grad_norm": 2.7028732299804688,
"learning_rate": 4.944712340698424e-05,
"loss": 3.1459,
"step": 736
},
{
"epoch": 0.06715873883725168,
"grad_norm": 1.6403956413269043,
"learning_rate": 4.9445625585383746e-05,
"loss": 3.1004,
"step": 737
},
{
"epoch": 0.06724986331328595,
"grad_norm": 2.2989110946655273,
"learning_rate": 4.9444125760367956e-05,
"loss": 3.3623,
"step": 738
},
{
"epoch": 0.06734098778932021,
"grad_norm": 3.925218343734741,
"learning_rate": 4.944262393205977e-05,
"loss": 3.4177,
"step": 739
},
{
"epoch": 0.06743211226535448,
"grad_norm": 2.743499517440796,
"learning_rate": 4.944112010058229e-05,
"loss": 3.4295,
"step": 740
},
{
"epoch": 0.06752323674138874,
"grad_norm": 2.836487293243408,
"learning_rate": 4.943961426605874e-05,
"loss": 3.1732,
"step": 741
},
{
"epoch": 0.067614361217423,
"grad_norm": 3.4316787719726562,
"learning_rate": 4.943810642861255e-05,
"loss": 3.2019,
"step": 742
},
{
"epoch": 0.06770548569345726,
"grad_norm": 1.637211799621582,
"learning_rate": 4.943659658836728e-05,
"loss": 3.0372,
"step": 743
},
{
"epoch": 0.06779661016949153,
"grad_norm": 2.633004665374756,
"learning_rate": 4.9435084745446666e-05,
"loss": 3.3982,
"step": 744
},
{
"epoch": 0.06788773464552579,
"grad_norm": 3.1574134826660156,
"learning_rate": 4.9433570899974626e-05,
"loss": 4.3972,
"step": 745
},
{
"epoch": 0.06797885912156006,
"grad_norm": 3.46399188041687,
"learning_rate": 4.94320550520752e-05,
"loss": 3.5156,
"step": 746
},
{
"epoch": 0.06806998359759431,
"grad_norm": 1.8419183492660522,
"learning_rate": 4.943053720187264e-05,
"loss": 3.1658,
"step": 747
},
{
"epoch": 0.06816110807362857,
"grad_norm": 4.034026622772217,
"learning_rate": 4.942901734949133e-05,
"loss": 3.2022,
"step": 748
},
{
"epoch": 0.06825223254966284,
"grad_norm": 2.7348647117614746,
"learning_rate": 4.942749549505582e-05,
"loss": 3.2519,
"step": 749
},
{
"epoch": 0.0683433570256971,
"grad_norm": 5.112464904785156,
"learning_rate": 4.9425971638690847e-05,
"loss": 3.2507,
"step": 750
},
{
"epoch": 0.06843448150173137,
"grad_norm": 3.64758563041687,
"learning_rate": 4.942444578052129e-05,
"loss": 2.8225,
"step": 751
},
{
"epoch": 0.06852560597776562,
"grad_norm": 2.541335344314575,
"learning_rate": 4.942291792067221e-05,
"loss": 3.1085,
"step": 752
},
{
"epoch": 0.06861673045379989,
"grad_norm": 3.1781222820281982,
"learning_rate": 4.9421388059268794e-05,
"loss": 3.4272,
"step": 753
},
{
"epoch": 0.06870785492983415,
"grad_norm": 2.2702085971832275,
"learning_rate": 4.941985619643645e-05,
"loss": 3.2569,
"step": 754
},
{
"epoch": 0.06879897940586842,
"grad_norm": 5.204946517944336,
"learning_rate": 4.94183223323007e-05,
"loss": 3.3751,
"step": 755
},
{
"epoch": 0.06889010388190268,
"grad_norm": 2.0559349060058594,
"learning_rate": 4.941678646698726e-05,
"loss": 3.0242,
"step": 756
},
{
"epoch": 0.06898122835793694,
"grad_norm": 3.680403470993042,
"learning_rate": 4.941524860062201e-05,
"loss": 3.3072,
"step": 757
},
{
"epoch": 0.0690723528339712,
"grad_norm": 2.779707908630371,
"learning_rate": 4.941370873333096e-05,
"loss": 2.8916,
"step": 758
},
{
"epoch": 0.06916347731000547,
"grad_norm": 2.8263614177703857,
"learning_rate": 4.941216686524032e-05,
"loss": 3.3456,
"step": 759
},
{
"epoch": 0.06925460178603973,
"grad_norm": 2.906216621398926,
"learning_rate": 4.941062299647645e-05,
"loss": 3.3625,
"step": 760
},
{
"epoch": 0.069345726262074,
"grad_norm": 3.632577419281006,
"learning_rate": 4.9409077127165895e-05,
"loss": 3.2432,
"step": 761
},
{
"epoch": 0.06943685073810825,
"grad_norm": 3.2788524627685547,
"learning_rate": 4.940752925743531e-05,
"loss": 3.2008,
"step": 762
},
{
"epoch": 0.06952797521414252,
"grad_norm": 2.848799467086792,
"learning_rate": 4.9405979387411576e-05,
"loss": 3.6153,
"step": 763
},
{
"epoch": 0.06961909969017678,
"grad_norm": 1.9642467498779297,
"learning_rate": 4.940442751722171e-05,
"loss": 3.1354,
"step": 764
},
{
"epoch": 0.06971022416621105,
"grad_norm": 2.173759698867798,
"learning_rate": 4.9402873646992876e-05,
"loss": 3.0818,
"step": 765
},
{
"epoch": 0.06980134864224531,
"grad_norm": 3.0131309032440186,
"learning_rate": 4.940131777685243e-05,
"loss": 3.4091,
"step": 766
},
{
"epoch": 0.06989247311827956,
"grad_norm": 2.9783716201782227,
"learning_rate": 4.939975990692789e-05,
"loss": 3.2632,
"step": 767
},
{
"epoch": 0.06998359759431383,
"grad_norm": 3.359174966812134,
"learning_rate": 4.9398200037346907e-05,
"loss": 3.057,
"step": 768
},
{
"epoch": 0.0700747220703481,
"grad_norm": 3.2321484088897705,
"learning_rate": 4.939663816823735e-05,
"loss": 2.8868,
"step": 769
},
{
"epoch": 0.07016584654638236,
"grad_norm": 2.782243013381958,
"learning_rate": 4.9395074299727196e-05,
"loss": 3.6189,
"step": 770
},
{
"epoch": 0.07025697102241663,
"grad_norm": 3.496765375137329,
"learning_rate": 4.939350843194462e-05,
"loss": 3.4184,
"step": 771
},
{
"epoch": 0.07034809549845088,
"grad_norm": 3.07650089263916,
"learning_rate": 4.939194056501795e-05,
"loss": 3.1974,
"step": 772
},
{
"epoch": 0.07043921997448514,
"grad_norm": 2.057051181793213,
"learning_rate": 4.939037069907567e-05,
"loss": 3.3996,
"step": 773
},
{
"epoch": 0.07053034445051941,
"grad_norm": 1.907810926437378,
"learning_rate": 4.938879883424645e-05,
"loss": 3.0664,
"step": 774
},
{
"epoch": 0.07062146892655367,
"grad_norm": 3.811920166015625,
"learning_rate": 4.93872249706591e-05,
"loss": 3.3717,
"step": 775
},
{
"epoch": 0.07071259340258794,
"grad_norm": 2.526494026184082,
"learning_rate": 4.938564910844261e-05,
"loss": 3.2352,
"step": 776
},
{
"epoch": 0.07080371787862219,
"grad_norm": 3.059999465942383,
"learning_rate": 4.938407124772613e-05,
"loss": 3.541,
"step": 777
},
{
"epoch": 0.07089484235465646,
"grad_norm": 3.959871530532837,
"learning_rate": 4.9382491388638976e-05,
"loss": 3.5196,
"step": 778
},
{
"epoch": 0.07098596683069072,
"grad_norm": 2.6813833713531494,
"learning_rate": 4.93809095313106e-05,
"loss": 3.1128,
"step": 779
},
{
"epoch": 0.07107709130672499,
"grad_norm": 3.876431465148926,
"learning_rate": 4.937932567587067e-05,
"loss": 3.2911,
"step": 780
},
{
"epoch": 0.07116821578275925,
"grad_norm": 1.5844011306762695,
"learning_rate": 4.9377739822448975e-05,
"loss": 3.0431,
"step": 781
},
{
"epoch": 0.07125934025879352,
"grad_norm": 4.27528190612793,
"learning_rate": 4.937615197117549e-05,
"loss": 3.429,
"step": 782
},
{
"epoch": 0.07135046473482777,
"grad_norm": 3.49869441986084,
"learning_rate": 4.937456212218034e-05,
"loss": 3.5458,
"step": 783
},
{
"epoch": 0.07144158921086204,
"grad_norm": 2.712157964706421,
"learning_rate": 4.9372970275593805e-05,
"loss": 3.2802,
"step": 784
},
{
"epoch": 0.0715327136868963,
"grad_norm": 3.354679584503174,
"learning_rate": 4.937137643154637e-05,
"loss": 3.3316,
"step": 785
},
{
"epoch": 0.07162383816293057,
"grad_norm": 4.650734901428223,
"learning_rate": 4.9369780590168635e-05,
"loss": 3.1748,
"step": 786
},
{
"epoch": 0.07171496263896483,
"grad_norm": 4.040694713592529,
"learning_rate": 4.93681827515914e-05,
"loss": 3.3054,
"step": 787
},
{
"epoch": 0.07180608711499908,
"grad_norm": 3.034775733947754,
"learning_rate": 4.936658291594562e-05,
"loss": 3.4519,
"step": 788
},
{
"epoch": 0.07189721159103335,
"grad_norm": 3.9057462215423584,
"learning_rate": 4.9364981083362374e-05,
"loss": 3.2165,
"step": 789
},
{
"epoch": 0.07198833606706762,
"grad_norm": 3.4691364765167236,
"learning_rate": 4.9363377253972976e-05,
"loss": 3.4187,
"step": 790
},
{
"epoch": 0.07207946054310188,
"grad_norm": 5.106943130493164,
"learning_rate": 4.936177142790885e-05,
"loss": 3.1486,
"step": 791
},
{
"epoch": 0.07217058501913615,
"grad_norm": 2.5772221088409424,
"learning_rate": 4.9360163605301604e-05,
"loss": 3.2757,
"step": 792
},
{
"epoch": 0.0722617094951704,
"grad_norm": 3.403024196624756,
"learning_rate": 4.935855378628299e-05,
"loss": 3.2914,
"step": 793
},
{
"epoch": 0.07235283397120466,
"grad_norm": 1.7153654098510742,
"learning_rate": 4.935694197098496e-05,
"loss": 3.1355,
"step": 794
},
{
"epoch": 0.07244395844723893,
"grad_norm": 3.2709758281707764,
"learning_rate": 4.9355328159539606e-05,
"loss": 3.3144,
"step": 795
},
{
"epoch": 0.0725350829232732,
"grad_norm": 2.949646472930908,
"learning_rate": 4.935371235207917e-05,
"loss": 3.457,
"step": 796
},
{
"epoch": 0.07262620739930746,
"grad_norm": 3.8524835109710693,
"learning_rate": 4.935209454873609e-05,
"loss": 2.6742,
"step": 797
},
{
"epoch": 0.07271733187534171,
"grad_norm": 2.2433860301971436,
"learning_rate": 4.9350474749642946e-05,
"loss": 3.0764,
"step": 798
},
{
"epoch": 0.07280845635137598,
"grad_norm": 2.8999814987182617,
"learning_rate": 4.9348852954932476e-05,
"loss": 3.3237,
"step": 799
},
{
"epoch": 0.07289958082741024,
"grad_norm": 1.8484467267990112,
"learning_rate": 4.9347229164737615e-05,
"loss": 3.1616,
"step": 800
},
{
"epoch": 0.07299070530344451,
"grad_norm": 1.6664539575576782,
"learning_rate": 4.934560337919143e-05,
"loss": 2.9996,
"step": 801
},
{
"epoch": 0.07308182977947877,
"grad_norm": 4.941806316375732,
"learning_rate": 4.934397559842715e-05,
"loss": 3.5425,
"step": 802
},
{
"epoch": 0.07317295425551303,
"grad_norm": 1.4220082759857178,
"learning_rate": 4.9342345822578184e-05,
"loss": 3.1634,
"step": 803
},
{
"epoch": 0.07326407873154729,
"grad_norm": 2.4295237064361572,
"learning_rate": 4.9340714051778106e-05,
"loss": 3.4212,
"step": 804
},
{
"epoch": 0.07335520320758156,
"grad_norm": 1.844810962677002,
"learning_rate": 4.933908028616063e-05,
"loss": 3.1608,
"step": 805
},
{
"epoch": 0.07344632768361582,
"grad_norm": 3.3480727672576904,
"learning_rate": 4.933744452585966e-05,
"loss": 3.4193,
"step": 806
},
{
"epoch": 0.07353745215965009,
"grad_norm": 3.4452803134918213,
"learning_rate": 4.9335806771009266e-05,
"loss": 4.3444,
"step": 807
},
{
"epoch": 0.07362857663568434,
"grad_norm": 3.2160511016845703,
"learning_rate": 4.933416702174365e-05,
"loss": 3.2287,
"step": 808
},
{
"epoch": 0.0737197011117186,
"grad_norm": 1.905893325805664,
"learning_rate": 4.9332525278197195e-05,
"loss": 3.0757,
"step": 809
},
{
"epoch": 0.07381082558775287,
"grad_norm": 4.715121269226074,
"learning_rate": 4.9330881540504457e-05,
"loss": 3.4811,
"step": 810
},
{
"epoch": 0.07390195006378714,
"grad_norm": 3.127492904663086,
"learning_rate": 4.932923580880015e-05,
"loss": 3.5574,
"step": 811
},
{
"epoch": 0.0739930745398214,
"grad_norm": 3.37953782081604,
"learning_rate": 4.9327588083219136e-05,
"loss": 3.4364,
"step": 812
},
{
"epoch": 0.07408419901585565,
"grad_norm": 3.43113374710083,
"learning_rate": 4.932593836389646e-05,
"loss": 2.5653,
"step": 813
},
{
"epoch": 0.07417532349188992,
"grad_norm": 2.3801136016845703,
"learning_rate": 4.9324286650967324e-05,
"loss": 3.1677,
"step": 814
},
{
"epoch": 0.07426644796792418,
"grad_norm": 3.0977799892425537,
"learning_rate": 4.932263294456708e-05,
"loss": 3.0717,
"step": 815
},
{
"epoch": 0.07435757244395845,
"grad_norm": 2.2414751052856445,
"learning_rate": 4.9320977244831277e-05,
"loss": 2.9498,
"step": 816
},
{
"epoch": 0.07444869691999272,
"grad_norm": 3.319639205932617,
"learning_rate": 4.931931955189559e-05,
"loss": 3.3386,
"step": 817
},
{
"epoch": 0.07453982139602697,
"grad_norm": 2.776702642440796,
"learning_rate": 4.931765986589588e-05,
"loss": 3.1402,
"step": 818
},
{
"epoch": 0.07463094587206123,
"grad_norm": 3.072389841079712,
"learning_rate": 4.931599818696817e-05,
"loss": 3.1573,
"step": 819
},
{
"epoch": 0.0747220703480955,
"grad_norm": 3.179121255874634,
"learning_rate": 4.931433451524863e-05,
"loss": 3.2369,
"step": 820
},
{
"epoch": 0.07481319482412976,
"grad_norm": 3.051584005355835,
"learning_rate": 4.9312668850873603e-05,
"loss": 3.381,
"step": 821
},
{
"epoch": 0.07490431930016403,
"grad_norm": 3.383882761001587,
"learning_rate": 4.931100119397961e-05,
"loss": 3.42,
"step": 822
},
{
"epoch": 0.07499544377619828,
"grad_norm": 3.531190872192383,
"learning_rate": 4.930933154470331e-05,
"loss": 3.0216,
"step": 823
},
{
"epoch": 0.07508656825223255,
"grad_norm": 5.148257255554199,
"learning_rate": 4.9307659903181545e-05,
"loss": 3.4292,
"step": 824
},
{
"epoch": 0.07517769272826681,
"grad_norm": 1.8193916082382202,
"learning_rate": 4.9305986269551315e-05,
"loss": 3.1074,
"step": 825
},
{
"epoch": 0.07526881720430108,
"grad_norm": 3.702211380004883,
"learning_rate": 4.930431064394977e-05,
"loss": 3.2786,
"step": 826
},
{
"epoch": 0.07535994168033534,
"grad_norm": 1.5868266820907593,
"learning_rate": 4.930263302651424e-05,
"loss": 2.9994,
"step": 827
},
{
"epoch": 0.0754510661563696,
"grad_norm": 3.0381083488464355,
"learning_rate": 4.930095341738221e-05,
"loss": 3.5689,
"step": 828
},
{
"epoch": 0.07554219063240386,
"grad_norm": 4.9261884689331055,
"learning_rate": 4.929927181669133e-05,
"loss": 4.6381,
"step": 829
},
{
"epoch": 0.07563331510843813,
"grad_norm": 2.415921688079834,
"learning_rate": 4.929758822457943e-05,
"loss": 3.4642,
"step": 830
},
{
"epoch": 0.07572443958447239,
"grad_norm": 2.334571123123169,
"learning_rate": 4.929590264118446e-05,
"loss": 3.1396,
"step": 831
},
{
"epoch": 0.07581556406050666,
"grad_norm": 3.4270524978637695,
"learning_rate": 4.929421506664458e-05,
"loss": 3.0609,
"step": 832
},
{
"epoch": 0.07590668853654091,
"grad_norm": 1.8095070123672485,
"learning_rate": 4.929252550109808e-05,
"loss": 3.0537,
"step": 833
},
{
"epoch": 0.07599781301257517,
"grad_norm": 2.4400718212127686,
"learning_rate": 4.929083394468344e-05,
"loss": 2.9386,
"step": 834
},
{
"epoch": 0.07608893748860944,
"grad_norm": 3.1036880016326904,
"learning_rate": 4.928914039753928e-05,
"loss": 3.5941,
"step": 835
},
{
"epoch": 0.0761800619646437,
"grad_norm": 2.4113924503326416,
"learning_rate": 4.92874448598044e-05,
"loss": 3.4679,
"step": 836
},
{
"epoch": 0.07627118644067797,
"grad_norm": 1.8080517053604126,
"learning_rate": 4.9285747331617746e-05,
"loss": 3.0843,
"step": 837
},
{
"epoch": 0.07636231091671222,
"grad_norm": 2.755985975265503,
"learning_rate": 4.928404781311845e-05,
"loss": 3.5471,
"step": 838
},
{
"epoch": 0.07645343539274649,
"grad_norm": 2.892883777618408,
"learning_rate": 4.928234630444579e-05,
"loss": 3.2349,
"step": 839
},
{
"epoch": 0.07654455986878075,
"grad_norm": 2.8694229125976562,
"learning_rate": 4.92806428057392e-05,
"loss": 3.4227,
"step": 840
},
{
"epoch": 0.07663568434481502,
"grad_norm": 4.09429407119751,
"learning_rate": 4.9278937317138305e-05,
"loss": 3.4834,
"step": 841
},
{
"epoch": 0.07672680882084928,
"grad_norm": 2.272854804992676,
"learning_rate": 4.927722983878286e-05,
"loss": 3.2056,
"step": 842
},
{
"epoch": 0.07681793329688354,
"grad_norm": 5.0233330726623535,
"learning_rate": 4.927552037081282e-05,
"loss": 2.9908,
"step": 843
},
{
"epoch": 0.0769090577729178,
"grad_norm": 2.9697277545928955,
"learning_rate": 4.9273808913368256e-05,
"loss": 3.0797,
"step": 844
},
{
"epoch": 0.07700018224895207,
"grad_norm": 2.6238036155700684,
"learning_rate": 4.927209546658946e-05,
"loss": 3.1607,
"step": 845
},
{
"epoch": 0.07709130672498633,
"grad_norm": 2.3229193687438965,
"learning_rate": 4.9270380030616826e-05,
"loss": 3.0519,
"step": 846
},
{
"epoch": 0.0771824312010206,
"grad_norm": 3.1790342330932617,
"learning_rate": 4.9268662605590963e-05,
"loss": 3.1259,
"step": 847
},
{
"epoch": 0.07727355567705485,
"grad_norm": 2.6441993713378906,
"learning_rate": 4.926694319165261e-05,
"loss": 3.2281,
"step": 848
},
{
"epoch": 0.07736468015308912,
"grad_norm": 3.473982572555542,
"learning_rate": 4.926522178894268e-05,
"loss": 3.0969,
"step": 849
},
{
"epoch": 0.07745580462912338,
"grad_norm": 3.585967540740967,
"learning_rate": 4.926349839760225e-05,
"loss": 3.2388,
"step": 850
},
{
"epoch": 0.07754692910515765,
"grad_norm": 2.786681890487671,
"learning_rate": 4.926177301777256e-05,
"loss": 2.8739,
"step": 851
},
{
"epoch": 0.07763805358119191,
"grad_norm": 2.578705072402954,
"learning_rate": 4.926004564959501e-05,
"loss": 3.1861,
"step": 852
},
{
"epoch": 0.07772917805722616,
"grad_norm": 3.162743091583252,
"learning_rate": 4.925831629321117e-05,
"loss": 3.4526,
"step": 853
},
{
"epoch": 0.07782030253326043,
"grad_norm": 2.0641379356384277,
"learning_rate": 4.925658494876275e-05,
"loss": 3.1193,
"step": 854
},
{
"epoch": 0.0779114270092947,
"grad_norm": 4.980138778686523,
"learning_rate": 4.9254851616391664e-05,
"loss": 3.4487,
"step": 855
},
{
"epoch": 0.07800255148532896,
"grad_norm": 1.8417590856552124,
"learning_rate": 4.9253116296239956e-05,
"loss": 3.1246,
"step": 856
},
{
"epoch": 0.07809367596136323,
"grad_norm": 2.736356496810913,
"learning_rate": 4.9251378988449835e-05,
"loss": 3.3114,
"step": 857
},
{
"epoch": 0.07818480043739748,
"grad_norm": 2.5761330127716064,
"learning_rate": 4.924963969316369e-05,
"loss": 3.3988,
"step": 858
},
{
"epoch": 0.07827592491343174,
"grad_norm": 3.1260087490081787,
"learning_rate": 4.924789841052406e-05,
"loss": 3.2409,
"step": 859
},
{
"epoch": 0.07836704938946601,
"grad_norm": 1.8674402236938477,
"learning_rate": 4.9246155140673646e-05,
"loss": 3.0823,
"step": 860
},
{
"epoch": 0.07845817386550027,
"grad_norm": 2.6160728931427,
"learning_rate": 4.924440988375532e-05,
"loss": 3.4579,
"step": 861
},
{
"epoch": 0.07854929834153454,
"grad_norm": 1.7055904865264893,
"learning_rate": 4.924266263991212e-05,
"loss": 3.119,
"step": 862
},
{
"epoch": 0.0786404228175688,
"grad_norm": 1.8979192972183228,
"learning_rate": 4.924091340928722e-05,
"loss": 3.1205,
"step": 863
},
{
"epoch": 0.07873154729360306,
"grad_norm": 1.8284133672714233,
"learning_rate": 4.923916219202399e-05,
"loss": 2.9849,
"step": 864
},
{
"epoch": 0.07882267176963732,
"grad_norm": 1.7913658618927002,
"learning_rate": 4.923740898826595e-05,
"loss": 3.0129,
"step": 865
},
{
"epoch": 0.07891379624567159,
"grad_norm": 2.9675111770629883,
"learning_rate": 4.9235653798156786e-05,
"loss": 3.2939,
"step": 866
},
{
"epoch": 0.07900492072170585,
"grad_norm": 2.1613569259643555,
"learning_rate": 4.9233896621840326e-05,
"loss": 3.1203,
"step": 867
},
{
"epoch": 0.07909604519774012,
"grad_norm": 2.8138372898101807,
"learning_rate": 4.923213745946059e-05,
"loss": 3.3916,
"step": 868
},
{
"epoch": 0.07918716967377437,
"grad_norm": 5.18245792388916,
"learning_rate": 4.9230376311161744e-05,
"loss": 3.1091,
"step": 869
},
{
"epoch": 0.07927829414980864,
"grad_norm": 3.7926981449127197,
"learning_rate": 4.922861317708812e-05,
"loss": 3.0363,
"step": 870
},
{
"epoch": 0.0793694186258429,
"grad_norm": 2.6583340167999268,
"learning_rate": 4.9226848057384225e-05,
"loss": 3.1699,
"step": 871
},
{
"epoch": 0.07946054310187717,
"grad_norm": 3.3531649112701416,
"learning_rate": 4.92250809521947e-05,
"loss": 3.0846,
"step": 872
},
{
"epoch": 0.07955166757791143,
"grad_norm": 2.9770283699035645,
"learning_rate": 4.922331186166438e-05,
"loss": 3.0176,
"step": 873
},
{
"epoch": 0.07964279205394569,
"grad_norm": 2.4211061000823975,
"learning_rate": 4.922154078593824e-05,
"loss": 3.5094,
"step": 874
},
{
"epoch": 0.07973391652997995,
"grad_norm": 2.4895503520965576,
"learning_rate": 4.9219767725161436e-05,
"loss": 3.4907,
"step": 875
},
{
"epoch": 0.07982504100601422,
"grad_norm": 2.4370858669281006,
"learning_rate": 4.9217992679479266e-05,
"loss": 3.4705,
"step": 876
},
{
"epoch": 0.07991616548204848,
"grad_norm": 2.213453531265259,
"learning_rate": 4.921621564903721e-05,
"loss": 3.1494,
"step": 877
},
{
"epoch": 0.08000728995808275,
"grad_norm": 2.5228660106658936,
"learning_rate": 4.9214436633980904e-05,
"loss": 2.3643,
"step": 878
},
{
"epoch": 0.080098414434117,
"grad_norm": 1.479423999786377,
"learning_rate": 4.921265563445614e-05,
"loss": 2.9752,
"step": 879
},
{
"epoch": 0.08018953891015126,
"grad_norm": 3.9881060123443604,
"learning_rate": 4.921087265060888e-05,
"loss": 3.172,
"step": 880
},
{
"epoch": 0.08028066338618553,
"grad_norm": 3.8781585693359375,
"learning_rate": 4.920908768258524e-05,
"loss": 3.2123,
"step": 881
},
{
"epoch": 0.0803717878622198,
"grad_norm": 1.9390805959701538,
"learning_rate": 4.920730073053152e-05,
"loss": 3.156,
"step": 882
},
{
"epoch": 0.08046291233825406,
"grad_norm": 3.341097116470337,
"learning_rate": 4.920551179459415e-05,
"loss": 3.2462,
"step": 883
},
{
"epoch": 0.08055403681428831,
"grad_norm": 3.1172938346862793,
"learning_rate": 4.9203720874919765e-05,
"loss": 3.2327,
"step": 884
},
{
"epoch": 0.08064516129032258,
"grad_norm": 2.6865100860595703,
"learning_rate": 4.920192797165511e-05,
"loss": 3.0381,
"step": 885
},
{
"epoch": 0.08073628576635684,
"grad_norm": 1.9933525323867798,
"learning_rate": 4.920013308494714e-05,
"loss": 3.1288,
"step": 886
},
{
"epoch": 0.08082741024239111,
"grad_norm": 2.4274346828460693,
"learning_rate": 4.919833621494294e-05,
"loss": 3.1731,
"step": 887
},
{
"epoch": 0.08091853471842538,
"grad_norm": 3.6805949211120605,
"learning_rate": 4.919653736178977e-05,
"loss": 3.4796,
"step": 888
},
{
"epoch": 0.08100965919445963,
"grad_norm": 2.9740312099456787,
"learning_rate": 4.9194736525635074e-05,
"loss": 3.2645,
"step": 889
},
{
"epoch": 0.08110078367049389,
"grad_norm": 1.9813849925994873,
"learning_rate": 4.919293370662642e-05,
"loss": 3.1699,
"step": 890
},
{
"epoch": 0.08119190814652816,
"grad_norm": 2.8427340984344482,
"learning_rate": 4.9191128904911556e-05,
"loss": 3.3489,
"step": 891
},
{
"epoch": 0.08128303262256242,
"grad_norm": 3.997051954269409,
"learning_rate": 4.91893221206384e-05,
"loss": 3.4374,
"step": 892
},
{
"epoch": 0.08137415709859669,
"grad_norm": 1.672037959098816,
"learning_rate": 4.9187513353955016e-05,
"loss": 3.1125,
"step": 893
},
{
"epoch": 0.08146528157463094,
"grad_norm": 2.2593343257904053,
"learning_rate": 4.9185702605009645e-05,
"loss": 3.1311,
"step": 894
},
{
"epoch": 0.0815564060506652,
"grad_norm": 3.1953940391540527,
"learning_rate": 4.9183889873950684e-05,
"loss": 3.366,
"step": 895
},
{
"epoch": 0.08164753052669947,
"grad_norm": 3.4176578521728516,
"learning_rate": 4.91820751609267e-05,
"loss": 3.3673,
"step": 896
},
{
"epoch": 0.08173865500273374,
"grad_norm": 2.87166166305542,
"learning_rate": 4.9180258466086404e-05,
"loss": 3.5602,
"step": 897
},
{
"epoch": 0.081829779478768,
"grad_norm": 2.719068765640259,
"learning_rate": 4.917843978957869e-05,
"loss": 3.2119,
"step": 898
},
{
"epoch": 0.08192090395480225,
"grad_norm": 2.7754950523376465,
"learning_rate": 4.9176619131552604e-05,
"loss": 3.0594,
"step": 899
},
{
"epoch": 0.08201202843083652,
"grad_norm": 3.5347611904144287,
"learning_rate": 4.917479649215735e-05,
"loss": 3.3751,
"step": 900
},
{
"epoch": 0.08210315290687079,
"grad_norm": 4.182806015014648,
"learning_rate": 4.917297187154232e-05,
"loss": 3.1338,
"step": 901
},
{
"epoch": 0.08219427738290505,
"grad_norm": 1.5867587327957153,
"learning_rate": 4.9171145269857024e-05,
"loss": 3.0826,
"step": 902
},
{
"epoch": 0.08228540185893932,
"grad_norm": 3.97678279876709,
"learning_rate": 4.916931668725117e-05,
"loss": 3.3052,
"step": 903
},
{
"epoch": 0.08237652633497357,
"grad_norm": 3.3548977375030518,
"learning_rate": 4.916748612387461e-05,
"loss": 3.3696,
"step": 904
},
{
"epoch": 0.08246765081100783,
"grad_norm": 4.031994342803955,
"learning_rate": 4.916565357987738e-05,
"loss": 3.0432,
"step": 905
},
{
"epoch": 0.0825587752870421,
"grad_norm": 3.2942988872528076,
"learning_rate": 4.916381905540966e-05,
"loss": 3.0257,
"step": 906
},
{
"epoch": 0.08264989976307636,
"grad_norm": 2.751410484313965,
"learning_rate": 4.916198255062179e-05,
"loss": 3.1613,
"step": 907
},
{
"epoch": 0.08274102423911063,
"grad_norm": 2.4237067699432373,
"learning_rate": 4.916014406566428e-05,
"loss": 3.2109,
"step": 908
},
{
"epoch": 0.08283214871514488,
"grad_norm": 1.9257638454437256,
"learning_rate": 4.915830360068781e-05,
"loss": 3.0888,
"step": 909
},
{
"epoch": 0.08292327319117915,
"grad_norm": 2.7850747108459473,
"learning_rate": 4.91564611558432e-05,
"loss": 3.0367,
"step": 910
},
{
"epoch": 0.08301439766721341,
"grad_norm": 2.9995596408843994,
"learning_rate": 4.915461673128146e-05,
"loss": 3.1854,
"step": 911
},
{
"epoch": 0.08310552214324768,
"grad_norm": 2.685365915298462,
"learning_rate": 4.915277032715374e-05,
"loss": 3.491,
"step": 912
},
{
"epoch": 0.08319664661928194,
"grad_norm": 2.090184211730957,
"learning_rate": 4.915092194361136e-05,
"loss": 3.3902,
"step": 913
},
{
"epoch": 0.0832877710953162,
"grad_norm": 2.95298171043396,
"learning_rate": 4.91490715808058e-05,
"loss": 3.349,
"step": 914
},
{
"epoch": 0.08337889557135046,
"grad_norm": 2.3491621017456055,
"learning_rate": 4.914721923888871e-05,
"loss": 3.0253,
"step": 915
},
{
"epoch": 0.08347002004738473,
"grad_norm": 2.7936818599700928,
"learning_rate": 4.914536491801189e-05,
"loss": 3.1338,
"step": 916
},
{
"epoch": 0.08356114452341899,
"grad_norm": 2.7228002548217773,
"learning_rate": 4.914350861832732e-05,
"loss": 3.0906,
"step": 917
},
{
"epoch": 0.08365226899945326,
"grad_norm": 3.0175414085388184,
"learning_rate": 4.914165033998711e-05,
"loss": 3.4414,
"step": 918
},
{
"epoch": 0.08374339347548751,
"grad_norm": 1.8119590282440186,
"learning_rate": 4.9139790083143574e-05,
"loss": 3.1763,
"step": 919
},
{
"epoch": 0.08383451795152178,
"grad_norm": 2.5555202960968018,
"learning_rate": 4.913792784794917e-05,
"loss": 3.356,
"step": 920
},
{
"epoch": 0.08392564242755604,
"grad_norm": 2.8031249046325684,
"learning_rate": 4.913606363455649e-05,
"loss": 3.3269,
"step": 921
},
{
"epoch": 0.0840167669035903,
"grad_norm": 2.178687810897827,
"learning_rate": 4.913419744311835e-05,
"loss": 3.1791,
"step": 922
},
{
"epoch": 0.08410789137962457,
"grad_norm": 2.583512544631958,
"learning_rate": 4.9132329273787655e-05,
"loss": 3.5688,
"step": 923
},
{
"epoch": 0.08419901585565882,
"grad_norm": 2.3542723655700684,
"learning_rate": 4.913045912671753e-05,
"loss": 3.061,
"step": 924
},
{
"epoch": 0.08429014033169309,
"grad_norm": 1.9939539432525635,
"learning_rate": 4.9128587002061245e-05,
"loss": 3.074,
"step": 925
},
{
"epoch": 0.08438126480772735,
"grad_norm": 3.179673194885254,
"learning_rate": 4.912671289997221e-05,
"loss": 2.9628,
"step": 926
},
{
"epoch": 0.08447238928376162,
"grad_norm": 4.300661563873291,
"learning_rate": 4.912483682060403e-05,
"loss": 3.2144,
"step": 927
},
{
"epoch": 0.08456351375979589,
"grad_norm": 3.354478597640991,
"learning_rate": 4.912295876411044e-05,
"loss": 2.8357,
"step": 928
},
{
"epoch": 0.08465463823583014,
"grad_norm": 2.492208957672119,
"learning_rate": 4.9121078730645375e-05,
"loss": 3.2905,
"step": 929
},
{
"epoch": 0.0847457627118644,
"grad_norm": 3.1754820346832275,
"learning_rate": 4.91191967203629e-05,
"loss": 3.205,
"step": 930
},
{
"epoch": 0.08483688718789867,
"grad_norm": 2.5363569259643555,
"learning_rate": 4.911731273341725e-05,
"loss": 3.395,
"step": 931
},
{
"epoch": 0.08492801166393293,
"grad_norm": 4.300615310668945,
"learning_rate": 4.911542676996284e-05,
"loss": 3.0683,
"step": 932
},
{
"epoch": 0.0850191361399672,
"grad_norm": 2.777848958969116,
"learning_rate": 4.911353883015422e-05,
"loss": 3.1315,
"step": 933
},
{
"epoch": 0.08511026061600145,
"grad_norm": 3.849351406097412,
"learning_rate": 4.9111648914146116e-05,
"loss": 3.1234,
"step": 934
},
{
"epoch": 0.08520138509203572,
"grad_norm": 2.4552981853485107,
"learning_rate": 4.910975702209341e-05,
"loss": 3.3631,
"step": 935
},
{
"epoch": 0.08529250956806998,
"grad_norm": 3.1233198642730713,
"learning_rate": 4.910786315415115e-05,
"loss": 3.4195,
"step": 936
},
{
"epoch": 0.08538363404410425,
"grad_norm": 4.6678900718688965,
"learning_rate": 4.910596731047456e-05,
"loss": 3.4252,
"step": 937
},
{
"epoch": 0.08547475852013851,
"grad_norm": 2.9370291233062744,
"learning_rate": 4.9104069491218995e-05,
"loss": 4.3231,
"step": 938
},
{
"epoch": 0.08556588299617277,
"grad_norm": 1.6954116821289062,
"learning_rate": 4.910216969654e-05,
"loss": 3.0919,
"step": 939
},
{
"epoch": 0.08565700747220703,
"grad_norm": 1.941863775253296,
"learning_rate": 4.9100267926593266e-05,
"loss": 3.13,
"step": 940
},
{
"epoch": 0.0857481319482413,
"grad_norm": 2.876239776611328,
"learning_rate": 4.909836418153465e-05,
"loss": 3.3744,
"step": 941
},
{
"epoch": 0.08583925642427556,
"grad_norm": 1.5756586790084839,
"learning_rate": 4.909645846152018e-05,
"loss": 3.0713,
"step": 942
},
{
"epoch": 0.08593038090030983,
"grad_norm": 3.047095775604248,
"learning_rate": 4.909455076670601e-05,
"loss": 3.3628,
"step": 943
},
{
"epoch": 0.08602150537634409,
"grad_norm": 4.294236660003662,
"learning_rate": 4.909264109724853e-05,
"loss": 3.4089,
"step": 944
},
{
"epoch": 0.08611262985237834,
"grad_norm": 3.7276289463043213,
"learning_rate": 4.9090729453304197e-05,
"loss": 3.5265,
"step": 945
},
{
"epoch": 0.08620375432841261,
"grad_norm": 2.4728293418884277,
"learning_rate": 4.908881583502971e-05,
"loss": 4.0518,
"step": 946
},
{
"epoch": 0.08629487880444688,
"grad_norm": 2.5228142738342285,
"learning_rate": 4.908690024258188e-05,
"loss": 3.3074,
"step": 947
},
{
"epoch": 0.08638600328048114,
"grad_norm": 2.56369686126709,
"learning_rate": 4.90849826761177e-05,
"loss": 3.472,
"step": 948
},
{
"epoch": 0.0864771277565154,
"grad_norm": 3.3140337467193604,
"learning_rate": 4.908306313579433e-05,
"loss": 3.027,
"step": 949
},
{
"epoch": 0.08656825223254966,
"grad_norm": 1.8993895053863525,
"learning_rate": 4.908114162176908e-05,
"loss": 3.0605,
"step": 950
},
{
"epoch": 0.08665937670858392,
"grad_norm": 4.037572383880615,
"learning_rate": 4.907921813419942e-05,
"loss": 3.2735,
"step": 951
},
{
"epoch": 0.08675050118461819,
"grad_norm": 2.20011043548584,
"learning_rate": 4.9077292673243e-05,
"loss": 3.4346,
"step": 952
},
{
"epoch": 0.08684162566065246,
"grad_norm": 1.5664113759994507,
"learning_rate": 4.907536523905761e-05,
"loss": 2.9917,
"step": 953
},
{
"epoch": 0.08693275013668672,
"grad_norm": 1.562983512878418,
"learning_rate": 4.907343583180122e-05,
"loss": 2.831,
"step": 954
},
{
"epoch": 0.08702387461272097,
"grad_norm": 1.4592325687408447,
"learning_rate": 4.9071504451631934e-05,
"loss": 3.1492,
"step": 955
},
{
"epoch": 0.08711499908875524,
"grad_norm": 3.860102653503418,
"learning_rate": 4.9069571098708045e-05,
"loss": 3.3501,
"step": 956
},
{
"epoch": 0.0872061235647895,
"grad_norm": 3.7838504314422607,
"learning_rate": 4.9067635773188005e-05,
"loss": 3.2827,
"step": 957
},
{
"epoch": 0.08729724804082377,
"grad_norm": 1.798142671585083,
"learning_rate": 4.906569847523042e-05,
"loss": 3.2199,
"step": 958
},
{
"epoch": 0.08738837251685803,
"grad_norm": 2.6328585147857666,
"learning_rate": 4.906375920499405e-05,
"loss": 3.1803,
"step": 959
},
{
"epoch": 0.08747949699289229,
"grad_norm": 2.4833974838256836,
"learning_rate": 4.906181796263784e-05,
"loss": 3.7958,
"step": 960
},
{
"epoch": 0.08757062146892655,
"grad_norm": 2.0607047080993652,
"learning_rate": 4.9059874748320876e-05,
"loss": 3.1254,
"step": 961
},
{
"epoch": 0.08766174594496082,
"grad_norm": 2.7997632026672363,
"learning_rate": 4.90579295622024e-05,
"loss": 2.953,
"step": 962
},
{
"epoch": 0.08775287042099508,
"grad_norm": 2.249958038330078,
"learning_rate": 4.905598240444185e-05,
"loss": 3.4737,
"step": 963
},
{
"epoch": 0.08784399489702935,
"grad_norm": 3.1202094554901123,
"learning_rate": 4.9054033275198794e-05,
"loss": 3.1832,
"step": 964
},
{
"epoch": 0.0879351193730636,
"grad_norm": 1.4089468717575073,
"learning_rate": 4.905208217463296e-05,
"loss": 3.0008,
"step": 965
},
{
"epoch": 0.08802624384909787,
"grad_norm": 2.194896697998047,
"learning_rate": 4.905012910290426e-05,
"loss": 3.1747,
"step": 966
},
{
"epoch": 0.08811736832513213,
"grad_norm": 1.8730498552322388,
"learning_rate": 4.904817406017275e-05,
"loss": 3.0702,
"step": 967
},
{
"epoch": 0.0882084928011664,
"grad_norm": 1.5486280918121338,
"learning_rate": 4.904621704659866e-05,
"loss": 3.0648,
"step": 968
},
{
"epoch": 0.08829961727720066,
"grad_norm": 2.200500965118408,
"learning_rate": 4.9044258062342376e-05,
"loss": 3.095,
"step": 969
},
{
"epoch": 0.08839074175323491,
"grad_norm": 4.8551788330078125,
"learning_rate": 4.904229710756444e-05,
"loss": 4.232,
"step": 970
},
{
"epoch": 0.08848186622926918,
"grad_norm": 1.4722237586975098,
"learning_rate": 4.904033418242555e-05,
"loss": 3.021,
"step": 971
},
{
"epoch": 0.08857299070530344,
"grad_norm": 2.9047417640686035,
"learning_rate": 4.9038369287086594e-05,
"loss": 2.9605,
"step": 972
},
{
"epoch": 0.08866411518133771,
"grad_norm": 2.3891854286193848,
"learning_rate": 4.9036402421708596e-05,
"loss": 3.4125,
"step": 973
},
{
"epoch": 0.08875523965737198,
"grad_norm": 3.597698211669922,
"learning_rate": 4.903443358645274e-05,
"loss": 3.4755,
"step": 974
},
{
"epoch": 0.08884636413340623,
"grad_norm": 2.3222601413726807,
"learning_rate": 4.903246278148039e-05,
"loss": 3.1024,
"step": 975
},
{
"epoch": 0.0889374886094405,
"grad_norm": 1.7623449563980103,
"learning_rate": 4.903049000695305e-05,
"loss": 3.0337,
"step": 976
},
{
"epoch": 0.08902861308547476,
"grad_norm": 4.266841411590576,
"learning_rate": 4.9028515263032415e-05,
"loss": 3.3937,
"step": 977
},
{
"epoch": 0.08911973756150902,
"grad_norm": 1.746504783630371,
"learning_rate": 4.902653854988031e-05,
"loss": 3.0249,
"step": 978
},
{
"epoch": 0.08921086203754329,
"grad_norm": 3.4824695587158203,
"learning_rate": 4.9024559867658734e-05,
"loss": 3.1459,
"step": 979
},
{
"epoch": 0.08930198651357754,
"grad_norm": 3.1984243392944336,
"learning_rate": 4.9022579216529854e-05,
"loss": 3.1946,
"step": 980
},
{
"epoch": 0.08939311098961181,
"grad_norm": 2.4650838375091553,
"learning_rate": 4.902059659665599e-05,
"loss": 3.3177,
"step": 981
},
{
"epoch": 0.08948423546564607,
"grad_norm": 3.3806710243225098,
"learning_rate": 4.9018612008199616e-05,
"loss": 3.4283,
"step": 982
},
{
"epoch": 0.08957535994168034,
"grad_norm": 2.031496286392212,
"learning_rate": 4.9016625451323396e-05,
"loss": 3.2034,
"step": 983
},
{
"epoch": 0.0896664844177146,
"grad_norm": 1.8821396827697754,
"learning_rate": 4.9014636926190116e-05,
"loss": 3.0086,
"step": 984
},
{
"epoch": 0.08975760889374886,
"grad_norm": 3.310356378555298,
"learning_rate": 4.901264643296276e-05,
"loss": 3.2155,
"step": 985
},
{
"epoch": 0.08984873336978312,
"grad_norm": 2.422724723815918,
"learning_rate": 4.9010653971804444e-05,
"loss": 3.3122,
"step": 986
},
{
"epoch": 0.08993985784581739,
"grad_norm": 4.234830856323242,
"learning_rate": 4.9008659542878464e-05,
"loss": 3.5449,
"step": 987
},
{
"epoch": 0.09003098232185165,
"grad_norm": 1.48231840133667,
"learning_rate": 4.900666314634828e-05,
"loss": 3.0412,
"step": 988
},
{
"epoch": 0.09012210679788592,
"grad_norm": 2.9697065353393555,
"learning_rate": 4.900466478237748e-05,
"loss": 3.2992,
"step": 989
},
{
"epoch": 0.09021323127392017,
"grad_norm": 2.7923099994659424,
"learning_rate": 4.900266445112986e-05,
"loss": 3.4856,
"step": 990
},
{
"epoch": 0.09030435574995443,
"grad_norm": 2.1255149841308594,
"learning_rate": 4.900066215276936e-05,
"loss": 2.9898,
"step": 991
},
{
"epoch": 0.0903954802259887,
"grad_norm": 2.3270339965820312,
"learning_rate": 4.899865788746005e-05,
"loss": 3.127,
"step": 992
},
{
"epoch": 0.09048660470202297,
"grad_norm": 2.2811200618743896,
"learning_rate": 4.899665165536621e-05,
"loss": 3.3315,
"step": 993
},
{
"epoch": 0.09057772917805723,
"grad_norm": 2.0090272426605225,
"learning_rate": 4.8994643456652244e-05,
"loss": 3.1308,
"step": 994
},
{
"epoch": 0.09066885365409148,
"grad_norm": 5.991725921630859,
"learning_rate": 4.8992633291482746e-05,
"loss": 2.8415,
"step": 995
},
{
"epoch": 0.09075997813012575,
"grad_norm": 1.7398011684417725,
"learning_rate": 4.899062116002244e-05,
"loss": 3.247,
"step": 996
},
{
"epoch": 0.09085110260616001,
"grad_norm": 2.6200222969055176,
"learning_rate": 4.898860706243625e-05,
"loss": 3.0824,
"step": 997
},
{
"epoch": 0.09094222708219428,
"grad_norm": 2.7721424102783203,
"learning_rate": 4.898659099888921e-05,
"loss": 4.5453,
"step": 998
},
{
"epoch": 0.09103335155822855,
"grad_norm": 4.053179740905762,
"learning_rate": 4.8984572969546575e-05,
"loss": 3.4542,
"step": 999
},
{
"epoch": 0.0911244760342628,
"grad_norm": 3.0186972618103027,
"learning_rate": 4.8982552974573717e-05,
"loss": 3.1511,
"step": 1000
},
{
"epoch": 0.09121560051029706,
"grad_norm": 2.732668876647949,
"learning_rate": 4.8980531014136175e-05,
"loss": 3.0888,
"step": 1001
},
{
"epoch": 0.09130672498633133,
"grad_norm": 2.304547071456909,
"learning_rate": 4.897850708839966e-05,
"loss": 3.3741,
"step": 1002
},
{
"epoch": 0.0913978494623656,
"grad_norm": 2.8242197036743164,
"learning_rate": 4.897648119753006e-05,
"loss": 3.3513,
"step": 1003
},
{
"epoch": 0.09148897393839986,
"grad_norm": 3.2186803817749023,
"learning_rate": 4.897445334169337e-05,
"loss": 4.7131,
"step": 1004
},
{
"epoch": 0.09158009841443411,
"grad_norm": 2.906078577041626,
"learning_rate": 4.897242352105581e-05,
"loss": 3.0597,
"step": 1005
},
{
"epoch": 0.09167122289046838,
"grad_norm": 3.2613537311553955,
"learning_rate": 4.8970391735783725e-05,
"loss": 3.0269,
"step": 1006
},
{
"epoch": 0.09176234736650264,
"grad_norm": 4.029659271240234,
"learning_rate": 4.896835798604362e-05,
"loss": 3.5813,
"step": 1007
},
{
"epoch": 0.09185347184253691,
"grad_norm": 8.040237426757812,
"learning_rate": 4.8966322272002174e-05,
"loss": 3.176,
"step": 1008
},
{
"epoch": 0.09194459631857117,
"grad_norm": 3.0369622707366943,
"learning_rate": 4.8964284593826215e-05,
"loss": 3.1676,
"step": 1009
},
{
"epoch": 0.09203572079460542,
"grad_norm": 3.2260282039642334,
"learning_rate": 4.8962244951682754e-05,
"loss": 3.3491,
"step": 1010
},
{
"epoch": 0.09212684527063969,
"grad_norm": 1.53379225730896,
"learning_rate": 4.8960203345738934e-05,
"loss": 3.0221,
"step": 1011
},
{
"epoch": 0.09221796974667396,
"grad_norm": 4.076636791229248,
"learning_rate": 4.895815977616208e-05,
"loss": 3.5203,
"step": 1012
},
{
"epoch": 0.09230909422270822,
"grad_norm": 2.1849517822265625,
"learning_rate": 4.895611424311967e-05,
"loss": 3.4105,
"step": 1013
},
{
"epoch": 0.09240021869874249,
"grad_norm": 8.389893531799316,
"learning_rate": 4.8954066746779334e-05,
"loss": 3.2236,
"step": 1014
},
{
"epoch": 0.09249134317477674,
"grad_norm": 2.0767617225646973,
"learning_rate": 4.895201728730888e-05,
"loss": 3.1362,
"step": 1015
},
{
"epoch": 0.092582467650811,
"grad_norm": 2.6527016162872314,
"learning_rate": 4.894996586487627e-05,
"loss": 3.0877,
"step": 1016
},
{
"epoch": 0.09267359212684527,
"grad_norm": 3.5784764289855957,
"learning_rate": 4.8947912479649624e-05,
"loss": 3.279,
"step": 1017
},
{
"epoch": 0.09276471660287954,
"grad_norm": 1.6435048580169678,
"learning_rate": 4.894585713179723e-05,
"loss": 2.9978,
"step": 1018
},
{
"epoch": 0.0928558410789138,
"grad_norm": 1.3273972272872925,
"learning_rate": 4.894379982148753e-05,
"loss": 2.998,
"step": 1019
},
{
"epoch": 0.09294696555494805,
"grad_norm": 1.7817779779434204,
"learning_rate": 4.894174054888912e-05,
"loss": 2.9691,
"step": 1020
},
{
"epoch": 0.09303809003098232,
"grad_norm": 2.4242284297943115,
"learning_rate": 4.893967931417078e-05,
"loss": 3.228,
"step": 1021
},
{
"epoch": 0.09312921450701658,
"grad_norm": 2.9169692993164062,
"learning_rate": 4.8937616117501414e-05,
"loss": 3.5446,
"step": 1022
},
{
"epoch": 0.09322033898305085,
"grad_norm": 3.6334569454193115,
"learning_rate": 4.893555095905014e-05,
"loss": 3.0,
"step": 1023
},
{
"epoch": 0.09331146345908511,
"grad_norm": 2.8542470932006836,
"learning_rate": 4.8933483838986184e-05,
"loss": 3.1218,
"step": 1024
},
{
"epoch": 0.09340258793511938,
"grad_norm": 2.078474283218384,
"learning_rate": 4.8931414757478954e-05,
"loss": 3.2325,
"step": 1025
},
{
"epoch": 0.09349371241115363,
"grad_norm": 3.3290367126464844,
"learning_rate": 4.8929343714698026e-05,
"loss": 3.1449,
"step": 1026
},
{
"epoch": 0.0935848368871879,
"grad_norm": 2.655738592147827,
"learning_rate": 4.892727071081314e-05,
"loss": 4.3078,
"step": 1027
},
{
"epoch": 0.09367596136322216,
"grad_norm": 2.936398983001709,
"learning_rate": 4.8925195745994165e-05,
"loss": 2.9901,
"step": 1028
},
{
"epoch": 0.09376708583925643,
"grad_norm": 1.61790931224823,
"learning_rate": 4.892311882041117e-05,
"loss": 2.9575,
"step": 1029
},
{
"epoch": 0.0938582103152907,
"grad_norm": 4.781036853790283,
"learning_rate": 4.892103993423436e-05,
"loss": 3.3626,
"step": 1030
},
{
"epoch": 0.09394933479132495,
"grad_norm": 4.162670612335205,
"learning_rate": 4.891895908763411e-05,
"loss": 3.5073,
"step": 1031
},
{
"epoch": 0.09404045926735921,
"grad_norm": 2.5747599601745605,
"learning_rate": 4.8916876280780946e-05,
"loss": 2.6645,
"step": 1032
},
{
"epoch": 0.09413158374339348,
"grad_norm": 2.0610013008117676,
"learning_rate": 4.8914791513845575e-05,
"loss": 2.8624,
"step": 1033
},
{
"epoch": 0.09422270821942774,
"grad_norm": 1.6727491617202759,
"learning_rate": 4.8912704786998844e-05,
"loss": 3.0615,
"step": 1034
},
{
"epoch": 0.09431383269546201,
"grad_norm": 2.5787103176116943,
"learning_rate": 4.8910616100411774e-05,
"loss": 3.1513,
"step": 1035
},
{
"epoch": 0.09440495717149626,
"grad_norm": 2.7966387271881104,
"learning_rate": 4.890852545425553e-05,
"loss": 3.2184,
"step": 1036
},
{
"epoch": 0.09449608164753052,
"grad_norm": 3.2339022159576416,
"learning_rate": 4.8906432848701464e-05,
"loss": 3.219,
"step": 1037
},
{
"epoch": 0.09458720612356479,
"grad_norm": 3.6414124965667725,
"learning_rate": 4.8904338283921056e-05,
"loss": 3.0272,
"step": 1038
},
{
"epoch": 0.09467833059959906,
"grad_norm": 1.915804147720337,
"learning_rate": 4.890224176008598e-05,
"loss": 3.0635,
"step": 1039
},
{
"epoch": 0.09476945507563332,
"grad_norm": 1.533538579940796,
"learning_rate": 4.890014327736804e-05,
"loss": 3.1068,
"step": 1040
},
{
"epoch": 0.09486057955166757,
"grad_norm": 4.12912130355835,
"learning_rate": 4.889804283593923e-05,
"loss": 3.1286,
"step": 1041
},
{
"epoch": 0.09495170402770184,
"grad_norm": 3.413926362991333,
"learning_rate": 4.889594043597168e-05,
"loss": 4.3663,
"step": 1042
},
{
"epoch": 0.0950428285037361,
"grad_norm": 3.632355213165283,
"learning_rate": 4.8893836077637686e-05,
"loss": 3.6121,
"step": 1043
},
{
"epoch": 0.09513395297977037,
"grad_norm": 3.041640043258667,
"learning_rate": 4.8891729761109726e-05,
"loss": 3.0379,
"step": 1044
},
{
"epoch": 0.09522507745580464,
"grad_norm": 2.1716883182525635,
"learning_rate": 4.88896214865604e-05,
"loss": 3.0752,
"step": 1045
},
{
"epoch": 0.09531620193183889,
"grad_norm": 1.7440366744995117,
"learning_rate": 4.88875112541625e-05,
"loss": 2.9915,
"step": 1046
},
{
"epoch": 0.09540732640787315,
"grad_norm": 2.8066303730010986,
"learning_rate": 4.888539906408897e-05,
"loss": 3.2162,
"step": 1047
},
{
"epoch": 0.09549845088390742,
"grad_norm": 2.6546630859375,
"learning_rate": 4.888328491651291e-05,
"loss": 3.1054,
"step": 1048
},
{
"epoch": 0.09558957535994168,
"grad_norm": 2.986856460571289,
"learning_rate": 4.888116881160757e-05,
"loss": 3.1334,
"step": 1049
},
{
"epoch": 0.09568069983597595,
"grad_norm": 4.76503324508667,
"learning_rate": 4.8879050749546395e-05,
"loss": 3.461,
"step": 1050
},
{
"epoch": 0.0957718243120102,
"grad_norm": 1.329960823059082,
"learning_rate": 4.8876930730502954e-05,
"loss": 3.0356,
"step": 1051
},
{
"epoch": 0.09586294878804447,
"grad_norm": 2.466423511505127,
"learning_rate": 4.887480875465099e-05,
"loss": 3.3193,
"step": 1052
},
{
"epoch": 0.09595407326407873,
"grad_norm": 3.3241379261016846,
"learning_rate": 4.887268482216442e-05,
"loss": 3.4059,
"step": 1053
},
{
"epoch": 0.096045197740113,
"grad_norm": 2.425245523452759,
"learning_rate": 4.88705589332173e-05,
"loss": 3.0792,
"step": 1054
},
{
"epoch": 0.09613632221614726,
"grad_norm": 2.820553779602051,
"learning_rate": 4.886843108798386e-05,
"loss": 3.1892,
"step": 1055
},
{
"epoch": 0.09622744669218151,
"grad_norm": 2.662749767303467,
"learning_rate": 4.886630128663847e-05,
"loss": 3.4359,
"step": 1056
},
{
"epoch": 0.09631857116821578,
"grad_norm": 3.7689478397369385,
"learning_rate": 4.8864169529355694e-05,
"loss": 3.3501,
"step": 1057
},
{
"epoch": 0.09640969564425005,
"grad_norm": 2.9950053691864014,
"learning_rate": 4.8862035816310225e-05,
"loss": 3.0473,
"step": 1058
},
{
"epoch": 0.09650082012028431,
"grad_norm": 3.828263521194458,
"learning_rate": 4.885990014767694e-05,
"loss": 3.7086,
"step": 1059
},
{
"epoch": 0.09659194459631858,
"grad_norm": 2.4364869594573975,
"learning_rate": 4.885776252363086e-05,
"loss": 4.1158,
"step": 1060
},
{
"epoch": 0.09668306907235283,
"grad_norm": 3.0693185329437256,
"learning_rate": 4.8855622944347174e-05,
"loss": 4.4396,
"step": 1061
},
{
"epoch": 0.0967741935483871,
"grad_norm": 2.158339738845825,
"learning_rate": 4.885348141000122e-05,
"loss": 3.1364,
"step": 1062
},
{
"epoch": 0.09686531802442136,
"grad_norm": 3.3291866779327393,
"learning_rate": 4.885133792076852e-05,
"loss": 3.4187,
"step": 1063
},
{
"epoch": 0.09695644250045563,
"grad_norm": 3.016261100769043,
"learning_rate": 4.884919247682473e-05,
"loss": 3.4883,
"step": 1064
},
{
"epoch": 0.09704756697648989,
"grad_norm": 1.6200766563415527,
"learning_rate": 4.8847045078345674e-05,
"loss": 3.0487,
"step": 1065
},
{
"epoch": 0.09713869145252414,
"grad_norm": 2.354325771331787,
"learning_rate": 4.884489572550736e-05,
"loss": 3.2557,
"step": 1066
},
{
"epoch": 0.09722981592855841,
"grad_norm": 4.061933994293213,
"learning_rate": 4.884274441848592e-05,
"loss": 3.4442,
"step": 1067
},
{
"epoch": 0.09732094040459267,
"grad_norm": 4.645877838134766,
"learning_rate": 4.884059115745766e-05,
"loss": 3.0568,
"step": 1068
},
{
"epoch": 0.09741206488062694,
"grad_norm": 1.73179292678833,
"learning_rate": 4.883843594259905e-05,
"loss": 3.125,
"step": 1069
},
{
"epoch": 0.0975031893566612,
"grad_norm": 1.4215937852859497,
"learning_rate": 4.883627877408673e-05,
"loss": 2.8963,
"step": 1070
},
{
"epoch": 0.09759431383269546,
"grad_norm": 3.7663443088531494,
"learning_rate": 4.8834119652097475e-05,
"loss": 4.3777,
"step": 1071
},
{
"epoch": 0.09768543830872972,
"grad_norm": 1.6245098114013672,
"learning_rate": 4.883195857680824e-05,
"loss": 3.0648,
"step": 1072
},
{
"epoch": 0.09777656278476399,
"grad_norm": 3.0550179481506348,
"learning_rate": 4.882979554839613e-05,
"loss": 3.2421,
"step": 1073
},
{
"epoch": 0.09786768726079825,
"grad_norm": 3.2408952713012695,
"learning_rate": 4.8827630567038416e-05,
"loss": 2.8971,
"step": 1074
},
{
"epoch": 0.09795881173683252,
"grad_norm": 7.056894302368164,
"learning_rate": 4.882546363291253e-05,
"loss": 3.2447,
"step": 1075
},
{
"epoch": 0.09804993621286677,
"grad_norm": 4.1067914962768555,
"learning_rate": 4.882329474619606e-05,
"loss": 3.3844,
"step": 1076
},
{
"epoch": 0.09814106068890104,
"grad_norm": 2.7459664344787598,
"learning_rate": 4.882112390706675e-05,
"loss": 2.8364,
"step": 1077
},
{
"epoch": 0.0982321851649353,
"grad_norm": 1.7303998470306396,
"learning_rate": 4.8818951115702506e-05,
"loss": 3.1362,
"step": 1078
},
{
"epoch": 0.09832330964096957,
"grad_norm": 2.4760732650756836,
"learning_rate": 4.88167763722814e-05,
"loss": 3.5898,
"step": 1079
},
{
"epoch": 0.09841443411700383,
"grad_norm": 1.439005970954895,
"learning_rate": 4.8814599676981667e-05,
"loss": 3.0291,
"step": 1080
},
{
"epoch": 0.09850555859303808,
"grad_norm": 3.3156778812408447,
"learning_rate": 4.881242102998169e-05,
"loss": 2.9964,
"step": 1081
},
{
"epoch": 0.09859668306907235,
"grad_norm": 2.405925750732422,
"learning_rate": 4.881024043146002e-05,
"loss": 3.549,
"step": 1082
},
{
"epoch": 0.09868780754510662,
"grad_norm": 1.8694865703582764,
"learning_rate": 4.880805788159537e-05,
"loss": 3.3145,
"step": 1083
},
{
"epoch": 0.09877893202114088,
"grad_norm": 3.397982358932495,
"learning_rate": 4.880587338056659e-05,
"loss": 3.1483,
"step": 1084
},
{
"epoch": 0.09887005649717515,
"grad_norm": 3.9848830699920654,
"learning_rate": 4.8803686928552736e-05,
"loss": 3.3601,
"step": 1085
},
{
"epoch": 0.0989611809732094,
"grad_norm": 2.076350688934326,
"learning_rate": 4.880149852573297e-05,
"loss": 2.9707,
"step": 1086
},
{
"epoch": 0.09905230544924366,
"grad_norm": 4.586529731750488,
"learning_rate": 4.8799308172286665e-05,
"loss": 3.0142,
"step": 1087
},
{
"epoch": 0.09914342992527793,
"grad_norm": 2.224879503250122,
"learning_rate": 4.8797115868393304e-05,
"loss": 3.2586,
"step": 1088
},
{
"epoch": 0.0992345544013122,
"grad_norm": 2.13420033454895,
"learning_rate": 4.879492161423257e-05,
"loss": 3.3615,
"step": 1089
},
{
"epoch": 0.09932567887734646,
"grad_norm": 2.3102781772613525,
"learning_rate": 4.8792725409984295e-05,
"loss": 3.2469,
"step": 1090
},
{
"epoch": 0.09941680335338071,
"grad_norm": 2.7327070236206055,
"learning_rate": 4.8790527255828453e-05,
"loss": 3.0008,
"step": 1091
},
{
"epoch": 0.09950792782941498,
"grad_norm": 4.016688823699951,
"learning_rate": 4.8788327151945204e-05,
"loss": 3.3525,
"step": 1092
},
{
"epoch": 0.09959905230544924,
"grad_norm": 2.56805682182312,
"learning_rate": 4.878612509851484e-05,
"loss": 2.7244,
"step": 1093
},
{
"epoch": 0.09969017678148351,
"grad_norm": 2.7914106845855713,
"learning_rate": 4.878392109571784e-05,
"loss": 2.9815,
"step": 1094
},
{
"epoch": 0.09978130125751777,
"grad_norm": 1.5488040447235107,
"learning_rate": 4.878171514373483e-05,
"loss": 2.9438,
"step": 1095
},
{
"epoch": 0.09987242573355203,
"grad_norm": 2.1465392112731934,
"learning_rate": 4.87795072427466e-05,
"loss": 3.2417,
"step": 1096
},
{
"epoch": 0.09996355020958629,
"grad_norm": 2.0856845378875732,
"learning_rate": 4.877729739293409e-05,
"loss": 2.9681,
"step": 1097
},
{
"epoch": 0.10005467468562056,
"grad_norm": 2.8549904823303223,
"learning_rate": 4.87750855944784e-05,
"loss": 4.3383,
"step": 1098
},
{
"epoch": 0.10014579916165482,
"grad_norm": 3.344149351119995,
"learning_rate": 4.87728718475608e-05,
"loss": 2.9673,
"step": 1099
},
{
"epoch": 0.10023692363768909,
"grad_norm": 3.0725624561309814,
"learning_rate": 4.877065615236272e-05,
"loss": 3.4142,
"step": 1100
},
{
"epoch": 0.10032804811372334,
"grad_norm": 3.4187722206115723,
"learning_rate": 4.876843850906574e-05,
"loss": 3.392,
"step": 1101
},
{
"epoch": 0.1004191725897576,
"grad_norm": 1.4068093299865723,
"learning_rate": 4.8766218917851614e-05,
"loss": 2.9884,
"step": 1102
},
{
"epoch": 0.10051029706579187,
"grad_norm": 1.7465176582336426,
"learning_rate": 4.876399737890223e-05,
"loss": 3.1005,
"step": 1103
},
{
"epoch": 0.10060142154182614,
"grad_norm": 2.4753262996673584,
"learning_rate": 4.876177389239967e-05,
"loss": 3.3825,
"step": 1104
},
{
"epoch": 0.1006925460178604,
"grad_norm": 2.6038968563079834,
"learning_rate": 4.8759548458526145e-05,
"loss": 3.4349,
"step": 1105
},
{
"epoch": 0.10078367049389467,
"grad_norm": 3.703859806060791,
"learning_rate": 4.8757321077464035e-05,
"loss": 3.128,
"step": 1106
},
{
"epoch": 0.10087479496992892,
"grad_norm": 2.0796546936035156,
"learning_rate": 4.87550917493959e-05,
"loss": 3.1497,
"step": 1107
},
{
"epoch": 0.10096591944596318,
"grad_norm": 1.381535291671753,
"learning_rate": 4.8752860474504424e-05,
"loss": 2.9456,
"step": 1108
},
{
"epoch": 0.10105704392199745,
"grad_norm": 1.740310549736023,
"learning_rate": 4.875062725297248e-05,
"loss": 3.454,
"step": 1109
},
{
"epoch": 0.10114816839803172,
"grad_norm": 2.3114092350006104,
"learning_rate": 4.874839208498309e-05,
"loss": 3.149,
"step": 1110
},
{
"epoch": 0.10123929287406598,
"grad_norm": 2.582498788833618,
"learning_rate": 4.8746154970719414e-05,
"loss": 3.0988,
"step": 1111
},
{
"epoch": 0.10133041735010023,
"grad_norm": 1.5047119855880737,
"learning_rate": 4.874391591036482e-05,
"loss": 3.0241,
"step": 1112
},
{
"epoch": 0.1014215418261345,
"grad_norm": 2.3863258361816406,
"learning_rate": 4.87416749041028e-05,
"loss": 3.3233,
"step": 1113
},
{
"epoch": 0.10151266630216876,
"grad_norm": 2.8434104919433594,
"learning_rate": 4.8739431952117e-05,
"loss": 3.3524,
"step": 1114
},
{
"epoch": 0.10160379077820303,
"grad_norm": 2.5264041423797607,
"learning_rate": 4.8737187054591256e-05,
"loss": 3.2051,
"step": 1115
},
{
"epoch": 0.1016949152542373,
"grad_norm": 1.7804640531539917,
"learning_rate": 4.873494021170953e-05,
"loss": 2.987,
"step": 1116
},
{
"epoch": 0.10178603973027155,
"grad_norm": 1.650615930557251,
"learning_rate": 4.873269142365598e-05,
"loss": 2.9908,
"step": 1117
},
{
"epoch": 0.10187716420630581,
"grad_norm": 1.6238685846328735,
"learning_rate": 4.873044069061489e-05,
"loss": 3.0443,
"step": 1118
},
{
"epoch": 0.10196828868234008,
"grad_norm": 2.299797773361206,
"learning_rate": 4.87281880127707e-05,
"loss": 3.355,
"step": 1119
},
{
"epoch": 0.10205941315837434,
"grad_norm": 1.3027498722076416,
"learning_rate": 4.872593339030806e-05,
"loss": 2.98,
"step": 1120
},
{
"epoch": 0.10215053763440861,
"grad_norm": 1.9283638000488281,
"learning_rate": 4.872367682341173e-05,
"loss": 3.3087,
"step": 1121
},
{
"epoch": 0.10224166211044286,
"grad_norm": 3.2081453800201416,
"learning_rate": 4.872141831226664e-05,
"loss": 3.0544,
"step": 1122
},
{
"epoch": 0.10233278658647713,
"grad_norm": 3.2388205528259277,
"learning_rate": 4.871915785705788e-05,
"loss": 3.2673,
"step": 1123
},
{
"epoch": 0.10242391106251139,
"grad_norm": 2.2480404376983643,
"learning_rate": 4.871689545797072e-05,
"loss": 3.2273,
"step": 1124
},
{
"epoch": 0.10251503553854566,
"grad_norm": 3.065657615661621,
"learning_rate": 4.871463111519056e-05,
"loss": 3.3631,
"step": 1125
},
{
"epoch": 0.10260616001457992,
"grad_norm": 2.148409843444824,
"learning_rate": 4.8712364828902965e-05,
"loss": 2.8683,
"step": 1126
},
{
"epoch": 0.10269728449061417,
"grad_norm": 2.8003501892089844,
"learning_rate": 4.8710096599293695e-05,
"loss": 3.2381,
"step": 1127
},
{
"epoch": 0.10278840896664844,
"grad_norm": 2.558243989944458,
"learning_rate": 4.870782642654861e-05,
"loss": 3.3222,
"step": 1128
},
{
"epoch": 0.1028795334426827,
"grad_norm": 4.242537975311279,
"learning_rate": 4.870555431085377e-05,
"loss": 3.296,
"step": 1129
},
{
"epoch": 0.10297065791871697,
"grad_norm": 2.546668291091919,
"learning_rate": 4.8703280252395385e-05,
"loss": 3.1536,
"step": 1130
},
{
"epoch": 0.10306178239475124,
"grad_norm": 1.5829936265945435,
"learning_rate": 4.870100425135982e-05,
"loss": 2.9867,
"step": 1131
},
{
"epoch": 0.10315290687078549,
"grad_norm": 2.930858850479126,
"learning_rate": 4.869872630793361e-05,
"loss": 3.2111,
"step": 1132
},
{
"epoch": 0.10324403134681975,
"grad_norm": 2.89884877204895,
"learning_rate": 4.869644642230343e-05,
"loss": 3.1203,
"step": 1133
},
{
"epoch": 0.10333515582285402,
"grad_norm": 4.641218662261963,
"learning_rate": 4.869416459465615e-05,
"loss": 3.4105,
"step": 1134
},
{
"epoch": 0.10342628029888828,
"grad_norm": 3.085787534713745,
"learning_rate": 4.869188082517874e-05,
"loss": 3.2745,
"step": 1135
},
{
"epoch": 0.10351740477492255,
"grad_norm": 1.8495999574661255,
"learning_rate": 4.8689595114058375e-05,
"loss": 3.032,
"step": 1136
},
{
"epoch": 0.1036085292509568,
"grad_norm": 1.8329887390136719,
"learning_rate": 4.86873074614824e-05,
"loss": 3.0344,
"step": 1137
},
{
"epoch": 0.10369965372699107,
"grad_norm": 1.369670033454895,
"learning_rate": 4.868501786763827e-05,
"loss": 3.0423,
"step": 1138
},
{
"epoch": 0.10379077820302533,
"grad_norm": 3.01965594291687,
"learning_rate": 4.868272633271363e-05,
"loss": 3.0057,
"step": 1139
},
{
"epoch": 0.1038819026790596,
"grad_norm": 3.3625049591064453,
"learning_rate": 4.868043285689631e-05,
"loss": 3.0685,
"step": 1140
},
{
"epoch": 0.10397302715509386,
"grad_norm": 2.6331353187561035,
"learning_rate": 4.867813744037423e-05,
"loss": 3.3065,
"step": 1141
},
{
"epoch": 0.10406415163112812,
"grad_norm": 2.193513870239258,
"learning_rate": 4.867584008333553e-05,
"loss": 3.1842,
"step": 1142
},
{
"epoch": 0.10415527610716238,
"grad_norm": 3.555842638015747,
"learning_rate": 4.867354078596848e-05,
"loss": 3.3865,
"step": 1143
},
{
"epoch": 0.10424640058319665,
"grad_norm": 1.3614274263381958,
"learning_rate": 4.867123954846152e-05,
"loss": 3.1375,
"step": 1144
},
{
"epoch": 0.10433752505923091,
"grad_norm": 2.8560686111450195,
"learning_rate": 4.8668936371003246e-05,
"loss": 3.347,
"step": 1145
},
{
"epoch": 0.10442864953526518,
"grad_norm": 3.2532799243927,
"learning_rate": 4.8666631253782405e-05,
"loss": 3.5448,
"step": 1146
},
{
"epoch": 0.10451977401129943,
"grad_norm": 2.5030109882354736,
"learning_rate": 4.866432419698792e-05,
"loss": 3.1628,
"step": 1147
},
{
"epoch": 0.1046108984873337,
"grad_norm": 2.1041152477264404,
"learning_rate": 4.866201520080886e-05,
"loss": 3.211,
"step": 1148
},
{
"epoch": 0.10470202296336796,
"grad_norm": 3.0748095512390137,
"learning_rate": 4.8659704265434466e-05,
"loss": 3.3296,
"step": 1149
},
{
"epoch": 0.10479314743940223,
"grad_norm": 1.9824806451797485,
"learning_rate": 4.865739139105411e-05,
"loss": 3.087,
"step": 1150
},
{
"epoch": 0.10488427191543649,
"grad_norm": 2.9198248386383057,
"learning_rate": 4.8655076577857344e-05,
"loss": 2.9909,
"step": 1151
},
{
"epoch": 0.10497539639147074,
"grad_norm": 3.1498401165008545,
"learning_rate": 4.8652759826033886e-05,
"loss": 4.2349,
"step": 1152
},
{
"epoch": 0.10506652086750501,
"grad_norm": 2.7998111248016357,
"learning_rate": 4.86504411357736e-05,
"loss": 3.1282,
"step": 1153
},
{
"epoch": 0.10515764534353927,
"grad_norm": 1.2687691450119019,
"learning_rate": 4.864812050726651e-05,
"loss": 2.9539,
"step": 1154
},
{
"epoch": 0.10524876981957354,
"grad_norm": 2.9933853149414062,
"learning_rate": 4.86457979407028e-05,
"loss": 3.2156,
"step": 1155
},
{
"epoch": 0.1053398942956078,
"grad_norm": 3.073864698410034,
"learning_rate": 4.864347343627281e-05,
"loss": 3.0966,
"step": 1156
},
{
"epoch": 0.10543101877164206,
"grad_norm": 3.451331615447998,
"learning_rate": 4.864114699416706e-05,
"loss": 3.4785,
"step": 1157
},
{
"epoch": 0.10552214324767632,
"grad_norm": 1.6930314302444458,
"learning_rate": 4.863881861457619e-05,
"loss": 3.0383,
"step": 1158
},
{
"epoch": 0.10561326772371059,
"grad_norm": 3.148261785507202,
"learning_rate": 4.8636488297691025e-05,
"loss": 2.4165,
"step": 1159
},
{
"epoch": 0.10570439219974485,
"grad_norm": 3.410334587097168,
"learning_rate": 4.863415604370255e-05,
"loss": 3.277,
"step": 1160
},
{
"epoch": 0.10579551667577912,
"grad_norm": 2.989598274230957,
"learning_rate": 4.8631821852801894e-05,
"loss": 3.1124,
"step": 1161
},
{
"epoch": 0.10588664115181337,
"grad_norm": 2.705728530883789,
"learning_rate": 4.8629485725180364e-05,
"loss": 3.4259,
"step": 1162
},
{
"epoch": 0.10597776562784764,
"grad_norm": 2.8298377990722656,
"learning_rate": 4.862714766102941e-05,
"loss": 2.6366,
"step": 1163
},
{
"epoch": 0.1060688901038819,
"grad_norm": 2.781217098236084,
"learning_rate": 4.862480766054064e-05,
"loss": 3.2281,
"step": 1164
},
{
"epoch": 0.10616001457991617,
"grad_norm": 1.5384852886199951,
"learning_rate": 4.862246572390583e-05,
"loss": 2.9956,
"step": 1165
},
{
"epoch": 0.10625113905595043,
"grad_norm": 3.5114247798919678,
"learning_rate": 4.862012185131691e-05,
"loss": 2.7165,
"step": 1166
},
{
"epoch": 0.10634226353198468,
"grad_norm": 2.069568634033203,
"learning_rate": 4.861777604296597e-05,
"loss": 3.2318,
"step": 1167
},
{
"epoch": 0.10643338800801895,
"grad_norm": 3.111557722091675,
"learning_rate": 4.8615428299045265e-05,
"loss": 3.1355,
"step": 1168
},
{
"epoch": 0.10652451248405322,
"grad_norm": 1.6879878044128418,
"learning_rate": 4.86130786197472e-05,
"loss": 3.1002,
"step": 1169
},
{
"epoch": 0.10661563696008748,
"grad_norm": 3.0851385593414307,
"learning_rate": 4.861072700526433e-05,
"loss": 4.5198,
"step": 1170
},
{
"epoch": 0.10670676143612175,
"grad_norm": 1.7297760248184204,
"learning_rate": 4.860837345578938e-05,
"loss": 3.0423,
"step": 1171
},
{
"epoch": 0.106797885912156,
"grad_norm": 2.900595188140869,
"learning_rate": 4.860601797151525e-05,
"loss": 3.3863,
"step": 1172
},
{
"epoch": 0.10688901038819026,
"grad_norm": 2.360994815826416,
"learning_rate": 4.8603660552634965e-05,
"loss": 3.3508,
"step": 1173
},
{
"epoch": 0.10698013486422453,
"grad_norm": 3.4403953552246094,
"learning_rate": 4.860130119934173e-05,
"loss": 3.4562,
"step": 1174
},
{
"epoch": 0.1070712593402588,
"grad_norm": 2.7521438598632812,
"learning_rate": 4.85989399118289e-05,
"loss": 3.066,
"step": 1175
},
{
"epoch": 0.10716238381629306,
"grad_norm": 2.927729606628418,
"learning_rate": 4.859657669029e-05,
"loss": 3.3748,
"step": 1176
},
{
"epoch": 0.10725350829232731,
"grad_norm": 2.8207826614379883,
"learning_rate": 4.859421153491869e-05,
"loss": 3.097,
"step": 1177
},
{
"epoch": 0.10734463276836158,
"grad_norm": 2.950704336166382,
"learning_rate": 4.859184444590882e-05,
"loss": 2.8803,
"step": 1178
},
{
"epoch": 0.10743575724439584,
"grad_norm": 2.3205230236053467,
"learning_rate": 4.858947542345438e-05,
"loss": 3.405,
"step": 1179
},
{
"epoch": 0.10752688172043011,
"grad_norm": 2.2538068294525146,
"learning_rate": 4.858710446774951e-05,
"loss": 3.2668,
"step": 1180
},
{
"epoch": 0.10761800619646437,
"grad_norm": 1.6739517450332642,
"learning_rate": 4.858473157898853e-05,
"loss": 3.096,
"step": 1181
},
{
"epoch": 0.10770913067249863,
"grad_norm": 2.09653377532959,
"learning_rate": 4.85823567573659e-05,
"loss": 3.1997,
"step": 1182
},
{
"epoch": 0.10780025514853289,
"grad_norm": 2.1584832668304443,
"learning_rate": 4.8579980003076245e-05,
"loss": 3.1801,
"step": 1183
},
{
"epoch": 0.10789137962456716,
"grad_norm": 3.139174461364746,
"learning_rate": 4.857760131631436e-05,
"loss": 2.9841,
"step": 1184
},
{
"epoch": 0.10798250410060142,
"grad_norm": 3.363103151321411,
"learning_rate": 4.857522069727518e-05,
"loss": 3.3184,
"step": 1185
},
{
"epoch": 0.10807362857663569,
"grad_norm": 2.7438161373138428,
"learning_rate": 4.857283814615381e-05,
"loss": 2.904,
"step": 1186
},
{
"epoch": 0.10816475305266994,
"grad_norm": 3.721757411956787,
"learning_rate": 4.8570453663145506e-05,
"loss": 2.8647,
"step": 1187
},
{
"epoch": 0.1082558775287042,
"grad_norm": 2.8211238384246826,
"learning_rate": 4.856806724844568e-05,
"loss": 3.1336,
"step": 1188
},
{
"epoch": 0.10834700200473847,
"grad_norm": 2.988065004348755,
"learning_rate": 4.856567890224992e-05,
"loss": 3.3704,
"step": 1189
},
{
"epoch": 0.10843812648077274,
"grad_norm": 2.3461434841156006,
"learning_rate": 4.856328862475396e-05,
"loss": 3.2745,
"step": 1190
},
{
"epoch": 0.108529250956807,
"grad_norm": 2.9137821197509766,
"learning_rate": 4.8560896416153684e-05,
"loss": 2.8177,
"step": 1191
},
{
"epoch": 0.10862037543284127,
"grad_norm": 2.177649974822998,
"learning_rate": 4.8558502276645146e-05,
"loss": 3.0648,
"step": 1192
},
{
"epoch": 0.10871149990887552,
"grad_norm": 4.471045970916748,
"learning_rate": 4.8556106206424556e-05,
"loss": 3.2198,
"step": 1193
},
{
"epoch": 0.10880262438490979,
"grad_norm": 1.7140840291976929,
"learning_rate": 4.855370820568829e-05,
"loss": 3.0084,
"step": 1194
},
{
"epoch": 0.10889374886094405,
"grad_norm": 1.9925346374511719,
"learning_rate": 4.855130827463285e-05,
"loss": 3.076,
"step": 1195
},
{
"epoch": 0.10898487333697832,
"grad_norm": 4.0725555419921875,
"learning_rate": 4.8548906413454944e-05,
"loss": 3.1775,
"step": 1196
},
{
"epoch": 0.10907599781301258,
"grad_norm": 4.49186897277832,
"learning_rate": 4.85465026223514e-05,
"loss": 3.0196,
"step": 1197
},
{
"epoch": 0.10916712228904683,
"grad_norm": 2.624659776687622,
"learning_rate": 4.8544096901519227e-05,
"loss": 3.3422,
"step": 1198
},
{
"epoch": 0.1092582467650811,
"grad_norm": 2.492645740509033,
"learning_rate": 4.8541689251155575e-05,
"loss": 3.1926,
"step": 1199
},
{
"epoch": 0.10934937124111536,
"grad_norm": 1.60177743434906,
"learning_rate": 4.853927967145777e-05,
"loss": 2.946,
"step": 1200
},
{
"epoch": 0.10944049571714963,
"grad_norm": 2.5063915252685547,
"learning_rate": 4.853686816262327e-05,
"loss": 3.1408,
"step": 1201
},
{
"epoch": 0.1095316201931839,
"grad_norm": 8.176122665405273,
"learning_rate": 4.8534454724849734e-05,
"loss": 3.2513,
"step": 1202
},
{
"epoch": 0.10962274466921815,
"grad_norm": 2.628269910812378,
"learning_rate": 4.853203935833493e-05,
"loss": 3.1195,
"step": 1203
},
{
"epoch": 0.10971386914525241,
"grad_norm": 2.0619876384735107,
"learning_rate": 4.8529622063276814e-05,
"loss": 3.3001,
"step": 1204
},
{
"epoch": 0.10980499362128668,
"grad_norm": 3.177309989929199,
"learning_rate": 4.85272028398735e-05,
"loss": 3.09,
"step": 1205
},
{
"epoch": 0.10989611809732094,
"grad_norm": 2.6613833904266357,
"learning_rate": 4.852478168832323e-05,
"loss": 3.297,
"step": 1206
},
{
"epoch": 0.10998724257335521,
"grad_norm": 2.257078170776367,
"learning_rate": 4.852235860882446e-05,
"loss": 2.2332,
"step": 1207
},
{
"epoch": 0.11007836704938946,
"grad_norm": 3.8638627529144287,
"learning_rate": 4.851993360157575e-05,
"loss": 2.7631,
"step": 1208
},
{
"epoch": 0.11016949152542373,
"grad_norm": 2.2826056480407715,
"learning_rate": 4.851750666677584e-05,
"loss": 3.3047,
"step": 1209
},
{
"epoch": 0.11026061600145799,
"grad_norm": 4.1844706535339355,
"learning_rate": 4.851507780462362e-05,
"loss": 3.2165,
"step": 1210
},
{
"epoch": 0.11035174047749226,
"grad_norm": 2.7843410968780518,
"learning_rate": 4.8512647015318166e-05,
"loss": 2.7579,
"step": 1211
},
{
"epoch": 0.11044286495352652,
"grad_norm": 1.784948706626892,
"learning_rate": 4.851021429905868e-05,
"loss": 3.0934,
"step": 1212
},
{
"epoch": 0.11053398942956077,
"grad_norm": 2.8713064193725586,
"learning_rate": 4.850777965604453e-05,
"loss": 2.8885,
"step": 1213
},
{
"epoch": 0.11062511390559504,
"grad_norm": 2.985374927520752,
"learning_rate": 4.850534308647524e-05,
"loss": 3.232,
"step": 1214
},
{
"epoch": 0.1107162383816293,
"grad_norm": 1.9292786121368408,
"learning_rate": 4.8502904590550514e-05,
"loss": 3.1795,
"step": 1215
},
{
"epoch": 0.11080736285766357,
"grad_norm": 3.1823911666870117,
"learning_rate": 4.850046416847018e-05,
"loss": 3.2059,
"step": 1216
},
{
"epoch": 0.11089848733369784,
"grad_norm": 3.464905023574829,
"learning_rate": 4.849802182043425e-05,
"loss": 2.51,
"step": 1217
},
{
"epoch": 0.11098961180973209,
"grad_norm": 1.3138779401779175,
"learning_rate": 4.8495577546642864e-05,
"loss": 2.9226,
"step": 1218
},
{
"epoch": 0.11108073628576635,
"grad_norm": 2.682544469833374,
"learning_rate": 4.849313134729637e-05,
"loss": 3.2597,
"step": 1219
},
{
"epoch": 0.11117186076180062,
"grad_norm": 1.8260408639907837,
"learning_rate": 4.8490683222595224e-05,
"loss": 2.9877,
"step": 1220
},
{
"epoch": 0.11126298523783489,
"grad_norm": 2.5833330154418945,
"learning_rate": 4.848823317274007e-05,
"loss": 3.2531,
"step": 1221
},
{
"epoch": 0.11135410971386915,
"grad_norm": 3.7257158756256104,
"learning_rate": 4.848578119793169e-05,
"loss": 3.3707,
"step": 1222
},
{
"epoch": 0.1114452341899034,
"grad_norm": 2.2442314624786377,
"learning_rate": 4.848332729837103e-05,
"loss": 3.151,
"step": 1223
},
{
"epoch": 0.11153635866593767,
"grad_norm": 2.0687997341156006,
"learning_rate": 4.8480871474259215e-05,
"loss": 3.2912,
"step": 1224
},
{
"epoch": 0.11162748314197193,
"grad_norm": 2.783472776412964,
"learning_rate": 4.847841372579749e-05,
"loss": 2.5138,
"step": 1225
},
{
"epoch": 0.1117186076180062,
"grad_norm": 2.2932865619659424,
"learning_rate": 4.847595405318729e-05,
"loss": 2.9144,
"step": 1226
},
{
"epoch": 0.11180973209404046,
"grad_norm": 2.7115135192871094,
"learning_rate": 4.847349245663019e-05,
"loss": 2.9571,
"step": 1227
},
{
"epoch": 0.11190085657007472,
"grad_norm": 2.5347988605499268,
"learning_rate": 4.847102893632792e-05,
"loss": 3.4235,
"step": 1228
},
{
"epoch": 0.11199198104610898,
"grad_norm": 2.7291646003723145,
"learning_rate": 4.8468563492482395e-05,
"loss": 3.1117,
"step": 1229
},
{
"epoch": 0.11208310552214325,
"grad_norm": 2.545180320739746,
"learning_rate": 4.8466096125295644e-05,
"loss": 3.1981,
"step": 1230
},
{
"epoch": 0.11217422999817751,
"grad_norm": 1.7681077718734741,
"learning_rate": 4.84636268349699e-05,
"loss": 3.0884,
"step": 1231
},
{
"epoch": 0.11226535447421178,
"grad_norm": 2.8129305839538574,
"learning_rate": 4.846115562170751e-05,
"loss": 3.1274,
"step": 1232
},
{
"epoch": 0.11235647895024603,
"grad_norm": 3.1257529258728027,
"learning_rate": 4.8458682485711014e-05,
"loss": 3.1307,
"step": 1233
},
{
"epoch": 0.1124476034262803,
"grad_norm": 3.14485502243042,
"learning_rate": 4.8456207427183094e-05,
"loss": 3.0517,
"step": 1234
},
{
"epoch": 0.11253872790231456,
"grad_norm": 2.2514820098876953,
"learning_rate": 4.8453730446326585e-05,
"loss": 3.3268,
"step": 1235
},
{
"epoch": 0.11262985237834883,
"grad_norm": 1.3902634382247925,
"learning_rate": 4.845125154334449e-05,
"loss": 3.0041,
"step": 1236
},
{
"epoch": 0.11272097685438309,
"grad_norm": 1.5997536182403564,
"learning_rate": 4.844877071843996e-05,
"loss": 3.08,
"step": 1237
},
{
"epoch": 0.11281210133041734,
"grad_norm": 3.2128071784973145,
"learning_rate": 4.8446287971816305e-05,
"loss": 3.4485,
"step": 1238
},
{
"epoch": 0.11290322580645161,
"grad_norm": 2.0368711948394775,
"learning_rate": 4.844380330367701e-05,
"loss": 2.9709,
"step": 1239
},
{
"epoch": 0.11299435028248588,
"grad_norm": 3.5499250888824463,
"learning_rate": 4.84413167142257e-05,
"loss": 3.1674,
"step": 1240
},
{
"epoch": 0.11308547475852014,
"grad_norm": 2.089275598526001,
"learning_rate": 4.8438828203666156e-05,
"loss": 3.0767,
"step": 1241
},
{
"epoch": 0.1131765992345544,
"grad_norm": 2.063582420349121,
"learning_rate": 4.843633777220231e-05,
"loss": 3.0162,
"step": 1242
},
{
"epoch": 0.11326772371058866,
"grad_norm": 1.9222890138626099,
"learning_rate": 4.843384542003828e-05,
"loss": 3.0367,
"step": 1243
},
{
"epoch": 0.11335884818662292,
"grad_norm": 1.74138343334198,
"learning_rate": 4.843135114737832e-05,
"loss": 3.1665,
"step": 1244
},
{
"epoch": 0.11344997266265719,
"grad_norm": 1.58941650390625,
"learning_rate": 4.8428854954426846e-05,
"loss": 3.1407,
"step": 1245
},
{
"epoch": 0.11354109713869145,
"grad_norm": 1.7341008186340332,
"learning_rate": 4.842635684138843e-05,
"loss": 3.0836,
"step": 1246
},
{
"epoch": 0.11363222161472572,
"grad_norm": 2.3776652812957764,
"learning_rate": 4.84238568084678e-05,
"loss": 3.0876,
"step": 1247
},
{
"epoch": 0.11372334609075997,
"grad_norm": 3.617215156555176,
"learning_rate": 4.842135485586983e-05,
"loss": 2.6156,
"step": 1248
},
{
"epoch": 0.11381447056679424,
"grad_norm": 3.285801410675049,
"learning_rate": 4.841885098379959e-05,
"loss": 3.1519,
"step": 1249
},
{
"epoch": 0.1139055950428285,
"grad_norm": 2.993448257446289,
"learning_rate": 4.841634519246227e-05,
"loss": 2.734,
"step": 1250
},
{
"epoch": 0.11399671951886277,
"grad_norm": 4.181899070739746,
"learning_rate": 4.841383748206324e-05,
"loss": 3.2974,
"step": 1251
},
{
"epoch": 0.11408784399489703,
"grad_norm": 2.419050455093384,
"learning_rate": 4.8411327852808e-05,
"loss": 3.2745,
"step": 1252
},
{
"epoch": 0.11417896847093129,
"grad_norm": 1.4583121538162231,
"learning_rate": 4.8408816304902235e-05,
"loss": 2.9262,
"step": 1253
},
{
"epoch": 0.11427009294696555,
"grad_norm": 2.53656268119812,
"learning_rate": 4.8406302838551765e-05,
"loss": 3.3562,
"step": 1254
},
{
"epoch": 0.11436121742299982,
"grad_norm": 2.8883700370788574,
"learning_rate": 4.840378745396259e-05,
"loss": 3.121,
"step": 1255
},
{
"epoch": 0.11445234189903408,
"grad_norm": 3.119507074356079,
"learning_rate": 4.840127015134086e-05,
"loss": 3.0548,
"step": 1256
},
{
"epoch": 0.11454346637506835,
"grad_norm": 3.061594247817993,
"learning_rate": 4.839875093089286e-05,
"loss": 3.1823,
"step": 1257
},
{
"epoch": 0.1146345908511026,
"grad_norm": 2.9661896228790283,
"learning_rate": 4.839622979282506e-05,
"loss": 3.3524,
"step": 1258
},
{
"epoch": 0.11472571532713687,
"grad_norm": 2.240601062774658,
"learning_rate": 4.8393706737344085e-05,
"loss": 3.0736,
"step": 1259
},
{
"epoch": 0.11481683980317113,
"grad_norm": 2.651048183441162,
"learning_rate": 4.8391181764656696e-05,
"loss": 3.296,
"step": 1260
},
{
"epoch": 0.1149079642792054,
"grad_norm": 1.6715929508209229,
"learning_rate": 4.838865487496983e-05,
"loss": 2.9067,
"step": 1261
},
{
"epoch": 0.11499908875523966,
"grad_norm": 2.577331304550171,
"learning_rate": 4.838612606849058e-05,
"loss": 3.1685,
"step": 1262
},
{
"epoch": 0.11509021323127391,
"grad_norm": 3.5737011432647705,
"learning_rate": 4.8383595345426184e-05,
"loss": 2.9315,
"step": 1263
},
{
"epoch": 0.11518133770730818,
"grad_norm": 4.45041036605835,
"learning_rate": 4.838106270598405e-05,
"loss": 3.2293,
"step": 1264
},
{
"epoch": 0.11527246218334244,
"grad_norm": 2.778306007385254,
"learning_rate": 4.837852815037173e-05,
"loss": 3.1879,
"step": 1265
},
{
"epoch": 0.11536358665937671,
"grad_norm": 2.894092321395874,
"learning_rate": 4.837599167879695e-05,
"loss": 3.4217,
"step": 1266
},
{
"epoch": 0.11545471113541098,
"grad_norm": 2.804297685623169,
"learning_rate": 4.837345329146758e-05,
"loss": 2.3764,
"step": 1267
},
{
"epoch": 0.11554583561144523,
"grad_norm": 1.5919743776321411,
"learning_rate": 4.837091298859165e-05,
"loss": 2.9899,
"step": 1268
},
{
"epoch": 0.11563696008747949,
"grad_norm": 2.645395278930664,
"learning_rate": 4.836837077037735e-05,
"loss": 3.2655,
"step": 1269
},
{
"epoch": 0.11572808456351376,
"grad_norm": 2.5275466442108154,
"learning_rate": 4.8365826637033024e-05,
"loss": 3.2374,
"step": 1270
},
{
"epoch": 0.11581920903954802,
"grad_norm": 1.634635329246521,
"learning_rate": 4.836328058876717e-05,
"loss": 3.035,
"step": 1271
},
{
"epoch": 0.11591033351558229,
"grad_norm": 3.141817808151245,
"learning_rate": 4.836073262578846e-05,
"loss": 3.0451,
"step": 1272
},
{
"epoch": 0.11600145799161656,
"grad_norm": 1.7054616212844849,
"learning_rate": 4.835818274830569e-05,
"loss": 3.0899,
"step": 1273
},
{
"epoch": 0.1160925824676508,
"grad_norm": 3.47708797454834,
"learning_rate": 4.835563095652785e-05,
"loss": 2.872,
"step": 1274
},
{
"epoch": 0.11618370694368507,
"grad_norm": 3.0025618076324463,
"learning_rate": 4.835307725066406e-05,
"loss": 3.2133,
"step": 1275
},
{
"epoch": 0.11627483141971934,
"grad_norm": 1.4146130084991455,
"learning_rate": 4.83505216309236e-05,
"loss": 2.9197,
"step": 1276
},
{
"epoch": 0.1163659558957536,
"grad_norm": 2.802640914916992,
"learning_rate": 4.834796409751593e-05,
"loss": 3.0783,
"step": 1277
},
{
"epoch": 0.11645708037178787,
"grad_norm": 3.4829201698303223,
"learning_rate": 4.834540465065063e-05,
"loss": 3.2377,
"step": 1278
},
{
"epoch": 0.11654820484782212,
"grad_norm": 1.5123839378356934,
"learning_rate": 4.8342843290537476e-05,
"loss": 2.9014,
"step": 1279
},
{
"epoch": 0.11663932932385639,
"grad_norm": 3.511087656021118,
"learning_rate": 4.8340280017386375e-05,
"loss": 3.5285,
"step": 1280
},
{
"epoch": 0.11673045379989065,
"grad_norm": 3.651897430419922,
"learning_rate": 4.833771483140739e-05,
"loss": 3.474,
"step": 1281
},
{
"epoch": 0.11682157827592492,
"grad_norm": 1.5524790287017822,
"learning_rate": 4.833514773281076e-05,
"loss": 2.9491,
"step": 1282
},
{
"epoch": 0.11691270275195918,
"grad_norm": 3.1398253440856934,
"learning_rate": 4.8332578721806856e-05,
"loss": 4.4306,
"step": 1283
},
{
"epoch": 0.11700382722799343,
"grad_norm": 3.3685638904571533,
"learning_rate": 4.8330007798606236e-05,
"loss": 3.0482,
"step": 1284
},
{
"epoch": 0.1170949517040277,
"grad_norm": 2.788203001022339,
"learning_rate": 4.832743496341958e-05,
"loss": 2.9866,
"step": 1285
},
{
"epoch": 0.11718607618006197,
"grad_norm": 1.323476791381836,
"learning_rate": 4.8324860216457744e-05,
"loss": 2.9145,
"step": 1286
},
{
"epoch": 0.11727720065609623,
"grad_norm": 3.488725185394287,
"learning_rate": 4.832228355793175e-05,
"loss": 3.4236,
"step": 1287
},
{
"epoch": 0.1173683251321305,
"grad_norm": 2.773366689682007,
"learning_rate": 4.831970498805275e-05,
"loss": 3.2463,
"step": 1288
},
{
"epoch": 0.11745944960816475,
"grad_norm": 2.8726413249969482,
"learning_rate": 4.8317124507032083e-05,
"loss": 3.1034,
"step": 1289
},
{
"epoch": 0.11755057408419901,
"grad_norm": 2.574613094329834,
"learning_rate": 4.831454211508122e-05,
"loss": 3.2809,
"step": 1290
},
{
"epoch": 0.11764169856023328,
"grad_norm": 4.687624454498291,
"learning_rate": 4.83119578124118e-05,
"loss": 2.8164,
"step": 1291
},
{
"epoch": 0.11773282303626754,
"grad_norm": 1.4595892429351807,
"learning_rate": 4.830937159923562e-05,
"loss": 3.0615,
"step": 1292
},
{
"epoch": 0.11782394751230181,
"grad_norm": 2.886767625808716,
"learning_rate": 4.830678347576463e-05,
"loss": 2.8481,
"step": 1293
},
{
"epoch": 0.11791507198833606,
"grad_norm": 3.3065526485443115,
"learning_rate": 4.830419344221093e-05,
"loss": 3.4463,
"step": 1294
},
{
"epoch": 0.11800619646437033,
"grad_norm": 3.140231132507324,
"learning_rate": 4.83016014987868e-05,
"loss": 2.7775,
"step": 1295
},
{
"epoch": 0.1180973209404046,
"grad_norm": 5.782069206237793,
"learning_rate": 4.829900764570464e-05,
"loss": 3.4923,
"step": 1296
},
{
"epoch": 0.11818844541643886,
"grad_norm": 3.9417202472686768,
"learning_rate": 4.8296411883177026e-05,
"loss": 3.0629,
"step": 1297
},
{
"epoch": 0.11827956989247312,
"grad_norm": 2.3533289432525635,
"learning_rate": 4.829381421141671e-05,
"loss": 3.1436,
"step": 1298
},
{
"epoch": 0.11837069436850738,
"grad_norm": 3.0032739639282227,
"learning_rate": 4.829121463063657e-05,
"loss": 3.3542,
"step": 1299
},
{
"epoch": 0.11846181884454164,
"grad_norm": 3.742929220199585,
"learning_rate": 4.828861314104966e-05,
"loss": 3.1316,
"step": 1300
},
{
"epoch": 0.11855294332057591,
"grad_norm": 2.169480800628662,
"learning_rate": 4.828600974286917e-05,
"loss": 3.1513,
"step": 1301
},
{
"epoch": 0.11864406779661017,
"grad_norm": 2.3254833221435547,
"learning_rate": 4.8283404436308464e-05,
"loss": 3.0674,
"step": 1302
},
{
"epoch": 0.11873519227264444,
"grad_norm": 3.3510477542877197,
"learning_rate": 4.828079722158105e-05,
"loss": 2.7181,
"step": 1303
},
{
"epoch": 0.11882631674867869,
"grad_norm": 2.6919658184051514,
"learning_rate": 4.8278188098900626e-05,
"loss": 3.0987,
"step": 1304
},
{
"epoch": 0.11891744122471296,
"grad_norm": 3.264378786087036,
"learning_rate": 4.827557706848099e-05,
"loss": 3.0503,
"step": 1305
},
{
"epoch": 0.11900856570074722,
"grad_norm": 2.856445550918579,
"learning_rate": 4.827296413053614e-05,
"loss": 4.3108,
"step": 1306
},
{
"epoch": 0.11909969017678149,
"grad_norm": 3.0132710933685303,
"learning_rate": 4.82703492852802e-05,
"loss": 3.0884,
"step": 1307
},
{
"epoch": 0.11919081465281575,
"grad_norm": 1.8259915113449097,
"learning_rate": 4.826773253292749e-05,
"loss": 3.1467,
"step": 1308
},
{
"epoch": 0.11928193912885,
"grad_norm": 2.925410509109497,
"learning_rate": 4.826511387369246e-05,
"loss": 3.1775,
"step": 1309
},
{
"epoch": 0.11937306360488427,
"grad_norm": 2.620035409927368,
"learning_rate": 4.826249330778971e-05,
"loss": 3.2156,
"step": 1310
},
{
"epoch": 0.11946418808091853,
"grad_norm": 3.094045400619507,
"learning_rate": 4.825987083543401e-05,
"loss": 3.187,
"step": 1311
},
{
"epoch": 0.1195553125569528,
"grad_norm": 5.414752960205078,
"learning_rate": 4.825724645684027e-05,
"loss": 3.3959,
"step": 1312
},
{
"epoch": 0.11964643703298707,
"grad_norm": 3.0916340351104736,
"learning_rate": 4.825462017222359e-05,
"loss": 2.8089,
"step": 1313
},
{
"epoch": 0.11973756150902132,
"grad_norm": 2.8586535453796387,
"learning_rate": 4.825199198179919e-05,
"loss": 3.2082,
"step": 1314
},
{
"epoch": 0.11982868598505558,
"grad_norm": 3.6413466930389404,
"learning_rate": 4.824936188578246e-05,
"loss": 3.396,
"step": 1315
},
{
"epoch": 0.11991981046108985,
"grad_norm": 1.7131729125976562,
"learning_rate": 4.824672988438895e-05,
"loss": 3.1427,
"step": 1316
},
{
"epoch": 0.12001093493712411,
"grad_norm": 2.2340872287750244,
"learning_rate": 4.824409597783438e-05,
"loss": 3.2302,
"step": 1317
},
{
"epoch": 0.12010205941315838,
"grad_norm": 2.6722652912139893,
"learning_rate": 4.8241460166334577e-05,
"loss": 2.9591,
"step": 1318
},
{
"epoch": 0.12019318388919263,
"grad_norm": 2.3413898944854736,
"learning_rate": 4.823882245010557e-05,
"loss": 3.0534,
"step": 1319
},
{
"epoch": 0.1202843083652269,
"grad_norm": 3.7826550006866455,
"learning_rate": 4.823618282936354e-05,
"loss": 4.7466,
"step": 1320
},
{
"epoch": 0.12037543284126116,
"grad_norm": 1.7024617195129395,
"learning_rate": 4.82335413043248e-05,
"loss": 3.086,
"step": 1321
},
{
"epoch": 0.12046655731729543,
"grad_norm": 1.9387747049331665,
"learning_rate": 4.8230897875205844e-05,
"loss": 3.2878,
"step": 1322
},
{
"epoch": 0.1205576817933297,
"grad_norm": 1.4321894645690918,
"learning_rate": 4.8228252542223305e-05,
"loss": 3.1001,
"step": 1323
},
{
"epoch": 0.12064880626936395,
"grad_norm": 3.9441845417022705,
"learning_rate": 4.822560530559398e-05,
"loss": 3.1041,
"step": 1324
},
{
"epoch": 0.12073993074539821,
"grad_norm": 2.0349435806274414,
"learning_rate": 4.8222956165534824e-05,
"loss": 3.0336,
"step": 1325
},
{
"epoch": 0.12083105522143248,
"grad_norm": 2.7970826625823975,
"learning_rate": 4.822030512226294e-05,
"loss": 3.1574,
"step": 1326
},
{
"epoch": 0.12092217969746674,
"grad_norm": 2.463871717453003,
"learning_rate": 4.821765217599559e-05,
"loss": 3.2166,
"step": 1327
},
{
"epoch": 0.12101330417350101,
"grad_norm": 2.5402960777282715,
"learning_rate": 4.82149973269502e-05,
"loss": 3.0926,
"step": 1328
},
{
"epoch": 0.12110442864953526,
"grad_norm": 3.7119953632354736,
"learning_rate": 4.821234057534434e-05,
"loss": 3.3525,
"step": 1329
},
{
"epoch": 0.12119555312556952,
"grad_norm": 1.9446157217025757,
"learning_rate": 4.820968192139575e-05,
"loss": 3.0707,
"step": 1330
},
{
"epoch": 0.12128667760160379,
"grad_norm": 3.0048012733459473,
"learning_rate": 4.82070213653223e-05,
"loss": 3.2723,
"step": 1331
},
{
"epoch": 0.12137780207763806,
"grad_norm": 3.404109239578247,
"learning_rate": 4.820435890734204e-05,
"loss": 4.3627,
"step": 1332
},
{
"epoch": 0.12146892655367232,
"grad_norm": 2.606018304824829,
"learning_rate": 4.820169454767318e-05,
"loss": 3.1892,
"step": 1333
},
{
"epoch": 0.12156005102970657,
"grad_norm": 3.522080183029175,
"learning_rate": 4.819902828653406e-05,
"loss": 3.3526,
"step": 1334
},
{
"epoch": 0.12165117550574084,
"grad_norm": 4.494570732116699,
"learning_rate": 4.8196360124143204e-05,
"loss": 3.1921,
"step": 1335
},
{
"epoch": 0.1217422999817751,
"grad_norm": 2.7442262172698975,
"learning_rate": 4.819369006071927e-05,
"loss": 3.2029,
"step": 1336
},
{
"epoch": 0.12183342445780937,
"grad_norm": 2.947127342224121,
"learning_rate": 4.819101809648108e-05,
"loss": 3.2069,
"step": 1337
},
{
"epoch": 0.12192454893384364,
"grad_norm": 3.326021432876587,
"learning_rate": 4.818834423164762e-05,
"loss": 3.2037,
"step": 1338
},
{
"epoch": 0.12201567340987789,
"grad_norm": 2.297687292098999,
"learning_rate": 4.818566846643801e-05,
"loss": 3.1991,
"step": 1339
},
{
"epoch": 0.12210679788591215,
"grad_norm": 3.1498403549194336,
"learning_rate": 4.8182990801071546e-05,
"loss": 3.605,
"step": 1340
},
{
"epoch": 0.12219792236194642,
"grad_norm": 2.481204032897949,
"learning_rate": 4.8180311235767684e-05,
"loss": 3.0523,
"step": 1341
},
{
"epoch": 0.12228904683798068,
"grad_norm": 1.7611827850341797,
"learning_rate": 4.817762977074601e-05,
"loss": 2.9838,
"step": 1342
},
{
"epoch": 0.12238017131401495,
"grad_norm": 2.524806261062622,
"learning_rate": 4.8174946406226286e-05,
"loss": 3.1246,
"step": 1343
},
{
"epoch": 0.1224712957900492,
"grad_norm": 1.8765568733215332,
"learning_rate": 4.817226114242843e-05,
"loss": 3.097,
"step": 1344
},
{
"epoch": 0.12256242026608347,
"grad_norm": 2.483398675918579,
"learning_rate": 4.816957397957249e-05,
"loss": 3.2328,
"step": 1345
},
{
"epoch": 0.12265354474211773,
"grad_norm": 2.1173160076141357,
"learning_rate": 4.816688491787872e-05,
"loss": 3.182,
"step": 1346
},
{
"epoch": 0.122744669218152,
"grad_norm": 3.149275779724121,
"learning_rate": 4.816419395756747e-05,
"loss": 3.2735,
"step": 1347
},
{
"epoch": 0.12283579369418626,
"grad_norm": 1.985774278640747,
"learning_rate": 4.8161501098859295e-05,
"loss": 2.91,
"step": 1348
},
{
"epoch": 0.12292691817022051,
"grad_norm": 3.5181996822357178,
"learning_rate": 4.8158806341974875e-05,
"loss": 3.3365,
"step": 1349
},
{
"epoch": 0.12301804264625478,
"grad_norm": 3.0273499488830566,
"learning_rate": 4.8156109687135064e-05,
"loss": 3.1726,
"step": 1350
},
{
"epoch": 0.12310916712228905,
"grad_norm": 2.0603082180023193,
"learning_rate": 4.8153411134560856e-05,
"loss": 3.2063,
"step": 1351
},
{
"epoch": 0.12320029159832331,
"grad_norm": 2.9190120697021484,
"learning_rate": 4.8150710684473407e-05,
"loss": 3.339,
"step": 1352
},
{
"epoch": 0.12329141607435758,
"grad_norm": 1.935994029045105,
"learning_rate": 4.814800833709403e-05,
"loss": 3.0274,
"step": 1353
},
{
"epoch": 0.12338254055039184,
"grad_norm": 2.194535493850708,
"learning_rate": 4.814530409264421e-05,
"loss": 2.8919,
"step": 1354
},
{
"epoch": 0.1234736650264261,
"grad_norm": 2.7491567134857178,
"learning_rate": 4.814259795134555e-05,
"loss": 3.4111,
"step": 1355
},
{
"epoch": 0.12356478950246036,
"grad_norm": 2.234680414199829,
"learning_rate": 4.8139889913419825e-05,
"loss": 3.2018,
"step": 1356
},
{
"epoch": 0.12365591397849462,
"grad_norm": 2.1068971157073975,
"learning_rate": 4.8137179979088995e-05,
"loss": 2.811,
"step": 1357
},
{
"epoch": 0.12374703845452889,
"grad_norm": 2.9462037086486816,
"learning_rate": 4.8134468148575126e-05,
"loss": 3.0475,
"step": 1358
},
{
"epoch": 0.12383816293056316,
"grad_norm": 2.1699352264404297,
"learning_rate": 4.813175442210047e-05,
"loss": 2.9973,
"step": 1359
},
{
"epoch": 0.12392928740659741,
"grad_norm": 2.727155923843384,
"learning_rate": 4.8129038799887436e-05,
"loss": 3.1522,
"step": 1360
},
{
"epoch": 0.12402041188263167,
"grad_norm": 3.1219122409820557,
"learning_rate": 4.812632128215857e-05,
"loss": 3.1755,
"step": 1361
},
{
"epoch": 0.12411153635866594,
"grad_norm": 1.593032956123352,
"learning_rate": 4.8123601869136594e-05,
"loss": 3.0699,
"step": 1362
},
{
"epoch": 0.1242026608347002,
"grad_norm": 3.5900216102600098,
"learning_rate": 4.8120880561044355e-05,
"loss": 2.9064,
"step": 1363
},
{
"epoch": 0.12429378531073447,
"grad_norm": 3.540292739868164,
"learning_rate": 4.81181573581049e-05,
"loss": 3.166,
"step": 1364
},
{
"epoch": 0.12438490978676872,
"grad_norm": 2.823420763015747,
"learning_rate": 4.811543226054138e-05,
"loss": 2.9087,
"step": 1365
},
{
"epoch": 0.12447603426280299,
"grad_norm": 2.7216222286224365,
"learning_rate": 4.811270526857715e-05,
"loss": 3.2342,
"step": 1366
},
{
"epoch": 0.12456715873883725,
"grad_norm": 1.7820650339126587,
"learning_rate": 4.810997638243569e-05,
"loss": 2.9718,
"step": 1367
},
{
"epoch": 0.12465828321487152,
"grad_norm": 3.528695821762085,
"learning_rate": 4.8107245602340635e-05,
"loss": 3.2545,
"step": 1368
},
{
"epoch": 0.12474940769090578,
"grad_norm": 2.6391193866729736,
"learning_rate": 4.8104512928515795e-05,
"loss": 3.3742,
"step": 1369
},
{
"epoch": 0.12484053216694004,
"grad_norm": 2.5269813537597656,
"learning_rate": 4.8101778361185115e-05,
"loss": 2.9321,
"step": 1370
},
{
"epoch": 0.1249316566429743,
"grad_norm": 1.620867371559143,
"learning_rate": 4.809904190057271e-05,
"loss": 2.9661,
"step": 1371
},
{
"epoch": 0.12502278111900855,
"grad_norm": 3.2925851345062256,
"learning_rate": 4.809630354690284e-05,
"loss": 3.0233,
"step": 1372
},
{
"epoch": 0.12511390559504282,
"grad_norm": 1.9607048034667969,
"learning_rate": 4.809356330039992e-05,
"loss": 3.0341,
"step": 1373
},
{
"epoch": 0.12520503007107708,
"grad_norm": 2.5559041500091553,
"learning_rate": 4.809082116128853e-05,
"loss": 3.0071,
"step": 1374
},
{
"epoch": 0.12529615454711135,
"grad_norm": 2.898434638977051,
"learning_rate": 4.8088077129793395e-05,
"loss": 3.0887,
"step": 1375
},
{
"epoch": 0.12538727902314561,
"grad_norm": 4.129066467285156,
"learning_rate": 4.80853312061394e-05,
"loss": 2.9477,
"step": 1376
},
{
"epoch": 0.12547840349917988,
"grad_norm": 3.0416951179504395,
"learning_rate": 4.80825833905516e-05,
"loss": 3.064,
"step": 1377
},
{
"epoch": 0.12556952797521415,
"grad_norm": 1.4269506931304932,
"learning_rate": 4.8079833683255166e-05,
"loss": 2.9104,
"step": 1378
},
{
"epoch": 0.1256606524512484,
"grad_norm": 1.795159935951233,
"learning_rate": 4.8077082084475455e-05,
"loss": 3.0528,
"step": 1379
},
{
"epoch": 0.12575177692728268,
"grad_norm": 1.8280433416366577,
"learning_rate": 4.8074328594437976e-05,
"loss": 3.0135,
"step": 1380
},
{
"epoch": 0.12584290140331694,
"grad_norm": 2.7084426879882812,
"learning_rate": 4.807157321336838e-05,
"loss": 3.3247,
"step": 1381
},
{
"epoch": 0.12593402587935118,
"grad_norm": 1.4713351726531982,
"learning_rate": 4.8068815941492493e-05,
"loss": 2.9893,
"step": 1382
},
{
"epoch": 0.12602515035538545,
"grad_norm": 1.647141456604004,
"learning_rate": 4.806605677903627e-05,
"loss": 3.0282,
"step": 1383
},
{
"epoch": 0.1261162748314197,
"grad_norm": 2.6358165740966797,
"learning_rate": 4.806329572622585e-05,
"loss": 3.0908,
"step": 1384
},
{
"epoch": 0.12620739930745398,
"grad_norm": 2.0468578338623047,
"learning_rate": 4.80605327832875e-05,
"loss": 3.0642,
"step": 1385
},
{
"epoch": 0.12629852378348824,
"grad_norm": 2.854619026184082,
"learning_rate": 4.8057767950447676e-05,
"loss": 3.3726,
"step": 1386
},
{
"epoch": 0.1263896482595225,
"grad_norm": 2.3269712924957275,
"learning_rate": 4.805500122793293e-05,
"loss": 3.0171,
"step": 1387
},
{
"epoch": 0.12648077273555677,
"grad_norm": 1.5846275091171265,
"learning_rate": 4.805223261597004e-05,
"loss": 3.0804,
"step": 1388
},
{
"epoch": 0.12657189721159104,
"grad_norm": 3.1986141204833984,
"learning_rate": 4.8049462114785884e-05,
"loss": 3.3668,
"step": 1389
},
{
"epoch": 0.1266630216876253,
"grad_norm": 3.514010429382324,
"learning_rate": 4.804668972460752e-05,
"loss": 2.3334,
"step": 1390
},
{
"epoch": 0.12675414616365957,
"grad_norm": 3.47501540184021,
"learning_rate": 4.804391544566216e-05,
"loss": 2.9559,
"step": 1391
},
{
"epoch": 0.1268452706396938,
"grad_norm": 2.4494612216949463,
"learning_rate": 4.804113927817716e-05,
"loss": 3.2474,
"step": 1392
},
{
"epoch": 0.12693639511572807,
"grad_norm": 2.538818836212158,
"learning_rate": 4.8038361222380054e-05,
"loss": 3.3579,
"step": 1393
},
{
"epoch": 0.12702751959176234,
"grad_norm": 3.1854991912841797,
"learning_rate": 4.8035581278498496e-05,
"loss": 3.2522,
"step": 1394
},
{
"epoch": 0.1271186440677966,
"grad_norm": 3.3268232345581055,
"learning_rate": 4.803279944676032e-05,
"loss": 3.4415,
"step": 1395
},
{
"epoch": 0.12720976854383087,
"grad_norm": 3.4658143520355225,
"learning_rate": 4.803001572739352e-05,
"loss": 3.2007,
"step": 1396
},
{
"epoch": 0.12730089301986514,
"grad_norm": 2.278604507446289,
"learning_rate": 4.802723012062622e-05,
"loss": 3.1008,
"step": 1397
},
{
"epoch": 0.1273920174958994,
"grad_norm": 3.4236605167388916,
"learning_rate": 4.8024442626686706e-05,
"loss": 3.1024,
"step": 1398
},
{
"epoch": 0.12748314197193367,
"grad_norm": 1.7195907831192017,
"learning_rate": 4.802165324580344e-05,
"loss": 3.0942,
"step": 1399
},
{
"epoch": 0.12757426644796793,
"grad_norm": 5.269683837890625,
"learning_rate": 4.801886197820501e-05,
"loss": 3.244,
"step": 1400
},
{
"epoch": 0.1276653909240022,
"grad_norm": 3.6192774772644043,
"learning_rate": 4.801606882412017e-05,
"loss": 3.4044,
"step": 1401
},
{
"epoch": 0.12775651540003646,
"grad_norm": 2.8494386672973633,
"learning_rate": 4.8013273783777844e-05,
"loss": 3.2665,
"step": 1402
},
{
"epoch": 0.1278476398760707,
"grad_norm": 2.5437350273132324,
"learning_rate": 4.801047685740709e-05,
"loss": 3.0269,
"step": 1403
},
{
"epoch": 0.12793876435210497,
"grad_norm": 3.878979444503784,
"learning_rate": 4.800767804523713e-05,
"loss": 2.5677,
"step": 1404
},
{
"epoch": 0.12802988882813923,
"grad_norm": 3.141019821166992,
"learning_rate": 4.800487734749732e-05,
"loss": 3.2695,
"step": 1405
},
{
"epoch": 0.1281210133041735,
"grad_norm": 2.654679536819458,
"learning_rate": 4.8002074764417204e-05,
"loss": 3.1931,
"step": 1406
},
{
"epoch": 0.12821213778020776,
"grad_norm": 4.664015293121338,
"learning_rate": 4.799927029622647e-05,
"loss": 3.3309,
"step": 1407
},
{
"epoch": 0.12830326225624203,
"grad_norm": 2.804391384124756,
"learning_rate": 4.799646394315494e-05,
"loss": 3.2404,
"step": 1408
},
{
"epoch": 0.1283943867322763,
"grad_norm": 2.2764317989349365,
"learning_rate": 4.799365570543262e-05,
"loss": 3.1464,
"step": 1409
},
{
"epoch": 0.12848551120831056,
"grad_norm": 2.904567003250122,
"learning_rate": 4.799084558328965e-05,
"loss": 3.3084,
"step": 1410
},
{
"epoch": 0.12857663568434483,
"grad_norm": 1.6689000129699707,
"learning_rate": 4.7988033576956315e-05,
"loss": 2.981,
"step": 1411
},
{
"epoch": 0.1286677601603791,
"grad_norm": 1.643546462059021,
"learning_rate": 4.7985219686663096e-05,
"loss": 3.1424,
"step": 1412
},
{
"epoch": 0.12875888463641333,
"grad_norm": 2.7469589710235596,
"learning_rate": 4.7982403912640594e-05,
"loss": 3.0977,
"step": 1413
},
{
"epoch": 0.1288500091124476,
"grad_norm": 2.6127498149871826,
"learning_rate": 4.797958625511956e-05,
"loss": 3.3504,
"step": 1414
},
{
"epoch": 0.12894113358848186,
"grad_norm": 1.3221989870071411,
"learning_rate": 4.7976766714330936e-05,
"loss": 2.9263,
"step": 1415
},
{
"epoch": 0.12903225806451613,
"grad_norm": 2.6175034046173096,
"learning_rate": 4.7973945290505766e-05,
"loss": 3.2393,
"step": 1416
},
{
"epoch": 0.1291233825405504,
"grad_norm": 1.7804768085479736,
"learning_rate": 4.79711219838753e-05,
"loss": 3.0257,
"step": 1417
},
{
"epoch": 0.12921450701658466,
"grad_norm": 3.141484498977661,
"learning_rate": 4.796829679467091e-05,
"loss": 3.3176,
"step": 1418
},
{
"epoch": 0.12930563149261892,
"grad_norm": 2.9118833541870117,
"learning_rate": 4.796546972312413e-05,
"loss": 3.1751,
"step": 1419
},
{
"epoch": 0.1293967559686532,
"grad_norm": 2.3245956897735596,
"learning_rate": 4.796264076946665e-05,
"loss": 3.2194,
"step": 1420
},
{
"epoch": 0.12948788044468745,
"grad_norm": 1.7661198377609253,
"learning_rate": 4.795980993393032e-05,
"loss": 2.9074,
"step": 1421
},
{
"epoch": 0.12957900492072172,
"grad_norm": 1.461868166923523,
"learning_rate": 4.795697721674713e-05,
"loss": 3.0488,
"step": 1422
},
{
"epoch": 0.12967012939675596,
"grad_norm": 3.5125927925109863,
"learning_rate": 4.795414261814923e-05,
"loss": 3.3445,
"step": 1423
},
{
"epoch": 0.12976125387279022,
"grad_norm": 5.004077434539795,
"learning_rate": 4.795130613836894e-05,
"loss": 2.4742,
"step": 1424
},
{
"epoch": 0.1298523783488245,
"grad_norm": 2.956143856048584,
"learning_rate": 4.7948467777638716e-05,
"loss": 3.1681,
"step": 1425
},
{
"epoch": 0.12994350282485875,
"grad_norm": 1.55666983127594,
"learning_rate": 4.7945627536191166e-05,
"loss": 3.0337,
"step": 1426
},
{
"epoch": 0.13003462730089302,
"grad_norm": 3.5222396850585938,
"learning_rate": 4.7942785414259064e-05,
"loss": 4.1952,
"step": 1427
},
{
"epoch": 0.13012575177692728,
"grad_norm": 1.7636314630508423,
"learning_rate": 4.7939941412075326e-05,
"loss": 3.1187,
"step": 1428
},
{
"epoch": 0.13021687625296155,
"grad_norm": 1.582393765449524,
"learning_rate": 4.7937095529873046e-05,
"loss": 3.0345,
"step": 1429
},
{
"epoch": 0.13030800072899582,
"grad_norm": 2.269049882888794,
"learning_rate": 4.793424776788544e-05,
"loss": 2.9892,
"step": 1430
},
{
"epoch": 0.13039912520503008,
"grad_norm": 2.121647596359253,
"learning_rate": 4.7931398126345895e-05,
"loss": 3.2096,
"step": 1431
},
{
"epoch": 0.13049024968106435,
"grad_norm": 2.4415335655212402,
"learning_rate": 4.7928546605487956e-05,
"loss": 3.3201,
"step": 1432
},
{
"epoch": 0.13058137415709858,
"grad_norm": 1.8279386758804321,
"learning_rate": 4.7925693205545306e-05,
"loss": 3.0392,
"step": 1433
},
{
"epoch": 0.13067249863313285,
"grad_norm": 2.7299156188964844,
"learning_rate": 4.792283792675181e-05,
"loss": 3.3905,
"step": 1434
},
{
"epoch": 0.13076362310916712,
"grad_norm": 2.9097607135772705,
"learning_rate": 4.791998076934145e-05,
"loss": 3.1524,
"step": 1435
},
{
"epoch": 0.13085474758520138,
"grad_norm": 1.3054111003875732,
"learning_rate": 4.79171217335484e-05,
"loss": 3.0252,
"step": 1436
},
{
"epoch": 0.13094587206123565,
"grad_norm": 2.88234543800354,
"learning_rate": 4.7914260819606956e-05,
"loss": 3.0998,
"step": 1437
},
{
"epoch": 0.1310369965372699,
"grad_norm": 1.7457668781280518,
"learning_rate": 4.791139802775158e-05,
"loss": 3.1138,
"step": 1438
},
{
"epoch": 0.13112812101330418,
"grad_norm": 2.7463388442993164,
"learning_rate": 4.79085333582169e-05,
"loss": 3.1257,
"step": 1439
},
{
"epoch": 0.13121924548933844,
"grad_norm": 2.5777535438537598,
"learning_rate": 4.790566681123768e-05,
"loss": 3.1941,
"step": 1440
},
{
"epoch": 0.1313103699653727,
"grad_norm": 4.522150993347168,
"learning_rate": 4.7902798387048845e-05,
"loss": 2.92,
"step": 1441
},
{
"epoch": 0.13140149444140697,
"grad_norm": 1.9641166925430298,
"learning_rate": 4.789992808588547e-05,
"loss": 3.0691,
"step": 1442
},
{
"epoch": 0.1314926189174412,
"grad_norm": 1.9266825914382935,
"learning_rate": 4.78970559079828e-05,
"loss": 3.0564,
"step": 1443
},
{
"epoch": 0.13158374339347548,
"grad_norm": 3.1203954219818115,
"learning_rate": 4.78941818535762e-05,
"loss": 2.7019,
"step": 1444
},
{
"epoch": 0.13167486786950974,
"grad_norm": 4.034260272979736,
"learning_rate": 4.7891305922901235e-05,
"loss": 3.2111,
"step": 1445
},
{
"epoch": 0.131765992345544,
"grad_norm": 1.3861982822418213,
"learning_rate": 4.7888428116193585e-05,
"loss": 3.1494,
"step": 1446
},
{
"epoch": 0.13185711682157827,
"grad_norm": 2.3220410346984863,
"learning_rate": 4.78855484336891e-05,
"loss": 3.4374,
"step": 1447
},
{
"epoch": 0.13194824129761254,
"grad_norm": 2.328646183013916,
"learning_rate": 4.788266687562378e-05,
"loss": 2.9784,
"step": 1448
},
{
"epoch": 0.1320393657736468,
"grad_norm": 2.595676898956299,
"learning_rate": 4.7879783442233776e-05,
"loss": 3.2522,
"step": 1449
},
{
"epoch": 0.13213049024968107,
"grad_norm": 3.259542942047119,
"learning_rate": 4.787689813375541e-05,
"loss": 3.4706,
"step": 1450
},
{
"epoch": 0.13222161472571534,
"grad_norm": 3.3692314624786377,
"learning_rate": 4.787401095042513e-05,
"loss": 3.3304,
"step": 1451
},
{
"epoch": 0.1323127392017496,
"grad_norm": 1.2401968240737915,
"learning_rate": 4.787112189247956e-05,
"loss": 2.7759,
"step": 1452
},
{
"epoch": 0.13240386367778384,
"grad_norm": 2.5161514282226562,
"learning_rate": 4.786823096015547e-05,
"loss": 3.0425,
"step": 1453
},
{
"epoch": 0.1324949881538181,
"grad_norm": 3.511383533477783,
"learning_rate": 4.786533815368978e-05,
"loss": 4.4405,
"step": 1454
},
{
"epoch": 0.13258611262985237,
"grad_norm": 2.5629379749298096,
"learning_rate": 4.786244347331956e-05,
"loss": 3.0869,
"step": 1455
},
{
"epoch": 0.13267723710588664,
"grad_norm": 1.6853193044662476,
"learning_rate": 4.785954691928206e-05,
"loss": 3.2892,
"step": 1456
},
{
"epoch": 0.1327683615819209,
"grad_norm": 1.5359575748443604,
"learning_rate": 4.785664849181465e-05,
"loss": 2.9666,
"step": 1457
},
{
"epoch": 0.13285948605795517,
"grad_norm": 2.309920310974121,
"learning_rate": 4.785374819115487e-05,
"loss": 3.3415,
"step": 1458
},
{
"epoch": 0.13295061053398943,
"grad_norm": 1.5705186128616333,
"learning_rate": 4.7850846017540404e-05,
"loss": 2.9575,
"step": 1459
},
{
"epoch": 0.1330417350100237,
"grad_norm": 2.871138572692871,
"learning_rate": 4.784794197120911e-05,
"loss": 2.8445,
"step": 1460
},
{
"epoch": 0.13313285948605796,
"grad_norm": 2.2359492778778076,
"learning_rate": 4.784503605239898e-05,
"loss": 3.2162,
"step": 1461
},
{
"epoch": 0.13322398396209223,
"grad_norm": 1.5131843090057373,
"learning_rate": 4.7842128261348164e-05,
"loss": 3.1909,
"step": 1462
},
{
"epoch": 0.13331510843812647,
"grad_norm": 2.033951759338379,
"learning_rate": 4.783921859829496e-05,
"loss": 3.2403,
"step": 1463
},
{
"epoch": 0.13340623291416073,
"grad_norm": 3.2505292892456055,
"learning_rate": 4.783630706347785e-05,
"loss": 3.3799,
"step": 1464
},
{
"epoch": 0.133497357390195,
"grad_norm": 2.0687122344970703,
"learning_rate": 4.783339365713542e-05,
"loss": 3.1018,
"step": 1465
},
{
"epoch": 0.13358848186622926,
"grad_norm": 3.194941282272339,
"learning_rate": 4.7830478379506446e-05,
"loss": 3.0541,
"step": 1466
},
{
"epoch": 0.13367960634226353,
"grad_norm": 1.9203050136566162,
"learning_rate": 4.782756123082986e-05,
"loss": 3.1837,
"step": 1467
},
{
"epoch": 0.1337707308182978,
"grad_norm": 1.5487380027770996,
"learning_rate": 4.78246422113447e-05,
"loss": 3.048,
"step": 1468
},
{
"epoch": 0.13386185529433206,
"grad_norm": 3.313387870788574,
"learning_rate": 4.7821721321290216e-05,
"loss": 2.9177,
"step": 1469
},
{
"epoch": 0.13395297977036633,
"grad_norm": 1.603618860244751,
"learning_rate": 4.7818798560905785e-05,
"loss": 3.0036,
"step": 1470
},
{
"epoch": 0.1340441042464006,
"grad_norm": 2.8620636463165283,
"learning_rate": 4.7815873930430934e-05,
"loss": 3.0144,
"step": 1471
},
{
"epoch": 0.13413522872243486,
"grad_norm": 3.5870542526245117,
"learning_rate": 4.7812947430105346e-05,
"loss": 3.2079,
"step": 1472
},
{
"epoch": 0.1342263531984691,
"grad_norm": 1.7587014436721802,
"learning_rate": 4.781001906016887e-05,
"loss": 2.9849,
"step": 1473
},
{
"epoch": 0.13431747767450336,
"grad_norm": 1.599656581878662,
"learning_rate": 4.780708882086148e-05,
"loss": 3.0486,
"step": 1474
},
{
"epoch": 0.13440860215053763,
"grad_norm": 2.6890735626220703,
"learning_rate": 4.780415671242334e-05,
"loss": 3.2422,
"step": 1475
},
{
"epoch": 0.1344997266265719,
"grad_norm": 2.605520248413086,
"learning_rate": 4.780122273509473e-05,
"loss": 3.0581,
"step": 1476
},
{
"epoch": 0.13459085110260616,
"grad_norm": 2.7986021041870117,
"learning_rate": 4.7798286889116113e-05,
"loss": 3.2861,
"step": 1477
},
{
"epoch": 0.13468197557864042,
"grad_norm": 2.143354892730713,
"learning_rate": 4.779534917472809e-05,
"loss": 3.1575,
"step": 1478
},
{
"epoch": 0.1347731000546747,
"grad_norm": 3.173980712890625,
"learning_rate": 4.779240959217141e-05,
"loss": 2.7888,
"step": 1479
},
{
"epoch": 0.13486422453070895,
"grad_norm": 2.8368523120880127,
"learning_rate": 4.7789468141687e-05,
"loss": 3.12,
"step": 1480
},
{
"epoch": 0.13495534900674322,
"grad_norm": 2.084016799926758,
"learning_rate": 4.778652482351591e-05,
"loss": 3.0239,
"step": 1481
},
{
"epoch": 0.13504647348277748,
"grad_norm": 2.599902629852295,
"learning_rate": 4.778357963789936e-05,
"loss": 3.1395,
"step": 1482
},
{
"epoch": 0.13513759795881175,
"grad_norm": 3.741800546646118,
"learning_rate": 4.778063258507872e-05,
"loss": 3.1891,
"step": 1483
},
{
"epoch": 0.135228722434846,
"grad_norm": 2.012301445007324,
"learning_rate": 4.777768366529551e-05,
"loss": 3.3807,
"step": 1484
},
{
"epoch": 0.13531984691088025,
"grad_norm": 1.6227391958236694,
"learning_rate": 4.777473287879142e-05,
"loss": 3.0725,
"step": 1485
},
{
"epoch": 0.13541097138691452,
"grad_norm": 2.414212703704834,
"learning_rate": 4.777178022580826e-05,
"loss": 3.3113,
"step": 1486
},
{
"epoch": 0.13550209586294878,
"grad_norm": 3.2059073448181152,
"learning_rate": 4.776882570658802e-05,
"loss": 3.3099,
"step": 1487
},
{
"epoch": 0.13559322033898305,
"grad_norm": 4.668062210083008,
"learning_rate": 4.7765869321372836e-05,
"loss": 3.2421,
"step": 1488
},
{
"epoch": 0.13568434481501732,
"grad_norm": 1.8324368000030518,
"learning_rate": 4.776291107040498e-05,
"loss": 2.9967,
"step": 1489
},
{
"epoch": 0.13577546929105158,
"grad_norm": 2.761749744415283,
"learning_rate": 4.775995095392692e-05,
"loss": 3.2551,
"step": 1490
},
{
"epoch": 0.13586659376708585,
"grad_norm": 2.2648134231567383,
"learning_rate": 4.775698897218123e-05,
"loss": 3.1676,
"step": 1491
},
{
"epoch": 0.1359577182431201,
"grad_norm": 3.5984230041503906,
"learning_rate": 4.7754025125410654e-05,
"loss": 2.5762,
"step": 1492
},
{
"epoch": 0.13604884271915438,
"grad_norm": 2.518404245376587,
"learning_rate": 4.77510594138581e-05,
"loss": 3.1694,
"step": 1493
},
{
"epoch": 0.13613996719518862,
"grad_norm": 2.6424813270568848,
"learning_rate": 4.7748091837766623e-05,
"loss": 3.5051,
"step": 1494
},
{
"epoch": 0.13623109167122288,
"grad_norm": 1.2104846239089966,
"learning_rate": 4.7745122397379413e-05,
"loss": 2.9033,
"step": 1495
},
{
"epoch": 0.13632221614725715,
"grad_norm": 2.830470085144043,
"learning_rate": 4.774215109293984e-05,
"loss": 2.8473,
"step": 1496
},
{
"epoch": 0.1364133406232914,
"grad_norm": 2.8165035247802734,
"learning_rate": 4.773917792469142e-05,
"loss": 3.152,
"step": 1497
},
{
"epoch": 0.13650446509932568,
"grad_norm": 2.5895888805389404,
"learning_rate": 4.773620289287778e-05,
"loss": 3.1265,
"step": 1498
},
{
"epoch": 0.13659558957535994,
"grad_norm": 1.6958574056625366,
"learning_rate": 4.773322599774278e-05,
"loss": 3.1278,
"step": 1499
},
{
"epoch": 0.1366867140513942,
"grad_norm": 1.5739076137542725,
"learning_rate": 4.773024723953037e-05,
"loss": 2.9989,
"step": 1500
},
{
"epoch": 0.13677783852742847,
"grad_norm": 1.790186882019043,
"learning_rate": 4.772726661848467e-05,
"loss": 2.9863,
"step": 1501
},
{
"epoch": 0.13686896300346274,
"grad_norm": 1.7760246992111206,
"learning_rate": 4.7724284134849945e-05,
"loss": 3.0368,
"step": 1502
},
{
"epoch": 0.136960087479497,
"grad_norm": 1.7839252948760986,
"learning_rate": 4.7721299788870634e-05,
"loss": 3.0473,
"step": 1503
},
{
"epoch": 0.13705121195553124,
"grad_norm": 3.4607291221618652,
"learning_rate": 4.771831358079132e-05,
"loss": 3.3073,
"step": 1504
},
{
"epoch": 0.1371423364315655,
"grad_norm": 4.04274845123291,
"learning_rate": 4.771532551085672e-05,
"loss": 4.4627,
"step": 1505
},
{
"epoch": 0.13723346090759977,
"grad_norm": 1.4731826782226562,
"learning_rate": 4.771233557931172e-05,
"loss": 2.8455,
"step": 1506
},
{
"epoch": 0.13732458538363404,
"grad_norm": 6.251603603363037,
"learning_rate": 4.770934378640137e-05,
"loss": 3.3063,
"step": 1507
},
{
"epoch": 0.1374157098596683,
"grad_norm": 2.3858461380004883,
"learning_rate": 4.7706350132370844e-05,
"loss": 3.1662,
"step": 1508
},
{
"epoch": 0.13750683433570257,
"grad_norm": 2.9865639209747314,
"learning_rate": 4.77033546174655e-05,
"loss": 3.1456,
"step": 1509
},
{
"epoch": 0.13759795881173684,
"grad_norm": 2.4804508686065674,
"learning_rate": 4.7700357241930815e-05,
"loss": 3.1369,
"step": 1510
},
{
"epoch": 0.1376890832877711,
"grad_norm": 3.5824382305145264,
"learning_rate": 4.769735800601245e-05,
"loss": 3.1561,
"step": 1511
},
{
"epoch": 0.13778020776380537,
"grad_norm": 2.742952585220337,
"learning_rate": 4.7694356909956194e-05,
"loss": 4.2205,
"step": 1512
},
{
"epoch": 0.13787133223983963,
"grad_norm": 1.8473784923553467,
"learning_rate": 4.7691353954008e-05,
"loss": 2.8503,
"step": 1513
},
{
"epoch": 0.13796245671587387,
"grad_norm": 2.6001064777374268,
"learning_rate": 4.768834913841398e-05,
"loss": 3.1715,
"step": 1514
},
{
"epoch": 0.13805358119190814,
"grad_norm": 2.0480306148529053,
"learning_rate": 4.768534246342038e-05,
"loss": 3.1111,
"step": 1515
},
{
"epoch": 0.1381447056679424,
"grad_norm": 1.9670919179916382,
"learning_rate": 4.768233392927361e-05,
"loss": 2.9742,
"step": 1516
},
{
"epoch": 0.13823583014397667,
"grad_norm": 2.6016757488250732,
"learning_rate": 4.767932353622025e-05,
"loss": 3.2246,
"step": 1517
},
{
"epoch": 0.13832695462001093,
"grad_norm": 1.480478048324585,
"learning_rate": 4.767631128450699e-05,
"loss": 2.9532,
"step": 1518
},
{
"epoch": 0.1384180790960452,
"grad_norm": 2.2611401081085205,
"learning_rate": 4.767329717438071e-05,
"loss": 3.1999,
"step": 1519
},
{
"epoch": 0.13850920357207946,
"grad_norm": 1.8727288246154785,
"learning_rate": 4.7670281206088406e-05,
"loss": 2.9416,
"step": 1520
},
{
"epoch": 0.13860032804811373,
"grad_norm": 3.45841121673584,
"learning_rate": 4.766726337987728e-05,
"loss": 3.2848,
"step": 1521
},
{
"epoch": 0.138691452524148,
"grad_norm": 2.117701530456543,
"learning_rate": 4.7664243695994634e-05,
"loss": 2.9624,
"step": 1522
},
{
"epoch": 0.13878257700018226,
"grad_norm": 1.6901532411575317,
"learning_rate": 4.766122215468795e-05,
"loss": 2.9421,
"step": 1523
},
{
"epoch": 0.1388737014762165,
"grad_norm": 3.0419657230377197,
"learning_rate": 4.765819875620485e-05,
"loss": 2.9388,
"step": 1524
},
{
"epoch": 0.13896482595225076,
"grad_norm": 2.559110641479492,
"learning_rate": 4.765517350079313e-05,
"loss": 3.0909,
"step": 1525
},
{
"epoch": 0.13905595042828503,
"grad_norm": 1.8691017627716064,
"learning_rate": 4.7652146388700705e-05,
"loss": 3.017,
"step": 1526
},
{
"epoch": 0.1391470749043193,
"grad_norm": 2.1920082569122314,
"learning_rate": 4.764911742017565e-05,
"loss": 2.8878,
"step": 1527
},
{
"epoch": 0.13923819938035356,
"grad_norm": 5.991618633270264,
"learning_rate": 4.764608659546623e-05,
"loss": 3.2354,
"step": 1528
},
{
"epoch": 0.13932932385638783,
"grad_norm": 1.6653143167495728,
"learning_rate": 4.764305391482081e-05,
"loss": 2.9833,
"step": 1529
},
{
"epoch": 0.1394204483324221,
"grad_norm": 2.5954642295837402,
"learning_rate": 4.7640019378487934e-05,
"loss": 3.1639,
"step": 1530
},
{
"epoch": 0.13951157280845636,
"grad_norm": 3.557215929031372,
"learning_rate": 4.763698298671629e-05,
"loss": 2.8795,
"step": 1531
},
{
"epoch": 0.13960269728449062,
"grad_norm": 1.8863126039505005,
"learning_rate": 4.7633944739754746e-05,
"loss": 2.833,
"step": 1532
},
{
"epoch": 0.1396938217605249,
"grad_norm": 5.656214714050293,
"learning_rate": 4.7630904637852275e-05,
"loss": 3.1804,
"step": 1533
},
{
"epoch": 0.13978494623655913,
"grad_norm": 2.910088300704956,
"learning_rate": 4.7627862681258037e-05,
"loss": 3.056,
"step": 1534
},
{
"epoch": 0.1398760707125934,
"grad_norm": 1.1903241872787476,
"learning_rate": 4.762481887022132e-05,
"loss": 2.9107,
"step": 1535
},
{
"epoch": 0.13996719518862766,
"grad_norm": 1.3917659521102905,
"learning_rate": 4.762177320499158e-05,
"loss": 2.9145,
"step": 1536
},
{
"epoch": 0.14005831966466192,
"grad_norm": 2.0048739910125732,
"learning_rate": 4.7618725685818434e-05,
"loss": 2.9777,
"step": 1537
},
{
"epoch": 0.1401494441406962,
"grad_norm": 3.215196371078491,
"learning_rate": 4.761567631295163e-05,
"loss": 2.9824,
"step": 1538
},
{
"epoch": 0.14024056861673045,
"grad_norm": 2.187133550643921,
"learning_rate": 4.761262508664107e-05,
"loss": 3.1835,
"step": 1539
},
{
"epoch": 0.14033169309276472,
"grad_norm": 2.6757946014404297,
"learning_rate": 4.760957200713682e-05,
"loss": 3.1799,
"step": 1540
},
{
"epoch": 0.14042281756879899,
"grad_norm": 2.7127139568328857,
"learning_rate": 4.760651707468908e-05,
"loss": 3.1264,
"step": 1541
},
{
"epoch": 0.14051394204483325,
"grad_norm": 2.6738502979278564,
"learning_rate": 4.760346028954824e-05,
"loss": 3.2401,
"step": 1542
},
{
"epoch": 0.14060506652086752,
"grad_norm": 2.608595371246338,
"learning_rate": 4.76004016519648e-05,
"loss": 3.0873,
"step": 1543
},
{
"epoch": 0.14069619099690175,
"grad_norm": 2.3539223670959473,
"learning_rate": 4.7597341162189426e-05,
"loss": 3.2628,
"step": 1544
},
{
"epoch": 0.14078731547293602,
"grad_norm": 1.7645334005355835,
"learning_rate": 4.7594278820472934e-05,
"loss": 3.0802,
"step": 1545
},
{
"epoch": 0.14087843994897029,
"grad_norm": 2.0945842266082764,
"learning_rate": 4.759121462706631e-05,
"loss": 3.3989,
"step": 1546
},
{
"epoch": 0.14096956442500455,
"grad_norm": 3.260627031326294,
"learning_rate": 4.758814858222066e-05,
"loss": 3.2328,
"step": 1547
},
{
"epoch": 0.14106068890103882,
"grad_norm": 3.0855679512023926,
"learning_rate": 4.7585080686187264e-05,
"loss": 4.337,
"step": 1548
},
{
"epoch": 0.14115181337707308,
"grad_norm": 2.290228843688965,
"learning_rate": 4.758201093921755e-05,
"loss": 3.2579,
"step": 1549
},
{
"epoch": 0.14124293785310735,
"grad_norm": 2.877847194671631,
"learning_rate": 4.7578939341563095e-05,
"loss": 3.0991,
"step": 1550
},
{
"epoch": 0.1413340623291416,
"grad_norm": 2.6873252391815186,
"learning_rate": 4.7575865893475625e-05,
"loss": 3.2119,
"step": 1551
},
{
"epoch": 0.14142518680517588,
"grad_norm": 2.232259750366211,
"learning_rate": 4.757279059520703e-05,
"loss": 2.8747,
"step": 1552
},
{
"epoch": 0.14151631128121014,
"grad_norm": 3.1098685264587402,
"learning_rate": 4.756971344700934e-05,
"loss": 4.3473,
"step": 1553
},
{
"epoch": 0.14160743575724438,
"grad_norm": 1.4829670190811157,
"learning_rate": 4.7566634449134734e-05,
"loss": 3.079,
"step": 1554
},
{
"epoch": 0.14169856023327865,
"grad_norm": 2.051966667175293,
"learning_rate": 4.7563553601835555e-05,
"loss": 3.0552,
"step": 1555
},
{
"epoch": 0.1417896847093129,
"grad_norm": 2.224562883377075,
"learning_rate": 4.756047090536428e-05,
"loss": 3.1404,
"step": 1556
},
{
"epoch": 0.14188080918534718,
"grad_norm": 2.442316770553589,
"learning_rate": 4.7557386359973554e-05,
"loss": 2.9279,
"step": 1557
},
{
"epoch": 0.14197193366138144,
"grad_norm": 2.465324640274048,
"learning_rate": 4.755429996591618e-05,
"loss": 2.7246,
"step": 1558
},
{
"epoch": 0.1420630581374157,
"grad_norm": 2.204219102859497,
"learning_rate": 4.755121172344508e-05,
"loss": 3.0415,
"step": 1559
},
{
"epoch": 0.14215418261344998,
"grad_norm": 1.9507112503051758,
"learning_rate": 4.754812163281335e-05,
"loss": 3.2588,
"step": 1560
},
{
"epoch": 0.14224530708948424,
"grad_norm": 2.620863914489746,
"learning_rate": 4.7545029694274254e-05,
"loss": 2.7928,
"step": 1561
},
{
"epoch": 0.1423364315655185,
"grad_norm": 2.5902299880981445,
"learning_rate": 4.754193590808117e-05,
"loss": 3.3398,
"step": 1562
},
{
"epoch": 0.14242755604155277,
"grad_norm": 1.4641609191894531,
"learning_rate": 4.753884027448765e-05,
"loss": 2.9775,
"step": 1563
},
{
"epoch": 0.14251868051758704,
"grad_norm": 1.9195103645324707,
"learning_rate": 4.753574279374739e-05,
"loss": 3.0164,
"step": 1564
},
{
"epoch": 0.14260980499362128,
"grad_norm": 2.0863749980926514,
"learning_rate": 4.7532643466114266e-05,
"loss": 3.1996,
"step": 1565
},
{
"epoch": 0.14270092946965554,
"grad_norm": 2.3902792930603027,
"learning_rate": 4.752954229184224e-05,
"loss": 3.1756,
"step": 1566
},
{
"epoch": 0.1427920539456898,
"grad_norm": 2.883986711502075,
"learning_rate": 4.75264392711855e-05,
"loss": 3.3782,
"step": 1567
},
{
"epoch": 0.14288317842172407,
"grad_norm": 1.5548768043518066,
"learning_rate": 4.752333440439832e-05,
"loss": 3.0867,
"step": 1568
},
{
"epoch": 0.14297430289775834,
"grad_norm": 2.3053369522094727,
"learning_rate": 4.752022769173519e-05,
"loss": 3.2028,
"step": 1569
},
{
"epoch": 0.1430654273737926,
"grad_norm": 2.11873722076416,
"learning_rate": 4.751711913345069e-05,
"loss": 3.4555,
"step": 1570
},
{
"epoch": 0.14315655184982687,
"grad_norm": 1.6578961610794067,
"learning_rate": 4.7514008729799584e-05,
"loss": 2.8687,
"step": 1571
},
{
"epoch": 0.14324767632586113,
"grad_norm": 2.5347859859466553,
"learning_rate": 4.7510896481036796e-05,
"loss": 3.3134,
"step": 1572
},
{
"epoch": 0.1433388008018954,
"grad_norm": 1.5316507816314697,
"learning_rate": 4.750778238741737e-05,
"loss": 3.0429,
"step": 1573
},
{
"epoch": 0.14342992527792967,
"grad_norm": 2.9990713596343994,
"learning_rate": 4.7504666449196534e-05,
"loss": 3.3818,
"step": 1574
},
{
"epoch": 0.1435210497539639,
"grad_norm": 2.222365617752075,
"learning_rate": 4.750154866662964e-05,
"loss": 3.1164,
"step": 1575
},
{
"epoch": 0.14361217422999817,
"grad_norm": 1.5940253734588623,
"learning_rate": 4.7498429039972195e-05,
"loss": 2.9707,
"step": 1576
},
{
"epoch": 0.14370329870603243,
"grad_norm": 1.7318164110183716,
"learning_rate": 4.7495307569479886e-05,
"loss": 2.9525,
"step": 1577
},
{
"epoch": 0.1437944231820667,
"grad_norm": 2.8493340015411377,
"learning_rate": 4.749218425540851e-05,
"loss": 2.9688,
"step": 1578
},
{
"epoch": 0.14388554765810097,
"grad_norm": 3.3721001148223877,
"learning_rate": 4.748905909801405e-05,
"loss": 3.0444,
"step": 1579
},
{
"epoch": 0.14397667213413523,
"grad_norm": 3.1313891410827637,
"learning_rate": 4.748593209755262e-05,
"loss": 3.0512,
"step": 1580
},
{
"epoch": 0.1440677966101695,
"grad_norm": 1.885908842086792,
"learning_rate": 4.7482803254280484e-05,
"loss": 2.9611,
"step": 1581
},
{
"epoch": 0.14415892108620376,
"grad_norm": 2.9428458213806152,
"learning_rate": 4.747967256845407e-05,
"loss": 2.9153,
"step": 1582
},
{
"epoch": 0.14425004556223803,
"grad_norm": 4.332671165466309,
"learning_rate": 4.747654004032995e-05,
"loss": 3.2974,
"step": 1583
},
{
"epoch": 0.1443411700382723,
"grad_norm": 1.3580037355422974,
"learning_rate": 4.747340567016484e-05,
"loss": 3.0826,
"step": 1584
},
{
"epoch": 0.14443229451430653,
"grad_norm": 9.840303421020508,
"learning_rate": 4.747026945821562e-05,
"loss": 2.5629,
"step": 1585
},
{
"epoch": 0.1445234189903408,
"grad_norm": 2.672960042953491,
"learning_rate": 4.7467131404739315e-05,
"loss": 3.1531,
"step": 1586
},
{
"epoch": 0.14461454346637506,
"grad_norm": 2.945892572402954,
"learning_rate": 4.7463991509993096e-05,
"loss": 3.3506,
"step": 1587
},
{
"epoch": 0.14470566794240933,
"grad_norm": 2.753072738647461,
"learning_rate": 4.74608497742343e-05,
"loss": 3.0978,
"step": 1588
},
{
"epoch": 0.1447967924184436,
"grad_norm": 2.628932237625122,
"learning_rate": 4.7457706197720395e-05,
"loss": 3.2419,
"step": 1589
},
{
"epoch": 0.14488791689447786,
"grad_norm": 3.0868217945098877,
"learning_rate": 4.745456078070901e-05,
"loss": 3.1019,
"step": 1590
},
{
"epoch": 0.14497904137051212,
"grad_norm": 1.7500460147857666,
"learning_rate": 4.745141352345793e-05,
"loss": 3.0763,
"step": 1591
},
{
"epoch": 0.1450701658465464,
"grad_norm": 2.3832669258117676,
"learning_rate": 4.744826442622508e-05,
"loss": 3.0807,
"step": 1592
},
{
"epoch": 0.14516129032258066,
"grad_norm": 3.4366202354431152,
"learning_rate": 4.7445113489268544e-05,
"loss": 3.0111,
"step": 1593
},
{
"epoch": 0.14525241479861492,
"grad_norm": 1.3814283609390259,
"learning_rate": 4.744196071284655e-05,
"loss": 2.9914,
"step": 1594
},
{
"epoch": 0.14534353927464916,
"grad_norm": 3.4350616931915283,
"learning_rate": 4.743880609721749e-05,
"loss": 3.4018,
"step": 1595
},
{
"epoch": 0.14543466375068342,
"grad_norm": 3.0712473392486572,
"learning_rate": 4.7435649642639876e-05,
"loss": 3.1312,
"step": 1596
},
{
"epoch": 0.1455257882267177,
"grad_norm": 2.87288761138916,
"learning_rate": 4.743249134937242e-05,
"loss": 3.259,
"step": 1597
},
{
"epoch": 0.14561691270275195,
"grad_norm": 3.8843579292297363,
"learning_rate": 4.742933121767394e-05,
"loss": 3.4705,
"step": 1598
},
{
"epoch": 0.14570803717878622,
"grad_norm": 2.3016998767852783,
"learning_rate": 4.742616924780342e-05,
"loss": 3.1254,
"step": 1599
},
{
"epoch": 0.14579916165482049,
"grad_norm": 2.080766439437866,
"learning_rate": 4.742300544002e-05,
"loss": 3.0068,
"step": 1600
},
{
"epoch": 0.14589028613085475,
"grad_norm": 1.403579831123352,
"learning_rate": 4.741983979458296e-05,
"loss": 3.0318,
"step": 1601
},
{
"epoch": 0.14598141060688902,
"grad_norm": 2.5273921489715576,
"learning_rate": 4.741667231175175e-05,
"loss": 3.2275,
"step": 1602
},
{
"epoch": 0.14607253508292328,
"grad_norm": 2.89467453956604,
"learning_rate": 4.741350299178595e-05,
"loss": 2.9018,
"step": 1603
},
{
"epoch": 0.14616365955895755,
"grad_norm": 1.527073621749878,
"learning_rate": 4.74103318349453e-05,
"loss": 2.9688,
"step": 1604
},
{
"epoch": 0.14625478403499179,
"grad_norm": 1.2628742456436157,
"learning_rate": 4.7407158841489693e-05,
"loss": 2.9605,
"step": 1605
},
{
"epoch": 0.14634590851102605,
"grad_norm": 1.3628411293029785,
"learning_rate": 4.740398401167916e-05,
"loss": 2.8406,
"step": 1606
},
{
"epoch": 0.14643703298706032,
"grad_norm": 2.6044845581054688,
"learning_rate": 4.740080734577389e-05,
"loss": 2.8398,
"step": 1607
},
{
"epoch": 0.14652815746309458,
"grad_norm": 3.561075210571289,
"learning_rate": 4.7397628844034225e-05,
"loss": 4.4007,
"step": 1608
},
{
"epoch": 0.14661928193912885,
"grad_norm": 1.4533250331878662,
"learning_rate": 4.739444850672067e-05,
"loss": 2.9798,
"step": 1609
},
{
"epoch": 0.1467104064151631,
"grad_norm": 3.046421527862549,
"learning_rate": 4.7391266334093845e-05,
"loss": 3.0307,
"step": 1610
},
{
"epoch": 0.14680153089119738,
"grad_norm": 2.7299585342407227,
"learning_rate": 4.738808232641455e-05,
"loss": 3.1078,
"step": 1611
},
{
"epoch": 0.14689265536723164,
"grad_norm": 1.458348035812378,
"learning_rate": 4.738489648394373e-05,
"loss": 2.9425,
"step": 1612
},
{
"epoch": 0.1469837798432659,
"grad_norm": 1.5302770137786865,
"learning_rate": 4.7381708806942474e-05,
"loss": 3.0449,
"step": 1613
},
{
"epoch": 0.14707490431930018,
"grad_norm": 4.053586006164551,
"learning_rate": 4.737851929567203e-05,
"loss": 3.5561,
"step": 1614
},
{
"epoch": 0.1471660287953344,
"grad_norm": 2.3581931591033936,
"learning_rate": 4.737532795039378e-05,
"loss": 2.928,
"step": 1615
},
{
"epoch": 0.14725715327136868,
"grad_norm": 1.6155071258544922,
"learning_rate": 4.737213477136928e-05,
"loss": 3.1576,
"step": 1616
},
{
"epoch": 0.14734827774740294,
"grad_norm": 2.6783015727996826,
"learning_rate": 4.736893975886022e-05,
"loss": 3.3114,
"step": 1617
},
{
"epoch": 0.1474394022234372,
"grad_norm": 2.132664203643799,
"learning_rate": 4.7365742913128434e-05,
"loss": 2.8533,
"step": 1618
},
{
"epoch": 0.14753052669947148,
"grad_norm": 2.1703169345855713,
"learning_rate": 4.736254423443593e-05,
"loss": 2.973,
"step": 1619
},
{
"epoch": 0.14762165117550574,
"grad_norm": 1.7046056985855103,
"learning_rate": 4.7359343723044844e-05,
"loss": 3.0936,
"step": 1620
},
{
"epoch": 0.14771277565154,
"grad_norm": 2.7974915504455566,
"learning_rate": 4.7356141379217475e-05,
"loss": 3.1275,
"step": 1621
},
{
"epoch": 0.14780390012757427,
"grad_norm": 2.6626646518707275,
"learning_rate": 4.735293720321626e-05,
"loss": 3.1263,
"step": 1622
},
{
"epoch": 0.14789502460360854,
"grad_norm": 3.38659405708313,
"learning_rate": 4.7349731195303805e-05,
"loss": 2.5051,
"step": 1623
},
{
"epoch": 0.1479861490796428,
"grad_norm": 2.316164493560791,
"learning_rate": 4.734652335574285e-05,
"loss": 3.0061,
"step": 1624
},
{
"epoch": 0.14807727355567704,
"grad_norm": 2.6719372272491455,
"learning_rate": 4.7343313684796275e-05,
"loss": 2.7537,
"step": 1625
},
{
"epoch": 0.1481683980317113,
"grad_norm": 2.598212480545044,
"learning_rate": 4.7340102182727155e-05,
"loss": 2.9145,
"step": 1626
},
{
"epoch": 0.14825952250774557,
"grad_norm": 1.4254382848739624,
"learning_rate": 4.7336888849798664e-05,
"loss": 2.917,
"step": 1627
},
{
"epoch": 0.14835064698377984,
"grad_norm": 2.534688949584961,
"learning_rate": 4.733367368627415e-05,
"loss": 2.8936,
"step": 1628
},
{
"epoch": 0.1484417714598141,
"grad_norm": 4.316157817840576,
"learning_rate": 4.7330456692417115e-05,
"loss": 3.0555,
"step": 1629
},
{
"epoch": 0.14853289593584837,
"grad_norm": 2.75527024269104,
"learning_rate": 4.73272378684912e-05,
"loss": 4.4197,
"step": 1630
},
{
"epoch": 0.14862402041188263,
"grad_norm": 2.4005396366119385,
"learning_rate": 4.73240172147602e-05,
"loss": 3.2265,
"step": 1631
},
{
"epoch": 0.1487151448879169,
"grad_norm": 2.969036102294922,
"learning_rate": 4.732079473148806e-05,
"loss": 3.2398,
"step": 1632
},
{
"epoch": 0.14880626936395117,
"grad_norm": 1.4357764720916748,
"learning_rate": 4.7317570418938884e-05,
"loss": 2.9779,
"step": 1633
},
{
"epoch": 0.14889739383998543,
"grad_norm": 3.58138108253479,
"learning_rate": 4.73143442773769e-05,
"loss": 3.0653,
"step": 1634
},
{
"epoch": 0.14898851831601967,
"grad_norm": 2.6361777782440186,
"learning_rate": 4.731111630706652e-05,
"loss": 2.9738,
"step": 1635
},
{
"epoch": 0.14907964279205393,
"grad_norm": 2.812761068344116,
"learning_rate": 4.730788650827227e-05,
"loss": 2.9724,
"step": 1636
},
{
"epoch": 0.1491707672680882,
"grad_norm": 2.8054981231689453,
"learning_rate": 4.730465488125887e-05,
"loss": 3.2216,
"step": 1637
},
{
"epoch": 0.14926189174412247,
"grad_norm": 1.5526211261749268,
"learning_rate": 4.7301421426291135e-05,
"loss": 2.9845,
"step": 1638
},
{
"epoch": 0.14935301622015673,
"grad_norm": 2.076298236846924,
"learning_rate": 4.729818614363409e-05,
"loss": 2.8241,
"step": 1639
},
{
"epoch": 0.149444140696191,
"grad_norm": 1.7731536626815796,
"learning_rate": 4.729494903355285e-05,
"loss": 2.9929,
"step": 1640
},
{
"epoch": 0.14953526517222526,
"grad_norm": 1.967139482498169,
"learning_rate": 4.7291710096312736e-05,
"loss": 2.9744,
"step": 1641
},
{
"epoch": 0.14962638964825953,
"grad_norm": 2.6699776649475098,
"learning_rate": 4.728846933217918e-05,
"loss": 3.0459,
"step": 1642
},
{
"epoch": 0.1497175141242938,
"grad_norm": 1.3542306423187256,
"learning_rate": 4.728522674141776e-05,
"loss": 2.9644,
"step": 1643
},
{
"epoch": 0.14980863860032806,
"grad_norm": 2.028887987136841,
"learning_rate": 4.728198232429424e-05,
"loss": 2.9015,
"step": 1644
},
{
"epoch": 0.14989976307636232,
"grad_norm": 1.5386238098144531,
"learning_rate": 4.72787360810745e-05,
"loss": 2.9069,
"step": 1645
},
{
"epoch": 0.14999088755239656,
"grad_norm": 2.95621395111084,
"learning_rate": 4.727548801202461e-05,
"loss": 3.0468,
"step": 1646
},
{
"epoch": 0.15008201202843083,
"grad_norm": 4.1476545333862305,
"learning_rate": 4.7272238117410715e-05,
"loss": 2.6694,
"step": 1647
},
{
"epoch": 0.1501731365044651,
"grad_norm": 2.2101778984069824,
"learning_rate": 4.726898639749919e-05,
"loss": 3.112,
"step": 1648
},
{
"epoch": 0.15026426098049936,
"grad_norm": 3.93254017829895,
"learning_rate": 4.726573285255652e-05,
"loss": 3.3209,
"step": 1649
},
{
"epoch": 0.15035538545653362,
"grad_norm": 1.543944001197815,
"learning_rate": 4.726247748284935e-05,
"loss": 2.9816,
"step": 1650
},
{
"epoch": 0.1504465099325679,
"grad_norm": 2.223081588745117,
"learning_rate": 4.725922028864446e-05,
"loss": 3.0477,
"step": 1651
},
{
"epoch": 0.15053763440860216,
"grad_norm": 2.9665400981903076,
"learning_rate": 4.725596127020879e-05,
"loss": 3.0741,
"step": 1652
},
{
"epoch": 0.15062875888463642,
"grad_norm": 2.24804949760437,
"learning_rate": 4.7252700427809436e-05,
"loss": 3.0598,
"step": 1653
},
{
"epoch": 0.1507198833606707,
"grad_norm": 2.718592643737793,
"learning_rate": 4.724943776171364e-05,
"loss": 3.0261,
"step": 1654
},
{
"epoch": 0.15081100783670495,
"grad_norm": 1.653093695640564,
"learning_rate": 4.7246173272188774e-05,
"loss": 2.9727,
"step": 1655
},
{
"epoch": 0.1509021323127392,
"grad_norm": 2.4713759422302246,
"learning_rate": 4.72429069595024e-05,
"loss": 2.9652,
"step": 1656
},
{
"epoch": 0.15099325678877346,
"grad_norm": 2.3406810760498047,
"learning_rate": 4.723963882392218e-05,
"loss": 2.9158,
"step": 1657
},
{
"epoch": 0.15108438126480772,
"grad_norm": 1.7240417003631592,
"learning_rate": 4.723636886571597e-05,
"loss": 3.0695,
"step": 1658
},
{
"epoch": 0.151175505740842,
"grad_norm": 3.368987798690796,
"learning_rate": 4.723309708515175e-05,
"loss": 3.1482,
"step": 1659
},
{
"epoch": 0.15126663021687625,
"grad_norm": 1.6549758911132812,
"learning_rate": 4.722982348249765e-05,
"loss": 3.017,
"step": 1660
},
{
"epoch": 0.15135775469291052,
"grad_norm": 3.695152759552002,
"learning_rate": 4.722654805802196e-05,
"loss": 3.2281,
"step": 1661
},
{
"epoch": 0.15144887916894478,
"grad_norm": 2.500075578689575,
"learning_rate": 4.7223270811993116e-05,
"loss": 2.9514,
"step": 1662
},
{
"epoch": 0.15154000364497905,
"grad_norm": 4.324959754943848,
"learning_rate": 4.721999174467969e-05,
"loss": 3.2794,
"step": 1663
},
{
"epoch": 0.15163112812101331,
"grad_norm": 2.2761075496673584,
"learning_rate": 4.7216710856350424e-05,
"loss": 2.7181,
"step": 1664
},
{
"epoch": 0.15172225259704758,
"grad_norm": 2.5118227005004883,
"learning_rate": 4.7213428147274195e-05,
"loss": 3.2282,
"step": 1665
},
{
"epoch": 0.15181337707308182,
"grad_norm": 2.3931479454040527,
"learning_rate": 4.721014361772005e-05,
"loss": 2.7632,
"step": 1666
},
{
"epoch": 0.15190450154911608,
"grad_norm": 1.8002700805664062,
"learning_rate": 4.720685726795714e-05,
"loss": 2.9921,
"step": 1667
},
{
"epoch": 0.15199562602515035,
"grad_norm": 1.4867767095565796,
"learning_rate": 4.720356909825482e-05,
"loss": 2.991,
"step": 1668
},
{
"epoch": 0.15208675050118461,
"grad_norm": 2.3881375789642334,
"learning_rate": 4.7200279108882554e-05,
"loss": 3.2776,
"step": 1669
},
{
"epoch": 0.15217787497721888,
"grad_norm": 2.2196474075317383,
"learning_rate": 4.7196987300109974e-05,
"loss": 3.1953,
"step": 1670
},
{
"epoch": 0.15226899945325315,
"grad_norm": 1.6471738815307617,
"learning_rate": 4.719369367220686e-05,
"loss": 3.0896,
"step": 1671
},
{
"epoch": 0.1523601239292874,
"grad_norm": 2.35978364944458,
"learning_rate": 4.7190398225443134e-05,
"loss": 2.9034,
"step": 1672
},
{
"epoch": 0.15245124840532168,
"grad_norm": 3.6606178283691406,
"learning_rate": 4.718710096008887e-05,
"loss": 3.1773,
"step": 1673
},
{
"epoch": 0.15254237288135594,
"grad_norm": 2.787719249725342,
"learning_rate": 4.7183801876414294e-05,
"loss": 3.3528,
"step": 1674
},
{
"epoch": 0.1526334973573902,
"grad_norm": 4.270319938659668,
"learning_rate": 4.718050097468978e-05,
"loss": 2.5025,
"step": 1675
},
{
"epoch": 0.15272462183342445,
"grad_norm": 1.192939281463623,
"learning_rate": 4.717719825518585e-05,
"loss": 2.9122,
"step": 1676
},
{
"epoch": 0.1528157463094587,
"grad_norm": 2.2663092613220215,
"learning_rate": 4.717389371817316e-05,
"loss": 3.0213,
"step": 1677
},
{
"epoch": 0.15290687078549298,
"grad_norm": 2.0562195777893066,
"learning_rate": 4.717058736392256e-05,
"loss": 3.0291,
"step": 1678
},
{
"epoch": 0.15299799526152724,
"grad_norm": 1.2530059814453125,
"learning_rate": 4.716727919270499e-05,
"loss": 2.8827,
"step": 1679
},
{
"epoch": 0.1530891197375615,
"grad_norm": 1.6773467063903809,
"learning_rate": 4.716396920479158e-05,
"loss": 2.9988,
"step": 1680
},
{
"epoch": 0.15318024421359577,
"grad_norm": 2.1727144718170166,
"learning_rate": 4.71606574004536e-05,
"loss": 2.9866,
"step": 1681
},
{
"epoch": 0.15327136868963004,
"grad_norm": 2.6730339527130127,
"learning_rate": 4.715734377996246e-05,
"loss": 3.1705,
"step": 1682
},
{
"epoch": 0.1533624931656643,
"grad_norm": 2.3407809734344482,
"learning_rate": 4.7154028343589726e-05,
"loss": 2.997,
"step": 1683
},
{
"epoch": 0.15345361764169857,
"grad_norm": 2.6133763790130615,
"learning_rate": 4.7150711091607114e-05,
"loss": 3.0567,
"step": 1684
},
{
"epoch": 0.15354474211773284,
"grad_norm": 1.2702889442443848,
"learning_rate": 4.714739202428648e-05,
"loss": 2.9257,
"step": 1685
},
{
"epoch": 0.15363586659376707,
"grad_norm": 2.4354734420776367,
"learning_rate": 4.714407114189984e-05,
"loss": 2.9063,
"step": 1686
},
{
"epoch": 0.15372699106980134,
"grad_norm": 2.6022043228149414,
"learning_rate": 4.714074844471934e-05,
"loss": 3.0259,
"step": 1687
},
{
"epoch": 0.1538181155458356,
"grad_norm": 3.257946252822876,
"learning_rate": 4.713742393301731e-05,
"loss": 3.0208,
"step": 1688
},
{
"epoch": 0.15390924002186987,
"grad_norm": 2.0955233573913574,
"learning_rate": 4.7134097607066194e-05,
"loss": 2.9569,
"step": 1689
},
{
"epoch": 0.15400036449790414,
"grad_norm": 2.2379086017608643,
"learning_rate": 4.713076946713859e-05,
"loss": 3.005,
"step": 1690
},
{
"epoch": 0.1540914889739384,
"grad_norm": 2.551835060119629,
"learning_rate": 4.712743951350727e-05,
"loss": 2.8125,
"step": 1691
},
{
"epoch": 0.15418261344997267,
"grad_norm": 1.4898242950439453,
"learning_rate": 4.7124107746445126e-05,
"loss": 2.9545,
"step": 1692
},
{
"epoch": 0.15427373792600693,
"grad_norm": 3.3373043537139893,
"learning_rate": 4.7120774166225215e-05,
"loss": 2.5542,
"step": 1693
},
{
"epoch": 0.1543648624020412,
"grad_norm": 2.2917416095733643,
"learning_rate": 4.7117438773120725e-05,
"loss": 3.1463,
"step": 1694
},
{
"epoch": 0.15445598687807546,
"grad_norm": 2.7962570190429688,
"learning_rate": 4.7114101567405016e-05,
"loss": 2.9269,
"step": 1695
},
{
"epoch": 0.1545471113541097,
"grad_norm": 2.6020569801330566,
"learning_rate": 4.7110762549351586e-05,
"loss": 3.0507,
"step": 1696
},
{
"epoch": 0.15463823583014397,
"grad_norm": 3.0041913986206055,
"learning_rate": 4.7107421719234066e-05,
"loss": 3.1105,
"step": 1697
},
{
"epoch": 0.15472936030617823,
"grad_norm": 3.319488525390625,
"learning_rate": 4.710407907732627e-05,
"loss": 3.1954,
"step": 1698
},
{
"epoch": 0.1548204847822125,
"grad_norm": 1.2503721714019775,
"learning_rate": 4.7100734623902135e-05,
"loss": 2.8883,
"step": 1699
},
{
"epoch": 0.15491160925824676,
"grad_norm": 2.5423226356506348,
"learning_rate": 4.709738835923575e-05,
"loss": 3.2076,
"step": 1700
},
{
"epoch": 0.15500273373428103,
"grad_norm": 2.350539445877075,
"learning_rate": 4.7094040283601345e-05,
"loss": 3.1558,
"step": 1701
},
{
"epoch": 0.1550938582103153,
"grad_norm": 2.6543209552764893,
"learning_rate": 4.709069039727332e-05,
"loss": 3.2013,
"step": 1702
},
{
"epoch": 0.15518498268634956,
"grad_norm": 2.5844523906707764,
"learning_rate": 4.708733870052621e-05,
"loss": 4.2824,
"step": 1703
},
{
"epoch": 0.15527610716238383,
"grad_norm": 4.499124526977539,
"learning_rate": 4.708398519363469e-05,
"loss": 3.0362,
"step": 1704
},
{
"epoch": 0.1553672316384181,
"grad_norm": 2.7563819885253906,
"learning_rate": 4.70806298768736e-05,
"loss": 3.3136,
"step": 1705
},
{
"epoch": 0.15545835611445233,
"grad_norm": 2.212899923324585,
"learning_rate": 4.707727275051793e-05,
"loss": 2.9758,
"step": 1706
},
{
"epoch": 0.1555494805904866,
"grad_norm": 2.452393054962158,
"learning_rate": 4.70739138148428e-05,
"loss": 3.3339,
"step": 1707
},
{
"epoch": 0.15564060506652086,
"grad_norm": 3.724048614501953,
"learning_rate": 4.7070553070123494e-05,
"loss": 3.231,
"step": 1708
},
{
"epoch": 0.15573172954255513,
"grad_norm": 2.489640235900879,
"learning_rate": 4.706719051663543e-05,
"loss": 3.2228,
"step": 1709
},
{
"epoch": 0.1558228540185894,
"grad_norm": 1.4192156791687012,
"learning_rate": 4.7063826154654175e-05,
"loss": 3.0394,
"step": 1710
},
{
"epoch": 0.15591397849462366,
"grad_norm": 1.8565047979354858,
"learning_rate": 4.706045998445548e-05,
"loss": 2.9931,
"step": 1711
},
{
"epoch": 0.15600510297065792,
"grad_norm": 3.727804183959961,
"learning_rate": 4.70570920063152e-05,
"loss": 3.3733,
"step": 1712
},
{
"epoch": 0.1560962274466922,
"grad_norm": 1.7734785079956055,
"learning_rate": 4.705372222050934e-05,
"loss": 3.134,
"step": 1713
},
{
"epoch": 0.15618735192272645,
"grad_norm": 1.5476933717727661,
"learning_rate": 4.705035062731409e-05,
"loss": 3.0023,
"step": 1714
},
{
"epoch": 0.15627847639876072,
"grad_norm": 1.2864508628845215,
"learning_rate": 4.7046977227005754e-05,
"loss": 2.832,
"step": 1715
},
{
"epoch": 0.15636960087479496,
"grad_norm": 1.4140430688858032,
"learning_rate": 4.704360201986079e-05,
"loss": 2.9195,
"step": 1716
},
{
"epoch": 0.15646072535082922,
"grad_norm": 1.4808127880096436,
"learning_rate": 4.704022500615583e-05,
"loss": 3.0438,
"step": 1717
},
{
"epoch": 0.1565518498268635,
"grad_norm": 2.3502492904663086,
"learning_rate": 4.7036846186167605e-05,
"loss": 3.0985,
"step": 1718
},
{
"epoch": 0.15664297430289775,
"grad_norm": 2.2176742553710938,
"learning_rate": 4.703346556017305e-05,
"loss": 3.1193,
"step": 1719
},
{
"epoch": 0.15673409877893202,
"grad_norm": 2.9510931968688965,
"learning_rate": 4.70300831284492e-05,
"loss": 3.2911,
"step": 1720
},
{
"epoch": 0.15682522325496628,
"grad_norm": 2.358860969543457,
"learning_rate": 4.702669889127328e-05,
"loss": 3.121,
"step": 1721
},
{
"epoch": 0.15691634773100055,
"grad_norm": 3.136566638946533,
"learning_rate": 4.702331284892262e-05,
"loss": 3.0761,
"step": 1722
},
{
"epoch": 0.15700747220703481,
"grad_norm": 1.5300287008285522,
"learning_rate": 4.701992500167473e-05,
"loss": 3.0094,
"step": 1723
},
{
"epoch": 0.15709859668306908,
"grad_norm": 3.3138129711151123,
"learning_rate": 4.701653534980724e-05,
"loss": 3.2065,
"step": 1724
},
{
"epoch": 0.15718972115910335,
"grad_norm": 2.763362407684326,
"learning_rate": 4.7013143893597984e-05,
"loss": 3.1027,
"step": 1725
},
{
"epoch": 0.1572808456351376,
"grad_norm": 1.5856213569641113,
"learning_rate": 4.700975063332487e-05,
"loss": 3.029,
"step": 1726
},
{
"epoch": 0.15737197011117185,
"grad_norm": 3.636809825897217,
"learning_rate": 4.700635556926601e-05,
"loss": 3.1847,
"step": 1727
},
{
"epoch": 0.15746309458720611,
"grad_norm": 3.5086801052093506,
"learning_rate": 4.7002958701699626e-05,
"loss": 2.9192,
"step": 1728
},
{
"epoch": 0.15755421906324038,
"grad_norm": 2.65805983543396,
"learning_rate": 4.699956003090412e-05,
"loss": 3.2344,
"step": 1729
},
{
"epoch": 0.15764534353927465,
"grad_norm": 1.6364223957061768,
"learning_rate": 4.6996159557158015e-05,
"loss": 2.9734,
"step": 1730
},
{
"epoch": 0.1577364680153089,
"grad_norm": 1.9043081998825073,
"learning_rate": 4.6992757280739994e-05,
"loss": 3.0671,
"step": 1731
},
{
"epoch": 0.15782759249134318,
"grad_norm": 2.699431896209717,
"learning_rate": 4.698935320192889e-05,
"loss": 4.0291,
"step": 1732
},
{
"epoch": 0.15791871696737744,
"grad_norm": 3.2991626262664795,
"learning_rate": 4.698594732100369e-05,
"loss": 2.9972,
"step": 1733
},
{
"epoch": 0.1580098414434117,
"grad_norm": 2.3580875396728516,
"learning_rate": 4.6982539638243506e-05,
"loss": 2.992,
"step": 1734
},
{
"epoch": 0.15810096591944597,
"grad_norm": 1.856777310371399,
"learning_rate": 4.6979130153927605e-05,
"loss": 3.1221,
"step": 1735
},
{
"epoch": 0.15819209039548024,
"grad_norm": 2.132002592086792,
"learning_rate": 4.697571886833544e-05,
"loss": 3.1007,
"step": 1736
},
{
"epoch": 0.15828321487151448,
"grad_norm": 1.3877090215682983,
"learning_rate": 4.697230578174654e-05,
"loss": 3.0206,
"step": 1737
},
{
"epoch": 0.15837433934754874,
"grad_norm": 2.1733388900756836,
"learning_rate": 4.6968890894440646e-05,
"loss": 3.1006,
"step": 1738
},
{
"epoch": 0.158465463823583,
"grad_norm": 2.9271535873413086,
"learning_rate": 4.696547420669761e-05,
"loss": 4.3027,
"step": 1739
},
{
"epoch": 0.15855658829961727,
"grad_norm": 2.7141754627227783,
"learning_rate": 4.696205571879745e-05,
"loss": 4.056,
"step": 1740
},
{
"epoch": 0.15864771277565154,
"grad_norm": 1.7694344520568848,
"learning_rate": 4.6958635431020315e-05,
"loss": 2.9097,
"step": 1741
},
{
"epoch": 0.1587388372516858,
"grad_norm": 2.743277072906494,
"learning_rate": 4.695521334364653e-05,
"loss": 3.0041,
"step": 1742
},
{
"epoch": 0.15882996172772007,
"grad_norm": 2.976694107055664,
"learning_rate": 4.6951789456956524e-05,
"loss": 3.1339,
"step": 1743
},
{
"epoch": 0.15892108620375434,
"grad_norm": 2.281081199645996,
"learning_rate": 4.6948363771230917e-05,
"loss": 3.0526,
"step": 1744
},
{
"epoch": 0.1590122106797886,
"grad_norm": 2.0678000450134277,
"learning_rate": 4.694493628675044e-05,
"loss": 3.2022,
"step": 1745
},
{
"epoch": 0.15910333515582287,
"grad_norm": 1.403245210647583,
"learning_rate": 4.694150700379601e-05,
"loss": 2.8774,
"step": 1746
},
{
"epoch": 0.1591944596318571,
"grad_norm": 2.7492146492004395,
"learning_rate": 4.693807592264866e-05,
"loss": 3.3178,
"step": 1747
},
{
"epoch": 0.15928558410789137,
"grad_norm": 2.7067978382110596,
"learning_rate": 4.693464304358957e-05,
"loss": 3.1067,
"step": 1748
},
{
"epoch": 0.15937670858392564,
"grad_norm": 3.026930093765259,
"learning_rate": 4.693120836690009e-05,
"loss": 2.6783,
"step": 1749
},
{
"epoch": 0.1594678330599599,
"grad_norm": 2.294149398803711,
"learning_rate": 4.6927771892861715e-05,
"loss": 3.0602,
"step": 1750
},
{
"epoch": 0.15955895753599417,
"grad_norm": 2.4239230155944824,
"learning_rate": 4.6924333621756055e-05,
"loss": 3.366,
"step": 1751
},
{
"epoch": 0.15965008201202843,
"grad_norm": 1.500225305557251,
"learning_rate": 4.692089355386491e-05,
"loss": 3.0087,
"step": 1752
},
{
"epoch": 0.1597412064880627,
"grad_norm": 3.9721415042877197,
"learning_rate": 4.691745168947019e-05,
"loss": 1.5708,
"step": 1753
},
{
"epoch": 0.15983233096409696,
"grad_norm": 2.3244717121124268,
"learning_rate": 4.6914008028853974e-05,
"loss": 3.062,
"step": 1754
},
{
"epoch": 0.15992345544013123,
"grad_norm": 3.399137258529663,
"learning_rate": 4.6910562572298496e-05,
"loss": 3.275,
"step": 1755
},
{
"epoch": 0.1600145799161655,
"grad_norm": 1.5590107440948486,
"learning_rate": 4.690711532008611e-05,
"loss": 3.0246,
"step": 1756
},
{
"epoch": 0.16010570439219973,
"grad_norm": 1.5847901105880737,
"learning_rate": 4.690366627249934e-05,
"loss": 3.0687,
"step": 1757
},
{
"epoch": 0.160196828868234,
"grad_norm": 3.033726453781128,
"learning_rate": 4.690021542982084e-05,
"loss": 3.1466,
"step": 1758
},
{
"epoch": 0.16028795334426826,
"grad_norm": 2.4416680335998535,
"learning_rate": 4.689676279233344e-05,
"loss": 3.0355,
"step": 1759
},
{
"epoch": 0.16037907782030253,
"grad_norm": 3.199636459350586,
"learning_rate": 4.689330836032007e-05,
"loss": 2.9404,
"step": 1760
},
{
"epoch": 0.1604702022963368,
"grad_norm": 2.1317837238311768,
"learning_rate": 4.688985213406386e-05,
"loss": 3.3088,
"step": 1761
},
{
"epoch": 0.16056132677237106,
"grad_norm": 2.5305070877075195,
"learning_rate": 4.6886394113848034e-05,
"loss": 2.8364,
"step": 1762
},
{
"epoch": 0.16065245124840533,
"grad_norm": 1.2650429010391235,
"learning_rate": 4.6882934299956014e-05,
"loss": 2.881,
"step": 1763
},
{
"epoch": 0.1607435757244396,
"grad_norm": 3.0399343967437744,
"learning_rate": 4.6879472692671344e-05,
"loss": 2.9391,
"step": 1764
},
{
"epoch": 0.16083470020047386,
"grad_norm": 1.4439702033996582,
"learning_rate": 4.68760092922777e-05,
"loss": 2.9775,
"step": 1765
},
{
"epoch": 0.16092582467650812,
"grad_norm": 3.708564281463623,
"learning_rate": 4.6872544099058934e-05,
"loss": 2.7142,
"step": 1766
},
{
"epoch": 0.16101694915254236,
"grad_norm": 2.397183656692505,
"learning_rate": 4.686907711329903e-05,
"loss": 3.2577,
"step": 1767
},
{
"epoch": 0.16110807362857663,
"grad_norm": 5.58218240737915,
"learning_rate": 4.686560833528213e-05,
"loss": 2.4521,
"step": 1768
},
{
"epoch": 0.1611991981046109,
"grad_norm": 1.9717990159988403,
"learning_rate": 4.6862137765292493e-05,
"loss": 3.0401,
"step": 1769
},
{
"epoch": 0.16129032258064516,
"grad_norm": 2.144928455352783,
"learning_rate": 4.685866540361456e-05,
"loss": 3.0899,
"step": 1770
},
{
"epoch": 0.16138144705667942,
"grad_norm": 2.322648286819458,
"learning_rate": 4.685519125053289e-05,
"loss": 3.0279,
"step": 1771
},
{
"epoch": 0.1614725715327137,
"grad_norm": 1.8815804719924927,
"learning_rate": 4.6851715306332235e-05,
"loss": 3.0445,
"step": 1772
},
{
"epoch": 0.16156369600874795,
"grad_norm": 3.7805604934692383,
"learning_rate": 4.684823757129743e-05,
"loss": 2.8023,
"step": 1773
},
{
"epoch": 0.16165482048478222,
"grad_norm": 3.898134231567383,
"learning_rate": 4.684475804571351e-05,
"loss": 3.1171,
"step": 1774
},
{
"epoch": 0.16174594496081648,
"grad_norm": 2.4973833560943604,
"learning_rate": 4.684127672986562e-05,
"loss": 3.3566,
"step": 1775
},
{
"epoch": 0.16183706943685075,
"grad_norm": 2.69533371925354,
"learning_rate": 4.683779362403908e-05,
"loss": 3.1604,
"step": 1776
},
{
"epoch": 0.161928193912885,
"grad_norm": 1.983508825302124,
"learning_rate": 4.683430872851934e-05,
"loss": 2.9452,
"step": 1777
},
{
"epoch": 0.16201931838891925,
"grad_norm": 2.2573885917663574,
"learning_rate": 4.6830822043591994e-05,
"loss": 3.1903,
"step": 1778
},
{
"epoch": 0.16211044286495352,
"grad_norm": 1.7813071012496948,
"learning_rate": 4.68273335695428e-05,
"loss": 3.0245,
"step": 1779
},
{
"epoch": 0.16220156734098778,
"grad_norm": 2.4912261962890625,
"learning_rate": 4.682384330665765e-05,
"loss": 3.1379,
"step": 1780
},
{
"epoch": 0.16229269181702205,
"grad_norm": 2.328861713409424,
"learning_rate": 4.682035125522258e-05,
"loss": 3.1379,
"step": 1781
},
{
"epoch": 0.16238381629305632,
"grad_norm": 1.43288254737854,
"learning_rate": 4.681685741552379e-05,
"loss": 2.959,
"step": 1782
},
{
"epoch": 0.16247494076909058,
"grad_norm": 3.625856876373291,
"learning_rate": 4.6813361787847585e-05,
"loss": 3.088,
"step": 1783
},
{
"epoch": 0.16256606524512485,
"grad_norm": 2.959545612335205,
"learning_rate": 4.680986437248048e-05,
"loss": 3.1953,
"step": 1784
},
{
"epoch": 0.1626571897211591,
"grad_norm": 4.424033164978027,
"learning_rate": 4.680636516970908e-05,
"loss": 3.2453,
"step": 1785
},
{
"epoch": 0.16274831419719338,
"grad_norm": 2.5210375785827637,
"learning_rate": 4.680286417982017e-05,
"loss": 3.127,
"step": 1786
},
{
"epoch": 0.16283943867322762,
"grad_norm": 2.69915509223938,
"learning_rate": 4.679936140310066e-05,
"loss": 2.9651,
"step": 1787
},
{
"epoch": 0.16293056314926188,
"grad_norm": 2.2764296531677246,
"learning_rate": 4.679585683983763e-05,
"loss": 2.4394,
"step": 1788
},
{
"epoch": 0.16302168762529615,
"grad_norm": 2.4969351291656494,
"learning_rate": 4.679235049031827e-05,
"loss": 3.1952,
"step": 1789
},
{
"epoch": 0.1631128121013304,
"grad_norm": 2.099569797515869,
"learning_rate": 4.6788842354829965e-05,
"loss": 2.9982,
"step": 1790
},
{
"epoch": 0.16320393657736468,
"grad_norm": 2.138705015182495,
"learning_rate": 4.67853324336602e-05,
"loss": 2.8346,
"step": 1791
},
{
"epoch": 0.16329506105339894,
"grad_norm": 3.4511311054229736,
"learning_rate": 4.6781820727096634e-05,
"loss": 2.9689,
"step": 1792
},
{
"epoch": 0.1633861855294332,
"grad_norm": 1.415783166885376,
"learning_rate": 4.677830723542708e-05,
"loss": 2.9163,
"step": 1793
},
{
"epoch": 0.16347731000546747,
"grad_norm": 1.5046038627624512,
"learning_rate": 4.677479195893946e-05,
"loss": 3.024,
"step": 1794
},
{
"epoch": 0.16356843448150174,
"grad_norm": 1.4293829202651978,
"learning_rate": 4.677127489792188e-05,
"loss": 3.0218,
"step": 1795
},
{
"epoch": 0.163659558957536,
"grad_norm": 2.071714401245117,
"learning_rate": 4.676775605266256e-05,
"loss": 3.092,
"step": 1796
},
{
"epoch": 0.16375068343357024,
"grad_norm": 2.291851043701172,
"learning_rate": 4.676423542344991e-05,
"loss": 2.9331,
"step": 1797
},
{
"epoch": 0.1638418079096045,
"grad_norm": 2.606294870376587,
"learning_rate": 4.676071301057243e-05,
"loss": 3.2264,
"step": 1798
},
{
"epoch": 0.16393293238563877,
"grad_norm": 2.889845848083496,
"learning_rate": 4.675718881431882e-05,
"loss": 3.1705,
"step": 1799
},
{
"epoch": 0.16402405686167304,
"grad_norm": 3.2519989013671875,
"learning_rate": 4.675366283497788e-05,
"loss": 4.5846,
"step": 1800
},
{
"epoch": 0.1641151813377073,
"grad_norm": 2.266395092010498,
"learning_rate": 4.67501350728386e-05,
"loss": 3.4063,
"step": 1801
},
{
"epoch": 0.16420630581374157,
"grad_norm": 2.0074515342712402,
"learning_rate": 4.674660552819007e-05,
"loss": 3.1425,
"step": 1802
},
{
"epoch": 0.16429743028977584,
"grad_norm": 5.168177604675293,
"learning_rate": 4.6743074201321577e-05,
"loss": 3.0502,
"step": 1803
},
{
"epoch": 0.1643885547658101,
"grad_norm": 1.329056978225708,
"learning_rate": 4.673954109252251e-05,
"loss": 2.9967,
"step": 1804
},
{
"epoch": 0.16447967924184437,
"grad_norm": 3.3117220401763916,
"learning_rate": 4.6736006202082414e-05,
"loss": 3.1037,
"step": 1805
},
{
"epoch": 0.16457080371787863,
"grad_norm": 2.464008331298828,
"learning_rate": 4.6732469530291e-05,
"loss": 2.9688,
"step": 1806
},
{
"epoch": 0.1646619281939129,
"grad_norm": 2.8803305625915527,
"learning_rate": 4.672893107743812e-05,
"loss": 3.1368,
"step": 1807
},
{
"epoch": 0.16475305266994714,
"grad_norm": 2.8774044513702393,
"learning_rate": 4.672539084381375e-05,
"loss": 3.0782,
"step": 1808
},
{
"epoch": 0.1648441771459814,
"grad_norm": 2.2963814735412598,
"learning_rate": 4.672184882970803e-05,
"loss": 2.9884,
"step": 1809
},
{
"epoch": 0.16493530162201567,
"grad_norm": 2.4225456714630127,
"learning_rate": 4.671830503541124e-05,
"loss": 3.3793,
"step": 1810
},
{
"epoch": 0.16502642609804993,
"grad_norm": 1.4230417013168335,
"learning_rate": 4.671475946121381e-05,
"loss": 3.1628,
"step": 1811
},
{
"epoch": 0.1651175505740842,
"grad_norm": 2.5980818271636963,
"learning_rate": 4.671121210740631e-05,
"loss": 3.2377,
"step": 1812
},
{
"epoch": 0.16520867505011846,
"grad_norm": 1.8780187368392944,
"learning_rate": 4.6707662974279464e-05,
"loss": 2.9965,
"step": 1813
},
{
"epoch": 0.16529979952615273,
"grad_norm": 1.3507145643234253,
"learning_rate": 4.6704112062124146e-05,
"loss": 2.8605,
"step": 1814
},
{
"epoch": 0.165390924002187,
"grad_norm": 2.861724376678467,
"learning_rate": 4.6700559371231345e-05,
"loss": 2.9891,
"step": 1815
},
{
"epoch": 0.16548204847822126,
"grad_norm": 1.8451899290084839,
"learning_rate": 4.6697004901892244e-05,
"loss": 3.0085,
"step": 1816
},
{
"epoch": 0.16557317295425553,
"grad_norm": 2.6136531829833984,
"learning_rate": 4.6693448654398126e-05,
"loss": 3.164,
"step": 1817
},
{
"epoch": 0.16566429743028976,
"grad_norm": 2.2334072589874268,
"learning_rate": 4.668989062904045e-05,
"loss": 3.0821,
"step": 1818
},
{
"epoch": 0.16575542190632403,
"grad_norm": 2.377607583999634,
"learning_rate": 4.66863308261108e-05,
"loss": 2.9773,
"step": 1819
},
{
"epoch": 0.1658465463823583,
"grad_norm": 2.5205888748168945,
"learning_rate": 4.6682769245900924e-05,
"loss": 3.0648,
"step": 1820
},
{
"epoch": 0.16593767085839256,
"grad_norm": 1.433559536933899,
"learning_rate": 4.667920588870271e-05,
"loss": 2.9843,
"step": 1821
},
{
"epoch": 0.16602879533442683,
"grad_norm": 2.934572458267212,
"learning_rate": 4.667564075480818e-05,
"loss": 2.8275,
"step": 1822
},
{
"epoch": 0.1661199198104611,
"grad_norm": 4.768115997314453,
"learning_rate": 4.6672073844509524e-05,
"loss": 2.8542,
"step": 1823
},
{
"epoch": 0.16621104428649536,
"grad_norm": 2.2473628520965576,
"learning_rate": 4.666850515809905e-05,
"loss": 2.9707,
"step": 1824
},
{
"epoch": 0.16630216876252962,
"grad_norm": 2.0815329551696777,
"learning_rate": 4.6664934695869226e-05,
"loss": 3.1296,
"step": 1825
},
{
"epoch": 0.1663932932385639,
"grad_norm": 2.7739152908325195,
"learning_rate": 4.666136245811268e-05,
"loss": 1.463,
"step": 1826
},
{
"epoch": 0.16648441771459815,
"grad_norm": 1.61172616481781,
"learning_rate": 4.6657788445122156e-05,
"loss": 3.1061,
"step": 1827
},
{
"epoch": 0.1665755421906324,
"grad_norm": 4.994141101837158,
"learning_rate": 4.6654212657190574e-05,
"loss": 3.2571,
"step": 1828
},
{
"epoch": 0.16666666666666666,
"grad_norm": 1.818001627922058,
"learning_rate": 4.665063509461097e-05,
"loss": 3.0249,
"step": 1829
},
{
"epoch": 0.16675779114270092,
"grad_norm": 2.999178647994995,
"learning_rate": 4.664705575767654e-05,
"loss": 4.4251,
"step": 1830
},
{
"epoch": 0.1668489156187352,
"grad_norm": 2.0437114238739014,
"learning_rate": 4.6643474646680636e-05,
"loss": 3.1958,
"step": 1831
},
{
"epoch": 0.16694004009476945,
"grad_norm": 2.0116827487945557,
"learning_rate": 4.663989176191673e-05,
"loss": 2.7428,
"step": 1832
},
{
"epoch": 0.16703116457080372,
"grad_norm": 1.6298675537109375,
"learning_rate": 4.6636307103678464e-05,
"loss": 2.9703,
"step": 1833
},
{
"epoch": 0.16712228904683799,
"grad_norm": 2.848588466644287,
"learning_rate": 4.663272067225961e-05,
"loss": 3.0825,
"step": 1834
},
{
"epoch": 0.16721341352287225,
"grad_norm": 1.2318345308303833,
"learning_rate": 4.66291324679541e-05,
"loss": 2.8645,
"step": 1835
},
{
"epoch": 0.16730453799890652,
"grad_norm": 3.15191388130188,
"learning_rate": 4.6625542491055985e-05,
"loss": 3.2674,
"step": 1836
},
{
"epoch": 0.16739566247494078,
"grad_norm": 1.4416121244430542,
"learning_rate": 4.662195074185949e-05,
"loss": 2.9721,
"step": 1837
},
{
"epoch": 0.16748678695097502,
"grad_norm": 1.4033936262130737,
"learning_rate": 4.661835722065896e-05,
"loss": 2.8413,
"step": 1838
},
{
"epoch": 0.16757791142700929,
"grad_norm": 3.393137216567993,
"learning_rate": 4.661476192774892e-05,
"loss": 3.255,
"step": 1839
},
{
"epoch": 0.16766903590304355,
"grad_norm": 1.4797853231430054,
"learning_rate": 4.6611164863424e-05,
"loss": 2.9213,
"step": 1840
},
{
"epoch": 0.16776016037907782,
"grad_norm": 1.99228835105896,
"learning_rate": 4.660756602797899e-05,
"loss": 3.0292,
"step": 1841
},
{
"epoch": 0.16785128485511208,
"grad_norm": 3.2348062992095947,
"learning_rate": 4.6603965421708845e-05,
"loss": 3.0822,
"step": 1842
},
{
"epoch": 0.16794240933114635,
"grad_norm": 1.782169222831726,
"learning_rate": 4.660036304490864e-05,
"loss": 2.9511,
"step": 1843
},
{
"epoch": 0.1680335338071806,
"grad_norm": 4.648063659667969,
"learning_rate": 4.6596758897873605e-05,
"loss": 3.0001,
"step": 1844
},
{
"epoch": 0.16812465828321488,
"grad_norm": 2.3051891326904297,
"learning_rate": 4.659315298089912e-05,
"loss": 2.6815,
"step": 1845
},
{
"epoch": 0.16821578275924914,
"grad_norm": 3.1102521419525146,
"learning_rate": 4.6589545294280694e-05,
"loss": 3.291,
"step": 1846
},
{
"epoch": 0.1683069072352834,
"grad_norm": 2.6495754718780518,
"learning_rate": 4.6585935838313996e-05,
"loss": 3.3111,
"step": 1847
},
{
"epoch": 0.16839803171131765,
"grad_norm": 1.6148706674575806,
"learning_rate": 4.658232461329484e-05,
"loss": 3.0352,
"step": 1848
},
{
"epoch": 0.1684891561873519,
"grad_norm": 2.024562120437622,
"learning_rate": 4.657871161951917e-05,
"loss": 3.2065,
"step": 1849
},
{
"epoch": 0.16858028066338618,
"grad_norm": 3.0572400093078613,
"learning_rate": 4.657509685728309e-05,
"loss": 2.8895,
"step": 1850
},
{
"epoch": 0.16867140513942044,
"grad_norm": 3.482790946960449,
"learning_rate": 4.657148032688285e-05,
"loss": 3.1904,
"step": 1851
},
{
"epoch": 0.1687625296154547,
"grad_norm": 2.436032772064209,
"learning_rate": 4.656786202861483e-05,
"loss": 2.8036,
"step": 1852
},
{
"epoch": 0.16885365409148897,
"grad_norm": 3.0358383655548096,
"learning_rate": 4.6564241962775564e-05,
"loss": 3.1313,
"step": 1853
},
{
"epoch": 0.16894477856752324,
"grad_norm": 2.4795777797698975,
"learning_rate": 4.656062012966173e-05,
"loss": 3.2146,
"step": 1854
},
{
"epoch": 0.1690359030435575,
"grad_norm": 1.5819923877716064,
"learning_rate": 4.655699652957016e-05,
"loss": 3.0148,
"step": 1855
},
{
"epoch": 0.16912702751959177,
"grad_norm": 1.9436419010162354,
"learning_rate": 4.655337116279782e-05,
"loss": 2.9958,
"step": 1856
},
{
"epoch": 0.16921815199562604,
"grad_norm": 2.404379367828369,
"learning_rate": 4.6549744029641816e-05,
"loss": 2.6195,
"step": 1857
},
{
"epoch": 0.16930927647166027,
"grad_norm": 3.0540213584899902,
"learning_rate": 4.6546115130399414e-05,
"loss": 2.9237,
"step": 1858
},
{
"epoch": 0.16940040094769454,
"grad_norm": 1.2016074657440186,
"learning_rate": 4.6542484465368006e-05,
"loss": 3.0172,
"step": 1859
},
{
"epoch": 0.1694915254237288,
"grad_norm": 2.7861342430114746,
"learning_rate": 4.653885203484515e-05,
"loss": 4.522,
"step": 1860
},
{
"epoch": 0.16958264989976307,
"grad_norm": 2.5136163234710693,
"learning_rate": 4.6535217839128545e-05,
"loss": 3.2934,
"step": 1861
},
{
"epoch": 0.16967377437579734,
"grad_norm": 1.917813777923584,
"learning_rate": 4.6531581878516005e-05,
"loss": 2.982,
"step": 1862
},
{
"epoch": 0.1697648988518316,
"grad_norm": 2.3996527194976807,
"learning_rate": 4.652794415330552e-05,
"loss": 3.2438,
"step": 1863
},
{
"epoch": 0.16985602332786587,
"grad_norm": 2.1027286052703857,
"learning_rate": 4.652430466379523e-05,
"loss": 2.9588,
"step": 1864
},
{
"epoch": 0.16994714780390013,
"grad_norm": 1.443307876586914,
"learning_rate": 4.652066341028338e-05,
"loss": 2.993,
"step": 1865
},
{
"epoch": 0.1700382722799344,
"grad_norm": 2.603544235229492,
"learning_rate": 4.6517020393068414e-05,
"loss": 3.099,
"step": 1866
},
{
"epoch": 0.17012939675596866,
"grad_norm": 2.022552728652954,
"learning_rate": 4.651337561244887e-05,
"loss": 3.4147,
"step": 1867
},
{
"epoch": 0.1702205212320029,
"grad_norm": 3.2061586380004883,
"learning_rate": 4.650972906872346e-05,
"loss": 3.2699,
"step": 1868
},
{
"epoch": 0.17031164570803717,
"grad_norm": 1.5013995170593262,
"learning_rate": 4.650608076219103e-05,
"loss": 2.9541,
"step": 1869
},
{
"epoch": 0.17040277018407143,
"grad_norm": 1.4068304300308228,
"learning_rate": 4.650243069315058e-05,
"loss": 2.9621,
"step": 1870
},
{
"epoch": 0.1704938946601057,
"grad_norm": 1.5417219400405884,
"learning_rate": 4.649877886190124e-05,
"loss": 2.9637,
"step": 1871
},
{
"epoch": 0.17058501913613996,
"grad_norm": 1.9698114395141602,
"learning_rate": 4.649512526874229e-05,
"loss": 2.9846,
"step": 1872
},
{
"epoch": 0.17067614361217423,
"grad_norm": 2.276402473449707,
"learning_rate": 4.649146991397317e-05,
"loss": 3.2274,
"step": 1873
},
{
"epoch": 0.1707672680882085,
"grad_norm": 1.367923617362976,
"learning_rate": 4.648781279789344e-05,
"loss": 2.9497,
"step": 1874
},
{
"epoch": 0.17085839256424276,
"grad_norm": 2.30353045463562,
"learning_rate": 4.648415392080281e-05,
"loss": 2.8962,
"step": 1875
},
{
"epoch": 0.17094951704027703,
"grad_norm": 2.0635387897491455,
"learning_rate": 4.6480493283001145e-05,
"loss": 3.2399,
"step": 1876
},
{
"epoch": 0.1710406415163113,
"grad_norm": 1.7822022438049316,
"learning_rate": 4.6476830884788456e-05,
"loss": 2.6609,
"step": 1877
},
{
"epoch": 0.17113176599234553,
"grad_norm": 1.4074290990829468,
"learning_rate": 4.647316672646488e-05,
"loss": 3.0442,
"step": 1878
},
{
"epoch": 0.1712228904683798,
"grad_norm": 1.7174943685531616,
"learning_rate": 4.6469500808330724e-05,
"loss": 3.3071,
"step": 1879
},
{
"epoch": 0.17131401494441406,
"grad_norm": 2.6318490505218506,
"learning_rate": 4.6465833130686405e-05,
"loss": 3.231,
"step": 1880
},
{
"epoch": 0.17140513942044833,
"grad_norm": 2.40649676322937,
"learning_rate": 4.646216369383252e-05,
"loss": 3.1019,
"step": 1881
},
{
"epoch": 0.1714962638964826,
"grad_norm": 2.1710281372070312,
"learning_rate": 4.645849249806977e-05,
"loss": 2.9026,
"step": 1882
},
{
"epoch": 0.17158738837251686,
"grad_norm": 3.329578161239624,
"learning_rate": 4.645481954369906e-05,
"loss": 4.2214,
"step": 1883
},
{
"epoch": 0.17167851284855112,
"grad_norm": 2.663468360900879,
"learning_rate": 4.6451144831021375e-05,
"loss": 3.0867,
"step": 1884
},
{
"epoch": 0.1717696373245854,
"grad_norm": 3.370527982711792,
"learning_rate": 4.6447468360337876e-05,
"loss": 3.0162,
"step": 1885
},
{
"epoch": 0.17186076180061965,
"grad_norm": 1.3907276391983032,
"learning_rate": 4.6443790131949874e-05,
"loss": 2.9264,
"step": 1886
},
{
"epoch": 0.17195188627665392,
"grad_norm": 3.615497350692749,
"learning_rate": 4.644011014615881e-05,
"loss": 3.0735,
"step": 1887
},
{
"epoch": 0.17204301075268819,
"grad_norm": 1.8801993131637573,
"learning_rate": 4.643642840326627e-05,
"loss": 3.1026,
"step": 1888
},
{
"epoch": 0.17213413522872242,
"grad_norm": 1.2994283437728882,
"learning_rate": 4.6432744903573996e-05,
"loss": 2.9899,
"step": 1889
},
{
"epoch": 0.1722252597047567,
"grad_norm": 3.113603115081787,
"learning_rate": 4.6429059647383867e-05,
"loss": 2.4245,
"step": 1890
},
{
"epoch": 0.17231638418079095,
"grad_norm": 1.4192283153533936,
"learning_rate": 4.642537263499788e-05,
"loss": 3.0991,
"step": 1891
},
{
"epoch": 0.17240750865682522,
"grad_norm": 1.6561801433563232,
"learning_rate": 4.642168386671823e-05,
"loss": 3.0053,
"step": 1892
},
{
"epoch": 0.17249863313285949,
"grad_norm": 2.518643617630005,
"learning_rate": 4.64179933428472e-05,
"loss": 3.0779,
"step": 1893
},
{
"epoch": 0.17258975760889375,
"grad_norm": 1.9463084936141968,
"learning_rate": 4.641430106368726e-05,
"loss": 2.9763,
"step": 1894
},
{
"epoch": 0.17268088208492802,
"grad_norm": 2.9101171493530273,
"learning_rate": 4.641060702954101e-05,
"loss": 3.1603,
"step": 1895
},
{
"epoch": 0.17277200656096228,
"grad_norm": 2.2001545429229736,
"learning_rate": 4.640691124071118e-05,
"loss": 2.7663,
"step": 1896
},
{
"epoch": 0.17286313103699655,
"grad_norm": 2.230363130569458,
"learning_rate": 4.6403213697500656e-05,
"loss": 2.4912,
"step": 1897
},
{
"epoch": 0.1729542555130308,
"grad_norm": 2.127821445465088,
"learning_rate": 4.639951440021247e-05,
"loss": 3.0039,
"step": 1898
},
{
"epoch": 0.17304537998906505,
"grad_norm": 2.014512538909912,
"learning_rate": 4.639581334914979e-05,
"loss": 3.266,
"step": 1899
},
{
"epoch": 0.17313650446509932,
"grad_norm": 2.769667625427246,
"learning_rate": 4.639211054461593e-05,
"loss": 3.225,
"step": 1900
},
{
"epoch": 0.17322762894113358,
"grad_norm": 1.3063126802444458,
"learning_rate": 4.6388405986914365e-05,
"loss": 2.9487,
"step": 1901
},
{
"epoch": 0.17331875341716785,
"grad_norm": 2.091466188430786,
"learning_rate": 4.6384699676348674e-05,
"loss": 3.1492,
"step": 1902
},
{
"epoch": 0.1734098778932021,
"grad_norm": 2.709743022918701,
"learning_rate": 4.6380991613222625e-05,
"loss": 3.2338,
"step": 1903
},
{
"epoch": 0.17350100236923638,
"grad_norm": 2.078986406326294,
"learning_rate": 4.637728179784009e-05,
"loss": 3.4682,
"step": 1904
},
{
"epoch": 0.17359212684527064,
"grad_norm": 2.516191005706787,
"learning_rate": 4.637357023050512e-05,
"loss": 3.0394,
"step": 1905
},
{
"epoch": 0.1736832513213049,
"grad_norm": 3.148676872253418,
"learning_rate": 4.636985691152188e-05,
"loss": 3.1488,
"step": 1906
},
{
"epoch": 0.17377437579733918,
"grad_norm": 3.0405311584472656,
"learning_rate": 4.63661418411947e-05,
"loss": 3.0929,
"step": 1907
},
{
"epoch": 0.17386550027337344,
"grad_norm": 1.5074187517166138,
"learning_rate": 4.6362425019828035e-05,
"loss": 2.9689,
"step": 1908
},
{
"epoch": 0.17395662474940768,
"grad_norm": 1.9838029146194458,
"learning_rate": 4.635870644772651e-05,
"loss": 2.8708,
"step": 1909
},
{
"epoch": 0.17404774922544194,
"grad_norm": 1.4794012308120728,
"learning_rate": 4.635498612519486e-05,
"loss": 3.1818,
"step": 1910
},
{
"epoch": 0.1741388737014762,
"grad_norm": 1.3626906871795654,
"learning_rate": 4.6351264052537984e-05,
"loss": 2.8804,
"step": 1911
},
{
"epoch": 0.17422999817751048,
"grad_norm": 1.694874882698059,
"learning_rate": 4.6347540230060924e-05,
"loss": 3.1085,
"step": 1912
},
{
"epoch": 0.17432112265354474,
"grad_norm": 3.162107467651367,
"learning_rate": 4.634381465806886e-05,
"loss": 3.2248,
"step": 1913
},
{
"epoch": 0.174412247129579,
"grad_norm": 1.7732110023498535,
"learning_rate": 4.6340087336867115e-05,
"loss": 3.015,
"step": 1914
},
{
"epoch": 0.17450337160561327,
"grad_norm": 2.3251988887786865,
"learning_rate": 4.633635826676116e-05,
"loss": 2.833,
"step": 1915
},
{
"epoch": 0.17459449608164754,
"grad_norm": 2.435624599456787,
"learning_rate": 4.633262744805661e-05,
"loss": 3.0755,
"step": 1916
},
{
"epoch": 0.1746856205576818,
"grad_norm": 3.7790379524230957,
"learning_rate": 4.6328894881059216e-05,
"loss": 3.1371,
"step": 1917
},
{
"epoch": 0.17477674503371607,
"grad_norm": 5.323071002960205,
"learning_rate": 4.6325160566074875e-05,
"loss": 2.9746,
"step": 1918
},
{
"epoch": 0.1748678695097503,
"grad_norm": 3.0934462547302246,
"learning_rate": 4.632142450340964e-05,
"loss": 3.2001,
"step": 1919
},
{
"epoch": 0.17495899398578457,
"grad_norm": 1.9771567583084106,
"learning_rate": 4.631768669336968e-05,
"loss": 3.1897,
"step": 1920
},
{
"epoch": 0.17505011846181884,
"grad_norm": 4.835346698760986,
"learning_rate": 4.631394713626133e-05,
"loss": 3.0367,
"step": 1921
},
{
"epoch": 0.1751412429378531,
"grad_norm": 2.225339889526367,
"learning_rate": 4.631020583239107e-05,
"loss": 2.8157,
"step": 1922
},
{
"epoch": 0.17523236741388737,
"grad_norm": 3.113792657852173,
"learning_rate": 4.63064627820655e-05,
"loss": 2.4321,
"step": 1923
},
{
"epoch": 0.17532349188992163,
"grad_norm": 1.4210890531539917,
"learning_rate": 4.630271798559138e-05,
"loss": 3.0678,
"step": 1924
},
{
"epoch": 0.1754146163659559,
"grad_norm": 1.9921435117721558,
"learning_rate": 4.629897144327563e-05,
"loss": 3.067,
"step": 1925
},
{
"epoch": 0.17550574084199017,
"grad_norm": 2.643051862716675,
"learning_rate": 4.6295223155425274e-05,
"loss": 2.9527,
"step": 1926
},
{
"epoch": 0.17559686531802443,
"grad_norm": 1.9968456029891968,
"learning_rate": 4.6291473122347494e-05,
"loss": 2.8901,
"step": 1927
},
{
"epoch": 0.1756879897940587,
"grad_norm": 2.101381301879883,
"learning_rate": 4.628772134434964e-05,
"loss": 2.988,
"step": 1928
},
{
"epoch": 0.17577911427009293,
"grad_norm": 3.2429404258728027,
"learning_rate": 4.628396782173918e-05,
"loss": 3.1101,
"step": 1929
},
{
"epoch": 0.1758702387461272,
"grad_norm": 2.9418232440948486,
"learning_rate": 4.6280212554823715e-05,
"loss": 3.2891,
"step": 1930
},
{
"epoch": 0.17596136322216147,
"grad_norm": 1.3050503730773926,
"learning_rate": 4.6276455543911026e-05,
"loss": 2.9632,
"step": 1931
},
{
"epoch": 0.17605248769819573,
"grad_norm": 1.7188957929611206,
"learning_rate": 4.627269678930899e-05,
"loss": 3.0123,
"step": 1932
},
{
"epoch": 0.17614361217423,
"grad_norm": 3.276196002960205,
"learning_rate": 4.626893629132567e-05,
"loss": 3.3355,
"step": 1933
},
{
"epoch": 0.17623473665026426,
"grad_norm": 2.130753517150879,
"learning_rate": 4.6265174050269245e-05,
"loss": 3.1184,
"step": 1934
},
{
"epoch": 0.17632586112629853,
"grad_norm": 3.7487149238586426,
"learning_rate": 4.626141006644805e-05,
"loss": 3.2589,
"step": 1935
},
{
"epoch": 0.1764169856023328,
"grad_norm": 1.4219084978103638,
"learning_rate": 4.625764434017056e-05,
"loss": 2.9983,
"step": 1936
},
{
"epoch": 0.17650811007836706,
"grad_norm": 2.6864495277404785,
"learning_rate": 4.625387687174539e-05,
"loss": 3.1346,
"step": 1937
},
{
"epoch": 0.17659923455440132,
"grad_norm": 1.8427038192749023,
"learning_rate": 4.62501076614813e-05,
"loss": 2.7725,
"step": 1938
},
{
"epoch": 0.17669035903043556,
"grad_norm": 3.0085763931274414,
"learning_rate": 4.624633670968718e-05,
"loss": 4.6207,
"step": 1939
},
{
"epoch": 0.17678148350646983,
"grad_norm": 2.097071647644043,
"learning_rate": 4.6242564016672094e-05,
"loss": 3.2445,
"step": 1940
},
{
"epoch": 0.1768726079825041,
"grad_norm": 2.8107852935791016,
"learning_rate": 4.6238789582745215e-05,
"loss": 3.3332,
"step": 1941
},
{
"epoch": 0.17696373245853836,
"grad_norm": 1.6054763793945312,
"learning_rate": 4.623501340821586e-05,
"loss": 3.0692,
"step": 1942
},
{
"epoch": 0.17705485693457262,
"grad_norm": 2.5180320739746094,
"learning_rate": 4.6231235493393535e-05,
"loss": 3.1702,
"step": 1943
},
{
"epoch": 0.1771459814106069,
"grad_norm": 2.668308734893799,
"learning_rate": 4.6227455838587827e-05,
"loss": 4.3692,
"step": 1944
},
{
"epoch": 0.17723710588664116,
"grad_norm": 3.261211633682251,
"learning_rate": 4.6223674444108514e-05,
"loss": 3.1087,
"step": 1945
},
{
"epoch": 0.17732823036267542,
"grad_norm": 2.2094995975494385,
"learning_rate": 4.621989131026548e-05,
"loss": 3.2398,
"step": 1946
},
{
"epoch": 0.1774193548387097,
"grad_norm": 2.806913375854492,
"learning_rate": 4.621610643736878e-05,
"loss": 3.2686,
"step": 1947
},
{
"epoch": 0.17751047931474395,
"grad_norm": 3.777822494506836,
"learning_rate": 4.621231982572858e-05,
"loss": 2.7723,
"step": 1948
},
{
"epoch": 0.1776016037907782,
"grad_norm": 2.3020384311676025,
"learning_rate": 4.6208531475655236e-05,
"loss": 2.8918,
"step": 1949
},
{
"epoch": 0.17769272826681246,
"grad_norm": 3.0134787559509277,
"learning_rate": 4.6204741387459196e-05,
"loss": 3.2594,
"step": 1950
},
{
"epoch": 0.17778385274284672,
"grad_norm": 3.5491254329681396,
"learning_rate": 4.620094956145108e-05,
"loss": 3.1717,
"step": 1951
},
{
"epoch": 0.177874977218881,
"grad_norm": 1.3344693183898926,
"learning_rate": 4.619715599794164e-05,
"loss": 3.0054,
"step": 1952
},
{
"epoch": 0.17796610169491525,
"grad_norm": 1.2647558450698853,
"learning_rate": 4.619336069724178e-05,
"loss": 2.9496,
"step": 1953
},
{
"epoch": 0.17805722617094952,
"grad_norm": 1.373024821281433,
"learning_rate": 4.6189563659662525e-05,
"loss": 2.9396,
"step": 1954
},
{
"epoch": 0.17814835064698378,
"grad_norm": 1.3420569896697998,
"learning_rate": 4.618576488551508e-05,
"loss": 2.9063,
"step": 1955
},
{
"epoch": 0.17823947512301805,
"grad_norm": 1.575454592704773,
"learning_rate": 4.618196437511075e-05,
"loss": 3.0349,
"step": 1956
},
{
"epoch": 0.17833059959905231,
"grad_norm": 2.5236902236938477,
"learning_rate": 4.617816212876102e-05,
"loss": 2.694,
"step": 1957
},
{
"epoch": 0.17842172407508658,
"grad_norm": 2.2877037525177,
"learning_rate": 4.617435814677748e-05,
"loss": 3.2323,
"step": 1958
},
{
"epoch": 0.17851284855112082,
"grad_norm": 2.5378546714782715,
"learning_rate": 4.6170552429471905e-05,
"loss": 2.8907,
"step": 1959
},
{
"epoch": 0.17860397302715508,
"grad_norm": 2.347435712814331,
"learning_rate": 4.6166744977156154e-05,
"loss": 3.1412,
"step": 1960
},
{
"epoch": 0.17869509750318935,
"grad_norm": 1.9487541913986206,
"learning_rate": 4.616293579014229e-05,
"loss": 2.858,
"step": 1961
},
{
"epoch": 0.17878622197922361,
"grad_norm": 1.4565964937210083,
"learning_rate": 4.6159124868742485e-05,
"loss": 2.9924,
"step": 1962
},
{
"epoch": 0.17887734645525788,
"grad_norm": 1.4799039363861084,
"learning_rate": 4.6155312213269053e-05,
"loss": 2.7628,
"step": 1963
},
{
"epoch": 0.17896847093129215,
"grad_norm": 3.3805723190307617,
"learning_rate": 4.615149782403446e-05,
"loss": 3.2429,
"step": 1964
},
{
"epoch": 0.1790595954073264,
"grad_norm": 2.6682169437408447,
"learning_rate": 4.614768170135132e-05,
"loss": 2.9839,
"step": 1965
},
{
"epoch": 0.17915071988336068,
"grad_norm": 2.525508403778076,
"learning_rate": 4.614386384553235e-05,
"loss": 3.1916,
"step": 1966
},
{
"epoch": 0.17924184435939494,
"grad_norm": 2.9044320583343506,
"learning_rate": 4.614004425689048e-05,
"loss": 3.3356,
"step": 1967
},
{
"epoch": 0.1793329688354292,
"grad_norm": 3.02644419670105,
"learning_rate": 4.6136222935738704e-05,
"loss": 2.914,
"step": 1968
},
{
"epoch": 0.17942409331146347,
"grad_norm": 2.337035655975342,
"learning_rate": 4.6132399882390206e-05,
"loss": 2.9974,
"step": 1969
},
{
"epoch": 0.1795152177874977,
"grad_norm": 2.178936243057251,
"learning_rate": 4.6128575097158314e-05,
"loss": 3.1322,
"step": 1970
},
{
"epoch": 0.17960634226353198,
"grad_norm": 1.6006604433059692,
"learning_rate": 4.612474858035647e-05,
"loss": 3.0153,
"step": 1971
},
{
"epoch": 0.17969746673956624,
"grad_norm": 2.1648287773132324,
"learning_rate": 4.612092033229828e-05,
"loss": 3.2334,
"step": 1972
},
{
"epoch": 0.1797885912156005,
"grad_norm": 2.6615753173828125,
"learning_rate": 4.611709035329747e-05,
"loss": 2.9051,
"step": 1973
},
{
"epoch": 0.17987971569163477,
"grad_norm": 1.9354596138000488,
"learning_rate": 4.6113258643667936e-05,
"loss": 3.108,
"step": 1974
},
{
"epoch": 0.17997084016766904,
"grad_norm": 1.7395933866500854,
"learning_rate": 4.610942520372369e-05,
"loss": 3.0382,
"step": 1975
},
{
"epoch": 0.1800619646437033,
"grad_norm": 2.480259895324707,
"learning_rate": 4.610559003377891e-05,
"loss": 2.9684,
"step": 1976
},
{
"epoch": 0.18015308911973757,
"grad_norm": 1.3295300006866455,
"learning_rate": 4.61017531341479e-05,
"loss": 3.0404,
"step": 1977
},
{
"epoch": 0.18024421359577183,
"grad_norm": 1.599542498588562,
"learning_rate": 4.60979145051451e-05,
"loss": 2.9742,
"step": 1978
},
{
"epoch": 0.1803353380718061,
"grad_norm": 1.8933284282684326,
"learning_rate": 4.609407414708512e-05,
"loss": 3.0518,
"step": 1979
},
{
"epoch": 0.18042646254784034,
"grad_norm": 1.4877815246582031,
"learning_rate": 4.6090232060282666e-05,
"loss": 2.9679,
"step": 1980
},
{
"epoch": 0.1805175870238746,
"grad_norm": 1.6608158349990845,
"learning_rate": 4.6086388245052636e-05,
"loss": 3.0059,
"step": 1981
},
{
"epoch": 0.18060871149990887,
"grad_norm": 3.126404047012329,
"learning_rate": 4.608254270171003e-05,
"loss": 3.1677,
"step": 1982
},
{
"epoch": 0.18069983597594313,
"grad_norm": 2.0683717727661133,
"learning_rate": 4.6078695430570004e-05,
"loss": 3.2245,
"step": 1983
},
{
"epoch": 0.1807909604519774,
"grad_norm": 2.5405118465423584,
"learning_rate": 4.607484643194788e-05,
"loss": 2.286,
"step": 1984
},
{
"epoch": 0.18088208492801167,
"grad_norm": 2.7362663745880127,
"learning_rate": 4.6070995706159075e-05,
"loss": 3.0712,
"step": 1985
},
{
"epoch": 0.18097320940404593,
"grad_norm": 2.33858585357666,
"learning_rate": 4.606714325351918e-05,
"loss": 3.1092,
"step": 1986
},
{
"epoch": 0.1810643338800802,
"grad_norm": 2.5455336570739746,
"learning_rate": 4.606328907434392e-05,
"loss": 3.2204,
"step": 1987
},
{
"epoch": 0.18115545835611446,
"grad_norm": 2.442511558532715,
"learning_rate": 4.605943316894915e-05,
"loss": 3.2713,
"step": 1988
},
{
"epoch": 0.18124658283214873,
"grad_norm": 1.7061405181884766,
"learning_rate": 4.605557553765089e-05,
"loss": 2.9598,
"step": 1989
},
{
"epoch": 0.18133770730818297,
"grad_norm": 2.4000542163848877,
"learning_rate": 4.605171618076528e-05,
"loss": 3.0296,
"step": 1990
},
{
"epoch": 0.18142883178421723,
"grad_norm": 2.3328189849853516,
"learning_rate": 4.6047855098608615e-05,
"loss": 2.9324,
"step": 1991
},
{
"epoch": 0.1815199562602515,
"grad_norm": 2.2985846996307373,
"learning_rate": 4.604399229149733e-05,
"loss": 3.152,
"step": 1992
},
{
"epoch": 0.18161108073628576,
"grad_norm": 1.8579432964324951,
"learning_rate": 4.604012775974798e-05,
"loss": 3.0462,
"step": 1993
},
{
"epoch": 0.18170220521232003,
"grad_norm": 1.2653917074203491,
"learning_rate": 4.6036261503677285e-05,
"loss": 2.9474,
"step": 1994
},
{
"epoch": 0.1817933296883543,
"grad_norm": 1.8129152059555054,
"learning_rate": 4.6032393523602114e-05,
"loss": 3.0247,
"step": 1995
},
{
"epoch": 0.18188445416438856,
"grad_norm": 2.5666768550872803,
"learning_rate": 4.602852381983945e-05,
"loss": 3.043,
"step": 1996
},
{
"epoch": 0.18197557864042282,
"grad_norm": 2.3878471851348877,
"learning_rate": 4.602465239270643e-05,
"loss": 3.5325,
"step": 1997
},
{
"epoch": 0.1820667031164571,
"grad_norm": 2.6700685024261475,
"learning_rate": 4.602077924252034e-05,
"loss": 2.6536,
"step": 1998
},
{
"epoch": 0.18215782759249136,
"grad_norm": 2.682384967803955,
"learning_rate": 4.601690436959859e-05,
"loss": 3.1602,
"step": 1999
},
{
"epoch": 0.1822489520685256,
"grad_norm": 2.54840087890625,
"learning_rate": 4.601302777425875e-05,
"loss": 3.2196,
"step": 2000
}
],
"logging_steps": 1,
"max_steps": 10974,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.76919004465111e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}