ElapticAI-1a / trainer_state.json
elapt1c
epoche 4
4f4e77b verified
raw
history blame
157 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 4523,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001105460977227504,
"grad_norm": 2.856322765350342,
"learning_rate": 5.000000000000001e-07,
"loss": 4.5165,
"step": 5
},
{
"epoch": 0.002210921954455008,
"grad_norm": 2.6451292037963867,
"learning_rate": 1.0000000000000002e-06,
"loss": 4.3082,
"step": 10
},
{
"epoch": 0.0033163829316825116,
"grad_norm": 3.0033833980560303,
"learning_rate": 1.5e-06,
"loss": 4.3804,
"step": 15
},
{
"epoch": 0.004421843908910016,
"grad_norm": 2.5815796852111816,
"learning_rate": 2.0000000000000003e-06,
"loss": 4.3296,
"step": 20
},
{
"epoch": 0.0055273048861375195,
"grad_norm": 2.6665921211242676,
"learning_rate": 2.5e-06,
"loss": 4.274,
"step": 25
},
{
"epoch": 0.006632765863365023,
"grad_norm": 2.6247975826263428,
"learning_rate": 3e-06,
"loss": 4.3979,
"step": 30
},
{
"epoch": 0.007738226840592527,
"grad_norm": 2.822925329208374,
"learning_rate": 3.5000000000000004e-06,
"loss": 4.3966,
"step": 35
},
{
"epoch": 0.008843687817820032,
"grad_norm": 2.9031052589416504,
"learning_rate": 4.000000000000001e-06,
"loss": 4.2606,
"step": 40
},
{
"epoch": 0.009949148795047534,
"grad_norm": 2.43031907081604,
"learning_rate": 4.5e-06,
"loss": 4.3432,
"step": 45
},
{
"epoch": 0.011054609772275039,
"grad_norm": 2.6758840084075928,
"learning_rate": 5e-06,
"loss": 4.3543,
"step": 50
},
{
"epoch": 0.012160070749502542,
"grad_norm": 2.5539205074310303,
"learning_rate": 5.500000000000001e-06,
"loss": 4.2527,
"step": 55
},
{
"epoch": 0.013265531726730046,
"grad_norm": 2.552877187728882,
"learning_rate": 6e-06,
"loss": 4.2603,
"step": 60
},
{
"epoch": 0.014370992703957551,
"grad_norm": 2.7067909240722656,
"learning_rate": 6.5000000000000004e-06,
"loss": 4.3525,
"step": 65
},
{
"epoch": 0.015476453681185054,
"grad_norm": 2.633598566055298,
"learning_rate": 7.000000000000001e-06,
"loss": 4.2775,
"step": 70
},
{
"epoch": 0.016581914658412557,
"grad_norm": 2.582083225250244,
"learning_rate": 7.5e-06,
"loss": 4.4059,
"step": 75
},
{
"epoch": 0.017687375635640063,
"grad_norm": 2.7137420177459717,
"learning_rate": 8.000000000000001e-06,
"loss": 4.2763,
"step": 80
},
{
"epoch": 0.018792836612867566,
"grad_norm": 2.743177652359009,
"learning_rate": 8.500000000000002e-06,
"loss": 4.3027,
"step": 85
},
{
"epoch": 0.01989829759009507,
"grad_norm": 2.9156761169433594,
"learning_rate": 9e-06,
"loss": 4.3686,
"step": 90
},
{
"epoch": 0.021003758567322575,
"grad_norm": 2.936218738555908,
"learning_rate": 9.5e-06,
"loss": 4.3308,
"step": 95
},
{
"epoch": 0.022109219544550078,
"grad_norm": 2.5734968185424805,
"learning_rate": 1e-05,
"loss": 4.144,
"step": 100
},
{
"epoch": 0.02321468052177758,
"grad_norm": 3.0580193996429443,
"learning_rate": 1.05e-05,
"loss": 4.2334,
"step": 105
},
{
"epoch": 0.024320141499005084,
"grad_norm": 2.8130428791046143,
"learning_rate": 1.1000000000000001e-05,
"loss": 4.0793,
"step": 110
},
{
"epoch": 0.02542560247623259,
"grad_norm": 2.9316952228546143,
"learning_rate": 1.1500000000000002e-05,
"loss": 4.2116,
"step": 115
},
{
"epoch": 0.026531063453460093,
"grad_norm": 2.8418164253234863,
"learning_rate": 1.2e-05,
"loss": 4.1086,
"step": 120
},
{
"epoch": 0.027636524430687596,
"grad_norm": 2.6649138927459717,
"learning_rate": 1.25e-05,
"loss": 4.1225,
"step": 125
},
{
"epoch": 0.028741985407915102,
"grad_norm": 2.6316049098968506,
"learning_rate": 1.3000000000000001e-05,
"loss": 4.3653,
"step": 130
},
{
"epoch": 0.029847446385142605,
"grad_norm": 2.9526383876800537,
"learning_rate": 1.3500000000000001e-05,
"loss": 4.2687,
"step": 135
},
{
"epoch": 0.030952907362370108,
"grad_norm": 2.761291980743408,
"learning_rate": 1.4000000000000001e-05,
"loss": 4.3326,
"step": 140
},
{
"epoch": 0.03205836833959761,
"grad_norm": 2.6319758892059326,
"learning_rate": 1.45e-05,
"loss": 4.3805,
"step": 145
},
{
"epoch": 0.033163829316825114,
"grad_norm": 2.6983299255371094,
"learning_rate": 1.5e-05,
"loss": 4.2701,
"step": 150
},
{
"epoch": 0.03426929029405262,
"grad_norm": 2.749418020248413,
"learning_rate": 1.55e-05,
"loss": 4.3861,
"step": 155
},
{
"epoch": 0.035374751271280126,
"grad_norm": 2.784226179122925,
"learning_rate": 1.6000000000000003e-05,
"loss": 4.1366,
"step": 160
},
{
"epoch": 0.03648021224850763,
"grad_norm": 2.6632113456726074,
"learning_rate": 1.65e-05,
"loss": 4.2244,
"step": 165
},
{
"epoch": 0.03758567322573513,
"grad_norm": 2.804885149002075,
"learning_rate": 1.7000000000000003e-05,
"loss": 4.1976,
"step": 170
},
{
"epoch": 0.038691134202962635,
"grad_norm": 2.806664228439331,
"learning_rate": 1.75e-05,
"loss": 4.0235,
"step": 175
},
{
"epoch": 0.03979659518019014,
"grad_norm": 2.6123688220977783,
"learning_rate": 1.8e-05,
"loss": 4.1966,
"step": 180
},
{
"epoch": 0.04090205615741764,
"grad_norm": 2.80129075050354,
"learning_rate": 1.85e-05,
"loss": 4.1403,
"step": 185
},
{
"epoch": 0.04200751713464515,
"grad_norm": 2.7253201007843018,
"learning_rate": 1.9e-05,
"loss": 4.1317,
"step": 190
},
{
"epoch": 0.04311297811187265,
"grad_norm": 2.852238178253174,
"learning_rate": 1.9500000000000003e-05,
"loss": 4.243,
"step": 195
},
{
"epoch": 0.044218439089100156,
"grad_norm": 2.968660831451416,
"learning_rate": 2e-05,
"loss": 4.1083,
"step": 200
},
{
"epoch": 0.04532390006632766,
"grad_norm": 2.7089550495147705,
"learning_rate": 2.05e-05,
"loss": 4.3324,
"step": 205
},
{
"epoch": 0.04642936104355516,
"grad_norm": 2.6991310119628906,
"learning_rate": 2.1e-05,
"loss": 4.3528,
"step": 210
},
{
"epoch": 0.047534822020782665,
"grad_norm": 2.5547115802764893,
"learning_rate": 2.15e-05,
"loss": 4.0371,
"step": 215
},
{
"epoch": 0.04864028299801017,
"grad_norm": 2.708559989929199,
"learning_rate": 2.2000000000000003e-05,
"loss": 4.1409,
"step": 220
},
{
"epoch": 0.04974574397523768,
"grad_norm": 2.709721565246582,
"learning_rate": 2.25e-05,
"loss": 4.0498,
"step": 225
},
{
"epoch": 0.05085120495246518,
"grad_norm": 2.7421419620513916,
"learning_rate": 2.3000000000000003e-05,
"loss": 4.1863,
"step": 230
},
{
"epoch": 0.05195666592969268,
"grad_norm": 2.776456832885742,
"learning_rate": 2.35e-05,
"loss": 4.1545,
"step": 235
},
{
"epoch": 0.053062126906920186,
"grad_norm": 2.8448917865753174,
"learning_rate": 2.4e-05,
"loss": 4.0625,
"step": 240
},
{
"epoch": 0.05416758788414769,
"grad_norm": 2.933760404586792,
"learning_rate": 2.45e-05,
"loss": 4.1578,
"step": 245
},
{
"epoch": 0.05527304886137519,
"grad_norm": 3.026527166366577,
"learning_rate": 2.5e-05,
"loss": 4.1339,
"step": 250
},
{
"epoch": 0.056378509838602694,
"grad_norm": 2.5931596755981445,
"learning_rate": 2.5500000000000003e-05,
"loss": 4.0458,
"step": 255
},
{
"epoch": 0.057483970815830204,
"grad_norm": 2.9681997299194336,
"learning_rate": 2.6000000000000002e-05,
"loss": 4.2949,
"step": 260
},
{
"epoch": 0.05858943179305771,
"grad_norm": 2.822819232940674,
"learning_rate": 2.6500000000000004e-05,
"loss": 4.1542,
"step": 265
},
{
"epoch": 0.05969489277028521,
"grad_norm": 2.794525623321533,
"learning_rate": 2.7000000000000002e-05,
"loss": 4.1644,
"step": 270
},
{
"epoch": 0.06080035374751271,
"grad_norm": 2.6282451152801514,
"learning_rate": 2.7500000000000004e-05,
"loss": 4.0558,
"step": 275
},
{
"epoch": 0.061905814724740216,
"grad_norm": 2.87127947807312,
"learning_rate": 2.8000000000000003e-05,
"loss": 4.2151,
"step": 280
},
{
"epoch": 0.06301127570196773,
"grad_norm": 2.7771425247192383,
"learning_rate": 2.8499999999999998e-05,
"loss": 4.1902,
"step": 285
},
{
"epoch": 0.06411673667919522,
"grad_norm": 2.7243714332580566,
"learning_rate": 2.9e-05,
"loss": 4.1633,
"step": 290
},
{
"epoch": 0.06522219765642273,
"grad_norm": 2.733858108520508,
"learning_rate": 2.95e-05,
"loss": 4.2343,
"step": 295
},
{
"epoch": 0.06632765863365023,
"grad_norm": 3.054060935974121,
"learning_rate": 3e-05,
"loss": 4.0605,
"step": 300
},
{
"epoch": 0.06743311961087774,
"grad_norm": 2.681039333343506,
"learning_rate": 3.05e-05,
"loss": 4.0797,
"step": 305
},
{
"epoch": 0.06853858058810525,
"grad_norm": 2.594285011291504,
"learning_rate": 3.1e-05,
"loss": 4.0443,
"step": 310
},
{
"epoch": 0.06964404156533274,
"grad_norm": 2.9265353679656982,
"learning_rate": 3.15e-05,
"loss": 4.2976,
"step": 315
},
{
"epoch": 0.07074950254256025,
"grad_norm": 3.3384079933166504,
"learning_rate": 3.2000000000000005e-05,
"loss": 4.2831,
"step": 320
},
{
"epoch": 0.07185496351978775,
"grad_norm": 2.9113404750823975,
"learning_rate": 3.2500000000000004e-05,
"loss": 4.0696,
"step": 325
},
{
"epoch": 0.07296042449701526,
"grad_norm": 2.746483087539673,
"learning_rate": 3.3e-05,
"loss": 3.9145,
"step": 330
},
{
"epoch": 0.07406588547424275,
"grad_norm": 2.892920970916748,
"learning_rate": 3.35e-05,
"loss": 4.0664,
"step": 335
},
{
"epoch": 0.07517134645147026,
"grad_norm": 3.030963659286499,
"learning_rate": 3.4000000000000007e-05,
"loss": 4.14,
"step": 340
},
{
"epoch": 0.07627680742869777,
"grad_norm": 3.1139981746673584,
"learning_rate": 3.45e-05,
"loss": 4.0361,
"step": 345
},
{
"epoch": 0.07738226840592527,
"grad_norm": 2.646188497543335,
"learning_rate": 3.5e-05,
"loss": 4.0998,
"step": 350
},
{
"epoch": 0.07848772938315278,
"grad_norm": 2.9719629287719727,
"learning_rate": 3.55e-05,
"loss": 3.9815,
"step": 355
},
{
"epoch": 0.07959319036038028,
"grad_norm": 2.6908960342407227,
"learning_rate": 3.6e-05,
"loss": 4.016,
"step": 360
},
{
"epoch": 0.08069865133760779,
"grad_norm": 3.2028872966766357,
"learning_rate": 3.65e-05,
"loss": 4.0359,
"step": 365
},
{
"epoch": 0.08180411231483528,
"grad_norm": 2.9519758224487305,
"learning_rate": 3.7e-05,
"loss": 4.0871,
"step": 370
},
{
"epoch": 0.08290957329206279,
"grad_norm": 2.844874143600464,
"learning_rate": 3.7500000000000003e-05,
"loss": 4.1217,
"step": 375
},
{
"epoch": 0.0840150342692903,
"grad_norm": 2.73949933052063,
"learning_rate": 3.8e-05,
"loss": 4.0189,
"step": 380
},
{
"epoch": 0.0851204952465178,
"grad_norm": 2.928393840789795,
"learning_rate": 3.85e-05,
"loss": 3.9084,
"step": 385
},
{
"epoch": 0.0862259562237453,
"grad_norm": 2.726449966430664,
"learning_rate": 3.9000000000000006e-05,
"loss": 4.2775,
"step": 390
},
{
"epoch": 0.0873314172009728,
"grad_norm": 2.5583412647247314,
"learning_rate": 3.9500000000000005e-05,
"loss": 4.0654,
"step": 395
},
{
"epoch": 0.08843687817820031,
"grad_norm": 2.8123371601104736,
"learning_rate": 4e-05,
"loss": 4.0601,
"step": 400
},
{
"epoch": 0.08954233915542781,
"grad_norm": 3.2048697471618652,
"learning_rate": 4.05e-05,
"loss": 4.1568,
"step": 405
},
{
"epoch": 0.09064780013265532,
"grad_norm": 2.8617966175079346,
"learning_rate": 4.1e-05,
"loss": 4.209,
"step": 410
},
{
"epoch": 0.09175326110988283,
"grad_norm": 3.07211971282959,
"learning_rate": 4.15e-05,
"loss": 3.9661,
"step": 415
},
{
"epoch": 0.09285872208711032,
"grad_norm": 2.84535813331604,
"learning_rate": 4.2e-05,
"loss": 4.0976,
"step": 420
},
{
"epoch": 0.09396418306433783,
"grad_norm": 2.6337199211120605,
"learning_rate": 4.25e-05,
"loss": 3.9352,
"step": 425
},
{
"epoch": 0.09506964404156533,
"grad_norm": 3.1465373039245605,
"learning_rate": 4.3e-05,
"loss": 4.0732,
"step": 430
},
{
"epoch": 0.09617510501879284,
"grad_norm": 2.9059720039367676,
"learning_rate": 4.35e-05,
"loss": 4.0377,
"step": 435
},
{
"epoch": 0.09728056599602034,
"grad_norm": 3.147087812423706,
"learning_rate": 4.4000000000000006e-05,
"loss": 4.0003,
"step": 440
},
{
"epoch": 0.09838602697324784,
"grad_norm": 2.878849983215332,
"learning_rate": 4.4500000000000004e-05,
"loss": 4.1393,
"step": 445
},
{
"epoch": 0.09949148795047535,
"grad_norm": 2.9624218940734863,
"learning_rate": 4.5e-05,
"loss": 3.9989,
"step": 450
},
{
"epoch": 0.10059694892770285,
"grad_norm": 3.047313690185547,
"learning_rate": 4.55e-05,
"loss": 3.9942,
"step": 455
},
{
"epoch": 0.10170240990493036,
"grad_norm": 3.069126605987549,
"learning_rate": 4.600000000000001e-05,
"loss": 4.0184,
"step": 460
},
{
"epoch": 0.10280787088215786,
"grad_norm": 3.046513319015503,
"learning_rate": 4.6500000000000005e-05,
"loss": 4.0441,
"step": 465
},
{
"epoch": 0.10391333185938537,
"grad_norm": 2.829324722290039,
"learning_rate": 4.7e-05,
"loss": 3.9685,
"step": 470
},
{
"epoch": 0.10501879283661286,
"grad_norm": 3.0912318229675293,
"learning_rate": 4.75e-05,
"loss": 3.9195,
"step": 475
},
{
"epoch": 0.10612425381384037,
"grad_norm": 3.3232522010803223,
"learning_rate": 4.8e-05,
"loss": 4.092,
"step": 480
},
{
"epoch": 0.10722971479106788,
"grad_norm": 3.12263822555542,
"learning_rate": 4.85e-05,
"loss": 4.1614,
"step": 485
},
{
"epoch": 0.10833517576829538,
"grad_norm": 3.245594024658203,
"learning_rate": 4.9e-05,
"loss": 3.9858,
"step": 490
},
{
"epoch": 0.10944063674552289,
"grad_norm": 3.0725033283233643,
"learning_rate": 4.9500000000000004e-05,
"loss": 4.0024,
"step": 495
},
{
"epoch": 0.11054609772275038,
"grad_norm": 3.056286334991455,
"learning_rate": 5e-05,
"loss": 3.9464,
"step": 500
},
{
"epoch": 0.1116515586999779,
"grad_norm": 3.068084478378296,
"learning_rate": 4.993785732040766e-05,
"loss": 4.1743,
"step": 505
},
{
"epoch": 0.11275701967720539,
"grad_norm": 3.189666509628296,
"learning_rate": 4.9875714640815315e-05,
"loss": 3.9721,
"step": 510
},
{
"epoch": 0.1138624806544329,
"grad_norm": 3.0129644870758057,
"learning_rate": 4.981357196122297e-05,
"loss": 4.0626,
"step": 515
},
{
"epoch": 0.11496794163166041,
"grad_norm": 2.962771415710449,
"learning_rate": 4.975142928163063e-05,
"loss": 4.0752,
"step": 520
},
{
"epoch": 0.1160734026088879,
"grad_norm": 3.028667688369751,
"learning_rate": 4.968928660203828e-05,
"loss": 4.0011,
"step": 525
},
{
"epoch": 0.11717886358611541,
"grad_norm": 2.98563551902771,
"learning_rate": 4.962714392244594e-05,
"loss": 4.0049,
"step": 530
},
{
"epoch": 0.11828432456334291,
"grad_norm": 3.0009968280792236,
"learning_rate": 4.9565001242853596e-05,
"loss": 3.8586,
"step": 535
},
{
"epoch": 0.11938978554057042,
"grad_norm": 3.038587808609009,
"learning_rate": 4.950285856326125e-05,
"loss": 4.2185,
"step": 540
},
{
"epoch": 0.12049524651779792,
"grad_norm": 2.9189321994781494,
"learning_rate": 4.944071588366891e-05,
"loss": 4.0958,
"step": 545
},
{
"epoch": 0.12160070749502543,
"grad_norm": 2.9720592498779297,
"learning_rate": 4.9378573204076564e-05,
"loss": 4.0651,
"step": 550
},
{
"epoch": 0.12270616847225294,
"grad_norm": 3.094660520553589,
"learning_rate": 4.931643052448422e-05,
"loss": 4.0115,
"step": 555
},
{
"epoch": 0.12381162944948043,
"grad_norm": 3.197223663330078,
"learning_rate": 4.925428784489187e-05,
"loss": 3.9921,
"step": 560
},
{
"epoch": 0.12491709042670794,
"grad_norm": 3.033642053604126,
"learning_rate": 4.919214516529953e-05,
"loss": 3.9471,
"step": 565
},
{
"epoch": 0.12602255140393545,
"grad_norm": 3.1595492362976074,
"learning_rate": 4.913000248570719e-05,
"loss": 3.9529,
"step": 570
},
{
"epoch": 0.12712801238116295,
"grad_norm": 2.948946714401245,
"learning_rate": 4.906785980611484e-05,
"loss": 4.0634,
"step": 575
},
{
"epoch": 0.12823347335839044,
"grad_norm": 3.366753339767456,
"learning_rate": 4.90057171265225e-05,
"loss": 3.8098,
"step": 580
},
{
"epoch": 0.12933893433561794,
"grad_norm": 3.2447152137756348,
"learning_rate": 4.894357444693015e-05,
"loss": 3.9081,
"step": 585
},
{
"epoch": 0.13044439531284546,
"grad_norm": 3.0394585132598877,
"learning_rate": 4.888143176733781e-05,
"loss": 3.8164,
"step": 590
},
{
"epoch": 0.13154985629007296,
"grad_norm": 2.983616828918457,
"learning_rate": 4.881928908774547e-05,
"loss": 3.9558,
"step": 595
},
{
"epoch": 0.13265531726730045,
"grad_norm": 3.1075408458709717,
"learning_rate": 4.875714640815312e-05,
"loss": 3.9563,
"step": 600
},
{
"epoch": 0.13376077824452798,
"grad_norm": 3.068930149078369,
"learning_rate": 4.8695003728560775e-05,
"loss": 3.9616,
"step": 605
},
{
"epoch": 0.13486623922175547,
"grad_norm": 3.2127275466918945,
"learning_rate": 4.863286104896843e-05,
"loss": 4.0268,
"step": 610
},
{
"epoch": 0.13597170019898297,
"grad_norm": 3.0738019943237305,
"learning_rate": 4.857071836937609e-05,
"loss": 4.0659,
"step": 615
},
{
"epoch": 0.1370771611762105,
"grad_norm": 3.2203280925750732,
"learning_rate": 4.850857568978375e-05,
"loss": 4.052,
"step": 620
},
{
"epoch": 0.138182622153438,
"grad_norm": 3.0868825912475586,
"learning_rate": 4.84464330101914e-05,
"loss": 4.1941,
"step": 625
},
{
"epoch": 0.13928808313066549,
"grad_norm": 2.9370384216308594,
"learning_rate": 4.8384290330599056e-05,
"loss": 3.9984,
"step": 630
},
{
"epoch": 0.14039354410789298,
"grad_norm": 3.230595111846924,
"learning_rate": 4.832214765100672e-05,
"loss": 4.0905,
"step": 635
},
{
"epoch": 0.1414990050851205,
"grad_norm": 3.1805593967437744,
"learning_rate": 4.826000497141437e-05,
"loss": 4.0938,
"step": 640
},
{
"epoch": 0.142604466062348,
"grad_norm": 2.952800989151001,
"learning_rate": 4.8197862291822025e-05,
"loss": 3.9694,
"step": 645
},
{
"epoch": 0.1437099270395755,
"grad_norm": 2.96767520904541,
"learning_rate": 4.813571961222968e-05,
"loss": 4.089,
"step": 650
},
{
"epoch": 0.14481538801680302,
"grad_norm": 3.2061245441436768,
"learning_rate": 4.807357693263734e-05,
"loss": 3.925,
"step": 655
},
{
"epoch": 0.14592084899403052,
"grad_norm": 3.4966869354248047,
"learning_rate": 4.801143425304499e-05,
"loss": 4.0303,
"step": 660
},
{
"epoch": 0.147026309971258,
"grad_norm": 3.0343263149261475,
"learning_rate": 4.794929157345265e-05,
"loss": 4.1302,
"step": 665
},
{
"epoch": 0.1481317709484855,
"grad_norm": 3.1001501083374023,
"learning_rate": 4.7887148893860305e-05,
"loss": 4.136,
"step": 670
},
{
"epoch": 0.14923723192571303,
"grad_norm": 3.0706558227539062,
"learning_rate": 4.782500621426796e-05,
"loss": 3.97,
"step": 675
},
{
"epoch": 0.15034269290294053,
"grad_norm": 3.4160215854644775,
"learning_rate": 4.776286353467562e-05,
"loss": 4.1283,
"step": 680
},
{
"epoch": 0.15144815388016802,
"grad_norm": 3.6512129306793213,
"learning_rate": 4.7700720855083274e-05,
"loss": 4.1619,
"step": 685
},
{
"epoch": 0.15255361485739555,
"grad_norm": 2.8638243675231934,
"learning_rate": 4.763857817549093e-05,
"loss": 4.0563,
"step": 690
},
{
"epoch": 0.15365907583462304,
"grad_norm": 2.87731671333313,
"learning_rate": 4.7576435495898586e-05,
"loss": 4.0609,
"step": 695
},
{
"epoch": 0.15476453681185054,
"grad_norm": 3.2787325382232666,
"learning_rate": 4.751429281630624e-05,
"loss": 4.031,
"step": 700
},
{
"epoch": 0.15586999778907804,
"grad_norm": 2.9089596271514893,
"learning_rate": 4.74521501367139e-05,
"loss": 3.8997,
"step": 705
},
{
"epoch": 0.15697545876630556,
"grad_norm": 3.02470326423645,
"learning_rate": 4.7390007457121555e-05,
"loss": 4.1458,
"step": 710
},
{
"epoch": 0.15808091974353305,
"grad_norm": 3.1005873680114746,
"learning_rate": 4.7327864777529204e-05,
"loss": 3.8473,
"step": 715
},
{
"epoch": 0.15918638072076055,
"grad_norm": 3.2032277584075928,
"learning_rate": 4.726572209793687e-05,
"loss": 4.0817,
"step": 720
},
{
"epoch": 0.16029184169798807,
"grad_norm": 3.1510956287384033,
"learning_rate": 4.720357941834452e-05,
"loss": 4.0441,
"step": 725
},
{
"epoch": 0.16139730267521557,
"grad_norm": 3.088815689086914,
"learning_rate": 4.714143673875217e-05,
"loss": 3.8953,
"step": 730
},
{
"epoch": 0.16250276365244307,
"grad_norm": 3.099492073059082,
"learning_rate": 4.7079294059159836e-05,
"loss": 3.8765,
"step": 735
},
{
"epoch": 0.16360822462967056,
"grad_norm": 2.95200252532959,
"learning_rate": 4.7017151379567485e-05,
"loss": 4.0126,
"step": 740
},
{
"epoch": 0.16471368560689809,
"grad_norm": 3.2879955768585205,
"learning_rate": 4.695500869997515e-05,
"loss": 4.0581,
"step": 745
},
{
"epoch": 0.16581914658412558,
"grad_norm": 3.344324827194214,
"learning_rate": 4.6892866020382804e-05,
"loss": 4.056,
"step": 750
},
{
"epoch": 0.16692460756135308,
"grad_norm": 3.3089466094970703,
"learning_rate": 4.6830723340790454e-05,
"loss": 3.9941,
"step": 755
},
{
"epoch": 0.1680300685385806,
"grad_norm": 3.3503427505493164,
"learning_rate": 4.6768580661198117e-05,
"loss": 3.9987,
"step": 760
},
{
"epoch": 0.1691355295158081,
"grad_norm": 3.3430700302124023,
"learning_rate": 4.670643798160577e-05,
"loss": 3.8631,
"step": 765
},
{
"epoch": 0.1702409904930356,
"grad_norm": 3.0984108448028564,
"learning_rate": 4.664429530201342e-05,
"loss": 4.0144,
"step": 770
},
{
"epoch": 0.1713464514702631,
"grad_norm": 3.1141326427459717,
"learning_rate": 4.6582152622421085e-05,
"loss": 3.9256,
"step": 775
},
{
"epoch": 0.1724519124474906,
"grad_norm": 3.1998496055603027,
"learning_rate": 4.6520009942828734e-05,
"loss": 3.9675,
"step": 780
},
{
"epoch": 0.1735573734247181,
"grad_norm": 3.034891128540039,
"learning_rate": 4.645786726323639e-05,
"loss": 4.0099,
"step": 785
},
{
"epoch": 0.1746628344019456,
"grad_norm": 3.2506675720214844,
"learning_rate": 4.6395724583644054e-05,
"loss": 3.9246,
"step": 790
},
{
"epoch": 0.17576829537917313,
"grad_norm": 3.485947608947754,
"learning_rate": 4.63335819040517e-05,
"loss": 3.9919,
"step": 795
},
{
"epoch": 0.17687375635640062,
"grad_norm": 3.2420520782470703,
"learning_rate": 4.627143922445936e-05,
"loss": 4.021,
"step": 800
},
{
"epoch": 0.17797921733362812,
"grad_norm": 2.989863872528076,
"learning_rate": 4.6209296544867015e-05,
"loss": 3.9956,
"step": 805
},
{
"epoch": 0.17908467831085562,
"grad_norm": 2.9505488872528076,
"learning_rate": 4.614715386527467e-05,
"loss": 4.1098,
"step": 810
},
{
"epoch": 0.18019013928808314,
"grad_norm": 3.1943299770355225,
"learning_rate": 4.608501118568233e-05,
"loss": 3.962,
"step": 815
},
{
"epoch": 0.18129560026531064,
"grad_norm": 3.1761474609375,
"learning_rate": 4.6022868506089984e-05,
"loss": 3.8666,
"step": 820
},
{
"epoch": 0.18240106124253813,
"grad_norm": 3.454538345336914,
"learning_rate": 4.596072582649764e-05,
"loss": 4.1169,
"step": 825
},
{
"epoch": 0.18350652221976566,
"grad_norm": 3.3881819248199463,
"learning_rate": 4.5898583146905296e-05,
"loss": 4.0902,
"step": 830
},
{
"epoch": 0.18461198319699315,
"grad_norm": 3.0427277088165283,
"learning_rate": 4.583644046731295e-05,
"loss": 3.9533,
"step": 835
},
{
"epoch": 0.18571744417422065,
"grad_norm": 3.062037944793701,
"learning_rate": 4.577429778772061e-05,
"loss": 4.0955,
"step": 840
},
{
"epoch": 0.18682290515144814,
"grad_norm": 3.1821091175079346,
"learning_rate": 4.5712155108128265e-05,
"loss": 4.1441,
"step": 845
},
{
"epoch": 0.18792836612867567,
"grad_norm": 3.1128711700439453,
"learning_rate": 4.565001242853592e-05,
"loss": 4.1418,
"step": 850
},
{
"epoch": 0.18903382710590316,
"grad_norm": 3.0755162239074707,
"learning_rate": 4.558786974894358e-05,
"loss": 4.0246,
"step": 855
},
{
"epoch": 0.19013928808313066,
"grad_norm": 3.2559144496917725,
"learning_rate": 4.552572706935123e-05,
"loss": 4.0333,
"step": 860
},
{
"epoch": 0.19124474906035818,
"grad_norm": 2.929656744003296,
"learning_rate": 4.546358438975889e-05,
"loss": 4.102,
"step": 865
},
{
"epoch": 0.19235021003758568,
"grad_norm": 3.1212410926818848,
"learning_rate": 4.5401441710166546e-05,
"loss": 3.8648,
"step": 870
},
{
"epoch": 0.19345567101481317,
"grad_norm": 3.0112760066986084,
"learning_rate": 4.53392990305742e-05,
"loss": 4.0,
"step": 875
},
{
"epoch": 0.19456113199204067,
"grad_norm": 3.1704013347625732,
"learning_rate": 4.527715635098186e-05,
"loss": 4.0259,
"step": 880
},
{
"epoch": 0.1956665929692682,
"grad_norm": 2.999876022338867,
"learning_rate": 4.5215013671389514e-05,
"loss": 3.8822,
"step": 885
},
{
"epoch": 0.1967720539464957,
"grad_norm": 3.1141977310180664,
"learning_rate": 4.515287099179717e-05,
"loss": 4.1104,
"step": 890
},
{
"epoch": 0.19787751492372319,
"grad_norm": 3.2327237129211426,
"learning_rate": 4.509072831220482e-05,
"loss": 3.8755,
"step": 895
},
{
"epoch": 0.1989829759009507,
"grad_norm": 3.019273519515991,
"learning_rate": 4.502858563261248e-05,
"loss": 3.9992,
"step": 900
},
{
"epoch": 0.2000884368781782,
"grad_norm": 3.203974962234497,
"learning_rate": 4.496644295302014e-05,
"loss": 3.978,
"step": 905
},
{
"epoch": 0.2011938978554057,
"grad_norm": 3.0810108184814453,
"learning_rate": 4.490430027342779e-05,
"loss": 4.0461,
"step": 910
},
{
"epoch": 0.2022993588326332,
"grad_norm": 3.004460096359253,
"learning_rate": 4.484215759383545e-05,
"loss": 3.9562,
"step": 915
},
{
"epoch": 0.20340481980986072,
"grad_norm": 3.146409034729004,
"learning_rate": 4.478001491424311e-05,
"loss": 4.0321,
"step": 920
},
{
"epoch": 0.20451028078708822,
"grad_norm": 3.180551528930664,
"learning_rate": 4.471787223465076e-05,
"loss": 4.0203,
"step": 925
},
{
"epoch": 0.2056157417643157,
"grad_norm": 3.2521543502807617,
"learning_rate": 4.465572955505842e-05,
"loss": 3.9592,
"step": 930
},
{
"epoch": 0.20672120274154324,
"grad_norm": 3.3072097301483154,
"learning_rate": 4.459358687546607e-05,
"loss": 3.9383,
"step": 935
},
{
"epoch": 0.20782666371877073,
"grad_norm": 3.152592182159424,
"learning_rate": 4.4531444195873725e-05,
"loss": 3.9695,
"step": 940
},
{
"epoch": 0.20893212469599823,
"grad_norm": 3.3956856727600098,
"learning_rate": 4.446930151628139e-05,
"loss": 4.1435,
"step": 945
},
{
"epoch": 0.21003758567322572,
"grad_norm": 3.2591230869293213,
"learning_rate": 4.440715883668904e-05,
"loss": 3.9847,
"step": 950
},
{
"epoch": 0.21114304665045325,
"grad_norm": 3.197763204574585,
"learning_rate": 4.4345016157096694e-05,
"loss": 4.0096,
"step": 955
},
{
"epoch": 0.21224850762768074,
"grad_norm": 3.1687469482421875,
"learning_rate": 4.428287347750435e-05,
"loss": 3.9947,
"step": 960
},
{
"epoch": 0.21335396860490824,
"grad_norm": 3.01877498626709,
"learning_rate": 4.4220730797912006e-05,
"loss": 3.9609,
"step": 965
},
{
"epoch": 0.21445942958213576,
"grad_norm": 3.0294318199157715,
"learning_rate": 4.415858811831967e-05,
"loss": 4.1849,
"step": 970
},
{
"epoch": 0.21556489055936326,
"grad_norm": 3.6619277000427246,
"learning_rate": 4.409644543872732e-05,
"loss": 4.0503,
"step": 975
},
{
"epoch": 0.21667035153659076,
"grad_norm": 3.24751353263855,
"learning_rate": 4.4034302759134975e-05,
"loss": 4.1227,
"step": 980
},
{
"epoch": 0.21777581251381825,
"grad_norm": 3.2298481464385986,
"learning_rate": 4.397216007954264e-05,
"loss": 4.0815,
"step": 985
},
{
"epoch": 0.21888127349104577,
"grad_norm": 3.2555155754089355,
"learning_rate": 4.391001739995029e-05,
"loss": 4.1461,
"step": 990
},
{
"epoch": 0.21998673446827327,
"grad_norm": 3.141761064529419,
"learning_rate": 4.384787472035794e-05,
"loss": 4.021,
"step": 995
},
{
"epoch": 0.22109219544550077,
"grad_norm": 3.0659165382385254,
"learning_rate": 4.37857320407656e-05,
"loss": 3.8781,
"step": 1000
},
{
"epoch": 0.2221976564227283,
"grad_norm": 3.1628031730651855,
"learning_rate": 4.3723589361173255e-05,
"loss": 4.0618,
"step": 1005
},
{
"epoch": 0.2233031173999558,
"grad_norm": 3.143479347229004,
"learning_rate": 4.366144668158091e-05,
"loss": 4.0251,
"step": 1010
},
{
"epoch": 0.22440857837718328,
"grad_norm": 3.302840232849121,
"learning_rate": 4.359930400198857e-05,
"loss": 3.8384,
"step": 1015
},
{
"epoch": 0.22551403935441078,
"grad_norm": 2.7286899089813232,
"learning_rate": 4.3537161322396224e-05,
"loss": 4.0165,
"step": 1020
},
{
"epoch": 0.2266195003316383,
"grad_norm": 3.0600860118865967,
"learning_rate": 4.347501864280388e-05,
"loss": 4.0364,
"step": 1025
},
{
"epoch": 0.2277249613088658,
"grad_norm": 2.9517204761505127,
"learning_rate": 4.3412875963211536e-05,
"loss": 4.0458,
"step": 1030
},
{
"epoch": 0.2288304222860933,
"grad_norm": 3.2530035972595215,
"learning_rate": 4.335073328361919e-05,
"loss": 4.1022,
"step": 1035
},
{
"epoch": 0.22993588326332082,
"grad_norm": 3.277559280395508,
"learning_rate": 4.328859060402685e-05,
"loss": 3.9183,
"step": 1040
},
{
"epoch": 0.2310413442405483,
"grad_norm": 3.286675453186035,
"learning_rate": 4.3226447924434505e-05,
"loss": 4.1264,
"step": 1045
},
{
"epoch": 0.2321468052177758,
"grad_norm": 3.010737180709839,
"learning_rate": 4.3164305244842154e-05,
"loss": 4.0477,
"step": 1050
},
{
"epoch": 0.2332522661950033,
"grad_norm": 3.050497055053711,
"learning_rate": 4.310216256524982e-05,
"loss": 4.0464,
"step": 1055
},
{
"epoch": 0.23435772717223083,
"grad_norm": 3.201765537261963,
"learning_rate": 4.304001988565747e-05,
"loss": 4.0519,
"step": 1060
},
{
"epoch": 0.23546318814945832,
"grad_norm": 3.3649299144744873,
"learning_rate": 4.297787720606512e-05,
"loss": 3.81,
"step": 1065
},
{
"epoch": 0.23656864912668582,
"grad_norm": 3.5535190105438232,
"learning_rate": 4.2915734526472786e-05,
"loss": 4.1328,
"step": 1070
},
{
"epoch": 0.23767411010391334,
"grad_norm": 3.1812844276428223,
"learning_rate": 4.285359184688044e-05,
"loss": 4.052,
"step": 1075
},
{
"epoch": 0.23877957108114084,
"grad_norm": 3.303905725479126,
"learning_rate": 4.279144916728809e-05,
"loss": 3.8988,
"step": 1080
},
{
"epoch": 0.23988503205836834,
"grad_norm": 3.1050772666931152,
"learning_rate": 4.2729306487695754e-05,
"loss": 3.9013,
"step": 1085
},
{
"epoch": 0.24099049303559583,
"grad_norm": 3.2585289478302,
"learning_rate": 4.2667163808103404e-05,
"loss": 4.1435,
"step": 1090
},
{
"epoch": 0.24209595401282336,
"grad_norm": 3.3238561153411865,
"learning_rate": 4.2605021128511067e-05,
"loss": 3.9212,
"step": 1095
},
{
"epoch": 0.24320141499005085,
"grad_norm": 3.151242971420288,
"learning_rate": 4.254287844891872e-05,
"loss": 4.0018,
"step": 1100
},
{
"epoch": 0.24430687596727835,
"grad_norm": 2.9132590293884277,
"learning_rate": 4.248073576932637e-05,
"loss": 3.9902,
"step": 1105
},
{
"epoch": 0.24541233694450587,
"grad_norm": 3.318678140640259,
"learning_rate": 4.2418593089734035e-05,
"loss": 3.9546,
"step": 1110
},
{
"epoch": 0.24651779792173337,
"grad_norm": 3.3934099674224854,
"learning_rate": 4.2356450410141684e-05,
"loss": 3.9115,
"step": 1115
},
{
"epoch": 0.24762325889896086,
"grad_norm": 3.0218331813812256,
"learning_rate": 4.229430773054934e-05,
"loss": 4.0945,
"step": 1120
},
{
"epoch": 0.24872871987618836,
"grad_norm": 3.152254581451416,
"learning_rate": 4.2232165050957004e-05,
"loss": 4.1443,
"step": 1125
},
{
"epoch": 0.24983418085341588,
"grad_norm": 3.2911226749420166,
"learning_rate": 4.217002237136465e-05,
"loss": 4.0634,
"step": 1130
},
{
"epoch": 0.2509396418306434,
"grad_norm": 3.0462334156036377,
"learning_rate": 4.210787969177231e-05,
"loss": 4.0296,
"step": 1135
},
{
"epoch": 0.2520451028078709,
"grad_norm": 3.0708699226379395,
"learning_rate": 4.204573701217997e-05,
"loss": 4.1341,
"step": 1140
},
{
"epoch": 0.25315056378509837,
"grad_norm": 3.381535053253174,
"learning_rate": 4.198359433258762e-05,
"loss": 3.9333,
"step": 1145
},
{
"epoch": 0.2542560247623259,
"grad_norm": 3.021491050720215,
"learning_rate": 4.192145165299528e-05,
"loss": 4.0599,
"step": 1150
},
{
"epoch": 0.2553614857395534,
"grad_norm": 3.339264154434204,
"learning_rate": 4.1859308973402934e-05,
"loss": 4.0867,
"step": 1155
},
{
"epoch": 0.2564669467167809,
"grad_norm": 2.9898245334625244,
"learning_rate": 4.179716629381059e-05,
"loss": 4.0395,
"step": 1160
},
{
"epoch": 0.2575724076940084,
"grad_norm": 3.3147876262664795,
"learning_rate": 4.1735023614218246e-05,
"loss": 3.9406,
"step": 1165
},
{
"epoch": 0.2586778686712359,
"grad_norm": 3.3725435733795166,
"learning_rate": 4.16728809346259e-05,
"loss": 3.9498,
"step": 1170
},
{
"epoch": 0.2597833296484634,
"grad_norm": 3.2875232696533203,
"learning_rate": 4.161073825503356e-05,
"loss": 3.9765,
"step": 1175
},
{
"epoch": 0.2608887906256909,
"grad_norm": 3.117985248565674,
"learning_rate": 4.1548595575441215e-05,
"loss": 4.2161,
"step": 1180
},
{
"epoch": 0.2619942516029184,
"grad_norm": 3.326371669769287,
"learning_rate": 4.148645289584887e-05,
"loss": 3.8891,
"step": 1185
},
{
"epoch": 0.2630997125801459,
"grad_norm": 3.4053702354431152,
"learning_rate": 4.142431021625653e-05,
"loss": 4.1167,
"step": 1190
},
{
"epoch": 0.26420517355737344,
"grad_norm": 2.9902451038360596,
"learning_rate": 4.136216753666418e-05,
"loss": 4.1837,
"step": 1195
},
{
"epoch": 0.2653106345346009,
"grad_norm": 3.04341721534729,
"learning_rate": 4.130002485707184e-05,
"loss": 3.9783,
"step": 1200
},
{
"epoch": 0.26641609551182843,
"grad_norm": 3.1881587505340576,
"learning_rate": 4.123788217747949e-05,
"loss": 4.0327,
"step": 1205
},
{
"epoch": 0.26752155648905596,
"grad_norm": 3.1782286167144775,
"learning_rate": 4.117573949788715e-05,
"loss": 3.9614,
"step": 1210
},
{
"epoch": 0.2686270174662834,
"grad_norm": 3.0777156352996826,
"learning_rate": 4.111359681829481e-05,
"loss": 3.9946,
"step": 1215
},
{
"epoch": 0.26973247844351095,
"grad_norm": 3.0450563430786133,
"learning_rate": 4.1051454138702464e-05,
"loss": 4.0267,
"step": 1220
},
{
"epoch": 0.27083793942073847,
"grad_norm": 3.516542673110962,
"learning_rate": 4.098931145911012e-05,
"loss": 4.0077,
"step": 1225
},
{
"epoch": 0.27194340039796594,
"grad_norm": 3.6443097591400146,
"learning_rate": 4.0927168779517776e-05,
"loss": 3.9799,
"step": 1230
},
{
"epoch": 0.27304886137519346,
"grad_norm": 3.004601240158081,
"learning_rate": 4.086502609992543e-05,
"loss": 3.997,
"step": 1235
},
{
"epoch": 0.274154322352421,
"grad_norm": 2.9626457691192627,
"learning_rate": 4.080288342033309e-05,
"loss": 3.9609,
"step": 1240
},
{
"epoch": 0.27525978332964846,
"grad_norm": 3.267373561859131,
"learning_rate": 4.074074074074074e-05,
"loss": 4.0279,
"step": 1245
},
{
"epoch": 0.276365244306876,
"grad_norm": 3.2012808322906494,
"learning_rate": 4.06785980611484e-05,
"loss": 4.0551,
"step": 1250
},
{
"epoch": 0.27747070528410345,
"grad_norm": 3.1443517208099365,
"learning_rate": 4.061645538155606e-05,
"loss": 3.9241,
"step": 1255
},
{
"epoch": 0.27857616626133097,
"grad_norm": 3.201756238937378,
"learning_rate": 4.055431270196371e-05,
"loss": 4.0168,
"step": 1260
},
{
"epoch": 0.2796816272385585,
"grad_norm": 3.381840229034424,
"learning_rate": 4.049217002237137e-05,
"loss": 4.0506,
"step": 1265
},
{
"epoch": 0.28078708821578596,
"grad_norm": 3.3655803203582764,
"learning_rate": 4.043002734277902e-05,
"loss": 4.0166,
"step": 1270
},
{
"epoch": 0.2818925491930135,
"grad_norm": 3.1821653842926025,
"learning_rate": 4.0367884663186675e-05,
"loss": 4.0161,
"step": 1275
},
{
"epoch": 0.282998010170241,
"grad_norm": 3.2986061573028564,
"learning_rate": 4.030574198359434e-05,
"loss": 3.8855,
"step": 1280
},
{
"epoch": 0.2841034711474685,
"grad_norm": 3.3557889461517334,
"learning_rate": 4.024359930400199e-05,
"loss": 4.0151,
"step": 1285
},
{
"epoch": 0.285208932124696,
"grad_norm": 3.358522891998291,
"learning_rate": 4.0181456624409644e-05,
"loss": 3.9199,
"step": 1290
},
{
"epoch": 0.2863143931019235,
"grad_norm": 3.4547970294952393,
"learning_rate": 4.011931394481731e-05,
"loss": 4.0687,
"step": 1295
},
{
"epoch": 0.287419854079151,
"grad_norm": 3.0661280155181885,
"learning_rate": 4.0057171265224956e-05,
"loss": 4.0077,
"step": 1300
},
{
"epoch": 0.2885253150563785,
"grad_norm": 3.2720112800598145,
"learning_rate": 3.999502858563262e-05,
"loss": 3.8876,
"step": 1305
},
{
"epoch": 0.28963077603360604,
"grad_norm": 3.0981643199920654,
"learning_rate": 3.993288590604027e-05,
"loss": 3.9393,
"step": 1310
},
{
"epoch": 0.2907362370108335,
"grad_norm": 3.2599971294403076,
"learning_rate": 3.9870743226447925e-05,
"loss": 3.8995,
"step": 1315
},
{
"epoch": 0.29184169798806103,
"grad_norm": 3.6165876388549805,
"learning_rate": 3.980860054685559e-05,
"loss": 4.0319,
"step": 1320
},
{
"epoch": 0.2929471589652885,
"grad_norm": 3.432969331741333,
"learning_rate": 3.974645786726324e-05,
"loss": 4.0085,
"step": 1325
},
{
"epoch": 0.294052619942516,
"grad_norm": 3.2116641998291016,
"learning_rate": 3.968431518767089e-05,
"loss": 3.9819,
"step": 1330
},
{
"epoch": 0.29515808091974355,
"grad_norm": 3.476435661315918,
"learning_rate": 3.962217250807855e-05,
"loss": 4.028,
"step": 1335
},
{
"epoch": 0.296263541896971,
"grad_norm": 3.428138017654419,
"learning_rate": 3.9560029828486205e-05,
"loss": 3.9686,
"step": 1340
},
{
"epoch": 0.29736900287419854,
"grad_norm": 3.2953410148620605,
"learning_rate": 3.949788714889386e-05,
"loss": 3.9535,
"step": 1345
},
{
"epoch": 0.29847446385142606,
"grad_norm": 3.800462245941162,
"learning_rate": 3.943574446930152e-05,
"loss": 4.02,
"step": 1350
},
{
"epoch": 0.29957992482865353,
"grad_norm": 3.0902063846588135,
"learning_rate": 3.9373601789709174e-05,
"loss": 4.0621,
"step": 1355
},
{
"epoch": 0.30068538580588106,
"grad_norm": 3.0530946254730225,
"learning_rate": 3.931145911011683e-05,
"loss": 4.0547,
"step": 1360
},
{
"epoch": 0.3017908467831086,
"grad_norm": 3.3780524730682373,
"learning_rate": 3.9249316430524486e-05,
"loss": 3.8966,
"step": 1365
},
{
"epoch": 0.30289630776033605,
"grad_norm": 3.302295207977295,
"learning_rate": 3.918717375093214e-05,
"loss": 4.1423,
"step": 1370
},
{
"epoch": 0.30400176873756357,
"grad_norm": 3.452106237411499,
"learning_rate": 3.91250310713398e-05,
"loss": 3.95,
"step": 1375
},
{
"epoch": 0.3051072297147911,
"grad_norm": 3.3365650177001953,
"learning_rate": 3.9062888391747455e-05,
"loss": 4.0451,
"step": 1380
},
{
"epoch": 0.30621269069201856,
"grad_norm": 3.3903305530548096,
"learning_rate": 3.900074571215511e-05,
"loss": 3.8807,
"step": 1385
},
{
"epoch": 0.3073181516692461,
"grad_norm": 3.6150190830230713,
"learning_rate": 3.893860303256277e-05,
"loss": 4.0183,
"step": 1390
},
{
"epoch": 0.30842361264647356,
"grad_norm": 3.298021078109741,
"learning_rate": 3.887646035297042e-05,
"loss": 4.0159,
"step": 1395
},
{
"epoch": 0.3095290736237011,
"grad_norm": 3.3884518146514893,
"learning_rate": 3.881431767337807e-05,
"loss": 4.0274,
"step": 1400
},
{
"epoch": 0.3106345346009286,
"grad_norm": 3.0882458686828613,
"learning_rate": 3.8752174993785736e-05,
"loss": 4.0236,
"step": 1405
},
{
"epoch": 0.31173999557815607,
"grad_norm": 3.4634859561920166,
"learning_rate": 3.869003231419339e-05,
"loss": 4.106,
"step": 1410
},
{
"epoch": 0.3128454565553836,
"grad_norm": 3.3966925144195557,
"learning_rate": 3.862788963460104e-05,
"loss": 4.1579,
"step": 1415
},
{
"epoch": 0.3139509175326111,
"grad_norm": 3.643110990524292,
"learning_rate": 3.8565746955008704e-05,
"loss": 3.8821,
"step": 1420
},
{
"epoch": 0.3150563785098386,
"grad_norm": 3.37382435798645,
"learning_rate": 3.8503604275416354e-05,
"loss": 4.1456,
"step": 1425
},
{
"epoch": 0.3161618394870661,
"grad_norm": 3.523825168609619,
"learning_rate": 3.8441461595824017e-05,
"loss": 4.0356,
"step": 1430
},
{
"epoch": 0.31726730046429363,
"grad_norm": 3.146383762359619,
"learning_rate": 3.837931891623167e-05,
"loss": 4.1187,
"step": 1435
},
{
"epoch": 0.3183727614415211,
"grad_norm": 3.3049044609069824,
"learning_rate": 3.831717623663932e-05,
"loss": 3.9896,
"step": 1440
},
{
"epoch": 0.3194782224187486,
"grad_norm": 3.3387224674224854,
"learning_rate": 3.8255033557046985e-05,
"loss": 4.0838,
"step": 1445
},
{
"epoch": 0.32058368339597615,
"grad_norm": 3.432584047317505,
"learning_rate": 3.819289087745464e-05,
"loss": 4.2188,
"step": 1450
},
{
"epoch": 0.3216891443732036,
"grad_norm": 3.689253568649292,
"learning_rate": 3.813074819786229e-05,
"loss": 4.0942,
"step": 1455
},
{
"epoch": 0.32279460535043114,
"grad_norm": 3.4148080348968506,
"learning_rate": 3.8068605518269954e-05,
"loss": 4.0352,
"step": 1460
},
{
"epoch": 0.3239000663276586,
"grad_norm": 3.3507676124572754,
"learning_rate": 3.80064628386776e-05,
"loss": 4.0372,
"step": 1465
},
{
"epoch": 0.32500552730488613,
"grad_norm": 3.4236788749694824,
"learning_rate": 3.794432015908526e-05,
"loss": 4.0303,
"step": 1470
},
{
"epoch": 0.32611098828211366,
"grad_norm": 3.2741448879241943,
"learning_rate": 3.788217747949292e-05,
"loss": 3.9362,
"step": 1475
},
{
"epoch": 0.3272164492593411,
"grad_norm": 3.177788734436035,
"learning_rate": 3.782003479990057e-05,
"loss": 4.0183,
"step": 1480
},
{
"epoch": 0.32832191023656865,
"grad_norm": 3.6237776279449463,
"learning_rate": 3.775789212030823e-05,
"loss": 4.0285,
"step": 1485
},
{
"epoch": 0.32942737121379617,
"grad_norm": 3.418241024017334,
"learning_rate": 3.7695749440715884e-05,
"loss": 4.1458,
"step": 1490
},
{
"epoch": 0.33053283219102364,
"grad_norm": 3.0317554473876953,
"learning_rate": 3.763360676112354e-05,
"loss": 3.9586,
"step": 1495
},
{
"epoch": 0.33163829316825116,
"grad_norm": 3.402616024017334,
"learning_rate": 3.7571464081531196e-05,
"loss": 4.1311,
"step": 1500
},
{
"epoch": 0.3327437541454787,
"grad_norm": 3.386590003967285,
"learning_rate": 3.750932140193885e-05,
"loss": 4.189,
"step": 1505
},
{
"epoch": 0.33384921512270616,
"grad_norm": 3.329336404800415,
"learning_rate": 3.744717872234651e-05,
"loss": 3.9931,
"step": 1510
},
{
"epoch": 0.3349546760999337,
"grad_norm": 3.281658411026001,
"learning_rate": 3.7385036042754165e-05,
"loss": 4.0458,
"step": 1515
},
{
"epoch": 0.3360601370771612,
"grad_norm": 3.196786880493164,
"learning_rate": 3.732289336316182e-05,
"loss": 3.9526,
"step": 1520
},
{
"epoch": 0.33716559805438867,
"grad_norm": 3.386678695678711,
"learning_rate": 3.726075068356948e-05,
"loss": 4.1347,
"step": 1525
},
{
"epoch": 0.3382710590316162,
"grad_norm": 2.9931721687316895,
"learning_rate": 3.719860800397713e-05,
"loss": 3.9369,
"step": 1530
},
{
"epoch": 0.33937652000884366,
"grad_norm": 3.7105250358581543,
"learning_rate": 3.713646532438479e-05,
"loss": 3.8733,
"step": 1535
},
{
"epoch": 0.3404819809860712,
"grad_norm": 3.0669617652893066,
"learning_rate": 3.7074322644792446e-05,
"loss": 3.8466,
"step": 1540
},
{
"epoch": 0.3415874419632987,
"grad_norm": 3.449889898300171,
"learning_rate": 3.70121799652001e-05,
"loss": 4.0733,
"step": 1545
},
{
"epoch": 0.3426929029405262,
"grad_norm": 3.4569785594940186,
"learning_rate": 3.695003728560776e-05,
"loss": 3.9711,
"step": 1550
},
{
"epoch": 0.3437983639177537,
"grad_norm": 3.4246673583984375,
"learning_rate": 3.6887894606015414e-05,
"loss": 4.0172,
"step": 1555
},
{
"epoch": 0.3449038248949812,
"grad_norm": 3.5262482166290283,
"learning_rate": 3.682575192642307e-05,
"loss": 4.1475,
"step": 1560
},
{
"epoch": 0.3460092858722087,
"grad_norm": 3.057406425476074,
"learning_rate": 3.6763609246830726e-05,
"loss": 4.0023,
"step": 1565
},
{
"epoch": 0.3471147468494362,
"grad_norm": 3.6714344024658203,
"learning_rate": 3.670146656723838e-05,
"loss": 3.9847,
"step": 1570
},
{
"epoch": 0.34822020782666374,
"grad_norm": 3.396587371826172,
"learning_rate": 3.663932388764604e-05,
"loss": 4.1175,
"step": 1575
},
{
"epoch": 0.3493256688038912,
"grad_norm": 3.11995530128479,
"learning_rate": 3.6577181208053695e-05,
"loss": 4.0171,
"step": 1580
},
{
"epoch": 0.35043112978111873,
"grad_norm": 3.4781930446624756,
"learning_rate": 3.651503852846135e-05,
"loss": 3.9343,
"step": 1585
},
{
"epoch": 0.35153659075834626,
"grad_norm": 3.264204263687134,
"learning_rate": 3.645289584886901e-05,
"loss": 4.1221,
"step": 1590
},
{
"epoch": 0.3526420517355737,
"grad_norm": 3.2987558841705322,
"learning_rate": 3.639075316927666e-05,
"loss": 4.1632,
"step": 1595
},
{
"epoch": 0.35374751271280125,
"grad_norm": 3.6787593364715576,
"learning_rate": 3.632861048968432e-05,
"loss": 3.9153,
"step": 1600
},
{
"epoch": 0.3548529736900287,
"grad_norm": 3.2717323303222656,
"learning_rate": 3.6266467810091976e-05,
"loss": 3.9902,
"step": 1605
},
{
"epoch": 0.35595843466725624,
"grad_norm": 3.1607632637023926,
"learning_rate": 3.6204325130499625e-05,
"loss": 4.0374,
"step": 1610
},
{
"epoch": 0.35706389564448376,
"grad_norm": 3.187629461288452,
"learning_rate": 3.614218245090729e-05,
"loss": 4.2059,
"step": 1615
},
{
"epoch": 0.35816935662171123,
"grad_norm": 3.6148953437805176,
"learning_rate": 3.608003977131494e-05,
"loss": 4.0533,
"step": 1620
},
{
"epoch": 0.35927481759893876,
"grad_norm": 3.3978331089019775,
"learning_rate": 3.6017897091722594e-05,
"loss": 4.0545,
"step": 1625
},
{
"epoch": 0.3603802785761663,
"grad_norm": 3.5654563903808594,
"learning_rate": 3.595575441213026e-05,
"loss": 4.0747,
"step": 1630
},
{
"epoch": 0.36148573955339375,
"grad_norm": 3.0887868404388428,
"learning_rate": 3.5893611732537906e-05,
"loss": 4.0406,
"step": 1635
},
{
"epoch": 0.36259120053062127,
"grad_norm": 2.8452141284942627,
"learning_rate": 3.583146905294556e-05,
"loss": 4.0105,
"step": 1640
},
{
"epoch": 0.3636966615078488,
"grad_norm": 3.3485066890716553,
"learning_rate": 3.576932637335322e-05,
"loss": 4.1587,
"step": 1645
},
{
"epoch": 0.36480212248507626,
"grad_norm": 3.476148843765259,
"learning_rate": 3.5707183693760875e-05,
"loss": 3.9972,
"step": 1650
},
{
"epoch": 0.3659075834623038,
"grad_norm": 3.3700621128082275,
"learning_rate": 3.564504101416854e-05,
"loss": 3.8875,
"step": 1655
},
{
"epoch": 0.3670130444395313,
"grad_norm": 3.48191237449646,
"learning_rate": 3.558289833457619e-05,
"loss": 4.0436,
"step": 1660
},
{
"epoch": 0.3681185054167588,
"grad_norm": 2.992255926132202,
"learning_rate": 3.552075565498384e-05,
"loss": 4.143,
"step": 1665
},
{
"epoch": 0.3692239663939863,
"grad_norm": 3.511962413787842,
"learning_rate": 3.5458612975391506e-05,
"loss": 4.0267,
"step": 1670
},
{
"epoch": 0.37032942737121377,
"grad_norm": 3.1641499996185303,
"learning_rate": 3.5396470295799155e-05,
"loss": 3.9213,
"step": 1675
},
{
"epoch": 0.3714348883484413,
"grad_norm": 3.7594759464263916,
"learning_rate": 3.533432761620681e-05,
"loss": 4.1522,
"step": 1680
},
{
"epoch": 0.3725403493256688,
"grad_norm": 3.7265207767486572,
"learning_rate": 3.527218493661447e-05,
"loss": 3.9366,
"step": 1685
},
{
"epoch": 0.3736458103028963,
"grad_norm": 3.301990270614624,
"learning_rate": 3.5210042257022124e-05,
"loss": 3.9142,
"step": 1690
},
{
"epoch": 0.3747512712801238,
"grad_norm": 3.2270445823669434,
"learning_rate": 3.514789957742978e-05,
"loss": 4.0301,
"step": 1695
},
{
"epoch": 0.37585673225735133,
"grad_norm": 3.4519598484039307,
"learning_rate": 3.5085756897837436e-05,
"loss": 3.9566,
"step": 1700
},
{
"epoch": 0.3769621932345788,
"grad_norm": 3.3497774600982666,
"learning_rate": 3.502361421824509e-05,
"loss": 3.9327,
"step": 1705
},
{
"epoch": 0.3780676542118063,
"grad_norm": 3.5343832969665527,
"learning_rate": 3.496147153865275e-05,
"loss": 4.0446,
"step": 1710
},
{
"epoch": 0.37917311518903385,
"grad_norm": 3.369101047515869,
"learning_rate": 3.4899328859060405e-05,
"loss": 3.9529,
"step": 1715
},
{
"epoch": 0.3802785761662613,
"grad_norm": 3.0477051734924316,
"learning_rate": 3.483718617946806e-05,
"loss": 3.8606,
"step": 1720
},
{
"epoch": 0.38138403714348884,
"grad_norm": 3.516953468322754,
"learning_rate": 3.477504349987572e-05,
"loss": 3.9936,
"step": 1725
},
{
"epoch": 0.38248949812071636,
"grad_norm": 3.628263235092163,
"learning_rate": 3.471290082028337e-05,
"loss": 3.9455,
"step": 1730
},
{
"epoch": 0.38359495909794383,
"grad_norm": 3.476489305496216,
"learning_rate": 3.465075814069103e-05,
"loss": 3.972,
"step": 1735
},
{
"epoch": 0.38470042007517136,
"grad_norm": 3.296743154525757,
"learning_rate": 3.4588615461098686e-05,
"loss": 4.093,
"step": 1740
},
{
"epoch": 0.3858058810523988,
"grad_norm": 3.523559331893921,
"learning_rate": 3.452647278150634e-05,
"loss": 3.9767,
"step": 1745
},
{
"epoch": 0.38691134202962635,
"grad_norm": 3.2359955310821533,
"learning_rate": 3.446433010191399e-05,
"loss": 3.9597,
"step": 1750
},
{
"epoch": 0.38801680300685387,
"grad_norm": 3.318793296813965,
"learning_rate": 3.4402187422321654e-05,
"loss": 4.1788,
"step": 1755
},
{
"epoch": 0.38912226398408134,
"grad_norm": 3.055785655975342,
"learning_rate": 3.434004474272931e-05,
"loss": 3.873,
"step": 1760
},
{
"epoch": 0.39022772496130886,
"grad_norm": 3.787897825241089,
"learning_rate": 3.427790206313696e-05,
"loss": 4.0092,
"step": 1765
},
{
"epoch": 0.3913331859385364,
"grad_norm": 3.6127915382385254,
"learning_rate": 3.421575938354462e-05,
"loss": 3.9295,
"step": 1770
},
{
"epoch": 0.39243864691576386,
"grad_norm": 3.254620313644409,
"learning_rate": 3.415361670395227e-05,
"loss": 3.965,
"step": 1775
},
{
"epoch": 0.3935441078929914,
"grad_norm": 3.480854034423828,
"learning_rate": 3.4091474024359935e-05,
"loss": 4.0151,
"step": 1780
},
{
"epoch": 0.3946495688702189,
"grad_norm": 3.200242280960083,
"learning_rate": 3.402933134476759e-05,
"loss": 4.0929,
"step": 1785
},
{
"epoch": 0.39575502984744637,
"grad_norm": 3.1364223957061768,
"learning_rate": 3.396718866517524e-05,
"loss": 3.9891,
"step": 1790
},
{
"epoch": 0.3968604908246739,
"grad_norm": 3.4453999996185303,
"learning_rate": 3.3905045985582904e-05,
"loss": 4.1357,
"step": 1795
},
{
"epoch": 0.3979659518019014,
"grad_norm": 3.265876531600952,
"learning_rate": 3.384290330599056e-05,
"loss": 3.8728,
"step": 1800
},
{
"epoch": 0.3990714127791289,
"grad_norm": 3.2799103260040283,
"learning_rate": 3.378076062639821e-05,
"loss": 4.1506,
"step": 1805
},
{
"epoch": 0.4001768737563564,
"grad_norm": 3.2966063022613525,
"learning_rate": 3.371861794680587e-05,
"loss": 3.8413,
"step": 1810
},
{
"epoch": 0.4012823347335839,
"grad_norm": 3.346560478210449,
"learning_rate": 3.365647526721352e-05,
"loss": 4.0029,
"step": 1815
},
{
"epoch": 0.4023877957108114,
"grad_norm": 3.191598892211914,
"learning_rate": 3.359433258762118e-05,
"loss": 4.0017,
"step": 1820
},
{
"epoch": 0.4034932566880389,
"grad_norm": 3.689901113510132,
"learning_rate": 3.353218990802884e-05,
"loss": 4.1346,
"step": 1825
},
{
"epoch": 0.4045987176652664,
"grad_norm": 3.4523544311523438,
"learning_rate": 3.347004722843649e-05,
"loss": 4.0593,
"step": 1830
},
{
"epoch": 0.4057041786424939,
"grad_norm": 3.3706953525543213,
"learning_rate": 3.3407904548844146e-05,
"loss": 4.1312,
"step": 1835
},
{
"epoch": 0.40680963961972144,
"grad_norm": 3.5654544830322266,
"learning_rate": 3.33457618692518e-05,
"loss": 4.0694,
"step": 1840
},
{
"epoch": 0.4079151005969489,
"grad_norm": 3.540480136871338,
"learning_rate": 3.328361918965946e-05,
"loss": 4.123,
"step": 1845
},
{
"epoch": 0.40902056157417643,
"grad_norm": 3.286994695663452,
"learning_rate": 3.3221476510067115e-05,
"loss": 4.1365,
"step": 1850
},
{
"epoch": 0.41012602255140396,
"grad_norm": 3.0457570552825928,
"learning_rate": 3.315933383047477e-05,
"loss": 3.9359,
"step": 1855
},
{
"epoch": 0.4112314835286314,
"grad_norm": 3.2751758098602295,
"learning_rate": 3.309719115088243e-05,
"loss": 4.0003,
"step": 1860
},
{
"epoch": 0.41233694450585895,
"grad_norm": 3.345170259475708,
"learning_rate": 3.303504847129008e-05,
"loss": 3.87,
"step": 1865
},
{
"epoch": 0.4134424054830865,
"grad_norm": 3.398428440093994,
"learning_rate": 3.297290579169774e-05,
"loss": 3.9499,
"step": 1870
},
{
"epoch": 0.41454786646031394,
"grad_norm": 3.3243329524993896,
"learning_rate": 3.2910763112105396e-05,
"loss": 4.0548,
"step": 1875
},
{
"epoch": 0.41565332743754146,
"grad_norm": 3.449658155441284,
"learning_rate": 3.284862043251305e-05,
"loss": 3.8984,
"step": 1880
},
{
"epoch": 0.41675878841476893,
"grad_norm": 3.741178035736084,
"learning_rate": 3.278647775292071e-05,
"loss": 4.1575,
"step": 1885
},
{
"epoch": 0.41786424939199646,
"grad_norm": 3.4483730792999268,
"learning_rate": 3.2724335073328364e-05,
"loss": 4.034,
"step": 1890
},
{
"epoch": 0.418969710369224,
"grad_norm": 3.176455020904541,
"learning_rate": 3.266219239373602e-05,
"loss": 3.9522,
"step": 1895
},
{
"epoch": 0.42007517134645145,
"grad_norm": 3.323781967163086,
"learning_rate": 3.2600049714143676e-05,
"loss": 4.04,
"step": 1900
},
{
"epoch": 0.42118063232367897,
"grad_norm": 3.125051498413086,
"learning_rate": 3.253790703455133e-05,
"loss": 3.9916,
"step": 1905
},
{
"epoch": 0.4222860933009065,
"grad_norm": 3.488311767578125,
"learning_rate": 3.247576435495899e-05,
"loss": 4.1544,
"step": 1910
},
{
"epoch": 0.42339155427813396,
"grad_norm": 3.3193490505218506,
"learning_rate": 3.2413621675366645e-05,
"loss": 3.8267,
"step": 1915
},
{
"epoch": 0.4244970152553615,
"grad_norm": 3.118138313293457,
"learning_rate": 3.23514789957743e-05,
"loss": 4.0021,
"step": 1920
},
{
"epoch": 0.425602476232589,
"grad_norm": 3.0843567848205566,
"learning_rate": 3.228933631618196e-05,
"loss": 4.0595,
"step": 1925
},
{
"epoch": 0.4267079372098165,
"grad_norm": 3.249384880065918,
"learning_rate": 3.222719363658961e-05,
"loss": 4.0343,
"step": 1930
},
{
"epoch": 0.427813398187044,
"grad_norm": 3.4635889530181885,
"learning_rate": 3.216505095699727e-05,
"loss": 4.1018,
"step": 1935
},
{
"epoch": 0.4289188591642715,
"grad_norm": 3.705624580383301,
"learning_rate": 3.2102908277404926e-05,
"loss": 4.0875,
"step": 1940
},
{
"epoch": 0.430024320141499,
"grad_norm": 3.6071228981018066,
"learning_rate": 3.2040765597812575e-05,
"loss": 4.0507,
"step": 1945
},
{
"epoch": 0.4311297811187265,
"grad_norm": 3.513573169708252,
"learning_rate": 3.197862291822024e-05,
"loss": 3.9596,
"step": 1950
},
{
"epoch": 0.432235242095954,
"grad_norm": 3.4200334548950195,
"learning_rate": 3.1916480238627894e-05,
"loss": 3.9723,
"step": 1955
},
{
"epoch": 0.4333407030731815,
"grad_norm": 3.472170114517212,
"learning_rate": 3.1854337559035544e-05,
"loss": 4.0224,
"step": 1960
},
{
"epoch": 0.43444616405040903,
"grad_norm": 3.499969482421875,
"learning_rate": 3.179219487944321e-05,
"loss": 3.935,
"step": 1965
},
{
"epoch": 0.4355516250276365,
"grad_norm": 3.5393736362457275,
"learning_rate": 3.1730052199850856e-05,
"loss": 4.1007,
"step": 1970
},
{
"epoch": 0.436657086004864,
"grad_norm": 3.557710647583008,
"learning_rate": 3.166790952025851e-05,
"loss": 4.1932,
"step": 1975
},
{
"epoch": 0.43776254698209155,
"grad_norm": 3.4602739810943604,
"learning_rate": 3.1605766840666175e-05,
"loss": 4.0741,
"step": 1980
},
{
"epoch": 0.438868007959319,
"grad_norm": 3.578395366668701,
"learning_rate": 3.1543624161073825e-05,
"loss": 4.0468,
"step": 1985
},
{
"epoch": 0.43997346893654654,
"grad_norm": 3.3289973735809326,
"learning_rate": 3.148148148148148e-05,
"loss": 3.9823,
"step": 1990
},
{
"epoch": 0.44107892991377406,
"grad_norm": 3.6602888107299805,
"learning_rate": 3.141933880188914e-05,
"loss": 4.0993,
"step": 1995
},
{
"epoch": 0.44218439089100153,
"grad_norm": 3.5060999393463135,
"learning_rate": 3.135719612229679e-05,
"loss": 3.9399,
"step": 2000
},
{
"epoch": 0.44328985186822906,
"grad_norm": 3.185040235519409,
"learning_rate": 3.1295053442704456e-05,
"loss": 4.0126,
"step": 2005
},
{
"epoch": 0.4443953128454566,
"grad_norm": 3.3001205921173096,
"learning_rate": 3.1232910763112105e-05,
"loss": 4.1115,
"step": 2010
},
{
"epoch": 0.44550077382268405,
"grad_norm": 3.4892706871032715,
"learning_rate": 3.117076808351976e-05,
"loss": 4.1656,
"step": 2015
},
{
"epoch": 0.4466062347999116,
"grad_norm": 3.1955862045288086,
"learning_rate": 3.110862540392742e-05,
"loss": 4.0902,
"step": 2020
},
{
"epoch": 0.44771169577713904,
"grad_norm": 3.3935418128967285,
"learning_rate": 3.1046482724335074e-05,
"loss": 3.9551,
"step": 2025
},
{
"epoch": 0.44881715675436656,
"grad_norm": 3.6117637157440186,
"learning_rate": 3.098434004474273e-05,
"loss": 4.1856,
"step": 2030
},
{
"epoch": 0.4499226177315941,
"grad_norm": 3.432446002960205,
"learning_rate": 3.0922197365150386e-05,
"loss": 4.1022,
"step": 2035
},
{
"epoch": 0.45102807870882156,
"grad_norm": 3.3948235511779785,
"learning_rate": 3.086005468555804e-05,
"loss": 4.0777,
"step": 2040
},
{
"epoch": 0.4521335396860491,
"grad_norm": 3.169699192047119,
"learning_rate": 3.07979120059657e-05,
"loss": 4.0572,
"step": 2045
},
{
"epoch": 0.4532390006632766,
"grad_norm": 3.3817138671875,
"learning_rate": 3.0735769326373355e-05,
"loss": 4.0533,
"step": 2050
},
{
"epoch": 0.45434446164050407,
"grad_norm": 3.4111692905426025,
"learning_rate": 3.067362664678101e-05,
"loss": 4.0991,
"step": 2055
},
{
"epoch": 0.4554499226177316,
"grad_norm": 3.7082407474517822,
"learning_rate": 3.061148396718867e-05,
"loss": 4.0745,
"step": 2060
},
{
"epoch": 0.4565553835949591,
"grad_norm": 3.393707036972046,
"learning_rate": 3.054934128759632e-05,
"loss": 3.9929,
"step": 2065
},
{
"epoch": 0.4576608445721866,
"grad_norm": 3.750239133834839,
"learning_rate": 3.048719860800398e-05,
"loss": 4.1835,
"step": 2070
},
{
"epoch": 0.4587663055494141,
"grad_norm": 3.366420030593872,
"learning_rate": 3.0425055928411632e-05,
"loss": 4.0436,
"step": 2075
},
{
"epoch": 0.45987176652664163,
"grad_norm": 3.3570804595947266,
"learning_rate": 3.0362913248819292e-05,
"loss": 3.9977,
"step": 2080
},
{
"epoch": 0.4609772275038691,
"grad_norm": 3.541613817214966,
"learning_rate": 3.0300770569226945e-05,
"loss": 4.0789,
"step": 2085
},
{
"epoch": 0.4620826884810966,
"grad_norm": 3.697382926940918,
"learning_rate": 3.02386278896346e-05,
"loss": 4.1316,
"step": 2090
},
{
"epoch": 0.4631881494583241,
"grad_norm": 3.375995397567749,
"learning_rate": 3.017648521004226e-05,
"loss": 3.935,
"step": 2095
},
{
"epoch": 0.4642936104355516,
"grad_norm": 3.3144774436950684,
"learning_rate": 3.0114342530449913e-05,
"loss": 4.1222,
"step": 2100
},
{
"epoch": 0.46539907141277914,
"grad_norm": 3.600338935852051,
"learning_rate": 3.005219985085757e-05,
"loss": 4.1123,
"step": 2105
},
{
"epoch": 0.4665045323900066,
"grad_norm": 3.3715898990631104,
"learning_rate": 2.999005717126523e-05,
"loss": 4.1952,
"step": 2110
},
{
"epoch": 0.46760999336723413,
"grad_norm": 3.2076468467712402,
"learning_rate": 2.992791449167288e-05,
"loss": 3.9456,
"step": 2115
},
{
"epoch": 0.46871545434446166,
"grad_norm": 3.7750439643859863,
"learning_rate": 2.986577181208054e-05,
"loss": 4.0785,
"step": 2120
},
{
"epoch": 0.4698209153216891,
"grad_norm": 3.3552026748657227,
"learning_rate": 2.980362913248819e-05,
"loss": 4.3222,
"step": 2125
},
{
"epoch": 0.47092637629891665,
"grad_norm": 3.4313700199127197,
"learning_rate": 2.974148645289585e-05,
"loss": 3.9145,
"step": 2130
},
{
"epoch": 0.4720318372761442,
"grad_norm": 3.4928014278411865,
"learning_rate": 2.967934377330351e-05,
"loss": 3.8454,
"step": 2135
},
{
"epoch": 0.47313729825337164,
"grad_norm": 3.6989784240722656,
"learning_rate": 2.9617201093711163e-05,
"loss": 3.9581,
"step": 2140
},
{
"epoch": 0.47424275923059916,
"grad_norm": 3.152308702468872,
"learning_rate": 2.955505841411882e-05,
"loss": 3.9159,
"step": 2145
},
{
"epoch": 0.4753482202078267,
"grad_norm": 3.2610297203063965,
"learning_rate": 2.949291573452647e-05,
"loss": 4.1646,
"step": 2150
},
{
"epoch": 0.47645368118505416,
"grad_norm": 3.4919862747192383,
"learning_rate": 2.943077305493413e-05,
"loss": 3.9627,
"step": 2155
},
{
"epoch": 0.4775591421622817,
"grad_norm": 3.323495388031006,
"learning_rate": 2.9368630375341787e-05,
"loss": 3.9826,
"step": 2160
},
{
"epoch": 0.47866460313950915,
"grad_norm": 3.4803435802459717,
"learning_rate": 2.930648769574944e-05,
"loss": 4.1256,
"step": 2165
},
{
"epoch": 0.47977006411673667,
"grad_norm": 3.3792881965637207,
"learning_rate": 2.92443450161571e-05,
"loss": 3.9697,
"step": 2170
},
{
"epoch": 0.4808755250939642,
"grad_norm": 3.5845255851745605,
"learning_rate": 2.9182202336564756e-05,
"loss": 4.1054,
"step": 2175
},
{
"epoch": 0.48198098607119166,
"grad_norm": 3.275973081588745,
"learning_rate": 2.912005965697241e-05,
"loss": 4.1417,
"step": 2180
},
{
"epoch": 0.4830864470484192,
"grad_norm": 3.3241536617279053,
"learning_rate": 2.9057916977380068e-05,
"loss": 4.0629,
"step": 2185
},
{
"epoch": 0.4841919080256467,
"grad_norm": 3.298708200454712,
"learning_rate": 2.899577429778772e-05,
"loss": 3.9206,
"step": 2190
},
{
"epoch": 0.4852973690028742,
"grad_norm": 3.18892502784729,
"learning_rate": 2.8933631618195377e-05,
"loss": 4.0769,
"step": 2195
},
{
"epoch": 0.4864028299801017,
"grad_norm": 3.206279993057251,
"learning_rate": 2.8871488938603037e-05,
"loss": 3.937,
"step": 2200
},
{
"epoch": 0.4875082909573292,
"grad_norm": 3.4408323764801025,
"learning_rate": 2.880934625901069e-05,
"loss": 4.0496,
"step": 2205
},
{
"epoch": 0.4886137519345567,
"grad_norm": 3.258359670639038,
"learning_rate": 2.8747203579418346e-05,
"loss": 4.038,
"step": 2210
},
{
"epoch": 0.4897192129117842,
"grad_norm": 3.336268424987793,
"learning_rate": 2.8685060899826e-05,
"loss": 4.0437,
"step": 2215
},
{
"epoch": 0.49082467388901174,
"grad_norm": 3.27437686920166,
"learning_rate": 2.8622918220233658e-05,
"loss": 4.1238,
"step": 2220
},
{
"epoch": 0.4919301348662392,
"grad_norm": 3.076141595840454,
"learning_rate": 2.8560775540641317e-05,
"loss": 3.987,
"step": 2225
},
{
"epoch": 0.49303559584346673,
"grad_norm": 3.2528483867645264,
"learning_rate": 2.8498632861048967e-05,
"loss": 3.9728,
"step": 2230
},
{
"epoch": 0.4941410568206942,
"grad_norm": 3.397096872329712,
"learning_rate": 2.8436490181456626e-05,
"loss": 4.1707,
"step": 2235
},
{
"epoch": 0.4952465177979217,
"grad_norm": 3.2209689617156982,
"learning_rate": 2.837434750186428e-05,
"loss": 4.0548,
"step": 2240
},
{
"epoch": 0.49635197877514925,
"grad_norm": 3.292736530303955,
"learning_rate": 2.831220482227194e-05,
"loss": 4.0244,
"step": 2245
},
{
"epoch": 0.4974574397523767,
"grad_norm": 3.461022138595581,
"learning_rate": 2.8250062142679595e-05,
"loss": 4.0763,
"step": 2250
},
{
"epoch": 0.49856290072960424,
"grad_norm": 3.4967451095581055,
"learning_rate": 2.8187919463087248e-05,
"loss": 4.1721,
"step": 2255
},
{
"epoch": 0.49966836170683177,
"grad_norm": 3.2440531253814697,
"learning_rate": 2.8125776783494907e-05,
"loss": 4.0546,
"step": 2260
},
{
"epoch": 0.5007738226840592,
"grad_norm": 3.318380355834961,
"learning_rate": 2.8063634103902563e-05,
"loss": 4.0751,
"step": 2265
},
{
"epoch": 0.5018792836612868,
"grad_norm": 3.1638567447662354,
"learning_rate": 2.8001491424310216e-05,
"loss": 3.9274,
"step": 2270
},
{
"epoch": 0.5029847446385143,
"grad_norm": 3.345717430114746,
"learning_rate": 2.7939348744717876e-05,
"loss": 4.1606,
"step": 2275
},
{
"epoch": 0.5040902056157418,
"grad_norm": 3.5760574340820312,
"learning_rate": 2.787720606512553e-05,
"loss": 3.9832,
"step": 2280
},
{
"epoch": 0.5051956665929692,
"grad_norm": 3.3899612426757812,
"learning_rate": 2.7815063385533185e-05,
"loss": 4.0456,
"step": 2285
},
{
"epoch": 0.5063011275701967,
"grad_norm": 3.3774311542510986,
"learning_rate": 2.7752920705940844e-05,
"loss": 4.0956,
"step": 2290
},
{
"epoch": 0.5074065885474243,
"grad_norm": 3.1358556747436523,
"learning_rate": 2.7690778026348497e-05,
"loss": 4.1611,
"step": 2295
},
{
"epoch": 0.5085120495246518,
"grad_norm": 3.3426547050476074,
"learning_rate": 2.7628635346756153e-05,
"loss": 4.1298,
"step": 2300
},
{
"epoch": 0.5096175105018793,
"grad_norm": 3.252143383026123,
"learning_rate": 2.7566492667163806e-05,
"loss": 4.0572,
"step": 2305
},
{
"epoch": 0.5107229714791068,
"grad_norm": 3.4557764530181885,
"learning_rate": 2.7504349987571466e-05,
"loss": 4.0824,
"step": 2310
},
{
"epoch": 0.5118284324563342,
"grad_norm": 3.2078895568847656,
"learning_rate": 2.7442207307979122e-05,
"loss": 3.9483,
"step": 2315
},
{
"epoch": 0.5129338934335618,
"grad_norm": 3.4674055576324463,
"learning_rate": 2.7380064628386775e-05,
"loss": 4.0843,
"step": 2320
},
{
"epoch": 0.5140393544107893,
"grad_norm": 3.7841782569885254,
"learning_rate": 2.7317921948794434e-05,
"loss": 4.2304,
"step": 2325
},
{
"epoch": 0.5151448153880168,
"grad_norm": 3.267167091369629,
"learning_rate": 2.725577926920209e-05,
"loss": 4.0463,
"step": 2330
},
{
"epoch": 0.5162502763652443,
"grad_norm": 3.782557725906372,
"learning_rate": 2.7193636589609743e-05,
"loss": 4.0149,
"step": 2335
},
{
"epoch": 0.5173557373424718,
"grad_norm": 3.4802868366241455,
"learning_rate": 2.7131493910017403e-05,
"loss": 3.9961,
"step": 2340
},
{
"epoch": 0.5184611983196993,
"grad_norm": 3.346196413040161,
"learning_rate": 2.7069351230425055e-05,
"loss": 3.9326,
"step": 2345
},
{
"epoch": 0.5195666592969268,
"grad_norm": 3.166124105453491,
"learning_rate": 2.7007208550832715e-05,
"loss": 3.9516,
"step": 2350
},
{
"epoch": 0.5206721202741543,
"grad_norm": 3.288295269012451,
"learning_rate": 2.694506587124037e-05,
"loss": 4.1143,
"step": 2355
},
{
"epoch": 0.5217775812513819,
"grad_norm": 3.3296289443969727,
"learning_rate": 2.6882923191648024e-05,
"loss": 3.8863,
"step": 2360
},
{
"epoch": 0.5228830422286094,
"grad_norm": 3.1221563816070557,
"learning_rate": 2.6820780512055683e-05,
"loss": 3.9889,
"step": 2365
},
{
"epoch": 0.5239885032058368,
"grad_norm": 3.225713014602661,
"learning_rate": 2.6758637832463336e-05,
"loss": 3.947,
"step": 2370
},
{
"epoch": 0.5250939641830643,
"grad_norm": 3.5291709899902344,
"learning_rate": 2.6696495152870992e-05,
"loss": 4.1917,
"step": 2375
},
{
"epoch": 0.5261994251602918,
"grad_norm": 3.4283344745635986,
"learning_rate": 2.6634352473278652e-05,
"loss": 4.0173,
"step": 2380
},
{
"epoch": 0.5273048861375194,
"grad_norm": 3.4083287715911865,
"learning_rate": 2.6572209793686305e-05,
"loss": 4.1016,
"step": 2385
},
{
"epoch": 0.5284103471147469,
"grad_norm": 3.3082547187805176,
"learning_rate": 2.651006711409396e-05,
"loss": 4.1025,
"step": 2390
},
{
"epoch": 0.5295158080919744,
"grad_norm": 3.645259141921997,
"learning_rate": 2.644792443450162e-05,
"loss": 3.9657,
"step": 2395
},
{
"epoch": 0.5306212690692018,
"grad_norm": 3.1570723056793213,
"learning_rate": 2.6385781754909273e-05,
"loss": 4.0965,
"step": 2400
},
{
"epoch": 0.5317267300464293,
"grad_norm": 3.387300491333008,
"learning_rate": 2.632363907531693e-05,
"loss": 4.0099,
"step": 2405
},
{
"epoch": 0.5328321910236569,
"grad_norm": 3.4514920711517334,
"learning_rate": 2.6261496395724582e-05,
"loss": 3.9037,
"step": 2410
},
{
"epoch": 0.5339376520008844,
"grad_norm": 3.7543208599090576,
"learning_rate": 2.6199353716132242e-05,
"loss": 4.0804,
"step": 2415
},
{
"epoch": 0.5350431129781119,
"grad_norm": 3.4875600337982178,
"learning_rate": 2.6137211036539898e-05,
"loss": 4.0105,
"step": 2420
},
{
"epoch": 0.5361485739553393,
"grad_norm": 3.4124867916107178,
"learning_rate": 2.607506835694755e-05,
"loss": 4.1436,
"step": 2425
},
{
"epoch": 0.5372540349325668,
"grad_norm": 3.392489194869995,
"learning_rate": 2.601292567735521e-05,
"loss": 4.087,
"step": 2430
},
{
"epoch": 0.5383594959097944,
"grad_norm": 3.3754377365112305,
"learning_rate": 2.5950782997762863e-05,
"loss": 4.0433,
"step": 2435
},
{
"epoch": 0.5394649568870219,
"grad_norm": 3.23037052154541,
"learning_rate": 2.588864031817052e-05,
"loss": 3.9529,
"step": 2440
},
{
"epoch": 0.5405704178642494,
"grad_norm": 3.4852147102355957,
"learning_rate": 2.582649763857818e-05,
"loss": 4.0165,
"step": 2445
},
{
"epoch": 0.5416758788414769,
"grad_norm": 3.5113587379455566,
"learning_rate": 2.576435495898583e-05,
"loss": 4.1145,
"step": 2450
},
{
"epoch": 0.5427813398187044,
"grad_norm": 3.569577693939209,
"learning_rate": 2.5702212279393488e-05,
"loss": 4.2112,
"step": 2455
},
{
"epoch": 0.5438868007959319,
"grad_norm": 3.2119925022125244,
"learning_rate": 2.564006959980114e-05,
"loss": 3.9315,
"step": 2460
},
{
"epoch": 0.5449922617731594,
"grad_norm": 3.502654790878296,
"learning_rate": 2.55779269202088e-05,
"loss": 4.0101,
"step": 2465
},
{
"epoch": 0.5460977227503869,
"grad_norm": 3.343017101287842,
"learning_rate": 2.551578424061646e-05,
"loss": 4.0618,
"step": 2470
},
{
"epoch": 0.5472031837276145,
"grad_norm": 3.0435657501220703,
"learning_rate": 2.545364156102411e-05,
"loss": 4.2169,
"step": 2475
},
{
"epoch": 0.548308644704842,
"grad_norm": 3.167151927947998,
"learning_rate": 2.539149888143177e-05,
"loss": 4.043,
"step": 2480
},
{
"epoch": 0.5494141056820694,
"grad_norm": 3.2351808547973633,
"learning_rate": 2.5329356201839428e-05,
"loss": 4.0381,
"step": 2485
},
{
"epoch": 0.5505195666592969,
"grad_norm": 3.1816964149475098,
"learning_rate": 2.526721352224708e-05,
"loss": 4.1283,
"step": 2490
},
{
"epoch": 0.5516250276365244,
"grad_norm": 3.2556283473968506,
"learning_rate": 2.5205070842654737e-05,
"loss": 4.1709,
"step": 2495
},
{
"epoch": 0.552730488613752,
"grad_norm": 3.2887418270111084,
"learning_rate": 2.514292816306239e-05,
"loss": 4.1116,
"step": 2500
},
{
"epoch": 0.5538359495909795,
"grad_norm": 3.559380531311035,
"learning_rate": 2.508078548347005e-05,
"loss": 4.0527,
"step": 2505
},
{
"epoch": 0.5549414105682069,
"grad_norm": 3.470162868499756,
"learning_rate": 2.5018642803877706e-05,
"loss": 4.0154,
"step": 2510
},
{
"epoch": 0.5560468715454344,
"grad_norm": 3.294788122177124,
"learning_rate": 2.495650012428536e-05,
"loss": 4.0073,
"step": 2515
},
{
"epoch": 0.5571523325226619,
"grad_norm": 3.3408074378967285,
"learning_rate": 2.4894357444693018e-05,
"loss": 4.1111,
"step": 2520
},
{
"epoch": 0.5582577934998895,
"grad_norm": 3.436032295227051,
"learning_rate": 2.4832214765100674e-05,
"loss": 4.0138,
"step": 2525
},
{
"epoch": 0.559363254477117,
"grad_norm": 3.383261203765869,
"learning_rate": 2.4770072085508327e-05,
"loss": 4.1234,
"step": 2530
},
{
"epoch": 0.5604687154543445,
"grad_norm": 3.479888916015625,
"learning_rate": 2.4707929405915983e-05,
"loss": 4.0519,
"step": 2535
},
{
"epoch": 0.5615741764315719,
"grad_norm": 3.390536069869995,
"learning_rate": 2.4645786726323643e-05,
"loss": 4.1424,
"step": 2540
},
{
"epoch": 0.5626796374087994,
"grad_norm": 3.320270538330078,
"learning_rate": 2.4583644046731296e-05,
"loss": 4.0054,
"step": 2545
},
{
"epoch": 0.563785098386027,
"grad_norm": 3.477365016937256,
"learning_rate": 2.452150136713895e-05,
"loss": 4.0191,
"step": 2550
},
{
"epoch": 0.5648905593632545,
"grad_norm": 3.547175884246826,
"learning_rate": 2.4459358687546608e-05,
"loss": 4.0718,
"step": 2555
},
{
"epoch": 0.565996020340482,
"grad_norm": 3.567544937133789,
"learning_rate": 2.4397216007954264e-05,
"loss": 4.1387,
"step": 2560
},
{
"epoch": 0.5671014813177094,
"grad_norm": 3.351850748062134,
"learning_rate": 2.433507332836192e-05,
"loss": 4.0413,
"step": 2565
},
{
"epoch": 0.568206942294937,
"grad_norm": 3.4294025897979736,
"learning_rate": 2.4272930648769576e-05,
"loss": 4.0404,
"step": 2570
},
{
"epoch": 0.5693124032721645,
"grad_norm": 3.4079086780548096,
"learning_rate": 2.4210787969177233e-05,
"loss": 4.1669,
"step": 2575
},
{
"epoch": 0.570417864249392,
"grad_norm": 3.6439168453216553,
"learning_rate": 2.4148645289584885e-05,
"loss": 4.109,
"step": 2580
},
{
"epoch": 0.5715233252266195,
"grad_norm": 3.3144097328186035,
"learning_rate": 2.4086502609992545e-05,
"loss": 3.9591,
"step": 2585
},
{
"epoch": 0.572628786203847,
"grad_norm": 3.3762526512145996,
"learning_rate": 2.40243599304002e-05,
"loss": 4.1867,
"step": 2590
},
{
"epoch": 0.5737342471810745,
"grad_norm": 3.2939674854278564,
"learning_rate": 2.3962217250807857e-05,
"loss": 4.1226,
"step": 2595
},
{
"epoch": 0.574839708158302,
"grad_norm": 3.094438314437866,
"learning_rate": 2.390007457121551e-05,
"loss": 3.9615,
"step": 2600
},
{
"epoch": 0.5759451691355295,
"grad_norm": 3.3845763206481934,
"learning_rate": 2.383793189162317e-05,
"loss": 3.9805,
"step": 2605
},
{
"epoch": 0.577050630112757,
"grad_norm": 3.696262836456299,
"learning_rate": 2.3775789212030826e-05,
"loss": 3.8625,
"step": 2610
},
{
"epoch": 0.5781560910899846,
"grad_norm": 3.3800036907196045,
"learning_rate": 2.371364653243848e-05,
"loss": 4.1462,
"step": 2615
},
{
"epoch": 0.5792615520672121,
"grad_norm": 3.573200225830078,
"learning_rate": 2.3651503852846135e-05,
"loss": 4.1071,
"step": 2620
},
{
"epoch": 0.5803670130444395,
"grad_norm": 3.651068925857544,
"learning_rate": 2.358936117325379e-05,
"loss": 4.0191,
"step": 2625
},
{
"epoch": 0.581472474021667,
"grad_norm": 3.1807289123535156,
"learning_rate": 2.3527218493661447e-05,
"loss": 4.1579,
"step": 2630
},
{
"epoch": 0.5825779349988945,
"grad_norm": 3.5472700595855713,
"learning_rate": 2.3465075814069103e-05,
"loss": 4.0699,
"step": 2635
},
{
"epoch": 0.5836833959761221,
"grad_norm": 3.3236019611358643,
"learning_rate": 2.340293313447676e-05,
"loss": 3.9927,
"step": 2640
},
{
"epoch": 0.5847888569533496,
"grad_norm": 3.5756359100341797,
"learning_rate": 2.3340790454884416e-05,
"loss": 4.2018,
"step": 2645
},
{
"epoch": 0.585894317930577,
"grad_norm": 3.5606160163879395,
"learning_rate": 2.3278647775292072e-05,
"loss": 4.0626,
"step": 2650
},
{
"epoch": 0.5869997789078045,
"grad_norm": 3.5119574069976807,
"learning_rate": 2.3216505095699728e-05,
"loss": 4.0997,
"step": 2655
},
{
"epoch": 0.588105239885032,
"grad_norm": 3.373201847076416,
"learning_rate": 2.3154362416107384e-05,
"loss": 3.9609,
"step": 2660
},
{
"epoch": 0.5892107008622596,
"grad_norm": 3.168120861053467,
"learning_rate": 2.309221973651504e-05,
"loss": 3.8898,
"step": 2665
},
{
"epoch": 0.5903161618394871,
"grad_norm": 3.260366678237915,
"learning_rate": 2.3030077056922693e-05,
"loss": 4.0445,
"step": 2670
},
{
"epoch": 0.5914216228167146,
"grad_norm": 3.53143572807312,
"learning_rate": 2.2967934377330353e-05,
"loss": 4.033,
"step": 2675
},
{
"epoch": 0.592527083793942,
"grad_norm": 3.4146888256073,
"learning_rate": 2.290579169773801e-05,
"loss": 3.9579,
"step": 2680
},
{
"epoch": 0.5936325447711696,
"grad_norm": 3.554407835006714,
"learning_rate": 2.284364901814566e-05,
"loss": 4.0876,
"step": 2685
},
{
"epoch": 0.5947380057483971,
"grad_norm": 3.302635431289673,
"learning_rate": 2.2781506338553318e-05,
"loss": 4.0015,
"step": 2690
},
{
"epoch": 0.5958434667256246,
"grad_norm": 2.994694948196411,
"learning_rate": 2.2719363658960977e-05,
"loss": 4.1925,
"step": 2695
},
{
"epoch": 0.5969489277028521,
"grad_norm": 3.191727876663208,
"learning_rate": 2.2657220979368633e-05,
"loss": 4.0834,
"step": 2700
},
{
"epoch": 0.5980543886800795,
"grad_norm": 3.187432050704956,
"learning_rate": 2.2595078299776286e-05,
"loss": 4.1476,
"step": 2705
},
{
"epoch": 0.5991598496573071,
"grad_norm": 3.8028817176818848,
"learning_rate": 2.2532935620183942e-05,
"loss": 4.0108,
"step": 2710
},
{
"epoch": 0.6002653106345346,
"grad_norm": 3.493286609649658,
"learning_rate": 2.2470792940591602e-05,
"loss": 4.1705,
"step": 2715
},
{
"epoch": 0.6013707716117621,
"grad_norm": 3.4640684127807617,
"learning_rate": 2.2408650260999255e-05,
"loss": 4.1311,
"step": 2720
},
{
"epoch": 0.6024762325889896,
"grad_norm": 3.8911242485046387,
"learning_rate": 2.234650758140691e-05,
"loss": 4.1535,
"step": 2725
},
{
"epoch": 0.6035816935662172,
"grad_norm": 3.4392147064208984,
"learning_rate": 2.2284364901814567e-05,
"loss": 4.1343,
"step": 2730
},
{
"epoch": 0.6046871545434446,
"grad_norm": 3.2995851039886475,
"learning_rate": 2.2222222222222223e-05,
"loss": 4.0273,
"step": 2735
},
{
"epoch": 0.6057926155206721,
"grad_norm": 3.1584272384643555,
"learning_rate": 2.216007954262988e-05,
"loss": 4.2191,
"step": 2740
},
{
"epoch": 0.6068980764978996,
"grad_norm": 3.7929775714874268,
"learning_rate": 2.2097936863037536e-05,
"loss": 3.9746,
"step": 2745
},
{
"epoch": 0.6080035374751271,
"grad_norm": 3.4396305084228516,
"learning_rate": 2.2035794183445192e-05,
"loss": 4.2164,
"step": 2750
},
{
"epoch": 0.6091089984523547,
"grad_norm": 3.2499279975891113,
"learning_rate": 2.1973651503852845e-05,
"loss": 3.9657,
"step": 2755
},
{
"epoch": 0.6102144594295822,
"grad_norm": 3.682943105697632,
"learning_rate": 2.1911508824260504e-05,
"loss": 4.0552,
"step": 2760
},
{
"epoch": 0.6113199204068096,
"grad_norm": 3.217568874359131,
"learning_rate": 2.184936614466816e-05,
"loss": 4.1355,
"step": 2765
},
{
"epoch": 0.6124253813840371,
"grad_norm": 3.696176528930664,
"learning_rate": 2.1787223465075816e-05,
"loss": 4.1971,
"step": 2770
},
{
"epoch": 0.6135308423612647,
"grad_norm": 3.366211175918579,
"learning_rate": 2.172508078548347e-05,
"loss": 4.1779,
"step": 2775
},
{
"epoch": 0.6146363033384922,
"grad_norm": 3.3090131282806396,
"learning_rate": 2.1662938105891125e-05,
"loss": 4.0138,
"step": 2780
},
{
"epoch": 0.6157417643157197,
"grad_norm": 3.492255210876465,
"learning_rate": 2.1600795426298785e-05,
"loss": 4.113,
"step": 2785
},
{
"epoch": 0.6168472252929471,
"grad_norm": 3.2298202514648438,
"learning_rate": 2.1538652746706438e-05,
"loss": 4.0822,
"step": 2790
},
{
"epoch": 0.6179526862701746,
"grad_norm": 3.3362765312194824,
"learning_rate": 2.1476510067114094e-05,
"loss": 4.1301,
"step": 2795
},
{
"epoch": 0.6190581472474022,
"grad_norm": 3.1772379875183105,
"learning_rate": 2.141436738752175e-05,
"loss": 4.0127,
"step": 2800
},
{
"epoch": 0.6201636082246297,
"grad_norm": 3.5195131301879883,
"learning_rate": 2.1352224707929406e-05,
"loss": 4.0411,
"step": 2805
},
{
"epoch": 0.6212690692018572,
"grad_norm": 3.1108715534210205,
"learning_rate": 2.1290082028337062e-05,
"loss": 3.9511,
"step": 2810
},
{
"epoch": 0.6223745301790847,
"grad_norm": 3.278776168823242,
"learning_rate": 2.122793934874472e-05,
"loss": 4.086,
"step": 2815
},
{
"epoch": 0.6234799911563121,
"grad_norm": 3.3844807147979736,
"learning_rate": 2.1165796669152375e-05,
"loss": 4.1389,
"step": 2820
},
{
"epoch": 0.6245854521335397,
"grad_norm": 3.547020673751831,
"learning_rate": 2.110365398956003e-05,
"loss": 4.1154,
"step": 2825
},
{
"epoch": 0.6256909131107672,
"grad_norm": 3.083136558532715,
"learning_rate": 2.1041511309967687e-05,
"loss": 3.9761,
"step": 2830
},
{
"epoch": 0.6267963740879947,
"grad_norm": 3.7824316024780273,
"learning_rate": 2.0979368630375343e-05,
"loss": 4.04,
"step": 2835
},
{
"epoch": 0.6279018350652222,
"grad_norm": 3.584540367126465,
"learning_rate": 2.0917225950783e-05,
"loss": 4.0237,
"step": 2840
},
{
"epoch": 0.6290072960424496,
"grad_norm": 3.4071264266967773,
"learning_rate": 2.0855083271190652e-05,
"loss": 4.0866,
"step": 2845
},
{
"epoch": 0.6301127570196772,
"grad_norm": 3.149873733520508,
"learning_rate": 2.0792940591598312e-05,
"loss": 4.0776,
"step": 2850
},
{
"epoch": 0.6312182179969047,
"grad_norm": 3.3021628856658936,
"learning_rate": 2.0730797912005968e-05,
"loss": 4.1142,
"step": 2855
},
{
"epoch": 0.6323236789741322,
"grad_norm": 3.379462957382202,
"learning_rate": 2.066865523241362e-05,
"loss": 4.197,
"step": 2860
},
{
"epoch": 0.6334291399513597,
"grad_norm": 3.624547243118286,
"learning_rate": 2.0606512552821277e-05,
"loss": 4.1014,
"step": 2865
},
{
"epoch": 0.6345346009285873,
"grad_norm": 3.391458511352539,
"learning_rate": 2.0544369873228937e-05,
"loss": 4.1709,
"step": 2870
},
{
"epoch": 0.6356400619058147,
"grad_norm": 3.3703296184539795,
"learning_rate": 2.0482227193636593e-05,
"loss": 4.0528,
"step": 2875
},
{
"epoch": 0.6367455228830422,
"grad_norm": 3.6773877143859863,
"learning_rate": 2.0420084514044246e-05,
"loss": 4.0063,
"step": 2880
},
{
"epoch": 0.6378509838602697,
"grad_norm": 3.203677177429199,
"learning_rate": 2.03579418344519e-05,
"loss": 4.0733,
"step": 2885
},
{
"epoch": 0.6389564448374973,
"grad_norm": 3.36698055267334,
"learning_rate": 2.0295799154859558e-05,
"loss": 4.0456,
"step": 2890
},
{
"epoch": 0.6400619058147248,
"grad_norm": 3.412586212158203,
"learning_rate": 2.0233656475267214e-05,
"loss": 4.0807,
"step": 2895
},
{
"epoch": 0.6411673667919523,
"grad_norm": 3.175722599029541,
"learning_rate": 2.017151379567487e-05,
"loss": 4.1196,
"step": 2900
},
{
"epoch": 0.6422728277691797,
"grad_norm": 3.315753936767578,
"learning_rate": 2.0109371116082526e-05,
"loss": 4.0987,
"step": 2905
},
{
"epoch": 0.6433782887464072,
"grad_norm": 3.3233401775360107,
"learning_rate": 2.0047228436490183e-05,
"loss": 4.0974,
"step": 2910
},
{
"epoch": 0.6444837497236348,
"grad_norm": 3.648879051208496,
"learning_rate": 1.998508575689784e-05,
"loss": 4.0625,
"step": 2915
},
{
"epoch": 0.6455892107008623,
"grad_norm": 3.3237850666046143,
"learning_rate": 1.9922943077305495e-05,
"loss": 4.138,
"step": 2920
},
{
"epoch": 0.6466946716780898,
"grad_norm": 3.314603090286255,
"learning_rate": 1.986080039771315e-05,
"loss": 4.2303,
"step": 2925
},
{
"epoch": 0.6478001326553172,
"grad_norm": 3.116244316101074,
"learning_rate": 1.9798657718120804e-05,
"loss": 4.1222,
"step": 2930
},
{
"epoch": 0.6489055936325447,
"grad_norm": 3.232257127761841,
"learning_rate": 1.9736515038528463e-05,
"loss": 4.1056,
"step": 2935
},
{
"epoch": 0.6500110546097723,
"grad_norm": 3.373582124710083,
"learning_rate": 1.967437235893612e-05,
"loss": 4.1316,
"step": 2940
},
{
"epoch": 0.6511165155869998,
"grad_norm": 3.2493808269500732,
"learning_rate": 1.9612229679343776e-05,
"loss": 4.1764,
"step": 2945
},
{
"epoch": 0.6522219765642273,
"grad_norm": 2.9851105213165283,
"learning_rate": 1.955008699975143e-05,
"loss": 4.074,
"step": 2950
},
{
"epoch": 0.6533274375414548,
"grad_norm": 3.526233196258545,
"learning_rate": 1.9487944320159085e-05,
"loss": 4.0382,
"step": 2955
},
{
"epoch": 0.6544328985186822,
"grad_norm": 3.4045310020446777,
"learning_rate": 1.9425801640566744e-05,
"loss": 4.012,
"step": 2960
},
{
"epoch": 0.6555383594959098,
"grad_norm": 3.5040388107299805,
"learning_rate": 1.9363658960974397e-05,
"loss": 3.9922,
"step": 2965
},
{
"epoch": 0.6566438204731373,
"grad_norm": 3.4251108169555664,
"learning_rate": 1.9301516281382053e-05,
"loss": 4.0577,
"step": 2970
},
{
"epoch": 0.6577492814503648,
"grad_norm": 3.363278388977051,
"learning_rate": 1.923937360178971e-05,
"loss": 4.1127,
"step": 2975
},
{
"epoch": 0.6588547424275923,
"grad_norm": 3.2592687606811523,
"learning_rate": 1.917723092219737e-05,
"loss": 4.0898,
"step": 2980
},
{
"epoch": 0.6599602034048198,
"grad_norm": 3.295732021331787,
"learning_rate": 1.9115088242605022e-05,
"loss": 4.0772,
"step": 2985
},
{
"epoch": 0.6610656643820473,
"grad_norm": 3.302295684814453,
"learning_rate": 1.9052945563012678e-05,
"loss": 4.1688,
"step": 2990
},
{
"epoch": 0.6621711253592748,
"grad_norm": 3.415590524673462,
"learning_rate": 1.8990802883420334e-05,
"loss": 4.0569,
"step": 2995
},
{
"epoch": 0.6632765863365023,
"grad_norm": 3.4967286586761475,
"learning_rate": 1.892866020382799e-05,
"loss": 4.0951,
"step": 3000
},
{
"epoch": 0.6643820473137299,
"grad_norm": 3.3429524898529053,
"learning_rate": 1.8866517524235646e-05,
"loss": 4.0436,
"step": 3005
},
{
"epoch": 0.6654875082909574,
"grad_norm": 3.2878565788269043,
"learning_rate": 1.8804374844643303e-05,
"loss": 4.0224,
"step": 3010
},
{
"epoch": 0.6665929692681848,
"grad_norm": 3.4439568519592285,
"learning_rate": 1.874223216505096e-05,
"loss": 3.9529,
"step": 3015
},
{
"epoch": 0.6676984302454123,
"grad_norm": 3.4221768379211426,
"learning_rate": 1.868008948545861e-05,
"loss": 4.0604,
"step": 3020
},
{
"epoch": 0.6688038912226398,
"grad_norm": 3.2308311462402344,
"learning_rate": 1.861794680586627e-05,
"loss": 4.0717,
"step": 3025
},
{
"epoch": 0.6699093521998674,
"grad_norm": 3.7637572288513184,
"learning_rate": 1.8555804126273927e-05,
"loss": 4.3161,
"step": 3030
},
{
"epoch": 0.6710148131770949,
"grad_norm": 3.2774343490600586,
"learning_rate": 1.849366144668158e-05,
"loss": 4.1447,
"step": 3035
},
{
"epoch": 0.6721202741543224,
"grad_norm": 3.3979032039642334,
"learning_rate": 1.8431518767089236e-05,
"loss": 4.2971,
"step": 3040
},
{
"epoch": 0.6732257351315498,
"grad_norm": 3.259497880935669,
"learning_rate": 1.8369376087496896e-05,
"loss": 4.2798,
"step": 3045
},
{
"epoch": 0.6743311961087773,
"grad_norm": 3.346216917037964,
"learning_rate": 1.8307233407904552e-05,
"loss": 4.2459,
"step": 3050
},
{
"epoch": 0.6754366570860049,
"grad_norm": 3.195192813873291,
"learning_rate": 1.8245090728312205e-05,
"loss": 4.1107,
"step": 3055
},
{
"epoch": 0.6765421180632324,
"grad_norm": 3.3949368000030518,
"learning_rate": 1.818294804871986e-05,
"loss": 4.1965,
"step": 3060
},
{
"epoch": 0.6776475790404599,
"grad_norm": 3.1918063163757324,
"learning_rate": 1.8120805369127517e-05,
"loss": 4.2232,
"step": 3065
},
{
"epoch": 0.6787530400176873,
"grad_norm": 3.080773115158081,
"learning_rate": 1.8058662689535173e-05,
"loss": 4.1366,
"step": 3070
},
{
"epoch": 0.6798585009949148,
"grad_norm": 3.573559284210205,
"learning_rate": 1.799652000994283e-05,
"loss": 4.0492,
"step": 3075
},
{
"epoch": 0.6809639619721424,
"grad_norm": 3.105289936065674,
"learning_rate": 1.7934377330350486e-05,
"loss": 4.019,
"step": 3080
},
{
"epoch": 0.6820694229493699,
"grad_norm": 3.233858108520508,
"learning_rate": 1.7872234650758142e-05,
"loss": 4.2052,
"step": 3085
},
{
"epoch": 0.6831748839265974,
"grad_norm": 3.489800214767456,
"learning_rate": 1.7810091971165798e-05,
"loss": 4.244,
"step": 3090
},
{
"epoch": 0.6842803449038249,
"grad_norm": 3.919562339782715,
"learning_rate": 1.7747949291573454e-05,
"loss": 4.2778,
"step": 3095
},
{
"epoch": 0.6853858058810524,
"grad_norm": 3.4953386783599854,
"learning_rate": 1.768580661198111e-05,
"loss": 4.0999,
"step": 3100
},
{
"epoch": 0.6864912668582799,
"grad_norm": 3.0462942123413086,
"learning_rate": 1.7623663932388766e-05,
"loss": 4.1613,
"step": 3105
},
{
"epoch": 0.6875967278355074,
"grad_norm": 3.604140520095825,
"learning_rate": 1.756152125279642e-05,
"loss": 4.037,
"step": 3110
},
{
"epoch": 0.6887021888127349,
"grad_norm": 3.4862539768218994,
"learning_rate": 1.749937857320408e-05,
"loss": 4.136,
"step": 3115
},
{
"epoch": 0.6898076497899625,
"grad_norm": 3.3312830924987793,
"learning_rate": 1.7437235893611735e-05,
"loss": 4.1436,
"step": 3120
},
{
"epoch": 0.6909131107671899,
"grad_norm": 3.4092671871185303,
"learning_rate": 1.7375093214019388e-05,
"loss": 4.2998,
"step": 3125
},
{
"epoch": 0.6920185717444174,
"grad_norm": 3.138869285583496,
"learning_rate": 1.7312950534427044e-05,
"loss": 3.9221,
"step": 3130
},
{
"epoch": 0.6931240327216449,
"grad_norm": 3.570099115371704,
"learning_rate": 1.7250807854834704e-05,
"loss": 4.1127,
"step": 3135
},
{
"epoch": 0.6942294936988724,
"grad_norm": 3.4143168926239014,
"learning_rate": 1.7188665175242356e-05,
"loss": 4.1529,
"step": 3140
},
{
"epoch": 0.6953349546761,
"grad_norm": 3.299022674560547,
"learning_rate": 1.7126522495650012e-05,
"loss": 4.094,
"step": 3145
},
{
"epoch": 0.6964404156533275,
"grad_norm": 3.2752246856689453,
"learning_rate": 1.706437981605767e-05,
"loss": 4.0729,
"step": 3150
},
{
"epoch": 0.6975458766305549,
"grad_norm": 3.453444004058838,
"learning_rate": 1.7002237136465328e-05,
"loss": 4.1417,
"step": 3155
},
{
"epoch": 0.6986513376077824,
"grad_norm": 3.2120327949523926,
"learning_rate": 1.694009445687298e-05,
"loss": 4.249,
"step": 3160
},
{
"epoch": 0.6997567985850099,
"grad_norm": 3.4823880195617676,
"learning_rate": 1.6877951777280637e-05,
"loss": 4.197,
"step": 3165
},
{
"epoch": 0.7008622595622375,
"grad_norm": 3.438119888305664,
"learning_rate": 1.6815809097688293e-05,
"loss": 4.1066,
"step": 3170
},
{
"epoch": 0.701967720539465,
"grad_norm": 3.4621167182922363,
"learning_rate": 1.675366641809595e-05,
"loss": 4.2766,
"step": 3175
},
{
"epoch": 0.7030731815166925,
"grad_norm": 3.3527414798736572,
"learning_rate": 1.6691523738503606e-05,
"loss": 4.0086,
"step": 3180
},
{
"epoch": 0.7041786424939199,
"grad_norm": 3.4415431022644043,
"learning_rate": 1.6629381058911262e-05,
"loss": 4.0336,
"step": 3185
},
{
"epoch": 0.7052841034711474,
"grad_norm": 3.243367910385132,
"learning_rate": 1.6567238379318918e-05,
"loss": 4.1119,
"step": 3190
},
{
"epoch": 0.706389564448375,
"grad_norm": 3.515403985977173,
"learning_rate": 1.650509569972657e-05,
"loss": 4.0391,
"step": 3195
},
{
"epoch": 0.7074950254256025,
"grad_norm": 3.0629870891571045,
"learning_rate": 1.644295302013423e-05,
"loss": 4.2706,
"step": 3200
},
{
"epoch": 0.70860048640283,
"grad_norm": 3.412379026412964,
"learning_rate": 1.6380810340541887e-05,
"loss": 4.3555,
"step": 3205
},
{
"epoch": 0.7097059473800574,
"grad_norm": 3.250455141067505,
"learning_rate": 1.631866766094954e-05,
"loss": 4.1877,
"step": 3210
},
{
"epoch": 0.710811408357285,
"grad_norm": 3.0698251724243164,
"learning_rate": 1.6256524981357195e-05,
"loss": 4.0128,
"step": 3215
},
{
"epoch": 0.7119168693345125,
"grad_norm": 3.3195056915283203,
"learning_rate": 1.619438230176485e-05,
"loss": 4.2022,
"step": 3220
},
{
"epoch": 0.71302233031174,
"grad_norm": 3.3622958660125732,
"learning_rate": 1.613223962217251e-05,
"loss": 4.187,
"step": 3225
},
{
"epoch": 0.7141277912889675,
"grad_norm": 3.3840930461883545,
"learning_rate": 1.6070096942580164e-05,
"loss": 4.2928,
"step": 3230
},
{
"epoch": 0.715233252266195,
"grad_norm": 3.4330742359161377,
"learning_rate": 1.600795426298782e-05,
"loss": 4.0841,
"step": 3235
},
{
"epoch": 0.7163387132434225,
"grad_norm": 3.258180856704712,
"learning_rate": 1.5945811583395476e-05,
"loss": 4.1938,
"step": 3240
},
{
"epoch": 0.71744417422065,
"grad_norm": 3.183001756668091,
"learning_rate": 1.5883668903803133e-05,
"loss": 4.078,
"step": 3245
},
{
"epoch": 0.7185496351978775,
"grad_norm": 3.0564966201782227,
"learning_rate": 1.582152622421079e-05,
"loss": 4.089,
"step": 3250
},
{
"epoch": 0.719655096175105,
"grad_norm": 3.324143648147583,
"learning_rate": 1.5759383544618445e-05,
"loss": 4.2551,
"step": 3255
},
{
"epoch": 0.7207605571523326,
"grad_norm": 3.4312210083007812,
"learning_rate": 1.56972408650261e-05,
"loss": 4.1726,
"step": 3260
},
{
"epoch": 0.72186601812956,
"grad_norm": 3.168652057647705,
"learning_rate": 1.5635098185433757e-05,
"loss": 4.0236,
"step": 3265
},
{
"epoch": 0.7229714791067875,
"grad_norm": 3.116694211959839,
"learning_rate": 1.5572955505841413e-05,
"loss": 4.2022,
"step": 3270
},
{
"epoch": 0.724076940084015,
"grad_norm": 3.235372543334961,
"learning_rate": 1.551081282624907e-05,
"loss": 3.8518,
"step": 3275
},
{
"epoch": 0.7251824010612425,
"grad_norm": 3.3609163761138916,
"learning_rate": 1.5448670146656726e-05,
"loss": 3.968,
"step": 3280
},
{
"epoch": 0.7262878620384701,
"grad_norm": 3.4579970836639404,
"learning_rate": 1.538652746706438e-05,
"loss": 4.212,
"step": 3285
},
{
"epoch": 0.7273933230156976,
"grad_norm": 3.582771062850952,
"learning_rate": 1.5324384787472038e-05,
"loss": 4.2005,
"step": 3290
},
{
"epoch": 0.728498783992925,
"grad_norm": 3.151522636413574,
"learning_rate": 1.5262242107879694e-05,
"loss": 4.0769,
"step": 3295
},
{
"epoch": 0.7296042449701525,
"grad_norm": 3.194068193435669,
"learning_rate": 1.5200099428287349e-05,
"loss": 4.2329,
"step": 3300
},
{
"epoch": 0.73070970594738,
"grad_norm": 3.24617600440979,
"learning_rate": 1.5137956748695003e-05,
"loss": 4.0845,
"step": 3305
},
{
"epoch": 0.7318151669246076,
"grad_norm": 3.347874641418457,
"learning_rate": 1.5075814069102661e-05,
"loss": 4.2557,
"step": 3310
},
{
"epoch": 0.7329206279018351,
"grad_norm": 3.392652988433838,
"learning_rate": 1.5013671389510317e-05,
"loss": 4.1908,
"step": 3315
},
{
"epoch": 0.7340260888790626,
"grad_norm": 3.364522933959961,
"learning_rate": 1.4951528709917972e-05,
"loss": 4.1181,
"step": 3320
},
{
"epoch": 0.73513154985629,
"grad_norm": 3.217658042907715,
"learning_rate": 1.4889386030325628e-05,
"loss": 4.1129,
"step": 3325
},
{
"epoch": 0.7362370108335176,
"grad_norm": 3.741403102874756,
"learning_rate": 1.4827243350733282e-05,
"loss": 4.1941,
"step": 3330
},
{
"epoch": 0.7373424718107451,
"grad_norm": 3.6244940757751465,
"learning_rate": 1.4765100671140942e-05,
"loss": 4.133,
"step": 3335
},
{
"epoch": 0.7384479327879726,
"grad_norm": 3.455331563949585,
"learning_rate": 1.4702957991548596e-05,
"loss": 4.1993,
"step": 3340
},
{
"epoch": 0.7395533937652001,
"grad_norm": 3.3067119121551514,
"learning_rate": 1.4640815311956253e-05,
"loss": 4.1962,
"step": 3345
},
{
"epoch": 0.7406588547424275,
"grad_norm": 3.3184375762939453,
"learning_rate": 1.4578672632363907e-05,
"loss": 4.0779,
"step": 3350
},
{
"epoch": 0.7417643157196551,
"grad_norm": 3.617077350616455,
"learning_rate": 1.4516529952771565e-05,
"loss": 3.995,
"step": 3355
},
{
"epoch": 0.7428697766968826,
"grad_norm": 3.471519947052002,
"learning_rate": 1.4454387273179221e-05,
"loss": 4.0302,
"step": 3360
},
{
"epoch": 0.7439752376741101,
"grad_norm": 3.3337936401367188,
"learning_rate": 1.4392244593586876e-05,
"loss": 4.1125,
"step": 3365
},
{
"epoch": 0.7450806986513376,
"grad_norm": 3.5475218296051025,
"learning_rate": 1.4330101913994532e-05,
"loss": 4.1158,
"step": 3370
},
{
"epoch": 0.7461861596285652,
"grad_norm": 3.225281238555908,
"learning_rate": 1.4267959234402186e-05,
"loss": 4.1048,
"step": 3375
},
{
"epoch": 0.7472916206057926,
"grad_norm": 2.9788243770599365,
"learning_rate": 1.4205816554809844e-05,
"loss": 4.1919,
"step": 3380
},
{
"epoch": 0.7483970815830201,
"grad_norm": 2.9584922790527344,
"learning_rate": 1.41436738752175e-05,
"loss": 3.9252,
"step": 3385
},
{
"epoch": 0.7495025425602476,
"grad_norm": 3.4342474937438965,
"learning_rate": 1.4081531195625155e-05,
"loss": 4.2655,
"step": 3390
},
{
"epoch": 0.7506080035374751,
"grad_norm": 3.157142400741577,
"learning_rate": 1.4019388516032811e-05,
"loss": 4.0619,
"step": 3395
},
{
"epoch": 0.7517134645147027,
"grad_norm": 3.739959716796875,
"learning_rate": 1.3957245836440469e-05,
"loss": 4.1531,
"step": 3400
},
{
"epoch": 0.7528189254919301,
"grad_norm": 3.4141812324523926,
"learning_rate": 1.3895103156848125e-05,
"loss": 4.0972,
"step": 3405
},
{
"epoch": 0.7539243864691576,
"grad_norm": 3.140306234359741,
"learning_rate": 1.383296047725578e-05,
"loss": 4.1615,
"step": 3410
},
{
"epoch": 0.7550298474463851,
"grad_norm": 3.495731830596924,
"learning_rate": 1.3770817797663436e-05,
"loss": 4.322,
"step": 3415
},
{
"epoch": 0.7561353084236127,
"grad_norm": 3.2486352920532227,
"learning_rate": 1.3708675118071093e-05,
"loss": 4.1291,
"step": 3420
},
{
"epoch": 0.7572407694008402,
"grad_norm": 3.405538320541382,
"learning_rate": 1.3646532438478748e-05,
"loss": 4.0567,
"step": 3425
},
{
"epoch": 0.7583462303780677,
"grad_norm": 3.2491066455841064,
"learning_rate": 1.3584389758886404e-05,
"loss": 4.2248,
"step": 3430
},
{
"epoch": 0.7594516913552951,
"grad_norm": 3.415019989013672,
"learning_rate": 1.3522247079294059e-05,
"loss": 4.2429,
"step": 3435
},
{
"epoch": 0.7605571523325226,
"grad_norm": 3.0789833068847656,
"learning_rate": 1.3460104399701715e-05,
"loss": 4.0823,
"step": 3440
},
{
"epoch": 0.7616626133097502,
"grad_norm": 3.2663156986236572,
"learning_rate": 1.3397961720109373e-05,
"loss": 4.1684,
"step": 3445
},
{
"epoch": 0.7627680742869777,
"grad_norm": 3.3702750205993652,
"learning_rate": 1.3335819040517029e-05,
"loss": 4.1521,
"step": 3450
},
{
"epoch": 0.7638735352642052,
"grad_norm": 3.318516731262207,
"learning_rate": 1.3273676360924683e-05,
"loss": 4.0572,
"step": 3455
},
{
"epoch": 0.7649789962414327,
"grad_norm": 3.307229995727539,
"learning_rate": 1.321153368133234e-05,
"loss": 4.2087,
"step": 3460
},
{
"epoch": 0.7660844572186601,
"grad_norm": 3.141308546066284,
"learning_rate": 1.3149391001739997e-05,
"loss": 4.2045,
"step": 3465
},
{
"epoch": 0.7671899181958877,
"grad_norm": 3.488524913787842,
"learning_rate": 1.3087248322147652e-05,
"loss": 4.1981,
"step": 3470
},
{
"epoch": 0.7682953791731152,
"grad_norm": 3.333773612976074,
"learning_rate": 1.3025105642555308e-05,
"loss": 4.0546,
"step": 3475
},
{
"epoch": 0.7694008401503427,
"grad_norm": 3.093600273132324,
"learning_rate": 1.2962962962962962e-05,
"loss": 4.1297,
"step": 3480
},
{
"epoch": 0.7705063011275702,
"grad_norm": 3.681091547012329,
"learning_rate": 1.2900820283370619e-05,
"loss": 4.2743,
"step": 3485
},
{
"epoch": 0.7716117621047976,
"grad_norm": 3.2113373279571533,
"learning_rate": 1.2838677603778276e-05,
"loss": 4.1716,
"step": 3490
},
{
"epoch": 0.7727172230820252,
"grad_norm": 3.22847843170166,
"learning_rate": 1.2776534924185931e-05,
"loss": 4.1038,
"step": 3495
},
{
"epoch": 0.7738226840592527,
"grad_norm": 3.2960784435272217,
"learning_rate": 1.2714392244593587e-05,
"loss": 4.2599,
"step": 3500
},
{
"epoch": 0.7749281450364802,
"grad_norm": 3.509111166000366,
"learning_rate": 1.2652249565001242e-05,
"loss": 4.2696,
"step": 3505
},
{
"epoch": 0.7760336060137077,
"grad_norm": 3.4601404666900635,
"learning_rate": 1.2590106885408901e-05,
"loss": 4.0995,
"step": 3510
},
{
"epoch": 0.7771390669909353,
"grad_norm": 3.166656017303467,
"learning_rate": 1.2527964205816556e-05,
"loss": 4.3323,
"step": 3515
},
{
"epoch": 0.7782445279681627,
"grad_norm": 3.115483522415161,
"learning_rate": 1.2465821526224212e-05,
"loss": 4.2784,
"step": 3520
},
{
"epoch": 0.7793499889453902,
"grad_norm": 3.377978563308716,
"learning_rate": 1.2403678846631868e-05,
"loss": 4.1576,
"step": 3525
},
{
"epoch": 0.7804554499226177,
"grad_norm": 3.291743278503418,
"learning_rate": 1.2341536167039522e-05,
"loss": 4.3317,
"step": 3530
},
{
"epoch": 0.7815609108998453,
"grad_norm": 3.091101884841919,
"learning_rate": 1.227939348744718e-05,
"loss": 4.2178,
"step": 3535
},
{
"epoch": 0.7826663718770728,
"grad_norm": 3.3874189853668213,
"learning_rate": 1.2217250807854835e-05,
"loss": 4.0266,
"step": 3540
},
{
"epoch": 0.7837718328543002,
"grad_norm": 3.4406089782714844,
"learning_rate": 1.2155108128262491e-05,
"loss": 4.2243,
"step": 3545
},
{
"epoch": 0.7848772938315277,
"grad_norm": 3.2707858085632324,
"learning_rate": 1.2092965448670147e-05,
"loss": 4.1444,
"step": 3550
},
{
"epoch": 0.7859827548087552,
"grad_norm": 3.2035396099090576,
"learning_rate": 1.2030822769077803e-05,
"loss": 3.981,
"step": 3555
},
{
"epoch": 0.7870882157859828,
"grad_norm": 3.3851969242095947,
"learning_rate": 1.196868008948546e-05,
"loss": 4.219,
"step": 3560
},
{
"epoch": 0.7881936767632103,
"grad_norm": 3.0952658653259277,
"learning_rate": 1.1906537409893114e-05,
"loss": 4.2355,
"step": 3565
},
{
"epoch": 0.7892991377404378,
"grad_norm": 3.3667149543762207,
"learning_rate": 1.1844394730300772e-05,
"loss": 4.2494,
"step": 3570
},
{
"epoch": 0.7904045987176652,
"grad_norm": 3.6815719604492188,
"learning_rate": 1.1782252050708426e-05,
"loss": 4.15,
"step": 3575
},
{
"epoch": 0.7915100596948927,
"grad_norm": 3.330397367477417,
"learning_rate": 1.1720109371116084e-05,
"loss": 4.0933,
"step": 3580
},
{
"epoch": 0.7926155206721203,
"grad_norm": 3.213534355163574,
"learning_rate": 1.1657966691523739e-05,
"loss": 4.0645,
"step": 3585
},
{
"epoch": 0.7937209816493478,
"grad_norm": 3.413196086883545,
"learning_rate": 1.1595824011931397e-05,
"loss": 4.2731,
"step": 3590
},
{
"epoch": 0.7948264426265753,
"grad_norm": 2.9504334926605225,
"learning_rate": 1.1533681332339051e-05,
"loss": 4.0869,
"step": 3595
},
{
"epoch": 0.7959319036038028,
"grad_norm": 3.48688006401062,
"learning_rate": 1.1471538652746707e-05,
"loss": 4.1732,
"step": 3600
},
{
"epoch": 0.7970373645810302,
"grad_norm": 3.202857494354248,
"learning_rate": 1.1409395973154363e-05,
"loss": 4.2084,
"step": 3605
},
{
"epoch": 0.7981428255582578,
"grad_norm": 3.460794687271118,
"learning_rate": 1.1347253293562018e-05,
"loss": 4.2956,
"step": 3610
},
{
"epoch": 0.7992482865354853,
"grad_norm": 3.3727447986602783,
"learning_rate": 1.1285110613969676e-05,
"loss": 4.1854,
"step": 3615
},
{
"epoch": 0.8003537475127128,
"grad_norm": 3.3435420989990234,
"learning_rate": 1.122296793437733e-05,
"loss": 4.3749,
"step": 3620
},
{
"epoch": 0.8014592084899403,
"grad_norm": 3.1651086807250977,
"learning_rate": 1.1160825254784988e-05,
"loss": 4.132,
"step": 3625
},
{
"epoch": 0.8025646694671678,
"grad_norm": 3.482461929321289,
"learning_rate": 1.1098682575192643e-05,
"loss": 4.3037,
"step": 3630
},
{
"epoch": 0.8036701304443953,
"grad_norm": 3.5828919410705566,
"learning_rate": 1.1036539895600299e-05,
"loss": 4.1466,
"step": 3635
},
{
"epoch": 0.8047755914216228,
"grad_norm": 3.344888687133789,
"learning_rate": 1.0974397216007955e-05,
"loss": 4.1947,
"step": 3640
},
{
"epoch": 0.8058810523988503,
"grad_norm": 3.2426233291625977,
"learning_rate": 1.091225453641561e-05,
"loss": 4.0683,
"step": 3645
},
{
"epoch": 0.8069865133760779,
"grad_norm": 3.2281033992767334,
"learning_rate": 1.0850111856823267e-05,
"loss": 4.107,
"step": 3650
},
{
"epoch": 0.8080919743533054,
"grad_norm": 3.1622958183288574,
"learning_rate": 1.0787969177230922e-05,
"loss": 4.2085,
"step": 3655
},
{
"epoch": 0.8091974353305328,
"grad_norm": 3.2309300899505615,
"learning_rate": 1.072582649763858e-05,
"loss": 4.3067,
"step": 3660
},
{
"epoch": 0.8103028963077603,
"grad_norm": 3.1198458671569824,
"learning_rate": 1.0663683818046234e-05,
"loss": 4.0849,
"step": 3665
},
{
"epoch": 0.8114083572849878,
"grad_norm": 3.5155203342437744,
"learning_rate": 1.060154113845389e-05,
"loss": 4.115,
"step": 3670
},
{
"epoch": 0.8125138182622154,
"grad_norm": 3.102889060974121,
"learning_rate": 1.0539398458861546e-05,
"loss": 4.1175,
"step": 3675
},
{
"epoch": 0.8136192792394429,
"grad_norm": 3.3019254207611084,
"learning_rate": 1.0477255779269203e-05,
"loss": 4.2803,
"step": 3680
},
{
"epoch": 0.8147247402166704,
"grad_norm": 3.5849218368530273,
"learning_rate": 1.0415113099676859e-05,
"loss": 4.2005,
"step": 3685
},
{
"epoch": 0.8158302011938978,
"grad_norm": 3.9152631759643555,
"learning_rate": 1.0352970420084515e-05,
"loss": 4.3163,
"step": 3690
},
{
"epoch": 0.8169356621711253,
"grad_norm": 3.0798897743225098,
"learning_rate": 1.0290827740492171e-05,
"loss": 4.1536,
"step": 3695
},
{
"epoch": 0.8180411231483529,
"grad_norm": 3.491821765899658,
"learning_rate": 1.0228685060899826e-05,
"loss": 4.3108,
"step": 3700
},
{
"epoch": 0.8191465841255804,
"grad_norm": 3.093750238418579,
"learning_rate": 1.0166542381307482e-05,
"loss": 4.0467,
"step": 3705
},
{
"epoch": 0.8202520451028079,
"grad_norm": 3.4779791831970215,
"learning_rate": 1.0104399701715138e-05,
"loss": 4.2484,
"step": 3710
},
{
"epoch": 0.8213575060800353,
"grad_norm": 3.1915061473846436,
"learning_rate": 1.0042257022122794e-05,
"loss": 4.3235,
"step": 3715
},
{
"epoch": 0.8224629670572629,
"grad_norm": 3.1019785404205322,
"learning_rate": 9.98011434253045e-06,
"loss": 4.1893,
"step": 3720
},
{
"epoch": 0.8235684280344904,
"grad_norm": 3.3659591674804688,
"learning_rate": 9.917971662938106e-06,
"loss": 4.1759,
"step": 3725
},
{
"epoch": 0.8246738890117179,
"grad_norm": 3.254364013671875,
"learning_rate": 9.855828983345763e-06,
"loss": 3.9382,
"step": 3730
},
{
"epoch": 0.8257793499889454,
"grad_norm": 3.1901118755340576,
"learning_rate": 9.793686303753419e-06,
"loss": 4.1601,
"step": 3735
},
{
"epoch": 0.826884810966173,
"grad_norm": 3.040501832962036,
"learning_rate": 9.731543624161075e-06,
"loss": 4.0918,
"step": 3740
},
{
"epoch": 0.8279902719434004,
"grad_norm": 3.3288450241088867,
"learning_rate": 9.669400944568731e-06,
"loss": 4.1557,
"step": 3745
},
{
"epoch": 0.8290957329206279,
"grad_norm": 3.145031213760376,
"learning_rate": 9.607258264976386e-06,
"loss": 4.2639,
"step": 3750
},
{
"epoch": 0.8302011938978554,
"grad_norm": 2.950425148010254,
"learning_rate": 9.545115585384042e-06,
"loss": 4.0413,
"step": 3755
},
{
"epoch": 0.8313066548750829,
"grad_norm": 3.336622714996338,
"learning_rate": 9.482972905791698e-06,
"loss": 4.2885,
"step": 3760
},
{
"epoch": 0.8324121158523105,
"grad_norm": 3.403669834136963,
"learning_rate": 9.420830226199354e-06,
"loss": 4.224,
"step": 3765
},
{
"epoch": 0.8335175768295379,
"grad_norm": 3.3747620582580566,
"learning_rate": 9.35868754660701e-06,
"loss": 4.1419,
"step": 3770
},
{
"epoch": 0.8346230378067654,
"grad_norm": 3.3672516345977783,
"learning_rate": 9.296544867014666e-06,
"loss": 4.2408,
"step": 3775
},
{
"epoch": 0.8357284987839929,
"grad_norm": 3.1235463619232178,
"learning_rate": 9.234402187422323e-06,
"loss": 4.2304,
"step": 3780
},
{
"epoch": 0.8368339597612204,
"grad_norm": 3.0135231018066406,
"learning_rate": 9.172259507829977e-06,
"loss": 4.3504,
"step": 3785
},
{
"epoch": 0.837939420738448,
"grad_norm": 3.669422149658203,
"learning_rate": 9.110116828237635e-06,
"loss": 4.2286,
"step": 3790
},
{
"epoch": 0.8390448817156755,
"grad_norm": 3.5061023235321045,
"learning_rate": 9.04797414864529e-06,
"loss": 4.043,
"step": 3795
},
{
"epoch": 0.8401503426929029,
"grad_norm": 3.188978672027588,
"learning_rate": 8.985831469052947e-06,
"loss": 4.2602,
"step": 3800
},
{
"epoch": 0.8412558036701304,
"grad_norm": 3.4181642532348633,
"learning_rate": 8.923688789460602e-06,
"loss": 4.1946,
"step": 3805
},
{
"epoch": 0.8423612646473579,
"grad_norm": 3.3051459789276123,
"learning_rate": 8.861546109868258e-06,
"loss": 4.1812,
"step": 3810
},
{
"epoch": 0.8434667256245855,
"grad_norm": 3.0405430793762207,
"learning_rate": 8.799403430275914e-06,
"loss": 4.2455,
"step": 3815
},
{
"epoch": 0.844572186601813,
"grad_norm": 3.1977388858795166,
"learning_rate": 8.737260750683569e-06,
"loss": 4.1665,
"step": 3820
},
{
"epoch": 0.8456776475790405,
"grad_norm": 3.153214693069458,
"learning_rate": 8.675118071091226e-06,
"loss": 4.1227,
"step": 3825
},
{
"epoch": 0.8467831085562679,
"grad_norm": 3.160295009613037,
"learning_rate": 8.612975391498881e-06,
"loss": 4.1928,
"step": 3830
},
{
"epoch": 0.8478885695334955,
"grad_norm": 3.522057294845581,
"learning_rate": 8.550832711906539e-06,
"loss": 4.3234,
"step": 3835
},
{
"epoch": 0.848994030510723,
"grad_norm": 3.3850722312927246,
"learning_rate": 8.488690032314193e-06,
"loss": 4.2035,
"step": 3840
},
{
"epoch": 0.8500994914879505,
"grad_norm": 3.237739324569702,
"learning_rate": 8.42654735272185e-06,
"loss": 4.0377,
"step": 3845
},
{
"epoch": 0.851204952465178,
"grad_norm": 3.3790619373321533,
"learning_rate": 8.364404673129506e-06,
"loss": 4.1112,
"step": 3850
},
{
"epoch": 0.8523104134424054,
"grad_norm": 3.395925760269165,
"learning_rate": 8.302261993537162e-06,
"loss": 4.3152,
"step": 3855
},
{
"epoch": 0.853415874419633,
"grad_norm": 2.8968868255615234,
"learning_rate": 8.240119313944818e-06,
"loss": 4.1642,
"step": 3860
},
{
"epoch": 0.8545213353968605,
"grad_norm": 3.6181344985961914,
"learning_rate": 8.177976634352472e-06,
"loss": 4.27,
"step": 3865
},
{
"epoch": 0.855626796374088,
"grad_norm": 3.3780412673950195,
"learning_rate": 8.11583395476013e-06,
"loss": 4.2319,
"step": 3870
},
{
"epoch": 0.8567322573513155,
"grad_norm": 3.0761659145355225,
"learning_rate": 8.053691275167785e-06,
"loss": 4.2244,
"step": 3875
},
{
"epoch": 0.857837718328543,
"grad_norm": 3.188369035720825,
"learning_rate": 7.991548595575441e-06,
"loss": 4.1855,
"step": 3880
},
{
"epoch": 0.8589431793057705,
"grad_norm": 3.280965805053711,
"learning_rate": 7.929405915983097e-06,
"loss": 4.2297,
"step": 3885
},
{
"epoch": 0.860048640282998,
"grad_norm": 3.428769111633301,
"learning_rate": 7.867263236390753e-06,
"loss": 4.2635,
"step": 3890
},
{
"epoch": 0.8611541012602255,
"grad_norm": 3.372145414352417,
"learning_rate": 7.80512055679841e-06,
"loss": 4.1799,
"step": 3895
},
{
"epoch": 0.862259562237453,
"grad_norm": 3.669572114944458,
"learning_rate": 7.742977877206066e-06,
"loss": 4.1279,
"step": 3900
},
{
"epoch": 0.8633650232146806,
"grad_norm": 3.3069515228271484,
"learning_rate": 7.680835197613722e-06,
"loss": 4.2423,
"step": 3905
},
{
"epoch": 0.864470484191908,
"grad_norm": 3.4965929985046387,
"learning_rate": 7.618692518021378e-06,
"loss": 4.2445,
"step": 3910
},
{
"epoch": 0.8655759451691355,
"grad_norm": 3.3007524013519287,
"learning_rate": 7.556549838429033e-06,
"loss": 4.3169,
"step": 3915
},
{
"epoch": 0.866681406146363,
"grad_norm": 3.3031368255615234,
"learning_rate": 7.494407158836689e-06,
"loss": 4.2489,
"step": 3920
},
{
"epoch": 0.8677868671235905,
"grad_norm": 3.3182923793792725,
"learning_rate": 7.432264479244346e-06,
"loss": 4.1043,
"step": 3925
},
{
"epoch": 0.8688923281008181,
"grad_norm": 3.1912918090820312,
"learning_rate": 7.370121799652001e-06,
"loss": 4.225,
"step": 3930
},
{
"epoch": 0.8699977890780456,
"grad_norm": 3.4221689701080322,
"learning_rate": 7.307979120059657e-06,
"loss": 4.2911,
"step": 3935
},
{
"epoch": 0.871103250055273,
"grad_norm": 3.3450770378112793,
"learning_rate": 7.2458364404673125e-06,
"loss": 4.4661,
"step": 3940
},
{
"epoch": 0.8722087110325005,
"grad_norm": 3.3857436180114746,
"learning_rate": 7.1836937608749695e-06,
"loss": 4.1062,
"step": 3945
},
{
"epoch": 0.873314172009728,
"grad_norm": 3.2162883281707764,
"learning_rate": 7.121551081282625e-06,
"loss": 4.2926,
"step": 3950
},
{
"epoch": 0.8744196329869556,
"grad_norm": 2.971797227859497,
"learning_rate": 7.059408401690282e-06,
"loss": 4.0731,
"step": 3955
},
{
"epoch": 0.8755250939641831,
"grad_norm": 3.228489875793457,
"learning_rate": 6.997265722097937e-06,
"loss": 4.1616,
"step": 3960
},
{
"epoch": 0.8766305549414106,
"grad_norm": 3.2910053730010986,
"learning_rate": 6.935123042505594e-06,
"loss": 4.2075,
"step": 3965
},
{
"epoch": 0.877736015918638,
"grad_norm": 3.1011228561401367,
"learning_rate": 6.8729803629132495e-06,
"loss": 4.1851,
"step": 3970
},
{
"epoch": 0.8788414768958656,
"grad_norm": 3.6701035499572754,
"learning_rate": 6.810837683320905e-06,
"loss": 4.1968,
"step": 3975
},
{
"epoch": 0.8799469378730931,
"grad_norm": 3.310450315475464,
"learning_rate": 6.748695003728561e-06,
"loss": 4.3885,
"step": 3980
},
{
"epoch": 0.8810523988503206,
"grad_norm": 3.3232550621032715,
"learning_rate": 6.686552324136216e-06,
"loss": 4.202,
"step": 3985
},
{
"epoch": 0.8821578598275481,
"grad_norm": 3.33705472946167,
"learning_rate": 6.624409644543873e-06,
"loss": 4.2345,
"step": 3990
},
{
"epoch": 0.8832633208047755,
"grad_norm": 3.648831605911255,
"learning_rate": 6.562266964951529e-06,
"loss": 4.2464,
"step": 3995
},
{
"epoch": 0.8843687817820031,
"grad_norm": 3.2218527793884277,
"learning_rate": 6.500124285359186e-06,
"loss": 4.0956,
"step": 4000
},
{
"epoch": 0.8854742427592306,
"grad_norm": 3.0550131797790527,
"learning_rate": 6.437981605766841e-06,
"loss": 4.1712,
"step": 4005
},
{
"epoch": 0.8865797037364581,
"grad_norm": 3.1984024047851562,
"learning_rate": 6.375838926174497e-06,
"loss": 4.2718,
"step": 4010
},
{
"epoch": 0.8876851647136856,
"grad_norm": 3.2509777545928955,
"learning_rate": 6.3136962465821526e-06,
"loss": 4.0173,
"step": 4015
},
{
"epoch": 0.8887906256909132,
"grad_norm": 3.146519899368286,
"learning_rate": 6.2515535669898096e-06,
"loss": 4.4115,
"step": 4020
},
{
"epoch": 0.8898960866681406,
"grad_norm": 3.422335624694824,
"learning_rate": 6.189410887397465e-06,
"loss": 4.3307,
"step": 4025
},
{
"epoch": 0.8910015476453681,
"grad_norm": 3.50016188621521,
"learning_rate": 6.127268207805121e-06,
"loss": 4.0675,
"step": 4030
},
{
"epoch": 0.8921070086225956,
"grad_norm": 3.059391975402832,
"learning_rate": 6.065125528212777e-06,
"loss": 4.2215,
"step": 4035
},
{
"epoch": 0.8932124695998231,
"grad_norm": 3.585162401199341,
"learning_rate": 6.002982848620433e-06,
"loss": 4.1206,
"step": 4040
},
{
"epoch": 0.8943179305770507,
"grad_norm": 3.1658449172973633,
"learning_rate": 5.940840169028089e-06,
"loss": 4.1826,
"step": 4045
},
{
"epoch": 0.8954233915542781,
"grad_norm": 3.30590558052063,
"learning_rate": 5.878697489435745e-06,
"loss": 4.07,
"step": 4050
},
{
"epoch": 0.8965288525315056,
"grad_norm": 3.5523128509521484,
"learning_rate": 5.8165548098434e-06,
"loss": 4.2302,
"step": 4055
},
{
"epoch": 0.8976343135087331,
"grad_norm": 3.2362444400787354,
"learning_rate": 5.754412130251056e-06,
"loss": 4.1555,
"step": 4060
},
{
"epoch": 0.8987397744859607,
"grad_norm": 2.9280905723571777,
"learning_rate": 5.692269450658713e-06,
"loss": 4.1708,
"step": 4065
},
{
"epoch": 0.8998452354631882,
"grad_norm": 3.277392625808716,
"learning_rate": 5.630126771066369e-06,
"loss": 4.1606,
"step": 4070
},
{
"epoch": 0.9009506964404157,
"grad_norm": 2.9546451568603516,
"learning_rate": 5.567984091474025e-06,
"loss": 4.1486,
"step": 4075
},
{
"epoch": 0.9020561574176431,
"grad_norm": 3.33906888961792,
"learning_rate": 5.50584141188168e-06,
"loss": 4.2423,
"step": 4080
},
{
"epoch": 0.9031616183948706,
"grad_norm": 3.414642572402954,
"learning_rate": 5.4436987322893364e-06,
"loss": 4.1806,
"step": 4085
},
{
"epoch": 0.9042670793720982,
"grad_norm": 3.1724166870117188,
"learning_rate": 5.381556052696993e-06,
"loss": 4.3395,
"step": 4090
},
{
"epoch": 0.9053725403493257,
"grad_norm": 3.3159971237182617,
"learning_rate": 5.319413373104649e-06,
"loss": 4.1692,
"step": 4095
},
{
"epoch": 0.9064780013265532,
"grad_norm": 3.149585008621216,
"learning_rate": 5.257270693512305e-06,
"loss": 4.1873,
"step": 4100
},
{
"epoch": 0.9075834623037807,
"grad_norm": 3.5617358684539795,
"learning_rate": 5.195128013919961e-06,
"loss": 4.2171,
"step": 4105
},
{
"epoch": 0.9086889232810081,
"grad_norm": 3.268549680709839,
"learning_rate": 5.1329853343276164e-06,
"loss": 4.1768,
"step": 4110
},
{
"epoch": 0.9097943842582357,
"grad_norm": 3.424433708190918,
"learning_rate": 5.070842654735273e-06,
"loss": 4.4327,
"step": 4115
},
{
"epoch": 0.9108998452354632,
"grad_norm": 3.495929479598999,
"learning_rate": 5.008699975142928e-06,
"loss": 4.1886,
"step": 4120
},
{
"epoch": 0.9120053062126907,
"grad_norm": 3.045023202896118,
"learning_rate": 4.946557295550584e-06,
"loss": 4.3829,
"step": 4125
},
{
"epoch": 0.9131107671899182,
"grad_norm": 3.1356985569000244,
"learning_rate": 4.88441461595824e-06,
"loss": 4.3304,
"step": 4130
},
{
"epoch": 0.9142162281671457,
"grad_norm": 3.389559507369995,
"learning_rate": 4.8222719363658965e-06,
"loss": 4.1715,
"step": 4135
},
{
"epoch": 0.9153216891443732,
"grad_norm": 3.1588001251220703,
"learning_rate": 4.760129256773553e-06,
"loss": 4.2491,
"step": 4140
},
{
"epoch": 0.9164271501216007,
"grad_norm": 3.5233826637268066,
"learning_rate": 4.697986577181209e-06,
"loss": 4.409,
"step": 4145
},
{
"epoch": 0.9175326110988282,
"grad_norm": 3.0876009464263916,
"learning_rate": 4.635843897588864e-06,
"loss": 4.1037,
"step": 4150
},
{
"epoch": 0.9186380720760557,
"grad_norm": 3.64609956741333,
"learning_rate": 4.57370121799652e-06,
"loss": 4.2202,
"step": 4155
},
{
"epoch": 0.9197435330532833,
"grad_norm": 3.119335174560547,
"learning_rate": 4.511558538404176e-06,
"loss": 4.2293,
"step": 4160
},
{
"epoch": 0.9208489940305107,
"grad_norm": 3.2007765769958496,
"learning_rate": 4.449415858811832e-06,
"loss": 4.2337,
"step": 4165
},
{
"epoch": 0.9219544550077382,
"grad_norm": 2.860046625137329,
"learning_rate": 4.387273179219488e-06,
"loss": 4.2855,
"step": 4170
},
{
"epoch": 0.9230599159849657,
"grad_norm": 3.472074270248413,
"learning_rate": 4.325130499627144e-06,
"loss": 4.2792,
"step": 4175
},
{
"epoch": 0.9241653769621933,
"grad_norm": 3.21456241607666,
"learning_rate": 4.2629878200348e-06,
"loss": 4.2083,
"step": 4180
},
{
"epoch": 0.9252708379394208,
"grad_norm": 3.0883960723876953,
"learning_rate": 4.2008451404424565e-06,
"loss": 4.2125,
"step": 4185
},
{
"epoch": 0.9263762989166482,
"grad_norm": 3.1821343898773193,
"learning_rate": 4.138702460850112e-06,
"loss": 4.3135,
"step": 4190
},
{
"epoch": 0.9274817598938757,
"grad_norm": 3.2891180515289307,
"learning_rate": 4.076559781257768e-06,
"loss": 4.2337,
"step": 4195
},
{
"epoch": 0.9285872208711032,
"grad_norm": 3.036611557006836,
"learning_rate": 4.014417101665424e-06,
"loss": 4.1799,
"step": 4200
},
{
"epoch": 0.9296926818483308,
"grad_norm": 3.262669086456299,
"learning_rate": 3.95227442207308e-06,
"loss": 4.3257,
"step": 4205
},
{
"epoch": 0.9307981428255583,
"grad_norm": 3.32913875579834,
"learning_rate": 3.8901317424807365e-06,
"loss": 4.2918,
"step": 4210
},
{
"epoch": 0.9319036038027858,
"grad_norm": 3.221358299255371,
"learning_rate": 3.827989062888392e-06,
"loss": 4.2922,
"step": 4215
},
{
"epoch": 0.9330090647800132,
"grad_norm": 3.131178617477417,
"learning_rate": 3.7658463832960476e-06,
"loss": 4.1484,
"step": 4220
},
{
"epoch": 0.9341145257572407,
"grad_norm": 3.0813159942626953,
"learning_rate": 3.7037037037037037e-06,
"loss": 4.2841,
"step": 4225
},
{
"epoch": 0.9352199867344683,
"grad_norm": 2.8390700817108154,
"learning_rate": 3.64156102411136e-06,
"loss": 4.0598,
"step": 4230
},
{
"epoch": 0.9363254477116958,
"grad_norm": 3.10927677154541,
"learning_rate": 3.5794183445190157e-06,
"loss": 4.1328,
"step": 4235
},
{
"epoch": 0.9374309086889233,
"grad_norm": 3.2241241931915283,
"learning_rate": 3.517275664926672e-06,
"loss": 4.2188,
"step": 4240
},
{
"epoch": 0.9385363696661508,
"grad_norm": 2.9095420837402344,
"learning_rate": 3.455132985334328e-06,
"loss": 4.068,
"step": 4245
},
{
"epoch": 0.9396418306433783,
"grad_norm": 3.1288955211639404,
"learning_rate": 3.3929903057419838e-06,
"loss": 4.2663,
"step": 4250
},
{
"epoch": 0.9407472916206058,
"grad_norm": 3.026554584503174,
"learning_rate": 3.33084762614964e-06,
"loss": 4.1512,
"step": 4255
},
{
"epoch": 0.9418527525978333,
"grad_norm": 3.222672462463379,
"learning_rate": 3.268704946557296e-06,
"loss": 4.235,
"step": 4260
},
{
"epoch": 0.9429582135750608,
"grad_norm": 3.381204605102539,
"learning_rate": 3.206562266964952e-06,
"loss": 4.1584,
"step": 4265
},
{
"epoch": 0.9440636745522883,
"grad_norm": 3.3569135665893555,
"learning_rate": 3.144419587372607e-06,
"loss": 4.2849,
"step": 4270
},
{
"epoch": 0.9451691355295158,
"grad_norm": 3.2201907634735107,
"learning_rate": 3.0822769077802638e-06,
"loss": 4.1318,
"step": 4275
},
{
"epoch": 0.9462745965067433,
"grad_norm": 3.078237771987915,
"learning_rate": 3.02013422818792e-06,
"loss": 4.2257,
"step": 4280
},
{
"epoch": 0.9473800574839708,
"grad_norm": 2.9291415214538574,
"learning_rate": 2.9579915485955753e-06,
"loss": 4.397,
"step": 4285
},
{
"epoch": 0.9484855184611983,
"grad_norm": 3.3114891052246094,
"learning_rate": 2.8958488690032314e-06,
"loss": 4.1599,
"step": 4290
},
{
"epoch": 0.9495909794384259,
"grad_norm": 3.3049850463867188,
"learning_rate": 2.8337061894108876e-06,
"loss": 4.2123,
"step": 4295
},
{
"epoch": 0.9506964404156534,
"grad_norm": 2.979609251022339,
"learning_rate": 2.7715635098185434e-06,
"loss": 4.1817,
"step": 4300
},
{
"epoch": 0.9518019013928808,
"grad_norm": 3.1335394382476807,
"learning_rate": 2.7094208302261995e-06,
"loss": 4.2932,
"step": 4305
},
{
"epoch": 0.9529073623701083,
"grad_norm": 3.3001952171325684,
"learning_rate": 2.6472781506338553e-06,
"loss": 4.4201,
"step": 4310
},
{
"epoch": 0.9540128233473358,
"grad_norm": 3.1160495281219482,
"learning_rate": 2.5851354710415115e-06,
"loss": 4.1786,
"step": 4315
},
{
"epoch": 0.9551182843245634,
"grad_norm": 2.8716208934783936,
"learning_rate": 2.522992791449167e-06,
"loss": 3.9942,
"step": 4320
},
{
"epoch": 0.9562237453017909,
"grad_norm": 3.0611040592193604,
"learning_rate": 2.4608501118568234e-06,
"loss": 4.4118,
"step": 4325
},
{
"epoch": 0.9573292062790183,
"grad_norm": 2.9500648975372314,
"learning_rate": 2.3987074322644795e-06,
"loss": 4.305,
"step": 4330
},
{
"epoch": 0.9584346672562458,
"grad_norm": 3.5862972736358643,
"learning_rate": 2.3365647526721353e-06,
"loss": 4.3046,
"step": 4335
},
{
"epoch": 0.9595401282334733,
"grad_norm": 3.304366111755371,
"learning_rate": 2.274422073079791e-06,
"loss": 4.3483,
"step": 4340
},
{
"epoch": 0.9606455892107009,
"grad_norm": 3.4040110111236572,
"learning_rate": 2.2122793934874472e-06,
"loss": 4.2975,
"step": 4345
},
{
"epoch": 0.9617510501879284,
"grad_norm": 3.197815179824829,
"learning_rate": 2.1501367138951034e-06,
"loss": 4.3031,
"step": 4350
},
{
"epoch": 0.9628565111651559,
"grad_norm": 3.365293502807617,
"learning_rate": 2.087994034302759e-06,
"loss": 4.2018,
"step": 4355
},
{
"epoch": 0.9639619721423833,
"grad_norm": 3.179311990737915,
"learning_rate": 2.0258513547104153e-06,
"loss": 4.3385,
"step": 4360
},
{
"epoch": 0.9650674331196109,
"grad_norm": 3.1740834712982178,
"learning_rate": 1.963708675118071e-06,
"loss": 4.4034,
"step": 4365
},
{
"epoch": 0.9661728940968384,
"grad_norm": 3.0727176666259766,
"learning_rate": 1.901565995525727e-06,
"loss": 4.2515,
"step": 4370
},
{
"epoch": 0.9672783550740659,
"grad_norm": 2.9758899211883545,
"learning_rate": 1.8394233159333832e-06,
"loss": 4.1974,
"step": 4375
},
{
"epoch": 0.9683838160512934,
"grad_norm": 3.014615774154663,
"learning_rate": 1.7772806363410391e-06,
"loss": 4.3097,
"step": 4380
},
{
"epoch": 0.969489277028521,
"grad_norm": 3.5511038303375244,
"learning_rate": 1.7151379567486951e-06,
"loss": 4.2784,
"step": 4385
},
{
"epoch": 0.9705947380057484,
"grad_norm": 2.977102518081665,
"learning_rate": 1.6529952771563513e-06,
"loss": 4.2234,
"step": 4390
},
{
"epoch": 0.9717001989829759,
"grad_norm": 2.964914083480835,
"learning_rate": 1.5908525975640068e-06,
"loss": 4.1375,
"step": 4395
},
{
"epoch": 0.9728056599602034,
"grad_norm": 2.916311025619507,
"learning_rate": 1.528709917971663e-06,
"loss": 4.1116,
"step": 4400
},
{
"epoch": 0.9739111209374309,
"grad_norm": 3.3200995922088623,
"learning_rate": 1.466567238379319e-06,
"loss": 4.3596,
"step": 4405
},
{
"epoch": 0.9750165819146585,
"grad_norm": 3.0481033325195312,
"learning_rate": 1.4044245587869751e-06,
"loss": 4.303,
"step": 4410
},
{
"epoch": 0.9761220428918859,
"grad_norm": 3.04089617729187,
"learning_rate": 1.3422818791946309e-06,
"loss": 4.3629,
"step": 4415
},
{
"epoch": 0.9772275038691134,
"grad_norm": 3.03387713432312,
"learning_rate": 1.280139199602287e-06,
"loss": 4.2679,
"step": 4420
},
{
"epoch": 0.9783329648463409,
"grad_norm": 3.1632862091064453,
"learning_rate": 1.2179965200099428e-06,
"loss": 4.153,
"step": 4425
},
{
"epoch": 0.9794384258235684,
"grad_norm": 3.382652759552002,
"learning_rate": 1.1558538404175988e-06,
"loss": 4.1147,
"step": 4430
},
{
"epoch": 0.980543886800796,
"grad_norm": 3.4399046897888184,
"learning_rate": 1.093711160825255e-06,
"loss": 4.2737,
"step": 4435
},
{
"epoch": 0.9816493477780235,
"grad_norm": 3.3583288192749023,
"learning_rate": 1.0315684812329107e-06,
"loss": 4.2274,
"step": 4440
},
{
"epoch": 0.9827548087552509,
"grad_norm": 3.291776657104492,
"learning_rate": 9.694258016405668e-07,
"loss": 4.1284,
"step": 4445
},
{
"epoch": 0.9838602697324784,
"grad_norm": 3.148688554763794,
"learning_rate": 9.072831220482228e-07,
"loss": 4.3734,
"step": 4450
},
{
"epoch": 0.9849657307097059,
"grad_norm": 2.98494553565979,
"learning_rate": 8.451404424558787e-07,
"loss": 4.2998,
"step": 4455
},
{
"epoch": 0.9860711916869335,
"grad_norm": 3.550734043121338,
"learning_rate": 7.829977628635347e-07,
"loss": 4.131,
"step": 4460
},
{
"epoch": 0.987176652664161,
"grad_norm": 3.148184299468994,
"learning_rate": 7.208550832711907e-07,
"loss": 4.211,
"step": 4465
},
{
"epoch": 0.9882821136413884,
"grad_norm": 3.389477491378784,
"learning_rate": 6.587124036788466e-07,
"loss": 4.3192,
"step": 4470
},
{
"epoch": 0.9893875746186159,
"grad_norm": 2.744230031967163,
"learning_rate": 5.965697240865026e-07,
"loss": 4.3994,
"step": 4475
},
{
"epoch": 0.9904930355958435,
"grad_norm": 3.189837694168091,
"learning_rate": 5.344270444941587e-07,
"loss": 4.3435,
"step": 4480
},
{
"epoch": 0.991598496573071,
"grad_norm": 3.2491848468780518,
"learning_rate": 4.722843649018146e-07,
"loss": 4.3766,
"step": 4485
},
{
"epoch": 0.9927039575502985,
"grad_norm": 3.1869592666625977,
"learning_rate": 4.1014168530947054e-07,
"loss": 4.393,
"step": 4490
},
{
"epoch": 0.993809418527526,
"grad_norm": 3.4105918407440186,
"learning_rate": 3.4799900571712656e-07,
"loss": 4.2419,
"step": 4495
},
{
"epoch": 0.9949148795047534,
"grad_norm": 3.1611382961273193,
"learning_rate": 2.858563261247825e-07,
"loss": 4.3572,
"step": 4500
},
{
"epoch": 0.996020340481981,
"grad_norm": 3.0471818447113037,
"learning_rate": 2.2371364653243848e-07,
"loss": 4.3163,
"step": 4505
},
{
"epoch": 0.9971258014592085,
"grad_norm": 2.9979894161224365,
"learning_rate": 1.6157096694009447e-07,
"loss": 4.1885,
"step": 4510
},
{
"epoch": 0.998231262436436,
"grad_norm": 3.4176154136657715,
"learning_rate": 9.942828734775043e-08,
"loss": 4.3076,
"step": 4515
},
{
"epoch": 0.9993367234136635,
"grad_norm": 3.594446897506714,
"learning_rate": 3.728560775540641e-08,
"loss": 4.184,
"step": 4520
}
],
"logging_steps": 5,
"max_steps": 4523,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.7364382421434368e+16,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}