ElapticAI-1a / trainer_state.json
elapt1c
epoche 3
46df360 verified
raw
history blame
157 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 4523,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001105460977227504,
"grad_norm": 2.395341396331787,
"learning_rate": 5.000000000000001e-07,
"loss": 4.6826,
"step": 5
},
{
"epoch": 0.002210921954455008,
"grad_norm": 2.2102696895599365,
"learning_rate": 1.0000000000000002e-06,
"loss": 4.4984,
"step": 10
},
{
"epoch": 0.0033163829316825116,
"grad_norm": 2.5083913803100586,
"learning_rate": 1.5e-06,
"loss": 4.5731,
"step": 15
},
{
"epoch": 0.004421843908910016,
"grad_norm": 2.1317508220672607,
"learning_rate": 2.0000000000000003e-06,
"loss": 4.5149,
"step": 20
},
{
"epoch": 0.0055273048861375195,
"grad_norm": 2.2241172790527344,
"learning_rate": 2.5e-06,
"loss": 4.4769,
"step": 25
},
{
"epoch": 0.006632765863365023,
"grad_norm": 2.1349635124206543,
"learning_rate": 3e-06,
"loss": 4.5924,
"step": 30
},
{
"epoch": 0.007738226840592527,
"grad_norm": 2.366008758544922,
"learning_rate": 3.5000000000000004e-06,
"loss": 4.5941,
"step": 35
},
{
"epoch": 0.008843687817820032,
"grad_norm": 2.4122307300567627,
"learning_rate": 4.000000000000001e-06,
"loss": 4.4631,
"step": 40
},
{
"epoch": 0.009949148795047534,
"grad_norm": 2.023873805999756,
"learning_rate": 4.5e-06,
"loss": 4.5361,
"step": 45
},
{
"epoch": 0.011054609772275039,
"grad_norm": 2.2571287155151367,
"learning_rate": 5e-06,
"loss": 4.5539,
"step": 50
},
{
"epoch": 0.012160070749502542,
"grad_norm": 2.1223011016845703,
"learning_rate": 5.500000000000001e-06,
"loss": 4.4691,
"step": 55
},
{
"epoch": 0.013265531726730046,
"grad_norm": 2.125227451324463,
"learning_rate": 6e-06,
"loss": 4.4713,
"step": 60
},
{
"epoch": 0.014370992703957551,
"grad_norm": 2.272958278656006,
"learning_rate": 6.5000000000000004e-06,
"loss": 4.5511,
"step": 65
},
{
"epoch": 0.015476453681185054,
"grad_norm": 2.1949267387390137,
"learning_rate": 7.000000000000001e-06,
"loss": 4.4926,
"step": 70
},
{
"epoch": 0.016581914658412557,
"grad_norm": 2.155870199203491,
"learning_rate": 7.5e-06,
"loss": 4.6169,
"step": 75
},
{
"epoch": 0.017687375635640063,
"grad_norm": 2.253253698348999,
"learning_rate": 8.000000000000001e-06,
"loss": 4.5112,
"step": 80
},
{
"epoch": 0.018792836612867566,
"grad_norm": 2.2717690467834473,
"learning_rate": 8.500000000000002e-06,
"loss": 4.5192,
"step": 85
},
{
"epoch": 0.01989829759009507,
"grad_norm": 2.4532206058502197,
"learning_rate": 9e-06,
"loss": 4.5964,
"step": 90
},
{
"epoch": 0.021003758567322575,
"grad_norm": 2.420793056488037,
"learning_rate": 9.5e-06,
"loss": 4.5466,
"step": 95
},
{
"epoch": 0.022109219544550078,
"grad_norm": 2.122037649154663,
"learning_rate": 1e-05,
"loss": 4.3679,
"step": 100
},
{
"epoch": 0.02321468052177758,
"grad_norm": 2.5729713439941406,
"learning_rate": 1.05e-05,
"loss": 4.4623,
"step": 105
},
{
"epoch": 0.024320141499005084,
"grad_norm": 2.364610195159912,
"learning_rate": 1.1000000000000001e-05,
"loss": 4.3146,
"step": 110
},
{
"epoch": 0.02542560247623259,
"grad_norm": 2.4596588611602783,
"learning_rate": 1.1500000000000002e-05,
"loss": 4.4393,
"step": 115
},
{
"epoch": 0.026531063453460093,
"grad_norm": 2.401916265487671,
"learning_rate": 1.2e-05,
"loss": 4.3398,
"step": 120
},
{
"epoch": 0.027636524430687596,
"grad_norm": 2.1921896934509277,
"learning_rate": 1.25e-05,
"loss": 4.3521,
"step": 125
},
{
"epoch": 0.028741985407915102,
"grad_norm": 2.193477153778076,
"learning_rate": 1.3000000000000001e-05,
"loss": 4.5969,
"step": 130
},
{
"epoch": 0.029847446385142605,
"grad_norm": 2.442413568496704,
"learning_rate": 1.3500000000000001e-05,
"loss": 4.5048,
"step": 135
},
{
"epoch": 0.030952907362370108,
"grad_norm": 2.314326047897339,
"learning_rate": 1.4000000000000001e-05,
"loss": 4.5778,
"step": 140
},
{
"epoch": 0.03205836833959761,
"grad_norm": 2.1947810649871826,
"learning_rate": 1.45e-05,
"loss": 4.648,
"step": 145
},
{
"epoch": 0.033163829316825114,
"grad_norm": 2.268144369125366,
"learning_rate": 1.5e-05,
"loss": 4.5153,
"step": 150
},
{
"epoch": 0.03426929029405262,
"grad_norm": 2.311060905456543,
"learning_rate": 1.55e-05,
"loss": 4.6327,
"step": 155
},
{
"epoch": 0.035374751271280126,
"grad_norm": 2.2926158905029297,
"learning_rate": 1.6000000000000003e-05,
"loss": 4.3874,
"step": 160
},
{
"epoch": 0.03648021224850763,
"grad_norm": 2.1994051933288574,
"learning_rate": 1.65e-05,
"loss": 4.4664,
"step": 165
},
{
"epoch": 0.03758567322573513,
"grad_norm": 2.3250937461853027,
"learning_rate": 1.7000000000000003e-05,
"loss": 4.4617,
"step": 170
},
{
"epoch": 0.038691134202962635,
"grad_norm": 2.3668570518493652,
"learning_rate": 1.75e-05,
"loss": 4.2885,
"step": 175
},
{
"epoch": 0.03979659518019014,
"grad_norm": 2.1874375343322754,
"learning_rate": 1.8e-05,
"loss": 4.4582,
"step": 180
},
{
"epoch": 0.04090205615741764,
"grad_norm": 2.365499973297119,
"learning_rate": 1.85e-05,
"loss": 4.4125,
"step": 185
},
{
"epoch": 0.04200751713464515,
"grad_norm": 2.272512435913086,
"learning_rate": 1.9e-05,
"loss": 4.3934,
"step": 190
},
{
"epoch": 0.04311297811187265,
"grad_norm": 2.343959093093872,
"learning_rate": 1.9500000000000003e-05,
"loss": 4.5141,
"step": 195
},
{
"epoch": 0.044218439089100156,
"grad_norm": 2.5027010440826416,
"learning_rate": 2e-05,
"loss": 4.3837,
"step": 200
},
{
"epoch": 0.04532390006632766,
"grad_norm": 2.2487406730651855,
"learning_rate": 2.05e-05,
"loss": 4.5956,
"step": 205
},
{
"epoch": 0.04642936104355516,
"grad_norm": 2.242449998855591,
"learning_rate": 2.1e-05,
"loss": 4.6371,
"step": 210
},
{
"epoch": 0.047534822020782665,
"grad_norm": 2.183947801589966,
"learning_rate": 2.15e-05,
"loss": 4.314,
"step": 215
},
{
"epoch": 0.04864028299801017,
"grad_norm": 2.2349512577056885,
"learning_rate": 2.2000000000000003e-05,
"loss": 4.4111,
"step": 220
},
{
"epoch": 0.04974574397523768,
"grad_norm": 2.2303688526153564,
"learning_rate": 2.25e-05,
"loss": 4.3312,
"step": 225
},
{
"epoch": 0.05085120495246518,
"grad_norm": 2.2685301303863525,
"learning_rate": 2.3000000000000003e-05,
"loss": 4.4618,
"step": 230
},
{
"epoch": 0.05195666592969268,
"grad_norm": 2.287493944168091,
"learning_rate": 2.35e-05,
"loss": 4.4438,
"step": 235
},
{
"epoch": 0.053062126906920186,
"grad_norm": 2.350281238555908,
"learning_rate": 2.4e-05,
"loss": 4.3578,
"step": 240
},
{
"epoch": 0.05416758788414769,
"grad_norm": 2.4053986072540283,
"learning_rate": 2.45e-05,
"loss": 4.4378,
"step": 245
},
{
"epoch": 0.05527304886137519,
"grad_norm": 2.5036556720733643,
"learning_rate": 2.5e-05,
"loss": 4.4291,
"step": 250
},
{
"epoch": 0.056378509838602694,
"grad_norm": 2.125025987625122,
"learning_rate": 2.5500000000000003e-05,
"loss": 4.3374,
"step": 255
},
{
"epoch": 0.057483970815830204,
"grad_norm": 2.461651563644409,
"learning_rate": 2.6000000000000002e-05,
"loss": 4.5828,
"step": 260
},
{
"epoch": 0.05858943179305771,
"grad_norm": 2.3358347415924072,
"learning_rate": 2.6500000000000004e-05,
"loss": 4.4477,
"step": 265
},
{
"epoch": 0.05969489277028521,
"grad_norm": 2.2937681674957275,
"learning_rate": 2.7000000000000002e-05,
"loss": 4.4706,
"step": 270
},
{
"epoch": 0.06080035374751271,
"grad_norm": 2.173781633377075,
"learning_rate": 2.7500000000000004e-05,
"loss": 4.3496,
"step": 275
},
{
"epoch": 0.061905814724740216,
"grad_norm": 2.373222827911377,
"learning_rate": 2.8000000000000003e-05,
"loss": 4.5112,
"step": 280
},
{
"epoch": 0.06301127570196773,
"grad_norm": 2.2600908279418945,
"learning_rate": 2.8499999999999998e-05,
"loss": 4.4922,
"step": 285
},
{
"epoch": 0.06411673667919522,
"grad_norm": 2.241600275039673,
"learning_rate": 2.9e-05,
"loss": 4.4719,
"step": 290
},
{
"epoch": 0.06522219765642273,
"grad_norm": 2.2558817863464355,
"learning_rate": 2.95e-05,
"loss": 4.5555,
"step": 295
},
{
"epoch": 0.06632765863365023,
"grad_norm": 2.5431759357452393,
"learning_rate": 3e-05,
"loss": 4.3901,
"step": 300
},
{
"epoch": 0.06743311961087774,
"grad_norm": 2.1968157291412354,
"learning_rate": 3.05e-05,
"loss": 4.3923,
"step": 305
},
{
"epoch": 0.06853858058810525,
"grad_norm": 2.1212503910064697,
"learning_rate": 3.1e-05,
"loss": 4.3401,
"step": 310
},
{
"epoch": 0.06964404156533274,
"grad_norm": 2.430278778076172,
"learning_rate": 3.15e-05,
"loss": 4.609,
"step": 315
},
{
"epoch": 0.07074950254256025,
"grad_norm": 2.741177797317505,
"learning_rate": 3.2000000000000005e-05,
"loss": 4.6077,
"step": 320
},
{
"epoch": 0.07185496351978775,
"grad_norm": 2.3611228466033936,
"learning_rate": 3.2500000000000004e-05,
"loss": 4.3933,
"step": 325
},
{
"epoch": 0.07296042449701526,
"grad_norm": 2.265152931213379,
"learning_rate": 3.3e-05,
"loss": 4.23,
"step": 330
},
{
"epoch": 0.07406588547424275,
"grad_norm": 2.3802292346954346,
"learning_rate": 3.35e-05,
"loss": 4.3844,
"step": 335
},
{
"epoch": 0.07517134645147026,
"grad_norm": 2.5243539810180664,
"learning_rate": 3.4000000000000007e-05,
"loss": 4.4695,
"step": 340
},
{
"epoch": 0.07627680742869777,
"grad_norm": 2.522508144378662,
"learning_rate": 3.45e-05,
"loss": 4.3575,
"step": 345
},
{
"epoch": 0.07738226840592527,
"grad_norm": 2.1260106563568115,
"learning_rate": 3.5e-05,
"loss": 4.4234,
"step": 350
},
{
"epoch": 0.07848772938315278,
"grad_norm": 2.445948839187622,
"learning_rate": 3.55e-05,
"loss": 4.3261,
"step": 355
},
{
"epoch": 0.07959319036038028,
"grad_norm": 2.191976308822632,
"learning_rate": 3.6e-05,
"loss": 4.3506,
"step": 360
},
{
"epoch": 0.08069865133760779,
"grad_norm": 2.582002878189087,
"learning_rate": 3.65e-05,
"loss": 4.379,
"step": 365
},
{
"epoch": 0.08180411231483528,
"grad_norm": 2.395965099334717,
"learning_rate": 3.7e-05,
"loss": 4.4162,
"step": 370
},
{
"epoch": 0.08290957329206279,
"grad_norm": 2.313727617263794,
"learning_rate": 3.7500000000000003e-05,
"loss": 4.4531,
"step": 375
},
{
"epoch": 0.0840150342692903,
"grad_norm": 2.2551207542419434,
"learning_rate": 3.8e-05,
"loss": 4.3614,
"step": 380
},
{
"epoch": 0.0851204952465178,
"grad_norm": 2.3821234703063965,
"learning_rate": 3.85e-05,
"loss": 4.2572,
"step": 385
},
{
"epoch": 0.0862259562237453,
"grad_norm": 2.212198495864868,
"learning_rate": 3.9000000000000006e-05,
"loss": 4.6126,
"step": 390
},
{
"epoch": 0.0873314172009728,
"grad_norm": 2.08597993850708,
"learning_rate": 3.9500000000000005e-05,
"loss": 4.4071,
"step": 395
},
{
"epoch": 0.08843687817820031,
"grad_norm": 2.2900874614715576,
"learning_rate": 4e-05,
"loss": 4.4119,
"step": 400
},
{
"epoch": 0.08954233915542781,
"grad_norm": 2.6229662895202637,
"learning_rate": 4.05e-05,
"loss": 4.5127,
"step": 405
},
{
"epoch": 0.09064780013265532,
"grad_norm": 2.313673496246338,
"learning_rate": 4.1e-05,
"loss": 4.5682,
"step": 410
},
{
"epoch": 0.09175326110988283,
"grad_norm": 2.5078179836273193,
"learning_rate": 4.15e-05,
"loss": 4.3089,
"step": 415
},
{
"epoch": 0.09285872208711032,
"grad_norm": 2.276742696762085,
"learning_rate": 4.2e-05,
"loss": 4.4486,
"step": 420
},
{
"epoch": 0.09396418306433783,
"grad_norm": 2.0925698280334473,
"learning_rate": 4.25e-05,
"loss": 4.2959,
"step": 425
},
{
"epoch": 0.09506964404156533,
"grad_norm": 2.5252251625061035,
"learning_rate": 4.3e-05,
"loss": 4.4345,
"step": 430
},
{
"epoch": 0.09617510501879284,
"grad_norm": 2.374155282974243,
"learning_rate": 4.35e-05,
"loss": 4.3959,
"step": 435
},
{
"epoch": 0.09728056599602034,
"grad_norm": 2.4412851333618164,
"learning_rate": 4.4000000000000006e-05,
"loss": 4.3762,
"step": 440
},
{
"epoch": 0.09838602697324784,
"grad_norm": 2.302851676940918,
"learning_rate": 4.4500000000000004e-05,
"loss": 4.5145,
"step": 445
},
{
"epoch": 0.09949148795047535,
"grad_norm": 2.3877639770507812,
"learning_rate": 4.5e-05,
"loss": 4.3736,
"step": 450
},
{
"epoch": 0.10059694892770285,
"grad_norm": 2.413830280303955,
"learning_rate": 4.55e-05,
"loss": 4.377,
"step": 455
},
{
"epoch": 0.10170240990493036,
"grad_norm": 2.5087687969207764,
"learning_rate": 4.600000000000001e-05,
"loss": 4.4129,
"step": 460
},
{
"epoch": 0.10280787088215786,
"grad_norm": 2.449108600616455,
"learning_rate": 4.6500000000000005e-05,
"loss": 4.4224,
"step": 465
},
{
"epoch": 0.10391333185938537,
"grad_norm": 2.255720376968384,
"learning_rate": 4.7e-05,
"loss": 4.3521,
"step": 470
},
{
"epoch": 0.10501879283661286,
"grad_norm": 2.461012601852417,
"learning_rate": 4.75e-05,
"loss": 4.3307,
"step": 475
},
{
"epoch": 0.10612425381384037,
"grad_norm": 2.6323764324188232,
"learning_rate": 4.8e-05,
"loss": 4.4938,
"step": 480
},
{
"epoch": 0.10722971479106788,
"grad_norm": 2.4425625801086426,
"learning_rate": 4.85e-05,
"loss": 4.5443,
"step": 485
},
{
"epoch": 0.10833517576829538,
"grad_norm": 2.523211717605591,
"learning_rate": 4.9e-05,
"loss": 4.3785,
"step": 490
},
{
"epoch": 0.10944063674552289,
"grad_norm": 2.4563889503479004,
"learning_rate": 4.9500000000000004e-05,
"loss": 4.4033,
"step": 495
},
{
"epoch": 0.11054609772275038,
"grad_norm": 2.4348998069763184,
"learning_rate": 5e-05,
"loss": 4.3472,
"step": 500
},
{
"epoch": 0.1116515586999779,
"grad_norm": 2.430751323699951,
"learning_rate": 4.993785732040766e-05,
"loss": 4.5632,
"step": 505
},
{
"epoch": 0.11275701967720539,
"grad_norm": 2.5139589309692383,
"learning_rate": 4.9875714640815315e-05,
"loss": 4.3744,
"step": 510
},
{
"epoch": 0.1138624806544329,
"grad_norm": 2.3713083267211914,
"learning_rate": 4.981357196122297e-05,
"loss": 4.4622,
"step": 515
},
{
"epoch": 0.11496794163166041,
"grad_norm": 2.348144769668579,
"learning_rate": 4.975142928163063e-05,
"loss": 4.4778,
"step": 520
},
{
"epoch": 0.1160734026088879,
"grad_norm": 2.4068586826324463,
"learning_rate": 4.968928660203828e-05,
"loss": 4.4139,
"step": 525
},
{
"epoch": 0.11717886358611541,
"grad_norm": 2.3237857818603516,
"learning_rate": 4.962714392244594e-05,
"loss": 4.3972,
"step": 530
},
{
"epoch": 0.11828432456334291,
"grad_norm": 2.39794659614563,
"learning_rate": 4.9565001242853596e-05,
"loss": 4.2682,
"step": 535
},
{
"epoch": 0.11938978554057042,
"grad_norm": 2.433943748474121,
"learning_rate": 4.950285856326125e-05,
"loss": 4.6123,
"step": 540
},
{
"epoch": 0.12049524651779792,
"grad_norm": 2.3196094036102295,
"learning_rate": 4.944071588366891e-05,
"loss": 4.4894,
"step": 545
},
{
"epoch": 0.12160070749502543,
"grad_norm": 2.388373851776123,
"learning_rate": 4.9378573204076564e-05,
"loss": 4.4735,
"step": 550
},
{
"epoch": 0.12270616847225294,
"grad_norm": 2.471214532852173,
"learning_rate": 4.931643052448422e-05,
"loss": 4.3991,
"step": 555
},
{
"epoch": 0.12381162944948043,
"grad_norm": 2.5611140727996826,
"learning_rate": 4.925428784489187e-05,
"loss": 4.3706,
"step": 560
},
{
"epoch": 0.12491709042670794,
"grad_norm": 2.365116834640503,
"learning_rate": 4.919214516529953e-05,
"loss": 4.3329,
"step": 565
},
{
"epoch": 0.12602255140393545,
"grad_norm": 2.5337095260620117,
"learning_rate": 4.913000248570719e-05,
"loss": 4.3502,
"step": 570
},
{
"epoch": 0.12712801238116295,
"grad_norm": 2.3166821002960205,
"learning_rate": 4.906785980611484e-05,
"loss": 4.4643,
"step": 575
},
{
"epoch": 0.12823347335839044,
"grad_norm": 2.7000489234924316,
"learning_rate": 4.90057171265225e-05,
"loss": 4.2057,
"step": 580
},
{
"epoch": 0.12933893433561794,
"grad_norm": 2.541940450668335,
"learning_rate": 4.894357444693015e-05,
"loss": 4.3045,
"step": 585
},
{
"epoch": 0.13044439531284546,
"grad_norm": 2.4047327041625977,
"learning_rate": 4.888143176733781e-05,
"loss": 4.2141,
"step": 590
},
{
"epoch": 0.13154985629007296,
"grad_norm": 2.3533935546875,
"learning_rate": 4.881928908774547e-05,
"loss": 4.3564,
"step": 595
},
{
"epoch": 0.13265531726730045,
"grad_norm": 2.469710350036621,
"learning_rate": 4.875714640815312e-05,
"loss": 4.3387,
"step": 600
},
{
"epoch": 0.13376077824452798,
"grad_norm": 2.4111387729644775,
"learning_rate": 4.8695003728560775e-05,
"loss": 4.3647,
"step": 605
},
{
"epoch": 0.13486623922175547,
"grad_norm": 2.5026888847351074,
"learning_rate": 4.863286104896843e-05,
"loss": 4.4231,
"step": 610
},
{
"epoch": 0.13597170019898297,
"grad_norm": 2.4435007572174072,
"learning_rate": 4.857071836937609e-05,
"loss": 4.45,
"step": 615
},
{
"epoch": 0.1370771611762105,
"grad_norm": 2.518418550491333,
"learning_rate": 4.850857568978375e-05,
"loss": 4.4376,
"step": 620
},
{
"epoch": 0.138182622153438,
"grad_norm": 2.4196436405181885,
"learning_rate": 4.84464330101914e-05,
"loss": 4.585,
"step": 625
},
{
"epoch": 0.13928808313066549,
"grad_norm": 2.3251471519470215,
"learning_rate": 4.8384290330599056e-05,
"loss": 4.387,
"step": 630
},
{
"epoch": 0.14039354410789298,
"grad_norm": 2.599461078643799,
"learning_rate": 4.832214765100672e-05,
"loss": 4.4816,
"step": 635
},
{
"epoch": 0.1414990050851205,
"grad_norm": 2.5266942977905273,
"learning_rate": 4.826000497141437e-05,
"loss": 4.4737,
"step": 640
},
{
"epoch": 0.142604466062348,
"grad_norm": 2.3561177253723145,
"learning_rate": 4.8197862291822025e-05,
"loss": 4.355,
"step": 645
},
{
"epoch": 0.1437099270395755,
"grad_norm": 2.291571855545044,
"learning_rate": 4.813571961222968e-05,
"loss": 4.4803,
"step": 650
},
{
"epoch": 0.14481538801680302,
"grad_norm": 2.5574657917022705,
"learning_rate": 4.807357693263734e-05,
"loss": 4.3132,
"step": 655
},
{
"epoch": 0.14592084899403052,
"grad_norm": 2.816318988800049,
"learning_rate": 4.801143425304499e-05,
"loss": 4.4246,
"step": 660
},
{
"epoch": 0.147026309971258,
"grad_norm": 2.3737952709198,
"learning_rate": 4.794929157345265e-05,
"loss": 4.5105,
"step": 665
},
{
"epoch": 0.1481317709484855,
"grad_norm": 2.4100232124328613,
"learning_rate": 4.7887148893860305e-05,
"loss": 4.5111,
"step": 670
},
{
"epoch": 0.14923723192571303,
"grad_norm": 2.36722731590271,
"learning_rate": 4.782500621426796e-05,
"loss": 4.3462,
"step": 675
},
{
"epoch": 0.15034269290294053,
"grad_norm": 2.747675657272339,
"learning_rate": 4.776286353467562e-05,
"loss": 4.518,
"step": 680
},
{
"epoch": 0.15144815388016802,
"grad_norm": 2.8760783672332764,
"learning_rate": 4.7700720855083274e-05,
"loss": 4.544,
"step": 685
},
{
"epoch": 0.15255361485739555,
"grad_norm": 2.1986746788024902,
"learning_rate": 4.763857817549093e-05,
"loss": 4.4212,
"step": 690
},
{
"epoch": 0.15365907583462304,
"grad_norm": 2.2483763694763184,
"learning_rate": 4.7576435495898586e-05,
"loss": 4.4373,
"step": 695
},
{
"epoch": 0.15476453681185054,
"grad_norm": 2.5549709796905518,
"learning_rate": 4.751429281630624e-05,
"loss": 4.4253,
"step": 700
},
{
"epoch": 0.15586999778907804,
"grad_norm": 2.2713725566864014,
"learning_rate": 4.74521501367139e-05,
"loss": 4.2794,
"step": 705
},
{
"epoch": 0.15697545876630556,
"grad_norm": 2.340376615524292,
"learning_rate": 4.7390007457121555e-05,
"loss": 4.5125,
"step": 710
},
{
"epoch": 0.15808091974353305,
"grad_norm": 2.421940803527832,
"learning_rate": 4.7327864777529204e-05,
"loss": 4.2371,
"step": 715
},
{
"epoch": 0.15918638072076055,
"grad_norm": 2.4546539783477783,
"learning_rate": 4.726572209793687e-05,
"loss": 4.4549,
"step": 720
},
{
"epoch": 0.16029184169798807,
"grad_norm": 2.427361011505127,
"learning_rate": 4.720357941834452e-05,
"loss": 4.4328,
"step": 725
},
{
"epoch": 0.16139730267521557,
"grad_norm": 2.4004828929901123,
"learning_rate": 4.714143673875217e-05,
"loss": 4.2623,
"step": 730
},
{
"epoch": 0.16250276365244307,
"grad_norm": 2.3959038257598877,
"learning_rate": 4.7079294059159836e-05,
"loss": 4.2597,
"step": 735
},
{
"epoch": 0.16360822462967056,
"grad_norm": 2.257460594177246,
"learning_rate": 4.7017151379567485e-05,
"loss": 4.382,
"step": 740
},
{
"epoch": 0.16471368560689809,
"grad_norm": 2.546736478805542,
"learning_rate": 4.695500869997515e-05,
"loss": 4.4304,
"step": 745
},
{
"epoch": 0.16581914658412558,
"grad_norm": 2.665574789047241,
"learning_rate": 4.6892866020382804e-05,
"loss": 4.443,
"step": 750
},
{
"epoch": 0.16692460756135308,
"grad_norm": 2.587796926498413,
"learning_rate": 4.6830723340790454e-05,
"loss": 4.3819,
"step": 755
},
{
"epoch": 0.1680300685385806,
"grad_norm": 2.6442179679870605,
"learning_rate": 4.6768580661198117e-05,
"loss": 4.3984,
"step": 760
},
{
"epoch": 0.1691355295158081,
"grad_norm": 2.596620798110962,
"learning_rate": 4.670643798160577e-05,
"loss": 4.2336,
"step": 765
},
{
"epoch": 0.1702409904930356,
"grad_norm": 2.4057729244232178,
"learning_rate": 4.664429530201342e-05,
"loss": 4.3909,
"step": 770
},
{
"epoch": 0.1713464514702631,
"grad_norm": 2.406342029571533,
"learning_rate": 4.6582152622421085e-05,
"loss": 4.3091,
"step": 775
},
{
"epoch": 0.1724519124474906,
"grad_norm": 2.4423723220825195,
"learning_rate": 4.6520009942828734e-05,
"loss": 4.3409,
"step": 780
},
{
"epoch": 0.1735573734247181,
"grad_norm": 2.342496633529663,
"learning_rate": 4.645786726323639e-05,
"loss": 4.3805,
"step": 785
},
{
"epoch": 0.1746628344019456,
"grad_norm": 2.482818365097046,
"learning_rate": 4.6395724583644054e-05,
"loss": 4.323,
"step": 790
},
{
"epoch": 0.17576829537917313,
"grad_norm": 2.6542818546295166,
"learning_rate": 4.63335819040517e-05,
"loss": 4.3603,
"step": 795
},
{
"epoch": 0.17687375635640062,
"grad_norm": 2.499776840209961,
"learning_rate": 4.627143922445936e-05,
"loss": 4.4008,
"step": 800
},
{
"epoch": 0.17797921733362812,
"grad_norm": 2.341139316558838,
"learning_rate": 4.6209296544867015e-05,
"loss": 4.3715,
"step": 805
},
{
"epoch": 0.17908467831085562,
"grad_norm": 2.29777455329895,
"learning_rate": 4.614715386527467e-05,
"loss": 4.4741,
"step": 810
},
{
"epoch": 0.18019013928808314,
"grad_norm": 2.515763282775879,
"learning_rate": 4.608501118568233e-05,
"loss": 4.3415,
"step": 815
},
{
"epoch": 0.18129560026531064,
"grad_norm": 2.4565176963806152,
"learning_rate": 4.6022868506089984e-05,
"loss": 4.2374,
"step": 820
},
{
"epoch": 0.18240106124253813,
"grad_norm": 2.6354682445526123,
"learning_rate": 4.596072582649764e-05,
"loss": 4.4921,
"step": 825
},
{
"epoch": 0.18350652221976566,
"grad_norm": 2.610104560852051,
"learning_rate": 4.5898583146905296e-05,
"loss": 4.4667,
"step": 830
},
{
"epoch": 0.18461198319699315,
"grad_norm": 2.362448215484619,
"learning_rate": 4.583644046731295e-05,
"loss": 4.3195,
"step": 835
},
{
"epoch": 0.18571744417422065,
"grad_norm": 2.380387306213379,
"learning_rate": 4.577429778772061e-05,
"loss": 4.4707,
"step": 840
},
{
"epoch": 0.18682290515144814,
"grad_norm": 2.4917492866516113,
"learning_rate": 4.5712155108128265e-05,
"loss": 4.5139,
"step": 845
},
{
"epoch": 0.18792836612867567,
"grad_norm": 2.3864855766296387,
"learning_rate": 4.565001242853592e-05,
"loss": 4.5081,
"step": 850
},
{
"epoch": 0.18903382710590316,
"grad_norm": 2.3583791255950928,
"learning_rate": 4.558786974894358e-05,
"loss": 4.3965,
"step": 855
},
{
"epoch": 0.19013928808313066,
"grad_norm": 2.506446599960327,
"learning_rate": 4.552572706935123e-05,
"loss": 4.4016,
"step": 860
},
{
"epoch": 0.19124474906035818,
"grad_norm": 2.2975127696990967,
"learning_rate": 4.546358438975889e-05,
"loss": 4.4601,
"step": 865
},
{
"epoch": 0.19235021003758568,
"grad_norm": 2.433366537094116,
"learning_rate": 4.5401441710166546e-05,
"loss": 4.2386,
"step": 870
},
{
"epoch": 0.19345567101481317,
"grad_norm": 2.3259806632995605,
"learning_rate": 4.53392990305742e-05,
"loss": 4.3704,
"step": 875
},
{
"epoch": 0.19456113199204067,
"grad_norm": 2.514643907546997,
"learning_rate": 4.527715635098186e-05,
"loss": 4.4008,
"step": 880
},
{
"epoch": 0.1956665929692682,
"grad_norm": 2.3121140003204346,
"learning_rate": 4.5215013671389514e-05,
"loss": 4.2446,
"step": 885
},
{
"epoch": 0.1967720539464957,
"grad_norm": 2.412771224975586,
"learning_rate": 4.515287099179717e-05,
"loss": 4.4833,
"step": 890
},
{
"epoch": 0.19787751492372319,
"grad_norm": 2.4728493690490723,
"learning_rate": 4.509072831220482e-05,
"loss": 4.2572,
"step": 895
},
{
"epoch": 0.1989829759009507,
"grad_norm": 2.3301310539245605,
"learning_rate": 4.502858563261248e-05,
"loss": 4.3495,
"step": 900
},
{
"epoch": 0.2000884368781782,
"grad_norm": 2.5001354217529297,
"learning_rate": 4.496644295302014e-05,
"loss": 4.3298,
"step": 905
},
{
"epoch": 0.2011938978554057,
"grad_norm": 2.338364601135254,
"learning_rate": 4.490430027342779e-05,
"loss": 4.3985,
"step": 910
},
{
"epoch": 0.2022993588326332,
"grad_norm": 2.30706524848938,
"learning_rate": 4.484215759383545e-05,
"loss": 4.3349,
"step": 915
},
{
"epoch": 0.20340481980986072,
"grad_norm": 2.396179437637329,
"learning_rate": 4.478001491424311e-05,
"loss": 4.3986,
"step": 920
},
{
"epoch": 0.20451028078708822,
"grad_norm": 2.477341890335083,
"learning_rate": 4.471787223465076e-05,
"loss": 4.3669,
"step": 925
},
{
"epoch": 0.2056157417643157,
"grad_norm": 2.5613510608673096,
"learning_rate": 4.465572955505842e-05,
"loss": 4.3262,
"step": 930
},
{
"epoch": 0.20672120274154324,
"grad_norm": 2.5783421993255615,
"learning_rate": 4.459358687546607e-05,
"loss": 4.3136,
"step": 935
},
{
"epoch": 0.20782666371877073,
"grad_norm": 2.4187774658203125,
"learning_rate": 4.4531444195873725e-05,
"loss": 4.3181,
"step": 940
},
{
"epoch": 0.20893212469599823,
"grad_norm": 2.5953481197357178,
"learning_rate": 4.446930151628139e-05,
"loss": 4.5064,
"step": 945
},
{
"epoch": 0.21003758567322572,
"grad_norm": 2.513113260269165,
"learning_rate": 4.440715883668904e-05,
"loss": 4.3423,
"step": 950
},
{
"epoch": 0.21114304665045325,
"grad_norm": 2.44311261177063,
"learning_rate": 4.4345016157096694e-05,
"loss": 4.376,
"step": 955
},
{
"epoch": 0.21224850762768074,
"grad_norm": 2.427305221557617,
"learning_rate": 4.428287347750435e-05,
"loss": 4.3677,
"step": 960
},
{
"epoch": 0.21335396860490824,
"grad_norm": 2.3907687664031982,
"learning_rate": 4.4220730797912006e-05,
"loss": 4.3134,
"step": 965
},
{
"epoch": 0.21445942958213576,
"grad_norm": 2.348848819732666,
"learning_rate": 4.415858811831967e-05,
"loss": 4.5477,
"step": 970
},
{
"epoch": 0.21556489055936326,
"grad_norm": 2.830244302749634,
"learning_rate": 4.409644543872732e-05,
"loss": 4.4073,
"step": 975
},
{
"epoch": 0.21667035153659076,
"grad_norm": 2.5423595905303955,
"learning_rate": 4.4034302759134975e-05,
"loss": 4.4871,
"step": 980
},
{
"epoch": 0.21777581251381825,
"grad_norm": 2.4786319732666016,
"learning_rate": 4.397216007954264e-05,
"loss": 4.4376,
"step": 985
},
{
"epoch": 0.21888127349104577,
"grad_norm": 2.5218095779418945,
"learning_rate": 4.391001739995029e-05,
"loss": 4.5045,
"step": 990
},
{
"epoch": 0.21998673446827327,
"grad_norm": 2.492645740509033,
"learning_rate": 4.384787472035794e-05,
"loss": 4.3694,
"step": 995
},
{
"epoch": 0.22109219544550077,
"grad_norm": 2.3848962783813477,
"learning_rate": 4.37857320407656e-05,
"loss": 4.2347,
"step": 1000
},
{
"epoch": 0.2221976564227283,
"grad_norm": 2.4425323009490967,
"learning_rate": 4.3723589361173255e-05,
"loss": 4.4254,
"step": 1005
},
{
"epoch": 0.2233031173999558,
"grad_norm": 2.4466652870178223,
"learning_rate": 4.366144668158091e-05,
"loss": 4.3855,
"step": 1010
},
{
"epoch": 0.22440857837718328,
"grad_norm": 2.5668978691101074,
"learning_rate": 4.359930400198857e-05,
"loss": 4.1885,
"step": 1015
},
{
"epoch": 0.22551403935441078,
"grad_norm": 2.1038079261779785,
"learning_rate": 4.3537161322396224e-05,
"loss": 4.3574,
"step": 1020
},
{
"epoch": 0.2266195003316383,
"grad_norm": 2.3468997478485107,
"learning_rate": 4.347501864280388e-05,
"loss": 4.3984,
"step": 1025
},
{
"epoch": 0.2277249613088658,
"grad_norm": 2.2503867149353027,
"learning_rate": 4.3412875963211536e-05,
"loss": 4.3913,
"step": 1030
},
{
"epoch": 0.2288304222860933,
"grad_norm": 2.508117437362671,
"learning_rate": 4.335073328361919e-05,
"loss": 4.4638,
"step": 1035
},
{
"epoch": 0.22993588326332082,
"grad_norm": 2.503089666366577,
"learning_rate": 4.328859060402685e-05,
"loss": 4.2682,
"step": 1040
},
{
"epoch": 0.2310413442405483,
"grad_norm": 2.4912095069885254,
"learning_rate": 4.3226447924434505e-05,
"loss": 4.4836,
"step": 1045
},
{
"epoch": 0.2321468052177758,
"grad_norm": 2.383793354034424,
"learning_rate": 4.3164305244842154e-05,
"loss": 4.4063,
"step": 1050
},
{
"epoch": 0.2332522661950033,
"grad_norm": 2.299375534057617,
"learning_rate": 4.310216256524982e-05,
"loss": 4.3989,
"step": 1055
},
{
"epoch": 0.23435772717223083,
"grad_norm": 2.432926893234253,
"learning_rate": 4.304001988565747e-05,
"loss": 4.3972,
"step": 1060
},
{
"epoch": 0.23546318814945832,
"grad_norm": 2.6002376079559326,
"learning_rate": 4.297787720606512e-05,
"loss": 4.166,
"step": 1065
},
{
"epoch": 0.23656864912668582,
"grad_norm": 2.76485013961792,
"learning_rate": 4.2915734526472786e-05,
"loss": 4.4923,
"step": 1070
},
{
"epoch": 0.23767411010391334,
"grad_norm": 2.4608538150787354,
"learning_rate": 4.285359184688044e-05,
"loss": 4.4156,
"step": 1075
},
{
"epoch": 0.23877957108114084,
"grad_norm": 2.5879130363464355,
"learning_rate": 4.279144916728809e-05,
"loss": 4.2349,
"step": 1080
},
{
"epoch": 0.23988503205836834,
"grad_norm": 2.4327921867370605,
"learning_rate": 4.2729306487695754e-05,
"loss": 4.2487,
"step": 1085
},
{
"epoch": 0.24099049303559583,
"grad_norm": 2.4870424270629883,
"learning_rate": 4.2667163808103404e-05,
"loss": 4.499,
"step": 1090
},
{
"epoch": 0.24209595401282336,
"grad_norm": 2.573253631591797,
"learning_rate": 4.2605021128511067e-05,
"loss": 4.2689,
"step": 1095
},
{
"epoch": 0.24320141499005085,
"grad_norm": 2.4426496028900146,
"learning_rate": 4.254287844891872e-05,
"loss": 4.3502,
"step": 1100
},
{
"epoch": 0.24430687596727835,
"grad_norm": 2.2450709342956543,
"learning_rate": 4.248073576932637e-05,
"loss": 4.3314,
"step": 1105
},
{
"epoch": 0.24541233694450587,
"grad_norm": 2.6109743118286133,
"learning_rate": 4.2418593089734035e-05,
"loss": 4.305,
"step": 1110
},
{
"epoch": 0.24651779792173337,
"grad_norm": 2.626323938369751,
"learning_rate": 4.2356450410141684e-05,
"loss": 4.2716,
"step": 1115
},
{
"epoch": 0.24762325889896086,
"grad_norm": 2.320756673812866,
"learning_rate": 4.229430773054934e-05,
"loss": 4.4438,
"step": 1120
},
{
"epoch": 0.24872871987618836,
"grad_norm": 2.481062650680542,
"learning_rate": 4.2232165050957004e-05,
"loss": 4.4925,
"step": 1125
},
{
"epoch": 0.24983418085341588,
"grad_norm": 2.521596908569336,
"learning_rate": 4.217002237136465e-05,
"loss": 4.4221,
"step": 1130
},
{
"epoch": 0.2509396418306434,
"grad_norm": 2.361933469772339,
"learning_rate": 4.210787969177231e-05,
"loss": 4.3693,
"step": 1135
},
{
"epoch": 0.2520451028078709,
"grad_norm": 2.357417106628418,
"learning_rate": 4.204573701217997e-05,
"loss": 4.4775,
"step": 1140
},
{
"epoch": 0.25315056378509837,
"grad_norm": 2.688908576965332,
"learning_rate": 4.198359433258762e-05,
"loss": 4.29,
"step": 1145
},
{
"epoch": 0.2542560247623259,
"grad_norm": 2.2829039096832275,
"learning_rate": 4.192145165299528e-05,
"loss": 4.401,
"step": 1150
},
{
"epoch": 0.2553614857395534,
"grad_norm": 2.6343767642974854,
"learning_rate": 4.1859308973402934e-05,
"loss": 4.4336,
"step": 1155
},
{
"epoch": 0.2564669467167809,
"grad_norm": 2.3044660091400146,
"learning_rate": 4.179716629381059e-05,
"loss": 4.3832,
"step": 1160
},
{
"epoch": 0.2575724076940084,
"grad_norm": 2.5719525814056396,
"learning_rate": 4.1735023614218246e-05,
"loss": 4.2833,
"step": 1165
},
{
"epoch": 0.2586778686712359,
"grad_norm": 2.6642727851867676,
"learning_rate": 4.16728809346259e-05,
"loss": 4.3051,
"step": 1170
},
{
"epoch": 0.2597833296484634,
"grad_norm": 2.5633628368377686,
"learning_rate": 4.161073825503356e-05,
"loss": 4.3245,
"step": 1175
},
{
"epoch": 0.2608887906256909,
"grad_norm": 2.3659725189208984,
"learning_rate": 4.1548595575441215e-05,
"loss": 4.5625,
"step": 1180
},
{
"epoch": 0.2619942516029184,
"grad_norm": 2.5750010013580322,
"learning_rate": 4.148645289584887e-05,
"loss": 4.2276,
"step": 1185
},
{
"epoch": 0.2630997125801459,
"grad_norm": 2.650841474533081,
"learning_rate": 4.142431021625653e-05,
"loss": 4.4841,
"step": 1190
},
{
"epoch": 0.26420517355737344,
"grad_norm": 2.257554292678833,
"learning_rate": 4.136216753666418e-05,
"loss": 4.5292,
"step": 1195
},
{
"epoch": 0.2653106345346009,
"grad_norm": 2.3063228130340576,
"learning_rate": 4.130002485707184e-05,
"loss": 4.3253,
"step": 1200
},
{
"epoch": 0.26641609551182843,
"grad_norm": 2.4297571182250977,
"learning_rate": 4.123788217747949e-05,
"loss": 4.3772,
"step": 1205
},
{
"epoch": 0.26752155648905596,
"grad_norm": 2.431993007659912,
"learning_rate": 4.117573949788715e-05,
"loss": 4.3032,
"step": 1210
},
{
"epoch": 0.2686270174662834,
"grad_norm": 2.3991315364837646,
"learning_rate": 4.111359681829481e-05,
"loss": 4.3427,
"step": 1215
},
{
"epoch": 0.26973247844351095,
"grad_norm": 2.3820011615753174,
"learning_rate": 4.1051454138702464e-05,
"loss": 4.3706,
"step": 1220
},
{
"epoch": 0.27083793942073847,
"grad_norm": 2.670473337173462,
"learning_rate": 4.098931145911012e-05,
"loss": 4.3521,
"step": 1225
},
{
"epoch": 0.27194340039796594,
"grad_norm": 2.8199636936187744,
"learning_rate": 4.0927168779517776e-05,
"loss": 4.3276,
"step": 1230
},
{
"epoch": 0.27304886137519346,
"grad_norm": 2.347820520401001,
"learning_rate": 4.086502609992543e-05,
"loss": 4.3414,
"step": 1235
},
{
"epoch": 0.274154322352421,
"grad_norm": 2.271981716156006,
"learning_rate": 4.080288342033309e-05,
"loss": 4.3148,
"step": 1240
},
{
"epoch": 0.27525978332964846,
"grad_norm": 2.515171527862549,
"learning_rate": 4.074074074074074e-05,
"loss": 4.3787,
"step": 1245
},
{
"epoch": 0.276365244306876,
"grad_norm": 2.4658026695251465,
"learning_rate": 4.06785980611484e-05,
"loss": 4.4014,
"step": 1250
},
{
"epoch": 0.27747070528410345,
"grad_norm": 2.4536259174346924,
"learning_rate": 4.061645538155606e-05,
"loss": 4.2641,
"step": 1255
},
{
"epoch": 0.27857616626133097,
"grad_norm": 2.491704225540161,
"learning_rate": 4.055431270196371e-05,
"loss": 4.3729,
"step": 1260
},
{
"epoch": 0.2796816272385585,
"grad_norm": 2.5859057903289795,
"learning_rate": 4.049217002237137e-05,
"loss": 4.3815,
"step": 1265
},
{
"epoch": 0.28078708821578596,
"grad_norm": 2.5725574493408203,
"learning_rate": 4.043002734277902e-05,
"loss": 4.3624,
"step": 1270
},
{
"epoch": 0.2818925491930135,
"grad_norm": 2.484657049179077,
"learning_rate": 4.0367884663186675e-05,
"loss": 4.3583,
"step": 1275
},
{
"epoch": 0.282998010170241,
"grad_norm": 2.544689178466797,
"learning_rate": 4.030574198359434e-05,
"loss": 4.2289,
"step": 1280
},
{
"epoch": 0.2841034711474685,
"grad_norm": 2.5880086421966553,
"learning_rate": 4.024359930400199e-05,
"loss": 4.3604,
"step": 1285
},
{
"epoch": 0.285208932124696,
"grad_norm": 2.614906072616577,
"learning_rate": 4.0181456624409644e-05,
"loss": 4.2697,
"step": 1290
},
{
"epoch": 0.2863143931019235,
"grad_norm": 2.6999433040618896,
"learning_rate": 4.011931394481731e-05,
"loss": 4.4131,
"step": 1295
},
{
"epoch": 0.287419854079151,
"grad_norm": 2.3542439937591553,
"learning_rate": 4.0057171265224956e-05,
"loss": 4.3436,
"step": 1300
},
{
"epoch": 0.2885253150563785,
"grad_norm": 2.4977333545684814,
"learning_rate": 3.999502858563262e-05,
"loss": 4.2333,
"step": 1305
},
{
"epoch": 0.28963077603360604,
"grad_norm": 2.3839094638824463,
"learning_rate": 3.993288590604027e-05,
"loss": 4.2906,
"step": 1310
},
{
"epoch": 0.2907362370108335,
"grad_norm": 2.583096504211426,
"learning_rate": 3.9870743226447925e-05,
"loss": 4.2372,
"step": 1315
},
{
"epoch": 0.29184169798806103,
"grad_norm": 2.8082754611968994,
"learning_rate": 3.980860054685559e-05,
"loss": 4.3763,
"step": 1320
},
{
"epoch": 0.2929471589652885,
"grad_norm": 2.699869394302368,
"learning_rate": 3.974645786726324e-05,
"loss": 4.3501,
"step": 1325
},
{
"epoch": 0.294052619942516,
"grad_norm": 2.489060878753662,
"learning_rate": 3.968431518767089e-05,
"loss": 4.3261,
"step": 1330
},
{
"epoch": 0.29515808091974355,
"grad_norm": 2.6914567947387695,
"learning_rate": 3.962217250807855e-05,
"loss": 4.3582,
"step": 1335
},
{
"epoch": 0.296263541896971,
"grad_norm": 2.6697006225585938,
"learning_rate": 3.9560029828486205e-05,
"loss": 4.3114,
"step": 1340
},
{
"epoch": 0.29736900287419854,
"grad_norm": 2.5954415798187256,
"learning_rate": 3.949788714889386e-05,
"loss": 4.2934,
"step": 1345
},
{
"epoch": 0.29847446385142606,
"grad_norm": 2.985745906829834,
"learning_rate": 3.943574446930152e-05,
"loss": 4.3548,
"step": 1350
},
{
"epoch": 0.29957992482865353,
"grad_norm": 2.397188186645508,
"learning_rate": 3.9373601789709174e-05,
"loss": 4.378,
"step": 1355
},
{
"epoch": 0.30068538580588106,
"grad_norm": 2.328190565109253,
"learning_rate": 3.931145911011683e-05,
"loss": 4.3864,
"step": 1360
},
{
"epoch": 0.3017908467831086,
"grad_norm": 2.659130096435547,
"learning_rate": 3.9249316430524486e-05,
"loss": 4.2503,
"step": 1365
},
{
"epoch": 0.30289630776033605,
"grad_norm": 2.5458106994628906,
"learning_rate": 3.918717375093214e-05,
"loss": 4.4694,
"step": 1370
},
{
"epoch": 0.30400176873756357,
"grad_norm": 2.6253693103790283,
"learning_rate": 3.91250310713398e-05,
"loss": 4.3011,
"step": 1375
},
{
"epoch": 0.3051072297147911,
"grad_norm": 2.5949649810791016,
"learning_rate": 3.9062888391747455e-05,
"loss": 4.3781,
"step": 1380
},
{
"epoch": 0.30621269069201856,
"grad_norm": 2.6035447120666504,
"learning_rate": 3.900074571215511e-05,
"loss": 4.215,
"step": 1385
},
{
"epoch": 0.3073181516692461,
"grad_norm": 2.7866146564483643,
"learning_rate": 3.893860303256277e-05,
"loss": 4.3382,
"step": 1390
},
{
"epoch": 0.30842361264647356,
"grad_norm": 2.5743088722229004,
"learning_rate": 3.887646035297042e-05,
"loss": 4.3505,
"step": 1395
},
{
"epoch": 0.3095290736237011,
"grad_norm": 2.6363112926483154,
"learning_rate": 3.881431767337807e-05,
"loss": 4.37,
"step": 1400
},
{
"epoch": 0.3106345346009286,
"grad_norm": 2.409414291381836,
"learning_rate": 3.8752174993785736e-05,
"loss": 4.3642,
"step": 1405
},
{
"epoch": 0.31173999557815607,
"grad_norm": 2.6767184734344482,
"learning_rate": 3.869003231419339e-05,
"loss": 4.4374,
"step": 1410
},
{
"epoch": 0.3128454565553836,
"grad_norm": 2.6071739196777344,
"learning_rate": 3.862788963460104e-05,
"loss": 4.4875,
"step": 1415
},
{
"epoch": 0.3139509175326111,
"grad_norm": 2.8153324127197266,
"learning_rate": 3.8565746955008704e-05,
"loss": 4.2156,
"step": 1420
},
{
"epoch": 0.3150563785098386,
"grad_norm": 2.5854175090789795,
"learning_rate": 3.8503604275416354e-05,
"loss": 4.4762,
"step": 1425
},
{
"epoch": 0.3161618394870661,
"grad_norm": 2.6283559799194336,
"learning_rate": 3.8441461595824017e-05,
"loss": 4.3707,
"step": 1430
},
{
"epoch": 0.31726730046429363,
"grad_norm": 2.392477512359619,
"learning_rate": 3.837931891623167e-05,
"loss": 4.4578,
"step": 1435
},
{
"epoch": 0.3183727614415211,
"grad_norm": 2.5749545097351074,
"learning_rate": 3.831717623663932e-05,
"loss": 4.3093,
"step": 1440
},
{
"epoch": 0.3194782224187486,
"grad_norm": 2.57065486907959,
"learning_rate": 3.8255033557046985e-05,
"loss": 4.4154,
"step": 1445
},
{
"epoch": 0.32058368339597615,
"grad_norm": 2.652879476547241,
"learning_rate": 3.819289087745464e-05,
"loss": 4.5573,
"step": 1450
},
{
"epoch": 0.3216891443732036,
"grad_norm": 2.846167802810669,
"learning_rate": 3.813074819786229e-05,
"loss": 4.4113,
"step": 1455
},
{
"epoch": 0.32279460535043114,
"grad_norm": 2.641319513320923,
"learning_rate": 3.8068605518269954e-05,
"loss": 4.3614,
"step": 1460
},
{
"epoch": 0.3239000663276586,
"grad_norm": 2.5918776988983154,
"learning_rate": 3.80064628386776e-05,
"loss": 4.3636,
"step": 1465
},
{
"epoch": 0.32500552730488613,
"grad_norm": 2.6786410808563232,
"learning_rate": 3.794432015908526e-05,
"loss": 4.3731,
"step": 1470
},
{
"epoch": 0.32611098828211366,
"grad_norm": 2.548100233078003,
"learning_rate": 3.788217747949292e-05,
"loss": 4.2728,
"step": 1475
},
{
"epoch": 0.3272164492593411,
"grad_norm": 2.409332752227783,
"learning_rate": 3.782003479990057e-05,
"loss": 4.3442,
"step": 1480
},
{
"epoch": 0.32832191023656865,
"grad_norm": 2.8180229663848877,
"learning_rate": 3.775789212030823e-05,
"loss": 4.3566,
"step": 1485
},
{
"epoch": 0.32942737121379617,
"grad_norm": 2.634147882461548,
"learning_rate": 3.7695749440715884e-05,
"loss": 4.4708,
"step": 1490
},
{
"epoch": 0.33053283219102364,
"grad_norm": 2.3490123748779297,
"learning_rate": 3.763360676112354e-05,
"loss": 4.2733,
"step": 1495
},
{
"epoch": 0.33163829316825116,
"grad_norm": 2.638009548187256,
"learning_rate": 3.7571464081531196e-05,
"loss": 4.4472,
"step": 1500
},
{
"epoch": 0.3327437541454787,
"grad_norm": 2.601348638534546,
"learning_rate": 3.750932140193885e-05,
"loss": 4.5207,
"step": 1505
},
{
"epoch": 0.33384921512270616,
"grad_norm": 2.6195290088653564,
"learning_rate": 3.744717872234651e-05,
"loss": 4.3151,
"step": 1510
},
{
"epoch": 0.3349546760999337,
"grad_norm": 2.5007519721984863,
"learning_rate": 3.7385036042754165e-05,
"loss": 4.3751,
"step": 1515
},
{
"epoch": 0.3360601370771612,
"grad_norm": 2.4757566452026367,
"learning_rate": 3.732289336316182e-05,
"loss": 4.2864,
"step": 1520
},
{
"epoch": 0.33716559805438867,
"grad_norm": 2.612262487411499,
"learning_rate": 3.726075068356948e-05,
"loss": 4.4617,
"step": 1525
},
{
"epoch": 0.3382710590316162,
"grad_norm": 2.3229122161865234,
"learning_rate": 3.719860800397713e-05,
"loss": 4.2659,
"step": 1530
},
{
"epoch": 0.33937652000884366,
"grad_norm": 3.0333845615386963,
"learning_rate": 3.713646532438479e-05,
"loss": 4.2091,
"step": 1535
},
{
"epoch": 0.3404819809860712,
"grad_norm": 2.364445686340332,
"learning_rate": 3.7074322644792446e-05,
"loss": 4.1667,
"step": 1540
},
{
"epoch": 0.3415874419632987,
"grad_norm": 2.6092944145202637,
"learning_rate": 3.70121799652001e-05,
"loss": 4.4148,
"step": 1545
},
{
"epoch": 0.3426929029405262,
"grad_norm": 2.69758677482605,
"learning_rate": 3.695003728560776e-05,
"loss": 4.3029,
"step": 1550
},
{
"epoch": 0.3437983639177537,
"grad_norm": 2.665482997894287,
"learning_rate": 3.6887894606015414e-05,
"loss": 4.3617,
"step": 1555
},
{
"epoch": 0.3449038248949812,
"grad_norm": 2.6900408267974854,
"learning_rate": 3.682575192642307e-05,
"loss": 4.456,
"step": 1560
},
{
"epoch": 0.3460092858722087,
"grad_norm": 2.335728406906128,
"learning_rate": 3.6763609246830726e-05,
"loss": 4.3155,
"step": 1565
},
{
"epoch": 0.3471147468494362,
"grad_norm": 2.85036039352417,
"learning_rate": 3.670146656723838e-05,
"loss": 4.3152,
"step": 1570
},
{
"epoch": 0.34822020782666374,
"grad_norm": 2.652212381362915,
"learning_rate": 3.663932388764604e-05,
"loss": 4.4341,
"step": 1575
},
{
"epoch": 0.3493256688038912,
"grad_norm": 2.3771016597747803,
"learning_rate": 3.6577181208053695e-05,
"loss": 4.3358,
"step": 1580
},
{
"epoch": 0.35043112978111873,
"grad_norm": 2.7119994163513184,
"learning_rate": 3.651503852846135e-05,
"loss": 4.2583,
"step": 1585
},
{
"epoch": 0.35153659075834626,
"grad_norm": 2.4877076148986816,
"learning_rate": 3.645289584886901e-05,
"loss": 4.4398,
"step": 1590
},
{
"epoch": 0.3526420517355737,
"grad_norm": 2.5400094985961914,
"learning_rate": 3.639075316927666e-05,
"loss": 4.4864,
"step": 1595
},
{
"epoch": 0.35374751271280125,
"grad_norm": 2.929621458053589,
"learning_rate": 3.632861048968432e-05,
"loss": 4.2378,
"step": 1600
},
{
"epoch": 0.3548529736900287,
"grad_norm": 2.555133581161499,
"learning_rate": 3.6266467810091976e-05,
"loss": 4.3108,
"step": 1605
},
{
"epoch": 0.35595843466725624,
"grad_norm": 2.410792350769043,
"learning_rate": 3.6204325130499625e-05,
"loss": 4.3592,
"step": 1610
},
{
"epoch": 0.35706389564448376,
"grad_norm": 2.459975004196167,
"learning_rate": 3.614218245090729e-05,
"loss": 4.5196,
"step": 1615
},
{
"epoch": 0.35816935662171123,
"grad_norm": 2.834867000579834,
"learning_rate": 3.608003977131494e-05,
"loss": 4.3758,
"step": 1620
},
{
"epoch": 0.35927481759893876,
"grad_norm": 2.6577582359313965,
"learning_rate": 3.6017897091722594e-05,
"loss": 4.3663,
"step": 1625
},
{
"epoch": 0.3603802785761663,
"grad_norm": 2.725658416748047,
"learning_rate": 3.595575441213026e-05,
"loss": 4.3878,
"step": 1630
},
{
"epoch": 0.36148573955339375,
"grad_norm": 2.368903160095215,
"learning_rate": 3.5893611732537906e-05,
"loss": 4.3393,
"step": 1635
},
{
"epoch": 0.36259120053062127,
"grad_norm": 2.2058262825012207,
"learning_rate": 3.583146905294556e-05,
"loss": 4.3152,
"step": 1640
},
{
"epoch": 0.3636966615078488,
"grad_norm": 2.60345458984375,
"learning_rate": 3.576932637335322e-05,
"loss": 4.4803,
"step": 1645
},
{
"epoch": 0.36480212248507626,
"grad_norm": 2.657458543777466,
"learning_rate": 3.5707183693760875e-05,
"loss": 4.3058,
"step": 1650
},
{
"epoch": 0.3659075834623038,
"grad_norm": 2.596036195755005,
"learning_rate": 3.564504101416854e-05,
"loss": 4.2178,
"step": 1655
},
{
"epoch": 0.3670130444395313,
"grad_norm": 2.7093770503997803,
"learning_rate": 3.558289833457619e-05,
"loss": 4.3902,
"step": 1660
},
{
"epoch": 0.3681185054167588,
"grad_norm": 2.2766308784484863,
"learning_rate": 3.552075565498384e-05,
"loss": 4.4526,
"step": 1665
},
{
"epoch": 0.3692239663939863,
"grad_norm": 2.696753740310669,
"learning_rate": 3.5458612975391506e-05,
"loss": 4.3636,
"step": 1670
},
{
"epoch": 0.37032942737121377,
"grad_norm": 2.463946580886841,
"learning_rate": 3.5396470295799155e-05,
"loss": 4.2369,
"step": 1675
},
{
"epoch": 0.3714348883484413,
"grad_norm": 2.948925018310547,
"learning_rate": 3.533432761620681e-05,
"loss": 4.4674,
"step": 1680
},
{
"epoch": 0.3725403493256688,
"grad_norm": 2.914759874343872,
"learning_rate": 3.527218493661447e-05,
"loss": 4.2563,
"step": 1685
},
{
"epoch": 0.3736458103028963,
"grad_norm": 2.562021255493164,
"learning_rate": 3.5210042257022124e-05,
"loss": 4.2267,
"step": 1690
},
{
"epoch": 0.3747512712801238,
"grad_norm": 2.4976344108581543,
"learning_rate": 3.514789957742978e-05,
"loss": 4.3459,
"step": 1695
},
{
"epoch": 0.37585673225735133,
"grad_norm": 2.656845808029175,
"learning_rate": 3.5085756897837436e-05,
"loss": 4.2767,
"step": 1700
},
{
"epoch": 0.3769621932345788,
"grad_norm": 2.6122493743896484,
"learning_rate": 3.502361421824509e-05,
"loss": 4.2535,
"step": 1705
},
{
"epoch": 0.3780676542118063,
"grad_norm": 2.7145111560821533,
"learning_rate": 3.496147153865275e-05,
"loss": 4.373,
"step": 1710
},
{
"epoch": 0.37917311518903385,
"grad_norm": 2.6271467208862305,
"learning_rate": 3.4899328859060405e-05,
"loss": 4.2728,
"step": 1715
},
{
"epoch": 0.3802785761662613,
"grad_norm": 2.350149631500244,
"learning_rate": 3.483718617946806e-05,
"loss": 4.1621,
"step": 1720
},
{
"epoch": 0.38138403714348884,
"grad_norm": 2.773153305053711,
"learning_rate": 3.477504349987572e-05,
"loss": 4.3022,
"step": 1725
},
{
"epoch": 0.38248949812071636,
"grad_norm": 2.8574771881103516,
"learning_rate": 3.471290082028337e-05,
"loss": 4.2579,
"step": 1730
},
{
"epoch": 0.38359495909794383,
"grad_norm": 2.725560426712036,
"learning_rate": 3.465075814069103e-05,
"loss": 4.2797,
"step": 1735
},
{
"epoch": 0.38470042007517136,
"grad_norm": 2.513237476348877,
"learning_rate": 3.4588615461098686e-05,
"loss": 4.405,
"step": 1740
},
{
"epoch": 0.3858058810523988,
"grad_norm": 2.718583822250366,
"learning_rate": 3.452647278150634e-05,
"loss": 4.2946,
"step": 1745
},
{
"epoch": 0.38691134202962635,
"grad_norm": 2.4899282455444336,
"learning_rate": 3.446433010191399e-05,
"loss": 4.269,
"step": 1750
},
{
"epoch": 0.38801680300685387,
"grad_norm": 2.5338146686553955,
"learning_rate": 3.4402187422321654e-05,
"loss": 4.4835,
"step": 1755
},
{
"epoch": 0.38912226398408134,
"grad_norm": 2.3587207794189453,
"learning_rate": 3.434004474272931e-05,
"loss": 4.1855,
"step": 1760
},
{
"epoch": 0.39022772496130886,
"grad_norm": 2.939471960067749,
"learning_rate": 3.427790206313696e-05,
"loss": 4.31,
"step": 1765
},
{
"epoch": 0.3913331859385364,
"grad_norm": 2.79874324798584,
"learning_rate": 3.421575938354462e-05,
"loss": 4.2398,
"step": 1770
},
{
"epoch": 0.39243864691576386,
"grad_norm": 2.5179383754730225,
"learning_rate": 3.415361670395227e-05,
"loss": 4.2628,
"step": 1775
},
{
"epoch": 0.3935441078929914,
"grad_norm": 2.731872797012329,
"learning_rate": 3.4091474024359935e-05,
"loss": 4.3159,
"step": 1780
},
{
"epoch": 0.3946495688702189,
"grad_norm": 2.5067148208618164,
"learning_rate": 3.402933134476759e-05,
"loss": 4.4061,
"step": 1785
},
{
"epoch": 0.39575502984744637,
"grad_norm": 2.3916046619415283,
"learning_rate": 3.396718866517524e-05,
"loss": 4.2791,
"step": 1790
},
{
"epoch": 0.3968604908246739,
"grad_norm": 2.6597490310668945,
"learning_rate": 3.3905045985582904e-05,
"loss": 4.4391,
"step": 1795
},
{
"epoch": 0.3979659518019014,
"grad_norm": 2.5750606060028076,
"learning_rate": 3.384290330599056e-05,
"loss": 4.1806,
"step": 1800
},
{
"epoch": 0.3990714127791289,
"grad_norm": 2.561917781829834,
"learning_rate": 3.378076062639821e-05,
"loss": 4.4584,
"step": 1805
},
{
"epoch": 0.4001768737563564,
"grad_norm": 2.576657772064209,
"learning_rate": 3.371861794680587e-05,
"loss": 4.1388,
"step": 1810
},
{
"epoch": 0.4012823347335839,
"grad_norm": 2.5817503929138184,
"learning_rate": 3.365647526721352e-05,
"loss": 4.3074,
"step": 1815
},
{
"epoch": 0.4023877957108114,
"grad_norm": 2.4846079349517822,
"learning_rate": 3.359433258762118e-05,
"loss": 4.3061,
"step": 1820
},
{
"epoch": 0.4034932566880389,
"grad_norm": 2.833554744720459,
"learning_rate": 3.353218990802884e-05,
"loss": 4.4506,
"step": 1825
},
{
"epoch": 0.4045987176652664,
"grad_norm": 2.6276683807373047,
"learning_rate": 3.347004722843649e-05,
"loss": 4.3484,
"step": 1830
},
{
"epoch": 0.4057041786424939,
"grad_norm": 2.6111786365509033,
"learning_rate": 3.3407904548844146e-05,
"loss": 4.4257,
"step": 1835
},
{
"epoch": 0.40680963961972144,
"grad_norm": 2.813497304916382,
"learning_rate": 3.33457618692518e-05,
"loss": 4.3713,
"step": 1840
},
{
"epoch": 0.4079151005969489,
"grad_norm": 2.7521538734436035,
"learning_rate": 3.328361918965946e-05,
"loss": 4.4385,
"step": 1845
},
{
"epoch": 0.40902056157417643,
"grad_norm": 2.503818988800049,
"learning_rate": 3.3221476510067115e-05,
"loss": 4.4288,
"step": 1850
},
{
"epoch": 0.41012602255140396,
"grad_norm": 2.3562381267547607,
"learning_rate": 3.315933383047477e-05,
"loss": 4.2368,
"step": 1855
},
{
"epoch": 0.4112314835286314,
"grad_norm": 2.526411294937134,
"learning_rate": 3.309719115088243e-05,
"loss": 4.3008,
"step": 1860
},
{
"epoch": 0.41233694450585895,
"grad_norm": 2.6222381591796875,
"learning_rate": 3.303504847129008e-05,
"loss": 4.1532,
"step": 1865
},
{
"epoch": 0.4134424054830865,
"grad_norm": 2.6735141277313232,
"learning_rate": 3.297290579169774e-05,
"loss": 4.2497,
"step": 1870
},
{
"epoch": 0.41454786646031394,
"grad_norm": 2.612273931503296,
"learning_rate": 3.2910763112105396e-05,
"loss": 4.365,
"step": 1875
},
{
"epoch": 0.41565332743754146,
"grad_norm": 2.7102086544036865,
"learning_rate": 3.284862043251305e-05,
"loss": 4.2006,
"step": 1880
},
{
"epoch": 0.41675878841476893,
"grad_norm": 2.8893067836761475,
"learning_rate": 3.278647775292071e-05,
"loss": 4.4635,
"step": 1885
},
{
"epoch": 0.41786424939199646,
"grad_norm": 2.6870336532592773,
"learning_rate": 3.2724335073328364e-05,
"loss": 4.3284,
"step": 1890
},
{
"epoch": 0.418969710369224,
"grad_norm": 2.454735279083252,
"learning_rate": 3.266219239373602e-05,
"loss": 4.2499,
"step": 1895
},
{
"epoch": 0.42007517134645145,
"grad_norm": 2.5673999786376953,
"learning_rate": 3.2600049714143676e-05,
"loss": 4.3258,
"step": 1900
},
{
"epoch": 0.42118063232367897,
"grad_norm": 2.435605049133301,
"learning_rate": 3.253790703455133e-05,
"loss": 4.2839,
"step": 1905
},
{
"epoch": 0.4222860933009065,
"grad_norm": 2.7508575916290283,
"learning_rate": 3.247576435495899e-05,
"loss": 4.4643,
"step": 1910
},
{
"epoch": 0.42339155427813396,
"grad_norm": 2.5757343769073486,
"learning_rate": 3.2413621675366645e-05,
"loss": 4.1323,
"step": 1915
},
{
"epoch": 0.4244970152553615,
"grad_norm": 2.409933567047119,
"learning_rate": 3.23514789957743e-05,
"loss": 4.2882,
"step": 1920
},
{
"epoch": 0.425602476232589,
"grad_norm": 2.4064886569976807,
"learning_rate": 3.228933631618196e-05,
"loss": 4.3503,
"step": 1925
},
{
"epoch": 0.4267079372098165,
"grad_norm": 2.539107322692871,
"learning_rate": 3.222719363658961e-05,
"loss": 4.3415,
"step": 1930
},
{
"epoch": 0.427813398187044,
"grad_norm": 2.70954966545105,
"learning_rate": 3.216505095699727e-05,
"loss": 4.3901,
"step": 1935
},
{
"epoch": 0.4289188591642715,
"grad_norm": 2.902268171310425,
"learning_rate": 3.2102908277404926e-05,
"loss": 4.3829,
"step": 1940
},
{
"epoch": 0.430024320141499,
"grad_norm": 2.919811487197876,
"learning_rate": 3.2040765597812575e-05,
"loss": 4.3388,
"step": 1945
},
{
"epoch": 0.4311297811187265,
"grad_norm": 2.765904188156128,
"learning_rate": 3.197862291822024e-05,
"loss": 4.2619,
"step": 1950
},
{
"epoch": 0.432235242095954,
"grad_norm": 2.6072490215301514,
"learning_rate": 3.1916480238627894e-05,
"loss": 4.272,
"step": 1955
},
{
"epoch": 0.4333407030731815,
"grad_norm": 2.694185256958008,
"learning_rate": 3.1854337559035544e-05,
"loss": 4.3295,
"step": 1960
},
{
"epoch": 0.43444616405040903,
"grad_norm": 2.6962716579437256,
"learning_rate": 3.179219487944321e-05,
"loss": 4.2222,
"step": 1965
},
{
"epoch": 0.4355516250276365,
"grad_norm": 2.681506395339966,
"learning_rate": 3.1730052199850856e-05,
"loss": 4.3914,
"step": 1970
},
{
"epoch": 0.436657086004864,
"grad_norm": 2.792881488800049,
"learning_rate": 3.166790952025851e-05,
"loss": 4.4958,
"step": 1975
},
{
"epoch": 0.43776254698209155,
"grad_norm": 2.6680564880371094,
"learning_rate": 3.1605766840666175e-05,
"loss": 4.3593,
"step": 1980
},
{
"epoch": 0.438868007959319,
"grad_norm": 2.7864387035369873,
"learning_rate": 3.1543624161073825e-05,
"loss": 4.3489,
"step": 1985
},
{
"epoch": 0.43997346893654654,
"grad_norm": 2.5795204639434814,
"learning_rate": 3.148148148148148e-05,
"loss": 4.269,
"step": 1990
},
{
"epoch": 0.44107892991377406,
"grad_norm": 2.851243019104004,
"learning_rate": 3.141933880188914e-05,
"loss": 4.3883,
"step": 1995
},
{
"epoch": 0.44218439089100153,
"grad_norm": 2.732250452041626,
"learning_rate": 3.135719612229679e-05,
"loss": 4.2467,
"step": 2000
},
{
"epoch": 0.44328985186822906,
"grad_norm": 2.4607598781585693,
"learning_rate": 3.1295053442704456e-05,
"loss": 4.3155,
"step": 2005
},
{
"epoch": 0.4443953128454566,
"grad_norm": 2.546980857849121,
"learning_rate": 3.1232910763112105e-05,
"loss": 4.3949,
"step": 2010
},
{
"epoch": 0.44550077382268405,
"grad_norm": 2.734762191772461,
"learning_rate": 3.117076808351976e-05,
"loss": 4.46,
"step": 2015
},
{
"epoch": 0.4466062347999116,
"grad_norm": 2.5129942893981934,
"learning_rate": 3.110862540392742e-05,
"loss": 4.3879,
"step": 2020
},
{
"epoch": 0.44771169577713904,
"grad_norm": 2.644542694091797,
"learning_rate": 3.1046482724335074e-05,
"loss": 4.2476,
"step": 2025
},
{
"epoch": 0.44881715675436656,
"grad_norm": 2.771726369857788,
"learning_rate": 3.098434004474273e-05,
"loss": 4.4844,
"step": 2030
},
{
"epoch": 0.4499226177315941,
"grad_norm": 2.642275333404541,
"learning_rate": 3.0922197365150386e-05,
"loss": 4.3922,
"step": 2035
},
{
"epoch": 0.45102807870882156,
"grad_norm": 2.6931073665618896,
"learning_rate": 3.086005468555804e-05,
"loss": 4.3635,
"step": 2040
},
{
"epoch": 0.4521335396860491,
"grad_norm": 2.4507226943969727,
"learning_rate": 3.07979120059657e-05,
"loss": 4.3413,
"step": 2045
},
{
"epoch": 0.4532390006632766,
"grad_norm": 2.632704496383667,
"learning_rate": 3.0735769326373355e-05,
"loss": 4.324,
"step": 2050
},
{
"epoch": 0.45434446164050407,
"grad_norm": 2.6872873306274414,
"learning_rate": 3.067362664678101e-05,
"loss": 4.3887,
"step": 2055
},
{
"epoch": 0.4554499226177316,
"grad_norm": 2.8722641468048096,
"learning_rate": 3.061148396718867e-05,
"loss": 4.3594,
"step": 2060
},
{
"epoch": 0.4565553835949591,
"grad_norm": 2.642021417617798,
"learning_rate": 3.054934128759632e-05,
"loss": 4.266,
"step": 2065
},
{
"epoch": 0.4576608445721866,
"grad_norm": 2.8870849609375,
"learning_rate": 3.048719860800398e-05,
"loss": 4.4626,
"step": 2070
},
{
"epoch": 0.4587663055494141,
"grad_norm": 2.623518943786621,
"learning_rate": 3.0425055928411632e-05,
"loss": 4.3157,
"step": 2075
},
{
"epoch": 0.45987176652664163,
"grad_norm": 2.5889763832092285,
"learning_rate": 3.0362913248819292e-05,
"loss": 4.2704,
"step": 2080
},
{
"epoch": 0.4609772275038691,
"grad_norm": 2.8086538314819336,
"learning_rate": 3.0300770569226945e-05,
"loss": 4.3561,
"step": 2085
},
{
"epoch": 0.4620826884810966,
"grad_norm": 2.896907091140747,
"learning_rate": 3.02386278896346e-05,
"loss": 4.4201,
"step": 2090
},
{
"epoch": 0.4631881494583241,
"grad_norm": 2.5891048908233643,
"learning_rate": 3.017648521004226e-05,
"loss": 4.2137,
"step": 2095
},
{
"epoch": 0.4642936104355516,
"grad_norm": 2.5606133937835693,
"learning_rate": 3.0114342530449913e-05,
"loss": 4.3985,
"step": 2100
},
{
"epoch": 0.46539907141277914,
"grad_norm": 2.7957265377044678,
"learning_rate": 3.005219985085757e-05,
"loss": 4.395,
"step": 2105
},
{
"epoch": 0.4665045323900066,
"grad_norm": 2.593770742416382,
"learning_rate": 2.999005717126523e-05,
"loss": 4.4711,
"step": 2110
},
{
"epoch": 0.46760999336723413,
"grad_norm": 2.482818603515625,
"learning_rate": 2.992791449167288e-05,
"loss": 4.2323,
"step": 2115
},
{
"epoch": 0.46871545434446166,
"grad_norm": 2.972776174545288,
"learning_rate": 2.986577181208054e-05,
"loss": 4.3602,
"step": 2120
},
{
"epoch": 0.4698209153216891,
"grad_norm": 2.5987308025360107,
"learning_rate": 2.980362913248819e-05,
"loss": 4.5967,
"step": 2125
},
{
"epoch": 0.47092637629891665,
"grad_norm": 2.6634702682495117,
"learning_rate": 2.974148645289585e-05,
"loss": 4.1932,
"step": 2130
},
{
"epoch": 0.4720318372761442,
"grad_norm": 2.720262050628662,
"learning_rate": 2.967934377330351e-05,
"loss": 4.1392,
"step": 2135
},
{
"epoch": 0.47313729825337164,
"grad_norm": 2.9388368129730225,
"learning_rate": 2.9617201093711163e-05,
"loss": 4.2334,
"step": 2140
},
{
"epoch": 0.47424275923059916,
"grad_norm": 2.426968812942505,
"learning_rate": 2.955505841411882e-05,
"loss": 4.1942,
"step": 2145
},
{
"epoch": 0.4753482202078267,
"grad_norm": 2.53849458694458,
"learning_rate": 2.949291573452647e-05,
"loss": 4.4471,
"step": 2150
},
{
"epoch": 0.47645368118505416,
"grad_norm": 2.7019786834716797,
"learning_rate": 2.943077305493413e-05,
"loss": 4.2433,
"step": 2155
},
{
"epoch": 0.4775591421622817,
"grad_norm": 2.578589677810669,
"learning_rate": 2.9368630375341787e-05,
"loss": 4.2682,
"step": 2160
},
{
"epoch": 0.47866460313950915,
"grad_norm": 2.7424092292785645,
"learning_rate": 2.930648769574944e-05,
"loss": 4.4,
"step": 2165
},
{
"epoch": 0.47977006411673667,
"grad_norm": 2.6316614151000977,
"learning_rate": 2.92443450161571e-05,
"loss": 4.249,
"step": 2170
},
{
"epoch": 0.4808755250939642,
"grad_norm": 2.757974624633789,
"learning_rate": 2.9182202336564756e-05,
"loss": 4.3832,
"step": 2175
},
{
"epoch": 0.48198098607119166,
"grad_norm": 2.591416597366333,
"learning_rate": 2.912005965697241e-05,
"loss": 4.4295,
"step": 2180
},
{
"epoch": 0.4830864470484192,
"grad_norm": 2.576218605041504,
"learning_rate": 2.9057916977380068e-05,
"loss": 4.3352,
"step": 2185
},
{
"epoch": 0.4841919080256467,
"grad_norm": 2.5569541454315186,
"learning_rate": 2.899577429778772e-05,
"loss": 4.1921,
"step": 2190
},
{
"epoch": 0.4852973690028742,
"grad_norm": 2.489694118499756,
"learning_rate": 2.8933631618195377e-05,
"loss": 4.3463,
"step": 2195
},
{
"epoch": 0.4864028299801017,
"grad_norm": 2.486515522003174,
"learning_rate": 2.8871488938603037e-05,
"loss": 4.217,
"step": 2200
},
{
"epoch": 0.4875082909573292,
"grad_norm": 2.6798512935638428,
"learning_rate": 2.880934625901069e-05,
"loss": 4.3241,
"step": 2205
},
{
"epoch": 0.4886137519345567,
"grad_norm": 2.582374095916748,
"learning_rate": 2.8747203579418346e-05,
"loss": 4.3155,
"step": 2210
},
{
"epoch": 0.4897192129117842,
"grad_norm": 2.598309278488159,
"learning_rate": 2.8685060899826e-05,
"loss": 4.3281,
"step": 2215
},
{
"epoch": 0.49082467388901174,
"grad_norm": 2.5720064640045166,
"learning_rate": 2.8622918220233658e-05,
"loss": 4.3937,
"step": 2220
},
{
"epoch": 0.4919301348662392,
"grad_norm": 2.4057793617248535,
"learning_rate": 2.8560775540641317e-05,
"loss": 4.2625,
"step": 2225
},
{
"epoch": 0.49303559584346673,
"grad_norm": 2.5601112842559814,
"learning_rate": 2.8498632861048967e-05,
"loss": 4.2416,
"step": 2230
},
{
"epoch": 0.4941410568206942,
"grad_norm": 2.621948003768921,
"learning_rate": 2.8436490181456626e-05,
"loss": 4.439,
"step": 2235
},
{
"epoch": 0.4952465177979217,
"grad_norm": 2.5221333503723145,
"learning_rate": 2.837434750186428e-05,
"loss": 4.3375,
"step": 2240
},
{
"epoch": 0.49635197877514925,
"grad_norm": 2.555539608001709,
"learning_rate": 2.831220482227194e-05,
"loss": 4.3071,
"step": 2245
},
{
"epoch": 0.4974574397523767,
"grad_norm": 2.71470308303833,
"learning_rate": 2.8250062142679595e-05,
"loss": 4.3431,
"step": 2250
},
{
"epoch": 0.49856290072960424,
"grad_norm": 2.731353759765625,
"learning_rate": 2.8187919463087248e-05,
"loss": 4.4328,
"step": 2255
},
{
"epoch": 0.49966836170683177,
"grad_norm": 2.527031183242798,
"learning_rate": 2.8125776783494907e-05,
"loss": 4.3326,
"step": 2260
},
{
"epoch": 0.5007738226840592,
"grad_norm": 2.539781332015991,
"learning_rate": 2.8063634103902563e-05,
"loss": 4.3398,
"step": 2265
},
{
"epoch": 0.5018792836612868,
"grad_norm": 2.465778350830078,
"learning_rate": 2.8001491424310216e-05,
"loss": 4.1966,
"step": 2270
},
{
"epoch": 0.5029847446385143,
"grad_norm": 2.610877513885498,
"learning_rate": 2.7939348744717876e-05,
"loss": 4.4339,
"step": 2275
},
{
"epoch": 0.5040902056157418,
"grad_norm": 2.833237409591675,
"learning_rate": 2.787720606512553e-05,
"loss": 4.258,
"step": 2280
},
{
"epoch": 0.5051956665929692,
"grad_norm": 2.681429386138916,
"learning_rate": 2.7815063385533185e-05,
"loss": 4.3174,
"step": 2285
},
{
"epoch": 0.5063011275701967,
"grad_norm": 2.621767044067383,
"learning_rate": 2.7752920705940844e-05,
"loss": 4.3556,
"step": 2290
},
{
"epoch": 0.5074065885474243,
"grad_norm": 2.3988664150238037,
"learning_rate": 2.7690778026348497e-05,
"loss": 4.4304,
"step": 2295
},
{
"epoch": 0.5085120495246518,
"grad_norm": 2.6011765003204346,
"learning_rate": 2.7628635346756153e-05,
"loss": 4.3996,
"step": 2300
},
{
"epoch": 0.5096175105018793,
"grad_norm": 2.5418872833251953,
"learning_rate": 2.7566492667163806e-05,
"loss": 4.3227,
"step": 2305
},
{
"epoch": 0.5107229714791068,
"grad_norm": 2.7040741443634033,
"learning_rate": 2.7504349987571466e-05,
"loss": 4.3522,
"step": 2310
},
{
"epoch": 0.5118284324563342,
"grad_norm": 2.4782514572143555,
"learning_rate": 2.7442207307979122e-05,
"loss": 4.2093,
"step": 2315
},
{
"epoch": 0.5129338934335618,
"grad_norm": 2.709933042526245,
"learning_rate": 2.7380064628386775e-05,
"loss": 4.3424,
"step": 2320
},
{
"epoch": 0.5140393544107893,
"grad_norm": 3.0086729526519775,
"learning_rate": 2.7317921948794434e-05,
"loss": 4.5041,
"step": 2325
},
{
"epoch": 0.5151448153880168,
"grad_norm": 2.5372843742370605,
"learning_rate": 2.725577926920209e-05,
"loss": 4.3018,
"step": 2330
},
{
"epoch": 0.5162502763652443,
"grad_norm": 2.94974684715271,
"learning_rate": 2.7193636589609743e-05,
"loss": 4.2941,
"step": 2335
},
{
"epoch": 0.5173557373424718,
"grad_norm": 2.7399137020111084,
"learning_rate": 2.7131493910017403e-05,
"loss": 4.2627,
"step": 2340
},
{
"epoch": 0.5184611983196993,
"grad_norm": 2.6174683570861816,
"learning_rate": 2.7069351230425055e-05,
"loss": 4.2011,
"step": 2345
},
{
"epoch": 0.5195666592969268,
"grad_norm": 2.434396266937256,
"learning_rate": 2.7007208550832715e-05,
"loss": 4.2168,
"step": 2350
},
{
"epoch": 0.5206721202741543,
"grad_norm": 2.5760498046875,
"learning_rate": 2.694506587124037e-05,
"loss": 4.3722,
"step": 2355
},
{
"epoch": 0.5217775812513819,
"grad_norm": 2.616143226623535,
"learning_rate": 2.6882923191648024e-05,
"loss": 4.1671,
"step": 2360
},
{
"epoch": 0.5228830422286094,
"grad_norm": 2.406928539276123,
"learning_rate": 2.6820780512055683e-05,
"loss": 4.2319,
"step": 2365
},
{
"epoch": 0.5239885032058368,
"grad_norm": 2.4793832302093506,
"learning_rate": 2.6758637832463336e-05,
"loss": 4.2182,
"step": 2370
},
{
"epoch": 0.5250939641830643,
"grad_norm": 2.757474660873413,
"learning_rate": 2.6696495152870992e-05,
"loss": 4.4572,
"step": 2375
},
{
"epoch": 0.5261994251602918,
"grad_norm": 2.7199547290802,
"learning_rate": 2.6634352473278652e-05,
"loss": 4.2871,
"step": 2380
},
{
"epoch": 0.5273048861375194,
"grad_norm": 2.6695070266723633,
"learning_rate": 2.6572209793686305e-05,
"loss": 4.3649,
"step": 2385
},
{
"epoch": 0.5284103471147469,
"grad_norm": 2.5903425216674805,
"learning_rate": 2.651006711409396e-05,
"loss": 4.3604,
"step": 2390
},
{
"epoch": 0.5295158080919744,
"grad_norm": 2.871863842010498,
"learning_rate": 2.644792443450162e-05,
"loss": 4.2315,
"step": 2395
},
{
"epoch": 0.5306212690692018,
"grad_norm": 2.49452543258667,
"learning_rate": 2.6385781754909273e-05,
"loss": 4.3564,
"step": 2400
},
{
"epoch": 0.5317267300464293,
"grad_norm": 2.6567633152008057,
"learning_rate": 2.632363907531693e-05,
"loss": 4.2627,
"step": 2405
},
{
"epoch": 0.5328321910236569,
"grad_norm": 2.6986489295959473,
"learning_rate": 2.6261496395724582e-05,
"loss": 4.1613,
"step": 2410
},
{
"epoch": 0.5339376520008844,
"grad_norm": 2.942229986190796,
"learning_rate": 2.6199353716132242e-05,
"loss": 4.3428,
"step": 2415
},
{
"epoch": 0.5350431129781119,
"grad_norm": 2.7262582778930664,
"learning_rate": 2.6137211036539898e-05,
"loss": 4.273,
"step": 2420
},
{
"epoch": 0.5361485739553393,
"grad_norm": 2.6394593715667725,
"learning_rate": 2.607506835694755e-05,
"loss": 4.3921,
"step": 2425
},
{
"epoch": 0.5372540349325668,
"grad_norm": 2.6989800930023193,
"learning_rate": 2.601292567735521e-05,
"loss": 4.3518,
"step": 2430
},
{
"epoch": 0.5383594959097944,
"grad_norm": 2.593045711517334,
"learning_rate": 2.5950782997762863e-05,
"loss": 4.301,
"step": 2435
},
{
"epoch": 0.5394649568870219,
"grad_norm": 2.5254459381103516,
"learning_rate": 2.588864031817052e-05,
"loss": 4.209,
"step": 2440
},
{
"epoch": 0.5405704178642494,
"grad_norm": 2.765732526779175,
"learning_rate": 2.582649763857818e-05,
"loss": 4.2955,
"step": 2445
},
{
"epoch": 0.5416758788414769,
"grad_norm": 2.780750274658203,
"learning_rate": 2.576435495898583e-05,
"loss": 4.3846,
"step": 2450
},
{
"epoch": 0.5427813398187044,
"grad_norm": 2.811513662338257,
"learning_rate": 2.5702212279393488e-05,
"loss": 4.4617,
"step": 2455
},
{
"epoch": 0.5438868007959319,
"grad_norm": 2.5271966457366943,
"learning_rate": 2.564006959980114e-05,
"loss": 4.1798,
"step": 2460
},
{
"epoch": 0.5449922617731594,
"grad_norm": 2.721851348876953,
"learning_rate": 2.55779269202088e-05,
"loss": 4.2644,
"step": 2465
},
{
"epoch": 0.5460977227503869,
"grad_norm": 2.618861436843872,
"learning_rate": 2.551578424061646e-05,
"loss": 4.317,
"step": 2470
},
{
"epoch": 0.5472031837276145,
"grad_norm": 2.3622546195983887,
"learning_rate": 2.545364156102411e-05,
"loss": 4.4589,
"step": 2475
},
{
"epoch": 0.548308644704842,
"grad_norm": 2.5185422897338867,
"learning_rate": 2.539149888143177e-05,
"loss": 4.2975,
"step": 2480
},
{
"epoch": 0.5494141056820694,
"grad_norm": 2.54284930229187,
"learning_rate": 2.5329356201839428e-05,
"loss": 4.29,
"step": 2485
},
{
"epoch": 0.5505195666592969,
"grad_norm": 2.4982147216796875,
"learning_rate": 2.526721352224708e-05,
"loss": 4.3835,
"step": 2490
},
{
"epoch": 0.5516250276365244,
"grad_norm": 2.5386240482330322,
"learning_rate": 2.5205070842654737e-05,
"loss": 4.4286,
"step": 2495
},
{
"epoch": 0.552730488613752,
"grad_norm": 2.5726940631866455,
"learning_rate": 2.514292816306239e-05,
"loss": 4.3666,
"step": 2500
},
{
"epoch": 0.5538359495909795,
"grad_norm": 2.802129030227661,
"learning_rate": 2.508078548347005e-05,
"loss": 4.3205,
"step": 2505
},
{
"epoch": 0.5549414105682069,
"grad_norm": 2.713815212249756,
"learning_rate": 2.5018642803877706e-05,
"loss": 4.2775,
"step": 2510
},
{
"epoch": 0.5560468715454344,
"grad_norm": 2.597898244857788,
"learning_rate": 2.495650012428536e-05,
"loss": 4.2644,
"step": 2515
},
{
"epoch": 0.5571523325226619,
"grad_norm": 2.6316134929656982,
"learning_rate": 2.4894357444693018e-05,
"loss": 4.3634,
"step": 2520
},
{
"epoch": 0.5582577934998895,
"grad_norm": 2.663684129714966,
"learning_rate": 2.4832214765100674e-05,
"loss": 4.2632,
"step": 2525
},
{
"epoch": 0.559363254477117,
"grad_norm": 2.669243574142456,
"learning_rate": 2.4770072085508327e-05,
"loss": 4.3728,
"step": 2530
},
{
"epoch": 0.5604687154543445,
"grad_norm": 2.6854679584503174,
"learning_rate": 2.4707929405915983e-05,
"loss": 4.2938,
"step": 2535
},
{
"epoch": 0.5615741764315719,
"grad_norm": 2.625131130218506,
"learning_rate": 2.4645786726323643e-05,
"loss": 4.3859,
"step": 2540
},
{
"epoch": 0.5626796374087994,
"grad_norm": 2.6042797565460205,
"learning_rate": 2.4583644046731296e-05,
"loss": 4.2591,
"step": 2545
},
{
"epoch": 0.563785098386027,
"grad_norm": 2.763540267944336,
"learning_rate": 2.452150136713895e-05,
"loss": 4.2657,
"step": 2550
},
{
"epoch": 0.5648905593632545,
"grad_norm": 2.8229899406433105,
"learning_rate": 2.4459358687546608e-05,
"loss": 4.3078,
"step": 2555
},
{
"epoch": 0.565996020340482,
"grad_norm": 2.8097963333129883,
"learning_rate": 2.4397216007954264e-05,
"loss": 4.3871,
"step": 2560
},
{
"epoch": 0.5671014813177094,
"grad_norm": 2.6240086555480957,
"learning_rate": 2.433507332836192e-05,
"loss": 4.286,
"step": 2565
},
{
"epoch": 0.568206942294937,
"grad_norm": 2.685115098953247,
"learning_rate": 2.4272930648769576e-05,
"loss": 4.2783,
"step": 2570
},
{
"epoch": 0.5693124032721645,
"grad_norm": 2.697061538696289,
"learning_rate": 2.4210787969177233e-05,
"loss": 4.4211,
"step": 2575
},
{
"epoch": 0.570417864249392,
"grad_norm": 2.8929386138916016,
"learning_rate": 2.4148645289584885e-05,
"loss": 4.3608,
"step": 2580
},
{
"epoch": 0.5715233252266195,
"grad_norm": 2.6032614707946777,
"learning_rate": 2.4086502609992545e-05,
"loss": 4.2024,
"step": 2585
},
{
"epoch": 0.572628786203847,
"grad_norm": 2.629255533218384,
"learning_rate": 2.40243599304002e-05,
"loss": 4.4302,
"step": 2590
},
{
"epoch": 0.5737342471810745,
"grad_norm": 2.5833659172058105,
"learning_rate": 2.3962217250807857e-05,
"loss": 4.372,
"step": 2595
},
{
"epoch": 0.574839708158302,
"grad_norm": 2.425273895263672,
"learning_rate": 2.390007457121551e-05,
"loss": 4.2089,
"step": 2600
},
{
"epoch": 0.5759451691355295,
"grad_norm": 2.651646375656128,
"learning_rate": 2.383793189162317e-05,
"loss": 4.2374,
"step": 2605
},
{
"epoch": 0.577050630112757,
"grad_norm": 2.894827365875244,
"learning_rate": 2.3775789212030826e-05,
"loss": 4.105,
"step": 2610
},
{
"epoch": 0.5781560910899846,
"grad_norm": 2.646923780441284,
"learning_rate": 2.371364653243848e-05,
"loss": 4.3908,
"step": 2615
},
{
"epoch": 0.5792615520672121,
"grad_norm": 2.8050379753112793,
"learning_rate": 2.3651503852846135e-05,
"loss": 4.3573,
"step": 2620
},
{
"epoch": 0.5803670130444395,
"grad_norm": 2.8766565322875977,
"learning_rate": 2.358936117325379e-05,
"loss": 4.2688,
"step": 2625
},
{
"epoch": 0.581472474021667,
"grad_norm": 2.452597141265869,
"learning_rate": 2.3527218493661447e-05,
"loss": 4.3922,
"step": 2630
},
{
"epoch": 0.5825779349988945,
"grad_norm": 2.8422110080718994,
"learning_rate": 2.3465075814069103e-05,
"loss": 4.3008,
"step": 2635
},
{
"epoch": 0.5836833959761221,
"grad_norm": 2.661015033721924,
"learning_rate": 2.340293313447676e-05,
"loss": 4.2432,
"step": 2640
},
{
"epoch": 0.5847888569533496,
"grad_norm": 2.7962839603424072,
"learning_rate": 2.3340790454884416e-05,
"loss": 4.4387,
"step": 2645
},
{
"epoch": 0.585894317930577,
"grad_norm": 2.807640552520752,
"learning_rate": 2.3278647775292072e-05,
"loss": 4.3026,
"step": 2650
},
{
"epoch": 0.5869997789078045,
"grad_norm": 2.77174711227417,
"learning_rate": 2.3216505095699728e-05,
"loss": 4.3376,
"step": 2655
},
{
"epoch": 0.588105239885032,
"grad_norm": 2.6385319232940674,
"learning_rate": 2.3154362416107384e-05,
"loss": 4.211,
"step": 2660
},
{
"epoch": 0.5892107008622596,
"grad_norm": 2.464839458465576,
"learning_rate": 2.309221973651504e-05,
"loss": 4.1263,
"step": 2665
},
{
"epoch": 0.5903161618394871,
"grad_norm": 2.5542917251586914,
"learning_rate": 2.3030077056922693e-05,
"loss": 4.281,
"step": 2670
},
{
"epoch": 0.5914216228167146,
"grad_norm": 2.796891450881958,
"learning_rate": 2.2967934377330353e-05,
"loss": 4.2626,
"step": 2675
},
{
"epoch": 0.592527083793942,
"grad_norm": 2.6826398372650146,
"learning_rate": 2.290579169773801e-05,
"loss": 4.1999,
"step": 2680
},
{
"epoch": 0.5936325447711696,
"grad_norm": 2.77254581451416,
"learning_rate": 2.284364901814566e-05,
"loss": 4.3298,
"step": 2685
},
{
"epoch": 0.5947380057483971,
"grad_norm": 2.6188175678253174,
"learning_rate": 2.2781506338553318e-05,
"loss": 4.2272,
"step": 2690
},
{
"epoch": 0.5958434667256246,
"grad_norm": 2.374133825302124,
"learning_rate": 2.2719363658960977e-05,
"loss": 4.425,
"step": 2695
},
{
"epoch": 0.5969489277028521,
"grad_norm": 2.516446352005005,
"learning_rate": 2.2657220979368633e-05,
"loss": 4.3096,
"step": 2700
},
{
"epoch": 0.5980543886800795,
"grad_norm": 2.5473289489746094,
"learning_rate": 2.2595078299776286e-05,
"loss": 4.3916,
"step": 2705
},
{
"epoch": 0.5991598496573071,
"grad_norm": 2.9763638973236084,
"learning_rate": 2.2532935620183942e-05,
"loss": 4.2488,
"step": 2710
},
{
"epoch": 0.6002653106345346,
"grad_norm": 2.831369161605835,
"learning_rate": 2.2470792940591602e-05,
"loss": 4.4136,
"step": 2715
},
{
"epoch": 0.6013707716117621,
"grad_norm": 2.77677845954895,
"learning_rate": 2.2408650260999255e-05,
"loss": 4.3703,
"step": 2720
},
{
"epoch": 0.6024762325889896,
"grad_norm": 3.102226972579956,
"learning_rate": 2.234650758140691e-05,
"loss": 4.389,
"step": 2725
},
{
"epoch": 0.6035816935662172,
"grad_norm": 2.694725275039673,
"learning_rate": 2.2284364901814567e-05,
"loss": 4.3748,
"step": 2730
},
{
"epoch": 0.6046871545434446,
"grad_norm": 2.628998041152954,
"learning_rate": 2.2222222222222223e-05,
"loss": 4.2702,
"step": 2735
},
{
"epoch": 0.6057926155206721,
"grad_norm": 2.5050158500671387,
"learning_rate": 2.216007954262988e-05,
"loss": 4.4498,
"step": 2740
},
{
"epoch": 0.6068980764978996,
"grad_norm": 3.0304501056671143,
"learning_rate": 2.2097936863037536e-05,
"loss": 4.2093,
"step": 2745
},
{
"epoch": 0.6080035374751271,
"grad_norm": 2.7480475902557373,
"learning_rate": 2.2035794183445192e-05,
"loss": 4.452,
"step": 2750
},
{
"epoch": 0.6091089984523547,
"grad_norm": 2.5752625465393066,
"learning_rate": 2.1973651503852845e-05,
"loss": 4.1986,
"step": 2755
},
{
"epoch": 0.6102144594295822,
"grad_norm": 2.9249074459075928,
"learning_rate": 2.1911508824260504e-05,
"loss": 4.2884,
"step": 2760
},
{
"epoch": 0.6113199204068096,
"grad_norm": 2.565080165863037,
"learning_rate": 2.184936614466816e-05,
"loss": 4.3698,
"step": 2765
},
{
"epoch": 0.6124253813840371,
"grad_norm": 2.9593536853790283,
"learning_rate": 2.1787223465075816e-05,
"loss": 4.4363,
"step": 2770
},
{
"epoch": 0.6135308423612647,
"grad_norm": 2.698092460632324,
"learning_rate": 2.172508078548347e-05,
"loss": 4.4131,
"step": 2775
},
{
"epoch": 0.6146363033384922,
"grad_norm": 2.6179697513580322,
"learning_rate": 2.1662938105891125e-05,
"loss": 4.2489,
"step": 2780
},
{
"epoch": 0.6157417643157197,
"grad_norm": 2.7725419998168945,
"learning_rate": 2.1600795426298785e-05,
"loss": 4.3455,
"step": 2785
},
{
"epoch": 0.6168472252929471,
"grad_norm": 2.5519633293151855,
"learning_rate": 2.1538652746706438e-05,
"loss": 4.3074,
"step": 2790
},
{
"epoch": 0.6179526862701746,
"grad_norm": 2.6183152198791504,
"learning_rate": 2.1476510067114094e-05,
"loss": 4.3562,
"step": 2795
},
{
"epoch": 0.6190581472474022,
"grad_norm": 2.5165317058563232,
"learning_rate": 2.141436738752175e-05,
"loss": 4.2388,
"step": 2800
},
{
"epoch": 0.6201636082246297,
"grad_norm": 2.813973903656006,
"learning_rate": 2.1352224707929406e-05,
"loss": 4.2732,
"step": 2805
},
{
"epoch": 0.6212690692018572,
"grad_norm": 2.489633798599243,
"learning_rate": 2.1290082028337062e-05,
"loss": 4.183,
"step": 2810
},
{
"epoch": 0.6223745301790847,
"grad_norm": 2.606971502304077,
"learning_rate": 2.122793934874472e-05,
"loss": 4.3127,
"step": 2815
},
{
"epoch": 0.6234799911563121,
"grad_norm": 2.74040150642395,
"learning_rate": 2.1165796669152375e-05,
"loss": 4.3576,
"step": 2820
},
{
"epoch": 0.6245854521335397,
"grad_norm": 2.814483642578125,
"learning_rate": 2.110365398956003e-05,
"loss": 4.345,
"step": 2825
},
{
"epoch": 0.6256909131107672,
"grad_norm": 2.4296274185180664,
"learning_rate": 2.1041511309967687e-05,
"loss": 4.2154,
"step": 2830
},
{
"epoch": 0.6267963740879947,
"grad_norm": 3.018310785293579,
"learning_rate": 2.0979368630375343e-05,
"loss": 4.2779,
"step": 2835
},
{
"epoch": 0.6279018350652222,
"grad_norm": 2.85764741897583,
"learning_rate": 2.0917225950783e-05,
"loss": 4.2533,
"step": 2840
},
{
"epoch": 0.6290072960424496,
"grad_norm": 2.690497398376465,
"learning_rate": 2.0855083271190652e-05,
"loss": 4.3148,
"step": 2845
},
{
"epoch": 0.6301127570196772,
"grad_norm": 2.5241053104400635,
"learning_rate": 2.0792940591598312e-05,
"loss": 4.3019,
"step": 2850
},
{
"epoch": 0.6312182179969047,
"grad_norm": 2.63004732131958,
"learning_rate": 2.0730797912005968e-05,
"loss": 4.3274,
"step": 2855
},
{
"epoch": 0.6323236789741322,
"grad_norm": 2.6619880199432373,
"learning_rate": 2.066865523241362e-05,
"loss": 4.4063,
"step": 2860
},
{
"epoch": 0.6334291399513597,
"grad_norm": 2.918989419937134,
"learning_rate": 2.0606512552821277e-05,
"loss": 4.3446,
"step": 2865
},
{
"epoch": 0.6345346009285873,
"grad_norm": 2.6898226737976074,
"learning_rate": 2.0544369873228937e-05,
"loss": 4.3895,
"step": 2870
},
{
"epoch": 0.6356400619058147,
"grad_norm": 2.659388542175293,
"learning_rate": 2.0482227193636593e-05,
"loss": 4.2844,
"step": 2875
},
{
"epoch": 0.6367455228830422,
"grad_norm": 2.9145493507385254,
"learning_rate": 2.0420084514044246e-05,
"loss": 4.234,
"step": 2880
},
{
"epoch": 0.6378509838602697,
"grad_norm": 2.542527198791504,
"learning_rate": 2.03579418344519e-05,
"loss": 4.2848,
"step": 2885
},
{
"epoch": 0.6389564448374973,
"grad_norm": 2.690652847290039,
"learning_rate": 2.0295799154859558e-05,
"loss": 4.2601,
"step": 2890
},
{
"epoch": 0.6400619058147248,
"grad_norm": 2.74469256401062,
"learning_rate": 2.0233656475267214e-05,
"loss": 4.2875,
"step": 2895
},
{
"epoch": 0.6411673667919523,
"grad_norm": 2.5279908180236816,
"learning_rate": 2.017151379567487e-05,
"loss": 4.3336,
"step": 2900
},
{
"epoch": 0.6422728277691797,
"grad_norm": 2.6275908946990967,
"learning_rate": 2.0109371116082526e-05,
"loss": 4.3125,
"step": 2905
},
{
"epoch": 0.6433782887464072,
"grad_norm": 2.629896879196167,
"learning_rate": 2.0047228436490183e-05,
"loss": 4.3233,
"step": 2910
},
{
"epoch": 0.6444837497236348,
"grad_norm": 2.8916358947753906,
"learning_rate": 1.998508575689784e-05,
"loss": 4.2835,
"step": 2915
},
{
"epoch": 0.6455892107008623,
"grad_norm": 2.6450507640838623,
"learning_rate": 1.9922943077305495e-05,
"loss": 4.3504,
"step": 2920
},
{
"epoch": 0.6466946716780898,
"grad_norm": 2.617589235305786,
"learning_rate": 1.986080039771315e-05,
"loss": 4.4431,
"step": 2925
},
{
"epoch": 0.6478001326553172,
"grad_norm": 2.4875051975250244,
"learning_rate": 1.9798657718120804e-05,
"loss": 4.3341,
"step": 2930
},
{
"epoch": 0.6489055936325447,
"grad_norm": 2.5593132972717285,
"learning_rate": 1.9736515038528463e-05,
"loss": 4.335,
"step": 2935
},
{
"epoch": 0.6500110546097723,
"grad_norm": 2.687657594680786,
"learning_rate": 1.967437235893612e-05,
"loss": 4.3632,
"step": 2940
},
{
"epoch": 0.6511165155869998,
"grad_norm": 2.605257987976074,
"learning_rate": 1.9612229679343776e-05,
"loss": 4.3999,
"step": 2945
},
{
"epoch": 0.6522219765642273,
"grad_norm": 2.3589608669281006,
"learning_rate": 1.955008699975143e-05,
"loss": 4.2815,
"step": 2950
},
{
"epoch": 0.6533274375414548,
"grad_norm": 2.8207266330718994,
"learning_rate": 1.9487944320159085e-05,
"loss": 4.2614,
"step": 2955
},
{
"epoch": 0.6544328985186822,
"grad_norm": 2.7098288536071777,
"learning_rate": 1.9425801640566744e-05,
"loss": 4.2278,
"step": 2960
},
{
"epoch": 0.6555383594959098,
"grad_norm": 2.819708824157715,
"learning_rate": 1.9363658960974397e-05,
"loss": 4.22,
"step": 2965
},
{
"epoch": 0.6566438204731373,
"grad_norm": 2.7340097427368164,
"learning_rate": 1.9301516281382053e-05,
"loss": 4.2767,
"step": 2970
},
{
"epoch": 0.6577492814503648,
"grad_norm": 2.6747171878814697,
"learning_rate": 1.923937360178971e-05,
"loss": 4.3268,
"step": 2975
},
{
"epoch": 0.6588547424275923,
"grad_norm": 2.5896904468536377,
"learning_rate": 1.917723092219737e-05,
"loss": 4.309,
"step": 2980
},
{
"epoch": 0.6599602034048198,
"grad_norm": 2.6400575637817383,
"learning_rate": 1.9115088242605022e-05,
"loss": 4.2878,
"step": 2985
},
{
"epoch": 0.6610656643820473,
"grad_norm": 2.62795352935791,
"learning_rate": 1.9052945563012678e-05,
"loss": 4.3861,
"step": 2990
},
{
"epoch": 0.6621711253592748,
"grad_norm": 2.7335047721862793,
"learning_rate": 1.8990802883420334e-05,
"loss": 4.2773,
"step": 2995
},
{
"epoch": 0.6632765863365023,
"grad_norm": 2.781811237335205,
"learning_rate": 1.892866020382799e-05,
"loss": 4.3049,
"step": 3000
},
{
"epoch": 0.6643820473137299,
"grad_norm": 2.65694522857666,
"learning_rate": 1.8866517524235646e-05,
"loss": 4.2534,
"step": 3005
},
{
"epoch": 0.6654875082909574,
"grad_norm": 2.611654043197632,
"learning_rate": 1.8804374844643303e-05,
"loss": 4.2397,
"step": 3010
},
{
"epoch": 0.6665929692681848,
"grad_norm": 2.759890079498291,
"learning_rate": 1.874223216505096e-05,
"loss": 4.1524,
"step": 3015
},
{
"epoch": 0.6676984302454123,
"grad_norm": 2.7549400329589844,
"learning_rate": 1.868008948545861e-05,
"loss": 4.2703,
"step": 3020
},
{
"epoch": 0.6688038912226398,
"grad_norm": 2.606306552886963,
"learning_rate": 1.861794680586627e-05,
"loss": 4.2695,
"step": 3025
},
{
"epoch": 0.6699093521998674,
"grad_norm": 3.0413312911987305,
"learning_rate": 1.8555804126273927e-05,
"loss": 4.5286,
"step": 3030
},
{
"epoch": 0.6710148131770949,
"grad_norm": 2.6322450637817383,
"learning_rate": 1.849366144668158e-05,
"loss": 4.3509,
"step": 3035
},
{
"epoch": 0.6721202741543224,
"grad_norm": 2.7126147747039795,
"learning_rate": 1.8431518767089236e-05,
"loss": 4.502,
"step": 3040
},
{
"epoch": 0.6732257351315498,
"grad_norm": 2.5845155715942383,
"learning_rate": 1.8369376087496896e-05,
"loss": 4.4788,
"step": 3045
},
{
"epoch": 0.6743311961087773,
"grad_norm": 2.713156223297119,
"learning_rate": 1.8307233407904552e-05,
"loss": 4.4627,
"step": 3050
},
{
"epoch": 0.6754366570860049,
"grad_norm": 2.5280685424804688,
"learning_rate": 1.8245090728312205e-05,
"loss": 4.3126,
"step": 3055
},
{
"epoch": 0.6765421180632324,
"grad_norm": 2.6877503395080566,
"learning_rate": 1.818294804871986e-05,
"loss": 4.4045,
"step": 3060
},
{
"epoch": 0.6776475790404599,
"grad_norm": 2.5872035026550293,
"learning_rate": 1.8120805369127517e-05,
"loss": 4.4283,
"step": 3065
},
{
"epoch": 0.6787530400176873,
"grad_norm": 2.494570255279541,
"learning_rate": 1.8058662689535173e-05,
"loss": 4.3445,
"step": 3070
},
{
"epoch": 0.6798585009949148,
"grad_norm": 2.8552112579345703,
"learning_rate": 1.799652000994283e-05,
"loss": 4.2656,
"step": 3075
},
{
"epoch": 0.6809639619721424,
"grad_norm": 2.528190851211548,
"learning_rate": 1.7934377330350486e-05,
"loss": 4.2317,
"step": 3080
},
{
"epoch": 0.6820694229493699,
"grad_norm": 2.6249637603759766,
"learning_rate": 1.7872234650758142e-05,
"loss": 4.4084,
"step": 3085
},
{
"epoch": 0.6831748839265974,
"grad_norm": 2.8214519023895264,
"learning_rate": 1.7810091971165798e-05,
"loss": 4.4469,
"step": 3090
},
{
"epoch": 0.6842803449038249,
"grad_norm": 3.1400296688079834,
"learning_rate": 1.7747949291573454e-05,
"loss": 4.4882,
"step": 3095
},
{
"epoch": 0.6853858058810524,
"grad_norm": 2.7912092208862305,
"learning_rate": 1.768580661198111e-05,
"loss": 4.2987,
"step": 3100
},
{
"epoch": 0.6864912668582799,
"grad_norm": 2.444261312484741,
"learning_rate": 1.7623663932388766e-05,
"loss": 4.3692,
"step": 3105
},
{
"epoch": 0.6875967278355074,
"grad_norm": 2.8983335494995117,
"learning_rate": 1.756152125279642e-05,
"loss": 4.2532,
"step": 3110
},
{
"epoch": 0.6887021888127349,
"grad_norm": 2.8009955883026123,
"learning_rate": 1.749937857320408e-05,
"loss": 4.343,
"step": 3115
},
{
"epoch": 0.6898076497899625,
"grad_norm": 2.664306640625,
"learning_rate": 1.7437235893611735e-05,
"loss": 4.3392,
"step": 3120
},
{
"epoch": 0.6909131107671899,
"grad_norm": 2.744086742401123,
"learning_rate": 1.7375093214019388e-05,
"loss": 4.5081,
"step": 3125
},
{
"epoch": 0.6920185717444174,
"grad_norm": 2.5243453979492188,
"learning_rate": 1.7312950534427044e-05,
"loss": 4.138,
"step": 3130
},
{
"epoch": 0.6931240327216449,
"grad_norm": 2.879436492919922,
"learning_rate": 1.7250807854834704e-05,
"loss": 4.3065,
"step": 3135
},
{
"epoch": 0.6942294936988724,
"grad_norm": 2.766604423522949,
"learning_rate": 1.7188665175242356e-05,
"loss": 4.3584,
"step": 3140
},
{
"epoch": 0.6953349546761,
"grad_norm": 2.644548177719116,
"learning_rate": 1.7126522495650012e-05,
"loss": 4.2898,
"step": 3145
},
{
"epoch": 0.6964404156533275,
"grad_norm": 2.6209113597869873,
"learning_rate": 1.706437981605767e-05,
"loss": 4.273,
"step": 3150
},
{
"epoch": 0.6975458766305549,
"grad_norm": 2.7458090782165527,
"learning_rate": 1.7002237136465328e-05,
"loss": 4.3472,
"step": 3155
},
{
"epoch": 0.6986513376077824,
"grad_norm": 2.5772080421447754,
"learning_rate": 1.694009445687298e-05,
"loss": 4.4346,
"step": 3160
},
{
"epoch": 0.6997567985850099,
"grad_norm": 2.7952399253845215,
"learning_rate": 1.6877951777280637e-05,
"loss": 4.3793,
"step": 3165
},
{
"epoch": 0.7008622595622375,
"grad_norm": 2.724113702774048,
"learning_rate": 1.6815809097688293e-05,
"loss": 4.2947,
"step": 3170
},
{
"epoch": 0.701967720539465,
"grad_norm": 2.809077262878418,
"learning_rate": 1.675366641809595e-05,
"loss": 4.4637,
"step": 3175
},
{
"epoch": 0.7030731815166925,
"grad_norm": 2.6896934509277344,
"learning_rate": 1.6691523738503606e-05,
"loss": 4.2131,
"step": 3180
},
{
"epoch": 0.7041786424939199,
"grad_norm": 2.823146343231201,
"learning_rate": 1.6629381058911262e-05,
"loss": 4.2319,
"step": 3185
},
{
"epoch": 0.7052841034711474,
"grad_norm": 2.5893144607543945,
"learning_rate": 1.6567238379318918e-05,
"loss": 4.3153,
"step": 3190
},
{
"epoch": 0.706389564448375,
"grad_norm": 2.8390941619873047,
"learning_rate": 1.650509569972657e-05,
"loss": 4.2297,
"step": 3195
},
{
"epoch": 0.7074950254256025,
"grad_norm": 2.496361255645752,
"learning_rate": 1.644295302013423e-05,
"loss": 4.4646,
"step": 3200
},
{
"epoch": 0.70860048640283,
"grad_norm": 2.776575803756714,
"learning_rate": 1.6380810340541887e-05,
"loss": 4.5525,
"step": 3205
},
{
"epoch": 0.7097059473800574,
"grad_norm": 2.6303658485412598,
"learning_rate": 1.631866766094954e-05,
"loss": 4.3819,
"step": 3210
},
{
"epoch": 0.710811408357285,
"grad_norm": 2.4757165908813477,
"learning_rate": 1.6256524981357195e-05,
"loss": 4.2136,
"step": 3215
},
{
"epoch": 0.7119168693345125,
"grad_norm": 2.7062437534332275,
"learning_rate": 1.619438230176485e-05,
"loss": 4.3914,
"step": 3220
},
{
"epoch": 0.71302233031174,
"grad_norm": 2.7044432163238525,
"learning_rate": 1.613223962217251e-05,
"loss": 4.3731,
"step": 3225
},
{
"epoch": 0.7141277912889675,
"grad_norm": 2.7421531677246094,
"learning_rate": 1.6070096942580164e-05,
"loss": 4.4874,
"step": 3230
},
{
"epoch": 0.715233252266195,
"grad_norm": 2.770270347595215,
"learning_rate": 1.600795426298782e-05,
"loss": 4.2702,
"step": 3235
},
{
"epoch": 0.7163387132434225,
"grad_norm": 2.617872714996338,
"learning_rate": 1.5945811583395476e-05,
"loss": 4.3877,
"step": 3240
},
{
"epoch": 0.71744417422065,
"grad_norm": 2.5779149532318115,
"learning_rate": 1.5883668903803133e-05,
"loss": 4.2644,
"step": 3245
},
{
"epoch": 0.7185496351978775,
"grad_norm": 2.465280771255493,
"learning_rate": 1.582152622421079e-05,
"loss": 4.2764,
"step": 3250
},
{
"epoch": 0.719655096175105,
"grad_norm": 2.6684722900390625,
"learning_rate": 1.5759383544618445e-05,
"loss": 4.4445,
"step": 3255
},
{
"epoch": 0.7207605571523326,
"grad_norm": 2.7769546508789062,
"learning_rate": 1.56972408650261e-05,
"loss": 4.3571,
"step": 3260
},
{
"epoch": 0.72186601812956,
"grad_norm": 2.58829402923584,
"learning_rate": 1.5635098185433757e-05,
"loss": 4.2226,
"step": 3265
},
{
"epoch": 0.7229714791067875,
"grad_norm": 2.5519750118255615,
"learning_rate": 1.5572955505841413e-05,
"loss": 4.4029,
"step": 3270
},
{
"epoch": 0.724076940084015,
"grad_norm": 2.6074788570404053,
"learning_rate": 1.551081282624907e-05,
"loss": 4.0522,
"step": 3275
},
{
"epoch": 0.7251824010612425,
"grad_norm": 2.721590042114258,
"learning_rate": 1.5448670146656726e-05,
"loss": 4.1492,
"step": 3280
},
{
"epoch": 0.7262878620384701,
"grad_norm": 2.80806827545166,
"learning_rate": 1.538652746706438e-05,
"loss": 4.412,
"step": 3285
},
{
"epoch": 0.7273933230156976,
"grad_norm": 2.87967848777771,
"learning_rate": 1.5324384787472038e-05,
"loss": 4.3851,
"step": 3290
},
{
"epoch": 0.728498783992925,
"grad_norm": 2.5552468299865723,
"learning_rate": 1.5262242107879694e-05,
"loss": 4.2578,
"step": 3295
},
{
"epoch": 0.7296042449701525,
"grad_norm": 2.6064484119415283,
"learning_rate": 1.5200099428287349e-05,
"loss": 4.4176,
"step": 3300
},
{
"epoch": 0.73070970594738,
"grad_norm": 2.6501288414001465,
"learning_rate": 1.5137956748695003e-05,
"loss": 4.2782,
"step": 3305
},
{
"epoch": 0.7318151669246076,
"grad_norm": 2.7041335105895996,
"learning_rate": 1.5075814069102661e-05,
"loss": 4.4355,
"step": 3310
},
{
"epoch": 0.7329206279018351,
"grad_norm": 2.7473063468933105,
"learning_rate": 1.5013671389510317e-05,
"loss": 4.3692,
"step": 3315
},
{
"epoch": 0.7340260888790626,
"grad_norm": 2.753004312515259,
"learning_rate": 1.4951528709917972e-05,
"loss": 4.3074,
"step": 3320
},
{
"epoch": 0.73513154985629,
"grad_norm": 2.5943238735198975,
"learning_rate": 1.4889386030325628e-05,
"loss": 4.2984,
"step": 3325
},
{
"epoch": 0.7362370108335176,
"grad_norm": 3.0592753887176514,
"learning_rate": 1.4827243350733282e-05,
"loss": 4.3758,
"step": 3330
},
{
"epoch": 0.7373424718107451,
"grad_norm": 2.9579524993896484,
"learning_rate": 1.4765100671140942e-05,
"loss": 4.3336,
"step": 3335
},
{
"epoch": 0.7384479327879726,
"grad_norm": 2.8208494186401367,
"learning_rate": 1.4702957991548596e-05,
"loss": 4.3748,
"step": 3340
},
{
"epoch": 0.7395533937652001,
"grad_norm": 2.7068212032318115,
"learning_rate": 1.4640815311956253e-05,
"loss": 4.3802,
"step": 3345
},
{
"epoch": 0.7406588547424275,
"grad_norm": 2.6911303997039795,
"learning_rate": 1.4578672632363907e-05,
"loss": 4.2637,
"step": 3350
},
{
"epoch": 0.7417643157196551,
"grad_norm": 2.925656318664551,
"learning_rate": 1.4516529952771565e-05,
"loss": 4.1862,
"step": 3355
},
{
"epoch": 0.7428697766968826,
"grad_norm": 2.8226230144500732,
"learning_rate": 1.4454387273179221e-05,
"loss": 4.2084,
"step": 3360
},
{
"epoch": 0.7439752376741101,
"grad_norm": 2.73540997505188,
"learning_rate": 1.4392244593586876e-05,
"loss": 4.3171,
"step": 3365
},
{
"epoch": 0.7450806986513376,
"grad_norm": 2.88110613822937,
"learning_rate": 1.4330101913994532e-05,
"loss": 4.3005,
"step": 3370
},
{
"epoch": 0.7461861596285652,
"grad_norm": 2.618785858154297,
"learning_rate": 1.4267959234402186e-05,
"loss": 4.2863,
"step": 3375
},
{
"epoch": 0.7472916206057926,
"grad_norm": 2.434032440185547,
"learning_rate": 1.4205816554809844e-05,
"loss": 4.3868,
"step": 3380
},
{
"epoch": 0.7483970815830201,
"grad_norm": 2.4145843982696533,
"learning_rate": 1.41436738752175e-05,
"loss": 4.1055,
"step": 3385
},
{
"epoch": 0.7495025425602476,
"grad_norm": 2.813927412033081,
"learning_rate": 1.4081531195625155e-05,
"loss": 4.4497,
"step": 3390
},
{
"epoch": 0.7506080035374751,
"grad_norm": 2.5696094036102295,
"learning_rate": 1.4019388516032811e-05,
"loss": 4.2388,
"step": 3395
},
{
"epoch": 0.7517134645147027,
"grad_norm": 3.0586514472961426,
"learning_rate": 1.3957245836440469e-05,
"loss": 4.3375,
"step": 3400
},
{
"epoch": 0.7528189254919301,
"grad_norm": 2.7942728996276855,
"learning_rate": 1.3895103156848125e-05,
"loss": 4.2727,
"step": 3405
},
{
"epoch": 0.7539243864691576,
"grad_norm": 2.541633129119873,
"learning_rate": 1.383296047725578e-05,
"loss": 4.3377,
"step": 3410
},
{
"epoch": 0.7550298474463851,
"grad_norm": 2.821420192718506,
"learning_rate": 1.3770817797663436e-05,
"loss": 4.4895,
"step": 3415
},
{
"epoch": 0.7561353084236127,
"grad_norm": 2.650139570236206,
"learning_rate": 1.3708675118071093e-05,
"loss": 4.3168,
"step": 3420
},
{
"epoch": 0.7572407694008402,
"grad_norm": 2.784208059310913,
"learning_rate": 1.3646532438478748e-05,
"loss": 4.247,
"step": 3425
},
{
"epoch": 0.7583462303780677,
"grad_norm": 2.6416375637054443,
"learning_rate": 1.3584389758886404e-05,
"loss": 4.3903,
"step": 3430
},
{
"epoch": 0.7594516913552951,
"grad_norm": 2.7830934524536133,
"learning_rate": 1.3522247079294059e-05,
"loss": 4.4317,
"step": 3435
},
{
"epoch": 0.7605571523325226,
"grad_norm": 2.5094573497772217,
"learning_rate": 1.3460104399701715e-05,
"loss": 4.2657,
"step": 3440
},
{
"epoch": 0.7616626133097502,
"grad_norm": 2.6464684009552,
"learning_rate": 1.3397961720109373e-05,
"loss": 4.344,
"step": 3445
},
{
"epoch": 0.7627680742869777,
"grad_norm": 2.725152015686035,
"learning_rate": 1.3335819040517029e-05,
"loss": 4.3255,
"step": 3450
},
{
"epoch": 0.7638735352642052,
"grad_norm": 2.7001333236694336,
"learning_rate": 1.3273676360924683e-05,
"loss": 4.2375,
"step": 3455
},
{
"epoch": 0.7649789962414327,
"grad_norm": 2.7043142318725586,
"learning_rate": 1.321153368133234e-05,
"loss": 4.3848,
"step": 3460
},
{
"epoch": 0.7660844572186601,
"grad_norm": 2.5512447357177734,
"learning_rate": 1.3149391001739997e-05,
"loss": 4.3744,
"step": 3465
},
{
"epoch": 0.7671899181958877,
"grad_norm": 2.840555191040039,
"learning_rate": 1.3087248322147652e-05,
"loss": 4.3698,
"step": 3470
},
{
"epoch": 0.7682953791731152,
"grad_norm": 2.7197751998901367,
"learning_rate": 1.3025105642555308e-05,
"loss": 4.2368,
"step": 3475
},
{
"epoch": 0.7694008401503427,
"grad_norm": 2.49568247795105,
"learning_rate": 1.2962962962962962e-05,
"loss": 4.3001,
"step": 3480
},
{
"epoch": 0.7705063011275702,
"grad_norm": 2.975504159927368,
"learning_rate": 1.2900820283370619e-05,
"loss": 4.45,
"step": 3485
},
{
"epoch": 0.7716117621047976,
"grad_norm": 2.614933729171753,
"learning_rate": 1.2838677603778276e-05,
"loss": 4.3452,
"step": 3490
},
{
"epoch": 0.7727172230820252,
"grad_norm": 2.6430065631866455,
"learning_rate": 1.2776534924185931e-05,
"loss": 4.2741,
"step": 3495
},
{
"epoch": 0.7738226840592527,
"grad_norm": 2.71543550491333,
"learning_rate": 1.2714392244593587e-05,
"loss": 4.4366,
"step": 3500
},
{
"epoch": 0.7749281450364802,
"grad_norm": 2.868475914001465,
"learning_rate": 1.2652249565001242e-05,
"loss": 4.4391,
"step": 3505
},
{
"epoch": 0.7760336060137077,
"grad_norm": 2.8595988750457764,
"learning_rate": 1.2590106885408901e-05,
"loss": 4.2634,
"step": 3510
},
{
"epoch": 0.7771390669909353,
"grad_norm": 2.577758312225342,
"learning_rate": 1.2527964205816556e-05,
"loss": 4.4947,
"step": 3515
},
{
"epoch": 0.7782445279681627,
"grad_norm": 2.552488088607788,
"learning_rate": 1.2465821526224212e-05,
"loss": 4.4396,
"step": 3520
},
{
"epoch": 0.7793499889453902,
"grad_norm": 2.7421538829803467,
"learning_rate": 1.2403678846631868e-05,
"loss": 4.344,
"step": 3525
},
{
"epoch": 0.7804554499226177,
"grad_norm": 2.6724436283111572,
"learning_rate": 1.2341536167039522e-05,
"loss": 4.507,
"step": 3530
},
{
"epoch": 0.7815609108998453,
"grad_norm": 2.5183072090148926,
"learning_rate": 1.227939348744718e-05,
"loss": 4.3875,
"step": 3535
},
{
"epoch": 0.7826663718770728,
"grad_norm": 2.7601890563964844,
"learning_rate": 1.2217250807854835e-05,
"loss": 4.2108,
"step": 3540
},
{
"epoch": 0.7837718328543002,
"grad_norm": 2.8598101139068604,
"learning_rate": 1.2155108128262491e-05,
"loss": 4.4034,
"step": 3545
},
{
"epoch": 0.7848772938315277,
"grad_norm": 2.6984620094299316,
"learning_rate": 1.2092965448670147e-05,
"loss": 4.3129,
"step": 3550
},
{
"epoch": 0.7859827548087552,
"grad_norm": 2.6067955493927,
"learning_rate": 1.2030822769077803e-05,
"loss": 4.1753,
"step": 3555
},
{
"epoch": 0.7870882157859828,
"grad_norm": 2.763763904571533,
"learning_rate": 1.196868008948546e-05,
"loss": 4.3784,
"step": 3560
},
{
"epoch": 0.7881936767632103,
"grad_norm": 2.5143606662750244,
"learning_rate": 1.1906537409893114e-05,
"loss": 4.3958,
"step": 3565
},
{
"epoch": 0.7892991377404378,
"grad_norm": 2.7460179328918457,
"learning_rate": 1.1844394730300772e-05,
"loss": 4.4161,
"step": 3570
},
{
"epoch": 0.7904045987176652,
"grad_norm": 2.9888150691986084,
"learning_rate": 1.1782252050708426e-05,
"loss": 4.3169,
"step": 3575
},
{
"epoch": 0.7915100596948927,
"grad_norm": 2.7542128562927246,
"learning_rate": 1.1720109371116084e-05,
"loss": 4.2701,
"step": 3580
},
{
"epoch": 0.7926155206721203,
"grad_norm": 2.622459650039673,
"learning_rate": 1.1657966691523739e-05,
"loss": 4.2324,
"step": 3585
},
{
"epoch": 0.7937209816493478,
"grad_norm": 2.7815279960632324,
"learning_rate": 1.1595824011931397e-05,
"loss": 4.4407,
"step": 3590
},
{
"epoch": 0.7948264426265753,
"grad_norm": 2.414452075958252,
"learning_rate": 1.1533681332339051e-05,
"loss": 4.2533,
"step": 3595
},
{
"epoch": 0.7959319036038028,
"grad_norm": 2.864292860031128,
"learning_rate": 1.1471538652746707e-05,
"loss": 4.3427,
"step": 3600
},
{
"epoch": 0.7970373645810302,
"grad_norm": 2.6127429008483887,
"learning_rate": 1.1409395973154363e-05,
"loss": 4.3717,
"step": 3605
},
{
"epoch": 0.7981428255582578,
"grad_norm": 2.8165504932403564,
"learning_rate": 1.1347253293562018e-05,
"loss": 4.4479,
"step": 3610
},
{
"epoch": 0.7992482865354853,
"grad_norm": 2.7605228424072266,
"learning_rate": 1.1285110613969676e-05,
"loss": 4.3603,
"step": 3615
},
{
"epoch": 0.8003537475127128,
"grad_norm": 2.749600648880005,
"learning_rate": 1.122296793437733e-05,
"loss": 4.5357,
"step": 3620
},
{
"epoch": 0.8014592084899403,
"grad_norm": 2.5620622634887695,
"learning_rate": 1.1160825254784988e-05,
"loss": 4.2939,
"step": 3625
},
{
"epoch": 0.8025646694671678,
"grad_norm": 2.840747356414795,
"learning_rate": 1.1098682575192643e-05,
"loss": 4.4695,
"step": 3630
},
{
"epoch": 0.8036701304443953,
"grad_norm": 2.9626359939575195,
"learning_rate": 1.1036539895600299e-05,
"loss": 4.3105,
"step": 3635
},
{
"epoch": 0.8047755914216228,
"grad_norm": 2.748305320739746,
"learning_rate": 1.0974397216007955e-05,
"loss": 4.3532,
"step": 3640
},
{
"epoch": 0.8058810523988503,
"grad_norm": 2.6843719482421875,
"learning_rate": 1.091225453641561e-05,
"loss": 4.2337,
"step": 3645
},
{
"epoch": 0.8069865133760779,
"grad_norm": 2.6707520484924316,
"learning_rate": 1.0850111856823267e-05,
"loss": 4.282,
"step": 3650
},
{
"epoch": 0.8080919743533054,
"grad_norm": 2.5987465381622314,
"learning_rate": 1.0787969177230922e-05,
"loss": 4.3666,
"step": 3655
},
{
"epoch": 0.8091974353305328,
"grad_norm": 2.6529898643493652,
"learning_rate": 1.072582649763858e-05,
"loss": 4.4617,
"step": 3660
},
{
"epoch": 0.8103028963077603,
"grad_norm": 2.5571646690368652,
"learning_rate": 1.0663683818046234e-05,
"loss": 4.2555,
"step": 3665
},
{
"epoch": 0.8114083572849878,
"grad_norm": 2.8901898860931396,
"learning_rate": 1.060154113845389e-05,
"loss": 4.282,
"step": 3670
},
{
"epoch": 0.8125138182622154,
"grad_norm": 2.535372018814087,
"learning_rate": 1.0539398458861546e-05,
"loss": 4.2765,
"step": 3675
},
{
"epoch": 0.8136192792394429,
"grad_norm": 2.7033450603485107,
"learning_rate": 1.0477255779269203e-05,
"loss": 4.4398,
"step": 3680
},
{
"epoch": 0.8147247402166704,
"grad_norm": 2.949090003967285,
"learning_rate": 1.0415113099676859e-05,
"loss": 4.3627,
"step": 3685
},
{
"epoch": 0.8158302011938978,
"grad_norm": 3.2762537002563477,
"learning_rate": 1.0352970420084515e-05,
"loss": 4.4777,
"step": 3690
},
{
"epoch": 0.8169356621711253,
"grad_norm": 2.536367893218994,
"learning_rate": 1.0290827740492171e-05,
"loss": 4.312,
"step": 3695
},
{
"epoch": 0.8180411231483529,
"grad_norm": 2.8747854232788086,
"learning_rate": 1.0228685060899826e-05,
"loss": 4.466,
"step": 3700
},
{
"epoch": 0.8191465841255804,
"grad_norm": 2.527646780014038,
"learning_rate": 1.0166542381307482e-05,
"loss": 4.2035,
"step": 3705
},
{
"epoch": 0.8202520451028079,
"grad_norm": 2.8456356525421143,
"learning_rate": 1.0104399701715138e-05,
"loss": 4.4013,
"step": 3710
},
{
"epoch": 0.8213575060800353,
"grad_norm": 2.6337332725524902,
"learning_rate": 1.0042257022122794e-05,
"loss": 4.4722,
"step": 3715
},
{
"epoch": 0.8224629670572629,
"grad_norm": 2.5773563385009766,
"learning_rate": 9.98011434253045e-06,
"loss": 4.3434,
"step": 3720
},
{
"epoch": 0.8235684280344904,
"grad_norm": 2.7738966941833496,
"learning_rate": 9.917971662938106e-06,
"loss": 4.3367,
"step": 3725
},
{
"epoch": 0.8246738890117179,
"grad_norm": 2.672043561935425,
"learning_rate": 9.855828983345763e-06,
"loss": 4.1075,
"step": 3730
},
{
"epoch": 0.8257793499889454,
"grad_norm": 2.633709669113159,
"learning_rate": 9.793686303753419e-06,
"loss": 4.3165,
"step": 3735
},
{
"epoch": 0.826884810966173,
"grad_norm": 2.5204927921295166,
"learning_rate": 9.731543624161075e-06,
"loss": 4.265,
"step": 3740
},
{
"epoch": 0.8279902719434004,
"grad_norm": 2.7711668014526367,
"learning_rate": 9.669400944568731e-06,
"loss": 4.3085,
"step": 3745
},
{
"epoch": 0.8290957329206279,
"grad_norm": 2.5938053131103516,
"learning_rate": 9.607258264976386e-06,
"loss": 4.4216,
"step": 3750
},
{
"epoch": 0.8302011938978554,
"grad_norm": 2.4221818447113037,
"learning_rate": 9.545115585384042e-06,
"loss": 4.2004,
"step": 3755
},
{
"epoch": 0.8313066548750829,
"grad_norm": 2.75688099861145,
"learning_rate": 9.482972905791698e-06,
"loss": 4.4424,
"step": 3760
},
{
"epoch": 0.8324121158523105,
"grad_norm": 2.8027572631835938,
"learning_rate": 9.420830226199354e-06,
"loss": 4.3706,
"step": 3765
},
{
"epoch": 0.8335175768295379,
"grad_norm": 2.787280797958374,
"learning_rate": 9.35868754660701e-06,
"loss": 4.298,
"step": 3770
},
{
"epoch": 0.8346230378067654,
"grad_norm": 2.797969341278076,
"learning_rate": 9.296544867014666e-06,
"loss": 4.4039,
"step": 3775
},
{
"epoch": 0.8357284987839929,
"grad_norm": 2.5721869468688965,
"learning_rate": 9.234402187422323e-06,
"loss": 4.3801,
"step": 3780
},
{
"epoch": 0.8368339597612204,
"grad_norm": 2.480556011199951,
"learning_rate": 9.172259507829977e-06,
"loss": 4.5008,
"step": 3785
},
{
"epoch": 0.837939420738448,
"grad_norm": 3.0445311069488525,
"learning_rate": 9.110116828237635e-06,
"loss": 4.376,
"step": 3790
},
{
"epoch": 0.8390448817156755,
"grad_norm": 2.906247615814209,
"learning_rate": 9.04797414864529e-06,
"loss": 4.1985,
"step": 3795
},
{
"epoch": 0.8401503426929029,
"grad_norm": 2.624952793121338,
"learning_rate": 8.985831469052947e-06,
"loss": 4.4116,
"step": 3800
},
{
"epoch": 0.8412558036701304,
"grad_norm": 2.826939821243286,
"learning_rate": 8.923688789460602e-06,
"loss": 4.3384,
"step": 3805
},
{
"epoch": 0.8423612646473579,
"grad_norm": 2.7362842559814453,
"learning_rate": 8.861546109868258e-06,
"loss": 4.3327,
"step": 3810
},
{
"epoch": 0.8434667256245855,
"grad_norm": 2.5066606998443604,
"learning_rate": 8.799403430275914e-06,
"loss": 4.3919,
"step": 3815
},
{
"epoch": 0.844572186601813,
"grad_norm": 2.625035524368286,
"learning_rate": 8.737260750683569e-06,
"loss": 4.3227,
"step": 3820
},
{
"epoch": 0.8456776475790405,
"grad_norm": 2.6161510944366455,
"learning_rate": 8.675118071091226e-06,
"loss": 4.273,
"step": 3825
},
{
"epoch": 0.8467831085562679,
"grad_norm": 2.6360316276550293,
"learning_rate": 8.612975391498881e-06,
"loss": 4.3517,
"step": 3830
},
{
"epoch": 0.8478885695334955,
"grad_norm": 2.945129632949829,
"learning_rate": 8.550832711906539e-06,
"loss": 4.4634,
"step": 3835
},
{
"epoch": 0.848994030510723,
"grad_norm": 2.797037124633789,
"learning_rate": 8.488690032314193e-06,
"loss": 4.3474,
"step": 3840
},
{
"epoch": 0.8500994914879505,
"grad_norm": 2.6918272972106934,
"learning_rate": 8.42654735272185e-06,
"loss": 4.1983,
"step": 3845
},
{
"epoch": 0.851204952465178,
"grad_norm": 2.786607027053833,
"learning_rate": 8.364404673129506e-06,
"loss": 4.2545,
"step": 3850
},
{
"epoch": 0.8523104134424054,
"grad_norm": 2.799255132675171,
"learning_rate": 8.302261993537162e-06,
"loss": 4.4633,
"step": 3855
},
{
"epoch": 0.853415874419633,
"grad_norm": 2.393765926361084,
"learning_rate": 8.240119313944818e-06,
"loss": 4.3144,
"step": 3860
},
{
"epoch": 0.8545213353968605,
"grad_norm": 3.014911413192749,
"learning_rate": 8.177976634352472e-06,
"loss": 4.4218,
"step": 3865
},
{
"epoch": 0.855626796374088,
"grad_norm": 2.7910256385803223,
"learning_rate": 8.11583395476013e-06,
"loss": 4.3782,
"step": 3870
},
{
"epoch": 0.8567322573513155,
"grad_norm": 2.5579280853271484,
"learning_rate": 8.053691275167785e-06,
"loss": 4.3776,
"step": 3875
},
{
"epoch": 0.857837718328543,
"grad_norm": 2.6511480808258057,
"learning_rate": 7.991548595575441e-06,
"loss": 4.3284,
"step": 3880
},
{
"epoch": 0.8589431793057705,
"grad_norm": 2.7104756832122803,
"learning_rate": 7.929405915983097e-06,
"loss": 4.3875,
"step": 3885
},
{
"epoch": 0.860048640282998,
"grad_norm": 2.8262667655944824,
"learning_rate": 7.867263236390753e-06,
"loss": 4.401,
"step": 3890
},
{
"epoch": 0.8611541012602255,
"grad_norm": 2.8072750568389893,
"learning_rate": 7.80512055679841e-06,
"loss": 4.3245,
"step": 3895
},
{
"epoch": 0.862259562237453,
"grad_norm": 3.0384953022003174,
"learning_rate": 7.742977877206066e-06,
"loss": 4.2691,
"step": 3900
},
{
"epoch": 0.8633650232146806,
"grad_norm": 2.7213258743286133,
"learning_rate": 7.680835197613722e-06,
"loss": 4.3848,
"step": 3905
},
{
"epoch": 0.864470484191908,
"grad_norm": 2.9310898780822754,
"learning_rate": 7.618692518021378e-06,
"loss": 4.3875,
"step": 3910
},
{
"epoch": 0.8655759451691355,
"grad_norm": 2.7270753383636475,
"learning_rate": 7.556549838429033e-06,
"loss": 4.4668,
"step": 3915
},
{
"epoch": 0.866681406146363,
"grad_norm": 2.7479376792907715,
"learning_rate": 7.494407158836689e-06,
"loss": 4.3906,
"step": 3920
},
{
"epoch": 0.8677868671235905,
"grad_norm": 2.773819923400879,
"learning_rate": 7.432264479244346e-06,
"loss": 4.2478,
"step": 3925
},
{
"epoch": 0.8688923281008181,
"grad_norm": 2.642632484436035,
"learning_rate": 7.370121799652001e-06,
"loss": 4.3643,
"step": 3930
},
{
"epoch": 0.8699977890780456,
"grad_norm": 2.830242872238159,
"learning_rate": 7.307979120059657e-06,
"loss": 4.4359,
"step": 3935
},
{
"epoch": 0.871103250055273,
"grad_norm": 2.8000121116638184,
"learning_rate": 7.2458364404673125e-06,
"loss": 4.5984,
"step": 3940
},
{
"epoch": 0.8722087110325005,
"grad_norm": 2.8083910942077637,
"learning_rate": 7.1836937608749695e-06,
"loss": 4.2437,
"step": 3945
},
{
"epoch": 0.873314172009728,
"grad_norm": 2.6732099056243896,
"learning_rate": 7.121551081282625e-06,
"loss": 4.4326,
"step": 3950
},
{
"epoch": 0.8744196329869556,
"grad_norm": 2.4670119285583496,
"learning_rate": 7.059408401690282e-06,
"loss": 4.2204,
"step": 3955
},
{
"epoch": 0.8755250939641831,
"grad_norm": 2.698272943496704,
"learning_rate": 6.997265722097937e-06,
"loss": 4.305,
"step": 3960
},
{
"epoch": 0.8766305549414106,
"grad_norm": 2.7143428325653076,
"learning_rate": 6.935123042505594e-06,
"loss": 4.348,
"step": 3965
},
{
"epoch": 0.877736015918638,
"grad_norm": 2.571596145629883,
"learning_rate": 6.8729803629132495e-06,
"loss": 4.3278,
"step": 3970
},
{
"epoch": 0.8788414768958656,
"grad_norm": 3.0739476680755615,
"learning_rate": 6.810837683320905e-06,
"loss": 4.3202,
"step": 3975
},
{
"epoch": 0.8799469378730931,
"grad_norm": 2.72713041305542,
"learning_rate": 6.748695003728561e-06,
"loss": 4.5188,
"step": 3980
},
{
"epoch": 0.8810523988503206,
"grad_norm": 2.7530996799468994,
"learning_rate": 6.686552324136216e-06,
"loss": 4.3479,
"step": 3985
},
{
"epoch": 0.8821578598275481,
"grad_norm": 2.7766714096069336,
"learning_rate": 6.624409644543873e-06,
"loss": 4.3766,
"step": 3990
},
{
"epoch": 0.8832633208047755,
"grad_norm": 3.0622363090515137,
"learning_rate": 6.562266964951529e-06,
"loss": 4.3819,
"step": 3995
},
{
"epoch": 0.8843687817820031,
"grad_norm": 2.711118221282959,
"learning_rate": 6.500124285359186e-06,
"loss": 4.2281,
"step": 4000
},
{
"epoch": 0.8854742427592306,
"grad_norm": 2.5327889919281006,
"learning_rate": 6.437981605766841e-06,
"loss": 4.3108,
"step": 4005
},
{
"epoch": 0.8865797037364581,
"grad_norm": 2.6793577671051025,
"learning_rate": 6.375838926174497e-06,
"loss": 4.419,
"step": 4010
},
{
"epoch": 0.8876851647136856,
"grad_norm": 2.7030229568481445,
"learning_rate": 6.3136962465821526e-06,
"loss": 4.1583,
"step": 4015
},
{
"epoch": 0.8887906256909132,
"grad_norm": 2.6065833568573,
"learning_rate": 6.2515535669898096e-06,
"loss": 4.5478,
"step": 4020
},
{
"epoch": 0.8898960866681406,
"grad_norm": 2.8415439128875732,
"learning_rate": 6.189410887397465e-06,
"loss": 4.4606,
"step": 4025
},
{
"epoch": 0.8910015476453681,
"grad_norm": 2.9203150272369385,
"learning_rate": 6.127268207805121e-06,
"loss": 4.207,
"step": 4030
},
{
"epoch": 0.8921070086225956,
"grad_norm": 2.5476462841033936,
"learning_rate": 6.065125528212777e-06,
"loss": 4.3745,
"step": 4035
},
{
"epoch": 0.8932124695998231,
"grad_norm": 3.014671564102173,
"learning_rate": 6.002982848620433e-06,
"loss": 4.2554,
"step": 4040
},
{
"epoch": 0.8943179305770507,
"grad_norm": 2.628617763519287,
"learning_rate": 5.940840169028089e-06,
"loss": 4.329,
"step": 4045
},
{
"epoch": 0.8954233915542781,
"grad_norm": 2.746119737625122,
"learning_rate": 5.878697489435745e-06,
"loss": 4.2055,
"step": 4050
},
{
"epoch": 0.8965288525315056,
"grad_norm": 2.9705591201782227,
"learning_rate": 5.8165548098434e-06,
"loss": 4.3678,
"step": 4055
},
{
"epoch": 0.8976343135087331,
"grad_norm": 2.6920156478881836,
"learning_rate": 5.754412130251056e-06,
"loss": 4.2901,
"step": 4060
},
{
"epoch": 0.8987397744859607,
"grad_norm": 2.442110538482666,
"learning_rate": 5.692269450658713e-06,
"loss": 4.3028,
"step": 4065
},
{
"epoch": 0.8998452354631882,
"grad_norm": 2.74092698097229,
"learning_rate": 5.630126771066369e-06,
"loss": 4.3002,
"step": 4070
},
{
"epoch": 0.9009506964404157,
"grad_norm": 2.442526340484619,
"learning_rate": 5.567984091474025e-06,
"loss": 4.284,
"step": 4075
},
{
"epoch": 0.9020561574176431,
"grad_norm": 2.78788161277771,
"learning_rate": 5.50584141188168e-06,
"loss": 4.3699,
"step": 4080
},
{
"epoch": 0.9031616183948706,
"grad_norm": 2.884793281555176,
"learning_rate": 5.4436987322893364e-06,
"loss": 4.3204,
"step": 4085
},
{
"epoch": 0.9042670793720982,
"grad_norm": 2.645921230316162,
"learning_rate": 5.381556052696993e-06,
"loss": 4.4775,
"step": 4090
},
{
"epoch": 0.9053725403493257,
"grad_norm": 2.7526016235351562,
"learning_rate": 5.319413373104649e-06,
"loss": 4.2971,
"step": 4095
},
{
"epoch": 0.9064780013265532,
"grad_norm": 2.6196508407592773,
"learning_rate": 5.257270693512305e-06,
"loss": 4.32,
"step": 4100
},
{
"epoch": 0.9075834623037807,
"grad_norm": 2.9636263847351074,
"learning_rate": 5.195128013919961e-06,
"loss": 4.3498,
"step": 4105
},
{
"epoch": 0.9086889232810081,
"grad_norm": 2.7609803676605225,
"learning_rate": 5.1329853343276164e-06,
"loss": 4.3134,
"step": 4110
},
{
"epoch": 0.9097943842582357,
"grad_norm": 2.84635329246521,
"learning_rate": 5.070842654735273e-06,
"loss": 4.5655,
"step": 4115
},
{
"epoch": 0.9108998452354632,
"grad_norm": 2.9101991653442383,
"learning_rate": 5.008699975142928e-06,
"loss": 4.3149,
"step": 4120
},
{
"epoch": 0.9120053062126907,
"grad_norm": 2.50285005569458,
"learning_rate": 4.946557295550584e-06,
"loss": 4.5046,
"step": 4125
},
{
"epoch": 0.9131107671899182,
"grad_norm": 2.6111807823181152,
"learning_rate": 4.88441461595824e-06,
"loss": 4.4572,
"step": 4130
},
{
"epoch": 0.9142162281671457,
"grad_norm": 2.8482987880706787,
"learning_rate": 4.8222719363658965e-06,
"loss": 4.2982,
"step": 4135
},
{
"epoch": 0.9153216891443732,
"grad_norm": 2.635841131210327,
"learning_rate": 4.760129256773553e-06,
"loss": 4.3807,
"step": 4140
},
{
"epoch": 0.9164271501216007,
"grad_norm": 2.969567060470581,
"learning_rate": 4.697986577181209e-06,
"loss": 4.5337,
"step": 4145
},
{
"epoch": 0.9175326110988282,
"grad_norm": 2.5630719661712646,
"learning_rate": 4.635843897588864e-06,
"loss": 4.2317,
"step": 4150
},
{
"epoch": 0.9186380720760557,
"grad_norm": 3.0482473373413086,
"learning_rate": 4.57370121799652e-06,
"loss": 4.3618,
"step": 4155
},
{
"epoch": 0.9197435330532833,
"grad_norm": 2.6049513816833496,
"learning_rate": 4.511558538404176e-06,
"loss": 4.3415,
"step": 4160
},
{
"epoch": 0.9208489940305107,
"grad_norm": 2.672549247741699,
"learning_rate": 4.449415858811832e-06,
"loss": 4.3546,
"step": 4165
},
{
"epoch": 0.9219544550077382,
"grad_norm": 2.3971190452575684,
"learning_rate": 4.387273179219488e-06,
"loss": 4.4124,
"step": 4170
},
{
"epoch": 0.9230599159849657,
"grad_norm": 2.9324026107788086,
"learning_rate": 4.325130499627144e-06,
"loss": 4.4178,
"step": 4175
},
{
"epoch": 0.9241653769621933,
"grad_norm": 2.6847023963928223,
"learning_rate": 4.2629878200348e-06,
"loss": 4.3332,
"step": 4180
},
{
"epoch": 0.9252708379394208,
"grad_norm": 2.586578369140625,
"learning_rate": 4.2008451404424565e-06,
"loss": 4.3377,
"step": 4185
},
{
"epoch": 0.9263762989166482,
"grad_norm": 2.6753554344177246,
"learning_rate": 4.138702460850112e-06,
"loss": 4.4402,
"step": 4190
},
{
"epoch": 0.9274817598938757,
"grad_norm": 2.7684082984924316,
"learning_rate": 4.076559781257768e-06,
"loss": 4.3591,
"step": 4195
},
{
"epoch": 0.9285872208711032,
"grad_norm": 2.5447866916656494,
"learning_rate": 4.014417101665424e-06,
"loss": 4.2935,
"step": 4200
},
{
"epoch": 0.9296926818483308,
"grad_norm": 2.744508981704712,
"learning_rate": 3.95227442207308e-06,
"loss": 4.4423,
"step": 4205
},
{
"epoch": 0.9307981428255583,
"grad_norm": 2.8013176918029785,
"learning_rate": 3.8901317424807365e-06,
"loss": 4.4143,
"step": 4210
},
{
"epoch": 0.9319036038027858,
"grad_norm": 2.7098312377929688,
"learning_rate": 3.827989062888392e-06,
"loss": 4.4103,
"step": 4215
},
{
"epoch": 0.9330090647800132,
"grad_norm": 2.6168668270111084,
"learning_rate": 3.7658463832960476e-06,
"loss": 4.2801,
"step": 4220
},
{
"epoch": 0.9341145257572407,
"grad_norm": 2.5833184719085693,
"learning_rate": 3.7037037037037037e-06,
"loss": 4.4013,
"step": 4225
},
{
"epoch": 0.9352199867344683,
"grad_norm": 2.377253293991089,
"learning_rate": 3.64156102411136e-06,
"loss": 4.185,
"step": 4230
},
{
"epoch": 0.9363254477116958,
"grad_norm": 2.6081435680389404,
"learning_rate": 3.5794183445190157e-06,
"loss": 4.253,
"step": 4235
},
{
"epoch": 0.9374309086889233,
"grad_norm": 2.711153030395508,
"learning_rate": 3.517275664926672e-06,
"loss": 4.3479,
"step": 4240
},
{
"epoch": 0.9385363696661508,
"grad_norm": 2.4365053176879883,
"learning_rate": 3.455132985334328e-06,
"loss": 4.1939,
"step": 4245
},
{
"epoch": 0.9396418306433783,
"grad_norm": 2.638932704925537,
"learning_rate": 3.3929903057419838e-06,
"loss": 4.3875,
"step": 4250
},
{
"epoch": 0.9407472916206058,
"grad_norm": 2.5555827617645264,
"learning_rate": 3.33084762614964e-06,
"loss": 4.2698,
"step": 4255
},
{
"epoch": 0.9418527525978333,
"grad_norm": 2.713468074798584,
"learning_rate": 3.268704946557296e-06,
"loss": 4.349,
"step": 4260
},
{
"epoch": 0.9429582135750608,
"grad_norm": 2.841186761856079,
"learning_rate": 3.206562266964952e-06,
"loss": 4.2716,
"step": 4265
},
{
"epoch": 0.9440636745522883,
"grad_norm": 2.8116109371185303,
"learning_rate": 3.144419587372607e-06,
"loss": 4.4094,
"step": 4270
},
{
"epoch": 0.9451691355295158,
"grad_norm": 2.7146096229553223,
"learning_rate": 3.0822769077802638e-06,
"loss": 4.2455,
"step": 4275
},
{
"epoch": 0.9462745965067433,
"grad_norm": 2.577312469482422,
"learning_rate": 3.02013422818792e-06,
"loss": 4.3422,
"step": 4280
},
{
"epoch": 0.9473800574839708,
"grad_norm": 2.4600229263305664,
"learning_rate": 2.9579915485955753e-06,
"loss": 4.511,
"step": 4285
},
{
"epoch": 0.9484855184611983,
"grad_norm": 2.7700321674346924,
"learning_rate": 2.8958488690032314e-06,
"loss": 4.2781,
"step": 4290
},
{
"epoch": 0.9495909794384259,
"grad_norm": 2.7642529010772705,
"learning_rate": 2.8337061894108876e-06,
"loss": 4.3321,
"step": 4295
},
{
"epoch": 0.9506964404156534,
"grad_norm": 2.4941701889038086,
"learning_rate": 2.7715635098185434e-06,
"loss": 4.2922,
"step": 4300
},
{
"epoch": 0.9518019013928808,
"grad_norm": 2.6204841136932373,
"learning_rate": 2.7094208302261995e-06,
"loss": 4.4099,
"step": 4305
},
{
"epoch": 0.9529073623701083,
"grad_norm": 2.7678253650665283,
"learning_rate": 2.6472781506338553e-06,
"loss": 4.5308,
"step": 4310
},
{
"epoch": 0.9540128233473358,
"grad_norm": 2.610168218612671,
"learning_rate": 2.5851354710415115e-06,
"loss": 4.2916,
"step": 4315
},
{
"epoch": 0.9551182843245634,
"grad_norm": 2.404608726501465,
"learning_rate": 2.522992791449167e-06,
"loss": 4.1172,
"step": 4320
},
{
"epoch": 0.9562237453017909,
"grad_norm": 2.581918478012085,
"learning_rate": 2.4608501118568234e-06,
"loss": 4.5247,
"step": 4325
},
{
"epoch": 0.9573292062790183,
"grad_norm": 2.4554283618927,
"learning_rate": 2.3987074322644795e-06,
"loss": 4.4112,
"step": 4330
},
{
"epoch": 0.9584346672562458,
"grad_norm": 3.0333340167999268,
"learning_rate": 2.3365647526721353e-06,
"loss": 4.4101,
"step": 4335
},
{
"epoch": 0.9595401282334733,
"grad_norm": 2.745823621749878,
"learning_rate": 2.274422073079791e-06,
"loss": 4.4591,
"step": 4340
},
{
"epoch": 0.9606455892107009,
"grad_norm": 2.8770716190338135,
"learning_rate": 2.2122793934874472e-06,
"loss": 4.4189,
"step": 4345
},
{
"epoch": 0.9617510501879284,
"grad_norm": 2.701787233352661,
"learning_rate": 2.1501367138951034e-06,
"loss": 4.4115,
"step": 4350
},
{
"epoch": 0.9628565111651559,
"grad_norm": 2.8112969398498535,
"learning_rate": 2.087994034302759e-06,
"loss": 4.3162,
"step": 4355
},
{
"epoch": 0.9639619721423833,
"grad_norm": 2.660151958465576,
"learning_rate": 2.0258513547104153e-06,
"loss": 4.4636,
"step": 4360
},
{
"epoch": 0.9650674331196109,
"grad_norm": 2.6464245319366455,
"learning_rate": 1.963708675118071e-06,
"loss": 4.5144,
"step": 4365
},
{
"epoch": 0.9661728940968384,
"grad_norm": 2.581138849258423,
"learning_rate": 1.901565995525727e-06,
"loss": 4.3598,
"step": 4370
},
{
"epoch": 0.9672783550740659,
"grad_norm": 2.4853599071502686,
"learning_rate": 1.8394233159333832e-06,
"loss": 4.2964,
"step": 4375
},
{
"epoch": 0.9683838160512934,
"grad_norm": 2.554091691970825,
"learning_rate": 1.7772806363410391e-06,
"loss": 4.4226,
"step": 4380
},
{
"epoch": 0.969489277028521,
"grad_norm": 2.9564058780670166,
"learning_rate": 1.7151379567486951e-06,
"loss": 4.3925,
"step": 4385
},
{
"epoch": 0.9705947380057484,
"grad_norm": 2.502652406692505,
"learning_rate": 1.6529952771563513e-06,
"loss": 4.3428,
"step": 4390
},
{
"epoch": 0.9717001989829759,
"grad_norm": 2.493762969970703,
"learning_rate": 1.5908525975640068e-06,
"loss": 4.249,
"step": 4395
},
{
"epoch": 0.9728056599602034,
"grad_norm": 2.4519858360290527,
"learning_rate": 1.528709917971663e-06,
"loss": 4.2229,
"step": 4400
},
{
"epoch": 0.9739111209374309,
"grad_norm": 2.7903311252593994,
"learning_rate": 1.466567238379319e-06,
"loss": 4.4687,
"step": 4405
},
{
"epoch": 0.9750165819146585,
"grad_norm": 2.556363821029663,
"learning_rate": 1.4044245587869751e-06,
"loss": 4.3987,
"step": 4410
},
{
"epoch": 0.9761220428918859,
"grad_norm": 2.542534351348877,
"learning_rate": 1.3422818791946309e-06,
"loss": 4.4648,
"step": 4415
},
{
"epoch": 0.9772275038691134,
"grad_norm": 2.5431811809539795,
"learning_rate": 1.280139199602287e-06,
"loss": 4.3701,
"step": 4420
},
{
"epoch": 0.9783329648463409,
"grad_norm": 2.6445794105529785,
"learning_rate": 1.2179965200099428e-06,
"loss": 4.263,
"step": 4425
},
{
"epoch": 0.9794384258235684,
"grad_norm": 2.8488686084747314,
"learning_rate": 1.1558538404175988e-06,
"loss": 4.2224,
"step": 4430
},
{
"epoch": 0.980543886800796,
"grad_norm": 2.919131278991699,
"learning_rate": 1.093711160825255e-06,
"loss": 4.3791,
"step": 4435
},
{
"epoch": 0.9816493477780235,
"grad_norm": 2.830904483795166,
"learning_rate": 1.0315684812329107e-06,
"loss": 4.3433,
"step": 4440
},
{
"epoch": 0.9827548087552509,
"grad_norm": 2.7437570095062256,
"learning_rate": 9.694258016405668e-07,
"loss": 4.2391,
"step": 4445
},
{
"epoch": 0.9838602697324784,
"grad_norm": 2.664886713027954,
"learning_rate": 9.072831220482228e-07,
"loss": 4.488,
"step": 4450
},
{
"epoch": 0.9849657307097059,
"grad_norm": 2.518346071243286,
"learning_rate": 8.451404424558787e-07,
"loss": 4.4019,
"step": 4455
},
{
"epoch": 0.9860711916869335,
"grad_norm": 2.9975318908691406,
"learning_rate": 7.829977628635347e-07,
"loss": 4.2323,
"step": 4460
},
{
"epoch": 0.987176652664161,
"grad_norm": 2.6765410900115967,
"learning_rate": 7.208550832711907e-07,
"loss": 4.3188,
"step": 4465
},
{
"epoch": 0.9882821136413884,
"grad_norm": 2.8536341190338135,
"learning_rate": 6.587124036788466e-07,
"loss": 4.4314,
"step": 4470
},
{
"epoch": 0.9893875746186159,
"grad_norm": 2.316105365753174,
"learning_rate": 5.965697240865026e-07,
"loss": 4.5006,
"step": 4475
},
{
"epoch": 0.9904930355958435,
"grad_norm": 2.705261468887329,
"learning_rate": 5.344270444941587e-07,
"loss": 4.4413,
"step": 4480
},
{
"epoch": 0.991598496573071,
"grad_norm": 2.7570252418518066,
"learning_rate": 4.722843649018146e-07,
"loss": 4.4869,
"step": 4485
},
{
"epoch": 0.9927039575502985,
"grad_norm": 2.687154531478882,
"learning_rate": 4.1014168530947054e-07,
"loss": 4.5021,
"step": 4490
},
{
"epoch": 0.993809418527526,
"grad_norm": 2.885932683944702,
"learning_rate": 3.4799900571712656e-07,
"loss": 4.3324,
"step": 4495
},
{
"epoch": 0.9949148795047534,
"grad_norm": 2.6431424617767334,
"learning_rate": 2.858563261247825e-07,
"loss": 4.4544,
"step": 4500
},
{
"epoch": 0.996020340481981,
"grad_norm": 2.5612587928771973,
"learning_rate": 2.2371364653243848e-07,
"loss": 4.4129,
"step": 4505
},
{
"epoch": 0.9971258014592085,
"grad_norm": 2.5301103591918945,
"learning_rate": 1.6157096694009447e-07,
"loss": 4.2888,
"step": 4510
},
{
"epoch": 0.998231262436436,
"grad_norm": 2.852886199951172,
"learning_rate": 9.942828734775043e-08,
"loss": 4.4167,
"step": 4515
},
{
"epoch": 0.9993367234136635,
"grad_norm": 3.017920970916748,
"learning_rate": 3.728560775540641e-08,
"loss": 4.2826,
"step": 4520
}
],
"logging_steps": 5,
"max_steps": 4523,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.7364382421434368e+16,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}