{
"best_global_step": null,
"best_metric": 0.5373095273971558,
"best_model_checkpoint": null,
"epoch": 2.9276734210915545,
"eval_steps": 50,
"global_step": 4950,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0029581422866439876,
"grad_norm": 0.5572423338890076,
"learning_rate": 1.6589451880122303e-05,
"loss": 1.4063,
"step": 5
},
{
"epoch": 0.005916284573287975,
"grad_norm": 0.4338622987270355,
"learning_rate": 3.7326266730275184e-05,
"loss": 1.3321,
"step": 10
},
{
"epoch": 0.008874426859931962,
"grad_norm": 0.3512386381626129,
"learning_rate": 5.8063081580428065e-05,
"loss": 1.2302,
"step": 15
},
{
"epoch": 0.01183256914657595,
"grad_norm": 0.2740453779697418,
"learning_rate": 7.879989643058095e-05,
"loss": 1.1511,
"step": 20
},
{
"epoch": 0.014790711433219937,
"grad_norm": 0.20490019023418427,
"learning_rate": 9.953671128073382e-05,
"loss": 1.1108,
"step": 25
},
{
"epoch": 0.017748853719863924,
"grad_norm": 0.18535283207893372,
"learning_rate": 0.00012027352613088669,
"loss": 1.0932,
"step": 30
},
{
"epoch": 0.020706996006507914,
"grad_norm": 0.17043855786323547,
"learning_rate": 0.00014101034098103958,
"loss": 1.0726,
"step": 35
},
{
"epoch": 0.0236651382931519,
"grad_norm": 0.16548407077789307,
"learning_rate": 0.00016174715583119247,
"loss": 1.055,
"step": 40
},
{
"epoch": 0.026623280579795888,
"grad_norm": 0.17369449138641357,
"learning_rate": 0.00018248397068134533,
"loss": 1.0256,
"step": 45
},
{
"epoch": 0.029581422866439874,
"grad_norm": 0.17916908860206604,
"learning_rate": 0.00020322078553149822,
"loss": 1.0189,
"step": 50
},
{
"epoch": 0.029581422866439874,
"eval_loss": 1.0180954933166504,
"eval_runtime": 15.5481,
"eval_samples_per_second": 417.416,
"eval_steps_per_second": 13.056,
"step": 50
},
{
"epoch": 0.032539565153083864,
"grad_norm": 0.16111324727535248,
"learning_rate": 0.0002239576003816511,
"loss": 1.0049,
"step": 55
},
{
"epoch": 0.03549770743972785,
"grad_norm": 0.1549229621887207,
"learning_rate": 0.00024469441523180396,
"loss": 1.0133,
"step": 60
},
{
"epoch": 0.03845584972637184,
"grad_norm": 0.16132739186286926,
"learning_rate": 0.00026543123008195685,
"loss": 1.0043,
"step": 65
},
{
"epoch": 0.04141399201301583,
"grad_norm": 0.15359684824943542,
"learning_rate": 0.00028616804493210974,
"loss": 0.9838,
"step": 70
},
{
"epoch": 0.04437213429965981,
"grad_norm": 0.1586950719356537,
"learning_rate": 0.0003069048597822626,
"loss": 0.9858,
"step": 75
},
{
"epoch": 0.0473302765863038,
"grad_norm": 0.15954945981502533,
"learning_rate": 0.0003276416746324155,
"loss": 0.9659,
"step": 80
},
{
"epoch": 0.05028841887294779,
"grad_norm": 0.15601466596126556,
"learning_rate": 0.00034837848948256835,
"loss": 0.9703,
"step": 85
},
{
"epoch": 0.053246561159591775,
"grad_norm": 0.16445401310920715,
"learning_rate": 0.00036911530433272123,
"loss": 0.9421,
"step": 90
},
{
"epoch": 0.056204703446235765,
"grad_norm": 0.15353924036026,
"learning_rate": 0.0003898521191828741,
"loss": 0.9504,
"step": 95
},
{
"epoch": 0.05916284573287975,
"grad_norm": 0.15212711691856384,
"learning_rate": 0.000410588934033027,
"loss": 0.9703,
"step": 100
},
{
"epoch": 0.05916284573287975,
"eval_loss": 0.9503689408302307,
"eval_runtime": 15.2084,
"eval_samples_per_second": 426.739,
"eval_steps_per_second": 13.348,
"step": 100
},
{
"epoch": 0.06212098801952374,
"grad_norm": 0.15998579561710358,
"learning_rate": 0.00041473561507370503,
"loss": 0.9576,
"step": 105
},
{
"epoch": 0.06507913030616773,
"grad_norm": 0.17387987673282623,
"learning_rate": 0.000414732844743397,
"loss": 0.9515,
"step": 110
},
{
"epoch": 0.06803727259281171,
"grad_norm": 0.15958620607852936,
"learning_rate": 0.00041472794341999657,
"loss": 0.9326,
"step": 115
},
{
"epoch": 0.0709954148794557,
"grad_norm": 0.15257656574249268,
"learning_rate": 0.00041472091115387234,
"loss": 0.9414,
"step": 120
},
{
"epoch": 0.07395355716609969,
"grad_norm": 0.16304738819599152,
"learning_rate": 0.0004147117480172918,
"loss": 0.9594,
"step": 125
},
{
"epoch": 0.07691169945274368,
"grad_norm": 0.150679811835289,
"learning_rate": 0.00041470045410442024,
"loss": 0.9134,
"step": 130
},
{
"epoch": 0.07986984173938766,
"grad_norm": 0.1541481912136078,
"learning_rate": 0.00041468702953132027,
"loss": 0.9232,
"step": 135
},
{
"epoch": 0.08282798402603166,
"grad_norm": 0.16420885920524597,
"learning_rate": 0.00041467147443595,
"loss": 0.9352,
"step": 140
},
{
"epoch": 0.08578612631267564,
"grad_norm": 0.15783484280109406,
"learning_rate": 0.00041465378897816206,
"loss": 0.9439,
"step": 145
},
{
"epoch": 0.08874426859931962,
"grad_norm": 0.14839908480644226,
"learning_rate": 0.0004146339733397021,
"loss": 0.9149,
"step": 150
},
{
"epoch": 0.08874426859931962,
"eval_loss": 0.9155654311180115,
"eval_runtime": 15.0718,
"eval_samples_per_second": 430.606,
"eval_steps_per_second": 13.469,
"step": 150
},
{
"epoch": 0.09170241088596362,
"grad_norm": 0.16119568049907684,
"learning_rate": 0.00041461202772420625,
"loss": 0.9055,
"step": 155
},
{
"epoch": 0.0946605531726076,
"grad_norm": 0.14887715876102448,
"learning_rate": 0.0004145879523571998,
"loss": 0.8985,
"step": 160
},
{
"epoch": 0.09761869545925159,
"grad_norm": 0.15016992390155792,
"learning_rate": 0.0004145617474860943,
"loss": 0.8983,
"step": 165
},
{
"epoch": 0.10057683774589558,
"grad_norm": 0.15168191492557526,
"learning_rate": 0.00041453341338018547,
"loss": 0.9227,
"step": 170
},
{
"epoch": 0.10353498003253957,
"grad_norm": 0.1442921757698059,
"learning_rate": 0.00041450295033064997,
"loss": 0.9184,
"step": 175
},
{
"epoch": 0.10649312231918355,
"grad_norm": 0.15555252134799957,
"learning_rate": 0.00041447035865054287,
"loss": 0.8999,
"step": 180
},
{
"epoch": 0.10945126460582753,
"grad_norm": 0.15253983438014984,
"learning_rate": 0.000414435638674794,
"loss": 0.923,
"step": 185
},
{
"epoch": 0.11240940689247153,
"grad_norm": 0.15464100241661072,
"learning_rate": 0.00041439879076020483,
"loss": 0.9159,
"step": 190
},
{
"epoch": 0.11536754917911551,
"grad_norm": 0.15457701683044434,
"learning_rate": 0.0004143598152854448,
"loss": 0.9146,
"step": 195
},
{
"epoch": 0.1183256914657595,
"grad_norm": 0.16058233380317688,
"learning_rate": 0.00041431871265104717,
"loss": 0.9076,
"step": 200
},
{
"epoch": 0.1183256914657595,
"eval_loss": 0.8913146257400513,
"eval_runtime": 15.1605,
"eval_samples_per_second": 428.087,
"eval_steps_per_second": 13.39,
"step": 200
},
{
"epoch": 0.1212838337524035,
"grad_norm": 0.15900303423404694,
"learning_rate": 0.0004142754832794051,
"loss": 0.8976,
"step": 205
},
{
"epoch": 0.12424197603904748,
"grad_norm": 0.15453267097473145,
"learning_rate": 0.0004142301276147672,
"loss": 0.911,
"step": 210
},
{
"epoch": 0.12720011832569147,
"grad_norm": 0.15236064791679382,
"learning_rate": 0.0004141826461232332,
"loss": 0.8884,
"step": 215
},
{
"epoch": 0.13015826061233546,
"grad_norm": 0.15213336050510406,
"learning_rate": 0.0004141330392927488,
"loss": 0.9041,
"step": 220
},
{
"epoch": 0.13311640289897944,
"grad_norm": 0.15676744282245636,
"learning_rate": 0.00041408130763310113,
"loss": 0.9003,
"step": 225
},
{
"epoch": 0.13607454518562342,
"grad_norm": 0.14520438015460968,
"learning_rate": 0.0004140274516759128,
"loss": 0.8936,
"step": 230
},
{
"epoch": 0.1390326874722674,
"grad_norm": 0.147927924990654,
"learning_rate": 0.00041397147197463717,
"loss": 0.9004,
"step": 235
},
{
"epoch": 0.1419908297589114,
"grad_norm": 0.14898143708705902,
"learning_rate": 0.0004139133691045523,
"loss": 0.899,
"step": 240
},
{
"epoch": 0.1449489720455554,
"grad_norm": 0.13882724940776825,
"learning_rate": 0.00041385314366275514,
"loss": 0.8864,
"step": 245
},
{
"epoch": 0.14790711433219939,
"grad_norm": 0.15192769467830658,
"learning_rate": 0.0004137907962681552,
"loss": 0.8892,
"step": 250
},
{
"epoch": 0.14790711433219939,
"eval_loss": 0.8746127486228943,
"eval_runtime": 15.0604,
"eval_samples_per_second": 430.931,
"eval_steps_per_second": 13.479,
"step": 250
},
{
"epoch": 0.15086525661884337,
"grad_norm": 0.14568747580051422,
"learning_rate": 0.0004137263275614684,
"loss": 0.8886,
"step": 255
},
{
"epoch": 0.15382339890548735,
"grad_norm": 0.15014733374118805,
"learning_rate": 0.00041365973820521053,
"loss": 0.8922,
"step": 260
},
{
"epoch": 0.15678154119213134,
"grad_norm": 0.15133148431777954,
"learning_rate": 0.00041359102888369024,
"loss": 0.8966,
"step": 265
},
{
"epoch": 0.15973968347877532,
"grad_norm": 0.15641948580741882,
"learning_rate": 0.00041352020030300206,
"loss": 0.8962,
"step": 270
},
{
"epoch": 0.16269782576541933,
"grad_norm": 0.146999329328537,
"learning_rate": 0.0004134472531910193,
"loss": 0.8973,
"step": 275
},
{
"epoch": 0.1656559680520633,
"grad_norm": 0.15431025624275208,
"learning_rate": 0.0004133721882973865,
"loss": 0.871,
"step": 280
},
{
"epoch": 0.1686141103387073,
"grad_norm": 0.15721946954727173,
"learning_rate": 0.00041329500639351136,
"loss": 0.8848,
"step": 285
},
{
"epoch": 0.17157225262535128,
"grad_norm": 0.15370066463947296,
"learning_rate": 0.0004132157082725574,
"loss": 0.8834,
"step": 290
},
{
"epoch": 0.17453039491199526,
"grad_norm": 0.15350687503814697,
"learning_rate": 0.00041313429474943564,
"loss": 0.8739,
"step": 295
},
{
"epoch": 0.17748853719863925,
"grad_norm": 0.1491025686264038,
"learning_rate": 0.000413050766660796,
"loss": 0.872,
"step": 300
},
{
"epoch": 0.17748853719863925,
"eval_loss": 0.8620766997337341,
"eval_runtime": 15.2152,
"eval_samples_per_second": 426.548,
"eval_steps_per_second": 13.342,
"step": 300
},
{
"epoch": 0.18044667948528323,
"grad_norm": 0.15806402266025543,
"learning_rate": 0.00041296512486501866,
"loss": 0.8851,
"step": 305
},
{
"epoch": 0.18340482177192724,
"grad_norm": 0.15328341722488403,
"learning_rate": 0.0004128773702422057,
"loss": 0.8724,
"step": 310
},
{
"epoch": 0.18636296405857122,
"grad_norm": 0.15714845061302185,
"learning_rate": 0.00041278750369417157,
"loss": 0.8892,
"step": 315
},
{
"epoch": 0.1893211063452152,
"grad_norm": 0.15943311154842377,
"learning_rate": 0.0004126955261444342,
"loss": 0.8683,
"step": 320
},
{
"epoch": 0.1922792486318592,
"grad_norm": 0.1572256088256836,
"learning_rate": 0.00041260143853820517,
"loss": 0.8645,
"step": 325
},
{
"epoch": 0.19523739091850317,
"grad_norm": 0.158464714884758,
"learning_rate": 0.0004125052418423802,
"loss": 0.8634,
"step": 330
},
{
"epoch": 0.19819553320514716,
"grad_norm": 0.15009784698486328,
"learning_rate": 0.0004124069370455292,
"loss": 0.8579,
"step": 335
},
{
"epoch": 0.20115367549179117,
"grad_norm": 0.15192705392837524,
"learning_rate": 0.00041230652515788596,
"loss": 0.8696,
"step": 340
},
{
"epoch": 0.20411181777843515,
"grad_norm": 0.1550079882144928,
"learning_rate": 0.0004122040072113381,
"loss": 0.8438,
"step": 345
},
{
"epoch": 0.20706996006507913,
"grad_norm": 0.14868152141571045,
"learning_rate": 0.00041209938425941614,
"loss": 0.8533,
"step": 350
},
{
"epoch": 0.20706996006507913,
"eval_loss": 0.8512822389602661,
"eval_runtime": 15.4431,
"eval_samples_per_second": 420.252,
"eval_steps_per_second": 13.145,
"step": 350
},
{
"epoch": 0.21002810235172312,
"grad_norm": 0.16069160401821136,
"learning_rate": 0.0004119926573772827,
"loss": 0.8731,
"step": 355
},
{
"epoch": 0.2129862446383671,
"grad_norm": 0.15190179646015167,
"learning_rate": 0.00041188382766172164,
"loss": 0.8707,
"step": 360
},
{
"epoch": 0.21594438692501108,
"grad_norm": 0.15935535728931427,
"learning_rate": 0.0004117728962311268,
"loss": 0.8414,
"step": 365
},
{
"epoch": 0.21890252921165507,
"grad_norm": 0.14721055328845978,
"learning_rate": 0.00041165986422549004,
"loss": 0.853,
"step": 370
},
{
"epoch": 0.22186067149829908,
"grad_norm": 0.15762685239315033,
"learning_rate": 0.0004115447328063903,
"loss": 0.8732,
"step": 375
},
{
"epoch": 0.22481881378494306,
"grad_norm": 0.15402108430862427,
"learning_rate": 0.00041142750315698094,
"loss": 0.8595,
"step": 380
},
{
"epoch": 0.22777695607158704,
"grad_norm": 0.15774597227573395,
"learning_rate": 0.000411308176481978,
"loss": 0.8621,
"step": 385
},
{
"epoch": 0.23073509835823103,
"grad_norm": 0.1595357209444046,
"learning_rate": 0.00041118675400764773,
"loss": 0.8694,
"step": 390
},
{
"epoch": 0.233693240644875,
"grad_norm": 0.14985965192317963,
"learning_rate": 0.000411063236981794,
"loss": 0.8526,
"step": 395
},
{
"epoch": 0.236651382931519,
"grad_norm": 0.15028232336044312,
"learning_rate": 0.0004109376266737452,
"loss": 0.8737,
"step": 400
},
{
"epoch": 0.236651382931519,
"eval_loss": 0.8394450545310974,
"eval_runtime": 15.1118,
"eval_samples_per_second": 429.467,
"eval_steps_per_second": 13.433,
"step": 400
},
{
"epoch": 0.239609525218163,
"grad_norm": 0.1572600156068802,
"learning_rate": 0.00041080992437434155,
"loss": 0.8643,
"step": 405
},
{
"epoch": 0.242567667504807,
"grad_norm": 0.15349853038787842,
"learning_rate": 0.00041068013139592194,
"loss": 0.8768,
"step": 410
},
{
"epoch": 0.24552580979145097,
"grad_norm": 0.15303458273410797,
"learning_rate": 0.00041054824907231,
"loss": 0.8491,
"step": 415
},
{
"epoch": 0.24848395207809496,
"grad_norm": 0.15635432302951813,
"learning_rate": 0.0004104142787588005,
"loss": 0.8489,
"step": 420
},
{
"epoch": 0.25144209436473897,
"grad_norm": 0.1505730003118515,
"learning_rate": 0.00041027822183214595,
"loss": 0.8538,
"step": 425
},
{
"epoch": 0.25440023665138295,
"grad_norm": 0.1578625589609146,
"learning_rate": 0.0004101400796905416,
"loss": 0.8533,
"step": 430
},
{
"epoch": 0.25735837893802693,
"grad_norm": 0.15344196557998657,
"learning_rate": 0.0004099998537536117,
"loss": 0.8577,
"step": 435
},
{
"epoch": 0.2603165212246709,
"grad_norm": 0.15440982580184937,
"learning_rate": 0.0004098575454623947,
"loss": 0.8494,
"step": 440
},
{
"epoch": 0.2632746635113149,
"grad_norm": 0.15573708713054657,
"learning_rate": 0.0004097131562793286,
"loss": 0.8471,
"step": 445
},
{
"epoch": 0.2662328057979589,
"grad_norm": 0.1527046263217926,
"learning_rate": 0.0004095666876882355,
"loss": 0.8486,
"step": 450
},
{
"epoch": 0.2662328057979589,
"eval_loss": 0.8319525718688965,
"eval_runtime": 15.1071,
"eval_samples_per_second": 429.599,
"eval_steps_per_second": 13.437,
"step": 450
},
{
"epoch": 0.26919094808460287,
"grad_norm": 0.1534292846918106,
"learning_rate": 0.00040941814119430694,
"loss": 0.8488,
"step": 455
},
{
"epoch": 0.27214909037124685,
"grad_norm": 0.15478292107582092,
"learning_rate": 0.0004092675183240879,
"loss": 0.8442,
"step": 460
},
{
"epoch": 0.27510723265789083,
"grad_norm": 0.16151192784309387,
"learning_rate": 0.00040911482062546144,
"loss": 0.8569,
"step": 465
},
{
"epoch": 0.2780653749445348,
"grad_norm": 0.15042878687381744,
"learning_rate": 0.0004089600496676326,
"loss": 0.841,
"step": 470
},
{
"epoch": 0.2810235172311788,
"grad_norm": 0.1557908058166504,
"learning_rate": 0.0004088032070411125,
"loss": 0.8455,
"step": 475
},
{
"epoch": 0.2839816595178228,
"grad_norm": 0.15937361121177673,
"learning_rate": 0.00040864429435770184,
"loss": 0.8363,
"step": 480
},
{
"epoch": 0.2869398018044668,
"grad_norm": 0.15416064858436584,
"learning_rate": 0.0004084833132504743,
"loss": 0.851,
"step": 485
},
{
"epoch": 0.2898979440911108,
"grad_norm": 0.14673134684562683,
"learning_rate": 0.00040832026537375974,
"loss": 0.8293,
"step": 490
},
{
"epoch": 0.2928560863777548,
"grad_norm": 0.159574493765831,
"learning_rate": 0.0004081551524031274,
"loss": 0.8419,
"step": 495
},
{
"epoch": 0.29581422866439877,
"grad_norm": 0.15907645225524902,
"learning_rate": 0.0004079879760353685,
"loss": 0.8525,
"step": 500
},
{
"epoch": 0.29581422866439877,
"eval_loss": 0.8241714835166931,
"eval_runtime": 15.1548,
"eval_samples_per_second": 428.247,
"eval_steps_per_second": 13.395,
"step": 500
},
{
"epoch": 0.29877237095104275,
"grad_norm": 0.15710967779159546,
"learning_rate": 0.000407818737988479,
"loss": 0.829,
"step": 505
},
{
"epoch": 0.30173051323768674,
"grad_norm": 0.1684395968914032,
"learning_rate": 0.00040764744000164154,
"loss": 0.8652,
"step": 510
},
{
"epoch": 0.3046886555243307,
"grad_norm": 0.15995532274246216,
"learning_rate": 0.00040747408383520804,
"loss": 0.8387,
"step": 515
},
{
"epoch": 0.3076467978109747,
"grad_norm": 0.161375030875206,
"learning_rate": 0.00040729867127068135,
"loss": 0.8483,
"step": 520
},
{
"epoch": 0.3106049400976187,
"grad_norm": 0.15752190351486206,
"learning_rate": 0.0004071212041106969,
"loss": 0.8366,
"step": 525
},
{
"epoch": 0.31356308238426267,
"grad_norm": 0.15867403149604797,
"learning_rate": 0.00040694168417900443,
"loss": 0.8411,
"step": 530
},
{
"epoch": 0.31652122467090665,
"grad_norm": 0.1642945110797882,
"learning_rate": 0.0004067601133204489,
"loss": 0.8401,
"step": 535
},
{
"epoch": 0.31947936695755064,
"grad_norm": 0.15721701085567474,
"learning_rate": 0.0004065764934009518,
"loss": 0.8392,
"step": 540
},
{
"epoch": 0.3224375092441946,
"grad_norm": 0.1723637729883194,
"learning_rate": 0.000406390826307492,
"loss": 0.8339,
"step": 545
},
{
"epoch": 0.32539565153083866,
"grad_norm": 0.16064327955245972,
"learning_rate": 0.00040620311394808616,
"loss": 0.833,
"step": 550
},
{
"epoch": 0.32539565153083866,
"eval_loss": 0.8170909881591797,
"eval_runtime": 15.1491,
"eval_samples_per_second": 428.409,
"eval_steps_per_second": 13.4,
"step": 550
},
{
"epoch": 0.32835379381748264,
"grad_norm": 0.15267042815685272,
"learning_rate": 0.0004060133582517691,
"loss": 0.8436,
"step": 555
},
{
"epoch": 0.3313119361041266,
"grad_norm": 0.16683924198150635,
"learning_rate": 0.00040582156116857423,
"loss": 0.8494,
"step": 560
},
{
"epoch": 0.3342700783907706,
"grad_norm": 0.16277329623699188,
"learning_rate": 0.0004056277246695134,
"loss": 0.853,
"step": 565
},
{
"epoch": 0.3372282206774146,
"grad_norm": 0.16534163057804108,
"learning_rate": 0.00040543185074655647,
"loss": 0.8282,
"step": 570
},
{
"epoch": 0.3401863629640586,
"grad_norm": 0.1668074131011963,
"learning_rate": 0.00040523394141261113,
"loss": 0.8288,
"step": 575
},
{
"epoch": 0.34314450525070256,
"grad_norm": 0.1653686761856079,
"learning_rate": 0.0004050339987015018,
"loss": 0.842,
"step": 580
},
{
"epoch": 0.34610264753734654,
"grad_norm": 0.15694311261177063,
"learning_rate": 0.00040483202466794953,
"loss": 0.8321,
"step": 585
},
{
"epoch": 0.3490607898239905,
"grad_norm": 0.16013135015964508,
"learning_rate": 0.00040462802138754975,
"loss": 0.83,
"step": 590
},
{
"epoch": 0.3520189321106345,
"grad_norm": 0.1565493494272232,
"learning_rate": 0.00040442199095675185,
"loss": 0.829,
"step": 595
},
{
"epoch": 0.3549770743972785,
"grad_norm": 0.15936292707920074,
"learning_rate": 0.00040421393549283733,
"loss": 0.8292,
"step": 600
},
{
"epoch": 0.3549770743972785,
"eval_loss": 0.8113046288490295,
"eval_runtime": 15.0797,
"eval_samples_per_second": 430.381,
"eval_steps_per_second": 13.462,
"step": 600
},
{
"epoch": 0.3579352166839225,
"grad_norm": 0.1653825342655182,
"learning_rate": 0.00040400385713389793,
"loss": 0.8255,
"step": 605
},
{
"epoch": 0.36089335897056646,
"grad_norm": 0.1677619218826294,
"learning_rate": 0.00040379175803881387,
"loss": 0.834,
"step": 610
},
{
"epoch": 0.3638515012572105,
"grad_norm": 0.17027856409549713,
"learning_rate": 0.0004035776403872316,
"loss": 0.8374,
"step": 615
},
{
"epoch": 0.3668096435438545,
"grad_norm": 0.17102032899856567,
"learning_rate": 0.0004033615063795411,
"loss": 0.8299,
"step": 620
},
{
"epoch": 0.36976778583049846,
"grad_norm": 0.16256418824195862,
"learning_rate": 0.00040314335823685377,
"loss": 0.8367,
"step": 625
},
{
"epoch": 0.37272592811714245,
"grad_norm": 0.15656936168670654,
"learning_rate": 0.00040292319820097936,
"loss": 0.8276,
"step": 630
},
{
"epoch": 0.37568407040378643,
"grad_norm": 0.16188517212867737,
"learning_rate": 0.0004027010285344028,
"loss": 0.8303,
"step": 635
},
{
"epoch": 0.3786422126904304,
"grad_norm": 0.15468132495880127,
"learning_rate": 0.00040247685152026123,
"loss": 0.8257,
"step": 640
},
{
"epoch": 0.3816003549770744,
"grad_norm": 0.16564877331256866,
"learning_rate": 0.0004022506694623202,
"loss": 0.8388,
"step": 645
},
{
"epoch": 0.3845584972637184,
"grad_norm": 0.15713313221931458,
"learning_rate": 0.0004020224846849505,
"loss": 0.8214,
"step": 650
},
{
"epoch": 0.3845584972637184,
"eval_loss": 0.8043718934059143,
"eval_runtime": 15.1261,
"eval_samples_per_second": 429.061,
"eval_steps_per_second": 13.421,
"step": 650
},
{
"epoch": 0.38751663955036236,
"grad_norm": 0.15846404433250427,
"learning_rate": 0.0004017922995331036,
"loss": 0.8492,
"step": 655
},
{
"epoch": 0.39047478183700635,
"grad_norm": 0.16053859889507294,
"learning_rate": 0.0004015601163722882,
"loss": 0.8208,
"step": 660
},
{
"epoch": 0.39343292412365033,
"grad_norm": 0.16480772197246552,
"learning_rate": 0.00040132593758854544,
"loss": 0.8314,
"step": 665
},
{
"epoch": 0.3963910664102943,
"grad_norm": 0.16306859254837036,
"learning_rate": 0.00040108976558842467,
"loss": 0.8336,
"step": 670
},
{
"epoch": 0.3993492086969383,
"grad_norm": 0.15954424440860748,
"learning_rate": 0.00040085160279895856,
"loss": 0.8325,
"step": 675
},
{
"epoch": 0.40230735098358233,
"grad_norm": 0.16945534944534302,
"learning_rate": 0.0004006114516676383,
"loss": 0.8283,
"step": 680
},
{
"epoch": 0.4052654932702263,
"grad_norm": 0.16391772031784058,
"learning_rate": 0.00040036931466238835,
"loss": 0.8321,
"step": 685
},
{
"epoch": 0.4082236355568703,
"grad_norm": 0.16165830194950104,
"learning_rate": 0.0004001251942715411,
"loss": 0.8477,
"step": 690
},
{
"epoch": 0.4111817778435143,
"grad_norm": 0.15972331166267395,
"learning_rate": 0.00039987909300381115,
"loss": 0.8187,
"step": 695
},
{
"epoch": 0.41413992013015827,
"grad_norm": 0.16436176002025604,
"learning_rate": 0.00039963101338826994,
"loss": 0.8277,
"step": 700
},
{
"epoch": 0.41413992013015827,
"eval_loss": 0.7969197630882263,
"eval_runtime": 15.1288,
"eval_samples_per_second": 428.984,
"eval_steps_per_second": 13.418,
"step": 700
},
{
"epoch": 0.41709806241680225,
"grad_norm": 0.16101345419883728,
"learning_rate": 0.0003993809579743193,
"loss": 0.8236,
"step": 705
},
{
"epoch": 0.42005620470344623,
"grad_norm": 0.16163453459739685,
"learning_rate": 0.00039912892933166545,
"loss": 0.8234,
"step": 710
},
{
"epoch": 0.4230143469900902,
"grad_norm": 0.1685408502817154,
"learning_rate": 0.00039887493005029266,
"loss": 0.8164,
"step": 715
},
{
"epoch": 0.4259724892767342,
"grad_norm": 0.16631852090358734,
"learning_rate": 0.00039861896274043663,
"loss": 0.8306,
"step": 720
},
{
"epoch": 0.4289306315633782,
"grad_norm": 0.15881124138832092,
"learning_rate": 0.0003983610300325574,
"loss": 0.8279,
"step": 725
},
{
"epoch": 0.43188877385002217,
"grad_norm": 0.1685400754213333,
"learning_rate": 0.0003981011345773126,
"loss": 0.8261,
"step": 730
},
{
"epoch": 0.43484691613666615,
"grad_norm": 0.16085389256477356,
"learning_rate": 0.0003978392790455303,
"loss": 0.8192,
"step": 735
},
{
"epoch": 0.43780505842331013,
"grad_norm": 0.15905866026878357,
"learning_rate": 0.0003975754661281811,
"loss": 0.8139,
"step": 740
},
{
"epoch": 0.4407632007099542,
"grad_norm": 0.16905803978443146,
"learning_rate": 0.00039730969853635093,
"loss": 0.8241,
"step": 745
},
{
"epoch": 0.44372134299659816,
"grad_norm": 0.1739426702260971,
"learning_rate": 0.000397041979001213,
"loss": 0.8001,
"step": 750
},
{
"epoch": 0.44372134299659816,
"eval_loss": 0.791688859462738,
"eval_runtime": 15.0867,
"eval_samples_per_second": 430.18,
"eval_steps_per_second": 13.456,
"step": 750
},
{
"epoch": 0.44667948528324214,
"grad_norm": 0.16287444531917572,
"learning_rate": 0.0003967723102739998,
"loss": 0.8244,
"step": 755
},
{
"epoch": 0.4496376275698861,
"grad_norm": 0.17151105403900146,
"learning_rate": 0.00039650069512597473,
"loss": 0.8309,
"step": 760
},
{
"epoch": 0.4525957698565301,
"grad_norm": 0.16484029591083527,
"learning_rate": 0.0003962271363484036,
"loss": 0.8193,
"step": 765
},
{
"epoch": 0.4555539121431741,
"grad_norm": 0.16619396209716797,
"learning_rate": 0.0003959516367525262,
"loss": 0.7922,
"step": 770
},
{
"epoch": 0.4585120544298181,
"grad_norm": 0.1678115576505661,
"learning_rate": 0.00039567419916952706,
"loss": 0.8085,
"step": 775
},
{
"epoch": 0.46147019671646206,
"grad_norm": 0.1662498414516449,
"learning_rate": 0.00039539482645050664,
"loss": 0.8095,
"step": 780
},
{
"epoch": 0.46442833900310604,
"grad_norm": 0.17038890719413757,
"learning_rate": 0.0003951135214664519,
"loss": 0.8373,
"step": 785
},
{
"epoch": 0.46738648128975,
"grad_norm": 0.16738362610340118,
"learning_rate": 0.0003948302871082067,
"loss": 0.8164,
"step": 790
},
{
"epoch": 0.470344623576394,
"grad_norm": 0.16601060330867767,
"learning_rate": 0.0003945451262864425,
"loss": 0.8161,
"step": 795
},
{
"epoch": 0.473302765863038,
"grad_norm": 0.16971147060394287,
"learning_rate": 0.00039425804193162774,
"loss": 0.7995,
"step": 800
},
{
"epoch": 0.473302765863038,
"eval_loss": 0.7893310785293579,
"eval_runtime": 15.0955,
"eval_samples_per_second": 429.929,
"eval_steps_per_second": 13.448,
"step": 800
},
{
"epoch": 0.476260908149682,
"grad_norm": 0.17346754670143127,
"learning_rate": 0.00039396903699399845,
"loss": 0.82,
"step": 805
},
{
"epoch": 0.479219050436326,
"grad_norm": 0.16586238145828247,
"learning_rate": 0.00039367811444352747,
"loss": 0.8057,
"step": 810
},
{
"epoch": 0.48217719272297,
"grad_norm": 0.16300995647907257,
"learning_rate": 0.0003933852772698941,
"loss": 0.8095,
"step": 815
},
{
"epoch": 0.485135335009614,
"grad_norm": 0.17610788345336914,
"learning_rate": 0.00039309052848245346,
"loss": 0.8277,
"step": 820
},
{
"epoch": 0.48809347729625796,
"grad_norm": 0.1696024090051651,
"learning_rate": 0.0003927938711102054,
"loss": 0.8172,
"step": 825
},
{
"epoch": 0.49105161958290194,
"grad_norm": 0.16828782856464386,
"learning_rate": 0.00039249530820176335,
"loss": 0.8249,
"step": 830
},
{
"epoch": 0.4940097618695459,
"grad_norm": 0.17049913108348846,
"learning_rate": 0.00039219484282532316,
"loss": 0.8145,
"step": 835
},
{
"epoch": 0.4969679041561899,
"grad_norm": 0.16307583451271057,
"learning_rate": 0.00039189247806863136,
"loss": 0.8137,
"step": 840
},
{
"epoch": 0.4999260464428339,
"grad_norm": 0.17322470247745514,
"learning_rate": 0.00039158821703895387,
"loss": 0.8133,
"step": 845
},
{
"epoch": 0.5028841887294779,
"grad_norm": 0.16524559259414673,
"learning_rate": 0.0003912820628630433,
"loss": 0.8153,
"step": 850
},
{
"epoch": 0.5028841887294779,
"eval_loss": 0.7836877703666687,
"eval_runtime": 15.0882,
"eval_samples_per_second": 430.136,
"eval_steps_per_second": 13.454,
"step": 850
},
{
"epoch": 0.5058423310161219,
"grad_norm": 0.16517767310142517,
"learning_rate": 0.0003909740186871077,
"loss": 0.7993,
"step": 855
},
{
"epoch": 0.5088004733027659,
"grad_norm": 0.16940750181674957,
"learning_rate": 0.0003906640876767774,
"loss": 0.8145,
"step": 860
},
{
"epoch": 0.5117586155894098,
"grad_norm": 0.17082969844341278,
"learning_rate": 0.00039035227301707315,
"loss": 0.8084,
"step": 865
},
{
"epoch": 0.5147167578760539,
"grad_norm": 0.16588161885738373,
"learning_rate": 0.000390038577912373,
"loss": 0.7991,
"step": 870
},
{
"epoch": 0.5176749001626978,
"grad_norm": 0.1763608604669571,
"learning_rate": 0.0003897230055863795,
"loss": 0.8118,
"step": 875
},
{
"epoch": 0.5206330424493418,
"grad_norm": 0.16826897859573364,
"learning_rate": 0.00038940555928208674,
"loss": 0.8041,
"step": 880
},
{
"epoch": 0.5235911847359858,
"grad_norm": 0.17069588601589203,
"learning_rate": 0.00038908624226174633,
"loss": 0.8186,
"step": 885
},
{
"epoch": 0.5265493270226298,
"grad_norm": 0.16954410076141357,
"learning_rate": 0.000388765057806835,
"loss": 0.8085,
"step": 890
},
{
"epoch": 0.5295074693092737,
"grad_norm": 0.17242303490638733,
"learning_rate": 0.00038844200921801976,
"loss": 0.8158,
"step": 895
},
{
"epoch": 0.5324656115959178,
"grad_norm": 0.1635177582502365,
"learning_rate": 0.0003881170998151248,
"loss": 0.7943,
"step": 900
},
{
"epoch": 0.5324656115959178,
"eval_loss": 0.7797777652740479,
"eval_runtime": 15.1642,
"eval_samples_per_second": 427.983,
"eval_steps_per_second": 13.387,
"step": 900
},
{
"epoch": 0.5354237538825618,
"grad_norm": 0.16979654133319855,
"learning_rate": 0.00038779033293709694,
"loss": 0.82,
"step": 905
},
{
"epoch": 0.5383818961692057,
"grad_norm": 0.1702311635017395,
"learning_rate": 0.0003874617119419714,
"loss": 0.8196,
"step": 910
},
{
"epoch": 0.5413400384558498,
"grad_norm": 0.16838914155960083,
"learning_rate": 0.00038713124020683736,
"loss": 0.8044,
"step": 915
},
{
"epoch": 0.5442981807424937,
"grad_norm": 0.16786698997020721,
"learning_rate": 0.00038679892112780315,
"loss": 0.8109,
"step": 920
},
{
"epoch": 0.5472563230291377,
"grad_norm": 0.16810354590415955,
"learning_rate": 0.0003864647581199616,
"loss": 0.7934,
"step": 925
},
{
"epoch": 0.5502144653157817,
"grad_norm": 0.18060894310474396,
"learning_rate": 0.00038612875461735457,
"loss": 0.8181,
"step": 930
},
{
"epoch": 0.5531726076024257,
"grad_norm": 0.17606692016124725,
"learning_rate": 0.00038579091407293784,
"loss": 0.82,
"step": 935
},
{
"epoch": 0.5561307498890696,
"grad_norm": 0.16881082952022552,
"learning_rate": 0.0003854512399585459,
"loss": 0.8165,
"step": 940
},
{
"epoch": 0.5590888921757137,
"grad_norm": 0.1732224076986313,
"learning_rate": 0.0003851097357648557,
"loss": 0.8104,
"step": 945
},
{
"epoch": 0.5620470344623576,
"grad_norm": 0.1755875200033188,
"learning_rate": 0.0003847664050013512,
"loss": 0.8037,
"step": 950
},
{
"epoch": 0.5620470344623576,
"eval_loss": 0.7749656438827515,
"eval_runtime": 15.1327,
"eval_samples_per_second": 428.871,
"eval_steps_per_second": 13.415,
"step": 950
},
{
"epoch": 0.5650051767490016,
"grad_norm": 0.1778503805398941,
"learning_rate": 0.00038442125119628727,
"loss": 0.8009,
"step": 955
},
{
"epoch": 0.5679633190356456,
"grad_norm": 0.1720447540283203,
"learning_rate": 0.0003840742778966532,
"loss": 0.8054,
"step": 960
},
{
"epoch": 0.5709214613222896,
"grad_norm": 0.1675298660993576,
"learning_rate": 0.0003837254886681367,
"loss": 0.8059,
"step": 965
},
{
"epoch": 0.5738796036089336,
"grad_norm": 0.16319267451763153,
"learning_rate": 0.0003833748870950865,
"loss": 0.8017,
"step": 970
},
{
"epoch": 0.5768377458955776,
"grad_norm": 0.1670883148908615,
"learning_rate": 0.0003830224767804762,
"loss": 0.7935,
"step": 975
},
{
"epoch": 0.5797958881822216,
"grad_norm": 0.17353768646717072,
"learning_rate": 0.00038266826134586727,
"loss": 0.8116,
"step": 980
},
{
"epoch": 0.5827540304688655,
"grad_norm": 0.180403470993042,
"learning_rate": 0.0003823122444313713,
"loss": 0.8087,
"step": 985
},
{
"epoch": 0.5857121727555096,
"grad_norm": 0.17455299198627472,
"learning_rate": 0.00038195442969561275,
"loss": 0.8041,
"step": 990
},
{
"epoch": 0.5886703150421535,
"grad_norm": 0.1729428619146347,
"learning_rate": 0.0003815948208156917,
"loss": 0.7989,
"step": 995
},
{
"epoch": 0.5916284573287975,
"grad_norm": 0.16548192501068115,
"learning_rate": 0.00038123342148714594,
"loss": 0.8095,
"step": 1000
},
{
"epoch": 0.5916284573287975,
"eval_loss": 0.7690043449401855,
"eval_runtime": 15.0799,
"eval_samples_per_second": 430.376,
"eval_steps_per_second": 13.462,
"step": 1000
},
{
"epoch": 0.5945865996154415,
"grad_norm": 0.1718030571937561,
"learning_rate": 0.0003808702354239126,
"loss": 0.8028,
"step": 1005
},
{
"epoch": 0.5975447419020855,
"grad_norm": 0.1787232607603073,
"learning_rate": 0.00038050526635829035,
"loss": 0.806,
"step": 1010
},
{
"epoch": 0.6005028841887294,
"grad_norm": 0.16556113958358765,
"learning_rate": 0.0003801385180409012,
"loss": 0.8054,
"step": 1015
},
{
"epoch": 0.6034610264753735,
"grad_norm": 0.19446386396884918,
"learning_rate": 0.00037976999424065147,
"loss": 0.8107,
"step": 1020
},
{
"epoch": 0.6064191687620174,
"grad_norm": 0.17038469016551971,
"learning_rate": 0.0003793996987446934,
"loss": 0.7748,
"step": 1025
},
{
"epoch": 0.6093773110486614,
"grad_norm": 0.1731182187795639,
"learning_rate": 0.00037902763535838606,
"loss": 0.8054,
"step": 1030
},
{
"epoch": 0.6123354533353055,
"grad_norm": 0.1793605238199234,
"learning_rate": 0.0003786538079052565,
"loss": 0.8161,
"step": 1035
},
{
"epoch": 0.6152935956219494,
"grad_norm": 0.174575075507164,
"learning_rate": 0.0003782782202269602,
"loss": 0.8056,
"step": 1040
},
{
"epoch": 0.6182517379085934,
"grad_norm": 0.17301982641220093,
"learning_rate": 0.0003779008761832416,
"loss": 0.8057,
"step": 1045
},
{
"epoch": 0.6212098801952374,
"grad_norm": 0.1758825033903122,
"learning_rate": 0.0003775217796518946,
"loss": 0.7916,
"step": 1050
},
{
"epoch": 0.6212098801952374,
"eval_loss": 0.7642711997032166,
"eval_runtime": 15.0843,
"eval_samples_per_second": 430.249,
"eval_steps_per_second": 13.458,
"step": 1050
},
{
"epoch": 0.6241680224818814,
"grad_norm": 0.1691800355911255,
"learning_rate": 0.0003771409345287227,
"loss": 0.7948,
"step": 1055
},
{
"epoch": 0.6271261647685253,
"grad_norm": 0.167150616645813,
"learning_rate": 0.0003767583447274987,
"loss": 0.812,
"step": 1060
},
{
"epoch": 0.6300843070551694,
"grad_norm": 0.18501031398773193,
"learning_rate": 0.00037637401417992477,
"loss": 0.8015,
"step": 1065
},
{
"epoch": 0.6330424493418133,
"grad_norm": 0.17512960731983185,
"learning_rate": 0.0003759879468355919,
"loss": 0.8057,
"step": 1070
},
{
"epoch": 0.6360005916284573,
"grad_norm": 0.1902162879705429,
"learning_rate": 0.0003756001466619395,
"loss": 0.8044,
"step": 1075
},
{
"epoch": 0.6389587339151013,
"grad_norm": 0.17907238006591797,
"learning_rate": 0.0003752106176442142,
"loss": 0.8126,
"step": 1080
},
{
"epoch": 0.6419168762017453,
"grad_norm": 0.1738317906856537,
"learning_rate": 0.00037481936378542944,
"loss": 0.805,
"step": 1085
},
{
"epoch": 0.6448750184883892,
"grad_norm": 0.18125277757644653,
"learning_rate": 0.00037442638910632406,
"loss": 0.8205,
"step": 1090
},
{
"epoch": 0.6478331607750333,
"grad_norm": 0.16496701538562775,
"learning_rate": 0.00037403169764532073,
"loss": 0.8009,
"step": 1095
},
{
"epoch": 0.6507913030616773,
"grad_norm": 0.17556807398796082,
"learning_rate": 0.000373635293458485,
"loss": 0.814,
"step": 1100
},
{
"epoch": 0.6507913030616773,
"eval_loss": 0.7593667507171631,
"eval_runtime": 15.0495,
"eval_samples_per_second": 431.243,
"eval_steps_per_second": 13.489,
"step": 1100
},
{
"epoch": 0.6537494453483212,
"grad_norm": 0.17472174763679504,
"learning_rate": 0.00037323718061948313,
"loss": 0.8207,
"step": 1105
},
{
"epoch": 0.6567075876349653,
"grad_norm": 0.17784008383750916,
"learning_rate": 0.0003728373632195406,
"loss": 0.7911,
"step": 1110
},
{
"epoch": 0.6596657299216092,
"grad_norm": 0.17336086928844452,
"learning_rate": 0.00037243584536739973,
"loss": 0.8007,
"step": 1115
},
{
"epoch": 0.6626238722082533,
"grad_norm": 0.17311853170394897,
"learning_rate": 0.00037203263118927777,
"loss": 0.8016,
"step": 1120
},
{
"epoch": 0.6655820144948972,
"grad_norm": 0.1790124475955963,
"learning_rate": 0.00037162772482882416,
"loss": 0.8022,
"step": 1125
},
{
"epoch": 0.6685401567815412,
"grad_norm": 0.175547793507576,
"learning_rate": 0.0003712211304470783,
"loss": 0.8097,
"step": 1130
},
{
"epoch": 0.6714982990681851,
"grad_norm": 0.18130792677402496,
"learning_rate": 0.00037081285222242646,
"loss": 0.8033,
"step": 1135
},
{
"epoch": 0.6744564413548292,
"grad_norm": 0.1801801323890686,
"learning_rate": 0.0003704028943505593,
"loss": 0.8023,
"step": 1140
},
{
"epoch": 0.6774145836414731,
"grad_norm": 0.18148784339427948,
"learning_rate": 0.000369991261044428,
"loss": 0.7823,
"step": 1145
},
{
"epoch": 0.6803727259281172,
"grad_norm": 0.18274278938770294,
"learning_rate": 0.0003695779565342018,
"loss": 0.796,
"step": 1150
},
{
"epoch": 0.6803727259281172,
"eval_loss": 0.7549857497215271,
"eval_runtime": 15.0502,
"eval_samples_per_second": 431.224,
"eval_steps_per_second": 13.488,
"step": 1150
},
{
"epoch": 0.6833308682147611,
"grad_norm": 0.1774689257144928,
"learning_rate": 0.000369162985067224,
"loss": 0.7964,
"step": 1155
},
{
"epoch": 0.6862890105014051,
"grad_norm": 0.17815682291984558,
"learning_rate": 0.00036874635090796846,
"loss": 0.7863,
"step": 1160
},
{
"epoch": 0.6892471527880492,
"grad_norm": 0.1769675314426422,
"learning_rate": 0.00036832805833799585,
"loss": 0.8083,
"step": 1165
},
{
"epoch": 0.6922052950746931,
"grad_norm": 0.1735963374376297,
"learning_rate": 0.0003679081116559094,
"loss": 0.7853,
"step": 1170
},
{
"epoch": 0.6951634373613371,
"grad_norm": 0.17244546115398407,
"learning_rate": 0.0003674865151773111,
"loss": 0.784,
"step": 1175
},
{
"epoch": 0.698121579647981,
"grad_norm": 0.1718629151582718,
"learning_rate": 0.00036706327323475713,
"loss": 0.7971,
"step": 1180
},
{
"epoch": 0.7010797219346251,
"grad_norm": 0.18205584585666656,
"learning_rate": 0.0003666383901777132,
"loss": 0.771,
"step": 1185
},
{
"epoch": 0.704037864221269,
"grad_norm": 0.18081381916999817,
"learning_rate": 0.00036621187037251003,
"loss": 0.7801,
"step": 1190
},
{
"epoch": 0.706996006507913,
"grad_norm": 0.18197500705718994,
"learning_rate": 0.00036578371820229874,
"loss": 0.805,
"step": 1195
},
{
"epoch": 0.709954148794557,
"grad_norm": 0.17939776182174683,
"learning_rate": 0.0003653539380670052,
"loss": 0.7792,
"step": 1200
},
{
"epoch": 0.709954148794557,
"eval_loss": 0.7517691254615784,
"eval_runtime": 15.0963,
"eval_samples_per_second": 429.906,
"eval_steps_per_second": 13.447,
"step": 1200
},
{
"epoch": 0.712912291081201,
"grad_norm": 0.19871175289154053,
"learning_rate": 0.0003649225343832853,
"loss": 0.8001,
"step": 1205
},
{
"epoch": 0.715870433367845,
"grad_norm": 0.1700190007686615,
"learning_rate": 0.0003644895115844793,
"loss": 0.7888,
"step": 1210
},
{
"epoch": 0.718828575654489,
"grad_norm": 0.1822744756937027,
"learning_rate": 0.0003640548741205665,
"loss": 0.7745,
"step": 1215
},
{
"epoch": 0.7217867179411329,
"grad_norm": 0.1826736479997635,
"learning_rate": 0.00036361862645811933,
"loss": 0.7822,
"step": 1220
},
{
"epoch": 0.724744860227777,
"grad_norm": 0.18002553284168243,
"learning_rate": 0.0003631807730802575,
"loss": 0.7809,
"step": 1225
},
{
"epoch": 0.727703002514421,
"grad_norm": 0.1772574633359909,
"learning_rate": 0.0003627413184866018,
"loss": 0.7874,
"step": 1230
},
{
"epoch": 0.7306611448010649,
"grad_norm": 0.2032460868358612,
"learning_rate": 0.00036230026719322834,
"loss": 0.7983,
"step": 1235
},
{
"epoch": 0.733619287087709,
"grad_norm": 0.17761647701263428,
"learning_rate": 0.0003618576237326213,
"loss": 0.7991,
"step": 1240
},
{
"epoch": 0.7365774293743529,
"grad_norm": 0.17456114292144775,
"learning_rate": 0.0003614133926536273,
"loss": 0.7882,
"step": 1245
},
{
"epoch": 0.7395355716609969,
"grad_norm": 0.17203833162784576,
"learning_rate": 0.00036096757852140804,
"loss": 0.7704,
"step": 1250
},
{
"epoch": 0.7395355716609969,
"eval_loss": 0.7466259002685547,
"eval_runtime": 15.0614,
"eval_samples_per_second": 430.902,
"eval_steps_per_second": 13.478,
"step": 1250
},
{
"epoch": 0.7424937139476409,
"grad_norm": 0.1762530505657196,
"learning_rate": 0.00036052018591739327,
"loss": 0.7914,
"step": 1255
},
{
"epoch": 0.7454518562342849,
"grad_norm": 0.18494775891304016,
"learning_rate": 0.00036007121943923436,
"loss": 0.7836,
"step": 1260
},
{
"epoch": 0.7484099985209288,
"grad_norm": 0.1850501149892807,
"learning_rate": 0.0003596206837007565,
"loss": 0.7903,
"step": 1265
},
{
"epoch": 0.7513681408075729,
"grad_norm": 0.18491573631763458,
"learning_rate": 0.0003591685833319115,
"loss": 0.7793,
"step": 1270
},
{
"epoch": 0.7543262830942168,
"grad_norm": 0.17476260662078857,
"learning_rate": 0.0003587149229787301,
"loss": 0.7846,
"step": 1275
},
{
"epoch": 0.7572844253808608,
"grad_norm": 0.1844286024570465,
"learning_rate": 0.00035825970730327437,
"loss": 0.7933,
"step": 1280
},
{
"epoch": 0.7602425676675048,
"grad_norm": 0.17650367319583893,
"learning_rate": 0.00035780294098358966,
"loss": 0.7769,
"step": 1285
},
{
"epoch": 0.7632007099541488,
"grad_norm": 0.18384352326393127,
"learning_rate": 0.0003573446287136567,
"loss": 0.794,
"step": 1290
},
{
"epoch": 0.7661588522407928,
"grad_norm": 0.16338300704956055,
"learning_rate": 0.0003568847752033431,
"loss": 0.7857,
"step": 1295
},
{
"epoch": 0.7691169945274368,
"grad_norm": 0.17690972983837128,
"learning_rate": 0.0003564233851783553,
"loss": 0.7754,
"step": 1300
},
{
"epoch": 0.7691169945274368,
"eval_loss": 0.7431380152702332,
"eval_runtime": 15.132,
"eval_samples_per_second": 428.891,
"eval_steps_per_second": 13.415,
"step": 1300
},
{
"epoch": 0.7720751368140808,
"grad_norm": 0.1823299676179886,
"learning_rate": 0.0003559604633801894,
"loss": 0.7862,
"step": 1305
},
{
"epoch": 0.7750332791007247,
"grad_norm": 0.1779462695121765,
"learning_rate": 0.00035549601456608343,
"loss": 0.7896,
"step": 1310
},
{
"epoch": 0.7779914213873688,
"grad_norm": 0.18377184867858887,
"learning_rate": 0.00035503004350896736,
"loss": 0.7925,
"step": 1315
},
{
"epoch": 0.7809495636740127,
"grad_norm": 0.1728079617023468,
"learning_rate": 0.00035456255499741483,
"loss": 0.7695,
"step": 1320
},
{
"epoch": 0.7839077059606567,
"grad_norm": 0.18434765934944153,
"learning_rate": 0.0003540935538355937,
"loss": 0.7758,
"step": 1325
},
{
"epoch": 0.7868658482473007,
"grad_norm": 0.17789992690086365,
"learning_rate": 0.00035362304484321634,
"loss": 0.7922,
"step": 1330
},
{
"epoch": 0.7898239905339447,
"grad_norm": 0.1814350187778473,
"learning_rate": 0.0003531510328554907,
"loss": 0.7799,
"step": 1335
},
{
"epoch": 0.7927821328205886,
"grad_norm": 0.17939285933971405,
"learning_rate": 0.00035267752272307037,
"loss": 0.7789,
"step": 1340
},
{
"epoch": 0.7957402751072327,
"grad_norm": 0.17399096488952637,
"learning_rate": 0.0003522025193120045,
"loss": 0.7764,
"step": 1345
},
{
"epoch": 0.7986984173938766,
"grad_norm": 0.1716017723083496,
"learning_rate": 0.0003517260275036881,
"loss": 0.7668,
"step": 1350
},
{
"epoch": 0.7986984173938766,
"eval_loss": 0.7402477860450745,
"eval_runtime": 15.0911,
"eval_samples_per_second": 430.055,
"eval_steps_per_second": 13.452,
"step": 1350
},
{
"epoch": 0.8016565596805206,
"grad_norm": 0.1866738647222519,
"learning_rate": 0.0003512480521948117,
"loss": 0.779,
"step": 1355
},
{
"epoch": 0.8046147019671647,
"grad_norm": 0.1841634213924408,
"learning_rate": 0.00035076859829731116,
"loss": 0.7792,
"step": 1360
},
{
"epoch": 0.8075728442538086,
"grad_norm": 0.18017052114009857,
"learning_rate": 0.0003502876707383171,
"loss": 0.7752,
"step": 1365
},
{
"epoch": 0.8105309865404526,
"grad_norm": 0.1775410771369934,
"learning_rate": 0.00034980527446010435,
"loss": 0.7654,
"step": 1370
},
{
"epoch": 0.8134891288270966,
"grad_norm": 0.17534510791301727,
"learning_rate": 0.00034932141442004086,
"loss": 0.7778,
"step": 1375
},
{
"epoch": 0.8164472711137406,
"grad_norm": 0.17751650512218475,
"learning_rate": 0.0003488360955905374,
"loss": 0.7792,
"step": 1380
},
{
"epoch": 0.8194054134003845,
"grad_norm": 0.18025827407836914,
"learning_rate": 0.0003483493229589956,
"loss": 0.777,
"step": 1385
},
{
"epoch": 0.8223635556870286,
"grad_norm": 0.18627671897411346,
"learning_rate": 0.0003478611015277576,
"loss": 0.7616,
"step": 1390
},
{
"epoch": 0.8253216979736725,
"grad_norm": 0.18397071957588196,
"learning_rate": 0.0003473714363140539,
"loss": 0.7775,
"step": 1395
},
{
"epoch": 0.8282798402603165,
"grad_norm": 0.17657072842121124,
"learning_rate": 0.0003468803323499522,
"loss": 0.7777,
"step": 1400
},
{
"epoch": 0.8282798402603165,
"eval_loss": 0.7356473803520203,
"eval_runtime": 15.1704,
"eval_samples_per_second": 427.807,
"eval_steps_per_second": 13.381,
"step": 1400
},
{
"epoch": 0.8312379825469605,
"grad_norm": 0.18247635662555695,
"learning_rate": 0.00034638779468230556,
"loss": 0.7741,
"step": 1405
},
{
"epoch": 0.8341961248336045,
"grad_norm": 0.18058709800243378,
"learning_rate": 0.0003458938283727006,
"loss": 0.7856,
"step": 1410
},
{
"epoch": 0.8371542671202484,
"grad_norm": 0.18204925954341888,
"learning_rate": 0.0003453984384974055,
"loss": 0.7684,
"step": 1415
},
{
"epoch": 0.8401124094068925,
"grad_norm": 0.17773495614528656,
"learning_rate": 0.0003449016301473176,
"loss": 0.7772,
"step": 1420
},
{
"epoch": 0.8430705516935365,
"grad_norm": 0.18899886310100555,
"learning_rate": 0.00034440340842791135,
"loss": 0.7828,
"step": 1425
},
{
"epoch": 0.8460286939801804,
"grad_norm": 0.18704986572265625,
"learning_rate": 0.00034390377845918584,
"loss": 0.8089,
"step": 1430
},
{
"epoch": 0.8489868362668245,
"grad_norm": 0.17881925404071808,
"learning_rate": 0.000343402745375612,
"loss": 0.7797,
"step": 1435
},
{
"epoch": 0.8519449785534684,
"grad_norm": 0.1758805811405182,
"learning_rate": 0.00034290031432608007,
"loss": 0.7846,
"step": 1440
},
{
"epoch": 0.8549031208401124,
"grad_norm": 0.17261724174022675,
"learning_rate": 0.0003423964904738463,
"loss": 0.7786,
"step": 1445
},
{
"epoch": 0.8578612631267564,
"grad_norm": 0.18959110975265503,
"learning_rate": 0.0003418912789964804,
"loss": 0.7595,
"step": 1450
},
{
"epoch": 0.8578612631267564,
"eval_loss": 0.7294892072677612,
"eval_runtime": 15.0942,
"eval_samples_per_second": 429.966,
"eval_steps_per_second": 13.449,
"step": 1450
},
{
"epoch": 0.8608194054134004,
"grad_norm": 0.1834767460823059,
"learning_rate": 0.0003413846850858119,
"loss": 0.7906,
"step": 1455
},
{
"epoch": 0.8637775477000443,
"grad_norm": 0.18648895621299744,
"learning_rate": 0.00034087671394787716,
"loss": 0.7774,
"step": 1460
},
{
"epoch": 0.8667356899866884,
"grad_norm": 0.17823517322540283,
"learning_rate": 0.0003403673708028654,
"loss": 0.7572,
"step": 1465
},
{
"epoch": 0.8696938322733323,
"grad_norm": 0.17990326881408691,
"learning_rate": 0.0003398566608850657,
"loss": 0.7847,
"step": 1470
},
{
"epoch": 0.8726519745599763,
"grad_norm": 0.178693026304245,
"learning_rate": 0.0003393445894428125,
"loss": 0.7963,
"step": 1475
},
{
"epoch": 0.8756101168466203,
"grad_norm": 0.19564184546470642,
"learning_rate": 0.00033883116173843216,
"loss": 0.7752,
"step": 1480
},
{
"epoch": 0.8785682591332643,
"grad_norm": 0.1759771704673767,
"learning_rate": 0.0003383163830481888,
"loss": 0.7896,
"step": 1485
},
{
"epoch": 0.8815264014199083,
"grad_norm": 0.1835668683052063,
"learning_rate": 0.0003378002586622298,
"loss": 0.7933,
"step": 1490
},
{
"epoch": 0.8844845437065523,
"grad_norm": 0.17599444091320038,
"learning_rate": 0.000337282793884532,
"loss": 0.7527,
"step": 1495
},
{
"epoch": 0.8874426859931963,
"grad_norm": 0.1790158897638321,
"learning_rate": 0.00033676399403284645,
"loss": 0.7843,
"step": 1500
},
{
"epoch": 0.8874426859931963,
"eval_loss": 0.725857675075531,
"eval_runtime": 15.0739,
"eval_samples_per_second": 430.544,
"eval_steps_per_second": 13.467,
"step": 1500
},
{
"epoch": 0.8904008282798402,
"grad_norm": 0.1715373992919922,
"learning_rate": 0.0003362438644386444,
"loss": 0.7578,
"step": 1505
},
{
"epoch": 0.8933589705664843,
"grad_norm": 0.17853514850139618,
"learning_rate": 0.0003357224104470622,
"loss": 0.775,
"step": 1510
},
{
"epoch": 0.8963171128531282,
"grad_norm": 0.18070384860038757,
"learning_rate": 0.00033519963741684625,
"loss": 0.7762,
"step": 1515
},
{
"epoch": 0.8992752551397722,
"grad_norm": 0.1907287836074829,
"learning_rate": 0.0003346755507202985,
"loss": 0.7899,
"step": 1520
},
{
"epoch": 0.9022333974264162,
"grad_norm": 0.18904021382331848,
"learning_rate": 0.00033415015574322053,
"loss": 0.7556,
"step": 1525
},
{
"epoch": 0.9051915397130602,
"grad_norm": 0.1869528889656067,
"learning_rate": 0.0003336234578848587,
"loss": 0.7789,
"step": 1530
},
{
"epoch": 0.9081496819997041,
"grad_norm": 0.17394182085990906,
"learning_rate": 0.0003330954625578482,
"loss": 0.7585,
"step": 1535
},
{
"epoch": 0.9111078242863482,
"grad_norm": 0.1805906444787979,
"learning_rate": 0.0003325661751881582,
"loss": 0.7622,
"step": 1540
},
{
"epoch": 0.9140659665729921,
"grad_norm": 0.16848941147327423,
"learning_rate": 0.00033203560121503533,
"loss": 0.7532,
"step": 1545
},
{
"epoch": 0.9170241088596361,
"grad_norm": 0.17714829742908478,
"learning_rate": 0.00033150374609094795,
"loss": 0.7629,
"step": 1550
},
{
"epoch": 0.9170241088596361,
"eval_loss": 0.7223963141441345,
"eval_runtime": 15.1068,
"eval_samples_per_second": 429.607,
"eval_steps_per_second": 13.438,
"step": 1550
},
{
"epoch": 0.9199822511462802,
"grad_norm": 0.18904811143875122,
"learning_rate": 0.00033097061528153035,
"loss": 0.7632,
"step": 1555
},
{
"epoch": 0.9229403934329241,
"grad_norm": 0.1821032464504242,
"learning_rate": 0.0003304362142655266,
"loss": 0.7677,
"step": 1560
},
{
"epoch": 0.9258985357195681,
"grad_norm": 0.187586709856987,
"learning_rate": 0.0003299005485347338,
"loss": 0.7531,
"step": 1565
},
{
"epoch": 0.9288566780062121,
"grad_norm": 0.1836872398853302,
"learning_rate": 0.0003293636235939463,
"loss": 0.7557,
"step": 1570
},
{
"epoch": 0.9318148202928561,
"grad_norm": 0.19305482506752014,
"learning_rate": 0.0003288254449608985,
"loss": 0.7681,
"step": 1575
},
{
"epoch": 0.9347729625795,
"grad_norm": 0.19273991882801056,
"learning_rate": 0.00032828601816620856,
"loss": 0.7542,
"step": 1580
},
{
"epoch": 0.9377311048661441,
"grad_norm": 0.1794893592596054,
"learning_rate": 0.0003277453487533214,
"loss": 0.7647,
"step": 1585
},
{
"epoch": 0.940689247152788,
"grad_norm": 0.18218044936656952,
"learning_rate": 0.00032720344227845185,
"loss": 0.7698,
"step": 1590
},
{
"epoch": 0.943647389439432,
"grad_norm": 0.18507668375968933,
"learning_rate": 0.00032666030431052724,
"loss": 0.7766,
"step": 1595
},
{
"epoch": 0.946605531726076,
"grad_norm": 0.17978817224502563,
"learning_rate": 0.0003261159404311306,
"loss": 0.7462,
"step": 1600
},
{
"epoch": 0.946605531726076,
"eval_loss": 0.7182918190956116,
"eval_runtime": 15.1761,
"eval_samples_per_second": 427.645,
"eval_steps_per_second": 13.376,
"step": 1600
},
{
"epoch": 0.94956367401272,
"grad_norm": 0.17872752249240875,
"learning_rate": 0.00032557035623444316,
"loss": 0.746,
"step": 1605
},
{
"epoch": 0.952521816299364,
"grad_norm": 0.1889464557170868,
"learning_rate": 0.0003250235573271866,
"loss": 0.7755,
"step": 1610
},
{
"epoch": 0.955479958586008,
"grad_norm": 0.22346559166908264,
"learning_rate": 0.0003244755493285656,
"loss": 0.7584,
"step": 1615
},
{
"epoch": 0.958438100872652,
"grad_norm": 0.18715400993824005,
"learning_rate": 0.0003239263378702103,
"loss": 0.7626,
"step": 1620
},
{
"epoch": 0.961396243159296,
"grad_norm": 0.1809689849615097,
"learning_rate": 0.0003233759285961183,
"loss": 0.7626,
"step": 1625
},
{
"epoch": 0.96435438544594,
"grad_norm": 0.1916404366493225,
"learning_rate": 0.00032282432716259637,
"loss": 0.7633,
"step": 1630
},
{
"epoch": 0.9673125277325839,
"grad_norm": 0.18187373876571655,
"learning_rate": 0.00032227153923820276,
"loss": 0.7777,
"step": 1635
},
{
"epoch": 0.970270670019228,
"grad_norm": 0.18343503773212433,
"learning_rate": 0.00032171757050368857,
"loss": 0.7632,
"step": 1640
},
{
"epoch": 0.9732288123058719,
"grad_norm": 0.1849653571844101,
"learning_rate": 0.0003211624266519398,
"loss": 0.7727,
"step": 1645
},
{
"epoch": 0.9761869545925159,
"grad_norm": 0.1886414885520935,
"learning_rate": 0.00032060611338791833,
"loss": 0.7832,
"step": 1650
},
{
"epoch": 0.9761869545925159,
"eval_loss": 0.7173508405685425,
"eval_runtime": 15.0955,
"eval_samples_per_second": 429.929,
"eval_steps_per_second": 13.448,
"step": 1650
},
{
"epoch": 0.9791450968791598,
"grad_norm": 0.18768879771232605,
"learning_rate": 0.0003200486364286038,
"loss": 0.7744,
"step": 1655
},
{
"epoch": 0.9821032391658039,
"grad_norm": 0.17705638706684113,
"learning_rate": 0.0003194900015029344,
"loss": 0.752,
"step": 1660
},
{
"epoch": 0.9850613814524478,
"grad_norm": 0.1893269568681717,
"learning_rate": 0.0003189302143517484,
"loss": 0.7673,
"step": 1665
},
{
"epoch": 0.9880195237390919,
"grad_norm": 0.18785062432289124,
"learning_rate": 0.0003183692807277248,
"loss": 0.7712,
"step": 1670
},
{
"epoch": 0.9909776660257359,
"grad_norm": 0.1836850345134735,
"learning_rate": 0.0003178072063953245,
"loss": 0.7714,
"step": 1675
},
{
"epoch": 0.9939358083123798,
"grad_norm": 0.19294902682304382,
"learning_rate": 0.00031724399713073116,
"loss": 0.7482,
"step": 1680
},
{
"epoch": 0.9968939505990239,
"grad_norm": 0.17442071437835693,
"learning_rate": 0.00031667965872179103,
"loss": 0.7504,
"step": 1685
},
{
"epoch": 0.9998520928856678,
"grad_norm": 0.18082062900066376,
"learning_rate": 0.0003161141969679545,
"loss": 0.7581,
"step": 1690
},
{
"epoch": 1.0023665138293152,
"grad_norm": 0.1936347484588623,
"learning_rate": 0.0003155476176802161,
"loss": 0.7118,
"step": 1695
},
{
"epoch": 1.0053246561159592,
"grad_norm": 0.18891729414463043,
"learning_rate": 0.00031497992668105465,
"loss": 0.7042,
"step": 1700
},
{
"epoch": 1.0053246561159592,
"eval_loss": 0.7123794555664062,
"eval_runtime": 15.1307,
"eval_samples_per_second": 428.928,
"eval_steps_per_second": 13.416,
"step": 1700
},
{
"epoch": 1.0082827984026033,
"grad_norm": 0.18976199626922607,
"learning_rate": 0.0003144111298043734,
"loss": 0.7102,
"step": 1705
},
{
"epoch": 1.011240940689247,
"grad_norm": 0.18827883899211884,
"learning_rate": 0.00031384123289544027,
"loss": 0.7105,
"step": 1710
},
{
"epoch": 1.0141990829758911,
"grad_norm": 0.18360069394111633,
"learning_rate": 0.0003132702418108279,
"loss": 0.7162,
"step": 1715
},
{
"epoch": 1.0171572252625352,
"grad_norm": 0.1873406022787094,
"learning_rate": 0.00031269816241835305,
"loss": 0.6896,
"step": 1720
},
{
"epoch": 1.0201153675491792,
"grad_norm": 0.1977914422750473,
"learning_rate": 0.00031212500059701664,
"loss": 0.7108,
"step": 1725
},
{
"epoch": 1.023073509835823,
"grad_norm": 0.1831110417842865,
"learning_rate": 0.0003115507622369431,
"loss": 0.7104,
"step": 1730
},
{
"epoch": 1.026031652122467,
"grad_norm": 0.1879042387008667,
"learning_rate": 0.0003109754532393202,
"loss": 0.7231,
"step": 1735
},
{
"epoch": 1.028989794409111,
"grad_norm": 0.1994623839855194,
"learning_rate": 0.00031039907951633795,
"loss": 0.7083,
"step": 1740
},
{
"epoch": 1.0319479366957551,
"grad_norm": 0.19423262774944305,
"learning_rate": 0.0003098216469911281,
"loss": 0.701,
"step": 1745
},
{
"epoch": 1.034906078982399,
"grad_norm": 0.18887244164943695,
"learning_rate": 0.0003092431615977033,
"loss": 0.717,
"step": 1750
},
{
"epoch": 1.034906078982399,
"eval_loss": 0.7047431468963623,
"eval_runtime": 15.2085,
"eval_samples_per_second": 426.735,
"eval_steps_per_second": 13.348,
"step": 1750
},
{
"epoch": 1.037864221269043,
"grad_norm": 0.1960499882698059,
"learning_rate": 0.000308663629280896,
"loss": 0.7184,
"step": 1755
},
{
"epoch": 1.040822363555687,
"grad_norm": 0.18444953858852386,
"learning_rate": 0.0003080830559962974,
"loss": 0.7302,
"step": 1760
},
{
"epoch": 1.043780505842331,
"grad_norm": 0.19335564970970154,
"learning_rate": 0.00030750144771019635,
"loss": 0.7142,
"step": 1765
},
{
"epoch": 1.046738648128975,
"grad_norm": 0.199940025806427,
"learning_rate": 0.0003069188103995177,
"loss": 0.7266,
"step": 1770
},
{
"epoch": 1.049696790415619,
"grad_norm": 0.20113906264305115,
"learning_rate": 0.0003063351500517615,
"loss": 0.7256,
"step": 1775
},
{
"epoch": 1.052654932702263,
"grad_norm": 0.20398704707622528,
"learning_rate": 0.0003057504726649407,
"loss": 0.7133,
"step": 1780
},
{
"epoch": 1.055613074988907,
"grad_norm": 0.18939799070358276,
"learning_rate": 0.00030516478424752014,
"loss": 0.6997,
"step": 1785
},
{
"epoch": 1.058571217275551,
"grad_norm": 0.19960810244083405,
"learning_rate": 0.0003045780908183545,
"loss": 0.7186,
"step": 1790
},
{
"epoch": 1.0615293595621949,
"grad_norm": 0.19348271191120148,
"learning_rate": 0.00030399039840662645,
"loss": 0.7243,
"step": 1795
},
{
"epoch": 1.064487501848839,
"grad_norm": 0.191221222281456,
"learning_rate": 0.0003034017130517849,
"loss": 0.7194,
"step": 1800
},
{
"epoch": 1.064487501848839,
"eval_loss": 0.7023242115974426,
"eval_runtime": 15.108,
"eval_samples_per_second": 429.573,
"eval_steps_per_second": 13.437,
"step": 1800
},
{
"epoch": 1.067445644135483,
"grad_norm": 0.2013147622346878,
"learning_rate": 0.0003028120408034827,
"loss": 0.7141,
"step": 1805
},
{
"epoch": 1.070403786422127,
"grad_norm": 0.19821201264858246,
"learning_rate": 0.00030222138772151443,
"loss": 0.7151,
"step": 1810
},
{
"epoch": 1.073361928708771,
"grad_norm": 0.1936139017343521,
"learning_rate": 0.00030162975987575453,
"loss": 0.716,
"step": 1815
},
{
"epoch": 1.0763200709954148,
"grad_norm": 0.20006032288074493,
"learning_rate": 0.0003010371633460944,
"loss": 0.7168,
"step": 1820
},
{
"epoch": 1.0792782132820589,
"grad_norm": 0.1984449028968811,
"learning_rate": 0.0003004436042223803,
"loss": 0.7084,
"step": 1825
},
{
"epoch": 1.082236355568703,
"grad_norm": 0.18980693817138672,
"learning_rate": 0.0002998490886043505,
"loss": 0.7093,
"step": 1830
},
{
"epoch": 1.085194497855347,
"grad_norm": 0.19591650366783142,
"learning_rate": 0.000299253622601573,
"loss": 0.7083,
"step": 1835
},
{
"epoch": 1.0881526401419908,
"grad_norm": 0.1950058490037918,
"learning_rate": 0.00029865721233338213,
"loss": 0.7129,
"step": 1840
},
{
"epoch": 1.0911107824286348,
"grad_norm": 0.1933528035879135,
"learning_rate": 0.00029805986392881617,
"loss": 0.7183,
"step": 1845
},
{
"epoch": 1.0940689247152788,
"grad_norm": 0.18293854594230652,
"learning_rate": 0.00029746158352655434,
"loss": 0.7124,
"step": 1850
},
{
"epoch": 1.0940689247152788,
"eval_loss": 0.6965740323066711,
"eval_runtime": 15.1266,
"eval_samples_per_second": 429.045,
"eval_steps_per_second": 13.42,
"step": 1850
},
{
"epoch": 1.0970270670019229,
"grad_norm": 0.19370752573013306,
"learning_rate": 0.00029686237727485334,
"loss": 0.7028,
"step": 1855
},
{
"epoch": 1.0999852092885667,
"grad_norm": 0.196714848279953,
"learning_rate": 0.0002962622513314845,
"loss": 0.727,
"step": 1860
},
{
"epoch": 1.1029433515752107,
"grad_norm": 0.1929130107164383,
"learning_rate": 0.0002956612118636705,
"loss": 0.7109,
"step": 1865
},
{
"epoch": 1.1059014938618548,
"grad_norm": 0.2017488032579422,
"learning_rate": 0.00029505926504802175,
"loss": 0.7134,
"step": 1870
},
{
"epoch": 1.1088596361484988,
"grad_norm": 0.18689104914665222,
"learning_rate": 0.00029445641707047317,
"loss": 0.7072,
"step": 1875
},
{
"epoch": 1.1118177784351428,
"grad_norm": 0.21098212897777557,
"learning_rate": 0.0002938526741262204,
"loss": 0.7387,
"step": 1880
},
{
"epoch": 1.1147759207217867,
"grad_norm": 0.1986856907606125,
"learning_rate": 0.00029324804241965635,
"loss": 0.7228,
"step": 1885
},
{
"epoch": 1.1177340630084307,
"grad_norm": 0.194554403424263,
"learning_rate": 0.00029264252816430734,
"loss": 0.7213,
"step": 1890
},
{
"epoch": 1.1206922052950747,
"grad_norm": 0.1904379278421402,
"learning_rate": 0.00029203613758276915,
"loss": 0.6987,
"step": 1895
},
{
"epoch": 1.1236503475817188,
"grad_norm": 0.19453752040863037,
"learning_rate": 0.0002914288769066432,
"loss": 0.7011,
"step": 1900
},
{
"epoch": 1.1236503475817188,
"eval_loss": 0.6925638318061829,
"eval_runtime": 15.1122,
"eval_samples_per_second": 429.455,
"eval_steps_per_second": 13.433,
"step": 1900
},
{
"epoch": 1.1266084898683626,
"grad_norm": 0.1970696598291397,
"learning_rate": 0.00029082075237647266,
"loss": 0.7354,
"step": 1905
},
{
"epoch": 1.1295666321550066,
"grad_norm": 0.19833780825138092,
"learning_rate": 0.00029021177024167775,
"loss": 0.7154,
"step": 1910
},
{
"epoch": 1.1325247744416507,
"grad_norm": 0.1916065663099289,
"learning_rate": 0.00028960193676049226,
"loss": 0.7172,
"step": 1915
},
{
"epoch": 1.1354829167282947,
"grad_norm": 0.1968769133090973,
"learning_rate": 0.00028899125819989874,
"loss": 0.723,
"step": 1920
},
{
"epoch": 1.1384410590149385,
"grad_norm": 0.1897817701101303,
"learning_rate": 0.0002883797408355643,
"loss": 0.7071,
"step": 1925
},
{
"epoch": 1.1413992013015826,
"grad_norm": 0.19858099520206451,
"learning_rate": 0.00028776739095177597,
"loss": 0.7097,
"step": 1930
},
{
"epoch": 1.1443573435882266,
"grad_norm": 0.19754937291145325,
"learning_rate": 0.0002871542148413762,
"loss": 0.7043,
"step": 1935
},
{
"epoch": 1.1473154858748706,
"grad_norm": 0.20758198201656342,
"learning_rate": 0.00028654021880569834,
"loss": 0.7158,
"step": 1940
},
{
"epoch": 1.1502736281615147,
"grad_norm": 0.19415895640850067,
"learning_rate": 0.00028592540915450154,
"loss": 0.7079,
"step": 1945
},
{
"epoch": 1.1532317704481585,
"grad_norm": 0.19232836365699768,
"learning_rate": 0.0002853097922059063,
"loss": 0.7033,
"step": 1950
},
{
"epoch": 1.1532317704481585,
"eval_loss": 0.6914330124855042,
"eval_runtime": 15.1002,
"eval_samples_per_second": 429.796,
"eval_steps_per_second": 13.444,
"step": 1950
},
{
"epoch": 1.1561899127348025,
"grad_norm": 0.19998547434806824,
"learning_rate": 0.0002846933742863292,
"loss": 0.7044,
"step": 1955
},
{
"epoch": 1.1591480550214466,
"grad_norm": 0.19473986327648163,
"learning_rate": 0.0002840761617304181,
"loss": 0.6919,
"step": 1960
},
{
"epoch": 1.1621061973080906,
"grad_norm": 0.19846412539482117,
"learning_rate": 0.00028345816088098697,
"loss": 0.7021,
"step": 1965
},
{
"epoch": 1.1650643395947344,
"grad_norm": 0.19736585021018982,
"learning_rate": 0.0002828393780889508,
"loss": 0.7163,
"step": 1970
},
{
"epoch": 1.1680224818813785,
"grad_norm": 0.19072949886322021,
"learning_rate": 0.00028221981971326005,
"loss": 0.7155,
"step": 1975
},
{
"epoch": 1.1709806241680225,
"grad_norm": 0.20604072511196136,
"learning_rate": 0.0002815994921208358,
"loss": 0.7092,
"step": 1980
},
{
"epoch": 1.1739387664546665,
"grad_norm": 0.19358354806900024,
"learning_rate": 0.0002809784016865036,
"loss": 0.7027,
"step": 1985
},
{
"epoch": 1.1768969087413104,
"grad_norm": 0.1930014044046402,
"learning_rate": 0.00028035655479292877,
"loss": 0.7109,
"step": 1990
},
{
"epoch": 1.1798550510279544,
"grad_norm": 0.1993882954120636,
"learning_rate": 0.0002797339578305503,
"loss": 0.7035,
"step": 1995
},
{
"epoch": 1.1828131933145984,
"grad_norm": 0.20162735879421234,
"learning_rate": 0.00027911061719751516,
"loss": 0.7044,
"step": 2000
},
{
"epoch": 1.1828131933145984,
"eval_loss": 0.6860821843147278,
"eval_runtime": 15.1933,
"eval_samples_per_second": 427.162,
"eval_steps_per_second": 13.361,
"step": 2000
},
{
"epoch": 1.1857713356012425,
"grad_norm": 0.19949831068515778,
"learning_rate": 0.00027848653929961293,
"loss": 0.7081,
"step": 2005
},
{
"epoch": 1.1887294778878865,
"grad_norm": 0.1930517703294754,
"learning_rate": 0.0002778617305502096,
"loss": 0.7038,
"step": 2010
},
{
"epoch": 1.1916876201745303,
"grad_norm": 0.2007024586200714,
"learning_rate": 0.0002772361973701816,
"loss": 0.7016,
"step": 2015
},
{
"epoch": 1.1946457624611744,
"grad_norm": 0.2045419067144394,
"learning_rate": 0.00027660994618785044,
"loss": 0.7079,
"step": 2020
},
{
"epoch": 1.1976039047478184,
"grad_norm": 0.2049221694469452,
"learning_rate": 0.0002759829834389157,
"loss": 0.7031,
"step": 2025
},
{
"epoch": 1.2005620470344625,
"grad_norm": 0.19344255328178406,
"learning_rate": 0.00027535531556638994,
"loss": 0.7207,
"step": 2030
},
{
"epoch": 1.2035201893211063,
"grad_norm": 0.19495131075382233,
"learning_rate": 0.0002747269490205315,
"loss": 0.7128,
"step": 2035
},
{
"epoch": 1.2064783316077503,
"grad_norm": 0.1935647428035736,
"learning_rate": 0.00027409789025877897,
"loss": 0.701,
"step": 2040
},
{
"epoch": 1.2094364738943943,
"grad_norm": 0.2149600386619568,
"learning_rate": 0.0002734681457456843,
"loss": 0.7072,
"step": 2045
},
{
"epoch": 1.2123946161810384,
"grad_norm": 0.1978955715894699,
"learning_rate": 0.0002728377219528468,
"loss": 0.7042,
"step": 2050
},
{
"epoch": 1.2123946161810384,
"eval_loss": 0.6831551790237427,
"eval_runtime": 15.2782,
"eval_samples_per_second": 424.787,
"eval_steps_per_second": 13.287,
"step": 2050
},
{
"epoch": 1.2153527584676822,
"grad_norm": 0.1999792903661728,
"learning_rate": 0.00027220662535884635,
"loss": 0.7239,
"step": 2055
},
{
"epoch": 1.2183109007543262,
"grad_norm": 0.19362211227416992,
"learning_rate": 0.00027157486244917687,
"loss": 0.7149,
"step": 2060
},
{
"epoch": 1.2212690430409703,
"grad_norm": 0.200283482670784,
"learning_rate": 0.0002709424397161798,
"loss": 0.7277,
"step": 2065
},
{
"epoch": 1.2242271853276143,
"grad_norm": 0.2076682150363922,
"learning_rate": 0.00027030936365897705,
"loss": 0.714,
"step": 2070
},
{
"epoch": 1.2271853276142584,
"grad_norm": 0.20363134145736694,
"learning_rate": 0.00026967564078340483,
"loss": 0.7328,
"step": 2075
},
{
"epoch": 1.2301434699009022,
"grad_norm": 0.19987626373767853,
"learning_rate": 0.000269041277601946,
"loss": 0.7092,
"step": 2080
},
{
"epoch": 1.2331016121875462,
"grad_norm": 0.20434942841529846,
"learning_rate": 0.0002684062806336639,
"loss": 0.7149,
"step": 2085
},
{
"epoch": 1.2360597544741903,
"grad_norm": 0.21225950121879578,
"learning_rate": 0.0002677706564041348,
"loss": 0.6893,
"step": 2090
},
{
"epoch": 1.2390178967608343,
"grad_norm": 0.20148411393165588,
"learning_rate": 0.00026713441144538106,
"loss": 0.703,
"step": 2095
},
{
"epoch": 1.241976039047478,
"grad_norm": 0.20638014376163483,
"learning_rate": 0.0002664975522958041,
"loss": 0.6961,
"step": 2100
},
{
"epoch": 1.241976039047478,
"eval_loss": 0.6792827248573303,
"eval_runtime": 15.2265,
"eval_samples_per_second": 426.23,
"eval_steps_per_second": 13.332,
"step": 2100
},
{
"epoch": 1.2449341813341221,
"grad_norm": 0.20307990908622742,
"learning_rate": 0.000265860085500117,
"loss": 0.7301,
"step": 2105
},
{
"epoch": 1.2478923236207662,
"grad_norm": 0.19271095097064972,
"learning_rate": 0.0002652220176092775,
"loss": 0.7224,
"step": 2110
},
{
"epoch": 1.2508504659074102,
"grad_norm": 0.19397003948688507,
"learning_rate": 0.0002645833551804202,
"loss": 0.7044,
"step": 2115
},
{
"epoch": 1.253808608194054,
"grad_norm": 0.1992734968662262,
"learning_rate": 0.0002639441047767899,
"loss": 0.7267,
"step": 2120
},
{
"epoch": 1.256766750480698,
"grad_norm": 0.19389532506465912,
"learning_rate": 0.0002633042729676735,
"loss": 0.7022,
"step": 2125
},
{
"epoch": 1.2597248927673421,
"grad_norm": 0.21384213864803314,
"learning_rate": 0.00026266386632833275,
"loss": 0.689,
"step": 2130
},
{
"epoch": 1.2626830350539862,
"grad_norm": 0.20540039241313934,
"learning_rate": 0.0002620228914399368,
"loss": 0.6929,
"step": 2135
},
{
"epoch": 1.2656411773406302,
"grad_norm": 0.19188985228538513,
"learning_rate": 0.0002613813548894943,
"loss": 0.7023,
"step": 2140
},
{
"epoch": 1.268599319627274,
"grad_norm": 0.21049626171588898,
"learning_rate": 0.00026073926326978587,
"loss": 0.6864,
"step": 2145
},
{
"epoch": 1.271557461913918,
"grad_norm": 0.20232687890529633,
"learning_rate": 0.0002600966231792964,
"loss": 0.7041,
"step": 2150
},
{
"epoch": 1.271557461913918,
"eval_loss": 0.6756435632705688,
"eval_runtime": 15.2077,
"eval_samples_per_second": 426.757,
"eval_steps_per_second": 13.348,
"step": 2150
},
{
"epoch": 1.274515604200562,
"grad_norm": 0.20849832892417908,
"learning_rate": 0.0002594534412221472,
"loss": 0.7097,
"step": 2155
},
{
"epoch": 1.277473746487206,
"grad_norm": 0.2087436467409134,
"learning_rate": 0.0002588097240080279,
"loss": 0.708,
"step": 2160
},
{
"epoch": 1.28043188877385,
"grad_norm": 0.2046281397342682,
"learning_rate": 0.00025816547815212887,
"loss": 0.6973,
"step": 2165
},
{
"epoch": 1.283390031060494,
"grad_norm": 0.19384223222732544,
"learning_rate": 0.00025752071027507315,
"loss": 0.7067,
"step": 2170
},
{
"epoch": 1.286348173347138,
"grad_norm": 0.20464631915092468,
"learning_rate": 0.00025687542700284817,
"loss": 0.7178,
"step": 2175
},
{
"epoch": 1.289306315633782,
"grad_norm": 0.20493179559707642,
"learning_rate": 0.000256229634966738,
"loss": 0.6897,
"step": 2180
},
{
"epoch": 1.2922644579204259,
"grad_norm": 0.20845326781272888,
"learning_rate": 0.000255583340803255,
"loss": 0.717,
"step": 2185
},
{
"epoch": 1.29522260020707,
"grad_norm": 0.20893819630146027,
"learning_rate": 0.00025493655115407164,
"loss": 0.7003,
"step": 2190
},
{
"epoch": 1.298180742493714,
"grad_norm": 0.2201833724975586,
"learning_rate": 0.0002542892726659523,
"loss": 0.7106,
"step": 2195
},
{
"epoch": 1.301138884780358,
"grad_norm": 0.19646216928958893,
"learning_rate": 0.000253641511990685,
"loss": 0.688,
"step": 2200
},
{
"epoch": 1.301138884780358,
"eval_loss": 0.6724188923835754,
"eval_runtime": 15.2418,
"eval_samples_per_second": 425.804,
"eval_steps_per_second": 13.319,
"step": 2200
},
{
"epoch": 1.304097027067002,
"grad_norm": 0.2002546191215515,
"learning_rate": 0.00025299327578501274,
"loss": 0.6972,
"step": 2205
},
{
"epoch": 1.3070551693536459,
"grad_norm": 0.1986691802740097,
"learning_rate": 0.0002523445707105656,
"loss": 0.726,
"step": 2210
},
{
"epoch": 1.31001331164029,
"grad_norm": 0.20118148624897003,
"learning_rate": 0.00025169540343379193,
"loss": 0.696,
"step": 2215
},
{
"epoch": 1.312971453926934,
"grad_norm": 0.20996816456317902,
"learning_rate": 0.0002510457806258898,
"loss": 0.7079,
"step": 2220
},
{
"epoch": 1.3159295962135777,
"grad_norm": 0.20342901349067688,
"learning_rate": 0.0002503957089627388,
"loss": 0.708,
"step": 2225
},
{
"epoch": 1.3188877385002218,
"grad_norm": 0.20094621181488037,
"learning_rate": 0.000249745195124831,
"loss": 0.7225,
"step": 2230
},
{
"epoch": 1.3218458807868658,
"grad_norm": 0.20358270406723022,
"learning_rate": 0.0002490942457972025,
"loss": 0.7048,
"step": 2235
},
{
"epoch": 1.3248040230735099,
"grad_norm": 0.19839729368686676,
"learning_rate": 0.00024844286766936504,
"loss": 0.6953,
"step": 2240
},
{
"epoch": 1.327762165360154,
"grad_norm": 0.19929233193397522,
"learning_rate": 0.00024779106743523646,
"loss": 0.7127,
"step": 2245
},
{
"epoch": 1.3307203076467977,
"grad_norm": 0.19684284925460815,
"learning_rate": 0.0002471388517930727,
"loss": 0.7118,
"step": 2250
},
{
"epoch": 1.3307203076467977,
"eval_loss": 0.6681681871414185,
"eval_runtime": 15.1564,
"eval_samples_per_second": 428.202,
"eval_steps_per_second": 13.394,
"step": 2250
},
{
"epoch": 1.3336784499334418,
"grad_norm": 0.20598828792572021,
"learning_rate": 0.00024648622744539864,
"loss": 0.7046,
"step": 2255
},
{
"epoch": 1.3366365922200858,
"grad_norm": 0.20147842168807983,
"learning_rate": 0.0002458332010989393,
"loss": 0.7151,
"step": 2260
},
{
"epoch": 1.3395947345067298,
"grad_norm": 0.20049843192100525,
"learning_rate": 0.00024517977946455057,
"loss": 0.7203,
"step": 2265
},
{
"epoch": 1.3425528767933739,
"grad_norm": 0.19043414294719696,
"learning_rate": 0.00024452596925715093,
"loss": 0.7122,
"step": 2270
},
{
"epoch": 1.3455110190800177,
"grad_norm": 0.20228472352027893,
"learning_rate": 0.00024387177719565164,
"loss": 0.7079,
"step": 2275
},
{
"epoch": 1.3484691613666617,
"grad_norm": 0.2019595056772232,
"learning_rate": 0.00024321721000288845,
"loss": 0.6854,
"step": 2280
},
{
"epoch": 1.3514273036533058,
"grad_norm": 0.19533614814281464,
"learning_rate": 0.000242562274405552,
"loss": 0.7039,
"step": 2285
},
{
"epoch": 1.3543854459399496,
"grad_norm": 0.20636282861232758,
"learning_rate": 0.00024190697713411885,
"loss": 0.6876,
"step": 2290
},
{
"epoch": 1.3573435882265936,
"grad_norm": 0.1991133838891983,
"learning_rate": 0.00024125132492278244,
"loss": 0.6944,
"step": 2295
},
{
"epoch": 1.3603017305132377,
"grad_norm": 0.2027919590473175,
"learning_rate": 0.00024059532450938358,
"loss": 0.7037,
"step": 2300
},
{
"epoch": 1.3603017305132377,
"eval_loss": 0.664495587348938,
"eval_runtime": 15.2359,
"eval_samples_per_second": 425.967,
"eval_steps_per_second": 13.324,
"step": 2300
},
{
"epoch": 1.3632598727998817,
"grad_norm": 0.2081235647201538,
"learning_rate": 0.0002399389826353415,
"loss": 0.7097,
"step": 2305
},
{
"epoch": 1.3662180150865257,
"grad_norm": 0.20397891104221344,
"learning_rate": 0.0002392823060455845,
"loss": 0.6983,
"step": 2310
},
{
"epoch": 1.3691761573731696,
"grad_norm": 0.19654420018196106,
"learning_rate": 0.00023862530148848052,
"loss": 0.7147,
"step": 2315
},
{
"epoch": 1.3721342996598136,
"grad_norm": 0.21511806547641754,
"learning_rate": 0.000237967975715768,
"loss": 0.7028,
"step": 2320
},
{
"epoch": 1.3750924419464576,
"grad_norm": 0.19977550208568573,
"learning_rate": 0.00023731033548248618,
"loss": 0.7037,
"step": 2325
},
{
"epoch": 1.3780505842331017,
"grad_norm": 0.20144343376159668,
"learning_rate": 0.00023665238754690604,
"loss": 0.6902,
"step": 2330
},
{
"epoch": 1.3810087265197457,
"grad_norm": 0.20750263333320618,
"learning_rate": 0.00023599413867046056,
"loss": 0.6967,
"step": 2335
},
{
"epoch": 1.3839668688063895,
"grad_norm": 0.20630352199077606,
"learning_rate": 0.0002353355956176755,
"loss": 0.7054,
"step": 2340
},
{
"epoch": 1.3869250110930336,
"grad_norm": 0.19819048047065735,
"learning_rate": 0.0002346767651560995,
"loss": 0.6915,
"step": 2345
},
{
"epoch": 1.3898831533796776,
"grad_norm": 0.19468337297439575,
"learning_rate": 0.00023401765405623495,
"loss": 0.687,
"step": 2350
},
{
"epoch": 1.3898831533796776,
"eval_loss": 0.6600573658943176,
"eval_runtime": 15.1998,
"eval_samples_per_second": 426.98,
"eval_steps_per_second": 13.355,
"step": 2350
},
{
"epoch": 1.3928412956663214,
"grad_norm": 0.2120439112186432,
"learning_rate": 0.00023335826909146824,
"loss": 0.6855,
"step": 2355
},
{
"epoch": 1.3957994379529655,
"grad_norm": 0.19401684403419495,
"learning_rate": 0.0002326986170380001,
"loss": 0.6898,
"step": 2360
},
{
"epoch": 1.3987575802396095,
"grad_norm": 0.20173484086990356,
"learning_rate": 0.0002320387046747759,
"loss": 0.7154,
"step": 2365
},
{
"epoch": 1.4017157225262535,
"grad_norm": 0.19973625242710114,
"learning_rate": 0.00023137853878341628,
"loss": 0.7032,
"step": 2370
},
{
"epoch": 1.4046738648128976,
"grad_norm": 0.19550538063049316,
"learning_rate": 0.00023071812614814722,
"loss": 0.7068,
"step": 2375
},
{
"epoch": 1.4076320070995414,
"grad_norm": 0.19463753700256348,
"learning_rate": 0.00023005747355573026,
"loss": 0.6961,
"step": 2380
},
{
"epoch": 1.4105901493861854,
"grad_norm": 0.19839531183242798,
"learning_rate": 0.00022939658779539304,
"loss": 0.6912,
"step": 2385
},
{
"epoch": 1.4135482916728295,
"grad_norm": 0.20887915790081024,
"learning_rate": 0.00022873547565875927,
"loss": 0.7069,
"step": 2390
},
{
"epoch": 1.4165064339594735,
"grad_norm": 0.20123013854026794,
"learning_rate": 0.00022807414393977905,
"loss": 0.6805,
"step": 2395
},
{
"epoch": 1.4194645762461175,
"grad_norm": 0.2022971361875534,
"learning_rate": 0.00022741259943465894,
"loss": 0.6999,
"step": 2400
},
{
"epoch": 1.4194645762461175,
"eval_loss": 0.6564731001853943,
"eval_runtime": 15.2607,
"eval_samples_per_second": 425.276,
"eval_steps_per_second": 13.302,
"step": 2400
},
{
"epoch": 1.4224227185327614,
"grad_norm": 0.2049475908279419,
"learning_rate": 0.00022675084894179244,
"loss": 0.6881,
"step": 2405
},
{
"epoch": 1.4253808608194054,
"grad_norm": 0.20038989186286926,
"learning_rate": 0.00022608889926168958,
"loss": 0.709,
"step": 2410
},
{
"epoch": 1.4283390031060494,
"grad_norm": 0.20032629370689392,
"learning_rate": 0.00022542675719690753,
"loss": 0.7014,
"step": 2415
},
{
"epoch": 1.4312971453926933,
"grad_norm": 0.19591467082500458,
"learning_rate": 0.00022476442955198057,
"loss": 0.6995,
"step": 2420
},
{
"epoch": 1.4342552876793373,
"grad_norm": 0.2085343301296234,
"learning_rate": 0.0002241019231333499,
"loss": 0.6962,
"step": 2425
},
{
"epoch": 1.4372134299659813,
"grad_norm": 0.20096167922019958,
"learning_rate": 0.00022343924474929415,
"loss": 0.6961,
"step": 2430
},
{
"epoch": 1.4401715722526254,
"grad_norm": 0.20498406887054443,
"learning_rate": 0.0002227764012098589,
"loss": 0.6975,
"step": 2435
},
{
"epoch": 1.4431297145392694,
"grad_norm": 0.20352709293365479,
"learning_rate": 0.00022211339932678715,
"loss": 0.7016,
"step": 2440
},
{
"epoch": 1.4460878568259132,
"grad_norm": 0.200238898396492,
"learning_rate": 0.00022145024591344904,
"loss": 0.6754,
"step": 2445
},
{
"epoch": 1.4490459991125573,
"grad_norm": 0.20030836760997772,
"learning_rate": 0.0002207869477847719,
"loss": 0.6945,
"step": 2450
},
{
"epoch": 1.4490459991125573,
"eval_loss": 0.6526739001274109,
"eval_runtime": 15.2028,
"eval_samples_per_second": 426.896,
"eval_steps_per_second": 13.353,
"step": 2450
},
{
"epoch": 1.4520041413992013,
"grad_norm": 0.20762254297733307,
"learning_rate": 0.00022012351175717035,
"loss": 0.6892,
"step": 2455
},
{
"epoch": 1.4549622836858453,
"grad_norm": 0.19851386547088623,
"learning_rate": 0.000219459944648476,
"loss": 0.6946,
"step": 2460
},
{
"epoch": 1.4579204259724894,
"grad_norm": 0.20084446668624878,
"learning_rate": 0.0002187962532778676,
"loss": 0.6855,
"step": 2465
},
{
"epoch": 1.4608785682591332,
"grad_norm": 0.19501332938671112,
"learning_rate": 0.0002181324444658008,
"loss": 0.7057,
"step": 2470
},
{
"epoch": 1.4638367105457772,
"grad_norm": 0.19630266726016998,
"learning_rate": 0.0002174685250339383,
"loss": 0.6885,
"step": 2475
},
{
"epoch": 1.4667948528324213,
"grad_norm": 0.19841867685317993,
"learning_rate": 0.0002168045018050794,
"loss": 0.6906,
"step": 2480
},
{
"epoch": 1.469752995119065,
"grad_norm": 0.2073049694299698,
"learning_rate": 0.0002161403816030902,
"loss": 0.686,
"step": 2485
},
{
"epoch": 1.4727111374057091,
"grad_norm": 0.20695935189723969,
"learning_rate": 0.00021547617125283332,
"loss": 0.6919,
"step": 2490
},
{
"epoch": 1.4756692796923532,
"grad_norm": 0.21272872388362885,
"learning_rate": 0.00021481187758009784,
"loss": 0.6954,
"step": 2495
},
{
"epoch": 1.4786274219789972,
"grad_norm": 0.20142289996147156,
"learning_rate": 0.00021414750741152895,
"loss": 0.6728,
"step": 2500
},
{
"epoch": 1.4786274219789972,
"eval_loss": 0.650132417678833,
"eval_runtime": 15.1925,
"eval_samples_per_second": 427.184,
"eval_steps_per_second": 13.362,
"step": 2500
},
{
"epoch": 1.4815855642656413,
"grad_norm": 0.19570617377758026,
"learning_rate": 0.0002134830675745581,
"loss": 0.6891,
"step": 2505
},
{
"epoch": 1.484543706552285,
"grad_norm": 0.22009368240833282,
"learning_rate": 0.00021281856489733261,
"loss": 0.6923,
"step": 2510
},
{
"epoch": 1.487501848838929,
"grad_norm": 0.20262496173381805,
"learning_rate": 0.00021215400620864575,
"loss": 0.6902,
"step": 2515
},
{
"epoch": 1.4904599911255731,
"grad_norm": 0.20191776752471924,
"learning_rate": 0.00021148939833786617,
"loss": 0.6916,
"step": 2520
},
{
"epoch": 1.4934181334122172,
"grad_norm": 0.20352528989315033,
"learning_rate": 0.00021082474811486804,
"loss": 0.6995,
"step": 2525
},
{
"epoch": 1.4963762756988612,
"grad_norm": 0.19871221482753754,
"learning_rate": 0.00021016006236996074,
"loss": 0.706,
"step": 2530
},
{
"epoch": 1.499334417985505,
"grad_norm": 0.2028975784778595,
"learning_rate": 0.00020949534793381877,
"loss": 0.6801,
"step": 2535
},
{
"epoch": 1.502292560272149,
"grad_norm": 0.20480629801750183,
"learning_rate": 0.00020883061163741142,
"loss": 0.7002,
"step": 2540
},
{
"epoch": 1.5052507025587931,
"grad_norm": 0.20334535837173462,
"learning_rate": 0.00020816586031193254,
"loss": 0.6992,
"step": 2545
},
{
"epoch": 1.508208844845437,
"grad_norm": 0.1922510415315628,
"learning_rate": 0.00020750110078873057,
"loss": 0.69,
"step": 2550
},
{
"epoch": 1.508208844845437,
"eval_loss": 0.6458378434181213,
"eval_runtime": 15.156,
"eval_samples_per_second": 428.213,
"eval_steps_per_second": 13.394,
"step": 2550
},
{
"epoch": 1.5111669871320812,
"grad_norm": 0.18962906301021576,
"learning_rate": 0.0002068363398992382,
"loss": 0.6879,
"step": 2555
},
{
"epoch": 1.514125129418725,
"grad_norm": 0.19932591915130615,
"learning_rate": 0.000206171584474902,
"loss": 0.6799,
"step": 2560
},
{
"epoch": 1.517083271705369,
"grad_norm": 0.20362286269664764,
"learning_rate": 0.00020550684134711252,
"loss": 0.689,
"step": 2565
},
{
"epoch": 1.520041413992013,
"grad_norm": 0.20676745474338531,
"learning_rate": 0.00020484211734713388,
"loss": 0.7185,
"step": 2570
},
{
"epoch": 1.522999556278657,
"grad_norm": 0.21269002556800842,
"learning_rate": 0.00020417741930603376,
"loss": 0.6852,
"step": 2575
},
{
"epoch": 1.525957698565301,
"grad_norm": 0.2073538452386856,
"learning_rate": 0.00020351275405461282,
"loss": 0.7067,
"step": 2580
},
{
"epoch": 1.528915840851945,
"grad_norm": 0.1996561586856842,
"learning_rate": 0.00020284812842333495,
"loss": 0.6647,
"step": 2585
},
{
"epoch": 1.5318739831385888,
"grad_norm": 0.20402218401432037,
"learning_rate": 0.00020218354924225683,
"loss": 0.6934,
"step": 2590
},
{
"epoch": 1.534832125425233,
"grad_norm": 0.21136872470378876,
"learning_rate": 0.00020151902334095785,
"loss": 0.6849,
"step": 2595
},
{
"epoch": 1.5377902677118769,
"grad_norm": 0.20921598374843597,
"learning_rate": 0.00020085455754846975,
"loss": 0.6916,
"step": 2600
},
{
"epoch": 1.5377902677118769,
"eval_loss": 0.6419690847396851,
"eval_runtime": 15.1809,
"eval_samples_per_second": 427.511,
"eval_steps_per_second": 13.372,
"step": 2600
},
{
"epoch": 1.540748409998521,
"grad_norm": 0.1944100558757782,
"learning_rate": 0.00020019015869320663,
"loss": 0.7003,
"step": 2605
},
{
"epoch": 1.543706552285165,
"grad_norm": 0.19190169870853424,
"learning_rate": 0.00019952583360289473,
"loss": 0.7068,
"step": 2610
},
{
"epoch": 1.5466646945718088,
"grad_norm": 0.20087464153766632,
"learning_rate": 0.00019886158910450218,
"loss": 0.667,
"step": 2615
},
{
"epoch": 1.549622836858453,
"grad_norm": 0.19939038157463074,
"learning_rate": 0.00019819743202416904,
"loss": 0.6686,
"step": 2620
},
{
"epoch": 1.5525809791450969,
"grad_norm": 0.201206237077713,
"learning_rate": 0.00019753336918713668,
"loss": 0.6882,
"step": 2625
},
{
"epoch": 1.555539121431741,
"grad_norm": 0.21987488865852356,
"learning_rate": 0.00019686940741767839,
"loss": 0.6855,
"step": 2630
},
{
"epoch": 1.558497263718385,
"grad_norm": 0.20101770758628845,
"learning_rate": 0.00019620555353902855,
"loss": 0.69,
"step": 2635
},
{
"epoch": 1.5614554060050287,
"grad_norm": 0.20172643661499023,
"learning_rate": 0.00019554181437331296,
"loss": 0.6666,
"step": 2640
},
{
"epoch": 1.5644135482916728,
"grad_norm": 0.19294790923595428,
"learning_rate": 0.00019487819674147844,
"loss": 0.6694,
"step": 2645
},
{
"epoch": 1.5673716905783168,
"grad_norm": 0.20478790998458862,
"learning_rate": 0.00019421470746322294,
"loss": 0.6969,
"step": 2650
},
{
"epoch": 1.5673716905783168,
"eval_loss": 0.6383815407752991,
"eval_runtime": 15.3432,
"eval_samples_per_second": 422.988,
"eval_steps_per_second": 13.231,
"step": 2650
},
{
"epoch": 1.5703298328649606,
"grad_norm": 0.20216462016105652,
"learning_rate": 0.00019355135335692538,
"loss": 0.6687,
"step": 2655
},
{
"epoch": 1.573287975151605,
"grad_norm": 0.2018350064754486,
"learning_rate": 0.00019288814123957554,
"loss": 0.6973,
"step": 2660
},
{
"epoch": 1.5762461174382487,
"grad_norm": 0.20364220440387726,
"learning_rate": 0.00019222507792670412,
"loss": 0.6894,
"step": 2665
},
{
"epoch": 1.5792042597248928,
"grad_norm": 0.19478543102741241,
"learning_rate": 0.00019156217023231245,
"loss": 0.6845,
"step": 2670
},
{
"epoch": 1.5821624020115368,
"grad_norm": 0.19766514003276825,
"learning_rate": 0.00019089942496880276,
"loss": 0.6791,
"step": 2675
},
{
"epoch": 1.5851205442981806,
"grad_norm": 0.20217527449131012,
"learning_rate": 0.00019023684894690812,
"loss": 0.6914,
"step": 2680
},
{
"epoch": 1.5880786865848249,
"grad_norm": 0.19573134183883667,
"learning_rate": 0.00018957444897562225,
"loss": 0.6899,
"step": 2685
},
{
"epoch": 1.5910368288714687,
"grad_norm": 0.19171777367591858,
"learning_rate": 0.00018891223186212974,
"loss": 0.6763,
"step": 2690
},
{
"epoch": 1.5939949711581127,
"grad_norm": 0.20240968465805054,
"learning_rate": 0.00018825020441173607,
"loss": 0.6881,
"step": 2695
},
{
"epoch": 1.5969531134447568,
"grad_norm": 0.1968025267124176,
"learning_rate": 0.0001875883734277976,
"loss": 0.6817,
"step": 2700
},
{
"epoch": 1.5969531134447568,
"eval_loss": 0.6345797181129456,
"eval_runtime": 15.1784,
"eval_samples_per_second": 427.581,
"eval_steps_per_second": 13.374,
"step": 2700
},
{
"epoch": 1.5999112557314006,
"grad_norm": 0.2064265012741089,
"learning_rate": 0.00018692674571165157,
"loss": 0.6783,
"step": 2705
},
{
"epoch": 1.6028693980180446,
"grad_norm": 0.20387953519821167,
"learning_rate": 0.00018626532806254666,
"loss": 0.6935,
"step": 2710
},
{
"epoch": 1.6058275403046887,
"grad_norm": 0.20735909044742584,
"learning_rate": 0.00018560412727757235,
"loss": 0.6804,
"step": 2715
},
{
"epoch": 1.6087856825913325,
"grad_norm": 0.21424676477909088,
"learning_rate": 0.0001849431501515898,
"loss": 0.6906,
"step": 2720
},
{
"epoch": 1.6117438248779767,
"grad_norm": 0.195107102394104,
"learning_rate": 0.00018428240347716172,
"loss": 0.6796,
"step": 2725
},
{
"epoch": 1.6147019671646206,
"grad_norm": 0.20345866680145264,
"learning_rate": 0.00018362189404448243,
"loss": 0.6991,
"step": 2730
},
{
"epoch": 1.6176601094512646,
"grad_norm": 0.19848254323005676,
"learning_rate": 0.00018296162864130837,
"loss": 0.6921,
"step": 2735
},
{
"epoch": 1.6206182517379086,
"grad_norm": 0.20381172001361847,
"learning_rate": 0.00018230161405288807,
"loss": 0.6599,
"step": 2740
},
{
"epoch": 1.6235763940245524,
"grad_norm": 0.19607172906398773,
"learning_rate": 0.00018164185706189267,
"loss": 0.6746,
"step": 2745
},
{
"epoch": 1.6265345363111967,
"grad_norm": 0.21484482288360596,
"learning_rate": 0.000180982364448346,
"loss": 0.67,
"step": 2750
},
{
"epoch": 1.6265345363111967,
"eval_loss": 0.6297933459281921,
"eval_runtime": 15.2129,
"eval_samples_per_second": 426.612,
"eval_steps_per_second": 13.344,
"step": 2750
},
{
"epoch": 1.6294926785978405,
"grad_norm": 0.19467367231845856,
"learning_rate": 0.00018032314298955507,
"loss": 0.6884,
"step": 2755
},
{
"epoch": 1.6324508208844846,
"grad_norm": 0.21421058475971222,
"learning_rate": 0.00017966419946004034,
"loss": 0.6708,
"step": 2760
},
{
"epoch": 1.6354089631711286,
"grad_norm": 0.2013017237186432,
"learning_rate": 0.00017900554063146607,
"loss": 0.6792,
"step": 2765
},
{
"epoch": 1.6383671054577724,
"grad_norm": 0.20477554202079773,
"learning_rate": 0.0001783471732725708,
"loss": 0.6787,
"step": 2770
},
{
"epoch": 1.6413252477444165,
"grad_norm": 0.20022639632225037,
"learning_rate": 0.00017768910414909782,
"loss": 0.6918,
"step": 2775
},
{
"epoch": 1.6442833900310605,
"grad_norm": 0.2140152007341385,
"learning_rate": 0.00017703134002372553,
"loss": 0.6768,
"step": 2780
},
{
"epoch": 1.6472415323177043,
"grad_norm": 0.20383507013320923,
"learning_rate": 0.00017637388765599804,
"loss": 0.6848,
"step": 2785
},
{
"epoch": 1.6501996746043486,
"grad_norm": 0.19069762527942657,
"learning_rate": 0.0001757167538022556,
"loss": 0.6739,
"step": 2790
},
{
"epoch": 1.6531578168909924,
"grad_norm": 0.2004157304763794,
"learning_rate": 0.00017505994521556538,
"loss": 0.7016,
"step": 2795
},
{
"epoch": 1.6561159591776364,
"grad_norm": 0.20051540434360504,
"learning_rate": 0.00017440346864565178,
"loss": 0.6731,
"step": 2800
},
{
"epoch": 1.6561159591776364,
"eval_loss": 0.626323401927948,
"eval_runtime": 15.1818,
"eval_samples_per_second": 427.485,
"eval_steps_per_second": 13.371,
"step": 2800
},
{
"epoch": 1.6590741014642805,
"grad_norm": 0.2020529806613922,
"learning_rate": 0.00017374733083882736,
"loss": 0.6824,
"step": 2805
},
{
"epoch": 1.6620322437509243,
"grad_norm": 0.19849729537963867,
"learning_rate": 0.00017309153853792305,
"loss": 0.6818,
"step": 2810
},
{
"epoch": 1.6649903860375685,
"grad_norm": 0.19597412645816803,
"learning_rate": 0.0001724360984822196,
"loss": 0.6711,
"step": 2815
},
{
"epoch": 1.6679485283242124,
"grad_norm": 0.21375198662281036,
"learning_rate": 0.00017178101740737757,
"loss": 0.6683,
"step": 2820
},
{
"epoch": 1.6709066706108564,
"grad_norm": 0.20787851512432098,
"learning_rate": 0.00017112630204536866,
"loss": 0.6776,
"step": 2825
},
{
"epoch": 1.6738648128975004,
"grad_norm": 0.20456843078136444,
"learning_rate": 0.00017047195912440612,
"loss": 0.6639,
"step": 2830
},
{
"epoch": 1.6768229551841443,
"grad_norm": 0.20255456864833832,
"learning_rate": 0.0001698179953688759,
"loss": 0.6766,
"step": 2835
},
{
"epoch": 1.6797810974707883,
"grad_norm": 0.2101144939661026,
"learning_rate": 0.00016916441749926738,
"loss": 0.6827,
"step": 2840
},
{
"epoch": 1.6827392397574323,
"grad_norm": 0.20345039665699005,
"learning_rate": 0.00016851123223210452,
"loss": 0.6615,
"step": 2845
},
{
"epoch": 1.6856973820440762,
"grad_norm": 0.20403869450092316,
"learning_rate": 0.00016785844627987656,
"loss": 0.682,
"step": 2850
},
{
"epoch": 1.6856973820440762,
"eval_loss": 0.6230462193489075,
"eval_runtime": 15.1867,
"eval_samples_per_second": 427.347,
"eval_steps_per_second": 13.367,
"step": 2850
},
{
"epoch": 1.6886555243307204,
"grad_norm": 0.20366299152374268,
"learning_rate": 0.00016720606635096897,
"loss": 0.6793,
"step": 2855
},
{
"epoch": 1.6916136666173642,
"grad_norm": 0.21023479104042053,
"learning_rate": 0.00016655409914959505,
"loss": 0.672,
"step": 2860
},
{
"epoch": 1.6945718089040083,
"grad_norm": 0.19778937101364136,
"learning_rate": 0.00016590255137572643,
"loss": 0.6758,
"step": 2865
},
{
"epoch": 1.6975299511906523,
"grad_norm": 0.19528599083423615,
"learning_rate": 0.00016525142972502466,
"loss": 0.6751,
"step": 2870
},
{
"epoch": 1.7004880934772961,
"grad_norm": 0.2070263922214508,
"learning_rate": 0.00016460074088877212,
"loss": 0.6921,
"step": 2875
},
{
"epoch": 1.7034462357639404,
"grad_norm": 0.2026599794626236,
"learning_rate": 0.00016395049155380328,
"loss": 0.6843,
"step": 2880
},
{
"epoch": 1.7064043780505842,
"grad_norm": 0.19992013275623322,
"learning_rate": 0.00016330068840243625,
"loss": 0.6571,
"step": 2885
},
{
"epoch": 1.7093625203372282,
"grad_norm": 0.21355992555618286,
"learning_rate": 0.00016265133811240373,
"loss": 0.6607,
"step": 2890
},
{
"epoch": 1.7123206626238723,
"grad_norm": 0.19565469026565552,
"learning_rate": 0.00016200244735678466,
"loss": 0.6737,
"step": 2895
},
{
"epoch": 1.715278804910516,
"grad_norm": 0.20738384127616882,
"learning_rate": 0.00016135402280393553,
"loss": 0.6762,
"step": 2900
},
{
"epoch": 1.715278804910516,
"eval_loss": 0.6193926334381104,
"eval_runtime": 15.232,
"eval_samples_per_second": 426.075,
"eval_steps_per_second": 13.327,
"step": 2900
},
{
"epoch": 1.7182369471971601,
"grad_norm": 0.20475776493549347,
"learning_rate": 0.0001607060711174218,
"loss": 0.6728,
"step": 2905
},
{
"epoch": 1.7211950894838042,
"grad_norm": 0.197879359126091,
"learning_rate": 0.00016005859895594968,
"loss": 0.6728,
"step": 2910
},
{
"epoch": 1.7241532317704482,
"grad_norm": 0.19947008788585663,
"learning_rate": 0.00015941161297329737,
"loss": 0.6636,
"step": 2915
},
{
"epoch": 1.7271113740570923,
"grad_norm": 0.20044192671775818,
"learning_rate": 0.00015876511981824685,
"loss": 0.6697,
"step": 2920
},
{
"epoch": 1.730069516343736,
"grad_norm": 0.1967218816280365,
"learning_rate": 0.00015811912613451556,
"loss": 0.6734,
"step": 2925
},
{
"epoch": 1.73302765863038,
"grad_norm": 0.19173486530780792,
"learning_rate": 0.00015747363856068812,
"loss": 0.6703,
"step": 2930
},
{
"epoch": 1.7359858009170241,
"grad_norm": 0.20418646931648254,
"learning_rate": 0.0001568286637301481,
"loss": 0.6751,
"step": 2935
},
{
"epoch": 1.738943943203668,
"grad_norm": 0.19849319756031036,
"learning_rate": 0.00015618420827100975,
"loss": 0.6572,
"step": 2940
},
{
"epoch": 1.7419020854903122,
"grad_norm": 0.20207248628139496,
"learning_rate": 0.00015554027880605,
"loss": 0.6763,
"step": 2945
},
{
"epoch": 1.744860227776956,
"grad_norm": 0.1988462209701538,
"learning_rate": 0.00015489688195264038,
"loss": 0.6638,
"step": 2950
},
{
"epoch": 1.744860227776956,
"eval_loss": 0.6163128614425659,
"eval_runtime": 15.1693,
"eval_samples_per_second": 427.836,
"eval_steps_per_second": 13.382,
"step": 2950
},
{
"epoch": 1.7478183700636,
"grad_norm": 0.20099283754825592,
"learning_rate": 0.00015425402432267906,
"loss": 0.6579,
"step": 2955
},
{
"epoch": 1.7507765123502441,
"grad_norm": 0.19520577788352966,
"learning_rate": 0.0001536117125225229,
"loss": 0.6725,
"step": 2960
},
{
"epoch": 1.753734654636888,
"grad_norm": 0.2104516476392746,
"learning_rate": 0.0001529699531529194,
"loss": 0.6619,
"step": 2965
},
{
"epoch": 1.756692796923532,
"grad_norm": 0.20790335536003113,
"learning_rate": 0.000152328752808939,
"loss": 0.6771,
"step": 2970
},
{
"epoch": 1.759650939210176,
"grad_norm": 0.20284044742584229,
"learning_rate": 0.00015168811807990732,
"loss": 0.6688,
"step": 2975
},
{
"epoch": 1.76260908149682,
"grad_norm": 0.21280840039253235,
"learning_rate": 0.00015104805554933744,
"loss": 0.6924,
"step": 2980
},
{
"epoch": 1.765567223783464,
"grad_norm": 0.19089211523532867,
"learning_rate": 0.0001504085717948622,
"loss": 0.6801,
"step": 2985
},
{
"epoch": 1.768525366070108,
"grad_norm": 0.20048676431179047,
"learning_rate": 0.00014976967338816653,
"loss": 0.6843,
"step": 2990
},
{
"epoch": 1.771483508356752,
"grad_norm": 0.2061685174703598,
"learning_rate": 0.00014913136689492004,
"loss": 0.6674,
"step": 2995
},
{
"epoch": 1.774441650643396,
"grad_norm": 0.19444270431995392,
"learning_rate": 0.00014849365887470962,
"loss": 0.6786,
"step": 3000
},
{
"epoch": 1.774441650643396,
"eval_loss": 0.6131055951118469,
"eval_runtime": 15.1868,
"eval_samples_per_second": 427.344,
"eval_steps_per_second": 13.367,
"step": 3000
},
{
"epoch": 1.7773997929300398,
"grad_norm": 0.20577003061771393,
"learning_rate": 0.00014785655588097182,
"loss": 0.6652,
"step": 3005
},
{
"epoch": 1.780357935216684,
"grad_norm": 0.201372891664505,
"learning_rate": 0.00014722006446092568,
"loss": 0.6783,
"step": 3010
},
{
"epoch": 1.7833160775033279,
"grad_norm": 0.19634784758090973,
"learning_rate": 0.0001465841911555053,
"loss": 0.6781,
"step": 3015
},
{
"epoch": 1.786274219789972,
"grad_norm": 0.20469844341278076,
"learning_rate": 0.00014594894249929271,
"loss": 0.6726,
"step": 3020
},
{
"epoch": 1.789232362076616,
"grad_norm": 0.2057006061077118,
"learning_rate": 0.0001453143250204508,
"loss": 0.6631,
"step": 3025
},
{
"epoch": 1.7921905043632598,
"grad_norm": 0.20145894587039948,
"learning_rate": 0.000144680345240656,
"loss": 0.6701,
"step": 3030
},
{
"epoch": 1.7951486466499038,
"grad_norm": 0.19412663578987122,
"learning_rate": 0.00014404700967503143,
"loss": 0.6779,
"step": 3035
},
{
"epoch": 1.7981067889365479,
"grad_norm": 0.2155628502368927,
"learning_rate": 0.00014341432483207993,
"loss": 0.6687,
"step": 3040
},
{
"epoch": 1.801064931223192,
"grad_norm": 0.19732846319675446,
"learning_rate": 0.0001427822972136172,
"loss": 0.6673,
"step": 3045
},
{
"epoch": 1.804023073509836,
"grad_norm": 0.19896374642848969,
"learning_rate": 0.00014215093331470494,
"loss": 0.6712,
"step": 3050
},
{
"epoch": 1.804023073509836,
"eval_loss": 0.6097522974014282,
"eval_runtime": 15.1894,
"eval_samples_per_second": 427.271,
"eval_steps_per_second": 13.365,
"step": 3050
},
{
"epoch": 1.8069812157964797,
"grad_norm": 0.19442537426948547,
"learning_rate": 0.00014152023962358398,
"loss": 0.6645,
"step": 3055
},
{
"epoch": 1.8099393580831238,
"grad_norm": 0.20382952690124512,
"learning_rate": 0.00014089022262160788,
"loss": 0.6701,
"step": 3060
},
{
"epoch": 1.8128975003697678,
"grad_norm": 0.19677461683750153,
"learning_rate": 0.00014026088878317611,
"loss": 0.6733,
"step": 3065
},
{
"epoch": 1.8158556426564116,
"grad_norm": 0.20080624520778656,
"learning_rate": 0.00013963224457566755,
"loss": 0.6716,
"step": 3070
},
{
"epoch": 1.818813784943056,
"grad_norm": 0.1959795206785202,
"learning_rate": 0.00013900429645937417,
"loss": 0.6786,
"step": 3075
},
{
"epoch": 1.8217719272296997,
"grad_norm": 0.20665378868579865,
"learning_rate": 0.00013837705088743426,
"loss": 0.6837,
"step": 3080
},
{
"epoch": 1.8247300695163438,
"grad_norm": 0.20092874765396118,
"learning_rate": 0.0001377505143057667,
"loss": 0.673,
"step": 3085
},
{
"epoch": 1.8276882118029878,
"grad_norm": 0.20177334547042847,
"learning_rate": 0.0001371246931530042,
"loss": 0.6676,
"step": 3090
},
{
"epoch": 1.8306463540896316,
"grad_norm": 0.1979428380727768,
"learning_rate": 0.0001364995938604274,
"loss": 0.672,
"step": 3095
},
{
"epoch": 1.8336044963762756,
"grad_norm": 0.20506584644317627,
"learning_rate": 0.00013587522285189873,
"loss": 0.6657,
"step": 3100
},
{
"epoch": 1.8336044963762756,
"eval_loss": 0.6049384474754333,
"eval_runtime": 15.0954,
"eval_samples_per_second": 429.931,
"eval_steps_per_second": 13.448,
"step": 3100
},
{
"epoch": 1.8365626386629197,
"grad_norm": 0.20680159330368042,
"learning_rate": 0.00013525158654379628,
"loss": 0.6698,
"step": 3105
},
{
"epoch": 1.8395207809495637,
"grad_norm": 0.1984373927116394,
"learning_rate": 0.00013462869134494806,
"loss": 0.6603,
"step": 3110
},
{
"epoch": 1.8424789232362078,
"grad_norm": 0.202684223651886,
"learning_rate": 0.000134006543656566,
"loss": 0.6553,
"step": 3115
},
{
"epoch": 1.8454370655228516,
"grad_norm": 0.21238334476947784,
"learning_rate": 0.0001333851498721802,
"loss": 0.6663,
"step": 3120
},
{
"epoch": 1.8483952078094956,
"grad_norm": 0.20673152804374695,
"learning_rate": 0.0001327645163775732,
"loss": 0.6701,
"step": 3125
},
{
"epoch": 1.8513533500961397,
"grad_norm": 0.2024824321269989,
"learning_rate": 0.00013214464955071438,
"loss": 0.6555,
"step": 3130
},
{
"epoch": 1.8543114923827835,
"grad_norm": 0.2038116157054901,
"learning_rate": 0.00013152555576169446,
"loss": 0.6693,
"step": 3135
},
{
"epoch": 1.8572696346694277,
"grad_norm": 0.19714276492595673,
"learning_rate": 0.00013090724137266007,
"loss": 0.6619,
"step": 3140
},
{
"epoch": 1.8602277769560716,
"grad_norm": 0.20710057020187378,
"learning_rate": 0.00013028971273774817,
"loss": 0.6764,
"step": 3145
},
{
"epoch": 1.8631859192427156,
"grad_norm": 0.19880205392837524,
"learning_rate": 0.00012967297620302095,
"loss": 0.6702,
"step": 3150
},
{
"epoch": 1.8631859192427156,
"eval_loss": 0.6030129194259644,
"eval_runtime": 15.2766,
"eval_samples_per_second": 424.834,
"eval_steps_per_second": 13.288,
"step": 3150
},
{
"epoch": 1.8661440615293596,
"grad_norm": 0.21328890323638916,
"learning_rate": 0.00012905703810640054,
"loss": 0.6627,
"step": 3155
},
{
"epoch": 1.8691022038160034,
"grad_norm": 0.20017056167125702,
"learning_rate": 0.00012844190477760388,
"loss": 0.6653,
"step": 3160
},
{
"epoch": 1.8720603461026475,
"grad_norm": 0.204426571726799,
"learning_rate": 0.00012782758253807765,
"loss": 0.6725,
"step": 3165
},
{
"epoch": 1.8750184883892915,
"grad_norm": 0.19793303310871124,
"learning_rate": 0.00012721407770093334,
"loss": 0.6578,
"step": 3170
},
{
"epoch": 1.8779766306759356,
"grad_norm": 0.20140038430690765,
"learning_rate": 0.00012660139657088242,
"loss": 0.6706,
"step": 3175
},
{
"epoch": 1.8809347729625796,
"grad_norm": 0.20214778184890747,
"learning_rate": 0.0001259895454441714,
"loss": 0.6767,
"step": 3180
},
{
"epoch": 1.8838929152492234,
"grad_norm": 0.21046917140483856,
"learning_rate": 0.0001253785306085173,
"loss": 0.6768,
"step": 3185
},
{
"epoch": 1.8868510575358675,
"grad_norm": 0.20898491144180298,
"learning_rate": 0.00012476835834304294,
"loss": 0.6654,
"step": 3190
},
{
"epoch": 1.8898091998225115,
"grad_norm": 0.20452511310577393,
"learning_rate": 0.0001241590349182124,
"loss": 0.6599,
"step": 3195
},
{
"epoch": 1.8927673421091553,
"grad_norm": 0.21516427397727966,
"learning_rate": 0.00012355056659576664,
"loss": 0.6646,
"step": 3200
},
{
"epoch": 1.8927673421091553,
"eval_loss": 0.59881591796875,
"eval_runtime": 15.1719,
"eval_samples_per_second": 427.765,
"eval_steps_per_second": 13.38,
"step": 3200
},
{
"epoch": 1.8957254843957996,
"grad_norm": 0.19969050586223602,
"learning_rate": 0.00012294295962865908,
"loss": 0.6641,
"step": 3205
},
{
"epoch": 1.8986836266824434,
"grad_norm": 0.2035350650548935,
"learning_rate": 0.0001223362202609915,
"loss": 0.6679,
"step": 3210
},
{
"epoch": 1.9016417689690874,
"grad_norm": 0.20504876971244812,
"learning_rate": 0.00012173035472794956,
"loss": 0.664,
"step": 3215
},
{
"epoch": 1.9045999112557315,
"grad_norm": 0.19677531719207764,
"learning_rate": 0.00012112536925573904,
"loss": 0.6605,
"step": 3220
},
{
"epoch": 1.9075580535423753,
"grad_norm": 0.19949232041835785,
"learning_rate": 0.00012052127006152172,
"loss": 0.6718,
"step": 3225
},
{
"epoch": 1.9105161958290193,
"grad_norm": 0.19459928572177887,
"learning_rate": 0.00011991806335335154,
"loss": 0.6639,
"step": 3230
},
{
"epoch": 1.9134743381156634,
"grad_norm": 0.203868106007576,
"learning_rate": 0.00011931575533011058,
"loss": 0.6664,
"step": 3235
},
{
"epoch": 1.9164324804023074,
"grad_norm": 0.1938386857509613,
"learning_rate": 0.00011871435218144587,
"loss": 0.6619,
"step": 3240
},
{
"epoch": 1.9193906226889514,
"grad_norm": 0.20791150629520416,
"learning_rate": 0.00011811386008770509,
"loss": 0.6547,
"step": 3245
},
{
"epoch": 1.9223487649755953,
"grad_norm": 0.20660457015037537,
"learning_rate": 0.00011751428521987375,
"loss": 0.6793,
"step": 3250
},
{
"epoch": 1.9223487649755953,
"eval_loss": 0.5979748368263245,
"eval_runtime": 15.2507,
"eval_samples_per_second": 425.554,
"eval_steps_per_second": 13.311,
"step": 3250
},
{
"epoch": 1.9253069072622393,
"grad_norm": 0.20144982635974884,
"learning_rate": 0.00011691563373951126,
"loss": 0.6696,
"step": 3255
},
{
"epoch": 1.9282650495488833,
"grad_norm": 0.20913758873939514,
"learning_rate": 0.00011631791179868765,
"loss": 0.6535,
"step": 3260
},
{
"epoch": 1.9312231918355272,
"grad_norm": 0.19052323698997498,
"learning_rate": 0.0001157211255399209,
"loss": 0.6608,
"step": 3265
},
{
"epoch": 1.9341813341221714,
"grad_norm": 0.20822355151176453,
"learning_rate": 0.000115125281096113,
"loss": 0.6643,
"step": 3270
},
{
"epoch": 1.9371394764088152,
"grad_norm": 0.21328042447566986,
"learning_rate": 0.00011453038459048767,
"loss": 0.6634,
"step": 3275
},
{
"epoch": 1.9400976186954593,
"grad_norm": 0.19759127497673035,
"learning_rate": 0.00011393644213652677,
"loss": 0.6496,
"step": 3280
},
{
"epoch": 1.9430557609821033,
"grad_norm": 0.20105671882629395,
"learning_rate": 0.00011334345983790816,
"loss": 0.6537,
"step": 3285
},
{
"epoch": 1.9460139032687471,
"grad_norm": 0.2096051126718521,
"learning_rate": 0.00011275144378844229,
"loss": 0.6494,
"step": 3290
},
{
"epoch": 1.9489720455553912,
"grad_norm": 0.20718033611774445,
"learning_rate": 0.00011216040007201014,
"loss": 0.6488,
"step": 3295
},
{
"epoch": 1.9519301878420352,
"grad_norm": 0.2058909386396408,
"learning_rate": 0.0001115703347625003,
"loss": 0.665,
"step": 3300
},
{
"epoch": 1.9519301878420352,
"eval_loss": 0.5932101011276245,
"eval_runtime": 15.197,
"eval_samples_per_second": 427.057,
"eval_steps_per_second": 13.358,
"step": 3300
},
{
"epoch": 1.9548883301286792,
"grad_norm": 0.2102581113576889,
"learning_rate": 0.00011098125392374676,
"loss": 0.6345,
"step": 3305
},
{
"epoch": 1.9578464724153233,
"grad_norm": 0.20526902377605438,
"learning_rate": 0.00011039316360946673,
"loss": 0.6647,
"step": 3310
},
{
"epoch": 1.960804614701967,
"grad_norm": 0.203394815325737,
"learning_rate": 0.00010980606986319787,
"loss": 0.662,
"step": 3315
},
{
"epoch": 1.9637627569886111,
"grad_norm": 0.21088528633117676,
"learning_rate": 0.00010921997871823699,
"loss": 0.6572,
"step": 3320
},
{
"epoch": 1.9667208992752552,
"grad_norm": 0.20182575285434723,
"learning_rate": 0.00010863489619757724,
"loss": 0.6625,
"step": 3325
},
{
"epoch": 1.969679041561899,
"grad_norm": 0.20655593276023865,
"learning_rate": 0.00010805082831384698,
"loss": 0.6421,
"step": 3330
},
{
"epoch": 1.9726371838485433,
"grad_norm": 0.2001488208770752,
"learning_rate": 0.00010746778106924716,
"loss": 0.6604,
"step": 3335
},
{
"epoch": 1.975595326135187,
"grad_norm": 0.2096022516489029,
"learning_rate": 0.00010688576045549053,
"loss": 0.6479,
"step": 3340
},
{
"epoch": 1.978553468421831,
"grad_norm": 0.20127557218074799,
"learning_rate": 0.0001063047724537393,
"loss": 0.6628,
"step": 3345
},
{
"epoch": 1.9815116107084751,
"grad_norm": 0.20845440030097961,
"learning_rate": 0.00010572482303454416,
"loss": 0.6577,
"step": 3350
},
{
"epoch": 1.9815116107084751,
"eval_loss": 0.5896474719047546,
"eval_runtime": 15.2103,
"eval_samples_per_second": 426.685,
"eval_steps_per_second": 13.346,
"step": 3350
},
{
"epoch": 1.984469752995119,
"grad_norm": 0.20478816330432892,
"learning_rate": 0.00010514591815778253,
"loss": 0.6398,
"step": 3355
},
{
"epoch": 1.987427895281763,
"grad_norm": 0.20369143784046173,
"learning_rate": 0.00010456806377259795,
"loss": 0.671,
"step": 3360
},
{
"epoch": 1.990386037568407,
"grad_norm": 0.2043718546628952,
"learning_rate": 0.0001039912658173381,
"loss": 0.6423,
"step": 3365
},
{
"epoch": 1.993344179855051,
"grad_norm": 0.20425325632095337,
"learning_rate": 0.00010341553021949456,
"loss": 0.6566,
"step": 3370
},
{
"epoch": 1.9963023221416951,
"grad_norm": 0.19125695526599884,
"learning_rate": 0.00010284086289564125,
"loss": 0.6491,
"step": 3375
},
{
"epoch": 1.999260464428339,
"grad_norm": 0.20721665024757385,
"learning_rate": 0.00010226726975137421,
"loss": 0.6697,
"step": 3380
},
{
"epoch": 2.0017748853719866,
"grad_norm": 0.1947908252477646,
"learning_rate": 0.0001016947566812503,
"loss": 0.6123,
"step": 3385
},
{
"epoch": 2.0047330276586304,
"grad_norm": 0.22768820822238922,
"learning_rate": 0.0001011233295687272,
"loss": 0.598,
"step": 3390
},
{
"epoch": 2.007691169945274,
"grad_norm": 0.20400209724903107,
"learning_rate": 0.00010055299428610279,
"loss": 0.5928,
"step": 3395
},
{
"epoch": 2.0106493122319185,
"grad_norm": 0.2241700440645218,
"learning_rate": 9.998375669445419e-05,
"loss": 0.5821,
"step": 3400
},
{
"epoch": 2.0106493122319185,
"eval_loss": 0.5831112861633301,
"eval_runtime": 15.2237,
"eval_samples_per_second": 426.309,
"eval_steps_per_second": 13.334,
"step": 3400
},
{
"epoch": 2.0136074545185623,
"grad_norm": 0.20261913537979126,
"learning_rate": 9.941562264357865e-05,
"loss": 0.5866,
"step": 3405
},
{
"epoch": 2.0165655968052065,
"grad_norm": 0.20536375045776367,
"learning_rate": 9.884859797193239e-05,
"loss": 0.5946,
"step": 3410
},
{
"epoch": 2.0195237390918503,
"grad_norm": 0.21416200697422028,
"learning_rate": 9.828268850657138e-05,
"loss": 0.5856,
"step": 3415
},
{
"epoch": 2.022481881378494,
"grad_norm": 0.2201823741197586,
"learning_rate": 9.771790006309084e-05,
"loss": 0.6029,
"step": 3420
},
{
"epoch": 2.0254400236651384,
"grad_norm": 0.2173227071762085,
"learning_rate": 9.715423844556602e-05,
"loss": 0.5871,
"step": 3425
},
{
"epoch": 2.0283981659517822,
"grad_norm": 0.21699583530426025,
"learning_rate": 9.659170944649196e-05,
"loss": 0.5773,
"step": 3430
},
{
"epoch": 2.031356308238426,
"grad_norm": 0.2157624065876007,
"learning_rate": 9.603031884672467e-05,
"loss": 0.5979,
"step": 3435
},
{
"epoch": 2.0343144505250703,
"grad_norm": 0.21308545768260956,
"learning_rate": 9.547007241542108e-05,
"loss": 0.5749,
"step": 3440
},
{
"epoch": 2.037272592811714,
"grad_norm": 0.21203207969665527,
"learning_rate": 9.491097590998e-05,
"loss": 0.5985,
"step": 3445
},
{
"epoch": 2.0402307350983584,
"grad_norm": 0.21418456733226776,
"learning_rate": 9.435303507598322e-05,
"loss": 0.5917,
"step": 3450
},
{
"epoch": 2.0402307350983584,
"eval_loss": 0.5792038440704346,
"eval_runtime": 15.2302,
"eval_samples_per_second": 426.128,
"eval_steps_per_second": 13.329,
"step": 3450
},
{
"epoch": 2.043188877385002,
"grad_norm": 0.21770378947257996,
"learning_rate": 9.379625564713593e-05,
"loss": 0.5706,
"step": 3455
},
{
"epoch": 2.046147019671646,
"grad_norm": 0.21893031895160675,
"learning_rate": 9.324064334520837e-05,
"loss": 0.5926,
"step": 3460
},
{
"epoch": 2.0491051619582903,
"grad_norm": 0.20557957887649536,
"learning_rate": 9.268620387997643e-05,
"loss": 0.5886,
"step": 3465
},
{
"epoch": 2.052063304244934,
"grad_norm": 0.22509662806987762,
"learning_rate": 9.213294294916363e-05,
"loss": 0.5848,
"step": 3470
},
{
"epoch": 2.0550214465315784,
"grad_norm": 0.2100546807050705,
"learning_rate": 9.158086623838189e-05,
"loss": 0.5863,
"step": 3475
},
{
"epoch": 2.057979588818222,
"grad_norm": 0.20398281514644623,
"learning_rate": 9.102997942107373e-05,
"loss": 0.603,
"step": 3480
},
{
"epoch": 2.060937731104866,
"grad_norm": 0.20686747133731842,
"learning_rate": 9.04802881584535e-05,
"loss": 0.5747,
"step": 3485
},
{
"epoch": 2.0638958733915103,
"grad_norm": 0.2102808654308319,
"learning_rate": 8.993179809944937e-05,
"loss": 0.5916,
"step": 3490
},
{
"epoch": 2.066854015678154,
"grad_norm": 0.21308141946792603,
"learning_rate": 8.938451488064526e-05,
"loss": 0.5668,
"step": 3495
},
{
"epoch": 2.069812157964798,
"grad_norm": 0.21371279656887054,
"learning_rate": 8.883844412622322e-05,
"loss": 0.5813,
"step": 3500
},
{
"epoch": 2.069812157964798,
"eval_loss": 0.575473964214325,
"eval_runtime": 15.2042,
"eval_samples_per_second": 426.855,
"eval_steps_per_second": 13.352,
"step": 3500
},
{
"epoch": 2.072770300251442,
"grad_norm": 0.22001944482326508,
"learning_rate": 8.829359144790494e-05,
"loss": 0.5913,
"step": 3505
},
{
"epoch": 2.075728442538086,
"grad_norm": 0.22368231415748596,
"learning_rate": 8.774996244489475e-05,
"loss": 0.5877,
"step": 3510
},
{
"epoch": 2.0786865848247302,
"grad_norm": 0.22365407645702362,
"learning_rate": 8.72075627038219e-05,
"loss": 0.6037,
"step": 3515
},
{
"epoch": 2.081644727111374,
"grad_norm": 0.20973624289035797,
"learning_rate": 8.666639779868279e-05,
"loss": 0.5844,
"step": 3520
},
{
"epoch": 2.084602869398018,
"grad_norm": 0.21420718729496002,
"learning_rate": 8.612647329078422e-05,
"loss": 0.5921,
"step": 3525
},
{
"epoch": 2.087561011684662,
"grad_norm": 0.22668996453285217,
"learning_rate": 8.558779472868585e-05,
"loss": 0.5886,
"step": 3530
},
{
"epoch": 2.090519153971306,
"grad_norm": 0.21416419744491577,
"learning_rate": 8.505036764814334e-05,
"loss": 0.5981,
"step": 3535
},
{
"epoch": 2.09347729625795,
"grad_norm": 0.21493327617645264,
"learning_rate": 8.451419757205141e-05,
"loss": 0.5813,
"step": 3540
},
{
"epoch": 2.096435438544594,
"grad_norm": 0.20993159711360931,
"learning_rate": 8.397929001038732e-05,
"loss": 0.5791,
"step": 3545
},
{
"epoch": 2.099393580831238,
"grad_norm": 0.21955925226211548,
"learning_rate": 8.344565046015369e-05,
"loss": 0.5934,
"step": 3550
},
{
"epoch": 2.099393580831238,
"eval_loss": 0.5726394057273865,
"eval_runtime": 15.2125,
"eval_samples_per_second": 426.624,
"eval_steps_per_second": 13.344,
"step": 3550
},
{
"epoch": 2.102351723117882,
"grad_norm": 0.21743761003017426,
"learning_rate": 8.291328440532275e-05,
"loss": 0.5923,
"step": 3555
},
{
"epoch": 2.105309865404526,
"grad_norm": 0.2203519195318222,
"learning_rate": 8.23821973167792e-05,
"loss": 0.5911,
"step": 3560
},
{
"epoch": 2.1082680076911697,
"grad_norm": 0.21465690433979034,
"learning_rate": 8.185239465226481e-05,
"loss": 0.5821,
"step": 3565
},
{
"epoch": 2.111226149977814,
"grad_norm": 0.21976549923419952,
"learning_rate": 8.132388185632145e-05,
"loss": 0.5812,
"step": 3570
},
{
"epoch": 2.114184292264458,
"grad_norm": 0.21722052991390228,
"learning_rate": 8.079666436023603e-05,
"loss": 0.5763,
"step": 3575
},
{
"epoch": 2.117142434551102,
"grad_norm": 0.21873724460601807,
"learning_rate": 8.027074758198394e-05,
"loss": 0.6005,
"step": 3580
},
{
"epoch": 2.120100576837746,
"grad_norm": 0.21776770055294037,
"learning_rate": 7.974613692617372e-05,
"loss": 0.5781,
"step": 3585
},
{
"epoch": 2.1230587191243897,
"grad_norm": 0.22048649191856384,
"learning_rate": 7.922283778399167e-05,
"loss": 0.5792,
"step": 3590
},
{
"epoch": 2.126016861411034,
"grad_norm": 0.21600739657878876,
"learning_rate": 7.870085553314602e-05,
"loss": 0.6024,
"step": 3595
},
{
"epoch": 2.128975003697678,
"grad_norm": 0.224426731467247,
"learning_rate": 7.818019553781215e-05,
"loss": 0.5959,
"step": 3600
},
{
"epoch": 2.128975003697678,
"eval_loss": 0.5697709321975708,
"eval_runtime": 15.28,
"eval_samples_per_second": 424.739,
"eval_steps_per_second": 13.285,
"step": 3600
},
{
"epoch": 2.131933145984322,
"grad_norm": 0.20624466240406036,
"learning_rate": 7.766086314857693e-05,
"loss": 0.578,
"step": 3605
},
{
"epoch": 2.134891288270966,
"grad_norm": 0.21350590884685516,
"learning_rate": 7.714286370238435e-05,
"loss": 0.5791,
"step": 3610
},
{
"epoch": 2.1378494305576097,
"grad_norm": 0.2191067934036255,
"learning_rate": 7.662620252248002e-05,
"loss": 0.577,
"step": 3615
},
{
"epoch": 2.140807572844254,
"grad_norm": 0.21235939860343933,
"learning_rate": 7.611088491835717e-05,
"loss": 0.5812,
"step": 3620
},
{
"epoch": 2.1437657151308978,
"grad_norm": 0.20971763134002686,
"learning_rate": 7.559691618570121e-05,
"loss": 0.5837,
"step": 3625
},
{
"epoch": 2.146723857417542,
"grad_norm": 0.22103586792945862,
"learning_rate": 7.508430160633623e-05,
"loss": 0.6064,
"step": 3630
},
{
"epoch": 2.149681999704186,
"grad_norm": 0.20907030999660492,
"learning_rate": 7.457304644817021e-05,
"loss": 0.5821,
"step": 3635
},
{
"epoch": 2.1526401419908296,
"grad_norm": 0.21533241868019104,
"learning_rate": 7.406315596514083e-05,
"loss": 0.5904,
"step": 3640
},
{
"epoch": 2.155598284277474,
"grad_norm": 0.2256341576576233,
"learning_rate": 7.355463539716179e-05,
"loss": 0.5935,
"step": 3645
},
{
"epoch": 2.1585564265641177,
"grad_norm": 0.21132247149944305,
"learning_rate": 7.304748997006862e-05,
"loss": 0.5842,
"step": 3650
},
{
"epoch": 2.1585564265641177,
"eval_loss": 0.567834198474884,
"eval_runtime": 15.3306,
"eval_samples_per_second": 423.336,
"eval_steps_per_second": 13.241,
"step": 3650
},
{
"epoch": 2.1615145688507615,
"grad_norm": 0.22597523033618927,
"learning_rate": 7.254172489556542e-05,
"loss": 0.5977,
"step": 3655
},
{
"epoch": 2.164472711137406,
"grad_norm": 0.22739310562610626,
"learning_rate": 7.203734537117064e-05,
"loss": 0.594,
"step": 3660
},
{
"epoch": 2.1674308534240496,
"grad_norm": 0.21618716418743134,
"learning_rate": 7.153435658016453e-05,
"loss": 0.5776,
"step": 3665
},
{
"epoch": 2.170388995710694,
"grad_norm": 0.22005634009838104,
"learning_rate": 7.10327636915349e-05,
"loss": 0.5861,
"step": 3670
},
{
"epoch": 2.1733471379973377,
"grad_norm": 0.2195035070180893,
"learning_rate": 7.053257185992494e-05,
"loss": 0.5941,
"step": 3675
},
{
"epoch": 2.1763052802839815,
"grad_norm": 0.20980341732501984,
"learning_rate": 7.003378622557946e-05,
"loss": 0.5724,
"step": 3680
},
{
"epoch": 2.1792634225706258,
"grad_norm": 0.22259102761745453,
"learning_rate": 6.953641191429277e-05,
"loss": 0.573,
"step": 3685
},
{
"epoch": 2.1822215648572696,
"grad_norm": 0.22640399634838104,
"learning_rate": 6.904045403735528e-05,
"loss": 0.583,
"step": 3690
},
{
"epoch": 2.185179707143914,
"grad_norm": 0.21845850348472595,
"learning_rate": 6.85459176915017e-05,
"loss": 0.5812,
"step": 3695
},
{
"epoch": 2.1881378494305577,
"grad_norm": 0.22164185345172882,
"learning_rate": 6.8052807958858e-05,
"loss": 0.5848,
"step": 3700
},
{
"epoch": 2.1881378494305577,
"eval_loss": 0.5647861361503601,
"eval_runtime": 15.5273,
"eval_samples_per_second": 417.975,
"eval_steps_per_second": 13.074,
"step": 3700
},
{
"epoch": 2.1910959917172015,
"grad_norm": 0.22857585549354553,
"learning_rate": 6.756112990688974e-05,
"loss": 0.5896,
"step": 3705
},
{
"epoch": 2.1940541340038457,
"grad_norm": 0.22805197536945343,
"learning_rate": 6.707088858834962e-05,
"loss": 0.583,
"step": 3710
},
{
"epoch": 2.1970122762904896,
"grad_norm": 0.21297574043273926,
"learning_rate": 6.658208904122559e-05,
"loss": 0.5707,
"step": 3715
},
{
"epoch": 2.1999704185771334,
"grad_norm": 0.21938017010688782,
"learning_rate": 6.609473628868942e-05,
"loss": 0.5826,
"step": 3720
},
{
"epoch": 2.2029285608637776,
"grad_norm": 0.22642748057842255,
"learning_rate": 6.560883533904459e-05,
"loss": 0.5791,
"step": 3725
},
{
"epoch": 2.2058867031504215,
"grad_norm": 0.22415103018283844,
"learning_rate": 6.512439118567521e-05,
"loss": 0.5906,
"step": 3730
},
{
"epoch": 2.2088448454370657,
"grad_norm": 0.22948449850082397,
"learning_rate": 6.46414088069944e-05,
"loss": 0.5946,
"step": 3735
},
{
"epoch": 2.2118029877237095,
"grad_norm": 0.2156379073858261,
"learning_rate": 6.415989316639354e-05,
"loss": 0.58,
"step": 3740
},
{
"epoch": 2.2147611300103534,
"grad_norm": 0.21520055830478668,
"learning_rate": 6.367984921219066e-05,
"loss": 0.5611,
"step": 3745
},
{
"epoch": 2.2177192722969976,
"grad_norm": 0.23094354569911957,
"learning_rate": 6.320128187758033e-05,
"loss": 0.5992,
"step": 3750
},
{
"epoch": 2.2177192722969976,
"eval_loss": 0.5629158020019531,
"eval_runtime": 15.3089,
"eval_samples_per_second": 423.936,
"eval_steps_per_second": 13.26,
"step": 3750
},
{
"epoch": 2.2206774145836414,
"grad_norm": 0.2340443730354309,
"learning_rate": 6.272419608058222e-05,
"loss": 0.6019,
"step": 3755
},
{
"epoch": 2.2236355568702857,
"grad_norm": 0.22871170938014984,
"learning_rate": 6.224859672399101e-05,
"loss": 0.6042,
"step": 3760
},
{
"epoch": 2.2265936991569295,
"grad_norm": 0.22238165140151978,
"learning_rate": 6.17744886953261e-05,
"loss": 0.5862,
"step": 3765
},
{
"epoch": 2.2295518414435733,
"grad_norm": 0.23319736123085022,
"learning_rate": 6.130187686678089e-05,
"loss": 0.5803,
"step": 3770
},
{
"epoch": 2.2325099837302176,
"grad_norm": 0.22640018165111542,
"learning_rate": 6.0830766095173266e-05,
"loss": 0.5948,
"step": 3775
},
{
"epoch": 2.2354681260168614,
"grad_norm": 0.2193189114332199,
"learning_rate": 6.03611612218952e-05,
"loss": 0.5903,
"step": 3780
},
{
"epoch": 2.238426268303505,
"grad_norm": 0.22366072237491608,
"learning_rate": 5.989306707286349e-05,
"loss": 0.5957,
"step": 3785
},
{
"epoch": 2.2413844105901495,
"grad_norm": 0.21540489792823792,
"learning_rate": 5.942648845846961e-05,
"loss": 0.5798,
"step": 3790
},
{
"epoch": 2.2443425528767933,
"grad_norm": 0.21111348271369934,
"learning_rate": 5.896143017353086e-05,
"loss": 0.5841,
"step": 3795
},
{
"epoch": 2.2473006951634376,
"grad_norm": 0.21304626762866974,
"learning_rate": 5.849789699724059e-05,
"loss": 0.5901,
"step": 3800
},
{
"epoch": 2.2473006951634376,
"eval_loss": 0.5599969625473022,
"eval_runtime": 15.2714,
"eval_samples_per_second": 424.977,
"eval_steps_per_second": 13.293,
"step": 3800
},
{
"epoch": 2.2502588374500814,
"grad_norm": 0.2219642996788025,
"learning_rate": 5.803589369311938e-05,
"loss": 0.5921,
"step": 3805
},
{
"epoch": 2.253216979736725,
"grad_norm": 0.21248315274715424,
"learning_rate": 5.757542500896596e-05,
"loss": 0.5721,
"step": 3810
},
{
"epoch": 2.2561751220233695,
"grad_norm": 0.22615401446819305,
"learning_rate": 5.711649567680859e-05,
"loss": 0.5929,
"step": 3815
},
{
"epoch": 2.2591332643100133,
"grad_norm": 0.22166916728019714,
"learning_rate": 5.665911041285612e-05,
"loss": 0.5707,
"step": 3820
},
{
"epoch": 2.2620914065966575,
"grad_norm": 0.2249995470046997,
"learning_rate": 5.620327391744995e-05,
"loss": 0.5878,
"step": 3825
},
{
"epoch": 2.2650495488833013,
"grad_norm": 0.20594008266925812,
"learning_rate": 5.57489908750152e-05,
"loss": 0.5796,
"step": 3830
},
{
"epoch": 2.268007691169945,
"grad_norm": 0.2205434888601303,
"learning_rate": 5.5296265954013146e-05,
"loss": 0.5674,
"step": 3835
},
{
"epoch": 2.2709658334565894,
"grad_norm": 0.22647491097450256,
"learning_rate": 5.484510380689269e-05,
"loss": 0.5821,
"step": 3840
},
{
"epoch": 2.2739239757432332,
"grad_norm": 0.22615395486354828,
"learning_rate": 5.439550907004304e-05,
"loss": 0.5899,
"step": 3845
},
{
"epoch": 2.276882118029877,
"grad_norm": 0.2161322832107544,
"learning_rate": 5.394748636374572e-05,
"loss": 0.5791,
"step": 3850
},
{
"epoch": 2.276882118029877,
"eval_loss": 0.5578413009643555,
"eval_runtime": 15.1815,
"eval_samples_per_second": 427.494,
"eval_steps_per_second": 13.372,
"step": 3850
},
{
"epoch": 2.2798402603165213,
"grad_norm": 0.22290246188640594,
"learning_rate": 5.3501040292127126e-05,
"loss": 0.592,
"step": 3855
},
{
"epoch": 2.282798402603165,
"grad_norm": 0.22553138434886932,
"learning_rate": 5.305617544311153e-05,
"loss": 0.5808,
"step": 3860
},
{
"epoch": 2.2857565448898094,
"grad_norm": 0.2151021510362625,
"learning_rate": 5.2612896388373444e-05,
"loss": 0.5892,
"step": 3865
},
{
"epoch": 2.288714687176453,
"grad_norm": 0.22751715779304504,
"learning_rate": 5.217120768329112e-05,
"loss": 0.5763,
"step": 3870
},
{
"epoch": 2.291672829463097,
"grad_norm": 0.21743617951869965,
"learning_rate": 5.1731113866899264e-05,
"loss": 0.5873,
"step": 3875
},
{
"epoch": 2.2946309717497413,
"grad_norm": 0.21894121170043945,
"learning_rate": 5.12926194618429e-05,
"loss": 0.571,
"step": 3880
},
{
"epoch": 2.297589114036385,
"grad_norm": 0.21481330692768097,
"learning_rate": 5.085572897433036e-05,
"loss": 0.5832,
"step": 3885
},
{
"epoch": 2.3005472563230294,
"grad_norm": 0.2185722440481186,
"learning_rate": 5.042044689408748e-05,
"loss": 0.5796,
"step": 3890
},
{
"epoch": 2.303505398609673,
"grad_norm": 0.21149496734142303,
"learning_rate": 4.998677769431105e-05,
"loss": 0.5774,
"step": 3895
},
{
"epoch": 2.306463540896317,
"grad_norm": 0.2339731901884079,
"learning_rate": 4.9554725831623036e-05,
"loss": 0.5697,
"step": 3900
},
{
"epoch": 2.306463540896317,
"eval_loss": 0.5554924607276917,
"eval_runtime": 15.2313,
"eval_samples_per_second": 426.096,
"eval_steps_per_second": 13.328,
"step": 3900
},
{
"epoch": 2.3094216831829613,
"grad_norm": 0.21912769973278046,
"learning_rate": 4.9124295746024905e-05,
"loss": 0.5794,
"step": 3905
},
{
"epoch": 2.312379825469605,
"grad_norm": 0.21787823736667633,
"learning_rate": 4.869549186085165e-05,
"loss": 0.562,
"step": 3910
},
{
"epoch": 2.315337967756249,
"grad_norm": 0.2172250598669052,
"learning_rate": 4.8268318582726754e-05,
"loss": 0.5806,
"step": 3915
},
{
"epoch": 2.318296110042893,
"grad_norm": 0.21160916984081268,
"learning_rate": 4.784278030151647e-05,
"loss": 0.5779,
"step": 3920
},
{
"epoch": 2.321254252329537,
"grad_norm": 0.21556870639324188,
"learning_rate": 4.7418881390285164e-05,
"loss": 0.5773,
"step": 3925
},
{
"epoch": 2.3242123946161812,
"grad_norm": 0.2316085398197174,
"learning_rate": 4.699662620524988e-05,
"loss": 0.575,
"step": 3930
},
{
"epoch": 2.327170536902825,
"grad_norm": 0.2333250194787979,
"learning_rate": 4.657601908573614e-05,
"loss": 0.597,
"step": 3935
},
{
"epoch": 2.330128679189469,
"grad_norm": 0.21552544832229614,
"learning_rate": 4.6157064354132644e-05,
"loss": 0.5801,
"step": 3940
},
{
"epoch": 2.333086821476113,
"grad_norm": 0.2206254005432129,
"learning_rate": 4.573976631584764e-05,
"loss": 0.5875,
"step": 3945
},
{
"epoch": 2.336044963762757,
"grad_norm": 0.23494865000247955,
"learning_rate": 4.532412925926401e-05,
"loss": 0.5908,
"step": 3950
},
{
"epoch": 2.336044963762757,
"eval_loss": 0.5537571907043457,
"eval_runtime": 15.188,
"eval_samples_per_second": 427.311,
"eval_steps_per_second": 13.366,
"step": 3950
},
{
"epoch": 2.339003106049401,
"grad_norm": 0.2163960486650467,
"learning_rate": 4.491015745569572e-05,
"loss": 0.5735,
"step": 3955
},
{
"epoch": 2.341961248336045,
"grad_norm": 0.2299627661705017,
"learning_rate": 4.4497855159343435e-05,
"loss": 0.5574,
"step": 3960
},
{
"epoch": 2.344919390622689,
"grad_norm": 0.22644494473934174,
"learning_rate": 4.408722660725121e-05,
"loss": 0.5687,
"step": 3965
},
{
"epoch": 2.347877532909333,
"grad_norm": 0.21835680305957794,
"learning_rate": 4.3678276019262836e-05,
"loss": 0.5675,
"step": 3970
},
{
"epoch": 2.350835675195977,
"grad_norm": 0.22462695837020874,
"learning_rate": 4.32710075979782e-05,
"loss": 0.5843,
"step": 3975
},
{
"epoch": 2.3537938174826207,
"grad_norm": 0.22448210418224335,
"learning_rate": 4.28654255287106e-05,
"loss": 0.5863,
"step": 3980
},
{
"epoch": 2.356751959769265,
"grad_norm": 0.21931299567222595,
"learning_rate": 4.2461533979443276e-05,
"loss": 0.5733,
"step": 3985
},
{
"epoch": 2.359710102055909,
"grad_norm": 0.2160944640636444,
"learning_rate": 4.2059337100786736e-05,
"loss": 0.5796,
"step": 3990
},
{
"epoch": 2.362668244342553,
"grad_norm": 0.22542470693588257,
"learning_rate": 4.165883902593623e-05,
"loss": 0.5832,
"step": 3995
},
{
"epoch": 2.365626386629197,
"grad_norm": 0.2219017744064331,
"learning_rate": 4.12600438706292e-05,
"loss": 0.5875,
"step": 4000
},
{
"epoch": 2.365626386629197,
"eval_loss": 0.5519596338272095,
"eval_runtime": 15.192,
"eval_samples_per_second": 427.199,
"eval_steps_per_second": 13.362,
"step": 4000
},
{
"epoch": 2.3685845289158407,
"grad_norm": 0.21874652802944183,
"learning_rate": 4.086295573310277e-05,
"loss": 0.579,
"step": 4005
},
{
"epoch": 2.371542671202485,
"grad_norm": 0.21773286163806915,
"learning_rate": 4.0467578694052067e-05,
"loss": 0.6015,
"step": 4010
},
{
"epoch": 2.374500813489129,
"grad_norm": 0.22749686241149902,
"learning_rate": 4.007391681658778e-05,
"loss": 0.5737,
"step": 4015
},
{
"epoch": 2.377458955775773,
"grad_norm": 0.22802948951721191,
"learning_rate": 3.968197414619491e-05,
"loss": 0.5694,
"step": 4020
},
{
"epoch": 2.380417098062417,
"grad_norm": 0.21437138319015503,
"learning_rate": 3.929175471069067e-05,
"loss": 0.5741,
"step": 4025
},
{
"epoch": 2.3833752403490607,
"grad_norm": 0.22747749090194702,
"learning_rate": 3.8903262520183675e-05,
"loss": 0.5907,
"step": 4030
},
{
"epoch": 2.386333382635705,
"grad_norm": 0.21637846529483795,
"learning_rate": 3.851650156703215e-05,
"loss": 0.5689,
"step": 4035
},
{
"epoch": 2.3892915249223488,
"grad_norm": 0.21709904074668884,
"learning_rate": 3.81314758258033e-05,
"loss": 0.5813,
"step": 4040
},
{
"epoch": 2.3922496672089926,
"grad_norm": 0.22659163177013397,
"learning_rate": 3.7748189253232394e-05,
"loss": 0.5856,
"step": 4045
},
{
"epoch": 2.395207809495637,
"grad_norm": 0.21906381845474243,
"learning_rate": 3.736664578818191e-05,
"loss": 0.5832,
"step": 4050
},
{
"epoch": 2.395207809495637,
"eval_loss": 0.550140917301178,
"eval_runtime": 15.1756,
"eval_samples_per_second": 427.662,
"eval_steps_per_second": 13.377,
"step": 4050
},
{
"epoch": 2.3981659517822806,
"grad_norm": 0.22354574501514435,
"learning_rate": 3.6986849351601395e-05,
"loss": 0.5782,
"step": 4055
},
{
"epoch": 2.401124094068925,
"grad_norm": 0.2217273861169815,
"learning_rate": 3.660880384648673e-05,
"loss": 0.591,
"step": 4060
},
{
"epoch": 2.4040822363555687,
"grad_norm": 0.21969923377037048,
"learning_rate": 3.623251315784055e-05,
"loss": 0.5749,
"step": 4065
},
{
"epoch": 2.4070403786422125,
"grad_norm": 0.22072407603263855,
"learning_rate": 3.5857981152631714e-05,
"loss": 0.5727,
"step": 4070
},
{
"epoch": 2.409998520928857,
"grad_norm": 0.21851158142089844,
"learning_rate": 3.5485211679756226e-05,
"loss": 0.5737,
"step": 4075
},
{
"epoch": 2.4129566632155006,
"grad_norm": 0.22839130461215973,
"learning_rate": 3.51142085699971e-05,
"loss": 0.5833,
"step": 4080
},
{
"epoch": 2.415914805502145,
"grad_norm": 0.2259005606174469,
"learning_rate": 3.474497563598524e-05,
"loss": 0.5942,
"step": 4085
},
{
"epoch": 2.4188729477887887,
"grad_norm": 0.22297006845474243,
"learning_rate": 3.437751667216045e-05,
"loss": 0.5809,
"step": 4090
},
{
"epoch": 2.4218310900754325,
"grad_norm": 0.2227243185043335,
"learning_rate": 3.401183545473203e-05,
"loss": 0.5713,
"step": 4095
},
{
"epoch": 2.4247892323620768,
"grad_norm": 0.22505450248718262,
"learning_rate": 3.364793574164036e-05,
"loss": 0.5814,
"step": 4100
},
{
"epoch": 2.4247892323620768,
"eval_loss": 0.5483865737915039,
"eval_runtime": 15.2502,
"eval_samples_per_second": 425.567,
"eval_steps_per_second": 13.311,
"step": 4100
},
{
"epoch": 2.4277473746487206,
"grad_norm": 0.2335953265428543,
"learning_rate": 3.328582127251795e-05,
"loss": 0.5903,
"step": 4105
},
{
"epoch": 2.4307055169353644,
"grad_norm": 0.22900566458702087,
"learning_rate": 3.29254957686513e-05,
"loss": 0.5795,
"step": 4110
},
{
"epoch": 2.4336636592220087,
"grad_norm": 0.2234223335981369,
"learning_rate": 3.256696293294239e-05,
"loss": 0.5797,
"step": 4115
},
{
"epoch": 2.4366218015086525,
"grad_norm": 0.22404076159000397,
"learning_rate": 3.2210226449870985e-05,
"loss": 0.5707,
"step": 4120
},
{
"epoch": 2.4395799437952967,
"grad_norm": 0.2215253710746765,
"learning_rate": 3.185528998545622e-05,
"loss": 0.5675,
"step": 4125
},
{
"epoch": 2.4425380860819406,
"grad_norm": 0.22823339700698853,
"learning_rate": 3.150215718721953e-05,
"loss": 0.5782,
"step": 4130
},
{
"epoch": 2.4454962283685844,
"grad_norm": 0.21130341291427612,
"learning_rate": 3.1150831684146714e-05,
"loss": 0.5719,
"step": 4135
},
{
"epoch": 2.4484543706552286,
"grad_norm": 0.22080035507678986,
"learning_rate": 3.0801317086651016e-05,
"loss": 0.5729,
"step": 4140
},
{
"epoch": 2.4514125129418725,
"grad_norm": 0.21857194602489471,
"learning_rate": 3.0453616986535577e-05,
"loss": 0.5751,
"step": 4145
},
{
"epoch": 2.4543706552285167,
"grad_norm": 0.2172456979751587,
"learning_rate": 3.010773495695699e-05,
"loss": 0.5829,
"step": 4150
},
{
"epoch": 2.4543706552285167,
"eval_loss": 0.5469695925712585,
"eval_runtime": 15.3408,
"eval_samples_per_second": 423.054,
"eval_steps_per_second": 13.233,
"step": 4150
},
{
"epoch": 2.4573287975151605,
"grad_norm": 0.23181508481502533,
"learning_rate": 2.9763674552388183e-05,
"loss": 0.5826,
"step": 4155
},
{
"epoch": 2.4602869398018044,
"grad_norm": 0.21691949665546417,
"learning_rate": 2.9421439308582223e-05,
"loss": 0.5757,
"step": 4160
},
{
"epoch": 2.4632450820884486,
"grad_norm": 0.2233293354511261,
"learning_rate": 2.908103274253573e-05,
"loss": 0.586,
"step": 4165
},
{
"epoch": 2.4662032243750924,
"grad_norm": 0.22276481986045837,
"learning_rate": 2.87424583524528e-05,
"loss": 0.5745,
"step": 4170
},
{
"epoch": 2.4691613666617362,
"grad_norm": 0.22253195941448212,
"learning_rate": 2.8405719617709216e-05,
"loss": 0.5874,
"step": 4175
},
{
"epoch": 2.4721195089483805,
"grad_norm": 0.22543801367282867,
"learning_rate": 2.8070819998816428e-05,
"loss": 0.5726,
"step": 4180
},
{
"epoch": 2.4750776512350243,
"grad_norm": 0.22294208407402039,
"learning_rate": 2.7737762937386233e-05,
"loss": 0.579,
"step": 4185
},
{
"epoch": 2.4780357935216686,
"grad_norm": 0.22648243606090546,
"learning_rate": 2.7406551856095202e-05,
"loss": 0.5707,
"step": 4190
},
{
"epoch": 2.4809939358083124,
"grad_norm": 0.22971773147583008,
"learning_rate": 2.7077190158649696e-05,
"loss": 0.5892,
"step": 4195
},
{
"epoch": 2.483952078094956,
"grad_norm": 0.2182358354330063,
"learning_rate": 2.6749681229750704e-05,
"loss": 0.5656,
"step": 4200
},
{
"epoch": 2.483952078094956,
"eval_loss": 0.5456834435462952,
"eval_runtime": 15.2341,
"eval_samples_per_second": 426.017,
"eval_steps_per_second": 13.325,
"step": 4200
},
{
"epoch": 2.4869102203816005,
"grad_norm": 0.22291642427444458,
"learning_rate": 2.6424028435059256e-05,
"loss": 0.5852,
"step": 4205
},
{
"epoch": 2.4898683626682443,
"grad_norm": 0.21934635937213898,
"learning_rate": 2.6100235121161643e-05,
"loss": 0.5811,
"step": 4210
},
{
"epoch": 2.4928265049548886,
"grad_norm": 0.22124481201171875,
"learning_rate": 2.5778304615535083e-05,
"loss": 0.5646,
"step": 4215
},
{
"epoch": 2.4957846472415324,
"grad_norm": 0.2168821543455124,
"learning_rate": 2.5458240226513753e-05,
"loss": 0.5701,
"step": 4220
},
{
"epoch": 2.498742789528176,
"grad_norm": 0.22023184597492218,
"learning_rate": 2.5140045243254303e-05,
"loss": 0.5771,
"step": 4225
},
{
"epoch": 2.5017009318148204,
"grad_norm": 0.22466090321540833,
"learning_rate": 2.4823722935702658e-05,
"loss": 0.5746,
"step": 4230
},
{
"epoch": 2.5046590741014643,
"grad_norm": 0.21793463826179504,
"learning_rate": 2.4509276554559827e-05,
"loss": 0.5911,
"step": 4235
},
{
"epoch": 2.507617216388108,
"grad_norm": 0.22143520414829254,
"learning_rate": 2.4196709331248968e-05,
"loss": 0.5827,
"step": 4240
},
{
"epoch": 2.5105753586747523,
"grad_norm": 0.22911550104618073,
"learning_rate": 2.3886024477881854e-05,
"loss": 0.5873,
"step": 4245
},
{
"epoch": 2.513533500961396,
"grad_norm": 0.22519181668758392,
"learning_rate": 2.3577225187226116e-05,
"loss": 0.5883,
"step": 4250
},
{
"epoch": 2.513533500961396,
"eval_loss": 0.5445500612258911,
"eval_runtime": 15.2095,
"eval_samples_per_second": 426.706,
"eval_steps_per_second": 13.347,
"step": 4250
},
{
"epoch": 2.51649164324804,
"grad_norm": 0.22342784702777863,
"learning_rate": 2.3270314632672217e-05,
"loss": 0.5666,
"step": 4255
},
{
"epoch": 2.5194497855346842,
"grad_norm": 0.2204166203737259,
"learning_rate": 2.2965295968200944e-05,
"loss": 0.5656,
"step": 4260
},
{
"epoch": 2.522407927821328,
"grad_norm": 0.2240799218416214,
"learning_rate": 2.2662172328350975e-05,
"loss": 0.5842,
"step": 4265
},
{
"epoch": 2.5253660701079723,
"grad_norm": 0.22049422562122345,
"learning_rate": 2.2360946828186807e-05,
"loss": 0.5602,
"step": 4270
},
{
"epoch": 2.528324212394616,
"grad_norm": 0.2233700156211853,
"learning_rate": 2.20616225632664e-05,
"loss": 0.583,
"step": 4275
},
{
"epoch": 2.5312823546812604,
"grad_norm": 0.21494439244270325,
"learning_rate": 2.176420260960981e-05,
"loss": 0.5665,
"step": 4280
},
{
"epoch": 2.534240496967904,
"grad_norm": 0.22276660799980164,
"learning_rate": 2.146869002366714e-05,
"loss": 0.5828,
"step": 4285
},
{
"epoch": 2.537198639254548,
"grad_norm": 0.23662561178207397,
"learning_rate": 2.1175087842287453e-05,
"loss": 0.5875,
"step": 4290
},
{
"epoch": 2.5401567815411923,
"grad_norm": 0.2167738676071167,
"learning_rate": 2.0883399082687503e-05,
"loss": 0.579,
"step": 4295
},
{
"epoch": 2.543114923827836,
"grad_norm": 0.2297111451625824,
"learning_rate": 2.0593626742420543e-05,
"loss": 0.5786,
"step": 4300
},
{
"epoch": 2.543114923827836,
"eval_loss": 0.5430043935775757,
"eval_runtime": 15.2067,
"eval_samples_per_second": 426.786,
"eval_steps_per_second": 13.349,
"step": 4300
},
{
"epoch": 2.54607306611448,
"grad_norm": 0.2167220413684845,
"learning_rate": 2.0305773799345715e-05,
"loss": 0.5746,
"step": 4305
},
{
"epoch": 2.549031208401124,
"grad_norm": 0.21370890736579895,
"learning_rate": 2.0019843211597343e-05,
"loss": 0.5808,
"step": 4310
},
{
"epoch": 2.551989350687768,
"grad_norm": 0.21757473051548004,
"learning_rate": 1.9735837917554708e-05,
"loss": 0.5833,
"step": 4315
},
{
"epoch": 2.554947492974412,
"grad_norm": 0.22231775522232056,
"learning_rate": 1.9453760835811493e-05,
"loss": 0.5669,
"step": 4320
},
{
"epoch": 2.557905635261056,
"grad_norm": 0.23005709052085876,
"learning_rate": 1.9173614865146273e-05,
"loss": 0.5765,
"step": 4325
},
{
"epoch": 2.5608637775477,
"grad_norm": 0.22354349493980408,
"learning_rate": 1.889540288449228e-05,
"loss": 0.5687,
"step": 4330
},
{
"epoch": 2.563821919834344,
"grad_norm": 0.21966968476772308,
"learning_rate": 1.8619127752908098e-05,
"loss": 0.5756,
"step": 4335
},
{
"epoch": 2.566780062120988,
"grad_norm": 0.21885375678539276,
"learning_rate": 1.8344792309548108e-05,
"loss": 0.5739,
"step": 4340
},
{
"epoch": 2.5697382044076322,
"grad_norm": 0.22621390223503113,
"learning_rate": 1.8072399373633515e-05,
"loss": 0.5851,
"step": 4345
},
{
"epoch": 2.572696346694276,
"grad_norm": 0.2224331647157669,
"learning_rate": 1.7801951744423186e-05,
"loss": 0.5824,
"step": 4350
},
{
"epoch": 2.572696346694276,
"eval_loss": 0.5423793792724609,
"eval_runtime": 15.1763,
"eval_samples_per_second": 427.641,
"eval_steps_per_second": 13.376,
"step": 4350
},
{
"epoch": 2.57565448898092,
"grad_norm": 0.23509664833545685,
"learning_rate": 1.7533452201184873e-05,
"loss": 0.5856,
"step": 4355
},
{
"epoch": 2.578612631267564,
"grad_norm": 0.22954770922660828,
"learning_rate": 1.7266903503166882e-05,
"loss": 0.5799,
"step": 4360
},
{
"epoch": 2.581570773554208,
"grad_norm": 0.2257954627275467,
"learning_rate": 1.7002308389569457e-05,
"loss": 0.5725,
"step": 4365
},
{
"epoch": 2.5845289158408518,
"grad_norm": 0.23301996290683746,
"learning_rate": 1.673966957951685e-05,
"loss": 0.5738,
"step": 4370
},
{
"epoch": 2.587487058127496,
"grad_norm": 0.22801564633846283,
"learning_rate": 1.6478989772029073e-05,
"loss": 0.5701,
"step": 4375
},
{
"epoch": 2.59044520041414,
"grad_norm": 0.20772784948349,
"learning_rate": 1.622027164599458e-05,
"loss": 0.5921,
"step": 4380
},
{
"epoch": 2.5934033427007837,
"grad_norm": 0.22121798992156982,
"learning_rate": 1.5963517860142358e-05,
"loss": 0.5858,
"step": 4385
},
{
"epoch": 2.596361484987428,
"grad_norm": 0.21859820187091827,
"learning_rate": 1.5708731053014873e-05,
"loss": 0.5699,
"step": 4390
},
{
"epoch": 2.5993196272740717,
"grad_norm": 0.2150518298149109,
"learning_rate": 1.5455913842940675e-05,
"loss": 0.566,
"step": 4395
},
{
"epoch": 2.602277769560716,
"grad_norm": 0.22459477186203003,
"learning_rate": 1.5205068828007849e-05,
"loss": 0.5802,
"step": 4400
},
{
"epoch": 2.602277769560716,
"eval_loss": 0.54107266664505,
"eval_runtime": 15.154,
"eval_samples_per_second": 428.269,
"eval_steps_per_second": 13.396,
"step": 4400
},
{
"epoch": 2.60523591184736,
"grad_norm": 0.22681817412376404,
"learning_rate": 1.4956198586036965e-05,
"loss": 0.5926,
"step": 4405
},
{
"epoch": 2.608194054134004,
"grad_norm": 0.2272169440984726,
"learning_rate": 1.4709305674554852e-05,
"loss": 0.5857,
"step": 4410
},
{
"epoch": 2.611152196420648,
"grad_norm": 0.22858625650405884,
"learning_rate": 1.4464392630768207e-05,
"loss": 0.5673,
"step": 4415
},
{
"epoch": 2.6141103387072917,
"grad_norm": 0.22836680710315704,
"learning_rate": 1.4221461971537435e-05,
"loss": 0.5648,
"step": 4420
},
{
"epoch": 2.617068480993936,
"grad_norm": 0.22064611315727234,
"learning_rate": 1.3980516193350969e-05,
"loss": 0.578,
"step": 4425
},
{
"epoch": 2.62002662328058,
"grad_norm": 0.21425370872020721,
"learning_rate": 1.3741557772299449e-05,
"loss": 0.5629,
"step": 4430
},
{
"epoch": 2.6229847655672236,
"grad_norm": 0.23086489737033844,
"learning_rate": 1.3504589164050405e-05,
"loss": 0.5916,
"step": 4435
},
{
"epoch": 2.625942907853868,
"grad_norm": 0.21682678163051605,
"learning_rate": 1.3269612803822861e-05,
"loss": 0.5628,
"step": 4440
},
{
"epoch": 2.6289010501405117,
"grad_norm": 0.2231551855802536,
"learning_rate": 1.3036631106362562e-05,
"loss": 0.5793,
"step": 4445
},
{
"epoch": 2.6318591924271555,
"grad_norm": 0.2130296528339386,
"learning_rate": 1.2805646465916838e-05,
"loss": 0.563,
"step": 4450
},
{
"epoch": 2.6318591924271555,
"eval_loss": 0.5402519106864929,
"eval_runtime": 15.1991,
"eval_samples_per_second": 426.999,
"eval_steps_per_second": 13.356,
"step": 4450
},
{
"epoch": 2.6348173347137998,
"grad_norm": 0.225514218211174,
"learning_rate": 1.257666125621033e-05,
"loss": 0.5828,
"step": 4455
},
{
"epoch": 2.6377754770004436,
"grad_norm": 0.21802076697349548,
"learning_rate": 1.2349677830420293e-05,
"loss": 0.5674,
"step": 4460
},
{
"epoch": 2.640733619287088,
"grad_norm": 0.22092710435390472,
"learning_rate": 1.2124698521152674e-05,
"loss": 0.5715,
"step": 4465
},
{
"epoch": 2.6436917615737316,
"grad_norm": 0.2172520011663437,
"learning_rate": 1.1901725640417918e-05,
"loss": 0.5695,
"step": 4470
},
{
"epoch": 2.646649903860376,
"grad_norm": 0.2219080626964569,
"learning_rate": 1.1680761479607432e-05,
"loss": 0.58,
"step": 4475
},
{
"epoch": 2.6496080461470197,
"grad_norm": 0.22562995553016663,
"learning_rate": 1.1461808309469787e-05,
"loss": 0.5764,
"step": 4480
},
{
"epoch": 2.6525661884336635,
"grad_norm": 0.22420786321163177,
"learning_rate": 1.1244868380087579e-05,
"loss": 0.576,
"step": 4485
},
{
"epoch": 2.655524330720308,
"grad_norm": 0.2163666933774948,
"learning_rate": 1.1029943920854286e-05,
"loss": 0.5697,
"step": 4490
},
{
"epoch": 2.6584824730069516,
"grad_norm": 0.22043688595294952,
"learning_rate": 1.0817037140451184e-05,
"loss": 0.5934,
"step": 4495
},
{
"epoch": 2.6614406152935954,
"grad_norm": 0.22016723453998566,
"learning_rate": 1.0606150226824918e-05,
"loss": 0.584,
"step": 4500
},
{
"epoch": 2.6614406152935954,
"eval_loss": 0.539762556552887,
"eval_runtime": 15.2106,
"eval_samples_per_second": 426.675,
"eval_steps_per_second": 13.346,
"step": 4500
},
{
"epoch": 2.6643987575802397,
"grad_norm": 0.22158069908618927,
"learning_rate": 1.039728534716478e-05,
"loss": 0.5703,
"step": 4505
},
{
"epoch": 2.6673568998668835,
"grad_norm": 0.22302477061748505,
"learning_rate": 1.0190444647880609e-05,
"loss": 0.5894,
"step": 4510
},
{
"epoch": 2.6703150421535273,
"grad_norm": 0.22093521058559418,
"learning_rate": 9.98563025458055e-06,
"loss": 0.5745,
"step": 4515
},
{
"epoch": 2.6732731844401716,
"grad_norm": 0.2225552648305893,
"learning_rate": 9.78284427204948e-06,
"loss": 0.5729,
"step": 4520
},
{
"epoch": 2.6762313267268154,
"grad_norm": 0.227900430560112,
"learning_rate": 9.582088784227052e-06,
"loss": 0.5939,
"step": 4525
},
{
"epoch": 2.6791894690134597,
"grad_norm": 0.21942593157291412,
"learning_rate": 9.3833658541865e-06,
"loss": 0.5775,
"step": 4530
},
{
"epoch": 2.6821476113001035,
"grad_norm": 0.22814010083675385,
"learning_rate": 9.186677524113473e-06,
"loss": 0.5763,
"step": 4535
},
{
"epoch": 2.6851057535867477,
"grad_norm": 0.2286267876625061,
"learning_rate": 8.992025815284826e-06,
"loss": 0.5765,
"step": 4540
},
{
"epoch": 2.6880638958733916,
"grad_norm": 0.22250616550445557,
"learning_rate": 8.799412728048058e-06,
"loss": 0.5762,
"step": 4545
},
{
"epoch": 2.6910220381600354,
"grad_norm": 0.22371500730514526,
"learning_rate": 8.608840241800641e-06,
"loss": 0.5744,
"step": 4550
},
{
"epoch": 2.6910220381600354,
"eval_loss": 0.5388516187667847,
"eval_runtime": 15.2087,
"eval_samples_per_second": 426.73,
"eval_steps_per_second": 13.348,
"step": 4550
},
{
"epoch": 2.6939801804466796,
"grad_norm": 0.22178302705287933,
"learning_rate": 8.420310314969735e-06,
"loss": 0.5766,
"step": 4555
},
{
"epoch": 2.6969383227333235,
"grad_norm": 0.21156929433345795,
"learning_rate": 8.23382488499205e-06,
"loss": 0.5777,
"step": 4560
},
{
"epoch": 2.6998964650199673,
"grad_norm": 0.22891399264335632,
"learning_rate": 8.049385868293896e-06,
"loss": 0.5634,
"step": 4565
},
{
"epoch": 2.7028546073066115,
"grad_norm": 0.22337764501571655,
"learning_rate": 7.866995160271555e-06,
"loss": 0.5713,
"step": 4570
},
{
"epoch": 2.7058127495932554,
"grad_norm": 0.22055917978286743,
"learning_rate": 7.686654635271734e-06,
"loss": 0.5663,
"step": 4575
},
{
"epoch": 2.708770891879899,
"grad_norm": 0.2214854657649994,
"learning_rate": 7.508366146572334e-06,
"loss": 0.5647,
"step": 4580
},
{
"epoch": 2.7117290341665434,
"grad_norm": 0.22195084393024445,
"learning_rate": 7.3321315263634685e-06,
"loss": 0.5661,
"step": 4585
},
{
"epoch": 2.7146871764531872,
"grad_norm": 0.22451895475387573,
"learning_rate": 7.157952585728481e-06,
"loss": 0.5856,
"step": 4590
},
{
"epoch": 2.7176453187398315,
"grad_norm": 0.22339747846126556,
"learning_rate": 6.985831114625555e-06,
"loss": 0.5811,
"step": 4595
},
{
"epoch": 2.7206034610264753,
"grad_norm": 0.216370090842247,
"learning_rate": 6.815768881869047e-06,
"loss": 0.575,
"step": 4600
},
{
"epoch": 2.7206034610264753,
"eval_loss": 0.5387815833091736,
"eval_runtime": 15.2306,
"eval_samples_per_second": 426.115,
"eval_steps_per_second": 13.328,
"step": 4600
},
{
"epoch": 2.7235616033131196,
"grad_norm": 0.2195269763469696,
"learning_rate": 6.647767635111566e-06,
"loss": 0.5778,
"step": 4605
},
{
"epoch": 2.7265197455997634,
"grad_norm": 0.21644777059555054,
"learning_rate": 6.481829100825816e-06,
"loss": 0.5563,
"step": 4610
},
{
"epoch": 2.729477887886407,
"grad_norm": 0.21869517862796783,
"learning_rate": 6.317954984287005e-06,
"loss": 0.569,
"step": 4615
},
{
"epoch": 2.7324360301730515,
"grad_norm": 0.22483299672603607,
"learning_rate": 6.156146969555277e-06,
"loss": 0.5699,
"step": 4620
},
{
"epoch": 2.7353941724596953,
"grad_norm": 0.2190885692834854,
"learning_rate": 5.996406719458241e-06,
"loss": 0.5756,
"step": 4625
},
{
"epoch": 2.738352314746339,
"grad_norm": 0.21631799638271332,
"learning_rate": 5.838735875574182e-06,
"loss": 0.5848,
"step": 4630
},
{
"epoch": 2.7413104570329834,
"grad_norm": 0.22044958174228668,
"learning_rate": 5.6831360582149405e-06,
"loss": 0.5806,
"step": 4635
},
{
"epoch": 2.744268599319627,
"grad_norm": 0.2257104516029358,
"learning_rate": 5.529608866409443e-06,
"loss": 0.5553,
"step": 4640
},
{
"epoch": 2.747226741606271,
"grad_norm": 0.21883299946784973,
"learning_rate": 5.378155877887042e-06,
"loss": 0.5758,
"step": 4645
},
{
"epoch": 2.7501848838929153,
"grad_norm": 0.21661600470542908,
"learning_rate": 5.2287786490616e-06,
"loss": 0.5644,
"step": 4650
},
{
"epoch": 2.7501848838929153,
"eval_loss": 0.5379989743232727,
"eval_runtime": 15.2107,
"eval_samples_per_second": 426.673,
"eval_steps_per_second": 13.346,
"step": 4650
},
{
"epoch": 2.753143026179559,
"grad_norm": 0.22663559019565582,
"learning_rate": 5.081478715015193e-06,
"loss": 0.5832,
"step": 4655
},
{
"epoch": 2.7561011684662033,
"grad_norm": 0.2192746102809906,
"learning_rate": 4.93625758948264e-06,
"loss": 0.572,
"step": 4660
},
{
"epoch": 2.759059310752847,
"grad_norm": 0.21773546934127808,
"learning_rate": 4.793116764835617e-06,
"loss": 0.5719,
"step": 4665
},
{
"epoch": 2.7620174530394914,
"grad_norm": 0.22200337052345276,
"learning_rate": 4.652057712067575e-06,
"loss": 0.5696,
"step": 4670
},
{
"epoch": 2.7649755953261352,
"grad_norm": 0.2322062999010086,
"learning_rate": 4.513081880778574e-06,
"loss": 0.5866,
"step": 4675
},
{
"epoch": 2.767933737612779,
"grad_norm": 0.22154580056667328,
"learning_rate": 4.376190699160239e-06,
"loss": 0.5669,
"step": 4680
},
{
"epoch": 2.7708918798994233,
"grad_norm": 0.21697700023651123,
"learning_rate": 4.241385573981337e-06,
"loss": 0.5948,
"step": 4685
},
{
"epoch": 2.773850022186067,
"grad_norm": 0.22418950498104095,
"learning_rate": 4.108667890573057e-06,
"loss": 0.5685,
"step": 4690
},
{
"epoch": 2.776808164472711,
"grad_norm": 0.2229933887720108,
"learning_rate": 3.978039012814971e-06,
"loss": 0.5821,
"step": 4695
},
{
"epoch": 2.779766306759355,
"grad_norm": 0.23993420600891113,
"learning_rate": 3.84950028312085e-06,
"loss": 0.5829,
"step": 4700
},
{
"epoch": 2.779766306759355,
"eval_loss": 0.5377324223518372,
"eval_runtime": 15.191,
"eval_samples_per_second": 427.228,
"eval_steps_per_second": 13.363,
"step": 4700
},
{
"epoch": 2.782724449045999,
"grad_norm": 0.2284260392189026,
"learning_rate": 3.7230530224251017e-06,
"loss": 0.5808,
"step": 4705
},
{
"epoch": 2.785682591332643,
"grad_norm": 0.21950730681419373,
"learning_rate": 3.5986985301689156e-06,
"loss": 0.5776,
"step": 4710
},
{
"epoch": 2.788640733619287,
"grad_norm": 0.22390055656433105,
"learning_rate": 3.4764380842871153e-06,
"loss": 0.5694,
"step": 4715
},
{
"epoch": 2.791598875905931,
"grad_norm": 0.22625946998596191,
"learning_rate": 3.356272941194918e-06,
"loss": 0.5732,
"step": 4720
},
{
"epoch": 2.794557018192575,
"grad_norm": 0.23500940203666687,
"learning_rate": 3.2382043357751384e-06,
"loss": 0.5641,
"step": 4725
},
{
"epoch": 2.797515160479219,
"grad_norm": 0.22036881744861603,
"learning_rate": 3.122233481365339e-06,
"loss": 0.5807,
"step": 4730
},
{
"epoch": 2.8004733027658633,
"grad_norm": 0.23122946918010712,
"learning_rate": 3.008361569745513e-06,
"loss": 0.5762,
"step": 4735
},
{
"epoch": 2.803431445052507,
"grad_norm": 0.23130850493907928,
"learning_rate": 2.8965897711257245e-06,
"loss": 0.567,
"step": 4740
},
{
"epoch": 2.806389587339151,
"grad_norm": 0.22971408069133759,
"learning_rate": 2.7869192341341095e-06,
"loss": 0.5662,
"step": 4745
},
{
"epoch": 2.809347729625795,
"grad_norm": 0.21526266634464264,
"learning_rate": 2.6793510858051828e-06,
"loss": 0.5702,
"step": 4750
},
{
"epoch": 2.809347729625795,
"eval_loss": 0.5374576449394226,
"eval_runtime": 15.2056,
"eval_samples_per_second": 426.815,
"eval_steps_per_second": 13.35,
"step": 4750
},
{
"epoch": 2.812305871912439,
"grad_norm": 0.22544841468334198,
"learning_rate": 2.5738864315680513e-06,
"loss": 0.5679,
"step": 4755
},
{
"epoch": 2.815264014199083,
"grad_norm": 0.22757330536842346,
"learning_rate": 2.470526355235246e-06,
"loss": 0.5632,
"step": 4760
},
{
"epoch": 2.818222156485727,
"grad_norm": 0.22844068706035614,
"learning_rate": 2.3692719189914185e-06,
"loss": 0.5752,
"step": 4765
},
{
"epoch": 2.821180298772371,
"grad_norm": 0.22197550535202026,
"learning_rate": 2.270124163382614e-06,
"loss": 0.5501,
"step": 4770
},
{
"epoch": 2.8241384410590147,
"grad_norm": 0.21858546137809753,
"learning_rate": 2.173084107305403e-06,
"loss": 0.5726,
"step": 4775
},
{
"epoch": 2.827096583345659,
"grad_norm": 0.22059139609336853,
"learning_rate": 2.0781527479965216e-06,
"loss": 0.5783,
"step": 4780
},
{
"epoch": 2.8300547256323028,
"grad_norm": 0.2304689586162567,
"learning_rate": 1.9853310610225355e-06,
"loss": 0.5821,
"step": 4785
},
{
"epoch": 2.833012867918947,
"grad_norm": 0.22385703027248383,
"learning_rate": 1.8946200002699386e-06,
"loss": 0.5908,
"step": 4790
},
{
"epoch": 2.835971010205591,
"grad_norm": 0.2215282917022705,
"learning_rate": 1.806020497935185e-06,
"loss": 0.5644,
"step": 4795
},
{
"epoch": 2.838929152492235,
"grad_norm": 0.21840998530387878,
"learning_rate": 1.7195334645152737e-06,
"loss": 0.5814,
"step": 4800
},
{
"epoch": 2.838929152492235,
"eval_loss": 0.5373325943946838,
"eval_runtime": 15.2351,
"eval_samples_per_second": 425.989,
"eval_steps_per_second": 13.324,
"step": 4800
},
{
"epoch": 2.841887294778879,
"grad_norm": 0.22534750401973724,
"learning_rate": 1.6351597887982846e-06,
"loss": 0.581,
"step": 4805
},
{
"epoch": 2.8448454370655227,
"grad_norm": 0.23928098380565643,
"learning_rate": 1.5529003378542404e-06,
"loss": 0.5837,
"step": 4810
},
{
"epoch": 2.847803579352167,
"grad_norm": 0.21647833287715912,
"learning_rate": 1.4727559570263333e-06,
"loss": 0.5701,
"step": 4815
},
{
"epoch": 2.850761721638811,
"grad_norm": 0.2176506221294403,
"learning_rate": 1.3947274699220398e-06,
"loss": 0.5626,
"step": 4820
},
{
"epoch": 2.8537198639254546,
"grad_norm": 0.21065934002399445,
"learning_rate": 1.3188156784048088e-06,
"loss": 0.5686,
"step": 4825
},
{
"epoch": 2.856678006212099,
"grad_norm": 0.22182585299015045,
"learning_rate": 1.2450213625857274e-06,
"loss": 0.5761,
"step": 4830
},
{
"epoch": 2.8596361484987427,
"grad_norm": 0.21298271417617798,
"learning_rate": 1.1733452808156017e-06,
"loss": 0.5867,
"step": 4835
},
{
"epoch": 2.8625942907853865,
"grad_norm": 0.229048490524292,
"learning_rate": 1.103788169677036e-06,
"loss": 0.589,
"step": 4840
},
{
"epoch": 2.865552433072031,
"grad_norm": 0.2213655412197113,
"learning_rate": 1.0363507439769986e-06,
"loss": 0.5597,
"step": 4845
},
{
"epoch": 2.8685105753586746,
"grad_norm": 0.21822868287563324,
"learning_rate": 9.7103369673936e-07,
"loss": 0.5712,
"step": 4850
},
{
"epoch": 2.8685105753586746,
"eval_loss": 0.5373578667640686,
"eval_runtime": 15.1783,
"eval_samples_per_second": 427.584,
"eval_steps_per_second": 13.374,
"step": 4850
},
{
"epoch": 2.871468717645319,
"grad_norm": 0.22016650438308716,
"learning_rate": 9.078376991978266e-07,
"loss": 0.5587,
"step": 4855
},
{
"epoch": 2.8744268599319627,
"grad_norm": 0.23947712779045105,
"learning_rate": 8.467634007890796e-07,
"loss": 0.5841,
"step": 4860
},
{
"epoch": 2.877385002218607,
"grad_norm": 0.2243824005126953,
"learning_rate": 7.878114291460063e-07,
"loss": 0.5736,
"step": 4865
},
{
"epoch": 2.8803431445052508,
"grad_norm": 0.22133906185626984,
"learning_rate": 7.309823900913461e-07,
"loss": 0.5764,
"step": 4870
},
{
"epoch": 2.8833012867918946,
"grad_norm": 0.21976634860038757,
"learning_rate": 6.76276867631405e-07,
"loss": 0.5699,
"step": 4875
},
{
"epoch": 2.886259429078539,
"grad_norm": 0.22008314728736877,
"learning_rate": 6.236954239500471e-07,
"loss": 0.5527,
"step": 4880
},
{
"epoch": 2.8892175713651826,
"grad_norm": 0.22807146608829498,
"learning_rate": 5.732385994029618e-07,
"loss": 0.5943,
"step": 4885
},
{
"epoch": 2.8921757136518265,
"grad_norm": 0.22938776016235352,
"learning_rate": 5.249069125121154e-07,
"loss": 0.5825,
"step": 4890
},
{
"epoch": 2.8951338559384707,
"grad_norm": 0.20941923558712006,
"learning_rate": 4.787008599603642e-07,
"loss": 0.5685,
"step": 4895
},
{
"epoch": 2.8980919982251145,
"grad_norm": 0.22085338830947876,
"learning_rate": 4.346209165863655e-07,
"loss": 0.5588,
"step": 4900
},
{
"epoch": 2.8980919982251145,
"eval_loss": 0.5373329520225525,
"eval_runtime": 15.2559,
"eval_samples_per_second": 425.409,
"eval_steps_per_second": 13.306,
"step": 4900
},
{
"epoch": 2.9010501405117584,
"grad_norm": 0.22424866259098053,
"learning_rate": 3.926675353797443e-07,
"loss": 0.5725,
"step": 4905
},
{
"epoch": 2.9040082827984026,
"grad_norm": 0.2182874232530594,
"learning_rate": 3.5284114747641856e-07,
"loss": 0.5582,
"step": 4910
},
{
"epoch": 2.9069664250850464,
"grad_norm": 0.21973784267902374,
"learning_rate": 3.151421621541335e-07,
"loss": 0.5684,
"step": 4915
},
{
"epoch": 2.9099245673716907,
"grad_norm": 0.2083846479654312,
"learning_rate": 2.795709668283172e-07,
"loss": 0.578,
"step": 4920
},
{
"epoch": 2.9128827096583345,
"grad_norm": 0.2196836769580841,
"learning_rate": 2.4612792704798287e-07,
"loss": 0.5603,
"step": 4925
},
{
"epoch": 2.9158408519449788,
"grad_norm": 0.22254040837287903,
"learning_rate": 2.1481338649216013e-07,
"loss": 0.5526,
"step": 4930
},
{
"epoch": 2.9187989942316226,
"grad_norm": 0.2200893610715866,
"learning_rate": 1.8562766696618855e-07,
"loss": 0.5661,
"step": 4935
},
{
"epoch": 2.9217571365182664,
"grad_norm": 0.22102928161621094,
"learning_rate": 1.5857106839847136e-07,
"loss": 0.5905,
"step": 4940
},
{
"epoch": 2.9247152788049107,
"grad_norm": 0.2244081199169159,
"learning_rate": 1.3364386883745962e-07,
"loss": 0.5743,
"step": 4945
},
{
"epoch": 2.9276734210915545,
"grad_norm": 0.23028399050235748,
"learning_rate": 1.1084632444868224e-07,
"loss": 0.5852,
"step": 4950
},
{
"epoch": 2.9276734210915545,
"eval_loss": 0.5373095273971558,
"eval_runtime": 15.2077,
"eval_samples_per_second": 426.758,
"eval_steps_per_second": 13.349,
"step": 4950
}
],
"logging_steps": 5,
"max_steps": 5000,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0001
},
"attributes": {
"early_stopping_patience_counter": 3
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.873445665417724e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}