SmolLM2-1.7B-sft-only / trainer_state.json
loubnabnl's picture
loubnabnl HF Staff
Model save
ad768f3 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9983216783216782,
"eval_steps": 500,
"global_step": 1786,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0011188811188811189,
"grad_norm": 153.8507844996399,
"learning_rate": 1.6759776536312848e-06,
"loss": 4.8778,
"step": 1
},
{
"epoch": 0.005594405594405594,
"grad_norm": 153.39272858190785,
"learning_rate": 8.379888268156423e-06,
"loss": 4.7304,
"step": 5
},
{
"epoch": 0.011188811188811189,
"grad_norm": 96.41612092958107,
"learning_rate": 1.6759776536312845e-05,
"loss": 4.1396,
"step": 10
},
{
"epoch": 0.016783216783216783,
"grad_norm": 23.87392825650757,
"learning_rate": 2.513966480446927e-05,
"loss": 2.5431,
"step": 15
},
{
"epoch": 0.022377622377622378,
"grad_norm": 6.903016270261399,
"learning_rate": 3.351955307262569e-05,
"loss": 1.742,
"step": 20
},
{
"epoch": 0.027972027972027972,
"grad_norm": 2.441067831027445,
"learning_rate": 4.189944134078212e-05,
"loss": 1.3057,
"step": 25
},
{
"epoch": 0.033566433566433566,
"grad_norm": 1.0984763176398198,
"learning_rate": 5.027932960893854e-05,
"loss": 1.1221,
"step": 30
},
{
"epoch": 0.039160839160839164,
"grad_norm": 0.5734705027472714,
"learning_rate": 5.865921787709496e-05,
"loss": 1.0604,
"step": 35
},
{
"epoch": 0.044755244755244755,
"grad_norm": 0.47491071619847164,
"learning_rate": 6.703910614525138e-05,
"loss": 0.9951,
"step": 40
},
{
"epoch": 0.05034965034965035,
"grad_norm": 0.48157929747551576,
"learning_rate": 7.541899441340782e-05,
"loss": 0.9713,
"step": 45
},
{
"epoch": 0.055944055944055944,
"grad_norm": 0.45861937937875213,
"learning_rate": 8.379888268156423e-05,
"loss": 0.9551,
"step": 50
},
{
"epoch": 0.06153846153846154,
"grad_norm": 0.5047783392223855,
"learning_rate": 9.217877094972066e-05,
"loss": 0.948,
"step": 55
},
{
"epoch": 0.06713286713286713,
"grad_norm": 0.4461540398939836,
"learning_rate": 0.00010055865921787709,
"loss": 0.9392,
"step": 60
},
{
"epoch": 0.07272727272727272,
"grad_norm": 0.3316401246824197,
"learning_rate": 0.00010893854748603351,
"loss": 0.9146,
"step": 65
},
{
"epoch": 0.07832167832167833,
"grad_norm": 0.21722066760341413,
"learning_rate": 0.00011731843575418992,
"loss": 0.9012,
"step": 70
},
{
"epoch": 0.08391608391608392,
"grad_norm": 0.1499443883049875,
"learning_rate": 0.00012569832402234635,
"loss": 0.8927,
"step": 75
},
{
"epoch": 0.08951048951048951,
"grad_norm": 0.22243490348769648,
"learning_rate": 0.00013407821229050276,
"loss": 0.8937,
"step": 80
},
{
"epoch": 0.0951048951048951,
"grad_norm": 0.37625345216705214,
"learning_rate": 0.0001424581005586592,
"loss": 0.8718,
"step": 85
},
{
"epoch": 0.1006993006993007,
"grad_norm": 0.24963786248117179,
"learning_rate": 0.00015083798882681564,
"loss": 0.8921,
"step": 90
},
{
"epoch": 0.1062937062937063,
"grad_norm": 0.26880921530140445,
"learning_rate": 0.00015921787709497208,
"loss": 0.8799,
"step": 95
},
{
"epoch": 0.11188811188811189,
"grad_norm": 0.20889087265244094,
"learning_rate": 0.00016759776536312847,
"loss": 0.8678,
"step": 100
},
{
"epoch": 0.11748251748251748,
"grad_norm": 0.20865989038699168,
"learning_rate": 0.0001759776536312849,
"loss": 0.8715,
"step": 105
},
{
"epoch": 0.12307692307692308,
"grad_norm": 0.17048448558431092,
"learning_rate": 0.00018435754189944132,
"loss": 0.866,
"step": 110
},
{
"epoch": 0.12867132867132866,
"grad_norm": 0.2563726823096772,
"learning_rate": 0.00019273743016759776,
"loss": 0.8786,
"step": 115
},
{
"epoch": 0.13426573426573427,
"grad_norm": 0.22854402328286066,
"learning_rate": 0.00020111731843575417,
"loss": 0.8641,
"step": 120
},
{
"epoch": 0.13986013986013987,
"grad_norm": 0.17876773160294948,
"learning_rate": 0.00020949720670391058,
"loss": 0.8625,
"step": 125
},
{
"epoch": 0.14545454545454545,
"grad_norm": 0.3615856863579962,
"learning_rate": 0.00021787709497206702,
"loss": 0.8504,
"step": 130
},
{
"epoch": 0.15104895104895105,
"grad_norm": 0.22751374781762473,
"learning_rate": 0.00022625698324022346,
"loss": 0.8683,
"step": 135
},
{
"epoch": 0.15664335664335666,
"grad_norm": 0.21790912834318224,
"learning_rate": 0.00023463687150837985,
"loss": 0.8519,
"step": 140
},
{
"epoch": 0.16223776223776223,
"grad_norm": 0.18707627983249808,
"learning_rate": 0.0002430167597765363,
"loss": 0.8426,
"step": 145
},
{
"epoch": 0.16783216783216784,
"grad_norm": 0.22404831326656477,
"learning_rate": 0.0002513966480446927,
"loss": 0.8509,
"step": 150
},
{
"epoch": 0.17342657342657342,
"grad_norm": 0.20919591839479193,
"learning_rate": 0.00025977653631284914,
"loss": 0.8594,
"step": 155
},
{
"epoch": 0.17902097902097902,
"grad_norm": 0.15579954649232924,
"learning_rate": 0.0002681564245810055,
"loss": 0.8319,
"step": 160
},
{
"epoch": 0.18461538461538463,
"grad_norm": 0.4055853577053493,
"learning_rate": 0.00027653631284916196,
"loss": 0.8577,
"step": 165
},
{
"epoch": 0.1902097902097902,
"grad_norm": 0.2945253672148607,
"learning_rate": 0.0002849162011173184,
"loss": 0.8579,
"step": 170
},
{
"epoch": 0.1958041958041958,
"grad_norm": 0.21772142109085163,
"learning_rate": 0.00029329608938547484,
"loss": 0.8382,
"step": 175
},
{
"epoch": 0.2013986013986014,
"grad_norm": 0.2281215849497856,
"learning_rate": 0.00029999971336506766,
"loss": 0.8331,
"step": 180
},
{
"epoch": 0.206993006993007,
"grad_norm": 0.17899592128620762,
"learning_rate": 0.0002999896812574594,
"loss": 0.8434,
"step": 185
},
{
"epoch": 0.2125874125874126,
"grad_norm": 0.14731246899285452,
"learning_rate": 0.0002999653184986775,
"loss": 0.8458,
"step": 190
},
{
"epoch": 0.21818181818181817,
"grad_norm": 0.14945248170502354,
"learning_rate": 0.00029992662741644334,
"loss": 0.8457,
"step": 195
},
{
"epoch": 0.22377622377622378,
"grad_norm": 0.14124929357614965,
"learning_rate": 0.0002998736117074673,
"loss": 0.8219,
"step": 200
},
{
"epoch": 0.22937062937062938,
"grad_norm": 0.18621904528265473,
"learning_rate": 0.0002998062764370954,
"loss": 0.8299,
"step": 205
},
{
"epoch": 0.23496503496503496,
"grad_norm": 0.22513445720588005,
"learning_rate": 0.00029972462803882523,
"loss": 0.8502,
"step": 210
},
{
"epoch": 0.24055944055944056,
"grad_norm": 0.1636384983699745,
"learning_rate": 0.0002996286743136916,
"loss": 0.8506,
"step": 215
},
{
"epoch": 0.24615384615384617,
"grad_norm": 0.19587892370520038,
"learning_rate": 0.000299518424429521,
"loss": 0.8382,
"step": 220
},
{
"epoch": 0.2517482517482518,
"grad_norm": 0.17371559645370255,
"learning_rate": 0.0002993938889200556,
"loss": 0.8322,
"step": 225
},
{
"epoch": 0.2573426573426573,
"grad_norm": 0.13779589201692538,
"learning_rate": 0.0002992550796839468,
"loss": 0.8243,
"step": 230
},
{
"epoch": 0.2629370629370629,
"grad_norm": 0.22266694135489237,
"learning_rate": 0.00029910200998361857,
"loss": 0.8332,
"step": 235
},
{
"epoch": 0.26853146853146853,
"grad_norm": 0.21324201617886082,
"learning_rate": 0.0002989346944440003,
"loss": 0.8377,
"step": 240
},
{
"epoch": 0.27412587412587414,
"grad_norm": 0.1378342855087125,
"learning_rate": 0.0002987531490511291,
"loss": 0.8346,
"step": 245
},
{
"epoch": 0.27972027972027974,
"grad_norm": 0.13885067623091277,
"learning_rate": 0.000298557391150623,
"loss": 0.8366,
"step": 250
},
{
"epoch": 0.2853146853146853,
"grad_norm": 0.13254675983004904,
"learning_rate": 0.00029834743944602316,
"loss": 0.8389,
"step": 255
},
{
"epoch": 0.2909090909090909,
"grad_norm": 0.15399189190157112,
"learning_rate": 0.0002981233139970071,
"loss": 0.8247,
"step": 260
},
{
"epoch": 0.2965034965034965,
"grad_norm": 0.16475565381996887,
"learning_rate": 0.0002978850362174722,
"loss": 0.8042,
"step": 265
},
{
"epoch": 0.3020979020979021,
"grad_norm": 0.1449584149552635,
"learning_rate": 0.0002976326288734894,
"loss": 0.8113,
"step": 270
},
{
"epoch": 0.3076923076923077,
"grad_norm": 0.1479148461740847,
"learning_rate": 0.0002973661160811284,
"loss": 0.8327,
"step": 275
},
{
"epoch": 0.3132867132867133,
"grad_norm": 0.11641111946130961,
"learning_rate": 0.00029708552330415337,
"loss": 0.8266,
"step": 280
},
{
"epoch": 0.31888111888111886,
"grad_norm": 0.1289644823631784,
"learning_rate": 0.0002967908773515898,
"loss": 0.8031,
"step": 285
},
{
"epoch": 0.32447552447552447,
"grad_norm": 0.19696780235530817,
"learning_rate": 0.0002964822063751635,
"loss": 0.8195,
"step": 290
},
{
"epoch": 0.3300699300699301,
"grad_norm": 0.13415245793361397,
"learning_rate": 0.00029615953986661056,
"loss": 0.8232,
"step": 295
},
{
"epoch": 0.3356643356643357,
"grad_norm": 0.1289064453230423,
"learning_rate": 0.0002958229086548595,
"loss": 0.811,
"step": 300
},
{
"epoch": 0.3412587412587413,
"grad_norm": 0.2030532789276685,
"learning_rate": 0.00029547234490308604,
"loss": 0.8196,
"step": 305
},
{
"epoch": 0.34685314685314683,
"grad_norm": 0.1690018504223368,
"learning_rate": 0.00029510788210563996,
"loss": 0.8176,
"step": 310
},
{
"epoch": 0.35244755244755244,
"grad_norm": 0.18348727298089326,
"learning_rate": 0.0002947295550848448,
"loss": 0.8106,
"step": 315
},
{
"epoch": 0.35804195804195804,
"grad_norm": 0.12176145136770436,
"learning_rate": 0.000294337399987671,
"loss": 0.8106,
"step": 320
},
{
"epoch": 0.36363636363636365,
"grad_norm": 0.14033633981287297,
"learning_rate": 0.0002939314542822821,
"loss": 0.8061,
"step": 325
},
{
"epoch": 0.36923076923076925,
"grad_norm": 0.1360601366236884,
"learning_rate": 0.0002935117567544547,
"loss": 0.8026,
"step": 330
},
{
"epoch": 0.3748251748251748,
"grad_norm": 0.15461610864168454,
"learning_rate": 0.0002930783475038734,
"loss": 0.8232,
"step": 335
},
{
"epoch": 0.3804195804195804,
"grad_norm": 0.10363695784053935,
"learning_rate": 0.0002926312679402985,
"loss": 0.8049,
"step": 340
},
{
"epoch": 0.386013986013986,
"grad_norm": 0.12894656658416323,
"learning_rate": 0.00029217056077961043,
"loss": 0.7993,
"step": 345
},
{
"epoch": 0.3916083916083916,
"grad_norm": 0.12013728582042035,
"learning_rate": 0.000291696270039728,
"loss": 0.7863,
"step": 350
},
{
"epoch": 0.3972027972027972,
"grad_norm": 0.10813698150493352,
"learning_rate": 0.0002912084410364029,
"loss": 0.7997,
"step": 355
},
{
"epoch": 0.4027972027972028,
"grad_norm": 0.17771298556035153,
"learning_rate": 0.00029070712037889,
"loss": 0.8018,
"step": 360
},
{
"epoch": 0.4083916083916084,
"grad_norm": 0.10837497819616222,
"learning_rate": 0.00029019235596549394,
"loss": 0.8078,
"step": 365
},
{
"epoch": 0.413986013986014,
"grad_norm": 0.12560903325827824,
"learning_rate": 0.0002896641969789932,
"loss": 0.8182,
"step": 370
},
{
"epoch": 0.4195804195804196,
"grad_norm": 0.17753966157649756,
"learning_rate": 0.0002891226938819405,
"loss": 0.8059,
"step": 375
},
{
"epoch": 0.4251748251748252,
"grad_norm": 0.12975220832490364,
"learning_rate": 0.0002885678984118415,
"loss": 0.7811,
"step": 380
},
{
"epoch": 0.4307692307692308,
"grad_norm": 0.18552799892105348,
"learning_rate": 0.0002879998635762118,
"loss": 0.799,
"step": 385
},
{
"epoch": 0.43636363636363634,
"grad_norm": 0.17744047449094297,
"learning_rate": 0.000287418643647512,
"loss": 0.7974,
"step": 390
},
{
"epoch": 0.44195804195804195,
"grad_norm": 0.13797503272598644,
"learning_rate": 0.00028682429415796267,
"loss": 0.7931,
"step": 395
},
{
"epoch": 0.44755244755244755,
"grad_norm": 0.12860800142514783,
"learning_rate": 0.0002862168718942383,
"loss": 0.7861,
"step": 400
},
{
"epoch": 0.45314685314685316,
"grad_norm": 0.12668805301774957,
"learning_rate": 0.00028559643489204186,
"loss": 0.8107,
"step": 405
},
{
"epoch": 0.45874125874125876,
"grad_norm": 0.11338400624283074,
"learning_rate": 0.0002849630424305595,
"loss": 0.8088,
"step": 410
},
{
"epoch": 0.4643356643356643,
"grad_norm": 0.1345570809457518,
"learning_rate": 0.00028431675502679717,
"loss": 0.8038,
"step": 415
},
{
"epoch": 0.4699300699300699,
"grad_norm": 0.11102734283999488,
"learning_rate": 0.00028365763442979823,
"loss": 0.8163,
"step": 420
},
{
"epoch": 0.4755244755244755,
"grad_norm": 0.12810877098380058,
"learning_rate": 0.000282985743614744,
"loss": 0.8017,
"step": 425
},
{
"epoch": 0.4811188811188811,
"grad_norm": 0.11873107897366113,
"learning_rate": 0.0002823011467769364,
"loss": 0.7957,
"step": 430
},
{
"epoch": 0.48671328671328673,
"grad_norm": 0.13272846461187762,
"learning_rate": 0.000281603909325665,
"loss": 0.8054,
"step": 435
},
{
"epoch": 0.49230769230769234,
"grad_norm": 0.10875049751356179,
"learning_rate": 0.00028089409787795716,
"loss": 0.7976,
"step": 440
},
{
"epoch": 0.4979020979020979,
"grad_norm": 0.11634952268956057,
"learning_rate": 0.0002801717802522132,
"loss": 0.792,
"step": 445
},
{
"epoch": 0.5034965034965035,
"grad_norm": 0.10607280522202725,
"learning_rate": 0.00027943702546172697,
"loss": 0.8078,
"step": 450
},
{
"epoch": 0.509090909090909,
"grad_norm": 0.15901805386261234,
"learning_rate": 0.00027868990370809164,
"loss": 0.8023,
"step": 455
},
{
"epoch": 0.5146853146853146,
"grad_norm": 0.12222508621029372,
"learning_rate": 0.00027793048637449273,
"loss": 0.7956,
"step": 460
},
{
"epoch": 0.5202797202797202,
"grad_norm": 0.10769602993732762,
"learning_rate": 0.0002771588460188876,
"loss": 0.7897,
"step": 465
},
{
"epoch": 0.5258741258741259,
"grad_norm": 0.1119724845850892,
"learning_rate": 0.00027637505636707315,
"loss": 0.7901,
"step": 470
},
{
"epoch": 0.5314685314685315,
"grad_norm": 0.11279764446201575,
"learning_rate": 0.0002755791923056415,
"loss": 0.79,
"step": 475
},
{
"epoch": 0.5370629370629371,
"grad_norm": 0.16791268697388115,
"learning_rate": 0.0002747713298748253,
"loss": 0.7909,
"step": 480
},
{
"epoch": 0.5426573426573427,
"grad_norm": 0.14084691053374204,
"learning_rate": 0.00027395154626123225,
"loss": 0.8013,
"step": 485
},
{
"epoch": 0.5482517482517483,
"grad_norm": 0.12452861008817544,
"learning_rate": 0.00027311991979047046,
"loss": 0.7888,
"step": 490
},
{
"epoch": 0.5538461538461539,
"grad_norm": 0.12432408060887955,
"learning_rate": 0.00027227652991966507,
"loss": 0.7736,
"step": 495
},
{
"epoch": 0.5594405594405595,
"grad_norm": 0.1167273022435753,
"learning_rate": 0.00027142145722986637,
"loss": 0.7892,
"step": 500
},
{
"epoch": 0.5650349650349651,
"grad_norm": 0.1570515101772784,
"learning_rate": 0.0002705547834183506,
"loss": 0.7735,
"step": 505
},
{
"epoch": 0.5706293706293706,
"grad_norm": 0.1757157276572827,
"learning_rate": 0.00026967659129081465,
"loss": 0.7947,
"step": 510
},
{
"epoch": 0.5762237762237762,
"grad_norm": 0.10842613803532425,
"learning_rate": 0.0002687869647534643,
"loss": 0.7844,
"step": 515
},
{
"epoch": 0.5818181818181818,
"grad_norm": 0.14559223171878447,
"learning_rate": 0.0002678859888049972,
"loss": 0.7881,
"step": 520
},
{
"epoch": 0.5874125874125874,
"grad_norm": 0.13398356861671987,
"learning_rate": 0.0002669737495284819,
"loss": 0.7731,
"step": 525
},
{
"epoch": 0.593006993006993,
"grad_norm": 0.16426998549603838,
"learning_rate": 0.00026605033408313354,
"loss": 0.7819,
"step": 530
},
{
"epoch": 0.5986013986013986,
"grad_norm": 0.12765279377973468,
"learning_rate": 0.0002651158306959855,
"loss": 0.7725,
"step": 535
},
{
"epoch": 0.6041958041958042,
"grad_norm": 0.11028117529632633,
"learning_rate": 0.00026417032865346023,
"loss": 0.7926,
"step": 540
},
{
"epoch": 0.6097902097902098,
"grad_norm": 0.12782289803306612,
"learning_rate": 0.00026321391829283884,
"loss": 0.7634,
"step": 545
},
{
"epoch": 0.6153846153846154,
"grad_norm": 0.1369549911517866,
"learning_rate": 0.0002622466909936289,
"loss": 0.7628,
"step": 550
},
{
"epoch": 0.620979020979021,
"grad_norm": 0.11521467825966626,
"learning_rate": 0.0002612687391688347,
"loss": 0.7763,
"step": 555
},
{
"epoch": 0.6265734265734266,
"grad_norm": 0.12350512688772212,
"learning_rate": 0.00026028015625612706,
"loss": 0.7884,
"step": 560
},
{
"epoch": 0.6321678321678321,
"grad_norm": 0.10825882258061882,
"learning_rate": 0.000259281036708916,
"loss": 0.7945,
"step": 565
},
{
"epoch": 0.6377622377622377,
"grad_norm": 0.11150400278758185,
"learning_rate": 0.00025827147598732656,
"loss": 0.7862,
"step": 570
},
{
"epoch": 0.6433566433566433,
"grad_norm": 0.10337975508652891,
"learning_rate": 0.00025725157054907777,
"loss": 0.7838,
"step": 575
},
{
"epoch": 0.6489510489510489,
"grad_norm": 0.09770680702107988,
"learning_rate": 0.0002562214178402669,
"loss": 0.7969,
"step": 580
},
{
"epoch": 0.6545454545454545,
"grad_norm": 0.1163323874465344,
"learning_rate": 0.00025518111628605885,
"loss": 0.7819,
"step": 585
},
{
"epoch": 0.6601398601398601,
"grad_norm": 0.11896295851189105,
"learning_rate": 0.00025413076528128255,
"loss": 0.7709,
"step": 590
},
{
"epoch": 0.6657342657342658,
"grad_norm": 0.11330549108690711,
"learning_rate": 0.0002530704651809339,
"loss": 0.7744,
"step": 595
},
{
"epoch": 0.6713286713286714,
"grad_norm": 0.13654029100152085,
"learning_rate": 0.0002520003172905878,
"loss": 0.7952,
"step": 600
},
{
"epoch": 0.676923076923077,
"grad_norm": 0.10351068732360345,
"learning_rate": 0.0002509204238567186,
"loss": 0.7755,
"step": 605
},
{
"epoch": 0.6825174825174826,
"grad_norm": 0.14077884569885882,
"learning_rate": 0.00024983088805693163,
"loss": 0.7831,
"step": 610
},
{
"epoch": 0.6881118881118881,
"grad_norm": 0.11948510265909233,
"learning_rate": 0.00024873181399010446,
"loss": 0.7861,
"step": 615
},
{
"epoch": 0.6937062937062937,
"grad_norm": 0.12755481254736167,
"learning_rate": 0.00024762330666644136,
"loss": 0.7782,
"step": 620
},
{
"epoch": 0.6993006993006993,
"grad_norm": 0.10102773025347063,
"learning_rate": 0.0002465054719974401,
"loss": 0.7731,
"step": 625
},
{
"epoch": 0.7048951048951049,
"grad_norm": 0.11665875780674151,
"learning_rate": 0.0002453784167857725,
"loss": 0.7839,
"step": 630
},
{
"epoch": 0.7104895104895105,
"grad_norm": 0.1074509342573267,
"learning_rate": 0.00024424224871508014,
"loss": 0.7769,
"step": 635
},
{
"epoch": 0.7160839160839161,
"grad_norm": 0.12422918258396683,
"learning_rate": 0.0002430970763396861,
"loss": 0.7754,
"step": 640
},
{
"epoch": 0.7216783216783217,
"grad_norm": 0.11786940518440715,
"learning_rate": 0.00024194300907422276,
"loss": 0.7974,
"step": 645
},
{
"epoch": 0.7272727272727273,
"grad_norm": 0.10398253596299759,
"learning_rate": 0.00024078015718317818,
"loss": 0.7729,
"step": 650
},
{
"epoch": 0.7328671328671329,
"grad_norm": 0.09903776559603776,
"learning_rate": 0.00023960863177036079,
"loss": 0.774,
"step": 655
},
{
"epoch": 0.7384615384615385,
"grad_norm": 0.09179018818178636,
"learning_rate": 0.00023842854476828411,
"loss": 0.7629,
"step": 660
},
{
"epoch": 0.7440559440559441,
"grad_norm": 0.10143244330106851,
"learning_rate": 0.0002372400089274724,
"loss": 0.781,
"step": 665
},
{
"epoch": 0.7496503496503496,
"grad_norm": 0.1121174501554115,
"learning_rate": 0.00023604313780568772,
"loss": 0.7811,
"step": 670
},
{
"epoch": 0.7552447552447552,
"grad_norm": 0.10047620860251714,
"learning_rate": 0.00023483804575708027,
"loss": 0.7752,
"step": 675
},
{
"epoch": 0.7608391608391608,
"grad_norm": 0.11068987100435401,
"learning_rate": 0.0002336248479212626,
"loss": 0.7657,
"step": 680
},
{
"epoch": 0.7664335664335664,
"grad_norm": 0.12535370469312143,
"learning_rate": 0.0002324036602123086,
"loss": 0.7731,
"step": 685
},
{
"epoch": 0.772027972027972,
"grad_norm": 0.11059830294321518,
"learning_rate": 0.00023117459930767847,
"loss": 0.7831,
"step": 690
},
{
"epoch": 0.7776223776223776,
"grad_norm": 0.11073279493368117,
"learning_rate": 0.00022993778263707105,
"loss": 0.7705,
"step": 695
},
{
"epoch": 0.7832167832167832,
"grad_norm": 0.12764331660853,
"learning_rate": 0.000228693328371204,
"loss": 0.7816,
"step": 700
},
{
"epoch": 0.7888111888111888,
"grad_norm": 0.10951855988735575,
"learning_rate": 0.0002274413554105232,
"loss": 0.7577,
"step": 705
},
{
"epoch": 0.7944055944055944,
"grad_norm": 0.12320185793067193,
"learning_rate": 0.00022618198337384264,
"loss": 0.7744,
"step": 710
},
{
"epoch": 0.8,
"grad_norm": 0.10716655164464266,
"learning_rate": 0.00022491533258691546,
"loss": 0.7752,
"step": 715
},
{
"epoch": 0.8055944055944056,
"grad_norm": 0.11962877035421185,
"learning_rate": 0.00022364152407093737,
"loss": 0.7812,
"step": 720
},
{
"epoch": 0.8111888111888111,
"grad_norm": 0.10651252517324981,
"learning_rate": 0.00022236067953098414,
"loss": 0.78,
"step": 725
},
{
"epoch": 0.8167832167832167,
"grad_norm": 0.1086890422520719,
"learning_rate": 0.00022107292134438298,
"loss": 0.7801,
"step": 730
},
{
"epoch": 0.8223776223776224,
"grad_norm": 0.09743032182612474,
"learning_rate": 0.00021977837254902034,
"loss": 0.7762,
"step": 735
},
{
"epoch": 0.827972027972028,
"grad_norm": 0.09964450561189496,
"learning_rate": 0.0002184771568315862,
"loss": 0.7809,
"step": 740
},
{
"epoch": 0.8335664335664336,
"grad_norm": 0.09852105354101268,
"learning_rate": 0.0002171693985157567,
"loss": 0.7803,
"step": 745
},
{
"epoch": 0.8391608391608392,
"grad_norm": 0.10751247181443387,
"learning_rate": 0.00021585522255031554,
"loss": 0.754,
"step": 750
},
{
"epoch": 0.8447552447552448,
"grad_norm": 0.10083658202350868,
"learning_rate": 0.00021453475449721593,
"loss": 0.7689,
"step": 755
},
{
"epoch": 0.8503496503496504,
"grad_norm": 0.1132302634053956,
"learning_rate": 0.00021320812051958392,
"loss": 0.7667,
"step": 760
},
{
"epoch": 0.855944055944056,
"grad_norm": 0.09942757452904256,
"learning_rate": 0.00021187544736966403,
"loss": 0.7798,
"step": 765
},
{
"epoch": 0.8615384615384616,
"grad_norm": 0.11119180151464317,
"learning_rate": 0.00021053686237670912,
"loss": 0.7768,
"step": 770
},
{
"epoch": 0.8671328671328671,
"grad_norm": 0.11012292213675227,
"learning_rate": 0.0002091924934348146,
"loss": 0.7641,
"step": 775
},
{
"epoch": 0.8727272727272727,
"grad_norm": 0.10508899250762296,
"learning_rate": 0.0002078424689906988,
"loss": 0.772,
"step": 780
},
{
"epoch": 0.8783216783216783,
"grad_norm": 0.09916724236606267,
"learning_rate": 0.00020648691803143088,
"loss": 0.7798,
"step": 785
},
{
"epoch": 0.8839160839160839,
"grad_norm": 0.10139136991498161,
"learning_rate": 0.00020512597007210672,
"loss": 0.7595,
"step": 790
},
{
"epoch": 0.8895104895104895,
"grad_norm": 0.09922677378640356,
"learning_rate": 0.00020375975514347447,
"loss": 0.7582,
"step": 795
},
{
"epoch": 0.8951048951048951,
"grad_norm": 0.10734407771584646,
"learning_rate": 0.0002023884037795109,
"loss": 0.7747,
"step": 800
},
{
"epoch": 0.9006993006993007,
"grad_norm": 0.09003146154158007,
"learning_rate": 0.00020101204700494963,
"loss": 0.772,
"step": 805
},
{
"epoch": 0.9062937062937063,
"grad_norm": 0.10224227095976963,
"learning_rate": 0.00019963081632276244,
"loss": 0.7632,
"step": 810
},
{
"epoch": 0.9118881118881119,
"grad_norm": 0.10634964894349899,
"learning_rate": 0.00019824484370159511,
"loss": 0.7621,
"step": 815
},
{
"epoch": 0.9174825174825175,
"grad_norm": 0.08669203171955091,
"learning_rate": 0.00019685426156315817,
"loss": 0.7678,
"step": 820
},
{
"epoch": 0.9230769230769231,
"grad_norm": 0.09236536065178236,
"learning_rate": 0.00019545920276957512,
"loss": 0.7615,
"step": 825
},
{
"epoch": 0.9286713286713286,
"grad_norm": 0.09051106183048227,
"learning_rate": 0.00019405980061068813,
"loss": 0.7538,
"step": 830
},
{
"epoch": 0.9342657342657342,
"grad_norm": 0.09075314123364801,
"learning_rate": 0.00019265618879132294,
"loss": 0.7695,
"step": 835
},
{
"epoch": 0.9398601398601398,
"grad_norm": 0.09779655862085175,
"learning_rate": 0.000191248501418514,
"loss": 0.7486,
"step": 840
},
{
"epoch": 0.9454545454545454,
"grad_norm": 0.09284439899834651,
"learning_rate": 0.00018983687298869165,
"loss": 0.7757,
"step": 845
},
{
"epoch": 0.951048951048951,
"grad_norm": 0.10906263614923466,
"learning_rate": 0.00018842143837483137,
"loss": 0.7654,
"step": 850
},
{
"epoch": 0.9566433566433566,
"grad_norm": 0.08711563033296012,
"learning_rate": 0.00018700233281356774,
"loss": 0.7661,
"step": 855
},
{
"epoch": 0.9622377622377623,
"grad_norm": 0.10105386217514606,
"learning_rate": 0.00018557969189227327,
"loss": 0.7566,
"step": 860
},
{
"epoch": 0.9678321678321679,
"grad_norm": 0.09374075010702233,
"learning_rate": 0.00018415365153610363,
"loss": 0.7505,
"step": 865
},
{
"epoch": 0.9734265734265735,
"grad_norm": 0.09217739631223294,
"learning_rate": 0.00018272434799501108,
"loss": 0.7513,
"step": 870
},
{
"epoch": 0.9790209790209791,
"grad_norm": 0.10860300198578388,
"learning_rate": 0.00018129191783072644,
"loss": 0.7586,
"step": 875
},
{
"epoch": 0.9846153846153847,
"grad_norm": 0.10382198085691989,
"learning_rate": 0.00017985649790371123,
"loss": 0.7712,
"step": 880
},
{
"epoch": 0.9902097902097902,
"grad_norm": 0.09718012392096963,
"learning_rate": 0.00017841822536008174,
"loss": 0.7548,
"step": 885
},
{
"epoch": 0.9958041958041958,
"grad_norm": 0.09129380340752612,
"learning_rate": 0.00017697723761850529,
"loss": 0.7442,
"step": 890
},
{
"epoch": 0.9991608391608392,
"eval_loss": 1.078917145729065,
"eval_runtime": 368.4649,
"eval_samples_per_second": 55.878,
"eval_steps_per_second": 1.748,
"step": 893
},
{
"epoch": 1.0013986013986014,
"grad_norm": 0.2193429577704184,
"learning_rate": 0.0001755336723570709,
"loss": 0.7304,
"step": 895
},
{
"epoch": 1.006993006993007,
"grad_norm": 0.12029696024418647,
"learning_rate": 0.00017408766750013455,
"loss": 0.6883,
"step": 900
},
{
"epoch": 1.0125874125874126,
"grad_norm": 0.11157245101821324,
"learning_rate": 0.0001726393612051416,
"loss": 0.6937,
"step": 905
},
{
"epoch": 1.018181818181818,
"grad_norm": 0.11562847307206518,
"learning_rate": 0.0001711888918494268,
"loss": 0.7072,
"step": 910
},
{
"epoch": 1.0237762237762238,
"grad_norm": 0.09699091401508494,
"learning_rate": 0.00016973639801699258,
"loss": 0.7002,
"step": 915
},
{
"epoch": 1.0293706293706293,
"grad_norm": 0.10329864513789806,
"learning_rate": 0.0001682820184852687,
"loss": 0.7049,
"step": 920
},
{
"epoch": 1.034965034965035,
"grad_norm": 0.10505439144371424,
"learning_rate": 0.0001668258922118525,
"loss": 0.7062,
"step": 925
},
{
"epoch": 1.0405594405594405,
"grad_norm": 0.09345853743066072,
"learning_rate": 0.0001653681583212326,
"loss": 0.705,
"step": 930
},
{
"epoch": 1.0461538461538462,
"grad_norm": 0.14624259074546206,
"learning_rate": 0.00016390895609149608,
"loss": 0.6862,
"step": 935
},
{
"epoch": 1.0517482517482517,
"grad_norm": 0.09484870970062163,
"learning_rate": 0.00016244842494102135,
"loss": 0.6794,
"step": 940
},
{
"epoch": 1.0573426573426574,
"grad_norm": 0.1093126433741654,
"learning_rate": 0.00016098670441515759,
"loss": 0.6965,
"step": 945
},
{
"epoch": 1.062937062937063,
"grad_norm": 0.08709955416922696,
"learning_rate": 0.000159523934172892,
"loss": 0.6875,
"step": 950
},
{
"epoch": 1.0685314685314686,
"grad_norm": 0.08587078936550793,
"learning_rate": 0.00015806025397350617,
"loss": 0.6816,
"step": 955
},
{
"epoch": 1.0741258741258741,
"grad_norm": 0.09226255228367097,
"learning_rate": 0.00015659580366322265,
"loss": 0.6909,
"step": 960
},
{
"epoch": 1.0797202797202796,
"grad_norm": 0.0936240844549708,
"learning_rate": 0.00015513072316184393,
"loss": 0.6904,
"step": 965
},
{
"epoch": 1.0853146853146853,
"grad_norm": 0.085659439411888,
"learning_rate": 0.0001536651524493834,
"loss": 0.6874,
"step": 970
},
{
"epoch": 1.0909090909090908,
"grad_norm": 0.09924612907304702,
"learning_rate": 0.00015219923155269157,
"loss": 0.6953,
"step": 975
},
{
"epoch": 1.0965034965034965,
"grad_norm": 0.10028513455623837,
"learning_rate": 0.00015073310053207665,
"loss": 0.6967,
"step": 980
},
{
"epoch": 1.102097902097902,
"grad_norm": 0.09345136457275896,
"learning_rate": 0.00014926689946792332,
"loss": 0.6905,
"step": 985
},
{
"epoch": 1.1076923076923078,
"grad_norm": 0.08893419418399162,
"learning_rate": 0.00014780076844730849,
"loss": 0.6985,
"step": 990
},
{
"epoch": 1.1132867132867132,
"grad_norm": 0.08853702003030435,
"learning_rate": 0.00014633484755061658,
"loss": 0.7014,
"step": 995
},
{
"epoch": 1.118881118881119,
"grad_norm": 0.08365464476596743,
"learning_rate": 0.0001448692768381561,
"loss": 0.697,
"step": 1000
},
{
"epoch": 1.1244755244755245,
"grad_norm": 0.08780213786162074,
"learning_rate": 0.00014340419633677732,
"loss": 0.7025,
"step": 1005
},
{
"epoch": 1.1300699300699302,
"grad_norm": 0.0884295613746686,
"learning_rate": 0.00014193974602649386,
"loss": 0.6993,
"step": 1010
},
{
"epoch": 1.1356643356643357,
"grad_norm": 0.09059540370700289,
"learning_rate": 0.00014047606582710798,
"loss": 0.6948,
"step": 1015
},
{
"epoch": 1.1412587412587412,
"grad_norm": 0.09899100050319447,
"learning_rate": 0.00013901329558484236,
"loss": 0.6992,
"step": 1020
},
{
"epoch": 1.1468531468531469,
"grad_norm": 0.09283620847093584,
"learning_rate": 0.00013755157505897868,
"loss": 0.7184,
"step": 1025
},
{
"epoch": 1.1524475524475524,
"grad_norm": 0.08887024389956383,
"learning_rate": 0.00013609104390850392,
"loss": 0.697,
"step": 1030
},
{
"epoch": 1.158041958041958,
"grad_norm": 0.08693816785751791,
"learning_rate": 0.0001346318416787674,
"loss": 0.6939,
"step": 1035
},
{
"epoch": 1.1636363636363636,
"grad_norm": 0.09116425941497776,
"learning_rate": 0.00013317410778814745,
"loss": 0.6989,
"step": 1040
},
{
"epoch": 1.1692307692307693,
"grad_norm": 0.08570466903745366,
"learning_rate": 0.00013171798151473133,
"loss": 0.6956,
"step": 1045
},
{
"epoch": 1.1748251748251748,
"grad_norm": 0.0974948891545614,
"learning_rate": 0.0001302636019830074,
"loss": 0.6965,
"step": 1050
},
{
"epoch": 1.1804195804195805,
"grad_norm": 0.08995597994573838,
"learning_rate": 0.0001288111081505732,
"loss": 0.7041,
"step": 1055
},
{
"epoch": 1.186013986013986,
"grad_norm": 0.07861264078439004,
"learning_rate": 0.00012736063879485837,
"loss": 0.7032,
"step": 1060
},
{
"epoch": 1.1916083916083915,
"grad_norm": 0.0841514205254347,
"learning_rate": 0.0001259123324998655,
"loss": 0.6905,
"step": 1065
},
{
"epoch": 1.1972027972027972,
"grad_norm": 0.08692540683602883,
"learning_rate": 0.0001244663276429291,
"loss": 0.7074,
"step": 1070
},
{
"epoch": 1.2027972027972027,
"grad_norm": 0.08776833500890557,
"learning_rate": 0.00012302276238149463,
"loss": 0.7041,
"step": 1075
},
{
"epoch": 1.2083916083916084,
"grad_norm": 0.08727180482054969,
"learning_rate": 0.00012158177463991828,
"loss": 0.696,
"step": 1080
},
{
"epoch": 1.213986013986014,
"grad_norm": 0.08046559487267867,
"learning_rate": 0.00012014350209628875,
"loss": 0.6826,
"step": 1085
},
{
"epoch": 1.2195804195804196,
"grad_norm": 0.08781752670322365,
"learning_rate": 0.00011870808216927356,
"loss": 0.6999,
"step": 1090
},
{
"epoch": 1.2251748251748251,
"grad_norm": 0.08813740501797863,
"learning_rate": 0.00011727565200498888,
"loss": 0.7037,
"step": 1095
},
{
"epoch": 1.2307692307692308,
"grad_norm": 0.08772222997454243,
"learning_rate": 0.00011584634846389638,
"loss": 0.6986,
"step": 1100
},
{
"epoch": 1.2363636363636363,
"grad_norm": 0.10332929019104932,
"learning_rate": 0.00011442030810772673,
"loss": 0.6725,
"step": 1105
},
{
"epoch": 1.241958041958042,
"grad_norm": 0.08709301562019137,
"learning_rate": 0.00011299766718643226,
"loss": 0.7063,
"step": 1110
},
{
"epoch": 1.2475524475524475,
"grad_norm": 0.08449696490718896,
"learning_rate": 0.00011157856162516863,
"loss": 0.692,
"step": 1115
},
{
"epoch": 1.2531468531468533,
"grad_norm": 0.08124616270666368,
"learning_rate": 0.00011016312701130841,
"loss": 0.6915,
"step": 1120
},
{
"epoch": 1.2587412587412588,
"grad_norm": 0.09376360999458151,
"learning_rate": 0.000108751498581486,
"loss": 0.6939,
"step": 1125
},
{
"epoch": 1.2643356643356642,
"grad_norm": 0.08677694965635109,
"learning_rate": 0.00010734381120867707,
"loss": 0.7029,
"step": 1130
},
{
"epoch": 1.26993006993007,
"grad_norm": 0.08737879433308637,
"learning_rate": 0.00010594019938931187,
"loss": 0.6849,
"step": 1135
},
{
"epoch": 1.2755244755244755,
"grad_norm": 0.08311271942623545,
"learning_rate": 0.00010454079723042485,
"loss": 0.6799,
"step": 1140
},
{
"epoch": 1.2811188811188812,
"grad_norm": 0.07508666997892824,
"learning_rate": 0.00010314573843684183,
"loss": 0.6979,
"step": 1145
},
{
"epoch": 1.2867132867132867,
"grad_norm": 0.07967922610661686,
"learning_rate": 0.00010175515629840487,
"loss": 0.6793,
"step": 1150
},
{
"epoch": 1.2923076923076924,
"grad_norm": 0.08373250104545919,
"learning_rate": 0.00010036918367723754,
"loss": 0.6942,
"step": 1155
},
{
"epoch": 1.2979020979020979,
"grad_norm": 0.08415435083106365,
"learning_rate": 9.898795299505037e-05,
"loss": 0.6843,
"step": 1160
},
{
"epoch": 1.3034965034965036,
"grad_norm": 0.0972925237131085,
"learning_rate": 9.761159622048914e-05,
"loss": 0.6786,
"step": 1165
},
{
"epoch": 1.309090909090909,
"grad_norm": 0.08332589008281578,
"learning_rate": 9.624024485652552e-05,
"loss": 0.6895,
"step": 1170
},
{
"epoch": 1.3146853146853146,
"grad_norm": 0.08324352673680815,
"learning_rate": 9.48740299278933e-05,
"loss": 0.6902,
"step": 1175
},
{
"epoch": 1.3202797202797203,
"grad_norm": 0.08212510567034546,
"learning_rate": 9.351308196856911e-05,
"loss": 0.6861,
"step": 1180
},
{
"epoch": 1.325874125874126,
"grad_norm": 0.08320001036155726,
"learning_rate": 9.215753100930118e-05,
"loss": 0.6943,
"step": 1185
},
{
"epoch": 1.3314685314685315,
"grad_norm": 0.09026366102833903,
"learning_rate": 9.08075065651854e-05,
"loss": 0.7031,
"step": 1190
},
{
"epoch": 1.337062937062937,
"grad_norm": 0.07809171125830346,
"learning_rate": 8.946313762329081e-05,
"loss": 0.6974,
"step": 1195
},
{
"epoch": 1.3426573426573427,
"grad_norm": 0.07708050011669573,
"learning_rate": 8.812455263033595e-05,
"loss": 0.7072,
"step": 1200
},
{
"epoch": 1.3482517482517482,
"grad_norm": 0.08672106026682487,
"learning_rate": 8.679187948041605e-05,
"loss": 0.6946,
"step": 1205
},
{
"epoch": 1.353846153846154,
"grad_norm": 0.08223294476361295,
"learning_rate": 8.546524550278405e-05,
"loss": 0.6917,
"step": 1210
},
{
"epoch": 1.3594405594405594,
"grad_norm": 0.07572041983819904,
"learning_rate": 8.414477744968441e-05,
"loss": 0.7068,
"step": 1215
},
{
"epoch": 1.365034965034965,
"grad_norm": 0.07935310328973877,
"learning_rate": 8.283060148424328e-05,
"loss": 0.6825,
"step": 1220
},
{
"epoch": 1.3706293706293706,
"grad_norm": 0.08084222220036809,
"learning_rate": 8.152284316841382e-05,
"loss": 0.6895,
"step": 1225
},
{
"epoch": 1.3762237762237763,
"grad_norm": 0.08362754142551204,
"learning_rate": 8.02216274509797e-05,
"loss": 0.6855,
"step": 1230
},
{
"epoch": 1.3818181818181818,
"grad_norm": 0.08263146843817087,
"learning_rate": 7.892707865561702e-05,
"loss": 0.685,
"step": 1235
},
{
"epoch": 1.3874125874125873,
"grad_norm": 0.07614085795637057,
"learning_rate": 7.763932046901587e-05,
"loss": 0.698,
"step": 1240
},
{
"epoch": 1.393006993006993,
"grad_norm": 0.08297289392137173,
"learning_rate": 7.635847592906259e-05,
"loss": 0.6892,
"step": 1245
},
{
"epoch": 1.3986013986013985,
"grad_norm": 0.09257478488971461,
"learning_rate": 7.50846674130845e-05,
"loss": 0.6819,
"step": 1250
},
{
"epoch": 1.4041958041958043,
"grad_norm": 0.0820635181024322,
"learning_rate": 7.381801662615731e-05,
"loss": 0.6836,
"step": 1255
},
{
"epoch": 1.4097902097902097,
"grad_norm": 0.08479286795375468,
"learning_rate": 7.255864458947677e-05,
"loss": 0.6838,
"step": 1260
},
{
"epoch": 1.4153846153846155,
"grad_norm": 0.08169299364006637,
"learning_rate": 7.130667162879602e-05,
"loss": 0.6912,
"step": 1265
},
{
"epoch": 1.420979020979021,
"grad_norm": 0.0847296311273153,
"learning_rate": 7.006221736292892e-05,
"loss": 0.6824,
"step": 1270
},
{
"epoch": 1.4265734265734267,
"grad_norm": 0.07999623959693618,
"learning_rate": 6.882540069232155e-05,
"loss": 0.6806,
"step": 1275
},
{
"epoch": 1.4321678321678322,
"grad_norm": 0.07964123970317083,
"learning_rate": 6.759633978769139e-05,
"loss": 0.7052,
"step": 1280
},
{
"epoch": 1.4377622377622377,
"grad_norm": 0.0966871184419301,
"learning_rate": 6.63751520787374e-05,
"loss": 0.7002,
"step": 1285
},
{
"epoch": 1.4433566433566434,
"grad_norm": 0.09626897693462537,
"learning_rate": 6.516195424291972e-05,
"loss": 0.6912,
"step": 1290
},
{
"epoch": 1.4489510489510489,
"grad_norm": 0.08145120964067751,
"learning_rate": 6.395686219431232e-05,
"loss": 0.6877,
"step": 1295
},
{
"epoch": 1.4545454545454546,
"grad_norm": 0.08789566288442652,
"learning_rate": 6.275999107252758e-05,
"loss": 0.6847,
"step": 1300
},
{
"epoch": 1.46013986013986,
"grad_norm": 0.0806679485875012,
"learning_rate": 6.157145523171587e-05,
"loss": 0.6869,
"step": 1305
},
{
"epoch": 1.4657342657342658,
"grad_norm": 0.07942983212018809,
"learning_rate": 6.039136822963924e-05,
"loss": 0.6767,
"step": 1310
},
{
"epoch": 1.4713286713286713,
"grad_norm": 0.08489122954397245,
"learning_rate": 5.9219842816821796e-05,
"loss": 0.6814,
"step": 1315
},
{
"epoch": 1.476923076923077,
"grad_norm": 0.08653144402466852,
"learning_rate": 5.805699092577722e-05,
"loss": 0.6968,
"step": 1320
},
{
"epoch": 1.4825174825174825,
"grad_norm": 0.08218462407837061,
"learning_rate": 5.6902923660313855e-05,
"loss": 0.6781,
"step": 1325
},
{
"epoch": 1.488111888111888,
"grad_norm": 0.07605468398853794,
"learning_rate": 5.5757751284919836e-05,
"loss": 0.6837,
"step": 1330
},
{
"epoch": 1.4937062937062937,
"grad_norm": 0.07347540817224318,
"learning_rate": 5.462158321422751e-05,
"loss": 0.678,
"step": 1335
},
{
"epoch": 1.4993006993006994,
"grad_norm": 0.08136761352155132,
"learning_rate": 5.34945280025599e-05,
"loss": 0.6843,
"step": 1340
},
{
"epoch": 1.504895104895105,
"grad_norm": 0.079793665685405,
"learning_rate": 5.237669333355863e-05,
"loss": 0.6919,
"step": 1345
},
{
"epoch": 1.5104895104895104,
"grad_norm": 0.07721773411337415,
"learning_rate": 5.126818600989557e-05,
"loss": 0.6826,
"step": 1350
},
{
"epoch": 1.5160839160839161,
"grad_norm": 0.07622519698946323,
"learning_rate": 5.0169111943068374e-05,
"loss": 0.6901,
"step": 1355
},
{
"epoch": 1.5216783216783218,
"grad_norm": 0.07862095262493349,
"learning_rate": 4.9079576143281326e-05,
"loss": 0.691,
"step": 1360
},
{
"epoch": 1.5272727272727273,
"grad_norm": 0.07949120411421966,
"learning_rate": 4.7999682709412216e-05,
"loss": 0.6806,
"step": 1365
},
{
"epoch": 1.5328671328671328,
"grad_norm": 0.08530944602336236,
"learning_rate": 4.692953481906605e-05,
"loss": 0.6847,
"step": 1370
},
{
"epoch": 1.5384615384615383,
"grad_norm": 0.08074798934800313,
"learning_rate": 4.586923471871743e-05,
"loss": 0.681,
"step": 1375
},
{
"epoch": 1.544055944055944,
"grad_norm": 0.0777449518216711,
"learning_rate": 4.481888371394115e-05,
"loss": 0.6874,
"step": 1380
},
{
"epoch": 1.5496503496503498,
"grad_norm": 0.07535968734459726,
"learning_rate": 4.377858215973318e-05,
"loss": 0.6751,
"step": 1385
},
{
"epoch": 1.5552447552447553,
"grad_norm": 0.07188075306681382,
"learning_rate": 4.2748429450922263e-05,
"loss": 0.6745,
"step": 1390
},
{
"epoch": 1.5608391608391607,
"grad_norm": 0.0781312708212868,
"learning_rate": 4.172852401267347e-05,
"loss": 0.688,
"step": 1395
},
{
"epoch": 1.5664335664335665,
"grad_norm": 0.07521827314744757,
"learning_rate": 4.0718963291084e-05,
"loss": 0.6757,
"step": 1400
},
{
"epoch": 1.5720279720279722,
"grad_norm": 0.07373545234599518,
"learning_rate": 3.9719843743872964e-05,
"loss": 0.6778,
"step": 1405
},
{
"epoch": 1.5776223776223777,
"grad_norm": 0.07264270828709123,
"learning_rate": 3.873126083116525e-05,
"loss": 0.6864,
"step": 1410
},
{
"epoch": 1.5832167832167832,
"grad_norm": 0.0763271584430483,
"learning_rate": 3.775330900637108e-05,
"loss": 0.683,
"step": 1415
},
{
"epoch": 1.5888111888111887,
"grad_norm": 0.07981572030031307,
"learning_rate": 3.678608170716117e-05,
"loss": 0.6795,
"step": 1420
},
{
"epoch": 1.5944055944055944,
"grad_norm": 0.0746896810608302,
"learning_rate": 3.582967134653972e-05,
"loss": 0.675,
"step": 1425
},
{
"epoch": 1.6,
"grad_norm": 0.07449703423407898,
"learning_rate": 3.488416930401457e-05,
"loss": 0.6805,
"step": 1430
},
{
"epoch": 1.6055944055944056,
"grad_norm": 0.07104343697673421,
"learning_rate": 3.3949665916866466e-05,
"loss": 0.6752,
"step": 1435
},
{
"epoch": 1.611188811188811,
"grad_norm": 0.07498548014087274,
"learning_rate": 3.302625047151807e-05,
"loss": 0.6949,
"step": 1440
},
{
"epoch": 1.6167832167832168,
"grad_norm": 0.07539631481012679,
"learning_rate": 3.211401119500283e-05,
"loss": 0.6892,
"step": 1445
},
{
"epoch": 1.6223776223776225,
"grad_norm": 0.0765200306903301,
"learning_rate": 3.12130352465357e-05,
"loss": 0.6891,
"step": 1450
},
{
"epoch": 1.627972027972028,
"grad_norm": 0.0730441214772004,
"learning_rate": 3.032340870918527e-05,
"loss": 0.6981,
"step": 1455
},
{
"epoch": 1.6335664335664335,
"grad_norm": 0.08171074628055888,
"learning_rate": 2.9445216581649384e-05,
"loss": 0.6936,
"step": 1460
},
{
"epoch": 1.6391608391608392,
"grad_norm": 0.083112396316528,
"learning_rate": 2.8578542770133654e-05,
"loss": 0.6737,
"step": 1465
},
{
"epoch": 1.6447552447552447,
"grad_norm": 0.08252295941296424,
"learning_rate": 2.772347008033492e-05,
"loss": 0.701,
"step": 1470
},
{
"epoch": 1.6503496503496504,
"grad_norm": 0.07835293344801121,
"learning_rate": 2.688008020952952e-05,
"loss": 0.6921,
"step": 1475
},
{
"epoch": 1.655944055944056,
"grad_norm": 0.0778212904803543,
"learning_rate": 2.6048453738767755e-05,
"loss": 0.6764,
"step": 1480
},
{
"epoch": 1.6615384615384614,
"grad_norm": 0.07182079524548696,
"learning_rate": 2.5228670125174704e-05,
"loss": 0.6841,
"step": 1485
},
{
"epoch": 1.6671328671328671,
"grad_norm": 0.072274928421158,
"learning_rate": 2.4420807694358468e-05,
"loss": 0.6823,
"step": 1490
},
{
"epoch": 1.6727272727272728,
"grad_norm": 0.07124500713293849,
"learning_rate": 2.3624943632926853e-05,
"loss": 0.6816,
"step": 1495
},
{
"epoch": 1.6783216783216783,
"grad_norm": 0.06940723884896695,
"learning_rate": 2.2841153981112397e-05,
"loss": 0.6805,
"step": 1500
},
{
"epoch": 1.6839160839160838,
"grad_norm": 0.07325019748524,
"learning_rate": 2.20695136255073e-05,
"loss": 0.6614,
"step": 1505
},
{
"epoch": 1.6895104895104895,
"grad_norm": 0.0727025067676513,
"learning_rate": 2.1310096291908347e-05,
"loss": 0.6851,
"step": 1510
},
{
"epoch": 1.6951048951048953,
"grad_norm": 0.07406758207149883,
"learning_rate": 2.0562974538273024e-05,
"loss": 0.6978,
"step": 1515
},
{
"epoch": 1.7006993006993008,
"grad_norm": 0.0684645890113912,
"learning_rate": 1.9828219747786733e-05,
"loss": 0.6814,
"step": 1520
},
{
"epoch": 1.7062937062937062,
"grad_norm": 0.07103130256598,
"learning_rate": 1.910590212204281e-05,
"loss": 0.6955,
"step": 1525
},
{
"epoch": 1.7118881118881117,
"grad_norm": 0.08660646823989741,
"learning_rate": 1.839609067433495e-05,
"loss": 0.6768,
"step": 1530
},
{
"epoch": 1.7174825174825175,
"grad_norm": 0.07000885068796428,
"learning_rate": 1.7698853223063554e-05,
"loss": 0.6814,
"step": 1535
},
{
"epoch": 1.7230769230769232,
"grad_norm": 0.07227717547773495,
"learning_rate": 1.701425638525601e-05,
"loss": 0.6863,
"step": 1540
},
{
"epoch": 1.7286713286713287,
"grad_norm": 0.08472297754432541,
"learning_rate": 1.634236557020174e-05,
"loss": 0.6739,
"step": 1545
},
{
"epoch": 1.7342657342657342,
"grad_norm": 0.07635635084022904,
"learning_rate": 1.5683244973202848e-05,
"loss": 0.6849,
"step": 1550
},
{
"epoch": 1.7398601398601399,
"grad_norm": 0.0733376716964744,
"learning_rate": 1.5036957569440488e-05,
"loss": 0.6736,
"step": 1555
},
{
"epoch": 1.7454545454545456,
"grad_norm": 0.06984335423469197,
"learning_rate": 1.4403565107958142e-05,
"loss": 0.6801,
"step": 1560
},
{
"epoch": 1.751048951048951,
"grad_norm": 0.07299591046039727,
"learning_rate": 1.3783128105761649e-05,
"loss": 0.6703,
"step": 1565
},
{
"epoch": 1.7566433566433566,
"grad_norm": 0.06856518432971286,
"learning_rate": 1.3175705842037332e-05,
"loss": 0.6811,
"step": 1570
},
{
"epoch": 1.762237762237762,
"grad_norm": 0.06892528407637724,
"learning_rate": 1.2581356352488003e-05,
"loss": 0.6895,
"step": 1575
},
{
"epoch": 1.7678321678321678,
"grad_norm": 0.0731333956161155,
"learning_rate": 1.2000136423788226e-05,
"loss": 0.6947,
"step": 1580
},
{
"epoch": 1.7734265734265735,
"grad_norm": 0.07510563722528918,
"learning_rate": 1.1432101588158487e-05,
"loss": 0.6782,
"step": 1585
},
{
"epoch": 1.779020979020979,
"grad_norm": 0.07335149840877105,
"learning_rate": 1.0877306118059498e-05,
"loss": 0.6832,
"step": 1590
},
{
"epoch": 1.7846153846153845,
"grad_norm": 0.07016146300305476,
"learning_rate": 1.0335803021006783e-05,
"loss": 0.6735,
"step": 1595
},
{
"epoch": 1.7902097902097902,
"grad_norm": 0.07110506905460164,
"learning_rate": 9.807644034506024e-06,
"loss": 0.6933,
"step": 1600
},
{
"epoch": 1.795804195804196,
"grad_norm": 0.07366692101954259,
"learning_rate": 9.292879621110022e-06,
"loss": 0.6775,
"step": 1605
},
{
"epoch": 1.8013986013986014,
"grad_norm": 0.07086756664676215,
"learning_rate": 8.791558963597045e-06,
"loss": 0.6847,
"step": 1610
},
{
"epoch": 1.806993006993007,
"grad_norm": 0.0704179943193231,
"learning_rate": 8.30372996027195e-06,
"loss": 0.6802,
"step": 1615
},
{
"epoch": 1.8125874125874126,
"grad_norm": 0.07178710401033903,
"learning_rate": 7.829439220389521e-06,
"loss": 0.6892,
"step": 1620
},
{
"epoch": 1.8181818181818183,
"grad_norm": 0.07500788961567526,
"learning_rate": 7.368732059701499e-06,
"loss": 0.6822,
"step": 1625
},
{
"epoch": 1.8237762237762238,
"grad_norm": 0.07030773912011665,
"learning_rate": 6.921652496126623e-06,
"loss": 0.6749,
"step": 1630
},
{
"epoch": 1.8293706293706293,
"grad_norm": 0.07046870723595076,
"learning_rate": 6.4882432455452606e-06,
"loss": 0.6748,
"step": 1635
},
{
"epoch": 1.8349650349650348,
"grad_norm": 0.07117585543308619,
"learning_rate": 6.068545717717916e-06,
"loss": 0.6828,
"step": 1640
},
{
"epoch": 1.8405594405594405,
"grad_norm": 0.06915527262243856,
"learning_rate": 5.662600012328944e-06,
"loss": 0.6883,
"step": 1645
},
{
"epoch": 1.8461538461538463,
"grad_norm": 0.06677186359807996,
"learning_rate": 5.27044491515512e-06,
"loss": 0.6701,
"step": 1650
},
{
"epoch": 1.8517482517482518,
"grad_norm": 0.06975135811394965,
"learning_rate": 4.892117894359981e-06,
"loss": 0.6896,
"step": 1655
},
{
"epoch": 1.8573426573426572,
"grad_norm": 0.06646442932877118,
"learning_rate": 4.527655096913913e-06,
"loss": 0.6736,
"step": 1660
},
{
"epoch": 1.862937062937063,
"grad_norm": 0.0696338374277028,
"learning_rate": 4.177091345140488e-06,
"loss": 0.6824,
"step": 1665
},
{
"epoch": 1.8685314685314687,
"grad_norm": 0.06913237278308428,
"learning_rate": 3.840460133389434e-06,
"loss": 0.6708,
"step": 1670
},
{
"epoch": 1.8741258741258742,
"grad_norm": 0.07472788742872244,
"learning_rate": 3.5177936248364236e-06,
"loss": 0.6843,
"step": 1675
},
{
"epoch": 1.8797202797202797,
"grad_norm": 0.07149423452101296,
"learning_rate": 3.2091226484101506e-06,
"loss": 0.6716,
"step": 1680
},
{
"epoch": 1.8853146853146852,
"grad_norm": 0.0676192443302248,
"learning_rate": 2.9144766958466014e-06,
"loss": 0.6816,
"step": 1685
},
{
"epoch": 1.8909090909090909,
"grad_norm": 0.07125330690039695,
"learning_rate": 2.6338839188715433e-06,
"loss": 0.686,
"step": 1690
},
{
"epoch": 1.8965034965034966,
"grad_norm": 0.07150788853257964,
"learning_rate": 2.3673711265105754e-06,
"loss": 0.6845,
"step": 1695
},
{
"epoch": 1.902097902097902,
"grad_norm": 0.06921602353027008,
"learning_rate": 2.1149637825277953e-06,
"loss": 0.6851,
"step": 1700
},
{
"epoch": 1.9076923076923076,
"grad_norm": 0.07103009932661779,
"learning_rate": 1.876686002992861e-06,
"loss": 0.6879,
"step": 1705
},
{
"epoch": 1.9132867132867133,
"grad_norm": 0.07078839389394587,
"learning_rate": 1.6525605539768173e-06,
"loss": 0.6842,
"step": 1710
},
{
"epoch": 1.918881118881119,
"grad_norm": 0.0677463790575426,
"learning_rate": 1.4426088493769695e-06,
"loss": 0.6822,
"step": 1715
},
{
"epoch": 1.9244755244755245,
"grad_norm": 0.0689426207416468,
"learning_rate": 1.2468509488708534e-06,
"loss": 0.671,
"step": 1720
},
{
"epoch": 1.93006993006993,
"grad_norm": 0.06820108494343277,
"learning_rate": 1.0653055559997014e-06,
"loss": 0.6775,
"step": 1725
},
{
"epoch": 1.9356643356643357,
"grad_norm": 0.06821597918112166,
"learning_rate": 8.979900163813891e-07,
"loss": 0.6701,
"step": 1730
},
{
"epoch": 1.9412587412587412,
"grad_norm": 0.06837657131866866,
"learning_rate": 7.449203160532102e-07,
"loss": 0.6725,
"step": 1735
},
{
"epoch": 1.946853146853147,
"grad_norm": 0.07000431064997485,
"learning_rate": 6.061110799443991e-07,
"loss": 0.6709,
"step": 1740
},
{
"epoch": 1.9524475524475524,
"grad_norm": 0.07108137280182684,
"learning_rate": 4.815755704789481e-07,
"loss": 0.6846,
"step": 1745
},
{
"epoch": 1.958041958041958,
"grad_norm": 0.07117415645327746,
"learning_rate": 3.7132568630833804e-07,
"loss": 0.6865,
"step": 1750
},
{
"epoch": 1.9636363636363636,
"grad_norm": 0.06606487210968705,
"learning_rate": 2.753719611747474e-07,
"loss": 0.6736,
"step": 1755
},
{
"epoch": 1.9692307692307693,
"grad_norm": 0.06965546022369541,
"learning_rate": 1.9372356290460744e-07,
"loss": 0.7023,
"step": 1760
},
{
"epoch": 1.9748251748251748,
"grad_norm": 0.06925022116566176,
"learning_rate": 1.2638829253265316e-07,
"loss": 0.6665,
"step": 1765
},
{
"epoch": 1.9804195804195803,
"grad_norm": 0.06743940727744605,
"learning_rate": 7.337258355660236e-08,
"loss": 0.6704,
"step": 1770
},
{
"epoch": 1.986013986013986,
"grad_norm": 0.06697392386506328,
"learning_rate": 3.4681501322464386e-08,
"loss": 0.6703,
"step": 1775
},
{
"epoch": 1.9916083916083918,
"grad_norm": 0.06864360169538268,
"learning_rate": 1.0318742540560421e-08,
"loss": 0.6753,
"step": 1780
},
{
"epoch": 1.9972027972027973,
"grad_norm": 0.06821105929452137,
"learning_rate": 2.8663493232272684e-10,
"loss": 0.6705,
"step": 1785
},
{
"epoch": 1.9983216783216782,
"eval_loss": 1.0629972219467163,
"eval_runtime": 366.656,
"eval_samples_per_second": 56.153,
"eval_steps_per_second": 1.756,
"step": 1786
},
{
"epoch": 1.9983216783216782,
"step": 1786,
"total_flos": 1127451463778304.0,
"train_loss": 0.777597175032935,
"train_runtime": 16051.5455,
"train_samples_per_second": 14.253,
"train_steps_per_second": 0.111
}
],
"logging_steps": 5,
"max_steps": 1786,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1127451463778304.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}