{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 500,
"global_step": 2540,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03937007874015748,
"grad_norm": 9.399362564086914,
"learning_rate": 7.086614173228347e-06,
"loss": 1.4557,
"step": 10
},
{
"epoch": 0.07874015748031496,
"grad_norm": 2.942493438720703,
"learning_rate": 1.4960629921259845e-05,
"loss": 0.4296,
"step": 20
},
{
"epoch": 0.11811023622047244,
"grad_norm": 3.1724367141723633,
"learning_rate": 2.283464566929134e-05,
"loss": 0.3657,
"step": 30
},
{
"epoch": 0.15748031496062992,
"grad_norm": 1.5789762735366821,
"learning_rate": 3.070866141732284e-05,
"loss": 0.3486,
"step": 40
},
{
"epoch": 0.1968503937007874,
"grad_norm": 2.097869396209717,
"learning_rate": 3.858267716535433e-05,
"loss": 0.3044,
"step": 50
},
{
"epoch": 0.23622047244094488,
"grad_norm": 3.880457639694214,
"learning_rate": 4.645669291338583e-05,
"loss": 0.3367,
"step": 60
},
{
"epoch": 0.2755905511811024,
"grad_norm": 2.8253917694091797,
"learning_rate": 5.433070866141733e-05,
"loss": 0.3126,
"step": 70
},
{
"epoch": 0.31496062992125984,
"grad_norm": 3.822925090789795,
"learning_rate": 6.220472440944882e-05,
"loss": 0.3004,
"step": 80
},
{
"epoch": 0.3543307086614173,
"grad_norm": 1.3659324645996094,
"learning_rate": 7.007874015748031e-05,
"loss": 0.2605,
"step": 90
},
{
"epoch": 0.3937007874015748,
"grad_norm": 1.7165173292160034,
"learning_rate": 7.795275590551181e-05,
"loss": 0.1676,
"step": 100
},
{
"epoch": 0.4330708661417323,
"grad_norm": 1.704687476158142,
"learning_rate": 8.582677165354331e-05,
"loss": 0.1404,
"step": 110
},
{
"epoch": 0.47244094488188976,
"grad_norm": 1.3101590871810913,
"learning_rate": 9.370078740157481e-05,
"loss": 0.1322,
"step": 120
},
{
"epoch": 0.5118110236220472,
"grad_norm": 1.6621087789535522,
"learning_rate": 9.999983049408561e-05,
"loss": 0.1242,
"step": 130
},
{
"epoch": 0.5511811023622047,
"grad_norm": 0.9743478298187256,
"learning_rate": 9.999389790775648e-05,
"loss": 0.1027,
"step": 140
},
{
"epoch": 0.5905511811023622,
"grad_norm": 1.9478999376296997,
"learning_rate": 9.997949117496292e-05,
"loss": 0.1174,
"step": 150
},
{
"epoch": 0.6299212598425197,
"grad_norm": 0.9509850740432739,
"learning_rate": 9.995661273769822e-05,
"loss": 0.1015,
"step": 160
},
{
"epoch": 0.6692913385826772,
"grad_norm": 0.9505985379219055,
"learning_rate": 9.992526647394022e-05,
"loss": 0.102,
"step": 170
},
{
"epoch": 0.7086614173228346,
"grad_norm": 1.489611268043518,
"learning_rate": 9.988545769699399e-05,
"loss": 0.097,
"step": 180
},
{
"epoch": 0.7480314960629921,
"grad_norm": 1.1149543523788452,
"learning_rate": 9.983719315459114e-05,
"loss": 0.0925,
"step": 190
},
{
"epoch": 0.7874015748031497,
"grad_norm": 1.0860552787780762,
"learning_rate": 9.978048102774613e-05,
"loss": 0.0964,
"step": 200
},
{
"epoch": 0.8267716535433071,
"grad_norm": 1.2707302570343018,
"learning_rate": 9.971533092936954e-05,
"loss": 0.0844,
"step": 210
},
{
"epoch": 0.8661417322834646,
"grad_norm": 1.1820255517959595,
"learning_rate": 9.964175390263856e-05,
"loss": 0.0805,
"step": 220
},
{
"epoch": 0.905511811023622,
"grad_norm": 1.3937278985977173,
"learning_rate": 9.955976241912535e-05,
"loss": 0.0871,
"step": 230
},
{
"epoch": 0.9448818897637795,
"grad_norm": 0.9341478943824768,
"learning_rate": 9.946937037668275e-05,
"loss": 0.0826,
"step": 240
},
{
"epoch": 0.984251968503937,
"grad_norm": 1.7772321701049805,
"learning_rate": 9.937059309708885e-05,
"loss": 0.0873,
"step": 250
},
{
"epoch": 1.0236220472440944,
"grad_norm": 1.2695393562316895,
"learning_rate": 9.926344732344967e-05,
"loss": 0.0794,
"step": 260
},
{
"epoch": 1.0629921259842519,
"grad_norm": 1.1093697547912598,
"learning_rate": 9.914795121736128e-05,
"loss": 0.0758,
"step": 270
},
{
"epoch": 1.1023622047244095,
"grad_norm": 0.8029543161392212,
"learning_rate": 9.902412435583128e-05,
"loss": 0.0678,
"step": 280
},
{
"epoch": 1.141732283464567,
"grad_norm": 0.7547488808631897,
"learning_rate": 9.88919877279604e-05,
"loss": 0.0761,
"step": 290
},
{
"epoch": 1.1811023622047245,
"grad_norm": 0.8116704225540161,
"learning_rate": 9.875156373138489e-05,
"loss": 0.057,
"step": 300
},
{
"epoch": 1.220472440944882,
"grad_norm": 0.8954646587371826,
"learning_rate": 9.86028761684799e-05,
"loss": 0.0738,
"step": 310
},
{
"epoch": 1.2598425196850394,
"grad_norm": 1.016405463218689,
"learning_rate": 9.844595024232495e-05,
"loss": 0.0901,
"step": 320
},
{
"epoch": 1.2992125984251968,
"grad_norm": 1.395342469215393,
"learning_rate": 9.828081255243198e-05,
"loss": 0.0796,
"step": 330
},
{
"epoch": 1.3385826771653544,
"grad_norm": 0.8092917203903198,
"learning_rate": 9.81074910902365e-05,
"loss": 0.0883,
"step": 340
},
{
"epoch": 1.3779527559055118,
"grad_norm": 1.1169452667236328,
"learning_rate": 9.792601523435307e-05,
"loss": 0.0748,
"step": 350
},
{
"epoch": 1.4173228346456692,
"grad_norm": 0.7994294166564941,
"learning_rate": 9.773641574559546e-05,
"loss": 0.0862,
"step": 360
},
{
"epoch": 1.4566929133858268,
"grad_norm": 0.7973235249519348,
"learning_rate": 9.753872476176254e-05,
"loss": 0.0735,
"step": 370
},
{
"epoch": 1.4960629921259843,
"grad_norm": 0.9219651818275452,
"learning_rate": 9.73329757921909e-05,
"loss": 0.077,
"step": 380
},
{
"epoch": 1.5354330708661417,
"grad_norm": 1.1715006828308105,
"learning_rate": 9.711920371207484e-05,
"loss": 0.0691,
"step": 390
},
{
"epoch": 1.574803149606299,
"grad_norm": 0.7752212882041931,
"learning_rate": 9.68974447565549e-05,
"loss": 0.0669,
"step": 400
},
{
"epoch": 1.6141732283464567,
"grad_norm": 1.0260776281356812,
"learning_rate": 9.666773651457588e-05,
"loss": 0.0623,
"step": 410
},
{
"epoch": 1.6535433070866141,
"grad_norm": 0.8338336944580078,
"learning_rate": 9.643011792251538e-05,
"loss": 0.0699,
"step": 420
},
{
"epoch": 1.6929133858267718,
"grad_norm": 0.8776105642318726,
"learning_rate": 9.618462925758392e-05,
"loss": 0.0653,
"step": 430
},
{
"epoch": 1.7322834645669292,
"grad_norm": 0.6896973252296448,
"learning_rate": 9.593131213099789e-05,
"loss": 0.0586,
"step": 440
},
{
"epoch": 1.7716535433070866,
"grad_norm": 1.0852605104446411,
"learning_rate": 9.567020948092616e-05,
"loss": 0.0673,
"step": 450
},
{
"epoch": 1.811023622047244,
"grad_norm": 1.0203490257263184,
"learning_rate": 9.540136556521203e-05,
"loss": 0.0663,
"step": 460
},
{
"epoch": 1.8503937007874016,
"grad_norm": 0.774488091468811,
"learning_rate": 9.512482595387132e-05,
"loss": 0.0609,
"step": 470
},
{
"epoch": 1.889763779527559,
"grad_norm": 0.5737660527229309,
"learning_rate": 9.484063752136805e-05,
"loss": 0.0606,
"step": 480
},
{
"epoch": 1.9291338582677167,
"grad_norm": 1.0153898000717163,
"learning_rate": 9.454884843866912e-05,
"loss": 0.0737,
"step": 490
},
{
"epoch": 1.968503937007874,
"grad_norm": 0.7526334524154663,
"learning_rate": 9.424950816507909e-05,
"loss": 0.0641,
"step": 500
},
{
"epoch": 2.0078740157480315,
"grad_norm": 0.5760018825531006,
"learning_rate": 9.394266743985671e-05,
"loss": 0.0674,
"step": 510
},
{
"epoch": 2.047244094488189,
"grad_norm": 0.70269775390625,
"learning_rate": 9.36283782736144e-05,
"loss": 0.0631,
"step": 520
},
{
"epoch": 2.0866141732283463,
"grad_norm": 0.8864635229110718,
"learning_rate": 9.330669393950219e-05,
"loss": 0.0654,
"step": 530
},
{
"epoch": 2.1259842519685037,
"grad_norm": 0.7043759226799011,
"learning_rate": 9.297766896417793e-05,
"loss": 0.0657,
"step": 540
},
{
"epoch": 2.1653543307086616,
"grad_norm": 0.6329500675201416,
"learning_rate": 9.264135911856462e-05,
"loss": 0.0707,
"step": 550
},
{
"epoch": 2.204724409448819,
"grad_norm": 0.4031962752342224,
"learning_rate": 9.22978214083971e-05,
"loss": 0.0528,
"step": 560
},
{
"epoch": 2.2440944881889764,
"grad_norm": 0.5401821136474609,
"learning_rate": 9.194711406455945e-05,
"loss": 0.0654,
"step": 570
},
{
"epoch": 2.283464566929134,
"grad_norm": 0.713798999786377,
"learning_rate": 9.158929653321451e-05,
"loss": 0.0555,
"step": 580
},
{
"epoch": 2.322834645669291,
"grad_norm": 0.4728735387325287,
"learning_rate": 9.122442946572768e-05,
"loss": 0.0552,
"step": 590
},
{
"epoch": 2.362204724409449,
"grad_norm": 0.7359452843666077,
"learning_rate": 9.085257470838619e-05,
"loss": 0.0677,
"step": 600
},
{
"epoch": 2.4015748031496065,
"grad_norm": 0.6030870676040649,
"learning_rate": 9.047379529191594e-05,
"loss": 0.053,
"step": 610
},
{
"epoch": 2.440944881889764,
"grad_norm": 0.5791817903518677,
"learning_rate": 9.008815542079766e-05,
"loss": 0.0493,
"step": 620
},
{
"epoch": 2.4803149606299213,
"grad_norm": 0.8772215247154236,
"learning_rate": 8.969572046238389e-05,
"loss": 0.0721,
"step": 630
},
{
"epoch": 2.5196850393700787,
"grad_norm": 0.8733668923377991,
"learning_rate": 8.929655693581904e-05,
"loss": 0.0597,
"step": 640
},
{
"epoch": 2.559055118110236,
"grad_norm": 1.0022321939468384,
"learning_rate": 8.889073250076421e-05,
"loss": 0.0659,
"step": 650
},
{
"epoch": 2.5984251968503935,
"grad_norm": 0.7206939458847046,
"learning_rate": 8.84783159459285e-05,
"loss": 0.0452,
"step": 660
},
{
"epoch": 2.637795275590551,
"grad_norm": 0.8875113725662231,
"learning_rate": 8.805937717740918e-05,
"loss": 0.0539,
"step": 670
},
{
"epoch": 2.677165354330709,
"grad_norm": 0.5767335295677185,
"learning_rate": 8.763398720684232e-05,
"loss": 0.0503,
"step": 680
},
{
"epoch": 2.716535433070866,
"grad_norm": 0.5727648138999939,
"learning_rate": 8.72022181393661e-05,
"loss": 0.0457,
"step": 690
},
{
"epoch": 2.7559055118110236,
"grad_norm": 0.8125827312469482,
"learning_rate": 8.676414316139863e-05,
"loss": 0.0607,
"step": 700
},
{
"epoch": 2.795275590551181,
"grad_norm": 0.6720311641693115,
"learning_rate": 8.631983652823267e-05,
"loss": 0.0665,
"step": 710
},
{
"epoch": 2.8346456692913384,
"grad_norm": 0.6637985706329346,
"learning_rate": 8.586937355144908e-05,
"loss": 0.068,
"step": 720
},
{
"epoch": 2.8740157480314963,
"grad_norm": 0.7840360999107361,
"learning_rate": 8.541283058615124e-05,
"loss": 0.0561,
"step": 730
},
{
"epoch": 2.9133858267716537,
"grad_norm": 0.44171687960624695,
"learning_rate": 8.495028501802251e-05,
"loss": 0.0534,
"step": 740
},
{
"epoch": 2.952755905511811,
"grad_norm": 0.4313163459300995,
"learning_rate": 8.448181525020921e-05,
"loss": 0.0391,
"step": 750
},
{
"epoch": 2.9921259842519685,
"grad_norm": 0.7261826395988464,
"learning_rate": 8.400750069003086e-05,
"loss": 0.0486,
"step": 760
},
{
"epoch": 3.031496062992126,
"grad_norm": 0.4469556212425232,
"learning_rate": 8.352742173552046e-05,
"loss": 0.0511,
"step": 770
},
{
"epoch": 3.0708661417322833,
"grad_norm": 0.9129867553710938,
"learning_rate": 8.304165976179667e-05,
"loss": 0.0533,
"step": 780
},
{
"epoch": 3.1102362204724407,
"grad_norm": 1.2041122913360596,
"learning_rate": 8.255029710727048e-05,
"loss": 0.0671,
"step": 790
},
{
"epoch": 3.1496062992125986,
"grad_norm": 0.7420069575309753,
"learning_rate": 8.20534170596885e-05,
"loss": 0.0685,
"step": 800
},
{
"epoch": 3.188976377952756,
"grad_norm": 0.3230190575122833,
"learning_rate": 8.155110384201544e-05,
"loss": 0.0647,
"step": 810
},
{
"epoch": 3.2283464566929134,
"grad_norm": 0.6603342890739441,
"learning_rate": 8.104344259815794e-05,
"loss": 0.0558,
"step": 820
},
{
"epoch": 3.267716535433071,
"grad_norm": 0.5632081031799316,
"learning_rate": 8.053051937853248e-05,
"loss": 0.0558,
"step": 830
},
{
"epoch": 3.3070866141732282,
"grad_norm": 0.7754299640655518,
"learning_rate": 8.001242112547942e-05,
"loss": 0.0632,
"step": 840
},
{
"epoch": 3.3464566929133857,
"grad_norm": 0.7823946475982666,
"learning_rate": 7.948923565852598e-05,
"loss": 0.0662,
"step": 850
},
{
"epoch": 3.3858267716535435,
"grad_norm": 0.7399844527244568,
"learning_rate": 7.896105165950059e-05,
"loss": 0.052,
"step": 860
},
{
"epoch": 3.425196850393701,
"grad_norm": 0.8208476305007935,
"learning_rate": 7.842795865750088e-05,
"loss": 0.0486,
"step": 870
},
{
"epoch": 3.4645669291338583,
"grad_norm": 0.5400993227958679,
"learning_rate": 7.789004701371825e-05,
"loss": 0.0443,
"step": 880
},
{
"epoch": 3.5039370078740157,
"grad_norm": 0.6949036717414856,
"learning_rate": 7.734740790612136e-05,
"loss": 0.0597,
"step": 890
},
{
"epoch": 3.543307086614173,
"grad_norm": 1.0321848392486572,
"learning_rate": 7.680013331400098e-05,
"loss": 0.0446,
"step": 900
},
{
"epoch": 3.5826771653543306,
"grad_norm": 0.5193206071853638,
"learning_rate": 7.624831600237937e-05,
"loss": 0.0499,
"step": 910
},
{
"epoch": 3.622047244094488,
"grad_norm": 0.755699872970581,
"learning_rate": 7.569204950628605e-05,
"loss": 0.0595,
"step": 920
},
{
"epoch": 3.661417322834646,
"grad_norm": 0.4758411645889282,
"learning_rate": 7.513142811490356e-05,
"loss": 0.0403,
"step": 930
},
{
"epoch": 3.7007874015748032,
"grad_norm": 0.9744377732276917,
"learning_rate": 7.456654685558481e-05,
"loss": 0.0566,
"step": 940
},
{
"epoch": 3.7401574803149606,
"grad_norm": 0.46791282296180725,
"learning_rate": 7.399750147774575e-05,
"loss": 0.0445,
"step": 950
},
{
"epoch": 3.779527559055118,
"grad_norm": 0.4000394642353058,
"learning_rate": 7.34243884366355e-05,
"loss": 0.0577,
"step": 960
},
{
"epoch": 3.8188976377952755,
"grad_norm": 0.6318042874336243,
"learning_rate": 7.28473048769868e-05,
"loss": 0.0491,
"step": 970
},
{
"epoch": 3.8582677165354333,
"grad_norm": 0.9635873436927795,
"learning_rate": 7.226634861654965e-05,
"loss": 0.0501,
"step": 980
},
{
"epoch": 3.8976377952755907,
"grad_norm": 0.7494231462478638,
"learning_rate": 7.168161812951084e-05,
"loss": 0.0511,
"step": 990
},
{
"epoch": 3.937007874015748,
"grad_norm": 0.6621044874191284,
"learning_rate": 7.109321252980218e-05,
"loss": 0.0429,
"step": 1000
},
{
"epoch": 3.9763779527559056,
"grad_norm": 0.6507459878921509,
"learning_rate": 7.05012315543004e-05,
"loss": 0.0514,
"step": 1010
},
{
"epoch": 4.015748031496063,
"grad_norm": 0.6397859454154968,
"learning_rate": 6.990577554592134e-05,
"loss": 0.051,
"step": 1020
},
{
"epoch": 4.05511811023622,
"grad_norm": 0.8459829688072205,
"learning_rate": 6.930694543661149e-05,
"loss": 0.0492,
"step": 1030
},
{
"epoch": 4.094488188976378,
"grad_norm": 0.6369995474815369,
"learning_rate": 6.870484273023968e-05,
"loss": 0.0447,
"step": 1040
},
{
"epoch": 4.133858267716535,
"grad_norm": 0.6142792701721191,
"learning_rate": 6.809956948539166e-05,
"loss": 0.044,
"step": 1050
},
{
"epoch": 4.173228346456693,
"grad_norm": 0.5256998538970947,
"learning_rate": 6.749122829807103e-05,
"loss": 0.0427,
"step": 1060
},
{
"epoch": 4.21259842519685,
"grad_norm": 0.7650443911552429,
"learning_rate": 6.687992228430872e-05,
"loss": 0.0525,
"step": 1070
},
{
"epoch": 4.251968503937007,
"grad_norm": 0.6876934170722961,
"learning_rate": 6.62657550626844e-05,
"loss": 0.0385,
"step": 1080
},
{
"epoch": 4.291338582677166,
"grad_norm": 0.7614730000495911,
"learning_rate": 6.564883073676287e-05,
"loss": 0.0543,
"step": 1090
},
{
"epoch": 4.330708661417323,
"grad_norm": 0.591896653175354,
"learning_rate": 6.502925387744807e-05,
"loss": 0.044,
"step": 1100
},
{
"epoch": 4.3700787401574805,
"grad_norm": 0.8287089467048645,
"learning_rate": 6.440712950525791e-05,
"loss": 0.0427,
"step": 1110
},
{
"epoch": 4.409448818897638,
"grad_norm": 0.8359081745147705,
"learning_rate": 6.3782563072523e-05,
"loss": 0.0513,
"step": 1120
},
{
"epoch": 4.448818897637795,
"grad_norm": 0.4965924322605133,
"learning_rate": 6.315566044551197e-05,
"loss": 0.0503,
"step": 1130
},
{
"epoch": 4.488188976377953,
"grad_norm": 0.5000588297843933,
"learning_rate": 6.252652788648691e-05,
"loss": 0.0348,
"step": 1140
},
{
"epoch": 4.52755905511811,
"grad_norm": 0.5434101819992065,
"learning_rate": 6.18952720356914e-05,
"loss": 0.0409,
"step": 1150
},
{
"epoch": 4.566929133858268,
"grad_norm": 0.6852266788482666,
"learning_rate": 6.126199989327462e-05,
"loss": 0.0437,
"step": 1160
},
{
"epoch": 4.606299212598425,
"grad_norm": 0.684528648853302,
"learning_rate": 6.062681880115453e-05,
"loss": 0.0447,
"step": 1170
},
{
"epoch": 4.645669291338582,
"grad_norm": 0.6656462550163269,
"learning_rate": 5.998983642482296e-05,
"loss": 0.0429,
"step": 1180
},
{
"epoch": 4.68503937007874,
"grad_norm": 0.7268936634063721,
"learning_rate": 5.935116073509592e-05,
"loss": 0.0478,
"step": 1190
},
{
"epoch": 4.724409448818898,
"grad_norm": 0.5984519720077515,
"learning_rate": 5.871089998981214e-05,
"loss": 0.038,
"step": 1200
},
{
"epoch": 4.7637795275590555,
"grad_norm": 0.6121963858604431,
"learning_rate": 5.8069162715483e-05,
"loss": 0.0388,
"step": 1210
},
{
"epoch": 4.803149606299213,
"grad_norm": 0.47043073177337646,
"learning_rate": 5.742605768889693e-05,
"loss": 0.0355,
"step": 1220
},
{
"epoch": 4.84251968503937,
"grad_norm": 0.5659105181694031,
"learning_rate": 5.6781693918681275e-05,
"loss": 0.0434,
"step": 1230
},
{
"epoch": 4.881889763779528,
"grad_norm": 0.5114152431488037,
"learning_rate": 5.613618062682502e-05,
"loss": 0.0379,
"step": 1240
},
{
"epoch": 4.921259842519685,
"grad_norm": 0.6941166520118713,
"learning_rate": 5.5489627230165176e-05,
"loss": 0.046,
"step": 1250
},
{
"epoch": 4.960629921259843,
"grad_norm": 0.6256818771362305,
"learning_rate": 5.48421433218403e-05,
"loss": 0.0505,
"step": 1260
},
{
"epoch": 5.0,
"grad_norm": 0.8666470646858215,
"learning_rate": 5.419383865271402e-05,
"loss": 0.0419,
"step": 1270
},
{
"epoch": 5.039370078740157,
"grad_norm": 1.043869972229004,
"learning_rate": 5.354482311277193e-05,
"loss": 0.0483,
"step": 1280
},
{
"epoch": 5.078740157480315,
"grad_norm": 0.4656646251678467,
"learning_rate": 5.289520671249479e-05,
"loss": 0.0333,
"step": 1290
},
{
"epoch": 5.118110236220472,
"grad_norm": 0.45656928420066833,
"learning_rate": 5.224509956421133e-05,
"loss": 0.0373,
"step": 1300
},
{
"epoch": 5.15748031496063,
"grad_norm": 0.7461910247802734,
"learning_rate": 5.159461186343385e-05,
"loss": 0.0427,
"step": 1310
},
{
"epoch": 5.196850393700787,
"grad_norm": 0.646958589553833,
"learning_rate": 5.094385387017967e-05,
"loss": 0.0447,
"step": 1320
},
{
"epoch": 5.2362204724409445,
"grad_norm": 0.2877052128314972,
"learning_rate": 5.02929358902817e-05,
"loss": 0.036,
"step": 1330
},
{
"epoch": 5.275590551181103,
"grad_norm": 0.5020745992660522,
"learning_rate": 4.964196825669112e-05,
"loss": 0.0485,
"step": 1340
},
{
"epoch": 5.31496062992126,
"grad_norm": 0.494816392660141,
"learning_rate": 4.899106131077562e-05,
"loss": 0.0446,
"step": 1350
},
{
"epoch": 5.354330708661418,
"grad_norm": 0.4856385290622711,
"learning_rate": 4.834032538361607e-05,
"loss": 0.0418,
"step": 1360
},
{
"epoch": 5.393700787401575,
"grad_norm": 0.2937265634536743,
"learning_rate": 4.768987077730509e-05,
"loss": 0.0329,
"step": 1370
},
{
"epoch": 5.433070866141732,
"grad_norm": 0.44668734073638916,
"learning_rate": 4.703980774625038e-05,
"loss": 0.0373,
"step": 1380
},
{
"epoch": 5.47244094488189,
"grad_norm": 0.49580681324005127,
"learning_rate": 4.6390246478486196e-05,
"loss": 0.0393,
"step": 1390
},
{
"epoch": 5.511811023622047,
"grad_norm": 0.41949307918548584,
"learning_rate": 4.574129707699617e-05,
"loss": 0.0336,
"step": 1400
},
{
"epoch": 5.551181102362205,
"grad_norm": 0.7558311223983765,
"learning_rate": 4.509306954105028e-05,
"loss": 0.036,
"step": 1410
},
{
"epoch": 5.590551181102362,
"grad_norm": 0.6058725118637085,
"learning_rate": 4.4445673747559776e-05,
"loss": 0.0389,
"step": 1420
},
{
"epoch": 5.6299212598425195,
"grad_norm": 0.5574952960014343,
"learning_rate": 4.3799219432452527e-05,
"loss": 0.0441,
"step": 1430
},
{
"epoch": 5.669291338582677,
"grad_norm": 0.3628334403038025,
"learning_rate": 4.315381617207239e-05,
"loss": 0.0306,
"step": 1440
},
{
"epoch": 5.708661417322834,
"grad_norm": 0.6688554286956787,
"learning_rate": 4.2509573364605695e-05,
"loss": 0.0384,
"step": 1450
},
{
"epoch": 5.748031496062993,
"grad_norm": 0.4437117874622345,
"learning_rate": 4.1866600211537734e-05,
"loss": 0.0359,
"step": 1460
},
{
"epoch": 5.78740157480315,
"grad_norm": 0.5025441646575928,
"learning_rate": 4.122500569914285e-05,
"loss": 0.0339,
"step": 1470
},
{
"epoch": 5.826771653543307,
"grad_norm": 0.6300631761550903,
"learning_rate": 4.058489858001079e-05,
"loss": 0.0412,
"step": 1480
},
{
"epoch": 5.866141732283465,
"grad_norm": 0.33207225799560547,
"learning_rate": 3.9946387354612754e-05,
"loss": 0.0382,
"step": 1490
},
{
"epoch": 5.905511811023622,
"grad_norm": 0.4854939877986908,
"learning_rate": 3.930958025291021e-05,
"loss": 0.0355,
"step": 1500
},
{
"epoch": 5.94488188976378,
"grad_norm": 0.4515645205974579,
"learning_rate": 3.867458521600943e-05,
"loss": 0.0403,
"step": 1510
},
{
"epoch": 5.984251968503937,
"grad_norm": 0.5812086462974548,
"learning_rate": 3.804150987786525e-05,
"loss": 0.045,
"step": 1520
},
{
"epoch": 6.0236220472440944,
"grad_norm": 0.4998365640640259,
"learning_rate": 3.7410461547036534e-05,
"loss": 0.0372,
"step": 1530
},
{
"epoch": 6.062992125984252,
"grad_norm": 0.2945879101753235,
"learning_rate": 3.6781547188497135e-05,
"loss": 0.0355,
"step": 1540
},
{
"epoch": 6.102362204724409,
"grad_norm": 0.4646545946598053,
"learning_rate": 3.6154873405504895e-05,
"loss": 0.0271,
"step": 1550
},
{
"epoch": 6.141732283464567,
"grad_norm": 0.5919941067695618,
"learning_rate": 3.553054642153192e-05,
"loss": 0.04,
"step": 1560
},
{
"epoch": 6.181102362204724,
"grad_norm": 0.4582063555717468,
"learning_rate": 3.4908672062259487e-05,
"loss": 0.0308,
"step": 1570
},
{
"epoch": 6.2204724409448815,
"grad_norm": 0.5543438196182251,
"learning_rate": 3.428935573764005e-05,
"loss": 0.0319,
"step": 1580
},
{
"epoch": 6.259842519685039,
"grad_norm": 0.3909936249256134,
"learning_rate": 3.367270242402999e-05,
"loss": 0.0305,
"step": 1590
},
{
"epoch": 6.299212598425197,
"grad_norm": 0.4638426601886749,
"learning_rate": 3.30588166463957e-05,
"loss": 0.0298,
"step": 1600
},
{
"epoch": 6.338582677165355,
"grad_norm": 0.5119015574455261,
"learning_rate": 3.2447802460596124e-05,
"loss": 0.0299,
"step": 1610
},
{
"epoch": 6.377952755905512,
"grad_norm": 0.4361736476421356,
"learning_rate": 3.183976343574513e-05,
"loss": 0.0279,
"step": 1620
},
{
"epoch": 6.417322834645669,
"grad_norm": 0.7282920479774475,
"learning_rate": 3.123480263665597e-05,
"loss": 0.0305,
"step": 1630
},
{
"epoch": 6.456692913385827,
"grad_norm": 0.4011118710041046,
"learning_rate": 3.063302260637151e-05,
"loss": 0.0294,
"step": 1640
},
{
"epoch": 6.496062992125984,
"grad_norm": 0.44994768500328064,
"learning_rate": 3.0034525348782855e-05,
"loss": 0.0256,
"step": 1650
},
{
"epoch": 6.535433070866142,
"grad_norm": 0.4626915454864502,
"learning_rate": 2.9439412311339175e-05,
"loss": 0.039,
"step": 1660
},
{
"epoch": 6.574803149606299,
"grad_norm": 0.6001310348510742,
"learning_rate": 2.8847784367852184e-05,
"loss": 0.025,
"step": 1670
},
{
"epoch": 6.6141732283464565,
"grad_norm": 0.38421395421028137,
"learning_rate": 2.8259741801397477e-05,
"loss": 0.0373,
"step": 1680
},
{
"epoch": 6.653543307086614,
"grad_norm": 0.37881389260292053,
"learning_rate": 2.7675384287316363e-05,
"loss": 0.034,
"step": 1690
},
{
"epoch": 6.692913385826771,
"grad_norm": 0.43850505352020264,
"learning_rate": 2.709481087632041e-05,
"loss": 0.0367,
"step": 1700
},
{
"epoch": 6.73228346456693,
"grad_norm": 0.6985974907875061,
"learning_rate": 2.6518119977702e-05,
"loss": 0.0364,
"step": 1710
},
{
"epoch": 6.771653543307087,
"grad_norm": 0.4026467502117157,
"learning_rate": 2.5945409342653726e-05,
"loss": 0.0363,
"step": 1720
},
{
"epoch": 6.811023622047244,
"grad_norm": 0.36351197957992554,
"learning_rate": 2.5376776047698965e-05,
"loss": 0.03,
"step": 1730
},
{
"epoch": 6.850393700787402,
"grad_norm": 0.2879717946052551,
"learning_rate": 2.4812316478237353e-05,
"loss": 0.0309,
"step": 1740
},
{
"epoch": 6.889763779527559,
"grad_norm": 0.6626470685005188,
"learning_rate": 2.4252126312206873e-05,
"loss": 0.0431,
"step": 1750
},
{
"epoch": 6.929133858267717,
"grad_norm": 0.4203033447265625,
"learning_rate": 2.3696300503866204e-05,
"loss": 0.0276,
"step": 1760
},
{
"epoch": 6.968503937007874,
"grad_norm": 0.5207045078277588,
"learning_rate": 2.314493326769968e-05,
"loss": 0.0294,
"step": 1770
},
{
"epoch": 7.0078740157480315,
"grad_norm": 0.4823305308818817,
"learning_rate": 2.259811806244741e-05,
"loss": 0.044,
"step": 1780
},
{
"epoch": 7.047244094488189,
"grad_norm": 0.6036306023597717,
"learning_rate": 2.2055947575263912e-05,
"loss": 0.0283,
"step": 1790
},
{
"epoch": 7.086614173228346,
"grad_norm": 0.5445898175239563,
"learning_rate": 2.1518513706007155e-05,
"loss": 0.0299,
"step": 1800
},
{
"epoch": 7.125984251968504,
"grad_norm": 0.44029784202575684,
"learning_rate": 2.0985907551661206e-05,
"loss": 0.0349,
"step": 1810
},
{
"epoch": 7.165354330708661,
"grad_norm": 0.4250989258289337,
"learning_rate": 2.0458219390895106e-05,
"loss": 0.0301,
"step": 1820
},
{
"epoch": 7.2047244094488185,
"grad_norm": 0.31232279539108276,
"learning_rate": 1.9935538668760057e-05,
"loss": 0.0421,
"step": 1830
},
{
"epoch": 7.244094488188976,
"grad_norm": 0.5633496642112732,
"learning_rate": 1.9417953981528424e-05,
"loss": 0.03,
"step": 1840
},
{
"epoch": 7.283464566929134,
"grad_norm": 0.7122541666030884,
"learning_rate": 1.890555306167619e-05,
"loss": 0.0343,
"step": 1850
},
{
"epoch": 7.322834645669292,
"grad_norm": 0.4240683615207672,
"learning_rate": 1.8398422763011985e-05,
"loss": 0.0244,
"step": 1860
},
{
"epoch": 7.362204724409449,
"grad_norm": 0.3466864824295044,
"learning_rate": 1.789664904595518e-05,
"loss": 0.0265,
"step": 1870
},
{
"epoch": 7.4015748031496065,
"grad_norm": 0.7498940229415894,
"learning_rate": 1.7400316962965087e-05,
"loss": 0.0303,
"step": 1880
},
{
"epoch": 7.440944881889764,
"grad_norm": 0.38159969449043274,
"learning_rate": 1.6909510644124455e-05,
"loss": 0.0261,
"step": 1890
},
{
"epoch": 7.480314960629921,
"grad_norm": 0.5678858160972595,
"learning_rate": 1.642431328287899e-05,
"loss": 0.035,
"step": 1900
},
{
"epoch": 7.519685039370079,
"grad_norm": 0.4290473461151123,
"learning_rate": 1.594480712193579e-05,
"loss": 0.026,
"step": 1910
},
{
"epoch": 7.559055118110236,
"grad_norm": 0.5522475242614746,
"learning_rate": 1.547107343932299e-05,
"loss": 0.029,
"step": 1920
},
{
"epoch": 7.5984251968503935,
"grad_norm": 0.41189444065093994,
"learning_rate": 1.5003192534612675e-05,
"loss": 0.0243,
"step": 1930
},
{
"epoch": 7.637795275590551,
"grad_norm": 0.1887262910604477,
"learning_rate": 1.4541243715310005e-05,
"loss": 0.0255,
"step": 1940
},
{
"epoch": 7.677165354330708,
"grad_norm": 0.5400164127349854,
"learning_rate": 1.4085305283410166e-05,
"loss": 0.0251,
"step": 1950
},
{
"epoch": 7.716535433070866,
"grad_norm": 0.5268674492835999,
"learning_rate": 1.3635454522125946e-05,
"loss": 0.036,
"step": 1960
},
{
"epoch": 7.755905511811024,
"grad_norm": 0.6157100200653076,
"learning_rate": 1.3191767682788003e-05,
"loss": 0.0266,
"step": 1970
},
{
"epoch": 7.7952755905511815,
"grad_norm": 0.3194139301776886,
"learning_rate": 1.2754319971919842e-05,
"loss": 0.0243,
"step": 1980
},
{
"epoch": 7.834645669291339,
"grad_norm": 0.37460631132125854,
"learning_rate": 1.2323185538490229e-05,
"loss": 0.0357,
"step": 1990
},
{
"epoch": 7.874015748031496,
"grad_norm": 0.37989112734794617,
"learning_rate": 1.1898437461344518e-05,
"loss": 0.0318,
"step": 2000
},
{
"epoch": 7.913385826771654,
"grad_norm": 0.42782655358314514,
"learning_rate": 1.1480147736817598e-05,
"loss": 0.0263,
"step": 2010
},
{
"epoch": 7.952755905511811,
"grad_norm": 0.5276915431022644,
"learning_rate": 1.1068387266530267e-05,
"loss": 0.025,
"step": 2020
},
{
"epoch": 7.9921259842519685,
"grad_norm": 0.6227043271064758,
"learning_rate": 1.0663225845371045e-05,
"loss": 0.0296,
"step": 2030
},
{
"epoch": 8.031496062992126,
"grad_norm": 0.2772444784641266,
"learning_rate": 1.026473214966584e-05,
"loss": 0.0346,
"step": 2040
},
{
"epoch": 8.070866141732283,
"grad_norm": 0.4647983908653259,
"learning_rate": 9.872973725536955e-06,
"loss": 0.0308,
"step": 2050
},
{
"epoch": 8.11023622047244,
"grad_norm": 0.32798609137535095,
"learning_rate": 9.488016977453807e-06,
"loss": 0.0248,
"step": 2060
},
{
"epoch": 8.149606299212598,
"grad_norm": 0.2955688238143921,
"learning_rate": 9.109927156977122e-06,
"loss": 0.0268,
"step": 2070
},
{
"epoch": 8.188976377952756,
"grad_norm": 0.3243492543697357,
"learning_rate": 8.738768351698574e-06,
"loss": 0.0273,
"step": 2080
},
{
"epoch": 8.228346456692913,
"grad_norm": 0.2228180319070816,
"learning_rate": 8.374603474377718e-06,
"loss": 0.0276,
"step": 2090
},
{
"epoch": 8.26771653543307,
"grad_norm": 0.4943491816520691,
"learning_rate": 8.017494252278019e-06,
"loss": 0.0338,
"step": 2100
},
{
"epoch": 8.307086614173228,
"grad_norm": 0.26574063301086426,
"learning_rate": 7.667501216703849e-06,
"loss": 0.0286,
"step": 2110
},
{
"epoch": 8.346456692913385,
"grad_norm": 0.561998188495636,
"learning_rate": 7.324683692740259e-06,
"loss": 0.0302,
"step": 2120
},
{
"epoch": 8.385826771653543,
"grad_norm": 0.5270190238952637,
"learning_rate": 6.989099789197112e-06,
"loss": 0.0309,
"step": 2130
},
{
"epoch": 8.4251968503937,
"grad_norm": 0.49484336376190186,
"learning_rate": 6.660806388759505e-06,
"loss": 0.0255,
"step": 2140
},
{
"epoch": 8.464566929133857,
"grad_norm": 0.5083401799201965,
"learning_rate": 6.339859138345838e-06,
"loss": 0.0253,
"step": 2150
},
{
"epoch": 8.503937007874015,
"grad_norm": 0.34489235281944275,
"learning_rate": 6.026312439675552e-06,
"loss": 0.0216,
"step": 2160
},
{
"epoch": 8.543307086614174,
"grad_norm": 0.41160085797309875,
"learning_rate": 5.720219440047797e-06,
"loss": 0.0168,
"step": 2170
},
{
"epoch": 8.582677165354331,
"grad_norm": 0.2923263609409332,
"learning_rate": 5.421632023332779e-06,
"loss": 0.0264,
"step": 2180
},
{
"epoch": 8.622047244094489,
"grad_norm": 0.522784411907196,
"learning_rate": 5.130600801177294e-06,
"loss": 0.0222,
"step": 2190
},
{
"epoch": 8.661417322834646,
"grad_norm": 0.413273423910141,
"learning_rate": 4.8471751044257995e-06,
"loss": 0.0312,
"step": 2200
},
{
"epoch": 8.700787401574804,
"grad_norm": 0.4753558039665222,
"learning_rate": 4.571402974758715e-06,
"loss": 0.0285,
"step": 2210
},
{
"epoch": 8.740157480314961,
"grad_norm": 0.36735445261001587,
"learning_rate": 4.303331156549162e-06,
"loss": 0.0303,
"step": 2220
},
{
"epoch": 8.779527559055119,
"grad_norm": 0.3742741048336029,
"learning_rate": 4.043005088939616e-06,
"loss": 0.0166,
"step": 2230
},
{
"epoch": 8.818897637795276,
"grad_norm": 0.2975374460220337,
"learning_rate": 3.7904688981398485e-06,
"loss": 0.0233,
"step": 2240
},
{
"epoch": 8.858267716535433,
"grad_norm": 0.4250410497188568,
"learning_rate": 3.5457653899473197e-06,
"loss": 0.0343,
"step": 2250
},
{
"epoch": 8.89763779527559,
"grad_norm": 0.41226938366889954,
"learning_rate": 3.3089360424914674e-06,
"loss": 0.0329,
"step": 2260
},
{
"epoch": 8.937007874015748,
"grad_norm": 0.18084678053855896,
"learning_rate": 3.080020999203026e-06,
"loss": 0.0195,
"step": 2270
},
{
"epoch": 8.976377952755906,
"grad_norm": 0.29392436146736145,
"learning_rate": 2.8590590620095336e-06,
"loss": 0.0257,
"step": 2280
},
{
"epoch": 9.015748031496063,
"grad_norm": 0.2687474489212036,
"learning_rate": 2.646087684758325e-06,
"loss": 0.0274,
"step": 2290
},
{
"epoch": 9.05511811023622,
"grad_norm": 0.1699579358100891,
"learning_rate": 2.4411429668679043e-06,
"loss": 0.0217,
"step": 2300
},
{
"epoch": 9.094488188976378,
"grad_norm": 0.2965308725833893,
"learning_rate": 2.2442596472089907e-06,
"loss": 0.0266,
"step": 2310
},
{
"epoch": 9.133858267716535,
"grad_norm": 0.29086264967918396,
"learning_rate": 2.0554710982161607e-06,
"loss": 0.0202,
"step": 2320
},
{
"epoch": 9.173228346456693,
"grad_norm": 0.525842010974884,
"learning_rate": 1.8748093202311078e-06,
"loss": 0.0252,
"step": 2330
},
{
"epoch": 9.21259842519685,
"grad_norm": 0.23323991894721985,
"learning_rate": 1.7023049360784193e-06,
"loss": 0.0257,
"step": 2340
},
{
"epoch": 9.251968503937007,
"grad_norm": 0.3511553704738617,
"learning_rate": 1.5379871858749784e-06,
"loss": 0.0341,
"step": 2350
},
{
"epoch": 9.291338582677165,
"grad_norm": 0.2637965977191925,
"learning_rate": 1.3818839220735792e-06,
"loss": 0.0207,
"step": 2360
},
{
"epoch": 9.330708661417322,
"grad_norm": 0.212178036570549,
"learning_rate": 1.2340216047418695e-06,
"loss": 0.0207,
"step": 2370
},
{
"epoch": 9.37007874015748,
"grad_norm": 0.38010916113853455,
"learning_rate": 1.094425297077295e-06,
"loss": 0.0239,
"step": 2380
},
{
"epoch": 9.409448818897637,
"grad_norm": 0.5460922718048096,
"learning_rate": 9.631186611587405e-07,
"loss": 0.0216,
"step": 2390
},
{
"epoch": 9.448818897637794,
"grad_norm": 0.3007522523403168,
"learning_rate": 8.401239539358008e-07,
"loss": 0.0301,
"step": 2400
},
{
"epoch": 9.488188976377952,
"grad_norm": 0.279300719499588,
"learning_rate": 7.254620234560583e-07,
"loss": 0.0313,
"step": 2410
},
{
"epoch": 9.527559055118111,
"grad_norm": 0.6319423913955688,
"learning_rate": 6.191523053313386e-07,
"loss": 0.0298,
"step": 2420
},
{
"epoch": 9.566929133858268,
"grad_norm": 0.5479516983032227,
"learning_rate": 5.212128194432509e-07,
"loss": 0.0222,
"step": 2430
},
{
"epoch": 9.606299212598426,
"grad_norm": 0.44871729612350464,
"learning_rate": 4.3166016688879205e-07,
"loss": 0.021,
"step": 2440
},
{
"epoch": 9.645669291338583,
"grad_norm": 0.5158320665359497,
"learning_rate": 3.505095271663705e-07,
"loss": 0.0174,
"step": 2450
},
{
"epoch": 9.68503937007874,
"grad_norm": 0.3390265107154846,
"learning_rate": 2.7777465560285265e-07,
"loss": 0.0224,
"step": 2460
},
{
"epoch": 9.724409448818898,
"grad_norm": 0.39901813864707947,
"learning_rate": 2.1346788102196148e-07,
"loss": 0.0181,
"step": 2470
},
{
"epoch": 9.763779527559056,
"grad_norm": 0.4687666893005371,
"learning_rate": 1.5760010365450938e-07,
"loss": 0.0213,
"step": 2480
},
{
"epoch": 9.803149606299213,
"grad_norm": 0.5264361500740051,
"learning_rate": 1.1018079329076503e-07,
"loss": 0.0249,
"step": 2490
},
{
"epoch": 9.84251968503937,
"grad_norm": 0.5656106472015381,
"learning_rate": 7.121798767530385e-08,
"loss": 0.0217,
"step": 2500
},
{
"epoch": 9.881889763779528,
"grad_norm": 0.3701815903186798,
"learning_rate": 4.071829114455361e-08,
"loss": 0.0198,
"step": 2510
},
{
"epoch": 9.921259842519685,
"grad_norm": 0.2982766628265381,
"learning_rate": 1.868687350736198e-08,
"loss": 0.0219,
"step": 2520
},
{
"epoch": 9.960629921259843,
"grad_norm": 0.3252989947795868,
"learning_rate": 5.1274691686697965e-09,
"loss": 0.021,
"step": 2530
},
{
"epoch": 10.0,
"grad_norm": 0.5611906051635742,
"learning_rate": 4.23764965562512e-11,
"loss": 0.0357,
"step": 2540
},
{
"epoch": 10.0,
"step": 2540,
"total_flos": 0.0,
"train_loss": 0.06096198053106548,
"train_runtime": 2181.3746,
"train_samples_per_second": 31.434,
"train_steps_per_second": 1.164
}
],
"logging_steps": 10,
"max_steps": 2540,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 27,
"trial_name": null,
"trial_params": null
}