{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.18053800324968405,
"eval_steps": 500,
"global_step": 20000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00045134500812421015,
"grad_norm": 4.092976093292236,
"learning_rate": 4.9977884094601916e-05,
"loss": 5.2971,
"step": 50
},
{
"epoch": 0.0009026900162484203,
"grad_norm": 3.788925886154175,
"learning_rate": 4.99553168441957e-05,
"loss": 5.2741,
"step": 100
},
{
"epoch": 0.0013540350243726304,
"grad_norm": 5.105148792266846,
"learning_rate": 4.9932749593789494e-05,
"loss": 5.0557,
"step": 150
},
{
"epoch": 0.0018053800324968406,
"grad_norm": 4.261813163757324,
"learning_rate": 4.991018234338328e-05,
"loss": 5.155,
"step": 200
},
{
"epoch": 0.0022567250406210506,
"grad_norm": 6.269433498382568,
"learning_rate": 4.988761509297707e-05,
"loss": 5.098,
"step": 250
},
{
"epoch": 0.002708070048745261,
"grad_norm": 4.494116306304932,
"learning_rate": 4.986504784257086e-05,
"loss": 5.0972,
"step": 300
},
{
"epoch": 0.003159415056869471,
"grad_norm": 3.811136484146118,
"learning_rate": 4.984248059216465e-05,
"loss": 5.0968,
"step": 350
},
{
"epoch": 0.003610760064993681,
"grad_norm": 5.116394996643066,
"learning_rate": 4.9819913341758444e-05,
"loss": 5.0053,
"step": 400
},
{
"epoch": 0.004062105073117891,
"grad_norm": 3.4574902057647705,
"learning_rate": 4.979734609135223e-05,
"loss": 5.0443,
"step": 450
},
{
"epoch": 0.004513450081242101,
"grad_norm": 3.924276113510132,
"learning_rate": 4.977477884094602e-05,
"loss": 4.9834,
"step": 500
},
{
"epoch": 0.004964795089366311,
"grad_norm": 3.120497226715088,
"learning_rate": 4.975221159053981e-05,
"loss": 4.9859,
"step": 550
},
{
"epoch": 0.005416140097490522,
"grad_norm": 5.467548847198486,
"learning_rate": 4.97296443401336e-05,
"loss": 5.0014,
"step": 600
},
{
"epoch": 0.005867485105614732,
"grad_norm": 4.165292739868164,
"learning_rate": 4.970707708972739e-05,
"loss": 4.9381,
"step": 650
},
{
"epoch": 0.006318830113738942,
"grad_norm": 5.348793029785156,
"learning_rate": 4.968450983932118e-05,
"loss": 4.8988,
"step": 700
},
{
"epoch": 0.006770175121863152,
"grad_norm": 5.445329189300537,
"learning_rate": 4.9661942588914965e-05,
"loss": 4.882,
"step": 750
},
{
"epoch": 0.007221520129987362,
"grad_norm": 3.731977939605713,
"learning_rate": 4.963937533850876e-05,
"loss": 4.8879,
"step": 800
},
{
"epoch": 0.007672865138111573,
"grad_norm": 4.9821343421936035,
"learning_rate": 4.9616808088102544e-05,
"loss": 4.8933,
"step": 850
},
{
"epoch": 0.008124210146235782,
"grad_norm": 6.4130401611328125,
"learning_rate": 4.9594240837696337e-05,
"loss": 4.8942,
"step": 900
},
{
"epoch": 0.008575555154359992,
"grad_norm": 5.44791841506958,
"learning_rate": 4.957167358729013e-05,
"loss": 4.816,
"step": 950
},
{
"epoch": 0.009026900162484202,
"grad_norm": 4.63847541809082,
"learning_rate": 4.9549106336883915e-05,
"loss": 4.8797,
"step": 1000
},
{
"epoch": 0.009478245170608413,
"grad_norm": 4.616076946258545,
"learning_rate": 4.952653908647771e-05,
"loss": 4.8622,
"step": 1050
},
{
"epoch": 0.009929590178732623,
"grad_norm": 4.847900390625,
"learning_rate": 4.9503971836071494e-05,
"loss": 4.8765,
"step": 1100
},
{
"epoch": 0.010380935186856833,
"grad_norm": 3.968596935272217,
"learning_rate": 4.9481404585665286e-05,
"loss": 4.7834,
"step": 1150
},
{
"epoch": 0.010832280194981043,
"grad_norm": 3.6416995525360107,
"learning_rate": 4.945883733525907e-05,
"loss": 4.8674,
"step": 1200
},
{
"epoch": 0.011283625203105253,
"grad_norm": 6.88565731048584,
"learning_rate": 4.9436270084852865e-05,
"loss": 4.809,
"step": 1250
},
{
"epoch": 0.011734970211229464,
"grad_norm": 5.064456462860107,
"learning_rate": 4.941370283444665e-05,
"loss": 4.8121,
"step": 1300
},
{
"epoch": 0.012186315219353674,
"grad_norm": 4.556407451629639,
"learning_rate": 4.939113558404044e-05,
"loss": 4.799,
"step": 1350
},
{
"epoch": 0.012637660227477884,
"grad_norm": 4.071566581726074,
"learning_rate": 4.936856833363423e-05,
"loss": 4.7471,
"step": 1400
},
{
"epoch": 0.013089005235602094,
"grad_norm": 4.647943019866943,
"learning_rate": 4.934600108322802e-05,
"loss": 4.7783,
"step": 1450
},
{
"epoch": 0.013540350243726304,
"grad_norm": 4.131853103637695,
"learning_rate": 4.9323433832821814e-05,
"loss": 4.8507,
"step": 1500
},
{
"epoch": 0.013991695251850515,
"grad_norm": 6.93862771987915,
"learning_rate": 4.93008665824156e-05,
"loss": 4.8,
"step": 1550
},
{
"epoch": 0.014443040259974725,
"grad_norm": 4.366854190826416,
"learning_rate": 4.927829933200939e-05,
"loss": 4.8794,
"step": 1600
},
{
"epoch": 0.014894385268098935,
"grad_norm": 3.989370822906494,
"learning_rate": 4.925573208160318e-05,
"loss": 4.7972,
"step": 1650
},
{
"epoch": 0.015345730276223145,
"grad_norm": 4.402428150177002,
"learning_rate": 4.923316483119697e-05,
"loss": 4.9,
"step": 1700
},
{
"epoch": 0.015797075284347355,
"grad_norm": 4.536413192749023,
"learning_rate": 4.921059758079076e-05,
"loss": 4.7663,
"step": 1750
},
{
"epoch": 0.016248420292471564,
"grad_norm": 6.875385284423828,
"learning_rate": 4.918803033038455e-05,
"loss": 4.8557,
"step": 1800
},
{
"epoch": 0.016699765300595776,
"grad_norm": 2.8038690090179443,
"learning_rate": 4.9165463079978336e-05,
"loss": 4.8403,
"step": 1850
},
{
"epoch": 0.017151110308719984,
"grad_norm": 4.83705997467041,
"learning_rate": 4.914289582957213e-05,
"loss": 4.8451,
"step": 1900
},
{
"epoch": 0.017602455316844196,
"grad_norm": 3.359116315841675,
"learning_rate": 4.9120328579165914e-05,
"loss": 4.8559,
"step": 1950
},
{
"epoch": 0.018053800324968405,
"grad_norm": 6.140733242034912,
"learning_rate": 4.909776132875971e-05,
"loss": 4.8633,
"step": 2000
},
{
"epoch": 0.018505145333092617,
"grad_norm": 4.224785327911377,
"learning_rate": 4.90751940783535e-05,
"loss": 4.835,
"step": 2050
},
{
"epoch": 0.018956490341216825,
"grad_norm": 3.613844394683838,
"learning_rate": 4.9052626827947285e-05,
"loss": 4.7293,
"step": 2100
},
{
"epoch": 0.019407835349341037,
"grad_norm": 5.848568439483643,
"learning_rate": 4.903005957754108e-05,
"loss": 4.6969,
"step": 2150
},
{
"epoch": 0.019859180357465245,
"grad_norm": 3.9656293392181396,
"learning_rate": 4.9007492327134864e-05,
"loss": 4.7284,
"step": 2200
},
{
"epoch": 0.020310525365589457,
"grad_norm": 5.00789213180542,
"learning_rate": 4.898492507672866e-05,
"loss": 4.865,
"step": 2250
},
{
"epoch": 0.020761870373713666,
"grad_norm": 4.17151403427124,
"learning_rate": 4.896235782632244e-05,
"loss": 4.7261,
"step": 2300
},
{
"epoch": 0.021213215381837878,
"grad_norm": 3.966817617416382,
"learning_rate": 4.8939790575916235e-05,
"loss": 4.7437,
"step": 2350
},
{
"epoch": 0.021664560389962086,
"grad_norm": 4.516706943511963,
"learning_rate": 4.891722332551002e-05,
"loss": 4.7352,
"step": 2400
},
{
"epoch": 0.022115905398086298,
"grad_norm": 4.184154033660889,
"learning_rate": 4.8894656075103814e-05,
"loss": 4.6622,
"step": 2450
},
{
"epoch": 0.022567250406210507,
"grad_norm": 6.0985188484191895,
"learning_rate": 4.8872088824697606e-05,
"loss": 4.7659,
"step": 2500
},
{
"epoch": 0.02301859541433472,
"grad_norm": 4.630510330200195,
"learning_rate": 4.884952157429139e-05,
"loss": 4.8424,
"step": 2550
},
{
"epoch": 0.023469940422458927,
"grad_norm": 4.261359214782715,
"learning_rate": 4.8826954323885185e-05,
"loss": 4.7576,
"step": 2600
},
{
"epoch": 0.02392128543058314,
"grad_norm": 4.511416435241699,
"learning_rate": 4.880438707347897e-05,
"loss": 4.7644,
"step": 2650
},
{
"epoch": 0.024372630438707348,
"grad_norm": 3.6945180892944336,
"learning_rate": 4.878181982307276e-05,
"loss": 4.7893,
"step": 2700
},
{
"epoch": 0.02482397544683156,
"grad_norm": 5.0419511795043945,
"learning_rate": 4.875925257266655e-05,
"loss": 4.6466,
"step": 2750
},
{
"epoch": 0.025275320454955768,
"grad_norm": 3.80349063873291,
"learning_rate": 4.873668532226034e-05,
"loss": 4.778,
"step": 2800
},
{
"epoch": 0.02572666546307998,
"grad_norm": 3.5543832778930664,
"learning_rate": 4.871411807185413e-05,
"loss": 4.6788,
"step": 2850
},
{
"epoch": 0.02617801047120419,
"grad_norm": 3.064133405685425,
"learning_rate": 4.869155082144792e-05,
"loss": 4.6796,
"step": 2900
},
{
"epoch": 0.0266293554793284,
"grad_norm": 3.449727773666382,
"learning_rate": 4.8668983571041706e-05,
"loss": 4.7513,
"step": 2950
},
{
"epoch": 0.02708070048745261,
"grad_norm": 3.831252098083496,
"learning_rate": 4.86464163206355e-05,
"loss": 4.6886,
"step": 3000
},
{
"epoch": 0.027532045495576817,
"grad_norm": 6.98654842376709,
"learning_rate": 4.862384907022929e-05,
"loss": 4.7468,
"step": 3050
},
{
"epoch": 0.02798339050370103,
"grad_norm": 3.842249870300293,
"learning_rate": 4.860128181982308e-05,
"loss": 4.6317,
"step": 3100
},
{
"epoch": 0.028434735511825238,
"grad_norm": 8.266908645629883,
"learning_rate": 4.857871456941686e-05,
"loss": 4.7426,
"step": 3150
},
{
"epoch": 0.02888608051994945,
"grad_norm": 5.496558666229248,
"learning_rate": 4.8556147319010656e-05,
"loss": 4.5784,
"step": 3200
},
{
"epoch": 0.029337425528073658,
"grad_norm": 4.078311920166016,
"learning_rate": 4.853358006860444e-05,
"loss": 4.6739,
"step": 3250
},
{
"epoch": 0.02978877053619787,
"grad_norm": 3.8962206840515137,
"learning_rate": 4.8511012818198234e-05,
"loss": 4.7384,
"step": 3300
},
{
"epoch": 0.03024011554432208,
"grad_norm": 3.655855178833008,
"learning_rate": 4.848844556779202e-05,
"loss": 4.7782,
"step": 3350
},
{
"epoch": 0.03069146055244629,
"grad_norm": 3.840287446975708,
"learning_rate": 4.846587831738581e-05,
"loss": 4.6969,
"step": 3400
},
{
"epoch": 0.0311428055605705,
"grad_norm": 3.54238224029541,
"learning_rate": 4.84433110669796e-05,
"loss": 4.7998,
"step": 3450
},
{
"epoch": 0.03159415056869471,
"grad_norm": 6.432263374328613,
"learning_rate": 4.842074381657339e-05,
"loss": 4.7554,
"step": 3500
},
{
"epoch": 0.03204549557681892,
"grad_norm": 4.151718616485596,
"learning_rate": 4.839817656616718e-05,
"loss": 4.7455,
"step": 3550
},
{
"epoch": 0.03249684058494313,
"grad_norm": 3.6925272941589355,
"learning_rate": 4.837560931576097e-05,
"loss": 4.7143,
"step": 3600
},
{
"epoch": 0.03294818559306734,
"grad_norm": 5.515355110168457,
"learning_rate": 4.8353042065354756e-05,
"loss": 4.6842,
"step": 3650
},
{
"epoch": 0.03339953060119155,
"grad_norm": 4.059805393218994,
"learning_rate": 4.833047481494855e-05,
"loss": 4.6799,
"step": 3700
},
{
"epoch": 0.03385087560931576,
"grad_norm": 4.311253547668457,
"learning_rate": 4.8307907564542334e-05,
"loss": 4.7937,
"step": 3750
},
{
"epoch": 0.03430222061743997,
"grad_norm": 3.7470786571502686,
"learning_rate": 4.828534031413613e-05,
"loss": 4.6706,
"step": 3800
},
{
"epoch": 0.034753565625564184,
"grad_norm": 3.432297468185425,
"learning_rate": 4.826277306372991e-05,
"loss": 4.6627,
"step": 3850
},
{
"epoch": 0.03520491063368839,
"grad_norm": 2.6612203121185303,
"learning_rate": 4.8240205813323706e-05,
"loss": 4.6027,
"step": 3900
},
{
"epoch": 0.0356562556418126,
"grad_norm": 5.329100131988525,
"learning_rate": 4.821763856291749e-05,
"loss": 4.7471,
"step": 3950
},
{
"epoch": 0.03610760064993681,
"grad_norm": 3.7262275218963623,
"learning_rate": 4.8195071312511284e-05,
"loss": 4.6032,
"step": 4000
},
{
"epoch": 0.036558945658061025,
"grad_norm": 4.605144500732422,
"learning_rate": 4.817250406210507e-05,
"loss": 4.8456,
"step": 4050
},
{
"epoch": 0.03701029066618523,
"grad_norm": 3.8024492263793945,
"learning_rate": 4.814993681169886e-05,
"loss": 4.7256,
"step": 4100
},
{
"epoch": 0.03746163567430944,
"grad_norm": 7.693057060241699,
"learning_rate": 4.8127369561292655e-05,
"loss": 4.6224,
"step": 4150
},
{
"epoch": 0.03791298068243365,
"grad_norm": 4.100279808044434,
"learning_rate": 4.810480231088644e-05,
"loss": 4.7119,
"step": 4200
},
{
"epoch": 0.038364325690557866,
"grad_norm": 4.8026347160339355,
"learning_rate": 4.8082235060480234e-05,
"loss": 4.57,
"step": 4250
},
{
"epoch": 0.038815670698682074,
"grad_norm": 5.2641119956970215,
"learning_rate": 4.805966781007402e-05,
"loss": 4.5887,
"step": 4300
},
{
"epoch": 0.03926701570680628,
"grad_norm": 2.8225934505462646,
"learning_rate": 4.803710055966781e-05,
"loss": 4.6981,
"step": 4350
},
{
"epoch": 0.03971836071493049,
"grad_norm": 3.3784983158111572,
"learning_rate": 4.80145333092616e-05,
"loss": 4.6752,
"step": 4400
},
{
"epoch": 0.040169705723054706,
"grad_norm": 5.6406426429748535,
"learning_rate": 4.799196605885539e-05,
"loss": 4.6396,
"step": 4450
},
{
"epoch": 0.040621050731178915,
"grad_norm": 4.564062595367432,
"learning_rate": 4.7969398808449177e-05,
"loss": 4.6158,
"step": 4500
},
{
"epoch": 0.04107239573930312,
"grad_norm": 3.6431472301483154,
"learning_rate": 4.794683155804297e-05,
"loss": 4.6883,
"step": 4550
},
{
"epoch": 0.04152374074742733,
"grad_norm": 5.026195526123047,
"learning_rate": 4.792426430763676e-05,
"loss": 4.7225,
"step": 4600
},
{
"epoch": 0.04197508575555154,
"grad_norm": 4.776146411895752,
"learning_rate": 4.790169705723055e-05,
"loss": 4.6596,
"step": 4650
},
{
"epoch": 0.042426430763675756,
"grad_norm": 4.838674545288086,
"learning_rate": 4.787912980682434e-05,
"loss": 4.6576,
"step": 4700
},
{
"epoch": 0.042877775771799964,
"grad_norm": 4.529509544372559,
"learning_rate": 4.7856562556418126e-05,
"loss": 4.6721,
"step": 4750
},
{
"epoch": 0.04332912077992417,
"grad_norm": 4.392935752868652,
"learning_rate": 4.783399530601192e-05,
"loss": 4.701,
"step": 4800
},
{
"epoch": 0.04378046578804838,
"grad_norm": 4.331223011016846,
"learning_rate": 4.7811428055605705e-05,
"loss": 4.6795,
"step": 4850
},
{
"epoch": 0.044231810796172596,
"grad_norm": 4.109352111816406,
"learning_rate": 4.77888608051995e-05,
"loss": 4.5997,
"step": 4900
},
{
"epoch": 0.044683155804296805,
"grad_norm": 3.7418441772460938,
"learning_rate": 4.776629355479328e-05,
"loss": 4.6427,
"step": 4950
},
{
"epoch": 0.04513450081242101,
"grad_norm": 3.0237081050872803,
"learning_rate": 4.7743726304387076e-05,
"loss": 4.7359,
"step": 5000
},
{
"epoch": 0.04558584582054522,
"grad_norm": 3.9886231422424316,
"learning_rate": 4.772115905398086e-05,
"loss": 4.5842,
"step": 5050
},
{
"epoch": 0.04603719082866944,
"grad_norm": 4.597533226013184,
"learning_rate": 4.7698591803574654e-05,
"loss": 4.7202,
"step": 5100
},
{
"epoch": 0.046488535836793646,
"grad_norm": 4.520393371582031,
"learning_rate": 4.767602455316845e-05,
"loss": 4.5774,
"step": 5150
},
{
"epoch": 0.046939880844917854,
"grad_norm": 3.2824018001556396,
"learning_rate": 4.765345730276223e-05,
"loss": 4.6084,
"step": 5200
},
{
"epoch": 0.04739122585304206,
"grad_norm": 6.290219783782959,
"learning_rate": 4.7630890052356026e-05,
"loss": 4.6361,
"step": 5250
},
{
"epoch": 0.04784257086116628,
"grad_norm": 4.844172954559326,
"learning_rate": 4.760832280194981e-05,
"loss": 4.6252,
"step": 5300
},
{
"epoch": 0.04829391586929049,
"grad_norm": 4.8328962326049805,
"learning_rate": 4.7585755551543604e-05,
"loss": 4.5557,
"step": 5350
},
{
"epoch": 0.048745260877414695,
"grad_norm": 4.386012077331543,
"learning_rate": 4.756318830113739e-05,
"loss": 4.6911,
"step": 5400
},
{
"epoch": 0.049196605885538904,
"grad_norm": 4.393270969390869,
"learning_rate": 4.754062105073118e-05,
"loss": 4.4869,
"step": 5450
},
{
"epoch": 0.04964795089366312,
"grad_norm": 3.9346606731414795,
"learning_rate": 4.751805380032497e-05,
"loss": 4.608,
"step": 5500
},
{
"epoch": 0.05009929590178733,
"grad_norm": 5.140569686889648,
"learning_rate": 4.749548654991876e-05,
"loss": 4.6262,
"step": 5550
},
{
"epoch": 0.050550640909911536,
"grad_norm": 3.2936654090881348,
"learning_rate": 4.747291929951255e-05,
"loss": 4.6565,
"step": 5600
},
{
"epoch": 0.051001985918035744,
"grad_norm": 3.5564124584198,
"learning_rate": 4.745035204910634e-05,
"loss": 4.5727,
"step": 5650
},
{
"epoch": 0.05145333092615996,
"grad_norm": 3.9385626316070557,
"learning_rate": 4.742778479870013e-05,
"loss": 4.7165,
"step": 5700
},
{
"epoch": 0.05190467593428417,
"grad_norm": 3.736527681350708,
"learning_rate": 4.740521754829392e-05,
"loss": 4.6504,
"step": 5750
},
{
"epoch": 0.05235602094240838,
"grad_norm": 3.3729724884033203,
"learning_rate": 4.738265029788771e-05,
"loss": 4.7029,
"step": 5800
},
{
"epoch": 0.052807365950532585,
"grad_norm": 2.953383445739746,
"learning_rate": 4.73600830474815e-05,
"loss": 4.5483,
"step": 5850
},
{
"epoch": 0.0532587109586568,
"grad_norm": 4.406127452850342,
"learning_rate": 4.733751579707529e-05,
"loss": 4.6443,
"step": 5900
},
{
"epoch": 0.05371005596678101,
"grad_norm": 2.935302495956421,
"learning_rate": 4.7314948546669075e-05,
"loss": 4.61,
"step": 5950
},
{
"epoch": 0.05416140097490522,
"grad_norm": 4.362770080566406,
"learning_rate": 4.729238129626287e-05,
"loss": 4.5821,
"step": 6000
},
{
"epoch": 0.054612745983029426,
"grad_norm": 3.588181972503662,
"learning_rate": 4.7269814045856654e-05,
"loss": 4.6317,
"step": 6050
},
{
"epoch": 0.055064090991153634,
"grad_norm": 2.7238504886627197,
"learning_rate": 4.7247246795450446e-05,
"loss": 4.6867,
"step": 6100
},
{
"epoch": 0.05551543599927785,
"grad_norm": 3.66497802734375,
"learning_rate": 4.722467954504423e-05,
"loss": 4.557,
"step": 6150
},
{
"epoch": 0.05596678100740206,
"grad_norm": 3.9344165325164795,
"learning_rate": 4.7202112294638025e-05,
"loss": 4.5099,
"step": 6200
},
{
"epoch": 0.05641812601552627,
"grad_norm": 3.919712781906128,
"learning_rate": 4.717954504423182e-05,
"loss": 4.6521,
"step": 6250
},
{
"epoch": 0.056869471023650475,
"grad_norm": 6.165071964263916,
"learning_rate": 4.71569777938256e-05,
"loss": 4.6656,
"step": 6300
},
{
"epoch": 0.05732081603177469,
"grad_norm": 3.976167917251587,
"learning_rate": 4.7134410543419396e-05,
"loss": 4.6891,
"step": 6350
},
{
"epoch": 0.0577721610398989,
"grad_norm": 3.4293136596679688,
"learning_rate": 4.711184329301318e-05,
"loss": 4.6213,
"step": 6400
},
{
"epoch": 0.05822350604802311,
"grad_norm": 3.062398910522461,
"learning_rate": 4.7089276042606975e-05,
"loss": 4.6794,
"step": 6450
},
{
"epoch": 0.058674851056147316,
"grad_norm": 3.9836747646331787,
"learning_rate": 4.706670879220076e-05,
"loss": 4.6722,
"step": 6500
},
{
"epoch": 0.05912619606427153,
"grad_norm": 4.0859246253967285,
"learning_rate": 4.704414154179455e-05,
"loss": 4.6909,
"step": 6550
},
{
"epoch": 0.05957754107239574,
"grad_norm": 4.478472709655762,
"learning_rate": 4.702157429138834e-05,
"loss": 4.4942,
"step": 6600
},
{
"epoch": 0.06002888608051995,
"grad_norm": 5.508967399597168,
"learning_rate": 4.699900704098213e-05,
"loss": 4.6658,
"step": 6650
},
{
"epoch": 0.06048023108864416,
"grad_norm": 3.933199644088745,
"learning_rate": 4.697643979057592e-05,
"loss": 4.5696,
"step": 6700
},
{
"epoch": 0.06093157609676837,
"grad_norm": 3.0764100551605225,
"learning_rate": 4.695387254016971e-05,
"loss": 4.7047,
"step": 6750
},
{
"epoch": 0.06138292110489258,
"grad_norm": 3.0718812942504883,
"learning_rate": 4.69313052897635e-05,
"loss": 4.6213,
"step": 6800
},
{
"epoch": 0.06183426611301679,
"grad_norm": 3.2949626445770264,
"learning_rate": 4.690873803935729e-05,
"loss": 4.5174,
"step": 6850
},
{
"epoch": 0.062285611121141,
"grad_norm": 3.5119667053222656,
"learning_rate": 4.688617078895108e-05,
"loss": 4.6313,
"step": 6900
},
{
"epoch": 0.0627369561292652,
"grad_norm": 3.8293747901916504,
"learning_rate": 4.686360353854487e-05,
"loss": 4.6896,
"step": 6950
},
{
"epoch": 0.06318830113738942,
"grad_norm": 3.223698139190674,
"learning_rate": 4.684103628813866e-05,
"loss": 4.6462,
"step": 7000
},
{
"epoch": 0.06363964614551364,
"grad_norm": 3.7061171531677246,
"learning_rate": 4.6818469037732446e-05,
"loss": 4.5423,
"step": 7050
},
{
"epoch": 0.06409099115363784,
"grad_norm": 3.9031214714050293,
"learning_rate": 4.679590178732624e-05,
"loss": 4.6688,
"step": 7100
},
{
"epoch": 0.06454233616176205,
"grad_norm": 11.581488609313965,
"learning_rate": 4.6773334536920024e-05,
"loss": 4.6832,
"step": 7150
},
{
"epoch": 0.06499368116988626,
"grad_norm": 3.9187841415405273,
"learning_rate": 4.675076728651382e-05,
"loss": 4.6451,
"step": 7200
},
{
"epoch": 0.06544502617801047,
"grad_norm": 3.8191521167755127,
"learning_rate": 4.67282000361076e-05,
"loss": 4.5677,
"step": 7250
},
{
"epoch": 0.06589637118613469,
"grad_norm": 3.5511984825134277,
"learning_rate": 4.6705632785701395e-05,
"loss": 4.5412,
"step": 7300
},
{
"epoch": 0.06634771619425889,
"grad_norm": 4.853089809417725,
"learning_rate": 4.668306553529518e-05,
"loss": 4.6636,
"step": 7350
},
{
"epoch": 0.0667990612023831,
"grad_norm": 2.9507358074188232,
"learning_rate": 4.6660498284888974e-05,
"loss": 4.6232,
"step": 7400
},
{
"epoch": 0.06725040621050732,
"grad_norm": 4.20766019821167,
"learning_rate": 4.663793103448276e-05,
"loss": 4.6429,
"step": 7450
},
{
"epoch": 0.06770175121863152,
"grad_norm": 2.9639532566070557,
"learning_rate": 4.6615363784076546e-05,
"loss": 4.5613,
"step": 7500
},
{
"epoch": 0.06815309622675574,
"grad_norm": 4.452625751495361,
"learning_rate": 4.659279653367034e-05,
"loss": 4.7034,
"step": 7550
},
{
"epoch": 0.06860444123487994,
"grad_norm": 4.076809883117676,
"learning_rate": 4.6570229283264124e-05,
"loss": 4.6244,
"step": 7600
},
{
"epoch": 0.06905578624300415,
"grad_norm": 3.361752510070801,
"learning_rate": 4.654766203285792e-05,
"loss": 4.6122,
"step": 7650
},
{
"epoch": 0.06950713125112837,
"grad_norm": 2.9916162490844727,
"learning_rate": 4.65250947824517e-05,
"loss": 4.5939,
"step": 7700
},
{
"epoch": 0.06995847625925257,
"grad_norm": 4.1875200271606445,
"learning_rate": 4.6502527532045495e-05,
"loss": 4.5255,
"step": 7750
},
{
"epoch": 0.07040982126737678,
"grad_norm": 2.9376866817474365,
"learning_rate": 4.647996028163929e-05,
"loss": 4.6222,
"step": 7800
},
{
"epoch": 0.07086116627550099,
"grad_norm": 3.77079176902771,
"learning_rate": 4.6457393031233074e-05,
"loss": 4.5671,
"step": 7850
},
{
"epoch": 0.0713125112836252,
"grad_norm": 6.709794044494629,
"learning_rate": 4.6434825780826866e-05,
"loss": 4.493,
"step": 7900
},
{
"epoch": 0.07176385629174942,
"grad_norm": 4.273845195770264,
"learning_rate": 4.641225853042065e-05,
"loss": 4.6503,
"step": 7950
},
{
"epoch": 0.07221520129987362,
"grad_norm": 3.1263434886932373,
"learning_rate": 4.6389691280014445e-05,
"loss": 4.6878,
"step": 8000
},
{
"epoch": 0.07266654630799783,
"grad_norm": 4.049619674682617,
"learning_rate": 4.636712402960823e-05,
"loss": 4.6728,
"step": 8050
},
{
"epoch": 0.07311789131612205,
"grad_norm": 4.419615745544434,
"learning_rate": 4.6344556779202023e-05,
"loss": 4.5648,
"step": 8100
},
{
"epoch": 0.07356923632424625,
"grad_norm": 4.067174911499023,
"learning_rate": 4.632198952879581e-05,
"loss": 4.6475,
"step": 8150
},
{
"epoch": 0.07402058133237047,
"grad_norm": 3.8273239135742188,
"learning_rate": 4.62994222783896e-05,
"loss": 4.4821,
"step": 8200
},
{
"epoch": 0.07447192634049467,
"grad_norm": 2.988802433013916,
"learning_rate": 4.627685502798339e-05,
"loss": 4.4786,
"step": 8250
},
{
"epoch": 0.07492327134861888,
"grad_norm": 4.000159740447998,
"learning_rate": 4.625428777757718e-05,
"loss": 4.6493,
"step": 8300
},
{
"epoch": 0.0753746163567431,
"grad_norm": 4.026582717895508,
"learning_rate": 4.623172052717097e-05,
"loss": 4.6096,
"step": 8350
},
{
"epoch": 0.0758259613648673,
"grad_norm": 3.3265931606292725,
"learning_rate": 4.620915327676476e-05,
"loss": 4.5148,
"step": 8400
},
{
"epoch": 0.07627730637299152,
"grad_norm": 3.2252328395843506,
"learning_rate": 4.618658602635855e-05,
"loss": 4.6038,
"step": 8450
},
{
"epoch": 0.07672865138111573,
"grad_norm": 3.4897453784942627,
"learning_rate": 4.616401877595234e-05,
"loss": 4.6121,
"step": 8500
},
{
"epoch": 0.07717999638923993,
"grad_norm": 3.3298215866088867,
"learning_rate": 4.614145152554613e-05,
"loss": 4.5457,
"step": 8550
},
{
"epoch": 0.07763134139736415,
"grad_norm": 3.875998020172119,
"learning_rate": 4.6118884275139916e-05,
"loss": 4.5236,
"step": 8600
},
{
"epoch": 0.07808268640548835,
"grad_norm": 3.5962016582489014,
"learning_rate": 4.609631702473371e-05,
"loss": 4.528,
"step": 8650
},
{
"epoch": 0.07853403141361257,
"grad_norm": 2.4850423336029053,
"learning_rate": 4.6073749774327494e-05,
"loss": 4.5441,
"step": 8700
},
{
"epoch": 0.07898537642173678,
"grad_norm": 2.6482949256896973,
"learning_rate": 4.605118252392129e-05,
"loss": 4.5235,
"step": 8750
},
{
"epoch": 0.07943672142986098,
"grad_norm": 3.3628525733947754,
"learning_rate": 4.602861527351507e-05,
"loss": 4.6373,
"step": 8800
},
{
"epoch": 0.0798880664379852,
"grad_norm": 3.0251526832580566,
"learning_rate": 4.6006048023108866e-05,
"loss": 4.3972,
"step": 8850
},
{
"epoch": 0.08033941144610941,
"grad_norm": 3.8248074054718018,
"learning_rate": 4.598348077270266e-05,
"loss": 4.5324,
"step": 8900
},
{
"epoch": 0.08079075645423361,
"grad_norm": 3.5319507122039795,
"learning_rate": 4.5960913522296444e-05,
"loss": 4.6163,
"step": 8950
},
{
"epoch": 0.08124210146235783,
"grad_norm": 5.563832759857178,
"learning_rate": 4.593834627189024e-05,
"loss": 4.5826,
"step": 9000
},
{
"epoch": 0.08169344647048203,
"grad_norm": 3.98085355758667,
"learning_rate": 4.591577902148402e-05,
"loss": 4.6515,
"step": 9050
},
{
"epoch": 0.08214479147860625,
"grad_norm": 6.063210964202881,
"learning_rate": 4.5893211771077815e-05,
"loss": 4.5158,
"step": 9100
},
{
"epoch": 0.08259613648673046,
"grad_norm": 3.957599401473999,
"learning_rate": 4.58706445206716e-05,
"loss": 4.5528,
"step": 9150
},
{
"epoch": 0.08304748149485466,
"grad_norm": 3.1111884117126465,
"learning_rate": 4.5848077270265394e-05,
"loss": 4.5484,
"step": 9200
},
{
"epoch": 0.08349882650297888,
"grad_norm": 4.1915059089660645,
"learning_rate": 4.582551001985918e-05,
"loss": 4.595,
"step": 9250
},
{
"epoch": 0.08395017151110308,
"grad_norm": 4.1448140144348145,
"learning_rate": 4.580294276945297e-05,
"loss": 4.6259,
"step": 9300
},
{
"epoch": 0.0844015165192273,
"grad_norm": 3.6308369636535645,
"learning_rate": 4.5780375519046765e-05,
"loss": 4.6703,
"step": 9350
},
{
"epoch": 0.08485286152735151,
"grad_norm": 6.079587459564209,
"learning_rate": 4.575780826864055e-05,
"loss": 4.6145,
"step": 9400
},
{
"epoch": 0.08530420653547571,
"grad_norm": 3.5566651821136475,
"learning_rate": 4.5735241018234343e-05,
"loss": 4.5084,
"step": 9450
},
{
"epoch": 0.08575555154359993,
"grad_norm": 4.733799934387207,
"learning_rate": 4.571267376782813e-05,
"loss": 4.5918,
"step": 9500
},
{
"epoch": 0.08620689655172414,
"grad_norm": 3.1966097354888916,
"learning_rate": 4.569010651742192e-05,
"loss": 4.4592,
"step": 9550
},
{
"epoch": 0.08665824155984835,
"grad_norm": 3.9291093349456787,
"learning_rate": 4.566753926701571e-05,
"loss": 4.5673,
"step": 9600
},
{
"epoch": 0.08710958656797256,
"grad_norm": 5.446611404418945,
"learning_rate": 4.56449720166095e-05,
"loss": 4.6176,
"step": 9650
},
{
"epoch": 0.08756093157609676,
"grad_norm": 3.054124355316162,
"learning_rate": 4.5622404766203286e-05,
"loss": 4.6921,
"step": 9700
},
{
"epoch": 0.08801227658422098,
"grad_norm": 3.27416729927063,
"learning_rate": 4.559983751579708e-05,
"loss": 4.5667,
"step": 9750
},
{
"epoch": 0.08846362159234519,
"grad_norm": 3.577589273452759,
"learning_rate": 4.5577270265390865e-05,
"loss": 4.442,
"step": 9800
},
{
"epoch": 0.0889149666004694,
"grad_norm": 3.566028118133545,
"learning_rate": 4.555470301498466e-05,
"loss": 4.6025,
"step": 9850
},
{
"epoch": 0.08936631160859361,
"grad_norm": 4.064197540283203,
"learning_rate": 4.553213576457845e-05,
"loss": 4.5812,
"step": 9900
},
{
"epoch": 0.08981765661671783,
"grad_norm": 4.237987041473389,
"learning_rate": 4.5509568514172236e-05,
"loss": 4.6083,
"step": 9950
},
{
"epoch": 0.09026900162484203,
"grad_norm": 3.0101680755615234,
"learning_rate": 4.548700126376603e-05,
"loss": 4.6099,
"step": 10000
},
{
"epoch": 0.09072034663296624,
"grad_norm": 3.5102596282958984,
"learning_rate": 4.5464434013359815e-05,
"loss": 4.5643,
"step": 10050
},
{
"epoch": 0.09117169164109044,
"grad_norm": 4.774995803833008,
"learning_rate": 4.544186676295361e-05,
"loss": 4.5663,
"step": 10100
},
{
"epoch": 0.09162303664921466,
"grad_norm": 3.963777780532837,
"learning_rate": 4.541929951254739e-05,
"loss": 4.5603,
"step": 10150
},
{
"epoch": 0.09207438165733887,
"grad_norm": 2.888615846633911,
"learning_rate": 4.5396732262141186e-05,
"loss": 4.5396,
"step": 10200
},
{
"epoch": 0.09252572666546308,
"grad_norm": 4.281205177307129,
"learning_rate": 4.537416501173497e-05,
"loss": 4.5725,
"step": 10250
},
{
"epoch": 0.09297707167358729,
"grad_norm": 4.1528472900390625,
"learning_rate": 4.5351597761328764e-05,
"loss": 4.6262,
"step": 10300
},
{
"epoch": 0.0934284166817115,
"grad_norm": 3.966341972351074,
"learning_rate": 4.532903051092255e-05,
"loss": 4.5871,
"step": 10350
},
{
"epoch": 0.09387976168983571,
"grad_norm": 3.1821911334991455,
"learning_rate": 4.530646326051634e-05,
"loss": 4.4732,
"step": 10400
},
{
"epoch": 0.09433110669795992,
"grad_norm": 5.116222858428955,
"learning_rate": 4.5283896010110135e-05,
"loss": 4.6084,
"step": 10450
},
{
"epoch": 0.09478245170608413,
"grad_norm": 5.254827976226807,
"learning_rate": 4.526132875970392e-05,
"loss": 4.5696,
"step": 10500
},
{
"epoch": 0.09523379671420834,
"grad_norm": 3.6102991104125977,
"learning_rate": 4.5238761509297714e-05,
"loss": 4.5971,
"step": 10550
},
{
"epoch": 0.09568514172233256,
"grad_norm": 3.348236322402954,
"learning_rate": 4.52161942588915e-05,
"loss": 4.5048,
"step": 10600
},
{
"epoch": 0.09613648673045676,
"grad_norm": 3.1192493438720703,
"learning_rate": 4.519362700848529e-05,
"loss": 4.571,
"step": 10650
},
{
"epoch": 0.09658783173858097,
"grad_norm": 2.9626996517181396,
"learning_rate": 4.517105975807908e-05,
"loss": 4.4975,
"step": 10700
},
{
"epoch": 0.09703917674670517,
"grad_norm": 3.4130876064300537,
"learning_rate": 4.514849250767287e-05,
"loss": 4.6279,
"step": 10750
},
{
"epoch": 0.09749052175482939,
"grad_norm": 2.4458179473876953,
"learning_rate": 4.512592525726666e-05,
"loss": 4.511,
"step": 10800
},
{
"epoch": 0.0979418667629536,
"grad_norm": 5.223287105560303,
"learning_rate": 4.510335800686045e-05,
"loss": 4.5435,
"step": 10850
},
{
"epoch": 0.09839321177107781,
"grad_norm": 4.481621742248535,
"learning_rate": 4.5080790756454235e-05,
"loss": 4.5177,
"step": 10900
},
{
"epoch": 0.09884455677920202,
"grad_norm": 2.959305763244629,
"learning_rate": 4.505822350604803e-05,
"loss": 4.5746,
"step": 10950
},
{
"epoch": 0.09929590178732624,
"grad_norm": 6.753904342651367,
"learning_rate": 4.5035656255641814e-05,
"loss": 4.5424,
"step": 11000
},
{
"epoch": 0.09974724679545044,
"grad_norm": 3.4904415607452393,
"learning_rate": 4.5013089005235606e-05,
"loss": 4.5661,
"step": 11050
},
{
"epoch": 0.10019859180357465,
"grad_norm": 3.316413164138794,
"learning_rate": 4.499052175482939e-05,
"loss": 4.5242,
"step": 11100
},
{
"epoch": 0.10064993681169886,
"grad_norm": 3.974198579788208,
"learning_rate": 4.4967954504423185e-05,
"loss": 4.5251,
"step": 11150
},
{
"epoch": 0.10110128181982307,
"grad_norm": 4.306400775909424,
"learning_rate": 4.494538725401697e-05,
"loss": 4.5209,
"step": 11200
},
{
"epoch": 0.10155262682794729,
"grad_norm": 4.841123104095459,
"learning_rate": 4.4922820003610764e-05,
"loss": 4.5895,
"step": 11250
},
{
"epoch": 0.10200397183607149,
"grad_norm": 3.6396520137786865,
"learning_rate": 4.490025275320455e-05,
"loss": 4.5788,
"step": 11300
},
{
"epoch": 0.1024553168441957,
"grad_norm": 3.500455379486084,
"learning_rate": 4.487768550279834e-05,
"loss": 4.4844,
"step": 11350
},
{
"epoch": 0.10290666185231992,
"grad_norm": 4.19438362121582,
"learning_rate": 4.485511825239213e-05,
"loss": 4.5723,
"step": 11400
},
{
"epoch": 0.10335800686044412,
"grad_norm": 3.513514995574951,
"learning_rate": 4.483255100198592e-05,
"loss": 4.6457,
"step": 11450
},
{
"epoch": 0.10380935186856834,
"grad_norm": 3.5381104946136475,
"learning_rate": 4.4809983751579706e-05,
"loss": 4.4716,
"step": 11500
},
{
"epoch": 0.10426069687669254,
"grad_norm": 4.183605194091797,
"learning_rate": 4.47874165011735e-05,
"loss": 4.5554,
"step": 11550
},
{
"epoch": 0.10471204188481675,
"grad_norm": 3.838669538497925,
"learning_rate": 4.4764849250767285e-05,
"loss": 4.5354,
"step": 11600
},
{
"epoch": 0.10516338689294097,
"grad_norm": 3.651357889175415,
"learning_rate": 4.474228200036108e-05,
"loss": 4.5784,
"step": 11650
},
{
"epoch": 0.10561473190106517,
"grad_norm": 3.6753928661346436,
"learning_rate": 4.4719714749954863e-05,
"loss": 4.5484,
"step": 11700
},
{
"epoch": 0.10606607690918939,
"grad_norm": 4.5028228759765625,
"learning_rate": 4.4697147499548656e-05,
"loss": 4.5781,
"step": 11750
},
{
"epoch": 0.1065174219173136,
"grad_norm": 7.304862022399902,
"learning_rate": 4.467458024914244e-05,
"loss": 4.5041,
"step": 11800
},
{
"epoch": 0.1069687669254378,
"grad_norm": 4.280136585235596,
"learning_rate": 4.4652012998736235e-05,
"loss": 4.6027,
"step": 11850
},
{
"epoch": 0.10742011193356202,
"grad_norm": 3.6763241291046143,
"learning_rate": 4.462944574833002e-05,
"loss": 4.5593,
"step": 11900
},
{
"epoch": 0.10787145694168622,
"grad_norm": 3.8541440963745117,
"learning_rate": 4.460687849792381e-05,
"loss": 4.5144,
"step": 11950
},
{
"epoch": 0.10832280194981043,
"grad_norm": 2.8991189002990723,
"learning_rate": 4.4584311247517606e-05,
"loss": 4.6064,
"step": 12000
},
{
"epoch": 0.10877414695793465,
"grad_norm": 2.928452491760254,
"learning_rate": 4.456174399711139e-05,
"loss": 4.5631,
"step": 12050
},
{
"epoch": 0.10922549196605885,
"grad_norm": 3.3975236415863037,
"learning_rate": 4.4539176746705184e-05,
"loss": 4.5839,
"step": 12100
},
{
"epoch": 0.10967683697418307,
"grad_norm": 3.4614107608795166,
"learning_rate": 4.451660949629897e-05,
"loss": 4.5176,
"step": 12150
},
{
"epoch": 0.11012818198230727,
"grad_norm": 3.582960605621338,
"learning_rate": 4.449404224589276e-05,
"loss": 4.6017,
"step": 12200
},
{
"epoch": 0.11057952699043148,
"grad_norm": 5.049736499786377,
"learning_rate": 4.447147499548655e-05,
"loss": 4.5375,
"step": 12250
},
{
"epoch": 0.1110308719985557,
"grad_norm": 4.15340518951416,
"learning_rate": 4.444890774508034e-05,
"loss": 4.6302,
"step": 12300
},
{
"epoch": 0.1114822170066799,
"grad_norm": 3.0118372440338135,
"learning_rate": 4.442634049467413e-05,
"loss": 4.6187,
"step": 12350
},
{
"epoch": 0.11193356201480412,
"grad_norm": 3.5457749366760254,
"learning_rate": 4.440377324426792e-05,
"loss": 4.5201,
"step": 12400
},
{
"epoch": 0.11238490702292833,
"grad_norm": 3.9251248836517334,
"learning_rate": 4.4381205993861706e-05,
"loss": 4.6261,
"step": 12450
},
{
"epoch": 0.11283625203105253,
"grad_norm": 3.2046866416931152,
"learning_rate": 4.43586387434555e-05,
"loss": 4.5479,
"step": 12500
},
{
"epoch": 0.11328759703917675,
"grad_norm": 3.1684064865112305,
"learning_rate": 4.433607149304929e-05,
"loss": 4.5823,
"step": 12550
},
{
"epoch": 0.11373894204730095,
"grad_norm": 4.124698638916016,
"learning_rate": 4.431350424264308e-05,
"loss": 4.6002,
"step": 12600
},
{
"epoch": 0.11419028705542517,
"grad_norm": 3.9625906944274902,
"learning_rate": 4.429093699223687e-05,
"loss": 4.5571,
"step": 12650
},
{
"epoch": 0.11464163206354938,
"grad_norm": 4.684337139129639,
"learning_rate": 4.4268369741830655e-05,
"loss": 4.5483,
"step": 12700
},
{
"epoch": 0.11509297707167358,
"grad_norm": 4.30114221572876,
"learning_rate": 4.424580249142445e-05,
"loss": 4.5534,
"step": 12750
},
{
"epoch": 0.1155443220797978,
"grad_norm": 3.673405647277832,
"learning_rate": 4.4223235241018234e-05,
"loss": 4.5645,
"step": 12800
},
{
"epoch": 0.11599566708792201,
"grad_norm": 4.129467964172363,
"learning_rate": 4.4200667990612027e-05,
"loss": 4.5647,
"step": 12850
},
{
"epoch": 0.11644701209604622,
"grad_norm": 2.8640856742858887,
"learning_rate": 4.417810074020581e-05,
"loss": 4.475,
"step": 12900
},
{
"epoch": 0.11689835710417043,
"grad_norm": 3.1711478233337402,
"learning_rate": 4.4155533489799605e-05,
"loss": 4.5395,
"step": 12950
},
{
"epoch": 0.11734970211229463,
"grad_norm": 4.4645586013793945,
"learning_rate": 4.413296623939339e-05,
"loss": 4.5149,
"step": 13000
},
{
"epoch": 0.11780104712041885,
"grad_norm": 4.081081867218018,
"learning_rate": 4.4110398988987184e-05,
"loss": 4.3781,
"step": 13050
},
{
"epoch": 0.11825239212854306,
"grad_norm": 3.4459915161132812,
"learning_rate": 4.4087831738580976e-05,
"loss": 4.4447,
"step": 13100
},
{
"epoch": 0.11870373713666726,
"grad_norm": 4.382139205932617,
"learning_rate": 4.406526448817476e-05,
"loss": 4.5698,
"step": 13150
},
{
"epoch": 0.11915508214479148,
"grad_norm": 3.9767699241638184,
"learning_rate": 4.4042697237768555e-05,
"loss": 4.7358,
"step": 13200
},
{
"epoch": 0.1196064271529157,
"grad_norm": 2.903264284133911,
"learning_rate": 4.402012998736234e-05,
"loss": 4.4043,
"step": 13250
},
{
"epoch": 0.1200577721610399,
"grad_norm": 3.7580466270446777,
"learning_rate": 4.399756273695613e-05,
"loss": 4.4899,
"step": 13300
},
{
"epoch": 0.12050911716916411,
"grad_norm": 3.086916446685791,
"learning_rate": 4.397499548654992e-05,
"loss": 4.494,
"step": 13350
},
{
"epoch": 0.12096046217728831,
"grad_norm": 3.9137027263641357,
"learning_rate": 4.395242823614371e-05,
"loss": 4.5933,
"step": 13400
},
{
"epoch": 0.12141180718541253,
"grad_norm": 3.615917205810547,
"learning_rate": 4.39298609857375e-05,
"loss": 4.4373,
"step": 13450
},
{
"epoch": 0.12186315219353674,
"grad_norm": 2.4744229316711426,
"learning_rate": 4.390729373533129e-05,
"loss": 4.6075,
"step": 13500
},
{
"epoch": 0.12231449720166095,
"grad_norm": 3.469045639038086,
"learning_rate": 4.3884726484925076e-05,
"loss": 4.5208,
"step": 13550
},
{
"epoch": 0.12276584220978516,
"grad_norm": 4.882166385650635,
"learning_rate": 4.386215923451887e-05,
"loss": 4.4989,
"step": 13600
},
{
"epoch": 0.12321718721790936,
"grad_norm": 4.610581398010254,
"learning_rate": 4.383959198411266e-05,
"loss": 4.5306,
"step": 13650
},
{
"epoch": 0.12366853222603358,
"grad_norm": 3.6969921588897705,
"learning_rate": 4.381702473370645e-05,
"loss": 4.5978,
"step": 13700
},
{
"epoch": 0.1241198772341578,
"grad_norm": 4.886890888214111,
"learning_rate": 4.379445748330024e-05,
"loss": 4.4131,
"step": 13750
},
{
"epoch": 0.124571222242282,
"grad_norm": 2.121551513671875,
"learning_rate": 4.3771890232894026e-05,
"loss": 4.5148,
"step": 13800
},
{
"epoch": 0.1250225672504062,
"grad_norm": 3.1213953495025635,
"learning_rate": 4.374932298248782e-05,
"loss": 4.6455,
"step": 13850
},
{
"epoch": 0.1254739122585304,
"grad_norm": 3.9660770893096924,
"learning_rate": 4.3726755732081604e-05,
"loss": 4.4807,
"step": 13900
},
{
"epoch": 0.12592525726665463,
"grad_norm": 2.980980396270752,
"learning_rate": 4.37041884816754e-05,
"loss": 4.6299,
"step": 13950
},
{
"epoch": 0.12637660227477884,
"grad_norm": 3.5488901138305664,
"learning_rate": 4.368162123126918e-05,
"loss": 4.5797,
"step": 14000
},
{
"epoch": 0.12682794728290306,
"grad_norm": 2.9502065181732178,
"learning_rate": 4.3659053980862975e-05,
"loss": 4.5491,
"step": 14050
},
{
"epoch": 0.12727929229102727,
"grad_norm": 2.8409996032714844,
"learning_rate": 4.363648673045677e-05,
"loss": 4.5958,
"step": 14100
},
{
"epoch": 0.12773063729915146,
"grad_norm": 5.0700907707214355,
"learning_rate": 4.3613919480050554e-05,
"loss": 4.6128,
"step": 14150
},
{
"epoch": 0.12818198230727568,
"grad_norm": 3.55629301071167,
"learning_rate": 4.3591352229644347e-05,
"loss": 4.492,
"step": 14200
},
{
"epoch": 0.1286333273153999,
"grad_norm": 3.631505250930786,
"learning_rate": 4.356878497923813e-05,
"loss": 4.5023,
"step": 14250
},
{
"epoch": 0.1290846723235241,
"grad_norm": 3.8898086547851562,
"learning_rate": 4.3546217728831925e-05,
"loss": 4.5478,
"step": 14300
},
{
"epoch": 0.12953601733164832,
"grad_norm": 3.2403228282928467,
"learning_rate": 4.352365047842571e-05,
"loss": 4.4829,
"step": 14350
},
{
"epoch": 0.1299873623397725,
"grad_norm": 3.5314269065856934,
"learning_rate": 4.3501083228019504e-05,
"loss": 4.5186,
"step": 14400
},
{
"epoch": 0.13043870734789673,
"grad_norm": 3.769017457962036,
"learning_rate": 4.347851597761329e-05,
"loss": 4.4798,
"step": 14450
},
{
"epoch": 0.13089005235602094,
"grad_norm": 3.4731597900390625,
"learning_rate": 4.345594872720708e-05,
"loss": 4.6109,
"step": 14500
},
{
"epoch": 0.13134139736414516,
"grad_norm": 4.540064811706543,
"learning_rate": 4.343338147680087e-05,
"loss": 4.5537,
"step": 14550
},
{
"epoch": 0.13179274237226937,
"grad_norm": 4.51099157333374,
"learning_rate": 4.341081422639466e-05,
"loss": 4.4937,
"step": 14600
},
{
"epoch": 0.13224408738039356,
"grad_norm": 5.07973051071167,
"learning_rate": 4.338824697598845e-05,
"loss": 4.5471,
"step": 14650
},
{
"epoch": 0.13269543238851778,
"grad_norm": 4.9902753829956055,
"learning_rate": 4.336567972558224e-05,
"loss": 4.5544,
"step": 14700
},
{
"epoch": 0.133146777396642,
"grad_norm": 5.2365031242370605,
"learning_rate": 4.334311247517603e-05,
"loss": 4.4559,
"step": 14750
},
{
"epoch": 0.1335981224047662,
"grad_norm": 4.138045787811279,
"learning_rate": 4.332054522476982e-05,
"loss": 4.3979,
"step": 14800
},
{
"epoch": 0.13404946741289042,
"grad_norm": 3.637258529663086,
"learning_rate": 4.329797797436361e-05,
"loss": 4.5282,
"step": 14850
},
{
"epoch": 0.13450081242101464,
"grad_norm": 3.374943256378174,
"learning_rate": 4.3275410723957396e-05,
"loss": 4.5585,
"step": 14900
},
{
"epoch": 0.13495215742913882,
"grad_norm": 4.198739051818848,
"learning_rate": 4.325284347355119e-05,
"loss": 4.4996,
"step": 14950
},
{
"epoch": 0.13540350243726304,
"grad_norm": 3.0009047985076904,
"learning_rate": 4.3230276223144975e-05,
"loss": 4.5361,
"step": 15000
},
{
"epoch": 0.13585484744538726,
"grad_norm": 3.28633975982666,
"learning_rate": 4.320770897273877e-05,
"loss": 4.5367,
"step": 15050
},
{
"epoch": 0.13630619245351147,
"grad_norm": 3.2945947647094727,
"learning_rate": 4.318514172233255e-05,
"loss": 4.5113,
"step": 15100
},
{
"epoch": 0.1367575374616357,
"grad_norm": 5.111336708068848,
"learning_rate": 4.3162574471926346e-05,
"loss": 4.5238,
"step": 15150
},
{
"epoch": 0.13720888246975987,
"grad_norm": 2.328876256942749,
"learning_rate": 4.314000722152013e-05,
"loss": 4.5512,
"step": 15200
},
{
"epoch": 0.1376602274778841,
"grad_norm": 3.703890323638916,
"learning_rate": 4.311743997111392e-05,
"loss": 4.5192,
"step": 15250
},
{
"epoch": 0.1381115724860083,
"grad_norm": 2.8396573066711426,
"learning_rate": 4.309487272070771e-05,
"loss": 4.5641,
"step": 15300
},
{
"epoch": 0.13856291749413252,
"grad_norm": 3.3222029209136963,
"learning_rate": 4.3072305470301496e-05,
"loss": 4.5332,
"step": 15350
},
{
"epoch": 0.13901426250225674,
"grad_norm": 3.652606725692749,
"learning_rate": 4.304973821989529e-05,
"loss": 4.5056,
"step": 15400
},
{
"epoch": 0.13946560751038092,
"grad_norm": 7.847742080688477,
"learning_rate": 4.3027170969489075e-05,
"loss": 4.4884,
"step": 15450
},
{
"epoch": 0.13991695251850514,
"grad_norm": 3.6494662761688232,
"learning_rate": 4.300460371908287e-05,
"loss": 4.4938,
"step": 15500
},
{
"epoch": 0.14036829752662935,
"grad_norm": 4.544933795928955,
"learning_rate": 4.298203646867665e-05,
"loss": 4.5045,
"step": 15550
},
{
"epoch": 0.14081964253475357,
"grad_norm": 3.429764986038208,
"learning_rate": 4.2959469218270446e-05,
"loss": 4.5033,
"step": 15600
},
{
"epoch": 0.14127098754287779,
"grad_norm": 3.790017604827881,
"learning_rate": 4.293690196786423e-05,
"loss": 4.4775,
"step": 15650
},
{
"epoch": 0.14172233255100197,
"grad_norm": 3.4987452030181885,
"learning_rate": 4.2914334717458024e-05,
"loss": 4.5988,
"step": 15700
},
{
"epoch": 0.1421736775591262,
"grad_norm": 2.5895438194274902,
"learning_rate": 4.289176746705182e-05,
"loss": 4.5285,
"step": 15750
},
{
"epoch": 0.1426250225672504,
"grad_norm": 4.709017276763916,
"learning_rate": 4.28692002166456e-05,
"loss": 4.4137,
"step": 15800
},
{
"epoch": 0.14307636757537462,
"grad_norm": 3.9760525226593018,
"learning_rate": 4.2846632966239395e-05,
"loss": 4.5278,
"step": 15850
},
{
"epoch": 0.14352771258349883,
"grad_norm": 4.445188045501709,
"learning_rate": 4.282406571583318e-05,
"loss": 4.5362,
"step": 15900
},
{
"epoch": 0.14397905759162305,
"grad_norm": 4.021897792816162,
"learning_rate": 4.2801498465426974e-05,
"loss": 4.458,
"step": 15950
},
{
"epoch": 0.14443040259974724,
"grad_norm": 4.263660907745361,
"learning_rate": 4.277893121502076e-05,
"loss": 4.4782,
"step": 16000
},
{
"epoch": 0.14488174760787145,
"grad_norm": 3.184115171432495,
"learning_rate": 4.275636396461455e-05,
"loss": 4.4877,
"step": 16050
},
{
"epoch": 0.14533309261599567,
"grad_norm": 3.6419224739074707,
"learning_rate": 4.273379671420834e-05,
"loss": 4.5329,
"step": 16100
},
{
"epoch": 0.14578443762411988,
"grad_norm": 5.209333896636963,
"learning_rate": 4.271122946380213e-05,
"loss": 4.5252,
"step": 16150
},
{
"epoch": 0.1462357826322441,
"grad_norm": 2.9980499744415283,
"learning_rate": 4.2688662213395924e-05,
"loss": 4.4491,
"step": 16200
},
{
"epoch": 0.1466871276403683,
"grad_norm": 2.8836166858673096,
"learning_rate": 4.266609496298971e-05,
"loss": 4.524,
"step": 16250
},
{
"epoch": 0.1471384726484925,
"grad_norm": 3.24406099319458,
"learning_rate": 4.26435277125835e-05,
"loss": 4.5629,
"step": 16300
},
{
"epoch": 0.14758981765661672,
"grad_norm": 3.78409743309021,
"learning_rate": 4.262096046217729e-05,
"loss": 4.5051,
"step": 16350
},
{
"epoch": 0.14804116266474093,
"grad_norm": 3.738863229751587,
"learning_rate": 4.259839321177108e-05,
"loss": 4.3699,
"step": 16400
},
{
"epoch": 0.14849250767286515,
"grad_norm": 3.1949925422668457,
"learning_rate": 4.2575825961364867e-05,
"loss": 4.4681,
"step": 16450
},
{
"epoch": 0.14894385268098934,
"grad_norm": 3.774017810821533,
"learning_rate": 4.255325871095866e-05,
"loss": 4.4382,
"step": 16500
},
{
"epoch": 0.14939519768911355,
"grad_norm": 3.903379201889038,
"learning_rate": 4.2530691460552445e-05,
"loss": 4.4229,
"step": 16550
},
{
"epoch": 0.14984654269723777,
"grad_norm": 2.8182575702667236,
"learning_rate": 4.250812421014624e-05,
"loss": 4.4755,
"step": 16600
},
{
"epoch": 0.15029788770536198,
"grad_norm": 3.8375935554504395,
"learning_rate": 4.2485556959740024e-05,
"loss": 4.5113,
"step": 16650
},
{
"epoch": 0.1507492327134862,
"grad_norm": 3.6683831214904785,
"learning_rate": 4.2462989709333816e-05,
"loss": 4.5386,
"step": 16700
},
{
"epoch": 0.15120057772161039,
"grad_norm": 4.0321431159973145,
"learning_rate": 4.244042245892761e-05,
"loss": 4.4977,
"step": 16750
},
{
"epoch": 0.1516519227297346,
"grad_norm": 3.8294458389282227,
"learning_rate": 4.2417855208521395e-05,
"loss": 4.4926,
"step": 16800
},
{
"epoch": 0.15210326773785882,
"grad_norm": 3.6209237575531006,
"learning_rate": 4.239528795811519e-05,
"loss": 4.5028,
"step": 16850
},
{
"epoch": 0.15255461274598303,
"grad_norm": 3.8138227462768555,
"learning_rate": 4.237272070770897e-05,
"loss": 4.4808,
"step": 16900
},
{
"epoch": 0.15300595775410725,
"grad_norm": 4.5005927085876465,
"learning_rate": 4.2350153457302766e-05,
"loss": 4.4702,
"step": 16950
},
{
"epoch": 0.15345730276223146,
"grad_norm": 3.48544979095459,
"learning_rate": 4.232758620689655e-05,
"loss": 4.4993,
"step": 17000
},
{
"epoch": 0.15390864777035565,
"grad_norm": 3.5820982456207275,
"learning_rate": 4.2305018956490344e-05,
"loss": 4.5032,
"step": 17050
},
{
"epoch": 0.15435999277847987,
"grad_norm": 4.8123555183410645,
"learning_rate": 4.228245170608413e-05,
"loss": 4.5196,
"step": 17100
},
{
"epoch": 0.15481133778660408,
"grad_norm": 3.8024814128875732,
"learning_rate": 4.225988445567792e-05,
"loss": 4.5327,
"step": 17150
},
{
"epoch": 0.1552626827947283,
"grad_norm": 5.407778263092041,
"learning_rate": 4.223731720527171e-05,
"loss": 4.5355,
"step": 17200
},
{
"epoch": 0.1557140278028525,
"grad_norm": 3.6917614936828613,
"learning_rate": 4.22147499548655e-05,
"loss": 4.4072,
"step": 17250
},
{
"epoch": 0.1561653728109767,
"grad_norm": 3.9421164989471436,
"learning_rate": 4.2192182704459294e-05,
"loss": 4.4812,
"step": 17300
},
{
"epoch": 0.15661671781910091,
"grad_norm": 4.172101974487305,
"learning_rate": 4.216961545405308e-05,
"loss": 4.4737,
"step": 17350
},
{
"epoch": 0.15706806282722513,
"grad_norm": 3.308185577392578,
"learning_rate": 4.214704820364687e-05,
"loss": 4.5789,
"step": 17400
},
{
"epoch": 0.15751940783534935,
"grad_norm": 4.956492900848389,
"learning_rate": 4.212448095324066e-05,
"loss": 4.565,
"step": 17450
},
{
"epoch": 0.15797075284347356,
"grad_norm": 3.411794900894165,
"learning_rate": 4.210191370283445e-05,
"loss": 4.5473,
"step": 17500
},
{
"epoch": 0.15842209785159775,
"grad_norm": 4.067993640899658,
"learning_rate": 4.207934645242824e-05,
"loss": 4.4836,
"step": 17550
},
{
"epoch": 0.15887344285972196,
"grad_norm": 2.9520280361175537,
"learning_rate": 4.205677920202203e-05,
"loss": 4.4962,
"step": 17600
},
{
"epoch": 0.15932478786784618,
"grad_norm": 4.387596130371094,
"learning_rate": 4.2034211951615815e-05,
"loss": 4.513,
"step": 17650
},
{
"epoch": 0.1597761328759704,
"grad_norm": 3.250239849090576,
"learning_rate": 4.201164470120961e-05,
"loss": 4.5496,
"step": 17700
},
{
"epoch": 0.1602274778840946,
"grad_norm": 3.867882013320923,
"learning_rate": 4.1989077450803394e-05,
"loss": 4.5849,
"step": 17750
},
{
"epoch": 0.16067882289221883,
"grad_norm": 3.7500853538513184,
"learning_rate": 4.196651020039719e-05,
"loss": 4.4585,
"step": 17800
},
{
"epoch": 0.161130167900343,
"grad_norm": 3.8945131301879883,
"learning_rate": 4.194394294999098e-05,
"loss": 4.4149,
"step": 17850
},
{
"epoch": 0.16158151290846723,
"grad_norm": 8.667535781860352,
"learning_rate": 4.1921375699584765e-05,
"loss": 4.44,
"step": 17900
},
{
"epoch": 0.16203285791659144,
"grad_norm": 4.284276485443115,
"learning_rate": 4.189880844917856e-05,
"loss": 4.4561,
"step": 17950
},
{
"epoch": 0.16248420292471566,
"grad_norm": 2.9393467903137207,
"learning_rate": 4.1876241198772344e-05,
"loss": 4.5887,
"step": 18000
},
{
"epoch": 0.16293554793283987,
"grad_norm": 3.012742519378662,
"learning_rate": 4.1853673948366136e-05,
"loss": 4.4513,
"step": 18050
},
{
"epoch": 0.16338689294096406,
"grad_norm": 5.467082500457764,
"learning_rate": 4.183110669795992e-05,
"loss": 4.5611,
"step": 18100
},
{
"epoch": 0.16383823794908828,
"grad_norm": 3.46402907371521,
"learning_rate": 4.1808539447553715e-05,
"loss": 4.5312,
"step": 18150
},
{
"epoch": 0.1642895829572125,
"grad_norm": 3.8491625785827637,
"learning_rate": 4.17859721971475e-05,
"loss": 4.4916,
"step": 18200
},
{
"epoch": 0.1647409279653367,
"grad_norm": 5.8692450523376465,
"learning_rate": 4.176340494674129e-05,
"loss": 4.4869,
"step": 18250
},
{
"epoch": 0.16519227297346092,
"grad_norm": 3.2287988662719727,
"learning_rate": 4.174083769633508e-05,
"loss": 4.4431,
"step": 18300
},
{
"epoch": 0.1656436179815851,
"grad_norm": 4.350259304046631,
"learning_rate": 4.171827044592887e-05,
"loss": 4.4968,
"step": 18350
},
{
"epoch": 0.16609496298970933,
"grad_norm": 3.7243659496307373,
"learning_rate": 4.1695703195522664e-05,
"loss": 4.4738,
"step": 18400
},
{
"epoch": 0.16654630799783354,
"grad_norm": 4.834224224090576,
"learning_rate": 4.167313594511645e-05,
"loss": 4.5754,
"step": 18450
},
{
"epoch": 0.16699765300595776,
"grad_norm": 6.014001846313477,
"learning_rate": 4.165056869471024e-05,
"loss": 4.5449,
"step": 18500
},
{
"epoch": 0.16744899801408197,
"grad_norm": 3.7950220108032227,
"learning_rate": 4.162800144430403e-05,
"loss": 4.4007,
"step": 18550
},
{
"epoch": 0.16790034302220616,
"grad_norm": 4.019992828369141,
"learning_rate": 4.160543419389782e-05,
"loss": 4.4231,
"step": 18600
},
{
"epoch": 0.16835168803033038,
"grad_norm": 4.363696575164795,
"learning_rate": 4.158286694349161e-05,
"loss": 4.4445,
"step": 18650
},
{
"epoch": 0.1688030330384546,
"grad_norm": 4.168088912963867,
"learning_rate": 4.15602996930854e-05,
"loss": 4.5064,
"step": 18700
},
{
"epoch": 0.1692543780465788,
"grad_norm": 3.3574249744415283,
"learning_rate": 4.1537732442679186e-05,
"loss": 4.5161,
"step": 18750
},
{
"epoch": 0.16970572305470302,
"grad_norm": 4.255382061004639,
"learning_rate": 4.151516519227298e-05,
"loss": 4.4809,
"step": 18800
},
{
"epoch": 0.17015706806282724,
"grad_norm": 3.896949291229248,
"learning_rate": 4.1492597941866764e-05,
"loss": 4.488,
"step": 18850
},
{
"epoch": 0.17060841307095143,
"grad_norm": 4.572742938995361,
"learning_rate": 4.147003069146056e-05,
"loss": 4.5692,
"step": 18900
},
{
"epoch": 0.17105975807907564,
"grad_norm": 4.25124454498291,
"learning_rate": 4.144746344105434e-05,
"loss": 4.5341,
"step": 18950
},
{
"epoch": 0.17151110308719986,
"grad_norm": 3.2986035346984863,
"learning_rate": 4.1424896190648136e-05,
"loss": 4.4489,
"step": 19000
},
{
"epoch": 0.17196244809532407,
"grad_norm": 3.633592367172241,
"learning_rate": 4.140232894024192e-05,
"loss": 4.5124,
"step": 19050
},
{
"epoch": 0.1724137931034483,
"grad_norm": 3.3687500953674316,
"learning_rate": 4.1379761689835714e-05,
"loss": 4.5165,
"step": 19100
},
{
"epoch": 0.17286513811157247,
"grad_norm": 4.958398342132568,
"learning_rate": 4.13571944394295e-05,
"loss": 4.5043,
"step": 19150
},
{
"epoch": 0.1733164831196967,
"grad_norm": 4.127295017242432,
"learning_rate": 4.133462718902329e-05,
"loss": 4.4783,
"step": 19200
},
{
"epoch": 0.1737678281278209,
"grad_norm": 3.3556175231933594,
"learning_rate": 4.131205993861708e-05,
"loss": 4.4185,
"step": 19250
},
{
"epoch": 0.17421917313594512,
"grad_norm": 4.382410049438477,
"learning_rate": 4.128949268821087e-05,
"loss": 4.5009,
"step": 19300
},
{
"epoch": 0.17467051814406934,
"grad_norm": 3.7760777473449707,
"learning_rate": 4.126692543780466e-05,
"loss": 4.3572,
"step": 19350
},
{
"epoch": 0.17512186315219352,
"grad_norm": 4.594768524169922,
"learning_rate": 4.124435818739845e-05,
"loss": 4.4793,
"step": 19400
},
{
"epoch": 0.17557320816031774,
"grad_norm": 4.605646133422852,
"learning_rate": 4.1221790936992235e-05,
"loss": 4.4462,
"step": 19450
},
{
"epoch": 0.17602455316844196,
"grad_norm": 3.358002185821533,
"learning_rate": 4.119922368658603e-05,
"loss": 4.4986,
"step": 19500
},
{
"epoch": 0.17647589817656617,
"grad_norm": 3.7644100189208984,
"learning_rate": 4.1176656436179814e-05,
"loss": 4.5314,
"step": 19550
},
{
"epoch": 0.17692724318469039,
"grad_norm": 4.109899044036865,
"learning_rate": 4.115408918577361e-05,
"loss": 4.5382,
"step": 19600
},
{
"epoch": 0.1773785881928146,
"grad_norm": 8.232100486755371,
"learning_rate": 4.113152193536739e-05,
"loss": 4.5095,
"step": 19650
},
{
"epoch": 0.1778299332009388,
"grad_norm": 3.442411422729492,
"learning_rate": 4.1108954684961185e-05,
"loss": 4.5861,
"step": 19700
},
{
"epoch": 0.178281278209063,
"grad_norm": 2.404611825942993,
"learning_rate": 4.108638743455497e-05,
"loss": 4.4563,
"step": 19750
},
{
"epoch": 0.17873262321718722,
"grad_norm": 3.3895816802978516,
"learning_rate": 4.1063820184148764e-05,
"loss": 4.4434,
"step": 19800
},
{
"epoch": 0.17918396822531144,
"grad_norm": 2.9194042682647705,
"learning_rate": 4.104125293374255e-05,
"loss": 4.5463,
"step": 19850
},
{
"epoch": 0.17963531323343565,
"grad_norm": 2.6337718963623047,
"learning_rate": 4.101868568333634e-05,
"loss": 4.4246,
"step": 19900
},
{
"epoch": 0.18008665824155984,
"grad_norm": 5.921742916107178,
"learning_rate": 4.0996118432930135e-05,
"loss": 4.3955,
"step": 19950
},
{
"epoch": 0.18053800324968405,
"grad_norm": 3.9008045196533203,
"learning_rate": 4.097355118252392e-05,
"loss": 4.6028,
"step": 20000
}
],
"logging_steps": 50,
"max_steps": 110780,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.4772471541951488e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}