{
"best_global_step": 1000,
"best_metric": 0.7969963550567627,
"best_model_checkpoint": "checkpoints/lora_tutor/checkpoint-1000",
"epoch": 0.35634743875278396,
"eval_steps": 200,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00035634743875278396,
"grad_norm": 99.66374969482422,
"learning_rate": 0.0,
"loss": 4.1982,
"step": 1
},
{
"epoch": 0.0017817371937639199,
"grad_norm": 57.29983139038086,
"learning_rate": 3.3333333333333333e-06,
"loss": 4.0,
"step": 5
},
{
"epoch": 0.0035634743875278397,
"grad_norm": 27.13236427307129,
"learning_rate": 7.5e-06,
"loss": 2.4799,
"step": 10
},
{
"epoch": 0.005345211581291759,
"grad_norm": 13.770986557006836,
"learning_rate": 1.1666666666666668e-05,
"loss": 1.4129,
"step": 15
},
{
"epoch": 0.0071269487750556795,
"grad_norm": 9.720105171203613,
"learning_rate": 1.5833333333333333e-05,
"loss": 1.2207,
"step": 20
},
{
"epoch": 0.008908685968819599,
"grad_norm": 10.442448616027832,
"learning_rate": 2e-05,
"loss": 1.2037,
"step": 25
},
{
"epoch": 0.010690423162583519,
"grad_norm": 8.392529487609863,
"learning_rate": 2.4166666666666667e-05,
"loss": 1.1535,
"step": 30
},
{
"epoch": 0.012472160356347439,
"grad_norm": 7.494682788848877,
"learning_rate": 2.8333333333333335e-05,
"loss": 1.1494,
"step": 35
},
{
"epoch": 0.014253897550111359,
"grad_norm": 7.756562232971191,
"learning_rate": 3.2500000000000004e-05,
"loss": 1.1588,
"step": 40
},
{
"epoch": 0.016035634743875277,
"grad_norm": 5.802969932556152,
"learning_rate": 3.6666666666666666e-05,
"loss": 1.1049,
"step": 45
},
{
"epoch": 0.017817371937639197,
"grad_norm": 7.09335470199585,
"learning_rate": 4.0833333333333334e-05,
"loss": 1.0902,
"step": 50
},
{
"epoch": 0.019599109131403118,
"grad_norm": 5.961513042449951,
"learning_rate": 4.5e-05,
"loss": 1.0484,
"step": 55
},
{
"epoch": 0.021380846325167038,
"grad_norm": 4.003515720367432,
"learning_rate": 4.9166666666666665e-05,
"loss": 1.0418,
"step": 60
},
{
"epoch": 0.023162583518930958,
"grad_norm": 4.197242259979248,
"learning_rate": 5.333333333333333e-05,
"loss": 1.0574,
"step": 65
},
{
"epoch": 0.024944320712694878,
"grad_norm": 4.823288917541504,
"learning_rate": 5.7499999999999995e-05,
"loss": 1.0194,
"step": 70
},
{
"epoch": 0.026726057906458798,
"grad_norm": 6.500214099884033,
"learning_rate": 6.166666666666667e-05,
"loss": 1.0146,
"step": 75
},
{
"epoch": 0.028507795100222718,
"grad_norm": 6.800583362579346,
"learning_rate": 6.583333333333334e-05,
"loss": 1.0192,
"step": 80
},
{
"epoch": 0.030289532293986638,
"grad_norm": 5.19222354888916,
"learning_rate": 7e-05,
"loss": 1.0455,
"step": 85
},
{
"epoch": 0.032071269487750555,
"grad_norm": 6.357260704040527,
"learning_rate": 7.416666666666668e-05,
"loss": 1.029,
"step": 90
},
{
"epoch": 0.033853006681514475,
"grad_norm": 5.543500900268555,
"learning_rate": 7.833333333333333e-05,
"loss": 1.0228,
"step": 95
},
{
"epoch": 0.035634743875278395,
"grad_norm": 4.388900279998779,
"learning_rate": 8.25e-05,
"loss": 1.044,
"step": 100
},
{
"epoch": 0.037416481069042315,
"grad_norm": 5.311753273010254,
"learning_rate": 8.666666666666667e-05,
"loss": 1.063,
"step": 105
},
{
"epoch": 0.039198218262806235,
"grad_norm": 5.037621974945068,
"learning_rate": 9.083333333333334e-05,
"loss": 1.0138,
"step": 110
},
{
"epoch": 0.040979955456570155,
"grad_norm": 5.125575542449951,
"learning_rate": 9.5e-05,
"loss": 1.0403,
"step": 115
},
{
"epoch": 0.042761692650334075,
"grad_norm": 5.154388904571533,
"learning_rate": 9.916666666666667e-05,
"loss": 1.015,
"step": 120
},
{
"epoch": 0.044543429844097995,
"grad_norm": 3.9509270191192627,
"learning_rate": 9.999661540018812e-05,
"loss": 1.0027,
"step": 125
},
{
"epoch": 0.046325167037861915,
"grad_norm": 3.7814090251922607,
"learning_rate": 9.998286624877786e-05,
"loss": 0.9863,
"step": 130
},
{
"epoch": 0.048106904231625836,
"grad_norm": 3.7585690021514893,
"learning_rate": 9.995854391448606e-05,
"loss": 1.0459,
"step": 135
},
{
"epoch": 0.049888641425389756,
"grad_norm": 6.917703628540039,
"learning_rate": 9.992365354236557e-05,
"loss": 1.0719,
"step": 140
},
{
"epoch": 0.051670378619153676,
"grad_norm": 3.885483503341675,
"learning_rate": 9.987820251299122e-05,
"loss": 1.0123,
"step": 145
},
{
"epoch": 0.053452115812917596,
"grad_norm": 3.287639617919922,
"learning_rate": 9.982220044089859e-05,
"loss": 0.9903,
"step": 150
},
{
"epoch": 0.055233853006681516,
"grad_norm": 3.95298171043396,
"learning_rate": 9.975565917255016e-05,
"loss": 0.9841,
"step": 155
},
{
"epoch": 0.057015590200445436,
"grad_norm": 4.5531721115112305,
"learning_rate": 9.967859278382938e-05,
"loss": 0.9968,
"step": 160
},
{
"epoch": 0.058797327394209356,
"grad_norm": 4.4977641105651855,
"learning_rate": 9.959101757706308e-05,
"loss": 1.006,
"step": 165
},
{
"epoch": 0.060579064587973276,
"grad_norm": 3.260209798812866,
"learning_rate": 9.949295207757299e-05,
"loss": 0.9557,
"step": 170
},
{
"epoch": 0.062360801781737196,
"grad_norm": 3.9708852767944336,
"learning_rate": 9.938441702975689e-05,
"loss": 0.9914,
"step": 175
},
{
"epoch": 0.06414253897550111,
"grad_norm": 3.731992721557617,
"learning_rate": 9.926543539270048e-05,
"loss": 0.9794,
"step": 180
},
{
"epoch": 0.06592427616926504,
"grad_norm": 3.153402805328369,
"learning_rate": 9.913603233532067e-05,
"loss": 0.9525,
"step": 185
},
{
"epoch": 0.06770601336302895,
"grad_norm": 2.9249067306518555,
"learning_rate": 9.899623523104149e-05,
"loss": 0.9578,
"step": 190
},
{
"epoch": 0.06948775055679288,
"grad_norm": 2.661738872528076,
"learning_rate": 9.884607365200356e-05,
"loss": 0.9711,
"step": 195
},
{
"epoch": 0.07126948775055679,
"grad_norm": 3.0224714279174805,
"learning_rate": 9.868557936280855e-05,
"loss": 0.9693,
"step": 200
},
{
"epoch": 0.07126948775055679,
"eval_loss": 0.9798622131347656,
"eval_runtime": 249.2057,
"eval_samples_per_second": 20.02,
"eval_steps_per_second": 2.504,
"step": 200
},
{
"epoch": 0.07305122494432072,
"grad_norm": 2.5287749767303467,
"learning_rate": 9.851478631379982e-05,
"loss": 0.9299,
"step": 205
},
{
"epoch": 0.07483296213808463,
"grad_norm": 2.9961535930633545,
"learning_rate": 9.83337306338807e-05,
"loss": 0.9606,
"step": 210
},
{
"epoch": 0.07661469933184856,
"grad_norm": 3.6630430221557617,
"learning_rate": 9.814245062287189e-05,
"loss": 0.9546,
"step": 215
},
{
"epoch": 0.07839643652561247,
"grad_norm": 2.665858030319214,
"learning_rate": 9.794098674340965e-05,
"loss": 0.958,
"step": 220
},
{
"epoch": 0.0801781737193764,
"grad_norm": 2.741337776184082,
"learning_rate": 9.77293816123866e-05,
"loss": 0.963,
"step": 225
},
{
"epoch": 0.08195991091314031,
"grad_norm": 2.693640947341919,
"learning_rate": 9.750767999193656e-05,
"loss": 0.9677,
"step": 230
},
{
"epoch": 0.08374164810690424,
"grad_norm": 2.718897581100464,
"learning_rate": 9.727592877996585e-05,
"loss": 0.9551,
"step": 235
},
{
"epoch": 0.08552338530066815,
"grad_norm": 3.1531124114990234,
"learning_rate": 9.70341770002326e-05,
"loss": 0.9692,
"step": 240
},
{
"epoch": 0.08730512249443208,
"grad_norm": 2.4551897048950195,
"learning_rate": 9.678247579197657e-05,
"loss": 0.9727,
"step": 245
},
{
"epoch": 0.08908685968819599,
"grad_norm": 2.886244058609009,
"learning_rate": 9.652087839910124e-05,
"loss": 0.9537,
"step": 250
},
{
"epoch": 0.09086859688195992,
"grad_norm": 2.8074824810028076,
"learning_rate": 9.62494401589108e-05,
"loss": 0.9327,
"step": 255
},
{
"epoch": 0.09265033407572383,
"grad_norm": 2.750798463821411,
"learning_rate": 9.596821849040447e-05,
"loss": 0.9228,
"step": 260
},
{
"epoch": 0.09443207126948774,
"grad_norm": 2.552215337753296,
"learning_rate": 9.567727288213005e-05,
"loss": 0.9423,
"step": 265
},
{
"epoch": 0.09621380846325167,
"grad_norm": 2.3609156608581543,
"learning_rate": 9.537666487960019e-05,
"loss": 0.9676,
"step": 270
},
{
"epoch": 0.09799554565701558,
"grad_norm": 2.8906874656677246,
"learning_rate": 9.506645807227312e-05,
"loss": 0.955,
"step": 275
},
{
"epoch": 0.09977728285077951,
"grad_norm": 2.660022497177124,
"learning_rate": 9.474671808010126e-05,
"loss": 0.9695,
"step": 280
},
{
"epoch": 0.10155902004454342,
"grad_norm": 3.261420726776123,
"learning_rate": 9.441751253965021e-05,
"loss": 0.9477,
"step": 285
},
{
"epoch": 0.10334075723830735,
"grad_norm": 3.65535044670105,
"learning_rate": 9.407891108979117e-05,
"loss": 0.9724,
"step": 290
},
{
"epoch": 0.10512249443207126,
"grad_norm": 6.143333911895752,
"learning_rate": 9.373098535696979e-05,
"loss": 0.9477,
"step": 295
},
{
"epoch": 0.10690423162583519,
"grad_norm": 3.469689130783081,
"learning_rate": 9.337380894005463e-05,
"loss": 0.9286,
"step": 300
},
{
"epoch": 0.1086859688195991,
"grad_norm": 2.4321353435516357,
"learning_rate": 9.300745739476829e-05,
"loss": 0.9681,
"step": 305
},
{
"epoch": 0.11046770601336303,
"grad_norm": 2.3954951763153076,
"learning_rate": 9.263200821770461e-05,
"loss": 0.9223,
"step": 310
},
{
"epoch": 0.11224944320712694,
"grad_norm": 3.206364154815674,
"learning_rate": 9.224754082993552e-05,
"loss": 0.9111,
"step": 315
},
{
"epoch": 0.11403118040089087,
"grad_norm": 2.411461591720581,
"learning_rate": 9.185413656021036e-05,
"loss": 0.9254,
"step": 320
},
{
"epoch": 0.11581291759465479,
"grad_norm": 3.2764694690704346,
"learning_rate": 9.145187862775209e-05,
"loss": 0.9388,
"step": 325
},
{
"epoch": 0.11759465478841871,
"grad_norm": 2.724217653274536,
"learning_rate": 9.104085212465336e-05,
"loss": 0.9493,
"step": 330
},
{
"epoch": 0.11937639198218263,
"grad_norm": 2.4242122173309326,
"learning_rate": 9.062114399787647e-05,
"loss": 0.9439,
"step": 335
},
{
"epoch": 0.12115812917594655,
"grad_norm": 2.391575813293457,
"learning_rate": 9.019284303086087e-05,
"loss": 0.9253,
"step": 340
},
{
"epoch": 0.12293986636971047,
"grad_norm": 2.7728800773620605,
"learning_rate": 8.97560398247424e-05,
"loss": 0.946,
"step": 345
},
{
"epoch": 0.12472160356347439,
"grad_norm": 3.3350629806518555,
"learning_rate": 8.931082677918771e-05,
"loss": 0.9318,
"step": 350
},
{
"epoch": 0.12650334075723832,
"grad_norm": 2.887850761413574,
"learning_rate": 8.885729807284856e-05,
"loss": 0.9407,
"step": 355
},
{
"epoch": 0.12828507795100222,
"grad_norm": 2.461491107940674,
"learning_rate": 8.839554964343943e-05,
"loss": 0.9748,
"step": 360
},
{
"epoch": 0.13006681514476615,
"grad_norm": 2.649059772491455,
"learning_rate": 8.792567916744346e-05,
"loss": 0.9569,
"step": 365
},
{
"epoch": 0.13184855233853007,
"grad_norm": 2.505889415740967,
"learning_rate": 8.744778603945011e-05,
"loss": 0.9235,
"step": 370
},
{
"epoch": 0.133630289532294,
"grad_norm": 3.084015369415283,
"learning_rate": 8.69619713511298e-05,
"loss": 0.9466,
"step": 375
},
{
"epoch": 0.1354120267260579,
"grad_norm": 2.242276191711426,
"learning_rate": 8.646833786984927e-05,
"loss": 0.8958,
"step": 380
},
{
"epoch": 0.13719376391982183,
"grad_norm": 2.439112424850464,
"learning_rate": 8.596699001693255e-05,
"loss": 0.9211,
"step": 385
},
{
"epoch": 0.13897550111358575,
"grad_norm": 2.7526488304138184,
"learning_rate": 8.545803384557219e-05,
"loss": 0.9218,
"step": 390
},
{
"epoch": 0.14075723830734965,
"grad_norm": 2.521644353866577,
"learning_rate": 8.4941577018395e-05,
"loss": 0.9365,
"step": 395
},
{
"epoch": 0.14253897550111358,
"grad_norm": 2.8012807369232178,
"learning_rate": 8.44177287846877e-05,
"loss": 0.8991,
"step": 400
},
{
"epoch": 0.14253897550111358,
"eval_loss": 0.9173732995986938,
"eval_runtime": 250.7158,
"eval_samples_per_second": 19.899,
"eval_steps_per_second": 2.489,
"step": 400
},
{
"epoch": 0.1443207126948775,
"grad_norm": 2.3261518478393555,
"learning_rate": 8.388659995728661e-05,
"loss": 0.8968,
"step": 405
},
{
"epoch": 0.14610244988864143,
"grad_norm": 2.2134907245635986,
"learning_rate": 8.334830288913682e-05,
"loss": 0.91,
"step": 410
},
{
"epoch": 0.14788418708240533,
"grad_norm": 3.5786261558532715,
"learning_rate": 8.280295144952536e-05,
"loss": 0.9175,
"step": 415
},
{
"epoch": 0.14966592427616926,
"grad_norm": 2.7428812980651855,
"learning_rate": 8.225066099999392e-05,
"loss": 0.9345,
"step": 420
},
{
"epoch": 0.1514476614699332,
"grad_norm": 2.246025800704956,
"learning_rate": 8.169154836993551e-05,
"loss": 0.9067,
"step": 425
},
{
"epoch": 0.15322939866369711,
"grad_norm": 2.188469886779785,
"learning_rate": 8.112573183188099e-05,
"loss": 0.9537,
"step": 430
},
{
"epoch": 0.155011135857461,
"grad_norm": 2.545259475708008,
"learning_rate": 8.055333107647999e-05,
"loss": 0.9159,
"step": 435
},
{
"epoch": 0.15679287305122494,
"grad_norm": 2.421093463897705,
"learning_rate": 7.99744671871822e-05,
"loss": 0.9034,
"step": 440
},
{
"epoch": 0.15857461024498887,
"grad_norm": 2.5586888790130615,
"learning_rate": 7.938926261462366e-05,
"loss": 0.9072,
"step": 445
},
{
"epoch": 0.1603563474387528,
"grad_norm": 2.444941759109497,
"learning_rate": 7.879784115072417e-05,
"loss": 0.9101,
"step": 450
},
{
"epoch": 0.1621380846325167,
"grad_norm": 2.3764047622680664,
"learning_rate": 7.820032790250074e-05,
"loss": 0.9065,
"step": 455
},
{
"epoch": 0.16391982182628062,
"grad_norm": 2.34041428565979,
"learning_rate": 7.75968492656029e-05,
"loss": 0.8791,
"step": 460
},
{
"epoch": 0.16570155902004455,
"grad_norm": 2.013155698776245,
"learning_rate": 7.698753289757565e-05,
"loss": 0.9058,
"step": 465
},
{
"epoch": 0.16748329621380847,
"grad_norm": 2.3692591190338135,
"learning_rate": 7.6372507690855e-05,
"loss": 0.8898,
"step": 470
},
{
"epoch": 0.16926503340757237,
"grad_norm": 2.4539620876312256,
"learning_rate": 7.575190374550272e-05,
"loss": 0.9201,
"step": 475
},
{
"epoch": 0.1710467706013363,
"grad_norm": 2.6015443801879883,
"learning_rate": 7.51258523416855e-05,
"loss": 0.8823,
"step": 480
},
{
"epoch": 0.17282850779510023,
"grad_norm": 2.413839101791382,
"learning_rate": 7.449448591190435e-05,
"loss": 0.9196,
"step": 485
},
{
"epoch": 0.17461024498886416,
"grad_norm": 2.1962289810180664,
"learning_rate": 7.385793801298042e-05,
"loss": 0.8869,
"step": 490
},
{
"epoch": 0.17639198218262805,
"grad_norm": 2.994487762451172,
"learning_rate": 7.321634329780286e-05,
"loss": 0.9103,
"step": 495
},
{
"epoch": 0.17817371937639198,
"grad_norm": 2.9973297119140625,
"learning_rate": 7.256983748684485e-05,
"loss": 0.9083,
"step": 500
},
{
"epoch": 0.1799554565701559,
"grad_norm": 2.6006710529327393,
"learning_rate": 7.191855733945387e-05,
"loss": 0.9131,
"step": 505
},
{
"epoch": 0.18173719376391984,
"grad_norm": 2.4508118629455566,
"learning_rate": 7.126264062492217e-05,
"loss": 0.8762,
"step": 510
},
{
"epoch": 0.18351893095768373,
"grad_norm": 2.8403897285461426,
"learning_rate": 7.060222609334343e-05,
"loss": 0.8673,
"step": 515
},
{
"epoch": 0.18530066815144766,
"grad_norm": 2.5483813285827637,
"learning_rate": 6.993745344626231e-05,
"loss": 0.8812,
"step": 520
},
{
"epoch": 0.1870824053452116,
"grad_norm": 1.927654504776001,
"learning_rate": 6.926846330712242e-05,
"loss": 0.9213,
"step": 525
},
{
"epoch": 0.1888641425389755,
"grad_norm": 2.8513023853302,
"learning_rate": 6.859539719151933e-05,
"loss": 0.8911,
"step": 530
},
{
"epoch": 0.19064587973273942,
"grad_norm": 2.6732981204986572,
"learning_rate": 6.7918397477265e-05,
"loss": 0.9018,
"step": 535
},
{
"epoch": 0.19242761692650334,
"grad_norm": 2.3755311965942383,
"learning_rate": 6.723760737426971e-05,
"loss": 0.8803,
"step": 540
},
{
"epoch": 0.19420935412026727,
"grad_norm": 2.5072877407073975,
"learning_rate": 6.65531708942479e-05,
"loss": 0.9066,
"step": 545
},
{
"epoch": 0.19599109131403117,
"grad_norm": 2.3347630500793457,
"learning_rate": 6.586523282025462e-05,
"loss": 0.8999,
"step": 550
},
{
"epoch": 0.1977728285077951,
"grad_norm": 2.4541633129119873,
"learning_rate": 6.517393867605855e-05,
"loss": 0.9024,
"step": 555
},
{
"epoch": 0.19955456570155902,
"grad_norm": 2.89241361618042,
"learning_rate": 6.447943469535856e-05,
"loss": 0.8802,
"step": 560
},
{
"epoch": 0.20133630289532295,
"grad_norm": 2.635859251022339,
"learning_rate": 6.378186779084995e-05,
"loss": 0.91,
"step": 565
},
{
"epoch": 0.20311804008908685,
"grad_norm": 2.5360910892486572,
"learning_rate": 6.308138552314718e-05,
"loss": 0.883,
"step": 570
},
{
"epoch": 0.20489977728285078,
"grad_norm": 2.0861408710479736,
"learning_rate": 6.23781360695693e-05,
"loss": 0.9051,
"step": 575
},
{
"epoch": 0.2066815144766147,
"grad_norm": 1.938452959060669,
"learning_rate": 6.167226819279528e-05,
"loss": 0.8763,
"step": 580
},
{
"epoch": 0.20846325167037863,
"grad_norm": 2.333118200302124,
"learning_rate": 6.096393120939516e-05,
"loss": 0.8939,
"step": 585
},
{
"epoch": 0.21024498886414253,
"grad_norm": 2.2652223110198975,
"learning_rate": 6.0253274958244386e-05,
"loss": 0.8992,
"step": 590
},
{
"epoch": 0.21202672605790646,
"grad_norm": 1.830731749534607,
"learning_rate": 5.9540449768827246e-05,
"loss": 0.8617,
"step": 595
},
{
"epoch": 0.21380846325167038,
"grad_norm": 2.4237635135650635,
"learning_rate": 5.882560642943696e-05,
"loss": 0.9189,
"step": 600
},
{
"epoch": 0.21380846325167038,
"eval_loss": 0.8756723999977112,
"eval_runtime": 256.5875,
"eval_samples_per_second": 19.444,
"eval_steps_per_second": 2.432,
"step": 600
},
{
"epoch": 0.2155902004454343,
"grad_norm": 2.6089930534362793,
"learning_rate": 5.810889615527838e-05,
"loss": 0.9052,
"step": 605
},
{
"epoch": 0.2173719376391982,
"grad_norm": 2.457108974456787,
"learning_rate": 5.7390470556480545e-05,
"loss": 0.8959,
"step": 610
},
{
"epoch": 0.21915367483296214,
"grad_norm": 2.3315470218658447,
"learning_rate": 5.667048160602564e-05,
"loss": 0.8772,
"step": 615
},
{
"epoch": 0.22093541202672606,
"grad_norm": 2.0484960079193115,
"learning_rate": 5.5949081607601274e-05,
"loss": 0.8387,
"step": 620
},
{
"epoch": 0.22271714922049,
"grad_norm": 2.341867208480835,
"learning_rate": 5.522642316338268e-05,
"loss": 0.8778,
"step": 625
},
{
"epoch": 0.2244988864142539,
"grad_norm": 2.4177300930023193,
"learning_rate": 5.450265914175187e-05,
"loss": 0.8936,
"step": 630
},
{
"epoch": 0.22628062360801782,
"grad_norm": 2.4489850997924805,
"learning_rate": 5.377794264496041e-05,
"loss": 0.8654,
"step": 635
},
{
"epoch": 0.22806236080178174,
"grad_norm": 2.468477964401245,
"learning_rate": 5.3052426976742855e-05,
"loss": 0.8467,
"step": 640
},
{
"epoch": 0.22984409799554567,
"grad_norm": 2.1568973064422607,
"learning_rate": 5.232626560988735e-05,
"loss": 0.8337,
"step": 645
},
{
"epoch": 0.23162583518930957,
"grad_norm": 2.248286485671997,
"learning_rate": 5.159961215377065e-05,
"loss": 0.8626,
"step": 650
},
{
"epoch": 0.2334075723830735,
"grad_norm": 2.197516918182373,
"learning_rate": 5.0872620321864185e-05,
"loss": 0.8857,
"step": 655
},
{
"epoch": 0.23518930957683742,
"grad_norm": 2.0258774757385254,
"learning_rate": 5.0145443899218105e-05,
"loss": 0.8693,
"step": 660
},
{
"epoch": 0.23697104677060132,
"grad_norm": 2.576545000076294,
"learning_rate": 4.941823670993016e-05,
"loss": 0.8585,
"step": 665
},
{
"epoch": 0.23875278396436525,
"grad_norm": 2.1643807888031006,
"learning_rate": 4.869115258460635e-05,
"loss": 0.8844,
"step": 670
},
{
"epoch": 0.24053452115812918,
"grad_norm": 1.8109593391418457,
"learning_rate": 4.7964345327820217e-05,
"loss": 0.8526,
"step": 675
},
{
"epoch": 0.2423162583518931,
"grad_norm": 2.2996315956115723,
"learning_rate": 4.723796868557758e-05,
"loss": 0.8588,
"step": 680
},
{
"epoch": 0.244097995545657,
"grad_norm": 2.109656810760498,
"learning_rate": 4.6512176312793736e-05,
"loss": 0.8657,
"step": 685
},
{
"epoch": 0.24587973273942093,
"grad_norm": 2.0365986824035645,
"learning_rate": 4.578712174078986e-05,
"loss": 0.8722,
"step": 690
},
{
"epoch": 0.24766146993318486,
"grad_norm": 2.396369695663452,
"learning_rate": 4.506295834481561e-05,
"loss": 0.8595,
"step": 695
},
{
"epoch": 0.24944320712694878,
"grad_norm": 1.9721331596374512,
"learning_rate": 4.433983931160467e-05,
"loss": 0.845,
"step": 700
},
{
"epoch": 0.2512249443207127,
"grad_norm": 2.6028833389282227,
"learning_rate": 4.361791760697027e-05,
"loss": 0.8756,
"step": 705
},
{
"epoch": 0.25300668151447664,
"grad_norm": 2.5747413635253906,
"learning_rate": 4.289734594344738e-05,
"loss": 0.8553,
"step": 710
},
{
"epoch": 0.25478841870824054,
"grad_norm": 2.2102746963500977,
"learning_rate": 4.2178276747988446e-05,
"loss": 0.8301,
"step": 715
},
{
"epoch": 0.25657015590200444,
"grad_norm": 2.2053496837615967,
"learning_rate": 4.146086212971967e-05,
"loss": 0.8347,
"step": 720
},
{
"epoch": 0.2583518930957684,
"grad_norm": 2.1658267974853516,
"learning_rate": 4.074525384776428e-05,
"loss": 0.8583,
"step": 725
},
{
"epoch": 0.2601336302895323,
"grad_norm": 2.4658656120300293,
"learning_rate": 4.003160327914015e-05,
"loss": 0.8448,
"step": 730
},
{
"epoch": 0.2619153674832962,
"grad_norm": 2.5138092041015625,
"learning_rate": 3.932006138673801e-05,
"loss": 0.7994,
"step": 735
},
{
"epoch": 0.26369710467706015,
"grad_norm": 2.3678791522979736,
"learning_rate": 3.861077868738733e-05,
"loss": 0.8543,
"step": 740
},
{
"epoch": 0.26547884187082404,
"grad_norm": 2.174612283706665,
"learning_rate": 3.790390522001662e-05,
"loss": 0.8255,
"step": 745
},
{
"epoch": 0.267260579064588,
"grad_norm": 2.633901596069336,
"learning_rate": 3.719959051391472e-05,
"loss": 0.8574,
"step": 750
},
{
"epoch": 0.2690423162583519,
"grad_norm": 2.3723981380462646,
"learning_rate": 3.649798355709997e-05,
"loss": 0.8313,
"step": 755
},
{
"epoch": 0.2708240534521158,
"grad_norm": 2.452537775039673,
"learning_rate": 3.579923276480387e-05,
"loss": 0.8332,
"step": 760
},
{
"epoch": 0.27260579064587975,
"grad_norm": 2.7250778675079346,
"learning_rate": 3.51034859480759e-05,
"loss": 0.8345,
"step": 765
},
{
"epoch": 0.27438752783964365,
"grad_norm": 2.827697992324829,
"learning_rate": 3.44108902825161e-05,
"loss": 0.8547,
"step": 770
},
{
"epoch": 0.27616926503340755,
"grad_norm": 2.2842516899108887,
"learning_rate": 3.372159227714218e-05,
"loss": 0.8245,
"step": 775
},
{
"epoch": 0.2779510022271715,
"grad_norm": 2.4392411708831787,
"learning_rate": 3.303573774339745e-05,
"loss": 0.827,
"step": 780
},
{
"epoch": 0.2797327394209354,
"grad_norm": 2.548760175704956,
"learning_rate": 3.235347176430656e-05,
"loss": 0.8085,
"step": 785
},
{
"epoch": 0.2815144766146993,
"grad_norm": 2.289919376373291,
"learning_rate": 3.167493866378514e-05,
"loss": 0.8725,
"step": 790
},
{
"epoch": 0.28329621380846326,
"grad_norm": 2.1732709407806396,
"learning_rate": 3.100028197611006e-05,
"loss": 0.8184,
"step": 795
},
{
"epoch": 0.28507795100222716,
"grad_norm": 2.4083878993988037,
"learning_rate": 3.0329644415556758e-05,
"loss": 0.8186,
"step": 800
},
{
"epoch": 0.28507795100222716,
"eval_loss": 0.8320774435997009,
"eval_runtime": 261.5096,
"eval_samples_per_second": 19.078,
"eval_steps_per_second": 2.386,
"step": 800
},
{
"epoch": 0.2868596881959911,
"grad_norm": 2.4117252826690674,
"learning_rate": 2.9663167846209998e-05,
"loss": 0.8061,
"step": 805
},
{
"epoch": 0.288641425389755,
"grad_norm": 2.716094493865967,
"learning_rate": 2.9000993251954527e-05,
"loss": 0.8372,
"step": 810
},
{
"epoch": 0.2904231625835189,
"grad_norm": 1.987546443939209,
"learning_rate": 2.8343260706651864e-05,
"loss": 0.8539,
"step": 815
},
{
"epoch": 0.29220489977728287,
"grad_norm": 2.1564650535583496,
"learning_rate": 2.7690109344509563e-05,
"loss": 0.8398,
"step": 820
},
{
"epoch": 0.29398663697104677,
"grad_norm": 2.394848108291626,
"learning_rate": 2.7041677330649407e-05,
"loss": 0.8257,
"step": 825
},
{
"epoch": 0.29576837416481067,
"grad_norm": 2.211273670196533,
"learning_rate": 2.639810183188045e-05,
"loss": 0.8238,
"step": 830
},
{
"epoch": 0.2975501113585746,
"grad_norm": 2.2479021549224854,
"learning_rate": 2.575951898768315e-05,
"loss": 0.8277,
"step": 835
},
{
"epoch": 0.2993318485523385,
"grad_norm": 2.60609769821167,
"learning_rate": 2.5126063881411188e-05,
"loss": 0.8371,
"step": 840
},
{
"epoch": 0.3011135857461025,
"grad_norm": 2.4049665927886963,
"learning_rate": 2.4497870511716235e-05,
"loss": 0.8077,
"step": 845
},
{
"epoch": 0.3028953229398664,
"grad_norm": 2.140543222427368,
"learning_rate": 2.3875071764202563e-05,
"loss": 0.8288,
"step": 850
},
{
"epoch": 0.3046770601336303,
"grad_norm": 2.6508686542510986,
"learning_rate": 2.3257799383316798e-05,
"loss": 0.848,
"step": 855
},
{
"epoch": 0.30645879732739423,
"grad_norm": 2.6622097492218018,
"learning_rate": 2.264618394447927e-05,
"loss": 0.8133,
"step": 860
},
{
"epoch": 0.3082405345211581,
"grad_norm": 2.2243332862854004,
"learning_rate": 2.2040354826462668e-05,
"loss": 0.8227,
"step": 865
},
{
"epoch": 0.310022271714922,
"grad_norm": 2.4186229705810547,
"learning_rate": 2.1440440184023564e-05,
"loss": 0.7982,
"step": 870
},
{
"epoch": 0.311804008908686,
"grad_norm": 2.1508822441101074,
"learning_rate": 2.0846566920793266e-05,
"loss": 0.8421,
"step": 875
},
{
"epoch": 0.3135857461024499,
"grad_norm": 2.5740039348602295,
"learning_rate": 2.0258860662432942e-05,
"loss": 0.8337,
"step": 880
},
{
"epoch": 0.31536748329621384,
"grad_norm": 2.060276985168457,
"learning_rate": 1.967744573005934e-05,
"loss": 0.8319,
"step": 885
},
{
"epoch": 0.31714922048997773,
"grad_norm": 2.0549917221069336,
"learning_rate": 1.9102445113946343e-05,
"loss": 0.7851,
"step": 890
},
{
"epoch": 0.31893095768374163,
"grad_norm": 2.7247533798217773,
"learning_rate": 1.8533980447508137e-05,
"loss": 0.8113,
"step": 895
},
{
"epoch": 0.3207126948775056,
"grad_norm": 2.852099657058716,
"learning_rate": 1.797217198156924e-05,
"loss": 0.8502,
"step": 900
},
{
"epoch": 0.3224944320712695,
"grad_norm": 2.2780370712280273,
"learning_rate": 1.7417138558927244e-05,
"loss": 0.8175,
"step": 905
},
{
"epoch": 0.3242761692650334,
"grad_norm": 2.220999240875244,
"learning_rate": 1.6868997589213136e-05,
"loss": 0.8107,
"step": 910
},
{
"epoch": 0.32605790645879734,
"grad_norm": 2.26967191696167,
"learning_rate": 1.6327865024054984e-05,
"loss": 0.815,
"step": 915
},
{
"epoch": 0.32783964365256124,
"grad_norm": 3.1814401149749756,
"learning_rate": 1.5793855332550005e-05,
"loss": 0.8274,
"step": 920
},
{
"epoch": 0.32962138084632514,
"grad_norm": 2.5263116359710693,
"learning_rate": 1.526708147705013e-05,
"loss": 0.8126,
"step": 925
},
{
"epoch": 0.3314031180400891,
"grad_norm": 2.7154064178466797,
"learning_rate": 1.4747654889266476e-05,
"loss": 0.8147,
"step": 930
},
{
"epoch": 0.333184855233853,
"grad_norm": 2.2681655883789062,
"learning_rate": 1.4235685446697433e-05,
"loss": 0.8247,
"step": 935
},
{
"epoch": 0.33496659242761695,
"grad_norm": 1.97934889793396,
"learning_rate": 1.373128144938563e-05,
"loss": 0.7941,
"step": 940
},
{
"epoch": 0.33674832962138085,
"grad_norm": 2.35060977935791,
"learning_rate": 1.3234549597008571e-05,
"loss": 0.8306,
"step": 945
},
{
"epoch": 0.33853006681514475,
"grad_norm": 2.231822967529297,
"learning_rate": 1.2745594966307823e-05,
"loss": 0.8044,
"step": 950
},
{
"epoch": 0.3403118040089087,
"grad_norm": 2.1318812370300293,
"learning_rate": 1.22645209888614e-05,
"loss": 0.7989,
"step": 955
},
{
"epoch": 0.3420935412026726,
"grad_norm": 2.565772294998169,
"learning_rate": 1.1791429429204342e-05,
"loss": 0.7852,
"step": 960
},
{
"epoch": 0.3438752783964365,
"grad_norm": 2.2323334217071533,
"learning_rate": 1.132642036330181e-05,
"loss": 0.798,
"step": 965
},
{
"epoch": 0.34565701559020046,
"grad_norm": 2.159836769104004,
"learning_rate": 1.0869592157379304e-05,
"loss": 0.7913,
"step": 970
},
{
"epoch": 0.34743875278396436,
"grad_norm": 2.292523145675659,
"learning_rate": 1.0421041447114838e-05,
"loss": 0.8303,
"step": 975
},
{
"epoch": 0.3492204899777283,
"grad_norm": 2.540412187576294,
"learning_rate": 9.980863117196815e-06,
"loss": 0.8174,
"step": 980
},
{
"epoch": 0.3510022271714922,
"grad_norm": 2.6382853984832764,
"learning_rate": 9.549150281252633e-06,
"loss": 0.7803,
"step": 985
},
{
"epoch": 0.3527839643652561,
"grad_norm": 2.3101236820220947,
"learning_rate": 9.125994262151682e-06,
"loss": 0.8372,
"step": 990
},
{
"epoch": 0.35456570155902006,
"grad_norm": 2.285560131072998,
"learning_rate": 8.711484572687296e-06,
"loss": 0.7965,
"step": 995
},
{
"epoch": 0.35634743875278396,
"grad_norm": 1.8707315921783447,
"learning_rate": 8.305708896641594e-06,
"loss": 0.8255,
"step": 1000
},
{
"epoch": 0.35634743875278396,
"eval_loss": 0.7969963550567627,
"eval_runtime": 258.8113,
"eval_samples_per_second": 19.277,
"eval_steps_per_second": 2.411,
"step": 1000
}
],
"logging_steps": 5,
"max_steps": 1200,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1495650375386112.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}