{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 500,
"global_step": 2490,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.040160642570281124,
"grad_norm": 3.447462558746338,
"learning_rate": 7.2e-06,
"loss": 0.5292,
"step": 10
},
{
"epoch": 0.08032128514056225,
"grad_norm": 1.379734754562378,
"learning_rate": 1.52e-05,
"loss": 0.2833,
"step": 20
},
{
"epoch": 0.12048192771084337,
"grad_norm": 1.1571083068847656,
"learning_rate": 2.32e-05,
"loss": 0.2114,
"step": 30
},
{
"epoch": 0.1606425702811245,
"grad_norm": 1.3929016590118408,
"learning_rate": 3.12e-05,
"loss": 0.2085,
"step": 40
},
{
"epoch": 0.20080321285140562,
"grad_norm": 1.096237301826477,
"learning_rate": 3.9200000000000004e-05,
"loss": 0.1772,
"step": 50
},
{
"epoch": 0.24096385542168675,
"grad_norm": 0.8818596601486206,
"learning_rate": 4.72e-05,
"loss": 0.1577,
"step": 60
},
{
"epoch": 0.28112449799196787,
"grad_norm": 0.775509774684906,
"learning_rate": 5.520000000000001e-05,
"loss": 0.1603,
"step": 70
},
{
"epoch": 0.321285140562249,
"grad_norm": 1.0346589088439941,
"learning_rate": 6.32e-05,
"loss": 0.1372,
"step": 80
},
{
"epoch": 0.3614457831325301,
"grad_norm": 0.8337526917457581,
"learning_rate": 7.12e-05,
"loss": 0.1263,
"step": 90
},
{
"epoch": 0.40160642570281124,
"grad_norm": 0.9970714449882507,
"learning_rate": 7.920000000000001e-05,
"loss": 0.1232,
"step": 100
},
{
"epoch": 0.44176706827309237,
"grad_norm": 0.47735488414764404,
"learning_rate": 8.72e-05,
"loss": 0.113,
"step": 110
},
{
"epoch": 0.4819277108433735,
"grad_norm": 1.0498414039611816,
"learning_rate": 9.52e-05,
"loss": 0.105,
"step": 120
},
{
"epoch": 0.5220883534136547,
"grad_norm": 0.5586184859275818,
"learning_rate": 9.999929417599468e-05,
"loss": 0.1085,
"step": 130
},
{
"epoch": 0.5622489959839357,
"grad_norm": 0.7290149927139282,
"learning_rate": 9.999135388478797e-05,
"loss": 0.1037,
"step": 140
},
{
"epoch": 0.6024096385542169,
"grad_norm": 0.9249761700630188,
"learning_rate": 9.997459242813312e-05,
"loss": 0.085,
"step": 150
},
{
"epoch": 0.642570281124498,
"grad_norm": 0.6690048575401306,
"learning_rate": 9.994901276365323e-05,
"loss": 0.0865,
"step": 160
},
{
"epoch": 0.6827309236947792,
"grad_norm": 0.4290846288204193,
"learning_rate": 9.991461940497786e-05,
"loss": 0.0848,
"step": 170
},
{
"epoch": 0.7228915662650602,
"grad_norm": 0.6786843538284302,
"learning_rate": 9.987141842094658e-05,
"loss": 0.089,
"step": 180
},
{
"epoch": 0.7630522088353414,
"grad_norm": 0.9659805297851562,
"learning_rate": 9.981941743453815e-05,
"loss": 0.0819,
"step": 190
},
{
"epoch": 0.8032128514056225,
"grad_norm": 0.7636594772338867,
"learning_rate": 9.975862562152532e-05,
"loss": 0.0878,
"step": 200
},
{
"epoch": 0.8433734939759037,
"grad_norm": 0.6373817324638367,
"learning_rate": 9.968905370885586e-05,
"loss": 0.0767,
"step": 210
},
{
"epoch": 0.8835341365461847,
"grad_norm": 0.7489560842514038,
"learning_rate": 9.961071397275963e-05,
"loss": 0.0724,
"step": 220
},
{
"epoch": 0.9236947791164659,
"grad_norm": 0.588756263256073,
"learning_rate": 9.952362023658249e-05,
"loss": 0.0685,
"step": 230
},
{
"epoch": 0.963855421686747,
"grad_norm": 0.4825253486633301,
"learning_rate": 9.9427787868347e-05,
"loss": 0.0705,
"step": 240
},
{
"epoch": 1.0040160642570282,
"grad_norm": 0.4361709654331207,
"learning_rate": 9.93232337780408e-05,
"loss": 0.0692,
"step": 250
},
{
"epoch": 1.0441767068273093,
"grad_norm": 0.6188638210296631,
"learning_rate": 9.920997641463272e-05,
"loss": 0.0682,
"step": 260
},
{
"epoch": 1.0843373493975903,
"grad_norm": 0.4582713842391968,
"learning_rate": 9.908803576281736e-05,
"loss": 0.0731,
"step": 270
},
{
"epoch": 1.1244979919678715,
"grad_norm": 0.6378023028373718,
"learning_rate": 9.895743333948874e-05,
"loss": 0.0832,
"step": 280
},
{
"epoch": 1.1646586345381527,
"grad_norm": 0.45142269134521484,
"learning_rate": 9.881819218994365e-05,
"loss": 0.0707,
"step": 290
},
{
"epoch": 1.2048192771084336,
"grad_norm": 0.5397023558616638,
"learning_rate": 9.867033688381502e-05,
"loss": 0.0747,
"step": 300
},
{
"epoch": 1.2449799196787148,
"grad_norm": 0.8213003277778625,
"learning_rate": 9.85138935107367e-05,
"loss": 0.0691,
"step": 310
},
{
"epoch": 1.285140562248996,
"grad_norm": 0.7952361106872559,
"learning_rate": 9.834888967573976e-05,
"loss": 0.0689,
"step": 320
},
{
"epoch": 1.3253012048192772,
"grad_norm": 0.8249073624610901,
"learning_rate": 9.817535449438148e-05,
"loss": 0.075,
"step": 330
},
{
"epoch": 1.3654618473895583,
"grad_norm": 0.7501718997955322,
"learning_rate": 9.799331858760786e-05,
"loss": 0.0657,
"step": 340
},
{
"epoch": 1.4056224899598393,
"grad_norm": 0.5428710579872131,
"learning_rate": 9.78028140763503e-05,
"loss": 0.0657,
"step": 350
},
{
"epoch": 1.4457831325301205,
"grad_norm": 0.6559877395629883,
"learning_rate": 9.76038745758579e-05,
"loss": 0.066,
"step": 360
},
{
"epoch": 1.4859437751004017,
"grad_norm": 0.566528856754303,
"learning_rate": 9.739653518976581e-05,
"loss": 0.0614,
"step": 370
},
{
"epoch": 1.5261044176706826,
"grad_norm": 0.4224630892276764,
"learning_rate": 9.718083250390113e-05,
"loss": 0.0662,
"step": 380
},
{
"epoch": 1.5662650602409638,
"grad_norm": 0.5740475058555603,
"learning_rate": 9.695680457982713e-05,
"loss": 0.065,
"step": 390
},
{
"epoch": 1.606425702811245,
"grad_norm": 0.4563257098197937,
"learning_rate": 9.67244909481272e-05,
"loss": 0.0672,
"step": 400
},
{
"epoch": 1.6465863453815262,
"grad_norm": 0.5204518437385559,
"learning_rate": 9.648393260142948e-05,
"loss": 0.0567,
"step": 410
},
{
"epoch": 1.6867469879518073,
"grad_norm": 0.47403523325920105,
"learning_rate": 9.623517198717362e-05,
"loss": 0.0572,
"step": 420
},
{
"epoch": 1.7269076305220885,
"grad_norm": 0.6863958835601807,
"learning_rate": 9.597825300012073e-05,
"loss": 0.0616,
"step": 430
},
{
"epoch": 1.7670682730923695,
"grad_norm": 0.4068869352340698,
"learning_rate": 9.571322097460793e-05,
"loss": 0.0689,
"step": 440
},
{
"epoch": 1.8072289156626506,
"grad_norm": 0.43284496665000916,
"learning_rate": 9.544012267654901e-05,
"loss": 0.0633,
"step": 450
},
{
"epoch": 1.8473895582329316,
"grad_norm": 0.9921565651893616,
"learning_rate": 9.51590062951824e-05,
"loss": 0.0653,
"step": 460
},
{
"epoch": 1.8875502008032128,
"grad_norm": 0.6336984634399414,
"learning_rate": 9.486992143456792e-05,
"loss": 0.0622,
"step": 470
},
{
"epoch": 1.927710843373494,
"grad_norm": 0.48028799891471863,
"learning_rate": 9.457291910483409e-05,
"loss": 0.0578,
"step": 480
},
{
"epoch": 1.9678714859437751,
"grad_norm": 0.6273623108863831,
"learning_rate": 9.426805171317701e-05,
"loss": 0.0546,
"step": 490
},
{
"epoch": 2.0080321285140563,
"grad_norm": 0.7483130097389221,
"learning_rate": 9.395537305461311e-05,
"loss": 0.0505,
"step": 500
},
{
"epoch": 2.0481927710843375,
"grad_norm": 0.5147470235824585,
"learning_rate": 9.363493830248666e-05,
"loss": 0.0557,
"step": 510
},
{
"epoch": 2.0883534136546187,
"grad_norm": 0.433223694562912,
"learning_rate": 9.33068039987343e-05,
"loss": 0.051,
"step": 520
},
{
"epoch": 2.1285140562248994,
"grad_norm": 0.440390408039093,
"learning_rate": 9.297102804390798e-05,
"loss": 0.0571,
"step": 530
},
{
"epoch": 2.1686746987951806,
"grad_norm": 0.6565499305725098,
"learning_rate": 9.26276696869582e-05,
"loss": 0.0523,
"step": 540
},
{
"epoch": 2.208835341365462,
"grad_norm": 0.4944412112236023,
"learning_rate": 9.227678951477925e-05,
"loss": 0.0532,
"step": 550
},
{
"epoch": 2.248995983935743,
"grad_norm": 0.48403704166412354,
"learning_rate": 9.19184494415185e-05,
"loss": 0.0575,
"step": 560
},
{
"epoch": 2.289156626506024,
"grad_norm": 0.6252164244651794,
"learning_rate": 9.15527126976514e-05,
"loss": 0.0541,
"step": 570
},
{
"epoch": 2.3293172690763053,
"grad_norm": 0.48273801803588867,
"learning_rate": 9.117964381882413e-05,
"loss": 0.0518,
"step": 580
},
{
"epoch": 2.3694779116465865,
"grad_norm": 0.7384123206138611,
"learning_rate": 9.079930863446612e-05,
"loss": 0.0589,
"step": 590
},
{
"epoch": 2.4096385542168672,
"grad_norm": 0.48452362418174744,
"learning_rate": 9.041177425617427e-05,
"loss": 0.0567,
"step": 600
},
{
"epoch": 2.4497991967871484,
"grad_norm": 0.46295881271362305,
"learning_rate": 9.001710906587064e-05,
"loss": 0.0561,
"step": 610
},
{
"epoch": 2.4899598393574296,
"grad_norm": 0.49404817819595337,
"learning_rate": 8.961538270373639e-05,
"loss": 0.0517,
"step": 620
},
{
"epoch": 2.5301204819277108,
"grad_norm": 0.4810844361782074,
"learning_rate": 8.920666605592341e-05,
"loss": 0.0554,
"step": 630
},
{
"epoch": 2.570281124497992,
"grad_norm": 0.3409781754016876,
"learning_rate": 8.879103124204626e-05,
"loss": 0.0509,
"step": 640
},
{
"epoch": 2.610441767068273,
"grad_norm": 0.661351203918457,
"learning_rate": 8.836855160245629e-05,
"loss": 0.0575,
"step": 650
},
{
"epoch": 2.6506024096385543,
"grad_norm": 0.4867478013038635,
"learning_rate": 8.79393016853005e-05,
"loss": 0.055,
"step": 660
},
{
"epoch": 2.6907630522088355,
"grad_norm": 0.8102213144302368,
"learning_rate": 8.750335723336728e-05,
"loss": 0.0554,
"step": 670
},
{
"epoch": 2.7309236947791167,
"grad_norm": 0.5746546387672424,
"learning_rate": 8.706079517072127e-05,
"loss": 0.0529,
"step": 680
},
{
"epoch": 2.7710843373493974,
"grad_norm": 0.6538225412368774,
"learning_rate": 8.661169358912978e-05,
"loss": 0.0473,
"step": 690
},
{
"epoch": 2.8112449799196786,
"grad_norm": 0.5305560231208801,
"learning_rate": 8.615613173428321e-05,
"loss": 0.0613,
"step": 700
},
{
"epoch": 2.8514056224899598,
"grad_norm": 0.6382079124450684,
"learning_rate": 8.569418999181194e-05,
"loss": 0.0477,
"step": 710
},
{
"epoch": 2.891566265060241,
"grad_norm": 0.5018407106399536,
"learning_rate": 8.522594987310184e-05,
"loss": 0.0454,
"step": 720
},
{
"epoch": 2.931726907630522,
"grad_norm": 0.3260187804698944,
"learning_rate": 8.475149400091137e-05,
"loss": 0.0499,
"step": 730
},
{
"epoch": 2.9718875502008033,
"grad_norm": 0.39027342200279236,
"learning_rate": 8.427090609479245e-05,
"loss": 0.0528,
"step": 740
},
{
"epoch": 3.0120481927710845,
"grad_norm": 0.5888795256614685,
"learning_rate": 8.378427095631776e-05,
"loss": 0.0521,
"step": 750
},
{
"epoch": 3.0522088353413657,
"grad_norm": 0.6883623003959656,
"learning_rate": 8.329167445411732e-05,
"loss": 0.0449,
"step": 760
},
{
"epoch": 3.0923694779116464,
"grad_norm": 0.6276723742485046,
"learning_rate": 8.279320350872655e-05,
"loss": 0.0485,
"step": 770
},
{
"epoch": 3.1325301204819276,
"grad_norm": 0.48254498839378357,
"learning_rate": 8.228894607724878e-05,
"loss": 0.0473,
"step": 780
},
{
"epoch": 3.1726907630522088,
"grad_norm": 0.34066540002822876,
"learning_rate": 8.177899113783492e-05,
"loss": 0.0542,
"step": 790
},
{
"epoch": 3.21285140562249,
"grad_norm": 0.4851544499397278,
"learning_rate": 8.126342867398301e-05,
"loss": 0.0491,
"step": 800
},
{
"epoch": 3.253012048192771,
"grad_norm": 0.5018098950386047,
"learning_rate": 8.074234965866012e-05,
"loss": 0.0467,
"step": 810
},
{
"epoch": 3.2931726907630523,
"grad_norm": 0.47141626477241516,
"learning_rate": 8.021584603824996e-05,
"loss": 0.0497,
"step": 820
},
{
"epoch": 3.3333333333333335,
"grad_norm": 0.5625479221343994,
"learning_rate": 7.968401071632855e-05,
"loss": 0.0515,
"step": 830
},
{
"epoch": 3.3734939759036147,
"grad_norm": 0.38696932792663574,
"learning_rate": 7.914693753727091e-05,
"loss": 0.05,
"step": 840
},
{
"epoch": 3.4136546184738954,
"grad_norm": 0.4147994816303253,
"learning_rate": 7.860472126969213e-05,
"loss": 0.0514,
"step": 850
},
{
"epoch": 3.4538152610441766,
"grad_norm": 0.38693082332611084,
"learning_rate": 7.805745758972481e-05,
"loss": 0.0504,
"step": 860
},
{
"epoch": 3.4939759036144578,
"grad_norm": 0.3496777415275574,
"learning_rate": 7.75052430641368e-05,
"loss": 0.0458,
"step": 870
},
{
"epoch": 3.534136546184739,
"grad_norm": 0.3338903784751892,
"learning_rate": 7.694817513329159e-05,
"loss": 0.0413,
"step": 880
},
{
"epoch": 3.57429718875502,
"grad_norm": 0.4792289137840271,
"learning_rate": 7.638635209395453e-05,
"loss": 0.0461,
"step": 890
},
{
"epoch": 3.6144578313253013,
"grad_norm": 0.3006054162979126,
"learning_rate": 7.58198730819481e-05,
"loss": 0.0402,
"step": 900
},
{
"epoch": 3.6546184738955825,
"grad_norm": 0.2644276022911072,
"learning_rate": 7.524883805465888e-05,
"loss": 0.0386,
"step": 910
},
{
"epoch": 3.694779116465863,
"grad_norm": 0.4371125400066376,
"learning_rate": 7.467334777339985e-05,
"loss": 0.0411,
"step": 920
},
{
"epoch": 3.734939759036145,
"grad_norm": 0.5912031531333923,
"learning_rate": 7.409350378563046e-05,
"loss": 0.0474,
"step": 930
},
{
"epoch": 3.7751004016064256,
"grad_norm": 0.29996731877326965,
"learning_rate": 7.350940840703842e-05,
"loss": 0.0522,
"step": 940
},
{
"epoch": 3.8152610441767068,
"grad_norm": 0.49531587958335876,
"learning_rate": 7.292116470348554e-05,
"loss": 0.0514,
"step": 950
},
{
"epoch": 3.855421686746988,
"grad_norm": 0.44657862186431885,
"learning_rate": 7.232887647282147e-05,
"loss": 0.0432,
"step": 960
},
{
"epoch": 3.895582329317269,
"grad_norm": 0.36731448769569397,
"learning_rate": 7.173264822656806e-05,
"loss": 0.0468,
"step": 970
},
{
"epoch": 3.9357429718875503,
"grad_norm": 0.6286973357200623,
"learning_rate": 7.113258517147801e-05,
"loss": 0.048,
"step": 980
},
{
"epoch": 3.9759036144578315,
"grad_norm": 0.41374683380126953,
"learning_rate": 7.052879319097072e-05,
"loss": 0.0478,
"step": 990
},
{
"epoch": 4.016064257028113,
"grad_norm": 0.4046255052089691,
"learning_rate": 6.992137882644868e-05,
"loss": 0.044,
"step": 1000
},
{
"epoch": 4.056224899598393,
"grad_norm": 0.47352728247642517,
"learning_rate": 6.931044925849789e-05,
"loss": 0.0487,
"step": 1010
},
{
"epoch": 4.096385542168675,
"grad_norm": 0.445796936750412,
"learning_rate": 6.869611228797546e-05,
"loss": 0.0432,
"step": 1020
},
{
"epoch": 4.136546184738956,
"grad_norm": 0.5685417652130127,
"learning_rate": 6.807847631698769e-05,
"loss": 0.0423,
"step": 1030
},
{
"epoch": 4.176706827309237,
"grad_norm": 0.31597232818603516,
"learning_rate": 6.745765032976214e-05,
"loss": 0.0415,
"step": 1040
},
{
"epoch": 4.216867469879518,
"grad_norm": 0.3107571005821228,
"learning_rate": 6.683374387341687e-05,
"loss": 0.0428,
"step": 1050
},
{
"epoch": 4.257028112449799,
"grad_norm": 0.5054974555969238,
"learning_rate": 6.620686703863054e-05,
"loss": 0.04,
"step": 1060
},
{
"epoch": 4.2971887550200805,
"grad_norm": 0.3022196590900421,
"learning_rate": 6.557713044021642e-05,
"loss": 0.0408,
"step": 1070
},
{
"epoch": 4.337349397590361,
"grad_norm": 0.39395079016685486,
"learning_rate": 6.494464519760401e-05,
"loss": 0.0455,
"step": 1080
},
{
"epoch": 4.377510040160643,
"grad_norm": 0.3358634114265442,
"learning_rate": 6.430952291523158e-05,
"loss": 0.0362,
"step": 1090
},
{
"epoch": 4.417670682730924,
"grad_norm": 0.39022529125213623,
"learning_rate": 6.367187566285314e-05,
"loss": 0.0443,
"step": 1100
},
{
"epoch": 4.457831325301205,
"grad_norm": 0.3976607620716095,
"learning_rate": 6.303181595576328e-05,
"loss": 0.0421,
"step": 1110
},
{
"epoch": 4.497991967871486,
"grad_norm": 0.47239330410957336,
"learning_rate": 6.238945673494354e-05,
"loss": 0.044,
"step": 1120
},
{
"epoch": 4.538152610441767,
"grad_norm": 0.23172323405742645,
"learning_rate": 6.174491134713332e-05,
"loss": 0.044,
"step": 1130
},
{
"epoch": 4.578313253012048,
"grad_norm": 0.5892140865325928,
"learning_rate": 6.109829352482964e-05,
"loss": 0.0428,
"step": 1140
},
{
"epoch": 4.618473895582329,
"grad_norm": 0.9103882908821106,
"learning_rate": 6.044971736621842e-05,
"loss": 0.0433,
"step": 1150
},
{
"epoch": 4.658634538152611,
"grad_norm": 0.3588651716709137,
"learning_rate": 5.979929731504158e-05,
"loss": 0.0423,
"step": 1160
},
{
"epoch": 4.698795180722891,
"grad_norm": 0.5172322988510132,
"learning_rate": 5.91471481404029e-05,
"loss": 0.042,
"step": 1170
},
{
"epoch": 4.738955823293173,
"grad_norm": 0.38006845116615295,
"learning_rate": 5.849338491651661e-05,
"loss": 0.0394,
"step": 1180
},
{
"epoch": 4.779116465863454,
"grad_norm": 0.3825148046016693,
"learning_rate": 5.783812300240209e-05,
"loss": 0.0355,
"step": 1190
},
{
"epoch": 4.8192771084337345,
"grad_norm": 0.6012735366821289,
"learning_rate": 5.718147802152833e-05,
"loss": 0.0431,
"step": 1200
},
{
"epoch": 4.859437751004016,
"grad_norm": 0.41699841618537903,
"learning_rate": 5.652356584141177e-05,
"loss": 0.0364,
"step": 1210
},
{
"epoch": 4.899598393574297,
"grad_norm": 0.3592054843902588,
"learning_rate": 5.586450255317097e-05,
"loss": 0.036,
"step": 1220
},
{
"epoch": 4.9397590361445785,
"grad_norm": 0.5711321234703064,
"learning_rate": 5.5204404451041894e-05,
"loss": 0.0366,
"step": 1230
},
{
"epoch": 4.979919678714859,
"grad_norm": 0.44127699732780457,
"learning_rate": 5.4543388011857456e-05,
"loss": 0.0316,
"step": 1240
},
{
"epoch": 5.020080321285141,
"grad_norm": 0.4283978044986725,
"learning_rate": 5.388156987449454e-05,
"loss": 0.0467,
"step": 1250
},
{
"epoch": 5.0602409638554215,
"grad_norm": 0.3291616439819336,
"learning_rate": 5.321906681929284e-05,
"loss": 0.0411,
"step": 1260
},
{
"epoch": 5.100401606425703,
"grad_norm": 0.2523420751094818,
"learning_rate": 5.2555995747448364e-05,
"loss": 0.0415,
"step": 1270
},
{
"epoch": 5.140562248995984,
"grad_norm": 0.5164968967437744,
"learning_rate": 5.189247366038583e-05,
"loss": 0.0381,
"step": 1280
},
{
"epoch": 5.180722891566265,
"grad_norm": 0.2963825464248657,
"learning_rate": 5.1228617639113355e-05,
"loss": 0.0391,
"step": 1290
},
{
"epoch": 5.220883534136546,
"grad_norm": 0.39251625537872314,
"learning_rate": 5.0564544823562945e-05,
"loss": 0.0397,
"step": 1300
},
{
"epoch": 5.261044176706827,
"grad_norm": 1.2152100801467896,
"learning_rate": 4.9900372391920875e-05,
"loss": 0.0366,
"step": 1310
},
{
"epoch": 5.301204819277109,
"grad_norm": 0.35082605481147766,
"learning_rate": 4.923621753995099e-05,
"loss": 0.0312,
"step": 1320
},
{
"epoch": 5.341365461847389,
"grad_norm": 0.505351722240448,
"learning_rate": 4.85721974603152e-05,
"loss": 0.036,
"step": 1330
},
{
"epoch": 5.381526104417671,
"grad_norm": 0.2755846679210663,
"learning_rate": 4.790842932189424e-05,
"loss": 0.0352,
"step": 1340
},
{
"epoch": 5.421686746987952,
"grad_norm": 0.4208662807941437,
"learning_rate": 4.724503024911292e-05,
"loss": 0.0351,
"step": 1350
},
{
"epoch": 5.461847389558233,
"grad_norm": 0.5660949945449829,
"learning_rate": 4.6582117301273006e-05,
"loss": 0.0347,
"step": 1360
},
{
"epoch": 5.502008032128514,
"grad_norm": 0.3927326202392578,
"learning_rate": 4.591980745189762e-05,
"loss": 0.0326,
"step": 1370
},
{
"epoch": 5.542168674698795,
"grad_norm": 0.39894360303878784,
"learning_rate": 4.5258217568090876e-05,
"loss": 0.0355,
"step": 1380
},
{
"epoch": 5.582329317269076,
"grad_norm": 0.3474249541759491,
"learning_rate": 4.4597464389916204e-05,
"loss": 0.0361,
"step": 1390
},
{
"epoch": 5.622489959839357,
"grad_norm": 0.4053770899772644,
"learning_rate": 4.3937664509797173e-05,
"loss": 0.0371,
"step": 1400
},
{
"epoch": 5.662650602409639,
"grad_norm": 0.30829307436943054,
"learning_rate": 4.3278934351944185e-05,
"loss": 0.0402,
"step": 1410
},
{
"epoch": 5.7028112449799195,
"grad_norm": 0.396782249212265,
"learning_rate": 4.262139015181111e-05,
"loss": 0.0388,
"step": 1420
},
{
"epoch": 5.742971887550201,
"grad_norm": 0.5452107191085815,
"learning_rate": 4.196514793558508e-05,
"loss": 0.0358,
"step": 1430
},
{
"epoch": 5.783132530120482,
"grad_norm": 0.35190486907958984,
"learning_rate": 4.13103234997131e-05,
"loss": 0.038,
"step": 1440
},
{
"epoch": 5.823293172690763,
"grad_norm": 0.43659260869026184,
"learning_rate": 4.065703239046951e-05,
"loss": 0.035,
"step": 1450
},
{
"epoch": 5.863453815261044,
"grad_norm": 0.27444201707839966,
"learning_rate": 4.000538988356723e-05,
"loss": 0.0349,
"step": 1460
},
{
"epoch": 5.903614457831325,
"grad_norm": 0.40698736906051636,
"learning_rate": 3.9355510963817046e-05,
"loss": 0.0354,
"step": 1470
},
{
"epoch": 5.943775100401607,
"grad_norm": 0.40334540605545044,
"learning_rate": 3.8707510304838e-05,
"loss": 0.0379,
"step": 1480
},
{
"epoch": 5.983935742971887,
"grad_norm": 0.4215904474258423,
"learning_rate": 3.80615022488229e-05,
"loss": 0.0344,
"step": 1490
},
{
"epoch": 6.024096385542169,
"grad_norm": 0.30104267597198486,
"learning_rate": 3.741760078636213e-05,
"loss": 0.0338,
"step": 1500
},
{
"epoch": 6.06425702811245,
"grad_norm": 0.6103472709655762,
"learning_rate": 3.677591953632955e-05,
"loss": 0.0363,
"step": 1510
},
{
"epoch": 6.104417670682731,
"grad_norm": 0.40117689967155457,
"learning_rate": 3.6136571725834116e-05,
"loss": 0.0327,
"step": 1520
},
{
"epoch": 6.144578313253012,
"grad_norm": 0.2717328667640686,
"learning_rate": 3.5499670170240395e-05,
"loss": 0.0385,
"step": 1530
},
{
"epoch": 6.184738955823293,
"grad_norm": 0.28105250000953674,
"learning_rate": 3.486532725326199e-05,
"loss": 0.0343,
"step": 1540
},
{
"epoch": 6.224899598393574,
"grad_norm": 0.3413033187389374,
"learning_rate": 3.4233654907130875e-05,
"loss": 0.034,
"step": 1550
},
{
"epoch": 6.265060240963855,
"grad_norm": 0.2755061388015747,
"learning_rate": 3.3604764592846636e-05,
"loss": 0.0326,
"step": 1560
},
{
"epoch": 6.305220883534137,
"grad_norm": 0.7181720733642578,
"learning_rate": 3.2978767280508736e-05,
"loss": 0.0347,
"step": 1570
},
{
"epoch": 6.3453815261044175,
"grad_norm": 0.2261345237493515,
"learning_rate": 3.2355773429735314e-05,
"loss": 0.03,
"step": 1580
},
{
"epoch": 6.385542168674699,
"grad_norm": 0.5460085868835449,
"learning_rate": 3.17358929701723e-05,
"loss": 0.0304,
"step": 1590
},
{
"epoch": 6.42570281124498,
"grad_norm": 0.955663800239563,
"learning_rate": 3.111923528209577e-05,
"loss": 0.0338,
"step": 1600
},
{
"epoch": 6.4658634538152615,
"grad_norm": 0.26345881819725037,
"learning_rate": 3.0505909177111574e-05,
"loss": 0.032,
"step": 1610
},
{
"epoch": 6.506024096385542,
"grad_norm": 0.8473414778709412,
"learning_rate": 2.9896022878954878e-05,
"loss": 0.0314,
"step": 1620
},
{
"epoch": 6.546184738955823,
"grad_norm": 0.2904120087623596,
"learning_rate": 2.9289684004393836e-05,
"loss": 0.0317,
"step": 1630
},
{
"epoch": 6.586345381526105,
"grad_norm": 0.4706421196460724,
"learning_rate": 2.8686999544240172e-05,
"loss": 0.0337,
"step": 1640
},
{
"epoch": 6.626506024096385,
"grad_norm": 0.4121145009994507,
"learning_rate": 2.808807584447018e-05,
"loss": 0.0279,
"step": 1650
},
{
"epoch": 6.666666666666667,
"grad_norm": 0.3063754737377167,
"learning_rate": 2.7493018587459628e-05,
"loss": 0.033,
"step": 1660
},
{
"epoch": 6.706827309236948,
"grad_norm": 0.5062171220779419,
"learning_rate": 2.6901932773335692e-05,
"loss": 0.0419,
"step": 1670
},
{
"epoch": 6.746987951807229,
"grad_norm": 0.41636672616004944,
"learning_rate": 2.6314922701449286e-05,
"loss": 0.0283,
"step": 1680
},
{
"epoch": 6.78714859437751,
"grad_norm": 0.6910147666931152,
"learning_rate": 2.5732091951970937e-05,
"loss": 0.0319,
"step": 1690
},
{
"epoch": 6.827309236947791,
"grad_norm": 0.6493785381317139,
"learning_rate": 2.515354336761391e-05,
"loss": 0.0335,
"step": 1700
},
{
"epoch": 6.867469879518072,
"grad_norm": 0.3420906960964203,
"learning_rate": 2.457937903548695e-05,
"loss": 0.0306,
"step": 1710
},
{
"epoch": 6.907630522088353,
"grad_norm": 0.33870965242385864,
"learning_rate": 2.4009700269080793e-05,
"loss": 0.0298,
"step": 1720
},
{
"epoch": 6.947791164658635,
"grad_norm": 0.3856060802936554,
"learning_rate": 2.344460759039097e-05,
"loss": 0.0278,
"step": 1730
},
{
"epoch": 6.9879518072289155,
"grad_norm": 0.3366270363330841,
"learning_rate": 2.2884200712180227e-05,
"loss": 0.0351,
"step": 1740
},
{
"epoch": 7.028112449799197,
"grad_norm": 0.4241548180580139,
"learning_rate": 2.2328578520384037e-05,
"loss": 0.0359,
"step": 1750
},
{
"epoch": 7.068273092369478,
"grad_norm": 0.38243892788887024,
"learning_rate": 2.1777839056661554e-05,
"loss": 0.0315,
"step": 1760
},
{
"epoch": 7.108433734939759,
"grad_norm": 0.5330711603164673,
"learning_rate": 2.123207950109596e-05,
"loss": 0.0307,
"step": 1770
},
{
"epoch": 7.14859437751004,
"grad_norm": 0.29397761821746826,
"learning_rate": 2.0691396155046595e-05,
"loss": 0.0279,
"step": 1780
},
{
"epoch": 7.188755020080321,
"grad_norm": 0.3870997130870819,
"learning_rate": 2.0155884424156242e-05,
"loss": 0.0332,
"step": 1790
},
{
"epoch": 7.228915662650603,
"grad_norm": 0.9195613861083984,
"learning_rate": 1.9625638801516407e-05,
"loss": 0.0309,
"step": 1800
},
{
"epoch": 7.269076305220883,
"grad_norm": 0.3219093680381775,
"learning_rate": 1.9100752850993687e-05,
"loss": 0.0281,
"step": 1810
},
{
"epoch": 7.309236947791165,
"grad_norm": 0.2664417028427124,
"learning_rate": 1.8581319190720035e-05,
"loss": 0.0289,
"step": 1820
},
{
"epoch": 7.349397590361446,
"grad_norm": 0.25901272892951965,
"learning_rate": 1.806742947674997e-05,
"loss": 0.0324,
"step": 1830
},
{
"epoch": 7.389558232931727,
"grad_norm": 0.3953966200351715,
"learning_rate": 1.7559174386887477e-05,
"loss": 0.0296,
"step": 1840
},
{
"epoch": 7.429718875502008,
"grad_norm": 1.268621802330017,
"learning_rate": 1.7056643604685596e-05,
"loss": 0.0255,
"step": 1850
},
{
"epoch": 7.469879518072289,
"grad_norm": 0.2772993743419647,
"learning_rate": 1.65599258036214e-05,
"loss": 0.0318,
"step": 1860
},
{
"epoch": 7.51004016064257,
"grad_norm": 0.22611959278583527,
"learning_rate": 1.6069108631449225e-05,
"loss": 0.0293,
"step": 1870
},
{
"epoch": 7.550200803212851,
"grad_norm": 0.27701687812805176,
"learning_rate": 1.5584278694734888e-05,
"loss": 0.0243,
"step": 1880
},
{
"epoch": 7.590361445783133,
"grad_norm": 0.31095728278160095,
"learning_rate": 1.5105521543573647e-05,
"loss": 0.0304,
"step": 1890
},
{
"epoch": 7.6305220883534135,
"grad_norm": 0.44475099444389343,
"learning_rate": 1.4632921656494469e-05,
"loss": 0.0313,
"step": 1900
},
{
"epoch": 7.670682730923695,
"grad_norm": 0.31613418459892273,
"learning_rate": 1.416656242555366e-05,
"loss": 0.0331,
"step": 1910
},
{
"epoch": 7.710843373493976,
"grad_norm": 0.31190618872642517,
"learning_rate": 1.3706526141619792e-05,
"loss": 0.0266,
"step": 1920
},
{
"epoch": 7.7510040160642575,
"grad_norm": 0.3129827082157135,
"learning_rate": 1.3252893979853304e-05,
"loss": 0.03,
"step": 1930
},
{
"epoch": 7.791164658634538,
"grad_norm": 0.2716180980205536,
"learning_rate": 1.2805745985382867e-05,
"loss": 0.0277,
"step": 1940
},
{
"epoch": 7.831325301204819,
"grad_norm": 0.2133369892835617,
"learning_rate": 1.2365161059180942e-05,
"loss": 0.0285,
"step": 1950
},
{
"epoch": 7.871485943775101,
"grad_norm": 0.18476147949695587,
"learning_rate": 1.1931216944141621e-05,
"loss": 0.0256,
"step": 1960
},
{
"epoch": 7.911646586345381,
"grad_norm": 0.39151495695114136,
"learning_rate": 1.1503990211362403e-05,
"loss": 0.0286,
"step": 1970
},
{
"epoch": 7.951807228915663,
"grad_norm": 0.3535381853580475,
"learning_rate": 1.1083556246633048e-05,
"loss": 0.0296,
"step": 1980
},
{
"epoch": 7.991967871485944,
"grad_norm": 0.9443646669387817,
"learning_rate": 1.0669989237133437e-05,
"loss": 0.027,
"step": 1990
},
{
"epoch": 8.032128514056225,
"grad_norm": 0.44257789850234985,
"learning_rate": 1.0263362158342948e-05,
"loss": 0.0306,
"step": 2000
},
{
"epoch": 8.072289156626505,
"grad_norm": 0.3381843864917755,
"learning_rate": 9.863746761163679e-06,
"loss": 0.0278,
"step": 2010
},
{
"epoch": 8.112449799196787,
"grad_norm": 0.22615911066532135,
"learning_rate": 9.471213559259684e-06,
"loss": 0.0321,
"step": 2020
},
{
"epoch": 8.152610441767068,
"grad_norm": 0.334036648273468,
"learning_rate": 9.08583181661461e-06,
"loss": 0.0264,
"step": 2030
},
{
"epoch": 8.19277108433735,
"grad_norm": 0.2973599135875702,
"learning_rate": 8.707669535309793e-06,
"loss": 0.0287,
"step": 2040
},
{
"epoch": 8.23293172690763,
"grad_norm": 0.5360568761825562,
"learning_rate": 8.33679344352501e-06,
"loss": 0.0289,
"step": 2050
},
{
"epoch": 8.273092369477911,
"grad_norm": 0.5042163729667664,
"learning_rate": 7.97326898376406e-06,
"loss": 0.0254,
"step": 2060
},
{
"epoch": 8.313253012048193,
"grad_norm": 0.191526398062706,
"learning_rate": 7.617160301307169e-06,
"loss": 0.0282,
"step": 2070
},
{
"epoch": 8.353413654618475,
"grad_norm": 0.3822993040084839,
"learning_rate": 7.268530232892317e-06,
"loss": 0.0292,
"step": 2080
},
{
"epoch": 8.393574297188755,
"grad_norm": 0.5774808526039124,
"learning_rate": 6.9274402956274686e-06,
"loss": 0.0263,
"step": 2090
},
{
"epoch": 8.433734939759036,
"grad_norm": 0.3628866970539093,
"learning_rate": 6.593950676135624e-06,
"loss": 0.0268,
"step": 2100
},
{
"epoch": 8.473895582329318,
"grad_norm": 0.4431964159011841,
"learning_rate": 6.268120219934631e-06,
"loss": 0.028,
"step": 2110
},
{
"epoch": 8.514056224899598,
"grad_norm": 0.5086953043937683,
"learning_rate": 5.950006421053772e-06,
"loss": 0.0257,
"step": 2120
},
{
"epoch": 8.55421686746988,
"grad_norm": 0.5995669364929199,
"learning_rate": 5.639665411888584e-06,
"loss": 0.0298,
"step": 2130
},
{
"epoch": 8.594377510040161,
"grad_norm": 1.7701818943023682,
"learning_rate": 5.337151953296188e-06,
"loss": 0.0269,
"step": 2140
},
{
"epoch": 8.634538152610443,
"grad_norm": 0.30067703127861023,
"learning_rate": 5.042519424932513e-06,
"loss": 0.0261,
"step": 2150
},
{
"epoch": 8.674698795180722,
"grad_norm": 0.6958829760551453,
"learning_rate": 4.755819815833174e-06,
"loss": 0.0242,
"step": 2160
},
{
"epoch": 8.714859437751004,
"grad_norm": 0.4430091679096222,
"learning_rate": 4.477103715239922e-06,
"loss": 0.0238,
"step": 2170
},
{
"epoch": 8.755020080321286,
"grad_norm": 0.359072208404541,
"learning_rate": 4.2064203036738746e-06,
"loss": 0.0245,
"step": 2180
},
{
"epoch": 8.795180722891565,
"grad_norm": 0.23414134979248047,
"learning_rate": 3.9438173442575e-06,
"loss": 0.024,
"step": 2190
},
{
"epoch": 8.835341365461847,
"grad_norm": 0.24696533381938934,
"learning_rate": 3.6893411742865814e-06,
"loss": 0.0258,
"step": 2200
},
{
"epoch": 8.875502008032129,
"grad_norm": 0.2517929673194885,
"learning_rate": 3.443036697053875e-06,
"loss": 0.0333,
"step": 2210
},
{
"epoch": 8.91566265060241,
"grad_norm": 0.7909056544303894,
"learning_rate": 3.204947373925693e-06,
"loss": 0.0272,
"step": 2220
},
{
"epoch": 8.95582329317269,
"grad_norm": 0.2984614670276642,
"learning_rate": 2.97511521667303e-06,
"loss": 0.0281,
"step": 2230
},
{
"epoch": 8.995983935742972,
"grad_norm": 0.23609600961208344,
"learning_rate": 2.7535807800583957e-06,
"loss": 0.025,
"step": 2240
},
{
"epoch": 9.036144578313253,
"grad_norm": 0.3227684795856476,
"learning_rate": 2.5403831546797875e-06,
"loss": 0.021,
"step": 2250
},
{
"epoch": 9.076305220883533,
"grad_norm": 0.25792595744132996,
"learning_rate": 2.3355599600729915e-06,
"loss": 0.0278,
"step": 2260
},
{
"epoch": 9.116465863453815,
"grad_norm": 0.4950139820575714,
"learning_rate": 2.139147338073466e-06,
"loss": 0.0266,
"step": 2270
},
{
"epoch": 9.156626506024097,
"grad_norm": 0.27984389662742615,
"learning_rate": 1.9511799464390247e-06,
"loss": 0.0252,
"step": 2280
},
{
"epoch": 9.196787148594378,
"grad_norm": 0.36016976833343506,
"learning_rate": 1.7716909527342839e-06,
"loss": 0.0313,
"step": 2290
},
{
"epoch": 9.236947791164658,
"grad_norm": 0.19290144741535187,
"learning_rate": 1.6007120284781518e-06,
"loss": 0.0237,
"step": 2300
},
{
"epoch": 9.27710843373494,
"grad_norm": 0.3681156039237976,
"learning_rate": 1.4382733435552464e-06,
"loss": 0.0265,
"step": 2310
},
{
"epoch": 9.317269076305221,
"grad_norm": 0.31840649247169495,
"learning_rate": 1.2844035608923222e-06,
"loss": 0.0277,
"step": 2320
},
{
"epoch": 9.357429718875501,
"grad_norm": 0.21902896463871002,
"learning_rate": 1.1391298314006037e-06,
"loss": 0.029,
"step": 2330
},
{
"epoch": 9.397590361445783,
"grad_norm": 0.299748957157135,
"learning_rate": 1.0024777891848359e-06,
"loss": 0.0271,
"step": 2340
},
{
"epoch": 9.437751004016064,
"grad_norm": 0.2108592987060547,
"learning_rate": 8.744715470201336e-07,
"loss": 0.0265,
"step": 2350
},
{
"epoch": 9.477911646586346,
"grad_norm": 0.2198696732521057,
"learning_rate": 7.551336920971374e-07,
"loss": 0.0236,
"step": 2360
},
{
"epoch": 9.518072289156626,
"grad_norm": 0.2825567424297333,
"learning_rate": 6.444852820364222e-07,
"loss": 0.0224,
"step": 2370
},
{
"epoch": 9.558232931726907,
"grad_norm": 0.279225617647171,
"learning_rate": 5.425458411728202e-07,
"loss": 0.0265,
"step": 2380
},
{
"epoch": 9.598393574297189,
"grad_norm": 1.2663246393203735,
"learning_rate": 4.4933335711025983e-07,
"loss": 0.0237,
"step": 2390
},
{
"epoch": 9.638554216867469,
"grad_norm": 0.1881272941827774,
"learning_rate": 3.648642775477884e-07,
"loss": 0.0242,
"step": 2400
},
{
"epoch": 9.67871485943775,
"grad_norm": 0.602509617805481,
"learning_rate": 2.891535073773155e-07,
"loss": 0.032,
"step": 2410
},
{
"epoch": 9.718875502008032,
"grad_norm": 0.3743922710418701,
"learning_rate": 2.2221440605359466e-07,
"loss": 0.0259,
"step": 2420
},
{
"epoch": 9.759036144578314,
"grad_norm": 0.23908697068691254,
"learning_rate": 1.6405878523686468e-07,
"loss": 0.0294,
"step": 2430
},
{
"epoch": 9.799196787148594,
"grad_norm": 0.7279542088508606,
"learning_rate": 1.1469690670868894e-07,
"loss": 0.0268,
"step": 2440
},
{
"epoch": 9.839357429718875,
"grad_norm": 0.3277071714401245,
"learning_rate": 7.413748056117609e-08,
"loss": 0.0276,
"step": 2450
},
{
"epoch": 9.879518072289157,
"grad_norm": 0.4907410740852356,
"learning_rate": 4.2387663660081735e-08,
"loss": 0.0227,
"step": 2460
},
{
"epoch": 9.919678714859439,
"grad_norm": 0.47046899795532227,
"learning_rate": 1.9453058381940782e-08,
"loss": 0.0317,
"step": 2470
},
{
"epoch": 9.959839357429718,
"grad_norm": 0.5400535464286804,
"learning_rate": 5.337711625497121e-09,
"loss": 0.0261,
"step": 2480
},
{
"epoch": 10.0,
"grad_norm": 0.32108399271965027,
"learning_rate": 4.4114097635938875e-11,
"loss": 0.0245,
"step": 2490
},
{
"epoch": 10.0,
"step": 2490,
"total_flos": 0.0,
"train_loss": 0.049736060142277716,
"train_runtime": 2579.4556,
"train_samples_per_second": 47.161,
"train_steps_per_second": 0.965
}
],
"logging_steps": 10,
"max_steps": 2490,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 49,
"trial_name": null,
"trial_params": null
}