{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.7086247086247086,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.018648018648018648,
"grad_norm": 6.3322858810424805,
"learning_rate": 0.0,
"loss": 0.7991,
"step": 1
},
{
"epoch": 0.037296037296037296,
"grad_norm": 6.591444492340088,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.8816,
"step": 2
},
{
"epoch": 0.055944055944055944,
"grad_norm": 6.131351947784424,
"learning_rate": 6.666666666666667e-06,
"loss": 0.7863,
"step": 3
},
{
"epoch": 0.07459207459207459,
"grad_norm": 3.621307373046875,
"learning_rate": 1e-05,
"loss": 0.7604,
"step": 4
},
{
"epoch": 0.09324009324009325,
"grad_norm": 5.652075290679932,
"learning_rate": 9.99965389153533e-06,
"loss": 0.7269,
"step": 5
},
{
"epoch": 0.11188811188811189,
"grad_norm": 5.4281415939331055,
"learning_rate": 9.998615614057743e-06,
"loss": 0.7568,
"step": 6
},
{
"epoch": 0.13053613053613053,
"grad_norm": 5.2610087394714355,
"learning_rate": 9.996885311309892e-06,
"loss": 0.7266,
"step": 7
},
{
"epoch": 0.14918414918414918,
"grad_norm": 4.351637840270996,
"learning_rate": 9.994463222840748e-06,
"loss": 0.6749,
"step": 8
},
{
"epoch": 0.16783216783216784,
"grad_norm": 3.5025124549865723,
"learning_rate": 9.991349683972435e-06,
"loss": 0.689,
"step": 9
},
{
"epoch": 0.1864801864801865,
"grad_norm": 2.4112985134124756,
"learning_rate": 9.987545125753818e-06,
"loss": 0.7442,
"step": 10
},
{
"epoch": 0.20512820512820512,
"grad_norm": 2.289846658706665,
"learning_rate": 9.983050074900824e-06,
"loss": 0.6741,
"step": 11
},
{
"epoch": 0.22377622377622378,
"grad_norm": 2.045738697052002,
"learning_rate": 9.977865153723508e-06,
"loss": 0.6719,
"step": 12
},
{
"epoch": 0.24242424242424243,
"grad_norm": 1.7808794975280762,
"learning_rate": 9.971991080039912e-06,
"loss": 0.6307,
"step": 13
},
{
"epoch": 0.26107226107226106,
"grad_norm": 1.6070201396942139,
"learning_rate": 9.965428667076687e-06,
"loss": 0.665,
"step": 14
},
{
"epoch": 0.27972027972027974,
"grad_norm": 1.4720741510391235,
"learning_rate": 9.958178823356503e-06,
"loss": 0.6371,
"step": 15
},
{
"epoch": 0.29836829836829837,
"grad_norm": 1.2774218320846558,
"learning_rate": 9.950242552572272e-06,
"loss": 0.6312,
"step": 16
},
{
"epoch": 0.317016317016317,
"grad_norm": 1.4246975183486938,
"learning_rate": 9.941620953448195e-06,
"loss": 0.6642,
"step": 17
},
{
"epoch": 0.3356643356643357,
"grad_norm": 1.3494893312454224,
"learning_rate": 9.932315219587641e-06,
"loss": 0.5713,
"step": 18
},
{
"epoch": 0.3543123543123543,
"grad_norm": 1.2985285520553589,
"learning_rate": 9.922326639307918e-06,
"loss": 0.5967,
"step": 19
},
{
"epoch": 0.372960372960373,
"grad_norm": 1.5050538778305054,
"learning_rate": 9.911656595461899e-06,
"loss": 0.63,
"step": 20
},
{
"epoch": 0.3916083916083916,
"grad_norm": 1.3601627349853516,
"learning_rate": 9.900306565246579e-06,
"loss": 0.5996,
"step": 21
},
{
"epoch": 0.41025641025641024,
"grad_norm": 1.293419599533081,
"learning_rate": 9.888278119998573e-06,
"loss": 0.6239,
"step": 22
},
{
"epoch": 0.4289044289044289,
"grad_norm": 1.2258663177490234,
"learning_rate": 9.875572924976568e-06,
"loss": 0.6268,
"step": 23
},
{
"epoch": 0.44755244755244755,
"grad_norm": 1.1782615184783936,
"learning_rate": 9.86219273913078e-06,
"loss": 0.5997,
"step": 24
},
{
"epoch": 0.4662004662004662,
"grad_norm": 1.2980071306228638,
"learning_rate": 9.848139414859441e-06,
"loss": 0.5644,
"step": 25
},
{
"epoch": 0.48484848484848486,
"grad_norm": 1.2712095975875854,
"learning_rate": 9.833414897752346e-06,
"loss": 0.5805,
"step": 26
},
{
"epoch": 0.5034965034965035,
"grad_norm": 1.3099210262298584,
"learning_rate": 9.818021226321502e-06,
"loss": 0.5912,
"step": 27
},
{
"epoch": 0.5221445221445221,
"grad_norm": 1.2401714324951172,
"learning_rate": 9.801960531718898e-06,
"loss": 0.6011,
"step": 28
},
{
"epoch": 0.5407925407925408,
"grad_norm": 1.2797614336013794,
"learning_rate": 9.785235037441473e-06,
"loss": 0.6409,
"step": 29
},
{
"epoch": 0.5594405594405595,
"grad_norm": 1.1702829599380493,
"learning_rate": 9.767847059023292e-06,
"loss": 0.5244,
"step": 30
},
{
"epoch": 0.578088578088578,
"grad_norm": 1.1308348178863525,
"learning_rate": 9.749799003714954e-06,
"loss": 0.5703,
"step": 31
},
{
"epoch": 0.5967365967365967,
"grad_norm": 1.2901357412338257,
"learning_rate": 9.731093370150349e-06,
"loss": 0.5943,
"step": 32
},
{
"epoch": 0.6153846153846154,
"grad_norm": 1.243963360786438,
"learning_rate": 9.71173274800072e-06,
"loss": 0.5791,
"step": 33
},
{
"epoch": 0.634032634032634,
"grad_norm": 1.122418999671936,
"learning_rate": 9.691719817616148e-06,
"loss": 0.5494,
"step": 34
},
{
"epoch": 0.6526806526806527,
"grad_norm": 1.1197413206100464,
"learning_rate": 9.671057349654481e-06,
"loss": 0.5844,
"step": 35
},
{
"epoch": 0.6713286713286714,
"grad_norm": 1.1654632091522217,
"learning_rate": 9.649748204697741e-06,
"loss": 0.5525,
"step": 36
},
{
"epoch": 0.6899766899766899,
"grad_norm": 1.2135071754455566,
"learning_rate": 9.627795332856107e-06,
"loss": 0.5611,
"step": 37
},
{
"epoch": 0.7086247086247086,
"grad_norm": 1.0855522155761719,
"learning_rate": 9.605201773359485e-06,
"loss": 0.5426,
"step": 38
},
{
"epoch": 0.7272727272727273,
"grad_norm": 1.2061638832092285,
"learning_rate": 9.581970654136752e-06,
"loss": 0.5942,
"step": 39
},
{
"epoch": 0.745920745920746,
"grad_norm": 1.0460582971572876,
"learning_rate": 9.55810519138271e-06,
"loss": 0.5781,
"step": 40
},
{
"epoch": 0.7645687645687645,
"grad_norm": 1.1728343963623047,
"learning_rate": 9.533608689112827e-06,
"loss": 0.5508,
"step": 41
},
{
"epoch": 0.7832167832167832,
"grad_norm": 1.396593689918518,
"learning_rate": 9.508484538705823e-06,
"loss": 0.5841,
"step": 42
},
{
"epoch": 0.8018648018648019,
"grad_norm": 1.1621497869491577,
"learning_rate": 9.482736218434144e-06,
"loss": 0.5211,
"step": 43
},
{
"epoch": 0.8205128205128205,
"grad_norm": 1.2855342626571655,
"learning_rate": 9.45636729298243e-06,
"loss": 0.5505,
"step": 44
},
{
"epoch": 0.8391608391608392,
"grad_norm": 1.1451700925827026,
"learning_rate": 9.429381412954e-06,
"loss": 0.5283,
"step": 45
},
{
"epoch": 0.8578088578088578,
"grad_norm": 1.2317947149276733,
"learning_rate": 9.401782314365458e-06,
"loss": 0.554,
"step": 46
},
{
"epoch": 0.8764568764568764,
"grad_norm": 1.2228690385818481,
"learning_rate": 9.37357381812946e-06,
"loss": 0.5231,
"step": 47
},
{
"epoch": 0.8951048951048951,
"grad_norm": 1.3035013675689697,
"learning_rate": 9.344759829525734e-06,
"loss": 0.5618,
"step": 48
},
{
"epoch": 0.9137529137529138,
"grad_norm": 1.2204902172088623,
"learning_rate": 9.315344337660422e-06,
"loss": 0.5535,
"step": 49
},
{
"epoch": 0.9324009324009324,
"grad_norm": 1.211380958557129,
"learning_rate": 9.285331414913816e-06,
"loss": 0.5824,
"step": 50
},
{
"epoch": 0.951048951048951,
"grad_norm": 1.1931360960006714,
"learning_rate": 9.254725216376562e-06,
"loss": 0.5421,
"step": 51
},
{
"epoch": 0.9696969696969697,
"grad_norm": 1.2671467065811157,
"learning_rate": 9.223529979274411e-06,
"loss": 0.5926,
"step": 52
},
{
"epoch": 0.9883449883449883,
"grad_norm": 1.2529419660568237,
"learning_rate": 9.191750022381613e-06,
"loss": 0.5775,
"step": 53
},
{
"epoch": 1.0,
"grad_norm": 1.2529419660568237,
"learning_rate": 9.159389745423003e-06,
"loss": 0.5212,
"step": 54
},
{
"epoch": 1.0186480186480187,
"grad_norm": 1.69382905960083,
"learning_rate": 9.126453628464889e-06,
"loss": 0.5111,
"step": 55
},
{
"epoch": 1.0372960372960374,
"grad_norm": 1.2325468063354492,
"learning_rate": 9.09294623129482e-06,
"loss": 0.5528,
"step": 56
},
{
"epoch": 1.055944055944056,
"grad_norm": 1.4443434476852417,
"learning_rate": 9.058872192790314e-06,
"loss": 0.5364,
"step": 57
},
{
"epoch": 1.0745920745920745,
"grad_norm": 1.1799763441085815,
"learning_rate": 9.02423623027663e-06,
"loss": 0.4809,
"step": 58
},
{
"epoch": 1.0932400932400932,
"grad_norm": 1.2783596515655518,
"learning_rate": 8.98904313887369e-06,
"loss": 0.5252,
"step": 59
},
{
"epoch": 1.1118881118881119,
"grad_norm": 1.164589762687683,
"learning_rate": 8.953297790832231e-06,
"loss": 0.4953,
"step": 60
},
{
"epoch": 1.1305361305361306,
"grad_norm": 1.184706687927246,
"learning_rate": 8.917005134859263e-06,
"loss": 0.4995,
"step": 61
},
{
"epoch": 1.1491841491841492,
"grad_norm": 1.1743718385696411,
"learning_rate": 8.88017019543296e-06,
"loss": 0.4566,
"step": 62
},
{
"epoch": 1.167832167832168,
"grad_norm": 1.2495801448822021,
"learning_rate": 8.842798072107055e-06,
"loss": 0.5244,
"step": 63
},
{
"epoch": 1.1864801864801864,
"grad_norm": 1.192482590675354,
"learning_rate": 8.804893938804839e-06,
"loss": 0.4859,
"step": 64
},
{
"epoch": 1.205128205128205,
"grad_norm": 1.2100673913955688,
"learning_rate": 8.766463043102864e-06,
"loss": 0.5466,
"step": 65
},
{
"epoch": 1.2237762237762237,
"grad_norm": 1.2848899364471436,
"learning_rate": 8.727510705504453e-06,
"loss": 0.503,
"step": 66
},
{
"epoch": 1.2424242424242424,
"grad_norm": 1.185261845588684,
"learning_rate": 8.688042318703111e-06,
"loss": 0.5007,
"step": 67
},
{
"epoch": 1.2610722610722611,
"grad_norm": 1.252984881401062,
"learning_rate": 8.648063346835943e-06,
"loss": 0.5121,
"step": 68
},
{
"epoch": 1.2797202797202798,
"grad_norm": 1.2732371091842651,
"learning_rate": 8.607579324727175e-06,
"loss": 0.5247,
"step": 69
},
{
"epoch": 1.2983682983682985,
"grad_norm": 1.2724429368972778,
"learning_rate": 8.566595857121902e-06,
"loss": 0.4986,
"step": 70
},
{
"epoch": 1.317016317016317,
"grad_norm": 1.2770615816116333,
"learning_rate": 8.525118617910144e-06,
"loss": 0.464,
"step": 71
},
{
"epoch": 1.3356643356643356,
"grad_norm": 1.1614543199539185,
"learning_rate": 8.483153349341336e-06,
"loss": 0.5128,
"step": 72
},
{
"epoch": 1.3543123543123543,
"grad_norm": 1.2303462028503418,
"learning_rate": 8.440705861229344e-06,
"loss": 0.4872,
"step": 73
},
{
"epoch": 1.372960372960373,
"grad_norm": 1.168981671333313,
"learning_rate": 8.397782030148147e-06,
"loss": 0.5009,
"step": 74
},
{
"epoch": 1.3916083916083917,
"grad_norm": 1.0755895376205444,
"learning_rate": 8.354387798618254e-06,
"loss": 0.5135,
"step": 75
},
{
"epoch": 1.4102564102564101,
"grad_norm": 1.1931865215301514,
"learning_rate": 8.310529174284004e-06,
"loss": 0.4942,
"step": 76
},
{
"epoch": 1.428904428904429,
"grad_norm": 1.2469918727874756,
"learning_rate": 8.266212229081846e-06,
"loss": 0.5237,
"step": 77
},
{
"epoch": 1.4475524475524475,
"grad_norm": 1.1931370496749878,
"learning_rate": 8.221443098399733e-06,
"loss": 0.5069,
"step": 78
},
{
"epoch": 1.4662004662004662,
"grad_norm": 1.099647045135498,
"learning_rate": 8.176227980227693e-06,
"loss": 0.4646,
"step": 79
},
{
"epoch": 1.4848484848484849,
"grad_norm": 1.2228244543075562,
"learning_rate": 8.130573134299782e-06,
"loss": 0.5151,
"step": 80
},
{
"epoch": 1.5034965034965035,
"grad_norm": 1.2358263731002808,
"learning_rate": 8.084484881227449e-06,
"loss": 0.5119,
"step": 81
},
{
"epoch": 1.5221445221445222,
"grad_norm": 1.1055835485458374,
"learning_rate": 8.037969601624495e-06,
"loss": 0.4502,
"step": 82
},
{
"epoch": 1.5407925407925407,
"grad_norm": 1.2238577604293823,
"learning_rate": 7.99103373522373e-06,
"loss": 0.4787,
"step": 83
},
{
"epoch": 1.5594405594405596,
"grad_norm": 1.2454148530960083,
"learning_rate": 7.943683779985412e-06,
"loss": 0.4767,
"step": 84
},
{
"epoch": 1.578088578088578,
"grad_norm": 1.131277084350586,
"learning_rate": 7.895926291197667e-06,
"loss": 0.4717,
"step": 85
},
{
"epoch": 1.5967365967365967,
"grad_norm": 1.131995439529419,
"learning_rate": 7.847767880568944e-06,
"loss": 0.4875,
"step": 86
},
{
"epoch": 1.6153846153846154,
"grad_norm": 1.0890105962753296,
"learning_rate": 7.799215215312667e-06,
"loss": 0.4749,
"step": 87
},
{
"epoch": 1.6340326340326339,
"grad_norm": 1.1363167762756348,
"learning_rate": 7.750275017224208e-06,
"loss": 0.5297,
"step": 88
},
{
"epoch": 1.6526806526806528,
"grad_norm": 1.196660041809082,
"learning_rate": 7.700954061750295e-06,
"loss": 0.504,
"step": 89
},
{
"epoch": 1.6713286713286712,
"grad_norm": 1.1384004354476929,
"learning_rate": 7.651259177050996e-06,
"loss": 0.498,
"step": 90
},
{
"epoch": 1.68997668997669,
"grad_norm": 1.1040617227554321,
"learning_rate": 7.601197243054411e-06,
"loss": 0.504,
"step": 91
},
{
"epoch": 1.7086247086247086,
"grad_norm": 1.1692686080932617,
"learning_rate": 7.5507751905041885e-06,
"loss": 0.4742,
"step": 92
},
{
"epoch": 1.7272727272727273,
"grad_norm": 1.197273850440979,
"learning_rate": 7.500000000000001e-06,
"loss": 0.4948,
"step": 93
},
{
"epoch": 1.745920745920746,
"grad_norm": 1.1838762760162354,
"learning_rate": 7.4488787010311425e-06,
"loss": 0.4882,
"step": 94
},
{
"epoch": 1.7645687645687644,
"grad_norm": 1.074876070022583,
"learning_rate": 7.3974183710033334e-06,
"loss": 0.4994,
"step": 95
},
{
"epoch": 1.7832167832167833,
"grad_norm": 1.1607307195663452,
"learning_rate": 7.345626134258897e-06,
"loss": 0.498,
"step": 96
},
{
"epoch": 1.8018648018648018,
"grad_norm": 1.1693527698516846,
"learning_rate": 7.293509161090453e-06,
"loss": 0.4706,
"step": 97
},
{
"epoch": 1.8205128205128205,
"grad_norm": 1.1076033115386963,
"learning_rate": 7.241074666748228e-06,
"loss": 0.5055,
"step": 98
},
{
"epoch": 1.8391608391608392,
"grad_norm": 1.1932028532028198,
"learning_rate": 7.188329910441154e-06,
"loss": 0.5006,
"step": 99
},
{
"epoch": 1.8578088578088578,
"grad_norm": 1.2250460386276245,
"learning_rate": 7.135282194331881e-06,
"loss": 0.479,
"step": 100
},
{
"epoch": 1.8764568764568765,
"grad_norm": 1.3001635074615479,
"learning_rate": 7.0819388625258385e-06,
"loss": 0.5101,
"step": 101
},
{
"epoch": 1.895104895104895,
"grad_norm": 1.3003392219543457,
"learning_rate": 7.028307300054499e-06,
"loss": 0.4875,
"step": 102
},
{
"epoch": 1.913752913752914,
"grad_norm": 1.1021766662597656,
"learning_rate": 6.974394931852957e-06,
"loss": 0.4848,
"step": 103
},
{
"epoch": 1.9324009324009324,
"grad_norm": 1.3179751634597778,
"learning_rate": 6.920209221732007e-06,
"loss": 0.5254,
"step": 104
},
{
"epoch": 1.951048951048951,
"grad_norm": 1.2668633460998535,
"learning_rate": 6.865757671344827e-06,
"loss": 0.5057,
"step": 105
},
{
"epoch": 1.9696969696969697,
"grad_norm": 1.1570608615875244,
"learning_rate": 6.811047819148413e-06,
"loss": 0.4837,
"step": 106
},
{
"epoch": 1.9883449883449882,
"grad_norm": 1.1735329627990723,
"learning_rate": 6.756087239359948e-06,
"loss": 0.5096,
"step": 107
},
{
"epoch": 2.0,
"grad_norm": 1.3861793279647827,
"learning_rate": 6.700883540908185e-06,
"loss": 0.4528,
"step": 108
},
{
"epoch": 2.0186480186480185,
"grad_norm": 1.4102225303649902,
"learning_rate": 6.64544436638005e-06,
"loss": 0.4672,
"step": 109
},
{
"epoch": 2.0372960372960374,
"grad_norm": 1.3145784139633179,
"learning_rate": 6.589777390962575e-06,
"loss": 0.471,
"step": 110
},
{
"epoch": 2.055944055944056,
"grad_norm": 1.2322523593902588,
"learning_rate": 6.53389032138032e-06,
"loss": 0.4443,
"step": 111
},
{
"epoch": 2.0745920745920747,
"grad_norm": 1.2904436588287354,
"learning_rate": 6.477790894828422e-06,
"loss": 0.4656,
"step": 112
},
{
"epoch": 2.093240093240093,
"grad_norm": 1.1724885702133179,
"learning_rate": 6.421486877901436e-06,
"loss": 0.4061,
"step": 113
},
{
"epoch": 2.111888111888112,
"grad_norm": 1.1402873992919922,
"learning_rate": 6.364986065518106e-06,
"loss": 0.4268,
"step": 114
},
{
"epoch": 2.1305361305361306,
"grad_norm": 1.2183982133865356,
"learning_rate": 6.308296279842204e-06,
"loss": 0.4333,
"step": 115
},
{
"epoch": 2.149184149184149,
"grad_norm": 1.2880562543869019,
"learning_rate": 6.2514253691996e-06,
"loss": 0.4599,
"step": 116
},
{
"epoch": 2.167832167832168,
"grad_norm": 1.1045494079589844,
"learning_rate": 6.194381206991723e-06,
"loss": 0.4359,
"step": 117
},
{
"epoch": 2.1864801864801864,
"grad_norm": 1.0571209192276,
"learning_rate": 6.1371716906055336e-06,
"loss": 0.4506,
"step": 118
},
{
"epoch": 2.2051282051282053,
"grad_norm": 1.2100480794906616,
"learning_rate": 6.079804740320181e-06,
"loss": 0.4629,
"step": 119
},
{
"epoch": 2.2237762237762237,
"grad_norm": 1.2123056650161743,
"learning_rate": 6.022288298210502e-06,
"loss": 0.4294,
"step": 120
},
{
"epoch": 2.242424242424242,
"grad_norm": 1.1131305694580078,
"learning_rate": 5.964630327047485e-06,
"loss": 0.4652,
"step": 121
},
{
"epoch": 2.261072261072261,
"grad_norm": 1.0908523797988892,
"learning_rate": 5.906838809195879e-06,
"loss": 0.4552,
"step": 122
},
{
"epoch": 2.2797202797202796,
"grad_norm": 1.1521565914154053,
"learning_rate": 5.848921745509094e-06,
"loss": 0.4443,
"step": 123
},
{
"epoch": 2.2983682983682985,
"grad_norm": 1.1898850202560425,
"learning_rate": 5.790887154221521e-06,
"loss": 0.4831,
"step": 124
},
{
"epoch": 2.317016317016317,
"grad_norm": 1.1984978914260864,
"learning_rate": 5.7327430698384775e-06,
"loss": 0.4533,
"step": 125
},
{
"epoch": 2.335664335664336,
"grad_norm": 1.1915768384933472,
"learning_rate": 5.674497542023875e-06,
"loss": 0.4171,
"step": 126
},
{
"epoch": 2.3543123543123543,
"grad_norm": 1.220482349395752,
"learning_rate": 5.616158634485793e-06,
"loss": 0.4745,
"step": 127
},
{
"epoch": 2.3729603729603728,
"grad_norm": 1.2805721759796143,
"learning_rate": 5.557734423860122e-06,
"loss": 0.4597,
"step": 128
},
{
"epoch": 2.3916083916083917,
"grad_norm": 1.1252267360687256,
"learning_rate": 5.499232998592399e-06,
"loss": 0.43,
"step": 129
},
{
"epoch": 2.41025641025641,
"grad_norm": 1.2377195358276367,
"learning_rate": 5.44066245781801e-06,
"loss": 0.4456,
"step": 130
},
{
"epoch": 2.428904428904429,
"grad_norm": 1.0829073190689087,
"learning_rate": 5.382030910240936e-06,
"loss": 0.4475,
"step": 131
},
{
"epoch": 2.4475524475524475,
"grad_norm": 1.1754248142242432,
"learning_rate": 5.3233464730111426e-06,
"loss": 0.4638,
"step": 132
},
{
"epoch": 2.4662004662004664,
"grad_norm": 1.1847262382507324,
"learning_rate": 5.2646172706008154e-06,
"loss": 0.4748,
"step": 133
},
{
"epoch": 2.484848484848485,
"grad_norm": 1.2288694381713867,
"learning_rate": 5.20585143367959e-06,
"loss": 0.4063,
"step": 134
},
{
"epoch": 2.5034965034965033,
"grad_norm": 1.1340534687042236,
"learning_rate": 5.147057097988898e-06,
"loss": 0.4805,
"step": 135
},
{
"epoch": 2.5221445221445222,
"grad_norm": 1.1196503639221191,
"learning_rate": 5.088242403215644e-06,
"loss": 0.4487,
"step": 136
},
{
"epoch": 2.5407925407925407,
"grad_norm": 1.1831355094909668,
"learning_rate": 5.029415491865311e-06,
"loss": 0.4206,
"step": 137
},
{
"epoch": 2.5594405594405596,
"grad_norm": 1.148800253868103,
"learning_rate": 4.97058450813469e-06,
"loss": 0.46,
"step": 138
},
{
"epoch": 2.578088578088578,
"grad_norm": 1.1630257368087769,
"learning_rate": 4.911757596784358e-06,
"loss": 0.4357,
"step": 139
},
{
"epoch": 2.596736596736597,
"grad_norm": 1.2669141292572021,
"learning_rate": 4.8529429020111035e-06,
"loss": 0.4658,
"step": 140
},
{
"epoch": 2.6153846153846154,
"grad_norm": 1.153731107711792,
"learning_rate": 4.794148566320412e-06,
"loss": 0.4333,
"step": 141
},
{
"epoch": 2.634032634032634,
"grad_norm": 1.1950958967208862,
"learning_rate": 4.7353827293991845e-06,
"loss": 0.4582,
"step": 142
},
{
"epoch": 2.652680652680653,
"grad_norm": 1.1688624620437622,
"learning_rate": 4.676653526988858e-06,
"loss": 0.4364,
"step": 143
},
{
"epoch": 2.6713286713286712,
"grad_norm": 1.208266019821167,
"learning_rate": 4.617969089759066e-06,
"loss": 0.4465,
"step": 144
},
{
"epoch": 2.6899766899766897,
"grad_norm": 1.1561684608459473,
"learning_rate": 4.559337542181993e-06,
"loss": 0.4337,
"step": 145
},
{
"epoch": 2.7086247086247086,
"grad_norm": 1.1300450563430786,
"learning_rate": 4.500767001407604e-06,
"loss": 0.4652,
"step": 146
},
{
"epoch": 2.7272727272727275,
"grad_norm": 1.2024763822555542,
"learning_rate": 4.4422655761398785e-06,
"loss": 0.4433,
"step": 147
},
{
"epoch": 2.745920745920746,
"grad_norm": 1.1854138374328613,
"learning_rate": 4.383841365514208e-06,
"loss": 0.4056,
"step": 148
},
{
"epoch": 2.7645687645687644,
"grad_norm": 1.153937816619873,
"learning_rate": 4.325502457976126e-06,
"loss": 0.4267,
"step": 149
},
{
"epoch": 2.7832167832167833,
"grad_norm": 1.0862733125686646,
"learning_rate": 4.267256930161523e-06,
"loss": 0.4348,
"step": 150
},
{
"epoch": 2.801864801864802,
"grad_norm": 1.2374823093414307,
"learning_rate": 4.209112845778481e-06,
"loss": 0.4199,
"step": 151
},
{
"epoch": 2.8205128205128203,
"grad_norm": 1.1854605674743652,
"learning_rate": 4.151078254490908e-06,
"loss": 0.4336,
"step": 152
},
{
"epoch": 2.839160839160839,
"grad_norm": 1.1627620458602905,
"learning_rate": 4.09316119080412e-06,
"loss": 0.4224,
"step": 153
},
{
"epoch": 2.857808857808858,
"grad_norm": 1.1857858896255493,
"learning_rate": 4.035369672952516e-06,
"loss": 0.4475,
"step": 154
},
{
"epoch": 2.8764568764568765,
"grad_norm": 1.1136168241500854,
"learning_rate": 3.977711701789499e-06,
"loss": 0.45,
"step": 155
},
{
"epoch": 2.895104895104895,
"grad_norm": 1.1074330806732178,
"learning_rate": 3.920195259679822e-06,
"loss": 0.4446,
"step": 156
},
{
"epoch": 2.913752913752914,
"grad_norm": 1.1580936908721924,
"learning_rate": 3.862828309394469e-06,
"loss": 0.4632,
"step": 157
},
{
"epoch": 2.9324009324009324,
"grad_norm": 1.1213862895965576,
"learning_rate": 3.805618793008279e-06,
"loss": 0.4525,
"step": 158
},
{
"epoch": 2.951048951048951,
"grad_norm": 1.1242083311080933,
"learning_rate": 3.7485746308004013e-06,
"loss": 0.4262,
"step": 159
},
{
"epoch": 2.9696969696969697,
"grad_norm": 1.1000770330429077,
"learning_rate": 3.6917037201577977e-06,
"loss": 0.4468,
"step": 160
},
{
"epoch": 2.988344988344988,
"grad_norm": 1.1469992399215698,
"learning_rate": 3.635013934481895e-06,
"loss": 0.4818,
"step": 161
},
{
"epoch": 3.0,
"grad_norm": 1.1469992399215698,
"learning_rate": 3.578513122098566e-06,
"loss": 0.4544,
"step": 162
},
{
"epoch": 3.0186480186480185,
"grad_norm": 1.7396659851074219,
"learning_rate": 3.5222091051715803e-06,
"loss": 0.4208,
"step": 163
},
{
"epoch": 3.0372960372960374,
"grad_norm": 1.1009488105773926,
"learning_rate": 3.466109678619681e-06,
"loss": 0.3786,
"step": 164
},
{
"epoch": 3.055944055944056,
"grad_norm": 1.1437153816223145,
"learning_rate": 3.4102226090374246e-06,
"loss": 0.4494,
"step": 165
},
{
"epoch": 3.0745920745920747,
"grad_norm": 1.1509310007095337,
"learning_rate": 3.35455563361995e-06,
"loss": 0.3992,
"step": 166
},
{
"epoch": 3.093240093240093,
"grad_norm": 1.0771594047546387,
"learning_rate": 3.2991164590918162e-06,
"loss": 0.4229,
"step": 167
},
{
"epoch": 3.111888111888112,
"grad_norm": 1.0693249702453613,
"learning_rate": 3.2439127606400546e-06,
"loss": 0.4083,
"step": 168
},
{
"epoch": 3.1305361305361306,
"grad_norm": 1.0992814302444458,
"learning_rate": 3.1889521808515888e-06,
"loss": 0.4571,
"step": 169
},
{
"epoch": 3.149184149184149,
"grad_norm": 1.1360324621200562,
"learning_rate": 3.1342423286551756e-06,
"loss": 0.4196,
"step": 170
},
{
"epoch": 3.167832167832168,
"grad_norm": 1.167336344718933,
"learning_rate": 3.0797907782679944e-06,
"loss": 0.4165,
"step": 171
},
{
"epoch": 3.1864801864801864,
"grad_norm": 1.1829248666763306,
"learning_rate": 3.0256050681470446e-06,
"loss": 0.4764,
"step": 172
},
{
"epoch": 3.2051282051282053,
"grad_norm": 1.150373101234436,
"learning_rate": 2.971692699945502e-06,
"loss": 0.4224,
"step": 173
},
{
"epoch": 3.2237762237762237,
"grad_norm": 1.1187318563461304,
"learning_rate": 2.9180611374741623e-06,
"loss": 0.4192,
"step": 174
},
{
"epoch": 3.242424242424242,
"grad_norm": 1.096235990524292,
"learning_rate": 2.8647178056681197e-06,
"loss": 0.4162,
"step": 175
},
{
"epoch": 3.261072261072261,
"grad_norm": 1.1855800151824951,
"learning_rate": 2.8116700895588473e-06,
"loss": 0.41,
"step": 176
},
{
"epoch": 3.2797202797202796,
"grad_norm": 1.1241765022277832,
"learning_rate": 2.7589253332517736e-06,
"loss": 0.425,
"step": 177
},
{
"epoch": 3.2983682983682985,
"grad_norm": 1.1213021278381348,
"learning_rate": 2.706490838909547e-06,
"loss": 0.414,
"step": 178
},
{
"epoch": 3.317016317016317,
"grad_norm": 1.0516644716262817,
"learning_rate": 2.6543738657411033e-06,
"loss": 0.4102,
"step": 179
},
{
"epoch": 3.335664335664336,
"grad_norm": 1.085395097732544,
"learning_rate": 2.6025816289966703e-06,
"loss": 0.4476,
"step": 180
},
{
"epoch": 3.3543123543123543,
"grad_norm": 1.1523784399032593,
"learning_rate": 2.5511212989688587e-06,
"loss": 0.4453,
"step": 181
},
{
"epoch": 3.3729603729603728,
"grad_norm": 1.1129071712493896,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.4151,
"step": 182
},
{
"epoch": 3.3916083916083917,
"grad_norm": 1.0871580839157104,
"learning_rate": 2.449224809495815e-06,
"loss": 0.4252,
"step": 183
},
{
"epoch": 3.41025641025641,
"grad_norm": 1.0709385871887207,
"learning_rate": 2.3988027569455895e-06,
"loss": 0.3925,
"step": 184
},
{
"epoch": 3.428904428904429,
"grad_norm": 1.0848338603973389,
"learning_rate": 2.348740822949006e-06,
"loss": 0.4265,
"step": 185
},
{
"epoch": 3.4475524475524475,
"grad_norm": 1.0750283002853394,
"learning_rate": 2.2990459382497086e-06,
"loss": 0.3992,
"step": 186
},
{
"epoch": 3.4662004662004664,
"grad_norm": 1.1117126941680908,
"learning_rate": 2.2497249827757933e-06,
"loss": 0.3911,
"step": 187
},
{
"epoch": 3.484848484848485,
"grad_norm": 1.1823192834854126,
"learning_rate": 2.2007847846873342e-06,
"loss": 0.4162,
"step": 188
},
{
"epoch": 3.5034965034965033,
"grad_norm": 1.1932240724563599,
"learning_rate": 2.1522321194310577e-06,
"loss": 0.3889,
"step": 189
},
{
"epoch": 3.5221445221445222,
"grad_norm": 1.0959529876708984,
"learning_rate": 2.1040737088023323e-06,
"loss": 0.4305,
"step": 190
},
{
"epoch": 3.5407925407925407,
"grad_norm": 1.1720889806747437,
"learning_rate": 2.056316220014588e-06,
"loss": 0.4321,
"step": 191
},
{
"epoch": 3.5594405594405596,
"grad_norm": 1.157222032546997,
"learning_rate": 2.0089662647762716e-06,
"loss": 0.4036,
"step": 192
},
{
"epoch": 3.578088578088578,
"grad_norm": 1.0834124088287354,
"learning_rate": 1.962030398375506e-06,
"loss": 0.4039,
"step": 193
},
{
"epoch": 3.596736596736597,
"grad_norm": 1.0103845596313477,
"learning_rate": 1.915515118772555e-06,
"loss": 0.4508,
"step": 194
},
{
"epoch": 3.6153846153846154,
"grad_norm": 1.094143033027649,
"learning_rate": 1.8694268657002197e-06,
"loss": 0.4324,
"step": 195
},
{
"epoch": 3.634032634032634,
"grad_norm": 1.1231107711791992,
"learning_rate": 1.8237720197723075e-06,
"loss": 0.4036,
"step": 196
},
{
"epoch": 3.652680652680653,
"grad_norm": 1.1039270162582397,
"learning_rate": 1.7785569016002686e-06,
"loss": 0.3793,
"step": 197
},
{
"epoch": 3.6713286713286712,
"grad_norm": 1.1073729991912842,
"learning_rate": 1.7337877709181527e-06,
"loss": 0.4372,
"step": 198
},
{
"epoch": 3.6899766899766897,
"grad_norm": 1.109791874885559,
"learning_rate": 1.689470825715998e-06,
"loss": 0.4416,
"step": 199
},
{
"epoch": 3.7086247086247086,
"grad_norm": 1.1324268579483032,
"learning_rate": 1.6456122013817477e-06,
"loss": 0.4021,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 270,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 20,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.053262545813504e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}