Safetensors
qwen2
LightThinker-Plus-Qwen / trainer_state.json
Yukirsh's picture
Upload folder using huggingface_hub
cd8ad68 verified
Raw
History Blame Contribute Delete
56.6 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 255,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.011764705882352941,
"grad_norm": 3.6131326491907996,
"learning_rate": 9.090909090909091e-07,
"loss": 0.7929452061653137,
"num_input_tokens_seen": 0,
"step": 1
},
{
"epoch": 0.023529411764705882,
"grad_norm": 3.9380904690801763,
"learning_rate": 1.8181818181818183e-06,
"loss": 0.8463644981384277,
"num_input_tokens_seen": 0,
"step": 2
},
{
"epoch": 0.03529411764705882,
"grad_norm": 3.5075707953425375,
"learning_rate": 2.7272727272727272e-06,
"loss": 0.7615697383880615,
"num_input_tokens_seen": 0,
"step": 3
},
{
"epoch": 0.047058823529411764,
"grad_norm": 3.568991966465913,
"learning_rate": 3.6363636363636366e-06,
"loss": 0.7781215906143188,
"num_input_tokens_seen": 0,
"step": 4
},
{
"epoch": 0.058823529411764705,
"grad_norm": 3.6216549878210613,
"learning_rate": 4.5454545454545455e-06,
"loss": 0.8030836582183838,
"num_input_tokens_seen": 0,
"step": 5
},
{
"epoch": 0.07058823529411765,
"grad_norm": 3.0788368926498566,
"learning_rate": 5.4545454545454545e-06,
"loss": 0.7026812434196472,
"num_input_tokens_seen": 0,
"step": 6
},
{
"epoch": 0.08235294117647059,
"grad_norm": 3.1470952265114955,
"learning_rate": 6.363636363636364e-06,
"loss": 0.7064052224159241,
"num_input_tokens_seen": 0,
"step": 7
},
{
"epoch": 0.09411764705882353,
"grad_norm": 2.1537924186137105,
"learning_rate": 7.272727272727273e-06,
"loss": 0.5964475274085999,
"num_input_tokens_seen": 0,
"step": 8
},
{
"epoch": 0.10588235294117647,
"grad_norm": 2.1368649029511326,
"learning_rate": 8.181818181818183e-06,
"loss": 0.6179602146148682,
"num_input_tokens_seen": 0,
"step": 9
},
{
"epoch": 0.11764705882352941,
"grad_norm": 1.981711432431228,
"learning_rate": 9.090909090909091e-06,
"loss": 0.5555359125137329,
"num_input_tokens_seen": 0,
"step": 10
},
{
"epoch": 0.12941176470588237,
"grad_norm": 1.559928530743653,
"learning_rate": 1e-05,
"loss": 0.5162748694419861,
"num_input_tokens_seen": 0,
"step": 11
},
{
"epoch": 0.1411764705882353,
"grad_norm": 1.4191567801273215,
"learning_rate": 1.0909090909090909e-05,
"loss": 0.4931896924972534,
"num_input_tokens_seen": 0,
"step": 12
},
{
"epoch": 0.15294117647058825,
"grad_norm": 1.2550232555964334,
"learning_rate": 1.181818181818182e-05,
"loss": 0.5178971886634827,
"num_input_tokens_seen": 0,
"step": 13
},
{
"epoch": 0.16470588235294117,
"grad_norm": 1.5353572118512469,
"learning_rate": 1.2727272727272728e-05,
"loss": 0.4872450530529022,
"num_input_tokens_seen": 0,
"step": 14
},
{
"epoch": 0.17647058823529413,
"grad_norm": 1.3077407975869617,
"learning_rate": 1.3636363636363637e-05,
"loss": 0.4353964030742645,
"num_input_tokens_seen": 0,
"step": 15
},
{
"epoch": 0.18823529411764706,
"grad_norm": 1.2379322286899677,
"learning_rate": 1.4545454545454546e-05,
"loss": 0.4468710124492645,
"num_input_tokens_seen": 0,
"step": 16
},
{
"epoch": 0.2,
"grad_norm": 0.9822538115083431,
"learning_rate": 1.5454545454545454e-05,
"loss": 0.4612148106098175,
"num_input_tokens_seen": 0,
"step": 17
},
{
"epoch": 0.21176470588235294,
"grad_norm": 1.019398009956526,
"learning_rate": 1.6363636363636366e-05,
"loss": 0.4416177570819855,
"num_input_tokens_seen": 0,
"step": 18
},
{
"epoch": 0.2235294117647059,
"grad_norm": 0.6811851291872145,
"learning_rate": 1.7272727272727274e-05,
"loss": 0.4400823712348938,
"num_input_tokens_seen": 0,
"step": 19
},
{
"epoch": 0.23529411764705882,
"grad_norm": 0.5706281411245361,
"learning_rate": 1.8181818181818182e-05,
"loss": 0.407284140586853,
"num_input_tokens_seen": 0,
"step": 20
},
{
"epoch": 0.24705882352941178,
"grad_norm": 0.5144726641840578,
"learning_rate": 1.9090909090909094e-05,
"loss": 0.40131646394729614,
"num_input_tokens_seen": 0,
"step": 21
},
{
"epoch": 0.25882352941176473,
"grad_norm": 0.5271151443718087,
"learning_rate": 2e-05,
"loss": 0.3690889775753021,
"num_input_tokens_seen": 0,
"step": 22
},
{
"epoch": 0.27058823529411763,
"grad_norm": 0.46867880268180473,
"learning_rate": 1.999969615124717e-05,
"loss": 0.38862237334251404,
"num_input_tokens_seen": 0,
"step": 23
},
{
"epoch": 0.2823529411764706,
"grad_norm": 0.44956811034445643,
"learning_rate": 1.9998784623453477e-05,
"loss": 0.38012465834617615,
"num_input_tokens_seen": 0,
"step": 24
},
{
"epoch": 0.29411764705882354,
"grad_norm": 0.3973152538462978,
"learning_rate": 1.9997265472012247e-05,
"loss": 0.3671787679195404,
"num_input_tokens_seen": 0,
"step": 25
},
{
"epoch": 0.3058823529411765,
"grad_norm": 0.44092842775695007,
"learning_rate": 1.999513878924193e-05,
"loss": 0.40712660551071167,
"num_input_tokens_seen": 0,
"step": 26
},
{
"epoch": 0.3176470588235294,
"grad_norm": 0.48371948306233614,
"learning_rate": 1.9992404704380513e-05,
"loss": 0.37348443269729614,
"num_input_tokens_seen": 0,
"step": 27
},
{
"epoch": 0.32941176470588235,
"grad_norm": 0.3835238779084052,
"learning_rate": 1.9989063383577644e-05,
"loss": 0.36719316244125366,
"num_input_tokens_seen": 0,
"step": 28
},
{
"epoch": 0.3411764705882353,
"grad_norm": 0.4114683262445794,
"learning_rate": 1.9985115029884556e-05,
"loss": 0.3744957447052002,
"num_input_tokens_seen": 0,
"step": 29
},
{
"epoch": 0.35294117647058826,
"grad_norm": 0.3926833772366512,
"learning_rate": 1.9980559883241723e-05,
"loss": 0.36837196350097656,
"num_input_tokens_seen": 0,
"step": 30
},
{
"epoch": 0.36470588235294116,
"grad_norm": 0.34510721456392296,
"learning_rate": 1.9975398220464268e-05,
"loss": 0.35771483182907104,
"num_input_tokens_seen": 0,
"step": 31
},
{
"epoch": 0.3764705882352941,
"grad_norm": 0.2906065104717026,
"learning_rate": 1.996963035522515e-05,
"loss": 0.3718343675136566,
"num_input_tokens_seen": 0,
"step": 32
},
{
"epoch": 0.38823529411764707,
"grad_norm": 0.28680989428590703,
"learning_rate": 1.99632566380361e-05,
"loss": 0.35502949357032776,
"num_input_tokens_seen": 0,
"step": 33
},
{
"epoch": 0.4,
"grad_norm": 0.36479584489715183,
"learning_rate": 1.995627745622632e-05,
"loss": 0.3561074733734131,
"num_input_tokens_seen": 0,
"step": 34
},
{
"epoch": 0.4117647058823529,
"grad_norm": 0.3233890688969203,
"learning_rate": 1.994869323391895e-05,
"loss": 0.36625605821609497,
"num_input_tokens_seen": 0,
"step": 35
},
{
"epoch": 0.4235294117647059,
"grad_norm": 0.3014081868844546,
"learning_rate": 1.9940504432005293e-05,
"loss": 0.32261648774147034,
"num_input_tokens_seen": 0,
"step": 36
},
{
"epoch": 0.43529411764705883,
"grad_norm": 0.29988977041893033,
"learning_rate": 1.993171154811679e-05,
"loss": 0.36725232005119324,
"num_input_tokens_seen": 0,
"step": 37
},
{
"epoch": 0.4470588235294118,
"grad_norm": 0.2843556547007244,
"learning_rate": 1.992231511659481e-05,
"loss": 0.3372398316860199,
"num_input_tokens_seen": 0,
"step": 38
},
{
"epoch": 0.4588235294117647,
"grad_norm": 0.25146289166175934,
"learning_rate": 1.9912315708458144e-05,
"loss": 0.35911282896995544,
"num_input_tokens_seen": 0,
"step": 39
},
{
"epoch": 0.47058823529411764,
"grad_norm": 0.25109365889274177,
"learning_rate": 1.9901713931368333e-05,
"loss": 0.35379254817962646,
"num_input_tokens_seen": 0,
"step": 40
},
{
"epoch": 0.4823529411764706,
"grad_norm": 0.25470949144259586,
"learning_rate": 1.989051042959273e-05,
"loss": 0.34498846530914307,
"num_input_tokens_seen": 0,
"step": 41
},
{
"epoch": 0.49411764705882355,
"grad_norm": 0.2603263992351511,
"learning_rate": 1.9878705883965342e-05,
"loss": 0.346971720457077,
"num_input_tokens_seen": 0,
"step": 42
},
{
"epoch": 0.5058823529411764,
"grad_norm": 0.2837708452421679,
"learning_rate": 1.986630101184546e-05,
"loss": 0.3518391251564026,
"num_input_tokens_seen": 0,
"step": 43
},
{
"epoch": 0.5176470588235295,
"grad_norm": 0.2690921713644025,
"learning_rate": 1.9853296567074075e-05,
"loss": 0.3417142331600189,
"num_input_tokens_seen": 0,
"step": 44
},
{
"epoch": 0.5294117647058824,
"grad_norm": 0.27838893675816295,
"learning_rate": 1.983969333992804e-05,
"loss": 0.33975788950920105,
"num_input_tokens_seen": 0,
"step": 45
},
{
"epoch": 0.5411764705882353,
"grad_norm": 0.2500924041093808,
"learning_rate": 1.982549215707209e-05,
"loss": 0.3427805006504059,
"num_input_tokens_seen": 0,
"step": 46
},
{
"epoch": 0.5529411764705883,
"grad_norm": 0.2515785105886048,
"learning_rate": 1.9810693881508548e-05,
"loss": 0.34949395060539246,
"num_input_tokens_seen": 0,
"step": 47
},
{
"epoch": 0.5647058823529412,
"grad_norm": 0.29781103489548,
"learning_rate": 1.9795299412524948e-05,
"loss": 0.34314972162246704,
"num_input_tokens_seen": 0,
"step": 48
},
{
"epoch": 0.5764705882352941,
"grad_norm": 0.2938887318454496,
"learning_rate": 1.9779309685639317e-05,
"loss": 0.3414318263530731,
"num_input_tokens_seen": 0,
"step": 49
},
{
"epoch": 0.5882352941176471,
"grad_norm": 0.2895540362722232,
"learning_rate": 1.9762725672543372e-05,
"loss": 0.3192686140537262,
"num_input_tokens_seen": 0,
"step": 50
},
{
"epoch": 0.6,
"grad_norm": 0.23066275708054598,
"learning_rate": 1.9745548381043454e-05,
"loss": 0.3385634422302246,
"num_input_tokens_seen": 0,
"step": 51
},
{
"epoch": 0.611764705882353,
"grad_norm": 0.280246896213228,
"learning_rate": 1.9727778854999283e-05,
"loss": 0.35149312019348145,
"num_input_tokens_seen": 0,
"step": 52
},
{
"epoch": 0.6235294117647059,
"grad_norm": 0.27445702417204193,
"learning_rate": 1.9709418174260523e-05,
"loss": 0.3358836770057678,
"num_input_tokens_seen": 0,
"step": 53
},
{
"epoch": 0.6352941176470588,
"grad_norm": 0.27123307051788814,
"learning_rate": 1.969046745460116e-05,
"loss": 0.35038888454437256,
"num_input_tokens_seen": 0,
"step": 54
},
{
"epoch": 0.6470588235294118,
"grad_norm": 0.3022409414703534,
"learning_rate": 1.9670927847651707e-05,
"loss": 0.3619537651538849,
"num_input_tokens_seen": 0,
"step": 55
},
{
"epoch": 0.6588235294117647,
"grad_norm": 0.28043980426354626,
"learning_rate": 1.9650800540829204e-05,
"loss": 0.334235817193985,
"num_input_tokens_seen": 0,
"step": 56
},
{
"epoch": 0.6705882352941176,
"grad_norm": 0.2608407829948446,
"learning_rate": 1.963008675726506e-05,
"loss": 0.3367481827735901,
"num_input_tokens_seen": 0,
"step": 57
},
{
"epoch": 0.6823529411764706,
"grad_norm": 0.28536414145460753,
"learning_rate": 1.9608787755730746e-05,
"loss": 0.3296854496002197,
"num_input_tokens_seen": 0,
"step": 58
},
{
"epoch": 0.6941176470588235,
"grad_norm": 0.27980621624734936,
"learning_rate": 1.958690483056126e-05,
"loss": 0.32561179995536804,
"num_input_tokens_seen": 0,
"step": 59
},
{
"epoch": 0.7058823529411765,
"grad_norm": 0.2424680758848498,
"learning_rate": 1.9564439311576515e-05,
"loss": 0.33346784114837646,
"num_input_tokens_seen": 0,
"step": 60
},
{
"epoch": 0.7176470588235294,
"grad_norm": 0.24779814083785615,
"learning_rate": 1.954139256400049e-05,
"loss": 0.34621721506118774,
"num_input_tokens_seen": 0,
"step": 61
},
{
"epoch": 0.7294117647058823,
"grad_norm": 0.28660266256207545,
"learning_rate": 1.951776598837829e-05,
"loss": 0.31782716512680054,
"num_input_tokens_seen": 0,
"step": 62
},
{
"epoch": 0.7411764705882353,
"grad_norm": 0.2628734068408129,
"learning_rate": 1.9493561020491024e-05,
"loss": 0.3253316283226013,
"num_input_tokens_seen": 0,
"step": 63
},
{
"epoch": 0.7529411764705882,
"grad_norm": 0.253172171843236,
"learning_rate": 1.9468779131268553e-05,
"loss": 0.32543760538101196,
"num_input_tokens_seen": 0,
"step": 64
},
{
"epoch": 0.7647058823529411,
"grad_norm": 0.28753705660744233,
"learning_rate": 1.9443421826700096e-05,
"loss": 0.32660526037216187,
"num_input_tokens_seen": 0,
"step": 65
},
{
"epoch": 0.7764705882352941,
"grad_norm": 0.2550674488664895,
"learning_rate": 1.9417490647742738e-05,
"loss": 0.29762235283851624,
"num_input_tokens_seen": 0,
"step": 66
},
{
"epoch": 0.788235294117647,
"grad_norm": 0.2387936654216341,
"learning_rate": 1.9390987170227746e-05,
"loss": 0.34908509254455566,
"num_input_tokens_seen": 0,
"step": 67
},
{
"epoch": 0.8,
"grad_norm": 0.28736279751275334,
"learning_rate": 1.9363913004764847e-05,
"loss": 0.3289881944656372,
"num_input_tokens_seen": 0,
"step": 68
},
{
"epoch": 0.8117647058823529,
"grad_norm": 0.2857068101908039,
"learning_rate": 1.9336269796644314e-05,
"loss": 0.316879540681839,
"num_input_tokens_seen": 0,
"step": 69
},
{
"epoch": 0.8235294117647058,
"grad_norm": 0.262248761213064,
"learning_rate": 1.9308059225737015e-05,
"loss": 0.3085065484046936,
"num_input_tokens_seen": 0,
"step": 70
},
{
"epoch": 0.8352941176470589,
"grad_norm": 0.27771726352284015,
"learning_rate": 1.9279283006392304e-05,
"loss": 0.3186359405517578,
"num_input_tokens_seen": 0,
"step": 71
},
{
"epoch": 0.8470588235294118,
"grad_norm": 0.31802345881089383,
"learning_rate": 1.924994288733386e-05,
"loss": 0.329565167427063,
"num_input_tokens_seen": 0,
"step": 72
},
{
"epoch": 0.8588235294117647,
"grad_norm": 0.28385852442224846,
"learning_rate": 1.9220040651553388e-05,
"loss": 0.3364284634590149,
"num_input_tokens_seen": 0,
"step": 73
},
{
"epoch": 0.8705882352941177,
"grad_norm": 0.2533928699463178,
"learning_rate": 1.918957811620231e-05,
"loss": 0.3229159712791443,
"num_input_tokens_seen": 0,
"step": 74
},
{
"epoch": 0.8823529411764706,
"grad_norm": 0.27551639833029534,
"learning_rate": 1.915855713248129e-05,
"loss": 0.317361056804657,
"num_input_tokens_seen": 0,
"step": 75
},
{
"epoch": 0.8941176470588236,
"grad_norm": 0.26990146657663827,
"learning_rate": 1.912697958552778e-05,
"loss": 0.31780922412872314,
"num_input_tokens_seen": 0,
"step": 76
},
{
"epoch": 0.9058823529411765,
"grad_norm": 0.3150863510764379,
"learning_rate": 1.9094847394301427e-05,
"loss": 0.33098268508911133,
"num_input_tokens_seen": 0,
"step": 77
},
{
"epoch": 0.9176470588235294,
"grad_norm": 0.3234901727951674,
"learning_rate": 1.906216251146748e-05,
"loss": 0.341233491897583,
"num_input_tokens_seen": 0,
"step": 78
},
{
"epoch": 0.9294117647058824,
"grad_norm": 0.26331326581875253,
"learning_rate": 1.902892692327811e-05,
"loss": 0.33283838629722595,
"num_input_tokens_seen": 0,
"step": 79
},
{
"epoch": 0.9411764705882353,
"grad_norm": 0.31786869502135223,
"learning_rate": 1.899514264945173e-05,
"loss": 0.3331839442253113,
"num_input_tokens_seen": 0,
"step": 80
},
{
"epoch": 0.9529411764705882,
"grad_norm": 0.29539690375673217,
"learning_rate": 1.8960811743050227e-05,
"loss": 0.33531326055526733,
"num_input_tokens_seen": 0,
"step": 81
},
{
"epoch": 0.9647058823529412,
"grad_norm": 0.30472122382886785,
"learning_rate": 1.8925936290354224e-05,
"loss": 0.3103257417678833,
"num_input_tokens_seen": 0,
"step": 82
},
{
"epoch": 0.9764705882352941,
"grad_norm": 0.2530167563030317,
"learning_rate": 1.8890518410736275e-05,
"loss": 0.32245466113090515,
"num_input_tokens_seen": 0,
"step": 83
},
{
"epoch": 0.9882352941176471,
"grad_norm": 0.28897856083778817,
"learning_rate": 1.8854560256532098e-05,
"loss": 0.3198079764842987,
"num_input_tokens_seen": 0,
"step": 84
},
{
"epoch": 1.0,
"grad_norm": 0.26450715598842334,
"learning_rate": 1.8818064012909755e-05,
"loss": 0.3213130235671997,
"num_input_tokens_seen": 0,
"step": 85
},
{
"epoch": 1.011764705882353,
"grad_norm": 0.2626207427201876,
"learning_rate": 1.878103189773686e-05,
"loss": 0.2763475179672241,
"num_input_tokens_seen": 0,
"step": 86
},
{
"epoch": 1.0235294117647058,
"grad_norm": 0.2629994874410627,
"learning_rate": 1.8743466161445823e-05,
"loss": 0.2665697932243347,
"num_input_tokens_seen": 0,
"step": 87
},
{
"epoch": 1.035294117647059,
"grad_norm": 0.29251013301945034,
"learning_rate": 1.8705369086897063e-05,
"loss": 0.2806475758552551,
"num_input_tokens_seen": 0,
"step": 88
},
{
"epoch": 1.0470588235294118,
"grad_norm": 0.2852660256104482,
"learning_rate": 1.86667429892403e-05,
"loss": 0.2648066282272339,
"num_input_tokens_seen": 0,
"step": 89
},
{
"epoch": 1.0588235294117647,
"grad_norm": 0.2762797160323279,
"learning_rate": 1.862759021577385e-05,
"loss": 0.2734478712081909,
"num_input_tokens_seen": 0,
"step": 90
},
{
"epoch": 1.0705882352941176,
"grad_norm": 0.3020506174745607,
"learning_rate": 1.8587913145801998e-05,
"loss": 0.2635505795478821,
"num_input_tokens_seen": 0,
"step": 91
},
{
"epoch": 1.0823529411764705,
"grad_norm": 0.28351991002826543,
"learning_rate": 1.8547714190490385e-05,
"loss": 0.2799134850502014,
"num_input_tokens_seen": 0,
"step": 92
},
{
"epoch": 1.0941176470588236,
"grad_norm": 0.2740110743262188,
"learning_rate": 1.8506995792719498e-05,
"loss": 0.2726055979728699,
"num_input_tokens_seen": 0,
"step": 93
},
{
"epoch": 1.1058823529411765,
"grad_norm": 0.30269062031162386,
"learning_rate": 1.8465760426936212e-05,
"loss": 0.2837594747543335,
"num_input_tokens_seen": 0,
"step": 94
},
{
"epoch": 1.1176470588235294,
"grad_norm": 0.30315593582121325,
"learning_rate": 1.8424010599003424e-05,
"loss": 0.2731676995754242,
"num_input_tokens_seen": 0,
"step": 95
},
{
"epoch": 1.1294117647058823,
"grad_norm": 0.26149596901353317,
"learning_rate": 1.838174884604776e-05,
"loss": 0.2705945372581482,
"num_input_tokens_seen": 0,
"step": 96
},
{
"epoch": 1.1411764705882352,
"grad_norm": 0.3026007790348899,
"learning_rate": 1.8338977736305408e-05,
"loss": 0.2789444923400879,
"num_input_tokens_seen": 0,
"step": 97
},
{
"epoch": 1.1529411764705881,
"grad_norm": 0.3349505482436329,
"learning_rate": 1.8295699868966038e-05,
"loss": 0.2682260572910309,
"num_input_tokens_seen": 0,
"step": 98
},
{
"epoch": 1.1647058823529413,
"grad_norm": 0.2891949628165266,
"learning_rate": 1.8251917874014854e-05,
"loss": 0.28042054176330566,
"num_input_tokens_seen": 0,
"step": 99
},
{
"epoch": 1.1764705882352942,
"grad_norm": 0.2941393562882544,
"learning_rate": 1.8207634412072765e-05,
"loss": 0.25862598419189453,
"num_input_tokens_seen": 0,
"step": 100
},
{
"epoch": 1.188235294117647,
"grad_norm": 0.32722595542360156,
"learning_rate": 1.8162852174234712e-05,
"loss": 0.2712678909301758,
"num_input_tokens_seen": 0,
"step": 101
},
{
"epoch": 1.2,
"grad_norm": 0.2755950618714099,
"learning_rate": 1.8117573881906114e-05,
"loss": 0.26205819845199585,
"num_input_tokens_seen": 0,
"step": 102
},
{
"epoch": 1.2117647058823529,
"grad_norm": 0.2571435526644292,
"learning_rate": 1.8071802286637505e-05,
"loss": 0.2622745633125305,
"num_input_tokens_seen": 0,
"step": 103
},
{
"epoch": 1.223529411764706,
"grad_norm": 0.2591172743832164,
"learning_rate": 1.8025540169957315e-05,
"loss": 0.25631460547447205,
"num_input_tokens_seen": 0,
"step": 104
},
{
"epoch": 1.2352941176470589,
"grad_norm": 0.2817321177900711,
"learning_rate": 1.7978790343202826e-05,
"loss": 0.2782523036003113,
"num_input_tokens_seen": 0,
"step": 105
},
{
"epoch": 1.2470588235294118,
"grad_norm": 0.33225523735776513,
"learning_rate": 1.7931555647349358e-05,
"loss": 0.2600249946117401,
"num_input_tokens_seen": 0,
"step": 106
},
{
"epoch": 1.2588235294117647,
"grad_norm": 0.2786742348476795,
"learning_rate": 1.7883838952837595e-05,
"loss": 0.25568312406539917,
"num_input_tokens_seen": 0,
"step": 107
},
{
"epoch": 1.2705882352941176,
"grad_norm": 0.2771371487960206,
"learning_rate": 1.7835643159399156e-05,
"loss": 0.2384142279624939,
"num_input_tokens_seen": 0,
"step": 108
},
{
"epoch": 1.2823529411764705,
"grad_norm": 0.31328815588599274,
"learning_rate": 1.778697119588039e-05,
"loss": 0.2667343318462372,
"num_input_tokens_seen": 0,
"step": 109
},
{
"epoch": 1.2941176470588236,
"grad_norm": 0.2690014796691674,
"learning_rate": 1.7737826020064377e-05,
"loss": 0.2558494210243225,
"num_input_tokens_seen": 0,
"step": 110
},
{
"epoch": 1.3058823529411765,
"grad_norm": 0.3038451633685586,
"learning_rate": 1.76882106184912e-05,
"loss": 0.25802576541900635,
"num_input_tokens_seen": 0,
"step": 111
},
{
"epoch": 1.3176470588235294,
"grad_norm": 0.26349039262552754,
"learning_rate": 1.7638128006276422e-05,
"loss": 0.26081448793411255,
"num_input_tokens_seen": 0,
"step": 112
},
{
"epoch": 1.3294117647058823,
"grad_norm": 0.27581161125402026,
"learning_rate": 1.758758122692791e-05,
"loss": 0.27647483348846436,
"num_input_tokens_seen": 0,
"step": 113
},
{
"epoch": 1.3411764705882354,
"grad_norm": 0.3235486769428178,
"learning_rate": 1.753657335216083e-05,
"loss": 0.2677750587463379,
"num_input_tokens_seen": 0,
"step": 114
},
{
"epoch": 1.3529411764705883,
"grad_norm": 0.2809145367414571,
"learning_rate": 1.7485107481711014e-05,
"loss": 0.2682688236236572,
"num_input_tokens_seen": 0,
"step": 115
},
{
"epoch": 1.3647058823529412,
"grad_norm": 0.2619951939456424,
"learning_rate": 1.743318674314656e-05,
"loss": 0.25316929817199707,
"num_input_tokens_seen": 0,
"step": 116
},
{
"epoch": 1.3764705882352941,
"grad_norm": 0.27411080913366315,
"learning_rate": 1.7380814291677818e-05,
"loss": 0.2697577476501465,
"num_input_tokens_seen": 0,
"step": 117
},
{
"epoch": 1.388235294117647,
"grad_norm": 0.3338822677438316,
"learning_rate": 1.7327993309965583e-05,
"loss": 0.2708876132965088,
"num_input_tokens_seen": 0,
"step": 118
},
{
"epoch": 1.4,
"grad_norm": 0.31962282276030907,
"learning_rate": 1.7274727007927747e-05,
"loss": 0.27048563957214355,
"num_input_tokens_seen": 0,
"step": 119
},
{
"epoch": 1.4117647058823528,
"grad_norm": 0.285342846378909,
"learning_rate": 1.7221018622544197e-05,
"loss": 0.2710177004337311,
"num_input_tokens_seen": 0,
"step": 120
},
{
"epoch": 1.423529411764706,
"grad_norm": 0.308814170391406,
"learning_rate": 1.7166871417660116e-05,
"loss": 0.2526181936264038,
"num_input_tokens_seen": 0,
"step": 121
},
{
"epoch": 1.4352941176470588,
"grad_norm": 0.27775597890631276,
"learning_rate": 1.7112288683787637e-05,
"loss": 0.26763850450515747,
"num_input_tokens_seen": 0,
"step": 122
},
{
"epoch": 1.4470588235294117,
"grad_norm": 0.2958185178060128,
"learning_rate": 1.7057273737905887e-05,
"loss": 0.268245667219162,
"num_input_tokens_seen": 0,
"step": 123
},
{
"epoch": 1.4588235294117646,
"grad_norm": 0.2483775556217329,
"learning_rate": 1.70018299232594e-05,
"loss": 0.25788575410842896,
"num_input_tokens_seen": 0,
"step": 124
},
{
"epoch": 1.4705882352941178,
"grad_norm": 0.2811097779442606,
"learning_rate": 1.6945960609154966e-05,
"loss": 0.26732224225997925,
"num_input_tokens_seen": 0,
"step": 125
},
{
"epoch": 1.4823529411764707,
"grad_norm": 0.2934299916938348,
"learning_rate": 1.688966919075687e-05,
"loss": 0.26281166076660156,
"num_input_tokens_seen": 0,
"step": 126
},
{
"epoch": 1.4941176470588236,
"grad_norm": 0.2368134963295287,
"learning_rate": 1.6832959088880557e-05,
"loss": 0.25862863659858704,
"num_input_tokens_seen": 0,
"step": 127
},
{
"epoch": 1.5058823529411764,
"grad_norm": 0.2708304514650526,
"learning_rate": 1.677583374978478e-05,
"loss": 0.2421874701976776,
"num_input_tokens_seen": 0,
"step": 128
},
{
"epoch": 1.5176470588235293,
"grad_norm": 0.2767120423486198,
"learning_rate": 1.6718296644962146e-05,
"loss": 0.2624642550945282,
"num_input_tokens_seen": 0,
"step": 129
},
{
"epoch": 1.5294117647058822,
"grad_norm": 0.2813174470652987,
"learning_rate": 1.6660351270928164e-05,
"loss": 0.24937519431114197,
"num_input_tokens_seen": 0,
"step": 130
},
{
"epoch": 1.5411764705882351,
"grad_norm": 0.3009488397968105,
"learning_rate": 1.660200114900876e-05,
"loss": 0.2704227566719055,
"num_input_tokens_seen": 0,
"step": 131
},
{
"epoch": 1.5529411764705883,
"grad_norm": 0.3141059797795813,
"learning_rate": 1.6543249825126285e-05,
"loss": 0.26932939887046814,
"num_input_tokens_seen": 0,
"step": 132
},
{
"epoch": 1.5647058823529412,
"grad_norm": 0.25053717473426707,
"learning_rate": 1.6484100869584044e-05,
"loss": 0.2592698633670807,
"num_input_tokens_seen": 0,
"step": 133
},
{
"epoch": 1.576470588235294,
"grad_norm": 0.25700597213890997,
"learning_rate": 1.6424557876849308e-05,
"loss": 0.27053964138031006,
"num_input_tokens_seen": 0,
"step": 134
},
{
"epoch": 1.5882352941176472,
"grad_norm": 0.30182930329649144,
"learning_rate": 1.636462446533489e-05,
"loss": 0.25989019870758057,
"num_input_tokens_seen": 0,
"step": 135
},
{
"epoch": 1.6,
"grad_norm": 0.26390881674937633,
"learning_rate": 1.6304304277179267e-05,
"loss": 0.2570236027240753,
"num_input_tokens_seen": 0,
"step": 136
},
{
"epoch": 1.611764705882353,
"grad_norm": 0.2652947312714827,
"learning_rate": 1.6243600978025215e-05,
"loss": 0.2678568363189697,
"num_input_tokens_seen": 0,
"step": 137
},
{
"epoch": 1.6235294117647059,
"grad_norm": 0.2575940385752971,
"learning_rate": 1.6182518256797095e-05,
"loss": 0.2600210905075073,
"num_input_tokens_seen": 0,
"step": 138
},
{
"epoch": 1.6352941176470588,
"grad_norm": 0.2610590842320019,
"learning_rate": 1.612105982547663e-05,
"loss": 0.26671087741851807,
"num_input_tokens_seen": 0,
"step": 139
},
{
"epoch": 1.6470588235294117,
"grad_norm": 0.25464302295329627,
"learning_rate": 1.605922941887737e-05,
"loss": 0.2668280005455017,
"num_input_tokens_seen": 0,
"step": 140
},
{
"epoch": 1.6588235294117646,
"grad_norm": 0.26069231826980477,
"learning_rate": 1.599703079441769e-05,
"loss": 0.2653328478336334,
"num_input_tokens_seen": 0,
"step": 141
},
{
"epoch": 1.6705882352941175,
"grad_norm": 0.27072482250492486,
"learning_rate": 1.5934467731892497e-05,
"loss": 0.2632245719432831,
"num_input_tokens_seen": 0,
"step": 142
},
{
"epoch": 1.6823529411764706,
"grad_norm": 0.24138888757547514,
"learning_rate": 1.5871544033243488e-05,
"loss": 0.26093634963035583,
"num_input_tokens_seen": 0,
"step": 143
},
{
"epoch": 1.6941176470588235,
"grad_norm": 0.25857892670146815,
"learning_rate": 1.5808263522328137e-05,
"loss": 0.2518957853317261,
"num_input_tokens_seen": 0,
"step": 144
},
{
"epoch": 1.7058823529411766,
"grad_norm": 0.25322801625227936,
"learning_rate": 1.5744630044687307e-05,
"loss": 0.25198179483413696,
"num_input_tokens_seen": 0,
"step": 145
},
{
"epoch": 1.7176470588235295,
"grad_norm": 0.23398219863607192,
"learning_rate": 1.568064746731156e-05,
"loss": 0.25039592385292053,
"num_input_tokens_seen": 0,
"step": 146
},
{
"epoch": 1.7294117647058824,
"grad_norm": 0.22752785226042835,
"learning_rate": 1.561631967840617e-05,
"loss": 0.25004899501800537,
"num_input_tokens_seen": 0,
"step": 147
},
{
"epoch": 1.7411764705882353,
"grad_norm": 0.26867363858385673,
"learning_rate": 1.5551650587154815e-05,
"loss": 0.2628065347671509,
"num_input_tokens_seen": 0,
"step": 148
},
{
"epoch": 1.7529411764705882,
"grad_norm": 0.2572214700469002,
"learning_rate": 1.5486644123482047e-05,
"loss": 0.2694377899169922,
"num_input_tokens_seen": 0,
"step": 149
},
{
"epoch": 1.7647058823529411,
"grad_norm": 0.2649069012394484,
"learning_rate": 1.542130423781444e-05,
"loss": 0.2698570787906647,
"num_input_tokens_seen": 0,
"step": 150
},
{
"epoch": 1.776470588235294,
"grad_norm": 0.3129557276746984,
"learning_rate": 1.5355634900840558e-05,
"loss": 0.2620123624801636,
"num_input_tokens_seen": 0,
"step": 151
},
{
"epoch": 1.788235294117647,
"grad_norm": 0.219414643912218,
"learning_rate": 1.5289640103269626e-05,
"loss": 0.24250832200050354,
"num_input_tokens_seen": 0,
"step": 152
},
{
"epoch": 1.8,
"grad_norm": 0.2787522458312503,
"learning_rate": 1.5223323855589027e-05,
"loss": 0.2599625885486603,
"num_input_tokens_seen": 0,
"step": 153
},
{
"epoch": 1.811764705882353,
"grad_norm": 0.24624844789559322,
"learning_rate": 1.5156690187820596e-05,
"loss": 0.2539859712123871,
"num_input_tokens_seen": 0,
"step": 154
},
{
"epoch": 1.8235294117647058,
"grad_norm": 0.28786535612403885,
"learning_rate": 1.50897431492757e-05,
"loss": 0.251323938369751,
"num_input_tokens_seen": 0,
"step": 155
},
{
"epoch": 1.835294117647059,
"grad_norm": 0.2861446800798861,
"learning_rate": 1.5022486808309171e-05,
"loss": 0.2852325439453125,
"num_input_tokens_seen": 0,
"step": 156
},
{
"epoch": 1.8470588235294119,
"grad_norm": 0.30835997118524755,
"learning_rate": 1.4954925252072077e-05,
"loss": 0.2626144289970398,
"num_input_tokens_seen": 0,
"step": 157
},
{
"epoch": 1.8588235294117648,
"grad_norm": 0.27166093756727683,
"learning_rate": 1.4887062586263334e-05,
"loss": 0.26250118017196655,
"num_input_tokens_seen": 0,
"step": 158
},
{
"epoch": 1.8705882352941177,
"grad_norm": 0.30818985316404857,
"learning_rate": 1.4818902934880222e-05,
"loss": 0.27699387073516846,
"num_input_tokens_seen": 0,
"step": 159
},
{
"epoch": 1.8823529411764706,
"grad_norm": 0.30205479197808555,
"learning_rate": 1.4750450439967751e-05,
"loss": 0.272649347782135,
"num_input_tokens_seen": 0,
"step": 160
},
{
"epoch": 1.8941176470588235,
"grad_norm": 0.29949042144033816,
"learning_rate": 1.4681709261366963e-05,
"loss": 0.2485789656639099,
"num_input_tokens_seen": 0,
"step": 161
},
{
"epoch": 1.9058823529411764,
"grad_norm": 0.267903631477539,
"learning_rate": 1.4612683576462135e-05,
"loss": 0.2616223096847534,
"num_input_tokens_seen": 0,
"step": 162
},
{
"epoch": 1.9176470588235293,
"grad_norm": 0.27260315220708237,
"learning_rate": 1.4543377579926915e-05,
"loss": 0.27286335825920105,
"num_input_tokens_seen": 0,
"step": 163
},
{
"epoch": 1.9294117647058824,
"grad_norm": 0.28592302424298965,
"learning_rate": 1.4473795483469442e-05,
"loss": 0.24860531091690063,
"num_input_tokens_seen": 0,
"step": 164
},
{
"epoch": 1.9411764705882353,
"grad_norm": 0.27067444548694936,
"learning_rate": 1.4403941515576344e-05,
"loss": 0.2611614167690277,
"num_input_tokens_seen": 0,
"step": 165
},
{
"epoch": 1.9529411764705882,
"grad_norm": 0.26432408877050523,
"learning_rate": 1.4333819921255836e-05,
"loss": 0.26266223192214966,
"num_input_tokens_seen": 0,
"step": 166
},
{
"epoch": 1.9647058823529413,
"grad_norm": 0.32069387585361836,
"learning_rate": 1.4263434961779709e-05,
"loss": 0.24890068173408508,
"num_input_tokens_seen": 0,
"step": 167
},
{
"epoch": 1.9764705882352942,
"grad_norm": 0.28968277975368684,
"learning_rate": 1.41927909144244e-05,
"loss": 0.2612011432647705,
"num_input_tokens_seen": 0,
"step": 168
},
{
"epoch": 1.988235294117647,
"grad_norm": 0.2593706365289158,
"learning_rate": 1.412189207221104e-05,
"loss": 0.24890106916427612,
"num_input_tokens_seen": 0,
"step": 169
},
{
"epoch": 2.0,
"grad_norm": 0.25908450639554936,
"learning_rate": 1.4050742743644588e-05,
"loss": 0.25550538301467896,
"num_input_tokens_seen": 0,
"step": 170
},
{
"epoch": 2.011764705882353,
"grad_norm": 0.32606044201267254,
"learning_rate": 1.3979347252451994e-05,
"loss": 0.20405685901641846,
"num_input_tokens_seen": 0,
"step": 171
},
{
"epoch": 2.023529411764706,
"grad_norm": 0.31532835367496725,
"learning_rate": 1.3907709937319451e-05,
"loss": 0.2080579251050949,
"num_input_tokens_seen": 0,
"step": 172
},
{
"epoch": 2.0352941176470587,
"grad_norm": 0.23106550000023307,
"learning_rate": 1.3835835151628728e-05,
"loss": 0.1862945556640625,
"num_input_tokens_seen": 0,
"step": 173
},
{
"epoch": 2.0470588235294116,
"grad_norm": 0.2399759682184491,
"learning_rate": 1.3763727263192626e-05,
"loss": 0.18684154748916626,
"num_input_tokens_seen": 0,
"step": 174
},
{
"epoch": 2.0588235294117645,
"grad_norm": 0.24298409208730917,
"learning_rate": 1.3691390653989536e-05,
"loss": 0.19205346703529358,
"num_input_tokens_seen": 0,
"step": 175
},
{
"epoch": 2.070588235294118,
"grad_norm": 0.2606890104298591,
"learning_rate": 1.3618829719897158e-05,
"loss": 0.19722914695739746,
"num_input_tokens_seen": 0,
"step": 176
},
{
"epoch": 2.0823529411764707,
"grad_norm": 0.2634006155067239,
"learning_rate": 1.3546048870425356e-05,
"loss": 0.18658706545829773,
"num_input_tokens_seen": 0,
"step": 177
},
{
"epoch": 2.0941176470588236,
"grad_norm": 0.2474551016529151,
"learning_rate": 1.3473052528448203e-05,
"loss": 0.18761307001113892,
"num_input_tokens_seen": 0,
"step": 178
},
{
"epoch": 2.1058823529411765,
"grad_norm": 0.2773501459528279,
"learning_rate": 1.3399845129935191e-05,
"loss": 0.2006130963563919,
"num_input_tokens_seen": 0,
"step": 179
},
{
"epoch": 2.1176470588235294,
"grad_norm": 0.24768518968840073,
"learning_rate": 1.3326431123681667e-05,
"loss": 0.1869545876979828,
"num_input_tokens_seen": 0,
"step": 180
},
{
"epoch": 2.1294117647058823,
"grad_norm": 0.24087563849344726,
"learning_rate": 1.3252814971038477e-05,
"loss": 0.19419728219509125,
"num_input_tokens_seen": 0,
"step": 181
},
{
"epoch": 2.1411764705882352,
"grad_norm": 0.24859116981429222,
"learning_rate": 1.3179001145640856e-05,
"loss": 0.1937357634305954,
"num_input_tokens_seen": 0,
"step": 182
},
{
"epoch": 2.152941176470588,
"grad_norm": 0.2513377458414818,
"learning_rate": 1.3104994133136563e-05,
"loss": 0.18806332349777222,
"num_input_tokens_seen": 0,
"step": 183
},
{
"epoch": 2.164705882352941,
"grad_norm": 0.24195612774749747,
"learning_rate": 1.3030798430913289e-05,
"loss": 0.19312450289726257,
"num_input_tokens_seen": 0,
"step": 184
},
{
"epoch": 2.176470588235294,
"grad_norm": 0.2598954308224352,
"learning_rate": 1.295641854782535e-05,
"loss": 0.19178995490074158,
"num_input_tokens_seen": 0,
"step": 185
},
{
"epoch": 2.1882352941176473,
"grad_norm": 0.2738424910649441,
"learning_rate": 1.2881859003919688e-05,
"loss": 0.19293949007987976,
"num_input_tokens_seen": 0,
"step": 186
},
{
"epoch": 2.2,
"grad_norm": 0.24146821641260552,
"learning_rate": 1.2807124330161188e-05,
"loss": 0.18528440594673157,
"num_input_tokens_seen": 0,
"step": 187
},
{
"epoch": 2.211764705882353,
"grad_norm": 0.257111381442425,
"learning_rate": 1.2732219068157335e-05,
"loss": 0.18848256766796112,
"num_input_tokens_seen": 0,
"step": 188
},
{
"epoch": 2.223529411764706,
"grad_norm": 0.2526409622347608,
"learning_rate": 1.2657147769882215e-05,
"loss": 0.18127834796905518,
"num_input_tokens_seen": 0,
"step": 189
},
{
"epoch": 2.235294117647059,
"grad_norm": 0.23701529976763616,
"learning_rate": 1.2581914997399899e-05,
"loss": 0.18892061710357666,
"num_input_tokens_seen": 0,
"step": 190
},
{
"epoch": 2.2470588235294118,
"grad_norm": 0.24297086363023263,
"learning_rate": 1.2506525322587207e-05,
"loss": 0.19873817265033722,
"num_input_tokens_seen": 0,
"step": 191
},
{
"epoch": 2.2588235294117647,
"grad_norm": 0.2537032696104157,
"learning_rate": 1.2430983326855873e-05,
"loss": 0.1893860250711441,
"num_input_tokens_seen": 0,
"step": 192
},
{
"epoch": 2.2705882352941176,
"grad_norm": 0.23876942589975814,
"learning_rate": 1.2355293600874132e-05,
"loss": 0.18759432435035706,
"num_input_tokens_seen": 0,
"step": 193
},
{
"epoch": 2.2823529411764705,
"grad_norm": 0.2435388542806445,
"learning_rate": 1.2279460744287755e-05,
"loss": 0.18849223852157593,
"num_input_tokens_seen": 0,
"step": 194
},
{
"epoch": 2.2941176470588234,
"grad_norm": 0.2647343889775541,
"learning_rate": 1.220348936544052e-05,
"loss": 0.18661049008369446,
"num_input_tokens_seen": 0,
"step": 195
},
{
"epoch": 2.3058823529411763,
"grad_norm": 0.25540155279573523,
"learning_rate": 1.2127384081094167e-05,
"loss": 0.18517085909843445,
"num_input_tokens_seen": 0,
"step": 196
},
{
"epoch": 2.317647058823529,
"grad_norm": 0.24552318557540526,
"learning_rate": 1.205114951614785e-05,
"loss": 0.17878204584121704,
"num_input_tokens_seen": 0,
"step": 197
},
{
"epoch": 2.3294117647058825,
"grad_norm": 0.2258935926658077,
"learning_rate": 1.197479030335706e-05,
"loss": 0.18578067421913147,
"num_input_tokens_seen": 0,
"step": 198
},
{
"epoch": 2.3411764705882354,
"grad_norm": 0.22583777859137,
"learning_rate": 1.1898311083052113e-05,
"loss": 0.19397635757923126,
"num_input_tokens_seen": 0,
"step": 199
},
{
"epoch": 2.3529411764705883,
"grad_norm": 0.23201542489820412,
"learning_rate": 1.1821716502856154e-05,
"loss": 0.18146567046642303,
"num_input_tokens_seen": 0,
"step": 200
},
{
"epoch": 2.364705882352941,
"grad_norm": 0.28552700838642453,
"learning_rate": 1.1745011217402709e-05,
"loss": 0.19469541311264038,
"num_input_tokens_seen": 0,
"step": 201
},
{
"epoch": 2.376470588235294,
"grad_norm": 0.24910488131854605,
"learning_rate": 1.1668199888052844e-05,
"loss": 0.18924464285373688,
"num_input_tokens_seen": 0,
"step": 202
},
{
"epoch": 2.388235294117647,
"grad_norm": 0.24952841695443162,
"learning_rate": 1.159128718261189e-05,
"loss": 0.18815085291862488,
"num_input_tokens_seen": 0,
"step": 203
},
{
"epoch": 2.4,
"grad_norm": 0.22629712220582293,
"learning_rate": 1.1514277775045768e-05,
"loss": 0.18509158492088318,
"num_input_tokens_seen": 0,
"step": 204
},
{
"epoch": 2.411764705882353,
"grad_norm": 0.2258797767600323,
"learning_rate": 1.1437176345196967e-05,
"loss": 0.17601992189884186,
"num_input_tokens_seen": 0,
"step": 205
},
{
"epoch": 2.4235294117647057,
"grad_norm": 0.2994549469629298,
"learning_rate": 1.135998757850015e-05,
"loss": 0.19033361971378326,
"num_input_tokens_seen": 0,
"step": 206
},
{
"epoch": 2.435294117647059,
"grad_norm": 0.28669793445051134,
"learning_rate": 1.128271616569741e-05,
"loss": 0.19659247994422913,
"num_input_tokens_seen": 0,
"step": 207
},
{
"epoch": 2.447058823529412,
"grad_norm": 0.24321969874326846,
"learning_rate": 1.1205366802553231e-05,
"loss": 0.189006507396698,
"num_input_tokens_seen": 0,
"step": 208
},
{
"epoch": 2.458823529411765,
"grad_norm": 0.23277687621799142,
"learning_rate": 1.1127944189569122e-05,
"loss": 0.18315881490707397,
"num_input_tokens_seen": 0,
"step": 209
},
{
"epoch": 2.4705882352941178,
"grad_norm": 0.24644185758060683,
"learning_rate": 1.1050453031697958e-05,
"loss": 0.18082918226718903,
"num_input_tokens_seen": 0,
"step": 210
},
{
"epoch": 2.4823529411764707,
"grad_norm": 0.27537652887423003,
"learning_rate": 1.0972898038058077e-05,
"loss": 0.18804597854614258,
"num_input_tokens_seen": 0,
"step": 211
},
{
"epoch": 2.4941176470588236,
"grad_norm": 0.22999355280888956,
"learning_rate": 1.0895283921647098e-05,
"loss": 0.18512041866779327,
"num_input_tokens_seen": 0,
"step": 212
},
{
"epoch": 2.5058823529411764,
"grad_norm": 0.24328460263907906,
"learning_rate": 1.0817615399055513e-05,
"loss": 0.18306857347488403,
"num_input_tokens_seen": 0,
"step": 213
},
{
"epoch": 2.5176470588235293,
"grad_norm": 0.24353741537161722,
"learning_rate": 1.0739897190180066e-05,
"loss": 0.18730933964252472,
"num_input_tokens_seen": 0,
"step": 214
},
{
"epoch": 2.5294117647058822,
"grad_norm": 0.24168464720218039,
"learning_rate": 1.0662134017936924e-05,
"loss": 0.1890895515680313,
"num_input_tokens_seen": 0,
"step": 215
},
{
"epoch": 2.541176470588235,
"grad_norm": 0.2464118373551017,
"learning_rate": 1.0584330607974673e-05,
"loss": 0.1896791309118271,
"num_input_tokens_seen": 0,
"step": 216
},
{
"epoch": 2.552941176470588,
"grad_norm": 0.2272154213017855,
"learning_rate": 1.0506491688387128e-05,
"loss": 0.187567800283432,
"num_input_tokens_seen": 0,
"step": 217
},
{
"epoch": 2.564705882352941,
"grad_norm": 0.23687499350154168,
"learning_rate": 1.0428621989426016e-05,
"loss": 0.19160117208957672,
"num_input_tokens_seen": 0,
"step": 218
},
{
"epoch": 2.576470588235294,
"grad_norm": 0.23229299934050784,
"learning_rate": 1.0350726243213519e-05,
"loss": 0.18402451276779175,
"num_input_tokens_seen": 0,
"step": 219
},
{
"epoch": 2.588235294117647,
"grad_norm": 0.2515873476821987,
"learning_rate": 1.0272809183454701e-05,
"loss": 0.18722085654735565,
"num_input_tokens_seen": 0,
"step": 220
},
{
"epoch": 2.6,
"grad_norm": 0.22755973648814593,
"learning_rate": 1.0194875545149854e-05,
"loss": 0.18111610412597656,
"num_input_tokens_seen": 0,
"step": 221
},
{
"epoch": 2.611764705882353,
"grad_norm": 0.23007823552128587,
"learning_rate": 1.0116930064306736e-05,
"loss": 0.19812649488449097,
"num_input_tokens_seen": 0,
"step": 222
},
{
"epoch": 2.623529411764706,
"grad_norm": 0.22528651243150996,
"learning_rate": 1.0038977477652779e-05,
"loss": 0.18580538034439087,
"num_input_tokens_seen": 0,
"step": 223
},
{
"epoch": 2.635294117647059,
"grad_norm": 0.23539863556511334,
"learning_rate": 9.961022522347226e-06,
"loss": 0.18501965701580048,
"num_input_tokens_seen": 0,
"step": 224
},
{
"epoch": 2.6470588235294117,
"grad_norm": 0.22782974012346754,
"learning_rate": 9.883069935693267e-06,
"loss": 0.18402716517448425,
"num_input_tokens_seen": 0,
"step": 225
},
{
"epoch": 2.6588235294117646,
"grad_norm": 0.24487953714591462,
"learning_rate": 9.80512445485015e-06,
"loss": 0.18938913941383362,
"num_input_tokens_seen": 0,
"step": 226
},
{
"epoch": 2.6705882352941175,
"grad_norm": 0.24462792166495934,
"learning_rate": 9.7271908165453e-06,
"loss": 0.19719335436820984,
"num_input_tokens_seen": 0,
"step": 227
},
{
"epoch": 2.682352941176471,
"grad_norm": 0.2366728459616901,
"learning_rate": 9.649273756786486e-06,
"loss": 0.185680091381073,
"num_input_tokens_seen": 0,
"step": 228
},
{
"epoch": 2.6941176470588237,
"grad_norm": 0.2303882056729561,
"learning_rate": 9.57137801057399e-06,
"loss": 0.19624218344688416,
"num_input_tokens_seen": 0,
"step": 229
},
{
"epoch": 2.7058823529411766,
"grad_norm": 0.22987803077687444,
"learning_rate": 9.493508311612874e-06,
"loss": 0.17861570417881012,
"num_input_tokens_seen": 0,
"step": 230
},
{
"epoch": 2.7176470588235295,
"grad_norm": 0.26388168681073687,
"learning_rate": 9.415669392025329e-06,
"loss": 0.18734458088874817,
"num_input_tokens_seen": 0,
"step": 231
},
{
"epoch": 2.7294117647058824,
"grad_norm": 0.22186631357859773,
"learning_rate": 9.337865982063076e-06,
"loss": 0.1946583092212677,
"num_input_tokens_seen": 0,
"step": 232
},
{
"epoch": 2.7411764705882353,
"grad_norm": 0.2311568846601055,
"learning_rate": 9.260102809819939e-06,
"loss": 0.18761436641216278,
"num_input_tokens_seen": 0,
"step": 233
},
{
"epoch": 2.7529411764705882,
"grad_norm": 0.22628859572679205,
"learning_rate": 9.182384600944494e-06,
"loss": 0.18877655267715454,
"num_input_tokens_seen": 0,
"step": 234
},
{
"epoch": 2.764705882352941,
"grad_norm": 0.24528368812451035,
"learning_rate": 9.104716078352906e-06,
"loss": 0.18831658363342285,
"num_input_tokens_seen": 0,
"step": 235
},
{
"epoch": 2.776470588235294,
"grad_norm": 0.22998847186224078,
"learning_rate": 9.027101961941925e-06,
"loss": 0.18712544441223145,
"num_input_tokens_seen": 0,
"step": 236
},
{
"epoch": 2.788235294117647,
"grad_norm": 0.22929072663885758,
"learning_rate": 8.949546968302042e-06,
"loss": 0.20112478733062744,
"num_input_tokens_seen": 0,
"step": 237
},
{
"epoch": 2.8,
"grad_norm": 0.23007743920004314,
"learning_rate": 8.872055810430881e-06,
"loss": 0.18601751327514648,
"num_input_tokens_seen": 0,
"step": 238
},
{
"epoch": 2.8117647058823527,
"grad_norm": 0.23659776591024959,
"learning_rate": 8.79463319744677e-06,
"loss": 0.1808547079563141,
"num_input_tokens_seen": 0,
"step": 239
},
{
"epoch": 2.8235294117647056,
"grad_norm": 0.23509031587485976,
"learning_rate": 8.717283834302593e-06,
"loss": 0.18669113516807556,
"num_input_tokens_seen": 0,
"step": 240
},
{
"epoch": 2.835294117647059,
"grad_norm": 0.23472960086401704,
"learning_rate": 8.640012421499856e-06,
"loss": 0.19292673468589783,
"num_input_tokens_seen": 0,
"step": 241
},
{
"epoch": 2.847058823529412,
"grad_norm": 0.25429765516678327,
"learning_rate": 8.562823654803035e-06,
"loss": 0.18981140851974487,
"num_input_tokens_seen": 0,
"step": 242
},
{
"epoch": 2.8588235294117648,
"grad_norm": 0.23658961623110553,
"learning_rate": 8.485722224954237e-06,
"loss": 0.1999085545539856,
"num_input_tokens_seen": 0,
"step": 243
},
{
"epoch": 2.8705882352941177,
"grad_norm": 0.23199472766369356,
"learning_rate": 8.408712817388113e-06,
"loss": 0.1827118992805481,
"num_input_tokens_seen": 0,
"step": 244
},
{
"epoch": 2.8823529411764706,
"grad_norm": 0.25077191290374024,
"learning_rate": 8.331800111947158e-06,
"loss": 0.1857125163078308,
"num_input_tokens_seen": 0,
"step": 245
},
{
"epoch": 2.8941176470588235,
"grad_norm": 0.23140729045512615,
"learning_rate": 8.254988782597295e-06,
"loss": 0.18820548057556152,
"num_input_tokens_seen": 0,
"step": 246
},
{
"epoch": 2.9058823529411764,
"grad_norm": 0.22239003021686357,
"learning_rate": 8.178283497143851e-06,
"loss": 0.19076308608055115,
"num_input_tokens_seen": 0,
"step": 247
},
{
"epoch": 2.9176470588235293,
"grad_norm": 0.21803105840925516,
"learning_rate": 8.10168891694789e-06,
"loss": 0.18549099564552307,
"num_input_tokens_seen": 0,
"step": 248
},
{
"epoch": 2.9294117647058826,
"grad_norm": 0.23828217230216947,
"learning_rate": 8.025209696642942e-06,
"loss": 0.1965373158454895,
"num_input_tokens_seen": 0,
"step": 249
},
{
"epoch": 2.9411764705882355,
"grad_norm": 0.2390427784115555,
"learning_rate": 7.948850483852153e-06,
"loss": 0.18414372205734253,
"num_input_tokens_seen": 0,
"step": 250
},
{
"epoch": 2.9529411764705884,
"grad_norm": 0.24685443360941575,
"learning_rate": 7.872615918905833e-06,
"loss": 0.19256475567817688,
"num_input_tokens_seen": 0,
"step": 251
},
{
"epoch": 2.9647058823529413,
"grad_norm": 0.23889907605167213,
"learning_rate": 7.796510634559487e-06,
"loss": 0.19201350212097168,
"num_input_tokens_seen": 0,
"step": 252
},
{
"epoch": 2.976470588235294,
"grad_norm": 0.24349828471939475,
"learning_rate": 7.720539255712252e-06,
"loss": 0.18964079022407532,
"num_input_tokens_seen": 0,
"step": 253
},
{
"epoch": 2.988235294117647,
"grad_norm": 0.25208339650704936,
"learning_rate": 7.644706399125871e-06,
"loss": 0.19716620445251465,
"num_input_tokens_seen": 0,
"step": 254
},
{
"epoch": 3.0,
"grad_norm": 0.22219530818543262,
"learning_rate": 7.569016673144132e-06,
"loss": 0.19001775979995728,
"num_input_tokens_seen": 0,
"step": 255
}
],
"logging_steps": 1,
"max_steps": 425,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 267247958228992.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}