EmoCaliber-S1 / trainer_state.json
wudq's picture
Add files using upload-large-folder tool
292c363 verified
{
"best_global_step": 666,
"best_metric": 3.76118374,
"best_model_checkpoint": "/mnt/bn/wdq-base1/data/VLMs/vsa_rl/checkpoint/reasoning_sft_1009/v5-20251009-231445/checkpoint-666",
"epoch": 1.0,
"eval_steps": 100,
"global_step": 666,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0015015015015015015,
"grad_norm": 18.43874168395996,
"learning_rate": 2.9411764705882356e-07,
"loss": 1.8555517196655273,
"memory(GiB)": 45.75,
"step": 1,
"token_acc": 0.5909588042289464,
"train_speed(iter/s)": 0.01347
},
{
"epoch": 0.0075075075075075074,
"grad_norm": 20.15400505065918,
"learning_rate": 1.4705882352941177e-06,
"loss": 1.8491613864898682,
"memory(GiB)": 53.59,
"step": 5,
"token_acc": 0.6025925925925926,
"train_speed(iter/s)": 0.036205
},
{
"epoch": 0.015015015015015015,
"grad_norm": 10.93825912475586,
"learning_rate": 2.9411764705882355e-06,
"loss": 1.559732437133789,
"memory(GiB)": 53.59,
"step": 10,
"token_acc": 0.6266354798625832,
"train_speed(iter/s)": 0.046254
},
{
"epoch": 0.02252252252252252,
"grad_norm": 5.14630651473999,
"learning_rate": 4.411764705882353e-06,
"loss": 1.0728832244873048,
"memory(GiB)": 53.59,
"step": 15,
"token_acc": 0.7059398496240602,
"train_speed(iter/s)": 0.051056
},
{
"epoch": 0.03003003003003003,
"grad_norm": 10.39709186553955,
"learning_rate": 5.882352941176471e-06,
"loss": 0.8455103874206543,
"memory(GiB)": 53.59,
"step": 20,
"token_acc": 0.7537844383893431,
"train_speed(iter/s)": 0.05419
},
{
"epoch": 0.03753753753753754,
"grad_norm": 4.8650970458984375,
"learning_rate": 7.352941176470589e-06,
"loss": 0.7454294204711914,
"memory(GiB)": 53.59,
"step": 25,
"token_acc": 0.7660878447395302,
"train_speed(iter/s)": 0.055846
},
{
"epoch": 0.04504504504504504,
"grad_norm": 4.418037414550781,
"learning_rate": 8.823529411764707e-06,
"loss": 0.6922950744628906,
"memory(GiB)": 61.49,
"step": 30,
"token_acc": 0.7846728698073119,
"train_speed(iter/s)": 0.057253
},
{
"epoch": 0.052552552552552555,
"grad_norm": 7.153648376464844,
"learning_rate": 9.99993822614516e-06,
"loss": 0.6542837142944335,
"memory(GiB)": 61.49,
"step": 35,
"token_acc": 0.7906924460431655,
"train_speed(iter/s)": 0.058147
},
{
"epoch": 0.06006006006006006,
"grad_norm": 3.7986629009246826,
"learning_rate": 9.997776301493914e-06,
"loss": 0.6283172607421875,
"memory(GiB)": 61.49,
"step": 40,
"token_acc": 0.7965453877251011,
"train_speed(iter/s)": 0.058772
},
{
"epoch": 0.06756756756756757,
"grad_norm": 4.944998264312744,
"learning_rate": 9.992527210334781e-06,
"loss": 0.6150907516479492,
"memory(GiB)": 61.49,
"step": 45,
"token_acc": 0.7986261479877548,
"train_speed(iter/s)": 0.059369
},
{
"epoch": 0.07507507507507508,
"grad_norm": 12.669698715209961,
"learning_rate": 9.98419419507348e-06,
"loss": 0.5961947441101074,
"memory(GiB)": 69.42,
"step": 50,
"token_acc": 0.8023150932050511,
"train_speed(iter/s)": 0.059729
},
{
"epoch": 0.08258258258258258,
"grad_norm": 3.911559820175171,
"learning_rate": 9.972782403080372e-06,
"loss": 0.5979935169219971,
"memory(GiB)": 77.39,
"step": 55,
"token_acc": 0.8086269296107541,
"train_speed(iter/s)": 0.060077
},
{
"epoch": 0.09009009009009009,
"grad_norm": 7.1113362312316895,
"learning_rate": 9.958298883510904e-06,
"loss": 0.5927044868469238,
"memory(GiB)": 77.39,
"step": 60,
"token_acc": 0.8021331371827878,
"train_speed(iter/s)": 0.060454
},
{
"epoch": 0.09759759759759759,
"grad_norm": 3.9366421699523926,
"learning_rate": 9.940752582951283e-06,
"loss": 0.5776666164398193,
"memory(GiB)": 77.39,
"step": 65,
"token_acc": 0.8102722213866747,
"train_speed(iter/s)": 0.060781
},
{
"epoch": 0.10510510510510511,
"grad_norm": 3.8400216102600098,
"learning_rate": 9.920154339892104e-06,
"loss": 0.5775270462036133,
"memory(GiB)": 77.4,
"step": 70,
"token_acc": 0.8103343924950824,
"train_speed(iter/s)": 0.061034
},
{
"epoch": 0.11261261261261261,
"grad_norm": 3.48134183883667,
"learning_rate": 9.896516878033318e-06,
"loss": 0.5721072196960449,
"memory(GiB)": 77.4,
"step": 75,
"token_acc": 0.8125897215044502,
"train_speed(iter/s)": 0.061315
},
{
"epoch": 0.12012012012012012,
"grad_norm": 4.326797962188721,
"learning_rate": 9.869854798424709e-06,
"loss": 0.5744771957397461,
"memory(GiB)": 77.4,
"step": 80,
"token_acc": 0.8050841065097685,
"train_speed(iter/s)": 0.06143
},
{
"epoch": 0.12762762762762764,
"grad_norm": 4.147410869598389,
"learning_rate": 9.840184570446702e-06,
"loss": 0.5583375930786133,
"memory(GiB)": 77.4,
"step": 85,
"token_acc": 0.8205675923100396,
"train_speed(iter/s)": 0.061622
},
{
"epoch": 0.13513513513513514,
"grad_norm": 9.281377792358398,
"learning_rate": 9.807524521637103e-06,
"loss": 0.5598219871520996,
"memory(GiB)": 77.4,
"step": 90,
"token_acc": 0.815196639938908,
"train_speed(iter/s)": 0.061807
},
{
"epoch": 0.14264264264264265,
"grad_norm": 4.255290508270264,
"learning_rate": 9.771894826370021e-06,
"loss": 0.5530188083648682,
"memory(GiB)": 77.4,
"step": 95,
"token_acc": 0.8160852001468968,
"train_speed(iter/s)": 0.06204
},
{
"epoch": 0.15015015015015015,
"grad_norm": 3.306349515914917,
"learning_rate": 9.733317493394004e-06,
"loss": 0.5611482620239258,
"memory(GiB)": 77.4,
"step": 100,
"token_acc": 0.809205586424388,
"train_speed(iter/s)": 0.062203
},
{
"epoch": 0.15015015015015015,
"eval_loss": 4.395487308502197,
"eval_runtime": 49.3031,
"eval_samples_per_second": 17.443,
"eval_steps_per_second": 2.191,
"eval_token_acc": 0.8156251073624902,
"step": 100
},
{
"epoch": 0.15765765765765766,
"grad_norm": 3.02348256111145,
"learning_rate": 9.691816352237052e-06,
"loss": 0.5578757286071777,
"memory(GiB)": 77.4,
"step": 105,
"token_acc": 0.8126542517388378,
"train_speed(iter/s)": 0.060579
},
{
"epoch": 0.16516516516516516,
"grad_norm": 3.450737237930298,
"learning_rate": 9.647417038486936e-06,
"loss": 0.5540652751922608,
"memory(GiB)": 77.4,
"step": 110,
"token_acc": 0.815136660724896,
"train_speed(iter/s)": 0.060695
},
{
"epoch": 0.17267267267267267,
"grad_norm": 3.2497100830078125,
"learning_rate": 9.60014697795588e-06,
"loss": 0.5622821807861328,
"memory(GiB)": 77.4,
"step": 115,
"token_acc": 0.8124953734547339,
"train_speed(iter/s)": 0.06081
},
{
"epoch": 0.18018018018018017,
"grad_norm": 3.766254425048828,
"learning_rate": 9.550035369739416e-06,
"loss": 0.5542448043823243,
"memory(GiB)": 77.4,
"step": 120,
"token_acc": 0.8170215979459297,
"train_speed(iter/s)": 0.060974
},
{
"epoch": 0.18768768768768768,
"grad_norm": 6.819382190704346,
"learning_rate": 9.49711316817988e-06,
"loss": 0.5530866622924805,
"memory(GiB)": 77.4,
"step": 125,
"token_acc": 0.8172684695573349,
"train_speed(iter/s)": 0.061077
},
{
"epoch": 0.19519519519519518,
"grad_norm": 4.784273147583008,
"learning_rate": 9.44141306374566e-06,
"loss": 0.5516416072845459,
"memory(GiB)": 77.4,
"step": 130,
"token_acc": 0.8142543693727775,
"train_speed(iter/s)": 0.0612
},
{
"epoch": 0.20270270270270271,
"grad_norm": 4.253176689147949,
"learning_rate": 9.382969462838023e-06,
"loss": 0.5411262512207031,
"memory(GiB)": 77.4,
"step": 135,
"token_acc": 0.814960036193636,
"train_speed(iter/s)": 0.061361
},
{
"epoch": 0.21021021021021022,
"grad_norm": 7.392826557159424,
"learning_rate": 9.32181846653802e-06,
"loss": 0.5480520725250244,
"memory(GiB)": 77.4,
"step": 140,
"token_acc": 0.8117261471230881,
"train_speed(iter/s)": 0.06147
},
{
"epoch": 0.21771771771771772,
"grad_norm": 5.3913750648498535,
"learning_rate": 9.257997848306548e-06,
"loss": 0.5492410659790039,
"memory(GiB)": 77.4,
"step": 145,
"token_acc": 0.8235338860576558,
"train_speed(iter/s)": 0.061595
},
{
"epoch": 0.22522522522522523,
"grad_norm": 5.326717853546143,
"learning_rate": 9.191547030651383e-06,
"loss": 0.537621259689331,
"memory(GiB)": 77.4,
"step": 150,
"token_acc": 0.8174485718514134,
"train_speed(iter/s)": 0.061668
},
{
"epoch": 0.23273273273273273,
"grad_norm": 5.77912712097168,
"learning_rate": 9.122507060775587e-06,
"loss": 0.5331393241882324,
"memory(GiB)": 77.4,
"step": 155,
"token_acc": 0.8221793062467504,
"train_speed(iter/s)": 0.061777
},
{
"epoch": 0.24024024024024024,
"grad_norm": 3.0749833583831787,
"learning_rate": 9.050920585222309e-06,
"loss": 0.5423390388488769,
"memory(GiB)": 77.4,
"step": 160,
"token_acc": 0.8245859403754141,
"train_speed(iter/s)": 0.061826
},
{
"epoch": 0.24774774774774774,
"grad_norm": 3.131141185760498,
"learning_rate": 8.976831823531683e-06,
"loss": 0.5409300804138184,
"memory(GiB)": 77.4,
"step": 165,
"token_acc": 0.8230075410320864,
"train_speed(iter/s)": 0.061937
},
{
"epoch": 0.2552552552552553,
"grad_norm": 8.984918594360352,
"learning_rate": 8.900286540926062e-06,
"loss": 0.5327372550964355,
"memory(GiB)": 77.4,
"step": 170,
"token_acc": 0.8271838775664823,
"train_speed(iter/s)": 0.062016
},
{
"epoch": 0.2627627627627628,
"grad_norm": 2.6073122024536133,
"learning_rate": 8.82133202004047e-06,
"loss": 0.5299601078033447,
"memory(GiB)": 77.4,
"step": 175,
"token_acc": 0.8204494382022471,
"train_speed(iter/s)": 0.062103
},
{
"epoch": 0.2702702702702703,
"grad_norm": 4.3983154296875,
"learning_rate": 8.74001703171574e-06,
"loss": 0.5317525863647461,
"memory(GiB)": 77.4,
"step": 180,
"token_acc": 0.8145350154868928,
"train_speed(iter/s)": 0.062171
},
{
"epoch": 0.2777777777777778,
"grad_norm": 2.6061856746673584,
"learning_rate": 8.656391804872376e-06,
"loss": 0.5338263511657715,
"memory(GiB)": 77.4,
"step": 185,
"token_acc": 0.8210867324399941,
"train_speed(iter/s)": 0.062229
},
{
"epoch": 0.2852852852852853,
"grad_norm": 3.8644051551818848,
"learning_rate": 8.57050799548375e-06,
"loss": 0.5335628509521484,
"memory(GiB)": 77.4,
"step": 190,
"token_acc": 0.8225940161278167,
"train_speed(iter/s)": 0.062293
},
{
"epoch": 0.2927927927927928,
"grad_norm": 3.1008355617523193,
"learning_rate": 8.482418654667777e-06,
"loss": 0.5237902641296387,
"memory(GiB)": 77.4,
"step": 195,
"token_acc": 0.8233494964565461,
"train_speed(iter/s)": 0.062328
},
{
"epoch": 0.3003003003003003,
"grad_norm": 4.561453342437744,
"learning_rate": 8.392178195916832e-06,
"loss": 0.5280370712280273,
"memory(GiB)": 77.4,
"step": 200,
"token_acc": 0.828030888030888,
"train_speed(iter/s)": 0.062359
},
{
"epoch": 0.3003003003003003,
"eval_loss": 4.170934200286865,
"eval_runtime": 49.0517,
"eval_samples_per_second": 17.533,
"eval_steps_per_second": 2.202,
"eval_token_acc": 0.8229910330848249,
"step": 200
},
{
"epoch": 0.3078078078078078,
"grad_norm": 4.3819122314453125,
"learning_rate": 8.299842361486094e-06,
"loss": 0.520693588256836,
"memory(GiB)": 77.4,
"step": 205,
"token_acc": 0.8251148084017165,
"train_speed(iter/s)": 0.061487
},
{
"epoch": 0.3153153153153153,
"grad_norm": 3.4289472103118896,
"learning_rate": 8.2054681879611e-06,
"loss": 0.524658489227295,
"memory(GiB)": 77.4,
"step": 210,
"token_acc": 0.8237686398553999,
"train_speed(iter/s)": 0.061563
},
{
"epoch": 0.3228228228228228,
"grad_norm": 2.9290993213653564,
"learning_rate": 8.109113971025803e-06,
"loss": 0.5151615142822266,
"memory(GiB)": 77.4,
"step": 215,
"token_acc": 0.8236711338653817,
"train_speed(iter/s)": 0.061657
},
{
"epoch": 0.3303303303303303,
"grad_norm": 4.519934177398682,
"learning_rate": 8.010839229452843e-06,
"loss": 0.5212090492248536,
"memory(GiB)": 77.4,
"step": 220,
"token_acc": 0.8203009737385659,
"train_speed(iter/s)": 0.061706
},
{
"epoch": 0.33783783783783783,
"grad_norm": 2.7104790210723877,
"learning_rate": 7.910704668338338e-06,
"loss": 0.5250448226928711,
"memory(GiB)": 77.4,
"step": 225,
"token_acc": 0.823299804834109,
"train_speed(iter/s)": 0.061773
},
{
"epoch": 0.34534534534534533,
"grad_norm": 2.8757286071777344,
"learning_rate": 7.808772141603855e-06,
"loss": 0.5199666976928711,
"memory(GiB)": 77.4,
"step": 230,
"token_acc": 0.8213438148917235,
"train_speed(iter/s)": 0.061801
},
{
"epoch": 0.35285285285285284,
"grad_norm": 4.525195598602295,
"learning_rate": 7.705104613788743e-06,
"loss": 0.5214581489562988,
"memory(GiB)": 77.4,
"step": 235,
"token_acc": 0.8286637121610456,
"train_speed(iter/s)": 0.061859
},
{
"epoch": 0.36036036036036034,
"grad_norm": 22.122285842895508,
"learning_rate": 7.599766121156436e-06,
"loss": 0.5185123443603515,
"memory(GiB)": 77.4,
"step": 240,
"token_acc": 0.8281073764944733,
"train_speed(iter/s)": 0.061884
},
{
"epoch": 0.36786786786786785,
"grad_norm": 3.6401257514953613,
"learning_rate": 7.492821732138737e-06,
"loss": 0.5193865776062012,
"memory(GiB)": 77.4,
"step": 245,
"token_acc": 0.8321651683681244,
"train_speed(iter/s)": 0.061892
},
{
"epoch": 0.37537537537537535,
"grad_norm": 3.1468453407287598,
"learning_rate": 7.3843375071425315e-06,
"loss": 0.5244226455688477,
"memory(GiB)": 77.4,
"step": 250,
"token_acc": 0.8159338649498866,
"train_speed(iter/s)": 0.061943
},
{
"epoch": 0.38288288288288286,
"grad_norm": 2.953416585922241,
"learning_rate": 7.274380457743731e-06,
"loss": 0.5164532661437988,
"memory(GiB)": 77.4,
"step": 255,
"token_acc": 0.8161441656117358,
"train_speed(iter/s)": 0.062
},
{
"epoch": 0.39039039039039036,
"grad_norm": 3.3566670417785645,
"learning_rate": 7.163018505293703e-06,
"loss": 0.5199567317962647,
"memory(GiB)": 77.4,
"step": 260,
"token_acc": 0.8266276517922458,
"train_speed(iter/s)": 0.062058
},
{
"epoch": 0.3978978978978979,
"grad_norm": 5.95110559463501,
"learning_rate": 7.050320438963691e-06,
"loss": 0.5201972961425781,
"memory(GiB)": 77.4,
"step": 265,
"token_acc": 0.8189985272459499,
"train_speed(iter/s)": 0.0621
},
{
"epoch": 0.40540540540540543,
"grad_norm": 18.081615447998047,
"learning_rate": 6.936355873253207e-06,
"loss": 0.5159478187561035,
"memory(GiB)": 77.4,
"step": 270,
"token_acc": 0.8244876108901805,
"train_speed(iter/s)": 0.062133
},
{
"epoch": 0.41291291291291293,
"grad_norm": 6.3731184005737305,
"learning_rate": 6.821195204988578e-06,
"loss": 0.5061209201812744,
"memory(GiB)": 77.4,
"step": 275,
"token_acc": 0.8274302361238883,
"train_speed(iter/s)": 0.062173
},
{
"epoch": 0.42042042042042044,
"grad_norm": 5.5361008644104,
"learning_rate": 6.704909569838281e-06,
"loss": 0.5148390769958496,
"memory(GiB)": 77.4,
"step": 280,
"token_acc": 0.8375707280271596,
"train_speed(iter/s)": 0.062201
},
{
"epoch": 0.42792792792792794,
"grad_norm": 8.514371871948242,
"learning_rate": 6.58757079837186e-06,
"loss": 0.501787519454956,
"memory(GiB)": 77.4,
"step": 285,
"token_acc": 0.8299818566676747,
"train_speed(iter/s)": 0.062228
},
{
"epoch": 0.43543543543543545,
"grad_norm": 2.4984076023101807,
"learning_rate": 6.469251371689606e-06,
"loss": 0.5198217868804932,
"memory(GiB)": 77.4,
"step": 290,
"token_acc": 0.8295916829893024,
"train_speed(iter/s)": 0.062245
},
{
"epoch": 0.44294294294294295,
"grad_norm": 4.352683067321777,
"learning_rate": 6.350024376650403e-06,
"loss": 0.503413200378418,
"memory(GiB)": 77.4,
"step": 295,
"token_acc": 0.8299345323199638,
"train_speed(iter/s)": 0.062291
},
{
"epoch": 0.45045045045045046,
"grad_norm": 4.458861351013184,
"learning_rate": 6.22996346072539e-06,
"loss": 0.5108905792236328,
"memory(GiB)": 77.4,
"step": 300,
"token_acc": 0.8341271022473582,
"train_speed(iter/s)": 0.062322
},
{
"epoch": 0.45045045045045046,
"eval_loss": 4.029459476470947,
"eval_runtime": 48.9329,
"eval_samples_per_second": 17.575,
"eval_steps_per_second": 2.207,
"eval_token_acc": 0.8278627134366303,
"step": 300
},
{
"epoch": 0.45795795795795796,
"grad_norm": 2.8650991916656494,
"learning_rate": 6.109142786505327e-06,
"loss": 0.5027182102203369,
"memory(GiB)": 77.4,
"step": 305,
"token_acc": 0.8356470769705383,
"train_speed(iter/s)": 0.061736
},
{
"epoch": 0.46546546546546547,
"grad_norm": 4.33251953125,
"learning_rate": 5.987636985889764e-06,
"loss": 0.5072842121124268,
"memory(GiB)": 77.4,
"step": 310,
"token_acc": 0.8324474058546081,
"train_speed(iter/s)": 0.061753
},
{
"epoch": 0.47297297297297297,
"grad_norm": 3.968172073364258,
"learning_rate": 5.865521113986322e-06,
"loss": 0.506615161895752,
"memory(GiB)": 77.4,
"step": 315,
"token_acc": 0.8249737197777444,
"train_speed(iter/s)": 0.061785
},
{
"epoch": 0.4804804804804805,
"grad_norm": 3.18407940864563,
"learning_rate": 5.742870602748547e-06,
"loss": 0.5017033576965332,
"memory(GiB)": 77.4,
"step": 320,
"token_acc": 0.8319403659362999,
"train_speed(iter/s)": 0.061808
},
{
"epoch": 0.487987987987988,
"grad_norm": 3.2644057273864746,
"learning_rate": 5.619761214380998e-06,
"loss": 0.4994755744934082,
"memory(GiB)": 77.4,
"step": 325,
"token_acc": 0.8307278031266363,
"train_speed(iter/s)": 0.061844
},
{
"epoch": 0.4954954954954955,
"grad_norm": 3.396632671356201,
"learning_rate": 5.496268994540309e-06,
"loss": 0.5043362617492676,
"memory(GiB)": 77.4,
"step": 330,
"token_acc": 0.8266277066007343,
"train_speed(iter/s)": 0.061862
},
{
"epoch": 0.503003003003003,
"grad_norm": 2.9528186321258545,
"learning_rate": 5.372470225361189e-06,
"loss": 0.5022759437561035,
"memory(GiB)": 77.4,
"step": 335,
"token_acc": 0.828901303538175,
"train_speed(iter/s)": 0.061871
},
{
"epoch": 0.5105105105105106,
"grad_norm": 3.661381959915161,
"learning_rate": 5.2484413783363335e-06,
"loss": 0.49889430999755857,
"memory(GiB)": 77.4,
"step": 340,
"token_acc": 0.835191142365527,
"train_speed(iter/s)": 0.061903
},
{
"epoch": 0.5180180180180181,
"grad_norm": 2.4464967250823975,
"learning_rate": 5.124259067079365e-06,
"loss": 0.5070960044860839,
"memory(GiB)": 77.4,
"step": 345,
"token_acc": 0.8305346884666372,
"train_speed(iter/s)": 0.061921
},
{
"epoch": 0.5255255255255256,
"grad_norm": 5.827206611633301,
"learning_rate": 5e-06,
"loss": 0.49094176292419434,
"memory(GiB)": 77.4,
"step": 350,
"token_acc": 0.8313569498649054,
"train_speed(iter/s)": 0.061945
},
{
"epoch": 0.5330330330330331,
"grad_norm": 2.506747245788574,
"learning_rate": 4.875740932920635e-06,
"loss": 0.4921010971069336,
"memory(GiB)": 77.4,
"step": 355,
"token_acc": 0.833807882511614,
"train_speed(iter/s)": 0.061968
},
{
"epoch": 0.5405405405405406,
"grad_norm": 2.513331651687622,
"learning_rate": 4.751558621663668e-06,
"loss": 0.49839167594909667,
"memory(GiB)": 77.4,
"step": 360,
"token_acc": 0.8323643410852714,
"train_speed(iter/s)": 0.061988
},
{
"epoch": 0.5480480480480481,
"grad_norm": 2.4384679794311523,
"learning_rate": 4.627529774638812e-06,
"loss": 0.4972184181213379,
"memory(GiB)": 77.4,
"step": 365,
"token_acc": 0.8275731679649097,
"train_speed(iter/s)": 0.062021
},
{
"epoch": 0.5555555555555556,
"grad_norm": 4.762032508850098,
"learning_rate": 4.5037310054596936e-06,
"loss": 0.496537971496582,
"memory(GiB)": 77.4,
"step": 370,
"token_acc": 0.8353353726777587,
"train_speed(iter/s)": 0.062035
},
{
"epoch": 0.5630630630630631,
"grad_norm": 7.371038913726807,
"learning_rate": 4.380238785619003e-06,
"loss": 0.4977581024169922,
"memory(GiB)": 77.4,
"step": 375,
"token_acc": 0.8365855496119943,
"train_speed(iter/s)": 0.062052
},
{
"epoch": 0.5705705705705706,
"grad_norm": 3.048470973968506,
"learning_rate": 4.257129397251453e-06,
"loss": 0.4944156646728516,
"memory(GiB)": 77.4,
"step": 380,
"token_acc": 0.8276846772375441,
"train_speed(iter/s)": 0.062064
},
{
"epoch": 0.5780780780780781,
"grad_norm": 3.883787155151367,
"learning_rate": 4.13447888601368e-06,
"loss": 0.49016704559326174,
"memory(GiB)": 77.4,
"step": 385,
"token_acc": 0.8302655401327701,
"train_speed(iter/s)": 0.062079
},
{
"epoch": 0.5855855855855856,
"grad_norm": 6.71173095703125,
"learning_rate": 4.012363014110237e-06,
"loss": 0.49213333129882814,
"memory(GiB)": 77.4,
"step": 390,
"token_acc": 0.8332108743570904,
"train_speed(iter/s)": 0.062097
},
{
"epoch": 0.5930930930930931,
"grad_norm": 6.448680400848389,
"learning_rate": 3.890857213494673e-06,
"loss": 0.493864631652832,
"memory(GiB)": 77.4,
"step": 395,
"token_acc": 0.829066045970323,
"train_speed(iter/s)": 0.062126
},
{
"epoch": 0.6006006006006006,
"grad_norm": 4.552060604095459,
"learning_rate": 3.7700365392746106e-06,
"loss": 0.48853540420532227,
"memory(GiB)": 77.4,
"step": 400,
"token_acc": 0.8339560603796145,
"train_speed(iter/s)": 0.062161
},
{
"epoch": 0.6006006006006006,
"eval_loss": 3.8978097438812256,
"eval_runtime": 48.9784,
"eval_samples_per_second": 17.559,
"eval_steps_per_second": 2.205,
"eval_token_acc": 0.8327343937884357,
"step": 400
},
{
"epoch": 0.6081081081081081,
"grad_norm": 2.544924736022949,
"learning_rate": 3.649975623349599e-06,
"loss": 0.49090261459350587,
"memory(GiB)": 77.56,
"step": 405,
"token_acc": 0.8355947535052013,
"train_speed(iter/s)": 0.061731
},
{
"epoch": 0.6156156156156156,
"grad_norm": 6.126231670379639,
"learning_rate": 3.5307486283103966e-06,
"loss": 0.49183125495910646,
"memory(GiB)": 77.56,
"step": 410,
"token_acc": 0.831595529217613,
"train_speed(iter/s)": 0.06175
},
{
"epoch": 0.6231231231231231,
"grad_norm": 2.7342617511749268,
"learning_rate": 3.412429201628142e-06,
"loss": 0.4908740520477295,
"memory(GiB)": 77.56,
"step": 415,
"token_acc": 0.8315980081484835,
"train_speed(iter/s)": 0.061767
},
{
"epoch": 0.6306306306306306,
"grad_norm": 2.9041008949279785,
"learning_rate": 3.29509043016172e-06,
"loss": 0.4888955593109131,
"memory(GiB)": 77.56,
"step": 420,
"token_acc": 0.8382599420996496,
"train_speed(iter/s)": 0.061791
},
{
"epoch": 0.6381381381381381,
"grad_norm": 2.52190899848938,
"learning_rate": 3.1788047950114244e-06,
"loss": 0.48609514236450196,
"memory(GiB)": 77.56,
"step": 425,
"token_acc": 0.8281978055012776,
"train_speed(iter/s)": 0.061822
},
{
"epoch": 0.6456456456456456,
"grad_norm": 2.1861705780029297,
"learning_rate": 3.0636441267467955e-06,
"loss": 0.48923444747924805,
"memory(GiB)": 77.56,
"step": 430,
"token_acc": 0.8383622335956,
"train_speed(iter/s)": 0.061843
},
{
"epoch": 0.6531531531531531,
"grad_norm": 4.289586544036865,
"learning_rate": 2.9496795610363087e-06,
"loss": 0.4919395923614502,
"memory(GiB)": 77.56,
"step": 435,
"token_acc": 0.8347007903650734,
"train_speed(iter/s)": 0.061873
},
{
"epoch": 0.6606606606606606,
"grad_norm": 2.4423766136169434,
"learning_rate": 2.8369814947062994e-06,
"loss": 0.48633642196655275,
"memory(GiB)": 77.56,
"step": 440,
"token_acc": 0.8311563010241948,
"train_speed(iter/s)": 0.06188
},
{
"epoch": 0.6681681681681682,
"grad_norm": 2.4229958057403564,
"learning_rate": 2.7256195422562687e-06,
"loss": 0.48715896606445314,
"memory(GiB)": 77.56,
"step": 445,
"token_acc": 0.8328196647372773,
"train_speed(iter/s)": 0.06191
},
{
"epoch": 0.6756756756756757,
"grad_norm": 2.8222622871398926,
"learning_rate": 2.615662492857471e-06,
"loss": 0.4761828422546387,
"memory(GiB)": 77.56,
"step": 450,
"token_acc": 0.8371917082765925,
"train_speed(iter/s)": 0.061929
},
{
"epoch": 0.6831831831831832,
"grad_norm": 2.2745773792266846,
"learning_rate": 2.5071782678612635e-06,
"loss": 0.47379336357116697,
"memory(GiB)": 77.56,
"step": 455,
"token_acc": 0.8373471378197914,
"train_speed(iter/s)": 0.061944
},
{
"epoch": 0.6906906906906907,
"grad_norm": 3.0483222007751465,
"learning_rate": 2.4002338788435654e-06,
"loss": 0.4889671325683594,
"memory(GiB)": 77.56,
"step": 460,
"token_acc": 0.8362183754993342,
"train_speed(iter/s)": 0.061958
},
{
"epoch": 0.6981981981981982,
"grad_norm": 4.672157287597656,
"learning_rate": 2.2948953862112596e-06,
"loss": 0.4797488212585449,
"memory(GiB)": 77.56,
"step": 465,
"token_acc": 0.8255822661574178,
"train_speed(iter/s)": 0.061982
},
{
"epoch": 0.7057057057057057,
"grad_norm": 3.9066431522369385,
"learning_rate": 2.1912278583961454e-06,
"loss": 0.47504510879516604,
"memory(GiB)": 77.56,
"step": 470,
"token_acc": 0.8383151135234309,
"train_speed(iter/s)": 0.062009
},
{
"epoch": 0.7132132132132132,
"grad_norm": 3.3316314220428467,
"learning_rate": 2.0892953316616616e-06,
"loss": 0.4852120399475098,
"memory(GiB)": 77.56,
"step": 475,
"token_acc": 0.8355624907118443,
"train_speed(iter/s)": 0.062037
},
{
"epoch": 0.7207207207207207,
"grad_norm": 3.6110689640045166,
"learning_rate": 1.989160770547159e-06,
"loss": 0.48483953475952146,
"memory(GiB)": 77.56,
"step": 480,
"token_acc": 0.8303629882883539,
"train_speed(iter/s)": 0.06206
},
{
"epoch": 0.7282282282282282,
"grad_norm": 3.1614766120910645,
"learning_rate": 1.8908860289741981e-06,
"loss": 0.48744707107543944,
"memory(GiB)": 77.56,
"step": 485,
"token_acc": 0.8345915240301965,
"train_speed(iter/s)": 0.062073
},
{
"epoch": 0.7357357357357357,
"grad_norm": 2.3462319374084473,
"learning_rate": 1.794531812038901e-06,
"loss": 0.4857791423797607,
"memory(GiB)": 77.56,
"step": 490,
"token_acc": 0.8333702146492586,
"train_speed(iter/s)": 0.062097
},
{
"epoch": 0.7432432432432432,
"grad_norm": 3.077697992324829,
"learning_rate": 1.7001576385139062e-06,
"loss": 0.48043317794799806,
"memory(GiB)": 77.56,
"step": 495,
"token_acc": 0.8398653702318624,
"train_speed(iter/s)": 0.062109
},
{
"epoch": 0.7507507507507507,
"grad_norm": 4.029548168182373,
"learning_rate": 1.6078218040831678e-06,
"loss": 0.48291807174682616,
"memory(GiB)": 77.56,
"step": 500,
"token_acc": 0.830759284534157,
"train_speed(iter/s)": 0.062136
},
{
"epoch": 0.7507507507507507,
"eval_loss": 3.8144214153289795,
"eval_runtime": 49.1053,
"eval_samples_per_second": 17.513,
"eval_steps_per_second": 2.199,
"eval_token_acc": 0.835421032741265,
"step": 500
},
{
"epoch": 0.7582582582582582,
"grad_norm": 3.489283561706543,
"learning_rate": 1.5175813453322252e-06,
"loss": 0.48162593841552737,
"memory(GiB)": 77.56,
"step": 505,
"token_acc": 0.8355083683550837,
"train_speed(iter/s)": 0.061761
},
{
"epoch": 0.7657657657657657,
"grad_norm": 2.7831027507781982,
"learning_rate": 1.4294920045162514e-06,
"loss": 0.47315006256103515,
"memory(GiB)": 77.56,
"step": 510,
"token_acc": 0.8345864661654135,
"train_speed(iter/s)": 0.061779
},
{
"epoch": 0.7732732732732732,
"grad_norm": 3.9989585876464844,
"learning_rate": 1.3436081951276247e-06,
"loss": 0.47411699295043946,
"memory(GiB)": 77.56,
"step": 515,
"token_acc": 0.8337380482622553,
"train_speed(iter/s)": 0.061792
},
{
"epoch": 0.7807807807807807,
"grad_norm": 2.496121883392334,
"learning_rate": 1.2599829682842618e-06,
"loss": 0.47501659393310547,
"memory(GiB)": 77.56,
"step": 520,
"token_acc": 0.8372924268374413,
"train_speed(iter/s)": 0.0618
},
{
"epoch": 0.7882882882882883,
"grad_norm": 2.376746416091919,
"learning_rate": 1.1786679799595308e-06,
"loss": 0.47827887535095215,
"memory(GiB)": 77.56,
"step": 525,
"token_acc": 0.8420151808304808,
"train_speed(iter/s)": 0.061823
},
{
"epoch": 0.7957957957957958,
"grad_norm": 6.058043956756592,
"learning_rate": 1.09971345907394e-06,
"loss": 0.48307170867919924,
"memory(GiB)": 77.56,
"step": 530,
"token_acc": 0.8403267731154845,
"train_speed(iter/s)": 0.06183
},
{
"epoch": 0.8033033033033034,
"grad_norm": 3.4789669513702393,
"learning_rate": 1.0231681764683188e-06,
"loss": 0.47766728401184083,
"memory(GiB)": 77.56,
"step": 535,
"token_acc": 0.8329650905877154,
"train_speed(iter/s)": 0.061854
},
{
"epoch": 0.8108108108108109,
"grad_norm": 2.2162930965423584,
"learning_rate": 9.490794147776927e-07,
"loss": 0.4749112606048584,
"memory(GiB)": 77.56,
"step": 540,
"token_acc": 0.8364345738295318,
"train_speed(iter/s)": 0.061873
},
{
"epoch": 0.8183183183183184,
"grad_norm": 2.1486546993255615,
"learning_rate": 8.774929392244158e-07,
"loss": 0.48050594329833984,
"memory(GiB)": 77.56,
"step": 545,
"token_acc": 0.8377268314539617,
"train_speed(iter/s)": 0.061873
},
{
"epoch": 0.8258258258258259,
"grad_norm": 20.101654052734375,
"learning_rate": 8.084529693486171e-07,
"loss": 0.4786341667175293,
"memory(GiB)": 77.56,
"step": 550,
"token_acc": 0.8336489659874252,
"train_speed(iter/s)": 0.061888
},
{
"epoch": 0.8333333333333334,
"grad_norm": 2.2926764488220215,
"learning_rate": 7.420021516934539e-07,
"loss": 0.4800395488739014,
"memory(GiB)": 77.56,
"step": 555,
"token_acc": 0.8331179198661902,
"train_speed(iter/s)": 0.0619
},
{
"epoch": 0.8408408408408409,
"grad_norm": 3.684734582901001,
"learning_rate": 6.781815334619812e-07,
"loss": 0.471435022354126,
"memory(GiB)": 77.56,
"step": 560,
"token_acc": 0.8375401560348784,
"train_speed(iter/s)": 0.061915
},
{
"epoch": 0.8483483483483484,
"grad_norm": 1.881805419921875,
"learning_rate": 6.170305371619773e-07,
"loss": 0.4786642074584961,
"memory(GiB)": 77.56,
"step": 565,
"token_acc": 0.8357564743298501,
"train_speed(iter/s)": 0.061924
},
{
"epoch": 0.8558558558558559,
"grad_norm": 2.26941180229187,
"learning_rate": 5.585869362543416e-07,
"loss": 0.47544078826904296,
"memory(GiB)": 77.56,
"step": 570,
"token_acc": 0.8454873092554251,
"train_speed(iter/s)": 0.061943
},
{
"epoch": 0.8633633633633634,
"grad_norm": 2.576007604598999,
"learning_rate": 5.028868318201191e-07,
"loss": 0.47312221527099607,
"memory(GiB)": 77.56,
"step": 575,
"token_acc": 0.8362756052141527,
"train_speed(iter/s)": 0.061967
},
{
"epoch": 0.8708708708708709,
"grad_norm": 2.3246395587921143,
"learning_rate": 4.4996463026058476e-07,
"loss": 0.479257869720459,
"memory(GiB)": 77.56,
"step": 580,
"token_acc": 0.8453853838469223,
"train_speed(iter/s)": 0.061969
},
{
"epoch": 0.8783783783783784,
"grad_norm": 2.4527480602264404,
"learning_rate": 3.9985302204412266e-07,
"loss": 0.47733464241027834,
"memory(GiB)": 77.56,
"step": 585,
"token_acc": 0.8370748040483982,
"train_speed(iter/s)": 0.061978
},
{
"epoch": 0.8858858858858859,
"grad_norm": 3.797691583633423,
"learning_rate": 3.5258296151306495e-07,
"loss": 0.4716297149658203,
"memory(GiB)": 77.56,
"step": 590,
"token_acc": 0.8305998651988318,
"train_speed(iter/s)": 0.061995
},
{
"epoch": 0.8933933933933934,
"grad_norm": 2.739664316177368,
"learning_rate": 3.081836477629491e-07,
"loss": 0.480192232131958,
"memory(GiB)": 77.56,
"step": 595,
"token_acc": 0.8329091181314568,
"train_speed(iter/s)": 0.062011
},
{
"epoch": 0.9009009009009009,
"grad_norm": 3.405724048614502,
"learning_rate": 2.666825066059986e-07,
"loss": 0.474263858795166,
"memory(GiB)": 77.56,
"step": 600,
"token_acc": 0.8360385967536839,
"train_speed(iter/s)": 0.062028
},
{
"epoch": 0.9009009009009009,
"eval_loss": 3.765347480773926,
"eval_runtime": 49.0338,
"eval_samples_per_second": 17.539,
"eval_steps_per_second": 2.203,
"eval_token_acc": 0.837152574981963,
"step": 600
},
{
"epoch": 0.9084084084084084,
"grad_norm": 2.454803943634033,
"learning_rate": 2.2810517362997997e-07,
"loss": 0.47376174926757814,
"memory(GiB)": 77.56,
"step": 605,
"token_acc": 0.8413309189678587,
"train_speed(iter/s)": 0.061734
},
{
"epoch": 0.9159159159159159,
"grad_norm": 2.480395793914795,
"learning_rate": 1.9247547836289792e-07,
"loss": 0.4828979015350342,
"memory(GiB)": 77.56,
"step": 610,
"token_acc": 0.8343275692818848,
"train_speed(iter/s)": 0.061749
},
{
"epoch": 0.9234234234234234,
"grad_norm": 2.6262176036834717,
"learning_rate": 1.598154295532983e-07,
"loss": 0.48357315063476564,
"memory(GiB)": 77.56,
"step": 615,
"token_acc": 0.8393310437509333,
"train_speed(iter/s)": 0.061749
},
{
"epoch": 0.9309309309309309,
"grad_norm": 3.147251844406128,
"learning_rate": 1.3014520157529244e-07,
"loss": 0.47496805191040037,
"memory(GiB)": 77.56,
"step": 620,
"token_acc": 0.8347965493549125,
"train_speed(iter/s)": 0.061763
},
{
"epoch": 0.9384384384384384,
"grad_norm": 2.492722511291504,
"learning_rate": 1.034831219666832e-07,
"loss": 0.48235254287719725,
"memory(GiB)": 77.56,
"step": 625,
"token_acc": 0.8455045594995855,
"train_speed(iter/s)": 0.061787
},
{
"epoch": 0.9459459459459459,
"grad_norm": 2.683011054992676,
"learning_rate": 7.984566010789673e-08,
"loss": 0.4709568977355957,
"memory(GiB)": 77.56,
"step": 630,
"token_acc": 0.8387436920991188,
"train_speed(iter/s)": 0.061803
},
{
"epoch": 0.9534534534534534,
"grad_norm": 2.4714860916137695,
"learning_rate": 5.9247417048717284e-08,
"loss": 0.4673150062561035,
"memory(GiB)": 77.56,
"step": 635,
"token_acc": 0.8387750506642648,
"train_speed(iter/s)": 0.061826
},
{
"epoch": 0.960960960960961,
"grad_norm": 2.5630085468292236,
"learning_rate": 4.170111648909736e-08,
"loss": 0.48201580047607423,
"memory(GiB)": 77.56,
"step": 640,
"token_acc": 0.8353464246017044,
"train_speed(iter/s)": 0.061839
},
{
"epoch": 0.9684684684684685,
"grad_norm": 2.9935920238494873,
"learning_rate": 2.721759691962922e-08,
"loss": 0.47791686058044436,
"memory(GiB)": 77.56,
"step": 645,
"token_acc": 0.836272040302267,
"train_speed(iter/s)": 0.061862
},
{
"epoch": 0.975975975975976,
"grad_norm": 2.290571451187134,
"learning_rate": 1.580580492652084e-08,
"loss": 0.47302780151367185,
"memory(GiB)": 77.56,
"step": 650,
"token_acc": 0.8399675060926076,
"train_speed(iter/s)": 0.061876
},
{
"epoch": 0.9834834834834835,
"grad_norm": 7.702187538146973,
"learning_rate": 7.472789665218805e-09,
"loss": 0.4697415351867676,
"memory(GiB)": 77.56,
"step": 655,
"token_acc": 0.8315651906519065,
"train_speed(iter/s)": 0.0619
},
{
"epoch": 0.990990990990991,
"grad_norm": 3.5078024864196777,
"learning_rate": 2.223698506088612e-09,
"loss": 0.46914873123168943,
"memory(GiB)": 77.56,
"step": 660,
"token_acc": 0.8398177880666119,
"train_speed(iter/s)": 0.06191
},
{
"epoch": 0.9984984984984985,
"grad_norm": 2.397590160369873,
"learning_rate": 6.177385484029685e-11,
"loss": 0.4747368812561035,
"memory(GiB)": 77.56,
"step": 665,
"token_acc": 0.8291888691533452,
"train_speed(iter/s)": 0.061924
},
{
"epoch": 1.0,
"eval_loss": 3.761183738708496,
"eval_runtime": 50.3959,
"eval_samples_per_second": 17.065,
"eval_steps_per_second": 2.143,
"eval_token_acc": 0.8374686501528842,
"step": 666
}
],
"logging_steps": 5,
"max_steps": 666,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.285336319720161e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}