sunhaonlp's picture
Upload trainer_state.json with huggingface_hub
529db68 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 14.66467958271237,
"eval_steps": 500,
"global_step": 615,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.12,
"grad_norm": 50.82564606274215,
"learning_rate": 4.032258064516129e-07,
"loss": 2.6296,
"sft_loss": 0.2952887358143926,
"step": 5,
"total_loss": 0.3225633257534355,
"value_loss": 0.27274589324370024,
"value_loss_search": 0.8215309411287308,
"value_loss_thought": 1.3604361996985972
},
{
"epoch": 0.24,
"grad_norm": 43.36034486559219,
"learning_rate": 8.064516129032258e-07,
"loss": 2.4575,
"sft_loss": 0.28640273250639436,
"step": 10,
"total_loss": 0.3117025727406144,
"value_loss": 0.25299841817468405,
"value_loss_search": 0.8143498097546399,
"value_loss_thought": 1.2096375316381454
},
{
"epoch": 0.36,
"grad_norm": 36.31265974182755,
"learning_rate": 1.2096774193548388e-06,
"loss": 2.1814,
"sft_loss": 0.24496446046978235,
"step": 15,
"total_loss": 0.27037954148254356,
"value_loss": 0.2541508126072586,
"value_loss_search": 0.8789042549207806,
"value_loss_thought": 1.15430224314332
},
{
"epoch": 0.48,
"grad_norm": 23.885147220945996,
"learning_rate": 1.6129032258064516e-06,
"loss": 1.5974,
"sft_loss": 0.18549970276653766,
"step": 20,
"total_loss": 0.20942154704825952,
"value_loss": 0.239218432828784,
"value_loss_search": 0.8069580754265189,
"value_loss_thought": 1.1067893739789725
},
{
"epoch": 0.6,
"grad_norm": 14.605157217566555,
"learning_rate": 2.0161290322580646e-06,
"loss": 1.1409,
"sft_loss": 0.11383889233693481,
"step": 25,
"total_loss": 0.13655262757092715,
"value_loss": 0.22713735280558467,
"value_loss_search": 0.8525716276839376,
"value_loss_thought": 0.9645271969959139
},
{
"epoch": 0.72,
"grad_norm": 19.32238588959413,
"learning_rate": 2.4193548387096776e-06,
"loss": 0.9755,
"sft_loss": 0.10292870132252574,
"step": 30,
"total_loss": 0.12329704709118232,
"value_loss": 0.20368346022441983,
"value_loss_search": 0.6985976113937795,
"value_loss_thought": 0.9308700620196759
},
{
"epoch": 0.83,
"grad_norm": 11.366176943904033,
"learning_rate": 2.822580645161291e-06,
"loss": 0.8166,
"sft_loss": 0.08234395189210772,
"step": 35,
"total_loss": 0.102398702444043,
"value_loss": 0.2005474975332618,
"value_loss_search": 0.73103982496541,
"value_loss_thought": 0.8733401508070529
},
{
"epoch": 0.95,
"grad_norm": 18.523228091154312,
"learning_rate": 3.225806451612903e-06,
"loss": 0.7801,
"sft_loss": 0.0764334655366838,
"step": 40,
"total_loss": 0.09517455038730986,
"value_loss": 0.18741084402427077,
"value_loss_search": 0.5712150579318405,
"value_loss_thought": 0.9280716905370354
},
{
"epoch": 1.07,
"grad_norm": 13.717307975492233,
"learning_rate": 3.6290322580645166e-06,
"loss": 0.7302,
"sft_loss": 0.06878535831347107,
"step": 45,
"total_loss": 0.09124175217002631,
"value_loss": 0.22456393418833615,
"value_loss_search": 0.7111891292035579,
"value_loss_thought": 1.0853223511949182
},
{
"epoch": 1.19,
"grad_norm": 19.25629488379356,
"learning_rate": 4.032258064516129e-06,
"loss": 0.722,
"sft_loss": 0.07188704321160913,
"step": 50,
"total_loss": 0.09479074196424335,
"value_loss": 0.22903698151931168,
"value_loss_search": 0.9242108400911093,
"value_loss_thought": 0.9080850033089518
},
{
"epoch": 1.31,
"grad_norm": 16.515271675316733,
"learning_rate": 4.435483870967742e-06,
"loss": 0.7237,
"sft_loss": 0.07221840480342508,
"step": 55,
"total_loss": 0.09306601642165332,
"value_loss": 0.20847611278295516,
"value_loss_search": 0.7067324505187571,
"value_loss_thought": 0.9610764627344907
},
{
"epoch": 1.43,
"grad_norm": 13.559730992843452,
"learning_rate": 4.838709677419355e-06,
"loss": 0.6799,
"sft_loss": 0.06844400409609079,
"step": 60,
"total_loss": 0.08777883652364835,
"value_loss": 0.1933483243919909,
"value_loss_search": 0.6386921301018447,
"value_loss_thought": 0.90809445703635
},
{
"epoch": 1.55,
"grad_norm": 7.838247236141987,
"learning_rate": 4.999636929057196e-06,
"loss": 0.6609,
"sft_loss": 0.06545158205553889,
"step": 65,
"total_loss": 0.08412742811487987,
"value_loss": 0.1867584578692913,
"value_loss_search": 0.6354834865778685,
"value_loss_thought": 0.8585841765627265
},
{
"epoch": 1.67,
"grad_norm": 8.945716463803295,
"learning_rate": 4.997418544072742e-06,
"loss": 0.6528,
"sft_loss": 0.06201667059212923,
"step": 70,
"total_loss": 0.07894517340464517,
"value_loss": 0.1692850286606699,
"value_loss_search": 0.49887247155420483,
"value_loss_thought": 0.8554077588021756
},
{
"epoch": 1.79,
"grad_norm": 8.604383259697785,
"learning_rate": 4.993185267783142e-06,
"loss": 0.6129,
"sft_loss": 0.0569519879296422,
"step": 75,
"total_loss": 0.07289455400314182,
"value_loss": 0.1594256573356688,
"value_loss_search": 0.43842247435823084,
"value_loss_thought": 0.8369827844202519
},
{
"epoch": 1.91,
"grad_norm": 9.195694446581905,
"learning_rate": 4.986940515551676e-06,
"loss": 0.6197,
"sft_loss": 0.06148028993047774,
"step": 80,
"total_loss": 0.0772681548143737,
"value_loss": 0.1578786402475089,
"value_loss_search": 0.4949493734864518,
"value_loss_thought": 0.7680797455832362
},
{
"epoch": 2.03,
"grad_norm": 8.949095878160305,
"learning_rate": 4.978689325579491e-06,
"loss": 0.5908,
"sft_loss": 0.06064098924398422,
"step": 85,
"total_loss": 0.07585813322220929,
"value_loss": 0.15217143492773175,
"value_loss_search": 0.47303855791687965,
"value_loss_thought": 0.7443329165223986
},
{
"epoch": 2.15,
"grad_norm": 5.832639914877802,
"learning_rate": 4.968438354840834e-06,
"loss": 0.5485,
"sft_loss": 0.05426874342374503,
"step": 90,
"total_loss": 0.06687840489903465,
"value_loss": 0.1260966133326292,
"value_loss_search": 0.43105568194878285,
"value_loss_thought": 0.5777172191534191
},
{
"epoch": 2.27,
"grad_norm": 7.605084631748654,
"learning_rate": 4.956195873712274e-06,
"loss": 0.5201,
"sft_loss": 0.056156763760373,
"step": 95,
"total_loss": 0.06784167702426203,
"value_loss": 0.1168491319520399,
"value_loss_search": 0.34044701922684906,
"value_loss_thought": 0.5943460309877991
},
{
"epoch": 2.38,
"grad_norm": 5.404770654306174,
"learning_rate": 4.941971759300249e-06,
"loss": 0.5264,
"sft_loss": 0.05927520957775414,
"step": 100,
"total_loss": 0.07008651044452563,
"value_loss": 0.10811300831846893,
"value_loss_search": 0.30770739456638696,
"value_loss_thought": 0.5571966758929193
},
{
"epoch": 2.5,
"grad_norm": 4.381257694207405,
"learning_rate": 4.925777487472318e-06,
"loss": 0.5074,
"sft_loss": 0.05194715983234346,
"step": 105,
"total_loss": 0.061547464787145144,
"value_loss": 0.09600305040366948,
"value_loss_search": 0.29806660558097064,
"value_loss_thought": 0.46995780025608835
},
{
"epoch": 2.62,
"grad_norm": 7.428215594764233,
"learning_rate": 4.907626123598552e-06,
"loss": 0.5066,
"sft_loss": 0.05760289076715708,
"step": 110,
"total_loss": 0.06731230580771808,
"value_loss": 0.09709415114484728,
"value_loss_search": 0.30406282742042096,
"value_loss_thought": 0.4726903848350048
},
{
"epoch": 2.74,
"grad_norm": 3.2888847744957492,
"learning_rate": 4.8875323120105275e-06,
"loss": 0.5131,
"sft_loss": 0.057099767681211236,
"step": 115,
"total_loss": 0.06571883515571245,
"value_loss": 0.0861906715668738,
"value_loss_search": 0.29786290233023466,
"value_loss_thought": 0.3916624684818089
},
{
"epoch": 2.86,
"grad_norm": 4.869835996951379,
"learning_rate": 4.8655122641864335e-06,
"loss": 0.5126,
"sft_loss": 0.056119235185906294,
"step": 120,
"total_loss": 0.06504482553282287,
"value_loss": 0.08925589976133778,
"value_loss_search": 0.25328616083133965,
"value_loss_thought": 0.4607610349543393
},
{
"epoch": 2.98,
"grad_norm": 3.5856824633030433,
"learning_rate": 4.84158374567182e-06,
"loss": 0.4804,
"sft_loss": 0.046175889065489174,
"step": 125,
"total_loss": 0.054164890843094324,
"value_loss": 0.0798900181078352,
"value_loss_search": 0.2606613096548244,
"value_loss_thought": 0.3784588351845741
},
{
"epoch": 3.1,
"grad_norm": 2.393170606173336,
"learning_rate": 4.815766061746538e-06,
"loss": 0.4603,
"sft_loss": 0.05217493660748005,
"step": 130,
"total_loss": 0.05943963085010182,
"value_loss": 0.0726469449698925,
"value_loss_search": 0.21297869782429188,
"value_loss_thought": 0.3681968664750457
},
{
"epoch": 3.22,
"grad_norm": 1.9679581522016634,
"learning_rate": 4.788080041849443e-06,
"loss": 0.4278,
"sft_loss": 0.052470245282165706,
"step": 135,
"total_loss": 0.05776963674434228,
"value_loss": 0.05299391018925235,
"value_loss_search": 0.17309838664950802,
"value_loss_thought": 0.2508528954349458
},
{
"epoch": 3.34,
"grad_norm": 3.1066245128680725,
"learning_rate": 4.7585480227734175e-06,
"loss": 0.434,
"sft_loss": 0.049476837972179055,
"step": 140,
"total_loss": 0.055079690403363205,
"value_loss": 0.056028524460271,
"value_loss_search": 0.18748179799877107,
"value_loss_thought": 0.2607463992317207
},
{
"epoch": 3.46,
"grad_norm": 3.4261087057869335,
"learning_rate": 4.7271938306442855e-06,
"loss": 0.4274,
"sft_loss": 0.04518961615394801,
"step": 145,
"total_loss": 0.05061705713451374,
"value_loss": 0.054274410346988586,
"value_loss_search": 0.15429693737823982,
"value_loss_thought": 0.27989834365434946
},
{
"epoch": 3.58,
"grad_norm": 3.0813091887098563,
"learning_rate": 4.694042761698135e-06,
"loss": 0.438,
"sft_loss": 0.055581874679774046,
"step": 150,
"total_loss": 0.06076982853701338,
"value_loss": 0.051879540382651614,
"value_loss_search": 0.14725855304568541,
"value_loss_thought": 0.26777777075767517
},
{
"epoch": 3.7,
"grad_norm": 2.0023308919779734,
"learning_rate": 4.6591215618725775e-06,
"loss": 0.4328,
"sft_loss": 0.04702084113378078,
"step": 155,
"total_loss": 0.05245317818043986,
"value_loss": 0.054323368283803575,
"value_loss_search": 0.15814204575144686,
"value_loss_thought": 0.27644489823142065
},
{
"epoch": 3.82,
"grad_norm": 1.977551344543693,
"learning_rate": 4.622458405228411e-06,
"loss": 0.4244,
"sft_loss": 0.05086941795889288,
"step": 160,
"total_loss": 0.0561610649638169,
"value_loss": 0.05291647011763416,
"value_loss_search": 0.15665370621718466,
"value_loss_thought": 0.26667805360630153
},
{
"epoch": 3.93,
"grad_norm": 1.9529739672122766,
"learning_rate": 4.5840828712190725e-06,
"loss": 0.4032,
"sft_loss": 0.04604445670265704,
"step": 165,
"total_loss": 0.05026301761099603,
"value_loss": 0.0421856101078447,
"value_loss_search": 0.12487082706647926,
"value_loss_thought": 0.21261405241675674
},
{
"epoch": 4.05,
"grad_norm": 1.5029470328755332,
"learning_rate": 4.54402592082625e-06,
"loss": 0.3912,
"sft_loss": 0.0438391676871106,
"step": 170,
"total_loss": 0.046969617460126754,
"value_loss": 0.031304498872486874,
"value_loss_search": 0.08991801288211718,
"value_loss_thought": 0.1605179784586653
},
{
"epoch": 4.17,
"grad_norm": 1.5958208749346106,
"learning_rate": 4.502319871580879e-06,
"loss": 0.3559,
"sft_loss": 0.0380060717696324,
"step": 175,
"total_loss": 0.040836501114608834,
"value_loss": 0.028304292738903315,
"value_loss_search": 0.08076013361569494,
"value_loss_thought": 0.1456742096459493
},
{
"epoch": 4.29,
"grad_norm": 1.1597766780252663,
"learning_rate": 4.458998371489695e-06,
"loss": 0.3542,
"sft_loss": 0.04091373980045319,
"step": 180,
"total_loss": 0.04352424936296302,
"value_loss": 0.02610509568476118,
"value_loss_search": 0.08407636135234497,
"value_loss_thought": 0.12476440471946262
},
{
"epoch": 4.41,
"grad_norm": 1.1887589876167461,
"learning_rate": 4.414096371888378e-06,
"loss": 0.3573,
"sft_loss": 0.039319761795923114,
"step": 185,
"total_loss": 0.04204170083394274,
"value_loss": 0.027219387918012216,
"value_loss_search": 0.07986954482039436,
"value_loss_thought": 0.13788555894279853
},
{
"epoch": 4.53,
"grad_norm": 1.1787290016300027,
"learning_rate": 4.367650099243167e-06,
"loss": 0.3568,
"sft_loss": 0.04122671156655997,
"step": 190,
"total_loss": 0.04415785642413539,
"value_loss": 0.029311446825158783,
"value_loss_search": 0.07832561889081262,
"value_loss_thought": 0.15616595644678455
},
{
"epoch": 4.65,
"grad_norm": 1.6029965120643992,
"learning_rate": 4.319697025923736e-06,
"loss": 0.3598,
"sft_loss": 0.04112560288049281,
"step": 195,
"total_loss": 0.043696090345474656,
"value_loss": 0.02570487381599378,
"value_loss_search": 0.08383175324415788,
"value_loss_thought": 0.12180723798810504
},
{
"epoch": 4.77,
"grad_norm": 1.4381210449396125,
"learning_rate": 4.270275839970868e-06,
"loss": 0.3515,
"sft_loss": 0.039879024308174846,
"step": 200,
"total_loss": 0.04217200499406317,
"value_loss": 0.022929807018954307,
"value_loss_search": 0.07454460244625807,
"value_loss_thought": 0.10889385400805622
},
{
"epoch": 4.89,
"grad_norm": 1.2614274454432255,
"learning_rate": 4.219426413883348e-06,
"loss": 0.3625,
"sft_loss": 0.04709821604192257,
"step": 205,
"total_loss": 0.0502521290894947,
"value_loss": 0.03153913400019519,
"value_loss_search": 0.08359810820547864,
"value_loss_thought": 0.1687149630743079
},
{
"epoch": 5.01,
"grad_norm": 1.1506878122900264,
"learning_rate": 4.167189772449248e-06,
"loss": 0.3432,
"sft_loss": 0.04020570595748722,
"step": 210,
"total_loss": 0.04252870589734812,
"value_loss": 0.023229998056194745,
"value_loss_search": 0.07497004266479053,
"value_loss_thought": 0.11086994202341885
},
{
"epoch": 5.13,
"grad_norm": 0.9862059900461185,
"learning_rate": 4.113608059647553e-06,
"loss": 0.2994,
"sft_loss": 0.0355574628803879,
"step": 215,
"total_loss": 0.037456610312801786,
"value_loss": 0.018991474900394677,
"value_loss_search": 0.058882090356200935,
"value_loss_thought": 0.09304970969678834
},
{
"epoch": 5.25,
"grad_norm": 0.9601247633138389,
"learning_rate": 4.058724504646834e-06,
"loss": 0.288,
"sft_loss": 0.03504389475565404,
"step": 220,
"total_loss": 0.03674481045381981,
"value_loss": 0.017009155871346594,
"value_loss_search": 0.050329163245623934,
"value_loss_thought": 0.08574408317217604
},
{
"epoch": 5.37,
"grad_norm": 1.3509709560128629,
"learning_rate": 4.00258338692841e-06,
"loss": 0.2933,
"sft_loss": 0.03573095325846225,
"step": 225,
"total_loss": 0.037422025205160026,
"value_loss": 0.016910719190491363,
"value_loss_search": 0.05178567884140648,
"value_loss_thought": 0.08350007485714742
},
{
"epoch": 5.48,
"grad_norm": 0.9625163641024351,
"learning_rate": 3.945230000562121e-06,
"loss": 0.2842,
"sft_loss": 0.03569462758023292,
"step": 230,
"total_loss": 0.03722189712352701,
"value_loss": 0.015272694976010826,
"value_loss_search": 0.04224359960644506,
"value_loss_thought": 0.07993796134978766
},
{
"epoch": 5.6,
"grad_norm": 1.2461981914737617,
"learning_rate": 3.8867106176635405e-06,
"loss": 0.2944,
"sft_loss": 0.034198284102603796,
"step": 235,
"total_loss": 0.03592255775074591,
"value_loss": 0.017242736062326004,
"value_loss_search": 0.052543783394503406,
"value_loss_thought": 0.08539810547372326
},
{
"epoch": 5.72,
"grad_norm": 1.0230143635639728,
"learning_rate": 3.827072451062118e-06,
"loss": 0.3004,
"sft_loss": 0.037929134699516,
"step": 240,
"total_loss": 0.03954664655320812,
"value_loss": 0.016175120149273425,
"value_loss_search": 0.053255315756541674,
"value_loss_thought": 0.076145645219367
},
{
"epoch": 5.84,
"grad_norm": 1.1409535328456384,
"learning_rate": 3.7663636162103583e-06,
"loss": 0.2922,
"sft_loss": 0.034704937925562264,
"step": 245,
"total_loss": 0.03651254886699462,
"value_loss": 0.018076109029061628,
"value_loss_search": 0.05161890290983138,
"value_loss_thought": 0.09298996879952029
},
{
"epoch": 5.96,
"grad_norm": 0.9301923066699931,
"learning_rate": 3.7046330923647733e-06,
"loss": 0.2982,
"sft_loss": 0.03279960451181978,
"step": 250,
"total_loss": 0.03438955684650864,
"value_loss": 0.01589952201466076,
"value_loss_search": 0.04750958961667493,
"value_loss_thought": 0.07968658691388555
},
{
"epoch": 6.08,
"grad_norm": 0.97678387780499,
"learning_rate": 3.6419306830699275e-06,
"loss": 0.2545,
"sft_loss": 0.031023622630164026,
"step": 255,
"total_loss": 0.03264989823801443,
"value_loss": 0.016262755844218192,
"value_loss_search": 0.05106269954121671,
"value_loss_thought": 0.07903934685018613
},
{
"epoch": 6.2,
"grad_norm": 0.8822926753721482,
"learning_rate": 3.578306975977459e-06,
"loss": 0.2328,
"sft_loss": 0.024921502592042087,
"step": 260,
"total_loss": 0.026061263032170247,
"value_loss": 0.011397602935903705,
"value_loss_search": 0.03485325619985815,
"value_loss_thought": 0.05632756747363601
},
{
"epoch": 6.32,
"grad_norm": 0.9577185704540809,
"learning_rate": 3.513813302032485e-06,
"loss": 0.232,
"sft_loss": 0.030204204528126867,
"step": 265,
"total_loss": 0.03123898759331496,
"value_loss": 0.010347830413957126,
"value_loss_search": 0.03173550037899986,
"value_loss_thought": 0.05104714238696033
},
{
"epoch": 6.44,
"grad_norm": 1.0377123471406164,
"learning_rate": 3.448501694060332e-06,
"loss": 0.2229,
"sft_loss": 0.02486703696195036,
"step": 270,
"total_loss": 0.0260224458801531,
"value_loss": 0.011554089462151751,
"value_loss_search": 0.03946099675667938,
"value_loss_thought": 0.05297171908459859
},
{
"epoch": 6.56,
"grad_norm": 0.9525897324775634,
"learning_rate": 3.38242484478699e-06,
"loss": 0.2427,
"sft_loss": 0.028131387650500984,
"step": 275,
"total_loss": 0.02923572239851637,
"value_loss": 0.011043349133251468,
"value_loss_search": 0.03660392903257161,
"value_loss_thought": 0.051742863486288114
},
{
"epoch": 6.68,
"grad_norm": 0.8743581706882085,
"learning_rate": 3.315636064327174e-06,
"loss": 0.2321,
"sft_loss": 0.028024015191476792,
"step": 280,
"total_loss": 0.02930470963474363,
"value_loss": 0.012806941750750412,
"value_loss_search": 0.04088172500487417,
"value_loss_thought": 0.061573809065157546
},
{
"epoch": 6.8,
"grad_norm": 0.8925599129064181,
"learning_rate": 3.248189237174273e-06,
"loss": 0.2333,
"sft_loss": 0.028793911496177316,
"step": 285,
"total_loss": 0.030005147381052664,
"value_loss": 0.01211236050730804,
"value_loss_search": 0.03714065319800284,
"value_loss_thought": 0.05975823083135765
},
{
"epoch": 6.92,
"grad_norm": 0.9363754532616637,
"learning_rate": 3.1801387787269043e-06,
"loss": 0.2354,
"sft_loss": 0.03100782575784251,
"step": 290,
"total_loss": 0.032353814740054075,
"value_loss": 0.013459890354715754,
"value_loss_search": 0.04140264603483956,
"value_loss_thought": 0.0662764773296658
},
{
"epoch": 7.03,
"grad_norm": 0.9366809408523523,
"learning_rate": 3.1115395913871355e-06,
"loss": 0.2125,
"sft_loss": 0.026877322501968594,
"step": 295,
"total_loss": 0.028246402372678857,
"value_loss": 0.013690798578318208,
"value_loss_search": 0.03995930703094928,
"value_loss_thought": 0.06956708127399906
},
{
"epoch": 7.15,
"grad_norm": 1.119490178115649,
"learning_rate": 3.0424470202657953e-06,
"loss": 0.1853,
"sft_loss": 0.02365317102521658,
"step": 300,
"total_loss": 0.02459036890468269,
"value_loss": 0.009371978462149854,
"value_loss_search": 0.028495647068484686,
"value_loss_thought": 0.0464801803114824
},
{
"epoch": 7.27,
"grad_norm": 1.1048820754695519,
"learning_rate": 2.972916808530619e-06,
"loss": 0.1726,
"sft_loss": 0.019329519721213727,
"step": 305,
"total_loss": 0.020249521383448155,
"value_loss": 0.00920001688064076,
"value_loss_search": 0.03524856393923983,
"value_loss_thought": 0.03835157056746539
},
{
"epoch": 7.39,
"grad_norm": 1.009889999878668,
"learning_rate": 2.903005052433234e-06,
"loss": 0.182,
"sft_loss": 0.020777616742998363,
"step": 310,
"total_loss": 0.021737158278847347,
"value_loss": 0.009595414497016463,
"value_loss_search": 0.031585555197671054,
"value_loss_thought": 0.0451777604612289
},
{
"epoch": 7.51,
"grad_norm": 1.2183303716757812,
"learning_rate": 2.832768156051293e-06,
"loss": 0.1824,
"sft_loss": 0.021867655974347144,
"step": 315,
"total_loss": 0.0228094671114377,
"value_loss": 0.009418110789556521,
"value_loss_search": 0.028446125249320175,
"value_loss_thought": 0.04689876097254455
},
{
"epoch": 7.63,
"grad_norm": 0.9682548506977025,
"learning_rate": 2.7622627857822453e-06,
"loss": 0.1794,
"sft_loss": 0.02051793959690258,
"step": 320,
"total_loss": 0.021331546243163756,
"value_loss": 0.008136065979488193,
"value_loss_search": 0.028256760543445125,
"value_loss_thought": 0.03683176791091682
},
{
"epoch": 7.75,
"grad_norm": 1.008229209327857,
"learning_rate": 2.691545824625483e-06,
"loss": 0.1853,
"sft_loss": 0.02085896288044751,
"step": 325,
"total_loss": 0.021939672244116082,
"value_loss": 0.01080709469388239,
"value_loss_search": 0.02821317232446745,
"value_loss_thought": 0.05824358468817081
},
{
"epoch": 7.87,
"grad_norm": 0.9801384650684389,
"learning_rate": 2.620674326289725e-06,
"loss": 0.1792,
"sft_loss": 0.021865303337108344,
"step": 330,
"total_loss": 0.0226608935457989,
"value_loss": 0.007955901384411846,
"value_loss_search": 0.02793408685174654,
"value_loss_thought": 0.03571312382337055
},
{
"epoch": 7.99,
"grad_norm": 0.9957117603261937,
"learning_rate": 2.5497054691626754e-06,
"loss": 0.1838,
"sft_loss": 0.023318158509209753,
"step": 335,
"total_loss": 0.024168919044313952,
"value_loss": 0.008507605516933836,
"value_loss_search": 0.026497727730020414,
"value_loss_thought": 0.04156311670230935
},
{
"epoch": 8.11,
"grad_norm": 0.9806341509152543,
"learning_rate": 2.478696510180105e-06,
"loss": 0.149,
"sft_loss": 0.0160996129445266,
"step": 340,
"total_loss": 0.016797228009818355,
"value_loss": 0.006976150353148114,
"value_loss_search": 0.023582903065835124,
"value_loss_thought": 0.03222629976226017
},
{
"epoch": 8.23,
"grad_norm": 1.0333112441972616,
"learning_rate": 2.4077047386315375e-06,
"loss": 0.146,
"sft_loss": 0.01726848309626803,
"step": 345,
"total_loss": 0.01786828497188253,
"value_loss": 0.005998019542312249,
"value_loss_search": 0.018190549310384087,
"value_loss_thought": 0.029793607082683592
},
{
"epoch": 8.35,
"grad_norm": 1.1009437866195484,
"learning_rate": 2.3367874299398587e-06,
"loss": 0.1382,
"sft_loss": 0.017719452880555764,
"step": 350,
"total_loss": 0.018400812321306147,
"value_loss": 0.006813595366838854,
"value_loss_search": 0.022706839033344296,
"value_loss_thought": 0.031801923821331
},
{
"epoch": 8.46,
"grad_norm": 1.1026584990458597,
"learning_rate": 2.266001799452094e-06,
"loss": 0.1346,
"sft_loss": 0.01671770153916441,
"step": 355,
"total_loss": 0.017422528978204354,
"value_loss": 0.007048274046974256,
"value_loss_search": 0.023480392742203548,
"value_loss_thought": 0.032905799969739746
},
{
"epoch": 8.58,
"grad_norm": 0.9942201314268132,
"learning_rate": 2.195404956278663e-06,
"loss": 0.1399,
"sft_loss": 0.01536920148646459,
"step": 360,
"total_loss": 0.01604004146747684,
"value_loss": 0.0067083996378642045,
"value_loss_search": 0.02136720167036401,
"value_loss_thought": 0.03229999518080149
},
{
"epoch": 8.7,
"grad_norm": 1.0200716562222094,
"learning_rate": 2.125053857218346e-06,
"loss": 0.1367,
"sft_loss": 0.01647848271531984,
"step": 365,
"total_loss": 0.017283534408488777,
"value_loss": 0.008050516255025286,
"value_loss_search": 0.025714086863445118,
"value_loss_thought": 0.038690043007954956
},
{
"epoch": 8.82,
"grad_norm": 0.9746017587073934,
"learning_rate": 2.055005260806125e-06,
"loss": 0.1386,
"sft_loss": 0.016943779814755543,
"step": 370,
"total_loss": 0.017788732309236367,
"value_loss": 0.008449525324977003,
"value_loss_search": 0.027701811920269392,
"value_loss_thought": 0.03989439076831332
},
{
"epoch": 8.94,
"grad_norm": 1.013640616346488,
"learning_rate": 1.9853156815209955e-06,
"loss": 0.1376,
"sft_loss": 0.01664973153383471,
"step": 375,
"total_loss": 0.017237521769857267,
"value_loss": 0.005877902866632212,
"value_loss_search": 0.020167693262919784,
"value_loss_thought": 0.026855529422755355
},
{
"epoch": 9.06,
"grad_norm": 0.8410872065316937,
"learning_rate": 1.9160413441906667e-06,
"loss": 0.1288,
"sft_loss": 0.01438085613772273,
"step": 380,
"total_loss": 0.0149352347310014,
"value_loss": 0.005543786693306174,
"value_loss_search": 0.021038004008005373,
"value_loss_thought": 0.02331228948896751
},
{
"epoch": 9.18,
"grad_norm": 1.1267722685163901,
"learning_rate": 1.8472381386299597e-06,
"loss": 0.1118,
"sft_loss": 0.013145256665302441,
"step": 385,
"total_loss": 0.013753409318451305,
"value_loss": 0.0060815261575044135,
"value_loss_search": 0.018587293970631434,
"value_loss_thought": 0.030064915464026853
},
{
"epoch": 9.3,
"grad_norm": 1.0416216404779595,
"learning_rate": 1.7789615745494842e-06,
"loss": 0.1081,
"sft_loss": 0.011737877724226565,
"step": 390,
"total_loss": 0.012277404680207837,
"value_loss": 0.005395268987922463,
"value_loss_search": 0.014393719820509432,
"value_loss_thought": 0.028768431980279275
},
{
"epoch": 9.42,
"grad_norm": 1.0705185902916405,
"learning_rate": 1.7112667367709784e-06,
"loss": 0.1076,
"sft_loss": 0.011471840902231634,
"step": 395,
"total_loss": 0.012079083354365139,
"value_loss": 0.006072424399462761,
"value_loss_search": 0.01874679435568396,
"value_loss_thought": 0.029832600799272767
},
{
"epoch": 9.54,
"grad_norm": 0.9769935708025238,
"learning_rate": 1.644208240785454e-06,
"loss": 0.1092,
"sft_loss": 0.012640924844890832,
"step": 400,
"total_loss": 0.013220903444744182,
"value_loss": 0.005799786370334914,
"value_loss_search": 0.017371999034367036,
"value_loss_thought": 0.02902629199961666
},
{
"epoch": 9.66,
"grad_norm": 0.8732612511216603,
"learning_rate": 1.5778401886899808e-06,
"loss": 0.1101,
"sft_loss": 0.011417516821529717,
"step": 405,
"total_loss": 0.011904582896204375,
"value_loss": 0.004870661003951682,
"value_loss_search": 0.017166581230412703,
"value_loss_thought": 0.021798706852132456
},
{
"epoch": 9.78,
"grad_norm": 0.9695707300264521,
"learning_rate": 1.5122161255386815e-06,
"loss": 0.1094,
"sft_loss": 0.01428213594481349,
"step": 410,
"total_loss": 0.014828245776698168,
"value_loss": 0.0054610981664154675,
"value_loss_search": 0.0202188741764985,
"value_loss_thought": 0.02346991123922635
},
{
"epoch": 9.9,
"grad_norm": 0.8746087026209299,
"learning_rate": 1.4473889961431342e-06,
"loss": 0.1039,
"sft_loss": 0.014117157692089677,
"step": 415,
"total_loss": 0.01462695918216923,
"value_loss": 0.005098015529256372,
"value_loss_search": 0.01615281879567192,
"value_loss_thought": 0.024631305370348855
},
{
"epoch": 10.01,
"grad_norm": 0.8874682684740957,
"learning_rate": 1.3834111023570557e-06,
"loss": 0.1081,
"sft_loss": 0.012744372192537412,
"step": 420,
"total_loss": 0.01325692782020269,
"value_loss": 0.005125556160055567,
"value_loss_search": 0.019138199927692768,
"value_loss_thought": 0.021866249560844153
},
{
"epoch": 10.13,
"grad_norm": 0.8207501590556279,
"learning_rate": 1.320334060879702e-06,
"loss": 0.0882,
"sft_loss": 0.01002211165614426,
"step": 425,
"total_loss": 0.010463051758233633,
"value_loss": 0.004409400749136694,
"value_loss_search": 0.015294310338504147,
"value_loss_thought": 0.01998089556072955
},
{
"epoch": 10.25,
"grad_norm": 0.9646575581607053,
"learning_rate": 1.258208761612061e-06,
"loss": 0.0881,
"sft_loss": 0.01025077059166506,
"step": 430,
"total_loss": 0.010693130262006889,
"value_loss": 0.004423596604101476,
"value_loss_search": 0.014729893395269755,
"value_loss_thought": 0.02065887938551896
},
{
"epoch": 10.37,
"grad_norm": 0.81964071156398,
"learning_rate": 1.1970853265994008e-06,
"loss": 0.0842,
"sft_loss": 0.010936628474155441,
"step": 435,
"total_loss": 0.011318862752523274,
"value_loss": 0.0038223424373427404,
"value_loss_search": 0.01328829516278347,
"value_loss_thought": 0.01729044433013769
},
{
"epoch": 10.49,
"grad_norm": 0.8384988992049195,
"learning_rate": 1.1370130695933317e-06,
"loss": 0.0843,
"sft_loss": 0.010979409719584509,
"step": 440,
"total_loss": 0.011374782725033583,
"value_loss": 0.003953730190551142,
"value_loss_search": 0.013550824504636693,
"value_loss_thought": 0.018079017029958778
},
{
"epoch": 10.61,
"grad_norm": 0.8335588319297724,
"learning_rate": 1.07804045626598e-06,
"loss": 0.0855,
"sft_loss": 0.009162665629992262,
"step": 445,
"total_loss": 0.009644234287043219,
"value_loss": 0.004815686383517459,
"value_loss_search": 0.014363445951312314,
"value_loss_thought": 0.024162045223056337
},
{
"epoch": 10.73,
"grad_norm": 0.9120796553660501,
"learning_rate": 1.020215065108393e-06,
"loss": 0.0872,
"sft_loss": 0.011613176169339568,
"step": 450,
"total_loss": 0.01204290009372926,
"value_loss": 0.004297239889274352,
"value_loss_search": 0.013511335075600072,
"value_loss_thought": 0.020866583984752652
},
{
"epoch": 10.85,
"grad_norm": 0.9330832254283613,
"learning_rate": 9.635835490446993e-07,
"loss": 0.0867,
"sft_loss": 0.009662232839036732,
"step": 455,
"total_loss": 0.010051346268846828,
"value_loss": 0.00389113405108219,
"value_loss_search": 0.013773129394394346,
"value_loss_thought": 0.017355942977883388
},
{
"epoch": 10.97,
"grad_norm": 0.8158067315783223,
"learning_rate": 9.081915977930242e-07,
"loss": 0.0897,
"sft_loss": 0.010187394253443926,
"step": 460,
"total_loss": 0.010594041376316454,
"value_loss": 0.004066470502220909,
"value_loss_search": 0.013818205476309232,
"value_loss_thought": 0.018713558375020513
},
{
"epoch": 11.09,
"grad_norm": 0.7146761362099704,
"learning_rate": 8.54083901003492e-07,
"loss": 0.077,
"sft_loss": 0.008773002325324342,
"step": 465,
"total_loss": 0.009241018544889812,
"value_loss": 0.004680162535805721,
"value_loss_search": 0.013033845103927888,
"value_loss_thought": 0.02440745545027312
},
{
"epoch": 11.21,
"grad_norm": 0.7385000543408173,
"learning_rate": 8.013041122030857e-07,
"loss": 0.0729,
"sft_loss": 0.008190885756630451,
"step": 470,
"total_loss": 0.008555836805862782,
"value_loss": 0.003649510513059795,
"value_loss_search": 0.012406850125989877,
"value_loss_thought": 0.016789233975578098
},
{
"epoch": 11.33,
"grad_norm": 0.7633334519531002,
"learning_rate": 7.49894813576437e-07,
"loss": 0.0746,
"sft_loss": 0.008580005669500679,
"step": 475,
"total_loss": 0.008921884518167644,
"value_loss": 0.003418788455019239,
"value_loss_search": 0.010919666264089755,
"value_loss_thought": 0.01643064135714667
},
{
"epoch": 11.45,
"grad_norm": 0.9909650038252632,
"learning_rate": 6.998974816109566e-07,
"loss": 0.0715,
"sft_loss": 0.007525585388066247,
"step": 480,
"total_loss": 0.007853847992191732,
"value_loss": 0.0032826256348926107,
"value_loss_search": 0.010486412676982582,
"value_loss_thought": 0.01577459230902605
},
{
"epoch": 11.56,
"grad_norm": 0.8129682389354967,
"learning_rate": 6.513524536340412e-07,
"loss": 0.0712,
"sft_loss": 0.008444644045084715,
"step": 485,
"total_loss": 0.008825799487021868,
"value_loss": 0.0038115545008622575,
"value_loss_search": 0.010597189843247179,
"value_loss_thought": 0.019895246106898412
},
{
"epoch": 11.68,
"grad_norm": 0.8775605329407913,
"learning_rate": 6.04298895269334e-07,
"loss": 0.0703,
"sft_loss": 0.008042437641415745,
"step": 490,
"total_loss": 0.008418237919249805,
"value_loss": 0.003758002670656424,
"value_loss_search": 0.011610074328200427,
"value_loss_thought": 0.018453947085072288
},
{
"epoch": 11.8,
"grad_norm": 0.8428362204023381,
"learning_rate": 5.5877476883831e-07,
"loss": 0.072,
"sft_loss": 0.00909191146492958,
"step": 495,
"total_loss": 0.009458713585991064,
"value_loss": 0.00366802109128912,
"value_loss_search": 0.012498747254721821,
"value_loss_thought": 0.016845421569450993
},
{
"epoch": 11.92,
"grad_norm": 0.778559570796605,
"learning_rate": 5.148168027326672e-07,
"loss": 0.07,
"sft_loss": 0.008272510836832225,
"step": 500,
"total_loss": 0.008611040847790718,
"value_loss": 0.0033853001063107514,
"value_loss_search": 0.01196195939264726,
"value_loss_thought": 0.015120441405451856
},
{
"epoch": 12.04,
"grad_norm": 0.6559608657124244,
"learning_rate": 4.724604617822429e-07,
"loss": 0.069,
"sft_loss": 0.008424899209057913,
"step": 505,
"total_loss": 0.00873491317679509,
"value_loss": 0.003100138959416654,
"value_loss_search": 0.009996878827223554,
"value_loss_thought": 0.014804233009635936
},
{
"epoch": 12.16,
"grad_norm": 0.6496917858411566,
"learning_rate": 4.317399186423574e-07,
"loss": 0.0674,
"sft_loss": 0.008423441136255861,
"step": 510,
"total_loss": 0.008760591896862024,
"value_loss": 0.003371507487099734,
"value_loss_search": 0.009991695114877075,
"value_loss_thought": 0.01698036474554101
},
{
"epoch": 12.28,
"grad_norm": 0.6988165213179325,
"learning_rate": 3.926880262236724e-07,
"loss": 0.0611,
"sft_loss": 0.007937616313574836,
"step": 515,
"total_loss": 0.008260040113054857,
"value_loss": 0.0032242384204437258,
"value_loss_search": 0.010769415664253757,
"value_loss_thought": 0.015024491625081283
},
{
"epoch": 12.4,
"grad_norm": 0.7528021544771036,
"learning_rate": 3.5533629118680443e-07,
"loss": 0.0643,
"sft_loss": 0.008419578592292965,
"step": 520,
"total_loss": 0.008726798217958276,
"value_loss": 0.003072196059838461,
"value_loss_search": 0.010191632094210945,
"value_loss_thought": 0.01438593643833883
},
{
"epoch": 12.52,
"grad_norm": 0.7891775732477346,
"learning_rate": 3.1971484852307694e-07,
"loss": 0.0611,
"sft_loss": 0.00729263544199057,
"step": 525,
"total_loss": 0.0076024448298994685,
"value_loss": 0.0030980941948655525,
"value_loss_search": 0.010019748501508729,
"value_loss_thought": 0.014765005164372269
},
{
"epoch": 12.64,
"grad_norm": 0.6907626742145749,
"learning_rate": 2.8585243724192466e-07,
"loss": 0.0601,
"sft_loss": 0.007823836395982654,
"step": 530,
"total_loss": 0.008138071943994874,
"value_loss": 0.0031423555094079346,
"value_loss_search": 0.010012611195270438,
"value_loss_thought": 0.015126232872717083
},
{
"epoch": 12.76,
"grad_norm": 0.7429135356582753,
"learning_rate": 2.5377637718455887e-07,
"loss": 0.0659,
"sft_loss": 0.00822238313849084,
"step": 535,
"total_loss": 0.008556838914773834,
"value_loss": 0.0033445580851548585,
"value_loss_search": 0.011346992234757635,
"value_loss_thought": 0.01540947253961349
},
{
"epoch": 12.88,
"grad_norm": 0.7774598016373955,
"learning_rate": 2.23512546982603e-07,
"loss": 0.0645,
"sft_loss": 0.0073554387083277105,
"step": 540,
"total_loss": 0.007690619602180959,
"value_loss": 0.0033518082374939693,
"value_loss_search": 0.010812847971101292,
"value_loss_thought": 0.016001617997244467
},
{
"epoch": 13.0,
"grad_norm": 0.9294156718635098,
"learning_rate": 1.9508536317948358e-07,
"loss": 0.0647,
"sft_loss": 0.008502764551667496,
"step": 545,
"total_loss": 0.008804986921677482,
"value_loss": 0.0030222234428947557,
"value_loss_search": 0.010061871179641458,
"value_loss_thought": 0.014115916420996655
},
{
"epoch": 13.11,
"grad_norm": 0.6263794814598377,
"learning_rate": 1.6851776053141505e-07,
"loss": 0.0568,
"sft_loss": 0.006252302648499608,
"step": 550,
"total_loss": 0.006542081882616913,
"value_loss": 0.002897792073781602,
"value_loss_search": 0.008962227043593885,
"value_loss_thought": 0.014220109464076813
},
{
"epoch": 13.23,
"grad_norm": 0.5596990915451534,
"learning_rate": 1.438311735038783e-07,
"loss": 0.056,
"sft_loss": 0.00756752782908734,
"step": 555,
"total_loss": 0.007857268174484488,
"value_loss": 0.0028974040556931867,
"value_loss_search": 0.00920752819874906,
"value_loss_thought": 0.013971704215146019
},
{
"epoch": 13.35,
"grad_norm": 0.6205386318885459,
"learning_rate": 1.2104551897851645e-07,
"loss": 0.0623,
"sft_loss": 0.008255193088552914,
"step": 560,
"total_loss": 0.008591093667018868,
"value_loss": 0.0033590062324947213,
"value_loss_search": 0.010873512069520076,
"value_loss_thought": 0.01599853788575274
},
{
"epoch": 13.47,
"grad_norm": 0.61362058421556,
"learning_rate": 1.0017918018440182e-07,
"loss": 0.0572,
"sft_loss": 0.006953836599132046,
"step": 565,
"total_loss": 0.007285526537361875,
"value_loss": 0.003316899410856422,
"value_loss_search": 0.010180625680368394,
"value_loss_thought": 0.01635456970980158
},
{
"epoch": 13.59,
"grad_norm": 0.672535364687818,
"learning_rate": 8.124899186663816e-08,
"loss": 0.0589,
"sft_loss": 0.007610985788051039,
"step": 570,
"total_loss": 0.007938396774943613,
"value_loss": 0.003274109651101753,
"value_loss_search": 0.009723765701346565,
"value_loss_thought": 0.01646911151619861
},
{
"epoch": 13.71,
"grad_norm": 0.5999988951381204,
"learning_rate": 6.42702267042633e-08,
"loss": 0.0576,
"sft_loss": 0.006899624702055007,
"step": 575,
"total_loss": 0.0071674311737297105,
"value_loss": 0.0026780645912367618,
"value_loss_search": 0.007902318794367601,
"value_loss_thought": 0.013522197940619662
},
{
"epoch": 13.83,
"grad_norm": 0.5907898559307843,
"learning_rate": 4.9256582988409795e-08,
"loss": 0.0592,
"sft_loss": 0.008203773334389552,
"step": 580,
"total_loss": 0.008553845428650675,
"value_loss": 0.003500720789452316,
"value_loss_search": 0.01093327165726805,
"value_loss_thought": 0.017072494645253754
},
{
"epoch": 13.95,
"grad_norm": 0.5970917111715769,
"learning_rate": 3.6220173570667426e-08,
"loss": 0.0618,
"sft_loss": 0.006927466951310635,
"step": 585,
"total_loss": 0.0071999496663920585,
"value_loss": 0.0027248271569987994,
"value_loss_search": 0.009161660186873633,
"value_loss_thought": 0.012636957273934967
},
{
"epoch": 14.07,
"grad_norm": 0.6162881999842252,
"learning_rate": 2.5171516090559955e-08,
"loss": 0.0589,
"sft_loss": 0.007042083889245987,
"step": 590,
"total_loss": 0.007351792247754929,
"value_loss": 0.003097083197644679,
"value_loss_search": 0.009042211448104354,
"value_loss_thought": 0.015734454097400884
},
{
"epoch": 14.19,
"grad_norm": 0.6339769311529202,
"learning_rate": 1.6119524490022796e-08,
"loss": 0.0573,
"sft_loss": 0.006979101125034503,
"step": 595,
"total_loss": 0.007320650927067618,
"value_loss": 0.003415498048707377,
"value_loss_search": 0.00965559176474926,
"value_loss_thought": 0.01766839253687067
},
{
"epoch": 14.31,
"grad_norm": 0.6862289179498278,
"learning_rate": 9.07150182172717e-09,
"loss": 0.0565,
"sft_loss": 0.006115725569543429,
"step": 600,
"total_loss": 0.006396417736686999,
"value_loss": 0.0028069213738490363,
"value_loss_search": 0.0084622835922346,
"value_loss_thought": 0.013993087414564797
},
{
"epoch": 14.43,
"grad_norm": 0.6965073779773722,
"learning_rate": 4.033134357054047e-09,
"loss": 0.0586,
"sft_loss": 0.005957965517882258,
"step": 605,
"total_loss": 0.0062250764058262575,
"value_loss": 0.0026711090009484905,
"value_loss_search": 0.009071938186025363,
"value_loss_thought": 0.012296933901961894
},
{
"epoch": 14.55,
"grad_norm": 0.5697750508884436,
"learning_rate": 1.0084869984686163e-09,
"loss": 0.0568,
"sft_loss": 0.007345160163822584,
"step": 610,
"total_loss": 0.007681814903298801,
"value_loss": 0.003366547104815254,
"value_loss_search": 0.011690776431350969,
"value_loss_thought": 0.015241600351873785
},
{
"epoch": 14.66,
"grad_norm": 0.585078337634092,
"learning_rate": 0.0,
"loss": 0.0602,
"sft_loss": 0.008140892942901701,
"step": 615,
"total_loss": 0.00840054679711102,
"value_loss": 0.002596538521174807,
"value_loss_search": 0.008188147976397886,
"value_loss_thought": 0.012584160236292519
},
{
"epoch": 14.66,
"step": 615,
"total_flos": 0.0,
"train_loss": 0.3256297383851152,
"train_runtime": 44647.5676,
"train_samples_per_second": 7.206,
"train_steps_per_second": 0.014
}
],
"logging_steps": 5,
"max_steps": 615,
"num_input_tokens_seen": 0,
"num_train_epochs": 15,
"save_steps": 350,
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}