Qwen2.5-1.5B-Open-R1-GRPO / trainer_state.json
ibndias's picture
Model save
d011c79 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9987438399845395,
"eval_steps": 100,
"global_step": 646,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 895.9870979309082,
"epoch": 0.015460430959512996,
"grad_norm": 0.0634842514468751,
"kl": 0.00048552751541137696,
"learning_rate": 3.0769230769230774e-06,
"loss": 0.0,
"reward": 0.2128826495842077,
"reward_std": 0.19821357885375618,
"rewards/accuracy_reward": 0.2128826495842077,
"rewards/format_reward": 0.0,
"step": 10
},
{
"completion_length": 813.6550861358643,
"epoch": 0.03092086191902599,
"grad_norm": 0.05144004581802371,
"kl": 0.00542140007019043,
"learning_rate": 6.153846153846155e-06,
"loss": 0.0002,
"reward": 0.3605867292615585,
"reward_std": 0.2062750784214586,
"rewards/accuracy_reward": 0.3605867292615585,
"rewards/format_reward": 0.0,
"step": 20
},
{
"completion_length": 784.7251134872437,
"epoch": 0.04638129287853899,
"grad_norm": 0.058006301230646685,
"kl": 0.008408355712890624,
"learning_rate": 9.230769230769232e-06,
"loss": 0.0003,
"reward": 0.42461733864620327,
"reward_std": 0.21439061006531118,
"rewards/accuracy_reward": 0.42461733864620327,
"rewards/format_reward": 0.0,
"step": 30
},
{
"completion_length": 752.7404174804688,
"epoch": 0.06184172383805198,
"grad_norm": 0.06100438489544683,
"kl": 0.013679313659667968,
"learning_rate": 1.230769230769231e-05,
"loss": 0.0005,
"reward": 0.48864794997498395,
"reward_std": 0.2098829376511276,
"rewards/accuracy_reward": 0.48864794997498395,
"rewards/format_reward": 0.0,
"step": 40
},
{
"completion_length": 703.8393978118896,
"epoch": 0.07730215479756498,
"grad_norm": 0.06182390025666779,
"kl": 0.024402618408203125,
"learning_rate": 1.5384615384615387e-05,
"loss": 0.001,
"reward": 0.5262754996772856,
"reward_std": 0.23188330424018205,
"rewards/accuracy_reward": 0.5262754996772856,
"rewards/format_reward": 0.0,
"step": 50
},
{
"completion_length": 732.7664375305176,
"epoch": 0.09276258575707798,
"grad_norm": 0.0653391395105433,
"kl": 0.03628692626953125,
"learning_rate": 1.8461538461538465e-05,
"loss": 0.0015,
"reward": 0.5107142759487033,
"reward_std": 0.2243131298571825,
"rewards/accuracy_reward": 0.5107142759487033,
"rewards/format_reward": 0.0,
"step": 60
},
{
"completion_length": 718.8909275054932,
"epoch": 0.10822301671659097,
"grad_norm": 0.16972060323025048,
"kl": 0.14796981811523438,
"learning_rate": 1.999634547413886e-05,
"loss": 0.0059,
"reward": 0.5130101933144033,
"reward_std": 0.2463569703977555,
"rewards/accuracy_reward": 0.5130101933144033,
"rewards/format_reward": 0.0,
"step": 70
},
{
"completion_length": 776.1557273864746,
"epoch": 0.12368344767610397,
"grad_norm": 0.7559210983180688,
"kl": 0.1969940185546875,
"learning_rate": 1.9967125291968495e-05,
"loss": 0.0079,
"reward": 0.46645407350733875,
"reward_std": 0.23498255694285036,
"rewards/accuracy_reward": 0.46645407350733875,
"rewards/format_reward": 0.0,
"step": 80
},
{
"completion_length": 709.4320009231567,
"epoch": 0.13914387863561697,
"grad_norm": 0.1603065487133038,
"kl": 0.27840576171875,
"learning_rate": 1.990877034074683e-05,
"loss": 0.0111,
"reward": 0.4230867256294005,
"reward_std": 0.2592574997805059,
"rewards/accuracy_reward": 0.4230867256294005,
"rewards/format_reward": 0.0,
"step": 90
},
{
"completion_length": 724.9899085998535,
"epoch": 0.15460430959512997,
"grad_norm": 0.14050811846040184,
"kl": 0.38084716796875,
"learning_rate": 1.9821451197042028e-05,
"loss": 0.0152,
"reward": 0.3531887684832327,
"reward_std": 0.24415079602040352,
"rewards/accuracy_reward": 0.3531887684832327,
"rewards/format_reward": 0.0,
"step": 100
},
{
"epoch": 0.15460430959512997,
"eval_completion_length": 714.497265625,
"eval_kl": 0.21029296875,
"eval_loss": 0.008998678997159004,
"eval_reward": 0.41000001683831216,
"eval_reward_std": 0.25652114123106,
"eval_rewards/accuracy_reward": 0.41000001683831216,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 117.6798,
"eval_samples_per_second": 0.841,
"eval_steps_per_second": 0.034,
"step": 100
},
{
"completion_length": 744.1831466674805,
"epoch": 0.17006474055464296,
"grad_norm": 0.10648109851296413,
"kl": 0.22943115234375,
"learning_rate": 1.9705423102261324e-05,
"loss": 0.0092,
"reward": 0.40063774921000006,
"reward_std": 0.22866119714453817,
"rewards/accuracy_reward": 0.40063774921000006,
"rewards/format_reward": 0.0,
"step": 110
},
{
"completion_length": 676.4876100540162,
"epoch": 0.18552517151415596,
"grad_norm": 0.07603341715798907,
"kl": 0.182135009765625,
"learning_rate": 1.956102521655831e-05,
"loss": 0.0073,
"reward": 0.45714284805580974,
"reward_std": 0.24492393187247216,
"rewards/accuracy_reward": 0.45714284805580974,
"rewards/format_reward": 0.0,
"step": 120
},
{
"completion_length": 573.8422079086304,
"epoch": 0.20098560247366895,
"grad_norm": 0.08767607432549998,
"kl": 0.2532989501953125,
"learning_rate": 1.9388679627438486e-05,
"loss": 0.0101,
"reward": 0.450510194664821,
"reward_std": 0.24328936655074357,
"rewards/accuracy_reward": 0.450510194664821,
"rewards/format_reward": 0.0,
"step": 130
},
{
"completion_length": 441.023459815979,
"epoch": 0.21644603343318194,
"grad_norm": 0.09806805953108236,
"kl": 0.26416015625,
"learning_rate": 1.9188890115960967e-05,
"loss": 0.0106,
"reward": 0.47551019601523875,
"reward_std": 0.2608696824405342,
"rewards/accuracy_reward": 0.47551019601523875,
"rewards/format_reward": 0.0,
"step": 140
},
{
"completion_length": 458.37256927490233,
"epoch": 0.23190646439269494,
"grad_norm": 0.1513486117044564,
"kl": 0.3147705078125,
"learning_rate": 1.8962240684142923e-05,
"loss": 0.0126,
"reward": 0.4642857049591839,
"reward_std": 0.2634817813988775,
"rewards/accuracy_reward": 0.4642857049591839,
"rewards/format_reward": 0.0,
"step": 150
},
{
"completion_length": 497.4808575630188,
"epoch": 0.24736689535220793,
"grad_norm": 0.8199204469447977,
"kl": 4.927545166015625,
"learning_rate": 1.8709393847871146e-05,
"loss": 0.1972,
"reward": 0.4331632573157549,
"reward_std": 0.28318220381625,
"rewards/accuracy_reward": 0.4331632573157549,
"rewards/format_reward": 0.0,
"step": 160
},
{
"completion_length": 492.83812828063964,
"epoch": 0.26282732631172095,
"grad_norm": 0.13685058175751932,
"kl": 0.540545654296875,
"learning_rate": 1.8431088700310846e-05,
"loss": 0.0216,
"reward": 0.5280612137168645,
"reward_std": 0.25305321919731794,
"rewards/accuracy_reward": 0.5280612137168645,
"rewards/format_reward": 0.0,
"step": 170
},
{
"completion_length": 749.1317470550537,
"epoch": 0.27828775727123395,
"grad_norm": 0.10491062132812622,
"kl": 0.122552490234375,
"learning_rate": 1.8128138751472432e-05,
"loss": 0.0049,
"reward": 0.434948971029371,
"reward_std": 0.2707877185661346,
"rewards/accuracy_reward": 0.434948971029371,
"rewards/format_reward": 0.0,
"step": 180
},
{
"completion_length": 729.5617179870605,
"epoch": 0.29374818823074694,
"grad_norm": 152.95600809970867,
"kl": 2.2434478759765626,
"learning_rate": 1.780142955025139e-05,
"loss": 0.0897,
"reward": 0.43596938010305164,
"reward_std": 0.2726593071129173,
"rewards/accuracy_reward": 0.43596938010305164,
"rewards/format_reward": 0.0,
"step": 190
},
{
"completion_length": 715.5297044754028,
"epoch": 0.30920861919025994,
"grad_norm": 0.3781996804891015,
"kl": 1.44560546875,
"learning_rate": 1.745191609589231e-05,
"loss": 0.0578,
"reward": 0.42602040050551293,
"reward_std": 0.29439413188956676,
"rewards/accuracy_reward": 0.42602040050551293,
"rewards/format_reward": 0.0,
"step": 200
},
{
"epoch": 0.30920861919025994,
"eval_completion_length": 699.7232763671875,
"eval_kl": 0.60046875,
"eval_loss": 0.023748639971017838,
"eval_reward": 0.44000002443790437,
"eval_reward_std": 0.29031212598085404,
"eval_rewards/accuracy_reward": 0.44000002443790437,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 117.9304,
"eval_samples_per_second": 0.839,
"eval_steps_per_second": 0.034,
"step": 200
},
{
"completion_length": 651.5467977523804,
"epoch": 0.32466905014977293,
"grad_norm": 0.16011847933591739,
"kl": 0.3212890625,
"learning_rate": 1.7080620046443503e-05,
"loss": 0.0128,
"reward": 0.46645407294854524,
"reward_std": 0.2662083923816681,
"rewards/accuracy_reward": 0.46645407294854524,
"rewards/format_reward": 0.0,
"step": 210
},
{
"completion_length": 610.8702682495117,
"epoch": 0.3401294811092859,
"grad_norm": 0.11241315704764059,
"kl": 0.200762939453125,
"learning_rate": 1.6688626732362192e-05,
"loss": 0.008,
"reward": 0.46709182686172424,
"reward_std": 0.2868866888806224,
"rewards/accuracy_reward": 0.46709182686172424,
"rewards/format_reward": 0.0,
"step": 220
},
{
"completion_length": 573.5464164733887,
"epoch": 0.3555899120687989,
"grad_norm": 0.4545690819582109,
"kl": 0.3180419921875,
"learning_rate": 1.6277081983999742e-05,
"loss": 0.0127,
"reward": 0.46811223728582263,
"reward_std": 0.27821872364729644,
"rewards/accuracy_reward": 0.46811223728582263,
"rewards/format_reward": 0.0,
"step": 230
},
{
"completion_length": 591.5071292877197,
"epoch": 0.3710503430283119,
"grad_norm": 3.8443277156368407,
"kl": 0.709783935546875,
"learning_rate": 1.5847188782240473e-05,
"loss": 0.0284,
"reward": 0.4396683592349291,
"reward_std": 0.2964717396069318,
"rewards/accuracy_reward": 0.4396683592349291,
"rewards/format_reward": 0.0,
"step": 240
},
{
"completion_length": 581.6975629806518,
"epoch": 0.3865107739878249,
"grad_norm": 0.3623625896893634,
"kl": 0.77828369140625,
"learning_rate": 1.5400203742084508e-05,
"loss": 0.0311,
"reward": 0.4667091774288565,
"reward_std": 0.29322946835309266,
"rewards/accuracy_reward": 0.4667091774288565,
"rewards/format_reward": 0.0,
"step": 250
},
{
"completion_length": 592.2187366485596,
"epoch": 0.4019712049473379,
"grad_norm": 0.2809541447384242,
"kl": 0.5046875,
"learning_rate": 1.4937433439453465e-05,
"loss": 0.0202,
"reward": 0.46109692989848555,
"reward_std": 0.297008786117658,
"rewards/accuracy_reward": 0.46109692989848555,
"rewards/format_reward": 0.0,
"step": 260
},
{
"completion_length": 551.9140211105347,
"epoch": 0.4174316359068509,
"grad_norm": 0.11113688922654333,
"kl": 0.397216796875,
"learning_rate": 1.4460230591956097e-05,
"loss": 0.0159,
"reward": 0.5364795843139291,
"reward_std": 0.250957741914317,
"rewards/accuracy_reward": 0.5364795843139291,
"rewards/format_reward": 0.0,
"step": 270
},
{
"completion_length": 568.6547088623047,
"epoch": 0.4328920668663639,
"grad_norm": 0.24708378647232437,
"kl": 0.163128662109375,
"learning_rate": 1.3969990104777712e-05,
"loss": 0.0065,
"reward": 0.509438766585663,
"reward_std": 0.25802470711059866,
"rewards/accuracy_reward": 0.509438766585663,
"rewards/format_reward": 0.0,
"step": 280
},
{
"completion_length": 545.0108312606811,
"epoch": 0.4483524978258769,
"grad_norm": 5.507612845198049,
"kl": 1.26832275390625,
"learning_rate": 1.3468144993251735e-05,
"loss": 0.0508,
"reward": 0.45420917579904196,
"reward_std": 0.27056095115840434,
"rewards/accuracy_reward": 0.45420917579904196,
"rewards/format_reward": 0.0,
"step": 290
},
{
"completion_length": 554.2693771362304,
"epoch": 0.4638129287853899,
"grad_norm": 0.6042026046483606,
"kl": 0.679656982421875,
"learning_rate": 1.295616219403197e-05,
"loss": 0.0272,
"reward": 0.4521683591417968,
"reward_std": 0.28633297309279443,
"rewards/accuracy_reward": 0.4521683591417968,
"rewards/format_reward": 0.0,
"step": 300
},
{
"epoch": 0.4638129287853899,
"eval_completion_length": 527.6724047851562,
"eval_kl": 0.7890625,
"eval_loss": 0.030249282717704773,
"eval_reward": 0.4671428832411766,
"eval_reward_std": 0.30915650248527526,
"eval_rewards/accuracy_reward": 0.4671428832411766,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 112.1004,
"eval_samples_per_second": 0.883,
"eval_steps_per_second": 0.036,
"step": 300
},
{
"completion_length": 559.3555992126464,
"epoch": 0.47927335974490287,
"grad_norm": 0.3223266455258349,
"kl": 0.563067626953125,
"learning_rate": 1.2435538277109919e-05,
"loss": 0.0225,
"reward": 0.4325255031813867,
"reward_std": 0.2710647247266024,
"rewards/accuracy_reward": 0.4325255031813867,
"rewards/format_reward": 0.0,
"step": 310
},
{
"completion_length": 567.4769004821777,
"epoch": 0.49473379070441587,
"grad_norm": 0.32191489256962375,
"kl": 0.478643798828125,
"learning_rate": 1.19077950712113e-05,
"loss": 0.0191,
"reward": 0.5024234588257969,
"reward_std": 0.2630208498798311,
"rewards/accuracy_reward": 0.5024234588257969,
"rewards/format_reward": 0.0,
"step": 320
},
{
"completion_length": 593.5192455291748,
"epoch": 0.5101942216639289,
"grad_norm": 0.3534014177091543,
"kl": 0.58690185546875,
"learning_rate": 1.137447521535908e-05,
"loss": 0.0235,
"reward": 0.49515305291861295,
"reward_std": 0.2701924462337047,
"rewards/accuracy_reward": 0.49515305291861295,
"rewards/format_reward": 0.0,
"step": 330
},
{
"completion_length": 580.6474365234375,
"epoch": 0.5256546526234419,
"grad_norm": 0.5793452639869299,
"kl": 0.567840576171875,
"learning_rate": 1.0837137649606241e-05,
"loss": 0.0227,
"reward": 0.48864794997498395,
"reward_std": 0.25743518364615736,
"rewards/accuracy_reward": 0.48864794997498395,
"rewards/format_reward": 0.0,
"step": 340
},
{
"completion_length": 568.6904249191284,
"epoch": 0.5411150835829549,
"grad_norm": 0.18695646465398727,
"kl": 0.605303955078125,
"learning_rate": 1.0297353058119209e-05,
"loss": 0.0242,
"reward": 0.4730867262929678,
"reward_std": 0.2723493260331452,
"rewards/accuracy_reward": 0.4730867262929678,
"rewards/format_reward": 0.0,
"step": 350
},
{
"completion_length": 544.8482028961182,
"epoch": 0.5565755145424679,
"grad_norm": 0.40854460643950763,
"kl": 0.607025146484375,
"learning_rate": 9.756699277932196e-06,
"loss": 0.0243,
"reward": 0.4915816240012646,
"reward_std": 0.2733304013963789,
"rewards/accuracy_reward": 0.4915816240012646,
"rewards/format_reward": 0.0,
"step": 360
},
{
"completion_length": 527.0141460418702,
"epoch": 0.5720359455019809,
"grad_norm": 0.5971511752861693,
"kl": 0.598443603515625,
"learning_rate": 9.216756686793163e-06,
"loss": 0.0239,
"reward": 0.5142857053317129,
"reward_std": 0.25178592149168255,
"rewards/accuracy_reward": 0.5142857053317129,
"rewards/format_reward": 0.0,
"step": 370
},
{
"completion_length": 544.3457790374756,
"epoch": 0.5874963764614939,
"grad_norm": 0.26757335867195303,
"kl": 0.774627685546875,
"learning_rate": 8.67910358358298e-06,
"loss": 0.031,
"reward": 0.49706631591543554,
"reward_std": 0.2742634845431894,
"rewards/accuracy_reward": 0.49706631591543554,
"rewards/format_reward": 0.0,
"step": 380
},
{
"completion_length": 549.8956525802612,
"epoch": 0.6029568074210069,
"grad_norm": 1.6833932480130032,
"kl": 0.77420654296875,
"learning_rate": 8.145311574811325e-06,
"loss": 0.031,
"reward": 0.48048468669876454,
"reward_std": 0.26254036352038385,
"rewards/accuracy_reward": 0.48048468669876454,
"rewards/format_reward": 0.0,
"step": 390
},
{
"completion_length": 526.9630001068115,
"epoch": 0.6184172383805199,
"grad_norm": 1.314728368359331,
"kl": 0.8248779296875,
"learning_rate": 7.616940980675004e-06,
"loss": 0.033,
"reward": 0.4784438674338162,
"reward_std": 0.28052179743535816,
"rewards/accuracy_reward": 0.4784438674338162,
"rewards/format_reward": 0.0,
"step": 400
},
{
"epoch": 0.6184172383805199,
"eval_completion_length": 513.1444018554688,
"eval_kl": 1.018828125,
"eval_loss": 0.04034169018268585,
"eval_reward": 0.4842857384681702,
"eval_reward_std": 0.31372432827949526,
"eval_rewards/accuracy_reward": 0.4842857384681702,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 112.3631,
"eval_samples_per_second": 0.881,
"eval_steps_per_second": 0.036,
"step": 400
},
{
"completion_length": 500.106876373291,
"epoch": 0.6338776693400329,
"grad_norm": 0.3314015554378421,
"kl": 0.7260986328125,
"learning_rate": 7.095536274107046e-06,
"loss": 0.029,
"reward": 0.5030612162780017,
"reward_std": 0.2555597382131964,
"rewards/accuracy_reward": 0.5030612162780017,
"rewards/format_reward": 0.0,
"step": 410
},
{
"completion_length": 572.6313653945923,
"epoch": 0.6493381002995459,
"grad_norm": 1.83967459416783,
"kl": 0.6177490234375,
"learning_rate": 6.58262156614881e-06,
"loss": 0.0247,
"reward": 0.4937499940395355,
"reward_std": 0.2641662787180394,
"rewards/accuracy_reward": 0.4937499940395355,
"rewards/format_reward": 0.0,
"step": 420
},
{
"completion_length": 558.3020294189453,
"epoch": 0.6647985312590589,
"grad_norm": 0.2328326516837162,
"kl": 0.5453857421875,
"learning_rate": 6.079696150841634e-06,
"loss": 0.0218,
"reward": 0.4604591763578355,
"reward_std": 0.25486370851285756,
"rewards/accuracy_reward": 0.4604591763578355,
"rewards/format_reward": 0.0,
"step": 430
},
{
"completion_length": 495.181622505188,
"epoch": 0.6802589622185718,
"grad_norm": 0.45411943739355637,
"kl": 0.489031982421875,
"learning_rate": 5.588230122660672e-06,
"loss": 0.0196,
"reward": 0.501403052546084,
"reward_std": 0.2606009878218174,
"rewards/accuracy_reward": 0.501403052546084,
"rewards/format_reward": 0.0,
"step": 440
},
{
"completion_length": 521.8503698348999,
"epoch": 0.6957193931780848,
"grad_norm": 0.6920580933707077,
"kl": 0.69771728515625,
"learning_rate": 5.109660079301668e-06,
"loss": 0.0279,
"reward": 0.48520407294854523,
"reward_std": 0.2722197940573096,
"rewards/accuracy_reward": 0.48520407294854523,
"rewards/format_reward": 0.0,
"step": 450
},
{
"completion_length": 516.6503719329834,
"epoch": 0.7111798241375978,
"grad_norm": 0.7655139656662532,
"kl": 0.59906005859375,
"learning_rate": 4.64538492238166e-06,
"loss": 0.024,
"reward": 0.4839285622350872,
"reward_std": 0.2701169220265001,
"rewards/accuracy_reward": 0.4839285622350872,
"rewards/format_reward": 0.0,
"step": 460
},
{
"completion_length": 532.4928466796875,
"epoch": 0.7266402550971108,
"grad_norm": 0.42334345776819876,
"kl": 0.660205078125,
"learning_rate": 4.196761768328599e-06,
"loss": 0.0264,
"reward": 0.4859693797305226,
"reward_std": 0.26866118256002663,
"rewards/accuracy_reward": 0.4859693797305226,
"rewards/format_reward": 0.0,
"step": 470
},
{
"completion_length": 534.1709095001221,
"epoch": 0.7421006860566238,
"grad_norm": 0.48787507323058743,
"kl": 0.65863037109375,
"learning_rate": 3.7651019814126656e-06,
"loss": 0.0263,
"reward": 0.4844387672841549,
"reward_std": 0.27398421289399266,
"rewards/accuracy_reward": 0.4844387672841549,
"rewards/format_reward": 0.0,
"step": 480
},
{
"completion_length": 534.6030525207519,
"epoch": 0.7575611170161368,
"grad_norm": 0.5465188749045764,
"kl": 0.7282470703125,
"learning_rate": 3.3516673405151546e-06,
"loss": 0.0291,
"reward": 0.49260203279554843,
"reward_std": 0.27119873408228157,
"rewards/accuracy_reward": 0.49260203279554843,
"rewards/format_reward": 0.0,
"step": 490
},
{
"completion_length": 531.7174621582031,
"epoch": 0.7730215479756498,
"grad_norm": 1.4984058691914603,
"kl": 0.571331787109375,
"learning_rate": 2.957666350839663e-06,
"loss": 0.0229,
"reward": 0.4956632579676807,
"reward_std": 0.25490435254760085,
"rewards/accuracy_reward": 0.4956632579676807,
"rewards/format_reward": 0.0,
"step": 500
},
{
"epoch": 0.7730215479756498,
"eval_completion_length": 518.6042163085938,
"eval_kl": 0.613046875,
"eval_loss": 0.024254189804196358,
"eval_reward": 0.5128571724891663,
"eval_reward_std": 0.2827815467119217,
"eval_rewards/accuracy_reward": 0.5128571724891663,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 112.0796,
"eval_samples_per_second": 0.883,
"eval_steps_per_second": 0.036,
"step": 500
},
{
"completion_length": 565.8936117172241,
"epoch": 0.7884819789351628,
"grad_norm": 1.3357020339930779,
"kl": 0.752362060546875,
"learning_rate": 2.5842507113469307e-06,
"loss": 0.0301,
"reward": 0.48163264396134764,
"reward_std": 0.2701051170937717,
"rewards/accuracy_reward": 0.48163264396134764,
"rewards/format_reward": 0.0,
"step": 510
},
{
"completion_length": 549.8587898254394,
"epoch": 0.8039424098946758,
"grad_norm": 0.27711780573814004,
"kl": 0.6265380859375,
"learning_rate": 2.2325119482391466e-06,
"loss": 0.0251,
"reward": 0.48915815316140654,
"reward_std": 0.2548467483371496,
"rewards/accuracy_reward": 0.48915815316140654,
"rewards/format_reward": 0.0,
"step": 520
},
{
"completion_length": 531.7180992126465,
"epoch": 0.8194028408541888,
"grad_norm": 0.5958984702375305,
"kl": 0.599822998046875,
"learning_rate": 1.9034782243345074e-06,
"loss": 0.024,
"reward": 0.513903050404042,
"reward_std": 0.2422366608865559,
"rewards/accuracy_reward": 0.513903050404042,
"rewards/format_reward": 0.0,
"step": 530
},
{
"completion_length": 536.6831537246704,
"epoch": 0.8348632718137018,
"grad_norm": 0.9836698663565713,
"kl": 0.66021728515625,
"learning_rate": 1.5981113336584041e-06,
"loss": 0.0264,
"reward": 0.5011479528620839,
"reward_std": 0.2621664395555854,
"rewards/accuracy_reward": 0.5011479528620839,
"rewards/format_reward": 0.0,
"step": 540
},
{
"completion_length": 530.7587900161743,
"epoch": 0.8503237027732148,
"grad_norm": 0.644004276722614,
"kl": 0.666015625,
"learning_rate": 1.3173038900362977e-06,
"loss": 0.0266,
"reward": 0.5116071328520775,
"reward_std": 0.264158633723855,
"rewards/accuracy_reward": 0.5116071328520775,
"rewards/format_reward": 0.0,
"step": 550
},
{
"completion_length": 537.9271587371826,
"epoch": 0.8657841337327278,
"grad_norm": 0.708668814006566,
"kl": 0.61654052734375,
"learning_rate": 1.0618767179063416e-06,
"loss": 0.0246,
"reward": 0.49528060380835087,
"reward_std": 0.25958121265284717,
"rewards/accuracy_reward": 0.49528060380835087,
"rewards/format_reward": 0.0,
"step": 560
},
{
"completion_length": 550.545781326294,
"epoch": 0.8812445646922408,
"grad_norm": 0.7500579172257139,
"kl": 0.71644287109375,
"learning_rate": 8.325764529785851e-07,
"loss": 0.0287,
"reward": 0.4839285627938807,
"reward_std": 0.25909215547144415,
"rewards/accuracy_reward": 0.4839285627938807,
"rewards/format_reward": 0.0,
"step": 570
},
{
"completion_length": 546.6188653945923,
"epoch": 0.8967049956517538,
"grad_norm": 0.37217374017534643,
"kl": 0.65179443359375,
"learning_rate": 6.300733597542086e-07,
"loss": 0.0261,
"reward": 0.49107142109423874,
"reward_std": 0.2726855373941362,
"rewards/accuracy_reward": 0.49107142109423874,
"rewards/format_reward": 0.0,
"step": 580
},
{
"completion_length": 548.421669960022,
"epoch": 0.9121654266112668,
"grad_norm": 0.2522060016676339,
"kl": 0.7209228515625,
"learning_rate": 4.549593722844492e-07,
"loss": 0.0288,
"reward": 0.48686223682016133,
"reward_std": 0.2682306385599077,
"rewards/accuracy_reward": 0.48686223682016133,
"rewards/format_reward": 0.0,
"step": 590
},
{
"completion_length": 536.7123596191407,
"epoch": 0.9276258575707798,
"grad_norm": 0.506532147702234,
"kl": 0.7099609375,
"learning_rate": 3.0774636389618196e-07,
"loss": 0.0284,
"reward": 0.508545909030363,
"reward_std": 0.26957538770511746,
"rewards/accuracy_reward": 0.508545909030363,
"rewards/format_reward": 0.0,
"step": 600
},
{
"epoch": 0.9276258575707798,
"eval_completion_length": 519.1669702148438,
"eval_kl": 0.608203125,
"eval_loss": 0.024479562416672707,
"eval_reward": 0.49571430921554566,
"eval_reward_std": 0.3164041242003441,
"eval_rewards/accuracy_reward": 0.49571430921554566,
"eval_rewards/format_reward": 0.0,
"eval_runtime": 118.8299,
"eval_samples_per_second": 0.833,
"eval_steps_per_second": 0.034,
"step": 600
},
{
"completion_length": 539.3131271362305,
"epoch": 0.9430862885302927,
"grad_norm": 0.5791165478383384,
"kl": 0.691119384765625,
"learning_rate": 1.8886465094192895e-07,
"loss": 0.0276,
"reward": 0.490943868085742,
"reward_std": 0.25967655666172507,
"rewards/accuracy_reward": 0.490943868085742,
"rewards/format_reward": 0.0,
"step": 610
},
{
"completion_length": 538.3966737747193,
"epoch": 0.9585467194898057,
"grad_norm": 0.7640246067553359,
"kl": 0.67713623046875,
"learning_rate": 9.866173494794462e-08,
"loss": 0.0271,
"reward": 0.5010203978512436,
"reward_std": 0.2573148904833943,
"rewards/accuracy_reward": 0.5010203978512436,
"rewards/format_reward": 0.0,
"step": 620
},
{
"completion_length": 544.9163146972656,
"epoch": 0.9740071504493187,
"grad_norm": 0.32057223426264814,
"kl": 0.733837890625,
"learning_rate": 3.7401286837214224e-08,
"loss": 0.0294,
"reward": 0.49795917570590975,
"reward_std": 0.2614025991875678,
"rewards/accuracy_reward": 0.49795917570590975,
"rewards/format_reward": 0.0,
"step": 630
},
{
"completion_length": 538.4007551193238,
"epoch": 0.9894675814088317,
"grad_norm": 0.46047905547332224,
"kl": 0.66102294921875,
"learning_rate": 5.262376196544239e-09,
"loss": 0.0264,
"reward": 0.486479582823813,
"reward_std": 0.27064854740165173,
"rewards/accuracy_reward": 0.486479582823813,
"rewards/format_reward": 0.0,
"step": 640
},
{
"completion_length": 533.0607868830363,
"epoch": 0.9987438399845395,
"kl": 0.6852213541666666,
"reward": 0.4989370664892097,
"reward_std": 0.26216560679798323,
"rewards/accuracy_reward": 0.4989370664892097,
"rewards/format_reward": 0.0,
"step": 646,
"total_flos": 0.0,
"train_loss": 0.02447813622547251,
"train_runtime": 60911.9541,
"train_samples_per_second": 1.189,
"train_steps_per_second": 0.011
}
],
"logging_steps": 10,
"max_steps": 646,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 7,
"trial_name": null,
"trial_params": null
}