s300-1.3.0L-GRPO-it3 / trainer_state.json
PocketDoc's picture
Upload folder using huggingface_hub
d925940 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.7802340702210663,
"eval_steps": 500,
"global_step": 300,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 366.5625,
"epoch": 0.002600780234070221,
"grad_norm": 0.7637246764761187,
"kl": 0.00033664703369140625,
"learning_rate": 0.0,
"loss": 0.0093,
"reward": 0.27206526696681976,
"reward_std": 0.35790932178497314,
"rewards/preference_model_reward": 0.27206526696681976,
"rewards/preference_model_reward/std": 0.3698284178972244,
"step": 1
},
{
"clip_ratio": 0.0,
"epoch": 0.005201560468140442,
"grad_norm": 0.7634221939067439,
"kl": 0.00033664703369140625,
"learning_rate": 1e-07,
"loss": 0.0093,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 286.765625,
"epoch": 0.007802340702210663,
"grad_norm": 0.6989165605414746,
"kl": 0.0003261566162109375,
"learning_rate": 2e-07,
"loss": 0.0017,
"reward": 0.4179777204990387,
"reward_std": 0.3505486845970154,
"rewards/preference_model_reward": 0.4179777204990387,
"rewards/preference_model_reward/std": 0.42789818346500397,
"step": 3
},
{
"clip_ratio": 0.0003455103069427423,
"epoch": 0.010403120936280884,
"grad_norm": 0.7024752202801559,
"kl": 0.00033855438232421875,
"learning_rate": 3e-07,
"loss": 0.0017,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 347.421875,
"epoch": 0.013003901170351105,
"grad_norm": 0.7903982506139909,
"kl": 0.0003814697265625,
"learning_rate": 4e-07,
"loss": 0.0042,
"reward": 0.42805667221546173,
"reward_std": 0.3343174010515213,
"rewards/preference_model_reward": 0.42805667221546173,
"rewards/preference_model_reward/std": 0.38788357377052307,
"step": 5
},
{
"clip_ratio": 0.00042975760879926383,
"epoch": 0.015604681404421327,
"grad_norm": 0.8018115419028574,
"kl": 0.0003833770751953125,
"learning_rate": 5e-07,
"loss": 0.0042,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 213.625,
"epoch": 0.018205461638491547,
"grad_norm": 0.5838143481805789,
"kl": 0.0004119873046875,
"learning_rate": 6e-07,
"loss": -0.0006,
"reward": 0.1721051186323166,
"reward_std": 0.2126249074935913,
"rewards/preference_model_reward": 0.1721051186323166,
"rewards/preference_model_reward/std": 0.32822249829769135,
"step": 7
},
{
"clip_ratio": 0.00028950837804586627,
"epoch": 0.02080624187256177,
"grad_norm": 0.5890045804822527,
"kl": 0.00039958953857421875,
"learning_rate": 7e-07,
"loss": -0.0006,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 276.53125,
"epoch": 0.02340702210663199,
"grad_norm": 0.8569937422606633,
"kl": 0.000370025634765625,
"learning_rate": 8e-07,
"loss": 0.0074,
"reward": 0.5692853033542633,
"reward_std": 0.4373367577791214,
"rewards/preference_model_reward": 0.5692853033542633,
"rewards/preference_model_reward/std": 0.43920741975307465,
"step": 9
},
{
"clip_ratio": 0.0002302309003425762,
"epoch": 0.02600780234070221,
"grad_norm": 0.8635017364924986,
"kl": 0.000354766845703125,
"learning_rate": 9e-07,
"loss": 0.0075,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 417.921875,
"epoch": 0.02860858257477243,
"grad_norm": 0.7581548485527216,
"kl": 0.000392913818359375,
"learning_rate": 1e-06,
"loss": 0.0146,
"reward": 0.4504364877939224,
"reward_std": 0.3650350868701935,
"rewards/preference_model_reward": 0.4504364877939224,
"rewards/preference_model_reward/std": 0.4405139982700348,
"step": 11
},
{
"clip_ratio": 0.00031334128289017826,
"epoch": 0.031209362808842653,
"grad_norm": 0.7745072225657651,
"kl": 0.00039005279541015625,
"learning_rate": 1e-06,
"loss": 0.0146,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 229.953125,
"epoch": 0.033810143042912875,
"grad_norm": 0.8433641431962554,
"kl": 0.00045299530029296875,
"learning_rate": 1e-06,
"loss": -0.0043,
"reward": 0.2926686853170395,
"reward_std": 0.33062444627285004,
"rewards/preference_model_reward": 0.2926686853170395,
"rewards/preference_model_reward/std": 0.3724062442779541,
"step": 13
},
{
"clip_ratio": 0.00040468364022672176,
"epoch": 0.036410923276983094,
"grad_norm": 0.7555936110067955,
"kl": 0.0004749298095703125,
"learning_rate": 1e-06,
"loss": -0.0043,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 266.859375,
"epoch": 0.03901170351105332,
"grad_norm": 0.8416993973883007,
"kl": 0.00039386749267578125,
"learning_rate": 1e-06,
"loss": 0.0026,
"reward": 0.41267864406108856,
"reward_std": 0.3813520818948746,
"rewards/preference_model_reward": 0.41267864406108856,
"rewards/preference_model_reward/std": 0.4197400361299515,
"step": 15
},
{
"clip_ratio": 0.00021883380395593122,
"epoch": 0.04161248374512354,
"grad_norm": 0.819553453730802,
"kl": 0.000415802001953125,
"learning_rate": 1e-06,
"loss": 0.0026,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 367.84375,
"epoch": 0.044213263979193757,
"grad_norm": 0.5697479623176207,
"kl": 0.00043487548828125,
"learning_rate": 1e-06,
"loss": 0.0076,
"reward": 0.3072579577565193,
"reward_std": 0.20797011256217957,
"rewards/preference_model_reward": 0.3072579577565193,
"rewards/preference_model_reward/std": 0.3955962061882019,
"step": 17
},
{
"clip_ratio": 0.00018104003902408294,
"epoch": 0.04681404421326398,
"grad_norm": 0.5554507387208018,
"kl": 0.0004138946533203125,
"learning_rate": 1e-06,
"loss": 0.0076,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 355.671875,
"epoch": 0.0494148244473342,
"grad_norm": 0.9534071970661521,
"kl": 0.00045013427734375,
"learning_rate": 1e-06,
"loss": 0.0049,
"reward": 0.32867759466171265,
"reward_std": 0.4192758649587631,
"rewards/preference_model_reward": 0.32867759466171265,
"rewards/preference_model_reward/std": 0.41489049792289734,
"step": 19
},
{
"clip_ratio": 0.0001942292437888682,
"epoch": 0.05201560468140442,
"grad_norm": 0.9582264669727778,
"kl": 0.0004558563232421875,
"learning_rate": 1e-06,
"loss": 0.0049,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 340.671875,
"epoch": 0.054616384915474644,
"grad_norm": 0.7518884583016779,
"kl": 0.0004329681396484375,
"learning_rate": 1e-06,
"loss": 0.0054,
"reward": 0.47081659734249115,
"reward_std": 0.37600383162498474,
"rewards/preference_model_reward": 0.47081659734249115,
"rewards/preference_model_reward/std": 0.46160686016082764,
"step": 21
},
{
"clip_ratio": 0.00022455723956227303,
"epoch": 0.05721716514954486,
"grad_norm": 0.7473411198477925,
"kl": 0.0004329681396484375,
"learning_rate": 1e-06,
"loss": 0.0054,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 328.734375,
"epoch": 0.05981794538361508,
"grad_norm": 1.9194684284684271,
"kl": 0.0005474090576171875,
"learning_rate": 1e-06,
"loss": 0.0147,
"reward": 0.4218391329050064,
"reward_std": 0.3977803438901901,
"rewards/preference_model_reward": 0.4218391329050064,
"rewards/preference_model_reward/std": 0.4245973825454712,
"step": 23
},
{
"clip_ratio": 0.0008293713217426557,
"epoch": 0.06241872561768531,
"grad_norm": 1.0216332209635672,
"kl": 0.00061798095703125,
"learning_rate": 1e-06,
"loss": 0.0147,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 361.984375,
"epoch": 0.06501950585175553,
"grad_norm": 1.2061326379172082,
"kl": 0.0006351470947265625,
"learning_rate": 1e-06,
"loss": 0.0006,
"reward": 0.2993537187576294,
"reward_std": 0.33115406334400177,
"rewards/preference_model_reward": 0.2993537187576294,
"rewards/preference_model_reward/std": 0.34632159024477005,
"step": 25
},
{
"clip_ratio": 0.00020526795196929015,
"epoch": 0.06762028608582575,
"grad_norm": 0.7646253192627566,
"kl": 0.000896453857421875,
"learning_rate": 1e-06,
"loss": 0.0006,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 333.59375,
"epoch": 0.07022106631989597,
"grad_norm": 1.093317920354513,
"kl": 0.00075531005859375,
"learning_rate": 1e-06,
"loss": 0.0029,
"reward": 0.25294405221939087,
"reward_std": 0.26265186071395874,
"rewards/preference_model_reward": 0.25294405221939087,
"rewards/preference_model_reward/std": 0.31941579282283783,
"step": 27
},
{
"clip_ratio": 0.0002510274134692736,
"epoch": 0.07282184655396619,
"grad_norm": 0.5733552941634138,
"kl": 0.000896453857421875,
"learning_rate": 1e-06,
"loss": 0.0029,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 315.234375,
"epoch": 0.0754226267880364,
"grad_norm": 0.8998443055154545,
"kl": 0.000705718994140625,
"learning_rate": 1e-06,
"loss": 0.0003,
"reward": 0.44713057577610016,
"reward_std": 0.4038489907979965,
"rewards/preference_model_reward": 0.44713057577610016,
"rewards/preference_model_reward/std": 0.4337831437587738,
"step": 29
},
{
"clip_ratio": 7.785470006638207e-05,
"epoch": 0.07802340702210664,
"grad_norm": 0.9074477906453469,
"kl": 0.000736236572265625,
"learning_rate": 1e-06,
"loss": 0.0003,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 435.03125,
"epoch": 0.08062418725617686,
"grad_norm": 0.6726628287711655,
"kl": 0.000946044921875,
"learning_rate": 1e-06,
"loss": 0.0073,
"reward": 0.2786962687969208,
"reward_std": 0.25916408747434616,
"rewards/preference_model_reward": 0.2786962687969208,
"rewards/preference_model_reward/std": 0.3720279037952423,
"step": 31
},
{
"clip_ratio": 0.0002550777353462763,
"epoch": 0.08322496749024708,
"grad_norm": 0.6715022976592974,
"kl": 0.000949859619140625,
"learning_rate": 1e-06,
"loss": 0.0072,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 297.015625,
"epoch": 0.0858257477243173,
"grad_norm": 0.39405296450661176,
"kl": 0.0009059906005859375,
"learning_rate": 1e-06,
"loss": 0.0044,
"reward": 0.09238657355308533,
"reward_std": 0.2289801463484764,
"rewards/preference_model_reward": 0.09238657355308533,
"rewards/preference_model_reward/std": 0.22657855600118637,
"step": 33
},
{
"clip_ratio": 0.00038183602737262845,
"epoch": 0.08842652795838751,
"grad_norm": 0.4049944734565922,
"kl": 0.0009765625,
"learning_rate": 1e-06,
"loss": 0.0044,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 322.203125,
"epoch": 0.09102730819245773,
"grad_norm": 0.827294235606583,
"kl": 0.00092315673828125,
"learning_rate": 1e-06,
"loss": 0.009,
"reward": 0.3779194802045822,
"reward_std": 0.39827682077884674,
"rewards/preference_model_reward": 0.3779194802045822,
"rewards/preference_model_reward/std": 0.43630431592464447,
"step": 35
},
{
"clip_ratio": 0.00026937505026580766,
"epoch": 0.09362808842652796,
"grad_norm": 0.8300155350554602,
"kl": 0.0009613037109375,
"learning_rate": 1e-06,
"loss": 0.009,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 361.75,
"epoch": 0.09622886866059818,
"grad_norm": 0.9040133604531,
"kl": 0.001125335693359375,
"learning_rate": 1e-06,
"loss": 0.0078,
"reward": 0.3842260241508484,
"reward_std": 0.38062165677547455,
"rewards/preference_model_reward": 0.3842260241508484,
"rewards/preference_model_reward/std": 0.42281022667884827,
"step": 37
},
{
"clip_ratio": 0.000247176612901967,
"epoch": 0.0988296488946684,
"grad_norm": 0.8818972451317902,
"kl": 0.001178741455078125,
"learning_rate": 1e-06,
"loss": 0.0078,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 409.0,
"epoch": 0.10143042912873862,
"grad_norm": 0.9975116421261425,
"kl": 0.00125885009765625,
"learning_rate": 1e-06,
"loss": 0.0066,
"reward": 0.46170978248119354,
"reward_std": 0.3637985289096832,
"rewards/preference_model_reward": 0.46170978248119354,
"rewards/preference_model_reward/std": 0.4502808153629303,
"step": 39
},
{
"clip_ratio": 0.00038266702904365957,
"epoch": 0.10403120936280884,
"grad_norm": 0.9904624875110181,
"kl": 0.00133514404296875,
"learning_rate": 1e-06,
"loss": 0.0066,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 221.875,
"epoch": 0.10663198959687907,
"grad_norm": 0.5851402421021292,
"kl": 0.001651763916015625,
"learning_rate": 1e-06,
"loss": -0.0034,
"reward": 0.3681895285844803,
"reward_std": 0.2873292565345764,
"rewards/preference_model_reward": 0.3681895285844803,
"rewards/preference_model_reward/std": 0.42807736992836,
"step": 41
},
{
"clip_ratio": 0.00044402084313333035,
"epoch": 0.10923276983094929,
"grad_norm": 0.5781183841216399,
"kl": 0.00211334228515625,
"learning_rate": 1e-06,
"loss": -0.0034,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 319.390625,
"epoch": 0.11183355006501951,
"grad_norm": 0.8225811515974385,
"kl": 0.0016632080078125,
"learning_rate": 1e-06,
"loss": 0.0131,
"reward": 0.5128970742225647,
"reward_std": 0.3217063844203949,
"rewards/preference_model_reward": 0.5128970742225647,
"rewards/preference_model_reward/std": 0.42474237084388733,
"step": 43
},
{
"clip_ratio": 0.00033481663558632135,
"epoch": 0.11443433029908973,
"grad_norm": 0.8483070262251596,
"kl": 0.001781463623046875,
"learning_rate": 1e-06,
"loss": 0.0131,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 224.203125,
"epoch": 0.11703511053315994,
"grad_norm": 0.5395067031089475,
"kl": 0.00168609619140625,
"learning_rate": 1e-06,
"loss": 0.0018,
"reward": 0.5381855368614197,
"reward_std": 0.2828982323408127,
"rewards/preference_model_reward": 0.5381855368614197,
"rewards/preference_model_reward/std": 0.4257737398147583,
"step": 45
},
{
"clip_ratio": 8.294625149574131e-05,
"epoch": 0.11963589076723016,
"grad_norm": 0.5469512514582927,
"kl": 0.0019378662109375,
"learning_rate": 1e-06,
"loss": 0.0018,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 313.03125,
"epoch": 0.1222366710013004,
"grad_norm": 0.7038582256559461,
"kl": 0.00201416015625,
"learning_rate": 1e-06,
"loss": 0.0056,
"reward": 0.4148406833410263,
"reward_std": 0.3201068937778473,
"rewards/preference_model_reward": 0.4148406833410263,
"rewards/preference_model_reward/std": 0.4175822138786316,
"step": 47
},
{
"clip_ratio": 7.856693991925567e-05,
"epoch": 0.12483745123537061,
"grad_norm": 0.675491507271435,
"kl": 0.0020904541015625,
"learning_rate": 1e-06,
"loss": 0.0056,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 414.671875,
"epoch": 0.12743823146944083,
"grad_norm": 0.8170063918072458,
"kl": 0.0021820068359375,
"learning_rate": 1e-06,
"loss": 0.0091,
"reward": 0.2692461907863617,
"reward_std": 0.3443475216627121,
"rewards/preference_model_reward": 0.2692461907863617,
"rewards/preference_model_reward/std": 0.36693260073661804,
"step": 49
},
{
"clip_ratio": 0.00021530466619879007,
"epoch": 0.13003901170351106,
"grad_norm": 0.8122674605503017,
"kl": 0.00228118896484375,
"learning_rate": 1e-06,
"loss": 0.0092,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 391.953125,
"epoch": 0.13263979193758127,
"grad_norm": 1.0413631794553713,
"kl": 0.002166748046875,
"learning_rate": 1e-06,
"loss": 0.0037,
"reward": 0.5847213864326477,
"reward_std": 0.42418254911899567,
"rewards/preference_model_reward": 0.5847213864326477,
"rewards/preference_model_reward/std": 0.43372175097465515,
"step": 51
},
{
"clip_ratio": 0.00017526535884826444,
"epoch": 0.1352405721716515,
"grad_norm": 0.996238286825058,
"kl": 0.00226593017578125,
"learning_rate": 1e-06,
"loss": 0.0036,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 352.640625,
"epoch": 0.1378413524057217,
"grad_norm": 0.8962684423825826,
"kl": 0.0029296875,
"learning_rate": 1e-06,
"loss": -0.0128,
"reward": 0.45333464443683624,
"reward_std": 0.34005598723888397,
"rewards/preference_model_reward": 0.45333464443683624,
"rewards/preference_model_reward/std": 0.4754510223865509,
"step": 53
},
{
"clip_ratio": 0.00030642370984423906,
"epoch": 0.14044213263979194,
"grad_norm": 0.9030383579421986,
"kl": 0.00295257568359375,
"learning_rate": 1e-06,
"loss": -0.0128,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 421.546875,
"epoch": 0.14304291287386217,
"grad_norm": 0.9579654034497551,
"kl": 0.0024871826171875,
"learning_rate": 1e-06,
"loss": 0.0174,
"reward": 0.46721845865249634,
"reward_std": 0.367512583732605,
"rewards/preference_model_reward": 0.46721845865249634,
"rewards/preference_model_reward/std": 0.38719798624515533,
"step": 55
},
{
"clip_ratio": 0.00037521994090639055,
"epoch": 0.14564369310793238,
"grad_norm": 1.1022969678624799,
"kl": 0.002593994140625,
"learning_rate": 1e-06,
"loss": 0.0174,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 263.5,
"epoch": 0.1482444733420026,
"grad_norm": 0.8463126094043792,
"kl": 0.00301361083984375,
"learning_rate": 1e-06,
"loss": -0.009,
"reward": 0.5161054730415344,
"reward_std": 0.4402284473180771,
"rewards/preference_model_reward": 0.5161054730415344,
"rewards/preference_model_reward/std": 0.43457816541194916,
"step": 57
},
{
"clip_ratio": 0.00023281520407181233,
"epoch": 0.1508452535760728,
"grad_norm": 0.8638119217410174,
"kl": 0.00360107421875,
"learning_rate": 1e-06,
"loss": -0.009,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 264.484375,
"epoch": 0.15344603381014305,
"grad_norm": 0.9063786698187589,
"kl": 0.0032806396484375,
"learning_rate": 1e-06,
"loss": 0.0045,
"reward": 0.4931875765323639,
"reward_std": 0.3744415044784546,
"rewards/preference_model_reward": 0.4931875765323639,
"rewards/preference_model_reward/std": 0.41449280083179474,
"step": 59
},
{
"clip_ratio": 0.0004102790408069268,
"epoch": 0.15604681404421328,
"grad_norm": 0.7069394763111468,
"kl": 0.00341796875,
"learning_rate": 1e-06,
"loss": 0.0045,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 325.734375,
"epoch": 0.15864759427828348,
"grad_norm": 0.8614703926848987,
"kl": 0.004241943359375,
"learning_rate": 1e-06,
"loss": -0.0041,
"reward": 0.5906274169683456,
"reward_std": 0.2752673625946045,
"rewards/preference_model_reward": 0.5906274169683456,
"rewards/preference_model_reward/std": 0.4438425600528717,
"step": 61
},
{
"clip_ratio": 8.253894338849932e-05,
"epoch": 0.16124837451235371,
"grad_norm": 4.155206953608738,
"kl": 0.0043792724609375,
"learning_rate": 1e-06,
"loss": -0.0041,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 324.640625,
"epoch": 0.16384915474642392,
"grad_norm": 0.7867835517980187,
"kl": 0.0041961669921875,
"learning_rate": 1e-06,
"loss": 0.0109,
"reward": 0.6263610422611237,
"reward_std": 0.33436986804008484,
"rewards/preference_model_reward": 0.6263610422611237,
"rewards/preference_model_reward/std": 0.437585785984993,
"step": 63
},
{
"clip_ratio": 0.00021243346418486908,
"epoch": 0.16644993498049415,
"grad_norm": 0.776838553332444,
"kl": 0.0043792724609375,
"learning_rate": 1e-06,
"loss": 0.0109,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 348.34375,
"epoch": 0.16905071521456436,
"grad_norm": 0.8132709521354365,
"kl": 0.004913330078125,
"learning_rate": 1e-06,
"loss": 0.0014,
"reward": 0.6121957302093506,
"reward_std": 0.3261266052722931,
"rewards/preference_model_reward": 0.6121957302093506,
"rewards/preference_model_reward/std": 0.43943892419338226,
"step": 65
},
{
"clip_ratio": 0.0002961681311717257,
"epoch": 0.1716514954486346,
"grad_norm": 0.7884869792445989,
"kl": 0.0051422119140625,
"learning_rate": 1e-06,
"loss": 0.0013,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 361.34375,
"epoch": 0.17425227568270482,
"grad_norm": 0.7001728994457754,
"kl": 0.0053863525390625,
"learning_rate": 1e-06,
"loss": 0.0002,
"reward": 0.41676153242588043,
"reward_std": 0.2956756055355072,
"rewards/preference_model_reward": 0.41676153242588043,
"rewards/preference_model_reward/std": 0.33416300266981125,
"step": 67
},
{
"clip_ratio": 0.00037663579132640734,
"epoch": 0.17685305591677503,
"grad_norm": 0.701103186201825,
"kl": 0.0054931640625,
"learning_rate": 1e-06,
"loss": 0.0002,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 337.234375,
"epoch": 0.17945383615084526,
"grad_norm": 1.2570820396809244,
"kl": 0.1349945068359375,
"learning_rate": 1e-06,
"loss": 0.0013,
"reward": 0.4304375946521759,
"reward_std": 0.3242499828338623,
"rewards/preference_model_reward": 0.4304375946521759,
"rewards/preference_model_reward/std": 0.33702679723501205,
"step": 69
},
{
"clip_ratio": 0.00043001653102692217,
"epoch": 0.18205461638491546,
"grad_norm": 1.6894684029251643,
"kl": 0.099365234375,
"learning_rate": 1e-06,
"loss": 0.0012,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 361.09375,
"epoch": 0.1846553966189857,
"grad_norm": 0.740223544455656,
"kl": 0.0048065185546875,
"learning_rate": 1e-06,
"loss": -0.0079,
"reward": 0.38418276607990265,
"reward_std": 0.32340583205223083,
"rewards/preference_model_reward": 0.38418276607990265,
"rewards/preference_model_reward/std": 0.4079796075820923,
"step": 71
},
{
"clip_ratio": 0.0002601840387796983,
"epoch": 0.18725617685305593,
"grad_norm": 0.7356062747183508,
"kl": 0.0050048828125,
"learning_rate": 1e-06,
"loss": -0.0079,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 244.640625,
"epoch": 0.18985695708712613,
"grad_norm": 0.5133791549848619,
"kl": 0.0059051513671875,
"learning_rate": 1e-06,
"loss": -0.0043,
"reward": 0.19469847530126572,
"reward_std": 0.21297892928123474,
"rewards/preference_model_reward": 0.19469847530126572,
"rewards/preference_model_reward/std": 0.2838420420885086,
"step": 73
},
{
"clip_ratio": 0.00011593603994697332,
"epoch": 0.19245773732119636,
"grad_norm": 0.511693644544892,
"kl": 0.0062713623046875,
"learning_rate": 1e-06,
"loss": -0.0043,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 313.515625,
"epoch": 0.19505851755526657,
"grad_norm": 0.9763617288864301,
"kl": 0.005584716796875,
"learning_rate": 1e-06,
"loss": 0.0132,
"reward": 0.678843080997467,
"reward_std": 0.438846230506897,
"rewards/preference_model_reward": 0.678843080997467,
"rewards/preference_model_reward/std": 0.43212489783763885,
"step": 75
},
{
"clip_ratio": 0.00010097982158185914,
"epoch": 0.1976592977893368,
"grad_norm": 1.0362215753715969,
"kl": 0.0058135986328125,
"learning_rate": 1e-06,
"loss": 0.0132,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 428.25,
"epoch": 0.20026007802340703,
"grad_norm": 0.836837589110761,
"kl": 0.0065460205078125,
"learning_rate": 1e-06,
"loss": 0.0036,
"reward": 0.26709984242916107,
"reward_std": 0.29253821820020676,
"rewards/preference_model_reward": 0.26709984242916107,
"rewards/preference_model_reward/std": 0.33563828468322754,
"step": 77
},
{
"clip_ratio": 0.00026085925492225215,
"epoch": 0.20286085825747724,
"grad_norm": 0.8750312130202887,
"kl": 0.006805419921875,
"learning_rate": 1e-06,
"loss": 0.0036,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 354.515625,
"epoch": 0.20546163849154747,
"grad_norm": 0.92295112375941,
"kl": 0.00604248046875,
"learning_rate": 1e-06,
"loss": 0.0045,
"reward": 0.35681067407131195,
"reward_std": 0.378703311085701,
"rewards/preference_model_reward": 0.35681067407131195,
"rewards/preference_model_reward/std": 0.3927233815193176,
"step": 79
},
{
"clip_ratio": 8.110202907118946e-05,
"epoch": 0.20806241872561768,
"grad_norm": 0.8460897952801627,
"kl": 0.0062713623046875,
"learning_rate": 1e-06,
"loss": 0.0045,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 290.53125,
"epoch": 0.2106631989596879,
"grad_norm": 0.8210294793253463,
"kl": 0.0078582763671875,
"learning_rate": 1e-06,
"loss": -0.0127,
"reward": 0.47505128383636475,
"reward_std": 0.3855440318584442,
"rewards/preference_model_reward": 0.47505128383636475,
"rewards/preference_model_reward/std": 0.4009372293949127,
"step": 81
},
{
"clip_ratio": 0.00014372689474839717,
"epoch": 0.21326397919375814,
"grad_norm": 0.8027749648905551,
"kl": 0.0082244873046875,
"learning_rate": 1e-06,
"loss": -0.0126,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 323.140625,
"epoch": 0.21586475942782835,
"grad_norm": 0.6142723349382537,
"kl": 0.00732421875,
"learning_rate": 1e-06,
"loss": 0.0002,
"reward": 0.4587654545903206,
"reward_std": 0.29028914868831635,
"rewards/preference_model_reward": 0.4587654545903206,
"rewards/preference_model_reward/std": 0.3300708681344986,
"step": 83
},
{
"clip_ratio": 0.00034744998265523463,
"epoch": 0.21846553966189858,
"grad_norm": 0.5861784424361753,
"kl": 0.0074920654296875,
"learning_rate": 1e-06,
"loss": 0.0002,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 484.484375,
"epoch": 0.22106631989596878,
"grad_norm": 0.9076959060952224,
"kl": 0.0075836181640625,
"learning_rate": 1e-06,
"loss": 0.0072,
"reward": 0.5031348764896393,
"reward_std": 0.2952383682131767,
"rewards/preference_model_reward": 0.5031348764896393,
"rewards/preference_model_reward/std": 0.44817858934402466,
"step": 85
},
{
"clip_ratio": 0.0002725635713431984,
"epoch": 0.22366710013003901,
"grad_norm": 0.9288531435242793,
"kl": 0.007843017578125,
"learning_rate": 1e-06,
"loss": 0.0073,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 339.09375,
"epoch": 0.22626788036410922,
"grad_norm": 0.8923680637356045,
"kl": 0.00830078125,
"learning_rate": 1e-06,
"loss": 0.0105,
"reward": 0.4401181936264038,
"reward_std": 0.4361417144536972,
"rewards/preference_model_reward": 0.4401181936264038,
"rewards/preference_model_reward/std": 0.4305359721183777,
"step": 87
},
{
"clip_ratio": 0.00014025312702869996,
"epoch": 0.22886866059817945,
"grad_norm": 0.8984033497034181,
"kl": 0.008697509765625,
"learning_rate": 1e-06,
"loss": 0.0106,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 321.375,
"epoch": 0.23146944083224968,
"grad_norm": 2.827688577013721,
"kl": 0.008758544921875,
"learning_rate": 1e-06,
"loss": 0.0001,
"reward": 0.5173258185386658,
"reward_std": 0.4247867166996002,
"rewards/preference_model_reward": 0.5173258185386658,
"rewards/preference_model_reward/std": 0.4418337345123291,
"step": 89
},
{
"clip_ratio": 0.00032082964025903493,
"epoch": 0.2340702210663199,
"grad_norm": 0.9909499799511914,
"kl": 0.009033203125,
"learning_rate": 1e-06,
"loss": 0.0002,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 397.25,
"epoch": 0.23667100130039012,
"grad_norm": 0.958697923736949,
"kl": 0.00958251953125,
"learning_rate": 1e-06,
"loss": -0.0032,
"reward": 0.7365403473377228,
"reward_std": 0.37412843108177185,
"rewards/preference_model_reward": 0.7365403473377228,
"rewards/preference_model_reward/std": 0.38982725143432617,
"step": 91
},
{
"clip_ratio": 0.0003890225198119879,
"epoch": 0.23927178153446033,
"grad_norm": 0.9071467126090668,
"kl": 0.00982666015625,
"learning_rate": 1e-06,
"loss": -0.0031,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 208.75,
"epoch": 0.24187256176853056,
"grad_norm": 0.6085390765827805,
"kl": 0.010498046875,
"learning_rate": 1e-06,
"loss": -0.0095,
"reward": 0.6177057921886444,
"reward_std": 0.36169178783893585,
"rewards/preference_model_reward": 0.6177057921886444,
"rewards/preference_model_reward/std": 0.3608778268098831,
"step": 93
},
{
"clip_ratio": 0.0002914143551606685,
"epoch": 0.2444733420026008,
"grad_norm": 0.6159917640161859,
"kl": 0.010528564453125,
"learning_rate": 1e-06,
"loss": -0.0095,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 357.9375,
"epoch": 0.247074122236671,
"grad_norm": 0.9013427802646543,
"kl": 0.011474609375,
"learning_rate": 1e-06,
"loss": 0.0033,
"reward": 0.4740111082792282,
"reward_std": 0.39952021837234497,
"rewards/preference_model_reward": 0.4740111082792282,
"rewards/preference_model_reward/std": 0.4284791499376297,
"step": 95
},
{
"clip_ratio": 0.0003251660345995333,
"epoch": 0.24967490247074123,
"grad_norm": 0.9042971795796162,
"kl": 0.011688232421875,
"learning_rate": 1e-06,
"loss": 0.0032,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 368.859375,
"epoch": 0.25227568270481143,
"grad_norm": 0.7543210465997314,
"kl": 0.01019287109375,
"learning_rate": 1e-06,
"loss": -0.015,
"reward": 0.732722133398056,
"reward_std": 0.3390260487794876,
"rewards/preference_model_reward": 0.732722133398056,
"rewards/preference_model_reward/std": 0.37897253036499023,
"step": 97
},
{
"clip_ratio": 0.00034867875365307555,
"epoch": 0.25487646293888166,
"grad_norm": 0.7756495986001384,
"kl": 0.0103759765625,
"learning_rate": 1e-06,
"loss": -0.015,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 269.609375,
"epoch": 0.2574772431729519,
"grad_norm": 0.8621435865558427,
"kl": 0.012908935546875,
"learning_rate": 1e-06,
"loss": 0.0035,
"reward": 0.5670541375875473,
"reward_std": 0.3086087480187416,
"rewards/preference_model_reward": 0.5670541375875473,
"rewards/preference_model_reward/std": 0.4115421622991562,
"step": 99
},
{
"clip_ratio": 0.0002304340960108675,
"epoch": 0.26007802340702213,
"grad_norm": 0.8566337212306596,
"kl": 0.01300048828125,
"learning_rate": 1e-06,
"loss": 0.0035,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 321.296875,
"epoch": 0.2626788036410923,
"grad_norm": 0.7524769990332698,
"kl": 0.01031494140625,
"learning_rate": 1e-06,
"loss": 0.0056,
"reward": 0.5453621596097946,
"reward_std": 0.3028823733329773,
"rewards/preference_model_reward": 0.5453621596097946,
"rewards/preference_model_reward/std": 0.41332288086414337,
"step": 101
},
{
"clip_ratio": 0.0004275508617865853,
"epoch": 0.26527958387516254,
"grad_norm": 0.760555697889034,
"kl": 0.010467529296875,
"learning_rate": 1e-06,
"loss": 0.0056,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 277.53125,
"epoch": 0.26788036410923277,
"grad_norm": 0.7828074285838857,
"kl": 0.01123046875,
"learning_rate": 1e-06,
"loss": -0.0056,
"reward": 0.5679600983858109,
"reward_std": 0.38669461011886597,
"rewards/preference_model_reward": 0.5679600983858109,
"rewards/preference_model_reward/std": 0.3954748064279556,
"step": 103
},
{
"clip_ratio": 0.00024837666569510475,
"epoch": 0.270481144343303,
"grad_norm": 0.7708099657439608,
"kl": 0.011322021484375,
"learning_rate": 1e-06,
"loss": -0.0056,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 449.03125,
"epoch": 0.27308192457737324,
"grad_norm": 0.8471146947264236,
"kl": 0.0101318359375,
"learning_rate": 1e-06,
"loss": 0.0058,
"reward": 0.7424919009208679,
"reward_std": 0.31481410562992096,
"rewards/preference_model_reward": 0.7424919009208679,
"rewards/preference_model_reward/std": 0.3854113817214966,
"step": 105
},
{
"clip_ratio": 0.0001572734909132123,
"epoch": 0.2756827048114434,
"grad_norm": 0.870295151902565,
"kl": 0.01031494140625,
"learning_rate": 1e-06,
"loss": 0.0058,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 460.96875,
"epoch": 0.27828348504551365,
"grad_norm": 1.2070074597423983,
"kl": 0.012939453125,
"learning_rate": 1e-06,
"loss": 0.0009,
"reward": 0.5596677958965302,
"reward_std": 0.4118568003177643,
"rewards/preference_model_reward": 0.5596677958965302,
"rewards/preference_model_reward/std": 0.4305266737937927,
"step": 107
},
{
"clip_ratio": 0.00020162294094916433,
"epoch": 0.2808842652795839,
"grad_norm": 1.1701677024313162,
"kl": 0.0130615234375,
"learning_rate": 1e-06,
"loss": 0.0009,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 279.953125,
"epoch": 0.2834850455136541,
"grad_norm": 0.4951581311208896,
"kl": 0.012420654296875,
"learning_rate": 1e-06,
"loss": -0.0018,
"reward": 0.677459716796875,
"reward_std": 0.21432576701045036,
"rewards/preference_model_reward": 0.677459716796875,
"rewards/preference_model_reward/std": 0.40531064569950104,
"step": 109
},
{
"clip_ratio": 0.00015336842989199795,
"epoch": 0.28608582574772434,
"grad_norm": 0.4891314801199321,
"kl": 0.0125732421875,
"learning_rate": 1e-06,
"loss": -0.0017,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 290.359375,
"epoch": 0.2886866059817945,
"grad_norm": 0.8219128100663494,
"kl": 0.011444091796875,
"learning_rate": 1e-06,
"loss": 0.0046,
"reward": 0.5518685728311539,
"reward_std": 0.28263746947050095,
"rewards/preference_model_reward": 0.5518685728311539,
"rewards/preference_model_reward/std": 0.42345236241817474,
"step": 111
},
{
"clip_ratio": 0.0004249960547895171,
"epoch": 0.29128738621586475,
"grad_norm": 1.0133739094911354,
"kl": 0.011749267578125,
"learning_rate": 1e-06,
"loss": 0.0046,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 346.765625,
"epoch": 0.293888166449935,
"grad_norm": 0.8158114597582814,
"kl": 0.01141357421875,
"learning_rate": 1e-06,
"loss": -0.0107,
"reward": 0.6804526448249817,
"reward_std": 0.3598247319459915,
"rewards/preference_model_reward": 0.6804526448249817,
"rewards/preference_model_reward/std": 0.38642027974128723,
"step": 113
},
{
"clip_ratio": 0.0003029858708032407,
"epoch": 0.2964889466840052,
"grad_norm": 0.8253998137210009,
"kl": 0.0115966796875,
"learning_rate": 1e-06,
"loss": -0.0108,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 385.359375,
"epoch": 0.29908972691807545,
"grad_norm": 0.9275019225790542,
"kl": 0.013031005859375,
"learning_rate": 1e-06,
"loss": 0.0001,
"reward": 0.6333288848400116,
"reward_std": 0.35428452491760254,
"rewards/preference_model_reward": 0.6333288848400116,
"rewards/preference_model_reward/std": 0.3997349590063095,
"step": 115
},
{
"clip_ratio": 0.00022703978174831718,
"epoch": 0.3016905071521456,
"grad_norm": 0.9059205023346738,
"kl": 0.013153076171875,
"learning_rate": 1e-06,
"loss": 0.0001,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 316.828125,
"epoch": 0.30429128738621586,
"grad_norm": 0.9713664565394787,
"kl": 0.01373291015625,
"learning_rate": 1e-06,
"loss": 0.0139,
"reward": 0.7312487959861755,
"reward_std": 0.38407662510871887,
"rewards/preference_model_reward": 0.7312487959861755,
"rewards/preference_model_reward/std": 0.40977030992507935,
"step": 117
},
{
"clip_ratio": 0.0002469179962645285,
"epoch": 0.3068920676202861,
"grad_norm": 1.01290147214235,
"kl": 0.013885498046875,
"learning_rate": 1e-06,
"loss": 0.0139,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 290.09375,
"epoch": 0.3094928478543563,
"grad_norm": 0.8042535298912621,
"kl": 0.01300048828125,
"learning_rate": 1e-06,
"loss": -0.0069,
"reward": 0.6101844310760498,
"reward_std": 0.3753702640533447,
"rewards/preference_model_reward": 0.6101844310760498,
"rewards/preference_model_reward/std": 0.41265669465065,
"step": 119
},
{
"clip_ratio": 0.00021792916231788695,
"epoch": 0.31209362808842656,
"grad_norm": 0.7861169995186488,
"kl": 0.01348876953125,
"learning_rate": 1e-06,
"loss": -0.0069,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 343.8125,
"epoch": 0.31469440832249673,
"grad_norm": 1.0936736694491056,
"kl": 0.0130615234375,
"learning_rate": 1e-06,
"loss": -0.008,
"reward": 0.7429376542568207,
"reward_std": 0.34224678575992584,
"rewards/preference_model_reward": 0.7429376542568207,
"rewards/preference_model_reward/std": 0.39303846657276154,
"step": 121
},
{
"clip_ratio": 0.0004085178370587528,
"epoch": 0.31729518855656696,
"grad_norm": 0.8105152189436506,
"kl": 0.013336181640625,
"learning_rate": 1e-06,
"loss": -0.0081,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 351.578125,
"epoch": 0.3198959687906372,
"grad_norm": 1.1492446304526747,
"kl": 0.012359619140625,
"learning_rate": 1e-06,
"loss": -0.0031,
"reward": 0.666700005531311,
"reward_std": 0.4078214764595032,
"rewards/preference_model_reward": 0.666700005531311,
"rewards/preference_model_reward/std": 0.4199042469263077,
"step": 123
},
{
"clip_ratio": 0.0003381448841537349,
"epoch": 0.32249674902470743,
"grad_norm": 0.9700568661797079,
"kl": 0.0125732421875,
"learning_rate": 1e-06,
"loss": -0.0031,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 295.890625,
"epoch": 0.3250975292587776,
"grad_norm": 0.7872274173675015,
"kl": 0.012054443359375,
"learning_rate": 1e-06,
"loss": -0.0016,
"reward": 0.6267447769641876,
"reward_std": 0.38382989168167114,
"rewards/preference_model_reward": 0.6267447769641876,
"rewards/preference_model_reward/std": 0.40342220664024353,
"step": 125
},
{
"clip_ratio": 0.00043341246782802045,
"epoch": 0.32769830949284784,
"grad_norm": 0.7404724454292584,
"kl": 0.01226806640625,
"learning_rate": 1e-06,
"loss": -0.0016,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 283.328125,
"epoch": 0.33029908972691807,
"grad_norm": 0.7130605700783115,
"kl": 0.0150146484375,
"learning_rate": 1e-06,
"loss": -0.0016,
"reward": 0.5046076327562332,
"reward_std": 0.32486245036125183,
"rewards/preference_model_reward": 0.5046076327562332,
"rewards/preference_model_reward/std": 0.35284098982810974,
"step": 127
},
{
"clip_ratio": 0.0002618366925162263,
"epoch": 0.3328998699609883,
"grad_norm": 0.7176392089800268,
"kl": 0.01519775390625,
"learning_rate": 1e-06,
"loss": -0.0016,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 364.796875,
"epoch": 0.33550065019505854,
"grad_norm": 0.442241343056545,
"kl": 0.013427734375,
"learning_rate": 1e-06,
"loss": -0.0004,
"reward": 0.9121778607368469,
"reward_std": 0.16598587855696678,
"rewards/preference_model_reward": 0.9121778607368469,
"rewards/preference_model_reward/std": 0.16689887270331383,
"step": 129
},
{
"clip_ratio": 0.0002460579635226168,
"epoch": 0.3381014304291287,
"grad_norm": 0.4491215566539981,
"kl": 0.0135498046875,
"learning_rate": 1e-06,
"loss": -0.0004,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 399.328125,
"epoch": 0.34070221066319895,
"grad_norm": 0.9316255175025087,
"kl": 0.0152587890625,
"learning_rate": 1e-06,
"loss": 0.0096,
"reward": 0.4223726838827133,
"reward_std": 0.36841753125190735,
"rewards/preference_model_reward": 0.4223726838827133,
"rewards/preference_model_reward/std": 0.39657390117645264,
"step": 131
},
{
"clip_ratio": 0.0002102968719555065,
"epoch": 0.3433029908972692,
"grad_norm": 0.9385675024218874,
"kl": 0.01544189453125,
"learning_rate": 1e-06,
"loss": 0.0096,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 331.515625,
"epoch": 0.3459037711313394,
"grad_norm": 0.8377549816770966,
"kl": 0.01513671875,
"learning_rate": 1e-06,
"loss": -0.014,
"reward": 0.519294261932373,
"reward_std": 0.36041052639484406,
"rewards/preference_model_reward": 0.519294261932373,
"rewards/preference_model_reward/std": 0.44742684066295624,
"step": 133
},
{
"clip_ratio": 0.00024680225760675967,
"epoch": 0.34850455136540964,
"grad_norm": 0.8569254578138688,
"kl": 0.015411376953125,
"learning_rate": 1e-06,
"loss": -0.0141,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 272.734375,
"epoch": 0.3511053315994798,
"grad_norm": 0.7108994159400903,
"kl": 0.01519775390625,
"learning_rate": 1e-06,
"loss": -0.0135,
"reward": 0.6890691518783569,
"reward_std": 0.3600848317146301,
"rewards/preference_model_reward": 0.6890691518783569,
"rewards/preference_model_reward/std": 0.3913665860891342,
"step": 135
},
{
"clip_ratio": 0.00033790143788792193,
"epoch": 0.35370611183355005,
"grad_norm": 0.7107451748958499,
"kl": 0.015350341796875,
"learning_rate": 1e-06,
"loss": -0.0136,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 304.53125,
"epoch": 0.3563068920676203,
"grad_norm": 0.7112930237728833,
"kl": 0.01519775390625,
"learning_rate": 1e-06,
"loss": -0.0098,
"reward": 0.7829216420650482,
"reward_std": 0.34336017072200775,
"rewards/preference_model_reward": 0.7829216420650482,
"rewards/preference_model_reward/std": 0.34507185220718384,
"step": 137
},
{
"clip_ratio": 7.81005946919322e-05,
"epoch": 0.3589076723016905,
"grad_norm": 0.7120408275310693,
"kl": 0.01519775390625,
"learning_rate": 1e-06,
"loss": -0.0099,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 338.390625,
"epoch": 0.36150845253576075,
"grad_norm": 0.7547500044898569,
"kl": 0.013763427734375,
"learning_rate": 1e-06,
"loss": -0.0076,
"reward": 0.5637124627828598,
"reward_std": 0.2805905416607857,
"rewards/preference_model_reward": 0.5637124627828598,
"rewards/preference_model_reward/std": 0.39679694175720215,
"step": 139
},
{
"clip_ratio": 0.00016640447574900463,
"epoch": 0.3641092327698309,
"grad_norm": 0.8421728078893187,
"kl": 0.01385498046875,
"learning_rate": 1e-06,
"loss": -0.0076,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 322.21875,
"epoch": 0.36671001300390116,
"grad_norm": 0.9221596077030108,
"kl": 0.01507568359375,
"learning_rate": 1e-06,
"loss": -0.0052,
"reward": 0.5962317585945129,
"reward_std": 0.4129558801651001,
"rewards/preference_model_reward": 0.5962317585945129,
"rewards/preference_model_reward/std": 0.43260352313518524,
"step": 141
},
{
"clip_ratio": 0.0002706990926526487,
"epoch": 0.3693107932379714,
"grad_norm": 0.9429041940751776,
"kl": 0.01531982421875,
"learning_rate": 1e-06,
"loss": -0.0051,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 383.921875,
"epoch": 0.3719115734720416,
"grad_norm": 0.7528201748690613,
"kl": 0.01739501953125,
"learning_rate": 1e-06,
"loss": 0.0005,
"reward": 0.8064576983451843,
"reward_std": 0.2952372878789902,
"rewards/preference_model_reward": 0.8064576983451843,
"rewards/preference_model_reward/std": 0.33498962223529816,
"step": 143
},
{
"clip_ratio": 0.0002466185833327472,
"epoch": 0.37451235370611186,
"grad_norm": 0.7343692384638275,
"kl": 0.01751708984375,
"learning_rate": 1e-06,
"loss": 0.0005,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 331.671875,
"epoch": 0.37711313394018203,
"grad_norm": 0.6099207337964894,
"kl": 0.012603759765625,
"learning_rate": 1e-06,
"loss": 0.0025,
"reward": 0.47770172357559204,
"reward_std": 0.2876940220594406,
"rewards/preference_model_reward": 0.47770172357559204,
"rewards/preference_model_reward/std": 0.46976715326309204,
"step": 145
},
{
"clip_ratio": 0.00027403252897784114,
"epoch": 0.37971391417425226,
"grad_norm": 0.6935396105177946,
"kl": 0.01275634765625,
"learning_rate": 1e-06,
"loss": 0.0025,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 291.0625,
"epoch": 0.3823146944083225,
"grad_norm": 0.6431240710584735,
"kl": 0.015350341796875,
"learning_rate": 1e-06,
"loss": -0.0049,
"reward": 0.4983751177787781,
"reward_std": 0.3121785521507263,
"rewards/preference_model_reward": 0.4983751177787781,
"rewards/preference_model_reward/std": 0.4164520502090454,
"step": 147
},
{
"clip_ratio": 0.00021908171038376167,
"epoch": 0.38491547464239273,
"grad_norm": 0.644019538703639,
"kl": 0.01507568359375,
"learning_rate": 1e-06,
"loss": -0.0049,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 342.734375,
"epoch": 0.38751625487646296,
"grad_norm": 1.010397732794426,
"kl": 0.0172119140625,
"learning_rate": 1e-06,
"loss": 0.0067,
"reward": 0.6010620892047882,
"reward_std": 0.39562711119651794,
"rewards/preference_model_reward": 0.6010620892047882,
"rewards/preference_model_reward/std": 0.4233546853065491,
"step": 149
},
{
"clip_ratio": 0.00015757188884890638,
"epoch": 0.39011703511053314,
"grad_norm": 0.9973363628824508,
"kl": 0.0172119140625,
"learning_rate": 1e-06,
"loss": 0.0067,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 448.890625,
"epoch": 0.39271781534460337,
"grad_norm": 1.0100632336398712,
"kl": 0.01690673828125,
"learning_rate": 1e-06,
"loss": 0.0051,
"reward": 0.5471232235431671,
"reward_std": 0.408640593290329,
"rewards/preference_model_reward": 0.5471232235431671,
"rewards/preference_model_reward/std": 0.4592936784029007,
"step": 151
},
{
"clip_ratio": 0.0002920969855040312,
"epoch": 0.3953185955786736,
"grad_norm": 0.9929713506706197,
"kl": 0.017333984375,
"learning_rate": 1e-06,
"loss": 0.005,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 378.53125,
"epoch": 0.39791937581274384,
"grad_norm": 0.7606426983085363,
"kl": 0.017578125,
"learning_rate": 1e-06,
"loss": -0.01,
"reward": 0.7775087058544159,
"reward_std": 0.34680168330669403,
"rewards/preference_model_reward": 0.7775087058544159,
"rewards/preference_model_reward/std": 0.35080482065677643,
"step": 153
},
{
"clip_ratio": 0.0002456825313856825,
"epoch": 0.40052015604681407,
"grad_norm": 0.7710939909605372,
"kl": 0.0179443359375,
"learning_rate": 1e-06,
"loss": -0.01,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 250.796875,
"epoch": 0.40312093628088425,
"grad_norm": 0.5451857762518953,
"kl": 0.02081298828125,
"learning_rate": 1e-06,
"loss": -0.0028,
"reward": 0.5519833117723465,
"reward_std": 0.2390262335538864,
"rewards/preference_model_reward": 0.5519833117723465,
"rewards/preference_model_reward/std": 0.45026274025440216,
"step": 155
},
{
"clip_ratio": 0.00017806489631766453,
"epoch": 0.4057217165149545,
"grad_norm": 0.554072362585335,
"kl": 0.02093505859375,
"learning_rate": 1e-06,
"loss": -0.0028,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 232.078125,
"epoch": 0.4083224967490247,
"grad_norm": 0.5558645545689939,
"kl": 0.0223388671875,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 0.7436761558055878,
"reward_std": 0.2672045975923538,
"rewards/preference_model_reward": 0.7436761558055878,
"rewards/preference_model_reward/std": 0.35222816467285156,
"step": 157
},
{
"clip_ratio": 0.0003454240650171414,
"epoch": 0.41092327698309494,
"grad_norm": 0.6575119278635855,
"kl": 0.0224609375,
"learning_rate": 1e-06,
"loss": 0.0022,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 303.375,
"epoch": 0.4135240572171652,
"grad_norm": 0.7637043400607006,
"kl": 0.01995849609375,
"learning_rate": 1e-06,
"loss": 0.0036,
"reward": 0.5944642722606659,
"reward_std": 0.3134625256061554,
"rewards/preference_model_reward": 0.5944642722606659,
"rewards/preference_model_reward/std": 0.4180496633052826,
"step": 159
},
{
"clip_ratio": 0.00020651466911658645,
"epoch": 0.41612483745123535,
"grad_norm": 0.741675886273459,
"kl": 0.02020263671875,
"learning_rate": 1e-06,
"loss": 0.0035,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 303.96875,
"epoch": 0.4187256176853056,
"grad_norm": 0.5908208021088289,
"kl": 0.01922607421875,
"learning_rate": 1e-06,
"loss": -0.0014,
"reward": 0.8729158341884613,
"reward_std": 0.2293628454208374,
"rewards/preference_model_reward": 0.8729158341884613,
"rewards/preference_model_reward/std": 0.2594291567802429,
"step": 161
},
{
"clip_ratio": 0.000392273606848903,
"epoch": 0.4213263979193758,
"grad_norm": 0.5839404720619328,
"kl": 0.01947021484375,
"learning_rate": 1e-06,
"loss": -0.0013,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 282.125,
"epoch": 0.42392717815344605,
"grad_norm": 0.5173474832622873,
"kl": 0.01922607421875,
"learning_rate": 1e-06,
"loss": -0.0006,
"reward": 0.8816950023174286,
"reward_std": 0.17555147409439087,
"rewards/preference_model_reward": 0.8816950023174286,
"rewards/preference_model_reward/std": 0.24751071631908417,
"step": 163
},
{
"clip_ratio": 0.00020699262677226216,
"epoch": 0.4265279583875163,
"grad_norm": 0.5253345880647052,
"kl": 0.01947021484375,
"learning_rate": 1e-06,
"loss": -0.0006,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 354.40625,
"epoch": 0.42912873862158646,
"grad_norm": 0.642701967238361,
"kl": 0.02044677734375,
"learning_rate": 1e-06,
"loss": -0.0059,
"reward": 0.5235294103622437,
"reward_std": 0.3236909657716751,
"rewards/preference_model_reward": 0.5235294103622437,
"rewards/preference_model_reward/std": 0.3493155986070633,
"step": 165
},
{
"clip_ratio": 0.00025575608015060425,
"epoch": 0.4317295188556567,
"grad_norm": 0.6590828609212557,
"kl": 0.0208740234375,
"learning_rate": 1e-06,
"loss": -0.0059,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 322.296875,
"epoch": 0.4343302990897269,
"grad_norm": 0.6256801460180289,
"kl": 0.01947021484375,
"learning_rate": 1e-06,
"loss": -0.0001,
"reward": 0.8114274740219116,
"reward_std": 0.26273050904273987,
"rewards/preference_model_reward": 0.8114274740219116,
"rewards/preference_model_reward/std": 0.33220019936561584,
"step": 167
},
{
"clip_ratio": 0.00031010911334306,
"epoch": 0.43693107932379716,
"grad_norm": 0.6426336745378424,
"kl": 0.02008056640625,
"learning_rate": 1e-06,
"loss": -0.0001,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 294.84375,
"epoch": 0.43953185955786733,
"grad_norm": 0.5161332894447401,
"kl": 0.02392578125,
"learning_rate": 1e-06,
"loss": -0.0001,
"reward": 0.572134867310524,
"reward_std": 0.21724799275398254,
"rewards/preference_model_reward": 0.572134867310524,
"rewards/preference_model_reward/std": 0.3613039702177048,
"step": 169
},
{
"clip_ratio": 0.0004953075695084408,
"epoch": 0.44213263979193757,
"grad_norm": 0.5949338340498116,
"kl": 0.02392578125,
"learning_rate": 1e-06,
"loss": -0.0001,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 303.921875,
"epoch": 0.4447334200260078,
"grad_norm": 0.770205409238572,
"kl": 0.02276611328125,
"learning_rate": 1e-06,
"loss": 0.0068,
"reward": 0.6543514132499695,
"reward_std": 0.3576260805130005,
"rewards/preference_model_reward": 0.6543514132499695,
"rewards/preference_model_reward/std": 0.3618515580892563,
"step": 171
},
{
"clip_ratio": 0.0002886750335164834,
"epoch": 0.44733420026007803,
"grad_norm": 0.7881488307294554,
"kl": 0.02288818359375,
"learning_rate": 1e-06,
"loss": 0.0068,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 256.265625,
"epoch": 0.44993498049414826,
"grad_norm": 0.6609030894984753,
"kl": 0.02105712890625,
"learning_rate": 1e-06,
"loss": 0.003,
"reward": 0.7981027066707611,
"reward_std": 0.23525837063789368,
"rewards/preference_model_reward": 0.7981027066707611,
"rewards/preference_model_reward/std": 0.3589998483657837,
"step": 173
},
{
"clip_ratio": 0.00019836763385683298,
"epoch": 0.45253576072821844,
"grad_norm": 0.5635575828296628,
"kl": 0.020751953125,
"learning_rate": 1e-06,
"loss": 0.003,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 319.453125,
"epoch": 0.45513654096228867,
"grad_norm": 0.8369196057998636,
"kl": 0.0262451171875,
"learning_rate": 1e-06,
"loss": -0.0051,
"reward": 0.7033383548259735,
"reward_std": 0.2920212224125862,
"rewards/preference_model_reward": 0.7033383548259735,
"rewards/preference_model_reward/std": 0.3324955254793167,
"step": 175
},
{
"clip_ratio": 0.00029345130315050483,
"epoch": 0.4577373211963589,
"grad_norm": 0.7558084723044584,
"kl": 0.02667236328125,
"learning_rate": 1e-06,
"loss": -0.0051,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 313.46875,
"epoch": 0.46033810143042914,
"grad_norm": 0.6043611202887026,
"kl": 0.02191162109375,
"learning_rate": 1e-06,
"loss": -0.0042,
"reward": 0.7082796394824982,
"reward_std": 0.2582213580608368,
"rewards/preference_model_reward": 0.7082796394824982,
"rewards/preference_model_reward/std": 0.369435116648674,
"step": 177
},
{
"clip_ratio": 0.00041518576472299173,
"epoch": 0.46293888166449937,
"grad_norm": 0.6071654228678836,
"kl": 0.0220947265625,
"learning_rate": 1e-06,
"loss": -0.0042,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 361.84375,
"epoch": 0.46553966189856955,
"grad_norm": 0.8328067968880296,
"kl": 0.0191650390625,
"learning_rate": 1e-06,
"loss": 0.0056,
"reward": 0.6826794147491455,
"reward_std": 0.39410941302776337,
"rewards/preference_model_reward": 0.6826794147491455,
"rewards/preference_model_reward/std": 0.41487593948841095,
"step": 179
},
{
"clip_ratio": 0.0003094414860242978,
"epoch": 0.4681404421326398,
"grad_norm": 0.8355485459107456,
"kl": 0.0191650390625,
"learning_rate": 1e-06,
"loss": 0.0056,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 187.390625,
"epoch": 0.47074122236671,
"grad_norm": 0.9822709833700972,
"kl": 0.02197265625,
"learning_rate": 1e-06,
"loss": -0.0071,
"reward": 0.7466834783554077,
"reward_std": 0.3152329549193382,
"rewards/preference_model_reward": 0.7466834783554077,
"rewards/preference_model_reward/std": 0.3539666682481766,
"step": 181
},
{
"clip_ratio": 0.00015157322195591405,
"epoch": 0.47334200260078024,
"grad_norm": 0.6823337060903064,
"kl": 0.0220947265625,
"learning_rate": 1e-06,
"loss": -0.0071,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 366.078125,
"epoch": 0.4759427828348505,
"grad_norm": 0.5703827869859761,
"kl": 0.02264404296875,
"learning_rate": 1e-06,
"loss": 0.0004,
"reward": 0.760188639163971,
"reward_std": 0.2790553569793701,
"rewards/preference_model_reward": 0.760188639163971,
"rewards/preference_model_reward/std": 0.3441888093948364,
"step": 183
},
{
"clip_ratio": 0.00012506498387665488,
"epoch": 0.47854356306892065,
"grad_norm": 0.5735123344340435,
"kl": 0.02252197265625,
"learning_rate": 1e-06,
"loss": 0.0004,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 336.71875,
"epoch": 0.4811443433029909,
"grad_norm": 0.8852644183946003,
"kl": 0.0257568359375,
"learning_rate": 1e-06,
"loss": 0.0035,
"reward": 0.7332727313041687,
"reward_std": 0.3565700799226761,
"rewards/preference_model_reward": 0.7332727313041687,
"rewards/preference_model_reward/std": 0.39707188308238983,
"step": 185
},
{
"clip_ratio": 0.00018871420979849063,
"epoch": 0.4837451235370611,
"grad_norm": 0.8348047351432295,
"kl": 0.0257568359375,
"learning_rate": 1e-06,
"loss": 0.0035,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 334.140625,
"epoch": 0.48634590377113135,
"grad_norm": 0.4595696403338184,
"kl": 0.0205078125,
"learning_rate": 1e-06,
"loss": 0.0017,
"reward": 0.7008107006549835,
"reward_std": 0.137342881411314,
"rewards/preference_model_reward": 0.7008107006549835,
"rewards/preference_model_reward/std": 0.2884506806731224,
"step": 187
},
{
"clip_ratio": 0.00015839685511309654,
"epoch": 0.4889466840052016,
"grad_norm": 0.4569164783179551,
"kl": 0.0206298828125,
"learning_rate": 1e-06,
"loss": 0.0017,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 341.6875,
"epoch": 0.49154746423927176,
"grad_norm": 0.6567111545906984,
"kl": 0.0224609375,
"learning_rate": 1e-06,
"loss": -0.0003,
"reward": 0.7975671887397766,
"reward_std": 0.2910696864128113,
"rewards/preference_model_reward": 0.7975671887397766,
"rewards/preference_model_reward/std": 0.34180814027786255,
"step": 189
},
{
"clip_ratio": 0.00016007465819711797,
"epoch": 0.494148244473342,
"grad_norm": 0.6589594674011164,
"kl": 0.0228271484375,
"learning_rate": 1e-06,
"loss": -0.0003,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 258.34375,
"epoch": 0.4967490247074122,
"grad_norm": 1.797805114255913,
"kl": 0.02618408203125,
"learning_rate": 1e-06,
"loss": -0.0071,
"reward": 0.8337420225143433,
"reward_std": 0.2474193051457405,
"rewards/preference_model_reward": 0.8337420225143433,
"rewards/preference_model_reward/std": 0.3199751079082489,
"step": 191
},
{
"clip_ratio": 0.0002587813069112599,
"epoch": 0.49934980494148246,
"grad_norm": 0.5660164857332879,
"kl": 0.02655029296875,
"learning_rate": 1e-06,
"loss": -0.0071,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 288.46875,
"epoch": 0.5019505851755527,
"grad_norm": 0.6740716396818753,
"kl": 0.0220947265625,
"learning_rate": 1e-06,
"loss": -0.0029,
"reward": 0.6083263158798218,
"reward_std": 0.3238120675086975,
"rewards/preference_model_reward": 0.6083263158798218,
"rewards/preference_model_reward/std": 0.3790005147457123,
"step": 193
},
{
"clip_ratio": 0.00020429812866495922,
"epoch": 0.5045513654096229,
"grad_norm": 0.6657232986537709,
"kl": 0.02227783203125,
"learning_rate": 1e-06,
"loss": -0.0029,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 324.203125,
"epoch": 0.5071521456436932,
"grad_norm": 0.9139150980318331,
"kl": 0.01904296875,
"learning_rate": 1e-06,
"loss": -0.0009,
"reward": 0.7892916798591614,
"reward_std": 0.3459463268518448,
"rewards/preference_model_reward": 0.7892916798591614,
"rewards/preference_model_reward/std": 0.3491186946630478,
"step": 195
},
{
"clip_ratio": 0.00033000129042193294,
"epoch": 0.5097529258777633,
"grad_norm": 0.7044330356942116,
"kl": 0.019287109375,
"learning_rate": 1e-06,
"loss": -0.001,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 318.5,
"epoch": 0.5123537061118335,
"grad_norm": 1.097976379791459,
"kl": 0.0263671875,
"learning_rate": 1e-06,
"loss": 0.0085,
"reward": 0.6349725127220154,
"reward_std": 0.22165381908416748,
"rewards/preference_model_reward": 0.6349725127220154,
"rewards/preference_model_reward/std": 0.3097042515873909,
"step": 197
},
{
"clip_ratio": 0.0003432096855249256,
"epoch": 0.5149544863459038,
"grad_norm": 1.0743598050872851,
"kl": 0.02667236328125,
"learning_rate": 1e-06,
"loss": 0.0085,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 445.09375,
"epoch": 0.517555266579974,
"grad_norm": 0.8055405857842888,
"kl": 0.0220947265625,
"learning_rate": 1e-06,
"loss": -0.0007,
"reward": 0.3948906809091568,
"reward_std": 0.2313927859067917,
"rewards/preference_model_reward": 0.3948906809091568,
"rewards/preference_model_reward/std": 0.44849249720573425,
"step": 199
},
{
"clip_ratio": 0.00019127286213915795,
"epoch": 0.5201560468140443,
"grad_norm": 0.7564346551970879,
"kl": 0.02227783203125,
"learning_rate": 1e-06,
"loss": -0.0007,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 372.8125,
"epoch": 0.5227568270481144,
"grad_norm": 0.8534629203388931,
"kl": 0.0289306640625,
"learning_rate": 1e-06,
"loss": 0.0042,
"reward": 0.753670871257782,
"reward_std": 0.3137911409139633,
"rewards/preference_model_reward": 0.753670871257782,
"rewards/preference_model_reward/std": 0.35431434214115143,
"step": 201
},
{
"clip_ratio": 0.0001328055550402496,
"epoch": 0.5253576072821846,
"grad_norm": 0.8274306102017099,
"kl": 0.02947998046875,
"learning_rate": 1e-06,
"loss": 0.0042,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 187.734375,
"epoch": 0.5279583875162549,
"grad_norm": 0.5411453143013689,
"kl": 0.02337646484375,
"learning_rate": 1e-06,
"loss": -0.0073,
"reward": 0.6845378577709198,
"reward_std": 0.35077695548534393,
"rewards/preference_model_reward": 0.6845378577709198,
"rewards/preference_model_reward/std": 0.381725937128067,
"step": 203
},
{
"clip_ratio": 0.0005341880605556071,
"epoch": 0.5305591677503251,
"grad_norm": 0.5152105756393677,
"kl": 0.023681640625,
"learning_rate": 1e-06,
"loss": -0.0073,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 271.65625,
"epoch": 0.5331599479843954,
"grad_norm": 0.5219287048320751,
"kl": 0.022216796875,
"learning_rate": 1e-06,
"loss": -0.0027,
"reward": 0.8503350913524628,
"reward_std": 0.2589127942919731,
"rewards/preference_model_reward": 0.8503350913524628,
"rewards/preference_model_reward/std": 0.2864740937948227,
"step": 205
},
{
"clip_ratio": 0.0003061444003833458,
"epoch": 0.5357607282184655,
"grad_norm": 0.5250746479640336,
"kl": 0.0225830078125,
"learning_rate": 1e-06,
"loss": -0.0027,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 360.9375,
"epoch": 0.5383615084525357,
"grad_norm": 0.9686490492017382,
"kl": 0.0242919921875,
"learning_rate": 1e-06,
"loss": -0.0064,
"reward": 0.8047667145729065,
"reward_std": 0.34497836232185364,
"rewards/preference_model_reward": 0.8047667145729065,
"rewards/preference_model_reward/std": 0.3676797151565552,
"step": 207
},
{
"clip_ratio": 0.00017128348554251716,
"epoch": 0.540962288686606,
"grad_norm": 0.8929961952129637,
"kl": 0.02447509765625,
"learning_rate": 1e-06,
"loss": -0.0064,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 333.5625,
"epoch": 0.5435630689206762,
"grad_norm": 0.4541825457318801,
"kl": 0.02734375,
"learning_rate": 1e-06,
"loss": -0.0084,
"reward": 0.8458181023597717,
"reward_std": 0.23653991520404816,
"rewards/preference_model_reward": 0.8458181023597717,
"rewards/preference_model_reward/std": 0.2930053174495697,
"step": 209
},
{
"clip_ratio": 0.0007538689824286848,
"epoch": 0.5461638491547465,
"grad_norm": 0.45249924575704586,
"kl": 0.02783203125,
"learning_rate": 1e-06,
"loss": -0.0085,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 309.703125,
"epoch": 0.5487646293888166,
"grad_norm": 1.0654026235581544,
"kl": 0.031982421875,
"learning_rate": 1e-06,
"loss": -0.0069,
"reward": 0.5554362535476685,
"reward_std": 0.45049290359020233,
"rewards/preference_model_reward": 0.5554362535476685,
"rewards/preference_model_reward/std": 0.4461488127708435,
"step": 211
},
{
"clip_ratio": 9.597632015356794e-05,
"epoch": 0.5513654096228868,
"grad_norm": 1.0653292288478493,
"kl": 0.032470703125,
"learning_rate": 1e-06,
"loss": -0.0068,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 348.078125,
"epoch": 0.5539661898569571,
"grad_norm": 0.8314788226660293,
"kl": 0.02728271484375,
"learning_rate": 1e-06,
"loss": -0.0033,
"reward": 0.842810720205307,
"reward_std": 0.2719964236021042,
"rewards/preference_model_reward": 0.842810720205307,
"rewards/preference_model_reward/std": 0.317636638879776,
"step": 213
},
{
"clip_ratio": 0.0002034323406405747,
"epoch": 0.5565669700910273,
"grad_norm": 0.8392092498413791,
"kl": 0.0277099609375,
"learning_rate": 1e-06,
"loss": -0.0033,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 389.40625,
"epoch": 0.5591677503250976,
"grad_norm": 0.9192249339442864,
"kl": 0.03179931640625,
"learning_rate": 1e-06,
"loss": -0.0103,
"reward": 0.622369259595871,
"reward_std": 0.35231079161167145,
"rewards/preference_model_reward": 0.622369259595871,
"rewards/preference_model_reward/std": 0.38653427362442017,
"step": 215
},
{
"clip_ratio": 0.0005518103025679011,
"epoch": 0.5617685305591678,
"grad_norm": 0.9297117283091114,
"kl": 0.0323486328125,
"learning_rate": 1e-06,
"loss": -0.0103,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 419.859375,
"epoch": 0.5643693107932379,
"grad_norm": 0.9336213718132743,
"kl": 0.0318603515625,
"learning_rate": 1e-06,
"loss": 0.0165,
"reward": 0.7810149788856506,
"reward_std": 0.30521372705698013,
"rewards/preference_model_reward": 0.7810149788856506,
"rewards/preference_model_reward/std": 0.3358805924654007,
"step": 217
},
{
"clip_ratio": 0.00014426001871470362,
"epoch": 0.5669700910273082,
"grad_norm": 0.9051217250973354,
"kl": 0.03265380859375,
"learning_rate": 1e-06,
"loss": 0.0165,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 320.59375,
"epoch": 0.5695708712613784,
"grad_norm": 0.7224839222870695,
"kl": 0.02740478515625,
"learning_rate": 1e-06,
"loss": -0.0023,
"reward": 0.7635192573070526,
"reward_std": 0.324033185839653,
"rewards/preference_model_reward": 0.7635192573070526,
"rewards/preference_model_reward/std": 0.353701576590538,
"step": 219
},
{
"clip_ratio": 0.0002873854355129879,
"epoch": 0.5721716514954487,
"grad_norm": 0.7595301461919822,
"kl": 0.0274658203125,
"learning_rate": 1e-06,
"loss": -0.0024,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 388.3125,
"epoch": 0.5747724317295189,
"grad_norm": 0.9070734191648187,
"kl": 0.025146484375,
"learning_rate": 1e-06,
"loss": -0.0015,
"reward": 0.7658654153347015,
"reward_std": 0.2642327696084976,
"rewards/preference_model_reward": 0.7658654153347015,
"rewards/preference_model_reward/std": 0.3203909620642662,
"step": 221
},
{
"clip_ratio": 0.00011779637134168297,
"epoch": 0.577373211963589,
"grad_norm": 0.7137387787434728,
"kl": 0.02520751953125,
"learning_rate": 1e-06,
"loss": -0.0015,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 256.203125,
"epoch": 0.5799739921976593,
"grad_norm": 0.5417591118857858,
"kl": 0.0323486328125,
"learning_rate": 1e-06,
"loss": -0.0027,
"reward": 0.7153788208961487,
"reward_std": 0.29760105162858963,
"rewards/preference_model_reward": 0.7153788208961487,
"rewards/preference_model_reward/std": 0.3605159521102905,
"step": 223
},
{
"clip_ratio": 0.0002042335836449638,
"epoch": 0.5825747724317295,
"grad_norm": 0.541117637465139,
"kl": 0.032470703125,
"learning_rate": 1e-06,
"loss": -0.0027,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 414.75,
"epoch": 0.5851755526657998,
"grad_norm": 0.7271472193596403,
"kl": 0.02532958984375,
"learning_rate": 1e-06,
"loss": 0.0076,
"reward": 0.37321533262729645,
"reward_std": 0.22989524900913239,
"rewards/preference_model_reward": 0.37321533262729645,
"rewards/preference_model_reward/std": 0.4283030182123184,
"step": 225
},
{
"clip_ratio": 0.0003217906632926315,
"epoch": 0.58777633289987,
"grad_norm": 0.7195866194223471,
"kl": 0.025634765625,
"learning_rate": 1e-06,
"loss": 0.0075,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 323.0625,
"epoch": 0.5903771131339401,
"grad_norm": 0.8041443917617668,
"kl": 0.03070068359375,
"learning_rate": 1e-06,
"loss": -0.0025,
"reward": 0.7684324085712433,
"reward_std": 0.36028069257736206,
"rewards/preference_model_reward": 0.7684324085712433,
"rewards/preference_model_reward/std": 0.3624133765697479,
"step": 227
},
{
"clip_ratio": 0.00026055803027702495,
"epoch": 0.5929778933680104,
"grad_norm": 0.8283620393853188,
"kl": 0.03125,
"learning_rate": 1e-06,
"loss": -0.0024,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 283.03125,
"epoch": 0.5955786736020806,
"grad_norm": 0.6860393905348983,
"kl": 0.031494140625,
"learning_rate": 1e-06,
"loss": -0.0018,
"reward": 0.7659209966659546,
"reward_std": 0.26482800394296646,
"rewards/preference_model_reward": 0.7659209966659546,
"rewards/preference_model_reward/std": 0.37613604962825775,
"step": 229
},
{
"clip_ratio": 0.0002554464590502903,
"epoch": 0.5981794538361509,
"grad_norm": 0.6751506226066358,
"kl": 0.03173828125,
"learning_rate": 1e-06,
"loss": -0.0019,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 305.890625,
"epoch": 0.6007802340702211,
"grad_norm": 0.6931747416922038,
"kl": 0.03240966796875,
"learning_rate": 1e-06,
"loss": -0.0027,
"reward": 0.7967671155929565,
"reward_std": 0.3124672695994377,
"rewards/preference_model_reward": 0.7967671155929565,
"rewards/preference_model_reward/std": 0.3221246153116226,
"step": 231
},
{
"clip_ratio": 0.00034807444899342954,
"epoch": 0.6033810143042913,
"grad_norm": 0.8871992669497728,
"kl": 0.03277587890625,
"learning_rate": 1e-06,
"loss": -0.0027,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 272.71875,
"epoch": 0.6059817945383615,
"grad_norm": 0.22600317259271924,
"kl": 0.03021240234375,
"learning_rate": 1e-06,
"loss": -0.001,
"reward": 0.7308064997196198,
"reward_std": 0.09175470843911171,
"rewards/preference_model_reward": 0.7308064997196198,
"rewards/preference_model_reward/std": 0.3288180008530617,
"step": 233
},
{
"clip_ratio": 0.00035572612250689417,
"epoch": 0.6085825747724317,
"grad_norm": 0.2269448326022545,
"kl": 0.0306396484375,
"learning_rate": 1e-06,
"loss": -0.001,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 288.0,
"epoch": 0.611183355006502,
"grad_norm": 0.5678201319537751,
"kl": 0.0279541015625,
"learning_rate": 1e-06,
"loss": -0.0027,
"reward": 0.7927112579345703,
"reward_std": 0.2680581137537956,
"rewards/preference_model_reward": 0.7927112579345703,
"rewards/preference_model_reward/std": 0.29819803684949875,
"step": 235
},
{
"clip_ratio": 0.0005594654503511265,
"epoch": 0.6137841352405722,
"grad_norm": 0.5696451305988013,
"kl": 0.0277099609375,
"learning_rate": 1e-06,
"loss": -0.0027,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 387.71875,
"epoch": 0.6163849154746424,
"grad_norm": 0.2826076441567154,
"kl": 0.0311279296875,
"learning_rate": 1e-06,
"loss": 0.0002,
"reward": 0.9802200198173523,
"reward_std": 0.0791199654340744,
"rewards/preference_model_reward": 0.9802200198173523,
"rewards/preference_model_reward/std": 0.08191458880901337,
"step": 237
},
{
"clip_ratio": 0.0001503910607425496,
"epoch": 0.6189856957087126,
"grad_norm": 0.27566046682842354,
"kl": 0.03076171875,
"learning_rate": 1e-06,
"loss": 0.0002,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 278.359375,
"epoch": 0.6215864759427828,
"grad_norm": 0.6496794092367957,
"kl": 0.0299072265625,
"learning_rate": 1e-06,
"loss": 0.0041,
"reward": 0.8209348022937775,
"reward_std": 0.2692207768559456,
"rewards/preference_model_reward": 0.8209348022937775,
"rewards/preference_model_reward/std": 0.3124052509665489,
"step": 239
},
{
"clip_ratio": 0.00021162991833989508,
"epoch": 0.6241872561768531,
"grad_norm": 0.6797772811922748,
"kl": 0.02978515625,
"learning_rate": 1e-06,
"loss": 0.0041,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 290.09375,
"epoch": 0.6267880364109233,
"grad_norm": 0.6610475273967841,
"kl": 0.03094482421875,
"learning_rate": 1e-06,
"loss": -0.0017,
"reward": 0.5471342355012894,
"reward_std": 0.3384169638156891,
"rewards/preference_model_reward": 0.5471342355012894,
"rewards/preference_model_reward/std": 0.3545994460582733,
"step": 241
},
{
"clip_ratio": 0.00036541650479193777,
"epoch": 0.6293888166449935,
"grad_norm": 0.6636307821447273,
"kl": 0.03094482421875,
"learning_rate": 1e-06,
"loss": -0.0018,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 314.640625,
"epoch": 0.6319895968790638,
"grad_norm": 1.007770555329949,
"kl": 0.0347900390625,
"learning_rate": 1e-06,
"loss": -0.0072,
"reward": 0.8875998258590698,
"reward_std": 0.22977813333272934,
"rewards/preference_model_reward": 0.8875998258590698,
"rewards/preference_model_reward/std": 0.26288190484046936,
"step": 243
},
{
"clip_ratio": 0.00036119218566454947,
"epoch": 0.6345903771131339,
"grad_norm": 0.5276320493290289,
"kl": 0.0350341796875,
"learning_rate": 1e-06,
"loss": -0.0072,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 263.828125,
"epoch": 0.6371911573472041,
"grad_norm": 0.6961169112868046,
"kl": 0.03265380859375,
"learning_rate": 1e-06,
"loss": 0.0047,
"reward": 0.7485232055187225,
"reward_std": 0.27340711653232574,
"rewards/preference_model_reward": 0.7485232055187225,
"rewards/preference_model_reward/std": 0.35039061307907104,
"step": 245
},
{
"clip_ratio": 0.000293049102765508,
"epoch": 0.6397919375812744,
"grad_norm": 0.9025214332063679,
"kl": 0.0330810546875,
"learning_rate": 1e-06,
"loss": 0.0047,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 362.390625,
"epoch": 0.6423927178153446,
"grad_norm": 0.9207891120898004,
"kl": 0.034423828125,
"learning_rate": 1e-06,
"loss": 0.0009,
"reward": 0.6746810376644135,
"reward_std": 0.37852445244789124,
"rewards/preference_model_reward": 0.6746810376644135,
"rewards/preference_model_reward/std": 0.39785870909690857,
"step": 247
},
{
"clip_ratio": 0.0004364640772109851,
"epoch": 0.6449934980494149,
"grad_norm": 0.9428596077830829,
"kl": 0.0345458984375,
"learning_rate": 1e-06,
"loss": 0.0008,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 355.953125,
"epoch": 0.647594278283485,
"grad_norm": 0.9002426185767372,
"kl": 0.0325927734375,
"learning_rate": 1e-06,
"loss": 0.0043,
"reward": 0.7399424314498901,
"reward_std": 0.34358392655849457,
"rewards/preference_model_reward": 0.7399424314498901,
"rewards/preference_model_reward/std": 0.41077572107315063,
"step": 249
},
{
"clip_ratio": 0.0002715677837841213,
"epoch": 0.6501950585175552,
"grad_norm": 0.9388574486276239,
"kl": 0.033447265625,
"learning_rate": 1e-06,
"loss": 0.0043,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 275.640625,
"epoch": 0.6527958387516255,
"grad_norm": 0.6358294351644407,
"kl": 0.031005859375,
"learning_rate": 1e-06,
"loss": -0.008,
"reward": 0.7702620029449463,
"reward_std": 0.30297737568616867,
"rewards/preference_model_reward": 0.7702620029449463,
"rewards/preference_model_reward/std": 0.31037599593400955,
"step": 251
},
{
"clip_ratio": 7.407407247228548e-05,
"epoch": 0.6553966189856957,
"grad_norm": 0.6496709525090261,
"kl": 0.03118896484375,
"learning_rate": 1e-06,
"loss": -0.008,
"step": 252
},
{
"clip_ratio": 0.0,
"completion_length": 383.171875,
"epoch": 0.657997399219766,
"grad_norm": 0.9259779511089412,
"kl": 0.0374755859375,
"learning_rate": 1e-06,
"loss": 0.0011,
"reward": 0.7584672272205353,
"reward_std": 0.34140945971012115,
"rewards/preference_model_reward": 0.7584672272205353,
"rewards/preference_model_reward/std": 0.36738522350788116,
"step": 253
},
{
"clip_ratio": 0.00042696832679212093,
"epoch": 0.6605981794538361,
"grad_norm": 0.9195597011995847,
"kl": 0.0379638671875,
"learning_rate": 1e-06,
"loss": 0.0011,
"step": 254
},
{
"clip_ratio": 0.0,
"completion_length": 414.90625,
"epoch": 0.6631989596879063,
"grad_norm": 0.7640236735659361,
"kl": 0.0379638671875,
"learning_rate": 1e-06,
"loss": -0.0019,
"reward": 0.8538275361061096,
"reward_std": 0.23895438015460968,
"rewards/preference_model_reward": 0.8538275361061096,
"rewards/preference_model_reward/std": 0.3124992400407791,
"step": 255
},
{
"clip_ratio": 0.00029875659674871713,
"epoch": 0.6657997399219766,
"grad_norm": 0.9201327061263956,
"kl": 0.0382080078125,
"learning_rate": 1e-06,
"loss": -0.0018,
"step": 256
},
{
"clip_ratio": 0.0,
"completion_length": 360.078125,
"epoch": 0.6684005201560468,
"grad_norm": 0.627065020941867,
"kl": 0.041015625,
"learning_rate": 1e-06,
"loss": -0.01,
"reward": 0.8764857351779938,
"reward_std": 0.20473513007164001,
"rewards/preference_model_reward": 0.8764857351779938,
"rewards/preference_model_reward/std": 0.20165851712226868,
"step": 257
},
{
"clip_ratio": 0.00010048231342807412,
"epoch": 0.6710013003901171,
"grad_norm": 0.6604841172659577,
"kl": 0.04052734375,
"learning_rate": 1e-06,
"loss": -0.01,
"step": 258
},
{
"clip_ratio": 0.0,
"completion_length": 343.15625,
"epoch": 0.6736020806241872,
"grad_norm": 0.5406956351974641,
"kl": 0.0341796875,
"learning_rate": 1e-06,
"loss": 0.0003,
"reward": 0.6130577623844147,
"reward_std": 0.18850323930382729,
"rewards/preference_model_reward": 0.6130577623844147,
"rewards/preference_model_reward/std": 0.4316726624965668,
"step": 259
},
{
"clip_ratio": 0.00025136396288871765,
"epoch": 0.6762028608582574,
"grad_norm": 0.5511157087672558,
"kl": 0.0343017578125,
"learning_rate": 1e-06,
"loss": 0.0003,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 425.421875,
"epoch": 0.6788036410923277,
"grad_norm": 0.9136732590296446,
"kl": 0.0321044921875,
"learning_rate": 1e-06,
"loss": 0.0049,
"reward": 0.7294110059738159,
"reward_std": 0.36171969771385193,
"rewards/preference_model_reward": 0.7294110059738159,
"rewards/preference_model_reward/std": 0.3690430223941803,
"step": 261
},
{
"clip_ratio": 0.0002961536665679887,
"epoch": 0.6814044213263979,
"grad_norm": 0.9224194439572596,
"kl": 0.03228759765625,
"learning_rate": 1e-06,
"loss": 0.0049,
"step": 262
},
{
"clip_ratio": 0.0,
"completion_length": 485.265625,
"epoch": 0.6840052015604682,
"grad_norm": 1.0125086260119556,
"kl": 0.02874755859375,
"learning_rate": 1e-06,
"loss": 0.0151,
"reward": 0.6575124561786652,
"reward_std": 0.3728269934654236,
"rewards/preference_model_reward": 0.6575124561786652,
"rewards/preference_model_reward/std": 0.377619668841362,
"step": 263
},
{
"clip_ratio": 0.00027208159008296207,
"epoch": 0.6866059817945384,
"grad_norm": 0.9988921149577585,
"kl": 0.02899169921875,
"learning_rate": 1e-06,
"loss": 0.015,
"step": 264
},
{
"clip_ratio": 0.0,
"completion_length": 396.328125,
"epoch": 0.6892067620286085,
"grad_norm": 0.48044088482562075,
"kl": 0.03973388671875,
"learning_rate": 1e-06,
"loss": -0.0047,
"reward": 0.5718486905097961,
"reward_std": 0.15403037518262863,
"rewards/preference_model_reward": 0.5718486905097961,
"rewards/preference_model_reward/std": 0.452066108584404,
"step": 265
},
{
"clip_ratio": 0.0002960472193080932,
"epoch": 0.6918075422626788,
"grad_norm": 0.43607234235741066,
"kl": 0.04034423828125,
"learning_rate": 1e-06,
"loss": -0.0047,
"step": 266
},
{
"clip_ratio": 0.0,
"completion_length": 407.84375,
"epoch": 0.694408322496749,
"grad_norm": 0.05537902433689471,
"kl": 0.0294189453125,
"learning_rate": 1e-06,
"loss": -0.0003,
"reward": 0.750793993473053,
"reward_std": 0.02123763016425073,
"rewards/preference_model_reward": 0.750793993473053,
"rewards/preference_model_reward/std": 0.27511218935251236,
"step": 267
},
{
"clip_ratio": 0.0,
"epoch": 0.6970091027308193,
"grad_norm": 0.05537572405853725,
"kl": 0.0291748046875,
"learning_rate": 1e-06,
"loss": -0.0003,
"step": 268
},
{
"clip_ratio": 0.0,
"completion_length": 406.765625,
"epoch": 0.6996098829648895,
"grad_norm": 0.859848076243301,
"kl": 0.0335693359375,
"learning_rate": 1e-06,
"loss": 0.0232,
"reward": 0.7944641709327698,
"reward_std": 0.25520364195108414,
"rewards/preference_model_reward": 0.7944641709327698,
"rewards/preference_model_reward/std": 0.28688880801200867,
"step": 269
},
{
"clip_ratio": 0.00023486852296628058,
"epoch": 0.7022106631989596,
"grad_norm": 0.8205533173715761,
"kl": 0.0333251953125,
"learning_rate": 1e-06,
"loss": 0.0233,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 281.796875,
"epoch": 0.7048114434330299,
"grad_norm": 0.36632377721796217,
"kl": 0.0333251953125,
"learning_rate": 1e-06,
"loss": -0.001,
"reward": 0.48682448267936707,
"reward_std": 0.16525335051119328,
"rewards/preference_model_reward": 0.48682448267936707,
"rewards/preference_model_reward/std": 0.4657934308052063,
"step": 271
},
{
"clip_ratio": 0.0002138561540050432,
"epoch": 0.7074122236671001,
"grad_norm": 0.38569263532759096,
"kl": 0.03302001953125,
"learning_rate": 1e-06,
"loss": -0.001,
"step": 272
},
{
"clip_ratio": 0.0,
"completion_length": 277.09375,
"epoch": 0.7100130039011704,
"grad_norm": 0.5414845135880526,
"kl": 0.03057861328125,
"learning_rate": 1e-06,
"loss": -0.0044,
"reward": 0.8333463966846466,
"reward_std": 0.21173010021448135,
"rewards/preference_model_reward": 0.8333463966846466,
"rewards/preference_model_reward/std": 0.3135555535554886,
"step": 273
},
{
"clip_ratio": 0.00042767466220539063,
"epoch": 0.7126137841352406,
"grad_norm": 0.5814268658836278,
"kl": 0.0303955078125,
"learning_rate": 1e-06,
"loss": -0.0044,
"step": 274
},
{
"clip_ratio": 0.0,
"completion_length": 300.625,
"epoch": 0.7152145643693107,
"grad_norm": 0.29110876248573814,
"kl": 0.0308837890625,
"learning_rate": 1e-06,
"loss": -0.0006,
"reward": 0.9611281454563141,
"reward_std": 0.0763670519227162,
"rewards/preference_model_reward": 0.9611281454563141,
"rewards/preference_model_reward/std": 0.1133881090208888,
"step": 275
},
{
"clip_ratio": 0.00015091049135662615,
"epoch": 0.717815344603381,
"grad_norm": 0.2924332763673859,
"kl": 0.03094482421875,
"learning_rate": 1e-06,
"loss": -0.0006,
"step": 276
},
{
"clip_ratio": 0.0,
"completion_length": 367.765625,
"epoch": 0.7204161248374512,
"grad_norm": 0.6810325664521416,
"kl": 0.03387451171875,
"learning_rate": 1e-06,
"loss": -0.004,
"reward": 0.5777665078639984,
"reward_std": 0.2041564560495317,
"rewards/preference_model_reward": 0.5777665078639984,
"rewards/preference_model_reward/std": 0.4507894814014435,
"step": 277
},
{
"clip_ratio": 0.0004186662699794397,
"epoch": 0.7230169050715215,
"grad_norm": 0.6837062367984963,
"kl": 0.033935546875,
"learning_rate": 1e-06,
"loss": -0.0039,
"step": 278
},
{
"clip_ratio": 0.0,
"completion_length": 193.3125,
"epoch": 0.7256176853055917,
"grad_norm": 0.4916940767331736,
"kl": 0.0399169921875,
"learning_rate": 1e-06,
"loss": -0.0046,
"reward": 0.7320626676082611,
"reward_std": 0.30857832729816437,
"rewards/preference_model_reward": 0.7320626676082611,
"rewards/preference_model_reward/std": 0.3521760255098343,
"step": 279
},
{
"clip_ratio": 0.00017064846178982407,
"epoch": 0.7282184655396619,
"grad_norm": 0.49237527836693745,
"kl": 0.0400390625,
"learning_rate": 1e-06,
"loss": -0.0046,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 347.984375,
"epoch": 0.7308192457737321,
"grad_norm": 0.7857224409730065,
"kl": 0.0343017578125,
"learning_rate": 1e-06,
"loss": 0.0194,
"reward": 0.6453294306993484,
"reward_std": 0.33257874846458435,
"rewards/preference_model_reward": 0.6453294306993484,
"rewards/preference_model_reward/std": 0.33213719725608826,
"step": 281
},
{
"clip_ratio": 0.000580175816139672,
"epoch": 0.7334200260078023,
"grad_norm": 0.7785268642528227,
"kl": 0.03460693359375,
"learning_rate": 1e-06,
"loss": 0.0194,
"step": 282
},
{
"clip_ratio": 0.0,
"completion_length": 460.59375,
"epoch": 0.7360208062418726,
"grad_norm": 0.4261062042986948,
"kl": 0.0321044921875,
"learning_rate": 1e-06,
"loss": -0.0108,
"reward": 0.943671464920044,
"reward_std": 0.15016759932041168,
"rewards/preference_model_reward": 0.943671464920044,
"rewards/preference_model_reward/std": 0.21713975816965103,
"step": 283
},
{
"clip_ratio": 5.9215664805378765e-05,
"epoch": 0.7386215864759428,
"grad_norm": 0.42689251687876145,
"kl": 0.0323486328125,
"learning_rate": 1e-06,
"loss": -0.0108,
"step": 284
},
{
"clip_ratio": 0.0,
"completion_length": 398.328125,
"epoch": 0.741222366710013,
"grad_norm": 0.8784480579767534,
"kl": 0.0347900390625,
"learning_rate": 1e-06,
"loss": -0.0157,
"reward": 0.7145366668701172,
"reward_std": 0.3674587905406952,
"rewards/preference_model_reward": 0.7145366668701172,
"rewards/preference_model_reward/std": 0.37918268144130707,
"step": 285
},
{
"clip_ratio": 0.00027935014077229425,
"epoch": 0.7438231469440832,
"grad_norm": 0.8835084511138045,
"kl": 0.03515625,
"learning_rate": 1e-06,
"loss": -0.0157,
"step": 286
},
{
"clip_ratio": 0.0,
"completion_length": 307.59375,
"epoch": 0.7464239271781534,
"grad_norm": 0.4936033601748067,
"kl": 0.0343017578125,
"learning_rate": 1e-06,
"loss": -0.0056,
"reward": 0.876055896282196,
"reward_std": 0.22134239226579666,
"rewards/preference_model_reward": 0.876055896282196,
"rewards/preference_model_reward/std": 0.27353301644325256,
"step": 287
},
{
"clip_ratio": 0.00022383942268788815,
"epoch": 0.7490247074122237,
"grad_norm": 0.48882075708940054,
"kl": 0.0345458984375,
"learning_rate": 1e-06,
"loss": -0.0056,
"step": 288
},
{
"clip_ratio": 0.0,
"completion_length": 302.296875,
"epoch": 0.7516254876462939,
"grad_norm": 0.6099100220796305,
"kl": 0.03253173828125,
"learning_rate": 1e-06,
"loss": 0.0018,
"reward": 0.6450457721948624,
"reward_std": 0.16428439319133759,
"rewards/preference_model_reward": 0.6450457721948624,
"rewards/preference_model_reward/std": 0.4192444086074829,
"step": 289
},
{
"clip_ratio": 5.056634472566657e-05,
"epoch": 0.7542262678803641,
"grad_norm": 0.6204674610660649,
"kl": 0.0330810546875,
"learning_rate": 1e-06,
"loss": 0.0018,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 273.171875,
"epoch": 0.7568270481144344,
"grad_norm": 0.4785669921706492,
"kl": 0.0518798828125,
"learning_rate": 1e-06,
"loss": -0.0009,
"reward": 0.8897781372070312,
"reward_std": 0.1889289878308773,
"rewards/preference_model_reward": 0.8897781372070312,
"rewards/preference_model_reward/std": 0.23168149590492249,
"step": 291
},
{
"clip_ratio": 0.0002485646546119824,
"epoch": 0.7594278283485045,
"grad_norm": 0.47439042676333165,
"kl": 0.05322265625,
"learning_rate": 1e-06,
"loss": -0.0009,
"step": 292
},
{
"clip_ratio": 0.0,
"completion_length": 399.4375,
"epoch": 0.7620286085825748,
"grad_norm": 0.9071650770110183,
"kl": 0.037353515625,
"learning_rate": 1e-06,
"loss": 0.0054,
"reward": 0.5175963789224625,
"reward_std": 0.266024149954319,
"rewards/preference_model_reward": 0.5175963789224625,
"rewards/preference_model_reward/std": 0.4135463237762451,
"step": 293
},
{
"clip_ratio": 0.00034057936863973737,
"epoch": 0.764629388816645,
"grad_norm": 0.8822613784734961,
"kl": 0.037353515625,
"learning_rate": 1e-06,
"loss": 0.0054,
"step": 294
},
{
"clip_ratio": 0.0,
"completion_length": 338.1875,
"epoch": 0.7672301690507152,
"grad_norm": 0.7861793558844458,
"kl": 0.035400390625,
"learning_rate": 1e-06,
"loss": -0.0005,
"reward": 0.7742494642734528,
"reward_std": 0.2848004475235939,
"rewards/preference_model_reward": 0.7742494642734528,
"rewards/preference_model_reward/std": 0.3529687523841858,
"step": 295
},
{
"clip_ratio": 0.0001368406847177539,
"epoch": 0.7698309492847855,
"grad_norm": 0.7910539014156112,
"kl": 0.03515625,
"learning_rate": 1e-06,
"loss": -0.0005,
"step": 296
},
{
"clip_ratio": 0.0,
"completion_length": 272.359375,
"epoch": 0.7724317295188556,
"grad_norm": 0.5546679508846072,
"kl": 0.0374755859375,
"learning_rate": 1e-06,
"loss": -0.005,
"reward": 0.8659342527389526,
"reward_std": 0.1929013878107071,
"rewards/preference_model_reward": 0.8659342527389526,
"rewards/preference_model_reward/std": 0.2005062848329544,
"step": 297
},
{
"clip_ratio": 9.600614430382848e-05,
"epoch": 0.7750325097529259,
"grad_norm": 0.49104877942341174,
"kl": 0.037353515625,
"learning_rate": 1e-06,
"loss": -0.005,
"step": 298
},
{
"clip_ratio": 0.0,
"completion_length": 235.375,
"epoch": 0.7776332899869961,
"grad_norm": 0.5420595207231925,
"kl": 0.044921875,
"learning_rate": 1e-06,
"loss": -0.0076,
"reward": 0.7609247267246246,
"reward_std": 0.2636025846004486,
"rewards/preference_model_reward": 0.7609247267246246,
"rewards/preference_model_reward/std": 0.31968455016613007,
"step": 299
},
{
"clip_ratio": 0.00036888108297716826,
"epoch": 0.7802340702210663,
"grad_norm": 0.5424021192905694,
"kl": 0.044921875,
"learning_rate": 1e-06,
"loss": -0.0076,
"step": 300
}
],
"logging_steps": 1,
"max_steps": 1280,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 150,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}