NTQuoc commited on
Commit
13e0221
·
verified ·
1 Parent(s): 5b78d4c

Model save

Browse files
README.md CHANGED
@@ -1,11 +1,9 @@
1
  ---
2
- base_model: Qwen/Qwen3.5-0.8B
3
- datasets: knoveleng/open-rs
4
  library_name: transformers
5
  model_name: OpenRS-GRPO
6
  tags:
7
  - generated_from_trainer
8
- - open-r1
9
  - trl
10
  - grpo
11
  licence: license
@@ -13,7 +11,7 @@ licence: license
13
 
14
  # Model Card for OpenRS-GRPO
15
 
16
- This model is a fine-tuned version of [Qwen/Qwen3.5-0.8B](https://huggingface.co/Qwen/Qwen3.5-0.8B) on the [knoveleng/open-rs](https://huggingface.co/datasets/knoveleng/open-rs) dataset.
17
  It has been trained using [TRL](https://github.com/huggingface/trl).
18
 
19
  ## Quick start
 
1
  ---
2
+ base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
 
3
  library_name: transformers
4
  model_name: OpenRS-GRPO
5
  tags:
6
  - generated_from_trainer
 
7
  - trl
8
  - grpo
9
  licence: license
 
11
 
12
  # Model Card for OpenRS-GRPO
13
 
14
+ This model is a fine-tuned version of [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B).
15
  It has been trained using [TRL](https://github.com/huggingface/trl).
16
 
17
  ## Quick start
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 4.023313522338867e-07,
4
- "train_runtime": 537.7965,
5
  "train_samples": 7000,
6
- "train_samples_per_second": 0.015,
7
- "train_steps_per_second": 0.002
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.0013520326372236013,
4
+ "train_runtime": 36067.9715,
5
  "train_samples": 7000,
6
+ "train_samples_per_second": 0.044,
7
+ "train_steps_per_second": 0.003
8
  }
step_metrics.csv CHANGED
@@ -1,3 +1,102 @@
1
  step,epoch,loss,learning_rate,grad_norm,rewards/format_reward,rewards/cosine_scaled_reward,reward,reward_std,gpu_mem_alloc_mb,gpu_mem_peak_mb,step_time_sec
2
- 1,0.0006,4.023313522338867e-07,0.0,,0.0,-0.40046167373657227,-0.8009233474731445,0.2397190211340785,1549.0,3351.1,533.87
3
- 1,0.0006,,,,,,,,1549.0,3351.1,537.79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  step,epoch,loss,learning_rate,grad_norm,rewards/format_reward,rewards/cosine_scaled_reward,reward,reward_std,gpu_mem_alloc_mb,gpu_mem_peak_mb,step_time_sec
2
+ 1,0.0006,3.3527612686157227e-08,0.0,,0.0,-0.45186813920736313,-0.9037362784147263,0.11632500402629375,3617.5,6086.5,356.01
3
+ 2,0.0011,-2.9802322387695312e-08,0.0,,0.0,-0.4544064328074455,-0.9088128805160522,0.134795643389225,3617.5,6091.3,357.14
4
+ 3,0.0017,-2.384185791015625e-07,0.0,,0.0,-0.44806819409132004,-0.8961364179849625,0.1266492954455316,3617.5,6092.8,356.69
5
+ 4,0.0023,3.203749656677246e-07,0.0,,0.0,-0.46924517303705215,-0.9384903311729431,0.08165389159694314,3617.5,6095.2,358.13
6
+ 5,0.0029,2.980232238769531e-07,0.0,,0.0,-0.4361194893717766,-0.8722389936447144,0.1861576708033681,3617.5,6095.2,355.88
7
+ 6,0.0034,3.2782554626464844e-07,0.0,,0.0,-0.4838990569114685,-0.9677980989217758,0.03695695102214813,3617.5,6102.0,359.97
8
+ 7,0.004,-6.48200511932373e-07,0.0,,0.0,-0.46913372725248337,-0.9382674694061279,0.04741490981541574,3617.5,6102.0,366.73
9
+ 8,0.0046,-1.9371509552001953e-07,0.0,,0.0,-0.4689921736717224,-0.9379843473434448,0.08557501714676619,3617.5,6102.0,359.73
10
+ 9,0.0051,6.258487701416016e-07,0.0,,0.0,-0.47829224169254303,-0.9565844535827637,0.04161944845691323,3617.5,6102.0,356.95
11
+ 10,0.0057,0.0,0.0,,0.0,-0.39301924407482147,-0.7860384881496429,0.29622524976730347,3654.4,6102.0,356.9
12
+ 11,0.0063,-1.2665987014770508e-06,1.0000000000000002e-06,,0.0,-0.47354312986135483,-0.9470862597227097,0.025707244174554944,3654.4,6129.7,357.65
13
+ 12,0.0069,-3.2782554626464844e-07,1.0000000000000002e-06,,0.0,-0.4611263796687126,-0.9222527593374252,0.11853919085115194,3654.4,6132.1,359.0
14
+ 13,0.0074,-9.834766387939453e-07,2.0000000000000003e-06,,0.0,-0.40241140127182007,-0.8048228025436401,0.2713719364255667,3654.4,6132.1,359.35
15
+ 14,0.008,-9.98377799987793e-07,3e-06,,0.0,-0.4491143301129341,-0.8982286602258682,0.15008432138711214,3654.4,6132.1,359.42
16
+ 15,0.0086,-1.2367963790893555e-06,4.000000000000001e-06,,0.0,-0.4349117949604988,-0.8698235750198364,0.1454296549782157,3654.4,6132.1,358.18
17
+ 16,0.0091,-1.166015863418579e-06,5e-06,,0.0,-0.4481472671031952,-0.8962945342063904,0.11415091808885336,3654.4,6136.0,359.05
18
+ 17,0.0097,-4.172325134277344e-07,6e-06,,0.0,-0.40375038236379623,-0.8075007796287537,0.2773375315591693,3654.4,6136.0,359.81
19
+ 18,0.0103,1.0356307029724121e-06,7e-06,,0.0,-0.4421778917312622,-0.8843557834625244,0.1459937175968662,3654.4,6136.0,361.88
20
+ 19,0.0109,-1.4901161193847656e-08,8.000000000000001e-06,,0.0,-0.46627073734998703,-0.9325414896011353,0.061190704349428415,3654.4,6136.0,361.95
21
+ 20,0.0114,7.450580596923828e-08,9e-06,,0.0,-0.4463533014059067,-0.8927065879106522,0.1468478236347437,3654.4,6136.0,361.89
22
+ 21,0.012,-3.241002559661865e-07,1e-05,,0.0,-0.430856853723526,-0.861713707447052,0.15973031288012862,3654.4,6136.0,360.43
23
+ 22,0.0126,0.0,9.997258721585931e-06,,0.0,-0.42839662730693817,-0.8567932546138763,0.23303062841296196,3654.4,6136.0,359.97
24
+ 23,0.0131,-1.9371509552001953e-07,9.98903822616921e-06,,0.0,-0.45275644212961197,-0.9055128693580627,0.13258774112910032,3654.4,6136.0,360.57
25
+ 24,0.0137,4.3958425521850586e-07,9.97534852915723e-06,,0.0,-0.46590057015419006,-0.9318011105060577,0.08933348534628749,3654.4,6136.0,362.48
26
+ 25,0.0143,-8.940696716308594e-08,9.956206309337067e-06,,0.0,-0.45125550776720047,-0.9025110006332397,0.15539621422067285,3654.4,6136.0,364.1
27
+ 26,0.0149,-2.8312206268310547e-07,9.931634888554937e-06,,0.0,-0.456791490316391,-0.9135829955339432,0.08904895093291998,3654.4,6136.0,365.75
28
+ 27,0.0154,-2.60770320892334e-06,9.901664203302126e-06,,0.0,-0.4751305654644966,-0.950261116027832,0.06268075766274706,3654.4,6136.0,370.26
29
+ 28,0.016,5.960464477539063e-08,9.866330768241984e-06,,0.0,-0.4367612153291702,-0.8735224008560181,0.16378713678568602,3654.4,6138.0,366.74
30
+ 29,0.0166,4.842877388000488e-07,9.825677631722436e-06,,0.0,-0.4282456487417221,-0.856491282582283,0.1944181639701128,3654.5,6138.0,364.94
31
+ 30,0.0171,1.0058283805847168e-06,9.779754323328192e-06,,0.0,-0.45244697481393814,-0.9048939347267151,0.1466370872221887,3654.4,6138.0,359.72
32
+ 31,0.0177,7.301568984985352e-07,9.728616793536588e-06,,0.0,-0.4787774831056595,-0.9575549513101578,0.04152237856760621,3654.4,6138.0,359.74
33
+ 32,0.0183,7.525086402893066e-07,9.672327345550544e-06,,0.0,-0.4640432074666023,-0.9280864149332047,0.05714223568793386,3654.4,6138.0,359.53
34
+ 33,0.0189,6.705522537231445e-07,9.610954559391704e-06,,0.0,-0.42499294877052307,-0.8499859273433685,0.13816553819924593,3654.5,6138.0,362.56
35
+ 34,0.0194,3.4868717193603516e-06,9.544573208346252e-06,,0.0,-0.4555760398507118,-0.9111520648002625,0.06401598325464875,3654.4,6138.0,362.18
36
+ 35,0.02,1.996755599975586e-06,9.473264167865172e-06,,0.0,-0.4616549611091614,-0.9233099222183228,0.07124835508875549,3654.4,6138.0,360.07
37
+ 36,0.0206,3.5315752029418945e-06,9.397114317029975e-06,,0.0,-0.43700823187828064,-0.8740164637565613,0.1880413582548499,3654.4,6138.0,358.61
38
+ 37,0.0211,0.1341422200202942,9.316216432703918e-06,,0.0,-0.40694746375083923,-0.8138948976993561,0.2712679710239172,3654.4,6138.0,357.92
39
+ 38,0.0217,4.366040229797363e-06,9.230669076497688e-06,,0.0,-0.4537286013364792,-0.9074572026729584,0.140884583350271,3654.5,6138.0,361.01
40
+ 39,0.0223,2.9802322387695312e-06,9.140576474687263e-06,,0.0,-0.4670984223484993,-0.9341968446969986,0.08859914634376764,3654.4,6138.0,358.53
41
+ 40,0.0229,3.2633543014526367e-06,9.046048391230248e-06,,0.0,-0.4738834798336029,-0.9477669596672058,0.03630512161180377,3654.4,6138.0,356.31
42
+ 41,0.0234,3.5390257835388184e-06,8.947199994035402e-06,,0.0,-0.48112839460372925,-0.9622567743062973,0.03198406333103776,3654.4,6138.0,356.46
43
+ 42,0.024,4.059635102748871e-06,8.844151714648274e-06,,0.0,-0.45688286423683167,-0.9137657284736633,0.09395218873396516,3654.4,6138.0,358.44
44
+ 43,0.0246,3.972090780735016e-06,8.737029101523931e-06,,0.0,-0.4356464073061943,-0.8712927997112274,0.17926698923110962,3654.4,6138.0,362.88
45
+ 44,0.0251,7.338821887969971e-06,8.625962667065488e-06,,0.0,-0.4162614122033119,-0.8325228244066238,0.22310001868754625,3654.4,6138.0,358.73
46
+ 45,0.0257,9.98377799987793e-06,8.511087728614863e-06,,0.0,-0.4377835765480995,-0.8755671381950378,0.1843216335400939,3654.4,6138.0,359.1
47
+ 46,0.0263,7.063150405883789e-06,8.392544243589428e-06,,0.0,-0.4647279307246208,-0.9294558465480804,0.08181617665104568,3654.4,6138.0,364.32
48
+ 47,0.0269,9.715557098388672e-06,8.270476638965463e-06,,0.0,-0.4818853959441185,-0.9637707620859146,0.04111184738576412,3654.4,6138.0,366.26
49
+ 48,0.0274,1.0699033737182617e-05,8.14503363531613e-06,,0.0,-0.47147445380687714,-0.9429489076137543,0.05583410756662488,3654.4,6138.0,367.58
50
+ 49,0.028,9.488314390182495e-06,8.016368065618361e-06,,0.0,-0.4839174821972847,-0.9678349643945694,0.031033652368932962,3654.4,6138.0,367.03
51
+ 50,0.0286,1.1995434761047363e-05,7.884636689049423e-06,,0.0,-0.4634588584303856,-0.92691770195961,0.06057529430836439,3654.4,6138.0,365.37
52
+ 51,0.0291,1.093745231628418e-05,7.75e-06,,0.0,-0.48549777269363403,-0.9709955900907516,0.023253681138157845,3654.5,6138.0,357.96
53
+ 52,0.0297,1.2755393981933594e-05,7.612622032536508e-06,,0.0,-0.4660480171442032,-0.932096004486084,0.0814181575551629,3654.5,6138.0,357.69
54
+ 53,0.0303,1.1809170246124268e-05,7.472670160550849e-06,,0.0,-0.42992351949214935,-0.8598470240831375,0.23317514825612307,3654.5,6138.0,360.51
55
+ 54,0.0309,1.2435019016265869e-05,7.330314893841102e-06,,0.0,-0.4458845555782318,-0.8917691111564636,0.16836319211870432,3654.5,6138.0,359.99
56
+ 55,0.0314,1.2367963790893555e-05,7.185729670371605e-06,,0.0,-0.46969927847385406,-0.9393985569477081,0.09125666646286845,3654.5,6138.0,357.92
57
+ 56,0.032,1.3027340173721313e-05,7.0390906449655104e-06,,0.0,-0.4778226688504219,-0.9556453377008438,0.03563447529450059,3654.4,6138.0,359.54
58
+ 57,0.0326,1.173466444015503e-05,6.890576474687264e-06,,0.0,-0.4766504615545273,-0.9533008933067322,0.047348865773528814,3654.5,6138.0,360.89
59
+ 58,0.0331,1.0944902896881104e-05,6.740368101176496e-06,,0.0,-0.4303411394357681,-0.8606822788715363,0.19956759549677372,3654.5,6138.0,357.96
60
+ 59,0.0337,1.4327466487884521e-05,6.588648530198505e-06,,0.0,-0.47175391018390656,-0.9435078203678131,0.028082083677873015,3654.4,6138.0,358.32
61
+ 60,0.0343,1.862645149230957e-05,6.4356026086799176e-06,,0.0,-0.4349738284945488,-0.8699476420879364,0.14623272977769375,3654.5,6138.0,358.57
62
+ 61,0.0349,1.6495585441589355e-05,6.281416799501188e-06,,0.0,-0.43933914601802826,-0.8786782920360565,0.14873503288254142,3654.5,6138.0,359.81
63
+ 62,0.0354,1.4573335647583008e-05,6.126278954320295e-06,,0.0,-0.47655102610588074,-0.9531020373106003,0.04140742728486657,3654.5,6138.0,363.55
64
+ 63,0.036,1.3574957847595215e-05,5.970378084704441e-06,,0.0,-0.4812774509191513,-0.9625549018383026,0.0323515310883522,3654.4,6138.0,361.67
65
+ 64,0.0366,1.7490237951278687e-05,5.813904131848565e-06,,0.0,-0.4819137006998062,-0.9638274163007736,0.03202465921640396,3654.5,6138.0,360.66
66
+ 65,0.0371,1.4990568161010742e-05,5.657047735161256e-06,,0.0,-0.4426536113023758,-0.8853072375059128,0.11870932951569557,3654.5,6138.0,360.47
67
+ 66,0.0377,1.6763806343078613e-05,5.500000000000001e-06,,0.0,-0.4815715327858925,-0.9631430506706238,0.027665999252349138,3654.4,6138.0,364.95
68
+ 67,0.0383,2.3305416107177734e-05,5.342952264838748e-06,,0.0,-0.46308349817991257,-0.9261669814586639,0.09819867718033493,3654.4,6138.0,369.54
69
+ 68,0.0389,2.4430453777313232e-05,5.186095868151436e-06,,0.0,-0.4661281928420067,-0.9322563856840134,0.06839151354506612,3654.4,6138.0,369.64
70
+ 69,0.0394,1.753866672515869e-05,5.02962191529556e-06,,0.0,-0.4736683666706085,-0.947336733341217,0.06602911371737719,3654.5,6138.0,365.41
71
+ 70,0.04,1.9043684005737305e-05,4.873721045679707e-06,,0.0,-0.440708264708519,-0.881416529417038,0.15695088542997837,3654.4,6138.0,364.8
72
+ 71,0.0406,2.4437904357910156e-05,4.718583200498814e-06,,0.0,-0.47764309495687485,-0.9552861750125885,0.04716450162231922,3654.4,6138.0,362.55
73
+ 72,0.0411,1.781061291694641e-05,4.564397391320085e-06,,0.0,-0.45879723131656647,-0.9175944626331329,0.07855925057083368,3654.4,6138.0,359.86
74
+ 73,0.0417,2.104230225086212e-05,4.4113514698014955e-06,,0.0,-0.47955460846424103,-0.9591091871261597,0.03032594360411167,3654.4,6138.0,354.97
75
+ 74,0.0423,1.8224120140075684e-05,4.259631898823504e-06,,0.0,-0.470923513174057,-0.9418470114469528,0.09771440364420414,3654.4,6138.0,359.71
76
+ 75,0.0429,1.697242259979248e-05,4.109423525312738e-06,,0.0,-0.4515884444117546,-0.9031769037246704,0.11431426065973938,3654.4,6138.0,359.51
77
+ 76,0.0434,1.5683472156524658e-05,3.960909355034491e-06,,0.0,-0.43553559482097626,-0.8710711896419525,0.16496195830404758,3654.4,6138.0,357.76
78
+ 77,0.044,2.2258609533309937e-05,3.8142703296283954e-06,,0.0,-0.418168805539608,-0.8363375961780548,0.23763815127313137,3654.4,6138.0,359.55
79
+ 78,0.0446,2.06679105758667e-05,3.6696851061589e-06,,0.0,-0.45626621693372726,-0.912532389163971,0.1374040930531919,3654.5,6138.0,359.56
80
+ 79,0.0451,3.053247928619385e-05,3.527329839449152e-06,,0.0,-0.478363037109375,-0.9567261040210724,0.03342599933966994,3654.4,6138.0,359.31
81
+ 80,0.0457,3.489106893539429e-05,3.3873779674634932e-06,,0.0,-0.43451904505491257,-0.8690381050109863,0.166263896971941,3654.5,6138.0,358.11
82
+ 81,0.0463,2.2970139980316162e-05,3.2500000000000015e-06,,0.0,-0.4851565733551979,-0.9703131467103958,0.018532322952523828,3654.4,6138.0,359.52
83
+ 82,0.0469,2.2605061531066895e-05,3.115363310950579e-06,,0.0,-0.4188460633158684,-0.8376921266317368,0.21680933889001608,3654.5,6138.0,360.43
84
+ 83,0.0474,2.580881118774414e-05,2.98363193438164e-06,,0.0,-0.45973849296569824,-0.9194770008325577,0.07644858444109559,3654.4,6138.0,357.84
85
+ 84,0.048,2.232193946838379e-05,2.854966364683872e-06,,0.0,-0.4607899561524391,-0.921579897403717,0.07768060895614326,3654.4,6138.0,364.25
86
+ 85,0.0486,2.230703830718994e-05,2.7295233610345384e-06,,0.0,-0.40651462972164154,-0.8130292594432831,0.21775285061448812,3654.4,6138.0,361.8
87
+ 86,0.0491,2.0213425159454346e-05,2.607455756410573e-06,,0.0,-0.4558562785387039,-0.911712571978569,0.09181239921599627,3654.5,6138.0,363.18
88
+ 87,0.0497,2.09808349609375e-05,2.4889122713851397e-06,,0.0,-0.47007501125335693,-0.9401500076055527,0.04285714589059353,3654.5,6138.0,368.87
89
+ 88,0.0503,2.3573637008666992e-05,2.374037332934512e-06,,0.0,-0.41688134521245956,-0.8337626904249191,0.24908871483057737,3654.4,6138.0,360.15
90
+ 89,0.0509,2.3312866687774658e-05,2.262970898476071e-06,,0.0,-0.4709980934858322,-0.9419961720705032,0.06526243314146996,3654.5,6138.0,355.62
91
+ 90,0.0514,2.16066837310791e-05,2.1558482853517257e-06,,0.0,-0.47104664146900177,-0.9420932680368423,0.05442978721112013,3654.4,6138.0,359.25
92
+ 91,0.052,2.3186206817626953e-05,2.0528000059646e-06,,0.0,-0.42244888097047806,-0.8448977470397949,0.17764843348413706,3654.4,6138.0,357.58
93
+ 92,0.0526,2.4143606424331665e-05,1.953951608769752e-06,,0.0,-0.43437784910202026,-0.8687557131052017,0.13139949878677726,3654.5,6138.0,360.98
94
+ 93,0.0531,1.7457641661167145e-05,1.8594235253127373e-06,,0.0,-0.4726478382945061,-0.9452957063913345,0.044932478107512,3654.4,6138.0,359.65
95
+ 94,0.0537,2.3424625396728516e-05,1.769330923502313e-06,,0.0,-0.48662828654050827,-0.9732565432786942,0.023667596746236086,3654.4,6138.0,357.33
96
+ 95,0.0543,2.664327621459961e-05,1.6837835672960834e-06,,0.0,-0.41473422944545746,-0.8294684588909149,0.21260959655046463,3654.4,6138.0,360.81
97
+ 96,0.0549,2.310052514076233e-05,1.602885682970026e-06,,0.0,-0.4408787190914154,-0.8817574381828308,0.14791762363165617,3654.4,6138.0,359.17
98
+ 97,0.0554,1.5437602996826172e-05,1.526735832134829e-06,,0.0,-0.46087589859962463,-0.9217518121004105,0.05548063712194562,3654.5,6138.0,357.98
99
+ 98,0.056,2.017989754676819e-05,1.4554267916537495e-06,,0.0,-0.4707222431898117,-0.9414444863796234,0.057337059173732996,3654.4,6138.0,359.07
100
+ 99,0.0566,1.6748905181884766e-05,1.389045440608296e-06,,0.0,-0.43424008786678314,-0.8684801608324051,0.18490357510745525,3654.4,6138.0,357.48
101
+ 100,0.0571,2.2009015083312988e-05,1.3276726544494572e-06,,0.0,-0.44838932156562805,-0.8967786431312561,0.12791539868339896,3654.4,6138.0,358.57
102
+ 100,0.0571,,,,,,,,3654.4,6138.0,362.18
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 4.023313522338867e-07,
4
- "train_runtime": 537.7965,
5
  "train_samples": 7000,
6
- "train_samples_per_second": 0.015,
7
- "train_steps_per_second": 0.002
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.0013520326372236013,
4
+ "train_runtime": 36067.9715,
5
  "train_samples": 7000,
6
+ "train_samples_per_second": 0.044,
7
+ "train_steps_per_second": 0.003
8
  }
trainer_state.json CHANGED
@@ -2,38 +2,1325 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.0005714285714285715,
6
  "eval_steps": 500,
7
- "global_step": 1,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "clip_ratio": 0.0,
14
- "completion_length": 445.25,
15
  "epoch": 0.0005714285714285715,
16
  "kl": 0.0,
17
  "learning_rate": 0.0,
18
- "loss": 4.023313522338867e-07,
19
- "reward": -0.8009233474731445,
20
- "reward_std": 0.2397190211340785,
21
- "rewards/cosine_scaled_reward": -0.40046167373657227,
22
  "rewards/format_reward": 0.0,
23
  "step": 1
24
  },
25
  {
26
- "epoch": 0.0005714285714285715,
27
- "step": 1,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  "total_flos": 0.0,
29
- "train_loss": 4.023313522338867e-07,
30
- "train_runtime": 537.7965,
31
- "train_samples_per_second": 0.015,
32
- "train_steps_per_second": 0.002
33
  }
34
  ],
35
  "logging_steps": 1,
36
- "max_steps": 1,
37
  "num_input_tokens_seen": 0,
38
  "num_train_epochs": 1,
39
  "save_steps": 50,
@@ -50,7 +1337,7 @@
50
  }
51
  },
52
  "total_flos": 0.0,
53
- "train_batch_size": 1,
54
  "trial_name": null,
55
  "trial_params": null
56
  }
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.05714285714285714,
6
  "eval_steps": 500,
7
+ "global_step": 100,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "clip_ratio": 0.0,
14
+ "completion_length": 1024.0,
15
  "epoch": 0.0005714285714285715,
16
  "kl": 0.0,
17
  "learning_rate": 0.0,
18
+ "loss": 3.3527612686157227e-08,
19
+ "reward": -0.9037362784147263,
20
+ "reward_std": 0.11632500402629375,
21
+ "rewards/cosine_scaled_reward": -0.45186813920736313,
22
  "rewards/format_reward": 0.0,
23
  "step": 1
24
  },
25
  {
26
+ "clip_ratio": 0.0,
27
+ "completion_length": 1024.0,
28
+ "epoch": 0.001142857142857143,
29
+ "kl": 0.0,
30
+ "learning_rate": 0.0,
31
+ "loss": -2.9802322387695312e-08,
32
+ "reward": -0.9088128805160522,
33
+ "reward_std": 0.134795643389225,
34
+ "rewards/cosine_scaled_reward": -0.4544064328074455,
35
+ "rewards/format_reward": 0.0,
36
+ "step": 2
37
+ },
38
+ {
39
+ "clip_ratio": 0.0,
40
+ "completion_length": 1024.0,
41
+ "epoch": 0.0017142857142857142,
42
+ "kl": 0.0,
43
+ "learning_rate": 0.0,
44
+ "loss": -2.384185791015625e-07,
45
+ "reward": -0.8961364179849625,
46
+ "reward_std": 0.1266492954455316,
47
+ "rewards/cosine_scaled_reward": -0.44806819409132004,
48
+ "rewards/format_reward": 0.0,
49
+ "step": 3
50
+ },
51
+ {
52
+ "clip_ratio": 0.0,
53
+ "completion_length": 1024.0,
54
+ "epoch": 0.002285714285714286,
55
+ "kl": 0.0,
56
+ "learning_rate": 0.0,
57
+ "loss": 3.203749656677246e-07,
58
+ "reward": -0.9384903311729431,
59
+ "reward_std": 0.08165389159694314,
60
+ "rewards/cosine_scaled_reward": -0.46924517303705215,
61
+ "rewards/format_reward": 0.0,
62
+ "step": 4
63
+ },
64
+ {
65
+ "clip_ratio": 0.0,
66
+ "completion_length": 1024.0,
67
+ "epoch": 0.002857142857142857,
68
+ "kl": 0.0,
69
+ "learning_rate": 0.0,
70
+ "loss": 2.980232238769531e-07,
71
+ "reward": -0.8722389936447144,
72
+ "reward_std": 0.1861576708033681,
73
+ "rewards/cosine_scaled_reward": -0.4361194893717766,
74
+ "rewards/format_reward": 0.0,
75
+ "step": 5
76
+ },
77
+ {
78
+ "clip_ratio": 0.0,
79
+ "completion_length": 1024.0,
80
+ "epoch": 0.0034285714285714284,
81
+ "kl": 0.0,
82
+ "learning_rate": 0.0,
83
+ "loss": 3.2782554626464844e-07,
84
+ "reward": -0.9677980989217758,
85
+ "reward_std": 0.03695695102214813,
86
+ "rewards/cosine_scaled_reward": -0.4838990569114685,
87
+ "rewards/format_reward": 0.0,
88
+ "step": 6
89
+ },
90
+ {
91
+ "clip_ratio": 0.0,
92
+ "completion_length": 1024.0,
93
+ "epoch": 0.004,
94
+ "kl": 0.0,
95
+ "learning_rate": 0.0,
96
+ "loss": -6.48200511932373e-07,
97
+ "reward": -0.9382674694061279,
98
+ "reward_std": 0.04741490981541574,
99
+ "rewards/cosine_scaled_reward": -0.46913372725248337,
100
+ "rewards/format_reward": 0.0,
101
+ "step": 7
102
+ },
103
+ {
104
+ "clip_ratio": 0.0,
105
+ "completion_length": 1024.0,
106
+ "epoch": 0.004571428571428572,
107
+ "kl": 0.0,
108
+ "learning_rate": 0.0,
109
+ "loss": -1.9371509552001953e-07,
110
+ "reward": -0.9379843473434448,
111
+ "reward_std": 0.08557501714676619,
112
+ "rewards/cosine_scaled_reward": -0.4689921736717224,
113
+ "rewards/format_reward": 0.0,
114
+ "step": 8
115
+ },
116
+ {
117
+ "clip_ratio": 0.0,
118
+ "completion_length": 1024.0,
119
+ "epoch": 0.005142857142857143,
120
+ "kl": 0.0,
121
+ "learning_rate": 0.0,
122
+ "loss": 6.258487701416016e-07,
123
+ "reward": -0.9565844535827637,
124
+ "reward_std": 0.04161944845691323,
125
+ "rewards/cosine_scaled_reward": -0.47829224169254303,
126
+ "rewards/format_reward": 0.0,
127
+ "step": 9
128
+ },
129
+ {
130
+ "clip_ratio": 0.0,
131
+ "completion_length": 1024.0,
132
+ "epoch": 0.005714285714285714,
133
+ "kl": 0.0,
134
+ "learning_rate": 0.0,
135
+ "loss": 0.0,
136
+ "reward": -0.7860384881496429,
137
+ "reward_std": 0.29622524976730347,
138
+ "rewards/cosine_scaled_reward": -0.39301924407482147,
139
+ "rewards/format_reward": 0.0,
140
+ "step": 10
141
+ },
142
+ {
143
+ "clip_ratio": 0.0,
144
+ "completion_length": 1024.0,
145
+ "epoch": 0.006285714285714286,
146
+ "kl": 0.0,
147
+ "learning_rate": 1.0000000000000002e-06,
148
+ "loss": -1.2665987014770508e-06,
149
+ "reward": -0.9470862597227097,
150
+ "reward_std": 0.025707244174554944,
151
+ "rewards/cosine_scaled_reward": -0.47354312986135483,
152
+ "rewards/format_reward": 0.0,
153
+ "step": 11
154
+ },
155
+ {
156
+ "clip_ratio": 0.0,
157
+ "completion_length": 1024.0,
158
+ "epoch": 0.006857142857142857,
159
+ "kl": 0.0,
160
+ "learning_rate": 1.0000000000000002e-06,
161
+ "loss": -3.2782554626464844e-07,
162
+ "reward": -0.9222527593374252,
163
+ "reward_std": 0.11853919085115194,
164
+ "rewards/cosine_scaled_reward": -0.4611263796687126,
165
+ "rewards/format_reward": 0.0,
166
+ "step": 12
167
+ },
168
+ {
169
+ "clip_ratio": 0.0,
170
+ "completion_length": 1024.0,
171
+ "epoch": 0.0074285714285714285,
172
+ "kl": -2.9742717742919922e-05,
173
+ "learning_rate": 2.0000000000000003e-06,
174
+ "loss": -9.834766387939453e-07,
175
+ "reward": -0.8048228025436401,
176
+ "reward_std": 0.2713719364255667,
177
+ "rewards/cosine_scaled_reward": -0.40241140127182007,
178
+ "rewards/format_reward": 0.0,
179
+ "step": 13
180
+ },
181
+ {
182
+ "clip_ratio": 0.0,
183
+ "completion_length": 1024.0,
184
+ "epoch": 0.008,
185
+ "kl": -1.9848346710205078e-05,
186
+ "learning_rate": 3e-06,
187
+ "loss": -9.98377799987793e-07,
188
+ "reward": -0.8982286602258682,
189
+ "reward_std": 0.15008432138711214,
190
+ "rewards/cosine_scaled_reward": -0.4491143301129341,
191
+ "rewards/format_reward": 0.0,
192
+ "step": 14
193
+ },
194
+ {
195
+ "clip_ratio": 0.0,
196
+ "completion_length": 1024.0,
197
+ "epoch": 0.008571428571428572,
198
+ "kl": -3.173947334289551e-05,
199
+ "learning_rate": 4.000000000000001e-06,
200
+ "loss": -1.2367963790893555e-06,
201
+ "reward": -0.8698235750198364,
202
+ "reward_std": 0.1454296549782157,
203
+ "rewards/cosine_scaled_reward": -0.4349117949604988,
204
+ "rewards/format_reward": 0.0,
205
+ "step": 15
206
+ },
207
+ {
208
+ "clip_ratio": 0.0,
209
+ "completion_length": 1024.0,
210
+ "epoch": 0.009142857142857144,
211
+ "kl": -2.2411346435546875e-05,
212
+ "learning_rate": 5e-06,
213
+ "loss": -1.166015863418579e-06,
214
+ "reward": -0.8962945342063904,
215
+ "reward_std": 0.11415091808885336,
216
+ "rewards/cosine_scaled_reward": -0.4481472671031952,
217
+ "rewards/format_reward": 0.0,
218
+ "step": 16
219
+ },
220
+ {
221
+ "clip_ratio": 0.0,
222
+ "completion_length": 1024.0,
223
+ "epoch": 0.009714285714285713,
224
+ "kl": -9.268522262573242e-06,
225
+ "learning_rate": 6e-06,
226
+ "loss": -4.172325134277344e-07,
227
+ "reward": -0.8075007796287537,
228
+ "reward_std": 0.2773375315591693,
229
+ "rewards/cosine_scaled_reward": -0.40375038236379623,
230
+ "rewards/format_reward": 0.0,
231
+ "step": 17
232
+ },
233
+ {
234
+ "clip_ratio": 0.0,
235
+ "completion_length": 1024.0,
236
+ "epoch": 0.010285714285714285,
237
+ "kl": -1.519918441772461e-06,
238
+ "learning_rate": 7e-06,
239
+ "loss": 1.0356307029724121e-06,
240
+ "reward": -0.8843557834625244,
241
+ "reward_std": 0.1459937175968662,
242
+ "rewards/cosine_scaled_reward": -0.4421778917312622,
243
+ "rewards/format_reward": 0.0,
244
+ "step": 18
245
+ },
246
+ {
247
+ "clip_ratio": 0.0,
248
+ "completion_length": 1024.0,
249
+ "epoch": 0.010857142857142857,
250
+ "kl": -1.1920928955078125e-07,
251
+ "learning_rate": 8.000000000000001e-06,
252
+ "loss": -1.4901161193847656e-08,
253
+ "reward": -0.9325414896011353,
254
+ "reward_std": 0.061190704349428415,
255
+ "rewards/cosine_scaled_reward": -0.46627073734998703,
256
+ "rewards/format_reward": 0.0,
257
+ "step": 19
258
+ },
259
+ {
260
+ "clip_ratio": 0.0,
261
+ "completion_length": 1024.0,
262
+ "epoch": 0.011428571428571429,
263
+ "kl": -7.152557373046875e-07,
264
+ "learning_rate": 9e-06,
265
+ "loss": 7.450580596923828e-08,
266
+ "reward": -0.8927065879106522,
267
+ "reward_std": 0.1468478236347437,
268
+ "rewards/cosine_scaled_reward": -0.4463533014059067,
269
+ "rewards/format_reward": 0.0,
270
+ "step": 20
271
+ },
272
+ {
273
+ "clip_ratio": 0.0,
274
+ "completion_length": 1024.0,
275
+ "epoch": 0.012,
276
+ "kl": -4.470348358154297e-07,
277
+ "learning_rate": 1e-05,
278
+ "loss": -3.241002559661865e-07,
279
+ "reward": -0.861713707447052,
280
+ "reward_std": 0.15973031288012862,
281
+ "rewards/cosine_scaled_reward": -0.430856853723526,
282
+ "rewards/format_reward": 0.0,
283
+ "step": 21
284
+ },
285
+ {
286
+ "clip_ratio": 0.0,
287
+ "completion_length": 1024.0,
288
+ "epoch": 0.012571428571428572,
289
+ "kl": 2.980232238769531e-07,
290
+ "learning_rate": 9.997258721585931e-06,
291
+ "loss": 0.0,
292
+ "reward": -0.8567932546138763,
293
+ "reward_std": 0.23303062841296196,
294
+ "rewards/cosine_scaled_reward": -0.42839662730693817,
295
+ "rewards/format_reward": 0.0,
296
+ "step": 22
297
+ },
298
+ {
299
+ "clip_ratio": 0.0,
300
+ "completion_length": 1024.0,
301
+ "epoch": 0.013142857142857144,
302
+ "kl": 6.258487701416016e-07,
303
+ "learning_rate": 9.98903822616921e-06,
304
+ "loss": -1.9371509552001953e-07,
305
+ "reward": -0.9055128693580627,
306
+ "reward_std": 0.13258774112910032,
307
+ "rewards/cosine_scaled_reward": -0.45275644212961197,
308
+ "rewards/format_reward": 0.0,
309
+ "step": 23
310
+ },
311
+ {
312
+ "clip_ratio": 0.0,
313
+ "completion_length": 1024.0,
314
+ "epoch": 0.013714285714285714,
315
+ "kl": 2.682209014892578e-07,
316
+ "learning_rate": 9.97534852915723e-06,
317
+ "loss": 4.3958425521850586e-07,
318
+ "reward": -0.9318011105060577,
319
+ "reward_std": 0.08933348534628749,
320
+ "rewards/cosine_scaled_reward": -0.46590057015419006,
321
+ "rewards/format_reward": 0.0,
322
+ "step": 24
323
+ },
324
+ {
325
+ "clip_ratio": 0.0,
326
+ "completion_length": 1024.0,
327
+ "epoch": 0.014285714285714285,
328
+ "kl": 3.337860107421875e-06,
329
+ "learning_rate": 9.956206309337067e-06,
330
+ "loss": -8.940696716308594e-08,
331
+ "reward": -0.9025110006332397,
332
+ "reward_std": 0.15539621422067285,
333
+ "rewards/cosine_scaled_reward": -0.45125550776720047,
334
+ "rewards/format_reward": 0.0,
335
+ "step": 25
336
+ },
337
+ {
338
+ "clip_ratio": 0.0,
339
+ "completion_length": 1024.0,
340
+ "epoch": 0.014857142857142857,
341
+ "kl": 2.086162567138672e-06,
342
+ "learning_rate": 9.931634888554937e-06,
343
+ "loss": -2.8312206268310547e-07,
344
+ "reward": -0.9135829955339432,
345
+ "reward_std": 0.08904895093291998,
346
+ "rewards/cosine_scaled_reward": -0.456791490316391,
347
+ "rewards/format_reward": 0.0,
348
+ "step": 26
349
+ },
350
+ {
351
+ "clip_ratio": 0.0,
352
+ "completion_length": 1024.0,
353
+ "epoch": 0.015428571428571429,
354
+ "kl": 2.8312206268310547e-06,
355
+ "learning_rate": 9.901664203302126e-06,
356
+ "loss": -2.60770320892334e-06,
357
+ "reward": -0.950261116027832,
358
+ "reward_std": 0.06268075766274706,
359
+ "rewards/cosine_scaled_reward": -0.4751305654644966,
360
+ "rewards/format_reward": 0.0,
361
+ "step": 27
362
+ },
363
+ {
364
+ "clip_ratio": 0.0,
365
+ "completion_length": 1024.0,
366
+ "epoch": 0.016,
367
+ "kl": 2.8014183044433594e-06,
368
+ "learning_rate": 9.866330768241984e-06,
369
+ "loss": 5.960464477539063e-08,
370
+ "reward": -0.8735224008560181,
371
+ "reward_std": 0.16378713678568602,
372
+ "rewards/cosine_scaled_reward": -0.4367612153291702,
373
+ "rewards/format_reward": 0.0,
374
+ "step": 28
375
+ },
376
+ {
377
+ "clip_ratio": 0.0,
378
+ "completion_length": 1024.0,
379
+ "epoch": 0.01657142857142857,
380
+ "kl": 1.0013580322265625e-05,
381
+ "learning_rate": 9.825677631722436e-06,
382
+ "loss": 4.842877388000488e-07,
383
+ "reward": -0.856491282582283,
384
+ "reward_std": 0.1944181639701128,
385
+ "rewards/cosine_scaled_reward": -0.4282456487417221,
386
+ "rewards/format_reward": 0.0,
387
+ "step": 29
388
+ },
389
+ {
390
+ "clip_ratio": 0.0,
391
+ "completion_length": 1024.0,
392
+ "epoch": 0.017142857142857144,
393
+ "kl": 6.258487701416016e-06,
394
+ "learning_rate": 9.779754323328192e-06,
395
+ "loss": 1.0058283805847168e-06,
396
+ "reward": -0.9048939347267151,
397
+ "reward_std": 0.1466370872221887,
398
+ "rewards/cosine_scaled_reward": -0.45244697481393814,
399
+ "rewards/format_reward": 0.0,
400
+ "step": 30
401
+ },
402
+ {
403
+ "clip_ratio": 0.0,
404
+ "completion_length": 1024.0,
405
+ "epoch": 0.017714285714285714,
406
+ "kl": 1.4841556549072266e-05,
407
+ "learning_rate": 9.728616793536588e-06,
408
+ "loss": 7.301568984985352e-07,
409
+ "reward": -0.9575549513101578,
410
+ "reward_std": 0.04152237856760621,
411
+ "rewards/cosine_scaled_reward": -0.4787774831056595,
412
+ "rewards/format_reward": 0.0,
413
+ "step": 31
414
+ },
415
+ {
416
+ "clip_ratio": 0.0,
417
+ "completion_length": 1024.0,
418
+ "epoch": 0.018285714285714287,
419
+ "kl": 1.5348196029663086e-05,
420
+ "learning_rate": 9.672327345550544e-06,
421
+ "loss": 7.525086402893066e-07,
422
+ "reward": -0.9280864149332047,
423
+ "reward_std": 0.05714223568793386,
424
+ "rewards/cosine_scaled_reward": -0.4640432074666023,
425
+ "rewards/format_reward": 0.0,
426
+ "step": 32
427
+ },
428
+ {
429
+ "clip_ratio": 0.0,
430
+ "completion_length": 1024.0,
431
+ "epoch": 0.018857142857142857,
432
+ "kl": 2.396106719970703e-05,
433
+ "learning_rate": 9.610954559391704e-06,
434
+ "loss": 6.705522537231445e-07,
435
+ "reward": -0.8499859273433685,
436
+ "reward_std": 0.13816553819924593,
437
+ "rewards/cosine_scaled_reward": -0.42499294877052307,
438
+ "rewards/format_reward": 0.0,
439
+ "step": 33
440
+ },
441
+ {
442
+ "clip_ratio": 0.0,
443
+ "completion_length": 1024.0,
444
+ "epoch": 0.019428571428571427,
445
+ "kl": 2.98917293548584e-05,
446
+ "learning_rate": 9.544573208346252e-06,
447
+ "loss": 3.4868717193603516e-06,
448
+ "reward": -0.9111520648002625,
449
+ "reward_std": 0.06401598325464875,
450
+ "rewards/cosine_scaled_reward": -0.4555760398507118,
451
+ "rewards/format_reward": 0.0,
452
+ "step": 34
453
+ },
454
+ {
455
+ "clip_ratio": 0.0,
456
+ "completion_length": 1024.0,
457
+ "epoch": 0.02,
458
+ "kl": 5.048513412475586e-05,
459
+ "learning_rate": 9.473264167865172e-06,
460
+ "loss": 1.996755599975586e-06,
461
+ "reward": -0.9233099222183228,
462
+ "reward_std": 0.07124835508875549,
463
+ "rewards/cosine_scaled_reward": -0.4616549611091614,
464
+ "rewards/format_reward": 0.0,
465
+ "step": 35
466
+ },
467
+ {
468
+ "clip_ratio": 0.0,
469
+ "completion_length": 1024.0,
470
+ "epoch": 0.02057142857142857,
471
+ "kl": 7.835030555725098e-05,
472
+ "learning_rate": 9.397114317029975e-06,
473
+ "loss": 3.5315752029418945e-06,
474
+ "reward": -0.8740164637565613,
475
+ "reward_std": 0.1880413582548499,
476
+ "rewards/cosine_scaled_reward": -0.43700823187828064,
477
+ "rewards/format_reward": 0.0,
478
+ "step": 36
479
+ },
480
+ {
481
+ "clip_ratio": 0.0,
482
+ "completion_length": 960.4375,
483
+ "epoch": 0.021142857142857144,
484
+ "kl": 0.0002009570598602295,
485
+ "learning_rate": 9.316216432703918e-06,
486
+ "loss": 0.1341422200202942,
487
+ "reward": -0.8138948976993561,
488
+ "reward_std": 0.2712679710239172,
489
+ "rewards/cosine_scaled_reward": -0.40694746375083923,
490
+ "rewards/format_reward": 0.0,
491
+ "step": 37
492
+ },
493
+ {
494
+ "clip_ratio": 0.0,
495
+ "completion_length": 1024.0,
496
+ "epoch": 0.021714285714285714,
497
+ "kl": 0.00011423230171203613,
498
+ "learning_rate": 9.230669076497688e-06,
499
+ "loss": 4.366040229797363e-06,
500
+ "reward": -0.9074572026729584,
501
+ "reward_std": 0.140884583350271,
502
+ "rewards/cosine_scaled_reward": -0.4537286013364792,
503
+ "rewards/format_reward": 0.0,
504
+ "step": 38
505
+ },
506
+ {
507
+ "clip_ratio": 0.0,
508
+ "completion_length": 1024.0,
509
+ "epoch": 0.022285714285714287,
510
+ "kl": 7.608532905578613e-05,
511
+ "learning_rate": 9.140576474687263e-06,
512
+ "loss": 2.9802322387695312e-06,
513
+ "reward": -0.9341968446969986,
514
+ "reward_std": 0.08859914634376764,
515
+ "rewards/cosine_scaled_reward": -0.4670984223484993,
516
+ "rewards/format_reward": 0.0,
517
+ "step": 39
518
+ },
519
+ {
520
+ "clip_ratio": 0.0,
521
+ "completion_length": 1024.0,
522
+ "epoch": 0.022857142857142857,
523
+ "kl": 8.574128150939941e-05,
524
+ "learning_rate": 9.046048391230248e-06,
525
+ "loss": 3.2633543014526367e-06,
526
+ "reward": -0.9477669596672058,
527
+ "reward_std": 0.03630512161180377,
528
+ "rewards/cosine_scaled_reward": -0.4738834798336029,
529
+ "rewards/format_reward": 0.0,
530
+ "step": 40
531
+ },
532
+ {
533
+ "clip_ratio": 0.0,
534
+ "completion_length": 1024.0,
535
+ "epoch": 0.023428571428571427,
536
+ "kl": 9.319186210632324e-05,
537
+ "learning_rate": 8.947199994035402e-06,
538
+ "loss": 3.5390257835388184e-06,
539
+ "reward": -0.9622567743062973,
540
+ "reward_std": 0.03198406333103776,
541
+ "rewards/cosine_scaled_reward": -0.48112839460372925,
542
+ "rewards/format_reward": 0.0,
543
+ "step": 41
544
+ },
545
+ {
546
+ "clip_ratio": 0.0,
547
+ "completion_length": 1024.0,
548
+ "epoch": 0.024,
549
+ "kl": 0.00011175870895385742,
550
+ "learning_rate": 8.844151714648274e-06,
551
+ "loss": 4.059635102748871e-06,
552
+ "reward": -0.9137657284736633,
553
+ "reward_std": 0.09395218873396516,
554
+ "rewards/cosine_scaled_reward": -0.45688286423683167,
555
+ "rewards/format_reward": 0.0,
556
+ "step": 42
557
+ },
558
+ {
559
+ "clip_ratio": 0.0,
560
+ "completion_length": 1024.0,
561
+ "epoch": 0.02457142857142857,
562
+ "kl": 9.465217590332031e-05,
563
+ "learning_rate": 8.737029101523931e-06,
564
+ "loss": 3.972090780735016e-06,
565
+ "reward": -0.8712927997112274,
566
+ "reward_std": 0.17926698923110962,
567
+ "rewards/cosine_scaled_reward": -0.4356464073061943,
568
+ "rewards/format_reward": 0.0,
569
+ "step": 43
570
+ },
571
+ {
572
+ "clip_ratio": 0.0,
573
+ "completion_length": 1024.0,
574
+ "epoch": 0.025142857142857144,
575
+ "kl": 0.00018846988677978516,
576
+ "learning_rate": 8.625962667065488e-06,
577
+ "loss": 7.338821887969971e-06,
578
+ "reward": -0.8325228244066238,
579
+ "reward_std": 0.22310001868754625,
580
+ "rewards/cosine_scaled_reward": -0.4162614122033119,
581
+ "rewards/format_reward": 0.0,
582
+ "step": 44
583
+ },
584
+ {
585
+ "clip_ratio": 0.0,
586
+ "completion_length": 1024.0,
587
+ "epoch": 0.025714285714285714,
588
+ "kl": 0.00025010108947753906,
589
+ "learning_rate": 8.511087728614863e-06,
590
+ "loss": 9.98377799987793e-06,
591
+ "reward": -0.8755671381950378,
592
+ "reward_std": 0.1843216335400939,
593
+ "rewards/cosine_scaled_reward": -0.4377835765480995,
594
+ "rewards/format_reward": 0.0,
595
+ "step": 45
596
+ },
597
+ {
598
+ "clip_ratio": 0.0,
599
+ "completion_length": 1024.0,
600
+ "epoch": 0.026285714285714287,
601
+ "kl": 0.00020682811737060547,
602
+ "learning_rate": 8.392544243589428e-06,
603
+ "loss": 7.063150405883789e-06,
604
+ "reward": -0.9294558465480804,
605
+ "reward_std": 0.08181617665104568,
606
+ "rewards/cosine_scaled_reward": -0.4647279307246208,
607
+ "rewards/format_reward": 0.0,
608
+ "step": 46
609
+ },
610
+ {
611
+ "clip_ratio": 0.0,
612
+ "completion_length": 1024.0,
613
+ "epoch": 0.026857142857142857,
614
+ "kl": 0.00023487210273742676,
615
+ "learning_rate": 8.270476638965463e-06,
616
+ "loss": 9.715557098388672e-06,
617
+ "reward": -0.9637707620859146,
618
+ "reward_std": 0.04111184738576412,
619
+ "rewards/cosine_scaled_reward": -0.4818853959441185,
620
+ "rewards/format_reward": 0.0,
621
+ "step": 47
622
+ },
623
+ {
624
+ "clip_ratio": 0.0,
625
+ "completion_length": 1024.0,
626
+ "epoch": 0.027428571428571427,
627
+ "kl": 0.00025856494903564453,
628
+ "learning_rate": 8.14503363531613e-06,
629
+ "loss": 1.0699033737182617e-05,
630
+ "reward": -0.9429489076137543,
631
+ "reward_std": 0.05583410756662488,
632
+ "rewards/cosine_scaled_reward": -0.47147445380687714,
633
+ "rewards/format_reward": 0.0,
634
+ "step": 48
635
+ },
636
+ {
637
+ "clip_ratio": 0.0,
638
+ "completion_length": 1024.0,
639
+ "epoch": 0.028,
640
+ "kl": 0.00023242831230163574,
641
+ "learning_rate": 8.016368065618361e-06,
642
+ "loss": 9.488314390182495e-06,
643
+ "reward": -0.9678349643945694,
644
+ "reward_std": 0.031033652368932962,
645
+ "rewards/cosine_scaled_reward": -0.4839174821972847,
646
+ "rewards/format_reward": 0.0,
647
+ "step": 49
648
+ },
649
+ {
650
+ "clip_ratio": 0.0,
651
+ "completion_length": 1024.0,
652
+ "epoch": 0.02857142857142857,
653
+ "kl": 0.0002980530261993408,
654
+ "learning_rate": 7.884636689049423e-06,
655
+ "loss": 1.1995434761047363e-05,
656
+ "reward": -0.92691770195961,
657
+ "reward_std": 0.06057529430836439,
658
+ "rewards/cosine_scaled_reward": -0.4634588584303856,
659
+ "rewards/format_reward": 0.0,
660
+ "step": 50
661
+ },
662
+ {
663
+ "clip_ratio": 0.0,
664
+ "completion_length": 1024.0,
665
+ "epoch": 0.029142857142857144,
666
+ "kl": 0.0002821683883666992,
667
+ "learning_rate": 7.75e-06,
668
+ "loss": 1.093745231628418e-05,
669
+ "reward": -0.9709955900907516,
670
+ "reward_std": 0.023253681138157845,
671
+ "rewards/cosine_scaled_reward": -0.48549777269363403,
672
+ "rewards/format_reward": 0.0,
673
+ "step": 51
674
+ },
675
+ {
676
+ "clip_ratio": 0.0,
677
+ "completion_length": 1024.0,
678
+ "epoch": 0.029714285714285714,
679
+ "kl": 0.0003021657466888428,
680
+ "learning_rate": 7.612622032536508e-06,
681
+ "loss": 1.2755393981933594e-05,
682
+ "reward": -0.932096004486084,
683
+ "reward_std": 0.0814181575551629,
684
+ "rewards/cosine_scaled_reward": -0.4660480171442032,
685
+ "rewards/format_reward": 0.0,
686
+ "step": 52
687
+ },
688
+ {
689
+ "clip_ratio": 0.0,
690
+ "completion_length": 1024.0,
691
+ "epoch": 0.030285714285714287,
692
+ "kl": 0.0003120899200439453,
693
+ "learning_rate": 7.472670160550849e-06,
694
+ "loss": 1.1809170246124268e-05,
695
+ "reward": -0.8598470240831375,
696
+ "reward_std": 0.23317514825612307,
697
+ "rewards/cosine_scaled_reward": -0.42992351949214935,
698
+ "rewards/format_reward": 0.0,
699
+ "step": 53
700
+ },
701
+ {
702
+ "clip_ratio": 0.0,
703
+ "completion_length": 1024.0,
704
+ "epoch": 0.030857142857142857,
705
+ "kl": 0.00031960010528564453,
706
+ "learning_rate": 7.330314893841102e-06,
707
+ "loss": 1.2435019016265869e-05,
708
+ "reward": -0.8917691111564636,
709
+ "reward_std": 0.16836319211870432,
710
+ "rewards/cosine_scaled_reward": -0.4458845555782318,
711
+ "rewards/format_reward": 0.0,
712
+ "step": 54
713
+ },
714
+ {
715
+ "clip_ratio": 0.0,
716
+ "completion_length": 1024.0,
717
+ "epoch": 0.03142857142857143,
718
+ "kl": 0.0003180503845214844,
719
+ "learning_rate": 7.185729670371605e-06,
720
+ "loss": 1.2367963790893555e-05,
721
+ "reward": -0.9393985569477081,
722
+ "reward_std": 0.09125666646286845,
723
+ "rewards/cosine_scaled_reward": -0.46969927847385406,
724
+ "rewards/format_reward": 0.0,
725
+ "step": 55
726
+ },
727
+ {
728
+ "clip_ratio": 0.0,
729
+ "completion_length": 1024.0,
730
+ "epoch": 0.032,
731
+ "kl": 0.00032332539558410645,
732
+ "learning_rate": 7.0390906449655104e-06,
733
+ "loss": 1.3027340173721313e-05,
734
+ "reward": -0.9556453377008438,
735
+ "reward_std": 0.03563447529450059,
736
+ "rewards/cosine_scaled_reward": -0.4778226688504219,
737
+ "rewards/format_reward": 0.0,
738
+ "step": 56
739
+ },
740
+ {
741
+ "clip_ratio": 0.0,
742
+ "completion_length": 1024.0,
743
+ "epoch": 0.03257142857142857,
744
+ "kl": 0.00029200315475463867,
745
+ "learning_rate": 6.890576474687264e-06,
746
+ "loss": 1.173466444015503e-05,
747
+ "reward": -0.9533008933067322,
748
+ "reward_std": 0.047348865773528814,
749
+ "rewards/cosine_scaled_reward": -0.4766504615545273,
750
+ "rewards/format_reward": 0.0,
751
+ "step": 57
752
+ },
753
+ {
754
+ "clip_ratio": 0.0,
755
+ "completion_length": 1024.0,
756
+ "epoch": 0.03314285714285714,
757
+ "kl": 0.00028020143508911133,
758
+ "learning_rate": 6.740368101176496e-06,
759
+ "loss": 1.0944902896881104e-05,
760
+ "reward": -0.8606822788715363,
761
+ "reward_std": 0.19956759549677372,
762
+ "rewards/cosine_scaled_reward": -0.4303411394357681,
763
+ "rewards/format_reward": 0.0,
764
+ "step": 58
765
+ },
766
+ {
767
+ "clip_ratio": 0.0,
768
+ "completion_length": 1024.0,
769
+ "epoch": 0.03371428571428572,
770
+ "kl": 0.00034242868423461914,
771
+ "learning_rate": 6.588648530198505e-06,
772
+ "loss": 1.4327466487884521e-05,
773
+ "reward": -0.9435078203678131,
774
+ "reward_std": 0.028082083677873015,
775
+ "rewards/cosine_scaled_reward": -0.47175391018390656,
776
+ "rewards/format_reward": 0.0,
777
+ "step": 59
778
+ },
779
+ {
780
+ "clip_ratio": 0.0,
781
+ "completion_length": 1024.0,
782
+ "epoch": 0.03428571428571429,
783
+ "kl": 0.0004596710205078125,
784
+ "learning_rate": 6.4356026086799176e-06,
785
+ "loss": 1.862645149230957e-05,
786
+ "reward": -0.8699476420879364,
787
+ "reward_std": 0.14623272977769375,
788
+ "rewards/cosine_scaled_reward": -0.4349738284945488,
789
+ "rewards/format_reward": 0.0,
790
+ "step": 60
791
+ },
792
+ {
793
+ "clip_ratio": 0.0,
794
+ "completion_length": 1024.0,
795
+ "epoch": 0.03485714285714286,
796
+ "kl": 0.00041598081588745117,
797
+ "learning_rate": 6.281416799501188e-06,
798
+ "loss": 1.6495585441589355e-05,
799
+ "reward": -0.8786782920360565,
800
+ "reward_std": 0.14873503288254142,
801
+ "rewards/cosine_scaled_reward": -0.43933914601802826,
802
+ "rewards/format_reward": 0.0,
803
+ "step": 61
804
+ },
805
+ {
806
+ "clip_ratio": 0.0,
807
+ "completion_length": 1024.0,
808
+ "epoch": 0.03542857142857143,
809
+ "kl": 0.0003428459167480469,
810
+ "learning_rate": 6.126278954320295e-06,
811
+ "loss": 1.4573335647583008e-05,
812
+ "reward": -0.9531020373106003,
813
+ "reward_std": 0.04140742728486657,
814
+ "rewards/cosine_scaled_reward": -0.47655102610588074,
815
+ "rewards/format_reward": 0.0,
816
+ "step": 62
817
+ },
818
+ {
819
+ "clip_ratio": 0.0,
820
+ "completion_length": 1024.0,
821
+ "epoch": 0.036,
822
+ "kl": 0.00034630298614501953,
823
+ "learning_rate": 5.970378084704441e-06,
824
+ "loss": 1.3574957847595215e-05,
825
+ "reward": -0.9625549018383026,
826
+ "reward_std": 0.0323515310883522,
827
+ "rewards/cosine_scaled_reward": -0.4812774509191513,
828
+ "rewards/format_reward": 0.0,
829
+ "step": 63
830
+ },
831
+ {
832
+ "clip_ratio": 0.0,
833
+ "completion_length": 1024.0,
834
+ "epoch": 0.036571428571428574,
835
+ "kl": 0.000436633825302124,
836
+ "learning_rate": 5.813904131848565e-06,
837
+ "loss": 1.7490237951278687e-05,
838
+ "reward": -0.9638274163007736,
839
+ "reward_std": 0.03202465921640396,
840
+ "rewards/cosine_scaled_reward": -0.4819137006998062,
841
+ "rewards/format_reward": 0.0,
842
+ "step": 64
843
+ },
844
+ {
845
+ "clip_ratio": 0.0,
846
+ "completion_length": 1024.0,
847
+ "epoch": 0.037142857142857144,
848
+ "kl": 0.00037598609924316406,
849
+ "learning_rate": 5.657047735161256e-06,
850
+ "loss": 1.4990568161010742e-05,
851
+ "reward": -0.8853072375059128,
852
+ "reward_std": 0.11870932951569557,
853
+ "rewards/cosine_scaled_reward": -0.4426536113023758,
854
+ "rewards/format_reward": 0.0,
855
+ "step": 65
856
+ },
857
+ {
858
+ "clip_ratio": 0.0,
859
+ "completion_length": 1024.0,
860
+ "epoch": 0.037714285714285714,
861
+ "kl": 0.000420987606048584,
862
+ "learning_rate": 5.500000000000001e-06,
863
+ "loss": 1.6763806343078613e-05,
864
+ "reward": -0.9631430506706238,
865
+ "reward_std": 0.027665999252349138,
866
+ "rewards/cosine_scaled_reward": -0.4815715327858925,
867
+ "rewards/format_reward": 0.0,
868
+ "step": 66
869
+ },
870
+ {
871
+ "clip_ratio": 0.0,
872
+ "completion_length": 1024.0,
873
+ "epoch": 0.038285714285714284,
874
+ "kl": 0.0005689859390258789,
875
+ "learning_rate": 5.342952264838748e-06,
876
+ "loss": 2.3305416107177734e-05,
877
+ "reward": -0.9261669814586639,
878
+ "reward_std": 0.09819867718033493,
879
+ "rewards/cosine_scaled_reward": -0.46308349817991257,
880
+ "rewards/format_reward": 0.0,
881
+ "step": 67
882
+ },
883
+ {
884
+ "clip_ratio": 0.0,
885
+ "completion_length": 1024.0,
886
+ "epoch": 0.038857142857142854,
887
+ "kl": 0.0006139874458312988,
888
+ "learning_rate": 5.186095868151436e-06,
889
+ "loss": 2.4430453777313232e-05,
890
+ "reward": -0.9322563856840134,
891
+ "reward_std": 0.06839151354506612,
892
+ "rewards/cosine_scaled_reward": -0.4661281928420067,
893
+ "rewards/format_reward": 0.0,
894
+ "step": 68
895
+ },
896
+ {
897
+ "clip_ratio": 0.0,
898
+ "completion_length": 1024.0,
899
+ "epoch": 0.03942857142857143,
900
+ "kl": 0.0004551410675048828,
901
+ "learning_rate": 5.02962191529556e-06,
902
+ "loss": 1.753866672515869e-05,
903
+ "reward": -0.947336733341217,
904
+ "reward_std": 0.06602911371737719,
905
+ "rewards/cosine_scaled_reward": -0.4736683666706085,
906
+ "rewards/format_reward": 0.0,
907
+ "step": 69
908
+ },
909
+ {
910
+ "clip_ratio": 0.0,
911
+ "completion_length": 1024.0,
912
+ "epoch": 0.04,
913
+ "kl": 0.0004698038101196289,
914
+ "learning_rate": 4.873721045679707e-06,
915
+ "loss": 1.9043684005737305e-05,
916
+ "reward": -0.881416529417038,
917
+ "reward_std": 0.15695088542997837,
918
+ "rewards/cosine_scaled_reward": -0.440708264708519,
919
+ "rewards/format_reward": 0.0,
920
+ "step": 70
921
+ },
922
+ {
923
+ "clip_ratio": 0.0,
924
+ "completion_length": 1024.0,
925
+ "epoch": 0.04057142857142857,
926
+ "kl": 0.0005980134010314941,
927
+ "learning_rate": 4.718583200498814e-06,
928
+ "loss": 2.4437904357910156e-05,
929
+ "reward": -0.9552861750125885,
930
+ "reward_std": 0.04716450162231922,
931
+ "rewards/cosine_scaled_reward": -0.47764309495687485,
932
+ "rewards/format_reward": 0.0,
933
+ "step": 71
934
+ },
935
+ {
936
+ "clip_ratio": 0.0,
937
+ "completion_length": 1024.0,
938
+ "epoch": 0.04114285714285714,
939
+ "kl": 0.00044411420822143555,
940
+ "learning_rate": 4.564397391320085e-06,
941
+ "loss": 1.781061291694641e-05,
942
+ "reward": -0.9175944626331329,
943
+ "reward_std": 0.07855925057083368,
944
+ "rewards/cosine_scaled_reward": -0.45879723131656647,
945
+ "rewards/format_reward": 0.0,
946
+ "step": 72
947
+ },
948
+ {
949
+ "clip_ratio": 0.0,
950
+ "completion_length": 1024.0,
951
+ "epoch": 0.04171428571428572,
952
+ "kl": 0.0005142688751220703,
953
+ "learning_rate": 4.4113514698014955e-06,
954
+ "loss": 2.104230225086212e-05,
955
+ "reward": -0.9591091871261597,
956
+ "reward_std": 0.03032594360411167,
957
+ "rewards/cosine_scaled_reward": -0.47955460846424103,
958
+ "rewards/format_reward": 0.0,
959
+ "step": 73
960
+ },
961
+ {
962
+ "clip_ratio": 0.0,
963
+ "completion_length": 1024.0,
964
+ "epoch": 0.04228571428571429,
965
+ "kl": 0.0004570484161376953,
966
+ "learning_rate": 4.259631898823504e-06,
967
+ "loss": 1.8224120140075684e-05,
968
+ "reward": -0.9418470114469528,
969
+ "reward_std": 0.09771440364420414,
970
+ "rewards/cosine_scaled_reward": -0.470923513174057,
971
+ "rewards/format_reward": 0.0,
972
+ "step": 74
973
+ },
974
+ {
975
+ "clip_ratio": 0.0,
976
+ "completion_length": 1024.0,
977
+ "epoch": 0.04285714285714286,
978
+ "kl": 0.00042492151260375977,
979
+ "learning_rate": 4.109423525312738e-06,
980
+ "loss": 1.697242259979248e-05,
981
+ "reward": -0.9031769037246704,
982
+ "reward_std": 0.11431426065973938,
983
+ "rewards/cosine_scaled_reward": -0.4515884444117546,
984
+ "rewards/format_reward": 0.0,
985
+ "step": 75
986
+ },
987
+ {
988
+ "clip_ratio": 0.0,
989
+ "completion_length": 1024.0,
990
+ "epoch": 0.04342857142857143,
991
+ "kl": 0.0003921389579772949,
992
+ "learning_rate": 3.960909355034491e-06,
993
+ "loss": 1.5683472156524658e-05,
994
+ "reward": -0.8710711896419525,
995
+ "reward_std": 0.16496195830404758,
996
+ "rewards/cosine_scaled_reward": -0.43553559482097626,
997
+ "rewards/format_reward": 0.0,
998
+ "step": 76
999
+ },
1000
+ {
1001
+ "clip_ratio": 0.0,
1002
+ "completion_length": 1024.0,
1003
+ "epoch": 0.044,
1004
+ "kl": 0.0005483031272888184,
1005
+ "learning_rate": 3.8142703296283954e-06,
1006
+ "loss": 2.2258609533309937e-05,
1007
+ "reward": -0.8363375961780548,
1008
+ "reward_std": 0.23763815127313137,
1009
+ "rewards/cosine_scaled_reward": -0.418168805539608,
1010
+ "rewards/format_reward": 0.0,
1011
+ "step": 77
1012
+ },
1013
+ {
1014
+ "clip_ratio": 0.0,
1015
+ "completion_length": 1024.0,
1016
+ "epoch": 0.044571428571428574,
1017
+ "kl": 0.0005096793174743652,
1018
+ "learning_rate": 3.6696851061589e-06,
1019
+ "loss": 2.06679105758667e-05,
1020
+ "reward": -0.912532389163971,
1021
+ "reward_std": 0.1374040930531919,
1022
+ "rewards/cosine_scaled_reward": -0.45626621693372726,
1023
+ "rewards/format_reward": 0.0,
1024
+ "step": 78
1025
+ },
1026
+ {
1027
+ "clip_ratio": 0.0,
1028
+ "completion_length": 1024.0,
1029
+ "epoch": 0.045142857142857144,
1030
+ "kl": 0.0007692575454711914,
1031
+ "learning_rate": 3.527329839449152e-06,
1032
+ "loss": 3.053247928619385e-05,
1033
+ "reward": -0.9567261040210724,
1034
+ "reward_std": 0.03342599933966994,
1035
+ "rewards/cosine_scaled_reward": -0.478363037109375,
1036
+ "rewards/format_reward": 0.0,
1037
+ "step": 79
1038
+ },
1039
+ {
1040
+ "clip_ratio": 0.0,
1041
+ "completion_length": 1024.0,
1042
+ "epoch": 0.045714285714285714,
1043
+ "kl": 0.0008780956268310547,
1044
+ "learning_rate": 3.3873779674634932e-06,
1045
+ "loss": 3.489106893539429e-05,
1046
+ "reward": -0.8690381050109863,
1047
+ "reward_std": 0.166263896971941,
1048
+ "rewards/cosine_scaled_reward": -0.43451904505491257,
1049
+ "rewards/format_reward": 0.0,
1050
+ "step": 80
1051
+ },
1052
+ {
1053
+ "clip_ratio": 0.0,
1054
+ "completion_length": 1024.0,
1055
+ "epoch": 0.046285714285714284,
1056
+ "kl": 0.000619053840637207,
1057
+ "learning_rate": 3.2500000000000015e-06,
1058
+ "loss": 2.2970139980316162e-05,
1059
+ "reward": -0.9703131467103958,
1060
+ "reward_std": 0.018532322952523828,
1061
+ "rewards/cosine_scaled_reward": -0.4851565733551979,
1062
+ "rewards/format_reward": 0.0,
1063
+ "step": 81
1064
+ },
1065
+ {
1066
+ "clip_ratio": 0.0,
1067
+ "completion_length": 1024.0,
1068
+ "epoch": 0.046857142857142854,
1069
+ "kl": 0.0005697011947631836,
1070
+ "learning_rate": 3.115363310950579e-06,
1071
+ "loss": 2.2605061531066895e-05,
1072
+ "reward": -0.8376921266317368,
1073
+ "reward_std": 0.21680933889001608,
1074
+ "rewards/cosine_scaled_reward": -0.4188460633158684,
1075
+ "rewards/format_reward": 0.0,
1076
+ "step": 82
1077
+ },
1078
+ {
1079
+ "clip_ratio": 0.0,
1080
+ "completion_length": 1024.0,
1081
+ "epoch": 0.04742857142857143,
1082
+ "kl": 0.0006507635116577148,
1083
+ "learning_rate": 2.98363193438164e-06,
1084
+ "loss": 2.580881118774414e-05,
1085
+ "reward": -0.9194770008325577,
1086
+ "reward_std": 0.07644858444109559,
1087
+ "rewards/cosine_scaled_reward": -0.45973849296569824,
1088
+ "rewards/format_reward": 0.0,
1089
+ "step": 83
1090
+ },
1091
+ {
1092
+ "clip_ratio": 0.0,
1093
+ "completion_length": 1024.0,
1094
+ "epoch": 0.048,
1095
+ "kl": 0.0005425214767456055,
1096
+ "learning_rate": 2.854966364683872e-06,
1097
+ "loss": 2.232193946838379e-05,
1098
+ "reward": -0.921579897403717,
1099
+ "reward_std": 0.07768060895614326,
1100
+ "rewards/cosine_scaled_reward": -0.4607899561524391,
1101
+ "rewards/format_reward": 0.0,
1102
+ "step": 84
1103
+ },
1104
+ {
1105
+ "clip_ratio": 0.0,
1106
+ "completion_length": 1024.0,
1107
+ "epoch": 0.04857142857142857,
1108
+ "kl": 0.0005598664283752441,
1109
+ "learning_rate": 2.7295233610345384e-06,
1110
+ "loss": 2.230703830718994e-05,
1111
+ "reward": -0.8130292594432831,
1112
+ "reward_std": 0.21775285061448812,
1113
+ "rewards/cosine_scaled_reward": -0.40651462972164154,
1114
+ "rewards/format_reward": 0.0,
1115
+ "step": 85
1116
+ },
1117
+ {
1118
+ "clip_ratio": 0.0,
1119
+ "completion_length": 1024.0,
1120
+ "epoch": 0.04914285714285714,
1121
+ "kl": 0.0005010366439819336,
1122
+ "learning_rate": 2.607455756410573e-06,
1123
+ "loss": 2.0213425159454346e-05,
1124
+ "reward": -0.911712571978569,
1125
+ "reward_std": 0.09181239921599627,
1126
+ "rewards/cosine_scaled_reward": -0.4558562785387039,
1127
+ "rewards/format_reward": 0.0,
1128
+ "step": 86
1129
+ },
1130
+ {
1131
+ "clip_ratio": 0.0,
1132
+ "completion_length": 1024.0,
1133
+ "epoch": 0.04971428571428571,
1134
+ "kl": 0.0005140304565429688,
1135
+ "learning_rate": 2.4889122713851397e-06,
1136
+ "loss": 2.09808349609375e-05,
1137
+ "reward": -0.9401500076055527,
1138
+ "reward_std": 0.04285714589059353,
1139
+ "rewards/cosine_scaled_reward": -0.47007501125335693,
1140
+ "rewards/format_reward": 0.0,
1141
+ "step": 87
1142
+ },
1143
+ {
1144
+ "clip_ratio": 0.0,
1145
+ "completion_length": 1024.0,
1146
+ "epoch": 0.05028571428571429,
1147
+ "kl": 0.0005880594253540039,
1148
+ "learning_rate": 2.374037332934512e-06,
1149
+ "loss": 2.3573637008666992e-05,
1150
+ "reward": -0.8337626904249191,
1151
+ "reward_std": 0.24908871483057737,
1152
+ "rewards/cosine_scaled_reward": -0.41688134521245956,
1153
+ "rewards/format_reward": 0.0,
1154
+ "step": 88
1155
+ },
1156
+ {
1157
+ "clip_ratio": 0.0,
1158
+ "completion_length": 1024.0,
1159
+ "epoch": 0.05085714285714286,
1160
+ "kl": 0.0005828142166137695,
1161
+ "learning_rate": 2.262970898476071e-06,
1162
+ "loss": 2.3312866687774658e-05,
1163
+ "reward": -0.9419961720705032,
1164
+ "reward_std": 0.06526243314146996,
1165
+ "rewards/cosine_scaled_reward": -0.4709980934858322,
1166
+ "rewards/format_reward": 0.0,
1167
+ "step": 89
1168
+ },
1169
+ {
1170
+ "clip_ratio": 0.0,
1171
+ "completion_length": 1024.0,
1172
+ "epoch": 0.05142857142857143,
1173
+ "kl": 0.0005592107772827148,
1174
+ "learning_rate": 2.1558482853517257e-06,
1175
+ "loss": 2.16066837310791e-05,
1176
+ "reward": -0.9420932680368423,
1177
+ "reward_std": 0.05442978721112013,
1178
+ "rewards/cosine_scaled_reward": -0.47104664146900177,
1179
+ "rewards/format_reward": 0.0,
1180
+ "step": 90
1181
+ },
1182
+ {
1183
+ "clip_ratio": 0.0,
1184
+ "completion_length": 1024.0,
1185
+ "epoch": 0.052,
1186
+ "kl": 0.0005729198455810547,
1187
+ "learning_rate": 2.0528000059646e-06,
1188
+ "loss": 2.3186206817626953e-05,
1189
+ "reward": -0.8448977470397949,
1190
+ "reward_std": 0.17764843348413706,
1191
+ "rewards/cosine_scaled_reward": -0.42244888097047806,
1192
+ "rewards/format_reward": 0.0,
1193
+ "step": 91
1194
+ },
1195
+ {
1196
+ "clip_ratio": 0.0,
1197
+ "completion_length": 1024.0,
1198
+ "epoch": 0.052571428571428575,
1199
+ "kl": 0.0006086826324462891,
1200
+ "learning_rate": 1.953951608769752e-06,
1201
+ "loss": 2.4143606424331665e-05,
1202
+ "reward": -0.8687557131052017,
1203
+ "reward_std": 0.13139949878677726,
1204
+ "rewards/cosine_scaled_reward": -0.43437784910202026,
1205
+ "rewards/format_reward": 0.0,
1206
+ "step": 92
1207
+ },
1208
+ {
1209
+ "clip_ratio": 0.0,
1210
+ "completion_length": 1024.0,
1211
+ "epoch": 0.053142857142857144,
1212
+ "kl": 0.0004508495330810547,
1213
+ "learning_rate": 1.8594235253127373e-06,
1214
+ "loss": 1.7457641661167145e-05,
1215
+ "reward": -0.9452957063913345,
1216
+ "reward_std": 0.044932478107512,
1217
+ "rewards/cosine_scaled_reward": -0.4726478382945061,
1218
+ "rewards/format_reward": 0.0,
1219
+ "step": 93
1220
+ },
1221
+ {
1222
+ "clip_ratio": 0.0,
1223
+ "completion_length": 1024.0,
1224
+ "epoch": 0.053714285714285714,
1225
+ "kl": 0.00058746337890625,
1226
+ "learning_rate": 1.769330923502313e-06,
1227
+ "loss": 2.3424625396728516e-05,
1228
+ "reward": -0.9732565432786942,
1229
+ "reward_std": 0.023667596746236086,
1230
+ "rewards/cosine_scaled_reward": -0.48662828654050827,
1231
+ "rewards/format_reward": 0.0,
1232
+ "step": 94
1233
+ },
1234
+ {
1235
+ "clip_ratio": 0.0,
1236
+ "completion_length": 1024.0,
1237
+ "epoch": 0.054285714285714284,
1238
+ "kl": 0.0006615519523620605,
1239
+ "learning_rate": 1.6837835672960834e-06,
1240
+ "loss": 2.664327621459961e-05,
1241
+ "reward": -0.8294684588909149,
1242
+ "reward_std": 0.21260959655046463,
1243
+ "rewards/cosine_scaled_reward": -0.41473422944545746,
1244
+ "rewards/format_reward": 0.0,
1245
+ "step": 95
1246
+ },
1247
+ {
1248
+ "clip_ratio": 0.0,
1249
+ "completion_length": 1024.0,
1250
+ "epoch": 0.054857142857142854,
1251
+ "kl": 0.0005936622619628906,
1252
+ "learning_rate": 1.602885682970026e-06,
1253
+ "loss": 2.310052514076233e-05,
1254
+ "reward": -0.8817574381828308,
1255
+ "reward_std": 0.14791762363165617,
1256
+ "rewards/cosine_scaled_reward": -0.4408787190914154,
1257
+ "rewards/format_reward": 0.0,
1258
+ "step": 96
1259
+ },
1260
+ {
1261
+ "clip_ratio": 0.0,
1262
+ "completion_length": 1024.0,
1263
+ "epoch": 0.05542857142857143,
1264
+ "kl": 0.0004170536994934082,
1265
+ "learning_rate": 1.526735832134829e-06,
1266
+ "loss": 1.5437602996826172e-05,
1267
+ "reward": -0.9217518121004105,
1268
+ "reward_std": 0.05548063712194562,
1269
+ "rewards/cosine_scaled_reward": -0.46087589859962463,
1270
+ "rewards/format_reward": 0.0,
1271
+ "step": 97
1272
+ },
1273
+ {
1274
+ "clip_ratio": 0.0,
1275
+ "completion_length": 1024.0,
1276
+ "epoch": 0.056,
1277
+ "kl": 0.0005135536193847656,
1278
+ "learning_rate": 1.4554267916537495e-06,
1279
+ "loss": 2.017989754676819e-05,
1280
+ "reward": -0.9414444863796234,
1281
+ "reward_std": 0.057337059173732996,
1282
+ "rewards/cosine_scaled_reward": -0.4707222431898117,
1283
+ "rewards/format_reward": 0.0,
1284
+ "step": 98
1285
+ },
1286
+ {
1287
+ "clip_ratio": 0.0,
1288
+ "completion_length": 1024.0,
1289
+ "epoch": 0.05657142857142857,
1290
+ "kl": 0.0004296302795410156,
1291
+ "learning_rate": 1.389045440608296e-06,
1292
+ "loss": 1.6748905181884766e-05,
1293
+ "reward": -0.8684801608324051,
1294
+ "reward_std": 0.18490357510745525,
1295
+ "rewards/cosine_scaled_reward": -0.43424008786678314,
1296
+ "rewards/format_reward": 0.0,
1297
+ "step": 99
1298
+ },
1299
+ {
1300
+ "clip_ratio": 0.0,
1301
+ "completion_length": 1024.0,
1302
+ "epoch": 0.05714285714285714,
1303
+ "kl": 0.0005539655685424805,
1304
+ "learning_rate": 1.3276726544494572e-06,
1305
+ "loss": 2.2009015083312988e-05,
1306
+ "reward": -0.8967786431312561,
1307
+ "reward_std": 0.12791539868339896,
1308
+ "rewards/cosine_scaled_reward": -0.44838932156562805,
1309
+ "rewards/format_reward": 0.0,
1310
+ "step": 100
1311
+ },
1312
+ {
1313
+ "epoch": 0.05714285714285714,
1314
+ "step": 100,
1315
  "total_flos": 0.0,
1316
+ "train_loss": 0.0013520326372236013,
1317
+ "train_runtime": 36067.9715,
1318
+ "train_samples_per_second": 0.044,
1319
+ "train_steps_per_second": 0.003
1320
  }
1321
  ],
1322
  "logging_steps": 1,
1323
+ "max_steps": 100,
1324
  "num_input_tokens_seen": 0,
1325
  "num_train_epochs": 1,
1326
  "save_steps": 50,
 
1337
  }
1338
  },
1339
  "total_flos": 0.0,
1340
+ "train_batch_size": 2,
1341
  "trial_name": null,
1342
  "trial_params": null
1343
  }
training_metrics.txt CHANGED
@@ -1,6 +1,6 @@
1
- total_size_before (MB): 1455.72
2
- total_size_after (MB): 1445.40
3
- total_time (seconds): 544.56
4
- ram_peak (MB): 3195.88
5
- ram_consump (MB): 1477.19
6
- disk_storage (MB): 575.25
 
1
+ total_size_before (MB): 3424.75
2
+ total_size_after (MB): 3407.14
3
+ total_time (seconds): 36073.19
4
+ ram_peak (MB): 5853.69
5
+ ram_consump (MB): 3485.15
6
+ disk_storage (MB): 333.13