Upload folder using huggingface_hub

Browse files

Files changed (8) hide show

README.md +1 -7
adapter_config.json +2 -2
adapter_model.safetensors +1 -1
optimizer.pt +1 -1
rng_state.pth +1 -1
scheduler.pt +1 -1
trainer_state.json +275 -483
training_args.bin +1 -1

README.md CHANGED Viewed

@@ -1,6 +1,5 @@
 ---
-base_model:
-- Qwen/Qwen3-VL-8B-Instruct
 library_name: peft
 pipeline_tag: text-generation
 tags:
@@ -9,11 +8,6 @@ tags:
 - lora
 - transformers
 - trl
-license: mit
-datasets:
-- yoavf/svg-animal-illustrations
-language:
-- en
 ---
 # Model Card for Model ID

 ---
+base_model: Qwen/Qwen3-VL-8B-Instruct
 library_name: peft
 pipeline_tag: text-generation
 tags:
 - lora
 - transformers
 - trl
 ---
 # Model Card for Model ID

adapter_config.json CHANGED Viewed

@@ -29,10 +29,10 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "v_proj",
     "k_proj",
     "o_proj",
-    "q_proj"
   ],
   "target_parameters": null,
   "task_type": "CAUSAL_LM",

   "rank_pattern": {},
   "revision": null,
   "target_modules": [
+    "q_proj",
     "k_proj",
     "o_proj",
+    "v_proj"
   ],
   "target_parameters": null,
   "task_type": "CAUSAL_LM",

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:82a51e7b37b05dd7ea42594518355673ae114bb50d3da20d0d09dc54fee7f4f8
 size 61384752

 version https://git-lfs.github.com/spec/v1
+oid sha256:cbb49f52f3e22aebc4e1600546d83656b67867231a07d467e75ece4c5bd4af1f
 size 61384752

optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0af329302356ea56aed820fac9c299fb239a668e3d6ae5de9e640e6cbcf2d627
 size 122854795

 version https://git-lfs.github.com/spec/v1
+oid sha256:479122d9a9939c852f3898332cee178a46634af8cbf583fca68c585417a010b4
 size 122854795

rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6ab9afdfe88ca369217f0f7bcc8f84eace053d867ea13532267eecb8300631f7
 size 14645

 version https://git-lfs.github.com/spec/v1
+oid sha256:bb030b37dacda29104e066d95ec1903f37c25d3a91f465220ac18eede3745980
 size 14645

scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9f673db473586ce87d2bfba76c362e332a223d9de40cad5ced52c5bc0b26e7b2
 size 1465

 version https://git-lfs.github.com/spec/v1
+oid sha256:fb54d230a0c2ef4f1dd1005e48f8bc418e0bd568b6e7ca0d90419ffd0cd17bf8
 size 1465

trainer_state.json CHANGED Viewed

@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.08,
   "eval_steps": 500,
-  "global_step": 40,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -16,23 +16,23 @@
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.0,
-      "completions/max_length": 1053.0,
-      "completions/max_terminated_length": 1053.0,
-      "completions/mean_length": 778.6000366210938,
-      "completions/mean_terminated_length": 778.6000366210938,
-      "completions/min_length": 620.0,
-      "completions/min_terminated_length": 620.0,
-      "entropy": 0.08571863919496536,
       "epoch": 0.002,
       "frac_reward_zero_std": 0.0,
-      "grad_norm": 0.4034070670604706,
-      "learning_rate": 1e-06,
-      "loss": 0.0811,
-      "num_tokens": 3988.0,
-      "reward": 0.6599999666213989,
-      "reward_std": 0.022360675036907196,
-      "rewards/reward_func_with_saving/mean": 0.6599999666213989,
-      "rewards/reward_func_with_saving/std": 0.02236068621277809,
       "step": 1
     },
     {
@@ -41,11 +41,24 @@
       "clip_ratio/low_mean": 0.0,
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
-      "entropy": 0.05181041359901428,
       "epoch": 0.004,
-      "grad_norm": 0.9778931736946106,
-      "learning_rate": 1e-06,
-      "loss": -0.317,
       "step": 2
     },
     {
@@ -54,37 +67,76 @@
       "clip_ratio/low_mean": 0.0,
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
-      "entropy": 0.10429538041353226,
       "epoch": 0.006,
-      "grad_norm": 0.31522250175476074,
-      "learning_rate": 1e-06,
-      "loss": 0.1203,
       "step": 3
     },
     {
       "clip_ratio/high_max": 0.0,
       "clip_ratio/high_mean": 0.0,
-      "clip_ratio/low_mean": 0.001612903201021254,
-      "clip_ratio/low_min": 0.001612903201021254,
-      "clip_ratio/region_mean": 0.001612903201021254,
-      "entropy": 0.06899195909500122,
       "epoch": 0.008,
-      "grad_norm": 0.31318220496177673,
-      "learning_rate": 1e-06,
-      "loss": 0.0709,
       "step": 4
     },
     {
       "clip_ratio/high_max": 0.0,
       "clip_ratio/high_mean": 0.0,
-      "clip_ratio/low_mean": 0.0012224939418956637,
-      "clip_ratio/low_min": 0.0012224939418956637,
-      "clip_ratio/region_mean": 0.0012224939418956637,
-      "entropy": 0.072004035115242,
       "epoch": 0.01,
-      "grad_norm": 0.36428698897361755,
-      "learning_rate": 1e-06,
-      "loss": 0.0935,
       "step": 5
     },
     {
@@ -94,23 +146,23 @@
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.0,
-      "completions/max_length": 697.0,
-      "completions/max_terminated_length": 697.0,
-      "completions/mean_length": 680.0,
-      "completions/mean_terminated_length": 680.0,
-      "completions/min_length": 662.0,
-      "completions/min_terminated_length": 662.0,
-      "entropy": 0.04808889329433441,
       "epoch": 0.012,
       "frac_reward_zero_std": 0.0,
-      "grad_norm": 0.3064998388290405,
-      "learning_rate": 1e-06,
-      "loss": -0.0905,
-      "num_tokens": 7488.0,
-      "reward": 0.7333332896232605,
-      "reward_std": 0.09316948801279068,
-      "rewards/reward_func_with_saving/mean": 0.7333332896232605,
-      "rewards/reward_func_with_saving/std": 0.09316948801279068,
       "step": 6
     },
     {
@@ -119,11 +171,24 @@
       "clip_ratio/low_mean": 0.0,
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
-      "entropy": 0.050035636872053146,
       "epoch": 0.014,
-      "grad_norm": 1.2182977199554443,
-      "learning_rate": 1e-06,
-      "loss": 0.348,
       "step": 7
     },
     {
@@ -132,11 +197,24 @@
       "clip_ratio/low_mean": 0.0,
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
-      "entropy": 0.05900810286402702,
       "epoch": 0.016,
-      "grad_norm": 0.30796700716018677,
-      "learning_rate": 1e-06,
-      "loss": -0.0899,
       "step": 8
     },
     {
@@ -145,11 +223,24 @@
       "clip_ratio/low_mean": 0.0,
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
-      "entropy": 0.06963960826396942,
       "epoch": 0.018,
-      "grad_norm": 0.40918684005737305,
-      "learning_rate": 1e-06,
-      "loss": -0.0878,
       "step": 9
     },
     {
@@ -158,11 +249,24 @@
       "clip_ratio/low_mean": 0.0,
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
-      "entropy": 0.06808315217494965,
       "epoch": 0.02,
-      "grad_norm": 0.26108482480049133,
-      "learning_rate": 1e-06,
-      "loss": -0.0915,
       "step": 10
     },
     {
@@ -172,23 +276,23 @@
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.0,
-      "completions/max_length": 796.0,
-      "completions/max_terminated_length": 796.0,
-      "completions/mean_length": 697.2000122070312,
-      "completions/mean_terminated_length": 697.2000122070312,
-      "completions/min_length": 610.0,
-      "completions/min_terminated_length": 610.0,
-      "entropy": 0.032603874802589417,
       "epoch": 0.022,
       "frac_reward_zero_std": 0.0,
-      "grad_norm": 0.21574945747852325,
-      "learning_rate": 1e-06,
-      "loss": -0.1102,
-      "num_tokens": 11084.0,
-      "reward": 0.7483333945274353,
-      "reward_std": 0.04224595054984093,
-      "rewards/reward_func_with_saving/mean": 0.7483333349227905,
-      "rewards/reward_func_with_saving/std": 0.04224596172571182,
       "step": 11
     },
     {
@@ -197,52 +301,26 @@
       "clip_ratio/low_mean": 0.0,
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
-      "entropy": 0.04580014944076538,
       "epoch": 0.024,
-      "grad_norm": 0.6932300329208374,
-      "learning_rate": 1e-06,
-      "loss": 0.3115,
       "step": 12
     },
-    {
-      "clip_ratio/high_max": 0.0,
-      "clip_ratio/high_mean": 0.0,
-      "clip_ratio/low_mean": 0.0,
-      "clip_ratio/low_min": 0.0,
-      "clip_ratio/region_mean": 0.0,
-      "entropy": 0.05027260258793831,
-      "epoch": 0.026,
-      "grad_norm": 0.2965257465839386,
-      "learning_rate": 1e-06,
-      "loss": -0.1438,
-      "step": 13
-    },
-    {
-      "clip_ratio/high_max": 0.0,
-      "clip_ratio/high_mean": 0.0,
-      "clip_ratio/low_mean": 0.0,
-      "clip_ratio/low_min": 0.0,
-      "clip_ratio/region_mean": 0.0,
-      "entropy": 0.05315280705690384,
-      "epoch": 0.028,
-      "grad_norm": 0.01823524758219719,
-      "learning_rate": 1e-06,
-      "loss": -0.0088,
-      "step": 14
-    },
-    {
-      "clip_ratio/high_max": 0.0,
-      "clip_ratio/high_mean": 0.0,
-      "clip_ratio/low_mean": 0.0,
-      "clip_ratio/low_min": 0.0,
-      "clip_ratio/region_mean": 0.0,
-      "entropy": 0.0618734173476696,
-      "epoch": 0.03,
-      "grad_norm": 0.4097316861152649,
-      "learning_rate": 1e-06,
-      "loss": -0.0837,
-      "step": 15
-    },
     {
       "clip_ratio/high_max": 0.0,
       "clip_ratio/high_mean": 0.0,
@@ -250,154 +328,24 @@
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.0,
-      "completions/max_length": 989.0,
-      "completions/max_terminated_length": 989.0,
-      "completions/mean_length": 889.4000244140625,
-      "completions/mean_terminated_length": 889.4000244140625,
       "completions/min_length": 783.0,
       "completions/min_terminated_length": 783.0,
-      "entropy": 0.07022179663181305,
-      "epoch": 0.032,
-      "frac_reward_zero_std": 0.0,
-      "grad_norm": 0.8408637046813965,
-      "learning_rate": 1e-06,
-      "loss": 0.2405,
-      "num_tokens": 15631.0,
-      "reward": 0.6633333563804626,
-      "reward_std": 0.07490735501050949,
-      "rewards/reward_func_with_saving/mean": 0.6633332967758179,
-      "rewards/reward_func_with_saving/std": 0.0749073475599289,
-      "step": 16
-    },
-    {
-      "clip_ratio/high_max": 0.0,
-      "clip_ratio/high_mean": 0.0,
-      "clip_ratio/low_mean": 0.0,
-      "clip_ratio/low_min": 0.0,
-      "clip_ratio/region_mean": 0.0,
-      "entropy": 0.07662880420684814,
-      "epoch": 0.034,
-      "grad_norm": 0.18040509521961212,
-      "learning_rate": 1e-06,
-      "loss": 0.0395,
-      "step": 17
-    },
-    {
-      "clip_ratio/high_max": 0.0,
-      "clip_ratio/high_mean": 0.0,
-      "clip_ratio/low_mean": 0.0,
-      "clip_ratio/low_min": 0.0,
-      "clip_ratio/region_mean": 0.0,
-      "entropy": 0.07824546098709106,
-      "epoch": 0.036,
-      "grad_norm": 0.9069496393203735,
-      "learning_rate": 1e-06,
-      "loss": -0.2769,
-      "step": 18
-    },
-    {
-      "clip_ratio/high_max": 0.0,
-      "clip_ratio/high_mean": 0.0,
-      "clip_ratio/low_mean": 0.0,
-      "clip_ratio/low_min": 0.0,
-      "clip_ratio/region_mean": 0.0,
-      "entropy": 0.07629340887069702,
-      "epoch": 0.038,
-      "grad_norm": 0.20565274357795715,
-      "learning_rate": 1e-06,
-      "loss": 0.0705,
-      "step": 19
-    },
-    {
-      "clip_ratio/high_max": 0.0010515246540307999,
-      "clip_ratio/high_mean": 0.0010515246540307999,
-      "clip_ratio/low_mean": 0.0,
-      "clip_ratio/low_min": 0.0,
-      "clip_ratio/region_mean": 0.0010515246540307999,
-      "entropy": 0.0615997388958931,
-      "epoch": 0.04,
-      "grad_norm": 0.3993583619594574,
-      "learning_rate": 1e-06,
-      "loss": -0.1046,
-      "step": 20
-    },
-    {
-      "clip_ratio/high_max": 0.0,
-      "clip_ratio/high_mean": 0.0,
-      "clip_ratio/low_mean": 0.0,
-      "clip_ratio/low_min": 0.0,
-      "clip_ratio/region_mean": 0.0,
-      "completions/clipped_ratio": 0.0,
-      "completions/max_length": 839.0,
-      "completions/max_terminated_length": 839.0,
-      "completions/mean_length": 745.6000366210938,
-      "completions/mean_terminated_length": 745.6000366210938,
-      "completions/min_length": 617.0,
-      "completions/min_terminated_length": 617.0,
-      "entropy": 0.07856940478086472,
-      "epoch": 0.042,
       "frac_reward_zero_std": 0.0,
-      "grad_norm": 1.6567957401275635,
-      "learning_rate": 1e-06,
-      "loss": 0.2708,
-      "num_tokens": 19444.0,
-      "reward": 0.6916667222976685,
-      "reward_std": 0.06291528046131134,
-      "rewards/reward_func_with_saving/mean": 0.6916666626930237,
-      "rewards/reward_func_with_saving/std": 0.06291527301073074,
-      "step": 21
-    },
-    {
-      "clip_ratio/high_max": 0.0,
-      "clip_ratio/high_mean": 0.0,
-      "clip_ratio/low_mean": 0.0,
-      "clip_ratio/low_min": 0.0,
-      "clip_ratio/region_mean": 0.0,
-      "entropy": 0.06388463824987411,
-      "epoch": 0.044,
-      "grad_norm": 0.2810056209564209,
-      "learning_rate": 1e-06,
-      "loss": 0.0893,
-      "step": 22
-    },
-    {
-      "clip_ratio/high_max": 0.0,
-      "clip_ratio/high_mean": 0.0,
-      "clip_ratio/low_mean": 0.0,
-      "clip_ratio/low_min": 0.0,
-      "clip_ratio/region_mean": 0.0,
-      "entropy": 0.05800528824329376,
-      "epoch": 0.046,
-      "grad_norm": 0.08997764438390732,
-      "learning_rate": 1e-06,
-      "loss": -0.0287,
-      "step": 23
-    },
-    {
-      "clip_ratio/high_max": 0.0,
-      "clip_ratio/high_mean": 0.0,
-      "clip_ratio/low_mean": 0.0,
-      "clip_ratio/low_min": 0.0,
-      "clip_ratio/region_mean": 0.0,
-      "entropy": 0.06156563758850098,
-      "epoch": 0.048,
-      "grad_norm": 0.41701430082321167,
-      "learning_rate": 1e-06,
-      "loss": -0.109,
-      "step": 24
-    },
-    {
-      "clip_ratio/high_max": 0.0016207455191761255,
-      "clip_ratio/high_mean": 0.0016207455191761255,
-      "clip_ratio/low_mean": 0.0,
-      "clip_ratio/low_min": 0.0,
-      "clip_ratio/region_mean": 0.0016207455191761255,
-      "entropy": 0.054504893720149994,
-      "epoch": 0.05,
-      "grad_norm": 1.1660373210906982,
-      "learning_rate": 1e-06,
-      "loss": -0.197,
-      "step": 25
     },
     {
       "clip_ratio/high_max": 0.0,
@@ -406,76 +354,24 @@
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.0,
-      "completions/max_length": 827.0,
-      "completions/max_terminated_length": 827.0,
-      "completions/mean_length": 715.2000122070312,
-      "completions/mean_terminated_length": 715.2000122070312,
-      "completions/min_length": 600.0,
-      "completions/min_terminated_length": 600.0,
-      "entropy": 0.03946581110358238,
-      "epoch": 0.052,
       "frac_reward_zero_std": 0.0,
-      "grad_norm": 0.4461834132671356,
-      "learning_rate": 1e-06,
-      "loss": 0.1631,
-      "num_tokens": 23130.0,
-      "reward": 0.675000011920929,
-      "reward_std": 0.035355355590581894,
-      "rewards/reward_func_with_saving/mean": 0.675000011920929,
-      "rewards/reward_func_with_saving/std": 0.03535535931587219,
-      "step": 26
-    },
-    {
-      "clip_ratio/high_max": 0.0,
-      "clip_ratio/high_mean": 0.0,
-      "clip_ratio/low_mean": 0.0,
-      "clip_ratio/low_min": 0.0,
-      "clip_ratio/region_mean": 0.0,
-      "entropy": 0.08581237494945526,
-      "epoch": 0.054,
-      "grad_norm": 0.5848076939582825,
-      "learning_rate": 1e-06,
-      "loss": 0.1479,
-      "step": 27
-    },
-    {
-      "clip_ratio/high_max": 0.0,
-      "clip_ratio/high_mean": 0.0,
-      "clip_ratio/low_mean": 0.0,
-      "clip_ratio/low_min": 0.0,
-      "clip_ratio/region_mean": 0.0,
-      "entropy": 0.05218680202960968,
-      "epoch": 0.056,
-      "grad_norm": 0.5354483723640442,
-      "learning_rate": 1e-06,
-      "loss": 0.1183,
-      "step": 28
-    },
-    {
-      "clip_ratio/high_max": 0.0,
-      "clip_ratio/high_mean": 0.0,
-      "clip_ratio/low_mean": 0.0,
-      "clip_ratio/low_min": 0.0,
-      "clip_ratio/region_mean": 0.0,
-      "entropy": 0.053608063608407974,
-      "epoch": 0.058,
-      "grad_norm": 1.3185772895812988,
-      "learning_rate": 1e-06,
-      "loss": -0.2595,
-      "step": 29
-    },
-    {
-      "clip_ratio/high_max": 0.001349527621641755,
-      "clip_ratio/high_mean": 0.001349527621641755,
-      "clip_ratio/low_mean": 0.0,
-      "clip_ratio/low_min": 0.0,
-      "clip_ratio/region_mean": 0.001349527621641755,
-      "entropy": 0.075257308781147,
-      "epoch": 0.06,
-      "grad_norm": 0.486283540725708,
-      "learning_rate": 1e-06,
-      "loss": -0.1461,
-      "step": 30
     },
     {
       "clip_ratio/high_max": 0.0,
@@ -484,76 +380,24 @@
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.0,
-      "completions/max_length": 901.0,
-      "completions/max_terminated_length": 901.0,
-      "completions/mean_length": 713.4000244140625,
-      "completions/mean_terminated_length": 713.4000244140625,
-      "completions/min_length": 608.0,
-      "completions/min_terminated_length": 608.0,
-      "entropy": 0.10886804014444351,
-      "epoch": 0.062,
       "frac_reward_zero_std": 0.0,
-      "grad_norm": 0.2377174347639084,
-      "learning_rate": 1e-06,
-      "loss": 0.0543,
-      "num_tokens": 26782.0,
-      "reward": 0.6883333325386047,
-      "reward_std": 0.04951147362589836,
-      "rewards/reward_func_with_saving/mean": 0.6883333325386047,
-      "rewards/reward_func_with_saving/std": 0.049511492252349854,
-      "step": 31
-    },
-    {
-      "clip_ratio/high_max": 0.0,
-      "clip_ratio/high_mean": 0.0,
-      "clip_ratio/low_mean": 0.0,
-      "clip_ratio/low_min": 0.0,
-      "clip_ratio/region_mean": 0.0,
-      "entropy": 0.0800568163394928,
-      "epoch": 0.064,
-      "grad_norm": 0.3408770263195038,
-      "learning_rate": 1e-06,
-      "loss": 0.0811,
-      "step": 32
-    },
-    {
-      "clip_ratio/high_max": 0.0,
-      "clip_ratio/high_mean": 0.0,
-      "clip_ratio/low_mean": 0.0,
-      "clip_ratio/low_min": 0.0,
-      "clip_ratio/region_mean": 0.0,
-      "entropy": 0.11951474100351334,
-      "epoch": 0.066,
-      "grad_norm": 1.2026166915893555,
-      "learning_rate": 1e-06,
-      "loss": -0.2977,
-      "step": 33
-    },
-    {
-      "clip_ratio/high_max": 0.0,
-      "clip_ratio/high_mean": 0.0,
-      "clip_ratio/low_mean": 0.0,
-      "clip_ratio/low_min": 0.0,
-      "clip_ratio/region_mean": 0.0,
-      "entropy": 0.09091290831565857,
-      "epoch": 0.068,
-      "grad_norm": 0.2414180338382721,
-      "learning_rate": 1e-06,
-      "loss": 0.0679,
-      "step": 34
-    },
-    {
-      "clip_ratio/high_max": 0.0,
-      "clip_ratio/high_mean": 0.0,
-      "clip_ratio/low_mean": 0.0,
-      "clip_ratio/low_min": 0.0,
-      "clip_ratio/region_mean": 0.0,
-      "entropy": 0.0767383873462677,
-      "epoch": 0.07,
-      "grad_norm": 0.7556729912757874,
-      "learning_rate": 1e-06,
-      "loss": 0.1462,
-      "step": 35
     },
     {
       "clip_ratio/high_max": 0.0,
@@ -562,83 +406,31 @@
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.0,
-      "completions/max_length": 726.0,
-      "completions/max_terminated_length": 726.0,
-      "completions/mean_length": 660.0,
-      "completions/mean_terminated_length": 660.0,
-      "completions/min_length": 582.0,
-      "completions/min_terminated_length": 582.0,
-      "entropy": 0.04803081601858139,
-      "epoch": 0.072,
       "frac_reward_zero_std": 0.0,
-      "grad_norm": 0.5553380846977234,
-      "learning_rate": 1e-06,
-      "loss": -0.1371,
-      "num_tokens": 30162.0,
-      "reward": 0.73499995470047,
-      "reward_std": 0.057554323226213455,
-      "rewards/reward_func_with_saving/mean": 0.73499995470047,
-      "rewards/reward_func_with_saving/std": 0.057554323226213455,
-      "step": 36
-    },
-    {
-      "clip_ratio/high_max": 0.0,
-      "clip_ratio/high_mean": 0.0,
-      "clip_ratio/low_mean": 0.0,
-      "clip_ratio/low_min": 0.0,
-      "clip_ratio/region_mean": 0.0,
-      "entropy": 0.0411250926554203,
-      "epoch": 0.074,
-      "grad_norm": 0.43590816855430603,
-      "learning_rate": 1e-06,
-      "loss": -0.1224,
-      "step": 37
-    },
-    {
-      "clip_ratio/high_max": 0.0,
-      "clip_ratio/high_mean": 0.0,
-      "clip_ratio/low_mean": 0.0,
-      "clip_ratio/low_min": 0.0,
-      "clip_ratio/region_mean": 0.0,
-      "entropy": 0.0592646561563015,
-      "epoch": 0.076,
-      "grad_norm": 0.40773651003837585,
-      "learning_rate": 1e-06,
-      "loss": 0.1275,
-      "step": 38
-    },
-    {
-      "clip_ratio/high_max": 0.0,
-      "clip_ratio/high_mean": 0.0,
-      "clip_ratio/low_mean": 0.0,
-      "clip_ratio/low_min": 0.0,
-      "clip_ratio/region_mean": 0.0,
-      "entropy": 0.049946531653404236,
-      "epoch": 0.078,
-      "grad_norm": 1.0851061344146729,
-      "learning_rate": 1e-06,
-      "loss": 0.3242,
-      "step": 39
-    },
-    {
-      "clip_ratio/high_max": 0.0,
-      "clip_ratio/high_mean": 0.0,
-      "clip_ratio/low_mean": 0.0,
-      "clip_ratio/low_min": 0.0,
-      "clip_ratio/region_mean": 0.0,
-      "entropy": 0.04744778200984001,
-      "epoch": 0.08,
-      "grad_norm": 0.3738895654678345,
-      "learning_rate": 1e-06,
-      "loss": -0.136,
-      "step": 40
     }
   ],
   "logging_steps": 1,
-  "max_steps": 40,
-  "num_input_tokens_seen": 30162,
   "num_train_epochs": 1,
-  "save_steps": 5,
   "stateful_callbacks": {
     "TrainerControl": {
       "args": {

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.032,
   "eval_steps": 500,
+  "global_step": 16,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1174.0,
+      "completions/max_terminated_length": 1174.0,
+      "completions/mean_length": 909.0,
+      "completions/mean_terminated_length": 909.0,
+      "completions/min_length": 783.0,
+      "completions/min_terminated_length": 783.0,
+      "entropy": 0.05627120193094015,
       "epoch": 0.002,
       "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.5968819260597229,
+      "learning_rate": 5e-06,
+      "loss": -0.0469,
+      "num_tokens": 3712.0,
+      "reward": 0.8966667056083679,
+      "reward_std": 0.050852831453084946,
+      "rewards/reward_func_with_saving/mean": 0.8966667056083679,
+      "rewards/reward_func_with_saving/std": 0.05085281282663345,
       "step": 1
     },
     {
       "clip_ratio/low_mean": 0.0,
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 613.0,
+      "completions/max_terminated_length": 613.0,
+      "completions/mean_length": 611.5,
+      "completions/mean_terminated_length": 611.5,
+      "completions/min_length": 611.0,
+      "completions/min_terminated_length": 611.0,
+      "entropy": 0.02739148633554578,
       "epoch": 0.004,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.4688481092453003,
+      "learning_rate": 5e-06,
+      "loss": -0.0004,
+      "num_tokens": 6254.0,
+      "reward": 0.7022222280502319,
+      "reward_std": 0.02222222089767456,
+      "rewards/reward_func_with_saving/mean": 0.7022222280502319,
+      "rewards/reward_func_with_saving/std": 0.02222222276031971,
       "step": 2
     },
     {
       "clip_ratio/low_mean": 0.0,
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 780.0,
+      "completions/max_terminated_length": 780.0,
+      "completions/mean_length": 703.75,
+      "completions/mean_terminated_length": 703.75,
+      "completions/min_length": 666.0,
+      "completions/min_terminated_length": 666.0,
+      "entropy": 0.10391132719814777,
       "epoch": 0.006,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.0785013437271118,
+      "learning_rate": 5e-06,
+      "loss": -0.016,
+      "num_tokens": 9153.0,
+      "reward": 0.8922222256660461,
+      "reward_std": 0.011111120693385601,
+      "rewards/reward_func_with_saving/mean": 0.8922222256660461,
+      "rewards/reward_func_with_saving/std": 0.011111111380159855,
       "step": 3
     },
     {
       "clip_ratio/high_max": 0.0,
       "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 798.0,
+      "completions/max_terminated_length": 798.0,
+      "completions/mean_length": 732.0,
+      "completions/mean_terminated_length": 732.0,
+      "completions/min_length": 639.0,
+      "completions/min_terminated_length": 639.0,
+      "entropy": 0.07874358911067247,
       "epoch": 0.008,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.7055785655975342,
+      "learning_rate": 5e-06,
+      "loss": 0.0102,
+      "num_tokens": 12157.0,
+      "reward": 0.8300000429153442,
+      "reward_std": 0.08785511553287506,
+      "rewards/reward_func_with_saving/mean": 0.8300000429153442,
+      "rewards/reward_func_with_saving/std": 0.08785512298345566,
       "step": 4
     },
     {
       "clip_ratio/high_max": 0.0,
       "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 731.0,
+      "completions/max_terminated_length": 731.0,
+      "completions/mean_length": 704.5,
+      "completions/mean_terminated_length": 704.5,
+      "completions/min_length": 654.0,
+      "completions/min_terminated_length": 654.0,
+      "entropy": 0.12625528872013092,
       "epoch": 0.01,
+      "frac_reward_zero_std": 1.0,
+      "grad_norm": 0.0,
+      "learning_rate": 5e-06,
+      "loss": 0.0,
+      "num_tokens": 15051.0,
+      "reward": 0.8311111330986023,
+      "reward_std": 0.0,
+      "rewards/reward_func_with_saving/mean": 0.8311111330986023,
+      "rewards/reward_func_with_saving/std": 0.0,
       "step": 5
     },
     {
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1007.0,
+      "completions/max_terminated_length": 1007.0,
+      "completions/mean_length": 816.0,
+      "completions/mean_terminated_length": 816.0,
+      "completions/min_length": 624.0,
+      "completions/min_terminated_length": 624.0,
+      "entropy": 0.1077885851264,
       "epoch": 0.012,
       "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.0341051816940308,
+      "learning_rate": 5e-06,
+      "loss": -0.119,
+      "num_tokens": 18407.0,
+      "reward": 0.8322222232818604,
+      "reward_std": 0.06416287273168564,
+      "rewards/reward_func_with_saving/mean": 0.8322222232818604,
+      "rewards/reward_func_with_saving/std": 0.06416288018226624,
       "step": 6
     },
     {
       "clip_ratio/low_mean": 0.0,
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 783.0,
+      "completions/max_terminated_length": 783.0,
+      "completions/mean_length": 724.5,
+      "completions/mean_terminated_length": 724.5,
+      "completions/min_length": 690.0,
+      "completions/min_terminated_length": 690.0,
+      "entropy": 0.1391423474997282,
       "epoch": 0.014,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.149609088897705,
+      "learning_rate": 5e-06,
+      "loss": -0.004,
+      "num_tokens": 21385.0,
+      "reward": 0.7538889050483704,
+      "reward_std": 0.1363290250301361,
+      "rewards/reward_func_with_saving/mean": 0.7538889050483704,
+      "rewards/reward_func_with_saving/std": 0.1363290250301361,
       "step": 7
     },
     {
       "clip_ratio/low_mean": 0.0,
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 611.0,
+      "completions/max_terminated_length": 611.0,
+      "completions/mean_length": 608.25,
+      "completions/mean_terminated_length": 608.25,
+      "completions/min_length": 601.0,
+      "completions/min_terminated_length": 601.0,
+      "entropy": 0.04191916948184371,
       "epoch": 0.016,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.440533995628357,
+      "learning_rate": 5e-06,
+      "loss": 0.006,
+      "num_tokens": 23894.0,
+      "reward": 0.4983333349227905,
+      "reward_std": 0.09888887405395508,
+      "rewards/reward_func_with_saving/mean": 0.4983333349227905,
+      "rewards/reward_func_with_saving/std": 0.09888887405395508,
       "step": 8
     },
     {
       "clip_ratio/low_mean": 0.0,
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 643.0,
+      "completions/max_terminated_length": 643.0,
+      "completions/mean_length": 609.25,
+      "completions/mean_terminated_length": 609.25,
+      "completions/min_length": 576.0,
+      "completions/min_terminated_length": 576.0,
+      "entropy": 0.04268141835927963,
       "epoch": 0.018,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.0735034942626953,
+      "learning_rate": 5e-06,
+      "loss": -0.0273,
+      "num_tokens": 26407.0,
+      "reward": 0.5416666865348816,
+      "reward_std": 0.15209239721298218,
+      "rewards/reward_func_with_saving/mean": 0.5416666865348816,
+      "rewards/reward_func_with_saving/std": 0.15209239721298218,
       "step": 9
     },
     {
       "clip_ratio/low_mean": 0.0,
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 750.0,
+      "completions/max_terminated_length": 750.0,
+      "completions/mean_length": 678.25,
+      "completions/mean_terminated_length": 678.25,
+      "completions/min_length": 612.0,
+      "completions/min_terminated_length": 612.0,
+      "entropy": 0.10207068175077438,
       "epoch": 0.02,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.8149948120117188,
+      "learning_rate": 5e-06,
+      "loss": 0.0611,
+      "num_tokens": 29192.0,
+      "reward": 0.7227777242660522,
+      "reward_std": 0.12421109527349472,
+      "rewards/reward_func_with_saving/mean": 0.7227777242660522,
+      "rewards/reward_func_with_saving/std": 0.12421111017465591,
       "step": 10
     },
     {
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.0,
+      "completions/max_length": 762.0,
+      "completions/max_terminated_length": 762.0,
+      "completions/mean_length": 659.75,
+      "completions/mean_terminated_length": 659.75,
+      "completions/min_length": 607.0,
+      "completions/min_terminated_length": 607.0,
+      "entropy": 0.11515359580516815,
       "epoch": 0.022,
       "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.5091134309768677,
+      "learning_rate": 5e-06,
+      "loss": 0.0615,
+      "num_tokens": 31907.0,
+      "reward": 0.7888888716697693,
+      "reward_std": 0.10210946202278137,
+      "rewards/reward_func_with_saving/mean": 0.7888888716697693,
+      "rewards/reward_func_with_saving/std": 0.10210946202278137,
       "step": 11
     },
     {
       "clip_ratio/low_mean": 0.0,
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 856.0,
+      "completions/max_terminated_length": 856.0,
+      "completions/mean_length": 705.75,
+      "completions/mean_terminated_length": 705.75,
+      "completions/min_length": 530.0,
+      "completions/min_terminated_length": 530.0,
+      "entropy": 0.15026956051588058,
       "epoch": 0.024,
+      "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.3372756242752075,
+      "learning_rate": 5e-06,
+      "loss": 0.1075,
+      "num_tokens": 34818.0,
+      "reward": 0.7594444751739502,
+      "reward_std": 0.16865848004817963,
+      "rewards/reward_func_with_saving/mean": 0.7594444751739502,
+      "rewards/reward_func_with_saving/std": 0.16865849494934082,
       "step": 12
     },
     {
       "clip_ratio/high_max": 0.0,
       "clip_ratio/high_mean": 0.0,
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.0,
+      "completions/max_length": 919.0,
+      "completions/max_terminated_length": 919.0,
+      "completions/mean_length": 863.0,
+      "completions/mean_terminated_length": 863.0,
       "completions/min_length": 783.0,
       "completions/min_terminated_length": 783.0,
+      "entropy": 0.07314991764724255,
+      "epoch": 0.026,
       "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.5220526456832886,
+      "learning_rate": 5e-06,
+      "loss": 0.0265,
+      "num_tokens": 38338.0,
+      "reward": 0.8427777886390686,
+      "reward_std": 0.021111130714416504,
+      "rewards/reward_func_with_saving/mean": 0.8427777886390686,
+      "rewards/reward_func_with_saving/std": 0.021111130714416504,
+      "step": 13
     },
     {
       "clip_ratio/high_max": 0.0,
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.0,
+      "completions/max_length": 909.0,
+      "completions/max_terminated_length": 909.0,
+      "completions/mean_length": 733.5,
+      "completions/mean_terminated_length": 733.5,
+      "completions/min_length": 602.0,
+      "completions/min_terminated_length": 602.0,
+      "entropy": 0.11434740386903286,
+      "epoch": 0.028,
       "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.422579050064087,
+      "learning_rate": 5e-06,
+      "loss": 0.0105,
+      "num_tokens": 41344.0,
+      "reward": 0.8372222185134888,
+      "reward_std": 0.03222225233912468,
+      "rewards/reward_func_with_saving/mean": 0.8372222185134888,
+      "rewards/reward_func_with_saving/std": 0.032222241163253784,
+      "step": 14
     },
     {
       "clip_ratio/high_max": 0.0,
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.0,
+      "completions/max_length": 655.0,
+      "completions/max_terminated_length": 655.0,
+      "completions/mean_length": 630.0,
+      "completions/mean_terminated_length": 630.0,
+      "completions/min_length": 611.0,
+      "completions/min_terminated_length": 611.0,
+      "entropy": 0.048715847078710794,
+      "epoch": 0.03,
       "frac_reward_zero_std": 0.0,
+      "grad_norm": 0.6969804763793945,
+      "learning_rate": 5e-06,
+      "loss": 0.0151,
+      "num_tokens": 43936.0,
+      "reward": 0.528333306312561,
+      "reward_std": 0.15888887643814087,
+      "rewards/reward_func_with_saving/mean": 0.528333306312561,
+      "rewards/reward_func_with_saving/std": 0.15888887643814087,
+      "step": 15
     },
     {
       "clip_ratio/high_max": 0.0,
       "clip_ratio/low_min": 0.0,
       "clip_ratio/region_mean": 0.0,
       "completions/clipped_ratio": 0.0,
+      "completions/max_length": 879.0,
+      "completions/max_terminated_length": 879.0,
+      "completions/mean_length": 747.25,
+      "completions/mean_terminated_length": 747.25,
+      "completions/min_length": 643.0,
+      "completions/min_terminated_length": 643.0,
+      "entropy": 0.07380866352468729,
+      "epoch": 0.032,
       "frac_reward_zero_std": 0.0,
+      "grad_norm": 1.3967198133468628,
+      "learning_rate": 5e-06,
+      "loss": 0.0736,
+      "num_tokens": 46997.0,
+      "reward": 0.7199999690055847,
+      "reward_std": 0.14237260818481445,
+      "rewards/reward_func_with_saving/mean": 0.7199999690055847,
+      "rewards/reward_func_with_saving/std": 0.14237260818481445,
+      "step": 16
     }
   ],
   "logging_steps": 1,
+  "max_steps": 16,
+  "num_input_tokens_seen": 46997,
   "num_train_epochs": 1,
+  "save_steps": 4,
   "stateful_callbacks": {
     "TrainerControl": {
       "args": {

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e4a06f1dbd5fe57850327672e334124bc880911e2fbf86ed0e967bbdbd99eded
 size 7313

 version https://git-lfs.github.com/spec/v1
+oid sha256:8bb3a8fe277af9bdf1f31540071cd20f21c0470946d3c2d53dfae7edd515712d
 size 7313