Model save

Browse files

Files changed (5) hide show

README.md +68 -0
all_results.json +8 -0
generation_config.json +13 -0
train_results.json +8 -0
trainer_state.json +550 -0

README.md ADDED Viewed

	@@ -0,0 +1,68 @@

+---
+base_model: Qwen/Qwen3-1.7B
+library_name: transformers
+model_name: Qwen3-1.7B-Open-R1-Code-GRPO
+tags:
+- generated_from_trainer
+- trl
+- grpo
+licence: license
+---
+# Model Card for Qwen3-1.7B-Open-R1-Code-GRPO
+This model is a fine-tuned version of [Qwen/Qwen3-1.7B](https://huggingface.co/Qwen/Qwen3-1.7B).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+## Quick start
+```python
+from transformers import pipeline
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="Blancy/Qwen3-1.7B-Open-R1-Code-GRPO", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+## Training procedure
+This model was trained with GRPO (Group Relative Policy Optimization), a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
+### Framework versions
+- TRL: 0.15.2
+- Transformers: 4.52.3
+- PyTorch: 2.5.1
+- Datasets: 3.6.0
+- Tokenizers: 0.21.1
+## Citations
+Cite GRPO as:
+```bibtex
+@article{zhihong2024deepseekmath,
+    title        = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
+    author       = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
+    year         = 2024,
+    eprint       = {arXiv:2402.03300},
+}
+```
+Cite TRL as:
+```bibtex
+@misc{vonwerra2022trl,
+	title        = {{TRL: Transformer Reinforcement Learning}},
+	author       = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
+	year         = 2020,
+	journal      = {GitHub repository},
+	publisher    = {GitHub},
+	howpublished = {\url{https://github.com/huggingface/trl}}
+}
+```

all_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "total_flos": 0.0,
+    "train_loss": 0.024176058914698324,
+    "train_runtime": 19938.5063,
+    "train_samples": 1086,
+    "train_samples_per_second": 0.054,
+    "train_steps_per_second": 0.002
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "bos_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "temperature": 0.6,
+  "top_k": 20,
+  "top_p": 0.95,
+  "transformers_version": "4.52.3"
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "total_flos": 0.0,
+    "train_loss": 0.024176058914698324,
+    "train_runtime": 19938.5063,
+    "train_samples": 1086,
+    "train_samples_per_second": 0.054,
+    "train_steps_per_second": 0.002
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,550 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.0,
+  "eval_steps": 500,
+  "global_step": 39,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "completion_length": 3541.9607543945312,
+      "epoch": 0.02564102564102564,
+      "grad_norm": 5.551374912261963,
+      "kl": 0.0,
+      "learning_rate": 0.0,
+      "loss": 0.0,
+      "reward": 0.3934539742767811,
+      "reward_std": 0.3425147756934166,
+      "rewards/code_reward": 0.3934539742767811,
+      "rewards/format_reward": 0.0,
+      "step": 1
+    },
+    {
+      "completion_length": 3543.0106201171875,
+      "epoch": 0.05128205128205128,
+      "grad_norm": 1.4287736415863037,
+      "kl": 0.0,
+      "learning_rate": 5e-06,
+      "loss": 0.0,
+      "reward": 0.46500000543892384,
+      "reward_std": 0.19661623612046242,
+      "rewards/code_reward": 0.46499999053776264,
+      "rewards/format_reward": 0.0,
+      "step": 2
+    },
+    {
+      "completion_length": 3270.2929077148438,
+      "epoch": 0.07692307692307693,
+      "grad_norm": 2.6348750591278076,
+      "kl": 0.00787353515625,
+      "learning_rate": 1e-05,
+      "loss": 0.0003,
+      "reward": 0.5235317498445511,
+      "reward_std": 0.30637360364198685,
+      "rewards/code_reward": 0.5235317498445511,
+      "rewards/format_reward": 0.0,
+      "step": 3
+    },
+    {
+      "completion_length": 2943.4500122070312,
+      "epoch": 0.10256410256410256,
+      "grad_norm": 0.7752403020858765,
+      "kl": 0.0511474609375,
+      "learning_rate": 9.98378869844137e-06,
+      "loss": 0.002,
+      "reward": 0.5904354751110077,
+      "reward_std": 0.21103981602936983,
+      "rewards/code_reward": 0.5904354676604271,
+      "rewards/format_reward": 0.0,
+      "step": 4
+    },
+    {
+      "completion_length": 3438.1785888671875,
+      "epoch": 0.1282051282051282,
+      "grad_norm": 3.3206803798675537,
+      "kl": 0.120361328125,
+      "learning_rate": 9.935271596564688e-06,
+      "loss": 0.0048,
+      "reward": 0.47174738347530365,
+      "reward_std": 0.1500780526548624,
+      "rewards/code_reward": 0.47174738347530365,
+      "rewards/format_reward": 0.0,
+      "step": 5
+    },
+    {
+      "completion_length": 3396.6856689453125,
+      "epoch": 0.15384615384615385,
+      "grad_norm": 0.2983720600605011,
+      "kl": 0.2265625,
+      "learning_rate": 9.854798261200746e-06,
+      "loss": 0.0091,
+      "reward": 0.5083690956234932,
+      "reward_std": 0.23467476293444633,
+      "rewards/code_reward": 0.5083690956234932,
+      "rewards/format_reward": 0.0,
+      "step": 6
+    },
+    {
+      "completion_length": 3636.0071411132812,
+      "epoch": 0.1794871794871795,
+      "grad_norm": 0.29332682490348816,
+      "kl": 0.26611328125,
+      "learning_rate": 9.74294850457488e-06,
+      "loss": 0.0106,
+      "reward": 0.387291356921196,
+      "reward_std": 0.23526490107178688,
+      "rewards/code_reward": 0.387291356921196,
+      "rewards/format_reward": 0.0,
+      "step": 7
+    },
+    {
+      "completion_length": 3375.800048828125,
+      "epoch": 0.20512820512820512,
+      "grad_norm": 0.3416775166988373,
+      "kl": 0.3359375,
+      "learning_rate": 9.600528206746613e-06,
+      "loss": 0.0134,
+      "reward": 0.48297154158353806,
+      "reward_std": 0.25456428155303,
+      "rewards/code_reward": 0.48297156393527985,
+      "rewards/format_reward": 0.0,
+      "step": 8
+    },
+    {
+      "completion_length": 3155.9608154296875,
+      "epoch": 0.23076923076923078,
+      "grad_norm": 0.240010604262352,
+      "kl": 0.3779296875,
+      "learning_rate": 9.428563509225348e-06,
+      "loss": 0.0151,
+      "reward": 0.5900027677416801,
+      "reward_std": 0.18181878328323364,
+      "rewards/code_reward": 0.5900027677416801,
+      "rewards/format_reward": 0.0,
+      "step": 9
+    },
+    {
+      "completion_length": 3539.346435546875,
+      "epoch": 0.2564102564102564,
+      "grad_norm": 0.23563309013843536,
+      "kl": 0.3955078125,
+      "learning_rate": 9.22829342159729e-06,
+      "loss": 0.0158,
+      "reward": 0.4020918384194374,
+      "reward_std": 0.22648156061768532,
+      "rewards/code_reward": 0.4020918384194374,
+      "rewards/format_reward": 0.0,
+      "step": 10
+    },
+    {
+      "completion_length": 3615.935791015625,
+      "epoch": 0.28205128205128205,
+      "grad_norm": 0.19946347177028656,
+      "kl": 0.47216796875,
+      "learning_rate": 9.001160894432979e-06,
+      "loss": 0.0189,
+      "reward": 0.41117217019200325,
+      "reward_std": 0.20069691445678473,
+      "rewards/code_reward": 0.41117217019200325,
+      "rewards/format_reward": 0.0,
+      "step": 11
+    },
+    {
+      "completion_length": 3652.20361328125,
+      "epoch": 0.3076923076923077,
+      "grad_norm": 0.21409198641777039,
+      "kl": 0.556640625,
+      "learning_rate": 8.748802422795361e-06,
+      "loss": 0.0222,
+      "reward": 0.41303257271647453,
+      "reward_std": 0.2046195026487112,
+      "rewards/code_reward": 0.41303258016705513,
+      "rewards/format_reward": 0.0,
+      "step": 12
+    },
+    {
+      "completion_length": 3682.6357421875,
+      "epoch": 0.3333333333333333,
+      "grad_norm": 0.6496288180351257,
+      "kl": 0.580078125,
+      "learning_rate": 8.473036255255368e-06,
+      "loss": 0.0232,
+      "reward": 0.4388655610382557,
+      "reward_std": 0.22819811291992664,
+      "rewards/code_reward": 0.4388655610382557,
+      "rewards/format_reward": 0.0,
+      "step": 13
+    },
+    {
+      "completion_length": 3709.0142211914062,
+      "epoch": 0.358974358974359,
+      "grad_norm": 0.23685197532176971,
+      "kl": 0.6474609375,
+      "learning_rate": 8.175849293369292e-06,
+      "loss": 0.0259,
+      "reward": 0.37063881754875183,
+      "reward_std": 0.23854901269078255,
+      "rewards/code_reward": 0.37063881754875183,
+      "rewards/format_reward": 0.0,
+      "step": 14
+    },
+    {
+      "completion_length": 3758.8857421875,
+      "epoch": 0.38461538461538464,
+      "grad_norm": 0.22902953624725342,
+      "kl": 0.66015625,
+      "learning_rate": 7.859382776007544e-06,
+      "loss": 0.0264,
+      "reward": 0.31185516342520714,
+      "reward_std": 0.22547182254493237,
+      "rewards/code_reward": 0.31185516342520714,
+      "rewards/format_reward": 0.0,
+      "step": 15
+    },
+    {
+      "completion_length": 3474.4249877929688,
+      "epoch": 0.41025641025641024,
+      "grad_norm": 0.2739468514919281,
+      "kl": 0.658203125,
+      "learning_rate": 7.52591685167953e-06,
+      "loss": 0.0263,
+      "reward": 0.46756455302238464,
+      "reward_std": 0.21291000582277775,
+      "rewards/code_reward": 0.46756456792354584,
+      "rewards/format_reward": 0.0,
+      "step": 16
+    },
+    {
+      "completion_length": 3608.0107421875,
+      "epoch": 0.4358974358974359,
+      "grad_norm": 0.26821818947792053,
+      "kl": 0.66796875,
+      "learning_rate": 7.1778541500113895e-06,
+      "loss": 0.0267,
+      "reward": 0.33532585576176643,
+      "reward_std": 0.29637256264686584,
+      "rewards/code_reward": 0.33532586693763733,
+      "rewards/format_reward": 0.0,
+      "step": 17
+    },
+    {
+      "completion_length": 3718.9964599609375,
+      "epoch": 0.46153846153846156,
+      "grad_norm": 0.24430078268051147,
+      "kl": 0.6259765625,
+      "learning_rate": 6.817702470744477e-06,
+      "loss": 0.025,
+      "reward": 0.22364513762295246,
+      "reward_std": 0.23529299348592758,
+      "rewards/code_reward": 0.22364513762295246,
+      "rewards/format_reward": 0.0,
+      "step": 18
+    },
+    {
+      "completion_length": 3312.9071044921875,
+      "epoch": 0.48717948717948717,
+      "grad_norm": 0.2842939794063568,
+      "kl": 0.5859375,
+      "learning_rate": 6.448056714980768e-06,
+      "loss": 0.0234,
+      "reward": 0.46438145264983177,
+      "reward_std": 0.2700807861983776,
+      "rewards/code_reward": 0.46438145637512207,
+      "rewards/format_reward": 0.0,
+      "step": 19
+    },
+    {
+      "completion_length": 3342.6749877929688,
+      "epoch": 0.5128205128205128,
+      "grad_norm": 0.2545294761657715,
+      "kl": 0.5576171875,
+      "learning_rate": 6.071580188860955e-06,
+      "loss": 0.0223,
+      "reward": 0.5150031447410583,
+      "reward_std": 0.2922932505607605,
+      "rewards/code_reward": 0.5150031298398972,
+      "rewards/format_reward": 0.0,
+      "step": 20
+    },
+    {
+      "completion_length": 2976.3214111328125,
+      "epoch": 0.5384615384615384,
+      "grad_norm": 0.23293468356132507,
+      "kl": 0.6181640625,
+      "learning_rate": 5.690985414382668e-06,
+      "loss": 0.0247,
+      "reward": 0.46903225034475327,
+      "reward_std": 0.23096787184476852,
+      "rewards/code_reward": 0.4690322279930115,
+      "rewards/format_reward": 0.0,
+      "step": 21
+    },
+    {
+      "completion_length": 3359.9892578125,
+      "epoch": 0.5641025641025641,
+      "grad_norm": 0.2094847410917282,
+      "kl": 0.5703125,
+      "learning_rate": 5.309014585617335e-06,
+      "loss": 0.0228,
+      "reward": 0.40152605809271336,
+      "reward_std": 0.2346283309161663,
+      "rewards/code_reward": 0.40152604319155216,
+      "rewards/format_reward": 0.0,
+      "step": 22
+    },
+    {
+      "completion_length": 2678.5321655273438,
+      "epoch": 0.5897435897435898,
+      "grad_norm": 0.24114222824573517,
+      "kl": 0.5654296875,
+      "learning_rate": 4.928419811139046e-06,
+      "loss": 0.0227,
+      "reward": 0.608155146241188,
+      "reward_std": 0.14721081405878067,
+      "rewards/code_reward": 0.6081551611423492,
+      "rewards/format_reward": 0.0,
+      "step": 23
+    },
+    {
+      "completion_length": 2912.8821411132812,
+      "epoch": 0.6153846153846154,
+      "grad_norm": 0.3605176508426666,
+      "kl": 0.5908203125,
+      "learning_rate": 4.551943285019233e-06,
+      "loss": 0.0236,
+      "reward": 0.5329480394721031,
+      "reward_std": 0.1990287434309721,
+      "rewards/code_reward": 0.5329480245709419,
+      "rewards/format_reward": 0.0,
+      "step": 24
+    },
+    {
+      "completion_length": 3301.3857421875,
+      "epoch": 0.6410256410256411,
+      "grad_norm": 6.35435152053833,
+      "kl": 0.716796875,
+      "learning_rate": 4.182297529255525e-06,
+      "loss": 0.0287,
+      "reward": 0.36159154400229454,
+      "reward_std": 0.2206190638244152,
+      "rewards/code_reward": 0.36159154027700424,
+      "rewards/format_reward": 0.0,
+      "step": 25
+    },
+    {
+      "completion_length": 3339.2214965820312,
+      "epoch": 0.6666666666666666,
+      "grad_norm": 0.4265838861465454,
+      "kl": 0.7275390625,
+      "learning_rate": 3.822145849988612e-06,
+      "loss": 0.0291,
+      "reward": 0.3386395741254091,
+      "reward_std": 0.18014118261635303,
+      "rewards/code_reward": 0.3386395741254091,
+      "rewards/format_reward": 0.0,
+      "step": 26
+    },
+    {
+      "completion_length": 3056.1214599609375,
+      "epoch": 0.6923076923076923,
+      "grad_norm": 0.4635300636291504,
+      "kl": 0.7158203125,
+      "learning_rate": 3.4740831483204696e-06,
+      "loss": 0.0286,
+      "reward": 0.4216299280524254,
+      "reward_std": 0.24011335149407387,
+      "rewards/code_reward": 0.4216299429535866,
+      "rewards/format_reward": 0.0,
+      "step": 27
+    },
+    {
+      "completion_length": 2741.1821899414062,
+      "epoch": 0.717948717948718,
+      "grad_norm": 0.26547616720199585,
+      "kl": 0.6884765625,
+      "learning_rate": 3.1406172239924583e-06,
+      "loss": 0.0275,
+      "reward": 0.5913942456245422,
+      "reward_std": 0.19475560076534748,
+      "rewards/code_reward": 0.5913942456245422,
+      "rewards/format_reward": 0.0,
+      "step": 28
+    },
+    {
+      "completion_length": 3502.3857421875,
+      "epoch": 0.7435897435897436,
+      "grad_norm": 0.3135121464729309,
+      "kl": 0.8955078125,
+      "learning_rate": 2.8241507066307106e-06,
+      "loss": 0.0358,
+      "reward": 0.33309811167418957,
+      "reward_std": 0.2021036557853222,
+      "rewards/code_reward": 0.3330980967730284,
+      "rewards/format_reward": 0.0,
+      "step": 29
+    },
+    {
+      "completion_length": 3053.2321166992188,
+      "epoch": 0.7692307692307693,
+      "grad_norm": 0.25711485743522644,
+      "kl": 0.880859375,
+      "learning_rate": 2.526963744744635e-06,
+      "loss": 0.0352,
+      "reward": 0.44350775331258774,
+      "reward_std": 0.19116137735545635,
+      "rewards/code_reward": 0.44350775331258774,
+      "rewards/format_reward": 0.0,
+      "step": 30
+    },
+    {
+      "completion_length": 3173.0178833007812,
+      "epoch": 0.7948717948717948,
+      "grad_norm": 0.3827289044857025,
+      "kl": 0.8798828125,
+      "learning_rate": 2.2511975772046403e-06,
+      "loss": 0.0352,
+      "reward": 0.42228348553180695,
+      "reward_std": 0.205778568983078,
+      "rewards/code_reward": 0.42228347808122635,
+      "rewards/format_reward": 0.0,
+      "step": 31
+    },
+    {
+      "completion_length": 3241.3535766601562,
+      "epoch": 0.8205128205128205,
+      "grad_norm": 0.4431661367416382,
+      "kl": 0.9033203125,
+      "learning_rate": 1.9988391055670234e-06,
+      "loss": 0.0362,
+      "reward": 0.3359471336007118,
+      "reward_std": 0.20435638166964054,
+      "rewards/code_reward": 0.3359471336007118,
+      "rewards/format_reward": 0.0,
+      "step": 32
+    },
+    {
+      "completion_length": 2643.057159423828,
+      "epoch": 0.8461538461538461,
+      "grad_norm": 0.5116384029388428,
+      "kl": 0.8017578125,
+      "learning_rate": 1.771706578402711e-06,
+      "loss": 0.0321,
+      "reward": 0.5609605759382248,
+      "reward_std": 0.18299106322228909,
+      "rewards/code_reward": 0.5609605610370636,
+      "rewards/format_reward": 0.0,
+      "step": 33
+    },
+    {
+      "completion_length": 2872.3214721679688,
+      "epoch": 0.8717948717948718,
+      "grad_norm": 0.44135168194770813,
+      "kl": 0.962890625,
+      "learning_rate": 1.5714364907746535e-06,
+      "loss": 0.0385,
+      "reward": 0.41167889907956123,
+      "reward_std": 0.1887526996433735,
+      "rewards/code_reward": 0.41167885810136795,
+      "rewards/format_reward": 0.0,
+      "step": 34
+    },
+    {
+      "completion_length": 2978.9928588867188,
+      "epoch": 0.8974358974358975,
+      "grad_norm": 0.6080113053321838,
+      "kl": 1.0380859375,
+      "learning_rate": 1.399471793253389e-06,
+      "loss": 0.0416,
+      "reward": 0.3960940055549145,
+      "reward_std": 0.20829082280397415,
+      "rewards/code_reward": 0.3960940055549145,
+      "rewards/format_reward": 0.0,
+      "step": 35
+    },
+    {
+      "completion_length": 3258.403564453125,
+      "epoch": 0.9230769230769231,
+      "grad_norm": 0.41639065742492676,
+      "kl": 1.072265625,
+      "learning_rate": 1.257051495425121e-06,
+      "loss": 0.0429,
+      "reward": 0.1503874734044075,
+      "reward_std": 0.12068512104451656,
+      "rewards/code_reward": 0.15038747526705265,
+      "rewards/format_reward": 0.0,
+      "step": 36
+    },
+    {
+      "completion_length": 2731.4713745117188,
+      "epoch": 0.9487179487179487,
+      "grad_norm": 0.33002978563308716,
+      "kl": 1.0263671875,
+      "learning_rate": 1.1452017387992552e-06,
+      "loss": 0.041,
+      "reward": 0.234993327409029,
+      "reward_std": 0.1311760265380144,
+      "rewards/code_reward": 0.234993327409029,
+      "rewards/format_reward": 0.0,
+      "step": 37
+    },
+    {
+      "completion_length": 2790.7571411132812,
+      "epoch": 0.9743589743589743,
+      "grad_norm": 0.3518536388874054,
+      "kl": 1.005859375,
+      "learning_rate": 1.0647284034353122e-06,
+      "loss": 0.0402,
+      "reward": 0.28501298278570175,
+      "reward_std": 0.07613634853623807,
+      "rewards/code_reward": 0.28501297906041145,
+      "rewards/format_reward": 0.0,
+      "step": 38
+    },
+    {
+      "completion_length": 2785.1597290039062,
+      "epoch": 1.0,
+      "grad_norm": 0.6018698215484619,
+      "kl": 1.119140625,
+      "learning_rate": 1.0162113015586309e-06,
+      "loss": 0.0448,
+      "reward": 0.26603568717837334,
+      "reward_std": 0.09410964651033282,
+      "rewards/code_reward": 0.26603569462895393,
+      "rewards/format_reward": 0.0,
+      "step": 39
+    },
+    {
+      "epoch": 1.0,
+      "step": 39,
+      "total_flos": 0.0,
+      "train_loss": 0.024176058914698324,
+      "train_runtime": 19938.5063,
+      "train_samples_per_second": 0.054,
+      "train_steps_per_second": 0.002
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 39,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 10,
+  "trial_name": null,
+  "trial_params": null
+}