Model save

Browse files

Files changed (5) hide show

README.md +68 -0
all_results.json +8 -0
generation_config.json +6 -0
train_results.json +8 -0
trainer_state.json +519 -0

README.md ADDED Viewed

	@@ -0,0 +1,68 @@

+---
+base_model: Qwen/Qwen2.5-Math-7B
+library_name: transformers
+model_name: Qwen-2.5-Math-7B-Max-v3
+tags:
+- generated_from_trainer
+- trl
+- grpo
+licence: license
+---
+# Model Card for Qwen-2.5-Math-7B-Max-v3
+This model is a fine-tuned version of [Qwen/Qwen2.5-Math-7B](https://huggingface.co/Qwen/Qwen2.5-Math-7B).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+## Quick start
+```python
+from transformers import pipeline
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="chenggong1995/Qwen-2.5-Math-7B-Max-v3", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+## Training procedure
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/gongc1995-city-university-of-hong-kong/huggingface/runs/v5cfvn9b)
+This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
+### Framework versions
+- TRL: 0.16.0.dev0
+- Transformers: 4.49.0
+- PyTorch: 2.5.1
+- Datasets: 3.3.2
+- Tokenizers: 0.21.0
+## Citations
+Cite GRPO as:
+```bibtex
+@article{zhihong2024deepseekmath,
+    title        = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
+    author       = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Xiao Bi and Haowei Zhang and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
+    year         = 2024,
+    eprint       = {arXiv:2402.03300},
+}
+```
+Cite TRL as:
+```bibtex
+@misc{vonwerra2022trl,
+	title        = {{TRL: Transformer Reinforcement Learning}},
+	author       = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
+	year         = 2020,
+	journal      = {GitHub repository},
+	publisher    = {GitHub},
+	howpublished = {\url{https://github.com/huggingface/trl}}
+}
+```

all_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "total_flos": 0.0,
+    "train_loss": 0.03207497191133684,
+    "train_runtime": 41076.4058,
+    "train_samples": 7500,
+    "train_samples_per_second": 0.548,
+    "train_steps_per_second": 0.004
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "max_new_tokens": 2048,
+  "transformers_version": "4.49.0"
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "total_flos": 0.0,
+    "train_loss": 0.03207497191133684,
+    "train_runtime": 41076.4058,
+    "train_samples": 7500,
+    "train_samples_per_second": 0.548,
+    "train_steps_per_second": 0.004
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,519 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.9893390191897655,
+  "eval_steps": 60,
+  "global_step": 174,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 705.7877807617188,
+      "epoch": 0.017057569296375266,
+      "grad_norm": 0.2958298623561859,
+      "kl": 0.0,
+      "learning_rate": 1e-06,
+      "loss": 0.0834,
+      "reward": 0.47402435541152954,
+      "reward_std": 0.3045174852013588,
+      "rewards/cosine_scaled_reward": 0.47402435541152954,
+      "step": 1
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 692.6982593536377,
+      "epoch": 0.08528784648187633,
+      "grad_norm": 0.17230483889579773,
+      "kl": 0.00200594961643219,
+      "learning_rate": 1e-06,
+      "loss": 0.1076,
+      "reward": 0.4518069000914693,
+      "reward_std": 0.2899208152666688,
+      "rewards/cosine_scaled_reward": 0.4518069000914693,
+      "step": 5
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 685.9979393005372,
+      "epoch": 0.17057569296375266,
+      "grad_norm": 0.16429604589939117,
+      "kl": 0.00012786388397216796,
+      "learning_rate": 1e-06,
+      "loss": 0.1077,
+      "reward": 0.4576279394328594,
+      "reward_std": 0.2894581612199545,
+      "rewards/cosine_scaled_reward": 0.4576279394328594,
+      "step": 10
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 664.4338722229004,
+      "epoch": 0.255863539445629,
+      "grad_norm": 0.29537156224250793,
+      "kl": 0.0002796053886413574,
+      "learning_rate": 1e-06,
+      "loss": 0.0932,
+      "reward": 0.48066430613398553,
+      "reward_std": 0.2709693659096956,
+      "rewards/cosine_scaled_reward": 0.48066430613398553,
+      "step": 15
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 661.8659011840821,
+      "epoch": 0.3411513859275053,
+      "grad_norm": 0.17067702114582062,
+      "kl": 0.0004772186279296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0929,
+      "reward": 0.4877126231789589,
+      "reward_std": 0.26678171902894976,
+      "rewards/cosine_scaled_reward": 0.4877126231789589,
+      "step": 20
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 686.9898681640625,
+      "epoch": 0.42643923240938164,
+      "grad_norm": 0.22597813606262207,
+      "kl": 0.00075836181640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0922,
+      "reward": 0.4946805603802204,
+      "reward_std": 0.26594343446195123,
+      "rewards/cosine_scaled_reward": 0.4946805603802204,
+      "step": 25
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 700.4682479858399,
+      "epoch": 0.511727078891258,
+      "grad_norm": 0.1372915506362915,
+      "kl": 0.0009161949157714844,
+      "learning_rate": 1e-06,
+      "loss": 0.0724,
+      "reward": 0.49349360913038254,
+      "reward_std": 0.2691137969493866,
+      "rewards/cosine_scaled_reward": 0.49349360913038254,
+      "step": 30
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 678.8015800476074,
+      "epoch": 0.5970149253731343,
+      "grad_norm": 0.11743893474340439,
+      "kl": 0.0013933181762695312,
+      "learning_rate": 1e-06,
+      "loss": 0.0516,
+      "reward": 0.49990383088588713,
+      "reward_std": 0.2354368444532156,
+      "rewards/cosine_scaled_reward": 0.49990383088588713,
+      "step": 35
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 670.113818359375,
+      "epoch": 0.6823027718550106,
+      "grad_norm": 0.1322937160730362,
+      "kl": 0.001470184326171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0423,
+      "reward": 0.4925546832382679,
+      "reward_std": 0.23934022188186646,
+      "rewards/cosine_scaled_reward": 0.4925546832382679,
+      "step": 40
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 693.9797073364258,
+      "epoch": 0.767590618336887,
+      "grad_norm": 0.11684294044971466,
+      "kl": 0.0016462326049804688,
+      "learning_rate": 1e-06,
+      "loss": 0.0472,
+      "reward": 0.5110804848372936,
+      "reward_std": 0.24233248196542262,
+      "rewards/cosine_scaled_reward": 0.5110804848372936,
+      "step": 45
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 703.8312713623047,
+      "epoch": 0.8528784648187633,
+      "grad_norm": 0.13510337471961975,
+      "kl": 0.0022981643676757814,
+      "learning_rate": 1e-06,
+      "loss": 0.0378,
+      "reward": 0.5143898174166679,
+      "reward_std": 0.24455392695963382,
+      "rewards/cosine_scaled_reward": 0.5143898174166679,
+      "step": 50
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 676.6164245605469,
+      "epoch": 0.9381663113006397,
+      "grad_norm": 0.12922672927379608,
+      "kl": 0.0027494430541992188,
+      "learning_rate": 1e-06,
+      "loss": 0.0368,
+      "reward": 0.538255549967289,
+      "reward_std": 0.23528089113533496,
+      "rewards/cosine_scaled_reward": 0.538255549967289,
+      "step": 55
+    },
+    {
+      "epoch": 1.0341151385927505,
+      "grad_norm": 0.1454162746667862,
+      "learning_rate": 1e-06,
+      "loss": 0.0173,
+      "step": 60
+    },
+    {
+      "epoch": 1.0341151385927505,
+      "eval_clip_ratio": 0.0,
+      "eval_completion_length": 693.2279481887817,
+      "eval_kl": 0.004572391510009766,
+      "eval_loss": 0.017638780176639557,
+      "eval_reward": 0.4241527561098337,
+      "eval_reward_std": 0.27252104552462697,
+      "eval_rewards/cosine_scaled_reward": 0.4241527561098337,
+      "eval_runtime": 731.0086,
+      "eval_samples_per_second": 0.684,
+      "eval_steps_per_second": 0.008,
+      "step": 60
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 658.7437713623046,
+      "epoch": 1.1194029850746268,
+      "grad_norm": 0.14889299869537354,
+      "kl": 0.004099464416503907,
+      "learning_rate": 1e-06,
+      "loss": 0.0233,
+      "reward": 0.5276085119694471,
+      "reward_std": 0.22797267828136683,
+      "rewards/cosine_scaled_reward": 0.5276085119694471,
+      "step": 65
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 661.8356964111329,
+      "epoch": 1.2046908315565032,
+      "grad_norm": 0.13085317611694336,
+      "kl": 0.0031612396240234377,
+      "learning_rate": 1e-06,
+      "loss": 0.0077,
+      "reward": 0.5444076530635357,
+      "reward_std": 0.22460445892065764,
+      "rewards/cosine_scaled_reward": 0.5444076530635357,
+      "step": 70
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 702.0494995117188,
+      "epoch": 1.2899786780383795,
+      "grad_norm": 0.15662160515785217,
+      "kl": 0.003662109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0192,
+      "reward": 0.5370461963117122,
+      "reward_std": 0.23422690220177173,
+      "rewards/cosine_scaled_reward": 0.5370461963117122,
+      "step": 75
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 675.3036636352539,
+      "epoch": 1.375266524520256,
+      "grad_norm": 0.14926958084106445,
+      "kl": 0.004243087768554687,
+      "learning_rate": 1e-06,
+      "loss": 0.0059,
+      "reward": 0.5500567473471165,
+      "reward_std": 0.2124465636909008,
+      "rewards/cosine_scaled_reward": 0.5500567473471165,
+      "step": 80
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 690.0687698364258,
+      "epoch": 1.4605543710021323,
+      "grad_norm": 0.12932759523391724,
+      "kl": 0.004961395263671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0239,
+      "reward": 0.5469569325447082,
+      "reward_std": 0.22115669399499893,
+      "rewards/cosine_scaled_reward": 0.5469569325447082,
+      "step": 85
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 704.1914291381836,
+      "epoch": 1.5458422174840085,
+      "grad_norm": 0.27263641357421875,
+      "kl": 0.011516571044921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0134,
+      "reward": 0.540603245049715,
+      "reward_std": 0.220616265386343,
+      "rewards/cosine_scaled_reward": 0.540603245049715,
+      "step": 90
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 705.2396011352539,
+      "epoch": 1.6311300639658848,
+      "grad_norm": 0.1261100471019745,
+      "kl": 0.0060760498046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0069,
+      "reward": 0.5562940575182438,
+      "reward_std": 0.22738375030457975,
+      "rewards/cosine_scaled_reward": 0.5562940575182438,
+      "step": 95
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 708.0935134887695,
+      "epoch": 1.716417910447761,
+      "grad_norm": 0.20003671944141388,
+      "kl": 0.0080291748046875,
+      "learning_rate": 1e-06,
+      "loss": 0.01,
+      "reward": 0.5645768508315087,
+      "reward_std": 0.22589275762438774,
+      "rewards/cosine_scaled_reward": 0.5645768508315087,
+      "step": 100
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 730.0916885375976,
+      "epoch": 1.8017057569296375,
+      "grad_norm": 0.14709459245204926,
+      "kl": 0.0078338623046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0176,
+      "reward": 0.5447159253060818,
+      "reward_std": 0.2277662731707096,
+      "rewards/cosine_scaled_reward": 0.5447159253060818,
+      "step": 105
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 733.7476760864258,
+      "epoch": 1.886993603411514,
+      "grad_norm": 0.14343297481536865,
+      "kl": 0.009282684326171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0221,
+      "reward": 0.5597066521644593,
+      "reward_std": 0.23097761012613774,
+      "rewards/cosine_scaled_reward": 0.5597066521644593,
+      "step": 110
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 745.3560104370117,
+      "epoch": 1.9722814498933903,
+      "grad_norm": 0.21631674468517303,
+      "kl": 0.011328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0179,
+      "reward": 0.5601713679730892,
+      "reward_std": 0.2243567120283842,
+      "rewards/cosine_scaled_reward": 0.5601713679730892,
+      "step": 115
+    },
+    {
+      "epoch": 2.068230277185501,
+      "grad_norm": 0.22386282682418823,
+      "learning_rate": 1e-06,
+      "loss": 0.0123,
+      "step": 120
+    },
+    {
+      "epoch": 2.068230277185501,
+      "eval_clip_ratio": 0.0,
+      "eval_completion_length": 754.1520328521729,
+      "eval_kl": 0.016979217529296875,
+      "eval_loss": 0.012520050629973412,
+      "eval_reward": 0.4722373131662607,
+      "eval_reward_std": 0.2587718339636922,
+      "eval_rewards/cosine_scaled_reward": 0.4722373131662607,
+      "eval_runtime": 724.6498,
+      "eval_samples_per_second": 0.69,
+      "eval_steps_per_second": 0.008,
+      "step": 120
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 749.0557495117188,
+      "epoch": 2.1535181236673773,
+      "grad_norm": 0.17756131291389465,
+      "kl": 0.0161590576171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0042,
+      "reward": 0.5662507023662329,
+      "reward_std": 0.22460255604237317,
+      "rewards/cosine_scaled_reward": 0.5662507023662329,
+      "step": 125
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 757.4320526123047,
+      "epoch": 2.2388059701492535,
+      "grad_norm": 0.24769122898578644,
+      "kl": 0.02352294921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0179,
+      "reward": 0.5909504756331444,
+      "reward_std": 0.22746318429708481,
+      "rewards/cosine_scaled_reward": 0.5909504756331444,
+      "step": 130
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 778.7044509887695,
+      "epoch": 2.3240938166311302,
+      "grad_norm": 0.25195103883743286,
+      "kl": 0.02833251953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0091,
+      "reward": 0.5315394312143326,
+      "reward_std": 0.23098385594785215,
+      "rewards/cosine_scaled_reward": 0.5315394312143326,
+      "step": 135
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 780.8755416870117,
+      "epoch": 2.4093816631130065,
+      "grad_norm": 0.3621465265750885,
+      "kl": 0.0350982666015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0015,
+      "reward": 0.5728602990508079,
+      "reward_std": 0.24233178310096265,
+      "rewards/cosine_scaled_reward": 0.5728602990508079,
+      "step": 140
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 777.0599136352539,
+      "epoch": 2.4946695095948828,
+      "grad_norm": 0.2630998492240906,
+      "kl": 0.053851318359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0229,
+      "reward": 0.5749510392546654,
+      "reward_std": 0.25164939016103743,
+      "rewards/cosine_scaled_reward": 0.5749510392546654,
+      "step": 145
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 806.2307495117187,
+      "epoch": 2.579957356076759,
+      "grad_norm": 0.7152215242385864,
+      "kl": 0.075592041015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0256,
+      "reward": 0.5316810458898544,
+      "reward_std": 0.24946709722280502,
+      "rewards/cosine_scaled_reward": 0.5316810458898544,
+      "step": 150
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 807.8872589111328,
+      "epoch": 2.6652452025586353,
+      "grad_norm": 0.9139208197593689,
+      "kl": 0.1150634765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0312,
+      "reward": 0.4877690590918064,
+      "reward_std": 0.28349833004176617,
+      "rewards/cosine_scaled_reward": 0.4877690590918064,
+      "step": 155
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 926.7297088623047,
+      "epoch": 2.750533049040512,
+      "grad_norm": 1.1683220863342285,
+      "kl": 0.2032470703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0749,
+      "reward": 0.2821820305660367,
+      "reward_std": 0.3167072061449289,
+      "rewards/cosine_scaled_reward": 0.2821820305660367,
+      "step": 160
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 932.1104385375977,
+      "epoch": 2.835820895522388,
+      "grad_norm": 6.447605609893799,
+      "kl": 0.3660888671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0324,
+      "reward": 0.02865399098955095,
+      "reward_std": 0.30335349403321743,
+      "rewards/cosine_scaled_reward": 0.02865399098955095,
+      "step": 165
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 664.3466369628907,
+      "epoch": 2.9211087420042645,
+      "grad_norm": 26.408769607543945,
+      "kl": 0.82470703125,
+      "learning_rate": 1e-06,
+      "loss": -0.0468,
+      "reward": -0.12603640989982523,
+      "reward_std": 0.2628266651183367,
+      "rewards/cosine_scaled_reward": -0.12603640989982523,
+      "step": 170
+    },
+    {
+      "clip_ratio": 0.0,
+      "completion_length": 562.9258012771606,
+      "epoch": 2.9893390191897655,
+      "kl": 0.9171142578125,
+      "reward": -0.19254306121729314,
+      "reward_std": 0.23406662652269006,
+      "rewards/cosine_scaled_reward": -0.19254306121729314,
+      "step": 174,
+      "total_flos": 0.0,
+      "train_loss": 0.03207497191133684,
+      "train_runtime": 41076.4058,
+      "train_samples_per_second": 0.548,
+      "train_steps_per_second": 0.004
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 174,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}