The-Fool-09 committed on
Commit
7df5122
·
verified ·
1 Parent(s): 573a439

Upload model from Kaggle (zip extracted)

Browse files
Files changed (50) hide show
  1. .gitattributes +4 -0
  2. README.md +69 -0
  3. checkpoint-100/README.md +210 -0
  4. checkpoint-100/adapter_config.json +50 -0
  5. checkpoint-100/adapter_model.safetensors +3 -0
  6. checkpoint-100/chat_template.jinja +54 -0
  7. checkpoint-100/optimizer.pt +3 -0
  8. checkpoint-100/rng_state.pth +3 -0
  9. checkpoint-100/scaler.pt +3 -0
  10. checkpoint-100/scheduler.pt +3 -0
  11. checkpoint-100/tokenizer.json +3 -0
  12. checkpoint-100/tokenizer_config.json +202 -0
  13. checkpoint-100/trainer_state.json +574 -0
  14. checkpoint-100/training_args.bin +3 -0
  15. checkpoint-150/README.md +210 -0
  16. checkpoint-150/adapter_config.json +50 -0
  17. checkpoint-150/adapter_model.safetensors +3 -0
  18. checkpoint-150/chat_template.jinja +54 -0
  19. checkpoint-150/optimizer.pt +3 -0
  20. checkpoint-150/rng_state.pth +3 -0
  21. checkpoint-150/scaler.pt +3 -0
  22. checkpoint-150/scheduler.pt +3 -0
  23. checkpoint-150/tokenizer.json +3 -0
  24. checkpoint-150/tokenizer_config.json +202 -0
  25. checkpoint-150/trainer_state.json +844 -0
  26. checkpoint-150/training_args.bin +3 -0
  27. checkpoint-200/README.md +210 -0
  28. checkpoint-200/adapter_config.json +50 -0
  29. checkpoint-200/adapter_model.safetensors +3 -0
  30. checkpoint-200/chat_template.jinja +54 -0
  31. checkpoint-200/optimizer.pt +3 -0
  32. checkpoint-200/rng_state.pth +3 -0
  33. checkpoint-200/scaler.pt +3 -0
  34. checkpoint-200/scheduler.pt +3 -0
  35. checkpoint-200/tokenizer.json +3 -0
  36. checkpoint-200/tokenizer_config.json +202 -0
  37. checkpoint-200/trainer_state.json +1114 -0
  38. checkpoint-200/training_args.bin +3 -0
  39. checkpoint-50/README.md +210 -0
  40. checkpoint-50/adapter_config.json +50 -0
  41. checkpoint-50/adapter_model.safetensors +3 -0
  42. checkpoint-50/chat_template.jinja +54 -0
  43. checkpoint-50/optimizer.pt +3 -0
  44. checkpoint-50/rng_state.pth +3 -0
  45. checkpoint-50/scaler.pt +3 -0
  46. checkpoint-50/scheduler.pt +3 -0
  47. checkpoint-50/tokenizer.json +3 -0
  48. checkpoint-50/tokenizer_config.json +202 -0
  49. checkpoint-50/trainer_state.json +304 -0
  50. checkpoint-50/training_args.bin +3 -0
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ checkpoint-100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ checkpoint-150/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
+ checkpoint-200/tokenizer.json filter=lfs diff=lfs merge=lfs -text
39
+ checkpoint-50/tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: unsloth/Qwen2.5-Coder-0.5B-Instruct
3
+ library_name: transformers
4
+ model_name: debugzero_model
5
+ tags:
6
+ - generated_from_trainer
7
+ - grpo
8
+ - trl
9
+ - unsloth
10
+ license: apache-2.0
11
+ ---
12
+
13
+ # Model Card for debugzero_model
14
+
15
+ This model is a fine-tuned version of [unsloth/Qwen2.5-Coder-0.5B-Instruct](https://huggingface.co/unsloth/Qwen2.5-Coder-0.5B-Instruct).
16
+ It has been trained using [TRL](https://github.com/huggingface/trl).
17
+
18
+ ## Quick start
19
+
20
+ ```python
21
+ from transformers import pipeline
22
+
23
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
24
+ generator = pipeline("text-generation", model="The-Fool-09/debugzero_model", device="cuda")
25
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
26
+ print(output["generated_text"])
27
+ ```
28
+
29
+ ## Training procedure
30
+
31
+
32
+
33
+
34
+ This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
35
+
36
+ ### Framework versions
37
+
38
+ - TRL: 0.24.0
39
+ - Transformers: 5.5.0
40
+ - Pytorch: 2.10.0+cu128
41
+ - Datasets: 4.3.0
42
+ - Tokenizers: 0.22.2
43
+
44
+ ## Citations
45
+
46
+ Cite GRPO as:
47
+
48
+ ```bibtex
49
+ @article{shao2024deepseekmath,
50
+ title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
51
+ author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
52
+ year = 2024,
53
+ eprint = {arXiv:2402.03300},
54
+ }
55
+
56
+ ```
57
+
58
+ Cite TRL as:
59
+
60
+ ```bibtex
61
+ @misc{vonwerra2022trl,
62
+ title = {{TRL: Transformer Reinforcement Learning}},
63
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
64
+ year = 2020,
65
+ journal = {GitHub repository},
66
+ publisher = {GitHub},
67
+ howpublished = {\url{https://github.com/huggingface/trl}}
68
+ }
69
+ ```
checkpoint-100/README.md ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: unsloth/Qwen2.5-Coder-0.5B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:unsloth/Qwen2.5-Coder-0.5B-Instruct
7
+ - grpo
8
+ - lora
9
+ - transformers
10
+ - trl
11
+ - unsloth
12
+ ---
13
+
14
+ # Model Card for Model ID
15
+
16
+ <!-- Provide a quick summary of what the model is/does. -->
17
+
18
+
19
+
20
+ ## Model Details
21
+
22
+ ### Model Description
23
+
24
+ <!-- Provide a longer summary of what this model is. -->
25
+
26
+
27
+
28
+ - **Developed by:** [More Information Needed]
29
+ - **Funded by [optional]:** [More Information Needed]
30
+ - **Shared by [optional]:** [More Information Needed]
31
+ - **Model type:** [More Information Needed]
32
+ - **Language(s) (NLP):** [More Information Needed]
33
+ - **License:** [More Information Needed]
34
+ - **Finetuned from model [optional]:** [More Information Needed]
35
+
36
+ ### Model Sources [optional]
37
+
38
+ <!-- Provide the basic links for the model. -->
39
+
40
+ - **Repository:** [More Information Needed]
41
+ - **Paper [optional]:** [More Information Needed]
42
+ - **Demo [optional]:** [More Information Needed]
43
+
44
+ ## Uses
45
+
46
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
47
+
48
+ ### Direct Use
49
+
50
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
51
+
52
+ [More Information Needed]
53
+
54
+ ### Downstream Use [optional]
55
+
56
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
57
+
58
+ [More Information Needed]
59
+
60
+ ### Out-of-Scope Use
61
+
62
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
63
+
64
+ [More Information Needed]
65
+
66
+ ## Bias, Risks, and Limitations
67
+
68
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
69
+
70
+ [More Information Needed]
71
+
72
+ ### Recommendations
73
+
74
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
75
+
76
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
77
+
78
+ ## How to Get Started with the Model
79
+
80
+ Use the code below to get started with the model.
81
+
82
+ [More Information Needed]
83
+
84
+ ## Training Details
85
+
86
+ ### Training Data
87
+
88
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
89
+
90
+ [More Information Needed]
91
+
92
+ ### Training Procedure
93
+
94
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
95
+
96
+ #### Preprocessing [optional]
97
+
98
+ [More Information Needed]
99
+
100
+
101
+ #### Training Hyperparameters
102
+
103
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
104
+
105
+ #### Speeds, Sizes, Times [optional]
106
+
107
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
108
+
109
+ [More Information Needed]
110
+
111
+ ## Evaluation
112
+
113
+ <!-- This section describes the evaluation protocols and provides the results. -->
114
+
115
+ ### Testing Data, Factors & Metrics
116
+
117
+ #### Testing Data
118
+
119
+ <!-- This should link to a Dataset Card if possible. -->
120
+
121
+ [More Information Needed]
122
+
123
+ #### Factors
124
+
125
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
126
+
127
+ [More Information Needed]
128
+
129
+ #### Metrics
130
+
131
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
132
+
133
+ [More Information Needed]
134
+
135
+ ### Results
136
+
137
+ [More Information Needed]
138
+
139
+ #### Summary
140
+
141
+
142
+
143
+ ## Model Examination [optional]
144
+
145
+ <!-- Relevant interpretability work for the model goes here -->
146
+
147
+ [More Information Needed]
148
+
149
+ ## Environmental Impact
150
+
151
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
152
+
153
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
154
+
155
+ - **Hardware Type:** [More Information Needed]
156
+ - **Hours used:** [More Information Needed]
157
+ - **Cloud Provider:** [More Information Needed]
158
+ - **Compute Region:** [More Information Needed]
159
+ - **Carbon Emitted:** [More Information Needed]
160
+
161
+ ## Technical Specifications [optional]
162
+
163
+ ### Model Architecture and Objective
164
+
165
+ [More Information Needed]
166
+
167
+ ### Compute Infrastructure
168
+
169
+ [More Information Needed]
170
+
171
+ #### Hardware
172
+
173
+ [More Information Needed]
174
+
175
+ #### Software
176
+
177
+ [More Information Needed]
178
+
179
+ ## Citation [optional]
180
+
181
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
182
+
183
+ **BibTeX:**
184
+
185
+ [More Information Needed]
186
+
187
+ **APA:**
188
+
189
+ [More Information Needed]
190
+
191
+ ## Glossary [optional]
192
+
193
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
194
+
195
+ [More Information Needed]
196
+
197
+ ## More Information [optional]
198
+
199
+ [More Information Needed]
200
+
201
+ ## Model Card Authors [optional]
202
+
203
+ [More Information Needed]
204
+
205
+ ## Model Card Contact
206
+
207
+ [More Information Needed]
208
+ ### Framework versions
209
+
210
+ - PEFT 0.18.1
checkpoint-100/adapter_config.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": {
6
+ "base_model_class": "Qwen2ForCausalLM",
7
+ "parent_library": "transformers.models.qwen2.modeling_qwen2",
8
+ "unsloth_fixed": true
9
+ },
10
+ "base_model_name_or_path": "unsloth/Qwen2.5-Coder-0.5B-Instruct",
11
+ "bias": "none",
12
+ "corda_config": null,
13
+ "ensure_weight_tying": false,
14
+ "eva_config": null,
15
+ "exclude_modules": null,
16
+ "fan_in_fan_out": false,
17
+ "inference_mode": true,
18
+ "init_lora_weights": true,
19
+ "layer_replication": null,
20
+ "layers_pattern": null,
21
+ "layers_to_transform": null,
22
+ "loftq_config": {},
23
+ "lora_alpha": 32,
24
+ "lora_bias": false,
25
+ "lora_dropout": 0,
26
+ "megatron_config": null,
27
+ "megatron_core": "megatron.core",
28
+ "modules_to_save": null,
29
+ "peft_type": "LORA",
30
+ "peft_version": "0.18.1",
31
+ "qalora_group_size": 16,
32
+ "r": 32,
33
+ "rank_pattern": {},
34
+ "revision": null,
35
+ "target_modules": [
36
+ "v_proj",
37
+ "k_proj",
38
+ "o_proj",
39
+ "up_proj",
40
+ "gate_proj",
41
+ "q_proj",
42
+ "down_proj"
43
+ ],
44
+ "target_parameters": null,
45
+ "task_type": "CAUSAL_LM",
46
+ "trainable_token_indices": null,
47
+ "use_dora": false,
48
+ "use_qalora": false,
49
+ "use_rslora": false
50
+ }
checkpoint-100/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:776a58008e6057875a95eb4c9b416067e8bed8fee6055a6c4e789604d0b2da32
3
+ size 70430032
checkpoint-100/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoint-100/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1a5e6c4910d24f8435f25cdc1f357415508b75f8439f7ba16b60adee1f94106
3
+ size 36139685
checkpoint-100/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a36998df7779670e6875d34eafebcb377dcdcbaff70e54ae678ec64164030cbe
3
+ size 14645
checkpoint-100/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4393a84a3109995aa1202073b039b12062e3189ed89aa0b94ef0510ba843009
3
+ size 1383
checkpoint-100/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad152981f692f4cc5629bf2d540a4f8fc379072f49c245a2240b74ca1bc6b09b
3
+ size 1465
checkpoint-100/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af5891a15588546db1ac7f2baf8fa94835a51a85c032c39793a55bb048b47446
3
+ size 11422523
checkpoint-100/tokenizer_config.json ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [],
9
+ "is_local": false,
10
+ "model_max_length": 32768,
11
+ "pad_token": "<|PAD_TOKEN|>",
12
+ "padding_side": "right",
13
+ "split_special_tokens": false,
14
+ "tokenizer_class": "Qwen2Tokenizer",
15
+ "unk_token": null,
16
+ "added_tokens_decoder": {
17
+ "151643": {
18
+ "content": "<|endoftext|>",
19
+ "single_word": false,
20
+ "lstrip": false,
21
+ "rstrip": false,
22
+ "normalized": false,
23
+ "special": true
24
+ },
25
+ "151644": {
26
+ "content": "<|im_start|>",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ "151645": {
34
+ "content": "<|im_end|>",
35
+ "single_word": false,
36
+ "lstrip": false,
37
+ "rstrip": false,
38
+ "normalized": false,
39
+ "special": true
40
+ },
41
+ "151646": {
42
+ "content": "<|object_ref_start|>",
43
+ "single_word": false,
44
+ "lstrip": false,
45
+ "rstrip": false,
46
+ "normalized": false,
47
+ "special": true
48
+ },
49
+ "151647": {
50
+ "content": "<|object_ref_end|>",
51
+ "single_word": false,
52
+ "lstrip": false,
53
+ "rstrip": false,
54
+ "normalized": false,
55
+ "special": true
56
+ },
57
+ "151648": {
58
+ "content": "<|box_start|>",
59
+ "single_word": false,
60
+ "lstrip": false,
61
+ "rstrip": false,
62
+ "normalized": false,
63
+ "special": true
64
+ },
65
+ "151649": {
66
+ "content": "<|box_end|>",
67
+ "single_word": false,
68
+ "lstrip": false,
69
+ "rstrip": false,
70
+ "normalized": false,
71
+ "special": true
72
+ },
73
+ "151650": {
74
+ "content": "<|quad_start|>",
75
+ "single_word": false,
76
+ "lstrip": false,
77
+ "rstrip": false,
78
+ "normalized": false,
79
+ "special": true
80
+ },
81
+ "151651": {
82
+ "content": "<|quad_end|>",
83
+ "single_word": false,
84
+ "lstrip": false,
85
+ "rstrip": false,
86
+ "normalized": false,
87
+ "special": true
88
+ },
89
+ "151652": {
90
+ "content": "<|vision_start|>",
91
+ "single_word": false,
92
+ "lstrip": false,
93
+ "rstrip": false,
94
+ "normalized": false,
95
+ "special": true
96
+ },
97
+ "151653": {
98
+ "content": "<|vision_end|>",
99
+ "single_word": false,
100
+ "lstrip": false,
101
+ "rstrip": false,
102
+ "normalized": false,
103
+ "special": true
104
+ },
105
+ "151654": {
106
+ "content": "<|vision_pad|>",
107
+ "single_word": false,
108
+ "lstrip": false,
109
+ "rstrip": false,
110
+ "normalized": false,
111
+ "special": true
112
+ },
113
+ "151655": {
114
+ "content": "<|image_pad|>",
115
+ "single_word": false,
116
+ "lstrip": false,
117
+ "rstrip": false,
118
+ "normalized": false,
119
+ "special": true
120
+ },
121
+ "151656": {
122
+ "content": "<|video_pad|>",
123
+ "single_word": false,
124
+ "lstrip": false,
125
+ "rstrip": false,
126
+ "normalized": false,
127
+ "special": true
128
+ },
129
+ "151657": {
130
+ "content": "<tool_call>",
131
+ "single_word": false,
132
+ "lstrip": false,
133
+ "rstrip": false,
134
+ "normalized": false,
135
+ "special": false
136
+ },
137
+ "151658": {
138
+ "content": "</tool_call>",
139
+ "single_word": false,
140
+ "lstrip": false,
141
+ "rstrip": false,
142
+ "normalized": false,
143
+ "special": false
144
+ },
145
+ "151659": {
146
+ "content": "<|fim_prefix|>",
147
+ "single_word": false,
148
+ "lstrip": false,
149
+ "rstrip": false,
150
+ "normalized": false,
151
+ "special": false
152
+ },
153
+ "151660": {
154
+ "content": "<|fim_middle|>",
155
+ "single_word": false,
156
+ "lstrip": false,
157
+ "rstrip": false,
158
+ "normalized": false,
159
+ "special": false
160
+ },
161
+ "151661": {
162
+ "content": "<|fim_suffix|>",
163
+ "single_word": false,
164
+ "lstrip": false,
165
+ "rstrip": false,
166
+ "normalized": false,
167
+ "special": false
168
+ },
169
+ "151662": {
170
+ "content": "<|fim_pad|>",
171
+ "single_word": false,
172
+ "lstrip": false,
173
+ "rstrip": false,
174
+ "normalized": false,
175
+ "special": false
176
+ },
177
+ "151663": {
178
+ "content": "<|repo_name|>",
179
+ "single_word": false,
180
+ "lstrip": false,
181
+ "rstrip": false,
182
+ "normalized": false,
183
+ "special": false
184
+ },
185
+ "151664": {
186
+ "content": "<|file_sep|>",
187
+ "single_word": false,
188
+ "lstrip": false,
189
+ "rstrip": false,
190
+ "normalized": false,
191
+ "special": false
192
+ },
193
+ "151665": {
194
+ "content": "<|PAD_TOKEN|>",
195
+ "single_word": false,
196
+ "lstrip": false,
197
+ "rstrip": false,
198
+ "normalized": false,
199
+ "special": true
200
+ }
201
+ }
202
+ }
checkpoint-100/trainer_state.json ADDED
@@ -0,0 +1,574 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 10.0,
6
+ "eval_steps": 500,
7
+ "global_step": 100,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "clip_ratio/high_max": 0.0,
14
+ "clip_ratio/high_mean": 0.0,
15
+ "clip_ratio/low_mean": 0.0,
16
+ "clip_ratio/low_min": 0.0,
17
+ "clip_ratio/region_mean": 0.0,
18
+ "completion_length": 95.85,
19
+ "completions/clipped_ratio": 0.125,
20
+ "completions/max_length": 256.0,
21
+ "completions/max_terminated_length": 195.6,
22
+ "completions/mean_length": 95.85,
23
+ "completions/mean_terminated_length": 72.8880989074707,
24
+ "completions/min_length": 36.6,
25
+ "completions/min_terminated_length": 36.6,
26
+ "epoch": 0.5,
27
+ "frac_reward_zero_std": 0.2,
28
+ "grad_norm": 0.3041034936904907,
29
+ "kl": 0.00021898818758927518,
30
+ "learning_rate": 4e-05,
31
+ "loss": 0.0329527884721756,
32
+ "num_tokens": 21444.0,
33
+ "reward": 0.7767857074737549,
34
+ "reward_std": 0.7150911569595337,
35
+ "rewards/reward_fn/mean": 0.7767857074737549,
36
+ "rewards/reward_fn/std": 0.9959433197975158,
37
+ "step": 5
38
+ },
39
+ {
40
+ "clip_ratio/high_max": 0.0,
41
+ "clip_ratio/high_mean": 0.0,
42
+ "clip_ratio/low_mean": 0.0,
43
+ "clip_ratio/low_min": 0.0,
44
+ "clip_ratio/region_mean": 0.0,
45
+ "completion_length": 95.1,
46
+ "completions/clipped_ratio": 0.0875,
47
+ "completions/max_length": 225.2,
48
+ "completions/max_terminated_length": 159.2,
49
+ "completions/mean_length": 95.1,
50
+ "completions/mean_terminated_length": 80.34466400146485,
51
+ "completions/min_length": 34.6,
52
+ "completions/min_terminated_length": 34.6,
53
+ "epoch": 1.0,
54
+ "frac_reward_zero_std": 0.4,
55
+ "grad_norm": 0.2708108425140381,
56
+ "kl": 0.006061523332027719,
57
+ "learning_rate": 9e-05,
58
+ "loss": -0.016053691506385803,
59
+ "num_tokens": 42220.0,
60
+ "reward": 0.6875000119209289,
61
+ "reward_std": 0.5175672590732574,
62
+ "rewards/reward_fn/mean": 0.6875,
63
+ "rewards/reward_fn/std": 0.8034753799438477,
64
+ "step": 10
65
+ },
66
+ {
67
+ "clip_ratio/high_max": 0.0,
68
+ "clip_ratio/high_mean": 0.0,
69
+ "clip_ratio/low_mean": 0.0,
70
+ "clip_ratio/low_min": 0.0,
71
+ "clip_ratio/region_mean": 0.0,
72
+ "completion_length": 146.275,
73
+ "completions/clipped_ratio": 0.325,
74
+ "completions/max_length": 256.0,
75
+ "completions/max_terminated_length": 225.4,
76
+ "completions/mean_length": 146.275,
77
+ "completions/mean_terminated_length": 92.8663101196289,
78
+ "completions/min_length": 32.6,
79
+ "completions/min_terminated_length": 32.6,
80
+ "epoch": 1.5,
81
+ "frac_reward_zero_std": 0.2,
82
+ "grad_norm": 0.27529817819595337,
83
+ "kl": 0.01560373855754733,
84
+ "learning_rate": 9.989068136093873e-05,
85
+ "loss": 0.010054035484790802,
86
+ "num_tokens": 69198.0,
87
+ "reward": 1.1558928728103637,
88
+ "reward_std": 0.8465379416942597,
89
+ "rewards/reward_fn/mean": 1.1558928728103637,
90
+ "rewards/reward_fn/std": 1.1404805421829223,
91
+ "step": 15
92
+ },
93
+ {
94
+ "clip_ratio/high_max": 0.0,
95
+ "clip_ratio/high_mean": 0.0,
96
+ "clip_ratio/low_mean": 0.0,
97
+ "clip_ratio/low_min": 0.0,
98
+ "clip_ratio/region_mean": 0.0,
99
+ "completion_length": 75.85,
100
+ "completions/clipped_ratio": 0.05,
101
+ "completions/max_length": 220.6,
102
+ "completions/max_terminated_length": 151.0,
103
+ "completions/mean_length": 75.85,
104
+ "completions/mean_terminated_length": 66.14857330322266,
105
+ "completions/min_length": 32.8,
106
+ "completions/min_terminated_length": 32.8,
107
+ "epoch": 2.0,
108
+ "frac_reward_zero_std": 0.35,
109
+ "grad_norm": 0.353487491607666,
110
+ "kl": 0.027986684162169696,
111
+ "learning_rate": 9.944739353007344e-05,
112
+ "loss": -0.030885905027389526,
113
+ "num_tokens": 87334.0,
114
+ "reward": 0.7932142853736878,
115
+ "reward_std": 0.4118983089923859,
116
+ "rewards/reward_fn/mean": 0.7932142853736878,
117
+ "rewards/reward_fn/std": 0.7167340397834778,
118
+ "step": 20
119
+ },
120
+ {
121
+ "clip_ratio/high_max": 0.0,
122
+ "clip_ratio/high_mean": 0.0,
123
+ "clip_ratio/low_mean": 0.0,
124
+ "clip_ratio/low_min": 0.0,
125
+ "clip_ratio/region_mean": 0.0,
126
+ "completion_length": 90.85,
127
+ "completions/clipped_ratio": 0.0625,
128
+ "completions/max_length": 248.0,
129
+ "completions/max_terminated_length": 216.4,
130
+ "completions/mean_length": 90.85,
131
+ "completions/mean_terminated_length": 79.819287109375,
132
+ "completions/min_length": 28.0,
133
+ "completions/min_terminated_length": 28.0,
134
+ "epoch": 2.5,
135
+ "frac_reward_zero_std": 0.15,
136
+ "grad_norm": 0.3318491578102112,
137
+ "kl": 0.04692814685404301,
138
+ "learning_rate": 9.86663298624003e-05,
139
+ "loss": 0.05783940553665161,
140
+ "num_tokens": 108330.0,
141
+ "reward": 1.1987500190734863,
142
+ "reward_std": 0.5907092690467834,
143
+ "rewards/reward_fn/mean": 1.1987500190734863,
144
+ "rewards/reward_fn/std": 0.9482298731803894,
145
+ "step": 25
146
+ },
147
+ {
148
+ "clip_ratio/high_max": 0.0,
149
+ "clip_ratio/high_mean": 0.0,
150
+ "clip_ratio/low_mean": 0.0,
151
+ "clip_ratio/low_min": 0.0,
152
+ "clip_ratio/region_mean": 0.0,
153
+ "completion_length": 91.775,
154
+ "completions/clipped_ratio": 0.125,
155
+ "completions/max_length": 248.2,
156
+ "completions/max_terminated_length": 212.6,
157
+ "completions/mean_length": 91.775,
158
+ "completions/mean_terminated_length": 68.68722763061524,
159
+ "completions/min_length": 25.2,
160
+ "completions/min_terminated_length": 25.2,
161
+ "epoch": 3.0,
162
+ "frac_reward_zero_std": 0.4,
163
+ "grad_norm": 0.24605490267276764,
164
+ "kl": 0.07254143096506596,
165
+ "learning_rate": 9.755282581475769e-05,
166
+ "loss": 0.03934891819953919,
167
+ "num_tokens": 129012.0,
168
+ "reward": 1.0242857217788697,
169
+ "reward_std": 0.5802445828914642,
170
+ "rewards/reward_fn/mean": 1.0242857217788697,
171
+ "rewards/reward_fn/std": 0.8706011414527893,
172
+ "step": 30
173
+ },
174
+ {
175
+ "clip_ratio/high_max": 0.0,
176
+ "clip_ratio/high_mean": 0.0,
177
+ "clip_ratio/low_mean": 0.0,
178
+ "clip_ratio/low_min": 0.0,
179
+ "clip_ratio/region_mean": 0.0,
180
+ "completion_length": 59.7875,
181
+ "completions/clipped_ratio": 0.0375,
182
+ "completions/max_length": 175.6,
183
+ "completions/max_terminated_length": 118.4,
184
+ "completions/mean_length": 59.7875,
185
+ "completions/mean_terminated_length": 52.19559555053711,
186
+ "completions/min_length": 25.4,
187
+ "completions/min_terminated_length": 25.4,
188
+ "epoch": 3.5,
189
+ "frac_reward_zero_std": 0.6,
190
+ "grad_norm": 0.2970605790615082,
191
+ "kl": 0.053099883161485194,
192
+ "learning_rate": 9.611448774886924e-05,
193
+ "loss": 0.06977529525756836,
194
+ "num_tokens": 147103.0,
195
+ "reward": 0.9473214387893677,
196
+ "reward_std": 0.3923970699310303,
197
+ "rewards/reward_fn/mean": 0.9473214268684387,
198
+ "rewards/reward_fn/std": 0.649278998374939,
199
+ "step": 35
200
+ },
201
+ {
202
+ "clip_ratio/high_max": 0.0,
203
+ "clip_ratio/high_mean": 0.0,
204
+ "clip_ratio/low_mean": 0.0,
205
+ "clip_ratio/low_min": 0.0,
206
+ "clip_ratio/region_mean": 0.0,
207
+ "completion_length": 47.85,
208
+ "completions/clipped_ratio": 0.0,
209
+ "completions/max_length": 94.8,
210
+ "completions/max_terminated_length": 94.8,
211
+ "completions/mean_length": 47.85,
212
+ "completions/mean_terminated_length": 47.85,
213
+ "completions/min_length": 31.2,
214
+ "completions/min_terminated_length": 31.2,
215
+ "epoch": 4.0,
216
+ "frac_reward_zero_std": 0.85,
217
+ "grad_norm": 0.16646049916744232,
218
+ "kl": 0.056573960930109024,
219
+ "learning_rate": 9.43611409721806e-05,
220
+ "loss": -0.0031644195318222047,
221
+ "num_tokens": 164643.0,
222
+ "reward": 0.7899999976158142,
223
+ "reward_std": 0.12681145668029786,
224
+ "rewards/reward_fn/mean": 0.7899999976158142,
225
+ "rewards/reward_fn/std": 0.5133758962154389,
226
+ "step": 40
227
+ },
228
+ {
229
+ "clip_ratio/high_max": 0.0,
230
+ "clip_ratio/high_mean": 0.0,
231
+ "clip_ratio/low_mean": 0.0,
232
+ "clip_ratio/low_min": 0.0,
233
+ "clip_ratio/region_mean": 0.0,
234
+ "completion_length": 49.2625,
235
+ "completions/clipped_ratio": 0.0,
236
+ "completions/max_length": 93.6,
237
+ "completions/max_terminated_length": 93.6,
238
+ "completions/mean_length": 49.2625,
239
+ "completions/mean_terminated_length": 49.2625,
240
+ "completions/min_length": 33.8,
241
+ "completions/min_terminated_length": 33.8,
242
+ "epoch": 4.5,
243
+ "frac_reward_zero_std": 0.6,
244
+ "grad_norm": 0.3159082531929016,
245
+ "kl": 0.0353464649990201,
246
+ "learning_rate": 9.230476262104677e-05,
247
+ "loss": -0.03417131304740906,
248
+ "num_tokens": 182568.0,
249
+ "reward": 0.7924999833106995,
250
+ "reward_std": 0.3821385264396667,
251
+ "rewards/reward_fn/mean": 0.7924999833106995,
252
+ "rewards/reward_fn/std": 0.62625293135643,
253
+ "step": 45
254
+ },
255
+ {
256
+ "clip_ratio/high_max": 0.0,
257
+ "clip_ratio/high_mean": 0.0,
258
+ "clip_ratio/low_mean": 0.0,
259
+ "clip_ratio/low_min": 0.0,
260
+ "clip_ratio/region_mean": 0.0,
261
+ "completion_length": 58.25,
262
+ "completions/clipped_ratio": 0.0125,
263
+ "completions/max_length": 180.6,
264
+ "completions/max_terminated_length": 153.2,
265
+ "completions/mean_length": 58.25,
266
+ "completions/mean_terminated_length": 55.67250061035156,
267
+ "completions/min_length": 33.2,
268
+ "completions/min_terminated_length": 33.2,
269
+ "epoch": 5.0,
270
+ "frac_reward_zero_std": 0.55,
271
+ "grad_norm": 0.27360936999320984,
272
+ "kl": 0.04483541352674365,
273
+ "learning_rate": 8.995939984474624e-05,
274
+ "loss": 0.026852786540985107,
275
+ "num_tokens": 200660.0,
276
+ "reward": 0.774821424484253,
277
+ "reward_std": 0.32772802114486693,
278
+ "rewards/reward_fn/mean": 0.774821424484253,
279
+ "rewards/reward_fn/std": 0.5432307064533234,
280
+ "step": 50
281
+ },
282
+ {
283
+ "clip_ratio/high_max": 0.0,
284
+ "clip_ratio/high_mean": 0.0,
285
+ "clip_ratio/low_mean": 0.0,
286
+ "clip_ratio/low_min": 0.0,
287
+ "clip_ratio/region_mean": 0.0,
288
+ "completion_length": 51.225,
289
+ "completions/clipped_ratio": 0.0,
290
+ "completions/max_length": 117.4,
291
+ "completions/max_terminated_length": 117.4,
292
+ "completions/mean_length": 51.225,
293
+ "completions/mean_terminated_length": 51.225,
294
+ "completions/min_length": 36.6,
295
+ "completions/min_terminated_length": 36.6,
296
+ "epoch": 5.5,
297
+ "frac_reward_zero_std": 0.6,
298
+ "grad_norm": 0.21470840275287628,
299
+ "kl": 0.04195364024490118,
300
+ "learning_rate": 8.73410738492077e-05,
301
+ "loss": 0.023307889699935913,
302
+ "num_tokens": 218442.0,
303
+ "reward": 1.0364285826683044,
304
+ "reward_std": 0.37462076991796495,
305
+ "rewards/reward_fn/mean": 1.0364285826683044,
306
+ "rewards/reward_fn/std": 0.6156843721866607,
307
+ "step": 55
308
+ },
309
+ {
310
+ "clip_ratio/high_max": 0.0,
311
+ "clip_ratio/high_mean": 0.0,
312
+ "clip_ratio/low_mean": 0.0,
313
+ "clip_ratio/low_min": 0.0,
314
+ "clip_ratio/region_mean": 0.0,
315
+ "completion_length": 53.225,
316
+ "completions/clipped_ratio": 0.0125,
317
+ "completions/max_length": 159.8,
318
+ "completions/max_terminated_length": 123.8,
319
+ "completions/mean_length": 53.225,
320
+ "completions/mean_terminated_length": 50.68583374023437,
321
+ "completions/min_length": 33.8,
322
+ "completions/min_terminated_length": 33.8,
323
+ "epoch": 6.0,
324
+ "frac_reward_zero_std": 0.45,
325
+ "grad_norm": 0.29306381940841675,
326
+ "kl": 0.05784560404717922,
327
+ "learning_rate": 8.44676704559283e-05,
328
+ "loss": 0.015561078488826752,
329
+ "num_tokens": 236112.0,
330
+ "reward": 1.1410714387893677,
331
+ "reward_std": 0.3540264666080475,
332
+ "rewards/reward_fn/mean": 1.1410714387893677,
333
+ "rewards/reward_fn/std": 0.607547128200531,
334
+ "step": 60
335
+ },
336
+ {
337
+ "clip_ratio/high_max": 0.0,
338
+ "clip_ratio/high_mean": 0.0,
339
+ "clip_ratio/low_mean": 0.0,
340
+ "clip_ratio/low_min": 0.0,
341
+ "clip_ratio/region_mean": 0.0,
342
+ "completion_length": 62.2625,
343
+ "completions/clipped_ratio": 0.05,
344
+ "completions/max_length": 153.2,
345
+ "completions/max_terminated_length": 117.4,
346
+ "completions/mean_length": 62.2625,
347
+ "completions/mean_terminated_length": 52.46428680419922,
348
+ "completions/min_length": 30.6,
349
+ "completions/min_terminated_length": 30.6,
350
+ "epoch": 6.5,
351
+ "frac_reward_zero_std": 0.65,
352
+ "grad_norm": 0.2615426480770111,
353
+ "kl": 0.09820330440998078,
354
+ "learning_rate": 8.135881792367686e-05,
355
+ "loss": 0.04314278066158295,
356
+ "num_tokens": 254033.0,
357
+ "reward": 1.2110714197158814,
358
+ "reward_std": 0.2716316431760788,
359
+ "rewards/reward_fn/mean": 1.2110714197158814,
360
+ "rewards/reward_fn/std": 0.6701113641262054,
361
+ "step": 65
362
+ },
363
+ {
364
+ "clip_ratio/high_max": 0.0,
365
+ "clip_ratio/high_mean": 0.0,
366
+ "clip_ratio/low_mean": 0.0,
367
+ "clip_ratio/low_min": 0.0,
368
+ "clip_ratio/region_mean": 0.0,
369
+ "completion_length": 82.5625,
370
+ "completions/clipped_ratio": 0.1125,
371
+ "completions/max_length": 234.4,
372
+ "completions/max_terminated_length": 147.4,
373
+ "completions/mean_length": 82.5625,
374
+ "completions/mean_terminated_length": 60.485835266113284,
375
+ "completions/min_length": 35.4,
376
+ "completions/min_terminated_length": 35.4,
377
+ "epoch": 7.0,
378
+ "frac_reward_zero_std": 0.6,
379
+ "grad_norm": 0.1326226145029068,
380
+ "kl": 0.07194480895996094,
381
+ "learning_rate": 7.803575286758364e-05,
382
+ "loss": 0.03152734041213989,
383
+ "num_tokens": 275222.0,
384
+ "reward": 1.320892834663391,
385
+ "reward_std": 0.35234126150608064,
386
+ "rewards/reward_fn/mean": 1.320892858505249,
387
+ "rewards/reward_fn/std": 0.676600456237793,
388
+ "step": 70
389
+ },
390
+ {
391
+ "clip_ratio/high_max": 0.0,
392
+ "clip_ratio/high_mean": 0.0,
393
+ "clip_ratio/low_mean": 0.0,
394
+ "clip_ratio/low_min": 0.0,
395
+ "clip_ratio/region_mean": 0.0,
396
+ "completion_length": 72.7625,
397
+ "completions/clipped_ratio": 0.0625,
398
+ "completions/max_length": 220.4,
399
+ "completions/max_terminated_length": 166.8,
400
+ "completions/mean_length": 72.7625,
401
+ "completions/mean_terminated_length": 59.95151138305664,
402
+ "completions/min_length": 30.2,
403
+ "completions/min_terminated_length": 30.2,
404
+ "epoch": 7.5,
405
+ "frac_reward_zero_std": 0.7,
406
+ "grad_norm": 0.15138906240463257,
407
+ "kl": 0.09165919721126556,
408
+ "learning_rate": 7.452117519152542e-05,
409
+ "loss": 0.0013809207826852798,
410
+ "num_tokens": 294407.0,
411
+ "reward": 1.3473214387893677,
412
+ "reward_std": 0.17192934565246104,
413
+ "rewards/reward_fn/mean": 1.3473214387893677,
414
+ "rewards/reward_fn/std": 0.6935437321662903,
415
+ "step": 75
416
+ },
417
+ {
418
+ "clip_ratio/high_max": 0.0,
419
+ "clip_ratio/high_mean": 0.0,
420
+ "clip_ratio/low_mean": 0.0,
421
+ "clip_ratio/low_min": 0.0,
422
+ "clip_ratio/region_mean": 0.0,
423
+ "completion_length": 80.9375,
424
+ "completions/clipped_ratio": 0.125,
425
+ "completions/max_length": 222.4,
426
+ "completions/max_terminated_length": 156.0,
427
+ "completions/mean_length": 80.9375,
428
+ "completions/mean_terminated_length": 55.99818344116211,
429
+ "completions/min_length": 31.6,
430
+ "completions/min_terminated_length": 31.6,
431
+ "epoch": 8.0,
432
+ "frac_reward_zero_std": 0.65,
433
+ "grad_norm": 0.19589942693710327,
434
+ "kl": 0.0683181069791317,
435
+ "learning_rate": 7.083909302476453e-05,
436
+ "loss": 0.033023273944854735,
437
+ "num_tokens": 314502.0,
438
+ "reward": 1.1685714244842529,
439
+ "reward_std": 0.27699449211359023,
440
+ "rewards/reward_fn/mean": 1.1685714244842529,
441
+ "rewards/reward_fn/std": 0.5244716942310333,
442
+ "step": 80
443
+ },
444
+ {
445
+ "clip_ratio/high_max": 0.0,
446
+ "clip_ratio/high_mean": 0.0,
447
+ "clip_ratio/low_mean": 0.0,
448
+ "clip_ratio/low_min": 0.0,
449
+ "clip_ratio/region_mean": 0.0,
450
+ "completion_length": 60.6625,
451
+ "completions/clipped_ratio": 0.05,
452
+ "completions/max_length": 207.2,
453
+ "completions/max_terminated_length": 100.0,
454
+ "completions/mean_length": 60.6625,
455
+ "completions/mean_terminated_length": 50.27571563720703,
456
+ "completions/min_length": 35.2,
457
+ "completions/min_terminated_length": 35.2,
458
+ "epoch": 8.5,
459
+ "frac_reward_zero_std": 0.6,
460
+ "grad_norm": 0.20066438615322113,
461
+ "kl": 0.0837025498971343,
462
+ "learning_rate": 6.701465872208216e-05,
463
+ "loss": 0.022454433143138885,
464
+ "num_tokens": 334007.0,
465
+ "reward": 1.3910714387893677,
466
+ "reward_std": 0.21785714849829674,
467
+ "rewards/reward_fn/mean": 1.3910714387893677,
468
+ "rewards/reward_fn/std": 0.6235540926456451,
469
+ "step": 85
470
+ },
471
+ {
472
+ "clip_ratio/high_max": 0.0,
473
+ "clip_ratio/high_mean": 0.0,
474
+ "clip_ratio/low_mean": 0.0,
475
+ "clip_ratio/low_min": 0.0,
476
+ "clip_ratio/region_mean": 0.0,
477
+ "completion_length": 54.025,
478
+ "completions/clipped_ratio": 0.025,
479
+ "completions/max_length": 143.4,
480
+ "completions/max_terminated_length": 117.2,
481
+ "completions/mean_length": 54.025,
482
+ "completions/mean_terminated_length": 49.025001525878906,
483
+ "completions/min_length": 30.2,
484
+ "completions/min_terminated_length": 30.2,
485
+ "epoch": 9.0,
486
+ "frac_reward_zero_std": 0.8,
487
+ "grad_norm": 0.18161360919475555,
488
+ "kl": 0.05805329587310552,
489
+ "learning_rate": 6.307399704769099e-05,
490
+ "loss": -0.04096371531486511,
491
+ "num_tokens": 351041.0,
492
+ "reward": 1.24375,
493
+ "reward_std": 0.2125,
494
+ "rewards/reward_fn/mean": 1.24375,
495
+ "rewards/reward_fn/std": 0.584508627653122,
496
+ "step": 90
497
+ },
498
+ {
499
+ "clip_ratio/high_max": 0.0,
500
+ "clip_ratio/high_mean": 0.0,
501
+ "clip_ratio/low_mean": 0.0,
502
+ "clip_ratio/low_min": 0.0,
503
+ "clip_ratio/region_mean": 0.0,
504
+ "completion_length": 54.65,
505
+ "completions/clipped_ratio": 0.0125,
506
+ "completions/max_length": 145.0,
507
+ "completions/max_terminated_length": 142.2,
508
+ "completions/mean_length": 54.65,
509
+ "completions/mean_terminated_length": 52.23083419799805,
510
+ "completions/min_length": 35.4,
511
+ "completions/min_terminated_length": 35.4,
512
+ "epoch": 9.5,
513
+ "frac_reward_zero_std": 0.9,
514
+ "grad_norm": 0.10424701869487762,
515
+ "kl": 0.05452606473118067,
516
+ "learning_rate": 5.90440267166055e-05,
517
+ "loss": 0.0021311525255441667,
518
+ "num_tokens": 368361.0,
519
+ "reward": 1.1996428489685058,
520
+ "reward_std": 0.058449314022436735,
521
+ "rewards/reward_fn/mean": 1.1996428489685058,
522
+ "rewards/reward_fn/std": 0.4604207634925842,
523
+ "step": 95
524
+ },
525
+ {
526
+ "clip_ratio/high_max": 0.0,
527
+ "clip_ratio/high_mean": 0.0,
528
+ "clip_ratio/low_mean": 0.0,
529
+ "clip_ratio/low_min": 0.0,
530
+ "clip_ratio/region_mean": 0.0,
531
+ "completion_length": 68.3875,
532
+ "completions/clipped_ratio": 0.0625,
533
+ "completions/max_length": 254.4,
534
+ "completions/max_terminated_length": 140.8,
535
+ "completions/mean_length": 68.3875,
536
+ "completions/mean_terminated_length": 55.740359497070315,
537
+ "completions/min_length": 36.2,
538
+ "completions/min_terminated_length": 36.2,
539
+ "epoch": 10.0,
540
+ "frac_reward_zero_std": 0.75,
541
+ "grad_norm": 0.13536690175533295,
542
+ "kl": 0.0854075826704502,
543
+ "learning_rate": 5.495227651252315e-05,
544
+ "loss": -0.025432443618774413,
545
+ "num_tokens": 388400.0,
546
+ "reward": 1.3283928632736206,
547
+ "reward_std": 0.2009493112564087,
548
+ "rewards/reward_fn/mean": 1.3283928632736206,
549
+ "rewards/reward_fn/std": 0.5586225628852844,
550
+ "step": 100
551
+ }
552
+ ],
553
+ "logging_steps": 5,
554
+ "max_steps": 200,
555
+ "num_input_tokens_seen": 388400,
556
+ "num_train_epochs": 20,
557
+ "save_steps": 50,
558
+ "stateful_callbacks": {
559
+ "TrainerControl": {
560
+ "args": {
561
+ "should_epoch_stop": false,
562
+ "should_evaluate": false,
563
+ "should_log": false,
564
+ "should_save": true,
565
+ "should_training_stop": false
566
+ },
567
+ "attributes": {}
568
+ }
569
+ },
570
+ "total_flos": 0.0,
571
+ "train_batch_size": 8,
572
+ "trial_name": null,
573
+ "trial_params": null
574
+ }
checkpoint-100/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61516d246785bb4adf93d26e6fe8ffbb65a924b8a03f13b01e1d00b7066140af
3
+ size 6673
checkpoint-150/README.md ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: unsloth/Qwen2.5-Coder-0.5B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:unsloth/Qwen2.5-Coder-0.5B-Instruct
7
+ - grpo
8
+ - lora
9
+ - transformers
10
+ - trl
11
+ - unsloth
12
+ ---
13
+
14
+ # Model Card for Model ID
15
+
16
+ <!-- Provide a quick summary of what the model is/does. -->
17
+
18
+
19
+
20
+ ## Model Details
21
+
22
+ ### Model Description
23
+
24
+ <!-- Provide a longer summary of what this model is. -->
25
+
26
+
27
+
28
+ - **Developed by:** [More Information Needed]
29
+ - **Funded by [optional]:** [More Information Needed]
30
+ - **Shared by [optional]:** [More Information Needed]
31
+ - **Model type:** [More Information Needed]
32
+ - **Language(s) (NLP):** [More Information Needed]
33
+ - **License:** [More Information Needed]
34
+ - **Finetuned from model [optional]:** [More Information Needed]
35
+
36
+ ### Model Sources [optional]
37
+
38
+ <!-- Provide the basic links for the model. -->
39
+
40
+ - **Repository:** [More Information Needed]
41
+ - **Paper [optional]:** [More Information Needed]
42
+ - **Demo [optional]:** [More Information Needed]
43
+
44
+ ## Uses
45
+
46
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
47
+
48
+ ### Direct Use
49
+
50
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
51
+
52
+ [More Information Needed]
53
+
54
+ ### Downstream Use [optional]
55
+
56
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
57
+
58
+ [More Information Needed]
59
+
60
+ ### Out-of-Scope Use
61
+
62
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
63
+
64
+ [More Information Needed]
65
+
66
+ ## Bias, Risks, and Limitations
67
+
68
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
69
+
70
+ [More Information Needed]
71
+
72
+ ### Recommendations
73
+
74
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
75
+
76
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
77
+
78
+ ## How to Get Started with the Model
79
+
80
+ Use the code below to get started with the model.
81
+
82
+ [More Information Needed]
83
+
84
+ ## Training Details
85
+
86
+ ### Training Data
87
+
88
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
89
+
90
+ [More Information Needed]
91
+
92
+ ### Training Procedure
93
+
94
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
95
+
96
+ #### Preprocessing [optional]
97
+
98
+ [More Information Needed]
99
+
100
+
101
+ #### Training Hyperparameters
102
+
103
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
104
+
105
+ #### Speeds, Sizes, Times [optional]
106
+
107
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
108
+
109
+ [More Information Needed]
110
+
111
+ ## Evaluation
112
+
113
+ <!-- This section describes the evaluation protocols and provides the results. -->
114
+
115
+ ### Testing Data, Factors & Metrics
116
+
117
+ #### Testing Data
118
+
119
+ <!-- This should link to a Dataset Card if possible. -->
120
+
121
+ [More Information Needed]
122
+
123
+ #### Factors
124
+
125
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
126
+
127
+ [More Information Needed]
128
+
129
+ #### Metrics
130
+
131
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
132
+
133
+ [More Information Needed]
134
+
135
+ ### Results
136
+
137
+ [More Information Needed]
138
+
139
+ #### Summary
140
+
141
+
142
+
143
+ ## Model Examination [optional]
144
+
145
+ <!-- Relevant interpretability work for the model goes here -->
146
+
147
+ [More Information Needed]
148
+
149
+ ## Environmental Impact
150
+
151
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
152
+
153
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
154
+
155
+ - **Hardware Type:** [More Information Needed]
156
+ - **Hours used:** [More Information Needed]
157
+ - **Cloud Provider:** [More Information Needed]
158
+ - **Compute Region:** [More Information Needed]
159
+ - **Carbon Emitted:** [More Information Needed]
160
+
161
+ ## Technical Specifications [optional]
162
+
163
+ ### Model Architecture and Objective
164
+
165
+ [More Information Needed]
166
+
167
+ ### Compute Infrastructure
168
+
169
+ [More Information Needed]
170
+
171
+ #### Hardware
172
+
173
+ [More Information Needed]
174
+
175
+ #### Software
176
+
177
+ [More Information Needed]
178
+
179
+ ## Citation [optional]
180
+
181
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
182
+
183
+ **BibTeX:**
184
+
185
+ [More Information Needed]
186
+
187
+ **APA:**
188
+
189
+ [More Information Needed]
190
+
191
+ ## Glossary [optional]
192
+
193
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
194
+
195
+ [More Information Needed]
196
+
197
+ ## More Information [optional]
198
+
199
+ [More Information Needed]
200
+
201
+ ## Model Card Authors [optional]
202
+
203
+ [More Information Needed]
204
+
205
+ ## Model Card Contact
206
+
207
+ [More Information Needed]
208
+ ### Framework versions
209
+
210
+ - PEFT 0.18.1
checkpoint-150/adapter_config.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": {
6
+ "base_model_class": "Qwen2ForCausalLM",
7
+ "parent_library": "transformers.models.qwen2.modeling_qwen2",
8
+ "unsloth_fixed": true
9
+ },
10
+ "base_model_name_or_path": "unsloth/Qwen2.5-Coder-0.5B-Instruct",
11
+ "bias": "none",
12
+ "corda_config": null,
13
+ "ensure_weight_tying": false,
14
+ "eva_config": null,
15
+ "exclude_modules": null,
16
+ "fan_in_fan_out": false,
17
+ "inference_mode": true,
18
+ "init_lora_weights": true,
19
+ "layer_replication": null,
20
+ "layers_pattern": null,
21
+ "layers_to_transform": null,
22
+ "loftq_config": {},
23
+ "lora_alpha": 32,
24
+ "lora_bias": false,
25
+ "lora_dropout": 0,
26
+ "megatron_config": null,
27
+ "megatron_core": "megatron.core",
28
+ "modules_to_save": null,
29
+ "peft_type": "LORA",
30
+ "peft_version": "0.18.1",
31
+ "qalora_group_size": 16,
32
+ "r": 32,
33
+ "rank_pattern": {},
34
+ "revision": null,
35
+ "target_modules": [
36
+ "v_proj",
37
+ "k_proj",
38
+ "o_proj",
39
+ "up_proj",
40
+ "gate_proj",
41
+ "q_proj",
42
+ "down_proj"
43
+ ],
44
+ "target_parameters": null,
45
+ "task_type": "CAUSAL_LM",
46
+ "trainable_token_indices": null,
47
+ "use_dora": false,
48
+ "use_qalora": false,
49
+ "use_rslora": false
50
+ }
checkpoint-150/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9b3cf15fb2500f3f8f014f9739f1a21141fd5298ffcb0a0d5fe65deb718e615
3
+ size 70430032
checkpoint-150/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoint-150/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c37453fd3d8af6afa30b40cc9c6ad2d969625ff6e3a19f3a44fedb9dfcaeb1a0
3
+ size 36139685
checkpoint-150/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b68c29a103ea734e571c411420518c24591a5fb6bc226dc1650f76fe8e3921c1
3
+ size 14645
checkpoint-150/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7a646a858b71ab76099f4dca96d8db39ffde2f88258f5a4b082ff62947bcc11
3
+ size 1383
checkpoint-150/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3482f78b8a680a7668ce725c32ed61ffffbbaae8b7bc5042b2c9c5517679d93c
3
+ size 1465
checkpoint-150/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af5891a15588546db1ac7f2baf8fa94835a51a85c032c39793a55bb048b47446
3
+ size 11422523
checkpoint-150/tokenizer_config.json ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [],
9
+ "is_local": false,
10
+ "model_max_length": 32768,
11
+ "pad_token": "<|PAD_TOKEN|>",
12
+ "padding_side": "right",
13
+ "split_special_tokens": false,
14
+ "tokenizer_class": "Qwen2Tokenizer",
15
+ "unk_token": null,
16
+ "added_tokens_decoder": {
17
+ "151643": {
18
+ "content": "<|endoftext|>",
19
+ "single_word": false,
20
+ "lstrip": false,
21
+ "rstrip": false,
22
+ "normalized": false,
23
+ "special": true
24
+ },
25
+ "151644": {
26
+ "content": "<|im_start|>",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ "151645": {
34
+ "content": "<|im_end|>",
35
+ "single_word": false,
36
+ "lstrip": false,
37
+ "rstrip": false,
38
+ "normalized": false,
39
+ "special": true
40
+ },
41
+ "151646": {
42
+ "content": "<|object_ref_start|>",
43
+ "single_word": false,
44
+ "lstrip": false,
45
+ "rstrip": false,
46
+ "normalized": false,
47
+ "special": true
48
+ },
49
+ "151647": {
50
+ "content": "<|object_ref_end|>",
51
+ "single_word": false,
52
+ "lstrip": false,
53
+ "rstrip": false,
54
+ "normalized": false,
55
+ "special": true
56
+ },
57
+ "151648": {
58
+ "content": "<|box_start|>",
59
+ "single_word": false,
60
+ "lstrip": false,
61
+ "rstrip": false,
62
+ "normalized": false,
63
+ "special": true
64
+ },
65
+ "151649": {
66
+ "content": "<|box_end|>",
67
+ "single_word": false,
68
+ "lstrip": false,
69
+ "rstrip": false,
70
+ "normalized": false,
71
+ "special": true
72
+ },
73
+ "151650": {
74
+ "content": "<|quad_start|>",
75
+ "single_word": false,
76
+ "lstrip": false,
77
+ "rstrip": false,
78
+ "normalized": false,
79
+ "special": true
80
+ },
81
+ "151651": {
82
+ "content": "<|quad_end|>",
83
+ "single_word": false,
84
+ "lstrip": false,
85
+ "rstrip": false,
86
+ "normalized": false,
87
+ "special": true
88
+ },
89
+ "151652": {
90
+ "content": "<|vision_start|>",
91
+ "single_word": false,
92
+ "lstrip": false,
93
+ "rstrip": false,
94
+ "normalized": false,
95
+ "special": true
96
+ },
97
+ "151653": {
98
+ "content": "<|vision_end|>",
99
+ "single_word": false,
100
+ "lstrip": false,
101
+ "rstrip": false,
102
+ "normalized": false,
103
+ "special": true
104
+ },
105
+ "151654": {
106
+ "content": "<|vision_pad|>",
107
+ "single_word": false,
108
+ "lstrip": false,
109
+ "rstrip": false,
110
+ "normalized": false,
111
+ "special": true
112
+ },
113
+ "151655": {
114
+ "content": "<|image_pad|>",
115
+ "single_word": false,
116
+ "lstrip": false,
117
+ "rstrip": false,
118
+ "normalized": false,
119
+ "special": true
120
+ },
121
+ "151656": {
122
+ "content": "<|video_pad|>",
123
+ "single_word": false,
124
+ "lstrip": false,
125
+ "rstrip": false,
126
+ "normalized": false,
127
+ "special": true
128
+ },
129
+ "151657": {
130
+ "content": "<tool_call>",
131
+ "single_word": false,
132
+ "lstrip": false,
133
+ "rstrip": false,
134
+ "normalized": false,
135
+ "special": false
136
+ },
137
+ "151658": {
138
+ "content": "</tool_call>",
139
+ "single_word": false,
140
+ "lstrip": false,
141
+ "rstrip": false,
142
+ "normalized": false,
143
+ "special": false
144
+ },
145
+ "151659": {
146
+ "content": "<|fim_prefix|>",
147
+ "single_word": false,
148
+ "lstrip": false,
149
+ "rstrip": false,
150
+ "normalized": false,
151
+ "special": false
152
+ },
153
+ "151660": {
154
+ "content": "<|fim_middle|>",
155
+ "single_word": false,
156
+ "lstrip": false,
157
+ "rstrip": false,
158
+ "normalized": false,
159
+ "special": false
160
+ },
161
+ "151661": {
162
+ "content": "<|fim_suffix|>",
163
+ "single_word": false,
164
+ "lstrip": false,
165
+ "rstrip": false,
166
+ "normalized": false,
167
+ "special": false
168
+ },
169
+ "151662": {
170
+ "content": "<|fim_pad|>",
171
+ "single_word": false,
172
+ "lstrip": false,
173
+ "rstrip": false,
174
+ "normalized": false,
175
+ "special": false
176
+ },
177
+ "151663": {
178
+ "content": "<|repo_name|>",
179
+ "single_word": false,
180
+ "lstrip": false,
181
+ "rstrip": false,
182
+ "normalized": false,
183
+ "special": false
184
+ },
185
+ "151664": {
186
+ "content": "<|file_sep|>",
187
+ "single_word": false,
188
+ "lstrip": false,
189
+ "rstrip": false,
190
+ "normalized": false,
191
+ "special": false
192
+ },
193
+ "151665": {
194
+ "content": "<|PAD_TOKEN|>",
195
+ "single_word": false,
196
+ "lstrip": false,
197
+ "rstrip": false,
198
+ "normalized": false,
199
+ "special": true
200
+ }
201
+ }
202
+ }
checkpoint-150/trainer_state.json ADDED
@@ -0,0 +1,844 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 15.0,
6
+ "eval_steps": 500,
7
+ "global_step": 150,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "clip_ratio/high_max": 0.0,
14
+ "clip_ratio/high_mean": 0.0,
15
+ "clip_ratio/low_mean": 0.0,
16
+ "clip_ratio/low_min": 0.0,
17
+ "clip_ratio/region_mean": 0.0,
18
+ "completion_length": 95.85,
19
+ "completions/clipped_ratio": 0.125,
20
+ "completions/max_length": 256.0,
21
+ "completions/max_terminated_length": 195.6,
22
+ "completions/mean_length": 95.85,
23
+ "completions/mean_terminated_length": 72.8880989074707,
24
+ "completions/min_length": 36.6,
25
+ "completions/min_terminated_length": 36.6,
26
+ "epoch": 0.5,
27
+ "frac_reward_zero_std": 0.2,
28
+ "grad_norm": 0.3041034936904907,
29
+ "kl": 0.00021898818758927518,
30
+ "learning_rate": 4e-05,
31
+ "loss": 0.0329527884721756,
32
+ "num_tokens": 21444.0,
33
+ "reward": 0.7767857074737549,
34
+ "reward_std": 0.7150911569595337,
35
+ "rewards/reward_fn/mean": 0.7767857074737549,
36
+ "rewards/reward_fn/std": 0.9959433197975158,
37
+ "step": 5
38
+ },
39
+ {
40
+ "clip_ratio/high_max": 0.0,
41
+ "clip_ratio/high_mean": 0.0,
42
+ "clip_ratio/low_mean": 0.0,
43
+ "clip_ratio/low_min": 0.0,
44
+ "clip_ratio/region_mean": 0.0,
45
+ "completion_length": 95.1,
46
+ "completions/clipped_ratio": 0.0875,
47
+ "completions/max_length": 225.2,
48
+ "completions/max_terminated_length": 159.2,
49
+ "completions/mean_length": 95.1,
50
+ "completions/mean_terminated_length": 80.34466400146485,
51
+ "completions/min_length": 34.6,
52
+ "completions/min_terminated_length": 34.6,
53
+ "epoch": 1.0,
54
+ "frac_reward_zero_std": 0.4,
55
+ "grad_norm": 0.2708108425140381,
56
+ "kl": 0.006061523332027719,
57
+ "learning_rate": 9e-05,
58
+ "loss": -0.016053691506385803,
59
+ "num_tokens": 42220.0,
60
+ "reward": 0.6875000119209289,
61
+ "reward_std": 0.5175672590732574,
62
+ "rewards/reward_fn/mean": 0.6875,
63
+ "rewards/reward_fn/std": 0.8034753799438477,
64
+ "step": 10
65
+ },
66
+ {
67
+ "clip_ratio/high_max": 0.0,
68
+ "clip_ratio/high_mean": 0.0,
69
+ "clip_ratio/low_mean": 0.0,
70
+ "clip_ratio/low_min": 0.0,
71
+ "clip_ratio/region_mean": 0.0,
72
+ "completion_length": 146.275,
73
+ "completions/clipped_ratio": 0.325,
74
+ "completions/max_length": 256.0,
75
+ "completions/max_terminated_length": 225.4,
76
+ "completions/mean_length": 146.275,
77
+ "completions/mean_terminated_length": 92.8663101196289,
78
+ "completions/min_length": 32.6,
79
+ "completions/min_terminated_length": 32.6,
80
+ "epoch": 1.5,
81
+ "frac_reward_zero_std": 0.2,
82
+ "grad_norm": 0.27529817819595337,
83
+ "kl": 0.01560373855754733,
84
+ "learning_rate": 9.989068136093873e-05,
85
+ "loss": 0.010054035484790802,
86
+ "num_tokens": 69198.0,
87
+ "reward": 1.1558928728103637,
88
+ "reward_std": 0.8465379416942597,
89
+ "rewards/reward_fn/mean": 1.1558928728103637,
90
+ "rewards/reward_fn/std": 1.1404805421829223,
91
+ "step": 15
92
+ },
93
+ {
94
+ "clip_ratio/high_max": 0.0,
95
+ "clip_ratio/high_mean": 0.0,
96
+ "clip_ratio/low_mean": 0.0,
97
+ "clip_ratio/low_min": 0.0,
98
+ "clip_ratio/region_mean": 0.0,
99
+ "completion_length": 75.85,
100
+ "completions/clipped_ratio": 0.05,
101
+ "completions/max_length": 220.6,
102
+ "completions/max_terminated_length": 151.0,
103
+ "completions/mean_length": 75.85,
104
+ "completions/mean_terminated_length": 66.14857330322266,
105
+ "completions/min_length": 32.8,
106
+ "completions/min_terminated_length": 32.8,
107
+ "epoch": 2.0,
108
+ "frac_reward_zero_std": 0.35,
109
+ "grad_norm": 0.353487491607666,
110
+ "kl": 0.027986684162169696,
111
+ "learning_rate": 9.944739353007344e-05,
112
+ "loss": -0.030885905027389526,
113
+ "num_tokens": 87334.0,
114
+ "reward": 0.7932142853736878,
115
+ "reward_std": 0.4118983089923859,
116
+ "rewards/reward_fn/mean": 0.7932142853736878,
117
+ "rewards/reward_fn/std": 0.7167340397834778,
118
+ "step": 20
119
+ },
120
+ {
121
+ "clip_ratio/high_max": 0.0,
122
+ "clip_ratio/high_mean": 0.0,
123
+ "clip_ratio/low_mean": 0.0,
124
+ "clip_ratio/low_min": 0.0,
125
+ "clip_ratio/region_mean": 0.0,
126
+ "completion_length": 90.85,
127
+ "completions/clipped_ratio": 0.0625,
128
+ "completions/max_length": 248.0,
129
+ "completions/max_terminated_length": 216.4,
130
+ "completions/mean_length": 90.85,
131
+ "completions/mean_terminated_length": 79.819287109375,
132
+ "completions/min_length": 28.0,
133
+ "completions/min_terminated_length": 28.0,
134
+ "epoch": 2.5,
135
+ "frac_reward_zero_std": 0.15,
136
+ "grad_norm": 0.3318491578102112,
137
+ "kl": 0.04692814685404301,
138
+ "learning_rate": 9.86663298624003e-05,
139
+ "loss": 0.05783940553665161,
140
+ "num_tokens": 108330.0,
141
+ "reward": 1.1987500190734863,
142
+ "reward_std": 0.5907092690467834,
143
+ "rewards/reward_fn/mean": 1.1987500190734863,
144
+ "rewards/reward_fn/std": 0.9482298731803894,
145
+ "step": 25
146
+ },
147
+ {
148
+ "clip_ratio/high_max": 0.0,
149
+ "clip_ratio/high_mean": 0.0,
150
+ "clip_ratio/low_mean": 0.0,
151
+ "clip_ratio/low_min": 0.0,
152
+ "clip_ratio/region_mean": 0.0,
153
+ "completion_length": 91.775,
154
+ "completions/clipped_ratio": 0.125,
155
+ "completions/max_length": 248.2,
156
+ "completions/max_terminated_length": 212.6,
157
+ "completions/mean_length": 91.775,
158
+ "completions/mean_terminated_length": 68.68722763061524,
159
+ "completions/min_length": 25.2,
160
+ "completions/min_terminated_length": 25.2,
161
+ "epoch": 3.0,
162
+ "frac_reward_zero_std": 0.4,
163
+ "grad_norm": 0.24605490267276764,
164
+ "kl": 0.07254143096506596,
165
+ "learning_rate": 9.755282581475769e-05,
166
+ "loss": 0.03934891819953919,
167
+ "num_tokens": 129012.0,
168
+ "reward": 1.0242857217788697,
169
+ "reward_std": 0.5802445828914642,
170
+ "rewards/reward_fn/mean": 1.0242857217788697,
171
+ "rewards/reward_fn/std": 0.8706011414527893,
172
+ "step": 30
173
+ },
174
+ {
175
+ "clip_ratio/high_max": 0.0,
176
+ "clip_ratio/high_mean": 0.0,
177
+ "clip_ratio/low_mean": 0.0,
178
+ "clip_ratio/low_min": 0.0,
179
+ "clip_ratio/region_mean": 0.0,
180
+ "completion_length": 59.7875,
181
+ "completions/clipped_ratio": 0.0375,
182
+ "completions/max_length": 175.6,
183
+ "completions/max_terminated_length": 118.4,
184
+ "completions/mean_length": 59.7875,
185
+ "completions/mean_terminated_length": 52.19559555053711,
186
+ "completions/min_length": 25.4,
187
+ "completions/min_terminated_length": 25.4,
188
+ "epoch": 3.5,
189
+ "frac_reward_zero_std": 0.6,
190
+ "grad_norm": 0.2970605790615082,
191
+ "kl": 0.053099883161485194,
192
+ "learning_rate": 9.611448774886924e-05,
193
+ "loss": 0.06977529525756836,
194
+ "num_tokens": 147103.0,
195
+ "reward": 0.9473214387893677,
196
+ "reward_std": 0.3923970699310303,
197
+ "rewards/reward_fn/mean": 0.9473214268684387,
198
+ "rewards/reward_fn/std": 0.649278998374939,
199
+ "step": 35
200
+ },
201
+ {
202
+ "clip_ratio/high_max": 0.0,
203
+ "clip_ratio/high_mean": 0.0,
204
+ "clip_ratio/low_mean": 0.0,
205
+ "clip_ratio/low_min": 0.0,
206
+ "clip_ratio/region_mean": 0.0,
207
+ "completion_length": 47.85,
208
+ "completions/clipped_ratio": 0.0,
209
+ "completions/max_length": 94.8,
210
+ "completions/max_terminated_length": 94.8,
211
+ "completions/mean_length": 47.85,
212
+ "completions/mean_terminated_length": 47.85,
213
+ "completions/min_length": 31.2,
214
+ "completions/min_terminated_length": 31.2,
215
+ "epoch": 4.0,
216
+ "frac_reward_zero_std": 0.85,
217
+ "grad_norm": 0.16646049916744232,
218
+ "kl": 0.056573960930109024,
219
+ "learning_rate": 9.43611409721806e-05,
220
+ "loss": -0.0031644195318222047,
221
+ "num_tokens": 164643.0,
222
+ "reward": 0.7899999976158142,
223
+ "reward_std": 0.12681145668029786,
224
+ "rewards/reward_fn/mean": 0.7899999976158142,
225
+ "rewards/reward_fn/std": 0.5133758962154389,
226
+ "step": 40
227
+ },
228
+ {
229
+ "clip_ratio/high_max": 0.0,
230
+ "clip_ratio/high_mean": 0.0,
231
+ "clip_ratio/low_mean": 0.0,
232
+ "clip_ratio/low_min": 0.0,
233
+ "clip_ratio/region_mean": 0.0,
234
+ "completion_length": 49.2625,
235
+ "completions/clipped_ratio": 0.0,
236
+ "completions/max_length": 93.6,
237
+ "completions/max_terminated_length": 93.6,
238
+ "completions/mean_length": 49.2625,
239
+ "completions/mean_terminated_length": 49.2625,
240
+ "completions/min_length": 33.8,
241
+ "completions/min_terminated_length": 33.8,
242
+ "epoch": 4.5,
243
+ "frac_reward_zero_std": 0.6,
244
+ "grad_norm": 0.3159082531929016,
245
+ "kl": 0.0353464649990201,
246
+ "learning_rate": 9.230476262104677e-05,
247
+ "loss": -0.03417131304740906,
248
+ "num_tokens": 182568.0,
249
+ "reward": 0.7924999833106995,
250
+ "reward_std": 0.3821385264396667,
251
+ "rewards/reward_fn/mean": 0.7924999833106995,
252
+ "rewards/reward_fn/std": 0.62625293135643,
253
+ "step": 45
254
+ },
255
+ {
256
+ "clip_ratio/high_max": 0.0,
257
+ "clip_ratio/high_mean": 0.0,
258
+ "clip_ratio/low_mean": 0.0,
259
+ "clip_ratio/low_min": 0.0,
260
+ "clip_ratio/region_mean": 0.0,
261
+ "completion_length": 58.25,
262
+ "completions/clipped_ratio": 0.0125,
263
+ "completions/max_length": 180.6,
264
+ "completions/max_terminated_length": 153.2,
265
+ "completions/mean_length": 58.25,
266
+ "completions/mean_terminated_length": 55.67250061035156,
267
+ "completions/min_length": 33.2,
268
+ "completions/min_terminated_length": 33.2,
269
+ "epoch": 5.0,
270
+ "frac_reward_zero_std": 0.55,
271
+ "grad_norm": 0.27360936999320984,
272
+ "kl": 0.04483541352674365,
273
+ "learning_rate": 8.995939984474624e-05,
274
+ "loss": 0.026852786540985107,
275
+ "num_tokens": 200660.0,
276
+ "reward": 0.774821424484253,
277
+ "reward_std": 0.32772802114486693,
278
+ "rewards/reward_fn/mean": 0.774821424484253,
279
+ "rewards/reward_fn/std": 0.5432307064533234,
280
+ "step": 50
281
+ },
282
+ {
283
+ "clip_ratio/high_max": 0.0,
284
+ "clip_ratio/high_mean": 0.0,
285
+ "clip_ratio/low_mean": 0.0,
286
+ "clip_ratio/low_min": 0.0,
287
+ "clip_ratio/region_mean": 0.0,
288
+ "completion_length": 51.225,
289
+ "completions/clipped_ratio": 0.0,
290
+ "completions/max_length": 117.4,
291
+ "completions/max_terminated_length": 117.4,
292
+ "completions/mean_length": 51.225,
293
+ "completions/mean_terminated_length": 51.225,
294
+ "completions/min_length": 36.6,
295
+ "completions/min_terminated_length": 36.6,
296
+ "epoch": 5.5,
297
+ "frac_reward_zero_std": 0.6,
298
+ "grad_norm": 0.21470840275287628,
299
+ "kl": 0.04195364024490118,
300
+ "learning_rate": 8.73410738492077e-05,
301
+ "loss": 0.023307889699935913,
302
+ "num_tokens": 218442.0,
303
+ "reward": 1.0364285826683044,
304
+ "reward_std": 0.37462076991796495,
305
+ "rewards/reward_fn/mean": 1.0364285826683044,
306
+ "rewards/reward_fn/std": 0.6156843721866607,
307
+ "step": 55
308
+ },
309
+ {
310
+ "clip_ratio/high_max": 0.0,
311
+ "clip_ratio/high_mean": 0.0,
312
+ "clip_ratio/low_mean": 0.0,
313
+ "clip_ratio/low_min": 0.0,
314
+ "clip_ratio/region_mean": 0.0,
315
+ "completion_length": 53.225,
316
+ "completions/clipped_ratio": 0.0125,
317
+ "completions/max_length": 159.8,
318
+ "completions/max_terminated_length": 123.8,
319
+ "completions/mean_length": 53.225,
320
+ "completions/mean_terminated_length": 50.68583374023437,
321
+ "completions/min_length": 33.8,
322
+ "completions/min_terminated_length": 33.8,
323
+ "epoch": 6.0,
324
+ "frac_reward_zero_std": 0.45,
325
+ "grad_norm": 0.29306381940841675,
326
+ "kl": 0.05784560404717922,
327
+ "learning_rate": 8.44676704559283e-05,
328
+ "loss": 0.015561078488826752,
329
+ "num_tokens": 236112.0,
330
+ "reward": 1.1410714387893677,
331
+ "reward_std": 0.3540264666080475,
332
+ "rewards/reward_fn/mean": 1.1410714387893677,
333
+ "rewards/reward_fn/std": 0.607547128200531,
334
+ "step": 60
335
+ },
336
+ {
337
+ "clip_ratio/high_max": 0.0,
338
+ "clip_ratio/high_mean": 0.0,
339
+ "clip_ratio/low_mean": 0.0,
340
+ "clip_ratio/low_min": 0.0,
341
+ "clip_ratio/region_mean": 0.0,
342
+ "completion_length": 62.2625,
343
+ "completions/clipped_ratio": 0.05,
344
+ "completions/max_length": 153.2,
345
+ "completions/max_terminated_length": 117.4,
346
+ "completions/mean_length": 62.2625,
347
+ "completions/mean_terminated_length": 52.46428680419922,
348
+ "completions/min_length": 30.6,
349
+ "completions/min_terminated_length": 30.6,
350
+ "epoch": 6.5,
351
+ "frac_reward_zero_std": 0.65,
352
+ "grad_norm": 0.2615426480770111,
353
+ "kl": 0.09820330440998078,
354
+ "learning_rate": 8.135881792367686e-05,
355
+ "loss": 0.04314278066158295,
356
+ "num_tokens": 254033.0,
357
+ "reward": 1.2110714197158814,
358
+ "reward_std": 0.2716316431760788,
359
+ "rewards/reward_fn/mean": 1.2110714197158814,
360
+ "rewards/reward_fn/std": 0.6701113641262054,
361
+ "step": 65
362
+ },
363
+ {
364
+ "clip_ratio/high_max": 0.0,
365
+ "clip_ratio/high_mean": 0.0,
366
+ "clip_ratio/low_mean": 0.0,
367
+ "clip_ratio/low_min": 0.0,
368
+ "clip_ratio/region_mean": 0.0,
369
+ "completion_length": 82.5625,
370
+ "completions/clipped_ratio": 0.1125,
371
+ "completions/max_length": 234.4,
372
+ "completions/max_terminated_length": 147.4,
373
+ "completions/mean_length": 82.5625,
374
+ "completions/mean_terminated_length": 60.485835266113284,
375
+ "completions/min_length": 35.4,
376
+ "completions/min_terminated_length": 35.4,
377
+ "epoch": 7.0,
378
+ "frac_reward_zero_std": 0.6,
379
+ "grad_norm": 0.1326226145029068,
380
+ "kl": 0.07194480895996094,
381
+ "learning_rate": 7.803575286758364e-05,
382
+ "loss": 0.03152734041213989,
383
+ "num_tokens": 275222.0,
384
+ "reward": 1.320892834663391,
385
+ "reward_std": 0.35234126150608064,
386
+ "rewards/reward_fn/mean": 1.320892858505249,
387
+ "rewards/reward_fn/std": 0.676600456237793,
388
+ "step": 70
389
+ },
390
+ {
391
+ "clip_ratio/high_max": 0.0,
392
+ "clip_ratio/high_mean": 0.0,
393
+ "clip_ratio/low_mean": 0.0,
394
+ "clip_ratio/low_min": 0.0,
395
+ "clip_ratio/region_mean": 0.0,
396
+ "completion_length": 72.7625,
397
+ "completions/clipped_ratio": 0.0625,
398
+ "completions/max_length": 220.4,
399
+ "completions/max_terminated_length": 166.8,
400
+ "completions/mean_length": 72.7625,
401
+ "completions/mean_terminated_length": 59.95151138305664,
402
+ "completions/min_length": 30.2,
403
+ "completions/min_terminated_length": 30.2,
404
+ "epoch": 7.5,
405
+ "frac_reward_zero_std": 0.7,
406
+ "grad_norm": 0.15138906240463257,
407
+ "kl": 0.09165919721126556,
408
+ "learning_rate": 7.452117519152542e-05,
409
+ "loss": 0.0013809207826852798,
410
+ "num_tokens": 294407.0,
411
+ "reward": 1.3473214387893677,
412
+ "reward_std": 0.17192934565246104,
413
+ "rewards/reward_fn/mean": 1.3473214387893677,
414
+ "rewards/reward_fn/std": 0.6935437321662903,
415
+ "step": 75
416
+ },
417
+ {
418
+ "clip_ratio/high_max": 0.0,
419
+ "clip_ratio/high_mean": 0.0,
420
+ "clip_ratio/low_mean": 0.0,
421
+ "clip_ratio/low_min": 0.0,
422
+ "clip_ratio/region_mean": 0.0,
423
+ "completion_length": 80.9375,
424
+ "completions/clipped_ratio": 0.125,
425
+ "completions/max_length": 222.4,
426
+ "completions/max_terminated_length": 156.0,
427
+ "completions/mean_length": 80.9375,
428
+ "completions/mean_terminated_length": 55.99818344116211,
429
+ "completions/min_length": 31.6,
430
+ "completions/min_terminated_length": 31.6,
431
+ "epoch": 8.0,
432
+ "frac_reward_zero_std": 0.65,
433
+ "grad_norm": 0.19589942693710327,
434
+ "kl": 0.0683181069791317,
435
+ "learning_rate": 7.083909302476453e-05,
436
+ "loss": 0.033023273944854735,
437
+ "num_tokens": 314502.0,
438
+ "reward": 1.1685714244842529,
439
+ "reward_std": 0.27699449211359023,
440
+ "rewards/reward_fn/mean": 1.1685714244842529,
441
+ "rewards/reward_fn/std": 0.5244716942310333,
442
+ "step": 80
443
+ },
444
+ {
445
+ "clip_ratio/high_max": 0.0,
446
+ "clip_ratio/high_mean": 0.0,
447
+ "clip_ratio/low_mean": 0.0,
448
+ "clip_ratio/low_min": 0.0,
449
+ "clip_ratio/region_mean": 0.0,
450
+ "completion_length": 60.6625,
451
+ "completions/clipped_ratio": 0.05,
452
+ "completions/max_length": 207.2,
453
+ "completions/max_terminated_length": 100.0,
454
+ "completions/mean_length": 60.6625,
455
+ "completions/mean_terminated_length": 50.27571563720703,
456
+ "completions/min_length": 35.2,
457
+ "completions/min_terminated_length": 35.2,
458
+ "epoch": 8.5,
459
+ "frac_reward_zero_std": 0.6,
460
+ "grad_norm": 0.20066438615322113,
461
+ "kl": 0.0837025498971343,
462
+ "learning_rate": 6.701465872208216e-05,
463
+ "loss": 0.022454433143138885,
464
+ "num_tokens": 334007.0,
465
+ "reward": 1.3910714387893677,
466
+ "reward_std": 0.21785714849829674,
467
+ "rewards/reward_fn/mean": 1.3910714387893677,
468
+ "rewards/reward_fn/std": 0.6235540926456451,
469
+ "step": 85
470
+ },
471
+ {
472
+ "clip_ratio/high_max": 0.0,
473
+ "clip_ratio/high_mean": 0.0,
474
+ "clip_ratio/low_mean": 0.0,
475
+ "clip_ratio/low_min": 0.0,
476
+ "clip_ratio/region_mean": 0.0,
477
+ "completion_length": 54.025,
478
+ "completions/clipped_ratio": 0.025,
479
+ "completions/max_length": 143.4,
480
+ "completions/max_terminated_length": 117.2,
481
+ "completions/mean_length": 54.025,
482
+ "completions/mean_terminated_length": 49.025001525878906,
483
+ "completions/min_length": 30.2,
484
+ "completions/min_terminated_length": 30.2,
485
+ "epoch": 9.0,
486
+ "frac_reward_zero_std": 0.8,
487
+ "grad_norm": 0.18161360919475555,
488
+ "kl": 0.05805329587310552,
489
+ "learning_rate": 6.307399704769099e-05,
490
+ "loss": -0.04096371531486511,
491
+ "num_tokens": 351041.0,
492
+ "reward": 1.24375,
493
+ "reward_std": 0.2125,
494
+ "rewards/reward_fn/mean": 1.24375,
495
+ "rewards/reward_fn/std": 0.584508627653122,
496
+ "step": 90
497
+ },
498
+ {
499
+ "clip_ratio/high_max": 0.0,
500
+ "clip_ratio/high_mean": 0.0,
501
+ "clip_ratio/low_mean": 0.0,
502
+ "clip_ratio/low_min": 0.0,
503
+ "clip_ratio/region_mean": 0.0,
504
+ "completion_length": 54.65,
505
+ "completions/clipped_ratio": 0.0125,
506
+ "completions/max_length": 145.0,
507
+ "completions/max_terminated_length": 142.2,
508
+ "completions/mean_length": 54.65,
509
+ "completions/mean_terminated_length": 52.23083419799805,
510
+ "completions/min_length": 35.4,
511
+ "completions/min_terminated_length": 35.4,
512
+ "epoch": 9.5,
513
+ "frac_reward_zero_std": 0.9,
514
+ "grad_norm": 0.10424701869487762,
515
+ "kl": 0.05452606473118067,
516
+ "learning_rate": 5.90440267166055e-05,
517
+ "loss": 0.0021311525255441667,
518
+ "num_tokens": 368361.0,
519
+ "reward": 1.1996428489685058,
520
+ "reward_std": 0.058449314022436735,
521
+ "rewards/reward_fn/mean": 1.1996428489685058,
522
+ "rewards/reward_fn/std": 0.4604207634925842,
523
+ "step": 95
524
+ },
525
+ {
526
+ "clip_ratio/high_max": 0.0,
527
+ "clip_ratio/high_mean": 0.0,
528
+ "clip_ratio/low_mean": 0.0,
529
+ "clip_ratio/low_min": 0.0,
530
+ "clip_ratio/region_mean": 0.0,
531
+ "completion_length": 68.3875,
532
+ "completions/clipped_ratio": 0.0625,
533
+ "completions/max_length": 254.4,
534
+ "completions/max_terminated_length": 140.8,
535
+ "completions/mean_length": 68.3875,
536
+ "completions/mean_terminated_length": 55.740359497070315,
537
+ "completions/min_length": 36.2,
538
+ "completions/min_terminated_length": 36.2,
539
+ "epoch": 10.0,
540
+ "frac_reward_zero_std": 0.75,
541
+ "grad_norm": 0.13536690175533295,
542
+ "kl": 0.0854075826704502,
543
+ "learning_rate": 5.495227651252315e-05,
544
+ "loss": -0.025432443618774413,
545
+ "num_tokens": 388400.0,
546
+ "reward": 1.3283928632736206,
547
+ "reward_std": 0.2009493112564087,
548
+ "rewards/reward_fn/mean": 1.3283928632736206,
549
+ "rewards/reward_fn/std": 0.5586225628852844,
550
+ "step": 100
551
+ },
552
+ {
553
+ "clip_ratio/high_max": 0.0,
554
+ "clip_ratio/high_mean": 0.0,
555
+ "clip_ratio/low_mean": 0.0,
556
+ "clip_ratio/low_min": 0.0,
557
+ "clip_ratio/region_mean": 0.0,
558
+ "completion_length": 77.6,
559
+ "completions/clipped_ratio": 0.125,
560
+ "completions/max_length": 256.0,
561
+ "completions/max_terminated_length": 115.6,
562
+ "completions/mean_length": 77.6,
563
+ "completions/mean_terminated_length": 52.14154052734375,
564
+ "completions/min_length": 30.4,
565
+ "completions/min_terminated_length": 30.4,
566
+ "epoch": 10.5,
567
+ "frac_reward_zero_std": 0.9,
568
+ "grad_norm": 0.0006729933083988726,
569
+ "kl": 0.07917941156774759,
570
+ "learning_rate": 5.0826697238317935e-05,
571
+ "loss": -0.0014228260144591332,
572
+ "num_tokens": 408808.0,
573
+ "reward": 1.4,
574
+ "reward_std": 0.1,
575
+ "rewards/reward_fn/mean": 1.4,
576
+ "rewards/reward_fn/std": 0.5159838497638702,
577
+ "step": 105
578
+ },
579
+ {
580
+ "clip_ratio/high_max": 0.0,
581
+ "clip_ratio/high_mean": 0.0,
582
+ "clip_ratio/low_mean": 0.0,
583
+ "clip_ratio/low_min": 0.0,
584
+ "clip_ratio/region_mean": 0.0,
585
+ "completion_length": 71.6375,
586
+ "completions/clipped_ratio": 0.1,
587
+ "completions/max_length": 223.8,
588
+ "completions/max_terminated_length": 125.4,
589
+ "completions/mean_length": 71.6375,
590
+ "completions/mean_terminated_length": 50.991825103759766,
591
+ "completions/min_length": 34.6,
592
+ "completions/min_terminated_length": 34.6,
593
+ "epoch": 11.0,
594
+ "frac_reward_zero_std": 0.85,
595
+ "grad_norm": 0.1310124397277832,
596
+ "kl": 0.0553528118878603,
597
+ "learning_rate": 4.669547078371504e-05,
598
+ "loss": 0.027294650673866272,
599
+ "num_tokens": 427787.0,
600
+ "reward": 1.15,
601
+ "reward_std": 0.15773502588272095,
602
+ "rewards/reward_fn/mean": 1.15,
603
+ "rewards/reward_fn/std": 0.482165002822876,
604
+ "step": 110
605
+ },
606
+ {
607
+ "clip_ratio/high_max": 0.0,
608
+ "clip_ratio/high_mean": 0.0,
609
+ "clip_ratio/low_mean": 0.0,
610
+ "clip_ratio/low_min": 0.0,
611
+ "clip_ratio/region_mean": 0.0,
612
+ "completion_length": 75.975,
613
+ "completions/clipped_ratio": 0.1375,
614
+ "completions/max_length": 214.8,
615
+ "completions/max_terminated_length": 76.2,
616
+ "completions/mean_length": 75.975,
617
+ "completions/mean_terminated_length": 47.73154296875,
618
+ "completions/min_length": 36.2,
619
+ "completions/min_terminated_length": 36.2,
620
+ "epoch": 11.5,
621
+ "frac_reward_zero_std": 0.9,
622
+ "grad_norm": 0.001892779255285859,
623
+ "kl": 0.056033550202846526,
624
+ "learning_rate": 4.2586817614407895e-05,
625
+ "loss": -0.03689469993114471,
626
+ "num_tokens": 447185.0,
627
+ "reward": 1.2867857217788696,
628
+ "reward_std": 0.026428571343421935,
629
+ "rewards/reward_fn/mean": 1.2867857217788696,
630
+ "rewards/reward_fn/std": 0.3760030448436737,
631
+ "step": 115
632
+ },
633
+ {
634
+ "clip_ratio/high_max": 0.0,
635
+ "clip_ratio/high_mean": 0.0,
636
+ "clip_ratio/low_mean": 0.0,
637
+ "clip_ratio/low_min": 0.0,
638
+ "clip_ratio/region_mean": 0.0,
639
+ "completion_length": 76.725,
640
+ "completions/clipped_ratio": 0.1375,
641
+ "completions/max_length": 218.2,
642
+ "completions/max_terminated_length": 82.8,
643
+ "completions/mean_length": 76.725,
644
+ "completions/mean_terminated_length": 48.34801483154297,
645
+ "completions/min_length": 30.4,
646
+ "completions/min_terminated_length": 30.4,
647
+ "epoch": 12.0,
648
+ "frac_reward_zero_std": 0.95,
649
+ "grad_norm": 0.00067809724714607,
650
+ "kl": 0.06624810677021742,
651
+ "learning_rate": 3.852880399766243e-05,
652
+ "loss": 0.005734995752573013,
653
+ "num_tokens": 466979.0,
654
+ "reward": 1.325,
655
+ "reward_std": 0.05,
656
+ "rewards/reward_fn/mean": 1.325,
657
+ "rewards/reward_fn/std": 0.4854445576667786,
658
+ "step": 120
659
+ },
660
+ {
661
+ "clip_ratio/high_max": 0.0,
662
+ "clip_ratio/high_mean": 0.0,
663
+ "clip_ratio/low_mean": 0.0,
664
+ "clip_ratio/low_min": 0.0,
665
+ "clip_ratio/region_mean": 0.0,
666
+ "completion_length": 70.3625,
667
+ "completions/clipped_ratio": 0.0875,
668
+ "completions/max_length": 183.8,
669
+ "completions/max_terminated_length": 99.4,
670
+ "completions/mean_length": 70.3625,
671
+ "completions/mean_terminated_length": 52.45879211425781,
672
+ "completions/min_length": 38.0,
673
+ "completions/min_terminated_length": 38.0,
674
+ "epoch": 12.5,
675
+ "frac_reward_zero_std": 0.9,
676
+ "grad_norm": 0.0024091172963380814,
677
+ "kl": 0.0920485682785511,
678
+ "learning_rate": 3.4549150281252636e-05,
679
+ "loss": -0.03069324791431427,
680
+ "num_tokens": 486616.0,
681
+ "reward": 1.3125,
682
+ "reward_std": 0.075,
683
+ "rewards/reward_fn/mean": 1.3125,
684
+ "rewards/reward_fn/std": 0.46982967257499697,
685
+ "step": 125
686
+ },
687
+ {
688
+ "clip_ratio/high_max": 0.0,
689
+ "clip_ratio/high_mean": 0.0,
690
+ "clip_ratio/low_mean": 0.0,
691
+ "clip_ratio/low_min": 0.0,
692
+ "clip_ratio/region_mean": 0.0,
693
+ "completion_length": 71.8625,
694
+ "completions/clipped_ratio": 0.1,
695
+ "completions/max_length": 214.8,
696
+ "completions/max_terminated_length": 120.2,
697
+ "completions/mean_length": 71.8625,
698
+ "completions/mean_terminated_length": 51.76010284423828,
699
+ "completions/min_length": 33.4,
700
+ "completions/min_terminated_length": 33.4,
701
+ "epoch": 13.0,
702
+ "frac_reward_zero_std": 1.0,
703
+ "grad_norm": 0.0004440450284164399,
704
+ "kl": 0.053088791808113454,
705
+ "learning_rate": 3.0675041535377405e-05,
706
+ "loss": 5.229191156104207e-05,
707
+ "num_tokens": 505905.0,
708
+ "reward": 1.35,
709
+ "reward_std": 0.0,
710
+ "rewards/reward_fn/mean": 1.35,
711
+ "rewards/reward_fn/std": 0.39928138852119444,
712
+ "step": 130
713
+ },
714
+ {
715
+ "clip_ratio/high_max": 0.0,
716
+ "clip_ratio/high_mean": 0.0,
717
+ "clip_ratio/low_mean": 0.0,
718
+ "clip_ratio/low_min": 0.0,
719
+ "clip_ratio/region_mean": 0.0,
720
+ "completion_length": 84.5,
721
+ "completions/clipped_ratio": 0.175,
722
+ "completions/max_length": 182.8,
723
+ "completions/max_terminated_length": 76.6,
724
+ "completions/mean_length": 84.5,
725
+ "completions/mean_terminated_length": 48.71715393066406,
726
+ "completions/min_length": 38.0,
727
+ "completions/min_terminated_length": 38.0,
728
+ "epoch": 13.5,
729
+ "frac_reward_zero_std": 0.95,
730
+ "grad_norm": 0.0007964075193740427,
731
+ "kl": 0.07870542965829372,
732
+ "learning_rate": 2.693294185106562e-05,
733
+ "loss": 0.0014319854788482189,
734
+ "num_tokens": 526773.0,
735
+ "reward": 1.3875,
736
+ "reward_std": 0.025,
737
+ "rewards/reward_fn/mean": 1.3875,
738
+ "rewards/reward_fn/std": 0.38874412178993223,
739
+ "step": 135
740
+ },
741
+ {
742
+ "clip_ratio/high_max": 0.0,
743
+ "clip_ratio/high_mean": 0.0,
744
+ "clip_ratio/low_mean": 0.0,
745
+ "clip_ratio/low_min": 0.0,
746
+ "clip_ratio/region_mean": 0.0,
747
+ "completion_length": 70.6375,
748
+ "completions/clipped_ratio": 0.1,
749
+ "completions/max_length": 219.2,
750
+ "completions/max_terminated_length": 104.2,
751
+ "completions/mean_length": 70.6375,
752
+ "completions/mean_terminated_length": 50.06154937744141,
753
+ "completions/min_length": 37.0,
754
+ "completions/min_terminated_length": 37.0,
755
+ "epoch": 14.0,
756
+ "frac_reward_zero_std": 1.0,
757
+ "grad_norm": 0.00017480542010162026,
758
+ "kl": 0.05223398320376873,
759
+ "learning_rate": 2.3348413563600325e-05,
760
+ "loss": 5.521520506590605e-05,
761
+ "num_tokens": 545864.0,
762
+ "reward": 1.3,
763
+ "reward_std": 0.0,
764
+ "rewards/reward_fn/mean": 1.3,
765
+ "rewards/reward_fn/std": 0.38544455766677854,
766
+ "step": 140
767
+ },
768
+ {
769
+ "clip_ratio/high_max": 0.0,
770
+ "clip_ratio/high_mean": 0.0,
771
+ "clip_ratio/low_mean": 0.0,
772
+ "clip_ratio/low_min": 0.0,
773
+ "clip_ratio/region_mean": 0.0,
774
+ "completion_length": 75.7,
775
+ "completions/clipped_ratio": 0.1125,
776
+ "completions/max_length": 220.0,
777
+ "completions/max_terminated_length": 134.4,
778
+ "completions/mean_length": 75.7,
779
+ "completions/mean_terminated_length": 52.85439682006836,
780
+ "completions/min_length": 36.6,
781
+ "completions/min_terminated_length": 36.6,
782
+ "epoch": 14.5,
783
+ "frac_reward_zero_std": 0.95,
784
+ "grad_norm": 0.0014995499514043331,
785
+ "kl": 0.061327320709824565,
786
+ "learning_rate": 1.9945942635848748e-05,
787
+ "loss": -0.039644479751586914,
788
+ "num_tokens": 565456.0,
789
+ "reward": 1.275,
790
+ "reward_std": 0.05,
791
+ "rewards/reward_fn/mean": 1.275,
792
+ "rewards/reward_fn/std": 0.40599284172058103,
793
+ "step": 145
794
+ },
795
+ {
796
+ "clip_ratio/high_max": 0.0,
797
+ "clip_ratio/high_mean": 0.0,
798
+ "clip_ratio/low_mean": 0.0,
799
+ "clip_ratio/low_min": 0.0,
800
+ "clip_ratio/region_mean": 0.0,
801
+ "completion_length": 88.6875,
802
+ "completions/clipped_ratio": 0.1875,
803
+ "completions/max_length": 214.6,
804
+ "completions/max_terminated_length": 98.8,
805
+ "completions/mean_length": 88.6875,
806
+ "completions/mean_terminated_length": 51.92777633666992,
807
+ "completions/min_length": 35.0,
808
+ "completions/min_terminated_length": 35.0,
809
+ "epoch": 15.0,
810
+ "frac_reward_zero_std": 1.0,
811
+ "grad_norm": 0.001117849606089294,
812
+ "kl": 0.05267696371302009,
813
+ "learning_rate": 1.6748771394307585e-05,
814
+ "loss": 6.003726157359779e-05,
815
+ "num_tokens": 586207.0,
816
+ "reward": 1.35,
817
+ "reward_std": 0.0,
818
+ "rewards/reward_fn/mean": 1.35,
819
+ "rewards/reward_fn/std": 0.37160772681236265,
820
+ "step": 150
821
+ }
822
+ ],
823
+ "logging_steps": 5,
824
+ "max_steps": 200,
825
+ "num_input_tokens_seen": 586207,
826
+ "num_train_epochs": 20,
827
+ "save_steps": 50,
828
+ "stateful_callbacks": {
829
+ "TrainerControl": {
830
+ "args": {
831
+ "should_epoch_stop": false,
832
+ "should_evaluate": false,
833
+ "should_log": false,
834
+ "should_save": true,
835
+ "should_training_stop": false
836
+ },
837
+ "attributes": {}
838
+ }
839
+ },
840
+ "total_flos": 0.0,
841
+ "train_batch_size": 8,
842
+ "trial_name": null,
843
+ "trial_params": null
844
+ }
checkpoint-150/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61516d246785bb4adf93d26e6fe8ffbb65a924b8a03f13b01e1d00b7066140af
3
+ size 6673
checkpoint-200/README.md ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: unsloth/Qwen2.5-Coder-0.5B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:unsloth/Qwen2.5-Coder-0.5B-Instruct
7
+ - grpo
8
+ - lora
9
+ - transformers
10
+ - trl
11
+ - unsloth
12
+ ---
13
+
14
+ # Model Card for Model ID
15
+
16
+ <!-- Provide a quick summary of what the model is/does. -->
17
+
18
+
19
+
20
+ ## Model Details
21
+
22
+ ### Model Description
23
+
24
+ <!-- Provide a longer summary of what this model is. -->
25
+
26
+
27
+
28
+ - **Developed by:** [More Information Needed]
29
+ - **Funded by [optional]:** [More Information Needed]
30
+ - **Shared by [optional]:** [More Information Needed]
31
+ - **Model type:** [More Information Needed]
32
+ - **Language(s) (NLP):** [More Information Needed]
33
+ - **License:** [More Information Needed]
34
+ - **Finetuned from model [optional]:** [More Information Needed]
35
+
36
+ ### Model Sources [optional]
37
+
38
+ <!-- Provide the basic links for the model. -->
39
+
40
+ - **Repository:** [More Information Needed]
41
+ - **Paper [optional]:** [More Information Needed]
42
+ - **Demo [optional]:** [More Information Needed]
43
+
44
+ ## Uses
45
+
46
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
47
+
48
+ ### Direct Use
49
+
50
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
51
+
52
+ [More Information Needed]
53
+
54
+ ### Downstream Use [optional]
55
+
56
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
57
+
58
+ [More Information Needed]
59
+
60
+ ### Out-of-Scope Use
61
+
62
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
63
+
64
+ [More Information Needed]
65
+
66
+ ## Bias, Risks, and Limitations
67
+
68
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
69
+
70
+ [More Information Needed]
71
+
72
+ ### Recommendations
73
+
74
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
75
+
76
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
77
+
78
+ ## How to Get Started with the Model
79
+
80
+ Use the code below to get started with the model.
81
+
82
+ [More Information Needed]
83
+
84
+ ## Training Details
85
+
86
+ ### Training Data
87
+
88
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
89
+
90
+ [More Information Needed]
91
+
92
+ ### Training Procedure
93
+
94
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
95
+
96
+ #### Preprocessing [optional]
97
+
98
+ [More Information Needed]
99
+
100
+
101
+ #### Training Hyperparameters
102
+
103
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
104
+
105
+ #### Speeds, Sizes, Times [optional]
106
+
107
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
108
+
109
+ [More Information Needed]
110
+
111
+ ## Evaluation
112
+
113
+ <!-- This section describes the evaluation protocols and provides the results. -->
114
+
115
+ ### Testing Data, Factors & Metrics
116
+
117
+ #### Testing Data
118
+
119
+ <!-- This should link to a Dataset Card if possible. -->
120
+
121
+ [More Information Needed]
122
+
123
+ #### Factors
124
+
125
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
126
+
127
+ [More Information Needed]
128
+
129
+ #### Metrics
130
+
131
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
132
+
133
+ [More Information Needed]
134
+
135
+ ### Results
136
+
137
+ [More Information Needed]
138
+
139
+ #### Summary
140
+
141
+
142
+
143
+ ## Model Examination [optional]
144
+
145
+ <!-- Relevant interpretability work for the model goes here -->
146
+
147
+ [More Information Needed]
148
+
149
+ ## Environmental Impact
150
+
151
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
152
+
153
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
154
+
155
+ - **Hardware Type:** [More Information Needed]
156
+ - **Hours used:** [More Information Needed]
157
+ - **Cloud Provider:** [More Information Needed]
158
+ - **Compute Region:** [More Information Needed]
159
+ - **Carbon Emitted:** [More Information Needed]
160
+
161
+ ## Technical Specifications [optional]
162
+
163
+ ### Model Architecture and Objective
164
+
165
+ [More Information Needed]
166
+
167
+ ### Compute Infrastructure
168
+
169
+ [More Information Needed]
170
+
171
+ #### Hardware
172
+
173
+ [More Information Needed]
174
+
175
+ #### Software
176
+
177
+ [More Information Needed]
178
+
179
+ ## Citation [optional]
180
+
181
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
182
+
183
+ **BibTeX:**
184
+
185
+ [More Information Needed]
186
+
187
+ **APA:**
188
+
189
+ [More Information Needed]
190
+
191
+ ## Glossary [optional]
192
+
193
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
194
+
195
+ [More Information Needed]
196
+
197
+ ## More Information [optional]
198
+
199
+ [More Information Needed]
200
+
201
+ ## Model Card Authors [optional]
202
+
203
+ [More Information Needed]
204
+
205
+ ## Model Card Contact
206
+
207
+ [More Information Needed]
208
+ ### Framework versions
209
+
210
+ - PEFT 0.18.1
checkpoint-200/adapter_config.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": {
6
+ "base_model_class": "Qwen2ForCausalLM",
7
+ "parent_library": "transformers.models.qwen2.modeling_qwen2",
8
+ "unsloth_fixed": true
9
+ },
10
+ "base_model_name_or_path": "unsloth/Qwen2.5-Coder-0.5B-Instruct",
11
+ "bias": "none",
12
+ "corda_config": null,
13
+ "ensure_weight_tying": false,
14
+ "eva_config": null,
15
+ "exclude_modules": null,
16
+ "fan_in_fan_out": false,
17
+ "inference_mode": true,
18
+ "init_lora_weights": true,
19
+ "layer_replication": null,
20
+ "layers_pattern": null,
21
+ "layers_to_transform": null,
22
+ "loftq_config": {},
23
+ "lora_alpha": 32,
24
+ "lora_bias": false,
25
+ "lora_dropout": 0,
26
+ "megatron_config": null,
27
+ "megatron_core": "megatron.core",
28
+ "modules_to_save": null,
29
+ "peft_type": "LORA",
30
+ "peft_version": "0.18.1",
31
+ "qalora_group_size": 16,
32
+ "r": 32,
33
+ "rank_pattern": {},
34
+ "revision": null,
35
+ "target_modules": [
36
+ "v_proj",
37
+ "k_proj",
38
+ "o_proj",
39
+ "up_proj",
40
+ "gate_proj",
41
+ "q_proj",
42
+ "down_proj"
43
+ ],
44
+ "target_parameters": null,
45
+ "task_type": "CAUSAL_LM",
46
+ "trainable_token_indices": null,
47
+ "use_dora": false,
48
+ "use_qalora": false,
49
+ "use_rslora": false
50
+ }
checkpoint-200/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c9d0849581a47371084753e072266e18e8c8b5479354ba3c9fb4e44fd19fee0
3
+ size 70430032
checkpoint-200/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoint-200/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5dedb72c3dbe884e5fe47a64e284132cc4464a1ba833bf4fa7637eda2544f60d
3
+ size 36139685
checkpoint-200/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:270254ddfd4c7f90b22f5d661a1c3aedadf08c0fe7fbf135080c566c14a61f6d
3
+ size 14645
checkpoint-200/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:124625e167eb28acbfc793cfcb3e8a08b32e7fea06501462bc9e420a5e1beb2a
3
+ size 1383
checkpoint-200/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e2ea8f0d9967134d39bb3975e73cff05d7e316dc3701da4c4d0c02a6d1776f7
3
+ size 1465
checkpoint-200/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af5891a15588546db1ac7f2baf8fa94835a51a85c032c39793a55bb048b47446
3
+ size 11422523
checkpoint-200/tokenizer_config.json ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [],
9
+ "is_local": false,
10
+ "model_max_length": 32768,
11
+ "pad_token": "<|PAD_TOKEN|>",
12
+ "padding_side": "right",
13
+ "split_special_tokens": false,
14
+ "tokenizer_class": "Qwen2Tokenizer",
15
+ "unk_token": null,
16
+ "added_tokens_decoder": {
17
+ "151643": {
18
+ "content": "<|endoftext|>",
19
+ "single_word": false,
20
+ "lstrip": false,
21
+ "rstrip": false,
22
+ "normalized": false,
23
+ "special": true
24
+ },
25
+ "151644": {
26
+ "content": "<|im_start|>",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ "151645": {
34
+ "content": "<|im_end|>",
35
+ "single_word": false,
36
+ "lstrip": false,
37
+ "rstrip": false,
38
+ "normalized": false,
39
+ "special": true
40
+ },
41
+ "151646": {
42
+ "content": "<|object_ref_start|>",
43
+ "single_word": false,
44
+ "lstrip": false,
45
+ "rstrip": false,
46
+ "normalized": false,
47
+ "special": true
48
+ },
49
+ "151647": {
50
+ "content": "<|object_ref_end|>",
51
+ "single_word": false,
52
+ "lstrip": false,
53
+ "rstrip": false,
54
+ "normalized": false,
55
+ "special": true
56
+ },
57
+ "151648": {
58
+ "content": "<|box_start|>",
59
+ "single_word": false,
60
+ "lstrip": false,
61
+ "rstrip": false,
62
+ "normalized": false,
63
+ "special": true
64
+ },
65
+ "151649": {
66
+ "content": "<|box_end|>",
67
+ "single_word": false,
68
+ "lstrip": false,
69
+ "rstrip": false,
70
+ "normalized": false,
71
+ "special": true
72
+ },
73
+ "151650": {
74
+ "content": "<|quad_start|>",
75
+ "single_word": false,
76
+ "lstrip": false,
77
+ "rstrip": false,
78
+ "normalized": false,
79
+ "special": true
80
+ },
81
+ "151651": {
82
+ "content": "<|quad_end|>",
83
+ "single_word": false,
84
+ "lstrip": false,
85
+ "rstrip": false,
86
+ "normalized": false,
87
+ "special": true
88
+ },
89
+ "151652": {
90
+ "content": "<|vision_start|>",
91
+ "single_word": false,
92
+ "lstrip": false,
93
+ "rstrip": false,
94
+ "normalized": false,
95
+ "special": true
96
+ },
97
+ "151653": {
98
+ "content": "<|vision_end|>",
99
+ "single_word": false,
100
+ "lstrip": false,
101
+ "rstrip": false,
102
+ "normalized": false,
103
+ "special": true
104
+ },
105
+ "151654": {
106
+ "content": "<|vision_pad|>",
107
+ "single_word": false,
108
+ "lstrip": false,
109
+ "rstrip": false,
110
+ "normalized": false,
111
+ "special": true
112
+ },
113
+ "151655": {
114
+ "content": "<|image_pad|>",
115
+ "single_word": false,
116
+ "lstrip": false,
117
+ "rstrip": false,
118
+ "normalized": false,
119
+ "special": true
120
+ },
121
+ "151656": {
122
+ "content": "<|video_pad|>",
123
+ "single_word": false,
124
+ "lstrip": false,
125
+ "rstrip": false,
126
+ "normalized": false,
127
+ "special": true
128
+ },
129
+ "151657": {
130
+ "content": "<tool_call>",
131
+ "single_word": false,
132
+ "lstrip": false,
133
+ "rstrip": false,
134
+ "normalized": false,
135
+ "special": false
136
+ },
137
+ "151658": {
138
+ "content": "</tool_call>",
139
+ "single_word": false,
140
+ "lstrip": false,
141
+ "rstrip": false,
142
+ "normalized": false,
143
+ "special": false
144
+ },
145
+ "151659": {
146
+ "content": "<|fim_prefix|>",
147
+ "single_word": false,
148
+ "lstrip": false,
149
+ "rstrip": false,
150
+ "normalized": false,
151
+ "special": false
152
+ },
153
+ "151660": {
154
+ "content": "<|fim_middle|>",
155
+ "single_word": false,
156
+ "lstrip": false,
157
+ "rstrip": false,
158
+ "normalized": false,
159
+ "special": false
160
+ },
161
+ "151661": {
162
+ "content": "<|fim_suffix|>",
163
+ "single_word": false,
164
+ "lstrip": false,
165
+ "rstrip": false,
166
+ "normalized": false,
167
+ "special": false
168
+ },
169
+ "151662": {
170
+ "content": "<|fim_pad|>",
171
+ "single_word": false,
172
+ "lstrip": false,
173
+ "rstrip": false,
174
+ "normalized": false,
175
+ "special": false
176
+ },
177
+ "151663": {
178
+ "content": "<|repo_name|>",
179
+ "single_word": false,
180
+ "lstrip": false,
181
+ "rstrip": false,
182
+ "normalized": false,
183
+ "special": false
184
+ },
185
+ "151664": {
186
+ "content": "<|file_sep|>",
187
+ "single_word": false,
188
+ "lstrip": false,
189
+ "rstrip": false,
190
+ "normalized": false,
191
+ "special": false
192
+ },
193
+ "151665": {
194
+ "content": "<|PAD_TOKEN|>",
195
+ "single_word": false,
196
+ "lstrip": false,
197
+ "rstrip": false,
198
+ "normalized": false,
199
+ "special": true
200
+ }
201
+ }
202
+ }
checkpoint-200/trainer_state.json ADDED
@@ -0,0 +1,1114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 20.0,
6
+ "eval_steps": 500,
7
+ "global_step": 200,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "clip_ratio/high_max": 0.0,
14
+ "clip_ratio/high_mean": 0.0,
15
+ "clip_ratio/low_mean": 0.0,
16
+ "clip_ratio/low_min": 0.0,
17
+ "clip_ratio/region_mean": 0.0,
18
+ "completion_length": 95.85,
19
+ "completions/clipped_ratio": 0.125,
20
+ "completions/max_length": 256.0,
21
+ "completions/max_terminated_length": 195.6,
22
+ "completions/mean_length": 95.85,
23
+ "completions/mean_terminated_length": 72.8880989074707,
24
+ "completions/min_length": 36.6,
25
+ "completions/min_terminated_length": 36.6,
26
+ "epoch": 0.5,
27
+ "frac_reward_zero_std": 0.2,
28
+ "grad_norm": 0.3041034936904907,
29
+ "kl": 0.00021898818758927518,
30
+ "learning_rate": 4e-05,
31
+ "loss": 0.0329527884721756,
32
+ "num_tokens": 21444.0,
33
+ "reward": 0.7767857074737549,
34
+ "reward_std": 0.7150911569595337,
35
+ "rewards/reward_fn/mean": 0.7767857074737549,
36
+ "rewards/reward_fn/std": 0.9959433197975158,
37
+ "step": 5
38
+ },
39
+ {
40
+ "clip_ratio/high_max": 0.0,
41
+ "clip_ratio/high_mean": 0.0,
42
+ "clip_ratio/low_mean": 0.0,
43
+ "clip_ratio/low_min": 0.0,
44
+ "clip_ratio/region_mean": 0.0,
45
+ "completion_length": 95.1,
46
+ "completions/clipped_ratio": 0.0875,
47
+ "completions/max_length": 225.2,
48
+ "completions/max_terminated_length": 159.2,
49
+ "completions/mean_length": 95.1,
50
+ "completions/mean_terminated_length": 80.34466400146485,
51
+ "completions/min_length": 34.6,
52
+ "completions/min_terminated_length": 34.6,
53
+ "epoch": 1.0,
54
+ "frac_reward_zero_std": 0.4,
55
+ "grad_norm": 0.2708108425140381,
56
+ "kl": 0.006061523332027719,
57
+ "learning_rate": 9e-05,
58
+ "loss": -0.016053691506385803,
59
+ "num_tokens": 42220.0,
60
+ "reward": 0.6875000119209289,
61
+ "reward_std": 0.5175672590732574,
62
+ "rewards/reward_fn/mean": 0.6875,
63
+ "rewards/reward_fn/std": 0.8034753799438477,
64
+ "step": 10
65
+ },
66
+ {
67
+ "clip_ratio/high_max": 0.0,
68
+ "clip_ratio/high_mean": 0.0,
69
+ "clip_ratio/low_mean": 0.0,
70
+ "clip_ratio/low_min": 0.0,
71
+ "clip_ratio/region_mean": 0.0,
72
+ "completion_length": 146.275,
73
+ "completions/clipped_ratio": 0.325,
74
+ "completions/max_length": 256.0,
75
+ "completions/max_terminated_length": 225.4,
76
+ "completions/mean_length": 146.275,
77
+ "completions/mean_terminated_length": 92.8663101196289,
78
+ "completions/min_length": 32.6,
79
+ "completions/min_terminated_length": 32.6,
80
+ "epoch": 1.5,
81
+ "frac_reward_zero_std": 0.2,
82
+ "grad_norm": 0.27529817819595337,
83
+ "kl": 0.01560373855754733,
84
+ "learning_rate": 9.989068136093873e-05,
85
+ "loss": 0.010054035484790802,
86
+ "num_tokens": 69198.0,
87
+ "reward": 1.1558928728103637,
88
+ "reward_std": 0.8465379416942597,
89
+ "rewards/reward_fn/mean": 1.1558928728103637,
90
+ "rewards/reward_fn/std": 1.1404805421829223,
91
+ "step": 15
92
+ },
93
+ {
94
+ "clip_ratio/high_max": 0.0,
95
+ "clip_ratio/high_mean": 0.0,
96
+ "clip_ratio/low_mean": 0.0,
97
+ "clip_ratio/low_min": 0.0,
98
+ "clip_ratio/region_mean": 0.0,
99
+ "completion_length": 75.85,
100
+ "completions/clipped_ratio": 0.05,
101
+ "completions/max_length": 220.6,
102
+ "completions/max_terminated_length": 151.0,
103
+ "completions/mean_length": 75.85,
104
+ "completions/mean_terminated_length": 66.14857330322266,
105
+ "completions/min_length": 32.8,
106
+ "completions/min_terminated_length": 32.8,
107
+ "epoch": 2.0,
108
+ "frac_reward_zero_std": 0.35,
109
+ "grad_norm": 0.353487491607666,
110
+ "kl": 0.027986684162169696,
111
+ "learning_rate": 9.944739353007344e-05,
112
+ "loss": -0.030885905027389526,
113
+ "num_tokens": 87334.0,
114
+ "reward": 0.7932142853736878,
115
+ "reward_std": 0.4118983089923859,
116
+ "rewards/reward_fn/mean": 0.7932142853736878,
117
+ "rewards/reward_fn/std": 0.7167340397834778,
118
+ "step": 20
119
+ },
120
+ {
121
+ "clip_ratio/high_max": 0.0,
122
+ "clip_ratio/high_mean": 0.0,
123
+ "clip_ratio/low_mean": 0.0,
124
+ "clip_ratio/low_min": 0.0,
125
+ "clip_ratio/region_mean": 0.0,
126
+ "completion_length": 90.85,
127
+ "completions/clipped_ratio": 0.0625,
128
+ "completions/max_length": 248.0,
129
+ "completions/max_terminated_length": 216.4,
130
+ "completions/mean_length": 90.85,
131
+ "completions/mean_terminated_length": 79.819287109375,
132
+ "completions/min_length": 28.0,
133
+ "completions/min_terminated_length": 28.0,
134
+ "epoch": 2.5,
135
+ "frac_reward_zero_std": 0.15,
136
+ "grad_norm": 0.3318491578102112,
137
+ "kl": 0.04692814685404301,
138
+ "learning_rate": 9.86663298624003e-05,
139
+ "loss": 0.05783940553665161,
140
+ "num_tokens": 108330.0,
141
+ "reward": 1.1987500190734863,
142
+ "reward_std": 0.5907092690467834,
143
+ "rewards/reward_fn/mean": 1.1987500190734863,
144
+ "rewards/reward_fn/std": 0.9482298731803894,
145
+ "step": 25
146
+ },
147
+ {
148
+ "clip_ratio/high_max": 0.0,
149
+ "clip_ratio/high_mean": 0.0,
150
+ "clip_ratio/low_mean": 0.0,
151
+ "clip_ratio/low_min": 0.0,
152
+ "clip_ratio/region_mean": 0.0,
153
+ "completion_length": 91.775,
154
+ "completions/clipped_ratio": 0.125,
155
+ "completions/max_length": 248.2,
156
+ "completions/max_terminated_length": 212.6,
157
+ "completions/mean_length": 91.775,
158
+ "completions/mean_terminated_length": 68.68722763061524,
159
+ "completions/min_length": 25.2,
160
+ "completions/min_terminated_length": 25.2,
161
+ "epoch": 3.0,
162
+ "frac_reward_zero_std": 0.4,
163
+ "grad_norm": 0.24605490267276764,
164
+ "kl": 0.07254143096506596,
165
+ "learning_rate": 9.755282581475769e-05,
166
+ "loss": 0.03934891819953919,
167
+ "num_tokens": 129012.0,
168
+ "reward": 1.0242857217788697,
169
+ "reward_std": 0.5802445828914642,
170
+ "rewards/reward_fn/mean": 1.0242857217788697,
171
+ "rewards/reward_fn/std": 0.8706011414527893,
172
+ "step": 30
173
+ },
174
+ {
175
+ "clip_ratio/high_max": 0.0,
176
+ "clip_ratio/high_mean": 0.0,
177
+ "clip_ratio/low_mean": 0.0,
178
+ "clip_ratio/low_min": 0.0,
179
+ "clip_ratio/region_mean": 0.0,
180
+ "completion_length": 59.7875,
181
+ "completions/clipped_ratio": 0.0375,
182
+ "completions/max_length": 175.6,
183
+ "completions/max_terminated_length": 118.4,
184
+ "completions/mean_length": 59.7875,
185
+ "completions/mean_terminated_length": 52.19559555053711,
186
+ "completions/min_length": 25.4,
187
+ "completions/min_terminated_length": 25.4,
188
+ "epoch": 3.5,
189
+ "frac_reward_zero_std": 0.6,
190
+ "grad_norm": 0.2970605790615082,
191
+ "kl": 0.053099883161485194,
192
+ "learning_rate": 9.611448774886924e-05,
193
+ "loss": 0.06977529525756836,
194
+ "num_tokens": 147103.0,
195
+ "reward": 0.9473214387893677,
196
+ "reward_std": 0.3923970699310303,
197
+ "rewards/reward_fn/mean": 0.9473214268684387,
198
+ "rewards/reward_fn/std": 0.649278998374939,
199
+ "step": 35
200
+ },
201
+ {
202
+ "clip_ratio/high_max": 0.0,
203
+ "clip_ratio/high_mean": 0.0,
204
+ "clip_ratio/low_mean": 0.0,
205
+ "clip_ratio/low_min": 0.0,
206
+ "clip_ratio/region_mean": 0.0,
207
+ "completion_length": 47.85,
208
+ "completions/clipped_ratio": 0.0,
209
+ "completions/max_length": 94.8,
210
+ "completions/max_terminated_length": 94.8,
211
+ "completions/mean_length": 47.85,
212
+ "completions/mean_terminated_length": 47.85,
213
+ "completions/min_length": 31.2,
214
+ "completions/min_terminated_length": 31.2,
215
+ "epoch": 4.0,
216
+ "frac_reward_zero_std": 0.85,
217
+ "grad_norm": 0.16646049916744232,
218
+ "kl": 0.056573960930109024,
219
+ "learning_rate": 9.43611409721806e-05,
220
+ "loss": -0.0031644195318222047,
221
+ "num_tokens": 164643.0,
222
+ "reward": 0.7899999976158142,
223
+ "reward_std": 0.12681145668029786,
224
+ "rewards/reward_fn/mean": 0.7899999976158142,
225
+ "rewards/reward_fn/std": 0.5133758962154389,
226
+ "step": 40
227
+ },
228
+ {
229
+ "clip_ratio/high_max": 0.0,
230
+ "clip_ratio/high_mean": 0.0,
231
+ "clip_ratio/low_mean": 0.0,
232
+ "clip_ratio/low_min": 0.0,
233
+ "clip_ratio/region_mean": 0.0,
234
+ "completion_length": 49.2625,
235
+ "completions/clipped_ratio": 0.0,
236
+ "completions/max_length": 93.6,
237
+ "completions/max_terminated_length": 93.6,
238
+ "completions/mean_length": 49.2625,
239
+ "completions/mean_terminated_length": 49.2625,
240
+ "completions/min_length": 33.8,
241
+ "completions/min_terminated_length": 33.8,
242
+ "epoch": 4.5,
243
+ "frac_reward_zero_std": 0.6,
244
+ "grad_norm": 0.3159082531929016,
245
+ "kl": 0.0353464649990201,
246
+ "learning_rate": 9.230476262104677e-05,
247
+ "loss": -0.03417131304740906,
248
+ "num_tokens": 182568.0,
249
+ "reward": 0.7924999833106995,
250
+ "reward_std": 0.3821385264396667,
251
+ "rewards/reward_fn/mean": 0.7924999833106995,
252
+ "rewards/reward_fn/std": 0.62625293135643,
253
+ "step": 45
254
+ },
255
+ {
256
+ "clip_ratio/high_max": 0.0,
257
+ "clip_ratio/high_mean": 0.0,
258
+ "clip_ratio/low_mean": 0.0,
259
+ "clip_ratio/low_min": 0.0,
260
+ "clip_ratio/region_mean": 0.0,
261
+ "completion_length": 58.25,
262
+ "completions/clipped_ratio": 0.0125,
263
+ "completions/max_length": 180.6,
264
+ "completions/max_terminated_length": 153.2,
265
+ "completions/mean_length": 58.25,
266
+ "completions/mean_terminated_length": 55.67250061035156,
267
+ "completions/min_length": 33.2,
268
+ "completions/min_terminated_length": 33.2,
269
+ "epoch": 5.0,
270
+ "frac_reward_zero_std": 0.55,
271
+ "grad_norm": 0.27360936999320984,
272
+ "kl": 0.04483541352674365,
273
+ "learning_rate": 8.995939984474624e-05,
274
+ "loss": 0.026852786540985107,
275
+ "num_tokens": 200660.0,
276
+ "reward": 0.774821424484253,
277
+ "reward_std": 0.32772802114486693,
278
+ "rewards/reward_fn/mean": 0.774821424484253,
279
+ "rewards/reward_fn/std": 0.5432307064533234,
280
+ "step": 50
281
+ },
282
+ {
283
+ "clip_ratio/high_max": 0.0,
284
+ "clip_ratio/high_mean": 0.0,
285
+ "clip_ratio/low_mean": 0.0,
286
+ "clip_ratio/low_min": 0.0,
287
+ "clip_ratio/region_mean": 0.0,
288
+ "completion_length": 51.225,
289
+ "completions/clipped_ratio": 0.0,
290
+ "completions/max_length": 117.4,
291
+ "completions/max_terminated_length": 117.4,
292
+ "completions/mean_length": 51.225,
293
+ "completions/mean_terminated_length": 51.225,
294
+ "completions/min_length": 36.6,
295
+ "completions/min_terminated_length": 36.6,
296
+ "epoch": 5.5,
297
+ "frac_reward_zero_std": 0.6,
298
+ "grad_norm": 0.21470840275287628,
299
+ "kl": 0.04195364024490118,
300
+ "learning_rate": 8.73410738492077e-05,
301
+ "loss": 0.023307889699935913,
302
+ "num_tokens": 218442.0,
303
+ "reward": 1.0364285826683044,
304
+ "reward_std": 0.37462076991796495,
305
+ "rewards/reward_fn/mean": 1.0364285826683044,
306
+ "rewards/reward_fn/std": 0.6156843721866607,
307
+ "step": 55
308
+ },
309
+ {
310
+ "clip_ratio/high_max": 0.0,
311
+ "clip_ratio/high_mean": 0.0,
312
+ "clip_ratio/low_mean": 0.0,
313
+ "clip_ratio/low_min": 0.0,
314
+ "clip_ratio/region_mean": 0.0,
315
+ "completion_length": 53.225,
316
+ "completions/clipped_ratio": 0.0125,
317
+ "completions/max_length": 159.8,
318
+ "completions/max_terminated_length": 123.8,
319
+ "completions/mean_length": 53.225,
320
+ "completions/mean_terminated_length": 50.68583374023437,
321
+ "completions/min_length": 33.8,
322
+ "completions/min_terminated_length": 33.8,
323
+ "epoch": 6.0,
324
+ "frac_reward_zero_std": 0.45,
325
+ "grad_norm": 0.29306381940841675,
326
+ "kl": 0.05784560404717922,
327
+ "learning_rate": 8.44676704559283e-05,
328
+ "loss": 0.015561078488826752,
329
+ "num_tokens": 236112.0,
330
+ "reward": 1.1410714387893677,
331
+ "reward_std": 0.3540264666080475,
332
+ "rewards/reward_fn/mean": 1.1410714387893677,
333
+ "rewards/reward_fn/std": 0.607547128200531,
334
+ "step": 60
335
+ },
336
+ {
337
+ "clip_ratio/high_max": 0.0,
338
+ "clip_ratio/high_mean": 0.0,
339
+ "clip_ratio/low_mean": 0.0,
340
+ "clip_ratio/low_min": 0.0,
341
+ "clip_ratio/region_mean": 0.0,
342
+ "completion_length": 62.2625,
343
+ "completions/clipped_ratio": 0.05,
344
+ "completions/max_length": 153.2,
345
+ "completions/max_terminated_length": 117.4,
346
+ "completions/mean_length": 62.2625,
347
+ "completions/mean_terminated_length": 52.46428680419922,
348
+ "completions/min_length": 30.6,
349
+ "completions/min_terminated_length": 30.6,
350
+ "epoch": 6.5,
351
+ "frac_reward_zero_std": 0.65,
352
+ "grad_norm": 0.2615426480770111,
353
+ "kl": 0.09820330440998078,
354
+ "learning_rate": 8.135881792367686e-05,
355
+ "loss": 0.04314278066158295,
356
+ "num_tokens": 254033.0,
357
+ "reward": 1.2110714197158814,
358
+ "reward_std": 0.2716316431760788,
359
+ "rewards/reward_fn/mean": 1.2110714197158814,
360
+ "rewards/reward_fn/std": 0.6701113641262054,
361
+ "step": 65
362
+ },
363
+ {
364
+ "clip_ratio/high_max": 0.0,
365
+ "clip_ratio/high_mean": 0.0,
366
+ "clip_ratio/low_mean": 0.0,
367
+ "clip_ratio/low_min": 0.0,
368
+ "clip_ratio/region_mean": 0.0,
369
+ "completion_length": 82.5625,
370
+ "completions/clipped_ratio": 0.1125,
371
+ "completions/max_length": 234.4,
372
+ "completions/max_terminated_length": 147.4,
373
+ "completions/mean_length": 82.5625,
374
+ "completions/mean_terminated_length": 60.485835266113284,
375
+ "completions/min_length": 35.4,
376
+ "completions/min_terminated_length": 35.4,
377
+ "epoch": 7.0,
378
+ "frac_reward_zero_std": 0.6,
379
+ "grad_norm": 0.1326226145029068,
380
+ "kl": 0.07194480895996094,
381
+ "learning_rate": 7.803575286758364e-05,
382
+ "loss": 0.03152734041213989,
383
+ "num_tokens": 275222.0,
384
+ "reward": 1.320892834663391,
385
+ "reward_std": 0.35234126150608064,
386
+ "rewards/reward_fn/mean": 1.320892858505249,
387
+ "rewards/reward_fn/std": 0.676600456237793,
388
+ "step": 70
389
+ },
390
+ {
391
+ "clip_ratio/high_max": 0.0,
392
+ "clip_ratio/high_mean": 0.0,
393
+ "clip_ratio/low_mean": 0.0,
394
+ "clip_ratio/low_min": 0.0,
395
+ "clip_ratio/region_mean": 0.0,
396
+ "completion_length": 72.7625,
397
+ "completions/clipped_ratio": 0.0625,
398
+ "completions/max_length": 220.4,
399
+ "completions/max_terminated_length": 166.8,
400
+ "completions/mean_length": 72.7625,
401
+ "completions/mean_terminated_length": 59.95151138305664,
402
+ "completions/min_length": 30.2,
403
+ "completions/min_terminated_length": 30.2,
404
+ "epoch": 7.5,
405
+ "frac_reward_zero_std": 0.7,
406
+ "grad_norm": 0.15138906240463257,
407
+ "kl": 0.09165919721126556,
408
+ "learning_rate": 7.452117519152542e-05,
409
+ "loss": 0.0013809207826852798,
410
+ "num_tokens": 294407.0,
411
+ "reward": 1.3473214387893677,
412
+ "reward_std": 0.17192934565246104,
413
+ "rewards/reward_fn/mean": 1.3473214387893677,
414
+ "rewards/reward_fn/std": 0.6935437321662903,
415
+ "step": 75
416
+ },
417
+ {
418
+ "clip_ratio/high_max": 0.0,
419
+ "clip_ratio/high_mean": 0.0,
420
+ "clip_ratio/low_mean": 0.0,
421
+ "clip_ratio/low_min": 0.0,
422
+ "clip_ratio/region_mean": 0.0,
423
+ "completion_length": 80.9375,
424
+ "completions/clipped_ratio": 0.125,
425
+ "completions/max_length": 222.4,
426
+ "completions/max_terminated_length": 156.0,
427
+ "completions/mean_length": 80.9375,
428
+ "completions/mean_terminated_length": 55.99818344116211,
429
+ "completions/min_length": 31.6,
430
+ "completions/min_terminated_length": 31.6,
431
+ "epoch": 8.0,
432
+ "frac_reward_zero_std": 0.65,
433
+ "grad_norm": 0.19589942693710327,
434
+ "kl": 0.0683181069791317,
435
+ "learning_rate": 7.083909302476453e-05,
436
+ "loss": 0.033023273944854735,
437
+ "num_tokens": 314502.0,
438
+ "reward": 1.1685714244842529,
439
+ "reward_std": 0.27699449211359023,
440
+ "rewards/reward_fn/mean": 1.1685714244842529,
441
+ "rewards/reward_fn/std": 0.5244716942310333,
442
+ "step": 80
443
+ },
444
+ {
445
+ "clip_ratio/high_max": 0.0,
446
+ "clip_ratio/high_mean": 0.0,
447
+ "clip_ratio/low_mean": 0.0,
448
+ "clip_ratio/low_min": 0.0,
449
+ "clip_ratio/region_mean": 0.0,
450
+ "completion_length": 60.6625,
451
+ "completions/clipped_ratio": 0.05,
452
+ "completions/max_length": 207.2,
453
+ "completions/max_terminated_length": 100.0,
454
+ "completions/mean_length": 60.6625,
455
+ "completions/mean_terminated_length": 50.27571563720703,
456
+ "completions/min_length": 35.2,
457
+ "completions/min_terminated_length": 35.2,
458
+ "epoch": 8.5,
459
+ "frac_reward_zero_std": 0.6,
460
+ "grad_norm": 0.20066438615322113,
461
+ "kl": 0.0837025498971343,
462
+ "learning_rate": 6.701465872208216e-05,
463
+ "loss": 0.022454433143138885,
464
+ "num_tokens": 334007.0,
465
+ "reward": 1.3910714387893677,
466
+ "reward_std": 0.21785714849829674,
467
+ "rewards/reward_fn/mean": 1.3910714387893677,
468
+ "rewards/reward_fn/std": 0.6235540926456451,
469
+ "step": 85
470
+ },
471
+ {
472
+ "clip_ratio/high_max": 0.0,
473
+ "clip_ratio/high_mean": 0.0,
474
+ "clip_ratio/low_mean": 0.0,
475
+ "clip_ratio/low_min": 0.0,
476
+ "clip_ratio/region_mean": 0.0,
477
+ "completion_length": 54.025,
478
+ "completions/clipped_ratio": 0.025,
479
+ "completions/max_length": 143.4,
480
+ "completions/max_terminated_length": 117.2,
481
+ "completions/mean_length": 54.025,
482
+ "completions/mean_terminated_length": 49.025001525878906,
483
+ "completions/min_length": 30.2,
484
+ "completions/min_terminated_length": 30.2,
485
+ "epoch": 9.0,
486
+ "frac_reward_zero_std": 0.8,
487
+ "grad_norm": 0.18161360919475555,
488
+ "kl": 0.05805329587310552,
489
+ "learning_rate": 6.307399704769099e-05,
490
+ "loss": -0.04096371531486511,
491
+ "num_tokens": 351041.0,
492
+ "reward": 1.24375,
493
+ "reward_std": 0.2125,
494
+ "rewards/reward_fn/mean": 1.24375,
495
+ "rewards/reward_fn/std": 0.584508627653122,
496
+ "step": 90
497
+ },
498
+ {
499
+ "clip_ratio/high_max": 0.0,
500
+ "clip_ratio/high_mean": 0.0,
501
+ "clip_ratio/low_mean": 0.0,
502
+ "clip_ratio/low_min": 0.0,
503
+ "clip_ratio/region_mean": 0.0,
504
+ "completion_length": 54.65,
505
+ "completions/clipped_ratio": 0.0125,
506
+ "completions/max_length": 145.0,
507
+ "completions/max_terminated_length": 142.2,
508
+ "completions/mean_length": 54.65,
509
+ "completions/mean_terminated_length": 52.23083419799805,
510
+ "completions/min_length": 35.4,
511
+ "completions/min_terminated_length": 35.4,
512
+ "epoch": 9.5,
513
+ "frac_reward_zero_std": 0.9,
514
+ "grad_norm": 0.10424701869487762,
515
+ "kl": 0.05452606473118067,
516
+ "learning_rate": 5.90440267166055e-05,
517
+ "loss": 0.0021311525255441667,
518
+ "num_tokens": 368361.0,
519
+ "reward": 1.1996428489685058,
520
+ "reward_std": 0.058449314022436735,
521
+ "rewards/reward_fn/mean": 1.1996428489685058,
522
+ "rewards/reward_fn/std": 0.4604207634925842,
523
+ "step": 95
524
+ },
525
+ {
526
+ "clip_ratio/high_max": 0.0,
527
+ "clip_ratio/high_mean": 0.0,
528
+ "clip_ratio/low_mean": 0.0,
529
+ "clip_ratio/low_min": 0.0,
530
+ "clip_ratio/region_mean": 0.0,
531
+ "completion_length": 68.3875,
532
+ "completions/clipped_ratio": 0.0625,
533
+ "completions/max_length": 254.4,
534
+ "completions/max_terminated_length": 140.8,
535
+ "completions/mean_length": 68.3875,
536
+ "completions/mean_terminated_length": 55.740359497070315,
537
+ "completions/min_length": 36.2,
538
+ "completions/min_terminated_length": 36.2,
539
+ "epoch": 10.0,
540
+ "frac_reward_zero_std": 0.75,
541
+ "grad_norm": 0.13536690175533295,
542
+ "kl": 0.0854075826704502,
543
+ "learning_rate": 5.495227651252315e-05,
544
+ "loss": -0.025432443618774413,
545
+ "num_tokens": 388400.0,
546
+ "reward": 1.3283928632736206,
547
+ "reward_std": 0.2009493112564087,
548
+ "rewards/reward_fn/mean": 1.3283928632736206,
549
+ "rewards/reward_fn/std": 0.5586225628852844,
550
+ "step": 100
551
+ },
552
+ {
553
+ "clip_ratio/high_max": 0.0,
554
+ "clip_ratio/high_mean": 0.0,
555
+ "clip_ratio/low_mean": 0.0,
556
+ "clip_ratio/low_min": 0.0,
557
+ "clip_ratio/region_mean": 0.0,
558
+ "completion_length": 77.6,
559
+ "completions/clipped_ratio": 0.125,
560
+ "completions/max_length": 256.0,
561
+ "completions/max_terminated_length": 115.6,
562
+ "completions/mean_length": 77.6,
563
+ "completions/mean_terminated_length": 52.14154052734375,
564
+ "completions/min_length": 30.4,
565
+ "completions/min_terminated_length": 30.4,
566
+ "epoch": 10.5,
567
+ "frac_reward_zero_std": 0.9,
568
+ "grad_norm": 0.0006729933083988726,
569
+ "kl": 0.07917941156774759,
570
+ "learning_rate": 5.0826697238317935e-05,
571
+ "loss": -0.0014228260144591332,
572
+ "num_tokens": 408808.0,
573
+ "reward": 1.4,
574
+ "reward_std": 0.1,
575
+ "rewards/reward_fn/mean": 1.4,
576
+ "rewards/reward_fn/std": 0.5159838497638702,
577
+ "step": 105
578
+ },
579
+ {
580
+ "clip_ratio/high_max": 0.0,
581
+ "clip_ratio/high_mean": 0.0,
582
+ "clip_ratio/low_mean": 0.0,
583
+ "clip_ratio/low_min": 0.0,
584
+ "clip_ratio/region_mean": 0.0,
585
+ "completion_length": 71.6375,
586
+ "completions/clipped_ratio": 0.1,
587
+ "completions/max_length": 223.8,
588
+ "completions/max_terminated_length": 125.4,
589
+ "completions/mean_length": 71.6375,
590
+ "completions/mean_terminated_length": 50.991825103759766,
591
+ "completions/min_length": 34.6,
592
+ "completions/min_terminated_length": 34.6,
593
+ "epoch": 11.0,
594
+ "frac_reward_zero_std": 0.85,
595
+ "grad_norm": 0.1310124397277832,
596
+ "kl": 0.0553528118878603,
597
+ "learning_rate": 4.669547078371504e-05,
598
+ "loss": 0.027294650673866272,
599
+ "num_tokens": 427787.0,
600
+ "reward": 1.15,
601
+ "reward_std": 0.15773502588272095,
602
+ "rewards/reward_fn/mean": 1.15,
603
+ "rewards/reward_fn/std": 0.482165002822876,
604
+ "step": 110
605
+ },
606
+ {
607
+ "clip_ratio/high_max": 0.0,
608
+ "clip_ratio/high_mean": 0.0,
609
+ "clip_ratio/low_mean": 0.0,
610
+ "clip_ratio/low_min": 0.0,
611
+ "clip_ratio/region_mean": 0.0,
612
+ "completion_length": 75.975,
613
+ "completions/clipped_ratio": 0.1375,
614
+ "completions/max_length": 214.8,
615
+ "completions/max_terminated_length": 76.2,
616
+ "completions/mean_length": 75.975,
617
+ "completions/mean_terminated_length": 47.73154296875,
618
+ "completions/min_length": 36.2,
619
+ "completions/min_terminated_length": 36.2,
620
+ "epoch": 11.5,
621
+ "frac_reward_zero_std": 0.9,
622
+ "grad_norm": 0.001892779255285859,
623
+ "kl": 0.056033550202846526,
624
+ "learning_rate": 4.2586817614407895e-05,
625
+ "loss": -0.03689469993114471,
626
+ "num_tokens": 447185.0,
627
+ "reward": 1.2867857217788696,
628
+ "reward_std": 0.026428571343421935,
629
+ "rewards/reward_fn/mean": 1.2867857217788696,
630
+ "rewards/reward_fn/std": 0.3760030448436737,
631
+ "step": 115
632
+ },
633
+ {
634
+ "clip_ratio/high_max": 0.0,
635
+ "clip_ratio/high_mean": 0.0,
636
+ "clip_ratio/low_mean": 0.0,
637
+ "clip_ratio/low_min": 0.0,
638
+ "clip_ratio/region_mean": 0.0,
639
+ "completion_length": 76.725,
640
+ "completions/clipped_ratio": 0.1375,
641
+ "completions/max_length": 218.2,
642
+ "completions/max_terminated_length": 82.8,
643
+ "completions/mean_length": 76.725,
644
+ "completions/mean_terminated_length": 48.34801483154297,
645
+ "completions/min_length": 30.4,
646
+ "completions/min_terminated_length": 30.4,
647
+ "epoch": 12.0,
648
+ "frac_reward_zero_std": 0.95,
649
+ "grad_norm": 0.00067809724714607,
650
+ "kl": 0.06624810677021742,
651
+ "learning_rate": 3.852880399766243e-05,
652
+ "loss": 0.005734995752573013,
653
+ "num_tokens": 466979.0,
654
+ "reward": 1.325,
655
+ "reward_std": 0.05,
656
+ "rewards/reward_fn/mean": 1.325,
657
+ "rewards/reward_fn/std": 0.4854445576667786,
658
+ "step": 120
659
+ },
660
+ {
661
+ "clip_ratio/high_max": 0.0,
662
+ "clip_ratio/high_mean": 0.0,
663
+ "clip_ratio/low_mean": 0.0,
664
+ "clip_ratio/low_min": 0.0,
665
+ "clip_ratio/region_mean": 0.0,
666
+ "completion_length": 70.3625,
667
+ "completions/clipped_ratio": 0.0875,
668
+ "completions/max_length": 183.8,
669
+ "completions/max_terminated_length": 99.4,
670
+ "completions/mean_length": 70.3625,
671
+ "completions/mean_terminated_length": 52.45879211425781,
672
+ "completions/min_length": 38.0,
673
+ "completions/min_terminated_length": 38.0,
674
+ "epoch": 12.5,
675
+ "frac_reward_zero_std": 0.9,
676
+ "grad_norm": 0.0024091172963380814,
677
+ "kl": 0.0920485682785511,
678
+ "learning_rate": 3.4549150281252636e-05,
679
+ "loss": -0.03069324791431427,
680
+ "num_tokens": 486616.0,
681
+ "reward": 1.3125,
682
+ "reward_std": 0.075,
683
+ "rewards/reward_fn/mean": 1.3125,
684
+ "rewards/reward_fn/std": 0.46982967257499697,
685
+ "step": 125
686
+ },
687
+ {
688
+ "clip_ratio/high_max": 0.0,
689
+ "clip_ratio/high_mean": 0.0,
690
+ "clip_ratio/low_mean": 0.0,
691
+ "clip_ratio/low_min": 0.0,
692
+ "clip_ratio/region_mean": 0.0,
693
+ "completion_length": 71.8625,
694
+ "completions/clipped_ratio": 0.1,
695
+ "completions/max_length": 214.8,
696
+ "completions/max_terminated_length": 120.2,
697
+ "completions/mean_length": 71.8625,
698
+ "completions/mean_terminated_length": 51.76010284423828,
699
+ "completions/min_length": 33.4,
700
+ "completions/min_terminated_length": 33.4,
701
+ "epoch": 13.0,
702
+ "frac_reward_zero_std": 1.0,
703
+ "grad_norm": 0.0004440450284164399,
704
+ "kl": 0.053088791808113454,
705
+ "learning_rate": 3.0675041535377405e-05,
706
+ "loss": 5.229191156104207e-05,
707
+ "num_tokens": 505905.0,
708
+ "reward": 1.35,
709
+ "reward_std": 0.0,
710
+ "rewards/reward_fn/mean": 1.35,
711
+ "rewards/reward_fn/std": 0.39928138852119444,
712
+ "step": 130
713
+ },
714
+ {
715
+ "clip_ratio/high_max": 0.0,
716
+ "clip_ratio/high_mean": 0.0,
717
+ "clip_ratio/low_mean": 0.0,
718
+ "clip_ratio/low_min": 0.0,
719
+ "clip_ratio/region_mean": 0.0,
720
+ "completion_length": 84.5,
721
+ "completions/clipped_ratio": 0.175,
722
+ "completions/max_length": 182.8,
723
+ "completions/max_terminated_length": 76.6,
724
+ "completions/mean_length": 84.5,
725
+ "completions/mean_terminated_length": 48.71715393066406,
726
+ "completions/min_length": 38.0,
727
+ "completions/min_terminated_length": 38.0,
728
+ "epoch": 13.5,
729
+ "frac_reward_zero_std": 0.95,
730
+ "grad_norm": 0.0007964075193740427,
731
+ "kl": 0.07870542965829372,
732
+ "learning_rate": 2.693294185106562e-05,
733
+ "loss": 0.0014319854788482189,
734
+ "num_tokens": 526773.0,
735
+ "reward": 1.3875,
736
+ "reward_std": 0.025,
737
+ "rewards/reward_fn/mean": 1.3875,
738
+ "rewards/reward_fn/std": 0.38874412178993223,
739
+ "step": 135
740
+ },
741
+ {
742
+ "clip_ratio/high_max": 0.0,
743
+ "clip_ratio/high_mean": 0.0,
744
+ "clip_ratio/low_mean": 0.0,
745
+ "clip_ratio/low_min": 0.0,
746
+ "clip_ratio/region_mean": 0.0,
747
+ "completion_length": 70.6375,
748
+ "completions/clipped_ratio": 0.1,
749
+ "completions/max_length": 219.2,
750
+ "completions/max_terminated_length": 104.2,
751
+ "completions/mean_length": 70.6375,
752
+ "completions/mean_terminated_length": 50.06154937744141,
753
+ "completions/min_length": 37.0,
754
+ "completions/min_terminated_length": 37.0,
755
+ "epoch": 14.0,
756
+ "frac_reward_zero_std": 1.0,
757
+ "grad_norm": 0.00017480542010162026,
758
+ "kl": 0.05223398320376873,
759
+ "learning_rate": 2.3348413563600325e-05,
760
+ "loss": 5.521520506590605e-05,
761
+ "num_tokens": 545864.0,
762
+ "reward": 1.3,
763
+ "reward_std": 0.0,
764
+ "rewards/reward_fn/mean": 1.3,
765
+ "rewards/reward_fn/std": 0.38544455766677854,
766
+ "step": 140
767
+ },
768
+ {
769
+ "clip_ratio/high_max": 0.0,
770
+ "clip_ratio/high_mean": 0.0,
771
+ "clip_ratio/low_mean": 0.0,
772
+ "clip_ratio/low_min": 0.0,
773
+ "clip_ratio/region_mean": 0.0,
774
+ "completion_length": 75.7,
775
+ "completions/clipped_ratio": 0.1125,
776
+ "completions/max_length": 220.0,
777
+ "completions/max_terminated_length": 134.4,
778
+ "completions/mean_length": 75.7,
779
+ "completions/mean_terminated_length": 52.85439682006836,
780
+ "completions/min_length": 36.6,
781
+ "completions/min_terminated_length": 36.6,
782
+ "epoch": 14.5,
783
+ "frac_reward_zero_std": 0.95,
784
+ "grad_norm": 0.0014995499514043331,
785
+ "kl": 0.061327320709824565,
786
+ "learning_rate": 1.9945942635848748e-05,
787
+ "loss": -0.039644479751586914,
788
+ "num_tokens": 565456.0,
789
+ "reward": 1.275,
790
+ "reward_std": 0.05,
791
+ "rewards/reward_fn/mean": 1.275,
792
+ "rewards/reward_fn/std": 0.40599284172058103,
793
+ "step": 145
794
+ },
795
+ {
796
+ "clip_ratio/high_max": 0.0,
797
+ "clip_ratio/high_mean": 0.0,
798
+ "clip_ratio/low_mean": 0.0,
799
+ "clip_ratio/low_min": 0.0,
800
+ "clip_ratio/region_mean": 0.0,
801
+ "completion_length": 88.6875,
802
+ "completions/clipped_ratio": 0.1875,
803
+ "completions/max_length": 214.6,
804
+ "completions/max_terminated_length": 98.8,
805
+ "completions/mean_length": 88.6875,
806
+ "completions/mean_terminated_length": 51.92777633666992,
807
+ "completions/min_length": 35.0,
808
+ "completions/min_terminated_length": 35.0,
809
+ "epoch": 15.0,
810
+ "frac_reward_zero_std": 1.0,
811
+ "grad_norm": 0.001117849606089294,
812
+ "kl": 0.05267696371302009,
813
+ "learning_rate": 1.6748771394307585e-05,
814
+ "loss": 6.003726157359779e-05,
815
+ "num_tokens": 586207.0,
816
+ "reward": 1.35,
817
+ "reward_std": 0.0,
818
+ "rewards/reward_fn/mean": 1.35,
819
+ "rewards/reward_fn/std": 0.37160772681236265,
820
+ "step": 150
821
+ },
822
+ {
823
+ "clip_ratio/high_max": 0.0,
824
+ "clip_ratio/high_mean": 0.0,
825
+ "clip_ratio/low_mean": 0.0,
826
+ "clip_ratio/low_min": 0.0,
827
+ "clip_ratio/region_mean": 0.0,
828
+ "completion_length": 78.575,
829
+ "completions/clipped_ratio": 0.15,
830
+ "completions/max_length": 219.0,
831
+ "completions/max_terminated_length": 82.0,
832
+ "completions/mean_length": 78.575,
833
+ "completions/mean_terminated_length": 47.18986129760742,
834
+ "completions/min_length": 33.4,
835
+ "completions/min_terminated_length": 33.4,
836
+ "epoch": 15.5,
837
+ "frac_reward_zero_std": 0.95,
838
+ "grad_norm": 0.00045852441689930856,
839
+ "kl": 0.12903956770896913,
840
+ "learning_rate": 1.3778739760445552e-05,
841
+ "loss": 0.010452968627214431,
842
+ "num_tokens": 606445.0,
843
+ "reward": 1.325,
844
+ "reward_std": 0.05,
845
+ "rewards/reward_fn/mean": 1.325,
846
+ "rewards/reward_fn/std": 0.4198296725749969,
847
+ "step": 155
848
+ },
849
+ {
850
+ "clip_ratio/high_max": 0.0,
851
+ "clip_ratio/high_mean": 0.0,
852
+ "clip_ratio/low_mean": 0.0,
853
+ "clip_ratio/low_min": 0.0,
854
+ "clip_ratio/region_mean": 0.0,
855
+ "completion_length": 67.5625,
856
+ "completions/clipped_ratio": 0.0875,
857
+ "completions/max_length": 181.4,
858
+ "completions/max_terminated_length": 99.4,
859
+ "completions/mean_length": 67.5625,
860
+ "completions/mean_terminated_length": 49.69725341796875,
861
+ "completions/min_length": 33.4,
862
+ "completions/min_terminated_length": 33.4,
863
+ "epoch": 16.0,
864
+ "frac_reward_zero_std": 0.9,
865
+ "grad_norm": 0.0011993228690698743,
866
+ "kl": 0.06518192682415247,
867
+ "learning_rate": 1.1056136061894384e-05,
868
+ "loss": -0.039871737360954285,
869
+ "num_tokens": 624578.0,
870
+ "reward": 1.20625,
871
+ "reward_std": 0.0875,
872
+ "rewards/reward_fn/mean": 1.20625,
873
+ "rewards/reward_fn/std": 0.33549708127975464,
874
+ "step": 160
875
+ },
876
+ {
877
+ "clip_ratio/high_max": 0.0,
878
+ "clip_ratio/high_mean": 0.0,
879
+ "clip_ratio/low_mean": 0.0,
880
+ "clip_ratio/low_min": 0.0,
881
+ "clip_ratio/region_mean": 0.0,
882
+ "completion_length": 94.225,
883
+ "completions/clipped_ratio": 0.2125,
884
+ "completions/max_length": 256.0,
885
+ "completions/max_terminated_length": 82.0,
886
+ "completions/mean_length": 94.225,
887
+ "completions/mean_terminated_length": 50.561650085449216,
888
+ "completions/min_length": 37.0,
889
+ "completions/min_terminated_length": 37.0,
890
+ "epoch": 16.5,
891
+ "frac_reward_zero_std": 0.95,
892
+ "grad_norm": 0.0002106676547555253,
893
+ "kl": 0.047630924731492996,
894
+ "learning_rate": 8.599558442598998e-06,
895
+ "loss": -0.024224117398262024,
896
+ "num_tokens": 646012.0,
897
+ "reward": 1.325,
898
+ "reward_std": 0.05,
899
+ "rewards/reward_fn/mean": 1.325,
900
+ "rewards/reward_fn/std": 0.49543556571006775,
901
+ "step": 165
902
+ },
903
+ {
904
+ "clip_ratio/high_max": 0.0,
905
+ "clip_ratio/high_mean": 0.0,
906
+ "clip_ratio/low_mean": 0.0,
907
+ "clip_ratio/low_min": 0.0,
908
+ "clip_ratio/region_mean": 0.0,
909
+ "completion_length": 78.7625,
910
+ "completions/clipped_ratio": 0.15,
911
+ "completions/max_length": 214.8,
912
+ "completions/max_terminated_length": 95.2,
913
+ "completions/mean_length": 78.7625,
914
+ "completions/mean_terminated_length": 47.7126968383789,
915
+ "completions/min_length": 35.2,
916
+ "completions/min_terminated_length": 35.2,
917
+ "epoch": 17.0,
918
+ "frac_reward_zero_std": 0.9,
919
+ "grad_norm": 0.28769755363464355,
920
+ "kl": 0.06921696178615093,
921
+ "learning_rate": 6.425787818636131e-06,
922
+ "loss": 0.004652551189064979,
923
+ "num_tokens": 665597.0,
924
+ "reward": 1.275,
925
+ "reward_std": 0.05,
926
+ "rewards/reward_fn/mean": 1.275,
927
+ "rewards/reward_fn/std": 0.4579955816268921,
928
+ "step": 170
929
+ },
930
+ {
931
+ "clip_ratio/high_max": 0.0,
932
+ "clip_ratio/high_mean": 0.0,
933
+ "clip_ratio/low_mean": 0.0,
934
+ "clip_ratio/low_min": 0.0,
935
+ "clip_ratio/region_mean": 0.0,
936
+ "completion_length": 102.5,
937
+ "completions/clipped_ratio": 0.25,
938
+ "completions/max_length": 256.0,
939
+ "completions/max_terminated_length": 97.0,
940
+ "completions/mean_length": 102.5,
941
+ "completions/mean_terminated_length": 51.227620697021486,
942
+ "completions/min_length": 38.6,
943
+ "completions/min_terminated_length": 38.6,
944
+ "epoch": 17.5,
945
+ "frac_reward_zero_std": 0.95,
946
+ "grad_norm": 0.0012580830371007323,
947
+ "kl": 0.05462887082248926,
948
+ "learning_rate": 4.549673247541875e-06,
949
+ "loss": -0.015973620116710663,
950
+ "num_tokens": 688689.0,
951
+ "reward": 1.425,
952
+ "reward_std": 0.05,
953
+ "rewards/reward_fn/mean": 1.425,
954
+ "rewards/reward_fn/std": 0.5231092274188995,
955
+ "step": 175
956
+ },
957
+ {
958
+ "clip_ratio/high_max": 0.0,
959
+ "clip_ratio/high_mean": 0.0,
960
+ "clip_ratio/low_mean": 0.0,
961
+ "clip_ratio/low_min": 0.0,
962
+ "clip_ratio/region_mean": 0.0,
963
+ "completion_length": 69.5625,
964
+ "completions/clipped_ratio": 0.1,
965
+ "completions/max_length": 219.2,
966
+ "completions/max_terminated_length": 94.4,
967
+ "completions/mean_length": 69.5625,
968
+ "completions/mean_terminated_length": 49.011925506591794,
969
+ "completions/min_length": 35.6,
970
+ "completions/min_terminated_length": 35.6,
971
+ "epoch": 18.0,
972
+ "frac_reward_zero_std": 1.0,
973
+ "grad_norm": 0.0015256877522915602,
974
+ "kl": 0.06085181171074509,
975
+ "learning_rate": 2.9840304941919415e-06,
976
+ "loss": 5.7958898833021524e-05,
977
+ "num_tokens": 706526.0,
978
+ "reward": 1.2,
979
+ "reward_std": 0.0,
980
+ "rewards/reward_fn/mean": 1.2,
981
+ "rewards/reward_fn/std": 0.35777089595794676,
982
+ "step": 180
983
+ },
984
+ {
985
+ "clip_ratio/high_max": 0.0,
986
+ "clip_ratio/high_mean": 0.0,
987
+ "clip_ratio/low_mean": 0.0,
988
+ "clip_ratio/low_min": 0.0,
989
+ "clip_ratio/region_mean": 0.0,
990
+ "completion_length": 83.6,
991
+ "completions/clipped_ratio": 0.1625,
992
+ "completions/max_length": 235.0,
993
+ "completions/max_terminated_length": 125.4,
994
+ "completions/mean_length": 83.6,
995
+ "completions/mean_terminated_length": 50.05384750366211,
996
+ "completions/min_length": 31.0,
997
+ "completions/min_terminated_length": 31.0,
998
+ "epoch": 18.5,
999
+ "frac_reward_zero_std": 0.95,
1000
+ "grad_norm": 0.09397394210100174,
1001
+ "kl": 0.07756874859333038,
1002
+ "learning_rate": 1.7395544861325718e-06,
1003
+ "loss": -0.0027227483689785,
1004
+ "num_tokens": 726858.0,
1005
+ "reward": 1.325,
1006
+ "reward_std": 0.05,
1007
+ "rewards/reward_fn/mean": 1.325,
1008
+ "rewards/reward_fn/std": 0.39215601086616514,
1009
+ "step": 185
1010
+ },
1011
+ {
1012
+ "clip_ratio/high_max": 0.0,
1013
+ "clip_ratio/high_mean": 0.0,
1014
+ "clip_ratio/low_mean": 0.0,
1015
+ "clip_ratio/low_min": 0.0,
1016
+ "clip_ratio/region_mean": 0.0,
1017
+ "completion_length": 101.4,
1018
+ "completions/clipped_ratio": 0.25,
1019
+ "completions/max_length": 214.8,
1020
+ "completions/max_terminated_length": 105.8,
1021
+ "completions/mean_length": 101.4,
1022
+ "completions/mean_terminated_length": 52.382843017578125,
1023
+ "completions/min_length": 36.6,
1024
+ "completions/min_terminated_length": 36.6,
1025
+ "epoch": 19.0,
1026
+ "frac_reward_zero_std": 0.85,
1027
+ "grad_norm": 0.0934695154428482,
1028
+ "kl": 0.06799561325460672,
1029
+ "learning_rate": 8.247462563808817e-07,
1030
+ "loss": -0.010550712794065475,
1031
+ "num_tokens": 748778.0,
1032
+ "reward": 1.304107141494751,
1033
+ "reward_std": 0.0917857151478529,
1034
+ "rewards/reward_fn/mean": 1.304107141494751,
1035
+ "rewards/reward_fn/std": 0.4396756589412689,
1036
+ "step": 190
1037
+ },
1038
+ {
1039
+ "clip_ratio/high_max": 0.0,
1040
+ "clip_ratio/high_mean": 0.0,
1041
+ "clip_ratio/low_mean": 0.0,
1042
+ "clip_ratio/low_min": 0.0,
1043
+ "clip_ratio/region_mean": 0.0,
1044
+ "completion_length": 78.525,
1045
+ "completions/clipped_ratio": 0.1,
1046
+ "completions/max_length": 181.6,
1047
+ "completions/max_terminated_length": 155.0,
1048
+ "completions/mean_length": 78.525,
1049
+ "completions/mean_terminated_length": 59.155357360839844,
1050
+ "completions/min_length": 33.8,
1051
+ "completions/min_terminated_length": 33.8,
1052
+ "epoch": 19.5,
1053
+ "frac_reward_zero_std": 0.95,
1054
+ "grad_norm": 0.0012224174570292234,
1055
+ "kl": 0.07060394734144211,
1056
+ "learning_rate": 2.458548727494292e-07,
1057
+ "loss": -0.0030293500050902365,
1058
+ "num_tokens": 767928.0,
1059
+ "reward": 1.2375,
1060
+ "reward_std": 0.025,
1061
+ "rewards/reward_fn/mean": 1.2375,
1062
+ "rewards/reward_fn/std": 0.34600183367729187,
1063
+ "step": 195
1064
+ },
1065
+ {
1066
+ "clip_ratio/high_max": 0.0,
1067
+ "clip_ratio/high_mean": 0.0,
1068
+ "clip_ratio/low_mean": 0.0,
1069
+ "clip_ratio/low_min": 0.0,
1070
+ "clip_ratio/region_mean": 0.0,
1071
+ "completion_length": 88.025,
1072
+ "completions/clipped_ratio": 0.1875,
1073
+ "completions/max_length": 256.0,
1074
+ "completions/max_terminated_length": 98.6,
1075
+ "completions/mean_length": 88.025,
1076
+ "completions/mean_terminated_length": 49.32130355834961,
1077
+ "completions/min_length": 32.4,
1078
+ "completions/min_terminated_length": 32.4,
1079
+ "epoch": 20.0,
1080
+ "frac_reward_zero_std": 0.95,
1081
+ "grad_norm": 0.0738036260008812,
1082
+ "kl": 0.055156406760215757,
1083
+ "learning_rate": 6.834750376549792e-09,
1084
+ "loss": -0.026311689615249635,
1085
+ "num_tokens": 788646.0,
1086
+ "reward": 1.325,
1087
+ "reward_std": 0.05,
1088
+ "rewards/reward_fn/mean": 1.325,
1089
+ "rewards/reward_fn/std": 0.4198296725749969,
1090
+ "step": 200
1091
+ }
1092
+ ],
1093
+ "logging_steps": 5,
1094
+ "max_steps": 200,
1095
+ "num_input_tokens_seen": 788646,
1096
+ "num_train_epochs": 20,
1097
+ "save_steps": 50,
1098
+ "stateful_callbacks": {
1099
+ "TrainerControl": {
1100
+ "args": {
1101
+ "should_epoch_stop": false,
1102
+ "should_evaluate": false,
1103
+ "should_log": false,
1104
+ "should_save": true,
1105
+ "should_training_stop": true
1106
+ },
1107
+ "attributes": {}
1108
+ }
1109
+ },
1110
+ "total_flos": 0.0,
1111
+ "train_batch_size": 8,
1112
+ "trial_name": null,
1113
+ "trial_params": null
1114
+ }
checkpoint-200/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61516d246785bb4adf93d26e6fe8ffbb65a924b8a03f13b01e1d00b7066140af
3
+ size 6673
checkpoint-50/README.md ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: unsloth/Qwen2.5-Coder-0.5B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:unsloth/Qwen2.5-Coder-0.5B-Instruct
7
+ - grpo
8
+ - lora
9
+ - transformers
10
+ - trl
11
+ - unsloth
12
+ ---
13
+
14
+ # Model Card for Model ID
15
+
16
+ <!-- Provide a quick summary of what the model is/does. -->
17
+
18
+
19
+
20
+ ## Model Details
21
+
22
+ ### Model Description
23
+
24
+ <!-- Provide a longer summary of what this model is. -->
25
+
26
+
27
+
28
+ - **Developed by:** [More Information Needed]
29
+ - **Funded by [optional]:** [More Information Needed]
30
+ - **Shared by [optional]:** [More Information Needed]
31
+ - **Model type:** [More Information Needed]
32
+ - **Language(s) (NLP):** [More Information Needed]
33
+ - **License:** [More Information Needed]
34
+ - **Finetuned from model [optional]:** [More Information Needed]
35
+
36
+ ### Model Sources [optional]
37
+
38
+ <!-- Provide the basic links for the model. -->
39
+
40
+ - **Repository:** [More Information Needed]
41
+ - **Paper [optional]:** [More Information Needed]
42
+ - **Demo [optional]:** [More Information Needed]
43
+
44
+ ## Uses
45
+
46
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
47
+
48
+ ### Direct Use
49
+
50
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
51
+
52
+ [More Information Needed]
53
+
54
+ ### Downstream Use [optional]
55
+
56
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
57
+
58
+ [More Information Needed]
59
+
60
+ ### Out-of-Scope Use
61
+
62
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
63
+
64
+ [More Information Needed]
65
+
66
+ ## Bias, Risks, and Limitations
67
+
68
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
69
+
70
+ [More Information Needed]
71
+
72
+ ### Recommendations
73
+
74
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
75
+
76
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
77
+
78
+ ## How to Get Started with the Model
79
+
80
+ Use the code below to get started with the model.
81
+
82
+ [More Information Needed]
83
+
84
+ ## Training Details
85
+
86
+ ### Training Data
87
+
88
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
89
+
90
+ [More Information Needed]
91
+
92
+ ### Training Procedure
93
+
94
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
95
+
96
+ #### Preprocessing [optional]
97
+
98
+ [More Information Needed]
99
+
100
+
101
+ #### Training Hyperparameters
102
+
103
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
104
+
105
+ #### Speeds, Sizes, Times [optional]
106
+
107
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
108
+
109
+ [More Information Needed]
110
+
111
+ ## Evaluation
112
+
113
+ <!-- This section describes the evaluation protocols and provides the results. -->
114
+
115
+ ### Testing Data, Factors & Metrics
116
+
117
+ #### Testing Data
118
+
119
+ <!-- This should link to a Dataset Card if possible. -->
120
+
121
+ [More Information Needed]
122
+
123
+ #### Factors
124
+
125
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
126
+
127
+ [More Information Needed]
128
+
129
+ #### Metrics
130
+
131
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
132
+
133
+ [More Information Needed]
134
+
135
+ ### Results
136
+
137
+ [More Information Needed]
138
+
139
+ #### Summary
140
+
141
+
142
+
143
+ ## Model Examination [optional]
144
+
145
+ <!-- Relevant interpretability work for the model goes here -->
146
+
147
+ [More Information Needed]
148
+
149
+ ## Environmental Impact
150
+
151
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
152
+
153
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
154
+
155
+ - **Hardware Type:** [More Information Needed]
156
+ - **Hours used:** [More Information Needed]
157
+ - **Cloud Provider:** [More Information Needed]
158
+ - **Compute Region:** [More Information Needed]
159
+ - **Carbon Emitted:** [More Information Needed]
160
+
161
+ ## Technical Specifications [optional]
162
+
163
+ ### Model Architecture and Objective
164
+
165
+ [More Information Needed]
166
+
167
+ ### Compute Infrastructure
168
+
169
+ [More Information Needed]
170
+
171
+ #### Hardware
172
+
173
+ [More Information Needed]
174
+
175
+ #### Software
176
+
177
+ [More Information Needed]
178
+
179
+ ## Citation [optional]
180
+
181
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
182
+
183
+ **BibTeX:**
184
+
185
+ [More Information Needed]
186
+
187
+ **APA:**
188
+
189
+ [More Information Needed]
190
+
191
+ ## Glossary [optional]
192
+
193
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
194
+
195
+ [More Information Needed]
196
+
197
+ ## More Information [optional]
198
+
199
+ [More Information Needed]
200
+
201
+ ## Model Card Authors [optional]
202
+
203
+ [More Information Needed]
204
+
205
+ ## Model Card Contact
206
+
207
+ [More Information Needed]
208
+ ### Framework versions
209
+
210
+ - PEFT 0.18.1
checkpoint-50/adapter_config.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": {
6
+ "base_model_class": "Qwen2ForCausalLM",
7
+ "parent_library": "transformers.models.qwen2.modeling_qwen2",
8
+ "unsloth_fixed": true
9
+ },
10
+ "base_model_name_or_path": "unsloth/Qwen2.5-Coder-0.5B-Instruct",
11
+ "bias": "none",
12
+ "corda_config": null,
13
+ "ensure_weight_tying": false,
14
+ "eva_config": null,
15
+ "exclude_modules": null,
16
+ "fan_in_fan_out": false,
17
+ "inference_mode": true,
18
+ "init_lora_weights": true,
19
+ "layer_replication": null,
20
+ "layers_pattern": null,
21
+ "layers_to_transform": null,
22
+ "loftq_config": {},
23
+ "lora_alpha": 32,
24
+ "lora_bias": false,
25
+ "lora_dropout": 0,
26
+ "megatron_config": null,
27
+ "megatron_core": "megatron.core",
28
+ "modules_to_save": null,
29
+ "peft_type": "LORA",
30
+ "peft_version": "0.18.1",
31
+ "qalora_group_size": 16,
32
+ "r": 32,
33
+ "rank_pattern": {},
34
+ "revision": null,
35
+ "target_modules": [
36
+ "v_proj",
37
+ "k_proj",
38
+ "o_proj",
39
+ "up_proj",
40
+ "gate_proj",
41
+ "q_proj",
42
+ "down_proj"
43
+ ],
44
+ "target_parameters": null,
45
+ "task_type": "CAUSAL_LM",
46
+ "trainable_token_indices": null,
47
+ "use_dora": false,
48
+ "use_qalora": false,
49
+ "use_rslora": false
50
+ }
checkpoint-50/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fafa669aaab74afdd4e3f1051bdccce2b9bb721f9b3f98d1cc1c2e321d53cb25
3
+ size 70430032
checkpoint-50/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoint-50/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d97cb0ee5e06ebbb51d7802c576a32012f68786258a8621d55d5c125f8ce2b8b
3
+ size 36139685
checkpoint-50/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f2001ad8fc0f8a681cb2e6e4ea2da8c3ceaffed76150ddb3012314c6f1d9599
3
+ size 14645
checkpoint-50/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5cd0e9d505fbc3f97feb166d29026132bdf14eb3e5c7ff77beebc303ee666f96
3
+ size 1383
checkpoint-50/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6ab4a60dabf5285d62294f6dabdfb5eda3b8675d8c862f026b8ef4fa5a2952a
3
+ size 1465
checkpoint-50/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af5891a15588546db1ac7f2baf8fa94835a51a85c032c39793a55bb048b47446
3
+ size 11422523
checkpoint-50/tokenizer_config.json ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [],
9
+ "is_local": false,
10
+ "model_max_length": 32768,
11
+ "pad_token": "<|PAD_TOKEN|>",
12
+ "padding_side": "right",
13
+ "split_special_tokens": false,
14
+ "tokenizer_class": "Qwen2Tokenizer",
15
+ "unk_token": null,
16
+ "added_tokens_decoder": {
17
+ "151643": {
18
+ "content": "<|endoftext|>",
19
+ "single_word": false,
20
+ "lstrip": false,
21
+ "rstrip": false,
22
+ "normalized": false,
23
+ "special": true
24
+ },
25
+ "151644": {
26
+ "content": "<|im_start|>",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ "151645": {
34
+ "content": "<|im_end|>",
35
+ "single_word": false,
36
+ "lstrip": false,
37
+ "rstrip": false,
38
+ "normalized": false,
39
+ "special": true
40
+ },
41
+ "151646": {
42
+ "content": "<|object_ref_start|>",
43
+ "single_word": false,
44
+ "lstrip": false,
45
+ "rstrip": false,
46
+ "normalized": false,
47
+ "special": true
48
+ },
49
+ "151647": {
50
+ "content": "<|object_ref_end|>",
51
+ "single_word": false,
52
+ "lstrip": false,
53
+ "rstrip": false,
54
+ "normalized": false,
55
+ "special": true
56
+ },
57
+ "151648": {
58
+ "content": "<|box_start|>",
59
+ "single_word": false,
60
+ "lstrip": false,
61
+ "rstrip": false,
62
+ "normalized": false,
63
+ "special": true
64
+ },
65
+ "151649": {
66
+ "content": "<|box_end|>",
67
+ "single_word": false,
68
+ "lstrip": false,
69
+ "rstrip": false,
70
+ "normalized": false,
71
+ "special": true
72
+ },
73
+ "151650": {
74
+ "content": "<|quad_start|>",
75
+ "single_word": false,
76
+ "lstrip": false,
77
+ "rstrip": false,
78
+ "normalized": false,
79
+ "special": true
80
+ },
81
+ "151651": {
82
+ "content": "<|quad_end|>",
83
+ "single_word": false,
84
+ "lstrip": false,
85
+ "rstrip": false,
86
+ "normalized": false,
87
+ "special": true
88
+ },
89
+ "151652": {
90
+ "content": "<|vision_start|>",
91
+ "single_word": false,
92
+ "lstrip": false,
93
+ "rstrip": false,
94
+ "normalized": false,
95
+ "special": true
96
+ },
97
+ "151653": {
98
+ "content": "<|vision_end|>",
99
+ "single_word": false,
100
+ "lstrip": false,
101
+ "rstrip": false,
102
+ "normalized": false,
103
+ "special": true
104
+ },
105
+ "151654": {
106
+ "content": "<|vision_pad|>",
107
+ "single_word": false,
108
+ "lstrip": false,
109
+ "rstrip": false,
110
+ "normalized": false,
111
+ "special": true
112
+ },
113
+ "151655": {
114
+ "content": "<|image_pad|>",
115
+ "single_word": false,
116
+ "lstrip": false,
117
+ "rstrip": false,
118
+ "normalized": false,
119
+ "special": true
120
+ },
121
+ "151656": {
122
+ "content": "<|video_pad|>",
123
+ "single_word": false,
124
+ "lstrip": false,
125
+ "rstrip": false,
126
+ "normalized": false,
127
+ "special": true
128
+ },
129
+ "151657": {
130
+ "content": "<tool_call>",
131
+ "single_word": false,
132
+ "lstrip": false,
133
+ "rstrip": false,
134
+ "normalized": false,
135
+ "special": false
136
+ },
137
+ "151658": {
138
+ "content": "</tool_call>",
139
+ "single_word": false,
140
+ "lstrip": false,
141
+ "rstrip": false,
142
+ "normalized": false,
143
+ "special": false
144
+ },
145
+ "151659": {
146
+ "content": "<|fim_prefix|>",
147
+ "single_word": false,
148
+ "lstrip": false,
149
+ "rstrip": false,
150
+ "normalized": false,
151
+ "special": false
152
+ },
153
+ "151660": {
154
+ "content": "<|fim_middle|>",
155
+ "single_word": false,
156
+ "lstrip": false,
157
+ "rstrip": false,
158
+ "normalized": false,
159
+ "special": false
160
+ },
161
+ "151661": {
162
+ "content": "<|fim_suffix|>",
163
+ "single_word": false,
164
+ "lstrip": false,
165
+ "rstrip": false,
166
+ "normalized": false,
167
+ "special": false
168
+ },
169
+ "151662": {
170
+ "content": "<|fim_pad|>",
171
+ "single_word": false,
172
+ "lstrip": false,
173
+ "rstrip": false,
174
+ "normalized": false,
175
+ "special": false
176
+ },
177
+ "151663": {
178
+ "content": "<|repo_name|>",
179
+ "single_word": false,
180
+ "lstrip": false,
181
+ "rstrip": false,
182
+ "normalized": false,
183
+ "special": false
184
+ },
185
+ "151664": {
186
+ "content": "<|file_sep|>",
187
+ "single_word": false,
188
+ "lstrip": false,
189
+ "rstrip": false,
190
+ "normalized": false,
191
+ "special": false
192
+ },
193
+ "151665": {
194
+ "content": "<|PAD_TOKEN|>",
195
+ "single_word": false,
196
+ "lstrip": false,
197
+ "rstrip": false,
198
+ "normalized": false,
199
+ "special": true
200
+ }
201
+ }
202
+ }
checkpoint-50/trainer_state.json ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 5.0,
6
+ "eval_steps": 500,
7
+ "global_step": 50,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "clip_ratio/high_max": 0.0,
14
+ "clip_ratio/high_mean": 0.0,
15
+ "clip_ratio/low_mean": 0.0,
16
+ "clip_ratio/low_min": 0.0,
17
+ "clip_ratio/region_mean": 0.0,
18
+ "completion_length": 95.85,
19
+ "completions/clipped_ratio": 0.125,
20
+ "completions/max_length": 256.0,
21
+ "completions/max_terminated_length": 195.6,
22
+ "completions/mean_length": 95.85,
23
+ "completions/mean_terminated_length": 72.8880989074707,
24
+ "completions/min_length": 36.6,
25
+ "completions/min_terminated_length": 36.6,
26
+ "epoch": 0.5,
27
+ "frac_reward_zero_std": 0.2,
28
+ "grad_norm": 0.3041034936904907,
29
+ "kl": 0.00021898818758927518,
30
+ "learning_rate": 4e-05,
31
+ "loss": 0.0329527884721756,
32
+ "num_tokens": 21444.0,
33
+ "reward": 0.7767857074737549,
34
+ "reward_std": 0.7150911569595337,
35
+ "rewards/reward_fn/mean": 0.7767857074737549,
36
+ "rewards/reward_fn/std": 0.9959433197975158,
37
+ "step": 5
38
+ },
39
+ {
40
+ "clip_ratio/high_max": 0.0,
41
+ "clip_ratio/high_mean": 0.0,
42
+ "clip_ratio/low_mean": 0.0,
43
+ "clip_ratio/low_min": 0.0,
44
+ "clip_ratio/region_mean": 0.0,
45
+ "completion_length": 95.1,
46
+ "completions/clipped_ratio": 0.0875,
47
+ "completions/max_length": 225.2,
48
+ "completions/max_terminated_length": 159.2,
49
+ "completions/mean_length": 95.1,
50
+ "completions/mean_terminated_length": 80.34466400146485,
51
+ "completions/min_length": 34.6,
52
+ "completions/min_terminated_length": 34.6,
53
+ "epoch": 1.0,
54
+ "frac_reward_zero_std": 0.4,
55
+ "grad_norm": 0.2708108425140381,
56
+ "kl": 0.006061523332027719,
57
+ "learning_rate": 9e-05,
58
+ "loss": -0.016053691506385803,
59
+ "num_tokens": 42220.0,
60
+ "reward": 0.6875000119209289,
61
+ "reward_std": 0.5175672590732574,
62
+ "rewards/reward_fn/mean": 0.6875,
63
+ "rewards/reward_fn/std": 0.8034753799438477,
64
+ "step": 10
65
+ },
66
+ {
67
+ "clip_ratio/high_max": 0.0,
68
+ "clip_ratio/high_mean": 0.0,
69
+ "clip_ratio/low_mean": 0.0,
70
+ "clip_ratio/low_min": 0.0,
71
+ "clip_ratio/region_mean": 0.0,
72
+ "completion_length": 146.275,
73
+ "completions/clipped_ratio": 0.325,
74
+ "completions/max_length": 256.0,
75
+ "completions/max_terminated_length": 225.4,
76
+ "completions/mean_length": 146.275,
77
+ "completions/mean_terminated_length": 92.8663101196289,
78
+ "completions/min_length": 32.6,
79
+ "completions/min_terminated_length": 32.6,
80
+ "epoch": 1.5,
81
+ "frac_reward_zero_std": 0.2,
82
+ "grad_norm": 0.27529817819595337,
83
+ "kl": 0.01560373855754733,
84
+ "learning_rate": 9.989068136093873e-05,
85
+ "loss": 0.010054035484790802,
86
+ "num_tokens": 69198.0,
87
+ "reward": 1.1558928728103637,
88
+ "reward_std": 0.8465379416942597,
89
+ "rewards/reward_fn/mean": 1.1558928728103637,
90
+ "rewards/reward_fn/std": 1.1404805421829223,
91
+ "step": 15
92
+ },
93
+ {
94
+ "clip_ratio/high_max": 0.0,
95
+ "clip_ratio/high_mean": 0.0,
96
+ "clip_ratio/low_mean": 0.0,
97
+ "clip_ratio/low_min": 0.0,
98
+ "clip_ratio/region_mean": 0.0,
99
+ "completion_length": 75.85,
100
+ "completions/clipped_ratio": 0.05,
101
+ "completions/max_length": 220.6,
102
+ "completions/max_terminated_length": 151.0,
103
+ "completions/mean_length": 75.85,
104
+ "completions/mean_terminated_length": 66.14857330322266,
105
+ "completions/min_length": 32.8,
106
+ "completions/min_terminated_length": 32.8,
107
+ "epoch": 2.0,
108
+ "frac_reward_zero_std": 0.35,
109
+ "grad_norm": 0.353487491607666,
110
+ "kl": 0.027986684162169696,
111
+ "learning_rate": 9.944739353007344e-05,
112
+ "loss": -0.030885905027389526,
113
+ "num_tokens": 87334.0,
114
+ "reward": 0.7932142853736878,
115
+ "reward_std": 0.4118983089923859,
116
+ "rewards/reward_fn/mean": 0.7932142853736878,
117
+ "rewards/reward_fn/std": 0.7167340397834778,
118
+ "step": 20
119
+ },
120
+ {
121
+ "clip_ratio/high_max": 0.0,
122
+ "clip_ratio/high_mean": 0.0,
123
+ "clip_ratio/low_mean": 0.0,
124
+ "clip_ratio/low_min": 0.0,
125
+ "clip_ratio/region_mean": 0.0,
126
+ "completion_length": 90.85,
127
+ "completions/clipped_ratio": 0.0625,
128
+ "completions/max_length": 248.0,
129
+ "completions/max_terminated_length": 216.4,
130
+ "completions/mean_length": 90.85,
131
+ "completions/mean_terminated_length": 79.819287109375,
132
+ "completions/min_length": 28.0,
133
+ "completions/min_terminated_length": 28.0,
134
+ "epoch": 2.5,
135
+ "frac_reward_zero_std": 0.15,
136
+ "grad_norm": 0.3318491578102112,
137
+ "kl": 0.04692814685404301,
138
+ "learning_rate": 9.86663298624003e-05,
139
+ "loss": 0.05783940553665161,
140
+ "num_tokens": 108330.0,
141
+ "reward": 1.1987500190734863,
142
+ "reward_std": 0.5907092690467834,
143
+ "rewards/reward_fn/mean": 1.1987500190734863,
144
+ "rewards/reward_fn/std": 0.9482298731803894,
145
+ "step": 25
146
+ },
147
+ {
148
+ "clip_ratio/high_max": 0.0,
149
+ "clip_ratio/high_mean": 0.0,
150
+ "clip_ratio/low_mean": 0.0,
151
+ "clip_ratio/low_min": 0.0,
152
+ "clip_ratio/region_mean": 0.0,
153
+ "completion_length": 91.775,
154
+ "completions/clipped_ratio": 0.125,
155
+ "completions/max_length": 248.2,
156
+ "completions/max_terminated_length": 212.6,
157
+ "completions/mean_length": 91.775,
158
+ "completions/mean_terminated_length": 68.68722763061524,
159
+ "completions/min_length": 25.2,
160
+ "completions/min_terminated_length": 25.2,
161
+ "epoch": 3.0,
162
+ "frac_reward_zero_std": 0.4,
163
+ "grad_norm": 0.24605490267276764,
164
+ "kl": 0.07254143096506596,
165
+ "learning_rate": 9.755282581475769e-05,
166
+ "loss": 0.03934891819953919,
167
+ "num_tokens": 129012.0,
168
+ "reward": 1.0242857217788697,
169
+ "reward_std": 0.5802445828914642,
170
+ "rewards/reward_fn/mean": 1.0242857217788697,
171
+ "rewards/reward_fn/std": 0.8706011414527893,
172
+ "step": 30
173
+ },
174
+ {
175
+ "clip_ratio/high_max": 0.0,
176
+ "clip_ratio/high_mean": 0.0,
177
+ "clip_ratio/low_mean": 0.0,
178
+ "clip_ratio/low_min": 0.0,
179
+ "clip_ratio/region_mean": 0.0,
180
+ "completion_length": 59.7875,
181
+ "completions/clipped_ratio": 0.0375,
182
+ "completions/max_length": 175.6,
183
+ "completions/max_terminated_length": 118.4,
184
+ "completions/mean_length": 59.7875,
185
+ "completions/mean_terminated_length": 52.19559555053711,
186
+ "completions/min_length": 25.4,
187
+ "completions/min_terminated_length": 25.4,
188
+ "epoch": 3.5,
189
+ "frac_reward_zero_std": 0.6,
190
+ "grad_norm": 0.2970605790615082,
191
+ "kl": 0.053099883161485194,
192
+ "learning_rate": 9.611448774886924e-05,
193
+ "loss": 0.06977529525756836,
194
+ "num_tokens": 147103.0,
195
+ "reward": 0.9473214387893677,
196
+ "reward_std": 0.3923970699310303,
197
+ "rewards/reward_fn/mean": 0.9473214268684387,
198
+ "rewards/reward_fn/std": 0.649278998374939,
199
+ "step": 35
200
+ },
201
+ {
202
+ "clip_ratio/high_max": 0.0,
203
+ "clip_ratio/high_mean": 0.0,
204
+ "clip_ratio/low_mean": 0.0,
205
+ "clip_ratio/low_min": 0.0,
206
+ "clip_ratio/region_mean": 0.0,
207
+ "completion_length": 47.85,
208
+ "completions/clipped_ratio": 0.0,
209
+ "completions/max_length": 94.8,
210
+ "completions/max_terminated_length": 94.8,
211
+ "completions/mean_length": 47.85,
212
+ "completions/mean_terminated_length": 47.85,
213
+ "completions/min_length": 31.2,
214
+ "completions/min_terminated_length": 31.2,
215
+ "epoch": 4.0,
216
+ "frac_reward_zero_std": 0.85,
217
+ "grad_norm": 0.16646049916744232,
218
+ "kl": 0.056573960930109024,
219
+ "learning_rate": 9.43611409721806e-05,
220
+ "loss": -0.0031644195318222047,
221
+ "num_tokens": 164643.0,
222
+ "reward": 0.7899999976158142,
223
+ "reward_std": 0.12681145668029786,
224
+ "rewards/reward_fn/mean": 0.7899999976158142,
225
+ "rewards/reward_fn/std": 0.5133758962154389,
226
+ "step": 40
227
+ },
228
+ {
229
+ "clip_ratio/high_max": 0.0,
230
+ "clip_ratio/high_mean": 0.0,
231
+ "clip_ratio/low_mean": 0.0,
232
+ "clip_ratio/low_min": 0.0,
233
+ "clip_ratio/region_mean": 0.0,
234
+ "completion_length": 49.2625,
235
+ "completions/clipped_ratio": 0.0,
236
+ "completions/max_length": 93.6,
237
+ "completions/max_terminated_length": 93.6,
238
+ "completions/mean_length": 49.2625,
239
+ "completions/mean_terminated_length": 49.2625,
240
+ "completions/min_length": 33.8,
241
+ "completions/min_terminated_length": 33.8,
242
+ "epoch": 4.5,
243
+ "frac_reward_zero_std": 0.6,
244
+ "grad_norm": 0.3159082531929016,
245
+ "kl": 0.0353464649990201,
246
+ "learning_rate": 9.230476262104677e-05,
247
+ "loss": -0.03417131304740906,
248
+ "num_tokens": 182568.0,
249
+ "reward": 0.7924999833106995,
250
+ "reward_std": 0.3821385264396667,
251
+ "rewards/reward_fn/mean": 0.7924999833106995,
252
+ "rewards/reward_fn/std": 0.62625293135643,
253
+ "step": 45
254
+ },
255
+ {
256
+ "clip_ratio/high_max": 0.0,
257
+ "clip_ratio/high_mean": 0.0,
258
+ "clip_ratio/low_mean": 0.0,
259
+ "clip_ratio/low_min": 0.0,
260
+ "clip_ratio/region_mean": 0.0,
261
+ "completion_length": 58.25,
262
+ "completions/clipped_ratio": 0.0125,
263
+ "completions/max_length": 180.6,
264
+ "completions/max_terminated_length": 153.2,
265
+ "completions/mean_length": 58.25,
266
+ "completions/mean_terminated_length": 55.67250061035156,
267
+ "completions/min_length": 33.2,
268
+ "completions/min_terminated_length": 33.2,
269
+ "epoch": 5.0,
270
+ "frac_reward_zero_std": 0.55,
271
+ "grad_norm": 0.27360936999320984,
272
+ "kl": 0.04483541352674365,
273
+ "learning_rate": 8.995939984474624e-05,
274
+ "loss": 0.026852786540985107,
275
+ "num_tokens": 200660.0,
276
+ "reward": 0.774821424484253,
277
+ "reward_std": 0.32772802114486693,
278
+ "rewards/reward_fn/mean": 0.774821424484253,
279
+ "rewards/reward_fn/std": 0.5432307064533234,
280
+ "step": 50
281
+ }
282
+ ],
283
+ "logging_steps": 5,
284
+ "max_steps": 200,
285
+ "num_input_tokens_seen": 200660,
286
+ "num_train_epochs": 20,
287
+ "save_steps": 50,
288
+ "stateful_callbacks": {
289
+ "TrainerControl": {
290
+ "args": {
291
+ "should_epoch_stop": false,
292
+ "should_evaluate": false,
293
+ "should_log": false,
294
+ "should_save": true,
295
+ "should_training_stop": false
296
+ },
297
+ "attributes": {}
298
+ }
299
+ },
300
+ "total_flos": 0.0,
301
+ "train_batch_size": 8,
302
+ "trial_name": null,
303
+ "trial_params": null
304
+ }
checkpoint-50/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61516d246785bb4adf93d26e6fe8ffbb65a924b8a03f13b01e1d00b7066140af
3
+ size 6673