leonMW committed on
Commit
213ee0a
·
verified ·
1 Parent(s): c60b794

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ datasets: AIML-TUDA/SLR-Bench
3
+ library_name: transformers
4
+ model_name: DeepSeek-R1-Distill-Qwen-1.5B-S
5
+ tags:
6
+ - generated_from_trainer
7
+ - open-r1
8
+ - trl
9
+ - grpo
10
+ licence: license
11
+ ---
12
+
13
+ # Model Card for DeepSeek-R1-Distill-Qwen-1.5B-S
14
+
15
+ This model is a fine-tuned version of [deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) on the [AIML-TUDA/SLR-Bench](https://huggingface.co/datasets/AIML-TUDA/SLR-Bench) dataset.
16
+ It has been trained using [TRL](https://github.com/huggingface/trl).
17
+
18
+ ## Quick start
19
+
20
+ ```python
21
+ from transformers import pipeline
22
+
23
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
24
+ generator = pipeline("text-generation", model="leonMW/DeepSeek-R1-Distill-Qwen-1.5B-S", device="cuda")
25
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
26
+ print(output["generated_text"])
27
+ ```
28
+
29
+ ## Training procedure
30
+
31
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/leonwenderoth-tu-darmstadt/huggingface/runs/2j6kdqbv)
32
+
33
+
34
+ This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
35
+
36
+ ### Framework versions
37
+
38
+ - TRL: 0.23.0
39
+ - Transformers: 4.56.1
40
+ - Pytorch: 2.7.1
41
+ - Datasets: 4.1.0
42
+ - Tokenizers: 0.22.0
43
+
44
+ ## Citations
45
+
46
+ Cite GRPO as:
47
+
48
+ ```bibtex
49
+ @article{shao2024deepseekmath,
50
+ title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
51
+ author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
52
+ year = 2024,
53
+ eprint = {arXiv:2402.03300},
54
+ }
55
+
56
+ ```
57
+
58
+ Cite TRL as:
59
+
60
+ ```bibtex
61
+ @misc{vonwerra2022trl,
62
+ title = {{TRL: Transformer Reinforcement Learning}},
63
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
64
+ year = 2020,
65
+ journal = {GitHub repository},
66
+ publisher = {GitHub},
67
+ howpublished = {\url{https://github.com/huggingface/trl}}
68
+ }
69
+ ```
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 0.0,
3
+ "train_loss": 0.005923212121358475,
4
+ "train_runtime": 6121.2967,
5
+ "train_samples": 3000,
6
+ "train_samples_per_second": 0.98,
7
+ "train_steps_per_second": 0.03
8
+ }
chat_template.jinja ADDED
@@ -0,0 +1 @@
 
 
1
+ {% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}
config.json ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen2ForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "bos_token_id": 151646,
7
+ "dtype": "bfloat16",
8
+ "eos_token_id": 151643,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 1536,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 8960,
13
+ "layer_types": [
14
+ "full_attention",
15
+ "full_attention",
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention"
42
+ ],
43
+ "max_position_embeddings": 131072,
44
+ "max_window_layers": 21,
45
+ "model_type": "qwen2",
46
+ "num_attention_heads": 12,
47
+ "num_hidden_layers": 28,
48
+ "num_key_value_heads": 2,
49
+ "pad_token_id": 151643,
50
+ "rms_norm_eps": 1e-06,
51
+ "rope_scaling": null,
52
+ "rope_theta": 10000,
53
+ "sliding_window": null,
54
+ "tie_word_embeddings": false,
55
+ "transformers_version": "4.56.1",
56
+ "use_cache": true,
57
+ "use_mrope": false,
58
+ "use_sliding_window": false,
59
+ "vocab_size": 151936
60
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 151646,
4
+ "eos_token_id": 151643,
5
+ "pad_token_id": 151643,
6
+ "transformers_version": "4.56.1"
7
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65f2958ea29fc4d6d90d91eb896ecb8bbb4a79f66d7bd3ad2fe1f92528da2c2a
3
+ size 3554214752
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|begin▁of▁sentence|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|end▁of▁sentence|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|end▁of▁sentence|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4256422650d141f228fe954acee98679da412984c29a569877eefd3af69315a
3
+ size 11422959
tokenizer_config.json ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "151643": {
7
+ "content": "<|end▁of▁sentence|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "151644": {
15
+ "content": "<|User|>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": false
21
+ },
22
+ "151645": {
23
+ "content": "<|Assistant|>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": false
29
+ },
30
+ "151646": {
31
+ "content": "<|begin▁of▁sentence|>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ },
38
+ "151647": {
39
+ "content": "<|EOT|>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": false
45
+ },
46
+ "151648": {
47
+ "content": "<think>",
48
+ "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": false
53
+ },
54
+ "151649": {
55
+ "content": "</think>",
56
+ "lstrip": false,
57
+ "normalized": false,
58
+ "rstrip": false,
59
+ "single_word": false,
60
+ "special": false
61
+ },
62
+ "151650": {
63
+ "content": "<|quad_start|>",
64
+ "lstrip": false,
65
+ "normalized": false,
66
+ "rstrip": false,
67
+ "single_word": false,
68
+ "special": true
69
+ },
70
+ "151651": {
71
+ "content": "<|quad_end|>",
72
+ "lstrip": false,
73
+ "normalized": false,
74
+ "rstrip": false,
75
+ "single_word": false,
76
+ "special": true
77
+ },
78
+ "151652": {
79
+ "content": "<|vision_start|>",
80
+ "lstrip": false,
81
+ "normalized": false,
82
+ "rstrip": false,
83
+ "single_word": false,
84
+ "special": true
85
+ },
86
+ "151653": {
87
+ "content": "<|vision_end|>",
88
+ "lstrip": false,
89
+ "normalized": false,
90
+ "rstrip": false,
91
+ "single_word": false,
92
+ "special": true
93
+ },
94
+ "151654": {
95
+ "content": "<|vision_pad|>",
96
+ "lstrip": false,
97
+ "normalized": false,
98
+ "rstrip": false,
99
+ "single_word": false,
100
+ "special": true
101
+ },
102
+ "151655": {
103
+ "content": "<|image_pad|>",
104
+ "lstrip": false,
105
+ "normalized": false,
106
+ "rstrip": false,
107
+ "single_word": false,
108
+ "special": true
109
+ },
110
+ "151656": {
111
+ "content": "<|video_pad|>",
112
+ "lstrip": false,
113
+ "normalized": false,
114
+ "rstrip": false,
115
+ "single_word": false,
116
+ "special": true
117
+ },
118
+ "151657": {
119
+ "content": "<tool_call>",
120
+ "lstrip": false,
121
+ "normalized": false,
122
+ "rstrip": false,
123
+ "single_word": false,
124
+ "special": false
125
+ },
126
+ "151658": {
127
+ "content": "</tool_call>",
128
+ "lstrip": false,
129
+ "normalized": false,
130
+ "rstrip": false,
131
+ "single_word": false,
132
+ "special": false
133
+ },
134
+ "151659": {
135
+ "content": "<|fim_prefix|>",
136
+ "lstrip": false,
137
+ "normalized": false,
138
+ "rstrip": false,
139
+ "single_word": false,
140
+ "special": false
141
+ },
142
+ "151660": {
143
+ "content": "<|fim_middle|>",
144
+ "lstrip": false,
145
+ "normalized": false,
146
+ "rstrip": false,
147
+ "single_word": false,
148
+ "special": false
149
+ },
150
+ "151661": {
151
+ "content": "<|fim_suffix|>",
152
+ "lstrip": false,
153
+ "normalized": false,
154
+ "rstrip": false,
155
+ "single_word": false,
156
+ "special": false
157
+ },
158
+ "151662": {
159
+ "content": "<|fim_pad|>",
160
+ "lstrip": false,
161
+ "normalized": false,
162
+ "rstrip": false,
163
+ "single_word": false,
164
+ "special": false
165
+ },
166
+ "151663": {
167
+ "content": "<|repo_name|>",
168
+ "lstrip": false,
169
+ "normalized": false,
170
+ "rstrip": false,
171
+ "single_word": false,
172
+ "special": false
173
+ },
174
+ "151664": {
175
+ "content": "<|file_sep|>",
176
+ "lstrip": false,
177
+ "normalized": false,
178
+ "rstrip": false,
179
+ "single_word": false,
180
+ "special": false
181
+ }
182
+ },
183
+ "bos_token": "<|begin▁of▁sentence|>",
184
+ "clean_up_tokenization_spaces": false,
185
+ "eos_token": "<|end▁of▁sentence|>",
186
+ "extra_special_tokens": {},
187
+ "legacy": true,
188
+ "max_length": null,
189
+ "model_max_length": 16384,
190
+ "pad_to_multiple_of": null,
191
+ "pad_token": "<|end▁of▁sentence|>",
192
+ "pad_token_type_id": 0,
193
+ "padding_side": "left",
194
+ "sp_model_kwargs": {},
195
+ "tokenizer_class": "LlamaTokenizerFast",
196
+ "unk_token": null,
197
+ "use_default_system_prompt": false
198
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 0.0,
3
+ "train_loss": 0.005923212121358475,
4
+ "train_runtime": 6121.2967,
5
+ "train_samples": 3000,
6
+ "train_samples_per_second": 0.98,
7
+ "train_steps_per_second": 0.03
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,2026 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 92,
3
+ "best_metric": 0.0008370681316591799,
4
+ "best_model_checkpoint": "data/DeepSeek-R1-Distill-Qwen-1.5B-Staged-4/checkpoint-92",
5
+ "epoch": 2.0,
6
+ "eval_steps": 500,
7
+ "global_step": 184,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "clip_ratio/high_max": 0.0,
14
+ "clip_ratio/high_mean": 0.0,
15
+ "clip_ratio/low_mean": 0.0,
16
+ "clip_ratio/low_min": 0.0,
17
+ "clip_ratio/region_mean": 0.0,
18
+ "completions/clipped_ratio": 0.0,
19
+ "completions/max_length": 473.0,
20
+ "completions/max_terminated_length": 473.0,
21
+ "completions/mean_length": 399.97607421875,
22
+ "completions/mean_terminated_length": 399.97607421875,
23
+ "completions/min_length": 304.0,
24
+ "completions/min_terminated_length": 304.0,
25
+ "entropy": 0.35566435009241104,
26
+ "epoch": 0.010869565217391304,
27
+ "frac_reward_zero_std": 0.0,
28
+ "grad_norm": 0.027134701502786768,
29
+ "learning_rate": 1e-05,
30
+ "loss": 0.0026,
31
+ "num_tokens": 2869071.0,
32
+ "reward": 3.4189200401306152,
33
+ "reward_std": 0.13538040220737457,
34
+ "rewards/ngram_repetition2/mean": 0.9907151460647583,
35
+ "rewards/ngram_repetition2/std": 0.007372148334980011,
36
+ "rewards/ngram_repetition3/mean": 0.9988653659820557,
37
+ "rewards/ngram_repetition3/std": 0.0037813771050423384,
38
+ "rewards/symbolic_reward_accuracy/mean": 0.7431640625,
39
+ "rewards/symbolic_reward_accuracy/std": 0.43699485063552856,
40
+ "rewards/symbolic_reward_partial_score/mean": 0.9029541015625,
41
+ "rewards/symbolic_reward_partial_score/std": 0.19717122614383698,
42
+ "rewards/tag_count_reward/mean": 1.0,
43
+ "rewards/tag_count_reward/std": 0.0,
44
+ "rewards/thinking_answer_ratio_reward/mean": 0.9742211699485779,
45
+ "rewards/thinking_answer_ratio_reward/std": 0.004831792786717415,
46
+ "sampling/importance_sampling_ratio/max": 2.0,
47
+ "sampling/importance_sampling_ratio/mean": 1.1269948482513428,
48
+ "sampling/importance_sampling_ratio/min": 3.996394298155792e-05,
49
+ "sampling/sampling_logp_difference/max": 10.127532958984375,
50
+ "sampling/sampling_logp_difference/mean": 0.19825759530067444,
51
+ "step": 1
52
+ },
53
+ {
54
+ "clip_ratio/high_max": 0.3385416666666667,
55
+ "clip_ratio/high_mean": 0.19986979166666666,
56
+ "clip_ratio/low_mean": 0.2779947916666667,
57
+ "clip_ratio/low_min": 0.125,
58
+ "clip_ratio/region_mean": 0.4778645833333333,
59
+ "entropy": 0.3625817572077115,
60
+ "epoch": 0.043478260869565216,
61
+ "grad_norm": 0.025662250505496927,
62
+ "learning_rate": 1e-05,
63
+ "loss": -0.0008,
64
+ "step": 4
65
+ },
66
+ {
67
+ "clip_ratio/high_max": 0.26953125,
68
+ "clip_ratio/high_mean": 0.15087890625,
69
+ "clip_ratio/low_mean": 0.201171875,
70
+ "clip_ratio/low_min": 0.078125,
71
+ "clip_ratio/region_mean": 0.35205078125,
72
+ "completions/clipped_ratio": 0.0,
73
+ "completions/max_length": 483.0,
74
+ "completions/max_terminated_length": 483.0,
75
+ "completions/mean_length": 389.82080078125,
76
+ "completions/mean_terminated_length": 389.82080078125,
77
+ "completions/min_length": 299.0,
78
+ "completions/min_terminated_length": 299.0,
79
+ "entropy": 0.36707244999706745,
80
+ "epoch": 0.08695652173913043,
81
+ "frac_reward_zero_std": 0.0,
82
+ "grad_norm": 0.02048054919935291,
83
+ "learning_rate": 1e-05,
84
+ "loss": 0.0,
85
+ "num_tokens": 5758528.0,
86
+ "reward": 3.2567176818847656,
87
+ "reward_std": 0.11631277203559875,
88
+ "rewards/ngram_repetition2/mean": 0.9903280138969421,
89
+ "rewards/ngram_repetition2/std": 0.008003082126379013,
90
+ "rewards/ngram_repetition3/mean": 0.9988331198692322,
91
+ "rewards/ngram_repetition3/std": 0.004472358617931604,
92
+ "rewards/symbolic_reward_accuracy/mean": 0.662109375,
93
+ "rewards/symbolic_reward_accuracy/std": 0.47310659289360046,
94
+ "rewards/symbolic_reward_partial_score/mean": 0.9028727412223816,
95
+ "rewards/symbolic_reward_partial_score/std": 0.16355262696743011,
96
+ "rewards/tag_count_reward/mean": 1.0,
97
+ "rewards/tag_count_reward/std": 0.0,
98
+ "rewards/thinking_answer_ratio_reward/mean": 0.9734619855880737,
99
+ "rewards/thinking_answer_ratio_reward/std": 0.005536045413464308,
100
+ "sampling/importance_sampling_ratio/max": 2.0,
101
+ "sampling/importance_sampling_ratio/mean": 1.1309096813201904,
102
+ "sampling/importance_sampling_ratio/min": 1.0269287486153189e-05,
103
+ "sampling/sampling_logp_difference/max": 11.486352920532227,
104
+ "sampling/sampling_logp_difference/mean": 0.20535901188850403,
105
+ "step": 8
106
+ },
107
+ {
108
+ "clip_ratio/high_max": 0.27734375,
109
+ "clip_ratio/high_mean": 0.16259765625,
110
+ "clip_ratio/low_mean": 0.197265625,
111
+ "clip_ratio/low_min": 0.09375,
112
+ "clip_ratio/region_mean": 0.35986328125,
113
+ "completions/clipped_ratio": 0.0,
114
+ "completions/max_length": 503.0,
115
+ "completions/max_terminated_length": 503.0,
116
+ "completions/mean_length": 391.505859375,
117
+ "completions/mean_terminated_length": 391.505859375,
118
+ "completions/min_length": 311.0,
119
+ "completions/min_terminated_length": 311.0,
120
+ "entropy": 0.388662975281477,
121
+ "epoch": 0.13043478260869565,
122
+ "frac_reward_zero_std": 0.0,
123
+ "grad_norm": 0.02483318093046019,
124
+ "learning_rate": 1e-05,
125
+ "loss": 0.0001,
126
+ "num_tokens": 8607084.0,
127
+ "reward": 3.3946127891540527,
128
+ "reward_std": 0.11570382118225098,
129
+ "rewards/ngram_repetition2/mean": 0.9894477725028992,
130
+ "rewards/ngram_repetition2/std": 0.007921576499938965,
131
+ "rewards/ngram_repetition3/mean": 0.9988635778427124,
132
+ "rewards/ngram_repetition3/std": 0.00399815896525979,
133
+ "rewards/symbolic_reward_accuracy/mean": 0.72119140625,
134
+ "rewards/symbolic_reward_accuracy/std": 0.448522686958313,
135
+ "rewards/symbolic_reward_partial_score/mean": 0.922607421875,
136
+ "rewards/symbolic_reward_partial_score/std": 0.14913025498390198,
137
+ "rewards/tag_count_reward/mean": 1.0,
138
+ "rewards/tag_count_reward/std": 0.0,
139
+ "rewards/thinking_answer_ratio_reward/mean": 0.9739440679550171,
140
+ "rewards/thinking_answer_ratio_reward/std": 0.004836579784750938,
141
+ "sampling/importance_sampling_ratio/max": 2.0,
142
+ "sampling/importance_sampling_ratio/mean": 1.1356343030929565,
143
+ "sampling/importance_sampling_ratio/min": 0.0001149894596892409,
144
+ "sampling/sampling_logp_difference/max": 9.070670127868652,
145
+ "sampling/sampling_logp_difference/mean": 0.21488171815872192,
146
+ "step": 12
147
+ },
148
+ {
149
+ "clip_ratio/high_max": 0.296875,
150
+ "clip_ratio/high_mean": 0.177734375,
151
+ "clip_ratio/low_mean": 0.1787109375,
152
+ "clip_ratio/low_min": 0.07421875,
153
+ "clip_ratio/region_mean": 0.3564453125,
154
+ "completions/clipped_ratio": 0.0,
155
+ "completions/max_length": 492.0,
156
+ "completions/max_terminated_length": 492.0,
157
+ "completions/mean_length": 393.09716796875,
158
+ "completions/mean_terminated_length": 393.09716796875,
159
+ "completions/min_length": 313.0,
160
+ "completions/min_terminated_length": 313.0,
161
+ "entropy": 0.39366098679602146,
162
+ "epoch": 0.17391304347826086,
163
+ "frac_reward_zero_std": 0.0,
164
+ "grad_norm": 0.03379401199661016,
165
+ "learning_rate": 1e-05,
166
+ "loss": 0.0008,
167
+ "num_tokens": 11465235.0,
168
+ "reward": 3.1309444904327393,
169
+ "reward_std": 0.27350103855133057,
170
+ "rewards/ngram_repetition2/mean": 0.9783711433410645,
171
+ "rewards/ngram_repetition2/std": 0.021668143570423126,
172
+ "rewards/ngram_repetition3/mean": 0.9917559623718262,
173
+ "rewards/ngram_repetition3/std": 0.014161880128085613,
174
+ "rewards/symbolic_reward_accuracy/mean": 0.625,
175
+ "rewards/symbolic_reward_accuracy/std": 0.48424115777015686,
176
+ "rewards/symbolic_reward_partial_score/mean": 0.8516031503677368,
177
+ "rewards/symbolic_reward_partial_score/std": 0.22019875049591064,
178
+ "rewards/tag_count_reward/mean": 1.0,
179
+ "rewards/tag_count_reward/std": 0.0,
180
+ "rewards/thinking_answer_ratio_reward/mean": 0.9640083312988281,
181
+ "rewards/thinking_answer_ratio_reward/std": 0.01703455112874508,
182
+ "sampling/importance_sampling_ratio/max": 2.0,
183
+ "sampling/importance_sampling_ratio/mean": 1.1348872184753418,
184
+ "sampling/importance_sampling_ratio/min": 4.663659638026729e-06,
185
+ "sampling/sampling_logp_difference/max": 12.275710105895996,
186
+ "sampling/sampling_logp_difference/mean": 0.21620666980743408,
187
+ "step": 16
188
+ },
189
+ {
190
+ "clip_ratio/high_max": 0.296875,
191
+ "clip_ratio/high_mean": 0.1826171875,
192
+ "clip_ratio/low_mean": 0.17724609375,
193
+ "clip_ratio/low_min": 0.09375,
194
+ "clip_ratio/region_mean": 0.35986328125,
195
+ "completions/clipped_ratio": 0.0,
196
+ "completions/max_length": 508.0,
197
+ "completions/max_terminated_length": 508.0,
198
+ "completions/mean_length": 386.9951171875,
199
+ "completions/mean_terminated_length": 386.9951171875,
200
+ "completions/min_length": 305.0,
201
+ "completions/min_terminated_length": 305.0,
202
+ "entropy": 0.44838985428214073,
203
+ "epoch": 0.21739130434782608,
204
+ "frac_reward_zero_std": 0.0,
205
+ "grad_norm": 0.02561700687134101,
206
+ "learning_rate": 1e-05,
207
+ "loss": 0.0009,
208
+ "num_tokens": 14298217.0,
209
+ "reward": 3.3726930618286133,
210
+ "reward_std": 0.11649461090564728,
211
+ "rewards/ngram_repetition2/mean": 0.9818264245986938,
212
+ "rewards/ngram_repetition2/std": 0.01684102788567543,
213
+ "rewards/ngram_repetition3/mean": 0.9955232739448547,
214
+ "rewards/ngram_repetition3/std": 0.010431738570332527,
215
+ "rewards/symbolic_reward_accuracy/mean": 0.72021484375,
216
+ "rewards/symbolic_reward_accuracy/std": 0.4490031898021698,
217
+ "rewards/symbolic_reward_partial_score/mean": 0.9027913212776184,
218
+ "rewards/symbolic_reward_partial_score/std": 0.1795635223388672,
219
+ "rewards/tag_count_reward/mean": 1.0,
220
+ "rewards/tag_count_reward/std": 0.0,
221
+ "rewards/thinking_answer_ratio_reward/mean": 0.9698631763458252,
222
+ "rewards/thinking_answer_ratio_reward/std": 0.012476499192416668,
223
+ "sampling/importance_sampling_ratio/max": 2.0,
224
+ "sampling/importance_sampling_ratio/mean": 1.1498932838439941,
225
+ "sampling/importance_sampling_ratio/min": 4.004936636192724e-05,
226
+ "sampling/sampling_logp_difference/max": 10.125397682189941,
227
+ "sampling/sampling_logp_difference/mean": 0.23924380540847778,
228
+ "step": 20
229
+ },
230
+ {
231
+ "clip_ratio/high_max": 0.328125,
232
+ "clip_ratio/high_mean": 0.20703125,
233
+ "clip_ratio/low_mean": 0.17822265625,
234
+ "clip_ratio/low_min": 0.05859375,
235
+ "clip_ratio/region_mean": 0.38525390625,
236
+ "completions/clipped_ratio": 0.0,
237
+ "completions/max_length": 501.0,
238
+ "completions/max_terminated_length": 501.0,
239
+ "completions/mean_length": 381.43408203125,
240
+ "completions/mean_terminated_length": 381.43408203125,
241
+ "completions/min_length": 293.0,
242
+ "completions/min_terminated_length": 293.0,
243
+ "entropy": 0.45580120012164116,
244
+ "epoch": 0.2608695652173913,
245
+ "frac_reward_zero_std": 0.0,
246
+ "grad_norm": 0.02764242724586242,
247
+ "learning_rate": 1e-05,
248
+ "loss": 0.0008,
249
+ "num_tokens": 17107138.0,
250
+ "reward": 3.2511990070343018,
251
+ "reward_std": 0.14526695013046265,
252
+ "rewards/ngram_repetition2/mean": 0.9758055210113525,
253
+ "rewards/ngram_repetition2/std": 0.026150498539209366,
254
+ "rewards/ngram_repetition3/mean": 0.991034984588623,
255
+ "rewards/ngram_repetition3/std": 0.018479736521840096,
256
+ "rewards/symbolic_reward_accuracy/mean": 0.666015625,
257
+ "rewards/symbolic_reward_accuracy/std": 0.47174936532974243,
258
+ "rewards/symbolic_reward_partial_score/mean": 0.8898518681526184,
259
+ "rewards/symbolic_reward_partial_score/std": 0.18666595220565796,
260
+ "rewards/tag_count_reward/mean": 1.0,
261
+ "rewards/tag_count_reward/std": 0.0,
262
+ "rewards/thinking_answer_ratio_reward/mean": 0.9647442102432251,
263
+ "rewards/thinking_answer_ratio_reward/std": 0.018395813181996346,
264
+ "sampling/importance_sampling_ratio/max": 2.0,
265
+ "sampling/importance_sampling_ratio/mean": 1.151930332183838,
266
+ "sampling/importance_sampling_ratio/min": 2.3768032406223938e-05,
267
+ "sampling/sampling_logp_difference/max": 10.64716911315918,
268
+ "sampling/sampling_logp_difference/mean": 0.242381751537323,
269
+ "step": 24
270
+ },
271
+ {
272
+ "clip_ratio/high_max": 0.34375,
273
+ "clip_ratio/high_mean": 0.21533203125,
274
+ "clip_ratio/low_mean": 0.16552734375,
275
+ "clip_ratio/low_min": 0.07421875,
276
+ "clip_ratio/region_mean": 0.380859375,
277
+ "completions/clipped_ratio": 0.00048828125,
278
+ "completions/max_length": 3072.0,
279
+ "completions/max_terminated_length": 480.0,
280
+ "completions/mean_length": 370.24658203125,
281
+ "completions/mean_terminated_length": 368.9267272949219,
282
+ "completions/min_length": 290.0,
283
+ "completions/min_terminated_length": 290.0,
284
+ "entropy": 0.47484372183680534,
285
+ "epoch": 0.30434782608695654,
286
+ "frac_reward_zero_std": 0.0,
287
+ "grad_norm": 0.02571369534600996,
288
+ "learning_rate": 1e-05,
289
+ "loss": 0.0019,
290
+ "num_tokens": 19915323.0,
291
+ "reward": 3.4752464294433594,
292
+ "reward_std": 0.1370949149131775,
293
+ "rewards/ngram_repetition2/mean": 0.9834737181663513,
294
+ "rewards/ngram_repetition2/std": 0.016693396493792534,
295
+ "rewards/ngram_repetition3/mean": 0.996229887008667,
296
+ "rewards/ngram_repetition3/std": 0.01113222073763609,
297
+ "rewards/symbolic_reward_accuracy/mean": 0.7587890625,
298
+ "rewards/symbolic_reward_accuracy/std": 0.42792245745658875,
299
+ "rewards/symbolic_reward_partial_score/mean": 0.9284260869026184,
300
+ "rewards/symbolic_reward_partial_score/std": 0.15036074817180634,
301
+ "rewards/tag_count_reward/mean": 0.999755859375,
302
+ "rewards/tag_count_reward/std": 0.011048543266952038,
303
+ "rewards/thinking_answer_ratio_reward/mean": 0.9689278602600098,
304
+ "rewards/thinking_answer_ratio_reward/std": 0.024484839290380478,
305
+ "sampling/importance_sampling_ratio/max": 2.0,
306
+ "sampling/importance_sampling_ratio/mean": 1.1540093421936035,
307
+ "sampling/importance_sampling_ratio/min": 4.7302735765697435e-05,
308
+ "sampling/sampling_logp_difference/max": 9.958942413330078,
309
+ "sampling/sampling_logp_difference/mean": 0.24644868075847626,
310
+ "step": 28
311
+ },
312
+ {
313
+ "clip_ratio/high_max": 0.3046875,
314
+ "clip_ratio/high_mean": 0.17724609375,
315
+ "clip_ratio/low_mean": 0.19384765625,
316
+ "clip_ratio/low_min": 0.078125,
317
+ "clip_ratio/region_mean": 0.37109375,
318
+ "completions/clipped_ratio": 0.0,
319
+ "completions/max_length": 452.0,
320
+ "completions/max_terminated_length": 452.0,
321
+ "completions/mean_length": 361.70166015625,
322
+ "completions/mean_terminated_length": 361.70166015625,
323
+ "completions/min_length": 275.0,
324
+ "completions/min_terminated_length": 275.0,
325
+ "entropy": 0.4775677230209112,
326
+ "epoch": 0.34782608695652173,
327
+ "frac_reward_zero_std": 0.0,
328
+ "grad_norm": 0.022611441969899414,
329
+ "learning_rate": 1e-05,
330
+ "loss": 0.0003,
331
+ "num_tokens": 22712344.0,
332
+ "reward": 3.4439804553985596,
333
+ "reward_std": 0.07519370317459106,
334
+ "rewards/ngram_repetition2/mean": 0.9849820137023926,
335
+ "rewards/ngram_repetition2/std": 0.015775341540575027,
336
+ "rewards/ngram_repetition3/mean": 0.9966925382614136,
337
+ "rewards/ngram_repetition3/std": 0.010486208833754063,
338
+ "rewards/symbolic_reward_accuracy/mean": 0.7451171875,
339
+ "rewards/symbolic_reward_accuracy/std": 0.4359017610549927,
340
+ "rewards/symbolic_reward_partial_score/mean": 0.9242349863052368,
341
+ "rewards/symbolic_reward_partial_score/std": 0.1511358916759491,
342
+ "rewards/tag_count_reward/mean": 1.0,
343
+ "rewards/tag_count_reward/std": 0.0,
344
+ "rewards/thinking_answer_ratio_reward/mean": 0.9694297313690186,
345
+ "rewards/thinking_answer_ratio_reward/std": 0.01129063218832016,
346
+ "sampling/importance_sampling_ratio/max": 2.0,
347
+ "sampling/importance_sampling_ratio/mean": 1.1553937196731567,
348
+ "sampling/importance_sampling_ratio/min": 1.3447781777031764e-10,
349
+ "sampling/sampling_logp_difference/max": 22.72962188720703,
350
+ "sampling/sampling_logp_difference/mean": 0.24660193920135498,
351
+ "step": 32
352
+ },
353
+ {
354
+ "clip_ratio/high_max": 0.296875,
355
+ "clip_ratio/high_mean": 0.18896484375,
356
+ "clip_ratio/low_mean": 0.18798828125,
357
+ "clip_ratio/low_min": 0.08203125,
358
+ "clip_ratio/region_mean": 0.376953125,
359
+ "completions/clipped_ratio": 0.0,
360
+ "completions/max_length": 457.0,
361
+ "completions/max_terminated_length": 457.0,
362
+ "completions/mean_length": 360.79736328125,
363
+ "completions/mean_terminated_length": 360.79736328125,
364
+ "completions/min_length": 286.0,
365
+ "completions/min_terminated_length": 286.0,
366
+ "entropy": 0.4877959694713354,
367
+ "epoch": 0.391304347826087,
368
+ "frac_reward_zero_std": 0.0,
369
+ "grad_norm": 0.02221078365492021,
370
+ "learning_rate": 1e-05,
371
+ "loss": 0.0004,
372
+ "num_tokens": 25517017.0,
373
+ "reward": 3.241891622543335,
374
+ "reward_std": 0.03938647359609604,
375
+ "rewards/ngram_repetition2/mean": 0.9857202768325806,
376
+ "rewards/ngram_repetition2/std": 0.013620593585073948,
377
+ "rewards/ngram_repetition3/mean": 0.9973255395889282,
378
+ "rewards/ngram_repetition3/std": 0.008434941992163658,
379
+ "rewards/symbolic_reward_accuracy/mean": 0.65234375,
380
+ "rewards/symbolic_reward_accuracy/std": 0.47634249925613403,
381
+ "rewards/symbolic_reward_partial_score/mean": 0.9076741933822632,
382
+ "rewards/symbolic_reward_partial_score/std": 0.141921728849411,
383
+ "rewards/tag_count_reward/mean": 1.0,
384
+ "rewards/tag_count_reward/std": 0.0,
385
+ "rewards/thinking_answer_ratio_reward/mean": 0.9699524641036987,
386
+ "rewards/thinking_answer_ratio_reward/std": 0.009303269907832146,
387
+ "sampling/importance_sampling_ratio/max": 2.0,
388
+ "sampling/importance_sampling_ratio/mean": 1.156609296798706,
389
+ "sampling/importance_sampling_ratio/min": 1.4608130260995722e-09,
390
+ "sampling/sampling_logp_difference/max": 20.34427261352539,
391
+ "sampling/sampling_logp_difference/mean": 0.24704143404960632,
392
+ "step": 36
393
+ },
394
+ {
395
+ "clip_ratio/high_max": 0.31640625,
396
+ "clip_ratio/high_mean": 0.20263671875,
397
+ "clip_ratio/low_mean": 0.17431640625,
398
+ "clip_ratio/low_min": 0.0546875,
399
+ "clip_ratio/region_mean": 0.376953125,
400
+ "completions/clipped_ratio": 0.0,
401
+ "completions/max_length": 470.0,
402
+ "completions/max_terminated_length": 470.0,
403
+ "completions/mean_length": 365.380859375,
404
+ "completions/mean_terminated_length": 365.380859375,
405
+ "completions/min_length": 289.0,
406
+ "completions/min_terminated_length": 289.0,
407
+ "entropy": 0.500596784055233,
408
+ "epoch": 0.43478260869565216,
409
+ "frac_reward_zero_std": 0.0,
410
+ "grad_norm": 0.028585581797482114,
411
+ "learning_rate": 1e-05,
412
+ "loss": 0.0003,
413
+ "num_tokens": 28302565.0,
414
+ "reward": 3.6249711513519287,
415
+ "reward_std": 0.031787335872650146,
416
+ "rewards/ngram_repetition2/mean": 0.9847173690795898,
417
+ "rewards/ngram_repetition2/std": 0.01566314324736595,
418
+ "rewards/ngram_repetition3/mean": 0.9967639446258545,
419
+ "rewards/ngram_repetition3/std": 0.010298742912709713,
420
+ "rewards/symbolic_reward_accuracy/mean": 0.8203125,
421
+ "rewards/symbolic_reward_accuracy/std": 0.38402071595191956,
422
+ "rewards/symbolic_reward_partial_score/mean": 0.954833984375,
423
+ "rewards/symbolic_reward_partial_score/std": 0.11018021404743195,
424
+ "rewards/tag_count_reward/mean": 1.0,
425
+ "rewards/tag_count_reward/std": 0.0,
426
+ "rewards/thinking_answer_ratio_reward/mean": 0.9697376489639282,
427
+ "rewards/thinking_answer_ratio_reward/std": 0.011102610267698765,
428
+ "sampling/importance_sampling_ratio/max": 2.0,
429
+ "sampling/importance_sampling_ratio/mean": 1.1601823568344116,
430
+ "sampling/importance_sampling_ratio/min": 2.0568222680594772e-05,
431
+ "sampling/sampling_logp_difference/max": 10.791763305664062,
432
+ "sampling/sampling_logp_difference/mean": 0.24917525053024292,
433
+ "step": 40
434
+ },
435
+ {
436
+ "clip_ratio/high_max": 0.26171875,
437
+ "clip_ratio/high_mean": 0.14892578125,
438
+ "clip_ratio/low_mean": 0.1982421875,
439
+ "clip_ratio/low_min": 0.08203125,
440
+ "clip_ratio/region_mean": 0.34716796875,
441
+ "completions/clipped_ratio": 0.0,
442
+ "completions/max_length": 469.0,
443
+ "completions/max_terminated_length": 469.0,
444
+ "completions/mean_length": 373.4169921875,
445
+ "completions/mean_terminated_length": 373.4169921875,
446
+ "completions/min_length": 306.0,
447
+ "completions/min_terminated_length": 306.0,
448
+ "entropy": 0.5211230479180813,
449
+ "epoch": 0.4782608695652174,
450
+ "frac_reward_zero_std": 0.0,
451
+ "grad_norm": 0.031003841790188075,
452
+ "learning_rate": 1e-05,
453
+ "loss": 0.0002,
454
+ "num_tokens": 31126747.0,
455
+ "reward": 3.277402639389038,
456
+ "reward_std": 0.036474019289016724,
457
+ "rewards/ngram_repetition2/mean": 0.9847090840339661,
458
+ "rewards/ngram_repetition2/std": 0.01521327905356884,
459
+ "rewards/ngram_repetition3/mean": 0.9967399835586548,
460
+ "rewards/ngram_repetition3/std": 0.00949870329350233,
461
+ "rewards/symbolic_reward_accuracy/mean": 0.6708984375,
462
+ "rewards/symbolic_reward_accuracy/std": 0.4700016975402832,
463
+ "rewards/symbolic_reward_partial_score/mean": 0.9060872793197632,
464
+ "rewards/symbolic_reward_partial_score/std": 0.16207432746887207,
465
+ "rewards/tag_count_reward/mean": 1.0,
466
+ "rewards/tag_count_reward/std": 0.0,
467
+ "rewards/thinking_answer_ratio_reward/mean": 0.9704017043113708,
468
+ "rewards/thinking_answer_ratio_reward/std": 0.010555099695920944,
469
+ "sampling/importance_sampling_ratio/max": 2.0,
470
+ "sampling/importance_sampling_ratio/mean": 1.1634962558746338,
471
+ "sampling/importance_sampling_ratio/min": 3.222545501557761e-06,
472
+ "sampling/sampling_logp_difference/max": 12.645339012145996,
473
+ "sampling/sampling_logp_difference/mean": 0.2521994411945343,
474
+ "step": 44
475
+ },
476
+ {
477
+ "clip_ratio/high_max": 0.31640625,
478
+ "clip_ratio/high_mean": 0.20166015625,
479
+ "clip_ratio/low_mean": 0.16064453125,
480
+ "clip_ratio/low_min": 0.05078125,
481
+ "clip_ratio/region_mean": 0.3623046875,
482
+ "completions/clipped_ratio": 0.0,
483
+ "completions/max_length": 447.0,
484
+ "completions/max_terminated_length": 447.0,
485
+ "completions/mean_length": 376.16943359375,
486
+ "completions/mean_terminated_length": 376.16943359375,
487
+ "completions/min_length": 287.0,
488
+ "completions/min_terminated_length": 287.0,
489
+ "entropy": 0.5292842984199524,
490
+ "epoch": 0.5217391304347826,
491
+ "frac_reward_zero_std": 0.0,
492
+ "grad_norm": 0.029177978249695414,
493
+ "learning_rate": 1e-05,
494
+ "loss": 0.0003,
495
+ "num_tokens": 33956566.0,
496
+ "reward": 3.5402560234069824,
497
+ "reward_std": 0.05406097322702408,
498
+ "rewards/ngram_repetition2/mean": 0.9857374429702759,
499
+ "rewards/ngram_repetition2/std": 0.011830438859760761,
500
+ "rewards/ngram_repetition3/mean": 0.9977642297744751,
501
+ "rewards/ngram_repetition3/std": 0.00712351780384779,
502
+ "rewards/symbolic_reward_accuracy/mean": 0.7822265625,
503
+ "rewards/symbolic_reward_accuracy/std": 0.4128333628177643,
504
+ "rewards/symbolic_reward_partial_score/mean": 0.9462483525276184,
505
+ "rewards/symbolic_reward_partial_score/std": 0.11659117788076401,
506
+ "rewards/tag_count_reward/mean": 1.0,
507
+ "rewards/tag_count_reward/std": 0.0,
508
+ "rewards/thinking_answer_ratio_reward/mean": 0.9719350934028625,
509
+ "rewards/thinking_answer_ratio_reward/std": 0.0077917324379086494,
510
+ "sampling/importance_sampling_ratio/max": 2.0,
511
+ "sampling/importance_sampling_ratio/mean": 1.1653231382369995,
512
+ "sampling/importance_sampling_ratio/min": 1.30699368128262e-06,
513
+ "sampling/sampling_logp_difference/max": 13.547780990600586,
514
+ "sampling/sampling_logp_difference/mean": 0.2539900541305542,
515
+ "step": 48
516
+ },
517
+ {
518
+ "clip_ratio/high_max": 0.30078125,
519
+ "clip_ratio/high_mean": 0.1875,
520
+ "clip_ratio/low_mean": 0.19287109375,
521
+ "clip_ratio/low_min": 0.05859375,
522
+ "clip_ratio/region_mean": 0.38037109375,
523
+ "completions/clipped_ratio": 0.0,
524
+ "completions/max_length": 476.0,
525
+ "completions/max_terminated_length": 476.0,
526
+ "completions/mean_length": 375.197265625,
527
+ "completions/mean_terminated_length": 375.197265625,
528
+ "completions/min_length": 285.0,
529
+ "completions/min_terminated_length": 285.0,
530
+ "entropy": 0.5228982605040073,
531
+ "epoch": 0.5652173913043478,
532
+ "frac_reward_zero_std": 0.0,
533
+ "grad_norm": 0.019909033791539,
534
+ "learning_rate": 1e-05,
535
+ "loss": 0.0002,
536
+ "num_tokens": 36800234.0,
537
+ "reward": 3.3348586559295654,
538
+ "reward_std": 0.031567975878715515,
539
+ "rewards/ngram_repetition2/mean": 0.9860277771949768,
540
+ "rewards/ngram_repetition2/std": 0.011108696460723877,
541
+ "rewards/ngram_repetition3/mean": 0.9980593919754028,
542
+ "rewards/ngram_repetition3/std": 0.006017275620251894,
543
+ "rewards/symbolic_reward_accuracy/mean": 0.69580078125,
544
+ "rewards/symbolic_reward_accuracy/std": 0.46017980575561523,
545
+ "rewards/symbolic_reward_partial_score/mean": 0.9136962890625,
546
+ "rewards/symbolic_reward_partial_score/std": 0.15295757353305817,
547
+ "rewards/tag_count_reward/mean": 1.0,
548
+ "rewards/tag_count_reward/std": 0.0,
549
+ "rewards/thinking_answer_ratio_reward/mean": 0.9719891548156738,
550
+ "rewards/thinking_answer_ratio_reward/std": 0.007201826199889183,
551
+ "sampling/importance_sampling_ratio/max": 2.0,
552
+ "sampling/importance_sampling_ratio/mean": 1.165921688079834,
553
+ "sampling/importance_sampling_ratio/min": 7.888810068834573e-05,
554
+ "sampling/sampling_logp_difference/max": 9.447480201721191,
555
+ "sampling/sampling_logp_difference/mean": 0.2536054253578186,
556
+ "step": 52
557
+ },
558
+ {
559
+ "clip_ratio/high_max": 0.2890625,
560
+ "clip_ratio/high_mean": 0.19287109375,
561
+ "clip_ratio/low_mean": 0.18603515625,
562
+ "clip_ratio/low_min": 0.08203125,
563
+ "clip_ratio/region_mean": 0.37890625,
564
+ "completions/clipped_ratio": 0.0,
565
+ "completions/max_length": 469.0,
566
+ "completions/max_terminated_length": 469.0,
567
+ "completions/mean_length": 373.00830078125,
568
+ "completions/mean_terminated_length": 373.00830078125,
569
+ "completions/min_length": 310.0,
570
+ "completions/min_terminated_length": 310.0,
571
+ "entropy": 0.5138954482972622,
572
+ "epoch": 0.6086956521739131,
573
+ "frac_reward_zero_std": 0.0,
574
+ "grad_norm": 0.018577199260664073,
575
+ "learning_rate": 1e-05,
576
+ "loss": 0.0001,
577
+ "num_tokens": 39623579.0,
578
+ "reward": 3.4861793518066406,
579
+ "reward_std": 0.01453761849552393,
580
+ "rewards/ngram_repetition2/mean": 0.9885779619216919,
581
+ "rewards/ngram_repetition2/std": 0.008892485871911049,
582
+ "rewards/ngram_repetition3/mean": 0.9986856579780579,
583
+ "rewards/ngram_repetition3/std": 0.004819902591407299,
584
+ "rewards/symbolic_reward_accuracy/mean": 0.76318359375,
585
+ "rewards/symbolic_reward_accuracy/std": 0.42523249983787537,
586
+ "rewards/symbolic_reward_partial_score/mean": 0.9302164316177368,
587
+ "rewards/symbolic_reward_partial_score/std": 0.14360538125038147,
588
+ "rewards/tag_count_reward/mean": 1.0,
589
+ "rewards/tag_count_reward/std": 0.0,
590
+ "rewards/thinking_answer_ratio_reward/mean": 0.9723219871520996,
591
+ "rewards/thinking_answer_ratio_reward/std": 0.006340987980365753,
592
+ "sampling/importance_sampling_ratio/max": 2.0,
593
+ "sampling/importance_sampling_ratio/mean": 1.1633408069610596,
594
+ "sampling/importance_sampling_ratio/min": 9.12318591872463e-06,
595
+ "sampling/sampling_logp_difference/max": 11.604691505432129,
596
+ "sampling/sampling_logp_difference/mean": 0.2456064522266388,
597
+ "step": 56
598
+ },
599
+ {
600
+ "clip_ratio/high_max": 0.26953125,
601
+ "clip_ratio/high_mean": 0.17626953125,
602
+ "clip_ratio/low_mean": 0.18798828125,
603
+ "clip_ratio/low_min": 0.08984375,
604
+ "clip_ratio/region_mean": 0.3642578125,
605
+ "completions/clipped_ratio": 0.0009765625,
606
+ "completions/max_length": 3072.0,
607
+ "completions/max_terminated_length": 473.0,
608
+ "completions/mean_length": 383.15380859375,
609
+ "completions/mean_terminated_length": 380.5254211425781,
610
+ "completions/min_length": 303.0,
611
+ "completions/min_terminated_length": 303.0,
612
+ "entropy": 0.5303183943033218,
613
+ "epoch": 0.6521739130434783,
614
+ "frac_reward_zero_std": 0.0,
615
+ "grad_norm": 0.023767814234547865,
616
+ "learning_rate": 1e-05,
617
+ "loss": 0.0032,
618
+ "num_tokens": 42464534.0,
619
+ "reward": 3.4200026988983154,
620
+ "reward_std": 0.044837385416030884,
621
+ "rewards/ngram_repetition2/mean": 0.9886395931243896,
622
+ "rewards/ngram_repetition2/std": 0.026615649461746216,
623
+ "rewards/ngram_repetition3/mean": 0.9978044629096985,
624
+ "rewards/ngram_repetition3/std": 0.02556409314274788,
625
+ "rewards/symbolic_reward_accuracy/mean": 0.72705078125,
626
+ "rewards/symbolic_reward_accuracy/std": 0.4455837607383728,
627
+ "rewards/symbolic_reward_partial_score/mean": 0.9368082284927368,
628
+ "rewards/symbolic_reward_partial_score/std": 0.12158261984586716,
629
+ "rewards/tag_count_reward/mean": 0.99951171875,
630
+ "rewards/tag_count_reward/std": 0.015621182508766651,
631
+ "rewards/thinking_answer_ratio_reward/mean": 0.971657395362854,
632
+ "rewards/thinking_answer_ratio_reward/std": 0.031062643975019455,
633
+ "sampling/importance_sampling_ratio/max": 2.0,
634
+ "sampling/importance_sampling_ratio/mean": 1.1663137674331665,
635
+ "sampling/importance_sampling_ratio/min": 0.00022333291417453438,
636
+ "sampling/sampling_logp_difference/max": 8.40684700012207,
637
+ "sampling/sampling_logp_difference/mean": 0.2500694990158081,
638
+ "step": 60
639
+ },
640
+ {
641
+ "clip_ratio/high_max": 0.2734375,
642
+ "clip_ratio/high_mean": 0.1826171875,
643
+ "clip_ratio/low_mean": 0.1728515625,
644
+ "clip_ratio/low_min": 0.0625,
645
+ "clip_ratio/region_mean": 0.35546875,
646
+ "completions/clipped_ratio": 0.00048828125,
647
+ "completions/max_length": 3072.0,
648
+ "completions/max_terminated_length": 539.0,
649
+ "completions/mean_length": 386.85986328125,
650
+ "completions/mean_terminated_length": 385.5481262207031,
651
+ "completions/min_length": 326.0,
652
+ "completions/min_terminated_length": 326.0,
653
+ "entropy": 0.5535794571042061,
654
+ "epoch": 0.6956521739130435,
655
+ "frac_reward_zero_std": 0.0,
656
+ "grad_norm": 0.033414142847404786,
657
+ "learning_rate": 1e-05,
658
+ "loss": 0.0016,
659
+ "num_tokens": 45319415.0,
660
+ "reward": 3.5498650074005127,
661
+ "reward_std": 0.044402044266462326,
662
+ "rewards/ngram_repetition2/mean": 0.9896550178527832,
663
+ "rewards/ngram_repetition2/std": 0.01944366842508316,
664
+ "rewards/ngram_repetition3/mean": 0.998386025428772,
665
+ "rewards/ngram_repetition3/std": 0.017801163718104362,
666
+ "rewards/symbolic_reward_accuracy/mean": 0.78857421875,
667
+ "rewards/symbolic_reward_accuracy/std": 0.40841934084892273,
668
+ "rewards/symbolic_reward_partial_score/mean": 0.943359375,
669
+ "rewards/symbolic_reward_partial_score/std": 0.1245667040348053,
670
+ "rewards/tag_count_reward/mean": 0.999755859375,
671
+ "rewards/tag_count_reward/std": 0.011048543266952038,
672
+ "rewards/thinking_answer_ratio_reward/mean": 0.9720906019210815,
673
+ "rewards/thinking_answer_ratio_reward/std": 0.022436225786805153,
674
+ "sampling/importance_sampling_ratio/max": 2.0,
675
+ "sampling/importance_sampling_ratio/mean": 1.1719954013824463,
676
+ "sampling/importance_sampling_ratio/min": 3.318129529361613e-05,
677
+ "sampling/sampling_logp_difference/max": 10.31352424621582,
678
+ "sampling/sampling_logp_difference/mean": 0.25759539008140564,
679
+ "step": 64
680
+ },
681
+ {
682
+ "clip_ratio/high_max": 0.3203125,
683
+ "clip_ratio/high_mean": 0.21630859375,
684
+ "clip_ratio/low_mean": 0.16748046875,
685
+ "clip_ratio/low_min": 0.0625,
686
+ "clip_ratio/region_mean": 0.3837890625,
687
+ "completions/clipped_ratio": 0.0,
688
+ "completions/max_length": 513.0,
689
+ "completions/max_terminated_length": 513.0,
690
+ "completions/mean_length": 394.04443359375,
691
+ "completions/mean_terminated_length": 394.04443359375,
692
+ "completions/min_length": 325.0,
693
+ "completions/min_terminated_length": 325.0,
694
+ "entropy": 0.5683293081820011,
695
+ "epoch": 0.7391304347826086,
696
+ "frac_reward_zero_std": 0.0,
697
+ "grad_norm": 0.02077882386260629,
698
+ "learning_rate": 1e-05,
699
+ "loss": -0.0001,
700
+ "num_tokens": 48182674.0,
701
+ "reward": 3.4722495079040527,
702
+ "reward_std": 0.027738399803638458,
703
+ "rewards/ngram_repetition2/mean": 0.990430474281311,
704
+ "rewards/ngram_repetition2/std": 0.008477425202727318,
705
+ "rewards/ngram_repetition3/mean": 0.998767614364624,
706
+ "rewards/ngram_repetition3/std": 0.004312796052545309,
707
+ "rewards/symbolic_reward_accuracy/mean": 0.75390625,
708
+ "rewards/symbolic_reward_accuracy/std": 0.43083900213241577,
709
+ "rewards/symbolic_reward_partial_score/mean": 0.934814453125,
710
+ "rewards/symbolic_reward_partial_score/std": 0.1286529004573822,
711
+ "rewards/tag_count_reward/mean": 1.0,
712
+ "rewards/tag_count_reward/std": 0.0,
713
+ "rewards/thinking_answer_ratio_reward/mean": 0.9730780124664307,
714
+ "rewards/thinking_answer_ratio_reward/std": 0.006791951600462198,
715
+ "sampling/importance_sampling_ratio/max": 2.0,
716
+ "sampling/importance_sampling_ratio/mean": 1.174986720085144,
717
+ "sampling/importance_sampling_ratio/min": 5.481481275637634e-05,
718
+ "sampling/sampling_logp_difference/max": 9.81155014038086,
719
+ "sampling/sampling_logp_difference/mean": 0.2610657513141632,
720
+ "step": 68
721
+ },
722
+ {
723
+ "clip_ratio/high_max": 0.23046875,
724
+ "clip_ratio/high_mean": 0.13134765625,
725
+ "clip_ratio/low_mean": 0.203125,
726
+ "clip_ratio/low_min": 0.09765625,
727
+ "clip_ratio/region_mean": 0.33447265625,
728
+ "completions/clipped_ratio": 0.0,
729
+ "completions/max_length": 525.0,
730
+ "completions/max_terminated_length": 525.0,
731
+ "completions/mean_length": 402.0390625,
732
+ "completions/mean_terminated_length": 402.0390625,
733
+ "completions/min_length": 305.0,
734
+ "completions/min_terminated_length": 305.0,
735
+ "entropy": 0.5930491574108601,
736
+ "epoch": 0.782608695652174,
737
+ "frac_reward_zero_std": 0.0,
738
+ "grad_norm": 0.0719362481523671,
739
+ "learning_rate": 1e-05,
740
+ "loss": 0.0002,
741
+ "num_tokens": 51030626.0,
742
+ "reward": 3.507338047027588,
743
+ "reward_std": 0.07821064442396164,
744
+ "rewards/ngram_repetition2/mean": 0.9908634424209595,
745
+ "rewards/ngram_repetition2/std": 0.006825427990406752,
746
+ "rewards/ngram_repetition3/mean": 0.9992104768753052,
747
+ "rewards/ngram_repetition3/std": 0.0024387796875089407,
748
+ "rewards/symbolic_reward_accuracy/mean": 0.7705078125,
749
+ "rewards/symbolic_reward_accuracy/std": 0.4206089675426483,
750
+ "rewards/symbolic_reward_partial_score/mean": 0.9366861581802368,
751
+ "rewards/symbolic_reward_partial_score/std": 0.12974172830581665,
752
+ "rewards/tag_count_reward/mean": 1.0,
753
+ "rewards/tag_count_reward/std": 0.0,
754
+ "rewards/thinking_answer_ratio_reward/mean": 0.9735254049301147,
755
+ "rewards/thinking_answer_ratio_reward/std": 0.006129096262156963,
756
+ "sampling/importance_sampling_ratio/max": 2.0,
757
+ "sampling/importance_sampling_ratio/mean": 1.1793080568313599,
758
+ "sampling/importance_sampling_ratio/min": 8.93013350378169e-07,
759
+ "sampling/sampling_logp_difference/max": 13.928664207458496,
760
+ "sampling/sampling_logp_difference/mean": 0.2646637558937073,
761
+ "step": 72
762
+ },
763
+ {
764
+ "clip_ratio/high_max": 0.16796875,
765
+ "clip_ratio/high_mean": 0.10595703125,
766
+ "clip_ratio/low_mean": 0.2255859375,
767
+ "clip_ratio/low_min": 0.11328125,
768
+ "clip_ratio/region_mean": 0.33154296875,
769
+ "completions/clipped_ratio": 0.00634765625,
770
+ "completions/max_length": 3072.0,
771
+ "completions/max_terminated_length": 664.0,
772
+ "completions/mean_length": 425.64794921875,
773
+ "completions/mean_terminated_length": 408.74249267578125,
774
+ "completions/min_length": 326.0,
775
+ "completions/min_terminated_length": 326.0,
776
+ "entropy": 0.6297206245362759,
777
+ "epoch": 0.8260869565217391,
778
+ "frac_reward_zero_std": 0.0,
779
+ "grad_norm": 0.17694802563627895,
780
+ "learning_rate": 1e-05,
781
+ "loss": 0.0207,
782
+ "num_tokens": 53958609.0,
783
+ "reward": 3.434380054473877,
784
+ "reward_std": 0.15169459581375122,
785
+ "rewards/ngram_repetition2/mean": 0.986931562423706,
786
+ "rewards/ngram_repetition2/std": 0.04639098048210144,
787
+ "rewards/ngram_repetition3/mean": 0.9962227940559387,
788
+ "rewards/ngram_repetition3/std": 0.046196915209293365,
789
+ "rewards/symbolic_reward_accuracy/mean": 0.74072265625,
790
+ "rewards/symbolic_reward_accuracy/std": 0.4383451044559479,
791
+ "rewards/symbolic_reward_partial_score/mean": 0.9265950918197632,
792
+ "rewards/symbolic_reward_partial_score/std": 0.153251513838768,
793
+ "rewards/tag_count_reward/mean": 0.996826171875,
794
+ "rewards/tag_count_reward/std": 0.03971915319561958,
795
+ "rewards/thinking_answer_ratio_reward/mean": 0.9682024717330933,
796
+ "rewards/thinking_answer_ratio_reward/std": 0.07753153145313263,
797
+ "sampling/importance_sampling_ratio/max": 2.0,
798
+ "sampling/importance_sampling_ratio/mean": 1.1941275596618652,
799
+ "sampling/importance_sampling_ratio/min": 4.905196692561731e-05,
800
+ "sampling/sampling_logp_difference/max": 9.922630310058594,
801
+ "sampling/sampling_logp_difference/mean": 0.27940261363983154,
802
+ "step": 76
803
+ },
804
+ {
805
+ "clip_ratio/high_max": 0.2109375,
806
+ "clip_ratio/high_mean": 0.125,
807
+ "clip_ratio/low_mean": 0.1884765625,
808
+ "clip_ratio/low_min": 0.07421875,
809
+ "clip_ratio/region_mean": 0.3134765625,
810
+ "completions/clipped_ratio": 0.025390625,
811
+ "completions/max_length": 3072.0,
812
+ "completions/max_terminated_length": 846.0,
813
+ "completions/mean_length": 474.76806640625,
814
+ "completions/mean_terminated_length": 407.1047058105469,
815
+ "completions/min_length": 308.0,
816
+ "completions/min_terminated_length": 308.0,
817
+ "entropy": 0.6249676272273064,
818
+ "epoch": 0.8695652173913043,
819
+ "frac_reward_zero_std": 0.0,
820
+ "grad_norm": 0.036438229713036584,
821
+ "learning_rate": 1e-05,
822
+ "loss": 0.0733,
823
+ "num_tokens": 56952342.0,
824
+ "reward": 3.3732757568359375,
825
+ "reward_std": 0.3173109292984009,
826
+ "rewards/ngram_repetition2/mean": 0.9734116792678833,
827
+ "rewards/ngram_repetition2/std": 0.09751972556114197,
828
+ "rewards/ngram_repetition3/mean": 0.9850020408630371,
829
+ "rewards/ngram_repetition3/std": 0.09716872870922089,
830
+ "rewards/symbolic_reward_accuracy/mean": 0.73046875,
831
+ "rewards/symbolic_reward_accuracy/std": 0.4438246786594391,
832
+ "rewards/symbolic_reward_partial_score/mean": 0.8959553837776184,
833
+ "rewards/symbolic_reward_partial_score/std": 0.21540075540542603,
834
+ "rewards/tag_count_reward/mean": 0.9873046875,
835
+ "rewards/tag_count_reward/std": 0.07867342233657837,
836
+ "rewards/thinking_answer_ratio_reward/mean": 0.9493899941444397,
837
+ "rewards/thinking_answer_ratio_reward/std": 0.15333302319049835,
838
+ "sampling/importance_sampling_ratio/max": 2.0,
839
+ "sampling/importance_sampling_ratio/mean": 1.1926825046539307,
840
+ "sampling/importance_sampling_ratio/min": 1.8732294847723097e-05,
841
+ "sampling/sampling_logp_difference/max": 10.885261535644531,
842
+ "sampling/sampling_logp_difference/mean": 0.27355605363845825,
843
+ "step": 80
844
+ },
845
+ {
846
+ "clip_ratio/high_max": 0.16015625,
847
+ "clip_ratio/high_mean": 0.08154296875,
848
+ "clip_ratio/low_mean": 0.1630859375,
849
+ "clip_ratio/low_min": 0.0546875,
850
+ "clip_ratio/region_mean": 0.24462890625,
851
+ "completions/clipped_ratio": 0.041015625,
852
+ "completions/max_length": 3072.0,
853
+ "completions/max_terminated_length": 1910.0,
854
+ "completions/mean_length": 513.650390625,
855
+ "completions/mean_terminated_length": 404.23016357421875,
856
+ "completions/min_length": 283.0,
857
+ "completions/min_terminated_length": 283.0,
858
+ "entropy": 0.6995769254863262,
859
+ "epoch": 0.9130434782608695,
860
+ "frac_reward_zero_std": 0.0,
861
+ "grad_norm": 0.3126366897938619,
862
+ "learning_rate": 1e-05,
863
+ "loss": 0.1153,
864
+ "num_tokens": 60057386.0,
865
+ "reward": 3.4059977531433105,
866
+ "reward_std": 0.4714565873146057,
867
+ "rewards/ngram_repetition2/mean": 0.9679386615753174,
868
+ "rewards/ngram_repetition2/std": 0.09275452047586441,
869
+ "rewards/ngram_repetition3/mean": 0.984743595123291,
870
+ "rewards/ngram_repetition3/std": 0.09202881902456284,
871
+ "rewards/symbolic_reward_accuracy/mean": 0.7451171875,
872
+ "rewards/symbolic_reward_accuracy/std": 0.4359017610549927,
873
+ "rewards/symbolic_reward_partial_score/mean": 0.9079182744026184,
874
+ "rewards/symbolic_reward_partial_score/std": 0.21840326488018036,
875
+ "rewards/tag_count_reward/mean": 0.97900390625,
876
+ "rewards/tag_count_reward/std": 0.10031013935804367,
877
+ "rewards/thinking_answer_ratio_reward/mean": 0.9314362406730652,
878
+ "rewards/thinking_answer_ratio_reward/std": 0.19521328806877136,
879
+ "sampling/importance_sampling_ratio/max": 2.0,
880
+ "sampling/importance_sampling_ratio/mean": 1.2058464288711548,
881
+ "sampling/importance_sampling_ratio/min": 0.00020593531371559948,
882
+ "sampling/sampling_logp_difference/max": 8.487948417663574,
883
+ "sampling/sampling_logp_difference/mean": 0.2925470471382141,
884
+ "step": 84
885
+ },
886
+ {
887
+ "clip_ratio/high_max": 0.04296875,
888
+ "clip_ratio/high_mean": 0.013671875,
889
+ "clip_ratio/low_mean": 0.30078125,
890
+ "clip_ratio/low_min": 0.16796875,
891
+ "clip_ratio/region_mean": 0.314453125,
892
+ "completions/clipped_ratio": 0.00146484375,
893
+ "completions/max_length": 3072.0,
894
+ "completions/max_terminated_length": 578.0,
895
+ "completions/mean_length": 382.1064453125,
896
+ "completions/mean_terminated_length": 378.160400390625,
897
+ "completions/min_length": 266.0,
898
+ "completions/min_terminated_length": 266.0,
899
+ "entropy": 0.7504777312278748,
900
+ "epoch": 0.9565217391304348,
901
+ "frac_reward_zero_std": 0.0,
902
+ "grad_norm": 0.08070835974712572,
903
+ "learning_rate": 1e-05,
904
+ "loss": 0.0066,
905
+ "num_tokens": 62905700.0,
906
+ "reward": 3.419804811477661,
907
+ "reward_std": 0.09055649489164352,
908
+ "rewards/ngram_repetition2/mean": 0.980319619178772,
909
+ "rewards/ngram_repetition2/std": 0.011525592766702175,
910
+ "rewards/ngram_repetition3/mean": 0.9975647330284119,
911
+ "rewards/ngram_repetition3/std": 0.005651315674185753,
912
+ "rewards/symbolic_reward_accuracy/mean": 0.732421875,
913
+ "rewards/symbolic_reward_accuracy/std": 0.4428044855594635,
914
+ "rewards/symbolic_reward_partial_score/mean": 0.9267171025276184,
915
+ "rewards/symbolic_reward_partial_score/std": 0.14161133766174316,
916
+ "rewards/tag_count_reward/mean": 0.998779296875,
917
+ "rewards/tag_count_reward/std": 0.02468114346265793,
918
+ "rewards/thinking_answer_ratio_reward/mean": 0.9685767292976379,
919
+ "rewards/thinking_answer_ratio_reward/std": 0.04834046587347984,
920
+ "sampling/importance_sampling_ratio/max": 2.0,
921
+ "sampling/importance_sampling_ratio/mean": 1.249918818473816,
922
+ "sampling/importance_sampling_ratio/min": 0.0003460382577031851,
923
+ "sampling/sampling_logp_difference/max": 7.968961238861084,
924
+ "sampling/sampling_logp_difference/mean": 0.3476927876472473,
925
+ "step": 88
926
+ },
927
+ {
928
+ "clip_ratio/high_max": 0.140625,
929
+ "clip_ratio/high_mean": 0.07470703125,
930
+ "clip_ratio/low_mean": 0.2451171875,
931
+ "clip_ratio/low_min": 0.12890625,
932
+ "clip_ratio/region_mean": 0.31982421875,
933
+ "completions/clipped_ratio": 0.00146484375,
934
+ "completions/max_length": 3072.0,
935
+ "completions/max_terminated_length": 579.0,
936
+ "completions/mean_length": 347.271484375,
937
+ "completions/mean_terminated_length": 343.2743225097656,
938
+ "completions/min_length": 187.0,
939
+ "completions/min_terminated_length": 187.0,
940
+ "entropy": 0.8337125517427921,
941
+ "epoch": 1.0,
942
+ "frac_reward_zero_std": 0.0,
943
+ "grad_norm": 0.09077319995516756,
944
+ "learning_rate": 1e-05,
945
+ "loss": 0.0058,
946
+ "num_tokens": 65689008.0,
947
+ "reward": 3.366065502166748,
948
+ "reward_std": 0.12272138148546219,
949
+ "rewards/ngram_repetition2/mean": 0.9778505563735962,
950
+ "rewards/ngram_repetition2/std": 0.02180611714720726,
951
+ "rewards/ngram_repetition3/mean": 0.9966850280761719,
952
+ "rewards/ngram_repetition3/std": 0.01893492229282856,
953
+ "rewards/symbolic_reward_accuracy/mean": 0.7119140625,
954
+ "rewards/symbolic_reward_accuracy/std": 0.4529819190502167,
955
+ "rewards/symbolic_reward_partial_score/mean": 0.91357421875,
956
+ "rewards/symbolic_reward_partial_score/std": 0.1592041552066803,
957
+ "rewards/tag_count_reward/mean": 0.999267578125,
958
+ "rewards/tag_count_reward/std": 0.019127286970615387,
959
+ "rewards/thinking_answer_ratio_reward/mean": 0.9650210738182068,
960
+ "rewards/thinking_answer_ratio_reward/std": 0.03827888146042824,
961
+ "sampling/importance_sampling_ratio/max": 2.0,
962
+ "sampling/importance_sampling_ratio/mean": 1.254873275756836,
963
+ "sampling/importance_sampling_ratio/min": 0.00030355059425346553,
964
+ "sampling/sampling_logp_difference/max": 8.09996223449707,
965
+ "sampling/sampling_logp_difference/mean": 0.347744882106781,
966
+ "step": 92
967
+ },
968
+ {
969
+ "epoch": 1.0,
970
+ "eval_clip_ratio/high_max": 0.0,
971
+ "eval_clip_ratio/high_mean": 0.0,
972
+ "eval_clip_ratio/low_mean": 0.0,
973
+ "eval_clip_ratio/low_min": 0.0,
974
+ "eval_clip_ratio/region_mean": 0.0,
975
+ "eval_completions/clipped_ratio": 0.0008223684210526315,
976
+ "eval_completions/max_length": 730.6842105263158,
977
+ "eval_completions/max_terminated_length": 454.36842105263156,
978
+ "eval_completions/mean_length": 323.31825657894734,
979
+ "eval_completions/mean_terminated_length": 321.05658762078536,
980
+ "eval_completions/min_length": 223.05263157894737,
981
+ "eval_completions/min_terminated_length": 223.05263157894737,
982
+ "eval_entropy": 0.8378904399118925,
983
+ "eval_frac_reward_zero_std": 0.0,
984
+ "eval_loss": 0.0008370681316591799,
985
+ "eval_num_tokens": 65689008.0,
986
+ "eval_reward": 3.2917319975401225,
987
+ "eval_reward_std": 0.11801875697749079,
988
+ "eval_rewards/ngram_repetition2/mean": 0.9756150371149966,
989
+ "eval_rewards/ngram_repetition2/std": 0.01365309997804855,
990
+ "eval_rewards/ngram_repetition3/mean": 0.9962926067804035,
991
+ "eval_rewards/ngram_repetition3/std": 0.006524833691257395,
992
+ "eval_rewards/symbolic_reward_accuracy/mean": 0.678453947368421,
993
+ "eval_rewards/symbolic_reward_accuracy/std": 0.4427479756505866,
994
+ "eval_rewards/symbolic_reward_partial_score/mean": 0.9067297138665852,
995
+ "eval_rewards/symbolic_reward_partial_score/std": 0.1448917659489732,
996
+ "eval_rewards/tag_count_reward/mean": 0.998766447368421,
997
+ "eval_rewards/tag_count_reward/std": 0.012580533757021553,
998
+ "eval_rewards/thinking_answer_ratio_reward/mean": 0.9608869458499708,
999
+ "eval_rewards/thinking_answer_ratio_reward/std": 0.03243385211221481,
1000
+ "eval_runtime": 180.375,
1001
+ "eval_samples_per_second": 0.832,
1002
+ "eval_sampling/importance_sampling_ratio/max": 2.0,
1003
+ "eval_sampling/importance_sampling_ratio/mean": 1.2766095650823492,
1004
+ "eval_sampling/importance_sampling_ratio/min": 0.0030842775350289516,
1005
+ "eval_sampling/sampling_logp_difference/max": 5.888615081184788,
1006
+ "eval_sampling/sampling_logp_difference/mean": 0.37607322868547943,
1007
+ "eval_steps_per_second": 0.011,
1008
+ "step": 92
1009
+ },
1010
+ {
1011
+ "clip_ratio/high_max": 0.1796875,
1012
+ "clip_ratio/high_mean": 0.08984375,
1013
+ "clip_ratio/low_mean": 0.23876953125,
1014
+ "clip_ratio/low_min": 0.10546875,
1015
+ "clip_ratio/region_mean": 0.32861328125,
1016
+ "completions/clipped_ratio": 0.0,
1017
+ "completions/max_length": 484.0,
1018
+ "completions/max_terminated_length": 484.0,
1019
+ "completions/mean_length": 318.796875,
1020
+ "completions/mean_terminated_length": 318.796875,
1021
+ "completions/min_length": 188.0,
1022
+ "completions/min_terminated_length": 188.0,
1023
+ "entropy": 0.8296824619174004,
1024
+ "epoch": 1.0434782608695652,
1025
+ "frac_reward_zero_std": 0.0,
1026
+ "grad_norm": 0.035556589091200325,
1027
+ "learning_rate": 1e-05,
1028
+ "loss": 0.0004,
1029
+ "num_tokens": 68391824.0,
1030
+ "reward": 3.526369333267212,
1031
+ "reward_std": 0.10698950290679932,
1032
+ "rewards/ngram_repetition2/mean": 0.976723313331604,
1033
+ "rewards/ngram_repetition2/std": 0.013903986662626266,
1034
+ "rewards/ngram_repetition3/mean": 0.9966170787811279,
1035
+ "rewards/ngram_repetition3/std": 0.007368884980678558,
1036
+ "rewards/symbolic_reward_accuracy/mean": 0.779296875,
1037
+ "rewards/symbolic_reward_accuracy/std": 0.414821982383728,
1038
+ "rewards/symbolic_reward_partial_score/mean": 0.9383952021598816,
1039
+ "rewards/symbolic_reward_partial_score/std": 0.13407477736473083,
1040
+ "rewards/tag_count_reward/mean": 1.0,
1041
+ "rewards/tag_count_reward/std": 0.0,
1042
+ "rewards/thinking_answer_ratio_reward/mean": 0.9647086262702942,
1043
+ "rewards/thinking_answer_ratio_reward/std": 0.010662911459803581,
1044
+ "sampling/importance_sampling_ratio/max": 2.0,
1045
+ "sampling/importance_sampling_ratio/mean": 1.277923345565796,
1046
+ "sampling/importance_sampling_ratio/min": 0.0010678451508283615,
1047
+ "sampling/sampling_logp_difference/max": 6.8421125411987305,
1048
+ "sampling/sampling_logp_difference/mean": 0.37708646059036255,
1049
+ "step": 96
1050
+ },
1051
+ {
1052
+ "clip_ratio/high_max": 0.3515625,
1053
+ "clip_ratio/high_mean": 0.21728515625,
1054
+ "clip_ratio/low_mean": 0.1474609375,
1055
+ "clip_ratio/low_min": 0.05078125,
1056
+ "clip_ratio/region_mean": 0.36474609375,
1057
+ "completions/clipped_ratio": 0.00048828125,
1058
+ "completions/max_length": 3072.0,
1059
+ "completions/max_terminated_length": 474.0,
1060
+ "completions/mean_length": 312.2705078125,
1061
+ "completions/mean_terminated_length": 310.9223327636719,
1062
+ "completions/min_length": 199.0,
1063
+ "completions/min_terminated_length": 199.0,
1064
+ "entropy": 0.8829836808145046,
1065
+ "epoch": 1.0869565217391304,
1066
+ "frac_reward_zero_std": 0.0,
1067
+ "grad_norm": 0.026459033753815097,
1068
+ "learning_rate": 1e-05,
1069
+ "loss": 0.0015,
1070
+ "num_tokens": 71097114.0,
1071
+ "reward": 3.375732421875,
1072
+ "reward_std": 0.16172271966934204,
1073
+ "rewards/ngram_repetition2/mean": 0.9744597673416138,
1074
+ "rewards/ngram_repetition2/std": 0.013245878741145134,
1075
+ "rewards/ngram_repetition3/mean": 0.996464729309082,
1076
+ "rewards/ngram_repetition3/std": 0.006197268608957529,
1077
+ "rewards/symbolic_reward_accuracy/mean": 0.71337890625,
1078
+ "rewards/symbolic_reward_accuracy/std": 0.45229339599609375,
1079
+ "rewards/symbolic_reward_partial_score/mean": 0.9201253056526184,
1080
+ "rewards/symbolic_reward_partial_score/std": 0.14969654381275177,
1081
+ "rewards/tag_count_reward/mean": 0.99951171875,
1082
+ "rewards/tag_count_reward/std": 0.015621182508766651,
1083
+ "rewards/thinking_answer_ratio_reward/mean": 0.9628262519836426,
1084
+ "rewards/thinking_answer_ratio_reward/std": 0.03255803510546684,
1085
+ "sampling/importance_sampling_ratio/max": 2.0,
1086
+ "sampling/importance_sampling_ratio/mean": 1.280361533164978,
1087
+ "sampling/importance_sampling_ratio/min": 0.000973947171587497,
1088
+ "sampling/sampling_logp_difference/max": 6.9341535568237305,
1089
+ "sampling/sampling_logp_difference/mean": 0.3825719654560089,
1090
+ "step": 100
1091
+ },
1092
+ {
1093
+ "clip_ratio/high_max": 0.2421875,
1094
+ "clip_ratio/high_mean": 0.1552734375,
1095
+ "clip_ratio/low_mean": 0.21044921875,
1096
+ "clip_ratio/low_min": 0.1015625,
1097
+ "clip_ratio/region_mean": 0.36572265625,
1098
+ "completions/clipped_ratio": 0.0,
1099
+ "completions/max_length": 507.0,
1100
+ "completions/max_terminated_length": 507.0,
1101
+ "completions/mean_length": 305.24658203125,
1102
+ "completions/mean_terminated_length": 305.24658203125,
1103
+ "completions/min_length": 184.0,
1104
+ "completions/min_terminated_length": 184.0,
1105
+ "entropy": 0.93367725238204,
1106
+ "epoch": 1.1304347826086956,
1107
+ "frac_reward_zero_std": 0.0,
1108
+ "grad_norm": 0.023622811381027897,
1109
+ "learning_rate": 1e-05,
1110
+ "loss": -0.0013,
1111
+ "num_tokens": 73753171.0,
1112
+ "reward": 3.5650792121887207,
1113
+ "reward_std": 0.07873280346393585,
1114
+ "rewards/ngram_repetition2/mean": 0.9779536724090576,
1115
+ "rewards/ngram_repetition2/std": 0.011236137710511684,
1116
+ "rewards/ngram_repetition3/mean": 0.997450590133667,
1117
+ "rewards/ngram_repetition3/std": 0.004056216217577457,
1118
+ "rewards/symbolic_reward_accuracy/mean": 0.79541015625,
1119
+ "rewards/symbolic_reward_accuracy/std": 0.40350010991096497,
1120
+ "rewards/symbolic_reward_partial_score/mean": 0.9448649287223816,
1121
+ "rewards/symbolic_reward_partial_score/std": 0.12454802542924881,
1122
+ "rewards/tag_count_reward/mean": 1.0,
1123
+ "rewards/tag_count_reward/std": 0.0,
1124
+ "rewards/thinking_answer_ratio_reward/mean": 0.9639977812767029,
1125
+ "rewards/thinking_answer_ratio_reward/std": 0.010575964115560055,
1126
+ "sampling/importance_sampling_ratio/max": 2.0,
1127
+ "sampling/importance_sampling_ratio/mean": 1.2900059223175049,
1128
+ "sampling/importance_sampling_ratio/min": 0.00017328630201518536,
1129
+ "sampling/sampling_logp_difference/max": 8.660565376281738,
1130
+ "sampling/sampling_logp_difference/mean": 0.3951420187950134,
1131
+ "step": 104
1132
+ },
1133
+ {
1134
+ "clip_ratio/high_max": 0.3046875,
1135
+ "clip_ratio/high_mean": 0.193359375,
1136
+ "clip_ratio/low_mean": 0.1904296875,
1137
+ "clip_ratio/low_min": 0.07421875,
1138
+ "clip_ratio/region_mean": 0.3837890625,
1139
+ "completions/clipped_ratio": 0.0,
1140
+ "completions/max_length": 538.0,
1141
+ "completions/max_terminated_length": 538.0,
1142
+ "completions/mean_length": 312.146484375,
1143
+ "completions/mean_terminated_length": 312.146484375,
1144
+ "completions/min_length": 181.0,
1145
+ "completions/min_terminated_length": 181.0,
1146
+ "entropy": 1.0210995934903622,
1147
+ "epoch": 1.1739130434782608,
1148
+ "frac_reward_zero_std": 0.0,
1149
+ "grad_norm": 0.028388966774426528,
1150
+ "learning_rate": 1e-05,
1151
+ "loss": -0.0009,
1152
+ "num_tokens": 76420191.0,
1153
+ "reward": 3.539149761199951,
1154
+ "reward_std": 0.10342732071876526,
1155
+ "rewards/ngram_repetition2/mean": 0.9775739908218384,
1156
+ "rewards/ngram_repetition2/std": 0.011158421635627747,
1157
+ "rewards/ngram_repetition3/mean": 0.99737548828125,
1158
+ "rewards/ngram_repetition3/std": 0.0038673817180097103,
1159
+ "rewards/symbolic_reward_accuracy/mean": 0.78369140625,
1160
+ "rewards/symbolic_reward_accuracy/std": 0.4118276536464691,
1161
+ "rewards/symbolic_reward_partial_score/mean": 0.942626953125,
1162
+ "rewards/symbolic_reward_partial_score/std": 0.12709258496761322,
1163
+ "rewards/tag_count_reward/mean": 0.999755859375,
1164
+ "rewards/tag_count_reward/std": 0.011048543266952038,
1165
+ "rewards/thinking_answer_ratio_reward/mean": 0.9634621739387512,
1166
+ "rewards/thinking_answer_ratio_reward/std": 0.02338651940226555,
1167
+ "sampling/importance_sampling_ratio/max": 2.0,
1168
+ "sampling/importance_sampling_ratio/mean": 1.30389404296875,
1169
+ "sampling/importance_sampling_ratio/min": 0.0012677250197157264,
1170
+ "sampling/sampling_logp_difference/max": 6.670531272888184,
1171
+ "sampling/sampling_logp_difference/mean": 0.4129188656806946,
1172
+ "step": 108
1173
+ },
1174
+ {
1175
+ "clip_ratio/high_max": 0.26171875,
1176
+ "clip_ratio/high_mean": 0.15966796875,
1177
+ "clip_ratio/low_mean": 0.212890625,
1178
+ "clip_ratio/low_min": 0.10546875,
1179
+ "clip_ratio/region_mean": 0.37255859375,
1180
+ "completions/clipped_ratio": 0.00048828125,
1181
+ "completions/max_length": 3072.0,
1182
+ "completions/max_terminated_length": 1036.0,
1183
+ "completions/mean_length": 323.71337890625,
1184
+ "completions/mean_terminated_length": 322.37078857421875,
1185
+ "completions/min_length": 186.0,
1186
+ "completions/min_terminated_length": 186.0,
1187
+ "entropy": 1.1148979514837265,
1188
+ "epoch": 1.2173913043478262,
1189
+ "frac_reward_zero_std": 0.0,
1190
+ "grad_norm": 0.025973854193448703,
1191
+ "learning_rate": 1e-05,
1192
+ "loss": 0.0012,
1193
+ "num_tokens": 79101396.0,
1194
+ "reward": 3.5500121116638184,
1195
+ "reward_std": 0.07026512920856476,
1196
+ "rewards/ngram_repetition2/mean": 0.9797195792198181,
1197
+ "rewards/ngram_repetition2/std": 0.011294333264231682,
1198
+ "rewards/ngram_repetition3/mean": 0.9979097843170166,
1199
+ "rewards/ngram_repetition3/std": 0.003578017931431532,
1200
+ "rewards/symbolic_reward_accuracy/mean": 0.7890625,
1201
+ "rewards/symbolic_reward_accuracy/std": 0.408073753118515,
1202
+ "rewards/symbolic_reward_partial_score/mean": 0.9429525136947632,
1203
+ "rewards/symbolic_reward_partial_score/std": 0.12270573526620865,
1204
+ "rewards/tag_count_reward/mean": 0.99951171875,
1205
+ "rewards/tag_count_reward/std": 0.015621182508766651,
1206
+ "rewards/thinking_answer_ratio_reward/mean": 0.9646666049957275,
1207
+ "rewards/thinking_answer_ratio_reward/std": 0.03140328451991081,
1208
+ "sampling/importance_sampling_ratio/max": 2.0,
1209
+ "sampling/importance_sampling_ratio/mean": 1.3225877285003662,
1210
+ "sampling/importance_sampling_ratio/min": 0.0020175804384052753,
1211
+ "sampling/sampling_logp_difference/max": 6.2058563232421875,
1212
+ "sampling/sampling_logp_difference/mean": 0.43698880076408386,
1213
+ "step": 112
1214
+ },
1215
+ {
1216
+ "clip_ratio/high_max": 0.30859375,
1217
+ "clip_ratio/high_mean": 0.16650390625,
1218
+ "clip_ratio/low_mean": 0.19873046875,
1219
+ "clip_ratio/low_min": 0.0703125,
1220
+ "clip_ratio/region_mean": 0.365234375,
1221
+ "completions/clipped_ratio": 0.0,
1222
+ "completions/max_length": 609.0,
1223
+ "completions/max_terminated_length": 609.0,
1224
+ "completions/mean_length": 321.759765625,
1225
+ "completions/mean_terminated_length": 321.759765625,
1226
+ "completions/min_length": 191.0,
1227
+ "completions/min_terminated_length": 191.0,
1228
+ "entropy": 1.1531813517212868,
1229
+ "epoch": 1.2608695652173914,
1230
+ "frac_reward_zero_std": 0.0,
1231
+ "grad_norm": 0.03531320773224689,
1232
+ "learning_rate": 1e-05,
1233
+ "loss": -0.0021,
1234
+ "num_tokens": 81835624.0,
1235
+ "reward": 3.5205469131469727,
1236
+ "reward_std": 0.06826324760913849,
1237
+ "rewards/ngram_repetition2/mean": 0.9809833765029907,
1238
+ "rewards/ngram_repetition2/std": 0.009805475361645222,
1239
+ "rewards/ngram_repetition3/mean": 0.998246431350708,
1240
+ "rewards/ngram_repetition3/std": 0.003229390596970916,
1241
+ "rewards/symbolic_reward_accuracy/mean": 0.77392578125,
1242
+ "rewards/symbolic_reward_accuracy/std": 0.4183899462223053,
1243
+ "rewards/symbolic_reward_partial_score/mean": 0.9432373046875,
1244
+ "rewards/symbolic_reward_partial_score/std": 0.1203688383102417,
1245
+ "rewards/tag_count_reward/mean": 1.0,
1246
+ "rewards/tag_count_reward/std": 0.0,
1247
+ "rewards/thinking_answer_ratio_reward/mean": 0.9665986895561218,
1248
+ "rewards/thinking_answer_ratio_reward/std": 0.007385050877928734,
1249
+ "sampling/importance_sampling_ratio/max": 2.0,
1250
+ "sampling/importance_sampling_ratio/mean": 1.3322265148162842,
1251
+ "sampling/importance_sampling_ratio/min": 0.0010348489740863442,
1252
+ "sampling/sampling_logp_difference/max": 6.873499870300293,
1253
+ "sampling/sampling_logp_difference/mean": 0.44727569818496704,
1254
+ "step": 116
1255
+ },
1256
+ {
1257
+ "clip_ratio/high_max": 0.2890625,
1258
+ "clip_ratio/high_mean": 0.1962890625,
1259
+ "clip_ratio/low_mean": 0.1875,
1260
+ "clip_ratio/low_min": 0.078125,
1261
+ "clip_ratio/region_mean": 0.3837890625,
1262
+ "completions/clipped_ratio": 0.00048828125,
1263
+ "completions/max_length": 3072.0,
1264
+ "completions/max_terminated_length": 723.0,
1265
+ "completions/mean_length": 349.056640625,
1266
+ "completions/mean_terminated_length": 347.7264404296875,
1267
+ "completions/min_length": 210.0,
1268
+ "completions/min_terminated_length": 210.0,
1269
+ "entropy": 1.2782283127307892,
1270
+ "epoch": 1.3043478260869565,
1271
+ "frac_reward_zero_std": 0.0,
1272
+ "grad_norm": 0.034579763552097215,
1273
+ "learning_rate": 1e-05,
1274
+ "loss": 0.0015,
1275
+ "num_tokens": 84613084.0,
1276
+ "reward": 3.4549241065979004,
1277
+ "reward_std": 0.06410142034292221,
1278
+ "rewards/ngram_repetition2/mean": 0.9802985191345215,
1279
+ "rewards/ngram_repetition2/std": 0.010372255928814411,
1280
+ "rewards/ngram_repetition3/mean": 0.9981052875518799,
1281
+ "rewards/ngram_repetition3/std": 0.0034116564784199,
1282
+ "rewards/symbolic_reward_accuracy/mean": 0.7470703125,
1283
+ "rewards/symbolic_reward_accuracy/std": 0.43479716777801514,
1284
+ "rewards/symbolic_reward_partial_score/mean": 0.9315592050552368,
1285
+ "rewards/symbolic_reward_partial_score/std": 0.14198674261569977,
1286
+ "rewards/tag_count_reward/mean": 0.999755859375,
1287
+ "rewards/tag_count_reward/std": 0.011048543266952038,
1288
+ "rewards/thinking_answer_ratio_reward/mean": 0.9684212803840637,
1289
+ "rewards/thinking_answer_ratio_reward/std": 0.022755270823836327,
1290
+ "sampling/importance_sampling_ratio/max": 2.0,
1291
+ "sampling/importance_sampling_ratio/mean": 1.3524930477142334,
1292
+ "sampling/importance_sampling_ratio/min": 0.0009223732049576938,
1293
+ "sampling/sampling_logp_difference/max": 6.988560676574707,
1294
+ "sampling/sampling_logp_difference/mean": 0.4782055616378784,
1295
+ "step": 120
1296
+ },
1297
+ {
1298
+ "clip_ratio/high_max": 0.25390625,
1299
+ "clip_ratio/high_mean": 0.13916015625,
1300
+ "clip_ratio/low_mean": 0.23193359375,
1301
+ "clip_ratio/low_min": 0.11328125,
1302
+ "clip_ratio/region_mean": 0.37109375,
1303
+ "completions/clipped_ratio": 0.00146484375,
1304
+ "completions/max_length": 3072.0,
1305
+ "completions/max_terminated_length": 644.0,
1306
+ "completions/mean_length": 365.08154296875,
1307
+ "completions/mean_terminated_length": 361.1105041503906,
1308
+ "completions/min_length": 207.0,
1309
+ "completions/min_terminated_length": 207.0,
1310
+ "entropy": 1.3577501401305199,
1311
+ "epoch": 1.3478260869565217,
1312
+ "frac_reward_zero_std": 0.0,
1313
+ "grad_norm": 0.03263029986961485,
1314
+ "learning_rate": 1e-05,
1315
+ "loss": 0.0038,
1316
+ "num_tokens": 87413859.0,
1317
+ "reward": 3.3977296352386475,
1318
+ "reward_std": 0.07217299938201904,
1319
+ "rewards/ngram_repetition2/mean": 0.9815422296524048,
1320
+ "rewards/ngram_repetition2/std": 0.017534635961055756,
1321
+ "rewards/ngram_repetition3/mean": 0.9980996251106262,
1322
+ "rewards/ngram_repetition3/std": 0.014925251714885235,
1323
+ "rewards/symbolic_reward_accuracy/mean": 0.72265625,
1324
+ "rewards/symbolic_reward_accuracy/std": 0.44779694080352783,
1325
+ "rewards/symbolic_reward_partial_score/mean": 0.9239094853401184,
1326
+ "rewards/symbolic_reward_partial_score/std": 0.14968585968017578,
1327
+ "rewards/tag_count_reward/mean": 0.9990234375,
1328
+ "rewards/tag_count_reward/std": 0.022080888971686363,
1329
+ "rewards/thinking_answer_ratio_reward/mean": 0.9687855243682861,
1330
+ "rewards/thinking_answer_ratio_reward/std": 0.043307721614837646,
1331
+ "sampling/importance_sampling_ratio/max": 2.0,
1332
+ "sampling/importance_sampling_ratio/mean": 1.3644728660583496,
1333
+ "sampling/importance_sampling_ratio/min": 4.628046553989407e-06,
1334
+ "sampling/sampling_logp_difference/max": 12.28337574005127,
1335
+ "sampling/sampling_logp_difference/mean": 0.4966655373573303,
1336
+ "step": 124
1337
+ },
1338
+ {
1339
+ "clip_ratio/high_max": 0.26953125,
1340
+ "clip_ratio/high_mean": 0.1572265625,
1341
+ "clip_ratio/low_mean": 0.189453125,
1342
+ "clip_ratio/low_min": 0.078125,
1343
+ "clip_ratio/region_mean": 0.3466796875,
1344
+ "completions/clipped_ratio": 0.001953125,
1345
+ "completions/max_length": 3072.0,
1346
+ "completions/max_terminated_length": 662.0,
1347
+ "completions/mean_length": 374.17041015625,
1348
+ "completions/mean_terminated_length": 368.8908996582031,
1349
+ "completions/min_length": 236.0,
1350
+ "completions/min_terminated_length": 236.0,
1351
+ "entropy": 1.4103393778204918,
1352
+ "epoch": 1.391304347826087,
1353
+ "frac_reward_zero_std": 0.0,
1354
+ "grad_norm": 0.03271499886368749,
1355
+ "learning_rate": 1e-05,
1356
+ "loss": 0.0068,
1357
+ "num_tokens": 90233248.0,
1358
+ "reward": 3.470027446746826,
1359
+ "reward_std": 0.08837710320949554,
1360
+ "rewards/ngram_repetition2/mean": 0.9828887581825256,
1361
+ "rewards/ngram_repetition2/std": 0.008584595285356045,
1362
+ "rewards/ngram_repetition3/mean": 0.9987025260925293,
1363
+ "rewards/ngram_repetition3/std": 0.0025142852682620287,
1364
+ "rewards/symbolic_reward_accuracy/mean": 0.75390625,
1365
+ "rewards/symbolic_reward_accuracy/std": 0.43083900213241577,
1366
+ "rewards/symbolic_reward_partial_score/mean": 0.9336751103401184,
1367
+ "rewards/symbolic_reward_partial_score/std": 0.1408122330904007,
1368
+ "rewards/tag_count_reward/mean": 0.9990234375,
1369
+ "rewards/tag_count_reward/std": 0.022080888971686363,
1370
+ "rewards/thinking_answer_ratio_reward/mean": 0.9700585007667542,
1371
+ "rewards/thinking_answer_ratio_reward/std": 0.043263670057058334,
1372
+ "sampling/importance_sampling_ratio/max": 2.0,
1373
+ "sampling/importance_sampling_ratio/mean": 1.3686330318450928,
1374
+ "sampling/importance_sampling_ratio/min": 1.2951417147633038e-06,
1375
+ "sampling/sampling_logp_difference/max": 13.556890487670898,
1376
+ "sampling/sampling_logp_difference/mean": 0.5069293975830078,
1377
+ "step": 128
1378
+ },
1379
+ {
1380
+ "clip_ratio/high_max": 0.28515625,
1381
+ "clip_ratio/high_mean": 0.1611328125,
1382
+ "clip_ratio/low_mean": 0.2099609375,
1383
+ "clip_ratio/low_min": 0.1015625,
1384
+ "clip_ratio/region_mean": 0.37109375,
1385
+ "completions/clipped_ratio": 0.0,
1386
+ "completions/max_length": 640.0,
1387
+ "completions/max_terminated_length": 640.0,
1388
+ "completions/mean_length": 372.28515625,
1389
+ "completions/mean_terminated_length": 372.28515625,
1390
+ "completions/min_length": 222.0,
1391
+ "completions/min_terminated_length": 222.0,
1392
+ "entropy": 1.4540528357028961,
1393
+ "epoch": 1.434782608695652,
1394
+ "frac_reward_zero_std": 0.0,
1395
+ "grad_norm": 0.037706615340187905,
1396
+ "learning_rate": 1e-05,
1397
+ "loss": -0.0007,
1398
+ "num_tokens": 93048776.0,
1399
+ "reward": 3.6577959060668945,
1400
+ "reward_std": 0.08277644217014313,
1401
+ "rewards/ngram_repetition2/mean": 0.9837426543235779,
1402
+ "rewards/ngram_repetition2/std": 0.008059200830757618,
1403
+ "rewards/ngram_repetition3/mean": 0.9987409710884094,
1404
+ "rewards/ngram_repetition3/std": 0.002457347232848406,
1405
+ "rewards/symbolic_reward_accuracy/mean": 0.8359375,
1406
+ "rewards/symbolic_reward_accuracy/std": 0.37042272090911865,
1407
+ "rewards/symbolic_reward_partial_score/mean": 0.9568685293197632,
1408
+ "rewards/symbolic_reward_partial_score/std": 0.11145035177469254,
1409
+ "rewards/tag_count_reward/mean": 0.99951171875,
1410
+ "rewards/tag_count_reward/std": 0.015621182508766651,
1411
+ "rewards/thinking_answer_ratio_reward/mean": 0.9716061353683472,
1412
+ "rewards/thinking_answer_ratio_reward/std": 0.03078635036945343,
1413
+ "sampling/importance_sampling_ratio/max": 2.0,
1414
+ "sampling/importance_sampling_ratio/mean": 1.3747305870056152,
1415
+ "sampling/importance_sampling_ratio/min": 6.537237368320348e-06,
1416
+ "sampling/sampling_logp_difference/max": 11.937995910644531,
1417
+ "sampling/sampling_logp_difference/mean": 0.5204066634178162,
1418
+ "step": 132
1419
+ },
1420
+ {
1421
+ "clip_ratio/high_max": 0.24609375,
1422
+ "clip_ratio/high_mean": 0.1455078125,
1423
+ "clip_ratio/low_mean": 0.2216796875,
1424
+ "clip_ratio/low_min": 0.109375,
1425
+ "clip_ratio/region_mean": 0.3671875,
1426
+ "completions/clipped_ratio": 0.0,
1427
+ "completions/max_length": 690.0,
1428
+ "completions/max_terminated_length": 690.0,
1429
+ "completions/mean_length": 384.67431640625,
1430
+ "completions/mean_terminated_length": 384.67431640625,
1431
+ "completions/min_length": 243.0,
1432
+ "completions/min_terminated_length": 243.0,
1433
+ "entropy": 1.5088882371783257,
1434
+ "epoch": 1.4782608695652173,
1435
+ "frac_reward_zero_std": 0.0,
1436
+ "grad_norm": 0.037293164778679895,
1437
+ "learning_rate": 1e-05,
1438
+ "loss": -0.0015,
1439
+ "num_tokens": 95896013.0,
1440
+ "reward": 3.551879405975342,
1441
+ "reward_std": 0.05557756870985031,
1442
+ "rewards/ngram_repetition2/mean": 0.9850949048995972,
1443
+ "rewards/ngram_repetition2/std": 0.007369986269623041,
1444
+ "rewards/ngram_repetition3/mean": 0.9990456104278564,
1445
+ "rewards/ngram_repetition3/std": 0.002027435228228569,
1446
+ "rewards/symbolic_reward_accuracy/mean": 0.7919921875,
1447
+ "rewards/symbolic_reward_accuracy/std": 0.40598157048225403,
1448
+ "rewards/symbolic_reward_partial_score/mean": 0.9383137822151184,
1449
+ "rewards/symbolic_reward_partial_score/std": 0.13532200455665588,
1450
+ "rewards/tag_count_reward/mean": 1.0,
1451
+ "rewards/tag_count_reward/std": 0.0,
1452
+ "rewards/thinking_answer_ratio_reward/mean": 0.9739952683448792,
1453
+ "rewards/thinking_answer_ratio_reward/std": 0.00430486723780632,
1454
+ "sampling/importance_sampling_ratio/max": 2.0,
1455
+ "sampling/importance_sampling_ratio/mean": 1.3827202320098877,
1456
+ "sampling/importance_sampling_ratio/min": 3.3355458981532138e-06,
1457
+ "sampling/sampling_logp_difference/max": 12.61087417602539,
1458
+ "sampling/sampling_logp_difference/mean": 0.5356423854827881,
1459
+ "step": 136
1460
+ },
1461
+ {
1462
+ "clip_ratio/high_max": 0.265625,
1463
+ "clip_ratio/high_mean": 0.14111328125,
1464
+ "clip_ratio/low_mean": 0.2041015625,
1465
+ "clip_ratio/low_min": 0.08203125,
1466
+ "clip_ratio/region_mean": 0.34521484375,
1467
+ "completions/clipped_ratio": 0.0,
1468
+ "completions/max_length": 697.0,
1469
+ "completions/max_terminated_length": 697.0,
1470
+ "completions/mean_length": 411.45556640625,
1471
+ "completions/mean_terminated_length": 411.45556640625,
1472
+ "completions/min_length": 243.0,
1473
+ "completions/min_terminated_length": 243.0,
1474
+ "entropy": 1.632333055138588,
1475
+ "epoch": 1.5217391304347827,
1476
+ "frac_reward_zero_std": 0.0,
1477
+ "grad_norm": 0.04682422298321068,
1478
+ "learning_rate": 1e-05,
1479
+ "loss": -0.0012,
1480
+ "num_tokens": 98772754.0,
1481
+ "reward": 3.4415788650512695,
1482
+ "reward_std": 0.06567732989788055,
1483
+ "rewards/ngram_repetition2/mean": 0.9847305417060852,
1484
+ "rewards/ngram_repetition2/std": 0.007420970126986504,
1485
+ "rewards/ngram_repetition3/mean": 0.9989989399909973,
1486
+ "rewards/ngram_repetition3/std": 0.0022030072286725044,
1487
+ "rewards/symbolic_reward_accuracy/mean": 0.74072265625,
1488
+ "rewards/symbolic_reward_accuracy/std": 0.4383451044559479,
1489
+ "rewards/symbolic_reward_partial_score/mean": 0.9307861328125,
1490
+ "rewards/symbolic_reward_partial_score/std": 0.1378849595785141,
1491
+ "rewards/tag_count_reward/mean": 0.999755859375,
1492
+ "rewards/tag_count_reward/std": 0.011048543266952038,
1493
+ "rewards/thinking_answer_ratio_reward/mean": 0.9754141569137573,
1494
+ "rewards/thinking_answer_ratio_reward/std": 0.022000106051564217,
1495
+ "sampling/importance_sampling_ratio/max": 2.0,
1496
+ "sampling/importance_sampling_ratio/mean": 1.3969695568084717,
1497
+ "sampling/importance_sampling_ratio/min": 2.535395157710063e-11,
1498
+ "sampling/sampling_logp_difference/max": 24.398086547851562,
1499
+ "sampling/sampling_logp_difference/mean": 0.5631133317947388,
1500
+ "step": 140
1501
+ },
1502
+ {
1503
+ "clip_ratio/high_max": 0.24609375,
1504
+ "clip_ratio/high_mean": 0.15283203125,
1505
+ "clip_ratio/low_mean": 0.203125,
1506
+ "clip_ratio/low_min": 0.1015625,
1507
+ "clip_ratio/region_mean": 0.35595703125,
1508
+ "completions/clipped_ratio": 0.0,
1509
+ "completions/max_length": 890.0,
1510
+ "completions/max_terminated_length": 890.0,
1511
+ "completions/mean_length": 425.41552734375,
1512
+ "completions/mean_terminated_length": 425.41552734375,
1513
+ "completions/min_length": 258.0,
1514
+ "completions/min_terminated_length": 258.0,
1515
+ "entropy": 1.726756490767002,
1516
+ "epoch": 1.5652173913043477,
1517
+ "frac_reward_zero_std": 0.0,
1518
+ "grad_norm": 0.038093541651771236,
1519
+ "learning_rate": 1e-05,
1520
+ "loss": -0.001,
1521
+ "num_tokens": 101674917.0,
1522
+ "reward": 3.5725719928741455,
1523
+ "reward_std": 0.017834221944212914,
1524
+ "rewards/ngram_repetition2/mean": 0.9846716523170471,
1525
+ "rewards/ngram_repetition2/std": 0.0074483552016317844,
1526
+ "rewards/ngram_repetition3/mean": 0.9990028738975525,
1527
+ "rewards/ngram_repetition3/std": 0.00226139766164124,
1528
+ "rewards/symbolic_reward_accuracy/mean": 0.80322265625,
1529
+ "rewards/symbolic_reward_accuracy/std": 0.39765968918800354,
1530
+ "rewards/symbolic_reward_partial_score/mean": 0.9365234375,
1531
+ "rewards/symbolic_reward_partial_score/std": 0.14711622893810272,
1532
+ "rewards/tag_count_reward/mean": 1.0,
1533
+ "rewards/tag_count_reward/std": 0.0,
1534
+ "rewards/thinking_answer_ratio_reward/mean": 0.9766725897789001,
1535
+ "rewards/thinking_answer_ratio_reward/std": 0.004060372244566679,
1536
+ "sampling/importance_sampling_ratio/max": 2.0,
1537
+ "sampling/importance_sampling_ratio/mean": 1.4077180624008179,
1538
+ "sampling/importance_sampling_ratio/min": 5.527023176910006e-07,
1539
+ "sampling/sampling_logp_difference/max": 14.408446311950684,
1540
+ "sampling/sampling_logp_difference/mean": 0.5831520557403564,
1541
+ "step": 144
1542
+ },
1543
+ {
1544
+ "clip_ratio/high_max": 0.2734375,
1545
+ "clip_ratio/high_mean": 0.1513671875,
1546
+ "clip_ratio/low_mean": 0.22314453125,
1547
+ "clip_ratio/low_min": 0.10546875,
1548
+ "clip_ratio/region_mean": 0.37451171875,
1549
+ "completions/clipped_ratio": 0.0,
1550
+ "completions/max_length": 764.0,
1551
+ "completions/max_terminated_length": 764.0,
1552
+ "completions/mean_length": 434.2431640625,
1553
+ "completions/mean_terminated_length": 434.2431640625,
1554
+ "completions/min_length": 236.0,
1555
+ "completions/min_terminated_length": 236.0,
1556
+ "entropy": 1.815441645681858,
1557
+ "epoch": 1.608695652173913,
1558
+ "frac_reward_zero_std": 0.0,
1559
+ "grad_norm": 0.06173162254334352,
1560
+ "learning_rate": 1e-05,
1561
+ "loss": -0.0011,
1562
+ "num_tokens": 104649015.0,
1563
+ "reward": 3.4869613647460938,
1564
+ "reward_std": 0.0348396934568882,
1565
+ "rewards/ngram_repetition2/mean": 0.9839403629302979,
1566
+ "rewards/ngram_repetition2/std": 0.00729979295283556,
1567
+ "rewards/ngram_repetition3/mean": 0.9989305138587952,
1568
+ "rewards/ngram_repetition3/std": 0.002313731238245964,
1569
+ "rewards/symbolic_reward_accuracy/mean": 0.75830078125,
1570
+ "rewards/symbolic_reward_accuracy/std": 0.4282175302505493,
1571
+ "rewards/symbolic_reward_partial_score/mean": 0.9407551884651184,
1572
+ "rewards/symbolic_reward_partial_score/std": 0.12213268131017685,
1573
+ "rewards/tag_count_reward/mean": 1.0,
1574
+ "rewards/tag_count_reward/std": 0.0,
1575
+ "rewards/thinking_answer_ratio_reward/mean": 0.9775879383087158,
1576
+ "rewards/thinking_answer_ratio_reward/std": 0.003819839097559452,
1577
+ "sampling/importance_sampling_ratio/max": 2.0,
1578
+ "sampling/importance_sampling_ratio/mean": 1.4123494625091553,
1579
+ "sampling/importance_sampling_ratio/min": 8.588379569118842e-05,
1580
+ "sampling/sampling_logp_difference/max": 9.362515449523926,
1581
+ "sampling/sampling_logp_difference/mean": 0.5984525084495544,
1582
+ "step": 148
1583
+ },
1584
+ {
1585
+ "clip_ratio/high_max": 0.2265625,
1586
+ "clip_ratio/high_mean": 0.1328125,
1587
+ "clip_ratio/low_mean": 0.2197265625,
1588
+ "clip_ratio/low_min": 0.1015625,
1589
+ "clip_ratio/region_mean": 0.3525390625,
1590
+ "completions/clipped_ratio": 0.00048828125,
1591
+ "completions/max_length": 3072.0,
1592
+ "completions/max_terminated_length": 932.0,
1593
+ "completions/mean_length": 463.5791015625,
1594
+ "completions/mean_terminated_length": 462.3048400878906,
1595
+ "completions/min_length": 241.0,
1596
+ "completions/min_terminated_length": 241.0,
1597
+ "entropy": 1.9924634099006653,
1598
+ "epoch": 1.6521739130434783,
1599
+ "frac_reward_zero_std": 0.0,
1600
+ "grad_norm": 0.10959925778321504,
1601
+ "learning_rate": 1e-05,
1602
+ "loss": 0.0013,
1603
+ "num_tokens": 107645177.0,
1604
+ "reward": 3.647918462753296,
1605
+ "reward_std": 0.0664098709821701,
1606
+ "rewards/ngram_repetition2/mean": 0.9830807447433472,
1607
+ "rewards/ngram_repetition2/std": 0.007356169633567333,
1608
+ "rewards/ngram_repetition3/mean": 0.9989252090454102,
1609
+ "rewards/ngram_repetition3/std": 0.0019529308192431927,
1610
+ "rewards/symbolic_reward_accuracy/mean": 0.82958984375,
1611
+ "rewards/symbolic_reward_accuracy/std": 0.376084566116333,
1612
+ "rewards/symbolic_reward_partial_score/mean": 0.9603678584098816,
1613
+ "rewards/symbolic_reward_partial_score/std": 0.09740671515464783,
1614
+ "rewards/tag_count_reward/mean": 0.998779296875,
1615
+ "rewards/tag_count_reward/std": 0.02468114346265793,
1616
+ "rewards/thinking_answer_ratio_reward/mean": 0.9771405458450317,
1617
+ "rewards/thinking_answer_ratio_reward/std": 0.043395884335041046,
1618
+ "sampling/importance_sampling_ratio/max": 2.0,
1619
+ "sampling/importance_sampling_ratio/mean": 1.4253789186477661,
1620
+ "sampling/importance_sampling_ratio/min": 0.0006438745185732841,
1621
+ "sampling/sampling_logp_difference/max": 7.348006725311279,
1622
+ "sampling/sampling_logp_difference/mean": 0.6298946738243103,
1623
+ "step": 152
1624
+ },
1625
+ {
1626
+ "clip_ratio/high_max": 0.140625,
1627
+ "clip_ratio/high_mean": 0.08349609375,
1628
+ "clip_ratio/low_mean": 0.263671875,
1629
+ "clip_ratio/low_min": 0.1328125,
1630
+ "clip_ratio/region_mean": 0.34716796875,
1631
+ "completions/clipped_ratio": 0.0,
1632
+ "completions/max_length": 824.0,
1633
+ "completions/max_terminated_length": 824.0,
1634
+ "completions/mean_length": 462.9541015625,
1635
+ "completions/mean_terminated_length": 462.9541015625,
1636
+ "completions/min_length": 234.0,
1637
+ "completions/min_terminated_length": 234.0,
1638
+ "entropy": 2.0628679618239403,
1639
+ "epoch": 1.6956521739130435,
1640
+ "frac_reward_zero_std": 0.0,
1641
+ "grad_norm": 0.24161130829987623,
1642
+ "learning_rate": 1e-05,
1643
+ "loss": -0.0003,
1644
+ "num_tokens": 110662235.0,
1645
+ "reward": 3.3783979415893555,
1646
+ "reward_std": 0.036025457084178925,
1647
+ "rewards/ngram_repetition2/mean": 0.9834574460983276,
1648
+ "rewards/ngram_repetition2/std": 0.0074860225431621075,
1649
+ "rewards/ngram_repetition3/mean": 0.9989546537399292,
1650
+ "rewards/ngram_repetition3/std": 0.002006194554269314,
1651
+ "rewards/symbolic_reward_accuracy/mean": 0.70654296875,
1652
+ "rewards/symbolic_reward_accuracy/std": 0.45545724034309387,
1653
+ "rewards/symbolic_reward_partial_score/mean": 0.9361978769302368,
1654
+ "rewards/symbolic_reward_partial_score/std": 0.10914558917284012,
1655
+ "rewards/tag_count_reward/mean": 0.99951171875,
1656
+ "rewards/tag_count_reward/std": 0.015621182508766651,
1657
+ "rewards/thinking_answer_ratio_reward/mean": 0.9778301119804382,
1658
+ "rewards/thinking_answer_ratio_reward/std": 0.030854862183332443,
1659
+ "sampling/importance_sampling_ratio/max": 2.0,
1660
+ "sampling/importance_sampling_ratio/mean": 1.4279154539108276,
1661
+ "sampling/importance_sampling_ratio/min": 2.4768311050138436e-05,
1662
+ "sampling/sampling_logp_difference/max": 10.605945587158203,
1663
+ "sampling/sampling_logp_difference/mean": 0.6437482237815857,
1664
+ "step": 156
1665
+ },
1666
+ {
1667
+ "clip_ratio/high_max": 0.15234375,
1668
+ "clip_ratio/high_mean": 0.0888671875,
1669
+ "clip_ratio/low_mean": 0.23388671875,
1670
+ "clip_ratio/low_min": 0.125,
1671
+ "clip_ratio/region_mean": 0.32275390625,
1672
+ "completions/clipped_ratio": 0.0,
1673
+ "completions/max_length": 993.0,
1674
+ "completions/max_terminated_length": 993.0,
1675
+ "completions/mean_length": 498.11767578125,
1676
+ "completions/mean_terminated_length": 498.11767578125,
1677
+ "completions/min_length": 291.0,
1678
+ "completions/min_terminated_length": 291.0,
1679
+ "entropy": 2.2991671413183212,
1680
+ "epoch": 1.7391304347826086,
1681
+ "frac_reward_zero_std": 0.0,
1682
+ "grad_norm": 0.3548338605641257,
1683
+ "learning_rate": 1e-05,
1684
+ "loss": 0.0016,
1685
+ "num_tokens": 113729132.0,
1686
+ "reward": 3.5990939140319824,
1687
+ "reward_std": 0.0962018221616745,
1688
+ "rewards/ngram_repetition2/mean": 0.9809653162956238,
1689
+ "rewards/ngram_repetition2/std": 0.007712055929005146,
1690
+ "rewards/ngram_repetition3/mean": 0.9986952543258667,
1691
+ "rewards/ngram_repetition3/std": 0.001996663399040699,
1692
+ "rewards/symbolic_reward_accuracy/mean": 0.8076171875,
1693
+ "rewards/symbolic_reward_accuracy/std": 0.3942683935165405,
1694
+ "rewards/symbolic_reward_partial_score/mean": 0.9549967050552368,
1695
+ "rewards/symbolic_reward_partial_score/std": 0.10741008818149567,
1696
+ "rewards/tag_count_reward/mean": 0.999267578125,
1697
+ "rewards/tag_count_reward/std": 0.019127286970615387,
1698
+ "rewards/thinking_answer_ratio_reward/mean": 0.979852557182312,
1699
+ "rewards/thinking_answer_ratio_reward/std": 0.02200859785079956,
1700
+ "sampling/importance_sampling_ratio/max": 2.0,
1701
+ "sampling/importance_sampling_ratio/mean": 1.4530794620513916,
1702
+ "sampling/importance_sampling_ratio/min": 3.576672469307596e-08,
1703
+ "sampling/sampling_logp_difference/max": 17.14624786376953,
1704
+ "sampling/sampling_logp_difference/mean": 0.6910567283630371,
1705
+ "step": 160
1706
+ },
1707
+ {
1708
+ "clip_ratio/high_max": 0.16796875,
1709
+ "clip_ratio/high_mean": 0.10107421875,
1710
+ "clip_ratio/low_mean": 0.24560546875,
1711
+ "clip_ratio/low_min": 0.1328125,
1712
+ "clip_ratio/region_mean": 0.3466796875,
1713
+ "completions/clipped_ratio": 0.0,
1714
+ "completions/max_length": 1344.0,
1715
+ "completions/max_terminated_length": 1344.0,
1716
+ "completions/mean_length": 543.86083984375,
1717
+ "completions/mean_terminated_length": 543.86083984375,
1718
+ "completions/min_length": 310.0,
1719
+ "completions/min_terminated_length": 310.0,
1720
+ "entropy": 2.6379848271608353,
1721
+ "epoch": 1.7826086956521738,
1722
+ "frac_reward_zero_std": 0.0,
1723
+ "grad_norm": 0.08513499323190751,
1724
+ "learning_rate": 1e-05,
1725
+ "loss": 0.0133,
1726
+ "num_tokens": 116905551.0,
1727
+ "reward": 3.548675775527954,
1728
+ "reward_std": 0.09981994330883026,
1729
+ "rewards/ngram_repetition2/mean": 0.9711927771568298,
1730
+ "rewards/ngram_repetition2/std": 0.011247237212955952,
1731
+ "rewards/ngram_repetition3/mean": 0.9973983764648438,
1732
+ "rewards/ngram_repetition3/std": 0.002803381998091936,
1733
+ "rewards/symbolic_reward_accuracy/mean": 0.7880859375,
1734
+ "rewards/symbolic_reward_accuracy/std": 0.4087640941143036,
1735
+ "rewards/symbolic_reward_partial_score/mean": 0.9429931640625,
1736
+ "rewards/symbolic_reward_partial_score/std": 0.12854379415512085,
1737
+ "rewards/tag_count_reward/mean": 1.0,
1738
+ "rewards/tag_count_reward/std": 0.0,
1739
+ "rewards/thinking_answer_ratio_reward/mean": 0.9824697971343994,
1740
+ "rewards/thinking_answer_ratio_reward/std": 0.004015693906694651,
1741
+ "sampling/importance_sampling_ratio/max": 2.0,
1742
+ "sampling/importance_sampling_ratio/mean": 1.4711500406265259,
1743
+ "sampling/importance_sampling_ratio/min": 1.322712250839686e-07,
1744
+ "sampling/sampling_logp_difference/max": 15.838411331176758,
1745
+ "sampling/sampling_logp_difference/mean": 0.7328225374221802,
1746
+ "step": 164
1747
+ },
1748
+ {
1749
+ "clip_ratio/high_max": 0.24609375,
1750
+ "clip_ratio/high_mean": 0.1396484375,
1751
+ "clip_ratio/low_mean": 0.23193359375,
1752
+ "clip_ratio/low_min": 0.10546875,
1753
+ "clip_ratio/region_mean": 0.37158203125,
1754
+ "completions/clipped_ratio": 0.0,
1755
+ "completions/max_length": 1438.0,
1756
+ "completions/max_terminated_length": 1438.0,
1757
+ "completions/mean_length": 503.81298828125,
1758
+ "completions/mean_terminated_length": 503.81298828125,
1759
+ "completions/min_length": 263.0,
1760
+ "completions/min_terminated_length": 263.0,
1761
+ "entropy": 2.788048267364502,
1762
+ "epoch": 1.8260869565217392,
1763
+ "frac_reward_zero_std": 0.0,
1764
+ "grad_norm": 0.09455791725368402,
1765
+ "learning_rate": 1e-05,
1766
+ "loss": 0.0074,
1767
+ "num_tokens": 120015792.0,
1768
+ "reward": 3.349081039428711,
1769
+ "reward_std": 0.10028517991304398,
1770
+ "rewards/ngram_repetition2/mean": 0.9654586315155029,
1771
+ "rewards/ngram_repetition2/std": 0.012577379122376442,
1772
+ "rewards/ngram_repetition3/mean": 0.9965716004371643,
1773
+ "rewards/ngram_repetition3/std": 0.0033189503010362387,
1774
+ "rewards/symbolic_reward_accuracy/mean": 0.69970703125,
1775
+ "rewards/symbolic_reward_accuracy/std": 0.4584972560405731,
1776
+ "rewards/symbolic_reward_partial_score/mean": 0.9207356572151184,
1777
+ "rewards/symbolic_reward_partial_score/std": 0.14194521307945251,
1778
+ "rewards/tag_count_reward/mean": 0.99951171875,
1779
+ "rewards/tag_count_reward/std": 0.015621182508766651,
1780
+ "rewards/thinking_answer_ratio_reward/mean": 0.979932427406311,
1781
+ "rewards/thinking_answer_ratio_reward/std": 0.031026024371385574,
1782
+ "sampling/importance_sampling_ratio/max": 2.0,
1783
+ "sampling/importance_sampling_ratio/mean": 1.473034381866455,
1784
+ "sampling/importance_sampling_ratio/min": 2.001723487410345e-06,
1785
+ "sampling/sampling_logp_difference/max": 13.121501922607422,
1786
+ "sampling/sampling_logp_difference/mean": 0.7396732568740845,
1787
+ "step": 168
1788
+ },
1789
+ {
1790
+ "clip_ratio/high_max": 0.19140625,
1791
+ "clip_ratio/high_mean": 0.10546875,
1792
+ "clip_ratio/low_mean": 0.25341796875,
1793
+ "clip_ratio/low_min": 0.13671875,
1794
+ "clip_ratio/region_mean": 0.35888671875,
1795
+ "completions/clipped_ratio": 0.0,
1796
+ "completions/max_length": 898.0,
1797
+ "completions/max_terminated_length": 898.0,
1798
+ "completions/mean_length": 467.646484375,
1799
+ "completions/mean_terminated_length": 467.646484375,
1800
+ "completions/min_length": 187.0,
1801
+ "completions/min_terminated_length": 187.0,
1802
+ "entropy": 2.768448531627655,
1803
+ "epoch": 1.8695652173913042,
1804
+ "frac_reward_zero_std": 0.0,
1805
+ "grad_norm": 0.06805611273845928,
1806
+ "learning_rate": 1e-05,
1807
+ "loss": 0.0022,
1808
+ "num_tokens": 123004444.0,
1809
+ "reward": 3.5058183670043945,
1810
+ "reward_std": 0.07877355813980103,
1811
+ "rewards/ngram_repetition2/mean": 0.9659230709075928,
1812
+ "rewards/ngram_repetition2/std": 0.011437240056693554,
1813
+ "rewards/ngram_repetition3/mean": 0.996779203414917,
1814
+ "rewards/ngram_repetition3/std": 0.003355607157573104,
1815
+ "rewards/symbolic_reward_accuracy/mean": 0.77001953125,
1816
+ "rewards/symbolic_reward_accuracy/std": 0.42092275619506836,
1817
+ "rewards/symbolic_reward_partial_score/mean": 0.9363606572151184,
1818
+ "rewards/symbolic_reward_partial_score/std": 0.13817265629768372,
1819
+ "rewards/tag_count_reward/mean": 1.0,
1820
+ "rewards/tag_count_reward/std": 0.0,
1821
+ "rewards/thinking_answer_ratio_reward/mean": 0.9791755676269531,
1822
+ "rewards/thinking_answer_ratio_reward/std": 0.005196898244321346,
1823
+ "sampling/importance_sampling_ratio/max": 2.0,
1824
+ "sampling/importance_sampling_ratio/mean": 1.4763879776000977,
1825
+ "sampling/importance_sampling_ratio/min": 4.474750312510878e-05,
1826
+ "sampling/sampling_logp_difference/max": 10.014474868774414,
1827
+ "sampling/sampling_logp_difference/mean": 0.7400292158126831,
1828
+ "step": 172
1829
+ },
1830
+ {
1831
+ "clip_ratio/high_max": 0.30859375,
1832
+ "clip_ratio/high_mean": 0.18505859375,
1833
+ "clip_ratio/low_mean": 0.17431640625,
1834
+ "clip_ratio/low_min": 0.078125,
1835
+ "clip_ratio/region_mean": 0.359375,
1836
+ "completions/clipped_ratio": 0.00048828125,
1837
+ "completions/max_length": 3072.0,
1838
+ "completions/max_terminated_length": 1376.0,
1839
+ "completions/mean_length": 430.4609375,
1840
+ "completions/mean_terminated_length": 429.1705017089844,
1841
+ "completions/min_length": 189.0,
1842
+ "completions/min_terminated_length": 189.0,
1843
+ "entropy": 2.8502298444509506,
1844
+ "epoch": 1.9130434782608696,
1845
+ "frac_reward_zero_std": 0.0,
1846
+ "grad_norm": 0.06663024928165782,
1847
+ "learning_rate": 1e-05,
1848
+ "loss": 0.003,
1849
+ "num_tokens": 125954956.0,
1850
+ "reward": 3.483172655105591,
1851
+ "reward_std": 0.1899569034576416,
1852
+ "rewards/ngram_repetition2/mean": 0.9636595845222473,
1853
+ "rewards/ngram_repetition2/std": 0.016547439619898796,
1854
+ "rewards/ngram_repetition3/mean": 0.9965729117393494,
1855
+ "rewards/ngram_repetition3/std": 0.011003647930920124,
1856
+ "rewards/symbolic_reward_accuracy/mean": 0.76025390625,
1857
+ "rewards/symbolic_reward_accuracy/std": 0.4270327091217041,
1858
+ "rewards/symbolic_reward_partial_score/mean": 0.9335530400276184,
1859
+ "rewards/symbolic_reward_partial_score/std": 0.14367683231830597,
1860
+ "rewards/tag_count_reward/mean": 0.999755859375,
1861
+ "rewards/tag_count_reward/std": 0.011048543266952038,
1862
+ "rewards/thinking_answer_ratio_reward/mean": 0.9753477573394775,
1863
+ "rewards/thinking_answer_ratio_reward/std": 0.03124345652759075,
1864
+ "sampling/importance_sampling_ratio/max": 2.0,
1865
+ "sampling/importance_sampling_ratio/mean": 1.4821131229400635,
1866
+ "sampling/importance_sampling_ratio/min": 2.7937696359003894e-05,
1867
+ "sampling/sampling_logp_difference/max": 10.485533714294434,
1868
+ "sampling/sampling_logp_difference/mean": 0.7600972652435303,
1869
+ "step": 176
1870
+ },
1871
+ {
1872
+ "clip_ratio/high_max": 0.2421875,
1873
+ "clip_ratio/high_mean": 0.14013671875,
1874
+ "clip_ratio/low_mean": 0.23681640625,
1875
+ "clip_ratio/low_min": 0.12109375,
1876
+ "clip_ratio/region_mean": 0.376953125,
1877
+ "completions/clipped_ratio": 0.00146484375,
1878
+ "completions/max_length": 3072.0,
1879
+ "completions/max_terminated_length": 696.0,
1880
+ "completions/mean_length": 415.51611328125,
1881
+ "completions/mean_terminated_length": 411.61907958984375,
1882
+ "completions/min_length": 171.0,
1883
+ "completions/min_terminated_length": 171.0,
1884
+ "entropy": 2.858258455991745,
1885
+ "epoch": 1.9565217391304348,
1886
+ "frac_reward_zero_std": 0.0,
1887
+ "grad_norm": 0.07966906477050349,
1888
+ "learning_rate": 1e-05,
1889
+ "loss": 0.0034,
1890
+ "num_tokens": 128874861.0,
1891
+ "reward": 3.4829840660095215,
1892
+ "reward_std": 0.08554819971323013,
1893
+ "rewards/ngram_repetition2/mean": 0.9658774733543396,
1894
+ "rewards/ngram_repetition2/std": 0.011492163874208927,
1895
+ "rewards/ngram_repetition3/mean": 0.9971028566360474,
1896
+ "rewards/ngram_repetition3/std": 0.0032859230414032936,
1897
+ "rewards/symbolic_reward_accuracy/mean": 0.7587890625,
1898
+ "rewards/symbolic_reward_accuracy/std": 0.42792245745658875,
1899
+ "rewards/symbolic_reward_partial_score/mean": 0.93701171875,
1900
+ "rewards/symbolic_reward_partial_score/std": 0.13184432685375214,
1901
+ "rewards/tag_count_reward/mean": 0.9990234375,
1902
+ "rewards/tag_count_reward/std": 0.022080888971686363,
1903
+ "rewards/thinking_answer_ratio_reward/mean": 0.9741167426109314,
1904
+ "rewards/thinking_answer_ratio_reward/std": 0.04080890119075775,
1905
+ "sampling/importance_sampling_ratio/max": 2.0,
1906
+ "sampling/importance_sampling_ratio/mean": 1.4818463325500488,
1907
+ "sampling/importance_sampling_ratio/min": 1.6995619489534874e-06,
1908
+ "sampling/sampling_logp_difference/max": 13.285140037536621,
1909
+ "sampling/sampling_logp_difference/mean": 0.747634768486023,
1910
+ "step": 180
1911
+ },
1912
+ {
1913
+ "clip_ratio/high_max": 0.1640625,
1914
+ "clip_ratio/high_mean": 0.09521484375,
1915
+ "clip_ratio/low_mean": 0.2451171875,
1916
+ "clip_ratio/low_min": 0.125,
1917
+ "clip_ratio/region_mean": 0.34033203125,
1918
+ "completions/clipped_ratio": 0.00048828125,
1919
+ "completions/max_length": 3072.0,
1920
+ "completions/max_terminated_length": 2981.0,
1921
+ "completions/mean_length": 415.7646484375,
1922
+ "completions/mean_terminated_length": 414.4670104980469,
1923
+ "completions/min_length": 197.0,
1924
+ "completions/min_terminated_length": 197.0,
1925
+ "entropy": 3.0020454972982407,
1926
+ "epoch": 2.0,
1927
+ "frac_reward_zero_std": 0.0,
1928
+ "grad_norm": 0.20941599091305263,
1929
+ "learning_rate": 1e-05,
1930
+ "loss": 0.0019,
1931
+ "num_tokens": 131785771.0,
1932
+ "reward": 3.4347691535949707,
1933
+ "reward_std": 0.0830899327993393,
1934
+ "rewards/ngram_repetition2/mean": 0.9693027138710022,
1935
+ "rewards/ngram_repetition2/std": 0.02014472335577011,
1936
+ "rewards/ngram_repetition3/mean": 0.9970437288284302,
1937
+ "rewards/ngram_repetition3/std": 0.01778826303780079,
1938
+ "rewards/symbolic_reward_accuracy/mean": 0.73974609375,
1939
+ "rewards/symbolic_reward_accuracy/std": 0.43888023495674133,
1940
+ "rewards/symbolic_reward_partial_score/mean": 0.9258626103401184,
1941
+ "rewards/symbolic_reward_partial_score/std": 0.15298709273338318,
1942
+ "rewards/tag_count_reward/mean": 1.0,
1943
+ "rewards/tag_count_reward/std": 0.0,
1944
+ "rewards/thinking_answer_ratio_reward/mean": 0.9750823378562927,
1945
+ "rewards/thinking_answer_ratio_reward/std": 0.022159311920404434,
1946
+ "sampling/importance_sampling_ratio/max": 2.0,
1947
+ "sampling/importance_sampling_ratio/mean": 1.4881045818328857,
1948
+ "sampling/importance_sampling_ratio/min": 6.7104201662004925e-06,
1949
+ "sampling/sampling_logp_difference/max": 11.911849021911621,
1950
+ "sampling/sampling_logp_difference/mean": 0.7676164507865906,
1951
+ "step": 184
1952
+ },
1953
+ {
1954
+ "epoch": 2.0,
1955
+ "eval_clip_ratio/high_max": 0.0,
1956
+ "eval_clip_ratio/high_mean": 0.0,
1957
+ "eval_clip_ratio/low_mean": 0.0,
1958
+ "eval_clip_ratio/low_min": 0.0,
1959
+ "eval_clip_ratio/region_mean": 0.0,
1960
+ "eval_completions/clipped_ratio": 0.001644736842105263,
1961
+ "eval_completions/max_length": 895.8947368421053,
1962
+ "eval_completions/max_terminated_length": 641.0526315789474,
1963
+ "eval_completions/mean_length": 447.49136513157896,
1964
+ "eval_completions/mean_terminated_length": 443.1748866031044,
1965
+ "eval_completions/min_length": 229.31578947368422,
1966
+ "eval_completions/min_terminated_length": 229.31578947368422,
1967
+ "eval_entropy": 2.969714365507427,
1968
+ "eval_frac_reward_zero_std": 0.0,
1969
+ "eval_loss": 0.0008429406443610787,
1970
+ "eval_num_tokens": 131785771.0,
1971
+ "eval_reward": 3.461897749649851,
1972
+ "eval_reward_std": 0.11290462953145738,
1973
+ "eval_rewards/ngram_repetition2/mean": 0.9665061580507379,
1974
+ "eval_rewards/ngram_repetition2/std": 0.010592278240150526,
1975
+ "eval_rewards/ngram_repetition3/mean": 0.9974528551101685,
1976
+ "eval_rewards/ngram_repetition3/std": 0.002903796377052602,
1977
+ "eval_rewards/symbolic_reward_accuracy/mean": 0.7504111842105263,
1978
+ "eval_rewards/symbolic_reward_accuracy/std": 0.3684137702772492,
1979
+ "eval_rewards/symbolic_reward_partial_score/mean": 0.9324972535434523,
1980
+ "eval_rewards/symbolic_reward_partial_score/std": 0.11325683170243313,
1981
+ "eval_rewards/tag_count_reward/mean": 0.9991776315789473,
1982
+ "eval_rewards/tag_count_reward/std": 0.006552994643387042,
1983
+ "eval_rewards/thinking_answer_ratio_reward/mean": 0.9760947635299281,
1984
+ "eval_rewards/thinking_answer_ratio_reward/std": 0.018366032500604267,
1985
+ "eval_runtime": 197.726,
1986
+ "eval_samples_per_second": 0.759,
1987
+ "eval_sampling/importance_sampling_ratio/max": 2.0,
1988
+ "eval_sampling/importance_sampling_ratio/mean": 1.4970186572325856,
1989
+ "eval_sampling/importance_sampling_ratio/min": 0.002377865083537089,
1990
+ "eval_sampling/sampling_logp_difference/max": 6.292696877529747,
1991
+ "eval_sampling/sampling_logp_difference/mean": 0.7791855492089924,
1992
+ "eval_steps_per_second": 0.01,
1993
+ "step": 184
1994
+ },
1995
+ {
1996
+ "epoch": 2.0,
1997
+ "step": 184,
1998
+ "total_flos": 0.0,
1999
+ "train_loss": 0.005923212121358475,
2000
+ "train_runtime": 6121.2967,
2001
+ "train_samples_per_second": 0.98,
2002
+ "train_steps_per_second": 0.03
2003
+ }
2004
+ ],
2005
+ "logging_steps": 4,
2006
+ "max_steps": 184,
2007
+ "num_input_tokens_seen": 131785771,
2008
+ "num_train_epochs": 2,
2009
+ "save_steps": 500,
2010
+ "stateful_callbacks": {
2011
+ "TrainerControl": {
2012
+ "args": {
2013
+ "should_epoch_stop": false,
2014
+ "should_evaluate": false,
2015
+ "should_log": false,
2016
+ "should_save": true,
2017
+ "should_training_stop": true
2018
+ },
2019
+ "attributes": {}
2020
+ }
2021
+ },
2022
+ "total_flos": 0.0,
2023
+ "train_batch_size": 16,
2024
+ "trial_name": null,
2025
+ "trial_params": null
2026
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83db7a3d8b825260b8766798dd130d927216a883338f5973b2585b5731375f83
3
+ size 11665