MMattaparthy committed on
Commit
f80f695
·
verified ·
1 Parent(s): 6f00eb9

Upload sfo model

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ checkpoint-107/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: MMattaparthy/sft_finetined_final
3
+ library_name: transformers
4
+ model_name: ppo_final_model
5
+ tags:
6
+ - generated_from_trainer
7
+ - trl
8
+ - ppo
9
+ licence: license
10
+ ---
11
+
12
+ # Model Card for ppo_final_model
13
+
14
+ This model is a fine-tuned version of [MMattaparthy/sft_finetined_final](https://huggingface.co/MMattaparthy/sft_finetined_final).
15
+ It has been trained using [TRL](https://github.com/huggingface/trl).
16
+
17
+ ## Quick start
18
+
19
+ ```python
20
+ from transformers import pipeline
21
+
22
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
23
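+ generator = pipeline("text-generation", model="MMattaparthy/ppo_final_model", device="cuda")  # TODO: confirm the Hub repo id; the generated card had the literal string "None" here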
24
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
25
+ print(output["generated_text"])
26
+ ```
27
+
28
+ ## Training procedure
29
+
30
+
31
+
32
+
33
+ This model was trained with PPO, a method introduced in [Fine-Tuning Language Models from Human Preferences](https://huggingface.co/papers/1909.08593).
34
+
35
+ ### Framework versions
36
+
37
+ - TRL: 0.24.0
38
+ - Transformers: 4.57.1
39
+ - PyTorch: 2.8.0+cu126
40
+ - Datasets: 4.0.0
41
+ - Tokenizers: 0.22.1
42
+
43
+ ## Citations
44
+
45
+ Cite PPO as:
46
+
47
+ ```bibtex
48
+ @article{mziegler2019fine-tuning,
49
+ title = {{Fine-Tuning Language Models from Human Preferences}},
50
+ author = {Daniel M. Ziegler and Nisan Stiennon and Jeffrey Wu and Tom B. Brown and Alec Radford and Dario Amodei and Paul F. Christiano and Geoffrey Irving},
51
+ year = 2019,
52
+ eprint = {arXiv:1909.08593}
53
+ }
54
+ ```
55
+
56
+ Cite TRL as:
57
+
58
+ ```bibtex
59
+ @misc{vonwerra2022trl,
60
+ title = {{TRL: Transformer Reinforcement Learning}},
61
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
62
+ year = 2020,
63
+ journal = {GitHub repository},
64
+ publisher = {GitHub},
65
+ howpublished = {\url{https://github.com/huggingface/trl}}
66
+ }
67
+ ```
added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoint-107/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
checkpoint-107/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoint-107/config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen2ForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "dtype": "bfloat16",
7
+ "eos_token_id": 151645,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 1536,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 8960,
12
+ "layer_types": [
13
+ "full_attention",
14
+ "full_attention",
15
+ "full_attention",
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention"
41
+ ],
42
+ "max_position_embeddings": 32768,
43
+ "max_window_layers": 21,
44
+ "model_type": "qwen2",
45
+ "num_attention_heads": 12,
46
+ "num_hidden_layers": 28,
47
+ "num_key_value_heads": 2,
48
+ "pad_token_id": 151643,
49
+ "rms_norm_eps": 1e-06,
50
+ "rope_scaling": null,
51
+ "rope_theta": 1000000.0,
52
+ "sliding_window": null,
53
+ "tie_word_embeddings": true,
54
+ "transformers_version": "4.57.1",
55
+ "use_cache": false,
56
+ "use_sliding_window": false,
57
+ "vocab_size": 151936
58
+ }
checkpoint-107/generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_sample": true,
3
+ "pad_token_id": 151643,
4
+ "repetition_penalty": 1.1,
5
+ "temperature": 0.7,
6
+ "top_k": 20,
7
+ "top_p": 0.8,
8
+ "transformers_version": "4.57.1"
9
+ }
checkpoint-107/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-107/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c93779623565bec515d60d0d5ba3ac8bb8d891825cd93bd6ffb072dbf265550
3
+ size 3087467144
checkpoint-107/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3cf57a2c37522bc1897e9890c7c7403fc7f35354df7532984151af4cbfc2c3c5
3
+ size 12350320650
checkpoint-107/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec9e44811f24164e11bec7b150f7b4179380583f524ef1a6d16b4838107e5d16
3
+ size 14709
checkpoint-107/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a67ed9857c6e6351f390426ac7a185bbcb92711588dc22a658de8cf5976ff88
3
+ size 1465
checkpoint-107/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
checkpoint-107/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f650899cdee1a74abb093b96e45774316750373414f9e11647b8480290d3937f
3
+ size 11421988
checkpoint-107/tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "split_special_tokens": false,
205
+ "tokenizer_class": "Qwen2Tokenizer",
206
+ "unk_token": null
207
+ }
checkpoint-107/trainer_state.json ADDED
@@ -0,0 +1,2282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "episode": 1712,
6
+ "epoch": 3.0035087719298246,
7
+ "eval_steps": 500,
8
+ "global_step": 107,
9
+ "is_hyper_param_search": false,
10
+ "is_local_process_zero": true,
11
+ "is_world_process_zero": true,
12
+ "log_history": [
13
+ {
14
+ "episode": 16,
15
+ "epoch": 0.028070175438596492,
16
+ "eps": 2,
17
+ "loss/policy_avg": 0.040254347026348114,
18
+ "loss/value_avg": 4.365694046020508,
19
+ "lr": 1.41e-05,
20
+ "objective/entropy": 20.665752410888672,
21
+ "objective/kl": 38.31879425048828,
22
+ "objective/non_score_reward": -1.9159398078918457,
23
+ "objective/rlhf_reward": -0.5565648078918457,
24
+ "objective/scores": 1.359375,
25
+ "policy/approxkl_avg": 5.953035354614258,
26
+ "policy/clipfrac_avg": 0.2399764209985733,
27
+ "policy/entropy_avg": 0.6761884689331055,
28
+ "step": 1,
29
+ "val/clipfrac_avg": 0.26650944352149963,
30
+ "val/num_eos_tokens": 0,
31
+ "val/ratio": 0.8525064587593079,
32
+ "val/ratio_var": 0.0006113838753663003
33
+ },
34
+ {
35
+ "episode": 32,
36
+ "epoch": 0.056140350877192984,
37
+ "eps": 2,
38
+ "loss/policy_avg": 0.0446447990834713,
39
+ "loss/value_avg": 3.0273280143737793,
40
+ "lr": 1.3968224299065421e-05,
41
+ "objective/entropy": 21.37685203552246,
42
+ "objective/kl": 73.40159606933594,
43
+ "objective/non_score_reward": -3.6700797080993652,
44
+ "objective/rlhf_reward": -2.5294547080993652,
45
+ "objective/scores": 1.140625,
46
+ "policy/approxkl_avg": 6.8038129806518555,
47
+ "policy/clipfrac_avg": 0.22936320304870605,
48
+ "policy/entropy_avg": 0.6141112446784973,
49
+ "step": 2,
50
+ "val/clipfrac_avg": 0.2458726465702057,
51
+ "val/num_eos_tokens": 0,
52
+ "val/ratio": 0.8650269508361816,
53
+ "val/ratio_var": 0.0003330775070935488
54
+ },
55
+ {
56
+ "episode": 48,
57
+ "epoch": 0.08421052631578947,
58
+ "eps": 3,
59
+ "loss/policy_avg": 0.07702315598726273,
60
+ "loss/value_avg": 2.1064209938049316,
61
+ "lr": 1.3836448598130842e-05,
62
+ "objective/entropy": 26.10182762145996,
63
+ "objective/kl": 74.03025817871094,
64
+ "objective/non_score_reward": -3.7015130519866943,
65
+ "objective/rlhf_reward": -2.6546380519866943,
66
+ "objective/scores": 1.046875,
67
+ "policy/approxkl_avg": 5.6432695388793945,
68
+ "policy/clipfrac_avg": 0.2087264060974121,
69
+ "policy/entropy_avg": 0.7391780614852905,
70
+ "step": 3,
71
+ "val/clipfrac_avg": 0.21639150381088257,
72
+ "val/num_eos_tokens": 0,
73
+ "val/ratio": 0.8773487210273743,
74
+ "val/ratio_var": 0.00039911610656417906
75
+ },
76
+ {
77
+ "episode": 64,
78
+ "epoch": 0.11228070175438597,
79
+ "eps": 3,
80
+ "loss/policy_avg": 0.03978118300437927,
81
+ "loss/value_avg": 1.5315862894058228,
82
+ "lr": 1.3704672897196262e-05,
83
+ "objective/entropy": 20.781639099121094,
84
+ "objective/kl": 68.94087219238281,
85
+ "objective/non_score_reward": -3.4470434188842773,
86
+ "objective/rlhf_reward": -2.4470434188842773,
87
+ "objective/scores": 1.0,
88
+ "policy/approxkl_avg": 4.726678848266602,
89
+ "policy/clipfrac_avg": 0.20341980457305908,
90
+ "policy/entropy_avg": 0.7119085788726807,
91
+ "step": 4,
92
+ "val/clipfrac_avg": 0.16096699237823486,
93
+ "val/num_eos_tokens": 0,
94
+ "val/ratio": 0.8485063314437866,
95
+ "val/ratio_var": 0.0003752955235540867
96
+ },
97
+ {
98
+ "episode": 80,
99
+ "epoch": 0.14035087719298245,
100
+ "eps": 3,
101
+ "loss/policy_avg": 0.06184825301170349,
102
+ "loss/value_avg": 0.9904976487159729,
103
+ "lr": 1.3572897196261683e-05,
104
+ "objective/entropy": 22.48508071899414,
105
+ "objective/kl": 75.60542297363281,
106
+ "objective/non_score_reward": -3.780271291732788,
107
+ "objective/rlhf_reward": -3.272458791732788,
108
+ "objective/scores": 0.5078125,
109
+ "policy/approxkl_avg": 4.352260589599609,
110
+ "policy/clipfrac_avg": 0.2146226465702057,
111
+ "policy/entropy_avg": 0.7200251817703247,
112
+ "step": 5,
113
+ "val/clipfrac_avg": 0.09669811278581619,
114
+ "val/num_eos_tokens": 0,
115
+ "val/ratio": 0.8546276092529297,
116
+ "val/ratio_var": 0.00010198648669756949
117
+ },
118
+ {
119
+ "episode": 96,
120
+ "epoch": 0.16842105263157894,
121
+ "eps": 3,
122
+ "loss/policy_avg": 0.046236515045166016,
123
+ "loss/value_avg": 1.2219572067260742,
124
+ "lr": 1.3441121495327103e-05,
125
+ "objective/entropy": 22.197113037109375,
126
+ "objective/kl": 82.56051635742188,
127
+ "objective/non_score_reward": -4.128026008605957,
128
+ "objective/rlhf_reward": -3.253026008605957,
129
+ "objective/scores": 0.875,
130
+ "policy/approxkl_avg": 3.8577029705047607,
131
+ "policy/clipfrac_avg": 0.22641509771347046,
132
+ "policy/entropy_avg": 0.7238099575042725,
133
+ "step": 6,
134
+ "val/clipfrac_avg": 0.0383254736661911,
135
+ "val/num_eos_tokens": 0,
136
+ "val/ratio": 0.8292837738990784,
137
+ "val/ratio_var": 6.185401434777305e-05
138
+ },
139
+ {
140
+ "episode": 112,
141
+ "epoch": 0.19649122807017544,
142
+ "eps": 3,
143
+ "loss/policy_avg": 0.06606701016426086,
144
+ "loss/value_avg": 1.325202226638794,
145
+ "lr": 1.3309345794392524e-05,
146
+ "objective/entropy": 27.022249221801758,
147
+ "objective/kl": 101.50897216796875,
148
+ "objective/non_score_reward": -5.075448513031006,
149
+ "objective/rlhf_reward": -3.653573513031006,
150
+ "objective/scores": 1.421875,
151
+ "policy/approxkl_avg": 5.585095405578613,
152
+ "policy/clipfrac_avg": 0.24174529314041138,
153
+ "policy/entropy_avg": 0.8859995603561401,
154
+ "step": 7,
155
+ "val/clipfrac_avg": 0.028301887214183807,
156
+ "val/num_eos_tokens": 0,
157
+ "val/ratio": 0.8486474752426147,
158
+ "val/ratio_var": 0.0005388148711062968
159
+ },
160
+ {
161
+ "episode": 128,
162
+ "epoch": 0.22456140350877193,
163
+ "eps": 3,
164
+ "loss/policy_avg": 0.06467999517917633,
165
+ "loss/value_avg": 0.9481044411659241,
166
+ "lr": 1.3177570093457945e-05,
167
+ "objective/entropy": 26.585155487060547,
168
+ "objective/kl": 116.76457214355469,
169
+ "objective/non_score_reward": -5.838229179382324,
170
+ "objective/rlhf_reward": -4.111666679382324,
171
+ "objective/scores": 1.7265625,
172
+ "policy/approxkl_avg": 5.351860046386719,
173
+ "policy/clipfrac_avg": 0.24469339847564697,
174
+ "policy/entropy_avg": 0.8991943597793579,
175
+ "step": 8,
176
+ "val/clipfrac_avg": 0.08785377442836761,
177
+ "val/num_eos_tokens": 0,
178
+ "val/ratio": 0.8524093627929688,
179
+ "val/ratio_var": 3.493872281978838e-05
180
+ },
181
+ {
182
+ "episode": 144,
183
+ "epoch": 0.25263157894736843,
184
+ "eps": 3,
185
+ "loss/policy_avg": 0.03949737548828125,
186
+ "loss/value_avg": 0.9386715888977051,
187
+ "lr": 1.3045794392523365e-05,
188
+ "objective/entropy": 29.664459228515625,
189
+ "objective/kl": 132.19940185546875,
190
+ "objective/non_score_reward": -6.6099700927734375,
191
+ "objective/rlhf_reward": -5.6763763427734375,
192
+ "objective/scores": 0.93359375,
193
+ "policy/approxkl_avg": 5.967764377593994,
194
+ "policy/clipfrac_avg": 0.2057783007621765,
195
+ "policy/entropy_avg": 1.0122931003570557,
196
+ "step": 9,
197
+ "val/clipfrac_avg": 0.017099056392908096,
198
+ "val/num_eos_tokens": 0,
199
+ "val/ratio": 0.8205329179763794,
200
+ "val/ratio_var": 7.763963367324322e-05
201
+ },
202
+ {
203
+ "episode": 160,
204
+ "epoch": 0.2807017543859649,
205
+ "eps": 3,
206
+ "loss/policy_avg": 0.0211679395288229,
207
+ "loss/value_avg": 1.213599681854248,
208
+ "lr": 1.2914018691588786e-05,
209
+ "objective/entropy": 31.076860427856445,
210
+ "objective/kl": 135.477294921875,
211
+ "objective/non_score_reward": -6.77386474609375,
212
+ "objective/rlhf_reward": -5.00042724609375,
213
+ "objective/scores": 1.7734375,
214
+ "policy/approxkl_avg": 3.5569186210632324,
215
+ "policy/clipfrac_avg": 0.21304652094841003,
216
+ "policy/entropy_avg": 1.1141610145568848,
217
+ "step": 10,
218
+ "val/clipfrac_avg": 0.06719152629375458,
219
+ "val/num_eos_tokens": 0,
220
+ "val/ratio": 0.8466310501098633,
221
+ "val/ratio_var": 0.0001414915022905916
222
+ },
223
+ {
224
+ "episode": 176,
225
+ "epoch": 0.3087719298245614,
226
+ "eps": 3,
227
+ "loss/policy_avg": 0.0240048635751009,
228
+ "loss/value_avg": 1.3511584997177124,
229
+ "lr": 1.2782242990654206e-05,
230
+ "objective/entropy": 32.90422058105469,
231
+ "objective/kl": 146.03787231445312,
232
+ "objective/non_score_reward": -7.301893711090088,
233
+ "objective/rlhf_reward": -5.630018711090088,
234
+ "objective/scores": 1.671875,
235
+ "policy/approxkl_avg": 5.625584125518799,
236
+ "policy/clipfrac_avg": 0.20400942862033844,
237
+ "policy/entropy_avg": 1.1266499757766724,
238
+ "step": 11,
239
+ "val/clipfrac_avg": 0.05483490601181984,
240
+ "val/num_eos_tokens": 0,
241
+ "val/ratio": 0.8710312843322754,
242
+ "val/ratio_var": 0.00014447471767198294
243
+ },
244
+ {
245
+ "episode": 192,
246
+ "epoch": 0.3368421052631579,
247
+ "eps": 3,
248
+ "loss/policy_avg": 0.05039631202816963,
249
+ "loss/value_avg": 1.4507163763046265,
250
+ "lr": 1.2650467289719627e-05,
251
+ "objective/entropy": 34.778846740722656,
252
+ "objective/kl": 152.17945861816406,
253
+ "objective/non_score_reward": -7.608973503112793,
254
+ "objective/rlhf_reward": -5.726161003112793,
255
+ "objective/scores": 1.8828125,
256
+ "policy/approxkl_avg": 4.985030174255371,
257
+ "policy/clipfrac_avg": 0.1987028270959854,
258
+ "policy/entropy_avg": 1.1480720043182373,
259
+ "step": 12,
260
+ "val/clipfrac_avg": 0.05188679322600365,
261
+ "val/num_eos_tokens": 0,
262
+ "val/ratio": 0.8588758111000061,
263
+ "val/ratio_var": 3.4296579542569816e-05
264
+ },
265
+ {
266
+ "episode": 208,
267
+ "epoch": 0.3649122807017544,
268
+ "eps": 3,
269
+ "loss/policy_avg": 0.037938639521598816,
270
+ "loss/value_avg": 1.5314528942108154,
271
+ "lr": 1.2518691588785048e-05,
272
+ "objective/entropy": 45.65214538574219,
273
+ "objective/kl": 132.1256866455078,
274
+ "objective/non_score_reward": -6.606284141540527,
275
+ "objective/rlhf_reward": -6.803549766540527,
276
+ "objective/scores": -0.197265625,
277
+ "policy/approxkl_avg": 5.0863566398620605,
278
+ "policy/clipfrac_avg": 0.18867924809455872,
279
+ "policy/entropy_avg": 1.3655226230621338,
280
+ "step": 13,
281
+ "val/clipfrac_avg": 0.06367924809455872,
282
+ "val/num_eos_tokens": 0,
283
+ "val/ratio": 0.8282334804534912,
284
+ "val/ratio_var": 0.0004519254434853792
285
+ },
286
+ {
287
+ "episode": 224,
288
+ "epoch": 0.3929824561403509,
289
+ "eps": 3,
290
+ "loss/policy_avg": 0.036732763051986694,
291
+ "loss/value_avg": 2.189356803894043,
292
+ "lr": 1.2386915887850468e-05,
293
+ "objective/entropy": 38.35211944580078,
294
+ "objective/kl": 113.13496398925781,
295
+ "objective/non_score_reward": -5.656748294830322,
296
+ "objective/rlhf_reward": -4.539560794830322,
297
+ "objective/scores": 1.1171875,
298
+ "policy/approxkl_avg": 4.408792972564697,
299
+ "policy/clipfrac_avg": 0.21201542019844055,
300
+ "policy/entropy_avg": 1.2410473823547363,
301
+ "step": 14,
302
+ "val/clipfrac_avg": 0.10222071409225464,
303
+ "val/num_eos_tokens": 0,
304
+ "val/ratio": 0.8754662275314331,
305
+ "val/ratio_var": 0.0007480247295461595
306
+ },
307
+ {
308
+ "episode": 240,
309
+ "epoch": 0.42105263157894735,
310
+ "eps": 3,
311
+ "loss/policy_avg": 0.03690744563937187,
312
+ "loss/value_avg": 1.4401739835739136,
313
+ "lr": 1.2255140186915889e-05,
314
+ "objective/entropy": 38.69694519042969,
315
+ "objective/kl": 115.77085876464844,
316
+ "objective/non_score_reward": -5.788543224334717,
317
+ "objective/rlhf_reward": -5.627410411834717,
318
+ "objective/scores": 0.1611328125,
319
+ "policy/approxkl_avg": 4.673148155212402,
320
+ "policy/clipfrac_avg": 0.1833726465702057,
321
+ "policy/entropy_avg": 1.2108347415924072,
322
+ "step": 15,
323
+ "val/clipfrac_avg": 0.01179245300590992,
324
+ "val/num_eos_tokens": 0,
325
+ "val/ratio": 0.8487275838851929,
326
+ "val/ratio_var": 0.0008522844291292131
327
+ },
328
+ {
329
+ "episode": 256,
330
+ "epoch": 0.44912280701754387,
331
+ "eps": 3,
332
+ "loss/policy_avg": 0.019956424832344055,
333
+ "loss/value_avg": 1.5383408069610596,
334
+ "lr": 1.212336448598131e-05,
335
+ "objective/entropy": 32.02195739746094,
336
+ "objective/kl": 122.87109375,
337
+ "objective/non_score_reward": -6.1435546875,
338
+ "objective/rlhf_reward": -5.7470703125,
339
+ "objective/scores": 0.396484375,
340
+ "policy/approxkl_avg": 5.454257965087891,
341
+ "policy/clipfrac_avg": 0.22314535081386566,
342
+ "policy/entropy_avg": 1.1098759174346924,
343
+ "step": 16,
344
+ "val/clipfrac_avg": 0.03384597226977348,
345
+ "val/num_eos_tokens": 0,
346
+ "val/ratio": 0.8839770555496216,
347
+ "val/ratio_var": 0.0004538022622000426
348
+ },
349
+ {
350
+ "episode": 272,
351
+ "epoch": 0.47719298245614034,
352
+ "eps": 3,
353
+ "loss/policy_avg": 0.016002152115106583,
354
+ "loss/value_avg": 1.5554126501083374,
355
+ "lr": 1.199158878504673e-05,
356
+ "objective/entropy": 28.698740005493164,
357
+ "objective/kl": 129.80386352539062,
358
+ "objective/non_score_reward": -6.4901933670043945,
359
+ "objective/rlhf_reward": -6.1796464920043945,
360
+ "objective/scores": 0.310546875,
361
+ "policy/approxkl_avg": 3.0268969535827637,
362
+ "policy/clipfrac_avg": 0.19969519972801208,
363
+ "policy/entropy_avg": 0.9890843629837036,
364
+ "step": 17,
365
+ "val/clipfrac_avg": 0.07565668225288391,
366
+ "val/num_eos_tokens": 0,
367
+ "val/ratio": 0.8847370743751526,
368
+ "val/ratio_var": 0.000195541579159908
369
+ },
370
+ {
371
+ "episode": 288,
372
+ "epoch": 0.5052631578947369,
373
+ "eps": 3,
374
+ "loss/policy_avg": 0.013964798301458359,
375
+ "loss/value_avg": 1.590545892715454,
376
+ "lr": 1.185981308411215e-05,
377
+ "objective/entropy": 12.885665893554688,
378
+ "objective/kl": 107.77202606201172,
379
+ "objective/non_score_reward": -5.388601303100586,
380
+ "objective/rlhf_reward": -4.169851303100586,
381
+ "objective/scores": 1.21875,
382
+ "policy/approxkl_avg": 5.9386420249938965,
383
+ "policy/clipfrac_avg": 0.18009786307811737,
384
+ "policy/entropy_avg": 0.6803157925605774,
385
+ "step": 18,
386
+ "val/clipfrac_avg": 0.12902730703353882,
387
+ "val/num_eos_tokens": 0,
388
+ "val/ratio": 0.9129149913787842,
389
+ "val/ratio_var": 0.0004151359898969531
390
+ },
391
+ {
392
+ "episode": 304,
393
+ "epoch": 0.5333333333333333,
394
+ "eps": 3,
395
+ "loss/policy_avg": 0.02675745077431202,
396
+ "loss/value_avg": 1.745023488998413,
397
+ "lr": 1.1728037383177571e-05,
398
+ "objective/entropy": 22.75409698486328,
399
+ "objective/kl": 119.34423828125,
400
+ "objective/non_score_reward": -5.967212200164795,
401
+ "objective/rlhf_reward": -5.197680950164795,
402
+ "objective/scores": 0.76953125,
403
+ "policy/approxkl_avg": 6.532215118408203,
404
+ "policy/clipfrac_avg": 0.1759602427482605,
405
+ "policy/entropy_avg": 0.8050931692123413,
406
+ "step": 19,
407
+ "val/clipfrac_avg": 0.0052770450711250305,
408
+ "val/num_eos_tokens": 0,
409
+ "val/ratio": 0.9205665588378906,
410
+ "val/ratio_var": 0.0005024418351240456
411
+ },
412
+ {
413
+ "episode": 320,
414
+ "epoch": 0.5614035087719298,
415
+ "eps": 3,
416
+ "loss/policy_avg": 0.014609305188059807,
417
+ "loss/value_avg": 1.7434797286987305,
418
+ "lr": 1.159626168224299e-05,
419
+ "objective/entropy": 29.933460235595703,
420
+ "objective/kl": 144.45672607421875,
421
+ "objective/non_score_reward": -7.222836494445801,
422
+ "objective/rlhf_reward": -6.675961494445801,
423
+ "objective/scores": 0.546875,
424
+ "policy/approxkl_avg": 7.733211517333984,
425
+ "policy/clipfrac_avg": 0.1762971729040146,
426
+ "policy/entropy_avg": 0.8804416656494141,
427
+ "step": 20,
428
+ "val/clipfrac_avg": 0.026533018797636032,
429
+ "val/num_eos_tokens": 0,
430
+ "val/ratio": 0.8693416714668274,
431
+ "val/ratio_var": 0.000557584862690419
432
+ },
433
+ {
434
+ "episode": 336,
435
+ "epoch": 0.5894736842105263,
436
+ "eps": 3,
437
+ "loss/policy_avg": 0.019340746104717255,
438
+ "loss/value_avg": 1.471176266670227,
439
+ "lr": 1.146448598130841e-05,
440
+ "objective/entropy": 23.77098846435547,
441
+ "objective/kl": 149.46974182128906,
442
+ "objective/non_score_reward": -7.473487377166748,
443
+ "objective/rlhf_reward": -5.668799877166748,
444
+ "objective/scores": 1.8046875,
445
+ "policy/approxkl_avg": 5.010880470275879,
446
+ "policy/clipfrac_avg": 0.18341976404190063,
447
+ "policy/entropy_avg": 0.973499596118927,
448
+ "step": 21,
449
+ "val/clipfrac_avg": 0.016483133658766747,
450
+ "val/num_eos_tokens": 0,
451
+ "val/ratio": 0.8893705606460571,
452
+ "val/ratio_var": 0.0012118957238271832
453
+ },
454
+ {
455
+ "episode": 352,
456
+ "epoch": 0.6175438596491228,
457
+ "eps": 3,
458
+ "loss/policy_avg": 0.009950436651706696,
459
+ "loss/value_avg": 1.1944406032562256,
460
+ "lr": 1.1332710280373831e-05,
461
+ "objective/entropy": 23.91471290588379,
462
+ "objective/kl": 155.48843383789062,
463
+ "objective/non_score_reward": -7.774421691894531,
464
+ "objective/rlhf_reward": -7.125984191894531,
465
+ "objective/scores": 0.6484375,
466
+ "policy/approxkl_avg": 6.8445940017700195,
467
+ "policy/clipfrac_avg": 0.16214622557163239,
468
+ "policy/entropy_avg": 0.837052047252655,
469
+ "step": 22,
470
+ "val/clipfrac_avg": 0.09257075190544128,
471
+ "val/num_eos_tokens": 0,
472
+ "val/ratio": 0.9027042984962463,
473
+ "val/ratio_var": 0.00019580482330638915
474
+ },
475
+ {
476
+ "episode": 368,
477
+ "epoch": 0.6456140350877193,
478
+ "eps": 3,
479
+ "loss/policy_avg": 0.021877210587263107,
480
+ "loss/value_avg": 1.0937385559082031,
481
+ "lr": 1.1200934579439252e-05,
482
+ "objective/entropy": 23.552492141723633,
483
+ "objective/kl": 172.6247100830078,
484
+ "objective/non_score_reward": -8.631235122680664,
485
+ "objective/rlhf_reward": -6.787485122680664,
486
+ "objective/scores": 1.84375,
487
+ "policy/approxkl_avg": 6.227967262268066,
488
+ "policy/clipfrac_avg": 0.16203010082244873,
489
+ "policy/entropy_avg": 0.8671172857284546,
490
+ "step": 23,
491
+ "val/clipfrac_avg": 0.018410665914416313,
492
+ "val/num_eos_tokens": 0,
493
+ "val/ratio": 0.8919187784194946,
494
+ "val/ratio_var": 0.001309347921051085
495
+ },
496
+ {
497
+ "episode": 384,
498
+ "epoch": 0.6736842105263158,
499
+ "eps": 3,
500
+ "loss/policy_avg": 0.036975178867578506,
501
+ "loss/value_avg": 1.1198090314865112,
502
+ "lr": 1.1069158878504672e-05,
503
+ "objective/entropy": 20.281631469726562,
504
+ "objective/kl": 170.099365234375,
505
+ "objective/non_score_reward": -8.504968643188477,
506
+ "objective/rlhf_reward": -7.262781143188477,
507
+ "objective/scores": 1.2421875,
508
+ "policy/approxkl_avg": 6.825319766998291,
509
+ "policy/clipfrac_avg": 0.14049983024597168,
510
+ "policy/entropy_avg": 0.802283525466919,
511
+ "step": 24,
512
+ "val/clipfrac_avg": 0.0785929411649704,
513
+ "val/num_eos_tokens": 0,
514
+ "val/ratio": 0.8865500092506409,
515
+ "val/ratio_var": 0.0006475243135355413
516
+ },
517
+ {
518
+ "episode": 400,
519
+ "epoch": 0.7017543859649122,
520
+ "eps": 3,
521
+ "loss/policy_avg": 0.0237587820738554,
522
+ "loss/value_avg": 1.3415908813476562,
523
+ "lr": 1.0937383177570093e-05,
524
+ "objective/entropy": 21.36954689025879,
525
+ "objective/kl": 179.03819274902344,
526
+ "objective/non_score_reward": -8.951910018920898,
527
+ "objective/rlhf_reward": -7.334722518920898,
528
+ "objective/scores": 1.6171875,
529
+ "policy/approxkl_avg": 7.667660713195801,
530
+ "policy/clipfrac_avg": 0.11468379199504852,
531
+ "policy/entropy_avg": 0.7623839974403381,
532
+ "step": 25,
533
+ "val/clipfrac_avg": 0.27072370052337646,
534
+ "val/num_eos_tokens": 0,
535
+ "val/ratio": 0.8912988901138306,
536
+ "val/ratio_var": 2.9881122827646323e-05
537
+ },
538
+ {
539
+ "episode": 416,
540
+ "epoch": 0.7298245614035088,
541
+ "eps": 3,
542
+ "loss/policy_avg": 0.018890127539634705,
543
+ "loss/value_avg": 1.2341463565826416,
544
+ "lr": 1.0805607476635514e-05,
545
+ "objective/entropy": 16.852466583251953,
546
+ "objective/kl": 179.8382568359375,
547
+ "objective/non_score_reward": -8.991912841796875,
548
+ "objective/rlhf_reward": -8.312225341796875,
549
+ "objective/scores": 0.6796875,
550
+ "policy/approxkl_avg": 7.357123374938965,
551
+ "policy/clipfrac_avg": 0.11944779008626938,
552
+ "policy/entropy_avg": 0.7555092573165894,
553
+ "step": 26,
554
+ "val/clipfrac_avg": 0.2508935034275055,
555
+ "val/num_eos_tokens": 0,
556
+ "val/ratio": 0.8802620768547058,
557
+ "val/ratio_var": 0.00018670025747269392
558
+ },
559
+ {
560
+ "episode": 432,
561
+ "epoch": 0.7578947368421053,
562
+ "eps": 3,
563
+ "loss/policy_avg": 0.021100062876939774,
564
+ "loss/value_avg": 0.9362931847572327,
565
+ "lr": 1.0673831775700934e-05,
566
+ "objective/entropy": 21.68787384033203,
567
+ "objective/kl": 183.68655395507812,
568
+ "objective/non_score_reward": -9.184328079223633,
569
+ "objective/rlhf_reward": -7.199953079223633,
570
+ "objective/scores": 1.984375,
571
+ "policy/approxkl_avg": 5.103993892669678,
572
+ "policy/clipfrac_avg": 0.12146226316690445,
573
+ "policy/entropy_avg": 0.7987968921661377,
574
+ "step": 27,
575
+ "val/clipfrac_avg": 0.05424528568983078,
576
+ "val/num_eos_tokens": 0,
577
+ "val/ratio": 0.8928566575050354,
578
+ "val/ratio_var": 0.00013602118997368962
579
+ },
580
+ {
581
+ "episode": 448,
582
+ "epoch": 0.7859649122807018,
583
+ "eps": 3,
584
+ "loss/policy_avg": 0.011967534199357033,
585
+ "loss/value_avg": 1.0544021129608154,
586
+ "lr": 1.0542056074766355e-05,
587
+ "objective/entropy": 23.05614471435547,
588
+ "objective/kl": 182.8275146484375,
589
+ "objective/non_score_reward": -9.141375541687012,
590
+ "objective/rlhf_reward": -7.789813041687012,
591
+ "objective/scores": 1.3515625,
592
+ "policy/approxkl_avg": 4.963105201721191,
593
+ "policy/clipfrac_avg": 0.14268869161605835,
594
+ "policy/entropy_avg": 0.8151739835739136,
595
+ "step": 28,
596
+ "val/clipfrac_avg": 0.2146226465702057,
597
+ "val/num_eos_tokens": 0,
598
+ "val/ratio": 0.8715541362762451,
599
+ "val/ratio_var": 7.266430475283414e-05
600
+ },
601
+ {
602
+ "episode": 464,
603
+ "epoch": 0.8140350877192982,
604
+ "eps": 3,
605
+ "loss/policy_avg": 0.011134720407426357,
606
+ "loss/value_avg": 0.775411069393158,
607
+ "lr": 1.0410280373831775e-05,
608
+ "objective/entropy": 25.063724517822266,
609
+ "objective/kl": 188.51422119140625,
610
+ "objective/non_score_reward": -9.425710678100586,
611
+ "objective/rlhf_reward": -8.394460678100586,
612
+ "objective/scores": 1.03125,
613
+ "policy/approxkl_avg": 7.372692108154297,
614
+ "policy/clipfrac_avg": 0.1320754736661911,
615
+ "policy/entropy_avg": 0.8475193977355957,
616
+ "step": 29,
617
+ "val/clipfrac_avg": 0.04245283082127571,
618
+ "val/num_eos_tokens": 0,
619
+ "val/ratio": 0.891329824924469,
620
+ "val/ratio_var": 3.0236831207730575e-06
621
+ },
622
+ {
623
+ "episode": 480,
624
+ "epoch": 0.8421052631578947,
625
+ "eps": 3,
626
+ "loss/policy_avg": 0.017270730808377266,
627
+ "loss/value_avg": 0.7942019701004028,
628
+ "lr": 1.0278504672897196e-05,
629
+ "objective/entropy": 22.402624130249023,
630
+ "objective/kl": 185.72421264648438,
631
+ "objective/non_score_reward": -9.286211013793945,
632
+ "objective/rlhf_reward": -7.606523513793945,
633
+ "objective/scores": 1.6796875,
634
+ "policy/approxkl_avg": 8.755260467529297,
635
+ "policy/clipfrac_avg": 0.11261792480945587,
636
+ "policy/entropy_avg": 0.799101710319519,
637
+ "step": 30,
638
+ "val/clipfrac_avg": 0.014740565791726112,
639
+ "val/num_eos_tokens": 0,
640
+ "val/ratio": 0.8892796635627747,
641
+ "val/ratio_var": 6.262218084884807e-05
642
+ },
643
+ {
644
+ "episode": 496,
645
+ "epoch": 0.8701754385964913,
646
+ "eps": 3,
647
+ "loss/policy_avg": 0.019226763397455215,
648
+ "loss/value_avg": 0.6826229095458984,
649
+ "lr": 1.0146728971962616e-05,
650
+ "objective/entropy": 23.43070411682129,
651
+ "objective/kl": 201.67799377441406,
652
+ "objective/non_score_reward": -10.083900451660156,
653
+ "objective/rlhf_reward": -9.154212951660156,
654
+ "objective/scores": 0.9296875,
655
+ "policy/approxkl_avg": 8.144369125366211,
656
+ "policy/clipfrac_avg": 0.12264151126146317,
657
+ "policy/entropy_avg": 0.8400179147720337,
658
+ "step": 31,
659
+ "val/clipfrac_avg": 0.2228773534297943,
660
+ "val/num_eos_tokens": 0,
661
+ "val/ratio": 0.8812745213508606,
662
+ "val/ratio_var": 0.00010276544344378635
663
+ },
664
+ {
665
+ "episode": 512,
666
+ "epoch": 0.8982456140350877,
667
+ "eps": 3,
668
+ "loss/policy_avg": 0.005793810822069645,
669
+ "loss/value_avg": 0.8381754159927368,
670
+ "lr": 1.0014953271028037e-05,
671
+ "objective/entropy": 21.22824478149414,
672
+ "objective/kl": 185.61614990234375,
673
+ "objective/non_score_reward": -9.280807495117188,
674
+ "objective/rlhf_reward": -7.3511199951171875,
675
+ "objective/scores": 1.9296875,
676
+ "policy/approxkl_avg": 7.533528804779053,
677
+ "policy/clipfrac_avg": 0.12205187976360321,
678
+ "policy/entropy_avg": 0.7801576256752014,
679
+ "step": 32,
680
+ "val/clipfrac_avg": 0.09787735342979431,
681
+ "val/num_eos_tokens": 0,
682
+ "val/ratio": 0.8987904787063599,
683
+ "val/ratio_var": 0.00018136559810955077
684
+ },
685
+ {
686
+ "episode": 528,
687
+ "epoch": 0.9263157894736842,
688
+ "eps": 3,
689
+ "loss/policy_avg": 0.02254486456513405,
690
+ "loss/value_avg": 0.8877236843109131,
691
+ "lr": 9.883177570093458e-06,
692
+ "objective/entropy": 23.442203521728516,
693
+ "objective/kl": 188.65457153320312,
694
+ "objective/non_score_reward": -9.432729721069336,
695
+ "objective/rlhf_reward": -7.534292221069336,
696
+ "objective/scores": 1.8984375,
697
+ "policy/approxkl_avg": 5.329720497131348,
698
+ "policy/clipfrac_avg": 0.12323113530874252,
699
+ "policy/entropy_avg": 0.8007351160049438,
700
+ "step": 33,
701
+ "val/clipfrac_avg": 0.036556605249643326,
702
+ "val/num_eos_tokens": 0,
703
+ "val/ratio": 0.8945379257202148,
704
+ "val/ratio_var": 0.0001225806336151436
705
+ },
706
+ {
707
+ "episode": 544,
708
+ "epoch": 0.9543859649122807,
709
+ "eps": 3,
710
+ "loss/policy_avg": 0.01662730611860752,
711
+ "loss/value_avg": 0.637324869632721,
712
+ "lr": 9.751401869158878e-06,
713
+ "objective/entropy": 20.620216369628906,
714
+ "objective/kl": 186.25180053710938,
715
+ "objective/non_score_reward": -9.312589645385742,
716
+ "objective/rlhf_reward": -7.851652145385742,
717
+ "objective/scores": 1.4609375,
718
+ "policy/approxkl_avg": 8.322406768798828,
719
+ "policy/clipfrac_avg": 0.125,
720
+ "policy/entropy_avg": 0.7402328848838806,
721
+ "step": 34,
722
+ "val/clipfrac_avg": 0.002358490601181984,
723
+ "val/num_eos_tokens": 0,
724
+ "val/ratio": 0.8899158239364624,
725
+ "val/ratio_var": 0.0002540757122915238
726
+ },
727
+ {
728
+ "episode": 560,
729
+ "epoch": 0.9824561403508771,
730
+ "eps": 3,
731
+ "loss/policy_avg": 0.0003616265021264553,
732
+ "loss/value_avg": 0.6015822291374207,
733
+ "lr": 9.619626168224299e-06,
734
+ "objective/entropy": 21.12371063232422,
735
+ "objective/kl": 186.5103302001953,
736
+ "objective/non_score_reward": -9.325516700744629,
737
+ "objective/rlhf_reward": -7.817704200744629,
738
+ "objective/scores": 1.5078125,
739
+ "policy/approxkl_avg": 6.682015419006348,
740
+ "policy/clipfrac_avg": 0.13443395495414734,
741
+ "policy/entropy_avg": 0.7727504372596741,
742
+ "step": 35,
743
+ "val/clipfrac_avg": 0.037146229296922684,
744
+ "val/num_eos_tokens": 0,
745
+ "val/ratio": 0.8780511021614075,
746
+ "val/ratio_var": 6.755981303285807e-05
747
+ },
748
+ {
749
+ "episode": 576,
750
+ "epoch": 1.0105263157894737,
751
+ "eps": 3,
752
+ "loss/policy_avg": 0.009338408708572388,
753
+ "loss/value_avg": 0.6681860685348511,
754
+ "lr": 9.48785046728972e-06,
755
+ "objective/entropy": 21.184894561767578,
756
+ "objective/kl": 192.3019256591797,
757
+ "objective/non_score_reward": -9.615096092224121,
758
+ "objective/rlhf_reward": -7.443221092224121,
759
+ "objective/scores": 2.171875,
760
+ "policy/approxkl_avg": 6.053627967834473,
761
+ "policy/clipfrac_avg": 0.1179245263338089,
762
+ "policy/entropy_avg": 0.7489176392555237,
763
+ "step": 36,
764
+ "val/clipfrac_avg": 0.15683962404727936,
765
+ "val/num_eos_tokens": 0,
766
+ "val/ratio": 0.8922820091247559,
767
+ "val/ratio_var": 0.00026253468240611255
768
+ },
769
+ {
770
+ "episode": 592,
771
+ "epoch": 1.03859649122807,
772
+ "eps": 3,
773
+ "loss/policy_avg": 0.006294197402894497,
774
+ "loss/value_avg": 0.8030184507369995,
775
+ "lr": 9.35607476635514e-06,
776
+ "objective/entropy": 20.368942260742188,
777
+ "objective/kl": 190.313232421875,
778
+ "objective/non_score_reward": -9.51566219329834,
779
+ "objective/rlhf_reward": -8.02347469329834,
780
+ "objective/scores": 1.4921875,
781
+ "policy/approxkl_avg": 6.838142395019531,
782
+ "policy/clipfrac_avg": 0.11615566164255142,
783
+ "policy/entropy_avg": 0.7397478222846985,
784
+ "step": 37,
785
+ "val/clipfrac_avg": 0.003537735901772976,
786
+ "val/num_eos_tokens": 0,
787
+ "val/ratio": 0.8849948644638062,
788
+ "val/ratio_var": 8.990589412860572e-05
789
+ },
790
+ {
791
+ "episode": 608,
792
+ "epoch": 1.0666666666666667,
793
+ "eps": 3,
794
+ "loss/policy_avg": 0.016940509900450706,
795
+ "loss/value_avg": 0.5882998704910278,
796
+ "lr": 9.22429906542056e-06,
797
+ "objective/entropy": 23.183269500732422,
798
+ "objective/kl": 186.7628631591797,
799
+ "objective/non_score_reward": -9.338143348693848,
800
+ "objective/rlhf_reward": -7.564705848693848,
801
+ "objective/scores": 1.7734375,
802
+ "policy/approxkl_avg": 3.9737157821655273,
803
+ "policy/clipfrac_avg": 0.13089622557163239,
804
+ "policy/entropy_avg": 0.7805944681167603,
805
+ "step": 38,
806
+ "val/clipfrac_avg": 0.004127358552068472,
807
+ "val/num_eos_tokens": 0,
808
+ "val/ratio": 0.8967232704162598,
809
+ "val/ratio_var": 0.00010195688810199499
810
+ },
811
+ {
812
+ "episode": 624,
813
+ "epoch": 1.0947368421052632,
814
+ "eps": 3,
815
+ "loss/policy_avg": 0.011485239490866661,
816
+ "loss/value_avg": 0.5787074565887451,
817
+ "lr": 9.092523364485981e-06,
818
+ "objective/entropy": 17.85469627380371,
819
+ "objective/kl": 187.50631713867188,
820
+ "objective/non_score_reward": -9.375316619873047,
821
+ "objective/rlhf_reward": -7.203441619873047,
822
+ "objective/scores": 2.171875,
823
+ "policy/approxkl_avg": 5.274375915527344,
824
+ "policy/clipfrac_avg": 0.11556603759527206,
825
+ "policy/entropy_avg": 0.7150323390960693,
826
+ "step": 39,
827
+ "val/clipfrac_avg": 0.16509434580802917,
828
+ "val/num_eos_tokens": 0,
829
+ "val/ratio": 0.8858749866485596,
830
+ "val/ratio_var": 0.0001809587120078504
831
+ },
832
+ {
833
+ "episode": 640,
834
+ "epoch": 1.1228070175438596,
835
+ "eps": 3,
836
+ "loss/policy_avg": 0.017689252272248268,
837
+ "loss/value_avg": 0.7021454572677612,
838
+ "lr": 8.960747663551402e-06,
839
+ "objective/entropy": 21.994365692138672,
840
+ "objective/kl": 182.05810546875,
841
+ "objective/non_score_reward": -9.1029052734375,
842
+ "objective/rlhf_reward": -6.8841552734375,
843
+ "objective/scores": 2.21875,
844
+ "policy/approxkl_avg": 4.578630447387695,
845
+ "policy/clipfrac_avg": 0.12558962404727936,
846
+ "policy/entropy_avg": 0.7208189964294434,
847
+ "step": 40,
848
+ "val/clipfrac_avg": 0.06426886469125748,
849
+ "val/num_eos_tokens": 0,
850
+ "val/ratio": 0.8957177400588989,
851
+ "val/ratio_var": 7.126481068553403e-05
852
+ },
853
+ {
854
+ "episode": 656,
855
+ "epoch": 1.1508771929824562,
856
+ "eps": 3,
857
+ "loss/policy_avg": 0.005923585034906864,
858
+ "loss/value_avg": 0.5532969236373901,
859
+ "lr": 8.828971962616822e-06,
860
+ "objective/entropy": 18.056703567504883,
861
+ "objective/kl": 170.11863708496094,
862
+ "objective/non_score_reward": -8.505931854248047,
863
+ "objective/rlhf_reward": -6.380931854248047,
864
+ "objective/scores": 2.125,
865
+ "policy/approxkl_avg": 3.3685710430145264,
866
+ "policy/clipfrac_avg": 0.12028301507234573,
867
+ "policy/entropy_avg": 0.6859033107757568,
868
+ "step": 41,
869
+ "val/clipfrac_avg": 0.0,
870
+ "val/num_eos_tokens": 0,
871
+ "val/ratio": 0.8874868154525757,
872
+ "val/ratio_var": 4.332972093834542e-05
873
+ },
874
+ {
875
+ "episode": 672,
876
+ "epoch": 1.1789473684210525,
877
+ "eps": 3,
878
+ "loss/policy_avg": 0.012987145222723484,
879
+ "loss/value_avg": 0.632028341293335,
880
+ "lr": 8.697196261682243e-06,
881
+ "objective/entropy": 19.865787506103516,
882
+ "objective/kl": 177.42971801757812,
883
+ "objective/non_score_reward": -8.871485710144043,
884
+ "objective/rlhf_reward": -6.527735710144043,
885
+ "objective/scores": 2.34375,
886
+ "policy/approxkl_avg": 6.186136245727539,
887
+ "policy/clipfrac_avg": 0.13325470685958862,
888
+ "policy/entropy_avg": 0.6923173069953918,
889
+ "step": 42,
890
+ "val/clipfrac_avg": 0.125,
891
+ "val/num_eos_tokens": 0,
892
+ "val/ratio": 0.8991193771362305,
893
+ "val/ratio_var": 0.0009607934043742716
894
+ },
895
+ {
896
+ "episode": 688,
897
+ "epoch": 1.207017543859649,
898
+ "eps": 3,
899
+ "loss/policy_avg": 0.011711956933140755,
900
+ "loss/value_avg": 0.4474850296974182,
901
+ "lr": 8.565420560747664e-06,
902
+ "objective/entropy": 19.79934310913086,
903
+ "objective/kl": 176.83395385742188,
904
+ "objective/non_score_reward": -8.84169864654541,
905
+ "objective/rlhf_reward": -7.02138614654541,
906
+ "objective/scores": 1.8203125,
907
+ "policy/approxkl_avg": 5.904331207275391,
908
+ "policy/clipfrac_avg": 0.13089622557163239,
909
+ "policy/entropy_avg": 0.6940422654151917,
910
+ "step": 43,
911
+ "val/clipfrac_avg": 0.000589622650295496,
912
+ "val/num_eos_tokens": 0,
913
+ "val/ratio": 0.8869062066078186,
914
+ "val/ratio_var": 0.0008940807892940938
915
+ },
916
+ {
917
+ "episode": 704,
918
+ "epoch": 1.2350877192982457,
919
+ "eps": 3,
920
+ "loss/policy_avg": 0.014619714580476284,
921
+ "loss/value_avg": 0.625725507736206,
922
+ "lr": 8.433644859813084e-06,
923
+ "objective/entropy": 22.385128021240234,
924
+ "objective/kl": 181.2716064453125,
925
+ "objective/non_score_reward": -9.063579559326172,
926
+ "objective/rlhf_reward": -7.493267059326172,
927
+ "objective/scores": 1.5703125,
928
+ "policy/approxkl_avg": 4.966976165771484,
929
+ "policy/clipfrac_avg": 0.14799527823925018,
930
+ "policy/entropy_avg": 0.7825338840484619,
931
+ "step": 44,
932
+ "val/clipfrac_avg": 0.14681604504585266,
933
+ "val/num_eos_tokens": 0,
934
+ "val/ratio": 0.883050799369812,
935
+ "val/ratio_var": 1.7675925846560858e-05
936
+ },
937
+ {
938
+ "episode": 720,
939
+ "epoch": 1.263157894736842,
940
+ "eps": 3,
941
+ "loss/policy_avg": 0.024550937116146088,
942
+ "loss/value_avg": 0.7726404070854187,
943
+ "lr": 8.301869158878505e-06,
944
+ "objective/entropy": 19.87116050720215,
945
+ "objective/kl": 172.38674926757812,
946
+ "objective/non_score_reward": -8.619338035583496,
947
+ "objective/rlhf_reward": -6.572463035583496,
948
+ "objective/scores": 2.046875,
949
+ "policy/approxkl_avg": 4.95554256439209,
950
+ "policy/clipfrac_avg": 0.13089622557163239,
951
+ "policy/entropy_avg": 0.6934086680412292,
952
+ "step": 45,
953
+ "val/clipfrac_avg": 0.009433962404727936,
954
+ "val/num_eos_tokens": 0,
955
+ "val/ratio": 0.8879209756851196,
956
+ "val/ratio_var": 0.0003393842780496925
957
+ },
958
+ {
959
+ "episode": 736,
960
+ "epoch": 1.2912280701754386,
961
+ "eps": 3,
962
+ "loss/policy_avg": 0.019363895058631897,
963
+ "loss/value_avg": 0.5529497861862183,
964
+ "lr": 8.170093457943925e-06,
965
+ "objective/entropy": 17.47795295715332,
966
+ "objective/kl": 174.22576904296875,
967
+ "objective/non_score_reward": -8.711288452148438,
968
+ "objective/rlhf_reward": -6.3206634521484375,
969
+ "objective/scores": 2.390625,
970
+ "policy/approxkl_avg": 3.8177051544189453,
971
+ "policy/clipfrac_avg": 0.11851415038108826,
972
+ "policy/entropy_avg": 0.6579099297523499,
973
+ "step": 46,
974
+ "val/clipfrac_avg": 0.001179245300590992,
975
+ "val/num_eos_tokens": 0,
976
+ "val/ratio": 0.9020024538040161,
977
+ "val/ratio_var": 0.00011536524834809825
978
+ },
979
+ {
980
+ "episode": 752,
981
+ "epoch": 1.3192982456140352,
982
+ "eps": 3,
983
+ "loss/policy_avg": 0.012542858719825745,
984
+ "loss/value_avg": 0.6548900604248047,
985
+ "lr": 8.038317757009346e-06,
986
+ "objective/entropy": 17.823394775390625,
987
+ "objective/kl": 171.56602478027344,
988
+ "objective/non_score_reward": -8.578301429748535,
989
+ "objective/rlhf_reward": -6.828301429748535,
990
+ "objective/scores": 1.75,
991
+ "policy/approxkl_avg": 5.147519111633301,
992
+ "policy/clipfrac_avg": 0.14563679695129395,
993
+ "policy/entropy_avg": 0.6417368650436401,
994
+ "step": 47,
995
+ "val/clipfrac_avg": 0.004127358552068472,
996
+ "val/num_eos_tokens": 0,
997
+ "val/ratio": 0.8848338723182678,
998
+ "val/ratio_var": 0.000510354817379266
999
+ },
1000
+ {
1001
+ "episode": 768,
1002
+ "epoch": 1.3473684210526315,
1003
+ "eps": 3,
1004
+ "loss/policy_avg": 0.013358336873352528,
1005
+ "loss/value_avg": 0.3859623968601227,
1006
+ "lr": 7.906542056074766e-06,
1007
+ "objective/entropy": 16.128374099731445,
1008
+ "objective/kl": 168.9840087890625,
1009
+ "objective/non_score_reward": -8.449200630187988,
1010
+ "objective/rlhf_reward": -6.847638130187988,
1011
+ "objective/scores": 1.6015625,
1012
+ "policy/approxkl_avg": 4.781813144683838,
1013
+ "policy/clipfrac_avg": 0.12028302252292633,
1014
+ "policy/entropy_avg": 0.5887731313705444,
1015
+ "step": 48,
1016
+ "val/clipfrac_avg": 0.01179245300590992,
1017
+ "val/num_eos_tokens": 0,
1018
+ "val/ratio": 0.9012677669525146,
1019
+ "val/ratio_var": 0.00041765146306715906
1020
+ },
1021
+ {
1022
+ "episode": 784,
1023
+ "epoch": 1.3754385964912281,
1024
+ "eps": 3,
1025
+ "loss/policy_avg": 0.017281489446759224,
1026
+ "loss/value_avg": 0.5614770650863647,
1027
+ "lr": 7.774766355140187e-06,
1028
+ "objective/entropy": 20.177621841430664,
1029
+ "objective/kl": 166.16497802734375,
1030
+ "objective/non_score_reward": -8.308249473571777,
1031
+ "objective/rlhf_reward": -7.081686973571777,
1032
+ "objective/scores": 1.2265625,
1033
+ "policy/approxkl_avg": 3.1813056468963623,
1034
+ "policy/clipfrac_avg": 0.1432783007621765,
1035
+ "policy/entropy_avg": 0.7520714998245239,
1036
+ "step": 49,
1037
+ "val/clipfrac_avg": 0.1845518946647644,
1038
+ "val/num_eos_tokens": 0,
1039
+ "val/ratio": 0.8848077654838562,
1040
+ "val/ratio_var": 0.0005292592104524374
1041
+ },
1042
+ {
1043
+ "episode": 800,
1044
+ "epoch": 1.4035087719298245,
1045
+ "eps": 3,
1046
+ "loss/policy_avg": 0.015007663518190384,
1047
+ "loss/value_avg": 0.918763279914856,
1048
+ "lr": 7.642990654205608e-06,
1049
+ "objective/entropy": 17.82724380493164,
1050
+ "objective/kl": 179.90628051757812,
1051
+ "objective/non_score_reward": -8.99531364440918,
1052
+ "objective/rlhf_reward": -7.37812614440918,
1053
+ "objective/scores": 1.6171875,
1054
+ "policy/approxkl_avg": 5.0764312744140625,
1055
+ "policy/clipfrac_avg": 0.1149764209985733,
1056
+ "policy/entropy_avg": 0.6353041529655457,
1057
+ "step": 50,
1058
+ "val/clipfrac_avg": 0.014740565791726112,
1059
+ "val/num_eos_tokens": 0,
1060
+ "val/ratio": 0.9066751599311829,
1061
+ "val/ratio_var": 0.00023197307018563151
1062
+ },
1063
+ {
1064
+ "episode": 816,
1065
+ "epoch": 1.431578947368421,
1066
+ "eps": 3,
1067
+ "loss/policy_avg": 0.013827439397573471,
1068
+ "loss/value_avg": 0.6559504270553589,
1069
+ "lr": 7.511214953271027e-06,
1070
+ "objective/entropy": 17.98678207397461,
1071
+ "objective/kl": 173.85345458984375,
1072
+ "objective/non_score_reward": -8.692672729492188,
1073
+ "objective/rlhf_reward": -7.0911102294921875,
1074
+ "objective/scores": 1.6015625,
1075
+ "policy/approxkl_avg": 4.863851547241211,
1076
+ "policy/clipfrac_avg": 0.12382075190544128,
1077
+ "policy/entropy_avg": 0.6412214040756226,
1078
+ "step": 51,
1079
+ "val/clipfrac_avg": 0.03419811278581619,
1080
+ "val/num_eos_tokens": 0,
1081
+ "val/ratio": 0.8822081089019775,
1082
+ "val/ratio_var": 5.379146841733018e-06
1083
+ },
1084
+ {
1085
+ "episode": 832,
1086
+ "epoch": 1.4596491228070176,
1087
+ "eps": 3,
1088
+ "loss/policy_avg": 0.009551126509904861,
1089
+ "loss/value_avg": 0.46615320444107056,
1090
+ "lr": 7.379439252336448e-06,
1091
+ "objective/entropy": 14.611004829406738,
1092
+ "objective/kl": 169.49464416503906,
1093
+ "objective/non_score_reward": -8.4747314453125,
1094
+ "objective/rlhf_reward": -7.0137939453125,
1095
+ "objective/scores": 1.4609375,
1096
+ "policy/approxkl_avg": 4.765644073486328,
1097
+ "policy/clipfrac_avg": 0.09964622557163239,
1098
+ "policy/entropy_avg": 0.5645979642868042,
1099
+ "step": 52,
1100
+ "val/clipfrac_avg": 0.0,
1101
+ "val/num_eos_tokens": 0,
1102
+ "val/ratio": 0.9012112021446228,
1103
+ "val/ratio_var": 1.3516472790797707e-05
1104
+ },
1105
+ {
1106
+ "episode": 848,
1107
+ "epoch": 1.487719298245614,
1108
+ "eps": 3,
1109
+ "loss/policy_avg": 0.009903261438012123,
1110
+ "loss/value_avg": 1.024169683456421,
1111
+ "lr": 7.2476635514018685e-06,
1112
+ "objective/entropy": 16.012420654296875,
1113
+ "objective/kl": 173.99917602539062,
1114
+ "objective/non_score_reward": -8.699958801269531,
1115
+ "objective/rlhf_reward": -6.371833801269531,
1116
+ "objective/scores": 2.328125,
1117
+ "policy/approxkl_avg": 4.889924049377441,
1118
+ "policy/clipfrac_avg": 0.11261792480945587,
1119
+ "policy/entropy_avg": 0.5950597524642944,
1120
+ "step": 53,
1121
+ "val/clipfrac_avg": 0.20518869161605835,
1122
+ "val/num_eos_tokens": 0,
1123
+ "val/ratio": 0.8948594331741333,
1124
+ "val/ratio_var": 6.705896521452814e-05
1125
+ },
1126
+ {
1127
+ "episode": 864,
1128
+ "epoch": 1.5157894736842106,
1129
+ "eps": 3,
1130
+ "loss/policy_avg": -0.007397271227091551,
1131
+ "loss/value_avg": 0.4296436011791229,
1132
+ "lr": 7.115887850467289e-06,
1133
+ "objective/entropy": 12.315929412841797,
1134
+ "objective/kl": 175.46511840820312,
1135
+ "objective/non_score_reward": -8.773256301879883,
1136
+ "objective/rlhf_reward": -6.570131301879883,
1137
+ "objective/scores": 2.203125,
1138
+ "policy/approxkl_avg": 5.532078742980957,
1139
+ "policy/clipfrac_avg": 0.09375,
1140
+ "policy/entropy_avg": 0.5138251781463623,
1141
+ "step": 54,
1142
+ "val/clipfrac_avg": 0.003537735901772976,
1143
+ "val/num_eos_tokens": 0,
1144
+ "val/ratio": 0.9130541086196899,
1145
+ "val/ratio_var": 8.332962897839025e-05
1146
+ },
1147
+ {
1148
+ "episode": 880,
1149
+ "epoch": 1.543859649122807,
1150
+ "eps": 3,
1151
+ "loss/policy_avg": 0.0055680545046925545,
1152
+ "loss/value_avg": 0.44329357147216797,
1153
+ "lr": 6.9841121495327106e-06,
1154
+ "objective/entropy": 13.753085136413574,
1155
+ "objective/kl": 162.1046905517578,
1156
+ "objective/non_score_reward": -8.10523509979248,
1157
+ "objective/rlhf_reward": -6.1052350997924805,
1158
+ "objective/scores": 2.0,
1159
+ "policy/approxkl_avg": 4.2778730392456055,
1160
+ "policy/clipfrac_avg": 0.10200471431016922,
1161
+ "policy/entropy_avg": 0.538252592086792,
1162
+ "step": 55,
1163
+ "val/clipfrac_avg": 0.000589622650295496,
1164
+ "val/num_eos_tokens": 0,
1165
+ "val/ratio": 0.89923495054245,
1166
+ "val/ratio_var": 0.00013667276652995497
1167
+ },
1168
+ {
1169
+ "episode": 896,
1170
+ "epoch": 1.5719298245614035,
1171
+ "eps": 3,
1172
+ "loss/policy_avg": 0.0029763877391815186,
1173
+ "loss/value_avg": 0.5969923734664917,
1174
+ "lr": 6.852336448598131e-06,
1175
+ "objective/entropy": 10.386423110961914,
1176
+ "objective/kl": 170.64817810058594,
1177
+ "objective/non_score_reward": -8.53240966796875,
1178
+ "objective/rlhf_reward": -5.84490966796875,
1179
+ "objective/scores": 2.6875,
1180
+ "policy/approxkl_avg": 5.515145301818848,
1181
+ "policy/clipfrac_avg": 0.0695754736661911,
1182
+ "policy/entropy_avg": 0.4759911596775055,
1183
+ "step": 56,
1184
+ "val/clipfrac_avg": 0.22051887214183807,
1185
+ "val/num_eos_tokens": 0,
1186
+ "val/ratio": 0.9136029481887817,
1187
+ "val/ratio_var": 0.00015939133299980313
1188
+ },
1189
+ {
1190
+ "episode": 912,
1191
+ "epoch": 1.6,
1192
+ "eps": 3,
1193
+ "loss/policy_avg": -0.0002519981935620308,
1194
+ "loss/value_avg": 0.6188120245933533,
1195
+ "lr": 6.720560747663552e-06,
1196
+ "objective/entropy": 9.047847747802734,
1197
+ "objective/kl": 162.95162963867188,
1198
+ "objective/non_score_reward": -8.147581100463867,
1199
+ "objective/rlhf_reward": -5.835081100463867,
1200
+ "objective/scores": 2.3125,
1201
+ "policy/approxkl_avg": 5.942928314208984,
1202
+ "policy/clipfrac_avg": 0.06721697747707367,
1203
+ "policy/entropy_avg": 0.43892478942871094,
1204
+ "step": 57,
1205
+ "val/clipfrac_avg": 0.03242924436926842,
1206
+ "val/num_eos_tokens": 0,
1207
+ "val/ratio": 0.9175019264221191,
1208
+ "val/ratio_var": 2.5528926926199347e-05
1209
+ },
1210
+ {
1211
+ "episode": 928,
1212
+ "epoch": 1.6280701754385964,
1213
+ "eps": 3,
1214
+ "loss/policy_avg": -0.004241641610860825,
1215
+ "loss/value_avg": 0.6380342245101929,
1216
+ "lr": 6.588785046728972e-06,
1217
+ "objective/entropy": 10.172576904296875,
1218
+ "objective/kl": 172.64210510253906,
1219
+ "objective/non_score_reward": -8.632105827331543,
1220
+ "objective/rlhf_reward": -6.085230827331543,
1221
+ "objective/scores": 2.546875,
1222
+ "policy/approxkl_avg": 5.1512861251831055,
1223
+ "policy/clipfrac_avg": 0.09669811278581619,
1224
+ "policy/entropy_avg": 0.44444799423217773,
1225
+ "step": 58,
1226
+ "val/clipfrac_avg": 0.00294811325147748,
1227
+ "val/num_eos_tokens": 0,
1228
+ "val/ratio": 0.9051207304000854,
1229
+ "val/ratio_var": 9.685073746368289e-05
1230
+ },
1231
+ {
1232
+ "episode": 944,
1233
+ "epoch": 1.656140350877193,
1234
+ "eps": 3,
1235
+ "loss/policy_avg": 0.005844447761774063,
1236
+ "loss/value_avg": 0.46530038118362427,
1237
+ "lr": 6.457009345794393e-06,
1238
+ "objective/entropy": 11.34018611907959,
1239
+ "objective/kl": 167.05087280273438,
1240
+ "objective/non_score_reward": -8.352543830871582,
1241
+ "objective/rlhf_reward": -5.368168830871582,
1242
+ "objective/scores": 2.984375,
1243
+ "policy/approxkl_avg": 4.73173713684082,
1244
+ "policy/clipfrac_avg": 0.06898584961891174,
1245
+ "policy/entropy_avg": 0.4987587630748749,
1246
+ "step": 59,
1247
+ "val/clipfrac_avg": 0.003537735901772976,
1248
+ "val/num_eos_tokens": 0,
1249
+ "val/ratio": 0.9096221327781677,
1250
+ "val/ratio_var": 0.0002903940330725163
1251
+ },
1252
+ {
1253
+ "episode": 960,
1254
+ "epoch": 1.6842105263157894,
1255
+ "eps": 3,
1256
+ "loss/policy_avg": 0.0015796682564541698,
1257
+ "loss/value_avg": 0.5465973615646362,
1258
+ "lr": 6.3252336448598135e-06,
1259
+ "objective/entropy": 10.832345962524414,
1260
+ "objective/kl": 166.35125732421875,
1261
+ "objective/non_score_reward": -8.317562103271484,
1262
+ "objective/rlhf_reward": -5.114437103271484,
1263
+ "objective/scores": 3.203125,
1264
+ "policy/approxkl_avg": 4.080867767333984,
1265
+ "policy/clipfrac_avg": 0.08726415038108826,
1266
+ "policy/entropy_avg": 0.46615108847618103,
1267
+ "step": 60,
1268
+ "val/clipfrac_avg": 0.018278302624821663,
1269
+ "val/num_eos_tokens": 0,
1270
+ "val/ratio": 0.9084208011627197,
1271
+ "val/ratio_var": 1.8292890672455542e-05
1272
+ },
1273
+ {
1274
+ "episode": 976,
1275
+ "epoch": 1.712280701754386,
1276
+ "eps": 3,
1277
+ "loss/policy_avg": -0.0016184533014893532,
1278
+ "loss/value_avg": 0.6316072344779968,
1279
+ "lr": 6.193457943925234e-06,
1280
+ "objective/entropy": 9.0885648727417,
1281
+ "objective/kl": 172.646240234375,
1282
+ "objective/non_score_reward": -8.632311820983887,
1283
+ "objective/rlhf_reward": -5.194811820983887,
1284
+ "objective/scores": 3.4375,
1285
+ "policy/approxkl_avg": 4.502593994140625,
1286
+ "policy/clipfrac_avg": 0.06603773683309555,
1287
+ "policy/entropy_avg": 0.41100969910621643,
1288
+ "step": 61,
1289
+ "val/clipfrac_avg": 0.044811319559812546,
1290
+ "val/num_eos_tokens": 0,
1291
+ "val/ratio": 0.9256702661514282,
1292
+ "val/ratio_var": 7.894221198512241e-05
1293
+ },
1294
+ {
1295
+ "episode": 992,
1296
+ "epoch": 1.7403508771929825,
1297
+ "eps": 3,
1298
+ "loss/policy_avg": -0.0019415542483329773,
1299
+ "loss/value_avg": 0.6046911478042603,
1300
+ "lr": 6.061682242990655e-06,
1301
+ "objective/entropy": 9.12926197052002,
1302
+ "objective/kl": 169.4315185546875,
1303
+ "objective/non_score_reward": -8.471575736999512,
1304
+ "objective/rlhf_reward": -5.424700736999512,
1305
+ "objective/scores": 3.046875,
1306
+ "policy/approxkl_avg": 5.609973907470703,
1307
+ "policy/clipfrac_avg": 0.09198112785816193,
1308
+ "policy/entropy_avg": 0.4236205816268921,
1309
+ "step": 62,
1310
+ "val/clipfrac_avg": 0.001768867950886488,
1311
+ "val/num_eos_tokens": 0,
1312
+ "val/ratio": 0.9198966026306152,
1313
+ "val/ratio_var": 6.228529673535377e-05
1314
+ },
1315
+ {
1316
+ "episode": 1008,
1317
+ "epoch": 1.768421052631579,
1318
+ "eps": 3,
1319
+ "loss/policy_avg": -0.007835395634174347,
1320
+ "loss/value_avg": 0.6853305697441101,
1321
+ "lr": 5.929906542056075e-06,
1322
+ "objective/entropy": 8.566083908081055,
1323
+ "objective/kl": 163.68191528320312,
1324
+ "objective/non_score_reward": -8.18409538269043,
1325
+ "objective/rlhf_reward": -4.09034538269043,
1326
+ "objective/scores": 4.09375,
1327
+ "policy/approxkl_avg": 3.7664973735809326,
1328
+ "policy/clipfrac_avg": 0.07429245114326477,
1329
+ "policy/entropy_avg": 0.41426771879196167,
1330
+ "step": 63,
1331
+ "val/clipfrac_avg": 0.007665094453841448,
1332
+ "val/num_eos_tokens": 0,
1333
+ "val/ratio": 0.9395467042922974,
1334
+ "val/ratio_var": 0.00018259203352499753
1335
+ },
1336
+ {
1337
+ "episode": 1024,
1338
+ "epoch": 1.7964912280701755,
1339
+ "eps": 3,
1340
+ "loss/policy_avg": 0.0056846365332603455,
1341
+ "loss/value_avg": 0.8050791621208191,
1342
+ "lr": 5.798130841121495e-06,
1343
+ "objective/entropy": 7.867904186248779,
1344
+ "objective/kl": 176.44961547851562,
1345
+ "objective/non_score_reward": -8.822481155395508,
1346
+ "objective/rlhf_reward": -4.931856155395508,
1347
+ "objective/scores": 3.890625,
1348
+ "policy/approxkl_avg": 4.615470886230469,
1349
+ "policy/clipfrac_avg": 0.07016509771347046,
1350
+ "policy/entropy_avg": 0.40076911449432373,
1351
+ "step": 64,
1352
+ "val/clipfrac_avg": 0.1179245263338089,
1353
+ "val/num_eos_tokens": 0,
1354
+ "val/ratio": 0.9168256521224976,
1355
+ "val/ratio_var": 1.1071170774812344e-05
1356
+ },
1357
+ {
1358
+ "episode": 1040,
1359
+ "epoch": 1.8245614035087718,
1360
+ "eps": 3,
1361
+ "loss/policy_avg": -0.004829235374927521,
1362
+ "loss/value_avg": 0.7683409452438354,
1363
+ "lr": 5.666355140186916e-06,
1364
+ "objective/entropy": 8.73065185546875,
1365
+ "objective/kl": 165.93441772460938,
1366
+ "objective/non_score_reward": -8.296720504760742,
1367
+ "objective/rlhf_reward": -4.531095504760742,
1368
+ "objective/scores": 3.765625,
1369
+ "policy/approxkl_avg": 4.037623882293701,
1370
+ "policy/clipfrac_avg": 0.0625,
1371
+ "policy/entropy_avg": 0.38483142852783203,
1372
+ "step": 65,
1373
+ "val/clipfrac_avg": 0.0,
1374
+ "val/num_eos_tokens": 0,
1375
+ "val/ratio": 0.9364030361175537,
1376
+ "val/ratio_var": 0.0001283105229958892
1377
+ },
1378
+ {
1379
+ "episode": 1056,
1380
+ "epoch": 1.8526315789473684,
1381
+ "eps": 3,
1382
+ "loss/policy_avg": -0.002082128543406725,
1383
+ "loss/value_avg": 0.8781827688217163,
1384
+ "lr": 5.534579439252336e-06,
1385
+ "objective/entropy": 6.81689977645874,
1386
+ "objective/kl": 173.76760864257812,
1387
+ "objective/non_score_reward": -8.688380241394043,
1388
+ "objective/rlhf_reward": -5.454005241394043,
1389
+ "objective/scores": 3.234375,
1390
+ "policy/approxkl_avg": 5.1825032234191895,
1391
+ "policy/clipfrac_avg": 0.07488207519054413,
1392
+ "policy/entropy_avg": 0.3771995007991791,
1393
+ "step": 66,
1394
+ "val/clipfrac_avg": 0.0,
1395
+ "val/num_eos_tokens": 0,
1396
+ "val/ratio": 0.9298049211502075,
1397
+ "val/ratio_var": 0.00015954735863488168
1398
+ },
1399
+ {
1400
+ "episode": 1072,
1401
+ "epoch": 1.880701754385965,
1402
+ "eps": 3,
1403
+ "loss/policy_avg": 0.005034355446696281,
1404
+ "loss/value_avg": 1.0226874351501465,
1405
+ "lr": 5.402803738317757e-06,
1406
+ "objective/entropy": 5.308557510375977,
1407
+ "objective/kl": 171.6015167236328,
1408
+ "objective/non_score_reward": -8.580076217651367,
1409
+ "objective/rlhf_reward": -4.236326217651367,
1410
+ "objective/scores": 4.34375,
1411
+ "policy/approxkl_avg": 5.336367607116699,
1412
+ "policy/clipfrac_avg": 0.04599056765437126,
1413
+ "policy/entropy_avg": 0.34400177001953125,
1414
+ "step": 67,
1415
+ "val/clipfrac_avg": 0.000589622650295496,
1416
+ "val/num_eos_tokens": 0,
1417
+ "val/ratio": 0.9246993064880371,
1418
+ "val/ratio_var": 9.672338592281449e-07
1419
+ },
1420
+ {
1421
+ "episode": 1088,
1422
+ "epoch": 1.9087719298245613,
1423
+ "eps": 3,
1424
+ "loss/policy_avg": 0.023275576531887054,
1425
+ "loss/value_avg": 0.6750494241714478,
1426
+ "lr": 5.271028037383177e-06,
1427
+ "objective/entropy": 7.23941707611084,
1428
+ "objective/kl": 166.45547485351562,
1429
+ "objective/non_score_reward": -8.322773933410645,
1430
+ "objective/rlhf_reward": -4.6352739334106445,
1431
+ "objective/scores": 3.6875,
1432
+ "policy/approxkl_avg": 3.1369752883911133,
1433
+ "policy/clipfrac_avg": 0.05837263911962509,
1434
+ "policy/entropy_avg": 0.3996211886405945,
1435
+ "step": 68,
1436
+ "val/clipfrac_avg": 0.000589622650295496,
1437
+ "val/num_eos_tokens": 0,
1438
+ "val/ratio": 0.9343756437301636,
1439
+ "val/ratio_var": 0.00011849942529806867
1440
+ },
1441
+ {
1442
+ "episode": 1104,
1443
+ "epoch": 1.936842105263158,
1444
+ "eps": 3,
1445
+ "loss/policy_avg": 0.001583978533744812,
1446
+ "loss/value_avg": 0.7364473342895508,
1447
+ "lr": 5.139252336448598e-06,
1448
+ "objective/entropy": 8.292254447937012,
1449
+ "objective/kl": 174.10446166992188,
1450
+ "objective/non_score_reward": -8.705223083496094,
1451
+ "objective/rlhf_reward": -4.517723083496094,
1452
+ "objective/scores": 4.1875,
1453
+ "policy/approxkl_avg": 5.407079696655273,
1454
+ "policy/clipfrac_avg": 0.06780660152435303,
1455
+ "policy/entropy_avg": 0.3910168409347534,
1456
+ "step": 69,
1457
+ "val/clipfrac_avg": 0.001179245300590992,
1458
+ "val/num_eos_tokens": 0,
1459
+ "val/ratio": 0.9262620210647583,
1460
+ "val/ratio_var": 8.509035978931934e-06
1461
+ },
1462
+ {
1463
+ "episode": 1120,
1464
+ "epoch": 1.9649122807017543,
1465
+ "eps": 3,
1466
+ "loss/policy_avg": 0.014011572115123272,
1467
+ "loss/value_avg": 0.49188750982284546,
1468
+ "lr": 5.0074766355140185e-06,
1469
+ "objective/entropy": 4.73923397064209,
1470
+ "objective/kl": 170.3909912109375,
1471
+ "objective/non_score_reward": -8.519549369812012,
1472
+ "objective/rlhf_reward": -4.535174369812012,
1473
+ "objective/scores": 3.984375,
1474
+ "policy/approxkl_avg": 4.553505897521973,
1475
+ "policy/clipfrac_avg": 0.04658018797636032,
1476
+ "policy/entropy_avg": 0.314146488904953,
1477
+ "step": 70,
1478
+ "val/clipfrac_avg": 0.001768867950886488,
1479
+ "val/num_eos_tokens": 0,
1480
+ "val/ratio": 0.9345220327377319,
1481
+ "val/ratio_var": 0.00011590120993787423
1482
+ },
1483
+ {
1484
+ "episode": 1136,
1485
+ "epoch": 1.9929824561403509,
1486
+ "eps": 3,
1487
+ "loss/policy_avg": 0.014443885535001755,
1488
+ "loss/value_avg": 0.8583539724349976,
1489
+ "lr": 4.875700934579439e-06,
1490
+ "objective/entropy": 6.110556602478027,
1491
+ "objective/kl": 168.1246337890625,
1492
+ "objective/non_score_reward": -8.406231880187988,
1493
+ "objective/rlhf_reward": -4.781231880187988,
1494
+ "objective/scores": 3.625,
1495
+ "policy/approxkl_avg": 3.3112387657165527,
1496
+ "policy/clipfrac_avg": 0.04716981202363968,
1497
+ "policy/entropy_avg": 0.3741912841796875,
1498
+ "step": 71,
1499
+ "val/clipfrac_avg": 0.0,
1500
+ "val/num_eos_tokens": 0,
1501
+ "val/ratio": 0.9311293363571167,
1502
+ "val/ratio_var": 1.7129657862824388e-05
1503
+ },
1504
+ {
1505
+ "episode": 1152,
1506
+ "epoch": 2.0210526315789474,
1507
+ "eps": 3,
1508
+ "loss/policy_avg": 0.0069357771426439285,
1509
+ "loss/value_avg": 0.6024092435836792,
1510
+ "lr": 4.74392523364486e-06,
1511
+ "objective/entropy": 1.9080017805099487,
1512
+ "objective/kl": 175.54367065429688,
1513
+ "objective/non_score_reward": -8.777183532714844,
1514
+ "objective/rlhf_reward": -3.7771835327148438,
1515
+ "objective/scores": 5.0,
1516
+ "policy/approxkl_avg": 6.433887004852295,
1517
+ "policy/clipfrac_avg": 0.03655660152435303,
1518
+ "policy/entropy_avg": 0.2685927748680115,
1519
+ "step": 72,
1520
+ "val/clipfrac_avg": 0.0,
1521
+ "val/num_eos_tokens": 0,
1522
+ "val/ratio": 0.9363144040107727,
1523
+ "val/ratio_var": 2.7767631763708778e-05
1524
+ },
1525
+ {
1526
+ "episode": 1168,
1527
+ "epoch": 2.049122807017544,
1528
+ "eps": 3,
1529
+ "loss/policy_avg": 0.004744451027363539,
1530
+ "loss/value_avg": 0.6521505117416382,
1531
+ "lr": 4.61214953271028e-06,
1532
+ "objective/entropy": 2.584568500518799,
1533
+ "objective/kl": 171.61709594726562,
1534
+ "objective/non_score_reward": -8.580854415893555,
1535
+ "objective/rlhf_reward": -3.1121044158935547,
1536
+ "objective/scores": 5.46875,
1537
+ "policy/approxkl_avg": 4.509120941162109,
1538
+ "policy/clipfrac_avg": 0.03478773683309555,
1539
+ "policy/entropy_avg": 0.2757822573184967,
1540
+ "step": 73,
1541
+ "val/clipfrac_avg": 0.2900943160057068,
1542
+ "val/num_eos_tokens": 0,
1543
+ "val/ratio": 0.9448769092559814,
1544
+ "val/ratio_var": 1.143478584708646e-05
1545
+ },
1546
+ {
1547
+ "episode": 1184,
1548
+ "epoch": 2.07719298245614,
1549
+ "eps": 3,
1550
+ "loss/policy_avg": 0.004101068712770939,
1551
+ "loss/value_avg": 0.44417738914489746,
1552
+ "lr": 4.480373831775701e-06,
1553
+ "objective/entropy": 3.265643835067749,
1554
+ "objective/kl": 179.89352416992188,
1555
+ "objective/non_score_reward": -8.99467658996582,
1556
+ "objective/rlhf_reward": -4.11967658996582,
1557
+ "objective/scores": 4.875,
1558
+ "policy/approxkl_avg": 5.798920154571533,
1559
+ "policy/clipfrac_avg": 0.028891509398818016,
1560
+ "policy/entropy_avg": 0.29206639528274536,
1561
+ "step": 74,
1562
+ "val/clipfrac_avg": 0.05188679322600365,
1563
+ "val/num_eos_tokens": 0,
1564
+ "val/ratio": 0.9267533421516418,
1565
+ "val/ratio_var": 0.0001095838742912747
1566
+ },
1567
+ {
1568
+ "episode": 1200,
1569
+ "epoch": 2.1052631578947367,
1570
+ "eps": 3,
1571
+ "loss/policy_avg": 0.004760343115776777,
1572
+ "loss/value_avg": 0.3549901843070984,
1573
+ "lr": 4.3485981308411215e-06,
1574
+ "objective/entropy": 2.9447989463806152,
1575
+ "objective/kl": 175.41961669921875,
1576
+ "objective/non_score_reward": -8.770980834960938,
1577
+ "objective/rlhf_reward": -3.2084808349609375,
1578
+ "objective/scores": 5.5625,
1579
+ "policy/approxkl_avg": 4.80606746673584,
1580
+ "policy/clipfrac_avg": 0.03419811278581619,
1581
+ "policy/entropy_avg": 0.30916815996170044,
1582
+ "step": 75,
1583
+ "val/clipfrac_avg": 0.004127358552068472,
1584
+ "val/num_eos_tokens": 0,
1585
+ "val/ratio": 0.9180092215538025,
1586
+ "val/ratio_var": 2.3069829694577493e-05
1587
+ },
1588
+ {
1589
+ "episode": 1216,
1590
+ "epoch": 2.1333333333333333,
1591
+ "eps": 3,
1592
+ "loss/policy_avg": 0.010298425331711769,
1593
+ "loss/value_avg": 0.15927723050117493,
1594
+ "lr": 4.216822429906542e-06,
1595
+ "objective/entropy": 1.4227180480957031,
1596
+ "objective/kl": 176.0067138671875,
1597
+ "objective/non_score_reward": -8.800336837768555,
1598
+ "objective/rlhf_reward": -2.6753368377685547,
1599
+ "objective/scores": 6.125,
1600
+ "policy/approxkl_avg": 4.99057149887085,
1601
+ "policy/clipfrac_avg": 0.028301887214183807,
1602
+ "policy/entropy_avg": 0.2751670479774475,
1603
+ "step": 76,
1604
+ "val/clipfrac_avg": 0.10495282709598541,
1605
+ "val/num_eos_tokens": 0,
1606
+ "val/ratio": 0.9201045036315918,
1607
+ "val/ratio_var": 5.366753157431958e-06
1608
+ },
1609
+ {
1610
+ "episode": 1232,
1611
+ "epoch": 2.16140350877193,
1612
+ "eps": 3,
1613
+ "loss/policy_avg": -0.005462624132633209,
1614
+ "loss/value_avg": 0.28704196214675903,
1615
+ "lr": 4.085046728971963e-06,
1616
+ "objective/entropy": 1.6171071529388428,
1617
+ "objective/kl": 176.83685302734375,
1618
+ "objective/non_score_reward": -8.841842651367188,
1619
+ "objective/rlhf_reward": -3.1543426513671875,
1620
+ "objective/scores": 5.6875,
1621
+ "policy/approxkl_avg": 5.847208023071289,
1622
+ "policy/clipfrac_avg": 0.028891509398818016,
1623
+ "policy/entropy_avg": 0.286138117313385,
1624
+ "step": 77,
1625
+ "val/clipfrac_avg": 0.07075471431016922,
1626
+ "val/num_eos_tokens": 0,
1627
+ "val/ratio": 0.9195102453231812,
1628
+ "val/ratio_var": 9.577343917044345e-06
1629
+ },
1630
+ {
1631
+ "episode": 1248,
1632
+ "epoch": 2.1894736842105265,
1633
+ "eps": 3,
1634
+ "loss/policy_avg": 0.0010141655802726746,
1635
+ "loss/value_avg": 0.8408201932907104,
1636
+ "lr": 3.953271028037383e-06,
1637
+ "objective/entropy": 7.40260124206543,
1638
+ "objective/kl": 177.4427490234375,
1639
+ "objective/non_score_reward": -8.872137069702148,
1640
+ "objective/rlhf_reward": -4.903387069702148,
1641
+ "objective/scores": 3.96875,
1642
+ "policy/approxkl_avg": 5.285105228424072,
1643
+ "policy/clipfrac_avg": 0.04658018797636032,
1644
+ "policy/entropy_avg": 0.4253733158111572,
1645
+ "step": 78,
1646
+ "val/clipfrac_avg": 0.000589622650295496,
1647
+ "val/num_eos_tokens": 0,
1648
+ "val/ratio": 0.9203585982322693,
1649
+ "val/ratio_var": 1.65566543728346e-05
1650
+ },
1651
+ {
1652
+ "episode": 1264,
1653
+ "epoch": 2.2175438596491226,
1654
+ "eps": 3,
1655
+ "loss/policy_avg": -0.004624534398317337,
1656
+ "loss/value_avg": 0.7719740271568298,
1657
+ "lr": 3.821495327102804e-06,
1658
+ "objective/entropy": 4.648886203765869,
1659
+ "objective/kl": 181.51986694335938,
1660
+ "objective/non_score_reward": -9.075994491577148,
1661
+ "objective/rlhf_reward": -4.700994491577148,
1662
+ "objective/scores": 4.375,
1663
+ "policy/approxkl_avg": 4.547338485717773,
1664
+ "policy/clipfrac_avg": 0.0383254736661911,
1665
+ "policy/entropy_avg": 0.3513880968093872,
1666
+ "step": 79,
1667
+ "val/clipfrac_avg": 0.1291273534297943,
1668
+ "val/num_eos_tokens": 0,
1669
+ "val/ratio": 0.9286638498306274,
1670
+ "val/ratio_var": 6.422147998819128e-05
1671
+ },
1672
+ {
1673
+ "episode": 1280,
1674
+ "epoch": 2.245614035087719,
1675
+ "eps": 3,
1676
+ "loss/policy_avg": 0.012380128726363182,
1677
+ "loss/value_avg": 0.40563684701919556,
1678
+ "lr": 3.689719626168224e-06,
1679
+ "objective/entropy": 7.685408115386963,
1680
+ "objective/kl": 168.90484619140625,
1681
+ "objective/non_score_reward": -8.445242881774902,
1682
+ "objective/rlhf_reward": -3.4452428817749023,
1683
+ "objective/scores": 5.0,
1684
+ "policy/approxkl_avg": 3.4143970012664795,
1685
+ "policy/clipfrac_avg": 0.041273586452007294,
1686
+ "policy/entropy_avg": 0.4100227355957031,
1687
+ "step": 80,
1688
+ "val/clipfrac_avg": 0.015330187976360321,
1689
+ "val/num_eos_tokens": 0,
1690
+ "val/ratio": 0.9217012524604797,
1691
+ "val/ratio_var": 7.061174983391538e-05
1692
+ },
1693
+ {
1694
+ "episode": 1296,
1695
+ "epoch": 2.2736842105263158,
1696
+ "eps": 3,
1697
+ "loss/policy_avg": 0.011339722201228142,
1698
+ "loss/value_avg": 0.3490160405635834,
1699
+ "lr": 3.5579439252336446e-06,
1700
+ "objective/entropy": 4.046834945678711,
1701
+ "objective/kl": 177.92718505859375,
1702
+ "objective/non_score_reward": -8.89635944366455,
1703
+ "objective/rlhf_reward": -3.958859443664551,
1704
+ "objective/scores": 4.9375,
1705
+ "policy/approxkl_avg": 5.583766460418701,
1706
+ "policy/clipfrac_avg": 0.03478773683309555,
1707
+ "policy/entropy_avg": 0.3193933963775635,
1708
+ "step": 81,
1709
+ "val/clipfrac_avg": 0.0,
1710
+ "val/num_eos_tokens": 0,
1711
+ "val/ratio": 0.9363265037536621,
1712
+ "val/ratio_var": 5.8069101214641705e-05
1713
+ },
1714
+ {
1715
+ "episode": 1312,
1716
+ "epoch": 2.3017543859649123,
1717
+ "eps": 3,
1718
+ "loss/policy_avg": 0.007465606089681387,
1719
+ "loss/value_avg": 0.3137081563472748,
1720
+ "lr": 3.4261682242990656e-06,
1721
+ "objective/entropy": 3.293423652648926,
1722
+ "objective/kl": 173.18377685546875,
1723
+ "objective/non_score_reward": -8.659189224243164,
1724
+ "objective/rlhf_reward": -3.065439224243164,
1725
+ "objective/scores": 5.59375,
1726
+ "policy/approxkl_avg": 4.794089317321777,
1727
+ "policy/clipfrac_avg": 0.0383254736661911,
1728
+ "policy/entropy_avg": 0.29685914516448975,
1729
+ "step": 82,
1730
+ "val/clipfrac_avg": 0.000589622650295496,
1731
+ "val/num_eos_tokens": 0,
1732
+ "val/ratio": 0.9458816647529602,
1733
+ "val/ratio_var": 0.00023432180751115084
1734
+ },
1735
+ {
1736
+ "episode": 1328,
1737
+ "epoch": 2.329824561403509,
1738
+ "eps": 3,
1739
+ "loss/policy_avg": -0.00093865767121315,
1740
+ "loss/value_avg": 0.9402576684951782,
1741
+ "lr": 3.294392523364486e-06,
1742
+ "objective/entropy": 5.09280252456665,
1743
+ "objective/kl": 173.88351440429688,
1744
+ "objective/non_score_reward": -8.694175720214844,
1745
+ "objective/rlhf_reward": -4.866050720214844,
1746
+ "objective/scores": 3.828125,
1747
+ "policy/approxkl_avg": 3.8168904781341553,
1748
+ "policy/clipfrac_avg": 0.03891509398818016,
1749
+ "policy/entropy_avg": 0.35750845074653625,
1750
+ "step": 83,
1751
+ "val/clipfrac_avg": 0.000589622650295496,
1752
+ "val/num_eos_tokens": 0,
1753
+ "val/ratio": 0.9232673645019531,
1754
+ "val/ratio_var": 4.1539384255884215e-05
1755
+ },
1756
+ {
1757
+ "episode": 1344,
1758
+ "epoch": 2.357894736842105,
1759
+ "eps": 3,
1760
+ "loss/policy_avg": -0.007357731461524963,
1761
+ "loss/value_avg": 0.36178284883499146,
1762
+ "lr": 3.1626168224299067e-06,
1763
+ "objective/entropy": 5.281716346740723,
1764
+ "objective/kl": 179.2125701904297,
1765
+ "objective/non_score_reward": -8.960628509521484,
1766
+ "objective/rlhf_reward": -3.9918785095214844,
1767
+ "objective/scores": 4.96875,
1768
+ "policy/approxkl_avg": 4.461269378662109,
1769
+ "policy/clipfrac_avg": 0.05365566164255142,
1770
+ "policy/entropy_avg": 0.35694169998168945,
1771
+ "step": 84,
1772
+ "val/clipfrac_avg": 0.000589622650295496,
1773
+ "val/num_eos_tokens": 0,
1774
+ "val/ratio": 0.9178085923194885,
1775
+ "val/ratio_var": 4.949720823788084e-05
1776
+ },
1777
+ {
1778
+ "episode": 1360,
1779
+ "epoch": 2.3859649122807016,
1780
+ "eps": 3,
1781
+ "loss/policy_avg": 0.0004696398973464966,
1782
+ "loss/value_avg": 0.30143094062805176,
1783
+ "lr": 3.0308411214953273e-06,
1784
+ "objective/entropy": 2.755769729614258,
1785
+ "objective/kl": 173.08140563964844,
1786
+ "objective/non_score_reward": -8.654069900512695,
1787
+ "objective/rlhf_reward": -2.9665699005126953,
1788
+ "objective/scores": 5.6875,
1789
+ "policy/approxkl_avg": 5.356992721557617,
1790
+ "policy/clipfrac_avg": 0.03242924436926842,
1791
+ "policy/entropy_avg": 0.282896488904953,
1792
+ "step": 85,
1793
+ "val/clipfrac_avg": 0.0,
1794
+ "val/num_eos_tokens": 0,
1795
+ "val/ratio": 0.9419326782226562,
1796
+ "val/ratio_var": 3.9359238144243136e-05
1797
+ },
1798
+ {
1799
+ "episode": 1376,
1800
+ "epoch": 2.414035087719298,
1801
+ "eps": 3,
1802
+ "loss/policy_avg": 0.0008706599473953247,
1803
+ "loss/value_avg": 0.5158276557922363,
1804
+ "lr": 2.8990654205607475e-06,
1805
+ "objective/entropy": 4.149250507354736,
1806
+ "objective/kl": 173.71505737304688,
1807
+ "objective/non_score_reward": -8.685752868652344,
1808
+ "objective/rlhf_reward": -3.6857528686523438,
1809
+ "objective/scores": 5.0,
1810
+ "policy/approxkl_avg": 5.095344066619873,
1811
+ "policy/clipfrac_avg": 0.030070755630731583,
1812
+ "policy/entropy_avg": 0.3076534867286682,
1813
+ "step": 86,
1814
+ "val/clipfrac_avg": 0.0,
1815
+ "val/num_eos_tokens": 0,
1816
+ "val/ratio": 0.93389892578125,
1817
+ "val/ratio_var": 9.108168342208955e-06
1818
+ },
1819
+ {
1820
+ "episode": 1392,
1821
+ "epoch": 2.442105263157895,
1822
+ "eps": 3,
1823
+ "loss/policy_avg": -0.0013641200494021177,
1824
+ "loss/value_avg": 0.46665364503860474,
1825
+ "lr": 2.767289719626168e-06,
1826
+ "objective/entropy": 3.9847404956817627,
1827
+ "objective/kl": 171.97903442382812,
1828
+ "objective/non_score_reward": -8.59895133972168,
1829
+ "objective/rlhf_reward": -3.4114513397216797,
1830
+ "objective/scores": 5.1875,
1831
+ "policy/approxkl_avg": 4.758839130401611,
1832
+ "policy/clipfrac_avg": 0.02771226316690445,
1833
+ "policy/entropy_avg": 0.3068329691886902,
1834
+ "step": 87,
1835
+ "val/clipfrac_avg": 0.002358490601181984,
1836
+ "val/num_eos_tokens": 0,
1837
+ "val/ratio": 0.9303795099258423,
1838
+ "val/ratio_var": 2.7705931643140502e-05
1839
+ },
1840
+ {
1841
+ "episode": 1408,
1842
+ "epoch": 2.4701754385964914,
1843
+ "eps": 3,
1844
+ "loss/policy_avg": -0.009293105453252792,
1845
+ "loss/value_avg": 0.1374308168888092,
1846
+ "lr": 2.6355140186915887e-06,
1847
+ "objective/entropy": 2.8504319190979004,
1848
+ "objective/kl": 178.8887176513672,
1849
+ "objective/non_score_reward": -8.944437026977539,
1850
+ "objective/rlhf_reward": -2.975687026977539,
1851
+ "objective/scores": 5.96875,
1852
+ "policy/approxkl_avg": 5.254701614379883,
1853
+ "policy/clipfrac_avg": 0.026533018797636032,
1854
+ "policy/entropy_avg": 0.2935040593147278,
1855
+ "step": 88,
1856
+ "val/clipfrac_avg": 0.002358490601181984,
1857
+ "val/num_eos_tokens": 0,
1858
+ "val/ratio": 0.9373711943626404,
1859
+ "val/ratio_var": 1.766591776686255e-05
1860
+ },
1861
+ {
1862
+ "episode": 1424,
1863
+ "epoch": 2.498245614035088,
1864
+ "eps": 3,
1865
+ "loss/policy_avg": 0.00495288148522377,
1866
+ "loss/value_avg": 0.20061969757080078,
1867
+ "lr": 2.5037383177570093e-06,
1868
+ "objective/entropy": 4.51104211807251,
1869
+ "objective/kl": 169.7410125732422,
1870
+ "objective/non_score_reward": -8.487051010131836,
1871
+ "objective/rlhf_reward": -2.768301010131836,
1872
+ "objective/scores": 5.71875,
1873
+ "policy/approxkl_avg": 4.481791019439697,
1874
+ "policy/clipfrac_avg": 0.03478773683309555,
1875
+ "policy/entropy_avg": 0.3265501856803894,
1876
+ "step": 89,
1877
+ "val/clipfrac_avg": 0.0,
1878
+ "val/num_eos_tokens": 0,
1879
+ "val/ratio": 0.9414163827896118,
1880
+ "val/ratio_var": 0.00011632378300419077
1881
+ },
1882
+ {
1883
+ "episode": 1440,
1884
+ "epoch": 2.526315789473684,
1885
+ "eps": 3,
1886
+ "loss/policy_avg": 0.00646105594933033,
1887
+ "loss/value_avg": 0.42740941047668457,
1888
+ "lr": 2.37196261682243e-06,
1889
+ "objective/entropy": 3.2403650283813477,
1890
+ "objective/kl": 176.238037109375,
1891
+ "objective/non_score_reward": -8.811902046203613,
1892
+ "objective/rlhf_reward": -3.9994020462036133,
1893
+ "objective/scores": 4.8125,
1894
+ "policy/approxkl_avg": 5.44842529296875,
1895
+ "policy/clipfrac_avg": 0.01591981202363968,
1896
+ "policy/entropy_avg": 0.2933845520019531,
1897
+ "step": 90,
1898
+ "val/clipfrac_avg": 0.000589622650295496,
1899
+ "val/num_eos_tokens": 0,
1900
+ "val/ratio": 0.9381667971611023,
1901
+ "val/ratio_var": 9.369335020892322e-05
1902
+ },
1903
+ {
1904
+ "episode": 1456,
1905
+ "epoch": 2.5543859649122806,
1906
+ "eps": 3,
1907
+ "loss/policy_avg": -0.005389541387557983,
1908
+ "loss/value_avg": 0.4948211908340454,
1909
+ "lr": 2.2401869158878504e-06,
1910
+ "objective/entropy": 2.898387908935547,
1911
+ "objective/kl": 173.48486328125,
1912
+ "objective/non_score_reward": -8.674242973327637,
1913
+ "objective/rlhf_reward": -3.5179929733276367,
1914
+ "objective/scores": 5.15625,
1915
+ "policy/approxkl_avg": 4.66801643371582,
1916
+ "policy/clipfrac_avg": 0.020636793226003647,
1917
+ "policy/entropy_avg": 0.2913670837879181,
1918
+ "step": 91,
1919
+ "val/clipfrac_avg": 0.001179245300590992,
1920
+ "val/num_eos_tokens": 0,
1921
+ "val/ratio": 0.9378049373626709,
1922
+ "val/ratio_var": 1.2327662261668593e-05
1923
+ },
1924
+ {
1925
+ "episode": 1472,
1926
+ "epoch": 2.5824561403508772,
1927
+ "eps": 3,
1928
+ "loss/policy_avg": -0.010267895646393299,
1929
+ "loss/value_avg": 0.26834648847579956,
1930
+ "lr": 2.108411214953271e-06,
1931
+ "objective/entropy": 4.616816997528076,
1932
+ "objective/kl": 171.12762451171875,
1933
+ "objective/non_score_reward": -8.556382179260254,
1934
+ "objective/rlhf_reward": -3.587632179260254,
1935
+ "objective/scores": 4.96875,
1936
+ "policy/approxkl_avg": 4.146580219268799,
1937
+ "policy/clipfrac_avg": 0.041273586452007294,
1938
+ "policy/entropy_avg": 0.34417253732681274,
1939
+ "step": 92,
1940
+ "val/clipfrac_avg": 0.000589622650295496,
1941
+ "val/num_eos_tokens": 0,
1942
+ "val/ratio": 0.9241670370101929,
1943
+ "val/ratio_var": 4.9057460273616016e-05
1944
+ },
1945
+ {
1946
+ "episode": 1488,
1947
+ "epoch": 2.610526315789474,
1948
+ "eps": 3,
1949
+ "loss/policy_avg": 0.0006395354866981506,
1950
+ "loss/value_avg": 0.7872554063796997,
1951
+ "lr": 1.9766355140186916e-06,
1952
+ "objective/entropy": 5.483046531677246,
1953
+ "objective/kl": 169.1695098876953,
1954
+ "objective/non_score_reward": -8.458476066589355,
1955
+ "objective/rlhf_reward": -4.4741010665893555,
1956
+ "objective/scores": 3.984375,
1957
+ "policy/approxkl_avg": 2.8852078914642334,
1958
+ "policy/clipfrac_avg": 0.032429248094558716,
1959
+ "policy/entropy_avg": 0.35312554240226746,
1960
+ "step": 93,
1961
+ "val/clipfrac_avg": 0.001768867950886488,
1962
+ "val/num_eos_tokens": 0,
1963
+ "val/ratio": 0.9286659955978394,
1964
+ "val/ratio_var": 2.8370095606078394e-05
1965
+ },
1966
+ {
1967
+ "episode": 1504,
1968
+ "epoch": 2.6385964912280704,
1969
+ "eps": 3,
1970
+ "loss/policy_avg": -0.00652042031288147,
1971
+ "loss/value_avg": 0.17014235258102417,
1972
+ "lr": 1.844859813084112e-06,
1973
+ "objective/entropy": 2.737617015838623,
1974
+ "objective/kl": 178.22747802734375,
1975
+ "objective/non_score_reward": -8.911375045776367,
1976
+ "objective/rlhf_reward": -3.286375045776367,
1977
+ "objective/scores": 5.625,
1978
+ "policy/approxkl_avg": 5.498225688934326,
1979
+ "policy/clipfrac_avg": 0.028891511261463165,
1980
+ "policy/entropy_avg": 0.28995558619499207,
1981
+ "step": 94,
1982
+ "val/clipfrac_avg": 0.001768867950886488,
1983
+ "val/num_eos_tokens": 0,
1984
+ "val/ratio": 0.9345540404319763,
1985
+ "val/ratio_var": 1.2709216434814152e-06
1986
+ },
1987
+ {
1988
+ "episode": 1520,
1989
+ "epoch": 2.6666666666666665,
1990
+ "eps": 3,
1991
+ "loss/policy_avg": 0.007195580750703812,
1992
+ "loss/value_avg": 0.40153437852859497,
1993
+ "lr": 1.7130841121495328e-06,
1994
+ "objective/entropy": 4.30942440032959,
1995
+ "objective/kl": 176.95938110351562,
1996
+ "objective/non_score_reward": -8.847970008850098,
1997
+ "objective/rlhf_reward": -3.8479700088500977,
1998
+ "objective/scores": 5.0,
1999
+ "policy/approxkl_avg": 5.267421245574951,
2000
+ "policy/clipfrac_avg": 0.03125,
2001
+ "policy/entropy_avg": 0.31008654832839966,
2002
+ "step": 95,
2003
+ "val/clipfrac_avg": 0.0,
2004
+ "val/num_eos_tokens": 0,
2005
+ "val/ratio": 0.9336869716644287,
2006
+ "val/ratio_var": 4.966981941834092e-05
2007
+ },
2008
+ {
2009
+ "episode": 1536,
2010
+ "epoch": 2.694736842105263,
2011
+ "eps": 3,
2012
+ "loss/policy_avg": 0.012591801583766937,
2013
+ "loss/value_avg": 0.3597390055656433,
2014
+ "lr": 1.5813084112149534e-06,
2015
+ "objective/entropy": 5.459916591644287,
2016
+ "objective/kl": 172.44110107421875,
2017
+ "objective/non_score_reward": -8.622055053710938,
2018
+ "objective/rlhf_reward": -3.6220550537109375,
2019
+ "objective/scores": 5.0,
2020
+ "policy/approxkl_avg": 4.5339765548706055,
2021
+ "policy/clipfrac_avg": 0.03537736088037491,
2022
+ "policy/entropy_avg": 0.3411254286766052,
2023
+ "step": 96,
2024
+ "val/clipfrac_avg": 0.0,
2025
+ "val/num_eos_tokens": 0,
2026
+ "val/ratio": 0.9462149739265442,
2027
+ "val/ratio_var": 3.8459929783130065e-05
2028
+ },
2029
+ {
2030
+ "episode": 1552,
2031
+ "epoch": 2.7228070175438597,
2032
+ "eps": 3,
2033
+ "loss/policy_avg": -0.003356472123414278,
2034
+ "loss/value_avg": 0.6434417963027954,
2035
+ "lr": 1.4495327102803737e-06,
2036
+ "objective/entropy": 5.633913516998291,
2037
+ "objective/kl": 172.26502990722656,
2038
+ "objective/non_score_reward": -8.613250732421875,
2039
+ "objective/rlhf_reward": -4.363250732421875,
2040
+ "objective/scores": 4.25,
2041
+ "policy/approxkl_avg": 3.585165500640869,
2042
+ "policy/clipfrac_avg": 0.03537735715508461,
2043
+ "policy/entropy_avg": 0.34199586510658264,
2044
+ "step": 97,
2045
+ "val/clipfrac_avg": 0.001179245300590992,
2046
+ "val/num_eos_tokens": 0,
2047
+ "val/ratio": 0.9311625957489014,
2048
+ "val/ratio_var": 6.935850979061797e-05
2049
+ },
2050
+ {
2051
+ "episode": 1568,
2052
+ "epoch": 2.7508771929824563,
2053
+ "eps": 3,
2054
+ "loss/policy_avg": -0.003898909315466881,
2055
+ "loss/value_avg": 0.36550819873809814,
2056
+ "lr": 1.3177570093457943e-06,
2057
+ "objective/entropy": 4.281040191650391,
2058
+ "objective/kl": 174.15972900390625,
2059
+ "objective/non_score_reward": -8.707986831665039,
2060
+ "objective/rlhf_reward": -3.614236831665039,
2061
+ "objective/scores": 5.09375,
2062
+ "policy/approxkl_avg": 5.1715850830078125,
2063
+ "policy/clipfrac_avg": 0.02712264284491539,
2064
+ "policy/entropy_avg": 0.3137935698032379,
2065
+ "step": 98,
2066
+ "val/clipfrac_avg": 0.0,
2067
+ "val/num_eos_tokens": 0,
2068
+ "val/ratio": 0.9350335597991943,
2069
+ "val/ratio_var": 1.248767239303561e-05
2070
+ },
2071
+ {
2072
+ "episode": 1584,
2073
+ "epoch": 2.778947368421053,
2074
+ "eps": 3,
2075
+ "loss/policy_avg": 0.0017306804656982422,
2076
+ "loss/value_avg": 0.2737918496131897,
2077
+ "lr": 1.185981308411215e-06,
2078
+ "objective/entropy": 5.210065841674805,
2079
+ "objective/kl": 173.97068786621094,
2080
+ "objective/non_score_reward": -8.69853401184082,
2081
+ "objective/rlhf_reward": -3.8860340118408203,
2082
+ "objective/scores": 4.8125,
2083
+ "policy/approxkl_avg": 5.011469841003418,
2084
+ "policy/clipfrac_avg": 0.04304245114326477,
2085
+ "policy/entropy_avg": 0.3419041931629181,
2086
+ "step": 99,
2087
+ "val/clipfrac_avg": 0.0,
2088
+ "val/num_eos_tokens": 0,
2089
+ "val/ratio": 0.9419320225715637,
2090
+ "val/ratio_var": 8.726042324269656e-06
2091
+ },
2092
+ {
2093
+ "episode": 1600,
2094
+ "epoch": 2.807017543859649,
2095
+ "eps": 3,
2096
+ "loss/policy_avg": -0.006221463903784752,
2097
+ "loss/value_avg": 0.3625496029853821,
2098
+ "lr": 1.0542056074766355e-06,
2099
+ "objective/entropy": 3.721562623977661,
2100
+ "objective/kl": 175.773193359375,
2101
+ "objective/non_score_reward": -8.78865909576416,
2102
+ "objective/rlhf_reward": -3.47615909576416,
2103
+ "objective/scores": 5.3125,
2104
+ "policy/approxkl_avg": 5.388751029968262,
2105
+ "policy/clipfrac_avg": 0.03419811278581619,
2106
+ "policy/entropy_avg": 0.29315799474716187,
2107
+ "step": 100,
2108
+ "val/clipfrac_avg": 0.0,
2109
+ "val/num_eos_tokens": 0,
2110
+ "val/ratio": 0.949840784072876,
2111
+ "val/ratio_var": 3.881535303662531e-05
2112
+ },
2113
+ {
2114
+ "episode": 1616,
2115
+ "epoch": 2.8350877192982455,
2116
+ "eps": 3,
2117
+ "loss/policy_avg": -0.005170758813619614,
2118
+ "loss/value_avg": 0.4136154055595398,
2119
+ "lr": 9.22429906542056e-07,
2120
+ "objective/entropy": 5.907715320587158,
2121
+ "objective/kl": 171.47390747070312,
2122
+ "objective/non_score_reward": -8.573695182800293,
2123
+ "objective/rlhf_reward": -4.167445182800293,
2124
+ "objective/scores": 4.40625,
2125
+ "policy/approxkl_avg": 5.403087615966797,
2126
+ "policy/clipfrac_avg": 0.03478773683309555,
2127
+ "policy/entropy_avg": 0.33564049005508423,
2128
+ "step": 101,
2129
+ "val/clipfrac_avg": 0.000589622650295496,
2130
+ "val/num_eos_tokens": 0,
2131
+ "val/ratio": 0.9340347051620483,
2132
+ "val/ratio_var": 3.0960076401242986e-05
2133
+ },
2134
+ {
2135
+ "episode": 1632,
2136
+ "epoch": 2.863157894736842,
2137
+ "eps": 3,
2138
+ "loss/policy_avg": 0.002457182854413986,
2139
+ "loss/value_avg": 0.27742013335227966,
2140
+ "lr": 7.906542056074767e-07,
2141
+ "objective/entropy": 5.222499370574951,
2142
+ "objective/kl": 176.05380249023438,
2143
+ "objective/non_score_reward": -8.802690505981445,
2144
+ "objective/rlhf_reward": -3.6151905059814453,
2145
+ "objective/scores": 5.1875,
2146
+ "policy/approxkl_avg": 4.675132751464844,
2147
+ "policy/clipfrac_avg": 0.04304245114326477,
2148
+ "policy/entropy_avg": 0.3403066396713257,
2149
+ "step": 102,
2150
+ "val/clipfrac_avg": 0.0,
2151
+ "val/num_eos_tokens": 0,
2152
+ "val/ratio": 0.9404515027999878,
2153
+ "val/ratio_var": 7.289678615052253e-05
2154
+ },
2155
+ {
2156
+ "episode": 1648,
2157
+ "epoch": 2.8912280701754387,
2158
+ "eps": 3,
2159
+ "loss/policy_avg": -0.006544323638081551,
2160
+ "loss/value_avg": 0.29731422662734985,
2161
+ "lr": 6.588785046728972e-07,
2162
+ "objective/entropy": 4.219725608825684,
2163
+ "objective/kl": 178.1557159423828,
2164
+ "objective/non_score_reward": -8.90778636932373,
2165
+ "objective/rlhf_reward": -3.8452863693237305,
2166
+ "objective/scores": 5.0625,
2167
+ "policy/approxkl_avg": 5.953890800476074,
2168
+ "policy/clipfrac_avg": 0.0383254699409008,
2169
+ "policy/entropy_avg": 0.31199511885643005,
2170
+ "step": 103,
2171
+ "val/clipfrac_avg": 0.0,
2172
+ "val/num_eos_tokens": 0,
2173
+ "val/ratio": 0.9407525658607483,
2174
+ "val/ratio_var": 5.8047560742124915e-05
2175
+ },
2176
+ {
2177
+ "episode": 1664,
2178
+ "epoch": 2.9192982456140353,
2179
+ "eps": 3,
2180
+ "loss/policy_avg": -0.0011880630627274513,
2181
+ "loss/value_avg": 0.21903052926063538,
2182
+ "lr": 5.271028037383178e-07,
2183
+ "objective/entropy": 5.557653427124023,
2184
+ "objective/kl": 170.518798828125,
2185
+ "objective/non_score_reward": -8.52593994140625,
2186
+ "objective/rlhf_reward": -3.05718994140625,
2187
+ "objective/scores": 5.46875,
2188
+ "policy/approxkl_avg": 4.447786331176758,
2189
+ "policy/clipfrac_avg": 0.0383254699409008,
2190
+ "policy/entropy_avg": 0.33431151509284973,
2191
+ "step": 104,
2192
+ "val/clipfrac_avg": 0.0,
2193
+ "val/num_eos_tokens": 0,
2194
+ "val/ratio": 0.9298925399780273,
2195
+ "val/ratio_var": 9.144405339611694e-06
2196
+ },
2197
+ {
2198
+ "episode": 1680,
2199
+ "epoch": 2.9473684210526314,
2200
+ "eps": 3,
2201
+ "loss/policy_avg": -0.0007513905875384808,
2202
+ "loss/value_avg": 0.21037007868289948,
2203
+ "lr": 3.9532710280373834e-07,
2204
+ "objective/entropy": 3.900575876235962,
2205
+ "objective/kl": 174.99456787109375,
2206
+ "objective/non_score_reward": -8.74972915649414,
2207
+ "objective/rlhf_reward": -3.5309791564941406,
2208
+ "objective/scores": 5.21875,
2209
+ "policy/approxkl_avg": 5.165627479553223,
2210
+ "policy/clipfrac_avg": 0.028301887214183807,
2211
+ "policy/entropy_avg": 0.2935909032821655,
2212
+ "step": 105,
2213
+ "val/clipfrac_avg": 0.001179245300590992,
2214
+ "val/num_eos_tokens": 0,
2215
+ "val/ratio": 0.9422957897186279,
2216
+ "val/ratio_var": 2.6614558009896427e-05
2217
+ },
2218
+ {
2219
+ "episode": 1696,
2220
+ "epoch": 2.975438596491228,
2221
+ "eps": 3,
2222
+ "loss/policy_avg": -0.005426734685897827,
2223
+ "loss/value_avg": 0.21496959030628204,
2224
+ "lr": 2.635514018691589e-07,
2225
+ "objective/entropy": 4.556634902954102,
2226
+ "objective/kl": 173.05136108398438,
2227
+ "objective/non_score_reward": -8.652568817138672,
2228
+ "objective/rlhf_reward": -2.933818817138672,
2229
+ "objective/scores": 5.71875,
2230
+ "policy/approxkl_avg": 4.820314884185791,
2231
+ "policy/clipfrac_avg": 0.04304245486855507,
2232
+ "policy/entropy_avg": 0.31966692209243774,
2233
+ "step": 106,
2234
+ "val/clipfrac_avg": 0.0,
2235
+ "val/num_eos_tokens": 0,
2236
+ "val/ratio": 0.9327390193939209,
2237
+ "val/ratio_var": 8.508096652803943e-05
2238
+ },
2239
+ {
2240
+ "episode": 1712,
2241
+ "epoch": 3.0035087719298246,
2242
+ "eps": 3,
2243
+ "loss/policy_avg": 0.0001406269147992134,
2244
+ "loss/value_avg": 0.186610609292984,
2245
+ "lr": 1.3177570093457944e-07,
2246
+ "objective/entropy": 4.999897003173828,
2247
+ "objective/kl": 173.25045776367188,
2248
+ "objective/non_score_reward": -8.66252326965332,
2249
+ "objective/rlhf_reward": -2.9437732696533203,
2250
+ "objective/scores": 5.71875,
2251
+ "policy/approxkl_avg": 4.337066173553467,
2252
+ "policy/clipfrac_avg": 0.04716981202363968,
2253
+ "policy/entropy_avg": 0.3558363914489746,
2254
+ "step": 107,
2255
+ "val/clipfrac_avg": 0.0,
2256
+ "val/num_eos_tokens": 0,
2257
+ "val/ratio": 0.9243938326835632,
2258
+ "val/ratio_var": 0.000128799001686275
2259
+ }
2260
+ ],
2261
+ "logging_steps": 10,
2262
+ "max_steps": 107,
2263
+ "num_input_tokens_seen": 0,
2264
+ "num_train_epochs": 3.0,
2265
+ "save_steps": 500,
2266
+ "stateful_callbacks": {
2267
+ "TrainerControl": {
2268
+ "args": {
2269
+ "should_epoch_stop": false,
2270
+ "should_evaluate": false,
2271
+ "should_log": false,
2272
+ "should_save": true,
2273
+ "should_training_stop": true
2274
+ },
2275
+ "attributes": {}
2276
+ }
2277
+ },
2278
+ "total_flos": 0,
2279
+ "train_batch_size": null,
2280
+ "trial_name": null,
2281
+ "trial_params": null
2282
+ }
checkpoint-107/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ce71d5d5d23d373f1100beface744ea0e78db036a0645797d9ced3b0c1d8365
3
+ size 6673
checkpoint-107/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen2ForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "dtype": "bfloat16",
7
+ "eos_token_id": 151645,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 1536,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 8960,
12
+ "layer_types": [
13
+ "full_attention",
14
+ "full_attention",
15
+ "full_attention",
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention"
41
+ ],
42
+ "max_position_embeddings": 32768,
43
+ "max_window_layers": 21,
44
+ "model_type": "qwen2",
45
+ "num_attention_heads": 12,
46
+ "num_hidden_layers": 28,
47
+ "num_key_value_heads": 2,
48
+ "pad_token_id": 151643,
49
+ "rms_norm_eps": 1e-06,
50
+ "rope_scaling": null,
51
+ "rope_theta": 1000000.0,
52
+ "sliding_window": null,
53
+ "tie_word_embeddings": true,
54
+ "transformers_version": "4.57.1",
55
+ "use_cache": false,
56
+ "use_sliding_window": false,
57
+ "vocab_size": 151936
58
+ }
generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_sample": true,
3
+ "pad_token_id": 151643,
4
+ "repetition_penalty": 1.1,
5
+ "temperature": 0.7,
6
+ "top_k": 20,
7
+ "top_p": 0.8,
8
+ "transformers_version": "4.57.1"
9
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c93779623565bec515d60d0d5ba3ac8bb8d891825cd93bd6ffb072dbf265550
3
+ size 3087467144
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f650899cdee1a74abb093b96e45774316750373414f9e11647b8480290d3937f
3
+ size 11421988
tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "split_special_tokens": false,
205
+ "tokenizer_class": "Qwen2Tokenizer",
206
+ "unk_token": null
207
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ce71d5d5d23d373f1100beface744ea0e78db036a0645797d9ced3b0c1d8365
3
+ size 6673
vocab.json ADDED
The diff for this file is too large to render. See raw diff