iamPi committed on
Commit
12df14e
·
verified ·
1 Parent(s): 0821e34

Upload alfworld-v3.0.0 model

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +3 -0
  2. README.md +68 -0
  3. adapter_config.json +46 -0
  4. adapter_model.safetensors +3 -0
  5. added_tokens.json +24 -0
  6. chat_template.jinja +5 -0
  7. checkpoint-20/README.md +210 -0
  8. checkpoint-20/adapter_config.json +46 -0
  9. checkpoint-20/adapter_model.safetensors +3 -0
  10. checkpoint-20/added_tokens.json +24 -0
  11. checkpoint-20/chat_template.jinja +5 -0
  12. checkpoint-20/merges.txt +0 -0
  13. checkpoint-20/optimizer.pt +3 -0
  14. checkpoint-20/ref/adapter_config.json +46 -0
  15. checkpoint-20/ref/adapter_model.safetensors +3 -0
  16. checkpoint-20/rng_state_0.pth +3 -0
  17. checkpoint-20/rng_state_1.pth +3 -0
  18. checkpoint-20/scheduler.pt +3 -0
  19. checkpoint-20/special_tokens_map.json +38 -0
  20. checkpoint-20/tokenizer.json +3 -0
  21. checkpoint-20/tokenizer_config.json +207 -0
  22. checkpoint-20/trainer_state.json +694 -0
  23. checkpoint-20/training_args.bin +3 -0
  24. checkpoint-20/vocab.json +0 -0
  25. checkpoint-32/README.md +210 -0
  26. checkpoint-32/adapter_config.json +46 -0
  27. checkpoint-32/adapter_model.safetensors +3 -0
  28. checkpoint-32/added_tokens.json +24 -0
  29. checkpoint-32/chat_template.jinja +5 -0
  30. checkpoint-32/merges.txt +0 -0
  31. checkpoint-32/optimizer.pt +3 -0
  32. checkpoint-32/ref/adapter_config.json +46 -0
  33. checkpoint-32/ref/adapter_model.safetensors +3 -0
  34. checkpoint-32/rng_state_0.pth +3 -0
  35. checkpoint-32/rng_state_1.pth +3 -0
  36. checkpoint-32/scheduler.pt +3 -0
  37. checkpoint-32/special_tokens_map.json +38 -0
  38. checkpoint-32/tokenizer.json +3 -0
  39. checkpoint-32/tokenizer_config.json +207 -0
  40. checkpoint-32/trainer_state.json +1090 -0
  41. checkpoint-32/training_args.bin +3 -0
  42. checkpoint-32/vocab.json +0 -0
  43. config.json +66 -0
  44. debug.log +13 -0
  45. merges.txt +0 -0
  46. ref/adapter_config.json +46 -0
  47. ref/adapter_model.safetensors +3 -0
  48. special_tokens_map.json +38 -0
  49. tokenizer.json +3 -0
  50. tokenizer_config.json +207 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ checkpoint-20/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ checkpoint-32/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ model_name: app/checkpoints/1/alfworld-v3.0.0
4
+ tags:
5
+ - generated_from_trainer
6
+ - axolotl
7
+ - grpo
8
+ - trl
9
+ licence: license
10
+ ---
11
+
12
+ # Model Card for app/checkpoints/1/alfworld-v3.0.0
13
+
14
+ This model is a fine-tuned version of [Qwen/Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct).
15
+ It has been trained using [TRL](https://github.com/huggingface/trl).
16
+
17
+ ## Quick start
18
+
19
+ ```python
20
+ from transformers import pipeline
21
+
22
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
23
+ generator = pipeline("text-generation", model="None", device="cuda")
24
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
25
+ print(output["generated_text"])
26
+ ```
27
+
28
+ ## Training procedure
29
+
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/iamPi-0519/Gradients-Rollout/runs/v89kzb8a)
31
+
32
+
33
+ This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
34
+
35
+ ### Framework versions
36
+
37
+ - TRL: 0.27.0.dev0
38
+ - Transformers: 4.57.6
39
+ - Pytorch: 2.9.0
40
+ - Datasets: 4.5.0
41
+ - Tokenizers: 0.22.2
42
+
43
+ ## Citations
44
+
45
+ Cite GRPO as:
46
+
47
+ ```bibtex
48
+ @article{shao2024deepseekmath,
49
+ title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
50
+ author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
51
+ year = 2024,
52
+ eprint = {arXiv:2402.03300},
53
+ }
54
+
55
+ ```
56
+
57
+ Cite TRL as:
58
+
59
+ ```bibtex
60
+ @misc{vonwerra2022trl,
61
+ title = {{TRL: Transformer Reinforcement Learning}},
62
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
63
+ year = 2020,
64
+ journal = {GitHub repository},
65
+ publisher = {GitHub},
66
+ howpublished = {\url{https://github.com/huggingface/trl}}
67
+ }
68
+ ```
adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "/cache/models/Qwen--Qwen2.5-3B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": null,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.0,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 16,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "gate_proj",
33
+ "k_proj",
34
+ "up_proj",
35
+ "q_proj",
36
+ "o_proj",
37
+ "down_proj",
38
+ "v_proj"
39
+ ],
40
+ "target_parameters": [],
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b878ff20d084d009a6f57bea4813e4afefd51a1703b7783226841a37ec00fedb
3
+ size 119801528
added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
chat_template.jinja ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>
2
+
3
+ '+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>
4
+
5
+ ' }}{% endif %}
checkpoint-20/README.md ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: /cache/models/Qwen--Qwen2.5-3B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - axolotl
7
+ - base_model:adapter:/cache/models/Qwen--Qwen2.5-3B-Instruct
8
+ - grpo
9
+ - lora
10
+ - transformers
11
+ - trl
12
+ ---
13
+
14
+ # Model Card for Model ID
15
+
16
+ <!-- Provide a quick summary of what the model is/does. -->
17
+
18
+
19
+
20
+ ## Model Details
21
+
22
+ ### Model Description
23
+
24
+ <!-- Provide a longer summary of what this model is. -->
25
+
26
+
27
+
28
+ - **Developed by:** [More Information Needed]
29
+ - **Funded by [optional]:** [More Information Needed]
30
+ - **Shared by [optional]:** [More Information Needed]
31
+ - **Model type:** [More Information Needed]
32
+ - **Language(s) (NLP):** [More Information Needed]
33
+ - **License:** [More Information Needed]
34
+ - **Finetuned from model [optional]:** [More Information Needed]
35
+
36
+ ### Model Sources [optional]
37
+
38
+ <!-- Provide the basic links for the model. -->
39
+
40
+ - **Repository:** [More Information Needed]
41
+ - **Paper [optional]:** [More Information Needed]
42
+ - **Demo [optional]:** [More Information Needed]
43
+
44
+ ## Uses
45
+
46
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
47
+
48
+ ### Direct Use
49
+
50
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
51
+
52
+ [More Information Needed]
53
+
54
+ ### Downstream Use [optional]
55
+
56
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
57
+
58
+ [More Information Needed]
59
+
60
+ ### Out-of-Scope Use
61
+
62
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
63
+
64
+ [More Information Needed]
65
+
66
+ ## Bias, Risks, and Limitations
67
+
68
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
69
+
70
+ [More Information Needed]
71
+
72
+ ### Recommendations
73
+
74
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
75
+
76
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
77
+
78
+ ## How to Get Started with the Model
79
+
80
+ Use the code below to get started with the model.
81
+
82
+ [More Information Needed]
83
+
84
+ ## Training Details
85
+
86
+ ### Training Data
87
+
88
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
89
+
90
+ [More Information Needed]
91
+
92
+ ### Training Procedure
93
+
94
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
95
+
96
+ #### Preprocessing [optional]
97
+
98
+ [More Information Needed]
99
+
100
+
101
+ #### Training Hyperparameters
102
+
103
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
104
+
105
+ #### Speeds, Sizes, Times [optional]
106
+
107
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
108
+
109
+ [More Information Needed]
110
+
111
+ ## Evaluation
112
+
113
+ <!-- This section describes the evaluation protocols and provides the results. -->
114
+
115
+ ### Testing Data, Factors & Metrics
116
+
117
+ #### Testing Data
118
+
119
+ <!-- This should link to a Dataset Card if possible. -->
120
+
121
+ [More Information Needed]
122
+
123
+ #### Factors
124
+
125
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
126
+
127
+ [More Information Needed]
128
+
129
+ #### Metrics
130
+
131
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
132
+
133
+ [More Information Needed]
134
+
135
+ ### Results
136
+
137
+ [More Information Needed]
138
+
139
+ #### Summary
140
+
141
+
142
+
143
+ ## Model Examination [optional]
144
+
145
+ <!-- Relevant interpretability work for the model goes here -->
146
+
147
+ [More Information Needed]
148
+
149
+ ## Environmental Impact
150
+
151
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
152
+
153
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
154
+
155
+ - **Hardware Type:** [More Information Needed]
156
+ - **Hours used:** [More Information Needed]
157
+ - **Cloud Provider:** [More Information Needed]
158
+ - **Compute Region:** [More Information Needed]
159
+ - **Carbon Emitted:** [More Information Needed]
160
+
161
+ ## Technical Specifications [optional]
162
+
163
+ ### Model Architecture and Objective
164
+
165
+ [More Information Needed]
166
+
167
+ ### Compute Infrastructure
168
+
169
+ [More Information Needed]
170
+
171
+ #### Hardware
172
+
173
+ [More Information Needed]
174
+
175
+ #### Software
176
+
177
+ [More Information Needed]
178
+
179
+ ## Citation [optional]
180
+
181
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
182
+
183
+ **BibTeX:**
184
+
185
+ [More Information Needed]
186
+
187
+ **APA:**
188
+
189
+ [More Information Needed]
190
+
191
+ ## Glossary [optional]
192
+
193
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
194
+
195
+ [More Information Needed]
196
+
197
+ ## More Information [optional]
198
+
199
+ [More Information Needed]
200
+
201
+ ## Model Card Authors [optional]
202
+
203
+ [More Information Needed]
204
+
205
+ ## Model Card Contact
206
+
207
+ [More Information Needed]
208
+ ### Framework versions
209
+
210
+ - PEFT 0.18.1
checkpoint-20/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "/cache/models/Qwen--Qwen2.5-3B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": null,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.0,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 16,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "gate_proj",
33
+ "k_proj",
34
+ "up_proj",
35
+ "q_proj",
36
+ "o_proj",
37
+ "down_proj",
38
+ "v_proj"
39
+ ],
40
+ "target_parameters": [],
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
checkpoint-20/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9832dc1bc2023d5f3837556ccda2868270f77d4d6f58612c28d622015c5f420
3
+ size 119801528
checkpoint-20/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
checkpoint-20/chat_template.jinja ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>
2
+
3
+ '+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>
4
+
5
+ ' }}{% endif %}
checkpoint-20/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-20/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c58dcc3771032c6173598b076cb68a1f664661be00fe1e5910df9105bef62a03
3
+ size 61392581
checkpoint-20/ref/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "/cache/models/Qwen--Qwen2.5-3B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": null,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.0,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 16,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "gate_proj",
33
+ "k_proj",
34
+ "up_proj",
35
+ "q_proj",
36
+ "o_proj",
37
+ "down_proj",
38
+ "v_proj"
39
+ ],
40
+ "target_parameters": [],
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
checkpoint-20/ref/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a73e8a3357e88549443ed67169c44587d8849b718265a36909b7373da2322a00
3
+ size 59934640
checkpoint-20/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:617359dfa3f981ca698950bb07b7755a5d3b648c9110431d345731aec957bafd
3
+ size 14853
checkpoint-20/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce14c9f96a0aa5a11ec1906c8b06547968496f7f4f6c975b4a3a9e1bd70674ce
3
+ size 14917
checkpoint-20/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20e6cb87497d4549ec3c18de3c76741e527086c1a2492b0bd96b9fe6d8fdaaab
3
+ size 1465
checkpoint-20/special_tokens_map.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "bos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "eos_token": {
25
+ "content": "<|im_end|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ },
31
+ "pad_token": {
32
+ "content": "<|endoftext|>",
33
+ "lstrip": false,
34
+ "normalized": false,
35
+ "rstrip": false,
36
+ "single_word": false
37
+ }
38
+ }
checkpoint-20/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
checkpoint-20/tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": "<|im_end|>",
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "split_special_tokens": false,
205
+ "tokenizer_class": "Qwen2Tokenizer",
206
+ "unk_token": null
207
+ }
checkpoint-20/trainer_state.json ADDED
@@ -0,0 +1,694 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.3225806451612903,
6
+ "eval_steps": 500,
7
+ "global_step": 20,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "clip_ratio/high_max": 0.0,
14
+ "clip_ratio/high_mean": 0.0,
15
+ "clip_ratio/low_mean": 0.0,
16
+ "clip_ratio/low_min": 0.0,
17
+ "clip_ratio/region_mean": 0.0,
18
+ "completions/clipped_ratio": 0.53125,
19
+ "completions/max_length": 512.0,
20
+ "completions/max_terminated_length": 487.0,
21
+ "completions/mean_length": 370.3359375,
22
+ "completions/mean_terminated_length": 209.78334045410156,
23
+ "completions/min_length": 66.0,
24
+ "completions/min_terminated_length": 66.0,
25
+ "entropy": 0.40961652249097824,
26
+ "epoch": 0.016129032258064516,
27
+ "frac_reward_zero_std": 0.0,
28
+ "grad_norm": 0.1428968608379364,
29
+ "kl": 0.0,
30
+ "learning_rate": 0.0,
31
+ "loss": -0.0891,
32
+ "num_tokens": 110863.0,
33
+ "reward": 0.09531250596046448,
34
+ "reward_std": 2.4042975902557373,
35
+ "rewards/alfworld_rollout_reward_func/mean": 0.09531247615814209,
36
+ "rewards/alfworld_rollout_reward_func/std": 2.9919276237487793,
37
+ "sampling/importance_sampling_ratio/max": 2.9990427494049072,
38
+ "sampling/importance_sampling_ratio/mean": 0.6262807846069336,
39
+ "sampling/importance_sampling_ratio/min": 0.0,
40
+ "sampling/sampling_logp_difference/max": 4.450258255004883,
41
+ "sampling/sampling_logp_difference/mean": 0.024438712745904922,
42
+ "step": 1,
43
+ "step_time": 309.93943907799985
44
+ },
45
+ {
46
+ "clip_ratio/high_max": 0.0,
47
+ "clip_ratio/high_mean": 0.0,
48
+ "clip_ratio/low_mean": 0.0,
49
+ "clip_ratio/low_min": 0.0,
50
+ "clip_ratio/region_mean": 0.0,
51
+ "completions/clipped_ratio": 0.5703125,
52
+ "completions/max_length": 512.0,
53
+ "completions/max_terminated_length": 509.0,
54
+ "completions/mean_length": 380.1015625,
55
+ "completions/mean_terminated_length": 205.03636169433594,
56
+ "completions/min_length": 79.0,
57
+ "completions/min_terminated_length": 79.0,
58
+ "entropy": 0.40012454986572266,
59
+ "epoch": 0.03225806451612903,
60
+ "frac_reward_zero_std": 0.0,
61
+ "grad_norm": 0.12177681177854538,
62
+ "kl": 0.0,
63
+ "learning_rate": 2.5e-05,
64
+ "loss": -0.085,
65
+ "num_tokens": 222814.0,
66
+ "reward": 0.5320311784744263,
67
+ "reward_std": 2.642284393310547,
68
+ "rewards/alfworld_rollout_reward_func/mean": 0.5320311784744263,
69
+ "rewards/alfworld_rollout_reward_func/std": 3.4178764820098877,
70
+ "sampling/importance_sampling_ratio/max": 2.909560441970825,
71
+ "sampling/importance_sampling_ratio/mean": 0.5676747560501099,
72
+ "sampling/importance_sampling_ratio/min": 0.0,
73
+ "sampling/sampling_logp_difference/max": 4.949212551116943,
74
+ "sampling/sampling_logp_difference/mean": 0.023133086040616035,
75
+ "step": 2,
76
+ "step_time": 315.77915475400005
77
+ },
78
+ {
79
+ "clip_ratio/high_max": 0.0,
80
+ "clip_ratio/high_mean": 0.0,
81
+ "clip_ratio/low_mean": 0.0,
82
+ "clip_ratio/low_min": 0.0,
83
+ "clip_ratio/region_mean": 0.0,
84
+ "completions/clipped_ratio": 0.6015625,
85
+ "completions/max_length": 512.0,
86
+ "completions/max_terminated_length": 497.0,
87
+ "completions/mean_length": 378.34375,
88
+ "completions/mean_terminated_length": 176.5490264892578,
89
+ "completions/min_length": 76.0,
90
+ "completions/min_terminated_length": 76.0,
91
+ "entropy": 0.4211720749735832,
92
+ "epoch": 0.04838709677419355,
93
+ "frac_reward_zero_std": 0.0,
94
+ "grad_norm": 0.13358080387115479,
95
+ "kl": 0.00272539074649103,
96
+ "learning_rate": 2.493586654239869e-05,
97
+ "loss": -0.0053,
98
+ "num_tokens": 332775.0,
99
+ "reward": 0.925000011920929,
100
+ "reward_std": 3.4297051429748535,
101
+ "rewards/alfworld_rollout_reward_func/mean": 0.925000011920929,
102
+ "rewards/alfworld_rollout_reward_func/std": 3.9396629333496094,
103
+ "sampling/importance_sampling_ratio/max": 2.511305809020996,
104
+ "sampling/importance_sampling_ratio/mean": 0.5851866602897644,
105
+ "sampling/importance_sampling_ratio/min": 0.0,
106
+ "sampling/sampling_logp_difference/max": 3.5749268531799316,
107
+ "sampling/sampling_logp_difference/mean": 0.023533061146736145,
108
+ "step": 3,
109
+ "step_time": 296.23798338899906
110
+ },
111
+ {
112
+ "clip_ratio/high_max": 0.0,
113
+ "clip_ratio/high_mean": 0.0,
114
+ "clip_ratio/low_mean": 0.0,
115
+ "clip_ratio/low_min": 0.0,
116
+ "clip_ratio/region_mean": 0.0,
117
+ "completions/clipped_ratio": 0.53125,
118
+ "completions/max_length": 512.0,
119
+ "completions/max_terminated_length": 511.0,
120
+ "completions/mean_length": 357.84375,
121
+ "completions/mean_terminated_length": 183.1333465576172,
122
+ "completions/min_length": 66.0,
123
+ "completions/min_terminated_length": 66.0,
124
+ "entropy": 0.3783623240888119,
125
+ "epoch": 0.06451612903225806,
126
+ "frac_reward_zero_std": 0.0,
127
+ "grad_norm": 0.15261100232601166,
128
+ "kl": 0.002557590647484176,
129
+ "learning_rate": 2.474412426565618e-05,
130
+ "loss": -0.0004,
131
+ "num_tokens": 441773.0,
132
+ "reward": 0.45625001192092896,
133
+ "reward_std": 2.879180908203125,
134
+ "rewards/alfworld_rollout_reward_func/mean": 0.45625001192092896,
135
+ "rewards/alfworld_rollout_reward_func/std": 3.427947759628296,
136
+ "sampling/importance_sampling_ratio/max": 2.91353440284729,
137
+ "sampling/importance_sampling_ratio/mean": 0.6235978603363037,
138
+ "sampling/importance_sampling_ratio/min": 0.0,
139
+ "sampling/sampling_logp_difference/max": 4.593559741973877,
140
+ "sampling/sampling_logp_difference/mean": 0.023225955665111542,
141
+ "step": 4,
142
+ "step_time": 322.2789937999987
143
+ },
144
+ {
145
+ "clip_ratio/high_max": 0.0,
146
+ "clip_ratio/high_mean": 0.0,
147
+ "clip_ratio/low_mean": 0.0,
148
+ "clip_ratio/low_min": 0.0,
149
+ "clip_ratio/region_mean": 0.0,
150
+ "completions/clipped_ratio": 0.3984375,
151
+ "completions/max_length": 512.0,
152
+ "completions/max_terminated_length": 504.0,
153
+ "completions/mean_length": 298.53125,
154
+ "completions/mean_terminated_length": 157.14285278320312,
155
+ "completions/min_length": 64.0,
156
+ "completions/min_terminated_length": 64.0,
157
+ "entropy": 0.4225323870778084,
158
+ "epoch": 0.08064516129032258,
159
+ "frac_reward_zero_std": 0.0,
160
+ "grad_norm": 0.11338824778795242,
161
+ "kl": 0.003911561769200489,
162
+ "learning_rate": 2.442674070500061e-05,
163
+ "loss": 0.0261,
164
+ "num_tokens": 540709.0,
165
+ "reward": 0.48515623807907104,
166
+ "reward_std": 2.928551197052002,
167
+ "rewards/alfworld_rollout_reward_func/mean": 0.48515623807907104,
168
+ "rewards/alfworld_rollout_reward_func/std": 3.3146796226501465,
169
+ "sampling/importance_sampling_ratio/max": 2.970187187194824,
170
+ "sampling/importance_sampling_ratio/mean": 0.5831338763237,
171
+ "sampling/importance_sampling_ratio/min": 0.0,
172
+ "sampling/sampling_logp_difference/max": 6.362844944000244,
173
+ "sampling/sampling_logp_difference/mean": 0.024303022772073746,
174
+ "step": 5,
175
+ "step_time": 281.38914586900137
176
+ },
177
+ {
178
+ "clip_ratio/high_max": 0.0,
179
+ "clip_ratio/high_mean": 0.0,
180
+ "clip_ratio/low_mean": 0.0,
181
+ "clip_ratio/low_min": 0.0,
182
+ "clip_ratio/region_mean": 0.0,
183
+ "completions/clipped_ratio": 0.34375,
184
+ "completions/max_length": 512.0,
185
+ "completions/max_terminated_length": 476.0,
186
+ "completions/mean_length": 280.25,
187
+ "completions/mean_terminated_length": 158.85714721679688,
188
+ "completions/min_length": 79.0,
189
+ "completions/min_terminated_length": 79.0,
190
+ "entropy": 0.4307485744357109,
191
+ "epoch": 0.0967741935483871,
192
+ "frac_reward_zero_std": 0.0,
193
+ "grad_norm": 0.840685248374939,
194
+ "kl": 0.035365400719456375,
195
+ "learning_rate": 2.3986972645252883e-05,
196
+ "loss": 0.0634,
197
+ "num_tokens": 641237.0,
198
+ "reward": 1.1078124046325684,
199
+ "reward_std": 3.4873318672180176,
200
+ "rewards/alfworld_rollout_reward_func/mean": 1.107812523841858,
201
+ "rewards/alfworld_rollout_reward_func/std": 3.958895206451416,
202
+ "sampling/importance_sampling_ratio/max": 2.7673635482788086,
203
+ "sampling/importance_sampling_ratio/mean": 0.561335563659668,
204
+ "sampling/importance_sampling_ratio/min": 0.0,
205
+ "sampling/sampling_logp_difference/max": 11.066271781921387,
206
+ "sampling/sampling_logp_difference/mean": 0.02636803314089775,
207
+ "step": 6,
208
+ "step_time": 306.572960538002
209
+ },
210
+ {
211
+ "clip_ratio/high_max": 0.0,
212
+ "clip_ratio/high_mean": 0.0,
213
+ "clip_ratio/low_mean": 0.0,
214
+ "clip_ratio/low_min": 0.0,
215
+ "clip_ratio/region_mean": 0.0,
216
+ "completions/clipped_ratio": 0.28125,
217
+ "completions/max_length": 512.0,
218
+ "completions/max_terminated_length": 474.0,
219
+ "completions/mean_length": 248.328125,
220
+ "completions/mean_terminated_length": 145.1521759033203,
221
+ "completions/min_length": 61.0,
222
+ "completions/min_terminated_length": 61.0,
223
+ "entropy": 0.4486994035542011,
224
+ "epoch": 0.11290322580645161,
225
+ "frac_reward_zero_std": 0.0,
226
+ "grad_norm": 0.13363927602767944,
227
+ "kl": 0.010801714262925088,
228
+ "learning_rate": 2.342933270180728e-05,
229
+ "loss": 0.1147,
230
+ "num_tokens": 734359.0,
231
+ "reward": 0.7398437261581421,
232
+ "reward_std": 3.073120594024658,
233
+ "rewards/alfworld_rollout_reward_func/mean": 0.7398437261581421,
234
+ "rewards/alfworld_rollout_reward_func/std": 3.5458602905273438,
235
+ "sampling/importance_sampling_ratio/max": 2.4107882976531982,
236
+ "sampling/importance_sampling_ratio/mean": 0.5770797729492188,
237
+ "sampling/importance_sampling_ratio/min": 0.0,
238
+ "sampling/sampling_logp_difference/max": 7.637388229370117,
239
+ "sampling/sampling_logp_difference/mean": 0.027358299121260643,
240
+ "step": 7,
241
+ "step_time": 291.4309037350031
242
+ },
243
+ {
244
+ "clip_ratio/high_max": 0.0,
245
+ "clip_ratio/high_mean": 0.0,
246
+ "clip_ratio/low_mean": 0.0,
247
+ "clip_ratio/low_min": 0.0,
248
+ "clip_ratio/region_mean": 0.0,
249
+ "completions/clipped_ratio": 0.2109375,
250
+ "completions/max_length": 512.0,
251
+ "completions/max_terminated_length": 487.0,
252
+ "completions/mean_length": 224.53125,
253
+ "completions/mean_terminated_length": 147.68316650390625,
254
+ "completions/min_length": 71.0,
255
+ "completions/min_terminated_length": 71.0,
256
+ "entropy": 0.4458822049200535,
257
+ "epoch": 0.12903225806451613,
258
+ "frac_reward_zero_std": 0.0,
259
+ "grad_norm": 0.1380084902048111,
260
+ "kl": 0.018724333960562944,
261
+ "learning_rate": 2.2759543015090955e-05,
262
+ "loss": 0.0322,
263
+ "num_tokens": 827827.0,
264
+ "reward": 0.8648437261581421,
265
+ "reward_std": 3.5357110500335693,
266
+ "rewards/alfworld_rollout_reward_func/mean": 0.8648437857627869,
267
+ "rewards/alfworld_rollout_reward_func/std": 3.7438578605651855,
268
+ "sampling/importance_sampling_ratio/max": 2.939615249633789,
269
+ "sampling/importance_sampling_ratio/mean": 0.5691805481910706,
270
+ "sampling/importance_sampling_ratio/min": 0.0,
271
+ "sampling/sampling_logp_difference/max": 5.754482269287109,
272
+ "sampling/sampling_logp_difference/mean": 0.028573205694556236,
273
+ "step": 8,
274
+ "step_time": 305.87279431799834
275
+ },
276
+ {
277
+ "clip_ratio/high_max": 0.0,
278
+ "clip_ratio/high_mean": 0.0,
279
+ "clip_ratio/low_mean": 0.0,
280
+ "clip_ratio/low_min": 0.0,
281
+ "clip_ratio/region_mean": 0.0,
282
+ "completions/clipped_ratio": 0.1875,
283
+ "completions/max_length": 512.0,
284
+ "completions/max_terminated_length": 504.0,
285
+ "completions/mean_length": 218.171875,
286
+ "completions/mean_terminated_length": 150.36538696289062,
287
+ "completions/min_length": 68.0,
288
+ "completions/min_terminated_length": 68.0,
289
+ "entropy": 0.4581918604671955,
290
+ "epoch": 0.14516129032258066,
291
+ "frac_reward_zero_std": 0.0,
292
+ "grad_norm": 0.11444258689880371,
293
+ "kl": 0.02050452691037208,
294
+ "learning_rate": 2.1984476533659888e-05,
295
+ "loss": 0.0214,
296
+ "num_tokens": 918938.0,
297
+ "reward": 1.025781273841858,
298
+ "reward_std": 3.5701520442962646,
299
+ "rewards/alfworld_rollout_reward_func/mean": 1.0257811546325684,
300
+ "rewards/alfworld_rollout_reward_func/std": 3.894923686981201,
301
+ "sampling/importance_sampling_ratio/max": 2.700753688812256,
302
+ "sampling/importance_sampling_ratio/mean": 0.6189267635345459,
303
+ "sampling/importance_sampling_ratio/min": 0.0,
304
+ "sampling/sampling_logp_difference/max": 3.1386852264404297,
305
+ "sampling/sampling_logp_difference/mean": 0.0270779300481081,
306
+ "step": 9,
307
+ "step_time": 308.85608878800485
308
+ },
309
+ {
310
+ "clip_ratio/high_max": 0.0,
311
+ "clip_ratio/high_mean": 0.0,
312
+ "clip_ratio/low_mean": 0.0,
313
+ "clip_ratio/low_min": 0.0,
314
+ "clip_ratio/region_mean": 0.0,
315
+ "completions/clipped_ratio": 0.0546875,
316
+ "completions/max_length": 512.0,
317
+ "completions/max_terminated_length": 432.0,
318
+ "completions/mean_length": 150.1875,
319
+ "completions/mean_terminated_length": 129.25619506835938,
320
+ "completions/min_length": 62.0,
321
+ "completions/min_terminated_length": 62.0,
322
+ "entropy": 0.5325295478105545,
323
+ "epoch": 0.16129032258064516,
324
+ "frac_reward_zero_std": 0.0,
325
+ "grad_norm": 0.2502661645412445,
326
+ "kl": 0.03952266392298043,
327
+ "learning_rate": 2.1112086488446085e-05,
328
+ "loss": 0.029,
329
+ "num_tokens": 1000430.0,
330
+ "reward": 0.8335937261581421,
331
+ "reward_std": 3.1759183406829834,
332
+ "rewards/alfworld_rollout_reward_func/mean": 0.8335937261581421,
333
+ "rewards/alfworld_rollout_reward_func/std": 3.7348177433013916,
334
+ "sampling/importance_sampling_ratio/max": 2.995654344558716,
335
+ "sampling/importance_sampling_ratio/mean": 0.7903203964233398,
336
+ "sampling/importance_sampling_ratio/min": 0.0,
337
+ "sampling/sampling_logp_difference/max": 2.775932788848877,
338
+ "sampling/sampling_logp_difference/mean": 0.03168691694736481,
339
+ "step": 10,
340
+ "step_time": 285.58172405699224
341
+ },
342
+ {
343
+ "clip_ratio/high_max": 0.0,
344
+ "clip_ratio/high_mean": 0.0,
345
+ "clip_ratio/low_mean": 0.0,
346
+ "clip_ratio/low_min": 0.0,
347
+ "clip_ratio/region_mean": 0.0,
348
+ "completions/clipped_ratio": 0.0859375,
349
+ "completions/max_length": 512.0,
350
+ "completions/max_terminated_length": 325.0,
351
+ "completions/mean_length": 156.7734375,
352
+ "completions/mean_terminated_length": 123.3760757446289,
353
+ "completions/min_length": 64.0,
354
+ "completions/min_terminated_length": 64.0,
355
+ "entropy": 0.5142566300928593,
356
+ "epoch": 0.1774193548387097,
357
+ "frac_reward_zero_std": 0.0,
358
+ "grad_norm": 0.18933017551898956,
359
+ "kl": 0.04493421735242009,
360
+ "learning_rate": 2.0151324781845787e-05,
361
+ "loss": 0.0496,
362
+ "num_tokens": 1082621.0,
363
+ "reward": 0.9093749523162842,
364
+ "reward_std": 3.467135429382324,
365
+ "rewards/alfworld_rollout_reward_func/mean": 0.909375011920929,
366
+ "rewards/alfworld_rollout_reward_func/std": 3.8583264350891113,
367
+ "sampling/importance_sampling_ratio/max": 2.9975228309631348,
368
+ "sampling/importance_sampling_ratio/mean": 0.8283847570419312,
369
+ "sampling/importance_sampling_ratio/min": 0.0,
370
+ "sampling/sampling_logp_difference/max": 2.3669824600219727,
371
+ "sampling/sampling_logp_difference/mean": 0.028965603560209274,
372
+ "step": 11,
373
+ "step_time": 300.83864827199795
374
+ },
375
+ {
376
+ "clip_ratio/high_max": 0.0,
377
+ "clip_ratio/high_mean": 0.0,
378
+ "clip_ratio/low_mean": 0.0,
379
+ "clip_ratio/low_min": 0.0,
380
+ "clip_ratio/region_mean": 0.0,
381
+ "completions/clipped_ratio": 0.046875,
382
+ "completions/max_length": 512.0,
383
+ "completions/max_terminated_length": 343.0,
384
+ "completions/mean_length": 140.28125,
385
+ "completions/mean_terminated_length": 121.99999237060547,
386
+ "completions/min_length": 51.0,
387
+ "completions/min_terminated_length": 51.0,
388
+ "entropy": 0.519548587501049,
389
+ "epoch": 0.1935483870967742,
390
+ "frac_reward_zero_std": 0.0,
391
+ "grad_norm": 0.2622426152229309,
392
+ "kl": 0.05217514652758837,
393
+ "learning_rate": 1.911205012908703e-05,
394
+ "loss": -0.0594,
395
+ "num_tokens": 1162516.0,
396
+ "reward": 0.8148437738418579,
397
+ "reward_std": 3.1582674980163574,
398
+ "rewards/alfworld_rollout_reward_func/mean": 0.8148437142372131,
399
+ "rewards/alfworld_rollout_reward_func/std": 3.7535512447357178,
400
+ "sampling/importance_sampling_ratio/max": 2.8224730491638184,
401
+ "sampling/importance_sampling_ratio/mean": 0.7709423899650574,
402
+ "sampling/importance_sampling_ratio/min": 0.0,
403
+ "sampling/sampling_logp_difference/max": 2.520348072052002,
404
+ "sampling/sampling_logp_difference/mean": 0.031547725200653076,
405
+ "step": 12,
406
+ "step_time": 287.3358050610004
407
+ },
408
+ {
409
+ "clip_ratio/high_max": 0.0,
410
+ "clip_ratio/high_mean": 0.0,
411
+ "clip_ratio/low_mean": 0.0,
412
+ "clip_ratio/low_min": 0.0,
413
+ "clip_ratio/region_mean": 0.0,
414
+ "completions/clipped_ratio": 0.0703125,
415
+ "completions/max_length": 512.0,
416
+ "completions/max_terminated_length": 256.0,
417
+ "completions/mean_length": 151.046875,
418
+ "completions/mean_terminated_length": 123.7479019165039,
419
+ "completions/min_length": 60.0,
420
+ "completions/min_terminated_length": 60.0,
421
+ "entropy": 0.5261144414544106,
422
+ "epoch": 0.20967741935483872,
423
+ "frac_reward_zero_std": 0.0,
424
+ "grad_norm": 0.1863553375005722,
425
+ "kl": 0.06452470645308495,
426
+ "learning_rate": 1.800492689447043e-05,
427
+ "loss": -0.0566,
428
+ "num_tokens": 1243615.0,
429
+ "reward": 0.8781249523162842,
430
+ "reward_std": 3.3634252548217773,
431
+ "rewards/alfworld_rollout_reward_func/mean": 0.8781249523162842,
432
+ "rewards/alfworld_rollout_reward_func/std": 3.842200994491577,
433
+ "sampling/importance_sampling_ratio/max": 2.4663472175598145,
434
+ "sampling/importance_sampling_ratio/mean": 0.7910538911819458,
435
+ "sampling/importance_sampling_ratio/min": 0.0,
436
+ "sampling/sampling_logp_difference/max": 2.4453465938568115,
437
+ "sampling/sampling_logp_difference/mean": 0.02982635796070099,
438
+ "step": 13,
439
+ "step_time": 293.8646844759969
440
+ },
441
+ {
442
+ "clip_ratio/high_max": 0.0,
443
+ "clip_ratio/high_mean": 0.0,
444
+ "clip_ratio/low_mean": 0.0,
445
+ "clip_ratio/low_min": 0.0,
446
+ "clip_ratio/region_mean": 0.0,
447
+ "completions/clipped_ratio": 0.0078125,
448
+ "completions/max_length": 512.0,
449
+ "completions/max_terminated_length": 380.0,
450
+ "completions/mean_length": 129.8671875,
451
+ "completions/mean_terminated_length": 126.85826873779297,
452
+ "completions/min_length": 56.0,
453
+ "completions/min_terminated_length": 56.0,
454
+ "entropy": 0.5344252400100231,
455
+ "epoch": 0.22580645161290322,
456
+ "frac_reward_zero_std": 0.0,
457
+ "grad_norm": 0.24170275032520294,
458
+ "kl": 0.06391164055094123,
459
+ "learning_rate": 1.6841315660560252e-05,
460
+ "loss": -0.1138,
461
+ "num_tokens": 1321112.0,
462
+ "reward": 0.9546874761581421,
463
+ "reward_std": 3.586996078491211,
464
+ "rewards/alfworld_rollout_reward_func/mean": 0.9546874761581421,
465
+ "rewards/alfworld_rollout_reward_func/std": 3.9051523208618164,
466
+ "sampling/importance_sampling_ratio/max": 2.8537843227386475,
467
+ "sampling/importance_sampling_ratio/mean": 0.7751458883285522,
468
+ "sampling/importance_sampling_ratio/min": 0.0,
469
+ "sampling/sampling_logp_difference/max": 1.7649002075195312,
470
+ "sampling/sampling_logp_difference/mean": 0.03092074766755104,
471
+ "step": 14,
472
+ "step_time": 283.45238025300205
473
+ },
474
+ {
475
+ "clip_ratio/high_max": 0.0,
476
+ "clip_ratio/high_mean": 0.0,
477
+ "clip_ratio/low_mean": 0.0,
478
+ "clip_ratio/low_min": 0.0,
479
+ "clip_ratio/region_mean": 0.0,
480
+ "completions/clipped_ratio": 0.0546875,
481
+ "completions/max_length": 512.0,
482
+ "completions/max_terminated_length": 457.0,
483
+ "completions/mean_length": 149.8515625,
484
+ "completions/mean_terminated_length": 128.90081787109375,
485
+ "completions/min_length": 70.0,
486
+ "completions/min_terminated_length": 70.0,
487
+ "entropy": 0.5125062428414822,
488
+ "epoch": 0.24193548387096775,
489
+ "frac_reward_zero_std": 0.0,
490
+ "grad_norm": 0.16989152133464813,
491
+ "kl": 0.053708435501903296,
492
+ "learning_rate": 1.563315665323401e-05,
493
+ "loss": 0.0051,
494
+ "num_tokens": 1403786.0,
495
+ "reward": 0.34921878576278687,
496
+ "reward_std": 2.4881625175476074,
497
+ "rewards/alfworld_rollout_reward_func/mean": 0.3492187559604645,
498
+ "rewards/alfworld_rollout_reward_func/std": 3.1670050621032715,
499
+ "sampling/importance_sampling_ratio/max": 2.4405319690704346,
500
+ "sampling/importance_sampling_ratio/mean": 0.7845873832702637,
501
+ "sampling/importance_sampling_ratio/min": 0.0,
502
+ "sampling/sampling_logp_difference/max": 1.7177300453186035,
503
+ "sampling/sampling_logp_difference/mean": 0.029662001878023148,
504
+ "step": 15,
505
+ "step_time": 308.28231060100006
506
+ },
507
+ {
508
+ "clip_ratio/high_max": 0.0,
509
+ "clip_ratio/high_mean": 0.0,
510
+ "clip_ratio/low_mean": 0.0,
511
+ "clip_ratio/low_min": 0.0,
512
+ "clip_ratio/region_mean": 0.0,
513
+ "completions/clipped_ratio": 0.0703125,
514
+ "completions/max_length": 512.0,
515
+ "completions/max_terminated_length": 422.0,
516
+ "completions/mean_length": 156.3359375,
517
+ "completions/mean_terminated_length": 129.43698120117188,
518
+ "completions/min_length": 71.0,
519
+ "completions/min_terminated_length": 71.0,
520
+ "entropy": 0.5052059814333916,
521
+ "epoch": 0.25806451612903225,
522
+ "frac_reward_zero_std": 0.0,
523
+ "grad_norm": 0.18680106103420258,
524
+ "kl": 0.052992088720202446,
525
+ "learning_rate": 1.439284721880721e-05,
526
+ "loss": -0.0061,
527
+ "num_tokens": 1485910.0,
528
+ "reward": 1.005468726158142,
529
+ "reward_std": 2.9901723861694336,
530
+ "rewards/alfworld_rollout_reward_func/mean": 1.005468726158142,
531
+ "rewards/alfworld_rollout_reward_func/std": 3.869877815246582,
532
+ "sampling/importance_sampling_ratio/max": 2.6479995250701904,
533
+ "sampling/importance_sampling_ratio/mean": 0.8335222005844116,
534
+ "sampling/importance_sampling_ratio/min": 0.0,
535
+ "sampling/sampling_logp_difference/max": 2.2812881469726562,
536
+ "sampling/sampling_logp_difference/mean": 0.02822798863053322,
537
+ "step": 16,
538
+ "step_time": 288.45321577499635
539
+ },
540
+ {
541
+ "clip_ratio/high_max": 0.0,
542
+ "clip_ratio/high_mean": 0.0,
543
+ "clip_ratio/low_mean": 0.0,
544
+ "clip_ratio/low_min": 0.0,
545
+ "clip_ratio/region_mean": 0.0,
546
+ "completions/clipped_ratio": 0.0625,
547
+ "completions/max_length": 512.0,
548
+ "completions/max_terminated_length": 436.0,
549
+ "completions/mean_length": 155.78125,
550
+ "completions/mean_terminated_length": 132.03334045410156,
551
+ "completions/min_length": 62.0,
552
+ "completions/min_terminated_length": 62.0,
553
+ "entropy": 0.5273009687662125,
554
+ "epoch": 0.27419354838709675,
555
+ "frac_reward_zero_std": 0.0,
556
+ "grad_norm": 0.17423537373542786,
557
+ "kl": 0.07150922995060682,
558
+ "learning_rate": 1.3133114610483909e-05,
559
+ "loss": -0.0034,
560
+ "num_tokens": 1567778.0,
561
+ "reward": 0.25390625,
562
+ "reward_std": 2.4427285194396973,
563
+ "rewards/alfworld_rollout_reward_func/mean": 0.2539062201976776,
564
+ "rewards/alfworld_rollout_reward_func/std": 3.2506790161132812,
565
+ "sampling/importance_sampling_ratio/max": 2.4126062393188477,
566
+ "sampling/importance_sampling_ratio/mean": 0.8037413954734802,
567
+ "sampling/importance_sampling_ratio/min": 0.0,
568
+ "sampling/sampling_logp_difference/max": 2.5358574390411377,
569
+ "sampling/sampling_logp_difference/mean": 0.029498092830181122,
570
+ "step": 17,
571
+ "step_time": 281.3368725319997
572
+ },
573
+ {
574
+ "clip_ratio/high_max": 0.0,
575
+ "clip_ratio/high_mean": 0.0,
576
+ "clip_ratio/low_mean": 0.0,
577
+ "clip_ratio/low_min": 0.0,
578
+ "clip_ratio/region_mean": 0.0,
579
+ "completions/clipped_ratio": 0.1484375,
580
+ "completions/max_length": 512.0,
581
+ "completions/max_terminated_length": 477.0,
582
+ "completions/mean_length": 186.5,
583
+ "completions/mean_terminated_length": 129.76145935058594,
584
+ "completions/min_length": 73.0,
585
+ "completions/min_terminated_length": 73.0,
586
+ "entropy": 0.47341491281986237,
587
+ "epoch": 0.2903225806451613,
588
+ "frac_reward_zero_std": 0.0,
589
+ "grad_norm": 0.1518625020980835,
590
+ "kl": 0.049451613100245595,
591
+ "learning_rate": 1.1866885389516092e-05,
592
+ "loss": 0.0897,
593
+ "num_tokens": 1656705.0,
594
+ "reward": 0.80859375,
595
+ "reward_std": 3.4441990852355957,
596
+ "rewards/alfworld_rollout_reward_func/mean": 0.80859375,
597
+ "rewards/alfworld_rollout_reward_func/std": 3.8008499145507812,
598
+ "sampling/importance_sampling_ratio/max": 2.9426238536834717,
599
+ "sampling/importance_sampling_ratio/mean": 0.7826383113861084,
600
+ "sampling/importance_sampling_ratio/min": 0.0,
601
+ "sampling/sampling_logp_difference/max": 3.3363547325134277,
602
+ "sampling/sampling_logp_difference/mean": 0.027312669903039932,
603
+ "step": 18,
604
+ "step_time": 334.05939280299935
605
+ },
606
+ {
607
+ "clip_ratio/high_max": 0.0,
608
+ "clip_ratio/high_mean": 0.0,
609
+ "clip_ratio/low_mean": 0.0,
610
+ "clip_ratio/low_min": 0.0,
611
+ "clip_ratio/region_mean": 0.0,
612
+ "completions/clipped_ratio": 0.1015625,
613
+ "completions/max_length": 512.0,
614
+ "completions/max_terminated_length": 499.0,
615
+ "completions/mean_length": 166.0859375,
616
+ "completions/mean_terminated_length": 126.98260498046875,
617
+ "completions/min_length": 68.0,
618
+ "completions/min_terminated_length": 68.0,
619
+ "entropy": 0.47634539753198624,
620
+ "epoch": 0.3064516129032258,
621
+ "frac_reward_zero_std": 0.0,
622
+ "grad_norm": 0.2188945859670639,
623
+ "kl": 0.0531598380766809,
624
+ "learning_rate": 1.0607152781192796e-05,
625
+ "loss": 0.0078,
626
+ "num_tokens": 1741466.0,
627
+ "reward": 0.6265625357627869,
628
+ "reward_std": 2.6468114852905273,
629
+ "rewards/alfworld_rollout_reward_func/mean": 0.6265624761581421,
630
+ "rewards/alfworld_rollout_reward_func/std": 3.4735782146453857,
631
+ "sampling/importance_sampling_ratio/max": 2.799043655395508,
632
+ "sampling/importance_sampling_ratio/mean": 0.8041483163833618,
633
+ "sampling/importance_sampling_ratio/min": 0.0,
634
+ "sampling/sampling_logp_difference/max": 2.4288792610168457,
635
+ "sampling/sampling_logp_difference/mean": 0.02712005004286766,
636
+ "step": 19,
637
+ "step_time": 303.70944481100014
638
+ },
639
+ {
640
+ "clip_ratio/high_max": 0.0,
641
+ "clip_ratio/high_mean": 0.0,
642
+ "clip_ratio/low_mean": 0.0,
643
+ "clip_ratio/low_min": 0.0,
644
+ "clip_ratio/region_mean": 0.0,
645
+ "completions/clipped_ratio": 0.0859375,
646
+ "completions/max_length": 512.0,
647
+ "completions/max_terminated_length": 403.0,
648
+ "completions/mean_length": 160.9375,
649
+ "completions/mean_terminated_length": 127.93163299560547,
650
+ "completions/min_length": 76.0,
651
+ "completions/min_terminated_length": 76.0,
652
+ "entropy": 0.46492085233330727,
653
+ "epoch": 0.3225806451612903,
654
+ "frac_reward_zero_std": 0.0,
655
+ "grad_norm": 0.7110471725463867,
656
+ "kl": 0.09847234468907118,
657
+ "learning_rate": 9.366843346765992e-06,
658
+ "loss": 0.0238,
659
+ "num_tokens": 1823852.0,
660
+ "reward": 0.819531261920929,
661
+ "reward_std": 3.2860288619995117,
662
+ "rewards/alfworld_rollout_reward_func/mean": 0.8195312023162842,
663
+ "rewards/alfworld_rollout_reward_func/std": 3.780327320098877,
664
+ "sampling/importance_sampling_ratio/max": 2.7687222957611084,
665
+ "sampling/importance_sampling_ratio/mean": 0.7973555326461792,
666
+ "sampling/importance_sampling_ratio/min": 0.0,
667
+ "sampling/sampling_logp_difference/max": 20.83700942993164,
668
+ "sampling/sampling_logp_difference/mean": 0.027603479102253914,
669
+ "step": 20,
670
+ "step_time": 296.20082658499814
671
+ }
672
+ ],
673
+ "logging_steps": 1,
674
+ "max_steps": 32,
675
+ "num_input_tokens_seen": 1823852,
676
+ "num_train_epochs": 1,
677
+ "save_steps": 20,
678
+ "stateful_callbacks": {
679
+ "TrainerControl": {
680
+ "args": {
681
+ "should_epoch_stop": false,
682
+ "should_evaluate": false,
683
+ "should_log": false,
684
+ "should_save": true,
685
+ "should_training_stop": false
686
+ },
687
+ "attributes": {}
688
+ }
689
+ },
690
+ "total_flos": 0.0,
691
+ "train_batch_size": 8,
692
+ "trial_name": null,
693
+ "trial_params": null
694
+ }
checkpoint-20/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe31dee94209c1973566ddbbc45e4eaa5758a9402713eebce4f579426cfdc6a3
3
+ size 9361
checkpoint-20/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-32/README.md ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: /cache/models/Qwen--Qwen2.5-3B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - axolotl
7
+ - base_model:adapter:/cache/models/Qwen--Qwen2.5-3B-Instruct
8
+ - grpo
9
+ - lora
10
+ - transformers
11
+ - trl
12
+ ---
13
+
14
+ # Model Card for Model ID
15
+
16
+ <!-- Provide a quick summary of what the model is/does. -->
17
+
18
+
19
+
20
+ ## Model Details
21
+
22
+ ### Model Description
23
+
24
+ <!-- Provide a longer summary of what this model is. -->
25
+
26
+
27
+
28
+ - **Developed by:** [More Information Needed]
29
+ - **Funded by [optional]:** [More Information Needed]
30
+ - **Shared by [optional]:** [More Information Needed]
31
+ - **Model type:** [More Information Needed]
32
+ - **Language(s) (NLP):** [More Information Needed]
33
+ - **License:** [More Information Needed]
34
+ - **Finetuned from model [optional]:** [More Information Needed]
35
+
36
+ ### Model Sources [optional]
37
+
38
+ <!-- Provide the basic links for the model. -->
39
+
40
+ - **Repository:** [More Information Needed]
41
+ - **Paper [optional]:** [More Information Needed]
42
+ - **Demo [optional]:** [More Information Needed]
43
+
44
+ ## Uses
45
+
46
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
47
+
48
+ ### Direct Use
49
+
50
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
51
+
52
+ [More Information Needed]
53
+
54
+ ### Downstream Use [optional]
55
+
56
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
57
+
58
+ [More Information Needed]
59
+
60
+ ### Out-of-Scope Use
61
+
62
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
63
+
64
+ [More Information Needed]
65
+
66
+ ## Bias, Risks, and Limitations
67
+
68
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
69
+
70
+ [More Information Needed]
71
+
72
+ ### Recommendations
73
+
74
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
75
+
76
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
77
+
78
+ ## How to Get Started with the Model
79
+
80
+ Use the code below to get started with the model.
81
+
82
+ [More Information Needed]
83
+
84
+ ## Training Details
85
+
86
+ ### Training Data
87
+
88
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
89
+
90
+ [More Information Needed]
91
+
92
+ ### Training Procedure
93
+
94
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
95
+
96
+ #### Preprocessing [optional]
97
+
98
+ [More Information Needed]
99
+
100
+
101
+ #### Training Hyperparameters
102
+
103
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
104
+
105
+ #### Speeds, Sizes, Times [optional]
106
+
107
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
108
+
109
+ [More Information Needed]
110
+
111
+ ## Evaluation
112
+
113
+ <!-- This section describes the evaluation protocols and provides the results. -->
114
+
115
+ ### Testing Data, Factors & Metrics
116
+
117
+ #### Testing Data
118
+
119
+ <!-- This should link to a Dataset Card if possible. -->
120
+
121
+ [More Information Needed]
122
+
123
+ #### Factors
124
+
125
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
126
+
127
+ [More Information Needed]
128
+
129
+ #### Metrics
130
+
131
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
132
+
133
+ [More Information Needed]
134
+
135
+ ### Results
136
+
137
+ [More Information Needed]
138
+
139
+ #### Summary
140
+
141
+
142
+
143
+ ## Model Examination [optional]
144
+
145
+ <!-- Relevant interpretability work for the model goes here -->
146
+
147
+ [More Information Needed]
148
+
149
+ ## Environmental Impact
150
+
151
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
152
+
153
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
154
+
155
+ - **Hardware Type:** [More Information Needed]
156
+ - **Hours used:** [More Information Needed]
157
+ - **Cloud Provider:** [More Information Needed]
158
+ - **Compute Region:** [More Information Needed]
159
+ - **Carbon Emitted:** [More Information Needed]
160
+
161
+ ## Technical Specifications [optional]
162
+
163
+ ### Model Architecture and Objective
164
+
165
+ [More Information Needed]
166
+
167
+ ### Compute Infrastructure
168
+
169
+ [More Information Needed]
170
+
171
+ #### Hardware
172
+
173
+ [More Information Needed]
174
+
175
+ #### Software
176
+
177
+ [More Information Needed]
178
+
179
+ ## Citation [optional]
180
+
181
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
182
+
183
+ **BibTeX:**
184
+
185
+ [More Information Needed]
186
+
187
+ **APA:**
188
+
189
+ [More Information Needed]
190
+
191
+ ## Glossary [optional]
192
+
193
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
194
+
195
+ [More Information Needed]
196
+
197
+ ## More Information [optional]
198
+
199
+ [More Information Needed]
200
+
201
+ ## Model Card Authors [optional]
202
+
203
+ [More Information Needed]
204
+
205
+ ## Model Card Contact
206
+
207
+ [More Information Needed]
208
+ ### Framework versions
209
+
210
+ - PEFT 0.18.1
checkpoint-32/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "/cache/models/Qwen--Qwen2.5-3B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": null,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.0,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 16,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "gate_proj",
33
+ "k_proj",
34
+ "up_proj",
35
+ "q_proj",
36
+ "o_proj",
37
+ "down_proj",
38
+ "v_proj"
39
+ ],
40
+ "target_parameters": [],
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
checkpoint-32/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b878ff20d084d009a6f57bea4813e4afefd51a1703b7783226841a37ec00fedb
3
+ size 119801528
checkpoint-32/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
checkpoint-32/chat_template.jinja ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>
2
+
3
+ '+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>
4
+
5
+ ' }}{% endif %}
checkpoint-32/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-32/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eae68cc173b6507d8f069518b00419e9fe3a5d0ee5b2ff7e1205996be8c53bd7
3
+ size 61392581
checkpoint-32/ref/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "/cache/models/Qwen--Qwen2.5-3B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": null,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.0,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 16,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "gate_proj",
33
+ "k_proj",
34
+ "up_proj",
35
+ "q_proj",
36
+ "o_proj",
37
+ "down_proj",
38
+ "v_proj"
39
+ ],
40
+ "target_parameters": [],
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
checkpoint-32/ref/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a73e8a3357e88549443ed67169c44587d8849b718265a36909b7373da2322a00
3
+ size 59934640
checkpoint-32/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9d3bb4ce845169fd3ce72597fab37a23453e900b23b17ba40fcfd1fa88b6e6f
3
+ size 14917
checkpoint-32/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8a00654d154682a9b49ef12ed206fff9ae0b9894c482c27008bf4244f61914c
3
+ size 14981
checkpoint-32/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48ea6f0f38f84d67d36c14ee5cfbd976fee0d136a7e48f9dfe00eb968cff425e
3
+ size 1465
checkpoint-32/special_tokens_map.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "bos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "eos_token": {
25
+ "content": "<|im_end|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ },
31
+ "pad_token": {
32
+ "content": "<|endoftext|>",
33
+ "lstrip": false,
34
+ "normalized": false,
35
+ "rstrip": false,
36
+ "single_word": false
37
+ }
38
+ }
checkpoint-32/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
checkpoint-32/tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": "<|im_end|>",
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "split_special_tokens": false,
205
+ "tokenizer_class": "Qwen2Tokenizer",
206
+ "unk_token": null
207
+ }
checkpoint-32/trainer_state.json ADDED
@@ -0,0 +1,1090 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.5161290322580645,
6
+ "eval_steps": 500,
7
+ "global_step": 32,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "clip_ratio/high_max": 0.0,
14
+ "clip_ratio/high_mean": 0.0,
15
+ "clip_ratio/low_mean": 0.0,
16
+ "clip_ratio/low_min": 0.0,
17
+ "clip_ratio/region_mean": 0.0,
18
+ "completions/clipped_ratio": 0.53125,
19
+ "completions/max_length": 512.0,
20
+ "completions/max_terminated_length": 487.0,
21
+ "completions/mean_length": 370.3359375,
22
+ "completions/mean_terminated_length": 209.78334045410156,
23
+ "completions/min_length": 66.0,
24
+ "completions/min_terminated_length": 66.0,
25
+ "entropy": 0.40961652249097824,
26
+ "epoch": 0.016129032258064516,
27
+ "frac_reward_zero_std": 0.0,
28
+ "grad_norm": 0.1428968608379364,
29
+ "kl": 0.0,
30
+ "learning_rate": 0.0,
31
+ "loss": -0.0891,
32
+ "num_tokens": 110863.0,
33
+ "reward": 0.09531250596046448,
34
+ "reward_std": 2.4042975902557373,
35
+ "rewards/alfworld_rollout_reward_func/mean": 0.09531247615814209,
36
+ "rewards/alfworld_rollout_reward_func/std": 2.9919276237487793,
37
+ "sampling/importance_sampling_ratio/max": 2.9990427494049072,
38
+ "sampling/importance_sampling_ratio/mean": 0.6262807846069336,
39
+ "sampling/importance_sampling_ratio/min": 0.0,
40
+ "sampling/sampling_logp_difference/max": 4.450258255004883,
41
+ "sampling/sampling_logp_difference/mean": 0.024438712745904922,
42
+ "step": 1,
43
+ "step_time": 309.93943907799985
44
+ },
45
+ {
46
+ "clip_ratio/high_max": 0.0,
47
+ "clip_ratio/high_mean": 0.0,
48
+ "clip_ratio/low_mean": 0.0,
49
+ "clip_ratio/low_min": 0.0,
50
+ "clip_ratio/region_mean": 0.0,
51
+ "completions/clipped_ratio": 0.5703125,
52
+ "completions/max_length": 512.0,
53
+ "completions/max_terminated_length": 509.0,
54
+ "completions/mean_length": 380.1015625,
55
+ "completions/mean_terminated_length": 205.03636169433594,
56
+ "completions/min_length": 79.0,
57
+ "completions/min_terminated_length": 79.0,
58
+ "entropy": 0.40012454986572266,
59
+ "epoch": 0.03225806451612903,
60
+ "frac_reward_zero_std": 0.0,
61
+ "grad_norm": 0.12177681177854538,
62
+ "kl": 0.0,
63
+ "learning_rate": 2.5e-05,
64
+ "loss": -0.085,
65
+ "num_tokens": 222814.0,
66
+ "reward": 0.5320311784744263,
67
+ "reward_std": 2.642284393310547,
68
+ "rewards/alfworld_rollout_reward_func/mean": 0.5320311784744263,
69
+ "rewards/alfworld_rollout_reward_func/std": 3.4178764820098877,
70
+ "sampling/importance_sampling_ratio/max": 2.909560441970825,
71
+ "sampling/importance_sampling_ratio/mean": 0.5676747560501099,
72
+ "sampling/importance_sampling_ratio/min": 0.0,
73
+ "sampling/sampling_logp_difference/max": 4.949212551116943,
74
+ "sampling/sampling_logp_difference/mean": 0.023133086040616035,
75
+ "step": 2,
76
+ "step_time": 315.77915475400005
77
+ },
78
+ {
79
+ "clip_ratio/high_max": 0.0,
80
+ "clip_ratio/high_mean": 0.0,
81
+ "clip_ratio/low_mean": 0.0,
82
+ "clip_ratio/low_min": 0.0,
83
+ "clip_ratio/region_mean": 0.0,
84
+ "completions/clipped_ratio": 0.6015625,
85
+ "completions/max_length": 512.0,
86
+ "completions/max_terminated_length": 497.0,
87
+ "completions/mean_length": 378.34375,
88
+ "completions/mean_terminated_length": 176.5490264892578,
89
+ "completions/min_length": 76.0,
90
+ "completions/min_terminated_length": 76.0,
91
+ "entropy": 0.4211720749735832,
92
+ "epoch": 0.04838709677419355,
93
+ "frac_reward_zero_std": 0.0,
94
+ "grad_norm": 0.13358080387115479,
95
+ "kl": 0.00272539074649103,
96
+ "learning_rate": 2.493586654239869e-05,
97
+ "loss": -0.0053,
98
+ "num_tokens": 332775.0,
99
+ "reward": 0.925000011920929,
100
+ "reward_std": 3.4297051429748535,
101
+ "rewards/alfworld_rollout_reward_func/mean": 0.925000011920929,
102
+ "rewards/alfworld_rollout_reward_func/std": 3.9396629333496094,
103
+ "sampling/importance_sampling_ratio/max": 2.511305809020996,
104
+ "sampling/importance_sampling_ratio/mean": 0.5851866602897644,
105
+ "sampling/importance_sampling_ratio/min": 0.0,
106
+ "sampling/sampling_logp_difference/max": 3.5749268531799316,
107
+ "sampling/sampling_logp_difference/mean": 0.023533061146736145,
108
+ "step": 3,
109
+ "step_time": 296.23798338899906
110
+ },
111
+ {
112
+ "clip_ratio/high_max": 0.0,
113
+ "clip_ratio/high_mean": 0.0,
114
+ "clip_ratio/low_mean": 0.0,
115
+ "clip_ratio/low_min": 0.0,
116
+ "clip_ratio/region_mean": 0.0,
117
+ "completions/clipped_ratio": 0.53125,
118
+ "completions/max_length": 512.0,
119
+ "completions/max_terminated_length": 511.0,
120
+ "completions/mean_length": 357.84375,
121
+ "completions/mean_terminated_length": 183.1333465576172,
122
+ "completions/min_length": 66.0,
123
+ "completions/min_terminated_length": 66.0,
124
+ "entropy": 0.3783623240888119,
125
+ "epoch": 0.06451612903225806,
126
+ "frac_reward_zero_std": 0.0,
127
+ "grad_norm": 0.15261100232601166,
128
+ "kl": 0.002557590647484176,
129
+ "learning_rate": 2.474412426565618e-05,
130
+ "loss": -0.0004,
131
+ "num_tokens": 441773.0,
132
+ "reward": 0.45625001192092896,
133
+ "reward_std": 2.879180908203125,
134
+ "rewards/alfworld_rollout_reward_func/mean": 0.45625001192092896,
135
+ "rewards/alfworld_rollout_reward_func/std": 3.427947759628296,
136
+ "sampling/importance_sampling_ratio/max": 2.91353440284729,
137
+ "sampling/importance_sampling_ratio/mean": 0.6235978603363037,
138
+ "sampling/importance_sampling_ratio/min": 0.0,
139
+ "sampling/sampling_logp_difference/max": 4.593559741973877,
140
+ "sampling/sampling_logp_difference/mean": 0.023225955665111542,
141
+ "step": 4,
142
+ "step_time": 322.2789937999987
143
+ },
144
+ {
145
+ "clip_ratio/high_max": 0.0,
146
+ "clip_ratio/high_mean": 0.0,
147
+ "clip_ratio/low_mean": 0.0,
148
+ "clip_ratio/low_min": 0.0,
149
+ "clip_ratio/region_mean": 0.0,
150
+ "completions/clipped_ratio": 0.3984375,
151
+ "completions/max_length": 512.0,
152
+ "completions/max_terminated_length": 504.0,
153
+ "completions/mean_length": 298.53125,
154
+ "completions/mean_terminated_length": 157.14285278320312,
155
+ "completions/min_length": 64.0,
156
+ "completions/min_terminated_length": 64.0,
157
+ "entropy": 0.4225323870778084,
158
+ "epoch": 0.08064516129032258,
159
+ "frac_reward_zero_std": 0.0,
160
+ "grad_norm": 0.11338824778795242,
161
+ "kl": 0.003911561769200489,
162
+ "learning_rate": 2.442674070500061e-05,
163
+ "loss": 0.0261,
164
+ "num_tokens": 540709.0,
165
+ "reward": 0.48515623807907104,
166
+ "reward_std": 2.928551197052002,
167
+ "rewards/alfworld_rollout_reward_func/mean": 0.48515623807907104,
168
+ "rewards/alfworld_rollout_reward_func/std": 3.3146796226501465,
169
+ "sampling/importance_sampling_ratio/max": 2.970187187194824,
170
+ "sampling/importance_sampling_ratio/mean": 0.5831338763237,
171
+ "sampling/importance_sampling_ratio/min": 0.0,
172
+ "sampling/sampling_logp_difference/max": 6.362844944000244,
173
+ "sampling/sampling_logp_difference/mean": 0.024303022772073746,
174
+ "step": 5,
175
+ "step_time": 281.38914586900137
176
+ },
177
+ {
178
+ "clip_ratio/high_max": 0.0,
179
+ "clip_ratio/high_mean": 0.0,
180
+ "clip_ratio/low_mean": 0.0,
181
+ "clip_ratio/low_min": 0.0,
182
+ "clip_ratio/region_mean": 0.0,
183
+ "completions/clipped_ratio": 0.34375,
184
+ "completions/max_length": 512.0,
185
+ "completions/max_terminated_length": 476.0,
186
+ "completions/mean_length": 280.25,
187
+ "completions/mean_terminated_length": 158.85714721679688,
188
+ "completions/min_length": 79.0,
189
+ "completions/min_terminated_length": 79.0,
190
+ "entropy": 0.4307485744357109,
191
+ "epoch": 0.0967741935483871,
192
+ "frac_reward_zero_std": 0.0,
193
+ "grad_norm": 0.840685248374939,
194
+ "kl": 0.035365400719456375,
195
+ "learning_rate": 2.3986972645252883e-05,
196
+ "loss": 0.0634,
197
+ "num_tokens": 641237.0,
198
+ "reward": 1.1078124046325684,
199
+ "reward_std": 3.4873318672180176,
200
+ "rewards/alfworld_rollout_reward_func/mean": 1.107812523841858,
201
+ "rewards/alfworld_rollout_reward_func/std": 3.958895206451416,
202
+ "sampling/importance_sampling_ratio/max": 2.7673635482788086,
203
+ "sampling/importance_sampling_ratio/mean": 0.561335563659668,
204
+ "sampling/importance_sampling_ratio/min": 0.0,
205
+ "sampling/sampling_logp_difference/max": 11.066271781921387,
206
+ "sampling/sampling_logp_difference/mean": 0.02636803314089775,
207
+ "step": 6,
208
+ "step_time": 306.572960538002
209
+ },
210
+ {
211
+ "clip_ratio/high_max": 0.0,
212
+ "clip_ratio/high_mean": 0.0,
213
+ "clip_ratio/low_mean": 0.0,
214
+ "clip_ratio/low_min": 0.0,
215
+ "clip_ratio/region_mean": 0.0,
216
+ "completions/clipped_ratio": 0.28125,
217
+ "completions/max_length": 512.0,
218
+ "completions/max_terminated_length": 474.0,
219
+ "completions/mean_length": 248.328125,
220
+ "completions/mean_terminated_length": 145.1521759033203,
221
+ "completions/min_length": 61.0,
222
+ "completions/min_terminated_length": 61.0,
223
+ "entropy": 0.4486994035542011,
224
+ "epoch": 0.11290322580645161,
225
+ "frac_reward_zero_std": 0.0,
226
+ "grad_norm": 0.13363927602767944,
227
+ "kl": 0.010801714262925088,
228
+ "learning_rate": 2.342933270180728e-05,
229
+ "loss": 0.1147,
230
+ "num_tokens": 734359.0,
231
+ "reward": 0.7398437261581421,
232
+ "reward_std": 3.073120594024658,
233
+ "rewards/alfworld_rollout_reward_func/mean": 0.7398437261581421,
234
+ "rewards/alfworld_rollout_reward_func/std": 3.5458602905273438,
235
+ "sampling/importance_sampling_ratio/max": 2.4107882976531982,
236
+ "sampling/importance_sampling_ratio/mean": 0.5770797729492188,
237
+ "sampling/importance_sampling_ratio/min": 0.0,
238
+ "sampling/sampling_logp_difference/max": 7.637388229370117,
239
+ "sampling/sampling_logp_difference/mean": 0.027358299121260643,
240
+ "step": 7,
241
+ "step_time": 291.4309037350031
242
+ },
243
+ {
244
+ "clip_ratio/high_max": 0.0,
245
+ "clip_ratio/high_mean": 0.0,
246
+ "clip_ratio/low_mean": 0.0,
247
+ "clip_ratio/low_min": 0.0,
248
+ "clip_ratio/region_mean": 0.0,
249
+ "completions/clipped_ratio": 0.2109375,
250
+ "completions/max_length": 512.0,
251
+ "completions/max_terminated_length": 487.0,
252
+ "completions/mean_length": 224.53125,
253
+ "completions/mean_terminated_length": 147.68316650390625,
254
+ "completions/min_length": 71.0,
255
+ "completions/min_terminated_length": 71.0,
256
+ "entropy": 0.4458822049200535,
257
+ "epoch": 0.12903225806451613,
258
+ "frac_reward_zero_std": 0.0,
259
+ "grad_norm": 0.1380084902048111,
260
+ "kl": 0.018724333960562944,
261
+ "learning_rate": 2.2759543015090955e-05,
262
+ "loss": 0.0322,
263
+ "num_tokens": 827827.0,
264
+ "reward": 0.8648437261581421,
265
+ "reward_std": 3.5357110500335693,
266
+ "rewards/alfworld_rollout_reward_func/mean": 0.8648437857627869,
267
+ "rewards/alfworld_rollout_reward_func/std": 3.7438578605651855,
268
+ "sampling/importance_sampling_ratio/max": 2.939615249633789,
269
+ "sampling/importance_sampling_ratio/mean": 0.5691805481910706,
270
+ "sampling/importance_sampling_ratio/min": 0.0,
271
+ "sampling/sampling_logp_difference/max": 5.754482269287109,
272
+ "sampling/sampling_logp_difference/mean": 0.028573205694556236,
273
+ "step": 8,
274
+ "step_time": 305.87279431799834
275
+ },
276
+ {
277
+ "clip_ratio/high_max": 0.0,
278
+ "clip_ratio/high_mean": 0.0,
279
+ "clip_ratio/low_mean": 0.0,
280
+ "clip_ratio/low_min": 0.0,
281
+ "clip_ratio/region_mean": 0.0,
282
+ "completions/clipped_ratio": 0.1875,
283
+ "completions/max_length": 512.0,
284
+ "completions/max_terminated_length": 504.0,
285
+ "completions/mean_length": 218.171875,
286
+ "completions/mean_terminated_length": 150.36538696289062,
287
+ "completions/min_length": 68.0,
288
+ "completions/min_terminated_length": 68.0,
289
+ "entropy": 0.4581918604671955,
290
+ "epoch": 0.14516129032258066,
291
+ "frac_reward_zero_std": 0.0,
292
+ "grad_norm": 0.11444258689880371,
293
+ "kl": 0.02050452691037208,
294
+ "learning_rate": 2.1984476533659888e-05,
295
+ "loss": 0.0214,
296
+ "num_tokens": 918938.0,
297
+ "reward": 1.025781273841858,
298
+ "reward_std": 3.5701520442962646,
299
+ "rewards/alfworld_rollout_reward_func/mean": 1.0257811546325684,
300
+ "rewards/alfworld_rollout_reward_func/std": 3.894923686981201,
301
+ "sampling/importance_sampling_ratio/max": 2.700753688812256,
302
+ "sampling/importance_sampling_ratio/mean": 0.6189267635345459,
303
+ "sampling/importance_sampling_ratio/min": 0.0,
304
+ "sampling/sampling_logp_difference/max": 3.1386852264404297,
305
+ "sampling/sampling_logp_difference/mean": 0.0270779300481081,
306
+ "step": 9,
307
+ "step_time": 308.85608878800485
308
+ },
309
+ {
310
+ "clip_ratio/high_max": 0.0,
311
+ "clip_ratio/high_mean": 0.0,
312
+ "clip_ratio/low_mean": 0.0,
313
+ "clip_ratio/low_min": 0.0,
314
+ "clip_ratio/region_mean": 0.0,
315
+ "completions/clipped_ratio": 0.0546875,
316
+ "completions/max_length": 512.0,
317
+ "completions/max_terminated_length": 432.0,
318
+ "completions/mean_length": 150.1875,
319
+ "completions/mean_terminated_length": 129.25619506835938,
320
+ "completions/min_length": 62.0,
321
+ "completions/min_terminated_length": 62.0,
322
+ "entropy": 0.5325295478105545,
323
+ "epoch": 0.16129032258064516,
324
+ "frac_reward_zero_std": 0.0,
325
+ "grad_norm": 0.2502661645412445,
326
+ "kl": 0.03952266392298043,
327
+ "learning_rate": 2.1112086488446085e-05,
328
+ "loss": 0.029,
329
+ "num_tokens": 1000430.0,
330
+ "reward": 0.8335937261581421,
331
+ "reward_std": 3.1759183406829834,
332
+ "rewards/alfworld_rollout_reward_func/mean": 0.8335937261581421,
333
+ "rewards/alfworld_rollout_reward_func/std": 3.7348177433013916,
334
+ "sampling/importance_sampling_ratio/max": 2.995654344558716,
335
+ "sampling/importance_sampling_ratio/mean": 0.7903203964233398,
336
+ "sampling/importance_sampling_ratio/min": 0.0,
337
+ "sampling/sampling_logp_difference/max": 2.775932788848877,
338
+ "sampling/sampling_logp_difference/mean": 0.03168691694736481,
339
+ "step": 10,
340
+ "step_time": 285.58172405699224
341
+ },
342
+ {
343
+ "clip_ratio/high_max": 0.0,
344
+ "clip_ratio/high_mean": 0.0,
345
+ "clip_ratio/low_mean": 0.0,
346
+ "clip_ratio/low_min": 0.0,
347
+ "clip_ratio/region_mean": 0.0,
348
+ "completions/clipped_ratio": 0.0859375,
349
+ "completions/max_length": 512.0,
350
+ "completions/max_terminated_length": 325.0,
351
+ "completions/mean_length": 156.7734375,
352
+ "completions/mean_terminated_length": 123.3760757446289,
353
+ "completions/min_length": 64.0,
354
+ "completions/min_terminated_length": 64.0,
355
+ "entropy": 0.5142566300928593,
356
+ "epoch": 0.1774193548387097,
357
+ "frac_reward_zero_std": 0.0,
358
+ "grad_norm": 0.18933017551898956,
359
+ "kl": 0.04493421735242009,
360
+ "learning_rate": 2.0151324781845787e-05,
361
+ "loss": 0.0496,
362
+ "num_tokens": 1082621.0,
363
+ "reward": 0.9093749523162842,
364
+ "reward_std": 3.467135429382324,
365
+ "rewards/alfworld_rollout_reward_func/mean": 0.909375011920929,
366
+ "rewards/alfworld_rollout_reward_func/std": 3.8583264350891113,
367
+ "sampling/importance_sampling_ratio/max": 2.9975228309631348,
368
+ "sampling/importance_sampling_ratio/mean": 0.8283847570419312,
369
+ "sampling/importance_sampling_ratio/min": 0.0,
370
+ "sampling/sampling_logp_difference/max": 2.3669824600219727,
371
+ "sampling/sampling_logp_difference/mean": 0.028965603560209274,
372
+ "step": 11,
373
+ "step_time": 300.83864827199795
374
+ },
375
+ {
376
+ "clip_ratio/high_max": 0.0,
377
+ "clip_ratio/high_mean": 0.0,
378
+ "clip_ratio/low_mean": 0.0,
379
+ "clip_ratio/low_min": 0.0,
380
+ "clip_ratio/region_mean": 0.0,
381
+ "completions/clipped_ratio": 0.046875,
382
+ "completions/max_length": 512.0,
383
+ "completions/max_terminated_length": 343.0,
384
+ "completions/mean_length": 140.28125,
385
+ "completions/mean_terminated_length": 121.99999237060547,
386
+ "completions/min_length": 51.0,
387
+ "completions/min_terminated_length": 51.0,
388
+ "entropy": 0.519548587501049,
389
+ "epoch": 0.1935483870967742,
390
+ "frac_reward_zero_std": 0.0,
391
+ "grad_norm": 0.2622426152229309,
392
+ "kl": 0.05217514652758837,
393
+ "learning_rate": 1.911205012908703e-05,
394
+ "loss": -0.0594,
395
+ "num_tokens": 1162516.0,
396
+ "reward": 0.8148437738418579,
397
+ "reward_std": 3.1582674980163574,
398
+ "rewards/alfworld_rollout_reward_func/mean": 0.8148437142372131,
399
+ "rewards/alfworld_rollout_reward_func/std": 3.7535512447357178,
400
+ "sampling/importance_sampling_ratio/max": 2.8224730491638184,
401
+ "sampling/importance_sampling_ratio/mean": 0.7709423899650574,
402
+ "sampling/importance_sampling_ratio/min": 0.0,
403
+ "sampling/sampling_logp_difference/max": 2.520348072052002,
404
+ "sampling/sampling_logp_difference/mean": 0.031547725200653076,
405
+ "step": 12,
406
+ "step_time": 287.3358050610004
407
+ },
408
+ {
409
+ "clip_ratio/high_max": 0.0,
410
+ "clip_ratio/high_mean": 0.0,
411
+ "clip_ratio/low_mean": 0.0,
412
+ "clip_ratio/low_min": 0.0,
413
+ "clip_ratio/region_mean": 0.0,
414
+ "completions/clipped_ratio": 0.0703125,
415
+ "completions/max_length": 512.0,
416
+ "completions/max_terminated_length": 256.0,
417
+ "completions/mean_length": 151.046875,
418
+ "completions/mean_terminated_length": 123.7479019165039,
419
+ "completions/min_length": 60.0,
420
+ "completions/min_terminated_length": 60.0,
421
+ "entropy": 0.5261144414544106,
422
+ "epoch": 0.20967741935483872,
423
+ "frac_reward_zero_std": 0.0,
424
+ "grad_norm": 0.1863553375005722,
425
+ "kl": 0.06452470645308495,
426
+ "learning_rate": 1.800492689447043e-05,
427
+ "loss": -0.0566,
428
+ "num_tokens": 1243615.0,
429
+ "reward": 0.8781249523162842,
430
+ "reward_std": 3.3634252548217773,
431
+ "rewards/alfworld_rollout_reward_func/mean": 0.8781249523162842,
432
+ "rewards/alfworld_rollout_reward_func/std": 3.842200994491577,
433
+ "sampling/importance_sampling_ratio/max": 2.4663472175598145,
434
+ "sampling/importance_sampling_ratio/mean": 0.7910538911819458,
435
+ "sampling/importance_sampling_ratio/min": 0.0,
436
+ "sampling/sampling_logp_difference/max": 2.4453465938568115,
437
+ "sampling/sampling_logp_difference/mean": 0.02982635796070099,
438
+ "step": 13,
439
+ "step_time": 293.8646844759969
440
+ },
441
+ {
442
+ "clip_ratio/high_max": 0.0,
443
+ "clip_ratio/high_mean": 0.0,
444
+ "clip_ratio/low_mean": 0.0,
445
+ "clip_ratio/low_min": 0.0,
446
+ "clip_ratio/region_mean": 0.0,
447
+ "completions/clipped_ratio": 0.0078125,
448
+ "completions/max_length": 512.0,
449
+ "completions/max_terminated_length": 380.0,
450
+ "completions/mean_length": 129.8671875,
451
+ "completions/mean_terminated_length": 126.85826873779297,
452
+ "completions/min_length": 56.0,
453
+ "completions/min_terminated_length": 56.0,
454
+ "entropy": 0.5344252400100231,
455
+ "epoch": 0.22580645161290322,
456
+ "frac_reward_zero_std": 0.0,
457
+ "grad_norm": 0.24170275032520294,
458
+ "kl": 0.06391164055094123,
459
+ "learning_rate": 1.6841315660560252e-05,
460
+ "loss": -0.1138,
461
+ "num_tokens": 1321112.0,
462
+ "reward": 0.9546874761581421,
463
+ "reward_std": 3.586996078491211,
464
+ "rewards/alfworld_rollout_reward_func/mean": 0.9546874761581421,
465
+ "rewards/alfworld_rollout_reward_func/std": 3.9051523208618164,
466
+ "sampling/importance_sampling_ratio/max": 2.8537843227386475,
467
+ "sampling/importance_sampling_ratio/mean": 0.7751458883285522,
468
+ "sampling/importance_sampling_ratio/min": 0.0,
469
+ "sampling/sampling_logp_difference/max": 1.7649002075195312,
470
+ "sampling/sampling_logp_difference/mean": 0.03092074766755104,
471
+ "step": 14,
472
+ "step_time": 283.45238025300205
473
+ },
474
+ {
475
+ "clip_ratio/high_max": 0.0,
476
+ "clip_ratio/high_mean": 0.0,
477
+ "clip_ratio/low_mean": 0.0,
478
+ "clip_ratio/low_min": 0.0,
479
+ "clip_ratio/region_mean": 0.0,
480
+ "completions/clipped_ratio": 0.0546875,
481
+ "completions/max_length": 512.0,
482
+ "completions/max_terminated_length": 457.0,
483
+ "completions/mean_length": 149.8515625,
484
+ "completions/mean_terminated_length": 128.90081787109375,
485
+ "completions/min_length": 70.0,
486
+ "completions/min_terminated_length": 70.0,
487
+ "entropy": 0.5125062428414822,
488
+ "epoch": 0.24193548387096775,
489
+ "frac_reward_zero_std": 0.0,
490
+ "grad_norm": 0.16989152133464813,
491
+ "kl": 0.053708435501903296,
492
+ "learning_rate": 1.563315665323401e-05,
493
+ "loss": 0.0051,
494
+ "num_tokens": 1403786.0,
495
+ "reward": 0.34921878576278687,
496
+ "reward_std": 2.4881625175476074,
497
+ "rewards/alfworld_rollout_reward_func/mean": 0.3492187559604645,
498
+ "rewards/alfworld_rollout_reward_func/std": 3.1670050621032715,
499
+ "sampling/importance_sampling_ratio/max": 2.4405319690704346,
500
+ "sampling/importance_sampling_ratio/mean": 0.7845873832702637,
501
+ "sampling/importance_sampling_ratio/min": 0.0,
502
+ "sampling/sampling_logp_difference/max": 1.7177300453186035,
503
+ "sampling/sampling_logp_difference/mean": 0.029662001878023148,
504
+ "step": 15,
505
+ "step_time": 308.28231060100006
506
+ },
507
+ {
508
+ "clip_ratio/high_max": 0.0,
509
+ "clip_ratio/high_mean": 0.0,
510
+ "clip_ratio/low_mean": 0.0,
511
+ "clip_ratio/low_min": 0.0,
512
+ "clip_ratio/region_mean": 0.0,
513
+ "completions/clipped_ratio": 0.0703125,
514
+ "completions/max_length": 512.0,
515
+ "completions/max_terminated_length": 422.0,
516
+ "completions/mean_length": 156.3359375,
517
+ "completions/mean_terminated_length": 129.43698120117188,
518
+ "completions/min_length": 71.0,
519
+ "completions/min_terminated_length": 71.0,
520
+ "entropy": 0.5052059814333916,
521
+ "epoch": 0.25806451612903225,
522
+ "frac_reward_zero_std": 0.0,
523
+ "grad_norm": 0.18680106103420258,
524
+ "kl": 0.052992088720202446,
525
+ "learning_rate": 1.439284721880721e-05,
526
+ "loss": -0.0061,
527
+ "num_tokens": 1485910.0,
528
+ "reward": 1.005468726158142,
529
+ "reward_std": 2.9901723861694336,
530
+ "rewards/alfworld_rollout_reward_func/mean": 1.005468726158142,
531
+ "rewards/alfworld_rollout_reward_func/std": 3.869877815246582,
532
+ "sampling/importance_sampling_ratio/max": 2.6479995250701904,
533
+ "sampling/importance_sampling_ratio/mean": 0.8335222005844116,
534
+ "sampling/importance_sampling_ratio/min": 0.0,
535
+ "sampling/sampling_logp_difference/max": 2.2812881469726562,
536
+ "sampling/sampling_logp_difference/mean": 0.02822798863053322,
537
+ "step": 16,
538
+ "step_time": 288.45321577499635
539
+ },
540
+ {
541
+ "clip_ratio/high_max": 0.0,
542
+ "clip_ratio/high_mean": 0.0,
543
+ "clip_ratio/low_mean": 0.0,
544
+ "clip_ratio/low_min": 0.0,
545
+ "clip_ratio/region_mean": 0.0,
546
+ "completions/clipped_ratio": 0.0625,
547
+ "completions/max_length": 512.0,
548
+ "completions/max_terminated_length": 436.0,
549
+ "completions/mean_length": 155.78125,
550
+ "completions/mean_terminated_length": 132.03334045410156,
551
+ "completions/min_length": 62.0,
552
+ "completions/min_terminated_length": 62.0,
553
+ "entropy": 0.5273009687662125,
554
+ "epoch": 0.27419354838709675,
555
+ "frac_reward_zero_std": 0.0,
556
+ "grad_norm": 0.17423537373542786,
557
+ "kl": 0.07150922995060682,
558
+ "learning_rate": 1.3133114610483909e-05,
559
+ "loss": -0.0034,
560
+ "num_tokens": 1567778.0,
561
+ "reward": 0.25390625,
562
+ "reward_std": 2.4427285194396973,
563
+ "rewards/alfworld_rollout_reward_func/mean": 0.2539062201976776,
564
+ "rewards/alfworld_rollout_reward_func/std": 3.2506790161132812,
565
+ "sampling/importance_sampling_ratio/max": 2.4126062393188477,
566
+ "sampling/importance_sampling_ratio/mean": 0.8037413954734802,
567
+ "sampling/importance_sampling_ratio/min": 0.0,
568
+ "sampling/sampling_logp_difference/max": 2.5358574390411377,
569
+ "sampling/sampling_logp_difference/mean": 0.029498092830181122,
570
+ "step": 17,
571
+ "step_time": 281.3368725319997
572
+ },
573
+ {
574
+ "clip_ratio/high_max": 0.0,
575
+ "clip_ratio/high_mean": 0.0,
576
+ "clip_ratio/low_mean": 0.0,
577
+ "clip_ratio/low_min": 0.0,
578
+ "clip_ratio/region_mean": 0.0,
579
+ "completions/clipped_ratio": 0.1484375,
580
+ "completions/max_length": 512.0,
581
+ "completions/max_terminated_length": 477.0,
582
+ "completions/mean_length": 186.5,
583
+ "completions/mean_terminated_length": 129.76145935058594,
584
+ "completions/min_length": 73.0,
585
+ "completions/min_terminated_length": 73.0,
586
+ "entropy": 0.47341491281986237,
587
+ "epoch": 0.2903225806451613,
588
+ "frac_reward_zero_std": 0.0,
589
+ "grad_norm": 0.1518625020980835,
590
+ "kl": 0.049451613100245595,
591
+ "learning_rate": 1.1866885389516092e-05,
592
+ "loss": 0.0897,
593
+ "num_tokens": 1656705.0,
594
+ "reward": 0.80859375,
595
+ "reward_std": 3.4441990852355957,
596
+ "rewards/alfworld_rollout_reward_func/mean": 0.80859375,
597
+ "rewards/alfworld_rollout_reward_func/std": 3.8008499145507812,
598
+ "sampling/importance_sampling_ratio/max": 2.9426238536834717,
599
+ "sampling/importance_sampling_ratio/mean": 0.7826383113861084,
600
+ "sampling/importance_sampling_ratio/min": 0.0,
601
+ "sampling/sampling_logp_difference/max": 3.3363547325134277,
602
+ "sampling/sampling_logp_difference/mean": 0.027312669903039932,
603
+ "step": 18,
604
+ "step_time": 334.05939280299935
605
+ },
606
+ {
607
+ "clip_ratio/high_max": 0.0,
608
+ "clip_ratio/high_mean": 0.0,
609
+ "clip_ratio/low_mean": 0.0,
610
+ "clip_ratio/low_min": 0.0,
611
+ "clip_ratio/region_mean": 0.0,
612
+ "completions/clipped_ratio": 0.1015625,
613
+ "completions/max_length": 512.0,
614
+ "completions/max_terminated_length": 499.0,
615
+ "completions/mean_length": 166.0859375,
616
+ "completions/mean_terminated_length": 126.98260498046875,
617
+ "completions/min_length": 68.0,
618
+ "completions/min_terminated_length": 68.0,
619
+ "entropy": 0.47634539753198624,
620
+ "epoch": 0.3064516129032258,
621
+ "frac_reward_zero_std": 0.0,
622
+ "grad_norm": 0.2188945859670639,
623
+ "kl": 0.0531598380766809,
624
+ "learning_rate": 1.0607152781192796e-05,
625
+ "loss": 0.0078,
626
+ "num_tokens": 1741466.0,
627
+ "reward": 0.6265625357627869,
628
+ "reward_std": 2.6468114852905273,
629
+ "rewards/alfworld_rollout_reward_func/mean": 0.6265624761581421,
630
+ "rewards/alfworld_rollout_reward_func/std": 3.4735782146453857,
631
+ "sampling/importance_sampling_ratio/max": 2.799043655395508,
632
+ "sampling/importance_sampling_ratio/mean": 0.8041483163833618,
633
+ "sampling/importance_sampling_ratio/min": 0.0,
634
+ "sampling/sampling_logp_difference/max": 2.4288792610168457,
635
+ "sampling/sampling_logp_difference/mean": 0.02712005004286766,
636
+ "step": 19,
637
+ "step_time": 303.70944481100014
638
+ },
639
+ {
640
+ "clip_ratio/high_max": 0.0,
641
+ "clip_ratio/high_mean": 0.0,
642
+ "clip_ratio/low_mean": 0.0,
643
+ "clip_ratio/low_min": 0.0,
644
+ "clip_ratio/region_mean": 0.0,
645
+ "completions/clipped_ratio": 0.0859375,
646
+ "completions/max_length": 512.0,
647
+ "completions/max_terminated_length": 403.0,
648
+ "completions/mean_length": 160.9375,
649
+ "completions/mean_terminated_length": 127.93163299560547,
650
+ "completions/min_length": 76.0,
651
+ "completions/min_terminated_length": 76.0,
652
+ "entropy": 0.46492085233330727,
653
+ "epoch": 0.3225806451612903,
654
+ "frac_reward_zero_std": 0.0,
655
+ "grad_norm": 0.7110471725463867,
656
+ "kl": 0.09847234468907118,
657
+ "learning_rate": 9.366843346765992e-06,
658
+ "loss": 0.0238,
659
+ "num_tokens": 1823852.0,
660
+ "reward": 0.819531261920929,
661
+ "reward_std": 3.2860288619995117,
662
+ "rewards/alfworld_rollout_reward_func/mean": 0.8195312023162842,
663
+ "rewards/alfworld_rollout_reward_func/std": 3.780327320098877,
664
+ "sampling/importance_sampling_ratio/max": 2.7687222957611084,
665
+ "sampling/importance_sampling_ratio/mean": 0.7973555326461792,
666
+ "sampling/importance_sampling_ratio/min": 0.0,
667
+ "sampling/sampling_logp_difference/max": 20.83700942993164,
668
+ "sampling/sampling_logp_difference/mean": 0.027603479102253914,
669
+ "step": 20,
670
+ "step_time": 296.20082658499814
671
+ },
672
+ {
673
+ "clip_ratio/high_max": 0.0,
674
+ "clip_ratio/high_mean": 0.0,
675
+ "clip_ratio/low_mean": 0.0,
676
+ "clip_ratio/low_min": 0.0,
677
+ "clip_ratio/region_mean": 0.0,
678
+ "completions/clipped_ratio": 0.078125,
679
+ "completions/max_length": 512.0,
680
+ "completions/max_terminated_length": 356.0,
681
+ "completions/mean_length": 154.0625,
682
+ "completions/mean_terminated_length": 123.72881317138672,
683
+ "completions/min_length": 58.0,
684
+ "completions/min_terminated_length": 58.0,
685
+ "entropy": 0.4891073890030384,
686
+ "epoch": 0.3387096774193548,
687
+ "frac_reward_zero_std": 0.0,
688
+ "grad_norm": 0.3204660415649414,
689
+ "kl": 0.06161752389743924,
690
+ "learning_rate": 8.158684339439748e-06,
691
+ "loss": 0.1854,
692
+ "num_tokens": 1905839.0,
693
+ "reward": 0.66796875,
694
+ "reward_std": 2.7234749794006348,
695
+ "rewards/alfworld_rollout_reward_func/mean": 0.66796875,
696
+ "rewards/alfworld_rollout_reward_func/std": 3.580017566680908,
697
+ "sampling/importance_sampling_ratio/max": 2.9959678649902344,
698
+ "sampling/importance_sampling_ratio/mean": 0.9344255328178406,
699
+ "sampling/importance_sampling_ratio/min": 0.0,
700
+ "sampling/sampling_logp_difference/max": 2.3660998344421387,
701
+ "sampling/sampling_logp_difference/mean": 0.029398081824183464,
702
+ "step": 21,
703
+ "step_time": 279.7738521129977
704
+ },
705
+ {
706
+ "clip_ratio/high_max": 0.0,
707
+ "clip_ratio/high_mean": 0.0,
708
+ "clip_ratio/low_mean": 0.0,
709
+ "clip_ratio/low_min": 0.0,
710
+ "clip_ratio/region_mean": 0.0,
711
+ "completions/clipped_ratio": 0.171875,
712
+ "completions/max_length": 512.0,
713
+ "completions/max_terminated_length": 446.0,
714
+ "completions/mean_length": 197.1015625,
715
+ "completions/mean_terminated_length": 131.7452850341797,
716
+ "completions/min_length": 65.0,
717
+ "completions/min_terminated_length": 65.0,
718
+ "entropy": 0.4758484289050102,
719
+ "epoch": 0.3548387096774194,
720
+ "frac_reward_zero_std": 0.0,
721
+ "grad_norm": 0.16225755214691162,
722
+ "kl": 0.054194105323404074,
723
+ "learning_rate": 6.99507310552957e-06,
724
+ "loss": 0.0319,
725
+ "num_tokens": 1991294.0,
726
+ "reward": 0.69921875,
727
+ "reward_std": 3.503169298171997,
728
+ "rewards/alfworld_rollout_reward_func/mean": 0.69921875,
729
+ "rewards/alfworld_rollout_reward_func/std": 3.726823329925537,
730
+ "sampling/importance_sampling_ratio/max": 2.5170819759368896,
731
+ "sampling/importance_sampling_ratio/mean": 0.777429461479187,
732
+ "sampling/importance_sampling_ratio/min": 0.0,
733
+ "sampling/sampling_logp_difference/max": 2.413573741912842,
734
+ "sampling/sampling_logp_difference/mean": 0.027101196348667145,
735
+ "step": 22,
736
+ "step_time": 285.0048562720003
737
+ },
738
+ {
739
+ "clip_ratio/high_max": 0.0,
740
+ "clip_ratio/high_mean": 0.0,
741
+ "clip_ratio/low_mean": 0.0,
742
+ "clip_ratio/low_min": 0.0,
743
+ "clip_ratio/region_mean": 0.0,
744
+ "completions/clipped_ratio": 0.1171875,
745
+ "completions/max_length": 512.0,
746
+ "completions/max_terminated_length": 471.0,
747
+ "completions/mean_length": 169.8359375,
748
+ "completions/mean_terminated_length": 124.41593170166016,
749
+ "completions/min_length": 57.0,
750
+ "completions/min_terminated_length": 57.0,
751
+ "entropy": 0.5033188574016094,
752
+ "epoch": 0.3709677419354839,
753
+ "frac_reward_zero_std": 0.0,
754
+ "grad_norm": 0.18970800936222076,
755
+ "kl": 0.04351181210950017,
756
+ "learning_rate": 5.8879498709129735e-06,
757
+ "loss": -0.0057,
758
+ "num_tokens": 2075803.0,
759
+ "reward": 0.8039063215255737,
760
+ "reward_std": 3.560253620147705,
761
+ "rewards/alfworld_rollout_reward_func/mean": 0.8039062023162842,
762
+ "rewards/alfworld_rollout_reward_func/std": 3.759028911590576,
763
+ "sampling/importance_sampling_ratio/max": 2.6937098503112793,
764
+ "sampling/importance_sampling_ratio/mean": 0.7721619606018066,
765
+ "sampling/importance_sampling_ratio/min": 0.0,
766
+ "sampling/sampling_logp_difference/max": 3.6162972450256348,
767
+ "sampling/sampling_logp_difference/mean": 0.02900775894522667,
768
+ "step": 23,
769
+ "step_time": 298.5376572299974
770
+ },
771
+ {
772
+ "clip_ratio/high_max": 0.0,
773
+ "clip_ratio/high_mean": 0.0,
774
+ "clip_ratio/low_mean": 0.0,
775
+ "clip_ratio/low_min": 0.0,
776
+ "clip_ratio/region_mean": 0.0,
777
+ "completions/clipped_ratio": 0.1484375,
778
+ "completions/max_length": 512.0,
779
+ "completions/max_terminated_length": 451.0,
780
+ "completions/mean_length": 186.1640625,
781
+ "completions/mean_terminated_length": 129.36697387695312,
782
+ "completions/min_length": 60.0,
783
+ "completions/min_terminated_length": 60.0,
784
+ "entropy": 0.4917047880589962,
785
+ "epoch": 0.3870967741935484,
786
+ "frac_reward_zero_std": 0.0,
787
+ "grad_norm": 0.18956582248210907,
788
+ "kl": 0.04409886850044131,
789
+ "learning_rate": 4.848675218154214e-06,
790
+ "loss": -0.0152,
791
+ "num_tokens": 2160777.0,
792
+ "reward": 0.675000011920929,
793
+ "reward_std": 2.953065872192383,
794
+ "rewards/alfworld_rollout_reward_func/mean": 0.675000011920929,
795
+ "rewards/alfworld_rollout_reward_func/std": 3.6408443450927734,
796
+ "sampling/importance_sampling_ratio/max": 2.5711066722869873,
797
+ "sampling/importance_sampling_ratio/mean": 0.6986918449401855,
798
+ "sampling/importance_sampling_ratio/min": 0.0,
799
+ "sampling/sampling_logp_difference/max": 3.6658549308776855,
800
+ "sampling/sampling_logp_difference/mean": 0.02768370695412159,
801
+ "step": 24,
802
+ "step_time": 286.79983711899695
803
+ },
804
+ {
805
+ "clip_ratio/high_max": 0.0,
806
+ "clip_ratio/high_mean": 0.0,
807
+ "clip_ratio/low_mean": 0.0,
808
+ "clip_ratio/low_min": 0.0,
809
+ "clip_ratio/region_mean": 0.0,
810
+ "completions/clipped_ratio": 0.140625,
811
+ "completions/max_length": 512.0,
812
+ "completions/max_terminated_length": 436.0,
813
+ "completions/mean_length": 176.5703125,
814
+ "completions/mean_terminated_length": 121.68181610107422,
815
+ "completions/min_length": 54.0,
816
+ "completions/min_terminated_length": 54.0,
817
+ "entropy": 0.4625844620168209,
818
+ "epoch": 0.4032258064516129,
819
+ "frac_reward_zero_std": 0.0,
820
+ "grad_norm": 0.1729527860879898,
821
+ "kl": 0.046662998385727406,
822
+ "learning_rate": 3.887913511553917e-06,
823
+ "loss": 0.0258,
824
+ "num_tokens": 2247234.0,
825
+ "reward": 0.8726562261581421,
826
+ "reward_std": 3.0254807472229004,
827
+ "rewards/alfworld_rollout_reward_func/mean": 0.8726562261581421,
828
+ "rewards/alfworld_rollout_reward_func/std": 3.7554731369018555,
829
+ "sampling/importance_sampling_ratio/max": 2.884495496749878,
830
+ "sampling/importance_sampling_ratio/mean": 0.7411025762557983,
831
+ "sampling/importance_sampling_ratio/min": 0.0,
832
+ "sampling/sampling_logp_difference/max": 3.431777000427246,
833
+ "sampling/sampling_logp_difference/mean": 0.02642218954861164,
834
+ "step": 25,
835
+ "step_time": 331.329842252002
836
+ },
837
+ {
838
+ "clip_ratio/high_max": 0.0,
839
+ "clip_ratio/high_mean": 0.0,
840
+ "clip_ratio/low_mean": 0.0,
841
+ "clip_ratio/low_min": 0.0,
842
+ "clip_ratio/region_mean": 0.0,
843
+ "completions/clipped_ratio": 0.1953125,
844
+ "completions/max_length": 512.0,
845
+ "completions/max_terminated_length": 437.0,
846
+ "completions/mean_length": 203.78125,
847
+ "completions/mean_terminated_length": 128.9708709716797,
848
+ "completions/min_length": 68.0,
849
+ "completions/min_terminated_length": 68.0,
850
+ "entropy": 0.4392157681286335,
851
+ "epoch": 0.41935483870967744,
852
+ "frac_reward_zero_std": 0.0,
853
+ "grad_norm": 0.18243148922920227,
854
+ "kl": 0.03993735695257783,
855
+ "learning_rate": 3.0155234663401146e-06,
856
+ "loss": 0.1456,
857
+ "num_tokens": 2332769.0,
858
+ "reward": 2.0328125953674316,
859
+ "reward_std": 4.510900020599365,
860
+ "rewards/alfworld_rollout_reward_func/mean": 2.0328123569488525,
861
+ "rewards/alfworld_rollout_reward_func/std": 4.624057769775391,
862
+ "sampling/importance_sampling_ratio/max": 2.9809186458587646,
863
+ "sampling/importance_sampling_ratio/mean": 0.8058295845985413,
864
+ "sampling/importance_sampling_ratio/min": 0.0,
865
+ "sampling/sampling_logp_difference/max": 3.739076852798462,
866
+ "sampling/sampling_logp_difference/mean": 0.025428924709558487,
867
+ "step": 26,
868
+ "step_time": 262.9066606970009
869
+ },
870
+ {
871
+ "clip_ratio/high_max": 0.0,
872
+ "clip_ratio/high_mean": 0.0,
873
+ "clip_ratio/low_mean": 0.0,
874
+ "clip_ratio/low_min": 0.0,
875
+ "clip_ratio/region_mean": 0.0,
876
+ "completions/clipped_ratio": 0.125,
877
+ "completions/max_length": 512.0,
878
+ "completions/max_terminated_length": 387.0,
879
+ "completions/mean_length": 173.7890625,
880
+ "completions/mean_terminated_length": 125.47322082519531,
881
+ "completions/min_length": 58.0,
882
+ "completions/min_terminated_length": 58.0,
883
+ "entropy": 0.46685169637203217,
884
+ "epoch": 0.43548387096774194,
885
+ "frac_reward_zero_std": 0.0,
886
+ "grad_norm": 0.17903129756450653,
887
+ "kl": 0.046850728802382946,
888
+ "learning_rate": 2.240456984909049e-06,
889
+ "loss": 0.0513,
890
+ "num_tokens": 2415919.0,
891
+ "reward": 1.5,
892
+ "reward_std": 4.049795627593994,
893
+ "rewards/alfworld_rollout_reward_func/mean": 1.5,
894
+ "rewards/alfworld_rollout_reward_func/std": 4.301400661468506,
895
+ "sampling/importance_sampling_ratio/max": 2.8721108436584473,
896
+ "sampling/importance_sampling_ratio/mean": 0.813679575920105,
897
+ "sampling/importance_sampling_ratio/min": 0.0,
898
+ "sampling/sampling_logp_difference/max": 3.3308191299438477,
899
+ "sampling/sampling_logp_difference/mean": 0.027277380228042603,
900
+ "step": 27,
901
+ "step_time": 287.4597008519995
902
+ },
903
+ {
904
+ "clip_ratio/high_max": 0.0,
905
+ "clip_ratio/high_mean": 0.0,
906
+ "clip_ratio/low_mean": 0.0,
907
+ "clip_ratio/low_min": 0.0,
908
+ "clip_ratio/region_mean": 0.0,
909
+ "completions/clipped_ratio": 0.1640625,
910
+ "completions/max_length": 512.0,
911
+ "completions/max_terminated_length": 253.0,
912
+ "completions/mean_length": 182.4140625,
913
+ "completions/mean_terminated_length": 117.72896575927734,
914
+ "completions/min_length": 67.0,
915
+ "completions/min_terminated_length": 67.0,
916
+ "entropy": 0.4677879251539707,
917
+ "epoch": 0.45161290322580644,
918
+ "frac_reward_zero_std": 0.0,
919
+ "grad_norm": 0.13941031694412231,
920
+ "kl": 0.042057349579408765,
921
+ "learning_rate": 1.570667298192724e-06,
922
+ "loss": 0.0048,
923
+ "num_tokens": 2500100.0,
924
+ "reward": 0.5835937261581421,
925
+ "reward_std": 2.6255595684051514,
926
+ "rewards/alfworld_rollout_reward_func/mean": 0.5835937261581421,
927
+ "rewards/alfworld_rollout_reward_func/std": 3.450979232788086,
928
+ "sampling/importance_sampling_ratio/max": 2.854583501815796,
929
+ "sampling/importance_sampling_ratio/mean": 0.7393490076065063,
930
+ "sampling/importance_sampling_ratio/min": 0.0,
931
+ "sampling/sampling_logp_difference/max": 5.095232963562012,
932
+ "sampling/sampling_logp_difference/mean": 0.027033191174268723,
933
+ "step": 28,
934
+ "step_time": 285.79294812399894
935
+ },
936
+ {
937
+ "clip_ratio/high_max": 0.0,
938
+ "clip_ratio/high_mean": 0.0,
939
+ "clip_ratio/low_mean": 0.0,
940
+ "clip_ratio/low_min": 0.0,
941
+ "clip_ratio/region_mean": 0.0,
942
+ "completions/clipped_ratio": 0.2109375,
943
+ "completions/max_length": 512.0,
944
+ "completions/max_terminated_length": 479.0,
945
+ "completions/mean_length": 212.203125,
946
+ "completions/mean_terminated_length": 132.0594024658203,
947
+ "completions/min_length": 58.0,
948
+ "completions/min_terminated_length": 58.0,
949
+ "entropy": 0.4555159918963909,
950
+ "epoch": 0.46774193548387094,
951
+ "frac_reward_zero_std": 0.0,
952
+ "grad_norm": 0.16522692143917084,
953
+ "kl": 0.04728960315696895,
954
+ "learning_rate": 1.0130273547471176e-06,
955
+ "loss": 0.0531,
956
+ "num_tokens": 2589663.0,
957
+ "reward": 0.828906238079071,
958
+ "reward_std": 3.300809860229492,
959
+ "rewards/alfworld_rollout_reward_func/mean": 0.8289061784744263,
960
+ "rewards/alfworld_rollout_reward_func/std": 3.700098991394043,
961
+ "sampling/importance_sampling_ratio/max": 2.9519269466400146,
962
+ "sampling/importance_sampling_ratio/mean": 0.7473565340042114,
963
+ "sampling/importance_sampling_ratio/min": 0.0,
964
+ "sampling/sampling_logp_difference/max": 2.6239898204803467,
965
+ "sampling/sampling_logp_difference/mean": 0.024686329066753387,
966
+ "step": 29,
967
+ "step_time": 316.5334539249998
968
+ },
969
+ {
970
+ "clip_ratio/high_max": 0.0,
971
+ "clip_ratio/high_mean": 0.0,
972
+ "clip_ratio/low_mean": 0.0,
973
+ "clip_ratio/low_min": 0.0,
974
+ "clip_ratio/region_mean": 0.0,
975
+ "completions/clipped_ratio": 0.140625,
976
+ "completions/max_length": 512.0,
977
+ "completions/max_terminated_length": 404.0,
978
+ "completions/mean_length": 182.484375,
979
+ "completions/mean_terminated_length": 128.56362915039062,
980
+ "completions/min_length": 69.0,
981
+ "completions/min_terminated_length": 69.0,
982
+ "entropy": 0.4658549278974533,
983
+ "epoch": 0.4838709677419355,
984
+ "frac_reward_zero_std": 0.0,
985
+ "grad_norm": 0.1560395509004593,
986
+ "kl": 0.05294563015922904,
987
+ "learning_rate": 5.732592949993898e-07,
988
+ "loss": 0.0688,
989
+ "num_tokens": 2675234.0,
990
+ "reward": 0.84765625,
991
+ "reward_std": 3.315725803375244,
992
+ "rewards/alfworld_rollout_reward_func/mean": 0.84765625,
993
+ "rewards/alfworld_rollout_reward_func/std": 3.8803963661193848,
994
+ "sampling/importance_sampling_ratio/max": 2.622981548309326,
995
+ "sampling/importance_sampling_ratio/mean": 0.7878429293632507,
996
+ "sampling/importance_sampling_ratio/min": 0.0,
997
+ "sampling/sampling_logp_difference/max": 3.5301332473754883,
998
+ "sampling/sampling_logp_difference/mean": 0.026188407093286514,
999
+ "step": 30,
1000
+ "step_time": 298.4171370410004
1001
+ },
1002
+ {
1003
+ "clip_ratio/high_max": 0.0,
1004
+ "clip_ratio/high_mean": 0.0,
1005
+ "clip_ratio/low_mean": 0.0,
1006
+ "clip_ratio/low_min": 0.0,
1007
+ "clip_ratio/region_mean": 0.0,
1008
+ "completions/clipped_ratio": 0.1328125,
1009
+ "completions/max_length": 512.0,
1010
+ "completions/max_terminated_length": 446.0,
1011
+ "completions/mean_length": 179.0078125,
1012
+ "completions/mean_terminated_length": 128.00901794433594,
1013
+ "completions/min_length": 63.0,
1014
+ "completions/min_terminated_length": 63.0,
1015
+ "entropy": 0.5069509670138359,
1016
+ "epoch": 0.5,
1017
+ "frac_reward_zero_std": 0.0,
1018
+ "grad_norm": 0.20257869362831116,
1019
+ "kl": 0.055831174831837416,
1020
+ "learning_rate": 2.5587573434381895e-07,
1021
+ "loss": 0.0626,
1022
+ "num_tokens": 2762673.0,
1023
+ "reward": 0.6875,
1024
+ "reward_std": 3.218989372253418,
1025
+ "rewards/alfworld_rollout_reward_func/mean": 0.6875,
1026
+ "rewards/alfworld_rollout_reward_func/std": 3.5843491554260254,
1027
+ "sampling/importance_sampling_ratio/max": 2.8179516792297363,
1028
+ "sampling/importance_sampling_ratio/mean": 0.7038029432296753,
1029
+ "sampling/importance_sampling_ratio/min": 0.0,
1030
+ "sampling/sampling_logp_difference/max": 4.102773666381836,
1031
+ "sampling/sampling_logp_difference/mean": 0.029051225632429123,
1032
+ "step": 31,
1033
+ "step_time": 305.9211267239989
1034
+ },
1035
+ {
1036
+ "clip_ratio/high_max": 0.0,
1037
+ "clip_ratio/high_mean": 0.0,
1038
+ "clip_ratio/low_mean": 0.0,
1039
+ "clip_ratio/low_min": 0.0,
1040
+ "clip_ratio/region_mean": 0.0,
1041
+ "completions/clipped_ratio": 0.140625,
1042
+ "completions/max_length": 512.0,
1043
+ "completions/max_terminated_length": 486.0,
1044
+ "completions/mean_length": 193.7734375,
1045
+ "completions/mean_terminated_length": 141.6999969482422,
1046
+ "completions/min_length": 64.0,
1047
+ "completions/min_terminated_length": 64.0,
1048
+ "entropy": 0.50175466760993,
1049
+ "epoch": 0.5161290322580645,
1050
+ "frac_reward_zero_std": 0.0,
1051
+ "grad_norm": 0.14924179017543793,
1052
+ "kl": 0.03636577841825783,
1053
+ "learning_rate": 6.413345760131057e-08,
1054
+ "loss": 0.1286,
1055
+ "num_tokens": 2849119.0,
1056
+ "reward": 0.9367187023162842,
1057
+ "reward_std": 3.7098560333251953,
1058
+ "rewards/alfworld_rollout_reward_func/mean": 0.9367187023162842,
1059
+ "rewards/alfworld_rollout_reward_func/std": 3.8140616416931152,
1060
+ "sampling/importance_sampling_ratio/max": 2.708343744277954,
1061
+ "sampling/importance_sampling_ratio/mean": 0.8295129537582397,
1062
+ "sampling/importance_sampling_ratio/min": 0.0,
1063
+ "sampling/sampling_logp_difference/max": 3.4670004844665527,
1064
+ "sampling/sampling_logp_difference/mean": 0.027453571557998657,
1065
+ "step": 32,
1066
+ "step_time": 294.9108767129983
1067
+ }
1068
+ ],
1069
+ "logging_steps": 1,
1070
+ "max_steps": 32,
1071
+ "num_input_tokens_seen": 2849119,
1072
+ "num_train_epochs": 1,
1073
+ "save_steps": 20,
1074
+ "stateful_callbacks": {
1075
+ "TrainerControl": {
1076
+ "args": {
1077
+ "should_epoch_stop": false,
1078
+ "should_evaluate": false,
1079
+ "should_log": false,
1080
+ "should_save": true,
1081
+ "should_training_stop": true
1082
+ },
1083
+ "attributes": {}
1084
+ }
1085
+ },
1086
+ "total_flos": 0.0,
1087
+ "train_batch_size": 8,
1088
+ "trial_name": null,
1089
+ "trial_params": null
1090
+ }
checkpoint-32/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe31dee94209c1973566ddbbc45e4eaa5758a9402713eebce4f579426cfdc6a3
3
+ size 9361
checkpoint-32/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
config.json ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen2ForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "bos_token_id": 151645,
7
+ "dtype": "bfloat16",
8
+ "eos_token_id": 151645,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 2048,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 11008,
13
+ "layer_types": [
14
+ "full_attention",
15
+ "full_attention",
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention",
44
+ "full_attention",
45
+ "full_attention",
46
+ "full_attention",
47
+ "full_attention",
48
+ "full_attention",
49
+ "full_attention"
50
+ ],
51
+ "max_position_embeddings": 32768,
52
+ "max_window_layers": 70,
53
+ "model_type": "qwen2",
54
+ "num_attention_heads": 16,
55
+ "num_hidden_layers": 36,
56
+ "num_key_value_heads": 2,
57
+ "rms_norm_eps": 1e-06,
58
+ "rope_scaling": null,
59
+ "rope_theta": 1000000.0,
60
+ "sliding_window": null,
61
+ "tie_word_embeddings": true,
62
+ "transformers_version": "4.57.6",
63
+ "use_cache": false,
64
+ "use_sliding_window": false,
65
+ "vocab_size": 151936
66
+ }
debug.log ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2026-01-26 07:21:36,287] [WARNING] [py.warnings._showwarnmsg:110] [PID:300] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/torch/backends/__init__.py:46: UserWarning: Please use the new API settings to control TF32 behavior, such as torch.backends.cudnn.conv.fp32_precision = 'tf32' or torch.backends.cuda.matmul.fp32_precision = 'ieee'. Old settings, e.g, torch.backends.cuda.matmul.allow_tf32 = True, torch.backends.cudnn.allow_tf32 = True, allowTF32CuDNN() and allowTF32CuBLAS() will be deprecated after Pytorch 2.9. Please see https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices (Triggered internally at /pytorch/aten/src/ATen/Context.cpp:80.)
2
+ self.setter(val)
3
+
4
+
5
+
6
+ [2026-01-26 07:21:58,152] [WARNING] [py.warnings._showwarnmsg:110] [PID:300] <string>:246: FutureWarning: The `max_prompt_length` argument is deprecated and will be removed in version 0.28.0. You should instead filter your dataset before training to ensure that prompts do not exceed your desired length.
7
+
8
+ [2026-01-26 07:21:58,799] [WARNING] [py.warnings._showwarnmsg:110] [PID:300] /workspace/axolotl/src/axolotl/core/trainers/mixins/optimizer.py:209: UserWarning: You are importing from 'rollout_func', which is an experimental feature. This API may change or be removed at any time without prior notice. Silence this warning by setting environment variable TRL_EXPERIMENTAL_SILENCE=1.
9
+ super().__init__(*args, **kwargs)
10
+
11
+ 2026-01-26 07:22:41,128 - INFO - autotuner.py:256 - flashinfer.jit: [Autotuner]: Autotuning process starts ...
12
+ 2026-01-26 07:22:41,141 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process ends
13
+ AlfWorld endpoint initialized on rank 1 at http://environment-server-1:8000
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
ref/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "/cache/models/Qwen--Qwen2.5-3B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": null,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.0,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 16,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "gate_proj",
33
+ "k_proj",
34
+ "up_proj",
35
+ "q_proj",
36
+ "o_proj",
37
+ "down_proj",
38
+ "v_proj"
39
+ ],
40
+ "target_parameters": [],
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
ref/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a73e8a3357e88549443ed67169c44587d8849b718265a36909b7373da2322a00
3
+ size 59934640
special_tokens_map.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "bos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "eos_token": {
25
+ "content": "<|im_end|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ },
31
+ "pad_token": {
32
+ "content": "<|endoftext|>",
33
+ "lstrip": false,
34
+ "normalized": false,
35
+ "rstrip": false,
36
+ "single_word": false
37
+ }
38
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": "<|im_end|>",
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "split_special_tokens": false,
205
+ "tokenizer_class": "Qwen2Tokenizer",
206
+ "unk_token": null
207
+ }