sea-rod committed on
Commit
f83f0e2
·
verified ·
1 Parent(s): 592cfaf

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +23 -0
  2. v3/Base/gen-output/data-00000-of-00001.arrow +3 -0
  3. v3/Base/gen-output/dataset_info.json +34 -0
  4. v3/Base/gen-output/state.json +13 -0
  5. v3/DPO/DPO_10k/DPO_10k/README.md +209 -0
  6. v3/DPO/DPO_10k/DPO_10k/adapter_config.json +46 -0
  7. v3/DPO/DPO_10k/DPO_10k/adapter_model.safetensors +3 -0
  8. v3/DPO/DPO_10k/MDPO_10k/chat_template.jinja +93 -0
  9. v3/DPO/DPO_10k/MDPO_10k/config.json +40 -0
  10. v3/DPO/DPO_10k/MDPO_10k/generation_config.json +12 -0
  11. v3/DPO/DPO_10k/MDPO_10k/model.safetensors +3 -0
  12. v3/DPO/DPO_10k/MDPO_10k/tokenizer.json +3 -0
  13. v3/DPO/DPO_10k/MDPO_10k/tokenizer_config.json +14 -0
  14. v3/DPO/DPO_10k/lora/README.md +68 -0
  15. v3/DPO/DPO_10k/lora/checkpoint-1000/README.md +209 -0
  16. v3/DPO/DPO_10k/lora/checkpoint-1000/adapter_config.json +46 -0
  17. v3/DPO/DPO_10k/lora/checkpoint-1000/adapter_model.safetensors +3 -0
  18. v3/DPO/DPO_10k/lora/checkpoint-1000/chat_template.jinja +93 -0
  19. v3/DPO/DPO_10k/lora/checkpoint-1000/optimizer.pt +3 -0
  20. v3/DPO/DPO_10k/lora/checkpoint-1000/rng_state.pth +3 -0
  21. v3/DPO/DPO_10k/lora/checkpoint-1000/scaler.pt +3 -0
  22. v3/DPO/DPO_10k/lora/checkpoint-1000/scheduler.pt +3 -0
  23. v3/DPO/DPO_10k/lora/checkpoint-1000/tokenizer.json +3 -0
  24. v3/DPO/DPO_10k/lora/checkpoint-1000/tokenizer_config.json +14 -0
  25. v3/DPO/DPO_10k/lora/checkpoint-1000/trainer_state.json +1694 -0
  26. v3/DPO/DPO_10k/lora/checkpoint-1000/training_args.bin +3 -0
  27. v3/DPO/DPO_10k/lora/checkpoint-1500/README.md +209 -0
  28. v3/DPO/DPO_10k/lora/checkpoint-1500/adapter_config.json +46 -0
  29. v3/DPO/DPO_10k/lora/checkpoint-1500/adapter_model.safetensors +3 -0
  30. v3/DPO/DPO_10k/lora/checkpoint-1500/chat_template.jinja +93 -0
  31. v3/DPO/DPO_10k/lora/checkpoint-1500/optimizer.pt +3 -0
  32. v3/DPO/DPO_10k/lora/checkpoint-1500/rng_state.pth +3 -0
  33. v3/DPO/DPO_10k/lora/checkpoint-1500/scaler.pt +3 -0
  34. v3/DPO/DPO_10k/lora/checkpoint-1500/scheduler.pt +3 -0
  35. v3/DPO/DPO_10k/lora/checkpoint-1500/tokenizer.json +3 -0
  36. v3/DPO/DPO_10k/lora/checkpoint-1500/tokenizer_config.json +14 -0
  37. v3/DPO/DPO_10k/lora/checkpoint-1500/trainer_state.json +2524 -0
  38. v3/DPO/DPO_10k/lora/checkpoint-1500/training_args.bin +3 -0
  39. v3/DPO/DPO_10k/lora/checkpoint-2000/README.md +209 -0
  40. v3/DPO/DPO_10k/lora/checkpoint-2000/adapter_config.json +46 -0
  41. v3/DPO/DPO_10k/lora/checkpoint-2000/adapter_model.safetensors +3 -0
  42. v3/DPO/DPO_10k/lora/checkpoint-2000/chat_template.jinja +93 -0
  43. v3/DPO/DPO_10k/lora/checkpoint-2000/optimizer.pt +3 -0
  44. v3/DPO/DPO_10k/lora/checkpoint-2000/rng_state.pth +3 -0
  45. v3/DPO/DPO_10k/lora/checkpoint-2000/scaler.pt +3 -0
  46. v3/DPO/DPO_10k/lora/checkpoint-2000/scheduler.pt +3 -0
  47. v3/DPO/DPO_10k/lora/checkpoint-2000/tokenizer.json +3 -0
  48. v3/DPO/DPO_10k/lora/checkpoint-2000/tokenizer_config.json +14 -0
  49. v3/DPO/DPO_10k/lora/checkpoint-2000/trainer_state.json +0 -0
  50. v3/DPO/DPO_10k/lora/checkpoint-2000/training_args.bin +3 -0
.gitattributes CHANGED
@@ -33,3 +33,26 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ v3/DPO/DPO_10k/MDPO_10k/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ v3/DPO/DPO_10k/lora/checkpoint-1000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
+ v3/DPO/DPO_10k/lora/checkpoint-1500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
39
+ v3/DPO/DPO_10k/lora/checkpoint-2000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
40
+ v3/DPO/DPO_10k/lora/checkpoint-2500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
41
+ v3/DPO/DPO_10k/lora/checkpoint-3000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
42
+ v3/DPO/DPO_10k/lora/checkpoint-3500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
43
+ v3/DPO/DPO_10k/lora/checkpoint-3750/tokenizer.json filter=lfs diff=lfs merge=lfs -text
44
+ v3/DPO/DPO_10k/lora/checkpoint-500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
45
+ v3/DPO/DPO_1k/MDPO_1k/tokenizer.json filter=lfs diff=lfs merge=lfs -text
46
+ v3/DPO/DPO_1k/lora/checkpoint-375/tokenizer.json filter=lfs diff=lfs merge=lfs -text
47
+ v3/DPO/DPO_5k/MDPO_5k/tokenizer.json filter=lfs diff=lfs merge=lfs -text
48
+ v3/DPO/DPO_5k/lora/checkpoint-1000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
49
+ v3/DPO/DPO_5k/lora/checkpoint-1500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
50
+ v3/DPO/DPO_5k/lora/checkpoint-1875/tokenizer.json filter=lfs diff=lfs merge=lfs -text
51
+ v3/DPO/DPO_5k/lora/checkpoint-500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
52
+ v3/ORPO/ORPO_1k/MORPO_1k/tokenizer.json filter=lfs diff=lfs merge=lfs -text
53
+ v3/ORPO/ORPO_1k/lora/checkpoint-375/tokenizer.json filter=lfs diff=lfs merge=lfs -text
54
+ v3/ORPO/ORPO_5k/MORPO_5k/tokenizer.json filter=lfs diff=lfs merge=lfs -text
55
+ v3/ORPO/ORPO_5k/lora/checkpoint-1000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
56
+ v3/ORPO/ORPO_5k/lora/checkpoint-1500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
57
+ v3/ORPO/ORPO_5k/lora/checkpoint-1875/tokenizer.json filter=lfs diff=lfs merge=lfs -text
58
+ v3/ORPO/ORPO_5k/lora/checkpoint-500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
v3/Base/gen-output/data-00000-of-00001.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efabd6f3e982da43a3b3d18d95b36f512aee97801c2e980bce071c90f75c41d3
3
+ size 982376
v3/Base/gen-output/dataset_info.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "prompt": {
6
+ "feature": {
7
+ "content": {
8
+ "dtype": "string",
9
+ "_type": "Value"
10
+ },
11
+ "role": {
12
+ "dtype": "string",
13
+ "_type": "Value"
14
+ }
15
+ },
16
+ "_type": "List"
17
+ },
18
+ "generated_text": {
19
+ "feature": {
20
+ "content": {
21
+ "dtype": "string",
22
+ "_type": "Value"
23
+ },
24
+ "role": {
25
+ "dtype": "string",
26
+ "_type": "Value"
27
+ }
28
+ },
29
+ "_type": "List"
30
+ }
31
+ },
32
+ "homepage": "",
33
+ "license": ""
34
+ }
v3/Base/gen-output/state.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "f319c5a85dc7aa54",
8
+ "_format_columns": null,
9
+ "_format_kwargs": {},
10
+ "_format_type": null,
11
+ "_output_all_columns": false,
12
+ "_split": null
13
+ }
v3/DPO/DPO_10k/DPO_10k/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: meta-llama/Llama-3.2-1B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:meta-llama/Llama-3.2-1B-Instruct
7
+ - dpo
8
+ - lora
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.18.1
v3/DPO/DPO_10k/DPO_10k/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.1,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 64,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "v_proj",
33
+ "gate_proj",
34
+ "down_proj",
35
+ "k_proj",
36
+ "o_proj",
37
+ "up_proj",
38
+ "q_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
v3/DPO/DPO_10k/DPO_10k/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27a5e5a70889c639646183be14875b89ae81e677265281b25eadec5735c154e0
3
+ size 180385008
v3/DPO/DPO_10k/MDPO_10k/chat_template.jinja ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{- bos_token }}
2
+ {%- if custom_tools is defined %}
3
+ {%- set tools = custom_tools %}
4
+ {%- endif %}
5
+ {%- if not tools_in_user_message is defined %}
6
+ {%- set tools_in_user_message = true %}
7
+ {%- endif %}
8
+ {%- if not date_string is defined %}
9
+ {%- if strftime_now is defined %}
10
+ {%- set date_string = strftime_now("%d %b %Y") %}
11
+ {%- else %}
12
+ {%- set date_string = "26 Jul 2024" %}
13
+ {%- endif %}
14
+ {%- endif %}
15
+ {%- if not tools is defined %}
16
+ {%- set tools = none %}
17
+ {%- endif %}
18
+
19
+ {#- This block extracts the system message, so we can slot it into the right place. #}
20
+ {%- if messages[0]['role'] == 'system' %}
21
+ {%- set system_message = messages[0]['content']|trim %}
22
+ {%- set messages = messages[1:] %}
23
+ {%- else %}
24
+ {%- set system_message = "" %}
25
+ {%- endif %}
26
+
27
+ {#- System message #}
28
+ {{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
29
+ {%- if tools is not none %}
30
+ {{- "Environment: ipython\n" }}
31
+ {%- endif %}
32
+ {{- "Cutting Knowledge Date: December 2023\n" }}
33
+ {{- "Today Date: " + date_string + "\n\n" }}
34
+ {%- if tools is not none and not tools_in_user_message %}
35
+ {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
36
+ {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
37
+ {{- "Do not use variables.\n\n" }}
38
+ {%- for t in tools %}
39
+ {{- t | tojson(indent=4) }}
40
+ {{- "\n\n" }}
41
+ {%- endfor %}
42
+ {%- endif %}
43
+ {{- system_message }}
44
+ {{- "<|eot_id|>" }}
45
+
46
+ {#- Custom tools are passed in a user message with some extra guidance #}
47
+ {%- if tools_in_user_message and not tools is none %}
48
+ {#- Extract the first user message so we can plug it in here #}
49
+ {%- if messages | length != 0 %}
50
+ {%- set first_user_message = messages[0]['content']|trim %}
51
+ {%- set messages = messages[1:] %}
52
+ {%- else %}
53
+ {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
54
+ {%- endif %}
55
+ {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
56
+ {{- "Given the following functions, please respond with a JSON for a function call " }}
57
+ {{- "with its proper arguments that best answers the given prompt.\n\n" }}
58
+ {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
59
+ {{- "Do not use variables.\n\n" }}
60
+ {%- for t in tools %}
61
+ {{- t | tojson(indent=4) }}
62
+ {{- "\n\n" }}
63
+ {%- endfor %}
64
+ {{- first_user_message + "<|eot_id|>"}}
65
+ {%- endif %}
66
+
67
+ {%- for message in messages %}
68
+ {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
69
+ {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
70
+ {%- elif 'tool_calls' in message %}
71
+ {%- if not message.tool_calls|length == 1 %}
72
+ {{- raise_exception("This model only supports single tool-calls at once!") }}
73
+ {%- endif %}
74
+ {%- set tool_call = message.tool_calls[0].function %}
75
+ {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
76
+ {{- '{"name": "' + tool_call.name + '", ' }}
77
+ {{- '"parameters": ' }}
78
+ {{- tool_call.arguments | tojson }}
79
+ {{- "}" }}
80
+ {{- "<|eot_id|>" }}
81
+ {%- elif message.role == "tool" or message.role == "ipython" %}
82
+ {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
83
+ {%- if message.content is mapping or message.content is iterable %}
84
+ {{- message.content | tojson }}
85
+ {%- else %}
86
+ {{- message.content }}
87
+ {%- endif %}
88
+ {{- "<|eot_id|>" }}
89
+ {%- endif %}
90
+ {%- endfor %}
91
+ {%- if add_generation_prompt %}
92
+ {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
93
+ {%- endif %}
v3/DPO/DPO_10k/MDPO_10k/config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 128000,
8
+ "dtype": "float16",
9
+ "eos_token_id": [
10
+ 128001,
11
+ 128008,
12
+ 128009
13
+ ],
14
+ "head_dim": 64,
15
+ "hidden_act": "silu",
16
+ "hidden_size": 2048,
17
+ "initializer_range": 0.02,
18
+ "intermediate_size": 8192,
19
+ "max_position_embeddings": 131072,
20
+ "mlp_bias": false,
21
+ "model_type": "llama",
22
+ "num_attention_heads": 32,
23
+ "num_hidden_layers": 16,
24
+ "num_key_value_heads": 8,
25
+ "pad_token_id": null,
26
+ "pretraining_tp": 1,
27
+ "rms_norm_eps": 1e-05,
28
+ "rope_parameters": {
29
+ "factor": 32.0,
30
+ "high_freq_factor": 4.0,
31
+ "low_freq_factor": 1.0,
32
+ "original_max_position_embeddings": 8192,
33
+ "rope_theta": 500000.0,
34
+ "rope_type": "llama3"
35
+ },
36
+ "tie_word_embeddings": true,
37
+ "transformers_version": "5.2.0",
38
+ "use_cache": true,
39
+ "vocab_size": 128256
40
+ }
v3/DPO/DPO_10k/MDPO_10k/generation_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 128000,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 128001,
6
+ 128008,
7
+ 128009
8
+ ],
9
+ "temperature": 0.6,
10
+ "top_p": 0.9,
11
+ "transformers_version": "5.2.0"
12
+ }
v3/DPO/DPO_10k/MDPO_10k/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85c9d82fdf7dbcecd629e2cc9efd75e800f619818033fb81d1f7884887a67cc7
3
+ size 2471645464
v3/DPO/DPO_10k/MDPO_10k/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
3
+ size 17209920
v3/DPO/DPO_10k/MDPO_10k/tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": "<|begin_of_text|>",
4
+ "clean_up_tokenization_spaces": true,
5
+ "eos_token": "<|eot_id|>",
6
+ "is_local": false,
7
+ "model_input_names": [
8
+ "input_ids",
9
+ "attention_mask"
10
+ ],
11
+ "model_max_length": 131072,
12
+ "pad_token": "<|eot_id|>",
13
+ "tokenizer_class": "TokenizersBackend"
14
+ }
v3/DPO/DPO_10k/lora/README.md ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: meta-llama/Llama-3.2-1B-Instruct
3
+ library_name: transformers
4
+ model_name: lora
5
+ tags:
6
+ - generated_from_trainer
7
+ - trl
8
+ - dpo
9
+ licence: license
10
+ ---
11
+
12
+ # Model Card for lora
13
+
14
+ This model is a fine-tuned version of [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct).
15
+ It has been trained using [TRL](https://github.com/huggingface/trl).
16
+
17
+ ## Quick start
18
+
19
+ ```python
20
+ from transformers import pipeline
21
+
22
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
23
+ generator = pipeline("text-generation", model="path/to/this/model", device="cuda")  # replace with this model's Hub repo id or local directory
24
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
25
+ print(output["generated_text"])
26
+ ```
27
+
28
+ ## Training procedure
29
+
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/sea-rod/huggingface/runs/9nfqa09t)
31
+
32
+
33
+ This model was trained with DPO, a method introduced in [Direct Preference Optimization: Your Language Model is Secretly a Reward Model](https://huggingface.co/papers/2305.18290).
34
+
35
+ ### Framework versions
36
+
37
+ - TRL: 0.28.0
38
+ - Transformers: 5.2.0
39
+ - Pytorch: 2.10.0
40
+ - Datasets: 4.5.0
41
+ - Tokenizers: 0.22.2
42
+
43
+ ## Citations
44
+
45
+ Cite DPO as:
46
+
47
+ ```bibtex
48
+ @inproceedings{rafailov2023direct,
49
+ title = {{Direct Preference Optimization: Your Language Model is Secretly a Reward Model}},
50
+ author = {Rafael Rafailov and Archit Sharma and Eric Mitchell and Christopher D. Manning and Stefano Ermon and Chelsea Finn},
51
+ year = 2023,
52
+ booktitle = {Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems 2023, NeurIPS 2023, New Orleans, LA, USA, December 10 - 16, 2023},
53
+ url = {http://papers.nips.cc/paper_files/paper/2023/hash/a85b405ed65c6477a4fe8302b5e06ce7-Abstract-Conference.html},
54
+ editor = {Alice Oh and Tristan Naumann and Amir Globerson and Kate Saenko and Moritz Hardt and Sergey Levine},
55
+ }
56
+ ```
57
+
58
+ Cite TRL as:
59
+
60
+ ```bibtex
61
+ @software{vonwerra2020trl,
62
+ title = {{TRL: Transformers Reinforcement Learning}},
63
+ author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
64
+ license = {Apache-2.0},
65
+ url = {https://github.com/huggingface/trl},
66
+ year = {2020}
67
+ }
68
+ ```
v3/DPO/DPO_10k/lora/checkpoint-1000/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: meta-llama/Llama-3.2-1B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:meta-llama/Llama-3.2-1B-Instruct
7
+ - dpo
8
+ - lora
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.18.1
v3/DPO/DPO_10k/lora/checkpoint-1000/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.1,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 64,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "v_proj",
33
+ "gate_proj",
34
+ "down_proj",
35
+ "k_proj",
36
+ "o_proj",
37
+ "up_proj",
38
+ "q_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
v3/DPO/DPO_10k/lora/checkpoint-1000/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27a5e5a70889c639646183be14875b89ae81e677265281b25eadec5735c154e0
3
+ size 180385008
v3/DPO/DPO_10k/lora/checkpoint-1000/chat_template.jinja ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{- bos_token }}
2
+ {%- if custom_tools is defined %}
3
+ {%- set tools = custom_tools %}
4
+ {%- endif %}
5
+ {%- if not tools_in_user_message is defined %}
6
+ {%- set tools_in_user_message = true %}
7
+ {%- endif %}
8
+ {%- if not date_string is defined %}
9
+ {%- if strftime_now is defined %}
10
+ {%- set date_string = strftime_now("%d %b %Y") %}
11
+ {%- else %}
12
+ {%- set date_string = "26 Jul 2024" %}
13
+ {%- endif %}
14
+ {%- endif %}
15
+ {%- if not tools is defined %}
16
+ {%- set tools = none %}
17
+ {%- endif %}
18
+
19
+ {#- This block extracts the system message, so we can slot it into the right place. #}
20
+ {%- if messages[0]['role'] == 'system' %}
21
+ {%- set system_message = messages[0]['content']|trim %}
22
+ {%- set messages = messages[1:] %}
23
+ {%- else %}
24
+ {%- set system_message = "" %}
25
+ {%- endif %}
26
+
27
+ {#- System message #}
28
+ {{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
29
+ {%- if tools is not none %}
30
+ {{- "Environment: ipython\n" }}
31
+ {%- endif %}
32
+ {{- "Cutting Knowledge Date: December 2023\n" }}
33
+ {{- "Today Date: " + date_string + "\n\n" }}
34
+ {%- if tools is not none and not tools_in_user_message %}
35
+ {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
36
+ {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
37
+ {{- "Do not use variables.\n\n" }}
38
+ {%- for t in tools %}
39
+ {{- t | tojson(indent=4) }}
40
+ {{- "\n\n" }}
41
+ {%- endfor %}
42
+ {%- endif %}
43
+ {{- system_message }}
44
+ {{- "<|eot_id|>" }}
45
+
46
+ {#- Custom tools are passed in a user message with some extra guidance #}
47
+ {%- if tools_in_user_message and not tools is none %}
48
+ {#- Extract the first user message so we can plug it in here #}
49
+ {%- if messages | length != 0 %}
50
+ {%- set first_user_message = messages[0]['content']|trim %}
51
+ {%- set messages = messages[1:] %}
52
+ {%- else %}
53
+ {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
54
+ {%- endif %}
55
+ {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
56
+ {{- "Given the following functions, please respond with a JSON for a function call " }}
57
+ {{- "with its proper arguments that best answers the given prompt.\n\n" }}
58
+ {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
59
+ {{- "Do not use variables.\n\n" }}
60
+ {%- for t in tools %}
61
+ {{- t | tojson(indent=4) }}
62
+ {{- "\n\n" }}
63
+ {%- endfor %}
64
+ {{- first_user_message + "<|eot_id|>"}}
65
+ {%- endif %}
66
+
67
+ {%- for message in messages %}
68
+ {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
69
+ {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
70
+ {%- elif 'tool_calls' in message %}
71
+ {%- if not message.tool_calls|length == 1 %}
72
+ {{- raise_exception("This model only supports single tool-calls at once!") }}
73
+ {%- endif %}
74
+ {%- set tool_call = message.tool_calls[0].function %}
75
+ {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
76
+ {{- '{"name": "' + tool_call.name + '", ' }}
77
+ {{- '"parameters": ' }}
78
+ {{- tool_call.arguments | tojson }}
79
+ {{- "}" }}
80
+ {{- "<|eot_id|>" }}
81
+ {%- elif message.role == "tool" or message.role == "ipython" %}
82
+ {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
83
+ {%- if message.content is mapping or message.content is iterable %}
84
+ {{- message.content | tojson }}
85
+ {%- else %}
86
+ {{- message.content }}
87
+ {%- endif %}
88
+ {{- "<|eot_id|>" }}
89
+ {%- endif %}
90
+ {%- endfor %}
91
+ {%- if add_generation_prompt %}
92
+ {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
93
+ {%- endif %}
v3/DPO/DPO_10k/lora/checkpoint-1000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d87beff974577f720d7e8125b66164907ad5b681a0dedce34f2ee0ab3d344976
3
+ size 360902475
v3/DPO/DPO_10k/lora/checkpoint-1000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea11996454b5587fcf33ae0ab5cf14b2031bf5f53f8c2ed5a48e87de31e29c84
3
+ size 14645
v3/DPO/DPO_10k/lora/checkpoint-1000/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14ae2a2128444abab378aa06c09a61a84665f758fcc19fc46f5789b0bc1b5665
3
+ size 1383
v3/DPO/DPO_10k/lora/checkpoint-1000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c517d0407369461be1bd2161fd20105c2aa4886637aab49e0225f47034adb975
3
+ size 1465
v3/DPO/DPO_10k/lora/checkpoint-1000/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
3
+ size 17209920
v3/DPO/DPO_10k/lora/checkpoint-1000/tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": "<|begin_of_text|>",
4
+ "clean_up_tokenization_spaces": true,
5
+ "eos_token": "<|eot_id|>",
6
+ "is_local": false,
7
+ "model_input_names": [
8
+ "input_ids",
9
+ "attention_mask"
10
+ ],
11
+ "model_max_length": 131072,
12
+ "pad_token": "<|eot_id|>",
13
+ "tokenizer_class": "TokenizersBackend"
14
+ }
v3/DPO/DPO_10k/lora/checkpoint-1000/trainer_state.json ADDED
@@ -0,0 +1,1694 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 1000,
3
+ "best_metric": 0.6880542039871216,
4
+ "best_model_checkpoint": "output/lora/checkpoint-1000",
5
+ "epoch": 0.8,
6
+ "eval_steps": 100,
7
+ "global_step": 1000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.008,
14
+ "grad_norm": 1.1179773807525635,
15
+ "learning_rate": 2.4e-08,
16
+ "logits/chosen": 1.4545083045959473,
17
+ "logits/rejected": 1.425490379333496,
18
+ "logps/chosen": -132.46189880371094,
19
+ "logps/rejected": -148.20260620117188,
20
+ "loss": 0.6930626392364502,
21
+ "rewards/accuracies": 0.21250000596046448,
22
+ "rewards/chosen": -6.165739614516497e-05,
23
+ "rewards/margins": 0.00017333509458694607,
24
+ "rewards/rejected": -0.00023499250528402627,
25
+ "step": 10
26
+ },
27
+ {
28
+ "epoch": 0.016,
29
+ "grad_norm": 1.3502839803695679,
30
+ "learning_rate": 5.0666666666666664e-08,
31
+ "logits/chosen": 1.5668801069259644,
32
+ "logits/rejected": 1.5191389322280884,
33
+ "logps/chosen": -157.63624572753906,
34
+ "logps/rejected": -144.2716827392578,
35
+ "loss": 0.6930380821228027,
36
+ "rewards/accuracies": 0.5874999761581421,
37
+ "rewards/chosen": 0.0009244013344869018,
38
+ "rewards/margins": 0.00023505210992880166,
39
+ "rewards/rejected": 0.0006893490790389478,
40
+ "step": 20
41
+ },
42
+ {
43
+ "epoch": 0.024,
44
+ "grad_norm": 1.0219553709030151,
45
+ "learning_rate": 7.733333333333334e-08,
46
+ "logits/chosen": 1.8921797275543213,
47
+ "logits/rejected": 1.7342685461044312,
48
+ "logps/chosen": -164.55799865722656,
49
+ "logps/rejected": -158.92752075195312,
50
+ "loss": 0.69290189743042,
51
+ "rewards/accuracies": 0.5375000238418579,
52
+ "rewards/chosen": 0.0009391927160322666,
53
+ "rewards/margins": 0.0005032324115745723,
54
+ "rewards/rejected": 0.0004359603044576943,
55
+ "step": 30
56
+ },
57
+ {
58
+ "epoch": 0.032,
59
+ "grad_norm": 1.2056113481521606,
60
+ "learning_rate": 1.0399999999999999e-07,
61
+ "logits/chosen": 1.4950047731399536,
62
+ "logits/rejected": 1.6163270473480225,
63
+ "logps/chosen": -145.54183959960938,
64
+ "logps/rejected": -134.19979858398438,
65
+ "loss": 0.6935500621795654,
66
+ "rewards/accuracies": 0.4124999940395355,
67
+ "rewards/chosen": 0.0006421065190806985,
68
+ "rewards/margins": -0.0007943272357806563,
69
+ "rewards/rejected": 0.0014364338712766767,
70
+ "step": 40
71
+ },
72
+ {
73
+ "epoch": 0.04,
74
+ "grad_norm": 1.1029725074768066,
75
+ "learning_rate": 1.3066666666666665e-07,
76
+ "logits/chosen": 1.4076533317565918,
77
+ "logits/rejected": 1.4791977405548096,
78
+ "logps/chosen": -133.23324584960938,
79
+ "logps/rejected": -138.3593292236328,
80
+ "loss": 0.6925833702087403,
81
+ "rewards/accuracies": 0.574999988079071,
82
+ "rewards/chosen": 0.0012372838100418448,
83
+ "rewards/margins": 0.001139522879384458,
84
+ "rewards/rejected": 9.776116348803043e-05,
85
+ "step": 50
86
+ },
87
+ {
88
+ "epoch": 0.048,
89
+ "grad_norm": 0.9960266351699829,
90
+ "learning_rate": 1.573333333333333e-07,
91
+ "logits/chosen": 1.407166600227356,
92
+ "logits/rejected": 1.376042127609253,
93
+ "logps/chosen": -137.8921661376953,
94
+ "logps/rejected": -127.97320556640625,
95
+ "loss": 0.693614149093628,
96
+ "rewards/accuracies": 0.4749999940395355,
97
+ "rewards/chosen": -0.00021316049969755113,
98
+ "rewards/margins": -0.0009249996510334313,
99
+ "rewards/rejected": 0.0007118391804397106,
100
+ "step": 60
101
+ },
102
+ {
103
+ "epoch": 0.056,
104
+ "grad_norm": 1.0946824550628662,
105
+ "learning_rate": 1.8399999999999998e-07,
106
+ "logits/chosen": 1.5944501161575317,
107
+ "logits/rejected": 1.7248146533966064,
108
+ "logps/chosen": -153.5526123046875,
109
+ "logps/rejected": -157.19735717773438,
110
+ "loss": 0.6928449153900147,
111
+ "rewards/accuracies": 0.5375000238418579,
112
+ "rewards/chosen": 0.0026370862033218145,
113
+ "rewards/margins": 0.0006150150438770652,
114
+ "rewards/rejected": 0.0020220710430294275,
115
+ "step": 70
116
+ },
117
+ {
118
+ "epoch": 0.064,
119
+ "grad_norm": 1.0322489738464355,
120
+ "learning_rate": 2.1066666666666665e-07,
121
+ "logits/chosen": 1.4586353302001953,
122
+ "logits/rejected": 1.4842995405197144,
123
+ "logps/chosen": -149.03665161132812,
124
+ "logps/rejected": -136.24081420898438,
125
+ "loss": 0.6932015895843506,
126
+ "rewards/accuracies": 0.512499988079071,
127
+ "rewards/chosen": 0.004061593674123287,
128
+ "rewards/margins": -9.99594121822156e-05,
129
+ "rewards/rejected": 0.004161553457379341,
130
+ "step": 80
131
+ },
132
+ {
133
+ "epoch": 0.072,
134
+ "grad_norm": 1.0012447834014893,
135
+ "learning_rate": 2.3733333333333334e-07,
136
+ "logits/chosen": 1.5075902938842773,
137
+ "logits/rejected": 1.5986944437026978,
138
+ "logps/chosen": -159.3433074951172,
139
+ "logps/rejected": -151.85072326660156,
140
+ "loss": 0.6928757190704345,
141
+ "rewards/accuracies": 0.5249999761581421,
142
+ "rewards/chosen": 0.006978762801736593,
143
+ "rewards/margins": 0.0005627512582577765,
144
+ "rewards/rejected": 0.006416010670363903,
145
+ "step": 90
146
+ },
147
+ {
148
+ "epoch": 0.08,
149
+ "grad_norm": 1.280081033706665,
150
+ "learning_rate": 2.64e-07,
151
+ "logits/chosen": 1.448286533355713,
152
+ "logits/rejected": 1.5480695962905884,
153
+ "logps/chosen": -151.12625122070312,
154
+ "logps/rejected": -171.24545288085938,
155
+ "loss": 0.6931126117706299,
156
+ "rewards/accuracies": 0.4749999940395355,
157
+ "rewards/chosen": 0.00693341251462698,
158
+ "rewards/margins": 9.057526767719537e-05,
159
+ "rewards/rejected": 0.006842837668955326,
160
+ "step": 100
161
+ },
162
+ {
163
+ "epoch": 0.08,
164
+ "eval_logits/chosen": 1.518183946609497,
165
+ "eval_logits/rejected": 1.5525447130203247,
166
+ "eval_logps/chosen": -153.46630859375,
167
+ "eval_logps/rejected": -147.95005798339844,
168
+ "eval_loss": 0.6926330327987671,
169
+ "eval_rewards/accuracies": 0.5600000023841858,
170
+ "eval_rewards/chosen": 0.008253541775047779,
171
+ "eval_rewards/margins": 0.0010449704714119434,
172
+ "eval_rewards/rejected": 0.007208569906651974,
173
+ "eval_runtime": 91.0812,
174
+ "eval_samples_per_second": 5.49,
175
+ "eval_steps_per_second": 2.745,
176
+ "step": 100
177
+ },
178
+ {
179
+ "epoch": 0.088,
180
+ "grad_norm": 1.3912373781204224,
181
+ "learning_rate": 2.906666666666667e-07,
182
+ "logits/chosen": 1.6896222829818726,
183
+ "logits/rejected": 1.6516921520233154,
184
+ "logps/chosen": -173.70030212402344,
185
+ "logps/rejected": -155.5503692626953,
186
+ "loss": 0.6926521778106689,
187
+ "rewards/accuracies": 0.550000011920929,
188
+ "rewards/chosen": 0.010179834440350533,
189
+ "rewards/margins": 0.0010137532372027636,
190
+ "rewards/rejected": 0.009166081435978413,
191
+ "step": 110
192
+ },
193
+ {
194
+ "epoch": 0.096,
195
+ "grad_norm": 1.1518888473510742,
196
+ "learning_rate": 3.173333333333333e-07,
197
+ "logits/chosen": 1.5166860818862915,
198
+ "logits/rejected": 1.5136979818344116,
199
+ "logps/chosen": -136.39906311035156,
200
+ "logps/rejected": -140.86561584472656,
201
+ "loss": 0.6932918548583984,
202
+ "rewards/accuracies": 0.4375,
203
+ "rewards/chosen": 0.00907944981008768,
204
+ "rewards/margins": -0.00026998287648893893,
205
+ "rewards/rejected": 0.009349432773888111,
206
+ "step": 120
207
+ },
208
+ {
209
+ "epoch": 0.104,
210
+ "grad_norm": 1.3213376998901367,
211
+ "learning_rate": 3.4399999999999996e-07,
212
+ "logits/chosen": 1.4598973989486694,
213
+ "logits/rejected": 1.6739647388458252,
214
+ "logps/chosen": -159.12461853027344,
215
+ "logps/rejected": -163.39346313476562,
216
+ "loss": 0.693086051940918,
217
+ "rewards/accuracies": 0.550000011920929,
218
+ "rewards/chosen": 0.010477779433131218,
219
+ "rewards/margins": 0.00014539004769176245,
220
+ "rewards/rejected": 0.010332388803362846,
221
+ "step": 130
222
+ },
223
+ {
224
+ "epoch": 0.112,
225
+ "grad_norm": 0.8833863735198975,
226
+ "learning_rate": 3.7066666666666665e-07,
227
+ "logits/chosen": 1.3829412460327148,
228
+ "logits/rejected": 1.4100173711776733,
229
+ "logps/chosen": -132.7149658203125,
230
+ "logps/rejected": -126.85478210449219,
231
+ "loss": 0.6932382583618164,
232
+ "rewards/accuracies": 0.4625000059604645,
233
+ "rewards/chosen": 0.010262183845043182,
234
+ "rewards/margins": -0.0001672578218858689,
235
+ "rewards/rejected": 0.010429441928863525,
236
+ "step": 140
237
+ },
238
+ {
239
+ "epoch": 0.12,
240
+ "grad_norm": 1.1472455263137817,
241
+ "learning_rate": 3.973333333333333e-07,
242
+ "logits/chosen": 1.4954869747161865,
243
+ "logits/rejected": 1.5857340097427368,
244
+ "logps/chosen": -165.10568237304688,
245
+ "logps/rejected": -158.97433471679688,
246
+ "loss": 0.6916103363037109,
247
+ "rewards/accuracies": 0.5375000238418579,
248
+ "rewards/chosen": 0.014177520759403706,
249
+ "rewards/margins": 0.003108317730948329,
250
+ "rewards/rejected": 0.011069201864302158,
251
+ "step": 150
252
+ },
253
+ {
254
+ "epoch": 0.128,
255
+ "grad_norm": 1.0628798007965088,
256
+ "learning_rate": 4.24e-07,
257
+ "logits/chosen": 1.6717243194580078,
258
+ "logits/rejected": 1.705092191696167,
259
+ "logps/chosen": -159.332275390625,
260
+ "logps/rejected": -147.21646118164062,
261
+ "loss": 0.6935629844665527,
262
+ "rewards/accuracies": 0.4625000059604645,
263
+ "rewards/chosen": 0.016994481906294823,
264
+ "rewards/margins": -0.0007874655420891941,
265
+ "rewards/rejected": 0.017781946808099747,
266
+ "step": 160
267
+ },
268
+ {
269
+ "epoch": 0.136,
270
+ "grad_norm": 1.390810251235962,
271
+ "learning_rate": 4.506666666666666e-07,
272
+ "logits/chosen": 1.4719234704971313,
273
+ "logits/rejected": 1.499526023864746,
274
+ "logps/chosen": -148.67251586914062,
275
+ "logps/rejected": -161.58021545410156,
276
+ "loss": 0.6926599979400635,
277
+ "rewards/accuracies": 0.48750001192092896,
278
+ "rewards/chosen": 0.021239640191197395,
279
+ "rewards/margins": 0.0010346340714022517,
280
+ "rewards/rejected": 0.020205006003379822,
281
+ "step": 170
282
+ },
283
+ {
284
+ "epoch": 0.144,
285
+ "grad_norm": 1.139707326889038,
286
+ "learning_rate": 4.773333333333333e-07,
287
+ "logits/chosen": 1.4833905696868896,
288
+ "logits/rejected": 1.637584924697876,
289
+ "logps/chosen": -158.96731567382812,
290
+ "logps/rejected": -161.9830322265625,
291
+ "loss": 0.6919547557830811,
292
+ "rewards/accuracies": 0.5874999761581421,
293
+ "rewards/chosen": 0.019416773691773415,
294
+ "rewards/margins": 0.0024296878837049007,
295
+ "rewards/rejected": 0.016987085342407227,
296
+ "step": 180
297
+ },
298
+ {
299
+ "epoch": 0.152,
300
+ "grad_norm": 1.2029796838760376,
301
+ "learning_rate": 5.04e-07,
302
+ "logits/chosen": 1.5398027896881104,
303
+ "logits/rejected": 1.6678705215454102,
304
+ "logps/chosen": -143.00283813476562,
305
+ "logps/rejected": -144.89378356933594,
306
+ "loss": 0.692262840270996,
307
+ "rewards/accuracies": 0.5249999761581421,
308
+ "rewards/chosen": 0.018960200250148773,
309
+ "rewards/margins": 0.0018175147706642747,
310
+ "rewards/rejected": 0.017142686992883682,
311
+ "step": 190
312
+ },
313
+ {
314
+ "epoch": 0.16,
315
+ "grad_norm": 1.0527437925338745,
316
+ "learning_rate": 5.306666666666665e-07,
317
+ "logits/chosen": 1.6087623834609985,
318
+ "logits/rejected": 1.5181543827056885,
319
+ "logps/chosen": -159.35073852539062,
320
+ "logps/rejected": -133.59890747070312,
321
+ "loss": 0.6924069881439209,
322
+ "rewards/accuracies": 0.5,
323
+ "rewards/chosen": 0.01926913857460022,
324
+ "rewards/margins": 0.0015339398523792624,
325
+ "rewards/rejected": 0.0177351962774992,
326
+ "step": 200
327
+ },
328
+ {
329
+ "epoch": 0.16,
330
+ "eval_logits/chosen": 1.5193672180175781,
331
+ "eval_logits/rejected": 1.5534639358520508,
332
+ "eval_logps/chosen": -153.34857177734375,
333
+ "eval_logps/rejected": -147.8425750732422,
334
+ "eval_loss": 0.6921458840370178,
335
+ "eval_rewards/accuracies": 0.5600000023841858,
336
+ "eval_rewards/chosen": 0.020027177408337593,
337
+ "eval_rewards/margins": 0.0020714527927339077,
338
+ "eval_rewards/rejected": 0.017955724149942398,
339
+ "eval_runtime": 90.9762,
340
+ "eval_samples_per_second": 5.496,
341
+ "eval_steps_per_second": 2.748,
342
+ "step": 200
343
+ },
344
+ {
345
+ "epoch": 0.168,
346
+ "grad_norm": 1.1897239685058594,
347
+ "learning_rate": 5.573333333333333e-07,
348
+ "logits/chosen": 1.6267732381820679,
349
+ "logits/rejected": 1.5775268077850342,
350
+ "logps/chosen": -159.51437377929688,
351
+ "logps/rejected": -170.17762756347656,
352
+ "loss": 0.6925770759582519,
353
+ "rewards/accuracies": 0.48750001192092896,
354
+ "rewards/chosen": 0.02078000269830227,
355
+ "rewards/margins": 0.0012053751852363348,
356
+ "rewards/rejected": 0.01957462914288044,
357
+ "step": 210
358
+ },
359
+ {
360
+ "epoch": 0.176,
361
+ "grad_norm": 0.9995236396789551,
362
+ "learning_rate": 5.839999999999999e-07,
363
+ "logits/chosen": 1.5781934261322021,
364
+ "logits/rejected": 1.642251968383789,
365
+ "logps/chosen": -159.95309448242188,
366
+ "logps/rejected": -144.36058044433594,
367
+ "loss": 0.6931475639343262,
368
+ "rewards/accuracies": 0.512499988079071,
369
+ "rewards/chosen": 0.023322442546486855,
370
+ "rewards/margins": 6.392716750269756e-05,
371
+ "rewards/rejected": 0.02325851283967495,
372
+ "step": 220
373
+ },
374
+ {
375
+ "epoch": 0.184,
376
+ "grad_norm": 0.8819260597229004,
377
+ "learning_rate": 6.106666666666666e-07,
378
+ "logits/chosen": 1.459991693496704,
379
+ "logits/rejected": 1.6060739755630493,
380
+ "logps/chosen": -162.04745483398438,
381
+ "logps/rejected": -139.5411834716797,
382
+ "loss": 0.6919463634490967,
383
+ "rewards/accuracies": 0.512499988079071,
384
+ "rewards/chosen": 0.026052182540297508,
385
+ "rewards/margins": 0.002481849165633321,
386
+ "rewards/rejected": 0.023570330813527107,
387
+ "step": 230
388
+ },
389
+ {
390
+ "epoch": 0.192,
391
+ "grad_norm": 1.1055208444595337,
392
+ "learning_rate": 6.373333333333333e-07,
393
+ "logits/chosen": 1.558739185333252,
394
+ "logits/rejected": 1.5965116024017334,
395
+ "logps/chosen": -171.9166259765625,
396
+ "logps/rejected": -145.66433715820312,
397
+ "loss": 0.6912176132202148,
398
+ "rewards/accuracies": 0.5874999761581421,
399
+ "rewards/chosen": 0.033868782222270966,
400
+ "rewards/margins": 0.004017929546535015,
401
+ "rewards/rejected": 0.029850851744413376,
402
+ "step": 240
403
+ },
404
+ {
405
+ "epoch": 0.2,
406
+ "grad_norm": 1.1372733116149902,
407
+ "learning_rate": 6.64e-07,
408
+ "logits/chosen": 1.5322860479354858,
409
+ "logits/rejected": 1.4695795774459839,
410
+ "logps/chosen": -161.7803955078125,
411
+ "logps/rejected": -148.91949462890625,
412
+ "loss": 0.6940969944000244,
413
+ "rewards/accuracies": 0.42500001192092896,
414
+ "rewards/chosen": 0.034065864980220795,
415
+ "rewards/margins": -0.0016761379083618522,
416
+ "rewards/rejected": 0.035742007195949554,
417
+ "step": 250
418
+ },
419
+ {
420
+ "epoch": 0.208,
421
+ "grad_norm": 1.0917680263519287,
422
+ "learning_rate": 6.906666666666666e-07,
423
+ "logits/chosen": 1.4094547033309937,
424
+ "logits/rejected": 1.50186026096344,
425
+ "logps/chosen": -132.91455078125,
426
+ "logps/rejected": -146.79714965820312,
427
+ "loss": 0.6930203914642334,
428
+ "rewards/accuracies": 0.4124999940395355,
429
+ "rewards/chosen": 0.03089061938226223,
430
+ "rewards/margins": 0.00040971505222842097,
431
+ "rewards/rejected": 0.030480902642011642,
432
+ "step": 260
433
+ },
434
+ {
435
+ "epoch": 0.216,
436
+ "grad_norm": 1.0280259847640991,
437
+ "learning_rate": 7.173333333333333e-07,
438
+ "logits/chosen": 1.531285285949707,
439
+ "logits/rejected": 1.6163132190704346,
440
+ "logps/chosen": -146.902587890625,
441
+ "logps/rejected": -147.8347625732422,
442
+ "loss": 0.6944749355316162,
443
+ "rewards/accuracies": 0.512499988079071,
444
+ "rewards/chosen": 0.028514739125967026,
445
+ "rewards/margins": -0.0025452517438679934,
446
+ "rewards/rejected": 0.0310599897056818,
447
+ "step": 270
448
+ },
449
+ {
450
+ "epoch": 0.224,
451
+ "grad_norm": 1.398708701133728,
452
+ "learning_rate": 7.44e-07,
453
+ "logits/chosen": 1.563852071762085,
454
+ "logits/rejected": 1.396703839302063,
455
+ "logps/chosen": -172.68153381347656,
456
+ "logps/rejected": -155.80215454101562,
457
+ "loss": 0.689476490020752,
458
+ "rewards/accuracies": 0.625,
459
+ "rewards/chosen": 0.03224216774106026,
460
+ "rewards/margins": 0.007473687641322613,
461
+ "rewards/rejected": 0.02476847730576992,
462
+ "step": 280
463
+ },
464
+ {
465
+ "epoch": 0.232,
466
+ "grad_norm": 1.0394141674041748,
467
+ "learning_rate": 7.706666666666667e-07,
468
+ "logits/chosen": 1.5581729412078857,
469
+ "logits/rejected": 1.5819021463394165,
470
+ "logps/chosen": -144.41465759277344,
471
+ "logps/rejected": -153.55406188964844,
472
+ "loss": 0.6927877902984619,
473
+ "rewards/accuracies": 0.550000011920929,
474
+ "rewards/chosen": 0.03272664546966553,
475
+ "rewards/margins": 0.0008557910332456231,
476
+ "rewards/rejected": 0.031870849430561066,
477
+ "step": 290
478
+ },
479
+ {
480
+ "epoch": 0.24,
481
+ "grad_norm": 1.2557185888290405,
482
+ "learning_rate": 7.973333333333333e-07,
483
+ "logits/chosen": 1.5062446594238281,
484
+ "logits/rejected": 1.5786281824111938,
485
+ "logps/chosen": -153.63558959960938,
486
+ "logps/rejected": -165.46617126464844,
487
+ "loss": 0.6897892951965332,
488
+ "rewards/accuracies": 0.625,
489
+ "rewards/chosen": 0.03914078325033188,
490
+ "rewards/margins": 0.006873926613479853,
491
+ "rewards/rejected": 0.032266855239868164,
492
+ "step": 300
493
+ },
494
+ {
495
+ "epoch": 0.24,
496
+ "eval_logits/chosen": 1.5237834453582764,
497
+ "eval_logits/rejected": 1.557741641998291,
498
+ "eval_logps/chosen": -153.19883728027344,
499
+ "eval_logps/rejected": -147.7100830078125,
500
+ "eval_loss": 0.6913501024246216,
501
+ "eval_rewards/accuracies": 0.5339999794960022,
502
+ "eval_rewards/chosen": 0.03499976545572281,
503
+ "eval_rewards/margins": 0.0037925743963569403,
504
+ "eval_rewards/rejected": 0.031207194551825523,
505
+ "eval_runtime": 90.9133,
506
+ "eval_samples_per_second": 5.5,
507
+ "eval_steps_per_second": 2.75,
508
+ "step": 300
509
+ },
510
+ {
511
+ "epoch": 0.248,
512
+ "grad_norm": 1.1200144290924072,
513
+ "learning_rate": 8.24e-07,
514
+ "logits/chosen": 1.618949294090271,
515
+ "logits/rejected": 1.6625702381134033,
516
+ "logps/chosen": -141.8953857421875,
517
+ "logps/rejected": -156.6927032470703,
518
+ "loss": 0.694425106048584,
519
+ "rewards/accuracies": 0.4124999940395355,
520
+ "rewards/chosen": 0.03162423521280289,
521
+ "rewards/margins": -0.0023833750747144222,
522
+ "rewards/rejected": 0.034007612615823746,
523
+ "step": 310
524
+ },
525
+ {
526
+ "epoch": 0.256,
527
+ "grad_norm": 1.0823743343353271,
528
+ "learning_rate": 8.506666666666667e-07,
529
+ "logits/chosen": 1.5974153280258179,
530
+ "logits/rejected": 1.6237560510635376,
531
+ "logps/chosen": -139.2996368408203,
532
+ "logps/rejected": -141.94407653808594,
533
+ "loss": 0.6914949893951416,
534
+ "rewards/accuracies": 0.574999988079071,
535
+ "rewards/chosen": 0.029445698484778404,
536
+ "rewards/margins": 0.0034329916816204786,
537
+ "rewards/rejected": 0.026012707501649857,
538
+ "step": 320
539
+ },
540
+ {
541
+ "epoch": 0.264,
542
+ "grad_norm": 1.1765722036361694,
543
+ "learning_rate": 8.773333333333332e-07,
544
+ "logits/chosen": 1.465215802192688,
545
+ "logits/rejected": 1.4882014989852905,
546
+ "logps/chosen": -158.4329833984375,
547
+ "logps/rejected": -183.94259643554688,
548
+ "loss": 0.6925934791564942,
549
+ "rewards/accuracies": 0.5,
550
+ "rewards/chosen": 0.026302698999643326,
551
+ "rewards/margins": 0.0012350418837741017,
552
+ "rewards/rejected": 0.025067657232284546,
553
+ "step": 330
554
+ },
555
+ {
556
+ "epoch": 0.272,
557
+ "grad_norm": 1.0667294263839722,
558
+ "learning_rate": 9.039999999999999e-07,
559
+ "logits/chosen": 1.7538875341415405,
560
+ "logits/rejected": 1.6171789169311523,
561
+ "logps/chosen": -159.63790893554688,
562
+ "logps/rejected": -151.1880340576172,
563
+ "loss": 0.6922041416168213,
564
+ "rewards/accuracies": 0.5249999761581421,
565
+ "rewards/chosen": 0.026619985699653625,
566
+ "rewards/margins": 0.002031774492934346,
567
+ "rewards/rejected": 0.024588212370872498,
568
+ "step": 340
569
+ },
570
+ {
571
+ "epoch": 0.28,
572
+ "grad_norm": 1.2210006713867188,
573
+ "learning_rate": 9.306666666666666e-07,
574
+ "logits/chosen": 1.5815479755401611,
575
+ "logits/rejected": 1.710217833518982,
576
+ "logps/chosen": -167.21743774414062,
577
+ "logps/rejected": -196.77989196777344,
578
+ "loss": 0.6899692058563233,
579
+ "rewards/accuracies": 0.6000000238418579,
580
+ "rewards/chosen": 0.03636014088988304,
581
+ "rewards/margins": 0.006536121014505625,
582
+ "rewards/rejected": 0.029824022203683853,
583
+ "step": 350
584
+ },
585
+ {
586
+ "epoch": 0.288,
587
+ "grad_norm": 0.9623845219612122,
588
+ "learning_rate": 9.573333333333333e-07,
589
+ "logits/chosen": 1.4416790008544922,
590
+ "logits/rejected": 1.3618693351745605,
591
+ "logps/chosen": -163.27401733398438,
592
+ "logps/rejected": -169.3351287841797,
593
+ "loss": 0.6910821437835694,
594
+ "rewards/accuracies": 0.550000011920929,
595
+ "rewards/chosen": 0.037200987339019775,
596
+ "rewards/margins": 0.004338678438216448,
597
+ "rewards/rejected": 0.032862309366464615,
598
+ "step": 360
599
+ },
600
+ {
601
+ "epoch": 0.296,
602
+ "grad_norm": 1.1005645990371704,
603
+ "learning_rate": 9.84e-07,
604
+ "logits/chosen": 1.6214616298675537,
605
+ "logits/rejected": 1.6572606563568115,
606
+ "logps/chosen": -162.95504760742188,
607
+ "logps/rejected": -154.91807556152344,
608
+ "loss": 0.6895030021667481,
609
+ "rewards/accuracies": 0.5874999761581421,
610
+ "rewards/chosen": 0.047198958694934845,
611
+ "rewards/margins": 0.0075958347879350185,
612
+ "rewards/rejected": 0.03960312157869339,
613
+ "step": 370
614
+ },
615
+ {
616
+ "epoch": 0.304,
617
+ "grad_norm": 1.1126338243484497,
618
+ "learning_rate": 9.988148148148148e-07,
619
+ "logits/chosen": 1.4842126369476318,
620
+ "logits/rejected": 1.410930871963501,
621
+ "logps/chosen": -143.09182739257812,
622
+ "logps/rejected": -153.48135375976562,
623
+ "loss": 0.6885370254516602,
624
+ "rewards/accuracies": 0.5375000238418579,
625
+ "rewards/chosen": 0.05975018069148064,
626
+ "rewards/margins": 0.00981281790882349,
627
+ "rewards/rejected": 0.04993735998868942,
628
+ "step": 380
629
+ },
630
+ {
631
+ "epoch": 0.312,
632
+ "grad_norm": 1.1146304607391357,
633
+ "learning_rate": 9.95851851851852e-07,
634
+ "logits/chosen": 1.608332633972168,
635
+ "logits/rejected": 1.5848472118377686,
636
+ "logps/chosen": -152.25250244140625,
637
+ "logps/rejected": -138.348876953125,
638
+ "loss": 0.6851807117462159,
639
+ "rewards/accuracies": 0.612500011920929,
640
+ "rewards/chosen": 0.07630724459886551,
641
+ "rewards/margins": 0.016460279002785683,
642
+ "rewards/rejected": 0.059846967458724976,
643
+ "step": 390
644
+ },
645
+ {
646
+ "epoch": 0.32,
647
+ "grad_norm": 1.2686601877212524,
648
+ "learning_rate": 9.928888888888889e-07,
649
+ "logits/chosen": 1.5840070247650146,
650
+ "logits/rejected": 1.5679785013198853,
651
+ "logps/chosen": -155.90219116210938,
652
+ "logps/rejected": -148.89549255371094,
653
+ "loss": 0.6891039371490478,
654
+ "rewards/accuracies": 0.512499988079071,
655
+ "rewards/chosen": 0.08129279315471649,
656
+ "rewards/margins": 0.008997146971523762,
657
+ "rewards/rejected": 0.072295643389225,
658
+ "step": 400
659
+ },
660
+ {
661
+ "epoch": 0.32,
662
+ "eval_logits/chosen": 1.5358340740203857,
663
+ "eval_logits/rejected": 1.5689741373062134,
664
+ "eval_logps/chosen": -152.6895294189453,
665
+ "eval_logps/rejected": -147.23422241210938,
666
+ "eval_loss": 0.6900544166564941,
667
+ "eval_rewards/accuracies": 0.5419999957084656,
668
+ "eval_rewards/chosen": 0.08593228459358215,
669
+ "eval_rewards/margins": 0.0071397931315004826,
670
+ "eval_rewards/rejected": 0.0787924975156784,
671
+ "eval_runtime": 90.9072,
672
+ "eval_samples_per_second": 5.5,
673
+ "eval_steps_per_second": 2.75,
674
+ "step": 400
675
+ },
676
+ {
677
+ "epoch": 0.328,
678
+ "grad_norm": 1.1653108596801758,
679
+ "learning_rate": 9.899259259259258e-07,
680
+ "logits/chosen": 1.5450490713119507,
681
+ "logits/rejected": 1.4428151845932007,
682
+ "logps/chosen": -148.61581420898438,
683
+ "logps/rejected": -188.90919494628906,
684
+ "loss": 0.6918989181518554,
685
+ "rewards/accuracies": 0.5249999761581421,
686
+ "rewards/chosen": 0.0946170911192894,
687
+ "rewards/margins": 0.003271549940109253,
688
+ "rewards/rejected": 0.09134554117918015,
689
+ "step": 410
690
+ },
691
+ {
692
+ "epoch": 0.336,
693
+ "grad_norm": 1.3114937543869019,
694
+ "learning_rate": 9.86962962962963e-07,
695
+ "logits/chosen": 1.5342715978622437,
696
+ "logits/rejected": 1.5572118759155273,
697
+ "logps/chosen": -152.65264892578125,
698
+ "logps/rejected": -168.03384399414062,
699
+ "loss": 0.6972955703735352,
700
+ "rewards/accuracies": 0.42500001192092896,
701
+ "rewards/chosen": 0.08869597315788269,
702
+ "rewards/margins": -0.007436127867549658,
703
+ "rewards/rejected": 0.09613210707902908,
704
+ "step": 420
705
+ },
706
+ {
707
+ "epoch": 0.344,
708
+ "grad_norm": 1.0040781497955322,
709
+ "learning_rate": 9.84e-07,
710
+ "logits/chosen": 1.5979712009429932,
711
+ "logits/rejected": 1.6713664531707764,
712
+ "logps/chosen": -164.2566375732422,
713
+ "logps/rejected": -152.1428680419922,
714
+ "loss": 0.6897010326385498,
715
+ "rewards/accuracies": 0.48750001192092896,
716
+ "rewards/chosen": 0.08101107180118561,
717
+ "rewards/margins": 0.007684691343456507,
718
+ "rewards/rejected": 0.07332637906074524,
719
+ "step": 430
720
+ },
721
+ {
722
+ "epoch": 0.352,
723
+ "grad_norm": 1.0801892280578613,
724
+ "learning_rate": 9.81037037037037e-07,
725
+ "logits/chosen": 1.439635992050171,
726
+ "logits/rejected": 1.3282365798950195,
727
+ "logps/chosen": -157.23403930664062,
728
+ "logps/rejected": -146.54806518554688,
729
+ "loss": 0.6881664752960205,
730
+ "rewards/accuracies": 0.5874999761581421,
731
+ "rewards/chosen": 0.08112544566392899,
732
+ "rewards/margins": 0.010971959680318832,
733
+ "rewards/rejected": 0.07015348225831985,
734
+ "step": 440
735
+ },
736
+ {
737
+ "epoch": 0.36,
738
+ "grad_norm": 1.1139949560165405,
739
+ "learning_rate": 9.78074074074074e-07,
740
+ "logits/chosen": 1.6345123052597046,
741
+ "logits/rejected": 1.5876491069793701,
742
+ "logps/chosen": -157.6160125732422,
743
+ "logps/rejected": -160.02040100097656,
744
+ "loss": 0.6868602275848389,
745
+ "rewards/accuracies": 0.6000000238418579,
746
+ "rewards/chosen": 0.07638200372457504,
747
+ "rewards/margins": 0.013464884832501411,
748
+ "rewards/rejected": 0.06291711330413818,
749
+ "step": 450
750
+ },
751
+ {
752
+ "epoch": 0.368,
753
+ "grad_norm": 1.158035397529602,
754
+ "learning_rate": 9.751111111111112e-07,
755
+ "logits/chosen": 1.5430586338043213,
756
+ "logits/rejected": 1.6022913455963135,
757
+ "logps/chosen": -151.98919677734375,
758
+ "logps/rejected": -164.90565490722656,
759
+ "loss": 0.6891930580139161,
760
+ "rewards/accuracies": 0.5,
761
+ "rewards/chosen": 0.06889279186725616,
762
+ "rewards/margins": 0.008791089057922363,
763
+ "rewards/rejected": 0.0601017065346241,
764
+ "step": 460
765
+ },
766
+ {
767
+ "epoch": 0.376,
768
+ "grad_norm": 1.0903704166412354,
769
+ "learning_rate": 9.721481481481481e-07,
770
+ "logits/chosen": 1.313881754875183,
771
+ "logits/rejected": 1.3182194232940674,
772
+ "logps/chosen": -131.5592041015625,
773
+ "logps/rejected": -123.84442138671875,
774
+ "loss": 0.6889429569244385,
775
+ "rewards/accuracies": 0.5,
776
+ "rewards/chosen": 0.08150745928287506,
777
+ "rewards/margins": 0.009235726669430733,
778
+ "rewards/rejected": 0.07227173447608948,
779
+ "step": 470
780
+ },
781
+ {
782
+ "epoch": 0.384,
783
+ "grad_norm": 1.0766969919204712,
784
+ "learning_rate": 9.69185185185185e-07,
785
+ "logits/chosen": 1.425756812095642,
786
+ "logits/rejected": 1.476854920387268,
787
+ "logps/chosen": -148.72279357910156,
788
+ "logps/rejected": -164.40951538085938,
789
+ "loss": 0.6854116439819335,
790
+ "rewards/accuracies": 0.5874999761581421,
791
+ "rewards/chosen": 0.1009502187371254,
792
+ "rewards/margins": 0.0166789498180151,
793
+ "rewards/rejected": 0.08427127450704575,
794
+ "step": 480
795
+ },
796
+ {
797
+ "epoch": 0.392,
798
+ "grad_norm": 1.2078129053115845,
799
+ "learning_rate": 9.662222222222222e-07,
800
+ "logits/chosen": 1.5511282682418823,
801
+ "logits/rejected": 1.611383080482483,
802
+ "logps/chosen": -143.22006225585938,
803
+ "logps/rejected": -159.70176696777344,
804
+ "loss": 0.6968960762023926,
805
+ "rewards/accuracies": 0.5,
806
+ "rewards/chosen": 0.08779363334178925,
807
+ "rewards/margins": -0.00598606513813138,
808
+ "rewards/rejected": 0.09377971291542053,
809
+ "step": 490
810
+ },
811
+ {
812
+ "epoch": 0.4,
813
+ "grad_norm": 0.9106306433677673,
814
+ "learning_rate": 9.632592592592593e-07,
815
+ "logits/chosen": 1.4217239618301392,
816
+ "logits/rejected": 1.505446195602417,
817
+ "logps/chosen": -144.34109497070312,
818
+ "logps/rejected": -135.52430725097656,
819
+ "loss": 0.691196870803833,
820
+ "rewards/accuracies": 0.5,
821
+ "rewards/chosen": 0.06902458518743515,
822
+ "rewards/margins": 0.005059319548308849,
823
+ "rewards/rejected": 0.06396526843309402,
824
+ "step": 500
825
+ },
826
+ {
827
+ "epoch": 0.4,
828
+ "eval_logits/chosen": 1.5185325145721436,
829
+ "eval_logits/rejected": 1.5510345697402954,
830
+ "eval_logps/chosen": -152.7921905517578,
831
+ "eval_logps/rejected": -147.3463134765625,
832
+ "eval_loss": 0.6898502111434937,
833
+ "eval_rewards/accuracies": 0.5099999904632568,
834
+ "eval_rewards/chosen": 0.07566512376070023,
835
+ "eval_rewards/margins": 0.008083080872893333,
836
+ "eval_rewards/rejected": 0.06758204847574234,
837
+ "eval_runtime": 90.8294,
838
+ "eval_samples_per_second": 5.505,
839
+ "eval_steps_per_second": 2.752,
840
+ "step": 500
841
+ },
842
+ {
843
+ "epoch": 0.408,
844
+ "grad_norm": 1.1637762784957886,
845
+ "learning_rate": 9.602962962962962e-07,
846
+ "logits/chosen": 1.5965330600738525,
847
+ "logits/rejected": 1.5338464975357056,
848
+ "logps/chosen": -131.49871826171875,
849
+ "logps/rejected": -140.05995178222656,
850
+ "loss": 0.6868171215057373,
851
+ "rewards/accuracies": 0.5874999761581421,
852
+ "rewards/chosen": 0.08385131508111954,
853
+ "rewards/margins": 0.013950209133327007,
854
+ "rewards/rejected": 0.06990110129117966,
855
+ "step": 510
856
+ },
857
+ {
858
+ "epoch": 0.416,
859
+ "grad_norm": 1.229719638824463,
860
+ "learning_rate": 9.573333333333333e-07,
861
+ "logits/chosen": 1.7057275772094727,
862
+ "logits/rejected": 1.5898948907852173,
863
+ "logps/chosen": -166.4267120361328,
864
+ "logps/rejected": -172.22337341308594,
865
+ "loss": 0.6912094116210937,
866
+ "rewards/accuracies": 0.5625,
867
+ "rewards/chosen": 0.07427279651165009,
868
+ "rewards/margins": 0.004661654122173786,
869
+ "rewards/rejected": 0.06961113959550858,
870
+ "step": 520
871
+ },
872
+ {
873
+ "epoch": 0.424,
874
+ "grad_norm": 1.136383295059204,
875
+ "learning_rate": 9.543703703703705e-07,
876
+ "logits/chosen": 1.6207077503204346,
877
+ "logits/rejected": 1.5417520999908447,
878
+ "logps/chosen": -166.81509399414062,
879
+ "logps/rejected": -168.5408172607422,
880
+ "loss": 0.6772748470306397,
881
+ "rewards/accuracies": 0.637499988079071,
882
+ "rewards/chosen": 0.0953373983502388,
883
+ "rewards/margins": 0.033697597682476044,
884
+ "rewards/rejected": 0.06163979694247246,
885
+ "step": 530
886
+ },
887
+ {
888
+ "epoch": 0.432,
889
+ "grad_norm": 1.0640541315078735,
890
+ "learning_rate": 9.514074074074074e-07,
891
+ "logits/chosen": 1.6825854778289795,
892
+ "logits/rejected": 1.6327041387557983,
893
+ "logps/chosen": -181.39956665039062,
894
+ "logps/rejected": -158.49647521972656,
895
+ "loss": 0.6883615493774414,
896
+ "rewards/accuracies": 0.5625,
897
+ "rewards/chosen": 0.10187923908233643,
898
+ "rewards/margins": 0.010873330757021904,
899
+ "rewards/rejected": 0.09100590646266937,
900
+ "step": 540
901
+ },
902
+ {
903
+ "epoch": 0.44,
904
+ "grad_norm": 1.477909803390503,
905
+ "learning_rate": 9.484444444444444e-07,
906
+ "logits/chosen": 1.5402686595916748,
907
+ "logits/rejected": 1.6487388610839844,
908
+ "logps/chosen": -151.53814697265625,
909
+ "logps/rejected": -167.8573455810547,
910
+ "loss": 0.6915213584899902,
911
+ "rewards/accuracies": 0.512499988079071,
912
+ "rewards/chosen": 0.09472338110208511,
913
+ "rewards/margins": 0.005475065670907497,
914
+ "rewards/rejected": 0.08924831449985504,
915
+ "step": 550
916
+ },
917
+ {
918
+ "epoch": 0.448,
919
+ "grad_norm": 1.0138317346572876,
920
+ "learning_rate": 9.454814814814814e-07,
921
+ "logits/chosen": 1.6286503076553345,
922
+ "logits/rejected": 1.5223101377487183,
923
+ "logps/chosen": -154.72079467773438,
924
+ "logps/rejected": -139.2535858154297,
925
+ "loss": 0.6835216522216797,
926
+ "rewards/accuracies": 0.5625,
927
+ "rewards/chosen": 0.09194488823413849,
928
+ "rewards/margins": 0.02140027843415737,
929
+ "rewards/rejected": 0.07054460793733597,
930
+ "step": 560
931
+ },
932
+ {
933
+ "epoch": 0.456,
934
+ "grad_norm": 0.896689772605896,
935
+ "learning_rate": 9.425185185185184e-07,
936
+ "logits/chosen": 1.4857146739959717,
937
+ "logits/rejected": 1.4369819164276123,
938
+ "logps/chosen": -158.31809997558594,
939
+ "logps/rejected": -152.33676147460938,
940
+ "loss": 0.6863360404968262,
941
+ "rewards/accuracies": 0.574999988079071,
942
+ "rewards/chosen": 0.07895292341709137,
943
+ "rewards/margins": 0.016260787844657898,
944
+ "rewards/rejected": 0.06269213557243347,
945
+ "step": 570
946
+ },
947
+ {
948
+ "epoch": 0.464,
949
+ "grad_norm": 1.4772330522537231,
950
+ "learning_rate": 9.395555555555556e-07,
951
+ "logits/chosen": 1.315298318862915,
952
+ "logits/rejected": 1.4757534265518188,
953
+ "logps/chosen": -136.4202117919922,
954
+ "logps/rejected": -154.99362182617188,
955
+ "loss": 0.6916167259216308,
956
+ "rewards/accuracies": 0.550000011920929,
957
+ "rewards/chosen": 0.04677204787731171,
958
+ "rewards/margins": 0.005649724509567022,
959
+ "rewards/rejected": 0.041122324764728546,
960
+ "step": 580
961
+ },
962
+ {
963
+ "epoch": 0.472,
964
+ "grad_norm": 0.9806157350540161,
965
+ "learning_rate": 9.365925925925926e-07,
966
+ "logits/chosen": 1.465427279472351,
967
+ "logits/rejected": 1.459496259689331,
968
+ "logps/chosen": -145.07470703125,
969
+ "logps/rejected": -131.71974182128906,
970
+ "loss": 0.6857181549072265,
971
+ "rewards/accuracies": 0.5249999761581421,
972
+ "rewards/chosen": 0.07979898899793625,
973
+ "rewards/margins": 0.016965944319963455,
974
+ "rewards/rejected": 0.0628330409526825,
975
+ "step": 590
976
+ },
977
+ {
978
+ "epoch": 0.48,
979
+ "grad_norm": 1.4313396215438843,
980
+ "learning_rate": 9.336296296296295e-07,
981
+ "logits/chosen": 1.5190367698669434,
982
+ "logits/rejected": 1.563504934310913,
983
+ "logps/chosen": -132.0201416015625,
984
+ "logps/rejected": -158.89865112304688,
985
+ "loss": 0.6964848041534424,
986
+ "rewards/accuracies": 0.5625,
987
+ "rewards/chosen": 0.06544725596904755,
988
+ "rewards/margins": -0.004280097782611847,
989
+ "rewards/rejected": 0.0697273463010788,
990
+ "step": 600
991
+ },
992
+ {
993
+ "epoch": 0.48,
994
+ "eval_logits/chosen": 1.5031698942184448,
995
+ "eval_logits/rejected": 1.5348584651947021,
996
+ "eval_logps/chosen": -152.76596069335938,
997
+ "eval_logps/rejected": -147.3241729736328,
998
+ "eval_loss": 0.6903045773506165,
999
+ "eval_rewards/accuracies": 0.527999997138977,
1000
+ "eval_rewards/chosen": 0.07828804850578308,
1001
+ "eval_rewards/margins": 0.008492168970406055,
1002
+ "eval_rewards/rejected": 0.0697958841919899,
1003
+ "eval_runtime": 90.9881,
1004
+ "eval_samples_per_second": 5.495,
1005
+ "eval_steps_per_second": 2.748,
1006
+ "step": 600
1007
+ },
1008
+ {
1009
+ "epoch": 0.488,
1010
+ "grad_norm": 0.9467533230781555,
1011
+ "learning_rate": 9.306666666666666e-07,
1012
+ "logits/chosen": 1.5242283344268799,
1013
+ "logits/rejected": 1.5798314809799194,
1014
+ "logps/chosen": -131.90048217773438,
1015
+ "logps/rejected": -149.93222045898438,
1016
+ "loss": 0.6899854183197022,
1017
+ "rewards/accuracies": 0.5375000238418579,
1018
+ "rewards/chosen": 0.08419154584407806,
1019
+ "rewards/margins": 0.00830511748790741,
1020
+ "rewards/rejected": 0.07588644325733185,
1021
+ "step": 610
1022
+ },
1023
+ {
1024
+ "epoch": 0.496,
1025
+ "grad_norm": 1.8463492393493652,
1026
+ "learning_rate": 9.277037037037037e-07,
1027
+ "logits/chosen": 1.4965155124664307,
1028
+ "logits/rejected": 1.6336272954940796,
1029
+ "logps/chosen": -156.1863555908203,
1030
+ "logps/rejected": -167.74075317382812,
1031
+ "loss": 0.6977802276611328,
1032
+ "rewards/accuracies": 0.48750001192092896,
1033
+ "rewards/chosen": 0.05518193915486336,
1034
+ "rewards/margins": -0.006808985956013203,
1035
+ "rewards/rejected": 0.061990927904844284,
1036
+ "step": 620
1037
+ },
1038
+ {
1039
+ "epoch": 0.504,
1040
+ "grad_norm": 1.278316617012024,
1041
+ "learning_rate": 9.247407407407407e-07,
1042
+ "logits/chosen": 1.5074065923690796,
1043
+ "logits/rejected": 1.535696029663086,
1044
+ "logps/chosen": -167.56777954101562,
1045
+ "logps/rejected": -147.04922485351562,
1046
+ "loss": 0.6806219100952149,
1047
+ "rewards/accuracies": 0.6000000238418579,
1048
+ "rewards/chosen": 0.0745314508676529,
1049
+ "rewards/margins": 0.027104835957288742,
1050
+ "rewards/rejected": 0.047426607459783554,
1051
+ "step": 630
1052
+ },
1053
+ {
1054
+ "epoch": 0.512,
1055
+ "grad_norm": 1.0424679517745972,
1056
+ "learning_rate": 9.217777777777778e-07,
1057
+ "logits/chosen": 1.4299286603927612,
1058
+ "logits/rejected": 1.4484076499938965,
1059
+ "logps/chosen": -168.305908203125,
1060
+ "logps/rejected": -152.61854553222656,
1061
+ "loss": 0.6871783256530761,
1062
+ "rewards/accuracies": 0.48750001192092896,
1063
+ "rewards/chosen": 0.07751411944627762,
1064
+ "rewards/margins": 0.017045259475708008,
1065
+ "rewards/rejected": 0.06046885997056961,
1066
+ "step": 640
1067
+ },
1068
+ {
1069
+ "epoch": 0.52,
1070
+ "grad_norm": 1.1143999099731445,
1071
+ "learning_rate": 9.188148148148148e-07,
1072
+ "logits/chosen": 1.5520436763763428,
1073
+ "logits/rejected": 1.531718134880066,
1074
+ "logps/chosen": -154.92115783691406,
1075
+ "logps/rejected": -126.78369140625,
1076
+ "loss": 0.6889286518096924,
1077
+ "rewards/accuracies": 0.4749999940395355,
1078
+ "rewards/chosen": 0.0945608839392662,
1079
+ "rewards/margins": 0.01080345083028078,
1080
+ "rewards/rejected": 0.0837574377655983,
1081
+ "step": 650
1082
+ },
1083
+ {
1084
+ "epoch": 0.528,
1085
+ "grad_norm": 1.2762326002120972,
1086
+ "learning_rate": 9.158518518518517e-07,
1087
+ "logits/chosen": 1.5747522115707397,
1088
+ "logits/rejected": 1.4972833395004272,
1089
+ "logps/chosen": -151.13914489746094,
1090
+ "logps/rejected": -132.013916015625,
1091
+ "loss": 0.6900454998016358,
1092
+ "rewards/accuracies": 0.5375000238418579,
1093
+ "rewards/chosen": 0.07303085923194885,
1094
+ "rewards/margins": 0.008073865436017513,
1095
+ "rewards/rejected": 0.06495700031518936,
1096
+ "step": 660
1097
+ },
1098
+ {
1099
+ "epoch": 0.536,
1100
+ "grad_norm": 1.5207557678222656,
1101
+ "learning_rate": 9.128888888888888e-07,
1102
+ "logits/chosen": 1.6594665050506592,
1103
+ "logits/rejected": 1.468492031097412,
1104
+ "logps/chosen": -166.0108184814453,
1105
+ "logps/rejected": -151.38555908203125,
1106
+ "loss": 0.6934059143066407,
1107
+ "rewards/accuracies": 0.5,
1108
+ "rewards/chosen": 0.09294265508651733,
1109
+ "rewards/margins": 0.0021180796902626753,
1110
+ "rewards/rejected": 0.09082455933094025,
1111
+ "step": 670
1112
+ },
1113
+ {
1114
+ "epoch": 0.544,
1115
+ "grad_norm": 1.1073335409164429,
1116
+ "learning_rate": 9.099259259259259e-07,
1117
+ "logits/chosen": 1.5321934223175049,
1118
+ "logits/rejected": 1.5313541889190674,
1119
+ "logps/chosen": -144.5046844482422,
1120
+ "logps/rejected": -133.05470275878906,
1121
+ "loss": 0.6824594974517822,
1122
+ "rewards/accuracies": 0.6000000238418579,
1123
+ "rewards/chosen": 0.10782947391271591,
1124
+ "rewards/margins": 0.0230915155261755,
1125
+ "rewards/rejected": 0.08473796397447586,
1126
+ "step": 680
1127
+ },
1128
+ {
1129
+ "epoch": 0.552,
1130
+ "grad_norm": 1.096970558166504,
1131
+ "learning_rate": 9.069629629629629e-07,
1132
+ "logits/chosen": 1.438305139541626,
1133
+ "logits/rejected": 1.4338308572769165,
1134
+ "logps/chosen": -173.1280517578125,
1135
+ "logps/rejected": -160.20950317382812,
1136
+ "loss": 0.6878445625305176,
1137
+ "rewards/accuracies": 0.512499988079071,
1138
+ "rewards/chosen": 0.09615986794233322,
1139
+ "rewards/margins": 0.012350928969681263,
1140
+ "rewards/rejected": 0.08380892872810364,
1141
+ "step": 690
1142
+ },
1143
+ {
1144
+ "epoch": 0.56,
1145
+ "grad_norm": 1.3664878606796265,
1146
+ "learning_rate": 9.039999999999999e-07,
1147
+ "logits/chosen": 1.530369758605957,
1148
+ "logits/rejected": 1.5520622730255127,
1149
+ "logps/chosen": -159.60740661621094,
1150
+ "logps/rejected": -137.77352905273438,
1151
+ "loss": 0.6991429805755616,
1152
+ "rewards/accuracies": 0.48750001192092896,
1153
+ "rewards/chosen": 0.07249583303928375,
1154
+ "rewards/margins": -0.009333941154181957,
1155
+ "rewards/rejected": 0.08182977139949799,
1156
+ "step": 700
1157
+ },
1158
+ {
1159
+ "epoch": 0.56,
1160
+ "eval_logits/chosen": 1.5136796236038208,
1161
+ "eval_logits/rejected": 1.5456002950668335,
1162
+ "eval_logps/chosen": -152.5748748779297,
1163
+ "eval_logps/rejected": -147.17152404785156,
1164
+ "eval_loss": 0.688511848449707,
1165
+ "eval_rewards/accuracies": 0.5339999794960022,
1166
+ "eval_rewards/chosen": 0.09739598631858826,
1167
+ "eval_rewards/margins": 0.012335418723523617,
1168
+ "eval_rewards/rejected": 0.08506056666374207,
1169
+ "eval_runtime": 90.8734,
1170
+ "eval_samples_per_second": 5.502,
1171
+ "eval_steps_per_second": 2.751,
1172
+ "step": 700
1173
+ },
1174
+ {
1175
+ "epoch": 0.568,
1176
+ "grad_norm": 1.224107265472412,
1177
+ "learning_rate": 9.010370370370371e-07,
1178
+ "logits/chosen": 1.6023967266082764,
1179
+ "logits/rejected": 1.558224081993103,
1180
+ "logps/chosen": -148.47120666503906,
1181
+ "logps/rejected": -163.5648651123047,
1182
+ "loss": 0.6865221500396729,
1183
+ "rewards/accuracies": 0.5375000238418579,
1184
+ "rewards/chosen": 0.12494877725839615,
1185
+ "rewards/margins": 0.01545787788927555,
1186
+ "rewards/rejected": 0.10949089378118515,
1187
+ "step": 710
1188
+ },
1189
+ {
1190
+ "epoch": 0.576,
1191
+ "grad_norm": 1.2135164737701416,
1192
+ "learning_rate": 8.98074074074074e-07,
1193
+ "logits/chosen": 1.5902750492095947,
1194
+ "logits/rejected": 1.7012746334075928,
1195
+ "logps/chosen": -139.7012939453125,
1196
+ "logps/rejected": -135.42225646972656,
1197
+ "loss": 0.7014039039611817,
1198
+ "rewards/accuracies": 0.4749999940395355,
1199
+ "rewards/chosen": 0.09102506935596466,
1200
+ "rewards/margins": -0.014695463702082634,
1201
+ "rewards/rejected": 0.10572052001953125,
1202
+ "step": 720
1203
+ },
1204
+ {
1205
+ "epoch": 0.584,
1206
+ "grad_norm": 1.1953649520874023,
1207
+ "learning_rate": 8.95111111111111e-07,
1208
+ "logits/chosen": 1.6540731191635132,
1209
+ "logits/rejected": 1.6386349201202393,
1210
+ "logps/chosen": -153.12847900390625,
1211
+ "logps/rejected": -149.66319274902344,
1212
+ "loss": 0.6952126502990723,
1213
+ "rewards/accuracies": 0.5,
1214
+ "rewards/chosen": 0.1124584898352623,
1215
+ "rewards/margins": -0.0012063594767823815,
1216
+ "rewards/rejected": 0.11366486549377441,
1217
+ "step": 730
1218
+ },
1219
+ {
1220
+ "epoch": 0.592,
1221
+ "grad_norm": 1.147875428199768,
1222
+ "learning_rate": 8.921481481481481e-07,
1223
+ "logits/chosen": 1.5578330755233765,
1224
+ "logits/rejected": 1.4689290523529053,
1225
+ "logps/chosen": -156.93788146972656,
1226
+ "logps/rejected": -134.1832733154297,
1227
+ "loss": 0.6919830799102783,
1228
+ "rewards/accuracies": 0.512499988079071,
1229
+ "rewards/chosen": 0.09214536100625992,
1230
+ "rewards/margins": 0.004291228950023651,
1231
+ "rewards/rejected": 0.08785412460565567,
1232
+ "step": 740
1233
+ },
1234
+ {
1235
+ "epoch": 0.6,
1236
+ "grad_norm": 1.21685791015625,
1237
+ "learning_rate": 8.891851851851852e-07,
1238
+ "logits/chosen": 1.4241983890533447,
1239
+ "logits/rejected": 1.4537718296051025,
1240
+ "logps/chosen": -136.9163055419922,
1241
+ "logps/rejected": -158.44491577148438,
1242
+ "loss": 0.691422986984253,
1243
+ "rewards/accuracies": 0.5375000238418579,
1244
+ "rewards/chosen": 0.09173480421304703,
1245
+ "rewards/margins": 0.00533579895272851,
1246
+ "rewards/rejected": 0.08639900386333466,
1247
+ "step": 750
1248
+ },
1249
+ {
1250
+ "epoch": 0.608,
1251
+ "grad_norm": 1.3629519939422607,
1252
+ "learning_rate": 8.862222222222222e-07,
1253
+ "logits/chosen": 1.565716028213501,
1254
+ "logits/rejected": 1.3824570178985596,
1255
+ "logps/chosen": -138.8521728515625,
1256
+ "logps/rejected": -129.51028442382812,
1257
+ "loss": 0.6931031227111817,
1258
+ "rewards/accuracies": 0.5249999761581421,
1259
+ "rewards/chosen": 0.0763697624206543,
1260
+ "rewards/margins": 0.0021251302678138018,
1261
+ "rewards/rejected": 0.07424463331699371,
1262
+ "step": 760
1263
+ },
1264
+ {
1265
+ "epoch": 0.616,
1266
+ "grad_norm": 1.1030747890472412,
1267
+ "learning_rate": 8.832592592592593e-07,
1268
+ "logits/chosen": 1.5781702995300293,
1269
+ "logits/rejected": 1.5428775548934937,
1270
+ "logps/chosen": -141.01480102539062,
1271
+ "logps/rejected": -136.47189331054688,
1272
+ "loss": 0.6777657032012939,
1273
+ "rewards/accuracies": 0.637499988079071,
1274
+ "rewards/chosen": 0.10366280376911163,
1275
+ "rewards/margins": 0.0333605632185936,
1276
+ "rewards/rejected": 0.07030224055051804,
1277
+ "step": 770
1278
+ },
1279
+ {
1280
+ "epoch": 0.624,
1281
+ "grad_norm": 1.3228758573532104,
1282
+ "learning_rate": 8.802962962962962e-07,
1283
+ "logits/chosen": 1.451596736907959,
1284
+ "logits/rejected": 1.5489904880523682,
1285
+ "logps/chosen": -153.74618530273438,
1286
+ "logps/rejected": -159.26333618164062,
1287
+ "loss": 0.6870471000671386,
1288
+ "rewards/accuracies": 0.512499988079071,
1289
+ "rewards/chosen": 0.10403706133365631,
1290
+ "rewards/margins": 0.014494165778160095,
1291
+ "rewards/rejected": 0.08954289555549622,
1292
+ "step": 780
1293
+ },
1294
+ {
1295
+ "epoch": 0.632,
1296
+ "grad_norm": 1.3267385959625244,
1297
+ "learning_rate": 8.773333333333332e-07,
1298
+ "logits/chosen": 1.4153831005096436,
1299
+ "logits/rejected": 1.4497296810150146,
1300
+ "logps/chosen": -148.05368041992188,
1301
+ "logps/rejected": -151.3020477294922,
1302
+ "loss": 0.6877872943878174,
1303
+ "rewards/accuracies": 0.512499988079071,
1304
+ "rewards/chosen": 0.07300253212451935,
1305
+ "rewards/margins": 0.013981563039124012,
1306
+ "rewards/rejected": 0.05902096629142761,
1307
+ "step": 790
1308
+ },
1309
+ {
1310
+ "epoch": 0.64,
1311
+ "grad_norm": 1.2227483987808228,
1312
+ "learning_rate": 8.743703703703703e-07,
1313
+ "logits/chosen": 1.6773914098739624,
1314
+ "logits/rejected": 1.6918518543243408,
1315
+ "logps/chosen": -180.06863403320312,
1316
+ "logps/rejected": -194.30752563476562,
1317
+ "loss": 0.7015917778015137,
1318
+ "rewards/accuracies": 0.48750001192092896,
1319
+ "rewards/chosen": 0.07480724155902863,
1320
+ "rewards/margins": -0.013628068380057812,
1321
+ "rewards/rejected": 0.08843531459569931,
1322
+ "step": 800
1323
+ },
1324
+ {
1325
+ "epoch": 0.64,
1326
+ "eval_logits/chosen": 1.5051140785217285,
1327
+ "eval_logits/rejected": 1.5367870330810547,
1328
+ "eval_logps/chosen": -152.8026885986328,
1329
+ "eval_logps/rejected": -147.39437866210938,
1330
+ "eval_loss": 0.688633918762207,
1331
+ "eval_rewards/accuracies": 0.5220000147819519,
1332
+ "eval_rewards/chosen": 0.07461711764335632,
1333
+ "eval_rewards/margins": 0.011839868500828743,
1334
+ "eval_rewards/rejected": 0.06277725100517273,
1335
+ "eval_runtime": 90.9206,
1336
+ "eval_samples_per_second": 5.499,
1337
+ "eval_steps_per_second": 2.75,
1338
+ "step": 800
1339
+ },
1340
+ {
1341
+ "epoch": 0.648,
1342
+ "grad_norm": 1.209770679473877,
1343
+ "learning_rate": 8.714074074074074e-07,
1344
+ "logits/chosen": 1.3785041570663452,
1345
+ "logits/rejected": 1.60085928440094,
1346
+ "logps/chosen": -114.5325927734375,
1347
+ "logps/rejected": -151.88900756835938,
1348
+ "loss": 0.6890413761138916,
1349
+ "rewards/accuracies": 0.5249999761581421,
1350
+ "rewards/chosen": 0.0640551745891571,
1351
+ "rewards/margins": 0.010126596316695213,
1352
+ "rewards/rejected": 0.05392857640981674,
1353
+ "step": 810
1354
+ },
1355
+ {
1356
+ "epoch": 0.656,
1357
+ "grad_norm": 1.0250163078308105,
1358
+ "learning_rate": 8.684444444444444e-07,
1359
+ "logits/chosen": 1.5827982425689697,
1360
+ "logits/rejected": 1.6239618062973022,
1361
+ "logps/chosen": -149.01904296875,
1362
+ "logps/rejected": -141.9372100830078,
1363
+ "loss": 0.6728014469146728,
1364
+ "rewards/accuracies": 0.675000011920929,
1365
+ "rewards/chosen": 0.0830039232969284,
1366
+ "rewards/margins": 0.043090131133794785,
1367
+ "rewards/rejected": 0.03991378843784332,
1368
+ "step": 820
1369
+ },
1370
+ {
1371
+ "epoch": 0.664,
1372
+ "grad_norm": 1.0627655982971191,
1373
+ "learning_rate": 8.654814814814814e-07,
1374
+ "logits/chosen": 1.501873254776001,
1375
+ "logits/rejected": 1.537386417388916,
1376
+ "logps/chosen": -138.75169372558594,
1377
+ "logps/rejected": -134.00970458984375,
1378
+ "loss": 0.6801187992095947,
1379
+ "rewards/accuracies": 0.625,
1380
+ "rewards/chosen": 0.0640982910990715,
1381
+ "rewards/margins": 0.02855527400970459,
1382
+ "rewards/rejected": 0.03554301708936691,
1383
+ "step": 830
1384
+ },
1385
+ {
1386
+ "epoch": 0.672,
1387
+ "grad_norm": 1.2453433275222778,
1388
+ "learning_rate": 8.625185185185186e-07,
1389
+ "logits/chosen": 1.4766005277633667,
1390
+ "logits/rejected": 1.5185787677764893,
1391
+ "logps/chosen": -148.41250610351562,
1392
+ "logps/rejected": -157.73989868164062,
1393
+ "loss": 0.6788079738616943,
1394
+ "rewards/accuracies": 0.550000011920929,
1395
+ "rewards/chosen": 0.06551454961299896,
1396
+ "rewards/margins": 0.03301671892404556,
1397
+ "rewards/rejected": 0.0324978306889534,
1398
+ "step": 840
1399
+ },
1400
+ {
1401
+ "epoch": 0.68,
1402
+ "grad_norm": 1.191698670387268,
1403
+ "learning_rate": 8.595555555555555e-07,
1404
+ "logits/chosen": 1.555551528930664,
1405
+ "logits/rejected": 1.5438499450683594,
1406
+ "logps/chosen": -160.77488708496094,
1407
+ "logps/rejected": -141.9876251220703,
1408
+ "loss": 0.6801597595214843,
1409
+ "rewards/accuracies": 0.5375000238418579,
1410
+ "rewards/chosen": 0.053498052060604095,
1411
+ "rewards/margins": 0.028470590710639954,
1412
+ "rewards/rejected": 0.025027472525835037,
1413
+ "step": 850
1414
+ },
1415
+ {
1416
+ "epoch": 0.688,
1417
+ "grad_norm": 0.9382092356681824,
1418
+ "learning_rate": 8.565925925925925e-07,
1419
+ "logits/chosen": 1.5482738018035889,
1420
+ "logits/rejected": 1.5833700895309448,
1421
+ "logps/chosen": -139.68118286132812,
1422
+ "logps/rejected": -130.05947875976562,
1423
+ "loss": 0.6825921535491943,
1424
+ "rewards/accuracies": 0.5375000238418579,
1425
+ "rewards/chosen": 0.06382572650909424,
1426
+ "rewards/margins": 0.023111844435334206,
1427
+ "rewards/rejected": 0.04071388021111488,
1428
+ "step": 860
1429
+ },
1430
+ {
1431
+ "epoch": 0.696,
1432
+ "grad_norm": 1.2248382568359375,
1433
+ "learning_rate": 8.536296296296296e-07,
1434
+ "logits/chosen": 1.4529926776885986,
1435
+ "logits/rejected": 1.4423902034759521,
1436
+ "logps/chosen": -154.4588165283203,
1437
+ "logps/rejected": -153.44949340820312,
1438
+ "loss": 0.6915592193603516,
1439
+ "rewards/accuracies": 0.5249999761581421,
1440
+ "rewards/chosen": 0.03207828477025032,
1441
+ "rewards/margins": 0.006785523146390915,
1442
+ "rewards/rejected": 0.025292763486504555,
1443
+ "step": 870
1444
+ },
1445
+ {
1446
+ "epoch": 0.704,
1447
+ "grad_norm": 1.2174893617630005,
1448
+ "learning_rate": 8.506666666666667e-07,
1449
+ "logits/chosen": 1.4073113203048706,
1450
+ "logits/rejected": 1.668140172958374,
1451
+ "logps/chosen": -145.18460083007812,
1452
+ "logps/rejected": -157.0777587890625,
1453
+ "loss": 0.6922356605529785,
1454
+ "rewards/accuracies": 0.48750001192092896,
1455
+ "rewards/chosen": 0.049450717866420746,
1456
+ "rewards/margins": 0.004049716051667929,
1457
+ "rewards/rejected": 0.04540099948644638,
1458
+ "step": 880
1459
+ },
1460
+ {
1461
+ "epoch": 0.712,
1462
+ "grad_norm": 1.3186850547790527,
1463
+ "learning_rate": 8.477037037037037e-07,
1464
+ "logits/chosen": 1.2923892736434937,
1465
+ "logits/rejected": 1.491051435470581,
1466
+ "logps/chosen": -149.03018188476562,
1467
+ "logps/rejected": -161.61715698242188,
1468
+ "loss": 0.6841150760650635,
1469
+ "rewards/accuracies": 0.5375000238418579,
1470
+ "rewards/chosen": 0.038961172103881836,
1471
+ "rewards/margins": 0.020759906619787216,
1472
+ "rewards/rejected": 0.01820126175880432,
1473
+ "step": 890
1474
+ },
1475
+ {
1476
+ "epoch": 0.72,
1477
+ "grad_norm": 1.1123124361038208,
1478
+ "learning_rate": 8.447407407407407e-07,
1479
+ "logits/chosen": 1.546547293663025,
1480
+ "logits/rejected": 1.682992696762085,
1481
+ "logps/chosen": -166.2056884765625,
1482
+ "logps/rejected": -180.73330688476562,
1483
+ "loss": 0.6826215744018554,
1484
+ "rewards/accuracies": 0.574999988079071,
1485
+ "rewards/chosen": 0.07279330492019653,
1486
+ "rewards/margins": 0.025264525786042213,
1487
+ "rewards/rejected": 0.04752878472208977,
1488
+ "step": 900
1489
+ },
1490
+ {
1491
+ "epoch": 0.72,
1492
+ "eval_logits/chosen": 1.4819762706756592,
1493
+ "eval_logits/rejected": 1.513027548789978,
1494
+ "eval_logps/chosen": -153.1695556640625,
1495
+ "eval_logps/rejected": -147.7560577392578,
1496
+ "eval_loss": 0.689373254776001,
1497
+ "eval_rewards/accuracies": 0.5299999713897705,
1498
+ "eval_rewards/chosen": 0.03792775049805641,
1499
+ "eval_rewards/margins": 0.011320074088871479,
1500
+ "eval_rewards/rejected": 0.026607677340507507,
1501
+ "eval_runtime": 90.9603,
1502
+ "eval_samples_per_second": 5.497,
1503
+ "eval_steps_per_second": 2.748,
1504
+ "step": 900
1505
+ },
1506
+ {
1507
+ "epoch": 0.728,
1508
+ "grad_norm": 1.1507582664489746,
1509
+ "learning_rate": 8.417777777777777e-07,
1510
+ "logits/chosen": 1.5831308364868164,
1511
+ "logits/rejected": 1.3737132549285889,
1512
+ "logps/chosen": -140.00253295898438,
1513
+ "logps/rejected": -138.45155334472656,
1514
+ "loss": 0.6873391628265381,
1515
+ "rewards/accuracies": 0.574999988079071,
1516
+ "rewards/chosen": 0.05066479369997978,
1517
+ "rewards/margins": 0.015466553159058094,
1518
+ "rewards/rejected": 0.03519824147224426,
1519
+ "step": 910
1520
+ },
1521
+ {
1522
+ "epoch": 0.736,
1523
+ "grad_norm": 1.415456771850586,
1524
+ "learning_rate": 8.388148148148147e-07,
1525
+ "logits/chosen": 1.3000686168670654,
1526
+ "logits/rejected": 1.2642180919647217,
1527
+ "logps/chosen": -160.14027404785156,
1528
+ "logps/rejected": -163.32827758789062,
1529
+ "loss": 0.687142276763916,
1530
+ "rewards/accuracies": 0.5249999761581421,
1531
+ "rewards/chosen": 0.06618930399417877,
1532
+ "rewards/margins": 0.014807088300585747,
1533
+ "rewards/rejected": 0.05138222128152847,
1534
+ "step": 920
1535
+ },
1536
+ {
1537
+ "epoch": 0.744,
1538
+ "grad_norm": 1.3228634595870972,
1539
+ "learning_rate": 8.358518518518518e-07,
1540
+ "logits/chosen": 1.5204170942306519,
1541
+ "logits/rejected": 1.6129217147827148,
1542
+ "logps/chosen": -157.99832153320312,
1543
+ "logps/rejected": -162.09567260742188,
1544
+ "loss": 0.6790886878967285,
1545
+ "rewards/accuracies": 0.625,
1546
+ "rewards/chosen": 0.05940447002649307,
1547
+ "rewards/margins": 0.03161252290010452,
1548
+ "rewards/rejected": 0.027791941538453102,
1549
+ "step": 930
1550
+ },
1551
+ {
1552
+ "epoch": 0.752,
1553
+ "grad_norm": 1.048064947128296,
1554
+ "learning_rate": 8.328888888888889e-07,
1555
+ "logits/chosen": 1.5515320301055908,
1556
+ "logits/rejected": 1.676857352256775,
1557
+ "logps/chosen": -148.4853515625,
1558
+ "logps/rejected": -174.4320526123047,
1559
+ "loss": 0.6935453414916992,
1560
+ "rewards/accuracies": 0.5375000238418579,
1561
+ "rewards/chosen": 0.010225490666925907,
1562
+ "rewards/margins": 0.0021693313028663397,
1563
+ "rewards/rejected": 0.008056161925196648,
1564
+ "step": 940
1565
+ },
1566
+ {
1567
+ "epoch": 0.76,
1568
+ "grad_norm": 1.247799038887024,
1569
+ "learning_rate": 8.299259259259259e-07,
1570
+ "logits/chosen": 1.384231686592102,
1571
+ "logits/rejected": 1.4611561298370361,
1572
+ "logps/chosen": -140.81443786621094,
1573
+ "logps/rejected": -153.81326293945312,
1574
+ "loss": 0.6805226325988769,
1575
+ "rewards/accuracies": 0.6000000238418579,
1576
+ "rewards/chosen": 0.053223587572574615,
1577
+ "rewards/margins": 0.02933613583445549,
1578
+ "rewards/rejected": 0.023887457326054573,
1579
+ "step": 950
1580
+ },
1581
+ {
1582
+ "epoch": 0.768,
1583
+ "grad_norm": 1.1611264944076538,
1584
+ "learning_rate": 8.269629629629629e-07,
1585
+ "logits/chosen": 1.4987528324127197,
1586
+ "logits/rejected": 1.3684321641921997,
1587
+ "logps/chosen": -135.81056213378906,
1588
+ "logps/rejected": -119.05133056640625,
1589
+ "loss": 0.6954316616058349,
1590
+ "rewards/accuracies": 0.4749999940395355,
1591
+ "rewards/chosen": 0.0197703056037426,
1592
+ "rewards/margins": -0.0012637012405321002,
1593
+ "rewards/rejected": 0.021034007892012596,
1594
+ "step": 960
1595
+ },
1596
+ {
1597
+ "epoch": 0.776,
1598
+ "grad_norm": 0.964948832988739,
1599
+ "learning_rate": 8.24e-07,
1600
+ "logits/chosen": 1.612053632736206,
1601
+ "logits/rejected": 1.7114235162734985,
1602
+ "logps/chosen": -153.56137084960938,
1603
+ "logps/rejected": -149.17323303222656,
1604
+ "loss": 0.6916662693023682,
1605
+ "rewards/accuracies": 0.512499988079071,
1606
+ "rewards/chosen": 0.05082285404205322,
1607
+ "rewards/margins": 0.0056606316938996315,
1608
+ "rewards/rejected": 0.04516221210360527,
1609
+ "step": 970
1610
+ },
1611
+ {
1612
+ "epoch": 0.784,
1613
+ "grad_norm": 1.409449815750122,
1614
+ "learning_rate": 8.21037037037037e-07,
1615
+ "logits/chosen": 1.384060025215149,
1616
+ "logits/rejected": 1.316361904144287,
1617
+ "logps/chosen": -168.75360107421875,
1618
+ "logps/rejected": -141.79537963867188,
1619
+ "loss": 0.6845529556274415,
1620
+ "rewards/accuracies": 0.6499999761581421,
1621
+ "rewards/chosen": 0.0886571854352951,
1622
+ "rewards/margins": 0.02084263041615486,
1623
+ "rewards/rejected": 0.06781454384326935,
1624
+ "step": 980
1625
+ },
1626
+ {
1627
+ "epoch": 0.792,
1628
+ "grad_norm": 1.0246562957763672,
1629
+ "learning_rate": 8.18074074074074e-07,
1630
+ "logits/chosen": 1.4986507892608643,
1631
+ "logits/rejected": 1.571103811264038,
1632
+ "logps/chosen": -165.22254943847656,
1633
+ "logps/rejected": -168.63540649414062,
1634
+ "loss": 0.6790967464447022,
1635
+ "rewards/accuracies": 0.5874999761581421,
1636
+ "rewards/chosen": 0.07517905533313751,
1637
+ "rewards/margins": 0.03272219002246857,
1638
+ "rewards/rejected": 0.04245685786008835,
1639
+ "step": 990
1640
+ },
1641
+ {
1642
+ "epoch": 0.8,
1643
+ "grad_norm": 1.014426827430725,
1644
+ "learning_rate": 8.15111111111111e-07,
1645
+ "logits/chosen": 1.4708755016326904,
1646
+ "logits/rejected": 1.4550955295562744,
1647
+ "logps/chosen": -149.49874877929688,
1648
+ "logps/rejected": -125.0966567993164,
1649
+ "loss": 0.6892098903656005,
1650
+ "rewards/accuracies": 0.512499988079071,
1651
+ "rewards/chosen": 0.07825140655040741,
1652
+ "rewards/margins": 0.01007872074842453,
1653
+ "rewards/rejected": 0.06817268580198288,
1654
+ "step": 1000
1655
+ },
1656
+ {
1657
+ "epoch": 0.8,
1658
+ "eval_logits/chosen": 1.4972777366638184,
1659
+ "eval_logits/rejected": 1.5283576250076294,
1660
+ "eval_logps/chosen": -152.8240966796875,
1661
+ "eval_logps/rejected": -147.44175720214844,
1662
+ "eval_loss": 0.6880542039871216,
1663
+ "eval_rewards/accuracies": 0.5419999957084656,
1664
+ "eval_rewards/chosen": 0.07247376441955566,
1665
+ "eval_rewards/margins": 0.01443479023873806,
1666
+ "eval_rewards/rejected": 0.058038972318172455,
1667
+ "eval_runtime": 90.827,
1668
+ "eval_samples_per_second": 5.505,
1669
+ "eval_steps_per_second": 2.752,
1670
+ "step": 1000
1671
+ }
1672
+ ],
1673
+ "logging_steps": 10,
1674
+ "max_steps": 3750,
1675
+ "num_input_tokens_seen": 0,
1676
+ "num_train_epochs": 3,
1677
+ "save_steps": 500,
1678
+ "stateful_callbacks": {
1679
+ "TrainerControl": {
1680
+ "args": {
1681
+ "should_epoch_stop": false,
1682
+ "should_evaluate": false,
1683
+ "should_log": false,
1684
+ "should_save": true,
1685
+ "should_training_stop": false
1686
+ },
1687
+ "attributes": {}
1688
+ }
1689
+ },
1690
+ "total_flos": 0.0,
1691
+ "train_batch_size": 2,
1692
+ "trial_name": null,
1693
+ "trial_params": null
1694
+ }
v3/DPO/DPO_10k/lora/checkpoint-1000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b4f90e4a7053b7fb440d68cb64a3d198396af09db942d08839c5f4f0f0b8a0c8
3
+ size 6097
v3/DPO/DPO_10k/lora/checkpoint-1500/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: meta-llama/Llama-3.2-1B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:meta-llama/Llama-3.2-1B-Instruct
7
+ - dpo
8
+ - lora
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.18.1
v3/DPO/DPO_10k/lora/checkpoint-1500/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.1,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 64,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "v_proj",
33
+ "gate_proj",
34
+ "down_proj",
35
+ "k_proj",
36
+ "o_proj",
37
+ "up_proj",
38
+ "q_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
v3/DPO/DPO_10k/lora/checkpoint-1500/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7afd4ea14d3390375f16e1a10b8fcca1ede2eb7959bce12efb565ba4b2e00236
3
+ size 180385008
v3/DPO/DPO_10k/lora/checkpoint-1500/chat_template.jinja ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{- bos_token }}
2
+ {%- if custom_tools is defined %}
3
+ {%- set tools = custom_tools %}
4
+ {%- endif %}
5
+ {%- if not tools_in_user_message is defined %}
6
+ {%- set tools_in_user_message = true %}
7
+ {%- endif %}
8
+ {%- if not date_string is defined %}
9
+ {%- if strftime_now is defined %}
10
+ {%- set date_string = strftime_now("%d %b %Y") %}
11
+ {%- else %}
12
+ {%- set date_string = "26 Jul 2024" %}
13
+ {%- endif %}
14
+ {%- endif %}
15
+ {%- if not tools is defined %}
16
+ {%- set tools = none %}
17
+ {%- endif %}
18
+
19
+ {#- This block extracts the system message, so we can slot it into the right place. #}
20
+ {%- if messages[0]['role'] == 'system' %}
21
+ {%- set system_message = messages[0]['content']|trim %}
22
+ {%- set messages = messages[1:] %}
23
+ {%- else %}
24
+ {%- set system_message = "" %}
25
+ {%- endif %}
26
+
27
+ {#- System message #}
28
+ {{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
29
+ {%- if tools is not none %}
30
+ {{- "Environment: ipython\n" }}
31
+ {%- endif %}
32
+ {{- "Cutting Knowledge Date: December 2023\n" }}
33
+ {{- "Today Date: " + date_string + "\n\n" }}
34
+ {%- if tools is not none and not tools_in_user_message %}
35
+ {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
36
+ {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
37
+ {{- "Do not use variables.\n\n" }}
38
+ {%- for t in tools %}
39
+ {{- t | tojson(indent=4) }}
40
+ {{- "\n\n" }}
41
+ {%- endfor %}
42
+ {%- endif %}
43
+ {{- system_message }}
44
+ {{- "<|eot_id|>" }}
45
+
46
+ {#- Custom tools are passed in a user message with some extra guidance #}
47
+ {%- if tools_in_user_message and not tools is none %}
48
+ {#- Extract the first user message so we can plug it in here #}
49
+ {%- if messages | length != 0 %}
50
+ {%- set first_user_message = messages[0]['content']|trim %}
51
+ {%- set messages = messages[1:] %}
52
+ {%- else %}
53
+ {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
54
+ {%- endif %}
55
+ {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
56
+ {{- "Given the following functions, please respond with a JSON for a function call " }}
57
+ {{- "with its proper arguments that best answers the given prompt.\n\n" }}
58
+ {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
59
+ {{- "Do not use variables.\n\n" }}
60
+ {%- for t in tools %}
61
+ {{- t | tojson(indent=4) }}
62
+ {{- "\n\n" }}
63
+ {%- endfor %}
64
+ {{- first_user_message + "<|eot_id|>"}}
65
+ {%- endif %}
66
+
67
+ {%- for message in messages %}
68
+ {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
69
+ {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
70
+ {%- elif 'tool_calls' in message %}
71
+ {%- if not message.tool_calls|length == 1 %}
72
+ {{- raise_exception("This model only supports single tool-calls at once!") }}
73
+ {%- endif %}
74
+ {%- set tool_call = message.tool_calls[0].function %}
75
+ {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
76
+ {{- '{"name": "' + tool_call.name + '", ' }}
77
+ {{- '"parameters": ' }}
78
+ {{- tool_call.arguments | tojson }}
79
+ {{- "}" }}
80
+ {{- "<|eot_id|>" }}
81
+ {%- elif message.role == "tool" or message.role == "ipython" %}
82
+ {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
83
+ {%- if message.content is mapping or message.content is iterable %}
84
+ {{- message.content | tojson }}
85
+ {%- else %}
86
+ {{- message.content }}
87
+ {%- endif %}
88
+ {{- "<|eot_id|>" }}
89
+ {%- endif %}
90
+ {%- endfor %}
91
+ {%- if add_generation_prompt %}
92
+ {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
93
+ {%- endif %}
v3/DPO/DPO_10k/lora/checkpoint-1500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e76e51dcca53476f7d03879d08ea48c310a515073dcada2891911d9d2d8bca3
3
+ size 360902475
v3/DPO/DPO_10k/lora/checkpoint-1500/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b928d2d8033ac6bd87c58a39b741faace8bd1c6b0d070b7fad23c19520ff9f1a
3
+ size 14645
v3/DPO/DPO_10k/lora/checkpoint-1500/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca372268f4fa9335030c0cb7aedb6cdba75f457da50e7a4034abb1a2d0843689
3
+ size 1383
v3/DPO/DPO_10k/lora/checkpoint-1500/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e10622725c891492a233a860ff051a91cd7997b7a70e593a4bc44e88f0ef3b5
3
+ size 1465
v3/DPO/DPO_10k/lora/checkpoint-1500/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
3
+ size 17209920
v3/DPO/DPO_10k/lora/checkpoint-1500/tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": "<|begin_of_text|>",
4
+ "clean_up_tokenization_spaces": true,
5
+ "eos_token": "<|eot_id|>",
6
+ "is_local": false,
7
+ "model_input_names": [
8
+ "input_ids",
9
+ "attention_mask"
10
+ ],
11
+ "model_max_length": 131072,
12
+ "pad_token": "<|eot_id|>",
13
+ "tokenizer_class": "TokenizersBackend"
14
+ }
v3/DPO/DPO_10k/lora/checkpoint-1500/trainer_state.json ADDED
@@ -0,0 +1,2524 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 1400,
3
+ "best_metric": 0.6853657364845276,
4
+ "best_model_checkpoint": "output/lora/checkpoint-1000",
5
+ "epoch": 1.2,
6
+ "eval_steps": 100,
7
+ "global_step": 1500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.008,
14
+ "grad_norm": 1.1179773807525635,
15
+ "learning_rate": 2.4e-08,
16
+ "logits/chosen": 1.4545083045959473,
17
+ "logits/rejected": 1.425490379333496,
18
+ "logps/chosen": -132.46189880371094,
19
+ "logps/rejected": -148.20260620117188,
20
+ "loss": 0.6930626392364502,
21
+ "rewards/accuracies": 0.21250000596046448,
22
+ "rewards/chosen": -6.165739614516497e-05,
23
+ "rewards/margins": 0.00017333509458694607,
24
+ "rewards/rejected": -0.00023499250528402627,
25
+ "step": 10
26
+ },
27
+ {
28
+ "epoch": 0.016,
29
+ "grad_norm": 1.3502839803695679,
30
+ "learning_rate": 5.0666666666666664e-08,
31
+ "logits/chosen": 1.5668801069259644,
32
+ "logits/rejected": 1.5191389322280884,
33
+ "logps/chosen": -157.63624572753906,
34
+ "logps/rejected": -144.2716827392578,
35
+ "loss": 0.6930380821228027,
36
+ "rewards/accuracies": 0.5874999761581421,
37
+ "rewards/chosen": 0.0009244013344869018,
38
+ "rewards/margins": 0.00023505210992880166,
39
+ "rewards/rejected": 0.0006893490790389478,
40
+ "step": 20
41
+ },
42
+ {
43
+ "epoch": 0.024,
44
+ "grad_norm": 1.0219553709030151,
45
+ "learning_rate": 7.733333333333334e-08,
46
+ "logits/chosen": 1.8921797275543213,
47
+ "logits/rejected": 1.7342685461044312,
48
+ "logps/chosen": -164.55799865722656,
49
+ "logps/rejected": -158.92752075195312,
50
+ "loss": 0.69290189743042,
51
+ "rewards/accuracies": 0.5375000238418579,
52
+ "rewards/chosen": 0.0009391927160322666,
53
+ "rewards/margins": 0.0005032324115745723,
54
+ "rewards/rejected": 0.0004359603044576943,
55
+ "step": 30
56
+ },
57
+ {
58
+ "epoch": 0.032,
59
+ "grad_norm": 1.2056113481521606,
60
+ "learning_rate": 1.0399999999999999e-07,
61
+ "logits/chosen": 1.4950047731399536,
62
+ "logits/rejected": 1.6163270473480225,
63
+ "logps/chosen": -145.54183959960938,
64
+ "logps/rejected": -134.19979858398438,
65
+ "loss": 0.6935500621795654,
66
+ "rewards/accuracies": 0.4124999940395355,
67
+ "rewards/chosen": 0.0006421065190806985,
68
+ "rewards/margins": -0.0007943272357806563,
69
+ "rewards/rejected": 0.0014364338712766767,
70
+ "step": 40
71
+ },
72
+ {
73
+ "epoch": 0.04,
74
+ "grad_norm": 1.1029725074768066,
75
+ "learning_rate": 1.3066666666666665e-07,
76
+ "logits/chosen": 1.4076533317565918,
77
+ "logits/rejected": 1.4791977405548096,
78
+ "logps/chosen": -133.23324584960938,
79
+ "logps/rejected": -138.3593292236328,
80
+ "loss": 0.6925833702087403,
81
+ "rewards/accuracies": 0.574999988079071,
82
+ "rewards/chosen": 0.0012372838100418448,
83
+ "rewards/margins": 0.001139522879384458,
84
+ "rewards/rejected": 9.776116348803043e-05,
85
+ "step": 50
86
+ },
87
+ {
88
+ "epoch": 0.048,
89
+ "grad_norm": 0.9960266351699829,
90
+ "learning_rate": 1.573333333333333e-07,
91
+ "logits/chosen": 1.407166600227356,
92
+ "logits/rejected": 1.376042127609253,
93
+ "logps/chosen": -137.8921661376953,
94
+ "logps/rejected": -127.97320556640625,
95
+ "loss": 0.693614149093628,
96
+ "rewards/accuracies": 0.4749999940395355,
97
+ "rewards/chosen": -0.00021316049969755113,
98
+ "rewards/margins": -0.0009249996510334313,
99
+ "rewards/rejected": 0.0007118391804397106,
100
+ "step": 60
101
+ },
102
+ {
103
+ "epoch": 0.056,
104
+ "grad_norm": 1.0946824550628662,
105
+ "learning_rate": 1.8399999999999998e-07,
106
+ "logits/chosen": 1.5944501161575317,
107
+ "logits/rejected": 1.7248146533966064,
108
+ "logps/chosen": -153.5526123046875,
109
+ "logps/rejected": -157.19735717773438,
110
+ "loss": 0.6928449153900147,
111
+ "rewards/accuracies": 0.5375000238418579,
112
+ "rewards/chosen": 0.0026370862033218145,
113
+ "rewards/margins": 0.0006150150438770652,
114
+ "rewards/rejected": 0.0020220710430294275,
115
+ "step": 70
116
+ },
117
+ {
118
+ "epoch": 0.064,
119
+ "grad_norm": 1.0322489738464355,
120
+ "learning_rate": 2.1066666666666665e-07,
121
+ "logits/chosen": 1.4586353302001953,
122
+ "logits/rejected": 1.4842995405197144,
123
+ "logps/chosen": -149.03665161132812,
124
+ "logps/rejected": -136.24081420898438,
125
+ "loss": 0.6932015895843506,
126
+ "rewards/accuracies": 0.512499988079071,
127
+ "rewards/chosen": 0.004061593674123287,
128
+ "rewards/margins": -9.99594121822156e-05,
129
+ "rewards/rejected": 0.004161553457379341,
130
+ "step": 80
131
+ },
132
+ {
133
+ "epoch": 0.072,
134
+ "grad_norm": 1.0012447834014893,
135
+ "learning_rate": 2.3733333333333334e-07,
136
+ "logits/chosen": 1.5075902938842773,
137
+ "logits/rejected": 1.5986944437026978,
138
+ "logps/chosen": -159.3433074951172,
139
+ "logps/rejected": -151.85072326660156,
140
+ "loss": 0.6928757190704345,
141
+ "rewards/accuracies": 0.5249999761581421,
142
+ "rewards/chosen": 0.006978762801736593,
143
+ "rewards/margins": 0.0005627512582577765,
144
+ "rewards/rejected": 0.006416010670363903,
145
+ "step": 90
146
+ },
147
+ {
148
+ "epoch": 0.08,
149
+ "grad_norm": 1.280081033706665,
150
+ "learning_rate": 2.64e-07,
151
+ "logits/chosen": 1.448286533355713,
152
+ "logits/rejected": 1.5480695962905884,
153
+ "logps/chosen": -151.12625122070312,
154
+ "logps/rejected": -171.24545288085938,
155
+ "loss": 0.6931126117706299,
156
+ "rewards/accuracies": 0.4749999940395355,
157
+ "rewards/chosen": 0.00693341251462698,
158
+ "rewards/margins": 9.057526767719537e-05,
159
+ "rewards/rejected": 0.006842837668955326,
160
+ "step": 100
161
+ },
162
+ {
163
+ "epoch": 0.08,
164
+ "eval_logits/chosen": 1.518183946609497,
165
+ "eval_logits/rejected": 1.5525447130203247,
166
+ "eval_logps/chosen": -153.46630859375,
167
+ "eval_logps/rejected": -147.95005798339844,
168
+ "eval_loss": 0.6926330327987671,
169
+ "eval_rewards/accuracies": 0.5600000023841858,
170
+ "eval_rewards/chosen": 0.008253541775047779,
171
+ "eval_rewards/margins": 0.0010449704714119434,
172
+ "eval_rewards/rejected": 0.007208569906651974,
173
+ "eval_runtime": 91.0812,
174
+ "eval_samples_per_second": 5.49,
175
+ "eval_steps_per_second": 2.745,
176
+ "step": 100
177
+ },
178
+ {
179
+ "epoch": 0.088,
180
+ "grad_norm": 1.3912373781204224,
181
+ "learning_rate": 2.906666666666667e-07,
182
+ "logits/chosen": 1.6896222829818726,
183
+ "logits/rejected": 1.6516921520233154,
184
+ "logps/chosen": -173.70030212402344,
185
+ "logps/rejected": -155.5503692626953,
186
+ "loss": 0.6926521778106689,
187
+ "rewards/accuracies": 0.550000011920929,
188
+ "rewards/chosen": 0.010179834440350533,
189
+ "rewards/margins": 0.0010137532372027636,
190
+ "rewards/rejected": 0.009166081435978413,
191
+ "step": 110
192
+ },
193
+ {
194
+ "epoch": 0.096,
195
+ "grad_norm": 1.1518888473510742,
196
+ "learning_rate": 3.173333333333333e-07,
197
+ "logits/chosen": 1.5166860818862915,
198
+ "logits/rejected": 1.5136979818344116,
199
+ "logps/chosen": -136.39906311035156,
200
+ "logps/rejected": -140.86561584472656,
201
+ "loss": 0.6932918548583984,
202
+ "rewards/accuracies": 0.4375,
203
+ "rewards/chosen": 0.00907944981008768,
204
+ "rewards/margins": -0.00026998287648893893,
205
+ "rewards/rejected": 0.009349432773888111,
206
+ "step": 120
207
+ },
208
+ {
209
+ "epoch": 0.104,
210
+ "grad_norm": 1.3213376998901367,
211
+ "learning_rate": 3.4399999999999996e-07,
212
+ "logits/chosen": 1.4598973989486694,
213
+ "logits/rejected": 1.6739647388458252,
214
+ "logps/chosen": -159.12461853027344,
215
+ "logps/rejected": -163.39346313476562,
216
+ "loss": 0.693086051940918,
217
+ "rewards/accuracies": 0.550000011920929,
218
+ "rewards/chosen": 0.010477779433131218,
219
+ "rewards/margins": 0.00014539004769176245,
220
+ "rewards/rejected": 0.010332388803362846,
221
+ "step": 130
222
+ },
223
+ {
224
+ "epoch": 0.112,
225
+ "grad_norm": 0.8833863735198975,
226
+ "learning_rate": 3.7066666666666665e-07,
227
+ "logits/chosen": 1.3829412460327148,
228
+ "logits/rejected": 1.4100173711776733,
229
+ "logps/chosen": -132.7149658203125,
230
+ "logps/rejected": -126.85478210449219,
231
+ "loss": 0.6932382583618164,
232
+ "rewards/accuracies": 0.4625000059604645,
233
+ "rewards/chosen": 0.010262183845043182,
234
+ "rewards/margins": -0.0001672578218858689,
235
+ "rewards/rejected": 0.010429441928863525,
236
+ "step": 140
237
+ },
238
+ {
239
+ "epoch": 0.12,
240
+ "grad_norm": 1.1472455263137817,
241
+ "learning_rate": 3.973333333333333e-07,
242
+ "logits/chosen": 1.4954869747161865,
243
+ "logits/rejected": 1.5857340097427368,
244
+ "logps/chosen": -165.10568237304688,
245
+ "logps/rejected": -158.97433471679688,
246
+ "loss": 0.6916103363037109,
247
+ "rewards/accuracies": 0.5375000238418579,
248
+ "rewards/chosen": 0.014177520759403706,
249
+ "rewards/margins": 0.003108317730948329,
250
+ "rewards/rejected": 0.011069201864302158,
251
+ "step": 150
252
+ },
253
+ {
254
+ "epoch": 0.128,
255
+ "grad_norm": 1.0628798007965088,
256
+ "learning_rate": 4.24e-07,
257
+ "logits/chosen": 1.6717243194580078,
258
+ "logits/rejected": 1.705092191696167,
259
+ "logps/chosen": -159.332275390625,
260
+ "logps/rejected": -147.21646118164062,
261
+ "loss": 0.6935629844665527,
262
+ "rewards/accuracies": 0.4625000059604645,
263
+ "rewards/chosen": 0.016994481906294823,
264
+ "rewards/margins": -0.0007874655420891941,
265
+ "rewards/rejected": 0.017781946808099747,
266
+ "step": 160
267
+ },
268
+ {
269
+ "epoch": 0.136,
270
+ "grad_norm": 1.390810251235962,
271
+ "learning_rate": 4.506666666666666e-07,
272
+ "logits/chosen": 1.4719234704971313,
273
+ "logits/rejected": 1.499526023864746,
274
+ "logps/chosen": -148.67251586914062,
275
+ "logps/rejected": -161.58021545410156,
276
+ "loss": 0.6926599979400635,
277
+ "rewards/accuracies": 0.48750001192092896,
278
+ "rewards/chosen": 0.021239640191197395,
279
+ "rewards/margins": 0.0010346340714022517,
280
+ "rewards/rejected": 0.020205006003379822,
281
+ "step": 170
282
+ },
283
+ {
284
+ "epoch": 0.144,
285
+ "grad_norm": 1.139707326889038,
286
+ "learning_rate": 4.773333333333333e-07,
287
+ "logits/chosen": 1.4833905696868896,
288
+ "logits/rejected": 1.637584924697876,
289
+ "logps/chosen": -158.96731567382812,
290
+ "logps/rejected": -161.9830322265625,
291
+ "loss": 0.6919547557830811,
292
+ "rewards/accuracies": 0.5874999761581421,
293
+ "rewards/chosen": 0.019416773691773415,
294
+ "rewards/margins": 0.0024296878837049007,
295
+ "rewards/rejected": 0.016987085342407227,
296
+ "step": 180
297
+ },
298
+ {
299
+ "epoch": 0.152,
300
+ "grad_norm": 1.2029796838760376,
301
+ "learning_rate": 5.04e-07,
302
+ "logits/chosen": 1.5398027896881104,
303
+ "logits/rejected": 1.6678705215454102,
304
+ "logps/chosen": -143.00283813476562,
305
+ "logps/rejected": -144.89378356933594,
306
+ "loss": 0.692262840270996,
307
+ "rewards/accuracies": 0.5249999761581421,
308
+ "rewards/chosen": 0.018960200250148773,
309
+ "rewards/margins": 0.0018175147706642747,
310
+ "rewards/rejected": 0.017142686992883682,
311
+ "step": 190
312
+ },
313
+ {
314
+ "epoch": 0.16,
315
+ "grad_norm": 1.0527437925338745,
316
+ "learning_rate": 5.306666666666665e-07,
317
+ "logits/chosen": 1.6087623834609985,
318
+ "logits/rejected": 1.5181543827056885,
319
+ "logps/chosen": -159.35073852539062,
320
+ "logps/rejected": -133.59890747070312,
321
+ "loss": 0.6924069881439209,
322
+ "rewards/accuracies": 0.5,
323
+ "rewards/chosen": 0.01926913857460022,
324
+ "rewards/margins": 0.0015339398523792624,
325
+ "rewards/rejected": 0.0177351962774992,
326
+ "step": 200
327
+ },
328
+ {
329
+ "epoch": 0.16,
330
+ "eval_logits/chosen": 1.5193672180175781,
331
+ "eval_logits/rejected": 1.5534639358520508,
332
+ "eval_logps/chosen": -153.34857177734375,
333
+ "eval_logps/rejected": -147.8425750732422,
334
+ "eval_loss": 0.6921458840370178,
335
+ "eval_rewards/accuracies": 0.5600000023841858,
336
+ "eval_rewards/chosen": 0.020027177408337593,
337
+ "eval_rewards/margins": 0.0020714527927339077,
338
+ "eval_rewards/rejected": 0.017955724149942398,
339
+ "eval_runtime": 90.9762,
340
+ "eval_samples_per_second": 5.496,
341
+ "eval_steps_per_second": 2.748,
342
+ "step": 200
343
+ },
344
+ {
345
+ "epoch": 0.168,
346
+ "grad_norm": 1.1897239685058594,
347
+ "learning_rate": 5.573333333333333e-07,
348
+ "logits/chosen": 1.6267732381820679,
349
+ "logits/rejected": 1.5775268077850342,
350
+ "logps/chosen": -159.51437377929688,
351
+ "logps/rejected": -170.17762756347656,
352
+ "loss": 0.6925770759582519,
353
+ "rewards/accuracies": 0.48750001192092896,
354
+ "rewards/chosen": 0.02078000269830227,
355
+ "rewards/margins": 0.0012053751852363348,
356
+ "rewards/rejected": 0.01957462914288044,
357
+ "step": 210
358
+ },
359
+ {
360
+ "epoch": 0.176,
361
+ "grad_norm": 0.9995236396789551,
362
+ "learning_rate": 5.839999999999999e-07,
363
+ "logits/chosen": 1.5781934261322021,
364
+ "logits/rejected": 1.642251968383789,
365
+ "logps/chosen": -159.95309448242188,
366
+ "logps/rejected": -144.36058044433594,
367
+ "loss": 0.6931475639343262,
368
+ "rewards/accuracies": 0.512499988079071,
369
+ "rewards/chosen": 0.023322442546486855,
370
+ "rewards/margins": 6.392716750269756e-05,
371
+ "rewards/rejected": 0.02325851283967495,
372
+ "step": 220
373
+ },
374
+ {
375
+ "epoch": 0.184,
376
+ "grad_norm": 0.8819260597229004,
377
+ "learning_rate": 6.106666666666666e-07,
378
+ "logits/chosen": 1.459991693496704,
379
+ "logits/rejected": 1.6060739755630493,
380
+ "logps/chosen": -162.04745483398438,
381
+ "logps/rejected": -139.5411834716797,
382
+ "loss": 0.6919463634490967,
383
+ "rewards/accuracies": 0.512499988079071,
384
+ "rewards/chosen": 0.026052182540297508,
385
+ "rewards/margins": 0.002481849165633321,
386
+ "rewards/rejected": 0.023570330813527107,
387
+ "step": 230
388
+ },
389
+ {
390
+ "epoch": 0.192,
391
+ "grad_norm": 1.1055208444595337,
392
+ "learning_rate": 6.373333333333333e-07,
393
+ "logits/chosen": 1.558739185333252,
394
+ "logits/rejected": 1.5965116024017334,
395
+ "logps/chosen": -171.9166259765625,
396
+ "logps/rejected": -145.66433715820312,
397
+ "loss": 0.6912176132202148,
398
+ "rewards/accuracies": 0.5874999761581421,
399
+ "rewards/chosen": 0.033868782222270966,
400
+ "rewards/margins": 0.004017929546535015,
401
+ "rewards/rejected": 0.029850851744413376,
402
+ "step": 240
403
+ },
404
+ {
405
+ "epoch": 0.2,
406
+ "grad_norm": 1.1372733116149902,
407
+ "learning_rate": 6.64e-07,
408
+ "logits/chosen": 1.5322860479354858,
409
+ "logits/rejected": 1.4695795774459839,
410
+ "logps/chosen": -161.7803955078125,
411
+ "logps/rejected": -148.91949462890625,
412
+ "loss": 0.6940969944000244,
413
+ "rewards/accuracies": 0.42500001192092896,
414
+ "rewards/chosen": 0.034065864980220795,
415
+ "rewards/margins": -0.0016761379083618522,
416
+ "rewards/rejected": 0.035742007195949554,
417
+ "step": 250
418
+ },
419
+ {
420
+ "epoch": 0.208,
421
+ "grad_norm": 1.0917680263519287,
422
+ "learning_rate": 6.906666666666666e-07,
423
+ "logits/chosen": 1.4094547033309937,
424
+ "logits/rejected": 1.50186026096344,
425
+ "logps/chosen": -132.91455078125,
426
+ "logps/rejected": -146.79714965820312,
427
+ "loss": 0.6930203914642334,
428
+ "rewards/accuracies": 0.4124999940395355,
429
+ "rewards/chosen": 0.03089061938226223,
430
+ "rewards/margins": 0.00040971505222842097,
431
+ "rewards/rejected": 0.030480902642011642,
432
+ "step": 260
433
+ },
434
+ {
435
+ "epoch": 0.216,
436
+ "grad_norm": 1.0280259847640991,
437
+ "learning_rate": 7.173333333333333e-07,
438
+ "logits/chosen": 1.531285285949707,
439
+ "logits/rejected": 1.6163132190704346,
440
+ "logps/chosen": -146.902587890625,
441
+ "logps/rejected": -147.8347625732422,
442
+ "loss": 0.6944749355316162,
443
+ "rewards/accuracies": 0.512499988079071,
444
+ "rewards/chosen": 0.028514739125967026,
445
+ "rewards/margins": -0.0025452517438679934,
446
+ "rewards/rejected": 0.0310599897056818,
447
+ "step": 270
448
+ },
449
+ {
450
+ "epoch": 0.224,
451
+ "grad_norm": 1.398708701133728,
452
+ "learning_rate": 7.44e-07,
453
+ "logits/chosen": 1.563852071762085,
454
+ "logits/rejected": 1.396703839302063,
455
+ "logps/chosen": -172.68153381347656,
456
+ "logps/rejected": -155.80215454101562,
457
+ "loss": 0.689476490020752,
458
+ "rewards/accuracies": 0.625,
459
+ "rewards/chosen": 0.03224216774106026,
460
+ "rewards/margins": 0.007473687641322613,
461
+ "rewards/rejected": 0.02476847730576992,
462
+ "step": 280
463
+ },
464
+ {
465
+ "epoch": 0.232,
466
+ "grad_norm": 1.0394141674041748,
467
+ "learning_rate": 7.706666666666667e-07,
468
+ "logits/chosen": 1.5581729412078857,
469
+ "logits/rejected": 1.5819021463394165,
470
+ "logps/chosen": -144.41465759277344,
471
+ "logps/rejected": -153.55406188964844,
472
+ "loss": 0.6927877902984619,
473
+ "rewards/accuracies": 0.550000011920929,
474
+ "rewards/chosen": 0.03272664546966553,
475
+ "rewards/margins": 0.0008557910332456231,
476
+ "rewards/rejected": 0.031870849430561066,
477
+ "step": 290
478
+ },
479
+ {
480
+ "epoch": 0.24,
481
+ "grad_norm": 1.2557185888290405,
482
+ "learning_rate": 7.973333333333333e-07,
483
+ "logits/chosen": 1.5062446594238281,
484
+ "logits/rejected": 1.5786281824111938,
485
+ "logps/chosen": -153.63558959960938,
486
+ "logps/rejected": -165.46617126464844,
487
+ "loss": 0.6897892951965332,
488
+ "rewards/accuracies": 0.625,
489
+ "rewards/chosen": 0.03914078325033188,
490
+ "rewards/margins": 0.006873926613479853,
491
+ "rewards/rejected": 0.032266855239868164,
492
+ "step": 300
493
+ },
494
+ {
495
+ "epoch": 0.24,
496
+ "eval_logits/chosen": 1.5237834453582764,
497
+ "eval_logits/rejected": 1.557741641998291,
498
+ "eval_logps/chosen": -153.19883728027344,
499
+ "eval_logps/rejected": -147.7100830078125,
500
+ "eval_loss": 0.6913501024246216,
501
+ "eval_rewards/accuracies": 0.5339999794960022,
502
+ "eval_rewards/chosen": 0.03499976545572281,
503
+ "eval_rewards/margins": 0.0037925743963569403,
504
+ "eval_rewards/rejected": 0.031207194551825523,
505
+ "eval_runtime": 90.9133,
506
+ "eval_samples_per_second": 5.5,
507
+ "eval_steps_per_second": 2.75,
508
+ "step": 300
509
+ },
510
+ {
511
+ "epoch": 0.248,
512
+ "grad_norm": 1.1200144290924072,
513
+ "learning_rate": 8.24e-07,
514
+ "logits/chosen": 1.618949294090271,
515
+ "logits/rejected": 1.6625702381134033,
516
+ "logps/chosen": -141.8953857421875,
517
+ "logps/rejected": -156.6927032470703,
518
+ "loss": 0.694425106048584,
519
+ "rewards/accuracies": 0.4124999940395355,
520
+ "rewards/chosen": 0.03162423521280289,
521
+ "rewards/margins": -0.0023833750747144222,
522
+ "rewards/rejected": 0.034007612615823746,
523
+ "step": 310
524
+ },
525
+ {
526
+ "epoch": 0.256,
527
+ "grad_norm": 1.0823743343353271,
528
+ "learning_rate": 8.506666666666667e-07,
529
+ "logits/chosen": 1.5974153280258179,
530
+ "logits/rejected": 1.6237560510635376,
531
+ "logps/chosen": -139.2996368408203,
532
+ "logps/rejected": -141.94407653808594,
533
+ "loss": 0.6914949893951416,
534
+ "rewards/accuracies": 0.574999988079071,
535
+ "rewards/chosen": 0.029445698484778404,
536
+ "rewards/margins": 0.0034329916816204786,
537
+ "rewards/rejected": 0.026012707501649857,
538
+ "step": 320
539
+ },
540
+ {
541
+ "epoch": 0.264,
542
+ "grad_norm": 1.1765722036361694,
543
+ "learning_rate": 8.773333333333332e-07,
544
+ "logits/chosen": 1.465215802192688,
545
+ "logits/rejected": 1.4882014989852905,
546
+ "logps/chosen": -158.4329833984375,
547
+ "logps/rejected": -183.94259643554688,
548
+ "loss": 0.6925934791564942,
549
+ "rewards/accuracies": 0.5,
550
+ "rewards/chosen": 0.026302698999643326,
551
+ "rewards/margins": 0.0012350418837741017,
552
+ "rewards/rejected": 0.025067657232284546,
553
+ "step": 330
554
+ },
555
+ {
556
+ "epoch": 0.272,
557
+ "grad_norm": 1.0667294263839722,
558
+ "learning_rate": 9.039999999999999e-07,
559
+ "logits/chosen": 1.7538875341415405,
560
+ "logits/rejected": 1.6171789169311523,
561
+ "logps/chosen": -159.63790893554688,
562
+ "logps/rejected": -151.1880340576172,
563
+ "loss": 0.6922041416168213,
564
+ "rewards/accuracies": 0.5249999761581421,
565
+ "rewards/chosen": 0.026619985699653625,
566
+ "rewards/margins": 0.002031774492934346,
567
+ "rewards/rejected": 0.024588212370872498,
568
+ "step": 340
569
+ },
570
+ {
571
+ "epoch": 0.28,
572
+ "grad_norm": 1.2210006713867188,
573
+ "learning_rate": 9.306666666666666e-07,
574
+ "logits/chosen": 1.5815479755401611,
575
+ "logits/rejected": 1.710217833518982,
576
+ "logps/chosen": -167.21743774414062,
577
+ "logps/rejected": -196.77989196777344,
578
+ "loss": 0.6899692058563233,
579
+ "rewards/accuracies": 0.6000000238418579,
580
+ "rewards/chosen": 0.03636014088988304,
581
+ "rewards/margins": 0.006536121014505625,
582
+ "rewards/rejected": 0.029824022203683853,
583
+ "step": 350
584
+ },
585
+ {
586
+ "epoch": 0.288,
587
+ "grad_norm": 0.9623845219612122,
588
+ "learning_rate": 9.573333333333333e-07,
589
+ "logits/chosen": 1.4416790008544922,
590
+ "logits/rejected": 1.3618693351745605,
591
+ "logps/chosen": -163.27401733398438,
592
+ "logps/rejected": -169.3351287841797,
593
+ "loss": 0.6910821437835694,
594
+ "rewards/accuracies": 0.550000011920929,
595
+ "rewards/chosen": 0.037200987339019775,
596
+ "rewards/margins": 0.004338678438216448,
597
+ "rewards/rejected": 0.032862309366464615,
598
+ "step": 360
599
+ },
600
+ {
601
+ "epoch": 0.296,
602
+ "grad_norm": 1.1005645990371704,
603
+ "learning_rate": 9.84e-07,
604
+ "logits/chosen": 1.6214616298675537,
605
+ "logits/rejected": 1.6572606563568115,
606
+ "logps/chosen": -162.95504760742188,
607
+ "logps/rejected": -154.91807556152344,
608
+ "loss": 0.6895030021667481,
609
+ "rewards/accuracies": 0.5874999761581421,
610
+ "rewards/chosen": 0.047198958694934845,
611
+ "rewards/margins": 0.0075958347879350185,
612
+ "rewards/rejected": 0.03960312157869339,
613
+ "step": 370
614
+ },
615
+ {
616
+ "epoch": 0.304,
617
+ "grad_norm": 1.1126338243484497,
618
+ "learning_rate": 9.988148148148148e-07,
619
+ "logits/chosen": 1.4842126369476318,
620
+ "logits/rejected": 1.410930871963501,
621
+ "logps/chosen": -143.09182739257812,
622
+ "logps/rejected": -153.48135375976562,
623
+ "loss": 0.6885370254516602,
624
+ "rewards/accuracies": 0.5375000238418579,
625
+ "rewards/chosen": 0.05975018069148064,
626
+ "rewards/margins": 0.00981281790882349,
627
+ "rewards/rejected": 0.04993735998868942,
628
+ "step": 380
629
+ },
630
+ {
631
+ "epoch": 0.312,
632
+ "grad_norm": 1.1146304607391357,
633
+ "learning_rate": 9.95851851851852e-07,
634
+ "logits/chosen": 1.608332633972168,
635
+ "logits/rejected": 1.5848472118377686,
636
+ "logps/chosen": -152.25250244140625,
637
+ "logps/rejected": -138.348876953125,
638
+ "loss": 0.6851807117462159,
639
+ "rewards/accuracies": 0.612500011920929,
640
+ "rewards/chosen": 0.07630724459886551,
641
+ "rewards/margins": 0.016460279002785683,
642
+ "rewards/rejected": 0.059846967458724976,
643
+ "step": 390
644
+ },
645
+ {
646
+ "epoch": 0.32,
647
+ "grad_norm": 1.2686601877212524,
648
+ "learning_rate": 9.928888888888889e-07,
649
+ "logits/chosen": 1.5840070247650146,
650
+ "logits/rejected": 1.5679785013198853,
651
+ "logps/chosen": -155.90219116210938,
652
+ "logps/rejected": -148.89549255371094,
653
+ "loss": 0.6891039371490478,
654
+ "rewards/accuracies": 0.512499988079071,
655
+ "rewards/chosen": 0.08129279315471649,
656
+ "rewards/margins": 0.008997146971523762,
657
+ "rewards/rejected": 0.072295643389225,
658
+ "step": 400
659
+ },
660
+ {
661
+ "epoch": 0.32,
662
+ "eval_logits/chosen": 1.5358340740203857,
663
+ "eval_logits/rejected": 1.5689741373062134,
664
+ "eval_logps/chosen": -152.6895294189453,
665
+ "eval_logps/rejected": -147.23422241210938,
666
+ "eval_loss": 0.6900544166564941,
667
+ "eval_rewards/accuracies": 0.5419999957084656,
668
+ "eval_rewards/chosen": 0.08593228459358215,
669
+ "eval_rewards/margins": 0.0071397931315004826,
670
+ "eval_rewards/rejected": 0.0787924975156784,
671
+ "eval_runtime": 90.9072,
672
+ "eval_samples_per_second": 5.5,
673
+ "eval_steps_per_second": 2.75,
674
+ "step": 400
675
+ },
676
+ {
677
+ "epoch": 0.328,
678
+ "grad_norm": 1.1653108596801758,
679
+ "learning_rate": 9.899259259259258e-07,
680
+ "logits/chosen": 1.5450490713119507,
681
+ "logits/rejected": 1.4428151845932007,
682
+ "logps/chosen": -148.61581420898438,
683
+ "logps/rejected": -188.90919494628906,
684
+ "loss": 0.6918989181518554,
685
+ "rewards/accuracies": 0.5249999761581421,
686
+ "rewards/chosen": 0.0946170911192894,
687
+ "rewards/margins": 0.003271549940109253,
688
+ "rewards/rejected": 0.09134554117918015,
689
+ "step": 410
690
+ },
691
+ {
692
+ "epoch": 0.336,
693
+ "grad_norm": 1.3114937543869019,
694
+ "learning_rate": 9.86962962962963e-07,
695
+ "logits/chosen": 1.5342715978622437,
696
+ "logits/rejected": 1.5572118759155273,
697
+ "logps/chosen": -152.65264892578125,
698
+ "logps/rejected": -168.03384399414062,
699
+ "loss": 0.6972955703735352,
700
+ "rewards/accuracies": 0.42500001192092896,
701
+ "rewards/chosen": 0.08869597315788269,
702
+ "rewards/margins": -0.007436127867549658,
703
+ "rewards/rejected": 0.09613210707902908,
704
+ "step": 420
705
+ },
706
+ {
707
+ "epoch": 0.344,
708
+ "grad_norm": 1.0040781497955322,
709
+ "learning_rate": 9.84e-07,
710
+ "logits/chosen": 1.5979712009429932,
711
+ "logits/rejected": 1.6713664531707764,
712
+ "logps/chosen": -164.2566375732422,
713
+ "logps/rejected": -152.1428680419922,
714
+ "loss": 0.6897010326385498,
715
+ "rewards/accuracies": 0.48750001192092896,
716
+ "rewards/chosen": 0.08101107180118561,
717
+ "rewards/margins": 0.007684691343456507,
718
+ "rewards/rejected": 0.07332637906074524,
719
+ "step": 430
720
+ },
721
+ {
722
+ "epoch": 0.352,
723
+ "grad_norm": 1.0801892280578613,
724
+ "learning_rate": 9.81037037037037e-07,
725
+ "logits/chosen": 1.439635992050171,
726
+ "logits/rejected": 1.3282365798950195,
727
+ "logps/chosen": -157.23403930664062,
728
+ "logps/rejected": -146.54806518554688,
729
+ "loss": 0.6881664752960205,
730
+ "rewards/accuracies": 0.5874999761581421,
731
+ "rewards/chosen": 0.08112544566392899,
732
+ "rewards/margins": 0.010971959680318832,
733
+ "rewards/rejected": 0.07015348225831985,
734
+ "step": 440
735
+ },
736
+ {
737
+ "epoch": 0.36,
738
+ "grad_norm": 1.1139949560165405,
739
+ "learning_rate": 9.78074074074074e-07,
740
+ "logits/chosen": 1.6345123052597046,
741
+ "logits/rejected": 1.5876491069793701,
742
+ "logps/chosen": -157.6160125732422,
743
+ "logps/rejected": -160.02040100097656,
744
+ "loss": 0.6868602275848389,
745
+ "rewards/accuracies": 0.6000000238418579,
746
+ "rewards/chosen": 0.07638200372457504,
747
+ "rewards/margins": 0.013464884832501411,
748
+ "rewards/rejected": 0.06291711330413818,
749
+ "step": 450
750
+ },
751
+ {
752
+ "epoch": 0.368,
753
+ "grad_norm": 1.158035397529602,
754
+ "learning_rate": 9.751111111111112e-07,
755
+ "logits/chosen": 1.5430586338043213,
756
+ "logits/rejected": 1.6022913455963135,
757
+ "logps/chosen": -151.98919677734375,
758
+ "logps/rejected": -164.90565490722656,
759
+ "loss": 0.6891930580139161,
760
+ "rewards/accuracies": 0.5,
761
+ "rewards/chosen": 0.06889279186725616,
762
+ "rewards/margins": 0.008791089057922363,
763
+ "rewards/rejected": 0.0601017065346241,
764
+ "step": 460
765
+ },
766
+ {
767
+ "epoch": 0.376,
768
+ "grad_norm": 1.0903704166412354,
769
+ "learning_rate": 9.721481481481481e-07,
770
+ "logits/chosen": 1.313881754875183,
771
+ "logits/rejected": 1.3182194232940674,
772
+ "logps/chosen": -131.5592041015625,
773
+ "logps/rejected": -123.84442138671875,
774
+ "loss": 0.6889429569244385,
775
+ "rewards/accuracies": 0.5,
776
+ "rewards/chosen": 0.08150745928287506,
777
+ "rewards/margins": 0.009235726669430733,
778
+ "rewards/rejected": 0.07227173447608948,
779
+ "step": 470
780
+ },
781
+ {
782
+ "epoch": 0.384,
783
+ "grad_norm": 1.0766969919204712,
784
+ "learning_rate": 9.69185185185185e-07,
785
+ "logits/chosen": 1.425756812095642,
786
+ "logits/rejected": 1.476854920387268,
787
+ "logps/chosen": -148.72279357910156,
788
+ "logps/rejected": -164.40951538085938,
789
+ "loss": 0.6854116439819335,
790
+ "rewards/accuracies": 0.5874999761581421,
791
+ "rewards/chosen": 0.1009502187371254,
792
+ "rewards/margins": 0.0166789498180151,
793
+ "rewards/rejected": 0.08427127450704575,
794
+ "step": 480
795
+ },
796
+ {
797
+ "epoch": 0.392,
798
+ "grad_norm": 1.2078129053115845,
799
+ "learning_rate": 9.662222222222222e-07,
800
+ "logits/chosen": 1.5511282682418823,
801
+ "logits/rejected": 1.611383080482483,
802
+ "logps/chosen": -143.22006225585938,
803
+ "logps/rejected": -159.70176696777344,
804
+ "loss": 0.6968960762023926,
805
+ "rewards/accuracies": 0.5,
806
+ "rewards/chosen": 0.08779363334178925,
807
+ "rewards/margins": -0.00598606513813138,
808
+ "rewards/rejected": 0.09377971291542053,
809
+ "step": 490
810
+ },
811
+ {
812
+ "epoch": 0.4,
813
+ "grad_norm": 0.9106306433677673,
814
+ "learning_rate": 9.632592592592593e-07,
815
+ "logits/chosen": 1.4217239618301392,
816
+ "logits/rejected": 1.505446195602417,
817
+ "logps/chosen": -144.34109497070312,
818
+ "logps/rejected": -135.52430725097656,
819
+ "loss": 0.691196870803833,
820
+ "rewards/accuracies": 0.5,
821
+ "rewards/chosen": 0.06902458518743515,
822
+ "rewards/margins": 0.005059319548308849,
823
+ "rewards/rejected": 0.06396526843309402,
824
+ "step": 500
825
+ },
826
+ {
827
+ "epoch": 0.4,
828
+ "eval_logits/chosen": 1.5185325145721436,
829
+ "eval_logits/rejected": 1.5510345697402954,
830
+ "eval_logps/chosen": -152.7921905517578,
831
+ "eval_logps/rejected": -147.3463134765625,
832
+ "eval_loss": 0.6898502111434937,
833
+ "eval_rewards/accuracies": 0.5099999904632568,
834
+ "eval_rewards/chosen": 0.07566512376070023,
835
+ "eval_rewards/margins": 0.008083080872893333,
836
+ "eval_rewards/rejected": 0.06758204847574234,
837
+ "eval_runtime": 90.8294,
838
+ "eval_samples_per_second": 5.505,
839
+ "eval_steps_per_second": 2.752,
840
+ "step": 500
841
+ },
842
+ {
843
+ "epoch": 0.408,
844
+ "grad_norm": 1.1637762784957886,
845
+ "learning_rate": 9.602962962962962e-07,
846
+ "logits/chosen": 1.5965330600738525,
847
+ "logits/rejected": 1.5338464975357056,
848
+ "logps/chosen": -131.49871826171875,
849
+ "logps/rejected": -140.05995178222656,
850
+ "loss": 0.6868171215057373,
851
+ "rewards/accuracies": 0.5874999761581421,
852
+ "rewards/chosen": 0.08385131508111954,
853
+ "rewards/margins": 0.013950209133327007,
854
+ "rewards/rejected": 0.06990110129117966,
855
+ "step": 510
856
+ },
857
+ {
858
+ "epoch": 0.416,
859
+ "grad_norm": 1.229719638824463,
860
+ "learning_rate": 9.573333333333333e-07,
861
+ "logits/chosen": 1.7057275772094727,
862
+ "logits/rejected": 1.5898948907852173,
863
+ "logps/chosen": -166.4267120361328,
864
+ "logps/rejected": -172.22337341308594,
865
+ "loss": 0.6912094116210937,
866
+ "rewards/accuracies": 0.5625,
867
+ "rewards/chosen": 0.07427279651165009,
868
+ "rewards/margins": 0.004661654122173786,
869
+ "rewards/rejected": 0.06961113959550858,
870
+ "step": 520
871
+ },
872
+ {
873
+ "epoch": 0.424,
874
+ "grad_norm": 1.136383295059204,
875
+ "learning_rate": 9.543703703703705e-07,
876
+ "logits/chosen": 1.6207077503204346,
877
+ "logits/rejected": 1.5417520999908447,
878
+ "logps/chosen": -166.81509399414062,
879
+ "logps/rejected": -168.5408172607422,
880
+ "loss": 0.6772748470306397,
881
+ "rewards/accuracies": 0.637499988079071,
882
+ "rewards/chosen": 0.0953373983502388,
883
+ "rewards/margins": 0.033697597682476044,
884
+ "rewards/rejected": 0.06163979694247246,
885
+ "step": 530
886
+ },
887
+ {
888
+ "epoch": 0.432,
889
+ "grad_norm": 1.0640541315078735,
890
+ "learning_rate": 9.514074074074074e-07,
891
+ "logits/chosen": 1.6825854778289795,
892
+ "logits/rejected": 1.6327041387557983,
893
+ "logps/chosen": -181.39956665039062,
894
+ "logps/rejected": -158.49647521972656,
895
+ "loss": 0.6883615493774414,
896
+ "rewards/accuracies": 0.5625,
897
+ "rewards/chosen": 0.10187923908233643,
898
+ "rewards/margins": 0.010873330757021904,
899
+ "rewards/rejected": 0.09100590646266937,
900
+ "step": 540
901
+ },
902
+ {
903
+ "epoch": 0.44,
904
+ "grad_norm": 1.477909803390503,
905
+ "learning_rate": 9.484444444444444e-07,
906
+ "logits/chosen": 1.5402686595916748,
907
+ "logits/rejected": 1.6487388610839844,
908
+ "logps/chosen": -151.53814697265625,
909
+ "logps/rejected": -167.8573455810547,
910
+ "loss": 0.6915213584899902,
911
+ "rewards/accuracies": 0.512499988079071,
912
+ "rewards/chosen": 0.09472338110208511,
913
+ "rewards/margins": 0.005475065670907497,
914
+ "rewards/rejected": 0.08924831449985504,
915
+ "step": 550
916
+ },
917
+ {
918
+ "epoch": 0.448,
919
+ "grad_norm": 1.0138317346572876,
920
+ "learning_rate": 9.454814814814814e-07,
921
+ "logits/chosen": 1.6286503076553345,
922
+ "logits/rejected": 1.5223101377487183,
923
+ "logps/chosen": -154.72079467773438,
924
+ "logps/rejected": -139.2535858154297,
925
+ "loss": 0.6835216522216797,
926
+ "rewards/accuracies": 0.5625,
927
+ "rewards/chosen": 0.09194488823413849,
928
+ "rewards/margins": 0.02140027843415737,
929
+ "rewards/rejected": 0.07054460793733597,
930
+ "step": 560
931
+ },
932
+ {
933
+ "epoch": 0.456,
934
+ "grad_norm": 0.896689772605896,
935
+ "learning_rate": 9.425185185185184e-07,
936
+ "logits/chosen": 1.4857146739959717,
937
+ "logits/rejected": 1.4369819164276123,
938
+ "logps/chosen": -158.31809997558594,
939
+ "logps/rejected": -152.33676147460938,
940
+ "loss": 0.6863360404968262,
941
+ "rewards/accuracies": 0.574999988079071,
942
+ "rewards/chosen": 0.07895292341709137,
943
+ "rewards/margins": 0.016260787844657898,
944
+ "rewards/rejected": 0.06269213557243347,
945
+ "step": 570
946
+ },
947
+ {
948
+ "epoch": 0.464,
949
+ "grad_norm": 1.4772330522537231,
950
+ "learning_rate": 9.395555555555556e-07,
951
+ "logits/chosen": 1.315298318862915,
952
+ "logits/rejected": 1.4757534265518188,
953
+ "logps/chosen": -136.4202117919922,
954
+ "logps/rejected": -154.99362182617188,
955
+ "loss": 0.6916167259216308,
956
+ "rewards/accuracies": 0.550000011920929,
957
+ "rewards/chosen": 0.04677204787731171,
958
+ "rewards/margins": 0.005649724509567022,
959
+ "rewards/rejected": 0.041122324764728546,
960
+ "step": 580
961
+ },
962
+ {
963
+ "epoch": 0.472,
964
+ "grad_norm": 0.9806157350540161,
965
+ "learning_rate": 9.365925925925926e-07,
966
+ "logits/chosen": 1.465427279472351,
967
+ "logits/rejected": 1.459496259689331,
968
+ "logps/chosen": -145.07470703125,
969
+ "logps/rejected": -131.71974182128906,
970
+ "loss": 0.6857181549072265,
971
+ "rewards/accuracies": 0.5249999761581421,
972
+ "rewards/chosen": 0.07979898899793625,
973
+ "rewards/margins": 0.016965944319963455,
974
+ "rewards/rejected": 0.0628330409526825,
975
+ "step": 590
976
+ },
977
+ {
978
+ "epoch": 0.48,
979
+ "grad_norm": 1.4313396215438843,
980
+ "learning_rate": 9.336296296296295e-07,
981
+ "logits/chosen": 1.5190367698669434,
982
+ "logits/rejected": 1.563504934310913,
983
+ "logps/chosen": -132.0201416015625,
984
+ "logps/rejected": -158.89865112304688,
985
+ "loss": 0.6964848041534424,
986
+ "rewards/accuracies": 0.5625,
987
+ "rewards/chosen": 0.06544725596904755,
988
+ "rewards/margins": -0.004280097782611847,
989
+ "rewards/rejected": 0.0697273463010788,
990
+ "step": 600
991
+ },
992
+ {
993
+ "epoch": 0.48,
994
+ "eval_logits/chosen": 1.5031698942184448,
995
+ "eval_logits/rejected": 1.5348584651947021,
996
+ "eval_logps/chosen": -152.76596069335938,
997
+ "eval_logps/rejected": -147.3241729736328,
998
+ "eval_loss": 0.6903045773506165,
999
+ "eval_rewards/accuracies": 0.527999997138977,
1000
+ "eval_rewards/chosen": 0.07828804850578308,
1001
+ "eval_rewards/margins": 0.008492168970406055,
1002
+ "eval_rewards/rejected": 0.0697958841919899,
1003
+ "eval_runtime": 90.9881,
1004
+ "eval_samples_per_second": 5.495,
1005
+ "eval_steps_per_second": 2.748,
1006
+ "step": 600
1007
+ },
1008
+ {
1009
+ "epoch": 0.488,
1010
+ "grad_norm": 0.9467533230781555,
1011
+ "learning_rate": 9.306666666666666e-07,
1012
+ "logits/chosen": 1.5242283344268799,
1013
+ "logits/rejected": 1.5798314809799194,
1014
+ "logps/chosen": -131.90048217773438,
1015
+ "logps/rejected": -149.93222045898438,
1016
+ "loss": 0.6899854183197022,
1017
+ "rewards/accuracies": 0.5375000238418579,
1018
+ "rewards/chosen": 0.08419154584407806,
1019
+ "rewards/margins": 0.00830511748790741,
1020
+ "rewards/rejected": 0.07588644325733185,
1021
+ "step": 610
1022
+ },
1023
+ {
1024
+ "epoch": 0.496,
1025
+ "grad_norm": 1.8463492393493652,
1026
+ "learning_rate": 9.277037037037037e-07,
1027
+ "logits/chosen": 1.4965155124664307,
1028
+ "logits/rejected": 1.6336272954940796,
1029
+ "logps/chosen": -156.1863555908203,
1030
+ "logps/rejected": -167.74075317382812,
1031
+ "loss": 0.6977802276611328,
1032
+ "rewards/accuracies": 0.48750001192092896,
1033
+ "rewards/chosen": 0.05518193915486336,
1034
+ "rewards/margins": -0.006808985956013203,
1035
+ "rewards/rejected": 0.061990927904844284,
1036
+ "step": 620
1037
+ },
1038
+ {
1039
+ "epoch": 0.504,
1040
+ "grad_norm": 1.278316617012024,
1041
+ "learning_rate": 9.247407407407407e-07,
1042
+ "logits/chosen": 1.5074065923690796,
1043
+ "logits/rejected": 1.535696029663086,
1044
+ "logps/chosen": -167.56777954101562,
1045
+ "logps/rejected": -147.04922485351562,
1046
+ "loss": 0.6806219100952149,
1047
+ "rewards/accuracies": 0.6000000238418579,
1048
+ "rewards/chosen": 0.0745314508676529,
1049
+ "rewards/margins": 0.027104835957288742,
1050
+ "rewards/rejected": 0.047426607459783554,
1051
+ "step": 630
1052
+ },
1053
+ {
1054
+ "epoch": 0.512,
1055
+ "grad_norm": 1.0424679517745972,
1056
+ "learning_rate": 9.217777777777778e-07,
1057
+ "logits/chosen": 1.4299286603927612,
1058
+ "logits/rejected": 1.4484076499938965,
1059
+ "logps/chosen": -168.305908203125,
1060
+ "logps/rejected": -152.61854553222656,
1061
+ "loss": 0.6871783256530761,
1062
+ "rewards/accuracies": 0.48750001192092896,
1063
+ "rewards/chosen": 0.07751411944627762,
1064
+ "rewards/margins": 0.017045259475708008,
1065
+ "rewards/rejected": 0.06046885997056961,
1066
+ "step": 640
1067
+ },
1068
+ {
1069
+ "epoch": 0.52,
1070
+ "grad_norm": 1.1143999099731445,
1071
+ "learning_rate": 9.188148148148148e-07,
1072
+ "logits/chosen": 1.5520436763763428,
1073
+ "logits/rejected": 1.531718134880066,
1074
+ "logps/chosen": -154.92115783691406,
1075
+ "logps/rejected": -126.78369140625,
1076
+ "loss": 0.6889286518096924,
1077
+ "rewards/accuracies": 0.4749999940395355,
1078
+ "rewards/chosen": 0.0945608839392662,
1079
+ "rewards/margins": 0.01080345083028078,
1080
+ "rewards/rejected": 0.0837574377655983,
1081
+ "step": 650
1082
+ },
1083
+ {
1084
+ "epoch": 0.528,
1085
+ "grad_norm": 1.2762326002120972,
1086
+ "learning_rate": 9.158518518518517e-07,
1087
+ "logits/chosen": 1.5747522115707397,
1088
+ "logits/rejected": 1.4972833395004272,
1089
+ "logps/chosen": -151.13914489746094,
1090
+ "logps/rejected": -132.013916015625,
1091
+ "loss": 0.6900454998016358,
1092
+ "rewards/accuracies": 0.5375000238418579,
1093
+ "rewards/chosen": 0.07303085923194885,
1094
+ "rewards/margins": 0.008073865436017513,
1095
+ "rewards/rejected": 0.06495700031518936,
1096
+ "step": 660
1097
+ },
1098
+ {
1099
+ "epoch": 0.536,
1100
+ "grad_norm": 1.5207557678222656,
1101
+ "learning_rate": 9.128888888888888e-07,
1102
+ "logits/chosen": 1.6594665050506592,
1103
+ "logits/rejected": 1.468492031097412,
1104
+ "logps/chosen": -166.0108184814453,
1105
+ "logps/rejected": -151.38555908203125,
1106
+ "loss": 0.6934059143066407,
1107
+ "rewards/accuracies": 0.5,
1108
+ "rewards/chosen": 0.09294265508651733,
1109
+ "rewards/margins": 0.0021180796902626753,
1110
+ "rewards/rejected": 0.09082455933094025,
1111
+ "step": 670
1112
+ },
1113
+ {
1114
+ "epoch": 0.544,
1115
+ "grad_norm": 1.1073335409164429,
1116
+ "learning_rate": 9.099259259259259e-07,
1117
+ "logits/chosen": 1.5321934223175049,
1118
+ "logits/rejected": 1.5313541889190674,
1119
+ "logps/chosen": -144.5046844482422,
1120
+ "logps/rejected": -133.05470275878906,
1121
+ "loss": 0.6824594974517822,
1122
+ "rewards/accuracies": 0.6000000238418579,
1123
+ "rewards/chosen": 0.10782947391271591,
1124
+ "rewards/margins": 0.0230915155261755,
1125
+ "rewards/rejected": 0.08473796397447586,
1126
+ "step": 680
1127
+ },
1128
+ {
1129
+ "epoch": 0.552,
1130
+ "grad_norm": 1.096970558166504,
1131
+ "learning_rate": 9.069629629629629e-07,
1132
+ "logits/chosen": 1.438305139541626,
1133
+ "logits/rejected": 1.4338308572769165,
1134
+ "logps/chosen": -173.1280517578125,
1135
+ "logps/rejected": -160.20950317382812,
1136
+ "loss": 0.6878445625305176,
1137
+ "rewards/accuracies": 0.512499988079071,
1138
+ "rewards/chosen": 0.09615986794233322,
1139
+ "rewards/margins": 0.012350928969681263,
1140
+ "rewards/rejected": 0.08380892872810364,
1141
+ "step": 690
1142
+ },
1143
+ {
1144
+ "epoch": 0.56,
1145
+ "grad_norm": 1.3664878606796265,
1146
+ "learning_rate": 9.039999999999999e-07,
1147
+ "logits/chosen": 1.530369758605957,
1148
+ "logits/rejected": 1.5520622730255127,
1149
+ "logps/chosen": -159.60740661621094,
1150
+ "logps/rejected": -137.77352905273438,
1151
+ "loss": 0.6991429805755616,
1152
+ "rewards/accuracies": 0.48750001192092896,
1153
+ "rewards/chosen": 0.07249583303928375,
1154
+ "rewards/margins": -0.009333941154181957,
1155
+ "rewards/rejected": 0.08182977139949799,
1156
+ "step": 700
1157
+ },
1158
+ {
1159
+ "epoch": 0.56,
1160
+ "eval_logits/chosen": 1.5136796236038208,
1161
+ "eval_logits/rejected": 1.5456002950668335,
1162
+ "eval_logps/chosen": -152.5748748779297,
1163
+ "eval_logps/rejected": -147.17152404785156,
1164
+ "eval_loss": 0.688511848449707,
1165
+ "eval_rewards/accuracies": 0.5339999794960022,
1166
+ "eval_rewards/chosen": 0.09739598631858826,
1167
+ "eval_rewards/margins": 0.012335418723523617,
1168
+ "eval_rewards/rejected": 0.08506056666374207,
1169
+ "eval_runtime": 90.8734,
1170
+ "eval_samples_per_second": 5.502,
1171
+ "eval_steps_per_second": 2.751,
1172
+ "step": 700
1173
+ },
1174
+ {
1175
+ "epoch": 0.568,
1176
+ "grad_norm": 1.224107265472412,
1177
+ "learning_rate": 9.010370370370371e-07,
1178
+ "logits/chosen": 1.6023967266082764,
1179
+ "logits/rejected": 1.558224081993103,
1180
+ "logps/chosen": -148.47120666503906,
1181
+ "logps/rejected": -163.5648651123047,
1182
+ "loss": 0.6865221500396729,
1183
+ "rewards/accuracies": 0.5375000238418579,
1184
+ "rewards/chosen": 0.12494877725839615,
1185
+ "rewards/margins": 0.01545787788927555,
1186
+ "rewards/rejected": 0.10949089378118515,
1187
+ "step": 710
1188
+ },
1189
+ {
1190
+ "epoch": 0.576,
1191
+ "grad_norm": 1.2135164737701416,
1192
+ "learning_rate": 8.98074074074074e-07,
1193
+ "logits/chosen": 1.5902750492095947,
1194
+ "logits/rejected": 1.7012746334075928,
1195
+ "logps/chosen": -139.7012939453125,
1196
+ "logps/rejected": -135.42225646972656,
1197
+ "loss": 0.7014039039611817,
1198
+ "rewards/accuracies": 0.4749999940395355,
1199
+ "rewards/chosen": 0.09102506935596466,
1200
+ "rewards/margins": -0.014695463702082634,
1201
+ "rewards/rejected": 0.10572052001953125,
1202
+ "step": 720
1203
+ },
1204
+ {
1205
+ "epoch": 0.584,
1206
+ "grad_norm": 1.1953649520874023,
1207
+ "learning_rate": 8.95111111111111e-07,
1208
+ "logits/chosen": 1.6540731191635132,
1209
+ "logits/rejected": 1.6386349201202393,
1210
+ "logps/chosen": -153.12847900390625,
1211
+ "logps/rejected": -149.66319274902344,
1212
+ "loss": 0.6952126502990723,
1213
+ "rewards/accuracies": 0.5,
1214
+ "rewards/chosen": 0.1124584898352623,
1215
+ "rewards/margins": -0.0012063594767823815,
1216
+ "rewards/rejected": 0.11366486549377441,
1217
+ "step": 730
1218
+ },
1219
+ {
1220
+ "epoch": 0.592,
1221
+ "grad_norm": 1.147875428199768,
1222
+ "learning_rate": 8.921481481481481e-07,
1223
+ "logits/chosen": 1.5578330755233765,
1224
+ "logits/rejected": 1.4689290523529053,
1225
+ "logps/chosen": -156.93788146972656,
1226
+ "logps/rejected": -134.1832733154297,
1227
+ "loss": 0.6919830799102783,
1228
+ "rewards/accuracies": 0.512499988079071,
1229
+ "rewards/chosen": 0.09214536100625992,
1230
+ "rewards/margins": 0.004291228950023651,
1231
+ "rewards/rejected": 0.08785412460565567,
1232
+ "step": 740
1233
+ },
1234
+ {
1235
+ "epoch": 0.6,
1236
+ "grad_norm": 1.21685791015625,
1237
+ "learning_rate": 8.891851851851852e-07,
1238
+ "logits/chosen": 1.4241983890533447,
1239
+ "logits/rejected": 1.4537718296051025,
1240
+ "logps/chosen": -136.9163055419922,
1241
+ "logps/rejected": -158.44491577148438,
1242
+ "loss": 0.691422986984253,
1243
+ "rewards/accuracies": 0.5375000238418579,
1244
+ "rewards/chosen": 0.09173480421304703,
1245
+ "rewards/margins": 0.00533579895272851,
1246
+ "rewards/rejected": 0.08639900386333466,
1247
+ "step": 750
1248
+ },
1249
+ {
1250
+ "epoch": 0.608,
1251
+ "grad_norm": 1.3629519939422607,
1252
+ "learning_rate": 8.862222222222222e-07,
1253
+ "logits/chosen": 1.565716028213501,
1254
+ "logits/rejected": 1.3824570178985596,
1255
+ "logps/chosen": -138.8521728515625,
1256
+ "logps/rejected": -129.51028442382812,
1257
+ "loss": 0.6931031227111817,
1258
+ "rewards/accuracies": 0.5249999761581421,
1259
+ "rewards/chosen": 0.0763697624206543,
1260
+ "rewards/margins": 0.0021251302678138018,
1261
+ "rewards/rejected": 0.07424463331699371,
1262
+ "step": 760
1263
+ },
1264
+ {
1265
+ "epoch": 0.616,
1266
+ "grad_norm": 1.1030747890472412,
1267
+ "learning_rate": 8.832592592592593e-07,
1268
+ "logits/chosen": 1.5781702995300293,
1269
+ "logits/rejected": 1.5428775548934937,
1270
+ "logps/chosen": -141.01480102539062,
1271
+ "logps/rejected": -136.47189331054688,
1272
+ "loss": 0.6777657032012939,
1273
+ "rewards/accuracies": 0.637499988079071,
1274
+ "rewards/chosen": 0.10366280376911163,
1275
+ "rewards/margins": 0.0333605632185936,
1276
+ "rewards/rejected": 0.07030224055051804,
1277
+ "step": 770
1278
+ },
1279
+ {
1280
+ "epoch": 0.624,
1281
+ "grad_norm": 1.3228758573532104,
1282
+ "learning_rate": 8.802962962962962e-07,
1283
+ "logits/chosen": 1.451596736907959,
1284
+ "logits/rejected": 1.5489904880523682,
1285
+ "logps/chosen": -153.74618530273438,
1286
+ "logps/rejected": -159.26333618164062,
1287
+ "loss": 0.6870471000671386,
1288
+ "rewards/accuracies": 0.512499988079071,
1289
+ "rewards/chosen": 0.10403706133365631,
1290
+ "rewards/margins": 0.014494165778160095,
1291
+ "rewards/rejected": 0.08954289555549622,
1292
+ "step": 780
1293
+ },
1294
+ {
1295
+ "epoch": 0.632,
1296
+ "grad_norm": 1.3267385959625244,
1297
+ "learning_rate": 8.773333333333332e-07,
1298
+ "logits/chosen": 1.4153831005096436,
1299
+ "logits/rejected": 1.4497296810150146,
1300
+ "logps/chosen": -148.05368041992188,
1301
+ "logps/rejected": -151.3020477294922,
1302
+ "loss": 0.6877872943878174,
1303
+ "rewards/accuracies": 0.512499988079071,
1304
+ "rewards/chosen": 0.07300253212451935,
1305
+ "rewards/margins": 0.013981563039124012,
1306
+ "rewards/rejected": 0.05902096629142761,
1307
+ "step": 790
1308
+ },
1309
+ {
1310
+ "epoch": 0.64,
1311
+ "grad_norm": 1.2227483987808228,
1312
+ "learning_rate": 8.743703703703703e-07,
1313
+ "logits/chosen": 1.6773914098739624,
1314
+ "logits/rejected": 1.6918518543243408,
1315
+ "logps/chosen": -180.06863403320312,
1316
+ "logps/rejected": -194.30752563476562,
1317
+ "loss": 0.7015917778015137,
1318
+ "rewards/accuracies": 0.48750001192092896,
1319
+ "rewards/chosen": 0.07480724155902863,
1320
+ "rewards/margins": -0.013628068380057812,
1321
+ "rewards/rejected": 0.08843531459569931,
1322
+ "step": 800
1323
+ },
1324
+ {
1325
+ "epoch": 0.64,
1326
+ "eval_logits/chosen": 1.5051140785217285,
1327
+ "eval_logits/rejected": 1.5367870330810547,
1328
+ "eval_logps/chosen": -152.8026885986328,
1329
+ "eval_logps/rejected": -147.39437866210938,
1330
+ "eval_loss": 0.688633918762207,
1331
+ "eval_rewards/accuracies": 0.5220000147819519,
1332
+ "eval_rewards/chosen": 0.07461711764335632,
1333
+ "eval_rewards/margins": 0.011839868500828743,
1334
+ "eval_rewards/rejected": 0.06277725100517273,
1335
+ "eval_runtime": 90.9206,
1336
+ "eval_samples_per_second": 5.499,
1337
+ "eval_steps_per_second": 2.75,
1338
+ "step": 800
1339
+ },
1340
+ {
1341
+ "epoch": 0.648,
1342
+ "grad_norm": 1.209770679473877,
1343
+ "learning_rate": 8.714074074074074e-07,
1344
+ "logits/chosen": 1.3785041570663452,
1345
+ "logits/rejected": 1.60085928440094,
1346
+ "logps/chosen": -114.5325927734375,
1347
+ "logps/rejected": -151.88900756835938,
1348
+ "loss": 0.6890413761138916,
1349
+ "rewards/accuracies": 0.5249999761581421,
1350
+ "rewards/chosen": 0.0640551745891571,
1351
+ "rewards/margins": 0.010126596316695213,
1352
+ "rewards/rejected": 0.05392857640981674,
1353
+ "step": 810
1354
+ },
1355
+ {
1356
+ "epoch": 0.656,
1357
+ "grad_norm": 1.0250163078308105,
1358
+ "learning_rate": 8.684444444444444e-07,
1359
+ "logits/chosen": 1.5827982425689697,
1360
+ "logits/rejected": 1.6239618062973022,
1361
+ "logps/chosen": -149.01904296875,
1362
+ "logps/rejected": -141.9372100830078,
1363
+ "loss": 0.6728014469146728,
1364
+ "rewards/accuracies": 0.675000011920929,
1365
+ "rewards/chosen": 0.0830039232969284,
1366
+ "rewards/margins": 0.043090131133794785,
1367
+ "rewards/rejected": 0.03991378843784332,
1368
+ "step": 820
1369
+ },
1370
+ {
1371
+ "epoch": 0.664,
1372
+ "grad_norm": 1.0627655982971191,
1373
+ "learning_rate": 8.654814814814814e-07,
1374
+ "logits/chosen": 1.501873254776001,
1375
+ "logits/rejected": 1.537386417388916,
1376
+ "logps/chosen": -138.75169372558594,
1377
+ "logps/rejected": -134.00970458984375,
1378
+ "loss": 0.6801187992095947,
1379
+ "rewards/accuracies": 0.625,
1380
+ "rewards/chosen": 0.0640982910990715,
1381
+ "rewards/margins": 0.02855527400970459,
1382
+ "rewards/rejected": 0.03554301708936691,
1383
+ "step": 830
1384
+ },
1385
+ {
1386
+ "epoch": 0.672,
1387
+ "grad_norm": 1.2453433275222778,
1388
+ "learning_rate": 8.625185185185186e-07,
1389
+ "logits/chosen": 1.4766005277633667,
1390
+ "logits/rejected": 1.5185787677764893,
1391
+ "logps/chosen": -148.41250610351562,
1392
+ "logps/rejected": -157.73989868164062,
1393
+ "loss": 0.6788079738616943,
1394
+ "rewards/accuracies": 0.550000011920929,
1395
+ "rewards/chosen": 0.06551454961299896,
1396
+ "rewards/margins": 0.03301671892404556,
1397
+ "rewards/rejected": 0.0324978306889534,
1398
+ "step": 840
1399
+ },
1400
+ {
1401
+ "epoch": 0.68,
1402
+ "grad_norm": 1.191698670387268,
1403
+ "learning_rate": 8.595555555555555e-07,
1404
+ "logits/chosen": 1.555551528930664,
1405
+ "logits/rejected": 1.5438499450683594,
1406
+ "logps/chosen": -160.77488708496094,
1407
+ "logps/rejected": -141.9876251220703,
1408
+ "loss": 0.6801597595214843,
1409
+ "rewards/accuracies": 0.5375000238418579,
1410
+ "rewards/chosen": 0.053498052060604095,
1411
+ "rewards/margins": 0.028470590710639954,
1412
+ "rewards/rejected": 0.025027472525835037,
1413
+ "step": 850
1414
+ },
1415
+ {
1416
+ "epoch": 0.688,
1417
+ "grad_norm": 0.9382092356681824,
1418
+ "learning_rate": 8.565925925925925e-07,
1419
+ "logits/chosen": 1.5482738018035889,
1420
+ "logits/rejected": 1.5833700895309448,
1421
+ "logps/chosen": -139.68118286132812,
1422
+ "logps/rejected": -130.05947875976562,
1423
+ "loss": 0.6825921535491943,
1424
+ "rewards/accuracies": 0.5375000238418579,
1425
+ "rewards/chosen": 0.06382572650909424,
1426
+ "rewards/margins": 0.023111844435334206,
1427
+ "rewards/rejected": 0.04071388021111488,
1428
+ "step": 860
1429
+ },
1430
+ {
1431
+ "epoch": 0.696,
1432
+ "grad_norm": 1.2248382568359375,
1433
+ "learning_rate": 8.536296296296296e-07,
1434
+ "logits/chosen": 1.4529926776885986,
1435
+ "logits/rejected": 1.4423902034759521,
1436
+ "logps/chosen": -154.4588165283203,
1437
+ "logps/rejected": -153.44949340820312,
1438
+ "loss": 0.6915592193603516,
1439
+ "rewards/accuracies": 0.5249999761581421,
1440
+ "rewards/chosen": 0.03207828477025032,
1441
+ "rewards/margins": 0.006785523146390915,
1442
+ "rewards/rejected": 0.025292763486504555,
1443
+ "step": 870
1444
+ },
1445
+ {
1446
+ "epoch": 0.704,
1447
+ "grad_norm": 1.2174893617630005,
1448
+ "learning_rate": 8.506666666666667e-07,
1449
+ "logits/chosen": 1.4073113203048706,
1450
+ "logits/rejected": 1.668140172958374,
1451
+ "logps/chosen": -145.18460083007812,
1452
+ "logps/rejected": -157.0777587890625,
1453
+ "loss": 0.6922356605529785,
1454
+ "rewards/accuracies": 0.48750001192092896,
1455
+ "rewards/chosen": 0.049450717866420746,
1456
+ "rewards/margins": 0.004049716051667929,
1457
+ "rewards/rejected": 0.04540099948644638,
1458
+ "step": 880
1459
+ },
1460
+ {
1461
+ "epoch": 0.712,
1462
+ "grad_norm": 1.3186850547790527,
1463
+ "learning_rate": 8.477037037037037e-07,
1464
+ "logits/chosen": 1.2923892736434937,
1465
+ "logits/rejected": 1.491051435470581,
1466
+ "logps/chosen": -149.03018188476562,
1467
+ "logps/rejected": -161.61715698242188,
1468
+ "loss": 0.6841150760650635,
1469
+ "rewards/accuracies": 0.5375000238418579,
1470
+ "rewards/chosen": 0.038961172103881836,
1471
+ "rewards/margins": 0.020759906619787216,
1472
+ "rewards/rejected": 0.01820126175880432,
1473
+ "step": 890
1474
+ },
1475
+ {
1476
+ "epoch": 0.72,
1477
+ "grad_norm": 1.1123124361038208,
1478
+ "learning_rate": 8.447407407407407e-07,
1479
+ "logits/chosen": 1.546547293663025,
1480
+ "logits/rejected": 1.682992696762085,
1481
+ "logps/chosen": -166.2056884765625,
1482
+ "logps/rejected": -180.73330688476562,
1483
+ "loss": 0.6826215744018554,
1484
+ "rewards/accuracies": 0.574999988079071,
1485
+ "rewards/chosen": 0.07279330492019653,
1486
+ "rewards/margins": 0.025264525786042213,
1487
+ "rewards/rejected": 0.04752878472208977,
1488
+ "step": 900
1489
+ },
1490
+ {
1491
+ "epoch": 0.72,
1492
+ "eval_logits/chosen": 1.4819762706756592,
1493
+ "eval_logits/rejected": 1.513027548789978,
1494
+ "eval_logps/chosen": -153.1695556640625,
1495
+ "eval_logps/rejected": -147.7560577392578,
1496
+ "eval_loss": 0.689373254776001,
1497
+ "eval_rewards/accuracies": 0.5299999713897705,
1498
+ "eval_rewards/chosen": 0.03792775049805641,
1499
+ "eval_rewards/margins": 0.011320074088871479,
1500
+ "eval_rewards/rejected": 0.026607677340507507,
1501
+ "eval_runtime": 90.9603,
1502
+ "eval_samples_per_second": 5.497,
1503
+ "eval_steps_per_second": 2.748,
1504
+ "step": 900
1505
+ },
1506
+ {
1507
+ "epoch": 0.728,
1508
+ "grad_norm": 1.1507582664489746,
1509
+ "learning_rate": 8.417777777777777e-07,
1510
+ "logits/chosen": 1.5831308364868164,
1511
+ "logits/rejected": 1.3737132549285889,
1512
+ "logps/chosen": -140.00253295898438,
1513
+ "logps/rejected": -138.45155334472656,
1514
+ "loss": 0.6873391628265381,
1515
+ "rewards/accuracies": 0.574999988079071,
1516
+ "rewards/chosen": 0.05066479369997978,
1517
+ "rewards/margins": 0.015466553159058094,
1518
+ "rewards/rejected": 0.03519824147224426,
1519
+ "step": 910
1520
+ },
1521
+ {
1522
+ "epoch": 0.736,
1523
+ "grad_norm": 1.415456771850586,
1524
+ "learning_rate": 8.388148148148147e-07,
1525
+ "logits/chosen": 1.3000686168670654,
1526
+ "logits/rejected": 1.2642180919647217,
1527
+ "logps/chosen": -160.14027404785156,
1528
+ "logps/rejected": -163.32827758789062,
1529
+ "loss": 0.687142276763916,
1530
+ "rewards/accuracies": 0.5249999761581421,
1531
+ "rewards/chosen": 0.06618930399417877,
1532
+ "rewards/margins": 0.014807088300585747,
1533
+ "rewards/rejected": 0.05138222128152847,
1534
+ "step": 920
1535
+ },
1536
+ {
1537
+ "epoch": 0.744,
1538
+ "grad_norm": 1.3228634595870972,
1539
+ "learning_rate": 8.358518518518518e-07,
1540
+ "logits/chosen": 1.5204170942306519,
1541
+ "logits/rejected": 1.6129217147827148,
1542
+ "logps/chosen": -157.99832153320312,
1543
+ "logps/rejected": -162.09567260742188,
1544
+ "loss": 0.6790886878967285,
1545
+ "rewards/accuracies": 0.625,
1546
+ "rewards/chosen": 0.05940447002649307,
1547
+ "rewards/margins": 0.03161252290010452,
1548
+ "rewards/rejected": 0.027791941538453102,
1549
+ "step": 930
1550
+ },
1551
+ {
1552
+ "epoch": 0.752,
1553
+ "grad_norm": 1.048064947128296,
1554
+ "learning_rate": 8.328888888888889e-07,
1555
+ "logits/chosen": 1.5515320301055908,
1556
+ "logits/rejected": 1.676857352256775,
1557
+ "logps/chosen": -148.4853515625,
1558
+ "logps/rejected": -174.4320526123047,
1559
+ "loss": 0.6935453414916992,
1560
+ "rewards/accuracies": 0.5375000238418579,
1561
+ "rewards/chosen": 0.010225490666925907,
1562
+ "rewards/margins": 0.0021693313028663397,
1563
+ "rewards/rejected": 0.008056161925196648,
1564
+ "step": 940
1565
+ },
1566
+ {
1567
+ "epoch": 0.76,
1568
+ "grad_norm": 1.247799038887024,
1569
+ "learning_rate": 8.299259259259259e-07,
1570
+ "logits/chosen": 1.384231686592102,
1571
+ "logits/rejected": 1.4611561298370361,
1572
+ "logps/chosen": -140.81443786621094,
1573
+ "logps/rejected": -153.81326293945312,
1574
+ "loss": 0.6805226325988769,
1575
+ "rewards/accuracies": 0.6000000238418579,
1576
+ "rewards/chosen": 0.053223587572574615,
1577
+ "rewards/margins": 0.02933613583445549,
1578
+ "rewards/rejected": 0.023887457326054573,
1579
+ "step": 950
1580
+ },
1581
+ {
1582
+ "epoch": 0.768,
1583
+ "grad_norm": 1.1611264944076538,
1584
+ "learning_rate": 8.269629629629629e-07,
1585
+ "logits/chosen": 1.4987528324127197,
1586
+ "logits/rejected": 1.3684321641921997,
1587
+ "logps/chosen": -135.81056213378906,
1588
+ "logps/rejected": -119.05133056640625,
1589
+ "loss": 0.6954316616058349,
1590
+ "rewards/accuracies": 0.4749999940395355,
1591
+ "rewards/chosen": 0.0197703056037426,
1592
+ "rewards/margins": -0.0012637012405321002,
1593
+ "rewards/rejected": 0.021034007892012596,
1594
+ "step": 960
1595
+ },
1596
+ {
1597
+ "epoch": 0.776,
1598
+ "grad_norm": 0.964948832988739,
1599
+ "learning_rate": 8.24e-07,
1600
+ "logits/chosen": 1.612053632736206,
1601
+ "logits/rejected": 1.7114235162734985,
1602
+ "logps/chosen": -153.56137084960938,
1603
+ "logps/rejected": -149.17323303222656,
1604
+ "loss": 0.6916662693023682,
1605
+ "rewards/accuracies": 0.512499988079071,
1606
+ "rewards/chosen": 0.05082285404205322,
1607
+ "rewards/margins": 0.0056606316938996315,
1608
+ "rewards/rejected": 0.04516221210360527,
1609
+ "step": 970
1610
+ },
1611
+ {
1612
+ "epoch": 0.784,
1613
+ "grad_norm": 1.409449815750122,
1614
+ "learning_rate": 8.21037037037037e-07,
1615
+ "logits/chosen": 1.384060025215149,
1616
+ "logits/rejected": 1.316361904144287,
1617
+ "logps/chosen": -168.75360107421875,
1618
+ "logps/rejected": -141.79537963867188,
1619
+ "loss": 0.6845529556274415,
1620
+ "rewards/accuracies": 0.6499999761581421,
1621
+ "rewards/chosen": 0.0886571854352951,
1622
+ "rewards/margins": 0.02084263041615486,
1623
+ "rewards/rejected": 0.06781454384326935,
1624
+ "step": 980
1625
+ },
1626
+ {
1627
+ "epoch": 0.792,
1628
+ "grad_norm": 1.0246562957763672,
1629
+ "learning_rate": 8.18074074074074e-07,
1630
+ "logits/chosen": 1.4986507892608643,
1631
+ "logits/rejected": 1.571103811264038,
1632
+ "logps/chosen": -165.22254943847656,
1633
+ "logps/rejected": -168.63540649414062,
1634
+ "loss": 0.6790967464447022,
1635
+ "rewards/accuracies": 0.5874999761581421,
1636
+ "rewards/chosen": 0.07517905533313751,
1637
+ "rewards/margins": 0.03272219002246857,
1638
+ "rewards/rejected": 0.04245685786008835,
1639
+ "step": 990
1640
+ },
1641
+ {
1642
+ "epoch": 0.8,
1643
+ "grad_norm": 1.014426827430725,
1644
+ "learning_rate": 8.15111111111111e-07,
1645
+ "logits/chosen": 1.4708755016326904,
1646
+ "logits/rejected": 1.4550955295562744,
1647
+ "logps/chosen": -149.49874877929688,
1648
+ "logps/rejected": -125.0966567993164,
1649
+ "loss": 0.6892098903656005,
1650
+ "rewards/accuracies": 0.512499988079071,
1651
+ "rewards/chosen": 0.07825140655040741,
1652
+ "rewards/margins": 0.01007872074842453,
1653
+ "rewards/rejected": 0.06817268580198288,
1654
+ "step": 1000
1655
+ },
1656
+ {
1657
+ "epoch": 0.8,
1658
+ "eval_logits/chosen": 1.4972777366638184,
1659
+ "eval_logits/rejected": 1.5283576250076294,
1660
+ "eval_logps/chosen": -152.8240966796875,
1661
+ "eval_logps/rejected": -147.44175720214844,
1662
+ "eval_loss": 0.6880542039871216,
1663
+ "eval_rewards/accuracies": 0.5419999957084656,
1664
+ "eval_rewards/chosen": 0.07247376441955566,
1665
+ "eval_rewards/margins": 0.01443479023873806,
1666
+ "eval_rewards/rejected": 0.058038972318172455,
1667
+ "eval_runtime": 90.827,
1668
+ "eval_samples_per_second": 5.505,
1669
+ "eval_steps_per_second": 2.752,
1670
+ "step": 1000
1671
+ },
1672
+ {
1673
+ "epoch": 0.808,
1674
+ "grad_norm": 1.1155726909637451,
1675
+ "learning_rate": 8.121481481481482e-07,
1676
+ "logits/chosen": 1.547481894493103,
1677
+ "logits/rejected": 1.5461527109146118,
1678
+ "logps/chosen": -154.89791870117188,
1679
+ "logps/rejected": -142.05111694335938,
1680
+ "loss": 0.6809016704559326,
1681
+ "rewards/accuracies": 0.6625000238418579,
1682
+ "rewards/chosen": 0.10093016922473907,
1683
+ "rewards/margins": 0.027038773521780968,
1684
+ "rewards/rejected": 0.07389138638973236,
1685
+ "step": 1010
1686
+ },
1687
+ {
1688
+ "epoch": 0.816,
1689
+ "grad_norm": 1.6716786623001099,
1690
+ "learning_rate": 8.091851851851852e-07,
1691
+ "logits/chosen": 1.3954919576644897,
1692
+ "logits/rejected": 1.5088036060333252,
1693
+ "logps/chosen": -143.11846923828125,
1694
+ "logps/rejected": -146.41482543945312,
1695
+ "loss": 0.6849985122680664,
1696
+ "rewards/accuracies": 0.625,
1697
+ "rewards/chosen": 0.05970728397369385,
1698
+ "rewards/margins": 0.021999523043632507,
1699
+ "rewards/rejected": 0.03770776465535164,
1700
+ "step": 1020
1701
+ },
1702
+ {
1703
+ "epoch": 0.824,
1704
+ "grad_norm": 1.308182716369629,
1705
+ "learning_rate": 8.062222222222221e-07,
1706
+ "logits/chosen": 1.532488465309143,
1707
+ "logits/rejected": 1.3316718339920044,
1708
+ "logps/chosen": -163.06858825683594,
1709
+ "logps/rejected": -136.57989501953125,
1710
+ "loss": 0.6789252281188964,
1711
+ "rewards/accuracies": 0.5874999761581421,
1712
+ "rewards/chosen": 0.11086118221282959,
1713
+ "rewards/margins": 0.033117204904556274,
1714
+ "rewards/rejected": 0.07774396985769272,
1715
+ "step": 1030
1716
+ },
1717
+ {
1718
+ "epoch": 0.832,
1719
+ "grad_norm": 1.1019353866577148,
1720
+ "learning_rate": 8.032592592592592e-07,
1721
+ "logits/chosen": 1.4798121452331543,
1722
+ "logits/rejected": 1.5203083753585815,
1723
+ "logps/chosen": -140.3281707763672,
1724
+ "logps/rejected": -141.06216430664062,
1725
+ "loss": 0.6724937915802002,
1726
+ "rewards/accuracies": 0.637499988079071,
1727
+ "rewards/chosen": 0.1202714815735817,
1728
+ "rewards/margins": 0.046712182462215424,
1729
+ "rewards/rejected": 0.07355931401252747,
1730
+ "step": 1040
1731
+ },
1732
+ {
1733
+ "epoch": 0.84,
1734
+ "grad_norm": 1.2966418266296387,
1735
+ "learning_rate": 8.002962962962963e-07,
1736
+ "logits/chosen": 1.3034439086914062,
1737
+ "logits/rejected": 1.4363505840301514,
1738
+ "logps/chosen": -142.1992645263672,
1739
+ "logps/rejected": -132.0046844482422,
1740
+ "loss": 0.6848338603973388,
1741
+ "rewards/accuracies": 0.6000000238418579,
1742
+ "rewards/chosen": 0.11126589775085449,
1743
+ "rewards/margins": 0.019870441406965256,
1744
+ "rewards/rejected": 0.09139545261859894,
1745
+ "step": 1050
1746
+ },
1747
+ {
1748
+ "epoch": 0.848,
1749
+ "grad_norm": 1.0321130752563477,
1750
+ "learning_rate": 7.973333333333333e-07,
1751
+ "logits/chosen": 1.3257265090942383,
1752
+ "logits/rejected": 1.4649587869644165,
1753
+ "logps/chosen": -112.98075103759766,
1754
+ "logps/rejected": -127.4059829711914,
1755
+ "loss": 0.6679783344268799,
1756
+ "rewards/accuracies": 0.612500011920929,
1757
+ "rewards/chosen": 0.17311830818653107,
1758
+ "rewards/margins": 0.05631983280181885,
1759
+ "rewards/rejected": 0.11679847538471222,
1760
+ "step": 1060
1761
+ },
1762
+ {
1763
+ "epoch": 0.856,
1764
+ "grad_norm": 1.2037222385406494,
1765
+ "learning_rate": 7.943703703703704e-07,
1766
+ "logits/chosen": 1.3766987323760986,
1767
+ "logits/rejected": 1.2802103757858276,
1768
+ "logps/chosen": -154.8142547607422,
1769
+ "logps/rejected": -148.680908203125,
1770
+ "loss": 0.668931531906128,
1771
+ "rewards/accuracies": 0.6499999761581421,
1772
+ "rewards/chosen": 0.1449051946401596,
1773
+ "rewards/margins": 0.05721765011548996,
1774
+ "rewards/rejected": 0.08768755197525024,
1775
+ "step": 1070
1776
+ },
1777
+ {
1778
+ "epoch": 0.864,
1779
+ "grad_norm": 0.9230766296386719,
1780
+ "learning_rate": 7.914074074074074e-07,
1781
+ "logits/chosen": 1.447328805923462,
1782
+ "logits/rejected": 1.5235106945037842,
1783
+ "logps/chosen": -150.21812438964844,
1784
+ "logps/rejected": -123.29923248291016,
1785
+ "loss": 0.6716424465179444,
1786
+ "rewards/accuracies": 0.5874999761581421,
1787
+ "rewards/chosen": 0.16066041588783264,
1788
+ "rewards/margins": 0.04824484512209892,
1789
+ "rewards/rejected": 0.11241557449102402,
1790
+ "step": 1080
1791
+ },
1792
+ {
1793
+ "epoch": 0.872,
1794
+ "grad_norm": 1.0423877239227295,
1795
+ "learning_rate": 7.884444444444443e-07,
1796
+ "logits/chosen": 1.6920299530029297,
1797
+ "logits/rejected": 1.6123542785644531,
1798
+ "logps/chosen": -175.1676025390625,
1799
+ "logps/rejected": -185.07223510742188,
1800
+ "loss": 0.6723968505859375,
1801
+ "rewards/accuracies": 0.612500011920929,
1802
+ "rewards/chosen": 0.16490009427070618,
1803
+ "rewards/margins": 0.046577490866184235,
1804
+ "rewards/rejected": 0.11832261085510254,
1805
+ "step": 1090
1806
+ },
1807
+ {
1808
+ "epoch": 0.88,
1809
+ "grad_norm": 1.4142779111862183,
1810
+ "learning_rate": 7.854814814814814e-07,
1811
+ "logits/chosen": 1.453111171722412,
1812
+ "logits/rejected": 1.4981731176376343,
1813
+ "logps/chosen": -139.9765167236328,
1814
+ "logps/rejected": -153.63058471679688,
1815
+ "loss": 0.6872457027435303,
1816
+ "rewards/accuracies": 0.5625,
1817
+ "rewards/chosen": 0.13096000254154205,
1818
+ "rewards/margins": 0.01702108606696129,
1819
+ "rewards/rejected": 0.11393891274929047,
1820
+ "step": 1100
1821
+ },
1822
+ {
1823
+ "epoch": 0.88,
1824
+ "eval_logits/chosen": 1.5076195001602173,
1825
+ "eval_logits/rejected": 1.5373249053955078,
1826
+ "eval_logps/chosen": -152.1697998046875,
1827
+ "eval_logps/rejected": -146.83502197265625,
1828
+ "eval_loss": 0.6873304843902588,
1829
+ "eval_rewards/accuracies": 0.5360000133514404,
1830
+ "eval_rewards/chosen": 0.13790392875671387,
1831
+ "eval_rewards/margins": 0.019191108644008636,
1832
+ "eval_rewards/rejected": 0.11871281266212463,
1833
+ "eval_runtime": 91.0157,
1834
+ "eval_samples_per_second": 5.494,
1835
+ "eval_steps_per_second": 2.747,
1836
+ "step": 1100
1837
+ },
1838
+ {
1839
+ "epoch": 0.888,
1840
+ "grad_norm": 1.413996696472168,
1841
+ "learning_rate": 7.825185185185185e-07,
1842
+ "logits/chosen": 1.5183905363082886,
1843
+ "logits/rejected": 1.4942539930343628,
1844
+ "logps/chosen": -156.383544921875,
1845
+ "logps/rejected": -145.61398315429688,
1846
+ "loss": 0.6810199737548828,
1847
+ "rewards/accuracies": 0.5249999761581421,
1848
+ "rewards/chosen": 0.14243564009666443,
1849
+ "rewards/margins": 0.03136260434985161,
1850
+ "rewards/rejected": 0.11107305437326431,
1851
+ "step": 1110
1852
+ },
1853
+ {
1854
+ "epoch": 0.896,
1855
+ "grad_norm": 1.0046707391738892,
1856
+ "learning_rate": 7.795555555555555e-07,
1857
+ "logits/chosen": 1.4982213973999023,
1858
+ "logits/rejected": 1.5465660095214844,
1859
+ "logps/chosen": -156.79293823242188,
1860
+ "logps/rejected": -150.77853393554688,
1861
+ "loss": 0.6808318614959716,
1862
+ "rewards/accuracies": 0.6499999761581421,
1863
+ "rewards/chosen": 0.1551930010318756,
1864
+ "rewards/margins": 0.028187204152345657,
1865
+ "rewards/rejected": 0.12700578570365906,
1866
+ "step": 1120
1867
+ },
1868
+ {
1869
+ "epoch": 0.904,
1870
+ "grad_norm": 1.125625729560852,
1871
+ "learning_rate": 7.765925925925925e-07,
1872
+ "logits/chosen": 1.6053575277328491,
1873
+ "logits/rejected": 1.47335946559906,
1874
+ "logps/chosen": -148.05690002441406,
1875
+ "logps/rejected": -135.36598205566406,
1876
+ "loss": 0.6889908790588379,
1877
+ "rewards/accuracies": 0.550000011920929,
1878
+ "rewards/chosen": 0.15128108859062195,
1879
+ "rewards/margins": 0.015811312943696976,
1880
+ "rewards/rejected": 0.13546979427337646,
1881
+ "step": 1130
1882
+ },
1883
+ {
1884
+ "epoch": 0.912,
1885
+ "grad_norm": 1.0933659076690674,
1886
+ "learning_rate": 7.736296296296297e-07,
1887
+ "logits/chosen": 1.5862042903900146,
1888
+ "logits/rejected": 1.5827430486679077,
1889
+ "logps/chosen": -156.10145568847656,
1890
+ "logps/rejected": -142.94851684570312,
1891
+ "loss": 0.6753206253051758,
1892
+ "rewards/accuracies": 0.5874999761581421,
1893
+ "rewards/chosen": 0.1547432243824005,
1894
+ "rewards/margins": 0.04229574277997017,
1895
+ "rewards/rejected": 0.11244747787714005,
1896
+ "step": 1140
1897
+ },
1898
+ {
1899
+ "epoch": 0.92,
1900
+ "grad_norm": 1.0774686336517334,
1901
+ "learning_rate": 7.706666666666667e-07,
1902
+ "logits/chosen": 1.2369152307510376,
1903
+ "logits/rejected": 1.2727023363113403,
1904
+ "logps/chosen": -139.23147583007812,
1905
+ "logps/rejected": -130.53038024902344,
1906
+ "loss": 0.6967329025268555,
1907
+ "rewards/accuracies": 0.4375,
1908
+ "rewards/chosen": 0.10801626741886139,
1909
+ "rewards/margins": 0.004670143127441406,
1910
+ "rewards/rejected": 0.10334610939025879,
1911
+ "step": 1150
1912
+ },
1913
+ {
1914
+ "epoch": 0.928,
1915
+ "grad_norm": 1.1111780405044556,
1916
+ "learning_rate": 7.677037037037036e-07,
1917
+ "logits/chosen": 1.501250982284546,
1918
+ "logits/rejected": 1.4272868633270264,
1919
+ "logps/chosen": -127.09858703613281,
1920
+ "logps/rejected": -114.9041748046875,
1921
+ "loss": 0.6853152275085449,
1922
+ "rewards/accuracies": 0.574999988079071,
1923
+ "rewards/chosen": 0.1366150826215744,
1924
+ "rewards/margins": 0.021945688873529434,
1925
+ "rewards/rejected": 0.11466939747333527,
1926
+ "step": 1160
1927
+ },
1928
+ {
1929
+ "epoch": 0.936,
1930
+ "grad_norm": 1.5593854188919067,
1931
+ "learning_rate": 7.647407407407407e-07,
1932
+ "logits/chosen": 1.727927803993225,
1933
+ "logits/rejected": 1.6510727405548096,
1934
+ "logps/chosen": -156.70993041992188,
1935
+ "logps/rejected": -167.07266235351562,
1936
+ "loss": 0.6884279251098633,
1937
+ "rewards/accuracies": 0.550000011920929,
1938
+ "rewards/chosen": 0.14902853965759277,
1939
+ "rewards/margins": 0.01685408689081669,
1940
+ "rewards/rejected": 0.13217447698116302,
1941
+ "step": 1170
1942
+ },
1943
+ {
1944
+ "epoch": 0.944,
1945
+ "grad_norm": 2.0064918994903564,
1946
+ "learning_rate": 7.617777777777778e-07,
1947
+ "logits/chosen": 1.6262975931167603,
1948
+ "logits/rejected": 1.58951997756958,
1949
+ "logps/chosen": -186.59210205078125,
1950
+ "logps/rejected": -151.2965850830078,
1951
+ "loss": 0.6906405448913574,
1952
+ "rewards/accuracies": 0.48750001192092896,
1953
+ "rewards/chosen": 0.14898104965686798,
1954
+ "rewards/margins": 0.013303476385772228,
1955
+ "rewards/rejected": 0.13567757606506348,
1956
+ "step": 1180
1957
+ },
1958
+ {
1959
+ "epoch": 0.952,
1960
+ "grad_norm": 1.209291696548462,
1961
+ "learning_rate": 7.588148148148148e-07,
1962
+ "logits/chosen": 1.639369249343872,
1963
+ "logits/rejected": 1.6400636434555054,
1964
+ "logps/chosen": -166.24684143066406,
1965
+ "logps/rejected": -196.35305786132812,
1966
+ "loss": 0.6789956092834473,
1967
+ "rewards/accuracies": 0.5874999761581421,
1968
+ "rewards/chosen": 0.1348457634449005,
1969
+ "rewards/margins": 0.03475186973810196,
1970
+ "rewards/rejected": 0.10009388625621796,
1971
+ "step": 1190
1972
+ },
1973
+ {
1974
+ "epoch": 0.96,
1975
+ "grad_norm": 1.209230899810791,
1976
+ "learning_rate": 7.558518518518519e-07,
1977
+ "logits/chosen": 1.542942762374878,
1978
+ "logits/rejected": 1.5128916501998901,
1979
+ "logps/chosen": -138.994384765625,
1980
+ "logps/rejected": -145.42880249023438,
1981
+ "loss": 0.6936234951019287,
1982
+ "rewards/accuracies": 0.5874999761581421,
1983
+ "rewards/chosen": 0.13895569741725922,
1984
+ "rewards/margins": 0.005582497920840979,
1985
+ "rewards/rejected": 0.1333732008934021,
1986
+ "step": 1200
1987
+ },
1988
+ {
1989
+ "epoch": 0.96,
1990
+ "eval_logits/chosen": 1.5055651664733887,
1991
+ "eval_logits/rejected": 1.5351229906082153,
1992
+ "eval_logps/chosen": -152.2256622314453,
1993
+ "eval_logps/rejected": -146.909912109375,
1994
+ "eval_loss": 0.6866763830184937,
1995
+ "eval_rewards/accuracies": 0.5360000133514404,
1996
+ "eval_rewards/chosen": 0.13231882452964783,
1997
+ "eval_rewards/margins": 0.02109396643936634,
1998
+ "eval_rewards/rejected": 0.11122485250234604,
1999
+ "eval_runtime": 90.8141,
2000
+ "eval_samples_per_second": 5.506,
2001
+ "eval_steps_per_second": 2.753,
2002
+ "step": 1200
2003
+ },
2004
+ {
2005
+ "epoch": 0.968,
2006
+ "grad_norm": 1.2058950662612915,
2007
+ "learning_rate": 7.528888888888889e-07,
2008
+ "logits/chosen": 1.3591210842132568,
2009
+ "logits/rejected": 1.3811699151992798,
2010
+ "logps/chosen": -155.1759033203125,
2011
+ "logps/rejected": -141.97772216796875,
2012
+ "loss": 0.6902606964111329,
2013
+ "rewards/accuracies": 0.48750001192092896,
2014
+ "rewards/chosen": 0.153532013297081,
2015
+ "rewards/margins": 0.011359233409166336,
2016
+ "rewards/rejected": 0.14217278361320496,
2017
+ "step": 1210
2018
+ },
2019
+ {
2020
+ "epoch": 0.976,
2021
+ "grad_norm": 1.366702914237976,
2022
+ "learning_rate": 7.499259259259258e-07,
2023
+ "logits/chosen": 1.3841370344161987,
2024
+ "logits/rejected": 1.3791372776031494,
2025
+ "logps/chosen": -145.47613525390625,
2026
+ "logps/rejected": -123.9173583984375,
2027
+ "loss": 0.6971709251403808,
2028
+ "rewards/accuracies": 0.42500001192092896,
2029
+ "rewards/chosen": 0.10521620512008667,
2030
+ "rewards/margins": -0.0030749079305678606,
2031
+ "rewards/rejected": 0.1082911267876625,
2032
+ "step": 1220
2033
+ },
2034
+ {
2035
+ "epoch": 0.984,
2036
+ "grad_norm": 1.477455496788025,
2037
+ "learning_rate": 7.469629629629629e-07,
2038
+ "logits/chosen": 1.4447228908538818,
2039
+ "logits/rejected": 1.6022417545318604,
2040
+ "logps/chosen": -137.9462127685547,
2041
+ "logps/rejected": -150.25559997558594,
2042
+ "loss": 0.6915127277374268,
2043
+ "rewards/accuracies": 0.5874999761581421,
2044
+ "rewards/chosen": 0.15108394622802734,
2045
+ "rewards/margins": 0.009388584643602371,
2046
+ "rewards/rejected": 0.14169538021087646,
2047
+ "step": 1230
2048
+ },
2049
+ {
2050
+ "epoch": 0.992,
2051
+ "grad_norm": 1.4407854080200195,
2052
+ "learning_rate": 7.44e-07,
2053
+ "logits/chosen": 1.3998825550079346,
2054
+ "logits/rejected": 1.5924925804138184,
2055
+ "logps/chosen": -144.93775939941406,
2056
+ "logps/rejected": -148.02542114257812,
2057
+ "loss": 0.6699621677398682,
2058
+ "rewards/accuracies": 0.574999988079071,
2059
+ "rewards/chosen": 0.146137997508049,
2060
+ "rewards/margins": 0.05326627567410469,
2061
+ "rewards/rejected": 0.09287171810865402,
2062
+ "step": 1240
2063
+ },
2064
+ {
2065
+ "epoch": 1.0,
2066
+ "grad_norm": 1.183438777923584,
2067
+ "learning_rate": 7.41037037037037e-07,
2068
+ "logits/chosen": 1.5393705368041992,
2069
+ "logits/rejected": 1.6312555074691772,
2070
+ "logps/chosen": -145.61672973632812,
2071
+ "logps/rejected": -158.9256591796875,
2072
+ "loss": 0.6811168670654297,
2073
+ "rewards/accuracies": 0.574999988079071,
2074
+ "rewards/chosen": 0.1304798424243927,
2075
+ "rewards/margins": 0.03360626846551895,
2076
+ "rewards/rejected": 0.09687358886003494,
2077
+ "step": 1250
2078
+ },
2079
+ {
2080
+ "epoch": 1.008,
2081
+ "grad_norm": 1.2436553239822388,
2082
+ "learning_rate": 7.38074074074074e-07,
2083
+ "logits/chosen": 1.4758602380752563,
2084
+ "logits/rejected": 1.534609079360962,
2085
+ "logps/chosen": -124.26841735839844,
2086
+ "logps/rejected": -132.41867065429688,
2087
+ "loss": 0.688177490234375,
2088
+ "rewards/accuracies": 0.48750001192092896,
2089
+ "rewards/chosen": 0.12570872902870178,
2090
+ "rewards/margins": 0.014875045046210289,
2091
+ "rewards/rejected": 0.11083368957042694,
2092
+ "step": 1260
2093
+ },
2094
+ {
2095
+ "epoch": 1.016,
2096
+ "grad_norm": 1.3853868246078491,
2097
+ "learning_rate": 7.351111111111112e-07,
2098
+ "logits/chosen": 1.7195091247558594,
2099
+ "logits/rejected": 1.632851004600525,
2100
+ "logps/chosen": -160.8590850830078,
2101
+ "logps/rejected": -153.91671752929688,
2102
+ "loss": 0.6669504165649414,
2103
+ "rewards/accuracies": 0.612500011920929,
2104
+ "rewards/chosen": 0.181864932179451,
2105
+ "rewards/margins": 0.06175640970468521,
2106
+ "rewards/rejected": 0.12010850757360458,
2107
+ "step": 1270
2108
+ },
2109
+ {
2110
+ "epoch": 1.024,
2111
+ "grad_norm": 1.2765302658081055,
2112
+ "learning_rate": 7.321481481481481e-07,
2113
+ "logits/chosen": 1.5782036781311035,
2114
+ "logits/rejected": 1.5713088512420654,
2115
+ "logps/chosen": -154.3319549560547,
2116
+ "logps/rejected": -141.43345642089844,
2117
+ "loss": 0.6734190464019776,
2118
+ "rewards/accuracies": 0.625,
2119
+ "rewards/chosen": 0.12361389398574829,
2120
+ "rewards/margins": 0.04626934230327606,
2121
+ "rewards/rejected": 0.07734455168247223,
2122
+ "step": 1280
2123
+ },
2124
+ {
2125
+ "epoch": 1.032,
2126
+ "grad_norm": 1.0421397686004639,
2127
+ "learning_rate": 7.291851851851851e-07,
2128
+ "logits/chosen": 1.5199967622756958,
2129
+ "logits/rejected": 1.5965670347213745,
2130
+ "logps/chosen": -145.28416442871094,
2131
+ "logps/rejected": -133.72256469726562,
2132
+ "loss": 0.6804514408111573,
2133
+ "rewards/accuracies": 0.550000011920929,
2134
+ "rewards/chosen": 0.11794980615377426,
2135
+ "rewards/margins": 0.030916064977645874,
2136
+ "rewards/rejected": 0.08703374862670898,
2137
+ "step": 1290
2138
+ },
2139
+ {
2140
+ "epoch": 1.04,
2141
+ "grad_norm": 1.032158613204956,
2142
+ "learning_rate": 7.262222222222222e-07,
2143
+ "logits/chosen": 1.6042639017105103,
2144
+ "logits/rejected": 1.4255346059799194,
2145
+ "logps/chosen": -150.4803009033203,
2146
+ "logps/rejected": -130.2232208251953,
2147
+ "loss": 0.6676507472991944,
2148
+ "rewards/accuracies": 0.612500011920929,
2149
+ "rewards/chosen": 0.15466037392616272,
2150
+ "rewards/margins": 0.05808692425489426,
2151
+ "rewards/rejected": 0.09657344967126846,
2152
+ "step": 1300
2153
+ },
2154
+ {
2155
+ "epoch": 1.04,
2156
+ "eval_logits/chosen": 1.5118356943130493,
2157
+ "eval_logits/rejected": 1.5418950319290161,
2158
+ "eval_logps/chosen": -152.23300170898438,
2159
+ "eval_logps/rejected": -146.9257049560547,
2160
+ "eval_loss": 0.6859617233276367,
2161
+ "eval_rewards/accuracies": 0.5339999794960022,
2162
+ "eval_rewards/chosen": 0.13158410787582397,
2163
+ "eval_rewards/margins": 0.021939631551504135,
2164
+ "eval_rewards/rejected": 0.10964448750019073,
2165
+ "eval_runtime": 90.9624,
2166
+ "eval_samples_per_second": 5.497,
2167
+ "eval_steps_per_second": 2.748,
2168
+ "step": 1300
2169
+ },
2170
+ {
2171
+ "epoch": 1.048,
2172
+ "grad_norm": 1.4942574501037598,
2173
+ "learning_rate": 7.232592592592593e-07,
2174
+ "logits/chosen": 1.689648985862732,
2175
+ "logits/rejected": 1.691888451576233,
2176
+ "logps/chosen": -166.02809143066406,
2177
+ "logps/rejected": -160.30601501464844,
2178
+ "loss": 0.6770285606384278,
2179
+ "rewards/accuracies": 0.5625,
2180
+ "rewards/chosen": 0.14377735555171967,
2181
+ "rewards/margins": 0.03825496509671211,
2182
+ "rewards/rejected": 0.10552239418029785,
2183
+ "step": 1310
2184
+ },
2185
+ {
2186
+ "epoch": 1.056,
2187
+ "grad_norm": 1.3656011819839478,
2188
+ "learning_rate": 7.202962962962963e-07,
2189
+ "logits/chosen": 1.3065154552459717,
2190
+ "logits/rejected": 1.5681203603744507,
2191
+ "logps/chosen": -129.1146240234375,
2192
+ "logps/rejected": -151.2584991455078,
2193
+ "loss": 0.6844655036926269,
2194
+ "rewards/accuracies": 0.5375000238418579,
2195
+ "rewards/chosen": 0.12841743230819702,
2196
+ "rewards/margins": 0.02178655005991459,
2197
+ "rewards/rejected": 0.10663086175918579,
2198
+ "step": 1320
2199
+ },
2200
+ {
2201
+ "epoch": 1.064,
2202
+ "grad_norm": 1.1723405122756958,
2203
+ "learning_rate": 7.173333333333333e-07,
2204
+ "logits/chosen": 1.6219158172607422,
2205
+ "logits/rejected": 1.6032555103302002,
2206
+ "logps/chosen": -156.4185028076172,
2207
+ "logps/rejected": -146.28067016601562,
2208
+ "loss": 0.6650479316711426,
2209
+ "rewards/accuracies": 0.6875,
2210
+ "rewards/chosen": 0.1538492739200592,
2211
+ "rewards/margins": 0.06132146716117859,
2212
+ "rewards/rejected": 0.09252782166004181,
2213
+ "step": 1330
2214
+ },
2215
+ {
2216
+ "epoch": 1.072,
2217
+ "grad_norm": 1.2311245203018188,
2218
+ "learning_rate": 7.143703703703703e-07,
2219
+ "logits/chosen": 1.4625260829925537,
2220
+ "logits/rejected": 1.392937421798706,
2221
+ "logps/chosen": -153.05941772460938,
2222
+ "logps/rejected": -145.37841796875,
2223
+ "loss": 0.6570021152496338,
2224
+ "rewards/accuracies": 0.762499988079071,
2225
+ "rewards/chosen": 0.16768500208854675,
2226
+ "rewards/margins": 0.0787494108080864,
2227
+ "rewards/rejected": 0.08893559873104095,
2228
+ "step": 1340
2229
+ },
2230
+ {
2231
+ "epoch": 1.08,
2232
+ "grad_norm": 1.314666748046875,
2233
+ "learning_rate": 7.114074074074073e-07,
2234
+ "logits/chosen": 1.4812809228897095,
2235
+ "logits/rejected": 1.3796050548553467,
2236
+ "logps/chosen": -165.70205688476562,
2237
+ "logps/rejected": -140.13150024414062,
2238
+ "loss": 0.6535200595855712,
2239
+ "rewards/accuracies": 0.6499999761581421,
2240
+ "rewards/chosen": 0.20691323280334473,
2241
+ "rewards/margins": 0.0911526307463646,
2242
+ "rewards/rejected": 0.11576057970523834,
2243
+ "step": 1350
2244
+ },
2245
+ {
2246
+ "epoch": 1.088,
2247
+ "grad_norm": 0.9556720852851868,
2248
+ "learning_rate": 7.084444444444444e-07,
2249
+ "logits/chosen": 1.6368863582611084,
2250
+ "logits/rejected": 1.678134560585022,
2251
+ "logps/chosen": -166.56253051757812,
2252
+ "logps/rejected": -164.36416625976562,
2253
+ "loss": 0.6742628574371338,
2254
+ "rewards/accuracies": 0.612500011920929,
2255
+ "rewards/chosen": 0.15145304799079895,
2256
+ "rewards/margins": 0.04319921135902405,
2257
+ "rewards/rejected": 0.1082538366317749,
2258
+ "step": 1360
2259
+ },
2260
+ {
2261
+ "epoch": 1.096,
2262
+ "grad_norm": 0.945633053779602,
2263
+ "learning_rate": 7.054814814814815e-07,
2264
+ "logits/chosen": 1.4285938739776611,
2265
+ "logits/rejected": 1.5576260089874268,
2266
+ "logps/chosen": -131.5234832763672,
2267
+ "logps/rejected": -128.02340698242188,
2268
+ "loss": 0.6670140743255615,
2269
+ "rewards/accuracies": 0.6499999761581421,
2270
+ "rewards/chosen": 0.16399362683296204,
2271
+ "rewards/margins": 0.05922812223434448,
2272
+ "rewards/rejected": 0.10476551204919815,
2273
+ "step": 1370
2274
+ },
2275
+ {
2276
+ "epoch": 1.104,
2277
+ "grad_norm": 1.1189475059509277,
2278
+ "learning_rate": 7.025185185185185e-07,
2279
+ "logits/chosen": 1.5019981861114502,
2280
+ "logits/rejected": 1.667614221572876,
2281
+ "logps/chosen": -161.61854553222656,
2282
+ "logps/rejected": -166.39996337890625,
2283
+ "loss": 0.6742690086364747,
2284
+ "rewards/accuracies": 0.675000011920929,
2285
+ "rewards/chosen": 0.15786050260066986,
2286
+ "rewards/margins": 0.044500257819890976,
2287
+ "rewards/rejected": 0.11336024850606918,
2288
+ "step": 1380
2289
+ },
2290
+ {
2291
+ "epoch": 1.112,
2292
+ "grad_norm": 1.3382912874221802,
2293
+ "learning_rate": 6.995555555555555e-07,
2294
+ "logits/chosen": 1.4972480535507202,
2295
+ "logits/rejected": 1.4946129322052002,
2296
+ "logps/chosen": -166.43682861328125,
2297
+ "logps/rejected": -162.3004913330078,
2298
+ "loss": 0.6557814121246338,
2299
+ "rewards/accuracies": 0.637499988079071,
2300
+ "rewards/chosen": 0.18964914977550507,
2301
+ "rewards/margins": 0.08500295877456665,
2302
+ "rewards/rejected": 0.10464620590209961,
2303
+ "step": 1390
2304
+ },
2305
+ {
2306
+ "epoch": 1.12,
2307
+ "grad_norm": 1.2123515605926514,
2308
+ "learning_rate": 6.965925925925926e-07,
2309
+ "logits/chosen": 1.456635594367981,
2310
+ "logits/rejected": 1.5964475870132446,
2311
+ "logps/chosen": -152.08261108398438,
2312
+ "logps/rejected": -161.51023864746094,
2313
+ "loss": 0.6708066940307618,
2314
+ "rewards/accuracies": 0.5874999761581421,
2315
+ "rewards/chosen": 0.216823011636734,
2316
+ "rewards/margins": 0.050363689661026,
2317
+ "rewards/rejected": 0.166459321975708,
2318
+ "step": 1400
2319
+ },
2320
+ {
2321
+ "epoch": 1.12,
2322
+ "eval_logits/chosen": 1.521518588066101,
2323
+ "eval_logits/rejected": 1.5510718822479248,
2324
+ "eval_logps/chosen": -151.858154296875,
2325
+ "eval_logps/rejected": -146.5819549560547,
2326
+ "eval_loss": 0.6853657364845276,
2327
+ "eval_rewards/accuracies": 0.5440000295639038,
2328
+ "eval_rewards/chosen": 0.16906805336475372,
2329
+ "eval_rewards/margins": 0.025049500167369843,
2330
+ "eval_rewards/rejected": 0.14401854574680328,
2331
+ "eval_runtime": 90.8779,
2332
+ "eval_samples_per_second": 5.502,
2333
+ "eval_steps_per_second": 2.751,
2334
+ "step": 1400
2335
+ },
2336
+ {
2337
+ "epoch": 1.1280000000000001,
2338
+ "grad_norm": 1.3488291501998901,
2339
+ "learning_rate": 6.936296296296296e-07,
2340
+ "logits/chosen": 1.6009830236434937,
2341
+ "logits/rejected": 1.6035544872283936,
2342
+ "logps/chosen": -145.44662475585938,
2343
+ "logps/rejected": -149.9284210205078,
2344
+ "loss": 0.679435396194458,
2345
+ "rewards/accuracies": 0.550000011920929,
2346
+ "rewards/chosen": 0.1647307425737381,
2347
+ "rewards/margins": 0.03348377346992493,
2348
+ "rewards/rejected": 0.13124695420265198,
2349
+ "step": 1410
2350
+ },
2351
+ {
2352
+ "epoch": 1.1360000000000001,
2353
+ "grad_norm": 1.6453970670700073,
2354
+ "learning_rate": 6.906666666666666e-07,
2355
+ "logits/chosen": 1.593658447265625,
2356
+ "logits/rejected": 1.648276925086975,
2357
+ "logps/chosen": -135.2926788330078,
2358
+ "logps/rejected": -150.9585723876953,
2359
+ "loss": 0.6875734329223633,
2360
+ "rewards/accuracies": 0.5375000238418579,
2361
+ "rewards/chosen": 0.15669891238212585,
2362
+ "rewards/margins": 0.01690087839961052,
2363
+ "rewards/rejected": 0.13979804515838623,
2364
+ "step": 1420
2365
+ },
2366
+ {
2367
+ "epoch": 1.144,
2368
+ "grad_norm": 1.6117807626724243,
2369
+ "learning_rate": 6.877037037037036e-07,
2370
+ "logits/chosen": 1.4510356187820435,
2371
+ "logits/rejected": 1.2334567308425903,
2372
+ "logps/chosen": -153.94854736328125,
2373
+ "logps/rejected": -147.78079223632812,
2374
+ "loss": 0.6689413070678711,
2375
+ "rewards/accuracies": 0.5874999761581421,
2376
+ "rewards/chosen": 0.1819056123495102,
2377
+ "rewards/margins": 0.058564942330121994,
2378
+ "rewards/rejected": 0.1233406662940979,
2379
+ "step": 1430
2380
+ },
2381
+ {
2382
+ "epoch": 1.152,
2383
+ "grad_norm": 1.5533809661865234,
2384
+ "learning_rate": 6.847407407407408e-07,
2385
+ "logits/chosen": 1.4535889625549316,
2386
+ "logits/rejected": 1.4273865222930908,
2387
+ "logps/chosen": -170.42904663085938,
2388
+ "logps/rejected": -151.9979705810547,
2389
+ "loss": 0.6845246315002441,
2390
+ "rewards/accuracies": 0.512499988079071,
2391
+ "rewards/chosen": 0.14223966002464294,
2392
+ "rewards/margins": 0.022389881312847137,
2393
+ "rewards/rejected": 0.1198497861623764,
2394
+ "step": 1440
2395
+ },
2396
+ {
2397
+ "epoch": 1.16,
2398
+ "grad_norm": 1.5696005821228027,
2399
+ "learning_rate": 6.817777777777778e-07,
2400
+ "logits/chosen": 1.4418725967407227,
2401
+ "logits/rejected": 1.4354021549224854,
2402
+ "logps/chosen": -153.962158203125,
2403
+ "logps/rejected": -148.40512084960938,
2404
+ "loss": 0.6761108875274658,
2405
+ "rewards/accuracies": 0.5874999761581421,
2406
+ "rewards/chosen": 0.15529735386371613,
2407
+ "rewards/margins": 0.040999218821525574,
2408
+ "rewards/rejected": 0.11429812759160995,
2409
+ "step": 1450
2410
+ },
2411
+ {
2412
+ "epoch": 1.168,
2413
+ "grad_norm": 1.0711781978607178,
2414
+ "learning_rate": 6.788148148148147e-07,
2415
+ "logits/chosen": 1.5659481287002563,
2416
+ "logits/rejected": 1.5640677213668823,
2417
+ "logps/chosen": -149.816162109375,
2418
+ "logps/rejected": -174.68643188476562,
2419
+ "loss": 0.677489185333252,
2420
+ "rewards/accuracies": 0.6499999761581421,
2421
+ "rewards/chosen": 0.1510745733976364,
2422
+ "rewards/margins": 0.03914733976125717,
2423
+ "rewards/rejected": 0.11192724853754044,
2424
+ "step": 1460
2425
+ },
2426
+ {
2427
+ "epoch": 1.176,
2428
+ "grad_norm": 1.2336673736572266,
2429
+ "learning_rate": 6.758518518518518e-07,
2430
+ "logits/chosen": 1.4275394678115845,
2431
+ "logits/rejected": 1.6394407749176025,
2432
+ "logps/chosen": -137.93788146972656,
2433
+ "logps/rejected": -154.97195434570312,
2434
+ "loss": 0.6785802364349365,
2435
+ "rewards/accuracies": 0.612500011920929,
2436
+ "rewards/chosen": 0.10557998716831207,
2437
+ "rewards/margins": 0.0352485254406929,
2438
+ "rewards/rejected": 0.07033145427703857,
2439
+ "step": 1470
2440
+ },
2441
+ {
2442
+ "epoch": 1.184,
2443
+ "grad_norm": 1.0030723810195923,
2444
+ "learning_rate": 6.728888888888888e-07,
2445
+ "logits/chosen": 1.4652725458145142,
2446
+ "logits/rejected": 1.4992272853851318,
2447
+ "logps/chosen": -143.87374877929688,
2448
+ "logps/rejected": -139.5693817138672,
2449
+ "loss": 0.6700259208679199,
2450
+ "rewards/accuracies": 0.6625000238418579,
2451
+ "rewards/chosen": 0.13436684012413025,
2452
+ "rewards/margins": 0.05286343768239021,
2453
+ "rewards/rejected": 0.08150340616703033,
2454
+ "step": 1480
2455
+ },
2456
+ {
2457
+ "epoch": 1.192,
2458
+ "grad_norm": 1.0897104740142822,
2459
+ "learning_rate": 6.699259259259259e-07,
2460
+ "logits/chosen": 1.7033601999282837,
2461
+ "logits/rejected": 1.617489218711853,
2462
+ "logps/chosen": -140.43667602539062,
2463
+ "logps/rejected": -146.59683227539062,
2464
+ "loss": 0.6685672283172608,
2465
+ "rewards/accuracies": 0.675000011920929,
2466
+ "rewards/chosen": 0.11724893748760223,
2467
+ "rewards/margins": 0.055138181895017624,
2468
+ "rewards/rejected": 0.06211075931787491,
2469
+ "step": 1490
2470
+ },
2471
+ {
2472
+ "epoch": 1.2,
2473
+ "grad_norm": 0.8573256731033325,
2474
+ "learning_rate": 6.66962962962963e-07,
2475
+ "logits/chosen": 1.5181013345718384,
2476
+ "logits/rejected": 1.5965152978897095,
2477
+ "logps/chosen": -147.78506469726562,
2478
+ "logps/rejected": -142.18930053710938,
2479
+ "loss": 0.6564288139343262,
2480
+ "rewards/accuracies": 0.7250000238418579,
2481
+ "rewards/chosen": 0.14890792965888977,
2482
+ "rewards/margins": 0.08058460801839828,
2483
+ "rewards/rejected": 0.06832331418991089,
2484
+ "step": 1500
2485
+ },
2486
+ {
2487
+ "epoch": 1.2,
2488
+ "eval_logits/chosen": 1.488415002822876,
2489
+ "eval_logits/rejected": 1.5179662704467773,
2490
+ "eval_logps/chosen": -152.60232543945312,
2491
+ "eval_logps/rejected": -147.2954864501953,
2492
+ "eval_loss": 0.6861968040466309,
2493
+ "eval_rewards/accuracies": 0.5400000214576721,
2494
+ "eval_rewards/chosen": 0.09465231001377106,
2495
+ "eval_rewards/margins": 0.021986663341522217,
2496
+ "eval_rewards/rejected": 0.07266565412282944,
2497
+ "eval_runtime": 90.8951,
2498
+ "eval_samples_per_second": 5.501,
2499
+ "eval_steps_per_second": 2.75,
2500
+ "step": 1500
2501
+ }
2502
+ ],
2503
+ "logging_steps": 10,
2504
+ "max_steps": 3750,
2505
+ "num_input_tokens_seen": 0,
2506
+ "num_train_epochs": 3,
2507
+ "save_steps": 500,
2508
+ "stateful_callbacks": {
2509
+ "TrainerControl": {
2510
+ "args": {
2511
+ "should_epoch_stop": false,
2512
+ "should_evaluate": false,
2513
+ "should_log": false,
2514
+ "should_save": true,
2515
+ "should_training_stop": false
2516
+ },
2517
+ "attributes": {}
2518
+ }
2519
+ },
2520
+ "total_flos": 0.0,
2521
+ "train_batch_size": 2,
2522
+ "trial_name": null,
2523
+ "trial_params": null
2524
+ }
v3/DPO/DPO_10k/lora/checkpoint-1500/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b4f90e4a7053b7fb440d68cb64a3d198396af09db942d08839c5f4f0f0b8a0c8
3
+ size 6097
v3/DPO/DPO_10k/lora/checkpoint-2000/README.md ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: meta-llama/Llama-3.2-1B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:meta-llama/Llama-3.2-1B-Instruct
7
+ - dpo
8
+ - lora
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** [More Information Needed]
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** [More Information Needed]
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.18.1
v3/DPO/DPO_10k/lora/checkpoint-2000/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 32,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.1,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 64,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "v_proj",
33
+ "gate_proj",
34
+ "down_proj",
35
+ "k_proj",
36
+ "o_proj",
37
+ "up_proj",
38
+ "q_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
v3/DPO/DPO_10k/lora/checkpoint-2000/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7cd69c523e1e782aa020a6dd4857a965f4cb7c32d5baa557697e95c17d5c604e
3
+ size 180385008
v3/DPO/DPO_10k/lora/checkpoint-2000/chat_template.jinja ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{- bos_token }}
2
+ {%- if custom_tools is defined %}
3
+ {%- set tools = custom_tools %}
4
+ {%- endif %}
5
+ {%- if not tools_in_user_message is defined %}
6
+ {%- set tools_in_user_message = true %}
7
+ {%- endif %}
8
+ {%- if not date_string is defined %}
9
+ {%- if strftime_now is defined %}
10
+ {%- set date_string = strftime_now("%d %b %Y") %}
11
+ {%- else %}
12
+ {%- set date_string = "26 Jul 2024" %}
13
+ {%- endif %}
14
+ {%- endif %}
15
+ {%- if not tools is defined %}
16
+ {%- set tools = none %}
17
+ {%- endif %}
18
+
19
+ {#- This block extracts the system message, so we can slot it into the right place. #}
20
+ {%- if messages[0]['role'] == 'system' %}
21
+ {%- set system_message = messages[0]['content']|trim %}
22
+ {%- set messages = messages[1:] %}
23
+ {%- else %}
24
+ {%- set system_message = "" %}
25
+ {%- endif %}
26
+
27
+ {#- System message #}
28
+ {{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
29
+ {%- if tools is not none %}
30
+ {{- "Environment: ipython\n" }}
31
+ {%- endif %}
32
+ {{- "Cutting Knowledge Date: December 2023\n" }}
33
+ {{- "Today Date: " + date_string + "\n\n" }}
34
+ {%- if tools is not none and not tools_in_user_message %}
35
+ {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
36
+ {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
37
+ {{- "Do not use variables.\n\n" }}
38
+ {%- for t in tools %}
39
+ {{- t | tojson(indent=4) }}
40
+ {{- "\n\n" }}
41
+ {%- endfor %}
42
+ {%- endif %}
43
+ {{- system_message }}
44
+ {{- "<|eot_id|>" }}
45
+
46
+ {#- Custom tools are passed in a user message with some extra guidance #}
47
+ {%- if tools_in_user_message and not tools is none %}
48
+ {#- Extract the first user message so we can plug it in here #}
49
+ {%- if messages | length != 0 %}
50
+ {%- set first_user_message = messages[0]['content']|trim %}
51
+ {%- set messages = messages[1:] %}
52
+ {%- else %}
53
+ {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
54
+ {%- endif %}
55
+ {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
56
+ {{- "Given the following functions, please respond with a JSON for a function call " }}
57
+ {{- "with its proper arguments that best answers the given prompt.\n\n" }}
58
+ {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
59
+ {{- "Do not use variables.\n\n" }}
60
+ {%- for t in tools %}
61
+ {{- t | tojson(indent=4) }}
62
+ {{- "\n\n" }}
63
+ {%- endfor %}
64
+ {{- first_user_message + "<|eot_id|>"}}
65
+ {%- endif %}
66
+
67
+ {%- for message in messages %}
68
+ {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
69
+ {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
70
+ {%- elif 'tool_calls' in message %}
71
+ {%- if not message.tool_calls|length == 1 %}
72
+ {{- raise_exception("This model only supports single tool-calls at once!") }}
73
+ {%- endif %}
74
+ {%- set tool_call = message.tool_calls[0].function %}
75
+ {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
76
+ {{- '{"name": "' + tool_call.name + '", ' }}
77
+ {{- '"parameters": ' }}
78
+ {{- tool_call.arguments | tojson }}
79
+ {{- "}" }}
80
+ {{- "<|eot_id|>" }}
81
+ {%- elif message.role == "tool" or message.role == "ipython" %}
82
+ {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
83
+ {%- if message.content is mapping or message.content is iterable %}
84
+ {{- message.content | tojson }}
85
+ {%- else %}
86
+ {{- message.content }}
87
+ {%- endif %}
88
+ {{- "<|eot_id|>" }}
89
+ {%- endif %}
90
+ {%- endfor %}
91
+ {%- if add_generation_prompt %}
92
+ {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
93
+ {%- endif %}
v3/DPO/DPO_10k/lora/checkpoint-2000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9fab96ca3915ebdde3a55a6d95b9e0ffb2c8eb2ed59436fcdae3be4c642b46f
3
+ size 360902475
v3/DPO/DPO_10k/lora/checkpoint-2000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f27403649e37262609f871dba600d7da4f0bfc387b6571584c6bc75289b7c853
3
+ size 14645
v3/DPO/DPO_10k/lora/checkpoint-2000/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4aa03f6e0cd07cf67ce1fbe3101d545f5771ef9148b9debf02b11cf6948da5c
3
+ size 1383
v3/DPO/DPO_10k/lora/checkpoint-2000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:318db60b99958c78b78a2ad7d7ff224500c5019593c5cf0c4944785079b20a61
3
+ size 1465
v3/DPO/DPO_10k/lora/checkpoint-2000/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
3
+ size 17209920
v3/DPO/DPO_10k/lora/checkpoint-2000/tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "bos_token": "<|begin_of_text|>",
4
+ "clean_up_tokenization_spaces": true,
5
+ "eos_token": "<|eot_id|>",
6
+ "is_local": false,
7
+ "model_input_names": [
8
+ "input_ids",
9
+ "attention_mask"
10
+ ],
11
+ "model_max_length": 131072,
12
+ "pad_token": "<|eot_id|>",
13
+ "tokenizer_class": "TokenizersBackend"
14
+ }
v3/DPO/DPO_10k/lora/checkpoint-2000/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
v3/DPO/DPO_10k/lora/checkpoint-2000/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b4f90e4a7053b7fb440d68cb64a3d198396af09db942d08839c5f4f0f0b8a0c8
3
+ size 6097