satoyutaka committed on
Commit
ff4dab4
·
verified ·
1 Parent(s): 236d0c3

Upload 8 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,209 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen3-4B-Instruct-2507
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen3-4B-Instruct-2507
7
+ - lora
8
+ - sft
9
+ - transformers
10
+ - trl
11
+ ---
12
+
13
+ # Model Card for Model ID
14
+
15
+ <!-- Provide a quick summary of what the model is/does. -->
16
+
17
+
18
+
19
+ ## Model Details
20
+
21
+ ### Model Description
22
+
23
+ <!-- Provide a longer summary of what this model is. -->
24
+
25
+
26
+
27
+ - **Developed by:** [More Information Needed]
28
+ - **Funded by [optional]:** [More Information Needed]
29
+ - **Shared by [optional]:** [More Information Needed]
30
+ - **Model type:** LoRA adapter (PEFT) for causal language modeling
31
+ - **Language(s) (NLP):** [More Information Needed]
32
+ - **License:** [More Information Needed]
33
+ - **Finetuned from model [optional]:** Qwen/Qwen3-4B-Instruct-2507
34
+
35
+ ### Model Sources [optional]
36
+
37
+ <!-- Provide the basic links for the model. -->
38
+
39
+ - **Repository:** [More Information Needed]
40
+ - **Paper [optional]:** [More Information Needed]
41
+ - **Demo [optional]:** [More Information Needed]
42
+
43
+ ## Uses
44
+
45
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
46
+
47
+ ### Direct Use
48
+
49
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
50
+
51
+ [More Information Needed]
52
+
53
+ ### Downstream Use [optional]
54
+
55
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
56
+
57
+ [More Information Needed]
58
+
59
+ ### Out-of-Scope Use
60
+
61
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
62
+
63
+ [More Information Needed]
64
+
65
+ ## Bias, Risks, and Limitations
66
+
67
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
68
+
69
+ [More Information Needed]
70
+
71
+ ### Recommendations
72
+
73
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
74
+
75
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
76
+
77
+ ## How to Get Started with the Model
78
+
79
+ Use the code below to get started with the model.
80
+
81
+ [More Information Needed]
82
+
83
+ ## Training Details
84
+
85
+ ### Training Data
86
+
87
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
88
+
89
+ [More Information Needed]
90
+
91
+ ### Training Procedure
92
+
93
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
94
+
95
+ #### Preprocessing [optional]
96
+
97
+ [More Information Needed]
98
+
99
+
100
+ #### Training Hyperparameters
101
+
102
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
103
+
104
+ #### Speeds, Sizes, Times [optional]
105
+
106
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
107
+
108
+ [More Information Needed]
109
+
110
+ ## Evaluation
111
+
112
+ <!-- This section describes the evaluation protocols and provides the results. -->
113
+
114
+ ### Testing Data, Factors & Metrics
115
+
116
+ #### Testing Data
117
+
118
+ <!-- This should link to a Dataset Card if possible. -->
119
+
120
+ [More Information Needed]
121
+
122
+ #### Factors
123
+
124
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
125
+
126
+ [More Information Needed]
127
+
128
+ #### Metrics
129
+
130
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
131
+
132
+ [More Information Needed]
133
+
134
+ ### Results
135
+
136
+ [More Information Needed]
137
+
138
+ #### Summary
139
+
140
+
141
+
142
+ ## Model Examination [optional]
143
+
144
+ <!-- Relevant interpretability work for the model goes here -->
145
+
146
+ [More Information Needed]
147
+
148
+ ## Environmental Impact
149
+
150
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
151
+
152
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
153
+
154
+ - **Hardware Type:** [More Information Needed]
155
+ - **Hours used:** [More Information Needed]
156
+ - **Cloud Provider:** [More Information Needed]
157
+ - **Compute Region:** [More Information Needed]
158
+ - **Carbon Emitted:** [More Information Needed]
159
+
160
+ ## Technical Specifications [optional]
161
+
162
+ ### Model Architecture and Objective
163
+
164
+ [More Information Needed]
165
+
166
+ ### Compute Infrastructure
167
+
168
+ [More Information Needed]
169
+
170
+ #### Hardware
171
+
172
+ [More Information Needed]
173
+
174
+ #### Software
175
+
176
+ [More Information Needed]
177
+
178
+ ## Citation [optional]
179
+
180
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
181
+
182
+ **BibTeX:**
183
+
184
+ [More Information Needed]
185
+
186
+ **APA:**
187
+
188
+ [More Information Needed]
189
+
190
+ ## Glossary [optional]
191
+
192
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
193
+
194
+ [More Information Needed]
195
+
196
+ ## More Information [optional]
197
+
198
+ [More Information Needed]
199
+
200
+ ## Model Card Authors [optional]
201
+
202
+ [More Information Needed]
203
+
204
+ ## Model Card Contact
205
+
206
+ [More Information Needed]
207
+ ### Framework versions
208
+
209
+ - PEFT 0.18.1
adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen3-4B-Instruct-2507",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 64,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 32,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "v_proj",
33
+ "gate_proj",
34
+ "down_proj",
35
+ "up_proj",
36
+ "q_proj",
37
+ "k_proj",
38
+ "o_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83b5c6c241d842a35bc4cf49d9979fa89226052cffefefe498c380035cc0f3dc
3
+ size 264308896
chat_template.jinja ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {{- messages[0].content + '\n\n' }}
5
+ {%- endif %}
6
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
7
+ {%- for tool in tools %}
8
+ {{- "\n" }}
9
+ {{- tool | tojson }}
10
+ {%- endfor %}
11
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
12
+ {%- else %}
13
+ {%- if messages[0].role == 'system' %}
14
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
15
+ {%- endif %}
16
+ {%- endif %}
17
+ {%- for message in messages %}
18
+ {%- if message.content is string %}
19
+ {%- set content = message.content %}
20
+ {%- else %}
21
+ {%- set content = '' %}
22
+ {%- endif %}
23
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
24
+ {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
25
+ {%- elif message.role == "assistant" %}
26
+ {{- '<|im_start|>' + message.role + '\n' + content }}
27
+ {%- if message.tool_calls %}
28
+ {%- for tool_call in message.tool_calls %}
29
+ {%- if (loop.first and content) or (not loop.first) %}
30
+ {{- '\n' }}
31
+ {%- endif %}
32
+ {%- if tool_call.function %}
33
+ {%- set tool_call = tool_call.function %}
34
+ {%- endif %}
35
+ {{- '<tool_call>\n{"name": "' }}
36
+ {{- tool_call.name }}
37
+ {{- '", "arguments": ' }}
38
+ {%- if tool_call.arguments is string %}
39
+ {{- tool_call.arguments }}
40
+ {%- else %}
41
+ {{- tool_call.arguments | tojson }}
42
+ {%- endif %}
43
+ {{- '}\n</tool_call>' }}
44
+ {%- endfor %}
45
+ {%- endif %}
46
+ {{- '<|im_end|>\n' }}
47
+ {%- elif message.role == "tool" %}
48
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
49
+ {{- '<|im_start|>user' }}
50
+ {%- endif %}
51
+ {{- '\n<tool_response>\n' }}
52
+ {{- content }}
53
+ {{- '\n</tool_response>' }}
54
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
55
+ {{- '<|im_end|>\n' }}
56
+ {%- endif %}
57
+ {%- endif %}
58
+ {%- endfor %}
59
+ {%- if add_generation_prompt %}
60
+ {{- '<|im_start|>assistant\n' }}
61
+ {%- endif %}
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
3
+ size 11422650
tokenizer_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|im_start|>",
10
+ "<|im_end|>",
11
+ "<|object_ref_start|>",
12
+ "<|object_ref_end|>",
13
+ "<|box_start|>",
14
+ "<|box_end|>",
15
+ "<|quad_start|>",
16
+ "<|quad_end|>",
17
+ "<|vision_start|>",
18
+ "<|vision_end|>",
19
+ "<|vision_pad|>",
20
+ "<|image_pad|>",
21
+ "<|video_pad|>"
22
+ ],
23
+ "is_local": false,
24
+ "model_max_length": 1010000,
25
+ "pad_token": "<|endoftext|>",
26
+ "split_special_tokens": false,
27
+ "tokenizer_class": "Qwen2Tokenizer",
28
+ "unk_token": null
29
+ }
trainer_state.json ADDED
@@ -0,0 +1,478 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.4027183488547697,
6
+ "eval_steps": 50,
7
+ "global_step": 200,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 0.9404663916677236,
14
+ "epoch": 0.010067958721369242,
15
+ "grad_norm": 2.6392252445220947,
16
+ "learning_rate": 2.666666666666667e-06,
17
+ "loss": 2.0117156982421873,
18
+ "mean_token_accuracy": 0.6191852029412985,
19
+ "num_tokens": 47701.0,
20
+ "step": 5
21
+ },
22
+ {
23
+ "entropy": 1.0189884781837464,
24
+ "epoch": 0.020135917442738484,
25
+ "grad_norm": 1.8436778783798218,
26
+ "learning_rate": 6e-06,
27
+ "loss": 2.1075166702270507,
28
+ "mean_token_accuracy": 0.5976856101304293,
29
+ "num_tokens": 92619.0,
30
+ "step": 10
31
+ },
32
+ {
33
+ "entropy": 1.029310768097639,
34
+ "epoch": 0.03020387616410773,
35
+ "grad_norm": 1.815084457397461,
36
+ "learning_rate": 9.333333333333334e-06,
37
+ "loss": 2.01068058013916,
38
+ "mean_token_accuracy": 0.6073978751897812,
39
+ "num_tokens": 134847.0,
40
+ "step": 15
41
+ },
42
+ {
43
+ "entropy": 1.0625499848276376,
44
+ "epoch": 0.04027183488547697,
45
+ "grad_norm": 1.4827969074249268,
46
+ "learning_rate": 1.2666666666666667e-05,
47
+ "loss": 1.7313125610351563,
48
+ "mean_token_accuracy": 0.6392445303499699,
49
+ "num_tokens": 181821.0,
50
+ "step": 20
51
+ },
52
+ {
53
+ "entropy": 1.1866111695766448,
54
+ "epoch": 0.05033979360684621,
55
+ "grad_norm": 0.9268224835395813,
56
+ "learning_rate": 1.6000000000000003e-05,
57
+ "loss": 1.6676939010620118,
58
+ "mean_token_accuracy": 0.6374935403466224,
59
+ "num_tokens": 225041.0,
60
+ "step": 25
61
+ },
62
+ {
63
+ "entropy": 1.284090331569314,
64
+ "epoch": 0.06040775232821546,
65
+ "grad_norm": 0.5153523683547974,
66
+ "learning_rate": 1.9333333333333333e-05,
67
+ "loss": 1.5365737915039062,
68
+ "mean_token_accuracy": 0.650652977451682,
69
+ "num_tokens": 271704.0,
70
+ "step": 30
71
+ },
72
+ {
73
+ "entropy": 1.4139859646558761,
74
+ "epoch": 0.0704757110495847,
75
+ "grad_norm": 0.4240514934062958,
76
+ "learning_rate": 1.998917111338525e-05,
77
+ "loss": 1.4139368057250976,
78
+ "mean_token_accuracy": 0.6503775727003813,
79
+ "num_tokens": 319422.0,
80
+ "step": 35
81
+ },
82
+ {
83
+ "entropy": 1.5055108167231084,
84
+ "epoch": 0.08054366977095394,
85
+ "grad_norm": 0.557256281375885,
86
+ "learning_rate": 1.9945218953682736e-05,
87
+ "loss": 1.3467309951782227,
88
+ "mean_token_accuracy": 0.6553127769380808,
89
+ "num_tokens": 368170.0,
90
+ "step": 40
91
+ },
92
+ {
93
+ "entropy": 1.5834380112588406,
94
+ "epoch": 0.09061162849232318,
95
+ "grad_norm": 0.5694743990898132,
96
+ "learning_rate": 1.9867615321125796e-05,
97
+ "loss": 1.3501482963562013,
98
+ "mean_token_accuracy": 0.6700987242162227,
99
+ "num_tokens": 408692.0,
100
+ "step": 45
101
+ },
102
+ {
103
+ "entropy": 1.5377649553120136,
104
+ "epoch": 0.10067958721369243,
105
+ "grad_norm": 0.48829907178878784,
106
+ "learning_rate": 1.9756622801842144e-05,
107
+ "loss": 1.2567365646362305,
108
+ "mean_token_accuracy": 0.6825098309665918,
109
+ "num_tokens": 454374.0,
110
+ "step": 50
111
+ },
112
+ {
113
+ "epoch": 0.10067958721369243,
114
+ "eval_entropy": 1.3134305934906005,
115
+ "eval_loss": 1.1746762990951538,
116
+ "eval_mean_token_accuracy": 0.7086019682884216,
117
+ "eval_num_tokens": 454374.0,
118
+ "eval_runtime": 24163.8119,
119
+ "eval_samples_per_second": 0.041,
120
+ "eval_steps_per_second": 0.005,
121
+ "step": 50
122
+ },
123
+ {
124
+ "entropy": 1.4440373208373785,
125
+ "epoch": 0.11074754593506167,
126
+ "grad_norm": 0.4486835300922394,
127
+ "learning_rate": 1.961261695938319e-05,
128
+ "loss": 1.1860782623291015,
129
+ "mean_token_accuracy": 0.6934177171438932,
130
+ "num_tokens": 497453.0,
131
+ "step": 55
132
+ },
133
+ {
134
+ "entropy": 1.1991986483335495,
135
+ "epoch": 0.12081550465643091,
136
+ "grad_norm": 0.4323367476463318,
137
+ "learning_rate": 1.9436085063935837e-05,
138
+ "loss": 1.0048168182373047,
139
+ "mean_token_accuracy": 0.723202359676361,
140
+ "num_tokens": 547078.0,
141
+ "step": 60
142
+ },
143
+ {
144
+ "entropy": 1.1962539467960596,
145
+ "epoch": 0.13088346337780016,
146
+ "grad_norm": 0.4360666871070862,
147
+ "learning_rate": 1.9227624443554425e-05,
148
+ "loss": 1.049765682220459,
149
+ "mean_token_accuracy": 0.7247624807059765,
150
+ "num_tokens": 593207.0,
151
+ "step": 65
152
+ },
153
+ {
154
+ "entropy": 1.0165199358016253,
155
+ "epoch": 0.1409514220991694,
156
+ "grad_norm": 0.49459174275398254,
157
+ "learning_rate": 1.8987940462991673e-05,
158
+ "loss": 0.9207266807556153,
159
+ "mean_token_accuracy": 0.7483494646847249,
160
+ "num_tokens": 637938.0,
161
+ "step": 70
162
+ },
163
+ {
164
+ "entropy": 0.895136122033,
165
+ "epoch": 0.15101938082053865,
166
+ "grad_norm": 0.4608009457588196,
167
+ "learning_rate": 1.8717844136967626e-05,
168
+ "loss": 0.8429131507873535,
169
+ "mean_token_accuracy": 0.7764634154736996,
170
+ "num_tokens": 679126.0,
171
+ "step": 75
172
+ },
173
+ {
174
+ "entropy": 0.8629299964755773,
175
+ "epoch": 0.16108733954190788,
176
+ "grad_norm": 0.38531047105789185,
177
+ "learning_rate": 1.8418249385952575e-05,
178
+ "loss": 0.8693387985229493,
179
+ "mean_token_accuracy": 0.7764782950282096,
180
+ "num_tokens": 719805.0,
181
+ "step": 80
182
+ },
183
+ {
184
+ "entropy": 0.7840321972966194,
185
+ "epoch": 0.17115529826327713,
186
+ "grad_norm": 0.4015936255455017,
187
+ "learning_rate": 1.8090169943749477e-05,
188
+ "loss": 0.789955997467041,
189
+ "mean_token_accuracy": 0.7866252034902572,
190
+ "num_tokens": 765446.0,
191
+ "step": 85
192
+ },
193
+ {
194
+ "entropy": 0.7604264505207539,
195
+ "epoch": 0.18122325698464636,
196
+ "grad_norm": 0.4798949658870697,
197
+ "learning_rate": 1.7734715927339642e-05,
198
+ "loss": 0.7473703861236572,
199
+ "mean_token_accuracy": 0.7978227443993091,
200
+ "num_tokens": 811505.0,
201
+ "step": 90
202
+ },
203
+ {
204
+ "entropy": 0.7620996758341789,
205
+ "epoch": 0.1912912157060156,
206
+ "grad_norm": 0.4318743348121643,
207
+ "learning_rate": 1.735309008059829e-05,
208
+ "loss": 0.7961347579956055,
209
+ "mean_token_accuracy": 0.7934757456183433,
210
+ "num_tokens": 857681.0,
211
+ "step": 95
212
+ },
213
+ {
214
+ "entropy": 0.7531558889895678,
215
+ "epoch": 0.20135917442738485,
216
+ "grad_norm": 0.5560536980628967,
217
+ "learning_rate": 1.6946583704589973e-05,
218
+ "loss": 0.766410493850708,
219
+ "mean_token_accuracy": 0.7901870187371969,
220
+ "num_tokens": 901933.0,
221
+ "step": 100
222
+ },
223
+ {
224
+ "epoch": 0.20135917442738485,
225
+ "eval_entropy": 0.7256068153381348,
226
+ "eval_loss": 0.7321593165397644,
227
+ "eval_mean_token_accuracy": 0.7850314078330993,
228
+ "eval_num_tokens": 901933.0,
229
+ "eval_runtime": 18225.1125,
230
+ "eval_samples_per_second": 0.054,
231
+ "eval_steps_per_second": 0.007,
232
+ "step": 100
233
+ },
234
+ {
235
+ "entropy": 0.7856348525732756,
236
+ "epoch": 0.21142713314875408,
237
+ "grad_norm": 0.3850078582763672,
238
+ "learning_rate": 1.6516572288214555e-05,
239
+ "loss": 0.8023449897766113,
240
+ "mean_token_accuracy": 0.7852729022502899,
241
+ "num_tokens": 946888.0,
242
+ "step": 105
243
+ },
244
+ {
245
+ "entropy": 0.7041983786970377,
246
+ "epoch": 0.22149509187012334,
247
+ "grad_norm": 0.3476756811141968,
248
+ "learning_rate": 1.6064510853988137e-05,
249
+ "loss": 0.7184148311614991,
250
+ "mean_token_accuracy": 0.8035397931933403,
251
+ "num_tokens": 994267.0,
252
+ "step": 110
253
+ },
254
+ {
255
+ "entropy": 0.6575290430337191,
256
+ "epoch": 0.23156305059149257,
257
+ "grad_norm": 0.42202460765838623,
258
+ "learning_rate": 1.5591929034707468e-05,
259
+ "loss": 0.7003682613372803,
260
+ "mean_token_accuracy": 0.8060932837426662,
261
+ "num_tokens": 1038543.0,
262
+ "step": 115
263
+ },
264
+ {
265
+ "entropy": 0.7608727779239416,
266
+ "epoch": 0.24163100931286183,
267
+ "grad_norm": 0.3540622889995575,
268
+ "learning_rate": 1.5100425897656754e-05,
269
+ "loss": 0.7738057613372803,
270
+ "mean_token_accuracy": 0.7889790445566177,
271
+ "num_tokens": 1085867.0,
272
+ "step": 120
273
+ },
274
+ {
275
+ "entropy": 0.6082758061587811,
276
+ "epoch": 0.2516989680342311,
277
+ "grad_norm": 0.32112520933151245,
278
+ "learning_rate": 1.4591664533870118e-05,
279
+ "loss": 0.6715522766113281,
280
+ "mean_token_accuracy": 0.8123787805438042,
281
+ "num_tokens": 1133833.0,
282
+ "step": 125
283
+ },
284
+ {
285
+ "entropy": 0.6413897173479199,
286
+ "epoch": 0.2617669267556003,
287
+ "grad_norm": 0.43722981214523315,
288
+ "learning_rate": 1.4067366430758004e-05,
289
+ "loss": 0.7195887565612793,
290
+ "mean_token_accuracy": 0.8123183585703373,
291
+ "num_tokens": 1178512.0,
292
+ "step": 130
293
+ },
294
+ {
295
+ "entropy": 0.5526509841904044,
296
+ "epoch": 0.27183488547696955,
297
+ "grad_norm": 0.3406974673271179,
298
+ "learning_rate": 1.3529305647138689e-05,
299
+ "loss": 0.5937692165374756,
300
+ "mean_token_accuracy": 0.8311853632330894,
301
+ "num_tokens": 1224601.0,
302
+ "step": 135
303
+ },
304
+ {
305
+ "entropy": 0.6579740298911929,
306
+ "epoch": 0.2819028441983388,
307
+ "grad_norm": 0.43954288959503174,
308
+ "learning_rate": 1.297930281038482e-05,
309
+ "loss": 0.6883365631103515,
310
+ "mean_token_accuracy": 0.8083381243050098,
311
+ "num_tokens": 1270506.0,
312
+ "step": 140
313
+ },
314
+ {
315
+ "entropy": 0.5645570032298565,
316
+ "epoch": 0.291970802919708,
317
+ "grad_norm": 0.36275994777679443,
318
+ "learning_rate": 1.2419218955996677e-05,
319
+ "loss": 0.6089091777801514,
320
+ "mean_token_accuracy": 0.8329982027411461,
321
+ "num_tokens": 1311666.0,
322
+ "step": 145
323
+ },
324
+ {
325
+ "entropy": 0.6062236651778221,
326
+ "epoch": 0.3020387616410773,
327
+ "grad_norm": 0.3385237157344818,
328
+ "learning_rate": 1.1850949230447146e-05,
329
+ "loss": 0.6368993282318115,
330
+ "mean_token_accuracy": 0.8212810829281807,
331
+ "num_tokens": 1357022.0,
332
+ "step": 150
333
+ },
334
+ {
335
+ "epoch": 0.3020387616410773,
336
+ "eval_entropy": 0.6711692655086517,
337
+ "eval_loss": 0.674369752407074,
338
+ "eval_mean_token_accuracy": 0.7935635781288147,
339
+ "eval_num_tokens": 1357022.0,
340
+ "eval_runtime": 23687.9198,
341
+ "eval_samples_per_second": 0.042,
342
+ "eval_steps_per_second": 0.005,
343
+ "step": 150
344
+ },
345
+ {
346
+ "entropy": 0.6150166101753711,
347
+ "epoch": 0.3121067203624465,
348
+ "grad_norm": 0.33184218406677246,
349
+ "learning_rate": 1.127641647860595e-05,
350
+ "loss": 0.6427908897399902,
351
+ "mean_token_accuracy": 0.8106721416115761,
352
+ "num_tokens": 1409201.0,
353
+ "step": 155
354
+ },
355
+ {
356
+ "entropy": 0.6546612774953247,
357
+ "epoch": 0.32217467908381575,
358
+ "grad_norm": 0.443172812461853,
359
+ "learning_rate": 1.0697564737441254e-05,
360
+ "loss": 0.6632843971252441,
361
+ "mean_token_accuracy": 0.8050490751862526,
362
+ "num_tokens": 1461178.0,
363
+ "step": 160
364
+ },
365
+ {
366
+ "entropy": 0.6669658403843641,
367
+ "epoch": 0.332242637805185,
368
+ "grad_norm": 0.3493345379829407,
369
+ "learning_rate": 1.0116352658013973e-05,
370
+ "loss": 0.6666298389434815,
371
+ "mean_token_accuracy": 0.8056038625538349,
372
+ "num_tokens": 1508214.0,
373
+ "step": 165
374
+ },
375
+ {
376
+ "entropy": 0.5695202240720392,
377
+ "epoch": 0.34231059652655427,
378
+ "grad_norm": 0.413866251707077,
379
+ "learning_rate": 9.534746878022533e-06,
380
+ "loss": 0.6183717727661133,
381
+ "mean_token_accuracy": 0.8245324343442917,
382
+ "num_tokens": 1552040.0,
383
+ "step": 170
384
+ },
385
+ {
386
+ "entropy": 0.5897262109443545,
387
+ "epoch": 0.3523785552479235,
388
+ "grad_norm": 0.3123251795768738,
389
+ "learning_rate": 8.954715367323468e-06,
390
+ "loss": 0.6161505699157714,
391
+ "mean_token_accuracy": 0.8233651638031005,
392
+ "num_tokens": 1594387.0,
393
+ "step": 175
394
+ },
395
+ {
396
+ "entropy": 0.6046821037307382,
397
+ "epoch": 0.3624465139692927,
398
+ "grad_norm": 0.4404999911785126,
399
+ "learning_rate": 8.378220768944328e-06,
400
+ "loss": 0.6254678249359131,
401
+ "mean_token_accuracy": 0.823454175889492,
402
+ "num_tokens": 1640543.0,
403
+ "step": 180
404
+ },
405
+ {
406
+ "entropy": 0.6329730719327926,
407
+ "epoch": 0.37251447269066196,
408
+ "grad_norm": 0.32642191648483276,
409
+ "learning_rate": 7.807213758120965e-06,
410
+ "loss": 0.6730523109436035,
411
+ "mean_token_accuracy": 0.810814143717289,
412
+ "num_tokens": 1684800.0,
413
+ "step": 185
414
+ },
415
+ {
416
+ "entropy": 0.7039387376978994,
417
+ "epoch": 0.3825824314120312,
418
+ "grad_norm": 0.4055386185646057,
419
+ "learning_rate": 7.243626441830009e-06,
420
+ "loss": 0.6949094772338867,
421
+ "mean_token_accuracy": 0.7998988643288613,
422
+ "num_tokens": 1728741.0,
423
+ "step": 190
424
+ },
425
+ {
426
+ "entropy": 0.5845075074583292,
427
+ "epoch": 0.3926503901334005,
428
+ "grad_norm": 0.7357484698295593,
429
+ "learning_rate": 6.689365821150421e-06,
430
+ "loss": 0.6154315948486329,
431
+ "mean_token_accuracy": 0.8205408222973347,
432
+ "num_tokens": 1778359.0,
433
+ "step": 195
434
+ },
435
+ {
436
+ "entropy": 0.683666481077671,
437
+ "epoch": 0.4027183488547697,
438
+ "grad_norm": 0.4041738510131836,
439
+ "learning_rate": 6.146307338575519e-06,
440
+ "loss": 0.6790355205535888,
441
+ "mean_token_accuracy": 0.7988370589911937,
442
+ "num_tokens": 1828434.0,
443
+ "step": 200
444
+ },
445
+ {
446
+ "epoch": 0.4027183488547697,
447
+ "eval_entropy": 0.6506109157800675,
448
+ "eval_loss": 0.6566110253334045,
449
+ "eval_mean_token_accuracy": 0.7965194163322449,
450
+ "eval_num_tokens": 1828434.0,
451
+ "eval_runtime": 22699.7349,
452
+ "eval_samples_per_second": 0.044,
453
+ "eval_steps_per_second": 0.006,
454
+ "step": 200
455
+ }
456
+ ],
457
+ "logging_steps": 5,
458
+ "max_steps": 300,
459
+ "num_input_tokens_seen": 0,
460
+ "num_train_epochs": 1,
461
+ "save_steps": 100,
462
+ "stateful_callbacks": {
463
+ "TrainerControl": {
464
+ "args": {
465
+ "should_epoch_stop": false,
466
+ "should_evaluate": false,
467
+ "should_log": false,
468
+ "should_save": true,
469
+ "should_training_stop": false
470
+ },
471
+ "attributes": {}
472
+ }
473
+ },
474
+ "total_flos": 4.05865418389033e+16,
475
+ "train_batch_size": 1,
476
+ "trial_name": null,
477
+ "trial_params": null
478
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3c92e6d4b3cb2f3ddf41a98f19c1e99328ad7746f0a35bf8f024db49c4bba21
3
+ size 5649