modrill committed on
Commit
8a56f98
·
verified ·
1 Parent(s): e571407

Add files using upload-large-folder tool

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ checkpoint-659/tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,16 +1,58 @@
1
  ---
2
- license: cc-by-nc-4.0
3
- tags:
4
- - qwen3
5
- - mhm
6
- - text-generation
7
  library_name: transformers
 
 
 
 
 
 
8
  ---
9
 
10
- # kodcode_3_qwen3_4b_sft
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
- Auto-uploaded by watcher.
13
 
14
- - Source path: `trl/qwen3-4b-sft-kodcode-3`
15
- - Uploaded at: `2026-05-20T05:52:00.573487`
16
- - Visibility: `public`
 
 
 
 
 
 
 
 
 
1
  ---
2
+ base_model: Qwen/Qwen3-4B-Base
 
 
 
 
3
  library_name: transformers
4
+ model_name: qwen3-4b-sft-kodcode-3
5
+ tags:
6
+ - generated_from_trainer
7
+ - sft
8
+ - trl
9
+ license: cc-by-nc-4.0
10
  ---
11
 
12
+ # Model Card for qwen3-4b-sft-kodcode-3
13
+
14
+ This model is a fine-tuned version of [Qwen/Qwen3-4B-Base](https://huggingface.co/Qwen/Qwen3-4B-Base).
15
+ It has been trained using [TRL](https://github.com/huggingface/trl).
16
+
17
+ ## Quick start
18
+
19
+ ```python
20
+ from transformers import pipeline
21
+
22
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
23
+ generator = pipeline("text-generation", model="modrill/kodcode_3_qwen3_4b_sft", device="cuda")
24
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
25
+ print(output["generated_text"])
26
+ ```
27
+
28
+ ## Training procedure
29
+
30
+
31
+
32
+
33
+
34
+ This model was trained with SFT.
35
+
36
+ ### Framework versions
37
+
38
+ - TRL: 1.5.0.dev0
39
+ - Transformers: 5.8.0
40
+ - Pytorch: 2.11.0
41
+ - Datasets: 4.8.5
42
+ - Tokenizers: 0.22.2
43
+
44
+ ## Citations
45
+
46
 
 
47
 
48
+ Cite TRL as:
49
+
50
+ ```bibtex
51
+ @software{vonwerra2020trl,
52
+ title = {{TRL: Transformers Reinforcement Learning}},
53
+ author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
54
+ license = {Apache-2.0},
55
+ url = {https://github.com/huggingface/trl},
56
+ year = {2020}
57
+ }
58
+ ```
chat_template.jinja ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- for message in messages %}
2
+ {%- if message.role == "user" %}
3
+ {{- '<|im_start|>user\n' + message.content + '<|im_end|>\n' }}
4
+ {%- elif message.role == "system" %}
5
+ {{- '<|im_start|>system\n' + message.content + '<|im_end|>\n' }}
6
+ {%- elif message.role == "assistant" %}
7
+ {{- '<|im_start|>assistant\n' }}{% generation %}{{ message.content }}{% endgeneration %}{{ '<|im_end|>\n' }}
8
+ {%- endif %}
9
+ {%- endfor %}
10
+ {%- if add_generation_prompt %}
11
+ {{- '<|im_start|>assistant\n' }}
12
+ {%- endif %}
checkpoint-659/chat_template.jinja ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- for message in messages %}
2
+ {%- if message.role == "user" %}
3
+ {{- '<|im_start|>user\n' + message.content + '<|im_end|>\n' }}
4
+ {%- elif message.role == "system" %}
5
+ {{- '<|im_start|>system\n' + message.content + '<|im_end|>\n' }}
6
+ {%- elif message.role == "assistant" %}
7
+ {{- '<|im_start|>assistant\n' }}{% generation %}{{ message.content }}{% endgeneration %}{{ '<|im_end|>\n' }}
8
+ {%- endif %}
9
+ {%- endfor %}
10
+ {%- if add_generation_prompt %}
11
+ {{- '<|im_start|>assistant\n' }}
12
+ {%- endif %}
checkpoint-659/config.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": null,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 151645,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 2560,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 9728,
15
+ "layer_types": [
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention",
44
+ "full_attention",
45
+ "full_attention",
46
+ "full_attention",
47
+ "full_attention",
48
+ "full_attention",
49
+ "full_attention",
50
+ "full_attention",
51
+ "full_attention"
52
+ ],
53
+ "max_position_embeddings": 32768,
54
+ "max_window_layers": 36,
55
+ "model_type": "qwen3",
56
+ "num_attention_heads": 32,
57
+ "num_hidden_layers": 36,
58
+ "num_key_value_heads": 8,
59
+ "pad_token_id": 151643,
60
+ "rms_norm_eps": 1e-06,
61
+ "rope_parameters": {
62
+ "rope_theta": 1000000,
63
+ "rope_type": "default"
64
+ },
65
+ "sliding_window": null,
66
+ "tie_word_embeddings": true,
67
+ "transformers_version": "5.8.0",
68
+ "use_cache": false,
69
+ "use_sliding_window": false,
70
+ "vocab_size": 151936
71
+ }
checkpoint-659/generation_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_sample": false,
3
+ "eos_token_id": [
4
+ 151645,
5
+ 151643
6
+ ],
7
+ "max_new_tokens": 2048,
8
+ "pad_token_id": 151643,
9
+ "transformers_version": "5.8.0"
10
+ }
checkpoint-659/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7e18e9f555d63e04b2a5d67a10939ba633cf584772c20342c840bc3158f275c
3
+ size 8044982080
checkpoint-659/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71c2839680bde13ef34d634ec2c6bbbe6f84f6c1edb7b687385a7647accfd8ce
3
+ size 16090225449
checkpoint-659/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:659b1cdee2219458dd84ce6a632a595465680b8080e5c44bd600ff97eca8d752
3
+ size 15429
checkpoint-659/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:86accf27064cdd503053e90476a6bd10de333d4ff0594535ad55ea13a473c91d
3
+ size 15429
checkpoint-659/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18ca8d714ef40be035404c1957b5a4dee84e1f43980408393f8aa710552ee6f6
3
+ size 15429
checkpoint-659/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2cfdebe99e40accc9c9d8f09c63136a14abda997d9b501969ec8e16e9d183179
3
+ size 15429
checkpoint-659/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:638f76f10b8122f6b6d00ef579bef156aea843a74d8ad66f5d19ea5b06be426f
3
+ size 1465
checkpoint-659/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
3
+ size 11422650
checkpoint-659/tokenizer_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|im_start|>",
10
+ "<|im_end|>",
11
+ "<|object_ref_start|>",
12
+ "<|object_ref_end|>",
13
+ "<|box_start|>",
14
+ "<|box_end|>",
15
+ "<|quad_start|>",
16
+ "<|quad_end|>",
17
+ "<|vision_start|>",
18
+ "<|vision_end|>",
19
+ "<|vision_pad|>",
20
+ "<|image_pad|>",
21
+ "<|video_pad|>"
22
+ ],
23
+ "is_local": false,
24
+ "local_files_only": false,
25
+ "model_max_length": 131072,
26
+ "pad_token": "<|endoftext|>",
27
+ "split_special_tokens": false,
28
+ "tokenizer_class": "Qwen2Tokenizer",
29
+ "unk_token": null
30
+ }
checkpoint-659/trainer_state.json ADDED
@@ -0,0 +1,684 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 659,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "entropy": 0.3487173642963171,
14
+ "epoch": 0.015186028853454821,
15
+ "grad_norm": 1.609375,
16
+ "learning_rate": 2.7272727272727272e-06,
17
+ "loss": 0.4099268913269043,
18
+ "mean_token_accuracy": 0.8717762351036071,
19
+ "num_tokens": 568708.0,
20
+ "step": 10
21
+ },
22
+ {
23
+ "entropy": 0.37818768359720706,
24
+ "epoch": 0.030372057706909643,
25
+ "grad_norm": 0.87890625,
26
+ "learning_rate": 5.7575757575757586e-06,
27
+ "loss": 0.39949469566345214,
28
+ "mean_token_accuracy": 0.8737476468086243,
29
+ "num_tokens": 1125639.0,
30
+ "step": 20
31
+ },
32
+ {
33
+ "entropy": 0.40618006475269797,
34
+ "epoch": 0.04555808656036447,
35
+ "grad_norm": 0.70703125,
36
+ "learning_rate": 8.787878787878788e-06,
37
+ "loss": 0.3975033760070801,
38
+ "mean_token_accuracy": 0.8727035835385323,
39
+ "num_tokens": 1683225.0,
40
+ "step": 30
41
+ },
42
+ {
43
+ "entropy": 0.3896496780216694,
44
+ "epoch": 0.060744115413819286,
45
+ "grad_norm": 0.6953125,
46
+ "learning_rate": 9.997733473639876e-06,
47
+ "loss": 0.3925030708312988,
48
+ "mean_token_accuracy": 0.8742863699793816,
49
+ "num_tokens": 2236895.0,
50
+ "step": 40
51
+ },
52
+ {
53
+ "entropy": 0.37322904225438835,
54
+ "epoch": 0.07593014426727411,
55
+ "grad_norm": 0.68359375,
56
+ "learning_rate": 9.983889919973586e-06,
57
+ "loss": 0.3752753257751465,
58
+ "mean_token_accuracy": 0.8792506881058216,
59
+ "num_tokens": 2818707.0,
60
+ "step": 50
61
+ },
62
+ {
63
+ "entropy": 0.3811411205679178,
64
+ "epoch": 0.09111617312072894,
65
+ "grad_norm": 0.66796875,
66
+ "learning_rate": 9.957496810072027e-06,
67
+ "loss": 0.38604438304901123,
68
+ "mean_token_accuracy": 0.8750339619815349,
69
+ "num_tokens": 3351348.0,
70
+ "step": 60
71
+ },
72
+ {
73
+ "entropy": 0.3796327030286193,
74
+ "epoch": 0.10630220197418375,
75
+ "grad_norm": 0.66015625,
76
+ "learning_rate": 9.918620602428916e-06,
77
+ "loss": 0.37710745334625245,
78
+ "mean_token_accuracy": 0.8776259452104569,
79
+ "num_tokens": 3915545.0,
80
+ "step": 70
81
+ },
82
+ {
83
+ "entropy": 0.37812459245324137,
84
+ "epoch": 0.12148823082763857,
85
+ "grad_norm": 0.64453125,
86
+ "learning_rate": 9.867359188282193e-06,
87
+ "loss": 0.38009963035583494,
88
+ "mean_token_accuracy": 0.8783061921596527,
89
+ "num_tokens": 4462906.0,
90
+ "step": 80
91
+ },
92
+ {
93
+ "entropy": 0.3751340739428997,
94
+ "epoch": 0.1366742596810934,
95
+ "grad_norm": 0.6640625,
96
+ "learning_rate": 9.803841645121505e-06,
97
+ "loss": 0.37636594772338866,
98
+ "mean_token_accuracy": 0.8778362341225148,
99
+ "num_tokens": 5029003.0,
100
+ "step": 90
101
+ },
102
+ {
103
+ "entropy": 0.37551863975822924,
104
+ "epoch": 0.15186028853454822,
105
+ "grad_norm": 0.6640625,
106
+ "learning_rate": 9.728227911667934e-06,
107
+ "loss": 0.3773549795150757,
108
+ "mean_token_accuracy": 0.8772883579134941,
109
+ "num_tokens": 5596042.0,
110
+ "step": 100
111
+ },
112
+ {
113
+ "entropy": 0.3809764288365841,
114
+ "epoch": 0.16704631738800305,
115
+ "grad_norm": 0.71484375,
116
+ "learning_rate": 9.640708385144403e-06,
117
+ "loss": 0.3807323932647705,
118
+ "mean_token_accuracy": 0.8774459846317768,
119
+ "num_tokens": 6144821.0,
120
+ "step": 110
121
+ },
122
+ {
123
+ "entropy": 0.37467240951955316,
124
+ "epoch": 0.18223234624145787,
125
+ "grad_norm": 0.62890625,
126
+ "learning_rate": 9.541503441850844e-06,
127
+ "loss": 0.37542564868927003,
128
+ "mean_token_accuracy": 0.8782215595245362,
129
+ "num_tokens": 6691491.0,
130
+ "step": 120
131
+ },
132
+ {
133
+ "entropy": 0.3787713166326284,
134
+ "epoch": 0.19741837509491267,
135
+ "grad_norm": 0.7109375,
136
+ "learning_rate": 9.430862882251279e-06,
137
+ "loss": 0.37993783950805665,
138
+ "mean_token_accuracy": 0.8774278596043587,
139
+ "num_tokens": 7247335.0,
140
+ "step": 130
141
+ },
142
+ {
143
+ "entropy": 0.3862619888037443,
144
+ "epoch": 0.2126044039483675,
145
+ "grad_norm": 0.71484375,
146
+ "learning_rate": 9.309065301970193e-06,
147
+ "loss": 0.38727219104766847,
148
+ "mean_token_accuracy": 0.8749251998960972,
149
+ "num_tokens": 7808664.0,
150
+ "step": 140
151
+ },
152
+ {
153
+ "entropy": 0.3778634283691645,
154
+ "epoch": 0.22779043280182232,
155
+ "grad_norm": 0.71484375,
156
+ "learning_rate": 9.176417390281944e-06,
157
+ "loss": 0.38028583526611326,
158
+ "mean_token_accuracy": 0.8772468723356723,
159
+ "num_tokens": 8360893.0,
160
+ "step": 150
161
+ },
162
+ {
163
+ "entropy": 0.3749677825719118,
164
+ "epoch": 0.24297646165527714,
165
+ "grad_norm": 0.69921875,
166
+ "learning_rate": 9.033253157859715e-06,
167
+ "loss": 0.37344467639923096,
168
+ "mean_token_accuracy": 0.8786589197814465,
169
+ "num_tokens": 8905139.0,
170
+ "step": 160
171
+ },
172
+ {
173
+ "entropy": 0.37994367331266404,
174
+ "epoch": 0.25816249050873197,
175
+ "grad_norm": 0.69140625,
176
+ "learning_rate": 8.879933095728485e-06,
177
+ "loss": 0.38379650115966796,
178
+ "mean_token_accuracy": 0.8768095754086971,
179
+ "num_tokens": 9467791.0,
180
+ "step": 170
181
+ },
182
+ {
183
+ "entropy": 0.3774459037929773,
184
+ "epoch": 0.2733485193621868,
185
+ "grad_norm": 0.7109375,
186
+ "learning_rate": 8.716843267539868e-06,
187
+ "loss": 0.3767258644104004,
188
+ "mean_token_accuracy": 0.8779186218976974,
189
+ "num_tokens": 10013526.0,
190
+ "step": 180
191
+ },
192
+ {
193
+ "entropy": 0.3706828704103827,
194
+ "epoch": 0.2885345482156416,
195
+ "grad_norm": 0.6875,
196
+ "learning_rate": 8.544394337454409e-06,
197
+ "loss": 0.373125958442688,
198
+ "mean_token_accuracy": 0.8792334951460361,
199
+ "num_tokens": 10567209.0,
200
+ "step": 190
201
+ },
202
+ {
203
+ "entropy": 0.3781122103333473,
204
+ "epoch": 0.30372057706909644,
205
+ "grad_norm": 0.7109375,
206
+ "learning_rate": 8.36302053607924e-06,
207
+ "loss": 0.3779691457748413,
208
+ "mean_token_accuracy": 0.877835976332426,
209
+ "num_tokens": 11121802.0,
210
+ "step": 200
211
+ },
212
+ {
213
+ "entropy": 0.37852676026523113,
214
+ "epoch": 0.31890660592255127,
215
+ "grad_norm": 0.703125,
216
+ "learning_rate": 8.17317856706482e-06,
217
+ "loss": 0.37905910015106203,
218
+ "mean_token_accuracy": 0.877592646330595,
219
+ "num_tokens": 11677885.0,
220
+ "step": 210
221
+ },
222
+ {
223
+ "entropy": 0.376834512129426,
224
+ "epoch": 0.3340926347760061,
225
+ "grad_norm": 0.66015625,
226
+ "learning_rate": 7.975346457114034e-06,
227
+ "loss": 0.3753563404083252,
228
+ "mean_token_accuracy": 0.8776590585708618,
229
+ "num_tokens": 12235216.0,
230
+ "step": 220
231
+ },
232
+ {
233
+ "entropy": 0.3737819105386734,
234
+ "epoch": 0.3492786636294609,
235
+ "grad_norm": 0.671875,
236
+ "learning_rate": 7.770022352299294e-06,
237
+ "loss": 0.37358593940734863,
238
+ "mean_token_accuracy": 0.878921328485012,
239
+ "num_tokens": 12787759.0,
240
+ "step": 230
241
+ },
242
+ {
243
+ "entropy": 0.3769740372896194,
244
+ "epoch": 0.36446469248291574,
245
+ "grad_norm": 0.73046875,
246
+ "learning_rate": 7.557723263718596e-06,
247
+ "loss": 0.37995898723602295,
248
+ "mean_token_accuracy": 0.8769361607730388,
249
+ "num_tokens": 13346471.0,
250
+ "step": 240
251
+ },
252
+ {
253
+ "entropy": 0.3878506176173687,
254
+ "epoch": 0.37965072133637057,
255
+ "grad_norm": 0.70703125,
256
+ "learning_rate": 7.338983765648985e-06,
257
+ "loss": 0.38782215118408203,
258
+ "mean_token_accuracy": 0.8749015353620052,
259
+ "num_tokens": 13895194.0,
260
+ "step": 250
261
+ },
262
+ {
263
+ "entropy": 0.37555828876793385,
264
+ "epoch": 0.39483675018982534,
265
+ "grad_norm": 0.6875,
266
+ "learning_rate": 7.114354649475499e-06,
267
+ "loss": 0.3771331787109375,
268
+ "mean_token_accuracy": 0.878202386945486,
269
+ "num_tokens": 14453542.0,
270
+ "step": 260
271
+ },
272
+ {
273
+ "entropy": 0.37179951313883064,
274
+ "epoch": 0.41002277904328016,
275
+ "grad_norm": 0.6484375,
276
+ "learning_rate": 6.884401536785045e-06,
277
+ "loss": 0.37206058502197265,
278
+ "mean_token_accuracy": 0.8789021499454975,
279
+ "num_tokens": 15016280.0,
280
+ "step": 270
281
+ },
282
+ {
283
+ "entropy": 0.3706459369510412,
284
+ "epoch": 0.425208807896735,
285
+ "grad_norm": 0.7109375,
286
+ "learning_rate": 6.6497034551174585e-06,
287
+ "loss": 0.37101426124572756,
288
+ "mean_token_accuracy": 0.8798882246017456,
289
+ "num_tokens": 15561057.0,
290
+ "step": 280
291
+ },
292
+ {
293
+ "entropy": 0.37728526555001735,
294
+ "epoch": 0.4403948367501898,
295
+ "grad_norm": 0.65625,
296
+ "learning_rate": 6.41085137996006e-06,
297
+ "loss": 0.37785754203796384,
298
+ "mean_token_accuracy": 0.8779311388731003,
299
+ "num_tokens": 16127699.0,
300
+ "step": 290
301
+ },
302
+ {
303
+ "entropy": 0.3816283464431763,
304
+ "epoch": 0.45558086560364464,
305
+ "grad_norm": 0.81640625,
306
+ "learning_rate": 6.168446746656973e-06,
307
+ "loss": 0.3794879674911499,
308
+ "mean_token_accuracy": 0.8773063771426678,
309
+ "num_tokens": 16686457.0,
310
+ "step": 300
311
+ },
312
+ {
313
+ "entropy": 0.37517447732388975,
314
+ "epoch": 0.47076689445709946,
315
+ "grad_norm": 0.70703125,
316
+ "learning_rate": 5.923099935980278e-06,
317
+ "loss": 0.3782352924346924,
318
+ "mean_token_accuracy": 0.8787827685475349,
319
+ "num_tokens": 17254272.0,
320
+ "step": 310
321
+ },
322
+ {
323
+ "entropy": 0.374018133059144,
324
+ "epoch": 0.4859529233105543,
325
+ "grad_norm": 0.71484375,
326
+ "learning_rate": 5.675428737176367e-06,
327
+ "loss": 0.37341156005859377,
328
+ "mean_token_accuracy": 0.8788688823580741,
329
+ "num_tokens": 17809900.0,
330
+ "step": 320
331
+ },
332
+ {
333
+ "entropy": 0.3753270395100117,
334
+ "epoch": 0.5011389521640092,
335
+ "grad_norm": 0.68359375,
336
+ "learning_rate": 5.426056792357552e-06,
337
+ "loss": 0.3752497673034668,
338
+ "mean_token_accuracy": 0.8784179173409938,
339
+ "num_tokens": 18379566.0,
340
+ "step": 330
341
+ },
342
+ {
343
+ "entropy": 0.3742110010236502,
344
+ "epoch": 0.5163249810174639,
345
+ "grad_norm": 0.6875,
346
+ "learning_rate": 5.175612026156045e-06,
347
+ "loss": 0.3746063232421875,
348
+ "mean_token_accuracy": 0.8782069273293018,
349
+ "num_tokens": 18943281.0,
350
+ "step": 340
351
+ },
352
+ {
353
+ "entropy": 0.37444472052156924,
354
+ "epoch": 0.5315110098709187,
355
+ "grad_norm": 0.71484375,
356
+ "learning_rate": 4.924725064594448e-06,
357
+ "loss": 0.3729024171829224,
358
+ "mean_token_accuracy": 0.8787923693656922,
359
+ "num_tokens": 19488865.0,
360
+ "step": 350
361
+ },
362
+ {
363
+ "entropy": 0.3750518877059221,
364
+ "epoch": 0.5466970387243736,
365
+ "grad_norm": 0.78515625,
366
+ "learning_rate": 4.674027647154037e-06,
367
+ "loss": 0.3758077621459961,
368
+ "mean_token_accuracy": 0.8765743866562843,
369
+ "num_tokens": 20048281.0,
370
+ "step": 360
371
+ },
372
+ {
373
+ "entropy": 0.3787516813725233,
374
+ "epoch": 0.5618830675778284,
375
+ "grad_norm": 0.74609375,
376
+ "learning_rate": 4.424151036039381e-06,
377
+ "loss": 0.3790909767150879,
378
+ "mean_token_accuracy": 0.8769759923219681,
379
+ "num_tokens": 20597434.0,
380
+ "step": 370
381
+ },
382
+ {
383
+ "entropy": 0.3786877432838082,
384
+ "epoch": 0.5770690964312832,
385
+ "grad_norm": 0.68359375,
386
+ "learning_rate": 4.175724426644724e-06,
387
+ "loss": 0.3812232971191406,
388
+ "mean_token_accuracy": 0.8777030549943448,
389
+ "num_tokens": 21161267.0,
390
+ "step": 380
391
+ },
392
+ {
393
+ "entropy": 0.37311795353889465,
394
+ "epoch": 0.592255125284738,
395
+ "grad_norm": 0.6796875,
396
+ "learning_rate": 3.929373363224654e-06,
397
+ "loss": 0.3731100559234619,
398
+ "mean_token_accuracy": 0.8793233536183834,
399
+ "num_tokens": 21709421.0,
400
+ "step": 390
401
+ },
402
+ {
403
+ "entropy": 0.3734915753826499,
404
+ "epoch": 0.6074411541381929,
405
+ "grad_norm": 0.6875,
406
+ "learning_rate": 3.685718163758427e-06,
407
+ "loss": 0.37124335765838623,
408
+ "mean_token_accuracy": 0.8786324210464954,
409
+ "num_tokens": 22250023.0,
410
+ "step": 400
411
+ },
412
+ {
413
+ "entropy": 0.3722789943218231,
414
+ "epoch": 0.6226271829916477,
415
+ "grad_norm": 0.67578125,
416
+ "learning_rate": 3.445372357974194e-06,
417
+ "loss": 0.37429609298706057,
418
+ "mean_token_accuracy": 0.8784996062517166,
419
+ "num_tokens": 22802881.0,
420
+ "step": 410
421
+ },
422
+ {
423
+ "entropy": 0.3826644644141197,
424
+ "epoch": 0.6378132118451025,
425
+ "grad_norm": 0.65625,
426
+ "learning_rate": 3.2089411424661864e-06,
427
+ "loss": 0.3828511953353882,
428
+ "mean_token_accuracy": 0.875868634134531,
429
+ "num_tokens": 23368508.0,
430
+ "step": 420
431
+ },
432
+ {
433
+ "entropy": 0.36304581388831136,
434
+ "epoch": 0.6529992406985573,
435
+ "grad_norm": 0.703125,
436
+ "learning_rate": 2.977019856794955e-06,
437
+ "loss": 0.362534499168396,
438
+ "mean_token_accuracy": 0.8821237675845623,
439
+ "num_tokens": 23923709.0,
440
+ "step": 430
441
+ },
442
+ {
443
+ "entropy": 0.38759873658418653,
444
+ "epoch": 0.6681852695520122,
445
+ "grad_norm": 0.67578125,
446
+ "learning_rate": 2.7501924844078538e-06,
447
+ "loss": 0.38718571662902834,
448
+ "mean_token_accuracy": 0.8746685221791267,
449
+ "num_tokens": 24477925.0,
450
+ "step": 440
451
+ },
452
+ {
453
+ "entropy": 0.3708066754043102,
454
+ "epoch": 0.683371298405467,
455
+ "grad_norm": 0.6875,
456
+ "learning_rate": 2.5290301821544826e-06,
457
+ "loss": 0.36970815658569334,
458
+ "mean_token_accuracy": 0.8802599847316742,
459
+ "num_tokens": 25027245.0,
460
+ "step": 450
461
+ },
462
+ {
463
+ "entropy": 0.36688214987516404,
464
+ "epoch": 0.6985573272589218,
465
+ "grad_norm": 0.68359375,
466
+ "learning_rate": 2.3140898420998425e-06,
467
+ "loss": 0.3657586097717285,
468
+ "mean_token_accuracy": 0.8809232294559479,
469
+ "num_tokens": 25582534.0,
470
+ "step": 460
471
+ },
472
+ {
473
+ "entropy": 0.36983687337487936,
474
+ "epoch": 0.7137433561123766,
475
+ "grad_norm": 0.7109375,
476
+ "learning_rate": 2.105912689256533e-06,
477
+ "loss": 0.37239837646484375,
478
+ "mean_token_accuracy": 0.8794391065835953,
479
+ "num_tokens": 26134875.0,
480
+ "step": 470
481
+ },
482
+ {
483
+ "entropy": 0.3732773784548044,
484
+ "epoch": 0.7289293849658315,
485
+ "grad_norm": 0.63671875,
486
+ "learning_rate": 1.905022918766995e-06,
487
+ "loss": 0.37306258678436277,
488
+ "mean_token_accuracy": 0.8793606124818325,
489
+ "num_tokens": 26681084.0,
490
+ "step": 480
491
+ },
492
+ {
493
+ "entropy": 0.3710829207673669,
494
+ "epoch": 0.7441154138192863,
495
+ "grad_norm": 0.69921875,
496
+ "learning_rate": 1.7119263759673677e-06,
497
+ "loss": 0.3711911678314209,
498
+ "mean_token_accuracy": 0.8803343921899796,
499
+ "num_tokens": 27234332.0,
500
+ "step": 490
501
+ },
502
+ {
503
+ "entropy": 0.3832434043288231,
504
+ "epoch": 0.7593014426727411,
505
+ "grad_norm": 0.9921875,
506
+ "learning_rate": 1.5271092826566108e-06,
507
+ "loss": 0.3841698169708252,
508
+ "mean_token_accuracy": 0.8759766638278961,
509
+ "num_tokens": 27794602.0,
510
+ "step": 500
511
+ },
512
+ {
513
+ "entropy": 0.38211295008659363,
514
+ "epoch": 0.7744874715261959,
515
+ "grad_norm": 0.8046875,
516
+ "learning_rate": 1.3510370127781635e-06,
517
+ "loss": 0.3804590940475464,
518
+ "mean_token_accuracy": 0.8769169762730599,
519
+ "num_tokens": 28354831.0,
520
+ "step": 510
521
+ },
522
+ {
523
+ "entropy": 0.3752284612506628,
524
+ "epoch": 0.7896735003796507,
525
+ "grad_norm": 0.73046875,
526
+ "learning_rate": 1.1841529205970281e-06,
527
+ "loss": 0.37546916007995607,
528
+ "mean_token_accuracy": 0.8786770381033421,
529
+ "num_tokens": 28922852.0,
530
+ "step": 520
531
+ },
532
+ {
533
+ "entropy": 0.36873019095510245,
534
+ "epoch": 0.8048595292331056,
535
+ "grad_norm": 0.6640625,
536
+ "learning_rate": 1.026877224322923e-06,
537
+ "loss": 0.36797375679016114,
538
+ "mean_token_accuracy": 0.880731363594532,
539
+ "num_tokens": 29493442.0,
540
+ "step": 530
541
+ },
542
+ {
543
+ "entropy": 0.3788988694548607,
544
+ "epoch": 0.8200455580865603,
545
+ "grad_norm": 0.69921875,
546
+ "learning_rate": 8.7960594799059e-07,
547
+ "loss": 0.37884984016418455,
548
+ "mean_token_accuracy": 0.8770281121134758,
549
+ "num_tokens": 30034443.0,
550
+ "step": 540
551
+ },
552
+ {
553
+ "entropy": 0.3814096964895725,
554
+ "epoch": 0.8352315869400152,
555
+ "grad_norm": 0.73046875,
556
+ "learning_rate": 7.427099242616348e-07,
557
+ "loss": 0.3821078300476074,
558
+ "mean_token_accuracy": 0.8763406798243523,
559
+ "num_tokens": 30570567.0,
560
+ "step": 550
561
+ },
562
+ {
563
+ "entropy": 0.3758743409067392,
564
+ "epoch": 0.85041761579347,
565
+ "grad_norm": 0.6953125,
566
+ "learning_rate": 6.165338606588517e-07,
567
+ "loss": 0.3744307279586792,
568
+ "mean_token_accuracy": 0.8794760994613171,
569
+ "num_tokens": 31129457.0,
570
+ "step": 560
571
+ },
572
+ {
573
+ "entropy": 0.37431446108967065,
574
+ "epoch": 0.8656036446469249,
575
+ "grad_norm": 0.6640625,
576
+ "learning_rate": 5.0139547158427e-07,
577
+ "loss": 0.37335963249206544,
578
+ "mean_token_accuracy": 0.8793582506477833,
579
+ "num_tokens": 31689902.0,
580
+ "step": 570
581
+ },
582
+ {
583
+ "entropy": 0.38372623883187773,
584
+ "epoch": 0.8807896735003796,
585
+ "grad_norm": 0.7578125,
586
+ "learning_rate": 3.9758467830656623e-07,
587
+ "loss": 0.38321547508239745,
588
+ "mean_token_accuracy": 0.8755873307585716,
589
+ "num_tokens": 32253359.0,
590
+ "step": 580
591
+ },
592
+ {
593
+ "entropy": 0.36962624490261076,
594
+ "epoch": 0.8959757023538345,
595
+ "grad_norm": 0.72265625,
596
+ "learning_rate": 3.0536287893223603e-07,
597
+ "loss": 0.37100839614868164,
598
+ "mean_token_accuracy": 0.8799250744283199,
599
+ "num_tokens": 32813428.0,
600
+ "step": 590
601
+ },
602
+ {
603
+ "entropy": 0.3853254303336143,
604
+ "epoch": 0.9111617312072893,
605
+ "grad_norm": 0.6640625,
606
+ "learning_rate": 2.2496229019879635e-07,
607
+ "loss": 0.3848439693450928,
608
+ "mean_token_accuracy": 0.8762004837393761,
609
+ "num_tokens": 33382024.0,
610
+ "step": 600
611
+ },
612
+ {
613
+ "entropy": 0.3742272950708866,
614
+ "epoch": 0.9263477600607442,
615
+ "grad_norm": 0.671875,
616
+ "learning_rate": 1.5658536274738623e-07,
617
+ "loss": 0.3725078582763672,
618
+ "mean_token_accuracy": 0.8787065915763378,
619
+ "num_tokens": 33939521.0,
620
+ "step": 610
621
+ },
622
+ {
623
+ "entropy": 0.3774934906512499,
624
+ "epoch": 0.9415337889141989,
625
+ "grad_norm": 0.69140625,
626
+ "learning_rate": 1.004042713471165e-07,
627
+ "loss": 0.37858588695526124,
628
+ "mean_token_accuracy": 0.877676124125719,
629
+ "num_tokens": 34482539.0,
630
+ "step": 620
631
+ },
632
+ {
633
+ "entropy": 0.37360552567988636,
634
+ "epoch": 0.9567198177676538,
635
+ "grad_norm": 0.734375,
636
+ "learning_rate": 5.6560481354807625e-08,
637
+ "loss": 0.37269864082336424,
638
+ "mean_token_accuracy": 0.8786865592002868,
639
+ "num_tokens": 35045028.0,
640
+ "step": 630
641
+ },
642
+ {
643
+ "entropy": 0.3767994062975049,
644
+ "epoch": 0.9719058466211086,
645
+ "grad_norm": 0.890625,
646
+ "learning_rate": 2.516439250177749e-08,
647
+ "loss": 0.3758098125457764,
648
+ "mean_token_accuracy": 0.8778494797647,
649
+ "num_tokens": 35600399.0,
650
+ "step": 640
651
+ },
652
+ {
653
+ "entropy": 0.3786265593022108,
654
+ "epoch": 0.9870918754745635,
655
+ "grad_norm": 0.6875,
656
+ "learning_rate": 6.295060904623618e-09,
657
+ "loss": 0.378217077255249,
658
+ "mean_token_accuracy": 0.8778206452727317,
659
+ "num_tokens": 36162293.0,
660
+ "step": 650
661
+ }
662
+ ],
663
+ "logging_steps": 10,
664
+ "max_steps": 659,
665
+ "num_input_tokens_seen": 0,
666
+ "num_train_epochs": 1,
667
+ "save_steps": 500,
668
+ "stateful_callbacks": {
669
+ "TrainerControl": {
670
+ "args": {
671
+ "should_epoch_stop": false,
672
+ "should_evaluate": false,
673
+ "should_log": false,
674
+ "should_save": true,
675
+ "should_training_stop": true
676
+ },
677
+ "attributes": {}
678
+ }
679
+ },
680
+ "total_flos": 9.858614662906511e+17,
681
+ "train_batch_size": 2,
682
+ "trial_name": null,
683
+ "trial_params": null
684
+ }
checkpoint-659/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:514f32c7b13687591d405f4e860f5e4d9145eaaff00bbbfd04aded17ecc9774d
3
+ size 5777
config.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": null,
8
+ "dtype": "bfloat16",
9
+ "eos_token_id": 151645,
10
+ "head_dim": 128,
11
+ "hidden_act": "silu",
12
+ "hidden_size": 2560,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 9728,
15
+ "layer_types": [
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention",
44
+ "full_attention",
45
+ "full_attention",
46
+ "full_attention",
47
+ "full_attention",
48
+ "full_attention",
49
+ "full_attention",
50
+ "full_attention",
51
+ "full_attention"
52
+ ],
53
+ "max_position_embeddings": 32768,
54
+ "max_window_layers": 36,
55
+ "model_type": "qwen3",
56
+ "num_attention_heads": 32,
57
+ "num_hidden_layers": 36,
58
+ "num_key_value_heads": 8,
59
+ "pad_token_id": 151643,
60
+ "rms_norm_eps": 1e-06,
61
+ "rope_parameters": {
62
+ "rope_theta": 1000000,
63
+ "rope_type": "default"
64
+ },
65
+ "sliding_window": null,
66
+ "tie_word_embeddings": true,
67
+ "transformers_version": "5.8.0",
68
+ "use_cache": false,
69
+ "use_sliding_window": false,
70
+ "vocab_size": 151936
71
+ }
generation_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_sample": false,
3
+ "eos_token_id": [
4
+ 151645,
5
+ 151643
6
+ ],
7
+ "max_new_tokens": 2048,
8
+ "pad_token_id": 151643,
9
+ "transformers_version": "5.8.0"
10
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7e18e9f555d63e04b2a5d67a10939ba633cf584772c20342c840bc3158f275c
3
+ size 8044982080
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
3
+ size 11422650
tokenizer_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": {
9
+ "im_start": "<|im_start|>",
10
+ "im_end": "<|im_end|>",
11
+ "object_ref_start": "<|object_ref_start|>",
12
+ "object_ref_end": "<|object_ref_end|>",
13
+ "box_start": "<|box_start|>",
14
+ "box_end": "<|box_end|>",
15
+ "quad_start": "<|quad_start|>",
16
+ "quad_end": "<|quad_end|>",
17
+ "vision_start": "<|vision_start|>",
18
+ "vision_end": "<|vision_end|>",
19
+ "vision_pad": "<|vision_pad|>",
20
+ "image_pad": "<|image_pad|>",
21
+ "video_pad": "<|video_pad|>"
22
+ },
23
+ "is_local": false,
24
+ "local_files_only": false,
25
+ "model_max_length": 131072,
26
+ "pad_token": "<|endoftext|>",
27
+ "split_special_tokens": false,
28
+ "tokenizer_class": "Qwen2Tokenizer",
29
+ "unk_token": null
30
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:514f32c7b13687591d405f4e860f5e4d9145eaaff00bbbfd04aded17ecc9774d
3
+ size 5777