Ba2han committed on
Commit
2722c40
·
verified ·
1 Parent(s): eaf6741

Training in progress, step 267

Browse files
README.md CHANGED
@@ -1,18 +1,18 @@
1
  ---
2
- base_model: Ba2han/qwen-test-3-longer-2
3
  library_name: transformers
4
  model_name: model-sft-q2
5
  tags:
6
  - generated_from_trainer
7
- - trl
8
  - unsloth
9
  - sft
 
10
  licence: license
11
  ---
12
 
13
  # Model Card for model-sft-q2
14
 
15
- This model is a fine-tuned version of [Ba2han/qwen-test-3-longer-2](https://huggingface.co/Ba2han/qwen-test-3-longer-2).
16
  It has been trained using [TRL](https://github.com/huggingface/trl).
17
 
18
  ## Quick start
@@ -28,7 +28,7 @@ print(output["generated_text"])
28
 
29
  ## Training procedure
30
 
31
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/batuhan409/huggingface/runs/j7uw0ag9)
32
 
33
 
34
  This model was trained with SFT.
 
1
  ---
2
+ base_model: Ba2han/qwen3_from_scratch
3
  library_name: transformers
4
  model_name: model-sft-q2
5
  tags:
6
  - generated_from_trainer
 
7
  - unsloth
8
  - sft
9
+ - trl
10
  licence: license
11
  ---
12
 
13
  # Model Card for model-sft-q2
14
 
15
+ This model is a fine-tuned version of [Ba2han/qwen3_from_scratch](https://huggingface.co/Ba2han/qwen3_from_scratch).
16
  It has been trained using [TRL](https://github.com/huggingface/trl).
17
 
18
  ## Quick start
 
28
 
29
  ## Training procedure
30
 
31
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/batuhan409/huggingface/runs/ye3goxhy)
32
 
33
 
34
  This model was trained with SFT.
chat_template.jinja ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {% for message in messages %}{{'<|im_start|>' + message['role'] + '
2
+ ' + message['content'].strip() + '<|im_end|>' + '
3
+ '}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
4
+ ' }}{% endif %}
config.json CHANGED
@@ -1,17 +1,20 @@
1
  {
2
  "architectures": [
3
- "Qwen3ForCausalLM"
4
  ],
5
  "attention_bias": false,
6
  "attention_dropout": 0.0,
 
 
 
7
  "bos_token_id": 50030,
8
  "dtype": "bfloat16",
9
- "eos_token_id": 50031,
10
- "head_dim": 96,
11
- "hidden_act": "silu",
12
- "hidden_size": 1152,
13
  "initializer_range": 0.02,
14
- "intermediate_size": 2880,
15
  "layer_types": [
16
  "full_attention",
17
  "full_attention",
@@ -52,25 +55,27 @@
52
  "full_attention",
53
  "full_attention",
54
  "full_attention",
 
 
55
  "full_attention"
56
  ],
57
  "max_position_embeddings": 8192,
58
- "max_window_layers": 40,
59
- "model_name": "Ba2han/qwen-test-3-longer-2",
60
  "model_type": "qwen3",
61
  "num_attention_heads": 8,
62
- "num_hidden_layers": 40,
63
- "num_key_value_heads": 8,
64
  "pad_token_id": 50034,
65
  "rms_norm_eps": 1e-06,
66
  "rope_parameters": {
67
- "rope_theta": 1000000,
68
  "rope_type": "default"
69
  },
70
  "sliding_window": null,
71
- "tie_word_embeddings": false,
72
  "transformers_version": "5.5.0",
73
- "unsloth_version": "2026.4.1",
74
  "use_cache": false,
75
  "use_sliding_window": false,
76
  "vocab_size": 50050
 
1
  {
2
  "architectures": [
3
+ "SquaredReLUQwen3ForCausalLM"
4
  ],
5
  "attention_bias": false,
6
  "attention_dropout": 0.0,
7
+ "auto_map": {
8
+ "AutoModelForCausalLM": "patch.SquaredReLUQwen3ForCausalLM"
9
+ },
10
  "bos_token_id": 50030,
11
  "dtype": "bfloat16",
12
+ "eos_token_id": 50049,
13
+ "head_dim": 128,
14
+ "hidden_act": "squared_relu",
15
+ "hidden_size": 1024,
16
  "initializer_range": 0.02,
17
+ "intermediate_size": 2816,
18
  "layer_types": [
19
  "full_attention",
20
  "full_attention",
 
55
  "full_attention",
56
  "full_attention",
57
  "full_attention",
58
+ "full_attention",
59
+ "full_attention",
60
  "full_attention"
61
  ],
62
  "max_position_embeddings": 8192,
63
+ "max_window_layers": 28,
64
+ "model_name": "Ba2han/qwen3_from_scratch",
65
  "model_type": "qwen3",
66
  "num_attention_heads": 8,
67
+ "num_hidden_layers": 42,
68
+ "num_key_value_heads": 2,
69
  "pad_token_id": 50034,
70
  "rms_norm_eps": 1e-06,
71
  "rope_parameters": {
72
+ "rope_theta": 10000.0,
73
  "rope_type": "default"
74
  },
75
  "sliding_window": null,
76
+ "tie_word_embeddings": true,
77
  "transformers_version": "5.5.0",
78
+ "unsloth_version": "2026.5.7",
79
  "use_cache": false,
80
  "use_sliding_window": false,
81
  "vocab_size": 50050
generation_config.json CHANGED
@@ -2,6 +2,7 @@
2
  "_from_model_config": true,
3
  "bos_token_id": 50030,
4
  "eos_token_id": [
 
5
  50031
6
  ],
7
  "max_length": 8192,
@@ -9,5 +10,5 @@
9
  "output_hidden_states": false,
10
  "pad_token_id": 50034,
11
  "transformers_version": "5.5.0",
12
- "use_cache": true
13
  }
 
2
  "_from_model_config": true,
3
  "bos_token_id": 50030,
4
  "eos_token_id": [
5
+ 50049,
6
  50031
7
  ],
8
  "max_length": 8192,
 
10
  "output_hidden_states": false,
11
  "pad_token_id": 50034,
12
  "transformers_version": "5.5.0",
13
+ "use_cache": false
14
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e96d39f6489966dc2411487d22641305de24d001d6d0f08631c48ce89fd00a7f
3
- size 1310260536
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c634698e8c168625d167a6eb13258edeefc99eca29d8affc045abb8526782e58
3
+ size 1049614696
tokenizer_config.json CHANGED
@@ -3,7 +3,7 @@
3
  "bos_token": "<|begin_of_text|>",
4
  "clean_up_tokenization_spaces": true,
5
  "eos_token": "<|im_end|>",
6
- "additional_special_tokens": [
7
  "<|im_start|>",
8
  "<|im_end|>"
9
  ],
@@ -15,6 +15,168 @@
15
  "model_max_length": 8192,
16
  "pad_token": "<|finetune_right_pad_id|>",
17
  "padding_side": "right",
18
- "tokenizer_class": "PreTrainedTokenizerFast",
19
- "unk_token": null
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  }
 
3
  "bos_token": "<|begin_of_text|>",
4
  "clean_up_tokenization_spaces": true,
5
  "eos_token": "<|im_end|>",
6
+ "extra_special_tokens": [
7
  "<|im_start|>",
8
  "<|im_end|>"
9
  ],
 
15
  "model_max_length": 8192,
16
  "pad_token": "<|finetune_right_pad_id|>",
17
  "padding_side": "right",
18
+ "tokenizer_class": "TokenizersBackend",
19
+ "unk_token": null,
20
+ "added_tokens_decoder": {
21
+ "50030": {
22
+ "content": "<|begin_of_text|>",
23
+ "single_word": false,
24
+ "lstrip": false,
25
+ "rstrip": false,
26
+ "normalized": false,
27
+ "special": true
28
+ },
29
+ "50031": {
30
+ "content": "<|end_of_text|>",
31
+ "single_word": false,
32
+ "lstrip": false,
33
+ "rstrip": false,
34
+ "normalized": false,
35
+ "special": true
36
+ },
37
+ "50032": {
38
+ "content": "<|reserved_special_token_0|>",
39
+ "single_word": false,
40
+ "lstrip": false,
41
+ "rstrip": false,
42
+ "normalized": false,
43
+ "special": true
44
+ },
45
+ "50033": {
46
+ "content": "<|reserved_special_token_1|>",
47
+ "single_word": false,
48
+ "lstrip": false,
49
+ "rstrip": false,
50
+ "normalized": false,
51
+ "special": true
52
+ },
53
+ "50034": {
54
+ "content": "<|finetune_right_pad_id|>",
55
+ "single_word": false,
56
+ "lstrip": false,
57
+ "rstrip": false,
58
+ "normalized": false,
59
+ "special": true
60
+ },
61
+ "50035": {
62
+ "content": "<|reserved_special_token_2|>",
63
+ "single_word": false,
64
+ "lstrip": false,
65
+ "rstrip": false,
66
+ "normalized": false,
67
+ "special": true
68
+ },
69
+ "50036": {
70
+ "content": "<|start_header_id|>",
71
+ "single_word": false,
72
+ "lstrip": false,
73
+ "rstrip": false,
74
+ "normalized": false,
75
+ "special": true
76
+ },
77
+ "50037": {
78
+ "content": "<|end_header_id|>",
79
+ "single_word": false,
80
+ "lstrip": false,
81
+ "rstrip": false,
82
+ "normalized": false,
83
+ "special": true
84
+ },
85
+ "50038": {
86
+ "content": "<|eom_id|>",
87
+ "single_word": false,
88
+ "lstrip": false,
89
+ "rstrip": false,
90
+ "normalized": false,
91
+ "special": true
92
+ },
93
+ "50039": {
94
+ "content": "<|eot_id|>",
95
+ "single_word": false,
96
+ "lstrip": false,
97
+ "rstrip": false,
98
+ "normalized": false,
99
+ "special": true
100
+ },
101
+ "50040": {
102
+ "content": "<|python_tag|>",
103
+ "single_word": false,
104
+ "lstrip": false,
105
+ "rstrip": false,
106
+ "normalized": false,
107
+ "special": true
108
+ },
109
+ "50041": {
110
+ "content": "<|reserved_special_token_3|>",
111
+ "single_word": false,
112
+ "lstrip": false,
113
+ "rstrip": false,
114
+ "normalized": false,
115
+ "special": true
116
+ },
117
+ "50042": {
118
+ "content": "<|reserved_special_token_4|>",
119
+ "single_word": false,
120
+ "lstrip": false,
121
+ "rstrip": false,
122
+ "normalized": false,
123
+ "special": true
124
+ },
125
+ "50043": {
126
+ "content": "<|reserved_special_token_5|>",
127
+ "single_word": false,
128
+ "lstrip": false,
129
+ "rstrip": false,
130
+ "normalized": false,
131
+ "special": true
132
+ },
133
+ "50044": {
134
+ "content": "<|reserved_special_token_6|>",
135
+ "single_word": false,
136
+ "lstrip": false,
137
+ "rstrip": false,
138
+ "normalized": false,
139
+ "special": true
140
+ },
141
+ "50045": {
142
+ "content": "<|reserved_special_token_7|>",
143
+ "single_word": false,
144
+ "lstrip": false,
145
+ "rstrip": false,
146
+ "normalized": false,
147
+ "special": true
148
+ },
149
+ "50046": {
150
+ "content": "<|reserved_special_token_8|>",
151
+ "single_word": false,
152
+ "lstrip": false,
153
+ "rstrip": false,
154
+ "normalized": false,
155
+ "special": true
156
+ },
157
+ "50047": {
158
+ "content": "<|reserved_special_token_9|>",
159
+ "single_word": false,
160
+ "lstrip": false,
161
+ "rstrip": false,
162
+ "normalized": false,
163
+ "special": true
164
+ },
165
+ "50048": {
166
+ "content": "<|im_start|>",
167
+ "single_word": false,
168
+ "lstrip": false,
169
+ "rstrip": false,
170
+ "normalized": false,
171
+ "special": true
172
+ },
173
+ "50049": {
174
+ "content": "<|im_end|>",
175
+ "single_word": false,
176
+ "lstrip": false,
177
+ "rstrip": false,
178
+ "normalized": false,
179
+ "special": true
180
+ }
181
+ }
182
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6ce9f89c60b15bdcc2a04b969f0b2369be2edd0951e2475d4f933d4ddce91dd1
3
  size 5713
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80a6c8d882cb703ede24f73dc39c2a799e65102964d1fc9b53392cf548d27209
3
  size 5713