mazesmazes committed on
Commit
134fc67
·
verified ·
1 Parent(s): ce3a6b4

Training in progress, step 1000

Browse files
Files changed (3) hide show
  1. config.json +7 -7
  2. model.safetensors +2 -2
  3. training_args.bin +2 -2
config.json CHANGED
@@ -159,7 +159,7 @@
159
  "inference_warmup_tokens": 10,
160
  "label_smoothing": 0.0,
161
  "length_penalty": 1.0,
162
- "llm_dim": 2048,
163
  "mask_feature_length": 27,
164
  "mask_feature_min_masks": 2,
165
  "mask_feature_prob": 0.0,
@@ -179,8 +179,8 @@
179
  "projector_hidden_dim": null,
180
  "projector_init_std": 0.02,
181
  "projector_num_layers": 2,
182
- "projector_pool_stride": 6,
183
- "projector_type": "mosa",
184
  "qformer_hidden_size": null,
185
  "qformer_intermediate_size": null,
186
  "qformer_num_heads": 16,
@@ -190,7 +190,7 @@
190
  "router_aux_loss_coef": 0.01,
191
  "system_prompt": "/no_think /system_override",
192
  "text_config": {
193
- "_name_or_path": "Qwen/Qwen3-1.7B",
194
  "architectures": [
195
  "Qwen3ForCausalLM"
196
  ],
@@ -200,9 +200,9 @@
200
  "eos_token_id": 151645,
201
  "head_dim": 128,
202
  "hidden_act": "silu",
203
- "hidden_size": 2048,
204
  "initializer_range": 0.02,
205
- "intermediate_size": 6144,
206
  "layer_types": [
207
  "full_attention",
208
  "full_attention",
@@ -251,7 +251,7 @@
251
  "use_sliding_window": false,
252
  "vocab_size": 151670
253
  },
254
- "text_model_id": "Qwen/Qwen3-1.7B",
255
  "transformers_version": "5.0.0.dev0",
256
  "use_cache": false,
257
  "use_specaugment": true,
 
159
  "inference_warmup_tokens": 10,
160
  "label_smoothing": 0.0,
161
  "length_penalty": 1.0,
162
+ "llm_dim": 1024,
163
  "mask_feature_length": 27,
164
  "mask_feature_min_masks": 2,
165
  "mask_feature_prob": 0.0,
 
179
  "projector_hidden_dim": null,
180
  "projector_init_std": 0.02,
181
  "projector_num_layers": 2,
182
+ "projector_pool_stride": 4,
183
+ "projector_type": "linear",
184
  "qformer_hidden_size": null,
185
  "qformer_intermediate_size": null,
186
  "qformer_num_heads": 16,
 
190
  "router_aux_loss_coef": 0.01,
191
  "system_prompt": "/no_think /system_override",
192
  "text_config": {
193
+ "_name_or_path": "Qwen/Qwen3-0.6B",
194
  "architectures": [
195
  "Qwen3ForCausalLM"
196
  ],
 
200
  "eos_token_id": 151645,
201
  "head_dim": 128,
202
  "hidden_act": "silu",
203
+ "hidden_size": 1024,
204
  "initializer_range": 0.02,
205
+ "intermediate_size": 3072,
206
  "layer_types": [
207
  "full_attention",
208
  "full_attention",
 
251
  "use_sliding_window": false,
252
  "vocab_size": 151670
253
  },
254
+ "text_model_id": "Qwen/Qwen3-0.6B",
255
  "transformers_version": "5.0.0.dev0",
256
  "use_cache": false,
257
  "use_specaugment": true,
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fd8ccd85735b88efd3c380c00b69fcea27fb1c9f6ef9c48245fd066adab9bd68
3
- size 320134160
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a82f0e82a684298cc550fefb3a4968fa974cf79a2568d441bcf3e4b350a76f23
3
+ size 2623704
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b3345ff693dedd9814381c2dd87209e469396be74d8912ab3f03d6661ba4469b
3
- size 5201
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1207623a291f4a86f41326955931e7f95611cf08e5ed2ba9c8eeebca5dc00b18
3
+ size 5265