Zihan Min
committed on
Commit
·
909c76d
1
Parent(s):
156581e
upload new general-trained fusers
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/config.json +57 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/aggregator_config.json +1 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_0.json +20 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_0.pt +3 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_1.json +20 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_1.pt +3 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_10.json +20 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_10.pt +3 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_11.json +20 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_11.pt +3 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_12.json +20 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_12.pt +3 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_13.json +20 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_13.pt +3 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_14.json +20 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_14.pt +3 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_15.json +20 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_15.pt +3 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_16.json +20 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_16.pt +3 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_17.json +20 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_17.pt +3 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_18.json +20 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_18.pt +3 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_19.json +20 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_19.pt +3 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_2.json +20 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_2.pt +3 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_20.json +20 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_20.pt +3 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_21.json +20 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_21.pt +3 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_22.json +20 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_22.pt +3 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_23.json +20 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_23.pt +3 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_24.json +20 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_24.pt +3 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_25.json +20 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_25.pt +3 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_26.json +20 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_26.pt +3 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_27.json +20 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_27.pt +3 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_3.json +20 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_3.pt +3 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_4.json +20 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_4.pt +3 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_5.json +20 -0
- qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_5.pt +3 -0
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/config.json
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": {
|
| 3 |
+
"base_model": "Qwen/Qwen3-0.6B",
|
| 4 |
+
"teacher_model": "Qwen/Qwen2.5-Math-1.5B-Instruct",
|
| 5 |
+
"include_response": false,
|
| 6 |
+
"is_do_alignment": false,
|
| 7 |
+
"alignment_strategy": "first",
|
| 8 |
+
"projector": {
|
| 9 |
+
"type": "C2CProjector",
|
| 10 |
+
"params": {
|
| 11 |
+
"hidden_dim": 1024,
|
| 12 |
+
"intermediate_dim": 1024,
|
| 13 |
+
"num_layers": 3,
|
| 14 |
+
"dropout": 0.1,
|
| 15 |
+
"initial_temperature": 1.0,
|
| 16 |
+
"final_temperature": 0.001,
|
| 17 |
+
"anneal_steps": 1929
|
| 18 |
+
}
|
| 19 |
+
},
|
| 20 |
+
"mapping": "last_aligned"
|
| 21 |
+
},
|
| 22 |
+
"training": {
|
| 23 |
+
"learning_rate": 1e-4,
|
| 24 |
+
"weight_decay": 0.01,
|
| 25 |
+
"num_epochs": 1,
|
| 26 |
+
"max_length": 2048,
|
| 27 |
+
"device": "cuda",
|
| 28 |
+
"scheduler_type": "linear",
|
| 29 |
+
"warmup_ratio": 0.1,
|
| 30 |
+
"max_grad_norm": 1.0,
|
| 31 |
+
"gradient_accumulation_steps": 8,
|
| 32 |
+
"per_device_train_batch_size": 4,
|
| 33 |
+
"num_processes": 8,
|
| 34 |
+
"freeze": ["teacher","base"],
|
| 35 |
+
"seed": 42
|
| 36 |
+
},
|
| 37 |
+
"output": {
|
| 38 |
+
"output_dir": "local/checkpoints/0.6+math_C2C_general",
|
| 39 |
+
"save_steps": 500,
|
| 40 |
+
"eval_steps": 100,
|
| 41 |
+
"wandb_config": {
|
| 42 |
+
"project": "Rosetta",
|
| 43 |
+
"mode": "online",
|
| 44 |
+
"entity": "nics-efc",
|
| 45 |
+
"run_name": "0.6B+math_C2C_general"
|
| 46 |
+
}
|
| 47 |
+
},
|
| 48 |
+
"data": {
|
| 49 |
+
"type": "OpenHermesChatDataset",
|
| 50 |
+
"kwargs": {
|
| 51 |
+
"split": "train",
|
| 52 |
+
"max_word_count": 2048,
|
| 53 |
+
"num_samples": 500000
|
| 54 |
+
},
|
| 55 |
+
"train_ratio": 0.99
|
| 56 |
+
}
|
| 57 |
+
}
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/aggregator_config.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{}
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_0.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"class": "C2CProjector",
|
| 3 |
+
"init_args": {
|
| 4 |
+
"source_dim": 128,
|
| 5 |
+
"target_dim": 128,
|
| 6 |
+
"source_num_heads": 2,
|
| 7 |
+
"target_num_heads": 8,
|
| 8 |
+
"hidden_dim": 1024,
|
| 9 |
+
"intermediate_dim": 1024,
|
| 10 |
+
"num_layers": 3,
|
| 11 |
+
"dropout": 0.1,
|
| 12 |
+
"initial_temperature": 1.0,
|
| 13 |
+
"final_temperature": 0.001,
|
| 14 |
+
"anneal_steps": 1929,
|
| 15 |
+
"dtype": {
|
| 16 |
+
"__type__": "torch.dtype",
|
| 17 |
+
"value": "bfloat16"
|
| 18 |
+
}
|
| 19 |
+
}
|
| 20 |
+
}
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_0.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2a04cc6aaf0951e2d1c02cc65d35b227c80c1dc0b3b48fcfb1223fbe4b605d93
|
| 3 |
+
size 34669511
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_1.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"class": "C2CProjector",
|
| 3 |
+
"init_args": {
|
| 4 |
+
"source_dim": 128,
|
| 5 |
+
"target_dim": 128,
|
| 6 |
+
"source_num_heads": 2,
|
| 7 |
+
"target_num_heads": 8,
|
| 8 |
+
"hidden_dim": 1024,
|
| 9 |
+
"intermediate_dim": 1024,
|
| 10 |
+
"num_layers": 3,
|
| 11 |
+
"dropout": 0.1,
|
| 12 |
+
"initial_temperature": 1.0,
|
| 13 |
+
"final_temperature": 0.001,
|
| 14 |
+
"anneal_steps": 1929,
|
| 15 |
+
"dtype": {
|
| 16 |
+
"__type__": "torch.dtype",
|
| 17 |
+
"value": "bfloat16"
|
| 18 |
+
}
|
| 19 |
+
}
|
| 20 |
+
}
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_1.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:384693a79009a82e1814e962e16a2198f8870f1fea6632d17980ecd13948377f
|
| 3 |
+
size 34669511
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_10.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"class": "C2CProjector",
|
| 3 |
+
"init_args": {
|
| 4 |
+
"source_dim": 128,
|
| 5 |
+
"target_dim": 128,
|
| 6 |
+
"source_num_heads": 2,
|
| 7 |
+
"target_num_heads": 8,
|
| 8 |
+
"hidden_dim": 1024,
|
| 9 |
+
"intermediate_dim": 1024,
|
| 10 |
+
"num_layers": 3,
|
| 11 |
+
"dropout": 0.1,
|
| 12 |
+
"initial_temperature": 1.0,
|
| 13 |
+
"final_temperature": 0.001,
|
| 14 |
+
"anneal_steps": 1929,
|
| 15 |
+
"dtype": {
|
| 16 |
+
"__type__": "torch.dtype",
|
| 17 |
+
"value": "bfloat16"
|
| 18 |
+
}
|
| 19 |
+
}
|
| 20 |
+
}
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_10.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4719d57089eb68d187c79cd430a2cb3facf7f558e6026515eb7f551615e47c43
|
| 3 |
+
size 34669548
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_11.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"class": "C2CProjector",
|
| 3 |
+
"init_args": {
|
| 4 |
+
"source_dim": 128,
|
| 5 |
+
"target_dim": 128,
|
| 6 |
+
"source_num_heads": 2,
|
| 7 |
+
"target_num_heads": 8,
|
| 8 |
+
"hidden_dim": 1024,
|
| 9 |
+
"intermediate_dim": 1024,
|
| 10 |
+
"num_layers": 3,
|
| 11 |
+
"dropout": 0.1,
|
| 12 |
+
"initial_temperature": 1.0,
|
| 13 |
+
"final_temperature": 0.001,
|
| 14 |
+
"anneal_steps": 1929,
|
| 15 |
+
"dtype": {
|
| 16 |
+
"__type__": "torch.dtype",
|
| 17 |
+
"value": "bfloat16"
|
| 18 |
+
}
|
| 19 |
+
}
|
| 20 |
+
}
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_11.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e24b13c4e0913a15a95124b9cf038eec4817d12fa41942076c729504890d894c
|
| 3 |
+
size 34669548
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_12.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"class": "C2CProjector",
|
| 3 |
+
"init_args": {
|
| 4 |
+
"source_dim": 128,
|
| 5 |
+
"target_dim": 128,
|
| 6 |
+
"source_num_heads": 2,
|
| 7 |
+
"target_num_heads": 8,
|
| 8 |
+
"hidden_dim": 1024,
|
| 9 |
+
"intermediate_dim": 1024,
|
| 10 |
+
"num_layers": 3,
|
| 11 |
+
"dropout": 0.1,
|
| 12 |
+
"initial_temperature": 1.0,
|
| 13 |
+
"final_temperature": 0.001,
|
| 14 |
+
"anneal_steps": 1929,
|
| 15 |
+
"dtype": {
|
| 16 |
+
"__type__": "torch.dtype",
|
| 17 |
+
"value": "bfloat16"
|
| 18 |
+
}
|
| 19 |
+
}
|
| 20 |
+
}
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_12.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fab1ff921640073009591dd1cfdb9123e6b9972ce149cc8c4ea3a6cf78170e70
|
| 3 |
+
size 34669548
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_13.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"class": "C2CProjector",
|
| 3 |
+
"init_args": {
|
| 4 |
+
"source_dim": 128,
|
| 5 |
+
"target_dim": 128,
|
| 6 |
+
"source_num_heads": 2,
|
| 7 |
+
"target_num_heads": 8,
|
| 8 |
+
"hidden_dim": 1024,
|
| 9 |
+
"intermediate_dim": 1024,
|
| 10 |
+
"num_layers": 3,
|
| 11 |
+
"dropout": 0.1,
|
| 12 |
+
"initial_temperature": 1.0,
|
| 13 |
+
"final_temperature": 0.001,
|
| 14 |
+
"anneal_steps": 1929,
|
| 15 |
+
"dtype": {
|
| 16 |
+
"__type__": "torch.dtype",
|
| 17 |
+
"value": "bfloat16"
|
| 18 |
+
}
|
| 19 |
+
}
|
| 20 |
+
}
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_13.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6740ef5c728556c99f3c80fd6be80ea4663d5933c09c18ef73750425a112f3e6
|
| 3 |
+
size 34669548
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_14.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"class": "C2CProjector",
|
| 3 |
+
"init_args": {
|
| 4 |
+
"source_dim": 128,
|
| 5 |
+
"target_dim": 128,
|
| 6 |
+
"source_num_heads": 2,
|
| 7 |
+
"target_num_heads": 8,
|
| 8 |
+
"hidden_dim": 1024,
|
| 9 |
+
"intermediate_dim": 1024,
|
| 10 |
+
"num_layers": 3,
|
| 11 |
+
"dropout": 0.1,
|
| 12 |
+
"initial_temperature": 1.0,
|
| 13 |
+
"final_temperature": 0.001,
|
| 14 |
+
"anneal_steps": 1929,
|
| 15 |
+
"dtype": {
|
| 16 |
+
"__type__": "torch.dtype",
|
| 17 |
+
"value": "bfloat16"
|
| 18 |
+
}
|
| 19 |
+
}
|
| 20 |
+
}
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_14.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9b12043f37bbc5545b2f53d7cc230b346f4577754dad4bfd3d9094debd967ae1
|
| 3 |
+
size 34669548
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_15.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"class": "C2CProjector",
|
| 3 |
+
"init_args": {
|
| 4 |
+
"source_dim": 128,
|
| 5 |
+
"target_dim": 128,
|
| 6 |
+
"source_num_heads": 2,
|
| 7 |
+
"target_num_heads": 8,
|
| 8 |
+
"hidden_dim": 1024,
|
| 9 |
+
"intermediate_dim": 1024,
|
| 10 |
+
"num_layers": 3,
|
| 11 |
+
"dropout": 0.1,
|
| 12 |
+
"initial_temperature": 1.0,
|
| 13 |
+
"final_temperature": 0.001,
|
| 14 |
+
"anneal_steps": 1929,
|
| 15 |
+
"dtype": {
|
| 16 |
+
"__type__": "torch.dtype",
|
| 17 |
+
"value": "bfloat16"
|
| 18 |
+
}
|
| 19 |
+
}
|
| 20 |
+
}
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_15.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:36e7cbc661af2b845daef58381f9e492fd01360143a43e6f1cd834d59c3fa6a6
|
| 3 |
+
size 34669548
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_16.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"class": "C2CProjector",
|
| 3 |
+
"init_args": {
|
| 4 |
+
"source_dim": 128,
|
| 5 |
+
"target_dim": 128,
|
| 6 |
+
"source_num_heads": 2,
|
| 7 |
+
"target_num_heads": 8,
|
| 8 |
+
"hidden_dim": 1024,
|
| 9 |
+
"intermediate_dim": 1024,
|
| 10 |
+
"num_layers": 3,
|
| 11 |
+
"dropout": 0.1,
|
| 12 |
+
"initial_temperature": 1.0,
|
| 13 |
+
"final_temperature": 0.001,
|
| 14 |
+
"anneal_steps": 1929,
|
| 15 |
+
"dtype": {
|
| 16 |
+
"__type__": "torch.dtype",
|
| 17 |
+
"value": "bfloat16"
|
| 18 |
+
}
|
| 19 |
+
}
|
| 20 |
+
}
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_16.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:afdacd832765cff3b1da2c2bb5260ea129f96c10178be893fa9e21522c91e6ef
|
| 3 |
+
size 34669548
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_17.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"class": "C2CProjector",
|
| 3 |
+
"init_args": {
|
| 4 |
+
"source_dim": 128,
|
| 5 |
+
"target_dim": 128,
|
| 6 |
+
"source_num_heads": 2,
|
| 7 |
+
"target_num_heads": 8,
|
| 8 |
+
"hidden_dim": 1024,
|
| 9 |
+
"intermediate_dim": 1024,
|
| 10 |
+
"num_layers": 3,
|
| 11 |
+
"dropout": 0.1,
|
| 12 |
+
"initial_temperature": 1.0,
|
| 13 |
+
"final_temperature": 0.001,
|
| 14 |
+
"anneal_steps": 1929,
|
| 15 |
+
"dtype": {
|
| 16 |
+
"__type__": "torch.dtype",
|
| 17 |
+
"value": "bfloat16"
|
| 18 |
+
}
|
| 19 |
+
}
|
| 20 |
+
}
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_17.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0b703b1a7942f052388e4586cc1924b1d46234fd7c7d949ce2396df5b880f35d
|
| 3 |
+
size 34669548
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_18.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"class": "C2CProjector",
|
| 3 |
+
"init_args": {
|
| 4 |
+
"source_dim": 128,
|
| 5 |
+
"target_dim": 128,
|
| 6 |
+
"source_num_heads": 2,
|
| 7 |
+
"target_num_heads": 8,
|
| 8 |
+
"hidden_dim": 1024,
|
| 9 |
+
"intermediate_dim": 1024,
|
| 10 |
+
"num_layers": 3,
|
| 11 |
+
"dropout": 0.1,
|
| 12 |
+
"initial_temperature": 1.0,
|
| 13 |
+
"final_temperature": 0.001,
|
| 14 |
+
"anneal_steps": 1929,
|
| 15 |
+
"dtype": {
|
| 16 |
+
"__type__": "torch.dtype",
|
| 17 |
+
"value": "bfloat16"
|
| 18 |
+
}
|
| 19 |
+
}
|
| 20 |
+
}
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_18.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:342950f4c5922187897fce4d377291d9c94fc56219bd780d34f35066e8195faa
|
| 3 |
+
size 34669548
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_19.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"class": "C2CProjector",
|
| 3 |
+
"init_args": {
|
| 4 |
+
"source_dim": 128,
|
| 5 |
+
"target_dim": 128,
|
| 6 |
+
"source_num_heads": 2,
|
| 7 |
+
"target_num_heads": 8,
|
| 8 |
+
"hidden_dim": 1024,
|
| 9 |
+
"intermediate_dim": 1024,
|
| 10 |
+
"num_layers": 3,
|
| 11 |
+
"dropout": 0.1,
|
| 12 |
+
"initial_temperature": 1.0,
|
| 13 |
+
"final_temperature": 0.001,
|
| 14 |
+
"anneal_steps": 1929,
|
| 15 |
+
"dtype": {
|
| 16 |
+
"__type__": "torch.dtype",
|
| 17 |
+
"value": "bfloat16"
|
| 18 |
+
}
|
| 19 |
+
}
|
| 20 |
+
}
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_19.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c19ffb1469f350ab999f49292c953cb46d2b92198de2e57f6c5422366b89246c
|
| 3 |
+
size 34669548
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_2.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"class": "C2CProjector",
|
| 3 |
+
"init_args": {
|
| 4 |
+
"source_dim": 128,
|
| 5 |
+
"target_dim": 128,
|
| 6 |
+
"source_num_heads": 2,
|
| 7 |
+
"target_num_heads": 8,
|
| 8 |
+
"hidden_dim": 1024,
|
| 9 |
+
"intermediate_dim": 1024,
|
| 10 |
+
"num_layers": 3,
|
| 11 |
+
"dropout": 0.1,
|
| 12 |
+
"initial_temperature": 1.0,
|
| 13 |
+
"final_temperature": 0.001,
|
| 14 |
+
"anneal_steps": 1929,
|
| 15 |
+
"dtype": {
|
| 16 |
+
"__type__": "torch.dtype",
|
| 17 |
+
"value": "bfloat16"
|
| 18 |
+
}
|
| 19 |
+
}
|
| 20 |
+
}
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_2.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8687fac1be13e65e0ab9216e84dd001f80943be4be6fee376c3922a3c38b1191
|
| 3 |
+
size 34669511
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_20.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"class": "C2CProjector",
|
| 3 |
+
"init_args": {
|
| 4 |
+
"source_dim": 128,
|
| 5 |
+
"target_dim": 128,
|
| 6 |
+
"source_num_heads": 2,
|
| 7 |
+
"target_num_heads": 8,
|
| 8 |
+
"hidden_dim": 1024,
|
| 9 |
+
"intermediate_dim": 1024,
|
| 10 |
+
"num_layers": 3,
|
| 11 |
+
"dropout": 0.1,
|
| 12 |
+
"initial_temperature": 1.0,
|
| 13 |
+
"final_temperature": 0.001,
|
| 14 |
+
"anneal_steps": 1929,
|
| 15 |
+
"dtype": {
|
| 16 |
+
"__type__": "torch.dtype",
|
| 17 |
+
"value": "bfloat16"
|
| 18 |
+
}
|
| 19 |
+
}
|
| 20 |
+
}
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_20.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7148f304c760d0eca1cec0f07567e927fb1764a21985effe60981889a2c7772b
|
| 3 |
+
size 34669548
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_21.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"class": "C2CProjector",
|
| 3 |
+
"init_args": {
|
| 4 |
+
"source_dim": 128,
|
| 5 |
+
"target_dim": 128,
|
| 6 |
+
"source_num_heads": 2,
|
| 7 |
+
"target_num_heads": 8,
|
| 8 |
+
"hidden_dim": 1024,
|
| 9 |
+
"intermediate_dim": 1024,
|
| 10 |
+
"num_layers": 3,
|
| 11 |
+
"dropout": 0.1,
|
| 12 |
+
"initial_temperature": 1.0,
|
| 13 |
+
"final_temperature": 0.001,
|
| 14 |
+
"anneal_steps": 1929,
|
| 15 |
+
"dtype": {
|
| 16 |
+
"__type__": "torch.dtype",
|
| 17 |
+
"value": "bfloat16"
|
| 18 |
+
}
|
| 19 |
+
}
|
| 20 |
+
}
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_21.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a8239af67a58293570dc54b74e1ad48c0e3e2192593ea0c06f156fd93754a8eb
|
| 3 |
+
size 34669548
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_22.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"class": "C2CProjector",
|
| 3 |
+
"init_args": {
|
| 4 |
+
"source_dim": 128,
|
| 5 |
+
"target_dim": 128,
|
| 6 |
+
"source_num_heads": 2,
|
| 7 |
+
"target_num_heads": 8,
|
| 8 |
+
"hidden_dim": 1024,
|
| 9 |
+
"intermediate_dim": 1024,
|
| 10 |
+
"num_layers": 3,
|
| 11 |
+
"dropout": 0.1,
|
| 12 |
+
"initial_temperature": 1.0,
|
| 13 |
+
"final_temperature": 0.001,
|
| 14 |
+
"anneal_steps": 1929,
|
| 15 |
+
"dtype": {
|
| 16 |
+
"__type__": "torch.dtype",
|
| 17 |
+
"value": "bfloat16"
|
| 18 |
+
}
|
| 19 |
+
}
|
| 20 |
+
}
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_22.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6145d5a15e79876c5274976040eb99647a029ec964fb00fcf50ff582b5aeacd1
|
| 3 |
+
size 34669548
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_23.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"class": "C2CProjector",
|
| 3 |
+
"init_args": {
|
| 4 |
+
"source_dim": 128,
|
| 5 |
+
"target_dim": 128,
|
| 6 |
+
"source_num_heads": 2,
|
| 7 |
+
"target_num_heads": 8,
|
| 8 |
+
"hidden_dim": 1024,
|
| 9 |
+
"intermediate_dim": 1024,
|
| 10 |
+
"num_layers": 3,
|
| 11 |
+
"dropout": 0.1,
|
| 12 |
+
"initial_temperature": 1.0,
|
| 13 |
+
"final_temperature": 0.001,
|
| 14 |
+
"anneal_steps": 1929,
|
| 15 |
+
"dtype": {
|
| 16 |
+
"__type__": "torch.dtype",
|
| 17 |
+
"value": "bfloat16"
|
| 18 |
+
}
|
| 19 |
+
}
|
| 20 |
+
}
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_23.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:38ed2217c4cdd98b4d21cd6abdba1e2f6e6f643c50e389eef51db64cf901838d
|
| 3 |
+
size 34669548
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_24.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"class": "C2CProjector",
|
| 3 |
+
"init_args": {
|
| 4 |
+
"source_dim": 128,
|
| 5 |
+
"target_dim": 128,
|
| 6 |
+
"source_num_heads": 2,
|
| 7 |
+
"target_num_heads": 8,
|
| 8 |
+
"hidden_dim": 1024,
|
| 9 |
+
"intermediate_dim": 1024,
|
| 10 |
+
"num_layers": 3,
|
| 11 |
+
"dropout": 0.1,
|
| 12 |
+
"initial_temperature": 1.0,
|
| 13 |
+
"final_temperature": 0.001,
|
| 14 |
+
"anneal_steps": 1929,
|
| 15 |
+
"dtype": {
|
| 16 |
+
"__type__": "torch.dtype",
|
| 17 |
+
"value": "bfloat16"
|
| 18 |
+
}
|
| 19 |
+
}
|
| 20 |
+
}
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_24.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0708dc71f5bb708316d71eafab75ba41bb4c0b74e4b269e5e3d04a7bd1531d78
|
| 3 |
+
size 34669548
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_25.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"class": "C2CProjector",
|
| 3 |
+
"init_args": {
|
| 4 |
+
"source_dim": 128,
|
| 5 |
+
"target_dim": 128,
|
| 6 |
+
"source_num_heads": 2,
|
| 7 |
+
"target_num_heads": 8,
|
| 8 |
+
"hidden_dim": 1024,
|
| 9 |
+
"intermediate_dim": 1024,
|
| 10 |
+
"num_layers": 3,
|
| 11 |
+
"dropout": 0.1,
|
| 12 |
+
"initial_temperature": 1.0,
|
| 13 |
+
"final_temperature": 0.001,
|
| 14 |
+
"anneal_steps": 1929,
|
| 15 |
+
"dtype": {
|
| 16 |
+
"__type__": "torch.dtype",
|
| 17 |
+
"value": "bfloat16"
|
| 18 |
+
}
|
| 19 |
+
}
|
| 20 |
+
}
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_25.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e79a40e95aa61413085b4d92413a4e12eadc8e6c9b1d1302748e6cb9c189858
|
| 3 |
+
size 34669548
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_26.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"class": "C2CProjector",
|
| 3 |
+
"init_args": {
|
| 4 |
+
"source_dim": 128,
|
| 5 |
+
"target_dim": 128,
|
| 6 |
+
"source_num_heads": 2,
|
| 7 |
+
"target_num_heads": 8,
|
| 8 |
+
"hidden_dim": 1024,
|
| 9 |
+
"intermediate_dim": 1024,
|
| 10 |
+
"num_layers": 3,
|
| 11 |
+
"dropout": 0.1,
|
| 12 |
+
"initial_temperature": 1.0,
|
| 13 |
+
"final_temperature": 0.001,
|
| 14 |
+
"anneal_steps": 1929,
|
| 15 |
+
"dtype": {
|
| 16 |
+
"__type__": "torch.dtype",
|
| 17 |
+
"value": "bfloat16"
|
| 18 |
+
}
|
| 19 |
+
}
|
| 20 |
+
}
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_26.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6b99606b6ce97b5796b88984f81b8da0549bbf7495c34246e55031c83679ac22
|
| 3 |
+
size 34669548
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_27.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"class": "C2CProjector",
|
| 3 |
+
"init_args": {
|
| 4 |
+
"source_dim": 128,
|
| 5 |
+
"target_dim": 128,
|
| 6 |
+
"source_num_heads": 2,
|
| 7 |
+
"target_num_heads": 8,
|
| 8 |
+
"hidden_dim": 1024,
|
| 9 |
+
"intermediate_dim": 1024,
|
| 10 |
+
"num_layers": 3,
|
| 11 |
+
"dropout": 0.1,
|
| 12 |
+
"initial_temperature": 1.0,
|
| 13 |
+
"final_temperature": 0.001,
|
| 14 |
+
"anneal_steps": 1929,
|
| 15 |
+
"dtype": {
|
| 16 |
+
"__type__": "torch.dtype",
|
| 17 |
+
"value": "bfloat16"
|
| 18 |
+
}
|
| 19 |
+
}
|
| 20 |
+
}
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_27.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:471d909fa34e1c955486bd001feef8b31db49b742a30a88192856da312e06932
|
| 3 |
+
size 34669548
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_3.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"class": "C2CProjector",
|
| 3 |
+
"init_args": {
|
| 4 |
+
"source_dim": 128,
|
| 5 |
+
"target_dim": 128,
|
| 6 |
+
"source_num_heads": 2,
|
| 7 |
+
"target_num_heads": 8,
|
| 8 |
+
"hidden_dim": 1024,
|
| 9 |
+
"intermediate_dim": 1024,
|
| 10 |
+
"num_layers": 3,
|
| 11 |
+
"dropout": 0.1,
|
| 12 |
+
"initial_temperature": 1.0,
|
| 13 |
+
"final_temperature": 0.001,
|
| 14 |
+
"anneal_steps": 1929,
|
| 15 |
+
"dtype": {
|
| 16 |
+
"__type__": "torch.dtype",
|
| 17 |
+
"value": "bfloat16"
|
| 18 |
+
}
|
| 19 |
+
}
|
| 20 |
+
}
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_3.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:abe9e0e3b93edca1b8c75c8b292c595f7a7d510657119f9282a9e7bd4241dabf
|
| 3 |
+
size 34669511
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_4.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"class": "C2CProjector",
|
| 3 |
+
"init_args": {
|
| 4 |
+
"source_dim": 128,
|
| 5 |
+
"target_dim": 128,
|
| 6 |
+
"source_num_heads": 2,
|
| 7 |
+
"target_num_heads": 8,
|
| 8 |
+
"hidden_dim": 1024,
|
| 9 |
+
"intermediate_dim": 1024,
|
| 10 |
+
"num_layers": 3,
|
| 11 |
+
"dropout": 0.1,
|
| 12 |
+
"initial_temperature": 1.0,
|
| 13 |
+
"final_temperature": 0.001,
|
| 14 |
+
"anneal_steps": 1929,
|
| 15 |
+
"dtype": {
|
| 16 |
+
"__type__": "torch.dtype",
|
| 17 |
+
"value": "bfloat16"
|
| 18 |
+
}
|
| 19 |
+
}
|
| 20 |
+
}
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_4.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:db5c8c5eaafbe17345f95b27ca0a2d0951fdf0b4eed7d33442f3f492ce505a03
|
| 3 |
+
size 34669511
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_5.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"class": "C2CProjector",
|
| 3 |
+
"init_args": {
|
| 4 |
+
"source_dim": 128,
|
| 5 |
+
"target_dim": 128,
|
| 6 |
+
"source_num_heads": 2,
|
| 7 |
+
"target_num_heads": 8,
|
| 8 |
+
"hidden_dim": 1024,
|
| 9 |
+
"intermediate_dim": 1024,
|
| 10 |
+
"num_layers": 3,
|
| 11 |
+
"dropout": 0.1,
|
| 12 |
+
"initial_temperature": 1.0,
|
| 13 |
+
"final_temperature": 0.001,
|
| 14 |
+
"anneal_steps": 1929,
|
| 15 |
+
"dtype": {
|
| 16 |
+
"__type__": "torch.dtype",
|
| 17 |
+
"value": "bfloat16"
|
| 18 |
+
}
|
| 19 |
+
}
|
| 20 |
+
}
|
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_5.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fa95e7be8d887ec04db835bcab9c7b8b5cae3c4fc58f7b71e8f3b25d94619bc9
|
| 3 |
+
size 34669511
|