Zihan Min committed on
Commit
6342c7b
·
1 Parent(s): 1e55ac0

upload 0.6+4_base fuser

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. qwen3_0.6b+qwen3_4b_base_Fuser/config.json +82 -0
  2. qwen3_0.6b+qwen3_4b_base_Fuser/final/aggregator_config.json +1 -0
  3. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_0.json +20 -0
  4. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_0.pt +3 -0
  5. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_1.json +20 -0
  6. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_1.pt +3 -0
  7. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_10.json +20 -0
  8. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_10.pt +3 -0
  9. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_11.json +20 -0
  10. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_11.pt +3 -0
  11. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_12.json +20 -0
  12. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_12.pt +3 -0
  13. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_13.json +20 -0
  14. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_13.pt +3 -0
  15. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_14.json +20 -0
  16. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_14.pt +3 -0
  17. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_15.json +20 -0
  18. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_15.pt +3 -0
  19. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_16.json +20 -0
  20. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_16.pt +3 -0
  21. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_17.json +20 -0
  22. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_17.pt +3 -0
  23. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_18.json +20 -0
  24. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_18.pt +3 -0
  25. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_19.json +20 -0
  26. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_19.pt +3 -0
  27. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_2.json +20 -0
  28. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_2.pt +3 -0
  29. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_20.json +20 -0
  30. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_20.pt +3 -0
  31. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_21.json +20 -0
  32. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_21.pt +3 -0
  33. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_22.json +20 -0
  34. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_22.pt +3 -0
  35. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_23.json +20 -0
  36. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_23.pt +3 -0
  37. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_24.json +20 -0
  38. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_24.pt +3 -0
  39. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_25.json +20 -0
  40. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_25.pt +3 -0
  41. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_26.json +20 -0
  42. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_26.pt +3 -0
  43. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_27.json +20 -0
  44. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_27.pt +3 -0
  45. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_3.json +20 -0
  46. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_3.pt +3 -0
  47. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_4.json +20 -0
  48. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_4.pt +3 -0
  49. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_5.json +20 -0
  50. qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_5.pt +3 -0
qwen3_0.6b+qwen3_4b_base_Fuser/config.json ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ base_model: "Qwen/Qwen3-0.6B"
3
+ teacher_model: "Qwen/Qwen3-4B-Base"
4
+ include_response: false
5
+ is_do_alignment: false
6
+ alignment_strategy: "first"
7
+ projector:
8
+ type: "C2CProjector"
9
+ params:
10
+ hidden_dim: 1024
11
+ intermediate_dim: 1024
12
+ num_layers: 3
13
+ dropout: 0.1
14
+ initial_temperature: 1.0
15
+ final_temperature: 0.001
16
+ anneal_steps: 1953
17
+ # projector:
18
+ # type: "AllInOneProjector"
19
+ # params:
20
+ # hidden_dim: 1024
21
+ # weight_hidden_dim: 1024
22
+ # num_layers: 3
23
+ # dropout: 0.1
24
+ # activation: "gelu"
25
+ # use_layer_norm: true
26
+ # use_residual: true
27
+ # use_swiglu: true
28
+ # use_concat: true
29
+ # gate_granularity: "scalar"
30
+ # gate_depends_on_input: false
31
+ # gate_input_features: "target_key"
32
+ # gate_init_value: 0.0
33
+ # weight_granularity: "head_merged"
34
+ # weight_depends_on_input: true
35
+ # weight_input_features: "target_projected_key"
36
+ # weight_init_value: 0.0
37
+ # use_gumbel: true
38
+ # initial_temperature: 1.0
39
+ # final_temperature: 0.001
40
+ # preserve_target_weight: false
41
+ # add_self: true
42
+ # anneal_steps: 1929
43
+ # scalar_temperature: 1.0
44
+ # max_sequence_length: 8192
45
+ mapping: "last_aligned"
46
+
47
+ training:
48
+ learning_rate: 0.0001
49
+ weight_decay: 0.01
50
+ num_epochs: 1
51
+ max_length: 2048
52
+ device: "cuda"
53
+ scheduler_type: "linear"
54
+ warmup_ratio: 0.1
55
+ max_grad_norm: 1.0
56
+ gradient_accumulation_steps: 4
57
+ per_device_train_batch_size: 8
58
+ num_processes: 8
59
+ freeze:
60
+ - "teacher"
61
+ - "base"
62
+ seed: 42
63
+
64
+ output:
65
+ output_dir: "local/checkpoints/Q3-0.6B_Q3-4B-Base_general_500k_C2C"
66
+ save_steps: 500
67
+ eval_steps: 100
68
+ wandb_config:
69
+ project: "Rosetta"
70
+ mode: "online"
71
+ entity: "nics-efc"
72
+ run_name: "Q3-0.6B_Q3-4B-Base_general_500k_C2C"
73
+
74
+ data:
75
+ type: "OpenHermesChatDataset"
76
+ # type: "DollyChatDataset"
77
+
78
+ kwargs:
79
+ split: "train"
80
+ max_word_count: 2048
81
+ num_samples: 500000
82
+ train_ratio: 0.99
qwen3_0.6b+qwen3_4b_base_Fuser/final/aggregator_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {}
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_0.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 8,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1953,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ca901bc8803e85cae986c4cd656874eda980384f8956f26692b91a00930195e
3
+ size 37815239
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_1.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 8,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1953,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_1.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c37b6a1bca77a587417a932c6b08ab422ef895c21adcaa013cf3423df29e5bf4
3
+ size 37815239
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_10.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 8,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1953,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_10.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:befd49e5176b4b913597244f97da7f709edd8c344843adb1e665db2c3cd55f33
3
+ size 37815276
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_11.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 8,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1953,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_11.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea1e072a0a736a09e0cf7a7edf36dbe7c69101a7249d8138e64b45e1a0afa354
3
+ size 37815276
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_12.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 8,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1953,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_12.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89bea162bff880010939806fa1c5ea6be7b19ef950448818d69528c649192033
3
+ size 37815276
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_13.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 8,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1953,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_13.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03b2b6856fa7d90d6b9298b584e9cdfe0b2f85da492596bde250f85c41fbb3fc
3
+ size 37815276
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_14.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 8,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1953,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_14.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7bb4bf56e219a51545a4bff5c6270555de629e1616f87b9fa0f658ccf27254a9
3
+ size 37815276
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_15.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 8,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1953,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_15.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7af755b043865aa4c27148565c9d20437363e83afa1ba007a4adcbe6b33ae77
3
+ size 37815276
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_16.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 8,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1953,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_16.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e9c310b2bc90edf68c0f0914a00897809bed4767da049b2155767a324c1325b
3
+ size 37815276
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_17.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 8,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1953,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_17.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0055c8e4aebda13f50d5bfa8f8ca0ad4badaa8079923a310c4d76b14f0037d8e
3
+ size 37815276
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_18.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 8,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1953,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_18.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61065ef9e80df17da7ab6ee2cc810c996eeef9d8fc068eeb384240d5776164fa
3
+ size 37815276
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_19.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 8,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1953,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_19.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7253a544a70f2721a6072f3a2ea931a4b2af120312d052d9ba2bd7bebb450875
3
+ size 37815276
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_2.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 8,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1953,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_2.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2fc8b03defc611cbc58da01cfb67e1a84203e0675f1e9bf4fa5ffcac34851b4b
3
+ size 37815239
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_20.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 8,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1953,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_20.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c621c50d542d368bf1f127918020afd9823bc6d39057b79e8a0ff368a46fce1
3
+ size 37815276
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_21.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 8,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1953,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_21.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87a0e4f1ad4290c336f1780dd4fdaba6f6ce85a221a3111ec1d477289735ca8e
3
+ size 37815276
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_22.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 8,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1953,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_22.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d3be267fa3472fcac11fbd8252f3a09b8bf050945ec1481f29fca096d815d95
3
+ size 37815276
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_23.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 8,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1953,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_23.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e6207c7442f68832c6678a47c508dfae574c133b3915ad959a1f764eef8d547
3
+ size 37815276
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_24.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 8,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1953,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_24.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d382d0d1b64f303279054234fae9588bcd0ef3ab324c2bde2475d610ca99b16a
3
+ size 37815276
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_25.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 8,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1953,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_25.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33161138959e0125801b74b5be397029733729938edf4715760cdde8e729af0d
3
+ size 37815276
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_26.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 8,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1953,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_26.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3377bdc808175f9b3d239927424481e98039d445d5622ae0281ac7a5e7734647
3
+ size 37815276
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_27.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 8,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1953,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_27.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ac9a6e50aa6886c7cee791b5ce6d2c3f7c806da0a88c61bf0678a34d0e9e348
3
+ size 37815276
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_3.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 8,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1953,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_3.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02b1bb740e0983f75d32c680f742d96722d3c74ad3075847957647984bfe7071
3
+ size 37815239
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_4.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 8,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1953,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_4.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a59d32df77c5c6b88388edd833866284f8e53b557cf2c6e4cbf15db6f7909c6
3
+ size 37815239
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_5.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 8,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1953,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen3_4b_base_Fuser/final/projector_5.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e2a169095e309e749a22f1ebcd9dae026577b5d6ffd8a4b5685a9a3a500c04d
3
+ size 37815239