Zihan Min committed on
Commit
909c76d
·
1 Parent(s): 156581e

upload new general-trained fusers

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/config.json +57 -0
  2. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/aggregator_config.json +1 -0
  3. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_0.json +20 -0
  4. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_0.pt +3 -0
  5. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_1.json +20 -0
  6. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_1.pt +3 -0
  7. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_10.json +20 -0
  8. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_10.pt +3 -0
  9. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_11.json +20 -0
  10. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_11.pt +3 -0
  11. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_12.json +20 -0
  12. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_12.pt +3 -0
  13. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_13.json +20 -0
  14. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_13.pt +3 -0
  15. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_14.json +20 -0
  16. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_14.pt +3 -0
  17. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_15.json +20 -0
  18. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_15.pt +3 -0
  19. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_16.json +20 -0
  20. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_16.pt +3 -0
  21. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_17.json +20 -0
  22. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_17.pt +3 -0
  23. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_18.json +20 -0
  24. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_18.pt +3 -0
  25. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_19.json +20 -0
  26. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_19.pt +3 -0
  27. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_2.json +20 -0
  28. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_2.pt +3 -0
  29. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_20.json +20 -0
  30. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_20.pt +3 -0
  31. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_21.json +20 -0
  32. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_21.pt +3 -0
  33. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_22.json +20 -0
  34. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_22.pt +3 -0
  35. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_23.json +20 -0
  36. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_23.pt +3 -0
  37. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_24.json +20 -0
  38. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_24.pt +3 -0
  39. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_25.json +20 -0
  40. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_25.pt +3 -0
  41. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_26.json +20 -0
  42. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_26.pt +3 -0
  43. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_27.json +20 -0
  44. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_27.pt +3 -0
  45. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_3.json +20 -0
  46. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_3.pt +3 -0
  47. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_4.json +20 -0
  48. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_4.pt +3 -0
  49. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_5.json +20 -0
  50. qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_5.pt +3 -0
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model": {
3
+ "base_model": "Qwen/Qwen3-0.6B",
4
+ "teacher_model": "Qwen/Qwen2.5-Math-1.5B-Instruct",
5
+ "include_response": false,
6
+ "is_do_alignment": false,
7
+ "alignment_strategy": "first",
8
+ "projector": {
9
+ "type": "C2CProjector",
10
+ "params": {
11
+ "hidden_dim": 1024,
12
+ "intermediate_dim": 1024,
13
+ "num_layers": 3,
14
+ "dropout": 0.1,
15
+ "initial_temperature": 1.0,
16
+ "final_temperature": 0.001,
17
+ "anneal_steps": 1929
18
+ }
19
+ },
20
+ "mapping": "last_aligned"
21
+ },
22
+ "training": {
23
+ "learning_rate": 1e-4,
24
+ "weight_decay": 0.01,
25
+ "num_epochs": 1,
26
+ "max_length": 2048,
27
+ "device": "cuda",
28
+ "scheduler_type": "linear",
29
+ "warmup_ratio": 0.1,
30
+ "max_grad_norm": 1.0,
31
+ "gradient_accumulation_steps": 8,
32
+ "per_device_train_batch_size": 4,
33
+ "num_processes": 8,
34
+ "freeze": ["teacher","base"],
35
+ "seed": 42
36
+ },
37
+ "output": {
38
+ "output_dir": "local/checkpoints/0.6+math_C2C_general",
39
+ "save_steps": 500,
40
+ "eval_steps": 100,
41
+ "wandb_config": {
42
+ "project": "Rosetta",
43
+ "mode": "online",
44
+ "entity": "nics-efc",
45
+ "run_name": "0.6B+math_C2C_general"
46
+ }
47
+ },
48
+ "data": {
49
+ "type": "OpenHermesChatDataset",
50
+ "kwargs": {
51
+ "split": "train",
52
+ "max_word_count": 2048,
53
+ "num_samples": 500000
54
+ },
55
+ "train_ratio": 0.99
56
+ }
57
+ }
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/aggregator_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {}
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_0.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 2,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1929,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a04cc6aaf0951e2d1c02cc65d35b227c80c1dc0b3b48fcfb1223fbe4b605d93
3
+ size 34669511
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_1.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 2,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1929,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_1.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:384693a79009a82e1814e962e16a2198f8870f1fea6632d17980ecd13948377f
3
+ size 34669511
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_10.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 2,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1929,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_10.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4719d57089eb68d187c79cd430a2cb3facf7f558e6026515eb7f551615e47c43
3
+ size 34669548
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_11.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 2,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1929,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_11.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e24b13c4e0913a15a95124b9cf038eec4817d12fa41942076c729504890d894c
3
+ size 34669548
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_12.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 2,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1929,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_12.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fab1ff921640073009591dd1cfdb9123e6b9972ce149cc8c4ea3a6cf78170e70
3
+ size 34669548
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_13.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 2,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1929,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_13.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6740ef5c728556c99f3c80fd6be80ea4663d5933c09c18ef73750425a112f3e6
3
+ size 34669548
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_14.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 2,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1929,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_14.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b12043f37bbc5545b2f53d7cc230b346f4577754dad4bfd3d9094debd967ae1
3
+ size 34669548
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_15.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 2,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1929,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_15.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36e7cbc661af2b845daef58381f9e492fd01360143a43e6f1cd834d59c3fa6a6
3
+ size 34669548
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_16.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 2,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1929,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_16.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:afdacd832765cff3b1da2c2bb5260ea129f96c10178be893fa9e21522c91e6ef
3
+ size 34669548
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_17.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 2,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1929,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_17.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b703b1a7942f052388e4586cc1924b1d46234fd7c7d949ce2396df5b880f35d
3
+ size 34669548
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_18.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 2,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1929,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_18.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:342950f4c5922187897fce4d377291d9c94fc56219bd780d34f35066e8195faa
3
+ size 34669548
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_19.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 2,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1929,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_19.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c19ffb1469f350ab999f49292c953cb46d2b92198de2e57f6c5422366b89246c
3
+ size 34669548
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_2.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 2,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1929,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_2.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8687fac1be13e65e0ab9216e84dd001f80943be4be6fee376c3922a3c38b1191
3
+ size 34669511
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_20.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 2,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1929,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_20.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7148f304c760d0eca1cec0f07567e927fb1764a21985effe60981889a2c7772b
3
+ size 34669548
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_21.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 2,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1929,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_21.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8239af67a58293570dc54b74e1ad48c0e3e2192593ea0c06f156fd93754a8eb
3
+ size 34669548
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_22.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 2,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1929,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_22.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6145d5a15e79876c5274976040eb99647a029ec964fb00fcf50ff582b5aeacd1
3
+ size 34669548
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_23.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 2,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1929,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_23.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38ed2217c4cdd98b4d21cd6abdba1e2f6e6f643c50e389eef51db64cf901838d
3
+ size 34669548
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_24.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 2,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1929,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_24.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0708dc71f5bb708316d71eafab75ba41bb4c0b74e4b269e5e3d04a7bd1531d78
3
+ size 34669548
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_25.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 2,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1929,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_25.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e79a40e95aa61413085b4d92413a4e12eadc8e6c9b1d1302748e6cb9c189858
3
+ size 34669548
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_26.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 2,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1929,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_26.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b99606b6ce97b5796b88984f81b8da0549bbf7495c34246e55031c83679ac22
3
+ size 34669548
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_27.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 2,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1929,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_27.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:471d909fa34e1c955486bd001feef8b31db49b742a30a88192856da312e06932
3
+ size 34669548
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_3.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 2,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1929,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_3.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abe9e0e3b93edca1b8c75c8b292c595f7a7d510657119f9282a9e7bd4241dabf
3
+ size 34669511
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_4.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 2,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1929,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_4.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db5c8c5eaafbe17345f95b27ca0a2d0951fdf0b4eed7d33442f3f492ce505a03
3
+ size 34669511
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_5.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "class": "C2CProjector",
3
+ "init_args": {
4
+ "source_dim": 128,
5
+ "target_dim": 128,
6
+ "source_num_heads": 2,
7
+ "target_num_heads": 8,
8
+ "hidden_dim": 1024,
9
+ "intermediate_dim": 1024,
10
+ "num_layers": 3,
11
+ "dropout": 0.1,
12
+ "initial_temperature": 1.0,
13
+ "final_temperature": 0.001,
14
+ "anneal_steps": 1929,
15
+ "dtype": {
16
+ "__type__": "torch.dtype",
17
+ "value": "bfloat16"
18
+ }
19
+ }
20
+ }
qwen3_0.6b+qwen2.5_1.5b_math_Fuser/final/projector_5.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa95e7be8d887ec04db835bcab9c7b8b5cae3c4fc58f7b71e8f3b25d94619bc9
3
+ size 34669511