DavidNguyen committed on
Commit
5bb72ae
·
verified ·
1 Parent(s): fd20217

Delete CompeteSMoE/competesmoe_versions

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. CompeteSMoE/competesmoe_versions/Base_competesmoe/added_tokens.json +0 -13
  2. CompeteSMoE/competesmoe_versions/Base_competesmoe/config.json +0 -181
  3. CompeteSMoE/competesmoe_versions/Base_competesmoe/generation_config.json +0 -12
  4. CompeteSMoE/competesmoe_versions/Base_competesmoe/model-00001-of-00003.safetensors +0 -3
  5. CompeteSMoE/competesmoe_versions/Base_competesmoe/model-00002-of-00003.safetensors +0 -3
  6. CompeteSMoE/competesmoe_versions/Base_competesmoe/model-00003-of-00003.safetensors +0 -3
  7. CompeteSMoE/competesmoe_versions/Base_competesmoe/model.safetensors.index.json +0 -0
  8. CompeteSMoE/competesmoe_versions/Base_competesmoe/special_tokens_map.json +0 -24
  9. CompeteSMoE/competesmoe_versions/Base_competesmoe/tokenizer.model +0 -3
  10. CompeteSMoE/competesmoe_versions/Base_competesmoe/tokenizer_config.json +0 -132
  11. CompeteSMoE/competesmoe_versions/Base_competesmoe/trainer_state.json +0 -0
  12. CompeteSMoE/competesmoe_versions/Base_competesmoe/training_args.bin +0 -3
  13. CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/added_tokens.json +0 -13
  14. CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/config.json +0 -197
  15. CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/generation_config.json +0 -12
  16. CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/model-00001-of-00003.safetensors +0 -3
  17. CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/model-00002-of-00003.safetensors +0 -3
  18. CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/model-00003-of-00003.safetensors +0 -3
  19. CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/model.safetensors.index.json +0 -0
  20. CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/special_tokens_map.json +0 -24
  21. CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/tokenizer.model +0 -3
  22. CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/tokenizer_config.json +0 -132
  23. CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/trainer_state.json +0 -0
  24. CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/training_args.bin +0 -3
  25. CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/added_tokens.json +0 -13
  26. CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/config.json +0 -198
  27. CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/generation_config.json +0 -12
  28. CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/model-00001-of-00003.safetensors +0 -3
  29. CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/model-00002-of-00003.safetensors +0 -3
  30. CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/model-00003-of-00003.safetensors +0 -3
  31. CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/model.safetensors.index.json +0 -0
  32. CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/special_tokens_map.json +0 -24
  33. CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/tokenizer.model +0 -3
  34. CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/tokenizer_config.json +0 -132
  35. CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/trainer_state.json +0 -0
  36. CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/training_args.bin +0 -3
  37. CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/added_tokens.json +0 -13
  38. CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/config.json +0 -199
  39. CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/generation_config.json +0 -12
  40. CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/model-00001-of-00003.safetensors +0 -3
  41. CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/model-00002-of-00003.safetensors +0 -3
  42. CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/model-00003-of-00003.safetensors +0 -3
  43. CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/model.safetensors.index.json +0 -0
  44. CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/special_tokens_map.json +0 -24
  45. CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/tokenizer.model +0 -3
  46. CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/tokenizer_config.json +0 -132
  47. CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/trainer_state.json +0 -0
  48. CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/training_args.bin +0 -3
  49. CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_softmax_competesmoev30/added_tokens.json +0 -13
  50. CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_softmax_competesmoev30/config.json +0 -199
CompeteSMoE/competesmoe_versions/Base_competesmoe/added_tokens.json DELETED
@@ -1,13 +0,0 @@
1
- {
2
- "<|assistant|>": 32001,
3
- "<|endoftext|>": 32000,
4
- "<|end|>": 32007,
5
- "<|placeholder1|>": 32002,
6
- "<|placeholder2|>": 32003,
7
- "<|placeholder3|>": 32004,
8
- "<|placeholder4|>": 32005,
9
- "<|placeholder5|>": 32008,
10
- "<|placeholder6|>": 32009,
11
- "<|system|>": 32006,
12
- "<|user|>": 32010
13
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
CompeteSMoE/competesmoe_versions/Base_competesmoe/config.json DELETED
@@ -1,181 +0,0 @@
1
- {
2
- "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft",
3
- "architectures": [
4
- "LlavaPhiForCausalLM"
5
- ],
6
- "attention_bias": false,
7
- "attention_dropout": 0.0,
8
- "auto_map": {
9
- "AutoConfig": "configuration_phi3.Phi3Config",
10
- "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
11
- },
12
- "balance_loss_coef": 0.01,
13
- "bos_token_id": 1,
14
- "clip_smoe": true,
15
- "dropout": false,
16
- "embd_pdrop": 0.0,
17
- "eos_token_id": 32000,
18
- "freeze_mm_mlp_adapter": false,
19
- "hidden_act": "silu",
20
- "hidden_size": 3072,
21
- "image_aspect_ratio": "pad",
22
- "initializer_range": 0.02,
23
- "intermediate_size": 8192,
24
- "local_rank": 0,
25
- "loss1": "balanceloss",
26
- "loss2": "zloss",
27
- "luna": false,
28
- "max_position_embeddings": 131072,
29
- "mlp_smoe": true,
30
- "mm_hidden_size": 1152,
31
- "mm_patch_merge_type": "flat",
32
- "mm_projector_lr": null,
33
- "mm_projector_type": "moe",
34
- "mm_use_im_patch_token": false,
35
- "mm_use_im_start_end": false,
36
- "mm_vision_select_feature": "patch",
37
- "mm_vision_select_layer": -2,
38
- "mm_vision_tower": "google/siglip-so400m-patch14-224",
39
- "model_type": "llava_phi",
40
- "moe_name": "competesmoe",
41
- "normalization": true,
42
- "num_attention_heads": 32,
43
- "num_experts": 4,
44
- "num_hidden_layers": 32,
45
- "num_key_value_heads": 32,
46
- "num_layers": 3,
47
- "num_selected": 2,
48
- "number_of_previous_tokens": 2,
49
- "original_max_position_embeddings": 4096,
50
- "pad_token_id": 32000,
51
- "rate_compete": 0.2,
52
- "rate_flip": 0.07,
53
- "resid_pdrop": 0.0,
54
- "rms_norm_eps": 1e-05,
55
- "rope_scaling": {
56
- "long_factor": [
57
- 1.0800000429153442,
58
- 1.1100000143051147,
59
- 1.1399999856948853,
60
- 1.340000033378601,
61
- 1.5899999141693115,
62
- 1.600000023841858,
63
- 1.6200000047683716,
64
- 2.620000123977661,
65
- 3.2300000190734863,
66
- 3.2300000190734863,
67
- 4.789999961853027,
68
- 7.400000095367432,
69
- 7.700000286102295,
70
- 9.09000015258789,
71
- 12.199999809265137,
72
- 17.670000076293945,
73
- 24.46000099182129,
74
- 28.57000160217285,
75
- 30.420001983642578,
76
- 30.840002059936523,
77
- 32.590003967285156,
78
- 32.93000411987305,
79
- 42.320003509521484,
80
- 44.96000289916992,
81
- 50.340003967285156,
82
- 50.45000457763672,
83
- 57.55000305175781,
84
- 57.93000411987305,
85
- 58.21000289916992,
86
- 60.1400032043457,
87
- 62.61000442504883,
88
- 62.62000274658203,
89
- 62.71000289916992,
90
- 63.1400032043457,
91
- 63.1400032043457,
92
- 63.77000427246094,
93
- 63.93000411987305,
94
- 63.96000289916992,
95
- 63.970001220703125,
96
- 64.02999877929688,
97
- 64.06999969482422,
98
- 64.08000183105469,
99
- 64.12000274658203,
100
- 64.41000366210938,
101
- 64.4800033569336,
102
- 64.51000213623047,
103
- 64.52999877929688,
104
- 64.83999633789062
105
- ],
106
- "short_factor": [
107
- 1.0,
108
- 1.0199999809265137,
109
- 1.0299999713897705,
110
- 1.0299999713897705,
111
- 1.0499999523162842,
112
- 1.0499999523162842,
113
- 1.0499999523162842,
114
- 1.0499999523162842,
115
- 1.0499999523162842,
116
- 1.0699999332427979,
117
- 1.0999999046325684,
118
- 1.1099998950958252,
119
- 1.1599998474121094,
120
- 1.1599998474121094,
121
- 1.1699998378753662,
122
- 1.2899998426437378,
123
- 1.339999794960022,
124
- 1.679999828338623,
125
- 1.7899998426437378,
126
- 1.8199998140335083,
127
- 1.8499997854232788,
128
- 1.8799997568130493,
129
- 1.9099997282028198,
130
- 1.9399996995925903,
131
- 1.9899996519088745,
132
- 2.0199997425079346,
133
- 2.0199997425079346,
134
- 2.0199997425079346,
135
- 2.0199997425079346,
136
- 2.0199997425079346,
137
- 2.0199997425079346,
138
- 2.0299997329711914,
139
- 2.0299997329711914,
140
- 2.0299997329711914,
141
- 2.0299997329711914,
142
- 2.0299997329711914,
143
- 2.0299997329711914,
144
- 2.0299997329711914,
145
- 2.0299997329711914,
146
- 2.0299997329711914,
147
- 2.0799996852874756,
148
- 2.0899996757507324,
149
- 2.189999580383301,
150
- 2.2199995517730713,
151
- 2.5899994373321533,
152
- 2.729999542236328,
153
- 2.749999523162842,
154
- 2.8399994373321533
155
- ],
156
- "type": "longrope"
157
- },
158
- "rope_theta": 10000.0,
159
- "router_loss_coef": 0.01,
160
- "router_z_loss_coef": 0.001,
161
- "scales": [
162
- 1,
163
- 3
164
- ],
165
- "sliding_window": 262144,
166
- "sparse_upcycling": true,
167
- "strategy_train": "base",
168
- "tie_word_embeddings": false,
169
- "tokenizer_model_max_length": 2048,
170
- "tokenizer_padding_side": "right",
171
- "topk_max": 2,
172
- "topk_min": 1,
173
- "torch_dtype": "bfloat16",
174
- "training": true,
175
- "transformers_version": "4.43.0",
176
- "tune_mm_mlp_adapter": false,
177
- "use_cache": true,
178
- "use_mm_proj": true,
179
- "vocab_size": 32064,
180
- "warm_up": 0.05
181
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
CompeteSMoE/competesmoe_versions/Base_competesmoe/generation_config.json DELETED
@@ -1,12 +0,0 @@
1
- {
2
- "_from_model_config": true,
3
- "bos_token_id": 1,
4
- "do_sample": true,
5
- "eos_token_id": [
6
- 32007,
7
- 32001,
8
- 32000
9
- ],
10
- "pad_token_id": 32000,
11
- "transformers_version": "4.43.0"
12
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
CompeteSMoE/competesmoe_versions/Base_competesmoe/model-00001-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d758761d12f32445cff89b886f3d566a98af42308ff9bf36b4aabc1b9fa1e343
3
- size 4972489328
 
 
 
 
CompeteSMoE/competesmoe_versions/Base_competesmoe/model-00002-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b756c4b38293726783d4925f205f916a2bf4c68f96c7f2499991c8000c75b4ef
3
- size 4985754844
 
 
 
 
CompeteSMoE/competesmoe_versions/Base_competesmoe/model-00003-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:bbdede0208bda4d46420128fe04f5a4ace54555672c0720105ac3ef2e9933222
3
- size 248943552
 
 
 
 
CompeteSMoE/competesmoe_versions/Base_competesmoe/model.safetensors.index.json DELETED
The diff for this file is too large to render. See raw diff
 
CompeteSMoE/competesmoe_versions/Base_competesmoe/special_tokens_map.json DELETED
@@ -1,24 +0,0 @@
1
- {
2
- "bos_token": {
3
- "content": "<s>",
4
- "lstrip": false,
5
- "normalized": false,
6
- "rstrip": false,
7
- "single_word": false
8
- },
9
- "eos_token": {
10
- "content": "<|endoftext|>",
11
- "lstrip": false,
12
- "normalized": false,
13
- "rstrip": false,
14
- "single_word": false
15
- },
16
- "pad_token": "<unk>",
17
- "unk_token": {
18
- "content": "<unk>",
19
- "lstrip": false,
20
- "normalized": false,
21
- "rstrip": false,
22
- "single_word": false
23
- }
24
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
CompeteSMoE/competesmoe_versions/Base_competesmoe/tokenizer.model DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
- size 499723
 
 
 
 
CompeteSMoE/competesmoe_versions/Base_competesmoe/tokenizer_config.json DELETED
@@ -1,132 +0,0 @@
1
- {
2
- "add_bos_token": false,
3
- "add_eos_token": false,
4
- "add_prefix_space": true,
5
- "added_tokens_decoder": {
6
- "0": {
7
- "content": "<unk>",
8
- "lstrip": false,
9
- "normalized": false,
10
- "rstrip": false,
11
- "single_word": false,
12
- "special": true
13
- },
14
- "1": {
15
- "content": "<s>",
16
- "lstrip": false,
17
- "normalized": false,
18
- "rstrip": false,
19
- "single_word": false,
20
- "special": true
21
- },
22
- "2": {
23
- "content": "</s>",
24
- "lstrip": false,
25
- "normalized": false,
26
- "rstrip": true,
27
- "single_word": false,
28
- "special": false
29
- },
30
- "32000": {
31
- "content": "<|endoftext|>",
32
- "lstrip": false,
33
- "normalized": false,
34
- "rstrip": false,
35
- "single_word": false,
36
- "special": true
37
- },
38
- "32001": {
39
- "content": "<|assistant|>",
40
- "lstrip": false,
41
- "normalized": false,
42
- "rstrip": true,
43
- "single_word": false,
44
- "special": true
45
- },
46
- "32002": {
47
- "content": "<|placeholder1|>",
48
- "lstrip": false,
49
- "normalized": false,
50
- "rstrip": true,
51
- "single_word": false,
52
- "special": true
53
- },
54
- "32003": {
55
- "content": "<|placeholder2|>",
56
- "lstrip": false,
57
- "normalized": false,
58
- "rstrip": true,
59
- "single_word": false,
60
- "special": true
61
- },
62
- "32004": {
63
- "content": "<|placeholder3|>",
64
- "lstrip": false,
65
- "normalized": false,
66
- "rstrip": true,
67
- "single_word": false,
68
- "special": true
69
- },
70
- "32005": {
71
- "content": "<|placeholder4|>",
72
- "lstrip": false,
73
- "normalized": false,
74
- "rstrip": true,
75
- "single_word": false,
76
- "special": true
77
- },
78
- "32006": {
79
- "content": "<|system|>",
80
- "lstrip": false,
81
- "normalized": false,
82
- "rstrip": true,
83
- "single_word": false,
84
- "special": true
85
- },
86
- "32007": {
87
- "content": "<|end|>",
88
- "lstrip": false,
89
- "normalized": false,
90
- "rstrip": true,
91
- "single_word": false,
92
- "special": true
93
- },
94
- "32008": {
95
- "content": "<|placeholder5|>",
96
- "lstrip": false,
97
- "normalized": false,
98
- "rstrip": true,
99
- "single_word": false,
100
- "special": true
101
- },
102
- "32009": {
103
- "content": "<|placeholder6|>",
104
- "lstrip": false,
105
- "normalized": false,
106
- "rstrip": true,
107
- "single_word": false,
108
- "special": true
109
- },
110
- "32010": {
111
- "content": "<|user|>",
112
- "lstrip": false,
113
- "normalized": false,
114
- "rstrip": true,
115
- "single_word": false,
116
- "special": true
117
- }
118
- },
119
- "bos_token": "<s>",
120
- "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
121
- "clean_up_tokenization_spaces": false,
122
- "eos_token": "<|endoftext|>",
123
- "legacy": false,
124
- "model_max_length": 2048,
125
- "pad_token": "<unk>",
126
- "padding_side": "right",
127
- "sp_model_kwargs": {},
128
- "spaces_between_special_tokens": false,
129
- "tokenizer_class": "LlamaTokenizer",
130
- "unk_token": "<unk>",
131
- "use_default_system_prompt": false
132
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
CompeteSMoE/competesmoe_versions/Base_competesmoe/trainer_state.json DELETED
The diff for this file is too large to render. See raw diff
 
CompeteSMoE/competesmoe_versions/Base_competesmoe/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:96fc8b7edffad1e7f41bdbffbf5e77a25f0663891b83c6d4ee5bec31afe1df9b
3
- size 7928
 
 
 
 
CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/added_tokens.json DELETED
@@ -1,13 +0,0 @@
1
- {
2
- "<|assistant|>": 32001,
3
- "<|endoftext|>": 32000,
4
- "<|end|>": 32007,
5
- "<|placeholder1|>": 32002,
6
- "<|placeholder2|>": 32003,
7
- "<|placeholder3|>": 32004,
8
- "<|placeholder4|>": 32005,
9
- "<|placeholder5|>": 32008,
10
- "<|placeholder6|>": 32009,
11
- "<|system|>": 32006,
12
- "<|user|>": 32010
13
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/config.json DELETED
@@ -1,197 +0,0 @@
1
- {
2
- "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft",
3
- "architectures": [
4
- "LlavaPhiForCausalLM"
5
- ],
6
- "attention_bias": false,
7
- "attention_dropout": 0.0,
8
- "auto_map": {
9
- "AutoConfig": "configuration_phi3.Phi3Config",
10
- "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
11
- },
12
- "balance_loss_coef": 0.01,
13
- "bos_token_id": 1,
14
- "clip_smoe": true,
15
- "diversity_loss_coef": 0.01,
16
- "dropout": false,
17
- "e_loss_coef": 0.001,
18
- "embd_pdrop": 0.0,
19
- "entropy_advance_loss": false,
20
- "eos_token_id": 32000,
21
- "freeze_backbone": false,
22
- "freeze_mm_mlp_adapter": false,
23
- "hidden_act": "silu",
24
- "hidden_size": 3072,
25
- "hybrid": true,
26
- "image_aspect_ratio": "pad",
27
- "init_weight": true,
28
- "initializer_range": 0.02,
29
- "intermediate_size": 8192,
30
- "local_rank": 0,
31
- "loss1": "balanceloss",
32
- "loss2": "zloss",
33
- "luna": false,
34
- "max_compete_in_iter": 7,
35
- "max_position_embeddings": 131072,
36
- "mlp_smoe": true,
37
- "mm_hidden_size": 1152,
38
- "mm_patch_merge_type": "flat",
39
- "mm_projector_lr": null,
40
- "mm_projector_type": "moe",
41
- "mm_use_im_patch_token": false,
42
- "mm_use_im_start_end": false,
43
- "mm_vision_select_feature": "patch",
44
- "mm_vision_select_layer": -2,
45
- "mm_vision_tower": "google/siglip-so400m-patch14-224",
46
- "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft",
47
- "model_type": "llava_phi",
48
- "moe_name": "competesmoev32",
49
- "norm_softmax": false,
50
- "normalization": true,
51
- "num_attention_heads": 32,
52
- "num_experts": 4,
53
- "num_hidden_layers": 32,
54
- "num_key_value_heads": 32,
55
- "num_layers": 3,
56
- "num_selected": 2,
57
- "number_of_previous_tokens": 2,
58
- "original_max_position_embeddings": 4096,
59
- "pad_token_id": 32000,
60
- "pretrain_mm_mlp_adapter": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/mm_projector.bin",
61
- "rate_compete": 0.2,
62
- "rate_flip": 0.07,
63
- "resid_pdrop": 0.0,
64
- "rms_norm_eps": 1e-05,
65
- "rope_scaling": {
66
- "long_factor": [
67
- 1.0800000429153442,
68
- 1.1100000143051147,
69
- 1.1399999856948853,
70
- 1.340000033378601,
71
- 1.5899999141693115,
72
- 1.600000023841858,
73
- 1.6200000047683716,
74
- 2.620000123977661,
75
- 3.2300000190734863,
76
- 3.2300000190734863,
77
- 4.789999961853027,
78
- 7.400000095367432,
79
- 7.700000286102295,
80
- 9.09000015258789,
81
- 12.199999809265137,
82
- 17.670000076293945,
83
- 24.46000099182129,
84
- 28.57000160217285,
85
- 30.420001983642578,
86
- 30.840002059936523,
87
- 32.590003967285156,
88
- 32.93000411987305,
89
- 42.320003509521484,
90
- 44.96000289916992,
91
- 50.340003967285156,
92
- 50.45000457763672,
93
- 57.55000305175781,
94
- 57.93000411987305,
95
- 58.21000289916992,
96
- 60.1400032043457,
97
- 62.61000442504883,
98
- 62.62000274658203,
99
- 62.71000289916992,
100
- 63.1400032043457,
101
- 63.1400032043457,
102
- 63.77000427246094,
103
- 63.93000411987305,
104
- 63.96000289916992,
105
- 63.970001220703125,
106
- 64.02999877929688,
107
- 64.06999969482422,
108
- 64.08000183105469,
109
- 64.12000274658203,
110
- 64.41000366210938,
111
- 64.4800033569336,
112
- 64.51000213623047,
113
- 64.52999877929688,
114
- 64.83999633789062
115
- ],
116
- "short_factor": [
117
- 1.0,
118
- 1.0199999809265137,
119
- 1.0299999713897705,
120
- 1.0299999713897705,
121
- 1.0499999523162842,
122
- 1.0499999523162842,
123
- 1.0499999523162842,
124
- 1.0499999523162842,
125
- 1.0499999523162842,
126
- 1.0699999332427979,
127
- 1.0999999046325684,
128
- 1.1099998950958252,
129
- 1.1599998474121094,
130
- 1.1599998474121094,
131
- 1.1699998378753662,
132
- 1.2899998426437378,
133
- 1.339999794960022,
134
- 1.679999828338623,
135
- 1.7899998426437378,
136
- 1.8199998140335083,
137
- 1.8499997854232788,
138
- 1.8799997568130493,
139
- 1.9099997282028198,
140
- 1.9399996995925903,
141
- 1.9899996519088745,
142
- 2.0199997425079346,
143
- 2.0199997425079346,
144
- 2.0199997425079346,
145
- 2.0199997425079346,
146
- 2.0199997425079346,
147
- 2.0199997425079346,
148
- 2.0299997329711914,
149
- 2.0299997329711914,
150
- 2.0299997329711914,
151
- 2.0299997329711914,
152
- 2.0299997329711914,
153
- 2.0299997329711914,
154
- 2.0299997329711914,
155
- 2.0299997329711914,
156
- 2.0299997329711914,
157
- 2.0799996852874756,
158
- 2.0899996757507324,
159
- 2.189999580383301,
160
- 2.2199995517730713,
161
- 2.5899994373321533,
162
- 2.729999542236328,
163
- 2.749999523162842,
164
- 2.8399994373321533
165
- ],
166
- "type": "longrope"
167
- },
168
- "rope_theta": 10000.0,
169
- "router_loss_coef": 0.01,
170
- "router_theta": 0.5,
171
- "router_z_loss_coef": 0.001,
172
- "scales": [
173
- 1,
174
- 3
175
- ],
176
- "sliding_window": 262144,
177
- "sparse_upcycling": true,
178
- "strategy_train": "base",
179
- "tie_word_embeddings": false,
180
- "tokenizer_model_max_length": 2048,
181
- "tokenizer_padding_side": "right",
182
- "topk_max": 2,
183
- "topk_min": 1,
184
- "torch_dtype": "bfloat16",
185
- "training": true,
186
- "transformers_version": "4.43.0",
187
- "tune_mm_mlp_adapter": false,
188
- "unit_test": true,
189
- "use_cache": true,
190
- "use_mm_proj": true,
191
- "use_old": false,
192
- "version": "phi35",
193
- "vision_tower": "google/siglip-so400m-patch14-224",
194
- "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin",
195
- "vocab_size": 32064,
196
- "warm_up": 0.05
197
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/generation_config.json DELETED
@@ -1,12 +0,0 @@
1
- {
2
- "_from_model_config": true,
3
- "bos_token_id": 1,
4
- "do_sample": true,
5
- "eos_token_id": [
6
- 32007,
7
- 32001,
8
- 32000
9
- ],
10
- "pad_token_id": 32000,
11
- "transformers_version": "4.43.0"
12
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/model-00001-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ecf88bd826e2422e46bb44344ec13166b5528d8abe2979ea189721486cfb2d5a
3
- size 4972489328
 
 
 
 
CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/model-00002-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:75b6a1fccc27443f8bc30f9fdd03af9e806e13573c5f0e18414d698b93fefd46
3
- size 4985976068
 
 
 
 
CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/model-00003-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e86a9b747ea14004ff2860bafea7a5c9328df2c8c1adace994557c745dcb7cdc
3
- size 248943552
 
 
 
 
CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/model.safetensors.index.json DELETED
The diff for this file is too large to render. See raw diff
 
CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/special_tokens_map.json DELETED
@@ -1,24 +0,0 @@
1
- {
2
- "bos_token": {
3
- "content": "<s>",
4
- "lstrip": false,
5
- "normalized": false,
6
- "rstrip": false,
7
- "single_word": false
8
- },
9
- "eos_token": {
10
- "content": "<|endoftext|>",
11
- "lstrip": false,
12
- "normalized": false,
13
- "rstrip": false,
14
- "single_word": false
15
- },
16
- "pad_token": "<unk>",
17
- "unk_token": {
18
- "content": "<unk>",
19
- "lstrip": false,
20
- "normalized": false,
21
- "rstrip": false,
22
- "single_word": false
23
- }
24
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/tokenizer.model DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
- size 499723
 
 
 
 
CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/tokenizer_config.json DELETED
@@ -1,132 +0,0 @@
1
- {
2
- "add_bos_token": false,
3
- "add_eos_token": false,
4
- "add_prefix_space": true,
5
- "added_tokens_decoder": {
6
- "0": {
7
- "content": "<unk>",
8
- "lstrip": false,
9
- "normalized": false,
10
- "rstrip": false,
11
- "single_word": false,
12
- "special": true
13
- },
14
- "1": {
15
- "content": "<s>",
16
- "lstrip": false,
17
- "normalized": false,
18
- "rstrip": false,
19
- "single_word": false,
20
- "special": true
21
- },
22
- "2": {
23
- "content": "</s>",
24
- "lstrip": false,
25
- "normalized": false,
26
- "rstrip": true,
27
- "single_word": false,
28
- "special": false
29
- },
30
- "32000": {
31
- "content": "<|endoftext|>",
32
- "lstrip": false,
33
- "normalized": false,
34
- "rstrip": false,
35
- "single_word": false,
36
- "special": true
37
- },
38
- "32001": {
39
- "content": "<|assistant|>",
40
- "lstrip": false,
41
- "normalized": false,
42
- "rstrip": true,
43
- "single_word": false,
44
- "special": true
45
- },
46
- "32002": {
47
- "content": "<|placeholder1|>",
48
- "lstrip": false,
49
- "normalized": false,
50
- "rstrip": true,
51
- "single_word": false,
52
- "special": true
53
- },
54
- "32003": {
55
- "content": "<|placeholder2|>",
56
- "lstrip": false,
57
- "normalized": false,
58
- "rstrip": true,
59
- "single_word": false,
60
- "special": true
61
- },
62
- "32004": {
63
- "content": "<|placeholder3|>",
64
- "lstrip": false,
65
- "normalized": false,
66
- "rstrip": true,
67
- "single_word": false,
68
- "special": true
69
- },
70
- "32005": {
71
- "content": "<|placeholder4|>",
72
- "lstrip": false,
73
- "normalized": false,
74
- "rstrip": true,
75
- "single_word": false,
76
- "special": true
77
- },
78
- "32006": {
79
- "content": "<|system|>",
80
- "lstrip": false,
81
- "normalized": false,
82
- "rstrip": true,
83
- "single_word": false,
84
- "special": true
85
- },
86
- "32007": {
87
- "content": "<|end|>",
88
- "lstrip": false,
89
- "normalized": false,
90
- "rstrip": true,
91
- "single_word": false,
92
- "special": true
93
- },
94
- "32008": {
95
- "content": "<|placeholder5|>",
96
- "lstrip": false,
97
- "normalized": false,
98
- "rstrip": true,
99
- "single_word": false,
100
- "special": true
101
- },
102
- "32009": {
103
- "content": "<|placeholder6|>",
104
- "lstrip": false,
105
- "normalized": false,
106
- "rstrip": true,
107
- "single_word": false,
108
- "special": true
109
- },
110
- "32010": {
111
- "content": "<|user|>",
112
- "lstrip": false,
113
- "normalized": false,
114
- "rstrip": true,
115
- "single_word": false,
116
- "special": true
117
- }
118
- },
119
- "bos_token": "<s>",
120
- "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
121
- "clean_up_tokenization_spaces": false,
122
- "eos_token": "<|endoftext|>",
123
- "legacy": false,
124
- "model_max_length": 2048,
125
- "pad_token": "<unk>",
126
- "padding_side": "right",
127
- "sp_model_kwargs": {},
128
- "spaces_between_special_tokens": false,
129
- "tokenizer_class": "LlamaTokenizer",
130
- "unk_token": "<unk>",
131
- "use_default_system_prompt": false
132
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/trainer_state.json DELETED
The diff for this file is too large to render. See raw diff
 
CompeteSMoE/competesmoe_versions/Full_BS_theta0.1_RL0.01_competesmoev32/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c3368de8c7027967a2c74be84a935a4090535aa9e8533641e4f4b02232e6e70a
3
- size 7992
 
 
 
 
CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/added_tokens.json DELETED
@@ -1,13 +0,0 @@
1
- {
2
- "<|assistant|>": 32001,
3
- "<|endoftext|>": 32000,
4
- "<|end|>": 32007,
5
- "<|placeholder1|>": 32002,
6
- "<|placeholder2|>": 32003,
7
- "<|placeholder3|>": 32004,
8
- "<|placeholder4|>": 32005,
9
- "<|placeholder5|>": 32008,
10
- "<|placeholder6|>": 32009,
11
- "<|system|>": 32006,
12
- "<|user|>": 32010
13
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/config.json DELETED
@@ -1,198 +0,0 @@
1
- {
2
- "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft",
3
- "architectures": [
4
- "LlavaPhiForCausalLM"
5
- ],
6
- "attention_bias": false,
7
- "attention_dropout": 0.0,
8
- "auto_map": {
9
- "AutoConfig": "configuration_phi3.Phi3Config",
10
- "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
11
- },
12
- "balance_loss_coef": 0.01,
13
- "bos_token_id": 1,
14
- "clip_smoe": true,
15
- "diversity_loss_coef": 0.01,
16
- "dropout": false,
17
- "e_loss_coef": 0.001,
18
- "embd_pdrop": 0.0,
19
- "entropy_advance_loss": false,
20
- "eos_token_id": 32000,
21
- "freeze_backbone": false,
22
- "freeze_mm_mlp_adapter": false,
23
- "hidden_act": "silu",
24
- "hidden_size": 3072,
25
- "hybrid": true,
26
- "image_aspect_ratio": "pad",
27
- "init_weight": true,
28
- "initializer_range": 0.02,
29
- "intermediate_size": 8192,
30
- "is_cosine": true,
31
- "local_rank": 0,
32
- "loss1": "balanceloss",
33
- "loss2": "zloss",
34
- "luna": false,
35
- "max_compete_in_iter": 8,
36
- "max_position_embeddings": 131072,
37
- "mlp_smoe": true,
38
- "mm_hidden_size": 1152,
39
- "mm_patch_merge_type": "flat",
40
- "mm_projector_lr": null,
41
- "mm_projector_type": "moe",
42
- "mm_use_im_patch_token": false,
43
- "mm_use_im_start_end": false,
44
- "mm_vision_select_feature": "patch",
45
- "mm_vision_select_layer": -2,
46
- "mm_vision_tower": "google/siglip-so400m-patch14-224",
47
- "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft",
48
- "model_type": "llava_phi",
49
- "moe_name": "competesmoev30",
50
- "norm_softmax": false,
51
- "normalization": true,
52
- "num_attention_heads": 32,
53
- "num_experts": 4,
54
- "num_hidden_layers": 32,
55
- "num_key_value_heads": 32,
56
- "num_layers": 3,
57
- "num_selected": 2,
58
- "number_of_previous_tokens": 2,
59
- "original_max_position_embeddings": 4096,
60
- "pad_token_id": 32000,
61
- "pretrain_mm_mlp_adapter": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/mm_projector.bin",
62
- "rate_compete": 0.2,
63
- "rate_flip": 0.07,
64
- "resid_pdrop": 0.0,
65
- "rms_norm_eps": 1e-05,
66
- "rope_scaling": {
67
- "long_factor": [
68
- 1.0800000429153442,
69
- 1.1100000143051147,
70
- 1.1399999856948853,
71
- 1.340000033378601,
72
- 1.5899999141693115,
73
- 1.600000023841858,
74
- 1.6200000047683716,
75
- 2.620000123977661,
76
- 3.2300000190734863,
77
- 3.2300000190734863,
78
- 4.789999961853027,
79
- 7.400000095367432,
80
- 7.700000286102295,
81
- 9.09000015258789,
82
- 12.199999809265137,
83
- 17.670000076293945,
84
- 24.46000099182129,
85
- 28.57000160217285,
86
- 30.420001983642578,
87
- 30.840002059936523,
88
- 32.590003967285156,
89
- 32.93000411987305,
90
- 42.320003509521484,
91
- 44.96000289916992,
92
- 50.340003967285156,
93
- 50.45000457763672,
94
- 57.55000305175781,
95
- 57.93000411987305,
96
- 58.21000289916992,
97
- 60.1400032043457,
98
- 62.61000442504883,
99
- 62.62000274658203,
100
- 62.71000289916992,
101
- 63.1400032043457,
102
- 63.1400032043457,
103
- 63.77000427246094,
104
- 63.93000411987305,
105
- 63.96000289916992,
106
- 63.970001220703125,
107
- 64.02999877929688,
108
- 64.06999969482422,
109
- 64.08000183105469,
110
- 64.12000274658203,
111
- 64.41000366210938,
112
- 64.4800033569336,
113
- 64.51000213623047,
114
- 64.52999877929688,
115
- 64.83999633789062
116
- ],
117
- "short_factor": [
118
- 1.0,
119
- 1.0199999809265137,
120
- 1.0299999713897705,
121
- 1.0299999713897705,
122
- 1.0499999523162842,
123
- 1.0499999523162842,
124
- 1.0499999523162842,
125
- 1.0499999523162842,
126
- 1.0499999523162842,
127
- 1.0699999332427979,
128
- 1.0999999046325684,
129
- 1.1099998950958252,
130
- 1.1599998474121094,
131
- 1.1599998474121094,
132
- 1.1699998378753662,
133
- 1.2899998426437378,
134
- 1.339999794960022,
135
- 1.679999828338623,
136
- 1.7899998426437378,
137
- 1.8199998140335083,
138
- 1.8499997854232788,
139
- 1.8799997568130493,
140
- 1.9099997282028198,
141
- 1.9399996995925903,
142
- 1.9899996519088745,
143
- 2.0199997425079346,
144
- 2.0199997425079346,
145
- 2.0199997425079346,
146
- 2.0199997425079346,
147
- 2.0199997425079346,
148
- 2.0199997425079346,
149
- 2.0299997329711914,
150
- 2.0299997329711914,
151
- 2.0299997329711914,
152
- 2.0299997329711914,
153
- 2.0299997329711914,
154
- 2.0299997329711914,
155
- 2.0299997329711914,
156
- 2.0299997329711914,
157
- 2.0299997329711914,
158
- 2.0799996852874756,
159
- 2.0899996757507324,
160
- 2.189999580383301,
161
- 2.2199995517730713,
162
- 2.5899994373321533,
163
- 2.729999542236328,
164
- 2.749999523162842,
165
- 2.8399994373321533
166
- ],
167
- "type": "longrope"
168
- },
169
- "rope_theta": 10000.0,
170
- "router_loss_coef": 0.01,
171
- "router_theta": 0.1,
172
- "router_z_loss_coef": 0.001,
173
- "scales": [
174
- 1,
175
- 3
176
- ],
177
- "sliding_window": 262144,
178
- "sparse_upcycling": true,
179
- "strategy_train": "base",
180
- "tie_word_embeddings": false,
181
- "tokenizer_model_max_length": 2048,
182
- "tokenizer_padding_side": "right",
183
- "topk_max": 2,
184
- "topk_min": 1,
185
- "torch_dtype": "bfloat16",
186
- "training": true,
187
- "transformers_version": "4.43.0",
188
- "tune_mm_mlp_adapter": false,
189
- "unit_test": true,
190
- "use_cache": true,
191
- "use_mm_proj": true,
192
- "use_old": false,
193
- "version": "phi35",
194
- "vision_tower": "google/siglip-so400m-patch14-224",
195
- "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin",
196
- "vocab_size": 32064,
197
- "warm_up": 0.05
198
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/generation_config.json DELETED
@@ -1,12 +0,0 @@
1
- {
2
- "_from_model_config": true,
3
- "bos_token_id": 1,
4
- "do_sample": true,
5
- "eos_token_id": [
6
- 32007,
7
- 32001,
8
- 32000
9
- ],
10
- "pad_token_id": 32000,
11
- "transformers_version": "4.43.0"
12
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/model-00001-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7b9e25d2c7dd35fb520858ee44457e57989c606e7f6027f6a7a12cddca831477
3
- size 4972489328
 
 
 
 
CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/model-00002-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:167b5c426a7bb824f1672c9bf19964f425a5db29b55542c7b60478e2c7c9fd20
3
- size 4985976068
 
 
 
 
CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/model-00003-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:88d7d575e3bf697f3a39df4f25eeea475ee68b8bf5afbcbed481f5b03d45bb7c
3
- size 248943552
 
 
 
 
CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/model.safetensors.index.json DELETED
The diff for this file is too large to render. See raw diff
 
CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/special_tokens_map.json DELETED
@@ -1,24 +0,0 @@
1
- {
2
- "bos_token": {
3
- "content": "<s>",
4
- "lstrip": false,
5
- "normalized": false,
6
- "rstrip": false,
7
- "single_word": false
8
- },
9
- "eos_token": {
10
- "content": "<|endoftext|>",
11
- "lstrip": false,
12
- "normalized": false,
13
- "rstrip": false,
14
- "single_word": false
15
- },
16
- "pad_token": "<unk>",
17
- "unk_token": {
18
- "content": "<unk>",
19
- "lstrip": false,
20
- "normalized": false,
21
- "rstrip": false,
22
- "single_word": false
23
- }
24
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/tokenizer.model DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
- size 499723
 
 
 
 
CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/tokenizer_config.json DELETED
@@ -1,132 +0,0 @@
1
- {
2
- "add_bos_token": false,
3
- "add_eos_token": false,
4
- "add_prefix_space": true,
5
- "added_tokens_decoder": {
6
- "0": {
7
- "content": "<unk>",
8
- "lstrip": false,
9
- "normalized": false,
10
- "rstrip": false,
11
- "single_word": false,
12
- "special": true
13
- },
14
- "1": {
15
- "content": "<s>",
16
- "lstrip": false,
17
- "normalized": false,
18
- "rstrip": false,
19
- "single_word": false,
20
- "special": true
21
- },
22
- "2": {
23
- "content": "</s>",
24
- "lstrip": false,
25
- "normalized": false,
26
- "rstrip": true,
27
- "single_word": false,
28
- "special": false
29
- },
30
- "32000": {
31
- "content": "<|endoftext|>",
32
- "lstrip": false,
33
- "normalized": false,
34
- "rstrip": false,
35
- "single_word": false,
36
- "special": true
37
- },
38
- "32001": {
39
- "content": "<|assistant|>",
40
- "lstrip": false,
41
- "normalized": false,
42
- "rstrip": true,
43
- "single_word": false,
44
- "special": true
45
- },
46
- "32002": {
47
- "content": "<|placeholder1|>",
48
- "lstrip": false,
49
- "normalized": false,
50
- "rstrip": true,
51
- "single_word": false,
52
- "special": true
53
- },
54
- "32003": {
55
- "content": "<|placeholder2|>",
56
- "lstrip": false,
57
- "normalized": false,
58
- "rstrip": true,
59
- "single_word": false,
60
- "special": true
61
- },
62
- "32004": {
63
- "content": "<|placeholder3|>",
64
- "lstrip": false,
65
- "normalized": false,
66
- "rstrip": true,
67
- "single_word": false,
68
- "special": true
69
- },
70
- "32005": {
71
- "content": "<|placeholder4|>",
72
- "lstrip": false,
73
- "normalized": false,
74
- "rstrip": true,
75
- "single_word": false,
76
- "special": true
77
- },
78
- "32006": {
79
- "content": "<|system|>",
80
- "lstrip": false,
81
- "normalized": false,
82
- "rstrip": true,
83
- "single_word": false,
84
- "special": true
85
- },
86
- "32007": {
87
- "content": "<|end|>",
88
- "lstrip": false,
89
- "normalized": false,
90
- "rstrip": true,
91
- "single_word": false,
92
- "special": true
93
- },
94
- "32008": {
95
- "content": "<|placeholder5|>",
96
- "lstrip": false,
97
- "normalized": false,
98
- "rstrip": true,
99
- "single_word": false,
100
- "special": true
101
- },
102
- "32009": {
103
- "content": "<|placeholder6|>",
104
- "lstrip": false,
105
- "normalized": false,
106
- "rstrip": true,
107
- "single_word": false,
108
- "special": true
109
- },
110
- "32010": {
111
- "content": "<|user|>",
112
- "lstrip": false,
113
- "normalized": false,
114
- "rstrip": true,
115
- "single_word": false,
116
- "special": true
117
- }
118
- },
119
- "bos_token": "<s>",
120
- "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
121
- "clean_up_tokenization_spaces": false,
122
- "eos_token": "<|endoftext|>",
123
- "legacy": false,
124
- "model_max_length": 2048,
125
- "pad_token": "<unk>",
126
- "padding_side": "right",
127
- "sp_model_kwargs": {},
128
- "spaces_between_special_tokens": false,
129
- "tokenizer_class": "LlamaTokenizer",
130
- "unk_token": "<unk>",
131
- "use_default_system_prompt": false
132
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/trainer_state.json DELETED
The diff for this file is too large to render. See raw diff
 
CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.1_RL0.01_competesmoev30/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:5dca0d87fd4cf21dba2781d9ed4ca6c420f1f15440dc50ff1c08e99716f599d4
3
- size 7992
 
 
 
 
CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/added_tokens.json DELETED
@@ -1,13 +0,0 @@
1
- {
2
- "<|assistant|>": 32001,
3
- "<|endoftext|>": 32000,
4
- "<|end|>": 32007,
5
- "<|placeholder1|>": 32002,
6
- "<|placeholder2|>": 32003,
7
- "<|placeholder3|>": 32004,
8
- "<|placeholder4|>": 32005,
9
- "<|placeholder5|>": 32008,
10
- "<|placeholder6|>": 32009,
11
- "<|system|>": 32006,
12
- "<|user|>": 32010
13
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/config.json DELETED
@@ -1,199 +0,0 @@
1
- {
2
- "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft",
3
- "architectures": [
4
- "LlavaPhiForCausalLM"
5
- ],
6
- "attention_bias": false,
7
- "attention_dropout": 0.0,
8
- "auto_map": {
9
- "AutoConfig": "configuration_phi3.Phi3Config",
10
- "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
11
- },
12
- "balance_loss_coef": 0.01,
13
- "bos_token_id": 1,
14
- "clip_smoe": true,
15
- "diversity_loss_coef": 0.01,
16
- "dropout": false,
17
- "e_loss_coef": 0.001,
18
- "embd_pdrop": 0.0,
19
- "entropy_advance_loss": false,
20
- "eos_token_id": 32000,
21
- "freeze_backbone": false,
22
- "freeze_mm_mlp_adapter": false,
23
- "hidden_act": "silu",
24
- "hidden_size": 3072,
25
- "hybrid": true,
26
- "image_aspect_ratio": "pad",
27
- "init_weight": true,
28
- "initializer_range": 0.02,
29
- "intermediate_size": 8192,
30
- "is_cosine": true,
31
- "is_norm_weight": false,
32
- "local_rank": 0,
33
- "loss1": "balanceloss",
34
- "loss2": "zloss",
35
- "luna": false,
36
- "max_compete_in_iter": 8,
37
- "max_position_embeddings": 131072,
38
- "mlp_smoe": true,
39
- "mm_hidden_size": 1152,
40
- "mm_patch_merge_type": "flat",
41
- "mm_projector_lr": null,
42
- "mm_projector_type": "moe",
43
- "mm_use_im_patch_token": false,
44
- "mm_use_im_start_end": false,
45
- "mm_vision_select_feature": "patch",
46
- "mm_vision_select_layer": -2,
47
- "mm_vision_tower": "google/siglip-so400m-patch14-224",
48
- "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft",
49
- "model_type": "llava_phi",
50
- "moe_name": "competesmoev30",
51
- "norm_softmax": false,
52
- "normalization": true,
53
- "num_attention_heads": 32,
54
- "num_experts": 4,
55
- "num_hidden_layers": 32,
56
- "num_key_value_heads": 32,
57
- "num_layers": 3,
58
- "num_selected": 2,
59
- "number_of_previous_tokens": 2,
60
- "original_max_position_embeddings": 4096,
61
- "pad_token_id": 32000,
62
- "pretrain_mm_mlp_adapter": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/mm_projector.bin",
63
- "rate_compete": 0.2,
64
- "rate_flip": 0.07,
65
- "resid_pdrop": 0.0,
66
- "rms_norm_eps": 1e-05,
67
- "rope_scaling": {
68
- "long_factor": [
69
- 1.0800000429153442,
70
- 1.1100000143051147,
71
- 1.1399999856948853,
72
- 1.340000033378601,
73
- 1.5899999141693115,
74
- 1.600000023841858,
75
- 1.6200000047683716,
76
- 2.620000123977661,
77
- 3.2300000190734863,
78
- 3.2300000190734863,
79
- 4.789999961853027,
80
- 7.400000095367432,
81
- 7.700000286102295,
82
- 9.09000015258789,
83
- 12.199999809265137,
84
- 17.670000076293945,
85
- 24.46000099182129,
86
- 28.57000160217285,
87
- 30.420001983642578,
88
- 30.840002059936523,
89
- 32.590003967285156,
90
- 32.93000411987305,
91
- 42.320003509521484,
92
- 44.96000289916992,
93
- 50.340003967285156,
94
- 50.45000457763672,
95
- 57.55000305175781,
96
- 57.93000411987305,
97
- 58.21000289916992,
98
- 60.1400032043457,
99
- 62.61000442504883,
100
- 62.62000274658203,
101
- 62.71000289916992,
102
- 63.1400032043457,
103
- 63.1400032043457,
104
- 63.77000427246094,
105
- 63.93000411987305,
106
- 63.96000289916992,
107
- 63.970001220703125,
108
- 64.02999877929688,
109
- 64.06999969482422,
110
- 64.08000183105469,
111
- 64.12000274658203,
112
- 64.41000366210938,
113
- 64.4800033569336,
114
- 64.51000213623047,
115
- 64.52999877929688,
116
- 64.83999633789062
117
- ],
118
- "short_factor": [
119
- 1.0,
120
- 1.0199999809265137,
121
- 1.0299999713897705,
122
- 1.0299999713897705,
123
- 1.0499999523162842,
124
- 1.0499999523162842,
125
- 1.0499999523162842,
126
- 1.0499999523162842,
127
- 1.0499999523162842,
128
- 1.0699999332427979,
129
- 1.0999999046325684,
130
- 1.1099998950958252,
131
- 1.1599998474121094,
132
- 1.1599998474121094,
133
- 1.1699998378753662,
134
- 1.2899998426437378,
135
- 1.339999794960022,
136
- 1.679999828338623,
137
- 1.7899998426437378,
138
- 1.8199998140335083,
139
- 1.8499997854232788,
140
- 1.8799997568130493,
141
- 1.9099997282028198,
142
- 1.9399996995925903,
143
- 1.9899996519088745,
144
- 2.0199997425079346,
145
- 2.0199997425079346,
146
- 2.0199997425079346,
147
- 2.0199997425079346,
148
- 2.0199997425079346,
149
- 2.0199997425079346,
150
- 2.0299997329711914,
151
- 2.0299997329711914,
152
- 2.0299997329711914,
153
- 2.0299997329711914,
154
- 2.0299997329711914,
155
- 2.0299997329711914,
156
- 2.0299997329711914,
157
- 2.0299997329711914,
158
- 2.0299997329711914,
159
- 2.0799996852874756,
160
- 2.0899996757507324,
161
- 2.189999580383301,
162
- 2.2199995517730713,
163
- 2.5899994373321533,
164
- 2.729999542236328,
165
- 2.749999523162842,
166
- 2.8399994373321533
167
- ],
168
- "type": "longrope"
169
- },
170
- "rope_theta": 10000.0,
171
- "router_loss_coef": 0.01,
172
- "router_theta": 0.2,
173
- "router_z_loss_coef": 0.001,
174
- "scales": [
175
- 1,
176
- 3
177
- ],
178
- "sliding_window": 262144,
179
- "sparse_upcycling": true,
180
- "strategy_train": "base",
181
- "tie_word_embeddings": false,
182
- "tokenizer_model_max_length": 2048,
183
- "tokenizer_padding_side": "right",
184
- "topk_max": 2,
185
- "topk_min": 1,
186
- "torch_dtype": "bfloat16",
187
- "training": true,
188
- "transformers_version": "4.43.0",
189
- "tune_mm_mlp_adapter": false,
190
- "unit_test": true,
191
- "use_cache": true,
192
- "use_mm_proj": true,
193
- "use_old": false,
194
- "version": "phi35",
195
- "vision_tower": "google/siglip-so400m-patch14-224",
196
- "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin",
197
- "vocab_size": 32064,
198
- "warm_up": 0.05
199
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/generation_config.json DELETED
@@ -1,12 +0,0 @@
1
- {
2
- "_from_model_config": true,
3
- "bos_token_id": 1,
4
- "do_sample": true,
5
- "eos_token_id": [
6
- 32007,
7
- 32001,
8
- 32000
9
- ],
10
- "pad_token_id": 32000,
11
- "transformers_version": "4.43.0"
12
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/model-00001-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b32151dd7864a4e3b06c13abed98d80bf53b2e00d56ec62510b40392f2c9d41b
3
- size 4972489328
 
 
 
 
CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/model-00002-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c517446e85d8d0b677a7975a793431529fa4701e4d1ae249f4922758a06a8ad9
3
- size 4985976068
 
 
 
 
CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/model-00003-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d87f7abcc16d06b627b721f9f8d1d1eb53b2f639b4881f008487d4b71efe3d0e
3
- size 248943552
 
 
 
 
CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/model.safetensors.index.json DELETED
The diff for this file is too large to render. See raw diff
 
CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/special_tokens_map.json DELETED
@@ -1,24 +0,0 @@
1
- {
2
- "bos_token": {
3
- "content": "<s>",
4
- "lstrip": false,
5
- "normalized": false,
6
- "rstrip": false,
7
- "single_word": false
8
- },
9
- "eos_token": {
10
- "content": "<|endoftext|>",
11
- "lstrip": false,
12
- "normalized": false,
13
- "rstrip": false,
14
- "single_word": false
15
- },
16
- "pad_token": "<unk>",
17
- "unk_token": {
18
- "content": "<unk>",
19
- "lstrip": false,
20
- "normalized": false,
21
- "rstrip": false,
22
- "single_word": false
23
- }
24
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/tokenizer.model DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
- size 499723
 
 
 
 
CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/tokenizer_config.json DELETED
@@ -1,132 +0,0 @@
1
- {
2
- "add_bos_token": false,
3
- "add_eos_token": false,
4
- "add_prefix_space": true,
5
- "added_tokens_decoder": {
6
- "0": {
7
- "content": "<unk>",
8
- "lstrip": false,
9
- "normalized": false,
10
- "rstrip": false,
11
- "single_word": false,
12
- "special": true
13
- },
14
- "1": {
15
- "content": "<s>",
16
- "lstrip": false,
17
- "normalized": false,
18
- "rstrip": false,
19
- "single_word": false,
20
- "special": true
21
- },
22
- "2": {
23
- "content": "</s>",
24
- "lstrip": false,
25
- "normalized": false,
26
- "rstrip": true,
27
- "single_word": false,
28
- "special": false
29
- },
30
- "32000": {
31
- "content": "<|endoftext|>",
32
- "lstrip": false,
33
- "normalized": false,
34
- "rstrip": false,
35
- "single_word": false,
36
- "special": true
37
- },
38
- "32001": {
39
- "content": "<|assistant|>",
40
- "lstrip": false,
41
- "normalized": false,
42
- "rstrip": true,
43
- "single_word": false,
44
- "special": true
45
- },
46
- "32002": {
47
- "content": "<|placeholder1|>",
48
- "lstrip": false,
49
- "normalized": false,
50
- "rstrip": true,
51
- "single_word": false,
52
- "special": true
53
- },
54
- "32003": {
55
- "content": "<|placeholder2|>",
56
- "lstrip": false,
57
- "normalized": false,
58
- "rstrip": true,
59
- "single_word": false,
60
- "special": true
61
- },
62
- "32004": {
63
- "content": "<|placeholder3|>",
64
- "lstrip": false,
65
- "normalized": false,
66
- "rstrip": true,
67
- "single_word": false,
68
- "special": true
69
- },
70
- "32005": {
71
- "content": "<|placeholder4|>",
72
- "lstrip": false,
73
- "normalized": false,
74
- "rstrip": true,
75
- "single_word": false,
76
- "special": true
77
- },
78
- "32006": {
79
- "content": "<|system|>",
80
- "lstrip": false,
81
- "normalized": false,
82
- "rstrip": true,
83
- "single_word": false,
84
- "special": true
85
- },
86
- "32007": {
87
- "content": "<|end|>",
88
- "lstrip": false,
89
- "normalized": false,
90
- "rstrip": true,
91
- "single_word": false,
92
- "special": true
93
- },
94
- "32008": {
95
- "content": "<|placeholder5|>",
96
- "lstrip": false,
97
- "normalized": false,
98
- "rstrip": true,
99
- "single_word": false,
100
- "special": true
101
- },
102
- "32009": {
103
- "content": "<|placeholder6|>",
104
- "lstrip": false,
105
- "normalized": false,
106
- "rstrip": true,
107
- "single_word": false,
108
- "special": true
109
- },
110
- "32010": {
111
- "content": "<|user|>",
112
- "lstrip": false,
113
- "normalized": false,
114
- "rstrip": true,
115
- "single_word": false,
116
- "special": true
117
- }
118
- },
119
- "bos_token": "<s>",
120
- "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
121
- "clean_up_tokenization_spaces": false,
122
- "eos_token": "<|endoftext|>",
123
- "legacy": false,
124
- "model_max_length": 2048,
125
- "pad_token": "<unk>",
126
- "padding_side": "right",
127
- "sp_model_kwargs": {},
128
- "spaces_between_special_tokens": false,
129
- "tokenizer_class": "LlamaTokenizer",
130
- "unk_token": "<unk>",
131
- "use_default_system_prompt": false
132
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/trainer_state.json DELETED
The diff for this file is too large to render. See raw diff
 
CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_competesmoev30/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3d184055355e29fdb6a50c07848c245cc1a9210d8493f4fbf355458519a5ea64
3
- size 7992
 
 
 
 
CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_softmax_competesmoev30/added_tokens.json DELETED
@@ -1,13 +0,0 @@
1
- {
2
- "<|assistant|>": 32001,
3
- "<|endoftext|>": 32000,
4
- "<|end|>": 32007,
5
- "<|placeholder1|>": 32002,
6
- "<|placeholder2|>": 32003,
7
- "<|placeholder3|>": 32004,
8
- "<|placeholder4|>": 32005,
9
- "<|placeholder5|>": 32008,
10
- "<|placeholder6|>": 32009,
11
- "<|system|>": 32006,
12
- "<|user|>": 32010
13
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
CompeteSMoE/competesmoe_versions/Full_CS_BS_theta0.2_RL0.01_softmax_competesmoev30/config.json DELETED
@@ -1,199 +0,0 @@
1
- {
2
- "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft",
3
- "architectures": [
4
- "LlavaPhiForCausalLM"
5
- ],
6
- "attention_bias": false,
7
- "attention_dropout": 0.0,
8
- "auto_map": {
9
- "AutoConfig": "configuration_phi3.Phi3Config",
10
- "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
11
- },
12
- "balance_loss_coef": 0.01,
13
- "bos_token_id": 1,
14
- "clip_smoe": true,
15
- "diversity_loss_coef": 0.01,
16
- "dropout": false,
17
- "e_loss_coef": 0.001,
18
- "embd_pdrop": 0.0,
19
- "entropy_advance_loss": false,
20
- "eos_token_id": 32000,
21
- "freeze_backbone": false,
22
- "freeze_mm_mlp_adapter": false,
23
- "hidden_act": "silu",
24
- "hidden_size": 3072,
25
- "hybrid": true,
26
- "image_aspect_ratio": "pad",
27
- "init_weight": true,
28
- "initializer_range": 0.02,
29
- "intermediate_size": 8192,
30
- "is_cosine": true,
31
- "is_norm_weight": false,
32
- "local_rank": 0,
33
- "loss1": "balanceloss",
34
- "loss2": "zloss",
35
- "luna": false,
36
- "max_compete_in_iter": 8,
37
- "max_position_embeddings": 131072,
38
- "mlp_smoe": true,
39
- "mm_hidden_size": 1152,
40
- "mm_patch_merge_type": "flat",
41
- "mm_projector_lr": null,
42
- "mm_projector_type": "moe",
43
- "mm_use_im_patch_token": false,
44
- "mm_use_im_start_end": false,
45
- "mm_vision_select_feature": "patch",
46
- "mm_vision_select_layer": -2,
47
- "mm_vision_tower": "google/siglip-so400m-patch14-224",
48
- "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft",
49
- "model_type": "llava_phi",
50
- "moe_name": "competesmoev30",
51
- "norm_softmax": true,
52
- "normalization": true,
53
- "num_attention_heads": 32,
54
- "num_experts": 4,
55
- "num_hidden_layers": 32,
56
- "num_key_value_heads": 32,
57
- "num_layers": 3,
58
- "num_selected": 2,
59
- "number_of_previous_tokens": 2,
60
- "original_max_position_embeddings": 4096,
61
- "pad_token_id": 32000,
62
- "pretrain_mm_mlp_adapter": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/mm_projector.bin",
63
- "rate_compete": 0.2,
64
- "rate_flip": 0.07,
65
- "resid_pdrop": 0.0,
66
- "rms_norm_eps": 1e-05,
67
- "rope_scaling": {
68
- "long_factor": [
69
- 1.0800000429153442,
70
- 1.1100000143051147,
71
- 1.1399999856948853,
72
- 1.340000033378601,
73
- 1.5899999141693115,
74
- 1.600000023841858,
75
- 1.6200000047683716,
76
- 2.620000123977661,
77
- 3.2300000190734863,
78
- 3.2300000190734863,
79
- 4.789999961853027,
80
- 7.400000095367432,
81
- 7.700000286102295,
82
- 9.09000015258789,
83
- 12.199999809265137,
84
- 17.670000076293945,
85
- 24.46000099182129,
86
- 28.57000160217285,
87
- 30.420001983642578,
88
- 30.840002059936523,
89
- 32.590003967285156,
90
- 32.93000411987305,
91
- 42.320003509521484,
92
- 44.96000289916992,
93
- 50.340003967285156,
94
- 50.45000457763672,
95
- 57.55000305175781,
96
- 57.93000411987305,
97
- 58.21000289916992,
98
- 60.1400032043457,
99
- 62.61000442504883,
100
- 62.62000274658203,
101
- 62.71000289916992,
102
- 63.1400032043457,
103
- 63.1400032043457,
104
- 63.77000427246094,
105
- 63.93000411987305,
106
- 63.96000289916992,
107
- 63.970001220703125,
108
- 64.02999877929688,
109
- 64.06999969482422,
110
- 64.08000183105469,
111
- 64.12000274658203,
112
- 64.41000366210938,
113
- 64.4800033569336,
114
- 64.51000213623047,
115
- 64.52999877929688,
116
- 64.83999633789062
117
- ],
118
- "short_factor": [
119
- 1.0,
120
- 1.0199999809265137,
121
- 1.0299999713897705,
122
- 1.0299999713897705,
123
- 1.0499999523162842,
124
- 1.0499999523162842,
125
- 1.0499999523162842,
126
- 1.0499999523162842,
127
- 1.0499999523162842,
128
- 1.0699999332427979,
129
- 1.0999999046325684,
130
- 1.1099998950958252,
131
- 1.1599998474121094,
132
- 1.1599998474121094,
133
- 1.1699998378753662,
134
- 1.2899998426437378,
135
- 1.339999794960022,
136
- 1.679999828338623,
137
- 1.7899998426437378,
138
- 1.8199998140335083,
139
- 1.8499997854232788,
140
- 1.8799997568130493,
141
- 1.9099997282028198,
142
- 1.9399996995925903,
143
- 1.9899996519088745,
144
- 2.0199997425079346,
145
- 2.0199997425079346,
146
- 2.0199997425079346,
147
- 2.0199997425079346,
148
- 2.0199997425079346,
149
- 2.0199997425079346,
150
- 2.0299997329711914,
151
- 2.0299997329711914,
152
- 2.0299997329711914,
153
- 2.0299997329711914,
154
- 2.0299997329711914,
155
- 2.0299997329711914,
156
- 2.0299997329711914,
157
- 2.0299997329711914,
158
- 2.0299997329711914,
159
- 2.0799996852874756,
160
- 2.0899996757507324,
161
- 2.189999580383301,
162
- 2.2199995517730713,
163
- 2.5899994373321533,
164
- 2.729999542236328,
165
- 2.749999523162842,
166
- 2.8399994373321533
167
- ],
168
- "type": "longrope"
169
- },
170
- "rope_theta": 10000.0,
171
- "router_loss_coef": 0.01,
172
- "router_theta": 0.2,
173
- "router_z_loss_coef": 0.001,
174
- "scales": [
175
- 1,
176
- 3
177
- ],
178
- "sliding_window": 262144,
179
- "sparse_upcycling": true,
180
- "strategy_train": "base",
181
- "tie_word_embeddings": false,
182
- "tokenizer_model_max_length": 2048,
183
- "tokenizer_padding_side": "right",
184
- "topk_max": 2,
185
- "topk_min": 1,
186
- "torch_dtype": "bfloat16",
187
- "training": true,
188
- "transformers_version": "4.43.0",
189
- "tune_mm_mlp_adapter": false,
190
- "unit_test": true,
191
- "use_cache": true,
192
- "use_mm_proj": true,
193
- "use_old": false,
194
- "version": "phi35",
195
- "vision_tower": "google/siglip-so400m-patch14-224",
196
- "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin",
197
- "vocab_size": 32064,
198
- "warm_up": 0.05
199
- }