remove strategies
Browse files
- myolmoe/modeling_myolmoe.py +1 -12
myolmoe/modeling_myolmoe.py
CHANGED
|
@@ -65,7 +65,6 @@ class OlmoeConfig(PretrainedConfig):
|
|
| 65 |
small_expert_intermediate_ratio=64,
|
| 66 |
small_expert_count=64,
|
| 67 |
small_expert_sparsity_coef=0.1,
|
| 68 |
-
small_expert_strategy="constant", # increment
|
| 69 |
max_small_expert_count=64,
|
| 70 |
**kwargs,
|
| 71 |
):
|
|
@@ -100,7 +99,6 @@ class OlmoeConfig(PretrainedConfig):
|
|
| 100 |
self.small_expert_intermediate_ratio = small_expert_intermediate_ratio
|
| 101 |
self.small_expert_count = small_expert_count
|
| 102 |
self.small_expert_sparsity_coef = small_expert_sparsity_coef
|
| 103 |
-
self.small_expert_strategy = small_expert_strategy
|
| 104 |
self.max_small_expert_count = max_small_expert_count
|
| 105 |
|
| 106 |
# Validate the correctness of rotary position embeddings parameters
|
|
@@ -565,16 +563,7 @@ class OlmoeSparseMoeBlock(nn.Module):
|
|
| 565 |
if in_second_half:
|
| 566 |
second_half_idx = layer_idx - (self.total_layers // 2)
|
| 567 |
num_second_half_blocks = self.total_layers - (self.total_layers // 2)
|
| 568 |
-
|
| 569 |
-
if config.small_expert_strategy == "constant":
|
| 570 |
-
self.num_small_experts = config.max_small_expert_count // num_second_half_blocks
|
| 571 |
-
elif config.small_expert_strategy == "increment":
|
| 572 |
-
# Linearly scale small experts from 1 to max_small_expert_count
|
| 573 |
-
self.num_small_experts = (
|
| 574 |
-
(second_half_idx + 1) * config.max_small_expert_count // ((num_second_half_blocks * (num_second_half_blocks + 1)) // 2)
|
| 575 |
-
)
|
| 576 |
-
else:
|
| 577 |
-
raise ValueError(f"Unknown strategy: {config.small_expert_strategy}")
|
| 578 |
else:
|
| 579 |
self.num_small_experts = 0
|
| 580 |
|
|
|
|
| 65 |
small_expert_intermediate_ratio=64,
|
| 66 |
small_expert_count=64,
|
| 67 |
small_expert_sparsity_coef=0.1,
|
|
|
|
| 68 |
max_small_expert_count=64,
|
| 69 |
**kwargs,
|
| 70 |
):
|
|
|
|
| 99 |
self.small_expert_intermediate_ratio = small_expert_intermediate_ratio
|
| 100 |
self.small_expert_count = small_expert_count
|
| 101 |
self.small_expert_sparsity_coef = small_expert_sparsity_coef
|
|
|
|
| 102 |
self.max_small_expert_count = max_small_expert_count
|
| 103 |
|
| 104 |
# Validate the correctness of rotary position embeddings parameters
|
|
|
|
| 563 |
if in_second_half:
|
| 564 |
second_half_idx = layer_idx - (self.total_layers // 2)
|
| 565 |
num_second_half_blocks = self.total_layers - (self.total_layers // 2)
|
| 566 |
+
self.num_small_experts = config.max_small_expert_count // num_second_half_blocks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 567 |
else:
|
| 568 |
self.num_small_experts = 0
|
| 569 |
|