|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from typing import Optional, Union |
|
|
|
|
|
from ...layers import MoeConfig |
|
|
from ..modeling_utils import PretrainedConfig |
|
|
|
|
|
|
|
|
class DbrxConfig(PretrainedConfig):
    """Configuration for the DBRX model architecture.

    Extends :class:`PretrainedConfig` with DBRX-specific options:
    attention/linear bias, QKV clipping, rotary-embedding settings, and a
    mixture-of-experts (MoE) configuration.
    """

    def __init__(self,
                 *,
                 bias: bool = False,
                 clip_qkv: Optional[float] = None,
                 rotary_base: float = 500000.0,
                 rotary_scaling: Optional[dict] = None,
                 moe: Optional[Union[MoeConfig, dict]] = None,
                 **kwargs):
        """Initialize a DbrxConfig.

        Args:
            bias: Whether linear layers include a bias term.
            clip_qkv: If set, magnitude used to clip the QKV projection
                outputs; ``None`` disables clipping.
            rotary_base: Base (theta) for rotary position embeddings.
            rotary_scaling: Optional rotary-embedding scaling spec
                (presumably a RoPE-scaling dict — stored as-is).
            moe: MoE configuration, given either as a ``MoeConfig``, a
                dict accepted by ``MoeConfig.from_dict``, or ``None``.
                When ``None``, a ``MoeConfig`` is built from the legacy
                flat kwargs ``moe_num_experts``, ``moe_top_k`` and
                ``moe_normalization_mode``.
            **kwargs: Remaining options forwarded to ``PretrainedConfig``.

        Raises:
            TypeError: If ``moe`` is not a ``MoeConfig``, dict, or ``None``.
        """
        self.bias = bias
        self.clip_qkv = clip_qkv
        self.rotary_base = rotary_base
        self.rotary_scaling = rotary_scaling
        if moe is None:
            # Legacy path: assemble the MoE config from flat kwargs,
            # popping them so they are not forwarded to the base class.
            moe = MoeConfig(
                num_experts=kwargs.pop('moe_num_experts', 0),
                top_k=kwargs.pop('moe_top_k', 0),
                normalization_mode=kwargs.pop(
                    'moe_normalization_mode',
                    MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE))
        elif isinstance(moe, dict):
            moe = MoeConfig.from_dict(moe)
        # Explicit raise instead of `assert`: asserts are stripped under
        # `python -O`, which would silently skip this validation and fail
        # later with a confusing AttributeError on `.validate()`.
        if not isinstance(moe, MoeConfig):
            raise TypeError(
                f"moe must be a MoeConfig, dict or None, got {type(moe).__name__}"
            )
        self.moe = moe.validate()

        super().__init__(**kwargs)

    def to_dict(self):
        """Serialize this config to a plain dict, including DBRX fields."""
        output = super().to_dict()

        # DBRX-specific fields are not handled by the base serializer.
        output['bias'] = self.bias
        output['clip_qkv'] = self.clip_qkv
        output['rotary_base'] = self.rotary_base
        output['rotary_scaling'] = self.rotary_scaling
        output['moe'] = self.moe.to_dict()
        return output
|
|
|