|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from typing import Optional, Union |
|
|
|
|
|
from ...layers import MoeConfig |
|
|
from ..modeling_utils import PretrainedConfig |
|
|
|
|
|
|
|
|
class GPTConfig(PretrainedConfig):
    """Configuration for GPT-style decoder-only models.

    Extends :class:`PretrainedConfig` with GPT-specific options: attention
    bias and scaling, rotary position-embedding parameters, and an optional
    Mixture-of-Experts (MoE) configuration.
    """

    def __init__(self,
                 *,
                 bias: bool = True,
                 q_scaling: float = 1.0,
                 embedding_scale: Optional[float] = None,
                 apply_query_key_layer_scaling: bool = False,
                 rotary_pct: float = 1.0,
                 rotary_base: float = 10000.0,
                 rotary_scaling: Optional[dict] = None,
                 moe: Optional[Union[MoeConfig, dict]] = None,
                 **kwargs):
        """Build a GPT configuration.

        Args:
            bias: Whether linear layers include a bias term.
            q_scaling: Extra scaling factor applied to attention queries.
            embedding_scale: Optional multiplier applied to token embeddings;
                ``None`` means no scaling.
            apply_query_key_layer_scaling: Whether to apply per-layer
                query/key scaling in attention.
            rotary_pct: Fraction of the head dimension that uses rotary
                position embeddings.
            rotary_base: Base frequency for rotary position embeddings.
            rotary_scaling: Optional rotary-scaling spec dict; ``None``
                disables scaling.
            moe: MoE configuration as a :class:`MoeConfig`, a dict accepted
                by ``MoeConfig.from_dict``, or ``None``. When ``None``, a
                config is assembled from the legacy flat kwargs
                ``moe_num_experts``, ``moe_top_k`` and
                ``moe_normalization_mode`` (defaulting to no experts).
            **kwargs: Remaining options forwarded to
                ``PretrainedConfig.__init__``.

        Raises:
            TypeError: If ``moe`` is not ``None``, a ``dict`` or a
                ``MoeConfig``.
        """
        self.bias = bias
        self.q_scaling = q_scaling
        self.embedding_scale = embedding_scale
        self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
        self.rotary_pct = rotary_pct
        self.rotary_base = rotary_base
        self.rotary_scaling = rotary_scaling

        if moe is None:
            # Legacy path: assemble a MoeConfig from flat moe_* kwargs.
            # Pop them so they don't leak into PretrainedConfig.__init__.
            moe = MoeConfig(
                num_experts=kwargs.pop('moe_num_experts', 0),
                top_k=kwargs.pop('moe_top_k', 0),
                normalization_mode=kwargs.pop(
                    'moe_normalization_mode',
                    MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE))
        elif isinstance(moe, dict):
            moe = MoeConfig.from_dict(moe)
        # An `assert` here would be stripped under `python -O`, letting an
        # invalid `moe` reach `.validate()`; validate explicitly instead.
        if not isinstance(moe, MoeConfig):
            raise TypeError(
                f"moe must be None, a dict or a MoeConfig, got {type(moe)}")
        self.moe = moe.validate()

        super().__init__(**kwargs)

    def to_dict(self):
        """Serialize this configuration to a plain dict.

        Returns:
            dict: The base-class serialization augmented with the
            GPT-specific fields; ``moe`` is expanded via
            ``MoeConfig.to_dict``.
        """
        output = super().to_dict()
        # Overlay the GPT-specific fields on top of the base serialization.
        output.update(
            bias=self.bias,
            q_scaling=self.q_scaling,
            embedding_scale=self.embedding_scale,
            apply_query_key_layer_scaling=self.apply_query_key_layer_scaling,
            rotary_pct=self.rotary_pct,
            rotary_base=self.rotary_base,
            rotary_scaling=self.rotary_scaling,
            moe=self.moe.to_dict())
        return output
|
|
|