import logging
from typing import Optional

from transformers import PretrainedConfig

logger = logging.getLogger(__name__)


class SmalLmConfig(PretrainedConfig):
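    """Configuration for the SmalLm model.

    Holds the hyperparameters used to instantiate the model: transformer
    width and depth (`hidden_size`, `num_hidden_layers`, `intermediate_size`),
    attention layout (`num_attention_heads`, `num_kv_heads`, optional
    sliding-window attention, RoPE or ALiBi positional bias), optional
    mixture-of-experts settings (`use_moe`, `routed_experts`, ...), and
    vocabulary/special-token ids. Unspecified arguments fall back to the
    defaults below; extra keyword arguments are forwarded to
    `transformers.PretrainedConfig`.
    """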
    model_type = "smallm"

    def __init__(
        self,
        hidden_size: int = 512,
        intermediate_size: int = 2048,
        mlp_bias: bool = False,
        num_hidden_layers: int = 27,
        rms_norm_eps: float = 1e-6,
        rms_affine: bool = False,
        initializer_range: float = 0.02,
        output_hidden_states: bool = False,
        output_attentions: bool = False,
        use_cache: bool = True,
        sliding_window_attention: bool = True,
        sliding_window_context: int = 1024,
        sliding_window_period: int = 4,
        embedding_dropout: float = 0.0,
        layer_dropout: float = 0.1,
        max_seq_len: int = 2048,
        original_seq_len: Optional[int] = None,
        tie_word_embeddings: bool = True,
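        # attention / positional bias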
        num_attention_heads: int = 9,
        num_kv_heads: int = 3,
        head_size: Optional[int] = None,
        attention_dropout: float = 0.1,
        positional_bias_type: str = "rope",
        high_rotations: int = 32,
        low_rotations: int = 1,
        attention_bias: bool = False,
        rope_base: int = 100000,
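        # mixture-of-experts (MoE)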
        use_moe: bool = True,
        moe_period: int = 3,
        expert_size: int = 256,
        shared_experts: int = 2,
        routed_experts: int = 16,
        token_experts: int = 4,
        noisy_experts: bool = False,
        moe_bias: bool = False,
        balancing_coef: float = 1e-4,
        no_moe_layers: int = 5,
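        # vocabulary, special tokens, and other settings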
        vocab_size: int = 60000,
        bos_token_id: int = 1,
        eos_token_id: int = 0,
        pad_token_id: int = 0,
        static_residual: bool = False,
        **kwargs,
    ):
        if positional_bias_type not in ["alibi", "rope"]:
            raise ValueError(
                f"positional_bias_type must be 'alibi' or 'rope', got {positional_bias_type}"
            )
        self.static_residual = static_residual
        self.no_moe_layers = no_moe_layers
        self.moe_bias = moe_bias
        self.balancing_coef = balancing_coef
        self.noisy_experts = noisy_experts
        self.high_rotations = high_rotations
        self.low_rotations = low_rotations
        self.positional_bias_type = positional_bias_type
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.mlp_bias = mlp_bias
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_kv_heads = num_kv_heads
        self.attention_dropout = attention_dropout
        self.rms_norm_eps = rms_norm_eps
        self.max_seq_len = max_seq_len
        self.use_cache = use_cache
        self.initializer_range = initializer_range
        self.embedding_dropout = embedding_dropout
        self.rms_affine = rms_affine
        self.output_hidden_states = output_hidden_states
        self.output_attentions = output_attentions
        self.layer_dropout = layer_dropout
        self.use_moe = use_moe
        self.moe_period = moe_period
        self.expert_size = expert_size
        self.shared_experts = shared_experts
        self.routed_experts = routed_experts
        self.token_experts = token_experts
        self.intermediate_size = intermediate_size
        self.attention_bias = attention_bias
        self.rope_base = rope_base
        self.head_size = head_size if head_size else hidden_size // num_attention_heads
        self.original_seq_len = (
            original_seq_len if original_seq_len is not None else max_seq_len
        )
        self.sliding_window_attention = sliding_window_attention
        self.sliding_window_context = sliding_window_context
        self.sliding_window_period = sliding_window_period
        if sliding_window_attention and sliding_window_context > max_seq_len:
            logger.warning(
                "sliding_window_context is larger than max_seq_len; "
                f"setting sliding_window_context to {max_seq_len}"
            )
            self.sliding_window_context = max_seq_len
        if not sliding_window_attention:
            # Without sliding-window attention the window spans the full context.
            self.sliding_window_context = max_seq_len
        if self.head_size % 2 != 0 and self.positional_bias_type == "rope":
            raise ValueError(
                "head_size must be even when positional_bias_type is 'rope'"
            )
        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


__all__ = ["SmalLmConfig"]
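

# A minimal usage sketch (assumptions: run as a script; the directory name below
# is hypothetical). It instantiates the config, round-trips it through the
# `save_pretrained` / `from_pretrained` methods inherited from PretrainedConfig,
# and prints the derived head size (hidden_size // num_attention_heads by default).
if __name__ == "__main__":
    config = SmalLmConfig()
    config.save_pretrained("./smallm_config")  # writes config.json
    reloaded = SmalLmConfig.from_pretrained("./smallm_config")
    print(reloaded.head_size)  # 512 // 9 == 56 with the defaults above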