import logging
from typing import Optional

from transformers import PretrainedConfig

logger = logging.getLogger(__name__)


class SmalLmConfig(PretrainedConfig):
"""
Base config for all SmalLm models
Raises:
ValueError: Positional_bias_type must be in suported types
ValueError: In case of rope positional_bias_type head_size can't be anything
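
    Example (illustrative values, not from a released checkpoint):
        >>> config = SmalLmConfig(hidden_size=512, num_attention_heads=8, num_kv_heads=4)
        >>> config.head_size  # derived as hidden_size // num_attention_heads
        64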
"""
model_type = "smallm"
def __init__(
        self,
        # global model params
        hidden_size: int = 512,
        intermediate_size: int = 2048,
        mlp_bias: bool = False,
        num_hidden_layers: int = 27,
        rms_norm_eps: float = 1e-6,
        rms_affine: bool = False,
        initializer_range: float = 0.02,
        output_hidden_states: bool = False,
        output_attentions: bool = False,
        use_cache: bool = True,
        sliding_window_attention: bool = True,
        sliding_window_context: int = 1024,
        sliding_window_period: int = 4,
        embedding_dropout: float = 0.0,
        layer_dropout: float = 0.1,
        max_seq_len: int = 2048,
        original_seq_len: Optional[int] = None,
        tie_word_embeddings: bool = True,
        # attention params
        num_attention_heads: int = 9,
        num_kv_heads: int = 3,
        head_size: Optional[int] = None,
        attention_dropout: float = 0.1,
        positional_bias_type: str = "rope",
        high_rotations: int = 32,
        low_rotations: int = 1,
        attention_bias: bool = False,
        rope_base: int = 100000,
        # MoE params
        use_moe: bool = True,
        moe_period: int = 3,
        expert_size: int = 256,
        shared_experts: int = 2,
        routed_experts: int = 16,
        token_experts: int = 4,
        noisy_experts: bool = False,
        moe_bias: bool = False,
        balancing_coef: float = 1e-4,
        no_moe_layers: int = 5,
        # extra params
        vocab_size: int = 60000,
        bos_token_id: int = 1,
        eos_token_id: int = 0,
        pad_token_id: int = 0,
        static_residual: bool = False,
        **kwargs,
    ):
        if positional_bias_type not in ["alibi", "rope"]:
            raise ValueError(
                f"positional_bias_type must be 'alibi' or 'rope', got {positional_bias_type}"
            )
        self.static_residual = static_residual
        self.no_moe_layers = no_moe_layers
        self.moe_bias = moe_bias
        self.balancing_coef = balancing_coef
        self.noisy_experts = noisy_experts
        self.high_rotations = high_rotations
        self.low_rotations = low_rotations
        self.positional_bias_type = positional_bias_type
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.mlp_bias = mlp_bias
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_kv_heads = num_kv_heads
        self.attention_dropout = attention_dropout
        self.rms_norm_eps = rms_norm_eps
        self.max_seq_len = max_seq_len
        self.use_cache = use_cache
        self.initializer_range = initializer_range
        self.embedding_dropout = embedding_dropout
        self.rms_affine = rms_affine
        self.output_hidden_states = output_hidden_states
        self.output_attentions = output_attentions
        self.layer_dropout = layer_dropout
        self.use_moe = use_moe
        self.moe_period = moe_period
        self.expert_size = expert_size
        self.shared_experts = shared_experts
        self.routed_experts = routed_experts
        self.token_experts = token_experts
        self.intermediate_size = intermediate_size
        self.attention_bias = attention_bias
        self.rope_base = rope_base
        self.head_size = (
            head_size if head_size is not None else hidden_size // num_attention_heads
        )
        self.original_seq_len = (
            original_seq_len if original_seq_len is not None else max_seq_len
        )
        self.sliding_window_attention = sliding_window_attention
        self.sliding_window_context = sliding_window_context
        self.sliding_window_period = sliding_window_period
        if sliding_window_attention and sliding_window_context > max_seq_len:
            logger.warning(
                f"sliding_window_context ({sliding_window_context}) is greater than "
                f"max_seq_len; setting sliding_window_context to {max_seq_len}"
            )
            self.sliding_window_context = max_seq_len
        if not sliding_window_attention:
            # Without sliding-window attention the effective window is the full sequence.
            self.sliding_window_context = max_seq_len
        if self.head_size % 2 != 0 and self.positional_bias_type == "rope":
            raise ValueError("head_size must be divisible by 2 when using rope")
        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


__all__ = ["SmalLmConfig"]
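

# Minimal usage sketch (illustrative, not part of the upstream API): builds a
# default config, prints a couple of derived values, and shows the rope
# head_size validation. The expected values in the comments follow the
# defaults declared above.
if __name__ == "__main__":
    cfg = SmalLmConfig()
    print(cfg.head_size)  # 512 // 9 = 56
    print(cfg.sliding_window_context)  # 1024 (below max_seq_len, so not clamped)

    try:
        # An odd head_size is incompatible with rope positional bias.
        SmalLmConfig(head_size=55, positional_bias_type="rope")
    except ValueError as err:
        print(err)  # head_size must be divisible by 2 when using rope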