import logging
from transformers import PretrainedConfig
from typing import Optional

logger = logging.getLogger(__name__)


class SmalLmConfig(PretrainedConfig):
    """
    Base config for all SmalLm models.

    Raises:
        ValueError: If positional_bias_type is not one of the supported types
            ('alibi' or 'rope').
        ValueError: If positional_bias_type is 'rope' and head_size is not
            divisible by 2.
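
    Example (illustrative usage sketch based on the defaults defined below):

        >>> config = SmalLmConfig()
        >>> config.head_size  # defaults to hidden_size // num_attention_heads
        56
        >>> config.original_seq_len  # falls back to max_seq_len
        2048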
    """
    model_type = "smallm"

    def __init__(
        self,
        # global model params
        hidden_size: int = 512,
        intermediate_size: int = 2048,
        mlp_bias: bool = False,
        num_hidden_layers: int = 27,
        rms_norm_eps: float = 1e-6,
        rms_affine: bool = False,
        initializer_range: float = 0.02,
        output_hidden_states: bool = False,
        output_attentions: bool = False,
        use_cache: bool = True,
        sliding_window_attention: bool = True,
        sliding_window_context: int = 1024,
        sliding_window_period: int = 4,
        embedding_dropout: float = 0.0,
        layer_dropout: float = 0.1,
        max_seq_len: int = 2048,
        original_seq_len: Optional[int] = None,
        tie_word_embeddings: bool = True,
        # attention params
        num_attention_heads: int = 9,
        num_kv_heads: int = 3,
        head_size: Optional[int] = None,
        attention_dropout: float = 0.1,
        positional_bias_type: str = "rope",
        high_rotations: int = 32,
        low_rotations: int = 1,
        attention_bias: bool = False,
        rope_base: int = 100000,
        # MoE params
        use_moe: bool = True,
        moe_period: int = 3,
        expert_size: int = 256,
        shared_experts: int = 2,
        routed_experts: int = 16,
        token_experts: int = 4,
        noisy_experts: bool = False,
        moe_bias: bool = False,
        balancing_coef: float = 1e-4,
        no_moe_layers: int = 5,
        # extra params
        vocab_size: int = 60000,
        bos_token_id: int = 1,
        eos_token_id: int = 0,
        pad_token_id: int = 0,
        static_residual: bool = False,
        **kwargs,
    ):
        if positional_bias_type not in ["alibi", "rope"]:
            raise ValueError(
                f"positional_bias_type must be 'alibi' or 'rope', got {positional_bias_type}"
            )
        self.static_residual = static_residual
        self.no_moe_layers = no_moe_layers
        self.moe_bias = moe_bias
        self.balancing_coef = balancing_coef
        self.noisy_experts = noisy_experts
        self.high_rotations = high_rotations
        self.low_rotations = low_rotations
        self.positional_bias_type = positional_bias_type
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.mlp_bias = mlp_bias
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_kv_heads = num_kv_heads
        self.attention_dropout = attention_dropout
        self.rms_norm_eps = rms_norm_eps
        self.max_seq_len = max_seq_len
        self.use_cache = use_cache
        self.initializer_range = initializer_range
        self.embedding_dropout = embedding_dropout
        self.rms_affine = rms_affine
        self.output_hidden_states = output_hidden_states
        self.output_attentions = output_attentions
        self.layer_dropout = layer_dropout
        self.use_moe = use_moe
        self.moe_period = moe_period
        self.expert_size = expert_size
        self.shared_experts = shared_experts
        self.routed_experts = routed_experts
        self.token_experts = token_experts
        self.intermediate_size = intermediate_size
        self.attention_bias = attention_bias
        self.rope_base = rope_base
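        # If head_size is not given, derive it from hidden_size and the number of
        # attention heads; original_seq_len falls back to max_seq_len when unset.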
        self.head_size = (
            head_size if head_size is not None else hidden_size // num_attention_heads
        )
        self.original_seq_len = (
            original_seq_len if original_seq_len is not None else max_seq_len
        )

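        # Sliding-window attention: clamp the window to max_seq_len; when the
        # feature is disabled, attend over the full context instead.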
        self.sliding_window_attention = sliding_window_attention
        self.sliding_window_context = sliding_window_context
        self.sliding_window_period = sliding_window_period
        if sliding_window_attention and sliding_window_context > max_seq_len:
            logger.warning(
                f"sliding_window_context ({sliding_window_context}) is greater than "
                f"max_seq_len ({max_seq_len}); setting sliding_window_context to {max_seq_len}"
            )
            self.sliding_window_context = max_seq_len
        if not sliding_window_attention:
            self.sliding_window_context = max_seq_len

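        # RoPE rotates features in pairs, so the head size must be even.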
        if self.head_size % 2 != 0 and self.positional_bias_type == "rope":
            raise ValueError("Head size should divided by 2")

        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


__all__ = ["SmalLmConfig"]