from transformers import PretrainedConfig


class YingLongConfig(PretrainedConfig):
    """Configuration class for the YingLong model."""

    model_type = "yinglong"

    def __init__(
        self,
        bias=False,
        condense_ratio=1,
        haar_trans=True,
        haar_trans_inv=True,
        haar_trans_norm="backward",
        half_diff=False,
        intermediate_size=1024,
        n_embd=256,
        n_head=16,
        n_layer=6,
        n_query_groups=4,
        norm_eps=1e-5,
        org="Alibaba",
        patch_size=32,
        rope_base=10000,
        rotary_percentage=1.0,
        shared_attention_norm=False,
        unet=True,
        _mlp_class="LLaMAMLP",
        _norm_class="FusedRMSNorm",
        **kwargs,
    ):
        self.org = org
        self.patch_size = patch_size
        self.unet = unet

        # Transformer backbone dimensions.
        self.n_embd = n_embd
        self.intermediate_size = intermediate_size
        self.n_head = n_head
        self.n_layer = n_layer
        self.n_query_groups = n_query_groups
        self.norm_eps = norm_eps
        self.bias = bias
        self.shared_attention_norm = shared_attention_norm

        # Rotary position embedding (RoPE) settings.
        self.condense_ratio = condense_ratio
        self.rope_base = rope_base
        self.rotary_percentage = rotary_percentage

        # Haar wavelet transform settings.
        self.haar_trans = haar_trans
        self.haar_trans_inv = haar_trans_inv
        self.haar_trans_norm = haar_trans_norm
        self.half_diff = half_diff

        # Implementation classes for the normalization and MLP layers.
        self._norm_class = _norm_class
        self._mlp_class = _mlp_class
        assert self.n_embd % self.n_head == 0, "n_embd must be divisible by n_head"
        assert (
            self.n_head % self.n_query_groups == 0
        ), "n_head must be divisible by n_query_groups"

        # Derived attention and RoPE sizes.
        self.head_size = self.n_embd // self.n_head
        self.rope_n_elem = int(self.rotary_percentage * self.head_size)
        self.rope_condense_ratio = self.condense_ratio

        super().__init__(**kwargs)
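

# A minimal usage sketch (not part of the original file): instantiate the
# config with its defaults and inspect the derived fields. It assumes only the
# `transformers` dependency imported above; the printed values follow directly
# from the defaults (head_size = 256 // 16 = 16, rope_n_elem = int(1.0 * 16) = 16).
if __name__ == "__main__":
    config = YingLongConfig()
    print(config.model_type)   # "yinglong"
    print(config.head_size)    # 16
    print(config.rope_n_elem)  # 16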