| |
|
| | from transformers import PretrainedConfig, PreTrainedModel, AutoConfig, AutoModelForCausalLM |
| | from transformers.modeling_outputs import CausalLMOutputWithPast |
| | from typing import List, Optional, Tuple |
| | from torch import nn |
| | import torch |
| | import torch.nn.functional as F |
| | import math |
| |
|
# Presumably the Hugging Face Hub repository id this config/model is published
# under (used with from_pretrained/push_to_hub) — TODO confirm against callers.
repo_name = "BeardedMonster/SabiYarn-125M"
| |
|
| |
|
class GPTJXMoEConfig(PretrainedConfig):
    """Configuration class for the SabiYarn (GPT-JX MoE) model.

    Stores the hyperparameters used to instantiate the model. Inherits from
    :class:`~transformers.PretrainedConfig`, so it can be serialized with
    ``save_pretrained`` / loaded with ``from_pretrained``.

    Args:
        block_size: Maximum sequence length (context window) in tokens.
        vocab_size: Size of the token vocabulary.
        n_layer: Number of transformer blocks.
        n_heads: Number of attention heads per block.
        n_embd: Embedding / hidden dimension.
        dropout: Dropout probability (0.0 disables dropout).
        max_batch_size: Batch-size capacity used when pre-allocating the
            KV cache.
        use_kv_cache: Whether to enable the key/value cache for generation.
        bias: Whether linear/norm layers include bias terms.
        kv_cache_dtype: Name of the torch dtype used for the KV cache
            (e.g. ``"float32"``).
        use_moe: Whether to replace dense MLPs with a Mixture-of-Experts layer.
        num_experts: Total number of experts in each MoE layer.
        num_experts_per_tok: Number of experts routed to per token (top-k).
        moe_dim: Hidden dimension of each expert's feed-forward network.
            Defaults to ``4 * n_embd`` when not given, matching the standard
            transformer MLP expansion factor.
        **kwargs: Forwarded to :class:`~transformers.PretrainedConfig`.
    """

    model_type = "sabiyarn"

    def __init__(
        self,
        block_size: int = 32768,
        vocab_size: int = 52050,
        n_layer: int = 12,
        n_heads: int = 12,
        n_embd: int = 768,
        dropout: float = 0.0,
        max_batch_size: int = 1,
        use_kv_cache: bool = True,
        bias: bool = False,
        kv_cache_dtype: str = "float32",
        # MoE-specific settings
        use_moe: bool = False,
        num_experts: int = 4,
        num_experts_per_tok: int = 2,
        moe_dim: Optional[int] = None,  # PEP 484: explicit Optional for a None default
        **kwargs,
    ):
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.n_layer = n_layer
        self.n_heads = n_heads
        self.n_embd = n_embd
        self.dropout = dropout
        self.bias = bias
        self.use_kv_cache = use_kv_cache
        self.max_batch_size = max_batch_size
        self.kv_cache_dtype = kv_cache_dtype

        self.use_moe = use_moe
        self.num_experts = num_experts
        self.num_experts_per_tok = num_experts_per_tok
        # Default expert FFN width to the conventional 4x hidden-size expansion.
        self.moe_dim = moe_dim if moe_dim is not None else (4 * n_embd)

        super().__init__(**kwargs)
| |
|
| |
|