# MOE/configuration.py
from transformers import PretrainedConfig, PreTrainedModel, AutoConfig, AutoModelForCausalLM
from transformers.modeling_outputs import CausalLMOutputWithPast
from typing import List, Optional, Tuple
from torch import nn
import torch
import torch.nn.functional as F
import math
repo_name = "BeardedMonster/SabiYarn-125M"  # Hugging Face Hub repo for the pretrained checkpoint


class GPTJXMoEConfig(PretrainedConfig):
    """Configuration class for the SabiYarn model."""

    model_type = "sabiyarn"
    def __init__(
        self,
        block_size: int = 32768,
        vocab_size: int = 52050,  # SabiYarn tokenizer vocabulary size (GPT-2's is 50257)
        n_layer: int = 12,
        n_heads: int = 12,
        n_embd: int = 768,
        dropout: float = 0.0,
        max_batch_size: int = 1,
        use_kv_cache: bool = True,
        bias: bool = False,  # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
        kv_cache_dtype: str = "float32",  # "float32", or "float16" for memory savings
        # MoE hyperparameters (see the routing sketch after this class)
        use_moe: bool = False,  # whether to use MoE layers instead of the dense MLP
        num_experts: int = 4,  # number of experts in each MoE layer
        num_experts_per_tok: int = 2,  # number of experts each token is routed to (top-k)
        moe_dim: Optional[int] = None,  # MoE hidden dimension (defaults to 4 * n_embd, like the MLP)
        **kwargs
    ):
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.n_layer = n_layer
        self.n_heads = n_heads
        self.n_embd = n_embd
        self.dropout = dropout
        self.bias = bias
        self.use_kv_cache = use_kv_cache
        self.max_batch_size = max_batch_size
        self.kv_cache_dtype = kv_cache_dtype  # memory optimization: "float16" halves the KV-cache size

        # MoE configuration
        self.use_moe = use_moe
        self.num_experts = num_experts
        self.num_experts_per_tok = num_experts_per_tok
        # Default moe_dim to match the dense MLP expansion (4 * n_embd)
        self.moe_dim = moe_dim if moe_dim is not None else 4 * n_embd

        super().__init__(**kwargs)
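

# A minimal sketch of top-k expert routing, showing how num_experts and
# num_experts_per_tok are typically consumed by a Mixtral-style router. This is
# an illustration under that assumption: the actual SabiYarn MoE layer lives in
# the model file and may differ, and the helper name and `router` projection
# are hypothetical, not repo API.
def _example_topk_routing(
    x: torch.Tensor, router: nn.Linear, k: int
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Return (weights, indices) of the top-k experts for each token.

    x: (batch, seq, n_embd) hidden states; router: nn.Linear(n_embd, num_experts).
    """
    logits = router(x)  # (batch, seq, num_experts) router scores
    weights, indices = torch.topk(logits, k, dim=-1)  # keep the k best experts per token
    weights = F.softmax(weights, dim=-1)  # renormalize over the selected experts
    return weights, indices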
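

# Illustrative smoke test of the config defaults; a sketch that only touches
# documented PretrainedConfig API, not part of the repo's own test setup.
if __name__ == "__main__":
    cfg = GPTJXMoEConfig(use_moe=True, num_experts=8, num_experts_per_tok=2)
    assert cfg.moe_dim == 4 * cfg.n_embd  # default expansion matches the dense MLP
    cache_dtype = getattr(torch, cfg.kv_cache_dtype)  # resolve "float32"/"float16" to a torch dtype
    print(cache_dtype, cfg.to_json_string())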