merterbak committed on
Commit
c4c5b1d
·
verified ·
1 Parent(s): 330c5ea

Upload folder using huggingface_hub

Browse files
config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "SeedForCausalLM"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_seed.SeedConfig",
7
+ "AutoModelForCausalLM": "modeling_seed.SeedForCausalLM"
8
+ },
9
+ "model_type": "seed",
10
+ "vocab_size": 64000,
11
+ "n_embd": 1024,
12
+ "n_layer": 28,
13
+ "n_head": 16,
14
+ "n_kv_head": 8,
15
+ "head_dim": 128,
16
+ "intermediate_size": 3072,
17
+ "block_size": 4096,
18
+ "bias": false,
19
+ "dropout": 0.0,
20
+ "rms_norm_eps": 1e-6,
21
+ "rope_theta": 1000000.0,
22
+ "rope_scaling_type": "none",
23
+ "rope_scaling_factor": 1.0,
24
+ "tie_word_embeddings": true,
25
+ "torch_dtype": "float32",
26
+ "transformers_version": "4.57.3"
27
+ }
configuration_seed.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PretrainedConfig
2
+
3
+
4
class SeedConfig(PretrainedConfig):
    """Configuration for the Seed causal language model.

    Args:
        vocab_size: Size of the token vocabulary.
        n_embd: Hidden (embedding) dimension.
        n_layer: Number of decoder layers.
        n_head: Number of attention (query) heads.
        n_kv_head: Number of key/value heads for grouped-query attention.
        head_dim: Per-head dimension.
        intermediate_size: Inner dimension of the SwiGLU feed-forward block.
        block_size: Maximum sequence length (context window).
        bias: Whether feed-forward linear layers use a bias term.
        dropout: Dropout probability (stored on the config; usage is up to the model).
        rope_theta: Base frequency for rotary position embeddings.
        rope_scaling_type: RoPE scaling strategy ("none", "yarn" or "ntk").
        rope_scaling_factor: Context-extension factor used by RoPE scaling.
        rms_norm_eps: Epsilon used by RMSNorm layers.
        tie_word_embeddings: Whether input and output embeddings share weights.
        **kwargs: Forwarded to ``PretrainedConfig``.
    """

    model_type = "seed"

    def __init__(
        self,
        vocab_size: int = 64000,
        n_embd: int = 1024,
        n_layer: int = 28,
        n_head: int = 16,
        n_kv_head: int = 8,
        head_dim: int = 128,
        intermediate_size: int = 3072,
        block_size: int = 4096,
        bias: bool = False,
        dropout: float = 0.0,
        rope_theta: float = 1000000.0,
        rope_scaling_type: str = "none",
        rope_scaling_factor: float = 1.0,
        rms_norm_eps: float = 1e-6,
        tie_word_embeddings: bool = True,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_kv_head = n_kv_head
        self.head_dim = head_dim
        self.intermediate_size = intermediate_size
        self.block_size = block_size
        self.bias = bias
        self.dropout = dropout
        self.rope_theta = rope_theta
        self.rope_scaling_type = rope_scaling_type
        self.rope_scaling_factor = rope_scaling_factor
        self.rms_norm_eps = rms_norm_eps

        # Transformers compatibility aliases
        # (standard attribute names expected by generic transformers utilities).
        self.hidden_size = n_embd
        self.num_hidden_layers = n_layer
        self.num_attention_heads = n_head
        self.num_key_value_heads = n_kv_head

        # Initialize the base class last, passing through remaining kwargs.
        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
generation_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 2,
3
+ "eos_token_id": 1,
4
+ "pad_token_id": 1,
5
+ "unk_token_id": 0,
6
+ "do_sample": true,
7
+ "temperature": 1.0,
8
+ "top_k": 20,
9
+ "top_p": 0.95,
10
+ "max_new_tokens": 256,
11
+ "transformers_version": "4.57.3"
12
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fb2e33c84f461a755662e675e3ac45531b0c07a0a77cc38b12f78d37cb451eb
3
+ size 2024045696
modeling_seed.py ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ from transformers import PreTrainedModel
6
+ from transformers.cache_utils import DynamicCache
7
+ from transformers.generation import GenerationMixin
8
+ from transformers.modeling_outputs import CausalLMOutputWithPast
9
+ from .configuration_seed import SeedConfig
10
+
11
+
12
class RMSNorm(nn.Module):
    """Root-mean-square layer normalization (no mean subtraction, no bias).

    Normalizes over the last dimension and applies a learned per-channel gain.
    """

    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.epsilon = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        # Compute the variance/rsqrt in float32 for numerical stability when
        # running in fp16/bf16, then cast back (standard HF RMSNorm behavior).
        # For float32 inputs this is identical to computing in-place.
        input_dtype = x.dtype
        x = x.to(torch.float32)
        x = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.epsilon)
        return x.to(input_dtype) * self.weight
22
+
23
+
24
class RoPEEmbedding(nn.Module):
    """Rotary position embedding with optional yarn/ntk base rescaling.

    Precomputes inverse frequencies from the (possibly rescaled) base and, at
    forward time, returns the cos/sin tables for the given positions.
    """

    def __init__(self, config, device=None):
        super().__init__()
        self.config = config
        assert config.n_embd % config.n_head == 0
        self.head_dim = config.head_dim
        self.rope_scaling_type = str(getattr(config, "rope_scaling_type", "none"))
        self.rope_scaling_factor = float(getattr(config, "rope_scaling_factor", 1.0))

        theta = float(config.rope_theta)
        self.position_scale = 1.0
        self.attention_scaling = 1.0

        # Scaling only takes effect for a non-"none" type with factor != 1.0.
        scaling_active = (
            self.rope_scaling_type != "none" and self.rope_scaling_factor != 1.0
        )
        if scaling_active:
            if self.rope_scaling_type == "yarn":
                theta = theta * (
                    self.rope_scaling_factor ** (self.head_dim / (self.head_dim - 2.0))
                )
                # yarn additionally rescales attention magnitudes.
                self.attention_scaling = 0.1 * math.log(self.rope_scaling_factor) + 1.0
            elif self.rope_scaling_type == "ntk":
                theta = theta * (
                    self.rope_scaling_factor ** (self.head_dim / (self.head_dim - 2.0))
                )
            else:
                raise ValueError(f"Unknown rope_scaling_type={self.rope_scaling_type!r}")

        self.base = theta

        # inv_freq[i] = base ** (-2i / head_dim), one entry per rotation pair.
        exponents = (
            torch.arange(0, self.head_dim, 2, dtype=torch.float32, device=device)
            / float(self.head_dim)
        )
        inv_freq = 1.0 / (self.base ** exponents)
        self.register_buffer("inv_freq", inv_freq, persistent=False)

    def forward(self, x, position_ids):
        out_dtype = x.dtype

        scaled_pos = position_ids.float().unsqueeze(-1) * self.position_scale
        freqs = scaled_pos * self.inv_freq.unsqueeze(0).unsqueeze(0)
        # Duplicate so the table covers the full head_dim (two halves).
        emb = torch.cat([freqs, freqs], dim=-1)

        cos = (emb.cos() * self.attention_scaling).to(out_dtype)
        sin = (emb.sin() * self.attention_scaling).to(out_dtype)
        return cos, sin
67
+
68
+
69
def rotate_half(x):
    """RoPE helper: map (..., [a, b]) to (..., [-b, a]) along the last dim."""
    front, back = torch.chunk(x, 2, dim=-1)
    return torch.cat((-back, front), dim=-1)
72
+
73
+
74
def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
    """Apply rotary position embeddings to query and key tensors.

    `unsqueeze_dim` inserts the broadcast axis (the head axis by default) into
    the cos/sin tables so they align with q/k of shape (B, H, T, D).
    """
    cos_b = cos.unsqueeze(unsqueeze_dim)
    sin_b = sin.unsqueeze(unsqueeze_dim)
    rotated_q = q * cos_b + rotate_half(q) * sin_b
    rotated_k = k * cos_b + rotate_half(k) * sin_b
    return rotated_q, rotated_k
80
+
81
+
82
class GQA(nn.Module):
    """Grouped-query attention: n_head query heads share n_kv_head key/value heads.

    Uses separate q/k/v/o projections (all bias-free), per-head RMSNorm on
    queries and keys before RoPE, and SDPA for the attention kernel. Supports
    an optional transformers-style KV cache via `past_key_values`.
    """

    def __init__(self, config, layer_idx):
        super().__init__()
        # Cache layer index; used to address this layer's slot in the KV cache.
        self.layer_idx = int(layer_idx)
        self.n_head = config.n_head
        # Fall back to full multi-head attention if n_kv_head is absent.
        self.n_kv_head = int(getattr(config, "n_kv_head", config.n_head))
        self.n_embd = config.n_embd
        self.block_size = int(config.block_size)
        assert 1 <= self.n_kv_head <= self.n_head
        assert self.n_head % self.n_kv_head == 0

        self.head_dim = config.head_dim
        q_dim = self.n_head * self.head_dim
        kv_dim = self.n_kv_head * self.head_dim

        self.q_proj = nn.Linear(self.n_embd, q_dim, bias=False)
        self.k_proj = nn.Linear(self.n_embd, kv_dim, bias=False)
        self.v_proj = nn.Linear(self.n_embd, kv_dim, bias=False)
        self.o_proj = nn.Linear(q_dim, self.n_embd, bias=False)

        # QK-norm: per-head RMSNorm applied before rotary embedding.
        self.q_norm = RMSNorm(self.head_dim)
        self.k_norm = RMSNorm(self.head_dim)

    def forward(self, x, cos, sin, past_key_values=None):
        # x: (B, T, C); cos/sin: RoPE tables for the *current* T positions.
        B, T, C = x.shape
        # Project and reshape to (B, heads, T, head_dim).
        q = self.q_proj(x).view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        k = self.k_proj(x).view(B, T, self.n_kv_head, self.head_dim).transpose(1, 2)
        v = self.v_proj(x).view(B, T, self.n_kv_head, self.head_dim).transpose(1, 2)
        # Normalize per head, then rotate. Order matters: norm before RoPE.
        q = self.q_norm(q)
        k = self.k_norm(k)
        q, k = apply_rotary_pos_emb(q, k, cos, sin)

        past_len = 0
        if past_key_values is not None:
            # Read the cached length *before* update so past_len reflects only
            # previous steps; update() returns keys/values including this step.
            past_len = past_key_values.get_seq_length(self.layer_idx)
            k, v = past_key_values.update(k, v, self.layer_idx)

        if self.n_kv_head != self.n_head:
            # Expand KV heads so each query head has a matching key/value head.
            repeat_factor = self.n_head // self.n_kv_head
            k = k.repeat_interleave(repeat_factor, dim=1)
            v = v.repeat_interleave(repeat_factor, dim=1)

        if past_len == 0:
            # Prefill (no cache history): SDPA's built-in causal mask applies.
            y = F.scaled_dot_product_attention(q, k, v, attn_mask=None, is_causal=True)
        else:
            # Decoding with history: is_causal=True would misalign because the
            # query rows start at past_len, so build an explicit additive mask
            # allowing each query position to see keys at positions <= its own.
            Tk = int(k.size(2))
            query_pos = past_len + torch.arange(T, device=x.device)
            key_pos = torch.arange(Tk, device=x.device)
            causal_mask = key_pos.unsqueeze(0) <= query_pos.unsqueeze(1)
            attn_mask = torch.zeros((1, 1, T, Tk), device=x.device, dtype=q.dtype)
            attn_mask = attn_mask.masked_fill(~causal_mask.view(1, 1, T, Tk), torch.finfo(q.dtype).min)
            y = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask, is_causal=False)

        # Merge heads back to (B, T, n_head * head_dim) and project out.
        y = y.transpose(1, 2).contiguous().view(B, T, -1)
        y = self.o_proj(y)
        return y
139
+
140
+
141
class SwiGLU(nn.Module):
    """Gated feed-forward block: down_proj(silu(gate_proj(x)) * up_proj(x))."""

    def __init__(self, config):
        super().__init__()
        self.n_embd = config.n_embd
        inner = getattr(config, "intermediate_size", None)
        if inner is None:
            # Fall back to the 2/3 * 4d heuristic, rounded up to a multiple of 256.
            inner = int(4 * self.n_embd * 2 / 3)
            inner = (inner + 255) // 256 * 256

        self.gate_proj = nn.Linear(self.n_embd, inner, bias=config.bias)
        self.up_proj = nn.Linear(self.n_embd, inner, bias=config.bias)
        self.down_proj = nn.Linear(inner, self.n_embd, bias=config.bias)

    def forward(self, x):
        return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x))
160
+
161
+
162
class DecoderLayer(nn.Module):
    """Pre-norm transformer block: attention then MLP, each with a residual."""

    def __init__(self, config, layer_idx):
        super().__init__()
        self.input_norm = RMSNorm(config.n_embd, eps=config.rms_norm_eps)
        self.post_attn_norm = RMSNorm(config.n_embd, eps=config.rms_norm_eps)
        self.attn = GQA(config, layer_idx=layer_idx)
        self.mlp = SwiGLU(config)

    def forward(self, x, cos, sin, past_key_values=None):
        # Attention sub-block (pre-norm + residual).
        x = x + self.attn(self.input_norm(x), cos, sin, past_key_values=past_key_values)
        # Feed-forward sub-block (pre-norm + residual).
        x = x + self.mlp(self.post_attn_norm(x))
        return x
182
+
183
+
184
class SeedPreTrainedModel(PreTrainedModel):
    """Base class wiring the Seed architecture into the transformers API."""

    # Config class resolved by from_pretrained / the auto classes.
    config_class = SeedConfig
    base_model_prefix = "model"
    # Keep each decoder layer whole when device-mapping / sharding.
    _no_split_modules = ["DecoderLayer"]
    # Cache tensors manage their own device placement.
    _skip_keys_device_placement = ["past_key_values"]
    # Attention is implemented with scaled_dot_product_attention.
    _supports_sdpa = True
190
+
191
+
192
class SeedForCausalLM(SeedPreTrainedModel, GenerationMixin):
    """Seed decoder-only transformer with a causal language-modeling head.

    Token embeddings -> n_layer DecoderLayers (shared RoPE tables) -> final
    RMSNorm -> lm_head. Input and output embeddings may be tied via the config.
    """

    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
        self.layers = nn.ModuleList(
            [DecoderLayer(config, layer_idx=i) for i in range(config.n_layer)]
        )
        self.norm = RMSNorm(config.n_embd, eps=config.rms_norm_eps)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.rope = RoPEEmbedding(config)
        # Handles weight init and embedding tying (tie_word_embeddings).
        self.post_init()

    def get_input_embeddings(self):
        """Return the token embedding module."""
        return self.wte

    def set_input_embeddings(self, value):
        """Replace the token embedding module."""
        self.wte = value

    def get_output_embeddings(self):
        """Return the LM head."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the LM head."""
        self.lm_head = new_embeddings

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        position_ids=None,
        past_key_values=None,
        inputs_embeds=None,
        labels=None,
        use_cache=None,
        token_type_ids=None,
        **kwargs
    ):
        """Run the decoder and optionally compute the shifted LM loss.

        NOTE(review): `attention_mask` and `token_type_ids` are accepted for
        API compatibility but are not used by the attention layers — left-padded
        batches are not masked out. Confirm callers use unpadded/right-aligned
        inputs.
        """
        if inputs_embeds is None:
            inputs_embeds = self.wte(input_ids)

        B, T = inputs_embeds.shape[:2]

        # Lazily create a cache when generation requests one.
        if use_cache and past_key_values is None:
            past_key_values = DynamicCache()

        if position_ids is None:
            # Continue numbering after any cached tokens.
            past_seen = past_key_values.get_seq_length() if past_key_values is not None else 0
            position_ids = (
                torch.arange(past_seen, past_seen + T, device=inputs_embeds.device)
                .unsqueeze(0)
                .expand(B, T)
            )

        # RoPE tables are shared across all layers.
        cos, sin = self.rope(inputs_embeds, position_ids)

        x = inputs_embeds
        for layer in self.layers:
            x = layer(x, cos, sin, past_key_values=past_key_values)

        x = self.norm(x)
        logits = self.lm_head(x)

        loss = None
        if labels is not None:
            # Shift so position t predicts token t+1. ignore_index=-100 follows
            # the transformers convention of masking prompt/padding label
            # positions out of the loss.
            loss = F.cross_entropy(
                logits[:, :-1].contiguous().view(-1, logits.size(-1)),
                labels[:, 1:].contiguous().view(-1),
                ignore_index=-100,
            )

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=past_key_values if use_cache else None
        )

    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
    ):
        """Assemble model inputs for one generation step.

        With a non-empty cache only the most recent token is fed; on the first
        step `inputs_embeds` (if given) takes precedence over `input_ids`.
        """
        past_length = 0
        if past_key_values is not None:
            past_length = past_key_values.get_seq_length()
            if past_length > 0:
                # Cached steps already cover earlier tokens.
                input_ids = input_ids[:, -1:]

        if inputs_embeds is not None and past_length == 0:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update({
            "past_key_values": past_key_values,
            "use_cache": True,
            "attention_mask": attention_mask,
        })
        return model_inputs
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tokenizer_class": "PreTrainedTokenizerFast",
3
+ "bos_token": "<|im_start|>",
4
+ "eos_token": "<|endoftext|>",
5
+ "unk_token": "<unk>",
6
+ "pad_token": "<|endoftext|>",
7
+ "add_bos_token": false,
8
+ "add_eos_token": false,
9
+ "model_max_length": 4096,
10
+ "clean_up_tokenization_spaces": false,
11
+ "added_tokens_decoder": {
12
+ "0": {
13
+ "content": "<unk>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "1": {
21
+ "content": "<|endoftext|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "2": {
29
+ "content": "<|im_start|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "3": {
37
+ "content": "<|im_end|>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ }
44
+ }
45
+ }