hbpkillerX committed on
Commit
32d3d5b
·
verified ·
1 Parent(s): ca5da25

Complete model upload with all necessary files

Browse files
config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "CustomLlamaForCausalLM"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "modeling_custom_llama.CustomLlamaConfig",
7
+ "AutoModelForCausalLM": "modeling_custom_llama.CustomLlamaForCausalLM"
8
+ },
9
+ "d_head": 64,
10
+ "d_mlp_proj": 2560,
11
+ "d_model": 960,
12
+ "dtype": "float32",
13
+ "initializer_range": 0.02,
14
+ "model_type": "custom_llama",
15
+ "n_attn_heads": 15,
16
+ "n_kv_heads": 5,
17
+ "n_layers": 16,
18
+ "pad_token_id": 0,
19
+ "rms_norm_eps": 1e-05,
20
+ "rope_theta": 100000.0,
21
+ "tie_word_embeddings": false,
22
+ "transformers_version": "4.56.1",
23
+ "vocab_size": 49152
24
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c1a12cab395be35c66733a2e57c17f5290540b46e8ecb6581df574f2415b49b
3
+ size 1006775160
modeling_custom_llama.py ADDED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # modeling_custom_llama.py
2
+ # Note: We are adapting your original code to fit the transformers library structure.
3
+
4
+ from dataclasses import dataclass
5
+ from typing import Optional, Tuple
6
+
7
+ import torch
8
+ import torch.nn.functional as F
9
+ import torch.nn as nn
10
+
11
+ # Import the necessary base classes from transformers
12
+ from transformers.configuration_utils import PretrainedConfig
13
+ from transformers.modeling_utils import PreTrainedModel
14
+ from transformers.utils import logging
15
+
16
+
17
+ logger = logging.get_logger(__name__)
18
+
19
# Step 2a: Create a Config class that inherits from PretrainedConfig
# This is crucial for saving/loading the model's architecture.
class CustomLlamaConfig(PretrainedConfig):
    """Hyper-parameters for the custom Llama-style causal LM.

    ``model_type`` ties this config to the ``"custom_llama"`` key used by
    the transformers Auto* registries and by saved ``config.json`` files.
    """

    model_type = "custom_llama"

    def __init__(
        self,
        vocab_size: int = 32000,
        d_model: int = 960,
        d_head: int = 64,
        d_mlp_proj: int = 2560,
        n_kv_heads: int = 5,
        n_attn_heads: int = 15,
        n_layers: int = 16,
        rms_norm_eps: float = 1e-5,
        rope_theta: float = 100000.0,
        initializer_range: float = 0.02,
        # `pad_token_id` is forwarded to PretrainedConfig (the standard HF
        # attribute) instead of being kept as a bespoke `padding_idx`,
        # which avoids any clash with kwargs.
        pad_token_id: Optional[int] = None,
        tie_word_embeddings: bool = False,
        **kwargs
    ):
        # Model-shape hyper-parameters. PretrainedConfig serializes every
        # attribute assigned here into config.json automatically.
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.d_head = d_head
        self.d_mlp_proj = d_mlp_proj
        self.n_kv_heads = n_kv_heads
        self.n_attn_heads = n_attn_heads
        self.n_layers = n_layers
        self.rms_norm_eps = rms_norm_eps
        self.rope_theta = rope_theta
        self.initializer_range = initializer_range

        # Standard token / weight-tying options go through the base class.
        super().__init__(
            pad_token_id=pad_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs
        )
59
+
60
# Helper module: rotary position embeddings (RoPE).
class Rotary(nn.Module):
    """Lazily builds and memoizes cos/sin rotation tables for RoPE.

    ``forward`` returns tables of shape ``(seq_len, d_head)``; they are
    recomputed only when the observed sequence length changes.
    """

    def __init__(self, config):
        super().__init__()
        exponents = torch.arange(0, config.d_head, 2).float() / config.d_head
        # Per-frequency inverse wavelengths; not saved in checkpoints.
        self.register_buffer('inv_freq', 1.0 / (config.rope_theta ** exponents), persistent=False)
        self.seq_len_cached = None
        self.cos_cached = None
        self.sin_cached = None

    def forward(self, x, seq_dim=1):
        """Return (cos, sin) tables covering ``x``'s sequence dimension."""
        seq_len = x.size(seq_dim)
        if seq_len != self.seq_len_cached:
            positions = torch.arange(seq_len, device=x.device).type_as(self.inv_freq)
            freqs = torch.outer(positions, self.inv_freq)
            table = torch.cat((freqs, freqs), dim=-1)
            self.cos_cached = table.cos()
            self.sin_cached = table.sin()
            self.seq_len_cached = seq_len
        # NOTE(review): the cache keys only on seq_len — if `x` changes
        # device between calls, stale-device tables are returned; confirm
        # the model stays on a single device.
        return self.cos_cached, self.sin_cached
83
+
84
+
85
class GroupedQueryAttention(nn.Module):
    """Causal self-attention with grouped-query KV heads (GQA).

    Queries use ``n_attn_heads`` heads while keys/values use the smaller
    ``n_kv_heads``; KV heads are replicated to the query head count before
    attention. Rotary position embeddings are applied to q and k.
    """

    def __init__(self, config):
        super(GroupedQueryAttention, self).__init__()
        self.q_proj = nn.Linear(config.d_model, config.n_attn_heads * config.d_head, bias=False)
        self.k_proj = nn.Linear(config.d_model, config.n_kv_heads * config.d_head, bias=False)
        self.v_proj = nn.Linear(config.d_model, config.n_kv_heads * config.d_head, bias=False)
        self.o_proj = nn.Linear(config.d_model, config.d_model, bias=False)

        self.config = config
        self.attn_scale = config.d_head ** -0.5

        # Prefer the fused SDPA kernel when this PyTorch build provides it.
        self.use_flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')

    @staticmethod
    def _rotate_half(x):
        """Rotate the last dim: (x1, x2) -> (-x2, x1), as used by RoPE."""
        half = x.shape[-1] // 2
        x1, x2 = x[..., :half], x[..., half:]
        return torch.cat([-x2, x1], dim=-1)

    def _apply_rotary_pos_emb(self, q, k, cos, sin):
        """Apply rotary embeddings to q and k (cos/sin broadcast over heads)."""
        return q * cos + self._rotate_half(q) * sin, k * cos + self._rotate_half(k) * sin

    def forward(self, x, cos, sin):
        """Causally attend over ``x`` of shape (batch, seq_len, d_model)."""
        b_size, seq_len, _ = x.shape
        q = self.q_proj(x)
        k = self.k_proj(x)
        v = self.v_proj(x)

        # Shape to (b_size, n_heads or n_kv_heads, seq_len, d_head)
        q = q.view(b_size, seq_len, -1, self.config.d_head).transpose(1, 2)
        k = k.view(b_size, seq_len, -1, self.config.d_head).transpose(1, 2)
        v = v.view(b_size, seq_len, -1, self.config.d_head).transpose(1, 2)

        q, k = self._apply_rotary_pos_emb(q, k, cos, sin)

        # BUG FIX: expand KV heads to the query head count *before* either
        # attention path. F.scaled_dot_product_attention does NOT broadcast
        # mismatched head counts by default (that requires enable_gqa=True,
        # available only on PyTorch >= 2.5), so the previous flash path
        # raised a shape error whenever n_attn_heads != n_kv_heads.
        n_rep = self.config.n_attn_heads // self.config.n_kv_heads
        if n_rep > 1:
            k = k.repeat_interleave(n_rep, dim=1)
            v = v.repeat_interleave(n_rep, dim=1)

        if self.use_flash:
            out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
        else:
            qk_scaled = q @ k.transpose(-2, -1) * self.attn_scale

            # Additive causal mask: 0 on/below the diagonal, -inf above.
            attn_bias = torch.zeros(1, 1, seq_len, seq_len, device=q.device, dtype=q.dtype)
            temp_mask = torch.ones(seq_len, seq_len, dtype=torch.bool, device=q.device).tril(diagonal=0)
            attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))

            attn = F.softmax(qk_scaled + attn_bias, dim=-1)
            out = attn @ v

        out = out.transpose(1, 2).contiguous().view(b_size, seq_len, -1)
        return self.o_proj(out)
142
+
143
+
144
class GatedMlp(nn.Module):
    """SwiGLU-style feed-forward block: down(silu(gate(x)) * up(x))."""

    def __init__(self, config):
        super().__init__()
        self.up_proj = nn.Linear(config.d_model, config.d_mlp_proj, bias=False)
        self.gate_proj = nn.Linear(config.d_model, config.d_mlp_proj, bias=False)
        self.down_proj = nn.Linear(config.d_mlp_proj, config.d_model, bias=False)
        self.silu = nn.SiLU()

    def forward(self, x):
        """Project up with a SiLU gate, then project back to d_model."""
        gated = self.silu(self.gate_proj(x)) * self.up_proj(x)
        return self.down_proj(gated)
158
+
159
+
160
class DecoderLayer(nn.Module):
    """One pre-norm transformer decoder block: attention, then gated MLP,
    each wrapped in a residual connection."""

    def __init__(self, config):
        super().__init__()
        self.self_attn = GroupedQueryAttention(config)
        self.mlp = GatedMlp(config)
        self.input_layernorm = nn.RMSNorm(config.d_model, config.rms_norm_eps)
        self.post_attention_layernorm = nn.RMSNorm(config.d_model, config.rms_norm_eps)

    def forward(self, x, cos, sin):
        """Apply both sub-layers; cos/sin are the shared RoPE tables."""
        hidden = x + self.self_attn(self.input_layernorm(x), cos, sin)
        hidden = hidden + self.mlp(self.post_attention_layernorm(hidden))
        return hidden
174
+
175
+
176
# Step 2b: Create the main Model class that inherits from PreTrainedModel
# We'll rename it to follow HF conventions: `...ForCausalLM`
class CustomLlamaForCausalLM(PreTrainedModel):
    """Decoder-only causal language model built from DecoderLayer blocks.

    ``forward`` returns a plain ``(loss, logits)`` tuple; ``loss`` is
    ``None`` unless ``labels`` is supplied.
    """

    # Link this model to its config class
    config_class = CustomLlamaConfig

    def __init__(self, config: CustomLlamaConfig):
        super().__init__(config)
        self.config = config

        self.embed_tokens = nn.Embedding(
            num_embeddings=config.vocab_size,
            embedding_dim=config.d_model,
            # `nn.Embedding` expects a parameter named `padding_idx`; its
            # value comes from the standard `config.pad_token_id`.
            padding_idx=config.pad_token_id
        )
        self.layers = nn.ModuleList([DecoderLayer(config) for _ in range(config.n_layers)])
        self.norm = nn.modules.normalization.RMSNorm(config.d_model, config.rms_norm_eps)
        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
        self.rotary_emb = Rotary(config)

        self.post_init()

    def _init_weights(self, module):
        """Custom weight init; called by `post_init` for every submodule."""
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            # Keep the padding row at zero so pad tokens contribute nothing.
            if self.config.pad_token_id is not None:
                module.weight.data[self.config.pad_token_id].zero_()

    def forward(
        self,
        input_ids: torch.LongTensor,
        labels: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Tuple:
        """Run the decoder stack.

        Args:
            input_ids: (batch, seq_len) token ids.
            labels: optional (batch, seq_len) targets; when given, a
                shifted next-token cross-entropy loss is computed.

        Returns:
            Tuple ``(loss, logits)`` — ``loss`` is None without labels.
        """
        x = self.embed_tokens(input_ids)
        cos, sin = self.rotary_emb(x, seq_dim=1)
        for layer in self.layers:
            x = layer(x, cos, sin)
        x = self.norm(x)
        logits = self.lm_head(x)

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = nn.CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        return (loss, logits)

    @torch.no_grad()
    def generate(self, idx, temperature=1.0, top_k=None, max_new_tokens=128):
        """Autoregressively sample `max_new_tokens` tokens appended to `idx`.

        BUG FIX: `forward` returns the 2-tuple `(loss, logits)`. The
        previous `logits, _, _ = self(idx)` unpacked three values (a
        ValueError at runtime), and even as a two-way unpack it would have
        bound the loss — not the logits — to `logits`.
        """
        for _ in range(max_new_tokens):
            _, logits = self(idx)
            # Focus on the final position's distribution, scaled by temperature.
            logits = logits[:, -1, :] / temperature

            if top_k is not None:
                # Zero out (via -inf) everything below the k-th best logit.
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')

            probs = F.softmax(logits, dim=-1)

            idx_next = torch.multinomial(probs, num_samples=1)

            idx = torch.cat((idx, idx_next), dim=1)

        return idx

    def using_flash_attention(self):
        """Report whether the attention layers use the SDPA (flash) path."""
        return self.layers[0].self_attn.use_flash
264
+
265
# Step 2d: Register your custom classes with the Auto-classes.
# This is what allows `AutoModelForCausalLM.from_pretrained` to find your model.
# Registration runs at import time, so merely importing this module makes
# the "custom_llama" model_type resolvable through the Auto* factories.
from transformers import AutoConfig, AutoModelForCausalLM

AutoConfig.register("custom_llama", CustomLlamaConfig)
AutoModelForCausalLM.register(CustomLlamaConfig, CustomLlamaForCausalLM)
special_tokens_map.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|endoftext|>",
4
+ "<|im_start|>",
5
+ "<|im_end|>",
6
+ "<repo_name>",
7
+ "<reponame>",
8
+ "<file_sep>",
9
+ "<filename>",
10
+ "<gh_stars>",
11
+ "<issue_start>",
12
+ "<issue_comment>",
13
+ "<issue_closed>",
14
+ "<jupyter_start>",
15
+ "<jupyter_text>",
16
+ "<jupyter_code>",
17
+ "<jupyter_output>",
18
+ "<jupyter_script>",
19
+ "<empty_output>"
20
+ ],
21
+ "bos_token": {
22
+ "content": "<|endoftext|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false
27
+ },
28
+ "eos_token": {
29
+ "content": "<|endoftext|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false
34
+ },
35
+ "pad_token": "<|endoftext|>",
36
+ "unk_token": {
37
+ "content": "<|endoftext|>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false
42
+ }
43
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<repo_name>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "4": {
37
+ "content": "<reponame>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "5": {
45
+ "content": "<file_sep>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "6": {
53
+ "content": "<filename>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "7": {
61
+ "content": "<gh_stars>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "8": {
69
+ "content": "<issue_start>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "9": {
77
+ "content": "<issue_comment>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "10": {
85
+ "content": "<issue_closed>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "11": {
93
+ "content": "<jupyter_start>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "12": {
101
+ "content": "<jupyter_text>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "13": {
109
+ "content": "<jupyter_code>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "14": {
117
+ "content": "<jupyter_output>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "15": {
125
+ "content": "<jupyter_script>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": true
131
+ },
132
+ "16": {
133
+ "content": "<empty_output>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": true
139
+ }
140
+ },
141
+ "additional_special_tokens": [
142
+ "<|endoftext|>",
143
+ "<|im_start|>",
144
+ "<|im_end|>",
145
+ "<repo_name>",
146
+ "<reponame>",
147
+ "<file_sep>",
148
+ "<filename>",
149
+ "<gh_stars>",
150
+ "<issue_start>",
151
+ "<issue_comment>",
152
+ "<issue_closed>",
153
+ "<jupyter_start>",
154
+ "<jupyter_text>",
155
+ "<jupyter_code>",
156
+ "<jupyter_output>",
157
+ "<jupyter_script>",
158
+ "<empty_output>"
159
+ ],
160
+ "bos_token": "<|endoftext|>",
161
+ "clean_up_tokenization_spaces": false,
162
+ "eos_token": "<|endoftext|>",
163
+ "extra_special_tokens": {},
164
+ "model_max_length": 8192,
165
+ "pad_token": "<|endoftext|>",
166
+ "tokenizer_class": "GPT2Tokenizer",
167
+ "unk_token": "<|endoftext|>",
168
+ "vocab_size": 49152
169
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff