Upload folder using huggingface_hub
- added_tokens.json +3 -0
- config.json +35 -0
- configuration_diff_llama.py +52 -0
- model.safetensors +3 -0
- modeling_diff_llama.py +72 -0
- special_tokens_map.json +31 -0
- tokenizer.json +0 -0
- tokenizer.model +3 -0
- tokenizer_config.json +52 -0
added_tokens.json
ADDED
@@ -0,0 +1,3 @@
+{
+  "[MASK]": 32000
+}
config.json
ADDED
@@ -0,0 +1,35 @@
+{
+  "_mlp_class": "LLaMAMLP",
+  "_norm_class": "FusedRMSNorm",
+  "architectures": [
+    "DiffusionLlamaLM"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_diff_llama.DiffusionLlamaConfig",
+    "AutoModel": "modeling_diff_llama.DiffusionLlamaLM",
+    "AutoModelForCausalLM": "modeling_diff_llama.DiffusionLlamaLM"
+  },
+  "bias": false,
+  "block_size": 2048,
+  "condense_ratio": 1,
+  "dtype": "float32",
+  "eos_token_id": 2,
+  "intermediate_size": 4096,
+  "mask_token_id": 32000,
+  "model_type": "diff_llama_v2",
+  "n_embd": 1024,
+  "n_head": 16,
+  "n_layer": 20,
+  "n_query_groups": 16,
+  "name": "Diff_LLaMA_v2_336M",
+  "norm_eps": 1e-05,
+  "org": "Lightning-AI",
+  "pad_token_id": 0,
+  "padded_vocab_size": 32000,
+  "padding_multiple": 64,
+  "parallel_residual": false,
+  "rotary_percentage": 1.0,
+  "shared_attention_norm": false,
+  "transformers_version": "4.57.3",
+  "vocab_size": 32000
+}
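Because config.json registers the custom classes via auto_map (AutoConfig -> configuration_diff_llama.DiffusionLlamaConfig, AutoModelForCausalLM -> modeling_diff_llama.DiffusionLlamaLM), the checkpoint is meant to be loaded with trust_remote_code=True. A minimal loading sketch follows; the repo id is a placeholder assumption, and lit_gpt must be installed because the remote code imports it.

    # Hedged sketch: the repo id below is hypothetical, not taken from this upload.
    from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

    repo_id = "some-org/Diff_LLaMA_v2_336M"  # placeholder

    config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(repo_id)
    # trust_remote_code=True lets transformers execute configuration_diff_llama.py
    # and modeling_diff_llama.py shipped with the repo.
    model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)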
configuration_diff_llama.py
ADDED
@@ -0,0 +1,52 @@
+from transformers import PretrainedConfig
+from typing import Literal, Optional
+from lit_gpt.config import Config
+
+class DiffusionLlamaConfig(Config, PretrainedConfig):
+    model_type = "diff_llama_v2"
+    eos_token_id = 2
+    pad_token_id = 0
+    mask_token_id = 32000
+
+    def __init__(
+        self,
+        block_size: int = 4096,
+        vocab_size: int = 50254,
+        padding_multiple: int = 512,
+        padded_vocab_size: Optional[int] = None,
+        n_layer: int = 16,
+        n_head: int = 32,
+        n_embd: int = 4096,
+        rotary_percentage: float = 0.25,
+        parallel_residual: bool = True,
+        bias: bool = True,
+        n_query_groups: Optional[int] = None,
+        shared_attention_norm: bool = False,
+        _norm_class: Literal["LayerNorm", "RMSNorm", "FusedRMSNorm"] = "LayerNorm",
+        norm_eps: float = 1e-5,
+        _mlp_class: Literal["GptNeoxMLP", "LLaMAMLP"] = "GptNeoxMLP",
+        intermediate_size: Optional[int] = None,
+        condense_ratio: int = 1,
+        **kwargs,
+    ):
+        Config.__init__(
+            self,
+            block_size=block_size,
+            vocab_size=vocab_size,
+            padding_multiple=padding_multiple,
+            padded_vocab_size=padded_vocab_size,
+            n_layer=n_layer,
+            n_head=n_head,
+            n_embd=n_embd,
+            rotary_percentage=rotary_percentage,
+            parallel_residual=parallel_residual,
+            bias=bias,
+            n_query_groups=n_query_groups,
+            shared_attention_norm=shared_attention_norm,
+            _norm_class=_norm_class,
+            norm_eps=norm_eps,
+            _mlp_class=_mlp_class,
+            intermediate_size=intermediate_size,
+            condense_ratio=condense_ratio
+        )
+        PretrainedConfig.__init__(self, **kwargs)
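DiffusionLlamaConfig multiply-inherits from lit_gpt's Config and transformers' PretrainedConfig, so the fields serialized in config.json above map directly onto the constructor arguments. A hedged construction sketch, assuming lit_gpt is importable and using the 336M values from config.json:

    # Illustrative only: mirrors the values in config.json above.
    from configuration_diff_llama import DiffusionLlamaConfig

    cfg = DiffusionLlamaConfig(
        block_size=2048,
        vocab_size=32000,
        padded_vocab_size=32000,
        padding_multiple=64,
        n_layer=20,
        n_head=16,
        n_embd=1024,
        n_query_groups=16,
        rotary_percentage=1.0,
        parallel_residual=False,
        bias=False,
        _norm_class="FusedRMSNorm",
        _mlp_class="LLaMAMLP",
        intermediate_size=4096,
        norm_eps=1e-5,
    )
    print(cfg.model_type, cfg.mask_token_id)  # expected: diff_llama_v2 32000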
model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:671794256bef4dff670845aca8d38e5fa382931f8f96d40028b887ee01a116f8
+size 1604509704
modeling_diff_llama.py
ADDED
@@ -0,0 +1,72 @@
+from .configuration_diff_llama import DiffusionLlamaConfig
+from lit_gpt.diffmodel import TransEncoder
+from transformers import PreTrainedModel
+from transformers.modeling_outputs import CausalLMOutputWithPast
+import torch
+import torch.nn as nn
+from torch.nn import init
+import math
+from typing import Optional, Union, Tuple
+
+
+class DiffusionLlamaLM(PreTrainedModel):
+    config_class = DiffusionLlamaConfig
+    base_model_prefix = "model"
+
+    def __init__(self, config: DiffusionLlamaConfig):
+        super().__init__(config)
+        self.model = TransEncoder(config)
+
+        # Initialize weights (Training feature)
+        self.post_init()
+
+    def _init_weights(self, module: nn.Module) -> None:
+        """
+        Initialization logic for training.
+        Adapted from original TransEncoder._init_weights.
+        """
+        n_layer = self.config.n_layer
+
+        if isinstance(module, nn.Embedding):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=math.sqrt(2.0 / 5 / self.config.n_embd))
+        elif isinstance(module, nn.Linear):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=math.sqrt(2.0 / 5 / self.config.n_embd))
+            if module.bias is not None:
+                torch.nn.init.zeros_(module.bias)
+
+        # Special initialization for SwiGLU / projections based on names.
+        # In HF _init_weights, 'module' is the current leaf. We check specific instances.
+        # if isinstance(module, LLaMAMLP):
+
+        # module is LLaMAMLP
+        for name, p in module.named_parameters():
+            if "proj.weight" in name:
+                nn.init.normal_(p, mean=0.0, std=1 / math.sqrt(self.config.n_embd) / n_layer)
+
+        # if isinstance(module, SwiGLU):
+        #     for name, p in module.named_parameters():
+        #         if "w3.weight" in name:
+        #             nn.init.normal_(p, mean=0.0, std=1 / math.sqrt(self.config.n_embd) / n_layer)
+
+        # if isinstance(module, SelfAttention):
+        #     for name, p in module.named_parameters():
+        #         if "proj.weight" in name:
+        #             nn.init.normal_(p, mean=0.0, std=1 / math.sqrt(self.config.n_embd) / n_layer)
+
+    def forward(self, input_ids: torch.Tensor, labels: Optional[torch.Tensor] = None, return_dict: Optional[bool] = None, **kwargs) -> Union[Tuple, CausalLMOutputWithPast]:
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        logits = self.model(input_ids)
+
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            loss_fct = nn.CrossEntropyLoss()
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+
+        if not return_dict:
+            return ((loss,) + (logits,)) if loss is not None else (logits,)
+
+        return CausalLMOutputWithPast(loss=loss, logits=logits)
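DiffusionLlamaLM.forward mirrors the standard HF causal-LM interface: it returns raw logits and, when labels are given, a shifted cross-entropy loss wrapped in CausalLMOutputWithPast. A minimal forward-pass sketch, assuming a model instance built from the files above (shapes are illustrative):

    # Illustrative shapes only; `model` is assumed to be a loaded DiffusionLlamaLM instance.
    import torch

    input_ids = torch.randint(0, 32000, (2, 16))   # batch of 2, sequence length 16
    labels = input_ids.clone()

    out = model(input_ids=input_ids, labels=labels, return_dict=True)
    print(out.loss)          # scalar loss over shifted targets
    print(out.logits.shape)  # (2, 16, vocab_size)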
special_tokens_map.json
ADDED
@@ -0,0 +1,31 @@
+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "[MASK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<unk>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
tokenizer.json
ADDED
The diff for this file is too large to render.
tokenizer.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723
tokenizer_config.json
ADDED
@@ -0,0 +1,52 @@
+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32000": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "extra_special_tokens": {},
+  "legacy": false,
+  "mask_token": "[MASK]",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<unk>",
+  "padding_side": "right",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}
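The tokenizer files register [MASK] as added token id 32000 and reuse <unk> as the pad token, matching mask_token_id and pad_token_id in config.json. A small verification sketch (the repo id is a placeholder assumption):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("some-org/Diff_LLaMA_v2_336M")  # hypothetical repo id
    print(tok.mask_token, tok.mask_token_id)  # expected: [MASK] 32000
    print(tok.pad_token, tok.pad_token_id)    # expected: <unk> 0
    print(tok.bos_token, tok.eos_token)       # expected: <s> </s>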