Upload 6 files
- config.json +13 -0
- model.safetensors +3 -0
- nova_modelling.py +164 -0
- special_tokens_map.json +9 -0
- tokenizer.json +0 -0
- tokenizer_config.json +49 -0
config.json
ADDED
@@ -0,0 +1,13 @@
+{
+  "architectures": [
+    "NovaForCausalLM"
+  ],
+  "block_size": 256,
+  "model_type": "nova",
+  "n_embd": 640,
+  "n_head": 8,
+  "n_layer": 4,
+  "torch_dtype": "float32",
+  "transformers_version": "4.55.4",
+  "vocab_size": 6000
+}
model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b2b769ee0e5b1f8532e9abcf402a6e59ebe154b0c4d538dfd0069e3854350d9c
+size 66712216
nova_modelling.py
ADDED
@@ -0,0 +1,164 @@
+import math
+import torch
+import torch.nn as nn
+from torch.nn import functional
+from transformers import PreTrainedModel, PretrainedConfig
+
+class Heads(nn.Module):
+    def __init__(self, feature_embed, head_size, block_size):
+        super().__init__()
+
+        self.q = nn.Linear(feature_embed, head_size, bias=False)
+        self.k = nn.Linear(feature_embed, head_size, bias=False)
+        self.v = nn.Linear(feature_embed, head_size, bias=False)
+        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
+        self.dropout = nn.Dropout(0.15)
+
+    def forward(self, x):
+        B, T, C = x.shape
+        k = self.k(x)
+        q = self.q(x)
+        v = self.v(x)
+
+        weighted = q @ k.transpose(-2, -1) * (k.shape[-1] ** -0.5)
+        weighted = weighted.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
+        weighted = functional.softmax(weighted, dim=-1)
+        weighted = self.dropout(weighted)
+        return weighted @ v
+
+class MultiHeadAttention(nn.Module):
+    def __init__(self, head_size, n_heads, feature_embed, block_size):
+        super().__init__()
+
+        self.multiple_heads = nn.ModuleList(Heads(feature_embed, head_size, block_size) for _ in range(n_heads))
+        self.linear = nn.Linear(head_size * n_heads, feature_embed)
+        self.dropout = nn.Dropout(0.1)
+
+    def forward(self, x):
+        out = torch.cat([head(x) for head in self.multiple_heads], dim=-1)
+        out = self.linear(out)
+        return self.dropout(out)
+
+class Decoder(nn.Module):
+    def __init__(self, feature_embed, n_heads, block_size):
+        super().__init__()
+
+        head_size = feature_embed // n_heads
+        self.multihead = MultiHeadAttention(head_size, n_heads, feature_embed, block_size=block_size)
+        self.layerNorm = nn.LayerNorm(feature_embed)
+
+    def forward(self, x):
+        y = self.multihead(x)
+        return self.layerNorm(x + y)
+
+class NOVA(nn.Module):
+    def __init__(self, vocab_size, block_size=256, feature_embed=640, n_layers=4, n_heads=8):
+        super().__init__()
+
+        self.vocab_size = vocab_size
+        self.block_size = block_size
+        self.feature_embed = feature_embed
+        self.n_layers = n_layers
+        self.n_heads = n_heads
+
+        self.vector_embedding = nn.Embedding(vocab_size, feature_embed)
+        self.learnable_position = nn.Embedding(block_size, feature_embed)  # learnable positional encoding
+
+        # Sinusoidal positional encoding
+        sinusoid = torch.zeros(block_size, feature_embed)
+        position = torch.arange(0, block_size, dtype=torch.float32).unsqueeze(1)
+        div_term = torch.exp(torch.arange(0, feature_embed, 2).float() * (-math.log(10000.0) / feature_embed))
+        sinusoid[:, 0::2] = torch.sin(position * div_term)
+        sinusoid[:, 1::2] = torch.cos(position * div_term)
+        self.register_buffer('sinusoidal_encoding', sinusoid)  # not trainable
+
+        # Initialising the decoder model
+        self.decoder_block = nn.Sequential(*[
+            Decoder(feature_embed, n_heads=n_heads, block_size=self.block_size) for _ in range(n_layers)
+        ])
+        self.linear_head = nn.Linear(feature_embed, vocab_size)
+        self.layer_norm = nn.LayerNorm(feature_embed)
+        self.apply(self._init_weights)
+
+    def _init_weights(self, module):
+        if isinstance(module, nn.Linear):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=0.01)
+            if module.bias is not None:
+                torch.nn.init.zeros_(module.bias)
+        if isinstance(module, nn.Embedding):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=0.01)
+
+    def forward(self, indx, target=None):
+        B, T = indx.shape
+
+        token_embedding = self.vector_embedding(indx)  # [B, T, C]
+
+        # Positional encoding (hybrid: learned + sinusoidal)
+        learned = self.learnable_position(torch.arange(T, device=indx.device))  # [T, C]
+        sinusoidal = self.sinusoidal_encoding[:T]  # [T, C]
+        positional_encoding = learned + sinusoidal  # [T, C]
+        positional_encoding = positional_encoding.unsqueeze(0).expand(B, -1, -1)  # [B, T, C]
+
+        x = token_embedding + positional_encoding  # [B, T, C]
+        x = self.decoder_block(x)  # [B, T, C]
+        x = self.layer_norm(x)  # [B, T, C]
+        logits = self.linear_head(x)  # [B, T, vocab_size]
+
+        if target is None:
+            return logits, None
+
+        # Shift logits and targets for causal language modeling
+        logits = logits[:, :-1, :]  # [B, T-1, vocab_size]
+        target = target[:, 1:]  # [B, T-1]
+
+        # Flatten for loss
+        logits = logits.contiguous().view(-1, logits.size(-1))  # [B*(T-1), vocab_size]
+        target = target.contiguous().view(-1)  # [B*(T-1)]
+
+        loss = functional.cross_entropy(logits, target, ignore_index=-100)
+
+        return logits, loss
+
+
+    @torch.no_grad()
+    def generate(self, index, max_tokens=512):
+        for _ in range(max_tokens):
+            index_cond = index[:, -self.block_size:]
+            logits, loss = self.forward(index_cond)
+            logits = logits[:, -1, :]
+            probs = torch.softmax(logits, dim=-1)
+
+            next_index = torch.multinomial(probs, num_samples=1)
+            # if next_index == self.eos_id:
+            #     break
+            index = torch.cat((index, next_index), dim=1)
+        return index
+
+class NovaConfig(PretrainedConfig):
+    model_type = "nova"
+
+    def __init__(self, vocab_size=6000, block_size=256, feature_embed=640, n_layers=4, n_heads=8, **kwargs):
+        super().__init__(**kwargs)
+        self.vocab_size = vocab_size
+        self.block_size = block_size
+        self.n_embd = feature_embed
+        self.n_layer = n_layers
+        self.n_head = n_heads
+
+class NovaForCausalLM(PreTrainedModel):
+    config_class = NovaConfig
+
+    def __init__(self, config: NovaConfig):
+        super().__init__(config)
+        # your original model init logic here
+        self.vocab_size = config.vocab_size
+        self.block_size = config.block_size
+        self.model = NOVA(vocab_size=self.vocab_size, block_size=self.block_size,
+                          feature_embed=config.n_embd, n_layers=config.n_layer, n_heads=config.n_head)
+        self.post_init()  # important for HF compatibility
+
+    def forward(self, input_ids, labels=None):
+        return self.model(input_ids, labels)
+
+    def generate(self, input_ids, max_length=256):
+        return self.model.generate(input_ids, max_length)
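A minimal loading sketch, not part of this upload: config.json names "NovaForCausalLM" but does not declare an auto_map, so the custom classes in nova_modelling.py would likely need to be imported and registered by hand before from_pretrained can resolve the "nova" model type. The repository id "user/nova" and the assumption that nova_modelling.py sits next to the script are placeholders, not facts from the repo.

# Sketch only: assumes nova_modelling.py is importable locally and
# "user/nova" stands in for the actual repository id (hypothetical).
import torch
from transformers import AutoConfig, AutoModelForCausalLM
from nova_modelling import NovaConfig, NovaForCausalLM

# Register the custom architecture so the Auto* classes can resolve model_type "nova".
AutoConfig.register("nova", NovaConfig)
AutoModelForCausalLM.register(NovaConfig, NovaForCausalLM)

model = AutoModelForCausalLM.from_pretrained("user/nova")  # loads model.safetensors
model.eval()

# Sampling with the custom generate(); token id 0 is "<s>" per tokenizer_config.json.
prompt = torch.tensor([[0]])
out = model.generate(prompt, max_length=64)

Note that NovaForCausalLM overrides generate() with its own signature, so the usual GenerationMixin arguments (temperature, top_k, etc.) do not apply here.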
special_tokens_map.json
ADDED
@@ -0,0 +1,9 @@
+{
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
tokenizer.json
ADDED
The diff for this file is too large to render. See raw diff.
tokenizer_config.json
ADDED
@@ -0,0 +1,49 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": false,
+  "extra_special_tokens": {},
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<pad>",
+  "tokenizer_class": "PreTrainedTokenizerFast"
+}
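For completeness, a hedged sketch of loading the fast tokenizer defined by tokenizer.json and tokenizer_config.json; the repository id "user/nova" is again a placeholder.

# Sketch only: tokenizer_class is PreTrainedTokenizerFast, so AutoTokenizer
# can build the tokenizer directly from tokenizer.json.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("user/nova")  # "user/nova" is hypothetical
ids = tok("hello world", return_tensors="pt").input_ids  # shape [1, seq_len]
print(tok.decode(ids[0]))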