Sharjeelbaig committed on
Commit
6fc8a82
·
verified ·
1 Parent(s): d71d68a

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -15,3 +15,18 @@ library_name: pytorch
15
  From-scratch narrow-domain coding SLM for React + Tailwind generation and unified-diff edits.
16
 
17
  Includes trained `model.safetensors` weights.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  From-scratch narrow-domain coding SLM for React + Tailwind generation and unified-diff edits.
16
 
17
  Includes trained `model.safetensors` weights.
18
+
19
+ ## Transformers Usage
20
+
21
+ ```python
22
+ from transformers import AutoTokenizer, AutoModelForCausalLM
23
+
24
+ model_id = "Sharjeelbaig/neurocoder"
25
+ tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
26
+ model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)
27
+
28
+ prompt = "Generate a landing page for marketing agency titled Velocity Landing"
29
+ inputs = tokenizer(prompt, return_tensors="pt")
30
+ outputs = model.generate(**inputs, max_new_tokens=220, temperature=0.7, do_sample=True)
31
+ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
32
+ ```
config.json CHANGED
@@ -1,17 +1,29 @@
1
  {
2
  "architectures": [
3
- "TinyMoEModel"
4
  ],
 
 
 
 
 
 
 
 
 
5
  "capacity_factor_infer": 1.0,
6
  "capacity_factor_train": 1.25,
7
  "context_length": 320,
 
8
  "ffn_multiplier": 4,
9
  "hidden_size": 256,
10
- "model_type": "tinymoe",
11
  "moe_every_n_layers": 2,
12
  "num_experts": 4,
13
  "num_heads": 8,
14
  "num_layers": 8,
 
15
  "top_k": 2,
 
16
  "vocab_size": 1714
17
  }
 
1
  {
2
  "architectures": [
3
+ "NeuroCoderForCausalLM"
4
  ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_neurocoder.NeuroCoderConfig",
7
+ "AutoModelForCausalLM": "modeling_neurocoder.NeuroCoderForCausalLM",
8
+ "AutoTokenizer": [
9
+ "tokenization_neurocoder.NeuroCoderTokenizer",
10
+ null
11
+ ]
12
+ },
13
+ "bos_token_id": 1,
14
  "capacity_factor_infer": 1.0,
15
  "capacity_factor_train": 1.25,
16
  "context_length": 320,
17
+ "eos_token_id": 2,
18
  "ffn_multiplier": 4,
19
  "hidden_size": 256,
20
+ "model_type": "neurocoder",
21
  "moe_every_n_layers": 2,
22
  "num_experts": 4,
23
  "num_heads": 8,
24
  "num_layers": 8,
25
+ "pad_token_id": 0,
26
  "top_k": 2,
27
+ "unk_token_id": 3,
28
  "vocab_size": 1714
29
  }
configuration_neurocoder.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Transformers config for NeuroCoder remote-code loading."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from transformers import PretrainedConfig
6
+
7
+
8
class NeuroCoderConfig(PretrainedConfig):
    """Configuration for the NeuroCoder mixture-of-experts causal LM.

    Holds the architecture hyperparameters and additionally mirrors a few
    of them under the attribute names Transformers' generation/runtime
    utilities expect (``num_hidden_layers``, ``num_attention_heads``,
    ``max_position_embeddings``).
    """

    model_type = "neurocoder"

    def __init__(
        self,
        vocab_size: int = 32000,
        context_length: int = 4096,
        hidden_size: int = 1024,
        num_layers: int = 20,
        num_heads: int = 16,
        ffn_multiplier: int = 4,
        moe_every_n_layers: int = 2,
        num_experts: int = 8,
        top_k: int = 2,
        capacity_factor_train: float = 1.25,
        capacity_factor_infer: float = 1.0,
        dropout: float = 0.0,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        # Core transformer geometry.
        self.vocab_size = vocab_size
        self.context_length = context_length
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_heads = num_heads
        # Feed-forward / MoE routing hyperparameters.
        self.ffn_multiplier = ffn_multiplier
        self.moe_every_n_layers = moe_every_n_layers
        self.num_experts = num_experts
        self.top_k = top_k
        self.capacity_factor_train = capacity_factor_train
        self.capacity_factor_infer = capacity_factor_infer
        self.dropout = dropout
        # Aliases expected by Transformers generation/runtime utilities.
        self.num_hidden_layers = num_layers
        self.num_attention_heads = num_heads
        self.max_position_embeddings = context_length
        # The model implements no KV cache, so caching stays disabled.
        self.use_cache = False

    @property
    def head_dim(self) -> int:
        """Per-head width; hidden_size must divide evenly by num_heads."""
        if self.hidden_size % self.num_heads != 0:
            raise ValueError("hidden_size must be divisible by num_heads")
        return self.hidden_size // self.num_heads
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ec3e8e9e20e7cc41eaacb419be91939f3b8ff25e494afe5ba063f2efdf50a1e6
3
  size 75081480
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:662bfd3a3fabe2977d92c697faaa0af70c6704d5581fd9549d578a994e13202a
3
  size 75081480
modeling_neurocoder.py ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Transformers model implementation for NeuroCoder remote-code loading."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import math
6
+ from typing import Any
7
+
8
+ import torch
9
+ import torch.nn.functional as F
10
+ from torch import Tensor, nn
11
+ from transformers import PreTrainedModel
12
+ from transformers.modeling_outputs import CausalLMOutputWithPast
13
+
14
+ try:
15
+ from .configuration_neurocoder import NeuroCoderConfig
16
+ except Exception:
17
+ from configuration_neurocoder import NeuroCoderConfig
18
+
19
+
20
class RMSNorm(nn.Module):
    """Root-mean-square layer norm: no mean-centering, learned gain only."""

    def __init__(self, hidden_size: int, eps: float = 1e-6) -> None:
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(hidden_size))

    def forward(self, x: Tensor) -> Tensor:
        # Normalize by the RMS over the last dimension, then apply the gain.
        mean_square = x.pow(2).mean(dim=-1, keepdim=True)
        normalized = x * torch.rsqrt(mean_square + self.eps)
        return normalized * self.weight
29
+
30
+
31
+ class SelfAttention(nn.Module):
32
+ def __init__(self, config: NeuroCoderConfig) -> None:
33
+ super().__init__()
34
+ self.num_heads = config.num_heads
35
+ self.head_dim = config.head_dim
36
+ self.scale = self.head_dim ** -0.5
37
+ self.qkv = nn.Linear(config.hidden_size, config.hidden_size * 3)
38
+ self.out = nn.Linear(config.hidden_size, config.hidden_size)
39
+
40
+ def forward(self, x: Tensor) -> Tensor:
41
+ bsz, seq_len, hidden = x.shape
42
+ qkv = self.qkv(x)
43
+ q, k, v = qkv.chunk(3, dim=-1)
44
+
45
+ def shape_heads(t: Tensor) -> Tensor:
46
+ return t.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
47
+
48
+ q = shape_heads(q)
49
+ k = shape_heads(k)
50
+ v = shape_heads(v)
51
+
52
+ attn = torch.matmul(q, k.transpose(-2, -1)) * self.scale
53
+ mask = torch.tril(torch.ones(seq_len, seq_len, device=x.device, dtype=torch.bool))
54
+ attn = attn.masked_fill(~mask, float("-inf"))
55
+ probs = F.softmax(attn, dim=-1)
56
+ out = torch.matmul(probs, v)
57
+ out = out.transpose(1, 2).contiguous().view(bsz, seq_len, hidden)
58
+ return self.out(out)
59
+
60
+
61
+ class DenseFFN(nn.Module):
62
+ def __init__(self, config: NeuroCoderConfig) -> None:
63
+ super().__init__()
64
+ inner = config.hidden_size * config.ffn_multiplier
65
+ self.gate = nn.Linear(config.hidden_size, inner)
66
+ self.up = nn.Linear(config.hidden_size, inner)
67
+ self.down = nn.Linear(inner, config.hidden_size)
68
+
69
+ def forward(self, x: Tensor) -> Tensor:
70
+ return self.down(F.silu(self.gate(x)) * self.up(x))
71
+
72
+
73
class MoEFeedForward(nn.Module):
    """Top-k routed mixture-of-experts feed-forward layer.

    Each token's hidden state is scored by a linear router over
    ``num_experts`` experts; the top-k experts process the token and their
    outputs are combined, weighted by the softmaxed router probabilities.
    Per-expert throughput is capped by a capacity limit derived from the
    train/inference capacity factor.

    ``forward`` returns ``(output, aux_loss, z_loss)``: the mixed expert
    output plus a load-balancing penalty and a router-logit magnitude
    penalty.
    """

    def __init__(self, config: NeuroCoderConfig) -> None:
        super().__init__()
        self.num_experts = config.num_experts
        self.top_k = config.top_k
        self.capacity_factor_train = config.capacity_factor_train
        self.capacity_factor_infer = config.capacity_factor_infer
        # Router emits one logit per expert for every token.
        self.router = nn.Linear(config.hidden_size, config.num_experts, bias=False)
        self.experts = nn.ModuleList([DenseFFN(config) for _ in range(config.num_experts)])

    def forward(self, x: Tensor) -> tuple[Tensor, Tensor, Tensor]:
        bsz, seq_len, hidden = x.shape
        # Flatten (batch, seq) into a single token axis for routing.
        x_flat = x.reshape(-1, hidden)
        tokens = x_flat.shape[0]

        logits = self.router(x_flat)
        probs = F.softmax(logits, dim=-1)
        # top_vals: routing weights; top_idx: chosen expert ids (per token).
        top_vals, top_idx = torch.topk(probs, k=self.top_k, dim=-1)

        capacity_factor = self.capacity_factor_train if self.training else self.capacity_factor_infer
        # Maximum number of tokens any single expert processes this step.
        capacity = max(1, math.ceil(capacity_factor * tokens / self.num_experts))

        output = torch.zeros_like(x_flat)
        expert_load = []

        for expert_id in range(self.num_experts):
            expert = self.experts[expert_id]
            assigned_indices = []
            assigned_weights = []
            # Gather the tokens that routed to this expert at any top-k rank.
            for rank in range(self.top_k):
                mask = top_idx[:, rank] == expert_id
                idx = torch.nonzero(mask, as_tuple=False).squeeze(-1)
                if idx.numel() == 0:
                    continue
                weights = top_vals[idx, rank]
                assigned_indices.append(idx)
                assigned_weights.append(weights)

            if not assigned_indices:
                expert_load.append(0.0)
                continue

            token_indices = torch.cat(assigned_indices, dim=0)
            token_weights = torch.cat(assigned_weights, dim=0)
            # Overflow tokens beyond capacity are dropped for this expert
            # (truncation keeps the first tokens in index order, not the
            # highest-weighted ones).
            if token_indices.numel() > capacity:
                token_indices = token_indices[:capacity]
                token_weights = token_weights[:capacity]

            expert_in = x_flat[token_indices]
            expert_out = expert(expert_in)
            # Accumulate the weighted expert output back into token slots;
            # token_indices holds no duplicates (topk experts are distinct).
            output[token_indices] += expert_out * token_weights.unsqueeze(-1)
            # Fraction of all tokens this expert actually processed.
            expert_load.append(float(token_indices.numel() / max(tokens, 1)))

        load_tensor = torch.tensor(expert_load, device=x.device)
        mean_prob = probs.mean(dim=0)
        # Switch-style balance loss: large when router probability mass and
        # realized load concentrate on the same few experts.
        aux_loss = self.num_experts * torch.sum(mean_prob * load_tensor)
        # Router z-loss discourages large-magnitude router logits.
        z_loss = torch.mean(torch.logsumexp(logits, dim=-1) ** 2)
        return output.reshape(bsz, seq_len, hidden), aux_loss, z_loss
131
+
132
+
133
class TransformerBlock(nn.Module):
    """Pre-norm transformer layer: self-attention plus a dense or MoE FFN.

    ``forward`` returns ``(hidden, aux_loss, z_loss)``; both losses are
    zero tensors when the layer uses a dense feed-forward.
    """

    def __init__(self, config: NeuroCoderConfig, use_moe: bool) -> None:
        super().__init__()
        self.norm1 = RMSNorm(config.hidden_size)
        self.norm2 = RMSNorm(config.hidden_size)
        self.attn = SelfAttention(config)
        self.ffn = MoEFeedForward(config) if use_moe else DenseFFN(config)
        self.use_moe = use_moe

    def forward(self, x: Tensor) -> tuple[Tensor, Tensor, Tensor]:
        # Attention sub-layer with residual connection.
        x = x + self.attn(self.norm1(x))

        aux_loss = torch.tensor(0.0, device=x.device)
        z_loss = torch.tensor(0.0, device=x.device)
        normed = self.norm2(x)
        if self.use_moe:
            ffn_out, aux_loss, z_loss = self.ffn(normed)
        else:
            ffn_out = self.ffn(normed)

        # Feed-forward sub-layer with residual connection.
        return x + ffn_out, aux_loss, z_loss
153
+
154
+
155
class NeuroCoderForCausalLM(PreTrainedModel):
    """Causal LM wrapper exposing the NeuroCoder MoE transformer to the
    Transformers `generate`/`from_pretrained` machinery.

    Runs token embedding -> stacked TransformerBlocks (every
    ``moe_every_n_layers``-th block uses MoE) -> RMSNorm -> tied LM head.
    No KV cache is implemented; each forward recomputes the full sequence.
    """

    config_class = NeuroCoderConfig
    base_model_prefix = "neurocoder"
    # Keep each block (and its MoE experts) on a single device when sharded.
    _no_split_modules = ["TransformerBlock", "MoEFeedForward"]

    def __init__(self, config: NeuroCoderConfig) -> None:
        super().__init__(config)
        self.token_embed = nn.Embedding(config.vocab_size, config.hidden_size)
        self.layers = nn.ModuleList(
            [
                # 1-based layer index: layers 2, 4, ... are MoE when
                # moe_every_n_layers == 2.
                TransformerBlock(config, use_moe=((idx + 1) % config.moe_every_n_layers == 0))
                for idx in range(config.num_layers)
            ]
        )
        self.norm = RMSNorm(config.hidden_size)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        # Weight tying: LM head shares the embedding matrix.
        self.lm_head.weight = self.token_embed.weight
        self.post_init()

    def get_input_embeddings(self) -> nn.Embedding:
        return self.token_embed

    def set_input_embeddings(self, value: nn.Embedding) -> None:
        self.token_embed = value

    def get_output_embeddings(self) -> nn.Linear:
        return self.lm_head

    def set_output_embeddings(self, new_embeddings: nn.Linear) -> None:
        self.lm_head = new_embeddings

    def prepare_inputs_for_generation(
        self,
        input_ids: Tensor,
        **kwargs: Any,
    ) -> dict[str, Tensor]:
        # No cache: always feed the full sequence back in during generation.
        return {"input_ids": input_ids}

    def forward(
        self,
        input_ids: Tensor | None = None,
        attention_mask: Tensor | None = None,  # accepted but unused; padding is not masked out
        labels: Tensor | None = None,
        **kwargs: Any,
    ) -> CausalLMOutputWithPast:
        """Run the model; when ``labels`` is given, also compute the loss.

        NOTE(review): labels are NOT shifted here, unlike standard HF
        causal LMs which shift internally — presumably the training
        pipeline pre-shifts labels; confirm before reuse.
        """
        if input_ids is None:
            raise ValueError("input_ids is required")

        x = self.token_embed(input_ids)
        # Accumulate MoE auxiliary losses across all layers.
        aux_loss = torch.tensor(0.0, device=input_ids.device)
        z_loss = torch.tensor(0.0, device=input_ids.device)

        for layer in self.layers:
            x, layer_aux, layer_z = layer(x)
            aux_loss = aux_loss + layer_aux
            z_loss = z_loss + layer_z

        x = self.norm(x)
        logits = self.lm_head(x)

        loss = None
        if labels is not None:
            loss = F.cross_entropy(
                logits.view(-1, logits.size(-1)),
                labels.view(-1),
                ignore_index=-100,
            )
            # Fixed coefficients for the MoE balance and z losses.
            loss = loss + 0.01 * aux_loss + 0.001 * z_loss

        return CausalLMOutputWithPast(loss=loss, logits=logits)
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<bos>",
3
+ "eos_token": "<eos>",
4
+ "pad_token": "<pad>",
5
+ "unk_token": "<unk>"
6
+ }
tokenization_neurocoder.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Transformers tokenizer for NeuroCoder remote-code loading."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from pathlib import Path
7
+ import re
8
+ from typing import Any
9
+
10
+ from transformers import PreTrainedTokenizer
11
+
12
+ TOKEN_PATTERN = re.compile(r"\s+|[A-Za-z_][A-Za-z0-9_]*|\d+|\S")
13
+ SPECIAL_TOKENS = ["<pad>", "<bos>", "<eos>", "<unk>"]
14
+
15
+
16
+ class NeuroCoderTokenizer(PreTrainedTokenizer):
17
+ vocab_files_names = {"vocab_file": "tokenizer.json"}
18
+ model_input_names = ["input_ids", "attention_mask"]
19
+
20
+ def __init__(self, vocab_file: str | None = None, **kwargs: Any) -> None:
21
+ self.vocab: dict[str, int] = {}
22
+ self.id_to_token: list[str] = []
23
+
24
+ if vocab_file is not None:
25
+ payload = json.loads(Path(vocab_file).read_text(encoding="utf-8"))
26
+ self.vocab = {str(k): int(v) for k, v in payload.get("vocab", {}).items()}
27
+ max_id = max(self.vocab.values()) if self.vocab else -1
28
+ self.id_to_token = ["<unk>"] * (max_id + 1)
29
+ for token, idx in self.vocab.items():
30
+ self.id_to_token[idx] = token
31
+
32
+ if not self.vocab:
33
+ self.vocab = {token: idx for idx, token in enumerate(SPECIAL_TOKENS)}
34
+ self.id_to_token = SPECIAL_TOKENS[:]
35
+
36
+ kwargs.setdefault("bos_token", "<bos>")
37
+ kwargs.setdefault("eos_token", "<eos>")
38
+ kwargs.setdefault("unk_token", "<unk>")
39
+ kwargs.setdefault("pad_token", "<pad>")
40
+ super().__init__(**kwargs)
41
+
42
+ @property
43
+ def vocab_size(self) -> int:
44
+ return len(self.vocab)
45
+
46
+ def get_vocab(self) -> dict[str, int]:
47
+ return dict(self.vocab)
48
+
49
+ def _tokenize(self, text: str) -> list[str]:
50
+ return TOKEN_PATTERN.findall(text)
51
+
52
+ def _convert_token_to_id(self, token: str) -> int:
53
+ return self.vocab.get(token, self.vocab.get(self.unk_token, 0))
54
+
55
+ def _convert_id_to_token(self, index: int) -> str:
56
+ if 0 <= index < len(self.id_to_token):
57
+ return self.id_to_token[index]
58
+ return self.unk_token
59
+
60
+ def convert_tokens_to_string(self, tokens: list[str]) -> str:
61
+ return "".join(tokens)
62
+
63
+ def build_inputs_with_special_tokens(self, token_ids_0: list[int], token_ids_1: list[int] | None = None) -> list[int]:
64
+ if token_ids_1 is None:
65
+ return token_ids_0
66
+ return token_ids_0 + token_ids_1
67
+
68
+ def save_vocabulary(self, save_directory: str, filename_prefix: str | None = None) -> tuple[str]:
69
+ out_dir = Path(save_directory)
70
+ out_dir.mkdir(parents=True, exist_ok=True)
71
+ file_name = "tokenizer.json" if filename_prefix is None else f"{filename_prefix}-tokenizer.json"
72
+ out_path = out_dir / file_name
73
+ payload = {
74
+ "type": "simple_regex_tokenizer",
75
+ "special_tokens": SPECIAL_TOKENS,
76
+ "vocab": self.vocab,
77
+ }
78
+ out_path.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8")
79
+ return (str(out_path),)
tokenizer.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "special_tokens": [
3
  "<pad>",
4
  "<bos>",
 
1
  {
2
+ "added_tokens": [],
3
  "special_tokens": [
4
  "<pad>",
5
  "<bos>",
tokenizer_config.json CHANGED
@@ -1,4 +1,10 @@
1
  {
 
 
 
 
 
 
2
  "model_max_length": 320,
3
  "padding_side": "right",
4
  "special_tokens_map": {
 
1
  {
2
+ "auto_map": {
3
+ "AutoTokenizer": [
4
+ "tokenization_neurocoder.NeuroCoderTokenizer",
5
+ null
6
+ ]
7
+ },
8
  "model_max_length": 320,
9
  "padding_side": "right",
10
  "special_tokens_map": {