luozhangzichen committed
Commit 2ec4b0e · verified · 1 Parent(s): 0898df0

Upload folder using huggingface_hub

Files changed (3)
  1. config.json +13 -0
  2. neon213.py +104 -0
  3. tokenizer.json +0 -0
config.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "architectures": ["Neon213"],
+   "model_type": "neon",
+   "vocab_size": 16384,
+   "d_model": 384,
+   "n_head": 6,
+   "d_ff": 1536,
+   "n_layers": 8,
+   "block_size": 1024,
+   "auto_map": {
+     "AutoModel": "neon213.Neon213"
+   }
+ }
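The auto_map entry above points AutoModel at the Neon213 class defined in neon213.py below (it requires trust_remote_code=True when loaded through transformers). As a quick orientation, here is a minimal editorial sketch, not part of the commit, of reading this config into the plain Python dict that Neon213 expects; the local file path is an assumption about how the checkpoint is used.

```python
# Minimal sketch: read config.json into the plain dict that Neon213 expects.
# Assumes the files in this commit have been downloaded to the working directory.
import json

with open("config.json") as f:
    config = json.load(f)

print(config["d_model"], config["n_head"], config["n_layers"])  # 384 6 8
```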
neon213.py ADDED
@@ -0,0 +1,104 @@
+ """Neon213: Growable SwiGLU-Conv Architecture for Progressive Training.
+ Same as neon185 but with configurable conv kernel sizes.
+ Supports growing from k=1 (pointwise) to k=9 (full context).
+ Config: d_model=384, n_head=6, d_ff=1536, n_layers=4→8, conv_k/mlp_k=1→9.
+ """
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from models.neon015 import RMSNorm, apply_rotary_emb
+
+ class GrowableConvAttention(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.n_head = config['n_head']
+         self.head_dim = config['d_model'] // config['n_head']
+         d_model = config['d_model']
+         self.k = config.get('conv_k', 1)
+
+         self.c_attn = nn.Linear(d_model, 4 * d_model, bias=False)
+         self.conv_q = nn.Conv1d(d_model, d_model, kernel_size=self.k, groups=d_model, bias=False)
+         self.conv_k = nn.Conv1d(d_model, d_model, kernel_size=self.k, groups=d_model, bias=False)
+         self.conv_v = nn.Conv1d(d_model, d_model, kernel_size=self.k, groups=d_model, bias=False)
+         self.conv_i = nn.Conv1d(d_model, d_model, kernel_size=self.k, groups=d_model, bias=False)
+         self.q_norm = RMSNorm(self.head_dim)
+         self.k_norm = RMSNorm(self.head_dim)
+         self.c_proj = nn.Linear(d_model, d_model, bias=False)
+
+     def forward(self, x, freqs_cos, freqs_sin):
+         B, T, C = x.shape
+         q, k, v, intent = self.c_attn(x).split(C, dim=2)
+         pad = self.k - 1
+         q = self.conv_q(F.pad(q.transpose(1,2), (pad, 0))).transpose(1,2)
+         k = self.conv_k(F.pad(k.transpose(1,2), (pad, 0))).transpose(1,2)
+         v = self.conv_v(F.pad(v.transpose(1,2), (pad, 0))).transpose(1,2)
+         intent = self.conv_i(F.pad(intent.transpose(1,2), (pad, 0))).transpose(1,2)
+         q = q.view(B, T, self.n_head, self.head_dim)
+         k = k.view(B, T, self.n_head, self.head_dim)
+         v = v.view(B, T, self.n_head, self.head_dim)
+         intent = intent.view(B, T, self.n_head, self.head_dim)
+         q, k = self.q_norm(q), self.k_norm(k)
+         q = apply_rotary_emb(q, freqs_cos, freqs_sin)
+         k = apply_rotary_emb(k, freqs_cos, freqs_sin)
+         q, k, v = q.transpose(1,2), k.transpose(1,2), v.transpose(1,2)
+         intent = intent.transpose(1,2)
+         attn_out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
+         y = torch.sigmoid(intent) * attn_out
+         y = y.transpose(1,2).contiguous().view(B, T, C)
+         return self.c_proj(y)
+
+ class GrowableHydraMLP(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         d_model = config['d_model']
+         d_ff = config['d_ff']
+         self.k = config.get('mlp_k', 1)
+         self.conv_gate = nn.Conv1d(d_model, d_model, kernel_size=self.k, groups=d_model, bias=False)
+         self.c_gate_proj = nn.Linear(d_model, d_ff, bias=False)
+         self.w1 = nn.Linear(d_model, d_ff, bias=False)
+         self.w2 = nn.Linear(d_ff, d_model, bias=False)
+
+     def forward(self, x):
+         x_t = x.transpose(1, 2)
+         pad = self.k - 1
+         c = self.conv_gate(F.pad(x_t, (pad, 0))).transpose(1, 2)
+         gate = F.silu(self.c_gate_proj(c))
+         return self.w2(gate * self.w1(x))
+
+ class Block(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.ln1 = RMSNorm(config['d_model'])
+         self.attn = GrowableConvAttention(config)
+         self.ln2 = RMSNorm(config['d_model'])
+         self.mlp = GrowableHydraMLP(config)
+     def forward(self, x, f_cos, f_sin):
+         x = x + self.attn(self.ln1(x), f_cos, f_sin)
+         x = x + self.mlp(self.ln2(x))
+         return x
+
+ class Neon213(nn.Module):
+     def __init__(self, config, warm_embeddings=None):
+         super().__init__()
+         self.config = config
+         self.token_emb = nn.Embedding(config['vocab_size'], config['d_model'])
+         if warm_embeddings is not None:
+             self.token_emb.weight.data.copy_(warm_embeddings)
+         self.blocks = nn.ModuleList([Block(config) for _ in range(config['n_layers'])])
+         self.ln_f = RMSNorm(config['d_model'])
+         self.head = nn.Linear(config['d_model'], config['vocab_size'], bias=False)
+         self.token_emb.weight = self.head.weight
+         dim = config['d_model'] // config['n_head']
+         inv_freq = 1.0 / (10000.0 ** (torch.arange(0, dim, 2).float() / dim))
+         t = torch.arange(config['block_size']).float()
+         freqs = torch.outer(t, inv_freq)
+         self.register_buffer("freqs_cos", torch.cos(freqs))
+         self.register_buffer("freqs_sin", torch.sin(freqs))
+
+     def forward(self, idx, targets=None):
+         x = self.token_emb(idx)
+         for block in self.blocks:
+             x = block(x, self.freqs_cos, self.freqs_sin)
+         logits = self.head(self.ln_f(x))
+         loss = F.cross_entropy(logits.view(-1, self.config['vocab_size']), targets.view(-1)) if targets is not None else None
+         return logits, loss
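For reference, a minimal editorial sketch of using the class above: instantiate it from config.json and run one forward pass. It assumes neon213.py and config.json are local, that models/neon015.py (providing RMSNorm and apply_rotary_emb) is importable as the import at the top requires, and the conv_k/mlp_k overrides are hypothetical values chosen only to illustrate the growable kernel sizes.

```python
# Minimal sketch: build Neon213 from config.json and run a forward pass.
# Assumes models.neon015 is importable, per the import in neon213.py.
import json
import torch
from neon213 import Neon213

with open("config.json") as f:
    config = json.load(f)

# conv_k / mlp_k default to 1 (pointwise); larger values are hypothetical
# settings for a later growth stage (up to k=9 per the module docstring).
config["conv_k"] = 3
config["mlp_k"] = 3

model = Neon213(config)
idx = torch.randint(0, config["vocab_size"], (1, config["block_size"]))  # (B, T)
logits, loss = model(idx, targets=idx)  # targets passed only for illustration
print(logits.shape, loss.item())  # torch.Size([1, 1024, 16384]); loss ≈ ln(16384) untrained
```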
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff