SykoSLM committed 93f1e86 (verified) · parent: f664427

Upload folder using huggingface_hub
config.json ADDED
@@ -0,0 +1,20 @@
+ {
+   "auto_map": {
+     "AutoConfig": "configuration_sykollm.SykoConfig",
+     "AutoModelForCausalLM": "modeling_sykollm.SykoSLM"
+   },
+   "bos_token_id": 2,
+   "chunk_size": 128,
+   "context_size": 1024,
+   "d_model": 768,
+   "eos_token_id": 3,
+   "intermediate_size": 3072,
+   "model_type": "sykollm",
+   "n_heads": 6,
+   "n_layers": 24,
+   "num_global_memory_tokens": 32,
+   "num_memory_tokens": 16,
+   "pad_token_id": 0,
+   "transformers_version": "5.2.0",
+   "vocab_size": 32000
+ }
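The auto_map block above registers the custom config and model classes with transformers' Auto machinery. A minimal loading sketch, assuming the uploaded Python files resolve cleanly under trust_remote_code; the repo id below is a placeholder, not the actual path of this repository:

from transformers import AutoConfig, AutoModelForCausalLM

repo_id = "user/SykoSLM"  # placeholder repo id (assumption)
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
print(config.d_model, config.n_layers)  # 768, 24 per this config.json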
configuration_sykollm.py ADDED
@@ -0,0 +1,28 @@
+ from transformers import PretrainedConfig
+
+ class SykoConfig(PretrainedConfig):
+     model_type = "sykollm"
+
+     def __init__(
+         self,
+         vocab_size=32000,
+         d_model=768,
+         n_layers=24,
+         n_heads=6,
+         num_memory_tokens=16,
+         num_global_memory_tokens=32,
+         intermediate_size=3072,
+         chunk_size=128,
+         context_size=1024,
+         **kwargs
+     ):
+         super().__init__(**kwargs)
+         self.vocab_size = vocab_size
+         self.d_model = d_model
+         self.n_layers = n_layers
+         self.n_heads = n_heads
+         self.num_memory_tokens = num_memory_tokens
+         self.num_global_memory_tokens = num_global_memory_tokens
+         self.intermediate_size = intermediate_size
+         self.chunk_size = chunk_size
+         self.context_size = context_size
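A quick sketch of what these defaults imply for one forward pass: each chunk the model processes is the global memory, the local memory, and the chunk tokens concatenated along the sequence dimension (see modeling_sykollm.py below). Assuming the file sits on the import path:

from configuration_sykollm import SykoConfig

cfg = SykoConfig()
per_chunk_len = cfg.num_global_memory_tokens + cfg.num_memory_tokens + cfg.chunk_size
print(per_chunk_len)  # 32 + 16 + 128 = 176 positions per chunk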
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e1b0719c5e29697456f0cae86d8f2a23c297e8137344be6eb2937d64d530cfbc
+ size 904134544
modeling_sykollm.py ADDED
@@ -0,0 +1,96 @@
+ import torch
+ import torch.nn as nn
+ from transformers import PreTrainedModel
+ from configuration_sykollm import SykoConfig
+
+
+ class SykoMemoryGate(nn.Module):
+     def __init__(self, d_model):
+         super().__init__()
+         self.forget_linear = nn.Linear(d_model * 2, d_model)
+         self.update_linear = nn.Linear(d_model, d_model)
+         self.norm = nn.LayerNorm(d_model)
+
+     def forward(self, current_context, prev_memory):
+         combined = torch.cat([current_context, prev_memory], dim=-1)
+         forget_ratio = torch.sigmoid(self.forget_linear(combined))
+         new_candidate = torch.tanh(self.update_linear(current_context))
+         new_memory = (forget_ratio * prev_memory) + ((1 - forget_ratio) * new_candidate)
+         return self.norm(new_memory)
+
+
+ class SykoSmartMemoryGate(nn.Module):
+     def __init__(self, d_model, num_heads=4):
+         super().__init__()
+         self.summarizer = nn.MultiheadAttention(d_model, num_heads, batch_first=True)
+         self.forget_linear = nn.Linear(d_model * 2, d_model)
+         self.update_linear = nn.Linear(d_model, d_model)
+         self.norm = nn.LayerNorm(d_model)
+
+     def forward(self, full_chunk_output, global_memory_output, prev_global_memory):
+         summary, _ = self.summarizer(
+             query=global_memory_output,
+             key=full_chunk_output,
+             value=full_chunk_output
+         )
+         combined = torch.cat([summary, prev_global_memory], dim=-1)
+         forget_ratio = torch.sigmoid(self.forget_linear(combined))
+         new_candidate = torch.tanh(self.update_linear(summary))
+         new_memory = (forget_ratio * prev_global_memory) + ((1 - forget_ratio) * new_candidate)
+         return self.norm(new_memory)
+
+
+ class SykoSLM(PreTrainedModel):
+     config_class = SykoConfig
+
+     def __init__(self, config: SykoConfig):
+         super().__init__(config)
+         self.d_model = config.d_model
+         self.mem_tokens = config.num_memory_tokens
+         self.g_mem_tokens = config.num_global_memory_tokens
+
+         self.embedding = nn.Embedding(config.vocab_size, config.d_model)
+         self.pos_embedding = nn.Embedding(
+             config.context_size + config.num_memory_tokens + config.num_global_memory_tokens,
+             config.d_model
+         )
+
+         encoder_layer = nn.TransformerEncoderLayer(
+             d_model=config.d_model,
+             nhead=config.n_heads,
+             dim_feedforward=config.intermediate_size,
+             batch_first=True,
+             norm_first=True
+         )
+         self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=config.n_layers, norm=nn.LayerNorm(config.d_model))
+         self.memory_gate = SykoMemoryGate(config.d_model)
+         self.global_memory_gate = SykoSmartMemoryGate(config.d_model, num_heads=4)
+         self.fc_out = nn.Linear(config.d_model, config.vocab_size)
+
+     def forward(self, input_ids, prev_memory, global_memory, chunk_start_idx=0):
+         x = self.embedding(input_ids)
+         x_with_memory = torch.cat([global_memory, prev_memory, x], dim=1)
+         seq_len = x.size(1)
+
+         g_mem_positions = torch.arange(0, self.g_mem_tokens, device=input_ids.device)
+         mem_positions = torch.arange(self.g_mem_tokens, self.g_mem_tokens + self.mem_tokens, device=input_ids.device)
+         word_positions = torch.arange(
+             self.g_mem_tokens + self.mem_tokens + chunk_start_idx,
+             self.g_mem_tokens + self.mem_tokens + chunk_start_idx + seq_len,
+             device=input_ids.device
+         )
+         positions = torch.cat([g_mem_positions, mem_positions, word_positions]).unsqueeze(0)
+         x_with_memory = x_with_memory + self.pos_embedding(positions)
+
+         causal_mask = nn.Transformer.generate_square_subsequent_mask(x_with_memory.size(1), device=input_ids.device)
+         out = self.transformer(x_with_memory, mask=causal_mask)
+
+         global_memory_output = out[:, :self.g_mem_tokens, :]
+         memory_output = out[:, self.g_mem_tokens : self.g_mem_tokens + self.mem_tokens, :]
+         token_outputs = out[:, self.g_mem_tokens + self.mem_tokens:, :]
+
+         logits = self.fc_out(token_outputs)
+         new_memory = self.memory_gate(memory_output, prev_memory)
+         new_global_memory = self.global_memory_gate(token_outputs, global_memory_output, global_memory)
+
+         return logits, new_memory, new_global_memory
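A hedged usage sketch for the chunked forward pass, not part of this upload: the two memory states are threaded from chunk to chunk, and chunk_start_idx keeps the positional indices increasing across chunks (total length must stay within config.context_size). The zero-initialised memories are an assumption; the commit does not show how they are created for training or inference.

import torch
from configuration_sykollm import SykoConfig
from modeling_sykollm import SykoSLM

cfg = SykoConfig()
model = SykoSLM(cfg).eval()  # randomly initialised here; the trained weights live in model.safetensors

batch, total_len = 1, 4 * cfg.chunk_size  # 512 tokens, within cfg.context_size
tokens = torch.randint(0, cfg.vocab_size, (batch, total_len))

prev_memory = torch.zeros(batch, cfg.num_memory_tokens, cfg.d_model)           # assumed initial state
global_memory = torch.zeros(batch, cfg.num_global_memory_tokens, cfg.d_model)  # assumed initial state

with torch.no_grad():
    for start in range(0, total_len, cfg.chunk_size):
        chunk = tokens[:, start:start + cfg.chunk_size]
        logits, prev_memory, global_memory = model(
            chunk, prev_memory, global_memory, chunk_start_idx=start
        )
        # logits: (batch, chunk_size, vocab_size) for the current chunk only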
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "backend": "tokenizers",
+   "bos_token": "<bos>",
+   "eos_token": "<eos>",
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "<pad>",
+   "tokenizer_class": "TokenizersBackend",
+   "unk_token": "<unk>"
+ }
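The config points at the tokenizers backend, so the tokenizer.json from this commit can also be loaded directly with the tokenizers library; a minimal sketch, assuming the file has been downloaded to the working directory:

from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")
ids = tok.encode("hello world").ids
print(ids)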