teszenofficial committed
Commit 6e0266a · verified · 1 parent: 808d9a4

Upload 5 files

Files changed (5):
  1. config.yaml +24 -0
  2. model.py +202 -0
  3. mtp_mini.pkl +3 -0
  4. mtp_tokenizer.model +3 -0
  5. tokenizer.py +138 -0
config.yaml ADDED
@@ -0,0 +1,24 @@
+ # MTP Mini Configuration - Optimized
+
+ model:
+   vocab_size: 4000
+   d_model: 384          # increased from 256 for more capacity
+   n_layers: 6           # increased from 4 for more depth
+   n_heads: 6            # increased from 4 for better attention
+   d_ff: 1536            # increased from 1024 (4x d_model)
+   max_seq_len: 256      # increased from 128 for longer context
+   dropout: 0.1
+
+ training:
+   batch_size: 8         # increased from 4 for better gradients
+   epochs: 50            # increased from 20 for more training
+   learning_rate: 0.0001 # reduced from 0.0003 for stability
+   weight_decay: 0.01
+   max_grad_norm: 0.5    # reduced from 1.0 for better stability
+   num_threads: 4
+   save_every: 10
+   warmup_steps: 100     # new: learning-rate warmup
+   use_lr_scheduler: true # new: learning-rate decay
+
+ data:
+   corpus_path: corpus/mtp_mini_corpus.jsonl
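
For reference, a minimal sketch of how this configuration might be consumed (illustrative, not part of this commit; assumes PyYAML is installed and the script runs from the repo root):

import yaml  # assumption: PyYAML available (pip install pyyaml)

with open("config.yaml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

model_cfg = cfg["model"]      # vocab_size, d_model, n_layers, n_heads, d_ff, max_seq_len, dropout
train_cfg = cfg["training"]   # batch_size, epochs, learning_rate, ...

print(model_cfg["d_model"])         # 384
print(train_cfg["learning_rate"])   # 0.0001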
model.py ADDED
@@ -0,0 +1,202 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import math
+
+
+ class MultiHeadSelfAttention(nn.Module):
+     """Multi-Head Self-Attention mechanism"""
+
+     def __init__(self, d_model, n_heads, dropout=0.1):
+         super().__init__()
+         assert d_model % n_heads == 0
+
+         self.d_model = d_model
+         self.n_heads = n_heads
+         self.d_k = d_model // n_heads
+
+         self.q_linear = nn.Linear(d_model, d_model)
+         self.k_linear = nn.Linear(d_model, d_model)
+         self.v_linear = nn.Linear(d_model, d_model)
+         self.out_linear = nn.Linear(d_model, d_model)
+
+         self.dropout = nn.Dropout(dropout)
+
+     def forward(self, x, mask=None):
+         batch_size, seq_len, d_model = x.size()
+
+         # Linear projections
+         Q = self.q_linear(x).view(batch_size, seq_len, self.n_heads, self.d_k).transpose(1, 2)
+         K = self.k_linear(x).view(batch_size, seq_len, self.n_heads, self.d_k).transpose(1, 2)
+         V = self.v_linear(x).view(batch_size, seq_len, self.n_heads, self.d_k).transpose(1, 2)
+
+         # Scaled dot-product attention
+         scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
+
+         if mask is not None:
+             scores = scores.masked_fill(mask == 0, float('-inf'))
+
+         attn_weights = F.softmax(scores, dim=-1)
+         attn_weights = self.dropout(attn_weights)
+
+         context = torch.matmul(attn_weights, V)
+         context = context.transpose(1, 2).contiguous().view(batch_size, seq_len, d_model)
+
+         output = self.out_linear(context)
+         return output
+
+
+ class FeedForward(nn.Module):
+     """Position-wise Feed-Forward Network"""
+
+     def __init__(self, d_model, d_ff, dropout=0.1):
+         super().__init__()
+         self.linear1 = nn.Linear(d_model, d_ff)
+         self.linear2 = nn.Linear(d_ff, d_model)
+         self.dropout = nn.Dropout(dropout)
+
+     def forward(self, x):
+         return self.linear2(self.dropout(F.gelu(self.linear1(x))))
+
+
+ class TransformerBlock(nn.Module):
+     """Single Transformer Decoder Block"""
+
+     def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
+         super().__init__()
+         self.attention = MultiHeadSelfAttention(d_model, n_heads, dropout)
+         self.feed_forward = FeedForward(d_model, d_ff, dropout)
+         self.ln1 = nn.LayerNorm(d_model)
+         self.ln2 = nn.LayerNorm(d_model)
+         self.dropout1 = nn.Dropout(dropout)
+         self.dropout2 = nn.Dropout(dropout)
+
+     def forward(self, x, mask=None):
+         # Self-attention with residual connection
+         attn_output = self.attention(self.ln1(x), mask)
+         x = x + self.dropout1(attn_output)
+
+         # Feed-forward with residual connection
+         ff_output = self.feed_forward(self.ln2(x))
+         x = x + self.dropout2(ff_output)
+
+         return x
+
+
+ class MTPMiniModel(nn.Module):
+     """MTP Mini - GPT-style Transformer Language Model"""
+
+     def __init__(self, vocab_size, d_model=256, n_layers=4, n_heads=4,
+                  d_ff=1024, max_seq_len=128, dropout=0.1):
+         super().__init__()
+
+         self.vocab_size = vocab_size
+         self.d_model = d_model
+         self.max_seq_len = max_seq_len
+
+         # Token embeddings
+         self.token_embedding = nn.Embedding(vocab_size, d_model)
+
+         # Positional embeddings (learnable)
+         self.position_embedding = nn.Embedding(max_seq_len, d_model)
+
+         # Transformer blocks
+         self.blocks = nn.ModuleList([
+             TransformerBlock(d_model, n_heads, d_ff, dropout)
+             for _ in range(n_layers)
+         ])
+
+         # Final layer norm
+         self.ln_f = nn.LayerNorm(d_model)
+
+         # Output projection to vocabulary
+         self.lm_head = nn.Linear(d_model, vocab_size, bias=False)
+
+         # Weight tying
+         self.lm_head.weight = self.token_embedding.weight
+
+         self.dropout = nn.Dropout(dropout)
+
+         # Initialize weights
+         self.apply(self._init_weights)
+
+     def _init_weights(self, module):
+         if isinstance(module, nn.Linear):
+             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+             if module.bias is not None:
+                 torch.nn.init.zeros_(module.bias)
+         elif isinstance(module, nn.Embedding):
+             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+         elif isinstance(module, nn.LayerNorm):
+             torch.nn.init.zeros_(module.bias)
+             torch.nn.init.ones_(module.weight)
+
+     def forward(self, input_ids, targets=None):
+         batch_size, seq_len = input_ids.size()
+
+         # Create causal mask
+         mask = torch.tril(torch.ones(seq_len, seq_len, device=input_ids.device)).view(1, 1, seq_len, seq_len)
+
+         # Token embeddings + positional embeddings
+         positions = torch.arange(0, seq_len, device=input_ids.device).unsqueeze(0)
+         tok_emb = self.token_embedding(input_ids)
+         pos_emb = self.position_embedding(positions)
+         x = self.dropout(tok_emb + pos_emb)
+
+         # Pass through transformer blocks
+         for block in self.blocks:
+             x = block(x, mask)
+
+         # Final layer norm
+         x = self.ln_f(x)
+
+         # Project to vocabulary
+         logits = self.lm_head(x)
+
+         # Calculate loss if targets provided
+         loss = None
+         if targets is not None:
+             loss = F.cross_entropy(logits.view(-1, self.vocab_size), targets.view(-1))
+
+         return logits, loss
+
+     def generate(self, input_ids, max_new_tokens=50, temperature=1.0, top_k=50, top_p=0.9):
+         """Autoregressive generation with sampling"""
+         self.eval()
+
+         with torch.no_grad():
+             for _ in range(max_new_tokens):
+                 # Crop to max_seq_len
+                 input_ids_cond = input_ids if input_ids.size(1) <= self.max_seq_len else input_ids[:, -self.max_seq_len:]
+
+                 # Forward pass
+                 logits, _ = self(input_ids_cond)
+                 logits = logits[:, -1, :] / temperature
+
+                 # Top-k filtering
+                 if top_k > 0:
+                     v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+                     logits[logits < v[:, [-1]]] = float('-inf')
+
+                 # Top-p (nucleus) filtering
+                 if top_p < 1.0:
+                     sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+                     cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
+                     sorted_indices_to_remove = cumulative_probs > top_p
+                     sorted_indices_to_remove[:, 1:] = sorted_indices_to_remove[:, :-1].clone()
+                     sorted_indices_to_remove[:, 0] = 0
+                     indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
+                     logits[indices_to_remove] = float('-inf')
+
+                 # Sample from distribution
+                 probs = F.softmax(logits, dim=-1)
+                 next_token = torch.multinomial(probs, num_samples=1)
+
+                 # Append to sequence
+                 input_ids = torch.cat([input_ids, next_token], dim=1)
+
+         return input_ids
+
+     def count_parameters(self):
+         """Count trainable parameters"""
+         return sum(p.numel() for p in self.parameters() if p.requires_grad)
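
A usage sketch (illustrative, not part of this commit): building MTPMiniModel with the hyperparameters from config.yaml above and sampling a short continuation. The prompt token IDs are placeholders; with these settings the tied-embedding model should come out to roughly 12M trainable parameters.

import torch
from model import MTPMiniModel  # assumption: run from the repo root

# Hyperparameters mirror config.yaml above
model = MTPMiniModel(vocab_size=4000, d_model=384, n_layers=6,
                     n_heads=6, d_ff=1536, max_seq_len=256, dropout=0.1)
print(f"{model.count_parameters():,} trainable parameters")

prompt = torch.randint(0, 4000, (1, 8))  # placeholder token IDs
out = model.generate(prompt, max_new_tokens=20, temperature=0.8, top_k=50, top_p=0.9)
print(out.shape)  # (1, 28): the 8 prompt tokens plus 20 sampled tokens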
mtp_mini.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e68560574df89b94d55dde621ee671699f987e552ec1c2c05684d94c838a8992
+ size 54245198
mtp_tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1c6a030518205092295cf9166d47ad232e97e0cbec03c2044cb3b8ac4a9f0392
+ size 56484
tokenizer.py ADDED
@@ -0,0 +1,138 @@
+ import sentencepiece as spm
+ import os
+ import json
+
+
+ class MTPTokenizer:
+     """Tokenizer using SentencePiece BPE"""
+
+     def __init__(self, model_path=None):
+         self.sp = None
+         self.model_path = model_path
+
+         if model_path and os.path.exists(model_path):
+             self.load(model_path)
+
+     def train(self, corpus_path, vocab_size=4000, model_prefix='mtp_tokenizer'):
+         """Train SentencePiece BPE tokenizer on corpus"""
+
+         # Extract text from JSONL corpus
+         texts = []
+         with open(corpus_path, 'r', encoding='utf-8') as f:
+             for line in f:
+                 data = json.loads(line)
+                 if 'instruction' in data:
+                     texts.append(data['instruction'])
+                 if 'response' in data:
+                     texts.append(data['response'])
+
+         # Save temporary text file
+         temp_file = 'temp_corpus.txt'
+         with open(temp_file, 'w', encoding='utf-8') as f:
+             f.write('\n'.join(texts))
+
+         # Calculate optimal vocab size based on corpus
+         total_chars = sum(len(text) for text in texts)
+         max_vocab = min(vocab_size, int(total_chars * 0.15))  # Heuristic: ~15% of chars
+
+         print(f" → Corpus stats: {len(texts)} texts, {total_chars} characters")
+         print(f" → Adjusted vocab size: {max_vocab} (requested: {vocab_size})")
+
+         # Train SentencePiece with adjusted parameters
+         try:
+             spm.SentencePieceTrainer.train(
+                 input=temp_file,
+                 model_prefix=model_prefix,
+                 vocab_size=max_vocab,
+                 model_type='bpe',
+                 pad_id=0,
+                 unk_id=1,
+                 bos_id=2,
+                 eos_id=3,
+                 character_coverage=1.0,
+                 normalization_rule_name='identity',
+                 num_threads=4,
+                 split_digits=True,
+                 allow_whitespace_only_pieces=False,
+                 byte_fallback=False,
+                 max_sentencepiece_length=16
+             )
+         except RuntimeError as e:
+             if "Vocabulary size too high" in str(e):
+                 # Extract suggested max from error and retry
+                 import re
+                 match = re.search(r'value <= (\d+)', str(e))
+                 if match:
+                     suggested_max = int(match.group(1))
+                     print(f" → Retrying with vocab size: {suggested_max}")
+                     spm.SentencePieceTrainer.train(
+                         input=temp_file,
+                         model_prefix=model_prefix,
+                         vocab_size=suggested_max,
+                         model_type='bpe',
+                         pad_id=0,
+                         unk_id=1,
+                         bos_id=2,
+                         eos_id=3,
+                         character_coverage=1.0,
+                         normalization_rule_name='identity',
+                         num_threads=4,
+                         split_digits=True,
+                         allow_whitespace_only_pieces=False,
+                         byte_fallback=False,
+                         max_sentencepiece_length=16
+                     )
+                 else:
+                     raise
+             else:
+                 raise
+
+         # Clean up
+         os.remove(temp_file)
+
+         # Load the trained model
+         self.model_path = f"{model_prefix}.model"
+         self.load(self.model_path)
+
+         print(f"✓ Tokenizer trained: {self.vocab_size()} tokens")
+         print(f"✓ Model saved: {self.model_path}")
+
+     def load(self, model_path):
+         """Load trained tokenizer"""
+         self.sp = spm.SentencePieceProcessor()
+         self.sp.load(model_path)
+         self.model_path = model_path
+
+     def encode(self, text):
+         """Encode text to token IDs"""
+         if self.sp is None:
+             raise ValueError("Tokenizer not loaded. Train or load a model first.")
+         return self.sp.encode_as_ids(text)
+
+     def decode(self, ids):
+         """Decode token IDs to text"""
+         if self.sp is None:
+             raise ValueError("Tokenizer not loaded. Train or load a model first.")
+         return self.sp.decode_ids(ids)
+
+     def vocab_size(self):
+         """Get vocabulary size"""
+         if self.sp is None:
+             return 0
+         return self.sp.get_piece_size()
+
+     def bos_id(self):
+         """Beginning of sentence token ID"""
+         return self.sp.bos_id()
+
+     def eos_id(self):
+         """End of sentence token ID"""
+         return self.sp.eos_id()
+
+     def pad_id(self):
+         """Padding token ID"""
+         return self.sp.pad_id()
+
+     def unk_id(self):
+         """Unknown token ID"""
+         return self.sp.unk_id()
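
A short round-trip sketch (illustrative, not part of this commit), assuming the shipped mtp_tokenizer.model sits next to the script; the sample string is arbitrary:

from tokenizer import MTPTokenizer

tok = MTPTokenizer(model_path="mtp_tokenizer.model")  # __init__ loads the model if the file exists
ids = tok.encode("hello world")    # list of token IDs
text = tok.decode(ids)             # back to "hello world"
print(tok.vocab_size(), tok.bos_id(), tok.eos_id(), tok.pad_id(), tok.unk_id())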