SkillForge45 committed
Commit d666c9e · verified · 1 Parent(s): af8c3a1

Create model.py

Files changed (1)
  1. model.py +141 -0
model.py ADDED
@@ -0,0 +1,141 @@
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from datasets import load_dataset, concatenate_datasets
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders
import math

# --------------------------------------------------
# 1. Loading datasets from Hugging Face
# --------------------------------------------------
def load_hf_datasets():
    """Load and concatenate pretraining text corpora."""
    bookcorpus = load_dataset("bookcorpus", split="train")                 # ~11K books
    wiki = load_dataset("wikitext", "wikitext-103-raw-v1", split="train")  # Wikipedia
    fineweb = load_dataset("fineweb", split="train")
    arabic_raw_text = load_dataset("ARABIC-RAW-TEXT", split="train")
    tinybooks = load_dataset("tiny-textbooks", split="train")
    cc_trajectories = load_dataset("CC-Bench-trajectories", split="train")
    textbook = load_dataset("TextbookReasoning", split="train")
    megascience = load_dataset("MegaScience", split="train")
    # NOTE: several of these short names are likely incomplete Hub IDs (they normally
    # need the full "org/name" form), and concatenate_datasets requires identical
    # column schemas, so each dataset should first be reduced to a single "text"
    # column (e.g. via remove_columns / rename_column) before concatenation.
    return concatenate_datasets([
        bookcorpus, wiki, fineweb, arabic_raw_text, tinybooks,
        cc_trajectories, textbook, megascience,
    ])

# --------------------------------------------------
# 2. Tokenization (BPE)
# --------------------------------------------------
def train_tokenizer(dataset, vocab_size=30000):
    """Train a byte-level BPE tokenizer."""
    tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
    # A byte-level pre-tokenizer/decoder is needed for the byte-level BPE the
    # docstring promises; without a pre-tokenizer, merges would cross whitespace.
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()
    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]"]
    )

    # Train on the dataset texts in batches
    def batch_iterator(batch_size=1000):
        for i in range(0, len(dataset), batch_size):
            yield dataset[i:i + batch_size]["text"]

    tokenizer.train_from_iterator(batch_iterator(), trainer=trainer)
    return tokenizer

# --------------------------------------------------
# 3. Preparing the DataLoader
# --------------------------------------------------
class TextDataset(Dataset):
    """Sliding-window language-modeling dataset: y is x shifted by one token."""

    def __init__(self, encoded_text, seq_length=128):
        self.data = encoded_text
        self.seq_length = seq_length

    def __len__(self):
        return len(self.data) - self.seq_length

    def __getitem__(self, idx):
        x = self.data[idx:idx + self.seq_length]
        y = self.data[idx + 1:idx + self.seq_length + 1]
        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)

# --------------------------------------------------
# 4. Transformer Model
# --------------------------------------------------
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, d_model=512, nhead=8, num_layers=6):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        # batch_first=True so inputs are (batch, seq, d_model), matching the DataLoader
        encoder_layer = nn.TransformerEncoderLayer(
            d_model, nhead, dim_feedforward=d_model * 4, batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        # Scale embeddings by sqrt(d_model); math.sqrt avoids torch.sqrt on an integer tensor
        x = self.embedding(x) * math.sqrt(self.d_model)
        x = self.pos_encoder(x)
        # Causal mask so each position only attends to earlier tokens (language modeling)
        seq_len = x.size(1)
        mask = torch.triu(
            torch.full((seq_len, seq_len), float("-inf"), device=x.device), diagonal=1
        )
        x = self.transformer(x, mask=mask)
        return self.fc(x)

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x is (batch, seq, d_model); pe[:seq] broadcasts over the batch dimension
        return x + self.pe[:x.size(1), :]

# --------------------------------------------------
# 5. Training and Generation
# --------------------------------------------------
def main():
    # Configuration
    SEQ_LENGTH = 128
    BATCH_SIZE = 64
    VOCAB_SIZE = 30000
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

    # 1. Load data
    dataset = load_hf_datasets()

    # 2. Tokenization
    tokenizer = train_tokenizer(dataset, VOCAB_SIZE)
    # Tokenizer.encode expects a single string, not a list of documents, so encode
    # document by document (encode_batch) and concatenate the ids into one token stream.
    encoded_text = []
    for enc in tokenizer.encode_batch(dataset["text"]):
        encoded_text.extend(enc.ids)

    # 3. DataLoader
    train_dataset = TextDataset(encoded_text, SEQ_LENGTH)
    dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

    # 4. Model
    model = TransformerModel(VOCAB_SIZE).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
    criterion = nn.CrossEntropyLoss()

    # 5. Training
    model.train()
    for epoch in range(10):
        for batch_x, batch_y in dataloader:
            batch_x, batch_y = batch_x.to(DEVICE), batch_y.to(DEVICE)
            optimizer.zero_grad()
            logits = model(batch_x)  # (batch, seq, vocab)
            loss = criterion(logits.reshape(-1, VOCAB_SIZE), batch_y.reshape(-1))
            loss.backward()
            optimizer.step()
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

    # 6. Text generation
    def generate(prompt, max_length=100, temperature=0.7):
        model.eval()
        tokens = tokenizer.encode(prompt).ids
        for _ in range(max_length):
            with torch.no_grad():
                # Feed at most the last SEQ_LENGTH tokens back into the model
                logits = model(torch.tensor([tokens[-SEQ_LENGTH:]]).to(DEVICE))
                probs = torch.softmax(logits[0, -1] / temperature, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1).item()
            tokens.append(next_token)
        return tokenizer.decode(tokens)

    print(generate("The meaning of life is"))


if __name__ == "__main__":
    main()
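
As written, the script trains the tokenizer and the model in a single run and only prints one sample completion at the end; nothing is persisted. Below is a minimal sketch, not part of the committed file, of how the trained artifacts could be saved and reloaded so generation can be rerun without retraining. It assumes it runs in the same module as TransformerModel (or imports it from model.py) and that main() is adjusted to return the trained model and tokenizer; the file names model.pt and tokenizer.json are placeholders.

# Hypothetical persistence sketch (assumes access to TransformerModel from model.py).
import torch
from tokenizers import Tokenizer

def save_artifacts(model, tokenizer, model_path="model.pt", tokenizer_path="tokenizer.json"):
    tokenizer.save(tokenizer_path)              # tokenizers' native JSON format
    torch.save(model.state_dict(), model_path)  # weights only, not the architecture

def load_artifacts(vocab_size, model_path="model.pt", tokenizer_path="tokenizer.json", device="cpu"):
    tokenizer = Tokenizer.from_file(tokenizer_path)
    model = TransformerModel(vocab_size).to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()
    return model, tokenizer

Saving only the state_dict keeps the checkpoint small and portable; the architecture is reconstructed from the same TransformerModel class (and the same vocab_size) at load time.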