openagi-agi committed · Commit c96148d · verified · 1 Parent(s): 31aaf50

Upload 4 files

Files changed (5)
  1. .gitattributes +1 -0
  2. ckpt.pt +3 -0
  3. dataset_clean.txt +3 -0
  4. embed_test.py +202 -0
  5. test_emb_in.py +89 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+dataset_clean.txt filter=lfs diff=lfs merge=lfs -text
ckpt.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0359c69944bbacbcf74882bcd09ac65f0d43cb046777313c47188011246ff8da
+size 49830281
dataset_clean.txt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e9ed7430e51dded852a98d3672f80274d96e84ab81a5b45290e2c87de3478379
+size 529707835
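
Note: ckpt.pt and dataset_clean.txt are committed as Git LFS pointer files; the roughly 50 MB checkpoint and 530 MB dataset themselves live in LFS storage, and the three key/value lines above are all that enters Git history. A minimal sketch of reading that pointer metadata from Python, assuming the file on disk is still the pointer text rather than the LFS-resolved binary:

def parse_lfs_pointer(path):
    # Each pointer line is "key value": version, oid, size.
    meta = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            if key:
                meta[key] = value
    return meta

# e.g. parse_lfs_pointer("dataset_clean.txt")["size"] == "529707835"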
embed_test.py ADDED
@@ -0,0 +1,202 @@
+import os
+import torch
+from torch import nn
+from torch.optim import AdamW
+import torch.nn.functional as F
+from torch.utils.data import DataLoader, Dataset
+from tokenizers import Tokenizer, models, trainers, pre_tokenizers
+import math
+
+# =========================
+# Juicy variables
+# =========================
+DATA_PATH = "dataset_clean.txt"  # one text per line
+VOCAB_LIMIT = None  # None = all tokens, or int = cap vocab
+MODEL_DIM = 256
+NUM_LAYERS = 6
+NUM_HEADS = 4
+FF_DIM = 1024
+SEQ_LEN = 128
+
+BATCH_SIZE = 64
+LEARNING_RATE = 3e-4
+WEIGHT_DECAY = 0.01
+WARMUP_STEPS = 50
+MAX_STEPS = 100
+TEMPERATURE = 0.05
+
+OPTIMIZER = "adamw"  # "adamw" or "muon"
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+def estimate_params(vocab_size, model_dim, ff_dim, num_layers, seq_len):
+    # Embedding + positional
+    emb_params = vocab_size * model_dim
+    pos_params = seq_len * model_dim
+
+    # Per-layer Transformer block
+    # Attention projections (Q, K, V, O): 4 * d^2
+    attn_params = 4 * (model_dim ** 2)
+    # Feed-forward (two linear layers): 2 * d * ff_dim
+    ff_params = 2 * model_dim * ff_dim
+    # LayerNorms ~2 * d, negligible compared to above
+    per_layer = attn_params + ff_params
+
+    # Multiply by number of layers
+    encoder_params = num_layers * per_layer
+
+    total = emb_params + pos_params + encoder_params
+    return {
+        "embeddings": emb_params,
+        "positional": pos_params,
+        "encoder_layers": encoder_params,
+        "total": total
+    }
+
+# =========================
+
+# -------------------------
+# Build tokenizer from dataset
+# -------------------------
+def build_tokenizer(data_path, vocab_limit=None):
+    tokenizer = Tokenizer(models.WordLevel(unk_token="[UNK]"))
+    if vocab_limit is not None:
+        trainer = trainers.WordLevelTrainer(
+            vocab_size=vocab_limit,
+            min_frequency=1,
+            special_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
+        )
+    else:
+        trainer = trainers.WordLevelTrainer(
+            min_frequency=1,
+            special_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
+        )
+    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
+
+    with open(data_path, "r", encoding="utf-8") as f:
+        lines = [line.strip() for line in f if line.strip()]
+
+    tokenizer.train_from_iterator(lines, trainer=trainer)
+    os.makedirs("tokenizer", exist_ok=True)
+    tokenizer.save("tokenizer/tokenizer.json")
+    return tokenizer
+
+tokenizer = build_tokenizer(DATA_PATH, VOCAB_LIMIT)
+VOCAB_SIZE = tokenizer.get_vocab_size()
+print(f"[INFO] Custom vocab size: {VOCAB_SIZE}")
+
+est = estimate_params(VOCAB_SIZE, MODEL_DIM, FF_DIM, NUM_LAYERS, SEQ_LEN)
+print("Parameter estimate:")
+for k, v in est.items():
+    print(f"{k:15}: {v:,}")
+
+# -------------------------
+# Dataset wrapper
+# -------------------------
+class TextDataset(Dataset):
+    def __init__(self, path, tokenizer, seq_len):
+        with open(path, "r", encoding="utf-8") as f:
+            self.lines = [line.strip() for line in f if line.strip()]
+        self.tokenizer = tokenizer
+        self.seq_len = seq_len
+        self.pad_id = self.tokenizer.token_to_id("[PAD]")
+
+    def __len__(self):
+        return len(self.lines)
+
+    def __getitem__(self, idx):
+        tokens = self.tokenizer.encode(self.lines[idx]).ids
+        # pad / truncate
+        tokens = tokens[:self.seq_len]
+        tokens += [self.pad_id] * (self.seq_len - len(tokens))
+        return torch.tensor(tokens, dtype=torch.long)
+
+dataset = TextDataset(DATA_PATH, tokenizer, SEQ_LEN)
+loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
+
+# -------------------------
+# Transformer Encoder
+# -------------------------
+class TransformerEncoder(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.token_emb = nn.Embedding(VOCAB_SIZE, MODEL_DIM)
+        self.pos_emb = nn.Embedding(SEQ_LEN, MODEL_DIM)
+
+        encoder_layer = nn.TransformerEncoderLayer(
+            d_model=MODEL_DIM,
+            nhead=NUM_HEADS,
+            dim_feedforward=FF_DIM,
+            activation="gelu",
+            batch_first=True
+        )
+        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=NUM_LAYERS)
+        self.norm = nn.LayerNorm(MODEL_DIM)
+
+    def forward(self, x):
+        positions = torch.arange(0, x.size(1), device=x.device).unsqueeze(0)
+        h = self.token_emb(x) + self.pos_emb(positions)
+        h = self.encoder(h)
+        h = self.norm(h)
+        return h.mean(dim=1)  # pooled embedding
+
+# -------------------------
+# Contrastive loss
+# -------------------------
+def contrastive_loss(z1, z2, temperature=TEMPERATURE):
+    z1 = F.normalize(z1, dim=1)
+    z2 = F.normalize(z2, dim=1)
+    logits = z1 @ z2.t() / temperature
+    labels = torch.arange(z1.size(0), device=z1.device)
+    return F.cross_entropy(logits, labels)
+
+# -------------------------
+# Setup
+# -------------------------
+model = TransformerEncoder().to(DEVICE)
+
+if OPTIMIZER == "adamw":
+    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
+elif OPTIMIZER == "muon":
+    from muon import Muon
+    optimizer = Muon(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
+else:
+    raise ValueError("Invalid optimizer")
+
+def lr_lambda(step):
+    if step < WARMUP_STEPS:
+        return float(step) / float(max(1, WARMUP_STEPS))
+    progress = float(step - WARMUP_STEPS) / float(max(1, MAX_STEPS - WARMUP_STEPS))
+    return 0.5 * (1.0 + math.cos(math.pi * progress))
+
+scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
+
+# -------------------------
+# Training loop
+# -------------------------
+step = 0
+while step < MAX_STEPS:
+    for batch in loader:
+        if step >= MAX_STEPS:
+            break
+
+        x = batch.to(DEVICE)
+        # "Augment": two forward passes of the same batch; the encoder's dropout
+        # (active in train mode) is what makes the two views differ.
+        # Replace with explicit input noise if you want a stronger augmentation.
+        z1 = model(x)
+        z2 = model(x)
+
+        loss = contrastive_loss(z1, z2)
+
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+        scheduler.step()
+
+        if step % 100 == 0:
+            print(f"Step {step}: loss={loss.item():.4f}, lr={scheduler.get_last_lr()[0]:.6f}")
+
+        step += 1
+
+print("[DONE] Training complete")
+print("[INFO] Saving model...")
+torch.save(model.state_dict(), "ckpt.pt")
+print("[DONE] Model saved to ckpt.pt")
test_emb_in.py ADDED
@@ -0,0 +1,89 @@
+import torch
+import torch.nn.functional as F
+from tokenizers import Tokenizer
+
+# =========================
+# Juicy variables
+# =========================
+CHECKPOINT_PATH = "ckpt.pt"
+TOKENIZER_PATH = "tokenizer/tokenizer.json"
+SEQ_LEN = 128
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+# =========================
+# Load tokenizer
+# =========================
+tokenizer = Tokenizer.from_file(TOKENIZER_PATH)
+pad_id = tokenizer.token_to_id("[PAD]")
+
+def encode_sentences(sentences):
+    ids = []
+    for s in sentences:
+        tokens = tokenizer.encode(s).ids
+        tokens = tokens[:SEQ_LEN]
+        tokens += [pad_id] * (SEQ_LEN - len(tokens))
+        ids.append(tokens)
+    return torch.tensor(ids, dtype=torch.long, device=DEVICE)
+
+# =========================
+# Model (must match training definition)
+# =========================
+class TransformerEncoder(torch.nn.Module):
+    def __init__(self, vocab_size, model_dim=256, num_layers=6, num_heads=4, ff_dim=1024, seq_len=128):
+        super().__init__()
+        self.token_emb = torch.nn.Embedding(vocab_size, model_dim)
+        self.pos_emb = torch.nn.Embedding(seq_len, model_dim)
+
+        encoder_layer = torch.nn.TransformerEncoderLayer(
+            d_model=model_dim,
+            nhead=num_heads,
+            dim_feedforward=ff_dim,
+            activation="gelu",
+            batch_first=True
+        )
+        self.encoder = torch.nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
+        self.norm = torch.nn.LayerNorm(model_dim)
+
+    def forward(self, x):
+        positions = torch.arange(0, x.size(1), device=x.device).unsqueeze(0)
+        h = self.token_emb(x) + self.pos_emb(positions)
+        h = self.encoder(h)
+        h = self.norm(h)
+        return h.mean(dim=1)  # pooled embedding
+
+# =========================
+# Load checkpoint
+# =========================
+VOCAB_SIZE = tokenizer.get_vocab_size()
+model = TransformerEncoder(vocab_size=VOCAB_SIZE).to(DEVICE)
+model.load_state_dict(torch.load(CHECKPOINT_PATH, map_location=DEVICE))
+model.eval()
+
+print("[INFO] Model loaded.")
+
+# =========================
+# Test sentences
+# =========================
+sentences = [
+    "The quick brown fox jumps over the lazy dog.",
+    "Neural networks are changing artificial intelligence.",
+    "I love eating pizza on weekends.",
+    "Quantum physics is hard but fascinating.",
+]
+
+inputs = encode_sentences(sentences)
+with torch.no_grad():
+    embeddings = model(inputs)
+
+# Normalize for cosine sim
+embeddings = F.normalize(embeddings, dim=1)
+
+print("\nEmbeddings:")
+for s, e in zip(sentences, embeddings):
+    print(f"{s}\n -> {e[:5].cpu().numpy()}...")  # show first 5 dims
+
+print("\nCosine similarities:")
+sims = embeddings @ embeddings.T
+for i in range(len(sentences)):
+    row = ["{:.2f}".format(x.item()) for x in sims[i]]
+    print(f"{i}: {row}")