import time
import torch
import torch.nn as nn
import torch.nn.functional as F
# --- Configuration & Data ---
# Training corpus: Hamlet's "To be" soliloquy — a tiny character-level dataset.
data = """To be, or not to be, that is the question:
Whether 'tis nobler in the mind to suffer
The slings and arrows of outrageous fortune,
Or to take arms against a sea of troubles
And by opposing end them. To die—to sleep,
No more; and by a sleep to say we end
The heart-ache and the thousand natural shocks
That flesh is heir to: 'tis a consummation
Devoutly to be wish'd. To die, to sleep;
To sleep, perchance to dream—ay, there's the rub:
For in that sleep of death what dreams may come,
When we have shuffled off this mortal coil,
Must give us pause—there's the respect
That makes calamity of so long life.
For who would bear the whips and scorns of time,
Th'oppressor's wrong, the proud man's contumely,
The pangs of dispriz'd love, the law's delay,
The insolence of office, and the spurns
That patient merit of th'unworthy takes,
When he himself might his quietus make
With a bare bodkin? Who would fardels bear,
To grunt and sweat under a weary life,
But that the dread of something after death,
The undiscovere'd country, from whose bourn
No traveller returns, puzzles the will,
And makes us rather bear those ills we have
Than fly to others that we know not of?
Thus conscience doth make cowards of us all,
And thus the native hue of resolution
Is sicklied o'er with the pale cast of thought,
And enterprises of great pith and moment
With this regard their currents turn awry
And lose the name of action."""
# Character-level vocabulary: every distinct character in the corpus, sorted
# so the char<->id mapping is deterministic across runs.
chars = sorted(list(set(data)))
vocab_size = len(chars)
# Lookup tables: string-to-int and int-to-string.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}
# Entire corpus encoded as a 1-D LongTensor of character ids.
encoded = torch.tensor([stoi[c] for c in data], dtype=torch.long)
# Hyperparameters based on your architecture
D_MODEL = 256       # embedding / residual-stream width
N_LAYERS = 4        # number of Blocks in the model
MAX_SEQ_LEN = 64    # context window (also sizes the positional embedding table)
LOCAL_K = 5         # kernel width of the local depthwise conv
GLOBAL_K = 128      # kernel length of the FFT-based global conv
FFT_SIZE = 256      # FFT block length used by GlobalConv1D (must exceed GLOBAL_K)
TRAIN_TIME = 60     # wall-clock training budget, in seconds
BATCH_SIZE = 8
# --- Architecture Components ---
class GlobalConv1D(nn.Module):
    """Causal depthwise long convolution computed via FFT overlap-save.

    Each of the d_model channels has its own learned 1-D kernel of length
    kernel_size. The input is left-padded so the output at time t depends
    only on inputs <= t, and the sequence is processed in fixed-size FFT
    blocks (overlap-save) so long kernels stay affordable.
    """

    def __init__(self, d_model, kernel_size, fft_size):
        super().__init__()
        # One kernel per channel; small init keeps the residual branch near zero at start.
        self.kernel = nn.Parameter(torch.randn(d_model, kernel_size) * 0.01)
        self.kernel_size = kernel_size
        self.fft_size = fft_size

    def forward(self, x):
        # x: (B, C, T) channels-first sequence.
        B, C, T = x.shape
        # Use a truncated kernel when the sequence is shorter than kernel_size.
        K = min(self.kernel_size, T)
        overlap = K - 1                   # history samples each block must re-read
        block = self.fft_size - overlap   # valid output samples produced per FFT block
        # Left-pad with zeros so the convolution is causal.
        x = F.pad(x, (overlap, 0))
        k = self.kernel[:, :K]
        # Zero-pad the kernel to the FFT length and transform it once, reused for all blocks.
        k = F.pad(k, (0, self.fft_size - K))
        k_f = torch.fft.rfft(k, n=self.fft_size)
        outs = []
        pos = 0
        # Overlap-save loop: consecutive segments overlap by `overlap` samples;
        # the first `overlap` outputs of each circular convolution are wrap-around
        # artifacts and are discarded below.
        while pos < T:
            seg = x[..., pos:pos + self.fft_size]
            if seg.shape[-1] < self.fft_size:
                # Final partial segment: pad on the right up to the FFT length.
                seg = F.pad(seg, (0, self.fft_size - seg.shape[-1]))
            # Pointwise product in the frequency domain == circular convolution in time.
            y = torch.fft.irfft(torch.fft.rfft(seg, n=self.fft_size) * k_f.unsqueeze(0), n=self.fft_size)
            # Keep only the linear-convolution-valid region of this block.
            outs.append(y[..., overlap:overlap + block])
            pos += block
        # Concatenate valid chunks and trim any tail padding back to length T.
        return torch.cat(outs, dim=-1)[..., :T]
class LocalConv1D(nn.Module):
    """Causal short-range depthwise-separable convolution.

    A depthwise Conv1d (one filter per channel, width k) followed by a ReLU
    and a pointwise (1x1) channel-mixing convolution. The input is
    left-padded by k-1 so position t never sees positions after t.
    """

    def __init__(self, d_model, k):
        super().__init__()
        self.k = k
        # Depthwise stage: groups == channels, so each channel is filtered independently.
        self.dw = nn.Conv1d(d_model, d_model, k, groups=d_model)
        # Pointwise stage mixes information across channels.
        self.pw = nn.Conv1d(d_model, d_model, 1)

    def forward(self, x):
        # Shift the receptive field entirely into the past (causal padding).
        padded = F.pad(x, (self.k - 1, 0))
        hidden = F.relu(self.dw(padded))
        return self.pw(hidden)
class Block(nn.Module):
    """One residual stage: local conv, optional global conv, then feed-forward.

    Every sub-layer is pre-normalized (LayerNorm before the op) and added back
    to the stream as a residual. The conv modules are channels-first, so the
    (B, T, C) stream is transposed around each conv call.
    """

    def __init__(self, d_model, use_global):
        super().__init__()
        self.use_global = use_global
        self.ln1 = nn.LayerNorm(d_model)
        self.local = LocalConv1D(d_model, LOCAL_K)
        if use_global:
            # Only some blocks carry the expensive long-range convolution.
            self.ln2 = nn.LayerNorm(d_model)
            self.global_conv = GlobalConv1D(d_model, GLOBAL_K, FFT_SIZE)
        self.ln3 = nn.LayerNorm(d_model)
        self.ff = nn.Sequential(
            nn.Linear(d_model, d_model * 4),
            nn.GELU(),
            nn.Linear(d_model * 4, d_model),
        )

    def forward(self, x):
        def conv_branch(module, stream):
            # Apply a channels-first conv module to a (B, T, C) tensor.
            return module(stream.transpose(1, 2)).transpose(1, 2)

        x = x + conv_branch(self.local, self.ln1(x))
        if self.use_global:
            x = x + conv_branch(self.global_conv, self.ln2(x))
        x = x + self.ff(self.ln3(x))
        return x
class GCLM(nn.Module):
    """Convolutional character language model.

    Token + learned positional embeddings feed a stack of Blocks; a final
    LayerNorm and a linear head (weight-tied to the embedding) produce
    per-position vocabulary logits.
    """

    def __init__(self, vocab):
        super().__init__()
        self.emb = nn.Embedding(vocab, D_MODEL)
        self.pos = nn.Embedding(MAX_SEQ_LEN, D_MODEL)
        # Even-indexed layers additionally carry the long-range global convolution.
        self.layers = nn.ModuleList([Block(D_MODEL, i % 2 == 0) for i in range(N_LAYERS)])
        self.ln = nn.LayerNorm(D_MODEL)
        self.head = nn.Linear(D_MODEL, vocab)
        self.head.weight = self.emb.weight  # Weight Tying

    def forward(self, x):
        # x: (B, T) token ids; T must not exceed MAX_SEQ_LEN.
        seq_len = x.size(1)
        positions = torch.arange(seq_len, device=x.device)
        h = self.emb(x) + self.pos(positions)
        for block in self.layers:
            h = block(h)
        h = self.ln(h)
        return self.head(h)  # (B, T, vocab) logits
# --- Training Setup ---
device = "cuda" if torch.cuda.is_available() else "cpu"
model = GCLM(vocab_size).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
print(f"Training on {device} for {TRAIN_TIME} seconds...")
start_time = time.time()
step = 0
model.train()
# Wall-clock-budgeted training: loop until TRAIN_TIME seconds have elapsed.
while (time.time() - start_time) < TRAIN_TIME:
    # Random batching
    # Sample BATCH_SIZE random window starts; targets are the inputs shifted by one.
    ix = torch.randint(0, len(encoded) - MAX_SEQ_LEN, (BATCH_SIZE,))
    x = torch.stack([encoded[i : i + MAX_SEQ_LEN] for i in ix]).to(device)
    y = torch.stack([encoded[i + 1 : i + MAX_SEQ_LEN + 1] for i in ix]).to(device)
    logits = model(x)
    # Flatten (B, T, V) logits and (B, T) targets for per-token cross-entropy.
    loss = F.cross_entropy(logits.view(-1, vocab_size), y.view(-1))
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    if step % 10 == 0:
        elapsed = time.time() - start_time
        # \r rewrites the same console line as a lightweight progress display.
        print(f"\rStep {step} | Loss: {loss.item():.4f} | Progress: {min(100, (elapsed/TRAIN_TIME)*100):.1f}%", end="")
    step += 1
# --- Generation ---
print("\n\nTraining Complete. Generating:\n" + "-"*30)
model.eval()
prompt = "To be, "
# Encode the prompt as a (1, len) batch of token ids.
ctx = torch.tensor([[stoi[c] for c in prompt]], dtype=torch.long, device=device)
print(prompt, end="", flush=True)
with torch.no_grad():
    # Autoregressive sampling: append one character per iteration.
    for _ in range(MAX_SEQ_LEN * 2):
        # Crop context to model's MAX_SEQ_LEN
        inp = ctx[:, -MAX_SEQ_LEN:]
        logits = model(inp)
        # Take the logits for the last position only and sharpen the distribution.
        logits = logits[:, -1, :] / 0.8 # Temperature
        # Simple top-k to keep it clean
        v, _ = torch.topk(logits, min(10, vocab_size))
        # Mask everything below the k-th largest logit before softmax.
        logits[logits < v[:, [-1]]] = -float('Inf')
        probs = F.softmax(logits, dim=-1)
        next_char_idx = torch.multinomial(probs, num_samples=1)
        ctx = torch.cat((ctx, next_char_idx), dim=1)
        print(itos[next_char_idx.item()], end="", flush=True)
print("\n" + "-"*30)