vesakkivignesh commited on
Commit
d7d8e66
·
0 Parent(s):

Initial clean Space commit

Browse files
Files changed (4) hide show
  1. .gitignore +1 -0
  2. bdh.py +171 -0
  3. requirements.txt +3 -0
  4. train.py +126 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ input.txt
bdh.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 Pathway Technology, Inc.
2
+
3
+ import dataclasses
4
+ import math
5
+
6
+ import torch
7
+ import torch.nn.functional as F
8
+ from torch import nn
9
+
10
+
11
@dataclasses.dataclass
class BDHConfig:
    """Hyperparameters for the BDH model.

    Defaults describe a small byte-level model; ``vocab_size=256`` covers
    every possible raw byte, so no tokenizer is needed.
    """

    n_layer: int = 6  # number of times BDH.forward applies its (weight-shared) block
    n_embd: int = 256  # residual-stream width D
    dropout: float = 0.1  # dropout applied to the gated sparse activations
    n_head: int = 4  # attention heads
    mlp_internal_dim_multiplier: int = 128  # per-head latent dim N = multiplier * D // n_head
    vocab_size: int = 256  # byte-level vocabulary
19
+
20
+
21
def get_freqs(n, theta, dtype):
    """Return ``n`` rotary frequencies, expressed in cycles ("turns") per step.

    Positions are snapped down to even indices so that each consecutive
    (even, odd) channel pair shares one frequency; the familiar
    ``1 / theta**(k/n)`` spectrum is then divided by ``2*pi`` because
    callers track phases in turns rather than radians.
    """
    channel = torch.arange(0, n, 1, dtype=dtype)
    # Snap each channel index down to the nearest even value (pairing step).
    channel = (channel / 2).floor() * 2
    return 1.0 / (theta ** (channel / n)) / (2 * math.pi)
30
+
31
+
32
class Attention(torch.nn.Module):
    """Unnormalized, strictly-causal attention over rotary-encoded codes.

    BDH ties keys to queries (``forward`` asserts ``K is Q``) and uses raw
    score products with the diagonal excluded, so position ``t`` attends
    only to positions ``< t``.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        heads = config.n_head
        width = config.n_embd
        latent = config.mlp_internal_dim_multiplier * width // heads
        # Rotation frequencies in turns; a Buffer follows the module across
        # devices without being treated as a trainable parameter.
        self.freqs = torch.nn.Buffer(
            get_freqs(latent, theta=2**16, dtype=torch.float32).view(1, 1, 1, latent)
        )

    @staticmethod
    def phases_cos_sin(phases):
        """Map phases given in turns to (cos, sin) of the angle in radians."""
        radians = (phases % 1) * (2 * math.pi)
        return torch.cos(radians), torch.sin(radians)

    @staticmethod
    def rope(phases, v):
        """Rotate consecutive channel pairs of ``v`` by ``phases`` (in turns)."""
        # Interleaved rotate-half: (x_even, x_odd) -> (-x_odd, x_even).
        rotated = torch.stack((-v[..., 1::2], v[..., ::2]), dim=-1).view(*v.size())
        cos, sin = Attention.phases_cos_sin(phases)
        return (v * cos).to(v.dtype) + (rotated * sin).to(v.dtype)

    def forward(self, Q, K, V):
        assert self.freqs.dtype == torch.float32
        assert K is Q  # BDH shares queries and keys
        T = Q.size(2)

        # Phase for every (position, channel): position index times frequency.
        positions = torch.arange(
            0,
            T,
            device=self.freqs.device,
            dtype=self.freqs.dtype,
        ).view(1, 1, -1, 1)
        QR = self.rope(positions * self.freqs, Q)
        KR = QR  # keys are queries, so their rotation is identical

        # Strictly-causal scores: tril(diagonal=-1) zeroes the diagonal too,
        # so a token never attends to itself.
        scores = (QR @ KR.mT).tril(diagonal=-1)
        return scores @ V
75
+
76
+
77
class BDH(nn.Module):
    """Byte-level BDH language model.

    One set of encoder/decoder parameters is reused for every one of the
    ``n_layer`` iterations in ``forward`` (the loop below indexes no
    per-layer weights), making the block weight-shared / recurrent.
    """

    def __init__(self, config: BDHConfig):
        super().__init__()
        assert config.vocab_size is not None
        self.config = config
        nh = config.n_head
        D = config.n_embd
        # Per-head sparse latent dimension.
        N = config.mlp_internal_dim_multiplier * D // nh
        # Maps concatenated sparse activations (nh * N) back to width D.
        self.decoder = nn.Parameter(torch.zeros((nh * N, D)).normal_(std=0.02))
        # Per-head projection D -> N used for the query/key sparse codes.
        self.encoder = nn.Parameter(torch.zeros((nh, D, N)).normal_(std=0.02))

        self.attn = Attention(config)

        # Parameter-free LayerNorm, reused at several points in forward.
        self.ln = nn.LayerNorm(D, elementwise_affine=False, bias=False)
        self.embed = nn.Embedding(config.vocab_size, D)
        self.drop = nn.Dropout(config.dropout)
        # Per-head projection D -> N for the value/gating path.
        self.encoder_v = nn.Parameter(torch.zeros((nh, D, N)).normal_(std=0.02))

        self.lm_head = nn.Parameter(
            torch.zeros((D, config.vocab_size)).normal_(std=0.02)
        )

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Small-normal init for Linear/Embedding submodules (raw nn.Parameter
        weights above are initialized inline and untouched here)."""
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Run the model on token indices ``idx`` of shape (B, T).

        Returns ``(logits, loss)`` where logits has shape (B, T, vocab_size)
        and loss is cross-entropy against ``targets`` (or None when targets
        is None).
        """
        C = self.config

        B, T = idx.size()
        D = C.n_embd
        nh = C.n_head
        N = D * C.mlp_internal_dim_multiplier // nh

        # Insert a singleton head axis so broadcasting against the (nh, D, N)
        # encoders produces per-head latents.
        x = self.embed(idx).unsqueeze(1)

        # actually helps with training
        x = self.ln(x)  # B, 1, T, D

        for level in range(C.n_layer):
            x_latent = x @ self.encoder

            # ReLU sparsifies the latent code used as both Q and K.
            x_sparse = F.relu(x_latent)  # B, nh, T, N

            yKV = self.attn(
                Q=x_sparse,
                K=x_sparse,
                V=x,
            )
            yKV = self.ln(yKV)

            # Attention output is re-encoded and gates the sparse code
            # elementwise (both paths are non-negative after ReLU).
            y_latent = yKV @ self.encoder_v
            y_sparse = F.relu(y_latent)
            xy_sparse = x_sparse * y_sparse  # B, nh, T, N

            xy_sparse = self.drop(xy_sparse)

            # Merge heads, then project back to the residual width.
            yMLP = (
                xy_sparse.transpose(1, 2).reshape(B, 1, T, N * nh) @ self.decoder
            )  # B, 1, T, D
            y = self.ln(yMLP)
            # Residual update, re-normalized before the next iteration.
            x = self.ln(x + y)

        logits = x.view(B, T, D) @ self.lm_head
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))

        return logits, loss

    @torch.no_grad()
    def generate(
        self,
        idx: torch.Tensor,
        max_new_tokens: int,
        temperature: float = 1.0,
        top_k: int | None = None,
    ) -> torch.Tensor:
        """Autoregressively sample ``max_new_tokens`` tokens after ``idx``.

        The full sequence is re-run each step (no KV cache) and is never
        cropped — there is no fixed context length to respect here.
        """
        for _ in range(max_new_tokens):
            idx_cond = idx
            logits, _ = self(idx_cond)
            # Only the last position's distribution matters for sampling.
            logits = logits[:, -1, :] / temperature
            if top_k is not None:
                # Mask everything below the k-th largest logit.
                values, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < values[:, [-1]]] = float("-inf")
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ torch
2
+ numpy
3
+ requests
train.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright Pathway Technology, Inc.
2
+
3
+ import os
4
+ from contextlib import nullcontext
5
+
6
+ import bdh
7
+ import numpy as np
8
+ import requests
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+
13
# Device / mixed-precision setup: prefer CUDA, fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# On a Mac you can also try
# device=torch.device('mps')

dtype = (
    "bfloat16"
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    else "float16"
)  # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler
ptdtype = {
    "float32": torch.float32,
    "bfloat16": torch.bfloat16,
    "float16": torch.float16,
}[dtype]
# Autocast only on CUDA; on CPU the context is a no-op and math stays float32.
ctx = (
    torch.amp.autocast(device_type=device.type, dtype=ptdtype)
    if "cuda" in device.type
    else nullcontext()
)
# GradScaler is only active for float16 (bfloat16/float32 need no loss scaling).
scaler = torch.amp.GradScaler(device=device.type, enabled=(dtype == "float16"))
torch.manual_seed(1337)  # reproducible init and batch sampling
torch.backends.cuda.matmul.allow_tf32 = True  # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True  # allow tf32 on cudnn
print(f"Using device: {device} with dtype {dtype}")
37
+
38
+
39
# Configuration
BDH_CONFIG = bdh.BDHConfig()  # default model hyperparameters from bdh.py
BLOCK_SIZE = 512  # context length in bytes per training sample
BATCH_SIZE = 32  # sequences per optimizer step
MAX_ITERS = 3000  # total optimizer steps
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 0.1
LOG_FREQ = 100  # print the averaged loss every this many steps

# Dataset lives next to this script.
input_file_path = os.path.join(os.path.dirname(__file__), "input.txt")
49
+
50
+
51
def fetch_data():
    """Download the tiny Shakespeare dataset to ``input.txt`` if missing.

    Fails loudly on HTTP errors instead of silently caching an error page
    as the training corpus, and bounds the request with a timeout.
    """
    if not os.path.exists(input_file_path):
        data_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
        response = requests.get(data_url, timeout=60)
        response.raise_for_status()  # raise on 4xx/5xx rather than writing the error body
        with open(input_file_path, "w", encoding="utf-8") as f:
            f.write(response.text)
57
+
58
+
59
def get_batch(split):
    """Sample a random (x, y) batch of byte sequences from input.txt.

    The file is treated as raw bytes via a read-only memmap; the first 90%
    is the train split, the remainder validation. ``y`` is ``x`` shifted
    one byte ahead (next-byte prediction targets).
    """
    raw = np.memmap(input_file_path, dtype=np.uint8, mode="r")
    split_at = int(0.9 * len(raw))
    raw = raw[:split_at] if split == "train" else raw[split_at:]
    starts = torch.randint(len(raw) - BLOCK_SIZE, (BATCH_SIZE,))
    x = torch.stack(
        [torch.from_numpy(raw[s : s + BLOCK_SIZE].astype(np.int64)) for s in starts]
    )
    y = torch.stack(
        [
            torch.from_numpy(raw[s + 1 : s + 1 + BLOCK_SIZE].astype(np.int64))
            for s in starts
        ]
    )
    if torch.cuda.is_available():
        # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
        x = x.pin_memory().to(device, non_blocking=True)
        y = y.pin_memory().to(device, non_blocking=True)
    else:
        x, y = x.to(device), y.to(device)
    return x, y
84
+
85
+
86
def eval(model):
    # NOTE(review): shadows the builtin ``eval`` and is never called in this
    # script; it only switches the model to inference mode (disables dropout).
    model.eval()
88
+
89
+
90
+ if __name__ == "__main__":
91
+ fetch_data()
92
+
93
+ model = bdh.BDH(BDH_CONFIG).to(device)
94
+ model = torch.compile(model)
95
+ optimizer = torch.optim.AdamW(
96
+ model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY
97
+ )
98
+
99
+ x, y = get_batch("train")
100
+
101
+ loss_acc = 0
102
+ loss_steps = 0
103
+ for step in range(MAX_ITERS):
104
+ with ctx:
105
+ logits, loss = model(x, y)
106
+ x, y = get_batch("train")
107
+ loss_acc += loss
108
+ loss_steps += 1
109
+ scaler.scale(loss).backward()
110
+ scaler.step(optimizer)
111
+ scaler.update()
112
+ optimizer.zero_grad()
113
+ if step % LOG_FREQ == 0:
114
+ print(f"Step: {step}/{MAX_ITERS} loss {loss_acc.item() / loss_steps:.3}")
115
+ loss_acc = 0
116
+ loss_steps = 0
117
+ print("Training done, now generating a sample ")
118
+ model.eval()
119
+ prompt = torch.tensor(
120
+ bytearray("To be or ", "utf-8"), dtype=torch.long, device=device
121
+ ).unsqueeze(0)
122
+ ret = model.generate(prompt, max_new_tokens=100, top_k=3)
123
+ ret_decoded = bytes(ret.to(torch.uint8).to("cpu").squeeze(0)).decode(
124
+ errors="backslashreplace"
125
+ )
126
+ print(ret_decoded)