MarxistLeninist commited on
Commit
6393d0d
·
verified ·
1 Parent(s): 27db0fa

Upload a3.py

Browse files

Inference code (NAT inference via the chat path is still broken and needs fixing; AR inference works.)

Files changed (1) hide show
  1. a3.py +318 -0
a3.py ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env python3
"""
a3.py – joint-train low-rank AR + NAT, auto-resume at epoch 84

• Loads ar_ep084.pt & nat_ep084.pt from ckpts1/ if present, then trains
  from epoch 85. Otherwise starts from scratch.
• Dataset: WikiText-103 (raw) streamed, default cap = 100 M tokens.
• Checkpoints: epoch 1, every 5 epochs, and final.
• Default preset = small (fits 11 GB GPUs).
"""

from __future__ import annotations
import argparse, math, pathlib, time
from contextlib import nullcontext

import torch, torch.nn as nn
from torch.utils.data import DataLoader, IterableDataset
from datasets import load_dataset
from tqdm.auto import tqdm
from transformers import AutoTokenizer, logging as hf_log

# ╭─ AMP shim ─╮
# torch >= 2.2 moved autocast/GradScaler to torch.amp; fall back to the
# older torch.cuda.amp location on earlier installs.
try:
    from torch.amp import autocast as _ac_new
    from torch.amp import GradScaler
    _AMP = "new"
except ImportError:  # torch < 2.2
    from torch.cuda.amp import autocast as _ac_old
    from torch.cuda.amp import GradScaler
    _AMP = "old"

def amp(enabled, dtype, device="cuda"):
    """Return an autocast context for *device*/*dtype*, or a no-op.

    Dispatches to whichever autocast API the installed torch exposes;
    with enabled=False the caller runs in full precision via nullcontext.
    """
    if not enabled:
        return nullcontext()
    return _ac_new(device_type=device, dtype=dtype) if _AMP == "new" else _ac_old(dtype=dtype)
# ╰─────────────╯
hf_log.set_verbosity_error()                 # silence transformers warnings
torch.backends.cuda.matmul.allow_tf32 = True # free speed-up on Ampere+

# ───────────── presets ─────────────
# Model-size presets: dimensions / depth / head counts for the AR decoder
# and the (larger) NAT model.
PRESETS = {
    "small": dict(ar_d=512, ar_layers=8, ar_heads=16,
                  nat_d=640, nat_layers=12, nat_heads=20),
    "base": dict(ar_d=768, ar_layers=12, ar_heads=24,
                 nat_d=1024,nat_layers=16, nat_heads=32),
    "large": dict(ar_d=1024, ar_layers=16, ar_heads=32,
                  nat_d=1280,nat_layers=24, nat_heads=40),
}
BLOCK = 128                   # tokens per training sample
DROP_P = 0.1                  # dropout used throughout both models
LR_AR = LR_NAT = 2e-4         # per-model learning rates (same value)
ALPHA_KL = 1.0                # weight of the AR→NAT distillation KL term
CKDIR = pathlib.Path("ckpts1")  # checkpoint directory
DEV = torch.device("cuda" if torch.cuda.is_available() else "cpu")
SAVE_EVERY = 5                # checkpoint every N epochs (plus ep 1 & last)
RESUME_EPOCH= 84 # ← hard-coded resume point
# ───────────── tokenizer ─────────────
tok = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-0528", use_fast=True)
if tok.pad_token is None:
    # Guarantee a pad token; it doubles as the CTC blank / NAT placeholder.
    tok.add_special_tokens({"pad_token": "[PAD]"})
BLANK_ID = tok.pad_token_id                 # blank / placeholder token id
VOCAB = 1 + max(tok.get_vocab().values())   # logit width covering every id
# ───────────── data streaming ─────────────
def stream_wikitext(max_tokens=0):
    """Stream token ids from the WikiText-103-raw train split.

    Stops once *max_tokens* ids have been produced; 0 means no cap.
    """
    emitted = 0
    ds = load_dataset("wikitext", "wikitext-103-raw-v1",
                      split="train", streaming=True)
    for example in ds:
        for tok_id in tok.encode(example["text"]):
            yield tok_id
            emitted += 1
            if max_tokens and emitted >= max_tokens:
                return
class ARDataset(IterableDataset):
    """Sliding-window LM pairs: (block, block shifted left by one).

    NOTE(review): the window advances one token per sample, so consecutive
    samples overlap almost entirely — an "epoch" is ~max_tokens samples,
    not max_tokens/BLOCK. Confirm this stride is intended.
    """

    def __init__(self, blk, max_tokens=0):
        self.blk = blk
        self.max = max_tokens

    def __iter__(self):
        window = []
        for token in stream_wikitext(self.max):
            window.append(token)
            # Need blk+1 tokens before a (input, target) pair exists.
            while len(window) > self.blk:
                x = torch.tensor(window[:self.blk])
                y = torch.tensor(window[1:self.blk + 1])
                yield x, y
                del window[0]
class NATDataset(IterableDataset):
    """Non-overlapping NAT samples: blank-interleaved input vs. raw target.

    The input has length 2·blk with BLANK_ID at even positions and the
    target token at odd positions: [blank, t0, blank, t1, ...].
    """

    def __init__(self, blk, max_tokens=0):
        self.blk = blk
        self.max = max_tokens

    def __iter__(self):
        pending = []
        for token in stream_wikitext(self.max):
            pending.append(token)
            while len(pending) >= self.blk:
                target = pending[:self.blk]
                pending = pending[self.blk:]
                interleaved = []
                for t in target:
                    interleaved.append(BLANK_ID)  # even slot: blank
                    interleaved.append(t)         # odd slot: token
                yield torch.tensor(interleaved), torch.tensor(target)
# ───────────── transformer components ─────────────
class LowRankMHA(nn.Module):
    """Multi-head attention with per-head states projected to rank *r*.

    All heads share one basis ``U`` (d_k × r), so the attention matmuls run
    in an r-dimensional subspace instead of the full head dimension.
    """

    def __init__(self, d, h, r):
        super().__init__()
        self.h, self.dk = h, d // h
        # FIX: the original wrote `self.q = self.k = self.v = nn.Linear(...)`,
        # aliasing ONE Linear as all three projections, which forces
        # query == key == value projections. Use three independent layers.
        # Old checkpoints still load: they contain q./k./v. keys (with tied
        # values), which simply initialise the three layers identically.
        self.q = nn.Linear(d, d, bias=False)
        self.k = nn.Linear(d, d, bias=False)
        self.v = nn.Linear(d, d, bias=False)
        self.U = nn.Parameter(torch.randn(self.dk, r))
        nn.init.orthogonal_(self.U)
        self.proj = nn.Linear(h * r, d, bias=False)
        self.drop = nn.Dropout(DROP_P)

    def _proj(self, x):
        # (B, N, d) -> (B, h, N, r): split heads, then project onto U.
        B, N, _ = x.shape
        return (x.view(B, N, self.h, self.dk).transpose(1, 2) @ self.U)

    def forward(self, x, mask=None):
        q, k, v = map(self._proj, (self.q(x), self.k(x), self.v(x)))
        # NOTE(review): scores are scaled by sqrt(d_k) although q/k now live
        # in rank-r space; sqrt(r) would be the textbook scale. Left as-is
        # so behaviour of existing trained checkpoints is unchanged.
        att = (q @ k.transpose(-1, -2)) / math.sqrt(self.dk)
        if mask is not None:
            att = att + mask  # additive mask, e.g. causal -inf upper triangle
        out = (att.softmax(-1) @ v).transpose(1, 2).reshape(x.size(0), x.size(1), -1)
        return self.drop(self.proj(out))
class Block(nn.Module):
    """Pre-norm transformer block: LN→attention and LN→MLP, each residual."""

    def __init__(self, d, h, dff, r):
        super().__init__()
        self.ln1 = nn.LayerNorm(d)
        self.ln2 = nn.LayerNorm(d)
        self.mha = LowRankMHA(d, h, r)
        self.ff = nn.Sequential(
            nn.Linear(d, dff),
            nn.ReLU(),
            nn.Dropout(DROP_P),
            nn.Linear(dff, d),
        )

    def forward(self, x, mask=None):
        x = x + self.mha(self.ln1(x), mask)   # attention sub-layer
        x = x + self.ff(self.ln2(x))          # feed-forward sub-layer
        return x
# ───────────── model builders ─────────────
def make_transformer(d, n_layers, n_heads, vocab, max_len=8192):
    """Assemble a bare nn.Module container holding the network pieces.

    The container defines no forward(); run it through fwd() below.
    FFN width is 4·d and the shared low-rank basis is max(32, d//16).
    """
    dff = 4 * d
    low_rank = max(32, d // 16)
    model = nn.Module()
    model.emb = nn.Embedding(vocab, d)      # token embeddings
    model.pos = nn.Embedding(max_len, d)    # learned absolute positions
    model.blocks = nn.ModuleList(
        [Block(d, n_heads, dff, low_rank) for _ in range(n_layers)]
    )
    model.ln = nn.LayerNorm(d)              # final norm
    model.out = nn.Linear(d, vocab)         # logit head
    return model
def make_ar(cfg):
    """Build the AR decoder for a preset dict (positions capped at 4096)."""
    return make_transformer(cfg["ar_d"], cfg["ar_layers"],
                            cfg["ar_heads"], VOCAB, 4096)

def make_nat(cfg):
    """Build the NAT model for a preset dict (positions capped at 8192)."""
    return make_transformer(cfg["nat_d"], cfg["nat_layers"],
                            cfg["nat_heads"], VOCAB, 8192)
# ───────────── NAT helpers ─────────────
class NATWrap(nn.Module):
    """Double every input position, then call the wrapped model.

    repeat_interleave(x, 2, 1) maps [t0, t1, ...] to [t0, t0, t1, t1, ...],
    matching the 2× sequence length the NAT operates on.
    NOTE(review): training interleaves BLANK_ID with tokens, whereas this
    duplicates the tokens themselves — confirm the layout is intended.
    """

    def __init__(self, core):
        super().__init__()
        self.core = core  # callable mapping (B, 2N) ids -> logits

    def forward(self, x):
        doubled = torch.repeat_interleave(x, 2, 1)
        return self.core(doubled)
class ParScale(nn.Module):
    # Multi-stream NAT decoder: per refinement pass, take the top-P token
    # candidates at every position and keep the stream judged best.
    def __init__(self, nat, P): super().__init__(); self.nat,self.P = nat,P
    @torch.no_grad()
    def generate(self, x, passes=1):
        """Iteratively refine id tensor *x* (B, N) for *passes* rounds.

        `self.nat` is expected to map (B, N) ids to (B, 2N, vocab) logits
        (see NATWrap); each pass halves the 2N axis back to N via [::2].
        """
        for _ in range(passes):
            # Suppress the blank id so top-k never proposes it outright.
            logits = self.nat(x); logits[..., BLANK_ID] = -1e9
            # cand: (P, B, 2N) — the k-th best token id at every position.
            cand = logits.topk(self.P, -1).indices.permute(2,0,1)
            # Score each stream by its fraction of non-blank tokens.
            # NOTE(review): since blanks were just masked to -1e9, cand
            # essentially never contains BLANK_ID, so all streams score
            # ~1.0 and argmax collapses to stream 0 — likely part of the
            # known NAT-inference breakage; confirm intended behaviour.
            best = (cand != BLANK_ID).float().mean(-1).argmax(0)
            # Select the chosen stream per batch row, then drop the odd
            # (duplicated) positions to return to length N.
            x = cand[best, torch.arange(x.size(0), device=x.device)][:, ::2]
        return x
# ───────────── helpers ─────────────
def fwd(model, ids, causal=False):
    """Run a make_transformer() container over *ids* → (B, N, vocab) logits.

    causal=True adds an upper-triangular -inf mask (AR decoding); otherwise
    attention is unrestricted (NAT).
    """
    seq_len = ids.size(1)
    positions = torch.arange(seq_len, device=ids.device)
    hidden = model.emb(ids) + model.pos(positions)

    if causal:
        full = torch.full((1, 1, seq_len, seq_len), float("-inf"),
                          device=ids.device)
        mask = torch.triu(full, 1)
    else:
        mask = None

    for layer in model.blocks:
        hidden = layer(hidden, mask)
    return model.out(model.ln(hidden))
# ───────────── training ─────────────
def train_joint(a):
    """Jointly train the AR and NAT models from CLI args *a*.

    Per batch: AR cross-entropy, NAT CTC on blank-interleaved input, and a
    KL term distilling AR logits into the NAT on the AR's input. Resumes
    from RESUME_EPOCH checkpoints in CKDIR when both files exist.
    """
    cfg = PRESETS[a.preset]
    # NOTE(review): two loaders means the HF stream is read twice in
    # parallel (one pass per dataset) every epoch.
    ar_loader = DataLoader(ARDataset(BLOCK, a.max_tokens), batch_size=a.batch)
    nat_loader = DataLoader(NATDataset(BLOCK, a.max_tokens), batch_size=a.batch)
    ar , nat = make_ar(cfg).to(DEV), make_nat(cfg).to(DEV)

    # ----- resume if we have epoch-84 weights -----
    start_ep = 0
    ck_ar = CKDIR / f"ar_ep{RESUME_EPOCH:03d}.pt"
    ck_nat = CKDIR / f"nat_ep{RESUME_EPOCH:03d}.pt"
    if ck_ar.exists() and ck_nat.exists():
        ar.load_state_dict(torch.load(ck_ar, map_location=DEV))
        nat.load_state_dict(torch.load(ck_nat, map_location=DEV))
        start_ep = RESUME_EPOCH
        print(f"Resuming from epoch {start_ep} checkpoints.")

    # One optimizer, two param groups so AR and NAT can use their own LR.
    opt = torch.optim.AdamW(
        [{"params": ar.parameters(), "lr": LR_AR},
         {"params": nat.parameters(), "lr": LR_NAT}]
    )

    # >>>>>>> FIX: ensure 'initial_lr' so scheduler can resume <<<<<<<
    # CosineAnnealingLR with last_epoch >= 0 requires 'initial_lr' in each
    # param group; a freshly built optimizer does not have it yet.
    for pg in opt.param_groups:
        pg.setdefault("initial_lr", pg["lr"])
    # ------------------------------------------------------------------

    sched = torch.optim.lr_scheduler.CosineAnnealingLR(
        opt, T_max=a.epochs, last_epoch=start_ep - 1
    )

    ce = nn.CrossEntropyLoss(label_smoothing=0.1)
    ctc = nn.CTCLoss(blank=BLANK_ID, zero_infinity=True)
    kl = nn.KLDivLoss(reduction="batchmean")

    use_amp = DEV.type == "cuda" and a.amp
    # NOTE(review): GradScaler is generally unnecessary with bfloat16
    # autocast (it targets fp16 underflow); harmless but redundant here.
    scaler = GradScaler(enabled=use_amp)
    cast_dt = torch.bfloat16 if use_amp else torch.float32
    CKDIR.mkdir(exist_ok=True)
    # Progress-bar total only; assumes stride-BLOCK samples, but ARDataset
    # advances by 1 token per sample — NOTE(review): the estimate is off.
    tot_batches = None if not a.max_tokens else math.ceil(
        math.ceil(a.max_tokens / BLOCK) / a.batch)

    for ep in range(start_ep + 1, a.epochs + 1):
        ar.train(); nat.train(); tot = steps = 0
        loop = tqdm(zip(ar_loader, nat_loader), total=tot_batches,
                    desc=f"Epoch {ep}/{a.epochs}", unit="batch")
        for (x_ar, y_ar), (x_nat, y_nat) in loop:
            x_ar, y_ar, x_nat, y_nat = map(lambda t: t.to(DEV),
                                           (x_ar, y_ar, x_nat, y_nat))
            opt.zero_grad(set_to_none=True)
            with amp(use_amp, cast_dt, DEV.type):
                # AR: causal LM cross-entropy.
                logits_ar = fwd(ar, x_ar, causal=True)
                loss_ar = ce(logits_ar.reshape(-1, VOCAB), y_ar.reshape(-1))

                # NAT: CTC over (T, B, V) log-probs; input length is the
                # target length (x_nat is 2x the target, so size(1)//2).
                logp_nat = fwd(nat, x_nat).log_softmax(-1).transpose(0, 1)
                ilen=tlen = torch.full((x_nat.size(0),), x_nat.size(1)//2,
                                       dtype=torch.long, device=DEV)
                loss_nat = ctc(logp_nat, y_nat, ilen, tlen)

                # Distillation: NAT run on the AR input, pulled toward the
                # (detached) AR distribution.
                loss_kld = kl(fwd(nat, x_ar).log_softmax(-1),
                              logits_ar.softmax(-1).detach())

                loss = loss_ar + loss_nat + ALPHA_KL * loss_kld

            scaler.scale(loss).backward()
            # Unscale before clipping so the norm is measured on real grads.
            scaler.unscale_(opt)
            nn.utils.clip_grad_norm_(ar.parameters(), 1.0)
            nn.utils.clip_grad_norm_(nat.parameters(), 1.0)
            scaler.step(opt); scaler.update()

            tot += loss.item(); steps += 1
            loop.set_postfix(loss=f"{loss.item():.3f}",
                             avg=f"{tot/steps:.3f}", refresh=False)
        sched.step()

        # Checkpoint on epoch 1, every SAVE_EVERY epochs, and at the end.
        if ep == 1 or ep % SAVE_EVERY == 0 or ep == a.epochs:
            torch.save(nat.state_dict(), CKDIR / f"nat_ep{ep:03d}.pt")
            torch.save(ar.state_dict(), CKDIR / f"ar_ep{ep:03d}.pt")
            print(f"Epoch {ep}: checkpoints written.")
        print(f"Epoch {ep}: avg loss {tot/max(steps,1):.4f}")
# ───────────── inference helpers ─────────────
@torch.no_grad()
def nat_infer(ckpt, prompt, max_new, passes, streams, preset):
    """Decode *prompt* with the NAT checkpoint *ckpt* and print the result.

    Pads the prompt with max_new blanks and refines for *passes* rounds
    using ParScale with *streams* candidate streams.
    """
    nat = make_nat(PRESETS[preset]).to(DEV)
    nat.load_state_dict(torch.load(ckpt, map_location=DEV))
    nat.eval()
    # FIX: make_nat() returns a bare nn.Module container with no forward(),
    # so the old `NATWrap(nat)` path crashed with NotImplementedError the
    # moment ParScale called it (the "nat inferencing broken" issue — note
    # ar_infer always routed through fwd()). Wrap the container in fwd().
    gen = ParScale(NATWrap(lambda ids: fwd(nat, ids)), P=streams).to(DEV)
    inp = torch.tensor([tok.encode(prompt) + [BLANK_ID]*max_new], device=DEV)
    t0 = time.time()
    out = gen.generate(inp, passes=passes)[0]
    dt = time.time() - t0
    txt = tok.decode([t for t in out.tolist() if t != BLANK_ID], skip_special_tokens=True)
    print(txt); print(f"[{len(txt.split()) - len(prompt.split())} new tokens in {dt:.2f}s]")
@torch.no_grad()
def ar_infer(ckpt, prompt, max_new, preset):
    """Greedy AR decoding: append the argmax token max_new times, print."""
    ar = make_ar(PRESETS[preset]).to(DEV)
    ar.load_state_dict(torch.load(ckpt, map_location=DEV))
    ar.eval()
    ids = torch.tensor([tok.encode(prompt)], device=DEV)
    t0 = time.time()
    for _ in range(max_new):
        logits = fwd(ar, ids, causal=True)
        nxt = logits[:, -1].argmax(-1, keepdim=True)
        ids = torch.cat([ids, nxt], 1)
    dt = time.time() - t0
    txt = tok.decode(ids[0].tolist(), skip_special_tokens=True)
    print(txt)
    print(f"[{len(txt.split()) - len(prompt.split())} new tokens in {dt:.2f}s]")
# ───────────── CLI ─────────────
def main():
    """CLI entry: `train` to joint-train, `infer` to decode a checkpoint."""
    parser = argparse.ArgumentParser()
    sub = parser.add_subparsers(dest="cmd", required=True)

    tr = sub.add_parser("train")
    tr.add_argument("--preset", choices=PRESETS.keys(), default="small")
    tr.add_argument("--epochs", type=int, default=128)
    tr.add_argument("--batch", type=int, default=2)
    tr.add_argument("--max_tokens", type=int, default=100_000_000)
    tr.add_argument("--amp", action="store_true")

    inf = sub.add_parser("infer")
    inf.add_argument("--preset", choices=PRESETS.keys(), default="small")
    inf.add_argument("--mode", choices=["nat","ar"], required=True)
    inf.add_argument("--prompt", required=True)
    inf.add_argument("--max_new", type=int, default=120)
    inf.add_argument("--ckpt", required=True)
    inf.add_argument("--passes", type=int, default=1)
    inf.add_argument("--streams", type=int, default=5)

    args = parser.parse_args()
    if args.cmd == "train":
        train_joint(args)
    elif args.mode == "nat":
        nat_infer(args.ckpt, args.prompt, args.max_new,
                  args.passes, args.streams, args.preset)
    else:
        ar_infer(args.ckpt, args.prompt, args.max_new, args.preset)

if __name__ == "__main__":
    main()